gate.creole.splitter
Class RegexSentenceSplitter

java.lang.Object
  extended by gate.util.AbstractFeatureBearer
      extended by gate.creole.AbstractResource
          extended by gate.creole.AbstractProcessingResource
              extended by gate.creole.AbstractLanguageAnalyser
                  extended by gate.creole.splitter.RegexSentenceSplitter
All Implemented Interfaces:
ANNIEConstants, Executable, LanguageAnalyser, ProcessingResource, Resource, FeatureBearer, NameBearer, Serializable

public class RegexSentenceSplitter
extends AbstractLanguageAnalyser

A fast replacement sentence splitter based on regular expressions.

See Also:
Serialized Form

Nested Class Summary
 
Nested classes/interfaces inherited from class gate.creole.AbstractProcessingResource
AbstractProcessingResource.InternalStatusListener, AbstractProcessingResource.IntervalProgressListener
 
Field Summary
protected  String encoding
          Character encoding used when reading the configuration files.
protected  URL externalSplitListURL
          URL pointing to a file with regex patterns for external sentence splits.
protected  Pattern externalSplitsPattern
           
protected  URL internalSplitListURL
          URL pointing to a file with regex patterns for internal sentence splits.
protected  Pattern internalSplitsPattern
           
protected  URL nonSplitListURL
          URL pointing to a file with regex patterns for non-splits (matches that are not sentence splits).
protected  Pattern nonSplitsPattern
           
protected  String outputASName
          Output annotation set name.
static String SPLIT_DOCUMENT_PARAMETER_NAME
          Parameter name
static String SPLIT_ENCODING_PARAMETER_NAME
          Parameter name
static String SPLIT_INPUT_AS_PARAMETER_NAME
          Parameter name
static String SPLIT_NON_SPLIT_LIST_PARAMETER_NAME
          Parameter name
static String SPLIT_OUTPUT_AS_PARAMETER_NAME
          Parameter name
static String SPLIT_SPLIT_LIST_PARAMETER_NAME
          Parameter name
 
Fields inherited from class gate.creole.AbstractLanguageAnalyser
corpus, document
 
Fields inherited from class gate.creole.AbstractProcessingResource
interrupted
 
Fields inherited from class gate.creole.AbstractResource
name
 
Fields inherited from class gate.util.AbstractFeatureBearer
features
 
Fields inherited from interface gate.creole.ANNIEConstants
ANNOTATION_COREF_FEATURE_NAME, DATE_ANNOTATION_TYPE, DATE_POSTED_ANNOTATION_TYPE, DEFAULT_FILE, DOCUMENT_COREF_FEATURE_NAME, JOB_ID_ANNOTATION_TYPE, LOCATION_ANNOTATION_TYPE, LOOKUP_ANNOTATION_TYPE, LOOKUP_CLASS_FEATURE_NAME, LOOKUP_INSTANCE_FEATURE_NAME, LOOKUP_LANGUAGE_FEATURE_NAME, LOOKUP_MAJOR_TYPE_FEATURE_NAME, LOOKUP_MINOR_TYPE_FEATURE_NAME, LOOKUP_ONTOLOGY_FEATURE_NAME, MONEY_ANNOTATION_TYPE, ORGANIZATION_ANNOTATION_TYPE, PERSON_ANNOTATION_TYPE, PERSON_GENDER_FEATURE_NAME, PLUGIN_DIR, PR_NAMES, SENTENCE_ANNOTATION_TYPE, SPACE_TOKEN_ANNOTATION_TYPE, TOKEN_ANNOTATION_TYPE, TOKEN_CATEGORY_FEATURE_NAME, TOKEN_KIND_FEATURE_NAME, TOKEN_LENGTH_FEATURE_NAME, TOKEN_ORTH_FEATURE_NAME, TOKEN_STRING_FEATURE_NAME
 
Constructor Summary
RegexSentenceSplitter()
           
 
Method Summary
protected  Pattern compilePattern(URL paternsListUrl, String encoding)
           
 void execute()
          Run the resource.
 String getEncoding()
           
 URL getExternalSplitListURL()
           
 URL getInternalSplitListURL()
           
 Pattern getInternalSplitsPattern()
           
 URL getNonSplitListURL()
           
 String getOutputASName()
           
 Resource init()
          Initialise this resource, and return it.
 void setEncoding(String encoding)
           
 void setExternalSplitListURL(URL externalSplitListURL)
           
 void setInternalSplitListURL(URL internalSplitListURL)
           
 void setInternalSplitsPattern(Pattern internalSplitsPattern)
           
 void setNonSplitListURL(URL nonSplitListURL)
           
 void setOutputASName(String outputASName)
           
 
Methods inherited from class gate.creole.AbstractLanguageAnalyser
getCorpus, getDocument, setCorpus, setDocument
 
Methods inherited from class gate.creole.AbstractProcessingResource
addProgressListener, addStatusListener, cleanup, fireProcessFinished, fireProgressChanged, fireStatusChanged, getRuntimeParameterValues, getRuntimeParameterValues, interrupt, isInterrupted, reInit, removeProgressListener, removeStatusListener
 
Methods inherited from class gate.creole.AbstractResource
checkParameterValues, getBeanInfo, getInitParameterValues, getInitParameterValues, getName, getParameterValue, getParameterValue, getParameterValues, removeResourceListeners, setName, setParameterValue, setParameterValue, setParameterValues, setParameterValues, setResourceListeners
 
Methods inherited from class gate.util.AbstractFeatureBearer
getFeatures, setFeatures
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 
Methods inherited from interface gate.ProcessingResource
reInit
 
Methods inherited from interface gate.Resource
cleanup, getParameterValue, setParameterValue, setParameterValues
 
Methods inherited from interface gate.util.FeatureBearer
getFeatures, setFeatures
 
Methods inherited from interface gate.util.NameBearer
getName, setName
 
Methods inherited from interface gate.Executable
interrupt, isInterrupted
 

Field Detail

SPLIT_DOCUMENT_PARAMETER_NAME

public static final String SPLIT_DOCUMENT_PARAMETER_NAME
Parameter name

See Also:
Constant Field Values

SPLIT_INPUT_AS_PARAMETER_NAME

public static final String SPLIT_INPUT_AS_PARAMETER_NAME
Parameter name

See Also:
Constant Field Values

SPLIT_OUTPUT_AS_PARAMETER_NAME

public static final String SPLIT_OUTPUT_AS_PARAMETER_NAME
Parameter name

See Also:
Constant Field Values

SPLIT_ENCODING_PARAMETER_NAME

public static final String SPLIT_ENCODING_PARAMETER_NAME
Parameter name

See Also:
Constant Field Values

SPLIT_SPLIT_LIST_PARAMETER_NAME

public static final String SPLIT_SPLIT_LIST_PARAMETER_NAME
Parameter name

See Also:
Constant Field Values

SPLIT_NON_SPLIT_LIST_PARAMETER_NAME

public static final String SPLIT_NON_SPLIT_LIST_PARAMETER_NAME
Parameter name

See Also:
Constant Field Values

outputASName

protected String outputASName
Output annotation set name.


encoding

protected String encoding
Character encoding used when reading the configuration files.


internalSplitListURL

protected URL internalSplitListURL
URL pointing to a file with regex patterns for internal sentence splits.


externalSplitListURL

protected URL externalSplitListURL
URL pointing to a file with regex patterns for external sentence splits.


nonSplitListURL

protected URL nonSplitListURL
URL pointing to a file with regex patterns for non-splits (matches that are not sentence splits).


internalSplitsPattern

protected Pattern internalSplitsPattern

externalSplitsPattern

protected Pattern externalSplitsPattern

nonSplitsPattern

protected Pattern nonSplitsPattern

Constructor Detail

RegexSentenceSplitter

public RegexSentenceSplitter()

Method Detail

compilePattern

protected Pattern compilePattern(URL paternsListUrl,
                                 String encoding)
                          throws UnsupportedEncodingException,
                                 IOException
Throws:
UnsupportedEncodingException
IOException

execute

public void execute()
             throws ExecutionException
Description copied from class: AbstractProcessingResource
Run the resource. Subclasses are expected to override this method; the default implementation signals an exception.

Specified by:
execute in interface Executable
Overrides:
execute in class AbstractProcessingResource
Throws:
ExecutionException

init

public Resource init()
              throws ResourceInstantiationException
Description copied from class: AbstractProcessingResource
Initialise this resource, and return it.

Specified by:
init in interface Resource
Overrides:
init in class AbstractProcessingResource
Throws:
ResourceInstantiationException

getOutputASName

public String getOutputASName()
Returns:
the outputASName

setOutputASName

public void setOutputASName(String outputASName)
Parameters:
outputASName - the outputASName to set

getEncoding

public String getEncoding()
Returns:
the encoding

setEncoding

public void setEncoding(String encoding)
Parameters:
encoding - the encoding to set

getInternalSplitListURL

public URL getInternalSplitListURL()
Returns:
the internalSplitListURL

setInternalSplitListURL

public void setInternalSplitListURL(URL internalSplitListURL)
Parameters:
internalSplitListURL - the internalSplitListURL to set

getExternalSplitListURL

public URL getExternalSplitListURL()
Returns:
the externalSplitListURL

setExternalSplitListURL

public void setExternalSplitListURL(URL externalSplitListURL)
Parameters:
externalSplitListURL - the externalSplitListURL to set

getNonSplitListURL

public URL getNonSplitListURL()
Returns:
the nonSplitListURL

setNonSplitListURL

public void setNonSplitListURL(URL nonSplitListURL)
Parameters:
nonSplitListURL - the nonSplitListURL to set

getInternalSplitsPattern

public Pattern getInternalSplitsPattern()
Returns:
the internalSplitsPattern

setInternalSplitsPattern

public void setInternalSplitsPattern(Pattern internalSplitsPattern)
Parameters:
internalSplitsPattern - the internalSplitsPattern to set