@CreoleResource(name="RegEx Sentence Splitter", icon="sentence-splitter", comment="A sentence splitter based on regular expressions.", helpURL="http://gate.ac.uk/userguide/sec:annie:regex-splitter") public class RegexSentenceSplitter extends AbstractLanguageAnalyser
AbstractProcessingResource.InternalStatusListener, AbstractProcessingResource.IntervalProgressListener
Modifier and Type | Field and Description |
---|---|
protected String |
encoding
Encoding used when reading config files
|
protected URL |
externalSplitListURL
URL pointing to a file with regex patterns for external sentence splits.
|
protected Pattern |
externalSplitsPattern |
protected URL |
internalSplitListURL
URL pointing to a file with regex patterns for internal sentence splits.
|
protected Pattern |
internalSplitsPattern |
protected URL |
nonSplitListURL
URL pointing to a file with regex patterns for non sentence splits.
|
protected Pattern |
nonSplitsPattern |
protected String |
outputASName
Output annotation set name.
|
static String |
SPLIT_DOCUMENT_PARAMETER_NAME
Parameter name
|
static String |
SPLIT_ENCODING_PARAMETER_NAME
Parameter name
|
static String |
SPLIT_INPUT_AS_PARAMETER_NAME
Parameter name
|
static String |
SPLIT_NON_SPLIT_LIST_PARAMETER_NAME
Parameter name
|
static String |
SPLIT_OUTPUT_AS_PARAMETER_NAME
Parameter name
|
static String |
SPLIT_SPLIT_LIST_PARAMETER_NAME
Parameter name
|
corpus, document
interrupted
name
features
ANNOTATION_COREF_FEATURE_NAME, DATE_ANNOTATION_TYPE, DATE_POSTED_ANNOTATION_TYPE, DEFAULT_FILE, DOCUMENT_COREF_FEATURE_NAME, JOB_ID_ANNOTATION_TYPE, LOCATION_ANNOTATION_TYPE, LOOKUP_ANNOTATION_TYPE, LOOKUP_CLASS_FEATURE_NAME, LOOKUP_INSTANCE_FEATURE_NAME, LOOKUP_LANGUAGE_FEATURE_NAME, LOOKUP_MAJOR_TYPE_FEATURE_NAME, LOOKUP_MINOR_TYPE_FEATURE_NAME, LOOKUP_ONTOLOGY_FEATURE_NAME, MONEY_ANNOTATION_TYPE, ORGANIZATION_ANNOTATION_TYPE, PERSON_ANNOTATION_TYPE, PERSON_GENDER_FEATURE_NAME, PLUGIN_DIR, SENTENCE_ANNOTATION_TYPE, SPACE_TOKEN_ANNOTATION_TYPE, TOKEN_ANNOTATION_TYPE, TOKEN_CATEGORY_FEATURE_NAME, TOKEN_KIND_FEATURE_NAME, TOKEN_LENGTH_FEATURE_NAME, TOKEN_ORTH_FEATURE_NAME, TOKEN_STRING_FEATURE_NAME
Constructor and Description |
---|
RegexSentenceSplitter() |
Modifier and Type | Method and Description |
---|---|
protected Pattern |
compilePattern(URL paternsListUrl,
String encoding) |
void |
execute()
Run the resource.
|
String |
getEncoding() |
URL |
getExternalSplitListURL() |
URL |
getInternalSplitListURL() |
Pattern |
getInternalSplitsPattern() |
URL |
getNonSplitListURL() |
String |
getOutputASName() |
Resource |
init()
Initialise this resource, and return it.
|
void |
setEncoding(String encoding) |
void |
setExternalSplitListURL(URL externalSplitListURL) |
void |
setInternalSplitListURL(URL internalSplitListURL) |
void |
setInternalSplitsPattern(Pattern internalSplitsPattern) |
void |
setNonSplitListURL(URL nonSplitListURL) |
void |
setOutputASName(String outputASName) |
getCorpus, getDocument, setCorpus, setDocument
addProgressListener, addStatusListener, cleanup, fireProcessFinished, fireProgressChanged, fireStatusChanged, getRuntimeParameterValues, getRuntimeParameterValues, interrupt, isInterrupted, reInit, removeProgressListener, removeStatusListener
checkParameterValues, flushBeanInfoCache, forgetBeanInfo, getBeanInfo, getInitParameterValues, getInitParameterValues, getName, getParameterValue, getParameterValue, getParameterValues, removeResourceListeners, setName, setParameterValue, setParameterValue, setParameterValues, setParameterValues, setResourceListeners, toString
getFeatures, setFeatures
clone, equals, finalize, getClass, hashCode, notify, notifyAll, wait, wait, wait
reInit
cleanup, getParameterValue, setParameterValue, setParameterValues
getFeatures, setFeatures
getName, setName
interrupt, isInterrupted
public static final String SPLIT_DOCUMENT_PARAMETER_NAME
public static final String SPLIT_INPUT_AS_PARAMETER_NAME
public static final String SPLIT_OUTPUT_AS_PARAMETER_NAME
public static final String SPLIT_ENCODING_PARAMETER_NAME
public static final String SPLIT_SPLIT_LIST_PARAMETER_NAME
public static final String SPLIT_NON_SPLIT_LIST_PARAMETER_NAME
protected String outputASName
protected URL internalSplitListURL
protected URL externalSplitListURL
protected URL nonSplitListURL
protected Pattern internalSplitsPattern
protected Pattern externalSplitsPattern
protected Pattern nonSplitsPattern
public RegexSentenceSplitter()
protected Pattern compilePattern(URL paternsListUrl, String encoding) throws UnsupportedEncodingException, IOException
public void execute() throws ExecutionException
Description copied from class: AbstractProcessingResource
Specified by: execute in interface Executable
Overrides: execute in class AbstractProcessingResource
Throws: ExecutionException
public Resource init() throws ResourceInstantiationException
Description copied from class: AbstractProcessingResource
Specified by: init in interface Resource
Overrides: init in class AbstractProcessingResource
Throws: ResourceInstantiationException
public String getOutputASName()
@RunTime @Optional @CreoleParameter(comment="The annotation set to be used as output for \'Sentence\' and \'Split\' annotations") public void setOutputASName(String outputASName)
Parameters: outputASName - the outputASName to set
public String getEncoding()
@CreoleParameter(comment="The encoding used for reading the definition files", defaultValue="UTF-8") public void setEncoding(String encoding)
Parameters: encoding - the encoding to set
public URL getInternalSplitListURL()
@CreoleParameter(defaultValue="resources/regex-splitter/internal-split-patterns.txt", suffixes="txt", comment="The URL to the internal splits pattern list") public void setInternalSplitListURL(URL internalSplitListURL)
Parameters: internalSplitListURL - the internalSplitListURL to set
public URL getExternalSplitListURL()
@CreoleParameter(defaultValue="resources/regex-splitter/external-split-patterns.txt", comment="The URL to the external splits pattern list", suffixes="txt") public void setExternalSplitListURL(URL externalSplitListURL)
Parameters: externalSplitListURL - the externalSplitListURL to set
public URL getNonSplitListURL()
@CreoleParameter(defaultValue="resources/regex-splitter/non-split-patterns.txt", comment="The URL to the non splits pattern list", suffixes="txt") public void setNonSplitListURL(URL nonSplitListURL)
Parameters: nonSplitListURL - the nonSplitListURL to set
public Pattern getInternalSplitsPattern()
public void setInternalSplitsPattern(Pattern internalSplitsPattern)
Parameters: internalSplitsPattern - the internalSplitsPattern to set