@CreoleResource(name="ANNIE English Tokeniser", comment="A customisable English tokeniser.", helpURL="http://gate.ac.uk/userguide/sec:annie:tokeniser", icon="tokeniser") public class DefaultTokeniser extends AbstractLanguageAnalyser implements Benchmarkable
A tokeniser composed of a SimpleTokeniser and a Transducer.
The simple tokeniser tokenises the document and the transducer processes its
output.
AbstractProcessingResource.InternalStatusListener, AbstractProcessingResource.IntervalProgressListener
Modifier and Type | Field and Description |
---|---|
static String |
DEF_TOK_ANNOT_SET_PARAMETER_NAME |
static String |
DEF_TOK_DOCUMENT_PARAMETER_NAME |
static String |
DEF_TOK_ENCODING_PARAMETER_NAME |
static String |
DEF_TOK_GRAMRULES_URL_PARAMETER_NAME |
static String |
DEF_TOK_TOKRULES_URL_PARAMETER_NAME |
protected SimpleTokeniser |
tokeniser
the simple tokeniser used for tokenisation
|
protected Transducer |
transducer
the transducer used for post-processing
|
corpus, document
interrupted
name
features
ANNOTATION_COREF_FEATURE_NAME, DATE_ANNOTATION_TYPE, DATE_POSTED_ANNOTATION_TYPE, DEFAULT_FILE, DOCUMENT_COREF_FEATURE_NAME, JOB_ID_ANNOTATION_TYPE, LOCATION_ANNOTATION_TYPE, LOOKUP_ANNOTATION_TYPE, LOOKUP_CLASS_FEATURE_NAME, LOOKUP_INSTANCE_FEATURE_NAME, LOOKUP_LANGUAGE_FEATURE_NAME, LOOKUP_MAJOR_TYPE_FEATURE_NAME, LOOKUP_MINOR_TYPE_FEATURE_NAME, LOOKUP_ONTOLOGY_FEATURE_NAME, MONEY_ANNOTATION_TYPE, ORGANIZATION_ANNOTATION_TYPE, PERSON_ANNOTATION_TYPE, PERSON_GENDER_FEATURE_NAME, PLUGIN_DIR, SENTENCE_ANNOTATION_TYPE, SPACE_TOKEN_ANNOTATION_TYPE, TOKEN_ANNOTATION_TYPE, TOKEN_CATEGORY_FEATURE_NAME, TOKEN_KIND_FEATURE_NAME, TOKEN_LENGTH_FEATURE_NAME, TOKEN_ORTH_FEATURE_NAME, TOKEN_STRING_FEATURE_NAME
Constructor and Description |
---|
DefaultTokeniser() |
Modifier and Type | Method and Description |
---|---|
void |
cleanup()
Should clear all internal data of the resource.
|
void |
execute()
Run the resource.
|
String |
getAnnotationSetName() |
String |
getBenchmarkId()
Returns the benchmark ID of this resource.
|
String |
getEncoding() |
URL |
getTokeniserRulesURL() |
URL |
getTransducerGrammarURL() |
Resource |
init()
Initialise this resource, and return it.
|
void |
interrupt()
Notifies all the PRs in this controller that they should stop their
execution as soon as possible.
|
void |
setAnnotationSetName(String annotationSetName) |
void |
setBenchmarkId(String benchmarkId)
This method sets the benchmarkID for this resource.
|
void |
setEncoding(String encoding) |
void |
setTokeniserRulesURL(URL tokeniserRulesURL) |
void |
setTransducerGrammarURL(URL transducerGrammarURL) |
getCorpus, getDocument, setCorpus, setDocument
addProgressListener, addStatusListener, fireProcessFinished, fireProgressChanged, fireStatusChanged, getRuntimeParameterValues, getRuntimeParameterValues, isInterrupted, reInit, removeProgressListener, removeStatusListener
checkParameterValues, flushBeanInfoCache, forgetBeanInfo, getBeanInfo, getInitParameterValues, getInitParameterValues, getName, getParameterValue, getParameterValue, getParameterValues, removeResourceListeners, setName, setParameterValue, setParameterValue, setParameterValues, setParameterValues, setResourceListeners, toString
getFeatures, setFeatures
clone, equals, finalize, getClass, hashCode, notify, notifyAll, wait, wait, wait
reInit
getParameterValue, setParameterValue, setParameterValues
getFeatures, setFeatures
getName, setName
isInterrupted
public static final String DEF_TOK_DOCUMENT_PARAMETER_NAME
public static final String DEF_TOK_ANNOT_SET_PARAMETER_NAME
public static final String DEF_TOK_TOKRULES_URL_PARAMETER_NAME
public static final String DEF_TOK_GRAMRULES_URL_PARAMETER_NAME
public static final String DEF_TOK_ENCODING_PARAMETER_NAME
protected SimpleTokeniser tokeniser
protected Transducer transducer
public DefaultTokeniser()
public Resource init() throws ResourceInstantiationException
init
in interface Resource
init
in class AbstractProcessingResource
ResourceInstantiationException
public void cleanup()
AbstractProcessingResource
cleanup
in interface Resource
cleanup
in class AbstractProcessingResource
public void execute() throws ExecutionException
AbstractProcessingResource
execute
in interface Executable
execute
in class AbstractProcessingResource
ExecutionException
public void interrupt()
interrupt
in interface Executable
interrupt
in class AbstractProcessingResource
@CreoleParameter(defaultValue="resources/tokeniser/DefaultTokeniser.rules", comment="The URL to the rules file", suffixes="rules") public void setTokeniserRulesURL(URL tokeniserRulesURL)
public URL getTokeniserRulesURL()
@CreoleParameter(defaultValue="UTF-8", comment="The encoding used for reading the definitions") public void setEncoding(String encoding)
public String getEncoding()
@CreoleParameter(defaultValue="resources/tokeniser/postprocess.jape", comment="The URL to the postprocessing transducer", suffixes="jape") public void setTransducerGrammarURL(URL transducerGrammarURL)
public URL getTransducerGrammarURL()
@RunTime @Optional @CreoleParameter(comment="The annotation set to be used for the generated annotations") public void setAnnotationSetName(String annotationSetName)
public String getAnnotationSetName()
public void setBenchmarkId(String benchmarkId)
Benchmarkable
setBenchmarkId
in interface Benchmarkable
benchmarkId
- the benchmark ID, which must not contain spaces, because a space is already
used as a separator in the log; you can use
Benchmark.createBenchmarkId(String, String)
for it.
public String getBenchmarkId()
Benchmarkable
getBenchmarkId
in interface Benchmarkable