|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |
java.lang.Object gate.util.AbstractFeatureBearer gate.creole.AbstractResource gate.creole.AbstractProcessingResource gate.creole.AbstractLanguageAnalyser gate.creole.orthomatcher.OrthoMatcher
public class OrthoMatcher
Nested Class Summary |
---|
Nested classes/interfaces inherited from class gate.creole.AbstractProcessingResource |
---|
AbstractProcessingResource.InternalStatusListener, AbstractProcessingResource.IntervalProgressListener |
Field Summary | |
---|---|
protected HashMap |
alias
|
protected static String |
ALIASLISTNAME
|
protected boolean |
allMatchingNeeded
This is an internal variable to indicate whether we matched using a rule that requires that the newly matched annotation matches all the others. This is needed because organizations can share first/last tokens like News and be different. |
protected String |
annotationSetName
the name of the annotation set |
protected List |
annotationTypes
the types of the annotation |
protected HashMap |
annots2Remove
|
protected static String |
ARTLISTNAME
|
protected boolean |
caseSensitive
|
protected HashSet |
cdg
|
protected static String |
CDGLISTNAME
|
protected HashMap |
connector
|
protected static String |
CONNECTORLISTNAME
|
static boolean |
DEBUG
|
protected HashMap |
def_art
|
protected boolean |
extLists
internal or external list |
protected Boolean |
highPrecisionOrgs
Use only high precision rules for Organizations |
protected static org.apache.log4j.Logger |
log
|
protected Annotation |
longAnnot
|
protected List |
matchesDocFeature
|
protected boolean |
matchingUnknowns
matching unknowns or not |
protected AnnotationSet |
nameAllAnnots
|
protected ArrayList<Annotation> |
normalizedTokensLongAnnot
|
protected HashMap |
normalizedTokensMap
|
protected ArrayList<Annotation> |
normalizedTokensShortAnnot
|
static String |
OM_ANN_SET_PARAMETER_NAME
|
static String |
OM_ANN_TYPES_PARAMETER_NAME
|
static String |
OM_CASE_SENSITIVE_PARAMETER_NAME
|
static String |
OM_DOCUMENT_PARAMETER_NAME
|
static String |
OM_EXT_LISTS_PARAMETER_NAME
|
static String |
OM_ORG_TYPE_PARAMETER_NAME
|
static String |
OM_PERSON_TYPE_PARAMETER_NAME
|
protected String |
organizationType
the organization type |
protected String |
personType
the person type |
protected static String |
PREPLISTNAME
|
protected HashMap |
prepos
|
protected HashMap |
processedAnnots
|
protected static String |
PUNCTUATION_VALUE
|
protected Annotation |
shortAnnot
|
protected HashMap |
spur_match
|
protected static String |
SPURLISTNAME
|
protected static String |
THE_VALUE
|
protected ArrayList<Annotation> |
tokensLongAnnot
|
protected HashMap |
tokensMap
|
protected ArrayList<Annotation> |
tokensShortAnnot
|
protected String |
unknownType
|
Fields inherited from class gate.creole.AbstractLanguageAnalyser |
---|
corpus, document |
Fields inherited from class gate.creole.AbstractProcessingResource |
---|
interrupted |
Fields inherited from class gate.creole.AbstractResource |
---|
name |
Fields inherited from class gate.util.AbstractFeatureBearer |
---|
features |
Constructor Summary | |
---|---|
OrthoMatcher()
|
Method Summary | |
---|---|
protected void |
createAnnotList(String nameFile,
String nameList)
creates the lookup tables |
protected void |
docCleanup()
|
void |
execute()
Run the resource. |
String |
getAnnotationSetName()
get the name of the annotation set |
List |
getAnnotationTypes()
get the types of the annotation |
Boolean |
getCaseSensitive()
Are we running in a case-sensitive mode? |
URL |
getDefinitionFileURL()
|
String |
getEncoding()
|
Boolean |
getExtLists()
|
Boolean |
getHighPrecisionOrgs()
|
Double |
getMinimumNicknameLikelihood()
|
String |
getOrganizationType()
|
AnnotationOrthography |
getOrthography()
|
String |
getPersonType()
|
Boolean |
getProcessUnknown()
Return whether or not we're processing the Unknown annots |
HashMap |
getTokensMap()
|
Resource |
init()
Initialise this resource, and return it. |
protected boolean |
matchAnnotations(Annotation newAnnot,
String annotString,
Annotation prevAnnot)
|
protected void |
matchNameAnnotations()
|
protected boolean |
matchOtherAnnots(List toMatchList,
Annotation newAnnot,
String annotString)
This method checks whether the new annotation matches all annotations given in the toMatchList (it contains ids). The idea is that the new annotation needs to match all those, because assuming transitivity does not always work when two different entities share a common token: e.g., BT Cellnet and BT and British Telecom. |
protected void |
matchUnknown(ArrayList<Annotation> sortedAnnotationsForAType)
|
protected void |
matchWithPrevious(Annotation nameAnnot,
String annotString,
ArrayList<Annotation> listOfThisType,
int startIndex)
Attempt to match nameAnnot against all previous annotations of the same type, which are passed down in listOfThisType. |
protected void |
modifyRules(Map<Integer,OrthoMatcherRule> rules)
Override this method to add, replace, remove rules |
boolean |
noMatchRule1(String s1,
String s2,
Annotation previousAnnot,
boolean longerPrevious)
No Match Rule 1: Avoids the problem of matching David Jones ... |
boolean |
noMatchRule2(String s1,
String s2)
NoMatch Rule #2: Do we have a mismatch of middle initial? |
protected String |
normalizeOrganizationName(String annotString,
Annotation annot)
Returns an organization name without a designator and without a starting "The" |
protected void |
normalizePersonName(Annotation annot)
|
protected void |
propagatePropertyToExactMatchingMatches(Annotation updateAnnot,
String featureName,
Object value)
|
void |
setAnnotationSetName(String newAnnotationSetName)
set the annotation set name |
void |
setAnnotationTypes(List newType)
set the types of the annotations |
void |
setCaseSensitive(Boolean newCase)
set the caseSensitive flag |
void |
setDefinitionFileURL(URL definitionFileURL)
|
void |
setEncoding(String encoding)
|
void |
setExtLists(Boolean newExtLists)
set the extLists flag |
void |
setHighPrecisionOrgs(Boolean highPrecisionOrgs)
|
void |
setMinimumNicknameLikelihood(Double minimumNicknameLikelihood)
|
void |
setOrganizationType(String newOrganizationType)
|
void |
setOrthography(AnnotationOrthography orthography)
|
void |
setPersonType(String newPersonType)
|
void |
setProcessUnknown(Boolean processOrNot)
set whether to process the Unknown annotations |
Methods inherited from class gate.creole.AbstractLanguageAnalyser |
---|
getCorpus, getDocument, setCorpus, setDocument |
Methods inherited from class gate.creole.AbstractProcessingResource |
---|
addProgressListener, addStatusListener, cleanup, fireProcessFinished, fireProgressChanged, fireStatusChanged, getRuntimeParameterValues, getRuntimeParameterValues, interrupt, isInterrupted, reInit, removeProgressListener, removeStatusListener |
Methods inherited from class gate.creole.AbstractResource |
---|
checkParameterValues, getBeanInfo, getInitParameterValues, getInitParameterValues, getName, getParameterValue, getParameterValue, getParameterValues, removeResourceListeners, setName, setParameterValue, setParameterValue, setParameterValues, setParameterValues, setResourceListeners |
Methods inherited from class gate.util.AbstractFeatureBearer |
---|
getFeatures, setFeatures |
Methods inherited from class java.lang.Object |
---|
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait |
Methods inherited from interface gate.ProcessingResource |
---|
reInit |
Methods inherited from interface gate.Resource |
---|
cleanup, getParameterValue, setParameterValue, setParameterValues |
Methods inherited from interface gate.util.FeatureBearer |
---|
getFeatures, setFeatures |
Methods inherited from interface gate.util.NameBearer |
---|
getName, setName |
Methods inherited from interface gate.Executable |
---|
interrupt, isInterrupted |
Field Detail |
---|
protected static final org.apache.log4j.Logger log
public static final boolean DEBUG
public static final String OM_DOCUMENT_PARAMETER_NAME
public static final String OM_ANN_SET_PARAMETER_NAME
public static final String OM_CASE_SENSITIVE_PARAMETER_NAME
public static final String OM_ANN_TYPES_PARAMETER_NAME
public static final String OM_ORG_TYPE_PARAMETER_NAME
public static final String OM_PERSON_TYPE_PARAMETER_NAME
public static final String OM_EXT_LISTS_PARAMETER_NAME
protected static final String CDGLISTNAME
protected static final String ALIASLISTNAME
protected static final String ARTLISTNAME
protected static final String PREPLISTNAME
protected static final String CONNECTORLISTNAME
protected static final String SPURLISTNAME
protected static final String PUNCTUATION_VALUE
protected static final String THE_VALUE
protected String annotationSetName
protected List annotationTypes
protected String organizationType
protected String personType
protected String unknownType
protected boolean extLists
protected Boolean highPrecisionOrgs
protected boolean matchingUnknowns
protected boolean allMatchingNeeded
protected boolean caseSensitive
protected HashMap alias
protected HashSet cdg
protected HashMap spur_match
protected HashMap def_art
protected HashMap connector
protected HashMap prepos
protected AnnotationSet nameAllAnnots
protected HashMap processedAnnots
protected HashMap annots2Remove
protected List matchesDocFeature
protected HashMap tokensMap
protected HashMap normalizedTokensMap
protected Annotation shortAnnot
protected Annotation longAnnot
protected ArrayList<Annotation> tokensLongAnnot
protected ArrayList<Annotation> tokensShortAnnot
protected ArrayList<Annotation> normalizedTokensLongAnnot
protected ArrayList<Annotation> normalizedTokensShortAnnot
Constructor Detail |
---|
public OrthoMatcher()
Method Detail |
---|
public HashMap getTokensMap()
protected void modifyRules(Map<Integer,OrthoMatcherRule> rules)
public Resource init() throws ResourceInstantiationException
init
in interface Resource
init
in class AbstractProcessingResource
ResourceInstantiationException
public void execute() throws ExecutionException
execute
in interface Executable
execute
in class AbstractProcessingResource
ExecutionException
protected void matchNameAnnotations() throws ExecutionException
ExecutionException
protected void matchUnknown(ArrayList<Annotation> sortedAnnotationsForAType) throws ExecutionException
ExecutionException
protected void matchWithPrevious(Annotation nameAnnot, String annotString, ArrayList<Annotation> listOfThisType, int startIndex)
nameAnnot - Annotation we are trying to match
annotString - Normalized string representation of annotation
listOfThisType - ArrayList of Annotations of the same type as nameAnnot
startIndex - Index in listOfThisType that we will start from in matching the current annotation
protected void propagatePropertyToExactMatchingMatches(Annotation updateAnnot, String featureName, Object value)
protected boolean matchAnnotations(Annotation newAnnot, String annotString, Annotation prevAnnot)
protected boolean matchOtherAnnots(List toMatchList, Annotation newAnnot, String annotString)
protected void docCleanup()
protected void normalizePersonName(Annotation annot) throws ExecutionException
ExecutionException
protected String normalizeOrganizationName(String annotString, Annotation annot)
protected void createAnnotList(String nameFile, String nameList) throws IOException
IOException
public void setExtLists(Boolean newExtLists)
public void setCaseSensitive(Boolean newCase)
public void setAnnotationSetName(String newAnnotationSetName)
public void setAnnotationTypes(List newType)
public void setProcessUnknown(Boolean processOrNot)
public void setOrganizationType(String newOrganizationType)
public void setPersonType(String newPersonType)
public String getAnnotationSetName()
public List getAnnotationTypes()
public String getOrganizationType()
public String getPersonType()
public Boolean getExtLists()
public Boolean getCaseSensitive()
public Boolean getProcessUnknown()
public boolean noMatchRule1(String s1, String s2, Annotation previousAnnot, boolean longerPrevious)
public boolean noMatchRule2(String s1, String s2)
public void setDefinitionFileURL(URL definitionFileURL)
public URL getDefinitionFileURL()
public void setEncoding(String encoding)
public String getEncoding()
public Double getMinimumNicknameLikelihood()
public void setMinimumNicknameLikelihood(Double minimumNicknameLikelihood)
public Boolean getHighPrecisionOrgs()
public void setHighPrecisionOrgs(Boolean highPrecisionOrgs)
highPrecisionOrgs - the highPrecisionOrgs to set
public void setOrthography(AnnotationOrthography orthography)
public AnnotationOrthography getOrthography()
|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |