gate.creole.orthomatcher
Class OrthoMatcher

java.lang.Object
  extended by gate.util.AbstractFeatureBearer
      extended by gate.creole.AbstractResource
          extended by gate.creole.AbstractProcessingResource
              extended by gate.creole.AbstractLanguageAnalyser
                  extended by gate.creole.orthomatcher.OrthoMatcher
All Implemented Interfaces:
ANNIEConstants, Executable, LanguageAnalyser, ProcessingResource, Resource, FeatureBearer, NameBearer, Serializable
Direct Known Subclasses:
SampleOrthoMatcher

public class OrthoMatcher
extends AbstractLanguageAnalyser

See Also:
Serialized Form

Nested Class Summary
 
Nested classes/interfaces inherited from class gate.creole.AbstractProcessingResource
AbstractProcessingResource.InternalStatusListener, AbstractProcessingResource.IntervalProgressListener
 
Field Summary
protected  HashMap alias
           
protected static String ALIASLISTNAME
           
protected  boolean allMatchingNeeded
          This is an internal variable to indicate whether we matched using a rule that requires that the newly matched annotation matches all the others. This is needed because organizations can share first/last tokens like News and be different.
protected  String annotationSetName
          the name of the annotation set
protected  List annotationTypes
          the types of the annotation
protected  HashMap annots2Remove
           
protected static String ARTLISTNAME
           
protected  boolean caseSensitive
           
protected  HashSet cdg
           
protected static String CDGLISTNAME
           
protected  HashMap connector
           
protected static String CONNECTORLISTNAME
           
static boolean DEBUG
           
protected  HashMap def_art
           
protected  boolean extLists
          internal or external list
protected  Boolean highPrecisionOrgs
          Use only high precision rules for Organizations
protected static org.apache.log4j.Logger log
           
protected  Annotation longAnnot
           
protected  List matchesDocFeature
           
protected  boolean matchingUnknowns
          matching unknowns or not
protected  AnnotationSet nameAllAnnots
           
protected  ArrayList<Annotation> normalizedTokensLongAnnot
           
protected  HashMap normalizedTokensMap
           
protected  ArrayList<Annotation> normalizedTokensShortAnnot
           
static String OM_ANN_SET_PARAMETER_NAME
           
static String OM_ANN_TYPES_PARAMETER_NAME
           
static String OM_CASE_SENSITIVE_PARAMETER_NAME
           
static String OM_DOCUMENT_PARAMETER_NAME
           
static String OM_EXT_LISTS_PARAMETER_NAME
           
static String OM_ORG_TYPE_PARAMETER_NAME
           
static String OM_PERSON_TYPE_PARAMETER_NAME
           
protected  String organizationType
          the organization type
protected  String personType
          the person type
protected static String PREPLISTNAME
           
protected  HashMap prepos
           
protected  HashMap processedAnnots
           
protected static String PUNCTUATION_VALUE
           
protected  Annotation shortAnnot
           
protected  HashMap spur_match
           
protected static String SPURLISTNAME
           
protected static String THE_VALUE
           
protected  ArrayList<Annotation> tokensLongAnnot
           
protected  HashMap tokensMap
           
protected  ArrayList<Annotation> tokensShortAnnot
           
protected  String unknownType
           
 
Fields inherited from class gate.creole.AbstractLanguageAnalyser
corpus, document
 
Fields inherited from class gate.creole.AbstractProcessingResource
interrupted
 
Fields inherited from class gate.creole.AbstractResource
name
 
Fields inherited from class gate.util.AbstractFeatureBearer
features
 
Fields inherited from interface gate.creole.ANNIEConstants
ANNOTATION_COREF_FEATURE_NAME, DATE_ANNOTATION_TYPE, DATE_POSTED_ANNOTATION_TYPE, DEFAULT_FILE, DOCUMENT_COREF_FEATURE_NAME, JOB_ID_ANNOTATION_TYPE, LOCATION_ANNOTATION_TYPE, LOOKUP_ANNOTATION_TYPE, LOOKUP_CLASS_FEATURE_NAME, LOOKUP_INSTANCE_FEATURE_NAME, LOOKUP_LANGUAGE_FEATURE_NAME, LOOKUP_MAJOR_TYPE_FEATURE_NAME, LOOKUP_MINOR_TYPE_FEATURE_NAME, LOOKUP_ONTOLOGY_FEATURE_NAME, MONEY_ANNOTATION_TYPE, ORGANIZATION_ANNOTATION_TYPE, PERSON_ANNOTATION_TYPE, PERSON_GENDER_FEATURE_NAME, PLUGIN_DIR, PR_NAMES, SENTENCE_ANNOTATION_TYPE, SPACE_TOKEN_ANNOTATION_TYPE, TOKEN_ANNOTATION_TYPE, TOKEN_CATEGORY_FEATURE_NAME, TOKEN_KIND_FEATURE_NAME, TOKEN_LENGTH_FEATURE_NAME, TOKEN_ORTH_FEATURE_NAME, TOKEN_STRING_FEATURE_NAME
 
Constructor Summary
OrthoMatcher()
           
 
Method Summary
protected  void createAnnotList(String nameFile, String nameList)
          creates the lookup tables
protected  void docCleanup()
           
 void execute()
          Run the resource.
 String getAnnotationSetName()
          get the name of the annotation set
 List getAnnotationTypes()
          get the types of the annotation
 Boolean getCaseSensitive()
          Are we running in a case-sensitive mode?
 URL getDefinitionFileURL()
           
 String getEncoding()
           
 Boolean getExtLists()
           
 Boolean getHighPrecisionOrgs()
           
 Double getMinimumNicknameLikelihood()
           
 String getOrganizationType()
           
 AnnotationOrthography getOrthography()
           
 String getPersonType()
           
 Boolean getProcessUnknown()
          Return whether or not we're processing the Unknown annots
 HashMap getTokensMap()
           
 Resource init()
          Initialise this resource, and return it.
protected  boolean matchAnnotations(Annotation newAnnot, String annotString, Annotation prevAnnot)
           
protected  void matchNameAnnotations()
           
protected  boolean matchOtherAnnots(List toMatchList, Annotation newAnnot, String annotString)
          This method checks whether the new annotation matches all annotations given in the toMatchList (it contains ids). The idea is that the new annotation needs to match all those, because assuming transitivity does not always work when two different entities share a common token: e.g., BT Cellnet and BT and British Telecom.
protected  void matchUnknown(ArrayList<Annotation> sortedAnnotationsForAType)
           
protected  void matchWithPrevious(Annotation nameAnnot, String annotString, ArrayList<Annotation> listOfThisType, int startIndex)
          Attempt to match nameAnnot against all previous annotations of the same type, which are passed down in listOfThisType.
protected  void modifyRules(Map<Integer,OrthoMatcherRule> rules)
          Override this method to add, replace, remove rules
 boolean noMatchRule1(String s1, String s2, Annotation previousAnnot, boolean longerPrevious)
          No Match Rule 1: Avoids the problem of matching David Jones ...
 boolean noMatchRule2(String s1, String s2)
          NoMatch Rule #2: Do we have a mismatch of middle initial?
protected  String normalizeOrganizationName(String annotString, Annotation annot)
          Returns the organization name without a designator and without a leading "The".
protected  void normalizePersonName(Annotation annot)
           
protected  void propagatePropertyToExactMatchingMatches(Annotation updateAnnot, String featureName, Object value)
           
 void setAnnotationSetName(String newAnnotationSetName)
          set the annotation set name
 void setAnnotationTypes(List newType)
          set the types of the annotations
 void setCaseSensitive(Boolean newCase)
          set the caseSensitive flag
 void setDefinitionFileURL(URL definitionFileURL)
           
 void setEncoding(String encoding)
           
 void setExtLists(Boolean newExtLists)
          set the extLists flag
 void setHighPrecisionOrgs(Boolean highPrecisionOrgs)
           
 void setMinimumNicknameLikelihood(Double minimumNicknameLikelihood)
           
 void setOrganizationType(String newOrganizationType)
           
 void setOrthography(AnnotationOrthography orthography)
           
 void setPersonType(String newPersonType)
           
 void setProcessUnknown(Boolean processOrNot)
          set whether to process the Unknown annotations
 
Methods inherited from class gate.creole.AbstractLanguageAnalyser
getCorpus, getDocument, setCorpus, setDocument
 
Methods inherited from class gate.creole.AbstractProcessingResource
addProgressListener, addStatusListener, cleanup, fireProcessFinished, fireProgressChanged, fireStatusChanged, getRuntimeParameterValues, getRuntimeParameterValues, interrupt, isInterrupted, reInit, removeProgressListener, removeStatusListener
 
Methods inherited from class gate.creole.AbstractResource
checkParameterValues, getBeanInfo, getInitParameterValues, getInitParameterValues, getName, getParameterValue, getParameterValue, getParameterValues, removeResourceListeners, setName, setParameterValue, setParameterValue, setParameterValues, setParameterValues, setResourceListeners
 
Methods inherited from class gate.util.AbstractFeatureBearer
getFeatures, setFeatures
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 
Methods inherited from interface gate.ProcessingResource
reInit
 
Methods inherited from interface gate.Resource
cleanup, getParameterValue, setParameterValue, setParameterValues
 
Methods inherited from interface gate.util.FeatureBearer
getFeatures, setFeatures
 
Methods inherited from interface gate.util.NameBearer
getName, setName
 
Methods inherited from interface gate.Executable
interrupt, isInterrupted
 

Field Detail

log

protected static final org.apache.log4j.Logger log

DEBUG

public static final boolean DEBUG
See Also:
Constant Field Values

OM_DOCUMENT_PARAMETER_NAME

public static final String OM_DOCUMENT_PARAMETER_NAME
See Also:
Constant Field Values

OM_ANN_SET_PARAMETER_NAME

public static final String OM_ANN_SET_PARAMETER_NAME
See Also:
Constant Field Values

OM_CASE_SENSITIVE_PARAMETER_NAME

public static final String OM_CASE_SENSITIVE_PARAMETER_NAME
See Also:
Constant Field Values

OM_ANN_TYPES_PARAMETER_NAME

public static final String OM_ANN_TYPES_PARAMETER_NAME
See Also:
Constant Field Values

OM_ORG_TYPE_PARAMETER_NAME

public static final String OM_ORG_TYPE_PARAMETER_NAME
See Also:
Constant Field Values

OM_PERSON_TYPE_PARAMETER_NAME

public static final String OM_PERSON_TYPE_PARAMETER_NAME
See Also:
Constant Field Values

OM_EXT_LISTS_PARAMETER_NAME

public static final String OM_EXT_LISTS_PARAMETER_NAME
See Also:
Constant Field Values

CDGLISTNAME

protected static final String CDGLISTNAME
See Also:
Constant Field Values

ALIASLISTNAME

protected static final String ALIASLISTNAME
See Also:
Constant Field Values

ARTLISTNAME

protected static final String ARTLISTNAME
See Also:
Constant Field Values

PREPLISTNAME

protected static final String PREPLISTNAME
See Also:
Constant Field Values

CONNECTORLISTNAME

protected static final String CONNECTORLISTNAME
See Also:
Constant Field Values

SPURLISTNAME

protected static final String SPURLISTNAME
See Also:
Constant Field Values

PUNCTUATION_VALUE

protected static final String PUNCTUATION_VALUE
See Also:
Constant Field Values

THE_VALUE

protected static final String THE_VALUE
See Also:
Constant Field Values

annotationSetName

protected String annotationSetName
the name of the annotation set


annotationTypes

protected List annotationTypes
the types of the annotation


organizationType

protected String organizationType
the organization type


personType

protected String personType
the person type


unknownType

protected String unknownType

extLists

protected boolean extLists
internal or external list


highPrecisionOrgs

protected Boolean highPrecisionOrgs
Use only high precision rules for Organizations


matchingUnknowns

protected boolean matchingUnknowns
matching unknowns or not


allMatchingNeeded

protected boolean allMatchingNeeded
This is an internal variable to indicate whether we matched using a rule that requires that the newly matched annotation matches all the others. This is needed because organizations can share first/last tokens like News and be different.


caseSensitive

protected boolean caseSensitive

alias

protected HashMap alias

cdg

protected HashSet cdg

spur_match

protected HashMap spur_match

def_art

protected HashMap def_art

connector

protected HashMap connector

prepos

protected HashMap prepos

nameAllAnnots

protected AnnotationSet nameAllAnnots

processedAnnots

protected HashMap processedAnnots

annots2Remove

protected HashMap annots2Remove

matchesDocFeature

protected List matchesDocFeature

tokensMap

protected HashMap tokensMap

normalizedTokensMap

protected HashMap normalizedTokensMap

shortAnnot

protected Annotation shortAnnot

longAnnot

protected Annotation longAnnot

tokensLongAnnot

protected ArrayList<Annotation> tokensLongAnnot

tokensShortAnnot

protected ArrayList<Annotation> tokensShortAnnot

normalizedTokensLongAnnot

protected ArrayList<Annotation> normalizedTokensLongAnnot

normalizedTokensShortAnnot

protected ArrayList<Annotation> normalizedTokensShortAnnot
Constructor Detail

OrthoMatcher

public OrthoMatcher()
Method Detail

getTokensMap

public HashMap getTokensMap()

modifyRules

protected void modifyRules(Map<Integer,OrthoMatcherRule> rules)
Override this method to add, replace, remove rules


init

public Resource init()
              throws ResourceInstantiationException
Initialise this resource, and return it.

Specified by:
init in interface Resource
Overrides:
init in class AbstractProcessingResource
Throws:
ResourceInstantiationException

execute

public void execute()
             throws ExecutionException
Run the resource. It doesn't make sense not to override this in subclasses so the default implementation signals an exception.

Specified by:
execute in interface Executable
Overrides:
execute in class AbstractProcessingResource
Throws:
ExecutionException

matchNameAnnotations

protected void matchNameAnnotations()
                             throws ExecutionException
Throws:
ExecutionException

matchUnknown

protected void matchUnknown(ArrayList<Annotation> sortedAnnotationsForAType)
                     throws ExecutionException
Throws:
ExecutionException

matchWithPrevious

protected void matchWithPrevious(Annotation nameAnnot,
                                 String annotString,
                                 ArrayList<Annotation> listOfThisType,
                                 int startIndex)
Attempt to match nameAnnot against all previous annotations of the same type, which are passed down in listOfThisType. Matches are tested in order from most recent to oldest.

Parameters:
nameAnnot - Annotation we are trying to match
annotString - Normalized string representation of annotation
listOfThisType - ArrayList of Annotations of the same type as nameAnnot
startIndex - Index in listOfThisType that we will start from in matching the current annotation

propagatePropertyToExactMatchingMatches

protected void propagatePropertyToExactMatchingMatches(Annotation updateAnnot,
                                                       String featureName,
                                                       Object value)

matchAnnotations

protected boolean matchAnnotations(Annotation newAnnot,
                                   String annotString,
                                   Annotation prevAnnot)

matchOtherAnnots

protected boolean matchOtherAnnots(List toMatchList,
                                   Annotation newAnnot,
                                   String annotString)
This method checks whether the new annotation matches all annotations given in the toMatchList (it contains ids). The idea is that the new annotation needs to match all those, because assuming transitivity does not always work when two different entities share a common token: e.g., BT Cellnet and BT and British Telecom.


docCleanup

protected void docCleanup()

normalizePersonName

protected void normalizePersonName(Annotation annot)
                            throws ExecutionException
Throws:
ExecutionException

normalizeOrganizationName

protected String normalizeOrganizationName(String annotString,
                                           Annotation annot)
Returns the organization name without a designator and without a leading "The".


createAnnotList

protected void createAnnotList(String nameFile,
                               String nameList)
                        throws IOException
creates the lookup tables

Throws:
IOException

setExtLists

public void setExtLists(Boolean newExtLists)
set the extLists flag


setCaseSensitive

public void setCaseSensitive(Boolean newCase)
set the caseSensitive flag


setAnnotationSetName

public void setAnnotationSetName(String newAnnotationSetName)
set the annotation set name


setAnnotationTypes

public void setAnnotationTypes(List newType)
set the types of the annotations


setProcessUnknown

public void setProcessUnknown(Boolean processOrNot)
set whether to process the Unknown annotations


setOrganizationType

public void setOrganizationType(String newOrganizationType)

setPersonType

public void setPersonType(String newPersonType)

getAnnotationSetName

public String getAnnotationSetName()
get the name of the annotation set


getAnnotationTypes

public List getAnnotationTypes()
get the types of the annotation


getOrganizationType

public String getOrganizationType()

getPersonType

public String getPersonType()

getExtLists

public Boolean getExtLists()

getCaseSensitive

public Boolean getCaseSensitive()
Are we running in a case-sensitive mode?


getProcessUnknown

public Boolean getProcessUnknown()
Return whether or not we're processing the Unknown annots


noMatchRule1

public boolean noMatchRule1(String s1,
                            String s2,
                            Annotation previousAnnot,
                            boolean longerPrevious)
No Match Rule 1: Avoids the problem of matching David Jones ... David ... David Smith. Since "David" was matched with David Jones, we don't match David with David Smith.


noMatchRule2

public boolean noMatchRule2(String s1,
                            String s2)
NoMatch Rule #2: Do we have a mismatch of middle initial? Condition(s): Only applies to person names with more than two tokens in the name. Want George W. Bush != George H. W. Bush, and George Walker Bush != George Herbert Walker Bush, and John T. Smith != John Q. Smith; however, John T. Smith == John Thomas Smith. Be careful about Hillary Rodham Clinton == Hillary Rodham-Clinton; be careful about Carlos Bueno de Lopez == Bueno de Lopez, and Cynthia Morgan de Rothschild == Cynthia de Rothschild.


setDefinitionFileURL

public void setDefinitionFileURL(URL definitionFileURL)

getDefinitionFileURL

public URL getDefinitionFileURL()

setEncoding

public void setEncoding(String encoding)

getEncoding

public String getEncoding()

getMinimumNicknameLikelihood

public Double getMinimumNicknameLikelihood()

setMinimumNicknameLikelihood

public void setMinimumNicknameLikelihood(Double minimumNicknameLikelihood)

getHighPrecisionOrgs

public Boolean getHighPrecisionOrgs()
Returns:
the highPrecisionOrgs

setHighPrecisionOrgs

public void setHighPrecisionOrgs(Boolean highPrecisionOrgs)
Parameters:
highPrecisionOrgs - the highPrecisionOrgs to set

setOrthography

public void setOrthography(AnnotationOrthography orthography)

getOrthography

public AnnotationOrthography getOrthography()