public final class Tokenizer extends Object
Modifier and Type | Field and Description |
---|---|
private com.yahoo.language.process.CharacterClasses |
characterClasses |
private int |
indexLastExplicitlyChangedAt |
private int |
parensToEat |
private String |
source |
private SpecialTokens |
specialTokens
Tokens which should be words, regardless of which characters they contain
|
private boolean |
substringSpecialTokens
Whether to also recognize tokens as substrings of other tokens; needed for CJK
|
private List<Token> |
tokens |
Constructor and Description |
---|
Tokenizer(com.yahoo.language.Linguistics linguistics)
Creates a tokenizer which initializes from a given Linguistics
|
Modifier and Type | Method and Description |
---|---|
private boolean |
acceptApostropheAsWordCharacter(Index currentIndex) |
private void |
addToken(Token.Kind kind,
String word,
int start,
int end) |
private void |
addToken(Token token) |
private int |
consumeExact(int start,
Index index) |
private int |
consumeHeuristicExact(int start) |
private int |
consumeSpecialToken(int start) |
private int |
consumeToTerminator(int start,
String terminator) |
private int |
consumeWordOrNumber(int start,
Index currentIndex)
Consumes a word or number and/or possibly a special token starting within this word or number
|
private Index |
determineCurrentIndex(Index defaultIndex,
IndexFacts.Session indexFacts) |
private SpecialTokens.SpecialToken |
getSpecialToken(int start) |
private boolean |
looksLikeExactEnd(int end) |
void |
setSpecialTokens(SpecialTokens specialTokens)
Sets a list of tokens (Strings) which should be returned as WORD tokens regardless
of their content.
|
void |
setSubstringSpecialTokens(boolean substringSpecialTokens)
Sets whether to also recognize tokens as substrings of other tokens; needed for CJK.
|
private boolean |
terminatorStartsAt(int start,
String terminator) |
List<Token> |
tokenize(String string)
Resets this tokenizer and creates tokens from the given string, using
"default" as the default index, and using no index information.
|
List<Token> |
tokenize(String string,
IndexFacts.Session indexFacts)
Resets this tokenizer and creates tokens from the given string, using
"default" as the default index.
|
List<Token> |
tokenize(String string,
String defaultIndexName,
IndexFacts.Session indexFacts)
Resets this tokenizer and creates tokens from the given string.
|
private String source
private SpecialTokens specialTokens
private boolean substringSpecialTokens
private final com.yahoo.language.process.CharacterClasses characterClasses
private int parensToEat
private int indexLastExplicitlyChangedAt
public Tokenizer(com.yahoo.language.Linguistics linguistics)
public void setSpecialTokens(SpecialTokens specialTokens)
public void setSubstringSpecialTokens(boolean substringSpecialTokens)
public List<Token> tokenize(String string)
public List<Token> tokenize(String string, IndexFacts.Session indexFacts)
public List<Token> tokenize(String string, String defaultIndexName, IndexFacts.Session indexFacts)
string
- the string to tokenize
defaultIndexName
- the name of the index to use as default
indexFacts
- information about the indexes we will search
private boolean acceptApostropheAsWordCharacter(Index currentIndex)
private Index determineCurrentIndex(Index defaultIndex, IndexFacts.Session indexFacts)
private int consumeSpecialToken(int start)
private SpecialTokens.SpecialToken getSpecialToken(int start)
private int consumeExact(int start, Index index)
private boolean looksLikeExactEnd(int end)
private int consumeHeuristicExact(int start)
private int consumeToTerminator(int start, String terminator)
private boolean terminatorStartsAt(int start, String terminator)
private int consumeWordOrNumber(int start, Index currentIndex)
private void addToken(Token.Kind kind, String word, int start, int end)
private void addToken(Token token)
Copyright © 2018. All rights reserved.