Package com.linkedin.dagli.text.token
Class Tokens
java.lang.Object
com.linkedin.dagli.util.cloneable.AbstractCloneable<S>
com.linkedin.dagli.producer.AbstractChildProducer<R,I,S>
com.linkedin.dagli.transformer.AbstractPreparedTransformer2<A,B,R,S>
com.linkedin.dagli.transformer.AbstractPreparedStatefulTransformer2<java.util.Locale,java.lang.CharSequence,java.util.List<java.lang.String>,java.util.concurrent.ConcurrentHashMap<java.util.Locale,com.linkedin.dagli.util.function.Function1<java.lang.CharSequence,java.util.List<java.lang.String>>>,Tokens>
com.linkedin.dagli.text.token.Tokens
- All Implemented Interfaces:
com.linkedin.dagli.producer.ChildProducer<java.util.List<java.lang.String>>
,com.linkedin.dagli.producer.Producer<java.util.List<java.lang.String>>
,com.linkedin.dagli.producer.ProducerType<java.util.List<java.lang.String>,com.linkedin.dagli.transformer.PreparedTransformer<java.util.List<java.lang.String>>>
,com.linkedin.dagli.transformer.PreparedTransformer<java.util.List<java.lang.String>>
,com.linkedin.dagli.transformer.PreparedTransformer2<java.util.Locale,java.lang.CharSequence,java.util.List<java.lang.String>>
,com.linkedin.dagli.transformer.Transformer<java.util.List<java.lang.String>>
,com.linkedin.dagli.transformer.Transformer2<java.util.Locale,java.lang.CharSequence,java.util.List<java.lang.String>>
,com.linkedin.dagli.transformer.TransformerWithInputBound<java.lang.Object,java.util.List<java.lang.String>>
,com.linkedin.dagli.util.named.Named
,java.io.Serializable
,java.lang.Cloneable
@ValueEquality public class Tokens extends com.linkedin.dagli.transformer.AbstractPreparedStatefulTransformer2<java.util.Locale,java.lang.CharSequence,java.util.List<java.lang.String>,java.util.concurrent.ConcurrentHashMap<java.util.Locale,com.linkedin.dagli.util.function.Function1<java.lang.CharSequence,java.util.List<java.lang.String>>>,Tokens>
Tokenizes a string of natural text into its constituent tokens (words, punctuation, emoticons, etc.) using the best
available tokenizer for each locale provided.
The locale may be set once for all examples using withLocale(Locale), or provided for each example using
withLocaleInput(Producer). The default is US English (en-US) unless configured otherwise.
E.g. a tokenizer created with new Tokens().withTextInput(texts).withLocale(Locale.FRENCH) will tokenize all
the "texts" inputs under the assumption that they are all in French.
Alternatively, new Tokens().withTextInput(texts).withLocaleInput(locales) will tokenize each text input using the
tokenizer appropriate for its corresponding locale input; this allows a single Tokens instance to tokenize inputs in
multiple, dynamically-determined languages.
Please note that the underlying tokenizer may change in future versions (be replaced, be retrained, etc.).
Current language support:
English and German are considered to be well supported.
Other Latin-charset languages should also tokenize well.
Non-Latin languages such as Hindi and Russian may have poor results.
Logographic languages such as Chinese or Japanese will tokenize very poorly.
- See Also:
- Serialized Form
-
Nested Class Summary
Nested classes/interfaces inherited from class com.linkedin.dagli.transformer.AbstractPreparedStatefulTransformer2
com.linkedin.dagli.transformer.AbstractPreparedStatefulTransformer2.InternalAPI
-
Field Summary
-
Constructor Summary
Constructors Constructor Description Tokens()
Creates a new tokenizer with the default locale of US English. -
Method Summary
Modifier and Type Method Description protected java.util.List<java.lang.String>
apply(java.util.concurrent.ConcurrentHashMap<java.util.Locale,com.linkedin.dagli.util.function.Function1<java.lang.CharSequence,java.util.List<java.lang.String>>> executionCache, java.util.Locale locale, java.lang.CharSequence text)
protected Tokens
clone()
protected boolean
computeEqualsUnsafe(Tokens arg0)
protected int
computeHashCode()
protected java.util.concurrent.ConcurrentHashMap<java.util.Locale,com.linkedin.dagli.util.function.Function1<java.lang.CharSequence,java.util.List<java.lang.String>>>
createExecutionCache(long exampleCountGuess)
boolean
equals(java.lang.Object arg0)
protected com.linkedin.dagli.reducer.ClassReducerTable
getClassReducerTable()
protected java.lang.String
getDefaultName()
protected java.lang.String
getDefaultShortName()
protected java.util.Collection<? extends com.linkedin.dagli.reducer.Reducer<? super Tokens>>
getGraphReducers()
protected com.linkedin.dagli.handle.ProducerHandle<Tokens>
getHandle()
protected com.linkedin.dagli.producer.Producer<? extends java.util.Locale>
getInput1()
protected com.linkedin.dagli.producer.Producer<? extends java.lang.CharSequence>
getInput2()
protected java.util.List<com.linkedin.dagli.producer.Producer<?>>
getInputList()
java.lang.String
getName()
protected java.lang.reflect.Type
getResultSupertype()
java.lang.String
getShortName()
protected boolean
handleEquality(Tokens arg0)
protected int
handleHashCode()
protected boolean
hasAlwaysConstantResult()
boolean
hasConstantResult()
int
hashCode()
protected boolean
hasName()
protected com.linkedin.dagli.dag.Graph<java.lang.Object>
subgraph()
java.lang.String
toString()
protected Tokens
withAllInputs(com.linkedin.dagli.producer.Producer<? extends java.util.Locale> arg0, com.linkedin.dagli.producer.Producer<? extends java.lang.CharSequence> arg1)
protected Tokens
withInput1(com.linkedin.dagli.producer.Producer<? extends java.util.Locale> arg0)
protected Tokens
withInput2(com.linkedin.dagli.producer.Producer<? extends java.lang.CharSequence> arg0)
Tokens
withLocale(java.util.Locale locale)
Sets the locale of the text.
Tokens
withLocaleInput(com.linkedin.dagli.producer.Producer<? extends java.util.Locale> localeInput)
Sets the input that will provide the locale of the text.
Tokens
withName(java.lang.String arg0)
Tokens
withTextInput(com.linkedin.dagli.producer.Producer<? extends java.lang.CharSequence> textInput)
Sets the input that will provide the text to be tokenized.
Methods inherited from class com.linkedin.dagli.transformer.AbstractPreparedStatefulTransformer2
apply, applyAll, createInternalAPI, getPreferredMinibatchSize
Methods inherited from class com.linkedin.dagli.transformer.AbstractPreparedTransformer2
internalAPI
Methods inherited from class com.linkedin.dagli.producer.AbstractChildProducer
validate
Methods inherited from class com.linkedin.dagli.util.cloneable.AbstractCloneable
clone
Methods inherited from class java.lang.Object
finalize, getClass, notify, notifyAll, wait, wait, wait
Methods inherited from interface com.linkedin.dagli.transformer.PreparedTransformer2
applyAll
Methods inherited from interface com.linkedin.dagli.producer.Producer
getName, getShortName, hasConstantResult, validate
-
Field Details
-
_input1
protected com.linkedin.dagli.producer.Producer<? extends java.util.Locale> _input1 -
_input2
protected com.linkedin.dagli.producer.Producer<? extends java.lang.CharSequence> _input2
-
-
Constructor Details
-
Tokens
public Tokens()
Creates a new tokenizer with the default locale of US English.
-
-
Method Details
-
withLocaleInput
public Tokens withLocaleInput(com.linkedin.dagli.producer.Producer<? extends java.util.Locale> localeInput)
Sets the input that will provide the locale of the text. Different tokenizers may be used for different locales. If no locale input is specified, the default locale is Locale.US.
- Parameters:
localeInput - the input that will provide the locale for the text
- Returns:
a copy of this instance that will use the specified input
-
withLocale
Sets the locale of the text. Different tokenizers may be used for different locales. If no locale is specified, the default locale is Locale.US.
- Parameters:
locale - locale of the text
- Returns:
a copy of this instance that will use the specified locale
-
withTextInput
public Tokens withTextInput(com.linkedin.dagli.producer.Producer<? extends java.lang.CharSequence> textInput)
Sets the input that will provide the text to be tokenized.
- Parameters:
textInput - the input that will provide the text
- Returns:
a copy of this instance that will use the specified input
-
createExecutionCache
protected java.util.concurrent.ConcurrentHashMap<java.util.Locale,com.linkedin.dagli.util.function.Function1<java.lang.CharSequence,java.util.List<java.lang.String>>> createExecutionCache(long exampleCountGuess)
- Overrides:
createExecutionCache in class com.linkedin.dagli.transformer.AbstractPreparedStatefulTransformer2<java.util.Locale,java.lang.CharSequence,java.util.List<java.lang.String>,java.util.concurrent.ConcurrentHashMap<java.util.Locale,com.linkedin.dagli.util.function.Function1<java.lang.CharSequence,java.util.List<java.lang.String>>>,Tokens>
-
apply
protected java.util.List<java.lang.String> apply(java.util.concurrent.ConcurrentHashMap<java.util.Locale,com.linkedin.dagli.util.function.Function1<java.lang.CharSequence,java.util.List<java.lang.String>>> executionCache, java.util.Locale locale, java.lang.CharSequence text)
- Specified by:
apply in class com.linkedin.dagli.transformer.AbstractPreparedStatefulTransformer2<java.util.Locale,java.lang.CharSequence,java.util.List<java.lang.String>,java.util.concurrent.ConcurrentHashMap<java.util.Locale,com.linkedin.dagli.util.function.Function1<java.lang.CharSequence,java.util.List<java.lang.String>>>,Tokens>
-
getInputList
protected java.util.List<com.linkedin.dagli.producer.Producer<?>> getInputList() -
getInput1
protected com.linkedin.dagli.producer.Producer<? extends java.util.Locale> getInput1() -
getInput2
protected com.linkedin.dagli.producer.Producer<? extends java.lang.CharSequence> getInput2() -
withAllInputs
protected Tokens withAllInputs(com.linkedin.dagli.producer.Producer<? extends java.util.Locale> arg0, com.linkedin.dagli.producer.Producer<? extends java.lang.CharSequence> arg1) -
withInput1
-
withInput2
protected Tokens withInput2(com.linkedin.dagli.producer.Producer<? extends java.lang.CharSequence> arg0) -
getName
public java.lang.String getName()
- Specified by:
getName in interface com.linkedin.dagli.util.named.Named
- Specified by:
getName in interface com.linkedin.dagli.producer.Producer<R extends java.lang.Object>
-
getShortName
public java.lang.String getShortName()
- Specified by:
getShortName in interface com.linkedin.dagli.util.named.Named
- Specified by:
getShortName in interface com.linkedin.dagli.producer.Producer<R extends java.lang.Object>
-
hasName
protected boolean hasName() -
getDefaultName
protected java.lang.String getDefaultName() -
getDefaultShortName
protected java.lang.String getDefaultShortName() -
withName
-
getGraphReducers
protected java.util.Collection<? extends com.linkedin.dagli.reducer.Reducer<? super Tokens>> getGraphReducers() -
getClassReducerTable
protected com.linkedin.dagli.reducer.ClassReducerTable getClassReducerTable() -
hasAlwaysConstantResult
protected boolean hasAlwaysConstantResult() -
hasConstantResult
public final boolean hasConstantResult()
- Specified by:
hasConstantResult in interface com.linkedin.dagli.producer.Producer<R extends java.lang.Object>
-
subgraph
protected com.linkedin.dagli.dag.Graph<java.lang.Object> subgraph() -
hashCode
public final int hashCode()
- Overrides:
hashCode in class java.lang.Object
-
equals
public final boolean equals(java.lang.Object arg0)
- Overrides:
equals in class java.lang.Object
-
computeEqualsUnsafe
-
computeHashCode
protected int computeHashCode() -
handleEquality
-
handleHashCode
protected int handleHashCode() -
clone
- Overrides:
clone in class com.linkedin.dagli.util.cloneable.AbstractCloneable<S extends com.linkedin.dagli.producer.AbstractProducer<R,I,S>>
-
getHandle
-
toString
public java.lang.String toString()
- Overrides:
toString in class java.lang.Object
-
getResultSupertype
protected java.lang.reflect.Type getResultSupertype()
-