com.lucidworks.spark.analysis

Class LuceneTextAnalyzer

class LuceneTextAnalyzer extends Serializable

This class provides simple access to custom Lucene text processing pipelines, a.k.a. text analyzers, which are specified via a JSON schema containing named analyzer specifications and mappings from field names to analyzers.

Here's an example schema with descriptions inline as comments:

{
  "defaultLuceneMatchVersion": "6.0.0" // Optional.  Supplied to analysis components
                                        //     that don't explicitly specify "luceneMatchVersion".
  "analyzers": [              // Optional.  If not included, all field mappings must be
    {                         //     to fully qualified class names of Lucene Analyzer subclasses.
      "name": "html",         // Required.  Mappings in the "fields" array below refer to this name.
      "charFilters":[{        // Optional.
        "type": "htmlstrip"   // Required. "htmlstrip" is the SPI name for HTMLStripCharFilter
      }],
      "tokenizer": {          // Required.  Only one allowed.
        "type": "standard"    // Required. "standard" is the SPI name for StandardTokenizer
      },
      "filters": [{           // Optional.
          "type": "stop",     // Required.  "stop" is the SPI name for StopFilter
          "ignoreCase": "true",  // Component-specific params
          "format": "snowball",
          "words": "org/apache/lucene/analysis/snowball/english_stop.txt"
        }, {
          "type": "lowercase" // Required. "lowercase" is the SPI name for LowerCaseFilter
      }]
    },
    { "name": "stdtok", "tokenizer": { "type": "standard" } }
  ],
  "fields": [{                // Required.  To lookup an analyzer for a field, first the "name"
                              //     mappings are consulted, and then the "regex" mappings are
                              //     tested, in the order specified.
      "name": "keywords",     // Either "name" or "regex" is required.  "name" matches the field name exactly.
      "analyzer": "org.apache.lucene.analysis.core.KeywordAnalyzer" // FQCN of an Analyzer subclass
    }, {
      "regex": ".*html.*"     // Either "name" or "regex" is required.  "regex" must match the whole field name.
      "analyzer": "html"      // Reference to the named analyzer specified in the "analyzers" section.
    }, {
      "regex": ".+",          // Either "name" or "regex" is required.  "regex" must match the whole field name.
      "analyzer": "stdtok"    // Reference to the named analyzer specified in the "analyzers" section.
  }]
}
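
As a sketch of how this schema might be used (the field names "raw_html" and "title" below are illustrative, and the spark-solr artifact is assumed to be on the classpath):

```scala
import com.lucidworks.spark.analysis.LuceneTextAnalyzer

// The example schema above, without the inline comments, as a Scala string.
val schemaJson = """
{
  "analyzers": [
    { "name": "html",
      "charFilters": [{ "type": "htmlstrip" }],
      "tokenizer": { "type": "standard" },
      "filters": [
        { "type": "stop", "ignoreCase": "true", "format": "snowball",
          "words": "org/apache/lucene/analysis/snowball/english_stop.txt" },
        { "type": "lowercase" }] },
    { "name": "stdtok", "tokenizer": { "type": "standard" } }
  ],
  "fields": [
    { "name": "keywords", "analyzer": "org.apache.lucene.analysis.core.KeywordAnalyzer" },
    { "regex": ".*html.*", "analyzer": "html" },
    { "regex": ".+", "analyzer": "stdtok" }
  ]
}
"""
val analyzer = new LuceneTextAnalyzer(schemaJson)

// "keywords" maps by name to KeywordAnalyzer: the whole value is one token.
analyzer.analyze("keywords", "Hello World")                  // Seq("Hello World")

// "raw_html" matches ".*html.*": strip tags, tokenize, drop stopwords, lowercase.
analyzer.analyze("raw_html", "<p>The <b>quick</b> fox</p>")  // Seq("quick", "fox")

// Any other field matches ".+": StandardTokenizer only, case preserved.
analyzer.analyze("title", "Hello, World!")                   // Seq("Hello", "World")
```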
Linear Supertypes
scala.Serializable, java.io.Serializable, AnyRef, Any

Instance Constructors

  1. new LuceneTextAnalyzer(analysisSchema: String)

    Creates an analyzer wrapper from the given JSON analysis schema.

Value Members

  1. final def !=(arg0: Any): Boolean

    Definition Classes
    AnyRef → Any
  2. final def ##(): Int

    Definition Classes
    AnyRef → Any
  3. final def ==(arg0: Any): Boolean

    Definition Classes
    AnyRef → Any
  4. def analyze(fieldValues: Map[String, String]): Map[String, Seq[String]]

    For each of the field->value pairs in fieldValues, looks up the analyzer mapped to the field in the configured analysis schema and uses it to analyze the value. Returns a map from the fields to the produced token sequences.
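
A sketch, where schemaJson is assumed to hold the example schema from the class description:

```scala
import com.lucidworks.spark.analysis.LuceneTextAnalyzer

val analyzer = new LuceneTextAnalyzer(schemaJson) // schemaJson: the example schema, as a String

// Each field's value is analyzed with the analyzer its name resolves to.
val tokens: Map[String, Seq[String]] = analyzer.analyze(Map(
  "keywords" -> "Big Data",          // KeywordAnalyzer: whole value, one token
  "title"    -> "Spark and Lucene")) // ".+" -> stdtok: StandardTokenizer
// tokens("keywords") == Seq("Big Data")
// tokens("title")    == Seq("Spark", "and", "Lucene")
```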

  5. def analyze(field: String, reader: Reader): Seq[String]


    Looks up the analyzer mapped to the given field from the configured analysis schema, uses it to perform analysis on the given reader, returning the produced token sequence.

  6. def analyze(field: String, str: String): Seq[String]


    Looks up the analyzer mapped to the given field from the configured analysis schema, uses it to perform analysis on the given string, returning the produced token sequence.

  7. def analyze(field: String, o: Any): Seq[String]

  8. def analyzeJava(fieldValues: Map[String, String]): Map[String, List[String]]

    Java-friendly version: for each of the field->value pairs in fieldValues, looks up the analyzer mapped to the field in the configured analysis schema and uses it to analyze the value. Returns a map from the fields to the produced token lists.

  9. def analyzeJava(field: String, reader: Reader): List[String]


    Java-friendly version: looks up the analyzer mapped to the given field from the configured analysis schema, uses it to perform analysis on the given reader, returning the produced token sequence.

  10. def analyzeJava(field: String, str: String): List[String]


    Java-friendly version: looks up the analyzer mapped to the given field from the configured analysis schema, uses it to perform analysis on the given string, returning the produced token sequence.

  11. def analyzeJava(field: String, o: Any): List[String]

  12. def analyzeMV(fieldValues: Map[String, Seq[String]]): Map[String, Seq[String]]

    For each of the field->multi-value pairs in fieldValues, looks up the analyzer mapped to the field in the configured analysis schema and uses it to analyze each of the values. Returns a map from the fields to the flattened concatenation of the produced token sequences.

  13. def analyzeMV(field: String, values: Seq[String]): Seq[String]

    Looks up the analyzer mapped to the given field in the configured analysis schema, uses it to analyze each of the given values, and returns the flattened concatenation of the produced token sequences.
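
A sketch, again assuming schemaJson holds the example schema from the class description:

```scala
import com.lucidworks.spark.analysis.LuceneTextAnalyzer

val analyzer = new LuceneTextAnalyzer(schemaJson) // schemaJson: the example schema, as a String

// Each value is analyzed separately; the per-value token sequences are
// concatenated into one flat sequence ("title" maps to the stdtok analyzer).
analyzer.analyzeMV("title", Seq("First Post", "Second Post"))
// Seq("First", "Post", "Second", "Post")
```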

  14. def analyzeMVJava(fieldValues: Map[String, List[String]]): Map[String, List[String]]

    Java-friendly version: for each of the field->multi-value pairs in fieldValues, looks up the analyzer mapped to the field in the configured analysis schema and uses it to analyze each of the values. Returns a map from the fields to the flattened concatenation of the produced token lists.

  15. def analyzeMVJava(field: String, values: List[String]): List[String]

    Java-friendly version: looks up the analyzer mapped to the given field in the configured analysis schema, uses it to analyze each of the given values, and returns the flattened concatenation of the produced token lists.

  16. final def asInstanceOf[T0]: T0

    Definition Classes
    Any
  17. def clone(): AnyRef

    Attributes
    protected[java.lang]
    Definition Classes
    AnyRef
    Annotations
    @throws( ... )
  18. final def eq(arg0: AnyRef): Boolean

    Definition Classes
    AnyRef
  19. def equals(arg0: Any): Boolean

    Definition Classes
    AnyRef → Any
  20. def finalize(): Unit

    Definition Classes
    AnyRef
    Attributes
    protected[java.lang]
    Annotations
    @throws( classOf[java.lang.Throwable] )
  21. final def getClass(): Class[_]

    Definition Classes
    AnyRef → Any
  22. def getFieldAnalyzer(field: String): Option[Analyzer]


    Returns the analyzer mapped to the given field in the configured analysis schema, if any.
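
For instance, assuming analyzer wraps the example schema from the class description (where the ".+" regex maps every field name, so the lookup is defined; a schema without such a catch-all mapping could yield None):

```scala
import com.lucidworks.spark.analysis.LuceneTextAnalyzer
import org.apache.lucene.analysis.Analyzer

val analyzer = new LuceneTextAnalyzer(schemaJson) // schemaJson: the example schema, as a String

val fieldAnalyzer: Option[Analyzer] = analyzer.getFieldAnalyzer("title")
fieldAnalyzer match {
  case Some(a) => println(s"mapped to ${a.getClass.getName}")
  case None    => println("no analyzer mapped to this field")
}
```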

  23. def hashCode(): Int

    Definition Classes
    AnyRef → Any
  24. def invalidMessages: String

    Messages describing any problems found while validating the analysis schema; empty when the schema is valid.
  25. final def isInstanceOf[T0]: Boolean

    Definition Classes
    Any
  26. def isValid: Boolean

    Whether the configured analysis schema parsed and validated successfully.
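
A sketch of a defensive construction pattern; this assumes invalidMessages carries the validation errors behind isValid:

```scala
import com.lucidworks.spark.analysis.LuceneTextAnalyzer

val analyzer = new LuceneTextAnalyzer(schemaJson) // schemaJson: the schema under test

if (!analyzer.isValid) {
  // e.g. an unknown tokenizer/filter SPI name or malformed JSON
  sys.error(s"Bad analysis schema: ${analyzer.invalidMessages}")
}
```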
  27. final def ne(arg0: AnyRef): Boolean

    Definition Classes
    AnyRef
  28. final def notify(): Unit

    Definition Classes
    AnyRef
  29. final def notifyAll(): Unit

    Definition Classes
    AnyRef
  30. final def synchronized[T0](arg0: ⇒ T0): T0

    Definition Classes
    AnyRef
  31. def toPreAnalyzedJson(field: String, reader: Reader, stored: Boolean): String

    Looks up the analyzer mapped to the given field in the configured analysis schema, uses it to analyze the given reader, and returns a PreAnalyzedField-compatible JSON string with the following serialized attributes:

    • CharTermAttribute (token text)
    • OffsetAttribute (start and end character offsets)
    • PositionIncrementAttribute (token position relative to the previous token)

    If stored = true, the original reader input value, read into a string, will be included as a value to be stored. (Note that the Solr schema for the destination field must be configured to store the value; if it is not, the stored value included in the JSON will be ignored by Solr.)

  32. def toPreAnalyzedJson(field: String, str: String, stored: Boolean): String

    Looks up the analyzer mapped to the given field in the configured analysis schema, uses it to analyze the given string, and returns a PreAnalyzedField-compatible JSON string with the following serialized attributes:

    • CharTermAttribute (token text)
    • OffsetAttribute (start and end character offsets)
    • PositionIncrementAttribute (token position relative to the previous token)

    If stored = true, the original string input value will be included as a value to be stored. (Note that the Solr schema for the destination field must be configured to store the value; if it is not, the stored value included in the JSON will be ignored by Solr.)
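
A sketch using the example schema from the class description; the JSON layout in the comment is illustrative of Solr's PreAnalyzedField JSON format, not an exact transcript:

```scala
import com.lucidworks.spark.analysis.LuceneTextAnalyzer

val analyzer = new LuceneTextAnalyzer(schemaJson) // schemaJson: the example schema, as a String

// "title" resolves to the stdtok analyzer; stored = true carries the
// original text along so Solr can store it (if the field is stored="true").
val json: String = analyzer.toPreAnalyzedJson("title", "Hello World", stored = true)
// json is roughly: {"v":"1","str":"Hello World","tokens":[
//   {"t":"Hello","s":0,"e":5,"i":1},{"t":"World","s":6,"e":11,"i":1}]}
```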

  33. def toString(): String

    Definition Classes
    AnyRef → Any
  34. def tokenStream(fieldName: String, reader: Reader): TokenStream


    Looks up the analyzer mapped to fieldName and returns a org.apache.lucene.analysis.TokenStream for the analyzer to tokenize the contents of reader.

  35. def tokenStream(fieldName: String, text: String): TokenStream


    Looks up the analyzer mapped to fieldName and returns a org.apache.lucene.analysis.TokenStream for the analyzer to tokenize the contents of text.
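
A sketch of consuming the returned stream via the standard Lucene TokenStream protocol (reset, incrementToken, end, close), again assuming schemaJson holds the example schema from the class description:

```scala
import com.lucidworks.spark.analysis.LuceneTextAnalyzer
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute

val analyzer = new LuceneTextAnalyzer(schemaJson) // schemaJson: the example schema, as a String

// "title" resolves to the stdtok analyzer (StandardTokenizer only).
val stream = analyzer.tokenStream("title", "Hello, World!")
val termAtt = stream.addAttribute(classOf[CharTermAttribute])
try {
  stream.reset()
  while (stream.incrementToken()) {
    println(termAtt.toString)  // prints "Hello", then "World"
  }
  stream.end()
} finally {
  stream.close()
}
```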

  36. final def wait(): Unit

    Definition Classes
    AnyRef
    Annotations
    @throws( ... )
  37. final def wait(arg0: Long, arg1: Int): Unit

    Definition Classes
    AnyRef
    Annotations
    @throws( ... )
  38. final def wait(arg0: Long): Unit

    Definition Classes
    AnyRef
    Annotations
    @throws( ... )

Inherited from scala.Serializable

Inherited from java.io.Serializable

Inherited from AnyRef

Inherited from Any
