Class

com.johnsnowlabs.nlp.training

CoNLL

Related Doc: package training

Permalink

case class CoNLL(documentCol: String = "document", sentenceCol: String = "sentence", tokenCol: String = "token", posCol: String = "pos", conllLabelIndex: Int = 3, conllPosIndex: Int = 1, conllTextCol: String = "text", labelCol: String = "label", explodeSentences: Boolean = true) extends Product with Serializable

Helper class to load a CoNLL type dataset for training.

The dataset must be in the CoNLL 2003 format and is loaded by passing its path to readDataset. Other CoNLL formats are not supported.

Example

val trainingData = CoNLL().readDataset(spark, "src/test/resources/conll2003/eng.train")
trainingData.selectExpr("text", "token.result as tokens", "pos.result as pos", "label.result as label")
  .show(3, false)
+------------------------------------------------+----------------------------------------------------------+-------------------------------------+-----------------------------------------+
|text                                            |tokens                                                    |pos                                  |label                                    |
+------------------------------------------------+----------------------------------------------------------+-------------------------------------+-----------------------------------------+
|EU rejects German call to boycott British lamb .|[EU, rejects, German, call, to, boycott, British, lamb, .]|[NNP, VBZ, JJ, NN, TO, VB, JJ, NN, .]|[B-ORG, O, B-MISC, O, O, O, B-MISC, O, O]|
|Peter Blackburn                                 |[Peter, Blackburn]                                        |[NNP, NNP]                           |[B-PER, I-PER]                           |
|BRUSSELS 1996-08-22                             |[BRUSSELS, 1996-08-22]                                    |[NNP, CD]                            |[B-LOC, O]                               |
+------------------------------------------------+----------------------------------------------------------+-------------------------------------+-----------------------------------------+

trainingData.printSchema
root
 |-- text: string (nullable = true)
 |-- document: array (nullable = false)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- sentence: array (nullable = false)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- token: array (nullable = false)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- pos: array (nullable = false)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- label: array (nullable = false)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
documentCol

Name of the DOCUMENT Annotator type column

sentenceCol

Name of the column holding the sentences (DOCUMENT Annotator type)

tokenCol

Name of the TOKEN Annotator type column

posCol

Name of the POS Annotator type column

conllLabelIndex

Index of the column for NER Label in the dataset

conllPosIndex

Index of the column for the POS tags in the dataset

conllTextCol

Name of the column for the text in the dataset

labelCol

Name of the NAMED_ENTITY Annotator type column

explodeSentences

Whether to explode each sentence to a separate row

Linear Supertypes
Serializable, Serializable, Product, Equals, AnyRef, Any
Ordering
  1. Alphabetic
  2. By Inheritance
Inherited
  1. CoNLL
  2. Serializable
  3. Serializable
  4. Product
  5. Equals
  6. AnyRef
  7. Any
  1. Hide All
  2. Show All
Visibility
  1. Public
  2. All

Instance Constructors

  1. new CoNLL(documentCol: String = "document", sentenceCol: String = "sentence", tokenCol: String = "token", posCol: String = "pos", conllLabelIndex: Int = 3, conllPosIndex: Int = 1, conllTextCol: String = "text", labelCol: String = "label", explodeSentences: Boolean = true)

    Permalink

    documentCol

    Name of the DOCUMENT Annotator type column

    sentenceCol

    Name of the column holding the sentences (DOCUMENT Annotator type)

    tokenCol

    Name of the TOKEN Annotator type column

    posCol

    Name of the POS Annotator type column

    conllLabelIndex

    Index of the column for NER Label in the dataset

    conllPosIndex

    Index of the column for the POS tags in the dataset

    conllTextCol

    Name of the column for the text in the dataset

    labelCol

    Name of the NAMED_ENTITY Annotator type column

    explodeSentences

    Whether to explode each sentence to a separate row

Value Members

  1. final def !=(arg0: Any): Boolean

    Permalink
    Definition Classes
    AnyRef → Any
  2. final def ##(): Int

    Permalink
    Definition Classes
    AnyRef → Any
  3. final def ==(arg0: Any): Boolean

    Permalink
    Definition Classes
    AnyRef → Any
  4. val annotationType: ArrayType

    Permalink
  5. final def asInstanceOf[T0]: T0

    Permalink
    Definition Classes
    Any
  6. def clearTokens(tokens: Array[IndexedTaggedWord]): Array[IndexedTaggedWord]

    Permalink
  7. def clone(): AnyRef

    Permalink
    Attributes
    protected[java.lang]
    Definition Classes
    AnyRef
    Annotations
    @throws( ... )
  8. val conllLabelIndex: Int

    Permalink

    Index of the column for NER Label in the dataset

  9. val conllPosIndex: Int

    Permalink

    Index of the column for the POS tags in the dataset

  10. val conllTextCol: String

    Permalink

    Name of the column for the text in the dataset

  11. val documentCol: String

    Permalink

    Name of the DOCUMENT Annotator type column

  12. final def eq(arg0: AnyRef): Boolean

    Permalink
    Definition Classes
    AnyRef
  13. val explodeSentences: Boolean

    Permalink

    Whether to explode each sentence to a separate row

  14. def finalize(): Unit

    Permalink
    Attributes
    protected[java.lang]
    Definition Classes
    AnyRef
    Annotations
    @throws( classOf[java.lang.Throwable] )
  15. def getAnnotationType(column: String, annotatorType: String, addMetadata: Boolean = true): StructField

    Permalink
  16. final def getClass(): Class[_]

    Permalink
    Definition Classes
    AnyRef → Any
  17. final def isInstanceOf[T0]: Boolean

    Permalink
    Definition Classes
    Any
  18. val labelCol: String

    Permalink

    Name of the NAMED_ENTITY Annotator type column

  19. final def ne(arg0: AnyRef): Boolean

    Permalink
    Definition Classes
    AnyRef
  20. final def notify(): Unit

    Permalink
    Definition Classes
    AnyRef
  21. final def notifyAll(): Unit

    Permalink
    Definition Classes
    AnyRef
  22. def packAssembly(text: String, isTraining: Boolean = true): Seq[Annotation]

    Permalink
  23. def packDocs(docs: Seq[CoNLLDocument], spark: SparkSession): Dataset[_]

    Permalink
  24. def packNerTagged(sentences: Seq[NerTaggedSentence]): Seq[Annotation]

    Permalink
  25. def packPosTagged(sentences: Seq[TaggedSentence]): Seq[Annotation]

    Permalink
  26. def packSentence(text: String, sentences: Seq[TaggedSentence]): Seq[Annotation]

    Permalink
  27. def packTokenized(text: String, sentences: Seq[TaggedSentence]): Seq[Annotation]

    Permalink
  28. val posCol: String

    Permalink

    Name of the POS Annotator type column

  29. def readDataset(spark: SparkSession, path: String, readAs: String = ReadAs.TEXT.toString): Dataset[_]

    Permalink
  30. def readDatasetFromLines(lines: Array[String], spark: SparkSession): Dataset[_]

    Permalink
  31. def readDocs(er: ExternalResource): Seq[CoNLLDocument]

    Permalink
  32. def readLines(lines: Array[String]): Seq[CoNLLDocument]

    Permalink
  33. def schema: StructType

    Permalink
  34. val sentenceCol: String

    Permalink

    Name of the column holding the sentences (DOCUMENT Annotator type)

  35. final def synchronized[T0](arg0: ⇒ T0): T0

    Permalink
    Definition Classes
    AnyRef
  36. val tokenCol: String

    Permalink

    Name of the TOKEN Annotator type column

  37. final def wait(): Unit

    Permalink
    Definition Classes
    AnyRef
    Annotations
    @throws( ... )
  38. final def wait(arg0: Long, arg1: Int): Unit

    Permalink
    Definition Classes
    AnyRef
    Annotations
    @throws( ... )
  39. final def wait(arg0: Long): Unit

    Permalink
    Definition Classes
    AnyRef
    Annotations
    @throws( ... )

Inherited from Serializable

Inherited from Serializable

Inherited from Product

Inherited from Equals

Inherited from AnyRef

Inherited from Any

Ungrouped