Class

com.johnsnowlabs.nlp.training

CoNLL

case class CoNLL(documentCol: String = "document", sentenceCol: String = "sentence", tokenCol: String = "token", posCol: String = "pos", conllLabelIndex: Int = 3, conllPosIndex: Int = 1, conllTextCol: String = "text", labelCol: String = "label", explodeSentences: Boolean = true, delimiter: String = " ") extends Product with Serializable

Helper class to load a CoNLL type dataset for training.

The dataset should be in the CoNLL 2003 format and is loaded with readDataset. Other CoNLL formats are not supported.

Two types of input paths are supported:

Folder: a path ending in *, representing a collection of CoNLL files within a directory, e.g. 'path/to/multiple/conlls/*'. Using this pattern reads all of the files into a single DataFrame, so some constraints apply to the schemas of the files (a sketch of this follows the example below).

File: a path to a single file, e.g. 'path/to/single_file.conll'.

Example

val trainingData = CoNLL().readDataset(spark, "src/test/resources/conll2003/eng.train")
trainingData.selectExpr("text", "token.result as tokens", "pos.result as pos", "label.result as label")
  .show(3, false)
+------------------------------------------------+----------------------------------------------------------+-------------------------------------+-----------------------------------------+
|text                                            |tokens                                                    |pos                                  |label                                    |
+------------------------------------------------+----------------------------------------------------------+-------------------------------------+-----------------------------------------+
|EU rejects German call to boycott British lamb .|[EU, rejects, German, call, to, boycott, British, lamb, .]|[NNP, VBZ, JJ, NN, TO, VB, JJ, NN, .]|[B-ORG, O, B-MISC, O, O, O, B-MISC, O, O]|
|Peter Blackburn                                 |[Peter, Blackburn]                                        |[NNP, NNP]                           |[B-PER, I-PER]                           |
|BRUSSELS 1996-08-22                             |[BRUSSELS, 1996-08-22]                                    |[NNP, CD]                            |[B-LOC, O]                               |
+------------------------------------------------+----------------------------------------------------------+-------------------------------------+-----------------------------------------+

trainingData.printSchema
root
 |-- text: string (nullable = true)
 |-- document: array (nullable = false)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- sentence: array (nullable = false)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- token: array (nullable = false)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- pos: array (nullable = false)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- label: array (nullable = false)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
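
Reading a folder of CoNLL files works the same way, by pointing the path at a wildcard as described above. The following is a minimal sketch: the directory path is hypothetical, spark is an existing SparkSession, and the matched files are assumed to share a compatible column layout.

// hypothetical directory; the trailing * matches every CoNLL file in it
val multiFileData = CoNLL().readDataset(spark, "path/to/multiple/conlls/*")
multiFileData.selectExpr("text", "label.result as label").show(3, false)

Since everything matched by the wildcard ends up in one DataFrame, the files presumably need consistent label and POS column positions (see conllLabelIndex and conllPosIndex below).
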
documentCol

Name of the DOCUMENT Annotator type column

sentenceCol

Name of the sentences column, of DOCUMENT Annotator type

tokenCol

Name of the TOKEN Annotator type column

posCol

Name of the POS Annotator type column

conllLabelIndex

Index of the column for the NER label in the dataset

conllPosIndex

Index of the column for the POS tags in the dataset

conllTextCol

Name of the column for the text in the dataset

labelCol

Name of the NAMED_ENTITY Annotator type column

explodeSentences

Whether to explode each sentence to a separate row

delimiter

Delimiter used to separate the columns inside the CoNLL file
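
As a sketch of how these parameters combine (the file path is hypothetical, spark is an existing SparkSession, and the tab delimiter is only an assumption about the file being read):

val conll = CoNLL(
  explodeSentences = false, // do not explode each sentence to a separate row
  delimiter = "\t"          // assumed: columns in this particular file are tab-separated
)
val data = conll.readDataset(spark, "path/to/tab_delimited.conll")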

Linear Supertypes
Serializable, Serializable, Product, Equals, AnyRef, Any

Instance Constructors

  1. new CoNLL(documentCol: String = "document", sentenceCol: String = "sentence", tokenCol: String = "token", posCol: String = "pos", conllLabelIndex: Int = 3, conllPosIndex: Int = 1, conllTextCol: String = "text", labelCol: String = "label", explodeSentences: Boolean = true, delimiter: String = " ")

    documentCol

    Name of the DOCUMENT Annotator type column

    sentenceCol

    Name of the sentences column, of DOCUMENT Annotator type

    tokenCol

    Name of the TOKEN Annotator type column

    posCol

    Name of the POS Annotator type column

    conllLabelIndex

    Index of the column for the NER label in the dataset

    conllPosIndex

    Index of the column for the POS tags in the dataset

    conllTextCol

    Name of the column for the text in the dataset

    labelCol

    Name of the NAMED_ENTITY Annotator type column

    explodeSentences

    Whether to explode each sentence to a separate row

    delimiter

    Delimiter used to separate the columns inside the CoNLL file

Value Members

  1. final def !=(arg0: Any): Boolean

    Definition Classes
    AnyRef → Any
  2. final def ##(): Int

    Definition Classes
    AnyRef → Any
  3. final def ==(arg0: Any): Boolean

    Definition Classes
    AnyRef → Any
  4. val annotationType: ArrayType

  5. final def asInstanceOf[T0]: T0

    Definition Classes
    Any
  6. def clearTokens(tokens: Array[IndexedTaggedWord]): Array[IndexedTaggedWord]

  7. def clone(): AnyRef

    Attributes
    protected[java.lang]
    Definition Classes
    AnyRef
    Annotations
    @throws( ... )
  8. val conllLabelIndex: Int

    Index of the column for the NER label in the dataset

  9. val conllPosIndex: Int

    Index of the column for the POS tags in the dataset

  10. val conllTextCol: String

    Name of the column for the text in the dataset

  11. val delimiter: String

    Delimiter used to separate the columns inside the CoNLL file

  12. val documentCol: String

    Name of the DOCUMENT Annotator type column

  13. final def eq(arg0: AnyRef): Boolean

    Definition Classes
    AnyRef
  14. val explodeSentences: Boolean

    Whether to explode each sentence to a separate row

  15. def finalize(): Unit

    Attributes
    protected[java.lang]
    Definition Classes
    AnyRef
    Annotations
    @throws( classOf[java.lang.Throwable] )
  16. def getAnnotationType(column: String, annotatorType: String, addMetadata: Boolean = true): StructField

  17. final def getClass(): Class[_]

    Definition Classes
    AnyRef → Any
  18. final def isInstanceOf[T0]: Boolean

    Definition Classes
    Any
  19. val labelCol: String

    Name of the NAMED_ENTITY Annotator type column

  20. final def ne(arg0: AnyRef): Boolean

    Definition Classes
    AnyRef
  21. final def notify(): Unit

    Definition Classes
    AnyRef
  22. final def notifyAll(): Unit

    Definition Classes
    AnyRef
  23. def packAssembly(text: String, isTraining: Boolean = true): Seq[Annotation]

  24. def packDocs(docs: Seq[CoNLLDocument], spark: SparkSession): Dataset[_]

  25. def packNerTagged(sentences: Seq[NerTaggedSentence]): Seq[Annotation]

  26. def packPosTagged(sentences: Seq[TaggedSentence]): Seq[Annotation]

  27. def packSentence(text: String, sentences: Seq[TaggedSentence]): Seq[Annotation]

  28. def packTokenized(text: String, sentences: Seq[TaggedSentence]): Seq[Annotation]

  29. val posCol: String

    Name of the POS Annotator type column

  30. def readDataset(spark: SparkSession, path: String, readAs: String = ReadAs.TEXT.toString, parallelism: Int = 8, storageLevel: StorageLevel = StorageLevel.DISK_ONLY): Dataset[_]

    See the usage sketch after this list.
  31. def readDatasetFromLines(lines: Array[String], spark: SparkSession): Dataset[_]

  32. def readDocs(er: ExternalResource): Seq[CoNLLDocument]

  33. def readLines(lines: Array[String]): Seq[CoNLLDocument]

  34. def schema: StructType

  35. val sentenceCol: String

    Name of the sentences column, of DOCUMENT Annotator type

  36. final def synchronized[T0](arg0: ⇒ T0): T0

    Definition Classes
    AnyRef
  37. val tokenCol: String

    Name of the TOKEN Annotator type column

  38. final def wait(): Unit

    Definition Classes
    AnyRef
    Annotations
    @throws( ... )
  39. final def wait(arg0: Long, arg1: Int): Unit

    Definition Classes
    AnyRef
    Annotations
    @throws( ... )
  40. final def wait(arg0: Long): Unit

    Definition Classes
    AnyRef
    Annotations
    @throws( ... )
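
For reference, a sketch of readDataset (member 30 above) with its optional arguments written out. The values are illustrative only, spark is an existing SparkSession, and the imports assume the standard Spark NLP and Spark packages.

import org.apache.spark.storage.StorageLevel
import com.johnsnowlabs.nlp.util.io.ReadAs

val training = CoNLL().readDataset(
  spark,
  path = "src/test/resources/conll2003/eng.train",
  readAs = ReadAs.TEXT.toString,               // default per the signature above
  parallelism = 8,                             // default per the signature above
  storageLevel = StorageLevel.MEMORY_AND_DISK  // overrides the DISK_ONLY default
)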
