Class

com.johnsnowlabs.nlp.training

CoNLL

case class CoNLL(documentCol: String = "document", sentenceCol: String = "sentence", tokenCol: String = "token", posCol: String = "pos", conllLabelIndex: Int = 3, conllPosIndex: Int = 1, conllTextCol: String = "text", labelCol: String = "label", explodeSentences: Boolean = true, delimiter: String = " ") extends Product with Serializable

Helper class to load a CoNLL type dataset for training.

The dataset should be in the CoNLL 2003 format and is loaded with readDataset. Other CoNLL formats are not supported.

Two types of input paths are supported:

Folder: a path ending in *, representing a collection of CoNLL files within a directory, e.g. 'path/to/multiple/conlls/*'. Using this pattern reads all of the files into a single DataFrame, so some constraints apply to the schemas of the files (a sketch of this follows the example below).

File: a path to a single file, e.g. 'path/to/single_file.conll'.

Example

val trainingData = CoNLL().readDataset(spark, "src/test/resources/conll2003/eng.train")
trainingData.selectExpr("text", "token.result as tokens", "pos.result as pos", "label.result as label")
  .show(3, false)
+------------------------------------------------+----------------------------------------------------------+-------------------------------------+-----------------------------------------+
|text                                            |tokens                                                    |pos                                  |label                                    |
+------------------------------------------------+----------------------------------------------------------+-------------------------------------+-----------------------------------------+
|EU rejects German call to boycott British lamb .|[EU, rejects, German, call, to, boycott, British, lamb, .]|[NNP, VBZ, JJ, NN, TO, VB, JJ, NN, .]|[B-ORG, O, B-MISC, O, O, O, B-MISC, O, O]|
|Peter Blackburn                                 |[Peter, Blackburn]                                        |[NNP, NNP]                           |[B-PER, I-PER]                           |
|BRUSSELS 1996-08-22                             |[BRUSSELS, 1996-08-22]                                    |[NNP, CD]                            |[B-LOC, O]                               |
+------------------------------------------------+----------------------------------------------------------+-------------------------------------+-----------------------------------------+

trainingData.printSchema
root
 |-- text: string (nullable = true)
 |-- document: array (nullable = false)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- sentence: array (nullable = false)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- token: array (nullable = false)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- pos: array (nullable = false)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- label: array (nullable = false)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
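
Reading a folder of CoNLL files works the same way, by pointing the path at a wildcard as described above. The following is a minimal sketch: the directory path is hypothetical, spark is an existing SparkSession, and the matched files are assumed to share a compatible column layout.

// hypothetical directory; the trailing * matches every CoNLL file in it
val multiFileData = CoNLL().readDataset(spark, "path/to/multiple/conlls/*")
multiFileData.selectExpr("text", "label.result as label").show(3, false)

Since everything matched by the wildcard ends up in one DataFrame, the files presumably need consistent label and POS column positions (see conllLabelIndex and conllPosIndex below).
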
documentCol

Name of the DOCUMENT Annotator type column

sentenceCol

Name of the sentences column, of DOCUMENT Annotator type

tokenCol

Name of the TOKEN Annotator type column

posCol

Name of the POS Annotator type column

conllLabelIndex

Index of the column for the NER label in the dataset

conllPosIndex

Index of the column for the POS tags in the dataset

conllTextCol

Name of the column for the text in the dataset

labelCol

Name of the NAMED_ENTITY Annotator type column

explodeSentences

Whether to explode each sentence to a separate row

delimiter

Delimiter used to separate the columns inside the CoNLL file
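
As a sketch of how these parameters combine (the file path is hypothetical, spark is an existing SparkSession, and the tab delimiter is only an assumption about the file being read):

val conll = CoNLL(
  explodeSentences = false, // do not explode each sentence to a separate row
  delimiter = "\t"          // assumed: columns in this particular file are tab-separated
)
val data = conll.readDataset(spark, "path/to/tab_delimited.conll")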

Linear Supertypes
Serializable, Serializable, Product, Equals, AnyRef, Any

Instance Constructors

  1. new CoNLL(documentCol: String = "document", sentenceCol: String = "sentence", tokenCol: String = "token", posCol: String = "pos", conllLabelIndex: Int = 3, conllPosIndex: Int = 1, conllTextCol: String = "text", labelCol: String = "label", explodeSentences: Boolean = true, delimiter: String = " ")

    documentCol

    Name of the DOCUMENT Annotator type column

    sentenceCol

    Name of the sentences column, of DOCUMENT Annotator type

    tokenCol

    Name of the TOKEN Annotator type column

    posCol

    Name of the POS Annotator type column

    conllLabelIndex

    Index of the column for the NER label in the dataset

    conllPosIndex

    Index of the column for the POS tags in the dataset

    conllTextCol

    Name of the column for the text in the dataset

    labelCol

    Name of the NAMED_ENTITY Annotator type column

    explodeSentences

    Whether to explode each sentence to a separate row

    delimiter

    Delimiter used to separate the columns inside the CoNLL file

Value Members

  1. final def !=(arg0: Any): Boolean

    Definition Classes
    AnyRef → Any
  2. final def ##(): Int

    Definition Classes
    AnyRef → Any
  3. final def ==(arg0: Any): Boolean

    Definition Classes
    AnyRef → Any
  4. val annotationType: ArrayType

  5. final def asInstanceOf[T0]: T0

    Definition Classes
    Any
  6. def clearTokens(tokens: Array[IndexedTaggedWord]): Array[IndexedTaggedWord]

  7. def clone(): AnyRef

    Attributes
    protected[java.lang]
    Definition Classes
    AnyRef
    Annotations
    @throws( ... )
  8. val conllLabelIndex: Int

    Index of the column for the NER label in the dataset

  9. val conllPosIndex: Int

    Index of the column for the POS tags in the dataset

  10. val conllTextCol: String

    Name of the column for the text in the dataset

  11. val delimiter: String

    Delimiter used to separate the columns inside the CoNLL file

  12. val documentCol: String

    Name of the DOCUMENT Annotator type column

  13. final def eq(arg0: AnyRef): Boolean

    Definition Classes
    AnyRef
  14. val explodeSentences: Boolean

    Whether to explode each sentence to a separate row

  15. def finalize(): Unit

    Attributes
    protected[java.lang]
    Definition Classes
    AnyRef
    Annotations
    @throws( classOf[java.lang.Throwable] )
  16. def getAnnotationType(column: String, annotatorType: String, addMetadata: Boolean = true): StructField

  17. final def getClass(): Class[_]

    Definition Classes
    AnyRef → Any
  18. final def isInstanceOf[T0]: Boolean

    Definition Classes
    Any
  19. val labelCol: String

    Name of the NAMED_ENTITY Annotator type column

  20. final def ne(arg0: AnyRef): Boolean

    Definition Classes
    AnyRef
  21. final def notify(): Unit

    Definition Classes
    AnyRef
  22. final def notifyAll(): Unit

    Definition Classes
    AnyRef
  23. def packAssembly(text: String, isTraining: Boolean = true): Seq[Annotation]

  24. def packDocs(docs: Seq[CoNLLDocument], spark: SparkSession): Dataset[_]

  25. def packNerTagged(sentences: Seq[NerTaggedSentence]): Seq[Annotation]

  26. def packPosTagged(sentences: Seq[TaggedSentence]): Seq[Annotation]

  27. def packSentence(text: String, sentences: Seq[TaggedSentence]): Seq[Annotation]

  28. def packTokenized(text: String, sentences: Seq[TaggedSentence]): Seq[Annotation]

  29. val posCol: String

    Name of the POS Annotator type column

  30. def readDataset(spark: SparkSession, path: String, readAs: String = ReadAs.TEXT.toString, parallelism: Int = 8, storageLevel: StorageLevel = StorageLevel.DISK_ONLY): Dataset[_]

    See the usage sketch after this list.
  31. def readDatasetFromLines(lines: Array[String], spark: SparkSession): Dataset[_]

  32. def readDocs(er: ExternalResource): Seq[CoNLLDocument]

  33. def readLines(lines: Array[String]): Seq[CoNLLDocument]

  34. def schema: StructType

  35. val sentenceCol: String

    Name of the sentences column, of DOCUMENT Annotator type

  36. final def synchronized[T0](arg0: ⇒ T0): T0

    Definition Classes
    AnyRef
  37. val tokenCol: String

    Name of the TOKEN Annotator type column

  38. final def wait(): Unit

    Definition Classes
    AnyRef
    Annotations
    @throws( ... )
  39. final def wait(arg0: Long, arg1: Int): Unit

    Definition Classes
    AnyRef
    Annotations
    @throws( ... )
  40. final def wait(arg0: Long): Unit

    Definition Classes
    AnyRef
    Annotations
    @throws( ... )
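
For reference, a sketch of readDataset (member 30 above) with its optional arguments written out. The values are illustrative only, spark is an existing SparkSession, and the imports assume the standard Spark NLP and Spark packages.

import org.apache.spark.storage.StorageLevel
import com.johnsnowlabs.nlp.util.io.ReadAs

val training = CoNLL().readDataset(
  spark,
  path = "src/test/resources/conll2003/eng.train",
  readAs = ReadAs.TEXT.toString,               // default per the signature above
  parallelism = 8,                             // default per the signature above
  storageLevel = StorageLevel.MEMORY_AND_DISK  // overrides the DISK_ONLY default
)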
