CoNLLU

Instantiates the class to read a CoNLL-U dataset.

The dataset must be in CoNLL-U format. Load it with readDataset, which reads the file and creates a DataFrame with the data.

Example

import com.johnsnowlabs.nlp.training.CoNLLU
import com.johnsnowlabs.nlp.util.io.ResourceHelper

val conlluFile = "src/test/resources/conllu/en.test.conllu"
val conllDataSet = CoNLLU(false).readDataset(ResourceHelper.spark, conlluFile)
conllDataSet.selectExpr("text", "form.result as form", "upos.result as upos", "xpos.result as xpos", "lemma.result as lemma")
  .show(1, false)
+---------------------------------------+----------------------------------------------+---------------------------------------------+------------------------------+--------------------------------------------+
|text                                   |form                                          |upos                                         |xpos                          |lemma                                       |
+---------------------------------------+----------------------------------------------+---------------------------------------------+------------------------------+--------------------------------------------+
|What if Google Morphed Into GoogleOS?  |[What, if, Google, Morphed, Into, GoogleOS, ?]|[PRON, SCONJ, PROPN, VERB, ADP, PROPN, PUNCT]|[WP, IN, NNP, VBD, IN, NNP, .]|[what, if, Google, morph, into, GoogleOS, ?]|
+---------------------------------------+----------------------------------------------+---------------------------------------------+------------------------------+--------------------------------------------+

explodeSentences: Whether to split each sentence into a separate row

Linear Supertypes

Serializable, Serializable, Product, Equals, AnyRef, Any

Instance Constructors

new CoNLLU(explodeSentences: Boolean = true)

explodeSentences
Whether to split each sentence into a separate row

Value Members

final def !=(arg0: Any): Boolean

Definition Classes
AnyRef → Any
final def ##(): Int

Definition Classes
AnyRef → Any
final def ==(arg0: Any): Boolean

Definition Classes
AnyRef → Any
final def asInstanceOf[T0]: T0

Definition Classes
Any
def clone(): AnyRef

Attributes
protected[java.lang]
Definition Classes
AnyRef
Annotations
@throws( classOf[java.lang.CloneNotSupportedException] )
final def eq(arg0: AnyRef): Boolean

Definition Classes
AnyRef
val explodeSentences: Boolean

Whether to split each sentence into a separate row
def finalize(): Unit

Attributes
protected[java.lang]
Definition Classes
AnyRef
Annotations
@throws( classOf[java.lang.Throwable] )
def getAnnotationType(column: String, annotatorType: String, addMetadata: Boolean = true): StructField
final def getClass(): Class[_]

Definition Classes
AnyRef → Any
final def isInstanceOf[T0]: Boolean

Definition Classes
Any
final def ne(arg0: AnyRef): Boolean

Definition Classes
AnyRef
final def notify(): Unit

Definition Classes
AnyRef
final def notifyAll(): Unit

Definition Classes
AnyRef
def packAssembly(text: String, isTraining: Boolean = true): Seq[Annotation]
def packDocs(docs: Seq[CoNLLUDocument], spark: SparkSession): Dataset[_]
def packPosTagged(sentences: Seq[TaggedSentence]): Seq[Annotation]
def packSentence(text: String, sentences: Seq[TaggedSentence]): Seq[Annotation]
def packTokenized(sentences: Seq[TaggedSentence]): Seq[Annotation]
def readDataset(spark: SparkSession, path: String, readAs: String = ReadAs.TEXT.toString): Dataset[_]
def readDatasetFromLines(lines: Array[String], spark: SparkSession): Dataset[_]
def readDocs(er: ExternalResource): Seq[CoNLLUDocument]
def schema: StructType
final def synchronized[T0](arg0: ⇒ T0): T0

Definition Classes
AnyRef
final def wait(): Unit

Definition Classes
AnyRef
Annotations
@throws( classOf[java.lang.InterruptedException] )
final def wait(arg0: Long, arg1: Int): Unit

Definition Classes
AnyRef
Annotations
@throws( classOf[java.lang.InterruptedException] )
final def wait(arg0: Long): Unit

Definition Classes
AnyRef
Annotations
@throws( classOf[java.lang.InterruptedException] )

Related Doc: package training

case class CoNLLU(explodeSentences: Boolean = true) extends Product with Serializable

Example

Instance Constructors

new CoNLLU(explodeSentences: Boolean = true)

Value Members

final def !=(arg0: Any): Boolean

final def ##(): Int

final def ==(arg0: Any): Boolean

final def asInstanceOf[T0]: T0

def clone(): AnyRef

final def eq(arg0: AnyRef): Boolean

val explodeSentences: Boolean

def finalize(): Unit

def getAnnotationType(column: String, annotatorType: String, addMetadata: Boolean = true): StructField

final def getClass(): Class[_]

final def isInstanceOf[T0]: Boolean

final def ne(arg0: AnyRef): Boolean

final def notify(): Unit

final def notifyAll(): Unit

def packAssembly(text: String, isTraining: Boolean = true): Seq[Annotation]

def packDocs(docs: Seq[CoNLLUDocument], spark: SparkSession): Dataset[_]

def packPosTagged(sentences: Seq[TaggedSentence]): Seq[Annotation]

def packSentence(text: String, sentences: Seq[TaggedSentence]): Seq[Annotation]

def packTokenized(sentences: Seq[TaggedSentence]): Seq[Annotation]

def readDataset(spark: SparkSession, path: String, readAs: String = ReadAs.TEXT.toString): Dataset[_]

def readDatasetFromLines(lines: Array[String], spark: SparkSession): Dataset[_]

def readDocs(er: ExternalResource): Seq[CoNLLUDocument]

def schema: StructType

final def synchronized[T0](arg0: ⇒ T0): T0

final def wait(): Unit

final def wait(arg0: Long, arg1: Int): Unit

final def wait(arg0: Long): Unit

Inherited from Serializable

Inherited from Serializable

Inherited from Product

Inherited from Equals

Inherited from AnyRef

Inherited from Any

Ungrouped