POS

Helper class for creating DataFrames for training a part-of-speech tagger.

The dataset needs to contain one sentence per line, where each word is joined to its respective tag by a delimiter:

Pierre|NNP Vinken|NNP ,|, 61|CD years|NNS old|JJ ,|, will|MD join|VB the|DT board|NN as|IN a|DT nonexecutive|JJ director|NN Nov.|NNP 29|CD .|.

The sentence can then be parsed with readDataset into a column with annotations of type POS.

Example

In this example, the file test-training.txt has the content of the sentence above.

import com.johnsnowlabs.nlp.training.POS

val pos = POS()
val path = "src/test/resources/anc-pos-corpus-small/test-training.txt"
val posDf = pos.readDataset(spark, path, "|", "tags")

posDf.selectExpr("explode(tags) as tags").show(false)
+---------------------------------------------+
|tags                                         |
+---------------------------------------------+
|[pos, 0, 5, NNP, [word -> Pierre], []]       |
|[pos, 7, 12, NNP, [word -> Vinken], []]      |
|[pos, 14, 14, ,, [word -> ,], []]            |
|[pos, 16, 17, CD, [word -> 61], []]          |
|[pos, 19, 23, NNS, [word -> years], []]      |
|[pos, 25, 27, JJ, [word -> old], []]         |
|[pos, 29, 29, ,, [word -> ,], []]            |
|[pos, 31, 34, MD, [word -> will], []]        |
|[pos, 36, 39, VB, [word -> join], []]        |
|[pos, 41, 43, DT, [word -> the], []]         |
|[pos, 45, 49, NN, [word -> board], []]       |
|[pos, 51, 52, IN, [word -> as], []]          |
|[pos, 47, 47, DT, [word -> a], []]           |
|[pos, 56, 67, JJ, [word -> nonexecutive], []]|
|[pos, 69, 76, NN, [word -> director], []]    |
|[pos, 78, 81, NNP, [word -> Nov.], []]       |
|[pos, 83, 84, CD, [word -> 29], []]          |
|[pos, 81, 81, ., [word -> .], []]            |
+---------------------------------------------+

Linear Supertypes

Serializable, Serializable, Product, Equals, AnyRef, Any

Instance Constructors

new POS()

Value Members

final def !=(arg0: Any): Boolean

Definition Classes
AnyRef → Any
final def ##(): Int

Definition Classes
AnyRef → Any
final def ==(arg0: Any): Boolean

Definition Classes
AnyRef → Any
final def asInstanceOf[T0]: T0

Definition Classes
Any
def clone(): AnyRef

Attributes
protected[java.lang]
Definition Classes
AnyRef
Annotations
@throws( ... )
final def eq(arg0: AnyRef): Boolean

Definition Classes
AnyRef
def finalize(): Unit

Attributes
protected[java.lang]
Definition Classes
AnyRef
Annotations
@throws( classOf[java.lang.Throwable] )
final def getClass(): Class[_]

Definition Classes
AnyRef → Any
final def isInstanceOf[T0]: Boolean

Definition Classes
Any
final def ne(arg0: AnyRef): Boolean

Definition Classes
AnyRef
final def notify(): Unit

Definition Classes
AnyRef
final def notifyAll(): Unit

Definition Classes
AnyRef
def readDataset(sparkSession: SparkSession, path: String, delimiter: String = "|", outputPosCol: String = "tags", outputDocumentCol: String = "document", outputTextCol: String = "text"): DataFrame

Reads the provided dataset file with given parameters and returns a DataFrame ready for training a part-of-speech tagger.
Reads the provided dataset file with given parameters and returns a DataFrame ready for training a part-of-speech tagger.
sparkSession
Current Spark session
path
Path to the resource
delimiter
Delimiter used to separate each word from its tag in the text
outputPosCol
Name for the output column of the part-of-speech tags
outputDocumentCol
Name for the DocumentAssembler column
outputTextCol
Name for the column of the raw text
returns
DataFrame of parsed text
def readFromDataframe(posDataframe: DataFrame, tokensCol: String = "tokens", labelsCol: String = "labels", outPutDocColName: String = "text", outPutPosColName: String = "tags"): DataFrame
final def synchronized[T0](arg0: ⇒ T0): T0

Definition Classes
AnyRef
final def wait(): Unit

Definition Classes
AnyRef
Annotations
@throws( ... )
final def wait(arg0: Long, arg1: Int): Unit

Definition Classes
AnyRef
Annotations
@throws( ... )
final def wait(arg0: Long): Unit

Definition Classes
AnyRef
Annotations
@throws( ... )
def wrapColumnMetadata(col: Column, annotatorType: String, outPutColName: String): Column

Related Doc: package training

case class POS() extends Product with Serializable

Example

Instance Constructors

new POS()

Value Members

final def !=(arg0: Any): Boolean

final def ##(): Int

final def ==(arg0: Any): Boolean

final def asInstanceOf[T0]: T0

def clone(): AnyRef

final def eq(arg0: AnyRef): Boolean

def finalize(): Unit

final def getClass(): Class[_]

final def isInstanceOf[T0]: Boolean

final def ne(arg0: AnyRef): Boolean

final def notify(): Unit

final def notifyAll(): Unit

def readDataset(sparkSession: SparkSession, path: String, delimiter: String = "|", outputPosCol: String = "tags", outputDocumentCol: String = "document", outputTextCol: String = "text"): DataFrame

def readFromDataframe(posDataframe: DataFrame, tokensCol: String = "tokens", labelsCol: String = "labels", outPutDocColName: String = "text", outPutPosColName: String = "tags"): DataFrame

final def synchronized[T0](arg0: ⇒ T0): T0

final def wait(): Unit

final def wait(arg0: Long, arg1: Int): Unit

final def wait(arg0: Long): Unit

def wrapColumnMetadata(col: Column, annotatorType: String, outPutColName: String): Column

Inherited from Serializable

Inherited from Serializable

Inherited from Product

Inherited from Equals

Inherited from AnyRef

Inherited from Any

Ungrouped