Package

com.johnsnowlabs.nlp

training

Permalink

package training

Visibility
  1. Public
  2. All

Type Members

  1. case class CoNLL(documentCol: String = "document", sentenceCol: String = "sentence", tokenCol: String = "token", posCol: String = "pos", conllLabelIndex: Int = 3, conllPosIndex: Int = 1, conllTextCol: String = "text", labelCol: String = "label", explodeSentences: Boolean = true) extends Product with Serializable

    Permalink

    Helper class to load a CoNLL type dataset for training.

    Helper class to load a CoNLL type dataset for training.

    The dataset must be in the CoNLL 2003 format and is loaded by passing its path to readDataset. Other CoNLL formats are not supported.

    Example

    val trainingData = CoNLL().readDataset(spark, "src/test/resources/conll2003/eng.train")
    trainingData.selectExpr("text", "token.result as tokens", "pos.result as pos", "label.result as label")
      .show(3, false)
    +------------------------------------------------+----------------------------------------------------------+-------------------------------------+-----------------------------------------+
    |text                                            |tokens                                                    |pos                                  |label                                    |
    +------------------------------------------------+----------------------------------------------------------+-------------------------------------+-----------------------------------------+
    |EU rejects German call to boycott British lamb .|[EU, rejects, German, call, to, boycott, British, lamb, .]|[NNP, VBZ, JJ, NN, TO, VB, JJ, NN, .]|[B-ORG, O, B-MISC, O, O, O, B-MISC, O, O]|
    |Peter Blackburn                                 |[Peter, Blackburn]                                        |[NNP, NNP]                           |[B-PER, I-PER]                           |
    |BRUSSELS 1996-08-22                             |[BRUSSELS, 1996-08-22]                                    |[NNP, CD]                            |[B-LOC, O]                               |
    +------------------------------------------------+----------------------------------------------------------+-------------------------------------+-----------------------------------------+
    
    trainingData.printSchema
    root
     |-- text: string (nullable = true)
     |-- document: array (nullable = false)
     |    |-- element: struct (containsNull = true)
     |    |    |-- annotatorType: string (nullable = true)
     |    |    |-- begin: integer (nullable = false)
     |    |    |-- end: integer (nullable = false)
     |    |    |-- result: string (nullable = true)
     |    |    |-- metadata: map (nullable = true)
     |    |    |    |-- key: string
     |    |    |    |-- value: string (valueContainsNull = true)
     |    |    |-- embeddings: array (nullable = true)
     |    |    |    |-- element: float (containsNull = false)
     |-- sentence: array (nullable = false)
     |    |-- element: struct (containsNull = true)
     |    |    |-- annotatorType: string (nullable = true)
     |    |    |-- begin: integer (nullable = false)
     |    |    |-- end: integer (nullable = false)
     |    |    |-- result: string (nullable = true)
     |    |    |-- metadata: map (nullable = true)
     |    |    |    |-- key: string
     |    |    |    |-- value: string (valueContainsNull = true)
     |    |    |-- embeddings: array (nullable = true)
     |    |    |    |-- element: float (containsNull = false)
     |-- token: array (nullable = false)
     |    |-- element: struct (containsNull = true)
     |    |    |-- annotatorType: string (nullable = true)
     |    |    |-- begin: integer (nullable = false)
     |    |    |-- end: integer (nullable = false)
     |    |    |-- result: string (nullable = true)
     |    |    |-- metadata: map (nullable = true)
     |    |    |    |-- key: string
     |    |    |    |-- value: string (valueContainsNull = true)
     |    |    |-- embeddings: array (nullable = true)
     |    |    |    |-- element: float (containsNull = false)
     |-- pos: array (nullable = false)
     |    |-- element: struct (containsNull = true)
     |    |    |-- annotatorType: string (nullable = true)
     |    |    |-- begin: integer (nullable = false)
     |    |    |-- end: integer (nullable = false)
     |    |    |-- result: string (nullable = true)
     |    |    |-- metadata: map (nullable = true)
     |    |    |    |-- key: string
     |    |    |    |-- value: string (valueContainsNull = true)
     |    |    |-- embeddings: array (nullable = true)
     |    |    |    |-- element: float (containsNull = false)
     |-- label: array (nullable = false)
     |    |-- element: struct (containsNull = true)
     |    |    |-- annotatorType: string (nullable = true)
     |    |    |-- begin: integer (nullable = false)
     |    |    |-- end: integer (nullable = false)
     |    |    |-- result: string (nullable = true)
     |    |    |-- metadata: map (nullable = true)
     |    |    |    |-- key: string
     |    |    |    |-- value: string (valueContainsNull = true)
     |    |    |-- embeddings: array (nullable = true)
     |    |    |    |-- element: float (containsNull = false)
    documentCol

    Name of the DOCUMENT Annotator type column

    sentenceCol

    Name of the column holding the sentences (DOCUMENT Annotator type)

    tokenCol

    Name of the TOKEN Annotator type column

    posCol

    Name of the POS Annotator type column

    conllLabelIndex

    Index of the column for NER Label in the dataset

    conllPosIndex

    Index of the column for the POS tags in the dataset

    conllTextCol

    Name of the column for the text in the dataset

    labelCol

    Name of the NAMED_ENTITY Annotator type column

    explodeSentences

    Whether to explode each sentence to a separate row

  2. class CoNLL2003NerReader extends AnyRef

    Permalink

    Helper class for working with the CoNLL 2003 dataset for the NER task. The class is made for easy use from Java.

  3. case class CoNLLDocument(text: String, nerTagged: Seq[NerTaggedSentence], posTagged: Seq[PosTaggedSentence]) extends Product with Serializable

    Permalink
  4. case class CoNLLU(explodeSentences: Boolean = true) extends Product with Serializable

    Permalink
  5. case class CoNLLUDocument(text: String, uPosTagged: Seq[PosTaggedSentence], xPosTagged: Seq[PosTaggedSentence], lemma: Seq[PosTaggedSentence]) extends Product with Serializable

    Permalink
  6. case class POS() extends Product with Serializable

    Permalink

    Helper class for creating DataFrames for training a part-of-speech tagger.

    Helper class for creating DataFrames for training a part-of-speech tagger.

    The dataset needs to consist of one sentence per line, where each word is joined to its respective tag by a delimiter (here "|"):

    Pierre|NNP Vinken|NNP ,|, 61|CD years|NNS old|JJ ,|, will|MD join|VB the|DT board|NN as|IN a|DT nonexecutive|JJ director|NN Nov.|NNP 29|CD .|.

    The sentence can then be parsed with readDataset into a column with annotations of type POS.

    Example

    In this example, the file test-training.txt has the content of the sentence above.

    import com.johnsnowlabs.nlp.training.POS
    
    val pos = POS()
    val path = "src/test/resources/anc-pos-corpus-small/test-training.txt"
    val posDf = pos.readDataset(spark, path, "|", "tags")
    
    posDf.selectExpr("explode(tags) as tags").show(false)
    +---------------------------------------------+
    |tags                                         |
    +---------------------------------------------+
    |[pos, 0, 5, NNP, [word -> Pierre], []]       |
    |[pos, 7, 12, NNP, [word -> Vinken], []]      |
    |[pos, 14, 14, ,, [word -> ,], []]            |
    |[pos, 16, 17, CD, [word -> 61], []]          |
    |[pos, 19, 23, NNS, [word -> years], []]      |
    |[pos, 25, 27, JJ, [word -> old], []]         |
    |[pos, 29, 29, ,, [word -> ,], []]            |
    |[pos, 31, 34, MD, [word -> will], []]        |
    |[pos, 36, 39, VB, [word -> join], []]        |
    |[pos, 41, 43, DT, [word -> the], []]         |
    |[pos, 45, 49, NN, [word -> board], []]       |
    |[pos, 51, 52, IN, [word -> as], []]          |
    |[pos, 47, 47, DT, [word -> a], []]           |
    |[pos, 56, 67, JJ, [word -> nonexecutive], []]|
    |[pos, 69, 76, NN, [word -> director], []]    |
    |[pos, 78, 81, NNP, [word -> Nov.], []]       |
    |[pos, 83, 84, CD, [word -> 29], []]          |
    |[pos, 81, 81, ., [word -> .], []]            |
    +---------------------------------------------+
  7. case class PubTator() extends Product with Serializable

    Permalink

Value Members

  1. object CoNLLHelper

    Permalink
  2. object CoNLLUCols extends Enumeration

    Permalink

Ungrouped