org.allenai.common.indexing

WaterlooSegmentScript

Related Doc: package indexing

object WaterlooSegmentScript extends App with Logging

Script used to segment the Waterloo corpus at the sentence level. It splits the input into docs based on <DOC> ... </DOC> tags, determines whether each doc is in "English" by counting the fraction of stop words, and throws out the doc if it is not. It then sentence-segments the doc using the NLP stack, wraps each sentence in <SENT> ... </SENT> tags, and rewrites the entire doc to file.

Linear Supertypes
Logging, App, DelayedInit, AnyRef, Any
Ordering
  1. Alphabetic
  2. By inheritance
Inherited
  1. WaterlooSegmentScript
  2. Logging
  3. App
  4. DelayedInit
  5. AnyRef
  6. Any
  1. Hide All
  2. Show all
Learn more about member selection
Visibility
  1. Public
  2. All

Value Members

  1. final def !=(arg0: Any): Boolean

    Definition Classes
    AnyRef → Any
  2. final def ##(): Int

    Definition Classes
    AnyRef → Any
  3. final def ==(arg0: Any): Boolean

    Definition Classes
    AnyRef → Any
  4. def args: Array[String]

    Attributes
    protected
    Definition Classes
    App
    Annotations
    @deprecatedOverriding( "args should not be overridden" , "2.11.0" )
  5. final def asInstanceOf[T0]: T0

    Definition Classes
    Any
  6. def clone(): AnyRef

    Attributes
    protected[java.lang]
    Definition Classes
    AnyRef
    Annotations
    @throws( ... )
  7. val config: com.typesafe.config.Config

  8. val corpusConfig: com.typesafe.config.Config

  9. def dealWithDoc(input: String, bufferedWriter: BufferedWriter, source: String): Unit

  10. val englishThreshold: Double

  11. final def eq(arg0: AnyRef): Boolean

    Definition Classes
    AnyRef
  12. def equals(arg0: Any): Boolean

    Definition Classes
    AnyRef → Any
  13. val esConfig: com.typesafe.config.Config

  14. val executionStart: Long

    Definition Classes
    App
  15. def finalize(): Unit

    Attributes
    protected[java.lang]
    Definition Classes
    AnyRef
    Annotations
    @throws( classOf[java.lang.Throwable] )
  16. final def getClass(): Class[_]

    Definition Classes
    AnyRef → Any
  17. def hashCode(): Int

    Definition Classes
    AnyRef → Any
  18. val indexName: String

  19. val indirPath: String

  20. val internalLogger: Logger

    Definition Classes
    Logging
  21. def isEnglish(input: String): Boolean

  22. final def isInstanceOf[T0]: Boolean

    Definition Classes
    Any
  23. object logger

    Definition Classes
    Logging
  24. def main(args: Array[String]): Unit

    Definition Classes
    App
    Annotations
    @deprecatedOverriding( "main should not be overridden" , "2.11.0" )
  25. final def ne(arg0: AnyRef): Boolean

    Definition Classes
    AnyRef
  26. final def notify(): Unit

    Definition Classes
    AnyRef
  27. final def notifyAll(): Unit

    Definition Classes
    AnyRef
  28. val outdirPath: String

  29. val rootConfig: com.typesafe.config.Config

  30. def segmentDirectory(inputDirectoryName: String, outputDirectoryName: String): Unit

  31. def segmentIntoDocs(inputFile: File, outputFile: File): Unit

  32. val splitRegex: UnanchoredRegex

  33. val splitString: String

  34. val stopWords: Set[String]

  35. val stopWordsConfig: com.typesafe.config.Config

  36. final def synchronized[T0](arg0: ⇒ T0): T0

    Definition Classes
    AnyRef
  37. def toString(): String

    Definition Classes
    AnyRef → Any
  38. final def wait(): Unit

    Definition Classes
    AnyRef
    Annotations
    @throws( ... )
  39. final def wait(arg0: Long, arg1: Int): Unit

    Definition Classes
    AnyRef
    Annotations
    @throws( ... )
  40. final def wait(arg0: Long): Unit

    Definition Classes
    AnyRef
    Annotations
    @throws( ... )

Deprecated Value Members

  1. def delayedInit(body: ⇒ Unit): Unit

    Definition Classes
    App → DelayedInit
    Annotations
    @deprecated
    Deprecated

    (Since version 2.11.0) The delayedInit mechanism will disappear.

Inherited from Logging

Inherited from App

Inherited from DelayedInit

Inherited from AnyRef

Inherited from Any

Ungrouped