BuildCorpusIndex

Instance Constructors

new BuildCorpusIndex(config: com.typesafe.config.Config)

Value Members

final def !=(arg0: Any): Boolean

Definition Classes
AnyRef → Any
final def ##(): Int

Definition Classes
AnyRef → Any
final def ==(arg0: Any): Boolean

Definition Classes
AnyRef → Any
def addFileToIndex(file: File, bulkProcessor: BulkProcessor, codec: Codec, documentFormat: String): Unit

Index a single file into elasticsearch.
Index a single file into elasticsearch.
file
to be indexed
bulkProcessor
to communicate with the elasticsearch instance
def addSegmentToIndex(segment: String, documentFormat: String, source: String, segmentIndex: Int, bulkProcessor: BulkProcessor): Unit

Index a single segment into elasticsearch.
Index a single segment into elasticsearch.
segment
to be indexed
documentFormat
also describes the format of the segment
source
name of source for reference
segmentIndex
index of segment in file (for deduplication)
bulkProcessor
to communicate with the elasticsearch instance
def addTreeToIndex(fileTree: Iterator[Path], codec: Codec, documentFormat: String): Seq[Future[Unit]]

Index a file tree into the elasticsearch instance.
Index a file tree into the elasticsearch instance. Divides the work into nThreads*4 Futures. Each Future synchronizes on currentFile (a variable used for logging) and then takes the next file from the stream, if the stream is not empty.
fileTree
file stream to be indexed
returns
a sequence of Futures each representing the work done by a thread on this file tree.
def addWaterlooFileToIndex(inputFile: File, documentFormat: String, bulkProcessor: BulkProcessor, codec: Codec): Unit

Index a file into the elasticsearch instance, following the convention of the waterloo corpus.
Index a file into the elasticsearch instance, following the convention of the waterloo corpus. Sentences are encapsulated by <SENT> ... </SENT> tags.
inputFile
the input file to be indexed
bulkProcessor
to communicate with the elasticsearch instance
final def asInstanceOf[T0]: T0

Definition Classes
Any
def buildElasticSearchIndex(): Unit

Build an index in ElasticSearch using the corpora specified in config.
val buildFromScratch: Boolean
val bulkProcessorUtility: BulkProcessorUtility
def clone(): AnyRef

Attributes
protected[java.lang]
Definition Classes
AnyRef
Annotations
@throws( ... )
val dumpFolderPath: String

On failure, dump serialized requests to this path.
final def eq(arg0: AnyRef): Boolean

Definition Classes
AnyRef
def equals(arg0: Any): Boolean

Definition Classes
AnyRef → Any
val esConfig: com.typesafe.config.Config

Get Index Name and Index Type.
def finalize(): Unit

Attributes
protected[java.lang]
Definition Classes
AnyRef
Annotations
@throws( classOf[java.lang.Throwable] )
final def getClass(): Class[_]

Definition Classes
AnyRef → Any
def getDatastorePathFromConfig(corpusConfig: com.typesafe.config.Config): (Path, Boolean)
def getDirectoryFromDatastore(privacy: String, group: String, directory: String, version: Int): Path
def getFileFromDatastore(privacy: String, group: String, directory: Option[String], file: String, version: Int): Path
def getLocalPathFromConfig(corpusConfig: com.typesafe.config.Config): (Path, Boolean)
def getSegmentsFromDocument(document: SegmentedDocument): Iterator[String]
def hashCode(): Int

Definition Classes
AnyRef → Any
val indexName: String
val indexType: String
val internalLogger: Logger

Definition Classes
Logging
final def isInstanceOf[T0]: Boolean

Definition Classes
Any
object logger

Definition Classes
Logging
val nThreads: Int
final def ne(arg0: AnyRef): Boolean

Definition Classes
AnyRef
final def notify(): Unit

Definition Classes
AnyRef
final def notifyAll(): Unit

Definition Classes
AnyRef
def parseCorpusConfig(corpusConfig: com.typesafe.config.Config): ParsedConfig

Take the config for a corpus, resolve paths, and return a simple object containing information about the corpus.
def segmentFile(file: File, codec: Codec, documentFormat: String): Iterator[String]
def segmentPlainTextFile(file: File, codec: Codec): Iterator[String]
def segmentWikipediaFile(file: File, codec: Codec): Iterator[String]
val sentenceSplitRegex: UnanchoredRegex

Regex used to split sentences in waterloo corpus.
final def synchronized[T0](arg0: ⇒ T0): T0

Definition Classes
AnyRef
def toString(): String

Definition Classes
AnyRef → Any
final def wait(): Unit

Definition Classes
AnyRef
Annotations
@throws( ... )
final def wait(arg0: Long, arg1: Int): Unit

Definition Classes
AnyRef
Annotations
@throws( ... )
final def wait(arg0: Long): Unit

Definition Classes
AnyRef
Annotations
@throws( ... )

Related Docs: object BuildCorpusIndex | package indexing

class BuildCorpusIndex extends Logging

Instance Constructors

new BuildCorpusIndex(config: com.typesafe.config.Config)

Value Members

final def !=(arg0: Any): Boolean

final def ##(): Int

final def ==(arg0: Any): Boolean

def addFileToIndex(file: File, bulkProcessor: BulkProcessor, codec: Codec, documentFormat: String): Unit

def addSegmentToIndex(segment: String, documentFormat: String, source: String, segmentIndex: Int, bulkProcessor: BulkProcessor): Unit

def addTreeToIndex(fileTree: Iterator[Path], codec: Codec, documentFormat: String): Seq[Future[Unit]]

def addWaterlooFileToIndex(inputFile: File, documentFormat: String, bulkProcessor: BulkProcessor, codec: Codec): Unit

final def asInstanceOf[T0]: T0

def buildElasticSearchIndex(): Unit

val buildFromScratch: Boolean

val bulkProcessorUtility: BulkProcessorUtility

def clone(): AnyRef

val dumpFolderPath: String

final def eq(arg0: AnyRef): Boolean

def equals(arg0: Any): Boolean

val esConfig: com.typesafe.config.Config

def finalize(): Unit

final def getClass(): Class[_]

def getDatastorePathFromConfig(corpusConfig: com.typesafe.config.Config): (Path, Boolean)

def getDirectoryFromDatastore(privacy: String, group: String, directory: String, version: Int): Path

def getFileFromDatastore(privacy: String, group: String, directory: Option[String], file: String, version: Int): Path

def getLocalPathFromConfig(corpusConfig: com.typesafe.config.Config): (Path, Boolean)

def getSegmentsFromDocument(document: SegmentedDocument): Iterator[String]

def hashCode(): Int

val indexName: String

val indexType: String

val internalLogger: Logger

final def isInstanceOf[T0]: Boolean

object logger

val nThreads: Int

final def ne(arg0: AnyRef): Boolean

final def notify(): Unit

final def notifyAll(): Unit

def parseCorpusConfig(corpusConfig: com.typesafe.config.Config): ParsedConfig

def segmentFile(file: File, codec: Codec, documentFormat: String): Iterator[String]

def segmentPlainTextFile(file: File, codec: Codec): Iterator[String]

def segmentWikipediaFile(file: File, codec: Codec): Iterator[String]

val sentenceSplitRegex: UnanchoredRegex

final def synchronized[T0](arg0: ⇒ T0): T0

def toString(): String

final def wait(): Unit

final def wait(arg0: Long, arg1: Int): Unit

final def wait(arg0: Long): Unit

Inherited from Logging

Inherited from AnyRef

Inherited from Any

Ungrouped