Package za.co.absa.cobrix.cobol.reader

package parameters

Type Members

  1. case class Bdw(isBigEndian: Boolean, adjustment: Int, blockLength: Option[Int], recordsPerBlock: Option[Int]) extends Product with Serializable

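    Bdw holds Block Descriptor Word (BDW) settings used by the FB and VB record formats (see the bdw parameter below). A minimal construction sketch; the values are illustrative and the field semantics are inferred from the bdw and rdwAdjustment descriptions in this page:

      import za.co.absa.cobrix.cobol.reader.parameters.Bdw

      // A big-endian BDW with no length adjustment; block length and
      // records per block are derived from the data rather than fixed.
      val bdw = Bdw(
        isBigEndian = true,
        adjustment = 0,          // correction applied to the BDW-derived block length
                                 // (assumed, by analogy with rdwAdjustment)
        blockLength = None,      // optional fixed block length override
        recordsPerBlock = None   // optional fixed records-per-block override
      )
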
  2. case class CobolParameters(copybookPath: Option[String], multiCopybookPath: Seq[String], copybookContent: Option[String], sourcePaths: Seq[String], recordFormat: RecordFormat, isText: Boolean, isEbcdic: Boolean, ebcdicCodePage: String, ebcdicCodePageClass: Option[String], asciiCharset: String, isUtf16BigEndian: Boolean, floatingPointFormat: FloatingPointFormat, recordStartOffset: Int, recordEndOffset: Int, recordLength: Option[Int], variableLengthParams: Option[VariableLengthParameters], variableSizeOccurs: Boolean, schemaRetentionPolicy: SchemaRetentionPolicy, stringTrimmingPolicy: StringTrimmingPolicy, allowPartialRecords: Boolean, multisegmentParams: Option[MultisegmentParameters], commentPolicy: CommentPolicy, strictSignOverpunch: Boolean, improvedNullDetection: Boolean, dropGroupFillers: Boolean, dropValueFillers: Boolean, fillerNamingPolicy: FillerNamingPolicy, nonTerminals: Seq[String], occursMappings: Map[String, Map[String, Int]], debugFieldsPolicy: DebugFieldsPolicy, debugIgnoreFileSize: Boolean) extends Product with Serializable

    This class holds parameters for the job.

    copybookPath

    String containing the path to the copybook in a given file system.

    multiCopybookPath

    Sequence containing the paths to the copybooks.

    copybookContent

    String containing the actual content of the copybook. Either this, the copybookPath, or multiCopybookPath parameter must be specified.

    sourcePaths

    The list of source file paths.

    recordFormat

    The record format (F, V, VB, D)

    isText

    [deprecated by recordFormat] If true, the input data consists of text files in which records are separated by a line-ending character

    isEbcdic

    If true, the input data file encoding is EBCDIC; otherwise it is ASCII

    ebcdicCodePage

    Specifies what code page to use for EBCDIC to ASCII/Unicode conversions

    ebcdicCodePageClass

    An optional custom code page conversion class provided by a user

    asciiCharset

    A charset for ASCII data

    isUtf16BigEndian

    If true, UTF-16 strings are considered big-endian.

    floatingPointFormat

    A format of floating-point numbers

    recordStartOffset

    A number of bytes to skip at the beginning of the record before parsing a record according to a copybook

    recordEndOffset

    A number of bytes to skip at the end of each record

    recordLength

    Specifies the record length, disregarding the record size defined by the copybook. Implies the file has a fixed record length.

    variableLengthParams

    VariableLengthParameters containing the specifications for the consumption of variable-length Cobol records.

    variableSizeOccurs

    If true, OCCURS DEPENDING ON data size will depend on the number of elements

    schemaRetentionPolicy

    A copybook usually has a root group element that acts like a row tag in XML. This element can be retained in the Spark schema or collapsed

    stringTrimmingPolicy

    Specifies if and how strings should be trimmed when parsed

    allowPartialRecords

    If true, partial ASCII records can be parsed (in cases when LF character is missing for example)

    multisegmentParams

    Parameters for reading multisegment mainframe files

    commentPolicy

    A comment truncation policy

    improvedNullDetection

    If true, string values that contain only zero bytes (0x0) will be considered null.

    dropGroupFillers

    If true, the parser will drop all FILLER fields, even GROUP FILLERS that have non-FILLER nested fields

    dropValueFillers

    If true, the parser will drop all value FILLER fields

    nonTerminals

    A list of non-terminals (GROUPS) to combine and parse as primitive fields

    debugFieldsPolicy

    Specifies whether debugging fields need to be added and what they should contain (false, hex, raw).

    debugIgnoreFileSize

    If true, the fixed-length file reader won't check file size divisibility. Useful for debugging binary file / copybook mismatches.
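
    As a usage sketch, these parameters are normally populated from options passed to the spark-cobol data source rather than constructed directly. Option names follow the Cobrix README; paths and values here are illustrative:

      import org.apache.spark.sql.SparkSession

      val spark = SparkSession.builder().appName("cobrix-example").getOrCreate()

      // Each option maps onto a CobolParameters field:
      //   "copybook"                -> copybookPath
      //   "record_format"           -> recordFormat
      //   "ebcdic_code_page"        -> ebcdicCodePage
      //   "schema_retention_policy" -> schemaRetentionPolicy
      val df = spark.read
        .format("cobol")
        .option("copybook", "/copybooks/accounts.cpy")
        .option("record_format", "V")
        .option("ebcdic_code_page", "cp037")
        .option("schema_retention_policy", "collapse_root")
        .load("/data/accounts")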

  3. case class MultisegmentParameters(segmentIdField: String, segmentIdFilter: Option[Seq[String]], segmentLevelIds: Seq[String], segmentIdPrefix: String, segmentIdRedefineMap: Map[String, String], fieldParentMap: Map[String, String]) extends Product with Serializable

    This class holds the parameters used for reading multisegment mainframe files.
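
    A minimal construction sketch; the field semantics are inferred from the parameter names and the multisegment reader docs, and all values are illustrative:

      import za.co.absa.cobrix.cobol.reader.parameters.MultisegmentParameters

      // A file with COMPANY (root) and EMPLOYEE (child) segments
      // distinguished by a SEGMENT_ID field containing "C" or "E".
      val multiseg = MultisegmentParameters(
        segmentIdField = "SEGMENT_ID",
        segmentIdFilter = Some(Seq("C", "E")),         // keep only these segment ids
        segmentLevelIds = Seq("C", "E"),               // ids for level 0, level 1, ...
        segmentIdPrefix = "ID",                        // prefix for generated segment id columns
        segmentIdRedefineMap = Map(
          "C" -> "COMPANY",                            // segment id value -> REDEFINES group (assumed direction)
          "E" -> "EMPLOYEE"
        ),
        fieldParentMap = Map("EMPLOYEE" -> "COMPANY")  // child group -> parent group
      )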

  4. case class ReaderParameters(recordFormat: RecordFormat = FixedLength, isEbcdic: Boolean = true, isText: Boolean = false, ebcdicCodePage: String = "common", ebcdicCodePageClass: Option[String] = None, asciiCharset: String = "", isUtf16BigEndian: Boolean = true, floatingPointFormat: FloatingPointFormat = FloatingPointFormat.IBM, variableSizeOccurs: Boolean = false, recordLength: Option[Int] = None, lengthFieldName: Option[String] = None, isRecordSequence: Boolean = false, bdw: Option[Bdw] = None, isRdwBigEndian: Boolean = false, isRdwPartRecLength: Boolean = false, rdwAdjustment: Int = 0, isIndexGenerationNeeded: Boolean = false, inputSplitRecords: Option[Int] = None, inputSplitSizeMB: Option[Int] = None, hdfsDefaultBlockSize: Option[Int] = None, startOffset: Int = 0, endOffset: Int = 0, fileStartOffset: Int = 0, fileEndOffset: Int = 0, generateRecordId: Boolean = false, schemaPolicy: SchemaRetentionPolicy = SchemaRetentionPolicy.KeepOriginal, stringTrimmingPolicy: StringTrimmingPolicy = StringTrimmingPolicy.TrimBoth, allowPartialRecords: Boolean = false, multisegment: Option[MultisegmentParameters] = None, commentPolicy: CommentPolicy = CommentPolicy(), strictSignOverpunch: Boolean = true, improvedNullDetection: Boolean = false, dropGroupFillers: Boolean = false, dropValueFillers: Boolean = true, fillerNamingPolicy: FillerNamingPolicy = FillerNamingPolicy.SequenceNumbers, nonTerminals: Seq[String] = Nil, occursMappings: Map[String, Map[String, Int]] = Map(), debugFieldsPolicy: DebugFieldsPolicy = DebugFieldsPolicy.NoDebug, recordHeaderParser: Option[String] = None, recordExtractor: Option[String] = None, rhpAdditionalInfo: Option[String] = None, reAdditionalInfo: String = "", inputFileNameColumn: String = "") extends Product with Serializable

    These are properties for customizing the mainframe binary data reader.

    recordFormat

    Record format

    isEbcdic

    If true, the input data file encoding is EBCDIC; otherwise it is ASCII

    isText

    If true, line-ending characters (LF / CRLF) will be used as the record separator

    ebcdicCodePage

    Specifies what code page to use for EBCDIC to ASCII/Unicode conversions

    ebcdicCodePageClass

    An optional custom code page conversion class provided by a user

    asciiCharset

    A charset for ASCII data

    isUtf16BigEndian

    If true, UTF-16 strings are considered big-endian.

    floatingPointFormat

    A format of floating-point numbers

    variableSizeOccurs

    If true, OCCURS DEPENDING ON data size will depend on the number of elements

    recordLength

    Specifies the record length, disregarding the record size defined by the copybook. Implies the file has a fixed record length.

    lengthFieldName

    The name of a field that contains the record length. Optional. If not set, the copybook record length will be used.

    isRecordSequence

    Whether input files have 4-byte record length headers

    bdw

    Block descriptor word (if specified), for FB and VB record formats

    isRdwBigEndian

    Is the RDW big-endian? It may depend on the mainframe flavor and/or the mainframe-to-PC transfer method

    isRdwPartRecLength

    Whether the RDW counts itself as part of the record length

    rdwAdjustment

    Controls a mismatch between RDW and record length

    isIndexGenerationNeeded

    Whether indexing the input file before processing is requested

    inputSplitRecords

    The number of records to include in each partition. Note that mainframe records may have variable size; inputSplitSizeMB is the recommended option

    inputSplitSizeMB

    A partition size to target. In certain circumstances the actual partition size may differ, but the library will make its best effort to target that size

    hdfsDefaultBlockSize

    Default HDFS block size for the HDFS filesystem used. This value is used as the default split size if inputSplitSizeMB is not specified

    startOffset

    An offset to the start of the record in each binary data block.

    endOffset

    An offset from the end of the record to the end of the binary data block.

    fileStartOffset

    A number of bytes to skip at the beginning of each file

    fileEndOffset

    A number of bytes to skip at the end of each file

    generateRecordId

    If true, a record id field will be prepended to each record.

    schemaPolicy

    Specifies a policy to transform the input schema. The default policy is to keep the schema exactly as it is in the copybook.

    stringTrimmingPolicy

    Specifies if and how strings should be trimmed when parsed.

    allowPartialRecords

    If true, partial ASCII records can be parsed (in cases when LF character is missing for example)

    multisegment

    Parameters specific to reading multisegment files

    commentPolicy

    A comment truncation policy

    improvedNullDetection

    If true, string values that contain only zero bytes (0x0) will be considered null.

    dropGroupFillers

    If true the parser will drop all FILLER fields, even GROUP FILLERS that have non-FILLER nested fields

    dropValueFillers

    If true the parser will drop all value FILLER fields

    fillerNamingPolicy

    Specifies the strategy of renaming FILLER names to make them unique

    nonTerminals

    A list of non-terminals (GROUPS) to combine and parse as primitive fields

    debugFieldsPolicy

    Specifies whether debugging fields need to be added and what they should contain (false, hex, raw).

    recordHeaderParser

    A parser used to parse data field record headers

    recordExtractor

    An optional custom raw record extractor class for non-standard record types

    rhpAdditionalInfo

    An optional additional option string passed to a custom record header parser

    reAdditionalInfo

    An optional additional option string passed to a custom record extractor

    inputFileNameColumn

    A column name to add to the dataframe. The column will contain the input file name for each record, similar to the input_file_name() function
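
    Since every field of ReaderParameters has a default, a reader configuration can be expressed as a handful of overrides. A sketch, assuming the RecordFormat import path; the values are illustrative:

      import za.co.absa.cobrix.cobol.reader.parameters.ReaderParameters
      import za.co.absa.cobrix.cobol.parser.recordformats.RecordFormat

      // Variable-length EBCDIC records with big-endian RDW headers,
      // a record id column, and everything else left at its default.
      val readerParams = ReaderParameters(
        recordFormat = RecordFormat.VariableLength,
        ebcdicCodePage = "cp037",
        isRdwBigEndian = true,
        generateRecordId = true
      )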

  5. case class VariableLengthParameters(isRecordSequence: Boolean, bdw: Option[Bdw], isRdwBigEndian: Boolean, isRdwPartRecLength: Boolean, rdwAdjustment: Int, recordHeaderParser: Option[String], recordExtractor: Option[String], rhpAdditionalInfo: Option[String], reAdditionalInfo: String, recordLengthField: String, fileStartOffset: Int, fileEndOffset: Int, generateRecordId: Boolean, isUsingIndex: Boolean, inputSplitRecords: Option[Int], inputSplitSizeMB: Option[Int], improveLocality: Boolean, optimizeAllocation: Boolean, inputFileNameColumn: String, occursMappings: Map[String, Map[String, Int]]) extends Product with Serializable

    This class holds the parameters currently used for parsing variable-length records.

    isRecordSequence

    Whether input files have 4-byte record length headers

    bdw

    Block descriptor word (if specified), for FB and VB record formats

    isRdwBigEndian

    Is the RDW big-endian? It may depend on the mainframe flavor and/or the mainframe-to-PC transfer method

    isRdwPartRecLength

    Whether the RDW counts itself as part of the record length

    rdwAdjustment

    Controls a mismatch between RDW and record length

    recordHeaderParser

    An optional custom record header parser for non-standard RDWs

    recordExtractor

    An optional custom raw record extractor class for non-standard record types

    rhpAdditionalInfo

    An optional additional option string passed to a custom record header parser

    reAdditionalInfo

    An optional additional option string passed to a custom record extractor

    recordLengthField

    A field that stores record length

    fileStartOffset

    A number of bytes to skip at the beginning of each file

    fileEndOffset

    A number of bytes to skip at the end of each file

    generateRecordId

    Generate a sequential record number for each record to be able to retain the order of the original data

    isUsingIndex

    Whether indexing the input file before processing is requested

    inputSplitRecords

    The number of records to include in each partition. Note that mainframe records may have variable size; inputSplitSizeMB is the recommended option

    inputSplitSizeMB

    A partition size to target. In certain circumstances the actual partition size may differ, but the library will make its best effort to target that size

    improveLocality

    Tries to improve locality by extracting preferred locations for variable-length records

    optimizeAllocation

    Optimizes cluster usage when locality optimization is enabled and new nodes are present (nodes that do not contain any blocks of the files being processed)

    inputFileNameColumn

    A column name to add to the dataframe. The column will contain the input file name for each record, similar to the input_file_name() function
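
    Unlike ReaderParameters, this class has no default arguments, so every field must be supplied. A construction sketch with illustrative values for a VB file with big-endian RDWs and index-based splitting:

      import za.co.absa.cobrix.cobol.reader.parameters.{Bdw, VariableLengthParameters}

      val varLenParams = VariableLengthParameters(
        isRecordSequence = true,
        bdw = Some(Bdw(isBigEndian = true, adjustment = 0,
                       blockLength = None, recordsPerBlock = None)),
        isRdwBigEndian = true,
        isRdwPartRecLength = false,
        rdwAdjustment = 0,
        recordHeaderParser = None,    // use the standard RDW parser
        recordExtractor = None,
        rhpAdditionalInfo = None,
        reAdditionalInfo = "",
        recordLengthField = "",       // no explicit record length field
        fileStartOffset = 0,
        fileEndOffset = 0,
        generateRecordId = true,
        isUsingIndex = true,
        inputSplitRecords = None,
        inputSplitSizeMB = Some(128), // target ~128 MB partitions
        improveLocality = true,
        optimizeAllocation = false,
        inputFileNameColumn = "",     // no input file name column
        occursMappings = Map.empty
      )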
