class ADAMContext extends Serializable with Logging
The ADAMContext provides functions on top of a SparkContext for loading genomic data.
- Alphabetic
- By Inheritance
- ADAMContext
- Logging
- Serializable
- Serializable
- AnyRef
- Any
- Hide All
- Show All
- Public
- All
Instance Constructors
-
new
ADAMContext(sc: SparkContext)
- sc
The SparkContext to wrap.
Value Members
-
final
def
!=(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
-
final
def
##(): Int
- Definition Classes
- AnyRef → Any
-
final
def
==(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
-
final
def
asInstanceOf[T0]: T0
- Definition Classes
- Any
-
def
clone(): AnyRef
- Attributes
- protected[java.lang]
- Definition Classes
- AnyRef
- Annotations
- @native() @throws( ... )
-
def
debug(mkr: Marker, msg: ⇒ Any, t: ⇒ Throwable): Unit
- Attributes
- protected
- Definition Classes
- Logging
-
def
debug(msg: ⇒ Any, t: ⇒ Throwable): Unit
- Attributes
- protected
- Definition Classes
- Logging
-
def
debug(msg: ⇒ Any): Unit
- Attributes
- protected
- Definition Classes
- Logging
-
final
def
eq(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef
-
def
equals(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
-
def
error(mkr: Marker, msg: ⇒ Any, t: ⇒ Throwable): Unit
- Attributes
- protected
- Definition Classes
- Logging
-
def
error(msg: ⇒ Any, t: ⇒ Throwable): Unit
- Attributes
- protected
- Definition Classes
- Logging
-
def
error(msg: ⇒ Any): Unit
- Attributes
- protected
- Definition Classes
- Logging
-
def
finalize(): Unit
- Attributes
- protected[java.lang]
- Definition Classes
- AnyRef
- Annotations
- @throws( classOf[java.lang.Throwable] )
-
final
def
getClass(): Class[_]
- Definition Classes
- AnyRef → Any
- Annotations
- @native()
-
def
getFiles(path: Path, fs: FileSystem): Array[Path]
Elaborates out a directory/glob/plain path.
Elaborates out a directory/glob/plain path.
- path
Path to elaborate.
- fs
The underlying file system that this path is on.
- returns
Returns an array of Paths to load.
- Attributes
- protected
- Exceptions thrown
FileNotFoundException
if the path does not match any files.
- See also
getFsAndFiles
-
def
getFsAndFiles(path: Path): Array[Path]
Elaborates out a directory/glob/plain path.
Elaborates out a directory/glob/plain path.
- path
Path to elaborate.
- returns
Returns an array of Paths to load.
- Attributes
- protected
- Exceptions thrown
FileNotFoundException
if the path does not match any files.
- See also
getFiles
-
def
getFsAndFilesWithFilter(pathName: String, filter: PathFilter): Array[Path]
Elaborates out a directory/glob/plain path name.
Elaborates out a directory/glob/plain path name.
- pathName
Path name to elaborate.
- filter
Filter to discard paths.
- returns
Returns an array of Paths to load.
- Attributes
- protected
- Exceptions thrown
FileNotFoundException
if the path does not match any files.
- See also
getFiles
-
def
hashCode(): Int
- Definition Classes
- AnyRef → Any
- Annotations
- @native()
-
def
info(mkr: Marker, msg: ⇒ Any, t: ⇒ Throwable): Unit
- Attributes
- protected
- Definition Classes
- Logging
-
def
info(msg: ⇒ Any, t: ⇒ Throwable): Unit
- Attributes
- protected
- Definition Classes
- Logging
-
def
info(msg: ⇒ Any): Unit
- Attributes
- protected
- Definition Classes
- Logging
-
def
isDebugEnabled: Boolean
- Attributes
- protected
- Definition Classes
- Logging
-
def
isErrorEnabled: Boolean
- Attributes
- protected
- Definition Classes
- Logging
-
def
isInfoEnabled: Boolean
- Attributes
- protected
- Definition Classes
- Logging
-
final
def
isInstanceOf[T0]: Boolean
- Definition Classes
- Any
-
def
isPartitioned(pathName: String): Boolean
Return true if the specified path of Parquet + Avro files is partitioned.
Return true if the specified path of Parquet + Avro files is partitioned.
- pathName
Path in which to look for partitioned flag.
- returns
Return true if the specified path of Parquet + Avro files is partitioned. Behavior is undefined if some paths in glob contain _partitionedByStartPos flag file and some do not.
-
def
isTraceEnabled: Boolean
- Attributes
- protected
- Definition Classes
- Logging
-
def
isWarnEnabled: Boolean
- Attributes
- protected
- Definition Classes
- Logging
-
def
loadAlignments(df: DataFrame, references: SequenceDictionary, readGroups: ReadGroupDictionary, processingSteps: Seq[ProcessingStep]): AlignmentDataset
Load the specified data frame, references, read groups, and processing steps into an AlignmentDataset.
Load the specified data frame, references, read groups, and processing steps into an AlignmentDataset.
- df
Data frame to load from.
- references
References for the AlignmentDataset, may be empty.
- readGroups
Read groups for the AlignmentDataset, may be empty.
- processingSteps
Processing steps for the AlignmentDataset, may be empty.
- returns
Returns a new AlignmentDataset loaded from the specified data frame, references, read groups, and processing steps.
-
def
loadAlignments(df: DataFrame, metadataPathName: String): AlignmentDataset
Load the specified data frame into an AlignmentDataset, with metadata loaded from the specified metadata path name.
Load the specified data frame into an AlignmentDataset, with metadata loaded from the specified metadata path name.
- df
Data frame to load from.
- metadataPathName
Path name to load metadata from.
- returns
Returns a new AlignmentDataset loaded from the specified data frame, with metadata loaded from the specified metadata path name.
-
def
loadAlignments(df: DataFrame): AlignmentDataset
Load the specified data frame into an AlignmentDataset, with empty metadata.
Load the specified data frame into an AlignmentDataset, with empty metadata.
- df
Data frame to load from.
- returns
Returns a new AlignmentDataset loaded from the specified data frame, with empty metadata.
-
def
loadAlignments(pathName: String, optPathName2: Option[String] = None, optReadGroup: Option[String] = None, optPredicate: Option[FilterPredicate] = None, optProjection: Option[Schema] = None, stringency: ValidationStringency = ValidationStringency.STRICT): AlignmentDataset
Load alignments into an AlignmentDataset.
Load alignments into an AlignmentDataset.
Loads path names ending in: * .bam/.cram/.sam as BAM/CRAM/SAM format, * .fa/.fasta as FASTA format, * .fq/.fastq as FASTQ format, and * .ifq as interleaved FASTQ format.
If none of these match, fall back to Parquet + Avro.
For FASTA, FASTQ, and interleaved FASTQ formats, compressed files are supported through compression codecs configured in Hadoop, which by default include .gz and .bz2, but can include more.
- pathName
The path name to load alignments from. Globs/directories are supported, although file extension must be present for BAM/CRAM/SAM, FASTA, and FASTQ formats.
- optPathName2
The optional path name to load the second set of alignment records from, if loading paired FASTQ format. Globs/directories are supported, although file extension must be present. Defaults to None.
- optReadGroup
The optional read group identifier to associate to the alignment records. Defaults to None.
- optPredicate
An optional pushdown predicate to use when reading Parquet + Avro. Defaults to None.
- optProjection
An optional projection schema to use when reading Parquet + Avro. Defaults to None.
- stringency
The validation stringency to use when validating BAM/CRAM/SAM or FASTQ formats. Defaults to ValidationStringency.STRICT.
- returns
Returns an AlignmentDataset which wraps the genomic dataset of alignments, sequence dictionary representing reference sequences the alignments may be aligned to, and the read group dictionary for the alignments if one is available.
- See also
loadBam
loadFastq
loadFastaDna(String, Long)
loadInterleavedFastq
loadParquetAlignments
-
def
loadBam(pathName: String, stringency: ValidationStringency = ValidationStringency.STRICT): AlignmentDataset
Load alignments from BAM/CRAM/SAM into an AlignmentDataset.
Load alignments from BAM/CRAM/SAM into an AlignmentDataset.
This reads the sequence and read group dictionaries from the BAM/CRAM/SAM file header. SAMRecords are read from the file and converted to the Alignment schema.
- pathName
The path name to load BAM/CRAM/SAM formatted alignments from. Globs/directories are supported.
- stringency
The validation stringency to use when validating the BAM/CRAM/SAM format header. Defaults to ValidationStringency.STRICT.
- returns
Returns an AlignmentDataset which wraps the genomic dataset of alignments, sequence dictionary representing reference sequences the alignments may be aligned to, and the read group dictionary for the alignments if one is available.
-
def
loadBed(pathName: String, optSequenceDictionary: Option[SequenceDictionary] = None, optMinPartitions: Option[Int] = None, stringency: ValidationStringency = ValidationStringency.STRICT): FeatureDataset
Load a path name in BED6/12 format into a FeatureDataset.
Load a path name in BED6/12 format into a FeatureDataset.
- pathName
The path name to load features in BED6/12 format from. Globs/directories are supported.
- optSequenceDictionary
Optional sequence dictionary. Defaults to None.
- optMinPartitions
An optional minimum number of partitions to load. If not set, falls back to the configured Spark default parallelism. Defaults to None.
- stringency
The validation stringency to use when validating BED6/12 format. Defaults to ValidationStringency.STRICT.
- returns
Returns a FeatureDataset.
-
def
loadCoverage(pathName: String, optSequenceDictionary: Option[SequenceDictionary] = None, optMinPartitions: Option[Int] = None, optPredicate: Option[FilterPredicate] = None, optProjection: Option[Schema] = None, stringency: ValidationStringency = ValidationStringency.STRICT): CoverageDataset
Load features into a FeatureDataset and convert to a CoverageDataset.
Load features into a FeatureDataset and convert to a CoverageDataset. Coverage is stored in the score field of Feature.
Loads path names ending in: * .bed as BED6/12 format, * .gff3 as GFF3 format, * .gtf/.gff as GTF/GFF2 format, * .narrow[pP]eak as NarrowPeak format, and * .interval_list as IntervalList format.
If none of these match, fall back to Parquet + Avro.
For BED6/12, GFF3, GTF/GFF2, NarrowPeak, and IntervalList formats, compressed files are supported through compression codecs configured in Hadoop, which by default include .gz and .bz2, but can include more.
- pathName
The path name to load features from. Globs/directories are supported, although file extension must be present for BED6/12, GFF3, GTF/GFF2, NarrowPeak, or IntervalList formats.
- optSequenceDictionary
Optional sequence dictionary. Defaults to None.
- optMinPartitions
An optional minimum number of partitions to use. For textual formats, if this is None, fall back to the Spark default parallelism. Defaults to None.
- optPredicate
An optional pushdown predicate to use when reading Parquet + Avro. Defaults to None.
- optProjection
An optional projection schema to use when reading Parquet + Avro. Defaults to None.
- stringency
The validation stringency to use when validating BED6/12, GFF3, GTF/GFF2, NarrowPeak, or IntervalList formats. Defaults to ValidationStringency.STRICT.
- returns
Returns a FeatureDataset converted to a CoverageDataset.
- See also
loadBed
loadGtf
loadGff3
loadNarrowPeak
loadIntervalList
loadParquetFeatures
-
def
loadDnaSequences(pathName: String, optPredicate: Option[FilterPredicate] = None, optProjection: Option[Schema] = None): SequenceDataset
Load DNA sequences into a SequenceDataset.
Load DNA sequences into a SequenceDataset.
If the path name has a .fa/.fasta extension, load as FASTA format. Else, fall back to Parquet + Avro.
For FASTA format, compressed files are supported through compression codecs configured in Hadoop, which by default include .gz and .bz2, but can include more.
- pathName
The path name to load sequences from. Globs/directories are supported, although file extension must be present for FASTA format.
- optPredicate
An optional pushdown predicate to use when reading Parquet + Avro. Defaults to None.
- optProjection
An optional projection schema to use when reading Parquet + Avro. Defaults to None.
- returns
Returns a SequenceDataset containing DNA sequences.
- See also
loadFastaDna
loadParquetSequences
-
def
loadFastaDna(pathName: String, maximumLength: Long): SliceDataset
Load DNA slices from FASTA into a SliceDataset.
Load DNA slices from FASTA into a SliceDataset.
- pathName
The path name to load slices from. Globs/directories are supported.
- maximumLength
Maximum fragment length. Values greater than 1e9 should be avoided.
- returns
Returns a SliceDataset containing DNA slices.
-
def
loadFastaDna(pathName: String): SequenceDataset
Load DNA sequences from FASTA into a SequenceDataset.
Load DNA sequences from FASTA into a SequenceDataset.
- pathName
The path name to load sequences from. Globs/directories are supported.
- returns
Returns a SequenceDataset containing DNA sequences.
-
def
loadFastaProtein(pathName: String): SequenceDataset
Load protein sequences from FASTA into a SequenceDataset.
Load protein sequences from FASTA into a SequenceDataset.
- pathName
The path name to load sequences from. Globs/directories are supported.
- returns
Returns a SequenceDataset containing protein sequences.
-
def
loadFastaRna(pathName: String): SequenceDataset
Load RNA sequences from FASTA into a SequenceDataset.
Load RNA sequences from FASTA into a SequenceDataset.
- pathName
The path name to load sequences from. Globs/directories are supported.
- returns
Returns a SequenceDataset containing RNA sequences.
-
def
loadFastq(pathName1: String, optPathName2: Option[String], optReadGroup: Option[String] = None, stringency: ValidationStringency = ValidationStringency.STRICT): AlignmentDataset
Load unaligned alignments from (possibly paired) FASTQ into an AlignmentDataset.
Load unaligned alignments from (possibly paired) FASTQ into an AlignmentDataset.
- pathName1
The path name to load the first set of unaligned alignments from. Globs/directories are supported.
- optPathName2
The path name to load the second set of unaligned alignments from, if provided. Globs/directories are supported.
- optReadGroup
The optional read group identifier to associate to the unaligned alignment records. Defaults to None.
- stringency
The validation stringency to use when validating (possibly paired) FASTQ format. Defaults to ValidationStringency.STRICT.
- returns
Returns an unaligned AlignmentDataset.
- See also
loadPairedFastq
loadUnpairedFastq
-
def
loadFeatures(df: DataFrame, references: SequenceDictionary, samples: Seq[Sample]): FeatureDataset
Load the specified data frame, references, and samples into a FeatureDataset.
Load the specified data frame, references, and samples into a FeatureDataset.
- df
Data frame to load from.
- references
References for the FeatureDataset, may be empty.
- samples
Samples for the FeatureDataset, may be empty.
- returns
Returns a new FeatureDataset loaded from the specified data frame, references, and samples.
-
def
loadFeatures(df: DataFrame, metadataPathName: String): FeatureDataset
Load the specified data frame into a FeatureDataset, with metadata loaded from the specified metadata path name.
Load the specified data frame into a FeatureDataset, with metadata loaded from the specified metadata path name.
- df
Data frame to load from.
- metadataPathName
Path name to load metadata from.
- returns
Returns a new FeatureDataset loaded from the specified data frame, with metadata loaded from the specified metadata path name.
-
def
loadFeatures(df: DataFrame): FeatureDataset
Load the specified data frame into a FeatureDataset, with empty metadata.
Load the specified data frame into a FeatureDataset, with empty metadata.
- df
Data frame to load from.
- returns
Returns a new FeatureDataset loaded from the specified data frame, with empty metadata.
-
def
loadFeatures(pathName: String, optSequenceDictionary: Option[SequenceDictionary] = None, optMinPartitions: Option[Int] = None, optPredicate: Option[FilterPredicate] = None, optProjection: Option[Schema] = None, stringency: ValidationStringency = ValidationStringency.STRICT): FeatureDataset
Load features into a FeatureDataset.
Load features into a FeatureDataset.
Loads path names ending in: * .bed as BED6/12 format, * .gff3 as GFF3 format, * .gtf/.gff as GTF/GFF2 format, * .narrow[pP]eak as NarrowPeak format, and * .interval_list as IntervalList format.
If none of these match, fall back to Parquet + Avro.
For BED6/12, GFF3, GTF/GFF2, NarrowPeak, and IntervalList formats, compressed files are supported through compression codecs configured in Hadoop, which by default include .gz and .bz2, but can include more.
- pathName
The path name to load features from. Globs/directories are supported, although file extension must be present for BED6/12, GFF3, GTF/GFF2, NarrowPeak, or IntervalList formats.
- optSequenceDictionary
Optional sequence dictionary. Defaults to None.
- optMinPartitions
An optional minimum number of partitions to use. For textual formats, if this is None, fall back to the Spark default parallelism. Defaults to None.
- optPredicate
An optional pushdown predicate to use when reading Parquet + Avro. Defaults to None.
- optProjection
An optional projection schema to use when reading Parquet + Avro. Defaults to None.
- stringency
The validation stringency to use when validating BED6/12, GFF3, GTF/GFF2, NarrowPeak, or IntervalList formats. Defaults to ValidationStringency.STRICT.
- returns
Returns a FeatureDataset.
- See also
loadBed
loadGtf
loadGff3
loadNarrowPeak
loadIntervalList
loadParquetFeatures
-
def
loadFragments(df: DataFrame, references: SequenceDictionary, readGroups: ReadGroupDictionary, processingSteps: Seq[ProcessingStep]): FragmentDataset
Load the specified data frame, references, read groups, and processing steps into a FragmentDataset.
Load the specified data frame, references, read groups, and processing steps into a FragmentDataset.
- df
Data frame to load from.
- references
References for the FragmentDataset, may be empty.
- readGroups
Read groups for the FragmentDataset, may be empty.
- processingSteps
Processing steps for the FragmentDataset, may be empty.
- returns
Returns a new FragmentDataset loaded from the specified data frame, references, read groups, and processing steps.
-
def
loadFragments(df: DataFrame, metadataPathName: String): FragmentDataset
Load the specified data frame into a FragmentDataset, with metadata loaded from the specified metadata path name.
Load the specified data frame into a FragmentDataset, with metadata loaded from the specified metadata path name.
- df
Data frame to load from.
- metadataPathName
Path name to load metadata from.
- returns
Returns a new FragmentDataset loaded from the specified data frame, with metadata loaded from the specified metadata path name.
-
def
loadFragments(df: DataFrame): FragmentDataset
Load the specified data frame into a FragmentDataset, with empty metadata.
Load the specified data frame into a FragmentDataset, with empty metadata.
- df
Data frame to load from.
- returns
Returns a new FragmentDataset loaded from the specified data frame, with empty metadata.
-
def
loadFragments(pathName: String, optPredicate: Option[FilterPredicate] = None, optProjection: Option[Schema] = None, stringency: ValidationStringency = ValidationStringency.STRICT): FragmentDataset
Load fragments into a FragmentDataset.
Load fragments into a FragmentDataset.
Loads path names ending in: * .bam/.cram/.sam as BAM/CRAM/SAM format and * .ifq as interleaved FASTQ format.
If none of these match, fall back to Parquet + Avro.
For interleaved FASTQ format, compressed files are supported through compression codecs configured in Hadoop, which by default include .gz and .bz2, but can include more.
- pathName
The path name to load fragments from. Globs/directories are supported, although file extension must be present for BAM/CRAM/SAM and FASTQ formats.
- optPredicate
An optional pushdown predicate to use when reading Parquet + Avro. Defaults to None.
- optProjection
An optional projection schema to use when reading Parquet + Avro. Defaults to None.
- stringency
The validation stringency to use when validating BAM/CRAM/SAM or FASTQ formats. Defaults to ValidationStringency.STRICT.
- returns
Returns a FragmentDataset.
- See also
loadBam
loadAlignments
loadInterleavedFastqAsFragments
loadParquetFragments
-
def
loadGenotypes(df: DataFrame, references: SequenceDictionary, samples: Seq[Sample], headerLines: Seq[VCFHeaderLine]): GenotypeDataset
Load the specified data frame, references, samples, and header lines into a GenotypeDataset.
Load the specified data frame, references, samples, and header lines into a GenotypeDataset.
- df
Data frame to load from.
- references
References for the GenotypeDataset, may be empty.
- samples
Samples for the GenotypeDataset, may be empty.
- headerLines
Header lines for the GenotypeDataset, may be empty.
- returns
Returns a new GenotypeDataset loaded from the specified data frame, references, samples, and header lines.
-
def
loadGenotypes(df: DataFrame, references: SequenceDictionary, samples: Seq[Sample]): GenotypeDataset
Load the specified data frame, references, samples, and header lines into a GenotypeDataset, with the default header lines.
Load the specified data frame, references, samples, and header lines into a GenotypeDataset, with the default header lines.
- df
Data frame to load from.
- references
References for the GenotypeDataset, may be empty.
- samples
Samples for the GenotypeDataset, may be empty.
- returns
Returns a new GenotypeDataset loaded from the specified data frame, references, and samples, with the default header lines.
-
def
loadGenotypes(df: DataFrame, metadataPathName: String): GenotypeDataset
Load the specified data frame into a GenotypeDataset, with metadata loaded from the specified metadata path name.
Load the specified data frame into a GenotypeDataset, with metadata loaded from the specified metadata path name.
- df
Data frame to load from.
- metadataPathName
Path name to load metadata from.
- returns
Returns a new GenotypeDataset loaded from the specified data frame, with metadata loaded from the specified metadata path name.
-
def
loadGenotypes(df: DataFrame): GenotypeDataset
Load the specified data frame into a GenotypeDataset, with empty metadata and the default header lines.
Load the specified data frame into a GenotypeDataset, with empty metadata and the default header lines.
- df
Data frame to load from.
- returns
Returns a new GenotypeDataset loaded from the specified data frame, with empty metadata and the default header lines.
-
def
loadGenotypes(pathName: String, optPredicate: Option[FilterPredicate] = None, optProjection: Option[Schema] = None, stringency: ValidationStringency = ValidationStringency.STRICT): GenotypeDataset
Load genotypes into a GenotypeDataset.
Load genotypes into a GenotypeDataset.
If the path name has a .vcf/.vcf.gz/.vcf.bgz extension, load as VCF format. Else, fall back to Parquet + Avro.
- pathName
The path name to load genotypes from. Globs/directories are supported, although file extension must be present for VCF format.
- optPredicate
An optional pushdown predicate to use when reading Parquet + Avro. Defaults to None.
- optProjection
An optional projection schema to use when reading Parquet + Avro. Defaults to None.
- stringency
The validation stringency to use when validating VCF format. Defaults to ValidationStringency.STRICT.
- returns
Returns a GenotypeDataset.
- See also
loadVcf
loadParquetGenotypes
-
def
loadGff3(pathName: String, optSequenceDictionary: Option[SequenceDictionary] = None, optMinPartitions: Option[Int] = None, stringency: ValidationStringency = ValidationStringency.STRICT): FeatureDataset
Load a path name in GFF3 format into a FeatureDataset.
Load a path name in GFF3 format into a FeatureDataset.
- pathName
The path name to load features in GFF3 format from. Globs/directories are supported.
- optSequenceDictionary
Optional sequence dictionary. Defaults to None.
- optMinPartitions
An optional minimum number of partitions to load. If not set, falls back to the configured Spark default parallelism. Defaults to None.
- stringency
The validation stringency to use when validating GFF3 format. Defaults to ValidationStringency.STRICT.
- returns
Returns a FeatureDataset.
-
def
loadGtf(pathName: String, optSequenceDictionary: Option[SequenceDictionary] = None, optMinPartitions: Option[Int] = None, stringency: ValidationStringency = ValidationStringency.STRICT): FeatureDataset
Load a path name in GTF/GFF2 format into a FeatureDataset.
Load a path name in GTF/GFF2 format into a FeatureDataset.
- pathName
The path name to load features in GTF/GFF2 format from. Globs/directories are supported.
- optSequenceDictionary
Optional sequence dictionary. Defaults to None.
- optMinPartitions
An optional minimum number of partitions to load. If not set, falls back to the configured Spark default parallelism. Defaults to None.
- stringency
The validation stringency to use when validating GTF/GFF2 format. Defaults to ValidationStringency.STRICT.
- returns
Returns a FeatureDataset.
-
def
loadIndexedBam(pathName: String, viewRegions: Iterable[ReferenceRegion], stringency: ValidationStringency = ValidationStringency.STRICT)(implicit s: DummyImplicit): AlignmentDataset
Functions like loadBam, but uses BAM index files to look at fewer blocks, and only returns records within the specified ReferenceRegions.
Functions like loadBam, but uses BAM index files to look at fewer blocks, and only returns records within the specified ReferenceRegions. BAM index file required.
- pathName
The path name to load indexed BAM formatted alignments from. Globs/directories are supported.
- viewRegions
Iterable of ReferenceRegion we are filtering on.
- stringency
The validation stringency to use when validating the BAM/CRAM/SAM format header. Defaults to ValidationStringency.STRICT.
- returns
Returns an AlignmentDataset which wraps the genomic dataset of alignments, sequence dictionary representing reference sequences the alignments may be aligned to, and the read group dictionary for the alignments if one is available.
-
def
loadIndexedBam(pathName: String, viewRegion: ReferenceRegion): AlignmentDataset
Functions like loadBam, but uses BAM index files to look at fewer blocks, and only returns records within a specified ReferenceRegion.
Functions like loadBam, but uses BAM index files to look at fewer blocks, and only returns records within a specified ReferenceRegion. BAM index file required.
- pathName
The path name to load indexed BAM formatted alignments from. Globs/directories are supported.
- viewRegion
The ReferenceRegion we are filtering on.
- returns
Returns an AlignmentDataset which wraps the genomic dataset of alignments, sequence dictionary representing reference sequences the alignments may be aligned to, and the read group dictionary for the alignments if one is available.
-
def
loadIndexedVcf(pathName: String, viewRegions: Iterable[ReferenceRegion], stringency: ValidationStringency = ValidationStringency.STRICT)(implicit s: DummyImplicit): VariantContextDataset
Load variant context records from VCF indexed by tabix (tbi) into a VariantContextDataset.
Load variant context records from VCF indexed by tabix (tbi) into a VariantContextDataset.
- pathName
The path name to load VCF variant context records from. Globs/directories are supported.
- viewRegions
Iterable of ReferenceRegions we are filtering on.
- stringency
The validation stringency to use when validating VCF format. Defaults to ValidationStringency.STRICT.
- returns
Returns a VariantContextDataset.
-
def
loadIndexedVcf(pathName: String, viewRegion: ReferenceRegion): VariantContextDataset
Load variant context records from VCF indexed by tabix (tbi) into a VariantContextDataset.
Load variant context records from VCF indexed by tabix (tbi) into a VariantContextDataset.
- pathName
The path name to load VCF variant context records from. Globs/directories are supported.
- viewRegion
ReferenceRegion we are filtering on.
- returns
Returns a VariantContextDataset.
-
def
loadInterleavedFastq(pathName: String): AlignmentDataset
Load unaligned alignments from interleaved FASTQ into an AlignmentDataset.
Load unaligned alignments from interleaved FASTQ into an AlignmentDataset.
In interleaved FASTQ, the two reads from a paired sequencing protocol are interleaved in a single file. This is a zipped representation of the typical paired FASTQ.
- pathName
The path name to load unaligned alignments from. Globs/directories are supported.
- returns
Returns an unaligned AlignmentDataset.
-
def
loadInterleavedFastqAsFragments(pathName: String): FragmentDataset
Load paired unaligned alignments grouped by sequencing fragment from interleaved FASTQ into a FragmentDataset.
Load paired unaligned alignments grouped by sequencing fragment from interleaved FASTQ into a FragmentDataset.
In interleaved FASTQ, the two reads from a paired sequencing protocol are interleaved in a single file. This is a zipped representation of the typical paired FASTQ.
Fragments represent all of the reads from a single sequenced fragment as a single object, which is a useful representation for some tasks.
- pathName
The path name to load unaligned alignments from. Globs/directories are supported.
- returns
Returns a FragmentDataset containing the paired reads grouped by sequencing fragment.
-
def
loadIntervalList(pathName: String, optMinPartitions: Option[Int] = None, stringency: ValidationStringency = ValidationStringency.STRICT): FeatureDataset
Load a path name in IntervalList format into a FeatureDataset.
Load a path name in IntervalList format into a FeatureDataset.
- pathName
The path name to load features in IntervalList format from. Globs/directories are supported.
- optMinPartitions
An optional minimum number of partitions to load. If not set, falls back to the configured Spark default parallelism. Defaults to None.
- stringency
The validation stringency to use when validating IntervalList format. Defaults to ValidationStringency.STRICT.
- returns
Returns a FeatureDataset.
-
def
loadNarrowPeak(pathName: String, optSequenceDictionary: Option[SequenceDictionary] = None, optMinPartitions: Option[Int] = None, stringency: ValidationStringency = ValidationStringency.STRICT): FeatureDataset
Load a path name in NarrowPeak format into a FeatureDataset.
Load a path name in NarrowPeak format into a FeatureDataset.
- pathName
The path name to load features in NarrowPeak format from. Globs/directories are supported.
- optSequenceDictionary
Optional sequence dictionary. Defaults to None.
- optMinPartitions
An optional minimum number of partitions to load. If not set, falls back to the configured Spark default parallelism. Defaults to None.
- stringency
The validation stringency to use when validating NarrowPeak format. Defaults to ValidationStringency.STRICT.
- returns
Returns a FeatureDataset.
-
def
loadPairedFastq(pathName1: String, pathName2: String, optReadGroup: Option[String] = None, persistLevel: Option[StorageLevel] = Some(StorageLevel.MEMORY_ONLY), stringency: ValidationStringency = ValidationStringency.STRICT): AlignmentDataset
Load unaligned alignments from paired FASTQ into an AlignmentDataset.
Load unaligned alignments from paired FASTQ into an AlignmentDataset.
- pathName1
The path name to load the first set of unaligned alignments from. Globs/directories are supported.
- pathName2
The path name to load the second set of unaligned alignments from. Globs/directories are supported.
- optReadGroup
The optional read group identifier to associate to the unaligned alignment records. Defaults to None.
- persistLevel
An optional persistence level to set. If this level is set, then reads will be cached (at the given persistence level) as part of validation. Defaults to StorageLevel.MEMORY_ONLY.
- stringency
The validation stringency to use when validating paired FASTQ format. Defaults to ValidationStringency.STRICT.
- returns
Returns an unaligned AlignmentDataset.
-
def
loadPairedFastqAsFragments(pathName1: String, pathName2: String, optReadGroup: Option[String] = None, persistLevel: Option[StorageLevel] = Some(StorageLevel.MEMORY_ONLY), stringency: ValidationStringency = ValidationStringency.STRICT): FragmentDataset
Load paired unaligned alignments grouped by sequencing fragment from paired FASTQ files into a FragmentDataset.
Load paired unaligned alignments grouped by sequencing fragment from paired FASTQ files into a FragmentDataset.
Fragments represent all of the reads from a single sequenced fragment as a single object, which is a useful representation for some tasks.
- pathName1
The path name to load the first set of unaligned alignments from. Globs/directories are supported.
- pathName2
The path name to load the second set of unaligned alignments from. Globs/directories are supported.
- optReadGroup
The optional read group identifier to associate to the unaligned alignment records. Defaults to None.
- persistLevel
An optional persistence level to set. If this level is set, then reads will be cached (at the given persistence level) as part of validation. Defaults to StorageLevel.MEMORY_ONLY.
- stringency
The validation stringency to use when validating paired FASTQ format. Defaults to ValidationStringency.STRICT.
- returns
Returns a FragmentDataset containing the paired reads grouped by sequencing fragment.
-
def
loadParquet[T](pathName: String, optPredicate: Option[FilterPredicate] = None, optProjection: Option[Schema] = None)(implicit ev1: (T) ⇒ SpecificRecord, ev2: Manifest[T]): RDD[T]
Load a path name in Parquet + Avro format into an RDD.
Load a path name in Parquet + Avro format into an RDD.
- T
The type of records to return.
- pathName
The path name to load Parquet + Avro formatted data from. Globs/directories are supported.
- optPredicate
An optional pushdown predicate to use when reading Parquet + Avro. Defaults to None.
- optProjection
An optional projection schema to use when reading Parquet + Avro. Defaults to None.
- returns
An RDD with records of the specified type.
-
def
loadParquetAlignments(pathName: String, optPredicate: Option[FilterPredicate] = None, optProjection: Option[Schema] = None): AlignmentDataset
Load a path name in Parquet + Avro format into an AlignmentDataset.
Load a path name in Parquet + Avro format into an AlignmentDataset.
- pathName
The path name to load alignments from. Globs/directories are supported.
- optPredicate
An optional pushdown predicate to use when reading Parquet + Avro. Defaults to None.
- optProjection
An optional projection schema to use when reading Parquet + Avro. Defaults to None.
- returns
Returns an AlignmentDataset which wraps the genomic dataset of alignments, sequence dictionary representing reference sequences the alignments may be aligned to, and the read group dictionary for the alignments if one is available.
- Note
The sequence dictionary is read from an Avro file stored at pathName/_references.avro and the read group dictionary is read from an Avro file stored at pathName/_readGroups.avro. These files are pure Avro, not Parquet + Avro.
-
def
loadParquetCoverage(pathName: String, optPredicate: Option[FilterPredicate] = None, forceRdd: Boolean = false): CoverageDataset
Load a path name in Parquet + Avro format into a FeatureDataset and convert to a CoverageDataset.
Load a path name in Parquet + Avro format into a FeatureDataset and convert to a CoverageDataset. Coverage is stored in the score field of Feature.
- pathName
The path name to load features from. Globs/directories are supported.
- optPredicate
An optional pushdown predicate to use when reading Parquet + Avro. Defaults to None.
- forceRdd
Forces loading the RDD.
- returns
Returns a FeatureDataset converted to a CoverageDataset.
-
def
loadParquetFeatures(pathName: String, optPredicate: Option[FilterPredicate] = None, optProjection: Option[Schema] = None): FeatureDataset
Load a path name in Parquet + Avro format into a FeatureDataset.
Load a path name in Parquet + Avro format into a FeatureDataset.
- pathName
The path name to load features from. Globs/directories are supported.
- optPredicate
An optional pushdown predicate to use when reading Parquet + Avro. Defaults to None.
- optProjection
An optional projection schema to use when reading Parquet + Avro. Defaults to None.
- returns
Returns a FeatureDataset.
-
def
loadParquetFragments(pathName: String, optPredicate: Option[FilterPredicate] = None, optProjection: Option[Schema] = None): FragmentDataset
Load a path name in Parquet + Avro format into a FragmentDataset.
Load a path name in Parquet + Avro format into a FragmentDataset.
- pathName
The path name to load fragments from. Globs/directories are supported.
- optPredicate
An optional pushdown predicate to use when reading Parquet + Avro. Defaults to None.
- optProjection
An optional projection schema to use when reading Parquet + Avro. Defaults to None.
- returns
Returns a FragmentDataset.
-
def
loadParquetGenotypes(pathName: String, optPredicate: Option[FilterPredicate] = None, optProjection: Option[Schema] = None): GenotypeDataset
Load a path name in Parquet + Avro format into a GenotypeDataset.
Load a path name in Parquet + Avro format into a GenotypeDataset.
- pathName
The path name to load genotypes from. Globs/directories are supported.
- optPredicate
An optional pushdown predicate to use when reading Parquet + Avro. Defaults to None.
- optProjection
An optional projection schema to use when reading Parquet + Avro. Defaults to None.
- returns
Returns a GenotypeDataset.
-
def
loadParquetReads(pathName: String, optPredicate: Option[FilterPredicate] = None, optProjection: Option[Schema] = None): ReadDataset
Load a path name in Parquet + Avro format into a ReadDataset.
Load a path name in Parquet + Avro format into a ReadDataset.
- pathName
The path name to load reads from. Globs/directories are supported.
- optPredicate
An optional pushdown predicate to use when reading Parquet + Avro. Defaults to None.
- optProjection
An optional projection schema to use when reading Parquet + Avro. Defaults to None.
- returns
Returns a ReadDataset.
-
def
loadParquetSequences(pathName: String, optPredicate: Option[FilterPredicate] = None, optProjection: Option[Schema] = None): SequenceDataset
Load a path name in Parquet + Avro format into a SequenceDataset.
Load a path name in Parquet + Avro format into a SequenceDataset.
- pathName
The path name to load sequences from. Globs/directories are supported.
- optPredicate
An optional pushdown predicate to use when reading Parquet + Avro. Defaults to None.
- optProjection
An optional projection schema to use when reading Parquet + Avro. Defaults to None.
- returns
Returns a SequenceDataset.
-
def
loadParquetSlices(pathName: String, optPredicate: Option[FilterPredicate] = None, optProjection: Option[Schema] = None): SliceDataset
Load a path name in Parquet + Avro format into a SliceDataset.
Load a path name in Parquet + Avro format into a SliceDataset.
- pathName
The path name to load slices from. Globs/directories are supported.
- optPredicate
An optional pushdown predicate to use when reading Parquet + Avro. Defaults to None.
- optProjection
An optional projection schema to use when reading Parquet + Avro. Defaults to None.
- returns
Returns a SliceDataset.
-
def
loadParquetVariantContexts(pathName: String): VariantContextDataset
Load a path name in Parquet + Avro format into a VariantContextDataset.
Load a path name in Parquet + Avro format into a VariantContextDataset.
- pathName
The path name to load variant context records from. Globs/directories are supported.
- returns
Returns a VariantContextDataset.
-
def
loadParquetVariants(pathName: String, optPredicate: Option[FilterPredicate] = None, optProjection: Option[Schema] = None): VariantDataset
Load a path name in Parquet format into a VariantDataset.
Load a path name in Parquet format into a VariantDataset.
- pathName
The path name to load variants from. Globs/directories are supported.
- optPredicate
An optional pushdown predicate to use when reading Parquet + Avro. Defaults to None.
- optProjection
An optional projection schema to use when reading Parquet + Avro. Defaults to None.
- returns
Returns a VariantDataset.
-
def
loadPartitionedParquetAlignments(pathName: String, regions: Iterable[ReferenceRegion] = Iterable.empty, optLookbackPartitions: Option[Int] = Some(1)): AlignmentDataset
Load a path name with range binned partitioned Parquet format into an AlignmentDataset.
Load a path name with range binned partitioned Parquet format into an AlignmentDataset.
- pathName
The path name to load alignments from. Globs/directories are supported.
- regions
Optional list of genomic regions to load.
- optLookbackPartitions
Number of partitions to look back to find the beginning of an overlapping region when using the filterByOverlappingRegions function on the returned dataset. Defaults to one partition.
- returns
Returns an AlignmentDataset.
- Note
The sequence dictionary is read from an Avro file stored at pathName/_references.avro and the read group dictionary is read from an Avro file stored at pathName/_readGroups.avro. These files are pure Avro, not Parquet + Avro.
-
def
loadPartitionedParquetFeatures(pathName: String, regions: Iterable[ReferenceRegion] = Iterable.empty, optLookbackPartitions: Option[Int] = Some(1)): FeatureDataset
Load a path name with range binned partitioned Parquet format into a FeatureDataset.
Load a path name with range binned partitioned Parquet format into a FeatureDataset.
- pathName
The path name to load features from. Globs/directories are supported.
- regions
Optional list of genomic regions to load.
- optLookbackPartitions
Number of partitions to look back to find the beginning of an overlapping region when using the filterByOverlappingRegions function on the returned dataset. Defaults to one partition.
- returns
Returns a FeatureDataset.
-
def
loadPartitionedParquetGenotypes(pathName: String, regions: Iterable[ReferenceRegion] = Iterable.empty, optLookbackPartitions: Option[Int] = Some(1)): GenotypeDataset
Load a path name with range binned partitioned Parquet format into a GenotypeDataset.
Load a path name with range binned partitioned Parquet format into a GenotypeDataset.
- pathName
The path name to load genotypes from. Globs/directories are supported.
- regions
Optional list of genomic regions to load.
- optLookbackPartitions
Number of partitions to look back to find the beginning of an overlapping region when using the filterByOverlappingRegions function on the returned dataset. Defaults to one partition.
- returns
Returns a GenotypeDataset.
-
def
loadPartitionedParquetVariantContexts(pathName: String, regions: Iterable[ReferenceRegion] = Iterable.empty, optLookbackPartitions: Option[Int] = Some(1)): VariantContextDataset
Load a path name with range binned partitioned Parquet format into a VariantContextDataset.
Load a path name with range binned partitioned Parquet format into a VariantContextDataset.
- pathName
The path name to load variant context records from. Globs/directories are supported.
- regions
Optional list of genomic regions to load.
- optLookbackPartitions
Number of partitions to look back to find the beginning of an overlapping region when using the filterByOverlappingRegions function on the returned dataset. Defaults to one partition.
- returns
Returns a VariantContextDataset.
-
def
loadPartitionedParquetVariants(pathName: String, regions: Iterable[ReferenceRegion] = Iterable.empty, optLookbackPartitions: Option[Int] = Some(1)): VariantDataset
Load a path name with range binned partitioned Parquet format into a VariantDataset.
Load a path name with range binned partitioned Parquet format into a VariantDataset.
- pathName
The path name to load variants from. Globs/directories are supported.
- regions
Optional list of genomic regions to load.
- optLookbackPartitions
Number of partitions to look back to find the beginning of an overlapping region when using the filterByOverlappingRegions function on the returned dataset. Defaults to one partition.
- returns
Returns a VariantDataset.
-
def
loadProteinSequences(pathName: String, optPredicate: Option[FilterPredicate] = None, optProjection: Option[Schema] = None): SequenceDataset
Load protein sequences into a SequenceDataset.
Load protein sequences into a SequenceDataset.
If the path name has a .fa/.fasta extension, load as FASTA format. Else, fall back to Parquet + Avro.
For FASTA format, compressed files are supported through compression codecs configured in Hadoop, which by default include .gz and .bz2, but can include more.
- pathName
The path name to load sequences from. Globs/directories are supported, although file extension must be present for FASTA format.
- optPredicate
An optional pushdown predicate to use when reading Parquet + Avro. Defaults to None.
- optProjection
An optional projection schema to use when reading Parquet + Avro. Defaults to None.
- returns
Returns a SequenceDataset containing protein sequences.
- See also
loadFastaProtein
loadParquetSequences
-
def
loadReads(df: DataFrame, references: SequenceDictionary): ReadDataset
Load the specified data frame and references into a ReadDataset.
Load the specified data frame and references into a ReadDataset.
- df
Data frame to load from.
- references
References for the ReadDataset, may be empty.
- returns
Returns a new ReadDataset loaded from the specified data frame and references.
-
def
loadReads(df: DataFrame, metadataPathName: String): ReadDataset
Load the specified data frame into a ReadDataset, with metadata loaded from the specified metadata path name.
Load the specified data frame into a ReadDataset, with metadata loaded from the specified metadata path name.
- df
Data frame to load from.
- metadataPathName
Path name to load metadata from.
- returns
Returns a new ReadDataset loaded from the specified data frame, with metadata loaded from the specified metadata path name.
-
def
loadReads(df: DataFrame): ReadDataset
Load the specified data frame into a ReadDataset, with empty metadata.
Load the specified data frame into a ReadDataset, with empty metadata.
- df
Data frame to load from.
- returns
Returns a new ReadDataset loaded from the specified data frame, with empty metadata.
-
def
loadReferenceFile(pathName: String, maximumLength: Long): ReferenceFile
Load reference sequences into a broadcastable ReferenceFile.
Load reference sequences into a broadcastable ReferenceFile.
If the path name has a .2bit extension, loads a 2bit file. Else, uses loadSlices to load the reference as an RDD, which is then collected to the driver.
- pathName
The path name to load reference sequences from. Globs/directories for 2bit format are not supported.
- maximumLength
Maximum fragment length. Defaults to 10000L. Values greater than 1e9 should be avoided.
- returns
Returns a broadcastable ReferenceFile.
- See also
loadSlices
-
def
loadRnaSequences(pathName: String, optPredicate: Option[FilterPredicate] = None, optProjection: Option[Schema] = None): SequenceDataset
Load RNA sequences into a SequenceDataset.
Load RNA sequences into a SequenceDataset.
If the path name has a .fa/.fasta extension, load as FASTA format. Else, fall back to Parquet + Avro.
For FASTA format, compressed files are supported through compression codecs configured in Hadoop, which by default include .gz and .bz2, but can include more.
- pathName
The path name to load sequences from. Globs/directories are supported, although file extension must be present for FASTA format.
- optPredicate
An optional pushdown predicate to use when reading Parquet + Avro. Defaults to None.
- optProjection
An optional projection schema to use when reading Parquet + Avro. Defaults to None.
- returns
Returns a SequenceDataset containing RNA sequences.
- See also
loadFastaRna
loadParquetSequences
-
def
loadSequenceDictionary(pathName: String): SequenceDictionary
Load a sequence dictionary.
Load a sequence dictionary.
Loads path names ending in .dict as HTSJDK sequence dictionary format, .genome as Bedtools genome file format, and .txt as UCSC Genome Browser chromInfo files.
Compressed files are supported through compression codecs configured in Hadoop, which by default include .gz and .bz2, but can include more.
- pathName
The path name to load a sequence dictionary from.
- returns
Returns a sequence dictionary.
- Exceptions thrown
IllegalArgumentException
if pathName file extension not one of .dict, .genome, or .txt
-
def
loadSequences(df: DataFrame, references: SequenceDictionary): SequenceDataset
Load the specified data frame and references into a SequenceDataset.
Load the specified data frame and references into a SequenceDataset.
- df
Data frame to load from.
- references
References for the SequenceDataset, may be empty.
- returns
Returns a new SequenceDataset loaded from the specified data frame and references.
-
def
loadSequences(df: DataFrame, metadataPathName: String): SequenceDataset
Load the specified data frame into a SequenceDataset, with metadata loaded from the specified metadata path name.
Load the specified data frame into a SequenceDataset, with metadata loaded from the specified metadata path name.
- df
Data frame to load from.
- metadataPathName
Path name to load metadata from.
- returns
Returns a new SequenceDataset loaded from the specified data frame, with metadata loaded from the specified metadata path name.
-
def
loadSequences(df: DataFrame): SequenceDataset
Load the specified data frame into a SequenceDataset, with empty metadata.
Load the specified data frame into a SequenceDataset, with empty metadata.
- df
Data frame to load from.
- returns
Returns a new SequenceDataset loaded from the specified data frame, with empty metadata.
-
def
loadSlices(df: DataFrame, references: SequenceDictionary): SliceDataset
Load the specified data frame and references into a SliceDataset.
Load the specified data frame and references into a SliceDataset.
- df
Data frame to load from.
- references
References for the SliceDataset, may be empty.
- returns
Returns a new SliceDataset loaded from the specified data frame and references.
-
def
loadSlices(df: DataFrame, metadataPathName: String): SliceDataset
Load the specified data frame into a SliceDataset, with metadata loaded from the specified metadata path name.
Load the specified data frame into a SliceDataset, with metadata loaded from the specified metadata path name.
- df
Data frame to load from.
- metadataPathName
Path name to load metadata from.
- returns
Returns a new SliceDataset loaded from the specified data frame, with metadata loaded from the specified metadata path name.
-
def
loadSlices(df: DataFrame): SliceDataset
Load the specified data frame into a SliceDataset, with empty metadata.
Load the specified data frame into a SliceDataset, with empty metadata.
- df
Data frame to load from.
- returns
Returns a new SliceDataset loaded from the specified data frame, with empty metadata.
-
def
loadSlices(pathName: String, maximumLength: Long = 10000L, optPredicate: Option[FilterPredicate] = None, optProjection: Option[Schema] = None): SliceDataset
Load slices into a SliceDataset.
Load slices into a SliceDataset.
If the path name has a .fa/.fasta extension, load as DNA in FASTA format. Else, fall back to Parquet + Avro.
For FASTA format, compressed files are supported through compression codecs configured in Hadoop, which by default include .gz and .bz2, but can include more.
- pathName
The path name to load DNA slices from. Globs/directories are supported, although file extension must be present for FASTA format.
- maximumLength
Maximum slice length. Defaults to 10000L. Values greater than 1e9 should be avoided.
- optPredicate
An optional pushdown predicate to use when reading Parquet + Avro. Defaults to None.
- optProjection
An optional projection schema to use when reading Parquet + Avro. Defaults to None.
- returns
Returns a SliceDataset.
- See also
loadFastaDna(String, Long)
loadParquetSlices
-
def
loadUnpairedFastq(pathName: String, setFirstOfPair: Boolean = false, setSecondOfPair: Boolean = false, optReadGroup: Option[String] = None, stringency: ValidationStringency = ValidationStringency.STRICT): AlignmentDataset
Load unaligned alignments from unpaired FASTQ into an AlignmentDataset.
Load unaligned alignments from unpaired FASTQ into an AlignmentDataset.
- pathName
The path name to load unaligned alignments from. Globs/directories are supported.
- setFirstOfPair
If true, sets the unaligned alignment as first from the fragment. Defaults to false.
- setSecondOfPair
If true, sets the unaligned alignment as second from the fragment. Defaults to false.
- optReadGroup
The optional read group identifier to associate to the unaligned alignment records. Defaults to None.
- stringency
The validation stringency to use when validating unpaired FASTQ format. Defaults to ValidationStringency.STRICT.
- returns
Returns an unaligned AlignmentDataset.
-
def
loadVariantContexts(df: DataFrame, references: SequenceDictionary, samples: Seq[Sample], headerLines: Seq[VCFHeaderLine]): VariantContextDataset
Load the specified data frame, references, samples, and header lines into a VariantContextDataset.
Load the specified data frame, references, samples, and header lines into a VariantContextDataset.
- df
Data frame to load from.
- references
References for the VariantContextDataset, may be empty.
- samples
Samples for the VariantContextDataset, may be empty.
- headerLines
Header lines for the VariantContextDataset, may be empty.
- returns
Returns a new VariantContextDataset loaded from the specified data frame, references, samples, and header lines.
-
def
loadVariantContexts(df: DataFrame, references: SequenceDictionary, samples: Seq[Sample]): VariantContextDataset
Load the specified data frame, references, and samples into a VariantContextDataset, with the default header lines.
Load the specified data frame, references, and samples into a VariantContextDataset, with the default header lines.
- df
Data frame to load from.
- references
References for the VariantContextDataset, may be empty.
- samples
Samples for the VariantContextDataset, may be empty.
- returns
Returns a new VariantContextDataset loaded from the specified data frame, references, and samples, with the default header lines.
-
def
loadVariantContexts(df: DataFrame, metadataPathName: String): VariantContextDataset
Load the specified data frame into a VariantContextDataset, with metadata loaded from the specified metadata path name.
Load the specified data frame into a VariantContextDataset, with metadata loaded from the specified metadata path name.
- df
Data frame to load from.
- metadataPathName
Path name to load metadata from.
- returns
Returns a new VariantContextDataset loaded from the specified data frame, with metadata loaded from the specified metadata path name.
-
def
loadVariantContexts(df: DataFrame): VariantContextDataset
Load the specified data frame into a VariantContextDataset, with empty metadata and the default header lines.
Load the specified data frame into a VariantContextDataset, with empty metadata and the default header lines.
- df
Data frame to load from.
- returns
Returns a new VariantContextDataset loaded from the specified data frame, with empty metadata and the default header lines.
-
def
loadVariantContexts(pathName: String): VariantContextDataset
Load a path name in VCF or Parquet format into a VariantContextDataset.
Load a path name in VCF or Parquet format into a VariantContextDataset.
- pathName
The path name to load variant context records from. Globs/directories are supported.
- returns
Returns a VariantContextDataset.
-
def
loadVariantContexts(pathName: String, stringency: ValidationStringency): VariantContextDataset
Load a path name in VCF or Parquet format into a VariantContextDataset.
Load a path name in VCF or Parquet format into a VariantContextDataset.
- pathName
The path name to load variant context records from. Globs/directories are supported.
- stringency
The validation stringency to use when validating VCF format.
- returns
Returns a VariantContextDataset.
-
def
loadVariants(df: DataFrame, references: SequenceDictionary, headerLines: Seq[VCFHeaderLine]): VariantDataset
Load the specified data frame, references, and header lines into a VariantDataset.
Load the specified data frame, references, and header lines into a VariantDataset.
- df
Data frame to load from.
- references
References for the VariantDataset, may be empty.
- headerLines
Header lines for the VariantDataset, may be empty.
- returns
Returns a new VariantDataset loaded from the specified data frame, references, and header lines.
-
def
loadVariants(df: DataFrame, references: SequenceDictionary): VariantDataset
Load the specified data frame and references into a VariantDataset, with the default header lines.
Load the specified data frame and references into a VariantDataset, with the default header lines.
- df
Data frame to load from.
- references
References for the VariantDataset, may be empty.
- returns
Returns a new VariantDataset loaded from the specified data frame and references, with the default header lines.
-
def
loadVariants(df: DataFrame, metadataPathName: String): VariantDataset
Load the specified data frame into a VariantDataset, with metadata loaded from the specified metadata path name.
Load the specified data frame into a VariantDataset, with metadata loaded from the specified metadata path name.
- df
Data frame to load from.
- metadataPathName
Path name to load metadata from.
- returns
Returns a new VariantDataset loaded from the specified data frame, with metadata loaded from the specified metadata path name.
-
def
loadVariants(df: DataFrame): VariantDataset
Load the specified data frame into a VariantDataset, with empty metadata and the default header lines.
Load the specified data frame into a VariantDataset, with empty metadata and the default header lines.
- df
Data frame to load from.
- returns
Returns a new VariantDataset loaded from the specified data frame, with empty metadata and the default header lines.
-
def
loadVariants(pathName: String, optPredicate: Option[FilterPredicate] = None, optProjection: Option[Schema] = None, stringency: ValidationStringency = ValidationStringency.STRICT): VariantDataset
Load variants into a VariantDataset.
Load variants into a VariantDataset.
If the path name has a .vcf/.vcf.gz/.vcf.bgz extension, load as VCF format. Else, fall back to Parquet + Avro.
- pathName
The path name to load variants from. Globs/directories are supported, although file extension must be present for VCF format.
- optPredicate
An optional pushdown predicate to use when reading Parquet + Avro. Defaults to None.
- optProjection
An option projection schema to use when reading Parquet + Avro. Defaults to None.
- stringency
The validation stringency to use when validating VCF format. Defaults to ValidationStringency.STRICT.
- returns
Returns a VariantDataset.
- See also
loadVcf
loadParquetVariants
-
def
loadVcf(pathName: String, stringency: ValidationStringency = ValidationStringency.STRICT): VariantContextDataset
Load variant context records from VCF into a VariantContextDataset.
Load variant context records from VCF into a VariantContextDataset.
- pathName
The path name to load VCF variant context records from. Globs/directories are supported.
- stringency
The validation stringency to use when validating VCF format. Defaults to ValidationStringency.STRICT.
- returns
Returns a VariantContextDataset.
-
def
loadVcfWithProjection(pathName: String, infoFields: Set[String], formatFields: Set[String], stringency: ValidationStringency = ValidationStringency.STRICT): VariantContextDataset
Load variant context records from VCF into a VariantContextDataset.
Load variant context records from VCF into a VariantContextDataset.
Only converts the core Genotype/Variant fields, and the fields set in the requested projection. Core variant fields include:
* Names (ID) * Filters (FILTER)
Core genotype fields include:
* Allelic depth (AD) * Read depth (DP) * Min read depth (MIN_DP) * Genotype quality (GQ) * Genotype likelihoods (GL/PL) * Strand bias components (SB) * Phase info (PS,PQ)
- pathName
The path name to load VCF variant context records from. Globs/directories are supported.
- infoFields
The info fields to include, in addition to the ID and FILTER attributes.
- formatFields
The format fields to include, in addition to the core fields listed above.
- stringency
The validation stringency to use when validating VCF format. Defaults to ValidationStringency.STRICT.
- returns
Returns a VariantContextDataset.
-
def
logger: Logger
- Attributes
- protected
- Definition Classes
- Logging
-
def
loggerName: String
- Attributes
- protected
- Definition Classes
- Logging
-
final
def
ne(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef
-
final
def
notify(): Unit
- Definition Classes
- AnyRef
- Annotations
- @native()
-
final
def
notifyAll(): Unit
- Definition Classes
- AnyRef
- Annotations
- @native()
- val sc: SparkContext
-
lazy val
spark: SparkSession
- Annotations
- @transient()
-
final
def
synchronized[T0](arg0: ⇒ T0): T0
- Definition Classes
- AnyRef
-
def
toString(): String
- Definition Classes
- AnyRef → Any
-
def
trace(mkr: Marker, msg: ⇒ Any, t: ⇒ Throwable): Unit
- Attributes
- protected
- Definition Classes
- Logging
-
def
trace(msg: ⇒ Any, t: ⇒ Throwable): Unit
- Attributes
- protected
- Definition Classes
- Logging
-
def
trace(msg: ⇒ Any): Unit
- Attributes
- protected
- Definition Classes
- Logging
-
final
def
wait(): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... )
-
final
def
wait(arg0: Long, arg1: Int): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... )
-
final
def
wait(arg0: Long): Unit
- Definition Classes
- AnyRef
- Annotations
- @native() @throws( ... )
-
def
warn(mkr: Marker, msg: ⇒ Any, t: ⇒ Throwable): Unit
- Attributes
- protected
- Definition Classes
- Logging
-
def
warn(msg: ⇒ Any, t: ⇒ Throwable): Unit
- Attributes
- protected
- Definition Classes
- Logging
-
def
warn(msg: ⇒ Any): Unit
- Attributes
- protected
- Definition Classes
- Logging