Package com.coxautodata.waimak.dataflow.spark

package spark

Linear Supertypes: AnyRef, Any

Type Members

  1. class CacheAsParquetConfigurationExtension extends CacheConfigurationExtension

  2. case class CacheAsParquetMetaItem(partitions: Option[Either[Seq[String], Int]], repartition: Boolean) extends CacheMetaItem with Product with Serializable

  3. trait CacheConfigurationExtension extends DataFlowConfigurationExtension[SparkDataFlow]

  4. case class CacheMeta(cached: Map[String, CacheMetaItem]) extends Product with Serializable

  5. trait CacheMetaItem extends AnyRef

  6. case class CacheMetadataExtension(cacheMeta: CacheMeta) extends DataFlowMetadataExtension[SparkDataFlow] with Logging with Product with Serializable

  7. type CleanUpStrategy[T] = (TableName, InputSnapshots[T]) ⇒ SnapshotsToDelete[T]

  8. case class FSCleanUp(baseFolder: String, toRemove: CleanUpStrategy[FileStatus], labelsToClean: Seq[String]) extends SparkDataFlowAction with Logging with Product with Serializable

    Action that deletes snapshots based on the cleanup strategy. It can clean up one or more labels. (A usage sketch follows this member list.)

    baseFolder

    root folder that contains label folders

    toRemove

    function that returns the list of snapshot folders to remove for a given label

  9. type InputSnapshots[T] = Seq[T]

  10. case class LabelCommitDefinition(basePath: String, timestampFolder: Option[String] = None, partitions: Seq[String] = Seq.empty, connection: Option[HadoopDBConnector] = None) extends Product with Serializable

  11. case class ParquetDataCommitter(outputBaseFolder: String, snapshotFolder: Option[String] = None, cleanupStrategy: Option[CleanUpStrategy[FileStatus]] = None, hadoopDBConnector: Option[HadoopDBConnector] = None) extends DataCommitter[SparkDataFlow] with Logging with Product with Serializable

    Adds the actions necessary to commit labels as Parquet; supports snapshot folders and interaction with a DB connector. (A construction sketch follows this member list.)

    Created by Alexei Perelighin on 2018/11/05

    outputBaseFolder

    folder under which each final label will store its data, e.g. baseFolder/label_1/

    snapshotFolder

    optional name of the snapshot folder used by all labels committed via this committer. It must be a full folder name and must not repeat any previous snapshot folder for any of the commit-managed labels. Ex: baseFolder/label_1/snapshot_folder=20181128, baseFolder/label_1/snapshot_folder=20181129, baseFolder/label_2/snapshot_folder=20181128, baseFolder/label_2/snapshot_folder=20181129

    cleanupStrategy

    optional function that takes the list of available snapshots and returns the list of snapshots to remove

    hadoopDBConnector

    optional connector to the DB.

  12. class SimpleAction extends SparkDataFlowAction

    Instances of this class build a bridge between the OOP part of the Waimak engine and the functional definition of the data flow.

    Created by Alexei Perelighin on 03/11/17.

  13. type SnapshotsToDelete[T] = Seq[T]

  14. class SparkCacheConfigurationExtension extends CacheConfigurationExtension

  15. case class SparkCacheMetaItem(partitions: Option[Int], storageLevel: StorageLevel) extends CacheMetaItem with Product with Serializable

  16. class SparkDataFlow extends DataFlow[SparkDataFlow] with Logging

    Introduces a Spark session into the data flows. (A flow-building sketch follows this member list.)

  17. trait SparkDataFlowAction extends DataFlowAction

  18. implicit class SparkDataFlowExtension extends Logging

    Defines a functional builder for Spark-specific data flows and common functionality such as reading CSV/Parquet/Hive data, adding Spark SQL and Dataset steps, writing data out in various formats, and staging and committing multiple outputs into storage such as HDFS and Hive/Impala. (See the flow-building sketch after this member list.)

  19. case class SparkDataFlowInfo(spark: SparkSession, inputs: DataFlowEntities, actions: Seq[DataFlowAction], sqlTables: Set[String], tempFolder: Option[Path], schedulingMeta: SchedulingMeta, commitLabels: Map[String, LabelCommitDefinition] = Map.empty, tagState: DataFlowTagState = ..., extensionMetadata: Set[DataFlowMetadataExtension[SparkDataFlow]] = Set.empty, executor: DataFlowExecutor = Waimak.sparkExecutor()) extends Product with Serializable

  20. case class SparkFlowContext(spark: SparkSession) extends FlowContext with Product with Serializable

    Context required in a Spark data flow (SparkSession and FileSystem)

    Created by Vicky Avison on 23/02/2018.

    spark

    the SparkSession

  21. implicit class SparkInterceptorActions extends Logging

  22. class SparkSimpleAction extends SimpleAction

    Spark-specific simple action that sets the Spark-specific generics.

  23. type TableName = String

  24. case class WriteAsNamedFilesAction(label: String, tempBasePath: Path, destBasePath: Path, numberOfFiles: Int, filenamePrefix: String, format: String, options: Map[String, String]) extends SparkDataFlowAction with Product with Serializable

    Write a file or files with a specific filename to a folder. Allows you to control the final output filename without the Spark-generated part UUIDs. The filename will be $filenamePrefix.extension if the number of files is 1, otherwise $filenamePrefix.$fileNumber.extension, where the file number is incremental and zero-padded. (A construction sketch follows this member list.)

    label

    Label to write

    tempBasePath

    Base location of temporary folder

    destBasePath

    Destination path to put files in

    numberOfFiles

    Number of files to generate

    filenamePrefix

    Prefix of the filename, up to the file number and extension

    format

    Format to write (e.g. parquet, csv)

    options

    Options to pass to the DataFrameWriter
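
The sketches below are illustrative only; paths, labels, and helper values are made up. First, a minimal CleanUpStrategy for the FSCleanUp action above: a function from a table name and the available snapshots to the snapshots to delete. Keeping the five newest snapshot folders is an assumption chosen for illustration.

```scala
import org.apache.hadoop.fs.FileStatus
import com.coxautodata.waimak.dataflow.spark.{CleanUpStrategy, FSCleanUp}

// Keep the five newest snapshot folders per label and mark the rest for deletion.
// Assumes snapshot folder names (e.g. snapshot_folder=20181128) sort chronologically.
val keepNewestFive: CleanUpStrategy[FileStatus] =
  (label, snapshots) => snapshots.sortBy(_.getPath.getName).dropRight(5)

// Hypothetical base folder and labels, for illustration only.
val cleanUp = FSCleanUp(
  baseFolder = "/data/output",
  toRemove = keepNewestFive,
  labelsToClean = Seq("label_1", "label_2")
)
```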
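
A construction sketch for ParquetDataCommitter using the parameters documented above. The output folder and snapshot folder name are hypothetical, and registering the committer with a flow is not shown here.

```scala
import com.coxautodata.waimak.dataflow.spark.ParquetDataCommitter

// Hypothetical output folder and snapshot folder name, for illustration only.
val committer = ParquetDataCommitter(
  outputBaseFolder = "/data/output",
  snapshotFolder = Some("snapshot_folder=20181129"),
  cleanupStrategy = Some(keepNewestFive), // e.g. a strategy like the one in the FSCleanUp sketch
  hadoopDBConnector = None
)
```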
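
A flow-building sketch for SparkDataFlow and the SparkDataFlowExtension builder methods. The entry-point object and import path (Waimak), the builder method names (openCsv, sql), and their signatures are assumptions based on the descriptions above; check SparkDataFlowExtension for the exact API.

```scala
import org.apache.spark.sql.SparkSession
import com.coxautodata.waimak.dataflow.Waimak // import path assumed

val spark = SparkSession.builder().appName("waimak-example").getOrCreate()

// Sketch only: builder method names and signatures are assumptions.
val flow = Waimak.sparkFlow(spark)
  .openCsv("/data/input")("purchases")
  .sql("purchases")("totals",
    "SELECT customer, sum(amount) AS total FROM purchases GROUP BY customer")

// Run the flow with the default Spark executor referenced in SparkDataFlowInfo.
Waimak.sparkExecutor().execute(flow)
```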
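
A construction sketch for WriteAsNamedFilesAction with the parameters documented above; all values are hypothetical. With numberOfFiles = 1 the output would be report.csv; with more files, report.&lt;n&gt;.csv with a zero-padded file number.

```scala
import org.apache.hadoop.fs.Path
import com.coxautodata.waimak.dataflow.spark.WriteAsNamedFilesAction

// Hypothetical paths, label and options, for illustration only.
val writeNamed = WriteAsNamedFilesAction(
  label = "totals",
  tempBasePath = new Path("/tmp/waimak-staging"),
  destBasePath = new Path("/data/reports"),
  numberOfFiles = 1,
  filenamePrefix = "report",
  format = "csv",
  options = Map("header" -> "true")
)
```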

Value Members

  1. object CacheAsParquetMetadataExtensionIdentifier extends DataFlowMetadataExtensionIdentifier with Product with Serializable

  2. object CacheMeta extends Serializable

  3. object CacheMetadataExtension extends Serializable

  4. object ParquetDataCommitter extends Serializable

  5. object SparkActionHelpers

  6. object SparkDataFlow

  7. object SparkFlowReporter extends FlowReporter

  8. object SparkInterceptors extends Logging

    Defines builder functions that add various interceptors to a SparkDataFlow. (A sketch follows at the end of this section.)

    Created by Alexei Perelighin on 2018/02/24

  9. object WriteAsNamedFilesAction extends Serializable

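A sketch of applying an interceptor, assuming the SparkInterceptorActions implicit class adds a cacheAsParquet method to SparkDataFlow (as suggested by the CacheAsParquet* members above); the method name and signature are assumptions.

```scala
// Sketch only: reuses the flow value from the earlier flow-building sketch.
// cacheAsParquet is assumed to cache the given label as Parquet in the flow's temp folder.
val cachedFlow = flow.cacheAsParquet("totals")
```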

Deprecated Value Members

  1. object SparkActions

    Annotations: @deprecated

    Spark Actions are now automatically included in a SparkDataFlow
