Builder classes used internally to implement coGroups (joins).
Csv value source: fields are separated by commas, and all fields are wrapped in quotes.
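As a hedged sketch of using such a source (the paths, field names, and the age filter are all hypothetical):

```scala
import com.twitter.scalding._

// Hypothetical job: read a quoted two-column CSV, keep rows with
// age >= 18, and write the result back out as CSV.
class CsvFilterJob(args: Args) extends Job(args) {
  Csv(args("input"), fields = ('name, 'age))
    .read
    .filter('age) { age: Int => age >= 18 }
    .write(Csv(args("output")))
}
```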
Sets up an implicit dateRange to use in your sources and an implicit timezone.
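A minimal sketch of building such a range by hand, assuming a Scalding version where RichDate parses date strings; the dates are illustrative:

```scala
import com.twitter.scalding._

object DateRangeExample {
  // An implicit TimeZone must be in scope to parse date strings.
  implicit val tz: java.util.TimeZone = DateOps.UTC
  // One week, inclusive of both endpoints.
  val week: DateRange = DateRange(RichDate("2014-01-01"), RichDate("2014-01-07"))
}
```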
Mix this in for delimited schemes such as TSV or one-separated values. By default, TSV is used.
This is a base class for file-based sources.
This handles the mapReduceMap work on the map-side of the operation.
Implements reductions on top of a simple abstraction for the Fields API. We use the F-bounded polymorphism trick so that each operation returns the type Self.
This controls the sequence of reductions that happen inside a particular grouping operation.
Represents a grouping, which is the transition from the map phase to the reduce phase in Hadoop.
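The classic word count shows this transition: everything before groupBy runs map-side, and the size count runs reduce-side. A sketch (paths and field names are hypothetical):

```scala
import com.twitter.scalding._

class WordCountJob(args: Args) extends Job(args) {
  TextLine(args("input"))
    .flatMap('line -> 'word) { line: String => line.split("""\s+""") }
    .groupBy('word) { _.size }   // map phase ends here; reduce begins
    .write(Tsv(args("output")))
}
```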
Thrown when validateTaps fails.
Allows an iterable defined in the job (on the submitter) to be used within a Job as you would a Pipe/RichPipe.
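A sketch, assuming a small in-memory list (the field name 'n and the output path are hypothetical):

```scala
import com.twitter.scalding._

// The List lives on the submitter but behaves like any other pipe.
class SquaresJob(args: Args) extends Job(args) {
  IterableSource(List(1, 2, 3, 4), 'n)
    .map('n -> 'square) { n: Int => n * n }
    .write(Tsv(args("output")))
}
```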
This class is used to construct unit tests for scalding jobs.
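A sketch of testing the hypothetical WordCountJob above: sources and sinks are replaced with in-memory buffers, so no cluster is needed:

```scala
import com.twitter.scalding._

object WordCountJobTest {
  def main(unused: Array[String]): Unit =
    JobTest(new WordCountJob(_))
      .arg("input", "inFile")
      .arg("output", "outFile")
      .source(TextLine("inFile"), List((0, "hello world hello")))
      .sink[(String, Long)](Tsv("outFile")) { buf =>
        assert(buf.toSet == Set(("hello", 2L), ("world", 1L)))
      }
      .run
      .finish
}
```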
This Source writes out the TupleEntry as a simple JSON object, using the field names as keys and the string representation of the values.
Represents sharded lists of items of type T.
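This is the core abstraction of the typed API; a sketch in the style of the Scalding README's word count (paths are hypothetical):

```scala
import com.twitter.scalding._

class TypedWordCountJob(args: Args) extends Job(args) {
  TypedPipe.from(TextLine(args("input")))
    .flatMap(_.split("""\s+"""))
    .groupBy(identity)   // Grouped[String, String]
    .size                // one (word, count) pair per word
    .write(TypedTsv[(String, Long)](args("output")))
}
```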
The MapReduceMapBy class.
Usually as soon as we open a source, we read and do some mapping operation on a single column or set of columns.
An implementation of map-side combining which is appropriate for associative and commutative functions. If a cacheSize is given, it is used; otherwise we query the config for Cascading's equivalent threshold setting.
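The typed API relies on this when summing; as a hedged sketch, sumByKey over (key, count) pairs can pre-aggregate map-side because Long addition is associative and commutative (the sources here are hypothetical):

```scala
import com.twitter.scalding._

class SumCountsJob(args: Args) extends Job(args) {
  TypedPipe.from(TypedTsv[(String, Long)](args("input")))
    .sumByKey   // partial sums are combined map-side before the shuffle
    .write(TypedTsv[(String, Long)](args("output")))
}
```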
There are three ways to run jobs; sourceStrictness is set to true.
Delimited-file source that allows overriding the separator and quotation characters and the header configuration.
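A sketch of those overrides via Csv's constructor parameters (the path and separator are hypothetical):

```scala
import com.twitter.scalding._

object DelimitedOverrides {
  // A pipe-separated file whose first line is a header row.
  val pipeSeparated = Csv("events.psv", separator = "|", skipHeader = true)
}
```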
A tap that outputs nothing.
This just blindly uses the first public constructor with the same arity as the fields size.
One-separated value source (commonly used by Pig).
Implements reductions on top of a simple abstraction for the Fields API. This is for associative and commutative operations (Monoids in particular play a big role here).
Packs a tuple into any object with set methods, e.g. a JavaBean-style object.
Scala 2.
Represents a strategy for replicating rows when performing skewed joins.
See https://github.
Every source must have a correct toString method.
A simple trait for a releasable resource.
Ensures that a _SUCCESS file is present in the Source path.
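A hedged sketch of mixing this into a custom TSV-like source, assuming the FixedPathSource/DelimitedScheme base types from scalding-core (the class name is hypothetical):

```scala
import com.twitter.scalding._

// Reads succeed only when the input path also contains Hadoop's
// _SUCCESS marker file.
class GuardedTsv(p: String) extends FixedPathSource(p)
  with DelimitedScheme with SuccessFileSource
```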
Memory-only testing for unit tests.
The fields here are ('offset, 'line).
This will automatically produce a globbed version of the given path.
Tab-separated value source.
Mixed in to both TupleConverter and TupleSetter to improve arity safety of Cascading jobs before we run anything on Hadoop.
Represents a phase in a distributed computation on an input data source. Wraps a Cascading Pipe object and holds the transformations done up to that point.
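A sketch of the kind of transformation chain it accumulates (field names and paths are hypothetical):

```scala
import com.twitter.scalding._

class CleanupJob(args: Args) extends Job(args) {
  Tsv(args("input"), ('id, 'name, 'score))
    .read
    .filter('score) { s: Double => s > 0.0 } // drop non-positive scores
    .rename('name -> 'user)
    .project('id, 'user, 'score)
    .write(Tsv(args("output")))
}
```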
Provides handlers and mappings for exceptions.
This object has all the implicit functions and values that are used to make the Scalding DSL.
TODO: at the next binary-incompatible version, remove the AbstractFunction2/scala.
A source that outputs nothing.
A helper for working with class reflection.
Provides an apply method for creating XHandlers with default or custom settings, and contains messages and mappings.
Implicits for the type-safe DSL; import TDsl._ to bring them into scope.
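A sketch of crossing from the fields API into the typed API with these implicits (field names and types are hypothetical):

```scala
import com.twitter.scalding._
import TDsl._

class ToTypedJob(args: Args) extends Job(args) {
  Tsv(args("input"), ('user, 'score))
    .read
    .toTypedPipe[(String, Double)](('user, 'score))
    .filter { case (_, score) => score > 0.5 }
    .write(TypedTsv[(String, Double)](args("output")))
}
```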
Base class for classes which pack a Tuple into a serializable object.
Base class for objects which unpack an object into a tuple.
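A sketch of both directions using the reflection-based packer and unpacker; the Person bean and the field names are hypothetical:

```scala
import com.twitter.scalding._

// Hypothetical bean: reflection discovers the set*/get* methods.
class Person {
  private var name: String = ""
  private var age: Int = 0
  def setName(n: String): Unit = { name = n }
  def getName: String = name
  def setAge(a: Int): Unit = { age = a }
  def getAge: Int = age
}

class PackUnpackJob(args: Args) extends Job(args) {
  Tsv(args("input"), ('name, 'age))
    .read
    .pack[Person](('name, 'age) -> 'person)   // fields -> object
    .unpack[Person]('person -> ('name, 'age)) // object -> fields
    .write(Tsv(args("output")))
}
```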
Factory methods for TypedPipe.
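Two common factories, sketched (the path is hypothetical; a recent Scalding version is assumed):

```scala
import com.twitter.scalding._

object TypedPipeFactories {
  val nums: TypedPipe[Int]     = TypedPipe.from(List(1, 2, 3))
  val lines: TypedPipe[String] = TypedPipe.from(TextLine("input.txt"))
}
```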
Allows you to set the types explicitly; prefer this. If T is a subclass of Product, we assume it is a tuple.
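For instance (paths hypothetical; the Tuple1 wrapper follows the convention just described for non-tuple column types):

```scala
import com.twitter.scalding._

object TypedSources {
  // Two columns: a String and an Int.
  val pairs = TypedTsv[(String, Int)]("pairs.tsv")
  // A single non-tuple column, wrapped in Tuple1.
  val names = TypedTsv[Tuple1[String]]("names.tsv")
}
```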
(Since version 0.8.3) Use Ordering.fromLessThan, duh.