ShuffleSerializer

A serializer for Shark/Hive-specific serialization used in Spark shuffle. Since this is only used in shuffle operations, only serializeStream and deserializeStream are implemented.

The serialization process is very simple:

- Shark operators use Hive serializers to serialize the data structures into byte arrays (each wrapped in a BytesWritable object).
- Shark operators wrap each key (BytesWritable) in a ReduceKeyMapSide object. The values remain unchanged as BytesWritable.
- ShuffleSerializationStream simply flushes the underlying byte arrays for key/value into the serialization stream. The length is prepended before the byte array so the deserializer knows how many bytes to read.

The deserialization process simply reverses the above, with a few caveats:

- The data type for the keys becomes ReduceKeyReduceSide, wrapping around a byte array (rather than a BytesWritable).
- The data type for the values becomes a byte array, rather than a BytesWritable.

The reason for both changes is that during aggregations and joins (post shuffle), the key-value pairs are inserted into a hash table, and we want to keep that hash table small. Having the BytesWritable wrapper would increase the size of the hash table by another 16 bytes per key-value pair.

Linear Supertypes

Serializer, AnyRef, Any

Instance Constructors

new ShuffleSerializer()
new ShuffleSerializer(conf: SparkConf)

Value Members

final def !=(arg0: AnyRef): Boolean

Definition Classes
AnyRef
final def !=(arg0: Any): Boolean

Definition Classes
Any
final def ##(): Int

Definition Classes
AnyRef → Any
final def ==(arg0: AnyRef): Boolean

Definition Classes
AnyRef
final def ==(arg0: Any): Boolean

Definition Classes
Any
final def asInstanceOf[T0]: T0

Definition Classes
Any
def clone(): AnyRef

Attributes
protected[java.lang]
Definition Classes
AnyRef
Annotations
@throws( ... )
final def eq(arg0: AnyRef): Boolean

Definition Classes
AnyRef
def equals(arg0: Any): Boolean

Definition Classes
AnyRef → Any
def finalize(): Unit

Attributes
protected[java.lang]
Definition Classes
AnyRef
Annotations
@throws( classOf[java.lang.Throwable] )
final def getClass(): Class[_]

Definition Classes
AnyRef → Any
def hashCode(): Int

Definition Classes
AnyRef → Any
final def isInstanceOf[T0]: Boolean

Definition Classes
Any
final def ne(arg0: AnyRef): Boolean

Definition Classes
AnyRef
def newInstance(): SerializerInstance

Definition Classes
ShuffleSerializer → Serializer
final def notify(): Unit

Definition Classes
AnyRef
final def notifyAll(): Unit

Definition Classes
AnyRef
final def synchronized[T0](arg0: ⇒ T0): T0

Definition Classes
AnyRef
def toString(): String

Definition Classes
AnyRef → Any
final def wait(): Unit

Definition Classes
AnyRef
Annotations
@throws( ... )
final def wait(arg0: Long, arg1: Int): Unit

Definition Classes
AnyRef
Annotations
@throws( ... )
final def wait(arg0: Long): Unit

Definition Classes
AnyRef
Annotations
@throws( ... )

class ShuffleSerializer extends Serializer

Instance Constructors

new ShuffleSerializer()

new ShuffleSerializer(conf: SparkConf)

Value Members

final def !=(arg0: AnyRef): Boolean

final def !=(arg0: Any): Boolean

final def ##(): Int

final def ==(arg0: AnyRef): Boolean

final def ==(arg0: Any): Boolean

final def asInstanceOf[T0]: T0

def clone(): AnyRef

final def eq(arg0: AnyRef): Boolean

def equals(arg0: Any): Boolean

def finalize(): Unit

final def getClass(): Class[_]

def hashCode(): Int

final def isInstanceOf[T0]: Boolean

final def ne(arg0: AnyRef): Boolean

def newInstance(): SerializerInstance

final def notify(): Unit

final def notifyAll(): Unit

final def synchronized[T0](arg0: ⇒ T0): T0

def toString(): String

final def wait(): Unit

final def wait(arg0: Long, arg1: Int): Unit

final def wait(arg0: Long): Unit

Inherited from Serializer

Inherited from AnyRef

Inherited from Any

Ungrouped