class
CachedCountRegistry extends AnyRef
Value Members
-
final
def
!=(arg0: AnyRef): Boolean
-
final
def
!=(arg0: Any): Boolean
-
final
def
##(): Int
-
final
def
==(arg0: AnyRef): Boolean
-
final
def
==(arg0: Any): Boolean
-
final
def
asInstanceOf[T0]: T0
-
def
cachedCount(rdd: RDD[_]): Long
-
def
clone(): AnyRef
-
final
def
eq(arg0: AnyRef): Boolean
-
def
equals(arg0: Any): Boolean
-
def
finalize(): Unit
-
final
def
getClass(): Class[_]
-
def
hashCode(): Int
-
final
def
isInstanceOf[T0]: Boolean
-
def
multiCachedCount(rdds: Seq[RDD[_]]): Seq[Long]
-
final
def
ne(arg0: AnyRef): Boolean
-
final
def
notify(): Unit
-
final
def
notifyAll(): Unit
-
final
def
synchronized[T0](arg0: ⇒ T0): T0
-
def
toString(): String
-
final
def
wait(): Unit
-
final
def
wait(arg0: Long, arg1: Int): Unit
-
final
def
wait(arg0: Long): Unit
Inherited from AnyRef
Inherited from Any
CachedCountRegistry adds a
.size
method to RDDs that mimics RDD.count, but caches its result. It also exposes
.sizes
and .total
on Seq[RDD]s, Tuple2[RDD]s, and Tuple3[RDD]s, which compute the constituent RDDs' sizes (per above) in one Spark job. Additionally, all the above APIs optimize computations on UnionRDDs by computing their component RDDs' sizes and caching those as well as the UnionRDD's total.
Cached
size
info is keyed by a SparkContext for robustness in apps that stop their SparkContext and then resume with a new one; this is especially useful for testing! Usage: