Time taken without caching:
scala> dataRdd.flatMap(_.split(",")).map((_, 1)).reduceByKey(_+_).count
res5: Long = 10
With a cache added, first execution:
scala> dataRdd.flatMap(_.split(",")).map((_, 1)).reduceByKey(_+_).cache.count
res6: Long = 10
Note that `cache` is lazy: the data is only materialized in memory once an action is triggered.

With a cache added, second execution:
scala> dataRdd.flatMap(_.split(",")).map((_, 1)).reduceByKey(_+_).cache.count
res7: Long = 10
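Note, however, that each of the two statements above builds a brand-new RDD lineage, so the second `.cache.count` does not actually hit the cache populated by the first; it recomputes and caches a fresh RDD. To benefit from the cache, build the pipeline once, assign it to a val, and reuse that reference. A minimal sketch, assuming the same `dataRdd` as above:

scala> val wordCounts = dataRdd.flatMap(_.split(",")).map((_, 1)).reduceByKey(_ + _).cache()
scala> wordCounts.count   // first action: runs the shuffle and populates the cache
scala> wordCounts.count   // second action: served from the cached partitions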
scala> import org.apache.spark.storage.StorageLevel
import org.apache.spark.storage.StorageLevel
scala> val cached1 = dataRdd1.flatMap(_.split(",")).map((_, 1)).reduceByKey(_+_).persist(StorageLevel.MEMORY_ONLY_SER)
cached1: org.apache.spark.rdd.RDD[(String, Int)] = ShuffledRDD[10] at reduceByKey at <console>:26
(`cached` used below is the same pipeline persisted with the default MEMORY_ONLY level, defined in an earlier step that is not shown here.)
scala> cached1.count
res2: Long = 10
scala> cached1.count
res3: Long = 10
scala> cached.count
res4: Long = 10
A serialized persist makes the job run longer than a non-serialized one (each access pays the serialization/deserialization CPU cost), but the serialized data occupies noticeably less memory.
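The "Size in Memory" numbers from the UI's Storage tab can also be read programmatically, which makes this comparison easier to script. A minimal sketch using `sc.getRDDStorageInfo` (a Spark developer API; it reports only RDDs that currently hold cached blocks):

scala> sc.getRDDStorageInfo.foreach { info =>
     |   println(s"RDD ${info.id} [${info.storageLevel.description}]: ${info.memSize} bytes in memory")
     | }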
Now let's redo the experiment with code written in IDEA.

At the start, the Spark UI shows nothing.
package com.ruozedata.spark.homework

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.storage.StorageLevel

object DataSerialization {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf()
      .setAppName("someTestApp").setMaster("local[2]")
    val sc = new SparkContext(sparkConf)

    val lines = sc.textFile("/Users/Aaron/Downloads/bigdata/data/hello_data.txt")

    // Baseline: the classic word count, with no caching at all.
    val count = lines.flatMap(_.split(",")).map((_, 1)).reduceByKey(_ + _).count
    println("------------------->" + ":" + count.toString)

    // Keep the application alive so the Spark UI stays up for inspection.
    Thread.sleep(66666666)
    sc.stop()
  }
}
The result is as follows.

Next, let's try it with a cache:
package com.ruozedata.spark.homework

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.storage.StorageLevel

object DataSerialization {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf()
      .setAppName("someTestApp").setMaster("local[2]")
      // .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    val sc = new SparkContext(sparkConf)

    val lines = sc.textFile("/Users/Aaron/Downloads/bigdata/data/hello_data.txt")

    // MEMORY_ONLY
    // Job 1: no cache.
    val count = lines.flatMap(_.split(",")).map((_, 1)).reduceByKey(_ + _).count
    println("------------------->" + ":" + count.toString)

    // Job 2: same pipeline, marked for caching (cache == persist(MEMORY_ONLY)).
    val count1 = lines.flatMap(_.split(",")).map((_, 1)).reduceByKey(_ + _).cache.count
    println("------------------->" + ":" + count1.toString)

    Thread.sleep(66666666)
    sc.stop()
  }
}
The result is as follows. Click into the job to take a look.

Clearly, the run with cache completes faster. One caveat: `.cache.count` here only populates the cache; the cached blocks are never read back, since each pipeline is a fresh lineage. The cache really pays off when the same RDD reference is reused by later actions.
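A related tip: a cached RDD occupies executor memory until it is evicted or the application exits, so release it explicitly once it is no longer needed. A small sketch (`cachedCounts` is just an illustrative name):

val cachedCounts = lines.flatMap(_.split(",")).map((_, 1)).reduceByKey(_ + _).cache()
println(cachedCounts.count())
// Drop the cached blocks; pass blocking = true to wait until they are actually removed.
cachedCounts.unpersist(true)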
Next, use the serialized storage level MEMORY_ONLY_SER:
package com.ruozedata.spark.homework

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.storage.StorageLevel

object DataSerialization {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf()
      .setAppName("someTestApp").setMaster("local[2]")
      // .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    val sc = new SparkContext(sparkConf)

    val lines = sc.textFile("/Users/Aaron/Downloads/bigdata/data/hello_data.txt")

    // (earlier MEMORY_ONLY runs commented out)
    // val count = lines.flatMap(_.split(",")).map((_, 1)).reduceByKey(_ + _).cache.count
    // println("------------------->" + ":" + count.toString)
    // val count1 = lines.flatMap(_.split(",")).map((_, 1)).reduceByKey(_ + _).count
    // println("------------------->" + ":" + count1.toString)

    // MEMORY_ONLY_SER: store the cached partitions as serialized byte arrays.
    val countSer = lines.flatMap(_.split(","))
      .map((_, 1)).reduceByKey(_ + _)
      .persist(StorageLevel.MEMORY_ONLY_SER)
      .count
    println("------------------->" + ":" + countSer.toString)

    // Same pipeline without persisting, for comparison.
    val countSer1 = lines.flatMap(_.split(","))
      .map((_, 1)).reduceByKey(_ + _)
      .count
    println("------------------->" + ":" + countSer1.toString)

    Thread.sleep(66666666)
    sc.stop()
  }
}
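With MEMORY_ONLY_SER, partitions that still do not fit in memory are simply not cached and get recomputed on each access. If recomputation is expensive, MEMORY_AND_DISK_SER spills the serialized partitions to local disk instead. A sketch of that variant, same pipeline with only the level changed:

val countDiskSer = lines.flatMap(_.split(","))
  .map((_, 1)).reduceByKey(_ + _)
  // Serialized in memory; partitions that do not fit are spilled to disk.
  .persist(StorageLevel.MEMORY_AND_DISK_SER)
  .count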
Next, use Kryo, but without registering any classes:
package com.ruozedata.spark.homework

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.storage.StorageLevel

object DataSerialization {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf()
      .setAppName("someTestApp").setMaster("local[2]")
      // Switch from the default Java serializer to Kryo.
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    val sc = new SparkContext(sparkConf)

    val lines = sc.textFile("/Users/Aaron/Downloads/bigdata/data/hello_data.txt")

    // (the earlier MEMORY_ONLY and MEMORY_ONLY_SER runs are commented out here)

    // MEMORY_ONLY_SER + Kryo, without registering any classes.
    val countSerKryoWithoutRegister = lines.flatMap(_.split(","))
      .map((_, 1)).reduceByKey(_ + _)
      .persist(StorageLevel.MEMORY_ONLY_SER)
      .count
    println("------------------->" + ":" + countSerKryoWithoutRegister.toString)

    // Same pipeline without persisting, for comparison.
    val countSerKryoWithoutRegister1 = lines.flatMap(_.split(","))
      .map((_, 1)).reduceByKey(_ + _)
      .count
    println("------------------->" + ":" + countSerKryoWithoutRegister1.toString)

    Thread.sleep(66666666)
    sc.stop()
  }
}
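Without registration, Kryo has to write the fully qualified class name alongside every serialized object, which eats into the size savings. To find out which classes a job actually serializes, flip on `spark.kryo.registrationRequired`: any unregistered class then fails fast with an exception naming it. A sketch of the config (`strictConf` is just an illustrative name):

val strictConf = new SparkConf()
  .setAppName("someTestApp").setMaster("local[2]")
  .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
  // Fail with an explicit error for any class serialized without registration.
  .set("spark.kryo.registrationRequired", "true")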
Finally, use Kryo with class registration:
package com.ruozedata.spark.homework

import com.esotericsoftware.kryo.Kryo
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.serializer.KryoRegistrator

object DataSerialization {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf()
      .setAppName("someTestApp").setMaster("local[2]")
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      // Wire in the custom registrator. (registerKryoClasses(Array(classOf[MyRegistrator]))
      // would merely register the registrator class itself as a serializable type;
      // spark.kryo.registrator is what makes Spark invoke registerClasses.)
      .set("spark.kryo.registrator", classOf[MyRegistrator].getName)
      // .set("spark.kryo.registrationRequired", "true")
    val sc = new SparkContext(sparkConf)

    val lines = sc.textFile("/Users/Aaron/Downloads/bigdata/data/hello_data.txt")

    // MEMORY_ONLY_SER + Kryo, with class registration.
    val countSerKryoWithRegister = lines.flatMap(_.split(","))
      .map((_, 1)).reduceByKey(_ + _)
      .persist(StorageLevel.MEMORY_ONLY_SER)
      .count
    println("------------------->" + ":" + countSerKryoWithRegister.toString)

    val countSerKryoWithRegister1 = lines.flatMap(_.split(","))
      .map((_, 1)).reduceByKey(_ + _)
      .count
    println("------------------->" + ":" + countSerKryoWithRegister1.toString)

    Thread.sleep(66666666)
    sc.stop()
  }

  // Registers application classes with Kryo so they are encoded as small integer
  // IDs instead of full class names. Note that this word count only caches
  // (String, Int) pairs; Qualify is registered here to demonstrate the mechanism.
  class MyRegistrator extends KryoRegistrator {
    override def registerClasses(kryo: Kryo): Unit = {
      kryo.register(classOf[Qualify])
    }
  }

  case class Qualify(s1: String, s2: String, s3: String, s4: String)
}
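For simple cases a custom KryoRegistrator is not strictly needed: `SparkConf.registerKryoClasses` registers classes directly (common Scala types such as tuples are already registered by Spark's Kryo setup). An equivalent sketch for the `Qualify` case class:

val conf = new SparkConf()
  .setAppName("someTestApp").setMaster("local[2]")
  .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
  // Register application classes directly, no KryoRegistrator required.
  .registerKryoClasses(Array(classOf[Qualify]))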
Summary

| StorageLevel | map | count | Size in Memory (KB) |
| --- | --- | --- | --- |
| raw data (no persist) | 16 | 0.3 | |
| MEMORY_ONLY | 12 | 0.2 | 3.3 |
| MEMORY_ONLY_SER | 13 | 0.3 | |
| MEMORY_ONLY_SER_KRYO | | | |
| MEMORY_ONLY_SER_KRYO_REGISTER | | | |