IntelliJ is a heavy memory consumer, and conceptually, putting jars on scala's classpath should be equivalent to importing them into an IDE. So I wondered: can a Spark program be run purely from the command line, without spark-submit? After several hours of tinkering, here is the solution.

TestRdd.scala:
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession

object TestRdd {
  // Sum all the elements of a single partition
  def sumOfEveryPartition(input: Iterator[Int]): Int = {
    var total = 0
    input.foreach { elem =>
      total += elem
    }
    total
  }

  def main(args: Array[String]): Unit = {
    // Silence Spark's logging noise
    Logger.getLogger("org").setLevel(Level.OFF)
    Logger.getLogger("akka").setLevel(Level.OFF)
    Logger.getRootLogger().setLevel(Level.ERROR)

    val spark = SparkSession.builder
      .appName("Intro")
      .config("spark.master", "local")
      .getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")

    // An RDD with 6 elements, split into 2 partitions
    val input = spark.sparkContext.parallelize(List(1, 2, 3, 4, 5, 6), 2)

    // mapPartitions hands each partition to the function as an Iterator
    // and expects an Iterator back, hence Iterator(sumOfEveryPartition(...))
    val result = input.mapPartitions(
      partition => Iterator(sumOfEveryPartition(partition)))

    result.collect().foreach {
      println(_) // prints 6 and 15
    }
    spark.stop()
  }
}
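As a quick sanity check on how the elements are split across the two partitions, here is a minimal sketch using the RDD glom() method, which gathers each partition into an Array; it assumes the same local spark session as in the code above:

val parts = spark.sparkContext
  .parallelize(List(1, 2, 3, 4, 5, 6), 2)
  .glom()    // one Array per partition
  .collect() // Array(Array(1, 2, 3), Array(4, 5, 6))
parts.foreach(p => println(p.mkString(", ")))

This makes it clear why the sums come out as 6 (= 1 + 2 + 3) and 15 (= 4 + 5 + 6).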
How to run:
scala -classpath $(echo *.jar ~/bigdata/spark-2.3.1-bin-hadoop2.7/jars/*.jar | tr ' ' ':') TestRdd.scala
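If you rerun the program often, a possible variant (a sketch using only standard scalac/scala options; SPARK_JARS is just a hypothetical shell variable introduced for readability) is to compile once and then launch the compiled class:

SPARK_JARS=$(echo ~/bigdata/spark-2.3.1-bin-hadoop2.7/jars/*.jar | tr ' ' ':')
scalac -classpath "$SPARK_JARS" TestRdd.scala
scala -classpath "$SPARK_JARS:." TestRdd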
Output:
6
15
Note: the command above puts every jar from the local Spark distribution on the classpath.