上一篇讲解了Spark源码解读之RDD构建和转换过程,上一篇的RDD操作都是transformation,也就是说结果会是一个新的RDD,并不会进行真正的计算;真正会引发Spark计算的操作是action,比如first、count、collect。
WordCounts.collect()会真正触发一个Job的执行。
调用流程:
org.apache.spark.SparkContext.runJob
org.apache.spark.scheduler.DAGScheduler.runJob
org.apache.spark.scheduler.DAGScheduler.submitJob
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.post
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.run
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive
![](/qrcode.jpg)
org.apache.spark.scheduler.DAGScheduler.handleJobSubmitted
RDD的collect源码如下:
/**
 * Returns an array holding every element of this RDD.
 *
 * A job is run through `sc.runJob` that turns each partition's iterator into an array; the
 * per-partition arrays are then concatenated into the final result.
 */
def collect(): Array[T] = withScope {
  val perPartition = sc.runJob(this, (iter: Iterator[T]) => iter.toArray)
  Array.concat(perPartition: _*)
}
内部直接调用SparkContext的runJob方法,SparkContext中的runJob方法进行了重载,这儿会从最开始的一个runJob方法调用到最后的一个runJob方法,源码如下:
/**
 * Runs `func` on the requested partitions of `rdd`, handing every per-partition result to
 * `resultHandler`.
 *
 * For the `collect()` call traced here, the arguments are:
 *  - rdd: the ShuffledRDD produced by the last transformation
 *  - func: `(ctx: TaskContext, it: Iterator[T]) => cleanedFunc(it)`,
 *    where `val cleanedFunc = clean(func)`
 *  - partitions: `0 until rdd.partitions.length`
 *  - resultHandler: `(index, res) => results(index) = res`,
 *    where `val results = new Array[U](partitions.size)`
 *
 * @throws IllegalStateException if this context has already been stopped
 */
def runJob[T, U: ClassTag](
    rdd: RDD[T],
    func: (TaskContext, Iterator[T]) => U,
    partitions: Seq[Int],
    resultHandler: (Int, U) => Unit): Unit = {
  // Refuse to run anything on a context that has been stopped.
  if (stopped.get()) {
    throw new IllegalStateException("SparkContext has been shutdown")
  }

  val site = getCallSite
  val closure = clean(func)
  logInfo("Starting job: " + site.shortForm)

  // Optionally dump the RDD lineage before the job starts.
  val lineageLoggingEnabled = conf.getBoolean("spark.logLineage", false)
  if (lineageLoggingEnabled) {
    logInfo("RDD's recursive dependencies:\n" + rdd.toDebugString)
  }

  // Hand the job over to the DAGScheduler's runJob.
  dagScheduler.runJob(rdd, closure, partitions, site, resultHandler, localProperties.get)
  progressBar.foreach(_.finishAll())

  // Checkpoint the RDD once the job call has returned.
  rdd.doCheckpoint()
}
DAGScheduler中的runJob方法,该方法中通过submitJob方法进行任务的提交,具体源码如下:
/**
 * Submits a job through `submitJob` and waits on the returned waiter for its outcome.
 *
 * On success the elapsed time is logged. On failure the elapsed time is logged, the caller's
 * stack trace is appended to the exception's own trace, and the exception is rethrown.
 */
def runJob[T, U](
    rdd: RDD[T],
    func: (TaskContext, Iterator[T]) => U,
    partitions: Seq[Int],
    callSite: CallSite,
    resultHandler: (Int, U) => Unit,
    properties: Properties): Unit = {
  val startedAt = System.nanoTime
  // Seconds elapsed since `startedAt`, evaluated at the point of use.
  def elapsedSeconds: Double = (System.nanoTime - startedAt) / 1e9

  // submitJob returns a JobWaiter; resultHandler is passed along so results can be delivered
  // through it.
  val waiter = submitJob(rdd, func, partitions, callSite, resultHandler, properties)

  waiter.awaitResult() match {
    case JobSucceeded =>
      // Success.
      logInfo(
        "Job %d finished: %s, took %f s".format(waiter.jobId, callSite.shortForm, elapsedSeconds))

    case JobFailed(exception: Exception) =>
      // Failure.
      logInfo(
        "Job %d failed: %s, took %f s".format(waiter.jobId, callSite.shortForm, elapsedSeconds))
      // SPARK-8644: Include user stack trace in exceptions coming from DAGScheduler.
      val callerFrames = Thread.currentThread().getStackTrace.tail
      exception.setStackTrace(exception.getStackTrace ++ callerFrames)
      throw exception
  }
}
submitJob方法源码如下:
/**
 * Validates the requested partitions, allocates a job id, and posts a `JobSubmitted` event to the
 * scheduler's event loop.
 *
 * @return the `JobWaiter` for the job; when `partitions` is empty a waiter for 0 tasks is returned
 *         right away and nothing is posted
 * @throws IllegalArgumentException if any requested partition index is negative or not below
 *                                  `rdd.partitions.length`
 */
def submitJob[T, U](
    rdd: RDD[T],
    func: (TaskContext, Iterator[T]) => U,
    partitions: Seq[Int],
    callSite: CallSite,
    resultHandler: (Int, U) => Unit,
    properties: Properties): JobWaiter[U] = {
  // Check to make sure we are not launching a task on a partition that does not exist.
  val partitionCount = rdd.partitions.length
  partitions.find(idx => idx < 0 || idx >= partitionCount).foreach { idx =>
    throw new IllegalArgumentException(
      "Attempting to access a non-existent partition: " + idx + ". " +
        "Total number of partitions: " + partitionCount)
  }

  // Unique id for this job.
  val jobId = nextJobId.getAndIncrement()

  if (partitions.isEmpty) {
    // Return immediately if the job is running 0 tasks
    new JobWaiter[U](this, jobId, 0, resultHandler)
  } else {
    assert(partitions.size > 0)
    val erasedFunc = func.asInstanceOf[(TaskContext, Iterator[_]) => _]
    // The waiter is what the caller waits on for the job's outcome.
    val waiter = new JobWaiter(this, jobId, partitions.size, resultHandler)
    // eventProcessLoop is the DAGSchedulerEventProcessLoop instance: post() puts the JobSubmitted
    // event on its queue, and its event thread picks the event up and handles it.
    eventProcessLoop.post(JobSubmitted(
      jobId, rdd, erasedFunc, partitions.toArray, callSite, waiter,
      SerializationUtils.clone(properties)))
    waiter
  }
}
DAGSchedulerEventProcessLoop继承EventLoop,先来看下EventLoop类源码:
/**
 * An event loop that accepts events from callers and handles all of them on one dedicated event
 * thread that it creates itself.
 *
 * Note: The event queue will grow indefinitely. So subclasses should make sure `onReceive` can
 * handle events in time to avoid the potential OOM.
 */
private[spark] abstract class EventLoop[E](name: String) extends Logging {

  private val eventQueue: BlockingQueue[E] = new LinkedBlockingDeque[E]()

  private val stopped = new AtomicBoolean(false)

  private val eventThread = new Thread(name) {
    setDaemon(true)

    override def run(): Unit = {
      try {
        while (!stopped.get()) {
          // take() waits until an event is available.
          val next = eventQueue.take()
          try {
            onReceive(next) // abstract method, implemented by subclasses
          } catch {
            case NonFatal(failure) =>
              // Report the failure through onError (not shown in this excerpt); if that handler
              // fails as well, log it so the loop keeps going.
              try {
                onError(failure)
              } catch {
                case NonFatal(handlerFailure) =>
                  logError("Unexpected error in " + name, handlerFailure)
              }
          }
        }
      } catch {
        case _: InterruptedException => // exit even if eventQueue is not empty
        case NonFatal(unexpected) => logError("Unexpected error in " + name, unexpected)
      }
    }
  }

  // Some methods are omitted from this excerpt.

  /**
   * Put the event into the event queue. The event thread will process it later.
   */
  def post(event: E): Unit = {
    eventQueue.put(event)
  }

  /**
   * Invoked in the event thread when polling events from the event queue.
   *
   * Note: Should avoid calling blocking actions in `onReceive`, or the event thread will be blocked
   * and cannot process events in time. If you want to call some blocking actions, run them in
   * another thread.
   */
  protected def onReceive(event: E): Unit
}
从EventLoop类的源码可以看出来EventLoop中包含了一个eventThread的后台线程,该线程的作用是从队列中取出事件,然后对其调用onReceive方法。此外,post方法是将事件提交到队列中。由于onReceive是一个抽象方法,具体实现需要看其子类,下面是DAGSchedulerEventProcessLoop的部分源码:
/**
 * Event loop of the DAG scheduler: each `DAGSchedulerEvent` it receives is routed to the matching
 * handler method on `dagScheduler`. (Only part of the class is shown in this excerpt.)
 */
private[scheduler] class DAGSchedulerEventProcessLoop(dagScheduler: DAGScheduler)
  extends EventLoop[DAGSchedulerEvent]("dag-scheduler-event-loop") with Logging {

  private[this] val timer = dagScheduler.metricsSource.messageProcessingTimer

  /**
   * The main event loop of the DAG scheduler.
   *
   * After an event has been submitted through `post`, the event thread calls this method to
   * handle it. The timer context is stopped whether or not the handler throws.
   */
  override def onReceive(event: DAGSchedulerEvent): Unit = {
    val timing = timer.time()
    try doOnReceive(event)
    finally timing.stop()
  }

  /** Dispatches one event to the corresponding `dagScheduler` handler. */
  private def doOnReceive(event: DAGSchedulerEvent): Unit = event match {
    case JobSubmitted(id, finalRdd, fn, parts, site, jobListener, props) =>
      dagScheduler.handleJobSubmitted(id, finalRdd, fn, parts, site, jobListener, props)

    case MapStageSubmitted(id, shuffleDep, site, jobListener, props) =>
      dagScheduler.handleMapStageSubmitted(id, shuffleDep, site, jobListener, props)

    case StageCancelled(stage) =>
      dagScheduler.handleStageCancellation(stage)

    case JobCancelled(id) =>
      dagScheduler.handleJobCancellation(id)

    case JobGroupCancelled(group) =>
      dagScheduler.handleJobGroupCancelled(group)

    case AllJobsCancelled =>
      dagScheduler.doCancelAllJobs()

    case ExecutorAdded(executor, hostName) =>
      dagScheduler.handleExecutorAdded(executor, hostName)

    case ExecutorLost(executor) =>
      dagScheduler.handleExecutorLost(executor, fetchFailed = false)

    case BeginEvent(startedTask, info) =>
      dagScheduler.handleBeginEvent(startedTask, info)

    case GettingResultEvent(info) =>
      dagScheduler.handleGetTaskResult(info)

    // The whole event is forwarded, so none of its fields need to be bound here.
    case completion @ CompletionEvent(_, _, _, _, _, _) =>
      dagScheduler.handleTaskCompletion(completion)

    case TaskSetFailed(failedSet, why, cause) =>
      dagScheduler.handleTaskSetFailed(failedSet, why, cause)

    case ResubmitFailedStages =>
      dagScheduler.resubmitFailedStages()
  }
}
从上面的分析可以看出来,Spark Job的提交最后会由名为“dag-scheduler-event-loop”的后台线程去处理,该线程调用dagScheduler.handleJobSubmitted完成Job的最终提交。下一节将讲解Job如何划分Stage。