Spark 2.11-2.3 Source Code (8): DAGScheduler

Calling an action enters dagScheduler.runJob, which then calls submitJob; submitJob posts a JobSubmitted event via eventProcessLoop.post(JobSubmitted(...)), and the handler for JobSubmitted invokes dagScheduler.handleJobSubmitted.
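A condensed, paraphrased sketch of that call chain (abridged from the Spark 2.3 sources; signatures and error handling are omitted with "..."):

    // DAGScheduler.runJob: submit the job, then block until it finishes
    def runJob[T, U](...): Unit = {
      val waiter = submitJob(rdd, func, partitions, callSite, resultHandler, properties)
      // ... wait on waiter.completionFuture ...
    }

    // DAGScheduler.submitJob: post a JobSubmitted event to the event loop
    def submitJob[T, U](...): JobWaiter[U] = {
      val jobId = nextJobId.getAndIncrement()
      val waiter = new JobWaiter(this, jobId, partitions.size, resultHandler)
      eventProcessLoop.post(JobSubmitted(
        jobId, rdd, func2, partitions.toArray, callSite, waiter, properties))
      waiter
    }

    // DAGSchedulerEventProcessLoop.doOnReceive: dispatch the event
    case JobSubmitted(jobId, rdd, func, partitions, callSite, listener, properties) =>
      dagScheduler.handleJobSubmitted(jobId, rdd, func, partitions, callSite, listener, properties)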

handleJobSubmitted

// The core entry point of DAGScheduler's job scheduling
    private[scheduler] def handleJobSubmitted(jobId: Int,
          finalRDD: RDD[_],
          func: (TaskContext, Iterator[_]) => _,
          partitions: Array[Int],
          callSite: CallSite,
          listener: JobListener,
          properties: Properties) {
          // Create the finalStage from the last RDD that triggered the job
        var finalStage: ResultStage = null
        try {
          // New stage creation may throw an exception if, for example, jobs are run on a
          // HadoopRDD whose underlying HDFS files have been deleted.
          // Previously this was newStage, now createResultStage: **create a Stage object** and add it to DAGScheduler's internal in-memory cache
          finalStage = createResultStage(finalRDD, func, partitions, jobId, callSite)
        } catch {
          case e: Exception =>
            logWarning("Creating new stage failed due to exception - job: " + jobId, e)
            listener.jobFailed(e)
            return
        }
    
        // **Create a Job using the finalStage**; in other words, this job's last stage is, of course, our finalStage
        val job = new ActiveJob(jobId, finalStage, callSite, listener, properties)
        clearCacheLocs()
        logInfo("Got job %s (%s) with %d output partitions".format(
          job.jobId, callSite.shortForm, partitions.length))
        logInfo("Final stage: " + finalStage + " (" + finalStage.name + ")")
        logInfo("Parents of final stage: " + finalStage.parents)
        logInfo("Missing parents: " + getMissingParentStages(finalStage))
    
        // **Add the job to the in-memory cache**
        val jobSubmissionTime = clock.getTimeMillis()
        jobIdToActiveJob(jobId) = job
        activeJobs += job
        finalStage.setActiveJob(job)
        val stageIds = jobIdToStageIds(jobId).toArray
        val stageInfos = stageIds.flatMap(id => stageIdToStage.get(id).map(_.latestInfo))
        listenerBus.post(
          SparkListenerJobStart(job.jobId, jobSubmissionTime, stageInfos, properties))
        // **submitStage submits the finalStage, which causes the first stage to be submitted and the remaining stages to be placed in the waiting queue**
        submitStage(finalStage)
      }

submitStage

Stage partitioning algorithm: 1. work backwards from the finalStage; 2. split off a new stage at every wide (shuffle) dependency; 3. use recursion so that parent stages are submitted first.
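For example (a hypothetical driver program, not from the source): the reduceByKey below introduces a ShuffleDependency, so the job splits into a ShuffleMapStage (textFile/flatMap/map) and the final ResultStage (reduceByKey/collect); submitStage recursively submits the parent ShuffleMapStage first and parks the ResultStage in waitingStages.

    val counts = sc.textFile("hdfs://.../words.txt")   // hypothetical input path
      .flatMap(_.split(" "))
      .map(word => (word, 1))
      .reduceByKey(_ + _)   // wide dependency: stage boundary
    counts.collect()        // action: triggers handleJobSubmitted -> submitStage(finalStage)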

// This is the entry point of the stage partitioning algorithm; the algorithm itself is actually made up of submitStage() together with getMissingParentStages()
private def submitStage(stage: Stage) {
    val jobId = activeJobForStage(stage)
    if (jobId.isDefined) {
      logDebug("submitStage(" + stage + ")")
      if (!waitingStages(stage) && !runningStages(stage) && !failedStages(stage)) {
         // Call getMissingParentStages() to get this stage's parent stages
        val missing = getMissingParentStages(stage).sortBy(_.id)
        logDebug("missing: " + missing)
        // Recurse until the earliest stage has no parent stages; at that point that first stage (stage 0) is submitted, while all the other stages are sitting in waitingStages
        if (missing.isEmpty) {
          logInfo("Submitting " + stage + " (" + stage.rdd + "), which has no missing parents")
          submitMissingTasks(stage, jobId.get)
        } else {
            // Recursively call submitStage() to submit the parent stages (important)
          for (parent <- missing) {
            submitStage(parent)
          }
          // Put the current stage into waitingStages, the queue of stages waiting to run
          waitingStages += stage
        }
      }
    } else {
      abortStage(stage, "No active job for stage " + stage.id, None)
    }
  }

getMissingParentStages

 // Get a stage's missing parent stages, working backwards from the final stage
 // For a stage, if all of the dependencies of its last RDD are narrow, no new stage is created; but as soon as an RDD in this stage has a wide (shuffle) dependency on some RDD, a new stage is created from that shuffled-on RDD and returned immediately
 private def getMissingParentStages(stage: Stage): List[Stage] = {
    val missing = new HashSet[Stage]
    val visited = new HashSet[RDD[_]]
    // We are manually maintaining a stack here to prevent StackOverflowError
    // caused by recursively visiting
    val waitingForVisit = new ArrayStack[RDD[_]]
    def visit(rdd: RDD[_]) {
      if (!visited(rdd)) {
        visited += rdd
        // Nil is scala.collection.immutable.Nil, i.e. List[Nothing], an empty List
        val rddHasUncachedPartitions = getCacheLocs(rdd).contains(Nil)
        // First check whether any partition is uncached; if every partition is cached, there is no need to compute parent stages
        if (rddHasUncachedPartitions) {
          // Iterate over the RDD's dependencies
          for (dep <- rdd.dependencies) {            
            dep match {
              // Wide (shuffle) dependency
              case shufDep: ShuffleDependency[_, _, _] =>
              // Use the RDD of the wide dependency to create a ShuffleMapStage (in older versions this set isShuffleMap to true)
              // By default the last stage is not a shuffle-map stage, but every stage before the finalStage is a ShuffleMapStage
                val mapStage = getOrCreateShuffleMapStage(shufDep, stage.firstJobId)
                if (!mapStage.isAvailable) {
                  missing += mapStage
                }
              // Narrow dependency: push the dependency's RDD onto the stack
              case narrowDep: NarrowDependency[_] =>
                waitingForVisit.push(narrowDep.rdd)
            }
          }
        }
      }
    }
    // First push the stage's last RDD onto the stack
    waitingForVisit.push(stage.rdd)
    // Then loop
    while (waitingForVisit.nonEmpty) {
    // Call the locally defined visit() method on each RDD popped from the stack
      visit(waitingForVisit.pop())
    }
    missing.toList
  }

getOrCreateShuffleMapStage

createShuffleMapStage(dep, firstJobId)
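A paraphrased sketch of getOrCreateShuffleMapStage (abridged from the Spark 2.3 sources): it reuses an existing ShuffleMapStage for the shuffle id if one is already registered; otherwise it first creates stages for any missing ancestor shuffle dependencies, then creates the stage for this dependency.

    private def getOrCreateShuffleMapStage(
        shuffleDep: ShuffleDependency[_, _, _],
        firstJobId: Int): ShuffleMapStage = {
      shuffleIdToMapStage.get(shuffleDep.shuffleId) match {
        case Some(stage) => stage    // a stage is already registered for this shuffle
        case None =>
          // Create stages for any missing ancestor shuffle dependencies first
          getMissingAncestorShuffleDependencies(shuffleDep.rdd).foreach { dep =>
            if (!shuffleIdToMapStage.contains(dep.shuffleId)) {
              createShuffleMapStage(dep, firstJobId)
            }
          }
          // Finally, create a stage for the given shuffle dependency
          createShuffleMapStage(shuffleDep, firstJobId)
      }
    }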

createShuffleMapStage

val stage = new ShuffleMapStage(
      id, rdd, numTasks, parents, jobId, rdd.creationSite, shuffleDep, mapOutputTracker)

ShuffleMapStage

// The shuffle (wide) dependency from which this stage was created
 val shuffleDep: ShuffleDependency[_, _, _],

submitMissingTasks (stage submission handling)

Creates a batch of tasks for the stage; the number of tasks equals the number of partitions to compute.

private def submitMissingTasks(stage: Stage, jobId: Int) {
logDebug("submitMissingTasks(" + stage + ")")

// First figure out the indexes of partition ids to compute.
// Determine the partitions to compute, i.e. the number of tasks to create
val partitionsToCompute: Seq[Int] = stage.findMissingPartitions()

// Use the scheduling pool, job group, description, etc. from an ActiveJob associated
// with this Stage
val properties = jobIdToActiveJob(jobId).properties
// Add the stage to the set of running stages
runningStages += stage

  stage match {
      case s: ShuffleMapStage =>
        outputCommitCoordinator.stageStart(stage = s.id, maxPartitionId = s.numPartitions - 1)
      case s: ResultStage =>
        outputCommitCoordinator.stageStart(
          stage = s.id, maxPartitionId = s.rdd.partitions.length - 1)
    }

 val taskIdToLocations: Map[Int, Seq[TaskLocation]] = try {
  stage match {
    case s: ShuffleMapStage =>
      partitionsToCompute.map { id => (id, getPreferredLocs(stage.rdd, id))}.toMap
    case s: ResultStage =>
      partitionsToCompute.map { id =>
        val p = s.partitions(id)
        (id, getPreferredLocs(stage.rdd, p))
      }.toMap
  }
} catch {
  case NonFatal(e) =>
    stage.makeNewStageAttempt(partitionsToCompute.size)
    listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))
    abortStage(stage, s"Task creation failed: $e\n${Utils.exceptionString(e)}", Some(e))
    runningStages -= stage
    return
}
    // Create the required number of tasks for the stage; the key point is the best-location (preferred location) computation for each task
        val tasks: Seq[Task[_]] = try {
          val serializedTaskMetrics = closureSerializer.serialize(stage.latestInfo.taskMetrics).array()
          stage match {
            // Match ShuffleMapStage
            case stage: ShuffleMapStage =>
              stage.pendingPartitions.clear()
              partitionsToCompute.map { id =>
              // Create one task per partition and compute each task's best location
                val locs = taskIdToLocations(id)
                val part = partitions(id)
                stage.pendingPartitions += id
                // Every stage other than the ResultStage is a ShuffleMapStage (isShuffleMap was true in older versions), so ShuffleMapTasks are created
                new ShuffleMapTask(stage.id, stage.latestInfo.attemptNumber,
                  taskBinary, part, locs, properties, serializedTaskMetrics, Option(jobId),
                  Option(sc.applicationId), sc.applicationAttemptId)
              }
        // Match ResultStage
        case stage: ResultStage =>
          partitionsToCompute.map { id =>
            val p: Int = stage.partitions(id)
            val part = partitions(p)
            val locs = taskIdToLocations(id)
            new ResultTask(stage.id, stage.latestInfo.attemptNumber,
              taskBinary, part, locs, id, properties, serializedTaskMetrics,
              Option(jobId), Option(sc.applicationId), sc.applicationAttemptId)
          }
      }
    } catch {
      case NonFatal(e) =>
        abortStage(stage, s"Task creation failed: $e\n${Utils.exceptionString(e)}", Some(e))
        runningStages -= stage
        return
    }
         if (tasks.size > 0) {
      logInfo(s"Submitting ${tasks.size} missing tasks from $stage (${stage.rdd}) (first 15 " +
        s"tasks are for partitions ${tasks.take(15).map(_.partitionId)})")
        // Finally, wrap the stage's tasks in a TaskSet and call TaskScheduler.submitTasks() to submit the TaskSet
      taskScheduler.submitTasks(new TaskSet(
        tasks.toArray, stage.id, stage.latestInfo.attemptNumber, jobId, properties))
    } else {
      // Because we posted SparkListenerStageSubmitted earlier, we should mark
      // the stage as completed here in case there are no tasks to run
      markStageAsFinished(stage, None)

      val debugString = stage match {
        case stage: ShuffleMapStage =>
          s"Stage ${stage} is actually done; " +
            s"(available: ${stage.isAvailable}," +
            s"available outputs: ${stage.numAvailableOutputs}," +
            s"partitions: ${stage.numPartitions})"
        case stage : ResultStage =>
          s"Stage ${stage} is actually done; (partitions: ${stage.numPartitions})"
      }
      logDebug(debugString)

      submitWaitingChildStages(stage)
    }
  }

taskIdToLocations is computed by calling getPreferredLocsInternal, which determines each task's best (preferred) location.
Starting from the stage's last RDD, it looks for an RDD whose partition has been cached or checkpointed; if found, that cached/checkpointed partition's location is the task's best location, because the task can then run on that node without recomputing the earlier RDDs.

   private def getPreferredLocsInternal(
          rdd: RDD[_],
          partition: Int,
          visited: HashSet[(RDD[_], Int)]): Seq[TaskLocation] = {
        // If the partition has already been visited, no need to re-visit.
        // This avoids exponential path exploration.  SPARK-695
        if (!visited.add((rdd, partition))) {
          // Nil has already been returned for previously visited partitions.
          return Nil
        }
        // If the partition is cached, return the cache locations
        // Check whether the current RDD's partition is cached
        val cached = getCacheLocs(rdd)(partition)
        if (cached.nonEmpty) {
          return cached
        }
        // If the RDD has some placement preferences (as is the case for input RDDs), get those
        // Check whether the RDD has placement preferences for this partition (as input RDDs such as HadoopRDD do)
        val rddPrefs = rdd.preferredLocations(rdd.partitions(partition)).toList
        if (rddPrefs.nonEmpty) {
          return rddPrefs.map(TaskLocation(_))
        }
    
        // If the RDD has narrow dependencies, pick the first partition of the first narrow dependency
        // that has any placement preferences. Ideally we would choose based on transfer sizes,
        // but this will do for now.
        // Recursively call this method on the RDD's parent RDDs to check whether the corresponding partitions are cached or checkpointed
        rdd.dependencies.foreach {
          case n: NarrowDependency[_] =>
            for (inPart <- n.getParents(partition)) {
              val locs = getPreferredLocsInternal(n.rdd, inPart, visited)
              if (locs != Nil) {
                return locs
              }
            }
    
          case _ =>
        }
    // If, from the last RDD back to the first RDD of this stage, no partition is cached or checkpointed, then the task has no preferred location (Nil); placement is decided later by the TaskScheduler
        Nil
      }
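As an illustration of this effect (a hypothetical user-level snippet, not from the source): once an intermediate RDD is cached and materialized, getCacheLocs returns its block locations, so tasks in downstream stages prefer the executors holding those blocks.

    val base = sc.textFile("hdfs://.../input").map(line => (line, line.length))  // hypothetical path
    base.cache()
    base.count()                        // action: materializes the cached blocks on executors
    base.reduceByKey(_ + _).collect()   // the new stage's tasks prefer the executors holding base's cached blocks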

Reposted from blog.csdn.net/u011607686/article/details/86560643