SparkContext 初始化流程
SparkContext 是通往Spark 集群的唯一入口,可以用来在 Spark 集群中创建 RDD,累加
器和广播变量,并对RDD执行各种算子。SparkContext的核心作用是初始化Spark程序运行所需要的组件。
SparkContext初始化过程会构建三大核心对象:
- DAGScheduler:DAGScheduler是面向Job的Stage高层调度器
- TaskScheduler:TaskScheduler是一个接口,是底层调度器,根据具体的ClusterManager的不同会有不同的实现
- SchedulerBackend:SchedulerBackend是一个接口,根据具体的Cluster Manager的不同会有不同的实现。
源代码分析
创建TaskScheduler位于SparkContext主构造方法中
...
val (sched, ts) = SparkContext.createTaskScheduler(this, master, deployMode)
// yarn cluster 模式下返回 YarnClusterSchedulerBackend YarnClusterScheduler
_schedulerBackend = sched
_taskScheduler = ts
_dagScheduler = new DAGScheduler(this)
...
// 启动taskScheduler
_taskScheduler.start()
...
createTaskScheduler
private def createTaskScheduler(
sc: SparkContext,
master: String,
deployMode: String): (SchedulerBackend, TaskScheduler) = {
...
case masterUrl =>
val cm = getClusterManager(masterUrl) match {
// yarn clsuter模式实际调用的是YarnClusterManager
case Some(clusterMgr) => clusterMgr
case None => throw new SparkException("Could not parse Master URL: '" + master + "'")
}
try {
//创建TaskScheduler
val scheduler = cm.createTaskScheduler(sc, masterUrl)
//创建SchedulerBackend
val backend = cm.createSchedulerBackend(sc, masterUrl, scheduler)
cm.initialize(scheduler, backend)
(backend, scheduler)
} catch {
case se: SparkException => throw se
case NonFatal(e) =>
throw new SparkException("External scheduler cannot be instantiated", e)
}
...
}
创建TaskScheduler时会根据masterUrl进行模式匹配,Yarn Cluster模式下的ClusterManager实际是YarnClusterManager。
YarnClusterManager
private[spark] class YarnClusterManager extends ExternalClusterManager {
override def canCreate(masterURL: String): Boolean = {
masterURL == "yarn"
}
override def createTaskScheduler(sc: SparkContext, masterURL: String): TaskScheduler = {
sc.deployMode match {
case "cluster" => new YarnClusterScheduler(sc)
case "client" => new YarnScheduler(sc)
case _ => throw new SparkException(s"Unknown deploy mode '${sc.deployMode}' for Yarn")
}
}
override def createSchedulerBackend(sc: SparkContext,
masterURL: String,
scheduler: TaskScheduler): SchedulerBackend = {
sc.deployMode match {
case "cluster" =>
new YarnClusterSchedulerBackend(scheduler.asInstanceOf[TaskSchedulerImpl], sc)
case "client" =>
new YarnClientSchedulerBackend(scheduler.asInstanceOf[TaskSchedulerImpl], sc)
case _ =>
throw new SparkException(s"Unknown deploy mode '${sc.deployMode}' for Yarn")
}
}
override def initialize(scheduler: TaskScheduler, backend: SchedulerBackend): Unit = {
// 调用TaskSchedulerImpl -> initialize方法
scheduler.asInstanceOf[TaskSchedulerImpl].initialize(backend)
}
}
TaskSchedulerImpl initialize方法
// schedulingMode默认策略是FIFO
private val schedulingModeConf = conf.get(SCHEDULER_MODE_PROPERTY, SchedulingMode.FIFO.toString)
val schedulingMode: SchedulingMode =
try {
SchedulingMode.withName(schedulingModeConf.toUpperCase(Locale.ROOT))
} catch {
case e: java.util.NoSuchElementException =>
throw new SparkException(s"Unrecognized $SCHEDULER_MODE_PROPERTY: $schedulingModeConf")
}
def initialize(backend: SchedulerBackend) {
this.backend = backend
schedulableBuilder = {
schedulingMode match {
case SchedulingMode.FIFO =>
new FIFOSchedulableBuilder(rootPool)
case SchedulingMode.FAIR =>
new FairSchedulableBuilder(rootPool, conf)
case _ =>
throw new IllegalArgumentException(s"Unsupported $SCHEDULER_MODE_PROPERTY: " +
s"$schedulingMode")
}
}
// 创建任务调度池,默认调度策略是FIFO
schedulableBuilder.buildPools()
}