Hadoop Yarn 3.1.0 源码分析（03 容器分配和投运）

接下来我们看容器的分配和在NM节点上的投运过程，接着上一篇：
FairScheduler.handle() -> FairScheduler.nodeUpdate() -> FairScheduler.attemptScheduling() -> queueMgr.getRootQueue().assignContainer():

/**
 * Offers {@code node} to the child queues of this queue, in policy order,
 * and stops at the first child that assigns something.
 *
 * @param node the scheduler node being offered
 * @return the value returned by the first child whose result differs from
 *         {@code Resources.none()}; otherwise the result of the last child
 *         tried, or {@code Resources.none()} if the pre-check fails or
 *         there are no children
 */
public Resource assignContainer(FSSchedulerNode node) {
    Resource result = Resources.none();

    // Only proceed when the pre-check accepts the offer (the original
    // comment describes a failure as the queue being over its limit).
    if (assignContainerPreCheck(node)) {
      // Order the child queues with the comparator of this queue's policy.
      writeLock.lock();
      try {
        Collections.sort(childQueues, policy.getComparator());
      } finally {
        writeLock.unlock();
      }

      // Walk the sorted children until one of them assigns a container.
      readLock.lock();
      try {
        for (FSQueue childQueue : childQueues) {
          result = childQueue.assignContainer(node);
          boolean nothingAssigned =
              Resources.equals(result, Resources.none());
          if (!nothingAssigned) {
            break;
          }
        }
      } finally {
        readLock.unlock();
      }
    }

    return result;
  }

FairScheduler.handle() -> FairScheduler.nodeUpdate() -> FairScheduler.attemptScheduling() -> queueMgr.getRootQueue().assignContainer() -> FSLeafQueue.assignContainer()

 /**
  * Offers {@code node} to the applications of this queue that
  * {@code fetchAppsWithDemand(true)} returns, in the order returned, and
  * stops at the first application that assigns something.
  *
  * @param node the scheduler node being offered
  * @return the first result that is not equal to {@code none()}; otherwise
  *         the result of the last application tried, or {@code none()} if
  *         the pre-check fails or no application was tried
  */
 public Resource assignContainer(FSSchedulerNode node) {
    Resource result = none();
    if (LOG.isDebugEnabled()) {
      LOG.debug("Node " + node.getNodeName() + " offered to queue: " +
          getName() + " fairShare: " + getFairShare());
    }

    if (assignContainerPreCheck(node)) {
      for (FSAppAttempt app : fetchAppsWithDemand(true)) {
        // Applications that have blacklisted this node are skipped.
        if (!SchedulerAppUtils.isPlaceBlacklisted(app, node, LOG)) {
          result = app.assignContainer(node);
          boolean somethingAssigned = !result.equals(none());
          if (somethingAssigned) {
            if (LOG.isDebugEnabled()) {
              LOG.debug("Assigned container in queue:" + getName() + " " +
                  "container:" + result);
            }
            break;
          }
        }
      }
    }
    return result;
  }

从根队列到子队列，再到叶子队列，最后由叶子队列按照调度规则选出应当被调度的应用程序实例（FSAppAttempt），由它进行容器分配。
FairScheduler.handle() -> FairScheduler.nodeUpdate() -> FairScheduler.attemptScheduling() -> queueMgr.getRootQueue().assignContainer() -> FSLeafQueue.assignContainer() -> FSAppAttempt.assignContainer():

 /**
  * Entry point for offering {@code node} to this application attempt.
  * When {@code isOverAMShareLimit()} is true nothing is assigned and the
  * AM diagnostic message is updated; otherwise the call is forwarded to
  * {@code assignContainer(node, false)}.
  *
  * @param node the scheduler node being offered
  * @return {@code Resources.none()} when over the AM share limit,
  *         otherwise the result of the non-reserved assignment
  */
 public Resource assignContainer(FSSchedulerNode node) {
    if (!isOverAMShareLimit()) {
      return assignContainer(node, false);
    }

    // Over the AM share limit: record why and assign nothing.
    PendingAsk pendingAmAsk = appSchedulingInfo.getNextPendingAsk();
    updateAMDiagnosticMsg(pendingAmAsk.getPerAllocationResource(),
        " exceeds maximum AM resource allowed).");
    if (LOG.isDebugEnabled()) {
      LOG.debug("AM resource request: "
          + pendingAmAsk.getPerAllocationResource()
          + " exceeds maximum AM resource allowed, "
          + getQueue().dumpState());
    }
    return Resources.none();
  }

 /**
  * Tries to assign one container on {@code node} for this application
  * attempt, walking the candidate scheduler keys and, for each key, trying
  * NODE_LOCAL, then RACK_LOCAL, then OFF_SWITCH placement.
  *
  * @param node     the scheduler node being offered
  * @param reserved whether the offer is for the container already reserved
  *                 on {@code node}; if so only that reservation's scheduler
  *                 key is tried
  * @return the result of the five-argument {@code assignContainer} for the
  *         first key/locality combination that is attempted, or
  *         {@code Resources.none()} if no combination qualifies
  */
 private Resource assignContainer(FSSchedulerNode node, boolean reserved) {
    if (LOG.isTraceEnabled()) {
      LOG.trace("Node offered to app: " + getName() + " reserved: " + reserved);
    }

    // A reserved offer only considers the key of the node's reserved
    // container; otherwise every scheduler key of the application is tried.
    Collection<SchedulerRequestKey> keysToTry = (reserved) ?
        Collections.singletonList(
            node.getReservedContainer().getReservedSchedulerKey()) :
        getSchedulerKeys();

    // For each priority of request in the application, check whether the
    // NODE_LOCAL, RACK_LOCAL or OFF_SWITCH locality level can be satisfied.
    // A request may be delayed when delay-scheduling parameters are set to
    // improve locality.
    //
    // The lock is taken before entering try so that the finally block can
    // never unlock a lock that was not acquired.
    writeLock.lock();
    try {
      for (SchedulerRequestKey schedulerKey : keysToTry) {

        // Non-reserved offers skip keys that have no container for this node.
        if (!reserved && !hasContainerForNode(schedulerKey, node)) {
          continue;
        }

        addSchedulingOpportunity(schedulerKey);

        PendingAsk rackLocalPendingAsk = getPendingAsk(schedulerKey,
            node.getRackName());
        PendingAsk nodeLocalPendingAsk = getPendingAsk(schedulerKey,
            node.getNodeName());

        // Only warns; the node-local request is still processed below.
        if (nodeLocalPendingAsk.getCount() > 0
            && !appSchedulingInfo.canDelayTo(schedulerKey,
            node.getNodeName())) {
          LOG.warn("Relax locality off is not supported on local request: "
              + nodeLocalPendingAsk);
        }

        // The allowed locality level is derived from elapsed time when
        // continuous scheduling is enabled, otherwise from the cluster size
        // and the node/rack locality thresholds.
        NodeType allowedLocality;
        if (scheduler.isContinuousSchedulingEnabled()) {
          allowedLocality = getAllowedLocalityLevelByTime(schedulerKey,
              scheduler.getNodeLocalityDelayMs(),
              scheduler.getRackLocalityDelayMs(),
              scheduler.getClock().getTime());
        } else {
          allowedLocality = getAllowedLocalityLevel(schedulerKey,
              scheduler.getNumClusterNodes(),
              scheduler.getNodeLocalityThreshold(),
              scheduler.getRackLocalityThreshold());
        }

        // NODE_LOCAL: pending asks exist for both this rack and this node.
        if (rackLocalPendingAsk.getCount() > 0
            && nodeLocalPendingAsk.getCount() > 0) {
          if (LOG.isTraceEnabled()) {
            LOG.trace("Assign container on " + node.getNodeName()
                + " node, assignType: NODE_LOCAL" + ", allowedLocality: "
                + allowedLocality + ", priority: " + schedulerKey.getPriority()
                + ", app attempt id: " + this.attemptId);
          }
          return assignContainer(node, nodeLocalPendingAsk, NodeType.NODE_LOCAL,
              reserved, schedulerKey);
        }

        if (!appSchedulingInfo.canDelayTo(schedulerKey, node.getRackName())) {
          continue;
        }

        // RACK_LOCAL: needs a pending ask for this rack and an allowed
        // locality of RACK_LOCAL or OFF_SWITCH.
        if (rackLocalPendingAsk.getCount() > 0
            && (allowedLocality.equals(NodeType.RACK_LOCAL) || allowedLocality
            .equals(NodeType.OFF_SWITCH))) {
          if (LOG.isTraceEnabled()) {
            LOG.trace("Assign container on " + node.getNodeName()
                + " node, assignType: RACK_LOCAL" + ", allowedLocality: "
                + allowedLocality + ", priority: " + schedulerKey.getPriority()
                + ", app attempt id: " + this.attemptId);
          }
          return assignContainer(node, rackLocalPendingAsk, NodeType.RACK_LOCAL,
              reserved, schedulerKey);
        }

        PendingAsk offswitchAsk = getPendingAsk(schedulerKey,
            ResourceRequest.ANY);
        if (!appSchedulingInfo.canDelayTo(schedulerKey, ResourceRequest.ANY)) {
          continue;
        }

        // OFF_SWITCH: needs a pending ANY ask, and either at most one unique
        // location ask or an allowed locality of OFF_SWITCH.
        if (offswitchAsk.getCount() > 0) {
          if (getAppPlacementAllocator(schedulerKey).getUniqueLocationAsks()
              <= 1 || allowedLocality.equals(NodeType.OFF_SWITCH)) {
            if (LOG.isTraceEnabled()) {
              LOG.trace("Assign container on " + node.getNodeName()
                  + " node, assignType: OFF_SWITCH" + ", allowedLocality: "
                  + allowedLocality + ", priority: "
                  + schedulerKey.getPriority()
                  + ", app attempt id: " + this.attemptId);
            }
            return assignContainer(node, offswitchAsk, NodeType.OFF_SWITCH,
                reserved, schedulerKey);
          }
        }

        if (LOG.isTraceEnabled()) {
          LOG.trace("Can't assign container on " + node.getNodeName()
              + " node, allowedLocality: " + allowedLocality + ", priority: "
              + schedulerKey.getPriority() + ", app attempt id: "
              + this.attemptId);
        }
      }
    } finally {
      writeLock.unlock();
    }

    return Resources.none();
  }

有关延迟调度（Delay Scheduling）提高本地性的分析，详见《Hadoop Yarn延迟调度分析（Delay Scheduling）》。
根据不同的本地性需求调用：

/**
 * Attempts to allocate a single container on {@code node} for the given
 * pending ask at the given locality type.
 *
 * NOTE(review): this excerpt is truncated after the debug log below; the
 * remainder of the method is not shown here.
 *
 * @param node         the scheduler node being offered
 * @param pendingAsk   the pending ask to satisfy
 * @param type         the locality type of this assignment
 * @param reserved     whether this offer is for the container reserved on
 *                     {@code node}
 * @param schedulerKey the scheduler key the ask belongs to
 * @return the per-allocation resource of the ask when a container was
 *         allocated; {@code Resources.none()} when {@code allocate}
 *         returned null
 */
private Resource assignContainer(
      FSSchedulerNode node, PendingAsk pendingAsk, NodeType type,
      boolean reserved, SchedulerRequestKey schedulerKey) {

    // Resource required by one allocation of this request
    Resource capability = pendingAsk.getPerAllocationResource();

    // Resource on the node that has not been allocated yet
    Resource available = node.getUnallocatedResource();
    // The container reserved on the node, when this is a reserved offer
    Container reservedContainer = null;
    if (reserved) {
      reservedContainer = node.getReservedContainer().getContainer();
    }

    // Proceed only if the requested resource fits in what the node has left
    if (Resources.fitsIn(capability, available)) {
      // Inform the application of the newly allocated container
      RMContainer allocatedContainer =
          allocate(type, node, schedulerKey, pendingAsk,
              reservedContainer);
      if (allocatedContainer == null) {
        // Nothing was allocated; drop the reservation if there was one
        if (reserved) {
          unreserve(schedulerKey, node);
        }
        return Resources.none();
      }
      // The reservation has been fulfilled, so release it
      if (reserved) {
        unreserve(schedulerKey, node);
      }

      // Inform the node of the newly allocated container
      node.allocateContainer(allocatedContainer);

      // No AM is running yet and the AM is not unmanaged: record this
      // capability as the AM resource, add it to the queue's AM resource
      // usage and mark the AM as running.
      if (!isAmRunning() && !getUnmanagedAM()) {
        setAMResource(capability);
        getQueue().addAMResourceUsage(capability);
        setAmRunning(true);
      }

      return capability;
    }

    // The request does not fit in the node's unallocated resource
    if (LOG.isDebugEnabled()) {
      LOG.debug("Resource request: " + capability + " exceeds the available"
          + " resources of the node.");
    }

把新分配容器通知给application是我们关注的重点：
FairScheduler.handle() -> FairScheduler.nodeUpdate() -> FairScheduler.attemptScheduling() -> queueMgr.getRootQueue().assignContainer() -> FSLeafQueue.assignContainer() -> FSAppAttempt.assignContainer() -> FSAppAttempt.allocate():

/**
 * Creates the {@code RMContainer} for a container placed on {@code node},
 * registers it with this application attempt, updates resource usage and
 * fires {@code RMContainerEventType.START} on the new container.
 *
 * @param type              the locality type the container was placed at
 * @param node              the node the container is placed on
 * @param schedulerKey      the scheduler key the request belongs to
 * @param pendingAsk        the pending ask being satisfied
 * @param reservedContainer the container to reuse, or {@code null} to
 *                          create a new one
 * @return the new {@code RMContainer}, or {@code null} when the key has no
 *         outstanding asks
 */
public RMContainer allocate(NodeType type, FSSchedulerNode node,
      SchedulerRequestKey schedulerKey, PendingAsk pendingAsk,
      Container reservedContainer) {
    RMContainer rmContainer;
    Container container;

    // The lock is taken before entering try so that the finally block can
    // never unlock a lock that was not acquired.
    writeLock.lock();
    try {
      // If the container was placed at a tighter locality than the level
      // currently allowed for this key, reset the allowed level to the
      // locality that was actually achieved.
      NodeType allowed = allowedLocalityLevel.get(schedulerKey);
      if (allowed != null) {
        if (allowed.equals(NodeType.OFF_SWITCH) && (type.equals(
            NodeType.NODE_LOCAL) || type.equals(NodeType.RACK_LOCAL))) {
          this.resetAllowedLocalityLevel(schedulerKey, type);
        } else if (allowed.equals(NodeType.RACK_LOCAL) && type.equals(
            NodeType.NODE_LOCAL)) {
          this.resetAllowedLocalityLevel(schedulerKey, type);
        }
      }

      // Nothing is outstanding for this key any more.
      if (getOutstandingAsksCount(schedulerKey) <= 0) {
        return null;
      }

      // Reuse the reserved container when there is one, else create one.
      container = reservedContainer;
      if (container == null) {
        container = createContainer(node, pendingAsk.getPerAllocationResource(),
            schedulerKey);
      }

      // Create the RMContainer, the RM-side representation of the container.
      rmContainer = new RMContainerImpl(container, schedulerKey,
          getApplicationAttemptId(), node.getNodeID(),
          appSchedulingInfo.getUser(), rmContext);
      ((RMContainerImpl) rmContainer).setQueueName(this.getQueueName());

      // Add the container to the newly-allocated list and the live set.
      addToNewlyAllocatedContainers(node, rmContainer);
      liveContainers.put(container.getId(), rmContainer);

      // Record the allocation in appSchedulingInfo and update usage.
      ContainerRequest containerRequest = appSchedulingInfo.allocate(
          type, node, schedulerKey, container);
      this.attemptResourceUsage.incUsed(container.getResource());
      getQueue().incUsedResource(container.getResource());

      // Store the container request returned above in the RMContainer.
      ((RMContainerImpl) rmContainer).setContainerRequest(containerRequest);

      // With the bookkeeping done, kick the RMContainer state machine.
      rmContainer.handle(
          new RMContainerEvent(container.getId(), RMContainerEventType.START));

      if (LOG.isDebugEnabled()) {
        LOG.debug("allocate: applicationAttemptId=" + container.getId()
            .getApplicationAttemptId() + " container=" + container.getId()
            + " host=" + container.getNodeId().getHost() + " type=" + type);
      }
      RMAuditLogger.logSuccess(getUser(), AuditConstants.ALLOC_CONTAINER,
          "SchedulerApp", getApplicationId(), container.getId(),
          container.getResource());
    } finally {
      writeLock.unlock();
    }

    return rmContainer;
  }

我们看到这个函数主要是创建了Container在RM中的表示形式RMContainer，更新了一些相关信息之后，触发了RMContainer的状态机，向其发送了RMContainerEventType.START事件。对应的状态机转移操作为：

// RMContainer state machine: a START event takes a container from NEW to
// ALLOCATED, with ContainerStartedTransition as the transition hook.
addTransition(RMContainerState.NEW, RMContainerState.ALLOCATED,
        RMContainerEventType.START, new ContainerStartedTransition())

/**
 * Transition hook registered for the START event of an RMContainer: it
 * registers the container's allocation tags and then notifies the owning
 * application attempt that a container has been allocated.
 */
private static final class ContainerStartedTransition extends
      BaseTransition {

    public void transition(RMContainerImpl container, RMContainerEvent event) {
      // Register the container and its allocation tags with the tags manager.
      container.rmContext.getAllocationTagsManager().addContainer(
          container.getNodeId(),
          container.getContainerId(),
          container.getAllocationTags());

      // Tell the application attempt that a container was allocated for it.
      RMAppAttemptEvent allocatedEvent = new RMAppAttemptEvent(
          container.appAttemptId, RMAppAttemptEventType.CONTAINER_ALLOCATED);
      container.eventHandler.handle(allocatedEvent);
    }
  }

这个状态机跳变的伴随操作向RMAppAttempt发送了RMAppAttemptEventType.CONTAINER_ALLOCATED事件。之前没有分配到容器的RMAppAttempt一直停留在RMAppAttemptState.SCHEDULED状态，现在终于有可能离开该状态继续前进了。

// RMAppAttempt state machine: on CONTAINER_ALLOCATED in state SCHEDULED,
// AMContainerAllocatedTransition picks the next state, which is either
// ALLOCATED_SAVING or SCHEDULED.
addTransition(RMAppAttemptState.SCHEDULED,
          EnumSet.of(RMAppAttemptState.ALLOCATED_SAVING,
            RMAppAttemptState.SCHEDULED),
          RMAppAttemptEventType.CONTAINER_ALLOCATED,
          new AMContainerAllocatedTransition())

可以看到这是一个多弧跳变（目标状态可能是ALLOCATED_SAVING，也可能仍是SCHEDULED），对应的跳变函数为：

/**
 * Multiple-arc transition run when an application attempt in state
 * SCHEDULED receives CONTAINER_ALLOCATED. It tries to acquire the AM
 * container from the scheduler and returns ALLOCATED_SAVING on success or
 * SCHEDULED (after scheduling a retry) when no container was returned.
 */
private static final class AMContainerAllocatedTransition
      implements
      MultipleArcTransition<RMAppAttemptImpl, RMAppAttemptEvent, RMAppAttemptState> {

    public RMAppAttemptState transition(RMAppAttemptImpl appAttempt,
        RMAppAttemptEvent event) {
      // Ask the scheduler for the container intended for the AM.
      Allocation allocation = appAttempt.scheduler.allocate(
          appAttempt.applicationAttemptId,
          EMPTY_CONTAINER_REQUEST_LIST, null,
          EMPTY_CONTAINER_RELEASE_LIST, null,
          null, new ContainerUpdates());

      // CONTAINER_ALLOCATED is fired after an RMContainer has been created,
      // so at least one container should be waiting in
      // SchedulerApplication#newlyAllocatedContainers. The scheduler's
      // allocate() still cannot guarantee that the container is pulled
      // (for example when DNS is unavailable), so both outcomes are handled.
      if (!allocation.getContainers().isEmpty()) {
        // A container was obtained: use the first one as the AM container.
        appAttempt.setMasterContainer(allocation.getContainers().get(0));

        RMContainerImpl masterRmContainer =
            (RMContainerImpl) appAttempt.scheduler.getRMContainer(
                appAttempt.getMasterContainer().getId());
        masterRmContainer.setAMContainer(true);

        appAttempt.rmContext.getNMTokenSecretManager()
            .clearNodeSetForAttempt(appAttempt.applicationAttemptId);
        appAttempt.getSubmissionContext().setResource(
            appAttempt.getMasterContainer().getResource());
        appAttempt.storeAttempt();
        return RMAppAttemptState.ALLOCATED_SAVING;
      }

      // Nothing was pulled: stay in SCHEDULED and try again later.
      appAttempt.retryFetchingAMContainer(appAttempt);
      return RMAppAttemptState.SCHEDULED;
    }
  }

在《Hadoop Yarn 3.1.0 源码分析（02 作业调度）》中我们看到，当RMAppAttempt处于SUBMITTED状态并收到ATTEMPT_ADDED事件时，会执行ScheduleTransition.transition()，进而调用appAttempt.scheduler.allocate()，试图从newlyAllocatedContainers集合中收揽已经分配的容器：若没有收揽到，则停留在SCHEDULED状态；若收揽到容器，则对应的RMContainer会收到ACQUIRED事件。此时，心跳之后在allocate()中创建的RMContainer已经因为收到RMContainerEventType.START事件而处于RMContainerState.ALLOCATED状态，于是它的状态机就能继续推进：

// RMContainer state machine: an ACQUIRED event takes a container from
// ALLOCATED to ACQUIRED, with AcquiredTransition as the transition hook.
addTransition(RMContainerState.ALLOCATED, RMContainerState.ACQUIRED,
        RMContainerEventType.ACQUIRED, new AcquiredTransition())

我们回到上面AMContainerAllocatedTransition

// Excerpt of AMContainerAllocatedTransition.transition(): the two outcomes.
if (amContainerAllocation.getContainers().size() == 0) {
        appAttempt.retryFetchingAMContainer(appAttempt);
        // No container was acquired, so the attempt stays in SCHEDULED
        return RMAppAttemptState.SCHEDULED;
}

 appAttempt.storeAttempt();

 return RMAppAttemptState.ALLOCATED_SAVING;

如果现在还是收揽不到容器，那么会新开一个线程，等待500ms后重新发送CONTAINER_ALLOCATED事件，再次尝试获取。
appAttempt.retryFetchingAMContainer():

/**
 * Starts a new thread that waits 500 ms and then re-sends
 * {@code CONTAINER_ALLOCATED} to the given application attempt, so that
 * acquiring the AM container is attempted again.
 *
 * @param appAttempt the application attempt to notify
 */
private void retryFetchingAMContainer(final RMAppAttemptImpl appAttempt) {
    Runnable resendAfterDelay = new Runnable() {
      public void run() {
        try {
          // Wait 500 ms before the next attempt.
          Thread.sleep(500);
        } catch (InterruptedException e) {
          // An interrupt only cuts the wait short; the event is still sent.
          LOG.warn("Interrupted while waiting to resend the"
              + " ContainerAllocated Event.");
        }
        // Fire the event again to trigger another attempt.
        RMAppAttemptEvent retryEvent = new RMAppAttemptEvent(
            appAttempt.applicationAttemptId,
            RMAppAttemptEventType.CONTAINER_ALLOCATED);
        appAttempt.eventHandler.handle(retryEvent);
      }
    };
    new Thread(resendAfterDelay).start();
  }

RMAppAttempt收揽到容器之后，就处于RMAppAttemptState.ALLOCATED_SAVING状态；而appAttempt.storeAttempt()是一个异步过程，结束后会给RMAppAttempt发送事件：
appAttempt.storeAttempt() -> RMStateStore.storeNewApplicationAttempt() :

/**
 * Hands a new application attempt to the state store by posting an
 * {@code RMStateStoreAppAttemptEvent} to the state-store event handler.
 *
 * @param appAttempt the application attempt to store
 */
public void storeNewApplicationAttempt(RMAppAttempt appAttempt) {
    // ... (elided in this excerpt; attemptState is defined in the omitted part)
    getRMStateStoreEventHandler().handle(
      new RMStateStoreAppAttemptEvent(attemptState));
  }
   /**
    * Creates a state-store event of type {@code STORE_APP_ATTEMPT} that
    * carries the given attempt state.
    *
    * @param attemptState the attempt state carried by this event
    */
   public RMStateStoreAppAttemptEvent(ApplicationAttemptStateData attemptState) {
    super(RMStateStoreEventType.STORE_APP_ATTEMPT);
    this.attemptState = attemptState;
  }

对应的状态机转移为：

// RMStateStore state machine: on STORE_APP_ATTEMPT in state ACTIVE,
// StoreAppAttemptTransition picks the next state, which is either ACTIVE
// or FENCED.
addTransition(RMStateStoreState.ACTIVE,
          EnumSet.of(RMStateStoreState.ACTIVE, RMStateStoreState.FENCED),
          RMStateStoreEventType.STORE_APP_ATTEMPT,
          new StoreAppAttemptTransition())

/**
 * Multiple-arc transition run by the state store for STORE_APP_ATTEMPT.
 * In the part shown here it notifies the application attempt with
 * {@code ATTEMPT_NEW_SAVED} and returns {@code finalState(isFenced)}.
 */
private static class StoreAppAttemptTransition implements
      MultipleArcTransition<RMStateStore, RMStateStoreEvent,
          RMStateStoreState> {
    public RMStateStoreState transition(RMStateStore store,
        RMStateStoreEvent event) {
       // ... (elided in this excerpt; attemptState and isFenced are
       // defined in the omitted part)
        // Notify the application attempt that its new state was saved.
        store.notifyApplicationAttempt(new RMAppAttemptEvent
               (attemptState.getAttemptId(),
               RMAppAttemptEventType.ATTEMPT_NEW_SAVED));

      return finalState(isFenced);
    };
  }

可以看到保存完AppAttempt信息后，向RMAppAttempt发送了RMAppAttemptEventType.ATTEMPT_NEW_SAVED事件：

// RMAppAttempt state machine: an ATTEMPT_NEW_SAVED event takes the attempt
// from ALLOCATED_SAVING to ALLOCATED, with AttemptStoredTransition as the
// transition hook.
addTransition(RMAppAttemptState.ALLOCATED_SAVING, 
          RMAppAttemptState.ALLOCATED,
          RMAppAttemptEventType.ATTEMPT_NEW_SAVED, new AttemptStoredTransition())

RMAppAttempt进入了RMAppAttemptState.ALLOCATED状态，状态机对应的伴随操作是：

 /**
  * Transition hook registered for ATTEMPT_NEW_SAVED: registers the client
  * token for the attempt and then launches the attempt.
  */
 private static final class AttemptStoredTransition extends BaseTransition {
    @Override
    public void transition(RMAppAttemptImpl appAttempt,
                                                    RMAppAttemptEvent event) {
      // Security / authentication related
      appAttempt.registerClientToken();
      appAttempt.launchAttempt();
    }
  }

/**
 * Records the AM launch start time and posts an
 * {@code AMLauncherEventType.LAUNCH} event for this attempt.
 */
private void launchAttempt(){
    // Remember when the launch was requested.
    launchAMStartTime = System.currentTimeMillis();

    // Ask the launcher to start the AM container.
    AMLauncherEvent launchEvent =
        new AMLauncherEvent(AMLauncherEventType.LAUNCH, this);
    eventHandler.handle(launchEvent);
  }

凡哲_Lucas

发布了14 篇原创文章 · 获赞 4 · 访问量 5680

私信关注

Hadoop Yarn 3.1.0 源码分析 （03 容器分配和投运）

猜你喜欢

Hadoop Yarn 3.1.0 源码分析（03 容器分配和投运）