思科VPP源码分析（ethernet node分析）

基本概念

核心函数

ethernet_input_init
初始化函数，主循环之前会调用。

/*
 * Initialisation for the ethernet input nodes.
 *
 * Runs ethernet_setup_node on each of the three ethernet input node
 * variants, initialises the per-ethertype L3 next-node table, and
 * pre-allocates the VLAN / QinQ lookup pools, reserving entry 0 of each
 * pool as the "invalid table" placeholder.
 *
 * Always returns 0 (no error).
 */
static clib_error_t *
ethernet_input_init (vlib_main_t * vm)
{
  ethernet_main_t *em = &ethernet_main;
  /* Only written by pool_get below; they exist to claim pool slot 0. */
  __attribute__ ((unused)) vlan_table_t *reserved_vlan_table;
  __attribute__ ((unused)) qinq_table_t *reserved_qinq_table;
  const u32 input_nodes[] = {
    ethernet_input_node.index,
    ethernet_input_type_node.index,
    ethernet_input_not_l2_node.index,
  };
  u32 n;

  /*
   * Per-node setup. According to the original author's note this installs
   * the format_buffer / unformat_buffer hooks and the packet-generator
   * support for the node -- TODO confirm against ethernet_setup_node.
   */
  for (n = 0; n < sizeof (input_nodes) / sizeof (input_nodes[0]); n++)
    ethernet_setup_node (vm, input_nodes[n]);

  /* Table used to pick the next node from the L3 protocol (ethertype). */
  next_by_ethertype_init (&em->l3_next);

  /*
   * Pools and vector used for VLAN parsing.
   * NOTE(review): vec_validate is given the index 10; presumably that makes
   * indices 0..10 valid rather than exactly 10 elements -- confirm.
   */
  vec_validate (em->main_intfs, 10);
  pool_alloc (em->vlan_pool, 10);
  pool_alloc (em->qinq_pool, 1);

  /* Entry 0 of each pool is permanently reserved for an invalid table. */
  pool_get (em->vlan_pool, reserved_vlan_table);	/* first id = 0 */
  pool_get (em->qinq_pool, reserved_qinq_table);	/* first id = 0 */

  return 0;
}

ethernet_input_inline 完成了该node业务逻辑功能

/*
 * Shared worker for the ethernet input node variants.
 *
 * For every buffer in from_frame it: parses the L2 header (parse_header),
 * looks up the main/VLAN/QinQ interface tables (eth_vlan_table_lookups),
 * resolves the receiving (sub)interface (identify_subint), rewrites the
 * buffer's RX sw_if_index, updates subinterface RX counters in batches,
 * picks the next node (determine_next_node) and enqueues the buffer.
 *
 * Buffers are first enqueued speculatively to the cached next index and
 * the choice is corrected afterwards by vlib_validate_buffer_enqueue_x1/x2.
 *
 * vm         - vlib main structure
 * node       - runtime of the node currently executing
 * from_frame - frame holding the buffer indices to process
 * variant    - which ethernet input flavour is running
 *
 * Returns the number of buffers in from_frame.
 */
static_always_inline uword
ethernet_input_inline (vlib_main_t * vm,
               vlib_node_runtime_t * node,
               vlib_frame_t * from_frame,
               ethernet_input_variant_t variant)
{
  vnet_main_t *vnm = vnet_get_main ();
  ethernet_main_t *em = &ethernet_main;
  vlib_node_runtime_t *error_node;
  u32 n_left_from, next_index, *from, *to_next;
  u32 stats_sw_if_index, stats_n_packets, stats_n_bytes;
  u32 cpu_index = os_get_cpu_number ();

  /* Every variant other than ETHERNET_INPUT_VARIANT_ETHERNET takes its
     error table from ethernet_input_node's runtime, so all variants end up
     reporting errors through ethernet_input_node. */
  if (variant != ETHERNET_INPUT_VARIANT_ETHERNET)
    error_node = vlib_node_get_runtime (vm, ethernet_input_node.index);
  else
    error_node = node;

  // Start of the frame's vector of buffer indices
  from = vlib_frame_vector_args (from_frame);
  // Number of buffers in the frame
  n_left_from = from_frame->n_vectors;

  if (node->flags & VLIB_NODE_FLAG_TRACE)
    vlib_trace_frame_buffers_only (vm, node,
                   from,
                   n_left_from,
                   sizeof (from[0]),
                   sizeof (ethernet_input_trace_t));

  // Start with the next index used last time; it is corrected per buffer
  // by the validate-enqueue calls below.
  next_index = node->cached_next_index;
  // sw_if_index currently being batched for stats; persisted across calls
  // in runtime_data[0] (written back at the end of this function).
  stats_sw_if_index = node->runtime_data[0];
  stats_n_packets = stats_n_bytes = 0;

  while (n_left_from > 0)
    {
      u32 n_left_to_next;

      // Get the frame for the next node that buffers will be written into
      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      // Dual loop: needs at least 4 input buffers because from[2] and
      // from[3] are prefetched while from[0] and from[1] are processed.
      while (n_left_from >= 4 && n_left_to_next >= 2)
    {
    // Process two buffers, prefetch the following two
      u32 bi0, bi1;
      vlib_buffer_t *b0, *b1;
      u8 next0, next1, error0, error1;
      u16 type0, orig_type0, type1, orig_type1;
      u16 outer_id0, inner_id0, outer_id1, inner_id1;
      u32 match_flags0, match_flags1;
      u32 old_sw_if_index0, new_sw_if_index0, len0, old_sw_if_index1,
        new_sw_if_index1, len1;
      vnet_hw_interface_t *hi0, *hi1;
      main_intf_t *main_intf0, *main_intf1;
      vlan_intf_t *vlan_intf0, *vlan_intf1;
      qinq_intf_t *qinq_intf0, *qinq_intf1;
      u32 is_l20, is_l21;

      /* Prefetch next iteration. */
      {
        vlib_buffer_t *b2, *b3;

        b2 = vlib_get_buffer (vm, from[2]);
        b3 = vlib_get_buffer (vm, from[3]);

        vlib_prefetch_buffer_header (b2, STORE);
        vlib_prefetch_buffer_header (b3, STORE);

        CLIB_PREFETCH (b2->data, sizeof (ethernet_header_t), LOAD);
        CLIB_PREFETCH (b3->data, sizeof (ethernet_header_t), LOAD);
      }

      // Speculatively enqueue both buffers to the current next frame
      bi0 = from[0];
      bi1 = from[1];
      to_next[0] = bi0;
      to_next[1] = bi1;
      from += 2;
      to_next += 2;
      n_left_to_next -= 2;
      n_left_from -= 2;

      b0 = vlib_get_buffer (vm, bi0);
      b1 = vlib_get_buffer (vm, bi1);

      error0 = error1 = ETHERNET_ERROR_NONE;

      /* Parse the L2 header. Per the original author's note, nested
         encapsulations are stripped as well and the buffer's current_data
         ends up pointing at the L3 header -- TODO confirm in parse_header. */
      parse_header (variant,
            b0,
            &type0,
            &orig_type0, &outer_id0, &inner_id0, &match_flags0);

      parse_header (variant,
            b1,
            &type1,
            &orig_type1, &outer_id1, &inner_id1, &match_flags1);

      old_sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX];
      old_sw_if_index1 = vnet_buffer (b1)->sw_if_index[VLIB_RX];

      eth_vlan_table_lookups (em,
                  vnm,
                  old_sw_if_index0,
                  orig_type0,
                  outer_id0,
                  inner_id0,
                  &hi0,
                  &main_intf0, &vlan_intf0, &qinq_intf0);

      eth_vlan_table_lookups (em,
                  vnm,
                  old_sw_if_index1,
                  orig_type1,
                  outer_id1,
                  inner_id1,
                  &hi1,
                  &main_intf1, &vlan_intf1, &qinq_intf1);

      identify_subint (hi0,
               b0,
               match_flags0,
               main_intf0,
               vlan_intf0,
               qinq_intf0, &new_sw_if_index0, &error0, &is_l20);

      identify_subint (hi1,
               b1,
               match_flags1,
               main_intf1,
               vlan_intf1,
               qinq_intf1, &new_sw_if_index1, &error1, &is_l21);

      // Save RX sw_if_index for later nodes
      // (the original index is kept when an error was flagged)
      vnet_buffer (b0)->sw_if_index[VLIB_RX] =
        error0 !=
        ETHERNET_ERROR_NONE ? old_sw_if_index0 : new_sw_if_index0;
      vnet_buffer (b1)->sw_if_index[VLIB_RX] =
        error1 !=
        ETHERNET_ERROR_NONE ? old_sw_if_index1 : new_sw_if_index1;

      // Check if there is a stat to take (valid and non-main sw_if_index for pkt 0 or pkt 1)
      /* Stats update: the counters are first bumped on the assumption that
         both packets belong to the interface currently being batched, and
         the assumption is undone below if it turns out to be wrong. */
      if (((new_sw_if_index0 != ~0)
           && (new_sw_if_index0 != old_sw_if_index0))
          || ((new_sw_if_index1 != ~0)
          && (new_sw_if_index1 != old_sw_if_index1)))
        {

          // Packet length measured from the start of the ethernet header
          len0 = vlib_buffer_length_in_chain (vm, b0) + b0->current_data
        - vnet_buffer (b0)->ethernet.start_of_ethernet_header;
          len1 = vlib_buffer_length_in_chain (vm, b1) + b1->current_data
        - vnet_buffer (b1)->ethernet.start_of_ethernet_header;

          stats_n_packets += 2;
          stats_n_bytes += len0 + len1;

          if (PREDICT_FALSE
          (!(new_sw_if_index0 == stats_sw_if_index
             && new_sw_if_index1 == stats_sw_if_index)))
        {
          // Mis-prediction: take the two packets back out of the batch
          // and count each qualifying packet individually.
          stats_n_packets -= 2;
          stats_n_bytes -= len0 + len1;

          if (new_sw_if_index0 != old_sw_if_index0
              && new_sw_if_index0 != ~0)
            vlib_increment_combined_counter (vnm->
                             interface_main.combined_sw_if_counters
                             +
                             VNET_INTERFACE_COUNTER_RX,
                             cpu_index,
                             new_sw_if_index0, 1,
                             len0);
          if (new_sw_if_index1 != old_sw_if_index1
              && new_sw_if_index1 != ~0)
            vlib_increment_combined_counter (vnm->
                             interface_main.combined_sw_if_counters
                             +
                             VNET_INTERFACE_COUNTER_RX,
                             cpu_index,
                             new_sw_if_index1, 1,
                             len1);

          // Only switch the batched interface when both packets agree;
          // flush what was accumulated for the previous one first.
          if (new_sw_if_index0 == new_sw_if_index1)
            {
              if (stats_n_packets > 0)
            {
              vlib_increment_combined_counter
                (vnm->interface_main.combined_sw_if_counters
                 + VNET_INTERFACE_COUNTER_RX,
                 cpu_index,
                 stats_sw_if_index,
                 stats_n_packets, stats_n_bytes);
              stats_n_packets = stats_n_bytes = 0;
            }
              stats_sw_if_index = new_sw_if_index0;
            }
        }
        }

      if (variant == ETHERNET_INPUT_VARIANT_NOT_L2)
        is_l20 = is_l21 = 0;
      // Choose the next node; per the original author's note the choice can
      // be made per protocol depending on configuration.
      determine_next_node (em, variant, is_l20, type0, b0, &error0,
                   &next0);
      determine_next_node (em, variant, is_l21, type1, b1, &error1,
                   &next1);

      b0->error = error_node->errors[error0];
      b1->error = error_node->errors[error1];

      // verify speculative enqueue
      // (corrects the next node of these two buffers where needed)
      vlib_validate_buffer_enqueue_x2 (vm, node, next_index, to_next,
                       n_left_to_next, bi0, bi1, next0,
                       next1);
    }

      // Single loop: handles the remaining buffers one at a time
      while (n_left_from > 0 && n_left_to_next > 0)
    {
      u32 bi0;
      vlib_buffer_t *b0;
      u8 error0, next0;
      u16 type0, orig_type0;
      u16 outer_id0, inner_id0;
      u32 match_flags0;
      u32 old_sw_if_index0, new_sw_if_index0, len0;
      vnet_hw_interface_t *hi0;
      main_intf_t *main_intf0;
      vlan_intf_t *vlan_intf0;
      qinq_intf_t *qinq_intf0;
      u32 is_l20;

      // Prefetch next iteration
      if (n_left_from > 1)
        {
          vlib_buffer_t *p2;

          p2 = vlib_get_buffer (vm, from[1]);
          vlib_prefetch_buffer_header (p2, STORE);
          CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, LOAD);
        }

      // Speculatively enqueue the buffer to the current next frame
      bi0 = from[0];
      to_next[0] = bi0;
      from += 1;
      to_next += 1;
      n_left_from -= 1;
      n_left_to_next -= 1;

      b0 = vlib_get_buffer (vm, bi0);

      error0 = ETHERNET_ERROR_NONE;

      parse_header (variant,
            b0,
            &type0,
            &orig_type0, &outer_id0, &inner_id0, &match_flags0);

      old_sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX];

      eth_vlan_table_lookups (em,
                  vnm,
                  old_sw_if_index0,
                  orig_type0,
                  outer_id0,
                  inner_id0,
                  &hi0,
                  &main_intf0, &vlan_intf0, &qinq_intf0);

      identify_subint (hi0,
               b0,
               match_flags0,
               main_intf0,
               vlan_intf0,
               qinq_intf0, &new_sw_if_index0, &error0, &is_l20);

      // Save RX sw_if_index for later nodes
      vnet_buffer (b0)->sw_if_index[VLIB_RX] =
        error0 !=
        ETHERNET_ERROR_NONE ? old_sw_if_index0 : new_sw_if_index0;

      // Increment subinterface stats
      // Note that interface-level counters have already been incremented
      // prior to calling this function. Thus only subinterface counters
      // are incremented here.
      //
      // Interface level counters include packets received on the main
      // interface and all subinterfaces. Subinterface level counters
      // include only those packets received on that subinterface
      // Increment stats if the subint is valid and it is not the main intf
      if ((new_sw_if_index0 != ~0)
          && (new_sw_if_index0 != old_sw_if_index0))
        {

          // Packet length measured from the start of the ethernet header
          len0 = vlib_buffer_length_in_chain (vm, b0) + b0->current_data
        - vnet_buffer (b0)->ethernet.start_of_ethernet_header;

          stats_n_packets += 1;
          stats_n_bytes += len0;

          // Batch stat increments from the same subinterface so counters
          // don't need to be incremented for every packet.
          if (PREDICT_FALSE (new_sw_if_index0 != stats_sw_if_index))
        {
          // Different subinterface: undo the speculative add, count this
          // packet on its own, flush the old batch and start a new one.
          stats_n_packets -= 1;
          stats_n_bytes -= len0;

          if (new_sw_if_index0 != ~0)
            vlib_increment_combined_counter
              (vnm->interface_main.combined_sw_if_counters
               + VNET_INTERFACE_COUNTER_RX,
               cpu_index, new_sw_if_index0, 1, len0);
          if (stats_n_packets > 0)
            {
              vlib_increment_combined_counter
            (vnm->interface_main.combined_sw_if_counters
             + VNET_INTERFACE_COUNTER_RX,
             cpu_index,
             stats_sw_if_index, stats_n_packets, stats_n_bytes);
              stats_n_packets = stats_n_bytes = 0;
            }
          stats_sw_if_index = new_sw_if_index0;
        }
        }

      if (variant == ETHERNET_INPUT_VARIANT_NOT_L2)
        is_l20 = 0;

      determine_next_node (em, variant, is_l20, type0, b0, &error0,
                   &next0);

      b0->error = error_node->errors[error0];

      // verify speculative enqueue
      vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
                       to_next, n_left_to_next,
                       bi0, next0);
    }

      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  // Increment any remaining batched stats
  // (runtime_data[0] is only written back when a batch was pending)
  if (stats_n_packets > 0)
    {
      vlib_increment_combined_counter
    (vnm->interface_main.combined_sw_if_counters
     + VNET_INTERFACE_COUNTER_RX,
     cpu_index, stats_sw_if_index, stats_n_packets, stats_n_bytes);
      node->runtime_data[0] = stats_sw_if_index;
    }

  return from_frame->n_vectors;
}

以下几个函数用于hook该node，替代determine_next_node中默认的下一跳node挑选机制
/* 默认有 ETHERNET_TYPE_IP4、ETHERNET_TYPE_IP6、ETHERNET_TYPE_MPLS_UNICAST 三种协议，用户可以自己注册更多协议，从而根据协议类型选择不同的下一跳。 */

/*
 * Register node_index as the next node for packets of the given ethertype.
 *
 * The arc is added to all three ethernet input node variants, and the
 * resulting next index is asserted to be the same on each of them.  The
 * mapping is then recorded in the L3 next-by-ethertype table and passed on
 * to l2bvi_register_input_type.
 *
 * vm         - vlib main structure
 * type       - ethertype being registered
 * node_index - graph node that should receive packets of that type
 */
void
ethernet_register_input_type (vlib_main_t * vm,
			      ethernet_type_t type, u32 node_index)
{
  ethernet_main_t *em = &ethernet_main;
  ethernet_type_info_t *type_info;
  clib_error_t *init_error;
  const u32 sibling_nodes[] = {
    ethernet_input_type_node.index,
    ethernet_input_not_l2_node.index,
  };
  u32 arc, n;

  /* Make sure ethernet_init has run; report (but do not propagate) errors. */
  init_error = vlib_call_init_function (vm, ethernet_init);
  if (init_error)
    clib_error_report (init_error);

  type_info = ethernet_get_type_info (em, type);
  type_info->node_index = node_index;
  type_info->next_index =
    vlib_node_add_next (vm, ethernet_input_node.index, node_index);

  /* The other input variants must hand out the very same next index. */
  for (n = 0; n < sizeof (sibling_nodes) / sizeof (sibling_nodes[0]); n++)
    {
      arc = vlib_node_add_next (vm, sibling_nodes[n], node_index);
      ASSERT (arc == type_info->next_index);
    }

  /* Record the L3 node for this ethertype in the next-nodes structure. */
  next_by_ethertype_register (&em->l3_next, type, type_info->next_index);

  /* Let other nodes that want an ethertype mapping register as well. */
  l2bvi_register_input_type (vm, type, node_index);
}

/* VLAN 包的下一跳判定机制：可以还原 ethernet 头部，并根据这里注册的值进行跳转。 */

/*
 * Register node_index as the L2 next node of the ethernet input nodes.
 *
 * The next index obtained on ethernet_input_node is stored in em->l2_next;
 * the same arc is added to the other two input variants and asserted to
 * land on the same index.
 *
 * vm         - vlib main structure
 * node_index - graph node that should receive L2 packets
 */
void
ethernet_register_l2_input (vlib_main_t * vm, u32 node_index)
{
  ethernet_main_t *em = &ethernet_main;
  const u32 sibling_nodes[] = {
    ethernet_input_type_node.index,
    ethernet_input_not_l2_node.index,
  };
  u32 arc, n;

  em->l2_next =
    vlib_node_add_next (vm, ethernet_input_node.index, node_index);

  /*
   * These arcs may never be used, but the next indices of all three input
   * variants have to stay aligned.
   */
  for (n = 0; n < sizeof (sibling_nodes) / sizeof (sibling_nodes[0]); n++)
    {
      arc = vlib_node_add_next (vm, sibling_nodes[n], node_index);
      ASSERT (arc == em->l2_next);
    }
}

// 调用该函数后，IPv4、IPv6、MPLS 缓存的三层下一跳都会改为这里注册的 node。

/*
 * Register a next node for L3 redirect, and enable L3 redirect.
 *
 * Sets em->redirect_l3, stores the next index obtained on
 * ethernet_input_node in em->redirect_l3_next, and points the cached
 * IPv4 / IPv6 / MPLS next indices at it.
 *
 * The same arc is added to ethernet_input_type_node and
 * ethernet_input_not_l2_node, as ethernet_register_input_type and
 * ethernet_register_l2_input do.  Leaving the not-l2 variant out would put
 * its next indices one slot behind the other two, so a later registration
 * would fail its "indices are aligned" assertion.
 *
 * vm         - vlib main structure
 * node_index - graph node that should receive redirected L3 packets
 */
void
ethernet_register_l3_redirect (vlib_main_t * vm, u32 node_index)
{
  ethernet_main_t *em = &ethernet_main;
  u32 i;

  em->redirect_l3 = 1;
  em->redirect_l3_next = vlib_node_add_next (vm,
					     ethernet_input_node.index,
					     node_index);
  /*
   * Change the cached next nodes to the redirect node
   */
  em->l3_next.input_next_ip4 = em->redirect_l3_next;
  em->l3_next.input_next_ip6 = em->redirect_l3_next;
  em->l3_next.input_next_mpls = em->redirect_l3_next;

  /*
   * Even if we never use these arcs, we have to align the next indices...
   */
  i = vlib_node_add_next (vm, ethernet_input_type_node.index, node_index);

  ASSERT (i == em->redirect_l3_next);

  i = vlib_node_add_next (vm, ethernet_input_not_l2_node.index, node_index);

  ASSERT (i == em->redirect_l3_next);
}

思科VPP源码分析（ethernet node分析）

基本概念

核心函数

猜你喜欢