As discussed in the previous post, dpvs is split into "data-plane threads" and "control-plane threads":
Data-plane threads: process packets received from the NICs, performing anti-DDoS handling and load balancing.
Control-plane threads: receive commands from the ipvsadm and ipip programs and carry out the corresponding operations.
Today we look at the data plane in detail. First, the call flow:
main -> netif_lcore_start -> netif_loop -> do_lcore_job, which runs the jobs on each lcore.
rte_eal_mp_remote_launch(netif_loop, NULL, SKIP_MASTER); — see the DPDK documentation for the full description of this function.
It runs netif_loop on the CPU cores; the last argument, SKIP_MASTER, means that every enabled logical core except the master lcore executes netif_loop.
For now, you can think of netif_loop as a thread function running on each worker core.
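To make the pattern concrete, here is a minimal, self-contained sketch of the same launch idiom. This is an illustration, not DPVS code; it assumes the classic DPDK API (SKIP_MASTER was renamed SKIP_MAIN in newer releases), and worker_loop is a hypothetical stand-in for netif_loop.

#include <stdio.h>
#include <rte_eal.h>
#include <rte_lcore.h>

/* plays the role of netif_loop: one instance runs per worker lcore */
static int worker_loop(void *arg)
{
    (void)arg;
    printf("lcore %u: entering worker loop\n", rte_lcore_id());
    /* a real worker would poll its RX queues here, forever */
    return 0;
}

int main(int argc, char **argv)
{
    if (rte_eal_init(argc, argv) < 0)
        return -1;

    /* launch worker_loop on every enabled lcore except the master */
    rte_eal_mp_remote_launch(worker_loop, NULL, SKIP_MASTER);

    /* the master lcore continues here (DPVS uses it for control-plane
     * work), then blocks until all workers return */
    rte_eal_mp_wait_lcore();
    return 0;
}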
1. Job execution on each CPU core
netif_loop first runs the netif_lcore_jobs[NETIF_LCORE_JOB_INIT] jobs, which perform one-time initialization. The three job types are:
NETIF_LCORE_JOB_LOOP: jobs that run on every iteration of the loop.
NETIF_LCORE_JOB_INIT: jobs that run exactly once, used to initialize program state.
NETIF_LCORE_JOB_SLOW: slow jobs that run once every fixed number of iterations, specified by job->skip_loops.
The netif_lcore_loop_job struct is defined as follows:
struct netif_lcore_loop_job
{
    char name[32];
    void (*func)(void *arg);
    void *data;
    enum netif_lcore_job_type type;
    uint32_t skip_loops; /* for NETIF_LCORE_JOB_SLOW type only */
#ifdef CONFIG_RECORD_BIG_LOOP
    uint32_t job_time[DPVS_MAX_LCORE];
#endif
    struct list_head list;
} __rte_cache_aligned;
name: the job's name
func: the user-registered callback
data: user data passed to the callback
type: the job type, one of NETIF_LCORE_JOB_INIT, NETIF_LCORE_JOB_LOOP, or NETIF_LCORE_JOB_SLOW
skip_loops: used only by NETIF_LCORE_JOB_SLOW jobs; the number of loop iterations to skip between runs
list: list node that links the job into the job list (see the dispatch sketch below)
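To tie type and skip_loops together, here is a simplified dispatch loop. This is an illustration only; the real netif_loop in DPVS keeps per-type job lists and extra bookkeeping, but the effect of the three types is the same as shown here.

/* run_lcore_jobs is a hypothetical illustration of the dispatch logic */
static void run_lcore_jobs(struct netif_lcore_loop_job *jobs, int njobs,
                           uint32_t *counters /* one per job, zeroed */)
{
    int i;

    /* NETIF_LCORE_JOB_INIT jobs run exactly once, before the main loop */
    for (i = 0; i < njobs; i++)
        if (jobs[i].type == NETIF_LCORE_JOB_INIT)
            jobs[i].func(jobs[i].data);

    for (;;) {
        for (i = 0; i < njobs; i++) {
            if (jobs[i].type == NETIF_LCORE_JOB_LOOP) {
                /* loop jobs run on every iteration */
                jobs[i].func(jobs[i].data);
            } else if (jobs[i].type == NETIF_LCORE_JOB_SLOW) {
                /* slow jobs run once every skip_loops iterations */
                if (++counters[i] >= jobs[i].skip_loops) {
                    jobs[i].func(jobs[i].data);
                    counters[i] = 0;
                }
            }
        }
    }
}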
2. Where is func assigned?
Searching the source in Source Insight turns up two places where a func member is assigned:
1. Around line 488 of ipv4.c:
static struct pkt_type ip4_pkt_type = {
    //.type = rte_cpu_to_be_16(ETHER_TYPE_IPv4),
    .func = ipv4_rcv,
    .port = NULL,
};
However, this is a struct pkt_type, not a netif_lcore_loop_job, so it is not where the netif_lcore_loop_job func callback is assigned (we will come back to it later).
2. The netif_lcore_init() function in netif.c contains the following code:
static void netif_lcore_init(void)
{
    int ii, res;
    lcoreid_t cid;

    timer_sched_interval_us = dpvs_timer_sched_interval_get();

    for (cid = 0; cid < DPVS_MAX_LCORE; cid++) {
        if (rte_lcore_is_enabled(cid))
            RTE_LOG(INFO, NETIF, "%s: lcore%d is enabled\n", __func__, cid);
        else
            RTE_LOG(INFO, NETIF, "%s: lcore%d is disabled\n", __func__, cid);
    }

    /* build lcore fast searching table */
    lcore_index_init();

    /* init isolate rxqueue table */
    isol_rxq_init();

    /* check and set lcore config */
    config_lcores(&worker_list);
    if ((res = check_lcore_conf(rte_lcore_count(), lcore_conf)) != EDPVS_OK)
        rte_exit(EXIT_FAILURE, "[%s] bad lcore configuration (err=%d),"
                 " exit ...\n", __func__, res);

    /* build port fast searching table */
    port_index_init();

    /* register lcore jobs */
    snprintf(netif_jobs[0].name, sizeof(netif_jobs[0].name) - 1, "%s", "recv_fwd");
    netif_jobs[0].func = lcore_job_recv_fwd; //1
    netif_jobs[0].data = NULL;
    netif_jobs[0].type = NETIF_LCORE_JOB_LOOP;
    snprintf(netif_jobs[1].name, sizeof(netif_jobs[1].name) - 1, "%s", "xmit");
    netif_jobs[1].func = lcore_job_xmit; //2
    netif_jobs[1].data = NULL;
    netif_jobs[1].type = NETIF_LCORE_JOB_LOOP;
    snprintf(netif_jobs[2].name, sizeof(netif_jobs[2].name) - 1, "%s", "timer_manage");
    netif_jobs[2].func = lcore_job_timer_manage; //3
    netif_jobs[2].data = NULL;
    netif_jobs[2].type = NETIF_LCORE_JOB_LOOP;

    for (ii = 0; ii < NETIF_JOB_COUNT; ii++) {
        res = netif_lcore_loop_job_register(&netif_jobs[ii]);
        if (res < 0) {
            rte_exit(EXIT_FAILURE,
                     "[%s] Fail to register netif lcore jobs, exiting ...\n", __func__);
            break;
        }
    }
}
The lines marked //1, //2, and //3 are where the func callbacks are assigned:
netif_jobs[0] registers lcore_job_recv_fwd as its func callback
netif_jobs[1] registers lcore_job_xmit as its func callback
netif_jobs[2] registers lcore_job_timer_manage as its func callback
lcore_job_recv_fwd(): receives packets from the DPDK ports and forwards them
lcore_job_xmit: drains the per-lcore TX queues out through the DPDK ports by calling netif_tx_burst
lcore_job_timer_manage: runs the timer scheduler rte_timer_manage, which keeps the timers firing
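Registering an additional job follows the same pattern as netif_lcore_init above. A hedged sketch (my_poll_func, my_poll_job, and my_poll_job_init are hypothetical names; netif_lcore_loop_job_register is the DPVS entry point used above):

/* a hypothetical extra loop job */
static void my_poll_func(void *arg)
{
    /* called once per netif_loop iteration on every worker lcore */
}

static struct netif_lcore_loop_job my_poll_job;

static void my_poll_job_init(void)
{
    snprintf(my_poll_job.name, sizeof(my_poll_job.name) - 1, "%s", "my_poll");
    my_poll_job.func = my_poll_func;
    my_poll_job.data = NULL;
    my_poll_job.type = NETIF_LCORE_JOB_LOOP;
    if (netif_lcore_loop_job_register(&my_poll_job) < 0)
        rte_exit(EXIT_FAILURE, "failed to register my_poll job\n");
}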
3. A detailed look at lcore_job_recv_fwd
Let's walk through lcore_job_recv_fwd:
static void lcore_job_recv_fwd(void *arg)
{
    int i, j;
    portid_t pid;
    lcoreid_t cid;
    struct netif_queue_conf *qconf;

    cid = rte_lcore_id();
    assert(LCORE_ID_ANY != cid);

    for (i = 0; i < lcore_conf[lcore2index[cid]].nports; i++) {
        pid = lcore_conf[lcore2index[cid]].pqs[i].id;
        assert(pid <= bond_pid_end);

        for (j = 0; j < lcore_conf[lcore2index[cid]].pqs[i].nrxq; j++) {
            qconf = &lcore_conf[lcore2index[cid]].pqs[i].rxqs[j];
            /* process mbufs queued on this lcore's arp ring */
            lcore_process_arp_ring(qconf, cid);
            /* call rte_eth_rx_burst to receive packets into qconf->mbufs */
            qconf->len = netif_rx_burst(pid, qconf);
            /* update per-lcore burst statistics */
            lcore_stats_burst(&lcore_stats[cid], qconf->len);
            /* process the received packets */
            lcore_process_packets(qconf, qconf->mbufs, cid, qconf->len, 0);
            kni_send2kern_loop(pid, qconf);
        }
    }
}
For each RX queue, the loop performs four steps: lcore_process_arp_ring() drains ARP mbufs that other lcores cloned onto this lcore's ring; netif_rx_burst() calls rte_eth_rx_burst to receive packets into qconf->mbufs (a sketch follows); lcore_stats_burst() updates the per-lcore packet statistics; and lcore_process_packets() does the actual packet processing. The first three are easy to follow in the source; the one worth studying closely is lcore_process_packets().
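For reference, netif_rx_burst essentially boils down to a single rte_eth_rx_burst call. A simplified sketch, assuming qconf->id holds the RX queue id and NETIF_MAX_PKT_BURST is the DPVS burst size (the real function also handles isolated RX queues):

/* receive up to NETIF_MAX_PKT_BURST packets from queue qconf->id of
 * port pid into the per-queue mbuf array */
static inline uint16_t rx_burst_sketch(portid_t pid, struct netif_queue_conf *qconf)
{
    return rte_eth_rx_burst(pid, qconf->id, qconf->mbufs, NETIF_MAX_PKT_BURST);
}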
4. lcore_process_packets: processing packets on each lcore
static void lcore_process_packets(struct netif_queue_conf *qconf, struct rte_mbuf **mbufs,
                                  lcoreid_t cid, uint16_t count, bool pkts_from_ring)
{
    int i, t;
    struct ether_hdr *eth_hdr;
    struct rte_mbuf *mbuf_copied = NULL;

    /* prefetch packets */
    for (t = 0; t < count && t < NETIF_PKT_PREFETCH_OFFSET; t++)
        rte_prefetch0(rte_pktmbuf_mtod(mbufs[t], void *));

    /* L2 filter */
    for (i = 0; i < count; i++) {
        struct rte_mbuf *mbuf = mbufs[i];
        struct netif_port *dev = netif_port_get(mbuf->port);

        if (unlikely(!dev)) {
            rte_pktmbuf_free(mbuf);
            lcore_stats[cid].dropped++;
            continue;
        }
        if (dev->type == PORT_TYPE_BOND_SLAVE) {
            dev = dev->bond->slave.master;
            mbuf->port = dev->id;
        }

        if (t < count) {
            rte_prefetch0(rte_pktmbuf_mtod(mbufs[t], void *));
            t++;
        }

        eth_hdr = rte_pktmbuf_mtod(mbuf, struct ether_hdr *);
        /* reuse mbuf.packet_type, it was RTE_PTYPE_XXX */
        mbuf->packet_type = eth_type_parse(eth_hdr, dev);

        /*
         * In NETIF_PORT_FLAG_FORWARD2KNI mode.
         * All packets received are deep copied and sent to KNI
         * for the purpose of capturing forwarding packets. Since the
         * rte_mbuf will be modified in the following procedure,
         * we should use mbuf_copy instead of rte_pktmbuf_clone.
         */
        if (dev->flag & NETIF_PORT_FLAG_FORWARD2KNI) {
            if (likely(NULL != (mbuf_copied = mbuf_copy(mbuf,
                                pktmbuf_pool[dev->socket]))))
                kni_ingress(mbuf_copied, dev, qconf);
            else
                RTE_LOG(WARNING, NETIF, "%s: Failed to copy mbuf\n",
                        __func__);
        }

        /*
         * do not drop pkt to other hosts (ETH_PKT_OTHERHOST)
         * since virtual devices may have different MAC with
         * underlying device.
         */

        /*
         * handle VLAN
         * if HW offload vlan strip, it's still need vlan module
         * to act as VLAN filter.
         */
        if (eth_hdr->ether_type == htons(ETH_P_8021Q) ||
            mbuf->ol_flags & PKT_RX_VLAN_STRIPPED) {
            if (vlan_rcv(mbuf, netif_port_get(mbuf->port)) != EDPVS_OK) {
                rte_pktmbuf_free(mbuf);
                lcore_stats[cid].dropped++;
                continue;
            }
            dev = netif_port_get(mbuf->port);
            if (unlikely(!dev)) {
                rte_pktmbuf_free(mbuf);
                lcore_stats[cid].dropped++;
                continue;
            }
            eth_hdr = rte_pktmbuf_mtod(mbuf, struct ether_hdr *);
        }

        /* handler should free mbuf */
        netif_deliver_mbuf(mbuf, eth_hdr->ether_type, dev, qconf,
                           (dev->flag & NETIF_PORT_FLAG_FORWARD2KNI) ? true : false,
                           cid, pkts_from_ring);

        lcore_stats[cid].ibytes += mbuf->pkt_len;
        lcore_stats[cid].ipackets++;
    }
}
rte_prefetch0: prefetches packet data into the cache before it is touched.
rte_pktmbuf_mtod: "A macro that points to the start of the data in the mbuf" (quoting the DPDK documentation; a small usage sketch follows this list).
If dev->flag & NETIF_PORT_FLAG_FORWARD2KNI is set, the mbuf is deep-copied and the copy is handed to kni_ingress(mbuf_copied, dev, qconf).
VLAN frames are filtered through vlan_rcv.
Finally, netif_deliver_mbuf is called; we analyze that function next.
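As promised, a tiny sketch of the mtod pattern (peek_ether_type is a hypothetical helper):

/* cast the start of the mbuf data area to an Ethernet header and read a
 * field; ether_type is in network byte order on the wire */
static uint16_t peek_ether_type(struct rte_mbuf *mbuf)
{
    struct ether_hdr *eth = rte_pktmbuf_mtod(mbuf, struct ether_hdr *);
    return rte_be_to_cpu_16(eth->ether_type);
}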
5. netif_deliver_mbuf: delivering packets
static inline int netif_deliver_mbuf(struct rte_mbuf *mbuf,
                                     uint16_t eth_type,
                                     struct netif_port *dev,
                                     struct netif_queue_conf *qconf,
                                     bool forward2kni,
                                     lcoreid_t cid,
                                     bool pkts_from_ring)
{
    struct pkt_type *pt;
    int err;
    uint16_t data_off;

    assert(mbuf->port <= NETIF_MAX_PORTS);
    assert(dev != NULL);

    pt = pkt_type_get(eth_type, dev);
    if (NULL == pt) {
        if (!forward2kni)
            kni_ingress(mbuf, dev, qconf);
        else
            rte_pktmbuf_free(mbuf);
        return EDPVS_OK;
    }

    /* clone arp pkt to every queue: copy ARP replies onto every
     * forwarding lcore's ring */
    if (pt->type == rte_cpu_to_be_16(ETHER_TYPE_ARP) && !pkts_from_ring) {
        struct rte_mempool *mbuf_pool;
        struct rte_mbuf *mbuf_clone;
        uint8_t i;
        struct arp_hdr *arp;
        unsigned socket_id;

        socket_id = rte_socket_id();
        mbuf_pool = pktmbuf_pool[socket_id];

        rte_pktmbuf_adj(mbuf, sizeof(struct ether_hdr));
        arp = rte_pktmbuf_mtod(mbuf, struct arp_hdr *);
        rte_pktmbuf_prepend(mbuf, (uint16_t)sizeof(struct ether_hdr));

        if (rte_be_to_cpu_16(arp->arp_op) == ARP_OP_REPLY) {
            for (i = 0; i < DPVS_MAX_LCORE; i++) {
                if ((i == cid) || (!is_lcore_id_fwd(i))
                        || (i == rte_get_master_lcore()))
                    continue;
                /* rte_pktmbuf_clone will not clone pkt.data, just copy pointer! */
                mbuf_clone = rte_pktmbuf_clone(mbuf, mbuf_pool);
                if (mbuf_clone) {
                    int ret = rte_ring_enqueue(arp_ring[i], mbuf_clone);
                    if (unlikely(-EDQUOT == ret)) {
                        RTE_LOG(WARNING, NETIF, "%s: arp ring of lcore %d quota exceeded\n",
                                __func__, i);
                    } else if (ret < 0) {
                        RTE_LOG(WARNING, NETIF, "%s: arp ring of lcore %d enqueue failed\n",
                                __func__, i);
                        rte_pktmbuf_free(mbuf_clone);
                    }
                }
            }
        }
    }

    mbuf->l2_len = sizeof(struct ether_hdr);
    /* Remove ether_hdr at the beginning of an mbuf */
    /* key point: rte_pktmbuf_adj strips the Ethernet header here */
    data_off = mbuf->data_off;
    if (unlikely(NULL == rte_pktmbuf_adj(mbuf, sizeof(struct ether_hdr))))
        return EDPVS_INVPKT;

    /* invoke the registered func callback */
    err = pt->func(mbuf, dev);

    /* KNI traffic: restore the header and hand the mbuf to kni_ingress */
    if (err == EDPVS_KNICONTINUE) {
        if (pkts_from_ring || forward2kni) {
            rte_pktmbuf_free(mbuf);
            return EDPVS_OK;
        }
        if (likely(NULL != rte_pktmbuf_prepend(mbuf,
                   (mbuf->data_off - data_off)))) {
            kni_ingress(mbuf, dev, qconf);
        } else {
            rte_pktmbuf_free(mbuf);
        }
    }

    return EDPVS_OK;
}
A key point: rte_pktmbuf_adj removes the Ethernet header before the protocol handler runs! The strip-and-restore pattern is sketched below.
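A sketch of that pattern in isolation (strip_then_restore is a hypothetical helper; the calls are the same ones netif_deliver_mbuf makes):

static int strip_then_restore(struct rte_mbuf *mbuf)
{
    /* remember where the data started before stripping anything */
    uint16_t data_off = mbuf->data_off;

    /* rte_pktmbuf_adj removes bytes from the front of the mbuf data;
     * it returns NULL if the mbuf is shorter than the header */
    if (rte_pktmbuf_adj(mbuf, sizeof(struct ether_hdr)) == NULL)
        return -1;

    /* ... an L3 handler now sees the packet starting at the IP header ... */

    /* rte_pktmbuf_prepend puts bytes back at the front; using the saved
     * data_off restores exactly what was stripped since it was saved */
    if (rte_pktmbuf_prepend(mbuf, mbuf->data_off - data_off) == NULL)
        return -1;

    return 0;
}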
The key step is the callback invocation:

/* invoke the func callback */
err = pt->func(mbuf, dev);

The func callback here is the one registered in ipv4.c: func = ipv4_rcv.
static struct pkt_type ip4_pkt_type = {
    //.type = rte_cpu_to_be_16(ETHER_TYPE_IPv4),
    .func = ipv4_rcv,
    .port = NULL,
};
After ipv4_rcv parses the IP header fields, it calls INET_HOOK(INET_HOOK_PRE_ROUTING, mbuf, port, NULL, ipv4_rcv_fin);
int INET_HOOK(unsigned int hook, struct rte_mbuf *mbuf,
              struct netif_port *in, struct netif_port *out,
              int (*okfn)(struct rte_mbuf *mbuf))
{
    struct list_head *hook_list;
    struct inet_hook_ops *ops;
    struct inet_hook_state state;
    int verdict = INET_ACCEPT;

    state.hook = hook;
    hook_list = &inet_hooks[hook];

#ifdef CONFIG_DPVS_IPV4_INET_HOOK
    rte_rwlock_read_lock(&inet_hook_lock);
#endif

    ops = list_entry(hook_list, struct inet_hook_ops, list);

    if (!list_empty(hook_list)) {
        verdict = INET_ACCEPT;
        list_for_each_entry_continue(ops, hook_list, list) {
repeat:
            verdict = ops->hook(ops->priv, mbuf, &state);
            if (verdict != INET_ACCEPT) {
                if (verdict == INET_REPEAT)
                    goto repeat;
                break;
            }
        }
    }

#ifdef CONFIG_DPVS_IPV4_INET_HOOK
    rte_rwlock_read_unlock(&inet_hook_lock);
#endif

    if (verdict == INET_ACCEPT || verdict == INET_STOP) {
        return okfn(mbuf);
    } else if (verdict == INET_DROP) {
        rte_pktmbuf_free(mbuf);
        return EDPVS_DROP;
    } else { /* INET_STOLEN */
        return EDPVS_OK;
    }
}
INET_HOOK walks the hook list and runs each registered hook in turn. When the final verdict is INET_ACCEPT or INET_STOP, it calls okfn(mbuf) (here, ipv4_rcv_fin); INET_DROP frees the mbuf, and INET_STOLEN means the hook took ownership of it. A minimal hook is sketched below.
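A minimal hook in this style might look as follows (my_hook is hypothetical; the signature is inferred from the ops->hook(ops->priv, mbuf, &state) call site above):

static int my_hook(void *priv, struct rte_mbuf *mbuf,
                   struct inet_hook_state *state)
{
    (void)priv;
    (void)state;

    /* INET_DROP makes INET_HOOK free the mbuf and return EDPVS_DROP */
    if (mbuf->pkt_len == 0)
        return INET_DROP;

    /* INET_ACCEPT lets the next hook, and eventually okfn, run */
    return INET_ACCEPT;
}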
6. Where the hook callbacks are registered
Hook callbacks are registered in several places in the source tree. For example, ip_vs_core.c (around line 701) registers the following:
static struct inet_hook_ops dp_vs_ops[] = {
    {
        .hook = dp_vs_in,
        .hooknum = INET_HOOK_PRE_ROUTING,
        .priority = 100,
    },
    {
        .hook = dp_vs_pre_routing,
        .hooknum = INET_HOOK_PRE_ROUTING,
        .priority = 99,
    },
};
The registered hook functions are dp_vs_in and dp_vs_pre_routing. So what are the hooknum and priority members for? The answer is in the registration path:

ipv4_register_hooks(dp_vs_ops, NELEMS(dp_vs_ops))
int ipv4_register_hooks(struct inet_hook_ops *reg, size_t n)
{
    size_t i, err;
    struct list_head *hook_list;

    assert(reg);

    /* walk every element of the inet_hook_ops array */
    for (i = 0; i < n; i++) {
        /* fail if hooknum is out of range or the hook callback is NULL */
        if (reg[i].hooknum >= INET_HOOK_NUMHOOKS || !reg[i].hook) {
            err = EDPVS_INVAL;
            goto rollback;
        }
        hook_list = &inet_hooks[reg[i].hooknum];

#ifdef CONFIG_DPVS_IPV4_INET_HOOK
        rte_rwlock_write_lock(&inet_hook_lock);
#endif
        err = __inet_register_hooks(hook_list, &reg[i]);
#ifdef CONFIG_DPVS_IPV4_INET_HOOK
        rte_rwlock_write_unlock(&inet_hook_lock);
#endif

        if (err != EDPVS_OK)
            goto rollback;
    }

    return EDPVS_OK;

rollback:
    ipv4_unregister_hooks(reg, n);
    return err;
}
hooknum selects the hook list: hook_list = &inet_hooks[reg[i].hooknum]. With the list in hand, __inet_register_hooks(hook_list, &reg[i]) is called:
static int __inet_register_hooks(struct list_head *head,
                                 struct inet_hook_ops *reg)
{
    struct inet_hook_ops *elem;

    /* check if exist */
    list_for_each_entry(elem, head, list) {
        if (elem == reg) {
            RTE_LOG(ERR, IPV4, "%s: hook already exist\n", __func__);
            return EDPVS_EXIST; /* error ? */
        }
    }

    /* find the first element with a higher priority value and insert
     * the new hook just before it */
    list_for_each_entry(elem, head, list) {
        if (reg->priority < elem->priority)
            break;
    }
    list_add(&reg->list, elem->list.prev);

    return EDPVS_OK;
}
Elements with a smaller priority value are inserted closer to the front of the list, so they are executed earlier. In dp_vs_ops above, for example, dp_vs_pre_routing (priority 99) runs before dp_vs_in (priority 100) on the INET_HOOK_PRE_ROUTING chain.
Summary:
1. How lcore jobs are registered and executed
2. How the func callbacks are registered and executed
3. How the hook functions are registered and executed