IP输出 之 分片ip_fragment、ip_do_fragment

概述

ip_fragment函数用于判断是否进行分片:在没有设置DF标记的情况下直接进入分片;如果设置了DF标记,则继续判断——如果skb未设置ignore_df(即不允许忽略DF进行本地分片),或者收到的最大分片长度(frag_max_size)大于MTU,则回复ICMP(需要分片)错误,释放skb并返回-EMSGSIZE;其余情况仍然走分片流程。

ip_do_fragment是详细的分片流程,整个过程分为快速分片和慢速分片两种,如果存在分片列表frag_list,并且通过检查,则走快速路径,复制每个分片的ip头等信息之后,发送出去;如果不存在分片列表,或者分片列表检查失败,则走慢速路径,慢速路径会根据MTU大小,对整个数据进行重新划分,分配skb,进行数据拷贝,设置ip头等信息,然后发送出去;

源码分析
 1 static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 2                unsigned int mtu,
 3                int (*output)(struct net *, struct sock *, struct sk_buff *))
 4 {
 5     struct iphdr *iph = ip_hdr(skb);
 6 
 7     /* 如果没有DF标记,则进行分片 */
 8     if ((iph->frag_off & htons(IP_DF)) == 0)
 9         return ip_do_fragment(net, sk, skb, output);
10     
11     /* 有DF标记则继续判断 */
12 
13     /* 不允许本地分片 || 分片最大长度>MTU */
14     if (unlikely(!skb->ignore_df ||
15              (IPCB(skb)->frag_max_size &&
16               IPCB(skb)->frag_max_size > mtu))) {
17         IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
18         /* ICMP错误 */
19         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
20               htonl(mtu));
21         /* 释放skb */
22         kfree_skb(skb);
23         return -EMSGSIZE;
24     }
25 
26     /* 其他情况,继续分片 */
27     return ip_do_fragment(net, sk, skb, output);
28 }
  1 int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
  2            int (*output)(struct net *, struct sock *, struct sk_buff *))
  3 {
  4     struct iphdr *iph;
  5     int ptr;
  6     struct sk_buff *skb2;
  7     unsigned int mtu, hlen, left, len, ll_rs;
  8     int offset;
  9     __be16 not_last_frag;
 10     struct rtable *rt = skb_rtable(skb);
 11     int err = 0;
 12 
 13     /* for offloaded checksums cleanup checksum before fragmentation */
 14     /* PARTIAL类型需要清除校验和 */
 15     if (skb->ip_summed == CHECKSUM_PARTIAL &&
 16         (err = skb_checksum_help(skb)))
 17         goto fail;
 18 
 19     /*
 20      *    Point into the IP datagram header.
 21      */
 22 
 23     iph = ip_hdr(skb);
 24 
 25     /* 获取mtu */
 26     mtu = ip_skb_dst_mtu(sk, skb);
 27 
 28     /* 接收到的最大分片长度 < mtu,则将mtu设置为该值 */
 29     if (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size < mtu)
 30         mtu = IPCB(skb)->frag_max_size;
 31 
 32     /*
 33      *    Setup starting values.
 34      */
 35 
 36     hlen = iph->ihl * 4;
 37     mtu = mtu - hlen;    /* Size of data space */
 38     IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
 39 
 40     /* When frag_list is given, use it. First, check its validity:
 41      * some transformers could create wrong frag_list or break existing
 42      * one, it is not prohibited. In this case fall back to copying.
 43      *
 44      * LATER: this step can be merged to real generation of fragments,
 45      * we can switch to copy when see the first bad fragment.
 46      */
 47     /* 有分片列表 */
 48     if (skb_has_frag_list(skb)) {
 49         struct sk_buff *frag, *frag2;
 50 
 51         /* 线性区域和分页区的数据长度 */
 52         unsigned int first_len = skb_pagelen(skb);
 53 
 54         /* 以下情况,进入慢路处理 */
 55         if (first_len - hlen > mtu || /* 分片长度>MTU */
 56             ((first_len - hlen) & 7) || /* 没有8字节对齐 */
 57             ip_is_fragment(iph) || /* 是一个分片 */
 58             skb_cloned(skb)) /* 是克隆的 */
 59             goto slow_path;
 60 
 61         /* 遍历分片列表 */
 62         skb_walk_frags(skb, frag) {
 63             /* Correct geometry. */
 64             /* 以下情况,恢复状态,进入慢速路径 */
 65             if (frag->len > mtu || /* 分片长度>mtu */
 66                 ((frag->len & 7) && frag->next) || /* 除最后一个分片外,其余有非8字节对齐的 */
 67                 skb_headroom(frag) < hlen) /* 头部长度过小 */
 68                 goto slow_path_clean;
 69 
 70             /* Partially cloned skb? */
 71             /* 克隆的,恢复状态,进入慢速路径 */    
 72             if (skb_shared(frag))
 73                 goto slow_path_clean;
 74 
 75             BUG_ON(frag->sk);
 76 
 77             /* 分片关联控制块 */
 78             if (skb->sk) {
 79                 frag->sk = skb->sk;
 80                 frag->destructor = sock_wfree;
 81             }
 82 
 83             /* 第一个skb的长度去掉当前分片的长度 */
 84             skb->truesize -= frag->truesize;
 85         }
 86 
 87         /* Everything is OK. Generate! */
 88 
 89         /* 现在分片没问题了,设置分片信息 */
 90         err = 0;
 91         offset = 0;
 92         frag = skb_shinfo(skb)->frag_list;
 93         skb_frag_list_init(skb);
 94         skb->data_len = first_len - skb_headlen(skb);
 95         skb->len = first_len;
 96         iph->tot_len = htons(first_len);
 97         iph->frag_off = htons(IP_MF);
 98         ip_send_check(iph);
 99 
100         /* 循环设置分片信息,并发送 */
101         for (;;) {
102             /* Prepare header of the next frame,
103              * before previous one went down. */
104              /* 为每一片都拷贝ip头,设置偏移信息 */
105             if (frag) {
106                 frag->ip_summed = CHECKSUM_NONE;
107                 skb_reset_transport_header(frag);
108                 __skb_push(frag, hlen);
109                 skb_reset_network_header(frag);
110                 memcpy(skb_network_header(frag), iph, hlen);
111                 iph = ip_hdr(frag);
112                 iph->tot_len = htons(frag->len);
113                 ip_copy_metadata(frag, skb);
114                 if (offset == 0)
115                     ip_options_fragment(frag);
116                 offset += skb->len - hlen;
117                 iph->frag_off = htons(offset>>3);
118                 if (frag->next)
119                     iph->frag_off |= htons(IP_MF);
120                 /* Ready, complete checksum */
121                 ip_send_check(iph);
122             }
123 
124             /* 调用发送回调 */
125             err = output(net, sk, skb);
126 
127             if (!err)
128                 IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
129             if (err || !frag)
130                 break;
131 
132             skb = frag;
133             frag = skb->next;
134             skb->next = NULL;
135         }
136 
137         if (err == 0) {
138             IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS);
139             return 0;
140         }
141 
142         /* 出错,释放分片 */
143         while (frag) {
144             skb = frag->next;
145             kfree_skb(frag);
146             frag = skb;
147         }
148         IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
149         return err;
150 
151 slow_path_clean:
152         /* 将分片恢复原状态 */
153         skb_walk_frags(skb, frag2) {
154             if (frag2 == frag)
155                 break;
156             frag2->sk = NULL;
157             frag2->destructor = NULL;
158             skb->truesize += frag2->truesize;
159         }
160     }
161 
162 slow_path:
163     /* 慢速分片路径 */
164 
165 
166     iph = ip_hdr(skb);
167 
168     /* 除去首部的剩余空间 */
169     left = skb->len - hlen;        /* Space per frame */
170     ptr = hlen;        /* Where to start from */
171 
172     /* 二层头部空间 */
173     ll_rs = LL_RESERVED_SPACE(rt->dst.dev);
174 
175     /*
176      *    Fragment the datagram.
177      */
178 
179     /* 初始化mf和offset */
180     offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
181     not_last_frag = iph->frag_off & htons(IP_MF);
182 
183     /*
184      *    Keep copying data until we run out.
185      */
186 
187     /* 开始分片了 */
188     while (left > 0) {
189         /* len初始为剩余长度 */
190         len = left;
191         /* IF: it doesn't fit, use 'mtu' - the data space left */
192         /* 根据mtu确认长度 */
193         if (len > mtu)
194             len = mtu;
195         /* IF: we are not sending up to and including the packet end
196            then align the next start on an eight byte boundary */
197         /* 除最后分片外,其余8字节对齐 */
198         if (len < left)    {
199             len &= ~7;
200         }
201 
202         /* Allocate buffer */
203         /* 分配skb */
204         skb2 = alloc_skb(len + hlen + ll_rs, GFP_ATOMIC);
205         if (!skb2) {
206             err = -ENOMEM;
207             goto fail;
208         }
209 
210         /*
211          *    Set up data on packet
212          */
213 
214         /* 拷贝元数据 */
215         ip_copy_metadata(skb2, skb);
216 
217         /* 预留空间,设置头部偏移 */
218         skb_reserve(skb2, ll_rs);
219         skb_put(skb2, len + hlen);
220         skb_reset_network_header(skb2);
221         skb2->transport_header = skb2->network_header + hlen;
222 
223         /*
224          *    Charge the memory for the fragment to any owner
225          *    it might possess
226          */
227         /* 关联sk */
228         if (skb->sk)
229             skb_set_owner_w(skb2, skb->sk);
230 
231         /*
232          *    Copy the packet header into the new buffer.
233          */
234 
235         /* 拷贝头部 */
236         skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
237 
238         /*
239          *    Copy a block of the IP datagram.
240          */
241         /* 拷贝数据 */
242         if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
243             BUG();
244         left -= len;
245 
246         /*
247          *    Fill in the new header fields.
248          */
249         iph = ip_hdr(skb2);
250 
251         /* 设置偏移 *//
252         iph->frag_off = htons((offset >> 3));
253 
254         /* 转发的数据包,带有FRAG_PMTU标记,则打上DF */
255         if (IPCB(skb)->flags & IPSKB_FRAG_PMTU)
256             iph->frag_off |= htons(IP_DF);
257 
258         /* ANK: dirty, but effective trick. Upgrade options only if
259          * the segment to be fragmented was THE FIRST (otherwise,
260          * options are already fixed) and make it ONCE
261          * on the initial skb, so that all the following fragments
262          * will inherit fixed options.
263          */
264         /* 第一个分片包含ip选项 */
265         if (offset == 0)
266             ip_options_fragment(skb);
267 
268         /*
269          *    Added AC : If we are fragmenting a fragment that's not the
270          *           last fragment then keep MF on each bit
271          */
272         /* 不是最后分片需要设定MF标记 */
273         if (left > 0 || not_last_frag)
274             iph->frag_off |= htons(IP_MF);
275 
276         /* 指针和偏移更新 */
277         ptr += len;
278         offset += len;
279 
280         /*
281          *    Put this fragment into the sending queue.
282          */
283         /* 设置数据长度 */
284         iph->tot_len = htons(len + hlen);
285 
286         /* 校验和 */
287         ip_send_check(iph);
288 
289         /* 发送分片 */
290         err = output(net, sk, skb2);
291         if (err)
292             goto fail;
293 
294         IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
295     }
296 
297     /* 分片完成并发送,释放skb */
298     consume_skb(skb);
299     IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS);
300     return err;
301 
302 fail:
303 
304     /* 出错,释放skb */
305     kfree_skb(skb);
306     IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
307     return err;
308 }

猜你喜欢

转载自www.cnblogs.com/wanpengcoder/p/11755374.html