inet_csk_get_port(...)

1. How the TCP protocol hooks up tcp_hashinfo

struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.mtu_reduced		= tcp_v4_mtu_reduced,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_DESTROY_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
#ifdef CONFIG_MEMCG_KMEM
	.init_cgroup		= tcp_init_cgroup,
	.destroy_cgroup		= tcp_destroy_cgroup,
	.proto_cgroup		= tcp_proto_cgroup,
#endif
};
EXPORT_SYMBOL(tcp_prot);

Among these, note the following member initialization:

.h.hashinfo		= &tcp_hashinfo,

tcp_hashinfo itself is a global instance of struct inet_hashinfo, declared as follows:

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

2. Definition of struct inet_hashinfo

struct inet_hashinfo {
	/* This is for sockets with full identity only.  Sockets here will
	 * always be without wildcards and will have the following invariant:
	 *
	 *          TCP_ESTABLISHED <= sk->sk_state < TCP_CLOSE
	 *
	 * TIME_WAIT sockets use a separate chain (twchain).
	 */
	struct inet_ehash_bucket	*ehash; //established hash table
	spinlock_t			*ehash_locks;
	unsigned int			ehash_mask; //mask = number of buckets - 1; the kernel log reports 512 buckets, so the mask is 511
	unsigned int			ehash_locks_mask;

	/* Ok, let's try this, I give up, we do need a local binding
	 * TCP hash as well as the others for fast bind/connect.
	 */
	struct inet_bind_hashbucket	*bhash; //bind hash bucket array

	unsigned int			bhash_size; //number of buckets in the bhash array above; the kernel log reports 512
	/* 4 bytes hole on 64 bit */

	struct kmem_cache		*bind_bucket_cachep;

	/* All the above members are written once at bootup and
	 * never written again _or_ are predominantly read-access.
	 *
	 * Now align to a new cache line as all the following members
	 * might be often dirty.
	 */
	/* All sockets in TCP_LISTEN state will be in here.  This is the only
	 * table where wildcard'd TCP sockets can exist.  Hash function here
	 * is just local port number.
	 */
	struct inet_listen_hashbucket	listening_hash[INET_LHTABLE_SIZE]
					____cacheline_aligned_in_smp;

	atomic_t			bsockets;
};

The main members defined inside this structure are the following.

a. The established hash buckets and related parameters

	struct inet_ehash_bucket	*ehash; //established hash buckets
	spinlock_t			*ehash_locks; //locks protecting the established table
	unsigned int			ehash_mask; //established table mask; 512 buckets in the kernel log means a mask of 511
	unsigned int			ehash_locks_mask; //mask for the ehash_locks array

The inet_ehash_bucket structure is defined as follows:

struct inet_ehash_bucket {
	struct hlist_nulls_head chain; 
	struct hlist_nulls_head twchain;
};
struct hlist_nulls_head {
	struct hlist_nulls_node *first; //first node in the chain
};
struct hlist_nulls_node {
	struct hlist_nulls_node *next, **pprev; //hash list linkage pointers
};
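The chain and twchain members use the kernel's "nulls" variant of hlist: instead of ending in a plain NULL, the last next pointer holds an odd value that encodes an identifier (the second argument of INIT_HLIST_NULLS_HEAD() in tcp_init() below), so a lockless RCU reader can notice that it has been moved onto a different chain. Below is a minimal userspace sketch of that encoding; the helper names mirror the kernel's is_a_nulls()/get_nulls_value(), but the program is only an illustration, not kernel code.

#include <stdio.h>

/* Simplified copy of the hlist_nulls idea: the list is terminated not by
 * NULL but by an odd pointer value that encodes an ID. */
struct hlist_nulls_node {
	struct hlist_nulls_node *next, **pprev;
};

static int is_a_nulls(const struct hlist_nulls_node *ptr)
{
	return (unsigned long)ptr & 1;		/* odd value => end marker */
}

static unsigned long get_nulls_value(const struct hlist_nulls_node *ptr)
{
	return (unsigned long)ptr >> 1;		/* recover the encoded ID */
}

int main(void)
{
	/* what INIT_HLIST_NULLS_HEAD(head, 7) would store in head->first */
	struct hlist_nulls_node *first =
		(struct hlist_nulls_node *)((7UL << 1) | 1UL);

	printf("end marker: %d, encoded id: %lu\n",
	       is_a_nulls(first), get_nulls_value(first));
	return 0;
}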

b. The bind hash buckets and the corresponding memory cache

	struct inet_bind_hashbucket	*bhash; //bind hash bucket array

	unsigned int			bhash_size; //number of buckets in the bhash array above; the kernel log reports 512
	/* 4 bytes hole on 64 bit */

	struct kmem_cache		*bind_bucket_cachep; //kmem cache used to allocate struct inet_bind_bucket objects

The inet_bind_hashbucket structure is defined as follows:

struct inet_bind_hashbucket {
	spinlock_t		lock;
	struct hlist_head	chain;
};
struct hlist_head {
	struct hlist_node *first;
};
struct hlist_node {
	struct hlist_node *next, **pprev;
};
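Given a local port, the kernel picks the bind bucket by hashing the port into the bhash array; in this kernel the hash (inet_bhashfn()) is essentially (lport + net_hash_mix(net)) & (bhash_size - 1), which relies on bhash_size being a power of two. Below is a simplified userspace sketch of that indexing, with the per-namespace mixing value replaced by a plain parameter; it is illustrative only.

#include <stdio.h>

#define BHASH_SIZE 512U	/* power of two, like tcp_hashinfo.bhash_size */

/* Simplified stand-in for inet_bhashfn(): add a per-namespace mix value
 * and mask the result into the table. */
static unsigned int bhashfn(unsigned short lport, unsigned int net_mix)
{
	return (lport + net_mix) & (BHASH_SIZE - 1);
}

int main(void)
{
	unsigned short ports[] = { 80, 8080, 65535 };
	unsigned int i;

	for (i = 0; i < sizeof(ports) / sizeof(ports[0]); i++)
		printf("port %5u -> bhash[%u]\n",
		       (unsigned int)ports[i], bhashfn(ports[i], 0));
	return 0;
}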

3. Initialization of the struct inet_hashinfo members

Source file: linux-3.10.x\net\ipv4\tcp.c

Call path: inet_init() --> tcp_init()

void __init tcp_init(void)
{
	struct sk_buff *skb = NULL;
	unsigned long limit;
	int max_rshare, max_wshare, cnt;
	unsigned int i;

	BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));

	percpu_counter_init(&tcp_sockets_allocated, 0); //counter of allocated TCP sockets
	percpu_counter_init(&tcp_orphan_count, 0); //counter of orphaned TCP sockets

	//create the kmem cache behind tcp_hashinfo.bind_bucket_cachep
	tcp_hashinfo.bind_bucket_cachep =
		kmem_cache_create("tcp_bind_bucket",
				  sizeof(struct inet_bind_bucket), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	/* Size and allocate the main established and bind bucket
	 * hash tables.
	 *
	 * The methodology is similar to that of the buffer cache.
	 */
	 
	//allocate the established hash table
	tcp_hashinfo.ehash =
		alloc_large_system_hash("TCP established",
					sizeof(struct inet_ehash_bucket),
					thash_entries,
					17, /* one slot per 128 KB of memory */
					0,
					NULL,
					//the resulting mask is returned in ehash_mask
					&tcp_hashinfo.ehash_mask, 
					0,
					thash_entries ? 0 : 512 * 1024);

	//walk every bucket, index 0 .. ehash_mask
	for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) {
		//initialize both chains, encoding the bucket index as the nulls end marker
		INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
		INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i);
	}

	//allocate and initialize the ehash locks and set their count
	if (inet_ehash_locks_alloc(&tcp_hashinfo)) //allocate the ehash lock array
		panic("TCP: failed to alloc ehash_locks");

	//---------------------------------------------------------------------

	//allocate the bind hash bucket array
	tcp_hashinfo.bhash =
		alloc_large_system_hash("TCP bind",
					sizeof(struct inet_bind_hashbucket),
					tcp_hashinfo.ehash_mask + 1, //i.e. 512 here
					17, /* one slot per 128 KB of memory */
					0,
					//alloc_large_system_hash() stores log2 of the table size here;
					//the shift below turns it into the real size, 512 in the kernel log
					&tcp_hashinfo.bhash_size,
					NULL,
					0,
					64 * 1024);
	tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
	for (i = 0; i < tcp_hashinfo.bhash_size; i++) { //walk every bind hash bucket
		spin_lock_init(&tcp_hashinfo.bhash[i].lock); //initialize the bucket's lock
		INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain); //initialize the bucket's chain
	}


	cnt = tcp_hashinfo.ehash_mask + 1;

	tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
	sysctl_tcp_max_orphans = cnt / 2;
	sysctl_max_syn_backlog = max(128, cnt / 256);

	tcp_init_mem(&init_net);
	/* Set per-socket limits to no more than 1/128 the pressure threshold */
	limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
	max_wshare = min(4UL*1024*1024, limit);
	max_rshare = min(6UL*1024*1024, limit);

	sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
	sysctl_tcp_wmem[1] = 16*1024;
	sysctl_tcp_wmem[2] = max(64*1024, max_wshare);

	sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
	sysctl_tcp_rmem[1] = 87380;
	sysctl_tcp_rmem[2] = max(87380, max_rshare);

	pr_info("Hash tables configured (established %u bind %u)\n",
		tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
	//on this system the kernel prints: "TCP: Hash tables configured (established 512 bind 512)"

	tcp_metrics_init();

	tcp_register_congestion_control(&tcp_reno);

	tcp_tasklet_init();
}
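Note that ehash_mask is a mask rather than a count: for a power-of-two table of N buckets the mask is N - 1, and the pr_info() line above prints ehash_mask + 1. So "established 512" in the kernel log corresponds to ehash_mask == 511. A tiny sketch of that relationship, for illustration only:

#include <stdio.h>

int main(void)
{
	unsigned int ehash_mask = 512 - 1;	/* a table of 512 buckets */
	unsigned int hash = 0xdeadbeef;		/* some connection hash value */

	/* masking keeps any hash value inside the table */
	printf("established %u, hash 0x%x falls into bucket %u\n",
	       ehash_mask + 1, hash, hash & ehash_mask);
	return 0;
}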

4. inet_csk_get_port()

4.1 Source code analysis

Sections 1 through 3 above lay the groundwork for this one; with that in place we can walk through the source of inet_csk_get_port().

int inet_csk_get_port(struct sock *sk, unsigned short snum)
{
	//get the protocol's global hash container: hashinfo = tcp_hashinfo.
	//It is hooked up in struct proto tcp_prot, and part of tcp_hashinfo
	//is initialized in tcp_init(); see that function above to understand
	//how the pieces fit together
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	struct inet_bind_hashbucket *head;
	struct inet_bind_bucket *tb;
	int ret, attempts = 5;
	struct net *net = sock_net(sk);
	int smallest_size = -1, smallest_rover;
	kuid_t uid = sock_i_uid(sk); //uid of the user that owns this socket

	local_bh_disable();
	//snum == 0 means the application did not ask for a specific port,
	//so the kernel must pick an unused local port itself
	if (!snum) { //no port specified
		int remaining, rover, low, high;

again:
		inet_get_local_port_range(&low, &high); //local port range (ip_local_port_range sysctl, default 32768..61000)
		remaining = (high - low) + 1; //number of ports in the range
		smallest_rover = rover = net_random() % remaining + low; //random starting port inside the range

		smallest_size = -1;
		do {
			//skip locally reserved ports
			if (inet_is_reserved_local_port(rover))
				goto next_nolock; //reserved: move on to the next port, i.e. ++rover

			//hash the port to find the bind hash bucket head it belongs to
			head = &hashinfo->bhash[inet_bhashfn(net, rover,
					hashinfo->bhash_size)];

			/* lock this hash bucket */
			spin_lock(&head->lock);

			/* Walk the bucket's chain; inet_bind_bucket_for_each() uses the
			   container_of mechanism to turn each node into its enclosing tb */
			inet_bind_bucket_for_each(tb, &head->chain)
				/* the port is already in use: check whether it can be shared */
				if (net_eq(ib_net(tb), net) && tb->port == rover) {
					if (((tb->fastreuse > 0 && //the bucket allows fast reuse
					      sk->sk_reuse && //and this socket set SO_REUSEADDR
					      sk->sk_state != TCP_LISTEN) || //and it is not listening
					     (tb->fastreuseport > 0 &&
					      sk->sk_reuseport &&
					      uid_eq(tb->fastuid, uid))) && //or SO_REUSEPORT with the same uid
					    (tb->num_owners < smallest_size || smallest_size == -1)) {
						smallest_size = tb->num_owners; /* remember how many sockets own this port */
						smallest_rover = rover; /* and remember the port itself */

						/* if many ports are already bound, check for a bind conflict right away */
						if (atomic_read(&hashinfo->bsockets) > (high - low) + 1 &&
						    !inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
							snum = smallest_rover; /* no conflict: use this port */
							goto tb_found;
						}
					}

					/* check for a bind conflict: can we reuse this port? */
					if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
						snum = rover;
						goto tb_found;
					}
					goto next; /* this port cannot be reused, try the next one */
				}

			/* no bucket exists for this port, so it is free: stop searching */
			break; //an inet_bind_bucket for it is created further below
		next:
			spin_unlock(&head->lock);
		next_nolock:
			if (++rover > high)
				rover = low;
		} while (--remaining > 0);

		/* Exhausted local port range during search?  It is not
		 * possible for us to be holding one of the bind hash
		 * locks if this test triggers, because if 'remaining'
		 * drops to zero, we broke out of the do/while loop at
		 * the top level, not from the 'break;' statement.
		 */
		ret = 1;
		if (remaining <= 0) {
			if (smallest_size != -1) {
				snum = smallest_rover;
				goto have_snum;
			}
			goto fail;
		}
		/* OK, here is the one we will use.  HEAD is
		 * non-NULL and we hold it's mutex.
		 */
		snum = rover; /* the automatically selected free port */
	} else { /* the application specified a port */
have_snum: //a concrete port number is available here
		/* We get here when the caller supplied its own port:
		 1. inet_bhashfn(net, snum, hashinfo->bhash_size): compute the index into the bind hash array
		 2. head = &hashinfo->bhash[*]: the struct inet_bind_hashbucket this port hashes to
		 3. inet_bind_bucket_for_each(tb, &head->chain): walk the bucket's chain (hlist) of already
		 	bound ports; container_of turns each chain node back into its enclosing tb
		 	(struct inet_bind_bucket). Buckets are linked onto the chain in inet_bind_bucket_create()
		 4. net_eq(ib_net(tb), net) && tb->port == snum: same network namespace && same port
		*/
		head = &hashinfo->bhash[inet_bhashfn(net, snum,
				hashinfo->bhash_size)];
		spin_lock(&head->lock);
		inet_bind_bucket_for_each(tb, &head->chain)
			if (net_eq(ib_net(tb), net) && tb->port == snum) //does a bucket on the chain match the requested port?
				goto tb_found; /* the port is already in use */
	}
	tb = NULL;
	goto tb_not_found;
tb_found:
	/* there is already at least one sock bound to this port */
	if (!hlist_empty(&tb->owners)) { //an empty owners list would mean the tb is unused
		/* forced binding: ignore any bind conflict */
		if (sk->sk_reuse == SK_FORCE_REUSE)
			goto success;

		//decide from the socket options whether this port can be fast-reused
		if (((tb->fastreuse > 0 &&
		      sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
		     (tb->fastreuseport > 0 &&
		      sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
		    smallest_size == -1) {  /* the explicitly specified port case */
			goto success;
		} else {
			ret = 1;
			if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {  /* bind conflict on this port */
				/* An automatically chosen port turned out to conflict; retry, at most 5 times.
				 * (Arguably this if is unnecessary, since the automatic path already checked
				 * for conflicts before jumping to tb_found.)
				 */
				if (((sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
				     (tb->fastreuseport > 0 &&
				      sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
				    smallest_size != -1 && --attempts >= 0) {
					spin_unlock(&head->lock);
					goto again;
				}

				goto fail_unlock;
			}
		}
	}
tb_not_found: //no inet_bind_bucket for this port exists in the hash table yet
	ret = 1;
	/* allocate and initialize an inet_bind_bucket; returns the new tb */
	if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep,
					net, head, snum)) == NULL)
		goto fail_unlock;
	if (hlist_empty(&tb->owners)) { //inet_bind_bucket_create() left tb->owners empty
		if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) //sk->sk_reuse was set up in inet_create()
			tb->fastreuse = 1;
		else
			tb->fastreuse = 0;
		if (sk->sk_reuseport) { //SO_REUSEPORT requested
			tb->fastreuseport = 1;
			tb->fastuid = uid; //remember the owning uid
		} else
			tb->fastreuseport = 0;
	} else {
		if (tb->fastreuse && //fast reuse was allowed so far
		    (!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) //but this sock forbids reuse or is listening
			tb->fastreuse = 0; //so disable it
		if (tb->fastreuseport &&
		    (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))) //no SO_REUSEPORT or a different uid
			tb->fastreuseport = 0;
	}
success:
	/* record the chosen inet_bind_bucket in the icsk */
	if (!inet_csk(sk)->icsk_bind_hash) //not yet bound to a bucket; inet_bind_hash() below does it
		inet_bind_hash(sk, tb, snum); //important: links this sock onto tb->owners and sets icsk_bind_hash
	WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
	ret = 0;

fail_unlock:
	spin_unlock(&head->lock);
fail:
	local_bh_enable();
	return ret;
}
EXPORT_SYMBOL_GPL(inet_csk_get_port);

The main job of this function is to take the local port requested through bind() at the protocol layer and first check whether it is unspecified (zero):

a. If the port is unspecified, pick one automatically from the range returned by inet_get_local_port_range(); otherwise bind the port that user space passed in through the system call. A simplified sketch of the automatic selection follows.
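The sketch below models only that automatic selection: pick a random starting port in [low, high], then step upward one port at a time with wrap-around, for at most (high - low) + 1 attempts. It is a simplified userspace model with a fake "port in use" predicate, not the kernel code itself.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* fake predicate standing in for the bind-bucket lookup and conflict check */
static int port_in_use(int port)
{
	return port % 3 == 0;
}

int main(void)
{
	int low = 32768, high = 61000;	/* like inet_get_local_port_range() */
	int remaining = high - low + 1;
	int rover;

	srand((unsigned int)time(NULL));
	rover = rand() % remaining + low;	/* like net_random() % remaining + low */

	do {
		if (!port_in_use(rover)) {
			printf("picked local port %d\n", rover);
			return 0;
		}
		if (++rover > high)		/* wrap around, as the kernel loop does */
			rover = low;
	} while (--remaining > 0);

	printf("local port range exhausted\n");
	return 1;
}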

b. Once a port has been chosen (automatically or by the caller), look it up in the bind hash table to see whether it is already in use and, if so, whether it may be shared (for example by the same uid). If no bucket exists for the port, continue with d.; otherwise continue with c.

		head = &hashinfo->bhash[inet_bhashfn(net, snum,
				hashinfo->bhash_size)];
		spin_lock(&head->lock);
		inet_bind_bucket_for_each(tb, &head->chain)
			if (net_eq(ib_net(tb), net) && tb->port == snum) //does a bucket on the chain match the requested port?
				goto tb_found; /* the port is already in use */

c. If a tb (struct inet_bind_bucket) already exists for the port, check whether it currently has owners, whether the port may be reused (SO_REUSEADDR / SO_REUSEPORT with a matching uid), and then run the bind conflict check.

	if (!hlist_empty(&tb->owners)) { //an empty owners list would mean the tb is unused
		/* forced binding: ignore any bind conflict */
		if (sk->sk_reuse == SK_FORCE_REUSE)
			goto success;

d. If the port is not yet in use, allocate an inet_bind_bucket, record the requested port in it, and link it onto the hash bucket's chain; see inet_bind_bucket_create() below and the structure relationships in section 4.2.

	/* allocate and initialize an inet_bind_bucket; returns the new tb */
	if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep,
					net, head, snum)) == NULL)
struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
						 struct net *net,
						 struct inet_bind_hashbucket *head,
						 const unsigned short snum)
{
	struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);

	if (tb != NULL) {
		write_pnet(&tb->ib_net, hold_net(net)); //tb->ib_net points to the network namespace net
		tb->port      = snum; //the port being bound
		tb->fastreuse = 0;
		tb->fastreuseport = 0;
		tb->num_owners = 0;
		INIT_HLIST_HEAD(&tb->owners); //tb->owners starts out as an empty list
		hlist_add_head(&tb->node, &head->chain); //link the new tb onto the bucket's chain head->chain
	}
	return tb;
}

e. Once the inet_bind_bucket tb has been obtained, inet_bind_hash(sk, tb, snum) links the sock onto tb->owners (through sk->sk_bind_node) and records tb in inet_csk(sk)->icsk_bind_hash.

f. At this point the port is bound both in the bind hash table and on the sock. A simplified userspace model of this lookup-or-create-then-link flow is sketched below.
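Putting b, d and e together: for a chosen port the kernel either finds an existing inet_bind_bucket on the bucket's chain or creates a new one, then links the sock onto that bucket's owners list. The userspace sketch below models only that lookup-or-create-then-link flow with a plain singly linked chain; the names echo the kernel structures, but this is a simplification, not the real implementation.

#include <stdio.h>
#include <stdlib.h>

/* Simplified model: one bind hash chain holding one bucket per port. */
struct bind_bucket {
	unsigned short port;
	int num_owners;			/* how many sockets share this port */
	struct bind_bucket *next;
};

static struct bind_bucket *chain;	/* head of one bhash chain */

/* Find the bucket for @port, creating it if none exists yet
 * (this mirrors the tb_found / tb_not_found split above). */
static struct bind_bucket *get_bucket(unsigned short port)
{
	struct bind_bucket *tb;

	for (tb = chain; tb; tb = tb->next)
		if (tb->port == port)
			return tb;	/* tb_found: the port already has a bucket */

	tb = calloc(1, sizeof(*tb));	/* tb_not_found: create a new bucket */
	if (!tb)
		return NULL;
	tb->port = port;
	tb->next = chain;
	chain = tb;			/* like hlist_add_head(&tb->node, &head->chain) */
	return tb;
}

int main(void)
{
	struct bind_bucket *tb = get_bucket(8080);

	if (tb) {
		tb->num_owners++;	/* like inet_bind_hash() adding the sock to tb->owners */
		printf("port %u bound, owners = %d\n", (unsigned int)tb->port, tb->num_owners);
	}
	return 0;
}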

4.2 The relationship between sock, inet_bind_hashbucket and inet_bind_bucket


The key mechanism here is recovering a pointer to a structure from a pointer to one of its members (container_of):

inet_bind_bucket_for_each(tb, &head->chain)
struct inet_bind_hashbucket *head;
struct inet_bind_bucket *tb;
struct inet_bind_hashbucket {
	spinlock_t		lock;
	struct hlist_head	chain;
};
struct inet_bind_bucket {
#ifdef CONFIG_NET_NS
	struct net		*ib_net; //the network namespace
#endif
	unsigned short		port; //the bound port
	signed char		fastreuse; //fast-reuse flag, initialized to 0
	signed char		fastreuseport; //fast SO_REUSEPORT flag, initialized to 0
	kuid_t			fastuid;
	int			num_owners; //number of owning socks, initialized to 0
	struct hlist_node	node;
	struct hlist_head	owners;
};
inet_bind_bucket_for_each(tb, &head->chain)
#define inet_bind_bucket_for_each(tb, head) \
	hlist_for_each_entry(tb, head, node)
#define hlist_for_each_entry(pos, head, member)				\
	for (pos = hlist_entry_safe((head)->first, typeof(*(pos)), member);\
	     pos;							\
	     pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member))
#define hlist_entry_safe(ptr, type, member) \
	({ typeof(ptr) ____ptr = (ptr); \
	   ____ptr ? hlist_entry(____ptr, type, member) : NULL; \
	})
#define hlist_entry(ptr, type, member) container_of(ptr,type,member)
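A self-contained userspace example of the same container_of trick: given only a pointer to the embedded list node, recover the pointer to the enclosing structure. The offsetof-based macro below is the usual container_of idiom; the structures are made up for the illustration.

#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct node {
	struct node *next;
};

struct bucket {
	unsigned short port;
	struct node    link;	/* embedded list node, like tb->node */
};

int main(void)
{
	struct bucket b = { .port = 8080 };
	struct node *n = &b.link;	/* all the list code ever sees */

	/* walk back from the member to the containing struct bucket */
	struct bucket *tb = container_of(n, struct bucket, link);

	printf("recovered port: %u\n", (unsigned int)tb->port);
	return 0;
}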




Reposted from blog.csdn.net/chenliang0224/article/details/80867326