Linux Kernel2.6.9内核源码分析--select

Linux Kernel2.6.9内核源码分析–select

需要解决的问题:
通过追踪内核源码,查看内核是如何实现select监听的功能

首先来看下select API的定义和参数:
int select(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, struct timeval *timeout);
参数说明:
int nfds:是一个整数值, 表示集合中所有文件描述符的范围,即所有文件描述符的最大值+1
在后面的代码中可以看到Linux 内核的实现方式是以0 ~ nfds-1作为下标,在进程描述符的files数组中依次检查各个文件描述符有没有事件上报
fd_set *readfds, *writefds,*exceptfds:分别代表监听读/写/异常的文件描述符集,实际上是一个由long型数组构成的位图(bitmap),每个文件描述符对应其中的一个bit.当select返回后,内核会用结果覆盖这些集合,集合中的内容不再代表调用前设置的待监听文件描述符,因此每次调用select前都需要重新初始化这些文件描述符集.如果某个文件描述符上有事件发生,则将对应fds中的bit设置为1,没有事件发生就设置为0。如readfds集合中第二个文件描述符有读事件发生,则返回后位图中与它对应的bit为1,而其它没有事件发生的bit都被清0.
struct timeval *timeout:超时时间,超过这个时间后,无论有没有监听到事件,都不再阻塞而是立刻返回.
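返回值:成功时返回就绪的事件个数(即后面do_select中累加的retval),超时且没有任何事件时返回0,出错时系统调用返回负的错误码(如后面代码中的-ENOMEM, -EFAULT),经过C库封装后用户看到的是返回-1并设置errno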

再来看下select API对应的系统调用:
long sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timeval __user *tvp)
1.查看参数n是否超过了最大值max_fdset,超过了n就等于max_fdset
2.从slab中分配6 x size字节的空间(size = FDS_BYTES(n),即容纳n个bit的位图以long为单位取整后的字节数),分别为:fds.in ,fds.out, fds.ex, fds.res_in fds.res_out,fds.res_ex,然后将user space的参数inp, outp, exp 分别copy到fds.in ,fds.out, fds.ex
3.调用do_select,在该函数中会sleep直到timeout //核心函数
4.通过set_fd_set将fds.res_in fds.res_out,fds.res_ex 分别copy回user space的inp, outp, exp,从而用户就知道哪个文件描述符有事件发生.

/*
 * sys_select - system-call side of select() (excerpt; the dotted lines
 * stand for code that is not shown here).
 *
 * @n:    number of descriptors to examine; clamped to the caller's max_fdset
 * @inp:  user-space read set, overwritten with the result on return
 * @outp: user-space write set, overwritten with the result on return
 * @exp:  user-space exception set, overwritten with the result on return
 * @tvp:  user-space timeout; it is only consumed in the elided part
 *
 * Returns the value produced by do_select(), -ENOMEM if the bitmap buffer
 * cannot be allocated, the error from get_fd_set() if reading a set fails,
 * or -EFAULT if writing a result set back fails.
 */
long sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timeval __user *tvp)
{
    ...............
	/* max_fdset can increase, so grab it once to avoid race */
	max_fdset = current->files->max_fdset;
	if (n > max_fdset)
		n = max_fdset;
	/*
	 * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
	 * since we used fdset we need to allocate memory in units of
	 * long-words.
	 */
	ret = -ENOMEM;
	size = FDS_BYTES(n);
	bits = select_bits_alloc(size);
	if (!bits)
		goto out_nofds;
	/* Carve the single allocation into six consecutive bitmaps of `size` bytes. */
	fds.in      = (unsigned long *)  bits;
	fds.out     = (unsigned long *) (bits +   size);
	fds.ex      = (unsigned long *) (bits + 2*size);
	fds.res_in  = (unsigned long *) (bits + 3*size);
	fds.res_out = (unsigned long *) (bits + 4*size);
	fds.res_ex  = (unsigned long *) (bits + 5*size);

	/*
	 * Fill the three request bitmaps from the user-supplied sets; the first
	 * failure aborts with that error. (get_fd_set() itself is not shown
	 * here -- presumably a copy from user space.)
	 */
	if ((ret = get_fd_set(n, inp, fds.in)) ||
	    (ret = get_fd_set(n, outp, fds.out)) ||
	    (ret = get_fd_set(n, exp, fds.ex)))
		goto out;
	/* Result bitmaps start empty; do_select() only ever sets bits in them. */
	zero_fd_set(n, fds.res_in);
	zero_fd_set(n, fds.res_out);
	zero_fd_set(n, fds.res_ex);

	ret = do_select(n, &fds, &timeout);
    ...........
	/* Hand the result bitmaps back through the caller's three set pointers. */
	if (set_fd_set(n, inp, fds.res_in) ||
	    set_fd_set(n, outp, fds.res_out) ||
	    set_fd_set(n, exp, fds.res_ex))
		ret = -EFAULT;
out:
	select_bits_free(bits, size);
out_nofds:
	return ret;
}

再来看 int do_select(int n, fd_set_bits *fds, long *timeout)

  1. void poll_initwait(struct poll_wqueues *pwq) 是将struct poll_wqueues table变量进行初始化:
    /* Per-call bookkeeping that do_select() sets up with poll_initwait(). */
    struct poll_wqueues {
    poll_table pt;                  /* handed to each file's poll method as `wait` */
    struct poll_table_page * table; /* storage for wait-queue entries; NULL after init */
    int error;                      /* 0 after init; do_select() returns it when non-zero */
    };
    其中pt 的类型是poll_table,它内部包含一个函数指针qproc,类型为:typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *);
    table.pt.qproc = __pollwait 函数,后面再来分析这个函数
    table.table = NULL,
    table.error = 0
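  2. 大循环依次遍历每个文件描述符,调用file->f_op->poll,并且只在第一轮遍历时传入table.pt:第一轮遍历结束后wait会被置为NULL,因为等待队列项在第一轮已经全部挂好,后面被唤醒后再次遍历时不需要重复添加;另外一旦retval > 0,说明本次select不会睡眠而是马上返回,后面的文件也就没有必要再挂到等待队列上了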
    在前一篇博客eventpoll中有解释到,以本地socket为例,file->f_op->poll,最终会调用到unsigned int unix_poll(struct file * file, struct socket *sock, poll_table *wait),其会先调用
    /*
     * Invoke the callback stored in the poll table, if there is one.
     * Does nothing when p is NULL (the caller does not want to be queued)
     * or when wait_address is NULL.
     */
    static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
    {
    if (p && wait_address)
    p->qproc(filp, wait_address, p);
    }
    也就是调用前面设置的table.pt.qproc,即__pollwait 函数
    再来看下__pollwait函数:
    /*
     * Queue the current task on wait_address on behalf of filp.
     * NOTE(review): this is an abridged excerpt -- `table` is used below but
     * its declaration (presumably derived from _p) is not shown here.
     */
    void __pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *_p)
    {

    {
    /* Take the next free entry of the table and advance the cursor past it. */
    struct poll_table_entry * entry = table->entry;
    table->entry = entry+1;
    get_file(filp);
    /* Remember which file and which wait queue this entry belongs to. */
    entry->filp = filp;
    entry->wait_address = wait_address;
    /* Bind the entry to the current task and link it into the wait queue. */
    init_waitqueue_entry(&entry->wait, current);
    add_wait_queue(wait_address,&entry->wait);
    }
    }
    也就是为当前进程创建一个等待队列项,并把它加入到wait_address(对本地socket来说就是sk->sk_sleep)这个等待队列中,在socket状态变化时,该等待队列被唤醒,从而唤醒等待的进程.
  3. 调用file->f_op->poll返回后,如果有event,则将res_in/res_out/res_ex中该文件描述符对应的bit设定为1.
    /*
     * Poll method for a unix socket: optionally queue the caller on
     * sk->sk_sleep through poll_wait(), then build and return the mask of
     * POLL* events that currently hold for the socket.
     */
    static unsigned int unix_poll(struct file * file, struct socket *sock, poll_table *wait)
    {
    	struct sock *sk = sock->sk;
    	unsigned int mask;
    
    	/* Queue first, then sample the state below. */
    	poll_wait(file, sk->sk_sleep, wait);
    	mask = 0;
    
    	/* exceptional events? */
    	if (sk->sk_err)
    		mask |= POLLERR;
    	if (sk->sk_shutdown == SHUTDOWN_MASK)
    		mask |= POLLHUP;
    
    	/* readable? */
    	if (!skb_queue_empty(&sk->sk_receive_queue) ||
    	    (sk->sk_shutdown & RCV_SHUTDOWN))
    		mask |= POLLIN | POLLRDNORM;
    
    	/* Connection-based need to check for termination and startup */
    	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && sk->sk_state == TCP_CLOSE)
    		mask |= POLLHUP;
    
    	/*
    	 * we set writable also when the other side has shut down the
    	 * connection. This prevents stuck sockets.
    	 */
    	if (unix_writable(sk))
    		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
    
    	return mask;
    }
    
/*
 * do_select - core of select(): poll every requested descriptor and sleep
 * until at least one is ready, the timeout runs out, or a signal is pending.
 * (Excerpt: the dotted line stands for declarations and setup not shown.)
 *
 * @n:       descriptors 0 .. n-1 are examined
 * @fds:     in/out/ex are the requested bitmaps; res_in/res_out/res_ex
 *           receive the ready bits
 * @timeout: on return the remaining __timeout is stored here; __timeout is
 *           initialised in the elided part (presumably from *timeout)
 *
 * Returns the number of ready bits counted in retval; 0 when the loop ends
 * on timeout or a pending signal with nothing ready; or table.error when
 * that is set and nothing is ready.
 */
int do_select(int n, fd_set_bits *fds, long *timeout)
{
    .............
	poll_initwait(&table); // -------> step 1
	wait = &table.pt;
	/* A zero timeout means we never sleep, so never queue on any wait queue. */
	if (!__timeout)
		wait = NULL;
	retval = 0;
	for (;;) {
		unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
		/* Set the task state before scanning, ahead of the schedule_timeout() below. */
		set_current_state(TASK_INTERRUPTIBLE);
		inp = fds->in; outp = fds->out; exp = fds->ex;
		rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;
		/* One iteration per bitmap word; i is advanced inside the body. */
		for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
			unsigned long in, out, ex, all_bits, bit = 1, mask, j;
			unsigned long res_in = 0, res_out = 0, res_ex = 0;
			struct file_operations *f_op = NULL;
			struct file *file = NULL;

			in = *inp++; out = *outp++; ex = *exp++;
			all_bits = in | out | ex;
			/* Nothing requested in this word: skip all of its descriptors at once. */
			if (all_bits == 0) {
				i += __NFDBITS;
				continue;
			}

			for (j = 0; j < __NFDBITS; ++j, ++i, bit <<= 1) {
				if (i >= n)
					break;
				if (!(bit & all_bits))
					continue;
				/* A descriptor for which fget() yields NULL is skipped silently. */
				file = fget(i);
				if (file) {
					f_op = file->f_op;
					/* Files without a poll method report DEFAULT_POLLMASK. */
					mask = DEFAULT_POLLMASK;
					/*
					 * Once retval is non-zero the loop below breaks
					 * before schedule_timeout(), i.e. we will not sleep,
					 * so there is no point in queueing on further wait
					 * queues: pass NULL instead of wait.
					 */
					if (f_op && f_op->poll)
						mask = (*f_op->poll)(file, retval ? NULL : wait);
					fput(file);
					/* Count one event per requested set in which the fd is ready. */
					if ((mask & POLLIN_SET) && (in & bit)) {
						res_in |= bit;
						retval++;
					}
					if ((mask & POLLOUT_SET) && (out & bit)) {
						res_out |= bit;
						retval++;
					}
					if ((mask & POLLEX_SET) && (ex & bit)) {
						res_ex |= bit;
						retval++;
					}
				}
			}
			/* Store only non-empty result words. */
			if (res_in)
				*rinp = res_in;
			if (res_out)
				*routp = res_out;
			if (res_ex)
				*rexp = res_ex;
		}
		/*
		 * The first full pass has passed `wait` to every poll method that
		 * needed it; later passes after a wake-up must not queue again.
		 */
		wait = NULL;
		if (retval || !__timeout || signal_pending(current))  // -------> step 3
			break;
		if(table.error) {
			retval = table.error;
			break;
		}
		/* Sleep; the value returned becomes the new remaining timeout. */
		__timeout = schedule_timeout(__timeout);
	}
	__set_current_state(TASK_RUNNING);
	poll_freewait(&table);
	*timeout = __timeout;
	return retval;
}
发布了20 篇原创文章 · 获赞 0 · 访问量 569

猜你喜欢

转载自blog.csdn.net/weixin_38537730/article/details/104097648