Linux内核源代码情景分析-nanosleep()和pause()

Linux内核源代码情景分析-nanosleep()和pause()

    我们介绍nanosleep()和pause()两个系统调用。

    系统调用nanosleep()在内核中的实现为sys_nanosleep(),代码如下:

/*
 * sys_nanosleep - kernel side of the nanosleep() system call.
 *
 * rqtp: user-space pointer to the requested sleep interval.
 * rmtp: user-space pointer that receives the unslept remainder when the
 *       sleep ends early; may be NULL.
 *
 * Returns 0 when the full interval has elapsed, -EFAULT when a user
 * pointer cannot be copied, -EINVAL for an out-of-range interval and
 * -EINTR when schedule_timeout() reports that time was still left.
 */
asmlinkage long sys_nanosleep(struct timespec *rqtp, struct timespec *rmtp)
{
	struct timespec t;
	unsigned long ticks_left;

	/* Fetch the requested interval from user space. */
	if (copy_from_user(&t, rqtp, sizeof(t)))
		return -EFAULT;

	/* Reject negative values and a nanosecond field of one second or more. */
	if (t.tv_sec < 0 || t.tv_nsec < 0 || t.tv_nsec >= 1000000000L)
		return -EINVAL;

	/*
	 * Requests of at most 2 ms from a task whose policy is not
	 * SCHED_OTHER (i.e. a real-time task) are served by busy-waiting:
	 * really sleeping would only be as precise as the timer tick.
	 * The delay is the nanosecond count rounded up to microseconds.
	 *
	 * On SMP it is important not to do this while holding locks.
	 */
	if (t.tv_sec == 0 && t.tv_nsec <= 2000000L &&
	    current->policy != SCHED_OTHER) {
		udelay((t.tv_nsec + 999) / 1000);
		return 0;
	}

	/*
	 * Convert the interval to timer ticks, adding one extra tick for
	 * any non-zero request.
	 */
	ticks_left = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec);

	/* Mark the task interruptible, then sleep for that many ticks. */
	current->state = TASK_INTERRUPTIBLE;
	ticks_left = schedule_timeout(ticks_left);

	/* Nothing left over: the whole interval was slept. */
	if (!ticks_left)
		return 0;

	/* Woken early: hand the remaining time back if the caller asked. */
	if (rmtp) {
		jiffies_to_timespec(ticks_left, &t);
		if (copy_to_user(rmtp, &t, sizeof(t)))
			return -EFAULT;
	}
	return -EINTR;
}

    

    schedule_timeout,让当前进程睡眠给定的时间,代码如下:

/*
 * schedule_timeout - give up the CPU until woken or until @timeout ticks
 * have passed.
 *
 * timeout: number of timer ticks (jiffies) to wait, or
 *          MAX_SCHEDULE_TIMEOUT to wait without arming a timer.
 *
 * The caller sets current->state before calling (see sys_nanosleep).
 * Returns the number of ticks still remaining, never a negative value.
 */
signed long schedule_timeout(signed long timeout)
{
	struct timer_list timer;
	unsigned long expire;

	switch (timeout)
	{
	case MAX_SCHEDULE_TIMEOUT:
		/*
		 * These two special cases are useful to be comfortable
		 * in the caller. Nothing more. We could take
		 * MAX_SCHEDULE_TIMEOUT from one of the negative value
		 * but I'd like to return a valid offset (>=0) to allow
		 * the caller to do everything it want with the retval.
		 */
		schedule();/* wait with no time limit: no timer is armed */
		goto out;
	default:
		/*
		 * Another bit of PARANOID. Note that the retval will be
		 * 0 since no piece of kernel is supposed to do a check
		 * for a negative retval of schedule_timeout() (since it
		 * should never happens anyway). You just have the printk()
		 * that will tell you if something is gone wrong and where.
		 */
		if (timeout < 0)
		{
			printk(KERN_ERR "schedule_timeout: wrong timeout "
			       "value %lx from %p\n", timeout,
			       __builtin_return_address(0));
			current->state = TASK_RUNNING;
			goto out;
		}
	}

	/* Absolute expiry time: the requested tick count added to the current jiffies. */
	expire = timeout + jiffies;

	/* Set up an on-stack timer that calls process_timeout() with the current task. */
	init_timer(&timer);
	timer.expires = expire;
	timer.data = (unsigned long) current;
	timer.function = process_timeout;

	add_timer(&timer);/* queue the timer */
	schedule();
	/*
	 * Back from schedule(): remove the timer again. Per the surrounding
	 * article, the task may have been woken before the timer fired
	 * (e.g. by a signal), in which case it is still queued.
	 */
	del_timer_sync(&timer);

	/* Ticks still remaining until the planned expiry. */
	timeout = expire - jiffies;

 out:
	/* Clamp a negative remainder to 0. */
	return timeout < 0 ? 0 : timeout;
}
/* One kernel timer, as queued by add_timer() and run by run_timer_list(). */
struct timer_list {
	struct list_head list;		/* links the timer into a timer-vector queue */
	unsigned long expires;		/* expiry time; schedule_timeout() sets it to timeout + jiffies */
	unsigned long data;		/* argument handed to function when the timer runs */
	void (*function)(unsigned long);	/* callback; run_timer_list() calls it as fn(data) */
};


    add_timer,将timer挂入定时器队列,代码如下:

/*
 * add_timer - queue @timer on the timer vectors.
 *
 * The queue is modified under timerlist_lock with interrupts saved and
 * restored. A timer that timer_pending() reports as already queued is
 * left alone and a diagnostic naming the caller is printed instead.
 */
void add_timer(struct timer_list *timer)
{
	unsigned long flags;
	int already_queued;

	spin_lock_irqsave(&timerlist_lock, flags);
	already_queued = timer_pending(timer);
	if (!already_queued)
		internal_add_timer(timer);
	spin_unlock_irqrestore(&timerlist_lock, flags);

	/* Report the double add only after the lock has been released. */
	if (already_queued)
		printk("bug: kernel timer added twice at %p.\n",
				__builtin_return_address(0));
}
/*
 * internal_add_timer - pick the queue for @timer and append it there.
 *
 * The distance from timer_jiffies to the timer's expiry selects one of
 * the vectors tv1..tv5; a bit field of the expiry value then selects
 * the slot inside that vector.
 */
static inline void internal_add_timer(struct timer_list *timer)
{
	/*
	 * must be cli-ed when calling this
	 */
	unsigned long expires = timer->expires;
	/*
	 * Ticks between timer_jiffies and the expiry. The value is unsigned,
	 * so an expiry that lies in the past shows up as a very large number
	 * (handled by the signed check below).
	 */
	unsigned long idx = expires - timer_jiffies;
	struct list_head * vec;

	if (idx < TVR_SIZE) {
		/* Nearest timers: the low TVR_BITS bits of expires index tv1. */
		int i = expires & TVR_MASK;
		vec = tv1.vec + i;
	} else if (idx < 1 << (TVR_BITS + TVN_BITS)) {
		/* The next TVN_BITS bits index tv2; e.g. expires == 256 gives i == 1. */
		int i = (expires >> TVR_BITS) & TVN_MASK;
		/* Author's note: so tv2.vec[0] starts out empty - TODO confirm. */
		vec = tv2.vec + i;
	} else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) {
		int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK;
		vec =  tv3.vec + i;
	} else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) {
		int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK;
		vec = tv4.vec + i;
	} else if ((signed long) idx < 0) {
		/* can happen if you add a timer with expires == jiffies,
		 * or you set a timer to go off in the past
		 */
		/* Such a timer goes into the tv1 slot that is processed next. */
		vec = tv1.vec + tv1.index;
	} else if (idx <= 0xffffffffUL) {
		int i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
		vec = tv5.vec + i;
	} else {
		/* Can only get here on architectures with 64-bit jiffies */
		/* The timer is not queued anywhere; its list head is just initialised. */
		INIT_LIST_HEAD(&timer->list);
		return;
	}
	/*
	 * Timers are FIFO!
	 */
	/* Insert behind vec->prev, which presumably is the current last entry of the queue. */
	list_add(&timer->list, vec->prev);
}
/*
 * Geometry of the timer vectors.
 * TVR_* describes the root vector tv1, TVN_* the vectors tv2..tv5.
 */
/* Number of expiry bits that index a slot in tv2..tv5. */
#define TVN_BITS 6
/* Number of expiry bits that index a slot in tv1. */
#define TVR_BITS 8
/* Slots per vector: 1 << 6 = 64 for tv2..tv5. */
#define TVN_SIZE (1 << TVN_BITS)
/* Slots in tv1: 1 << 8 = 256. */
#define TVR_SIZE (1 << TVR_BITS)
/* Masks that reduce a value to a valid slot index. */
#define TVN_MASK (TVN_SIZE - 1)
#define TVR_MASK (TVR_SIZE - 1)

/* A timer vector with TVN_SIZE queues; used for tv2..tv5. */
struct timer_vec {
	int index;				/* slot that cascade_timers() empties next */
	struct list_head vec[TVN_SIZE];		/* one queue head per slot */
};

/* The root timer vector with TVR_SIZE queues; used for tv1. */
struct timer_vec_root {
	int index;				/* slot that run_timer_list() processes next */
	struct list_head vec[TVR_SIZE];		/* one queue head per slot */
};

/*
 * The five timer vectors. internal_add_timer() puts the nearest timers
 * into tv1 and timers further away into tv2..tv5.
 */
static struct timer_vec tv5;
static struct timer_vec tv4;
static struct timer_vec tv3;
static struct timer_vec tv2;
static struct timer_vec_root tv1;

/*
 * All vectors in order tv1..tv5, so that run_timer_list() can walk them
 * by number. tv1 has a different struct type (a larger array) and is
 * therefore cast to struct timer_vec *.
 */
static struct timer_vec * const tvecs[] = {
	(struct timer_vec *)&tv1, &tv2, &tv3, &tv4, &tv5
};
    数据结构tv1、tv2、...、tv5每个都包含了一个list_head数组,这就是所谓杂凑表,表中的每个元素都是一个定时器队列的队列头。其中tv1与其它几个数据结构的不同仅在于数组的大小,tv1的数组大小是2 ^ 8,而其它几个的大小都是2 ^ 6。这样队列的总数是2 ^ 8 + 4 * (2 ^ 6) = 512个。

    idx为32位,如下图:

Linux内核源代码情景分析-nanosleep()和pause()

    如果idx小于2 ^ 8,以expires的低8位为下标链入数据结构tv1的vec[2 ^ 8]。

    如果idx大于等于2 ^ 8且小于2 ^ 14,也就是idx最多为14位;以expires低14位中的高6位为下标链入数据结构tv2的vec[2 ^ 6]。假设每个到点时间上只有一个定时器,那么一个队列中最多会有256个定时器;因为相同的高6位,不同的低8位,一共有256种组合,都会链入同一个队列(以高6位为下标)。

    以此类推......


    在《Linux内核源代码情景分析-中断下半部(软中断)》一文的最后讲到,时钟中断处理程序的下半部会执行timer_bh,代码如下:

/*
 * timer_bh - bottom half of the timer interrupt (per the surrounding
 * article). Calls update_times() and then runs the timers that are due.
 */
void timer_bh(void)
{
	update_times();		/* defined elsewhere; presumably updates the kernel's time keeping */
	run_timer_list();	/* run every timer whose expiry has been reached */
}
/*
 * run_timer_list - run all timers that are due.
 *
 * Advances timer_jiffies one tick at a time until it has passed jiffies.
 * For every tick the timers queued in the current tv1 slot are detached
 * and their callbacks are called. Whenever tv1.index is 0, timers from
 * the higher vectors are first cascaded down.
 */
static inline void run_timer_list(void)
{
	spin_lock_irq(&timerlist_lock);
	while ((long)(jiffies - timer_jiffies) >= 0) {
		struct list_head *head, *curr;
		/*
		 * tv1.index is 0: refill tv1 by cascading the tv2 queue
		 * selected by tv2.index.
		 */
		if (!tv1.index) {
			int n = 1;
			do {
				cascade_timers(tvecs[n]);
			/*
			 * cascade_timers() has advanced the index. If it is
			 * now 1, slot 0 of this vector was just emptied, so
			 * the next higher vector is cascaded as well (e.g. a
			 * tv3 queue is moved into tv2).
			 */
			} while (tvecs[n]->index == 1 && ++n < NOOF_TVECS);
		}
repeat:
		/* Queue for the current tick; tv1 has TVR_SIZE (256) slots. */
		head = tv1.vec + tv1.index;
		curr = head->next;
		if (curr != head) {/* the queue is not empty */
			struct timer_list *timer;
			void (*fn)(unsigned long);
			unsigned long data;

			timer = list_entry(curr, struct timer_list, list);
 			fn = timer->function;
 			data= timer->data;

			detach_timer(timer);/* remove the timer from its queue */
			/* Clear the link pointers so they no longer point into the queue. */
			timer->list.next = timer->list.prev = NULL;
			/* timer_enter()/timer_exit() are defined elsewhere; presumably they record the running timer - TODO confirm. */
			timer_enter(timer);
			/* The callback runs with timerlist_lock released. */
			spin_unlock_irq(&timerlist_lock);
			fn(data);/* for a schedule_timeout() timer: process_timeout(task) */
			spin_lock_irq(&timerlist_lock);
			timer_exit();
			/* Look at the queue again; it may hold further timers. */
			goto repeat;
		}
		/* This tick is done: advance to the next tick and tv1 slot. */
		++timer_jiffies; 
		tv1.index = (tv1.index + 1) & TVR_MASK;
	}
	spin_unlock_irq(&timerlist_lock);
}
/*
 * cascade_timers - empty the current slot of @tv and queue its timers anew.
 *
 * Every timer in slot tv->index is unlinked and handed back to
 * internal_add_timer(), which files it according to its expiry time.
 * Afterwards the slot is reset to an empty list and tv->index moves on
 * to the next slot, wrapping around at TVN_SIZE.
 */
static inline void cascade_timers(struct timer_vec *tv)
{
	struct list_head *slot = tv->vec + tv->index;
	struct list_head *pos, *following;

	/*
	 * All timers leave this list, so unlinking them one by one is not
	 * strictly needed; the list head is reset after the loop anyway.
	 */
	for (pos = slot->next; pos != slot; pos = following) {
		struct timer_list *t = list_entry(pos, struct timer_list, list);

		/* Remember the successor before the entry is unlinked. */
		following = pos->next;
		list_del(pos);
		internal_add_timer(t);
	}
	INIT_LIST_HEAD(slot);
	tv->index = (tv->index + 1) & TVN_MASK;
}
    tv1一共有256个队列,这里假设每个队列只链入了一个定时器。把它们都执行完后,也就是tv1.index再次为0时,就把tv2->vec + tv2->index这个队列中的定时器,重新放入tv1的256个队列。我们说过tv2的一个队列中的定时器最多是256个,所以正好链入tv1的256个队列。当tv2.index再次为1(tv2->vec + 0这个队列已被搬空)时,就把tv3->vec + tv3->index这个队列的定时器,放入tv2的64个队列。依次类推。


    每个到点的定时器都会执行fn(data),对于schedule_timeout设置的定时器就是process_timeout(current),代码如下:

/*
 * process_timeout - timer callback installed by schedule_timeout().
 *
 * __data carries the task pointer that schedule_timeout() stored in
 * timer.data; that task is woken up.
 */
static void process_timeout(unsigned long __data)
{
	wake_up_process((struct task_struct *) __data);
}
    被唤醒后,这个进程再次被调度执行(我们不关心在什么情况下,调度到这个进程),进程继续运行,返回schedule_timeout,又执行了一次del_timer_sync,这是因为该进程有可能是被其他进程通过信号提前唤醒后再次被调度执行的。此时由于没有在run_timer_list中执行detach_timer,定时器仍然挂在队列中,所以这里补上一次删除。最后返回睡眠剩余的时间。


    系统调用pause()在内核中的实现为sys_pause(),代码如下:

/*
 * sys_pause - kernel side of the pause() system call.
 *
 * Puts the calling task into interruptible sleep without arming any
 * timer and, once it runs again, returns -ERESTARTNOHAND.
 */
asmlinkage int sys_pause(void)
{
	current->state = TASK_INTERRUPTIBLE;	/* sleep that a signal can end */
	schedule();				/* give up the CPU; no timeout is set */
	return -ERESTARTNOHAND;
}
    和sys_nanosleep的区别是:sys_pause只有在接收到信号时才会被唤醒,不会因为定时器到点而被唤醒。

    TASK_INTERRUPTIBLE和TASK_UNINTERRUPTIBLE最大的区别在于:进程在接收到信号时是否可以被唤醒。