node99.org | cv

disabling preemption in the linux scheduler.

Homework 2
Due November 10th via handin
Late penalty only 10% until Sunday the 15th

Similar to homework 1, there is almost no coding in this assignment. The goal is for you to research the Linux scheduler found in linux/kernel/sched.c, identify the relevant functions, and tweak the code.

System call

You'll be adding a system call again to set whether the kernel should allow preemption (the default) or disable preemption. So your system call will need to take a parameter.

int sys_set_preemption(int mode)

You'll need to define a global variable in such a way that it is accessible in both sched.c and your system call. You can do this by defining the global variable in sched.c and using the extern keyword. For example:

file1.c:

int a;
void foobar(int);

void main()
{
	a = 1;
	printf("%d\n", a);

	foobar(2);
	printf("%d\n", a);
}

file2.c:

extern int a;

void foobar(int b)
{
	a = b;
}

Changing the scheduler

The hardest part is finding what function to modify. There are several hints on Facebook and the newsgroup that I'll recount here. The first is to consider the per-process variables defined by task_struct and see if there are any that could be related to preemption. Here's an excerpt from the task_struct definition:
struct task_struct {
	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
	struct thread_info *thread_info;
	atomic_t usage;
	unsigned long flags;	/* per process flags, defined below */
	unsigned long ptrace;

	int lock_depth;		/* BKL lock depth */

#ifdef CONFIG_SMP
#ifdef __ARCH_WANT_UNLOCKED_CTXSW
	int oncpu;
#endif
#endif
	int load_weight;	/* for niceness load balancing purposes */
	int prio, static_prio, normal_prio;
	struct list_head run_list;
	struct prio_array *array;

	unsigned short ioprio;
#ifdef CONFIG_BLK_DEV_IO_TRACE
	unsigned int btrace_seq;
#endif
	unsigned long sleep_avg;
	unsigned long long timestamp, last_ran;
	unsigned long long sched_time; /* sched_clock time spent running */
	enum sleep_type sleep_type;

	unsigned long policy;
	cpumask_t cpus_allowed;
	unsigned int time_slice, first_time_slice;

#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
	struct sched_info sched_info;
#endif

	struct list_head tasks;

Here's a more recent tip from Daniela: "The flag need_resched is set by a function called scheduler_tick when a process runs out of timeslice. You need to understand what this function is doing and for regular processes, prevent this flag from being set."

Look at this function and the functions it calls, as well as the process variables that are manipulated by these functions:

/*
 * This function gets called by the timer code, with HZ frequency.
 * We call it with interrupts disabled.
 *
 * It also gets called by the fork code, when changing the parent's
 * timeslices.
 */
void scheduler_tick(void)
{
	unsigned long long now = sched_clock();
	struct task_struct *p = current;
	int cpu = smp_processor_id();
	struct rq *rq = cpu_rq(cpu);

	update_cpu_clock(p, rq, now);

	if (p == rq->idle)
		/* Task on the idle queue */
		wake_priority_sleeper(rq);
	else
		task_running_tick(rq, p);
#ifdef CONFIG_SMP
	update_load(rq);
	if (time_after_eq(jiffies, rq->next_balance))
		raise_softirq(SCHED_SOFTIRQ);
#endif
}

static void task_running_tick(struct rq *rq, struct task_struct *p)
{
	if (p->array != rq->active) {
		/* Task has expired but was not scheduled yet */
		set_tsk_need_resched(p);
		return;
	}

	spin_lock(&rq->lock);
	
	/*
	 * The task was running during this tick - update the
	 * time slice counter. Note: we do not update a thread's
	 * priority until it either goes to sleep or uses up its
	 * timeslice. This makes it possible for interactive tasks
	 * to use up their timeslices at their highest priority levels.
	 */
	if (rt_task(p)) {
		/*
		 * RR tasks need a special form of timeslice management.
		 * FIFO tasks have no timeslices.
		 */
		if ((p->policy == SCHED_RR) && !--p->time_slice) {
			p->time_slice = task_timeslice(p);
			p->first_time_slice = 0;
			set_tsk_need_resched(p);

			/* put it at the end of the queue: */
			requeue_task(p, rq->active);
		}
		goto out_unlock;
	}
	if (!--p->time_slice) {
		dequeue_task(p, rq->active);
		set_tsk_need_resched(p);
		p->prio = effective_prio(p);
		p->time_slice = task_timeslice(p);
		p->first_time_slice = 0;

		if (!rq->expired_timestamp)
			rq->expired_timestamp = jiffies;
		if (!TASK_INTERACTIVE(p) || expired_starving(rq)) {
			enqueue_task(p, rq->expired);
			if (p->static_prio < rq->best_expired_prio)
				rq->best_expired_prio = p->static_prio;
		} else
			enqueue_task(p, rq->active);
	} else {
		/*
		 * Prevent a too long timeslice allowing a task to monopolize
		 * the CPU. We do this by splitting up the timeslice into
		 * smaller pieces.
		 *
		 * Note: this does not mean the task's timeslices expire or
		 * get lost in any way, they just might be preempted by
		 * another task of equal priority. (one with higher
		 * priority would have preempted this task already.) We
		 * requeue this task to the end of the list on this priority
		 * level, which is in essence a round-robin of tasks with
		 * equal priority.
		 *
		 * This only applies to tasks in the interactive
		 * delta range with at least TIMESLICE_GRANULARITY to requeue.
		 */
		if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
			p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
			(p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
			(p->array == rq->active)) {

			requeue_task(p, rq->active);
			set_tsk_need_resched(p);
		}
	}
out_unlock:
	spin_unlock(&rq->lock);
}

Unixbench

You'll need to edit the Makefile for this to compile. Look for a line setting a variable named OPTON, and comment it out using # at the start of the line.