--- linux/kernel/fork.c.orig +++ linux/kernel/fork.c @@ -59,6 +59,7 @@ void __put_task_struct(struct task_struc { int cpu = smp_processor_id(); + free_uid(tsk->user); if (tsk != current) { __free_task_struct(tsk); return; @@ -177,7 +178,7 @@ static struct task_struct *dup_task_stru } memcpy(tsk, orig, sizeof(*tsk)); - atomic_set(&tsk->usage,1); + atomic_set(&tsk->usage,2); return tsk; } --- linux/kernel/sched.c.orig +++ linux/kernel/sched.c @@ -31,6 +31,12 @@ #include #include +#ifdef CONFIG_NUMA +#define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu)) +#else +#define cpu_to_node_mask(cpu) (cpu_online_map) +#endif + /* * Convert user-nice values [ -20 ... 0 ... 19 ] * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], @@ -64,9 +70,9 @@ #define PRIO_BONUS_RATIO 25 #define INTERACTIVE_DELTA 2 #define MAX_SLEEP_AVG (10*HZ) -#define STARVATION_LIMIT (30*HZ) -#define SYNC_WAKEUPS 1 -#define SMART_WAKE_CHILD 1 +#define STARVATION_LIMIT (10*HZ) +#define AGRESSIVE_IDLE_STEAL 0 +#define NODE_THRESHOLD 125 /* * If a task is 'interactive' then we reinsert it in the active @@ -140,6 +146,48 @@ struct prio_array { }; /* + * It's possible for two CPUs to share the same runqueue. + * This makes sense if they eg. share caches. + * + * We take the common 1:1 (SMP, UP) case and optimize it, + * the rest goes via remapping: rq_idx(cpu) gives the + * runqueue on which a particular cpu is on, cpu_idx(cpu) + * gives the rq-specific index of the cpu. + * + * (Note that the generic scheduler code does not impose any + * restrictions on the mappings - there can be 4 CPUs per + * runqueue or even assymetric mappings.) + */ +#if CONFIG_SHARE_RUNQUEUE +# define MAX_NR_SIBLINGS CONFIG_NR_SIBLINGS + long __rq_idx[NR_CPUS] __cacheline_aligned; + static long __cpu_idx[NR_CPUS] __cacheline_aligned; +# define rq_idx(cpu) (__rq_idx[(cpu)]) +# define cpu_idx(cpu) (__cpu_idx[(cpu)]) +# define for_each_sibling(idx, rq) \ + for ((idx) = 0; (idx) < (rq)->nr_cpus; (idx)++) +# define rq_nr_cpus(rq) ((rq)->nr_cpus) +# define cpu_active_balance(c) (cpu_rq(c)->cpu[0].active_balance) +#else +# define MAX_NR_SIBLINGS 1 +# define rq_idx(cpu) (cpu) +# define cpu_idx(cpu) 0 +# define for_each_sibling(idx, rq) while (0) +# define cpu_active_balance(c) 0 +# define do_active_balance(rq, cpu) do { } while (0) +# define rq_nr_cpus(rq) 1 + static inline void active_load_balance(runqueue_t *rq, int this_cpu) { } +#endif + +typedef struct cpu_s { + task_t *curr, *idle; + task_t *migration_thread; + struct list_head migration_queue; + int active_balance; + int cpu; +} cpu_t; + +/* * This is the main, per-CPU runqueue data structure. * * Locking rule: those places that want to lock multiple runqueues @@ -150,34 +198,90 @@ struct runqueue { spinlock_t lock; unsigned long nr_running, nr_switches, expired_timestamp, nr_uninterruptible; - task_t *curr, *idle; struct mm_struct *prev_mm; prio_array_t *active, *expired, arrays[2]; - int prev_nr_running[NR_CPUS]; - - task_t *migration_thread; - struct list_head migration_queue; + int prev_cpu_load[NR_CPUS]; +#ifdef CONFIG_NUMA + atomic_t *node_nr_running; + int prev_node_load[MAX_NUMNODES]; +#endif + int nr_cpus; + cpu_t cpu[MAX_NR_SIBLINGS]; atomic_t nr_iowait; } ____cacheline_aligned; static struct runqueue runqueues[NR_CPUS] __cacheline_aligned; -#define cpu_rq(cpu) (runqueues + (cpu)) +#define cpu_rq(cpu) (runqueues + (rq_idx(cpu))) +#define cpu_int(c) ((cpu_rq(c))->cpu + cpu_idx(c)) +#define cpu_curr_ptr(cpu) (cpu_int(cpu)->curr) +#define cpu_idle_ptr(cpu) (cpu_int(cpu)->idle) + #define this_rq() cpu_rq(smp_processor_id()) #define task_rq(p) cpu_rq(task_cpu(p)) -#define cpu_curr(cpu) (cpu_rq(cpu)->curr) #define rt_task(p) ((p)->prio < MAX_RT_PRIO) +#define migration_thread(cpu) (cpu_int(cpu)->migration_thread) +#define migration_queue(cpu) (&cpu_int(cpu)->migration_queue) + +#if NR_CPUS > 1 +# define task_allowed(p, cpu) ((p)->cpus_allowed & (1UL << (cpu))) +#else +# define task_allowed(p, cpu) 1 +#endif + /* * Default context-switch locking: */ #ifndef prepare_arch_switch # define prepare_arch_switch(rq, next) do { } while(0) # define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock) -# define task_running(rq, p) ((rq)->curr == (p)) +# define task_running(p) (cpu_curr_ptr(task_cpu(p)) == (p)) #endif +#ifdef CONFIG_NUMA + +/* + * Keep track of running tasks. + */ + +static atomic_t node_nr_running[MAX_NUMNODES] ____cacheline_maxaligned_in_smp = + {[0 ...MAX_NUMNODES-1] = ATOMIC_INIT(0)}; + +static inline void nr_running_init(struct runqueue *rq) +{ + rq->node_nr_running = &node_nr_running[0]; +} + +static inline void nr_running_inc(runqueue_t *rq) +{ + atomic_inc(rq->node_nr_running); + rq->nr_running++; +} + +static inline void nr_running_dec(runqueue_t *rq) +{ + atomic_dec(rq->node_nr_running); + rq->nr_running--; +} + +__init void node_nr_running_init(void) +{ + int i; + + for (i = 0; i < NR_CPUS; i++) + cpu_rq(i)->node_nr_running = &node_nr_running[cpu_to_node(i)]; +} + +#else /* !CONFIG_NUMA */ + +# define nr_running_init(rq) do { } while (0) +# define nr_running_inc(rq) do { (rq)->nr_running++; } while (0) +# define nr_running_dec(rq) do { (rq)->nr_running--; } while (0) + +#endif /* CONFIG_NUMA */ + /* * task_rq_lock - lock the runqueue a given task resides on and disable * interrupts. Note the ordering: we can safely lookup the task_rq without @@ -255,10 +359,13 @@ static inline void enqueue_task(struct t * * Both properties are important to certain workloads. */ -static inline int effective_prio(task_t *p) +static int effective_prio(task_t *p) { int bonus, prio; + if (rt_task(p)) + return p->prio; + bonus = MAX_USER_PRIO*PRIO_BONUS_RATIO*p->sleep_avg/MAX_SLEEP_AVG/100 - MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2; @@ -271,35 +378,54 @@ static inline int effective_prio(task_t } /* - * activate_task - move a task to the runqueue. - - * Also update all the scheduling statistics stuff. (sleep average - * calculation, priority modifiers, etc.) + * __activate_task - move a task to the runqueue. */ static inline void __activate_task(task_t *p, runqueue_t *rq) { enqueue_task(p, rq->active); - rq->nr_running++; + nr_running_inc(rq); } -static inline void activate_task(task_t *p, runqueue_t *rq) +/* + * activate_task - move a task to the runqueue and do priority recalculation + * + * Update all the scheduling statistics stuff. (sleep average + * calculation, priority modifiers, etc.) + */ +static inline int activate_task(task_t *p, runqueue_t *rq) { - unsigned long sleep_time = jiffies - p->last_run; + long sleep_time = jiffies - p->last_run - 1; + int requeue_waker = 0; + + if (sleep_time > 0) { + int sleep_avg; - if (!rt_task(p) && sleep_time) { /* - * This code gives a bonus to interactive tasks. We update - * an 'average sleep time' value here, based on - * ->last_run. The more time a task spends sleeping, - * the higher the average gets - and the higher the priority - * boost gets as well. + * This code gives a bonus to interactive tasks. + * + * The boost works by updating the 'average sleep time' + * value here, based on ->last_run. The more time a task + * spends sleeping, the higher the average gets - and the + * higher the priority boost gets as well. */ - p->sleep_avg += sleep_time; - if (p->sleep_avg > MAX_SLEEP_AVG) - p->sleep_avg = MAX_SLEEP_AVG; - p->prio = effective_prio(p); + sleep_avg = p->sleep_avg + sleep_time; + + /* + * 'Overflow' bonus ticks go to the waker as well, so the + * ticks are not lost. This has the effect of further + * boosting tasks that are related to maximum-interactive + * tasks. + */ + if (sleep_avg > MAX_SLEEP_AVG) + sleep_avg = MAX_SLEEP_AVG; + if (p->sleep_avg != sleep_avg) { + p->sleep_avg = sleep_avg; + p->prio = effective_prio(p); + } } __activate_task(p, rq); + + return requeue_waker; } /* @@ -307,7 +433,7 @@ static inline void activate_task(task_t */ static inline void deactivate_task(struct task_struct *p, runqueue_t *rq) { - rq->nr_running--; + nr_running_dec(rq); if (p->state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible++; dequeue_task(p, p->array); @@ -335,13 +461,27 @@ static inline void resched_task(task_t * set_tsk_need_resched(p); #endif } + +static inline void resched_cpu(int cpu) +{ + resched_task(cpu_curr_ptr(cpu)); +} + #ifdef CONFIG_SMP +static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) +{ + p->cpu = cpu; +} + /* * wait_task_inactive - wait for a thread to unschedule. * * The caller must ensure that the task *will* unschedule sometime soon, - * else this function might spin for a *long* time. + * else this function might spin for a *long* time. This function can't + * be called with interrupts off, or it may introduce deadlock with + * smp_call_function() if an IPI is sent by the same process we are + * waiting to become inactive. */ void wait_task_inactive(task_t * p) { @@ -350,25 +490,61 @@ void wait_task_inactive(task_t * p) repeat: rq = task_rq(p); - if (unlikely(task_running(rq, p))) { + if (unlikely(task_running(p))) { cpu_relax(); - /* - * enable/disable preemption just to make this - * a preemption point - we are busy-waiting - * anyway. - */ goto repeat; } rq = task_rq_lock(p, &flags); - if (unlikely(task_running(rq, p))) { + if (unlikely(task_running(p))) { task_rq_unlock(rq, &flags); goto repeat; } task_rq_unlock(rq, &flags); } +#else + +static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) +{ +} + #endif +static void wake_up_cpu(runqueue_t *rq, int cpu, task_t *p) +{ + cpu_t *curr_cpu; + task_t *curr; + int idx; + + if (idle_cpu(cpu)) + return resched_cpu(cpu); + + for_each_sibling(idx, rq) { + curr_cpu = rq->cpu + idx; + if (!task_allowed(p, curr_cpu->cpu)) + continue; + if (curr_cpu->idle == curr_cpu->curr) + return resched_cpu(curr_cpu->cpu); + } + + if (p->prio < cpu_curr_ptr(cpu)->prio) + return resched_task(cpu_curr_ptr(cpu)); + if (p->prio == cpu_curr_ptr(cpu)->prio && + p->time_slice > cpu_curr_ptr(cpu)->time_slice) + return resched_task(cpu_curr_ptr(cpu)); + + for_each_sibling(idx, rq) { + curr_cpu = rq->cpu + idx; + if (!task_allowed(p, curr_cpu->cpu)) + continue; + curr = curr_cpu->curr; + if (p->prio < curr->prio) + return resched_task(curr); + if (p->prio == curr->prio && p->time_slice > curr->time_slice) + return resched_task(curr); + } +} + /*** * try_to_wake_up - wake up a thread * @p: the to-be-woken-up thread @@ -386,12 +562,12 @@ repeat: */ static int try_to_wake_up(task_t * p, unsigned int state, int sync, int kick) { + int success = 0, requeue_waker = 0; unsigned long flags; - int success = 0; long old_state; runqueue_t *rq; - sync &= SYNC_WAKEUPS; + BUG_ON(!p); repeat_lock_task: rq = task_rq_lock(p, &flags); old_state = p->state; @@ -401,7 +577,7 @@ repeat_lock_task: * Fast-migrate the task if it's not running or runnable * currently. Do not violate hard affinity. */ - if (unlikely(sync && !task_running(rq, p) && + if (unlikely(sync && !task_running(p) && (task_cpu(p) != smp_processor_id()) && (p->cpus_allowed & (1UL << smp_processor_id())))) { @@ -414,24 +590,36 @@ repeat_lock_task: if (sync) __activate_task(p, rq); else { - activate_task(p, rq); - if (p->prio < rq->curr->prio) - resched_task(rq->curr); + requeue_waker = activate_task(p, rq); + wake_up_cpu(rq, task_cpu(p), p); } success = 1; } #if CONFIG_SMP else - if (unlikely(kick) && task_running(rq, p) && + if (unlikely(kick) && task_running(p) && (p->cpu != smp_processor_id())) smp_send_reschedule(p->cpu); #endif - if (p->state >= TASK_ZOMBIE) - BUG(); p->state = TASK_RUNNING; } task_rq_unlock(rq, &flags); + /* + * We have to do this outside the other spinlock, the two + * runqueues might be different: + */ + if (requeue_waker) { + prio_array_t *array; + + rq = task_rq_lock(current, &flags); + array = current->array; + dequeue_task(current, array); + current->prio = effective_prio(current); + enqueue_task(current, array); + task_rq_unlock(rq, &flags); + } + return success; } @@ -462,30 +650,25 @@ void wake_up_forked_process(task_t * p) runqueue_t *rq = task_rq_lock(current, &flags); p->state = TASK_RUNNING; - if (!rt_task(p)) { - /* - * We decrease the sleep average of forking parents - * and children as well, to keep max-interactive tasks - * from forking tasks that are max-interactive. - */ - current->sleep_avg = current->sleep_avg * PARENT_PENALTY / 100; - p->sleep_avg = p->sleep_avg * CHILD_PENALTY / 100; - p->prio = effective_prio(p); - } + /* + * We decrease the sleep average of forking parents + * and children as well, to keep max-interactive tasks + * from forking tasks that are max-interactive. + */ + current->sleep_avg = current->sleep_avg * PARENT_PENALTY / 100; + p->sleep_avg = p->sleep_avg * CHILD_PENALTY / 100; + p->prio = effective_prio(p); set_task_cpu(p, smp_processor_id()); - if (SMART_WAKE_CHILD) { - if (unlikely(!current->array)) - __activate_task(p, rq); - else { - p->prio = current->prio; - list_add_tail(&p->run_list, ¤t->run_list); - p->array = current->array; - p->array->nr_active++; - rq->nr_running++; - } - } else - activate_task(p, rq); + if (unlikely(!current->array)) + __activate_task(p, rq); + else { + p->prio = current->prio; + list_add_tail(&p->run_list, ¤t->run_list); + p->array = current->array; + p->array->nr_active++; + nr_running_inc(rq); + } task_rq_unlock(rq, &flags); } @@ -519,12 +702,39 @@ void sched_exit(task_t * p) } /** + * finish_task_switch - clean up after a task-switch + * @prev: the thread we just switched away from. + * + * We enter this with the runqueue still locked, and finish_arch_switch() + * will unlock it along with doing any other architecture-specific cleanup + * actions. + * + * Note that we may have delayed dropping an mm in context_switch(). If + * so, we finish that here outside of the runqueue lock. (Doing it + * with the lock held can cause deadlocks; see schedule() for + * details.) + */ +static inline void finish_task_switch(task_t *prev) +{ + runqueue_t *rq = this_rq(); + struct mm_struct *mm = rq->prev_mm; + + rq->prev_mm = NULL; + finish_arch_switch(rq, prev); + if (mm) + mmdrop(mm); + if (prev->state & (TASK_DEAD | TASK_ZOMBIE)) + put_task_struct(prev); +} + +/** * schedule_tail - first thing a freshly forked thread must call. * @prev: the thread we just switched away from. */ asmlinkage void schedule_tail(task_t *prev) { - finish_arch_switch(this_rq(), prev); + finish_task_switch(prev); + if (current->set_child_tid) put_user(current->pid, current->set_child_tid); } @@ -547,6 +757,7 @@ static inline task_t * context_switch(ru if (unlikely(!prev->mm)) { prev->active_mm = NULL; + BUG_ON(rq->prev_mm); rq->prev_mm = oldmm; } @@ -568,8 +779,9 @@ unsigned long nr_running(void) unsigned long i, sum = 0; for (i = 0; i < NR_CPUS; i++) - sum += cpu_rq(i)->nr_running; - + /* Shared runqueues are counted only once. */ + if (!cpu_idx(i)) + sum += cpu_rq(i)->nr_running; return sum; } @@ -580,7 +792,9 @@ unsigned long nr_uninterruptible(void) for (i = 0; i < NR_CPUS; i++) { if (!cpu_online(i)) continue; - sum += cpu_rq(i)->nr_uninterruptible; + /* Shared runqueues are counted only once. */ + if (!cpu_idx(i)) + sum += cpu_rq(i)->nr_uninterruptible; } return sum; } @@ -643,7 +857,111 @@ static inline void double_rq_unlock(runq spin_unlock(&rq2->lock); } -#if CONFIG_SMP +#ifdef CONFIG_NUMA +/* + * If dest_cpu is allowed for this process, migrate the task to it. + * This is accomplished by forcing the cpu_allowed mask to only + * allow dest_cpu, which will force the cpu onto dest_cpu. Then + * the cpu_allowed mask is restored. + */ +static void sched_migrate_task(task_t *p, int dest_cpu) +{ + unsigned long old_mask; + + old_mask = p->cpus_allowed; + if (!(old_mask & (1UL << dest_cpu))) + return; + /* force the process onto the specified CPU */ + set_cpus_allowed(p, 1UL << dest_cpu); + + /* restore the cpus allowed mask */ + set_cpus_allowed(p, old_mask); +} + +/* + * Find the least loaded CPU. Slightly favor the current CPU by + * setting its runqueue length as the minimum to start. + */ +static int sched_best_cpu(struct task_struct *p) +{ + int i, minload, load, best_cpu, node = 0; + unsigned long cpumask; + + best_cpu = task_cpu(p); + if (cpu_rq(best_cpu)->nr_running <= 2) + return best_cpu; + + minload = 10000000; + for (i = 0; i < numnodes; i++) { + load = atomic_read(&node_nr_running[i]); + if (load < minload) { + minload = load; + node = i; + } + } + + minload = 10000000; + cpumask = node_to_cpumask(node); + for (i = 0; i < NR_CPUS; ++i) { + if (!(cpumask & (1UL << i))) + continue; + if (cpu_rq(i)->nr_running < minload) { + best_cpu = i; + minload = cpu_rq(i)->nr_running; + } + } + return best_cpu; +} + +void sched_balance_exec(void) +{ + int new_cpu; + + if (numnodes > 1) { + new_cpu = sched_best_cpu(current); + if (new_cpu != smp_processor_id()) + sched_migrate_task(current, new_cpu); + } +} + +/* + * Find the busiest node. All previous node loads contribute with a + * geometrically deccaying weight to the load measure: + * load_{t} = load_{t-1}/2 + nr_node_running_{t} + * This way sudden load peaks are flattened out a bit. + */ +static int find_busiest_node(int this_node) +{ + int i, node = -1, load, this_load, maxload; + + this_load = maxload = (this_rq()->prev_node_load[this_node] >> 1) + + atomic_read(&node_nr_running[this_node]); + this_rq()->prev_node_load[this_node] = this_load; + for (i = 0; i < numnodes; i++) { + if (i == this_node) + continue; + load = (this_rq()->prev_node_load[i] >> 1) + + atomic_read(&node_nr_running[i]); + this_rq()->prev_node_load[i] = load; + if (load > maxload && (100*load > NODE_THRESHOLD*this_load)) { + maxload = load; + node = i; + } + } + return node; +} + +#endif /* CONFIG_NUMA */ + +#if !CONFIG_SMP + +/* + * on UP we do not need to balance between CPUs: + */ +static inline void load_balance(runqueue_t *this_rq, int this_cpu, int idle, unsigned long cpumask) { } +static inline void rebalance_tick(runqueue_t *this_rq, int this_cpu, int idle) { } + +#else /* * double_lock_balance - lock the busiest runqueue @@ -660,10 +978,10 @@ static inline unsigned int double_lock_b spin_lock(&busiest->lock); spin_lock(&this_rq->lock); /* Need to recalculate nr_running */ - if (idle || (this_rq->nr_running > this_rq->prev_nr_running[this_cpu])) + if (idle || (this_rq->nr_running > this_rq->prev_cpu_load[this_cpu])) nr_running = this_rq->nr_running; else - nr_running = this_rq->prev_nr_running[this_cpu]; + nr_running = this_rq->prev_cpu_load[this_cpu]; } else spin_lock(&busiest->lock); } @@ -671,9 +989,9 @@ static inline unsigned int double_lock_b } /* - * find_busiest_queue - find the busiest runqueue. + * find_busiest_queue - find the busiest runqueue among the cpus in cpumask. */ -static inline runqueue_t *find_busiest_queue(runqueue_t *this_rq, int this_cpu, int idle, int *imbalance) +static inline runqueue_t *find_busiest_queue(runqueue_t *this_rq, int this_cpu, int idle, int *imbalance, unsigned long cpumask) { int nr_running, load, max_load, i; runqueue_t *busiest, *rq_src; @@ -700,23 +1018,23 @@ static inline runqueue_t *find_busiest_q * that case we are less picky about moving a task across CPUs and * take what can be taken. */ - if (idle || (this_rq->nr_running > this_rq->prev_nr_running[this_cpu])) + if (idle || (this_rq->nr_running > this_rq->prev_cpu_load[this_cpu])) nr_running = this_rq->nr_running; else - nr_running = this_rq->prev_nr_running[this_cpu]; + nr_running = this_rq->prev_cpu_load[this_cpu]; busiest = NULL; max_load = 1; for (i = 0; i < NR_CPUS; i++) { - if (!cpu_online(i)) + if (!((1UL << i) & cpumask)) continue; rq_src = cpu_rq(i); - if (idle || (rq_src->nr_running < this_rq->prev_nr_running[i])) + if (idle || (rq_src->nr_running < this_rq->prev_cpu_load[i])) load = rq_src->nr_running; else - load = this_rq->prev_nr_running[i]; - this_rq->prev_nr_running[i] = rq_src->nr_running; + load = this_rq->prev_cpu_load[i]; + this_rq->prev_cpu_load[i] = rq_src->nr_running; if ((load > max_load) && (rq_src != this_rq)) { busiest = rq_src; @@ -755,21 +1073,11 @@ out: static inline void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, runqueue_t *this_rq, int this_cpu) { dequeue_task(p, src_array); - src_rq->nr_running--; + nr_running_dec(src_rq); set_task_cpu(p, this_cpu); - this_rq->nr_running++; + nr_running_inc(this_rq); enqueue_task(p, this_rq->active); - /* - * Note that idle threads have a prio of MAX_PRIO, for this test - * to be always true for them. - */ - if (p->prio < this_rq->curr->prio) - set_need_resched(); - else { - if (p->prio == this_rq->curr->prio && - p->time_slice > this_rq->curr->time_slice) - set_need_resched(); - } + wake_up_cpu(this_rq, this_cpu, p); } /* @@ -780,15 +1088,15 @@ static inline void pull_task(runqueue_t * We call this with the current runqueue locked, * irqs disabled. */ -static void load_balance(runqueue_t *this_rq, int idle) +static void load_balance(runqueue_t *this_rq, int this_cpu, int idle, unsigned long cpumask) { - int imbalance, idx, this_cpu = smp_processor_id(); + struct list_head *head, *curr; runqueue_t *busiest; prio_array_t *array; - struct list_head *head, *curr; + int imbalance, idx; task_t *tmp; - busiest = find_busiest_queue(this_rq, this_cpu, idle, &imbalance); + busiest = find_busiest_queue(this_rq, this_cpu, idle, &imbalance, cpumask); if (!busiest) goto out; @@ -819,6 +1127,7 @@ skip_bitmap: goto out_unlock; } + head = array->queue + idx; curr = head->prev; skip_queue: @@ -829,12 +1138,15 @@ skip_queue: * 1) running (obviously), or * 2) cannot be migrated to this CPU due to cpus_allowed, or * 3) are cache-hot on their current CPU. + * + * (except if we are in idle mode which is a more agressive + * form of rebalancing.) */ -#define CAN_MIGRATE_TASK(p,rq,this_cpu) \ - ((jiffies - (p)->last_run > cache_decay_ticks) && \ - !task_running(rq, p) && \ - ((p)->cpus_allowed & (1UL << (this_cpu)))) +#define CAN_MIGRATE_TASK(p,rq,cpu) \ + (((idle && AGRESSIVE_IDLE_STEAL) || \ + (jiffies - (p)->last_run > cache_decay_ticks)) && \ + !task_running(p) && task_allowed(p, cpu)) curr = curr->prev; @@ -857,29 +1169,206 @@ out: ; } +#if CONFIG_SHARE_RUNQUEUE +static void active_load_balance(runqueue_t *this_rq, int this_cpu) +{ + runqueue_t *rq; + int i, idx; + + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_online(i)) + continue; + rq = cpu_rq(i); + if (rq == this_rq) + continue; + /* + * If there's only one CPU mapped to this runqueue + * then there can be no SMT imbalance: + */ + if (rq->nr_cpus == 1) + continue; + /* + * Any SMT-specific imbalance? + */ + for_each_sibling(idx, rq) + if (rq->cpu[idx].idle == rq->cpu[idx].curr) + goto next_cpu; + + /* + * At this point it's sure that we have a SMT + * imbalance: this (physical) CPU is idle but + * another CPU has two (or more) tasks running. + * + * We wake up one of the migration threads (it + * doesnt matter which one) and let it fix things up: + */ + if (!cpu_active_balance(i)) { + cpu_active_balance(i) = 1; + spin_unlock(&this_rq->lock); + if (rq->cpu[0].migration_thread) + wake_up_process(rq->cpu[0].migration_thread); + spin_lock(&this_rq->lock); + } +next_cpu: + ; + } +} + +static void do_active_balance(runqueue_t *this_rq, int this_cpu) +{ + runqueue_t *rq; + int i, idx; + + spin_unlock(&this_rq->lock); + + cpu_active_balance(this_cpu) = 0; + + /* + * Is the imbalance still present? + */ + for_each_sibling(idx, this_rq) + if (this_rq->cpu[idx].idle == this_rq->cpu[idx].curr) + goto out; + + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_online(i)) + continue; + rq = cpu_rq(i); + if (rq == this_rq) + continue; + + /* completely idle CPU? */ + if (rq->nr_running) + continue; + + /* + * At this point it's reasonably sure that we have an + * imbalance. Since we are the migration thread, try to + * balance a thread over to the target queue. + */ + spin_lock(&this_rq->lock); + this_rq->nr_running--; + spin_unlock(&this_rq->lock); + + spin_lock(&rq->lock); + load_balance(rq, i, 1, cpu_to_node_mask(i)); + spin_unlock(&rq->lock); + + spin_lock(&this_rq->lock); + this_rq->nr_running++; + spin_unlock(&this_rq->lock); + goto out; + } +out: + spin_lock(&this_rq->lock); +} + +/* + * This routine is called to map a CPU into another CPU's runqueue. + * + * This must be called during bootup with the merged runqueue having + * no tasks. + */ +void sched_map_runqueue(int cpu1, int cpu2) +{ + runqueue_t *rq1 = cpu_rq(cpu1); + runqueue_t *rq2 = cpu_rq(cpu2); + int cpu2_idx_orig = cpu_idx(cpu2), cpu2_idx; + + printk("mapping CPU#%d's runqueue to CPU#%d's runqueue.\n", cpu1, cpu2); + BUG_ON(rq1 == rq2 || rq2->nr_running || rq_idx(cpu1) != cpu1); + /* + * At this point, we dont have anything in the runqueue yet. So, + * there is no need to move processes between the runqueues. + * Only, the idle processes should be combined and accessed + * properly. + */ + cpu2_idx = rq1->nr_cpus++; + + rq_idx(cpu2) = cpu1; + cpu_idx(cpu2) = cpu2_idx; + rq1->cpu[cpu2_idx].cpu = cpu2; + rq1->cpu[cpu2_idx].idle = rq2->cpu[cpu2_idx_orig].idle; + rq1->cpu[cpu2_idx].curr = rq2->cpu[cpu2_idx_orig].curr; + INIT_LIST_HEAD(&rq1->cpu[cpu2_idx].migration_queue); + + /* just to be safe: */ + rq2->cpu[cpu2_idx_orig].idle = NULL; + rq2->cpu[cpu2_idx_orig].curr = NULL; +} +#endif + /* * One of the idle_cpu_tick() and busy_cpu_tick() functions will * get called every timer tick, on every CPU. Our balancing action * frequency and balancing agressivity depends on whether the CPU is * idle or not. * - * busy-rebalance every 250 msecs. idle-rebalance every 1 msec. (or on + * busy-rebalance every 200 msecs. idle-rebalance every 1 msec. (or on * systems with HZ=100, every 10 msecs.) + * + * On NUMA, do a node-rebalance every 400 msecs. */ -#define BUSY_REBALANCE_TICK (HZ/4 ?: 1) #define IDLE_REBALANCE_TICK (HZ/1000 ?: 1) +#define BUSY_REBALANCE_TICK (HZ/5 ?: 1) +#define IDLE_NODE_REBALANCE_TICK (IDLE_REBALANCE_TICK * 5) +#define BUSY_NODE_REBALANCE_TICK (BUSY_REBALANCE_TICK * 2) + +#ifdef CONFIG_NUMA +static void balance_node(runqueue_t *this_rq, int idle, int this_cpu) +{ + int node = find_busiest_node(cpu_to_node(this_cpu)); + unsigned long cpumask, this_cpumask = 1UL << this_cpu; + + if (node >= 0) { + cpumask = node_to_cpumask(node) | this_cpumask; + spin_lock(&this_rq->lock); + load_balance(this_rq, this_cpu, idle, cpumask); + spin_unlock(&this_rq->lock); + } +} +#endif -static inline void idle_tick(runqueue_t *rq) +static void rebalance_tick(runqueue_t *this_rq, int this_cpu, int idle) { - if (jiffies % IDLE_REBALANCE_TICK) +#ifdef CONFIG_NUMA + int this_cpu = smp_processor_id(); +#endif + unsigned long j = jiffies; + + /* + * First do inter-node rebalancing, then intra-node rebalancing, + * if both events happen in the same tick. The inter-node + * rebalancing does not necessarily have to create a perfect + * balance within the node, since we load-balance the most loaded + * node with the current CPU. (ie. other CPUs in the local node + * are not balanced.) + */ + if (idle) { +#ifdef CONFIG_NUMA + if (!(j % IDLE_NODE_REBALANCE_TICK)) + balance_node(this_rq, idle, this_cpu); +#endif + if (!(j % IDLE_REBALANCE_TICK)) { + spin_lock(&this_rq->lock); + load_balance(this_rq, this_cpu, idle, cpu_to_node_mask(this_cpu)); + spin_unlock(&this_rq->lock); + } return; - spin_lock(&rq->lock); - load_balance(rq, 1); - spin_unlock(&rq->lock); + } +#ifdef CONFIG_NUMA + if (!(j % BUSY_NODE_REBALANCE_TICK)) + balance_node(this_rq, idle, this_cpu); +#endif + if (!(j % BUSY_REBALANCE_TICK)) { + spin_lock(&this_rq->lock); + load_balance(this_rq, this_cpu, idle, cpu_to_node_mask(this_cpu)); + spin_unlock(&this_rq->lock); + } } - #endif + /* * We place interactive tasks back into the active array, if possible. * @@ -905,14 +1394,13 @@ void scheduler_tick(int user_ticks, int { int cpu = smp_processor_id(); runqueue_t *rq = this_rq(); + unsigned long j = jiffies; task_t *p = current; - if (p == rq->idle) { + if (p == cpu_idle_ptr(cpu)) { if (local_bh_count(cpu) || local_irq_count(cpu) > 1) kstat.per_cpu_system[cpu] += sys_ticks; -#if CONFIG_SMP - idle_tick(rq); -#endif + rebalance_tick(rq, cpu, 1); return; } if (TASK_NICE(p) > 0) @@ -922,11 +1410,22 @@ void scheduler_tick(int user_ticks, int kstat.per_cpu_system[cpu] += sys_ticks; /* Task might have expired already, but not scheduled off yet */ + spin_lock(&rq->lock); if (p->array != rq->active) { set_tsk_need_resched(p); + spin_unlock(&rq->lock); return; } - spin_lock(&rq->lock); + /* + * The task was running during this tick - update the + * time slice counter and the sleep average. Note: we + * do not update a thread's priority until it either + * goes to sleep or uses up its timeslice. This makes + * it possible for interactive tasks to use up their + * timeslices at their highest priority levels. + */ + if (p->sleep_avg) + p->sleep_avg--; if (unlikely(rt_task(p))) { /* * RR tasks need a special form of timeslice management. @@ -943,16 +1442,6 @@ void scheduler_tick(int user_ticks, int } goto out; } - /* - * The task was running during this tick - update the - * time slice counter and the sleep average. Note: we - * do not update a thread's priority until it either - * goes to sleep or uses up its timeslice. This makes - * it possible for interactive tasks to use up their - * timeslices at their highest priority levels. - */ - if (p->sleep_avg) - p->sleep_avg--; if (!--p->time_slice) { dequeue_task(p, rq->active); set_tsk_need_resched(p); @@ -962,17 +1451,14 @@ void scheduler_tick(int user_ticks, int if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { if (!rq->expired_timestamp) - rq->expired_timestamp = jiffies; + rq->expired_timestamp = j; enqueue_task(p, rq->expired); } else enqueue_task(p, rq->active); } out: -#if CONFIG_SMP - if (!(jiffies % BUSY_REBALANCE_TICK)) - load_balance(rq, 0); -#endif spin_unlock(&rq->lock); + rebalance_tick(rq, cpu, 0); } void scheduling_functions_start_here(void) { } @@ -982,23 +1468,20 @@ void scheduling_functions_start_here(voi */ asmlinkage void schedule(void) { + int idx, this_cpu, retry = 0; + struct list_head *queue; task_t *prev, *next; - runqueue_t *rq; prio_array_t *array; - struct list_head *queue; - int idx; + runqueue_t *rq; BUG_ON(in_interrupt()); need_resched: prev = current; + this_cpu = smp_processor_id(); rq = this_rq(); - release_kernel_lock(prev, smp_processor_id()); - /* - * Ok, we are leaving the CPU now, lets update the 'last run' - * timestamp: - */ + release_kernel_lock(prev, this_cpu); prev->last_run = jiffies; spin_lock_irq(&rq->lock); @@ -1013,16 +1496,17 @@ need_resched: case TASK_RUNNING: ; } -#if CONFIG_SMP + pick_next_task: -#endif if (unlikely(!rq->nr_running)) { -#if CONFIG_SMP - load_balance(rq, 1); + load_balance(rq, this_cpu, 1, cpu_to_node_mask(this_cpu)); if (rq->nr_running) goto pick_next_task; -#endif - next = rq->idle; + active_load_balance(rq, this_cpu); + if (rq->nr_running) + goto pick_next_task; +pick_idle: + next = cpu_idle_ptr(this_cpu); rq->expired_timestamp = 0; goto switch_tasks; } @@ -1038,28 +1522,54 @@ pick_next_task: rq->expired_timestamp = 0; } +new_array: idx = sched_find_first_bit(array->bitmap); - queue = array->queue + idx; - next = list_entry(queue->next, task_t, run_list); + queue = array->queue + idx; + next = list_entry(queue->next, task_t, run_list); + if ((next != prev) && (rq_nr_cpus(rq) > 1)) { + struct list_head *tmp = queue->next; + + while ((task_running(next) && (next != prev)) || !task_allowed(next, this_cpu)) { + tmp = tmp->next; + if (tmp != queue) { + next = list_entry(tmp, task_t, run_list); + continue; + } + idx = find_next_bit(array->bitmap, MAX_PRIO, ++idx); + if (idx == MAX_PRIO) { + if (retry || !rq->expired->nr_active) { + goto pick_idle; + } + /* + * To avoid infinite changing of arrays, + * when we have only tasks runnable by + * sibling. + */ + retry = 1; + + array = rq->expired; + goto new_array; + } + queue = array->queue + idx; + tmp = queue->next; + next = list_entry(tmp, task_t, run_list); + } + } switch_tasks: prefetch(next); clear_tsk_need_resched(prev); if (likely(prev != next)) { - struct mm_struct *prev_mm; rq->nr_switches++; - rq->curr = next; - + cpu_curr_ptr(this_cpu) = next; + set_task_cpu(next, this_cpu); + prepare_arch_switch(rq, next); prev = context_switch(rq, prev, next); barrier(); - rq = this_rq(); - prev_mm = rq->prev_mm; - rq->prev_mm = NULL; - finish_arch_switch(rq, prev); - if (prev_mm) - mmdrop(prev_mm); + + finish_task_switch(prev); } else spin_unlock_irq(&rq->lock); @@ -1068,6 +1578,12 @@ switch_tasks: goto need_resched; } +int default_wake_function(wait_queue_t *curr, unsigned mode, int sync) +{ + task_t *p = curr->task; + return try_to_wake_up(p, mode, sync, 0); +} + /* * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve @@ -1281,9 +1797,8 @@ void set_user_nice(task_t *p, long nice) * If the task is running and lowered its priority, * or increased its priority then reschedule its CPU: */ - if ((NICE_TO_PRIO(nice) < p->static_prio) || - task_running(rq, p)) - resched_task(rq->curr); + if ((NICE_TO_PRIO(nice) < p->static_prio) || task_running(p)) + resched_task(cpu_curr_ptr(task_cpu(p))); } out_unlock: task_rq_unlock(rq, &flags); @@ -1321,6 +1836,7 @@ asmlinkage long sys_nice(int increment) nice = -20; if (nice > 19) nice = 19; + set_user_nice(current, nice); return 0; } @@ -1355,7 +1871,7 @@ int task_nice(task_t *p) */ int task_curr(task_t *p) { - return cpu_curr(task_cpu(p)) == p; + return cpu_curr_ptr(task_cpu(p)) == p; } /** @@ -1364,7 +1880,7 @@ int task_curr(task_t *p) */ int idle_cpu(int cpu) { - return cpu_curr(cpu) == cpu_rq(cpu)->idle; + return cpu_curr_ptr(cpu) == cpu_idle_ptr(cpu); } /** @@ -1547,7 +2063,7 @@ out_unlock: * @len: length in bytes of the bitmask pointed to by user_mask_ptr * @user_mask_ptr: user-space pointer to the new cpu mask */ -asmlinkage int sys_sched_setaffinity(pid_t pid, unsigned int len, +asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, unsigned long *user_mask_ptr) { unsigned long new_mask; @@ -1599,7 +2115,7 @@ out_unlock: * @len: length in bytes of the bitmask pointed to by user_mask_ptr * @user_mask_ptr: user-space pointer to hold the current cpu mask */ -asmlinkage int sys_sched_getaffinity(pid_t pid, unsigned int len, +asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, unsigned long *user_mask_ptr) { unsigned int real_len; @@ -1656,7 +2172,11 @@ asmlinkage long sys_sched_yield(void) list_del(¤t->run_list); list_add_tail(¤t->run_list, array->queue + current->prio); } - spin_unlock(&rq->lock); + /* + * Since we are going to call schedule() anyway, there's + * no need to preempt: + */ + spin_unlock_irq(&rq->lock); schedule(); @@ -1732,7 +2252,7 @@ asmlinkage long sys_sched_get_priority_m } /** - * sys_sched_get_priority_min - return minimum RT priority. + * sys_sched_get_priority_mix - return minimum RT priority. * @policy: scheduling class. * * this syscall returns the minimum rt_priority that can be used @@ -1773,14 +2293,20 @@ asmlinkage long sys_sched_rr_get_interva retval = -ESRCH; read_lock(&tasklist_lock); p = find_process_by_pid(pid); - if (p) - jiffies_to_timespec(p->policy & SCHED_FIFO ? - 0 : task_timeslice(p), &t); + if (!p) + goto out_unlock; + + retval = 0; + + jiffies_to_timespec(p->policy & SCHED_FIFO ? + 0 : task_timeslice(p), &t); read_unlock(&tasklist_lock); - if (p) - retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; + retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; out_nounlock: return retval; +out_unlock: + read_unlock(&tasklist_lock); + return retval; } static inline struct task_struct *eldest_child(struct task_struct *p) @@ -1904,7 +2430,7 @@ void __init init_idle(task_t *idle, int local_irq_save(flags); double_rq_lock(idle_rq, rq); - idle_rq->curr = idle_rq->idle = idle; + cpu_curr_ptr(cpu) = cpu_idle_ptr(cpu) = idle; deactivate_task(idle, rq); idle->array = NULL; idle->prio = MAX_PRIO; @@ -1952,6 +2478,7 @@ void set_cpus_allowed(task_t *p, unsigne unsigned long flags; migration_req_t req; runqueue_t *rq; + int cpu; #if 0 /* FIXME: Grab cpu_lock, return error on this case. --RR */ new_mask &= cpu_online_map; @@ -1973,31 +2500,32 @@ void set_cpus_allowed(task_t *p, unsigne * If the task is not on a runqueue (and not running), then * it is sufficient to simply update the task's cpu field. */ - if (!p->array && !task_running(rq, p)) { + if (!p->array && !task_running(p)) { set_task_cpu(p, __ffs(p->cpus_allowed)); task_rq_unlock(rq, &flags); return; } init_completion(&req.done); req.task = p; - list_add(&req.list, &rq->migration_queue); + cpu = task_cpu(p); + list_add(&req.list, migration_queue(cpu)); task_rq_unlock(rq, &flags); - - wake_up_process(rq->migration_thread); + wake_up_process(migration_thread(cpu)); wait_for_completion(&req.done); } /* - * migration_thread - this is a highprio system thread that performs + * migration_task - this is a highprio system thread that performs * thread migration by 'pulling' threads into the target runqueue. */ -static int migration_thread(void * data) +static int migration_task(void * data) { + /* Marking "param" is ok, since we do a set_fs(KERNEL_DS); */ struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; int cpu = (long) data; runqueue_t *rq; - int ret; + int ret, idx; daemonize(); sigfillset(¤t->blocked); @@ -2012,9 +2540,8 @@ static int migration_thread(void * data) ret = setscheduler(0, SCHED_FIFO, ¶m); rq = this_rq(); - rq->migration_thread = current; - - sprintf(current->comm, "migration/%d", smp_processor_id()); + migration_thread(cpu) = current; + idx = cpu_idx(cpu); for (;;) { runqueue_t *rq_src, *rq_dest; @@ -2025,7 +2552,9 @@ static int migration_thread(void * data) task_t *p; spin_lock_irqsave(&rq->lock, flags); - head = &rq->migration_queue; + if (cpu_active_balance(cpu)) + do_active_balance(rq, cpu); + head = migration_queue(cpu); current->state = TASK_INTERRUPTIBLE; if (list_empty(head)) { spin_unlock_irqrestore(&rq->lock, flags); @@ -2037,7 +2566,7 @@ static int migration_thread(void * data) spin_unlock_irqrestore(&rq->lock, flags); p = req->task; - cpu_dest = __ffs(p->cpus_allowed); + cpu_dest = __ffs(p->cpus_allowed & cpu_online_map); rq_dest = cpu_rq(cpu_dest); repeat: cpu_src = task_cpu(p); @@ -2055,8 +2584,7 @@ repeat: if (p->array) { deactivate_task(p, rq_src); __activate_task(p, rq_dest); - if (p->prio < rq_dest->curr->prio) - resched_task(rq_dest->curr); + wake_up_cpu(rq_dest, cpu_dest, p); } } double_rq_unlock(rq_src, rq_dest); @@ -2072,9 +2600,11 @@ repeat: */ void migration_call(void *hcpu) { + long cpu = (long) hcpu; + printk("Starting migration thread for cpu %li\n", (long)hcpu); - kernel_thread(migration_thread, hcpu, CLONE_KERNEL); - while (!cpu_rq((long)hcpu)->migration_thread) + kernel_thread(migration_task, hcpu, CLONE_KERNEL); + while (!migration_thread(cpu)) yield(); } @@ -2108,12 +2638,22 @@ void __init sched_init(void) for (i = 0; i < NR_CPUS; i++) { prio_array_t *array; + /* + * Start with a 1:1 mapping between CPUs and runqueues: + */ +#if CONFIG_SHARE_RUNQUEUE + rq_idx(i) = i; + cpu_idx(i) = 0; +#endif rq = cpu_rq(i); rq->active = rq->arrays; rq->expired = rq->arrays + 1; spin_lock_init(&rq->lock); - INIT_LIST_HEAD(&rq->migration_queue); + INIT_LIST_HEAD(migration_queue(i)); + rq->nr_cpus = 1; + rq->cpu[cpu_idx(i)].cpu = i; atomic_set(&rq->nr_iowait, 0); + nr_running_init(rq); for (j = 0; j < 2; j++) { array = rq->arrays + j; @@ -2129,9 +2669,9 @@ void __init sched_init(void) * We have to do a little magic to get the first * thread right in SMP mode. */ - rq = this_rq(); - rq->curr = current; - rq->idle = current; + cpu_curr_ptr(smp_processor_id()) = current; + cpu_idle_ptr(smp_processor_id()) = current; + set_task_cpu(current, smp_processor_id()); wake_up_forked_process(current); --- linux/kernel/exit.c.orig +++ linux/kernel/exit.c @@ -49,11 +49,7 @@ void release_task(struct task_struct * p BUG_ON(p->state < TASK_ZOMBIE); - if (p != current) - wait_task_inactive(p); - atomic_dec(&p->user->processes); - free_uid(p->user); write_lock_irq(&tasklist_lock); if (unlikely(p->ptrace)) __ptrace_unlink(p); --- linux/include/linux/sched.h.orig +++ linux/include/linux/sched.h @@ -185,6 +185,24 @@ extern void set_cpus_allowed(struct task #endif /* + * Is there a way to do this via Kconfig? + */ +#if CONFIG_NR_SIBLINGS_2 +# define CONFIG_NR_SIBLINGS 2 +#elif CONFIG_NR_SIBLINGS_4 +# define CONFIG_NR_SIBLINGS 4 +#else +# define CONFIG_NR_SIBLINGS 0 +#endif + +#if CONFIG_NR_SIBLINGS +# define CONFIG_SHARE_RUNQUEUE 1 +#else +# define CONFIG_SHARE_RUNQUEUE 0 +#endif +extern void sched_map_runqueue(int cpu1, int cpu2); + +/* * Priority of a process goes from 0..MAX_PRIO-1, valid RT * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL tasks are * in the range MAX_RT_PRIO..MAX_PRIO-1. Priority values @@ -593,11 +611,6 @@ static inline unsigned int task_cpu(stru return p->cpu; } -static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) -{ - p->cpu = cpu; -} - #else @@ -607,10 +620,6 @@ static inline unsigned int task_cpu(stru return 0; } -static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) -{ -} - #endif /* CONFIG_SMP */ --- linux/include/asm-i386/apic.h.orig +++ linux/include/asm-i386/apic.h @@ -96,4 +96,6 @@ extern unsigned int nmi_watchdog; #endif /* CONFIG_X86_LOCAL_APIC */ +extern int phys_proc_id[NR_CPUS]; + #endif /* __ASM_APIC_H */ --- linux/include/asm-i386/system.h.orig +++ linux/include/asm-i386/system.h @@ -10,24 +10,22 @@ #ifdef __KERNEL__ struct task_struct; /* one of the stranger aspects of C forward declarations.. */ -extern void FASTCALL(__switch_to(struct task_struct *prev, struct task_struct *next)); +extern struct task_struct * FASTCALL(__switch_to(struct task_struct *prev, struct task_struct *next)); #define switch_to(prev,next,last) do { \ - asm volatile("pushl %%esi\n\t" \ - "pushl %%edi\n\t" \ - "pushl %%ebp\n\t" \ + unsigned long esi,edi; \ + asm volatile("pushl %%ebp\n\t" \ "movl %%esp,%0\n\t" /* save ESP */ \ - "movl %2,%%esp\n\t" /* restore ESP */ \ + "movl %5,%%esp\n\t" /* restore ESP */ \ "movl $1f,%1\n\t" /* save EIP */ \ - "pushl %3\n\t" /* restore EIP */ \ + "pushl %6\n\t" /* restore EIP */ \ "jmp __switch_to\n" \ "1:\t" \ "popl %%ebp\n\t" \ - "popl %%edi\n\t" \ - "popl %%esi\n\t" \ - :"=m" (prev->thread.esp),"=m" (prev->thread.eip) \ + :"=m" (prev->thread.esp),"=m" (prev->thread.eip), \ + "=a" (last),"=S" (esi),"=D" (edi) \ :"m" (next->thread.esp),"m" (next->thread.eip), \ - "a" (prev), "d" (next)); \ + "2" (prev), "d" (next)); \ } while (0) #define _set_base(addr,base) do { unsigned long __pr; \ --- linux/arch/i386/kernel/process.c.orig +++ linux/arch/i386/kernel/process.c @@ -696,7 +696,7 @@ int dump_task_regs(struct task_struct *t * More important, however, is the fact that this allows us much * more flexibility. */ -void __switch_to(struct task_struct *prev_p, struct task_struct *next_p) +struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread, *next = &next_p->thread; @@ -768,6 +768,7 @@ void __switch_to(struct task_struct *pre */ tss->bitmap = INVALID_IO_BITMAP_OFFSET; } + return prev_p; } asmlinkage int sys_fork(struct pt_regs regs) --- linux/arch/i386/kernel/setup.c.orig +++ linux/arch/i386/kernel/setup.c @@ -3114,6 +3114,13 @@ static int show_cpuinfo(struct seq_file fpu_exception ? "yes" : "no", c->cpuid_level, c->wp_works_ok ? "yes" : "no"); +#if CONFIG_SHARE_RUNQUEUE +{ + extern long __rq_idx[NR_CPUS]; + + seq_printf(m, "\nrunqueue\t: %ld\n", __rq_idx[n]); +} +#endif for ( i = 0 ; i < 32*NCAPINTS ; i++ ) if ( test_bit(i, &c->x86_capability) && --- linux/arch/i386/kernel/smpboot.c.orig +++ linux/arch/i386/kernel/smpboot.c @@ -977,6 +977,16 @@ void *xquad_portio; int cpu_sibling_map[NR_CPUS] __cacheline_aligned; +static int test_ht; + +static int __init ht_setup(char *str) +{ + test_ht = 1; + return 1; +} + +__setup("test_ht", ht_setup); + void __init smp_boot_cpus(void) { int apicid, cpu, bit; @@ -1172,6 +1182,26 @@ void __init smp_boot_cpus(void) if (smp_b_stepping) printk(KERN_WARNING "WARNING: SMP operation may be unreliable with B stepping processors.\n"); Dprintk("Boot done.\n"); + +#ifndef CONFIG_VISWS + /* + * Here we can be sure that there is an IO-APIC in the system. Let's + * go and set it up: + */ + if (!skip_ioapic_setup && nr_ioapics) + setup_IO_APIC(); +#endif + + /* + * Set up all local APIC timers in the system: + */ + setup_APIC_clocks(); + + /* + * Synchronize the TSC with the AP + */ + if (cpu_has_tsc > 1 && cpucount && cpu_khz) + synchronize_tsc_bp(); /* * If Hyper-Threading is avaialble, construct cpu_sibling_map[], so @@ -1199,28 +1229,41 @@ void __init smp_boot_cpus(void) printk(KERN_WARNING "WARNING: No sibling found for CPU %d.\n", cpu); } } +#if CONFIG_SHARE_RUNQUEUE + /* + * At this point APs would have synchronised TSC and + * waiting for smp_commenced, with their APIC timer + * disabled. So BP can go ahead do some initialization + * for Hyper-Threading (if needed). + */ + for (cpu = 0; cpu < NR_CPUS; cpu++) { + int i; + if (!test_bit(cpu, &cpu_online_map)) + continue; + for (i = 0; i < NR_CPUS; i++) { + if (i <= cpu) + continue; + if (!test_bit(i, &cpu_online_map)) + continue; + + if (phys_proc_id[cpu] != phys_proc_id[i]) + continue; + /* + * merge runqueues, resulting in one + * runqueue per package: + */ + sched_map_runqueue(cpu, i); + break; + } + } +#endif + } +#if CONFIG_SHARE_RUNQUEUE + if (smp_num_siblings == 1 && test_ht) { + printk("Simulating shared runqueues - mapping CPU#1's runqueue to CPU#0's runqueue.\n"); + sched_map_runqueue(0, 1); } - -#ifndef CONFIG_VISWS - /* - * Here we can be sure that there is an IO-APIC in the system. Let's - * go and set it up: - */ - if (!skip_ioapic_setup && nr_ioapics) - setup_IO_APIC(); #endif - - /* - * Set up all local APIC timers in the system: - */ - setup_APIC_clocks(); - - /* - * Synchronize the TSC with the AP - */ - if (cpu_has_tsc > 1 && cpucount) - synchronize_tsc_bp(); - smp_done: zap_low_mappings(); } --- linux/arch/i386/kernel/entry.S.orig +++ linux/arch/i386/kernel/entry.S @@ -184,7 +184,7 @@ ENTRY(lcall27) ENTRY(ret_from_fork) - pushl %ebx + pushl %eax call SYMBOL_NAME(schedule_tail) addl $4, %esp GET_CURRENT(%ebx) --- linux/arch/i386/config.in.orig +++ linux/arch/i386/config.in @@ -252,6 +252,18 @@ fi bool 'Math emulation' CONFIG_MATH_EMULATION bool 'MTRR (Memory Type Range Register) support' CONFIG_MTRR bool 'Symmetric multi-processing support' CONFIG_SMP + +if [ "$CONFIG_SMP" = "y" ]; then + choice 'Hyperthreading Support' \ + "off CONFIG_NR_SIBLINGS_0 \ + 2-siblings CONFIG_NR_SIBLINGS_2" off +fi + +if [ "$CONFIG_NR_SIBLINGS_2" = "y" ]; then + define_bool CONFIG_SHARE_RUNQUEUE y + define_int CONFIG_MAX_NR_SIBLINGS 2 +fi + if [ "$CONFIG_SMP" != "y" ]; then bool 'Local APIC support on uniprocessors' CONFIG_X86_UP_APIC dep_bool 'IO-APIC support on uniprocessors' CONFIG_X86_UP_IOAPIC $CONFIG_X86_UP_APIC