--- linux/arch/sparc64/kernel/sys_sparc32.c.orig +++ linux/arch/sparc64/kernel/sys_sparc32.c @@ -1958,7 +1958,7 @@ do_execve32(char * filename, u32 * argv, int retval; int i; - sched_balance_exec(); + sched_balance_context(); file = open_exec(filename); --- linux/arch/i386/kernel/smpboot.c.orig +++ linux/arch/i386/kernel/smpboot.c @@ -1153,9 +1153,11 @@ __init void arch_init_sched_domains(void *phys_domain = SD_CPU_INIT; phys_domain->span = nodemask; + phys_domain->cache_hot_time = cacheflush_time / 2; *node_domain = SD_NODE_INIT; node_domain->span = cpu_possible_map; + node_domain->cache_hot_time = cacheflush_time; } /* Set up CPU (sibling) groups */ @@ -1165,9 +1167,9 @@ __init void arch_init_sched_domains(void first_cpu = last_cpu = NULL; if (i != first_cpu(cpu_domain->span)) { - cpu_sched_domain(i)->flags |= SD_FLAG_SHARE_CPUPOWER; + cpu_sched_domain(i)->flags |= SD_SHARE_CPUPOWER; cpu_sched_domain(first_cpu(cpu_domain->span))->flags |= - SD_FLAG_SHARE_CPUPOWER; + SD_SHARE_CPUPOWER; continue; } @@ -1261,7 +1263,7 @@ __init void arch_init_sched_domains(void cpu_domain->groups = cpu_group; } } -#else /* CONFIG_NUMA */ +#else /* !CONFIG_NUMA */ static struct sched_group sched_group_cpus[NR_CPUS]; static struct sched_group sched_group_phys[NR_CPUS]; static DEFINE_PER_CPU(struct sched_domain, phys_domains); @@ -1280,6 +1282,7 @@ __init void arch_init_sched_domains(void *phys_domain = SD_CPU_INIT; phys_domain->span = cpu_possible_map; + phys_domain->cache_hot_time = cacheflush_time / 2; } /* Set up CPU (sibling) groups */ @@ -1289,9 +1292,9 @@ __init void arch_init_sched_domains(void first_cpu = last_cpu = NULL; if (i != first_cpu(cpu_domain->span)) { - cpu_sched_domain(i)->flags |= SD_FLAG_SHARE_CPUPOWER; + cpu_sched_domain(i)->flags |= SD_SHARE_CPUPOWER; cpu_sched_domain(first_cpu(cpu_domain->span))->flags |= - SD_FLAG_SHARE_CPUPOWER; + SD_SHARE_CPUPOWER; continue; } @@ -1344,7 +1347,48 @@ __init void arch_init_sched_domains(void } } #endif /* CONFIG_NUMA */ 
-#endif /* CONFIG_SCHED_SMT */ +#else /* !CONFIG_SCHED_SMT */ + +static struct sched_group sched_group_cpus[NR_CPUS]; + +__init void arch_init_sched_domains(void) +{ + int i; + struct sched_group *first_cpu = NULL, *last_cpu = NULL; + + /* Set up domains */ + for_each_cpu(i) { + struct sched_domain *cpu_sd = cpu_sched_domain(i); + + *cpu_sd = SD_CPU_INIT; + cpu_sd->span = cpu_possible_map; + cpu_sd->cache_hot_time = cacheflush_time / 2; + } + + /* Set up CPU groups */ + for_each_cpu_mask(i, cpu_possible_map) { + struct sched_group *cpu = &sched_group_cpus[i]; + + cpus_clear(cpu->cpumask); + cpu_set(i, cpu->cpumask); + cpu->cpu_power = SCHED_LOAD_SCALE; + + if (!first_cpu) + first_cpu = cpu; + if (last_cpu) + last_cpu->next = cpu; + last_cpu = cpu; + } + last_cpu->next = first_cpu; + + mb(); + for_each_cpu(i) { + struct sched_domain *cpu_sd = cpu_sched_domain(i); + cpu_sd->groups = &sched_group_cpus[i]; + } +} + +#endif /* These are wrappers to interface to the new boot process. Someone who understands all this stuff should rewrite it properly. 
--RR 15/Jul/02 */ --- linux/arch/i386/Kconfig.orig +++ linux/arch/i386/Kconfig @@ -763,7 +763,7 @@ config X86_PAE # Common NUMA Features config NUMA - bool "Numa Memory Allocation Support" + bool "Numa Memory Allocation and Scheduler Support" depends on SMP && HIGHMEM64G && (X86_NUMAQ || X86_GENERICARCH || (X86_SUMMIT && ACPI)) default n if X86_PC default y if (X86_NUMAQ || X86_SUMMIT) --- linux/arch/mips/kernel/linux32.c.orig +++ linux/arch/mips/kernel/linux32.c @@ -279,7 +279,7 @@ do_execve32(char * filename, u32 * argv, struct file * file; int retval; - sched_balance_exec(); + sched_balance_context(); file = open_exec(filename); --- linux/arch/ppc64/kernel/smp.c.orig +++ linux/arch/ppc64/kernel/smp.c @@ -836,9 +836,9 @@ __init void arch_init_sched_domains(void first_cpu = last_cpu = NULL; if (i != first_cpu(cpu_domain->span)) { - cpu_sched_domain(i)->flags |= SD_FLAG_SHARE_CPUPOWER; + cpu_sched_domain(i)->flags |= SD_SHARE_CPUPOWER; cpu_sched_domain(first_cpu(cpu_domain->span))->flags |= - SD_FLAG_SHARE_CPUPOWER; + SD_SHARE_CPUPOWER; continue; } @@ -962,9 +962,9 @@ __init void arch_init_sched_domains(void first_cpu = last_cpu = NULL; if (i != first_cpu(cpu_domain->span)) { - cpu_sched_domain(i)->flags |= SD_FLAG_SHARE_CPUPOWER; + cpu_sched_domain(i)->flags |= SD_SHARE_CPUPOWER; cpu_sched_domain(first_cpu(cpu_domain->span))->flags |= - SD_FLAG_SHARE_CPUPOWER; + SD_SHARE_CPUPOWER; continue; } --- linux/arch/ppc64/kernel/sys_ppc32.c.orig +++ linux/arch/ppc64/kernel/sys_ppc32.c @@ -2009,7 +2009,7 @@ static int do_execve32(char * filename, int retval; int i; - sched_balance_exec(); + sched_balance_context(); file = open_exec(filename); --- linux/arch/s390/kernel/compat_linux.c.orig +++ linux/arch/s390/kernel/compat_linux.c @@ -1296,7 +1296,7 @@ do_execve32(char * filename, u32 * argv, int retval; int i; - sched_balance_exec(); + sched_balance_context(); file = open_exec(filename); --- linux/include/linux/sched.h.orig +++ linux/include/linux/sched.h @@ -540,14 +540,15 
@@ do { if (atomic_dec_and_test(&(tsk)->usa #define PF_SYNCWRITE 0x00200000 /* I am doing a sync write */ #ifdef CONFIG_SMP -#define SCHED_LOAD_SHIFT 7 /* increase resolution of load calculations */ -#define SCHED_LOAD_SCALE (1UL << SCHED_LOAD_SHIFT) +#define SCHED_LOAD_SCALE 128UL /* increase resolution of load */ -#define SD_FLAG_NEWIDLE 1 /* Balance when about to become idle */ -#define SD_FLAG_EXEC 2 /* Balance on exec */ -#define SD_FLAG_WAKE 4 /* Balance on task wakeup */ -#define SD_FLAG_FASTMIGRATE 8 /* Sync wakes put task on waking CPU */ -#define SD_FLAG_SHARE_CPUPOWER 16 /* Domain members share cpu power */ +#define SD_BALANCE_NEWIDLE 1 /* Balance when about to become idle */ +#define SD_BALANCE_CONTEXT 2 /* Balance new context */ +#define SD_WAKE_IDLE 4 /* Balance to idle on task wakeup */ +#define SD_AFFINE_WAKEUPS 8 /* Wakeups put task on waking CPU */ +#define SD_SHARE_CPUPOWER 16 /* Domain members share cpu power */ + +extern void sched_debug(void); struct sched_group { struct sched_group *next; /* Must be a circular list */ @@ -574,7 +575,7 @@ struct sched_domain { unsigned long long cache_hot_time; /* Task considered cache hot (ns) */ unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ unsigned int per_cpu_gain; /* CPU % gained by adding domain cpus */ - int flags; /* See SD_FLAG_* */ + int flags; /* See SD_* */ /* Runtime fields. */ unsigned long last_balance; /* init to jiffies. 
units in jiffies */ @@ -594,7 +595,9 @@ struct sched_domain { .cache_hot_time = 0, \ .cache_nice_tries = 0, \ .per_cpu_gain = 15, \ - .flags = SD_FLAG_FASTMIGRATE | SD_FLAG_NEWIDLE | SD_FLAG_WAKE,\ + .flags = SD_AFFINE_WAKEUPS | \ + SD_BALANCE_NEWIDLE | \ + SD_WAKE_IDLE, \ .last_balance = jiffies, \ .balance_interval = 1, \ .nr_balance_failed = 0, \ @@ -610,9 +613,11 @@ struct sched_domain { .busy_factor = 64, \ .imbalance_pct = 125, \ .cache_hot_time = (5*1000000/2), \ - .cache_nice_tries = 1, \ + .cache_nice_tries = 2, \ .per_cpu_gain = 100, \ - .flags = SD_FLAG_FASTMIGRATE | SD_FLAG_NEWIDLE,\ + .flags = SD_AFFINE_WAKEUPS | \ + SD_BALANCE_NEWIDLE | \ + SD_BALANCE_CONTEXT, \ .last_balance = jiffies, \ .balance_interval = 1, \ .nr_balance_failed = 0, \ @@ -629,9 +634,9 @@ struct sched_domain { .busy_factor = 8, \ .imbalance_pct = 125, \ .cache_hot_time = (10*1000000), \ - .cache_nice_tries = 1, \ + .cache_nice_tries = 3, \ .per_cpu_gain = 100, \ - .flags = SD_FLAG_EXEC, \ + .flags = SD_BALANCE_CONTEXT, \ .last_balance = jiffies, \ .balance_interval = 1, \ .nr_balance_failed = 0, \ @@ -642,6 +647,9 @@ DECLARE_PER_CPU(struct sched_domain, bas #define cpu_sched_domain(cpu) (&per_cpu(base_domains, (cpu))) #define this_sched_domain() (&__get_cpu_var(base_domains)) +#define for_each_domain(cpu, domain) \ + for (domain = cpu_sched_domain(cpu); domain; domain = domain->parent) + extern int set_cpus_allowed(task_t *p, cpumask_t new_mask); #else static inline int set_cpus_allowed(task_t *p, cpumask_t new_mask) @@ -651,12 +659,7 @@ static inline int set_cpus_allowed(task_ #endif extern unsigned long long sched_clock(void); - -#ifdef CONFIG_NUMA -extern void sched_balance_exec(void); -#else -#define sched_balance_exec() {} -#endif +extern void sched_balance_context(void); /* Move tasks off this (offline) CPU onto another. 
*/ extern void migrate_all_tasks(void); --- linux/include/asm-ppc64/processor.h.orig +++ linux/include/asm-ppc64/processor.h @@ -620,7 +620,7 @@ static inline void prefetchw(const void #ifdef CONFIG_SCHED_SMT #define ARCH_HAS_SCHED_DOMAIN -#define ARCH_HAS_SCHED_WAKE_BALANCE +#define ARCH_HAS_SCHED_WAKE_IDLE #endif #endif /* ASSEMBLY */ --- linux/include/asm-i386/param.h.orig +++ linux/include/asm-i386/param.h @@ -5,6 +5,8 @@ # define HZ 1000 /* Internal kernel timer frequency */ # define USER_HZ 100 /* .. some user interfaces are in "ticks" */ # define CLOCKS_PER_SEC (USER_HZ) /* like times() */ +# define JIFFIES_TO_MSEC(x) (x) +# define MSEC_TO_JIFFIES(x) (x) #endif #ifndef HZ --- linux/include/asm-i386/processor.h.orig +++ linux/include/asm-i386/processor.h @@ -668,9 +668,11 @@ extern inline void prefetchw(const void extern void select_idle_routine(const struct cpuinfo_x86 *c); -#ifdef CONFIG_SCHED_SMT -#define ARCH_HAS_SCHED_DOMAIN -#define ARCH_HAS_SCHED_WAKE_BALANCE +#ifdef CONFIG_SMP +# define ARCH_HAS_SCHED_DOMAIN +# ifdef CONFIG_SCHED_SMT +# define ARCH_HAS_SCHED_WAKE_IDLE +# endif #endif #endif /* __ASM_I386_PROCESSOR_H */ --- linux/fs/exec.c.orig +++ linux/fs/exec.c @@ -1098,7 +1098,7 @@ int do_execve(char * filename, int retval; int i; - sched_balance_exec(); + sched_balance_context(); file = open_exec(filename); --- linux/kernel/fork.c.orig +++ linux/kernel/fork.c @@ -544,6 +544,8 @@ static int copy_mm(unsigned long clone_f goto good_mm; } + sched_balance_context(); + retval = -ENOMEM; mm = allocate_mm(); if (!mm) --- linux/kernel/sched.c.orig +++ linux/kernel/sched.c @@ -73,6 +73,13 @@ #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) +#ifndef JIFFIES_TO_MSEC +# define JIFFIES_TO_MSEC(x) ((x) * 1000 / HZ) +#endif +#ifndef MSEC_TO_JIFFIES +# define MSEC_TO_JIFFIES(x) ((x) * HZ / 1000) +#endif + /* * These are the 'tuning knobs' of the scheduler: * @@ -173,11 +180,13 @@ ((MAX_TIMESLICE - 
MIN_TIMESLICE) * \ (MAX_PRIO-1 - (p)->static_prio) / (MAX_USER_PRIO-1))) -static inline unsigned int task_timeslice(task_t *p) +static unsigned int task_timeslice(task_t *p) { return BASE_TIMESLICE(p); } +#define task_hot(p, now, sd) \ + ((now) - (p)->timestamp < (sd)->cache_hot_time) /* * These are the runqueue data structures: */ @@ -202,14 +211,7 @@ struct prio_array { struct runqueue { spinlock_t lock; - /* - * nr_running and cpu_load should be in the same cacheline because - * remote CPUs use both these fields when doing load calculation. - */ unsigned long nr_running; -#ifdef CONFIG_SMP - unsigned long cpu_load; -#endif unsigned long long nr_switches; unsigned long expired_timestamp, nr_uninterruptible; unsigned long long timestamp_last_tick; @@ -248,22 +250,12 @@ DEFINE_PER_CPU(struct sched_domain, base # define task_running(rq, p) ((rq)->curr == (p)) #endif -static inline void nr_running_inc(runqueue_t *rq) -{ - rq->nr_running++; -} - -static inline void nr_running_dec(runqueue_t *rq) -{ - rq->nr_running--; -} - /* * task_rq_lock - lock the runqueue a given task resides on and disable * interrupts. Note the ordering: we can safely lookup the task_rq without * explicitly disabling preemption. */ -static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) +static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) { struct runqueue *rq; @@ -286,7 +278,7 @@ static inline void task_rq_unlock(runque /* * rq_lock - lock a given runqueue and disable interrupts. 
*/ -static inline runqueue_t *this_rq_lock(void) +static runqueue_t *this_rq_lock(void) { runqueue_t *rq; @@ -305,7 +297,7 @@ static inline void rq_unlock(runqueue_t /* * Adding/removing a task to/from a priority array: */ -static inline void dequeue_task(struct task_struct *p, prio_array_t *array) +static void dequeue_task(struct task_struct *p, prio_array_t *array) { array->nr_active--; list_del(&p->run_list); @@ -313,7 +305,7 @@ static inline void dequeue_task(struct t __clear_bit(p->prio, array->bitmap); } -static inline void enqueue_task(struct task_struct *p, prio_array_t *array) +static void enqueue_task(struct task_struct *p, prio_array_t *array) { list_add_tail(&p->run_list, array->queue + p->prio); __set_bit(p->prio, array->bitmap); @@ -321,6 +313,21 @@ static inline void enqueue_task(struct t p->array = array; } +#ifdef CONFIG_SMP +/* + * Used by the migration code - we pull tasks from the head of the + * remote queue so we want these tasks to show up at the head of the + * local queue: + */ +static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) +{ + list_add(&p->run_list, array->queue + p->prio); + __set_bit(p->prio, array->bitmap); + array->nr_active++; + p->array = array; +} +#endif + /* * effective_prio - return the priority that is based on the static * priority but is modified by bonuses/penalties. @@ -358,7 +365,7 @@ static int effective_prio(task_t *p) static inline void __activate_task(task_t *p, runqueue_t *rq) { enqueue_task(p, rq->active); - nr_running_inc(rq); + rq->nr_running++; } static void recalc_task_prio(task_t *p, unsigned long long now) @@ -441,7 +448,7 @@ static void recalc_task_prio(task_t *p, * Update all the scheduling statistics stuff. (sleep average * calculation, priority modifiers, etc.) 
*/ -static inline void activate_task(task_t *p, runqueue_t *rq) +static void activate_task(task_t *p, runqueue_t *rq) { unsigned long long now = sched_clock(); @@ -477,9 +484,9 @@ static inline void activate_task(task_t /* * deactivate_task - remove a task from the runqueue. */ -static inline void deactivate_task(struct task_struct *p, runqueue_t *rq) +static void deactivate_task(struct task_struct *p, runqueue_t *rq) { - nr_running_dec(rq); + rq->nr_running--; if (p->state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible++; dequeue_task(p, p->array); @@ -493,9 +500,9 @@ static inline void deactivate_task(struc * might also involve a cross-CPU call to trigger the scheduler on * the target CPU. */ -static inline void resched_task(task_t *p) -{ #ifdef CONFIG_SMP +static void resched_task(task_t *p) +{ int need_resched, nrpolling; preempt_disable(); @@ -507,10 +514,13 @@ static inline void resched_task(task_t * if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id())) smp_send_reschedule(task_cpu(p)); preempt_enable(); +} #else +static inline void resched_task(task_t *p) +{ set_tsk_need_resched(p); -#endif } +#endif /** * task_curr - is this task currently executing on a CPU? @@ -602,52 +612,42 @@ void kick_process(task_t *p) } EXPORT_SYMBOL_GPL(kick_process); + /* * Return a low guess at the load of cpu. */ -static inline unsigned long get_low_cpu_load(int cpu) +static inline unsigned long cpu_load(int cpu) { - runqueue_t *rq = cpu_rq(cpu); - unsigned long load_now = rq->nr_running << SCHED_LOAD_SHIFT; - - return min(rq->cpu_load, load_now); -} - -static inline unsigned long get_high_cpu_load(int cpu) -{ - runqueue_t *rq = cpu_rq(cpu); - unsigned long load_now = rq->nr_running << SCHED_LOAD_SHIFT; - - return max(rq->cpu_load, load_now); + return cpu_rq(cpu)->nr_running * SCHED_LOAD_SCALE; } #endif /* - * sched_balance_wake can be used with SMT architectures to wake a - * task onto an idle sibling if cpu is not idle. 
Returns cpu if - * cpu is idle or no siblings are idle, otherwise returns an idle - * sibling. + * wake_idle() can be used on SMT architectures to wake a task onto + * an idle sibling if 'cpu' is not idle. + * + * Returns 'cpu' if 'cpu' is idle or no siblings of 'cpu' are idle, + * otherwise returns an idle sibling. */ -#if defined(CONFIG_SMP) && defined(ARCH_HAS_SCHED_WAKE_BALANCE) -static int sched_balance_wake(int cpu, task_t *p) +#if defined(ARCH_HAS_SCHED_WAKE_IDLE) +static inline int wake_idle(int cpu, task_t *p) { cpumask_t tmp; - struct sched_domain *domain; + struct sched_domain *sd; int i; if (idle_cpu(cpu)) return cpu; - domain = cpu_sched_domain(cpu); - if (!(domain->flags & SD_FLAG_WAKE)) + sd = cpu_sched_domain(cpu); + if (!(sd->flags & SD_WAKE_IDLE)) return cpu; - cpus_and(tmp, domain->span, cpu_online_map); + cpus_and(tmp, sd->span, cpu_online_map); for_each_cpu_mask(i, tmp) { if (!cpu_isset(i, p->cpus_allowed)) continue; - if (idle_cpu(i)) return i; } @@ -655,7 +655,7 @@ static int sched_balance_wake(int cpu, t return cpu; } #else -static inline int sched_balance_wake(int cpu, task_t *p) +static inline int wake_idle(int cpu, task_t *p) { return cpu; } @@ -681,11 +681,10 @@ static int try_to_wake_up(task_t * p, un int success = 0; long old_state; runqueue_t *rq; - int cpu, this_cpu; + int cpu, new_cpu, this_cpu; #ifdef CONFIG_SMP unsigned long long now; unsigned long load, this_load; - int new_cpu; struct sched_domain *sd; #endif @@ -693,53 +692,50 @@ static int try_to_wake_up(task_t * p, un old_state = p->state; if (!(old_state & state)) goto out; - if (p->array) goto out_running; - this_cpu = smp_processor_id(); cpu = task_cpu(p); + this_cpu = new_cpu = smp_processor_id(); + sd = cpu_sched_domain(this_cpu); #ifdef CONFIG_SMP - if (cpu == this_cpu || unlikely(cpu_is_offline(this_cpu))) - goto out_activate; + now = sched_clock(); - if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed) - || task_running(rq, p))) + if (cpu == this_cpu || 
unlikely(cpu_is_offline(this_cpu))) + goto out_set_cpu; + if (task_running(rq, p)) goto out_activate; - /* Passive load balancing */ - load = get_low_cpu_load(cpu); - this_load = get_high_cpu_load(this_cpu) + SCHED_LOAD_SCALE; - if (load > this_load) { - new_cpu = sched_balance_wake(this_cpu, p); - set_task_cpu(p, new_cpu); - goto repeat_lock_task; - } - - now = sched_clock(); - sd = cpu_sched_domain(this_cpu); + /* + * Passive load balancing, migrate the task if: + * + * - remote load is higher than local load, and + * - task is woken up by another task + * - or task is woken up from an irq handler and task is cache-cold. + */ + load = cpu_load(cpu); + this_load = cpu_load(this_cpu); + if (load > this_load && (!in_interrupt() || !task_hot(p, now, sd))) + goto out_set_cpu; /* - * Fast-migrate the task if it's not running or - * runnable currently. Do not violate hard affinity. + * Migrate the task to the waking domain. + * Do not violate soft affinity. */ - do { - if (!(sd->flags & SD_FLAG_FASTMIGRATE)) + for_each_domain(this_cpu, sd) { + if (!(sd->flags & SD_AFFINE_WAKEUPS)) break; - if (now - p->timestamp < sd->cache_hot_time) + if (task_hot(p, now, sd)) break; + if (cpu_isset(cpu, sd->span)) + goto out_set_cpu; + } - if (cpu_isset(cpu, sd->span)) { - new_cpu = sched_balance_wake(this_cpu, p); - set_task_cpu(p, new_cpu); - goto repeat_lock_task; - } - sd = sd->parent; - } while (sd); - - new_cpu = sched_balance_wake(cpu, p); - if (new_cpu != cpu) { + new_cpu = cpu; +out_set_cpu: + new_cpu = wake_idle(new_cpu, p); + if (new_cpu != cpu && cpu_isset(new_cpu, p->cpus_allowed)) { set_task_cpu(p, new_cpu); goto repeat_lock_task; } @@ -751,10 +747,8 @@ repeat_lock_task: old_state = p->state; if (!(old_state & state)) goto out; - if (p->array) goto out_running; - this_cpu = smp_processor_id(); cpu = task_cpu(p); @@ -769,6 +763,14 @@ out_activate: p->activated = -1; } + /* + * Sync wakeups (i.e. 
those types of wakeups where the waker + * has indicated that it will leave the CPU in short order) + * dont trigger a preemption, if the woken up task will run on + * this cpu. (in this case the 'I will reschedule' promise of + * the waker guarantees that the freshly woken up task is going + * to be considered on this CPU.) + */ if (sync && cpu == this_cpu) { __activate_task(p, rq); } else { @@ -785,6 +787,7 @@ out: return success; } + int fastcall wake_up_process(task_t * p) { return try_to_wake_up(p, TASK_STOPPED | @@ -888,7 +891,7 @@ void fastcall wake_up_forked_process(tas list_add_tail(&p->run_list, &current->run_list); p->array = current->array; p->array->nr_active++; - nr_running_inc(rq); + rq->nr_running++; } task_rq_unlock(rq, &flags); } @@ -939,7 +942,7 @@ void fastcall sched_exit(task_t * p) * with the lock held can cause deadlocks; see schedule() for * details.) */ -static inline void finish_task_switch(task_t *prev) +static void finish_task_switch(task_t *prev) { runqueue_t *rq = this_rq(); struct mm_struct *mm = rq->prev_mm; @@ -1060,7 +1063,7 @@ unsigned long nr_iowait(void) * Note this does not disable interrupts like task_rq_lock, * you need to do so manually before calling. */ -static inline void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) +static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) { if (rq1 == rq2) spin_lock(&rq1->lock); @@ -1081,7 +1084,7 @@ static inline void double_rq_lock(runque * Note this does not restore interrupts like task_rq_unlock, * you need to do so manually after calling. */ -static inline void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2) +static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2) { spin_unlock(&rq1->lock); if (rq1 != rq2) @@ -1096,7 +1099,6 @@ enum idle_type }; #ifdef CONFIG_SMP -#ifdef CONFIG_NUMA /* * If dest_cpu is allowed for this process, migrate the task to it. 
* This is accomplished by forcing the cpu_allowed mask to only @@ -1105,9 +1107,9 @@ enum idle_type */ static void sched_migrate_task(task_t *p, int dest_cpu) { - runqueue_t *rq; migration_req_t req; unsigned long flags; + runqueue_t *rq; lock_cpu_hotplug(); rq = task_rq_lock(p, &flags); @@ -1127,6 +1129,7 @@ static void sched_migrate_task(task_t *p * the migration. */ tlb_migrate_prepare(current->mm); + unlock_cpu_hotplug(); return; } @@ -1137,59 +1140,72 @@ out: /* * Find the least loaded CPU. Slightly favor the current CPU by - * setting its runqueue length as the minimum to start. + * setting its load as the minimum to start. */ -static int sched_best_cpu(struct task_struct *p, struct sched_domain *domain) +static int sched_best_cpu(struct task_struct *p, struct sched_domain *sd) { + int i = 0, min_load, this_cpu, best_cpu; cpumask_t tmp; - int i, min_load, this_cpu, best_cpu; best_cpu = this_cpu = task_cpu(p); - min_load = INT_MAX; - cpus_and(tmp, domain->span, cpu_online_map); + /* subtract the currently running task's load effect: */ + min_load = cpu_load(i) - SCHED_LOAD_SCALE; + + cpus_and(tmp, sd->span, cpu_online_map); + cpu_clear(this_cpu, tmp); + for_each_cpu_mask(i, tmp) { - unsigned long load; - if (i == this_cpu) - load = get_low_cpu_load(i); - else - load = get_high_cpu_load(i) + SCHED_LOAD_SCALE; + unsigned long load = cpu_load(i); if (min_load > load) { best_cpu = i; min_load = load; } - } return best_cpu; } -void sched_balance_exec(void) +/* + * sched_balance_context(): find the highest-level, context-balance-capable + * domain and try to migrate the current task to the least loaded CPU. + * + * execve() (and fork()) is a good balancing opportunity, because at this point + * the task has the smallest effective cache footprint - a completely new + * process image is being created, so almost all of the currently existing + * cache footprint is irrelevant. 
So we attempt to balance this task as + * broadly as possible, without considering migration costs, which costs + * otherwise affect all other types of task migrations. + */ +void sched_balance_context(void) { - struct sched_domain *domain = this_sched_domain(); - int new_cpu; - int this_cpu = smp_processor_id(); - if (numnodes == 1) - return; + struct sched_domain *sd, *best_sd = NULL; + int new_cpu, this_cpu = get_cpu(); + /* Prefer the current CPU if there's only this task running: */ if (this_rq()->nr_running <= 1) - return; - - while (domain->parent && !(domain->flags & SD_FLAG_EXEC)) - domain = domain->parent; + goto out; - if (domain->flags & SD_FLAG_EXEC) { - new_cpu = sched_best_cpu(current, domain); - if (new_cpu != this_cpu) + for_each_domain(this_cpu, sd) + if (sd->flags & SD_BALANCE_CONTEXT) + best_sd = sd; + + if (best_sd) { + new_cpu = sched_best_cpu(current, best_sd); + if (new_cpu != this_cpu) { + put_cpu(); sched_migrate_task(current, new_cpu); + return; + } } +out: + put_cpu(); } -#endif /* CONFIG_NUMA */ /* * double_lock_balance - lock the busiest runqueue, this_rq is locked already. */ -static inline void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) +static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) { if (unlikely(!spin_trylock(&busiest->lock))) { if (busiest < this_rq) { @@ -1205,15 +1221,15 @@ static inline void double_lock_balance(r * pull_task - move a task from a remote runqueue to the local runqueue. * Both runqueues must be locked. 
*/ -static inline void pull_task(runqueue_t *src_rq, prio_array_t *src_array, - task_t *p, runqueue_t *this_rq, prio_array_t *this_array, - int this_cpu) +static inline +void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, + runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) { dequeue_task(p, src_array); - nr_running_dec(src_rq); + src_rq->nr_running--; set_task_cpu(p, this_cpu); - nr_running_inc(this_rq); - enqueue_task(p, this_array); + this_rq->nr_running++; + enqueue_task_head(p, this_array); p->timestamp = sched_clock() - (src_rq->timestamp_last_tick - p->timestamp); /* @@ -1229,7 +1245,7 @@ static inline void pull_task(runqueue_t */ static inline int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, - struct sched_domain *domain, enum idle_type idle) + struct sched_domain *sd, enum idle_type idle) { /* * We do not migrate tasks that are: @@ -1242,15 +1258,19 @@ int can_migrate_task(task_t *p, runqueue if (!cpu_isset(this_cpu, p->cpus_allowed)) return 0; - /* Aggressive migration if we've failed balancing */ - if (idle == NEWLY_IDLE || - domain->nr_balance_failed < domain->cache_nice_tries) { - if ((rq->timestamp_last_tick - p->timestamp) - < domain->cache_hot_time) - return 0; - } + if (!task_hot(p, rq->timestamp_last_tick, sd)) + return 1; - return 1; + /* Aggressive migration if newly idle or we've failed balancing */ + if (idle == NEWLY_IDLE) + return 1; + if (idle == IDLE && (sd->flags & SD_BALANCE_NEWIDLE)) + return 1; + if (sd->nr_balance_failed >= sd->cache_nice_tries) + return 1; + + /* abort the search: */ + return -1; } /* @@ -1261,30 +1281,24 @@ int can_migrate_task(task_t *p, runqueue * Called with both runqueues locked. 
*/ static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, - unsigned long max_nr_move, struct sched_domain *domain, - enum idle_type idle) + unsigned long max_nr_move, struct sched_domain *sd, + enum idle_type idle) { - int idx; - int pulled = 0; prio_array_t *array, *dst_array; struct list_head *head, *curr; + int ret, idx, pulled = 0; task_t *tmp; if (max_nr_move <= 0 || busiest->nr_running <= 1) goto out; - /* - * We first consider expired tasks. Those will likely not be - * executed in the near future, and they are most likely to - * be cache-cold, thus switching CPUs has the least effect - * on them. - */ - if (busiest->expired->nr_active) { - array = busiest->expired; - dst_array = this_rq->expired; - } else { + /* We first consider active tasks. */ + if (busiest->active->nr_active) { array = busiest->active; dst_array = this_rq->active; + } else { + array = busiest->expired; + dst_array = this_rq->expired; } new_array: @@ -1296,22 +1310,27 @@ skip_bitmap: else idx = find_next_bit(array->bitmap, MAX_PRIO, idx); if (idx >= MAX_PRIO) { - if (array == busiest->expired) { - array = busiest->active; - dst_array = this_rq->active; + if (array == busiest->active && busiest->expired->nr_active) { + array = busiest->expired; + dst_array = this_rq->expired; goto new_array; } goto out; } head = array->queue + idx; - curr = head->prev; + curr = head->next; skip_queue: tmp = list_entry(curr, task_t, run_list); - curr = curr->prev; + curr = curr->next; - if (!can_migrate_task(tmp, busiest, this_cpu, domain, idle)) { + ret = can_migrate_task(tmp, busiest, this_cpu, sd, idle); + if (ret == -1) { + idx++; + goto skip_bitmap; + } + if (!ret) { if (curr != head) goto skip_queue; idx++; @@ -1334,56 +1353,38 @@ out: /* * find_busiest_group finds and returns the busiest CPU group within the * domain. It calculates and returns the number of tasks which should be - * moved to restore balance via the imbalance parameter. 
+ * moved to restore balance, via the imbalance parameter. */ static struct sched_group * -find_busiest_group(struct sched_domain *domain, int this_cpu, - unsigned long *imbalance, enum idle_type idle) +find_busiest_group(struct sched_domain *sd, int this_cpu, + unsigned long *imbalance, enum idle_type idle) { + struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; unsigned long max_load, avg_load, total_load, this_load; unsigned int total_pwr; - struct sched_group *busiest = NULL, *this = NULL, *group = domain->groups; - max_load = 0; - this_load = 0; - total_load = 0; + max_load = this_load = total_load = 0; total_pwr = 0; - if (group == NULL) - goto out_balanced; - do { cpumask_t tmp; - unsigned long load; - int local_group; - int i, nr_cpus = 0; - - local_group = cpu_isset(this_cpu, group->cpumask); + int i; /* Tally up the load of all CPUs in the group */ - avg_load = 0; cpus_and(tmp, group->cpumask, cpu_online_map); - for_each_cpu_mask(i, tmp) { - /* Bias balancing toward cpus of our domain */ - if (local_group) { - load = get_high_cpu_load(i); - } else - load = get_low_cpu_load(i); - - nr_cpus++; - avg_load += load; - } + WARN_ON(cpus_empty(tmp)); - if (!nr_cpus) - goto nextgroup; + avg_load = 0; + for_each_cpu_mask(i, tmp) + avg_load += cpu_load(i); total_load += avg_load; total_pwr += group->cpu_power; /* Adjust by relative CPU power of the group */ - avg_load = (avg_load << SCHED_LOAD_SHIFT) / group->cpu_power; + avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; - if (local_group) { + if (cpu_isset(this_cpu, group->cpumask)) { this_load = avg_load; this = group; goto nextgroup; @@ -1394,7 +1395,7 @@ find_busiest_group(struct sched_domain * } nextgroup: group = group->next; - } while (group != domain->groups); + } while (group != sd->groups); if (!busiest || this_load >= max_load) goto out_balanced; @@ -1403,7 +1404,7 @@ nextgroup: if (idle == NOT_IDLE) { if (this_load >= avg_load || - 100*max_load <= 
domain->imbalance_pct*this_load) + 100*max_load <= sd->imbalance_pct*this_load) goto out_balanced; } @@ -1420,43 +1421,15 @@ nextgroup: */ *imbalance = (min(max_load - avg_load, avg_load - this_load) + 1) / 2; - if (*imbalance <= SCHED_LOAD_SCALE/2) { - unsigned long pwr_now = 0, pwr_move = 0; - unsigned long tmp; + if (*imbalance <= SCHED_LOAD_SCALE/2) + goto out_balanced; - /* - * OK, we don't have enough imbalance to justify moving tasks, - * however we may be able to increase total CPU power used by - * moving them. - */ - - pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load); - pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load); - pwr_now >>= SCHED_LOAD_SHIFT; - - /* Amount of load we'd subtract */ - tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power; - if (max_load > tmp) - pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE, - max_load - tmp); - - /* Amount of load we'd add */ - tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power; - pwr_move += this->cpu_power*min(this->cpu_power, this_load + tmp); - pwr_move >>= SCHED_LOAD_SHIFT; - - /* Move if we gain another 8th of a CPU worth of throughput */ - if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8) - goto out_balanced; - *imbalance = 1; - return busiest; - } /* How many tasks to actually move to equalise the imbalance */ *imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power)) - >> SCHED_LOAD_SHIFT; + / SCHED_LOAD_SCALE; /* Get rid of the scaling factor, rounding *up* as we divide */ - *imbalance = (*imbalance + SCHED_LOAD_SCALE/2) >> SCHED_LOAD_SHIFT; + *imbalance = (*imbalance + SCHED_LOAD_SCALE/2) / SCHED_LOAD_SCALE; return busiest; @@ -1475,16 +1448,15 @@ out_balanced: */ static runqueue_t *find_busiest_queue(struct sched_group *group) { + unsigned long load, max_load = 0; + runqueue_t *busiest = NULL; cpumask_t tmp; int i; - unsigned long max_load = 0; - runqueue_t *busiest = NULL; cpus_and(tmp, group->cpumask, cpu_online_map); - for_each_cpu_mask(i, tmp) { - 
unsigned long load; - load = get_low_cpu_load(i); + for_each_cpu_mask(i, tmp) { + load = cpu_load(i); if (load >= max_load) { max_load = load; @@ -1502,68 +1474,64 @@ static runqueue_t *find_busiest_queue(st * Called with this_rq unlocked. */ static int load_balance(int this_cpu, runqueue_t *this_rq, - struct sched_domain *domain, enum idle_type idle) + struct sched_domain *sd, enum idle_type idle) { struct sched_group *group; - runqueue_t *busiest = NULL; unsigned long imbalance; - int balanced = 0, failed = 0; - int nr_moved = 0; + runqueue_t *busiest; + int nr_moved; spin_lock(&this_rq->lock); - group = find_busiest_group(domain, this_cpu, &imbalance, idle); - if (!group) { - balanced = 1; - goto out; - } - + group = find_busiest_group(sd, this_cpu, &imbalance, idle); + if (!group) + goto out_balanced; busiest = find_busiest_queue(group); - if (!busiest || busiest == this_rq) { - balanced = 1; - goto out; - } + if (!busiest || busiest == this_rq) + goto out_balanced; /* Attempt to move tasks */ double_lock_balance(this_rq, busiest); - - nr_moved = move_tasks(this_rq, this_cpu, busiest, - imbalance, domain, idle); - spin_unlock(&busiest->lock); -out: + nr_moved = move_tasks(this_rq, this_cpu, busiest, imbalance, sd, idle); spin_unlock(&this_rq->lock); + spin_unlock(&busiest->lock); - if (!balanced && nr_moved == 0) - failed = 1; + if (!nr_moved) { + sd->nr_balance_failed++; + if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries)) { + int wake = 0; - if (failed && busiest && - domain->nr_balance_failed > domain->cache_nice_tries) { - int wake = 0; - - spin_lock(&busiest->lock); - if (!busiest->active_balance) { - busiest->active_balance = 1; - busiest->push_cpu = this_cpu; - wake = 1; + spin_lock(&busiest->lock); + if (!busiest->active_balance) { + busiest->active_balance = 1; + busiest->push_cpu = this_cpu; + wake = 1; + } + spin_unlock(&busiest->lock); + if (wake) + wake_up_process(busiest->migration_thread); + /* + * We've kicked active balancing, reset 
the + * failure counter: + */ + sd->nr_balance_failed = 0; } - spin_unlock(&busiest->lock); - if (wake) - wake_up_process(busiest->migration_thread); - } - - if (failed) - domain->nr_balance_failed++; - else - domain->nr_balance_failed = 0; + } else + sd->nr_balance_failed = 0; - if (balanced) { - if (domain->balance_interval < domain->max_interval) - domain->balance_interval *= 2; - } else { - domain->balance_interval = domain->min_interval; - } + /* reset the balancing interval: */ + sd->balance_interval = sd->min_interval; return nr_moved; + +out_balanced: + spin_unlock(&this_rq->lock); + + /* tune up the balancing interval: */ + if (sd->balance_interval < sd->max_interval) + sd->balance_interval *= 2; + + return 0; } /* @@ -1574,14 +1542,14 @@ out: * this_rq is locked. */ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, - struct sched_domain *domain) + struct sched_domain *sd) { struct sched_group *group; runqueue_t *busiest = NULL; unsigned long imbalance; int nr_moved = 0; - group = find_busiest_group(domain, this_cpu, &imbalance, NEWLY_IDLE); + group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE); if (!group) goto out; @@ -1593,7 +1561,7 @@ static int load_balance_newidle(int this double_lock_balance(this_rq, busiest); nr_moved = move_tasks(this_rq, this_cpu, busiest, - imbalance, domain, NEWLY_IDLE); + imbalance, sd, NEWLY_IDLE); spin_unlock(&busiest->lock); @@ -1607,25 +1575,16 @@ out: */ static inline void idle_balance(int this_cpu, runqueue_t *this_rq) { - struct sched_domain *domain = this_sched_domain(); + struct sched_domain *sd; if (unlikely(cpu_is_offline(this_cpu))) return; - do { - if (unlikely(!domain->groups)) - /* hasn't been setup yet */ - break; - - if (domain->flags & SD_FLAG_NEWIDLE) { - if (load_balance_newidle(this_cpu, this_rq, domain)) { + for_each_domain(this_cpu, sd) + if (sd->flags & SD_BALANCE_NEWIDLE) + if (load_balance_newidle(this_cpu, this_rq, sd)) /* We've pulled tasks over so stop searching */ 
break; - } - } - - domain = domain->parent; - } while (domain); } /* @@ -1638,36 +1597,25 @@ static inline void idle_balance(int this */ static void active_load_balance(runqueue_t *busiest, int busiest_cpu) { - int i; - struct sched_domain *sd = cpu_sched_domain(busiest_cpu); struct sched_group *group, *busy_group; + struct sched_domain *sd; + int i; if (busiest->nr_running <= 1) return; - /* sd->parent should never cause a NULL dereference, if it did so, - * then push_cpu was set to a buggy value */ - while (!cpu_isset(busiest->push_cpu, sd->span)) { - sd = sd->parent; - if (!sd->parent && !cpu_isset(busiest->push_cpu, sd->span)) { - WARN_ON(1); - return; - } - } + for_each_domain(busiest_cpu, sd) + if (cpu_isset(busiest->push_cpu, sd->span)) + break; - if (!sd->groups) { + if (!sd->parent && !cpu_isset(busiest->push_cpu, sd->span)) { WARN_ON(1); return; } group = sd->groups; - while (!cpu_isset(busiest_cpu, group->cpumask)) { + while (!cpu_isset(busiest_cpu, group->cpumask)) group = group->next; - if (group == sd->groups) { - WARN_ON(1); - return; - } - } busy_group = group; group = sd->groups; @@ -1686,7 +1634,7 @@ static void active_load_balance(runqueue push_cpu = i; nr++; } - if (nr == 0) + if (!nr) goto next_group; rq = cpu_rq(push_cpu); @@ -1710,59 +1658,52 @@ next_group: /* Don't have all balancing operations going off at once */ #define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS) -static void rebalance_tick(int this_cpu, runqueue_t *this_rq, enum idle_type idle) +static void +rebalance_tick(int this_cpu, runqueue_t *this_rq, enum idle_type idle) { - unsigned long old_load, this_load; unsigned long j = jiffies + CPU_OFFSET(this_cpu); - struct sched_domain *domain = this_sched_domain(); + struct sched_domain *sd; if (unlikely(cpu_is_offline(this_cpu))) return; - /* Update our load */ - old_load = this_rq->cpu_load; - this_load = this_rq->nr_running << SCHED_LOAD_SHIFT; - this_rq->cpu_load = (old_load + this_load) / 2; - /* Run through all this CPU's domains */ - 
do { - unsigned long interval; + for_each_domain(this_cpu, sd) { + unsigned long interval = sd->balance_interval; - if (unlikely(!domain->groups)) - break; - - interval = domain->balance_interval; if (idle != IDLE) - interval *= domain->busy_factor; + interval *= sd->busy_factor; /* scale ms to jiffies */ - interval = interval * HZ / 1000; - if (unlikely(interval == 0)) + interval = MSEC_TO_JIFFIES(interval); + if (unlikely(!interval)) interval = 1; - if (j - domain->last_balance >= interval) { - if (load_balance(this_cpu, this_rq, domain, idle)) { + if (j - sd->last_balance >= interval) { + if (load_balance(this_cpu, this_rq, sd, idle)) { /* We've pulled tasks over so no longer idle */ idle = NOT_IDLE; } - domain->last_balance += interval; + sd->last_balance += interval; } - - domain = domain->parent; - } while (domain); + } } #else /* * on UP we do not need to balance between CPUs: */ -static inline void rebalance_tick(int this_cpu, runqueue_t *this_rq, enum idle_type idle) +static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle) +{ +} +static inline void idle_balance(int this_cpu, runqueue_t *this_rq) { } #endif -#ifdef CONFIG_SCHED_SMT static inline int wake_priority_sleeper(runqueue_t *rq) -{ /* +{ +#ifdef CONFIG_SCHED_SMT + /* * If an SMT sibling task has been put to sleep for priority * reasons reschedule the idle task to see if it can now run. 
*/ @@ -1770,14 +1711,9 @@ static inline int wake_priority_sleeper( resched_task(rq->idle); return 1; } +#endif return 0; } -#else -static inline int wake_priority_sleeper(runqueue_t *rq) -{ - return 0; -} -#endif DEFINE_PER_CPU(struct kernel_stat, kstat); @@ -1928,10 +1864,8 @@ static inline void wake_sleeping_depende struct sched_domain *sd = cpu_sched_domain(cpu); cpumask_t sibling_map; - if (!(sd->flags & SD_FLAG_SHARE_CPUPOWER)) { - /* Not SMT */ + if (!(sd->flags & SD_SHARE_CPUPOWER)) return; - } cpus_and(sibling_map, sd->span, cpu_online_map); cpu_clear(cpu, sibling_map); @@ -1951,14 +1885,12 @@ static inline void wake_sleeping_depende static inline int dependent_sleeper(int cpu, runqueue_t *rq, task_t *p) { - int ret = 0, i; struct sched_domain *sd = cpu_sched_domain(cpu); cpumask_t sibling_map; + int ret = 0, i; - if (!(sd->flags & SD_FLAG_SHARE_CPUPOWER)) { - /* Not SMT */ + if (!(sd->flags & SD_SHARE_CPUPOWER)) return 0; - } cpus_and(sibling_map, sd->span, cpu_online_map); cpu_clear(cpu, sibling_map); @@ -1980,7 +1912,7 @@ static inline int dependent_sleeper(int if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) > task_timeslice(p) || rt_task(smt_curr)) && p->mm && smt_curr->mm && !rt_task(p)) - ret |= 1; + ret = 1; /* * Reschedule a lower priority task on the SMT sibling, @@ -2072,9 +2004,7 @@ need_resched: cpu = smp_processor_id(); if (unlikely(!rq->nr_running)) { -#ifdef CONFIG_SMP idle_balance(cpu, rq); -#endif if (!rq->nr_running) { next = rq->idle; rq->expired_timestamp = 0; @@ -2631,7 +2561,7 @@ static int setscheduler(pid_t pid, int p if (task_running(rq, p)) { if (p->prio > oldprio) resched_task(rq->curr); - } else if (p->prio < rq->curr->prio) + } else if (TASK_PREEMPTS_CURR(p, rq)) resched_task(rq->curr); } @@ -3178,7 +3108,7 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed); * Move (not current) task off this cpu, onto dest cpu. 
We're doing * this because either it can't run here any more (set_cpus_allowed() * away from this CPU, or CPU going down), or because we're - * attempting to rebalance this task on exec (sched_balance_exec). + * attempting to rebalance this task on exec/fork (sched_balance_context). * * So we race with normal scheduler movements, but that's OK, as long * as the task is no longer on this CPU. @@ -3401,24 +3331,24 @@ static void __init arch_init_sched_domai for_each_cpu(i) { int node = cpu_to_node(i); cpumask_t nodemask = node_to_cpumask(node); - struct sched_domain *node_domain = &per_cpu(node_domains, i); - struct sched_domain *cpu_domain = cpu_sched_domain(i); + struct sched_domain *node_sd = &per_cpu(node_domains, i); + struct sched_domain *cpu_sd = cpu_sched_domain(i); - *node_domain = SD_NODE_INIT; - node_domain->span = cpu_possible_map; + *node_sd = SD_NODE_INIT; + node_sd->span = cpu_possible_map; - *cpu_domain = SD_CPU_INIT; - cpus_and(cpu_domain->span, nodemask, cpu_possible_map); - cpu_domain->parent = node_domain; + *cpu_sd = SD_CPU_INIT; + cpus_and(cpu_sd->span, nodemask, cpu_possible_map); + cpu_sd->parent = node_sd; } /* Set up groups */ for (i = 0; i < MAX_NUMNODES; i++) { struct sched_group *first_cpu = NULL, *last_cpu = NULL; - int j; - cpumask_t nodemask; struct sched_group *node = &sched_group_nodes[i]; cpumask_t tmp = node_to_cpumask(i); + cpumask_t nodemask; + int j; cpus_and(nodemask, tmp, cpu_possible_map); @@ -3453,14 +3383,14 @@ static void __init arch_init_sched_domai mb(); for_each_cpu(i) { - struct sched_domain *node_domain = &per_cpu(node_domains, i); - struct sched_domain *cpu_domain = cpu_sched_domain(i); - node_domain->groups = &sched_group_nodes[cpu_to_node(i)]; - cpu_domain->groups = &sched_group_cpus[i]; + struct sched_domain *node_sd = &per_cpu(node_domains, i); + struct sched_domain *cpu_sd = cpu_sched_domain(i); + node_sd->groups = &sched_group_nodes[cpu_to_node(i)]; + cpu_sd->groups = &sched_group_cpus[i]; } } -#else /* 
CONFIG_NUMA */ +#else /* !CONFIG_NUMA */ static void __init arch_init_sched_domains(void) { int i; @@ -3468,10 +3398,10 @@ static void __init arch_init_sched_domai /* Set up domains */ for_each_cpu(i) { - struct sched_domain *cpu_domain = cpu_sched_domain(i); + struct sched_domain *cpu_sd = cpu_sched_domain(i); - *cpu_domain = SD_CPU_INIT; - cpu_domain->span = cpu_possible_map; + *cpu_sd = SD_CPU_INIT; + cpu_sd->span = cpu_possible_map; } /* Set up CPU groups */ @@ -3492,15 +3422,15 @@ static void __init arch_init_sched_domai mb(); for_each_cpu(i) { - struct sched_domain *cpu_domain = cpu_sched_domain(i); - cpu_domain->groups = &sched_group_cpus[i]; + struct sched_domain *cpu_sd = cpu_sched_domain(i); + cpu_sd->groups = &sched_group_cpus[i]; } } #endif /* CONFIG_NUMA */ #endif /* ARCH_HAS_SCHED_DOMAIN */ -#undef SCHED_DOMAIN_DEBUG +#define SCHED_DOMAIN_DEBUG #ifdef SCHED_DOMAIN_DEBUG void sched_domain_debug(void) { @@ -3508,7 +3438,7 @@ void sched_domain_debug(void) for_each_cpu(i) { int level = 0; - struct sched_domain *cpu_domain = cpu_sched_domain(i); + struct sched_domain *cpu_sd = cpu_sched_domain(i); printk(KERN_DEBUG "CPU%d: %s\n", i, (cpu_online(i) ? 
" online" : "offline")); @@ -3516,10 +3446,10 @@ void sched_domain_debug(void) do { int j; char str[NR_CPUS]; - struct sched_group *group = cpu_domain->groups; + struct sched_group *group = cpu_sd->groups; cpumask_t groupmask, tmp; - cpumask_snprintf(str, NR_CPUS, cpu_domain->span); + cpumask_scnprintf(str, NR_CPUS, cpu_sd->span); cpus_clear(groupmask); printk(KERN_DEBUG); @@ -3527,22 +3457,24 @@ void sched_domain_debug(void) printk(" "); printk("domain %d: span %s\n", level, str); - if (!cpu_isset(i, cpu_domain->span)) + if (!cpu_isset(i, cpu_sd->span)) printk(KERN_DEBUG "ERROR domain->span does not contain CPU%d\n", i); if (!cpu_isset(i, group->cpumask)) printk(KERN_DEBUG "ERROR domain->groups does not contain CPU%d\n", i); + if (!group->cpu_power) + printk(KERN_DEBUG "ERROR domain->cpu_power not set\n"); printk(KERN_DEBUG); for (j = 0; j < level + 2; j++) printk(" "); printk("groups:"); do { - if (group == NULL) { + if (!group) { printk(" ERROR: NULL"); break; } - if (cpus_weight(group->cpumask) == 0) + if (!cpus_weight(group->cpumask)) printk(" ERROR empty group:"); cpus_and(tmp, groupmask, group->cpumask); @@ -3551,26 +3483,26 @@ void sched_domain_debug(void) cpus_or(groupmask, groupmask, group->cpumask); - cpumask_snprintf(str, NR_CPUS, group->cpumask); + cpumask_scnprintf(str, NR_CPUS, group->cpumask); printk(" %s", str); group = group->next; - } while (group != cpu_domain->groups); + } while (group != cpu_sd->groups); printk("\n"); - if (!cpus_equal(cpu_domain->span, groupmask)) + if (!cpus_equal(cpu_sd->span, groupmask)) printk(KERN_DEBUG "ERROR groups don't span domain->span\n"); level++; - cpu_domain = cpu_domain->parent; + cpu_sd = cpu_sd->parent; - if (cpu_domain) { - cpus_and(tmp, groupmask, cpu_domain->span); + if (cpu_sd) { + cpus_and(tmp, groupmask, cpu_sd->span); if (!cpus_equal(tmp, groupmask)) printk(KERN_DEBUG "ERROR parent span is not a superset of domain->span\n"); } - } while (cpu_domain); + } while (cpu_sd); } } #else @@ -3596,9 +3528,29 @@ 
void __init sched_init(void) for (i = 0; i < NR_CPUS; i++) { prio_array_t *array; #ifdef CONFIG_SMP - struct sched_domain *domain; - domain = cpu_sched_domain(i); - memset(domain, 0, sizeof(struct sched_domain)); + static struct sched_group __initdata sched_group_init[NR_CPUS]; + struct sched_domain *sd; + struct sched_group *group; + + /* + * Create isolated, 1-CPU, no-balancing domains to avoid + * special-cases during early bootup. Once topology info + * is available later into the bootup, the architecture + * sets up an optimal domain-hierarchy, in the + * arch_init_sched_domains() function. + */ + sd = cpu_sched_domain(i); + memset(sd, 0, sizeof(struct sched_domain)); + cpus_clear(sd->span); + cpu_set(i, sd->span); + + group = sched_group_init + i; + group->next = group; + cpus_clear(group->cpumask); + cpu_set(i, group->cpumask); + group->cpu_power = SCHED_LOAD_SCALE; + + sd->groups = group; #endif rq = cpu_rq(i); @@ -3630,8 +3582,6 @@ void __init sched_init(void) set_task_cpu(current, smp_processor_id()); wake_up_forked_process(current); - init_timers(); - /* * The boot idle thread does lazy MMU switching as well: */ --- linux/init/main.c.orig +++ linux/init/main.c @@ -418,6 +418,12 @@ asmlinkage void __init start_kernel(void * printk() and can access its per-cpu storage. */ smp_prepare_boot_cpu(); + /* + * Set up the scheduler prior to starting any interrupts (such as the + * timer interrupt). Full topology setup happens at smp_init() + * time - but meanwhile we still have a functioning scheduler. + */ + sched_init(); build_all_zonelists(); page_alloc_init(); @@ -430,7 +436,7 @@ asmlinkage void __init start_kernel(void rcu_init(); init_IRQ(); pidhash_init(); - sched_init(); + init_timers(); softirq_init(); time_init();