rollup of scheduler patches queued in -mm. -- Index: linux/arch/i386/kernel/smpboot.c =================================================================== --- linux.orig/arch/i386/kernel/smpboot.c +++ linux/arch/i386/kernel/smpboot.c @@ -1059,6 +1059,7 @@ static void smp_tune_scheduling (void) cachesize = 16; /* Pentiums, 2x8kB cache */ bandwidth = 100; } + max_cache_size = cachesize * 1024; } } Index: linux/arch/ia64/kernel/domain.c =================================================================== --- linux.orig/arch/ia64/kernel/domain.c +++ linux/arch/ia64/kernel/domain.c @@ -350,6 +350,10 @@ next_sg: #endif cpu_attach_domain(sd, i); } + /* + * Tune cache-hot values: + */ + calibrate_migration_costs(); } /* * Set up scheduler domains and groups. Callers must hold the hotplug lock. Index: linux/arch/ia64/kernel/setup.c =================================================================== --- linux.orig/arch/ia64/kernel/setup.c +++ linux/arch/ia64/kernel/setup.c @@ -59,6 +59,7 @@ #include #include #include +#include #if defined(CONFIG_SMP) && (IA64_CPU_SIZE > PAGE_SIZE) # error "struct cpuinfo_ia64 too big!" @@ -646,6 +647,7 @@ static void get_max_cacheline_size (void) { unsigned long line_size, max = 1; + unsigned int cache_size = 0; u64 l, levels, unique_caches; pal_cache_config_info_t cci; s64 status; @@ -675,6 +677,8 @@ get_max_cacheline_size (void) line_size = 1 << cci.pcci_line_size; if (line_size > max) max = line_size; + if (cache_size < cci.pcci_cache_size) + cache_size = cci.pcci_cache_size; if (!cci.pcci_unified) { status = ia64_pal_cache_config_info(l, /* cache_type (instruction)= */ 1, @@ -691,6 +695,9 @@ get_max_cacheline_size (void) ia64_i_cache_stride_shift = cci.pcci_stride; } out: +#ifdef CONFIG_SMP + max_cache_size = max(max_cache_size, cache_size); +#endif if (max > ia64_max_cacheline_size) ia64_max_cacheline_size = max; } @@ -827,3 +834,12 @@ check_bugs (void) ia64_patch_mckinley_e9((unsigned long) __start___mckinley_e9_bundles, (unsigned long) __end___mckinley_e9_bundles); } + +/* + * On SMP systems, when the scheduler does migration-cost autodetection, + * it needs a way to flush as much of the CPU's caches as possible. + */ +void sched_cacheflush(void) +{ + ia64_sal_cache_flush(3); +} Index: linux/arch/ppc/xmon/xmon.c =================================================================== --- linux.orig/arch/ppc/xmon/xmon.c +++ linux/arch/ppc/xmon/xmon.c @@ -99,7 +99,7 @@ static void remove_bpts(void); static void insert_bpts(void); static struct bpt *at_breakpoint(unsigned pc); static void bpt_cmds(void); -static void cacheflush(void); +void cacheflush(void); #ifdef CONFIG_SMP static void cpu_cmd(void); #endif /* CONFIG_SMP */ Index: linux/fs/pipe.c =================================================================== --- linux.orig/fs/pipe.c +++ linux/fs/pipe.c @@ -39,7 +39,11 @@ void pipe_wait(struct inode * inode) { DEFINE_WAIT(wait); - prepare_to_wait(PIPE_WAIT(*inode), &wait, TASK_INTERRUPTIBLE); + /* + * Pipes are system-local resources, so sleeping on them + * is considered a noninteractive wait: + */ + prepare_to_wait(PIPE_WAIT(*inode), &wait, TASK_INTERRUPTIBLE|TASK_NONINTERACTIVE); up(PIPE_SEM(*inode)); schedule(); finish_wait(PIPE_WAIT(*inode), &wait); Index: linux/include/asm-alpha/system.h =================================================================== --- linux.orig/include/asm-alpha/system.h +++ linux/include/asm-alpha/system.h @@ -139,6 +139,16 @@ extern void halt(void) __attribute__((no struct task_struct; extern struct task_struct *alpha_switch_to(unsigned long, struct task_struct*); +/* + * On SMP systems, when the scheduler does migration-cost autodetection, + * it needs a way to flush as much of the CPU's caches as possible. + * + * TODO: fill this in! + */ +static inline void sched_cacheflush(void) +{ +} + #define mb() \ __asm__ __volatile__("mb": : :"memory") Index: linux/include/asm-arm/system.h =================================================================== --- linux.orig/include/asm-arm/system.h +++ linux/include/asm-arm/system.h @@ -172,6 +172,16 @@ do { \ } while (0) /* + * On SMP systems, when the scheduler does migration-cost autodetection, + * it needs a way to flush as much of the CPU's caches as possible. + * + * TODO: fill this in! + */ +static inline void sched_cacheflush(void) +{ +} + +/* * CPU interrupt mask handling. */ #if __LINUX_ARM_ARCH__ >= 6 Index: linux/include/asm-arm26/system.h =================================================================== --- linux.orig/include/asm-arm26/system.h +++ linux/include/asm-arm26/system.h @@ -115,6 +115,16 @@ do { \ } while (0) /* + * On SMP systems, when the scheduler does migration-cost autodetection, + * it needs a way to flush as much of the CPU's caches as possible. + * + * TODO: fill this in! + */ +static inline void sched_cacheflush(void) +{ +} + +/* * Save the current interrupt enable state & disable IRQs */ #define local_irq_save(x) \ Index: linux/include/asm-i386/system.h =================================================================== --- linux.orig/include/asm-i386/system.h +++ linux/include/asm-i386/system.h @@ -468,6 +468,15 @@ void enable_hlt(void); extern int es7000_plat; void cpu_idle_wait(void); +/* + * On SMP systems, when the scheduler does migration-cost autodetection, + * it needs a way to flush as much of the CPU's caches as possible: + */ +static inline void sched_cacheflush(void) +{ + wbinvd(); +} + extern unsigned long arch_align_stack(unsigned long sp); #endif Index: linux/include/asm-i386/topology.h =================================================================== --- linux.orig/include/asm-i386/topology.h +++ linux/include/asm-i386/topology.h @@ -72,7 +72,6 @@ static inline int node_to_first_cpu(int .max_interval = 32, \ .busy_factor = 32, \ .imbalance_pct = 125, \ - .cache_hot_time = (10*1000000), \ .cache_nice_tries = 1, \ .busy_idx = 3, \ .idle_idx = 1, \ Index: linux/include/asm-ia64/system.h =================================================================== --- linux.orig/include/asm-ia64/system.h +++ linux/include/asm-ia64/system.h @@ -277,6 +277,7 @@ extern void ia64_load_extra (struct task #define ia64_platform_is(x) (strcmp(x, platform_name) == 0) void cpu_idle_wait(void); +void sched_cacheflush(void); #define arch_align_stack(x) (x) Index: linux/include/asm-ia64/topology.h =================================================================== --- linux.orig/include/asm-ia64/topology.h +++ linux/include/asm-ia64/topology.h @@ -55,7 +55,6 @@ void build_cpu_to_node_map(void); .max_interval = 4, \ .busy_factor = 64, \ .imbalance_pct = 125, \ - .cache_hot_time = (10*1000000), \ .per_cpu_gain = 100, \ .cache_nice_tries = 2, \ .busy_idx = 2, \ @@ -81,7 +80,6 @@ void build_cpu_to_node_map(void); .max_interval = 8*(min(num_online_cpus(), 32)), \ .busy_factor = 64, \ .imbalance_pct = 125, \ - .cache_hot_time = (10*1000000), \ .cache_nice_tries = 2, \ .busy_idx = 3, \ .idle_idx = 2, \ @@ -107,7 +105,6 @@ void build_cpu_to_node_map(void); .max_interval = 64*num_online_cpus(), \ .busy_factor = 128, \ .imbalance_pct = 133, \ - .cache_hot_time = (10*1000000), \ .cache_nice_tries = 1, \ .busy_idx = 3, \ .idle_idx = 3, \ Index: linux/include/asm-m32r/system.h =================================================================== --- linux.orig/include/asm-m32r/system.h +++ linux/include/asm-m32r/system.h @@ -67,6 +67,16 @@ last = __last; \ } while(0) +/* + * On SMP systems, when the scheduler does migration-cost autodetection, + * it needs a way to flush as much of the CPU's caches as possible. + * + * TODO: fill this in! + */ +static inline void sched_cacheflush(void) +{ +} + /* Interrupt Control */ #if !defined(CONFIG_CHIP_M32102) #define local_irq_enable() \ Index: linux/include/asm-mips/mach-ip27/topology.h =================================================================== --- linux.orig/include/asm-mips/mach-ip27/topology.h +++ linux/include/asm-mips/mach-ip27/topology.h @@ -24,7 +24,6 @@ extern unsigned char __node_distances[MA .max_interval = 32, \ .busy_factor = 32, \ .imbalance_pct = 125, \ - .cache_hot_time = (10*1000), \ .cache_nice_tries = 1, \ .per_cpu_gain = 100, \ .flags = SD_LOAD_BALANCE \ Index: linux/include/asm-mips/system.h =================================================================== --- linux.orig/include/asm-mips/system.h +++ linux/include/asm-mips/system.h @@ -159,6 +159,16 @@ do { \ (last) = resume(prev, next, next->thread_info); \ } while(0) +/* + * On SMP systems, when the scheduler does migration-cost autodetection, + * it needs a way to flush as much of the CPU's caches as possible. + * + * TODO: fill this in! + */ +static inline void sched_cacheflush(void) +{ +} + #define ROT_IN_PIECES \ " .set noreorder \n" \ " .set reorder \n" Index: linux/include/asm-parisc/system.h =================================================================== --- linux.orig/include/asm-parisc/system.h +++ linux/include/asm-parisc/system.h @@ -49,6 +49,15 @@ extern struct task_struct *_switch_to(st (last) = _switch_to(prev, next); \ } while(0) +/* + * On SMP systems, when the scheduler does migration-cost autodetection, + * it needs a way to flush as much of the CPU's caches as possible. + * + * TODO: fill this in! + */ +static inline void sched_cacheflush(void) +{ +} /* interrupt control */ Index: linux/include/asm-ppc/system.h =================================================================== --- linux.orig/include/asm-ppc/system.h +++ linux/include/asm-ppc/system.h @@ -95,6 +95,16 @@ extern struct task_struct *__switch_to(s struct task_struct *); #define switch_to(prev, next, last) ((last) = __switch_to((prev), (next))) +/* + * On SMP systems, when the scheduler does migration-cost autodetection, + * it needs a way to flush as much of the CPU's caches as possible. + * + * TODO: fill this in! + */ +static inline void sched_cacheflush(void) +{ +} + struct thread_struct; extern struct task_struct *_switch(struct thread_struct *prev, struct thread_struct *next); Index: linux/include/asm-ppc64/system.h =================================================================== --- linux.orig/include/asm-ppc64/system.h +++ linux/include/asm-ppc64/system.h @@ -141,6 +141,15 @@ struct thread_struct; extern struct task_struct * _switch(struct thread_struct *prev, struct thread_struct *next); +/* + * On SMP systems, when the scheduler does migration-cost autodetection, + * it needs a way to flush as much of the CPU's caches as possible. + * TODO: how do you cacheflush on ppc64? + */ +static inline void sched_cacheflush(void) +{ +} + static inline int __is_processor(unsigned long pv) { unsigned long pvr; Index: linux/include/asm-ppc64/topology.h =================================================================== --- linux.orig/include/asm-ppc64/topology.h +++ linux/include/asm-ppc64/topology.h @@ -46,7 +46,6 @@ static inline int node_to_first_cpu(int .max_interval = 32, \ .busy_factor = 32, \ .imbalance_pct = 125, \ - .cache_hot_time = (10*1000000), \ .cache_nice_tries = 1, \ .per_cpu_gain = 100, \ .flags = SD_LOAD_BALANCE \ Index: linux/include/asm-s390/system.h =================================================================== --- linux.orig/include/asm-s390/system.h +++ linux/include/asm-s390/system.h @@ -104,6 +104,16 @@ static inline void restore_access_regs(u prev = __switch_to(prev,next); \ } while (0) +/* + * On SMP systems, when the scheduler does migration-cost autodetection, + * it needs a way to flush as much of the CPU's caches as possible. + * + * TODO: fill this in! + */ +static inline void sched_cacheflush(void) +{ +} + #ifdef CONFIG_VIRT_CPU_ACCOUNTING extern void account_user_vtime(struct task_struct *); extern void account_system_vtime(struct task_struct *); Index: linux/include/asm-sh/system.h =================================================================== --- linux.orig/include/asm-sh/system.h +++ linux/include/asm-sh/system.h @@ -57,6 +57,16 @@ last = __last; \ } while (0) +/* + * On SMP systems, when the scheduler does migration-cost autodetection, + * it needs a way to flush as much of the CPU's caches as possible. + * + * TODO: fill this in! + */ +static inline void sched_cacheflush(void) +{ +} + #define nop() __asm__ __volatile__ ("nop") Index: linux/include/asm-sparc/system.h =================================================================== --- linux.orig/include/asm-sparc/system.h +++ linux/include/asm-sparc/system.h @@ -167,6 +167,16 @@ extern void fpsave(unsigned long *fpregs } while(0) /* + * On SMP systems, when the scheduler does migration-cost autodetection, + * it needs a way to flush as much of the CPU's caches as possible. + * + * TODO: fill this in! + */ +static inline void sched_cacheflush(void) +{ +} + +/* * Changing the IRQ level on the Sparc. */ extern void local_irq_restore(unsigned long); Index: linux/include/asm-sparc64/system.h =================================================================== --- linux.orig/include/asm-sparc64/system.h +++ linux/include/asm-sparc64/system.h @@ -219,6 +219,16 @@ do { if (test_thread_flag(TIF_PERFCTR)) } \ } while(0) +/* + * On SMP systems, when the scheduler does migration-cost autodetection, + * it needs a way to flush as much of the CPU's caches as possible. + * + * TODO: fill this in! + */ +static inline void sched_cacheflush(void) +{ +} + static inline unsigned long xchg32(__volatile__ unsigned int *m, unsigned int val) { unsigned long tmp1, tmp2; Index: linux/include/asm-x86_64/system.h =================================================================== --- linux.orig/include/asm-x86_64/system.h +++ linux/include/asm-x86_64/system.h @@ -178,6 +178,15 @@ static inline void write_cr4(unsigned lo #define wbinvd() \ __asm__ __volatile__ ("wbinvd": : :"memory"); +/* + * On SMP systems, when the scheduler does migration-cost autodetection, + * it needs a way to flush as much of the CPU's caches as possible. + */ +static inline void sched_cacheflush(void) +{ + wbinvd(); +} + #endif /* __KERNEL__ */ #define nop() __asm__ __volatile__ ("nop") Index: linux/include/asm-x86_64/topology.h =================================================================== --- linux.orig/include/asm-x86_64/topology.h +++ linux/include/asm-x86_64/topology.h @@ -38,7 +38,6 @@ extern int __node_distance(int, int); .max_interval = 32, \ .busy_factor = 32, \ .imbalance_pct = 125, \ - .cache_hot_time = (10*1000000), \ .cache_nice_tries = 2, \ .busy_idx = 3, \ .idle_idx = 2, \ Index: linux/include/linux/sched.h =================================================================== --- linux.orig/include/linux/sched.h +++ linux/include/linux/sched.h @@ -112,6 +112,7 @@ extern unsigned long nr_iowait(void); #define TASK_TRACED 8 #define EXIT_ZOMBIE 16 #define EXIT_DEAD 32 +#define TASK_NONINTERACTIVE 64 #define __set_task_state(tsk, state_value) \ do { (tsk)->state = (state_value); } while (0) @@ -551,7 +552,17 @@ extern cpumask_t cpu_isolated_map; extern void init_sched_build_groups(struct sched_group groups[], cpumask_t span, int (*group_fn)(int cpu)); extern void cpu_attach_domain(struct sched_domain *sd, int cpu); + #endif /* ARCH_HAS_SCHED_DOMAIN */ + +/* + * Maximum cache size the migration-costs auto-tuning code will + * search from: + */ +extern unsigned int max_cache_size; + +extern void calibrate_migration_costs(void); + #endif /* CONFIG_SMP */ Index: linux/include/linux/topology.h =================================================================== --- linux.orig/include/linux/topology.h +++ linux/include/linux/topology.h @@ -86,7 +86,6 @@ .max_interval = 2, \ .busy_factor = 8, \ .imbalance_pct = 110, \ - .cache_hot_time = 0, \ .cache_nice_tries = 0, \ .per_cpu_gain = 25, \ .busy_idx = 0, \ @@ -117,7 +116,6 @@ .max_interval = 4, \ .busy_factor = 64, \ .imbalance_pct = 125, \ - .cache_hot_time = (5*1000000/2), \ .cache_nice_tries = 1, \ .per_cpu_gain = 100, \ .busy_idx = 2, \ Index: linux/kernel/sched.c =================================================================== --- linux.orig/kernel/sched.c +++ linux/kernel/sched.c @@ -44,9 +44,11 @@ #include #include #include +#include #include #include #include +#include #include #include @@ -206,6 +208,7 @@ struct runqueue { */ unsigned long nr_running; #ifdef CONFIG_SMP + unsigned long prio_bias; unsigned long cpu_load[3]; #endif unsigned long long nr_switches; @@ -655,13 +658,68 @@ static int effective_prio(task_t *p) return prio; } +#ifdef CONFIG_SMP +static inline void inc_prio_bias(runqueue_t *rq, int prio) +{ + rq->prio_bias += MAX_PRIO - prio; +} + +static inline void dec_prio_bias(runqueue_t *rq, int prio) +{ + rq->prio_bias -= MAX_PRIO - prio; +} + +static inline void inc_nr_running(task_t *p, runqueue_t *rq) +{ + rq->nr_running++; + if (rt_task(p)) { + if (p != rq->migration_thread) + /* + * The migration thread does the actual balancing. Do + * not bias by its priority as the ultra high priority + * will skew balancing adversely. + */ + inc_prio_bias(rq, p->prio); + } else + inc_prio_bias(rq, p->static_prio); +} + +static inline void dec_nr_running(task_t *p, runqueue_t *rq) +{ + rq->nr_running--; + if (rt_task(p)) { + if (p != rq->migration_thread) + dec_prio_bias(rq, p->prio); + } else + dec_prio_bias(rq, p->static_prio); +} +#else +static inline void inc_prio_bias(runqueue_t *rq, int prio) +{ +} + +static inline void dec_prio_bias(runqueue_t *rq, int prio) +{ +} + +static inline void inc_nr_running(task_t *p, runqueue_t *rq) +{ + rq->nr_running++; +} + +static inline void dec_nr_running(task_t *p, runqueue_t *rq) +{ + rq->nr_running--; +} +#endif + /* * __activate_task - move a task to the runqueue. */ static inline void __activate_task(task_t *p, runqueue_t *rq) { enqueue_task(p, rq->active); - rq->nr_running++; + inc_nr_running(p, rq); } /* @@ -670,7 +728,7 @@ static inline void __activate_task(task_ static inline void __activate_idle_task(task_t *p, runqueue_t *rq) { enqueue_task_head(p, rq->active); - rq->nr_running++; + inc_nr_running(p, rq); } static int recalc_task_prio(task_t *p, unsigned long long now) @@ -789,7 +847,7 @@ static void activate_task(task_t *p, run */ static void deactivate_task(struct task_struct *p, runqueue_t *rq) { - rq->nr_running--; + dec_nr_running(p, rq); dequeue_task(p, p->array); p->array = NULL; } @@ -875,7 +933,7 @@ static int migrate_task(task_t *p, int d * smp_call_function() if an IPI is sent by the same process we are * waiting to become inactive. */ -void wait_task_inactive(task_t * p) +void wait_task_inactive(task_t *p) { unsigned long flags; runqueue_t *rq; @@ -926,27 +984,59 @@ void kick_process(task_t *p) * We want to under-estimate the load of migration sources, to * balance conservatively. */ -static inline unsigned long source_load(int cpu, int type) +static inline unsigned long __source_load(int cpu, int type, enum idle_type idle) { runqueue_t *rq = cpu_rq(cpu); - unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; + unsigned long source_load, cpu_load = rq->cpu_load[type-1], + load_now = rq->nr_running * SCHED_LOAD_SCALE; + if (type == 0) - return load_now; + source_load = load_now; + else + source_load = min(cpu_load, load_now); + + if (rq->nr_running > 1 || (idle == NOT_IDLE && rq->nr_running)) + /* + * If we are busy rebalancing the load is biased by + * priority to create 'nice' support across cpus. When + * idle rebalancing we should only bias the source_load if + * there is more than one task running on that queue to + * prevent idle rebalance from trying to pull tasks from a + * queue with only one running task. + */ + source_load = source_load * rq->prio_bias / rq->nr_running; + + return source_load; +} - return min(rq->cpu_load[type-1], load_now); +static inline unsigned long source_load(int cpu, int type) +{ + return __source_load(cpu, type, NOT_IDLE); } /* * Return a high guess at the load of a migration-target cpu */ -static inline unsigned long target_load(int cpu, int type) +static inline unsigned long __target_load(int cpu, int type, enum idle_type idle) { runqueue_t *rq = cpu_rq(cpu); - unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; + unsigned long target_load, cpu_load = rq->cpu_load[type-1], + load_now = rq->nr_running * SCHED_LOAD_SCALE; + if (type == 0) - return load_now; + target_load = load_now; + else + target_load = max(cpu_load, load_now); + + if (rq->nr_running > 1 || (idle == NOT_IDLE && rq->nr_running)) + target_load = target_load * rq->prio_bias / rq->nr_running; + + return target_load; +} - return max(rq->cpu_load[type-1], load_now); +static inline unsigned long target_load(int cpu, int type) +{ + return __target_load(cpu, type, NOT_IDLE); } /* @@ -966,8 +1056,11 @@ find_idlest_group(struct sched_domain *s int local_group; int i; + /* Skip over this group if it has no CPUs allowed */ + if (!cpus_intersects(group->cpumask, p->cpus_allowed)) + goto nextgroup; + local_group = cpu_isset(this_cpu, group->cpumask); - /* XXX: put a cpus allowed check */ /* Tally up the load of all CPUs in the group */ avg_load = 0; @@ -992,6 +1085,7 @@ find_idlest_group(struct sched_domain *s min_load = avg_load; idlest = group; } +nextgroup: group = group->next; } while (group != sd->groups); @@ -1003,13 +1097,18 @@ find_idlest_group(struct sched_domain *s /* * find_idlest_queue - find the idlest runqueue among the cpus in group. */ -static int find_idlest_cpu(struct sched_group *group, int this_cpu) +static int +find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) { + cpumask_t tmp; unsigned long load, min_load = ULONG_MAX; int idlest = -1; int i; - for_each_cpu_mask(i, group->cpumask) { + /* Traverse only the allowed CPUs */ + cpus_and(tmp, group->cpumask, p->cpus_allowed); + + for_each_cpu_mask(i, tmp) { load = source_load(i, 0); if (load < min_load || (load == min_load && i == this_cpu)) { @@ -1052,7 +1151,7 @@ static int sched_balance_self(int cpu, i if (!group) goto nextlevel; - new_cpu = find_idlest_cpu(group, cpu); + new_cpu = find_idlest_cpu(group, t, cpu); if (new_cpu == -1 || new_cpu == cpu) goto nextlevel; @@ -1127,7 +1226,7 @@ static inline int wake_idle(int cpu, tas * * returns failure only if the task is already active. */ -static int try_to_wake_up(task_t * p, unsigned int state, int sync) +static int try_to_wake_up(task_t *p, unsigned int state, int sync) { int cpu, this_cpu, success = 0; unsigned long flags; @@ -1252,6 +1351,16 @@ out_activate: } /* + * Tasks that have marked their sleep as noninteractive get + * woken up without updating their sleep average. (i.e. their + * sleep is handled in a priority-neutral manner, no priority + * boost and no penalty.) + */ + if (old_state & TASK_NONINTERACTIVE) + __activate_task(p, rq); + else + activate_task(p, rq, cpu == this_cpu); + /* * Sync wakeups (i.e. those types of wakeups where the waker * has indicated that it will leave the CPU in short order) * don't trigger a preemption, if the woken up task will run on @@ -1259,7 +1368,6 @@ out_activate: * the waker guarantees that the freshly woken up task is going * to be considered on this CPU.) */ - activate_task(p, rq, cpu == this_cpu); if (!sync || cpu != this_cpu) { if (TASK_PREEMPTS_CURR(p, rq)) resched_task(rq->curr); @@ -1274,7 +1382,7 @@ out: return success; } -int fastcall wake_up_process(task_t * p) +int fastcall wake_up_process(task_t *p) { return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); @@ -1353,7 +1461,7 @@ void fastcall sched_fork(task_t *p, int * that must be done for every newly created context, then puts the task * on the runqueue and wakes it. */ -void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags) +void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags) { unsigned long flags; int this_cpu, cpu; @@ -1389,7 +1497,7 @@ void fastcall wake_up_new_task(task_t * list_add_tail(&p->run_list, ¤t->run_list); p->array = current->array; p->array->nr_active++; - rq->nr_running++; + inc_nr_running(p, rq); } set_need_resched(); } else @@ -1436,7 +1544,7 @@ void fastcall wake_up_new_task(task_t * * artificially, because any timeslice recovered here * was given away by the parent in the first place.) */ -void fastcall sched_exit(task_t * p) +void fastcall sched_exit(task_t *p) { unsigned long flags; runqueue_t *rq; @@ -1733,9 +1841,9 @@ void pull_task(runqueue_t *src_rq, prio_ runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) { dequeue_task(p, src_array); - src_rq->nr_running--; + dec_nr_running(p, src_rq); set_task_cpu(p, this_cpu); - this_rq->nr_running++; + inc_nr_running(p, this_rq); enqueue_task(p, this_array); p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) + this_rq->timestamp_last_tick; @@ -1752,7 +1860,8 @@ void pull_task(runqueue_t *src_rq, prio_ */ static inline int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, - struct sched_domain *sd, enum idle_type idle, int *all_pinned) + struct sched_domain *sd, enum idle_type idle, + int *all_pinned) { /* * We do not migrate tasks that are: @@ -1909,9 +2018,9 @@ find_busiest_group(struct sched_domain * for_each_cpu_mask(i, group->cpumask) { /* Bias balancing toward cpus of our domain */ if (local_group) - load = target_load(i, load_idx); + load = __target_load(i, load_idx, idle); else - load = source_load(i, load_idx); + load = __source_load(i, load_idx, idle); avg_load += load; } @@ -2012,14 +2121,15 @@ out_balanced: /* * find_busiest_queue - find the busiest runqueue among the cpus in group. */ -static runqueue_t *find_busiest_queue(struct sched_group *group) +static runqueue_t *find_busiest_queue(struct sched_group *group, + enum idle_type idle) { unsigned long load, max_load = 0; runqueue_t *busiest = NULL; int i; for_each_cpu_mask(i, group->cpumask) { - load = source_load(i, 0); + load = __source_load(i, 0, idle); if (load > max_load) { max_load = load; @@ -2060,7 +2170,7 @@ static int load_balance(int this_cpu, ru goto out_balanced; } - busiest = find_busiest_queue(group); + busiest = find_busiest_queue(group, idle); if (!busiest) { schedstat_inc(sd, lb_nobusyq[idle]); goto out_balanced; @@ -2168,7 +2278,7 @@ static int load_balance_newidle(int this goto out_balanced; } - busiest = find_busiest_queue(group); + busiest = find_busiest_queue(group, NEWLY_IDLE); if (!busiest) { schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); goto out_balanced; @@ -2575,6 +2685,13 @@ out: } #ifdef CONFIG_SCHED_SMT +static inline void wakeup_busy_runqueue(runqueue_t *rq) +{ + /* If an SMT runqueue is sleeping due to priority reasons wake it up */ + if (rq->curr == rq->idle && rq->nr_running) + resched_task(rq->idle); +} + static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) { struct sched_domain *tmp, *sd = NULL; @@ -2608,12 +2725,7 @@ static inline void wake_sleeping_depende for_each_cpu_mask(i, sibling_map) { runqueue_t *smt_rq = cpu_rq(i); - /* - * If an SMT sibling task is sleeping due to priority - * reasons wake it up now. - */ - if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running) - resched_task(smt_rq->idle); + wakeup_busy_runqueue(smt_rq); } for_each_cpu_mask(i, sibling_map) @@ -2624,6 +2736,16 @@ static inline void wake_sleeping_depende */ } +/* + * number of 'lost' timeslices this task wont be able to fully + * utilize, if another task runs on a sibling. This models the + * slowdown effect of other tasks running on siblings: + */ +static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd) +{ + return p->time_slice * (100 - sd->per_cpu_gain) / 100; +} + static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) { struct sched_domain *tmp, *sd = NULL; @@ -2667,6 +2789,10 @@ static inline int dependent_sleeper(int runqueue_t *smt_rq = cpu_rq(i); task_t *smt_curr = smt_rq->curr; + /* Kernel threads do not participate in dependent sleeping */ + if (!p->mm || !smt_curr->mm || rt_task(p)) + goto check_smt_task; + /* * If a user task with lower static priority than the * running task on the SMT sibling is trying to schedule, @@ -2675,21 +2801,45 @@ static inline int dependent_sleeper(int * task from using an unfair proportion of the * physical cpu's resources. -ck */ - if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) > - task_timeslice(p) || rt_task(smt_curr)) && - p->mm && smt_curr->mm && !rt_task(p)) - ret = 1; + if (rt_task(smt_curr)) { + /* + * With real time tasks we run non-rt tasks only + * per_cpu_gain% of the time. + */ + if ((jiffies % DEF_TIMESLICE) > + (sd->per_cpu_gain * DEF_TIMESLICE / 100)) + ret = 1; + } else + if (smt_curr->static_prio < p->static_prio && + !TASK_PREEMPTS_CURR(p, smt_rq) && + smt_slice(smt_curr, sd) > task_timeslice(p)) + ret = 1; + +check_smt_task: + if ((!smt_curr->mm && smt_curr != smt_rq->idle) || + rt_task(smt_curr)) + continue; + if (!p->mm) { + wakeup_busy_runqueue(smt_rq); + continue; + } /* - * Reschedule a lower priority task on the SMT sibling, - * or wake it up if it has been put to sleep for priority - * reasons. + * Reschedule a lower priority task on the SMT sibling for + * it to be put to sleep, or wake it up if it has been put to + * sleep for priority reasons to see if it should run now. */ - if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) > - task_timeslice(smt_curr) || rt_task(p)) && - smt_curr->mm && p->mm && !rt_task(smt_curr)) || - (smt_curr == smt_rq->idle && smt_rq->nr_running)) - resched_task(smt_curr); + if (rt_task(p)) { + if ((jiffies % DEF_TIMESLICE) > + (sd->per_cpu_gain * DEF_TIMESLICE / 100)) + resched_task(smt_curr); + } else { + if (TASK_PREEMPTS_CURR(p, smt_rq) && + smt_slice(p, sd) > task_timeslice(smt_curr)) + resched_task(smt_curr); + else + wakeup_busy_runqueue(smt_rq); + } } out_unlock: for_each_cpu_mask(i, sibling_map) @@ -3014,7 +3164,8 @@ need_resched: #endif /* CONFIG_PREEMPT */ -int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key) +int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, + void *key) { task_t *p = curr->private; return try_to_wake_up(p, mode, sync); @@ -3056,7 +3207,7 @@ static void __wake_up_common(wait_queue_ * @key: is directly passed to the wakeup function */ void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, void *key) + int nr_exclusive, void *key) { unsigned long flags; @@ -3088,7 +3239,8 @@ void fastcall __wake_up_locked(wait_queu * * On UP it can prevent extra preemption. */ -void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) +void fastcall +__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) { unsigned long flags; int sync = 1; @@ -3279,7 +3431,8 @@ void fastcall __sched interruptible_slee EXPORT_SYMBOL(interruptible_sleep_on); -long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) +long fastcall __sched +interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) { SLEEP_ON_VAR @@ -3347,8 +3500,10 @@ void set_user_nice(task_t *p, long nice) goto out_unlock; } array = p->array; - if (array) + if (array) { dequeue_task(p, array); + dec_prio_bias(rq, p->static_prio); + } old_prio = p->prio; new_prio = NICE_TO_PRIO(nice); @@ -3358,6 +3513,7 @@ void set_user_nice(task_t *p, long nice) if (array) { enqueue_task(p, array); + inc_prio_bias(rq, p->static_prio); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: @@ -3498,7 +3654,8 @@ static void __setscheduler(struct task_s * @policy: new policy. * @param: structure containing the new RT priority. */ -int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param) +int sched_setscheduler(struct task_struct *p, int policy, + struct sched_param *param) { int retval; int oldprio, oldpolicy = -1; @@ -3581,7 +3738,8 @@ recheck: } EXPORT_SYMBOL_GPL(sched_setscheduler); -static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) +static int +do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) { int retval; struct sched_param lparam; @@ -3912,7 +4070,7 @@ EXPORT_SYMBOL(cond_resched); * operations here to prevent schedule() from being called twice (once via * spin_unlock(), once by hand). */ -int cond_resched_lock(spinlock_t * lock) +int cond_resched_lock(spinlock_t *lock) { int ret = 0; @@ -4095,7 +4253,7 @@ static inline struct task_struct *younge return list_entry(p->sibling.next,struct task_struct,sibling); } -static void show_task(task_t * p) +static void show_task(task_t *p) { task_t *relative; unsigned state; @@ -4121,7 +4279,7 @@ static void show_task(task_t * p) #endif #ifdef CONFIG_DEBUG_STACK_USAGE { - unsigned long * n = (unsigned long *) (p->thread_info+1); + unsigned long *n = (unsigned long *) (p->thread_info+1); while (!*n) n++; free = (unsigned long) n - (unsigned long)(p->thread_info+1); @@ -4330,7 +4488,7 @@ out: * thread migration by bumping thread off CPU then 'pushing' onto * another runqueue. */ -static int migration_thread(void * data) +static int migration_thread(void *data) { runqueue_t *rq; int cpu = (long)data; @@ -4542,6 +4700,129 @@ static void migrate_dead_tasks(unsigned } #endif /* CONFIG_HOTPLUG_CPU */ +#if defined(CONFIG_DEBUG_KERNEL) && defined(CONFIG_SYSCTL) +static struct ctl_table sd_ctl_dir[] = { + {1, "sched_domain", NULL, 0, 0755, NULL, }, + {0,}, +}; + +static struct ctl_table sd_ctl_root[] = { + {1, "kernel", NULL, 0, 0755, sd_ctl_dir, }, + {0,}, +}; + +static char *sched_strdup(char *str) +{ + int n = strlen(str)+1; + char *s = kmalloc(n, GFP_KERNEL); + if (!s) + return NULL; + return strcpy(s, str); +} + +static struct ctl_table *sd_alloc_ctl_entry(int n) +{ + struct ctl_table *entry = + kmalloc(n * sizeof(struct ctl_table), GFP_KERNEL); + BUG_ON(!entry); + memset(entry, 0, n * sizeof(struct ctl_table)); + return entry; +} + +static void set_table_entry(struct ctl_table *entry, int ctl_name, + const char *procname, void *data, int maxlen, + mode_t mode, proc_handler *proc_handler) +{ + entry->ctl_name = ctl_name; + entry->procname = procname; + entry->data = data; + entry->maxlen = maxlen; + entry->mode = mode; + entry->proc_handler = proc_handler; +} + +static struct ctl_table * +sd_alloc_ctl_domain_table(struct sched_domain *sd) +{ + struct ctl_table *table; + table = sd_alloc_ctl_entry(14); + + set_table_entry(&table[0], 1, "min_interval", &sd->min_interval, + sizeof(long), 0644, proc_doulongvec_minmax); + set_table_entry(&table[1], 2, "max_interval", &sd->max_interval, + sizeof(long), 0644, proc_doulongvec_minmax); + set_table_entry(&table[2], 3, "busy_idx", &sd->busy_idx, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[3], 4, "idle_idx", &sd->idle_idx, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[4], 5, "newidle_idx", &sd->newidle_idx, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[5], 6, "wake_idx", &sd->wake_idx, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[6], 7, "forkexec_idx", &sd->forkexec_idx, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[7], 8, "busy_factor", &sd->busy_factor, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[8], 9, "imbalance_pct", &sd->imbalance_pct, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[9], 10, "cache_hot_time", &sd->cache_hot_time, + sizeof(long long), 0644, proc_doulongvec_minmax); + set_table_entry(&table[10], 11, "cache_nice_tries", &sd->cache_nice_tries, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[11], 12, "per_cpu_gain", &sd->per_cpu_gain, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[12], 13, "flags", &sd->flags, + sizeof(int), 0644, proc_dointvec_minmax); + return table; +} + +static ctl_table *sd_alloc_ctl_cpu_table(int cpu) +{ + struct sched_domain *sd; + int domain_num = 0, i; + struct ctl_table *entry, *table; + char buf[32]; + for_each_domain(cpu, sd) + domain_num++; + entry = table = sd_alloc_ctl_entry(domain_num + 1); + + i = 0; + for_each_domain(cpu, sd) { + snprintf(buf, 32, "domain%d", i); + entry->ctl_name = i + 1; + entry->procname = sched_strdup(buf); + entry->mode = 0755; + entry->child = sd_alloc_ctl_domain_table(sd); + entry++; + i++; + } + return table; +} + +static struct ctl_table_header *sd_sysctl_header; +static void init_sched_domain_sysctl(void) +{ + int i, cpu_num = num_online_cpus(); + char buf[32]; + struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); + + sd_ctl_dir[0].child = entry; + + for (i = 0; i < cpu_num; i++, entry++) { + snprintf(buf, 32, "cpu%d", i); + entry->ctl_name = i + 1; + entry->procname = sched_strdup(buf); + entry->mode = 0755; + entry->child = sd_alloc_ctl_cpu_table(i); + } + sd_sysctl_header = register_sysctl_table(sd_ctl_root, 0); +} +#else +static void init_sched_domain_sysctl(void) +{ +} +#endif + /* * migration_call - callback that gets triggered when a CPU is added. * Here we can start up the necessary migration thread for the new CPU. @@ -4865,6 +5146,487 @@ void init_sched_build_groups(struct sche } +/* + * Self-tuning task migration cost measurement between source and target CPUs. + * + * This is done by measuring the cost of manipulating buffers of varying + * sizes. For a given buffer-size here are the steps that are taken: + * + * 1) the source CPU reads+dirties a shared buffer + * 2) the target CPU reads+dirties the same shared buffer + * + * We measure how long they take, in the following 4 scenarios: + * + * - source: CPU1, target: CPU2 | cost1 + * - source: CPU2, target: CPU1 | cost2 + * - source: CPU1, target: CPU1 | cost3 + * - source: CPU2, target: CPU2 | cost4 + * + * We then calculate the cost3+cost4-cost1-cost2 difference - this is + * the cost of migration. + * + * We then start off from a small buffer-size and iterate up to larger + * buffer sizes, in 5% steps - measuring each buffer-size separately, and + * doing a maximum search for the cost. (The maximum cost for a migration + * normally occurs when the working set size is around the effective cache + * size.) + */ +#define SEARCH_SCOPE 2 +#define MIN_CACHE_SIZE (64*1024U) +#define DEFAULT_CACHE_SIZE (5*1024*1024U) +#define ITERATIONS 2 +#define SIZE_THRESH 130 +#define COST_THRESH 130 + +/* + * The migration cost is a function of 'domain distance'. Domain + * distance is the number of steps a CPU has to iterate down its + * domain tree to share a domain with the other CPU. The farther + * two CPUs are from each other, the larger the distance gets. + * + * Note that we use the distance only to cache measurement results, + * the distance value is not used numerically otherwise. When two + * CPUs have the same distance it is assumed that the migration + * cost is the same. (this is a simplification but quite practical) + */ +#define MAX_DOMAIN_DISTANCE 32 + +static __initdata unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] = + { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] = -1LL }; + +/* + * Allow override of migration cost - in units of microseconds. + * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost + * of 1 msec, level-2 cost of 2 msecs and level3 cost of 3 msecs: + */ +static int __init migration_cost_setup(char *str) +{ + int ints[MAX_DOMAIN_DISTANCE+1], i; + + str = get_options(str, ARRAY_SIZE(ints), ints); + + printk("#ints: %d\n", ints[0]); + for (i = 1; i <= ints[0]; i++) { + migration_cost[i-1] = (unsigned long long)ints[i]*1000; + printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]); + } + return 1; +} + +__setup ("migration_cost=", migration_cost_setup); + +/* + * Global multiplier (divisor) for migration-cutoff values, + * in percentiles. E.g. use a value of 150 to get 1.5 times + * longer cache-hot cutoff times. + * + * (We scale it from 100 to 128 to long long handling easier.) + */ + +#define MIGRATION_FACTOR_SCALE 128 + +static __initdata unsigned int migration_factor = MIGRATION_FACTOR_SCALE; + +static int __init setup_migration_factor(char *str) +{ + get_option(&str, &migration_factor); + migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100; + return 1; +} + +__setup("migration_factor=", setup_migration_factor); + +/* + * Estimated distance of two CPUs, measured via the number of domains + * we have to pass for the two CPUs to be in the same span: + */ +__devinit static unsigned long domain_distance(int cpu1, int cpu2) +{ + unsigned long distance = 0; + struct sched_domain *sd; + + for_each_domain(cpu1, sd) { + WARN_ON(!cpu_isset(cpu1, sd->span)); + if (cpu_isset(cpu2, sd->span)) + return distance; + distance++; + } + if (distance >= MAX_DOMAIN_DISTANCE) { + WARN_ON(1); + distance = MAX_DOMAIN_DISTANCE-1; + } + + return distance; +} + +static __initdata unsigned int migration_debug = 1; + +static int __init setup_migration_debug(char *str) +{ + get_option(&str, &migration_debug); + return 1; +} + +__setup("migration_debug=", setup_migration_debug); + +/* + * Maximum cache-size that the scheduler should try to measure. + * Architectures with larger caches should tune this up during + * bootup. Gets used in the domain-setup code (i.e. during SMP + * bootup). + */ +__initdata unsigned int max_cache_size; + +static int __init setup_max_cache_size(char *str) +{ + get_option(&str, &max_cache_size); + return 1; +} + +__setup("max_cache_size=", setup_max_cache_size); + +/* + * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. This + * is the operation that is timed, so we try to generate unpredictable + * cachemisses that still end up filling the L2 cache: + */ +__init static void touch_cache(void *__cache, unsigned long __size) +{ + unsigned long size = __size/sizeof(long), chunk1 = size/3, + chunk2 = 2*size/3; + unsigned long *cache = __cache; + int i; + + for (i = 0; i < size/6; i += 8) { + switch (i % 6) { + case 0: cache[i]++; + case 1: cache[size-1-i]++; + case 2: cache[chunk1-i]++; + case 3: cache[chunk1+i]++; + case 4: cache[chunk2-i]++; + case 5: cache[chunk2+i]++; + } + } +} + +/* + * Measure the cache-cost of one task migration. Returns in units of nsec. + */ +__init static unsigned long long measure_one(void *cache, unsigned long size, + int source, int target) +{ + cpumask_t mask, saved_mask; + unsigned long long t0, t1, t2, t3, cost; + + saved_mask = current->cpus_allowed; + + /* + * Flush source caches to RAM and invalidate them: + */ + sched_cacheflush(); + + /* + * Migrate to the source CPU: + */ + mask = cpumask_of_cpu(source); + set_cpus_allowed(current, mask); + WARN_ON(smp_processor_id() != source); + + /* + * Dirty the working set: + */ + t0 = sched_clock(); + touch_cache(cache, size); + t1 = sched_clock(); + + /* + * Migrate to the target CPU, dirty the L2 cache and access + * the shared buffer. (which represents the working set + * of a migrated task.) + */ + mask = cpumask_of_cpu(target); + set_cpus_allowed(current, mask); + WARN_ON(smp_processor_id() != target); + + t2 = sched_clock(); + touch_cache(cache, size); + t3 = sched_clock(); + + cost = t1-t0 + t3-t2; + + if (migration_debug >= 2) + printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n", + source, target, t1-t0, t1-t0, t3-t2, cost); + /* + * Flush target caches to RAM and invalidate them: + */ + sched_cacheflush(); + + set_cpus_allowed(current, saved_mask); + + return cost; +} + +/* + * Measure a series of task migrations and return the average + * result. Since this code runs early during bootup the system + * is 'undisturbed' and the average latency makes sense. + * + * The algorithm in essence auto-detects the relevant cache-size, + * so it will properly detect different cachesizes for different + * cache-hierarchies, depending on how the CPUs are connected. + * + * Architectures can prime the upper limit of the search range via + * max_cache_size, otherwise the search range defaults to 20MB...64K. + */ +__init static unsigned long long +measure_cost(int cpu1, int cpu2, void *cache, unsigned int size) +{ + unsigned long long cost1, cost2; + int i; + + /* + * Measure the migration cost of 'size' bytes, over an + * average of 10 runs: + * + * (We perturb the cache size by a small (0..4k) + * value to compensate size/alignment related artifacts. + * We also subtract the cost of the operation done on + * the same CPU.) + */ + cost1 = 0; + + /* + * dry run, to make sure we start off cache-cold on cpu1, + * and to get any vmalloc pagefaults in advance: + */ + measure_one(cache, size, cpu1, cpu2); + for (i = 0; i < ITERATIONS; i++) + cost1 += measure_one(cache, size - i*1024, cpu1, cpu2); + + measure_one(cache, size, cpu2, cpu1); + for (i = 0; i < ITERATIONS; i++) + cost1 += measure_one(cache, size - i*1024, cpu2, cpu1); + + /* + * (We measure the non-migrating [cached] cost on both + * cpu1 and cpu2, to handle CPUs with different speeds) + */ + cost2 = 0; + + measure_one(cache, size, cpu1, cpu1); + for (i = 0; i < ITERATIONS; i++) + cost2 += measure_one(cache, size - i*1024, cpu1, cpu1); + + measure_one(cache, size, cpu2, cpu2); + for (i = 0; i < ITERATIONS; i++) + cost2 += measure_one(cache, size - i*1024, cpu2, cpu2); + + /* + * Get the per-iteration migration cost: + */ + do_div(cost1, 2*ITERATIONS); + do_div(cost2, 2*ITERATIONS); + + return cost1 - cost2; +} + +__devinit static unsigned long long measure_migration_cost(int cpu1, int cpu2) +{ + unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0; + unsigned int max_size, size, size_found = 0; + long long cost = 0, prev_cost; + void *cache; + + /* + * Search from max_cache_size*5 down to 64K - the real relevant + * cachesize has to lie somewhere inbetween. + */ + if (max_cache_size) { + max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE); + size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE); + } else { + /* + * Since we have no estimation about the relevant + * search range + */ + max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE; + size = MIN_CACHE_SIZE; + } + + if (!cpu_online(cpu1) || !cpu_online(cpu2)) { + printk("cpu %d and %d not both online!\n", cpu1, cpu2); + return 0; + } + + /* + * Allocate the working set: + */ + cache = vmalloc(max_size); + if (!cache) { + printk("could not vmalloc %d bytes for cache!\n", 2*max_size); + return 1000000; // return 1 msec on very small boxen + } + + while (size <= max_size) { + prev_cost = cost; + cost = measure_cost(cpu1, cpu2, cache, size); + + /* + * Update the max: + */ + if (cost > 0) { + if (max_cost < cost) { + max_cost = cost; + size_found = size; + } + } + /* + * Calculate average fluctuation, we use this to prevent + * noise from triggering an early break out of the loop: + */ + fluct = abs(cost - prev_cost); + avg_fluct = (avg_fluct + fluct)/2; + + if (migration_debug) + printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): (%8Ld %8Ld)\n", + cpu1, cpu2, size, + (long)cost / 1000000, + ((long)cost / 100000) % 10, + (long)max_cost / 1000000, + ((long)max_cost / 100000) % 10, + domain_distance(cpu1, cpu2), + cost, avg_fluct); + + /* + * If we iterated at least 20% past the previous maximum, + * and the cost has dropped by more than 20% already, + * (taking fluctuations into account) then we assume to + * have found the maximum and break out of the loop early: + */ + if (size_found && (size*100 > size_found*SIZE_THRESH)) + if (cost+avg_fluct <= 0 || + max_cost*100 > (cost+avg_fluct)*COST_THRESH) { + + if (migration_debug) + printk("-> found max.\n"); + break; + } + /* + * Increase the cachesize in 5% steps: + */ + size = size * 20 / 19; + } + + if (migration_debug) + printk("[%d][%d] working set size found: %d, cost: %Ld\n", + cpu1, cpu2, size_found, max_cost); + + vfree(cache); + + /* + * A task is considered 'cache cold' if at least 2 times + * the worst-case cost of migration has passed. + * + * (this limit is only listened to if the load-balancing + * situation is 'nice' - if there is a large imbalance we + * ignore it for the sake of CPU utilization and + * processing fairness.) + */ + return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE; +} + +void __devinit calibrate_migration_costs(void) +{ + int cpu1 = -1, cpu2 = -1, cpu; + struct sched_domain *sd; + unsigned long distance, max_distance = 0; + unsigned long long cost; + unsigned long flags, j0, j1; + + local_irq_save(flags); + local_irq_enable(); + j0 = jiffies; + + /* + * First pass - calculate the cacheflush times: + */ + for_each_online_cpu(cpu1) { + for_each_online_cpu(cpu2) { + if (cpu1 == cpu2) + continue; + distance = domain_distance(cpu1, cpu2); + max_distance = max(max_distance, distance); + /* + * Do we have the result cached already? + */ + if (migration_cost[distance] != -1LL) + cost = migration_cost[distance]; + else { + cost = measure_migration_cost(cpu1, cpu2); + migration_cost[distance] = cost; + } + } + } + /* + * Second pass - update the sched domain hierarchy with + * the new cache-hot-time estimations: + */ + for_each_online_cpu(cpu) { + distance = 0; + for_each_domain(cpu, sd) { + sd->cache_hot_time = migration_cost[distance]; + distance++; + } + } + /* + * Print the matrix: + */ + printk("---------------------\n"); + printk("| migration cost matrix (max_cache_size: %d, cpu: %d MHz):\n", + max_cache_size, +#ifdef CONFIG_X86 + cpu_khz/1000 +#else + -1 +#endif + ); + printk("---------------------\n"); + printk(" "); + for_each_online_cpu(cpu1) + printk(" [%02d]", cpu1); + printk("\n"); + for_each_online_cpu(cpu1) { + printk("[%02d]: ", cpu1); + for_each_online_cpu(cpu2) { + if (cpu1 == cpu2) { + printk(" - "); + continue; + } + distance = domain_distance(cpu1, cpu2); + max_distance = max(max_distance, distance); + cost = migration_cost[distance]; + printk(" %2ld.%ld(%ld)", (long)cost / 1000000, + ((long)cost / 100000) % 10, distance); + } + printk("\n"); + } + printk("--------------------------------\n"); + printk("| cacheflush times [%ld]:", max_distance+1); + for (distance = 0; distance <= max_distance; distance++) { + cost = migration_cost[distance]; + printk(" %ld.%ld (%Ld)", (long)cost / 1000000, + ((long)cost / 100000) % 10, cost); + } + printk("\n"); + j1 = jiffies; + printk("| calibration delay: %ld seconds\n", (j1-j0)/HZ); + printk("--------------------------------\n"); + + local_irq_restore(flags); +} + + #ifdef ARCH_HAS_SCHED_DOMAIN extern void build_sched_domains(const cpumask_t *cpu_map); extern void arch_init_sched_domains(const cpumask_t *cpu_map); @@ -5035,6 +5797,10 @@ static void build_sched_domains(const cp #endif cpu_attach_domain(sd, i); } + /* + * Tune cache-hot values: + */ + calibrate_migration_costs(); } /* * Set up scheduler domains and groups. Callers must hold the hotplug lock. @@ -5143,6 +5909,7 @@ void __init sched_init_smp(void) unlock_cpu_hotplug(); /* XXX: Theoretical race here - CPU may be hotplugged now */ hotcpu_notifier(update_sched_domains, 0); + init_sched_domain_sysctl(); } #else void __init sched_init_smp(void)