--- linux/fs/proc/proc_misc.c.orig +++ linux/fs/proc/proc_misc.c @@ -321,7 +321,7 @@ static struct file_operations proc_slabi static int show_stat(struct seq_file *p, void *v) { int i; - unsigned long jif; + unsigned long jif, rt_avg; cputime64_t user, nice, system, idle, iowait, irq, softirq, steal; u64 sum = 0; @@ -330,6 +330,7 @@ static int show_stat(struct seq_file *p, jif = - wall_to_monotonic.tv_sec; if (wall_to_monotonic.tv_nsec) --jif; + rt_avg = 0; for_each_cpu(i) { int j; @@ -344,9 +345,10 @@ static int show_stat(struct seq_file *p, steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); for (j = 0 ; j < NR_IRQS ; j++) sum += kstat_cpu(i).irqs[j]; + rt_avg += rt_cpu_average(i); } - seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu\n", + seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %lu\n", (unsigned long long)cputime64_to_clock_t(user), (unsigned long long)cputime64_to_clock_t(nice), (unsigned long long)cputime64_to_clock_t(system), @@ -354,7 +356,8 @@ static int show_stat(struct seq_file *p, (unsigned long long)cputime64_to_clock_t(iowait), (unsigned long long)cputime64_to_clock_t(irq), (unsigned long long)cputime64_to_clock_t(softirq), - (unsigned long long)cputime64_to_clock_t(steal)); + (unsigned long long)cputime64_to_clock_t(steal), + rt_avg); for_each_online_cpu(i) { /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ @@ -366,7 +369,8 @@ static int show_stat(struct seq_file *p, irq = kstat_cpu(i).cpustat.irq; softirq = kstat_cpu(i).cpustat.softirq; steal = kstat_cpu(i).cpustat.steal; - seq_printf(p, "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu\n", + rt_avg = rt_cpu_average(i); + seq_printf(p, "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %lu\n", i, (unsigned long long)cputime64_to_clock_t(user), (unsigned long long)cputime64_to_clock_t(nice), @@ -375,7 +379,8 @@ static int show_stat(struct seq_file *p, (unsigned long long)cputime64_to_clock_t(iowait), (unsigned long long)cputime64_to_clock_t(irq), (unsigned long 
long)cputime64_to_clock_t(softirq), - (unsigned long long)cputime64_to_clock_t(steal)); + (unsigned long long)cputime64_to_clock_t(steal), + rt_avg); } seq_printf(p, "intr %llu", (unsigned long long)sum); --- linux/kernel/sched.c.orig +++ linux/kernel/sched.c @@ -223,6 +223,11 @@ struct runqueue { prio_array_t *active, *expired, arrays[2]; int best_expired_prio; atomic_t nr_iowait; + /* + * Short-term CPU usage history/load-average of all RT tasks + * on this CPU, linearly scaled to between 0...HZ: + */ + unsigned long rt_cpu_avg; #ifdef CONFIG_SMP struct sched_domain *sd; @@ -297,6 +302,37 @@ static DEFINE_PER_CPU(struct runqueue, r # define task_running(rq, p) ((rq)->curr == (p)) #endif +#if MAX_PRIO != 140 +# error update the BITMAP_SCHED_OTHER definitions! +#endif + +#if (BITS_PER_LONG == 32) +/* + * b[3] is bits 96...127, of which bits 4..31 are SCHED_OTHER: 28 bits + * b[4] is bits 128...140, of which bits 0..11 are SCHED_OTHER: 12 bits + */ +# define BITMAP_SCHED_OTHER(b) \ + ((b[3] & 0xfffffff0UL) || \ + (b[4] & 0x00000fffUL)) +#else +/* + * b[1] is bits 64..127, of which bits 36..63 are SCHED_OTHER: 28 bits + * b[2] is bits 128..140, of which bits 0..11 are SCHED_OTHER: 12 bits + */ +# define BITMAP_SCHED_OTHER(b) \ + ((b[1] & 0xfffffff000000000UL) || \ + (b[2] & 0x0000000000000fffUL)) +#endif + +/* + * non_rt_tasks() - are there any non-RT tasks running in the runqueue? + */ +static inline int non_rt_tasks(runqueue_t *rq) +{ + return BITMAP_SCHED_OTHER(rq->active->bitmap) || + BITMAP_SCHED_OTHER(rq->expired->bitmap); +} + /* * task_rq_lock - lock the runqueue a given task resides on and disable * interrupts. Note the ordering: we can safely lookup the task_rq without @@ -766,6 +802,52 @@ static void deactivate_task(struct task_ p->array = NULL; } +#define WEIGHT (HZ/10) + +/* + * Here we maintain the ->rt_cpu_avg load-average. + * + * It is implemented as a decaying average, where ->rt_cpu_avg + * moves in the range of 0 ... 
HZ, and where the 100% -> 5% decay + * time (which is HZ-independent) is ~300 msecs: + */ +static inline void rt_cpu_avg_inc(runqueue_t *rq) +{ + rq->rt_cpu_avg = (rq->rt_cpu_avg*(WEIGHT-1) + HZ)/WEIGHT; +} + +static inline void rt_cpu_avg_dec(runqueue_t *rq) +{ + rq->rt_cpu_avg = rq->rt_cpu_avg*(WEIGHT-1)/WEIGHT; +} + +#undef WEIGHT + +static int rt_task_over_cpu_limit(struct task_struct *p, runqueue_t *rq) +{ + unsigned long limit; + + /* + * An RT task needs delaying if there are any non-RT tasks + * running on this CPU, and if the current RT CPU usage value + * violates the task's RT CPU rlimit: + */ + if (!rt_task(p) || !non_rt_tasks(rq) || !p->signal) + return 0; + + limit = p->signal->rlim[RLIMIT_RT_CPU].rlim_cur; + if (!limit || (rq->rt_cpu_avg <= limit * HZ / 100)) + return 0; + + /* + * Delay the task by queueing it into the expired array: + */ + dequeue_task(p, p->array); + enqueue_task(p, rq->expired); + + return 1; +} + /* * resched_task - mark a task 'to be rescheduled now'. * @@ -1441,6 +1523,16 @@ unsigned long nr_iowait(void) return sum; } +/* + * rt_cpu_average - the total load from RT tasks on this CPU + * + * returns in units of 0.1%. (i.e. 
for 51.5% it returns 515) + */ +unsigned long rt_cpu_average(int cpu) +{ + return cpu_rq(cpu)->rt_cpu_avg * 1000 / HZ; +} + #ifdef CONFIG_SMP /* @@ -2412,6 +2504,7 @@ void scheduler_tick(void) rq->timestamp_last_tick = sched_clock(); if (p == rq->idle) { + rt_cpu_avg_dec(rq); // rq not locked - not a problem if (wake_priority_sleeper(rq)) goto out; rebalance_tick(cpu, rq, SCHED_IDLE); @@ -2433,6 +2526,17 @@ void scheduler_tick(void) */ if (rt_task(p)) { /* + * Maintain the CPU-average of RT tasks and if there's a + * RT-CPU rlimit for the current task and the limit has + * been violated then put the current task into the + * expired array: + */ + rt_cpu_avg_inc(rq); + if (rt_task_over_cpu_limit(p, rq)) { + set_tsk_need_resched(p); + goto out_unlock; + } + /* * RR tasks need a special form of timeslice management. * FIFO tasks have no timeslices. */ @@ -2446,6 +2550,7 @@ void scheduler_tick(void) } goto out_unlock; } + rt_cpu_avg_dec(rq); if (!--p->time_slice) { dequeue_task(p, rq->active); set_tsk_need_resched(p); @@ -2756,6 +2861,7 @@ go_idle: goto go_idle; } +pick_next_task: array = rq->active; if (unlikely(!array->nr_active)) { /* @@ -2774,6 +2880,13 @@ go_idle: queue = array->queue + idx; next = list_entry(queue->next, task_t, run_list); + /* + * If the RT task would violate the RT CPU use rlimit + * then delay the task and pick the next one: + */ + if (rt_task_over_cpu_limit(next, rq)) + goto pick_next_task; + if (!rt_task(next) && next->activated > 0) { unsigned long long delta = now - next->timestamp; @@ -3368,8 +3481,14 @@ recheck: if ((policy == SCHED_NORMAL) != (param->sched_priority == 0)) return -EINVAL; - if ((policy == SCHED_FIFO || policy == SCHED_RR) && - !capable(CAP_SYS_NICE)) + /* + * If the RT CPU limit has been set then it activates a + * mechanism that makes RT tasks deadlock-safe - thus + * ordinary users may specify RT priorities too: + */ + if (!p->signal->rlim[RLIMIT_RT_CPU].rlim_cur && + (policy == SCHED_FIFO || policy == SCHED_RR) && + 
!capable(CAP_SYS_NICE)) return -EPERM; if ((current->euid != p->euid) && (current->euid != p->uid) && !capable(CAP_SYS_NICE)) --- linux/kernel/sys.c.orig +++ linux/kernel/sys.c @@ -1493,6 +1493,16 @@ asmlinkage long sys_setrlimit(unsigned i return -EPERM; if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > NR_OPEN) return -EPERM; + /* + * Special security rule for RT_CPU: if a task sets its rlimit + * back to 0 then drop any RT priority it might have: + */ + if (resource == RLIMIT_RT_CPU && !capable(CAP_SYS_NICE) && + (!new_rlim.rlim_cur || !new_rlim.rlim_max)) { + struct sched_param param = { .sched_priority = 0 }; + + sched_setscheduler(current, SCHED_NORMAL, &param); + } retval = security_task_setrlimit(resource, &new_rlim); if (retval) --- linux/include/asm-generic/resource.h.orig +++ linux/include/asm-generic/resource.h @@ -21,7 +21,27 @@ #define RLIMIT_SIGPENDING 11 /* max number of pending signals */ #define RLIMIT_MSGQUEUE 12 /* maximum bytes in POSIX mqueues */ -#define RLIM_NLIMITS 13 +/* + * RLIMIT_RT_CPU - the maximum amount of CPU time an RT task + * may use, in percent. Defaults to 80%. + * + * - if there's idle time in the system then RT tasks will be + * allowed to use more than the limit. + * + * - if an RT task goes above the limit all the time then there + * is no guarantee that exactly the limit will be allowed for + * it. (i.e. you should set the limit to somewhat above the real + * needs of the RT task in question.) + * + * - a zero RLIMIT_RT_CPU value means unlimited CPU time to that + * RT task. + * + * - a nonzero RLIMIT_RT_CPU value also has the effect of allowing + * the use of RT priorities to nonprivileged users. 
+ */ +#define RLIMIT_RT_CPU 13 + +#define RLIM_NLIMITS 14 #endif /* @@ -53,6 +73,7 @@ [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY }, \ [RLIMIT_SIGPENDING] = { MAX_SIGPENDING, MAX_SIGPENDING }, \ [RLIMIT_MSGQUEUE] = { MQ_BYTES_MAX, MQ_BYTES_MAX }, \ + [RLIMIT_RT_CPU] = { 80, 100 }, \ } #endif /* __KERNEL__ */ --- linux/include/linux/sched.h.orig +++ linux/include/linux/sched.h @@ -95,6 +95,7 @@ extern int nr_processes(void); extern unsigned long nr_running(void); extern unsigned long nr_uninterruptible(void); extern unsigned long nr_iowait(void); +extern unsigned long rt_cpu_average(int cpu); #include #include