--- linux/fs/exec.c.orig Sun Jul 11 13:20:22 1999 +++ linux/fs/exec.c Sun Jul 11 17:36:47 1999 @@ -366,7 +366,7 @@ static int exec_mmap(void) { struct mm_struct * mm, * old_mm; - int retval, nr; + int retval; if (atomic_read(&current->mm->count) == 1) { flush_cache_mm(current->mm); @@ -386,10 +386,9 @@ mm->total_vm = 0; mm->rss = 0; /* - * Make sure we have a private ldt if needed ... + * Make sure we have a private LDT if needed ... */ - nr = current->tarray_ptr - &task[0]; - copy_segments(nr, current, mm); + copy_segments(current, mm); old_mm = current->mm; current->mm = mm; @@ -408,7 +407,7 @@ fail_restore: current->mm = old_mm; /* restore the ldt for this task */ - copy_segments(nr, current, NULL); + copy_segments(current, NULL); release_segments(mm); kmem_cache_free(mm_cachep, mm); --- linux/init/main.c.orig Sun Jul 11 13:20:22 1999 +++ linux/init/main.c Sun Jul 11 17:36:47 1999 @@ -75,7 +75,7 @@ extern void init_modules(void); extern long console_init(long, long); extern void sock_init(void); -extern void uidcache_init(void); +extern void fork_init(unsigned long); extern void mca_init(void); extern void sbus_init(void); extern void ppc_init(void); @@ -1187,7 +1187,7 @@ #ifdef CONFIG_PROC_FS proc_root_init(); #endif - uidcache_init(); + fork_init(memory_end-memory_start); filescache_init(); dcache_init(); vma_init(); --- linux/kernel/exit.c.orig Sun Jul 11 13:20:22 1999 +++ linux/kernel/exit.c Sun Jul 11 17:36:47 1999 @@ -43,13 +43,7 @@ } #endif free_uid(p); - nr_tasks--; - add_free_taskslot(p->tarray_ptr); - - write_lock_irq(&tasklist_lock); - unhash_pid(p); - REMOVE_LINKS(p); - write_unlock_irq(&tasklist_lock); + unhash_process(p); release_thread(p); current->cmin_flt += p->min_flt + p->cmin_flt; --- linux/kernel/sched.c.orig Sun Jul 11 13:20:22 1999 +++ linux/kernel/sched.c Sun Jul 11 17:36:47 1999 @@ -94,7 +94,23 @@ * via the SMP irq return path. 
*/ -struct task_struct * task[NR_TASKS] = {&init_task, }; +struct task_struct * init_tasks[NR_CPUS] = {&init_task, }; + +/* + * The tasklist_lock protects the linked list of processes. + * + * The scheduler lock is protecting against multiple entry + * into the scheduling code, and doesn't need to worry + * about interrupts (because interrupts cannot call the + * scheduler). + * + * The run-queue lock locks the parts that actually access + * and change the run-queues, and have to be interrupt-safe. + */ +spinlock_t runqueue_lock = SPIN_LOCK_UNLOCKED; /* second */ +rwlock_t tasklist_lock = RW_LOCK_UNLOCKED; /* third */ + +static LIST_HEAD(runqueue_head); /* * We align per-CPU scheduling data on cacheline boundaries, @@ -114,7 +130,7 @@ #ifdef __SMP__ -#define idle_task(cpu) (task[cpu_number_map[(cpu)]]) +#define idle_task(cpu) (init_tasks[cpu_number_map[(cpu)]]) #define can_schedule(p) (!(p)->has_cpu) #else @@ -366,74 +382,30 @@ */ static inline void add_to_runqueue(struct task_struct * p) { - struct task_struct *next = init_task.next_run; - - p->prev_run = &init_task; - init_task.next_run = p; - p->next_run = next; - next->prev_run = p; + list_add(&p->run_list, &runqueue_head); nr_running++; } -static inline void del_from_runqueue(struct task_struct * p) +inline void del_from_runqueue(struct task_struct * p) { - struct task_struct *next = p->next_run; - struct task_struct *prev = p->prev_run; - nr_running--; - next->prev_run = prev; - prev->next_run = next; - p->next_run = NULL; - p->prev_run = NULL; + list_del(&p->run_list); + set_task_nonrunning(p); } static inline void move_last_runqueue(struct task_struct * p) { - struct task_struct *next = p->next_run; - struct task_struct *prev = p->prev_run; - - /* remove from list */ - next->prev_run = prev; - prev->next_run = next; - /* add back to list */ - p->next_run = &init_task; - prev = init_task.prev_run; - init_task.prev_run = p; - p->prev_run = prev; - prev->next_run = p; + list_del(&p->run_list); + 
list_add_tail(&p->run_list, &runqueue_head); } static inline void move_first_runqueue(struct task_struct * p) { - struct task_struct *next = p->next_run; - struct task_struct *prev = p->prev_run; - - /* remove from list */ - next->prev_run = prev; - prev->next_run = next; - /* add back to list */ - p->prev_run = &init_task; - next = init_task.next_run; - init_task.next_run = p; - p->next_run = next; - next->prev_run = p; + list_del(&p->run_list); + list_add(&p->run_list, &runqueue_head); } /* - * The tasklist_lock protects the linked list of processes. - * - * The scheduler lock is protecting against multiple entry - * into the scheduling code, and doesn't need to worry - * about interrupts (because interrupts cannot call the - * scheduler). - * - * The run-queue lock locks the parts that actually access - * and change the run-queues, and have to be interrupt-safe. - */ -spinlock_t runqueue_lock = SPIN_LOCK_UNLOCKED; /* second */ -rwlock_t tasklist_lock = RW_LOCK_UNLOCKED; /* third */ - -/* * Wake up a process. Put it on the run-queue if it's not * already there. The "current" process is always on the * run-queue (except when the actual re-schedule is in @@ -450,7 +422,7 @@ */ spin_lock_irqsave(&runqueue_lock, flags); p->state = TASK_RUNNING; - if (p->next_run) + if (task_running(p)) goto out; add_to_runqueue(p); spin_unlock_irqrestore(&runqueue_lock, flags); @@ -687,6 +659,7 @@ { struct schedule_data * sched_data; struct task_struct *prev, *next, *p; + struct list_head *tmp; int this_cpu, c; if (tq_scheduler) @@ -731,42 +704,29 @@ } prev->need_resched = 0; -repeat_schedule: - /* * this is the scheduler proper: */ - p = init_task.next_run; - /* Default process to select.. */ +repeat_schedule: + /* + * Default process to select.. + */ next = idle_task(this_cpu); c = -1000; if (prev->state == TASK_RUNNING) goto still_running; still_running_back: - /* - * This is subtle. 
- * Note how we can enable interrupts here, even - * though interrupts can add processes to the run- - * queue. This is because any new processes will - * be added to the front of the queue, so "p" above - * is a safe starting point. - * run-queue deletion and re-ordering is protected by - * the scheduler lock - */ -/* - * Note! there may appear new tasks on the run-queue during this, as - * interrupts are enabled. However, they will be put on front of the - * list, so our list starting at "p" is essentially fixed. - */ - while (p != &init_task) { + tmp = runqueue_head.next; + while (tmp != &runqueue_head) { + p = list_entry(tmp, struct task_struct, run_list); if (can_schedule(p)) { int weight = goodness(prev, p, this_cpu); if (weight > c) c = weight, next = p; } - p = p->next_run; + tmp = tmp->next; } /* Do we need to re-calculate counters? */ @@ -837,8 +797,8 @@ p->counter = (p->counter >> 1) + p->priority; read_unlock(&tasklist_lock); spin_lock_irq(&runqueue_lock); - goto repeat_schedule; } + goto repeat_schedule; still_running: c = prev_goodness(prev, prev, this_cpu); @@ -1760,7 +1720,7 @@ retval = 0; p->policy = policy; p->rt_priority = lp.sched_priority; - if (p->next_run) + if (task_running(p)) move_first_runqueue(p); current->need_resched = 1; @@ -1934,13 +1894,13 @@ return 0; } -static void show_task(int nr,struct task_struct * p) +static void show_task(struct task_struct * p) { unsigned long free = 0; int state; static const char * stat_nam[] = { "R", "S", "D", "Z", "T", "W" }; - printk("%-8s %3d ", p->comm, (p == current) ? -nr : nr); + printk("%-8s ", p->comm); state = p->state ? 
ffz(~p->state) + 1 : 0; if (((unsigned) state) < sizeof(stat_nam)/sizeof(char *)) printk(stat_nam[state]); @@ -1950,12 +1910,12 @@ if (p == current) printk(" current "); else - printk(" %08lX ", thread_saved_pc(&p->tss)); + printk(" %08lX ", thread_saved_pc(&p->thread)); #else if (p == current) printk(" current task "); else - printk(" %016lx ", thread_saved_pc(&p->tss)); + printk(" %016lx ", thread_saved_pc(&p->thread)); #endif { unsigned long * n = (unsigned long *) (p+1); @@ -2020,7 +1980,7 @@ #endif read_lock(&tasklist_lock); for_each_task(p) - show_task((p->tarray_ptr - &task[0]),p); + show_task(p); read_unlock(&tasklist_lock); } @@ -2030,6 +1990,11 @@ struct schedule_data * sched_data; sched_data = &aligned_data[smp_processor_id()].schedule_data; + if (current != &init_task && task_running(current)) { + printk("UGH! (%d:%d) was on the runqueue, removing.\n", + smp_processor_id(), current->pid); + del_from_runqueue(current); + } t = get_cycles(); sched_data->curr = current; sched_data->last_schedule = t; @@ -2042,14 +2007,10 @@ * process right in SMP mode. */ int cpu=hard_smp_processor_id(); - int nr = NR_TASKS; + int nr; init_task.processor=cpu; - /* Init task array free list and pidhash table. */ - while(--nr > 0) - add_free_taskslot(&task[nr]); - for(nr = 0; nr < PIDHASH_SZ; nr++) pidhash[nr] = NULL; @@ -2057,3 +2018,4 @@ init_bh(TQUEUE_BH, tqueue_bh); init_bh(IMMEDIATE_BH, immediate_bh); } + --- linux/kernel/fork.c.orig Sun Jul 11 13:20:22 1999 +++ linux/kernel/fork.c Sun Jul 11 17:36:47 1999 @@ -22,11 +22,12 @@ #include #include -/* The idle tasks do not count.. */ -int nr_tasks=0; +/* The idle threads do not count.. */ +int nr_threads=0; int nr_running=0; -unsigned long int total_forks=0; /* Handle normal Linux uptimes. */ +int max_threads; +unsigned long total_forks = 0; /* Handle normal Linux uptimes. */ int last_pid=0; /* SLAB cache for mm_struct's. 
*/ @@ -37,9 +38,6 @@ struct task_struct *pidhash[PIDHASH_SZ]; -struct task_struct **tarray_freelist = NULL; -spinlock_t taskslot_lock = SPIN_LOCK_UNLOCKED; - /* UID task count cache, to prevent walking entire process list every * single fork() operation. */ @@ -159,7 +157,7 @@ return 0; } -void __init uidcache_init(void) +void __init fork_init(unsigned long memsize) { int i; @@ -171,15 +169,16 @@ for(i = 0; i < UIDHASH_SZ; i++) uidhash[i] = 0; -} -static inline struct task_struct ** find_empty_process(void) -{ - struct task_struct **tslot = NULL; + /* + * The default maximum number of threads is set to a safe + * value: the thread structures can take up at most half + * of memory. + */ + max_threads = memsize / THREAD_SIZE / 2; - if ((nr_tasks < NR_TASKS - MIN_TASKS_LEFT_FOR_ROOT) || !current->uid) - tslot = get_free_taskslot(); - return tslot; + init_task.rlim[RLIMIT_NPROC].rlim_cur = max_threads/2; + init_task.rlim[RLIMIT_NPROC].rlim_max = max_threads/2; } /* Protects next_safe and last_pid. */ @@ -358,7 +357,7 @@ } } -static inline int copy_mm(int nr, unsigned long clone_flags, struct task_struct * tsk) +static inline int copy_mm(unsigned long clone_flags, struct task_struct * tsk) { struct mm_struct * mm; int retval; @@ -370,9 +369,10 @@ if (clone_flags & CLONE_VM) { mmget(current->mm); /* - * Set up the LDT descriptor for the clone task. + * No need to worry about the LDT descriptor for the + * cloned task, LDTs get magically loaded at + * __switch_to time if necessary. 
*/ - copy_segments(nr, tsk, NULL); SET_PAGE_DIR(tsk, current->mm->pgd); return 0; } @@ -383,7 +383,11 @@ goto fail_nomem; tsk->mm = mm; - copy_segments(nr, tsk, mm); + /* + * child gets a private LDT (if there was an LDT in the parent) + */ + copy_segments(tsk, mm); + retval = new_page_tables(tsk); if (retval) goto free_mm; @@ -542,7 +546,6 @@ */ int do_fork(unsigned long clone_flags, unsigned long usp, struct pt_regs *regs) { - int nr; int retval = -ENOMEM; struct task_struct *p; DECLARE_MUTEX_LOCKED(sem); @@ -565,15 +568,12 @@ atomic_inc(&p->user->count); } - { - struct task_struct **tslot; - tslot = find_empty_process(); - if (!tslot) - goto bad_fork_cleanup_count; - p->tarray_ptr = tslot; - *tslot = p; - nr = tslot - &task[0]; - } + /* + * Counter atomicity is protected by + * the kernel lock + */ + if (nr_threads >= max_threads) + goto bad_fork_cleanup_count; if (p->exec_domain && p->exec_domain->module) __MOD_INC_USE_COUNT(p->exec_domain->module); @@ -594,8 +594,7 @@ * very end). */ p->state = TASK_RUNNING; - p->next_run = p; - p->prev_run = p; + set_task_nonrunning(p); p->p_pptr = p->p_opptr = current; p->p_cptr = NULL; @@ -638,9 +637,9 @@ goto bad_fork_cleanup_files; if (copy_sighand(clone_flags, p)) goto bad_fork_cleanup_fs; - if (copy_mm(nr, clone_flags, p)) + if (copy_mm(clone_flags, p)) goto bad_fork_cleanup_sighand; - retval = copy_thread(nr, clone_flags, usp, p, regs); + retval = copy_thread(0, clone_flags, usp, p, regs); if (retval) goto bad_fork_cleanup_sighand; p->semundo = NULL; @@ -666,19 +665,15 @@ * Let it rip! 
*/ retval = p->pid; - if (retval) { - write_lock_irq(&tasklist_lock); - SET_LINKS(p); - hash_pid(p); - write_unlock_irq(&tasklist_lock); - - nr_tasks++; - - p->next_run = NULL; - p->prev_run = NULL; - wake_up_process(p); /* do this last */ - } + write_lock_irq(&tasklist_lock); + SET_LINKS(p); + hash_pid(p); + write_unlock_irq(&tasklist_lock); + + nr_threads++; + wake_up_process(p); /* do this last */ ++total_forks; + bad_fork: unlock_kernel(); up(&current->mm->mmap_sem); @@ -699,7 +694,7 @@ if (p->binfmt && p->binfmt->module) __MOD_DEC_USE_COUNT(p->binfmt->module); - add_free_taskslot(p->tarray_ptr); + nr_threads--; bad_fork_cleanup_count: if (p->user) free_uid(p); --- linux/kernel/panic.c.orig Tue Dec 29 16:37:08 1998 +++ linux/kernel/panic.c Sun Jul 11 17:36:47 1999 @@ -40,7 +40,7 @@ vsprintf(buf, fmt, args); va_end(args); printk(KERN_EMERG "Kernel panic: %s\n",buf); - if (current == task[0]) + if (current == init_tasks[0]) printk(KERN_EMERG "In swapper task - not syncing\n"); else if (in_interrupt()) printk(KERN_EMERG "In interrupt handler - not syncing\n"); --- linux/kernel/sysctl.c.orig Sun Jul 11 13:20:22 1999 +++ linux/kernel/sysctl.c Sun Jul 11 17:36:47 1999 @@ -34,6 +34,7 @@ extern int console_loglevel, C_A_D; extern int bdf_prm[], bdflush_min[], bdflush_max[]; extern int sysctl_overcommit_memory; +extern int max_threads; extern int nr_queued_signals, max_queued_signals; #ifdef CONFIG_KMOD @@ -207,6 +208,8 @@ {KERN_SHMMAX, "shmmax", &shmmax, sizeof (int), 0644, NULL, &proc_dointvec}, #endif + {KERN_MAX_THREADS, "threads-max", &max_threads, sizeof(int), + 0644, NULL, &proc_dointvec}, {0} }; --- linux/kernel/info.c.orig Tue Dec 1 12:34:28 1998 +++ linux/kernel/info.c Sun Jul 11 17:36:47 1999 @@ -26,7 +26,7 @@ val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); val.loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); - val.procs = nr_tasks-1; + val.procs = nr_threads-1; sti(); si_meminfo(&val); --- linux/mm/vmscan.c.orig Sun Jul 11 13:20:22 1999 +++ 
linux/mm/vmscan.c Sun Jul 11 17:36:47 1999 @@ -338,11 +338,11 @@ * Think of swap_cnt as a "shadow rss" - it tells us which process * we want to page out (always try largest first). */ - counter = nr_tasks / (priority+1); + counter = nr_threads / (priority+1); if (counter < 1) counter = 1; - if (counter > nr_tasks) - counter = nr_tasks; + if (counter > nr_threads) + counter = nr_threads; for (; counter >= 0; counter--) { assign = 0; --- linux/include/linux/sched.h.orig Sun Jul 11 13:20:22 1999 +++ linux/include/linux/sched.h Sun Jul 11 17:36:47 1999 @@ -7,7 +7,7 @@ #include #include -#include +#include #include #include #include @@ -63,7 +63,7 @@ #define CT_TO_SECS(x) ((x) / HZ) #define CT_TO_USECS(x) (((x) % HZ) * 1000000/HZ) -extern int nr_running, nr_tasks; +extern int nr_running, nr_threads; extern int last_pid; #include @@ -119,6 +119,7 @@ extern void sched_init(void); extern void init_idle(void); extern void show_state(void); +extern void cpu_init (void); extern void trap_init(void); #define MAX_SCHEDULE_TIMEOUT LONG_MAX @@ -243,7 +244,7 @@ int last_processor; int lock_depth; /* Lock depth. We can context switch in and out of holding a syscall kernel lock... */ struct task_struct *next_task, *prev_task; - struct task_struct *next_run, *prev_run; + struct list_head run_list; /* task state */ struct linux_binfmt *binfmt; @@ -270,9 +271,6 @@ struct task_struct *pidhash_next; struct task_struct **pidhash_pprev; - /* Pointer to task[] array linkage. 
*/ - struct task_struct **tarray_ptr; - wait_queue_head_t wait_chldexit; /* for wait4() */ struct semaphore *vfork_sem; /* for vfork() */ unsigned long policy, rt_priority; @@ -302,8 +300,8 @@ /* ipc stuff */ struct sem_undo *semundo; struct sem_queue *semsleeping; -/* tss for this task */ - struct thread_struct tss; +/* CPU-specific state of this task */ + struct soft_thread_struct thread; /* filesystem information */ struct fs_struct *fs; /* open file information */ @@ -355,13 +353,12 @@ /* state etc */ { 0,0,0,KERNEL_DS,&default_exec_domain,0, \ /* counter */ DEF_PRIORITY,DEF_PRIORITY,0, \ /* SMP */ 0,0,0,-1, \ -/* schedlink */ &init_task,&init_task, &init_task, &init_task, \ +/* schedlink */ &init_task,&init_task, LIST_HEAD_INIT(init_task.run_list), \ /* binfmt */ NULL, \ /* ec,brk... */ 0,0,0,0,0,0, \ /* pid etc.. */ 0,0,0,0,0, \ /* proc links*/ &init_task,&init_task,NULL,NULL,NULL, \ /* pidhash */ NULL, NULL, \ -/* tarray */ &task[0], \ /* chld wait */ __WAIT_QUEUE_HEAD_INITIALIZER(name.wait_chldexit), NULL, \ /* timeout */ SCHED_OTHER,0,0,0,0,0,0,0, \ /* timer */ { NULL, NULL, 0, 0, it_real_fn }, \ @@ -379,7 +376,7 @@ /* comm */ "swapper", \ /* fs info */ 0,NULL, \ /* ipc */ NULL, NULL, \ -/* tss */ INIT_TSS, \ +/* thread */ INIT_THREAD, \ /* fs */ &init_fs, \ /* files */ &init_files, \ /* mm */ &init_mm, \ @@ -398,33 +395,10 @@ extern union task_union init_task_union; extern struct mm_struct init_mm; -extern struct task_struct *task[NR_TASKS]; - -extern struct task_struct **tarray_freelist; -extern spinlock_t taskslot_lock; - -extern __inline__ void add_free_taskslot(struct task_struct **t) -{ - spin_lock(&taskslot_lock); - *t = (struct task_struct *) tarray_freelist; - tarray_freelist = t; - spin_unlock(&taskslot_lock); -} +extern struct task_struct *init_tasks[NR_CPUS]; -extern __inline__ struct task_struct **get_free_taskslot(void) -{ - struct task_struct **tslot; - - spin_lock(&taskslot_lock); - if((tslot = tarray_freelist) != NULL) - tarray_freelist = 
(struct task_struct **) *tslot; - spin_unlock(&taskslot_lock); - - return tslot; -} - -/* PID hashing. */ -#define PIDHASH_SZ (NR_TASKS >> 2) +/* PID hashing. (shouldnt this be dynamic?) */ +#define PIDHASH_SZ (4096 >> 2) extern struct task_struct *pidhash[PIDHASH_SZ]; #define pid_hashfn(x) ((((x) >> 8) ^ (x)) & (PIDHASH_SZ - 1)) @@ -740,6 +714,30 @@ #define for_each_task(p) \ for (p = &init_task ; (p = p->next_task) != &init_task ; ) + + +extern inline void del_from_runqueue(struct task_struct * p); + +extern inline void set_task_nonrunning (struct task_struct *p) +{ + p->run_list.next = 0; +} + +extern inline int task_running (struct task_struct *p) +{ + return (int)p->run_list.next; +} + +extern inline void unhash_process (struct task_struct *p) +{ + nr_threads--; + write_lock_irq(&tasklist_lock); + unhash_pid(p); + REMOVE_LINKS(p); + write_unlock_irq(&tasklist_lock); + if (task_running(p)) + del_from_runqueue(p); +} #endif /* __KERNEL__ */ --- linux/include/linux/tasks.h.orig Mon Jan 11 05:08:03 1999 +++ linux/include/linux/tasks.h Sun Jul 11 17:36:47 1999 @@ -1,25 +0,0 @@ -#ifndef _LINUX_TASKS_H -#define _LINUX_TASKS_H - -/* - * This is the maximum nr of tasks - change it if you need to - */ - -#ifdef __SMP__ -#define NR_CPUS 32 /* Max processors that can be running in SMP */ -#else -#define NR_CPUS 1 -#endif - -#define NR_TASKS 512 /* On x86 Max 4092, or 4090 w/APM configured. 
*/ - -#define MAX_TASKS_PER_USER (NR_TASKS/2) -#define MIN_TASKS_LEFT_FOR_ROOT 4 - - -/* - * This controls the maximum pid allocated to a process - */ -#define PID_MAX 0x8000 - -#endif --- linux/include/linux/sysctl.h.orig Sun Jul 11 13:20:20 1999 +++ linux/include/linux/sysctl.h Sun Jul 11 17:36:47 1999 @@ -100,7 +100,8 @@ KERN_SHMMAX=34, /* int: Maximum shared memory segment */ KERN_MSGMAX=35, /* int: Maximum size of a messege */ KERN_MSGMNB=36, /* int: Maximum message queue size */ - KERN_MSGPOOL=37 /* int: Maximum system message pool size */ + KERN_MSGPOOL=37, /* int: Maximum system message pool size */ + KERN_MAX_THREADS=38 /* int: Maximum nr of threads in the system */ }; --- linux/include/linux/kernel_stat.h.orig Mon Jan 11 05:08:04 1999 +++ linux/include/linux/kernel_stat.h Sun Jul 11 17:36:47 1999 @@ -3,7 +3,7 @@ #include #include -#include +#include /* * 'kernel_stat.h' contains the definitions needed for doing --- linux/include/linux/threads.h.orig Sun Jul 11 13:20:38 1999 +++ linux/include/linux/threads.h Sun Jul 11 17:36:47 1999 @@ -0,0 +1,22 @@ +#ifndef _LINUX_THREADS_H +#define _LINUX_THREADS_H + +/* + * The default limit for the nr of threads is now in + * /proc/sys/kernel/threads-max. 
+ */ + +#ifdef __SMP__ +#define NR_CPUS 32 /* Max processors that can be running in SMP */ +#else +#define NR_CPUS 1 +#endif + +#define MIN_THREADS_LEFT_FOR_ROOT 4 + +/* + * This controls the maximum pid allocated to a process + */ +#define PID_MAX 0x8000 + +#endif --- linux/include/linux/list.h.orig Mon Dec 1 20:16:57 1997 +++ linux/include/linux/list.h Sun Jul 11 17:36:47 1999 @@ -17,8 +17,10 @@ struct list_head *next, *prev; }; +#define LIST_HEAD_INIT(name) { &(name), &(name) } + #define LIST_HEAD(name) \ - struct list_head name = { &name, &name } + struct list_head name = LIST_HEAD_INIT(name) #define INIT_LIST_HEAD(ptr) do { \ (ptr)->next = (ptr); (ptr)->prev = (ptr); \ @@ -46,6 +48,15 @@ static __inline__ void list_add(struct list_head *new, struct list_head *head) { __list_add(new, head, head->next); +} + +/* + * Insert a new entry before the specified head.. + */ +static __inline__ void list_add_tail(struct list_head *new, + struct list_head *head) +{ + __list_add(new, head, head->prev); } /* --- linux/include/asm-i386/pgtable.h.orig Sun Jul 11 13:20:20 1999 +++ linux/include/asm-i386/pgtable.h Sun Jul 11 17:36:47 1999 @@ -15,7 +15,7 @@ #ifndef __ASSEMBLY__ #include #include -#include +#include /* Caches aren't brain-dead on the intel. 
*/ #define flush_cache_all() do { } while (0) @@ -306,7 +306,7 @@ #define SET_PAGE_DIR(tsk,pgdir) \ do { \ unsigned long __pgdir = __pa(pgdir); \ - (tsk)->tss.cr3 = __pgdir; \ + (tsk)->thread.cr3 = __pgdir; \ if ((tsk) == current) \ __asm__ __volatile__("movl %0,%%cr3": :"r" (__pgdir)); \ } while (0) @@ -481,9 +481,9 @@ extern void __bad_pte(pmd_t *pmd); extern void __bad_pte_kernel(pmd_t *pmd); -#define pte_free_kernel(pte) free_pte_fast(pte) -#define pte_free(pte) free_pte_fast(pte) -#define pgd_free(pgd) free_pgd_fast(pgd) +#define pte_free_kernel(pte) free_pte_slow(pte) +#define pte_free(pte) free_pte_slow(pte) +#define pgd_free(pgd) free_pgd_slow(pgd) #define pgd_alloc() get_pgd_fast() extern inline pte_t * pte_alloc_kernel(pmd_t * pmd, unsigned long address) --- linux/include/asm-i386/processor.h.orig Thu Jun 17 01:01:24 1999 +++ linux/include/asm-i386/processor.h Sun Jul 11 17:36:47 1999 @@ -12,6 +12,7 @@ #include #include #include +#include /* * Default implementation of macro that returns current @@ -95,6 +96,7 @@ #define X86_FEATURE_AMD3D 0x80000000 extern struct cpuinfo_x86 boot_cpu_data; +extern struct hard_thread_struct init_tss[NR_CPUS]; #ifdef __SMP__ extern struct cpuinfo_x86 cpu_data[]; @@ -124,6 +126,48 @@ : "cc"); } + +/* + * Intel CPU features in CR4 + */ +#define X86_CR4_VME 0x0001 /* enable vm86 extensions */ +#define X86_CR4_PVI 0x0002 /* virtual interrupts flag enable */ +#define X86_CR4_TSD 0x0004 /* disable time stamp at ipl 3 */ +#define X86_CR4_DE 0x0008 /* enable debugging extensions */ +#define X86_CR4_PSE 0x0010 /* enable page size extensions */ +#define X86_CR4_PAE 0x0020 /* enable physical address extensions */ +#define X86_CR4_MCE 0x0040 /* Machine check enable */ +#define X86_CR4_PGE 0x0080 /* enable global pages */ +#define X86_CR4_PCE 0x0100 /* enable performance counters at ipl 3 */ + +/* + * Save the cr4 feature set we're using (ie + * Pentium 4MB enable and PPro Global page + * enable), so that any CPU's that boot up + * 
after us can get the correct flags. + */ +extern unsigned long mmu_cr4_features; + +static inline void set_in_cr4 (unsigned long mask) +{ + mmu_cr4_features |= mask; + __asm__("movl %%cr4,%%eax\n\t" + "orl %0,%%eax\n\t" + "movl %%eax,%%cr4\n" + : : "irg" (mask) + :"ax"); +} + +static inline void clear_in_cr4 (unsigned long mask) +{ + mmu_cr4_features &= ~mask; + __asm__("movl %%cr4,%%eax\n\t" + "andl %0,%%eax\n\t" + "movl %%eax,%%cr4\n" + : : "irg" (~mask) + :"ax"); +} + /* * Cyrix CPU configuration register indexes */ @@ -177,6 +221,8 @@ * Size of io_bitmap in longwords: 32 is ports 0-0x3ff. */ #define IO_BITMAP_SIZE 32 +#define IO_BITMAP_OFFSET offsetof(struct hard_thread_struct,io_bitmap) +#define INVALID_IO_BITMAP_OFFSET 0x8000 struct i387_hard_struct { long cwd; @@ -213,7 +259,7 @@ unsigned long seg; } mm_segment_t; -struct thread_struct { +struct hard_thread_struct { unsigned short back_link,__blh; unsigned long esp0; unsigned short ss0,__ss0h; @@ -238,19 +284,44 @@ unsigned short ldt, __ldth; unsigned short trace, bitmap; unsigned long io_bitmap[IO_BITMAP_SIZE+1]; - unsigned long tr; + /* + * pads the TSS to be cacheline-aligned (size is 0x100) + */ + unsigned long __cacheline_filler[5]; +}; + +struct soft_thread_struct { + unsigned long esp0; + unsigned long cr3; + unsigned long eip; + unsigned long esp; + unsigned long fs; + unsigned long gs; +/* Hardware debugging registers */ + unsigned long debugreg[8]; /* %%db0-7 debug registers */ +/* fault info */ unsigned long cr2, trap_no, error_code; - mm_segment_t segment; -/* debug registers */ - long debugreg[8]; /* Hardware debugging registers */ /* floating point info */ - union i387_union i387; + union i387_union i387; /* virtual 86 mode info */ - struct vm86_struct * vm86_info; - unsigned long screen_bitmap; - unsigned long v86flags, v86mask, v86mode, saved_esp0; + struct vm86_struct * vm86_info; + unsigned long screen_bitmap; + unsigned long v86flags, v86mask, v86mode, saved_esp0; +/* IO permissions */ + 
int ioperm; + unsigned long io_bitmap[IO_BITMAP_SIZE+1]; }; +#define INIT_THREAD { \ + 0,(long) &swapper_pg_dir - PAGE_OFFSET, \ + 0, 0, 0, 0, \ + { [0 ... 7] = 0 }, /* debugging registers */ \ + 0, 0, 0, \ + { { 0, }, }, /* 387 state */ \ + 0,0,0,0,0,0, \ + 0,{~0,} /* io permissions */ \ +} + #define INIT_MMAP \ { &init_mm, 0, 0, NULL, PAGE_SHARED, VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL } @@ -265,13 +336,9 @@ 0,0,0,0, /* esp,ebp,esi,edi */ \ 0,0,0,0,0,0, /* es,cs,ss */ \ 0,0,0,0,0,0, /* ds,fs,gs */ \ - _LDT(0),0, /* ldt */ \ - 0, 0x8000, /* tace, bitmap */ \ - {~0, }, /* ioperm */ \ - _TSS(0), 0, 0, 0, (mm_segment_t) { 0 }, /* obsolete */ \ - { 0, }, \ - { { 0, }, }, /* 387 state */ \ - NULL, 0, 0, 0, 0, 0, /* vm86_info */ \ + __LDT(0),0, /* ldt */ \ + 0, INVALID_IO_BITMAP_OFFSET, /* tace, bitmap */ \ + {~0, } /* ioperm */ \ } #define start_thread(regs, new_eip, new_esp) do { \ @@ -291,10 +358,13 @@ /* Free all resources held by a thread. */ extern void release_thread(struct task_struct *); +/* + * create a kernel thread without removing it from tasklists + */ extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags); /* Copy and release all segment info associated with a VM */ -extern void copy_segments(int nr, struct task_struct *p, struct mm_struct * mm); +extern void copy_segments(struct task_struct *p, struct mm_struct * mm); extern void release_segments(struct mm_struct * mm); extern void forget_segments(void); @@ -302,7 +372,7 @@ * FPU lazy state save handling.. */ #define save_fpu(tsk) do { \ - asm volatile("fnsave %0\n\tfwait":"=m" (tsk->tss.i387)); \ + asm volatile("fnsave %0\n\tfwait":"=m" (tsk->thread.i387)); \ tsk->flags &= ~PF_USEDFPU; \ stts(); \ } while (0) @@ -322,11 +392,12 @@ /* * Return saved PC of a blocked thread. 
*/ -extern inline unsigned long thread_saved_pc(struct thread_struct *t) +extern inline unsigned long thread_saved_pc(struct soft_thread_struct *t) { return ((unsigned long *)t->esp)[3]; } +#define THREAD_SIZE (2*PAGE_SIZE) extern struct task_struct * alloc_task_struct(void); extern void free_task_struct(struct task_struct *); --- linux/include/asm-i386/system.h.orig Thu Jun 17 01:01:24 1999 +++ linux/include/asm-i386/system.h Sun Jul 11 17:36:48 1999 @@ -22,9 +22,9 @@ "popl %%ebp\n\t" \ "popl %%edi\n\t" \ "popl %%esi\n\t" \ - :"=m" (prev->tss.esp),"=m" (prev->tss.eip), \ + :"=m" (prev->thread.esp),"=m" (prev->thread.eip), \ "=b" (last) \ - :"m" (next->tss.esp),"m" (next->tss.eip), \ + :"m" (next->thread.esp),"m" (next->thread.eip), \ "a" (prev), "d" (next), \ "b" (prev)); \ } while (0) --- linux/include/asm-i386/desc.h.orig Fri Oct 30 00:17:40 1998 +++ linux/include/asm-i386/desc.h Sun Jul 11 17:36:48 1999 @@ -1,6 +1,46 @@ #ifndef __ARCH_DESC_H #define __ARCH_DESC_H +#include + +/* + * The layout of the GDT under Linux: + * + * 0 - null + * 1 - not used + * 2 - kernel code segment + * 3 - kernel data segment + * 4 - user code segment <-- new cacheline + * 5 - user data segment + * 6 - not used + * 7 - not used + * 8 - APM BIOS support <-- new cacheline + * 9 - APM BIOS support + * 10 - APM BIOS support + * 11 - APM BIOS support + * + * The TSS+LDT descriptors are spread out a bit so that every CPU + * has an exclusive cacheline for the per-CPU TSS and LDT: + * + * 12 - CPU#0 TSS <-- new cacheline + * 13 - CPU#0 LDT + * 14 - not used + * 15 - not used + * 16 - CPU#1 TSS <-- new cacheline + * 17 - CPU#1 LDT + * 18 - not used + * 19 - not used + * ... NR_CPUS per-CPU TSS+LDT's if on SMP + * + * Entry into gdt where to find first TSS. 
+ */ +#define __FIRST_TSS_ENTRY 12 +#define __FIRST_LDT_ENTRY (__FIRST_TSS_ENTRY+1) + +#define __TSS(n) (((n)<<2) + __FIRST_TSS_ENTRY) +#define __LDT(n) (((n)<<2) + __FIRST_LDT_ENTRY) + +#ifndef __ASSEMBLY__ struct desc_struct { unsigned long a,b; }; @@ -16,46 +56,33 @@ #define idt_descr (*(struct Xgt_desc_struct *)((char *)&idt - 2)) #define gdt_descr (*(struct Xgt_desc_struct *)((char *)&gdt - 2)) +#define load_TR(n) __asm__ __volatile__("ltr %%ax"::"a" (__TSS(n)<<3)) + +#define __load_LDT(n) __asm__ __volatile__("lldt %%ax"::"a" (__LDT(n)<<3)) + /* - * Entry into gdt where to find first TSS. GDT layout: - * 0 - null - * 1 - not used - * 2 - kernel code segment - * 3 - kernel data segment - * 4 - user code segment - * 5 - user data segment - * 6 - not used - * 7 - not used - * 8 - APM BIOS support - * 9 - APM BIOS support - * 10 - APM BIOS support - * 11 - APM BIOS support - * 12 - TSS #0 - * 13 - LDT #0 - * 14 - TSS #1 - * 15 - LDT #1 + * This is the ldt that every process will get unless we need + * something other than this. */ -#define FIRST_TSS_ENTRY 12 -#define FIRST_LDT_ENTRY (FIRST_TSS_ENTRY+1) -#define _TSS(n) ((((unsigned long) n)<<4)+(FIRST_TSS_ENTRY<<3)) -#define _LDT(n) ((((unsigned long) n)<<4)+(FIRST_LDT_ENTRY<<3)) -#define load_TR(n) __asm__ __volatile__("ltr %%ax": /* no output */ :"a" (_TSS(n))) -#define load_ldt(n) __asm__ __volatile__("lldt %%ax": /* no output */ :"a" (_LDT(n))) -#define store_TR(n) \ -__asm__("str %%ax\n\t" \ - "subl %2,%%eax\n\t" \ - "shrl $4,%%eax" \ - :"=a" (n) \ - :"0" (0),"i" (FIRST_TSS_ENTRY<<3)) - +extern struct desc_struct default_ldt; extern void set_intr_gate(unsigned int irq, void * addr); extern void set_ldt_desc(unsigned int n, void *addr, unsigned int size); extern void set_tss_desc(unsigned int n, void *addr); /* - * This is the ldt that every process will get unless we need - * something other than this. 
+ * load one particular LDT into the current CPU */ -extern struct desc_struct default_ldt; +extern inline void load_LDT (struct mm_struct *mm) +{ + int cpu = smp_processor_id(); + + if (mm->segments) + set_ldt_desc(cpu, mm->segments, LDT_ENTRIES); + else + set_ldt_desc(cpu, &default_ldt, 1); + __load_LDT(cpu); +} + +#endif /* !__ASSEMBLY__ */ #endif --- linux/include/asm-i386/mmu_context.h.orig Wed May 6 06:33:33 1998 +++ linux/include/asm-i386/mmu_context.h Sun Jul 11 17:36:48 1999 @@ -1,13 +1,19 @@ #ifndef __I386_MMU_CONTEXT_H #define __I386_MMU_CONTEXT_H +#include + /* - * get a new mmu context.. x86's don't know about contexts. + * get a new mmu context.. x86's don't know much about contexts, + * but we have to reload the new LDT in exec(). */ -#define get_mmu_context(x) do { } while (0) +#define get_mmu_context(tsk) do { } while(0) #define init_new_context(mm) do { } while(0) +/* + * possibly do the LDT unload here? + */ #define destroy_context(mm) do { } while(0) -#define activate_context(tsk) do { } while(0) +#define activate_context(x) load_LDT((x)->mm) #endif --- linux/include/asm-i386/ldt.h.orig Wed Dec 11 15:41:23 1996 +++ linux/include/asm-i386/ldt.h Sun Jul 11 17:36:48 1999 @@ -11,6 +11,7 @@ /* The size of each LDT entry. 
*/ #define LDT_ENTRY_SIZE 8 +#ifndef __ASSEMBLY__ struct modify_ldt_ldt_s { unsigned int entry_number; unsigned long base_addr; @@ -27,4 +28,5 @@ #define MODIFY_LDT_CONTENTS_STACK 1 #define MODIFY_LDT_CONTENTS_CODE 2 +#endif /* !__ASSEMBLY__ */ #endif --- linux/include/asm-i386/hardirq.h.orig Mon Jan 11 05:08:03 1999 +++ linux/include/asm-i386/hardirq.h Sun Jul 11 17:36:48 1999 @@ -1,7 +1,7 @@ #ifndef __ASM_HARDIRQ_H #define __ASM_HARDIRQ_H -#include +#include extern unsigned int local_irq_count[NR_CPUS]; --- linux/include/asm-i386/smp.h.orig Tue Jan 26 10:47:16 1999 +++ linux/include/asm-i386/smp.h Sun Jul 11 17:36:48 1999 @@ -16,7 +16,7 @@ #ifdef __SMP__ #ifndef ASSEMBLY -#include +#include #include /* --- linux/include/asm-i386/resource.h.orig Tue Nov 24 12:41:28 1998 +++ linux/include/asm-i386/resource.h Sun Jul 11 17:36:48 1999 @@ -28,7 +28,7 @@ { _STK_LIM, LONG_MAX }, \ { 0, LONG_MAX }, \ { LONG_MAX, LONG_MAX }, \ - { MAX_TASKS_PER_USER, MAX_TASKS_PER_USER }, \ + { 0, 0 }, \ { NR_OPEN, NR_OPEN }, \ { LONG_MAX, LONG_MAX }, \ { LONG_MAX, LONG_MAX }, \ --- linux/arch/i386/mm/init.c.orig Sun Jul 11 13:20:19 1999 +++ linux/arch/i386/mm/init.c Sun Jul 11 17:36:48 1999 @@ -181,34 +181,6 @@ extern char _text, _etext, _edata, __bss_start, _end; extern char __init_begin, __init_end; -#define X86_CR4_VME 0x0001 /* enable vm86 extensions */ -#define X86_CR4_PVI 0x0002 /* virtual interrupts flag enable */ -#define X86_CR4_TSD 0x0004 /* disable time stamp at ipl 3 */ -#define X86_CR4_DE 0x0008 /* enable debugging extensions */ -#define X86_CR4_PSE 0x0010 /* enable page size extensions */ -#define X86_CR4_PAE 0x0020 /* enable physical address extensions */ -#define X86_CR4_MCE 0x0040 /* Machine check enable */ -#define X86_CR4_PGE 0x0080 /* enable global pages */ -#define X86_CR4_PCE 0x0100 /* enable performance counters at ipl 3 */ - -/* - * Save the cr4 feature set we're using (ie - * Pentium 4MB enable and PPro Global page - * enable), so that any CPU's that boot up - * 
after us can get the correct flags. - */ -unsigned long mmu_cr4_features __initdata = 0; - -static inline void set_in_cr4(unsigned long mask) -{ - mmu_cr4_features |= mask; - __asm__("movl %%cr4,%%eax\n\t" - "orl %0,%%eax\n\t" - "movl %%eax,%%cr4\n" - : : "irg" (mask) - :"ax"); -} - /* * allocate page table(s) for compile-time fixed mappings */ --- linux/arch/i386/mm/fault.c.orig Sun Jul 11 13:20:20 1999 +++ linux/arch/i386/mm/fault.c Sun Jul 11 17:36:48 1999 @@ -177,7 +177,7 @@ if (regs->eflags & VM_MASK) { unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT; if (bit < 32) - tsk->tss.screen_bitmap |= 1 << bit; + tsk->thread.screen_bitmap |= 1 << bit; } up(&mm->mmap_sem); return; @@ -191,9 +191,9 @@ /* User mode accesses just cause a SIGSEGV */ if (error_code & 4) { - tsk->tss.cr2 = address; - tsk->tss.error_code = error_code; - tsk->tss.trap_no = 14; + tsk->thread.cr2 = address; + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 14; force_sig(SIGSEGV, tsk); return; } @@ -243,9 +243,11 @@ else printk(KERN_ALERT "Unable to handle kernel paging request"); printk(" at virtual address %08lx\n",address); + printk(" printing eip:\n"); + printk("%08lx\n", regs->eip); __asm__("movl %%cr3,%0" : "=r" (page)); - printk(KERN_ALERT "current->tss.cr3 = %08lx, %%cr3 = %08lx\n", - tsk->tss.cr3, page); + printk(KERN_ALERT "current->thread.cr3 = %08lx, %%cr3 = %08lx\n", + tsk->thread.cr3, page); page = ((unsigned long *) __va(page))[address >> 22]; printk(KERN_ALERT "*pde = %08lx\n", page); if (page & 1) { @@ -275,9 +277,9 @@ * Send a sigbus, regardless of whether we were in kernel * or user mode. */ - tsk->tss.cr2 = address; - tsk->tss.error_code = error_code; - tsk->tss.trap_no = 14; + tsk->thread.cr2 = address; + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 14; force_sig(SIGBUS, tsk); /* Kernel mode? 
Handle exceptions or die */ --- linux/arch/i386/kernel/ldt.c.orig Fri Oct 30 00:17:39 1998 +++ linux/arch/i386/kernel/ldt.c Sun Jul 11 17:36:48 1999 @@ -2,6 +2,7 @@ * linux/kernel/ldt.c * * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds + * Copyright (C) 1998 Ingo Molnar */ #include @@ -19,17 +20,27 @@ static int read_ldt(void * ptr, unsigned long bytecount) { - void * address = current->mm->segments; + struct mm_struct * mm = current->mm; + int err; unsigned long size; - if (!ptr) - return -EINVAL; - if (!address) - return 0; + down(&mm->mmap_sem); + err = 0; + if (!mm->segments) + goto out; + size = LDT_ENTRIES*LDT_ENTRY_SIZE; if (size > bytecount) size = bytecount; - return copy_to_user(ptr, address, size) ? -EFAULT : size; + + err = copy_to_user(ptr, mm->segments, size); + if (err) + err = -EFAULT; + else + err = size; +out: + up(&current->mm->mmap_sem); + return err; } static int write_ldt(void * ptr, unsigned long bytecount, int oldmode) @@ -64,31 +75,29 @@ * you get strange behaviour (the kernel is safe, it's just user * space strangeness). * - * For no good reason except historical, the GDT index of the LDT - * is chosen to follow the index number in the task[] array. + * we have two choices: either we preallocate the LDT descriptor + * and can do a shared modify_ldt(), or we postallocate it and do + * an smp message pass to update it. Currently we are a bit + * un-nice to user-space and reload the LDT only on the next + * schedule. (only an issue on SMP) + * + * the GDT index of the LDT is allocated dynamically, and is + * limited by MAX_LDT_DESCRIPTORS.
*/ + down(&mm->mmap_sem); if (!mm->segments) { - void * ldt; + error = -ENOMEM; - ldt = vmalloc(LDT_ENTRIES*LDT_ENTRY_SIZE); - if (!ldt) - goto out; - memset(ldt, 0, LDT_ENTRIES*LDT_ENTRY_SIZE); + mm->segments = vmalloc(LDT_ENTRIES*LDT_ENTRY_SIZE); + if (!mm->segments) + goto out_unlock; + + if (atomic_read(&mm->count) > 1) + printk(KERN_WARNING "LDT allocated for cloned task!\n"); /* - * Make sure someone else hasn't allocated it for us ... + * Possibly do an SMP cross-call to other CPUs to reload + * their LDTs */ - if (!mm->segments) { - int i = current->tarray_ptr - &task[0]; - mm->segments = ldt; - set_ldt_desc(i, ldt, LDT_ENTRIES); - current->tss.ldt = _LDT(i); - load_ldt(i); - if (atomic_read(&mm->count) > 1) - printk(KERN_WARNING - "LDT allocated for cloned task!\n"); - } else { - vfree(ldt); - } } lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->segments); @@ -127,6 +136,9 @@ *lp = entry_1; *(lp+1) = entry_2; error = 0; + +out_unlock: + up(&mm->mmap_sem); out: return error; } @@ -135,7 +147,6 @@ { int ret = -ENOSYS; - lock_kernel(); switch (func) { case 0: ret = read_ldt(ptr, bytecount); @@ -147,6 +158,5 @@ ret = write_ldt(ptr, bytecount, 0); break; } - unlock_kernel(); return ret; } --- linux/arch/i386/kernel/head.S.orig Tue Jan 26 10:47:09 1999 +++ linux/arch/i386/kernel/head.S Sun Jul 11 17:36:48 1999 @@ -8,11 +8,12 @@ */ .text -#include +#include #include #include #include #include +#include #define CL_MAGIC_ADDR 0x90020 @@ -330,7 +331,7 @@ * of tasks we can have.. */ #define IDT_ENTRIES 256 -#define GDT_ENTRIES (12+2*NR_TASKS) +#define GDT_ENTRIES (__TSS(NR_CPUS)) .globl SYMBOL_NAME(idt) @@ -519,8 +520,7 @@ ALIGN /* - * This contains up to 8192 quadwords depending on NR_TASKS - 64kB of - * gdt entries. Ugh. + * This contains typically 140 quadwords, depending on NR_CPUS. * * NOTE! Make sure the gdt descriptor in head.S matches this if you * change anything. 
@@ -542,7 +542,7 @@ .quad 0x00409a0000000000 /* 0x48 APM CS code */ .quad 0x00009a0000000000 /* 0x50 APM CS 16 code (16 bit) */ .quad 0x0040920000000000 /* 0x58 APM DS data */ - .fill 2*NR_TASKS,8,0 /* space for LDT's and TSS's etc */ + .fill NR_CPUS*4,8,0 /* space for TSS's and LDT's */ /* * This is to aid debugging, the various locking macros will be putting --- linux/arch/i386/kernel/process.c.orig Sun Jul 11 13:20:20 1999 +++ linux/arch/i386/kernel/process.c Sun Jul 11 17:36:48 1999 @@ -405,6 +405,7 @@ regs->esi, regs->edi, regs->ebp); printk(" DS: %04x ES: %04x\n", 0xffff & regs->xds,0xffff & regs->xes); + __asm__("movl %%cr0, %0": "=r" (cr0)); __asm__("movl %%cr2, %0": "=r" (cr2)); __asm__("movl %%cr3, %0": "=r" (cr3)); @@ -475,11 +476,28 @@ free_pages((unsigned long) p, 1); } +/* + * No need to lock the MM as we are the last user + */ void release_segments(struct mm_struct *mm) { - if (mm->segments) { - void * ldt = mm->segments; + void * ldt = mm->segments; + + /* + * free the LDT + */ + if (ldt) { mm->segments = NULL; + /* + * special case, when we release the LDT from under + * the running CPU. Other CPUs cannot possibly use + * this LDT as we were getting here through mmput() ... + */ + if (mm == current->mm) + load_LDT(mm); + /* + * Nobody anymore uses the LDT, we can free it: + */ vfree(ldt); } } @@ -492,10 +510,9 @@ : "r" (0)); /* - * Get the LDT entry from init_task. + * Load the LDT entry of init_task. */ - current->tss.ldt = _LDT(0); - load_ldt(0); + load_LDT(init_task.mm); } /* @@ -537,12 +554,9 @@ void flush_thread(void) { - int i; struct task_struct *tsk = current; - for (i=0 ; i<8 ; i++) - tsk->tss.debugreg[i] = 0; - + memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8); /* * Forget coprocessor state.. */ @@ -552,33 +566,50 @@ void release_thread(struct task_struct *dead_task) { + void * ldt = dead_task->mm->segments; + + // temporary debugging check + if (ldt) { + printk("WARNING: dead process %8s still has LDT? 
<%p>\n", + dead_task->comm, ldt); + BUG(); + } } /* - * If new_mm is NULL, we're being called to set up the LDT descriptor - * for a clone task. Each clone must have a separate entry in the GDT. + * If new_mm is NULL, we're being called to set up the LDT for + * a clone task: this is easy since the clone is not running yet. + * otherwise we copy the old segment into a new segment. + * + * we do not have to muck with descriptors here, that is + * done in __switch_to() and get_mmu_context(). */ -void copy_segments(int nr, struct task_struct *p, struct mm_struct *new_mm) +void copy_segments(struct task_struct *p, struct mm_struct *new_mm) { struct mm_struct * old_mm = current->mm; void * old_ldt = old_mm->segments, * ldt = old_ldt; - /* default LDT - use the one from init_task */ - p->tss.ldt = _LDT(0); - if (old_ldt) { - if (new_mm) { - ldt = vmalloc(LDT_ENTRIES*LDT_ENTRY_SIZE); - new_mm->segments = ldt; - if (!ldt) { - printk(KERN_WARNING "ldt allocation failed\n"); - return; - } - memcpy(ldt, old_ldt, LDT_ENTRIES*LDT_ENTRY_SIZE); - } - p->tss.ldt = _LDT(nr); - set_ldt_desc(nr, ldt, LDT_ENTRIES); + if (!old_mm->segments) { + /* + * default LDT - use the one from init_task + */ + if (new_mm) + new_mm->segments = NULL; return; } + + if (new_mm) { + /* + * Completely new LDT, we initialize it from the parent: + */ + ldt = vmalloc(LDT_ENTRIES*LDT_ENTRY_SIZE); + if (!ldt) + printk(KERN_WARNING "ldt allocation failed\n"); + else + memcpy(ldt, old_ldt, LDT_ENTRIES*LDT_ENTRY_SIZE); + new_mm->segments = ldt; + } + return; } /* @@ -592,31 +623,21 @@ { struct pt_regs * childregs; - childregs = ((struct pt_regs *) (2*PAGE_SIZE + (unsigned long) p)) - 1; + childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p)) - 1; *childregs = *regs; childregs->eax = 0; childregs->esp = esp; - p->tss.esp = (unsigned long) childregs; - p->tss.esp0 = (unsigned long) (childregs+1); - p->tss.ss0 = __KERNEL_DS; - - p->tss.tr = _TSS(nr); - set_tss_desc(nr,&(p->tss)); - p->tss.eip = 
(unsigned long) ret_from_fork; + p->thread.esp = (unsigned long) childregs; + p->thread.esp0 = (unsigned long) (childregs+1); - savesegment(fs,p->tss.fs); - savesegment(gs,p->tss.gs); + p->thread.eip = (unsigned long) ret_from_fork; - /* - * a bitmap offset pointing outside of the TSS limit causes a nicely - * controllable SIGSEGV. The first sys_ioperm() call sets up the - * bitmap properly. - */ - p->tss.bitmap = sizeof(struct thread_struct); + savesegment(fs,p->thread.fs); + savesegment(gs,p->thread.gs); unlazy_fpu(current); - p->tss.i387 = current->tss.i387; + p->thread.i387 = current->thread.i387; return 0; } @@ -632,7 +653,7 @@ fpvalid = tsk->used_math; if (fpvalid) { unlazy_fpu(tsk); - memcpy(fpu,&tsk->tss.i387.hard,sizeof(*fpu)); + memcpy(fpu,&tsk->thread.i387.hard,sizeof(*fpu)); } return fpvalid; @@ -654,7 +675,7 @@ dump->u_dsize -= dump->u_tsize; dump->u_ssize = 0; for (i = 0; i < 8; i++) - dump->u_debugreg[i] = current->tss.debugreg[i]; + dump->u_debugreg[i] = current->thread.debugreg[i]; if (dump->start_stack < TASK_SIZE) dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT; @@ -683,11 +704,10 @@ /* * This special macro can be used to load a debugging register */ -#define loaddebug(tsk,register) \ +#define loaddebug(thread,register) \ __asm__("movl %0,%%db" #register \ : /* no output */ \ - :"r" (tsk->tss.debugreg[register])) - + :"r" (thread->debugreg[register])) /* * switch_to(x,yn) should switch tasks from x to y. @@ -712,60 +732,80 @@ * More important, however, is the fact that this allows us much * more flexibility. */ -void __switch_to(struct task_struct *prev, struct task_struct *next) +extern int cpus_initialized; +void __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { - /* Do the FPU save and set TS if it wasn't set before.. 
*/ - unlazy_fpu(prev); + struct soft_thread_struct *prev = &prev_p->thread, + *next = &next_p->thread; + struct hard_thread_struct *tss = init_tss + smp_processor_id(); + + unlazy_fpu(prev_p); /* - * Reload TR, LDT and the page table pointers.. - * - * We need TR for the IO permission bitmask (and - * the vm86 bitmasks in case we ever use enhanced - * v86 mode properly). - * - * We may want to get rid of the TR register some - * day, and copy the bitmaps around by hand. Oh, - * well. In the meantime we have to clear the busy - * bit in the TSS entry, ugh. + * Reload esp0, LDT and the page table pointer: */ - gdt_table[next->tss.tr >> 3].b &= 0xfffffdff; - asm volatile("ltr %0": :"g" (*(unsigned short *)&next->tss.tr)); + tss->esp0 = next->esp0; /* * Save away %fs and %gs. No need to save %es and %ds, as * those are always kernel segments while inside the kernel. */ - asm volatile("movl %%fs,%0":"=m" (*(int *)&prev->tss.fs)); - asm volatile("movl %%gs,%0":"=m" (*(int *)&prev->tss.gs)); + asm volatile("movl %%fs,%0":"=m" (*(int *)&prev->fs)); + asm volatile("movl %%gs,%0":"=m" (*(int *)&prev->gs)); /* Re-load LDT if necessary */ - if (next->mm->segments != prev->mm->segments) - asm volatile("lldt %0": :"g" (*(unsigned short *)&next->tss.ldt)); + if (prev_p->mm->segments != next_p->mm->segments) + load_LDT(next_p->mm); /* Re-load page tables */ { - unsigned long new_cr3 = next->tss.cr3; - if (new_cr3 != prev->tss.cr3) + unsigned long new_cr3 = next->cr3; + + tss->cr3 = new_cr3; + if (new_cr3 != prev->cr3) asm volatile("movl %0,%%cr3": :"r" (new_cr3)); } /* * Restore %fs and %gs. 
*/ - loadsegment(fs,next->tss.fs); - loadsegment(gs,next->tss.gs); + loadsegment(fs, next->fs); + loadsegment(gs, next->gs); /* * Now maybe reload the debug registers */ - if (next->tss.debugreg[7]){ - loaddebug(next,0); - loaddebug(next,1); - loaddebug(next,2); - loaddebug(next,3); - loaddebug(next,6); - loaddebug(next,7); + if (next->debugreg[7]){ + loaddebug(next, 0); + loaddebug(next, 1); + loaddebug(next, 2); + loaddebug(next, 3); + /* no 4 and 5 */ + loaddebug(next, 6); + loaddebug(next, 7); + } + + if (prev->ioperm || next->ioperm) { + if (next->ioperm) { + /* + * 4 cachelines copy ... not good, but not that + * bad either. Anyone got something better? + * This only affects processes which use ioperm(). + * [Putting the TSSs into 4k-tlb mapped regions + * and playing VM tricks to switch the IO bitmap + * is not really acceptable.] + */ + memcpy(tss->io_bitmap, next->io_bitmap, + IO_BITMAP_SIZE*sizeof(unsigned long)); + tss->bitmap = IO_BITMAP_OFFSET; + } else + /* + * a bitmap offset pointing outside of the TSS limit + * causes a nicely controllable SIGSEGV if a process + * tries to use a port IO instruction. The first + * sys_ioperm() call sets up the bitmap properly. 
+ */ + tss->bitmap = INVALID_IO_BITMAP_OFFSET; } } --- linux/arch/i386/kernel/smp.c.orig Sun Jul 11 13:20:20 1999 +++ linux/arch/i386/kernel/smp.c Sun Jul 11 17:36:48 1999 @@ -104,7 +104,7 @@ unsigned long cpu_present_map = 0; /* Bitmask of physically existing CPUs */ unsigned long cpu_online_map = 0; /* Bitmask of currently online CPUs */ -int smp_num_cpus = 1; /* Total count of live CPUs */ +int smp_num_cpus = 0; /* Total count of live CPUs */ int smp_threads_ready=0; /* Set when the idlers are all forked */ volatile int cpu_number_map[NR_CPUS]; /* which CPU maps to which logical number */ volatile int __cpu_logical_map[NR_CPUS]; /* which logical number maps to which CPU */ @@ -225,6 +225,7 @@ return n; } + /* * Read the MPC */ @@ -637,6 +638,8 @@ #endif } + + /* * Trampoline 80x86 program as an array. */ @@ -882,6 +885,7 @@ * booting is too fragile that we want to limit the * things done here to the most necessary things. */ + cpu_init(); smp_callin(); while (!atomic_read(&smp_commenced)) /* nothing */ ; @@ -896,15 +900,6 @@ */ void __init initialize_secondary(void) { - struct thread_struct * p = &current->tss; - - /* - * Load up the LDT and the task register. - */ - asm volatile("lldt %%ax": :"a" (p->ldt)); - asm volatile("ltr %%ax": :"a" (p->tr)); - stts(); - /* * We don't actually need to load the full TSS, * basically just the stack pointer and the eip.
@@ -914,7 +909,7 @@ "movl %0,%%esp\n\t" "jmp *%1" : - :"r" (p->esp),"r" (p->eip)); + :"r" (current->thread.esp),"r" (current->thread.eip)); } extern struct { @@ -937,7 +932,13 @@ kernel_thread(start_secondary, NULL, CLONE_PID); cpucount++; - idle = task[cpucount]; + /* + * We remove it from the pidhash and the runqueue + * once we got the process: + */ + idle = init_task.prev_task; + + init_tasks[cpucount] = idle; if (!idle) panic("No idle process for CPU %d", i); @@ -945,7 +946,13 @@ __cpu_logical_map[cpucount] = i; cpu_number_map[i] = cpucount; idle->has_cpu = 1; /* we schedule the first task manually */ - idle->tss.eip = (unsigned long) start_secondary; + idle->thread.eip = (unsigned long) start_secondary; + + /* + * Remove the idle thread from the tasklist, pidhash and runqueue, + * it's not needed there (just slows things down). + */ + unhash_process(idle); /* start_eip had better be page-aligned! */ start_eip = setup_trampoline(); @@ -1179,7 +1186,6 @@ /* Must be done before other processors booted */ mtrr_init_boot_cpu (); #endif - init_idle(); /* * Initialize the logical to physical CPU number mapping * and the per-CPU profiling counter/multiplier @@ -1210,6 +1216,8 @@ cpu_number_map[boot_cpu_id] = 0; + init_idle(); + /* * If we couldnt find an SMP configuration at boot time, * get out of here now! @@ -1356,30 +1364,32 @@ */ SMP_PRINTK(("Before bogomips.\n")); - if (cpucount==0) - { + if (!cpucount) { printk(KERN_ERR "Error: only one processor found.\n"); cpu_online_map = (1< #include #include +#include static struct vm_area_struct init_mmap = INIT_MMAP; static struct fs_struct init_fs = INIT_FS; @@ -22,4 +23,14 @@ union task_union init_task_union __attribute__((__section__(".data.init_task"))) = { INIT_TASK(init_task_union.task) }; - + +/* + * per-CPU TSS segments. Threads are completely 'soft' on Linux, + * no more per-task TSS's. The TSS size is kept cacheline-aligned + * so they are allowed to end up in the .data.cacheline_aligned + * section. 
Since TSS's are completely CPU-local, we want them + * on exact cacheline boundaries, to eliminate cacheline ping-pong. + */ +struct hard_thread_struct init_tss[NR_CPUS] __cacheline_aligned = + { [0 ... NR_CPUS-1] = INIT_TSS }; + --- linux/arch/i386/kernel/ioport.c.orig Thu Jan 14 14:38:55 1999 +++ linux/arch/i386/kernel/ioport.c Sun Jul 11 17:36:48 1999 @@ -54,7 +54,8 @@ */ asmlinkage int sys_ioperm(unsigned long from, unsigned long num, int turn_on) { - struct thread_struct * t = &current->tss; + struct soft_thread_struct * t = &current->thread; + struct hard_thread_struct * tss = init_tss + smp_processor_id(); if ((from + num <= from) || (from + num > IO_BITMAP_SIZE*32)) return -EINVAL; @@ -65,14 +66,24 @@ * IO bitmap up. ioperm() is much less timing critical than clone(), * this is why we delay this operation until now: */ -#define IO_BITMAP_OFFSET offsetof(struct thread_struct,io_bitmap) - - if (t->bitmap != IO_BITMAP_OFFSET) { - t->bitmap = IO_BITMAP_OFFSET; + if (!t->ioperm) { + /* + * just in case ... + */ memset(t->io_bitmap,0xff,(IO_BITMAP_SIZE+1)*4); + t->ioperm = 1; + /* + * this activates it in the TSS + */ + tss->bitmap = IO_BITMAP_OFFSET; } - - set_bitmap((unsigned long *)t->io_bitmap, from, num, !turn_on); + + /* + * do it in the per-thread copy and in the TSS ...
+ */ + set_bitmap(t->io_bitmap, from, num, !turn_on); + set_bitmap(tss->io_bitmap, from, num, !turn_on); + return 0; } --- linux/arch/i386/kernel/ptrace.c.orig Sun Jul 11 13:20:20 1999 +++ linux/arch/i386/kernel/ptrace.c Sun Jul 11 17:36:48 1999 @@ -45,7 +45,7 @@ { unsigned char *stack; - stack = (unsigned char *)task->tss.esp0; + stack = (unsigned char *)task->thread.esp0; stack += offset; return (*((int *)stack)); } @@ -61,7 +61,7 @@ { unsigned char * stack; - stack = (unsigned char *) task->tss.esp0; + stack = (unsigned char *) task->thread.esp0; stack += offset; *(unsigned long *) stack = data; return 0; @@ -76,12 +76,12 @@ case FS: if (value && (value & 3) != 3) return -EIO; - child->tss.fs = value; + child->thread.fs = value; return 0; case GS: if (value && (value & 3) != 3) return -EIO; - child->tss.gs = value; + child->thread.gs = value; return 0; case DS: case ES: @@ -112,10 +112,10 @@ switch (regno >> 2) { case FS: - retval = child->tss.fs; + retval = child->thread.fs; break; case GS: - retval = child->tss.gs; + retval = child->thread.gs; break; case DS: case ES: @@ -229,7 +229,7 @@ addr <= (long) &dummy->u_debugreg[7]){ addr -= (long) &dummy->u_debugreg[0]; addr = addr >> 2; - tmp = child->tss.debugreg[addr]; + tmp = child->thread.debugreg[addr]; }; ret = put_user(tmp,(unsigned long *) data); goto out; @@ -278,7 +278,7 @@ addr -= (long) &dummy->u_debugreg; addr = addr >> 2; - child->tss.debugreg[addr] = data; + child->thread.debugreg[addr] = data; ret = 0; goto out; }; @@ -409,18 +409,18 @@ ret = 0; if ( !child->used_math ) { /* Simulate an empty FPU. 
*/ - child->tss.i387.hard.cwd = 0xffff037f; - child->tss.i387.hard.swd = 0xffff0000; - child->tss.i387.hard.twd = 0xffffffff; + child->thread.i387.hard.cwd = 0xffff037f; + child->thread.i387.hard.swd = 0xffff0000; + child->thread.i387.hard.twd = 0xffffffff; } #ifdef CONFIG_MATH_EMULATION if ( boot_cpu_data.hard_math ) { #endif - __copy_to_user((void *)data, &child->tss.i387.hard, + __copy_to_user((void *)data, &child->thread.i387.hard, sizeof(struct user_i387_struct)); #ifdef CONFIG_MATH_EMULATION } else { - save_i387_soft(&child->tss.i387.soft, + save_i387_soft(&child->thread.i387.soft, (struct _fpstate *)data); } #endif @@ -438,11 +438,11 @@ #ifdef CONFIG_MATH_EMULATION if ( boot_cpu_data.hard_math ) { #endif - __copy_from_user(&child->tss.i387.hard, (void *)data, + __copy_from_user(&child->thread.i387.hard, (void *)data, sizeof(struct user_i387_struct)); #ifdef CONFIG_MATH_EMULATION } else { - restore_i387_soft(&child->tss.i387.soft, + restore_i387_soft(&child->thread.i387.soft, (struct _fpstate *)data); } #endif --- linux/arch/i386/kernel/signal.c.orig Thu Jun 17 01:01:39 1999 +++ linux/arch/i386/kernel/signal.c Sun Jul 11 17:36:48 1999 @@ -155,7 +155,7 @@ { struct task_struct *tsk = current; clear_fpu(tsk); - return __copy_from_user(&tsk->tss.i387.hard, buf, sizeof(*buf)); + return __copy_from_user(&tsk->thread.i387.hard, buf, sizeof(*buf)); } static inline int restore_i387(struct _fpstate *buf) @@ -167,7 +167,7 @@ if (boot_cpu_data.hard_math) err = restore_i387_hard(buf); else - err = restore_i387_soft(&current->tss.i387.soft, buf); + err = restore_i387_soft(&current->thread.i387.soft, buf); #endif current->used_math = 1; return err; @@ -308,8 +308,8 @@ struct task_struct *tsk = current; unlazy_fpu(tsk); - tsk->tss.i387.hard.status = tsk->tss.i387.hard.swd; - if (__copy_to_user(buf, &tsk->tss.i387.hard, sizeof(*buf))) + tsk->thread.i387.hard.status = tsk->thread.i387.hard.swd; + if (__copy_to_user(buf, &tsk->thread.i387.hard, sizeof(*buf))) return -1; return 1; } @@
-328,7 +328,7 @@ return save_i387_hard(buf); #else return boot_cpu_data.hard_math ? save_i387_hard(buf) - : save_i387_soft(&current->tss.i387.soft, buf); + : save_i387_soft(&current->thread.i387.soft, buf); #endif } @@ -354,8 +354,8 @@ err |= __put_user(regs->edx, &sc->edx); err |= __put_user(regs->ecx, &sc->ecx); err |= __put_user(regs->eax, &sc->eax); - err |= __put_user(current->tss.trap_no, &sc->trapno); - err |= __put_user(current->tss.error_code, &sc->err); + err |= __put_user(current->thread.trap_no, &sc->trapno); + err |= __put_user(current->thread.error_code, &sc->err); err |= __put_user(regs->eip, &sc->eip); err |= __put_user(regs->xcs, (unsigned int *)&sc->cs); err |= __put_user(regs->eflags, &sc->eflags); @@ -370,7 +370,7 @@ /* non-iBCS2 extensions.. */ err |= __put_user(mask, &sc->oldmask); - err |= __put_user(current->tss.cr2, &sc->cr2); + err |= __put_user(current->thread.cr2, &sc->cr2); return err; } --- linux/arch/i386/kernel/traps.c.orig Tue Mar 9 11:03:39 1999 +++ linux/arch/i386/kernel/traps.c Sun Jul 11 17:36:48 1999 @@ -65,8 +65,8 @@ #define DO_ERROR(trapnr, signr, str, name, tsk) \ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ { \ - tsk->tss.error_code = error_code; \ - tsk->tss.trap_no = trapnr; \ + tsk->thread.error_code = error_code; \ + tsk->thread.trap_no = trapnr; \ force_sig(signr, tsk); \ die_if_no_fixup(str,regs,error_code); \ } @@ -80,8 +80,8 @@ goto out; \ /* else fall through */ \ } \ - tsk->tss.error_code = error_code; \ - tsk->tss.trap_no = trapnr; \ + tsk->thread.error_code = error_code; \ + tsk->thread.trap_no = trapnr; \ force_sig(signr, tsk); \ die_if_kernel(str,regs,error_code); \ out: \ @@ -143,10 +143,8 @@ regs->esi, regs->edi, regs->ebp, esp); printk("ds: %04x es: %04x ss: %04x\n", regs->xds & 0xffff, regs->xes & 0xffff, ss); - store_TR(i); - printk("Process %s (pid: %d, process nr: %d, stackpage=%08lx)", - current->comm, current->pid, 0xffff & i, 4096+(unsigned long)current); - + printk("Process %s (pid: %d,
stackpage=%08lx)", + current->comm, current->pid, 4096+(unsigned long)current); /* * When in-kernel, we also print out the stack and code at the * time of the fault.. @@ -249,8 +247,8 @@ return; } die_if_kernel("cache flush denied",regs,error_code); - current->tss.error_code = error_code; - current->tss.trap_no = 19; + current->thread.error_code = error_code; + current->thread.trap_no = 19; force_sig(SIGSEGV, current); } @@ -262,8 +260,8 @@ if (!(regs->xcs & 3)) goto gp_in_kernel; - current->tss.error_code = error_code; - current->tss.trap_no = 13; + current->thread.error_code = error_code; + current->thread.trap_no = 13; force_sig(SIGSEGV, current); return; @@ -374,9 +372,9 @@ goto clear_TF; } - /* Mast out spurious debug traps due to lazy DR7 setting */ + /* Mask out spurious debug traps due to lazy DR7 setting */ if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { - if (!tsk->tss.debugreg[7]) + if (!tsk->thread.debugreg[7]) goto clear_dr7; } @@ -385,8 +383,8 @@ goto clear_dr7; /* Ok, finally something we can handle */ - tsk->tss.trap_no = 1; - tsk->tss.error_code = error_code; + tsk->thread.trap_no = 1; + tsk->thread.error_code = error_code; force_sig(SIGTRAP, tsk); return; @@ -422,8 +420,8 @@ */ task = current; save_fpu(task); - task->tss.trap_no = 16; - task->tss.error_code = 0; + task->thread.trap_no = 16; + task->thread.error_code = 0; force_sig(SIGFPE, task); } @@ -453,7 +451,7 @@ { __asm__ __volatile__("clts"); /* Allow maths ops (or we recurse) */ if(current->used_math) - __asm__("frstor %0": :"m" (current->tss.i387)); + __asm__("frstor %0": :"m" (current->thread.i387)); else { /* @@ -486,6 +484,7 @@ pmd_t * pmd; pte_t * pte; +return; /* * Allocate a new page in virtual address space, * move the IDT into it and write protect this page. 
@@ -570,12 +569,12 @@ void set_tss_desc(unsigned int n, void *addr) { - _set_tssldt_desc(gdt_table+FIRST_TSS_ENTRY+(n<<1), (int)addr, 235, 0x89); + _set_tssldt_desc(gdt_table+__TSS(n), (int)addr, 235, 0x89); } void set_ldt_desc(unsigned int n, void *addr, unsigned int size) { - _set_tssldt_desc(gdt_table+FIRST_LDT_ENTRY+(n<<1), (int)addr, ((size << 3) - 1), 0x82); + _set_tssldt_desc(gdt_table+__LDT(n), (int)addr, ((size << 3)-1), 0x82); } #ifdef CONFIG_X86_VISWS_APIC @@ -672,7 +671,7 @@ { if (readl(0x0FFFD9) == 'E' + ('I'<<8) + ('S'<<16) + ('A'<<24)) EISA_bus = 1; - set_call_gate(&default_ldt,lcall7); + set_trap_gate(0,÷_error); set_trap_gate(1,&debug); set_trap_gate(2,&nmi); @@ -693,14 +692,20 @@ set_trap_gate(17,&alignment_check); set_system_gate(SYSCALL_VECTOR,&system_call); - /* set up GDT task & ldt entries */ - set_tss_desc(0, &init_task.tss); - set_ldt_desc(0, &default_ldt, 1); - - /* Clear NT, so that we won't have troubles with that later on */ - __asm__("pushfl ; andl $0xffffbfff,(%esp) ; popfl"); - load_TR(0); - load_ldt(0); + /* + * default LDT is a single-entry callgate to lcall7 + */ + set_call_gate(&default_ldt,lcall7); + + /* + * on SMP we do not yet know which CPU is on which TSS, + * so we delay this until smp_init(). 
(the CPU is already + * in a reasonable state, otherwise we wouldnt have gotten so far :) + */ +#ifndef __SMP__ + cpu_init(); +#endif + #ifdef CONFIG_X86_VISWS_APIC superio_init(); lithium_init(); --- linux/arch/i386/kernel/vm86.c.orig Tue Dec 29 16:36:58 1998 +++ linux/arch/i386/kernel/vm86.c Sun Jul 11 17:36:48 1999 @@ -48,8 +48,8 @@ /* * virtual flags (16 and 32-bit versions) */ -#define VFLAGS (*(unsigned short *)&(current->tss.v86flags)) -#define VEFLAGS (current->tss.v86flags) +#define VFLAGS (*(unsigned short *)&(current->thread.v86flags)) +#define VEFLAGS (current->thread.v86flags) #define set_flags(X,new,mask) \ ((X) = ((X) & ~(mask)) | ((new) & (mask))) @@ -65,25 +65,27 @@ asmlinkage struct pt_regs * FASTCALL(save_v86_state(struct kernel_vm86_regs * regs)); struct pt_regs * save_v86_state(struct kernel_vm86_regs * regs) { + struct hard_thread_struct *tss; struct pt_regs *ret; unsigned long tmp; lock_kernel(); - if (!current->tss.vm86_info) { + if (!current->thread.vm86_info) { printk("no vm86_info: BAD\n"); do_exit(SIGSEGV); } - set_flags(regs->eflags, VEFLAGS, VIF_MASK | current->tss.v86mask); - tmp = copy_to_user(¤t->tss.vm86_info->regs,regs, VM86_REGS_SIZE1); - tmp += copy_to_user(¤t->tss.vm86_info->regs.VM86_REGS_PART2, + set_flags(regs->eflags, VEFLAGS, VIF_MASK | current->thread.v86mask); + tmp = copy_to_user(¤t->thread.vm86_info->regs,regs, VM86_REGS_SIZE1); + tmp += copy_to_user(¤t->thread.vm86_info->regs.VM86_REGS_PART2, ®s->VM86_REGS_PART2, VM86_REGS_SIZE2); - tmp += put_user(current->tss.screen_bitmap,¤t->tss.vm86_info->screen_bitmap); + tmp += put_user(current->thread.screen_bitmap,¤t->thread.vm86_info->screen_bitmap); if (tmp) { printk("vm86: could not access userspace vm86_info\n"); do_exit(SIGSEGV); } - current->tss.esp0 = current->tss.saved_esp0; - current->tss.saved_esp0 = 0; + tss = init_tss + smp_processor_id(); + tss->esp0 = current->thread.esp0 = current->thread.saved_esp0; + current->thread.saved_esp0 = 0; ret = KVM86->regs32; 
unlock_kernel(); return ret; @@ -138,7 +140,7 @@ lock_kernel(); tsk = current; - if (tsk->tss.saved_esp0) + if (tsk->thread.saved_esp0) goto out; tmp = copy_from_user(&info, v86, VM86_REGS_SIZE1); tmp += copy_from_user(&info.regs.VM86_REGS_PART2, &v86->regs.VM86_REGS_PART2, @@ -148,7 +150,7 @@ goto out; memset(&info.vm86plus, 0, (int)&info.regs32 - (int)&info.vm86plus); info.regs32 = (struct pt_regs *) &v86; - tsk->tss.vm86_info = v86; + tsk->thread.vm86_info = v86; do_sys_vm86(&info, tsk); ret = 0; /* we never return here */ out: @@ -188,7 +190,7 @@ /* we come here only for functions VM86_ENTER, VM86_ENTER_NO_BYPASS */ ret = -EPERM; - if (tsk->tss.saved_esp0) + if (tsk->thread.saved_esp0) goto out; tmp = copy_from_user(&info, v86, VM86_REGS_SIZE1); tmp += copy_from_user(&info.regs.VM86_REGS_PART2, &v86->regs.VM86_REGS_PART2, @@ -198,7 +200,7 @@ goto out; info.regs32 = (struct pt_regs *) &subfunction; info.vm86plus.is_vm86pus = 1; - tsk->tss.vm86_info = (struct vm86_struct *)v86; + tsk->thread.vm86_info = (struct vm86_struct *)v86; do_sys_vm86(&info, tsk); ret = 0; /* we never return here */ out: @@ -209,6 +211,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk) { + struct hard_thread_struct *tss; /* * make sure the vm86() system call doesn't try to do anything silly */ @@ -231,16 +234,16 @@ switch (info->cpu_type) { case CPU_286: - tsk->tss.v86mask = 0; + tsk->thread.v86mask = 0; break; case CPU_386: - tsk->tss.v86mask = NT_MASK | IOPL_MASK; + tsk->thread.v86mask = NT_MASK | IOPL_MASK; break; case CPU_486: - tsk->tss.v86mask = AC_MASK | NT_MASK | IOPL_MASK; + tsk->thread.v86mask = AC_MASK | NT_MASK | IOPL_MASK; break; default: - tsk->tss.v86mask = ID_MASK | AC_MASK | NT_MASK | IOPL_MASK; + tsk->thread.v86mask = ID_MASK | AC_MASK | NT_MASK | IOPL_MASK; break; } @@ -248,10 +251,11 @@ * Save old state, set default return value (%eax) to 0 */ info->regs32->eax = 0; - tsk->tss.saved_esp0 = tsk->tss.esp0; - tsk->tss.esp0 = (unsigned 
long) &info->VM86_TSS_ESP0; + tsk->thread.saved_esp0 = tsk->thread.esp0; + tss = init_tss + smp_processor_id(); + tss->esp0 = tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0; - tsk->tss.screen_bitmap = info->screen_bitmap; + tsk->thread.screen_bitmap = info->screen_bitmap; if (info->flags & VM86_SCREEN_BITMAP) mark_screen_rdonly(tsk); unlock_kernel(); @@ -295,7 +299,7 @@ static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs * regs) { - set_flags(VEFLAGS, eflags, current->tss.v86mask); + set_flags(VEFLAGS, eflags, current->thread.v86mask); set_flags(regs->eflags, eflags, SAFE_MASK); if (eflags & IF_MASK) set_IF(regs); @@ -303,7 +307,7 @@ static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs * regs) { - set_flags(VFLAGS, flags, current->tss.v86mask); + set_flags(VFLAGS, flags, current->thread.v86mask); set_flags(regs->eflags, flags, SAFE_MASK); if (flags & IF_MASK) set_IF(regs); @@ -315,7 +319,7 @@ if (VEFLAGS & VIF_MASK) flags |= IF_MASK; - return flags | (VEFLAGS & current->tss.v86mask); + return flags | (VEFLAGS & current->thread.v86mask); } static inline int is_revectored(int nr, struct revectored_struct * bitmap) @@ -447,8 +451,8 @@ spin_unlock_irqrestore(¤t->sigmask_lock, flags); } send_sig(SIGTRAP, current, 1); - current->tss.trap_no = trapno; - current->tss.error_code = error_code; + current->thread.trap_no = trapno; + current->thread.error_code = error_code; return 0; } --- linux/arch/i386/kernel/setup.c.orig Sun Jul 11 13:20:22 1999 +++ linux/arch/i386/kernel/setup.c Sun Jul 11 17:36:48 1999 @@ -49,6 +49,7 @@ #include #include #include +#include /* * Machine setup.. @@ -57,6 +58,8 @@ char ignore_irq13 = 0; /* set if exception 16 works */ struct cpuinfo_x86 boot_cpu_data = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; +unsigned long mmu_cr4_features __initdata = 0; + /* * Bus types .. 
*/ @@ -993,4 +996,64 @@ ((c->loops_per_sec+2500)/5000) % 100); } return p - buffer; +} + +int cpus_initialized = 0; +unsigned long cpu_initialized = 0; + +/* + * cpu_init() initializes state that is per-CPU. Some data is already + * initialized (naturally) in the bootstrap process, such as the GDT + * and IDT. We reload them nevertheless, this function acts as a + * 'CPU state barrier', nothing should get across. + */ +void cpu_init (void) +{ + int nr = smp_processor_id(); + struct hard_thread_struct * t = &init_tss[nr]; + + if (test_and_set_bit(nr,&cpu_initialized)) { + printk("CPU#%d ALREADY INITIALIZED!!!!!!!!!\n", nr); + for (;;) __sti(); + } + cpus_initialized++; + printk("INITIALIZING CPU#%d\n", nr); + + if (boot_cpu_data.x86_capability & X86_FEATURE_PSE) + clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); + + __asm__ __volatile__("lgdt %0": "=m" (gdt_descr)); + __asm__ __volatile__("lidt %0": "=m" (idt_descr)); + + /* + * Delete NT + */ + __asm__("pushfl ; andl $0xffffbfff,(%esp) ; popfl"); + + /* + * set up and load the per-CPU TSS and LDT + */ + t->esp0 = current->thread.esp0; + set_tss_desc(nr,t); + gdt_table[__TSS(nr)].b &= 0xfffffdff; + load_TR(nr); + + load_LDT(current->mm); + + /* + * Clear all 6 debug registers: + */ + +#define CD(register) __asm__("movl %0,%%db" #register ::"r"(0) ); + + CD(0); CD(1); CD(2); CD(3); /* no db4 and db5 */; CD(6); CD(7); + +#undef CD + + /* + * Force FPU initialization: + */ + current->flags &= ~PF_USEDFPU; + current->used_math = 0; + stts(); } --- linux/fs/proc/array.c.orig Sun Jul 11 13:20:20 1999 +++ linux/fs/proc/array.c Sun Jul 11 17:36:47 1999 @@ -223,7 +223,7 @@ LOAD_INT(a), LOAD_FRAC(a), LOAD_INT(b), LOAD_FRAC(b), LOAD_INT(c), LOAD_FRAC(c), - nr_running, nr_tasks, last_pid); + nr_running, nr_threads, last_pid); } static int get_kstat(char * buffer) @@ -312,7 +312,7 @@ unsigned long idle; uptime = jiffies; - idle = task[0]->times.tms_utime + task[0]->times.tms_stime; + idle = 
init_tasks[0]->times.tms_utime + init_tasks[0]->times.tms_stime; /* The formula for the fraction parts really is ((t * 100) / HZ) % 100, but that would overflow about every five days at HZ == 100. @@ -495,7 +495,7 @@ int count = 0; stack_page = (unsigned long)p; - esp = p->tss.esp; + esp = p->thread.esp; if (!stack_page || esp < stack_page || esp >= 8188+stack_page) return 0; /* include/asm-i386/system.h:switch_to() pushes ebp last. */ --- linux/fs/proc/fd.c.orig Sun Jul 11 13:20:22 1999 +++ linux/fs/proc/fd.c Sun Jul 11 17:36:47 1999 @@ -133,7 +133,7 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir) { struct inode *inode = filp->f_dentry->d_inode; - struct task_struct * p, **tarrayp; + struct task_struct *p, *tmp; unsigned int fd, pid, ino; int retval; char buf[NUMBUF]; @@ -157,7 +157,6 @@ p = find_task_by_pid(pid); if (!p) goto out_unlock; - tarrayp = p->tarray_ptr; for (fd -= 2 ; p->files && fd < p->files->max_fds; fd++, filp->f_pos++) { @@ -182,8 +181,13 @@ goto out; read_lock(&tasklist_lock); - /* filldir() might have slept, so we must re-validate "p" */ - if (p != *tarrayp || p->pid != pid) + /* + * filldir() might have slept, so we must + * re-validate "p". This is fast enough due + * to the pidhash + */ + tmp = find_task_by_pid(pid); + if (p != tmp) break; } out_unlock: --- linux/fs/proc/root.c.orig Sun Jul 11 16:26:32 1999 +++ linux/fs/proc/root.c Sun Jul 11 17:36:47 1999 @@ -849,14 +849,29 @@ int len; if (dir->i_ino == PROC_ROOT_INO) { /* check for safety... */ - dir->i_nlink = proc_root.nlink; + extern unsigned long total_forks; + static int last_timestamp = 0; - read_lock(&tasklist_lock); - for_each_task(p) { - if (p->pid) - dir->i_nlink++; + /* + * this one can be a serious 'ps' performance problem if + * there are many threads running - thus we do 'lazy' + * link-recalculation - we change it only if the number + * of threads has increased. 
+ */ + if (total_forks != last_timestamp) { + int nlink = proc_root.nlink; + + read_lock(&tasklist_lock); + last_timestamp = total_forks; + for_each_task(p) + nlink++; + read_unlock(&tasklist_lock); + /* + * subtract the # of idle threads which + * do not show up in /proc: + */ + dir->i_nlink = nlink - smp_num_cpus; } - read_unlock(&tasklist_lock); } if (!proc_lookup(dir, dentry))