--- linux/kernel/sysctl.c.orig Mon Feb 28 13:27:57 2000 +++ linux/kernel/sysctl.c Mon Feb 28 13:39:22 2000 @@ -38,6 +38,7 @@ #if defined(CONFIG_SYSCTL) /* External variables not in a header file. */ +extern int use_kni; extern int panic_timeout; extern int console_loglevel, C_A_D; extern int bdf_prm[], bdflush_min[], bdflush_max[]; @@ -222,6 +223,8 @@ {KERN_MAX_THREADS, "threads-max", &max_threads, sizeof(int), 0644, NULL, &proc_dointvec}, {KERN_RANDOM, "random", NULL, 0, 0555, random_table}, + {KERN_USE_KNI, "use_kni", &use_kni, sizeof(int), + 0644, NULL, &proc_dointvec}, {KERN_OVERFLOWUID, "overflowuid", &overflowuid, sizeof(int), 0644, NULL, &proc_dointvec_minmax, &sysctl_intvec, NULL, &minolduid, &maxolduid}, --- linux/include/linux/sysctl.h.orig Mon Feb 28 13:27:57 2000 +++ linux/include/linux/sysctl.h Mon Feb 28 13:29:37 2000 @@ -110,7 +110,8 @@ KERN_SPARC_STOP_A=44, /* int: Sparc Stop-A enable */ KERN_SHMMNI=45, /* int: shm array identifiers */ KERN_OVERFLOWUID=46, /* int: overflow UID */ - KERN_OVERFLOWGID=47 /* int: overflow GID */ + KERN_OVERFLOWGID=47, /* int: overflow GID */ + KERN_USE_KNI=48 /* int: use KNI mode */ }; --- linux/include/asm-i386/bugs.h.orig Sat Feb 12 19:56:00 2000 +++ linux/include/asm-i386/bugs.h Mon Feb 28 13:39:26 2000 @@ -18,6 +18,7 @@ */ #include +#include #include #include @@ -76,6 +77,44 @@ #endif return; } +#ifdef CONFIG_X86_FX + /* + * If we got so far we can safely turn on FXSAVE/FXRESTORE, + * but make sure we are 16-byte aligned first. + */ + if (offsetof(struct task_struct, thread.i387.hard.fxsave.fxcwd) & 15) { + /* + * This triggers a link-time error if we manage to + * break alignment somehow. 
+ */ + extern void __buggy_fxsr_alignment(void); + + __buggy_fxsr_alignment(); + } + if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) { + printk("Enabling extended fast FPU save and restore..."); + set_in_cr4(X86_CR4_OSFXSR); + printk("done.\n"); + } + /* + * Note, Katmai instructions are enabled as soon as you start + * using the FXSAVE/RESTORE stuff. This setting only + * indicates support for the masked/unmasked exceptions on + * the new PIII cpus. We don't have an Exception 19 handler + * for this yet, but we set this bit anyway. It'll kill us + * the first time we take an unmasked KNI exception, but since + * no userland apps currently use KNI, it isn't an issue yet. + * We should have the handler added by then. + */ + if (boot_cpu_data.x86_capability & X86_FEATURE_XMM) { + printk("Not enabling KNI unmasked exception support\n"); + printk("Exception 19 error handler not integrated yet\n"); +#if 0 + set_in_cr4(X86_CR4_OSXMMEXCPT); + printk("done.\n"); +#endif + } +#endif if (mca_pentium_flag) { /* The IBM Model 95 machines with pentiums lock up on * fpu test, so we avoid it. 
All pentiums have inbuilt @@ -124,23 +163,23 @@ return; if (!ignore_irq13) { printk("OK, FPU using old IRQ 13 error reporting\n"); - return; + } else { + __asm__("fninit\n\t" + "fldl %1\n\t" + "fdivl %2\n\t" + "fmull %2\n\t" + "fldl %1\n\t" + "fsubp %%st,%%st(1)\n\t" + "fistpl %0\n\t" + "fwait\n\t" + "fninit" + : "=m" (*&boot_cpu_data.fdiv_bug) + : "m" (*&x), "m" (*&y)); + if (!boot_cpu_data.fdiv_bug) + printk("OK, FPU using exception 16 error reporting.\n"); + else + printk("Hmm, FPU using exception 16 error reporting with FDIV bug.\n"); } - __asm__("fninit\n\t" - "fldl %1\n\t" - "fdivl %2\n\t" - "fmull %2\n\t" - "fldl %1\n\t" - "fsubp %%st,%%st(1)\n\t" - "fistpl %0\n\t" - "fwait\n\t" - "fninit" - : "=m" (*&boot_cpu_data.fdiv_bug) - : "m" (*&x), "m" (*&y)); - if (!boot_cpu_data.fdiv_bug) - printk("OK, FPU using exception 16 error reporting.\n"); - else - printk("Hmm, FPU using exception 16 error reporting with FDIV bug.\n"); } static void __init check_hlt(void) @@ -438,5 +477,7 @@ check_pentium_f00f(); #endif check_cyrix_coma(); + boot_cpu_data.enable_fixups = 1; /* should be safe to use MMX/MMX2 */ + /* kernel functions now */ system_utsname.machine[1] = '0' + boot_cpu_data.x86; } --- linux/include/asm-i386/i387.h.orig Mon Feb 28 13:28:03 2000 +++ linux/include/asm-i386/i387.h Mon Feb 28 14:19:50 2000 @@ -0,0 +1,327 @@ +/* + * include/asm-i386/i387.h + * + * MMX2/KNI support, + * + * Copyright (c) 1999 Doug Ledford , + * Ingo Molnar + * + * Made from various code bits pulled from other files + * in order to put things together in a way that made + * sense. 
+ * + * Many thanks to Gabriel Paubert for + * finding subtle bugs and thinkos all along :-) + */ + +#ifndef __ASM_I386_I387_H +#define __ASM_I386_I387_H + +extern int i387_hard_to_user ( struct user_i387_struct * user, + union i387_hard_union * hard); +extern int i387_user_to_hard ( union i387_hard_union * hard, + struct user_i387_struct * user); + +/* + * Fill out the reserved bits, treat it as an fsave struct since the + * union makes this work for both fsave and fxsave structs. + */ +#ifdef CONFIG_X86_FX + +#define i387_save_hard(x) \ +do { \ + if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) { \ + __asm__ __volatile__("fxsave %0" \ + : "=m" ((x).hard.fxsave.fxcwd)); \ + } else { \ + __asm__ __volatile__("fnsave %0; fwait;" \ + : "=m" ((x).hard.fsave.cwd)); \ + } \ +} while(0) + +#define i387_restore_hard(x) \ +do { \ + if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) { \ + __asm__ __volatile__("fxrstor %0" \ + : \ + : "m" ((x).hard.fxsave.fxcwd)); \ + } else { \ + __asm__ __volatile__("frstor %0" \ + : \ + :"m" ((x).hard.fsave.cwd)); \ + } \ +} while(0) + +#define i387_set_cwd(x,v) \ +do { \ + if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) { \ + (x).fxsave.fxcwd = (short)(v); \ + } else { \ + (x).fsave.cwd = ((long)(v) | 0xffff0000); \ + } \ +} while(0) + +#define i387_get_swd(x) \ +({ \ + unsigned int __tmp; \ + \ + if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) { \ + __tmp = (x).fxsave.fxswd; \ + } else { \ + __tmp = (x).fsave.swd; \ + } \ + __tmp; \ +}) + +#define i387_set_swd(x,v) \ +do { \ + if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) { \ + (x).fxsave.fxswd = (short)(v); \ + } else { \ + (x).fsave.swd = ((long)(v) | 0xffff0000); \ + } \ +} while(0) + +#define i387_set_twd(x,v) \ +do { \ + if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) { \ + (x).fxsave.fxtwd = (short)(v); \ + } else { \ + (x).fsave.twd = ((long)(v) | 0xffff0000); \ + } \ +} while(0) + +static inline unsigned short fputag_KNI_to_387(unsigned char tb) { + 
unsigned short tw = tb; + tw = (tw | (tw << 4)) & 0x0f0f; /* zzzz7654zzzz3210 */ + tw = (tw | (tw << 2)) & 0x3333; /* zz76zz54zz32zz10 */ + tw = (tw | (tw << 1)) & 0x5555; /* z7z6z5z4z3z2z1z0 */ + tw = ~(tw * 3); + return tw; +} + +static inline unsigned char fputag_387_to_KNI(unsigned short tw) { + tw = ~tw & 0x5555; /* z7z6z5z4z3z2z1z0 */ + tw = (tw | (tw >> 1)) & 0x3333; /* zz76zz54zz32zz10 */ + tw = (tw | (tw >> 2)) & 0x0f0f; /* zzzz7654zzzz3210 */ + tw = (tw | (tw >> 4)) & 0x00ff; /* zzzzzzzz76543210 */ + return tw; +} + +#else /* CONFIG_X86_FX */ + +#define i387_save_hard(x) \ +do { \ + __asm__ __volatile__("fnsave %0; fwait;" \ + : "=m" ((x).hard.fsave.cwd)); \ +} while(0) + +#define i387_restore_hard(x) \ +do { \ + __asm__ __volatile__("frstor %0" \ + : \ + :"m" ((x).hard.fsave.cwd)); \ +} while(0) + +#define i387_set_cwd(x,v) \ +do { (x).fsave.cwd = ((long)(v) | 0xffff0000); } while(0) + +#define i387_set_swd(x,v) \ +do { (x).fsave.swd = ((long)(v) | 0xffff0000); } while(0) + +#define i387_set_twd(x,v) \ +do { (x).fsave.twd = ((long)(v) | 0xffff0000); } while(0) + +#endif /* CONFIG_X86_FX */ + +/* + * FPU lazy state save handling.. 
+ */ +#define save_kern_fpu(tsk) do { \ + if(tsk->thread.mmx_reg_space != NULL) \ + __asm__("movq %%mm0, 0x00(%0)\n\t" \ + "movq %%mm1, 0x08(%0)\n\t" \ + "movq %%mm2, 0x10(%0)\n\t" \ + "movq %%mm3, 0x18(%0)\n\t" \ + :: "r" (tsk->thread.mmx_reg_space):"memory"); \ + if(tsk->thread.kni_reg_space != NULL) \ + __asm__("movups %%xmm0, 0x00(%0)\n\t" \ + "movups %%xmm1, 0x10(%0)\n\t" \ + "movups %%xmm2, 0x20(%0)\n\t" \ + "movups %%xmm3, 0x30(%0)\n\t" \ + :: "r" (tsk->thread.kni_reg_space):"memory"); \ +} while (0) + +#define unlazy_fpu(tsk) do { \ + if (tsk->thread.x86_fpustate & X86_FPUSTATE_KERN_ANY) { \ + save_kern_fpu(tsk); \ + if (!(tsk->flags & PF_USEDFPU)) { \ + stts(); \ + } \ + } \ + if (tsk->flags & PF_USEDFPU) { \ + if (!(tsk->thread.x86_fpustate & X86_FPUSTATE_USER_SAVED)) { \ + i387_save_hard(tsk->thread.i387); \ + } \ + tsk->flags &= ~PF_USEDFPU; \ + stts(); \ + } \ +} while (0) + +#define clear_fpu(tsk) do { \ + if ( (tsk->flags & PF_USEDFPU) || \ + (tsk->thread.x86_fpustate) ) { \ + tsk->flags &= ~PF_USEDFPU; \ + tsk->thread.x86_fpustate = 0; \ + stts(); \ + } \ +} while (0) + +/* + * For when we want to use the FPU in kernel code + * + * These functions allow the use of up to 4 KNI based xmm registers on the + * Pentium III processors or up to 4 MMX registers on Pentium MMX and above + * or compatible processors. Pick the routines that you need based on the + * regs you are going to use. Keep in mind that these are intended to be + * used only after you've verified that the processor supports these + * operations. Use them before you've done that and watch your machine go + * boom. Take a look in arch/i386/lib/best_function.c for an example of + * how to fixup the kernel with kni/mmx using functions once the CPU + * capabilities have been determined. 
+ * + * In all of these functions: + * + * recursive - int, used to determine what the state is at restore time + * regs - char * to an array that is 32 bytes for mmx and 64 bytes for kni + * which is then used to save off the contents of the current + * regs to be recursively safe + * task_switch_regs - char * to another array of the same size as the one + * above, but this array is optional. If your function might get + * pre-empted by another task then this pointer should be non-NULL + * so that at unlazy_fpu() time in the switch_to() function we + * can save your register state (copy_*_user functions are an example + * of functions that need this, since they can take a page fault and + * while that fault is being serviced the scheduler is free to run + * another task entirely). + * irqflags - unsigned long used to store IRQ state + */ + +#define SAVE_MMX_REGS(regs) \ + __asm__ __volatile__("movq %%mm0, 0x00(%0)\n\t" \ + "movq %%mm1, 0x08(%0)\n\t" \ + "movq %%mm2, 0x10(%0)\n\t" \ + "movq %%mm3, 0x18(%0)\n\t" \ + : : "r" ((regs)) : "memory" ); + +#define RESTORE_MMX_REGS(regs) \ + __asm__ __volatile__("movq 0x00(%0), %%mm0\n\t" \ + "movq 0x08(%0), %%mm1\n\t" \ + "movq 0x10(%0), %%mm2\n\t" \ + "movq 0x18(%0), %%mm3\n\t" \ + : : "r" ((regs))); + +#define SAVE_KNI_REGS(regs) \ + __asm__ __volatile__("movups %%xmm0, 0x00(%0)\n\t" \ + "movups %%xmm1, 0x10(%0)\n\t" \ + "movups %%xmm2, 0x20(%0)\n\t" \ + "movups %%xmm3, 0x30(%0)\n\t" \ + : : "r" ((regs)) : "memory" ); + +#define RESTORE_KNI_REGS(regs) \ + __asm__ __volatile__("movups 0x00(%0), %%xmm0\n\t" \ + "movups 0x10(%0), %%xmm1\n\t" \ + "movups 0x20(%0), %%xmm2\n\t" \ + "movups 0x30(%0), %%xmm3\n\t" \ + : : "r" ((regs))); + +#define SFENCE() \ + __asm__ __volatile__("sfence":::"memory") + + +extern spinlock_t kern_fpu_lock; + +/* + * Although it seems wasteful to do a unilateral clts() in the take_fpu + * functions, the reason I did it that way is because the alternative is + * to test for: + * + * if ( ( 
(current->flags & PF_USEDFPU) && + * (current->thread.x86_fpustate & X86_FPUSTATE_USER_SAVED) ) || + * ( !(current->flags & PF_USEDFPU) && + * !(current->thread.x86_fpustate & X86_FPUSTATE_KERN_ANY) ) ) + * + */ + +#define kernel_take_fpu_mmx(recursive, regs, task_switch_regs, irqflags) do { \ + spin_lock_irqsave(&kern_fpu_lock, (irqflags)); \ + clts(); \ + (recursive) = (current->thread.x86_fpustate & X86_FPUSTATE_KERN_ANY); \ + if ( (current->flags & PF_USEDFPU) && \ + !(current->thread.x86_fpustate & X86_FPUSTATE_USER_SAVED) ){ \ + i387_save_hard(current->thread.i387); \ + current->thread.x86_fpustate |= X86_FPUSTATE_USER_SAVED; \ + } \ + if ((recursive) & X86_FPUSTATE_KERN_MMX) { \ + SAVE_MMX_REGS((regs)); \ + } else { \ + current->thread.mmx_reg_space = (task_switch_regs); \ + current->thread.x86_fpustate |= X86_FPUSTATE_KERN_MMX; \ + } \ + spin_unlock_irqrestore(&kern_fpu_lock, (irqflags)); \ +} while (0) + +#define kernel_release_fpu_mmx(recursive, regs, irqflags) do { \ + spin_lock_irqsave(&kern_fpu_lock, (irqflags)); \ + if ((recursive) & X86_FPUSTATE_KERN_MMX) { \ + RESTORE_MMX_REGS((regs)); \ + } else { \ + current->thread.x86_fpustate &= ~X86_FPUSTATE_KERN_MMX; \ + current->thread.mmx_reg_space = NULL; \ + } \ + if ((recursive) == 0) { \ + stts(); \ + } \ + spin_unlock_irqrestore(&kern_fpu_lock, (irqflags)); \ +} while (0) + +#define kernel_take_fpu_kni(recursive, regs, task_switch_regs, irqflags) do { \ + spin_lock_irqsave(&kern_fpu_lock, (irqflags)); \ + clts(); \ + (recursive) = current->thread.x86_fpustate; \ + if ( (current->flags & PF_USEDFPU) || \ + (current->thread.x86_fpustate & X86_FPUSTATE_KERN_KNI) ) { \ + SAVE_KNI_REGS((regs)); \ + } \ + if (!(current->thread.x86_fpustate & X86_FPUSTATE_KERN_KNI)) { \ + current->thread.kni_reg_space = (task_switch_regs); \ + current->thread.x86_fpustate |= X86_FPUSTATE_KERN_KNI; \ + } \ + spin_unlock_irqrestore(&kern_fpu_lock, (irqflags)); \ +} while (0) + + +#define kernel_release_fpu_kni(recursive, regs, 
irqflags) do { \ + spin_lock_irqsave(&kern_fpu_lock, (irqflags)); \ + if ( (current->thread.x86_fpustate & X86_FPUSTATE_USER_SAVED) && \ + !(((recursive) & X86_FPUSTATE_USER_SAVED) && \ + (current->flags & PF_USEDFPU)) ) { \ + i387_restore_hard(current->thread.i387); \ + current->thread.x86_fpustate &= ~X86_FPUSTATE_USER_SAVED; \ + } \ + if ( ((recursive) & X86_FPUSTATE_KERN_KNI) || \ + (current->flags & PF_USEDFPU) ) { \ + RESTORE_KNI_REGS((regs)); \ + } \ + if (((recursive) & X86_FPUSTATE_KERN_KNI) == 0) { \ + current->thread.x86_fpustate &= ~X86_FPUSTATE_KERN_KNI; \ + current->thread.kni_reg_space = NULL; \ + } \ + if ( ((recursive) == 0) && ((current->flags & PF_USEDFPU) == 0) ) { \ + stts(); \ + } \ + spin_unlock_irqrestore(&kern_fpu_lock, (irqflags)); \ +} while (0) + + +#endif /* __ASM_I386_I387_H */ --- linux/include/asm-i386/io.h.orig Sat Feb 12 19:48:22 2000 +++ linux/include/asm-i386/io.h Mon Feb 28 13:42:13 2000 @@ -181,9 +181,9 @@ #define __raw_writew writew #define __raw_writel writel -#define memset_io(a,b,c) memset(__io_virt(a),(b),(c)) -#define memcpy_fromio(a,b,c) memcpy((a),__io_virt(b),(c)) -#define memcpy_toio(a,b,c) memcpy(__io_virt(a),(b),(c)) +#define memset_io(a,b,c) __memset_generic(__io_virt(a),(b),(c)) +#define memcpy_fromio(a,b,c) __memcpy((a),__io_virt(b),(c)) +#define memcpy_toio(a,b,c) __memcpy(__io_virt(a),(b),(c)) /* * ISA space is 'always mapped' on a typical x86 system, no need to --- linux/include/asm-i386/processor.h.orig Mon Feb 28 13:27:49 2000 +++ linux/include/asm-i386/processor.h Mon Feb 28 13:38:45 2000 @@ -12,6 +12,7 @@ #include #include #include +#include #include /* @@ -37,6 +38,7 @@ char rfu; int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */ __u32 x86_capability; + __u32 x86_cr4_features; char x86_vendor_id[16]; char x86_model_id[64]; int x86_cache_size; /* in KB - valid for CPUS which support this @@ -44,6 +46,7 @@ int fdiv_bug; int f00f_bug; int coma_bug; + int enable_fixups; unsigned long 
loops_per_sec; unsigned long *pgd_quick; unsigned long *pmd_quick; @@ -80,16 +83,16 @@ #define X86_FEATURE_PGE 0x00002000 /* Page Global Enable */ #define X86_FEATURE_MCA 0x00004000 /* Machine Check Architecture */ #define X86_FEATURE_CMOV 0x00008000 /* CMOV instruction (FCMOVCC and FCOMI too if FPU present) */ -#define X86_FEATURE_PAT 0x00010000 /* Page Attribute Table */ +#define X86_FEATURE_PAT 0x00010000 /* Page Attribute Table */ #define X86_FEATURE_PSE36 0x00020000 /* 36-bit PSEs */ -#define X86_FEATURE_18 0x00040000 +#define X86_FEATURE_PN 0x00040000 /* 96 bit CPU serial # */ #define X86_FEATURE_19 0x00080000 #define X86_FEATURE_20 0x00100000 #define X86_FEATURE_21 0x00200000 #define X86_FEATURE_22 0x00400000 #define X86_FEATURE_MMX 0x00800000 /* multimedia extensions */ #define X86_FEATURE_FXSR 0x01000000 /* FXSAVE and FXRSTOR instructions (fast save and restore of FPU context), and CR4.OSFXSR (OS uses these instructions) available */ -#define X86_FEATURE_25 0x02000000 +#define X86_FEATURE_XMM 0x02000000 /* Intel MMX2 instruction set */ #define X86_FEATURE_26 0x04000000 #define X86_FEATURE_27 0x08000000 #define X86_FEATURE_28 0x10000000 @@ -100,6 +103,40 @@ extern struct cpuinfo_x86 boot_cpu_data; extern struct tss_struct init_tss[NR_CPUS]; +#define X86_CR4_VME 0x0001 /* enable vm86 extensions */ +#define X86_CR4_PVI 0x0002 /* virtual interrupts flag enable */ +#define X86_CR4_TSD 0x0004 /* disable time stamp at ipl 3 */ +#define X86_CR4_DE 0x0008 /* enable debugging extensions */ +#define X86_CR4_PSE 0x0010 /* enable page size extensions */ +#define X86_CR4_PAE 0x0020 /* enable physical address extensions */ +#define X86_CR4_MCE 0x0040 /* Machine check enable */ +#define X86_CR4_PGE 0x0080 /* enable global pages */ +#define X86_CR4_PCE 0x0100 /* enable performance counters at ipl 3 */ +#define X86_CR4_OSFXSR 0x0200 /* fast FPU save/restore */ +#define X86_CR4_OSXMMEXCPT 0x0400 /* KNI (MMX2) unmasked exception 19 */ + /* handler is available */ + +/* + * 
Some defines for using with the x86_fpu_state variable in the new + * thread struct. We use these because the rest of the kernel doesn't + * like us messing with current->flags at arbitrary times ;-) + */ +#define X86_FPUSTATE_USER_SAVED 0x0001 +#define X86_FPUSTATE_KERN_ANY 0x0006 +#define X86_FPUSTATE_KERN_MMX 0x0002 +#define X86_FPUSTATE_KERN_KNI 0x0004 + +static inline void load_default_mxcsr(void) +{ + long mxcsr = 0x1f80; + + if ( (boot_cpu_data.x86_cr4_features & X86_CR4_OSFXSR) && + (boot_cpu_data.x86_capability & X86_FEATURE_XMM) ) { + __asm__("ldmxcsr %0": :"m" (mxcsr)); + } +} + + #ifdef __SMP__ extern struct cpuinfo_x86 cpu_data[]; #define current_cpu_data cpu_data[smp_processor_id()] @@ -137,7 +174,6 @@ : "cc"); } - /* * Intel CPU features in CR4 */ @@ -157,11 +193,10 @@ * enable), so that any CPU's that boot up * after us can get the correct flags. */ -extern unsigned long mmu_cr4_features; static inline void set_in_cr4 (unsigned long mask) { - mmu_cr4_features |= mask; + boot_cpu_data.x86_cr4_features |= mask; __asm__("movl %%cr4,%%eax\n\t" "orl %0,%%eax\n\t" "movl %%eax,%%cr4\n" @@ -171,7 +206,7 @@ static inline void clear_in_cr4 (unsigned long mask) { - mmu_cr4_features &= ~mask; + boot_cpu_data.x86_cr4_features &= ~mask; __asm__("movl %%cr4,%%eax\n\t" "andl %0,%%eax\n\t" "movl %%eax,%%cr4\n" @@ -235,36 +270,61 @@ #define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap) #define INVALID_IO_BITMAP_OFFSET 0x8000 -struct i387_hard_struct { - long cwd; - long swd; - long twd; - long fip; - long fcs; - long foo; - long fos; - long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */ - long status; /* software status information */ +struct i387_hard_fsave { + long cwd; + long swd; + long twd; + long fip; + long fcs; + long foo; + long fos; + long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */ +}; + +/* + * has to be 128-bit aligned + */ +struct i387_hard_fxsave { + unsigned short fxcwd; + unsigned short fxswd; + unsigned short fxtwd; 
+ unsigned short fxfopcode; + long fxfip; + short fxfcs; + short __reserved_00; + long fxfoo; + short fxfos; + short __reserved_01; + long mxcsr; + long __reserved_02; + long st_space[32]; /* 8*16 bytes for each FP/MMX-reg = 128 bytes */ + long xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */ + long __reserved_03 [14*4]; /* 14 16byte lines for remainder */ +} __attribute__ ((aligned (16))); + +union i387_hard_union { + struct i387_hard_fxsave fxsave; + struct i387_hard_fsave fsave; }; struct i387_soft_struct { - long cwd; - long swd; - long twd; - long fip; - long fcs; - long foo; - long fos; - long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */ - unsigned char ftop, changed, lookahead, no_update, rm, alimit; - struct info *info; - unsigned long entry_eip; + long cwd; + long swd; + long twd; + long fip; + long fcs; + long foo; + long fos; + long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */ + unsigned char ftop, changed, lookahead, no_update, rm, alimit; + struct info *info; + unsigned long entry_eip; }; union i387_union { - struct i387_hard_struct hard; + union i387_hard_union hard; struct i387_soft_struct soft; -}; +} __attribute__ ((aligned(16))); typedef struct { unsigned long seg; @@ -311,8 +371,11 @@ unsigned long debugreg[8]; /* %%db0-7 debug registers */ /* fault info */ unsigned long cr2, trap_no, error_code; -/* floating point info */ +/* floating point / MMX / MMX2 info */ union i387_union i387; + volatile long x86_fpustate; + char *mmx_reg_space; + char *kni_reg_space; /* virtual 86 mode info */ struct vm86_struct * vm86_info; unsigned long screen_bitmap; @@ -322,14 +385,14 @@ unsigned long io_bitmap[IO_BITMAP_SIZE+1]; }; -#define INIT_THREAD { \ - 0, \ - 0, 0, 0, 0, \ - { [0 ... 7] = 0 }, /* debugging registers */ \ - 0, 0, 0, \ - { { 0, }, }, /* 387 state */ \ - 0,0,0,0,0,0, \ - 0,{~0,} /* io permissions */ \ +#define INIT_THREAD { \ + 0, \ + 0, 0, 0, 0, \ + { [0 ... 
7] = 0 }, /* debugging registers */ \ + 0, 0, 0, \ + { { { 0, }, }, }, 0, NULL, NULL, /* 387/MMX state */ \ + 0,0,0,0,0,0, \ + 0,{~0,} /* io permissions */ \ } #define INIT_MMAP \ @@ -377,27 +440,6 @@ extern void copy_segments(struct task_struct *p, struct mm_struct * mm); extern void release_segments(struct mm_struct * mm); extern void forget_segments(void); - -/* - * FPU lazy state save handling.. - */ -#define save_fpu(tsk) do { \ - asm volatile("fnsave %0\n\tfwait":"=m" (tsk->thread.i387)); \ - tsk->flags &= ~PF_USEDFPU; \ - stts(); \ -} while (0) - -#define unlazy_fpu(tsk) do { \ - if (tsk->flags & PF_USEDFPU) \ - save_fpu(tsk); \ -} while (0) - -#define clear_fpu(tsk) do { \ - if (tsk->flags & PF_USEDFPU) { \ - tsk->flags &= ~PF_USEDFPU; \ - stts(); \ - } \ -} while (0) /* * Return saved PC of a blocked thread. --- linux/include/asm-i386/string.h.orig Sat Feb 12 19:48:09 2000 +++ linux/include/asm-i386/string.h Mon Feb 28 13:38:45 2000 @@ -15,6 +15,10 @@ #include #else +#ifndef _LINUX_CONFIG_H +#include +#endif + /* * This string-include defines all string functions as inline * functions. Use gcc. It also assumes ds=es=data space, this should be @@ -286,6 +290,17 @@ #include +#ifdef CONFIG_X86_CPU_OPTIMIZATIONS +extern void * __kni_memcpy(void * to, const void * from, size_t n); +extern void * best_memcpy(void * to, const void * from, size_t n); +#define memcpy(t, f, n) \ +(__builtin_constant_p(n) ? \ + (((n) < 128) ? \ + __constant_memcpy((t),(f),(n)) : \ + best_memcpy((t),(f),(n))) : \ + best_memcpy((t),(f),(n))) +#else + #ifdef CONFIG_X86_USE_3DNOW /* All this just for in_interrupt() ... */ @@ -332,6 +347,7 @@ __memcpy((t),(f),(n))) #endif +#endif /* * struct_cpy(x,y), copy structure *x into (matching structure) *y. @@ -496,21 +512,32 @@ #undef COMMON } -#define __constant_c_x_memset(s, c, count) \ -(__builtin_constant_p(count) ? 
\ - __constant_c_and_count_memset((s),(c),(count)) : \ - __constant_c_memset((s),(c),(count))) +#define __constant_x_count_memset(s, c, count) \ +(__builtin_constant_p(c) ? \ + __constant_c_and_count_memset((s),(0x01010101UL*(unsigned char)(c)),(count)) :\ + __constant_count_memset((s),(c),(count))) #define __memset(s, c, count) \ -(__builtin_constant_p(count) ? \ - __constant_count_memset((s),(c),(count)) : \ +(__builtin_constant_p(c) ? \ + __constant_c_memset((s),(0x01010101UL*(unsigned char)(c)),(count)) : \ __memset_generic((s),(c),(count))) #define __HAVE_ARCH_MEMSET +#ifdef CONFIG_X86_CPU_OPTIMIZATIONS +extern void * __kni_memset(void * s, char c, size_t count); +extern void * best_memset(void * s, char c, size_t count); #define memset(s, c, count) \ -(__builtin_constant_p(c) ? \ - __constant_c_x_memset((s),(0x01010101UL*(unsigned char)(c)),(count)) : \ +(__builtin_constant_p(count) ? \ + (((count) < 128) ? \ + __constant_x_count_memset((s),(c),(count)) : \ + best_memset((s),(c),(count))) : \ + best_memset((s),(c),(count))) +#else +#define memset(s, c, count) \ +(__builtin_constant_p(count) ? 
\ + __constant_x_count_memset((s),(c),(count)) : \ __memset((s),(c),(count))) +#endif /* * find the first occurrence of byte 'c', or 1 past the area if none --- linux/include/asm-i386/uaccess.h.orig Sat Feb 12 19:48:17 2000 +++ linux/include/asm-i386/uaccess.h Mon Feb 28 13:38:46 2000 @@ -571,19 +571,61 @@ return n; } +#ifdef CONFIG_X86_CPU_OPTIMIZATIONS + +/* + * The XMM based copy_*_user() function declarations...the best_*_user() + * routines need this + */ +unsigned long kni_copy_to_user(void *, const void *, unsigned long); +unsigned long kni_copy_from_user(void *, const void *, unsigned long); +unsigned long __kni_copy_to_user_nocheck(void *, const void *, unsigned long); +unsigned long __kni_copy_from_user_nocheck(void *, const void *, unsigned long); + +unsigned long best_copy_to_user(void *, const void *, unsigned long); +unsigned long best_copy_from_user(void *, const void *, unsigned long); +unsigned long __best_copy_to_user(void *, const void *, unsigned long); +unsigned long __best_copy_from_user(void *, const void *, unsigned long); + #define copy_to_user(to,from,n) \ (__builtin_constant_p(n) ? \ + (((n) < 128) ? \ __constant_copy_to_user((to),(from),(n)) : \ - __generic_copy_to_user((to),(from),(n))) + best_copy_to_user((to),(from),(n))) : \ + best_copy_to_user((to),(from),(n))) #define copy_from_user(to,from,n) \ (__builtin_constant_p(n) ? \ + (((n) < 128) ? \ __constant_copy_from_user((to),(from),(n)) : \ - __generic_copy_from_user((to),(from),(n))) + best_copy_from_user((to),(from),(n))) : \ + best_copy_from_user((to),(from),(n))) -#define copy_to_user_ret(to,from,n,retval) ({ if (copy_to_user(to,from,n)) return retval; }) +#define __copy_to_user(to,from,n) \ + (__builtin_constant_p(n) ? \ + (((n) < 128) ? 
\ + __constant_copy_to_user_nocheck((to),(from),(n)) : \ + __best_copy_to_user((to),(from),(n))) : \ + __best_copy_to_user((to),(from),(n))) -#define copy_from_user_ret(to,from,n,retval) ({ if (copy_from_user(to,from,n)) return retval; }) +#define __copy_from_user(to,from,n) \ + (__builtin_constant_p(n) ? \ + (((n) < 128) ? \ + __constant_copy_from_user_nocheck((to),(from),(n)) : \ + __best_copy_from_user((to),(from),(n))) : \ + __best_copy_from_user((to),(from),(n))) + +#else /* CONFIG_X86_CPU_OPTIMIZATIONS */ + +#define copy_to_user(to,from,n) \ + (__builtin_constant_p(n) ? \ + __constant_copy_to_user((to),(from),(n)) : \ + __generic_copy_to_user((to),(from),(n))) + +#define copy_from_user(to,from,n) \ + (__builtin_constant_p(n) ? \ + __constant_copy_from_user((to),(from),(n)) : \ + __generic_copy_from_user((to),(from),(n))) #define __copy_to_user(to,from,n) \ (__builtin_constant_p(n) ? \ @@ -594,6 +636,11 @@ (__builtin_constant_p(n) ? \ __constant_copy_from_user_nocheck((to),(from),(n)) : \ __generic_copy_from_user_nocheck((to),(from),(n))) +#endif + +#define copy_to_user_ret(to,from,n,retval) ({ if (copy_to_user(to,from,n)) return retval; }) + +#define copy_from_user_ret(to,from,n,retval) ({ if (copy_from_user(to,from,n)) return retval; }) long strncpy_from_user(char *dst, const char *src, long count); long __strncpy_from_user(char *dst, const char *src, long count); --- linux/include/asm-i386/pgtable.h.orig Mon Feb 28 13:41:37 2000 +++ linux/include/asm-i386/pgtable.h Mon Feb 28 13:42:08 2000 @@ -54,8 +54,8 @@ "movl %0, %%cr3; \n" \ "movl %2, %%cr4; # turn PGE back on \n" \ : "=&r" (tmpreg) \ - : "r" (mmu_cr4_features & ~X86_CR4_PGE), \ - "r" (mmu_cr4_features) \ + : "r" (boot_cpu_data.x86_cr4_features & ~X86_CR4_PGE),\ + "r" (boot_cpu_data.x86_cr4_features) \ : "memory"); \ } while (0) --- linux/arch/i386/lib/Makefile.orig Tue Oct 19 21:36:05 1999 +++ linux/arch/i386/lib/Makefile Mon Feb 28 13:28:03 2000 @@ -13,4 +13,8 @@ L_OBJS += mmx.o endif +ifeq 
($(CONFIG_X86_CPU_OPTIMIZATIONS),y) + L_OBJS += best_function.o simd.o mmx.o +endif + include $(TOPDIR)/Rules.make --- linux/arch/i386/lib/best_function.c.orig Mon Feb 28 13:28:03 2000 +++ linux/arch/i386/lib/best_function.c Mon Feb 28 13:28:03 2000 @@ -0,0 +1,315 @@ +/* + * SIMD functions. These replace the functions in asm-i386/string.h + * whenever it makes sense. These also un-inline those functions. + * + * Copyright 1999, Doug Ledford + * + * These functions are simple and trivial, consider them to be + * public domain + */ + +#include +#include +#include +#include + +/* + * We declare our accelerator functions here since this is the only place + * that needs the declarations which makes a header file a pain to deal + * with + */ +extern void * kni_memcpy(void *, const void *, size_t); +extern void * kni_memset(void *, char, size_t); +extern void * athlon_memcpy(void *, const void *, size_t); +extern void * athlon_memset(void *, char, size_t); +extern void * mmx_memcpy(void *, const void *, size_t); +extern void * mmx_memset(void *, char, size_t); +extern unsigned long kni_copy_to_user(void *, const void *, unsigned long); +extern unsigned long kni_copy_from_user(void *, const void *, unsigned long); +extern unsigned long athlon_copy_to_user(void *, const void *, unsigned long); +extern unsigned long athlon_copy_from_user(void *, const void *, unsigned long); +extern unsigned long mmx_copy_to_user(void *, const void *, unsigned long); +extern unsigned long mmx_copy_from_user(void *, const void *, unsigned long); +extern unsigned long __kni_copy_to_user_nocheck(void *, const void *, unsigned long); +extern unsigned long __kni_copy_from_user_nocheck(void *, const void *, unsigned long); +extern unsigned long __athlon_copy_to_user_nocheck(void *, const void *, unsigned long); +extern unsigned long __athlon_copy_from_user_nocheck(void *, const void *, unsigned long); +extern unsigned long __mmx_copy_to_user_nocheck(void *, const void *, unsigned long); +extern 
unsigned long __mmx_copy_from_user_nocheck(void *, const void *, unsigned long);
+
+static void * best_memcpy_final(void *, const void *, size_t);
+static void * best_memset_final(void *, char, size_t);
+static unsigned long best_copy_to_user_final(void *, const void *, unsigned long);
+static unsigned long best_copy_from_user_final(void *, const void *, unsigned long);
+static unsigned long __best_copy_to_user_final(void *, const void *, unsigned long);
+static unsigned long __best_copy_from_user_final(void *, const void *, unsigned long);
+/*
+ * best_memcpy - pick a memcpy implementation from boot_cpu_data and
+ * forward the call to it.
+ *
+ * When boot_cpu_data.enable_fixups is set, the function also stores
+ * (address of the chosen routine - return address) into the int that
+ * sits immediately before the return address.  Presumably that int is
+ * the rel32 operand of the near CALL that got us here, so later calls
+ * from the same site go straight to the chosen routine; TODO confirm
+ * that every caller reaches this through a direct 5-byte call.
+ *
+ * Selection order when fixups are enabled:
+ *   1. X86_CR4_OSFXSR set in x86_cr4_features and X86_FEATURE_XMM set
+ *      in x86_capability        -> kni_memcpy
+ *   2. vendor AMD, x86 == 7     -> athlon_memcpy
+ *   3. vendor Cyrix, x86 == 6   -> mmx_memcpy
+ *   4. anything else            -> __memcpy, call site redirected to
+ *                                  best_memcpy_final
+ * With fixups disabled the call is simply forwarded to __memcpy and the
+ * call site is left alone.
+ *
+ * NOTE(review): the memcpy/memset dispatchers test boot_cpu_data.x86
+ * for the AMD case while the *_copy_*_user dispatchers below test
+ * boot_cpu_data.x86_model; confirm which field is intended.
+ */
+void * best_memcpy(void * to, const void * from, size_t n)
+{
+	static int first=1;	/* gates the one-time printk below */
+	int BAR = (int)__builtin_return_address(0);	/* address the call returns to */
+	int *caller = (int *)BAR - 1;	/* the 4 bytes just before that address */
+	if(boot_cpu_data.enable_fixups) {
+		if ( (boot_cpu_data.x86_cr4_features & X86_CR4_OSFXSR) &&
+		     (boot_cpu_data.x86_capability & X86_FEATURE_XMM) )
+		{
+			if(first)
+			{
+				first=0;
+				printk(KERN_INFO "KNI optimisations selcted.\n");
+			}
+			*caller = (int)kni_memcpy - BAR;	/* target relative to the return address */
+			return(kni_memcpy(to, from, n));
+		}
+		else if(boot_cpu_data.x86_vendor==X86_VENDOR_AMD &&
+			boot_cpu_data.x86 == 7)
+		{
+			if(first)
+			{
+				first=0;
+				printk(KERN_INFO "Athlon MMX optimisations selcted.\n");
+			}
+			*caller = (int)athlon_memcpy - BAR;
+			return athlon_memcpy(to,from, n);
+		}
+		/*
+		 * It is not this simple, but it will do for testing.
+		 */
+		else if(boot_cpu_data.x86_vendor==X86_VENDOR_CYRIX &&
+			boot_cpu_data.x86 == 6)
+		{
+			if(first)
+			{
+				first=0;
+				printk(KERN_INFO "Generic MMX optimisations selcted.\n");
+			}
+			*caller = (int)mmx_memcpy - BAR;
+			return mmx_memcpy(to,from, n);
+		} else {
+			*caller = (int)best_memcpy_final - BAR;
+			return(__memcpy(to, from, n));
+		}
+	} else {
+		return(__memcpy(to, from, n));
+	}
+}
+/*
+ * Generic fallback that best_memcpy points a call site at when no
+ * accelerated routine was selected; just forwards to __memcpy.
+ */
+static void * best_memcpy_final(void * to, const void * from, size_t n)
+{
+	return(__memcpy(to, from, n));
+}
+/*
+ * best_memset - memset counterpart of best_memcpy.  Same selection
+ * order and the same call-site rewrite; the generic path is
+ * __memset_generic and the generic redirect is best_memset_final.
+ * No message is printed here.
+ */
+void * best_memset(void * s, char c, size_t count)
+{
+	int BAR = (int)__builtin_return_address(0);
+	int *caller = (int *)BAR - 1;
+	if(boot_cpu_data.enable_fixups) {
+		if ( (boot_cpu_data.x86_cr4_features & X86_CR4_OSFXSR) &&
+		     (boot_cpu_data.x86_capability & X86_FEATURE_XMM) ) {
+			*caller = (int)kni_memset - BAR;
+			return(kni_memset(s, c, count));
+		}
+		else if(boot_cpu_data.x86_vendor==X86_VENDOR_AMD &&
+			boot_cpu_data.x86 == 7)
+		{
+			*caller = (int)athlon_memset - BAR;
+			return athlon_memset(s, c, count);
+		}
+		/*
+		 * It is not this simple, but it will do for testing.
+		 */
+		else if(boot_cpu_data.x86_vendor==X86_VENDOR_CYRIX &&
+			boot_cpu_data.x86 == 6)
+		{
+			*caller = (int)mmx_memset - BAR;
+			return mmx_memset(s, c, count);
+		} else {
+			*caller = (int)best_memset_final - BAR;
+			return(__memset_generic(s, c, count));
+		}
+	} else {
+		return(__memset_generic(s, c, count));
+	}
+}
+/*
+ * Generic fallback target for best_memset; forwards to
+ * __memset_generic.
+ */
+static void * best_memset_final(void * s, char c, size_t count)
+{
+	return(__memset_generic(s, c, count));
+}
+/*
+ * best_copy_to_user - dispatcher for the access-checked copy to user
+ * space.  Same call-site rewrite as best_memcpy, but the AMD case
+ * tests x86_model == 7 here.  The generic path checks
+ * access_ok(VERIFY_WRITE, to, n), runs __copy_user and returns n as
+ * __copy_user left it (presumably the count of bytes not copied;
+ * n is returned untouched when access_ok fails).
+ */
+unsigned long
+best_copy_to_user(void *to, const void *from, unsigned long n)
+{
+	int BAR = (int)__builtin_return_address(0);
+	int *caller = (int *)BAR - 1;
+	if(boot_cpu_data.enable_fixups) {
+		if ( (boot_cpu_data.x86_cr4_features & X86_CR4_OSFXSR) &&
+		     (boot_cpu_data.x86_capability & X86_FEATURE_XMM) ) {
+			*caller = (int)kni_copy_to_user - BAR;
+			return(kni_copy_to_user(to, from, n));
+		}
+		else if(boot_cpu_data.x86_vendor==X86_VENDOR_AMD &&
+			boot_cpu_data.x86_model == 7)
+		{
+			*caller = (int)athlon_copy_to_user - BAR;
+			return athlon_copy_to_user(to,from, n);
+		}
+		/*
+		 * It is not this simple, but it will do for testing.
+		 */
+		else if(boot_cpu_data.x86_vendor==X86_VENDOR_CYRIX &&
+			boot_cpu_data.x86 == 6)
+		{
+			*caller = (int)mmx_copy_to_user - BAR;
+			return mmx_copy_to_user(to, from, n);
+		} else {
+			*caller = (int)best_copy_to_user_final - BAR;
+			return(best_copy_to_user_final(to, from, n));
+		}
+	} else {
+		if (access_ok(VERIFY_WRITE, to, n)) {
+			__copy_user(to,from,n);
+		}
+		return n;
+	}
+}
+/*
+ * Generic fallback target for best_copy_to_user: access_ok check
+ * followed by __copy_user.
+ */
+static unsigned long
+best_copy_to_user_final(void *to, const void *from, unsigned long n)
+{
+	if (access_ok(VERIFY_WRITE, to, n)) {
+		__copy_user(to,from,n);
+	}
+	return n;
+}
+/*
+ * best_copy_from_user - dispatcher for the access-checked copy from
+ * user space.  Mirrors best_copy_to_user; the generic path checks
+ * access_ok(VERIFY_READ, from, n) and uses __copy_user_zeroing.
+ */
+unsigned long
+best_copy_from_user(void *to, const void *from, unsigned long n)
+{
+	int BAR = (int)__builtin_return_address(0);
+	int *caller = (int *)BAR - 1;
+	if(boot_cpu_data.enable_fixups) {
+		if ( (boot_cpu_data.x86_cr4_features & X86_CR4_OSFXSR) &&
+		     (boot_cpu_data.x86_capability & X86_FEATURE_XMM) ) {
+			*caller = (int)kni_copy_from_user - BAR;
+			return(kni_copy_from_user(to, from, n));
+		}
+		else if(boot_cpu_data.x86_vendor==X86_VENDOR_AMD &&
+			boot_cpu_data.x86_model == 7)
+		{
+			*caller = (int)athlon_copy_from_user - BAR;
+			return athlon_copy_from_user(to,from,n);
+		}
+		/*
+		 * It is not this simple, but it will do for testing.
+		 */
+		else if(boot_cpu_data.x86_vendor==X86_VENDOR_CYRIX &&
+			boot_cpu_data.x86 == 6)
+		{
+			*caller = (int)mmx_copy_from_user - BAR;
+			return mmx_copy_from_user(to, from, n);
+		} else {
+			*caller = (int)best_copy_from_user_final - BAR;
+			return(best_copy_from_user_final(to, from, n));
+		}
+	} else {
+		if (access_ok(VERIFY_READ, from, n)) {
+			__copy_user_zeroing(to,from,n);
+		}
+		return n;
+	}
+}
+/*
+ * Generic fallback target for best_copy_from_user: access_ok check
+ * followed by __copy_user_zeroing.
+ */
+static unsigned long
+best_copy_from_user_final(void *to, const void *from, unsigned long n)
+{
+	if (access_ok(VERIFY_READ, from, n)) {
+		__copy_user_zeroing(to,from,n);
+	}
+	return n;
+}
+/*
+ * __best_copy_to_user - dispatcher for the unchecked copy to user
+ * space.  Same scheme as best_copy_to_user, selecting the *_nocheck
+ * routines; the generic path calls __copy_user with no access_ok test.
+ */
+unsigned long
+__best_copy_to_user(void *to, const void *from, unsigned long n)
+{
+	int BAR = (int)__builtin_return_address(0);
+	int *caller = (int *)BAR - 1;
+	if(boot_cpu_data.enable_fixups) {
+		if ( (boot_cpu_data.x86_cr4_features & X86_CR4_OSFXSR) &&
+		     (boot_cpu_data.x86_capability & X86_FEATURE_XMM) ) {
+			*caller = (int)__kni_copy_to_user_nocheck - BAR;
+			return(__kni_copy_to_user_nocheck(to, from, n));
+		}
+		else if(boot_cpu_data.x86_vendor==X86_VENDOR_AMD &&
+			boot_cpu_data.x86_model == 7)
+		{
+			*caller = (int)__athlon_copy_to_user_nocheck - BAR;
+			return __athlon_copy_to_user_nocheck(to,from, n);
+		}
+		/*
+		 * It is not this simple, but it will do for testing.
+		 */
+		else if(boot_cpu_data.x86_vendor==X86_VENDOR_CYRIX &&
+			boot_cpu_data.x86 == 6)
+		{
+			*caller = (int)__mmx_copy_to_user_nocheck - BAR;
+			return __mmx_copy_to_user_nocheck(to, from, n);
+		} else {
+			*caller = (int)__best_copy_to_user_final - BAR;
+			return(__best_copy_to_user_final(to, from, n));
+		}
+	} else {
+		__copy_user(to,from,n);
+		return n;
+	}
+}
+/*
+ * Generic fallback target for __best_copy_to_user: plain __copy_user.
+ */
+static unsigned long
+__best_copy_to_user_final(void *to, const void *from, unsigned long n)
+{
+	__copy_user(to,from,n);
+	return n;
+}
+/*
+ * __best_copy_from_user - dispatcher for the unchecked copy from user
+ * space.  Same scheme as __best_copy_to_user; the generic path calls
+ * __copy_user_zeroing with no access_ok test.
+ */
+unsigned long
+__best_copy_from_user(void *to, const void *from, unsigned long n)
+{
+	int BAR = (int)__builtin_return_address(0);
+	int *caller = (int *)BAR - 1;
+	if(boot_cpu_data.enable_fixups) {
+		if ( (boot_cpu_data.x86_cr4_features & X86_CR4_OSFXSR) &&
+		     (boot_cpu_data.x86_capability & X86_FEATURE_XMM) ) {
+			*caller = (int)__kni_copy_from_user_nocheck - BAR;
+			return(__kni_copy_from_user_nocheck(to, from, n));
+		}
+		else if(boot_cpu_data.x86_vendor==X86_VENDOR_AMD &&
+			boot_cpu_data.x86_model == 7)
+		{
+			*caller = (int)__athlon_copy_from_user_nocheck - BAR;
+			return __athlon_copy_from_user_nocheck(to,from, n);
+		}
+		/*
+		 * It is not this simple, but it will do for testing.
+		 */
+		else if(boot_cpu_data.x86_vendor==X86_VENDOR_CYRIX &&
+			boot_cpu_data.x86 == 6)
+		{
+			*caller = (int)__mmx_copy_from_user_nocheck - BAR;
+			return __mmx_copy_from_user_nocheck(to, from, n);
+		} else {
+			*caller = (int)__best_copy_from_user_final - BAR;
+			return(__best_copy_from_user_final(to, from, n));
+		}
+	} else {
+		__copy_user_zeroing(to,from,n);
+		return n;
+	}
+}
+/*
+ * Generic fallback target for __best_copy_from_user: plain
+ * __copy_user_zeroing.
+ */
+static unsigned long
+__best_copy_from_user_final(void *to, const void *from, unsigned long n)
+{
+	__copy_user_zeroing(to,from,n);
+	return n;
+}
+
--- linux/arch/i386/lib/mmx.c.orig Thu Oct 28 03:30:39 1999 +++ linux/arch/i386/lib/mmx.c Mon Feb 28 13:28:03 2000 @@ -2,58 +2,37 @@ #include #include +#include +#include +#include + /* * MMX 3DNow! library helper functions - * - * To do: - * We can use MMX just for prefetch in IRQ's.
This may be a win. - * (reported so on K6-III) - * We should use a better code neutral filler for the short jump - * leal ebx. [ebx] is apparently best for K6-2, but Cyrix ?? - * We also want to clobber the filler register so we dont get any - * register forwarding stalls on the filler. - * - * Add *user handling. Checksums are not a win with MMX on any CPU - * tested so far for any MMX solution figured. - * */ -void *_mmx_memcpy(void *to, const void *from, size_t len) +void *athlon_memcpy(void *to, const void *from, size_t len) { void *p=to; int i= len >> 6; /* len/64 */ + char fpu_save[108]; + unsigned long flags; + int recursive; - if (!(current->flags & PF_USEDFPU)) - clts(); - else - { - __asm__ __volatile__ ( " fnsave %0; fwait\n"::"m"(current->thread.i387)); - current->flags &= ~PF_USEDFPU; - } + kernel_take_fpu_mmx(recursive, fpu_save, NULL, flags); __asm__ __volatile__ ( - "1: prefetch (%0)\n" /* This set is 28 bytes */ - " prefetch 64(%0)\n" - " prefetch 128(%0)\n" - " prefetch 192(%0)\n" - " prefetch 256(%0)\n" - "2: \n" - ".section .fixup, \"ax\"\n" - "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ - " jmp 2b\n" - ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 4\n" - " .long 1b, 3b\n" - ".previous" + " prefetch (%0)\n" + " prefetch 64(%0)\n" + " prefetch 128(%0)\n" + " prefetch 192(%0)\n" + " prefetch 256(%0)\n" : : "r" (from) ); - for(; i>0; i--) { __asm__ __volatile__ ( - "1: prefetch 320(%0)\n" - "2: movq (%0), %%mm0\n" + " prefetch 320(%0)\n" + " movq (%0), %%mm0\n" " movq 8(%0), %%mm1\n" " movq 16(%0), %%mm2\n" " movq 24(%0), %%mm3\n" @@ -69,14 +48,6 @@ " movq %%mm1, 40(%1)\n" " movq %%mm2, 48(%1)\n" " movq %%mm3, 56(%1)\n" - ".section .fixup, \"ax\"\n" - "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */ - " jmp 2b\n" - ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 4\n" - " .long 1b, 3b\n" - ".previous" : : "r" (from), "r" (to) : "memory"); from+=64; to+=64; @@ -84,29 +55,54 @@ /* * Now do the tail of the block */ + + 
kernel_release_fpu_mmx(recursive, fpu_save, flags); __memcpy(to, from, len&63); - stts(); return p; } -static void fast_clear_page(void *page) +void *athlon_memset(void *s, char c, size_t count) { + char fpu_save[108]; int i; - if (!(current->flags & PF_USEDFPU)) - clts(); - else - { - __asm__ __volatile__ ( " fnsave %0; fwait\n"::"m"(current->thread.i387)); - current->flags &= ~PF_USEDFPU; - } + u32 page = (u32)s; + int recursive; + unsigned long flags; - __asm__ __volatile__ ( - " pxor %%mm0, %%mm0\n" : : - ); + if(count < 128) + return __memset_generic(s,c,count); + + kernel_take_fpu_mmx(recursive, fpu_save, NULL, flags); - for(i=0;i<4096/128;i++) + __asm__ __volatile__ ( + " prefetchw (%0)\n" + " prefetchw 64(%0)\n" + " prefetchw 128(%0)\n" + " prefetchw 192(%0)\n" + " prefetchw 256(%0)\n" + :: "r" (page) ); + + if(c==0) + { + __asm__ __volatile__ (" pxor %%mm0, %%mm0\n" + :: "r" (page) ); + } + else + { + __memset_generic(s, c, 0x08); + /* + * One day I should try rearranging this so we do the + * final FPU take in the stall right here.. 
+ */ + __asm__ __volatile__ ( + " movq (%0), %%mm0\n" + :: "r" (page) ); + } + + for(i=0;iflags & PF_USEDFPU)) - clts(); - else + char save_buf[108]; + char tmp_buf[108]; + unsigned long flags; + int recursive; + + if(n>=128) { - __asm__ __volatile__ ( " fnsave %0; fwait\n"::"m"(current->thread.i387)); - current->flags &= ~PF_USEDFPU; + kernel_take_fpu_mmx(recursive, save_buf, tmp_buf, flags); + __asm__ __volatile__ ( + " prefetch (%0)\n" + " prefetch 64(%0)\n" + " prefetch 128(%0)\n" + " prefetch 192(%0)\n" + " prefetch 256(%0)\n" + : : "r" (from) ); + + /* + * Copy a cache line per loop + */ + + __asm__ __volatile__ ( + " subl $64, %2\n" + "0: prefetch 320(%0)\n" + " movq (%0), %%mm0\n" + " movq 8(%0), %%mm1\n" + " movq 16(%0), %%mm2\n" + " movq 24(%0), %%mm3\n" + "1: movq %%mm0, (%1)\n" + "2: movq %%mm1, 8(%1)\n" + "3: movq %%mm2, 16(%1)\n" + "4: movq %%mm3, 24(%1)\n" + " movq 32(%0), %%mm0\n" + " movq 40(%0), %%mm1\n" + " movq 48(%0), %%mm2\n" + " movq 56(%0), %%mm3\n" + "5: movq %%mm0, 32(%1)\n" + "6: movq %%mm1, 40(%1)\n" + "7: movq %%mm2, 48(%1)\n" + "8: movq %%mm3, 56(%1)\n" + " addl $64, %0\n" + " addl $64, %1\n" + " subl $64, %2\n" + " jnc 0b\n" + " addl $64, %2\n" + "20:\n" + ".section .fixup,\"ax\"\n" + "11: addl $8, %2\n" + "12: addl $8, %2\n" + "13: addl $8, %2\n" + "14: addl $8, %2\n" + "15: addl $8, %2\n" + "16: addl $8, %2\n" + "17: addl $8, %2\n" + "18: addl $8, %2\n" + " jmp 20b\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + " .align 4\n" + " .long 1b,11b\n" + " .long 2b,12b\n" + " .long 3b,13b\n" + " .long 4b,14b\n" + " .long 5b,15b\n" + " .long 6b,16b\n" + " .long 7b,17b\n" + " .long 8b,18b\n" + ".previous\n" + : "=r"(from), "=r"(to), "=c"(n) + : "0"(from), "1"(to), "2"(n) : "memory"); + /* + * Now do the tail of the block + */ + + kernel_release_fpu_mmx(recursive, save_buf, flags); } + __copy_user(to, from, n); + return n; +} - __asm__ __volatile__ ( - "1: prefetch (%0)\n" - " prefetch 64(%0)\n" - " prefetch 128(%0)\n" - " prefetch 
192(%0)\n" - " prefetch 256(%0)\n" - "2: \n" - ".section .fixup, \"ax\"\n" - "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ - " jmp 2b\n" - ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 4\n" - " .long 1b, 3b\n" - ".previous" - : : "r" (from) ); +unsigned long __athlon_copy_from_user_nocheck(void *to, const void *from, unsigned long n) +{ + char save_buf[108]; + char tmp_buf[108]; + unsigned long flags; + int recursive; + + if(n>=128) + { + kernel_take_fpu_mmx(recursive, save_buf, tmp_buf, flags); + __asm__ __volatile__ ( + " prefetch (%0)\n" + " prefetch 64(%0)\n" + " prefetch 128(%0)\n" + " prefetch 192(%0)\n" + " prefetch 256(%0)\n" + : : "r" (from) ); + + /* + * Copy a cache line per loop + */ + + __asm__ __volatile__ ( + " subl $64, %2\n" + "0: prefetch 320(%0)\n" + "1: movq (%0), %%mm0\n" + "2: movq 8(%0), %%mm1\n" + "3: movq 16(%0), %%mm2\n" + "4: movq 24(%0), %%mm3\n" + " movq %%mm0, (%1)\n" + " movq %%mm1, 8(%1)\n" + " movq %%mm2, 16(%1)\n" + " movq %%mm3, 24(%1)\n" + "5: movq 32(%0), %%mm0\n" + "6: movq 40(%0), %%mm1\n" + "7: movq 48(%0), %%mm2\n" + "8: movq 56(%0), %%mm3\n" + " movq %%mm0, 32(%1)\n" + " movq %%mm1, 40(%1)\n" + " movq %%mm2, 48(%1)\n" + " movq %%mm3, 56(%1)\n" + " addl $64, %0\n" + " addl $64, %1\n" + " subl $64, %2\n" + " jnc 0b\n" + " addl $64, %2\n" + "20:\n" + ".section .fixup,\"ax\"\n" + "11: addl $8, %2\n" + "12: addl $8, %2\n" + "13: addl $8, %2\n" + "14: addl $8, %2\n" + "15: addl $8, %2\n" + "16: addl $8, %2\n" + "17: addl $8, %2\n" + "18: addl $8, %2\n" + " jmp 20b\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + " .align 4\n" + " .long 1b,11b\n" + " .long 2b,12b\n" + " .long 3b,13b\n" + " .long 4b,14b\n" + " .long 5b,15b\n" + " .long 6b,16b\n" + " .long 7b,17b\n" + " .long 8b,18b\n" + ".previous" + : "=r"(from), "=r"(to), "=c"(n) + : "0" (from), "1" (to), "2" (n) : "memory"); + /* + * Now do the tail of the block + */ - for(i=0; i<4096/64; i++) + kernel_release_fpu_mmx(recursive, save_buf, flags); + } + 
__copy_user_zeroing(to, from, n); + return n; +} + +unsigned long athlon_copy_to_user(void *to, const void *from, unsigned long n) +{ + if (access_ok(VERIFY_WRITE, to, n)) + { + return __athlon_copy_to_user_nocheck(to,from,n); + } + return n; +} + +unsigned long athlon_copy_from_user(void *to, const void *from, unsigned long n) +{ + if (access_ok(VERIFY_READ, from, n)) + { + return __athlon_copy_from_user_nocheck(to,from,n); + } + return n; +} + + +/* + * Pure MMX functions. + */ + +void *mmx_memcpy(void *to, const void *from, size_t len) +{ + void *p=to; + int i= len >> 6; /* len/64 */ + char fpu_save[108]; + unsigned long flags; + int recursive; + + kernel_take_fpu_mmx(recursive, fpu_save, NULL, flags); + + for(; i>0; i--) { __asm__ __volatile__ ( - "1: prefetch 320(%0)\n" - "2: movq (%0), %%mm0\n" - " movq 8(%0), %%mm1\n" - " movq 16(%0), %%mm2\n" - " movq 24(%0), %%mm3\n" - " movq %%mm0, (%1)\n" - " movq %%mm1, 8(%1)\n" - " movq %%mm2, 16(%1)\n" - " movq %%mm3, 24(%1)\n" - " movq 32(%0), %%mm0\n" - " movq 40(%0), %%mm1\n" - " movq 48(%0), %%mm2\n" - " movq 56(%0), %%mm3\n" - " movq %%mm0, 32(%1)\n" - " movq %%mm1, 40(%1)\n" - " movq %%mm2, 48(%1)\n" - " movq %%mm3, 56(%1)\n" - ".section .fixup, \"ax\"\n" - "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */ - " jmp 2b\n" - ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 4\n" - " .long 1b, 3b\n" - ".previous" + " movq (%0), %%mm0\n" + " movq 8(%0), %%mm1\n" + " movq 16(%0), %%mm2\n" + " movq 24(%0), %%mm3\n" + " movq %%mm0, (%1)\n" + " movq %%mm1, 8(%1)\n" + " movq %%mm2, 16(%1)\n" + " movq %%mm3, 24(%1)\n" + " movq 32(%0), %%mm0\n" + " movq 40(%0), %%mm1\n" + " movq 48(%0), %%mm2\n" + " movq 56(%0), %%mm3\n" + " movq %%mm0, 32(%1)\n" + " movq %%mm1, 40(%1)\n" + " movq %%mm2, 48(%1)\n" + " movq %%mm3, 56(%1)\n" : : "r" (from), "r" (to) : "memory"); from+=64; to+=64; } - stts(); + /* + * Now do the tail of the block + */ + + kernel_release_fpu_mmx(recursive, fpu_save, flags); + __memcpy(to, from, len&63); + 
return p; +} + +void *mmx_memset(void *s, char c, size_t count) +{ + char fpu_save[108]; + int i; + u32 page = (u32)s; + unsigned long flags; + int recursive; + + if(count < 128) + return __memset_generic(s,c,count); + + kernel_take_fpu_mmx(recursive, fpu_save, NULL, flags); + + if(c==0) + { + __asm__ __volatile__ ( + " pxor %%mm0, %%mm0\n" + :: ); + } + else + { + __memset_generic(s, c, 0x08); + /* + * One day I should try rearranging this so we do the + * final FPU take in the stall right here.. + */ + __asm__ __volatile__ ( + " movq (%0), %%mm0\n" + :: "r" (page) ); + } + + for(i=0;i=128) + { + kernel_take_fpu_mmx(recursive, save_buf, tmp_buf, flags); + + /* + * Copy a cache line per loop + */ + + __asm__ __volatile__ ( + " subl $64, %2\n" + "0: movq (%0), %%mm0\n" + " movq 8(%0), %%mm1\n" + " movq 16(%0), %%mm2\n" + " movq 24(%0), %%mm3\n" + "1: movq %%mm0, (%1)\n" + "2: movq %%mm1, 8(%1)\n" + "3: movq %%mm2, 16(%1)\n" + "4: movq %%mm3, 24(%1)\n" + " movq 32(%0), %%mm0\n" + " movq 40(%0), %%mm1\n" + " movq 48(%0), %%mm2\n" + " movq 56(%0), %%mm3\n" + "5: movq %%mm0, 32(%1)\n" + "6: movq %%mm1, 40(%1)\n" + "7: movq %%mm2, 48(%1)\n" + "8: movq %%mm3, 56(%1)\n" + " addl $64, %0\n" + " addl $64, %1\n" + " subl $64, %2\n" + " jnc 0b\n" + " addl $64, %2\n" + "20:\n" + ".section .fixup,\"ax\"\n" + "11: addl $8, %2\n" + "12: addl $8, %2\n" + "13: addl $8, %2\n" + "14: addl $8, %2\n" + "15: addl $8, %2\n" + "16: addl $8, %2\n" + "17: addl $8, %2\n" + "18: addl $8, %2\n" + " jmp 20b\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + " .align 4\n" + " .long 1b,11b\n" + " .long 2b,12b\n" + " .long 3b,13b\n" + " .long 4b,14b\n" + " .long 5b,15b\n" + " .long 6b,16b\n" + " .long 7b,17b\n" + " .long 8b,18b\n" + ".previous" + : "=r"(from), "=r"(to), "=c"(n) + : "0" (from), "1" (to), "2" (n) : "memory"); + /* + * Now do the tail of the block + */ + + kernel_release_fpu_mmx(recursive, save_buf, flags); + } + if(n) + __copy_user(to, from, n); + return n; } - -void 
mmx_clear_page(void * page) + +unsigned long __mmx_copy_from_user_nocheck(void *to, const void *from, unsigned long n) { - if(in_interrupt()) - slow_zero_page(page); - else - fast_clear_page(page); + char save_buf[108]; + char tmp_buf[108]; + unsigned long flags; + int recursive; + + if(n>=128) + { + kernel_take_fpu_mmx(recursive, save_buf, tmp_buf, flags); + + /* + * Copy a cache line per loop + */ + + __asm__ __volatile__ ( + " subl $64, %2\n" + "1: movq (%0), %%mm0\n" + "2: movq 8(%0), %%mm1\n" + "3: movq 16(%0), %%mm2\n" + "4: movq 24(%0), %%mm3\n" + " movq %%mm0, (%1)\n" + " movq %%mm1, 8(%1)\n" + " movq %%mm2, 16(%1)\n" + " movq %%mm3, 24(%1)\n" + "5: movq 32(%0), %%mm0\n" + "6: movq 40(%0), %%mm1\n" + "7: movq 48(%0), %%mm2\n" + "8: movq 56(%0), %%mm3\n" + " movq %%mm0, 32(%1)\n" + " movq %%mm1, 40(%1)\n" + " movq %%mm2, 48(%1)\n" + " movq %%mm3, 56(%1)\n" + " addl $64, %0\n" + " addl $64, %1\n" + " subl $64, %2\n" + " jnc 1b\n" + " addl $64, %2\n" + "20:\n" + ".section .fixup,\"ax\"\n" + "11: addl $8, %2\n" + "12: addl $8, %2\n" + "13: addl $8, %2\n" + "14: addl $8, %2\n" + "15: addl $8, %2\n" + "16: addl $8, %2\n" + "17: addl $8, %2\n" + "18: addl $8, %2\n" + " jmp 20b\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + " .align 4\n" + " .long 1b,11b\n" + " .long 2b,12b\n" + " .long 3b,13b\n" + " .long 4b,14b\n" + " .long 5b,15b\n" + " .long 6b,16b\n" + " .long 7b,17b\n" + " .long 8b,18b\n" + ".previous" + : "=r"(from), "=r"(to), "=c"(n) + : "0" (from), "1" (to), "2" (n) : "memory"); + /* + * Now do the tail of the block + */ + + kernel_release_fpu_mmx(recursive, save_buf, flags); + } + if(n) + __copy_user_zeroing(to, from, n); + return n; } -static void slow_copy_page(void *to, void *from) +unsigned long mmx_copy_to_user(void *to, const void *from, unsigned long n) { - int d0, d1, d2; - __asm__ __volatile__( \ - "cld\n\t" \ - "rep ; movsl" \ - : "=&c" (d0), "=&D" (d1), "=&S" (d2) \ - : "0" (1024),"1" ((long) to),"2" ((long) from) \ - : "memory"); + if 
(access_ok(VERIFY_WRITE, to, n)) + { + return __mmx_copy_to_user_nocheck(to,from,n); + } + return n; } - -void mmx_copy_page(void *to, void *from) +unsigned long mmx_copy_from_user(void *to, const void *from, unsigned long n) { - if(in_interrupt()) - slow_copy_page(to, from); - else - fast_copy_page(to, from); + if (access_ok(VERIFY_READ, from, n)) + { + return __mmx_copy_from_user_nocheck(to,from,n); + } + return n; } +
--- linux/arch/i386/lib/simd.c.orig Mon Feb 28 13:28:03 2000
+++ linux/arch/i386/lib/simd.c Mon Feb 28 13:28:03 2000
@@ -0,0 +1,460 @@
+/*
+ * SIMD functions. These replace the functions in asm-i386/string.h
+ * whenever it makes sense. These also un-inline those functions.
+ *
+ * Copyright 1999, Doug Ledford
+ * Ingo Molnar
+ *
+ * These functions are simple and trivial, consider them to be
+ * public domain
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+#define COPY_TRESHOLD 128	/* below this many bytes the generic routines are used */
+int use_kni = 1;	/* run-time switch; the sysctl table entry "use_kni" points at it */
+
+#define XMM_SIZE 64	/* size in bytes of the on-stack buffers handed to kernel_take_fpu_kni */
+/*
+ * kni_memcpy - memcpy using the XMM registers.
+ *
+ * Falls back to __memcpy when use_kni is zero or n < COPY_TRESHOLD.
+ * Otherwise the copy is bracketed by kernel_take_fpu_kni() and
+ * kernel_release_fpu_kni() (presumably saving and restoring the XMM
+ * state that gets clobbered; TODO confirm against their definition).
+ *
+ * Steps: one unaligned 16-byte copy to bring the destination up to an
+ * ALIGN boundary, one unaligned 16-byte copy covering the last bytes if
+ * the remaining length is not a multiple of ALIGN, then a loop moving
+ * STEP (32) bytes per pass with movups loads and movntps stores, one
+ * more 16-byte chunk if anything is left, and finally SFENCE().
+ *
+ * Returns the original destination pointer.
+ */
+extern void * kni_memcpy(void * to, const void * from, size_t n)
+{
+	unsigned long flags;
+	void *ret = to;
+	size_t size;
+	int recursive = 0;
+	char xmm_space[XMM_SIZE];
+
+	/*
+	 * If the transfer is too small, then use the generic routine.
+	 */
+	if (!use_kni || (n < COPY_TRESHOLD))
+		return __memcpy(to, from, n);
+
+	kernel_take_fpu_kni(recursive, &xmm_space[0],NULL,flags);
+
+	/*
+	 * Align the destination on a 16-byte (ALIGN) boundary.
+	 * The source does not have to be aligned.
+	 */
+#define ALIGN 0x10
+	if ((unsigned long)to & (ALIGN-1)) {
+		size = ALIGN - ((unsigned long)to & (ALIGN-1));
+		__asm__ __volatile__("movups (%0),%%xmm0\n\t"
+				     "movups %%xmm0,(%1)\n\t"
+				     :
+				     : "r" (from),
+				       "r" (to));
+		n -= size;
+		from += size;
+		to += size;
+	}
+	/*
+	 * If the copy would have tailings, take care of them
+	 * now instead of later
+	 */
+	if (n & (ALIGN-1)) {
+		size = n - ALIGN;	/* offset of the last 16 bytes of the buffer */
+		__asm__ __volatile__("movups (%0),%%xmm0\n\t"
+				     "movups %%xmm0,(%1)\n\t"
+				     :
+				     : "r" (from + size),
+				       "r" (to + size));
+		n &= ~(ALIGN-1);
+	}
+	/*
+	 * Prefetch the first two cachelines now.
+	 */
+	__asm__ __volatile__("prefetchnta 0x00(%0)\n\t"
+			     "prefetchnta 0x20(%0)\n\t"
+			     :
+			     : "r" (from));
+	/*
+	 * Copy 32 bytes at a time. The single unroll is good
+	 * for a 30% performance boost in the copy. Additional
+	 * unrolls are not productive. We are guaranteed to
+	 * have at least 32 bytes of data to copy since the
+	 * macro in string.h doesn't call into this function
+	 * with less than 64 bytes of copy and we lost < 32
+	 * bytes to alignment earlier.
+	 * NOTE(review): this function itself already returns early
+	 * below COPY_TRESHOLD (128) bytes; confirm the 64-byte figure.
+	 */
+#define STEP 0x20
+	while (n >= STEP) {
+		__asm__ __volatile__(
+			"movups 0x00(%0),%%xmm0\n\t"
+			"movups 0x10(%0),%%xmm1\n\t"
+			"movntps %%xmm0,0x00(%1)\n\t"
+			"movntps %%xmm1,0x10(%1)\n\t"
+			:
+			: "r" (from), "r" (to)
+			: "memory");
+		from += STEP;
+		/*
+		 * Note: Intermixing the prefetch at *exactly* this point
+		 * in time has been shown to be the fastest possible.
+		 * Timing these prefetch instructions is a complete black
+		 * art with nothing but trial and error showing the way.
+		 * To that extent, this optimum version was found by using
+		 * a userland version of this routine that we clocked for
+		 * lots of runs. We then fiddled with ordering until we
+		 * settled on our highest speed routines. So, the long
+		 * and short of this is, don't mess with instruction ordering
+		 * here or suffer performance penalties you will.
+		 */
+		__asm__ __volatile__(
+			"prefetchnta 0x20(%0)\n\t"
+			:
+			: "r" (from));
+		to += STEP;
+		n -= STEP;
+	}
+	if (n) {	/* n is a multiple of ALIGN here, so at most one 16-byte chunk is left */
+		__asm__ __volatile__("movups 0x00(%0),%%xmm0\n\t"
+				     "movntps %%xmm0,0x00(%1)\n\t"
+				     :
+				     : "r" (from), "r" (to)
+				     : "memory");
+	}
+	SFENCE();
+	kernel_release_fpu_kni(recursive,&xmm_space[0],flags);
+	return(ret);
+}
+/*
+ * kni_memset - memset using an XMM register.
+ *
+ * Falls back to __memset_generic when use_kni is zero or
+ * count < COPY_TRESHOLD.  Otherwise %xmm0 is loaded with 16 copies of
+ * c (the first 16 bytes of s are written in the process), the pointer
+ * is advanced to the next 16-byte boundary, a trailing partial chunk is
+ * written with one unaligned store, and the rest is written 16 bytes
+ * per pass with movntps followed by SFENCE().
+ *
+ * Returns the original s.
+ */
+extern void * kni_memset(void * s, char c, size_t count)
+{
+	unsigned long flags;
+	size_t size;
+	void *ret = s;
+	int recursive = 0;
+	char xmm_space[XMM_SIZE];
+
+	/*
+	 * If the transfer is too small, then use the generic routine.
+	 */
+	if (!use_kni || (count < COPY_TRESHOLD))
+		return __memset_generic(s, c, count);
+
+	kernel_take_fpu_kni(recursive, &xmm_space[0], NULL, flags);
+	/*
+	 * Load up our XMM register with the stuff to set mem with
+	 */
+	if(c == '\0') {
+		__asm__ __volatile__("xorps %%xmm0,%%xmm0\n\t"
+				     "movups %%xmm0,(%0)\n\t"
+				     :
+				     : "r" (s));
+	} else {
+		__memset_generic(s, c, 0x10);
+		__asm__ __volatile__("movups (%0),%%xmm0"
+				     :
+				     : "r" (s));
+	}
+	/*
+	 * align the destination on a 16 byte boundary, we can simply
+	 * do the math to align things since we already populated the
+	 * first 16 bytes.
+	 */
+	size = (0x10 - ((unsigned long)s & 0xf));	/* 16 when s is already aligned */
+	count -= size;
+	s += size;
+	/*
+	 * On the off chance we have tailings due to alignment issues,
+	 * do them now to make later more efficient
+	 */
+	if(count & 0xf) {
+		__asm__ __volatile__("movups %%xmm0,(%0)"
+				     :
+				     : "r" (s + (count - 0x10))
+				     : "memory");
+		count &= ~0xf;
+	}
+	/*
+	 * Do the copy by plopping out the register to memory.
+	 * Note: Unrolling this was *totally* unproductive. My benchmark
+	 * showed that one or two plops per iteration produced the same
+	 * speed to within .06 MByte/s of speed. Considering that the
+	 * routine benchmarked at over 3000 MByte/s, .06 is not statistically
+	 * significant and only doing one drop per loop simplifies
+	 * overhead of book keeping.
+	 */
+	while(count) {
+		__asm__ __volatile__("movntps %%xmm0,0x00(%0)\n\t"
+				     :
+				     : "r" (s));
+		s += 0x10;
+		count -= 0x10;
+	}
+	SFENCE();
+	kernel_release_fpu_kni(recursive,&xmm_space[0],flags);
+	return(ret);
+}
+/*
+ * __kni_copy_to_user(to,from,size) - XMM copy with the stores covered
+ * by exception table entries.
+ *
+ * Operands: %0 = size (in/out), %1 = to, %2 = from, %3/%4 = scratch.
+ * A first unaligned 16-byte copy is followed by advancing both
+ * pointers by 0x10 - (to & 0xf), so the movntps stores that follow hit
+ * a 16-byte aligned destination.  The loop at 100: moves 32 bytes per
+ * pass, one further 16-byte chunk is moved if more than 0xf bytes
+ * remain, and a final partial chunk is handled by stepping both
+ * pointers back so that one unaligned 16-byte copy ends exactly at the
+ * end of the buffer.  size is zeroed on the normal exit path; the
+ * fixup code at 6:/7:/8: resumes at 400: without zeroing it.
+ */
+#define __kni_copy_to_user(to,from,size) \
+do { \
+	int __d0, __d1, tmp, tmp2; \
+	__asm__ __volatile__( \
+		" movl %1,%4\n" \
+		" andl $0xf,%4\n" \
+		" movups (%2),%%xmm0\n" \
+		"1: movups %%xmm0,(%1)\n" \
+		" movl $0x10,%3\n" \
+		" subl %4,%3\n" \
+		" addl %3,%2\n" \
+		" addl %3,%1\n" \
+		" subl %3,%0\n" \
+		" prefetchnta 0x00(%2)\n" \
+		" prefetchnta 0x20(%2)\n" \
+		" jmp 200f\n" \
+		"100: movups 0x00(%2),%%xmm0\n" \
+		" movups 0x10(%2),%%xmm1\n" \
+		"2: movntps %%xmm0,0x00(%1)\n" \
+		"3: movntps %%xmm1,0x10(%1)\n" \
+		" addl $0x20,%2\n" \
+		" prefetchnta 0x20(%2)\n" \
+		" addl $0x20,%1\n" \
+		" subl $0x20,%0\n" \
+		"200: cmpl $0x1f,%0\n" \
+		" ja 100b\n" \
+		" cmpl $0xf,%0\n" \
+		" jbe 300f\n" \
+		" movups 0x00(%2),%%xmm0\n" \
+		"4: movntps %%xmm0,0x00(%1)\n" \
+		" addl $0x10,%2\n" \
+		" addl $0x10,%1\n" \
+		" subl $0x10,%0\n" \
+		"300: testl %0,%0\n" \
+		" je 400f\n" \
+		" movl $0x10,%3\n" \
+		" subl %0,%3\n" \
+		" subl %3,%1\n" \
+		" subl %3,%2\n" \
+		" movups 0x00(%2),%%xmm0\n" \
+		"5: movups %%xmm0,0x00(%1)\n" \
+		" addl $0x10,%2\n" \
+		" addl $0x10,%1\n" \
+		" xorl %0,%0\n" \
+		"400:\n" \
+		".section .fixup,\"ax\"\n" \
+		"6: jmp 400b\n" \
+		"7: addl $0x10,%1\n" \
+		" addl $0x10,%2\n" \
+		" subl $0x10,%0\n" \
+		" jmp 400b\n" \
+		"8: addl %3,%1\n" \
+		" addl %3,%2\n" \
+		" jmp 400b\n" \
+		".previous\n" \
+		".section __ex_table,\"a\"\n" \
+		" .align 4\n" \
+		" .long 1b,6b\n" \
+		" .long 2b,6b\n" \
+		" .long 3b,7b\n" \
+		" .long 4b,6b\n" \
+		" .long 5b,8b\n" \
+		".previous" \
+		: "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(tmp), \
+		  "=r"(tmp2) \
+		: "0"(size), "1"(to), "2"(from) \
+		: "memory"); \
+} while (0)
+/*
+ * __kni_copy_from_user(to,from,size) - XMM copy with the loads covered
+ * by exception table entries.
+ *
+ * Same operand numbering and main-path structure as the to_user macro
+ * above, but the labelled (fault-covered) instructions are the movups
+ * loads from %2.  The fixup code clears %xmm0 and writes zeroes over
+ * the destination with movntps and byte stores, counting down in %3,
+ * before resuming at 300:.
+ *
+ * NOTE(review): the byte store in the fixup path is written as
+ * "movb %4,(%1)" with no byte-register operand modifier; confirm the
+ * assembler in use accepts that form.
+ */
+#define __kni_copy_from_user(to,from,size) \
+do { \
+	int __d0, __d1, tmp, tmp2; \
+	__asm__ __volatile__( \
+		" movl %1,%4\n" \
+		" andl $0xf,%4\n" \
+		"1: movups (%2),%%xmm0\n" \
+		" movups %%xmm0,(%1)\n" \
+		" movl $0x10,%3\n" \
+		" subl %4,%3\n" \
+		" addl %3,%2\n" \
+		" addl %3,%1\n" \
+		" subl %3,%0\n" \
+		" prefetchnta 0x00(%2)\n" \
+		" prefetchnta 0x20(%2)\n" \
+		" jmp 100f\n" \
+		"2: movups 0x00(%2),%%xmm0\n" \
+		"3: movups 0x10(%2),%%xmm1\n" \
+		" movntps %%xmm0,0x00(%1)\n" \
+		" movntps %%xmm1,0x10(%1)\n" \
+		" addl $0x20,%2\n" \
+		" prefetchnta 0x20(%2)\n" \
+		" addl $0x20,%1\n" \
+		" subl $0x20,%0\n" \
+		"100: cmpl $0x1f,%0\n" \
+		" ja 2b\n" \
+		" cmpl $0xf,%0\n" \
+		" jbe 200f\n" \
+		"4: movups 0x00(%2),%%xmm0\n" \
+		" movntps %%xmm0,0x00(%1)\n" \
+		" addl $0x10,%2\n" \
+		" addl $0x10,%1\n" \
+		" subl $0x10,%0\n" \
+		"200: testl %0,%0\n" \
+		" je 300f\n" \
+		" movl $0x10,%3\n" \
+		" subl %0,%3\n" \
+		" subl %3,%1\n" \
+		" subl %3,%2\n" \
+		"5: movups 0x00(%2),%%xmm0\n" \
+		" movups %%xmm0,0x00(%1)\n" \
+		" addl $0x10,%2\n" \
+		" addl $0x10,%1\n" \
+		" xorl %0,%0\n" \
+		"300:\n" \
+		".section .fixup,\"ax\"\n" \
+		"6: xorps %%xmm0,%%xmm0\n" \
+		" movups %%xmm0,(%1)\n" \
+		" movl $0x10,%3\n" \
+		" subl %4,%3\n" \
+		" addl %3,%1\n" \
+		" movl %3,%4\n" \
+		" movl %0,%3\n" \
+		" subl %4,%3\n" \
+		" jmp 600f\n" \
+		"7: subl $0x10,%0\n" \
+		" addl $0x10,%1\n" \
+		"400: movl %0,%3\n" \
+		" xorps %%xmm0,%%xmm0\n" \
+		" jmp 600f\n" \
+		"500: movntps %%xmm0,0x00(%1)\n" \
+		" movntps %%xmm0,0x10(%1)\n" \
+		" addl $0x20,%1\n" \
+		" subl $0x20,%3\n" \
+		"600: cmpl $0x1f,%3\n" \
+		" ja 500b\n" \
+		" cmpl $0xf,%3\n" \
+		" jbe 700f\n" \
+		" movntps %%xmm0,0x00(%1)\n" \
+		" addl $0x10,%1\n" \
+		" subl $0x10,%3\n" \
+		"700: testl %3,%3\n" \
+		" je 300b\n" \
+		" xorl %4,%4\n" \
+		" movb %4,(%1)\n" \
+		" inc %1\n" \
+		" dec %3\n" \
+		" jmp 700b\n" \
+		"8: addl %3,%1\n" \
+		" movl %0,%3\n" \
+		" jmp 700b\n" \
+		".previous\n" \
+		".section __ex_table,\"a\"\n" \
+		" .align 4\n" \
+		" .long 1b,6b\n" \
+		" .long 2b,400b\n" \
+		" .long 3b,7b\n" \
+		" .long 4b,400b\n" \
+		" .long 5b,8b\n" \
+		".previous" \
+		: 
"=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(tmp), \ + "=q"(tmp2) \ + : "0"(size), "1"(to), "2"(from) \ + : "memory"); \ +} while (0) + + +unsigned long +__kni_copy_to_user_nocheck(void *to, const void *from, unsigned long n) +{ + unsigned long flags; + int recursive = 0; + char xmm_space[XMM_SIZE]; + char xmm_reg_space[XMM_SIZE]; /* in case we switch context */ + + if (use_kni && (n >= COPY_TRESHOLD)) { + unsigned int size; + + if ((unsigned long)to & (ALIGN-1)) { + size = ALIGN - ((unsigned long)to & (ALIGN-1)); + __copy_user(to,from,size); + n -= size; + from += size; + to += size; + } + kernel_take_fpu_kni(recursive,&xmm_space[0],&xmm_reg_space[0],flags); + __kni_copy_to_user(to,from,n); + SFENCE(); + kernel_release_fpu_kni(recursive,&xmm_space[0],flags); + } else { + __copy_user(to,from,n); + } + return n; +} + +unsigned long +__kni_copy_from_user_nocheck(void *to, const void *from, unsigned long n) +{ + unsigned long flags; + int recursive = 0; + char xmm_space[XMM_SIZE]; + char xmm_reg_space[XMM_SIZE]; /* in case we switch context */ + + if (use_kni && (n >= COPY_TRESHOLD)) { + kernel_take_fpu_kni(recursive,&xmm_space[0],&xmm_reg_space[0],flags); + __kni_copy_from_user(to,from,n); + SFENCE(); + kernel_release_fpu_kni(recursive,&xmm_space[0],flags); + } else { + __copy_user_zeroing(to,from,n); + } + return n; +} + + + +unsigned long +kni_copy_to_user(void *to, const void *from, unsigned long n) +{ + unsigned long flags; + int recursive = 0; + char xmm_space[XMM_SIZE]; + char xmm_reg_space[XMM_SIZE]; /* in case we switch context */ + + if (access_ok(VERIFY_WRITE, to, n)) { + if (use_kni && (n >= COPY_TRESHOLD)) { + unsigned int size; + if ((unsigned long)to & (ALIGN-1)) { + size = ALIGN - ((unsigned long)to & (ALIGN-1)); + __copy_user(to,from,size); + n -= size; + from += size; + to += size; + } + kernel_take_fpu_kni(recursive,&xmm_space[0],&xmm_reg_space[0],flags); + __kni_copy_to_user(to,from,n); + SFENCE(); + 
kernel_release_fpu_kni(recursive,&xmm_space[0],flags); + } else { + __copy_user(to,from,n); + } + } + return n; +} + +unsigned long +kni_copy_from_user(void *to, const void *from, unsigned long n) +{ + unsigned long flags; + int recursive = 0; + char xmm_space[XMM_SIZE]; + char xmm_reg_space[XMM_SIZE]; /* in case we switch context */ + + if (access_ok(VERIFY_READ, from, n)) { + if (use_kni && (n >= COPY_TRESHOLD)) { + kernel_take_fpu_kni(recursive,&xmm_space[0],&xmm_reg_space[0],flags); + __kni_copy_from_user(to,from,n); + SFENCE(); + kernel_release_fpu_kni(recursive,&xmm_space[0],flags); + } else { + __copy_user_zeroing(to,from,n); + } + } + return n; +} + + --- linux/arch/i386/kernel/head.S.orig Tue Feb 8 04:59:39 2000 +++ linux/arch/i386/kernel/head.S Mon Feb 28 13:28:03 2000 @@ -34,7 +34,8 @@ #define X86_HARD_MATH CPU_PARAMS+6 #define X86_CPUID CPU_PARAMS+8 #define X86_CAPABILITY CPU_PARAMS+12 -#define X86_VENDOR_ID CPU_PARAMS+16 +#define X86_CR4 CPU_PARAMS+16 +#define X86_VENDOR_ID CPU_PARAMS+20 /* * swapper_pg_dir is the main page directory, address 0x00101000 @@ -61,9 +62,8 @@ * NOTE! We have to correct for the fact that we're * not yet offset PAGE_OFFSET.. 
*/ -#define cr4_bits mmu_cr4_features-__PAGE_OFFSET - movl %cr4,%eax # Turn on 4Mb pages - orl cr4_bits,%eax + movl %cr4,%eax # Turn on 4Mb pages and other CPU features + orl X86_CR4-__PAGE_OFFSET,%eax movl %eax,%cr4 #endif /* --- linux/arch/i386/kernel/i386_ksyms.c.orig Mon Feb 28 13:27:52 2000 +++ linux/arch/i386/kernel/i386_ksyms.c Mon Feb 28 13:31:43 2000 @@ -101,6 +101,14 @@ EXPORT_SYMBOL(mmx_clear_page); EXPORT_SYMBOL(mmx_copy_page); #endif +#ifdef CONFIG_X86_CPU_OPTIMIZATIONS +EXPORT_SYMBOL(best_memcpy); +EXPORT_SYMBOL(best_memset); +EXPORT_SYMBOL(best_copy_to_user); +EXPORT_SYMBOL(best_copy_from_user); +EXPORT_SYMBOL(__best_copy_to_user); +EXPORT_SYMBOL(__best_copy_from_user); +#endif #ifdef CONFIG_SMP EXPORT_SYMBOL(cpu_data); --- linux/arch/i386/kernel/process.c.orig Mon Feb 28 13:27:52 2000 +++ linux/arch/i386/kernel/process.c Mon Feb 28 13:28:03 2000 @@ -38,6 +38,7 @@ #include #include #include +#include #ifdef CONFIG_MATH_EMULATION #include #endif @@ -470,6 +471,106 @@ } /* + * FPU state handling functions + */ + +int i387_hard_to_user ( struct user_i387_struct * user, + union i387_hard_union * hard) +{ +#ifdef CONFIG_X86_FX + int i, err = 0; + short *tmp, *tmp2; + union i387_hard_union hard2; +#else + int err = 0; +#endif + + if (!access_ok(VERIFY_WRITE, user, sizeof(*user))) + return -EFAULT; +#ifdef CONFIG_X86_FX + if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) { + hard2.fsave.cwd = 0xffff0000 | hard->fxsave.fxcwd; + hard2.fsave.swd = 0xffff0000 | hard->fxsave.fxswd; + hard2.fsave.twd = fputag_KNI_to_387(hard->fxsave.fxtwd); + hard2.fsave.fip = hard->fxsave.fxfip; + hard2.fsave.fcs = hard->fxsave.fxfcs; + hard2.fsave.foo = hard->fxsave.fxfoo; + hard2.fsave.fos = hard->fxsave.fxfos; + + tmp = (short *)&hard2.fsave.st_space[0]; + tmp2 = (short *)&hard->fxsave.st_space[0]; + + /* + * Transform the two layouts: + * (we do not mix 32-bit access with 16-bit access because + * thats suboptimal on PPros) + */ + + for (i = 0; i < 8; i++) { + *tmp = 
*tmp2; tmp++; tmp2++; + *tmp = *tmp2; tmp++; tmp2++; + *tmp = *tmp2; tmp++; tmp2++; + *tmp = *tmp2; tmp++; tmp2++; + *tmp = *tmp2; tmp++; tmp2 += 4; + } + err = copy_to_user((void *)(user),(&(hard2)), + sizeof(struct i387_hard_fsave)); + } else +#endif + err = copy_to_user((void *)(user), + (&(hard->fsave.cwd)), + sizeof(struct i387_hard_fsave)); + return err; +} + +int i387_user_to_hard (union i387_hard_union * hard, + struct user_i387_struct * user) +{ +#ifdef CONFIG_X86_FX + int i, err = 0; + short *tmp, *tmp2; + union i387_hard_union hard2; +#else + int err = 0; +#endif + + if (!access_ok(VERIFY_READ, user, sizeof(*user))) + return -EFAULT; +#ifdef CONFIG_X86_FX + if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) { + err = copy_from_user((&(hard2)),(void *)(user), + sizeof(struct i387_hard_fsave)); + hard->fxsave.fxcwd = hard2.fsave.cwd & 0xffff; + hard->fxsave.fxswd = hard2.fsave.swd & 0xffff; + hard->fxsave.fxtwd = fputag_387_to_KNI(hard2.fsave.twd); + hard->fxsave.fxfip = hard2.fsave.fip; + hard->fxsave.fxfcs = hard2.fsave.fcs & 0xffff; + hard->fxsave.fxfoo = hard2.fsave.foo; + hard->fxsave.fxfos = hard2.fsave.fos & 0xffff; + + tmp2 = (short *)&hard->fxsave.st_space[0]; + tmp = (short *)&hard2.fsave.st_space[0]; + + for (i = 0; i < 8; i++) { + *tmp2 = *tmp; tmp++; tmp2++; + *tmp2 = *tmp; tmp++; tmp2++; + *tmp2 = *tmp; tmp++; tmp2++; + *tmp2 = *tmp; tmp++; tmp2++; + *tmp2 = *tmp; tmp++; tmp2++; + *tmp2 = 0; tmp2++; + *tmp2 = 0; tmp2++; + *tmp2 = 0; tmp2++; + } + } else +#endif + err = copy_from_user((&(hard->fsave.cwd)), + (void *)(user), + sizeof(struct i387_hard_fsave)); + return err; +} + + +/* * Save a segment. */ #define savesegment(seg,value) \ @@ -499,23 +600,51 @@ return 0; } -/* - * fill in the FPU structure for a core dump. 
- */ int dump_fpu (struct pt_regs * regs, struct user_i387_struct* fpu) { +#ifdef CONFIG_X86_FX + int fpvalid, i; + short *tmp, *tmp2; + struct task_struct *tsk = current; + union i387_hard_union *hard; +#else int fpvalid; struct task_struct *tsk = current; - +#endif fpvalid = tsk->used_math; if (fpvalid) { unlazy_fpu(tsk); - memcpy(fpu,&tsk->thread.i387.hard,sizeof(*fpu)); +#ifdef CONFIG_X86_FX + if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) { + hard = &tsk->thread.i387.hard; + + fpu->cwd = 0xffff0000 | hard->fxsave.fxcwd; + fpu->swd = 0xffff0000 | hard->fxsave.fxswd; + fpu->twd = fputag_KNI_to_387(hard->fxsave.fxtwd); + fpu->fip = hard->fxsave.fxfip; + fpu->fcs = hard->fxsave.fxfcs; + fpu->foo = hard->fxsave.fxfoo; + fpu->fos = hard->fxsave.fxfos; + + tmp = (short *)&fpu->st_space[0]; + tmp2 = (short *)&hard->fxsave.st_space[0]; + + for (i = 0; i < 8; i++) { + *tmp = *tmp2; tmp++; tmp2++; + *tmp = *tmp2; tmp++; tmp2++; + *tmp = *tmp2; tmp++; tmp2++; + *tmp = *tmp2; tmp++; tmp2++; + *tmp = *tmp2; tmp++; tmp2+=4; + } + } else +#endif + memcpy(fpu,&tsk->thread.i387.hard.fsave,sizeof(*fpu)); } return fpvalid; } + /* * fill in the user structure for a core dump.. */ @@ -569,8 +698,8 @@ /* * switch_to(x,yn) should switch tasks from x to y. * - * We fsave/fwait so that an exception goes off at the right time - * (as a call from the fsave or fwait in effect) rather than to + * We fpu_save so that an exception goes off at the right time + * (as a call from the f*save or fwait in effect) rather than to * the wrong process. Lazy FP saving no longer makes any sense * with modern CPU's, and this simplifies a lot of things (SMP * and UP become the same). --- linux/arch/i386/kernel/ptrace.c.orig Sat Jan 15 20:04:17 2000 +++ linux/arch/i386/kernel/ptrace.c Mon Feb 28 13:34:59 2000 @@ -17,6 +17,7 @@ #include #include #include +#include /* * does not yet catch signals sent when the child dies. 
@@ -396,14 +397,15 @@ ret = 0; if ( !child->used_math ) { /* Simulate an empty FPU. */ - child->thread.i387.hard.cwd = 0xffff037f; - child->thread.i387.hard.swd = 0xffff0000; - child->thread.i387.hard.twd = 0xffffffff; + i387_set_cwd(child->thread.i387.hard, 0x037f); + i387_set_swd(child->thread.i387.hard, 0x0000); + i387_set_twd(child->thread.i387.hard, 0xffff); } #ifdef CONFIG_MATH_EMULATION if ( boot_cpu_data.hard_math ) { #endif - __copy_to_user((void *)data, &child->thread.i387.hard, sizeof(struct user_i387_struct)); + i387_hard_to_user((struct user_i387_struct *)data, + &child->thread.i387.hard); #ifdef CONFIG_MATH_EMULATION } else { save_i387_soft(&child->thread.i387.soft, (struct _fpstate *)data); @@ -421,7 +423,8 @@ #ifdef CONFIG_MATH_EMULATION if ( boot_cpu_data.hard_math ) { #endif - __copy_from_user(&child->thread.i387.hard, (void *)data, sizeof(struct user_i387_struct)); + i387_user_to_hard(&child->thread.i387.hard, + (struct user_i387_struct *)data); #ifdef CONFIG_MATH_EMULATION } else { restore_i387_soft(&child->thread.i387.soft, (struct _fpstate *)data); --- linux/arch/i386/kernel/signal.c.orig Fri Jan 21 18:48:31 2000 +++ linux/arch/i386/kernel/signal.c Mon Feb 28 14:19:55 2000 @@ -21,6 +21,7 @@ #include #include #include +#include #define DEBUG_SIG 0 @@ -150,12 +151,20 @@ char retcode[8]; }; - static inline int restore_i387_hard(struct _fpstate *buf) { + int err = 0; + unsigned int tmp; struct task_struct *tsk = current; + clear_fpu(tsk); - return __copy_from_user(&tsk->thread.i387.hard, buf, sizeof(*buf)); + + err = i387_user_to_hard(&tsk->thread.i387.hard, + (struct user_i387_struct *)buf); + err |= get_user(tmp, &buf->status); + if (!err) + i387_set_swd(tsk->thread.i387.hard, tmp); + return err; } static inline int restore_i387(struct _fpstate *buf) @@ -305,11 +314,14 @@ static inline int save_i387_hard(struct _fpstate * buf) { + int err = 0; struct task_struct *tsk = current; unlazy_fpu(tsk); - tsk->thread.i387.hard.status = 
tsk->thread.i387.hard.swd; - if (__copy_to_user(buf, &tsk->thread.i387.hard, sizeof(*buf))) + err = i387_hard_to_user((struct user_i387_struct *)buf, + &tsk->thread.i387.hard); + err |= put_user(i387_get_swd(tsk->thread.i387.hard), &buf->status); + if (err) return -1; return 1; } --- linux/arch/i386/kernel/smp.c.orig Fri Feb 11 17:19:45 2000 +++ linux/arch/i386/kernel/smp.c Mon Feb 28 13:37:25 2000 @@ -534,6 +534,7 @@ static void stop_this_cpu (void * dummy) { + load_default_mxcsr(); /* * Remove this CPU: */ --- linux/arch/i386/kernel/traps.c.orig Mon Feb 28 13:27:52 2000 +++ linux/arch/i386/kernel/traps.c Mon Feb 28 13:28:04 2000 @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -552,7 +553,9 @@ * (this will also clear the error) */ task = current; - save_fpu(task); + i387_save_hard(task->thread.i387); + task->flags &= ~PF_USEDFPU; + stts(); task->thread.trap_no = 16; task->thread.error_code = 0; force_sig(SIGFPE, task); @@ -583,18 +586,44 @@ asmlinkage void math_state_restore(struct pt_regs regs) { __asm__ __volatile__("clts"); /* Allow maths ops (or we recurse) */ - - if(current->used_math) - __asm__("frstor %0": :"m" (current->thread.i387)); - else - { + /* + * If we have either of the kernel FPU use states set in the + * fpustate variable, then this will be a kernel math trap. + * Otherwise, this is userspace trying to use the FPU. 
+ */ + if (current->thread.x86_fpustate & X86_FPUSTATE_KERN_ANY) { + load_default_mxcsr(); /* we don't ever mess with this in + kernel space, so just make sure + we have a reasonable one so we + don't start taking unmasked + exceptions by accident */ + if(current->thread.mmx_reg_space != NULL) + __asm__("movq 0x00(%0), %%mm0\n\t" + "movq 0x08(%0), %%mm1\n\t" + "movq 0x10(%0), %%mm2\n\t" + "movq 0x18(%0), %%mm3\n\t" + :: "r" (current->thread.mmx_reg_space)); + if(current->thread.kni_reg_space != NULL) + __asm__("movups 0x00(%0), %%xmm0\n\t" + "movups 0x10(%0), %%xmm1\n\t" + "movups 0x20(%0), %%xmm2\n\t" + "movups 0x30(%0), %%xmm3\n\t" + :: "r" (current->thread.kni_reg_space)); + } else if (current->thread.x86_fpustate & X86_FPUSTATE_USER_SAVED) { + i387_restore_hard(current->thread.i387); + current->thread.x86_fpustate = 0; + } else if (current->used_math) { + i387_restore_hard(current->thread.i387); + current->flags|=PF_USEDFPU; /* make switch_to() work */ + } else { /* * Our first FPU usage, clean the chip. */ __asm__("fninit"); + load_default_mxcsr(); current->used_math = 1; + current->flags|=PF_USEDFPU; /* make switch_to() work */ } - current->flags|=PF_USEDFPU; /* So we fnsave on switch_to() */ } #ifndef CONFIG_MATH_EMULATION --- linux/arch/i386/kernel/setup.c.orig Mon Feb 28 16:28:57 2000 +++ linux/arch/i386/kernel/setup.c Mon Feb 28 16:29:04 2000 @@ -86,11 +86,6 @@ unsigned long mmu_cr4_features = 0; /* - * For the various FPU using kernel accelerator routines - */ -spinlock_t kern_fpu_lock = SPIN_LOCK_UNLOCKED; - -/* * Bus types .. 
*/ int EISA_bus = 0; --- linux/arch/i386/Makefile.orig Mon Jan 24 20:04:37 2000 +++ linux/arch/i386/Makefile Mon Feb 28 13:28:04 2000 @@ -72,6 +72,10 @@ AFLAGS := $(AFLAGS) -DCPU=686 endif +ifdef CONFIG_M686FX +CFLAGS := $(CFLAGS) -m486 -malign-loops=0 -malign-jumps=0 -malign-functions=0 -DCPU=686 +endif + HEAD := arch/i386/kernel/head.o arch/i386/kernel/init_task.o SUBDIRS := $(SUBDIRS) arch/i386/kernel arch/i386/mm arch/i386/lib --- linux/arch/i386/config.in.orig Mon Feb 28 13:27:50 2000 +++ linux/arch/i386/config.in Mon Feb 28 13:37:51 2000 @@ -21,9 +21,11 @@ 486/Cx486 CONFIG_M486 \ 586/K5/5x86/6x86 CONFIG_M586 \ Pentium/TSC CONFIG_M586TSC \ - PPro/6x86MX CONFIG_M686 \ - K6/II/III CONFIG_MK6 \ - Athlon CONFIG_MK7" PPro + K6/II/III CONFIG_MK6 \ + PPro/6x86MX/PII CONFIG_M686 \ + Athlon CONFIG_MK7 \ + PIII/Xeon/Deschutes CONFIG_M686FX" PIII + # # Define implied options from the CPU selection here # @@ -33,10 +35,10 @@ define_bool CONFIG_X86_BSWAP y define_bool CONFIG_X86_POPAD_OK y fi -if [ "$CONFIG_M686" = "y" -o "$CONFIG_M586TSC" = "y" ]; then +if [ "$CONFIG_M686FX" = "y" -o "$CONFIG_M686" = "y" -o "$CONFIG_M586TSC" = "y" ]; then define_bool CONFIG_X86_TSC y fi -if [ "$CONFIG_M686" = "y" ]; then +if [ "$CONFIG_M686FX" = "y" -o "$CONFIG_M686" = "y" ]; then define_bool CONFIG_X86_GOOD_APIC y define_bool CONFIG_X86_PGE y fi @@ -65,6 +67,8 @@ define_bool CONFIG_X86_PAE y fi +bool 'Enable PII/PIII Extended/Fast FPU save and restore support' CONFIG_X86_FX +bool 'Enable CPU Specific (MMX/MMX2) Optimization Functions' CONFIG_X86_CPU_OPTIMIZATIONS bool 'Math emulation' CONFIG_MATH_EMULATION bool 'MTRR (Memory Type Range Register) support' CONFIG_MTRR bool 'Symmetric multi-processing support' CONFIG_SMP --- linux/arch/i386/defconfig.orig Mon Feb 28 13:27:52 2000 +++ linux/arch/i386/defconfig Mon Feb 28 13:28:04 2000 @@ -29,6 +29,9 @@ CONFIG_X86_PGE=y # CONFIG_MICROCODE is not set CONFIG_NOHIGHMEM=y +CONFIG_X86_PN_OFF=y +CONFIG_X86_FX=y +CONFIG_X86_CPU_OPTIMIZATIONS=y # 
CONFIG_HIGHMEM4G is not set # CONFIG_HIGHMEM64G is not set # CONFIG_MATH_EMULATION is not set --- linux/Documentation/Configure.help.orig Mon Feb 28 13:27:52 2000 +++ linux/Documentation/Configure.help Mon Feb 28 13:28:04 2000 @@ -2258,10 +2258,10 @@ all x86 CPU types (albeit not optimally fast), you can specify "386" here. - If you specify one of "486" or "586" or "Pentium" or "PPro", then - the kernel will not necessarily run on earlier architectures (e.g. a - Pentium optimized kernel will run on a PPro, but not necessarily on - a i486). + If you specify one of "486" or "586" or "Pentium" or "PPro" or "PIII", + then the kernel will not necessarily run on earlier architectures + (e.g. a Pentium optimized kernel will run on a PPro, but not necessarily + on a i486). Here are the settings recommended for greatest speed: - "386" for the AMD/Cyrix/Intel 386DX/DXL/SL/SLC/SX, Cyrix/TI @@ -2276,8 +2276,30 @@ K6-3D. - "PPro" for the Cyrix/IBM/National Semiconductor 6x86MX, MII and Intel Pentium II/Pentium Pro. + - "PIII/Xeon/Deschutes" for the PIII (Katmai), Xeon and later PIIs + with the Deschutes or Mendocino core. You have to choose this for + MMX2 support. If you don't know what to do, choose "386". + +Disable PII/PIII Serial Number at bootup +CONFIG_X86_PN_OFF + This makes the kernel disable the CPUID serial number that is embedded on + the new PIII CPUs at bootup. + +Enable PII/PIII Extended Fast FPU save and restore support +CONFIG_X86_FX + This enables use of the new PII/PIII FXSAVE/FXRSTOR support. This item + is required to make use of the new PIII 128-bit XMM registers. It is safe + to leave this enabled all the time. + +Enable CPU Specific (MMX/MMX2) Optimizations +CONFIG_X86_CPU_OPTIMIZATIONS + This enables use of the MMX registers and 128-bit MMX2 registers on CPUs + that can support the new instructions (Pentium/AMD K6 or newer). In + order to support the Pentium III 128-bit XMM registers you must enable + both this and PII/PIII Extended Fast FPU save support.
It is safe to + leave this enabled all the time. VGA text console CONFIG_VGA_CONSOLE