Index: linux/Documentation/kernel-parameters.txt =================================================================== --- linux.orig/Documentation/kernel-parameters.txt +++ linux/Documentation/kernel-parameters.txt @@ -61,6 +61,7 @@ parameter is applicable: MTD MTD support is enabled. NET Appropriate network support is enabled. NUMA NUMA support is enabled. + GENERIC_TIME The generic timeofday code is enabled. NFS Appropriate NFS support is enabled. OSS OSS sound support is enabled. PARIDE The ParIDE subsystem is enabled. @@ -176,6 +177,11 @@ running once the system is up. override platform specific driver. See also Documentation/acpi-hotkey.txt. + acpi_pm_good [IA-32,X86-64] + Override the pmtimer bug detection: force the kernel + to assume that this machine's pmtimer latches its value + and always returns good values. + enable_timer_pin_1 [i386,x86-64] Enable PIN 1 of APIC timer Can be useful to work around chipset bugs @@ -338,10 +344,11 @@ running once the system is up. Value can be changed at runtime via /selinux/checkreqprot. - clock= [BUGS=IA-32,HW] gettimeofday timesource override. - Forces specified timesource (if avaliable) to be used - when calculating gettimeofday(). If specicified - timesource is not avalible, it defaults to PIT. + clock= [BUGS=IA-32, HW] gettimeofday clocksource override. + [Deprecated] + Forces specified clocksource (if available) to be used + when calculating gettimeofday(). If specified + clocksource is not available, it defaults to PIT. Format: { pit | tsc | cyclone | pmtmr } disable_8254_timer @@ -1605,6 +1612,16 @@ running once the system is up. time Show timing data prefixed to each printk message line + timeout_granularity= + [KNL] + Timeout granularity: process timer wheel timers every + timeout_granularity jiffies. Defaults to 1 (process + timers HZ times per second - most fine-grained). 
+ + clocksource= [GENERIC_TIME] Override the default clocksource + Override the default clocksource and use the clocksource + with the name specified. + tipar.timeout= [HW,PPT] Set communications timeout in tenths of a second (default 15). Index: linux/Makefile =================================================================== --- linux.orig/Makefile +++ linux/Makefile @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 17 -EXTRAVERSION = +EXTRAVERSION =-hrt-dyntick1 NAME=Crazed Snow-Weasel # *DOCUMENTATION* Index: linux/arch/i386/Kconfig =================================================================== --- linux.orig/arch/i386/Kconfig +++ linux/arch/i386/Kconfig @@ -14,6 +14,10 @@ config X86_32 486, 586, Pentiums, and various instruction-set-compatible chips by AMD, Cyrix, and others. +config GENERIC_TIME + bool + default y + config SEMAPHORE_SLEEPERS bool default y @@ -53,6 +57,8 @@ source "init/Kconfig" menu "Processor type and features" +source "kernel/time/Kconfig" + config SMP bool "Symmetric multi-processing support" ---help--- Index: linux/arch/i386/kernel/Makefile =================================================================== --- linux.orig/arch/i386/kernel/Makefile +++ linux/arch/i386/kernel/Makefile @@ -7,10 +7,9 @@ extra-y := head.o init_task.o vmlinux.ld obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o \ ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \ pci-dma.o i386_ksyms.o i387.o bootflag.o \ - quirks.o i8237.o topology.o alternative.o + quirks.o i8237.o topology.o alternative.o i8253.o tsc.o obj-y += cpu/ -obj-y += timers/ obj-y += acpi/ obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o obj-$(CONFIG_MCA) += mca.o @@ -26,6 +25,7 @@ obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups.o obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o +obj-$(CONFIG_HPET_TIMER) += hpet.o obj-$(CONFIG_X86_NUMAQ) += numaq.o obj-$(CONFIG_X86_SUMMIT_NUMA) += 
summit.o obj-$(CONFIG_KPROBES) += kprobes.o Index: linux/arch/i386/kernel/acpi/boot.c =================================================================== --- linux.orig/arch/i386/kernel/acpi/boot.c +++ linux/arch/i386/kernel/acpi/boot.c @@ -574,7 +574,8 @@ static int __init acpi_parse_sbf(unsigne } #ifdef CONFIG_HPET_TIMER - +#include +extern unsigned long hpet_address; static int __init acpi_parse_hpet(unsigned long phys, unsigned long size) { struct acpi_table_hpet *hpet_tbl; @@ -594,20 +595,13 @@ static int __init acpi_parse_hpet(unsign return -1; } #ifdef CONFIG_X86_64 - vxtime.hpet_address = hpet_tbl->addr.addrl | + hpet_address = hpet_tbl->addr.addrl | ((long)hpet_tbl->addr.addrh << 32); - - printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n", - hpet_tbl->id, vxtime.hpet_address); #else /* X86 */ - { - extern unsigned long hpet_address; - - hpet_address = hpet_tbl->addr.addrl; + hpet_address = hpet_tbl->addr.addrl; +#endif /* X86 */ printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n", - hpet_tbl->id, hpet_address); - } -#endif /* X86 */ + hpet_tbl->id, hpet_address); return 0; } Index: linux/arch/i386/kernel/apic.c =================================================================== --- linux.orig/arch/i386/kernel/apic.c +++ linux/arch/i386/kernel/apic.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -59,6 +60,23 @@ int enable_local_apic __initdata = 0; /* */ int apic_verbosity; +static unsigned int calibration_result; + +static void lapic_next_event(unsigned long delta, struct clock_event *evt); +static void lapic_timer_setup(int mode, struct clock_event *evt); + +static struct clock_event lapic_clockevent = { + .name = "lapic", + .capabilities = CLOCK_CAP_NEXTEVT | CLOCK_CAP_PROFILE +#ifdef CONFIG_SMP + | CLOCK_CAP_UPDATE +#endif + , + .shift = 32, + .set_mode = lapic_timer_setup, + .set_next_event = lapic_next_event, +}; +static DEFINE_PER_CPU(struct clock_event, lapic_events); static void apic_pm_activate(void); @@ -909,6 
+927,11 @@ fake_ioapic_page: */ /* + * FIXME: Move this to i8253.h. There is no need to keep the access to + * the PIT scattered all around the place -tglx + */ + +/* * The timer chip is already set up at HZ interrupts per second here, * but we do not accept timer interrupts yet. We only allow the BP * to calibrate. @@ -966,13 +989,15 @@ void (*wait_timer_tick)(void) __devinitd #define APIC_DIVISOR 16 -static void __setup_APIC_LVTT(unsigned int clocks) +static void __setup_APIC_LVTT(unsigned int clocks, int oneshot) { unsigned int lvtt_value, tmp_value, ver; int cpu = smp_processor_id(); ver = GET_APIC_VERSION(apic_read(APIC_LVR)); - lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR; + lvtt_value = LOCAL_TIMER_VECTOR; + if (!oneshot) + lvtt_value |= APIC_LVT_TIMER_PERIODIC; if (!APIC_INTEGRATED(ver)) lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); @@ -989,23 +1014,31 @@ static void __setup_APIC_LVTT(unsigned i & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | APIC_TDR_DIV_16); - apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); + if (!oneshot) + apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); +} + +static void lapic_next_event(unsigned long delta, struct clock_event *evt) +{ + apic_write_around(APIC_TMICT, delta); } -static void __devinit setup_APIC_timer(unsigned int clocks) +static void lapic_timer_setup(int mode, struct clock_event *evt) { unsigned long flags; local_irq_save(flags); + __setup_APIC_LVTT(calibration_result, mode != CLOCK_EVT_PERIODIC); + local_irq_restore(flags); +} - /* - * Wait for IRQ0's slice: - */ - wait_timer_tick(); +static void __devinit setup_APIC_timer(void) +{ + struct clock_event *levt = &__get_cpu_var(lapic_events); - __setup_APIC_LVTT(clocks); + memcpy(levt, &lapic_clockevent, sizeof(*levt)); - local_irq_restore(flags); + setup_local_clockevent(levt, CPU_MASK_NONE); } /* @@ -1014,6 +1047,8 @@ static void __devinit setup_APIC_timer(u * to calibrate, since some later bootup code depends on getting * the first irq? Ugh. 
* + * TODO: Fix this rather than saying "Ugh" -tglx + * * We want to do the calibration only once since we * want to have local timer irqs syncron. CPUs connected * by the same APIC bus have the very same bus frequency. @@ -1036,7 +1071,7 @@ static int __init calibrate_APIC_clock(v * value into the APIC clock, we just want to get the * counter running for calibration. */ - __setup_APIC_LVTT(1000000000); + __setup_APIC_LVTT(1000000000, 0); /* * The timer chip counts down to zero. Let's wait @@ -1073,6 +1108,14 @@ static int __init calibrate_APIC_clock(v result = (tt1-tt2)*APIC_DIVISOR/LOOPS; + /* Calculate the scaled math multiplication factor */ + lapic_clockevent.mult = div_sc(tt1-tt2, TICK_NSEC * LOOPS, 32); + lapic_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); + printk("lapic max_delta_ns: %ld\n", lapic_clockevent.max_delta_ns); + lapic_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &lapic_clockevent); + if (cpu_has_tsc) apic_printk(APIC_VERBOSE, "..... CPU clock speed is " "%ld.%04ld MHz.\n", @@ -1087,8 +1130,6 @@ static int __init calibrate_APIC_clock(v return result; } -static unsigned int calibration_result; - void __init setup_boot_APIC_clock(void) { unsigned long flags; @@ -1101,14 +1142,14 @@ void __init setup_boot_APIC_clock(void) /* * Now set up the timer for real. 
*/ - setup_APIC_timer(calibration_result); + setup_APIC_timer(); local_irq_restore(flags); } void __devinit setup_secondary_APIC_clock(void) { - setup_APIC_timer(calibration_result); + setup_APIC_timer(); } void disable_APIC_timer(void) @@ -1143,6 +1184,13 @@ void switch_APIC_timer_to_ipi(void *cpum !cpu_isset(cpu, timer_bcast_ipi)) { disable_APIC_timer(); cpu_set(cpu, timer_bcast_ipi); +#ifdef CONFIG_HIGH_RES_TIMERS + printk("Disabling NO_HZ and high resolution timers " + "due to timer broadcasting\n"); + for_each_possible_cpu(cpu) + per_cpu(lapic_events, cpu).capabilities &= + ~CLOCK_CAP_NEXTEVT; +#endif } } EXPORT_SYMBOL(switch_APIC_timer_to_ipi); @@ -1174,21 +1222,10 @@ EXPORT_SYMBOL(switch_ipi_to_APIC_timer); inline void smp_local_timer_interrupt(struct pt_regs * regs) { - profile_tick(CPU_PROFILING, regs); -#ifdef CONFIG_SMP - update_process_times(user_mode_vm(regs)); -#endif + int cpu = smp_processor_id(); + struct clock_event *evt = &per_cpu(lapic_events, cpu); - /* - * We take the 'long' return path, and there every subsystem - * grabs the apropriate locks (kernel lock/ irq lock). - * - * we might want to decouple profiling from the 'long path', - * and do the profiling totally in assembly. - * - * Currently this isn't too much of an issue (performance wise), - * we can take more than 100K local irqs per second on a 100 MHz P5. - */ + evt->event_handler(regs); } /* @@ -1203,6 +1240,7 @@ inline void smp_local_timer_interrupt(st fastcall void smp_apic_timer_interrupt(struct pt_regs *regs) { int cpu = smp_processor_id(); + struct clock_event *evt = &per_cpu(lapic_events, cpu); /* * the NMI deadlock-detector uses this. @@ -1220,7 +1258,7 @@ fastcall void smp_apic_timer_interrupt(s * interrupt lock, which is the WrongThing (tm) to do. 
*/ irq_enter(); - smp_local_timer_interrupt(regs); + evt->event_handler(regs); irq_exit(); } Index: linux/arch/i386/kernel/hpet.c =================================================================== --- /dev/null +++ linux/arch/i386/kernel/hpet.c @@ -0,0 +1,67 @@ +#include +#include +#include +#include + +#include +#include + +#define HPET_MASK CLOCKSOURCE_MASK(32) +#define HPET_SHIFT 22 + +/* FSEC = 10^-15 NSEC = 10^-9 */ +#define FSEC_PER_NSEC 1000000 + +static void *hpet_ptr; + +static cycle_t read_hpet(void) +{ + return (cycle_t)readl(hpet_ptr); +} + +static struct clocksource clocksource_hpet = { + .name = "hpet", + .rating = 250, + .read = read_hpet, + .mask = HPET_MASK, + .mult = 0, /* set below */ + .shift = HPET_SHIFT, + .is_continuous = 1, +}; + +static int __init init_hpet_clocksource(void) +{ + unsigned long hpet_period; + void __iomem* hpet_base; + u64 tmp; + + if (!hpet_address) + return -ENODEV; + + /* calculate the hpet address: */ + hpet_base = + (void __iomem*)ioremap_nocache(hpet_address, HPET_MMAP_SIZE); + hpet_ptr = hpet_base + HPET_COUNTER; + + /* calculate the frequency: */ + hpet_period = readl(hpet_base + HPET_PERIOD); + + /* + * hpet period is in femto seconds per cycle + * so we need to convert this to ns/cyc units + * approximated by mult/2^shift + * + * fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift + * fsec/cyc * 1ns/1000000fsec * 2^shift = mult + * fsec/cyc * 2^shift * 1nsec/1000000fsec = mult + * (fsec/cyc << shift)/1000000 = mult + * (hpet_period << shift)/FSEC_PER_NSEC = mult + */ + tmp = (u64)hpet_period << HPET_SHIFT; + do_div(tmp, FSEC_PER_NSEC); + clocksource_hpet.mult = (u32)tmp; + + return clocksource_register(&clocksource_hpet); +} + +module_init(init_hpet_clocksource); Index: linux/arch/i386/kernel/i8253.c =================================================================== --- /dev/null +++ linux/arch/i386/kernel/i8253.c @@ -0,0 +1,162 @@ +/* + * i8253.c 8253/PIT functions + * + */ +#include +#include +#include 
+#include +#include +#include + +#include +#include +#include +#include + +#include "io_ports.h" + +DEFINE_SPINLOCK(i8253_lock); +EXPORT_SYMBOL(i8253_lock); + +static void init_pit_timer(int mode, struct clock_event *evt) +{ + unsigned long flags; + + spin_lock_irqsave(&i8253_lock, flags); + + switch(mode) { + case CLOCK_EVT_PERIODIC: + /* binary, mode 2, LSB/MSB, ch 0 */ + outb_p(0x34, PIT_MODE); + udelay(10); + outb_p(LATCH & 0xff , PIT_CH0); /* LSB */ + outb(LATCH >> 8 , PIT_CH0); /* MSB */ + break; + + case CLOCK_EVT_ONESHOT: + case CLOCK_EVT_SHUTDOWN: + /* One shot setup */ + outb_p(0x38, PIT_MODE); + udelay(10); + break; + } + spin_unlock_irqrestore(&i8253_lock, flags); +} + +static void pit_next_event(unsigned long delta, struct clock_event *evt) +{ + unsigned long flags; + + spin_lock_irqsave(&i8253_lock, flags); + outb_p(delta & 0xff , PIT_CH0); /* LSB */ + outb(delta >> 8 , PIT_CH0); /* MSB */ + spin_unlock_irqrestore(&i8253_lock, flags); +} + +struct clock_event pit_clockevent = { + .name = "pit", + .capabilities = CLOCK_CAP_TICK | CLOCK_CAP_PROFILE | CLOCK_CAP_UPDATE +#ifndef CONFIG_SMP + | CLOCK_CAP_NEXTEVT +#endif + , + .set_mode = init_pit_timer, + .set_next_event = pit_next_event, + .shift = 32, +}; + +void setup_pit_timer(void) +{ + pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, 32); + pit_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFF, &pit_clockevent); + pit_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &pit_clockevent); + setup_global_clockevent(&pit_clockevent, CPU_MASK_NONE); +} + +/* + * Since the PIT overflows every tick, it's not very useful + * to just read by itself. 
So use jiffies to emulate a free + * running counter: + */ +static cycle_t pit_read(void) +{ + unsigned long flags; + int count; + u32 jifs; + static int old_count; + static u32 old_jifs; + + spin_lock_irqsave(&i8253_lock, flags); + /* + * Although our caller may have the read side of xtime_lock, + * this is now a seqlock, and we are cheating in this routine + * by having side effects on state that we cannot undo if + * there is a collision on the seqlock and our caller has to + * retry. (Namely, old_jifs and old_count.) So we must treat + * jiffies as volatile despite the lock. We read jiffies + * before latching the timer count to guarantee that although + * the jiffies value might be older than the count (that is, + * the counter may underflow between the last point where + * jiffies was incremented and the point where we latch the + * count), it cannot be newer. + */ + jifs = jiffies; + outb_p(0x00, PIT_MODE); /* latch the count ASAP */ + count = inb_p(PIT_CH0); /* read the latched count */ + count |= inb_p(PIT_CH0) << 8; + + /* VIA686a test code... reset the latch if count > max + 1 */ + if (count > LATCH) { + outb_p(0x34, PIT_MODE); + outb_p(LATCH & 0xff, PIT_CH0); + outb(LATCH >> 8, PIT_CH0); + count = LATCH - 1; + } + + /* + * It's possible for count to appear to go the wrong way for a + * couple of reasons: + * + * 1. The timer counter underflows, but we haven't handled the + * resulting interrupt and incremented jiffies yet. + * 2. Hardware problem with the timer, not giving us continuous time, + * the counter does small "jumps" upwards on some Pentium systems, + * (see c't 95/10 page 335 for Neptun bug.) + * + * Previous attempts to handle these cases intelligently were + * buggy, so we just do the simple thing now. 
+ */ + if (count > old_count && jifs == old_jifs) { + count = old_count; + } + old_count = count; + old_jifs = jifs; + + spin_unlock_irqrestore(&i8253_lock, flags); + + count = (LATCH - 1) - count; + + return (cycle_t)(jifs * LATCH) + count; +} + +static struct clocksource clocksource_pit = { + .name = "pit", + .rating = 110, + .read = pit_read, + .mask = CLOCKSOURCE_MASK(32), + .mult = 0, + .shift = 20, +}; + +static int __init init_pit_clocksource(void) +{ + if (num_possible_cpus() > 4) /* PIT does not scale! */ + return 0; + + clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE, 20); + return clocksource_register(&clocksource_pit); +} +module_init(init_pit_clocksource); Index: linux/arch/i386/kernel/irq.c =================================================================== --- linux.orig/arch/i386/kernel/irq.c +++ linux/arch/i386/kernel/irq.c @@ -75,6 +75,19 @@ fastcall unsigned int do_IRQ(struct pt_r } } #endif +#ifdef CONFIG_NO_HZ + if (idle_cpu(smp_processor_id())) { + update_jiffies(); + /* + * Force polling-idle loops to break out into + * the sched-timer setting code, to make sure + * that timer interval changes due to __mod_timer() + * in IRQ context get properly propagated: + */ + if (tsk_is_polling(current)) + set_need_resched(); + } +#endif #ifdef CONFIG_4KSTACKS Index: linux/arch/i386/kernel/nmi.c =================================================================== --- linux.orig/arch/i386/kernel/nmi.c +++ linux/arch/i386/kernel/nmi.c @@ -532,7 +532,7 @@ void nmi_watchdog_tick (struct pt_regs * unsigned int sum; int cpu = smp_processor_id(); - sum = per_cpu(irq_stat, cpu).apic_timer_irqs; + sum = per_cpu(irq_stat, cpu).apic_timer_irqs + kstat_irqs(0); if (last_irq_sums[cpu] == sum) { /* Index: linux/arch/i386/kernel/numaq.c =================================================================== --- linux.orig/arch/i386/kernel/numaq.c +++ linux/arch/i386/kernel/numaq.c @@ -79,10 +79,12 @@ int __init get_memcfg_numaq(void) return 1; } -static int 
__init numaq_dsc_disable(void) +static int __init numaq_tsc_disable(void) { - printk(KERN_DEBUG "NUMAQ: disabling TSC\n"); - tsc_disable = 1; + if (num_online_nodes() > 1) { + printk(KERN_DEBUG "NUMAQ: disabling TSC\n"); + tsc_disable = 1; + } return 0; } -core_initcall(numaq_dsc_disable); +arch_initcall(numaq_tsc_disable); Index: linux/arch/i386/kernel/process.c =================================================================== --- linux.orig/arch/i386/kernel/process.c +++ linux/arch/i386/kernel/process.c @@ -102,16 +102,20 @@ void default_idle(void) local_irq_enable(); if (!hlt_counter && boot_cpu_data.hlt_works_ok) { - clear_thread_flag(TIF_POLLING_NRFLAG); + current_thread_info()->status &= ~TS_POLLING; smp_mb__after_clear_bit(); while (!need_resched()) { local_irq_disable(); - if (!need_resched()) - safe_halt(); - else + if (!need_resched()) { + if (!hrtimer_stop_sched_tick()) + safe_halt(); + else + local_irq_enable(); + hrtimer_restart_sched_tick(); + } else local_irq_enable(); } - set_thread_flag(TIF_POLLING_NRFLAG); + current_thread_info()->status |= TS_POLLING; } else { while (!need_resched()) cpu_relax(); @@ -126,16 +130,18 @@ EXPORT_SYMBOL(default_idle); * to poll the ->work.need_resched flag instead of waiting for the * cross-CPU IPI to arrive. Use this option with caution. 
*/ -static void poll_idle (void) +static void poll_idle(void) { local_irq_enable(); - asm volatile( - "2:" - "testl %0, %1;" - "rep; nop;" - "je 2b;" - : : "i"(_TIF_NEED_RESCHED), "m" (current_thread_info()->flags)); + while (!need_resched()) { + hrtimer_stop_sched_tick(); + local_irq_enable(); + while (!need_resched() && !rcu_pending(smp_processor_id()) && !local_softirq_pending()) + rep_nop(); + hrtimer_restart_sched_tick(); + local_irq_enable(); + } } #ifdef CONFIG_HOTPLUG_CPU @@ -174,7 +180,7 @@ void cpu_idle(void) { int cpu = smp_processor_id(); - set_thread_flag(TIF_POLLING_NRFLAG); + current_thread_info()->status |= TS_POLLING; /* endless idle loop with no priority at all */ while (1) { @@ -242,12 +248,15 @@ static void mwait_idle(void) local_irq_enable(); while (!need_resched()) { + if (hrtimer_stop_sched_tick()) + break; __monitor((void *)¤t_thread_info()->flags, 0, 0); smp_mb(); if (need_resched()) break; __mwait(0, 0); } + hrtimer_restart_sched_tick(); } void __devinit select_idle_routine(const struct cpuinfo_x86 *c) Index: linux/arch/i386/kernel/setup.c =================================================================== --- linux.orig/arch/i386/kernel/setup.c +++ linux/arch/i386/kernel/setup.c @@ -1583,6 +1583,7 @@ void __init setup_arch(char **cmdline_p) conswitchp = &dummy_con; #endif #endif + tsc_init(); } static __init int add_pcspkr(void) Index: linux/arch/i386/kernel/time.c =================================================================== --- linux.orig/arch/i386/kernel/time.c +++ linux/arch/i386/kernel/time.c @@ -82,13 +82,6 @@ extern unsigned long wall_jiffies; DEFINE_SPINLOCK(rtc_lock); EXPORT_SYMBOL(rtc_lock); -#include - -DEFINE_SPINLOCK(i8253_lock); -EXPORT_SYMBOL(i8253_lock); - -struct timer_opts *cur_timer __read_mostly = &timer_none; - /* * This is a special lock that is owned by the CPU and holds the index * register we are working with. 
It is required for NMI access to the @@ -118,99 +111,19 @@ void rtc_cmos_write(unsigned char val, u } EXPORT_SYMBOL(rtc_cmos_write); -/* - * This version of gettimeofday has microsecond resolution - * and better than microsecond precision on fast x86 machines with TSC. - */ -void do_gettimeofday(struct timeval *tv) -{ - unsigned long seq; - unsigned long usec, sec; - unsigned long max_ntp_tick; - - do { - unsigned long lost; - - seq = read_seqbegin(&xtime_lock); - - usec = cur_timer->get_offset(); - lost = jiffies - wall_jiffies; - - /* - * If time_adjust is negative then NTP is slowing the clock - * so make sure not to go into next possible interval. - * Better to lose some accuracy than have time go backwards.. - */ - if (unlikely(time_adjust < 0)) { - max_ntp_tick = (USEC_PER_SEC / HZ) - tickadj; - usec = min(usec, max_ntp_tick); - - if (lost) - usec += lost * max_ntp_tick; - } - else if (unlikely(lost)) - usec += lost * (USEC_PER_SEC / HZ); - - sec = xtime.tv_sec; - usec += (xtime.tv_nsec / 1000); - } while (read_seqretry(&xtime_lock, seq)); - - while (usec >= 1000000) { - usec -= 1000000; - sec++; - } - - tv->tv_sec = sec; - tv->tv_usec = usec; -} - -EXPORT_SYMBOL(do_gettimeofday); - -int do_settimeofday(struct timespec *tv) -{ - time_t wtm_sec, sec = tv->tv_sec; - long wtm_nsec, nsec = tv->tv_nsec; - - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - write_seqlock_irq(&xtime_lock); - /* - * This is revolting. We need to set "xtime" correctly. However, the - * value in this location is the value at the most recent update of - * wall time. Discover what correction gettimeofday() would have - * made, and then undo it! 
- */ - nsec -= cur_timer->get_offset() * NSEC_PER_USEC; - nsec -= (jiffies - wall_jiffies) * TICK_NSEC; - - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); - - set_normalized_timespec(&xtime, sec, nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - - ntp_clear(); - write_sequnlock_irq(&xtime_lock); - clock_was_set(); - return 0; -} - -EXPORT_SYMBOL(do_settimeofday); - static int set_rtc_mmss(unsigned long nowtime) { int retval; - - WARN_ON(irqs_disabled()); + unsigned long flags; /* gets recalled with irq locally disabled */ - spin_lock_irq(&rtc_lock); + /* XXX - does irqsave resolve this? -johnstul */ + spin_lock_irqsave(&rtc_lock, flags); if (efi_enabled) retval = efi_set_rtc_mmss(nowtime); else retval = mach_set_rtc_mmss(nowtime); - spin_unlock_irq(&rtc_lock); + spin_unlock_irqrestore(&rtc_lock, flags); return retval; } @@ -218,16 +131,6 @@ static int set_rtc_mmss(unsigned long no int timer_ack; -/* monotonic_clock(): returns # of nanoseconds passed since time_init() - * Note: This function is required to return accurate - * time even in the absence of multiple timer ticks. - */ -unsigned long long monotonic_clock(void) -{ - return cur_timer->monotonic_clock(); -} -EXPORT_SYMBOL(monotonic_clock); - #if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER) unsigned long profile_pc(struct pt_regs *regs) { @@ -242,10 +145,11 @@ EXPORT_SYMBOL(profile_pc); #endif /* - * timer_interrupt() needs to keep up the real-time clock, - * as well as call the "do_timer()" routine every clocktick + * This is the same as the above, except we _also_ save the current + * Time Stamp Counter value at the time of the timer interrupt, so that + * we later on can estimate the time of day more exactly. 
*/ -static inline void do_timer_interrupt(int irq, struct pt_regs *regs) +irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) { #ifdef CONFIG_X86_IO_APIC if (timer_ack) { @@ -265,7 +169,6 @@ static inline void do_timer_interrupt(in do_timer_interrupt_hook(regs); - if (MCA_bus) { /* The PS/2 uses level-triggered interrupts. You can't turn them off, nor would you want to (any attempt to @@ -279,29 +182,6 @@ static inline void do_timer_interrupt(in irq = inb_p( 0x61 ); /* read the current state */ outb_p( irq|0x80, 0x61 ); /* reset the IRQ */ } -} - -/* - * This is the same as the above, except we _also_ save the current - * Time Stamp Counter value at the time of the timer interrupt, so that - * we later on can estimate the time of day more exactly. - */ -irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) -{ - /* - * Here we are in the timer irq handler. We just have irqs locally - * disabled but we don't know if the timer_bh is running on the other - * CPU. We need to avoid to SMP race with it. NOTE: we don' t need - * the irq version of write_lock because as just said we have irq - * locally disabled. 
-arca - */ - write_seqlock(&xtime_lock); - - cur_timer->mark_offset(); - - do_timer_interrupt(irq, regs); - - write_sequnlock(&xtime_lock); #ifdef CONFIG_X86_LOCAL_APIC if (using_apic_timer) @@ -380,7 +260,6 @@ void notify_arch_cmos_timer(void) static long clock_cmos_diff, sleep_start; -static struct timer_opts *last_timer; static int timer_suspend(struct sys_device *dev, pm_message_t state) { /* @@ -389,10 +268,6 @@ static int timer_suspend(struct sys_devi clock_cmos_diff = -get_cmos_time(); clock_cmos_diff += get_seconds(); sleep_start = get_cmos_time(); - last_timer = cur_timer; - cur_timer = &timer_none; - if (last_timer->suspend) - last_timer->suspend(state); return 0; } @@ -415,10 +290,6 @@ static int timer_resume(struct sys_devic jiffies_64 += sleep_length; wall_jiffies += sleep_length; write_sequnlock_irqrestore(&xtime_lock, flags); - if (last_timer->resume) - last_timer->resume(); - cur_timer = last_timer; - last_timer = NULL; touch_softlockup_watchdog(); return 0; } @@ -460,9 +331,6 @@ static void __init hpet_time_init(void) printk("Using HPET for base-timer\n"); } - cur_timer = select_timer(); - printk(KERN_INFO "Using %s for high-res timesource\n",cur_timer->name); - time_init_hook(); } #endif @@ -484,8 +352,5 @@ void __init time_init(void) set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec); - cur_timer = select_timer(); - printk(KERN_INFO "Using %s for high-res timesource\n",cur_timer->name); - time_init_hook(); } Index: linux/arch/i386/kernel/timers/Makefile =================================================================== --- linux.orig/arch/i386/kernel/timers/Makefile +++ /dev/null @@ -1,9 +0,0 @@ -# -# Makefile for x86 timers -# - -obj-y := timer.o timer_none.o timer_tsc.o timer_pit.o common.o - -obj-$(CONFIG_X86_CYCLONE_TIMER) += timer_cyclone.o -obj-$(CONFIG_HPET_TIMER) += timer_hpet.o -obj-$(CONFIG_X86_PM_TIMER) += timer_pm.o Index: linux/arch/i386/kernel/timers/common.c 
=================================================================== --- linux.orig/arch/i386/kernel/timers/common.c +++ /dev/null @@ -1,172 +0,0 @@ -/* - * Common functions used across the timers go here - */ - -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "mach_timer.h" - -/* ------ Calibrate the TSC ------- - * Return 2^32 * (1 / (TSC clocks per usec)) for do_fast_gettimeoffset(). - * Too much 64-bit arithmetic here to do this cleanly in C, and for - * accuracy's sake we want to keep the overhead on the CTC speaker (channel 2) - * output busy loop as low as possible. We avoid reading the CTC registers - * directly because of the awkward 8-bit access mechanism of the 82C54 - * device. - */ - -#define CALIBRATE_TIME (5 * 1000020/HZ) - -unsigned long calibrate_tsc(void) -{ - mach_prepare_counter(); - - { - unsigned long startlow, starthigh; - unsigned long endlow, endhigh; - unsigned long count; - - rdtsc(startlow,starthigh); - mach_countup(&count); - rdtsc(endlow,endhigh); - - - /* Error: ECTCNEVERSET */ - if (count <= 1) - goto bad_ctc; - - /* 64-bit subtract - gcc just messes up with long longs */ - __asm__("subl %2,%0\n\t" - "sbbl %3,%1" - :"=a" (endlow), "=d" (endhigh) - :"g" (startlow), "g" (starthigh), - "0" (endlow), "1" (endhigh)); - - /* Error: ECPUTOOFAST */ - if (endhigh) - goto bad_ctc; - - /* Error: ECPUTOOSLOW */ - if (endlow <= CALIBRATE_TIME) - goto bad_ctc; - - __asm__("divl %2" - :"=a" (endlow), "=d" (endhigh) - :"r" (endlow), "0" (0), "1" (CALIBRATE_TIME)); - - return endlow; - } - - /* - * The CTC wasn't reliable: we got a hit on the very first read, - * or the CPU was so fast/slow that the quotient wouldn't fit in - * 32 bits.. - */ -bad_ctc: - return 0; -} - -#ifdef CONFIG_HPET_TIMER -/* ------ Calibrate the TSC using HPET ------- - * Return 2^32 * (1 / (TSC clocks per usec)) for getting the CPU freq. 
- * Second output is parameter 1 (when non NULL) - * Set 2^32 * (1 / (tsc per HPET clk)) for delay_hpet(). - * calibrate_tsc() calibrates the processor TSC by comparing - * it to the HPET timer of known frequency. - * Too much 64-bit arithmetic here to do this cleanly in C - */ -#define CALIBRATE_CNT_HPET (5 * hpet_tick) -#define CALIBRATE_TIME_HPET (5 * KERNEL_TICK_USEC) - -unsigned long __devinit calibrate_tsc_hpet(unsigned long *tsc_hpet_quotient_ptr) -{ - unsigned long tsc_startlow, tsc_starthigh; - unsigned long tsc_endlow, tsc_endhigh; - unsigned long hpet_start, hpet_end; - unsigned long result, remain; - - hpet_start = hpet_readl(HPET_COUNTER); - rdtsc(tsc_startlow, tsc_starthigh); - do { - hpet_end = hpet_readl(HPET_COUNTER); - } while ((hpet_end - hpet_start) < CALIBRATE_CNT_HPET); - rdtsc(tsc_endlow, tsc_endhigh); - - /* 64-bit subtract - gcc just messes up with long longs */ - __asm__("subl %2,%0\n\t" - "sbbl %3,%1" - :"=a" (tsc_endlow), "=d" (tsc_endhigh) - :"g" (tsc_startlow), "g" (tsc_starthigh), - "0" (tsc_endlow), "1" (tsc_endhigh)); - - /* Error: ECPUTOOFAST */ - if (tsc_endhigh) - goto bad_calibration; - - /* Error: ECPUTOOSLOW */ - if (tsc_endlow <= CALIBRATE_TIME_HPET) - goto bad_calibration; - - ASM_DIV64_REG(result, remain, tsc_endlow, 0, CALIBRATE_TIME_HPET); - if (remain > (tsc_endlow >> 1)) - result++; /* rounding the result */ - - if (tsc_hpet_quotient_ptr) { - unsigned long tsc_hpet_quotient; - - ASM_DIV64_REG(tsc_hpet_quotient, remain, tsc_endlow, 0, - CALIBRATE_CNT_HPET); - if (remain > (tsc_endlow >> 1)) - tsc_hpet_quotient++; /* rounding the result */ - *tsc_hpet_quotient_ptr = tsc_hpet_quotient; - } - - return result; -bad_calibration: - /* - * the CPU was so fast/slow that the quotient wouldn't fit in - * 32 bits.. 
- */ - return 0; -} -#endif - - -unsigned long read_timer_tsc(void) -{ - unsigned long retval; - rdtscl(retval); - return retval; -} - - -/* calculate cpu_khz */ -void init_cpu_khz(void) -{ - if (cpu_has_tsc) { - unsigned long tsc_quotient = calibrate_tsc(); - if (tsc_quotient) { - /* report CPU clock rate in Hz. - * The formula is (10^6 * 2^32) / (2^32 * 1 / (clocks/us)) = - * clock/second. Our precision is about 100 ppm. - */ - { unsigned long eax=0, edx=1000; - __asm__("divl %2" - :"=a" (cpu_khz), "=d" (edx) - :"r" (tsc_quotient), - "0" (eax), "1" (edx)); - printk("Detected %u.%03u MHz processor.\n", - cpu_khz / 1000, cpu_khz % 1000); - } - } - } -} - Index: linux/arch/i386/kernel/timers/timer.c =================================================================== --- linux.orig/arch/i386/kernel/timers/timer.c +++ /dev/null @@ -1,75 +0,0 @@ -#include -#include -#include -#include - -#ifdef CONFIG_HPET_TIMER -/* - * HPET memory read is slower than tsc reads, but is more dependable as it - * always runs at constant frequency and reduces complexity due to - * cpufreq. So, we prefer HPET timer to tsc based one. Also, we cannot use - * timer_pit when HPET is active. So, we default to timer_tsc. - */ -#endif -/* list of timers, ordered by preference, NULL terminated */ -static struct init_timer_opts* __initdata timers[] = { -#ifdef CONFIG_X86_CYCLONE_TIMER - &timer_cyclone_init, -#endif -#ifdef CONFIG_HPET_TIMER - &timer_hpet_init, -#endif -#ifdef CONFIG_X86_PM_TIMER - &timer_pmtmr_init, -#endif - &timer_tsc_init, - &timer_pit_init, - NULL, -}; - -static char clock_override[10] __initdata; - -static int __init clock_setup(char* str) -{ - if (str) - strlcpy(clock_override, str, sizeof(clock_override)); - return 1; -} -__setup("clock=", clock_setup); - - -/* The chosen timesource has been found to be bad. 
- * Fall back to a known good timesource (the PIT) - */ -void clock_fallback(void) -{ - cur_timer = &timer_pit; -} - -/* iterates through the list of timers, returning the first - * one that initializes successfully. - */ -struct timer_opts* __init select_timer(void) -{ - int i = 0; - - /* find most preferred working timer */ - while (timers[i]) { - if (timers[i]->init) - if (timers[i]->init(clock_override) == 0) - return timers[i]->opts; - ++i; - } - - panic("select_timer: Cannot find a suitable timer\n"); - return NULL; -} - -int read_current_timer(unsigned long *timer_val) -{ - if (cur_timer->read_timer) { - *timer_val = cur_timer->read_timer(); - return 0; - } - return -1; -} Index: linux/arch/i386/kernel/timers/timer_cyclone.c =================================================================== --- linux.orig/arch/i386/kernel/timers/timer_cyclone.c +++ /dev/null @@ -1,259 +0,0 @@ -/* Cyclone-timer: - * This code implements timer_ops for the cyclone counter found - * on IBM x440, x360, and other Summit based systems. 
- * - * Copyright (C) 2002 IBM, John Stultz (johnstul@us.ibm.com) - */ - - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include "io_ports.h" - -/* Number of usecs that the last interrupt was delayed */ -static int delay_at_last_interrupt; - -#define CYCLONE_CBAR_ADDR 0xFEB00CD0 -#define CYCLONE_PMCC_OFFSET 0x51A0 -#define CYCLONE_MPMC_OFFSET 0x51D0 -#define CYCLONE_MPCS_OFFSET 0x51A8 -#define CYCLONE_TIMER_FREQ 100000000 -#define CYCLONE_TIMER_MASK (((u64)1<<40)-1) /* 40 bit mask */ -int use_cyclone = 0; - -static u32* volatile cyclone_timer; /* Cyclone MPMC0 register */ -static u32 last_cyclone_low; -static u32 last_cyclone_high; -static unsigned long long monotonic_base; -static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; - -/* helper macro to atomically read both cyclone counter registers */ -#define read_cyclone_counter(low,high) \ - do{ \ - high = cyclone_timer[1]; low = cyclone_timer[0]; \ - } while (high != cyclone_timer[1]); - - -static void mark_offset_cyclone(void) -{ - unsigned long lost, delay; - unsigned long delta = last_cyclone_low; - int count; - unsigned long long this_offset, last_offset; - - write_seqlock(&monotonic_lock); - last_offset = ((unsigned long long)last_cyclone_high<<32)|last_cyclone_low; - - spin_lock(&i8253_lock); - read_cyclone_counter(last_cyclone_low,last_cyclone_high); - - /* read values for delay_at_last_interrupt */ - outb_p(0x00, 0x43); /* latch the count ASAP */ - - count = inb_p(0x40); /* read the latched count */ - count |= inb(0x40) << 8; - - /* - * VIA686a test code... 
reset the latch if count > max + 1 - * from timer_pit.c - cjb - */ - if (count > LATCH) { - outb_p(0x34, PIT_MODE); - outb_p(LATCH & 0xff, PIT_CH0); - outb(LATCH >> 8, PIT_CH0); - count = LATCH - 1; - } - spin_unlock(&i8253_lock); - - /* lost tick compensation */ - delta = last_cyclone_low - delta; - delta /= (CYCLONE_TIMER_FREQ/1000000); - delta += delay_at_last_interrupt; - lost = delta/(1000000/HZ); - delay = delta%(1000000/HZ); - if (lost >= 2) - jiffies_64 += lost-1; - - /* update the monotonic base value */ - this_offset = ((unsigned long long)last_cyclone_high<<32)|last_cyclone_low; - monotonic_base += (this_offset - last_offset) & CYCLONE_TIMER_MASK; - write_sequnlock(&monotonic_lock); - - /* calculate delay_at_last_interrupt */ - count = ((LATCH-1) - count) * TICK_SIZE; - delay_at_last_interrupt = (count + LATCH/2) / LATCH; - - - /* catch corner case where tick rollover occured - * between cyclone and pit reads (as noted when - * usec delta is > 90% # of usecs/tick) - */ - if (lost && abs(delay - delay_at_last_interrupt) > (900000/HZ)) - jiffies_64++; -} - -static unsigned long get_offset_cyclone(void) -{ - u32 offset; - - if(!cyclone_timer) - return delay_at_last_interrupt; - - /* Read the cyclone timer */ - offset = cyclone_timer[0]; - - /* .. relative to previous jiffy */ - offset = offset - last_cyclone_low; - - /* convert cyclone ticks to microseconds */ - /* XXX slow, can we speed this up? 
*/ - offset = offset/(CYCLONE_TIMER_FREQ/1000000); - - /* our adjusted time offset in microseconds */ - return delay_at_last_interrupt + offset; -} - -static unsigned long long monotonic_clock_cyclone(void) -{ - u32 now_low, now_high; - unsigned long long last_offset, this_offset, base; - unsigned long long ret; - unsigned seq; - - /* atomically read monotonic base & last_offset */ - do { - seq = read_seqbegin(&monotonic_lock); - last_offset = ((unsigned long long)last_cyclone_high<<32)|last_cyclone_low; - base = monotonic_base; - } while (read_seqretry(&monotonic_lock, seq)); - - - /* Read the cyclone counter */ - read_cyclone_counter(now_low,now_high); - this_offset = ((unsigned long long)now_high<<32)|now_low; - - /* convert to nanoseconds */ - ret = base + ((this_offset - last_offset)&CYCLONE_TIMER_MASK); - return ret * (1000000000 / CYCLONE_TIMER_FREQ); -} - -static int __init init_cyclone(char* override) -{ - u32* reg; - u32 base; /* saved cyclone base address */ - u32 pageaddr; /* page that contains cyclone_timer register */ - u32 offset; /* offset from pageaddr to cyclone_timer register */ - int i; - - /* check clock override */ - if (override[0] && strncmp(override,"cyclone",7)) - return -ENODEV; - - /*make sure we're on a summit box*/ - if(!use_cyclone) return -ENODEV; - - printk(KERN_INFO "Summit chipset: Starting Cyclone Counter.\n"); - - /* find base address */ - pageaddr = (CYCLONE_CBAR_ADDR)&PAGE_MASK; - offset = (CYCLONE_CBAR_ADDR)&(~PAGE_MASK); - set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr); - reg = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset); - if(!reg){ - printk(KERN_ERR "Summit chipset: Could not find valid CBAR register.\n"); - return -ENODEV; - } - base = *reg; - if(!base){ - printk(KERN_ERR "Summit chipset: Could not find valid CBAR value.\n"); - return -ENODEV; - } - - /* setup PMCC */ - pageaddr = (base + CYCLONE_PMCC_OFFSET)&PAGE_MASK; - offset = (base + CYCLONE_PMCC_OFFSET)&(~PAGE_MASK); - set_fixmap_nocache(FIX_CYCLONE_TIMER, 
pageaddr); - reg = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset); - if(!reg){ - printk(KERN_ERR "Summit chipset: Could not find valid PMCC register.\n"); - return -ENODEV; - } - reg[0] = 0x00000001; - - /* setup MPCS */ - pageaddr = (base + CYCLONE_MPCS_OFFSET)&PAGE_MASK; - offset = (base + CYCLONE_MPCS_OFFSET)&(~PAGE_MASK); - set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr); - reg = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset); - if(!reg){ - printk(KERN_ERR "Summit chipset: Could not find valid MPCS register.\n"); - return -ENODEV; - } - reg[0] = 0x00000001; - - /* map in cyclone_timer */ - pageaddr = (base + CYCLONE_MPMC_OFFSET)&PAGE_MASK; - offset = (base + CYCLONE_MPMC_OFFSET)&(~PAGE_MASK); - set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr); - cyclone_timer = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset); - if(!cyclone_timer){ - printk(KERN_ERR "Summit chipset: Could not find valid MPMC register.\n"); - return -ENODEV; - } - - /*quick test to make sure its ticking*/ - for(i=0; i<3; i++){ - u32 old = cyclone_timer[0]; - int stall = 100; - while(stall--) barrier(); - if(cyclone_timer[0] == old){ - printk(KERN_ERR "Summit chipset: Counter not counting! DISABLED\n"); - cyclone_timer = 0; - return -ENODEV; - } - } - - init_cpu_khz(); - - /* Everything looks good! 
*/ - return 0; -} - - -static void delay_cyclone(unsigned long loops) -{ - unsigned long bclock, now; - if(!cyclone_timer) - return; - bclock = cyclone_timer[0]; - do { - rep_nop(); - now = cyclone_timer[0]; - } while ((now-bclock) < loops); -} -/************************************************************/ - -/* cyclone timer_opts struct */ -static struct timer_opts timer_cyclone = { - .name = "cyclone", - .mark_offset = mark_offset_cyclone, - .get_offset = get_offset_cyclone, - .monotonic_clock = monotonic_clock_cyclone, - .delay = delay_cyclone, -}; - -struct init_timer_opts __initdata timer_cyclone_init = { - .init = init_cyclone, - .opts = &timer_cyclone, -}; Index: linux/arch/i386/kernel/timers/timer_hpet.c =================================================================== --- linux.orig/arch/i386/kernel/timers/timer_hpet.c +++ /dev/null @@ -1,217 +0,0 @@ -/* - * This code largely moved from arch/i386/kernel/time.c. - * See comments there for proper credits. - */ - -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "io_ports.h" -#include "mach_timer.h" -#include - -static unsigned long hpet_usec_quotient __read_mostly; /* convert hpet clks to usec */ -static unsigned long tsc_hpet_quotient __read_mostly; /* convert tsc to hpet clks */ -static unsigned long hpet_last; /* hpet counter value at last tick*/ -static unsigned long last_tsc_low; /* lsb 32 bits of Time Stamp Counter */ -static unsigned long last_tsc_high; /* msb 32 bits of Time Stamp Counter */ -static unsigned long long monotonic_base; -static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; - -/* convert from cycles(64bits) => nanoseconds (64bits) - * basic equation: - * ns = cycles / (freq / ns_per_sec) - * ns = cycles * (ns_per_sec / freq) - * ns = cycles * (10^9 / (cpu_khz * 10^3)) - * ns = cycles * (10^6 / cpu_khz) - * - * Then we use scaling math (suggested by george@mvista.com) to get: - * ns = cycles * (10^6 * SC / cpu_khz) / SC - * ns = 
cycles * cyc2ns_scale / SC - * - * And since SC is a constant power of two, we can convert the div - * into a shift. - * - * We can use khz divisor instead of mhz to keep a better percision, since - * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. - * (mathieu.desnoyers@polymtl.ca) - * - * -johnstul@us.ibm.com "math is hard, lets go shopping!" - */ -static unsigned long cyc2ns_scale __read_mostly; -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ - -static inline void set_cyc2ns_scale(unsigned long cpu_khz) -{ - cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz; -} - -static inline unsigned long long cycles_2_ns(unsigned long long cyc) -{ - return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; -} - -static unsigned long long monotonic_clock_hpet(void) -{ - unsigned long long last_offset, this_offset, base; - unsigned seq; - - /* atomically read monotonic base & last_offset */ - do { - seq = read_seqbegin(&monotonic_lock); - last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - base = monotonic_base; - } while (read_seqretry(&monotonic_lock, seq)); - - /* Read the Time Stamp Counter */ - rdtscll(this_offset); - - /* return the value in ns */ - return base + cycles_2_ns(this_offset - last_offset); -} - -static unsigned long get_offset_hpet(void) -{ - register unsigned long eax, edx; - - eax = hpet_readl(HPET_COUNTER); - eax -= hpet_last; /* hpet delta */ - eax = min(hpet_tick, eax); - /* - * Time offset = (hpet delta) * ( usecs per HPET clock ) - * = (hpet delta) * ( usecs per tick / HPET clocks per tick) - * = (hpet delta) * ( hpet_usec_quotient ) / (2^32) - * - * Where, - * hpet_usec_quotient = (2^32 * usecs per tick)/HPET clocks per tick - * - * Using a mull instead of a divl saves some cycles in critical path. 
- */ - ASM_MUL64_REG(eax, edx, hpet_usec_quotient, eax); - - /* our adjusted time offset in microseconds */ - return edx; -} - -static void mark_offset_hpet(void) -{ - unsigned long long this_offset, last_offset; - unsigned long offset; - - write_seqlock(&monotonic_lock); - last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - rdtsc(last_tsc_low, last_tsc_high); - - if (hpet_use_timer) - offset = hpet_readl(HPET_T0_CMP) - hpet_tick; - else - offset = hpet_readl(HPET_COUNTER); - if (unlikely(((offset - hpet_last) >= (2*hpet_tick)) && (hpet_last != 0))) { - int lost_ticks = ((offset - hpet_last) / hpet_tick) - 1; - jiffies_64 += lost_ticks; - } - hpet_last = offset; - - /* update the monotonic base value */ - this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - monotonic_base += cycles_2_ns(this_offset - last_offset); - write_sequnlock(&monotonic_lock); -} - -static void delay_hpet(unsigned long loops) -{ - unsigned long hpet_start, hpet_end; - unsigned long eax; - - /* loops is the number of cpu cycles. Convert it to hpet clocks */ - ASM_MUL64_REG(eax, loops, tsc_hpet_quotient, loops); - - hpet_start = hpet_readl(HPET_COUNTER); - do { - rep_nop(); - hpet_end = hpet_readl(HPET_COUNTER); - } while ((hpet_end - hpet_start) < (loops)); -} - -static struct timer_opts timer_hpet; - -static int __init init_hpet(char* override) -{ - unsigned long result, remain; - - /* check clock override */ - if (override[0] && strncmp(override,"hpet",4)) - return -ENODEV; - - if (!is_hpet_enabled()) - return -ENODEV; - - printk("Using HPET for gettimeofday\n"); - if (cpu_has_tsc) { - unsigned long tsc_quotient = calibrate_tsc_hpet(&tsc_hpet_quotient); - if (tsc_quotient) { - /* report CPU clock rate in Hz. - * The formula is (10^6 * 2^32) / (2^32 * 1 / (clocks/us)) = - * clock/second. Our precision is about 100 ppm. 
- */ - { unsigned long eax=0, edx=1000; - ASM_DIV64_REG(cpu_khz, edx, tsc_quotient, - eax, edx); - printk("Detected %u.%03u MHz processor.\n", - cpu_khz / 1000, cpu_khz % 1000); - } - set_cyc2ns_scale(cpu_khz); - } - /* set this only when cpu_has_tsc */ - timer_hpet.read_timer = read_timer_tsc; - } - - /* - * Math to calculate hpet to usec multiplier - * Look for the comments at get_offset_hpet() - */ - ASM_DIV64_REG(result, remain, hpet_tick, 0, KERNEL_TICK_USEC); - if (remain > (hpet_tick >> 1)) - result++; /* rounding the result */ - hpet_usec_quotient = result; - - return 0; -} - -static int hpet_resume(void) -{ - write_seqlock(&monotonic_lock); - /* Assume this is the last mark offset time */ - rdtsc(last_tsc_low, last_tsc_high); - - if (hpet_use_timer) - hpet_last = hpet_readl(HPET_T0_CMP) - hpet_tick; - else - hpet_last = hpet_readl(HPET_COUNTER); - write_sequnlock(&monotonic_lock); - return 0; -} -/************************************************************/ - -/* tsc timer_opts struct */ -static struct timer_opts timer_hpet __read_mostly = { - .name = "hpet", - .mark_offset = mark_offset_hpet, - .get_offset = get_offset_hpet, - .monotonic_clock = monotonic_clock_hpet, - .delay = delay_hpet, - .resume = hpet_resume, -}; - -struct init_timer_opts __initdata timer_hpet_init = { - .init = init_hpet, - .opts = &timer_hpet, -}; Index: linux/arch/i386/kernel/timers/timer_none.c =================================================================== --- linux.orig/arch/i386/kernel/timers/timer_none.c +++ /dev/null @@ -1,39 +0,0 @@ -#include -#include - -static void mark_offset_none(void) -{ - /* nothing needed */ -} - -static unsigned long get_offset_none(void) -{ - return 0; -} - -static unsigned long long monotonic_clock_none(void) -{ - return 0; -} - -static void delay_none(unsigned long loops) -{ - int d0; - __asm__ __volatile__( - "\tjmp 1f\n" - ".align 16\n" - "1:\tjmp 2f\n" - ".align 16\n" - "2:\tdecl %0\n\tjns 2b" - :"=&a" (d0) - :"0" (loops)); -} - -/* none 
timer_opts struct */ -struct timer_opts timer_none = { - .name = "none", - .mark_offset = mark_offset_none, - .get_offset = get_offset_none, - .monotonic_clock = monotonic_clock_none, - .delay = delay_none, -}; Index: linux/arch/i386/kernel/timers/timer_pit.c =================================================================== --- linux.orig/arch/i386/kernel/timers/timer_pit.c +++ /dev/null @@ -1,177 +0,0 @@ -/* - * This code largely moved from arch/i386/kernel/time.c. - * See comments there for proper credits. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "do_timer.h" -#include "io_ports.h" - -static int count_p; /* counter in get_offset_pit() */ - -static int __init init_pit(char* override) -{ - /* check clock override */ - if (override[0] && strncmp(override,"pit",3)) - printk(KERN_ERR "Warning: clock= override failed. Defaulting " - "to PIT\n"); - init_cpu_khz(); - count_p = LATCH; - return 0; -} - -static void mark_offset_pit(void) -{ - /* nothing needed */ -} - -static unsigned long long monotonic_clock_pit(void) -{ - return 0; -} - -static void delay_pit(unsigned long loops) -{ - int d0; - __asm__ __volatile__( - "\tjmp 1f\n" - ".align 16\n" - "1:\tjmp 2f\n" - ".align 16\n" - "2:\tdecl %0\n\tjns 2b" - :"=&a" (d0) - :"0" (loops)); -} - - -/* This function must be called with xtime_lock held. - * It was inspired by Steve McCanne's microtime-i386 for BSD. -- jrs - * - * However, the pc-audio speaker driver changes the divisor so that - * it gets interrupted rather more often - it loads 64 into the - * counter rather than 11932! This has an adverse impact on - * do_gettimeoffset() -- it stops working! What is also not - * good is that the interval that our timer function gets called - * is no longer 10.0002 ms, but 9.9767 ms. To get around this - * would require using a different timing source. 
Maybe someone - * could use the RTC - I know that this can interrupt at frequencies - * ranging from 8192Hz to 2Hz. If I had the energy, I'd somehow fix - * it so that at startup, the timer code in sched.c would select - * using either the RTC or the 8253 timer. The decision would be - * based on whether there was any other device around that needed - * to trample on the 8253. I'd set up the RTC to interrupt at 1024 Hz, - * and then do some jiggery to have a version of do_timer that - * advanced the clock by 1/1024 s. Every time that reached over 1/100 - * of a second, then do all the old code. If the time was kept correct - * then do_gettimeoffset could just return 0 - there is no low order - * divider that can be accessed. - * - * Ideally, you would be able to use the RTC for the speaker driver, - * but it appears that the speaker driver really needs interrupt more - * often than every 120 us or so. - * - * Anyway, this needs more thought.... pjsg (1993-08-28) - * - * If you are really that interested, you should be reading - * comp.protocols.time.ntp! - */ - -static unsigned long get_offset_pit(void) -{ - int count; - unsigned long flags; - static unsigned long jiffies_p = 0; - - /* - * cache volatile jiffies temporarily; we have xtime_lock. - */ - unsigned long jiffies_t; - - spin_lock_irqsave(&i8253_lock, flags); - /* timer count may underflow right here */ - outb_p(0x00, PIT_MODE); /* latch the count ASAP */ - - count = inb_p(PIT_CH0); /* read the latched count */ - - /* - * We do this guaranteed double memory access instead of a _p - * postfix in the previous port access. Wheee, hackady hack - */ - jiffies_t = jiffies; - - count |= inb_p(PIT_CH0) << 8; - - /* VIA686a test code... reset the latch if count > max + 1 */ - if (count > LATCH) { - outb_p(0x34, PIT_MODE); - outb_p(LATCH & 0xff, PIT_CH0); - outb(LATCH >> 8, PIT_CH0); - count = LATCH - 1; - } - - /* - * avoiding timer inconsistencies (they are rare, but they happen)... 
- * there are two kinds of problems that must be avoided here: - * 1. the timer counter underflows - * 2. hardware problem with the timer, not giving us continuous time, - * the counter does small "jumps" upwards on some Pentium systems, - * (see c't 95/10 page 335 for Neptun bug.) - */ - - if( jiffies_t == jiffies_p ) { - if( count > count_p ) { - /* the nutcase */ - count = do_timer_overflow(count); - } - } else - jiffies_p = jiffies_t; - - count_p = count; - - spin_unlock_irqrestore(&i8253_lock, flags); - - count = ((LATCH-1) - count) * TICK_SIZE; - count = (count + LATCH/2) / LATCH; - - return count; -} - - -/* tsc timer_opts struct */ -struct timer_opts timer_pit = { - .name = "pit", - .mark_offset = mark_offset_pit, - .get_offset = get_offset_pit, - .monotonic_clock = monotonic_clock_pit, - .delay = delay_pit, -}; - -struct init_timer_opts __initdata timer_pit_init = { - .init = init_pit, - .opts = &timer_pit, -}; - -void setup_pit_timer(void) -{ - unsigned long flags; - - spin_lock_irqsave(&i8253_lock, flags); - outb_p(0x34,PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ - udelay(10); - outb_p(LATCH & 0xff , PIT_CH0); /* LSB */ - udelay(10); - outb(LATCH >> 8 , PIT_CH0); /* MSB */ - spin_unlock_irqrestore(&i8253_lock, flags); -} Index: linux/arch/i386/kernel/timers/timer_pm.c =================================================================== --- linux.orig/arch/i386/kernel/timers/timer_pm.c +++ /dev/null @@ -1,342 +0,0 @@ -/* - * (C) Dominik Brodowski 2003 - * - * Driver to use the Power Management Timer (PMTMR) available in some - * southbridges as primary timing source for the Linux kernel. - * - * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c, - * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4. - * - * This file is licensed under the GPL v2. 
- */ - - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include "mach_timer.h" - -/* Number of PMTMR ticks expected during calibration run */ -#define PMTMR_TICKS_PER_SEC 3579545 -#define PMTMR_EXPECTED_RATE \ - ((CALIBRATE_LATCH * (PMTMR_TICKS_PER_SEC >> 10)) / (CLOCK_TICK_RATE>>10)) - - -/* The I/O port the PMTMR resides at. - * The location is detected during setup_arch(), - * in arch/i386/acpi/boot.c */ -u32 pmtmr_ioport = 0; - - -/* value of the Power timer at last timer interrupt */ -static u32 offset_tick; -static u32 offset_delay; - -static unsigned long long monotonic_base; -static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; - -#define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */ - -static int pmtmr_need_workaround __read_mostly = 1; - -/*helper function to safely read acpi pm timesource*/ -static inline u32 read_pmtmr(void) -{ - if (pmtmr_need_workaround) { - u32 v1, v2, v3; - - /* It has been reported that because of various broken - * chipsets (ICH4, PIIX4 and PIIX4E) where the ACPI PM time - * source is not latched, so you must read it multiple - * times to insure a safe value is read. - */ - do { - v1 = inl(pmtmr_ioport); - v2 = inl(pmtmr_ioport); - v3 = inl(pmtmr_ioport); - } while ((v1 > v2 && v1 < v3) || (v2 > v3 && v2 < v1) - || (v3 > v1 && v3 < v2)); - - /* mask the output to 24 bits */ - return v2 & ACPI_PM_MASK; - } - - return inl(pmtmr_ioport) & ACPI_PM_MASK; -} - - -/* - * Some boards have the PMTMR running way too fast. We check - * the PMTMR rate against PIT channel 2 to catch these cases. 
- */ -static int verify_pmtmr_rate(void) -{ - u32 value1, value2; - unsigned long count, delta; - - mach_prepare_counter(); - value1 = read_pmtmr(); - mach_countup(&count); - value2 = read_pmtmr(); - delta = (value2 - value1) & ACPI_PM_MASK; - - /* Check that the PMTMR delta is within 5% of what we expect */ - if (delta < (PMTMR_EXPECTED_RATE * 19) / 20 || - delta > (PMTMR_EXPECTED_RATE * 21) / 20) { - printk(KERN_INFO "PM-Timer running at invalid rate: %lu%% of normal - aborting.\n", 100UL * delta / PMTMR_EXPECTED_RATE); - return -1; - } - - return 0; -} - - -static int init_pmtmr(char* override) -{ - u32 value1, value2; - unsigned int i; - - if (override[0] && strncmp(override,"pmtmr",5)) - return -ENODEV; - - if (!pmtmr_ioport) - return -ENODEV; - - /* we use the TSC for delay_pmtmr, so make sure it exists */ - if (!cpu_has_tsc) - return -ENODEV; - - /* "verify" this timing source */ - value1 = read_pmtmr(); - for (i = 0; i < 10000; i++) { - value2 = read_pmtmr(); - if (value2 == value1) - continue; - if (value2 > value1) - goto pm_good; - if ((value2 < value1) && ((value2) < 0xFFF)) - goto pm_good; - printk(KERN_INFO "PM-Timer had inconsistent results: 0x%#x, 0x%#x - aborting.\n", value1, value2); - return -EINVAL; - } - printk(KERN_INFO "PM-Timer had no reasonable result: 0x%#x - aborting.\n", value1); - return -ENODEV; - -pm_good: - if (verify_pmtmr_rate() != 0) - return -ENODEV; - - init_cpu_khz(); - return 0; -} - -static inline u32 cyc2us(u32 cycles) -{ - /* The Power Management Timer ticks at 3.579545 ticks per microsecond. - * 1 / PM_TIMER_FREQUENCY == 0.27936511 =~ 286/1024 [error: 0.024%] - * - * Even with HZ = 100, delta is at maximum 35796 ticks, so it can - * easily be multiplied with 286 (=0x11E) without having to fear - * u32 overflows. 
- */ - cycles *= 286; - return (cycles >> 10); -} - -/* - * this gets called during each timer interrupt - * - Called while holding the writer xtime_lock - */ -static void mark_offset_pmtmr(void) -{ - u32 lost, delta, last_offset; - static int first_run = 1; - last_offset = offset_tick; - - write_seqlock(&monotonic_lock); - - offset_tick = read_pmtmr(); - - /* calculate tick interval */ - delta = (offset_tick - last_offset) & ACPI_PM_MASK; - - /* convert to usecs */ - delta = cyc2us(delta); - - /* update the monotonic base value */ - monotonic_base += delta * NSEC_PER_USEC; - write_sequnlock(&monotonic_lock); - - /* convert to ticks */ - delta += offset_delay; - lost = delta / (USEC_PER_SEC / HZ); - offset_delay = delta % (USEC_PER_SEC / HZ); - - - /* compensate for lost ticks */ - if (lost >= 2) - jiffies_64 += lost - 1; - - /* don't calculate delay for first run, - or if we've got less then a tick */ - if (first_run || (lost < 1)) { - first_run = 0; - offset_delay = 0; - } -} - -static int pmtmr_resume(void) -{ - write_seqlock(&monotonic_lock); - /* Assume this is the last mark offset time */ - offset_tick = read_pmtmr(); - write_sequnlock(&monotonic_lock); - return 0; -} - -static unsigned long long monotonic_clock_pmtmr(void) -{ - u32 last_offset, this_offset; - unsigned long long base, ret; - unsigned seq; - - - /* atomically read monotonic base & last_offset */ - do { - seq = read_seqbegin(&monotonic_lock); - last_offset = offset_tick; - base = monotonic_base; - } while (read_seqretry(&monotonic_lock, seq)); - - /* Read the pmtmr */ - this_offset = read_pmtmr(); - - /* convert to nanoseconds */ - ret = (this_offset - last_offset) & ACPI_PM_MASK; - ret = base + (cyc2us(ret) * NSEC_PER_USEC); - return ret; -} - -static void delay_pmtmr(unsigned long loops) -{ - unsigned long bclock, now; - - rdtscl(bclock); - do - { - rep_nop(); - rdtscl(now); - } while ((now-bclock) < loops); -} - - -/* - * get the offset (in microseconds) from the last call to mark_offset() - 
* - Called holding a reader xtime_lock - */ -static unsigned long get_offset_pmtmr(void) -{ - u32 now, offset, delta = 0; - - offset = offset_tick; - now = read_pmtmr(); - delta = (now - offset)&ACPI_PM_MASK; - - return (unsigned long) offset_delay + cyc2us(delta); -} - - -/* acpi timer_opts struct */ -static struct timer_opts timer_pmtmr = { - .name = "pmtmr", - .mark_offset = mark_offset_pmtmr, - .get_offset = get_offset_pmtmr, - .monotonic_clock = monotonic_clock_pmtmr, - .delay = delay_pmtmr, - .read_timer = read_timer_tsc, - .resume = pmtmr_resume, -}; - -struct init_timer_opts __initdata timer_pmtmr_init = { - .init = init_pmtmr, - .opts = &timer_pmtmr, -}; - -#ifdef CONFIG_PCI -/* - * PIIX4 Errata: - * - * The power management timer may return improper results when read. - * Although the timer value settles properly after incrementing, - * while incrementing there is a 3 ns window every 69.8 ns where the - * timer value is indeterminate (a 4.2% chance that the data will be - * incorrect when read). As a result, the ACPI free running count up - * timer specification is violated due to erroneous reads. - */ -static int __init pmtmr_bug_check(void) -{ - static struct pci_device_id gray_list[] __initdata = { - /* these chipsets may have bug. */ - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, - PCI_DEVICE_ID_INTEL_82801DB_0) }, - { }, - }; - struct pci_dev *dev; - int pmtmr_has_bug = 0; - u8 rev; - - if (cur_timer != &timer_pmtmr || !pmtmr_need_workaround) - return 0; - - dev = pci_get_device(PCI_VENDOR_ID_INTEL, - PCI_DEVICE_ID_INTEL_82371AB_3, NULL); - if (dev) { - pci_read_config_byte(dev, PCI_REVISION_ID, &rev); - /* the bug has been fixed in PIIX4M */ - if (rev < 3) { - printk(KERN_WARNING "* Found PM-Timer Bug on this " - "chipset. Due to workarounds for a bug,\n" - "* this time source is slow. 
Consider trying " - "other time sources (clock=)\n"); - pmtmr_has_bug = 1; - } - pci_dev_put(dev); - } - - if (pci_dev_present(gray_list)) { - printk(KERN_WARNING "* This chipset may have PM-Timer Bug. Due" - " to workarounds for a bug,\n" - "* this time source is slow. If you are sure your timer" - " does not have\n" - "* this bug, please use \"pmtmr_good\" to disable the " - "workaround\n"); - pmtmr_has_bug = 1; - } - - if (!pmtmr_has_bug) - pmtmr_need_workaround = 0; - - return 0; -} -device_initcall(pmtmr_bug_check); -#endif - -static int __init pmtr_good_setup(char *__str) -{ - pmtmr_need_workaround = 0; - return 1; -} -__setup("pmtmr_good", pmtr_good_setup); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Dominik Brodowski "); -MODULE_DESCRIPTION("Power Management Timer (PMTMR) as primary timing source for x86"); Index: linux/arch/i386/kernel/timers/timer_tsc.c =================================================================== --- linux.orig/arch/i386/kernel/timers/timer_tsc.c +++ /dev/null @@ -1,617 +0,0 @@ -/* - * This code largely moved from arch/i386/kernel/time.c. - * See comments there for proper credits. - * - * 2004-06-25 Jesper Juhl - * moved mark_offset_tsc below cpufreq_delayed_get to avoid gcc 3.4 - * failing to inline. 
- */ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -/* processor.h for distable_tsc flag */ -#include - -#include "io_ports.h" -#include "mach_timer.h" - -#include -#include - -#ifdef CONFIG_HPET_TIMER -static unsigned long hpet_usec_quotient; -static unsigned long hpet_last; -static struct timer_opts timer_tsc; -#endif - -static inline void cpufreq_delayed_get(void); - -int tsc_disable __devinitdata = 0; - -static int use_tsc; -/* Number of usecs that the last interrupt was delayed */ -static int delay_at_last_interrupt; - -static unsigned long last_tsc_low; /* lsb 32 bits of Time Stamp Counter */ -static unsigned long last_tsc_high; /* msb 32 bits of Time Stamp Counter */ -static unsigned long long monotonic_base; -static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; - -/* Avoid compensating for lost ticks before TSCs are synched */ -static int detect_lost_ticks; -static int __init start_lost_tick_compensation(void) -{ - detect_lost_ticks = 1; - return 0; -} -late_initcall(start_lost_tick_compensation); - -/* convert from cycles(64bits) => nanoseconds (64bits) - * basic equation: - * ns = cycles / (freq / ns_per_sec) - * ns = cycles * (ns_per_sec / freq) - * ns = cycles * (10^9 / (cpu_khz * 10^3)) - * ns = cycles * (10^6 / cpu_khz) - * - * Then we use scaling math (suggested by george@mvista.com) to get: - * ns = cycles * (10^6 * SC / cpu_khz) / SC - * ns = cycles * cyc2ns_scale / SC - * - * And since SC is a constant power of two, we can convert the div - * into a shift. - * - * We can use khz divisor instead of mhz to keep a better percision, since - * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. - * (mathieu.desnoyers@polymtl.ca) - * - * -johnstul@us.ibm.com "math is hard, lets go shopping!" 
- */ -static unsigned long cyc2ns_scale __read_mostly; -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ - -static inline void set_cyc2ns_scale(unsigned long cpu_khz) -{ - cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz; -} - -static inline unsigned long long cycles_2_ns(unsigned long long cyc) -{ - return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; -} - -static int count2; /* counter for mark_offset_tsc() */ - -/* Cached *multiplier* to convert TSC counts to microseconds. - * (see the equation below). - * Equal to 2^32 * (1 / (clocks per usec) ). - * Initialized in time_init. - */ -static unsigned long fast_gettimeoffset_quotient; - -static unsigned long get_offset_tsc(void) -{ - register unsigned long eax, edx; - - /* Read the Time Stamp Counter */ - - rdtsc(eax,edx); - - /* .. relative to previous jiffy (32 bits is enough) */ - eax -= last_tsc_low; /* tsc_low delta */ - - /* - * Time offset = (tsc_low delta) * fast_gettimeoffset_quotient - * = (tsc_low delta) * (usecs_per_clock) - * = (tsc_low delta) * (usecs_per_jiffy / clocks_per_jiffy) - * - * Using a mull instead of a divl saves up to 31 clock cycles - * in the critical path. - */ - - __asm__("mull %2" - :"=a" (eax), "=d" (edx) - :"rm" (fast_gettimeoffset_quotient), - "0" (eax)); - - /* our adjusted time offset in microseconds */ - return delay_at_last_interrupt + edx; -} - -static unsigned long long monotonic_clock_tsc(void) -{ - unsigned long long last_offset, this_offset, base; - unsigned seq; - - /* atomically read monotonic base & last_offset */ - do { - seq = read_seqbegin(&monotonic_lock); - last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - base = monotonic_base; - } while (read_seqretry(&monotonic_lock, seq)); - - /* Read the Time Stamp Counter */ - rdtscll(this_offset); - - /* return the value in ns */ - return base + cycles_2_ns(this_offset - last_offset); -} - -/* - * Scheduler clock - returns current time in nanosec units. 
- */ -unsigned long long sched_clock(void) -{ - unsigned long long this_offset; - - /* - * In the NUMA case we dont use the TSC as they are not - * synchronized across all CPUs. - */ -#ifndef CONFIG_NUMA - if (!use_tsc) -#endif - /* no locking but a rare wrong value is not a big deal */ - return jiffies_64 * (1000000000 / HZ); - - /* Read the Time Stamp Counter */ - rdtscll(this_offset); - - /* return the value in ns */ - return cycles_2_ns(this_offset); -} - -static void delay_tsc(unsigned long loops) -{ - unsigned long bclock, now; - - rdtscl(bclock); - do - { - rep_nop(); - rdtscl(now); - } while ((now-bclock) < loops); -} - -#ifdef CONFIG_HPET_TIMER -static void mark_offset_tsc_hpet(void) -{ - unsigned long long this_offset, last_offset; - unsigned long offset, temp, hpet_current; - - write_seqlock(&monotonic_lock); - last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - /* - * It is important that these two operations happen almost at - * the same time. We do the RDTSC stuff first, since it's - * faster. To avoid any inconsistencies, we need interrupts - * disabled locally. - */ - /* - * Interrupts are just disabled locally since the timer irq - * has the SA_INTERRUPT flag set. 
-arca - */ - /* read Pentium cycle counter */ - - hpet_current = hpet_readl(HPET_COUNTER); - rdtsc(last_tsc_low, last_tsc_high); - - /* lost tick compensation */ - offset = hpet_readl(HPET_T0_CMP) - hpet_tick; - if (unlikely(((offset - hpet_last) > hpet_tick) && (hpet_last != 0)) - && detect_lost_ticks) { - int lost_ticks = (offset - hpet_last) / hpet_tick; - jiffies_64 += lost_ticks; - } - hpet_last = hpet_current; - - /* update the monotonic base value */ - this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - monotonic_base += cycles_2_ns(this_offset - last_offset); - write_sequnlock(&monotonic_lock); - - /* calculate delay_at_last_interrupt */ - /* - * Time offset = (hpet delta) * ( usecs per HPET clock ) - * = (hpet delta) * ( usecs per tick / HPET clocks per tick) - * = (hpet delta) * ( hpet_usec_quotient ) / (2^32) - * Where, - * hpet_usec_quotient = (2^32 * usecs per tick)/HPET clocks per tick - */ - delay_at_last_interrupt = hpet_current - offset; - ASM_MUL64_REG(temp, delay_at_last_interrupt, - hpet_usec_quotient, delay_at_last_interrupt); -} -#endif - - -#ifdef CONFIG_CPU_FREQ -#include - -static unsigned int cpufreq_delayed_issched = 0; -static unsigned int cpufreq_init = 0; -static struct work_struct cpufreq_delayed_get_work; - -static void handle_cpufreq_delayed_get(void *v) -{ - unsigned int cpu; - for_each_online_cpu(cpu) { - cpufreq_get(cpu); - } - cpufreq_delayed_issched = 0; -} - -/* if we notice lost ticks, schedule a call to cpufreq_get() as it tries - * to verify the CPU frequency the timing core thinks the CPU is running - * at is still correct. - */ -static inline void cpufreq_delayed_get(void) -{ - if (cpufreq_init && !cpufreq_delayed_issched) { - cpufreq_delayed_issched = 1; - printk(KERN_DEBUG "Losing some ticks... 
checking if CPU frequency changed.\n"); - schedule_work(&cpufreq_delayed_get_work); - } -} - -/* If the CPU frequency is scaled, TSC-based delays will need a different - * loops_per_jiffy value to function properly. - */ - -static unsigned int ref_freq = 0; -static unsigned long loops_per_jiffy_ref = 0; - -#ifndef CONFIG_SMP -static unsigned long fast_gettimeoffset_ref = 0; -static unsigned int cpu_khz_ref = 0; -#endif - -static int -time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, - void *data) -{ - struct cpufreq_freqs *freq = data; - - if (val != CPUFREQ_RESUMECHANGE && val != CPUFREQ_SUSPENDCHANGE) - write_seqlock_irq(&xtime_lock); - if (!ref_freq) { - if (!freq->old){ - ref_freq = freq->new; - goto end; - } - ref_freq = freq->old; - loops_per_jiffy_ref = cpu_data[freq->cpu].loops_per_jiffy; -#ifndef CONFIG_SMP - fast_gettimeoffset_ref = fast_gettimeoffset_quotient; - cpu_khz_ref = cpu_khz; -#endif - } - - if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || - (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || - (val == CPUFREQ_RESUMECHANGE)) { - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) - cpu_data[freq->cpu].loops_per_jiffy = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); -#ifndef CONFIG_SMP - if (cpu_khz) - cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new); - if (use_tsc) { - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) { - fast_gettimeoffset_quotient = cpufreq_scale(fast_gettimeoffset_ref, freq->new, ref_freq); - set_cyc2ns_scale(cpu_khz); - } - } -#endif - } - -end: - if (val != CPUFREQ_RESUMECHANGE && val != CPUFREQ_SUSPENDCHANGE) - write_sequnlock_irq(&xtime_lock); - - return 0; -} - -static struct notifier_block time_cpufreq_notifier_block = { - .notifier_call = time_cpufreq_notifier -}; - - -static int __init cpufreq_tsc(void) -{ - int ret; - INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get, NULL); - ret = cpufreq_register_notifier(&time_cpufreq_notifier_block, - CPUFREQ_TRANSITION_NOTIFIER); - 
if (!ret) - cpufreq_init = 1; - return ret; -} -core_initcall(cpufreq_tsc); - -#else /* CONFIG_CPU_FREQ */ -static inline void cpufreq_delayed_get(void) { return; } -#endif - -int recalibrate_cpu_khz(void) -{ -#ifndef CONFIG_SMP - unsigned int cpu_khz_old = cpu_khz; - - if (cpu_has_tsc) { - local_irq_disable(); - init_cpu_khz(); - local_irq_enable(); - cpu_data[0].loops_per_jiffy = - cpufreq_scale(cpu_data[0].loops_per_jiffy, - cpu_khz_old, - cpu_khz); - return 0; - } else - return -ENODEV; -#else - return -ENODEV; -#endif -} -EXPORT_SYMBOL(recalibrate_cpu_khz); - -static void mark_offset_tsc(void) -{ - unsigned long lost,delay; - unsigned long delta = last_tsc_low; - int count; - int countmp; - static int count1 = 0; - unsigned long long this_offset, last_offset; - static int lost_count = 0; - - write_seqlock(&monotonic_lock); - last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - /* - * It is important that these two operations happen almost at - * the same time. We do the RDTSC stuff first, since it's - * faster. To avoid any inconsistencies, we need interrupts - * disabled locally. - */ - - /* - * Interrupts are just disabled locally since the timer irq - * has the SA_INTERRUPT flag set. -arca - */ - - /* read Pentium cycle counter */ - - rdtsc(last_tsc_low, last_tsc_high); - - spin_lock(&i8253_lock); - outb_p(0x00, PIT_MODE); /* latch the count ASAP */ - - count = inb_p(PIT_CH0); /* read the latched count */ - count |= inb(PIT_CH0) << 8; - - /* - * VIA686a test code... 
reset the latch if count > max + 1 - * from timer_pit.c - cjb - */ - if (count > LATCH) { - outb_p(0x34, PIT_MODE); - outb_p(LATCH & 0xff, PIT_CH0); - outb(LATCH >> 8, PIT_CH0); - count = LATCH - 1; - } - - spin_unlock(&i8253_lock); - - if (pit_latch_buggy) { - /* get center value of last 3 time lutch */ - if ((count2 >= count && count >= count1) - || (count1 >= count && count >= count2)) { - count2 = count1; count1 = count; - } else if ((count1 >= count2 && count2 >= count) - || (count >= count2 && count2 >= count1)) { - countmp = count;count = count2; - count2 = count1;count1 = countmp; - } else { - count2 = count1; count1 = count; count = count1; - } - } - - /* lost tick compensation */ - delta = last_tsc_low - delta; - { - register unsigned long eax, edx; - eax = delta; - __asm__("mull %2" - :"=a" (eax), "=d" (edx) - :"rm" (fast_gettimeoffset_quotient), - "0" (eax)); - delta = edx; - } - delta += delay_at_last_interrupt; - lost = delta/(1000000/HZ); - delay = delta%(1000000/HZ); - if (lost >= 2 && detect_lost_ticks) { - jiffies_64 += lost-1; - - /* sanity check to ensure we're not always losing ticks */ - if (lost_count++ > 100) { - printk(KERN_WARNING "Losing too many ticks!\n"); - printk(KERN_WARNING "TSC cannot be used as a timesource. \n"); - printk(KERN_WARNING "Possible reasons for this are:\n"); - printk(KERN_WARNING " You're running with Speedstep,\n"); - printk(KERN_WARNING " You don't have DMA enabled for your hard disk (see hdparm),\n"); - printk(KERN_WARNING " Incorrect TSC synchronization on an SMP system (see dmesg).\n"); - printk(KERN_WARNING "Falling back to a sane timesource now.\n"); - - clock_fallback(); - } - /* ... 
but give the TSC a fair chance */ - if (lost_count > 25) - cpufreq_delayed_get(); - } else - lost_count = 0; - /* update the monotonic base value */ - this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - monotonic_base += cycles_2_ns(this_offset - last_offset); - write_sequnlock(&monotonic_lock); - - /* calculate delay_at_last_interrupt */ - count = ((LATCH-1) - count) * TICK_SIZE; - delay_at_last_interrupt = (count + LATCH/2) / LATCH; - - /* catch corner case where tick rollover occured - * between tsc and pit reads (as noted when - * usec delta is > 90% # of usecs/tick) - */ - if (lost && abs(delay - delay_at_last_interrupt) > (900000/HZ)) - jiffies_64++; -} - -static int __init init_tsc(char* override) -{ - - /* check clock override */ - if (override[0] && strncmp(override,"tsc",3)) { -#ifdef CONFIG_HPET_TIMER - if (is_hpet_enabled()) { - printk(KERN_ERR "Warning: clock= override failed. Defaulting to tsc\n"); - } else -#endif - { - return -ENODEV; - } - } - - /* - * If we have APM enabled or the CPU clock speed is variable - * (CPU stops clock on HLT or slows clock to save power) - * then the TSC timestamps may diverge by up to 1 jiffy from - * 'real time' but nothing will break. - * The most frequent case is that the CPU is "woken" from a halt - * state by the timer interrupt itself, so we get 0 error. In the - * rare cases where a driver would "wake" the CPU and request a - * timestamp, the maximum error is < 1 jiffy. But timestamps are - * still perfectly ordered. - * Note that the TSC counter will be reset if APM suspends - * to disk; this won't break the kernel, though, 'cuz we're - * smart. See arch/i386/kernel/apm.c. - */ - /* - * Firstly we have to do a CPU check for chips with - * a potentially buggy TSC. At this point we haven't run - * the ident/bugs checks so we must run this hook as it - * may turn off the TSC flag. - * - * NOTE: this doesn't yet handle SMP 486 machines where only - * some CPU's have a TSC. 
Thats never worked and nobody has - * moaned if you have the only one in the world - you fix it! - */ - - count2 = LATCH; /* initialize counter for mark_offset_tsc() */ - - if (cpu_has_tsc) { - unsigned long tsc_quotient; -#ifdef CONFIG_HPET_TIMER - if (is_hpet_enabled() && hpet_use_timer) { - unsigned long result, remain; - printk("Using TSC for gettimeofday\n"); - tsc_quotient = calibrate_tsc_hpet(NULL); - timer_tsc.mark_offset = &mark_offset_tsc_hpet; - /* - * Math to calculate hpet to usec multiplier - * Look for the comments at get_offset_tsc_hpet() - */ - ASM_DIV64_REG(result, remain, hpet_tick, - 0, KERNEL_TICK_USEC); - if (remain > (hpet_tick >> 1)) - result++; /* rounding the result */ - - hpet_usec_quotient = result; - } else -#endif - { - tsc_quotient = calibrate_tsc(); - } - - if (tsc_quotient) { - fast_gettimeoffset_quotient = tsc_quotient; - use_tsc = 1; - /* - * We could be more selective here I suspect - * and just enable this for the next intel chips ? - */ - /* report CPU clock rate in Hz. - * The formula is (10^6 * 2^32) / (2^32 * 1 / (clocks/us)) = - * clock/second. Our precision is about 100 ppm. - */ - { unsigned long eax=0, edx=1000; - __asm__("divl %2" - :"=a" (cpu_khz), "=d" (edx) - :"r" (tsc_quotient), - "0" (eax), "1" (edx)); - printk("Detected %u.%03u MHz processor.\n", - cpu_khz / 1000, cpu_khz % 1000); - } - set_cyc2ns_scale(cpu_khz); - return 0; - } - } - return -ENODEV; -} - -static int tsc_resume(void) -{ - write_seqlock(&monotonic_lock); - /* Assume this is the last mark offset time */ - rdtsc(last_tsc_low, last_tsc_high); -#ifdef CONFIG_HPET_TIMER - if (is_hpet_enabled() && hpet_use_timer) - hpet_last = hpet_readl(HPET_COUNTER); -#endif - write_sequnlock(&monotonic_lock); - return 0; -} - -#ifndef CONFIG_X86_TSC -/* disable flag for tsc. 
Takes effect by clearing the TSC cpu flag - * in cpu/common.c */ -static int __init tsc_setup(char *str) -{ - tsc_disable = 1; - return 1; -} -#else -static int __init tsc_setup(char *str) -{ - printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, " - "cannot disable TSC.\n"); - return 1; -} -#endif -__setup("notsc", tsc_setup); - - - -/************************************************************/ - -/* tsc timer_opts struct */ -static struct timer_opts timer_tsc = { - .name = "tsc", - .mark_offset = mark_offset_tsc, - .get_offset = get_offset_tsc, - .monotonic_clock = monotonic_clock_tsc, - .delay = delay_tsc, - .read_timer = read_timer_tsc, - .resume = tsc_resume, -}; - -struct init_timer_opts __initdata timer_tsc_init = { - .init = init_tsc, - .opts = &timer_tsc, -}; Index: linux/arch/i386/kernel/tsc.c =================================================================== --- /dev/null +++ linux/arch/i386/kernel/tsc.c @@ -0,0 +1,478 @@ +/* + * This code largely moved from arch/i386/kernel/timer/timer_tsc.c + * which was originally moved from arch/i386/kernel/time.c. + * See comments there for proper credits. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "mach_timer.h" + +/* + * On some systems the TSC frequency does not + * change with the cpu frequency. So we need + * an extra value to store the TSC freq + */ +unsigned int tsc_khz; + +int tsc_disable __cpuinitdata = 0; + +#ifdef CONFIG_X86_TSC +static int __init tsc_setup(char *str) +{ + printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, " + "cannot disable TSC.\n"); + return 1; +} +#else +/* + * disable flag for tsc. 
Takes effect by clearing the TSC cpu flag + * in cpu/common.c + */ +static int __init tsc_setup(char *str) +{ + tsc_disable = 1; + + return 1; +} +#endif + +__setup("notsc", tsc_setup); + +/* + * code to mark and check if the TSC is unstable + * due to cpufreq or due to unsynced TSCs + */ +static int tsc_unstable; + +static inline int check_tsc_unstable(void) +{ + return tsc_unstable; +} + +void mark_tsc_unstable(void) +{ + tsc_unstable = 1; +} +EXPORT_SYMBOL_GPL(mark_tsc_unstable); + +/* Accelerators for sched_clock() + * convert from cycles(64bits) => nanoseconds (64bits) + * basic equation: + * ns = cycles / (freq / ns_per_sec) + * ns = cycles * (ns_per_sec / freq) + * ns = cycles * (10^9 / (cpu_khz * 10^3)) + * ns = cycles * (10^6 / cpu_khz) + * + * Then we use scaling math (suggested by george@mvista.com) to get: + * ns = cycles * (10^6 * SC / cpu_khz) / SC + * ns = cycles * cyc2ns_scale / SC + * + * And since SC is a constant power of two, we can convert the div + * into a shift. + * + * We can use khz divisor instead of mhz to keep a better precision, since + * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. + * (mathieu.desnoyers@polymtl.ca) + * + * -johnstul@us.ibm.com "math is hard, lets go shopping!" + */ +static unsigned long cyc2ns_scale __read_mostly; + +#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ + +static inline void set_cyc2ns_scale(unsigned long cpu_khz) +{ + cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz; +} + +static inline unsigned long long cycles_2_ns(unsigned long long cyc) +{ + return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; +} + +/* + * Scheduler clock - returns current time in nanosec units. + */ +unsigned long long sched_clock(void) +{ + unsigned long long this_offset; + + /* + * in the NUMA case we don't use the TSC as they are not + * synchronized across all CPUs. 
+ */ +#ifndef CONFIG_NUMA + if (!cpu_khz || check_tsc_unstable()) +#endif + /* no locking but a rare wrong value is not a big deal */ + return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); + + /* read the Time Stamp Counter: */ + rdtscll(this_offset); + + /* return the value in ns */ + return cycles_2_ns(this_offset); +} + +static unsigned long calculate_cpu_khz(void) +{ + unsigned long long start, end; + unsigned long count; + u64 delta64; + int i; + unsigned long flags; + + local_irq_save(flags); + + /* run 3 times to ensure the cache is warm */ + for (i = 0; i < 3; i++) { + mach_prepare_counter(); + rdtscll(start); + mach_countup(&count); + rdtscll(end); + } + /* + * Error: ECTCNEVERSET + * The CTC wasn't reliable: we got a hit on the very first read, + * or the CPU was so fast/slow that the quotient wouldn't fit in + * 32 bits.. + */ + if (count <= 1) + goto err; + + delta64 = end - start; + + /* cpu freq too fast: */ + if (delta64 > (1ULL<<32)) + goto err; + + /* cpu freq too slow: */ + if (delta64 <= CALIBRATE_TIME_MSEC) + goto err; + + delta64 += CALIBRATE_TIME_MSEC/2; /* round for do_div */ + do_div(delta64,CALIBRATE_TIME_MSEC); + + local_irq_restore(flags); + return (unsigned long)delta64; +err: + local_irq_restore(flags); + return 0; +} + +int recalibrate_cpu_khz(void) +{ +#ifndef CONFIG_SMP + unsigned long cpu_khz_old = cpu_khz; + + if (cpu_has_tsc) { + cpu_khz = calculate_cpu_khz(); + tsc_khz = cpu_khz; + cpu_data[0].loops_per_jiffy = + cpufreq_scale(cpu_data[0].loops_per_jiffy, + cpu_khz_old, cpu_khz); + return 0; + } else + return -ENODEV; +#else + return -ENODEV; +#endif +} + +EXPORT_SYMBOL(recalibrate_cpu_khz); + +void tsc_init(void) +{ + if (!cpu_has_tsc || tsc_disable) + return; + + cpu_khz = calculate_cpu_khz(); + tsc_khz = cpu_khz; + + if (!cpu_khz) + return; + + printk("Detected %lu.%03lu MHz processor.\n", + (unsigned long)cpu_khz / 1000, + (unsigned long)cpu_khz % 1000); + + set_cyc2ns_scale(cpu_khz); + use_tsc_delay(); +} + +#ifdef 
CONFIG_CPU_FREQ + +static unsigned int cpufreq_delayed_issched = 0; +static unsigned int cpufreq_init = 0; +static struct work_struct cpufreq_delayed_get_work; + +static void handle_cpufreq_delayed_get(void *v) +{ + unsigned int cpu; + + for_each_online_cpu(cpu) + cpufreq_get(cpu); + + cpufreq_delayed_issched = 0; +} + +/* + * if we notice cpufreq oddness, schedule a call to cpufreq_get() as it tries + * to verify the CPU frequency the timing core thinks the CPU is running + * at is still correct. + */ +static inline void cpufreq_delayed_get(void) +{ + if (cpufreq_init && !cpufreq_delayed_issched) { + cpufreq_delayed_issched = 1; + printk(KERN_DEBUG "Checking if CPU frequency changed.\n"); + schedule_work(&cpufreq_delayed_get_work); + } +} + +/* + * if the CPU frequency is scaled, TSC-based delays will need a different + * loops_per_jiffy value to function properly. + */ +static unsigned int ref_freq = 0; +static unsigned long loops_per_jiffy_ref = 0; +static unsigned long cpu_khz_ref = 0; + +static int +time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) +{ + struct cpufreq_freqs *freq = data; + + if (val != CPUFREQ_RESUMECHANGE && val != CPUFREQ_SUSPENDCHANGE) + write_seqlock_irq(&xtime_lock); + + if (!ref_freq) { + if (!freq->old){ + ref_freq = freq->new; + goto end; + } + ref_freq = freq->old; + loops_per_jiffy_ref = cpu_data[freq->cpu].loops_per_jiffy; + cpu_khz_ref = cpu_khz; + } + + if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || + (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || + (val == CPUFREQ_RESUMECHANGE)) { + if (!(freq->flags & CPUFREQ_CONST_LOOPS)) + cpu_data[freq->cpu].loops_per_jiffy = + cpufreq_scale(loops_per_jiffy_ref, + ref_freq, freq->new); + + if (cpu_khz) { + + if (num_online_cpus() == 1) + cpu_khz = cpufreq_scale(cpu_khz_ref, + ref_freq, freq->new); + if (!(freq->flags & CPUFREQ_CONST_LOOPS)) { + tsc_khz = cpu_khz; + set_cyc2ns_scale(cpu_khz); + /* + * TSC based sched_clock turns + * to junk w/ 
cpufreq + */ + mark_tsc_unstable(); + } + } + } +end: + if (val != CPUFREQ_RESUMECHANGE && val != CPUFREQ_SUSPENDCHANGE) + write_sequnlock_irq(&xtime_lock); + + return 0; +} + +static struct notifier_block time_cpufreq_notifier_block = { + .notifier_call = time_cpufreq_notifier +}; + +static int __init cpufreq_tsc(void) +{ + int ret; + + INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get, NULL); + ret = cpufreq_register_notifier(&time_cpufreq_notifier_block, + CPUFREQ_TRANSITION_NOTIFIER); + if (!ret) + cpufreq_init = 1; + + return ret; +} + +core_initcall(cpufreq_tsc); + +#endif + +/* clock source code */ + +static unsigned long current_tsc_khz = 0; +static int tsc_update_callback(void); + +static cycle_t read_tsc(void) +{ + cycle_t ret; + + rdtscll(ret); + + return ret; +} + +static struct clocksource clocksource_tsc = { + .name = "tsc", + .rating = 300, + .read = read_tsc, + .mask = CLOCKSOURCE_MASK(64), + .mult = 0, /* to be set */ + .shift = 22, + .update_callback = tsc_update_callback, + .is_continuous = 1, +}; + +static int tsc_update_callback(void) +{ + int change = 0; + + /* check to see if we should switch to the safe clocksource: */ + if (clocksource_tsc.rating != 50 && check_tsc_unstable()) { + clocksource_tsc.rating = 50; + clocksource_reselect(); + change = 1; + } + + /* only update if tsc_khz has changed: */ + if (current_tsc_khz != tsc_khz) { + current_tsc_khz = tsc_khz; + clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz, + clocksource_tsc.shift); + change = 1; + } + + return change; +} + +static int __init dmi_mark_tsc_unstable(struct dmi_system_id *d) +{ + printk(KERN_NOTICE "%s detected: marking TSC unstable.\n", + d->ident); + mark_tsc_unstable(); + return 0; +} + +/* List of systems that have known TSC problems */ +static struct dmi_system_id __initdata bad_tsc_dmi_table[] = { + { + .callback = dmi_mark_tsc_unstable, + .ident = "IBM Thinkpad 380XD", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), + 
DMI_MATCH(DMI_BOARD_NAME, "2635FA0"), + }, + }, + {} +}; + +#define TSC_FREQ_CHECK_INTERVAL (10*MSEC_PER_SEC) /* 10sec in MS */ +static struct timer_list verify_tsc_freq_timer; + +/* XXX - Probably should add locking */ +static void verify_tsc_freq(unsigned long unused) +{ + static u64 last_tsc; + static unsigned long last_jiffies; + + u64 now_tsc, interval_tsc; + unsigned long now_jiffies, interval_jiffies; + + + if (check_tsc_unstable()) + return; + + rdtscll(now_tsc); + now_jiffies = jiffies; + + if (!last_jiffies) { + goto out; + } + + interval_jiffies = now_jiffies - last_jiffies; + interval_tsc = now_tsc - last_tsc; + interval_tsc *= HZ; + do_div(interval_tsc, cpu_khz*1000); + + if (interval_tsc < (interval_jiffies * 3 / 4)) { + printk("TSC appears to be running slowly. " + "Marking it as unstable\n"); + mark_tsc_unstable(); + return; + } + +out: + last_tsc = now_tsc; + last_jiffies = now_jiffies; + /* set us up to go off on the next interval: */ + mod_timer(&verify_tsc_freq_timer, + jiffies + msecs_to_jiffies(TSC_FREQ_CHECK_INTERVAL)); +} + +/* + * Make an educated guess if the TSC is trustworthy and synchronized + * over all CPUs. + */ +static __init int unsynchronized_tsc(void) +{ + /* + * Intel systems are normally all synchronized. 
+ * Exceptions must mark TSC as unstable: + */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + return 0; + + /* assume multi socket systems are not synchronized: */ + return num_possible_cpus() > 1; +} + +static int __init init_tsc_clocksource(void) +{ + + if (cpu_has_tsc && tsc_khz && !tsc_disable) { + /* check blacklist */ + dmi_check_system(bad_tsc_dmi_table); + + if (unsynchronized_tsc()) /* mark unstable if unsynced */ + mark_tsc_unstable(); + current_tsc_khz = tsc_khz; + clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz, + clocksource_tsc.shift); + /* lower the rating if we already know it's unstable: */ + if (check_tsc_unstable()) + clocksource_tsc.rating = 50; + + init_timer(&verify_tsc_freq_timer); + verify_tsc_freq_timer.function = verify_tsc_freq; + verify_tsc_freq_timer.expires = + jiffies + msecs_to_jiffies(TSC_FREQ_CHECK_INTERVAL); + add_timer(&verify_tsc_freq_timer); + + return clocksource_register(&clocksource_tsc); + } + + return 0; +} + +module_init(init_tsc_clocksource); Index: linux/arch/i386/lib/delay.c =================================================================== --- linux.orig/arch/i386/lib/delay.c +++ linux/arch/i386/lib/delay.c @@ -10,43 +10,92 @@ * we have to worry about. 
*/ +#include #include #include #include -#include + #include #include #include #ifdef CONFIG_SMP -#include +# include #endif -extern struct timer_opts* timer; +/* simple loop based delay: */ +static void delay_loop(unsigned long loops) +{ + int d0; + + __asm__ __volatile__( + "\tjmp 1f\n" + ".align 16\n" + "1:\tjmp 2f\n" + ".align 16\n" + "2:\tdecl %0\n\tjns 2b" + :"=&a" (d0) + :"0" (loops)); +} + +/* TSC based delay: */ +static void delay_tsc(unsigned long loops) +{ + unsigned long bclock, now; + + rdtscl(bclock); + do { + rep_nop(); + rdtscl(now); + } while ((now-bclock) < loops); +} + +/* + * Since we calibrate only once at boot, this + * function should be set once at boot and not changed + */ +static void (*delay_fn)(unsigned long) = delay_loop; + +void use_tsc_delay(void) +{ + delay_fn = delay_tsc; +} + +int read_current_timer(unsigned long *timer_val) +{ + if (delay_fn == delay_tsc) { + rdtscl(*timer_val); + return 0; + } + return -1; +} void __delay(unsigned long loops) { - cur_timer->delay(loops); + delay_fn(loops); } inline void __const_udelay(unsigned long xloops) { int d0; + xloops *= 4; __asm__("mull %0" :"=d" (xloops), "=&a" (d0) - :"1" (xloops),"0" (cpu_data[raw_smp_processor_id()].loops_per_jiffy * (HZ/4))); - __delay(++xloops); + :"1" (xloops), "0" + (cpu_data[raw_smp_processor_id()].loops_per_jiffy * (HZ/4))); + + __delay(++xloops); } void __udelay(unsigned long usecs) { - __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */ + __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */ } void __ndelay(unsigned long nsecs) { - __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */ + __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */ } EXPORT_SYMBOL(__delay); Index: linux/arch/ia64/kernel/process.c =================================================================== --- linux.orig/arch/ia64/kernel/process.c +++ linux/arch/ia64/kernel/process.c @@ -272,9 +272,9 @@ cpu_idle (void) /* 
endless idle loop with no priority at all */ while (1) { if (can_do_pal_halt) - clear_thread_flag(TIF_POLLING_NRFLAG); + current_thread_info()->status &= ~TS_POLLING; else - set_thread_flag(TIF_POLLING_NRFLAG); + current_thread_info()->status |= TS_POLLING; if (!need_resched()) { void (*idle)(void); Index: linux/arch/powerpc/kernel/time.c =================================================================== --- linux.orig/arch/powerpc/kernel/time.c +++ linux/arch/powerpc/kernel/time.c @@ -535,7 +535,7 @@ static __inline__ void timer_recalc_offs if (__USE_RTC()) return; - tlen = current_tick_length(); + tlen = current_tick_length(SHIFT_SCALE - 10); offset = cur_tb - do_gtod.varp->tb_orig_stamp; if (tlen == last_tick_len && offset < 0x80000000u) return; Index: linux/arch/x86_64/Kconfig =================================================================== --- linux.orig/arch/x86_64/Kconfig +++ linux/arch/x86_64/Kconfig @@ -24,6 +24,14 @@ config X86 bool default y +config GENERIC_TIME + bool + default y + +config VSYSCALL_GTOD + bool + default y + config SEMAPHORE_SLEEPERS bool default y @@ -230,6 +238,8 @@ config MTRR See for more information. 
+source "kernel/time/Kconfig" + config SMP bool "Symmetric multi-processing support" ---help--- Index: linux/arch/x86_64/kernel/Makefile =================================================================== --- linux.orig/arch/x86_64/kernel/Makefile +++ linux/arch/x86_64/kernel/Makefile @@ -31,7 +31,6 @@ obj-$(CONFIG_EARLY_PRINTK) += early_prin obj-$(CONFIG_GART_IOMMU) += pci-gart.o aperture.o obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o obj-$(CONFIG_KPROBES) += kprobes.o -obj-$(CONFIG_X86_PM_TIMER) += pmtimer.o obj-$(CONFIG_X86_VSMP) += vsmp.o obj-$(CONFIG_MODULES) += module.o Index: linux/arch/x86_64/kernel/apic.c =================================================================== --- linux.orig/arch/x86_64/kernel/apic.c +++ linux/arch/x86_64/kernel/apic.c @@ -721,6 +721,7 @@ static void __setup_APIC_LVTT(unsigned i apic_write(APIC_TMICT, clocks/APIC_DIVISOR); } +extern unsigned long hpet_address; static void setup_APIC_timer(unsigned int clocks) { unsigned long flags; @@ -728,7 +729,7 @@ static void setup_APIC_timer(unsigned in local_irq_save(flags); /* wait for irq slice */ - if (vxtime.hpet_address && hpet_use_timer) { + if (hpet_address && hpet_use_timer) { int trigger = hpet_readl(HPET_T0_CMP); while (hpet_readl(HPET_COUNTER) >= trigger) /* do nothing */ ; @@ -747,6 +748,8 @@ static void setup_APIC_timer(unsigned in } while (c2 - c1 < 300); } __setup_APIC_LVTT(clocks); +/* XXX Fixme! */ +#if 0 /* Turn off PIT interrupt if we use APIC timer as main timer. Only works with the PM timer right now TBD fix it for HPET too. 
*/ @@ -757,6 +760,7 @@ static void setup_APIC_timer(unsigned in stop_timer_interrupt(); apic_runs_main_timer++; } +#endif local_irq_restore(flags); } @@ -787,7 +791,7 @@ static int __init calibrate_APIC_clock(v __setup_APIC_LVTT(1000000000); apic_start = apic_read(APIC_TMCCT); -#ifdef CONFIG_X86_PM_TIMER +#ifdef CONFIG_X86_PM_TIMER_XXXTHISNEEDSFIXING if (apic_calibrate_pmtmr && pmtmr_ioport) { pmtimer_wait(5000); /* 5ms wait */ apic = apic_read(APIC_TMCCT); Index: linux/arch/x86_64/kernel/pmtimer.c =================================================================== --- linux.orig/arch/x86_64/kernel/pmtimer.c +++ /dev/null @@ -1,127 +0,0 @@ -/* Ported over from i386 by AK, original copyright was: - * - * (C) Dominik Brodowski 2003 - * - * Driver to use the Power Management Timer (PMTMR) available in some - * southbridges as primary timing source for the Linux kernel. - * - * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c, - * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4. - * - * This file is licensed under the GPL v2. - * - * Dropped all the hardware bug workarounds for now. Hopefully they - * are not needed on 64bit chipsets. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* The I/O port the PMTMR resides at. - * The location is detected during setup_arch(), - * in arch/i386/kernel/acpi/boot.c */ -u32 pmtmr_ioport; - -/* value of the Power timer at last timer interrupt */ -static u32 offset_delay; -static u32 last_pmtmr_tick; - -#define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */ - -static inline u32 cyc2us(u32 cycles) -{ - /* The Power Management Timer ticks at 3.579545 ticks per microsecond. - * 1 / PM_TIMER_FREQUENCY == 0.27936511 =~ 286/1024 [error: 0.024%] - * - * Even with HZ = 100, delta is at maximum 35796 ticks, so it can - * easily be multiplied with 286 (=0x11E) without having to fear - * u32 overflows. 
- */ - cycles *= 286; - return (cycles >> 10); -} - -int pmtimer_mark_offset(void) -{ - static int first_run = 1; - unsigned long tsc; - u32 lost; - - u32 tick = inl(pmtmr_ioport); - u32 delta; - - delta = cyc2us((tick - last_pmtmr_tick) & ACPI_PM_MASK); - - last_pmtmr_tick = tick; - monotonic_base += delta * NSEC_PER_USEC; - - delta += offset_delay; - - lost = delta / (USEC_PER_SEC / HZ); - offset_delay = delta % (USEC_PER_SEC / HZ); - - rdtscll(tsc); - vxtime.last_tsc = tsc - offset_delay * (u64)cpu_khz / 1000; - - /* don't calculate delay for first run, - or if we've got less then a tick */ - if (first_run || (lost < 1)) { - first_run = 0; - offset_delay = 0; - } - - return lost - 1; -} - -static unsigned pmtimer_wait_tick(void) -{ - u32 a, b; - for (a = b = inl(pmtmr_ioport) & ACPI_PM_MASK; - a == b; - b = inl(pmtmr_ioport) & ACPI_PM_MASK) - cpu_relax(); - return b; -} - -/* note: wait time is rounded up to one tick */ -void pmtimer_wait(unsigned us) -{ - u32 a, b; - a = pmtimer_wait_tick(); - do { - b = inl(pmtmr_ioport); - cpu_relax(); - } while (cyc2us(b - a) < us); -} - -void pmtimer_resume(void) -{ - last_pmtmr_tick = inl(pmtmr_ioport); -} - -unsigned int do_gettimeoffset_pm(void) -{ - u32 now, offset, delta = 0; - - offset = last_pmtmr_tick; - now = inl(pmtmr_ioport); - delta = (now - offset) & ACPI_PM_MASK; - - return offset_delay + cyc2us(delta); -} - - -static int __init nopmtimer_setup(char *s) -{ - pmtmr_ioport = 0; - return 1; -} - -__setup("nopmtimer", nopmtimer_setup); Index: linux/arch/x86_64/kernel/process.c =================================================================== --- linux.orig/arch/x86_64/kernel/process.c +++ linux/arch/x86_64/kernel/process.c @@ -111,16 +111,17 @@ static void default_idle(void) { local_irq_enable(); - clear_thread_flag(TIF_POLLING_NRFLAG); + current_thread_info()->status &= ~TS_POLLING; smp_mb__after_clear_bit(); while (!need_resched()) { local_irq_disable(); - if (!need_resched()) + if (!hrtimer_stop_sched_tick()) 
safe_halt(); else local_irq_enable(); + hrtimer_restart_sched_tick(); } - set_thread_flag(TIF_POLLING_NRFLAG); + current_thread_info()->status |= TS_POLLING; } /* @@ -203,8 +204,7 @@ static inline void play_dead(void) */ void cpu_idle (void) { - set_thread_flag(TIF_POLLING_NRFLAG); - + current_thread_info()->status |= TS_POLLING; /* endless idle loop with no priority at all */ while (1) { while (!need_resched()) { Index: linux/arch/x86_64/kernel/time.c =================================================================== --- linux.orig/arch/x86_64/kernel/time.c +++ linux/arch/x86_64/kernel/time.c @@ -38,18 +38,14 @@ #include #include #include +#include #ifdef CONFIG_X86_LOCAL_APIC #include #endif -#ifdef CONFIG_CPU_FREQ -static void cpufreq_delayed_get(void); -#endif extern void i8254_timer_resume(void); extern int using_apic_timer; -static char *time_init_gtod(void); - DEFINE_SPINLOCK(rtc_lock); DEFINE_SPINLOCK(i8253_lock); @@ -58,122 +54,15 @@ static int notsc __initdata = 0; #undef HPET_HACK_ENABLE_DANGEROUS -unsigned int cpu_khz; /* TSC clocks / usec, not used here */ +unsigned int cpu_khz; /* CPU clocks / usec, not used here */ +unsigned int tsc_khz; /* TSC clocks / usec, not used here */ +unsigned long hpet_address; static unsigned long hpet_period; /* fsecs / HPET clock */ unsigned long hpet_tick; /* HPET clocks / interrupt */ int hpet_use_timer; /* Use counter of hpet for time keeping, otherwise PIT */ -unsigned long vxtime_hz = PIT_TICK_RATE; int report_lost_ticks; /* command line option */ -unsigned long long monotonic_base; - -struct vxtime_data __vxtime __section_vxtime; /* for vsyscalls */ - -volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; -unsigned long __wall_jiffies __section_wall_jiffies = INITIAL_JIFFIES; -struct timespec __xtime __section_xtime; -struct timezone __sys_tz __section_sys_tz; - -/* - * do_gettimeoffset() returns microseconds since last timer interrupt was - * triggered by hardware. 
A memory read of HPET is slower than a register read - * of TSC, but much more reliable. It's also synchronized to the timer - * interrupt. Note that do_gettimeoffset() may return more than hpet_tick, if a - * timer interrupt has happened already, but vxtime.trigger wasn't updated yet. - * This is not a problem, because jiffies hasn't updated either. They are bound - * together by xtime_lock. - */ - -static inline unsigned int do_gettimeoffset_tsc(void) -{ - unsigned long t; - unsigned long x; - t = get_cycles_sync(); - if (t < vxtime.last_tsc) - t = vxtime.last_tsc; /* hack */ - x = ((t - vxtime.last_tsc) * vxtime.tsc_quot) >> 32; - return x; -} - -static inline unsigned int do_gettimeoffset_hpet(void) -{ - /* cap counter read to one tick to avoid inconsistencies */ - unsigned long counter = hpet_readl(HPET_COUNTER) - vxtime.last; - return (min(counter,hpet_tick) * vxtime.quot) >> 32; -} - -unsigned int (*do_gettimeoffset)(void) = do_gettimeoffset_tsc; - -/* - * This version of gettimeofday() has microsecond resolution and better than - * microsecond precision, as we're using at least a 10 MHz (usually 14.31818 - * MHz) HPET timer. - */ - -void do_gettimeofday(struct timeval *tv) -{ - unsigned long seq, t; - unsigned int sec, usec; - - do { - seq = read_seqbegin(&xtime_lock); - - sec = xtime.tv_sec; - usec = xtime.tv_nsec / 1000; - - /* i386 does some correction here to keep the clock - monotonous even when ntpd is fixing drift. - But they didn't work for me, there is a non monotonic - clock anyways with ntp. - I dropped all corrections now until a real solution can - be found. Note when you fix it here you need to do the same - in arch/x86_64/kernel/vsyscall.c and export all needed - variables in vmlinux.lds. 
-AK */ - - t = (jiffies - wall_jiffies) * (1000000L / HZ) + - do_gettimeoffset(); - usec += t; - - } while (read_seqretry(&xtime_lock, seq)); - - tv->tv_sec = sec + usec / 1000000; - tv->tv_usec = usec % 1000000; -} - -EXPORT_SYMBOL(do_gettimeofday); - -/* - * settimeofday() first undoes the correction that gettimeofday would do - * on the time, and then saves it. This is ugly, but has been like this for - * ages already. - */ - -int do_settimeofday(struct timespec *tv) -{ - time_t wtm_sec, sec = tv->tv_sec; - long wtm_nsec, nsec = tv->tv_nsec; - - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - write_seqlock_irq(&xtime_lock); - - nsec -= do_gettimeoffset() * 1000 + - (jiffies - wall_jiffies) * (NSEC_PER_SEC/HZ); - - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); - set_normalized_timespec(&xtime, sec, nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - - ntp_clear(); - - write_sequnlock_irq(&xtime_lock); - clock_was_set(); - return 0; -} - -EXPORT_SYMBOL(do_settimeofday); +volatile unsigned long jiffies = INITIAL_JIFFIES; unsigned long profile_pc(struct pt_regs *regs) { @@ -269,83 +158,9 @@ static void set_rtc_mmss(unsigned long n } -/* monotonic_clock(): returns # of nanoseconds passed since time_init() - * Note: This function is required to return accurate - * time even in the absence of multiple timer ticks. 
- */ -unsigned long long monotonic_clock(void) -{ - unsigned long seq; - u32 last_offset, this_offset, offset; - unsigned long long base; - - if (vxtime.mode == VXTIME_HPET) { - do { - seq = read_seqbegin(&xtime_lock); - - last_offset = vxtime.last; - base = monotonic_base; - this_offset = hpet_readl(HPET_COUNTER); - } while (read_seqretry(&xtime_lock, seq)); - offset = (this_offset - last_offset); - offset *= (NSEC_PER_SEC/HZ) / hpet_tick; - } else { - do { - seq = read_seqbegin(&xtime_lock); - - last_offset = vxtime.last_tsc; - base = monotonic_base; - } while (read_seqretry(&xtime_lock, seq)); - this_offset = get_cycles_sync(); - offset = (this_offset - last_offset)*1000 / cpu_khz; - } - return base + offset; -} -EXPORT_SYMBOL(monotonic_clock); - -static noinline void handle_lost_ticks(int lost, struct pt_regs *regs) -{ - static long lost_count; - static int warned; - if (report_lost_ticks) { - printk(KERN_WARNING "time.c: Lost %d timer tick(s)! ", lost); - print_symbol("rip %s)\n", regs->rip); - } - - if (lost_count == 1000 && !warned) { - printk(KERN_WARNING "warning: many lost ticks.\n" - KERN_WARNING "Your time source seems to be instable or " - "some driver is hogging interupts\n"); - print_symbol("rip %s\n", regs->rip); - if (vxtime.mode == VXTIME_TSC && vxtime.hpet_address) { - printk(KERN_WARNING "Falling back to HPET\n"); - if (hpet_use_timer) - vxtime.last = hpet_readl(HPET_T0_CMP) - - hpet_tick; - else - vxtime.last = hpet_readl(HPET_COUNTER); - vxtime.mode = VXTIME_HPET; - do_gettimeoffset = do_gettimeoffset_hpet; - } - /* else should fall back to PIT, but code missing. */ - warned = 1; - } else - lost_count++; - -#ifdef CONFIG_CPU_FREQ - /* In some cases the CPU can change frequency without us noticing - Give cpufreq a change to catch up. 
*/ - if ((lost_count+1) % 25 == 0) - cpufreq_delayed_get(); -#endif -} - void main_timer_handler(struct pt_regs *regs) { static unsigned long rtc_update = 0; - unsigned long tsc; - int delay = 0, offset = 0, lost = 0; - /* * Here we are in the timer irq handler. We have irqs locally disabled (so we * don't need spin_lock_irqsave()) but we don't know if the timer_bh is running @@ -355,67 +170,6 @@ void main_timer_handler(struct pt_regs * write_seqlock(&xtime_lock); - if (vxtime.hpet_address) - offset = hpet_readl(HPET_COUNTER); - - if (hpet_use_timer) { - /* if we're using the hpet timer functionality, - * we can more accurately know the counter value - * when the timer interrupt occured. - */ - offset = hpet_readl(HPET_T0_CMP) - hpet_tick; - delay = hpet_readl(HPET_COUNTER) - offset; - } else if (!pmtmr_ioport) { - spin_lock(&i8253_lock); - outb_p(0x00, 0x43); - delay = inb_p(0x40); - delay |= inb(0x40) << 8; - spin_unlock(&i8253_lock); - delay = LATCH - 1 - delay; - } - - tsc = get_cycles_sync(); - - if (vxtime.mode == VXTIME_HPET) { - if (offset - vxtime.last > hpet_tick) { - lost = (offset - vxtime.last) / hpet_tick - 1; - } - - monotonic_base += - (offset - vxtime.last)*(NSEC_PER_SEC/HZ) / hpet_tick; - - vxtime.last = offset; -#ifdef CONFIG_X86_PM_TIMER - } else if (vxtime.mode == VXTIME_PMTMR) { - lost = pmtimer_mark_offset(); -#endif - } else { - offset = (((tsc - vxtime.last_tsc) * - vxtime.tsc_quot) >> 32) - (USEC_PER_SEC / HZ); - - if (offset < 0) - offset = 0; - - if (offset > (USEC_PER_SEC / HZ)) { - lost = offset / (USEC_PER_SEC / HZ); - offset %= (USEC_PER_SEC / HZ); - } - - monotonic_base += (tsc - vxtime.last_tsc)*1000000/cpu_khz ; - - vxtime.last_tsc = tsc - vxtime.quot * delay / vxtime.tsc_quot; - - if ((((tsc - vxtime.last_tsc) * - vxtime.tsc_quot) >> 32) < offset) - vxtime.last_tsc = tsc - - (((long) offset << 32) / vxtime.tsc_quot) - 1; - } - - if (lost > 0) { - handle_lost_ticks(lost, regs); - jiffies += lost; - } - /* * Do the timer stuff. 
*/ @@ -484,15 +238,6 @@ unsigned long long sched_clock(void) { unsigned long a = 0; -#if 0 - /* Don't do a HPET read here. Using TSC always is much faster - and HPET may not be mapped yet when the scheduler first runs. - Disadvantage is a small drift between CPUs in some configurations, - but that should be tolerable. */ - if (__vxtime.mode == VXTIME_HPET) - return (hpet_readl(HPET_COUNTER) * vxtime.quot) >> 32; -#endif - /* Could do CPU core sync here. Opteron can execute rdtsc speculatively, which means it is not completely exact and may not be monotonous between CPUs. But the errors should be too small to matter for scheduling @@ -502,7 +247,20 @@ unsigned long long sched_clock(void) return cycles_2_ns(a); } -static unsigned long get_cmos_time(void) +static int tsc_unstable; + +static inline int check_tsc_unstable(void) +{ + return tsc_unstable; +} + +void mark_tsc_unstable(void) +{ + tsc_unstable = 1; +} +EXPORT_SYMBOL_GPL(mark_tsc_unstable); + +unsigned long get_cmos_time(void) { unsigned int year, mon, day, hour, min, sec; unsigned long flags; @@ -580,24 +338,6 @@ static void handle_cpufreq_delayed_get(v cpufreq_delayed_issched = 0; } -/* if we notice lost ticks, schedule a call to cpufreq_get() as it tries - * to verify the CPU frequency the timing core thinks the CPU is running - * at is still correct. - */ -static void cpufreq_delayed_get(void) -{ - static int warned; - if (cpufreq_init && !cpufreq_delayed_issched) { - cpufreq_delayed_issched = 1; - if (!warned) { - warned = 1; - printk(KERN_DEBUG - "Losing some ticks... 
checking if CPU frequency changed.\n"); - } - schedule_work(&cpufreq_delayed_get_work); - } -} - static unsigned int ref_freq = 0; static unsigned long loops_per_jiffy_ref = 0; @@ -632,8 +372,10 @@ static int time_cpufreq_notifier(struct cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new); - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) - vxtime.tsc_quot = (1000L << 32) / cpu_khz; + if (!(freq->flags & CPUFREQ_CONST_LOOPS)) { + tsc_khz = cpu_khz; + } + } set_cyc2ns_scale(cpu_khz_ref); @@ -725,7 +467,7 @@ static __init int late_hpet_init(void) struct hpet_data hd; unsigned int ntimer; - if (!vxtime.hpet_address) + if (!hpet_address) return 0; memset(&hd, 0, sizeof (hd)); @@ -738,7 +480,7 @@ static __init int late_hpet_init(void) * Register with driver. * Timer0 and Timer1 is used by platform. */ - hd.hd_phys_address = vxtime.hpet_address; + hd.hd_phys_address = hpet_address; hd.hd_address = (void __iomem *)fix_to_virt(FIX_HPET_BASE); hd.hd_nirqs = ntimer; hd.hd_flags = HPET_DATA_PLATFORM; @@ -807,10 +549,10 @@ static int hpet_init(void) { unsigned int id; - if (!vxtime.hpet_address) + if (!hpet_address) return -1; - set_fixmap_nocache(FIX_HPET_BASE, vxtime.hpet_address); - __set_fixmap(VSYSCALL_HPET, vxtime.hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE); + set_fixmap_nocache(FIX_HPET_BASE, hpet_address); + __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE); /* * Read the period, compute tick and quotient. 
@@ -865,7 +607,7 @@ void __init pit_stop_interrupt(void) void __init stop_timer_interrupt(void) { char *name; - if (vxtime.hpet_address) { + if (hpet_address) { name = "HPET"; hpet_timer_stop_set_go(0); } else { @@ -888,22 +630,21 @@ static struct irqaction irq0 = { void __init time_init(void) { char *timename; - char *gtod; #ifdef HPET_HACK_ENABLE_DANGEROUS - if (!vxtime.hpet_address) { + if (!hpet_address) { printk(KERN_WARNING "time.c: WARNING: Enabling HPET base " "manually!\n"); outl(0x800038a0, 0xcf8); outl(0xff000001, 0xcfc); outl(0x800038a0, 0xcf8); - vxtime.hpet_address = inl(0xcfc) & 0xfffffffe; + hpet_address = inl(0xcfc) & 0xfffffffe; printk(KERN_WARNING "time.c: WARNING: Enabled HPET " - "at %#lx.\n", vxtime.hpet_address); + "at %#lx.\n", hpet_address); } #endif if (nohpet) - vxtime.hpet_address = 0; + hpet_address = 0; xtime.tv_sec = get_cmos_time(); xtime.tv_nsec = 0; @@ -911,39 +652,27 @@ void __init time_init(void) set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec); - if (!hpet_init()) - vxtime_hz = (1000000000000000L + hpet_period / 2) / hpet_period; - else - vxtime.hpet_address = 0; + if (hpet_init()) + hpet_address = 0; if (hpet_use_timer) { /* set tick_nsec to use the proper rate for HPET */ tick_nsec = TICK_NSEC_HPET; cpu_khz = hpet_calibrate_tsc(); timename = "HPET"; -#ifdef CONFIG_X86_PM_TIMER - } else if (pmtmr_ioport && !vxtime.hpet_address) { - vxtime_hz = PM_TIMER_FREQUENCY; - timename = "PM"; - pit_init(); - cpu_khz = pit_calibrate_tsc(); -#endif } else { pit_init(); cpu_khz = pit_calibrate_tsc(); timename = "PIT"; } + tsc_khz = cpu_khz; + - vxtime.mode = VXTIME_TSC; - gtod = time_init_gtod(); + if (unsynchronized_tsc()) + mark_tsc_unstable(); - printk(KERN_INFO "time.c: Using %ld.%06ld MHz WALL %s GTOD %s timer.\n", - vxtime_hz / 1000000, vxtime_hz % 1000000, timename, gtod); printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n", cpu_khz / 1000, cpu_khz % 1000); - vxtime.quot = (1000000L << 32) / 
vxtime_hz; - vxtime.tsc_quot = (1000L << 32) / cpu_khz; - vxtime.last_tsc = get_cycles_sync(); setup_irq(0, &irq0); set_cyc2ns_scale(cpu_khz); @@ -967,39 +696,6 @@ __cpuinit int unsynchronized_tsc(void) return num_present_cpus() > 1; } -/* - * Decide what mode gettimeofday should use. - */ -__init static char *time_init_gtod(void) -{ - char *timetype; - - if (unsynchronized_tsc()) - notsc = 1; - if (vxtime.hpet_address && notsc) { - timetype = hpet_use_timer ? "HPET" : "PIT/HPET"; - if (hpet_use_timer) - vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick; - else - vxtime.last = hpet_readl(HPET_COUNTER); - vxtime.mode = VXTIME_HPET; - do_gettimeoffset = do_gettimeoffset_hpet; -#ifdef CONFIG_X86_PM_TIMER - /* Using PM for gettimeofday is quite slow, but we have no other - choice because the TSC is too unreliable on some systems. */ - } else if (pmtmr_ioport && !vxtime.hpet_address && notsc) { - timetype = "PM"; - do_gettimeoffset = do_gettimeoffset_pm; - vxtime.mode = VXTIME_PMTMR; - sysctl_vsyscall = 0; - printk(KERN_INFO "Disabling vsyscall due to use of PM timer\n"); -#endif - } else { - timetype = hpet_use_timer ? 
"HPET/TSC" : "PIT/TSC"; - vxtime.mode = VXTIME_TSC; - } - return timetype; -} __setup("report_lost_ticks", time_setup); @@ -1030,7 +726,7 @@ static int timer_resume(struct sys_devic unsigned long ctime = get_cmos_time(); unsigned long sleep_length = (ctime - sleep_start) * HZ; - if (vxtime.hpet_address) + if (hpet_address) hpet_reenable(); else i8254_timer_resume(); @@ -1039,21 +735,9 @@ static int timer_resume(struct sys_devic write_seqlock_irqsave(&xtime_lock,flags); xtime.tv_sec = sec; xtime.tv_nsec = 0; - if (vxtime.mode == VXTIME_HPET) { - if (hpet_use_timer) - vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick; - else - vxtime.last = hpet_readl(HPET_COUNTER); -#ifdef CONFIG_X86_PM_TIMER - } else if (vxtime.mode == VXTIME_PMTMR) { - pmtimer_resume(); -#endif - } else - vxtime.last_tsc = get_cycles_sync(); - write_sequnlock_irqrestore(&xtime_lock,flags); jiffies += sleep_length; wall_jiffies += sleep_length; - monotonic_base += sleep_length * (NSEC_PER_SEC/HZ); + write_sequnlock_irqrestore(&xtime_lock,flags); touch_softlockup_watchdog(); return 0; } @@ -1115,7 +799,7 @@ static unsigned int hpet_t1_cmp; /* cach int is_hpet_enabled(void) { - return vxtime.hpet_address != 0; + return hpet_address != 0; } /* @@ -1320,3 +1004,137 @@ int __init notsc_setup(char *s) } __setup("notsc", notsc_setup); + + +/* clock source code: */ + +static unsigned long current_tsc_khz = 0; + +static int tsc_update_callback(void); + +static cycle_t read_tsc(void) +{ + cycle_t ret; + rdtscll(ret); + return ret; +} + +static cycle_t __vsyscall_fn vread_tsc(void) +{ + cycle_t ret; + rdtscll(ret); + return ret; +} + +static struct clocksource clocksource_tsc = { + .name = "tsc", + .rating = 300, + .read = read_tsc, + .mask = (cycle_t)-1, + .mult = 0, /* to be set */ + .shift = 22, + .update_callback = tsc_update_callback, + .is_continuous = 1, + .vread = vread_tsc, +}; + +static int tsc_update_callback(void) +{ + int change = 0; + + /* check to see if we should switch to the safe 
clocksource: */ + if (clocksource_tsc.rating != 50 && check_tsc_unstable()) { + clocksource_tsc.rating = 50; + clocksource_reselect(); + change = 1; + } + + /* only update if tsc_khz has changed: */ + if (current_tsc_khz != tsc_khz){ + current_tsc_khz = tsc_khz; + clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz, + clocksource_tsc.shift); + change = 1; + } + return change; +} + +static int __init init_tsc_clocksource(void) +{ + if (!notsc && tsc_khz) { + current_tsc_khz = tsc_khz; + clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz, + clocksource_tsc.shift); + return clocksource_register(&clocksource_tsc); + } + return 0; +} + +module_init(init_tsc_clocksource); + + +#define HPET_MASK 0xFFFFFFFF +#define HPET_SHIFT 22 + +/* FSEC = 10^-15 NSEC = 10^-9 */ +#define FSEC_PER_NSEC 1000000 + +static void *hpet_ptr; + +static cycle_t read_hpet(void) +{ + return (cycle_t)readl(hpet_ptr); +} + +static cycle_t __vsyscall_fn vread_hpet(void) +{ + return (cycle_t)readl((void *)fix_to_virt(VSYSCALL_HPET) + 0xf0); +} + +struct clocksource clocksource_hpet = { + .name = "hpet", + .rating = 250, + .read = read_hpet, + .mask = (cycle_t)HPET_MASK, + .mult = 0, /* set below */ + .shift = HPET_SHIFT, + .is_continuous = 1, + .vread = vread_hpet, +}; + +static int __init init_hpet_clocksource(void) +{ + unsigned long hpet_period; + void __iomem *hpet_base; + u64 tmp; + + if (!hpet_address) + return -ENODEV; + + /* calculate the hpet address: */ + hpet_base = + (void __iomem*)ioremap_nocache(hpet_address, HPET_MMAP_SIZE); + hpet_ptr = hpet_base + HPET_COUNTER; + + /* calculate the frequency: */ + hpet_period = readl(hpet_base + HPET_PERIOD); + + /* + * hpet period is in femto seconds per cycle + * so we need to convert this to ns/cyc units + * approximated by mult/2^shift + * + * fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift + * fsec/cyc * 1ns/1000000fsec * 2^shift = mult + * fsec/cyc * 2^shift * 1nsec/1000000fsec = mult + * (fsec/cyc << shift)/1000000 = mult
+ * (hpet_period << shift)/FSEC_PER_NSEC = mult + */ + tmp = (u64)hpet_period << HPET_SHIFT; + do_div(tmp, FSEC_PER_NSEC); + clocksource_hpet.mult = (u32)tmp; + + return clocksource_register(&clocksource_hpet); +} + +module_init(init_hpet_clocksource); Index: linux/arch/x86_64/kernel/vmlinux.lds.S =================================================================== --- linux.orig/arch/x86_64/kernel/vmlinux.lds.S +++ linux/arch/x86_64/kernel/vmlinux.lds.S @@ -85,27 +85,11 @@ SECTIONS __vsyscall_0 = VSYSCALL_VIRT_ADDR; . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .xtime_lock : AT(VLOAD(.xtime_lock)) { *(.xtime_lock) } - xtime_lock = VVIRT(.xtime_lock); - - .vxtime : AT(VLOAD(.vxtime)) { *(.vxtime) } - vxtime = VVIRT(.vxtime); - - .wall_jiffies : AT(VLOAD(.wall_jiffies)) { *(.wall_jiffies) } - wall_jiffies = VVIRT(.wall_jiffies); - - .sys_tz : AT(VLOAD(.sys_tz)) { *(.sys_tz) } - sys_tz = VVIRT(.sys_tz); - - .sysctl_vsyscall : AT(VLOAD(.sysctl_vsyscall)) { *(.sysctl_vsyscall) } - sysctl_vsyscall = VVIRT(.sysctl_vsyscall); - - .xtime : AT(VLOAD(.xtime)) { *(.xtime) } - xtime = VVIRT(.xtime); - + .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) } . 
= ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) } - jiffies = VVIRT(.jiffies); + .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) { *(.vsyscall_gtod_data) } + vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); + .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { *(.vsyscall_1) } .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) { *(.vsyscall_2) } Index: linux/arch/x86_64/kernel/vsyscall.c =================================================================== --- linux.orig/arch/x86_64/kernel/vsyscall.c +++ linux/arch/x86_64/kernel/vsyscall.c @@ -26,65 +26,106 @@ #include #include #include +#include #include #include #include +#include #include #include #include #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) -int __sysctl_vsyscall __section_sysctl_vsyscall = 1; -seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED; +struct vsyscall_gtod_data_t { + seqlock_t lock; + int sysctl_enabled; + struct timeval wall_time_tv; + struct timezone sys_tz; + cycle_t offset_base; + struct clocksource clock; +}; + +struct vsyscall_gtod_data_t __vsyscall_gtod_data __section_vsyscall_gtod_data = { + .lock = SEQLOCK_UNLOCKED, + .sysctl_enabled = 1, +}; +extern struct vsyscall_gtod_data_t vsyscall_gtod_data; -#include -static __always_inline void timeval_normalize(struct timeval * tv) +void update_vsyscall(struct clocksource* clock, cycle_t base) { - time_t __sec; + unsigned long flags; - __sec = tv->tv_usec / 1000000; - if (__sec) { - tv->tv_usec %= 1000000; - tv->tv_sec += __sec; - } + write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); + /* copy vsyscall data */ + vsyscall_gtod_data.clock = *clock; + vsyscall_gtod_data.offset_base = base; + vsyscall_gtod_data.wall_time_tv.tv_sec = xtime.tv_sec; + vsyscall_gtod_data.wall_time_tv.tv_usec = xtime.tv_nsec/1000; + vsyscall_gtod_data.sys_tz = sys_tz; + + write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); +} + +/* + * XXX - this is 
ugly. gettimeofday() has a label in it so we can't + * call it twice. + */ +static __always_inline int syscall_gtod(struct timeval *tv, struct timezone *tz) +{ + int ret; + + asm volatile("syscall" + : "=a" (ret) + : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) + : __syscall_clobber); + + return ret; } + static __always_inline void do_vgettimeofday(struct timeval * tv) { - long sequence, t; - unsigned long sec, usec; + cycle_t now, base, mask, cycle_delta; + unsigned long seq, mult, shift, nsec_delta; do { - sequence = read_seqbegin(&__xtime_lock); - - sec = __xtime.tv_sec; - usec = (__xtime.tv_nsec / 1000) + - (__jiffies - __wall_jiffies) * (1000000 / HZ); - - if (__vxtime.mode != VXTIME_HPET) { - t = get_cycles_sync(); - if (t < __vxtime.last_tsc) - t = __vxtime.last_tsc; - usec += ((t - __vxtime.last_tsc) * - __vxtime.tsc_quot) >> 32; - /* See comment in x86_64 do_gettimeofday. */ - } else { - usec += ((readl((void *)fix_to_virt(VSYSCALL_HPET) + 0xf0) - - __vxtime.last) * __vxtime.quot) >> 32; + seq = read_seqbegin(&__vsyscall_gtod_data.lock); + if (!__vsyscall_gtod_data.clock.vread) { + syscall_gtod(tv, NULL); + return; } - } while (read_seqretry(&__xtime_lock, sequence)); - tv->tv_sec = sec + usec / 1000000; - tv->tv_usec = usec % 1000000; + now = __vsyscall_gtod_data.clock.vread(); + + base = __vsyscall_gtod_data.offset_base; + mask = __vsyscall_gtod_data.clock.mask; + mult = __vsyscall_gtod_data.clock.mult; + shift = __vsyscall_gtod_data.clock.shift; + + *tv = __vsyscall_gtod_data.wall_time_tv; + + } while (read_seqretry(&__vsyscall_gtod_data.lock, seq)); + + /* calculate interval: */ + cycle_delta = (now - base) & mask; + /* convert to nsecs: */ + nsec_delta = (cycle_delta * mult) >> shift; + + /* convert to usecs, add to the timeval and carry whole seconds so tv_usec ends up < USEC_PER_SEC: */ + tv->tv_usec += nsec_delta / NSEC_PER_USEC; + while (tv->tv_usec >= USEC_PER_SEC) { + tv->tv_sec += 1; + tv->tv_usec -= USEC_PER_SEC; + } } /* RED-PEN may want to readd seq locking, but then the variable should be write-once.
*/ static __always_inline void do_get_tz(struct timezone * tz) { - *tz = __sys_tz; + *tz = __vsyscall_gtod_data.sys_tz; } static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) @@ -107,7 +148,7 @@ static __always_inline long time_syscall int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) { - if (unlikely(!__sysctl_vsyscall)) + if (unlikely(!__vsyscall_gtod_data.sysctl_enabled)) return gettimeofday(tv,tz); if (tv) do_vgettimeofday(tv); @@ -120,11 +161,11 @@ int __vsyscall(0) vgettimeofday(struct t * unlikely */ time_t __vsyscall(1) vtime(time_t *t) { - if (unlikely(!__sysctl_vsyscall)) + if (unlikely(!__vsyscall_gtod_data.sysctl_enabled)) return time_syscall(t); else if (t) - *t = __xtime.tv_sec; - return __xtime.tv_sec; + *t = __vsyscall_gtod_data.wall_time_tv.tv_sec; + return __vsyscall_gtod_data.wall_time_tv.tv_sec; } long __vsyscall(2) venosys_0(void) @@ -163,7 +204,7 @@ static int vsyscall_sysctl_change(ctl_ta ret = -ENOMEM; goto out; } - if (!sysctl_vsyscall) { + if (!vsyscall_gtod_data.sysctl_enabled) { *map1 = SYSCALL; *map2 = SYSCALL; } else { @@ -186,7 +227,7 @@ static int vsyscall_sysctl_nostrat(ctl_t static ctl_table kernel_table2[] = { { .ctl_name = 99, .procname = "vsyscall64", - .data = &sysctl_vsyscall, .maxlen = sizeof(int), .mode = 0644, + .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int), .mode = 0644, .strategy = vsyscall_sysctl_nostrat, .proc_handler = vsyscall_sysctl_change }, { 0, } Index: linux/drivers/Makefile =================================================================== --- linux.orig/drivers/Makefile +++ linux/drivers/Makefile @@ -58,6 +58,7 @@ obj-$(CONFIG_RTC_LIB) += rtc/ obj-$(CONFIG_I2C) += i2c/ obj-$(CONFIG_W1) += w1/ obj-$(CONFIG_HWMON) += hwmon/ +obj-$(CONFIG_GENERIC_TIME) += clocksource/ obj-$(CONFIG_PHONE) += telephony/ obj-$(CONFIG_MD) += md/ obj-$(CONFIG_BT) += bluetooth/ Index: linux/drivers/acpi/processor_idle.c 
=================================================================== --- linux.orig/drivers/acpi/processor_idle.c +++ linux/drivers/acpi/processor_idle.c @@ -206,11 +206,11 @@ acpi_processor_power_activate(struct acp static void acpi_safe_halt(void) { - clear_thread_flag(TIF_POLLING_NRFLAG); + current_thread_info()->status &= ~TS_POLLING; smp_mb__after_clear_bit(); if (!need_resched()) safe_halt(); - set_thread_flag(TIF_POLLING_NRFLAG); + current_thread_info()->status |= TS_POLLING; } static atomic_t c3_cpu_count; @@ -330,10 +330,10 @@ static void acpi_processor_idle(void) * Invoke the current Cx state to put the processor to sleep. */ if (cx->type == ACPI_STATE_C2 || cx->type == ACPI_STATE_C3) { - clear_thread_flag(TIF_POLLING_NRFLAG); + current_thread_info()->status &= ~TS_POLLING; smp_mb__after_clear_bit(); if (need_resched()) { - set_thread_flag(TIF_POLLING_NRFLAG); + current_thread_info()->status |= TS_POLLING; local_irq_enable(); return; } @@ -369,9 +369,14 @@ static void acpi_processor_idle(void) t2 = inl(acpi_fadt.xpm_tmr_blk.address); /* Get end time (ticks) */ t2 = inl(acpi_fadt.xpm_tmr_blk.address); + +#ifdef CONFIG_GENERIC_TIME + /* TSC halts in C2, so notify users */ + mark_tsc_unstable(); +#endif /* Re-enable interrupts */ local_irq_enable(); - set_thread_flag(TIF_POLLING_NRFLAG); + current_thread_info()->status |= TS_POLLING; /* Compute time (ticks) that we were actually asleep */ sleep_ticks = ticks_elapsed(t1, t2) - cx->latency_ticks - C2_OVERHEAD; @@ -409,9 +414,13 @@ static void acpi_processor_idle(void) ACPI_MTX_DO_NOT_LOCK); } +#ifdef CONFIG_GENERIC_TIME + /* TSC halts in C3, so notify users */ + mark_tsc_unstable(); +#endif /* Re-enable interrupts */ local_irq_enable(); - set_thread_flag(TIF_POLLING_NRFLAG); + current_thread_info()->status |= TS_POLLING; /* Compute time (ticks) that we were actually asleep */ sleep_ticks = ticks_elapsed(t1, t2) - cx->latency_ticks - C3_OVERHEAD; Index: linux/drivers/char/hangcheck-timer.c 
=================================================================== --- linux.orig/drivers/char/hangcheck-timer.c +++ linux/drivers/char/hangcheck-timer.c @@ -117,12 +117,12 @@ __setup("hcheck_reboot", hangcheck_parse __setup("hcheck_dump_tasks", hangcheck_parse_dump_tasks); #endif /* not MODULE */ -#if defined(CONFIG_X86) || defined(CONFIG_S390) +#if defined(CONFIG_X86_64) || defined(CONFIG_S390) # define HAVE_MONOTONIC # define TIMER_FREQ 1000000000ULL #elif defined(CONFIG_IA64) # define TIMER_FREQ ((unsigned long long)local_cpu_data->itc_freq) -#elif defined(CONFIG_PPC64) +#else # define TIMER_FREQ (HZ*loops_per_jiffy) #endif Index: linux/drivers/char/ipmi/ipmi_si_intf.c =================================================================== --- linux.orig/drivers/char/ipmi/ipmi_si_intf.c +++ linux/drivers/char/ipmi/ipmi_si_intf.c @@ -55,23 +55,6 @@ #include #include #include -#ifdef CONFIG_HIGH_RES_TIMERS -#include -# if defined(schedule_next_int) -/* Old high-res timer code, do translations. */ -# define get_arch_cycles(a) quick_update_jiffies_sub(a) -# define arch_cycles_per_jiffy cycles_per_jiffies -# endif -static inline void add_usec_to_timer(struct timer_list *t, long v) -{ - t->arch_cycle_expires += nsec_to_arch_cycle(v * 1000); - while (t->arch_cycle_expires >= arch_cycles_per_jiffy) - { - t->expires++; - t->arch_cycle_expires -= arch_cycles_per_jiffy; - } -} -#endif #include #include #include @@ -836,32 +819,6 @@ static int initialized = 0; /* Must be called with interrupts off and with the si_lock held. */ static void si_restart_short_timer(struct smi_info *smi_info) { -#if defined(CONFIG_HIGH_RES_TIMERS) - unsigned long flags; - unsigned long jiffies_now; - unsigned long seq; - - if (del_timer(&(smi_info->si_timer))) { - /* If we don't delete the timer, then it will go off - immediately, anyway. So we only process if we - actually delete the timer. 
*/ - - do { - seq = read_seqbegin_irqsave(&xtime_lock, flags); - jiffies_now = jiffies; - smi_info->si_timer.expires = jiffies_now; - smi_info->si_timer.arch_cycle_expires - = get_arch_cycles(jiffies_now); - } while (read_seqretry_irqrestore(&xtime_lock, seq, flags)); - - add_usec_to_timer(&smi_info->si_timer, SI_SHORT_TIMEOUT_USEC); - - add_timer(&(smi_info->si_timer)); - spin_lock_irqsave(&smi_info->count_lock, flags); - smi_info->timeout_restarts++; - spin_unlock_irqrestore(&smi_info->count_lock, flags); - } -#endif } static void smi_timeout(unsigned long data) @@ -904,31 +861,15 @@ static void smi_timeout(unsigned long da /* If the state machine asks for a short delay, then shorten the timer timeout. */ if (smi_result == SI_SM_CALL_WITH_DELAY) { -#if defined(CONFIG_HIGH_RES_TIMERS) - unsigned long seq; -#endif spin_lock_irqsave(&smi_info->count_lock, flags); smi_info->short_timeouts++; spin_unlock_irqrestore(&smi_info->count_lock, flags); -#if defined(CONFIG_HIGH_RES_TIMERS) - do { - seq = read_seqbegin_irqsave(&xtime_lock, flags); - smi_info->si_timer.expires = jiffies; - smi_info->si_timer.arch_cycle_expires - = get_arch_cycles(smi_info->si_timer.expires); - } while (read_seqretry_irqrestore(&xtime_lock, seq, flags)); - add_usec_to_timer(&smi_info->si_timer, SI_SHORT_TIMEOUT_USEC); -#else smi_info->si_timer.expires = jiffies + 1; -#endif } else { spin_lock_irqsave(&smi_info->count_lock, flags); smi_info->long_timeouts++; spin_unlock_irqrestore(&smi_info->count_lock, flags); smi_info->si_timer.expires = jiffies + SI_TIMEOUT_JIFFIES; -#if defined(CONFIG_HIGH_RES_TIMERS) - smi_info->si_timer.arch_cycle_expires = 0; -#endif } do_add_timer: Index: linux/drivers/clocksource/Makefile =================================================================== --- /dev/null +++ linux/drivers/clocksource/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_X86_CYCLONE_TIMER) += cyclone.o +obj-$(CONFIG_X86_PM_TIMER) += acpi_pm.o Index: linux/drivers/clocksource/acpi_pm.c 
=================================================================== --- /dev/null +++ linux/drivers/clocksource/acpi_pm.c @@ -0,0 +1,177 @@ +/* + * linux/drivers/clocksource/acpi_pm.c + * + * This file contains the ACPI PM based clocksource. + * + * This code was largely moved from the i386 timer_pm.c file + * which was (C) Dominik Brodowski 2003 + * and contained the following comments: + * + * Driver to use the Power Management Timer (PMTMR) available in some + * southbridges as primary timing source for the Linux kernel. + * + * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c, + * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4. + * + * This file is licensed under the GPL v2. + */ + +#include +#include +#include +#include +#include + +/* Number of PMTMR ticks expected during calibration run */ +#define PMTMR_TICKS_PER_SEC 3579545 + +/* + * The I/O port the PMTMR resides at. + * The location is detected during setup_arch(), + * in arch/i386/acpi/boot.c + */ +u32 pmtmr_ioport __read_mostly; + +#define ACPI_PM_MASK CLOCKSOURCE_MASK(24) /* limit it to 24 bits */ + +static inline u32 read_pmtmr(void) +{ + /* mask the output to 24 bits */ + return inl(pmtmr_ioport) & ACPI_PM_MASK; +} + +static cycle_t acpi_pm_read_verified(void) +{ + u32 v1 = 0, v2 = 0, v3 = 0; + + /* + * It has been reported that because of various broken + * chipsets (ICH4, PIIX4 and PIIX4E) where the ACPI PM clock + * source is not latched, you must read it multiple + * times to ensure a safe value is read: + */ + do { + v1 = read_pmtmr(); + v2 = read_pmtmr(); + v3 = read_pmtmr(); + } while ((v1 > v2 && v1 < v3) || (v2 > v3 && v2 < v1) + || (v3 > v1 && v3 < v2)); + + return (cycle_t)v2; +} + +static cycle_t acpi_pm_read(void) +{ + return (cycle_t)read_pmtmr(); +} + +static struct clocksource clocksource_acpi_pm = { + .name = "acpi_pm", + .rating = 200, + .read = acpi_pm_read, + .mask = (cycle_t)ACPI_PM_MASK, + .mult = 0, /*to be calculated*/ + .shift = 22, + 
.is_continuous = 1, +}; + + +#ifdef CONFIG_PCI +static int acpi_pm_good; +static int __init acpi_pm_good_setup(char *__str) +{ + acpi_pm_good = 1; + return 1; +} +__setup("acpi_pm_good", acpi_pm_good_setup); + +static inline void acpi_pm_need_workaround(void) +{ + clocksource_acpi_pm.read = acpi_pm_read_verified; + clocksource_acpi_pm.rating = 110; +} + +/* + * PIIX4 Errata: + * + * The power management timer may return improper results when read. + * Although the timer value settles properly after incrementing, + * while incrementing there is a 3 ns window every 69.8 ns where the + * timer value is indeterminate (a 4.2% chance that the data will be + * incorrect when read). As a result, the ACPI free running count up + * timer specification is violated due to erroneous reads. + */ +static void __devinit acpi_pm_check_blacklist(struct pci_dev *dev) +{ + u8 rev; + + if (acpi_pm_good) + return; + + pci_read_config_byte(dev, PCI_REVISION_ID, &rev); + /* the bug has been fixed in PIIX4M */ + if (rev < 3) { + printk(KERN_WARNING "* Found PM-Timer Bug on the chipset." + " Due to workarounds for a bug,\n" + "* this clock source is slow. Consider trying" + " other clock sources\n"); + + acpi_pm_need_workaround(); + } +} +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82371AB_3, + acpi_pm_check_blacklist); + +static void __devinit acpi_pm_check_graylist(struct pci_dev *dev) +{ + if (acpi_pm_good) + return; + + printk(KERN_WARNING "* The chipset may have PM-Timer Bug. Due to" + " workarounds for a bug,\n" + "* this clock source is slow. 
If you are sure your timer" + " does not have\n" + "* this bug, please use \"acpi_pm_good\" to disable the" + " workaround\n"); + + acpi_pm_need_workaround(); +} +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801DB_0, + acpi_pm_check_graylist); +#endif + + +static int __init init_acpi_pm_clocksource(void) +{ + u32 value1, value2; + unsigned int i; + + if (!pmtmr_ioport) /* no PM timer I/O port was set, nothing to register */ + return -ENODEV; + + clocksource_acpi_pm.mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, + clocksource_acpi_pm.shift); + + /* "verify" this timing source: */ + value1 = read_pmtmr(); + for (i = 0; i < 10000; i++) { + value2 = read_pmtmr(); + if (value2 == value1) /* counter has not changed yet, read again */ + continue; + if (value2 > value1) /* counter moved forward */ + goto pm_good; + if ((value2 < value1) && ((value2) < 0xFFF)) /* smaller but close to zero: accepted (presumably a wrap of the masked counter) */ + goto pm_good; + printk(KERN_INFO "PM-Timer had inconsistent results:" + " %#x, %#x - aborting.\n", value1, value2); /* %#x already prints the 0x prefix */ + return -EINVAL; + } + printk(KERN_INFO "PM-Timer had no reasonable result:" + " %#x - aborting.\n", value1); /* value never changed across all 10000 reads */ + return -ENODEV; + +pm_good: + return clocksource_register(&clocksource_acpi_pm); +} + +module_init(init_acpi_pm_clocksource); Index: linux/drivers/clocksource/cyclone.c =================================================================== --- /dev/null +++ linux/drivers/clocksource/cyclone.c @@ -0,0 +1,119 @@ +#include +#include +#include +#include +#include + +#include +#include + +#include "mach_timer.h" + +#define CYCLONE_CBAR_ADDR 0xFEB00CD0 /* base address ptr */ +#define CYCLONE_PMCC_OFFSET 0x51A0 /* offset to control register */ +#define CYCLONE_MPCS_OFFSET 0x51A8 /* offset to select register */ +#define CYCLONE_MPMC_OFFSET 0x51D0 /* offset to count register */ +#define CYCLONE_TIMER_FREQ 99780000 /* 100Mhz, but not really */ +#define CYCLONE_TIMER_MASK CLOCKSOURCE_MASK(32) /* 32 bit mask */ + +int use_cyclone = 0; +static void __iomem *cyclone_ptr; + +static cycle_t read_cyclone(void) +{ + return (cycle_t)readl(cyclone_ptr); +} + +static struct clocksource clocksource_cyclone = { + 
.name = "cyclone", + .rating = 250, + .read = read_cyclone, + .mask = CYCLONE_TIMER_MASK, + .mult = 10, + .shift = 0, + .is_continuous = 1, +}; + +static int __init init_cyclone_clocksource(void) +{ + unsigned long base; /* saved value from CBAR */ + unsigned long offset; + u32 __iomem* volatile cyclone_timer; /* Cyclone MPMC0 register */ + u32 __iomem* reg; + int i; + + /* make sure we're on a summit box: */ + if (!use_cyclone) + return -ENODEV; + + printk(KERN_INFO "Summit chipset: Starting Cyclone Counter.\n"); + + /* find base address: */ + offset = CYCLONE_CBAR_ADDR; + reg = ioremap_nocache(offset, sizeof(*reg)); /* map the 32-bit register, not the size of the pointer */ + if (!reg) { + printk(KERN_ERR "Summit chipset: Could not find valid CBAR register.\n"); + return -ENODEV; + } + /* even on 64bit systems, this is only 32bits: */ + base = readl(reg); + iounmap(reg); /* unmap before the check so the error return below does not leak the mapping */ + if (!base) { + printk(KERN_ERR "Summit chipset: Could not find valid CBAR value.\n"); + return -ENODEV; + } + + /* setup PMCC: */ + offset = base + CYCLONE_PMCC_OFFSET; + reg = ioremap_nocache(offset, sizeof(*reg)); + if (!reg) { + printk(KERN_ERR "Summit chipset: Could not find valid PMCC register.\n"); + return -ENODEV; + } + writel(0x00000001,reg); + iounmap(reg); + + /* setup MPCS: */ + offset = base + CYCLONE_MPCS_OFFSET; + reg = ioremap_nocache(offset, sizeof(*reg)); + if (!reg) { + printk(KERN_ERR "Summit chipset: Could not find valid MPCS register.\n"); + return -ENODEV; + } + writel(0x00000001,reg); + iounmap(reg); + + /* map in cyclone_timer: */ + offset = base + CYCLONE_MPMC_OFFSET; + cyclone_timer = ioremap_nocache(offset, sizeof(u64)); + if (!cyclone_timer) { + printk(KERN_ERR "Summit chipset: Could not find valid MPMC register.\n"); + return -ENODEV; + } + + /* quick test to make sure its ticking: */ + for (i = 0; i < 3; i++){ + u32 old = readl(cyclone_timer); + int stall = 100; + + while (stall--) + barrier(); + + if (readl(cyclone_timer) == old) { + printk(KERN_ERR "Summit chipset: Counter not counting! 
DISABLED\n"); + iounmap(cyclone_timer); + cyclone_timer = NULL; + return -ENODEV; + } + } + cyclone_ptr = cyclone_timer; + + /* sort out mult/shift values: */ + clocksource_cyclone.shift = 22; + clocksource_cyclone.mult = clocksource_hz2mult(CYCLONE_TIMER_FREQ, + clocksource_cyclone.shift); + + return clocksource_register(&clocksource_cyclone); +} + +module_init(init_cyclone_clocksource); Index: linux/drivers/input/serio/i8042.c =================================================================== --- linux.orig/drivers/input/serio/i8042.c +++ linux/drivers/input/serio/i8042.c @@ -1085,7 +1085,7 @@ static int __devinit i8042_probe(struct goto err_controller_cleanup; } - mod_timer(&i8042_timer, jiffies + I8042_POLL_PERIOD); + mod_timer(&i8042_timer, jiffies + 2); //I8042_POLL_PERIOD); return 0; err_unregister_ports: Index: linux/drivers/input/serio/i8042.h =================================================================== --- linux.orig/drivers/input/serio/i8042.h +++ linux/drivers/input/serio/i8042.h @@ -44,7 +44,7 @@ * polling. */ -#define I8042_POLL_PERIOD HZ/20 +#define I8042_POLL_PERIOD (10*HZ) /* * Status register bits. Index: linux/fs/proc/proc_misc.c =================================================================== --- linux.orig/fs/proc/proc_misc.c +++ linux/fs/proc/proc_misc.c @@ -508,6 +508,8 @@ static int show_stat(struct seq_file *p, nr_running(), nr_iowait()); + show_no_hz_stats(p); + return 0; } Index: linux/include/asm-generic/percpu.h =================================================================== --- linux.orig/include/asm-generic/percpu.h +++ linux/include/asm-generic/percpu.h @@ -14,6 +14,7 @@ extern unsigned long __per_cpu_offset[NR /* var is in discarded region: offset to particular copy we want */ #define per_cpu(var, cpu) (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset[cpu])) #define __get_cpu_var(var) per_cpu(var, smp_processor_id()) +#define __raw_get_cpu_var(var) per_cpu(var, raw_smp_processor_id()) /* A macro to avoid #include hell... 
*/ #define percpu_modcopy(pcpudst, src, size) \ @@ -30,6 +31,7 @@ do { \ #define per_cpu(var, cpu) (*((void)(cpu), &per_cpu__##var)) #define __get_cpu_var(var) per_cpu__##var +#define __raw_get_cpu_var(var) per_cpu__##var #endif /* SMP */ Index: linux/include/asm-i386/delay.h =================================================================== --- linux.orig/include/asm-i386/delay.h +++ linux/include/asm-i386/delay.h @@ -23,4 +23,6 @@ extern void __delay(unsigned long loops) ((n) > 20000 ? __bad_ndelay() : __const_udelay((n) * 5ul)) : \ __ndelay(n)) +void use_tsc_delay(void); + #endif /* defined(_I386_DELAY_H) */ Index: linux/include/asm-i386/i8253.h =================================================================== --- linux.orig/include/asm-i386/i8253.h +++ linux/include/asm-i386/i8253.h @@ -2,5 +2,6 @@ #define __ASM_I8253_H__ extern spinlock_t i8253_lock; +extern struct clock_event pit_clockevent; #endif /* __ASM_I8253_H__ */ Index: linux/include/asm-i386/mach-default/do_timer.h =================================================================== --- linux.orig/include/asm-i386/mach-default/do_timer.h +++ linux/include/asm-i386/mach-default/do_timer.h @@ -1,7 +1,8 @@ /* defines for inline arch setup functions */ - +#include #include #include +#include /** * do_timer_interrupt_hook - hook into timer tick @@ -16,24 +17,9 @@ static inline void do_timer_interrupt_hook(struct pt_regs *regs) { - do_timer(regs); -#ifndef CONFIG_SMP - update_process_times(user_mode_vm(regs)); -#endif -/* - * In the SMP case we use the local APIC timer interrupt to do the - * profiling, except when we simulate SMP mode on a uniprocessor - * system, in that case we have to call the local interrupt handler. 
- */ -#ifndef CONFIG_X86_LOCAL_APIC - profile_tick(CPU_PROFILING, regs); -#else - if (!using_apic_timer) - smp_local_timer_interrupt(regs); -#endif + pit_clockevent.event_handler(regs); } - /* you can safely undefine this if you don't have the Neptune chipset */ #define BUGGY_NEPTUN_TIMER Index: linux/include/asm-i386/mach-default/mach_timer.h =================================================================== --- linux.orig/include/asm-i386/mach-default/mach_timer.h +++ linux/include/asm-i386/mach-default/mach_timer.h @@ -15,7 +15,9 @@ #ifndef _MACH_TIMER_H #define _MACH_TIMER_H -#define CALIBRATE_LATCH (5 * LATCH) +#define CALIBRATE_TIME_MSEC 30 /* 30 msecs */ +#define CALIBRATE_LATCH \ + ((CLOCK_TICK_RATE * CALIBRATE_TIME_MSEC + 1000/2)/1000) static inline void mach_prepare_counter(void) { Index: linux/include/asm-i386/mach-summit/mach_mpparse.h =================================================================== --- linux.orig/include/asm-i386/mach-summit/mach_mpparse.h +++ linux/include/asm-i386/mach-summit/mach_mpparse.h @@ -2,6 +2,7 @@ #define __ASM_MACH_MPPARSE_H #include +#include extern int use_cyclone; @@ -29,6 +30,7 @@ static inline int mps_oem_check(struct m (!strncmp(productid, "VIGIL SMP", 9) || !strncmp(productid, "EXA", 3) || !strncmp(productid, "RUTHLESS SMP", 12))){ + mark_tsc_unstable(); use_cyclone = 1; /*enable cyclone-timer*/ setup_summit(); return 1; @@ -42,6 +44,7 @@ static inline int acpi_madt_oem_check(ch if (!strncmp(oem_id, "IBM", 3) && (!strncmp(oem_table_id, "SERVIGIL", 8) || !strncmp(oem_table_id, "EXA", 3))){ + mark_tsc_unstable(); use_cyclone = 1; /*enable cyclone-timer*/ setup_summit(); return 1; Index: linux/include/asm-i386/mach-visws/do_timer.h =================================================================== --- linux.orig/include/asm-i386/mach-visws/do_timer.h +++ linux/include/asm-i386/mach-visws/do_timer.h @@ -9,7 +9,10 @@ static inline void do_timer_interrupt_ho /* Clear the interrupt */ 
co_cpu_write(CO_CPU_STAT,co_cpu_read(CO_CPU_STAT) & ~CO_STAT_TIMEINTR); + write_seqlock(&xtime_lock); do_timer(regs); + write_sequnlock(&xtime_lock); + #ifndef CONFIG_SMP update_process_times(user_mode_vm(regs)); #endif Index: linux/include/asm-i386/thread_info.h =================================================================== --- linux.orig/include/asm-i386/thread_info.h +++ linux/include/asm-i386/thread_info.h @@ -141,8 +141,7 @@ register unsigned long current_stack_poi #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SECCOMP 8 /* secure computing */ #define TIF_RESTORE_SIGMASK 9 /* restore signal mask in do_signal() */ -#define TIF_POLLING_NRFLAG 16 /* true if poll_idle() is polling TIF_NEED_RESCHED */ -#define TIF_MEMDIE 17 +#define TIF_MEMDIE 16 #define _TIF_SYSCALL_TRACE (1<thread_info->status & TS_POLLING) #endif /* __KERNEL__ */ Index: linux/include/asm-i386/timer.h =================================================================== --- linux.orig/include/asm-i386/timer.h +++ linux/include/asm-i386/timer.h @@ -3,68 +3,11 @@ #include #include -/** - * struct timer_ops - used to define a timer source - * - * @name: name of the timer. - * @init: Probes and initializes the timer. Takes clock= override - * string as an argument. Returns 0 on success, anything else - * on failure. - * @mark_offset: called by the timer interrupt. - * @get_offset: called by gettimeofday(). Returns the number of microseconds - * since the last timer interupt. - * @monotonic_clock: returns the number of nanoseconds since the init of the - * timer. - * @delay: delays this many clock cycles. 
- */ -struct timer_opts { - char* name; - void (*mark_offset)(void); - unsigned long (*get_offset)(void); - unsigned long long (*monotonic_clock)(void); - void (*delay)(unsigned long); - unsigned long (*read_timer)(void); - int (*suspend)(pm_message_t state); - int (*resume)(void); -}; - -struct init_timer_opts { - int (*init)(char *override); - struct timer_opts *opts; -}; - #define TICK_SIZE (tick_nsec / 1000) - -extern struct timer_opts* __init select_timer(void); -extern void clock_fallback(void); void setup_pit_timer(void); - /* Modifiers for buggy PIT handling */ - extern int pit_latch_buggy; - -extern struct timer_opts *cur_timer; extern int timer_ack; - -/* list of externed timers */ -extern struct timer_opts timer_none; -extern struct timer_opts timer_pit; -extern struct init_timer_opts timer_pit_init; -extern struct init_timer_opts timer_tsc_init; -#ifdef CONFIG_X86_CYCLONE_TIMER -extern struct init_timer_opts timer_cyclone_init; -#endif - -extern unsigned long calibrate_tsc(void); -extern unsigned long read_timer_tsc(void); -extern void init_cpu_khz(void); extern int recalibrate_cpu_khz(void); -#ifdef CONFIG_HPET_TIMER -extern struct init_timer_opts timer_hpet_init; -extern unsigned long calibrate_tsc_hpet(unsigned long *tsc_hpet_quotient_ptr); -#endif -#ifdef CONFIG_X86_PM_TIMER -extern struct init_timer_opts timer_pmtmr_init; -#endif #endif Index: linux/include/asm-i386/timex.h =================================================================== --- linux.orig/include/asm-i386/timex.h +++ linux/include/asm-i386/timex.h @@ -8,6 +8,7 @@ #include #include +#include #ifdef CONFIG_X86_ELAN # define CLOCK_TICK_RATE 1189200 /* AMD Elan has different frequency! */ @@ -16,39 +17,6 @@ #endif -/* - * Standard way to access the cycle counter on i586+ CPUs. - * Currently only used on SMP. - * - * If you really have a SMP machine with i486 chips or older, - * compile for that, and this will just always return zero. 
- * That's ok, it just means that the nicer scheduling heuristics - * won't work for you. - * - * We only use the low 32 bits, and we'd simply better make sure - * that we reschedule before that wraps. Scheduling at least every - * four billion cycles just basically sounds like a good idea, - * regardless of how fast the machine is. - */ -typedef unsigned long long cycles_t; - -static inline cycles_t get_cycles (void) -{ - unsigned long long ret=0; - -#ifndef CONFIG_X86_TSC - if (!cpu_has_tsc) - return 0; -#endif - -#if defined(CONFIG_X86_GENERIC) || defined(CONFIG_X86_TSC) - rdtscll(ret); -#endif - return ret; -} - -extern unsigned int cpu_khz; - extern int read_current_timer(unsigned long *timer_value); #define ARCH_HAS_READ_CURRENT_TIMER 1 Index: linux/include/asm-i386/tsc.h =================================================================== --- /dev/null +++ linux/include/asm-i386/tsc.h @@ -0,0 +1,49 @@ +/* + * linux/include/asm-i386/tsc.h + * + * i386 TSC related functions + */ +#ifndef _ASM_i386_TSC_H +#define _ASM_i386_TSC_H + +#include +#include + +/* + * Standard way to access the cycle counter on i586+ CPUs. + * Currently only used on SMP. + * + * If you really have a SMP machine with i486 chips or older, + * compile for that, and this will just always return zero. + * That's ok, it just means that the nicer scheduling heuristics + * won't work for you. + * + * We only use the low 32 bits, and we'd simply better make sure + * that we reschedule before that wraps. Scheduling at least every + * four billion cycles just basically sounds like a good idea, + * regardless of how fast the machine is. 
+ */ +typedef unsigned long long cycles_t; + +extern unsigned int cpu_khz; +extern unsigned int tsc_khz; + +static inline cycles_t get_cycles(void) +{ + unsigned long long ret = 0; + +#ifndef CONFIG_X86_TSC + if (!cpu_has_tsc) + return 0; +#endif + +#if defined(CONFIG_X86_GENERIC) || defined(CONFIG_X86_TSC) + rdtscll(ret); +#endif + return ret; +} + +extern void tsc_init(void); +extern void mark_tsc_unstable(void); + +#endif Index: linux/include/asm-ia64/percpu.h =================================================================== --- linux.orig/include/asm-ia64/percpu.h +++ linux/include/asm-ia64/percpu.h @@ -43,6 +43,7 @@ DECLARE_PER_CPU(unsigned long, local_per #define per_cpu(var, cpu) (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset[cpu])) #define __get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __ia64_per_cpu_var(local_per_cpu_offset))) +#define __raw_get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __ia64_per_cpu_var(local_per_cpu_offset))) extern void percpu_modcopy(void *pcpudst, const void *src, unsigned long size); extern void setup_per_cpu_areas (void); @@ -52,6 +53,7 @@ extern void *per_cpu_init(void); #define per_cpu(var, cpu) (*((void)(cpu), &per_cpu__##var)) #define __get_cpu_var(var) per_cpu__##var +#define __raw_get_cpu_var(var) per_cpu__##var #define per_cpu_init() (__phys_per_cpu_start) #endif /* SMP */ Index: linux/include/asm-ia64/thread_info.h =================================================================== --- linux.orig/include/asm-ia64/thread_info.h +++ linux/include/asm-ia64/thread_info.h @@ -27,6 +27,7 @@ struct thread_info { __u32 flags; /* thread_info flags (see TIF_*) */ __u32 cpu; /* current CPU */ __u32 last_cpu; /* Last CPU thread ran on */ + __u32 status; /* Thread synchronous flags */ mm_segment_t addr_limit; /* user-level address space limit */ int preempt_count; /* 0=premptable, <0=BUG; will also serve as bh-counter */ struct restart_block restart_block; @@ -103,4 +104,8 @@ struct thread_info { /* like TIF_ALLWORK_BITS but sans 
TIF_SYSCALL_TRACE or TIF_SYSCALL_AUDIT */ #define TIF_WORK_MASK (TIF_ALLWORK_MASK&~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT)) +#define TS_POLLING 1 /* true if in idle loop and not sleeping */ + +#define tsk_is_polling(t) ((t)->thread_info->status & TS_POLLING) + #endif /* _ASM_IA64_THREAD_INFO_H */ Index: linux/include/asm-powerpc/percpu.h =================================================================== --- linux.orig/include/asm-powerpc/percpu.h +++ linux/include/asm-powerpc/percpu.h @@ -22,6 +22,7 @@ /* var is in discarded region: offset to particular copy we want */ #define per_cpu(var, cpu) (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset(cpu))) #define __get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __my_cpu_offset())) +#define __raw_get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __my_cpu_offset())) /* A macro to avoid #include hell... */ #define percpu_modcopy(pcpudst, src, size) \ @@ -41,6 +42,7 @@ extern void setup_per_cpu_areas(void); #define per_cpu(var, cpu) (*((void)(cpu), &per_cpu__##var)) #define __get_cpu_var(var) per_cpu__##var +#define __raw_get_cpu_var(var) per_cpu__##var #endif /* SMP */ Index: linux/include/asm-s390/percpu.h =================================================================== --- linux.orig/include/asm-s390/percpu.h +++ linux/include/asm-s390/percpu.h @@ -40,6 +40,7 @@ extern unsigned long __per_cpu_offset[NR __typeof__(type) per_cpu__##name #define __get_cpu_var(var) __reloc_hide(var,S390_lowcore.percpu_offset) +#define __raw_get_cpu_var(var) __reloc_hide(var,S390_lowcore.percpu_offset) #define per_cpu(var,cpu) __reloc_hide(var,__per_cpu_offset[cpu]) /* A macro to avoid #include hell... 
*/ @@ -57,6 +58,7 @@ do { \ __typeof__(type) per_cpu__##name #define __get_cpu_var(var) __reloc_hide(var,0) +#define __raw_get_cpu_var(var) __reloc_hide(var,0) #define per_cpu(var,cpu) __reloc_hide(var,0) #endif /* SMP */ Index: linux/include/asm-sparc64/percpu.h =================================================================== --- linux.orig/include/asm-sparc64/percpu.h +++ linux/include/asm-sparc64/percpu.h @@ -21,6 +21,7 @@ register unsigned long __local_per_cpu_o /* var is in discarded region: offset to particular copy we want */ #define per_cpu(var, cpu) (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset(cpu))) #define __get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __local_per_cpu_offset)) +#define __raw_get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __local_per_cpu_offset)) /* A macro to avoid #include hell... */ #define percpu_modcopy(pcpudst, src, size) \ @@ -37,6 +38,7 @@ do { \ #define per_cpu(var, cpu) (*((void)cpu, &per_cpu__##var)) #define __get_cpu_var(var) per_cpu__##var +#define __raw_get_cpu_var(var) per_cpu__##var #endif /* SMP */ Index: linux/include/asm-x86_64/percpu.h =================================================================== --- linux.orig/include/asm-x86_64/percpu.h +++ linux/include/asm-x86_64/percpu.h @@ -21,6 +21,7 @@ /* var is in discarded region: offset to particular copy we want */ #define per_cpu(var, cpu) (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset(cpu))) #define __get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __my_cpu_offset())) +#define __raw_get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __my_cpu_offset())) /* A macro to avoid #include hell... 
*/ #define percpu_modcopy(pcpudst, src, size) \ @@ -40,6 +41,7 @@ extern void setup_per_cpu_areas(void); #define per_cpu(var, cpu) (*((void)(cpu), &per_cpu__##var)) #define __get_cpu_var(var) per_cpu__##var +#define __raw_get_cpu_var(var) per_cpu__##var #endif /* SMP */ Index: linux/include/asm-x86_64/proto.h =================================================================== --- linux.orig/include/asm-x86_64/proto.h +++ linux/include/asm-x86_64/proto.h @@ -48,10 +48,7 @@ extern u32 pmtmr_ioport; #else #define pmtmr_ioport 0 #endif -extern unsigned long long monotonic_base; -extern int sysctl_vsyscall; extern int nohpet; -extern unsigned long vxtime_hz; extern int numa_setup(char *opt); Index: linux/include/asm-x86_64/thread_info.h =================================================================== --- linux.orig/include/asm-x86_64/thread_info.h +++ linux/include/asm-x86_64/thread_info.h @@ -101,7 +101,7 @@ static inline struct thread_info *stack_ #define TIF_IRET 5 /* force IRET */ #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SECCOMP 8 /* secure computing */ -#define TIF_POLLING_NRFLAG 16 /* true if poll_idle() is polling TIF_NEED_RESCHED */ +/* 16 free */ #define TIF_IA32 17 /* 32bit process */ #define TIF_FORK 18 /* ret_from_fork */ #define TIF_ABI_PENDING 19 @@ -115,7 +115,6 @@ static inline struct thread_info *stack_ #define _TIF_IRET (1<thread_info->status & TS_POLLING) #endif /* __KERNEL__ */ Index: linux/include/asm-x86_64/timex.h =================================================================== --- linux.orig/include/asm-x86_64/timex.h +++ linux/include/asm-x86_64/timex.h @@ -40,10 +40,11 @@ static __always_inline cycles_t get_cycl } extern unsigned int cpu_khz; +extern unsigned int tsc_khz; + +extern void mark_tsc_unstable(void); extern int read_current_timer(unsigned long *timer_value); #define ARCH_HAS_READ_CURRENT_TIMER 1 -extern struct vxtime_data vxtime; - #endif Index: linux/include/asm-x86_64/vsyscall.h 
=================================================================== --- linux.orig/include/asm-x86_64/vsyscall.h +++ linux/include/asm-x86_64/vsyscall.h @@ -15,49 +15,19 @@ enum vsyscall_num { #ifdef __KERNEL__ -#define __section_vxtime __attribute__ ((unused, __section__ (".vxtime"), aligned(16))) -#define __section_wall_jiffies __attribute__ ((unused, __section__ (".wall_jiffies"), aligned(16))) -#define __section_jiffies __attribute__ ((unused, __section__ (".jiffies"), aligned(16))) -#define __section_sys_tz __attribute__ ((unused, __section__ (".sys_tz"), aligned(16))) -#define __section_sysctl_vsyscall __attribute__ ((unused, __section__ (".sysctl_vsyscall"), aligned(16))) -#define __section_xtime __attribute__ ((unused, __section__ (".xtime"), aligned(16))) -#define __section_xtime_lock __attribute__ ((unused, __section__ (".xtime_lock"), aligned(16))) - -#define VXTIME_TSC 1 -#define VXTIME_HPET 2 -#define VXTIME_PMTMR 3 - -struct vxtime_data { - long hpet_address; /* HPET base address */ - int last; - unsigned long last_tsc; - long quot; - long tsc_quot; - int mode; -}; +/* Definitions for CONFIG_GENERIC_TIME definitions */ +#define __section_vsyscall_gtod_data __attribute__ ((unused, __section__ (".vsyscall_gtod_data"),aligned(16))) +#define __vsyscall_fn __attribute__ ((unused,__section__(".vsyscall_fn"))) + #define hpet_readl(a) readl((const void __iomem *)fix_to_virt(FIX_HPET_BASE) + a) #define hpet_writel(d,a) writel(d, (void __iomem *)fix_to_virt(FIX_HPET_BASE) + a) -/* vsyscall space (readonly) */ -extern struct vxtime_data __vxtime; -extern struct timespec __xtime; -extern volatile unsigned long __jiffies; -extern unsigned long __wall_jiffies; -extern struct timezone __sys_tz; -extern seqlock_t __xtime_lock; - /* kernel space (writeable) */ -extern struct vxtime_data vxtime; extern unsigned long wall_jiffies; extern struct timezone sys_tz; -extern int sysctl_vsyscall; extern seqlock_t xtime_lock; -extern int sysctl_vsyscall; - -#define 
ARCH_HAVE_XTIME_LOCK 1 - #endif /* __KERNEL__ */ #endif /* _ASM_X86_64_VSYSCALL_H_ */ Index: linux/include/linux/clockchips.h =================================================================== --- /dev/null +++ linux/include/linux/clockchips.h @@ -0,0 +1,104 @@ +/* linux/include/linux/clockchips.h + * + * This file contains the structure definitions for clockchips. + * + * If you are not a clockchip, or the time of day code, you should + * not be including this file! + */ +#ifndef _LINUX_CLOCKCHIPS_H +#define _LINUX_CLOCKCHIPS_H + +#include + +#ifdef CONFIG_GENERIC_TIME + +#include +#include + +/* Clock event mode commands */ +enum { + CLOCK_EVT_PERIODIC, + CLOCK_EVT_ONESHOT, + CLOCK_EVT_SHUTDOWN, +}; + +/* Clock event capability flags */ +#define CLOCK_CAP_TICK 0x000001 +#define CLOCK_CAP_UPDATE 0x000002 +#ifndef CONFIG_PROFILE_NMI +# define CLOCK_CAP_PROFILE 0x000004 +#else +# define CLOCK_CAP_PROFILE 0x000000 +#endif +#ifdef CONFIG_HIGH_RES_TIMERS +# define CLOCK_CAP_NEXTEVT 0x000008 +#else +# define CLOCK_CAP_NEXTEVT 0x000000 +#endif + +#define CLOCK_BASE_CAPS_MASK (CLOCK_CAP_TICK | CLOCK_CAP_PROFILE | \ + CLOCK_CAP_UPDATE) +#define CLOCK_CAPS_MASK (CLOCK_BASE_CAPS_MASK | CLOCK_CAP_NEXTEVT) + +struct clock_event; + +/** + * struct clock_event - clock event descriptor + * + * @name: ptr to clock event name + * @capabilities: capabilities of the event chip + * @max_delta_ns: maximum delta value in ns + * @min_delta_ns: minimum delta value in ns + * @mult: nanosecond to cycles multiplier + * @shift: nanoseconds to cycles divisor (power of two) + * @set_next_event: set next event + * @set_mode: set mode function + * @suspend: suspend function (optional) + * @resume: resume function (optional) + * @event_handler: Assigned by the framework to be called by the low + * level handler of the event source + */ +struct clock_event { + const char *name; + unsigned int capabilities; + unsigned long max_delta_ns; + unsigned long min_delta_ns; + unsigned long mult; + int shift; 
+ void (*set_next_event)(unsigned long evt, + struct clock_event *); + void (*set_mode)(int mode, struct clock_event *); + int (*suspend)(struct clock_event *); + int (*resume)(struct clock_event *); + void (*event_handler)(struct pt_regs *regs); +}; + +/* + * Calculate a multiplication factor + */ +static inline unsigned long div_sc(unsigned long a, unsigned long b, + int shift) +{ + uint64_t tmp = ((uint64_t)a) << shift; + do_div(tmp, b); + return (unsigned long) tmp; +} + +/* Clock event layer functions */ +extern int setup_local_clockevent(struct clock_event *, cpumask_t cpumask); +extern int setup_global_clockevent(struct clock_event *, cpumask_t cpumask); +extern unsigned long clockevent_delta2ns(unsigned long latch, + struct clock_event *evt); +extern void init_clockevents(void); + +extern int clockevents_init_next_event(void); +extern int clockevents_set_next_event(ktime_t expires, int force); +extern void clockevents_stop_next_event(void); +extern void clockevents_trigger_next_event(void); +extern int clockevents_next_event_available(void); + +#else +# define init_clockevents() do { } while(0) +#endif + +#endif Index: linux/include/linux/clocksource.h =================================================================== --- /dev/null +++ linux/include/linux/clocksource.h @@ -0,0 +1,281 @@ +/* linux/include/linux/clocksource.h + * + * This file contains the structure definitions for clocksources. + * + * If you are not a clocksource, or timekeeping code, you should + * not be including this file! + */ +#ifndef _LINUX_CLOCKSOURCE_H +#define _LINUX_CLOCKSOURCE_H + +#include +#include +#include +#include +#include +#include + +/* clocksource cycle base type */ +typedef u64 cycle_t; + +/** + * struct clocksource - hardware abstraction for a free running counter + * Provides mostly state-free accessors to the underlying hardware. 
+ * + * @name: ptr to clocksource name + * @list: list head for registration + * @rating: rating value for selection (higher is better) + * To avoid rating inflation the following + * list should give you a guide as to how + * to assign your clocksource a rating + * 1-99: Unfit for real use + * Only available for bootup and testing purposes. + * 100-199: Base level usability. + * Functional for real use, but not desired. + * 200-299: Good. + * A correct and usable clocksource. + * 300-399: Desired. + * A reasonably fast and accurate clocksource. + * 400-499: Perfect + * The ideal clocksource. A must-use where + * available. + * @read: returns a cycle value + * @mask: bitmask for two's complement + * subtraction of non 64 bit counters + * @mult: cycle to nanosecond multiplier + * @shift: cycle to nanosecond divisor (power of two) + * @update_callback: called when safe to alter clocksource values + * @is_continuous: defines if clocksource is free-running. + * @vread: vsyscall based read + * @interval_cycles: Used internally by timekeeping core, please ignore. + * @interval_snsecs: Used internally by timekeeping core, please ignore. + */ +struct clocksource { + char *name; + struct list_head list; + int rating; + cycle_t (*read)(void); + cycle_t mask; + u32 mult; + u32 shift; + int (*update_callback)(void); + int is_continuous; + cycle_t (*vread)(void); + /* timekeeping specific data, ignore */ + cycle_t interval_cycles; + u64 interval_snsecs; +}; + +/* simplify initialization of mask field */ +#define CLOCKSOURCE_MASK(bits) (cycle_t)(bits<64 ? ((1ULL<read(); +} + +/** + * cyc2ns - converts clocksource cycles to nanoseconds + * @cs: Pointer to clocksource + * @cycles: Cycles + * + * Uses the clocksource and ntp ajdustment to convert cycle_ts to nanoseconds. 
+ * + * XXX - This could use some mult_lxl_ll() asm optimization + */ +static inline s64 cyc2ns(struct clocksource *cs, cycle_t cycles) +{ + u64 ret = (u64)cycles; + ret = (ret * cs->mult) >> cs->shift; + return ret; +} + +/** + * clocksource_calculate_interval - Calculates a clocksource interval struct + * + * @c: Pointer to clocksource. + * @length_nsec: Desired interval length in nanoseconds. + * + * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment + * pair and interval request. + * + * Unless you're the timekeeping code, you should not be using this! + */ +static inline void clocksource_calculate_interval(struct clocksource *c, + unsigned long length_nsec) +{ + u64 tmp; + + /* XXX - All of this could use a whole lot of optimization */ + tmp = length_nsec; + tmp <<= c->shift; + tmp += c->mult/2; + do_div(tmp, c->mult); + + c->interval_cycles = (cycle_t)tmp; + if(c->interval_cycles == 0) + c->interval_cycles = 1; + + c->interval_snsecs = (u64)c->interval_cycles * c->mult; +} + + +/** + * error_aproximation - calculates an error adjustment for a given error + * + * @error: Error value (unsigned) + * @unit: Adjustment unit + * + * For a given error value, this function takes the adjustment unit + * and uses binary approximation to return a power of two adjustment value. + * + * This function is only for use by the make_ntp_adj() function + * and you must hold the xtime_lock for writing when calling.
+ */ +static inline int error_aproximation(u64 error, u64 unit) +{ + static int saved_adj = 0; + u64 adjusted_unit = unit << saved_adj; + + if (error > (adjusted_unit * 2)) { + /* large error, so increment the adjustment factor */ + saved_adj++; + } else if (error > adjusted_unit) { + /* just right, don't touch it */ + } else if (saved_adj) { + /* small error, so drop the adjustment factor */ + saved_adj--; + return 0; + } + + return saved_adj; +} + + +/** + * make_ntp_adj - Adjusts the specified clocksource for a given error + * + * @clock: Pointer to clock to be adjusted + * @cycles_delta: Current unaccounted cycle delta + * @error: Pointer to current error value + * + * Returns clock shifted nanosecond adjustment to be applied against + * the accumulated time value (ie: xtime). + * + * If the error value is large enough, this function calculates the + * (power of two) adjustment value, and adjusts the clock's mult and + * interval_snsecs values accordingly. + * + * However, since there may be some unaccumulated cycles, to avoid + * time inconsistencies we must adjust the accumulation value + * accordingly.
+ * + * This is not very intuitive, so the following proof should help: + * The basic timeofday algorithm: base + cycle * mult + * Thus: + * new_base + cycle * new_mult = old_base + cycle * old_mult + * new_base = old_base + cycle * old_mult - cycle * new_mult + * new_base = old_base + cycle * (old_mult - new_mult) + * new_base - old_base = cycle * (old_mult - new_mult) + * base_delta = cycle * (old_mult - new_mult) + * base_delta = cycle * (mult_delta) + * + * Where mult_delta is the adjustment value made to mult + * + */ +static inline s64 make_ntp_adj(struct clocksource *clock, + cycle_t cycles_delta, s64* error) +{ + s64 ret = 0; + if (*error > ((s64)clock->interval_cycles+1)/2) { + /* calculate adjustment value */ + int adjustment = error_aproximation(*error, + clock->interval_cycles); + /* adjust clock */ + clock->mult += 1 << adjustment; + clock->interval_snsecs += clock->interval_cycles << adjustment; + + /* adjust the base and error for the adjustment */ + ret = -(cycles_delta << adjustment); + *error -= clock->interval_cycles << adjustment; + /* XXX adj error for cycle_delta offset? */ + } else if ((-(*error)) > ((s64)clock->interval_cycles+1)/2) { + /* calculate adjustment value */ + int adjustment = error_aproximation(-(*error), + clock->interval_cycles); + /* adjust clock */ + clock->mult -= 1 << adjustment; + clock->interval_snsecs -= clock->interval_cycles << adjustment; + + /* adjust the base and error for the adjustment */ + ret = cycles_delta << adjustment; + *error += clock->interval_cycles << adjustment; + /* XXX adj error for cycle_delta offset?
*/ + } + return ret; +} + + +/* used to install a new clocksource */ +int clocksource_register(struct clocksource*); +void clocksource_reselect(void); +struct clocksource* clocksource_get_next(void); + +#endif /* _LINUX_CLOCKSOURCE_H */ Index: linux/include/linux/hrtimer.h =================================================================== --- linux.orig/include/linux/hrtimer.h +++ linux/include/linux/hrtimer.h @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -34,9 +35,19 @@ enum hrtimer_restart { HRTIMER_RESTART, }; -#define HRTIMER_INACTIVE ((void *)1UL) +enum hrtimer_state { + HRTIMER_INACTIVE, + HRTIMER_PENDING, + HRTIMER_ACTIVE, +}; + +enum hrtimer_cb_mode { + HRTIMER_CB_SOFTIRQ, + HRTIMER_CB_IRQSAFE, + HRTIMER_CB_IRQSAFE_NO_RESTART, +}; -struct hrtimer_base; +struct hrtimer_clock_base; /** * struct hrtimer - the basic hrtimer structure @@ -48,13 +59,22 @@ struct hrtimer_base; * @function: timer expiry callback function * @base: pointer to the timer base (per cpu and per clock) * + * @mode: high resolution timer feature to allow executing the + * callback in the hardirq context (wakeups) + * @cb_entry: list head to enqueue an expired timer into the callback list + * * The hrtimer structure must be initialized by init_hrtimer_#CLOCKTYPE() */ struct hrtimer { - struct rb_node node; - ktime_t expires; - int (*function)(struct hrtimer *); - struct hrtimer_base *base; + struct rb_node node; + ktime_t expires; + int (*function)(struct hrtimer *); + struct hrtimer_clock_base *base; + enum hrtimer_state state; +#ifdef CONFIG_HIGH_RES_TIMERS + int cb_mode; + struct list_head cb_entry; +#endif }; /** @@ -70,6 +90,8 @@ struct hrtimer_sleeper { struct task_struct *task; }; +struct hrtimer_cpu_base; + /** * struct hrtimer_base - the timer base for a specific clock * @@ -83,25 +105,95 @@ struct hrtimer_sleeper { * @get_softirq_time: function to retrieve the current time from the softirq * @curr_timer: the timer which is executing a callback right 
now * @softirq_time: the time when running the hrtimer queue in the softirq + * + * @cb_pending: list of timers where the callback is pending + * @offset: offset of this clock to the monotonic base + * @reprogram: function to reprogram the timer event */ -struct hrtimer_base { +struct hrtimer_clock_base { + struct hrtimer_cpu_base *cpu_base; clockid_t index; - spinlock_t lock; struct rb_root active; struct rb_node *first; ktime_t resolution; ktime_t (*get_time)(void); ktime_t (*get_softirq_time)(void); - struct hrtimer *curr_timer; ktime_t softirq_time; +#ifdef CONFIG_HIGH_RES_TIMERS + ktime_t offset; + int (*reprogram)(struct hrtimer *t, + struct hrtimer_clock_base *b, + ktime_t n); +#endif }; +#define HRTIMER_MAX_CLOCK_BASES 2 + +/* + * struct hrtimer_cpu_base - the per cpu lock, clock bases and timer state + */ +struct hrtimer_cpu_base { + spinlock_t lock; + struct hrtimer *curr_timer; + struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES]; +#ifdef CONFIG_HIGH_RES_TIMERS + ktime_t expires_next; + int hres_active; + unsigned long check_clocks; + struct list_head cb_pending; + struct hrtimer sched_timer; + struct pt_regs *sched_regs; + unsigned long events; +#ifdef CONFIG_NO_HZ + ktime_t idle_tick; + unsigned long idle_jiffies; + unsigned long idle_calls; + unsigned long idle_sleeps; + unsigned long idle_sleeptime; +#endif +#endif +}; + +#ifdef CONFIG_HIGH_RES_TIMERS + +/* temporary hack */ +DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); + +extern void hrtimer_clock_notify(void); +extern void clock_was_set(void); +extern void hrtimer_interrupt(struct pt_regs *regs); + +# define hrtimer_cb_get_time(t) (t)->base->get_time() +# define hrtimer_hres_active (__get_cpu_var(hrtimer_bases).hres_active) +/* + * The resolution of the clocks. The resolution value is returned in + * the clock_getres() system call to give application programmers an + * idea of the (in)accuracy of timers. Timer values are rounded up to + * this resolution value.
+ */ +# define KTIME_REALTIME_RES (ktime_t) { .tv64 = CONFIG_HIGH_RES_RESOLUTION } +# define KTIME_MONOTONIC_RES (ktime_t) { .tv64 = CONFIG_HIGH_RES_RESOLUTION } + +#else + +# define KTIME_REALTIME_RES KTIME_LOW_RES +# define KTIME_MONOTONIC_RES KTIME_LOW_RES + /* * clock_was_set() is a NOP for non- high-resolution systems. The * time-sorted order guarantees that a timer does not expire early and * is expired in the next softirq when the clock was advanced. */ -#define clock_was_set() do { } while (0) +# define clock_was_set() do { } while (0) +# define hrtimer_clock_notify() do { } while (0) + +# define hrtimer_cb_get_time(t) (t)->base->softirq_time +# define hrtimer_hres_active 0 + +#endif + +extern ktime_t ktime_get(void); +extern ktime_t ktime_get_real(void); /* Exported timer functions: */ @@ -127,7 +219,7 @@ extern ktime_t hrtimer_get_next_event(vo static inline int hrtimer_active(const struct hrtimer *timer) { - return timer->node.rb_parent != HRTIMER_INACTIVE; + return timer->state != HRTIMER_INACTIVE; } /* Forward a hrtimer so it expires after now: */ @@ -149,4 +241,22 @@ extern void hrtimer_run_queues(void); /* Bootup initialization: */ extern void __init hrtimers_init(void); +#ifdef CONFIG_NO_HZ +extern void hrtimer_trigger_next_hz_tick(struct tvec_t_base_s *base); +extern int hrtimer_stop_sched_tick(void); +extern void hrtimer_restart_sched_tick(void); +extern void update_jiffies(void); +struct seq_file; +extern void show_no_hz_stats(struct seq_file *p); +#else +# define hrtimer_trigger_next_hz_tick(base) do { } while (0) +static inline int hrtimer_stop_sched_tick(void) +{ + return 0; +} +# define hrtimer_restart_sched_tick() do { } while (0) +# define update_jiffies() do { } while (0) +# define show_no_hz_stats(p) do { } while (0) +#endif + #endif Index: linux/include/linux/interrupt.h =================================================================== --- linux.orig/include/linux/interrupt.h +++ linux/include/linux/interrupt.h @@ -113,7 +113,10 @@ 
enum NET_TX_SOFTIRQ, NET_RX_SOFTIRQ, BLOCK_SOFTIRQ, - TASKLET_SOFTIRQ + TASKLET_SOFTIRQ, +#ifdef CONFIG_HIGH_RES_TIMERS + HRTIMER_SOFTIRQ, +#endif }; /* softirq mask and active fields moved to irq_cpustat_t in Index: linux/include/linux/jiffies.h =================================================================== --- linux.orig/include/linux/jiffies.h +++ linux/include/linux/jiffies.h @@ -127,13 +127,13 @@ static inline u64 get_jiffies_64(void) * * And some not so obvious. * - * Note that we don't want to return MAX_LONG, because + * Note that we don't want to return LONG_MAX, because * for various timeout reasons we often end up having * to wait "jiffies+1" in order to guarantee that we wait * at _least_ "jiffies" - so "jiffies+1" had better still * be positive. */ -#define MAX_JIFFY_OFFSET ((~0UL >> 1)-1) +#define MAX_JIFFY_OFFSET ((LONG_MAX >> 1)-1) /* * We want to do realistic conversions of time so we need to use the same @@ -244,207 +244,24 @@ static inline u64 get_jiffies_64(void) #endif /* - * Convert jiffies to milliseconds and back. 
- * - * Avoid unnecessary multiplications/divisions in the - * two most common HZ cases: - */ -static inline unsigned int jiffies_to_msecs(const unsigned long j) -{ -#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) - return (MSEC_PER_SEC / HZ) * j; -#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) - return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); -#else - return (j * MSEC_PER_SEC) / HZ; -#endif -} - -static inline unsigned int jiffies_to_usecs(const unsigned long j) -{ -#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) - return (USEC_PER_SEC / HZ) * j; -#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) - return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC); -#else - return (j * USEC_PER_SEC) / HZ; -#endif -} - -static inline unsigned long msecs_to_jiffies(const unsigned int m) -{ - if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) - return MAX_JIFFY_OFFSET; -#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) - return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ); -#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) - return m * (HZ / MSEC_PER_SEC); -#else - return (m * HZ + MSEC_PER_SEC - 1) / MSEC_PER_SEC; -#endif -} - -static inline unsigned long usecs_to_jiffies(const unsigned int u) -{ - if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET)) - return MAX_JIFFY_OFFSET; -#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) - return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ); -#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) - return u * (HZ / USEC_PER_SEC); -#else - return (u * HZ + USEC_PER_SEC - 1) / USEC_PER_SEC; -#endif -} - -/* - * The TICK_NSEC - 1 rounds up the value to the next resolution. Note - * that a remainder subtract here would not do the right thing as the - * resolution values don't fall on second boundries. I.e. the line: - * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding. - * - * Rather, we just shift the bits off the right. - * - * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec - * value to a scaled second value. 
+ * Convert various time units to each other: */ -static __inline__ unsigned long -timespec_to_jiffies(const struct timespec *value) -{ - unsigned long sec = value->tv_sec; - long nsec = value->tv_nsec + TICK_NSEC - 1; - - if (sec >= MAX_SEC_IN_JIFFIES){ - sec = MAX_SEC_IN_JIFFIES; - nsec = 0; - } - return (((u64)sec * SEC_CONVERSION) + - (((u64)nsec * NSEC_CONVERSION) >> - (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; - -} - -static __inline__ void -jiffies_to_timespec(const unsigned long jiffies, struct timespec *value) -{ - /* - * Convert jiffies to nanoseconds and separate with - * one divide. - */ - u64 nsec = (u64)jiffies * TICK_NSEC; - value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &value->tv_nsec); -} +extern unsigned int jiffies_to_msecs(const unsigned long j); +extern unsigned int jiffies_to_usecs(const unsigned long j); +extern unsigned long msecs_to_jiffies(const unsigned int m); +extern unsigned long usecs_to_jiffies(const unsigned int u); +extern unsigned long timespec_to_jiffies(const struct timespec *value); +extern void jiffies_to_timespec(const unsigned long jiffies, + struct timespec *value); +extern unsigned long timeval_to_jiffies(const struct timeval *value); +extern void jiffies_to_timeval(const unsigned long jiffies, + struct timeval *value); +extern clock_t jiffies_to_clock_t(long x); +extern unsigned long clock_t_to_jiffies(unsigned long x); +extern u64 jiffies_64_to_clock_t(u64 x); +extern u64 nsec_to_clock_t(u64 x); +extern int nsec_to_timestamp(char *s, u64 t); -/* Same for "timeval" - * - * Well, almost. The problem here is that the real system resolution is - * in nanoseconds and the value being converted is in micro seconds. - * Also for some machines (those that use HZ = 1024, in-particular), - * there is a LARGE error in the tick size in microseconds. - - * The solution we use is to do the rounding AFTER we convert the - * microsecond part. Thus the USEC_ROUND, the bits to be shifted off. 
- * Instruction wise, this should cost only an additional add with carry - * instruction above the way it was done above. - */ -static __inline__ unsigned long -timeval_to_jiffies(const struct timeval *value) -{ - unsigned long sec = value->tv_sec; - long usec = value->tv_usec; - - if (sec >= MAX_SEC_IN_JIFFIES){ - sec = MAX_SEC_IN_JIFFIES; - usec = 0; - } - return (((u64)sec * SEC_CONVERSION) + - (((u64)usec * USEC_CONVERSION + USEC_ROUND) >> - (USEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; -} - -static __inline__ void -jiffies_to_timeval(const unsigned long jiffies, struct timeval *value) -{ - /* - * Convert jiffies to nanoseconds and separate with - * one divide. - */ - u64 nsec = (u64)jiffies * TICK_NSEC; - long tv_usec; - - value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &tv_usec); - tv_usec /= NSEC_PER_USEC; - value->tv_usec = tv_usec; -} - -/* - * Convert jiffies/jiffies_64 to clock_t and back. - */ -static inline clock_t jiffies_to_clock_t(long x) -{ -#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 - return x / (HZ / USER_HZ); -#else - u64 tmp = (u64)x * TICK_NSEC; - do_div(tmp, (NSEC_PER_SEC / USER_HZ)); - return (long)tmp; -#endif -} - -static inline unsigned long clock_t_to_jiffies(unsigned long x) -{ -#if (HZ % USER_HZ)==0 - if (x >= ~0UL / (HZ / USER_HZ)) - return ~0UL; - return x * (HZ / USER_HZ); -#else - u64 jif; - - /* Don't worry about loss of precision here .. */ - if (x >= ~0UL / HZ * USER_HZ) - return ~0UL; - - /* .. but do try to contain it here */ - jif = x * (u64) HZ; - do_div(jif, USER_HZ); - return jif; -#endif -} - -static inline u64 jiffies_64_to_clock_t(u64 x) -{ -#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 - do_div(x, HZ / USER_HZ); -#else - /* - * There are better ways that don't overflow early, - * but even this doesn't overflow in hundreds of years - * in 64 bits, so.. 
- */ - x *= TICK_NSEC; - do_div(x, (NSEC_PER_SEC / USER_HZ)); -#endif - return x; -} - -static inline u64 nsec_to_clock_t(u64 x) -{ -#if (NSEC_PER_SEC % USER_HZ) == 0 - do_div(x, (NSEC_PER_SEC / USER_HZ)); -#elif (USER_HZ % 512) == 0 - x *= USER_HZ/512; - do_div(x, (NSEC_PER_SEC / 512)); -#else - /* - * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024, - * overflow after 64.99 years. - * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ... - */ - x *= 9; - do_div(x, (unsigned long)((9ull * NSEC_PER_SEC + (USER_HZ/2)) - / USER_HZ)); -#endif - return x; -} +#define TIMESTAMP_SIZE 30 #endif Index: linux/include/linux/ktime.h =================================================================== --- linux.orig/include/linux/ktime.h +++ linux/include/linux/ktime.h @@ -264,8 +264,7 @@ static inline u64 ktime_to_ns(const ktim * idea of the (in)accuracy of timers. Timer values are rounded up to * this resolution values. */ -#define KTIME_REALTIME_RES (ktime_t){ .tv64 = TICK_NSEC } -#define KTIME_MONOTONIC_RES (ktime_t){ .tv64 = TICK_NSEC } +#define KTIME_LOW_RES (ktime_t){ .tv64 = TICK_NSEC } /* Get the monotonic time in timespec format: */ extern void ktime_get_ts(struct timespec *ts); Index: linux/include/linux/rcupdate.h =================================================================== --- linux.orig/include/linux/rcupdate.h +++ linux/include/linux/rcupdate.h @@ -256,6 +256,7 @@ extern int rcu_needs_cpu(int cpu); extern void rcu_init(void); extern void rcu_check_callbacks(int cpu, int user); +extern void rcu_advance_callbacks(int cpu, int user); extern void rcu_restart_cpu(int cpu); extern long rcu_batches_completed(void); @@ -268,6 +269,7 @@ extern __deprecated_for_modules void syn extern void synchronize_rcu(void); void synchronize_idle(void); extern void rcu_barrier(void); +extern void rcu_process_callbacks(unsigned long unused); #endif /* __KERNEL__ */ #endif /* __LINUX_RCUPDATE_H */ Index: linux/include/linux/sysctl.h 
=================================================================== --- linux.orig/include/linux/sysctl.h +++ linux/include/linux/sysctl.h @@ -148,6 +148,7 @@ enum KERN_SPIN_RETRY=70, /* int: number of spinlock retries */ KERN_ACPI_VIDEO_FLAGS=71, /* int: flags for setting up video after ACPI sleep */ KERN_IA64_UNALIGNED=72, /* int: ia64 unaligned userland trap enable */ + KERN_TIMEOUT_GRANULARITY=73, /* int: timeout granularity in jiffies */ }; Index: linux/include/linux/time.h =================================================================== --- linux.orig/include/linux/time.h +++ linux/include/linux/time.h @@ -77,6 +77,8 @@ extern struct timespec xtime; extern struct timespec wall_to_monotonic; extern seqlock_t xtime_lock; +void timekeeping_init(void); + static inline unsigned long get_seconds(void) { return xtime.tv_sec; @@ -100,6 +102,7 @@ extern int do_getitimer(int which, struc extern void getnstimeofday(struct timespec *tv); extern struct timespec timespec_trunc(struct timespec t, unsigned gran); +extern int timekeeping_is_continuous(void); /** * timespec_to_ns - Convert timespec to nanoseconds @@ -142,6 +145,20 @@ extern struct timespec ns_to_timespec(co */ extern struct timeval ns_to_timeval(const s64 nsec); +/** + * timespec_add_ns - Adds nanoseconds to a timespec + * @a: pointer to timespec to be incremented + * @ns: unsigned nanoseconds value to be added + */ +static inline void timespec_add_ns(struct timespec *a, u64 ns) +{ + ns += a->tv_nsec; + while(unlikely(ns >= NSEC_PER_SEC)) { + ns -= NSEC_PER_SEC; + a->tv_sec++; + } + a->tv_nsec = ns; +} #endif /* __KERNEL__ */ #define NFDBITS __NFDBITS Index: linux/include/linux/timer.h =================================================================== --- linux.orig/include/linux/timer.h +++ linux/include/linux/timer.h @@ -3,6 +3,7 @@ #include #include +#include #include #include @@ -16,9 +17,15 @@ struct timer_list { unsigned long data; struct tvec_t_base_s *base; +#ifdef CONFIG_TIMER_INFO + void 
*start_site; + char start_comm[16]; + int start_pid; +#endif }; extern struct tvec_t_base_s boot_tvec_bases; +extern unsigned int timeout_granularity; #define TIMER_INITIALIZER(_function, _expires, _data) { \ .function = (_function), \ @@ -62,7 +69,52 @@ extern int del_timer(struct timer_list * extern int __mod_timer(struct timer_list *timer, unsigned long expires); extern int mod_timer(struct timer_list *timer, unsigned long expires); +/* + * Return when the next timer-wheel timeout occurs (in absolute jiffies), + * locks the timer base: + */ extern unsigned long next_timer_interrupt(void); +/* + * Return when the next timer-wheel timeout occurs (in absolute jiffies), + * locks the timer base and does the comparison against the given + * jiffie. + */ +extern unsigned long get_next_timer_interrupt(unsigned long now); + +/* + * Timer-top info: + */ +#ifdef CONFIG_TIMER_INFO +extern int account_timer(struct timer_list *timer); + +extern void __timer_set_start_info(struct timer_list *timer, void *addr); + +static inline void timer_set_start_info(struct timer_list *timer) +{ + __timer_set_start_info(timer, __builtin_return_address(0)); +} + +static inline void timer_clear_start_info(struct timer_list *timer) +{ + timer->start_site = NULL; +} +#else +static inline int account_timer(struct timer_list *timer) +{ + return 0; +} + +static inline void timer_set_start_info(struct timer_list *timer) +{ +} + +static inline void timer_clear_start_info(struct timer_list *timer) +{ +} +#endif + +extern void delayed_work_timer_fn(unsigned long __data); + /*** * add_timer - start a timer Index: linux/include/linux/timex.h =================================================================== --- linux.orig/include/linux/timex.h +++ linux/include/linux/timex.h @@ -305,7 +305,7 @@ time_interpolator_reset(void) #endif /* !CONFIG_TIME_INTERPOLATION */ /* Returns how long ticks are at present, in ns / 2^(SHIFT_SCALE-10). 
*/ -extern u64 current_tick_length(void); +extern u64 current_tick_length(long); extern int do_adjtimex(struct timex *); Index: linux/init/main.c =================================================================== --- linux.orig/init/main.c +++ linux/init/main.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #include @@ -486,10 +487,12 @@ asmlinkage void __init start_kernel(void rcu_init(); init_IRQ(); pidhash_init(); + init_clockevents(); init_timers(); hrtimers_init(); softirq_init(); time_init(); + timekeeping_init(); /* * HACK ALERT! This is early. We're enabling the console before Index: linux/kernel/Makefile =================================================================== --- linux.orig/kernel/Makefile +++ linux/kernel/Makefile @@ -10,11 +10,13 @@ obj-y = sched.o fork.o exec_domain.o kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o +obj-y += time/ obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o obj-$(CONFIG_FUTEX) += futex.o ifeq ($(CONFIG_COMPAT),y) obj-$(CONFIG_FUTEX) += futex_compat.o endif +obj-$(CONFIG_TIMER_INFO) += timer_top.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += cpu.o spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o Index: linux/kernel/hrtimer.c =================================================================== --- linux.orig/kernel/hrtimer.c +++ linux/kernel/hrtimer.c @@ -1,8 +1,8 @@ /* * linux/kernel/hrtimer.c * - * Copyright(C) 2005, Thomas Gleixner - * Copyright(C) 2005, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2005, 2006 Thomas Gleixner + * Copyright(C) 2005, 2006 Red Hat, Inc., Ingo Molnar * * High-resolution kernel timers * @@ -36,7 +36,10 @@ #include #include #include +#include #include +#include +#include #include @@ -45,7 +48,7 @@ * * returns the time in ktime_t format */ -static ktime_t ktime_get(void) +ktime_t ktime_get(void) { struct timespec now; @@ -59,7 +62,7 @@ static ktime_t ktime_get(void) * * returns the time in ktime_t format */ -static ktime_t 
ktime_get_real(void) +ktime_t ktime_get_real(void) { struct timespec now; @@ -79,21 +82,22 @@ EXPORT_SYMBOL_GPL(ktime_get_real); * This ensures that we capture erroneous accesses to these clock ids * rather than moving them into the range of valid clock id's. */ - -#define MAX_HRTIMER_BASES 2 - -static DEFINE_PER_CPU(struct hrtimer_base, hrtimer_bases[MAX_HRTIMER_BASES]) = +DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = { + + .clock_base = { - .index = CLOCK_REALTIME, - .get_time = &ktime_get_real, - .resolution = KTIME_REALTIME_RES, - }, - { - .index = CLOCK_MONOTONIC, - .get_time = &ktime_get, - .resolution = KTIME_MONOTONIC_RES, - }, + { + .index = CLOCK_REALTIME, + .get_time = &ktime_get_real, + .resolution = KTIME_REALTIME_RES, + }, + { + .index = CLOCK_MONOTONIC, + .get_time = &ktime_get, + .resolution = KTIME_MONOTONIC_RES, + }, + } }; /** @@ -126,7 +130,7 @@ EXPORT_SYMBOL_GPL(ktime_get_ts); * Get the coarse grained time at the softirq based on xtime and * wall_to_monotonic. 
*/ -static void hrtimer_get_softirq_time(struct hrtimer_base *base) +static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) { ktime_t xtim, tomono; unsigned long seq; @@ -138,8 +142,9 @@ static void hrtimer_get_softirq_time(str } while (read_seqretry(&xtime_lock, seq)); - base[CLOCK_REALTIME].softirq_time = xtim; - base[CLOCK_MONOTONIC].softirq_time = ktime_add(xtim, tomono); + base->clock_base[CLOCK_REALTIME].softirq_time = xtim; + base->clock_base[CLOCK_MONOTONIC].softirq_time = + ktime_add(xtim, tomono); } /* @@ -148,7 +153,12 @@ static void hrtimer_get_softirq_time(str */ #ifdef CONFIG_SMP -#define set_curr_timer(b, t) do { (b)->curr_timer = (t); } while (0) +static inline int base_same_cpu(struct hrtimer_clock_base *base) +{ + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + + return (base == &cpu_base->clock_base[base->index]); +} /* * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock @@ -162,19 +172,20 @@ static void hrtimer_get_softirq_time(str * possible to set timer->base = NULL and drop the lock: the timer remains * locked. */ -static struct hrtimer_base *lock_hrtimer_base(const struct hrtimer *timer, - unsigned long *flags) +static +struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, + unsigned long *flags) { - struct hrtimer_base *base; + struct hrtimer_clock_base *base; for (;;) { base = timer->base; if (likely(base != NULL)) { - spin_lock_irqsave(&base->lock, *flags); + spin_lock_irqsave(&base->cpu_base->lock, *flags); if (likely(base == timer->base)) return base; /* The timer has migrated to another CPU: */ - spin_unlock_irqrestore(&base->lock, *flags); + spin_unlock_irqrestore(&base->cpu_base->lock, *flags); } cpu_relax(); } @@ -183,12 +194,14 @@ static struct hrtimer_base *lock_hrtimer /* * Switch the timer base to the current CPU when possible. 
*/ -static inline struct hrtimer_base * -switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_base *base) +static inline struct hrtimer_clock_base * +switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base) { - struct hrtimer_base *new_base; + struct hrtimer_clock_base *new_base; + struct hrtimer_cpu_base *new_cpu_base; - new_base = &__get_cpu_var(hrtimer_bases[base->index]); + new_cpu_base = &__get_cpu_var(hrtimer_bases); + new_base = &new_cpu_base->clock_base[base->index]; if (base != new_base) { /* @@ -200,33 +213,35 @@ switch_hrtimer_base(struct hrtimer *time * completed. There is no conflict as we hold the lock until * the timer is enqueued. */ - if (unlikely(base->curr_timer == timer)) + if (unlikely(base->cpu_base->curr_timer == timer)) return base; /* See the comment in lock_timer_base() */ timer->base = NULL; - spin_unlock(&base->lock); - spin_lock(&new_base->lock); + spin_unlock(&base->cpu_base->lock); + spin_lock(&new_base->cpu_base->lock); timer->base = new_base; } return new_base; } -#else /* CONFIG_SMP */ +# define set_curr_timer(b, t) do { (b)->curr_timer = (t); } while (0) -#define set_curr_timer(b, t) do { } while (0) +#else /* CONFIG_SMP */ -static inline struct hrtimer_base * +static inline struct hrtimer_clock_base * lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) { - struct hrtimer_base *base = timer->base; + struct hrtimer_clock_base *base = timer->base; - spin_lock_irqsave(&base->lock, *flags); + spin_lock_irqsave(&base->cpu_base->lock, *flags); return base; } -#define switch_hrtimer_base(t, b) (b) +# define switch_hrtimer_base(t, b) (b) +# define base_same_cpu(b) 1 +# define set_curr_timer(b, t) do { } while (0) #endif /* !CONFIG_SMP */ @@ -258,9 +273,6 @@ ktime_t ktime_add_ns(const ktime_t kt, u return ktime_add(kt, tmp); } - -#else /* CONFIG_KTIME_SCALAR */ - # endif /* !CONFIG_KTIME_SCALAR */ /* @@ -288,13 +300,479 @@ static unsigned long ktime_divns(const k # define ktime_divns(kt, div) 
(unsigned long)((kt).tv64 / (div)) #endif /* BITS_PER_LONG >= 64 */ +/* High resolution timer related functions */ +#ifdef CONFIG_HIGH_RES_TIMERS + +static ktime_t last_jiffies_update; + +/* + * Reprogramm the event source with checking both queues for the + * next event + * Called with interrupts disabled and base->lock held + */ +static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base) +{ + int i; + struct hrtimer_clock_base *base = cpu_base->clock_base; + ktime_t expires; + + cpu_base->expires_next.tv64 = KTIME_MAX; + + for (i = HRTIMER_MAX_CLOCK_BASES; i ; i--, base++) { + struct hrtimer *timer; + + if (!base->first) + continue; + timer = rb_entry(base->first, struct hrtimer, node); + expires = ktime_sub(timer->expires, base->offset); + if (expires.tv64 < cpu_base->expires_next.tv64) + cpu_base->expires_next = expires; + } + + if (cpu_base->expires_next.tv64 != KTIME_MAX) + clockevents_set_next_event(cpu_base->expires_next, 1); + else + clockevents_stop_next_event(); +} + +/* + * Shared reprogramming for clock_realtime and clock_monotonic + * + * When a new expires first timer is enqueued, we have + * to check, whether it expires earlier than the timer + * for which the hrt time source was armed. 
+ * + * Called with interrupts disabled and base->cpu_base.lock held + */ +static int hrtimer_reprogram(struct hrtimer *timer, + struct hrtimer_clock_base *base) +{ + ktime_t *expires_next = &__get_cpu_var(hrtimer_bases).expires_next; + ktime_t expires = ktime_sub(timer->expires, base->offset); + int res; + + if (!base_same_cpu(base)) + return 0; + + if (expires.tv64 >= expires_next->tv64) + return 0; + + res = clockevents_set_next_event(expires, 0); + if (!res) + *expires_next = expires; + return res; +} + + +/* + * Retrigger next event is called after clock was set + */ +static void retrigger_next_event(void *arg) +{ + struct hrtimer_cpu_base *base; + struct timespec realtime_offset; + + base = &per_cpu(hrtimer_bases, smp_processor_id()); + + /* Adjust CLOCK_REALTIME offset */ + spin_lock(&base->lock); + set_normalized_timespec(&realtime_offset, + -wall_to_monotonic.tv_sec, + -wall_to_monotonic.tv_nsec); + base->clock_base[CLOCK_REALTIME].offset = + timespec_to_ktime(realtime_offset); + + hrtimer_force_reprogram(base); + spin_unlock(&base->lock); +} + +/* + * Clock realtime was set + * + * Change the offset of the realtime clock vs. the monotonic + * clock. Called with xtime lock held ! + * + * We might have to reprogram the high resolution timer interrupt. On + * SMP we call the architecture specific code to retrigger _all_ high + * resolution timer interrupts. On UP we just disable interrupts and + * call the high resolution interrupt code. 
+ */ +void clock_was_set(void) +{ + preempt_disable(); + local_irq_disable(); + + if (hrtimer_hres_active) { + retrigger_next_event(NULL); + local_irq_enable(); + + if (smp_call_function(retrigger_next_event, NULL, 1, 1)) + BUG(); + } else + local_irq_enable(); + preempt_enable(); +} + +/** + * hrtimer_clock_notify - A clock source or a clock event has been installed + * + * Set the check_clocks bit on every CPU base so each CPU rechecks its clocks + */ +void hrtimer_clock_notify(void) +{ + int i; + + for (i = 0; i < NR_CPUS; i++) + set_bit(0, &per_cpu(hrtimer_bases, i).check_clocks); +} + + +static const ktime_t nsec_per_hz = { .tv64 = NSEC_PER_SEC / HZ }; + +/* + * We switched off the global tick source when switching to high resolution + * mode. Update jiffies64. + * + * Must be called with interrupts disabled ! + */ +static void update_jiffies64(ktime_t now) +{ + ktime_t delta; + + write_seqlock(&xtime_lock); + + delta = ktime_sub(now, last_jiffies_update); + if (delta.tv64 >= nsec_per_hz.tv64) { + + delta = ktime_sub(delta, nsec_per_hz); + last_jiffies_update = ktime_add(last_jiffies_update, + nsec_per_hz); + + /* Slow path: further whole tick periods have elapsed */ + if (unlikely(delta.tv64 >= nsec_per_hz.tv64)) { + s64 incr = ktime_to_ns(nsec_per_hz); + unsigned long orun = ktime_divns(delta, incr); + + last_jiffies_update = ktime_add_ns(last_jiffies_update, + incr * orun); + jiffies_64 += orun; + } + do_timer(NULL); + } + write_sequnlock(&xtime_lock); +} + +#ifdef CONFIG_NO_HZ +/* + * Called from interrupt entry when the CPU was idle + */ +void update_jiffies(void) +{ + unsigned long flags; + ktime_t now; + + if (unlikely(!hrtimer_hres_active)) + return; + + now = ktime_get(); + + local_irq_save(flags); + update_jiffies64(now); + local_irq_restore(flags); +} + +/* + * Called from the idle thread so careful! 
+ */ +int hrtimer_stop_sched_tick(void) +{ + int cpu = smp_processor_id(); + struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); + unsigned long seq, last_jiffies, next_jiffies; + ktime_t last_update, expires; + unsigned long delta_jiffies; + unsigned long flags; + + if (unlikely(!hrtimer_hres_active)) + return 0; + + local_irq_save(flags); + + do { + seq = read_seqbegin(&xtime_lock); + last_update = last_jiffies_update; + last_jiffies = jiffies; + } while (read_seqretry(&xtime_lock, seq)); + + next_jiffies = get_next_timer_interrupt(last_jiffies); + delta_jiffies = next_jiffies - last_jiffies; + + cpu_base->idle_calls++; + + if ((long)delta_jiffies >= 1) { + /* + * Save the current tick time, so we can restart the + * scheduler tick when we get woken up before the next + * wheel timer expires + */ + cpu_base->idle_tick = cpu_base->sched_timer.expires; + expires = ktime_add_ns(last_update, + nsec_per_hz.tv64 * delta_jiffies); + hrtimer_start(&cpu_base->sched_timer, expires, HRTIMER_ABS); + cpu_base->idle_sleeps++; + cpu_base->idle_jiffies = last_jiffies; + } else { + /* Keep the timer alive */ + cpu_base->idle_tick.tv64 = 0; + if ((long) delta_jiffies < 0) + raise_softirq(TIMER_SOFTIRQ); + } + + if (local_softirq_pending()) { + inc_preempt_count(); + do_softirq(); + dec_preempt_count(); + } + + /* + * RCU normally depends on the timer IRQ kicking completion + * in every tick. 
We have to do this here now: + */ + if (rcu_pending(cpu)) { + /* + * We are in quiescent state, so advance callbacks: + */ + rcu_advance_callbacks(cpu, 1); + local_irq_enable(); + local_bh_disable(); + rcu_process_callbacks(0); + local_bh_enable(); + } + local_irq_restore(flags); + + return need_resched(); +} + +void hrtimer_restart_sched_tick(void) +{ + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + unsigned long flags; + ktime_t now; + + if (!hrtimer_hres_active || cpu_base->idle_tick.tv64 == 0) + return; + + /* Update jiffies first */ + now = ktime_get(); + + local_irq_save(flags); + update_jiffies64(now); + + /* + * update_process_times() would otherwise account the time we slept to + * whatever context the next sched tick runs in. Enforce that this + * is accounted to idle ! + */ + add_preempt_count(HARDIRQ_OFFSET); + update_process_times(0); + sub_preempt_count(HARDIRQ_OFFSET); + + cpu_base->idle_sleeptime += jiffies - cpu_base->idle_jiffies; + + hrtimer_cancel(&cpu_base->sched_timer); + cpu_base->sched_timer.expires = cpu_base->idle_tick; + hrtimer_forward(&cpu_base->sched_timer, now, nsec_per_hz); + hrtimer_start(&cpu_base->sched_timer, cpu_base->sched_timer.expires, + HRTIMER_ABS); + local_irq_restore(flags); +} + +void show_no_hz_stats(struct seq_file *p) +{ + int cpu; + unsigned long calls = 0, sleeps = 0, time = 0, events = 0; + + for_each_online_cpu(cpu) { + struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu); + + calls += base->idle_calls; + sleeps += base->idle_sleeps; + time += base->idle_sleeptime; + events += base->events; + + seq_printf(p, "nohz cpu%d I:%lu S:%lu T:%lu A:%lu E: %lu\n", + cpu, base->idle_calls, base->idle_sleeps, + base->idle_sleeptime, base->idle_sleeps ? + base->idle_sleeptime / base->idle_sleeps : 0, base->events); + } +#ifdef CONFIG_SMP + seq_printf(p, "nohz total I:%lu S:%lu T:%lu A:%lu E:%lu\n", + calls, sleeps, time, sleeps ? 
time / sleeps : 0, events); +#endif +} + +#endif + +/* + * We rearm the timer until we get disabled by the idle code + */ +static int hrtimer_sched_tick(struct hrtimer *timer) +{ + unsigned long flags; + struct hrtimer_cpu_base *cpu_base = + container_of(timer, struct hrtimer_cpu_base, sched_timer); + + local_irq_save(flags); + /* + * Do not call, when we are not in irq context and have + * no valid regs pointer + */ + if (cpu_base->sched_regs) { + update_process_times(user_mode(cpu_base->sched_regs)); + profile_tick(CPU_PROFILING, cpu_base->sched_regs); + } + + hrtimer_forward(timer, hrtimer_cb_get_time(timer), nsec_per_hz); + local_irq_restore(flags); + + return HRTIMER_RESTART; +} + +/* + * A change in the clock source or clock events was detected. + * Check the clock source and the events, whether we can switch to + * high resolution mode or not. + * + * TODO: Handle the removal of clock sources / events + */ +static void hrtimer_check_clocks(void) +{ + struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); + unsigned long flags; + ktime_t now; + + if (!test_and_clear_bit(0, &base->check_clocks)) + return; + + if (!timekeeping_is_continuous()) + return; + + if (!clockevents_next_event_available()) + return; + + local_irq_save(flags); + + if (base->hres_active) { + local_irq_restore(flags); + return; + } + + now = ktime_get(); + if (clockevents_init_next_event()) { + local_irq_restore(flags); + return; + } + base->hres_active = 1; + + /* Did we start the jiffies update yet ? 
*/ + if (last_jiffies_update.tv64 == 0) { + write_seqlock(&xtime_lock); + last_jiffies_update = now; + write_sequnlock(&xtime_lock); + } + + /* + * Emulate tick processing via per-CPU hrtimers: + */ + hrtimer_init(&base->sched_timer, CLOCK_MONOTONIC, HRTIMER_REL); + base->sched_timer.function = hrtimer_sched_tick; + base->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE; + hrtimer_start(&base->sched_timer, nsec_per_hz, HRTIMER_REL); + + /* "Retrigger" the interrupt to get things going */ + retrigger_next_event(NULL); + local_irq_restore(flags); + printk(KERN_INFO "hrtimers: Switched to high resolution mode CPU %d\n", + smp_processor_id()); +} + +static inline int hrtimer_cb_pending(const struct hrtimer *timer) +{ + return !list_empty(&timer->cb_entry); +} + +static inline void hrtimer_remove_cb_pending(struct hrtimer *timer) +{ + list_del_init(&timer->cb_entry); +} + +static inline void hrtimer_add_cb_pending(struct hrtimer *timer, + struct hrtimer_clock_base *base) +{ + list_add_tail(&timer->cb_entry, &base->cpu_base->cb_pending); + timer->state = HRTIMER_PENDING; +} + +static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) +{ + base->expires_next.tv64 = KTIME_MAX; + set_bit(0, &base->check_clocks); + base->hres_active = 0; + INIT_LIST_HEAD(&base->cb_pending); +} + +static inline void hrtimer_init_timer_hres(struct hrtimer *timer) +{ + INIT_LIST_HEAD(&timer->cb_entry); +} + +static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, + struct hrtimer_clock_base *base) +{ + /* + * High resolution timers, when active try to + * reprogram. If the timer is in the past we just move + * it to the expired list and schedule the softirq. + */ + if (hrtimer_hres_active && hrtimer_reprogram(timer, base)) { + /* + * HRTIMER_CB_IRQSAFE_NO_RESTART signals that the timer is not + * requested to be rearmed by the callback function. Just call + * the callback instead of going through the softirq. 
+ */ + if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_RESTART) { + int ret = timer->function(timer); + + BUG_ON(ret != HRTIMER_NORESTART); + } else { + hrtimer_add_cb_pending(timer, base); + raise_softirq(HRTIMER_SOFTIRQ); + } + return 1; + } + return 0; +} + +#else + +# define hrtimer_hres_active 0 +# define hrtimer_check_clocks() do { } while (0) +# define hrtimer_enqueue_reprogram(t,b) 0 +# define hrtimer_force_reprogram(b) do { } while (0) +# define hrtimer_cb_pending(t) 0 +# define hrtimer_remove_cb_pending(t) do { } while (0) +# define hrtimer_init_hres(c) do { } while (0) +# define hrtimer_init_timer_hres(t) do { } while (0) + +#endif /* CONFIG_HIGH_RES_TIMERS */ + /* * Counterpart to lock_timer_base above: */ static inline void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) { - spin_unlock_irqrestore(&timer->base->lock, *flags); + spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags); } /** @@ -345,7 +823,8 @@ hrtimer_forward(struct hrtimer *timer, k * The timer is inserted in expiry order. Insertion into the * red black tree is O(log(n)). Must hold the base lock. 
*/ -static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) +static void enqueue_hrtimer(struct hrtimer *timer, + struct hrtimer_clock_base *base) { struct rb_node **link = &base->active.rb_node; struct rb_node *parent = NULL; @@ -371,12 +850,18 @@ static void enqueue_hrtimer(struct hrtim * Insert the timer to the rbtree and check whether it * replaces the first pending timer */ - rb_link_node(&timer->node, parent, link); - rb_insert_color(&timer->node, &base->active); - if (!base->first || timer->expires.tv64 < - rb_entry(base->first, struct hrtimer, node)->expires.tv64) + rb_entry(base->first, struct hrtimer, node)->expires.tv64) { + + if (hrtimer_enqueue_reprogram(timer, base)) + return; + base->first = &timer->node; + } + + rb_link_node(&timer->node, parent, link); + rb_insert_color(&timer->node, &base->active); + timer->state = HRTIMER_ACTIVE; } /* @@ -384,26 +869,36 @@ static void enqueue_hrtimer(struct hrtim * * Caller must hold the base lock. */ -static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) -{ - /* - * Remove the timer from the rbtree and replace the - * first entry pointer if necessary. - */ - if (base->first == &timer->node) - base->first = rb_next(&timer->node); - rb_erase(&timer->node, &base->active); - timer->node.rb_parent = HRTIMER_INACTIVE; +static void __remove_hrtimer(struct hrtimer *timer, + struct hrtimer_clock_base *base, + enum hrtimer_state newstate, int reprogram) +{ + /* High res. callback list. NOP for !HIGHRES */ + if (hrtimer_cb_pending(timer)) + hrtimer_remove_cb_pending(timer); + else { + /* + * Remove the timer from the rbtree and replace the + * first entry pointer if necessary. 
+ */ + if (base->first == &timer->node) { + base->first = rb_next(&timer->node); + if (reprogram && hrtimer_hres_active) + hrtimer_force_reprogram(base->cpu_base); + } + rb_erase(&timer->node, &base->active); + } + timer->state = newstate; } /* * remove hrtimer, called with base lock held */ static inline int -remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) +remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) { if (hrtimer_active(timer)) { - __remove_hrtimer(timer, base); + __remove_hrtimer(timer, base, HRTIMER_INACTIVE, 1); return 1; } return 0; @@ -423,7 +918,7 @@ remove_hrtimer(struct hrtimer *timer, st int hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) { - struct hrtimer_base *base, *new_base; + struct hrtimer_clock_base *base, *new_base; unsigned long flags; int ret; @@ -471,13 +966,13 @@ EXPORT_SYMBOL_GPL(hrtimer_start); */ int hrtimer_try_to_cancel(struct hrtimer *timer) { - struct hrtimer_base *base; + struct hrtimer_clock_base *base; unsigned long flags; int ret = -1; base = lock_hrtimer_base(timer, &flags); - if (base->curr_timer != timer) + if (base->cpu_base->curr_timer != timer) ret = remove_hrtimer(timer, base); unlock_hrtimer_base(timer, &flags); @@ -515,7 +1010,7 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel); */ ktime_t hrtimer_get_remaining(const struct hrtimer *timer) { - struct hrtimer_base *base; + struct hrtimer_clock_base *base; unsigned long flags; ktime_t rem; @@ -536,26 +1031,36 @@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining) */ ktime_t hrtimer_get_next_event(void) { - struct hrtimer_base *base = __get_cpu_var(hrtimer_bases); + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + struct hrtimer_clock_base *base = cpu_base->clock_base; ktime_t delta, mindelta = { .tv64 = KTIME_MAX }; unsigned long flags; int i; - for (i = 0; i < MAX_HRTIMER_BASES; i++, base++) { + /* + * In high-res mode we dont need to get the next high-res + * event on a tickless system: + */ + if 
(hrtimer_hres_active) + return mindelta; + + spin_lock_irqsave(&cpu_base->lock, flags); + + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { struct hrtimer *timer; - spin_lock_irqsave(&base->lock, flags); - if (!base->first) { - spin_unlock_irqrestore(&base->lock, flags); + if (!base->first) continue; - } + timer = rb_entry(base->first, struct hrtimer, node); delta.tv64 = timer->expires.tv64; - spin_unlock_irqrestore(&base->lock, flags); delta = ktime_sub(delta, base->get_time()); if (delta.tv64 < mindelta.tv64) mindelta.tv64 = delta.tv64; } + + spin_unlock_irqrestore(&cpu_base->lock, flags); + if (mindelta.tv64 < 0) mindelta.tv64 = 0; return mindelta; @@ -572,17 +1077,18 @@ ktime_t hrtimer_get_next_event(void) void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode) { - struct hrtimer_base *bases; + struct hrtimer_cpu_base *cpu_base; memset(timer, 0, sizeof(struct hrtimer)); - bases = per_cpu(hrtimer_bases, raw_smp_processor_id()); + cpu_base = &__raw_get_cpu_var(hrtimer_bases); if (clock_id == CLOCK_REALTIME && mode != HRTIMER_ABS) clock_id = CLOCK_MONOTONIC; - timer->base = &bases[clock_id]; - timer->node.rb_parent = HRTIMER_INACTIVE; + timer->base = &cpu_base->clock_base[clock_id]; + timer->state = HRTIMER_INACTIVE; + hrtimer_init_timer_hres(timer); } EXPORT_SYMBOL_GPL(hrtimer_init); @@ -597,21 +1103,145 @@ EXPORT_SYMBOL_GPL(hrtimer_init); */ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) { - struct hrtimer_base *bases; + struct hrtimer_cpu_base *cpu_base; - bases = per_cpu(hrtimer_bases, raw_smp_processor_id()); - *tp = ktime_to_timespec(bases[which_clock].resolution); + cpu_base = &__raw_get_cpu_var(hrtimer_bases); + *tp = ktime_to_timespec(cpu_base->clock_base[which_clock].resolution); return 0; } EXPORT_SYMBOL_GPL(hrtimer_get_res); +#ifdef CONFIG_HIGH_RES_TIMERS + +/* + * High resolution timer interrupt + * Called with interrupts disabled + */ +void hrtimer_interrupt(struct pt_regs *regs) +{ + struct 
hrtimer_clock_base *base; + ktime_t expires_next, now; + int i, raise = 0, cpu = smp_processor_id(); + struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); + + BUG_ON(!cpu_base->hres_active); + + /* Store the regs for a possible sched_timer callback */ + cpu_base->sched_regs = regs; + cpu_base->events++; + + retry: + now = ktime_get(); + + /* Check whether jiffies needs an update */ + update_jiffies64(now); + + expires_next.tv64 = KTIME_MAX; + + base = cpu_base->clock_base; + + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { + ktime_t basenow; + struct rb_node *node; + + spin_lock(&cpu_base->lock); + + basenow = ktime_add(now, base->offset); + + while ((node = base->first)) { + struct hrtimer *timer; + + timer = rb_entry(node, struct hrtimer, node); + + if (basenow.tv64 < timer->expires.tv64) { + ktime_t expires; + + expires = ktime_sub(timer->expires, + base->offset); + if (expires.tv64 < expires_next.tv64) + expires_next = expires; + break; + } + __remove_hrtimer(timer, base, HRTIMER_PENDING, 0); + + if (timer->cb_mode != HRTIMER_CB_SOFTIRQ) { + if (timer->function(timer) == HRTIMER_RESTART) + enqueue_hrtimer(timer, base); + else + timer->state = HRTIMER_INACTIVE; + } else { + hrtimer_add_cb_pending(timer, base); + raise = 1; + } + } + spin_unlock(&cpu_base->lock); + base++; + } + + cpu_base->expires_next = expires_next; + + /* Reprogramming necessary ? */ + if (expires_next.tv64 != KTIME_MAX) { + if (clockevents_set_next_event(expires_next, 0)) + goto retry; + } + + /* Invalidate regs */ + cpu_base->sched_regs = NULL; + + /* Raise softirq ? 
*/ + if (raise) + raise_softirq(HRTIMER_SOFTIRQ); +} + +static void run_hrtimer_softirq(struct softirq_action *h) +{ + struct hrtimer_cpu_base *cpu_base; + + cpu_base = &per_cpu(hrtimer_bases, smp_processor_id()); + + spin_lock_irq(&cpu_base->lock); + + while (!list_empty(&cpu_base->cb_pending)) { + struct hrtimer *timer; + int (*fn)(struct hrtimer *); + int restart; + + timer = list_entry(cpu_base->cb_pending.next, + struct hrtimer, cb_entry); + fn = timer->function; + set_curr_timer(cpu_base, timer); + __remove_hrtimer(timer, timer->base, HRTIMER_INACTIVE, 0); + spin_unlock_irq(&cpu_base->lock); + + restart = fn(timer); + + spin_lock_irq(&cpu_base->lock); + + if (restart == HRTIMER_RESTART) { + BUG_ON(hrtimer_active(timer)); + enqueue_hrtimer(timer, timer->base); + } else if (hrtimer_active(timer)) { + /* Timer was rearmed on another CPU: */ + if (timer->base->first == &timer->node) + hrtimer_reprogram(timer, timer->base); + } + } + set_curr_timer(cpu_base, NULL); + spin_unlock_irq(&cpu_base->lock); +} + +#endif /* CONFIG_HIGH_RES_TIMERS */ + /* * Expire the per base hrtimer-queue: */ -static inline void run_hrtimer_queue(struct hrtimer_base *base) +static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base, + int index) { struct rb_node *node; + struct hrtimer_clock_base *base = &cpu_base->clock_base[index]; if (!base->first) return; @@ -619,7 +1249,7 @@ static inline void run_hrtimer_queue(str if (base->get_softirq_time) base->softirq_time = base->get_softirq_time(); - spin_lock_irq(&base->lock); + spin_lock_irq(&cpu_base->lock); while ((node = base->first)) { struct hrtimer *timer; @@ -631,35 +1261,44 @@ static inline void run_hrtimer_queue(str break; fn = timer->function; - set_curr_timer(base, timer); - __remove_hrtimer(timer, base); - spin_unlock_irq(&base->lock); + set_curr_timer(cpu_base, timer); + __remove_hrtimer(timer, base, HRTIMER_INACTIVE, 0); + spin_unlock_irq(&cpu_base->lock); restart = fn(timer); - spin_lock_irq(&base->lock); + 
spin_lock_irq(&cpu_base->lock); - if (restart != HRTIMER_NORESTART) { + if (restart == HRTIMER_RESTART) { BUG_ON(hrtimer_active(timer)); enqueue_hrtimer(timer, base); } } - set_curr_timer(base, NULL); - spin_unlock_irq(&base->lock); + set_curr_timer(cpu_base, NULL); + spin_unlock_irq(&cpu_base->lock); } /* * Called from timer softirq every jiffy, expire hrtimers: + * + * For HRT its the fall back code to run the softirq in the timer + * softirq context in case the hrtimer initialization failed or has + * not been done yet. */ void hrtimer_run_queues(void) { - struct hrtimer_base *base = __get_cpu_var(hrtimer_bases); + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); int i; - hrtimer_get_softirq_time(base); + hrtimer_check_clocks(); + + if (hrtimer_hres_active) + return; - for (i = 0; i < MAX_HRTIMER_BASES; i++) - run_hrtimer_queue(&base[i]); + hrtimer_get_softirq_time(cpu_base); + + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) + run_hrtimer_queue(cpu_base, i); } /* @@ -682,6 +1321,9 @@ void hrtimer_init_sleeper(struct hrtimer { sl->timer.function = hrtimer_wakeup; sl->task = task; +#ifdef CONFIG_HIGH_RES_TIMERS + sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_RESTART; +#endif } static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) @@ -692,7 +1334,8 @@ static int __sched do_nanosleep(struct h set_current_state(TASK_INTERRUPTIBLE); hrtimer_start(&t->timer, t->timer.expires, mode); - schedule(); + if (likely(t->task)) + schedule(); hrtimer_cancel(&t->timer); mode = HRTIMER_ABS; @@ -788,24 +1431,27 @@ sys_nanosleep(struct timespec __user *rq */ static void __devinit init_hrtimers_cpu(int cpu) { - struct hrtimer_base *base = per_cpu(hrtimer_bases, cpu); + struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); int i; - for (i = 0; i < MAX_HRTIMER_BASES; i++, base++) - spin_lock_init(&base->lock); + spin_lock_init(&cpu_base->lock); + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) + cpu_base->clock_base[i].cpu_base = 
cpu_base; + + hrtimer_init_hres(cpu_base); } #ifdef CONFIG_HOTPLUG_CPU -static void migrate_hrtimer_list(struct hrtimer_base *old_base, - struct hrtimer_base *new_base) +static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, + struct hrtimer_clock_base *new_base) { struct hrtimer *timer; struct rb_node *node; while ((node = rb_first(&old_base->active))) { timer = rb_entry(node, struct hrtimer, node); - __remove_hrtimer(timer, old_base); + __remove_hrtimer(timer, old_base, HRTIMER_INACTIVE, 0); timer->base = new_base; enqueue_hrtimer(timer, new_base); } @@ -813,29 +1459,27 @@ static void migrate_hrtimer_list(struct static void migrate_hrtimers(int cpu) { - struct hrtimer_base *old_base, *new_base; + struct hrtimer_cpu_base *old_base, *new_base; int i; BUG_ON(cpu_online(cpu)); - old_base = per_cpu(hrtimer_bases, cpu); - new_base = get_cpu_var(hrtimer_bases); + old_base = &per_cpu(hrtimer_bases, cpu); + new_base = &get_cpu_var(hrtimer_bases); local_irq_disable(); - for (i = 0; i < MAX_HRTIMER_BASES; i++) { - - spin_lock(&new_base->lock); - spin_lock(&old_base->lock); + spin_lock(&new_base->lock); + spin_lock(&old_base->lock); + /* Move the timers of every clock base, indexed by i, to this CPU */ + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { BUG_ON(old_base->curr_timer); - migrate_hrtimer_list(old_base, new_base); - - spin_unlock(&old_base->lock); - spin_unlock(&new_base->lock); - old_base++; - new_base++; + migrate_hrtimer_list(old_base->clock_base + i, + new_base->clock_base + i); } + spin_unlock(&old_base->lock); + spin_unlock(&new_base->lock); local_irq_enable(); put_cpu_var(hrtimer_bases); @@ -875,5 +1519,8 @@ void __init hrtimers_init(void) hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); register_cpu_notifier(&hrtimers_nb); +#ifdef CONFIG_HIGH_RES_TIMERS + open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq, NULL); +#endif } Index: linux/kernel/itimer.c =================================================================== --- linux.orig/kernel/itimer.c +++ 
linux/kernel/itimer.c @@ -136,7 +136,7 @@ int it_real_fn(struct hrtimer *timer) send_group_sig_info(SIGALRM, SEND_SIG_PRIV, sig->tsk); if (sig->it_real_incr.tv64 != 0) { - hrtimer_forward(timer, timer->base->softirq_time, + hrtimer_forward(timer, hrtimer_cb_get_time(timer), sig->it_real_incr); return HRTIMER_RESTART; } Index: linux/kernel/posix-timers.c =================================================================== --- linux.orig/kernel/posix-timers.c +++ linux/kernel/posix-timers.c @@ -356,7 +356,7 @@ static int posix_timer_fn(struct hrtimer if (timr->it.real.interval.tv64 != 0) { timr->it_overrun += hrtimer_forward(timer, - timer->base->softirq_time, + hrtimer_cb_get_time(timer), timr->it.real.interval); ret = HRTIMER_RESTART; ++timr->it_requeue_pending; Index: linux/kernel/printk.c =================================================================== --- linux.orig/kernel/printk.c +++ linux/kernel/printk.c @@ -337,7 +337,10 @@ static void __call_console_drivers(unsig static void _call_console_drivers(unsigned long start, unsigned long end, int msg_log_level) { - if (msg_log_level < console_loglevel && + if ( +#ifndef CONFIG_PRINTK_IGNORE_LOGLEVEL + msg_log_level < console_loglevel && +#endif console_drivers && start != end) { if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) { /* wrapped write */ Index: linux/kernel/rcupdate.c =================================================================== --- linux.orig/kernel/rcupdate.c +++ linux/kernel/rcupdate.c @@ -449,7 +449,7 @@ static void __rcu_process_callbacks(stru rcu_do_batch(rdp); } -static void rcu_process_callbacks(unsigned long unused) +void rcu_process_callbacks(unsigned long unused) { __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data)); __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); @@ -504,6 +504,17 @@ int rcu_needs_cpu(int cpu) return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu)); } +void rcu_advance_callbacks(int cpu, int user) +{ + if (user || + 
(idle_cpu(cpu) && !in_softirq() && + hardirq_count() <= (1 << HARDIRQ_SHIFT))) { + rcu_qsctr_inc(cpu); + rcu_bh_qsctr_inc(cpu); + } else if (!in_softirq()) + rcu_bh_qsctr_inc(cpu); +} + void rcu_check_callbacks(int cpu, int user) { if (user || Index: linux/kernel/sched.c =================================================================== --- linux.orig/kernel/sched.c +++ linux/kernel/sched.c @@ -818,6 +818,11 @@ static void deactivate_task(struct task_ * the target CPU. */ #ifdef CONFIG_SMP + +#ifndef tsk_is_polling +#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) +#endif + static void resched_task(task_t *p) { int cpu; @@ -833,9 +838,9 @@ static void resched_task(task_t *p) if (cpu == smp_processor_id()) return; - /* NEED_RESCHED must be visible before we test POLLING_NRFLAG */ + /* NEED_RESCHED must be visible before we test polling */ smp_mb(); - if (!test_tsk_thread_flag(p, TIF_POLLING_NRFLAG)) + if (!tsk_is_polling(p)) smp_send_reschedule(cpu); } #else @@ -4142,7 +4147,7 @@ EXPORT_SYMBOL(yield); */ void __sched io_schedule(void) { - struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); + struct runqueue *rq = &__raw_get_cpu_var(runqueues); atomic_inc(&rq->nr_iowait); schedule(); @@ -4153,7 +4158,7 @@ EXPORT_SYMBOL(io_schedule); long __sched io_schedule_timeout(long timeout) { - struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); + struct runqueue *rq = &__raw_get_cpu_var(runqueues); long ret; atomic_inc(&rq->nr_iowait); Index: linux/kernel/softlockup.c =================================================================== --- linux.orig/kernel/softlockup.c +++ linux/kernel/softlockup.c @@ -36,7 +36,7 @@ static struct notifier_block panic_block void touch_softlockup_watchdog(void) { - per_cpu(touch_timestamp, raw_smp_processor_id()) = jiffies; + __raw_get_cpu_var(touch_timestamp) = jiffies; } EXPORT_SYMBOL(touch_softlockup_watchdog); Index: linux/kernel/sysctl.c 
=================================================================== --- linux.orig/kernel/sysctl.c +++ linux/kernel/sysctl.c @@ -487,6 +487,7 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif +#if 0 #ifdef CONFIG_MAGIC_SYSRQ { .ctl_name = KERN_SYSRQ, @@ -497,6 +498,7 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif +#endif { .ctl_name = KERN_CADPID, .procname = "cad_pid", @@ -579,6 +581,16 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif +#ifdef CONFIG_NO_IDLE_HZ + { + .ctl_name = KERN_TIMEOUT_GRANULARITY, + .procname = "timeout_granularity", + .data = &timeout_granularity, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif { .ctl_name = KERN_PIDMAX, .procname = "pid_max", Index: linux/kernel/time.c =================================================================== --- linux.orig/kernel/time.c +++ linux/kernel/time.c @@ -523,6 +523,7 @@ EXPORT_SYMBOL(do_gettimeofday); #else +#ifndef CONFIG_GENERIC_TIME /* * Simulate gettimeofday using do_gettimeofday which only allows a timeval * and therefore only yields usec accuracy @@ -537,6 +538,7 @@ void getnstimeofday(struct timespec *tv) } EXPORT_SYMBOL_GPL(getnstimeofday); #endif +#endif /* Converts Gregorian date to seconds since 1970-01-01 00:00:00. * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 @@ -641,6 +643,273 @@ struct timeval ns_to_timeval(const s64 n return tv; } +/* + * Convert jiffies to milliseconds and back. 
+ * + * Avoid unnecessary multiplications/divisions in the + * two most common HZ cases: + */ +unsigned int jiffies_to_msecs(const unsigned long j) +{ +#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) + return (MSEC_PER_SEC / HZ) * j; +#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) + return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); +#else + return (j * MSEC_PER_SEC) / HZ; +#endif +} + +EXPORT_SYMBOL(jiffies_to_msecs); + +unsigned int jiffies_to_usecs(const unsigned long j) +{ +#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) + return (USEC_PER_SEC / HZ) * j; +#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) + return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC); +#else + return (j * USEC_PER_SEC) / HZ; +#endif +} + +EXPORT_SYMBOL(jiffies_to_usecs); + +/* + * When we convert to jiffies then we interpret incoming values + * the following way: + * + * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET) + * + * - 'too large' values [that would result in larger than + * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too. + * + * - all other values are converted to jiffies by either multiplying + * the input value by a factor or dividing it with a factor + * + * We must also be careful about 32-bit overflows. + */ +unsigned long msecs_to_jiffies(const unsigned int m) +{ + /* + * Negative value, means infinite timeout: + */ + if ((int)m < 0) + return MAX_JIFFY_OFFSET; + +#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) + /* + * HZ is equal to or smaller than 1000, and 1000 is a nice + * round multiple of HZ, divide with the factor between them, + * but round upwards: + */ + return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ); +#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) + /* + * HZ is larger than 1000, and HZ is a nice round multiple of + * 1000 - simply multiply with the factor between them. 
+ * + * But first make sure the multiplication result cannot + * overflow: + */ + if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) + return MAX_JIFFY_OFFSET; + + return m * (HZ / MSEC_PER_SEC); +#else + /* + * Generic case - multiply, round and divide. But first + * check, if we are doing a net multiplication, that + * we wouldn't overflow: + */ + if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) + return MAX_JIFFY_OFFSET; + + return (m * HZ + MSEC_PER_SEC - 1) / MSEC_PER_SEC; +#endif +} + +EXPORT_SYMBOL(msecs_to_jiffies); + +unsigned long usecs_to_jiffies(const unsigned int u) +{ + if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET)) + return MAX_JIFFY_OFFSET; +#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) + return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ); +#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) + return u * (HZ / USEC_PER_SEC); +#else + return (u * HZ + USEC_PER_SEC - 1) / USEC_PER_SEC; +#endif +} + +EXPORT_SYMBOL(usecs_to_jiffies); + +/* + * The TICK_NSEC - 1 rounds up the value to the next resolution. Note + * that a remainder subtract here would not do the right thing as the + * resolution values don't fall on second boundaries. I.e. the line: + * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding. + * + * Rather, we just shift the bits off the right. + * + * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec + * value to a scaled second value. + */ +unsigned long +timespec_to_jiffies(const struct timespec *value) +{ + unsigned long sec = value->tv_sec; + long nsec = value->tv_nsec + TICK_NSEC - 1; + + if (sec >= MAX_SEC_IN_JIFFIES){ + sec = MAX_SEC_IN_JIFFIES; + nsec = 0; + } + return (((u64)sec * SEC_CONVERSION) + + (((u64)nsec * NSEC_CONVERSION) >> + (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; + +} + +void +jiffies_to_timespec(const unsigned long jiffies, struct timespec *value) +{ + /* + * Convert jiffies to nanoseconds and separate with + * one divide. 
+ */ + u64 nsec = (u64)jiffies * TICK_NSEC; + value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &value->tv_nsec); +} + +/* Same for "timeval" + * + * Well, almost. The problem here is that the real system resolution is + * in nanoseconds and the value being converted is in micro seconds. + * Also for some machines (those that use HZ = 1024, in-particular), + * there is a LARGE error in the tick size in microseconds. + + * The solution we use is to do the rounding AFTER we convert the + * microsecond part. Thus the USEC_ROUND, the bits to be shifted off. + * Instruction wise, this should cost only an additional add with carry + * instruction above the way it was done above. + */ +unsigned long +timeval_to_jiffies(const struct timeval *value) +{ + unsigned long sec = value->tv_sec; + long usec = value->tv_usec; + + if (sec >= MAX_SEC_IN_JIFFIES){ + sec = MAX_SEC_IN_JIFFIES; + usec = 0; + } + return (((u64)sec * SEC_CONVERSION) + + (((u64)usec * USEC_CONVERSION + USEC_ROUND) >> + (USEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; +} + +void jiffies_to_timeval(const unsigned long jiffies, struct timeval *value) +{ + /* + * Convert jiffies to nanoseconds and separate with + * one divide. + */ + u64 nsec = (u64)jiffies * TICK_NSEC; + long tv_usec; + + value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &tv_usec); + tv_usec /= NSEC_PER_USEC; + value->tv_usec = tv_usec; +} + +/* + * Convert jiffies/jiffies_64 to clock_t and back. + */ +clock_t jiffies_to_clock_t(long x) +{ +#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 + return x / (HZ / USER_HZ); +#else + u64 tmp = (u64)x * TICK_NSEC; + do_div(tmp, (NSEC_PER_SEC / USER_HZ)); + return (long)tmp; +#endif +} + +EXPORT_SYMBOL(jiffies_to_clock_t); + +unsigned long clock_t_to_jiffies(unsigned long x) +{ +#if (HZ % USER_HZ)==0 + if (x >= ~0UL / (HZ / USER_HZ)) + return ~0UL; + return x * (HZ / USER_HZ); +#else + u64 jif; + + /* Don't worry about loss of precision here .. 
*/ + if (x >= ~0UL / HZ * USER_HZ) + return ~0UL; + + /* .. but do try to contain it here */ + jif = x * (u64) HZ; + do_div(jif, USER_HZ); + return jif; +#endif +} + +EXPORT_SYMBOL(clock_t_to_jiffies); + +u64 jiffies_64_to_clock_t(u64 x) +{ +#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 + do_div(x, HZ / USER_HZ); +#else + /* + * There are better ways that don't overflow early, + * but even this doesn't overflow in hundreds of years + * in 64 bits, so.. + */ + x *= TICK_NSEC; + do_div(x, (NSEC_PER_SEC / USER_HZ)); +#endif + return x; +} + +u64 nsec_to_clock_t(u64 x) +{ +#if (NSEC_PER_SEC % USER_HZ) == 0 + do_div(x, (NSEC_PER_SEC / USER_HZ)); +#elif (USER_HZ % 512) == 0 + x *= USER_HZ/512; + do_div(x, (NSEC_PER_SEC / 512)); +#else + /* + * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024, + * overflow after 64.99 years. + * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ... + */ + x *= 9; + do_div(x, (unsigned long)((9ull * NSEC_PER_SEC + (USER_HZ/2)) + / USER_HZ)); +#endif + return x; +} + +int nsec_to_timestamp(char *s, u64 t) +{ + unsigned long nsec_rem = do_div(t, NSEC_PER_SEC); + return sprintf(s, "[%5lu.%06lu]", (unsigned long)t, + nsec_rem/NSEC_PER_USEC); +} +__attribute__((weak)) unsigned long long timestamp_clock(void) +{ + return sched_clock(); +} + #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void) { Index: linux/kernel/time/Kconfig =================================================================== --- /dev/null +++ linux/kernel/time/Kconfig @@ -0,0 +1,35 @@ +# +# Timer subsystem related configuration options +# +config HIGH_RES_TIMERS + bool "High Resolution Timer Support" + depends on GENERIC_TIME + help + This option enables high resolution timer support. If your + hardware is not capable then this option only increases + the size of the kernel image. 
+ +config NO_HZ + bool "Tickless System (Dynamic Ticks)" + depends on GENERIC_TIME && HIGH_RES_TIMERS + help + This option enables a tickless system: timer interrupts will + only trigger on an as-needed basis both when the system is + busy and when the system is idle. + +config NO_IDLE_HZ + bool + default y + depends on NO_HZ + +config HIGH_RES_RESOLUTION + int "High Resolution Timer resolution (nanoseconds)" + depends on HIGH_RES_TIMERS + default 1000 + help + This sets the resolution in nanoseconds of the high resolution + timers. Too fine a resolution (small a number) will usually + not be observable due to normal system latencies. For an + 800 MHz processor about 10,000 (10 microseconds) is recommended as a + finest resolution. If you don't need that sort of resolution, + larger values may generate less overhead. Index: linux/kernel/time/Makefile =================================================================== --- /dev/null +++ linux/kernel/time/Makefile @@ -0,0 +1,3 @@ +obj-y += clocksource.o jiffies.o + +obj-$(CONFIG_GENERIC_TIME) += clockevents.o Index: linux/kernel/time/clockevents.c =================================================================== --- /dev/null +++ linux/kernel/time/clockevents.c @@ -0,0 +1,504 @@ +/* + * linux/kernel/time/clockevents.c + * + * This file contains functions which manage clock event drivers. 
+ * + * Copyright(C) 2005 Thomas Gleixner + * + * Kudos to Ingo Molnar for review, criticism, ideas + * + * We have two types of clock event devices: + * - global events (one device per system) + * - local events (one device per cpu) + * + * We assign the various time(r) related interrupts to those devices + * + * - global tick + * - profiling (per cpu) + * - next timer events (per cpu) + * + * TODO: + * - implement variable frequency profiling + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAX_CLOCK_EVENTS 4 +#define GLOBAL_CLOCK_EVENT MAX_CLOCK_EVENTS + +struct event_descr { + struct clock_event *event; + unsigned int mode; + unsigned int real_caps; + struct irqaction action; + cpumask_t cpumask; +}; + +struct local_events { + int installed; + struct event_descr events[MAX_CLOCK_EVENTS]; + struct clock_event *nextevt; +}; + +/* Variables related to the global event source */ +static struct event_descr global_eventsource; + +/* Variables related to the per cpu local event sources */ +static DEFINE_PER_CPU(struct local_events, local_eventsources); + +/* lock to protect the above */ +static DEFINE_SPINLOCK(events_lock); + +/* + * Math helper. 
Convert a latch value to ns + */ +unsigned long clockevent_delta2ns(unsigned long latch, struct clock_event *evt) +{ + u64 clc = ((u64) latch << evt->shift); + + do_div(clc, evt->mult); + if (clc < KTIME_MONOTONIC_RES.tv64) + clc = KTIME_MONOTONIC_RES.tv64; + if (clc > LONG_MAX) + clc = LONG_MAX; + + return (unsigned long) clc; +} + +/* + * Bootup and lowres handler: ticks only + */ +static void handle_tick(struct pt_regs *regs) +{ + BUG_ON(hrtimer_hres_active); + + write_seqlock(&xtime_lock); + do_timer(regs); + write_sequnlock(&xtime_lock); +} + +/* + * Bootup and lowres handler: ticks and update_process_times + */ +static void handle_tick_update(struct pt_regs *regs) +{ + BUG_ON(hrtimer_hres_active); + + write_seqlock(&xtime_lock); + do_timer(regs); + write_sequnlock(&xtime_lock); + + update_process_times(user_mode(regs)); +} + +/* + * Bootup and lowres handler: ticks and profileing + */ +static void handle_tick_profile(struct pt_regs *regs) +{ + BUG_ON(hrtimer_hres_active); + + write_seqlock(&xtime_lock); + do_timer(regs); + write_sequnlock(&xtime_lock); + + profile_tick(CPU_PROFILING, regs); +} + +/* + * Bootup and lowres handler: ticks, update_process_times and profiling + */ +static void handle_tick_update_profile(struct pt_regs *regs) +{ + BUG_ON(hrtimer_hres_active); + + write_seqlock(&xtime_lock); + do_timer(regs); + write_sequnlock(&xtime_lock); + + update_process_times(user_mode(regs)); + profile_tick(CPU_PROFILING, regs); +} + +/* + * Bootup and lowres handler: update_process_times + */ +static void handle_update(struct pt_regs *regs) +{ + BUG_ON(hrtimer_hres_active); + + update_process_times(user_mode(regs)); +} + +/* + * Bootup and lowres handler: update_process_times and profiling + */ +static void handle_update_profile(struct pt_regs *regs) +{ + BUG_ON(hrtimer_hres_active); + + update_process_times(user_mode(regs)); + profile_tick(CPU_PROFILING, regs); +} + +/* + * Bootup and lowres handler: profiling + */ +static void handle_profile(struct pt_regs 
*regs) +{ + BUG_ON(hrtimer_hres_active); + + profile_tick(CPU_PROFILING, regs); +} + +/* + * Noop handler when we shut down an event source + */ +static void handle_noop(struct pt_regs *regs) +{ +} + +/* + * Lookup table for bootup and lowres event assignment + */ +static void *event_handlers[] = { + handle_noop, /* 0: No capability selected */ + handle_tick, /* 1: Tick only */ + handle_update, /* 2: Update process times */ + handle_tick_update, /* 3: Tick + update process times */ + handle_profile, /* 4: Profiling int */ + handle_tick_profile, /* 5: Tick + Profiling int */ + handle_update_profile, /* 6: Update process times + + profiling */ + handle_tick_update_profile, /* 7: Tick + update process times + + profiling */ +#ifdef CONFIG_HIGH_RES_TIMERS + hrtimer_interrupt, /* 8: Reprogrammable event source */ +#endif +}; + +/* + * Setup an event source. Assign an handler and start it up + * When the event source has no own interrupt handler we setup + * the interrupt too. + */ +static int setup_event(struct event_descr *descr, struct clock_event *evt, + unsigned int caps, cpumask_t cpumask) +{ + void *handler = event_handlers[caps]; + int mode; + + /* Set the event handler */ + evt->event_handler = handler; + + /* Store all relevant information */ + descr->real_caps = caps; + + mode = caps == CLOCK_CAP_NEXTEVT ? + CLOCK_EVT_ONESHOT : CLOCK_EVT_PERIODIC; + + evt->set_mode(mode, evt); + + printk(KERN_INFO "Event source %s configured with caps set: " + "%02x\n", evt->name, descr->real_caps); + return 0; +} + +/** + * set_global_clockevent - Set the device which generates global clock events + * + * @evt: The device which generates global clock events (ticks) + * + * This can be a device which is only necessary for bootup. On UP systems this + * might be the only event source which is used for everything including + * high resolution events. 
+ * + * When a cpu local event source is installed the global event source is + * switched off in the high resolution timer / tickless mode. + */ +int __init setup_global_clockevent(struct clock_event *evt, cpumask_t cpumask) +{ + int ret; + + /* Already installed ? */ + if (global_eventsource.event) { + printk(KERN_ERR "Global clock event source already installed: " + "%s. Ignoring new global eventsoruce %s\n", + global_eventsource.event->name, + evt->name); + return -EBUSY; + } + + /* Preset the handler in any case */ + evt->event_handler = handle_noop; + + /* + * Check, whether it is a valid global event source + */ + if (!(evt->capabilities & CLOCK_BASE_CAPS_MASK)) { + printk(KERN_ERR "Unsupported event source %s\n", evt->name); + return -EINVAL; + } + + /* Mask out high resolution capabilities for now */ + ret = setup_event(&global_eventsource, evt, + evt->capabilities & CLOCK_BASE_CAPS_MASK, cpumask); + if (!ret) { + global_eventsource.cpumask = cpumask; + global_eventsource.event = evt; + } + + return ret; +} + +/* + * Mask out the functionality which is covered by the new event source + * and assign a new event handler. 
+ */ +static void recalc_active_event(struct event_descr *descr, + unsigned int newcaps) +{ + unsigned int caps; + + if (!descr->real_caps) + return; + + /* Mask the overlapping bits */ + caps = descr->real_caps & ~newcaps; + + /* Assign the new event handler */ + if (caps) { + descr->event->event_handler = event_handlers[caps]; + printk(KERN_INFO "Event source %s new caps set: %02x\n" , + descr->event->name, caps); + } else { + descr->event->event_handler = handle_noop; + + if (descr->event->set_mode) + descr->event->set_mode(CLOCK_EVT_SHUTDOWN, + descr->event); + + printk(KERN_INFO "Event source %s disabled\n" , + descr->event->name); + } + descr->real_caps = caps; +} + +/* + * Recalc the events and reassign the handlers if necessary + */ +static int recalc_events(struct local_events *sources, struct clock_event *evt, + cpumask_t cpumask, unsigned int caps) +{ + int i, ret = 0; + + if (sources->installed == MAX_CLOCK_EVENTS) + return -ENOSPC; + + /* + * If there is no handler and this is not a next event capable + * event source, refuse to handle it + */ + if (!evt->capabilities & CLOCK_CAP_NEXTEVT && !event_handlers[caps]) { + printk(KERN_ERR "Unsupported event source %s\n", evt->name); + return -EINVAL; + } + + if (caps && global_eventsource.event != evt) + recalc_active_event(&global_eventsource, caps); + + for (i = 0; i < sources->installed; i++) { + if (sources->events[i].event != evt) + recalc_active_event(&sources->events[i], caps); + } + + if (caps) { + /* Is next_event event source going to be installed ? 
*/ + if (caps & CLOCK_CAP_NEXTEVT) + caps = CLOCK_CAP_NEXTEVT; + + ret = setup_event(&sources->events[sources->installed], + evt, caps, cpumask); + } else + printk(KERN_INFO "Inactive event source %s registered\n", + evt->name); + + if (!ret) { + sources->events[sources->installed].cpumask = cpumask; + sources->events[sources->installed++].event = evt; + } + + return 0; +} + +/** + * setup_local_clockevent - Set up a cpu local clock event device + * + * @evt: event device to be registered + * @cpumask: cpumask for the irq setup + */ +int setup_local_clockevent(struct clock_event *evt, cpumask_t cpumask) +{ + struct local_events *sources = &__get_cpu_var(local_eventsources); + unsigned long flags; + int res; + + spin_lock_irqsave(&events_lock, flags); + + /* Preset the handler in any case */ + evt->event_handler = handle_noop; + + /* Recalc event sources and maybe reassign handlers */ + res = recalc_events(sources, evt, cpumask, + evt->capabilities & CLOCK_BASE_CAPS_MASK); + + spin_unlock_irqrestore(&events_lock, flags); + + /* + * Trigger hrtimers, when the event source is next event + * capable + */ + if (!res && (evt->capabilities & CLOCK_CAP_NEXTEVT)) + hrtimer_clock_notify(); + + return res; +} +EXPORT_SYMBOL(setup_local_clockevent); + + +/* + * Find a next event capable event source + */ +static int get_next_event_source(void) +{ + struct local_events *sources = &__get_cpu_var(local_eventsources); + int i; + + for (i = 0; i installed; i++) { + struct clock_event *evt; + + evt = sources->events[i].event; + if (evt->capabilities & CLOCK_CAP_NEXTEVT) + return i; + } + +#ifndef CONFIG_SMP + if (global_eventsource.event->capabilities & CLOCK_CAP_NEXTEVT) + return GLOBAL_CLOCK_EVENT; +#endif + return -ENODEV; +} + +/** + * clockevents_next_event_available - Check for a installed next event source + */ +int clockevents_next_event_available(void) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&events_lock, flags); + ret = get_next_event_source(); + 
spin_unlock_irqrestore(&events_lock, flags); + return ret < 0 ? 0 : 1; +} + +int clockevents_init_next_event(void) +{ + struct local_events *sources = &__get_cpu_var(local_eventsources); + unsigned long flags; + int idx, ret = -ENODEV; + cpumask_t cpumask; + + if (sources->nextevt) + return -EBUSY; + + spin_lock_irqsave(&events_lock, flags); + + idx = get_next_event_source(); + if (idx < 0) + goto out; + + if (idx == GLOBAL_CLOCK_EVENT) { + sources->nextevt = global_eventsource.event; + cpumask = global_eventsource.cpumask; + } else { + sources->nextevt = sources->events[idx].event; + cpumask = sources->events[idx].cpumask; + } + + ret = recalc_events(sources, sources->nextevt, cpumask, + CLOCK_CAPS_MASK); + out: + spin_unlock_irqrestore(&events_lock, flags); + return ret; +} + +int clockevents_set_next_event(ktime_t expires, int force) +{ + struct local_events *sources = &__get_cpu_var(local_eventsources); + ktime_t now = ktime_get(); + int64_t delta = ktime_to_ns(ktime_sub(expires, now)); + unsigned long long clc; + + if (delta <= 0 && !force) + return -ETIME; + if (delta > sources->nextevt->max_delta_ns) + delta = sources->nextevt->max_delta_ns; + if (delta < sources->nextevt->min_delta_ns) + delta = sources->nextevt->min_delta_ns; + + clc = delta * sources->nextevt->mult; + clc >>= sources->nextevt->shift; + sources->nextevt->set_next_event((unsigned long) clc, + sources->nextevt); + return 0; +} + +void clockevents_stop_next_event(void) +{ + /* Implement me ! */ +} + +/* + * Functions related to initialization and hotplug + */ +static int clockevents_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + switch(action) { + case CPU_UP_PREPARE: + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_DEAD: + /* + * Do something sensible here ! 
+ * Disable the cpu local clocksources + */ + break; +#endif + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block __devinitdata clockevents_nb = { + .notifier_call = clockevents_cpu_notify, +}; + +void __init init_clockevents(void) +{ + clockevents_cpu_notify(&clockevents_nb, (unsigned long)CPU_UP_PREPARE, + (void *)(long)smp_processor_id()); + register_cpu_notifier(&clockevents_nb); +} Index: linux/kernel/time/clocksource.c =================================================================== --- /dev/null +++ linux/kernel/time/clocksource.c @@ -0,0 +1,349 @@ +/* + * linux/kernel/time/clocksource.c + * + * This file contains the functions which manage clocksource drivers. + * + * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * TODO WishList: + * o Allow clocksource drivers to be unregistered + * o get rid of clocksource_jiffies extern + */ + +#include +#include +#include +#include + +/* XXX - Would like a better way for initializing curr_clocksource */ +extern struct clocksource clocksource_jiffies; + +/*[Clocksource internal variables]--------- + * curr_clocksource: + * currently selected clocksource. Initialized to clocksource_jiffies. + * next_clocksource: + * pending next selected clocksource. 
+ * clocksource_list: + * linked list with the registered clocksources + * clocksource_lock: + * protects manipulations to curr_clocksource and next_clocksource + * and the clocksource_list + * override_name: + * Name of the user-specified clocksource. + */ +static struct clocksource *curr_clocksource = &clocksource_jiffies; +static struct clocksource *next_clocksource; +static LIST_HEAD(clocksource_list); +static DEFINE_SPINLOCK(clocksource_lock); +static char override_name[32]; +static int finished_booting; + +/* clocksource_done_booting - Called near the end of bootup + * + * Hack to avoid lots of clocksource churn at boot time + */ +static int __init clocksource_done_booting(void) +{ + finished_booting = 1; + return 0; +} + +late_initcall(clocksource_done_booting); + +/** + * clocksource_get_next - Returns the selected clocksource + * + */ +struct clocksource *clocksource_get_next(void) +{ + unsigned long flags; + + spin_lock_irqsave(&clocksource_lock, flags); + if (next_clocksource && finished_booting) { + curr_clocksource = next_clocksource; + next_clocksource = NULL; + } + spin_unlock_irqrestore(&clocksource_lock, flags); + + return curr_clocksource; +} + +/** + * select_clocksource - Finds the best registered clocksource. + * + * Private function. Must hold clocksource_lock when called. + * + * Looks through the list of registered clocksources, returning + * the one with the highest rating value. If there is a clocksource + * name that matches the override string, it returns that clocksource. 
+ */ +static struct clocksource *select_clocksource(void) +{ + struct clocksource *best = NULL; + struct list_head *tmp; + + list_for_each(tmp, &clocksource_list) { + struct clocksource *src; + + src = list_entry(tmp, struct clocksource, list); + if (!best) + best = src; + + /* check for override: */ + if (strlen(src->name) == strlen(override_name) && + !strcmp(src->name, override_name)) { + best = src; + break; + } + /* pick the highest rating: */ + if (src->rating > best->rating) + best = src; + } + + return best; +} + +/** + * is_registered_source - Checks if clocksource is registered + * @c: pointer to a clocksource + * + * Private helper function. Must hold clocksource_lock when called. + * + * Returns one if the clocksource is already registered, zero otherwise. + */ +static int is_registered_source(struct clocksource *c) +{ + int len = strlen(c->name); + struct list_head *tmp; + + list_for_each(tmp, &clocksource_list) { + struct clocksource *src; + + src = list_entry(tmp, struct clocksource, list); + if (strlen(src->name) == len && !strcmp(src->name, c->name)) + return 1; + } + + return 0; +} + +/** + * clocksource_register - Used to install new clocksources + * @t: clocksource to be registered + * + * Returns -EBUSY if registration fails, zero otherwise. + */ +int clocksource_register(struct clocksource *c) +{ + int ret = 0; + unsigned long flags; + + spin_lock_irqsave(&clocksource_lock, flags); + /* check if clocksource is already registered */ + if (is_registered_source(c)) { + printk("register_clocksource: Cannot register %s. 
" + "Already registered!", c->name); + ret = -EBUSY; + } else { + /* register it */ + list_add(&c->list, &clocksource_list); + /* scan the registered clocksources, and pick the best one */ + next_clocksource = select_clocksource(); + } + spin_unlock_irqrestore(&clocksource_lock, flags); + return ret; +} +EXPORT_SYMBOL(clocksource_register); + +/** + * clocksource_reselect - Rescan list for next clocksource + * + * A quick helper function to be used if a clocksource changes its + * rating. Forces the clocksource list to be re-scanned for the best + * clocksource. + */ +void clocksource_reselect(void) +{ + unsigned long flags; + + spin_lock_irqsave(&clocksource_lock, flags); + next_clocksource = select_clocksource(); + spin_unlock_irqrestore(&clocksource_lock, flags); +} +EXPORT_SYMBOL(clocksource_reselect); + +/** + * sysfs_show_current_clocksources - sysfs interface for current clocksource + * @dev: unused + * @buf: char buffer to be filled with clocksource list + * + * Provides sysfs interface for listing current clocksource. + */ +static ssize_t +sysfs_show_current_clocksources(struct sys_device *dev, char *buf) +{ + char *curr = buf; + + spin_lock_irq(&clocksource_lock); + curr += sprintf(curr, "%s ", curr_clocksource->name); + spin_unlock_irq(&clocksource_lock); + + curr += sprintf(curr, "\n"); + + return curr - buf; +} + +/** + * sysfs_override_clocksource - interface for manually overriding clocksource + * @dev: unused + * @buf: name of override clocksource + * @count: length of buffer + * + * Takes input from sysfs interface for manually overriding the default + * clocksource selction. + */ +static ssize_t sysfs_override_clocksource(struct sys_device *dev, + const char *buf, size_t count) +{ + size_t ret = count; + /* strings from sysfs write are not 0 terminated! 
*/ + if (count >= sizeof(override_name)) + return -EINVAL; + + /* strip of \n: */ + if (buf[count-1] == '\n') + count--; + if (count < 1) + return -EINVAL; + + spin_lock_irq(&clocksource_lock); + + /* copy the name given: */ + memcpy(override_name, buf, count); + override_name[count] = 0; + + /* try to select it: */ + next_clocksource = select_clocksource(); + + spin_unlock_irq(&clocksource_lock); + + return ret; +} + +/** + * sysfs_show_available_clocksources - sysfs interface for listing clocksource + * @dev: unused + * @buf: char buffer to be filled with clocksource list + * + * Provides sysfs interface for listing registered clocksources + */ +static ssize_t +sysfs_show_available_clocksources(struct sys_device *dev, char *buf) +{ + struct list_head *tmp; + char *curr = buf; + + spin_lock_irq(&clocksource_lock); + list_for_each(tmp, &clocksource_list) { + struct clocksource *src; + + src = list_entry(tmp, struct clocksource, list); + curr += sprintf(curr, "%s ", src->name); + } + spin_unlock_irq(&clocksource_lock); + + curr += sprintf(curr, "\n"); + + return curr - buf; +} + +/* + * Sysfs setup bits: + */ +static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources, + sysfs_override_clocksource); + +static SYSDEV_ATTR(available_clocksource, 0600, + sysfs_show_available_clocksources, NULL); + +static struct sysdev_class clocksource_sysclass = { + set_kset_name("clocksource"), +}; + +static struct sys_device device_clocksource = { + .id = 0, + .cls = &clocksource_sysclass, +}; + +static int __init init_clocksource_sysfs(void) +{ + int error = sysdev_class_register(&clocksource_sysclass); + + if (!error) + error = sysdev_register(&device_clocksource); + if (!error) + error = sysdev_create_file( + &device_clocksource, + &attr_current_clocksource); + if (!error) + error = sysdev_create_file( + &device_clocksource, + &attr_available_clocksource); + return error; +} + +device_initcall(init_clocksource_sysfs); + +/** + * boot_override_clocksource - 
boot clock override + * @str: override name + * + * Takes a clocksource= boot argument and uses it + * as the clocksource override name. + */ +static int __init boot_override_clocksource(char* str) +{ + unsigned long flags; + spin_lock_irqsave(&clocksource_lock, flags); + if (str) + strlcpy(override_name, str, sizeof(override_name)); + spin_unlock_irqrestore(&clocksource_lock, flags); + return 1; +} + +__setup("clocksource=", boot_override_clocksource); + +/** + * boot_override_clock - Compatibility layer for deprecated boot option + * @str: override name + * + * DEPRECATED! Takes a clock= boot argument and uses it + * as the clocksource override name + */ +static int __init boot_override_clock(char* str) +{ + if (!strcmp(str, "pmtmr")) { + printk("Warning: clock=pmtmr is deprecated. " + "Use clocksource=acpi_pm.\n"); + return boot_override_clocksource("acpi_pm"); + } + printk("Warning! clock= boot option is deprecated. " + "Use clocksource=xyz\n"); + return boot_override_clocksource(str); +} + +__setup("clock=", boot_override_clock); Index: linux/kernel/time/jiffies.c =================================================================== --- /dev/null +++ linux/kernel/time/jiffies.c @@ -0,0 +1,73 @@ +/*********************************************************************** +* linux/kernel/time/jiffies.c +* +* This file contains the jiffies based clocksource. +* +* Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com) +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation; either version 2 of the License, or +* (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. 
+* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software +* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +* +************************************************************************/ +#include +#include +#include + +/* The Jiffies based clocksource is the lowest common + * denominator clock source which should function on + * all systems. It has the same coarse resolution as + * the timer interrupt frequency HZ and it suffers + * inaccuracies caused by missed or lost timer + * interrupts and the inability for the timer + * interrupt hardware to accurately tick at the + * requested HZ value. It is also not recommended + * for "tick-less" systems. + */ +#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ)) + +/* Since jiffies uses a simple NSEC_PER_JIFFY multiplier + * conversion, the .shift value could be zero. However + * this would make NTP adjustments impossible as they are + * in units of 1/2^.shift. Thus we use JIFFIES_SHIFT to + * shift both the numerator and denominator the same + * amount, and give ntp adjustments in units of 1/2^8 + * + * The value 8 is somewhat carefully chosen, as anything + * larger can result in overflows. NSEC_PER_JIFFY grows as + * HZ shrinks, so values greater than 8 overflow 32bits when + * HZ=100.
+ */ +#define JIFFIES_SHIFT 8 + +static cycle_t jiffies_read(void) +{ + return (cycle_t) jiffies; +} + +struct clocksource clocksource_jiffies = { + .name = "jiffies", + .rating = 0, /* lowest rating*/ + .read = jiffies_read, + .mask = 0xffffffff, /*32bits*/ + .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ + .shift = JIFFIES_SHIFT, + .is_continuous = 0, /* tick based, not free running */ +}; + +static int __init init_jiffies_clocksource(void) +{ + return clocksource_register(&clocksource_jiffies); +} + +module_init(init_jiffies_clocksource); Index: linux/kernel/timer.c =================================================================== --- linux.orig/kernel/timer.c +++ linux/kernel/timer.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -69,6 +70,8 @@ typedef struct tvec_root_s { struct list_head vec[TVR_SIZE]; } tvec_root_t; +unsigned int __read_mostly timeout_granularity = 1; + struct tvec_t_base_s { spinlock_t lock; struct timer_list *running_timer; @@ -89,9 +92,7 @@ static DEFINE_PER_CPU(tvec_base_t *, tve static inline void set_running_timer(tvec_base_t *base, struct timer_list *timer) { -#ifdef CONFIG_SMP base->running_timer = timer; -#endif } static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) @@ -136,6 +137,17 @@ static void internal_add_timer(tvec_base list_add_tail(&timer->entry, vec); } +#ifdef CONFIG_TIMER_INFO +void __timer_set_start_info(struct timer_list *timer, void *addr) +{ + if (timer->start_site) + return; + + timer->start_site = addr; + memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); + timer->start_pid = current->pid; +} +#endif /*** * init_timer - initialize a timer. 
* @timer: the timer to be initialized @@ -146,12 +158,17 @@ static void internal_add_timer(tvec_base void fastcall init_timer(struct timer_list *timer) { timer->entry.next = NULL; - timer->base = per_cpu(tvec_bases, raw_smp_processor_id()); + timer->base = __raw_get_cpu_var(tvec_bases); +#ifdef CONFIG_TIMER_INFO + timer->start_site = NULL; + timer->start_pid = -1; + memset(timer->start_comm, 0, TASK_COMM_LEN); +#endif } EXPORT_SYMBOL(init_timer); -static inline void detach_timer(struct timer_list *timer, - int clear_pending) +static inline void +detach_timer(tvec_base_t *base, struct timer_list *timer, int clear_pending) { struct list_head *entry = &timer->entry; @@ -197,12 +214,13 @@ int __mod_timer(struct timer_list *timer unsigned long flags; int ret = 0; + timer_set_start_info(timer); BUG_ON(!timer->function); base = lock_timer_base(timer, &flags); if (timer_pending(timer)) { - detach_timer(timer, 0); + detach_timer(base, timer, 0); ret = 1; } @@ -247,6 +265,7 @@ void add_timer_on(struct timer_list *tim tvec_base_t *base = per_cpu(tvec_bases, cpu); unsigned long flags; + timer_set_start_info(timer); BUG_ON(timer_pending(timer) || !timer->function); spin_lock_irqsave(&base->lock, flags); timer->base = base; @@ -278,6 +297,7 @@ int mod_timer(struct timer_list *timer, { BUG_ON(!timer->function); + timer_set_start_info(timer); /* * This is a common optimization triggered by the * networking code - if the timer is re-modified @@ -308,10 +328,11 @@ int del_timer(struct timer_list *timer) unsigned long flags; int ret = 0; + timer_clear_start_info(timer); if (timer_pending(timer)) { base = lock_timer_base(timer, &flags); if (timer_pending(timer)) { - detach_timer(timer, 1); + detach_timer(base, timer, 1); ret = 1; } spin_unlock_irqrestore(&base->lock, flags); @@ -342,7 +363,7 @@ int try_to_del_timer_sync(struct timer_l ret = 0; if (timer_pending(timer)) { - detach_timer(timer, 1); + detach_timer(base, timer, 1); ret = 1; } out: @@ -417,8 +438,7 @@ static inline void 
__run_timers(tvec_bas { struct timer_list *timer; - spin_lock_irq(&base->lock); - while (time_after_eq(jiffies, base->timer_jiffies)) { + while (time_before_eq(base->timer_jiffies, jiffies)) { struct list_head work_list = LIST_HEAD_INIT(work_list); struct list_head *head = &work_list; int index = base->timer_jiffies & TVR_MASK; @@ -441,8 +461,10 @@ static inline void __run_timers(tvec_bas fn = timer->function; data = timer->data; + account_timer(timer); + set_running_timer(base, timer); - detach_timer(timer, 1); + detach_timer(base, timer, 1); spin_unlock_irq(&base->lock); { int preempt_count = preempt_count(); @@ -460,7 +482,6 @@ static inline void __run_timers(tvec_bas } } set_running_timer(base, NULL); - spin_unlock_irq(&base->lock); } #ifdef CONFIG_NO_IDLE_HZ @@ -469,29 +490,28 @@ static inline void __run_timers(tvec_bas * is used on S/390 to stop all activity when a cpus is idle. * This functions needs to be called disabled. */ -unsigned long next_timer_interrupt(void) +unsigned long __next_timer_interrupt(tvec_base_t *base, unsigned long now) { - tvec_base_t *base; struct list_head *list; - struct timer_list *nte; + struct timer_list *nte, *found = NULL; unsigned long expires; - unsigned long hr_expires = MAX_JIFFY_OFFSET; - ktime_t hr_delta; tvec_t *varray[4]; int i, j; - hr_delta = hrtimer_get_next_event(); +#ifndef CONFIG_NO_HZ + unsigned long hr_expires = MAX_JIFFY_OFFSET; + ktime_t hr_delta = hrtimer_get_next_event(); + if (hr_delta.tv64 != KTIME_MAX) { struct timespec tsdelta; tsdelta = ktime_to_timespec(hr_delta); hr_expires = timespec_to_jiffies(&tsdelta); if (hr_expires < 3) - return hr_expires + jiffies; + return hr_expires + now; } - hr_expires += jiffies; + hr_expires += now; +#endif - base = __get_cpu_var(tvec_bases); - spin_lock(&base->lock); expires = base->timer_jiffies + (LONG_MAX >> 1); list = NULL; @@ -500,6 +520,7 @@ unsigned long next_timer_interrupt(void) do { list_for_each_entry(nte, base->tv1.vec + j, entry) { expires = nte->expires; + 
found = nte; if (j < (base->timer_jiffies & TVR_MASK)) list = base->tv2.vec + (INDEX(0)); goto found; @@ -519,9 +540,12 @@ unsigned long next_timer_interrupt(void) j = (j + 1) & TVN_MASK; continue; } - list_for_each_entry(nte, varray[i]->vec + j, entry) - if (time_before(nte->expires, expires)) + list_for_each_entry(nte, varray[i]->vec + j, entry) { + if (time_before(nte->expires, expires)) { expires = nte->expires; + found = nte; + } + } if (j < (INDEX(i)) && i < 3) list = varray[i + 1]->vec + (INDEX(i + 1)); goto found; @@ -535,12 +559,15 @@ found: * where we found the timer element. */ list_for_each_entry(nte, list, entry) { - if (time_before(nte->expires, expires)) + if (time_before(nte->expires, expires)) { expires = nte->expires; + found = nte; + } } } - spin_unlock(&base->lock); + WARN_ON(!found); +#ifndef CONFIG_NO_HZ /* * It can happen that other CPUs service timer IRQs and increment * jiffies, but we have not yet got a local timer tick to process @@ -554,14 +581,44 @@ found: * would falsely evaluate to true. 
If that is the case, just * return jiffies so that we can immediately fire the local timer */ - if (time_before(expires, jiffies)) - return jiffies; + if (time_before(expires, now)) + expires = now; + else if (time_before(hr_expires, expires)) + expires = hr_expires; +#endif + /* + * 'Timer wheel time' can lag behind 'jiffies time' due to + * delayed processing, so make sure we return a value that + * makes sense externally: + */ + expires -= (now - base->timer_jiffies); - if (time_before(hr_expires, expires)) - return hr_expires; + /* + * Round it up per timeout_granularity: + */ + expires += timeout_granularity - 1; + expires -= expires % timeout_granularity; return expires; } + +unsigned long get_next_timer_interrupt(unsigned long now) +{ + tvec_base_t *base = __get_cpu_var(tvec_bases); + unsigned long expires; + + spin_lock(&base->lock); + expires = __next_timer_interrupt(base, now); + spin_unlock(&base->lock); + + return expires; +} + +unsigned long next_timer_interrupt(void) +{ + return get_next_timer_interrupt(jiffies); +} + #endif /******************************************************************/ @@ -601,7 +658,6 @@ long time_tolerance = MAXFREQ; /* frequ long time_precision = 1; /* clock precision (us) */ long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ -static long time_phase; /* phase offset (scaled us) */ long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC; /* frequency offset (scaled ppm)*/ static long time_adj; /* tick adjust (scaled 1 / HZ) */ @@ -751,27 +807,14 @@ static long adjtime_adjustment(void) } /* in the NTP reference this is called "hardclock()" */ -static void update_wall_time_one_tick(void) +static void update_ntp_one_tick(void) { - long time_adjust_step, delta_nsec; + long time_adjust_step; time_adjust_step = adjtime_adjustment(); if (time_adjust_step) /* Reduce by this step the amount of time left */ time_adjust -= 
time_adjust_step; - delta_nsec = tick_nsec + time_adjust_step * 1000; - /* - * Advance the phase, once it gets to one microsecond, then - * advance the tick more. - */ - time_phase += time_adj; - if ((time_phase >= FINENSEC) || (time_phase <= -FINENSEC)) { - long ltemp = shift_right(time_phase, (SHIFT_SCALE - 10)); - time_phase -= ltemp << (SHIFT_SCALE - 10); - delta_nsec += ltemp; - } - xtime.tv_nsec += delta_nsec; - time_interpolator_update(delta_nsec); /* Changes by adjtime() do not take effect till next tick. */ if (time_next_adjust != 0) { @@ -784,36 +827,319 @@ static void update_wall_time_one_tick(vo * Return how long ticks are at the moment, that is, how much time * update_wall_time_one_tick will add to xtime next time we call it * (assuming no calls to do_adjtimex in the meantime). - * The return value is in fixed-point nanoseconds with SHIFT_SCALE-10 - * bits to the right of the binary point. + * The return value is in fixed-point nanoseconds shifted by the + * specified number of bits to the right of the binary point. * This function has no side-effects. */ -u64 current_tick_length(void) +u64 current_tick_length(long shift) { long delta_nsec; + u64 ret; + /* calculate the finest interval NTP will allow. 
+ * ie: nanosecond value shifted by (SHIFT_SCALE - 10) + */ delta_nsec = tick_nsec + adjtime_adjustment() * 1000; - return ((u64) delta_nsec << (SHIFT_SCALE - 10)) + time_adj; + ret = ((u64) delta_nsec << (SHIFT_SCALE - 10)) + time_adj; + + /* convert from (SHIFT_SCALE - 10) to specified shift scale: */ + shift = shift - (SHIFT_SCALE - 10); + if (shift < 0) + ret >>= -shift; + else + ret <<= shift; + + return ret; } -/* - * Using a loop looks inefficient, but "ticks" is - * usually just one (we shouldn't be losing ticks, - * we're doing this this way mainly for interrupt - * latency reasons, not because we think we'll - * have lots of lost timer ticks +/* XXX - all of this timekeeping code should be later moved to time.c */ +#include +static struct clocksource *clock; /* pointer to current clocksource */ +static cycle_t last_clock_cycle; /* cycle value at last update_wall_time */ + +#ifdef CONFIG_GENERIC_TIME +/** + * __get_nsec_offset - Returns nanoseconds since last call to update_wall_time() + * + * private function, must hold xtime_lock when being + * called. Returns the number of nanoseconds since the + * last call to update_wall_time() (adjusted by NTP scaling) */ -static void update_wall_time(unsigned long ticks) +static inline s64 __get_nsec_offset(void) { + cycle_t cycle_now, cycle_delta; + s64 ns_offset; + + /* read clocksource: */ + cycle_now = clocksource_read(clock); + + /* calculate the delta since the last update_wall_time: */ + cycle_delta = (cycle_now - last_clock_cycle) & clock->mask; + + /* convert to nanoseconds: */ + ns_offset = cyc2ns(clock, cycle_delta); + + return ns_offset; +} + +/** + * __get_realtime_clock_ts - Returns the time of day in a timespec + * @ts: pointer to the timespec to be set + * + * Returns the time of day in a timespec. Used by + * do_gettimeofday() and getnstimeofday().
+ */ +static inline void __get_realtime_clock_ts(struct timespec *ts) +{ + unsigned long seq; + s64 nsecs; + do { - ticks--; - update_wall_time_one_tick(); - if (xtime.tv_nsec >= 1000000000) { - xtime.tv_nsec -= 1000000000; + seq = read_seqbegin(&xtime_lock); + + *ts = xtime; + nsecs = __get_nsec_offset(); + + } while (read_seqretry(&xtime_lock, seq)); + + timespec_add_ns(ts, nsecs); +} + +/** + * getnstimeofday - Returns the time of day in a timespec + * @ts: pointer to the timespec to be set + * + * Returns the time of day in a timespec. + */ +void getnstimeofday(struct timespec *ts) +{ + __get_realtime_clock_ts(ts); +} + +EXPORT_SYMBOL(getnstimeofday); + +/** + * do_gettimeofday - Returns the time of day in a timeval + * @tv: pointer to the timeval to be set + * + * NOTE: Users should be converted to using get_realtime_clock_ts() + */ +void do_gettimeofday(struct timeval *tv) +{ + struct timespec now; + + __get_realtime_clock_ts(&now); + tv->tv_sec = now.tv_sec; + tv->tv_usec = now.tv_nsec/1000; +} + +EXPORT_SYMBOL(do_gettimeofday); +/** + * do_settimeofday - Sets the time of day + * @tv: pointer to the timespec variable containing the new time + * + * Sets the time of day to the new time and update NTP and notify hrtimers + */ +int do_settimeofday(struct timespec *tv) +{ + unsigned long flags; + time_t wtm_sec, sec = tv->tv_sec; + long wtm_nsec, nsec = tv->tv_nsec; + + if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) + return -EINVAL; + + write_seqlock_irqsave(&xtime_lock, flags); + + nsec -= __get_nsec_offset(); + + wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); + wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); + + set_normalized_timespec(&xtime, sec, nsec); + set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); + + ntp_clear(); + + write_sequnlock_irqrestore(&xtime_lock, flags); + + /* signal hrtimers about time change */ + clock_was_set(); + + return 0; +} + +EXPORT_SYMBOL(do_settimeofday); + +/** + * change_clocksource - 
Swaps clocksources if a new one is available + * + * Accumulates current time interval and initializes new clocksource + */ +static int change_clocksource(void) +{ + struct clocksource *new; + cycle_t now; + u64 nsec; + new = clocksource_get_next(); + if (clock != new) { + now = clocksource_read(new); + nsec = __get_nsec_offset(); + timespec_add_ns(&xtime, nsec); + + clock = new; + last_clock_cycle = now; + printk(KERN_INFO "Time: %s clocksource has been installed.\n", + clock->name); + return 1; + } else if (clock->update_callback) { + return clock->update_callback(); + } + return 0; +} +#else +#define change_clocksource() (0) +#endif + +/** + * timekeeping_is_continuous - check to see if timekeeping is free running + */ +int timekeeping_is_continuous(void) +{ + unsigned long seq; + int ret; + + do { + seq = read_seqbegin(&xtime_lock); + + ret = clock->is_continuous; + + } while (read_seqretry(&xtime_lock, seq)); + + return ret; +} + +/* + * timekeeping_init - Initializes the clocksource and common timekeeping values + */ +void __init timekeeping_init(void) +{ + unsigned long flags; + + write_seqlock_irqsave(&xtime_lock, flags); + clock = clocksource_get_next(); + clocksource_calculate_interval(clock, tick_nsec); + last_clock_cycle = clocksource_read(clock); + ntp_clear(); + write_sequnlock_irqrestore(&xtime_lock, flags); +} + + +/* + * timekeeping_resume - Resumes the generic timekeeping subsystem. + * @dev: unused + * + * This is for the generic clocksource timekeeping. + * xtime/wall_to_monotonic/jiffies/wall_jiffies/etc are + * still managed by arch specific suspend/resume code.
+ */ +static int timekeeping_resume(struct sys_device *dev) +{ + unsigned long flags; + + write_seqlock_irqsave(&xtime_lock, flags); + /* restart the last cycle value */ + last_clock_cycle = clocksource_read(clock); + write_sequnlock_irqrestore(&xtime_lock, flags); + return 0; +} + +/* sysfs resume/suspend bits for timekeeping */ +static struct sysdev_class timekeeping_sysclass = { + .resume = timekeeping_resume, + set_kset_name("timekeeping"), +}; + +static struct sys_device device_timer = { + .id = 0, + .cls = &timekeeping_sysclass, +}; + +static int __init timekeeping_init_device(void) +{ + int error = sysdev_class_register(&timekeeping_sysclass); + if (!error) + error = sysdev_register(&device_timer); + return error; +} + +device_initcall(timekeeping_init_device); + +#ifdef CONFIG_VSYSCALL_GTOD +extern void update_vsyscall(struct clocksource* clock, cycle_t base); +#else +#define update_vsyscall(clock, base) +#endif + +/* + * update_wall_time - Uses the current clocksource to increment the wall time + * + * Called from the timer interrupt, must hold a write on xtime_lock. + */ +static void update_wall_time(void) +{ + static s64 remainder_snsecs, error; + s64 snsecs_per_sec; + cycle_t now, offset; + + snsecs_per_sec = (s64)NSEC_PER_SEC << clock->shift; + remainder_snsecs += (s64)xtime.tv_nsec << clock->shift; + + now = clocksource_read(clock); + offset = (now - last_clock_cycle)&clock->mask; + + /* normally this loop will run just once, however in the + * case of lost or late ticks, it will accumulate correctly. 
+ */ + while (offset > clock->interval_cycles) { + /* get the ntp interval in clock shifted nanoseconds */ + s64 ntp_snsecs = current_tick_length(clock->shift); + + /* accumulate one interval */ + remainder_snsecs += clock->interval_snsecs; + last_clock_cycle += clock->interval_cycles; + offset -= clock->interval_cycles; + + /* interpolator bits */ + time_interpolator_update(clock->interval_snsecs + >> clock->shift); + /* increment the NTP state machine */ + update_ntp_one_tick(); + + /* accumulate error between NTP and clock interval */ + error += (ntp_snsecs - (s64)clock->interval_snsecs); + + /* correct the clock when NTP error is too big */ + remainder_snsecs += make_ntp_adj(clock, offset, &error); + + if (remainder_snsecs >= snsecs_per_sec) { + remainder_snsecs -= snsecs_per_sec; xtime.tv_sec++; second_overflow(); } - } while (ticks); + } + /* store full nanoseconds into xtime */ + xtime.tv_nsec = remainder_snsecs >> clock->shift; + remainder_snsecs -= (s64)xtime.tv_nsec << clock->shift; + + /* check to see if there is a new clocksource to use */ + if (change_clocksource()) { + error = 0; + remainder_snsecs = 0; + hrtimer_clock_notify(); + clocksource_calculate_interval(clock, tick_nsec); + } + update_vsyscall(clock, last_clock_cycle); } /* @@ -896,9 +1222,12 @@ static void run_timer_softirq(struct sof { tvec_base_t *base = __get_cpu_var(tvec_bases); - hrtimer_run_queues(); + hrtimer_run_queues(); + + spin_lock_irq(&base->lock); if (time_after_eq(jiffies, base->timer_jiffies)) __run_timers(base); + spin_unlock_irq(&base->lock); } /* @@ -906,7 +1235,13 @@ static void run_timer_softirq(struct sof */ void run_local_timers(void) { - raise_softirq(TIMER_SOFTIRQ); + tvec_base_t *base = per_cpu(tvec_bases, smp_processor_id()); + /* + * Only wake up the TIMER_SOFTIRQ every timeout_granularity + * jiffies: + */ + if (time_before_eq(base->timer_jiffies + timeout_granularity, jiffies)) + raise_softirq(TIMER_SOFTIRQ); softlockup_tick(); } @@ -919,10 +1254,8 @@ static 
inline void update_times(void) unsigned long ticks; ticks = jiffies - wall_jiffies; - if (ticks) { - wall_jiffies += ticks; - update_wall_time(ticks); - } + wall_jiffies += ticks; + update_wall_time(); calc_load(ticks); } @@ -1286,13 +1619,14 @@ static int __devinit init_timers_cpu(int } #ifdef CONFIG_HOTPLUG_CPU -static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head) +static void migrate_timer_list(tvec_base_t *old_base, tvec_base_t *new_base, + struct list_head *head) { struct timer_list *timer; while (!list_empty(head)) { timer = list_entry(head->next, struct timer_list, entry); - detach_timer(timer, 0); + detach_timer(old_base, timer, 0); timer->base = new_base; internal_add_timer(new_base, timer); } @@ -1315,12 +1649,12 @@ static void __devinit migrate_timers(int BUG_ON(old_base->running_timer); for (i = 0; i < TVR_SIZE; i++) - migrate_timer_list(new_base, old_base->tv1.vec + i); + migrate_timer_list(old_base, new_base, old_base->tv1.vec + i); for (i = 0; i < TVN_SIZE; i++) { - migrate_timer_list(new_base, old_base->tv2.vec + i); - migrate_timer_list(new_base, old_base->tv3.vec + i); - migrate_timer_list(new_base, old_base->tv4.vec + i); - migrate_timer_list(new_base, old_base->tv5.vec + i); + migrate_timer_list(old_base, new_base, old_base->tv2.vec + i); + migrate_timer_list(old_base, new_base, old_base->tv3.vec + i); + migrate_timer_list(old_base, new_base, old_base->tv4.vec + i); + migrate_timer_list(old_base, new_base, old_base->tv5.vec + i); } spin_unlock(&old_base->lock); Index: linux/kernel/timer_top.c =================================================================== --- /dev/null +++ linux/kernel/timer_top.c @@ -0,0 +1,259 @@ +/* + * kernel/timer_top.c + * + * Export Timers information to /proc/timer_info + * + * Copyright (C) 2005 Instituto Nokia de Tecnologia - INdT - Manaus + * Written by Daniel Petrini + * + * This utility should be used to get information from the system timers + * and maybe optimize the system once you 
know which timers are being used + * and the process which starts them. + * This is particular useful above dynamic tick implementation. One can + * see who is starting timers and make the HZ value increase. + * + * We export the addresses and counting of timer functions being called, + * the pid and cmdline from the owner process if applicable. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#define VERSION "Timer Top v0.9.9" + +struct timer_top_info { + void *start_func; + void *expire_func; + unsigned long counter; + pid_t pid; + char comm[TASK_COMM_LEN + 1]; + struct list_head list; +}; + +struct timer_top_root { + spinlock_t lock; + struct list_head list; + kmem_cache_t *cache; + int record; /* if currently collecting data */ + unsigned long start_jiffies; +}; + +static struct timer_top_root top_root = { + .lock = SPIN_LOCK_UNLOCKED, + .list = LIST_HEAD_INIT(top_root.list), +}; + +static struct list_head *timer_list = &top_root.list; +static spinlock_t *top_lock = &top_root.lock; + +static inline int update_top_info(struct timer_list *timer) +{ + struct timer_top_info *top; + + list_for_each_entry(top, timer_list, list) { + /* if it is in the list increment its count */ + if (top->start_func == timer->start_site && + top->expire_func == timer->function && + top->pid == timer->start_pid) { + top->counter++; + return 1; + } + } + + return 0; +} + +int account_timer(struct timer_list *timer) +{ + pid_t pid_info = timer->start_pid; + struct timer_top_info *top; + unsigned long flags; + + if (!top_root.record) + goto out; + + spin_lock_irqsave(top_lock, flags); + + if (update_top_info(timer)) + goto out_unlock; + + /* Function not found so insert it in the list */ + top = kmem_cache_alloc(top_root.cache, GFP_ATOMIC); + if 
(unlikely(!top)) + goto out_unlock; + + top->start_func = timer->start_site; + top->expire_func = timer->function; + timer->start_site = NULL; + top->counter = 1; + top->pid = pid_info; + memcpy(top->comm, timer->start_comm, TASK_COMM_LEN); + top->comm[TASK_COMM_LEN] = 0; + list_add(&top->list, timer_list); + +out_unlock: + spin_unlock_irqrestore(top_lock, flags); +out: + return 0; +} + +/* + * Must hold top_lock + */ +static void timer_list_del(void) +{ + struct list_head *aux1, *aux2; + struct timer_top_info *entry; + + list_for_each_safe(aux1, aux2, timer_list) { + entry = list_entry(aux1, struct timer_top_info, list); + list_del(aux1); + kmem_cache_free(top_root.cache, entry); + } + top_root.start_jiffies = jiffies; +} + +/* PROC_FS_SECTION */ + +static struct proc_dir_entry *top_info_file; +static struct proc_dir_entry *top_info_file_out; + +static void print_name_offset(struct seq_file *m, unsigned long addr) +{ + char namebuf[KSYM_NAME_LEN+1]; + unsigned long size, offset; + const char *sym_name; + char *modname; + + sym_name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf); + if (sym_name) + seq_printf(m, "%s", sym_name); + else + seq_printf(m, "<%p>", (void *)addr); +} + +/* Statistics output - timer_info: one line per recorded timer, then totals */ +static int proc_read_top_info(struct seq_file *m, void *v) +{ + struct timer_top_info *top; + unsigned long events = 0, delta, secs, tenths, ratio, ratio_tenths; + + delta = jiffies - top_root.start_jiffies; + secs = delta / HZ; + tenths = (delta % HZ) * 10 / HZ; + + seq_printf(m, "Function counter - %s\n", VERSION); + seq_printf(m, "collection period: %ld.%ld seconds\n", secs, tenths); + + list_for_each_entry(top, timer_list, list) { + seq_printf(m, "%4lu %5d %-16s ", + top->counter, top->pid, top->comm); + print_name_offset(m, (unsigned long)top->start_func); + seq_puts(m, " ("); + print_name_offset(m, (unsigned long)top->expire_func); + seq_puts(m, ")\n"); + events += top->counter; + } + + ratio = delta ? events * HZ / delta : 0; /* delta is 0 when read in the jiffy start_jiffies was reset */ + ratio_tenths =
delta ? ((events * HZ) % delta) * 10 / delta : 0; + + seq_printf(m, "%4lu total events, %ld.%ld events/sec\n", + events, ratio, ratio_tenths); + + if (!top_root.record) + seq_printf(m, "Disabled\n"); + + return 0; +} + +static int proc_timertop_open(struct inode *inode, struct file *file) +{ + return single_open(file, proc_read_top_info, NULL); +} + +static struct file_operations proc_timertop_operations = { + .open = proc_timertop_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +#define MAX_INPUT_TOP 10 + +/* Receive some commands from user - timer_input */ +static int proc_write_timer_input(struct file *file, const char *page, + unsigned long count, void *data) +{ + int len; + char input_data[MAX_INPUT_TOP]; + unsigned long flags; + + /* input size checking */ + if (count > MAX_INPUT_TOP - 1) + len = MAX_INPUT_TOP - 1; + else + len = count; + + if (copy_from_user(input_data, page, len)) + return -EFAULT; + + input_data[len] = '\0'; + + spin_lock_irqsave(top_lock, flags); + if (!strncmp(input_data, "clear", 5)) { + timer_list_del(); + } else if (!strncmp(input_data, "start", 5)) { + top_root.start_jiffies = jiffies; + top_root.record = 1; + } else if (!strncmp(input_data, "stop", 4)) { + top_root.record = 0; + timer_list_del(); + } + spin_unlock_irqrestore(top_lock, flags); + + return len; +} + +/* Print a sample string showing the possible inputs - timer_input */ +static int proc_read_timer_input(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + return sprintf(page, "clear start stop\n"); +} + +static int __init init_top_info(void) +{ + top_root.cache = kmem_cache_create("top_info", + sizeof(struct timer_top_info), 0, SLAB_PANIC, NULL, NULL); + + top_info_file = create_proc_entry("timer_info", 0444, NULL); + if (top_info_file == NULL) + return -ENOMEM; + + top_info_file_out = create_proc_entry("timer_input", 0666, NULL); + if (top_info_file_out == NULL) + return -ENOMEM; + + /* Statistics output */ +
top_info_file->proc_fops = &proc_timertop_operations; + + /* Control */ + top_info_file_out->write_proc = &proc_write_timer_input; + top_info_file_out->read_proc = &proc_read_timer_input; + + return 0; +} + +module_init(init_top_info); Index: linux/kernel/workqueue.c =================================================================== --- linux.orig/kernel/workqueue.c +++ linux/kernel/workqueue.c @@ -115,12 +115,14 @@ int fastcall queue_work(struct workqueue return ret; } -static void delayed_work_timer_fn(unsigned long __data) +void delayed_work_timer_fn(unsigned long __data) { struct work_struct *work = (struct work_struct *)__data; struct workqueue_struct *wq = work->wq_data; int cpu = smp_processor_id(); + struct list_head *head; + head = &per_cpu_ptr(wq->cpu_wq, cpu)->more_work.task_list; if (unlikely(is_single_threaded(wq))) cpu = singlethread_cpu; @@ -128,11 +130,12 @@ static void delayed_work_timer_fn(unsign } int fastcall queue_delayed_work(struct workqueue_struct *wq, - struct work_struct *work, unsigned long delay) + struct work_struct *work, unsigned long delay) { int ret = 0; struct timer_list *timer = &work->timer; + timer_set_start_info(&work->timer); if (!test_and_set_bit(0, &work->pending)) { BUG_ON(timer_pending(timer)); BUG_ON(!list_empty(&work->entry)); @@ -405,6 +408,7 @@ int fastcall schedule_work(struct work_s int fastcall schedule_delayed_work(struct work_struct *work, unsigned long delay) { + timer_set_start_info(&work->timer); return queue_delayed_work(keventd_wq, work, delay); } Index: linux/lib/Kconfig.debug =================================================================== --- linux.orig/lib/Kconfig.debug +++ linux/lib/Kconfig.debug @@ -8,6 +8,22 @@ config PRINTK_TIME operations. This is useful for identifying long delays in kernel startup. +config PRINTK_IGNORE_LOGLEVEL + bool "Ignore loglevel on printks" + default n + help + Selecting this option causes all printk messages to go + to the console. 
This allows you to serial-log kernel + messages, no matter what userspace does. (e.g. some + distributions disable kernel log messages during + certain phases of system startup.) + + NOTE: this option also makes printk non-preemptible, + which might improve the output of debugging info or + crash info, but it might also cause latencies if your + kernel is printk-ing a lot. + + Normally you don't need or want this option. config MAGIC_SYSRQ bool "Magic SysRq key" @@ -77,6 +93,19 @@ config SCHEDSTATS application, you can say N to avoid the very slight overhead this adds. +config TIMER_INFO + bool "Collect kernel timers statistics" + depends on DEBUG_KERNEL && PROC_FS && NO_IDLE_HZ + help + If you say Y here, additional code will be inserted into the + timer routines to collect statistics about kernel timers being + reprogrammed through the dynamic ticks feature. The statistics + will be provided in /proc/timer_info and the behavior of this + feature can be controlled through /proc/timer_input. + The goal is to offer some output to let user applications show + timer pattern usage and allow some tuning in them to + maximise idle time. + config DEBUG_SLAB bool "Debug slab memory allocations" depends on DEBUG_KERNEL && SLAB Index: linux/mm/slab.c =================================================================== --- linux.orig/mm/slab.c +++ linux/mm/slab.c @@ -444,8 +444,13 @@ struct kmem_cache { * OTOH the cpuarrays can contain lots of objects, * which could lock up otherwise freeable slabs.
*/ -#define REAPTIMEOUT_CPUC (2*HZ) -#define REAPTIMEOUT_LIST3 (4*HZ) +#ifdef CONFIG_NO_IDLE_HZ +# define REAPTIMEOUT_CPUC (4*HZ) +# define REAPTIMEOUT_LIST3 (8*HZ) +#else +# define REAPTIMEOUT_CPUC (2*HZ) +# define REAPTIMEOUT_LIST3 (4*HZ) +#endif #if STATS #define STATS_INC_ACTIVE(x) ((x)->num_active++) Index: linux/net/ipv4/route.c =================================================================== --- linux.orig/net/ipv4/route.c +++ linux/net/ipv4/route.c @@ -244,7 +244,7 @@ static unsigned int rt_hash_rnd; static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); #define RT_CACHE_STAT_INC(field) \ - (per_cpu(rt_cache_stat, raw_smp_processor_id()).field++) + (__raw_get_cpu_var(rt_cache_stat).field++) static int rt_intern_hash(unsigned hash, struct rtable *rth, struct rtable **res);