Index: linux-hres.q/Documentation/kernel-parameters.txt =================================================================== --- linux-hres.q.orig/Documentation/kernel-parameters.txt +++ linux-hres.q/Documentation/kernel-parameters.txt @@ -61,6 +61,7 @@ parameter is applicable: MTD MTD support is enabled. NET Appropriate network support is enabled. NUMA NUMA support is enabled. + GENERIC_TIME The generic timeofday code is enabled. NFS Appropriate NFS support is enabled. OSS OSS sound support is enabled. PARIDE The ParIDE subsystem is enabled. @@ -176,6 +177,11 @@ running once the system is up. override platform specific driver. See also Documentation/acpi-hotkey.txt. + acpi_pm_good [IA-32,X86-64] + Override the pmtimer bug detection: force the kernel + to assume that this machine's pmtimer latches its value + and always returns good values. + enable_timer_pin_1 [i386,x86-64] Enable PIN 1 of APIC timer Can be useful to work around chipset bugs @@ -338,10 +344,11 @@ running once the system is up. Value can be changed at runtime via /selinux/checkreqprot. - clock= [BUGS=IA-32,HW] gettimeofday timesource override. - Forces specified timesource (if avaliable) to be used - when calculating gettimeofday(). If specicified - timesource is not avalible, it defaults to PIT. + clock= [BUGS=IA-32, HW] gettimeofday clocksource override. + [Deprecated] + Forces specified clocksource (if avaliable) to be used + when calculating gettimeofday(). If specified + clocksource is not avalible, it defaults to PIT. Format: { pit | tsc | cyclone | pmtmr } disable_8254_timer @@ -1605,6 +1612,16 @@ running once the system is up. time Show timing data prefixed to each printk message line + timeout_granularity= + [KNL] + Timeout granularity: process timer wheel timers every + timeout_granularity jiffies. Defaults to 1 (process + timers HZ times per second - most finegrained). + + clocksource= [GENERIC_TIME] Override the default clocksource + Override the default clocksource and use the clocksource + with the name specified. + tipar.timeout= [HW,PPT] Set communications timeout in tenths of a second (default 15). Index: linux-hres.q/Makefile =================================================================== --- linux-hres.q.orig/Makefile +++ linux-hres.q/Makefile @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 17 -EXTRAVERSION = +EXTRAVERSION =-hrt-dyntick4 NAME=Crazed Snow-Weasel # *DOCUMENTATION* Index: linux-hres.q/arch/i386/Kconfig =================================================================== --- linux-hres.q.orig/arch/i386/Kconfig +++ linux-hres.q/arch/i386/Kconfig @@ -14,6 +14,10 @@ config X86_32 486, 586, Pentiums, and various instruction-set-compatible chips by AMD, Cyrix, and others. +config GENERIC_TIME + bool + default y + config SEMAPHORE_SLEEPERS bool default y @@ -53,6 +57,8 @@ source "init/Kconfig" menu "Processor type and features" +source "kernel/time/Kconfig" + config SMP bool "Symmetric multi-processing support" ---help--- Index: linux-hres.q/arch/i386/kernel/Makefile =================================================================== --- linux-hres.q.orig/arch/i386/kernel/Makefile +++ linux-hres.q/arch/i386/kernel/Makefile @@ -7,10 +7,9 @@ extra-y := head.o init_task.o vmlinux.ld obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o \ ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \ pci-dma.o i386_ksyms.o i387.o bootflag.o \ - quirks.o i8237.o topology.o alternative.o + quirks.o i8237.o topology.o alternative.o i8253.o tsc.o obj-y += cpu/ -obj-y += timers/ obj-y += acpi/ obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o obj-$(CONFIG_MCA) += mca.o @@ -26,6 +25,7 @@ obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups.o obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o +obj-$(CONFIG_HPET_TIMER) += hpet.o obj-$(CONFIG_X86_NUMAQ) += numaq.o obj-$(CONFIG_X86_SUMMIT_NUMA) += summit.o obj-$(CONFIG_KPROBES) += kprobes.o Index: linux-hres.q/arch/i386/kernel/apic.c =================================================================== --- linux-hres.q.orig/arch/i386/kernel/apic.c +++ linux-hres.q/arch/i386/kernel/apic.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -59,6 +60,23 @@ int enable_local_apic __initdata = 0; /* */ int apic_verbosity; +static unsigned int calibration_result; + +static void lapic_next_event(unsigned long delta, struct clock_event *evt); +static void lapic_timer_setup(int mode, struct clock_event *evt); + +static struct clock_event lapic_clockevent = { + .name = "lapic", + .capabilities = CLOCK_CAP_NEXTEVT | CLOCK_CAP_PROFILE +#ifdef CONFIG_SMP + | CLOCK_CAP_UPDATE +#endif + , + .shift = 32, + .set_mode = lapic_timer_setup, + .set_next_event = lapic_next_event, +}; +static DEFINE_PER_CPU(struct clock_event, lapic_events); static void apic_pm_activate(void); @@ -909,6 +927,11 @@ fake_ioapic_page: */ /* + * FIXME: Move this to i8253.h. There is no need to keep the access to + * the PIT scattered all around the place -tglx + */ + +/* * The timer chip is already set up at HZ interrupts per second here, * but we do not accept timer interrupts yet. We only allow the BP * to calibrate. @@ -966,13 +989,15 @@ void (*wait_timer_tick)(void) __devinitd #define APIC_DIVISOR 16 -static void __setup_APIC_LVTT(unsigned int clocks) +static void __setup_APIC_LVTT(unsigned int clocks, int oneshot) { unsigned int lvtt_value, tmp_value, ver; int cpu = smp_processor_id(); ver = GET_APIC_VERSION(apic_read(APIC_LVR)); - lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR; + lvtt_value = LOCAL_TIMER_VECTOR; + if (!oneshot) + lvtt_value |= APIC_LVT_TIMER_PERIODIC; if (!APIC_INTEGRATED(ver)) lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); @@ -989,23 +1014,31 @@ static void __setup_APIC_LVTT(unsigned i & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | APIC_TDR_DIV_16); - apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); + if (!oneshot) + apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); +} + +static void lapic_next_event(unsigned long delta, struct clock_event *evt) +{ + apic_write_around(APIC_TMICT, delta); } -static void __devinit setup_APIC_timer(unsigned int clocks) +static void lapic_timer_setup(int mode, struct clock_event *evt) { unsigned long flags; local_irq_save(flags); + __setup_APIC_LVTT(calibration_result, mode != CLOCK_EVT_PERIODIC); + local_irq_restore(flags); +} - /* - * Wait for IRQ0's slice: - */ - wait_timer_tick(); +static void __devinit setup_APIC_timer(void) +{ + struct clock_event *levt = &__get_cpu_var(lapic_events); - __setup_APIC_LVTT(clocks); + memcpy(levt, &lapic_clockevent, sizeof(*levt)); - local_irq_restore(flags); + register_local_clockevent(levt); } /* @@ -1014,6 +1047,8 @@ static void __devinit setup_APIC_timer(u * to calibrate, since some later bootup code depends on getting * the first irq? Ugh. * + * TODO: Fix this rather than saying "Ugh" -tglx + * * We want to do the calibration only once since we * want to have local timer irqs syncron. CPUs connected * by the same APIC bus have the very same bus frequency. @@ -1036,7 +1071,7 @@ static int __init calibrate_APIC_clock(v * value into the APIC clock, we just want to get the * counter running for calibration. */ - __setup_APIC_LVTT(1000000000); + __setup_APIC_LVTT(1000000000, 0); /* * The timer chip counts down to zero. Let's wait @@ -1073,6 +1108,14 @@ static int __init calibrate_APIC_clock(v result = (tt1-tt2)*APIC_DIVISOR/LOOPS; + /* Calculate the scaled math multiplication factor */ + lapic_clockevent.mult = div_sc(tt1-tt2, TICK_NSEC * LOOPS, 32); + lapic_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); + printk("lapic max_delta_ns: %ld\n", lapic_clockevent.max_delta_ns); + lapic_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &lapic_clockevent); + if (cpu_has_tsc) apic_printk(APIC_VERBOSE, "..... CPU clock speed is " "%ld.%04ld MHz.\n", @@ -1087,8 +1130,6 @@ static int __init calibrate_APIC_clock(v return result; } -static unsigned int calibration_result; - void __init setup_boot_APIC_clock(void) { unsigned long flags; @@ -1101,14 +1142,14 @@ void __init setup_boot_APIC_clock(void) /* * Now set up the timer for real. */ - setup_APIC_timer(calibration_result); + setup_APIC_timer(); local_irq_restore(flags); } void __devinit setup_secondary_APIC_clock(void) { - setup_APIC_timer(calibration_result); + setup_APIC_timer(); } void disable_APIC_timer(void) @@ -1143,6 +1184,13 @@ void switch_APIC_timer_to_ipi(void *cpum !cpu_isset(cpu, timer_bcast_ipi)) { disable_APIC_timer(); cpu_set(cpu, timer_bcast_ipi); +#ifdef CONFIG_HIGH_RES_TIMERS + printk("Disabling NO_HZ and high resolution timers " + "due to timer broadcasting\n"); + for_each_possible_cpu(cpu) + per_cpu(lapic_events, cpu).capabilities &= + ~CLOCK_CAP_NEXTEVT; +#endif } } EXPORT_SYMBOL(switch_APIC_timer_to_ipi); @@ -1174,21 +1222,10 @@ EXPORT_SYMBOL(switch_ipi_to_APIC_timer); inline void smp_local_timer_interrupt(struct pt_regs * regs) { - profile_tick(CPU_PROFILING, regs); -#ifdef CONFIG_SMP - update_process_times(user_mode_vm(regs)); -#endif + int cpu = smp_processor_id(); + struct clock_event *evt = &per_cpu(lapic_events, cpu); - /* - * We take the 'long' return path, and there every subsystem - * grabs the apropriate locks (kernel lock/ irq lock). - * - * we might want to decouple profiling from the 'long path', - * and do the profiling totally in assembly. - * - * Currently this isn't too much of an issue (performance wise), - * we can take more than 100K local irqs per second on a 100 MHz P5. - */ + evt->event_handler(regs); } /* @@ -1203,6 +1240,7 @@ inline void smp_local_timer_interrupt(st fastcall void smp_apic_timer_interrupt(struct pt_regs *regs) { int cpu = smp_processor_id(); + struct clock_event *evt = &per_cpu(lapic_events, cpu); /* * the NMI deadlock-detector uses this. @@ -1220,7 +1258,7 @@ fastcall void smp_apic_timer_interrupt(s * interrupt lock, which is the WrongThing (tm) to do. */ irq_enter(); - smp_local_timer_interrupt(regs); + evt->event_handler(regs); irq_exit(); } Index: linux-hres.q/arch/i386/kernel/apm.c =================================================================== --- linux-hres.q.orig/arch/i386/kernel/apm.c +++ linux-hres.q/arch/i386/kernel/apm.c @@ -764,9 +764,9 @@ static int apm_do_idle(void) int idled = 0; int polling; - polling = test_thread_flag(TIF_POLLING_NRFLAG); + polling = tsk_is_polling(current); if (polling) { - clear_thread_flag(TIF_POLLING_NRFLAG); + current_thread_info()->status &= ~TS_POLLING; smp_mb__after_clear_bit(); } if (!need_resched()) { @@ -774,7 +774,7 @@ static int apm_do_idle(void) ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax); } if (polling) - set_thread_flag(TIF_POLLING_NRFLAG); + current_thread_info()->status |= ~TS_POLLING; if (!idled) return 0; Index: linux-hres.q/arch/i386/kernel/hpet.c =================================================================== --- /dev/null +++ linux-hres.q/arch/i386/kernel/hpet.c @@ -0,0 +1,67 @@ +#include +#include +#include +#include + +#include +#include + +#define HPET_MASK CLOCKSOURCE_MASK(32) +#define HPET_SHIFT 22 + +/* FSEC = 10^-15 NSEC = 10^-9 */ +#define FSEC_PER_NSEC 1000000 + +static void *hpet_ptr; + +static cycle_t read_hpet(void) +{ + return (cycle_t)readl(hpet_ptr); +} + +static struct clocksource clocksource_hpet = { + .name = "hpet", + .rating = 250, + .read = read_hpet, + .mask = HPET_MASK, + .mult = 0, /* set below */ + .shift = HPET_SHIFT, + .is_continuous = 1, +}; + +static int __init init_hpet_clocksource(void) +{ + unsigned long hpet_period; + void __iomem* hpet_base; + u64 tmp; + + if (!hpet_address) + return -ENODEV; + + /* calculate the hpet address: */ + hpet_base = + (void __iomem*)ioremap_nocache(hpet_address, HPET_MMAP_SIZE); + hpet_ptr = hpet_base + HPET_COUNTER; + + /* calculate the frequency: */ + hpet_period = readl(hpet_base + HPET_PERIOD); + + /* + * hpet period is in femto seconds per cycle + * so we need to convert this to ns/cyc units + * aproximated by mult/2^shift + * + * fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift + * fsec/cyc * 1ns/1000000fsec * 2^shift = mult + * fsec/cyc * 2^shift * 1nsec/1000000fsec = mult + * (fsec/cyc << shift)/1000000 = mult + * (hpet_period << shift)/FSEC_PER_NSEC = mult + */ + tmp = (u64)hpet_period << HPET_SHIFT; + do_div(tmp, FSEC_PER_NSEC); + clocksource_hpet.mult = (u32)tmp; + + return clocksource_register(&clocksource_hpet); +} + +module_init(init_hpet_clocksource); Index: linux-hres.q/arch/i386/kernel/i8253.c =================================================================== --- /dev/null +++ linux-hres.q/arch/i386/kernel/i8253.c @@ -0,0 +1,162 @@ +/* + * i8253.c 8253/PIT functions + * + */ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "io_ports.h" + +DEFINE_SPINLOCK(i8253_lock); +EXPORT_SYMBOL(i8253_lock); + +static void init_pit_timer(int mode, struct clock_event *evt) +{ + unsigned long flags; + + spin_lock_irqsave(&i8253_lock, flags); + + switch(mode) { + case CLOCK_EVT_PERIODIC: + /* binary, mode 2, LSB/MSB, ch 0 */ + outb_p(0x34, PIT_MODE); + udelay(10); + outb_p(LATCH & 0xff , PIT_CH0); /* LSB */ + outb(LATCH >> 8 , PIT_CH0); /* MSB */ + break; + + case CLOCK_EVT_ONESHOT: + case CLOCK_EVT_SHUTDOWN: + /* One shot setup */ + outb_p(0x38, PIT_MODE); + udelay(10); + break; + } + spin_unlock_irqrestore(&i8253_lock, flags); +} + +static void pit_next_event(unsigned long delta, struct clock_event *evt) +{ + unsigned long flags; + + spin_lock_irqsave(&i8253_lock, flags); + outb_p(delta & 0xff , PIT_CH0); /* LSB */ + outb(delta >> 8 , PIT_CH0); /* MSB */ + spin_unlock_irqrestore(&i8253_lock, flags); +} + +struct clock_event pit_clockevent = { + .name = "pit", + .capabilities = CLOCK_CAP_TICK | CLOCK_CAP_PROFILE | CLOCK_CAP_UPDATE +#ifndef CONFIG_SMP + | CLOCK_CAP_NEXTEVT +#endif + , + .set_mode = init_pit_timer, + .set_next_event = pit_next_event, + .shift = 32, +}; + +void setup_pit_timer(void) +{ + pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, 32); + pit_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFF, &pit_clockevent); + pit_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &pit_clockevent); + register_global_clockevent(&pit_clockevent); +} + +/* + * Since the PIT overflows every tick, its not very useful + * to just read by itself. So use jiffies to emulate a free + * running counter: + */ +static cycle_t pit_read(void) +{ + unsigned long flags; + int count; + u32 jifs; + static int old_count; + static u32 old_jifs; + + spin_lock_irqsave(&i8253_lock, flags); + /* + * Although our caller may have the read side of xtime_lock, + * this is now a seqlock, and we are cheating in this routine + * by having side effects on state that we cannot undo if + * there is a collision on the seqlock and our caller has to + * retry. (Namely, old_jifs and old_count.) So we must treat + * jiffies as volatile despite the lock. We read jiffies + * before latching the timer count to guarantee that although + * the jiffies value might be older than the count (that is, + * the counter may underflow between the last point where + * jiffies was incremented and the point where we latch the + * count), it cannot be newer. + */ + jifs = jiffies; + outb_p(0x00, PIT_MODE); /* latch the count ASAP */ + count = inb_p(PIT_CH0); /* read the latched count */ + count |= inb_p(PIT_CH0) << 8; + + /* VIA686a test code... reset the latch if count > max + 1 */ + if (count > LATCH) { + outb_p(0x34, PIT_MODE); + outb_p(LATCH & 0xff, PIT_CH0); + outb(LATCH >> 8, PIT_CH0); + count = LATCH - 1; + } + + /* + * It's possible for count to appear to go the wrong way for a + * couple of reasons: + * + * 1. The timer counter underflows, but we haven't handled the + * resulting interrupt and incremented jiffies yet. + * 2. Hardware problem with the timer, not giving us continuous time, + * the counter does small "jumps" upwards on some Pentium systems, + * (see c't 95/10 page 335 for Neptun bug.) + * + * Previous attempts to handle these cases intelligently were + * buggy, so we just do the simple thing now. + */ + if (count > old_count && jifs == old_jifs) { + count = old_count; + } + old_count = count; + old_jifs = jifs; + + spin_unlock_irqrestore(&i8253_lock, flags); + + count = (LATCH - 1) - count; + + return (cycle_t)(jifs * LATCH) + count; +} + +static struct clocksource clocksource_pit = { + .name = "pit", + .rating = 110, + .read = pit_read, + .mask = CLOCKSOURCE_MASK(32), + .mult = 0, + .shift = 20, +}; + +static int __init init_pit_clocksource(void) +{ + if (num_possible_cpus() > 4) /* PIT does not scale! */ + return 0; + + clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE, 20); + return clocksource_register(&clocksource_pit); +} +module_init(init_pit_clocksource); Index: linux-hres.q/arch/i386/kernel/irq.c =================================================================== --- linux-hres.q.orig/arch/i386/kernel/irq.c +++ linux-hres.q/arch/i386/kernel/irq.c @@ -75,6 +75,19 @@ fastcall unsigned int do_IRQ(struct pt_r } } #endif +#ifdef CONFIG_NO_HZ + if (idle_cpu(smp_processor_id())) { + update_jiffies(); + /* + * Force polling-idle loops to break out into + * the sched-timer setting code, to make sure + * that timer interval changes due to __mod_timer() + * in IRQ context get properly propagated: + */ + if (tsk_is_polling(current)) + set_need_resched(); + } +#endif #ifdef CONFIG_4KSTACKS Index: linux-hres.q/arch/i386/kernel/nmi.c =================================================================== --- linux-hres.q.orig/arch/i386/kernel/nmi.c +++ linux-hres.q/arch/i386/kernel/nmi.c @@ -532,7 +532,7 @@ void nmi_watchdog_tick (struct pt_regs * unsigned int sum; int cpu = smp_processor_id(); - sum = per_cpu(irq_stat, cpu).apic_timer_irqs; + sum = per_cpu(irq_stat, cpu).apic_timer_irqs + kstat_irqs(0); if (last_irq_sums[cpu] == sum) { /* Index: linux-hres.q/arch/i386/kernel/numaq.c =================================================================== --- linux-hres.q.orig/arch/i386/kernel/numaq.c +++ linux-hres.q/arch/i386/kernel/numaq.c @@ -79,10 +79,12 @@ int __init get_memcfg_numaq(void) return 1; } -static int __init numaq_dsc_disable(void) +static int __init numaq_tsc_disable(void) { - printk(KERN_DEBUG "NUMAQ: disabling TSC\n"); - tsc_disable = 1; + if (num_online_nodes() > 1) { + printk(KERN_DEBUG "NUMAQ: disabling TSC\n"); + tsc_disable = 1; + } return 0; } -core_initcall(numaq_dsc_disable); +arch_initcall(numaq_tsc_disable); Index: linux-hres.q/arch/i386/kernel/process.c =================================================================== --- linux-hres.q.orig/arch/i386/kernel/process.c +++ linux-hres.q/arch/i386/kernel/process.c @@ -102,16 +102,20 @@ void default_idle(void) local_irq_enable(); if (!hlt_counter && boot_cpu_data.hlt_works_ok) { - clear_thread_flag(TIF_POLLING_NRFLAG); + current_thread_info()->status &= ~TS_POLLING; smp_mb__after_clear_bit(); while (!need_resched()) { local_irq_disable(); - if (!need_resched()) - safe_halt(); - else + if (!need_resched()) { + if (!hrtimer_stop_sched_tick()) + safe_halt(); + else + local_irq_enable(); + hrtimer_restart_sched_tick(); + } else local_irq_enable(); } - set_thread_flag(TIF_POLLING_NRFLAG); + current_thread_info()->status |= TS_POLLING; } else { while (!need_resched()) cpu_relax(); @@ -126,16 +130,18 @@ EXPORT_SYMBOL(default_idle); * to poll the ->work.need_resched flag instead of waiting for the * cross-CPU IPI to arrive. Use this option with caution. */ -static void poll_idle (void) +static void poll_idle(void) { local_irq_enable(); - asm volatile( - "2:" - "testl %0, %1;" - "rep; nop;" - "je 2b;" - : : "i"(_TIF_NEED_RESCHED), "m" (current_thread_info()->flags)); + while (!need_resched()) { + hrtimer_stop_sched_tick(); + local_irq_enable(); + while (!need_resched() && !rcu_pending(smp_processor_id()) && !local_softirq_pending()) + rep_nop(); + hrtimer_restart_sched_tick(); + local_irq_enable(); + } } #ifdef CONFIG_HOTPLUG_CPU @@ -174,7 +180,7 @@ void cpu_idle(void) { int cpu = smp_processor_id(); - set_thread_flag(TIF_POLLING_NRFLAG); + current_thread_info()->status |= TS_POLLING; /* endless idle loop with no priority at all */ while (1) { @@ -242,12 +248,15 @@ static void mwait_idle(void) local_irq_enable(); while (!need_resched()) { + if (hrtimer_stop_sched_tick()) + break; __monitor((void *)¤t_thread_info()->flags, 0, 0); smp_mb(); if (need_resched()) break; __mwait(0, 0); } + hrtimer_restart_sched_tick(); } void __devinit select_idle_routine(const struct cpuinfo_x86 *c) Index: linux-hres.q/arch/i386/kernel/setup.c =================================================================== --- linux-hres.q.orig/arch/i386/kernel/setup.c +++ linux-hres.q/arch/i386/kernel/setup.c @@ -1583,6 +1583,7 @@ void __init setup_arch(char **cmdline_p) conswitchp = &dummy_con; #endif #endif + tsc_init(); } static __init int add_pcspkr(void) Index: linux-hres.q/arch/i386/kernel/time.c =================================================================== --- linux-hres.q.orig/arch/i386/kernel/time.c +++ linux-hres.q/arch/i386/kernel/time.c @@ -82,13 +82,6 @@ extern unsigned long wall_jiffies; DEFINE_SPINLOCK(rtc_lock); EXPORT_SYMBOL(rtc_lock); -#include - -DEFINE_SPINLOCK(i8253_lock); -EXPORT_SYMBOL(i8253_lock); - -struct timer_opts *cur_timer __read_mostly = &timer_none; - /* * This is a special lock that is owned by the CPU and holds the index * register we are working with. It is required for NMI access to the @@ -118,99 +111,19 @@ void rtc_cmos_write(unsigned char val, u } EXPORT_SYMBOL(rtc_cmos_write); -/* - * This version of gettimeofday has microsecond resolution - * and better than microsecond precision on fast x86 machines with TSC. - */ -void do_gettimeofday(struct timeval *tv) -{ - unsigned long seq; - unsigned long usec, sec; - unsigned long max_ntp_tick; - - do { - unsigned long lost; - - seq = read_seqbegin(&xtime_lock); - - usec = cur_timer->get_offset(); - lost = jiffies - wall_jiffies; - - /* - * If time_adjust is negative then NTP is slowing the clock - * so make sure not to go into next possible interval. - * Better to lose some accuracy than have time go backwards.. - */ - if (unlikely(time_adjust < 0)) { - max_ntp_tick = (USEC_PER_SEC / HZ) - tickadj; - usec = min(usec, max_ntp_tick); - - if (lost) - usec += lost * max_ntp_tick; - } - else if (unlikely(lost)) - usec += lost * (USEC_PER_SEC / HZ); - - sec = xtime.tv_sec; - usec += (xtime.tv_nsec / 1000); - } while (read_seqretry(&xtime_lock, seq)); - - while (usec >= 1000000) { - usec -= 1000000; - sec++; - } - - tv->tv_sec = sec; - tv->tv_usec = usec; -} - -EXPORT_SYMBOL(do_gettimeofday); - -int do_settimeofday(struct timespec *tv) -{ - time_t wtm_sec, sec = tv->tv_sec; - long wtm_nsec, nsec = tv->tv_nsec; - - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - write_seqlock_irq(&xtime_lock); - /* - * This is revolting. We need to set "xtime" correctly. However, the - * value in this location is the value at the most recent update of - * wall time. Discover what correction gettimeofday() would have - * made, and then undo it! - */ - nsec -= cur_timer->get_offset() * NSEC_PER_USEC; - nsec -= (jiffies - wall_jiffies) * TICK_NSEC; - - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); - - set_normalized_timespec(&xtime, sec, nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - - ntp_clear(); - write_sequnlock_irq(&xtime_lock); - clock_was_set(); - return 0; -} - -EXPORT_SYMBOL(do_settimeofday); - static int set_rtc_mmss(unsigned long nowtime) { int retval; - - WARN_ON(irqs_disabled()); + unsigned long flags; /* gets recalled with irq locally disabled */ - spin_lock_irq(&rtc_lock); + /* XXX - does irqsave resolve this? -johnstul */ + spin_lock_irqsave(&rtc_lock, flags); if (efi_enabled) retval = efi_set_rtc_mmss(nowtime); else retval = mach_set_rtc_mmss(nowtime); - spin_unlock_irq(&rtc_lock); + spin_unlock_irqrestore(&rtc_lock, flags); return retval; } @@ -218,16 +131,6 @@ static int set_rtc_mmss(unsigned long no int timer_ack; -/* monotonic_clock(): returns # of nanoseconds passed since time_init() - * Note: This function is required to return accurate - * time even in the absence of multiple timer ticks. - */ -unsigned long long monotonic_clock(void) -{ - return cur_timer->monotonic_clock(); -} -EXPORT_SYMBOL(monotonic_clock); - #if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER) unsigned long profile_pc(struct pt_regs *regs) { @@ -242,10 +145,11 @@ EXPORT_SYMBOL(profile_pc); #endif /* - * timer_interrupt() needs to keep up the real-time clock, - * as well as call the "do_timer()" routine every clocktick + * This is the same as the above, except we _also_ save the current + * Time Stamp Counter value at the time of the timer interrupt, so that + * we later on can estimate the time of day more exactly. */ -static inline void do_timer_interrupt(int irq, struct pt_regs *regs) +irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) { #ifdef CONFIG_X86_IO_APIC if (timer_ack) { @@ -265,7 +169,6 @@ static inline void do_timer_interrupt(in do_timer_interrupt_hook(regs); - if (MCA_bus) { /* The PS/2 uses level-triggered interrupts. You can't turn them off, nor would you want to (any attempt to @@ -279,29 +182,6 @@ static inline void do_timer_interrupt(in irq = inb_p( 0x61 ); /* read the current state */ outb_p( irq|0x80, 0x61 ); /* reset the IRQ */ } -} - -/* - * This is the same as the above, except we _also_ save the current - * Time Stamp Counter value at the time of the timer interrupt, so that - * we later on can estimate the time of day more exactly. - */ -irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) -{ - /* - * Here we are in the timer irq handler. We just have irqs locally - * disabled but we don't know if the timer_bh is running on the other - * CPU. We need to avoid to SMP race with it. NOTE: we don' t need - * the irq version of write_lock because as just said we have irq - * locally disabled. -arca - */ - write_seqlock(&xtime_lock); - - cur_timer->mark_offset(); - - do_timer_interrupt(irq, regs); - - write_sequnlock(&xtime_lock); #ifdef CONFIG_X86_LOCAL_APIC if (using_apic_timer) @@ -380,7 +260,6 @@ void notify_arch_cmos_timer(void) static long clock_cmos_diff, sleep_start; -static struct timer_opts *last_timer; static int timer_suspend(struct sys_device *dev, pm_message_t state) { /* @@ -389,10 +268,6 @@ static int timer_suspend(struct sys_devi clock_cmos_diff = -get_cmos_time(); clock_cmos_diff += get_seconds(); sleep_start = get_cmos_time(); - last_timer = cur_timer; - cur_timer = &timer_none; - if (last_timer->suspend) - last_timer->suspend(state); return 0; } @@ -415,10 +290,6 @@ static int timer_resume(struct sys_devic jiffies_64 += sleep_length; wall_jiffies += sleep_length; write_sequnlock_irqrestore(&xtime_lock, flags); - if (last_timer->resume) - last_timer->resume(); - cur_timer = last_timer; - last_timer = NULL; touch_softlockup_watchdog(); return 0; } @@ -460,9 +331,6 @@ static void __init hpet_time_init(void) printk("Using HPET for base-timer\n"); } - cur_timer = select_timer(); - printk(KERN_INFO "Using %s for high-res timesource\n",cur_timer->name); - time_init_hook(); } #endif @@ -484,8 +352,5 @@ void __init time_init(void) set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec); - cur_timer = select_timer(); - printk(KERN_INFO "Using %s for high-res timesource\n",cur_timer->name); - time_init_hook(); } Index: linux-hres.q/arch/i386/kernel/timers/Makefile =================================================================== --- linux-hres.q.orig/arch/i386/kernel/timers/Makefile +++ /dev/null @@ -1,9 +0,0 @@ -# -# Makefile for x86 timers -# - -obj-y := timer.o timer_none.o timer_tsc.o timer_pit.o common.o - -obj-$(CONFIG_X86_CYCLONE_TIMER) += timer_cyclone.o -obj-$(CONFIG_HPET_TIMER) += timer_hpet.o -obj-$(CONFIG_X86_PM_TIMER) += timer_pm.o Index: linux-hres.q/arch/i386/kernel/timers/common.c =================================================================== --- linux-hres.q.orig/arch/i386/kernel/timers/common.c +++ /dev/null @@ -1,172 +0,0 @@ -/* - * Common functions used across the timers go here - */ - -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "mach_timer.h" - -/* ------ Calibrate the TSC ------- - * Return 2^32 * (1 / (TSC clocks per usec)) for do_fast_gettimeoffset(). - * Too much 64-bit arithmetic here to do this cleanly in C, and for - * accuracy's sake we want to keep the overhead on the CTC speaker (channel 2) - * output busy loop as low as possible. We avoid reading the CTC registers - * directly because of the awkward 8-bit access mechanism of the 82C54 - * device. - */ - -#define CALIBRATE_TIME (5 * 1000020/HZ) - -unsigned long calibrate_tsc(void) -{ - mach_prepare_counter(); - - { - unsigned long startlow, starthigh; - unsigned long endlow, endhigh; - unsigned long count; - - rdtsc(startlow,starthigh); - mach_countup(&count); - rdtsc(endlow,endhigh); - - - /* Error: ECTCNEVERSET */ - if (count <= 1) - goto bad_ctc; - - /* 64-bit subtract - gcc just messes up with long longs */ - __asm__("subl %2,%0\n\t" - "sbbl %3,%1" - :"=a" (endlow), "=d" (endhigh) - :"g" (startlow), "g" (starthigh), - "0" (endlow), "1" (endhigh)); - - /* Error: ECPUTOOFAST */ - if (endhigh) - goto bad_ctc; - - /* Error: ECPUTOOSLOW */ - if (endlow <= CALIBRATE_TIME) - goto bad_ctc; - - __asm__("divl %2" - :"=a" (endlow), "=d" (endhigh) - :"r" (endlow), "0" (0), "1" (CALIBRATE_TIME)); - - return endlow; - } - - /* - * The CTC wasn't reliable: we got a hit on the very first read, - * or the CPU was so fast/slow that the quotient wouldn't fit in - * 32 bits.. - */ -bad_ctc: - return 0; -} - -#ifdef CONFIG_HPET_TIMER -/* ------ Calibrate the TSC using HPET ------- - * Return 2^32 * (1 / (TSC clocks per usec)) for getting the CPU freq. - * Second output is parameter 1 (when non NULL) - * Set 2^32 * (1 / (tsc per HPET clk)) for delay_hpet(). - * calibrate_tsc() calibrates the processor TSC by comparing - * it to the HPET timer of known frequency. - * Too much 64-bit arithmetic here to do this cleanly in C - */ -#define CALIBRATE_CNT_HPET (5 * hpet_tick) -#define CALIBRATE_TIME_HPET (5 * KERNEL_TICK_USEC) - -unsigned long __devinit calibrate_tsc_hpet(unsigned long *tsc_hpet_quotient_ptr) -{ - unsigned long tsc_startlow, tsc_starthigh; - unsigned long tsc_endlow, tsc_endhigh; - unsigned long hpet_start, hpet_end; - unsigned long result, remain; - - hpet_start = hpet_readl(HPET_COUNTER); - rdtsc(tsc_startlow, tsc_starthigh); - do { - hpet_end = hpet_readl(HPET_COUNTER); - } while ((hpet_end - hpet_start) < CALIBRATE_CNT_HPET); - rdtsc(tsc_endlow, tsc_endhigh); - - /* 64-bit subtract - gcc just messes up with long longs */ - __asm__("subl %2,%0\n\t" - "sbbl %3,%1" - :"=a" (tsc_endlow), "=d" (tsc_endhigh) - :"g" (tsc_startlow), "g" (tsc_starthigh), - "0" (tsc_endlow), "1" (tsc_endhigh)); - - /* Error: ECPUTOOFAST */ - if (tsc_endhigh) - goto bad_calibration; - - /* Error: ECPUTOOSLOW */ - if (tsc_endlow <= CALIBRATE_TIME_HPET) - goto bad_calibration; - - ASM_DIV64_REG(result, remain, tsc_endlow, 0, CALIBRATE_TIME_HPET); - if (remain > (tsc_endlow >> 1)) - result++; /* rounding the result */ - - if (tsc_hpet_quotient_ptr) { - unsigned long tsc_hpet_quotient; - - ASM_DIV64_REG(tsc_hpet_quotient, remain, tsc_endlow, 0, - CALIBRATE_CNT_HPET); - if (remain > (tsc_endlow >> 1)) - tsc_hpet_quotient++; /* rounding the result */ - *tsc_hpet_quotient_ptr = tsc_hpet_quotient; - } - - return result; -bad_calibration: - /* - * the CPU was so fast/slow that the quotient wouldn't fit in - * 32 bits.. - */ - return 0; -} -#endif - - -unsigned long read_timer_tsc(void) -{ - unsigned long retval; - rdtscl(retval); - return retval; -} - - -/* calculate cpu_khz */ -void init_cpu_khz(void) -{ - if (cpu_has_tsc) { - unsigned long tsc_quotient = calibrate_tsc(); - if (tsc_quotient) { - /* report CPU clock rate in Hz. - * The formula is (10^6 * 2^32) / (2^32 * 1 / (clocks/us)) = - * clock/second. Our precision is about 100 ppm. - */ - { unsigned long eax=0, edx=1000; - __asm__("divl %2" - :"=a" (cpu_khz), "=d" (edx) - :"r" (tsc_quotient), - "0" (eax), "1" (edx)); - printk("Detected %u.%03u MHz processor.\n", - cpu_khz / 1000, cpu_khz % 1000); - } - } - } -} - Index: linux-hres.q/arch/i386/kernel/timers/timer.c =================================================================== --- linux-hres.q.orig/arch/i386/kernel/timers/timer.c +++ /dev/null @@ -1,75 +0,0 @@ -#include -#include -#include -#include - -#ifdef CONFIG_HPET_TIMER -/* - * HPET memory read is slower than tsc reads, but is more dependable as it - * always runs at constant frequency and reduces complexity due to - * cpufreq. So, we prefer HPET timer to tsc based one. Also, we cannot use - * timer_pit when HPET is active. So, we default to timer_tsc. - */ -#endif -/* list of timers, ordered by preference, NULL terminated */ -static struct init_timer_opts* __initdata timers[] = { -#ifdef CONFIG_X86_CYCLONE_TIMER - &timer_cyclone_init, -#endif -#ifdef CONFIG_HPET_TIMER - &timer_hpet_init, -#endif -#ifdef CONFIG_X86_PM_TIMER - &timer_pmtmr_init, -#endif - &timer_tsc_init, - &timer_pit_init, - NULL, -}; - -static char clock_override[10] __initdata; - -static int __init clock_setup(char* str) -{ - if (str) - strlcpy(clock_override, str, sizeof(clock_override)); - return 1; -} -__setup("clock=", clock_setup); - - -/* The chosen timesource has been found to be bad. - * Fall back to a known good timesource (the PIT) - */ -void clock_fallback(void) -{ - cur_timer = &timer_pit; -} - -/* iterates through the list of timers, returning the first - * one that initializes successfully. - */ -struct timer_opts* __init select_timer(void) -{ - int i = 0; - - /* find most preferred working timer */ - while (timers[i]) { - if (timers[i]->init) - if (timers[i]->init(clock_override) == 0) - return timers[i]->opts; - ++i; - } - - panic("select_timer: Cannot find a suitable timer\n"); - return NULL; -} - -int read_current_timer(unsigned long *timer_val) -{ - if (cur_timer->read_timer) { - *timer_val = cur_timer->read_timer(); - return 0; - } - return -1; -} Index: linux-hres.q/arch/i386/kernel/timers/timer_cyclone.c =================================================================== --- linux-hres.q.orig/arch/i386/kernel/timers/timer_cyclone.c +++ /dev/null @@ -1,259 +0,0 @@ -/* Cyclone-timer: - * This code implements timer_ops for the cyclone counter found - * on IBM x440, x360, and other Summit based systems. - * - * Copyright (C) 2002 IBM, John Stultz (johnstul@us.ibm.com) - */ - - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include "io_ports.h" - -/* Number of usecs that the last interrupt was delayed */ -static int delay_at_last_interrupt; - -#define CYCLONE_CBAR_ADDR 0xFEB00CD0 -#define CYCLONE_PMCC_OFFSET 0x51A0 -#define CYCLONE_MPMC_OFFSET 0x51D0 -#define CYCLONE_MPCS_OFFSET 0x51A8 -#define CYCLONE_TIMER_FREQ 100000000 -#define CYCLONE_TIMER_MASK (((u64)1<<40)-1) /* 40 bit mask */ -int use_cyclone = 0; - -static u32* volatile cyclone_timer; /* Cyclone MPMC0 register */ -static u32 last_cyclone_low; -static u32 last_cyclone_high; -static unsigned long long monotonic_base; -static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; - -/* helper macro to atomically read both cyclone counter registers */ -#define read_cyclone_counter(low,high) \ - do{ \ - high = cyclone_timer[1]; low = cyclone_timer[0]; \ - } while (high != cyclone_timer[1]); - - -static void mark_offset_cyclone(void) -{ - unsigned long lost, delay; - unsigned long delta = last_cyclone_low; - int count; - unsigned long long this_offset, last_offset; - - write_seqlock(&monotonic_lock); - last_offset = ((unsigned long long)last_cyclone_high<<32)|last_cyclone_low; - - spin_lock(&i8253_lock); - read_cyclone_counter(last_cyclone_low,last_cyclone_high); - - /* read values for delay_at_last_interrupt */ - outb_p(0x00, 0x43); /* latch the count ASAP */ - - count = inb_p(0x40); /* read the latched count */ - count |= inb(0x40) << 8; - - /* - * VIA686a test code... reset the latch if count > max + 1 - * from timer_pit.c - cjb - */ - if (count > LATCH) { - outb_p(0x34, PIT_MODE); - outb_p(LATCH & 0xff, PIT_CH0); - outb(LATCH >> 8, PIT_CH0); - count = LATCH - 1; - } - spin_unlock(&i8253_lock); - - /* lost tick compensation */ - delta = last_cyclone_low - delta; - delta /= (CYCLONE_TIMER_FREQ/1000000); - delta += delay_at_last_interrupt; - lost = delta/(1000000/HZ); - delay = delta%(1000000/HZ); - if (lost >= 2) - jiffies_64 += lost-1; - - /* update the monotonic base value */ - this_offset = ((unsigned long long)last_cyclone_high<<32)|last_cyclone_low; - monotonic_base += (this_offset - last_offset) & CYCLONE_TIMER_MASK; - write_sequnlock(&monotonic_lock); - - /* calculate delay_at_last_interrupt */ - count = ((LATCH-1) - count) * TICK_SIZE; - delay_at_last_interrupt = (count + LATCH/2) / LATCH; - - - /* catch corner case where tick rollover occured - * between cyclone and pit reads (as noted when - * usec delta is > 90% # of usecs/tick) - */ - if (lost && abs(delay - delay_at_last_interrupt) > (900000/HZ)) - jiffies_64++; -} - -static unsigned long get_offset_cyclone(void) -{ - u32 offset; - - if(!cyclone_timer) - return delay_at_last_interrupt; - - /* Read the cyclone timer */ - offset = cyclone_timer[0]; - - /* .. relative to previous jiffy */ - offset = offset - last_cyclone_low; - - /* convert cyclone ticks to microseconds */ - /* XXX slow, can we speed this up? */ - offset = offset/(CYCLONE_TIMER_FREQ/1000000); - - /* our adjusted time offset in microseconds */ - return delay_at_last_interrupt + offset; -} - -static unsigned long long monotonic_clock_cyclone(void) -{ - u32 now_low, now_high; - unsigned long long last_offset, this_offset, base; - unsigned long long ret; - unsigned seq; - - /* atomically read monotonic base & last_offset */ - do { - seq = read_seqbegin(&monotonic_lock); - last_offset = ((unsigned long long)last_cyclone_high<<32)|last_cyclone_low; - base = monotonic_base; - } while (read_seqretry(&monotonic_lock, seq)); - - - /* Read the cyclone counter */ - read_cyclone_counter(now_low,now_high); - this_offset = ((unsigned long long)now_high<<32)|now_low; - - /* convert to nanoseconds */ - ret = base + ((this_offset - last_offset)&CYCLONE_TIMER_MASK); - return ret * (1000000000 / CYCLONE_TIMER_FREQ); -} - -static int __init init_cyclone(char* override) -{ - u32* reg; - u32 base; /* saved cyclone base address */ - u32 pageaddr; /* page that contains cyclone_timer register */ - u32 offset; /* offset from pageaddr to cyclone_timer register */ - int i; - - /* check clock override */ - if (override[0] && strncmp(override,"cyclone",7)) - return -ENODEV; - - /*make sure we're on a summit box*/ - if(!use_cyclone) return -ENODEV; - - printk(KERN_INFO "Summit chipset: Starting Cyclone Counter.\n"); - - /* find base address */ - pageaddr = (CYCLONE_CBAR_ADDR)&PAGE_MASK; - offset = (CYCLONE_CBAR_ADDR)&(~PAGE_MASK); - set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr); - reg = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset); - if(!reg){ - printk(KERN_ERR "Summit chipset: Could not find valid CBAR register.\n"); - return -ENODEV; - } - base = *reg; - if(!base){ - printk(KERN_ERR "Summit chipset: Could not find valid CBAR value.\n"); - return -ENODEV; - } - - /* setup PMCC */ - pageaddr = (base + CYCLONE_PMCC_OFFSET)&PAGE_MASK; - offset = (base + CYCLONE_PMCC_OFFSET)&(~PAGE_MASK); - set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr); - reg = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset); - if(!reg){ - printk(KERN_ERR "Summit chipset: Could not find valid PMCC register.\n"); - return -ENODEV; - } - reg[0] = 0x00000001; - - /* setup MPCS */ - pageaddr = (base + CYCLONE_MPCS_OFFSET)&PAGE_MASK; - offset = (base + CYCLONE_MPCS_OFFSET)&(~PAGE_MASK); - set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr); - reg = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset); - if(!reg){ - printk(KERN_ERR "Summit chipset: Could not find valid MPCS register.\n"); - return -ENODEV; - } - reg[0] = 0x00000001; - - /* map in cyclone_timer */ - pageaddr = (base + CYCLONE_MPMC_OFFSET)&PAGE_MASK; - offset = (base + CYCLONE_MPMC_OFFSET)&(~PAGE_MASK); - set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr); - cyclone_timer = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset); - if(!cyclone_timer){ - printk(KERN_ERR "Summit chipset: Could not find valid MPMC register.\n"); - return -ENODEV; - } - - /*quick test to make sure its ticking*/ - for(i=0; i<3; i++){ - u32 old = cyclone_timer[0]; - int stall = 100; - while(stall--) barrier(); - if(cyclone_timer[0] == old){ - printk(KERN_ERR "Summit chipset: Counter not counting! DISABLED\n"); - cyclone_timer = 0; - return -ENODEV; - } - } - - init_cpu_khz(); - - /* Everything looks good! */ - return 0; -} - - -static void delay_cyclone(unsigned long loops) -{ - unsigned long bclock, now; - if(!cyclone_timer) - return; - bclock = cyclone_timer[0]; - do { - rep_nop(); - now = cyclone_timer[0]; - } while ((now-bclock) < loops); -} -/************************************************************/ - -/* cyclone timer_opts struct */ -static struct timer_opts timer_cyclone = { - .name = "cyclone", - .mark_offset = mark_offset_cyclone, - .get_offset = get_offset_cyclone, - .monotonic_clock = monotonic_clock_cyclone, - .delay = delay_cyclone, -}; - -struct init_timer_opts __initdata timer_cyclone_init = { - .init = init_cyclone, - .opts = &timer_cyclone, -}; Index: linux-hres.q/arch/i386/kernel/timers/timer_hpet.c =================================================================== --- linux-hres.q.orig/arch/i386/kernel/timers/timer_hpet.c +++ /dev/null @@ -1,217 +0,0 @@ -/* - * This code largely moved from arch/i386/kernel/time.c. - * See comments there for proper credits. - */ - -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "io_ports.h" -#include "mach_timer.h" -#include - -static unsigned long hpet_usec_quotient __read_mostly; /* convert hpet clks to usec */ -static unsigned long tsc_hpet_quotient __read_mostly; /* convert tsc to hpet clks */ -static unsigned long hpet_last; /* hpet counter value at last tick*/ -static unsigned long last_tsc_low; /* lsb 32 bits of Time Stamp Counter */ -static unsigned long last_tsc_high; /* msb 32 bits of Time Stamp Counter */ -static unsigned long long monotonic_base; -static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; - -/* convert from cycles(64bits) => nanoseconds (64bits) - * basic equation: - * ns = cycles / (freq / ns_per_sec) - * ns = cycles * (ns_per_sec / freq) - * ns = cycles * (10^9 / (cpu_khz * 10^3)) - * ns = cycles * (10^6 / cpu_khz) - * - * Then we use scaling math (suggested by george@mvista.com) to get: - * ns = cycles * (10^6 * SC / cpu_khz) / SC - * ns = cycles * cyc2ns_scale / SC - * - * And since SC is a constant power of two, we can convert the div - * into a shift. - * - * We can use khz divisor instead of mhz to keep a better percision, since - * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. - * (mathieu.desnoyers@polymtl.ca) - * - * -johnstul@us.ibm.com "math is hard, lets go shopping!" - */ -static unsigned long cyc2ns_scale __read_mostly; -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ - -static inline void set_cyc2ns_scale(unsigned long cpu_khz) -{ - cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz; -} - -static inline unsigned long long cycles_2_ns(unsigned long long cyc) -{ - return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; -} - -static unsigned long long monotonic_clock_hpet(void) -{ - unsigned long long last_offset, this_offset, base; - unsigned seq; - - /* atomically read monotonic base & last_offset */ - do { - seq = read_seqbegin(&monotonic_lock); - last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - base = monotonic_base; - } while (read_seqretry(&monotonic_lock, seq)); - - /* Read the Time Stamp Counter */ - rdtscll(this_offset); - - /* return the value in ns */ - return base + cycles_2_ns(this_offset - last_offset); -} - -static unsigned long get_offset_hpet(void) -{ - register unsigned long eax, edx; - - eax = hpet_readl(HPET_COUNTER); - eax -= hpet_last; /* hpet delta */ - eax = min(hpet_tick, eax); - /* - * Time offset = (hpet delta) * ( usecs per HPET clock ) - * = (hpet delta) * ( usecs per tick / HPET clocks per tick) - * = (hpet delta) * ( hpet_usec_quotient ) / (2^32) - * - * Where, - * hpet_usec_quotient = (2^32 * usecs per tick)/HPET clocks per tick - * - * Using a mull instead of a divl saves some cycles in critical path. - */ - ASM_MUL64_REG(eax, edx, hpet_usec_quotient, eax); - - /* our adjusted time offset in microseconds */ - return edx; -} - -static void mark_offset_hpet(void) -{ - unsigned long long this_offset, last_offset; - unsigned long offset; - - write_seqlock(&monotonic_lock); - last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - rdtsc(last_tsc_low, last_tsc_high); - - if (hpet_use_timer) - offset = hpet_readl(HPET_T0_CMP) - hpet_tick; - else - offset = hpet_readl(HPET_COUNTER); - if (unlikely(((offset - hpet_last) >= (2*hpet_tick)) && (hpet_last != 0))) { - int lost_ticks = ((offset - hpet_last) / hpet_tick) - 1; - jiffies_64 += lost_ticks; - } - hpet_last = offset; - - /* update the monotonic base value */ - this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - monotonic_base += cycles_2_ns(this_offset - last_offset); - write_sequnlock(&monotonic_lock); -} - -static void delay_hpet(unsigned long loops) -{ - unsigned long hpet_start, hpet_end; - unsigned long eax; - - /* loops is the number of cpu cycles. Convert it to hpet clocks */ - ASM_MUL64_REG(eax, loops, tsc_hpet_quotient, loops); - - hpet_start = hpet_readl(HPET_COUNTER); - do { - rep_nop(); - hpet_end = hpet_readl(HPET_COUNTER); - } while ((hpet_end - hpet_start) < (loops)); -} - -static struct timer_opts timer_hpet; - -static int __init init_hpet(char* override) -{ - unsigned long result, remain; - - /* check clock override */ - if (override[0] && strncmp(override,"hpet",4)) - return -ENODEV; - - if (!is_hpet_enabled()) - return -ENODEV; - - printk("Using HPET for gettimeofday\n"); - if (cpu_has_tsc) { - unsigned long tsc_quotient = calibrate_tsc_hpet(&tsc_hpet_quotient); - if (tsc_quotient) { - /* report CPU clock rate in Hz. - * The formula is (10^6 * 2^32) / (2^32 * 1 / (clocks/us)) = - * clock/second. Our precision is about 100 ppm. - */ - { unsigned long eax=0, edx=1000; - ASM_DIV64_REG(cpu_khz, edx, tsc_quotient, - eax, edx); - printk("Detected %u.%03u MHz processor.\n", - cpu_khz / 1000, cpu_khz % 1000); - } - set_cyc2ns_scale(cpu_khz); - } - /* set this only when cpu_has_tsc */ - timer_hpet.read_timer = read_timer_tsc; - } - - /* - * Math to calculate hpet to usec multiplier - * Look for the comments at get_offset_hpet() - */ - ASM_DIV64_REG(result, remain, hpet_tick, 0, KERNEL_TICK_USEC); - if (remain > (hpet_tick >> 1)) - result++; /* rounding the result */ - hpet_usec_quotient = result; - - return 0; -} - -static int hpet_resume(void) -{ - write_seqlock(&monotonic_lock); - /* Assume this is the last mark offset time */ - rdtsc(last_tsc_low, last_tsc_high); - - if (hpet_use_timer) - hpet_last = hpet_readl(HPET_T0_CMP) - hpet_tick; - else - hpet_last = hpet_readl(HPET_COUNTER); - write_sequnlock(&monotonic_lock); - return 0; -} -/************************************************************/ - -/* tsc timer_opts struct */ -static struct timer_opts timer_hpet __read_mostly = { - .name = "hpet", - .mark_offset = mark_offset_hpet, - .get_offset = get_offset_hpet, - .monotonic_clock = monotonic_clock_hpet, - .delay = delay_hpet, - .resume = hpet_resume, -}; - -struct init_timer_opts __initdata timer_hpet_init = { - .init = init_hpet, - .opts = &timer_hpet, -}; Index: linux-hres.q/arch/i386/kernel/timers/timer_none.c =================================================================== --- linux-hres.q.orig/arch/i386/kernel/timers/timer_none.c +++ /dev/null @@ -1,39 +0,0 @@ -#include -#include - -static void mark_offset_none(void) -{ - /* nothing needed */ -} - -static unsigned long get_offset_none(void) -{ - return 0; -} - -static unsigned long long monotonic_clock_none(void) -{ - return 0; -} - -static void delay_none(unsigned long loops) -{ - int d0; - __asm__ __volatile__( - "\tjmp 1f\n" - ".align 16\n" - "1:\tjmp 2f\n" - ".align 16\n" - "2:\tdecl %0\n\tjns 2b" - :"=&a" (d0) - :"0" (loops)); -} - -/* none timer_opts struct */ -struct timer_opts timer_none = { - .name = "none", - .mark_offset = mark_offset_none, - .get_offset = get_offset_none, - .monotonic_clock = monotonic_clock_none, - .delay = delay_none, -}; Index: linux-hres.q/arch/i386/kernel/timers/timer_pit.c =================================================================== --- linux-hres.q.orig/arch/i386/kernel/timers/timer_pit.c +++ /dev/null @@ -1,177 +0,0 @@ -/* - * This code largely moved from arch/i386/kernel/time.c. - * See comments there for proper credits. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "do_timer.h" -#include "io_ports.h" - -static int count_p; /* counter in get_offset_pit() */ - -static int __init init_pit(char* override) -{ - /* check clock override */ - if (override[0] && strncmp(override,"pit",3)) - printk(KERN_ERR "Warning: clock= override failed. Defaulting " - "to PIT\n"); - init_cpu_khz(); - count_p = LATCH; - return 0; -} - -static void mark_offset_pit(void) -{ - /* nothing needed */ -} - -static unsigned long long monotonic_clock_pit(void) -{ - return 0; -} - -static void delay_pit(unsigned long loops) -{ - int d0; - __asm__ __volatile__( - "\tjmp 1f\n" - ".align 16\n" - "1:\tjmp 2f\n" - ".align 16\n" - "2:\tdecl %0\n\tjns 2b" - :"=&a" (d0) - :"0" (loops)); -} - - -/* This function must be called with xtime_lock held. - * It was inspired by Steve McCanne's microtime-i386 for BSD. -- jrs - * - * However, the pc-audio speaker driver changes the divisor so that - * it gets interrupted rather more often - it loads 64 into the - * counter rather than 11932! This has an adverse impact on - * do_gettimeoffset() -- it stops working! What is also not - * good is that the interval that our timer function gets called - * is no longer 10.0002 ms, but 9.9767 ms. To get around this - * would require using a different timing source. Maybe someone - * could use the RTC - I know that this can interrupt at frequencies - * ranging from 8192Hz to 2Hz. If I had the energy, I'd somehow fix - * it so that at startup, the timer code in sched.c would select - * using either the RTC or the 8253 timer. The decision would be - * based on whether there was any other device around that needed - * to trample on the 8253. I'd set up the RTC to interrupt at 1024 Hz, - * and then do some jiggery to have a version of do_timer that - * advanced the clock by 1/1024 s. Every time that reached over 1/100 - * of a second, then do all the old code. If the time was kept correct - * then do_gettimeoffset could just return 0 - there is no low order - * divider that can be accessed. - * - * Ideally, you would be able to use the RTC for the speaker driver, - * but it appears that the speaker driver really needs interrupt more - * often than every 120 us or so. - * - * Anyway, this needs more thought.... pjsg (1993-08-28) - * - * If you are really that interested, you should be reading - * comp.protocols.time.ntp! - */ - -static unsigned long get_offset_pit(void) -{ - int count; - unsigned long flags; - static unsigned long jiffies_p = 0; - - /* - * cache volatile jiffies temporarily; we have xtime_lock. - */ - unsigned long jiffies_t; - - spin_lock_irqsave(&i8253_lock, flags); - /* timer count may underflow right here */ - outb_p(0x00, PIT_MODE); /* latch the count ASAP */ - - count = inb_p(PIT_CH0); /* read the latched count */ - - /* - * We do this guaranteed double memory access instead of a _p - * postfix in the previous port access. Wheee, hackady hack - */ - jiffies_t = jiffies; - - count |= inb_p(PIT_CH0) << 8; - - /* VIA686a test code... reset the latch if count > max + 1 */ - if (count > LATCH) { - outb_p(0x34, PIT_MODE); - outb_p(LATCH & 0xff, PIT_CH0); - outb(LATCH >> 8, PIT_CH0); - count = LATCH - 1; - } - - /* - * avoiding timer inconsistencies (they are rare, but they happen)... - * there are two kinds of problems that must be avoided here: - * 1. the timer counter underflows - * 2. hardware problem with the timer, not giving us continuous time, - * the counter does small "jumps" upwards on some Pentium systems, - * (see c't 95/10 page 335 for Neptun bug.) - */ - - if( jiffies_t == jiffies_p ) { - if( count > count_p ) { - /* the nutcase */ - count = do_timer_overflow(count); - } - } else - jiffies_p = jiffies_t; - - count_p = count; - - spin_unlock_irqrestore(&i8253_lock, flags); - - count = ((LATCH-1) - count) * TICK_SIZE; - count = (count + LATCH/2) / LATCH; - - return count; -} - - -/* tsc timer_opts struct */ -struct timer_opts timer_pit = { - .name = "pit", - .mark_offset = mark_offset_pit, - .get_offset = get_offset_pit, - .monotonic_clock = monotonic_clock_pit, - .delay = delay_pit, -}; - -struct init_timer_opts __initdata timer_pit_init = { - .init = init_pit, - .opts = &timer_pit, -}; - -void setup_pit_timer(void) -{ - unsigned long flags; - - spin_lock_irqsave(&i8253_lock, flags); - outb_p(0x34,PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ - udelay(10); - outb_p(LATCH & 0xff , PIT_CH0); /* LSB */ - udelay(10); - outb(LATCH >> 8 , PIT_CH0); /* MSB */ - spin_unlock_irqrestore(&i8253_lock, flags); -} Index: linux-hres.q/arch/i386/kernel/timers/timer_pm.c =================================================================== --- linux-hres.q.orig/arch/i386/kernel/timers/timer_pm.c +++ /dev/null @@ -1,342 +0,0 @@ -/* - * (C) Dominik Brodowski 2003 - * - * Driver to use the Power Management Timer (PMTMR) available in some - * southbridges as primary timing source for the Linux kernel. - * - * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c, - * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4. - * - * This file is licensed under the GPL v2. - */ - - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include "mach_timer.h" - -/* Number of PMTMR ticks expected during calibration run */ -#define PMTMR_TICKS_PER_SEC 3579545 -#define PMTMR_EXPECTED_RATE \ - ((CALIBRATE_LATCH * (PMTMR_TICKS_PER_SEC >> 10)) / (CLOCK_TICK_RATE>>10)) - - -/* The I/O port the PMTMR resides at. - * The location is detected during setup_arch(), - * in arch/i386/acpi/boot.c */ -u32 pmtmr_ioport = 0; - - -/* value of the Power timer at last timer interrupt */ -static u32 offset_tick; -static u32 offset_delay; - -static unsigned long long monotonic_base; -static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; - -#define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */ - -static int pmtmr_need_workaround __read_mostly = 1; - -/*helper function to safely read acpi pm timesource*/ -static inline u32 read_pmtmr(void) -{ - if (pmtmr_need_workaround) { - u32 v1, v2, v3; - - /* It has been reported that because of various broken - * chipsets (ICH4, PIIX4 and PIIX4E) where the ACPI PM time - * source is not latched, so you must read it multiple - * times to insure a safe value is read. - */ - do { - v1 = inl(pmtmr_ioport); - v2 = inl(pmtmr_ioport); - v3 = inl(pmtmr_ioport); - } while ((v1 > v2 && v1 < v3) || (v2 > v3 && v2 < v1) - || (v3 > v1 && v3 < v2)); - - /* mask the output to 24 bits */ - return v2 & ACPI_PM_MASK; - } - - return inl(pmtmr_ioport) & ACPI_PM_MASK; -} - - -/* - * Some boards have the PMTMR running way too fast. We check - * the PMTMR rate against PIT channel 2 to catch these cases. - */ -static int verify_pmtmr_rate(void) -{ - u32 value1, value2; - unsigned long count, delta; - - mach_prepare_counter(); - value1 = read_pmtmr(); - mach_countup(&count); - value2 = read_pmtmr(); - delta = (value2 - value1) & ACPI_PM_MASK; - - /* Check that the PMTMR delta is within 5% of what we expect */ - if (delta < (PMTMR_EXPECTED_RATE * 19) / 20 || - delta > (PMTMR_EXPECTED_RATE * 21) / 20) { - printk(KERN_INFO "PM-Timer running at invalid rate: %lu%% of normal - aborting.\n", 100UL * delta / PMTMR_EXPECTED_RATE); - return -1; - } - - return 0; -} - - -static int init_pmtmr(char* override) -{ - u32 value1, value2; - unsigned int i; - - if (override[0] && strncmp(override,"pmtmr",5)) - return -ENODEV; - - if (!pmtmr_ioport) - return -ENODEV; - - /* we use the TSC for delay_pmtmr, so make sure it exists */ - if (!cpu_has_tsc) - return -ENODEV; - - /* "verify" this timing source */ - value1 = read_pmtmr(); - for (i = 0; i < 10000; i++) { - value2 = read_pmtmr(); - if (value2 == value1) - continue; - if (value2 > value1) - goto pm_good; - if ((value2 < value1) && ((value2) < 0xFFF)) - goto pm_good; - printk(KERN_INFO "PM-Timer had inconsistent results: 0x%#x, 0x%#x - aborting.\n", value1, value2); - return -EINVAL; - } - printk(KERN_INFO "PM-Timer had no reasonable result: 0x%#x - aborting.\n", value1); - return -ENODEV; - -pm_good: - if (verify_pmtmr_rate() != 0) - return -ENODEV; - - init_cpu_khz(); - return 0; -} - -static inline u32 cyc2us(u32 cycles) -{ - /* The Power Management Timer ticks at 3.579545 ticks per microsecond. - * 1 / PM_TIMER_FREQUENCY == 0.27936511 =~ 286/1024 [error: 0.024%] - * - * Even with HZ = 100, delta is at maximum 35796 ticks, so it can - * easily be multiplied with 286 (=0x11E) without having to fear - * u32 overflows. - */ - cycles *= 286; - return (cycles >> 10); -} - -/* - * this gets called during each timer interrupt - * - Called while holding the writer xtime_lock - */ -static void mark_offset_pmtmr(void) -{ - u32 lost, delta, last_offset; - static int first_run = 1; - last_offset = offset_tick; - - write_seqlock(&monotonic_lock); - - offset_tick = read_pmtmr(); - - /* calculate tick interval */ - delta = (offset_tick - last_offset) & ACPI_PM_MASK; - - /* convert to usecs */ - delta = cyc2us(delta); - - /* update the monotonic base value */ - monotonic_base += delta * NSEC_PER_USEC; - write_sequnlock(&monotonic_lock); - - /* convert to ticks */ - delta += offset_delay; - lost = delta / (USEC_PER_SEC / HZ); - offset_delay = delta % (USEC_PER_SEC / HZ); - - - /* compensate for lost ticks */ - if (lost >= 2) - jiffies_64 += lost - 1; - - /* don't calculate delay for first run, - or if we've got less then a tick */ - if (first_run || (lost < 1)) { - first_run = 0; - offset_delay = 0; - } -} - -static int pmtmr_resume(void) -{ - write_seqlock(&monotonic_lock); - /* Assume this is the last mark offset time */ - offset_tick = read_pmtmr(); - write_sequnlock(&monotonic_lock); - return 0; -} - -static unsigned long long monotonic_clock_pmtmr(void) -{ - u32 last_offset, this_offset; - unsigned long long base, ret; - unsigned seq; - - - /* atomically read monotonic base & last_offset */ - do { - seq = read_seqbegin(&monotonic_lock); - last_offset = offset_tick; - base = monotonic_base; - } while (read_seqretry(&monotonic_lock, seq)); - - /* Read the pmtmr */ - this_offset = read_pmtmr(); - - /* convert to nanoseconds */ - ret = (this_offset - last_offset) & ACPI_PM_MASK; - ret = base + (cyc2us(ret) * NSEC_PER_USEC); - return ret; -} - -static void delay_pmtmr(unsigned long loops) -{ - unsigned long bclock, now; - - rdtscl(bclock); - do - { - rep_nop(); - rdtscl(now); - } while ((now-bclock) < loops); -} - - -/* - * get the offset (in microseconds) from the last call to mark_offset() - * - Called holding a reader xtime_lock - */ -static unsigned long get_offset_pmtmr(void) -{ - u32 now, offset, delta = 0; - - offset = offset_tick; - now = read_pmtmr(); - delta = (now - offset)&ACPI_PM_MASK; - - return (unsigned long) offset_delay + cyc2us(delta); -} - - -/* acpi timer_opts struct */ -static struct timer_opts timer_pmtmr = { - .name = "pmtmr", - .mark_offset = mark_offset_pmtmr, - .get_offset = get_offset_pmtmr, - .monotonic_clock = monotonic_clock_pmtmr, - .delay = delay_pmtmr, - .read_timer = read_timer_tsc, - .resume = pmtmr_resume, -}; - -struct init_timer_opts __initdata timer_pmtmr_init = { - .init = init_pmtmr, - .opts = &timer_pmtmr, -}; - -#ifdef CONFIG_PCI -/* - * PIIX4 Errata: - * - * The power management timer may return improper results when read. - * Although the timer value settles properly after incrementing, - * while incrementing there is a 3 ns window every 69.8 ns where the - * timer value is indeterminate (a 4.2% chance that the data will be - * incorrect when read). As a result, the ACPI free running count up - * timer specification is violated due to erroneous reads. - */ -static int __init pmtmr_bug_check(void) -{ - static struct pci_device_id gray_list[] __initdata = { - /* these chipsets may have bug. */ - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, - PCI_DEVICE_ID_INTEL_82801DB_0) }, - { }, - }; - struct pci_dev *dev; - int pmtmr_has_bug = 0; - u8 rev; - - if (cur_timer != &timer_pmtmr || !pmtmr_need_workaround) - return 0; - - dev = pci_get_device(PCI_VENDOR_ID_INTEL, - PCI_DEVICE_ID_INTEL_82371AB_3, NULL); - if (dev) { - pci_read_config_byte(dev, PCI_REVISION_ID, &rev); - /* the bug has been fixed in PIIX4M */ - if (rev < 3) { - printk(KERN_WARNING "* Found PM-Timer Bug on this " - "chipset. Due to workarounds for a bug,\n" - "* this time source is slow. Consider trying " - "other time sources (clock=)\n"); - pmtmr_has_bug = 1; - } - pci_dev_put(dev); - } - - if (pci_dev_present(gray_list)) { - printk(KERN_WARNING "* This chipset may have PM-Timer Bug. Due" - " to workarounds for a bug,\n" - "* this time source is slow. If you are sure your timer" - " does not have\n" - "* this bug, please use \"pmtmr_good\" to disable the " - "workaround\n"); - pmtmr_has_bug = 1; - } - - if (!pmtmr_has_bug) - pmtmr_need_workaround = 0; - - return 0; -} -device_initcall(pmtmr_bug_check); -#endif - -static int __init pmtr_good_setup(char *__str) -{ - pmtmr_need_workaround = 0; - return 1; -} -__setup("pmtmr_good", pmtr_good_setup); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Dominik Brodowski "); -MODULE_DESCRIPTION("Power Management Timer (PMTMR) as primary timing source for x86"); Index: linux-hres.q/arch/i386/kernel/timers/timer_tsc.c =================================================================== --- linux-hres.q.orig/arch/i386/kernel/timers/timer_tsc.c +++ /dev/null @@ -1,617 +0,0 @@ -/* - * This code largely moved from arch/i386/kernel/time.c. - * See comments there for proper credits. - * - * 2004-06-25 Jesper Juhl - * moved mark_offset_tsc below cpufreq_delayed_get to avoid gcc 3.4 - * failing to inline. - */ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -/* processor.h for distable_tsc flag */ -#include - -#include "io_ports.h" -#include "mach_timer.h" - -#include -#include - -#ifdef CONFIG_HPET_TIMER -static unsigned long hpet_usec_quotient; -static unsigned long hpet_last; -static struct timer_opts timer_tsc; -#endif - -static inline void cpufreq_delayed_get(void); - -int tsc_disable __devinitdata = 0; - -static int use_tsc; -/* Number of usecs that the last interrupt was delayed */ -static int delay_at_last_interrupt; - -static unsigned long last_tsc_low; /* lsb 32 bits of Time Stamp Counter */ -static unsigned long last_tsc_high; /* msb 32 bits of Time Stamp Counter */ -static unsigned long long monotonic_base; -static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; - -/* Avoid compensating for lost ticks before TSCs are synched */ -static int detect_lost_ticks; -static int __init start_lost_tick_compensation(void) -{ - detect_lost_ticks = 1; - return 0; -} -late_initcall(start_lost_tick_compensation); - -/* convert from cycles(64bits) => nanoseconds (64bits) - * basic equation: - * ns = cycles / (freq / ns_per_sec) - * ns = cycles * (ns_per_sec / freq) - * ns = cycles * (10^9 / (cpu_khz * 10^3)) - * ns = cycles * (10^6 / cpu_khz) - * - * Then we use scaling math (suggested by george@mvista.com) to get: - * ns = cycles * (10^6 * SC / cpu_khz) / SC - * ns = cycles * cyc2ns_scale / SC - * - * And since SC is a constant power of two, we can convert the div - * into a shift. - * - * We can use khz divisor instead of mhz to keep a better percision, since - * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. - * (mathieu.desnoyers@polymtl.ca) - * - * -johnstul@us.ibm.com "math is hard, lets go shopping!" - */ -static unsigned long cyc2ns_scale __read_mostly; -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ - -static inline void set_cyc2ns_scale(unsigned long cpu_khz) -{ - cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz; -} - -static inline unsigned long long cycles_2_ns(unsigned long long cyc) -{ - return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; -} - -static int count2; /* counter for mark_offset_tsc() */ - -/* Cached *multiplier* to convert TSC counts to microseconds. - * (see the equation below). - * Equal to 2^32 * (1 / (clocks per usec) ). - * Initialized in time_init. - */ -static unsigned long fast_gettimeoffset_quotient; - -static unsigned long get_offset_tsc(void) -{ - register unsigned long eax, edx; - - /* Read the Time Stamp Counter */ - - rdtsc(eax,edx); - - /* .. relative to previous jiffy (32 bits is enough) */ - eax -= last_tsc_low; /* tsc_low delta */ - - /* - * Time offset = (tsc_low delta) * fast_gettimeoffset_quotient - * = (tsc_low delta) * (usecs_per_clock) - * = (tsc_low delta) * (usecs_per_jiffy / clocks_per_jiffy) - * - * Using a mull instead of a divl saves up to 31 clock cycles - * in the critical path. - */ - - __asm__("mull %2" - :"=a" (eax), "=d" (edx) - :"rm" (fast_gettimeoffset_quotient), - "0" (eax)); - - /* our adjusted time offset in microseconds */ - return delay_at_last_interrupt + edx; -} - -static unsigned long long monotonic_clock_tsc(void) -{ - unsigned long long last_offset, this_offset, base; - unsigned seq; - - /* atomically read monotonic base & last_offset */ - do { - seq = read_seqbegin(&monotonic_lock); - last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - base = monotonic_base; - } while (read_seqretry(&monotonic_lock, seq)); - - /* Read the Time Stamp Counter */ - rdtscll(this_offset); - - /* return the value in ns */ - return base + cycles_2_ns(this_offset - last_offset); -} - -/* - * Scheduler clock - returns current time in nanosec units. - */ -unsigned long long sched_clock(void) -{ - unsigned long long this_offset; - - /* - * In the NUMA case we dont use the TSC as they are not - * synchronized across all CPUs. - */ -#ifndef CONFIG_NUMA - if (!use_tsc) -#endif - /* no locking but a rare wrong value is not a big deal */ - return jiffies_64 * (1000000000 / HZ); - - /* Read the Time Stamp Counter */ - rdtscll(this_offset); - - /* return the value in ns */ - return cycles_2_ns(this_offset); -} - -static void delay_tsc(unsigned long loops) -{ - unsigned long bclock, now; - - rdtscl(bclock); - do - { - rep_nop(); - rdtscl(now); - } while ((now-bclock) < loops); -} - -#ifdef CONFIG_HPET_TIMER -static void mark_offset_tsc_hpet(void) -{ - unsigned long long this_offset, last_offset; - unsigned long offset, temp, hpet_current; - - write_seqlock(&monotonic_lock); - last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - /* - * It is important that these two operations happen almost at - * the same time. We do the RDTSC stuff first, since it's - * faster. To avoid any inconsistencies, we need interrupts - * disabled locally. - */ - /* - * Interrupts are just disabled locally since the timer irq - * has the SA_INTERRUPT flag set. -arca - */ - /* read Pentium cycle counter */ - - hpet_current = hpet_readl(HPET_COUNTER); - rdtsc(last_tsc_low, last_tsc_high); - - /* lost tick compensation */ - offset = hpet_readl(HPET_T0_CMP) - hpet_tick; - if (unlikely(((offset - hpet_last) > hpet_tick) && (hpet_last != 0)) - && detect_lost_ticks) { - int lost_ticks = (offset - hpet_last) / hpet_tick; - jiffies_64 += lost_ticks; - } - hpet_last = hpet_current; - - /* update the monotonic base value */ - this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - monotonic_base += cycles_2_ns(this_offset - last_offset); - write_sequnlock(&monotonic_lock); - - /* calculate delay_at_last_interrupt */ - /* - * Time offset = (hpet delta) * ( usecs per HPET clock ) - * = (hpet delta) * ( usecs per tick / HPET clocks per tick) - * = (hpet delta) * ( hpet_usec_quotient ) / (2^32) - * Where, - * hpet_usec_quotient = (2^32 * usecs per tick)/HPET clocks per tick - */ - delay_at_last_interrupt = hpet_current - offset; - ASM_MUL64_REG(temp, delay_at_last_interrupt, - hpet_usec_quotient, delay_at_last_interrupt); -} -#endif - - -#ifdef CONFIG_CPU_FREQ -#include - -static unsigned int cpufreq_delayed_issched = 0; -static unsigned int cpufreq_init = 0; -static struct work_struct cpufreq_delayed_get_work; - -static void handle_cpufreq_delayed_get(void *v) -{ - unsigned int cpu; - for_each_online_cpu(cpu) { - cpufreq_get(cpu); - } - cpufreq_delayed_issched = 0; -} - -/* if we notice lost ticks, schedule a call to cpufreq_get() as it tries - * to verify the CPU frequency the timing core thinks the CPU is running - * at is still correct. - */ -static inline void cpufreq_delayed_get(void) -{ - if (cpufreq_init && !cpufreq_delayed_issched) { - cpufreq_delayed_issched = 1; - printk(KERN_DEBUG "Losing some ticks... checking if CPU frequency changed.\n"); - schedule_work(&cpufreq_delayed_get_work); - } -} - -/* If the CPU frequency is scaled, TSC-based delays will need a different - * loops_per_jiffy value to function properly. - */ - -static unsigned int ref_freq = 0; -static unsigned long loops_per_jiffy_ref = 0; - -#ifndef CONFIG_SMP -static unsigned long fast_gettimeoffset_ref = 0; -static unsigned int cpu_khz_ref = 0; -#endif - -static int -time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, - void *data) -{ - struct cpufreq_freqs *freq = data; - - if (val != CPUFREQ_RESUMECHANGE && val != CPUFREQ_SUSPENDCHANGE) - write_seqlock_irq(&xtime_lock); - if (!ref_freq) { - if (!freq->old){ - ref_freq = freq->new; - goto end; - } - ref_freq = freq->old; - loops_per_jiffy_ref = cpu_data[freq->cpu].loops_per_jiffy; -#ifndef CONFIG_SMP - fast_gettimeoffset_ref = fast_gettimeoffset_quotient; - cpu_khz_ref = cpu_khz; -#endif - } - - if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || - (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || - (val == CPUFREQ_RESUMECHANGE)) { - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) - cpu_data[freq->cpu].loops_per_jiffy = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); -#ifndef CONFIG_SMP - if (cpu_khz) - cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new); - if (use_tsc) { - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) { - fast_gettimeoffset_quotient = cpufreq_scale(fast_gettimeoffset_ref, freq->new, ref_freq); - set_cyc2ns_scale(cpu_khz); - } - } -#endif - } - -end: - if (val != CPUFREQ_RESUMECHANGE && val != CPUFREQ_SUSPENDCHANGE) - write_sequnlock_irq(&xtime_lock); - - return 0; -} - -static struct notifier_block time_cpufreq_notifier_block = { - .notifier_call = time_cpufreq_notifier -}; - - -static int __init cpufreq_tsc(void) -{ - int ret; - INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get, NULL); - ret = cpufreq_register_notifier(&time_cpufreq_notifier_block, - CPUFREQ_TRANSITION_NOTIFIER); - if (!ret) - cpufreq_init = 1; - return ret; -} -core_initcall(cpufreq_tsc); - -#else /* CONFIG_CPU_FREQ */ -static inline void cpufreq_delayed_get(void) { return; } -#endif - -int recalibrate_cpu_khz(void) -{ -#ifndef CONFIG_SMP - unsigned int cpu_khz_old = cpu_khz; - - if (cpu_has_tsc) { - local_irq_disable(); - init_cpu_khz(); - local_irq_enable(); - cpu_data[0].loops_per_jiffy = - cpufreq_scale(cpu_data[0].loops_per_jiffy, - cpu_khz_old, - cpu_khz); - return 0; - } else - return -ENODEV; -#else - return -ENODEV; -#endif -} -EXPORT_SYMBOL(recalibrate_cpu_khz); - -static void mark_offset_tsc(void) -{ - unsigned long lost,delay; - unsigned long delta = last_tsc_low; - int count; - int countmp; - static int count1 = 0; - unsigned long long this_offset, last_offset; - static int lost_count = 0; - - write_seqlock(&monotonic_lock); - last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - /* - * It is important that these two operations happen almost at - * the same time. We do the RDTSC stuff first, since it's - * faster. To avoid any inconsistencies, we need interrupts - * disabled locally. - */ - - /* - * Interrupts are just disabled locally since the timer irq - * has the SA_INTERRUPT flag set. -arca - */ - - /* read Pentium cycle counter */ - - rdtsc(last_tsc_low, last_tsc_high); - - spin_lock(&i8253_lock); - outb_p(0x00, PIT_MODE); /* latch the count ASAP */ - - count = inb_p(PIT_CH0); /* read the latched count */ - count |= inb(PIT_CH0) << 8; - - /* - * VIA686a test code... reset the latch if count > max + 1 - * from timer_pit.c - cjb - */ - if (count > LATCH) { - outb_p(0x34, PIT_MODE); - outb_p(LATCH & 0xff, PIT_CH0); - outb(LATCH >> 8, PIT_CH0); - count = LATCH - 1; - } - - spin_unlock(&i8253_lock); - - if (pit_latch_buggy) { - /* get center value of last 3 time lutch */ - if ((count2 >= count && count >= count1) - || (count1 >= count && count >= count2)) { - count2 = count1; count1 = count; - } else if ((count1 >= count2 && count2 >= count) - || (count >= count2 && count2 >= count1)) { - countmp = count;count = count2; - count2 = count1;count1 = countmp; - } else { - count2 = count1; count1 = count; count = count1; - } - } - - /* lost tick compensation */ - delta = last_tsc_low - delta; - { - register unsigned long eax, edx; - eax = delta; - __asm__("mull %2" - :"=a" (eax), "=d" (edx) - :"rm" (fast_gettimeoffset_quotient), - "0" (eax)); - delta = edx; - } - delta += delay_at_last_interrupt; - lost = delta/(1000000/HZ); - delay = delta%(1000000/HZ); - if (lost >= 2 && detect_lost_ticks) { - jiffies_64 += lost-1; - - /* sanity check to ensure we're not always losing ticks */ - if (lost_count++ > 100) { - printk(KERN_WARNING "Losing too many ticks!\n"); - printk(KERN_WARNING "TSC cannot be used as a timesource. \n"); - printk(KERN_WARNING "Possible reasons for this are:\n"); - printk(KERN_WARNING " You're running with Speedstep,\n"); - printk(KERN_WARNING " You don't have DMA enabled for your hard disk (see hdparm),\n"); - printk(KERN_WARNING " Incorrect TSC synchronization on an SMP system (see dmesg).\n"); - printk(KERN_WARNING "Falling back to a sane timesource now.\n"); - - clock_fallback(); - } - /* ... but give the TSC a fair chance */ - if (lost_count > 25) - cpufreq_delayed_get(); - } else - lost_count = 0; - /* update the monotonic base value */ - this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - monotonic_base += cycles_2_ns(this_offset - last_offset); - write_sequnlock(&monotonic_lock); - - /* calculate delay_at_last_interrupt */ - count = ((LATCH-1) - count) * TICK_SIZE; - delay_at_last_interrupt = (count + LATCH/2) / LATCH; - - /* catch corner case where tick rollover occured - * between tsc and pit reads (as noted when - * usec delta is > 90% # of usecs/tick) - */ - if (lost && abs(delay - delay_at_last_interrupt) > (900000/HZ)) - jiffies_64++; -} - -static int __init init_tsc(char* override) -{ - - /* check clock override */ - if (override[0] && strncmp(override,"tsc",3)) { -#ifdef CONFIG_HPET_TIMER - if (is_hpet_enabled()) { - printk(KERN_ERR "Warning: clock= override failed. Defaulting to tsc\n"); - } else -#endif - { - return -ENODEV; - } - } - - /* - * If we have APM enabled or the CPU clock speed is variable - * (CPU stops clock on HLT or slows clock to save power) - * then the TSC timestamps may diverge by up to 1 jiffy from - * 'real time' but nothing will break. - * The most frequent case is that the CPU is "woken" from a halt - * state by the timer interrupt itself, so we get 0 error. In the - * rare cases where a driver would "wake" the CPU and request a - * timestamp, the maximum error is < 1 jiffy. But timestamps are - * still perfectly ordered. - * Note that the TSC counter will be reset if APM suspends - * to disk; this won't break the kernel, though, 'cuz we're - * smart. See arch/i386/kernel/apm.c. - */ - /* - * Firstly we have to do a CPU check for chips with - * a potentially buggy TSC. At this point we haven't run - * the ident/bugs checks so we must run this hook as it - * may turn off the TSC flag. - * - * NOTE: this doesn't yet handle SMP 486 machines where only - * some CPU's have a TSC. Thats never worked and nobody has - * moaned if you have the only one in the world - you fix it! - */ - - count2 = LATCH; /* initialize counter for mark_offset_tsc() */ - - if (cpu_has_tsc) { - unsigned long tsc_quotient; -#ifdef CONFIG_HPET_TIMER - if (is_hpet_enabled() && hpet_use_timer) { - unsigned long result, remain; - printk("Using TSC for gettimeofday\n"); - tsc_quotient = calibrate_tsc_hpet(NULL); - timer_tsc.mark_offset = &mark_offset_tsc_hpet; - /* - * Math to calculate hpet to usec multiplier - * Look for the comments at get_offset_tsc_hpet() - */ - ASM_DIV64_REG(result, remain, hpet_tick, - 0, KERNEL_TICK_USEC); - if (remain > (hpet_tick >> 1)) - result++; /* rounding the result */ - - hpet_usec_quotient = result; - } else -#endif - { - tsc_quotient = calibrate_tsc(); - } - - if (tsc_quotient) { - fast_gettimeoffset_quotient = tsc_quotient; - use_tsc = 1; - /* - * We could be more selective here I suspect - * and just enable this for the next intel chips ? - */ - /* report CPU clock rate in Hz. - * The formula is (10^6 * 2^32) / (2^32 * 1 / (clocks/us)) = - * clock/second. Our precision is about 100 ppm. - */ - { unsigned long eax=0, edx=1000; - __asm__("divl %2" - :"=a" (cpu_khz), "=d" (edx) - :"r" (tsc_quotient), - "0" (eax), "1" (edx)); - printk("Detected %u.%03u MHz processor.\n", - cpu_khz / 1000, cpu_khz % 1000); - } - set_cyc2ns_scale(cpu_khz); - return 0; - } - } - return -ENODEV; -} - -static int tsc_resume(void) -{ - write_seqlock(&monotonic_lock); - /* Assume this is the last mark offset time */ - rdtsc(last_tsc_low, last_tsc_high); -#ifdef CONFIG_HPET_TIMER - if (is_hpet_enabled() && hpet_use_timer) - hpet_last = hpet_readl(HPET_COUNTER); -#endif - write_sequnlock(&monotonic_lock); - return 0; -} - -#ifndef CONFIG_X86_TSC -/* disable flag for tsc. Takes effect by clearing the TSC cpu flag - * in cpu/common.c */ -static int __init tsc_setup(char *str) -{ - tsc_disable = 1; - return 1; -} -#else -static int __init tsc_setup(char *str) -{ - printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, " - "cannot disable TSC.\n"); - return 1; -} -#endif -__setup("notsc", tsc_setup); - - - -/************************************************************/ - -/* tsc timer_opts struct */ -static struct timer_opts timer_tsc = { - .name = "tsc", - .mark_offset = mark_offset_tsc, - .get_offset = get_offset_tsc, - .monotonic_clock = monotonic_clock_tsc, - .delay = delay_tsc, - .read_timer = read_timer_tsc, - .resume = tsc_resume, -}; - -struct init_timer_opts __initdata timer_tsc_init = { - .init = init_tsc, - .opts = &timer_tsc, -}; Index: linux-hres.q/arch/i386/kernel/tsc.c =================================================================== --- /dev/null +++ linux-hres.q/arch/i386/kernel/tsc.c @@ -0,0 +1,478 @@ +/* + * This code largely moved from arch/i386/kernel/timer/timer_tsc.c + * which was originally moved from arch/i386/kernel/time.c. + * See comments there for proper credits. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "mach_timer.h" + +/* + * On some systems the TSC frequency does not + * change with the cpu frequency. So we need + * an extra value to store the TSC freq + */ +unsigned int tsc_khz; + +int tsc_disable __cpuinitdata = 0; + +#ifdef CONFIG_X86_TSC +static int __init tsc_setup(char *str) +{ + printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, " + "cannot disable TSC.\n"); + return 1; +} +#else +/* + * disable flag for tsc. Takes effect by clearing the TSC cpu flag + * in cpu/common.c + */ +static int __init tsc_setup(char *str) +{ + tsc_disable = 1; + + return 1; +} +#endif + +__setup("notsc", tsc_setup); + +/* + * code to mark and check if the TSC is unstable + * due to cpufreq or due to unsynced TSCs + */ +static int tsc_unstable; + +static inline int check_tsc_unstable(void) +{ + return tsc_unstable; +} + +void mark_tsc_unstable(void) +{ + tsc_unstable = 1; +} +EXPORT_SYMBOL_GPL(mark_tsc_unstable); + +/* Accellerators for sched_clock() + * convert from cycles(64bits) => nanoseconds (64bits) + * basic equation: + * ns = cycles / (freq / ns_per_sec) + * ns = cycles * (ns_per_sec / freq) + * ns = cycles * (10^9 / (cpu_khz * 10^3)) + * ns = cycles * (10^6 / cpu_khz) + * + * Then we use scaling math (suggested by george@mvista.com) to get: + * ns = cycles * (10^6 * SC / cpu_khz) / SC + * ns = cycles * cyc2ns_scale / SC + * + * And since SC is a constant power of two, we can convert the div + * into a shift. + * + * We can use khz divisor instead of mhz to keep a better percision, since + * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. + * (mathieu.desnoyers@polymtl.ca) + * + * -johnstul@us.ibm.com "math is hard, lets go shopping!" + */ +static unsigned long cyc2ns_scale __read_mostly; + +#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ + +static inline void set_cyc2ns_scale(unsigned long cpu_khz) +{ + cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz; +} + +static inline unsigned long long cycles_2_ns(unsigned long long cyc) +{ + return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; +} + +/* + * Scheduler clock - returns current time in nanosec units. + */ +unsigned long long sched_clock(void) +{ + unsigned long long this_offset; + + /* + * in the NUMA case we dont use the TSC as they are not + * synchronized across all CPUs. + */ +#ifndef CONFIG_NUMA + if (!cpu_khz || check_tsc_unstable()) +#endif + /* no locking but a rare wrong value is not a big deal */ + return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); + + /* read the Time Stamp Counter: */ + rdtscll(this_offset); + + /* return the value in ns */ + return cycles_2_ns(this_offset); +} + +static unsigned long calculate_cpu_khz(void) +{ + unsigned long long start, end; + unsigned long count; + u64 delta64; + int i; + unsigned long flags; + + local_irq_save(flags); + + /* run 3 times to ensure the cache is warm */ + for (i = 0; i < 3; i++) { + mach_prepare_counter(); + rdtscll(start); + mach_countup(&count); + rdtscll(end); + } + /* + * Error: ECTCNEVERSET + * The CTC wasn't reliable: we got a hit on the very first read, + * or the CPU was so fast/slow that the quotient wouldn't fit in + * 32 bits.. + */ + if (count <= 1) + goto err; + + delta64 = end - start; + + /* cpu freq too fast: */ + if (delta64 > (1ULL<<32)) + goto err; + + /* cpu freq too slow: */ + if (delta64 <= CALIBRATE_TIME_MSEC) + goto err; + + delta64 += CALIBRATE_TIME_MSEC/2; /* round for do_div */ + do_div(delta64,CALIBRATE_TIME_MSEC); + + local_irq_restore(flags); + return (unsigned long)delta64; +err: + local_irq_restore(flags); + return 0; +} + +int recalibrate_cpu_khz(void) +{ +#ifndef CONFIG_SMP + unsigned long cpu_khz_old = cpu_khz; + + if (cpu_has_tsc) { + cpu_khz = calculate_cpu_khz(); + tsc_khz = cpu_khz; + cpu_data[0].loops_per_jiffy = + cpufreq_scale(cpu_data[0].loops_per_jiffy, + cpu_khz_old, cpu_khz); + return 0; + } else + return -ENODEV; +#else + return -ENODEV; +#endif +} + +EXPORT_SYMBOL(recalibrate_cpu_khz); + +void tsc_init(void) +{ + if (!cpu_has_tsc || tsc_disable) + return; + + cpu_khz = calculate_cpu_khz(); + tsc_khz = cpu_khz; + + if (!cpu_khz) + return; + + printk("Detected %lu.%03lu MHz processor.\n", + (unsigned long)cpu_khz / 1000, + (unsigned long)cpu_khz % 1000); + + set_cyc2ns_scale(cpu_khz); + use_tsc_delay(); +} + +#ifdef CONFIG_CPU_FREQ + +static unsigned int cpufreq_delayed_issched = 0; +static unsigned int cpufreq_init = 0; +static struct work_struct cpufreq_delayed_get_work; + +static void handle_cpufreq_delayed_get(void *v) +{ + unsigned int cpu; + + for_each_online_cpu(cpu) + cpufreq_get(cpu); + + cpufreq_delayed_issched = 0; +} + +/* + * if we notice cpufreq oddness, schedule a call to cpufreq_get() as it tries + * to verify the CPU frequency the timing core thinks the CPU is running + * at is still correct. + */ +static inline void cpufreq_delayed_get(void) +{ + if (cpufreq_init && !cpufreq_delayed_issched) { + cpufreq_delayed_issched = 1; + printk(KERN_DEBUG "Checking if CPU frequency changed.\n"); + schedule_work(&cpufreq_delayed_get_work); + } +} + +/* + * if the CPU frequency is scaled, TSC-based delays will need a different + * loops_per_jiffy value to function properly. + */ +static unsigned int ref_freq = 0; +static unsigned long loops_per_jiffy_ref = 0; +static unsigned long cpu_khz_ref = 0; + +static int +time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) +{ + struct cpufreq_freqs *freq = data; + + if (val != CPUFREQ_RESUMECHANGE && val != CPUFREQ_SUSPENDCHANGE) + write_seqlock_irq(&xtime_lock); + + if (!ref_freq) { + if (!freq->old){ + ref_freq = freq->new; + goto end; + } + ref_freq = freq->old; + loops_per_jiffy_ref = cpu_data[freq->cpu].loops_per_jiffy; + cpu_khz_ref = cpu_khz; + } + + if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || + (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || + (val == CPUFREQ_RESUMECHANGE)) { + if (!(freq->flags & CPUFREQ_CONST_LOOPS)) + cpu_data[freq->cpu].loops_per_jiffy = + cpufreq_scale(loops_per_jiffy_ref, + ref_freq, freq->new); + + if (cpu_khz) { + + if (num_online_cpus() == 1) + cpu_khz = cpufreq_scale(cpu_khz_ref, + ref_freq, freq->new); + if (!(freq->flags & CPUFREQ_CONST_LOOPS)) { + tsc_khz = cpu_khz; + set_cyc2ns_scale(cpu_khz); + /* + * TSC based sched_clock turns + * to junk w/ cpufreq + */ + mark_tsc_unstable(); + } + } + } +end: + if (val != CPUFREQ_RESUMECHANGE && val != CPUFREQ_SUSPENDCHANGE) + write_sequnlock_irq(&xtime_lock); + + return 0; +} + +static struct notifier_block time_cpufreq_notifier_block = { + .notifier_call = time_cpufreq_notifier +}; + +static int __init cpufreq_tsc(void) +{ + int ret; + + INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get, NULL); + ret = cpufreq_register_notifier(&time_cpufreq_notifier_block, + CPUFREQ_TRANSITION_NOTIFIER); + if (!ret) + cpufreq_init = 1; + + return ret; +} + +core_initcall(cpufreq_tsc); + +#endif + +/* clock source code */ + +static unsigned long current_tsc_khz = 0; +static int tsc_update_callback(void); + +static cycle_t read_tsc(void) +{ + cycle_t ret; + + rdtscll(ret); + + return ret; +} + +static struct clocksource clocksource_tsc = { + .name = "tsc", + .rating = 300, + .read = read_tsc, + .mask = CLOCKSOURCE_MASK(64), + .mult = 0, /* to be set */ + .shift = 22, + .update_callback = tsc_update_callback, + .is_continuous = 1, +}; + +static int tsc_update_callback(void) +{ + int change = 0; + + /* check to see if we should switch to the safe clocksource: */ + if (clocksource_tsc.rating != 50 && check_tsc_unstable()) { + clocksource_tsc.rating = 50; + clocksource_reselect(); + change = 1; + } + + /* only update if tsc_khz has changed: */ + if (current_tsc_khz != tsc_khz) { + current_tsc_khz = tsc_khz; + clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz, + clocksource_tsc.shift); + change = 1; + } + + return change; +} + +static int __init dmi_mark_tsc_unstable(struct dmi_system_id *d) +{ + printk(KERN_NOTICE "%s detected: marking TSC unstable.\n", + d->ident); + mark_tsc_unstable(); + return 0; +} + +/* List of systems that have known TSC problems */ +static struct dmi_system_id __initdata bad_tsc_dmi_table[] = { + { + .callback = dmi_mark_tsc_unstable, + .ident = "IBM Thinkpad 380XD", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), + DMI_MATCH(DMI_BOARD_NAME, "2635FA0"), + }, + }, + {} +}; + +#define TSC_FREQ_CHECK_INTERVAL (10*MSEC_PER_SEC) /* 10sec in MS */ +static struct timer_list verify_tsc_freq_timer; + +/* XXX - Probably should add locking */ +static void verify_tsc_freq(unsigned long unused) +{ + static u64 last_tsc; + static unsigned long last_jiffies; + + u64 now_tsc, interval_tsc; + unsigned long now_jiffies, interval_jiffies; + + + if (check_tsc_unstable()) + return; + + rdtscll(now_tsc); + now_jiffies = jiffies; + + if (!last_jiffies) { + goto out; + } + + interval_jiffies = now_jiffies - last_jiffies; + interval_tsc = now_tsc - last_tsc; + interval_tsc *= HZ; + do_div(interval_tsc, cpu_khz*1000); + + if (interval_tsc < (interval_jiffies * 3 / 4)) { + printk("TSC appears to be running slowly. " + "Marking it as unstable\n"); + mark_tsc_unstable(); + return; + } + +out: + last_tsc = now_tsc; + last_jiffies = now_jiffies; + /* set us up to go off on the next interval: */ + mod_timer(&verify_tsc_freq_timer, + jiffies + msecs_to_jiffies(TSC_FREQ_CHECK_INTERVAL)); +} + +/* + * Make an educated guess if the TSC is trustworthy and synchronized + * over all CPUs. + */ +static __init int unsynchronized_tsc(void) +{ + /* + * Intel systems are normally all synchronized. + * Exceptions must mark TSC as unstable: + */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + return 0; + + /* assume multi socket systems are not synchronized: */ + return num_possible_cpus() > 1; +} + +static int __init init_tsc_clocksource(void) +{ + + if (cpu_has_tsc && tsc_khz && !tsc_disable) { + /* check blacklist */ + dmi_check_system(bad_tsc_dmi_table); + + if (unsynchronized_tsc()) /* mark unstable if unsynced */ + mark_tsc_unstable(); + current_tsc_khz = tsc_khz; + clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz, + clocksource_tsc.shift); + /* lower the rating if we already know its unstable: */ + if (check_tsc_unstable()) + clocksource_tsc.rating = 50; + + init_timer(&verify_tsc_freq_timer); + verify_tsc_freq_timer.function = verify_tsc_freq; + verify_tsc_freq_timer.expires = + jiffies + msecs_to_jiffies(TSC_FREQ_CHECK_INTERVAL); + add_timer(&verify_tsc_freq_timer); + + return clocksource_register(&clocksource_tsc); + } + + return 0; +} + +module_init(init_tsc_clocksource); Index: linux-hres.q/arch/i386/lib/delay.c =================================================================== --- linux-hres.q.orig/arch/i386/lib/delay.c +++ linux-hres.q/arch/i386/lib/delay.c @@ -10,43 +10,92 @@ * we have to worry about. */ +#include #include #include #include -#include + #include #include #include #ifdef CONFIG_SMP -#include +# include #endif -extern struct timer_opts* timer; +/* simple loop based delay: */ +static void delay_loop(unsigned long loops) +{ + int d0; + + __asm__ __volatile__( + "\tjmp 1f\n" + ".align 16\n" + "1:\tjmp 2f\n" + ".align 16\n" + "2:\tdecl %0\n\tjns 2b" + :"=&a" (d0) + :"0" (loops)); +} + +/* TSC based delay: */ +static void delay_tsc(unsigned long loops) +{ + unsigned long bclock, now; + + rdtscl(bclock); + do { + rep_nop(); + rdtscl(now); + } while ((now-bclock) < loops); +} + +/* + * Since we calibrate only once at boot, this + * function should be set once at boot and not changed + */ +static void (*delay_fn)(unsigned long) = delay_loop; + +void use_tsc_delay(void) +{ + delay_fn = delay_tsc; +} + +int read_current_timer(unsigned long *timer_val) +{ + if (delay_fn == delay_tsc) { + rdtscl(*timer_val); + return 0; + } + return -1; +} void __delay(unsigned long loops) { - cur_timer->delay(loops); + delay_fn(loops); } inline void __const_udelay(unsigned long xloops) { int d0; + xloops *= 4; __asm__("mull %0" :"=d" (xloops), "=&a" (d0) - :"1" (xloops),"0" (cpu_data[raw_smp_processor_id()].loops_per_jiffy * (HZ/4))); - __delay(++xloops); + :"1" (xloops), "0" + (cpu_data[raw_smp_processor_id()].loops_per_jiffy * (HZ/4))); + + __delay(++xloops); } void __udelay(unsigned long usecs) { - __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */ + __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */ } void __ndelay(unsigned long nsecs) { - __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */ + __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */ } EXPORT_SYMBOL(__delay); Index: linux-hres.q/arch/ia64/kernel/process.c =================================================================== --- linux-hres.q.orig/arch/ia64/kernel/process.c +++ linux-hres.q/arch/ia64/kernel/process.c @@ -272,9 +272,9 @@ cpu_idle (void) /* endless idle loop with no priority at all */ while (1) { if (can_do_pal_halt) - clear_thread_flag(TIF_POLLING_NRFLAG); + current_thread_info()->status &= ~TS_POLLING; else - set_thread_flag(TIF_POLLING_NRFLAG); + current_thread_info()->status |= TS_POLLING; if (!need_resched()) { void (*idle)(void); Index: linux-hres.q/arch/powerpc/kernel/time.c =================================================================== --- linux-hres.q.orig/arch/powerpc/kernel/time.c +++ linux-hres.q/arch/powerpc/kernel/time.c @@ -535,7 +535,7 @@ static __inline__ void timer_recalc_offs if (__USE_RTC()) return; - tlen = current_tick_length(); + tlen = current_tick_length(SHIFT_SCALE - 10); offset = cur_tb - do_gtod.varp->tb_orig_stamp; if (tlen == last_tick_len && offset < 0x80000000u) return; Index: linux-hres.q/arch/x86_64/kernel/pmtimer.c =================================================================== --- linux-hres.q.orig/arch/x86_64/kernel/pmtimer.c +++ linux-hres.q/arch/x86_64/kernel/pmtimer.c @@ -27,7 +27,7 @@ /* The I/O port the PMTMR resides at. * The location is detected during setup_arch(), * in arch/i386/kernel/acpi/boot.c */ -u32 pmtmr_ioport; +u32 pmtmr_ioport __read_mostly; /* value of the Power timer at last timer interrupt */ static u32 offset_delay; Index: linux-hres.q/arch/x86_64/kernel/process.c =================================================================== --- linux-hres.q.orig/arch/x86_64/kernel/process.c +++ linux-hres.q/arch/x86_64/kernel/process.c @@ -111,7 +111,7 @@ static void default_idle(void) { local_irq_enable(); - clear_thread_flag(TIF_POLLING_NRFLAG); + current_thread_info()->status &= ~TS_POLLING; smp_mb__after_clear_bit(); while (!need_resched()) { local_irq_disable(); @@ -120,7 +120,7 @@ static void default_idle(void) else local_irq_enable(); } - set_thread_flag(TIF_POLLING_NRFLAG); + current_thread_info()->status |= TS_POLLING; } /* @@ -203,8 +203,7 @@ static inline void play_dead(void) */ void cpu_idle (void) { - set_thread_flag(TIF_POLLING_NRFLAG); - + current_thread_info()->status |= TS_POLLING; /* endless idle loop with no priority at all */ while (1) { while (!need_resched()) { Index: linux-hres.q/drivers/Makefile =================================================================== --- linux-hres.q.orig/drivers/Makefile +++ linux-hres.q/drivers/Makefile @@ -58,6 +58,7 @@ obj-$(CONFIG_RTC_LIB) += rtc/ obj-$(CONFIG_I2C) += i2c/ obj-$(CONFIG_W1) += w1/ obj-$(CONFIG_HWMON) += hwmon/ +obj-$(CONFIG_GENERIC_TIME) += clocksource/ obj-$(CONFIG_PHONE) += telephony/ obj-$(CONFIG_MD) += md/ obj-$(CONFIG_BT) += bluetooth/ Index: linux-hres.q/drivers/acpi/processor_idle.c =================================================================== --- linux-hres.q.orig/drivers/acpi/processor_idle.c +++ linux-hres.q/drivers/acpi/processor_idle.c @@ -206,11 +206,11 @@ acpi_processor_power_activate(struct acp static void acpi_safe_halt(void) { - clear_thread_flag(TIF_POLLING_NRFLAG); + current_thread_info()->status &= ~TS_POLLING; smp_mb__after_clear_bit(); if (!need_resched()) safe_halt(); - set_thread_flag(TIF_POLLING_NRFLAG); + current_thread_info()->status |= TS_POLLING; } static atomic_t c3_cpu_count; @@ -330,10 +330,10 @@ static void acpi_processor_idle(void) * Invoke the current Cx state to put the processor to sleep. */ if (cx->type == ACPI_STATE_C2 || cx->type == ACPI_STATE_C3) { - clear_thread_flag(TIF_POLLING_NRFLAG); + current_thread_info()->status &= ~TS_POLLING; smp_mb__after_clear_bit(); if (need_resched()) { - set_thread_flag(TIF_POLLING_NRFLAG); + current_thread_info()->status |= TS_POLLING; local_irq_enable(); return; } @@ -369,9 +369,14 @@ static void acpi_processor_idle(void) t2 = inl(acpi_fadt.xpm_tmr_blk.address); /* Get end time (ticks) */ t2 = inl(acpi_fadt.xpm_tmr_blk.address); + +#ifdef CONFIG_GENERIC_TIME + /* TSC halts in C2, so notify users */ + mark_tsc_unstable(); +#endif /* Re-enable interrupts */ local_irq_enable(); - set_thread_flag(TIF_POLLING_NRFLAG); + current_thread_info()->status |= TS_POLLING; /* Compute time (ticks) that we were actually asleep */ sleep_ticks = ticks_elapsed(t1, t2) - cx->latency_ticks - C2_OVERHEAD; @@ -409,9 +414,13 @@ static void acpi_processor_idle(void) ACPI_MTX_DO_NOT_LOCK); } +#ifdef CONFIG_GENERIC_TIME + /* TSC halts in C3, so notify users */ + mark_tsc_unstable(); +#endif /* Re-enable interrupts */ local_irq_enable(); - set_thread_flag(TIF_POLLING_NRFLAG); + current_thread_info()->status |= TS_POLLING; /* Compute time (ticks) that we were actually asleep */ sleep_ticks = ticks_elapsed(t1, t2) - cx->latency_ticks - C3_OVERHEAD; Index: linux-hres.q/drivers/char/hangcheck-timer.c =================================================================== --- linux-hres.q.orig/drivers/char/hangcheck-timer.c +++ linux-hres.q/drivers/char/hangcheck-timer.c @@ -117,12 +117,12 @@ __setup("hcheck_reboot", hangcheck_parse __setup("hcheck_dump_tasks", hangcheck_parse_dump_tasks); #endif /* not MODULE */ -#if defined(CONFIG_X86) || defined(CONFIG_S390) +#if defined(CONFIG_X86_64) || defined(CONFIG_S390) # define HAVE_MONOTONIC # define TIMER_FREQ 1000000000ULL #elif defined(CONFIG_IA64) # define TIMER_FREQ ((unsigned long long)local_cpu_data->itc_freq) -#elif defined(CONFIG_PPC64) +#else # define TIMER_FREQ (HZ*loops_per_jiffy) #endif @@ -133,6 +133,8 @@ static inline unsigned long long monoton { return get_cycles(); } + +EXPORT_SYMBOL(monotonic_clock); #endif /* HAVE_MONOTONIC */ Index: linux-hres.q/drivers/char/ipmi/ipmi_si_intf.c =================================================================== --- linux-hres.q.orig/drivers/char/ipmi/ipmi_si_intf.c +++ linux-hres.q/drivers/char/ipmi/ipmi_si_intf.c @@ -55,23 +55,6 @@ #include #include #include -#ifdef CONFIG_HIGH_RES_TIMERS -#include -# if defined(schedule_next_int) -/* Old high-res timer code, do translations. */ -# define get_arch_cycles(a) quick_update_jiffies_sub(a) -# define arch_cycles_per_jiffy cycles_per_jiffies -# endif -static inline void add_usec_to_timer(struct timer_list *t, long v) -{ - t->arch_cycle_expires += nsec_to_arch_cycle(v * 1000); - while (t->arch_cycle_expires >= arch_cycles_per_jiffy) - { - t->expires++; - t->arch_cycle_expires -= arch_cycles_per_jiffy; - } -} -#endif #include #include #include @@ -836,32 +819,6 @@ static int initialized = 0; /* Must be called with interrupts off and with the si_lock held. */ static void si_restart_short_timer(struct smi_info *smi_info) { -#if defined(CONFIG_HIGH_RES_TIMERS) - unsigned long flags; - unsigned long jiffies_now; - unsigned long seq; - - if (del_timer(&(smi_info->si_timer))) { - /* If we don't delete the timer, then it will go off - immediately, anyway. So we only process if we - actually delete the timer. */ - - do { - seq = read_seqbegin_irqsave(&xtime_lock, flags); - jiffies_now = jiffies; - smi_info->si_timer.expires = jiffies_now; - smi_info->si_timer.arch_cycle_expires - = get_arch_cycles(jiffies_now); - } while (read_seqretry_irqrestore(&xtime_lock, seq, flags)); - - add_usec_to_timer(&smi_info->si_timer, SI_SHORT_TIMEOUT_USEC); - - add_timer(&(smi_info->si_timer)); - spin_lock_irqsave(&smi_info->count_lock, flags); - smi_info->timeout_restarts++; - spin_unlock_irqrestore(&smi_info->count_lock, flags); - } -#endif } static void smi_timeout(unsigned long data) @@ -904,31 +861,15 @@ static void smi_timeout(unsigned long da /* If the state machine asks for a short delay, then shorten the timer timeout. */ if (smi_result == SI_SM_CALL_WITH_DELAY) { -#if defined(CONFIG_HIGH_RES_TIMERS) - unsigned long seq; -#endif spin_lock_irqsave(&smi_info->count_lock, flags); smi_info->short_timeouts++; spin_unlock_irqrestore(&smi_info->count_lock, flags); -#if defined(CONFIG_HIGH_RES_TIMERS) - do { - seq = read_seqbegin_irqsave(&xtime_lock, flags); - smi_info->si_timer.expires = jiffies; - smi_info->si_timer.arch_cycle_expires - = get_arch_cycles(smi_info->si_timer.expires); - } while (read_seqretry_irqrestore(&xtime_lock, seq, flags)); - add_usec_to_timer(&smi_info->si_timer, SI_SHORT_TIMEOUT_USEC); -#else smi_info->si_timer.expires = jiffies + 1; -#endif } else { spin_lock_irqsave(&smi_info->count_lock, flags); smi_info->long_timeouts++; spin_unlock_irqrestore(&smi_info->count_lock, flags); smi_info->si_timer.expires = jiffies + SI_TIMEOUT_JIFFIES; -#if defined(CONFIG_HIGH_RES_TIMERS) - smi_info->si_timer.arch_cycle_expires = 0; -#endif } do_add_timer: Index: linux-hres.q/drivers/clocksource/Makefile =================================================================== --- /dev/null +++ linux-hres.q/drivers/clocksource/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_X86_CYCLONE_TIMER) += cyclone.o +obj-$(CONFIG_X86_PM_TIMER) += acpi_pm.o Index: linux-hres.q/drivers/clocksource/acpi_pm.c =================================================================== --- /dev/null +++ linux-hres.q/drivers/clocksource/acpi_pm.c @@ -0,0 +1,177 @@ +/* + * linux/drivers/clocksource/acpi_pm.c + * + * This file contains the ACPI PM based clocksource. + * + * This code was largely moved from the i386 timer_pm.c file + * which was (C) Dominik Brodowski 2003 + * and contained the following comments: + * + * Driver to use the Power Management Timer (PMTMR) available in some + * southbridges as primary timing source for the Linux kernel. + * + * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c, + * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4. + * + * This file is licensed under the GPL v2. + */ + +#include +#include +#include +#include +#include + +/* Number of PMTMR ticks expected during calibration run */ +#define PMTMR_TICKS_PER_SEC 3579545 + +/* + * The I/O port the PMTMR resides at. + * The location is detected during setup_arch(), + * in arch/i386/acpi/boot.c + */ +u32 pmtmr_ioport __read_mostly; + +#define ACPI_PM_MASK CLOCKSOURCE_MASK(24) /* limit it to 24 bits */ + +static inline u32 read_pmtmr(void) +{ + /* mask the output to 24 bits */ + return inl(pmtmr_ioport) & ACPI_PM_MASK; +} + +static cycle_t acpi_pm_read_verified(void) +{ + u32 v1 = 0, v2 = 0, v3 = 0; + + /* + * It has been reported that because of various broken + * chipsets (ICH4, PIIX4 and PIIX4E) where the ACPI PM clock + * source is not latched, you must read it multiple + * times to ensure a safe value is read: + */ + do { + v1 = read_pmtmr(); + v2 = read_pmtmr(); + v3 = read_pmtmr(); + } while ((v1 > v2 && v1 < v3) || (v2 > v3 && v2 < v1) + || (v3 > v1 && v3 < v2)); + + return (cycle_t)v2; +} + +static cycle_t acpi_pm_read(void) +{ + return (cycle_t)read_pmtmr(); +} + +static struct clocksource clocksource_acpi_pm = { + .name = "acpi_pm", + .rating = 200, + .read = acpi_pm_read, + .mask = (cycle_t)ACPI_PM_MASK, + .mult = 0, /*to be caluclated*/ + .shift = 22, + .is_continuous = 1, +}; + + +#ifdef CONFIG_PCI +static int acpi_pm_good; +static int __init acpi_pm_good_setup(char *__str) +{ + acpi_pm_good = 1; + return 1; +} +__setup("acpi_pm_good", acpi_pm_good_setup); + +static inline void acpi_pm_need_workaround(void) +{ + clocksource_acpi_pm.read = acpi_pm_read_verified; + clocksource_acpi_pm.rating = 110; +} + +/* + * PIIX4 Errata: + * + * The power management timer may return improper results when read. + * Although the timer value settles properly after incrementing, + * while incrementing there is a 3 ns window every 69.8 ns where the + * timer value is indeterminate (a 4.2% chance that the data will be + * incorrect when read). As a result, the ACPI free running count up + * timer specification is violated due to erroneous reads. + */ +static void __devinit acpi_pm_check_blacklist(struct pci_dev *dev) +{ + u8 rev; + + if (acpi_pm_good) + return; + + pci_read_config_byte(dev, PCI_REVISION_ID, &rev); + /* the bug has been fixed in PIIX4M */ + if (rev < 3) { + printk(KERN_WARNING "* Found PM-Timer Bug on the chipset." + " Due to workarounds for a bug,\n" + "* this clock source is slow. Consider trying" + " other clock sources\n"); + + acpi_pm_need_workaround(); + } +} +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82371AB_3, + acpi_pm_check_blacklist); + +static void __devinit acpi_pm_check_graylist(struct pci_dev *dev) +{ + if (acpi_pm_good) + return; + + printk(KERN_WARNING "* The chipset may have PM-Timer Bug. Due to" + " workarounds for a bug,\n" + "* this clock source is slow. If you are sure your timer" + " does not have\n" + "* this bug, please use \"acpi_pm_good\" to disable the" + " workaround\n"); + + acpi_pm_need_workaround(); +} +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801DB_0, + acpi_pm_check_graylist); +#endif + + +static int __init init_acpi_pm_clocksource(void) +{ + u32 value1, value2; + unsigned int i; + + if (!pmtmr_ioport) + return -ENODEV; + + clocksource_acpi_pm.mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, + clocksource_acpi_pm.shift); + + /* "verify" this timing source: */ + value1 = read_pmtmr(); + for (i = 0; i < 10000; i++) { + value2 = read_pmtmr(); + if (value2 == value1) + continue; + if (value2 > value1) + goto pm_good; + if ((value2 < value1) && ((value2) < 0xFFF)) + goto pm_good; + printk(KERN_INFO "PM-Timer had inconsistent results:" + " 0x%#x, 0x%#x - aborting.\n", value1, value2); + return -EINVAL; + } + printk(KERN_INFO "PM-Timer had no reasonable result:" + " 0x%#x - aborting.\n", value1); + return -ENODEV; + +pm_good: + return clocksource_register(&clocksource_acpi_pm); +} + +module_init(init_acpi_pm_clocksource); Index: linux-hres.q/drivers/clocksource/cyclone.c =================================================================== --- /dev/null +++ linux-hres.q/drivers/clocksource/cyclone.c @@ -0,0 +1,119 @@ +#include +#include +#include +#include +#include + +#include +#include + +#include "mach_timer.h" + +#define CYCLONE_CBAR_ADDR 0xFEB00CD0 /* base address ptr */ +#define CYCLONE_PMCC_OFFSET 0x51A0 /* offset to control register */ +#define CYCLONE_MPCS_OFFSET 0x51A8 /* offset to select register */ +#define CYCLONE_MPMC_OFFSET 0x51D0 /* offset to count register */ +#define CYCLONE_TIMER_FREQ 99780000 /* 100Mhz, but not really */ +#define CYCLONE_TIMER_MASK CLOCKSOURCE_MASK(32) /* 32 bit mask */ + +int use_cyclone = 0; +static void __iomem *cyclone_ptr; + +static cycle_t read_cyclone(void) +{ + return (cycle_t)readl(cyclone_ptr); +} + +static struct clocksource clocksource_cyclone = { + .name = "cyclone", + .rating = 250, + .read = read_cyclone, + .mask = CYCLONE_TIMER_MASK, + .mult = 10, + .shift = 0, + .is_continuous = 1, +}; + +static int __init init_cyclone_clocksource(void) +{ + unsigned long base; /* saved value from CBAR */ + unsigned long offset; + u32 __iomem* volatile cyclone_timer; /* Cyclone MPMC0 register */ + u32 __iomem* reg; + int i; + + /* make sure we're on a summit box: */ + if (!use_cyclone) + return -ENODEV; + + printk(KERN_INFO "Summit chipset: Starting Cyclone Counter.\n"); + + /* find base address: */ + offset = CYCLONE_CBAR_ADDR; + reg = ioremap_nocache(offset, sizeof(reg)); + if (!reg) { + printk(KERN_ERR "Summit chipset: Could not find valid CBAR register.\n"); + return -ENODEV; + } + /* even on 64bit systems, this is only 32bits: */ + base = readl(reg); + if (!base) { + printk(KERN_ERR "Summit chipset: Could not find valid CBAR value.\n"); + return -ENODEV; + } + iounmap(reg); + + /* setup PMCC: */ + offset = base + CYCLONE_PMCC_OFFSET; + reg = ioremap_nocache(offset, sizeof(reg)); + if (!reg) { + printk(KERN_ERR "Summit chipset: Could not find valid PMCC register.\n"); + return -ENODEV; + } + writel(0x00000001,reg); + iounmap(reg); + + /* setup MPCS: */ + offset = base + CYCLONE_MPCS_OFFSET; + reg = ioremap_nocache(offset, sizeof(reg)); + if (!reg) { + printk(KERN_ERR "Summit chipset: Could not find valid MPCS register.\n"); + return -ENODEV; + } + writel(0x00000001,reg); + iounmap(reg); + + /* map in cyclone_timer: */ + offset = base + CYCLONE_MPMC_OFFSET; + cyclone_timer = ioremap_nocache(offset, sizeof(u64)); + if (!cyclone_timer) { + printk(KERN_ERR "Summit chipset: Could not find valid MPMC register.\n"); + return -ENODEV; + } + + /* quick test to make sure its ticking: */ + for (i = 0; i < 3; i++){ + u32 old = readl(cyclone_timer); + int stall = 100; + + while (stall--) + barrier(); + + if (readl(cyclone_timer) == old) { + printk(KERN_ERR "Summit chipset: Counter not counting! DISABLED\n"); + iounmap(cyclone_timer); + cyclone_timer = NULL; + return -ENODEV; + } + } + cyclone_ptr = cyclone_timer; + + /* sort out mult/shift values: */ + clocksource_cyclone.shift = 22; + clocksource_cyclone.mult = clocksource_hz2mult(CYCLONE_TIMER_FREQ, + clocksource_cyclone.shift); + + return clocksource_register(&clocksource_cyclone); +} + +module_init(init_cyclone_clocksource); Index: linux-hres.q/drivers/input/serio/i8042.c =================================================================== --- linux-hres.q.orig/drivers/input/serio/i8042.c +++ linux-hres.q/drivers/input/serio/i8042.c @@ -1085,7 +1085,7 @@ static int __devinit i8042_probe(struct goto err_controller_cleanup; } - mod_timer(&i8042_timer, jiffies + I8042_POLL_PERIOD); + mod_timer(&i8042_timer, jiffies + 2); //I8042_POLL_PERIOD); return 0; err_unregister_ports: Index: linux-hres.q/drivers/input/serio/i8042.h =================================================================== --- linux-hres.q.orig/drivers/input/serio/i8042.h +++ linux-hres.q/drivers/input/serio/i8042.h @@ -44,7 +44,7 @@ * polling. */ -#define I8042_POLL_PERIOD HZ/20 +#define I8042_POLL_PERIOD (10*HZ) /* * Status register bits. Index: linux-hres.q/fs/proc/proc_misc.c =================================================================== --- linux-hres.q.orig/fs/proc/proc_misc.c +++ linux-hres.q/fs/proc/proc_misc.c @@ -508,6 +508,8 @@ static int show_stat(struct seq_file *p, nr_running(), nr_iowait()); + show_no_hz_stats(p); + return 0; } Index: linux-hres.q/include/asm-generic/percpu.h =================================================================== --- linux-hres.q.orig/include/asm-generic/percpu.h +++ linux-hres.q/include/asm-generic/percpu.h @@ -14,6 +14,7 @@ extern unsigned long __per_cpu_offset[NR /* var is in discarded region: offset to particular copy we want */ #define per_cpu(var, cpu) (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset[cpu])) #define __get_cpu_var(var) per_cpu(var, smp_processor_id()) +#define __raw_get_cpu_var(var) per_cpu(var, raw_smp_processor_id()) /* A macro to avoid #include hell... */ #define percpu_modcopy(pcpudst, src, size) \ @@ -30,6 +31,7 @@ do { \ #define per_cpu(var, cpu) (*((void)(cpu), &per_cpu__##var)) #define __get_cpu_var(var) per_cpu__##var +#define __raw_get_cpu_var(var) per_cpu__##var #endif /* SMP */ Index: linux-hres.q/include/asm-i386/delay.h =================================================================== --- linux-hres.q.orig/include/asm-i386/delay.h +++ linux-hres.q/include/asm-i386/delay.h @@ -23,4 +23,6 @@ extern void __delay(unsigned long loops) ((n) > 20000 ? __bad_ndelay() : __const_udelay((n) * 5ul)) : \ __ndelay(n)) +void use_tsc_delay(void); + #endif /* defined(_I386_DELAY_H) */ Index: linux-hres.q/include/asm-i386/i8253.h =================================================================== --- linux-hres.q.orig/include/asm-i386/i8253.h +++ linux-hres.q/include/asm-i386/i8253.h @@ -2,5 +2,6 @@ #define __ASM_I8253_H__ extern spinlock_t i8253_lock; +extern struct clock_event pit_clockevent; #endif /* __ASM_I8253_H__ */ Index: linux-hres.q/include/asm-i386/mach-default/do_timer.h =================================================================== --- linux-hres.q.orig/include/asm-i386/mach-default/do_timer.h +++ linux-hres.q/include/asm-i386/mach-default/do_timer.h @@ -1,7 +1,8 @@ /* defines for inline arch setup functions */ - +#include #include #include +#include /** * do_timer_interrupt_hook - hook into timer tick @@ -16,24 +17,9 @@ static inline void do_timer_interrupt_hook(struct pt_regs *regs) { - do_timer(regs); -#ifndef CONFIG_SMP - update_process_times(user_mode_vm(regs)); -#endif -/* - * In the SMP case we use the local APIC timer interrupt to do the - * profiling, except when we simulate SMP mode on a uniprocessor - * system, in that case we have to call the local interrupt handler. - */ -#ifndef CONFIG_X86_LOCAL_APIC - profile_tick(CPU_PROFILING, regs); -#else - if (!using_apic_timer) - smp_local_timer_interrupt(regs); -#endif + pit_clockevent.event_handler(regs); } - /* you can safely undefine this if you don't have the Neptune chipset */ #define BUGGY_NEPTUN_TIMER Index: linux-hres.q/include/asm-i386/mach-default/mach_timer.h =================================================================== --- linux-hres.q.orig/include/asm-i386/mach-default/mach_timer.h +++ linux-hres.q/include/asm-i386/mach-default/mach_timer.h @@ -15,7 +15,9 @@ #ifndef _MACH_TIMER_H #define _MACH_TIMER_H -#define CALIBRATE_LATCH (5 * LATCH) +#define CALIBRATE_TIME_MSEC 30 /* 30 msecs */ +#define CALIBRATE_LATCH \ + ((CLOCK_TICK_RATE * CALIBRATE_TIME_MSEC + 1000/2)/1000) static inline void mach_prepare_counter(void) { Index: linux-hres.q/include/asm-i386/mach-summit/mach_mpparse.h =================================================================== --- linux-hres.q.orig/include/asm-i386/mach-summit/mach_mpparse.h +++ linux-hres.q/include/asm-i386/mach-summit/mach_mpparse.h @@ -2,6 +2,7 @@ #define __ASM_MACH_MPPARSE_H #include +#include extern int use_cyclone; @@ -29,6 +30,7 @@ static inline int mps_oem_check(struct m (!strncmp(productid, "VIGIL SMP", 9) || !strncmp(productid, "EXA", 3) || !strncmp(productid, "RUTHLESS SMP", 12))){ + mark_tsc_unstable(); use_cyclone = 1; /*enable cyclone-timer*/ setup_summit(); return 1; @@ -42,6 +44,7 @@ static inline int acpi_madt_oem_check(ch if (!strncmp(oem_id, "IBM", 3) && (!strncmp(oem_table_id, "SERVIGIL", 8) || !strncmp(oem_table_id, "EXA", 3))){ + mark_tsc_unstable(); use_cyclone = 1; /*enable cyclone-timer*/ setup_summit(); return 1; Index: linux-hres.q/include/asm-i386/mach-visws/do_timer.h =================================================================== --- linux-hres.q.orig/include/asm-i386/mach-visws/do_timer.h +++ linux-hres.q/include/asm-i386/mach-visws/do_timer.h @@ -9,7 +9,10 @@ static inline void do_timer_interrupt_ho /* Clear the interrupt */ co_cpu_write(CO_CPU_STAT,co_cpu_read(CO_CPU_STAT) & ~CO_STAT_TIMEINTR); + write_seqlock(&xtime_lock); do_timer(regs); + write_sequnlock(&xtime_lock); + #ifndef CONFIG_SMP update_process_times(user_mode_vm(regs)); #endif Index: linux-hres.q/include/asm-i386/thread_info.h =================================================================== --- linux-hres.q.orig/include/asm-i386/thread_info.h +++ linux-hres.q/include/asm-i386/thread_info.h @@ -141,8 +141,7 @@ register unsigned long current_stack_poi #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SECCOMP 8 /* secure computing */ #define TIF_RESTORE_SIGMASK 9 /* restore signal mask in do_signal() */ -#define TIF_POLLING_NRFLAG 16 /* true if poll_idle() is polling TIF_NEED_RESCHED */ -#define TIF_MEMDIE 17 +#define TIF_MEMDIE 16 #define _TIF_SYSCALL_TRACE (1<thread_info->status & TS_POLLING) #endif /* __KERNEL__ */ Index: linux-hres.q/include/asm-i386/timer.h =================================================================== --- linux-hres.q.orig/include/asm-i386/timer.h +++ linux-hres.q/include/asm-i386/timer.h @@ -3,68 +3,11 @@ #include #include -/** - * struct timer_ops - used to define a timer source - * - * @name: name of the timer. - * @init: Probes and initializes the timer. Takes clock= override - * string as an argument. Returns 0 on success, anything else - * on failure. - * @mark_offset: called by the timer interrupt. - * @get_offset: called by gettimeofday(). Returns the number of microseconds - * since the last timer interupt. - * @monotonic_clock: returns the number of nanoseconds since the init of the - * timer. - * @delay: delays this many clock cycles. - */ -struct timer_opts { - char* name; - void (*mark_offset)(void); - unsigned long (*get_offset)(void); - unsigned long long (*monotonic_clock)(void); - void (*delay)(unsigned long); - unsigned long (*read_timer)(void); - int (*suspend)(pm_message_t state); - int (*resume)(void); -}; - -struct init_timer_opts { - int (*init)(char *override); - struct timer_opts *opts; -}; - #define TICK_SIZE (tick_nsec / 1000) - -extern struct timer_opts* __init select_timer(void); -extern void clock_fallback(void); void setup_pit_timer(void); - /* Modifiers for buggy PIT handling */ - extern int pit_latch_buggy; - -extern struct timer_opts *cur_timer; extern int timer_ack; - -/* list of externed timers */ -extern struct timer_opts timer_none; -extern struct timer_opts timer_pit; -extern struct init_timer_opts timer_pit_init; -extern struct init_timer_opts timer_tsc_init; -#ifdef CONFIG_X86_CYCLONE_TIMER -extern struct init_timer_opts timer_cyclone_init; -#endif - -extern unsigned long calibrate_tsc(void); -extern unsigned long read_timer_tsc(void); -extern void init_cpu_khz(void); extern int recalibrate_cpu_khz(void); -#ifdef CONFIG_HPET_TIMER -extern struct init_timer_opts timer_hpet_init; -extern unsigned long calibrate_tsc_hpet(unsigned long *tsc_hpet_quotient_ptr); -#endif -#ifdef CONFIG_X86_PM_TIMER -extern struct init_timer_opts timer_pmtmr_init; -#endif #endif Index: linux-hres.q/include/asm-i386/timex.h =================================================================== --- linux-hres.q.orig/include/asm-i386/timex.h +++ linux-hres.q/include/asm-i386/timex.h @@ -8,6 +8,7 @@ #include #include +#include #ifdef CONFIG_X86_ELAN # define CLOCK_TICK_RATE 1189200 /* AMD Elan has different frequency! */ @@ -16,39 +17,6 @@ #endif -/* - * Standard way to access the cycle counter on i586+ CPUs. - * Currently only used on SMP. - * - * If you really have a SMP machine with i486 chips or older, - * compile for that, and this will just always return zero. - * That's ok, it just means that the nicer scheduling heuristics - * won't work for you. - * - * We only use the low 32 bits, and we'd simply better make sure - * that we reschedule before that wraps. Scheduling at least every - * four billion cycles just basically sounds like a good idea, - * regardless of how fast the machine is. - */ -typedef unsigned long long cycles_t; - -static inline cycles_t get_cycles (void) -{ - unsigned long long ret=0; - -#ifndef CONFIG_X86_TSC - if (!cpu_has_tsc) - return 0; -#endif - -#if defined(CONFIG_X86_GENERIC) || defined(CONFIG_X86_TSC) - rdtscll(ret); -#endif - return ret; -} - -extern unsigned int cpu_khz; - extern int read_current_timer(unsigned long *timer_value); #define ARCH_HAS_READ_CURRENT_TIMER 1 Index: linux-hres.q/include/asm-i386/tsc.h =================================================================== --- /dev/null +++ linux-hres.q/include/asm-i386/tsc.h @@ -0,0 +1,49 @@ +/* + * linux/include/asm-i386/tsc.h + * + * i386 TSC related functions + */ +#ifndef _ASM_i386_TSC_H +#define _ASM_i386_TSC_H + +#include +#include + +/* + * Standard way to access the cycle counter on i586+ CPUs. + * Currently only used on SMP. + * + * If you really have a SMP machine with i486 chips or older, + * compile for that, and this will just always return zero. + * That's ok, it just means that the nicer scheduling heuristics + * won't work for you. + * + * We only use the low 32 bits, and we'd simply better make sure + * that we reschedule before that wraps. Scheduling at least every + * four billion cycles just basically sounds like a good idea, + * regardless of how fast the machine is. + */ +typedef unsigned long long cycles_t; + +extern unsigned int cpu_khz; +extern unsigned int tsc_khz; + +static inline cycles_t get_cycles(void) +{ + unsigned long long ret = 0; + +#ifndef CONFIG_X86_TSC + if (!cpu_has_tsc) + return 0; +#endif + +#if defined(CONFIG_X86_GENERIC) || defined(CONFIG_X86_TSC) + rdtscll(ret); +#endif + return ret; +} + +extern void tsc_init(void); +extern void mark_tsc_unstable(void); + +#endif Index: linux-hres.q/include/asm-ia64/percpu.h =================================================================== --- linux-hres.q.orig/include/asm-ia64/percpu.h +++ linux-hres.q/include/asm-ia64/percpu.h @@ -43,6 +43,7 @@ DECLARE_PER_CPU(unsigned long, local_per #define per_cpu(var, cpu) (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset[cpu])) #define __get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __ia64_per_cpu_var(local_per_cpu_offset))) +#define __raw_get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __ia64_per_cpu_var(local_per_cpu_offset))) extern void percpu_modcopy(void *pcpudst, const void *src, unsigned long size); extern void setup_per_cpu_areas (void); @@ -52,6 +53,7 @@ extern void *per_cpu_init(void); #define per_cpu(var, cpu) (*((void)(cpu), &per_cpu__##var)) #define __get_cpu_var(var) per_cpu__##var +#define __raw_get_cpu_var(var) per_cpu__##var #define per_cpu_init() (__phys_per_cpu_start) #endif /* SMP */ Index: linux-hres.q/include/asm-ia64/thread_info.h =================================================================== --- linux-hres.q.orig/include/asm-ia64/thread_info.h +++ linux-hres.q/include/asm-ia64/thread_info.h @@ -27,6 +27,7 @@ struct thread_info { __u32 flags; /* thread_info flags (see TIF_*) */ __u32 cpu; /* current CPU */ __u32 last_cpu; /* Last CPU thread ran on */ + __u32 status; /* Thread synchronous flags */ mm_segment_t addr_limit; /* user-level address space limit */ int preempt_count; /* 0=premptable, <0=BUG; will also serve as bh-counter */ struct restart_block restart_block; @@ -103,4 +104,8 @@ struct thread_info { /* like TIF_ALLWORK_BITS but sans TIF_SYSCALL_TRACE or TIF_SYSCALL_AUDIT */ #define TIF_WORK_MASK (TIF_ALLWORK_MASK&~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT)) +#define TS_POLLING 1 /* true if in idle loop and not sleeping */ + +#define tsk_is_polling(t) ((t)->thread_info->status & TS_POLLING) + #endif /* _ASM_IA64_THREAD_INFO_H */ Index: linux-hres.q/include/asm-powerpc/percpu.h =================================================================== --- linux-hres.q.orig/include/asm-powerpc/percpu.h +++ linux-hres.q/include/asm-powerpc/percpu.h @@ -22,6 +22,7 @@ /* var is in discarded region: offset to particular copy we want */ #define per_cpu(var, cpu) (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset(cpu))) #define __get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __my_cpu_offset())) +#define __raw_get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __my_cpu_offset())) /* A macro to avoid #include hell... */ #define percpu_modcopy(pcpudst, src, size) \ @@ -41,6 +42,7 @@ extern void setup_per_cpu_areas(void); #define per_cpu(var, cpu) (*((void)(cpu), &per_cpu__##var)) #define __get_cpu_var(var) per_cpu__##var +#define __raw_get_cpu_var(var) per_cpu__##var #endif /* SMP */ Index: linux-hres.q/include/asm-s390/percpu.h =================================================================== --- linux-hres.q.orig/include/asm-s390/percpu.h +++ linux-hres.q/include/asm-s390/percpu.h @@ -40,6 +40,7 @@ extern unsigned long __per_cpu_offset[NR __typeof__(type) per_cpu__##name #define __get_cpu_var(var) __reloc_hide(var,S390_lowcore.percpu_offset) +#define __raw_get_cpu_var(var) __reloc_hide(var,S390_lowcore.percpu_offset) #define per_cpu(var,cpu) __reloc_hide(var,__per_cpu_offset[cpu]) /* A macro to avoid #include hell... */ @@ -57,6 +58,7 @@ do { \ __typeof__(type) per_cpu__##name #define __get_cpu_var(var) __reloc_hide(var,0) +#define __raw_get_cpu_var(var) __reloc_hide(var,0) #define per_cpu(var,cpu) __reloc_hide(var,0) #endif /* SMP */ Index: linux-hres.q/include/asm-sparc64/percpu.h =================================================================== --- linux-hres.q.orig/include/asm-sparc64/percpu.h +++ linux-hres.q/include/asm-sparc64/percpu.h @@ -21,6 +21,7 @@ register unsigned long __local_per_cpu_o /* var is in discarded region: offset to particular copy we want */ #define per_cpu(var, cpu) (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset(cpu))) #define __get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __local_per_cpu_offset)) +#define __raw_get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __local_per_cpu_offset)) /* A macro to avoid #include hell... */ #define percpu_modcopy(pcpudst, src, size) \ @@ -37,6 +38,7 @@ do { \ #define per_cpu(var, cpu) (*((void)cpu, &per_cpu__##var)) #define __get_cpu_var(var) per_cpu__##var +#define __raw_get_cpu_var(var) per_cpu__##var #endif /* SMP */ Index: linux-hres.q/include/asm-x86_64/percpu.h =================================================================== --- linux-hres.q.orig/include/asm-x86_64/percpu.h +++ linux-hres.q/include/asm-x86_64/percpu.h @@ -21,6 +21,7 @@ /* var is in discarded region: offset to particular copy we want */ #define per_cpu(var, cpu) (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset(cpu))) #define __get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __my_cpu_offset())) +#define __raw_get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __my_cpu_offset())) /* A macro to avoid #include hell... */ #define percpu_modcopy(pcpudst, src, size) \ @@ -40,6 +41,7 @@ extern void setup_per_cpu_areas(void); #define per_cpu(var, cpu) (*((void)(cpu), &per_cpu__##var)) #define __get_cpu_var(var) per_cpu__##var +#define __raw_get_cpu_var(var) per_cpu__##var #endif /* SMP */ Index: linux-hres.q/include/asm-x86_64/thread_info.h =================================================================== --- linux-hres.q.orig/include/asm-x86_64/thread_info.h +++ linux-hres.q/include/asm-x86_64/thread_info.h @@ -101,7 +101,7 @@ static inline struct thread_info *stack_ #define TIF_IRET 5 /* force IRET */ #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SECCOMP 8 /* secure computing */ -#define TIF_POLLING_NRFLAG 16 /* true if poll_idle() is polling TIF_NEED_RESCHED */ +/* 16 free */ #define TIF_IA32 17 /* 32bit process */ #define TIF_FORK 18 /* ret_from_fork */ #define TIF_ABI_PENDING 19 @@ -115,7 +115,6 @@ static inline struct thread_info *stack_ #define _TIF_IRET (1<thread_info->status & TS_POLLING) #endif /* __KERNEL__ */ Index: linux-hres.q/include/linux/clockchips.h =================================================================== --- /dev/null +++ linux-hres.q/include/linux/clockchips.h @@ -0,0 +1,102 @@ +/* linux/include/linux/clockchips.h + * + * This file contains the structure definitions for clockchips. + * + * If you are not a clockchip, or the time of day code, you should + * not be including this file! + */ +#ifndef _LINUX_CLOCKCHIPS_H +#define _LINUX_CLOCKCHIPS_H + +#include + +#ifdef CONFIG_GENERIC_TIME + +#include +#include + +/* Clock event mode commands */ +enum { + CLOCK_EVT_PERIODIC, + CLOCK_EVT_ONESHOT, + CLOCK_EVT_SHUTDOWN, +}; + +/* Clock event capability flags */ +#define CLOCK_CAP_TICK 0x000001 +#define CLOCK_CAP_UPDATE 0x000002 +#ifndef CONFIG_PROFILE_NMI +# define CLOCK_CAP_PROFILE 0x000004 +#else +# define CLOCK_CAP_PROFILE 0x000000 +#endif +#ifdef CONFIG_HIGH_RES_TIMERS +# define CLOCK_CAP_NEXTEVT 0x000008 +#else +# define CLOCK_CAP_NEXTEVT 0x000000 +#endif + +#define CLOCK_BASE_CAPS_MASK (CLOCK_CAP_TICK | CLOCK_CAP_PROFILE | \ + CLOCK_CAP_UPDATE) +#define CLOCK_CAPS_MASK (CLOCK_BASE_CAPS_MASK | CLOCK_CAP_NEXTEVT) + +struct clock_event; + +/** + * struct clock_event - clock event descriptor + * + * @name: ptr to clock event name + * @capabilities: capabilities of the event chip + * @max_delta_ns: maximum delta value in ns + * @min_delta_ns: minimum delta value in ns + * @mult: nanosecond to cycles multiplier + * @shift: nanoseconds to cycles divisor (power of two) + * @set_next_event: set next event + * @set_mode: set mode function + * @suspend: suspend function (optional) + * @resume: resume function (optional) + * @evthandler: Assigned by the framework to be called by the low + * level handler of the event source + */ +struct clock_event { + const char *name; + unsigned int capabilities; + unsigned long max_delta_ns; + unsigned long min_delta_ns; + unsigned long mult; + int shift; + void (*set_next_event)(unsigned long evt, + struct clock_event *); + void (*set_mode)(int mode, struct clock_event *); + int (*suspend)(struct clock_event *); + int (*resume)(struct clock_event *); + void (*event_handler)(struct pt_regs *regs); +}; + +/* + * Calculate a multiplication factor + */ +static inline unsigned long div_sc(unsigned long a, unsigned long b, + int shift) +{ + uint64_t tmp = ((uint64_t)a) << shift; + do_div(tmp, b); + return (unsigned long) tmp; +} + +/* Clock event layer functions */ +extern int register_local_clockevent(struct clock_event *); +extern int register_global_clockevent(struct clock_event *); +extern unsigned long clockevent_delta2ns(unsigned long latch, + struct clock_event *evt); +extern void init_clockevents(void); + +extern int clockevents_init_next_event(void); +extern int clockevents_set_next_event(ktime_t expires, int force); +extern int clockevents_next_event_available(void); + +#else +# define init_clockevents() do { } while(0) +#endif + +#endif Index: linux-hres.q/include/linux/clocksource.h =================================================================== --- /dev/null +++ linux-hres.q/include/linux/clocksource.h @@ -0,0 +1,280 @@ +/* linux/include/linux/clocksource.h + * + * This file contains the structure definitions for clocksources. + * + * If you are not a clocksource, or timekeeping code, you should + * not be including this file! + */ +#ifndef _LINUX_CLOCKSOURCE_H +#define _LINUX_CLOCKSOURCE_H + +#include +#include +#include +#include +#include +#include + +/* clocksource cycle base type */ +typedef u64 cycle_t; + +/** + * struct clocksource - hardware abstraction for a free running counter + * Provides mostly state-free accessors to the underlying hardware. + * + * @name: ptr to clocksource name + * @list: list head for registration + * @rating: rating value for selection (higher is better) + * To avoid rating inflation the following + * list should give you a guide as to how + * to assign your clocksource a rating + * 1-99: Unfit for real use + * Only available for bootup and testing purposes. + * 100-199: Base level usability. + * Functional for real use, but not desired. + * 200-299: Good. + * A correct and usable clocksource. + * 300-399: Desired. + * A reasonably fast and accurate clocksource. + * 400-499: Perfect + * The ideal clocksource. A must-use where + * available. + * @read: returns a cycle value + * @mask: bitmask for two's complement + * subtraction of non 64 bit counters + * @mult: cycle to nanosecond multiplier + * @shift: cycle to nanosecond divisor (power of two) + * @update_callback: called when safe to alter clocksource values + * @is_continuous: defines if clocksource is free-running. + * @interval_cycles: Used internally by timekeeping core, please ignore. + * @interval_snsecs: Used internally by timekeeping core, please ignore. + */ +struct clocksource { + char *name; + struct list_head list; + int rating; + cycle_t (*read)(void); + cycle_t mask; + u32 mult; + u32 shift; + int (*update_callback)(void); + int is_continuous; + + /* timekeeping specific data, ignore */ + cycle_t interval_cycles; + u64 interval_snsecs; +}; + +/* simplify initialization of mask field */ +#define CLOCKSOURCE_MASK(bits) (cycle_t)(bits<64 ? ((1ULL<read(); +} + +/** + * cyc2ns - converts clocksource cycles to nanoseconds + * @cs: Pointer to clocksource + * @cycles: Cycles + * + * Uses the clocksource and ntp ajdustment to convert cycle_ts to nanoseconds. + * + * XXX - This could use some mult_lxl_ll() asm optimization + */ +static inline s64 cyc2ns(struct clocksource *cs, cycle_t cycles) +{ + u64 ret = (u64)cycles; + ret = (ret * cs->mult) >> cs->shift; + return ret; +} + +/** + * clocksource_calculate_interval - Calculates a clocksource interval struct + * + * @c: Pointer to clocksource. + * @length_nsec: Desired interval length in nanoseconds. + * + * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment + * pair and interval request. + * + * Unless you're the timekeeping code, you should not be using this! + */ +static inline void clocksource_calculate_interval(struct clocksource *c, + unsigned long length_nsec) +{ + u64 tmp; + + /* XXX - All of this could use a whole lot of optimization */ + tmp = length_nsec; + tmp <<= c->shift; + tmp += c->mult/2; + do_div(tmp, c->mult); + + c->interval_cycles = (cycle_t)tmp; + if(c->interval_cycles == 0) + c->interval_cycles = 1; + + c->interval_snsecs = (u64)c->interval_cycles * c->mult; +} + + +/** + * error_aproximation - calculates an error adjustment for a given error + * + * @error: Error value (unsigned) + * @unit: Adjustment unit + * + * For a given error value, this function takes the adjustment unit + * and uses binary approximation to return a power of two adjustment value. + * + * This function is only for use by the the make_ntp_adj() function + * and you must hold a write on the xtime_lock when calling. + */ +static inline int error_aproximation(u64 error, u64 unit) +{ + static int saved_adj = 0; + u64 adjusted_unit = unit << saved_adj; + + if (error > (adjusted_unit * 2)) { + /* large error, so increment the adjustment factor */ + saved_adj++; + } else if (error > adjusted_unit) { + /* just right, don't touch it */ + } else if (saved_adj) { + /* small error, so drop the adjustment factor */ + saved_adj--; + return 0; + } + + return saved_adj; +} + + +/** + * make_ntp_adj - Adjusts the specified clocksource for a given error + * + * @clock: Pointer to clock to be adjusted + * @cycles_delta: Current unacounted cycle delta + * @error: Pointer to current error value + * + * Returns clock shifted nanosecond adjustment to be applied against + * the accumulated time value (ie: xtime). + * + * If the error value is large enough, this function calulates the + * (power of two) adjustment value, and adjusts the clock's mult and + * interval_snsecs values accordingly. + * + * However, since there may be some unaccumulated cycles, to avoid + * time inconsistencies we must adjust the accumulation value + * accordingly. + * + * This is not very intuitive, so the following proof should help: + * The basic timeofday algorithm: base + cycle * mult + * Thus: + * new_base + cycle * new_mult = old_base + cycle * old_mult + * new_base = old_base + cycle * old_mult - cycle * new_mult + * new_base = old_base + cycle * (old_mult - new_mult) + * new_base - old_base = cycle * (old_mult - new_mult) + * base_delta = cycle * (old_mult - new_mult) + * base_delta = cycle * (mult_delta) + * + * Where mult_delta is the adjustment value made to mult + * + */ +static inline s64 make_ntp_adj(struct clocksource *clock, + cycles_t cycles_delta, s64* error) +{ + s64 ret = 0; + if (*error > ((s64)clock->interval_cycles+1)/2) { + /* calculate adjustment value */ + int adjustment = error_aproximation(*error, + clock->interval_cycles); + /* adjust clock */ + clock->mult += 1 << adjustment; + clock->interval_snsecs += clock->interval_cycles << adjustment; + + /* adjust the base and error for the adjustment */ + ret = -(cycles_delta << adjustment); + *error -= clock->interval_cycles << adjustment; + /* XXX adj error for cycle_delta offset? */ + } else if ((-(*error)) > ((s64)clock->interval_cycles+1)/2) { + /* calculate adjustment value */ + int adjustment = error_aproximation(-(*error), + clock->interval_cycles); + /* adjust clock */ + clock->mult -= 1 << adjustment; + clock->interval_snsecs -= clock->interval_cycles << adjustment; + + /* adjust the base and error for the adjustment */ + ret = cycles_delta << adjustment; + *error += clock->interval_cycles << adjustment; + /* XXX adj error for cycle_delta offset? */ + } + return ret; +} + + +/* used to install a new clocksource */ +int clocksource_register(struct clocksource*); +void clocksource_reselect(void); +struct clocksource* clocksource_get_next(void); + +#endif /* _LINUX_CLOCKSOURCE_H */ Index: linux-hres.q/include/linux/hrtimer.h =================================================================== --- linux-hres.q.orig/include/linux/hrtimer.h +++ linux-hres.q/include/linux/hrtimer.h @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -34,9 +35,19 @@ enum hrtimer_restart { HRTIMER_RESTART, }; -#define HRTIMER_INACTIVE ((void *)1UL) +enum hrtimer_state { + HRTIMER_INACTIVE, + HRTIMER_PENDING, + HRTIMER_ACTIVE, +}; + +enum hrtimer_cb_mode { + HRTIMER_CB_SOFTIRQ, + HRTIMER_CB_IRQSAFE, + HRTIMER_CB_IRQSAFE_NO_RESTART, +}; -struct hrtimer_base; +struct hrtimer_clock_base; /** * struct hrtimer - the basic hrtimer structure @@ -48,13 +59,22 @@ struct hrtimer_base; * @function: timer expiry callback function * @base: pointer to the timer base (per cpu and per clock) * + * @mode: high resolution timer feature to allow executing the + * callback in the hardirq context (wakeups) + * @cb_entry: list head to enqueue an expired timer into the callback list + * * The hrtimer structure must be initialized by init_hrtimer_#CLOCKTYPE() */ struct hrtimer { - struct rb_node node; - ktime_t expires; - int (*function)(struct hrtimer *); - struct hrtimer_base *base; + struct rb_node node; + ktime_t expires; + int (*function)(struct hrtimer *); + struct hrtimer_clock_base *base; + enum hrtimer_state state; +#ifdef CONFIG_HIGH_RES_TIMERS + int cb_mode; + struct list_head cb_entry; +#endif }; /** @@ -70,6 +90,8 @@ struct hrtimer_sleeper { struct task_struct *task; }; +struct hrtimer_cpu_base; + /** * struct hrtimer_base - the timer base for a specific clock * @@ -83,25 +105,95 @@ struct hrtimer_sleeper { * @get_softirq_time: function to retrieve the current time from the softirq * @curr_timer: the timer which is executing a callback right now * @softirq_time: the time when running the hrtimer queue in the softirq + * + * @cb_pending: list of timers where the callback is pending + * @offset: hmmm + * @reprogram: function to reprogram the timer event */ -struct hrtimer_base { +struct hrtimer_clock_base { + struct hrtimer_cpu_base *cpu_base; clockid_t index; - spinlock_t lock; struct rb_root active; struct rb_node *first; ktime_t resolution; ktime_t (*get_time)(void); ktime_t (*get_softirq_time)(void); - struct hrtimer *curr_timer; ktime_t softirq_time; +#ifdef CONFIG_HIGH_RES_TIMERS + ktime_t offset; + int (*reprogram)(struct hrtimer *t, + struct hrtimer_clock_base *b, + ktime_t n); +#endif }; +#define HRTIMER_MAX_CLOCK_BASES 2 + +/* + * struct hrtimer_cpu_base + */ +struct hrtimer_cpu_base { + spinlock_t lock; + struct hrtimer *curr_timer; + struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES]; +#ifdef CONFIG_HIGH_RES_TIMERS + ktime_t expires_next; + int hres_active; + unsigned long check_clocks; + struct list_head cb_pending; + struct hrtimer sched_timer; + struct pt_regs *sched_regs; + unsigned long events; +#ifdef CONFIG_NO_HZ + ktime_t idle_tick; + unsigned long idle_jiffies; + unsigned long idle_calls; + unsigned long idle_sleeps; + unsigned long idle_sleeptime; +#endif +#endif +}; + +#ifdef CONFIG_HIGH_RES_TIMERS + +/* temporary hack */ +DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); + +extern void hrtimer_clock_notify(void); +extern void clock_was_set(void); +extern void hrtimer_interrupt(struct pt_regs *regs); + +# define hrtimer_cb_get_time(t) (t)->base->get_time() +# define hrtimer_hres_active (__get_cpu_var(hrtimer_bases).hres_active) +/* + * The resolution of the clocks. The resolution value is returned in + * the clock_getres() system call to give application programmers an + * idea of the (in)accuracy of timers. Timer values are rounded up to + * this resolution values. + */ +# define KTIME_REALTIME_RES (ktime_t) { .tv64 = CONFIG_HIGH_RES_RESOLUTION } +# define KTIME_MONOTONIC_RES (ktime_t) { .tv64 = CONFIG_HIGH_RES_RESOLUTION } + +#else + +# define KTIME_REALTIME_RES KTIME_LOW_RES +# define KTIME_MONOTONIC_RES KTIME_LOW_RES + /* * clock_was_set() is a NOP for non- high-resolution systems. The * time-sorted order guarantees that a timer does not expire early and * is expired in the next softirq when the clock was advanced. */ -#define clock_was_set() do { } while (0) +# define clock_was_set() do { } while (0) +# define hrtimer_clock_notify() do { } while (0) + +# define hrtimer_cb_get_time(t) (t)->base->softirq_time +# define hrtimer_hres_active 0 + +#endif + +extern ktime_t ktime_get(void); +extern ktime_t ktime_get_real(void); /* Exported timer functions: */ @@ -127,7 +219,7 @@ extern ktime_t hrtimer_get_next_event(vo static inline int hrtimer_active(const struct hrtimer *timer) { - return timer->node.rb_parent != HRTIMER_INACTIVE; + return timer->state != HRTIMER_INACTIVE; } /* Forward a hrtimer so it expires after now: */ @@ -149,4 +241,22 @@ extern void hrtimer_run_queues(void); /* Bootup initialization: */ extern void __init hrtimers_init(void); +#ifdef CONFIG_NO_HZ +extern void hrtimer_trigger_next_hz_tick(struct tvec_t_base_s *base); +extern int hrtimer_stop_sched_tick(void); +extern void hrtimer_restart_sched_tick(void); +extern void update_jiffies(void); +struct seq_file; +extern void show_no_hz_stats(struct seq_file *p); +#else +# define hrtimer_trigger_next_hz_tick(base) do { } while (0) +static inline int hrtimer_stop_sched_tick(void) +{ + return 0; +} +# define hrtimer_restart_sched_tick() do { } while (0) +# define update_jiffies() do { } while (0) +# define show_no_hz_stats(p) do { } while (0) +#endif + #endif Index: linux-hres.q/include/linux/interrupt.h =================================================================== --- linux-hres.q.orig/include/linux/interrupt.h +++ linux-hres.q/include/linux/interrupt.h @@ -113,7 +113,10 @@ enum NET_TX_SOFTIRQ, NET_RX_SOFTIRQ, BLOCK_SOFTIRQ, - TASKLET_SOFTIRQ + TASKLET_SOFTIRQ, +#ifdef CONFIG_HIGH_RES_TIMERS + HRTIMER_SOFTIRQ, +#endif }; /* softirq mask and active fields moved to irq_cpustat_t in Index: linux-hres.q/include/linux/jiffies.h =================================================================== --- linux-hres.q.orig/include/linux/jiffies.h +++ linux-hres.q/include/linux/jiffies.h @@ -127,13 +127,13 @@ static inline u64 get_jiffies_64(void) * * And some not so obvious. * - * Note that we don't want to return MAX_LONG, because + * Note that we don't want to return LONG_MAX, because * for various timeout reasons we often end up having * to wait "jiffies+1" in order to guarantee that we wait * at _least_ "jiffies" - so "jiffies+1" had better still * be positive. */ -#define MAX_JIFFY_OFFSET ((~0UL >> 1)-1) +#define MAX_JIFFY_OFFSET ((LONG_MAX >> 1)-1) /* * We want to do realistic conversions of time so we need to use the same @@ -244,207 +244,24 @@ static inline u64 get_jiffies_64(void) #endif /* - * Convert jiffies to milliseconds and back. - * - * Avoid unnecessary multiplications/divisions in the - * two most common HZ cases: - */ -static inline unsigned int jiffies_to_msecs(const unsigned long j) -{ -#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) - return (MSEC_PER_SEC / HZ) * j; -#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) - return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); -#else - return (j * MSEC_PER_SEC) / HZ; -#endif -} - -static inline unsigned int jiffies_to_usecs(const unsigned long j) -{ -#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) - return (USEC_PER_SEC / HZ) * j; -#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) - return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC); -#else - return (j * USEC_PER_SEC) / HZ; -#endif -} - -static inline unsigned long msecs_to_jiffies(const unsigned int m) -{ - if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) - return MAX_JIFFY_OFFSET; -#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) - return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ); -#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) - return m * (HZ / MSEC_PER_SEC); -#else - return (m * HZ + MSEC_PER_SEC - 1) / MSEC_PER_SEC; -#endif -} - -static inline unsigned long usecs_to_jiffies(const unsigned int u) -{ - if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET)) - return MAX_JIFFY_OFFSET; -#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) - return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ); -#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) - return u * (HZ / USEC_PER_SEC); -#else - return (u * HZ + USEC_PER_SEC - 1) / USEC_PER_SEC; -#endif -} - -/* - * The TICK_NSEC - 1 rounds up the value to the next resolution. Note - * that a remainder subtract here would not do the right thing as the - * resolution values don't fall on second boundries. I.e. the line: - * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding. - * - * Rather, we just shift the bits off the right. - * - * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec - * value to a scaled second value. + * Convert various time units to each other: */ -static __inline__ unsigned long -timespec_to_jiffies(const struct timespec *value) -{ - unsigned long sec = value->tv_sec; - long nsec = value->tv_nsec + TICK_NSEC - 1; - - if (sec >= MAX_SEC_IN_JIFFIES){ - sec = MAX_SEC_IN_JIFFIES; - nsec = 0; - } - return (((u64)sec * SEC_CONVERSION) + - (((u64)nsec * NSEC_CONVERSION) >> - (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; - -} - -static __inline__ void -jiffies_to_timespec(const unsigned long jiffies, struct timespec *value) -{ - /* - * Convert jiffies to nanoseconds and separate with - * one divide. - */ - u64 nsec = (u64)jiffies * TICK_NSEC; - value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &value->tv_nsec); -} +extern unsigned int jiffies_to_msecs(const unsigned long j); +extern unsigned int jiffies_to_usecs(const unsigned long j); +extern unsigned long msecs_to_jiffies(const unsigned int m); +extern unsigned long usecs_to_jiffies(const unsigned int u); +extern unsigned long timespec_to_jiffies(const struct timespec *value); +extern void jiffies_to_timespec(const unsigned long jiffies, + struct timespec *value); +extern unsigned long timeval_to_jiffies(const struct timeval *value); +extern void jiffies_to_timeval(const unsigned long jiffies, + struct timeval *value); +extern clock_t jiffies_to_clock_t(long x); +extern unsigned long clock_t_to_jiffies(unsigned long x); +extern u64 jiffies_64_to_clock_t(u64 x); +extern u64 nsec_to_clock_t(u64 x); +extern int nsec_to_timestamp(char *s, u64 t); -/* Same for "timeval" - * - * Well, almost. The problem here is that the real system resolution is - * in nanoseconds and the value being converted is in micro seconds. - * Also for some machines (those that use HZ = 1024, in-particular), - * there is a LARGE error in the tick size in microseconds. - - * The solution we use is to do the rounding AFTER we convert the - * microsecond part. Thus the USEC_ROUND, the bits to be shifted off. - * Instruction wise, this should cost only an additional add with carry - * instruction above the way it was done above. - */ -static __inline__ unsigned long -timeval_to_jiffies(const struct timeval *value) -{ - unsigned long sec = value->tv_sec; - long usec = value->tv_usec; - - if (sec >= MAX_SEC_IN_JIFFIES){ - sec = MAX_SEC_IN_JIFFIES; - usec = 0; - } - return (((u64)sec * SEC_CONVERSION) + - (((u64)usec * USEC_CONVERSION + USEC_ROUND) >> - (USEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; -} - -static __inline__ void -jiffies_to_timeval(const unsigned long jiffies, struct timeval *value) -{ - /* - * Convert jiffies to nanoseconds and separate with - * one divide. - */ - u64 nsec = (u64)jiffies * TICK_NSEC; - long tv_usec; - - value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &tv_usec); - tv_usec /= NSEC_PER_USEC; - value->tv_usec = tv_usec; -} - -/* - * Convert jiffies/jiffies_64 to clock_t and back. - */ -static inline clock_t jiffies_to_clock_t(long x) -{ -#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 - return x / (HZ / USER_HZ); -#else - u64 tmp = (u64)x * TICK_NSEC; - do_div(tmp, (NSEC_PER_SEC / USER_HZ)); - return (long)tmp; -#endif -} - -static inline unsigned long clock_t_to_jiffies(unsigned long x) -{ -#if (HZ % USER_HZ)==0 - if (x >= ~0UL / (HZ / USER_HZ)) - return ~0UL; - return x * (HZ / USER_HZ); -#else - u64 jif; - - /* Don't worry about loss of precision here .. */ - if (x >= ~0UL / HZ * USER_HZ) - return ~0UL; - - /* .. but do try to contain it here */ - jif = x * (u64) HZ; - do_div(jif, USER_HZ); - return jif; -#endif -} - -static inline u64 jiffies_64_to_clock_t(u64 x) -{ -#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 - do_div(x, HZ / USER_HZ); -#else - /* - * There are better ways that don't overflow early, - * but even this doesn't overflow in hundreds of years - * in 64 bits, so.. - */ - x *= TICK_NSEC; - do_div(x, (NSEC_PER_SEC / USER_HZ)); -#endif - return x; -} - -static inline u64 nsec_to_clock_t(u64 x) -{ -#if (NSEC_PER_SEC % USER_HZ) == 0 - do_div(x, (NSEC_PER_SEC / USER_HZ)); -#elif (USER_HZ % 512) == 0 - x *= USER_HZ/512; - do_div(x, (NSEC_PER_SEC / 512)); -#else - /* - * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024, - * overflow after 64.99 years. - * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ... - */ - x *= 9; - do_div(x, (unsigned long)((9ull * NSEC_PER_SEC + (USER_HZ/2)) - / USER_HZ)); -#endif - return x; -} +#define TIMESTAMP_SIZE 30 #endif Index: linux-hres.q/include/linux/ktime.h =================================================================== --- linux-hres.q.orig/include/linux/ktime.h +++ linux-hres.q/include/linux/ktime.h @@ -264,8 +264,7 @@ static inline u64 ktime_to_ns(const ktim * idea of the (in)accuracy of timers. Timer values are rounded up to * this resolution values. */ -#define KTIME_REALTIME_RES (ktime_t){ .tv64 = TICK_NSEC } -#define KTIME_MONOTONIC_RES (ktime_t){ .tv64 = TICK_NSEC } +#define KTIME_LOW_RES (ktime_t){ .tv64 = TICK_NSEC } /* Get the monotonic time in timespec format: */ extern void ktime_get_ts(struct timespec *ts); Index: linux-hres.q/include/linux/rcupdate.h =================================================================== --- linux-hres.q.orig/include/linux/rcupdate.h +++ linux-hres.q/include/linux/rcupdate.h @@ -256,6 +256,7 @@ extern int rcu_needs_cpu(int cpu); extern void rcu_init(void); extern void rcu_check_callbacks(int cpu, int user); +extern void rcu_advance_callbacks(int cpu, int user); extern void rcu_restart_cpu(int cpu); extern long rcu_batches_completed(void); @@ -268,6 +269,7 @@ extern __deprecated_for_modules void syn extern void synchronize_rcu(void); void synchronize_idle(void); extern void rcu_barrier(void); +extern void rcu_process_callbacks(unsigned long unused); #endif /* __KERNEL__ */ #endif /* __LINUX_RCUPDATE_H */ Index: linux-hres.q/include/linux/sysctl.h =================================================================== --- linux-hres.q.orig/include/linux/sysctl.h +++ linux-hres.q/include/linux/sysctl.h @@ -148,6 +148,7 @@ enum KERN_SPIN_RETRY=70, /* int: number of spinlock retries */ KERN_ACPI_VIDEO_FLAGS=71, /* int: flags for setting up video after ACPI sleep */ KERN_IA64_UNALIGNED=72, /* int: ia64 unaligned userland trap enable */ + KERN_TIMEOUT_GRANULARITY=73, /* int: timeout granularity in jiffies */ }; Index: linux-hres.q/include/linux/time.h =================================================================== --- linux-hres.q.orig/include/linux/time.h +++ linux-hres.q/include/linux/time.h @@ -77,6 +77,8 @@ extern struct timespec xtime; extern struct timespec wall_to_monotonic; extern seqlock_t xtime_lock; +void timekeeping_init(void); + static inline unsigned long get_seconds(void) { return xtime.tv_sec; @@ -100,6 +102,7 @@ extern int do_getitimer(int which, struc extern void getnstimeofday(struct timespec *tv); extern struct timespec timespec_trunc(struct timespec t, unsigned gran); +extern int timekeeping_is_continuous(void); /** * timespec_to_ns - Convert timespec to nanoseconds @@ -142,6 +145,20 @@ extern struct timespec ns_to_timespec(co */ extern struct timeval ns_to_timeval(const s64 nsec); +/** + * timespec_add_ns - Adds nanoseconds to a timespec + * @a: pointer to timespec to be incremented + * @ns: unsigned nanoseconds value to be added + */ +static inline void timespec_add_ns(struct timespec *a, u64 ns) +{ + ns += a->tv_nsec; + while(unlikely(ns >= NSEC_PER_SEC)) { + ns -= NSEC_PER_SEC; + a->tv_sec++; + } + a->tv_nsec = ns; +} #endif /* __KERNEL__ */ #define NFDBITS __NFDBITS Index: linux-hres.q/include/linux/timer.h =================================================================== --- linux-hres.q.orig/include/linux/timer.h +++ linux-hres.q/include/linux/timer.h @@ -3,6 +3,7 @@ #include #include +#include #include #include @@ -16,9 +17,15 @@ struct timer_list { unsigned long data; struct tvec_t_base_s *base; +#ifdef CONFIG_TIMER_INFO + void *start_site; + char start_comm[16]; + int start_pid; +#endif }; extern struct tvec_t_base_s boot_tvec_bases; +extern unsigned int timeout_granularity; #define TIMER_INITIALIZER(_function, _expires, _data) { \ .function = (_function), \ @@ -62,7 +69,52 @@ extern int del_timer(struct timer_list * extern int __mod_timer(struct timer_list *timer, unsigned long expires); extern int mod_timer(struct timer_list *timer, unsigned long expires); +/* + * Return when the next timer-wheel timeout occurs (in absolute jiffies), + * locks the timer base: + */ extern unsigned long next_timer_interrupt(void); +/* + * Return when the next timer-wheel timeout occurs (in absolute jiffies), + * locks the timer base and does the comparison against the given + * jiffie. + */ +extern unsigned long get_next_timer_interrupt(unsigned long now); + +/* + * Timer-top info: + */ +#ifdef CONFIG_TIMER_INFO +extern int account_timer(struct timer_list *timer); + +extern void __timer_set_start_info(struct timer_list *timer, void *addr); + +static inline void timer_set_start_info(struct timer_list *timer) +{ + __timer_set_start_info(timer, __builtin_return_address(0)); +} + +static inline void timer_clear_start_info(struct timer_list *timer) +{ + timer->start_site = NULL; +} +#else +static inline int account_timer(struct timer_list *timer) +{ + return 0; +} + +static inline void timer_set_start_info(struct timer_list *timer) +{ +} + +static inline void timer_clear_start_info(struct timer_list *timer) +{ +} +#endif + +extern void delayed_work_timer_fn(unsigned long __data); + /*** * add_timer - start a timer Index: linux-hres.q/include/linux/timex.h =================================================================== --- linux-hres.q.orig/include/linux/timex.h +++ linux-hres.q/include/linux/timex.h @@ -305,7 +305,7 @@ time_interpolator_reset(void) #endif /* !CONFIG_TIME_INTERPOLATION */ /* Returns how long ticks are at present, in ns / 2^(SHIFT_SCALE-10). */ -extern u64 current_tick_length(void); +extern u64 current_tick_length(long); extern int do_adjtimex(struct timex *); Index: linux-hres.q/init/main.c =================================================================== --- linux-hres.q.orig/init/main.c +++ linux-hres.q/init/main.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #include @@ -486,10 +487,12 @@ asmlinkage void __init start_kernel(void rcu_init(); init_IRQ(); pidhash_init(); + init_clockevents(); init_timers(); hrtimers_init(); softirq_init(); time_init(); + timekeeping_init(); /* * HACK ALERT! This is early. We're enabling the console before Index: linux-hres.q/kernel/Makefile =================================================================== --- linux-hres.q.orig/kernel/Makefile +++ linux-hres.q/kernel/Makefile @@ -10,11 +10,13 @@ obj-y = sched.o fork.o exec_domain.o kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o +obj-y += time/ obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o obj-$(CONFIG_FUTEX) += futex.o ifeq ($(CONFIG_COMPAT),y) obj-$(CONFIG_FUTEX) += futex_compat.o endif +obj-$(CONFIG_TIMER_INFO) += timer_top.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += cpu.o spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o Index: linux-hres.q/kernel/hrtimer.c =================================================================== --- linux-hres.q.orig/kernel/hrtimer.c +++ linux-hres.q/kernel/hrtimer.c @@ -1,8 +1,8 @@ /* * linux/kernel/hrtimer.c * - * Copyright(C) 2005, Thomas Gleixner - * Copyright(C) 2005, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2005-2006, Thomas Gleixner + * Copyright(C) 2005-2006, Red Hat, Inc., Ingo Molnar * * High-resolution kernel timers * @@ -36,7 +36,10 @@ #include #include #include +#include #include +#include +#include #include @@ -45,7 +48,7 @@ * * returns the time in ktime_t format */ -static ktime_t ktime_get(void) +ktime_t ktime_get(void) { struct timespec now; @@ -59,7 +62,7 @@ static ktime_t ktime_get(void) * * returns the time in ktime_t format */ -static ktime_t ktime_get_real(void) +ktime_t ktime_get_real(void) { struct timespec now; @@ -79,21 +82,22 @@ EXPORT_SYMBOL_GPL(ktime_get_real); * This ensures that we capture erroneous accesses to these clock ids * rather than moving them into the range of valid clock id's. */ - -#define MAX_HRTIMER_BASES 2 - -static DEFINE_PER_CPU(struct hrtimer_base, hrtimer_bases[MAX_HRTIMER_BASES]) = +DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = { + + .clock_base = { - .index = CLOCK_REALTIME, - .get_time = &ktime_get_real, - .resolution = KTIME_REALTIME_RES, - }, - { - .index = CLOCK_MONOTONIC, - .get_time = &ktime_get, - .resolution = KTIME_MONOTONIC_RES, - }, + { + .index = CLOCK_REALTIME, + .get_time = &ktime_get_real, + .resolution = KTIME_REALTIME_RES, + }, + { + .index = CLOCK_MONOTONIC, + .get_time = &ktime_get, + .resolution = KTIME_MONOTONIC_RES, + }, + } }; /** @@ -126,7 +130,7 @@ EXPORT_SYMBOL_GPL(ktime_get_ts); * Get the coarse grained time at the softirq based on xtime and * wall_to_monotonic. */ -static void hrtimer_get_softirq_time(struct hrtimer_base *base) +static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) { ktime_t xtim, tomono; unsigned long seq; @@ -138,8 +142,9 @@ static void hrtimer_get_softirq_time(str } while (read_seqretry(&xtime_lock, seq)); - base[CLOCK_REALTIME].softirq_time = xtim; - base[CLOCK_MONOTONIC].softirq_time = ktime_add(xtim, tomono); + base->clock_base[CLOCK_REALTIME].softirq_time = xtim; + base->clock_base[CLOCK_MONOTONIC].softirq_time = + ktime_add(xtim, tomono); } /* @@ -148,7 +153,12 @@ static void hrtimer_get_softirq_time(str */ #ifdef CONFIG_SMP -#define set_curr_timer(b, t) do { (b)->curr_timer = (t); } while (0) +static inline int base_same_cpu(struct hrtimer_clock_base *base) +{ + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + + return (base == &cpu_base->clock_base[base->index]); +} /* * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock @@ -162,19 +172,20 @@ static void hrtimer_get_softirq_time(str * possible to set timer->base = NULL and drop the lock: the timer remains * locked. */ -static struct hrtimer_base *lock_hrtimer_base(const struct hrtimer *timer, - unsigned long *flags) +static +struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, + unsigned long *flags) { - struct hrtimer_base *base; + struct hrtimer_clock_base *base; for (;;) { base = timer->base; if (likely(base != NULL)) { - spin_lock_irqsave(&base->lock, *flags); + spin_lock_irqsave(&base->cpu_base->lock, *flags); if (likely(base == timer->base)) return base; /* The timer has migrated to another CPU: */ - spin_unlock_irqrestore(&base->lock, *flags); + spin_unlock_irqrestore(&base->cpu_base->lock, *flags); } cpu_relax(); } @@ -183,12 +194,14 @@ static struct hrtimer_base *lock_hrtimer /* * Switch the timer base to the current CPU when possible. */ -static inline struct hrtimer_base * -switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_base *base) +static inline struct hrtimer_clock_base * +switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base) { - struct hrtimer_base *new_base; + struct hrtimer_clock_base *new_base; + struct hrtimer_cpu_base *new_cpu_base; - new_base = &__get_cpu_var(hrtimer_bases[base->index]); + new_cpu_base = &__get_cpu_var(hrtimer_bases); + new_base = &new_cpu_base->clock_base[base->index]; if (base != new_base) { /* @@ -200,33 +213,35 @@ switch_hrtimer_base(struct hrtimer *time * completed. There is no conflict as we hold the lock until * the timer is enqueued. */ - if (unlikely(base->curr_timer == timer)) + if (unlikely(base->cpu_base->curr_timer == timer)) return base; /* See the comment in lock_timer_base() */ timer->base = NULL; - spin_unlock(&base->lock); - spin_lock(&new_base->lock); + spin_unlock(&base->cpu_base->lock); + spin_lock(&new_base->cpu_base->lock); timer->base = new_base; } return new_base; } -#else /* CONFIG_SMP */ +# define set_curr_timer(b, t) do { (b)->curr_timer = (t); } while (0) -#define set_curr_timer(b, t) do { } while (0) +#else /* CONFIG_SMP */ -static inline struct hrtimer_base * +static inline struct hrtimer_clock_base * lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) { - struct hrtimer_base *base = timer->base; + struct hrtimer_clock_base *base = timer->base; - spin_lock_irqsave(&base->lock, *flags); + spin_lock_irqsave(&base->cpu_base->lock, *flags); return base; } -#define switch_hrtimer_base(t, b) (b) +# define switch_hrtimer_base(t, b) (b) +# define base_same_cpu(b) 1 +# define set_curr_timer(b, t) do { } while (0) #endif /* !CONFIG_SMP */ @@ -258,9 +273,6 @@ ktime_t ktime_add_ns(const ktime_t kt, u return ktime_add(kt, tmp); } - -#else /* CONFIG_KTIME_SCALAR */ - # endif /* !CONFIG_KTIME_SCALAR */ /* @@ -288,13 +300,477 @@ static unsigned long ktime_divns(const k # define ktime_divns(kt, div) (unsigned long)((kt).tv64 / (div)) #endif /* BITS_PER_LONG >= 64 */ +/* High resolution timer related functions */ +#ifdef CONFIG_HIGH_RES_TIMERS + +static ktime_t last_jiffies_update; + +/* + * Reprogramm the event source with checking both queues for the + * next event + * Called with interrupts disabled and base->lock held + */ +static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base) +{ + int i; + struct hrtimer_clock_base *base = cpu_base->clock_base; + ktime_t expires; + + cpu_base->expires_next.tv64 = KTIME_MAX; + + for (i = HRTIMER_MAX_CLOCK_BASES; i ; i--, base++) { + struct hrtimer *timer; + + if (!base->first) + continue; + timer = rb_entry(base->first, struct hrtimer, node); + expires = ktime_sub(timer->expires, base->offset); + if (expires.tv64 < cpu_base->expires_next.tv64) + cpu_base->expires_next = expires; + } + + if (cpu_base->expires_next.tv64 != KTIME_MAX) + clockevents_set_next_event(cpu_base->expires_next, 1); +} + +/* + * Shared reprogramming for clock_realtime and clock_monotonic + * + * When a new expires first timer is enqueued, we have + * to check, whether it expires earlier than the timer + * for which the hrt time source was armed. + * + * Called with interrupts disabled and base->cpu_base.lock held + */ +static int hrtimer_reprogram(struct hrtimer *timer, + struct hrtimer_clock_base *base) +{ + ktime_t *expires_next = &__get_cpu_var(hrtimer_bases).expires_next; + ktime_t expires = ktime_sub(timer->expires, base->offset); + int res; + + if (!base_same_cpu(base)) + return 0; + + if (expires.tv64 >= expires_next->tv64) + return 0; + + res = clockevents_set_next_event(expires, 0); + if (!res) + *expires_next = expires; + return res; +} + + +/* + * Retrigger next event is called after clock was set + */ +static void retrigger_next_event(void *arg) +{ + struct hrtimer_cpu_base *base; + struct timespec realtime_offset; + + base = &per_cpu(hrtimer_bases, smp_processor_id()); + + /* Adjust CLOCK_REALTIME offset */ + spin_lock(&base->lock); + set_normalized_timespec(&realtime_offset, + -wall_to_monotonic.tv_sec, + -wall_to_monotonic.tv_nsec); + base->clock_base[CLOCK_REALTIME].offset = + timespec_to_ktime(realtime_offset); + + hrtimer_force_reprogram(base); + spin_unlock(&base->lock); +} + +/* + * Clock realtime was set + * + * Change the offset of the realtime clock vs. the monotonic + * clock. Called with xtime lock held ! + * + * We might have to reprogram the high resolution timer interrupt. On + * SMP we call the architecture specific code to retrigger _all_ high + * resolution timer interrupts. On UP we just disable interrupts and + * call the high resolution interrupt code. + */ +void clock_was_set(void) +{ + preempt_disable(); + local_irq_disable(); + + if (hrtimer_hres_active) { + retrigger_next_event(NULL); + local_irq_enable(); + + if (smp_call_function(retrigger_next_event, NULL, 1, 1)) + BUG(); + } else + local_irq_enable(); + preempt_enable(); +} + +/** + * hrtimer_clock_notify - A clock source or a clock event has been installed + * + * Notify the per cpu softirqs to recheck the clock sources and events + */ +void hrtimer_clock_notify(void) +{ + int i; + + for (i = 0; i < NR_CPUS; i++) + set_bit(0, &per_cpu(hrtimer_bases, i).check_clocks); +} + + +static const ktime_t nsec_per_hz = { .tv64 = NSEC_PER_SEC / HZ }; + +/* + * We switched off the global tick source when switching to high resolution + * mode. Update jiffies64. + * + * Must be called with interrupts disabled ! + */ +static void update_jiffies64(ktime_t now) +{ + ktime_t delta; + + write_seqlock(&xtime_lock); + + delta = ktime_sub(now, last_jiffies_update); + if (delta.tv64 >= nsec_per_hz.tv64) { + + delta = ktime_sub(delta, nsec_per_hz); + last_jiffies_update = ktime_add(last_jiffies_update, + nsec_per_hz); + + /* Slow path for long timeouts */ + if (unlikely(delta.tv64 >= nsec_per_hz.tv64)) { + s64 incr = ktime_to_ns(nsec_per_hz); + unsigned long orun = ktime_divns(delta, incr); + + last_jiffies_update = ktime_add_ns(last_jiffies_update, + incr * orun); + jiffies_64 += orun; + } + do_timer(NULL); + } + write_sequnlock(&xtime_lock); +} + +#ifdef CONFIG_NO_HZ +/* + * Called from interrupt entry when then CPU was idle + */ +void update_jiffies(void) +{ + unsigned long flags; + ktime_t now; + + if (unlikely(!hrtimer_hres_active)) + return; + + now = ktime_get(); + + local_irq_save(flags); + update_jiffies64(now); + local_irq_restore(flags); +} + +/* + * Called from the idle thread so careful! + */ +int hrtimer_stop_sched_tick(void) +{ + int cpu = smp_processor_id(); + struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); + unsigned long seq, last_jiffies, next_jiffies; + ktime_t last_update, expires; + unsigned long delta_jiffies; + unsigned long flags; + + if (unlikely(!hrtimer_hres_active)) + return 0; + + local_irq_save(flags); + + do { + seq = read_seqbegin(&xtime_lock); + last_update = last_jiffies_update; + last_jiffies = jiffies; + } while (read_seqretry(&xtime_lock, seq)); + + next_jiffies = get_next_timer_interrupt(last_jiffies); + delta_jiffies = next_jiffies - last_jiffies; + + cpu_base->idle_calls++; + + if ((long)delta_jiffies >= 1) { + /* + * Save the current tick time, so we can restart the + * scheduler tick when we get woken up before the next + * wheel timer expires + */ + cpu_base->idle_tick = cpu_base->sched_timer.expires; + expires = ktime_add_ns(last_update, + nsec_per_hz.tv64 * delta_jiffies); + hrtimer_start(&cpu_base->sched_timer, expires, HRTIMER_ABS); + cpu_base->idle_sleeps++; + cpu_base->idle_jiffies = last_jiffies; + } else { + /* Keep the timer alive */ + cpu_base->idle_tick.tv64 = 0; + if ((long) delta_jiffies < 0) + raise_softirq(TIMER_SOFTIRQ); + } + + if (local_softirq_pending()) { + inc_preempt_count(); + do_softirq(); + dec_preempt_count(); + } + + /* + * RCU normally depends on the timer IRQ kicking completion + * in every tick. We have to do this here now: + */ + if (rcu_pending(cpu)) { + /* + * We are in quiescent state, so advance callbacks: + */ + rcu_advance_callbacks(cpu, 1); + local_irq_enable(); + local_bh_disable(); + rcu_process_callbacks(0); + local_bh_enable(); + } + local_irq_restore(flags); + + return need_resched(); +} + +void hrtimer_restart_sched_tick(void) +{ + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + unsigned long flags; + ktime_t now; + + if (!hrtimer_hres_active || cpu_base->idle_tick.tv64 == 0) + return; + + /* Update jiffies first */ + now = ktime_get(); + + local_irq_save(flags); + update_jiffies64(now); + + /* + * Update process times would randomly account the time we slept to + * whatever the context of the next sched tick is. Enforce that this + * is accounted to idle ! + */ + add_preempt_count(HARDIRQ_OFFSET); + update_process_times(0); + sub_preempt_count(HARDIRQ_OFFSET); + + cpu_base->idle_sleeptime += jiffies - cpu_base->idle_jiffies; + + hrtimer_cancel(&cpu_base->sched_timer); + cpu_base->sched_timer.expires = cpu_base->idle_tick; + hrtimer_forward(&cpu_base->sched_timer, now, nsec_per_hz); + hrtimer_start(&cpu_base->sched_timer, cpu_base->sched_timer.expires, + HRTIMER_ABS); + local_irq_restore(flags); +} + +void show_no_hz_stats(struct seq_file *p) +{ + int cpu; + unsigned long calls = 0, sleeps = 0, time = 0, events = 0; + + for_each_online_cpu(cpu) { + struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu); + + calls += base->idle_calls; + sleeps += base->idle_sleeps; + time += base->idle_sleeptime; + events += base->events; + + seq_printf(p, "nohz cpu%d I:%lu S:%lu T:%lu A:%lu E: %lu\n", + cpu, base->idle_calls, base->idle_sleeps, + base->idle_sleeptime, base->idle_sleeps ? + base->idle_sleeptime / sleeps : 0, base->events); + } +#ifdef CONFIG_SMP + seq_printf(p, "nohz total I:%lu S:%lu T:%lu A:%lu E:%lu\n", + calls, sleeps, time, sleeps ? time / sleeps : 0, events); +#endif +} + +#endif + +/* + * We rearm the timer until we get disabled by the idle code + */ +static int hrtimer_sched_tick(struct hrtimer *timer) +{ + unsigned long flags; + struct hrtimer_cpu_base *cpu_base = + container_of(timer, struct hrtimer_cpu_base, sched_timer); + + local_irq_save(flags); + /* + * Do not call, when we are not in irq context and have + * no valid regs pointer + */ + if (cpu_base->sched_regs) { + update_process_times(user_mode(cpu_base->sched_regs)); + profile_tick(CPU_PROFILING, cpu_base->sched_regs); + } + + hrtimer_forward(timer, hrtimer_cb_get_time(timer), nsec_per_hz); + local_irq_restore(flags); + + return HRTIMER_RESTART; +} + +/* + * A change in the clock source or clock events was detected. + * Check the clock source and the events, whether we can switch to + * high resolution mode or not. + * + * TODO: Handle the removal of clock sources / events + */ +static void hrtimer_check_clocks(void) +{ + struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); + unsigned long flags; + ktime_t now; + + if (!test_and_clear_bit(0, &base->check_clocks)) + return; + + if (!timekeeping_is_continuous()) + return; + + if (!clockevents_next_event_available()) + return; + + local_irq_save(flags); + + if (base->hres_active) { + local_irq_restore(flags); + return; + } + + now = ktime_get(); + if (clockevents_init_next_event()) { + local_irq_restore(flags); + return; + } + base->hres_active = 1; + + /* Did we start the jiffies update yet ? */ + if (last_jiffies_update.tv64 == 0) { + write_seqlock(&xtime_lock); + last_jiffies_update = now; + write_sequnlock(&xtime_lock); + } + + /* + * Emulate tick processing via per-CPU hrtimers: + */ + hrtimer_init(&base->sched_timer, CLOCK_MONOTONIC, HRTIMER_REL); + base->sched_timer.function = hrtimer_sched_tick; + base->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE; + hrtimer_start(&base->sched_timer, nsec_per_hz, HRTIMER_REL); + + /* "Retrigger" the interrupt to get things going */ + retrigger_next_event(NULL); + local_irq_restore(flags); + printk(KERN_INFO "hrtimers: Switched to high resolution mode CPU %d\n", + smp_processor_id()); +} + +static inline int hrtimer_cb_pending(const struct hrtimer *timer) +{ + return !list_empty(&timer->cb_entry); +} + +static inline void hrtimer_remove_cb_pending(struct hrtimer *timer) +{ + list_del_init(&timer->cb_entry); +} + +static inline void hrtimer_add_cb_pending(struct hrtimer *timer, + struct hrtimer_clock_base *base) +{ + list_add_tail(&timer->cb_entry, &base->cpu_base->cb_pending); + timer->state = HRTIMER_PENDING; +} + +static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) +{ + base->expires_next.tv64 = KTIME_MAX; + set_bit(0, &base->check_clocks); + base->hres_active = 0; + INIT_LIST_HEAD(&base->cb_pending); +} + +static inline void hrtimer_init_timer_hres(struct hrtimer *timer) +{ + INIT_LIST_HEAD(&timer->cb_entry); +} + +static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, + struct hrtimer_clock_base *base) +{ + /* + * High resolution timers, when active try to + * reprogram. If the timer is in the past we just move + * it to the expired list and schedule the softirq. + */ + if (hrtimer_hres_active && hrtimer_reprogram(timer, base)) { + /* + * HRTIMER_CB_IRQSAFE_NO_RESTART signals that the timer is not + * requested to be rearmed by the callback function. Just call + * the callback instead of going through the softirq. + */ + if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_RESTART) { + int ret = timer->function(timer); + + BUG_ON(ret != HRTIMER_NORESTART); + } else { + hrtimer_add_cb_pending(timer, base); + raise_softirq(HRTIMER_SOFTIRQ); + } + return 1; + } + return 0; +} + +#else + +# define hrtimer_hres_active 0 +# define hrtimer_check_clocks() do { } while (0) +# define hrtimer_enqueue_reprogram(t,b) 0 +# define hrtimer_force_reprogram(b) do { } while (0) +# define hrtimer_cb_pending(t) 0 +# define hrtimer_remove_cb_pending(t) do { } while (0) +# define hrtimer_init_hres(c) do { } while (0) +# define hrtimer_init_timer_hres(t) do { } while (0) + +#endif /* CONFIG_HIGH_RES_TIMERS */ + /* * Counterpart to lock_timer_base above: */ static inline void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) { - spin_unlock_irqrestore(&timer->base->lock, *flags); + spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags); } /** @@ -345,7 +821,8 @@ hrtimer_forward(struct hrtimer *timer, k * The timer is inserted in expiry order. Insertion into the * red black tree is O(log(n)). Must hold the base lock. */ -static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) +static void enqueue_hrtimer(struct hrtimer *timer, + struct hrtimer_clock_base *base) { struct rb_node **link = &base->active.rb_node; struct rb_node *parent = NULL; @@ -371,12 +848,18 @@ static void enqueue_hrtimer(struct hrtim * Insert the timer to the rbtree and check whether it * replaces the first pending timer */ - rb_link_node(&timer->node, parent, link); - rb_insert_color(&timer->node, &base->active); - if (!base->first || timer->expires.tv64 < - rb_entry(base->first, struct hrtimer, node)->expires.tv64) + rb_entry(base->first, struct hrtimer, node)->expires.tv64) { + + if (hrtimer_enqueue_reprogram(timer, base)) + return; + base->first = &timer->node; + } + + rb_link_node(&timer->node, parent, link); + rb_insert_color(&timer->node, &base->active); + timer->state = HRTIMER_ACTIVE; } /* @@ -384,26 +867,36 @@ static void enqueue_hrtimer(struct hrtim * * Caller must hold the base lock. */ -static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) -{ - /* - * Remove the timer from the rbtree and replace the - * first entry pointer if necessary. - */ - if (base->first == &timer->node) - base->first = rb_next(&timer->node); - rb_erase(&timer->node, &base->active); - timer->node.rb_parent = HRTIMER_INACTIVE; +static void __remove_hrtimer(struct hrtimer *timer, + struct hrtimer_clock_base *base, + enum hrtimer_state newstate, int reprogram) +{ + /* High res. callback list. NOP for !HIGHRES */ + if (hrtimer_cb_pending(timer)) + hrtimer_remove_cb_pending(timer); + else { + /* + * Remove the timer from the rbtree and replace the + * first entry pointer if necessary. + */ + if (base->first == &timer->node) { + base->first = rb_next(&timer->node); + if (reprogram && hrtimer_hres_active) + hrtimer_force_reprogram(base->cpu_base); + } + rb_erase(&timer->node, &base->active); + } + timer->state = newstate; } /* * remove hrtimer, called with base lock held */ static inline int -remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) +remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) { if (hrtimer_active(timer)) { - __remove_hrtimer(timer, base); + __remove_hrtimer(timer, base, HRTIMER_INACTIVE, 1); return 1; } return 0; @@ -423,7 +916,7 @@ remove_hrtimer(struct hrtimer *timer, st int hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) { - struct hrtimer_base *base, *new_base; + struct hrtimer_clock_base *base, *new_base; unsigned long flags; int ret; @@ -471,13 +964,13 @@ EXPORT_SYMBOL_GPL(hrtimer_start); */ int hrtimer_try_to_cancel(struct hrtimer *timer) { - struct hrtimer_base *base; + struct hrtimer_clock_base *base; unsigned long flags; int ret = -1; base = lock_hrtimer_base(timer, &flags); - if (base->curr_timer != timer) + if (base->cpu_base->curr_timer != timer) ret = remove_hrtimer(timer, base); unlock_hrtimer_base(timer, &flags); @@ -515,7 +1008,7 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel); */ ktime_t hrtimer_get_remaining(const struct hrtimer *timer) { - struct hrtimer_base *base; + struct hrtimer_clock_base *base; unsigned long flags; ktime_t rem; @@ -536,26 +1029,36 @@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining) */ ktime_t hrtimer_get_next_event(void) { - struct hrtimer_base *base = __get_cpu_var(hrtimer_bases); + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + struct hrtimer_clock_base *base = cpu_base->clock_base; ktime_t delta, mindelta = { .tv64 = KTIME_MAX }; unsigned long flags; int i; - for (i = 0; i < MAX_HRTIMER_BASES; i++, base++) { + /* + * In high-res mode we dont need to get the next high-res + * event on a tickless system: + */ + if (hrtimer_hres_active) + return mindelta; + + spin_lock_irqsave(&cpu_base->lock, flags); + + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { struct hrtimer *timer; - spin_lock_irqsave(&base->lock, flags); - if (!base->first) { - spin_unlock_irqrestore(&base->lock, flags); + if (!base->first) continue; - } + timer = rb_entry(base->first, struct hrtimer, node); delta.tv64 = timer->expires.tv64; - spin_unlock_irqrestore(&base->lock, flags); delta = ktime_sub(delta, base->get_time()); if (delta.tv64 < mindelta.tv64) mindelta.tv64 = delta.tv64; } + + spin_unlock_irqrestore(&cpu_base->lock, flags); + if (mindelta.tv64 < 0) mindelta.tv64 = 0; return mindelta; @@ -572,17 +1075,18 @@ ktime_t hrtimer_get_next_event(void) void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode) { - struct hrtimer_base *bases; + struct hrtimer_cpu_base *cpu_base; memset(timer, 0, sizeof(struct hrtimer)); - bases = per_cpu(hrtimer_bases, raw_smp_processor_id()); + cpu_base = &__raw_get_cpu_var(hrtimer_bases); if (clock_id == CLOCK_REALTIME && mode != HRTIMER_ABS) clock_id = CLOCK_MONOTONIC; - timer->base = &bases[clock_id]; - timer->node.rb_parent = HRTIMER_INACTIVE; + timer->base = &cpu_base->clock_base[clock_id]; + timer->state = HRTIMER_INACTIVE; + hrtimer_init_timer_hres(timer); } EXPORT_SYMBOL_GPL(hrtimer_init); @@ -597,21 +1101,145 @@ EXPORT_SYMBOL_GPL(hrtimer_init); */ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) { - struct hrtimer_base *bases; + struct hrtimer_cpu_base *cpu_base; - bases = per_cpu(hrtimer_bases, raw_smp_processor_id()); - *tp = ktime_to_timespec(bases[which_clock].resolution); + cpu_base = &__raw_get_cpu_var(hrtimer_bases); + *tp = ktime_to_timespec(cpu_base->clock_base[which_clock].resolution); return 0; } EXPORT_SYMBOL_GPL(hrtimer_get_res); +#ifdef CONFIG_HIGH_RES_TIMERS + +/* + * High resolution timer interrupt + * Called with interrupts disabled + */ +void hrtimer_interrupt(struct pt_regs *regs) +{ + struct hrtimer_clock_base *base; + ktime_t expires_next, now; + int i, raise = 0, cpu = smp_processor_id(); + struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); + + BUG_ON(!cpu_base->hres_active); + + /* Store the regs for an possible sched_timer callback */ + cpu_base->sched_regs = regs; + cpu_base->events++; + + retry: + now = ktime_get(); + + /* Check, if the jiffies need an update */ + update_jiffies64(now); + + expires_next.tv64 = KTIME_MAX; + + base = cpu_base->clock_base; + + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { + ktime_t basenow; + struct rb_node *node; + + spin_lock(&cpu_base->lock); + + basenow = ktime_add(now, base->offset); + + while ((node = base->first)) { + struct hrtimer *timer; + + timer = rb_entry(node, struct hrtimer, node); + + if (basenow.tv64 < timer->expires.tv64) { + ktime_t expires; + + expires = ktime_sub(timer->expires, + base->offset); + if (expires.tv64 < expires_next.tv64) + expires_next = expires; + break; + } + __remove_hrtimer(timer, base, HRTIMER_PENDING, 0); + + if (timer->cb_mode != HRTIMER_CB_SOFTIRQ) { + if (timer->function(timer) == HRTIMER_RESTART) + enqueue_hrtimer(timer, base); + else + timer->state = HRTIMER_INACTIVE; + } else { + hrtimer_add_cb_pending(timer, base); + raise = 1; + } + } + spin_unlock(&cpu_base->lock); + base++; + } + + cpu_base->expires_next = expires_next; + + /* Reprogramming necessary ? */ + if (expires_next.tv64 != KTIME_MAX) { + if (clockevents_set_next_event(expires_next, 0)) + goto retry; + } + + /* Invalidate regs */ + cpu_base->sched_regs = NULL; + + /* Raise softirq ? */ + if (raise) + raise_softirq(HRTIMER_SOFTIRQ); +} + +static void run_hrtimer_softirq(struct softirq_action *h) +{ + struct hrtimer_cpu_base *cpu_base; + + cpu_base = &per_cpu(hrtimer_bases, smp_processor_id()); + + spin_lock_irq(&cpu_base->lock); + + while (!list_empty(&cpu_base->cb_pending)) { + struct hrtimer *timer; + int (*fn)(struct hrtimer *); + int restart; + + timer = list_entry(cpu_base->cb_pending.next, + struct hrtimer, cb_entry); + fn = timer->function; + set_curr_timer(cpu_base, timer); + __remove_hrtimer(timer, timer->base, HRTIMER_INACTIVE, 0); + spin_unlock_irq(&cpu_base->lock); + + restart = fn(timer); + + spin_lock_irq(&cpu_base->lock); + + if (restart == HRTIMER_RESTART) { + BUG_ON(hrtimer_active(timer)); + enqueue_hrtimer(timer, timer->base); + } else if (hrtimer_active(timer)) { + /* Timer was rearmed on another CPU: */ + if (timer->base->first == &timer->node) + hrtimer_reprogram(timer, timer->base); + } + } + set_curr_timer(cpu_base, NULL); + spin_unlock_irq(&cpu_base->lock); +} + +#endif /* CONFIG_HIGH_RES_TIMERS */ + /* * Expire the per base hrtimer-queue: */ -static inline void run_hrtimer_queue(struct hrtimer_base *base) +static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base, + int index) { struct rb_node *node; + struct hrtimer_clock_base *base = &cpu_base->clock_base[index]; if (!base->first) return; @@ -619,7 +1247,7 @@ static inline void run_hrtimer_queue(str if (base->get_softirq_time) base->softirq_time = base->get_softirq_time(); - spin_lock_irq(&base->lock); + spin_lock_irq(&cpu_base->lock); while ((node = base->first)) { struct hrtimer *timer; @@ -631,35 +1259,44 @@ static inline void run_hrtimer_queue(str break; fn = timer->function; - set_curr_timer(base, timer); - __remove_hrtimer(timer, base); - spin_unlock_irq(&base->lock); + set_curr_timer(cpu_base, timer); + __remove_hrtimer(timer, base, HRTIMER_INACTIVE, 0); + spin_unlock_irq(&cpu_base->lock); restart = fn(timer); - spin_lock_irq(&base->lock); + spin_lock_irq(&cpu_base->lock); - if (restart != HRTIMER_NORESTART) { + if (restart == HRTIMER_RESTART) { BUG_ON(hrtimer_active(timer)); enqueue_hrtimer(timer, base); } } - set_curr_timer(base, NULL); - spin_unlock_irq(&base->lock); + set_curr_timer(cpu_base, NULL); + spin_unlock_irq(&cpu_base->lock); } /* * Called from timer softirq every jiffy, expire hrtimers: + * + * For HRT its the fall back code to run the softirq in the timer + * softirq context in case the hrtimer initialization failed or has + * not been done yet. */ void hrtimer_run_queues(void) { - struct hrtimer_base *base = __get_cpu_var(hrtimer_bases); + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); int i; - hrtimer_get_softirq_time(base); + hrtimer_check_clocks(); + + if (hrtimer_hres_active) + return; - for (i = 0; i < MAX_HRTIMER_BASES; i++) - run_hrtimer_queue(&base[i]); + hrtimer_get_softirq_time(cpu_base); + + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) + run_hrtimer_queue(cpu_base, i); } /* @@ -682,6 +1319,9 @@ void hrtimer_init_sleeper(struct hrtimer { sl->timer.function = hrtimer_wakeup; sl->task = task; +#ifdef CONFIG_HIGH_RES_TIMERS + sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_RESTART; +#endif } static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) @@ -692,7 +1332,8 @@ static int __sched do_nanosleep(struct h set_current_state(TASK_INTERRUPTIBLE); hrtimer_start(&t->timer, t->timer.expires, mode); - schedule(); + if (likely(t->task)) + schedule(); hrtimer_cancel(&t->timer); mode = HRTIMER_ABS; @@ -788,24 +1429,27 @@ sys_nanosleep(struct timespec __user *rq */ static void __devinit init_hrtimers_cpu(int cpu) { - struct hrtimer_base *base = per_cpu(hrtimer_bases, cpu); + struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); int i; - for (i = 0; i < MAX_HRTIMER_BASES; i++, base++) - spin_lock_init(&base->lock); + spin_lock_init(&cpu_base->lock); + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) + cpu_base->clock_base[i].cpu_base = cpu_base; + + hrtimer_init_hres(cpu_base); } #ifdef CONFIG_HOTPLUG_CPU -static void migrate_hrtimer_list(struct hrtimer_base *old_base, - struct hrtimer_base *new_base) +static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, + struct hrtimer_clock_base *new_base) { struct hrtimer *timer; struct rb_node *node; while ((node = rb_first(&old_base->active))) { timer = rb_entry(node, struct hrtimer, node); - __remove_hrtimer(timer, old_base); + __remove_hrtimer(timer, old_base, HRTIMER_INACTIVE, 0); timer->base = new_base; enqueue_hrtimer(timer, new_base); } @@ -813,29 +1457,26 @@ static void migrate_hrtimer_list(struct static void migrate_hrtimers(int cpu) { - struct hrtimer_base *old_base, *new_base; + struct hrtimer_cpu_base *old_base, *new_base; int i; BUG_ON(cpu_online(cpu)); - old_base = per_cpu(hrtimer_bases, cpu); - new_base = get_cpu_var(hrtimer_bases); + old_base = &per_cpu(hrtimer_bases, cpu); + new_base = &get_cpu_var(hrtimer_bases); local_irq_disable(); - for (i = 0; i < MAX_HRTIMER_BASES; i++) { - - spin_lock(&new_base->lock); - spin_lock(&old_base->lock); + spin_lock(&new_base->lock); + spin_lock(&old_base->lock); + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { BUG_ON(old_base->curr_timer); - migrate_hrtimer_list(old_base, new_base); - - spin_unlock(&old_base->lock); - spin_unlock(&new_base->lock); - old_base++; - new_base++; + migrate_hrtimer_list(old_base->clock_base + 1, + new_base->clock_base + 1); } + spin_unlock(&old_base->lock); + spin_unlock(&new_base->lock); local_irq_enable(); put_cpu_var(hrtimer_bases); @@ -875,5 +1516,8 @@ void __init hrtimers_init(void) hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); register_cpu_notifier(&hrtimers_nb); +#ifdef CONFIG_HIGH_RES_TIMERS + open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq, NULL); +#endif } Index: linux-hres.q/kernel/itimer.c =================================================================== --- linux-hres.q.orig/kernel/itimer.c +++ linux-hres.q/kernel/itimer.c @@ -136,7 +136,7 @@ int it_real_fn(struct hrtimer *timer) send_group_sig_info(SIGALRM, SEND_SIG_PRIV, sig->tsk); if (sig->it_real_incr.tv64 != 0) { - hrtimer_forward(timer, timer->base->softirq_time, + hrtimer_forward(timer, hrtimer_cb_get_time(timer), sig->it_real_incr); return HRTIMER_RESTART; } Index: linux-hres.q/kernel/posix-timers.c =================================================================== --- linux-hres.q.orig/kernel/posix-timers.c +++ linux-hres.q/kernel/posix-timers.c @@ -356,7 +356,7 @@ static int posix_timer_fn(struct hrtimer if (timr->it.real.interval.tv64 != 0) { timr->it_overrun += hrtimer_forward(timer, - timer->base->softirq_time, + hrtimer_cb_get_time(timer), timr->it.real.interval); ret = HRTIMER_RESTART; ++timr->it_requeue_pending; Index: linux-hres.q/kernel/printk.c =================================================================== --- linux-hres.q.orig/kernel/printk.c +++ linux-hres.q/kernel/printk.c @@ -337,7 +337,10 @@ static void __call_console_drivers(unsig static void _call_console_drivers(unsigned long start, unsigned long end, int msg_log_level) { - if (msg_log_level < console_loglevel && + if ( +#ifndef CONFIG_PRINTK_IGNORE_LOGLEVEL + msg_log_level < console_loglevel && +#endif console_drivers && start != end) { if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) { /* wrapped write */ Index: linux-hres.q/kernel/rcupdate.c =================================================================== --- linux-hres.q.orig/kernel/rcupdate.c +++ linux-hres.q/kernel/rcupdate.c @@ -449,7 +449,7 @@ static void __rcu_process_callbacks(stru rcu_do_batch(rdp); } -static void rcu_process_callbacks(unsigned long unused) +void rcu_process_callbacks(unsigned long unused) { __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data)); __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); @@ -504,6 +504,17 @@ int rcu_needs_cpu(int cpu) return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu)); } +void rcu_advance_callbacks(int cpu, int user) +{ + if (user || + (idle_cpu(cpu) && !in_softirq() && + hardirq_count() <= (1 << HARDIRQ_SHIFT))) { + rcu_qsctr_inc(cpu); + rcu_bh_qsctr_inc(cpu); + } else if (!in_softirq()) + rcu_bh_qsctr_inc(cpu); +} + void rcu_check_callbacks(int cpu, int user) { if (user || Index: linux-hres.q/kernel/sched.c =================================================================== --- linux-hres.q.orig/kernel/sched.c +++ linux-hres.q/kernel/sched.c @@ -818,6 +818,11 @@ static void deactivate_task(struct task_ * the target CPU. */ #ifdef CONFIG_SMP + +#ifndef tsk_is_polling +#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) +#endif + static void resched_task(task_t *p) { int cpu; @@ -833,9 +838,9 @@ static void resched_task(task_t *p) if (cpu == smp_processor_id()) return; - /* NEED_RESCHED must be visible before we test POLLING_NRFLAG */ + /* NEED_RESCHED must be visible before we test polling */ smp_mb(); - if (!test_tsk_thread_flag(p, TIF_POLLING_NRFLAG)) + if (!tsk_is_polling(p)) smp_send_reschedule(cpu); } #else @@ -4142,7 +4147,7 @@ EXPORT_SYMBOL(yield); */ void __sched io_schedule(void) { - struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); + struct runqueue *rq = &__raw_get_cpu_var(runqueues); atomic_inc(&rq->nr_iowait); schedule(); @@ -4153,7 +4158,7 @@ EXPORT_SYMBOL(io_schedule); long __sched io_schedule_timeout(long timeout) { - struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); + struct runqueue *rq = &__raw_get_cpu_var(runqueues); long ret; atomic_inc(&rq->nr_iowait); Index: linux-hres.q/kernel/softlockup.c =================================================================== --- linux-hres.q.orig/kernel/softlockup.c +++ linux-hres.q/kernel/softlockup.c @@ -36,7 +36,7 @@ static struct notifier_block panic_block void touch_softlockup_watchdog(void) { - per_cpu(touch_timestamp, raw_smp_processor_id()) = jiffies; + __raw_get_cpu_var(touch_timestamp) = jiffies; } EXPORT_SYMBOL(touch_softlockup_watchdog); Index: linux-hres.q/kernel/sysctl.c =================================================================== --- linux-hres.q.orig/kernel/sysctl.c +++ linux-hres.q/kernel/sysctl.c @@ -487,6 +487,7 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif +#if 0 #ifdef CONFIG_MAGIC_SYSRQ { .ctl_name = KERN_SYSRQ, @@ -497,6 +498,7 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif +#endif { .ctl_name = KERN_CADPID, .procname = "cad_pid", @@ -579,6 +581,16 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif +#ifdef CONFIG_NO_IDLE_HZ + { + .ctl_name = KERN_TIMEOUT_GRANULARITY, + .procname = "timeout_granularity", + .data = &timeout_granularity, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif { .ctl_name = KERN_PIDMAX, .procname = "pid_max", Index: linux-hres.q/kernel/time.c =================================================================== --- linux-hres.q.orig/kernel/time.c +++ linux-hres.q/kernel/time.c @@ -523,6 +523,7 @@ EXPORT_SYMBOL(do_gettimeofday); #else +#ifndef CONFIG_GENERIC_TIME /* * Simulate gettimeofday using do_gettimeofday which only allows a timeval * and therefore only yields usec accuracy @@ -537,6 +538,7 @@ void getnstimeofday(struct timespec *tv) } EXPORT_SYMBOL_GPL(getnstimeofday); #endif +#endif /* Converts Gregorian date to seconds since 1970-01-01 00:00:00. * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 @@ -641,6 +643,277 @@ struct timeval ns_to_timeval(const s64 n return tv; } +/* + * Convert jiffies to milliseconds and back. + * + * Avoid unnecessary multiplications/divisions in the + * two most common HZ cases: + */ +unsigned int jiffies_to_msecs(const unsigned long j) +{ +#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) + return (MSEC_PER_SEC / HZ) * j; +#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) + return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); +#else + return (j * MSEC_PER_SEC) / HZ; +#endif +} + +EXPORT_SYMBOL(jiffies_to_msecs); + +unsigned int jiffies_to_usecs(const unsigned long j) +{ +#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) + return (USEC_PER_SEC / HZ) * j; +#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) + return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC); +#else + return (j * USEC_PER_SEC) / HZ; +#endif +} + +EXPORT_SYMBOL(jiffies_to_usecs); + +/* + * When we convert to jiffies then we interpret incoming values + * the following way: + * + * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET) + * + * - 'too large' values [that would result in larger than + * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too. + * + * - all other values are converted to jiffies by either multiplying + * the input value by a factor or dividing it with a factor + * + * We must also be careful about 32-bit overflows. + */ +unsigned long msecs_to_jiffies(const unsigned int m) +{ + /* + * Negative value, means infinite timeout: + */ + if ((int)m < 0) + return MAX_JIFFY_OFFSET; + +#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) + /* + * HZ is equal to or smaller than 1000, and 1000 is a nice + * round multiple of HZ, divide with the factor between them, + * but round upwards: + */ + return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ); +#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) + /* + * HZ is larger than 1000, and HZ is a nice round multiple of + * 1000 - simply multiply with the factor between them. + * + * But first make sure the multiplication result cannot + * overflow: + */ + if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) + return MAX_JIFFY_OFFSET; + + return m * (HZ / MSEC_PER_SEC); +#else + /* + * Generic case - multiply, round and divide. But first + * check that if we are doing a net multiplication, that + * we wouldnt overflow: + */ + if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) + return MAX_JIFFY_OFFSET; + + return (m * HZ + MSEC_PER_SEC - 1) / MSEC_PER_SEC; +#endif +} + +EXPORT_SYMBOL(msecs_to_jiffies); + +unsigned long usecs_to_jiffies(const unsigned int u) +{ + if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET)) + return MAX_JIFFY_OFFSET; +#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) + return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ); +#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) + return u * (HZ / USEC_PER_SEC); +#else + return (u * HZ + USEC_PER_SEC - 1) / USEC_PER_SEC; +#endif +} + +EXPORT_SYMBOL(usecs_to_jiffies); + +/* + * The TICK_NSEC - 1 rounds up the value to the next resolution. Note + * that a remainder subtract here would not do the right thing as the + * resolution values don't fall on second boundries. I.e. the line: + * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding. + * + * Rather, we just shift the bits off the right. + * + * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec + * value to a scaled second value. + */ +unsigned long +timespec_to_jiffies(const struct timespec *value) +{ + unsigned long sec = value->tv_sec; + long nsec = value->tv_nsec + TICK_NSEC - 1; + + if (sec >= MAX_SEC_IN_JIFFIES){ + sec = MAX_SEC_IN_JIFFIES; + nsec = 0; + } + return (((u64)sec * SEC_CONVERSION) + + (((u64)nsec * NSEC_CONVERSION) >> + (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; + +} + +EXPORT_SYMBOL(timespec_to_jiffies); + +void +jiffies_to_timespec(const unsigned long jiffies, struct timespec *value) +{ + /* + * Convert jiffies to nanoseconds and separate with + * one divide. + */ + u64 nsec = (u64)jiffies * TICK_NSEC; + value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &value->tv_nsec); +} + +/* Same for "timeval" + * + * Well, almost. The problem here is that the real system resolution is + * in nanoseconds and the value being converted is in micro seconds. + * Also for some machines (those that use HZ = 1024, in-particular), + * there is a LARGE error in the tick size in microseconds. + + * The solution we use is to do the rounding AFTER we convert the + * microsecond part. Thus the USEC_ROUND, the bits to be shifted off. + * Instruction wise, this should cost only an additional add with carry + * instruction above the way it was done above. + */ +unsigned long +timeval_to_jiffies(const struct timeval *value) +{ + unsigned long sec = value->tv_sec; + long usec = value->tv_usec; + + if (sec >= MAX_SEC_IN_JIFFIES){ + sec = MAX_SEC_IN_JIFFIES; + usec = 0; + } + return (((u64)sec * SEC_CONVERSION) + + (((u64)usec * USEC_CONVERSION + USEC_ROUND) >> + (USEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; +} + +void jiffies_to_timeval(const unsigned long jiffies, struct timeval *value) +{ + /* + * Convert jiffies to nanoseconds and separate with + * one divide. + */ + u64 nsec = (u64)jiffies * TICK_NSEC; + long tv_usec; + + value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &tv_usec); + tv_usec /= NSEC_PER_USEC; + value->tv_usec = tv_usec; +} + +/* + * Convert jiffies/jiffies_64 to clock_t and back. + */ +clock_t jiffies_to_clock_t(long x) +{ +#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 + return x / (HZ / USER_HZ); +#else + u64 tmp = (u64)x * TICK_NSEC; + do_div(tmp, (NSEC_PER_SEC / USER_HZ)); + return (long)tmp; +#endif +} + +EXPORT_SYMBOL(jiffies_to_clock_t); + +unsigned long clock_t_to_jiffies(unsigned long x) +{ +#if (HZ % USER_HZ)==0 + if (x >= ~0UL / (HZ / USER_HZ)) + return ~0UL; + return x * (HZ / USER_HZ); +#else + u64 jif; + + /* Don't worry about loss of precision here .. */ + if (x >= ~0UL / HZ * USER_HZ) + return ~0UL; + + /* .. but do try to contain it here */ + jif = x * (u64) HZ; + do_div(jif, USER_HZ); + return jif; +#endif +} + +EXPORT_SYMBOL(clock_t_to_jiffies); + +u64 jiffies_64_to_clock_t(u64 x) +{ +#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 + do_div(x, HZ / USER_HZ); +#else + /* + * There are better ways that don't overflow early, + * but even this doesn't overflow in hundreds of years + * in 64 bits, so.. + */ + x *= TICK_NSEC; + do_div(x, (NSEC_PER_SEC / USER_HZ)); +#endif + return x; +} + +EXPORT_SYMBOL(jiffies_64_to_clock_t); + +u64 nsec_to_clock_t(u64 x) +{ +#if (NSEC_PER_SEC % USER_HZ) == 0 + do_div(x, (NSEC_PER_SEC / USER_HZ)); +#elif (USER_HZ % 512) == 0 + x *= USER_HZ/512; + do_div(x, (NSEC_PER_SEC / 512)); +#else + /* + * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024, + * overflow after 64.99 years. + * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ... + */ + x *= 9; + do_div(x, (unsigned long)((9ull * NSEC_PER_SEC + (USER_HZ/2)) + / USER_HZ)); +#endif + return x; +} + +int nsec_to_timestamp(char *s, u64 t) +{ + unsigned long nsec_rem = do_div(t, NSEC_PER_SEC); + return sprintf(s, "[%5lu.%06lu]", (unsigned long)t, + nsec_rem/NSEC_PER_USEC); +} +__attribute__((weak)) unsigned long long timestamp_clock(void) +{ + return sched_clock(); +} + #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void) { Index: linux-hres.q/kernel/time/Kconfig =================================================================== --- /dev/null +++ linux-hres.q/kernel/time/Kconfig @@ -0,0 +1,35 @@ +# +# Timer subsystem related configuration options +# +config HIGH_RES_TIMERS + bool "High Resolution Timer Support" + depends on GENERIC_TIME + help + This option enables high resolution timer support. If your + hardware is not capable then this option only increases + the size of the kernel image. + +config NO_HZ + bool "Tickless System (Dynamic Ticks)" + depends on GENERIC_TIME && HIGH_RES_TIMERS + help + This option enables a tickless system: timer interrupts will + only trigger on an as-needed basis both when the system is + busy and when the system is idle. + +config NO_IDLE_HZ + bool + default y + depends on NO_HZ + +config HIGH_RES_RESOLUTION + int "High Resolution Timer resolution (nanoseconds)" + depends on HIGH_RES_TIMERS + default 1000 + help + This sets the resolution in nanoseconds of the high resolution + timers. Too fine a resolution (small a number) will usually + not be observable due to normal system latencies. For an + 800 MHz processor about 10,000 (10 microseconds) is recommended as a + finest resolution. If you don't need that sort of resolution, + larger values may generate less overhead. Index: linux-hres.q/kernel/time/Makefile =================================================================== --- /dev/null +++ linux-hres.q/kernel/time/Makefile @@ -0,0 +1,3 @@ +obj-y += clocksource.o jiffies.o + +obj-$(CONFIG_GENERIC_TIME) += clockevents.o Index: linux-hres.q/kernel/time/clockevents.c =================================================================== --- /dev/null +++ linux-hres.q/kernel/time/clockevents.c @@ -0,0 +1,490 @@ +/* + * linux/kernel/time/clockevents.c + * + * This file contains functions which manage clock event drivers. + * + * Copyright(C) 2005-2006, Thomas Gleixner + * Copyright(C) 2005-2006, Red Hat, Inc., Ingo Molnar + * + * We have two types of clock event devices: + * - global events (one device per system) + * - local events (one device per cpu) + * + * We assign the various time(r) related interrupts to those devices + * + * - global tick + * - profiling (per cpu) + * - next timer events (per cpu) + * + * TODO: + * - implement variable frequency profiling + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAX_CLOCK_EVENTS 4 +#define GLOBAL_CLOCK_EVENT MAX_CLOCK_EVENTS + +struct event_descr { + struct clock_event *event; + unsigned int mode; + unsigned int real_caps; + struct irqaction action; +}; + +struct local_events { + int installed; + struct event_descr events[MAX_CLOCK_EVENTS]; + struct clock_event *nextevt; +}; + +/* Variables related to the global event source */ +static __read_mostly struct event_descr global_eventsource; + +/* Variables related to the per cpu local event sources */ +static DEFINE_PER_CPU(struct local_events, local_eventsources); + +/* lock to protect the above */ +static DEFINE_SPINLOCK(events_lock); + +/* + * Math helper. Convert a latch value to ns + */ +unsigned long clockevent_delta2ns(unsigned long latch, struct clock_event *evt) +{ + u64 clc = ((u64) latch << evt->shift); + + do_div(clc, evt->mult); + if (clc < KTIME_MONOTONIC_RES.tv64) + clc = KTIME_MONOTONIC_RES.tv64; + if (clc > LONG_MAX) + clc = LONG_MAX; + + return (unsigned long) clc; +} + +/* + * Bootup and lowres handler: ticks only + */ +static void handle_tick(struct pt_regs *regs) +{ + BUG_ON(hrtimer_hres_active); + + write_seqlock(&xtime_lock); + do_timer(regs); + write_sequnlock(&xtime_lock); +} + +/* + * Bootup and lowres handler: ticks and update_process_times + */ +static void handle_tick_update(struct pt_regs *regs) +{ + BUG_ON(hrtimer_hres_active); + + write_seqlock(&xtime_lock); + do_timer(regs); + write_sequnlock(&xtime_lock); + + update_process_times(user_mode(regs)); +} + +/* + * Bootup and lowres handler: ticks and profileing + */ +static void handle_tick_profile(struct pt_regs *regs) +{ + BUG_ON(hrtimer_hres_active); + + write_seqlock(&xtime_lock); + do_timer(regs); + write_sequnlock(&xtime_lock); + + profile_tick(CPU_PROFILING, regs); +} + +/* + * Bootup and lowres handler: ticks, update_process_times and profiling + */ +static void handle_tick_update_profile(struct pt_regs *regs) +{ + BUG_ON(hrtimer_hres_active); + + write_seqlock(&xtime_lock); + do_timer(regs); + write_sequnlock(&xtime_lock); + + update_process_times(user_mode(regs)); + profile_tick(CPU_PROFILING, regs); +} + +/* + * Bootup and lowres handler: update_process_times + */ +static void handle_update(struct pt_regs *regs) +{ + BUG_ON(hrtimer_hres_active); + + update_process_times(user_mode(regs)); +} + +/* + * Bootup and lowres handler: update_process_times and profiling + */ +static void handle_update_profile(struct pt_regs *regs) +{ + BUG_ON(hrtimer_hres_active); + + update_process_times(user_mode(regs)); + profile_tick(CPU_PROFILING, regs); +} + +/* + * Bootup and lowres handler: profiling + */ +static void handle_profile(struct pt_regs *regs) +{ + BUG_ON(hrtimer_hres_active); + + profile_tick(CPU_PROFILING, regs); +} + +/* + * Noop handler when we shut down an event source + */ +static void handle_noop(struct pt_regs *regs) +{ +} + +/* + * Lookup table for bootup and lowres event assignment + */ +static void __read_mostly *event_handlers[] = { + handle_noop, /* 0: No capability selected */ + handle_tick, /* 1: Tick only */ + handle_update, /* 2: Update process times */ + handle_tick_update, /* 3: Tick + update process times */ + handle_profile, /* 4: Profiling int */ + handle_tick_profile, /* 5: Tick + Profiling int */ + handle_update_profile, /* 6: Update process times + + profiling */ + handle_tick_update_profile, /* 7: Tick + update process times + + profiling */ +#ifdef CONFIG_HIGH_RES_TIMERS + hrtimer_interrupt, /* 8: Reprogrammable event source */ +#endif +}; + +/* + * Setup an event source. Assign an handler and start it up + * When the event source has no own interrupt handler we setup + * the interrupt too. + */ +static void setup_event(struct event_descr *descr, struct clock_event *evt, + unsigned int caps) +{ + void *handler = event_handlers[caps]; + int mode; + + /* Set the event handler */ + evt->event_handler = handler; + + /* Store all relevant information */ + descr->real_caps = caps; + + if (caps == CLOCK_CAP_NEXTEVT) + mode = CLOCK_EVT_ONESHOT; + else + mode = CLOCK_EVT_PERIODIC; + + evt->set_mode(mode, evt); + + printk(KERN_INFO "Event source %s configured with caps set: " + "%02x\n", evt->name, descr->real_caps); +} + +/** + * register_global_clockevent - register the device which generates + * global clock events + * + * @evt: The device which generates global clock events (ticks) + * + * This can be a device which is only necessary for bootup. On UP systems this + * might be the only event source which is used for everything including + * high resolution events. + * + * When a cpu local event source is installed the global event source is + * switched off in the high resolution timer / tickless mode. + */ +int __init register_global_clockevent(struct clock_event *evt) +{ + /* Already installed? */ + if (global_eventsource.event) { + printk(KERN_ERR "Global clock event source already installed: " + "%s. Ignoring new global eventsoruce %s\n", + global_eventsource.event->name, + evt->name); + return -EBUSY; + } + + /* Preset the handler in any case */ + evt->event_handler = handle_noop; + + /* + * Check, whether it is a valid global event source + */ + if (!(evt->capabilities & CLOCK_BASE_CAPS_MASK)) { + printk(KERN_ERR "Unsupported event source %s\n", evt->name); + return -EINVAL; + } + + /* Mask out high resolution capabilities for now */ + global_eventsource.event = evt; + setup_event(&global_eventsource, evt, + evt->capabilities & CLOCK_BASE_CAPS_MASK); + return 0; +} + +/* + * Mask out the functionality which is covered by the new event source + * and assign a new event handler. + */ +static void recalc_active_event(struct event_descr *descr, + unsigned int newcaps) +{ + unsigned int caps; + + if (!descr->real_caps) + return; + + /* Mask the overlapping bits */ + caps = descr->real_caps & ~newcaps; + + /* Assign the new event handler */ + if (caps) { + descr->event->event_handler = event_handlers[caps]; + printk(KERN_INFO "Event source %s new caps set: %02x\n" , + descr->event->name, caps); + } else { + descr->event->event_handler = handle_noop; + + if (descr->event->set_mode) + descr->event->set_mode(CLOCK_EVT_SHUTDOWN, + descr->event); + + printk(KERN_INFO "Event source %s disabled\n" , + descr->event->name); + } + descr->real_caps = caps; +} + +/* + * Recalc the events and reassign the handlers if necessary + */ +static int recalc_events(struct local_events *sources, struct clock_event *evt, + unsigned int caps, int new) +{ + int i; + + if (new && sources->installed == MAX_CLOCK_EVENTS) + return -ENOSPC; + + /* + * If there is no handler and this is not a next-event capable + * event source, refuse to handle it + */ + if (!evt->capabilities & CLOCK_CAP_NEXTEVT && !event_handlers[caps]) { + printk(KERN_ERR "Unsupported event source %s\n", evt->name); + return -EINVAL; + } + + if (caps && global_eventsource.event && global_eventsource.event != evt) + recalc_active_event(&global_eventsource, caps); + + for (i = 0; i < sources->installed; i++) { + if (sources->events[i].event != evt) + recalc_active_event(&sources->events[i], caps); + } + + if (new) + sources->events[sources->installed++].event = evt; + + if (caps) { + /* Is next_event event source going to be installed? */ + if (caps & CLOCK_CAP_NEXTEVT) + caps = CLOCK_CAP_NEXTEVT; + + setup_event(&sources->events[sources->installed], + evt, caps); + } else + printk(KERN_INFO "Inactive event source %s registered\n", + evt->name); + + return 0; +} + +/** + * register_local_clockevent - Set up a cpu local clock event device + * + * @evt: event device to be registered + */ +int register_local_clockevent(struct clock_event *evt) +{ + struct local_events *sources = &__get_cpu_var(local_eventsources); + unsigned long flags; + int ret; + + spin_lock_irqsave(&events_lock, flags); + + /* Preset the handler in any case */ + evt->event_handler = handle_noop; + + /* Recalc event sources and maybe reassign handlers */ + ret = recalc_events(sources, evt, + evt->capabilities & CLOCK_BASE_CAPS_MASK, 1); + + spin_unlock_irqrestore(&events_lock, flags); + + /* + * Trigger hrtimers, when the event source is next-event + * capable + */ + if (!ret && (evt->capabilities & CLOCK_CAP_NEXTEVT)) + hrtimer_clock_notify(); + + return ret; +} +EXPORT_SYMBOL_GPL(register_local_clockevent); + +/* + * Find a next-event capable event source + */ +static int get_next_event_source(void) +{ + struct local_events *sources = &__get_cpu_var(local_eventsources); + int i; + + for (i = 0; i < sources->installed; i++) { + struct clock_event *evt; + + evt = sources->events[i].event; + if (evt->capabilities & CLOCK_CAP_NEXTEVT) + return i; + } + +#ifndef CONFIG_SMP + if (global_eventsource.event->capabilities & CLOCK_CAP_NEXTEVT) + return GLOBAL_CLOCK_EVENT; +#endif + return -ENODEV; +} + +/** + * clockevents_next_event_available - Check for a installed next-event source + */ +int clockevents_next_event_available(void) +{ + unsigned long flags; + int idx; + + spin_lock_irqsave(&events_lock, flags); + idx = get_next_event_source(); + spin_unlock_irqrestore(&events_lock, flags); + + return idx < 0 ? 0 : 1; +} + +int clockevents_init_next_event(void) +{ + struct local_events *sources = &__get_cpu_var(local_eventsources); + struct clock_event *nextevt; + unsigned long flags; + int idx, ret = -ENODEV; + + if (sources->nextevt) + return -EBUSY; + + spin_lock_irqsave(&events_lock, flags); + + idx = get_next_event_source(); + if (idx < 0) + goto out_unlock; + + if (idx == GLOBAL_CLOCK_EVENT) + nextevt = global_eventsource.event; + else + nextevt = sources->events[idx].event; + + ret = recalc_events(sources, nextevt, CLOCK_CAPS_MASK, 0); + if (!ret) + sources->nextevt = nextevt; + out_unlock: + spin_unlock_irqrestore(&events_lock, flags); + + return ret; +} + +int clockevents_set_next_event(ktime_t expires, int force) +{ + struct local_events *sources = &__get_cpu_var(local_eventsources); + int64_t delta = ktime_to_ns(ktime_sub(expires, ktime_get())); + struct clock_event *nextevt = sources->nextevt; + unsigned long long clc; + + if (delta <= 0 && !force) + return -ETIME; + + if (delta > nextevt->max_delta_ns) + delta = nextevt->max_delta_ns; + if (delta < nextevt->min_delta_ns) + delta = nextevt->min_delta_ns; + + clc = delta * nextevt->mult; + clc >>= nextevt->shift; + nextevt->set_next_event((unsigned long)clc, sources->nextevt); + + return 0; +} + +/* + * Functions related to initialization and hotplug + */ +static int clockevents_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + switch(action) { + case CPU_UP_PREPARE: + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_DEAD: + /* + * Do something sensible here ! + * Disable the cpu local clocksources + */ + break; +#endif + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block __devinitdata clockevents_nb = { + .notifier_call = clockevents_cpu_notify, +}; + +void __init init_clockevents(void) +{ + clockevents_cpu_notify(&clockevents_nb, (unsigned long)CPU_UP_PREPARE, + (void *)(long)smp_processor_id()); + register_cpu_notifier(&clockevents_nb); +} Index: linux-hres.q/kernel/time/clocksource.c =================================================================== --- /dev/null +++ linux-hres.q/kernel/time/clocksource.c @@ -0,0 +1,349 @@ +/* + * linux/kernel/time/clocksource.c + * + * This file contains the functions which manage clocksource drivers. + * + * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * TODO WishList: + * o Allow clocksource drivers to be unregistered + * o get rid of clocksource_jiffies extern + */ + +#include +#include +#include +#include + +/* XXX - Would like a better way for initializing curr_clocksource */ +extern struct clocksource clocksource_jiffies; + +/*[Clocksource internal variables]--------- + * curr_clocksource: + * currently selected clocksource. Initialized to clocksource_jiffies. + * next_clocksource: + * pending next selected clocksource. + * clocksource_list: + * linked list with the registered clocksources + * clocksource_lock: + * protects manipulations to curr_clocksource and next_clocksource + * and the clocksource_list + * override_name: + * Name of the user-specified clocksource. + */ +static struct clocksource *curr_clocksource = &clocksource_jiffies; +static struct clocksource *next_clocksource; +static LIST_HEAD(clocksource_list); +static DEFINE_SPINLOCK(clocksource_lock); +static char override_name[32]; +static int finished_booting; + +/* clocksource_done_booting - Called near the end of bootup + * + * Hack to avoid lots of clocksource churn at boot time + */ +static int __init clocksource_done_booting(void) +{ + finished_booting = 1; + return 0; +} + +late_initcall(clocksource_done_booting); + +/** + * clocksource_get_next - Returns the selected clocksource + * + */ +struct clocksource *clocksource_get_next(void) +{ + unsigned long flags; + + spin_lock_irqsave(&clocksource_lock, flags); + if (next_clocksource && finished_booting) { + curr_clocksource = next_clocksource; + next_clocksource = NULL; + } + spin_unlock_irqrestore(&clocksource_lock, flags); + + return curr_clocksource; +} + +/** + * select_clocksource - Finds the best registered clocksource. + * + * Private function. Must hold clocksource_lock when called. + * + * Looks through the list of registered clocksources, returning + * the one with the highest rating value. If there is a clocksource + * name that matches the override string, it returns that clocksource. + */ +static struct clocksource *select_clocksource(void) +{ + struct clocksource *best = NULL; + struct list_head *tmp; + + list_for_each(tmp, &clocksource_list) { + struct clocksource *src; + + src = list_entry(tmp, struct clocksource, list); + if (!best) + best = src; + + /* check for override: */ + if (strlen(src->name) == strlen(override_name) && + !strcmp(src->name, override_name)) { + best = src; + break; + } + /* pick the highest rating: */ + if (src->rating > best->rating) + best = src; + } + + return best; +} + +/** + * is_registered_source - Checks if clocksource is registered + * @c: pointer to a clocksource + * + * Private helper function. Must hold clocksource_lock when called. + * + * Returns one if the clocksource is already registered, zero otherwise. + */ +static int is_registered_source(struct clocksource *c) +{ + int len = strlen(c->name); + struct list_head *tmp; + + list_for_each(tmp, &clocksource_list) { + struct clocksource *src; + + src = list_entry(tmp, struct clocksource, list); + if (strlen(src->name) == len && !strcmp(src->name, c->name)) + return 1; + } + + return 0; +} + +/** + * clocksource_register - Used to install new clocksources + * @t: clocksource to be registered + * + * Returns -EBUSY if registration fails, zero otherwise. + */ +int clocksource_register(struct clocksource *c) +{ + int ret = 0; + unsigned long flags; + + spin_lock_irqsave(&clocksource_lock, flags); + /* check if clocksource is already registered */ + if (is_registered_source(c)) { + printk("register_clocksource: Cannot register %s. " + "Already registered!", c->name); + ret = -EBUSY; + } else { + /* register it */ + list_add(&c->list, &clocksource_list); + /* scan the registered clocksources, and pick the best one */ + next_clocksource = select_clocksource(); + } + spin_unlock_irqrestore(&clocksource_lock, flags); + return ret; +} +EXPORT_SYMBOL(clocksource_register); + +/** + * clocksource_reselect - Rescan list for next clocksource + * + * A quick helper function to be used if a clocksource changes its + * rating. Forces the clocksource list to be re-scanned for the best + * clocksource. + */ +void clocksource_reselect(void) +{ + unsigned long flags; + + spin_lock_irqsave(&clocksource_lock, flags); + next_clocksource = select_clocksource(); + spin_unlock_irqrestore(&clocksource_lock, flags); +} +EXPORT_SYMBOL(clocksource_reselect); + +/** + * sysfs_show_current_clocksources - sysfs interface for current clocksource + * @dev: unused + * @buf: char buffer to be filled with clocksource list + * + * Provides sysfs interface for listing current clocksource. + */ +static ssize_t +sysfs_show_current_clocksources(struct sys_device *dev, char *buf) +{ + char *curr = buf; + + spin_lock_irq(&clocksource_lock); + curr += sprintf(curr, "%s ", curr_clocksource->name); + spin_unlock_irq(&clocksource_lock); + + curr += sprintf(curr, "\n"); + + return curr - buf; +} + +/** + * sysfs_override_clocksource - interface for manually overriding clocksource + * @dev: unused + * @buf: name of override clocksource + * @count: length of buffer + * + * Takes input from sysfs interface for manually overriding the default + * clocksource selction. + */ +static ssize_t sysfs_override_clocksource(struct sys_device *dev, + const char *buf, size_t count) +{ + size_t ret = count; + /* strings from sysfs write are not 0 terminated! */ + if (count >= sizeof(override_name)) + return -EINVAL; + + /* strip of \n: */ + if (buf[count-1] == '\n') + count--; + if (count < 1) + return -EINVAL; + + spin_lock_irq(&clocksource_lock); + + /* copy the name given: */ + memcpy(override_name, buf, count); + override_name[count] = 0; + + /* try to select it: */ + next_clocksource = select_clocksource(); + + spin_unlock_irq(&clocksource_lock); + + return ret; +} + +/** + * sysfs_show_available_clocksources - sysfs interface for listing clocksource + * @dev: unused + * @buf: char buffer to be filled with clocksource list + * + * Provides sysfs interface for listing registered clocksources + */ +static ssize_t +sysfs_show_available_clocksources(struct sys_device *dev, char *buf) +{ + struct list_head *tmp; + char *curr = buf; + + spin_lock_irq(&clocksource_lock); + list_for_each(tmp, &clocksource_list) { + struct clocksource *src; + + src = list_entry(tmp, struct clocksource, list); + curr += sprintf(curr, "%s ", src->name); + } + spin_unlock_irq(&clocksource_lock); + + curr += sprintf(curr, "\n"); + + return curr - buf; +} + +/* + * Sysfs setup bits: + */ +static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources, + sysfs_override_clocksource); + +static SYSDEV_ATTR(available_clocksource, 0600, + sysfs_show_available_clocksources, NULL); + +static struct sysdev_class clocksource_sysclass = { + set_kset_name("clocksource"), +}; + +static struct sys_device device_clocksource = { + .id = 0, + .cls = &clocksource_sysclass, +}; + +static int __init init_clocksource_sysfs(void) +{ + int error = sysdev_class_register(&clocksource_sysclass); + + if (!error) + error = sysdev_register(&device_clocksource); + if (!error) + error = sysdev_create_file( + &device_clocksource, + &attr_current_clocksource); + if (!error) + error = sysdev_create_file( + &device_clocksource, + &attr_available_clocksource); + return error; +} + +device_initcall(init_clocksource_sysfs); + +/** + * boot_override_clocksource - boot clock override + * @str: override name + * + * Takes a clocksource= boot argument and uses it + * as the clocksource override name. + */ +static int __init boot_override_clocksource(char* str) +{ + unsigned long flags; + spin_lock_irqsave(&clocksource_lock, flags); + if (str) + strlcpy(override_name, str, sizeof(override_name)); + spin_unlock_irqrestore(&clocksource_lock, flags); + return 1; +} + +__setup("clocksource=", boot_override_clocksource); + +/** + * boot_override_clock - Compatibility layer for deprecated boot option + * @str: override name + * + * DEPRECATED! Takes a clock= boot argument and uses it + * as the clocksource override name + */ +static int __init boot_override_clock(char* str) +{ + if (!strcmp(str, "pmtmr")) { + printk("Warning: clock=pmtmr is deprecated. " + "Use clocksource=acpi_pm.\n"); + return boot_override_clocksource("acpi_pm"); + } + printk("Warning! clock= boot option is deprecated. " + "Use clocksource=xyz\n"); + return boot_override_clocksource(str); +} + +__setup("clock=", boot_override_clock); Index: linux-hres.q/kernel/time/jiffies.c =================================================================== --- /dev/null +++ linux-hres.q/kernel/time/jiffies.c @@ -0,0 +1,73 @@ +/*********************************************************************** +* linux/kernel/time/jiffies.c +* +* This file contains the jiffies based clocksource. +* +* Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com) +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation; either version 2 of the License, or +* (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software +* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +* +************************************************************************/ +#include +#include +#include + +/* The Jiffies based clocksource is the lowest common + * denominator clock source which should function on + * all systems. It has the same coarse resolution as + * the timer interrupt frequency HZ and it suffers + * inaccuracies caused by missed or lost timer + * interrupts and the inability for the timer + * interrupt hardware to accuratly tick at the + * requested HZ value. It is also not reccomended + * for "tick-less" systems. + */ +#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ)) + +/* Since jiffies uses a simple NSEC_PER_JIFFY multiplier + * conversion, the .shift value could be zero. However + * this would make NTP adjustments impossible as they are + * in units of 1/2^.shift. Thus we use JIFFIES_SHIFT to + * shift both the nominator and denominator the same + * amount, and give ntp adjustments in units of 1/2^8 + * + * The value 8 is somewhat carefully chosen, as anything + * larger can result in overflows. NSEC_PER_JIFFY grows as + * HZ shrinks, so values greater then 8 overflow 32bits when + * HZ=100. + */ +#define JIFFIES_SHIFT 8 + +static cycle_t jiffies_read(void) +{ + return (cycle_t) jiffies; +} + +struct clocksource clocksource_jiffies = { + .name = "jiffies", + .rating = 0, /* lowest rating*/ + .read = jiffies_read, + .mask = 0xffffffff, /*32bits*/ + .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ + .shift = JIFFIES_SHIFT, + .is_continuous = 0, /* tick based, not free running */ +}; + +static int __init init_jiffies_clocksource(void) +{ + return clocksource_register(&clocksource_jiffies); +} + +module_init(init_jiffies_clocksource); Index: linux-hres.q/kernel/timer.c =================================================================== --- linux-hres.q.orig/kernel/timer.c +++ linux-hres.q/kernel/timer.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -69,6 +70,8 @@ typedef struct tvec_root_s { struct list_head vec[TVR_SIZE]; } tvec_root_t; +unsigned int __read_mostly timeout_granularity = 1; + struct tvec_t_base_s { spinlock_t lock; struct timer_list *running_timer; @@ -89,9 +92,7 @@ static DEFINE_PER_CPU(tvec_base_t *, tve static inline void set_running_timer(tvec_base_t *base, struct timer_list *timer) { -#ifdef CONFIG_SMP base->running_timer = timer; -#endif } static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) @@ -136,6 +137,17 @@ static void internal_add_timer(tvec_base list_add_tail(&timer->entry, vec); } +#ifdef CONFIG_TIMER_INFO +void __timer_set_start_info(struct timer_list *timer, void *addr) +{ + if (timer->start_site) + return; + + timer->start_site = addr; + memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); + timer->start_pid = current->pid; +} +#endif /*** * init_timer - initialize a timer. * @timer: the timer to be initialized @@ -146,12 +158,17 @@ static void internal_add_timer(tvec_base void fastcall init_timer(struct timer_list *timer) { timer->entry.next = NULL; - timer->base = per_cpu(tvec_bases, raw_smp_processor_id()); + timer->base = __raw_get_cpu_var(tvec_bases); +#ifdef CONFIG_TIMER_INFO + timer->start_site = NULL; + timer->start_pid = -1; + memset(timer->start_comm, 0, TASK_COMM_LEN); +#endif } EXPORT_SYMBOL(init_timer); -static inline void detach_timer(struct timer_list *timer, - int clear_pending) +static inline void +detach_timer(tvec_base_t *base, struct timer_list *timer, int clear_pending) { struct list_head *entry = &timer->entry; @@ -197,12 +214,13 @@ int __mod_timer(struct timer_list *timer unsigned long flags; int ret = 0; + timer_set_start_info(timer); BUG_ON(!timer->function); base = lock_timer_base(timer, &flags); if (timer_pending(timer)) { - detach_timer(timer, 0); + detach_timer(base, timer, 0); ret = 1; } @@ -247,6 +265,7 @@ void add_timer_on(struct timer_list *tim tvec_base_t *base = per_cpu(tvec_bases, cpu); unsigned long flags; + timer_set_start_info(timer); BUG_ON(timer_pending(timer) || !timer->function); spin_lock_irqsave(&base->lock, flags); timer->base = base; @@ -278,6 +297,7 @@ int mod_timer(struct timer_list *timer, { BUG_ON(!timer->function); + timer_set_start_info(timer); /* * This is a common optimization triggered by the * networking code - if the timer is re-modified @@ -308,10 +328,11 @@ int del_timer(struct timer_list *timer) unsigned long flags; int ret = 0; + timer_clear_start_info(timer); if (timer_pending(timer)) { base = lock_timer_base(timer, &flags); if (timer_pending(timer)) { - detach_timer(timer, 1); + detach_timer(base, timer, 1); ret = 1; } spin_unlock_irqrestore(&base->lock, flags); @@ -342,7 +363,7 @@ int try_to_del_timer_sync(struct timer_l ret = 0; if (timer_pending(timer)) { - detach_timer(timer, 1); + detach_timer(base, timer, 1); ret = 1; } out: @@ -417,8 +438,7 @@ static inline void __run_timers(tvec_bas { struct timer_list *timer; - spin_lock_irq(&base->lock); - while (time_after_eq(jiffies, base->timer_jiffies)) { + while (time_before_eq(base->timer_jiffies, jiffies)) { struct list_head work_list = LIST_HEAD_INIT(work_list); struct list_head *head = &work_list; int index = base->timer_jiffies & TVR_MASK; @@ -441,8 +461,10 @@ static inline void __run_timers(tvec_bas fn = timer->function; data = timer->data; + account_timer(timer); + set_running_timer(base, timer); - detach_timer(timer, 1); + detach_timer(base, timer, 1); spin_unlock_irq(&base->lock); { int preempt_count = preempt_count(); @@ -460,7 +482,6 @@ static inline void __run_timers(tvec_bas } } set_running_timer(base, NULL); - spin_unlock_irq(&base->lock); } #ifdef CONFIG_NO_IDLE_HZ @@ -469,29 +490,28 @@ static inline void __run_timers(tvec_bas * is used on S/390 to stop all activity when a cpus is idle. * This functions needs to be called disabled. */ -unsigned long next_timer_interrupt(void) +unsigned long __next_timer_interrupt(tvec_base_t *base, unsigned long now) { - tvec_base_t *base; struct list_head *list; - struct timer_list *nte; + struct timer_list *nte, *found = NULL; unsigned long expires; - unsigned long hr_expires = MAX_JIFFY_OFFSET; - ktime_t hr_delta; tvec_t *varray[4]; int i, j; - hr_delta = hrtimer_get_next_event(); +#ifndef CONFIG_NO_HZ + unsigned long hr_expires = MAX_JIFFY_OFFSET; + ktime_t hr_delta = hrtimer_get_next_event(); + if (hr_delta.tv64 != KTIME_MAX) { struct timespec tsdelta; tsdelta = ktime_to_timespec(hr_delta); hr_expires = timespec_to_jiffies(&tsdelta); if (hr_expires < 3) - return hr_expires + jiffies; + return hr_expires + now; } - hr_expires += jiffies; + hr_expires += now; +#endif - base = __get_cpu_var(tvec_bases); - spin_lock(&base->lock); expires = base->timer_jiffies + (LONG_MAX >> 1); list = NULL; @@ -500,6 +520,7 @@ unsigned long next_timer_interrupt(void) do { list_for_each_entry(nte, base->tv1.vec + j, entry) { expires = nte->expires; + found = nte; if (j < (base->timer_jiffies & TVR_MASK)) list = base->tv2.vec + (INDEX(0)); goto found; @@ -519,9 +540,12 @@ unsigned long next_timer_interrupt(void) j = (j + 1) & TVN_MASK; continue; } - list_for_each_entry(nte, varray[i]->vec + j, entry) - if (time_before(nte->expires, expires)) + list_for_each_entry(nte, varray[i]->vec + j, entry) { + if (time_before(nte->expires, expires)) { expires = nte->expires; + found = nte; + } + } if (j < (INDEX(i)) && i < 3) list = varray[i + 1]->vec + (INDEX(i + 1)); goto found; @@ -535,12 +559,15 @@ found: * where we found the timer element. */ list_for_each_entry(nte, list, entry) { - if (time_before(nte->expires, expires)) + if (time_before(nte->expires, expires)) { expires = nte->expires; + found = nte; + } } } - spin_unlock(&base->lock); + WARN_ON(!found); +#ifndef CONFIG_NO_HZ /* * It can happen that other CPUs service timer IRQs and increment * jiffies, but we have not yet got a local timer tick to process @@ -554,14 +581,44 @@ found: * would falsely evaluate to true. If that is the case, just * return jiffies so that we can immediately fire the local timer */ - if (time_before(expires, jiffies)) - return jiffies; + if (time_before(expires, now)) + expires = now; + else if (time_before(hr_expires, expires)) + expires = hr_expires; +#endif + /* + * 'Timer wheel time' can lag behind 'jiffies time' due to + * delayed processing, so make sure we return a value that + * makes sense externally: + */ + expires -= (now - base->timer_jiffies); - if (time_before(hr_expires, expires)) - return hr_expires; + /* + * Round it up per timeout_granularity: + */ + expires += timeout_granularity - 1; + expires -= expires % timeout_granularity; return expires; } + +unsigned long get_next_timer_interrupt(unsigned long now) +{ + tvec_base_t *base = __get_cpu_var(tvec_bases); + unsigned long expires; + + spin_lock(&base->lock); + expires = __next_timer_interrupt(base, now); + spin_unlock(&base->lock); + + return expires; +} + +unsigned long next_timer_interrupt(void) +{ + return get_next_timer_interrupt(jiffies); +} + #endif /******************************************************************/ @@ -601,7 +658,6 @@ long time_tolerance = MAXFREQ; /* frequ long time_precision = 1; /* clock precision (us) */ long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ -static long time_phase; /* phase offset (scaled us) */ long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC; /* frequency offset (scaled ppm)*/ static long time_adj; /* tick adjust (scaled 1 / HZ) */ @@ -751,27 +807,14 @@ static long adjtime_adjustment(void) } /* in the NTP reference this is called "hardclock()" */ -static void update_wall_time_one_tick(void) +static void update_ntp_one_tick(void) { - long time_adjust_step, delta_nsec; + long time_adjust_step; time_adjust_step = adjtime_adjustment(); if (time_adjust_step) /* Reduce by this step the amount of time left */ time_adjust -= time_adjust_step; - delta_nsec = tick_nsec + time_adjust_step * 1000; - /* - * Advance the phase, once it gets to one microsecond, then - * advance the tick more. - */ - time_phase += time_adj; - if ((time_phase >= FINENSEC) || (time_phase <= -FINENSEC)) { - long ltemp = shift_right(time_phase, (SHIFT_SCALE - 10)); - time_phase -= ltemp << (SHIFT_SCALE - 10); - delta_nsec += ltemp; - } - xtime.tv_nsec += delta_nsec; - time_interpolator_update(delta_nsec); /* Changes by adjtime() do not take effect till next tick. */ if (time_next_adjust != 0) { @@ -784,36 +827,312 @@ static void update_wall_time_one_tick(vo * Return how long ticks are at the moment, that is, how much time * update_wall_time_one_tick will add to xtime next time we call it * (assuming no calls to do_adjtimex in the meantime). - * The return value is in fixed-point nanoseconds with SHIFT_SCALE-10 - * bits to the right of the binary point. + * The return value is in fixed-point nanoseconds shifted by the + * specified number of bits to the right of the binary point. * This function has no side-effects. */ -u64 current_tick_length(void) +u64 current_tick_length(long shift) { long delta_nsec; + u64 ret; + /* calculate the finest interval NTP will allow. + * ie: nanosecond value shifted by (SHIFT_SCALE - 10) + */ delta_nsec = tick_nsec + adjtime_adjustment() * 1000; - return ((u64) delta_nsec << (SHIFT_SCALE - 10)) + time_adj; + ret = ((u64) delta_nsec << (SHIFT_SCALE - 10)) + time_adj; + + /* convert from (SHIFT_SCALE - 10) to specified shift scale: */ + shift = shift - (SHIFT_SCALE - 10); + if (shift < 0) + ret >>= -shift; + else + ret <<= shift; + + return ret; } -/* - * Using a loop looks inefficient, but "ticks" is - * usually just one (we shouldn't be losing ticks, - * we're doing this this way mainly for interrupt - * latency reasons, not because we think we'll - * have lots of lost timer ticks +/* XXX - all of this timekeeping code should be later moved to time.c */ +#include +static struct clocksource *clock; /* pointer to current clocksource */ +static cycle_t last_clock_cycle; /* cycle value at last update_wall_time */ + +#ifdef CONFIG_GENERIC_TIME +/** + * __get_nsec_offset - Returns nanoseconds since last call to periodic_hook + * + * private function, must hold xtime_lock lock when being + * called. Returns the number of nanoseconds since the + * last call to update_wall_time() (adjusted by NTP scaling) + */ +static inline s64 __get_nsec_offset(void) +{ + cycle_t cycle_now, cycle_delta; + s64 ns_offset; + + /* read clocksource: */ + cycle_now = clocksource_read(clock); + + /* calculate the delta since the last update_wall_time: */ + cycle_delta = (cycle_now - last_clock_cycle) & clock->mask; + + /* convert to nanoseconds: */ + ns_offset = cyc2ns(clock, cycle_delta); + + return ns_offset; +} + +/** + * __get_realtime_clock_ts - Returns the time of day in a timespec + * @ts: pointer to the timespec to be set + * + * Returns the time of day in a timespec. Used by + * do_gettimeofday() and get_realtime_clock_ts(). + */ +static inline void __get_realtime_clock_ts(struct timespec *ts) +{ + unsigned long seq; + s64 nsecs; + + do { + seq = read_seqbegin(&xtime_lock); + + *ts = xtime; + nsecs = __get_nsec_offset(); + + } while (read_seqretry(&xtime_lock, seq)); + + timespec_add_ns(ts, nsecs); +} + +/** + * getnstimeofday - Returns the time of day in a timespec + * @ts: pointer to the timespec to be set + * + * Returns the time of day in a timespec. + */ +void getnstimeofday(struct timespec *ts) +{ + __get_realtime_clock_ts(ts); +} + +EXPORT_SYMBOL(getnstimeofday); + +/** + * do_gettimeofday - Returns the time of day in a timeval + * @tv: pointer to the timeval to be set + * + * NOTE: Users should be converted to using get_realtime_clock_ts() + */ +void do_gettimeofday(struct timeval *tv) +{ + struct timespec now; + + __get_realtime_clock_ts(&now); + tv->tv_sec = now.tv_sec; + tv->tv_usec = now.tv_nsec/1000; +} + +EXPORT_SYMBOL(do_gettimeofday); +/** + * do_settimeofday - Sets the time of day + * @tv: pointer to the timespec variable containing the new time + * + * Sets the time of day to the new time and update NTP and notify hrtimers + */ +int do_settimeofday(struct timespec *tv) +{ + unsigned long flags; + time_t wtm_sec, sec = tv->tv_sec; + long wtm_nsec, nsec = tv->tv_nsec; + + if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) + return -EINVAL; + + write_seqlock_irqsave(&xtime_lock, flags); + + nsec -= __get_nsec_offset(); + + wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); + wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); + + set_normalized_timespec(&xtime, sec, nsec); + set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); + + ntp_clear(); + + write_sequnlock_irqrestore(&xtime_lock, flags); + + /* signal hrtimers about time change */ + clock_was_set(); + + return 0; +} + +EXPORT_SYMBOL(do_settimeofday); + +/** + * change_clocksource - Swaps clocksources if a new one is available + * + * Accumulates current time interval and initializes new clocksource */ -static void update_wall_time(unsigned long ticks) +static int change_clocksource(void) { + struct clocksource *new; + cycle_t now; + u64 nsec; + new = clocksource_get_next(); + if (clock != new) { + now = clocksource_read(new); + nsec = __get_nsec_offset(); + timespec_add_ns(&xtime, nsec); + + clock = new; + last_clock_cycle = now; + printk(KERN_INFO "Time: %s clocksource has been installed.\n", + clock->name); + return 1; + } else if (clock->update_callback) { + return clock->update_callback(); + } + return 0; +} +#else +#define change_clocksource() (0) +#endif + +/** + * timeofday_is_continuous - check to see if timekeeping is free running + */ +int timekeeping_is_continuous(void) +{ + unsigned long seq; + int ret; + do { - ticks--; - update_wall_time_one_tick(); - if (xtime.tv_nsec >= 1000000000) { - xtime.tv_nsec -= 1000000000; + seq = read_seqbegin(&xtime_lock); + + ret = clock->is_continuous; + + } while (read_seqretry(&xtime_lock, seq)); + + return ret; +} + +/* + * timekeeping_init - Initializes the clocksource and common timekeeping values + */ +void __init timekeeping_init(void) +{ + unsigned long flags; + + write_seqlock_irqsave(&xtime_lock, flags); + clock = clocksource_get_next(); + clocksource_calculate_interval(clock, tick_nsec); + last_clock_cycle = clocksource_read(clock); + ntp_clear(); + write_sequnlock_irqrestore(&xtime_lock, flags); +} + + +/* + * timekeeping_resume - Resumes the generic timekeeping subsystem. + * @dev: unused + * + * This is for the generic clocksource timekeeping. + * xtime/wall_to_monotonic/jiffies/wall_jiffies/etc are + * still managed by arch specific suspend/resume code. + */ +static int timekeeping_resume(struct sys_device *dev) +{ + unsigned long flags; + + write_seqlock_irqsave(&xtime_lock, flags); + /* restart the last cycle value */ + last_clock_cycle = clocksource_read(clock); + write_sequnlock_irqrestore(&xtime_lock, flags); + return 0; +} + +/* sysfs resume/suspend bits for timekeeping */ +static struct sysdev_class timekeeping_sysclass = { + .resume = timekeeping_resume, + set_kset_name("timekeeping"), +}; + +static struct sys_device device_timer = { + .id = 0, + .cls = &timekeeping_sysclass, +}; + +static int __init timekeeping_init_device(void) +{ + int error = sysdev_class_register(&timekeeping_sysclass); + if (!error) + error = sysdev_register(&device_timer); + return error; +} + +device_initcall(timekeeping_init_device); + +/* + * update_wall_time - Uses the current clocksource to increment the wall time + * + * Called from the timer interrupt, must hold a write on xtime_lock. + */ +static void update_wall_time(void) +{ + static s64 remainder_snsecs, error; + s64 snsecs_per_sec; + cycle_t now, offset; + + snsecs_per_sec = (s64)NSEC_PER_SEC << clock->shift; + remainder_snsecs += (s64)xtime.tv_nsec << clock->shift; + + now = clocksource_read(clock); + offset = (now - last_clock_cycle)&clock->mask; + + /* normally this loop will run just once, however in the + * case of lost or late ticks, it will accumulate correctly. + */ + while (offset > clock->interval_cycles) { + /* get the ntp interval in clock shifted nanoseconds */ + s64 ntp_snsecs = current_tick_length(clock->shift); + + /* accumulate one interval */ + remainder_snsecs += clock->interval_snsecs; + last_clock_cycle += clock->interval_cycles; + offset -= clock->interval_cycles; + + /* interpolator bits */ + time_interpolator_update(clock->interval_snsecs + >> clock->shift); + /* increment the NTP state machine */ + update_ntp_one_tick(); + + /* accumulate error between NTP and clock interval */ + error += (ntp_snsecs - (s64)clock->interval_snsecs); + + /* correct the clock when NTP error is too big */ + remainder_snsecs += make_ntp_adj(clock, offset, &error); + + if (remainder_snsecs >= snsecs_per_sec) { + remainder_snsecs -= snsecs_per_sec; xtime.tv_sec++; second_overflow(); } - } while (ticks); + } + /* store full nanoseconds into xtime */ + xtime.tv_nsec = remainder_snsecs >> clock->shift; + remainder_snsecs -= (s64)xtime.tv_nsec << clock->shift; + + /* check to see if there is a new clocksource to use */ + if (change_clocksource()) { + error = 0; + remainder_snsecs = 0; + hrtimer_clock_notify(); + clocksource_calculate_interval(clock, tick_nsec); + } } /* @@ -896,9 +1215,12 @@ static void run_timer_softirq(struct sof { tvec_base_t *base = __get_cpu_var(tvec_bases); - hrtimer_run_queues(); + hrtimer_run_queues(); + + spin_lock_irq(&base->lock); if (time_after_eq(jiffies, base->timer_jiffies)) __run_timers(base); + spin_unlock_irq(&base->lock); } /* @@ -906,7 +1228,13 @@ static void run_timer_softirq(struct sof */ void run_local_timers(void) { - raise_softirq(TIMER_SOFTIRQ); + tvec_base_t *base = per_cpu(tvec_bases, smp_processor_id()); + /* + * Only wake up the TIMER_SOFTIRQ every timeout_granularity + * jiffies: + */ + if (time_before_eq(base->timer_jiffies + timeout_granularity, jiffies)) + raise_softirq(TIMER_SOFTIRQ); softlockup_tick(); } @@ -919,10 +1247,8 @@ static inline void update_times(void) unsigned long ticks; ticks = jiffies - wall_jiffies; - if (ticks) { - wall_jiffies += ticks; - update_wall_time(ticks); - } + wall_jiffies += ticks; + update_wall_time(); calc_load(ticks); } @@ -1286,13 +1612,14 @@ static int __devinit init_timers_cpu(int } #ifdef CONFIG_HOTPLUG_CPU -static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head) +static void migrate_timer_list(tvec_base_t *old_base, tvec_base_t *new_base, + struct list_head *head) { struct timer_list *timer; while (!list_empty(head)) { timer = list_entry(head->next, struct timer_list, entry); - detach_timer(timer, 0); + detach_timer(old_base, timer, 0); timer->base = new_base; internal_add_timer(new_base, timer); } @@ -1315,12 +1642,12 @@ static void __devinit migrate_timers(int BUG_ON(old_base->running_timer); for (i = 0; i < TVR_SIZE; i++) - migrate_timer_list(new_base, old_base->tv1.vec + i); + migrate_timer_list(old_base, new_base, old_base->tv1.vec + i); for (i = 0; i < TVN_SIZE; i++) { - migrate_timer_list(new_base, old_base->tv2.vec + i); - migrate_timer_list(new_base, old_base->tv3.vec + i); - migrate_timer_list(new_base, old_base->tv4.vec + i); - migrate_timer_list(new_base, old_base->tv5.vec + i); + migrate_timer_list(old_base, new_base, old_base->tv2.vec + i); + migrate_timer_list(old_base, new_base, old_base->tv3.vec + i); + migrate_timer_list(old_base, new_base, old_base->tv4.vec + i); + migrate_timer_list(old_base, new_base, old_base->tv5.vec + i); } spin_unlock(&old_base->lock); Index: linux-hres.q/kernel/timer_top.c =================================================================== --- /dev/null +++ linux-hres.q/kernel/timer_top.c @@ -0,0 +1,259 @@ +/* + * kernel/timer_top.c + * + * Export Timers information to /proc/timer_info + * + * Copyright (C) 2005 Instituto Nokia de Tecnologia - INdT - Manaus + * Written by Daniel Petrini + * + * This utility should be used to get information from the system timers + * and maybe optimize the system once you know which timers are being used + * and the process which starts them. + * This is particular useful above dynamic tick implementation. One can + * see who is starting timers and make the HZ value increase. + * + * We export the addresses and counting of timer functions being called, + * the pid and cmdline from the owner process if applicable. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#define VERSION "Timer Top v0.9.9" + +struct timer_top_info { + void *start_func; + void *expire_func; + unsigned long counter; + pid_t pid; + char comm[TASK_COMM_LEN + 1]; + struct list_head list; +}; + +struct timer_top_root { + spinlock_t lock; + struct list_head list; + kmem_cache_t *cache; + int record; /* if currently collecting data */ + unsigned long start_jiffies; +}; + +static struct timer_top_root top_root = { + .lock = SPIN_LOCK_UNLOCKED, + .list = LIST_HEAD_INIT(top_root.list), +}; + +static struct list_head *timer_list = &top_root.list; +static spinlock_t *top_lock = &top_root.lock; + +static inline int update_top_info(struct timer_list *timer) +{ + struct timer_top_info *top; + + list_for_each_entry(top, timer_list, list) { + /* if it is in the list increment its count */ + if (top->start_func == timer->start_site && + top->expire_func == timer->function && + top->pid == timer->start_pid) { + top->counter++; + return 1; + } + } + + return 0; +} + +int account_timer(struct timer_list *timer) +{ + pid_t pid_info = timer->start_pid; + struct timer_top_info *top; + unsigned long flags; + + if (!top_root.record) + goto out; + + spin_lock_irqsave(top_lock, flags); + + if (update_top_info(timer)) + goto out_unlock; + + /* Function not found so insert it in the list */ + top = kmem_cache_alloc(top_root.cache, GFP_ATOMIC); + if (unlikely(!top)) + goto out_unlock; + + top->start_func = timer->start_site; + top->expire_func = timer->function; + timer->start_site = NULL; + top->counter = 1; + top->pid = pid_info; + memcpy(top->comm, timer->start_comm, TASK_COMM_LEN); + top->comm[TASK_COMM_LEN] = 0; + list_add(&top->list, timer_list); + +out_unlock: + spin_unlock_irqrestore(top_lock, flags); +out: + return 0; +} + +/* + * Must hold top_lock + */ +static void timer_list_del(void) +{ + struct list_head *aux1, *aux2; + struct timer_top_info *entry; + + list_for_each_safe(aux1, aux2, timer_list) { + entry = list_entry(aux1, struct timer_top_info, list); + list_del(aux1); + kmem_cache_free(top_root.cache, entry); + } + top_root.start_jiffies = jiffies; +} + +/* PROC_FS_SECTION */ + +static struct proc_dir_entry *top_info_file; +static struct proc_dir_entry *top_info_file_out; + +static void print_name_offset(struct seq_file *m, unsigned long addr) +{ + char namebuf[KSYM_NAME_LEN+1]; + unsigned long size, offset; + const char *sym_name; + char *modname; + + sym_name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf); + if (sym_name) + seq_printf(m, "%s", sym_name); + else + seq_printf(m, "<%p>", (void *)addr); +} + +/* Statistics output - timer_info*/ +static int proc_read_top_info(struct seq_file *m, void *v) +{ + struct timer_top_info *top; + unsigned long events = 0, delta, secs, tenths, ratio, ratio_tenths; + + delta = jiffies - top_root.start_jiffies; + secs = delta / HZ; + tenths = (delta % HZ) * 10 / HZ; + + seq_printf(m, "Function counter - %s\n", VERSION); + seq_printf(m, "collection period: %ld.%ld seconds\n", secs, tenths); + + list_for_each_entry(top, timer_list, list) { + seq_printf(m, "%4lu %5d %-16s ", + top->counter, top->pid, top->comm); + print_name_offset(m, (unsigned long)top->start_func); + seq_puts(m, " ("); + print_name_offset(m, (unsigned long)top->expire_func); + seq_puts(m, ")\n"); + events += top->counter; + } + + ratio = events * HZ / delta; + ratio_tenths = ((events * HZ) % delta) * 10 / delta; + + seq_printf(m, "%4lu total events, %ld.%ld events/sec\n", + events, ratio, ratio_tenths); + + if (!top_root.record) + seq_printf(m, "Disabled\n"); + + return 0; +} + +static int proc_timertop_open(struct inode *inode, struct file *file) +{ + return single_open(file, proc_read_top_info, NULL); +} + +static struct file_operations proc_timertop_operations = { + .open = proc_timertop_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +#define MAX_INPUT_TOP 10 + +/* Receive some commands from user - timer_input */ +static int proc_write_timer_input(struct file *file, const char *page, + unsigned long count, void *data) +{ + int len; + char input_data[MAX_INPUT_TOP]; + unsigned long flags; + + /* input size checking */ + if (count > MAX_INPUT_TOP - 1) + len = MAX_INPUT_TOP - 1; + else + len = count; + + if (copy_from_user(input_data, page, len)) + return -EFAULT; + + input_data[len] = '\0'; + + spin_lock_irqsave(top_lock, flags); + if (!strncmp(input_data, "clear", 5)) { + timer_list_del(); + } else if (!strncmp(input_data, "start", 5)) { + top_root.start_jiffies = jiffies; + top_root.record = 1; + } else if (!strncmp(input_data, "stop", 4)) { + top_root.record = 0; + timer_list_del(); + } + spin_unlock_irqrestore(top_lock, flags); + + return len; +} + +/* Print a sample string showing the possible inputs - timer_input */ +static int proc_read_timer_input(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + return sprintf(page, "clear start stop\n"); +} + +static int __init init_top_info(void) +{ + top_root.cache = kmem_cache_create("top_info", + sizeof(struct timer_top_info), 0, SLAB_PANIC, NULL, NULL); + + top_info_file = create_proc_entry("timer_info", 0444, NULL); + if (top_info_file == NULL) + return -ENOMEM; + + top_info_file_out = create_proc_entry("timer_input", 0666, NULL); + if (top_info_file_out == NULL) + return -ENOMEM; + + /* Statistics output */ + top_info_file->proc_fops = &proc_timertop_operations; + + /* Control */ + top_info_file_out->write_proc = &proc_write_timer_input; + top_info_file_out->read_proc = &proc_read_timer_input; + + return 0; +} + +module_init(init_top_info); Index: linux-hres.q/kernel/workqueue.c =================================================================== --- linux-hres.q.orig/kernel/workqueue.c +++ linux-hres.q/kernel/workqueue.c @@ -115,12 +115,14 @@ int fastcall queue_work(struct workqueue return ret; } -static void delayed_work_timer_fn(unsigned long __data) +void delayed_work_timer_fn(unsigned long __data) { struct work_struct *work = (struct work_struct *)__data; struct workqueue_struct *wq = work->wq_data; int cpu = smp_processor_id(); + struct list_head *head; + head = &per_cpu_ptr(wq->cpu_wq, cpu)->more_work.task_list; if (unlikely(is_single_threaded(wq))) cpu = singlethread_cpu; @@ -128,11 +130,12 @@ static void delayed_work_timer_fn(unsign } int fastcall queue_delayed_work(struct workqueue_struct *wq, - struct work_struct *work, unsigned long delay) + struct work_struct *work, unsigned long delay) { int ret = 0; struct timer_list *timer = &work->timer; + timer_set_start_info(&work->timer); if (!test_and_set_bit(0, &work->pending)) { BUG_ON(timer_pending(timer)); BUG_ON(!list_empty(&work->entry)); @@ -405,6 +408,7 @@ int fastcall schedule_work(struct work_s int fastcall schedule_delayed_work(struct work_struct *work, unsigned long delay) { + timer_set_start_info(&work->timer); return queue_delayed_work(keventd_wq, work, delay); } Index: linux-hres.q/lib/Kconfig.debug =================================================================== --- linux-hres.q.orig/lib/Kconfig.debug +++ linux-hres.q/lib/Kconfig.debug @@ -8,6 +8,22 @@ config PRINTK_TIME operations. This is useful for identifying long delays in kernel startup. +config PRINTK_IGNORE_LOGLEVEL + bool "Ignore loglevel on printks" + default n + help + Selecting this option causes all printk messages to go + to the console. This allows you to serial-log kernel + messages, no matter what userspace does. (e.g. some + distributions disable kernel log messages during + certain phases of system startup.) + + NOTE: this option also makes printk non-preemptible, + which might improve the output of debugging info or + crash info, but it might also cause latencies if your + kernel is printk-ing alot. + + Normally you dont need or want this option. config MAGIC_SYSRQ bool "Magic SysRq key" @@ -77,6 +93,19 @@ config SCHEDSTATS application, you can say N to avoid the very slight overhead this adds. +config TIMER_INFO + bool "Collect kernel timers statistics" + depends on DEBUG_KERNEL && PROC_FS && NO_IDLE_HZ + help + If you say Y here, additional code will be inserted into the + timer routines to collect statistics about kernel timers being + reprogrammed through dynamic ticks feature. The statistics + will be provided in /proc/timer_info and the behavior of this + feature can be controlled through /proc/timer_input. + The goal is to offer some output to let user applications show + timer pattern usage and allow some tuning in them to + maximise idle time. + config DEBUG_SLAB bool "Debug slab memory allocations" depends on DEBUG_KERNEL && SLAB Index: linux-hres.q/mm/slab.c =================================================================== --- linux-hres.q.orig/mm/slab.c +++ linux-hres.q/mm/slab.c @@ -444,8 +444,13 @@ struct kmem_cache { * OTOH the cpuarrays can contain lots of objects, * which could lock up otherwise freeable slabs. */ -#define REAPTIMEOUT_CPUC (2*HZ) -#define REAPTIMEOUT_LIST3 (4*HZ) +#ifdef CONFIG_NO_IDLE_HZ +# define REAPTIMEOUT_CPUC (4*HZ) +# define REAPTIMEOUT_LIST3 (8*HZ) +#else +# define REAPTIMEOUT_CPUC (2*HZ) +# define REAPTIMEOUT_LIST3 (4*HZ) +#endif #if STATS #define STATS_INC_ACTIVE(x) ((x)->num_active++) Index: linux-hres.q/net/ipv4/route.c =================================================================== --- linux-hres.q.orig/net/ipv4/route.c +++ linux-hres.q/net/ipv4/route.c @@ -244,7 +244,7 @@ static unsigned int rt_hash_rnd; static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); #define RT_CACHE_STAT_INC(field) \ - (per_cpu(rt_cache_stat, raw_smp_processor_id()).field++) + (__raw_get_cpu_var(rt_cache_stat).field++) static int rt_intern_hash(unsigned hash, struct rtable *rth, struct rtable **res);