--- Documentation/stable_api_nonsense.txt | 3 Makefile | 11 arch/arm/boot/compressed/head.S | 13 arch/arm/kernel/entry-common.S | 109 + arch/arm/kernel/fiq.c | 4 arch/arm/kernel/irq.c | 4 arch/arm/kernel/traps.c | 1 arch/arm/mm/copypage-v4mc.c | 4 arch/arm/mm/copypage-xscale.c | 4 arch/arm/mm/fault.c | 14 arch/powerpc/kernel/time.c | 1 arch/ppc/boot/Makefile | 9 arch/x86/Kconfig | 8 arch/x86/Makefile_32 | 4 arch/x86/boot/compressed/Makefile_32 | 1 arch/x86/ia32/ia32entry.S | 11 arch/x86/kernel/Makefile_32 | 1 arch/x86/kernel/apic_32.c | 2 arch/x86/kernel/entry_32.S | 15 arch/x86/kernel/entry_64.S | 45 arch/x86/kernel/head64.c | 3 arch/x86/kernel/hpet.c | 2 arch/x86/kernel/i8253.c | 2 arch/x86/kernel/irq_32.c | 7 arch/x86/kernel/irq_64.c | 6 arch/x86/kernel/mcount-wrapper.S | 27 arch/x86/kernel/setup64.c | 4 arch/x86/kernel/smpboot_64.c | 2 arch/x86/kernel/stacktrace.c | 2 arch/x86/kernel/traps_32.c | 1 arch/x86/kernel/traps_64.c | 1 arch/x86/kernel/tsc_32.c | 4 arch/x86/kernel/tsc_64.c | 4 arch/x86/kernel/vsyscall_64.c | 2 arch/x86/lib/delay_32.c | 6 arch/x86/mm/fault_32.c | 1 arch/x86/mm/init_32.c | 2 arch/x86/power/cpu.c | 3 drivers/acpi/namespace/nsutils.c | 2 drivers/acpi/namespace/nswalk.c | 2 drivers/clocksource/acpi_pm.c | 8 fs/proc/proc_misc.c | 17 include/asm-arm/pgalloc.h | 4 include/asm-arm/timex.h | 10 include/asm-arm/unistd.h | 4 include/asm-x86/calling.h | 50 include/asm-x86/irq_32.h | 2 include/asm-x86/module_32.h | 8 include/asm-x86/processor_32.h | 2 include/asm-x86/unistd_64.h | 2 include/linux/clocksource.h | 23 include/linux/kernel.h | 2 include/linux/latency_hist.h | 32 include/linux/linkage.h | 2 include/linux/prctl.h | 1 include/linux/preempt.h | 20 include/linux/sched.h | 109 + init/main.c | 2 kernel/Makefile | 5 kernel/exit.c | 1 kernel/fork.c | 2 kernel/latency_hist.c | 267 +++ kernel/latency_trace.c | 2799 ++++++++++++++++++++++++++++++++++ kernel/lockdep.c | 41 kernel/panic.c | 2 kernel/printk.c | 19 kernel/sched.c | 92 - kernel/sched_fair.c | 3 kernel/sched_rt.c | 2 kernel/softirq.c | 6 kernel/spinlock.c | 4 kernel/sys.c | 8 kernel/sysctl.c | 128 + kernel/time/timekeeping.c | 27 lib/Kconfig.debug | 186 ++ lib/debug_locks.c | 8 lib/spinlock_debug.c | 2 scripts/Makefile | 1 scripts/trace-it.c | 79 79 files changed, 4218 insertions(+), 109 deletions(-) Index: linux/Documentation/stable_api_nonsense.txt =================================================================== --- linux.orig/Documentation/stable_api_nonsense.txt +++ linux/Documentation/stable_api_nonsense.txt @@ -62,6 +62,9 @@ consider the following facts about the L - different structures can contain different fields - Some functions may not be implemented at all, (i.e. some locks compile away to nothing for non-SMP builds.) + - Parameter passing of variables from function to function can be + done in different ways (the CONFIG_REGPARM option controls + this.) - Memory within the kernel can be aligned in different ways, depending on the build options. - Linux runs on a wide range of different processor architectures. Index: linux/Makefile =================================================================== --- linux.orig/Makefile +++ linux/Makefile @@ -508,10 +508,15 @@ endif include $(srctree)/arch/$(SRCARCH)/Makefile -ifdef CONFIG_FRAME_POINTER -KBUILD_CFLAGS += -fno-omit-frame-pointer -fno-optimize-sibling-calls + +ifdef CONFIG_MCOUNT +KBUILD_CFLAGS += -pg -fno-omit-frame-pointer -fno-optimize-sibling-calls else -KBUILD_CFLAGS += -fomit-frame-pointer + ifdef CONFIG_FRAME_POINTER + KBUILD_CFLAGS += -fno-omit-frame-pointer -fno-optimize-sibling-calls + else + KBUILD_CFLAGS += -fomit-frame-pointer + endif endif ifdef CONFIG_DEBUG_INFO Index: linux/arch/arm/boot/compressed/head.S =================================================================== --- linux.orig/arch/arm/boot/compressed/head.S +++ linux/arch/arm/boot/compressed/head.S @@ -928,6 +928,19 @@ memdump: mov r12, r0 #endif .ltorg +#ifdef CONFIG_MCOUNT +/* CONFIG_MCOUNT causes boot header to be built with -pg requiring this + * trampoline + */ + .text + .align 0 + .type mcount %function + .global mcount +mcount: + mov pc, lr @ just return +#endif + + reloc_end: .align Index: linux/arch/arm/kernel/entry-common.S =================================================================== --- linux.orig/arch/arm/kernel/entry-common.S +++ linux/arch/arm/kernel/entry-common.S @@ -3,6 +3,8 @@ * * Copyright (C) 2000 Russell King * + * FUNCTION_TRACE/mcount support (C) 2005 Timesys john.cooper@timesys.com + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. @@ -395,5 +397,112 @@ ENTRY(sys_oabi_call_table) #undef ABI #undef OBSOLETE +#ifdef CONFIG_FRAME_POINTER + +#ifdef CONFIG_MCOUNT +/* + * At the point where we are in mcount() we maintain the + * frame of the prologue code and keep the call to mcount() + * out of the stack frame list: + + saved pc <---\ caller of instrumented routine + saved lr | + ip/prev_sp | + fp -----^ | + : | + | + -> saved pc | instrumented routine + | saved lr | + | ip/prev_sp | + | fp ---------/ + | : + | + | mcount + | saved pc + | saved lr + | ip/prev sp + -- fp + r3 + r2 + r1 + sp-> r0 + : + */ + + .text + .align 0 + .type mcount %function + .global mcount + +/* gcc -pg generated FUNCTION_PROLOGUE references mcount() + * and has already created the stack frame invocation for + * the routine we have been called to instrument. We create + * a complete frame nevertheless, as we want to use the same + * call to mcount() from c code. + */ +mcount: + + ldr ip, =mcount_enabled @ leave early, if disabled + ldr ip, [ip] + cmp ip, #0 + moveq pc,lr + + mov ip, sp + stmdb sp!, {r0 - r3, fp, ip, lr, pc} @ create stack frame + + ldr r1, [fp, #-4] @ get lr (the return address + @ of the caller of the + @ instrumented function) + mov r0, lr @ get lr - (the return address + @ of the instrumented function) + + sub fp, ip, #4 @ point fp at this frame + + bl __trace +1: + ldmdb fp, {r0 - r3, fp, sp, pc} @ pop entry frame and return + +#endif + +/* ARM replacement for unsupported gcc __builtin_return_address(n) + * where 0 < n. n == 0 is supported here as well. + * + * Walk up the stack frame until the desired frame is found or a NULL + * fp is encountered, return NULL in the latter case. + * + * Note: it is possible under code optimization for the stack invocation + * of an ancestor function (level N) to be removed before calling a + * descendant function (level N+1). No easy means is available to deduce + * this scenario with the result being [for example] caller_addr(0) when + * called from level N+1 returning level N-1 rather than the expected + * level N. This optimization issue appears isolated to the case of + * a call to a level N+1 routine made at the tail end of a level N + * routine -- the level N frame is deleted and a simple branch is made + * to the level N+1 routine. + */ + + .text + .align 0 + .type arm_return_addr %function + .global arm_return_addr + +arm_return_addr: + mov ip, r0 + mov r0, fp +3: + cmp r0, #0 + beq 1f @ frame list hit end, bail + cmp ip, #0 + beq 2f @ reached desired frame + ldr r0, [r0, #-12] @ else continue, get next fp + sub ip, ip, #1 + b 3b +2: + ldr r0, [r0, #-4] @ get target return address +1: + mov pc, lr + +#endif + #endif Index: linux/arch/arm/kernel/fiq.c =================================================================== --- linux.orig/arch/arm/kernel/fiq.c +++ linux/arch/arm/kernel/fiq.c @@ -89,7 +89,7 @@ void set_fiq_handler(void *start, unsign * disable irqs for the duration. Note - these functions are almost * entirely coded in assembly. */ -void __attribute__((naked)) set_fiq_regs(struct pt_regs *regs) +void notrace __attribute__((naked)) set_fiq_regs(struct pt_regs *regs) { register unsigned long tmp; asm volatile ( @@ -107,7 +107,7 @@ void __attribute__((naked)) set_fiq_regs : "r" (®s->ARM_r8), "I" (PSR_I_BIT | PSR_F_BIT | FIQ_MODE)); } -void __attribute__((naked)) get_fiq_regs(struct pt_regs *regs) +void notrace __attribute__((naked)) get_fiq_regs(struct pt_regs *regs) { register unsigned long tmp; asm volatile ( Index: linux/arch/arm/kernel/irq.c =================================================================== --- linux.orig/arch/arm/kernel/irq.c +++ linux/arch/arm/kernel/irq.c @@ -108,11 +108,13 @@ static struct irq_desc bad_irq_desc = { * come via this function. Instead, they should provide their * own 'handler' */ -asmlinkage void __exception asm_do_IRQ(unsigned int irq, struct pt_regs *regs) +asmlinkage void __exception notrace asm_do_IRQ(unsigned int irq, struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); struct irq_desc *desc = irq_desc + irq; + trace_special(instruction_pointer(regs), irq, 0); + /* * Some hardware gives randomly wrong interrupts. Rather * than crashing, do something sensible. Index: linux/arch/arm/kernel/traps.c =================================================================== --- linux.orig/arch/arm/kernel/traps.c +++ linux/arch/arm/kernel/traps.c @@ -354,6 +354,7 @@ asmlinkage void do_unexp_fiq (struct pt_ { printk("Hmm. Unexpected FIQ received, but trying to continue\n"); printk("You may have a hardware problem...\n"); + print_traces(current); } /* Index: linux/arch/arm/mm/copypage-v4mc.c =================================================================== --- linux.orig/arch/arm/mm/copypage-v4mc.c +++ linux/arch/arm/mm/copypage-v4mc.c @@ -44,7 +44,7 @@ static DEFINE_SPINLOCK(minicache_lock); * instruction. If your processor does not supply this, you have to write your * own copy_user_page that does the right thing. */ -static void __attribute__((naked)) +static void notrace __attribute__((naked)) mc_copy_user_page(void *from, void *to) { asm volatile( @@ -88,7 +88,7 @@ void v4_mc_copy_user_page(void *kto, con /* * ARMv4 optimised clear_user_page */ -void __attribute__((naked)) +void notrace __attribute__((naked)) v4_mc_clear_user_page(void *kaddr, unsigned long vaddr) { asm volatile( Index: linux/arch/arm/mm/copypage-xscale.c =================================================================== --- linux.orig/arch/arm/mm/copypage-xscale.c +++ linux/arch/arm/mm/copypage-xscale.c @@ -42,7 +42,7 @@ static DEFINE_SPINLOCK(minicache_lock); * Dcache aliasing issue. The writes will be forwarded to the write buffer, * and merged as appropriate. */ -static void __attribute__((naked)) +static void notrace __attribute__((naked)) mc_copy_user_page(void *from, void *to) { /* @@ -110,7 +110,7 @@ void xscale_mc_copy_user_page(void *kto, /* * XScale optimised clear_user_page */ -void __attribute__((naked)) +void notrace __attribute__((naked)) xscale_mc_clear_user_page(void *kaddr, unsigned long vaddr) { asm volatile( Index: linux/arch/arm/mm/fault.c =================================================================== --- linux.orig/arch/arm/mm/fault.c +++ linux/arch/arm/mm/fault.c @@ -215,7 +215,7 @@ out: return fault; } -static int +static notrace int do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { struct task_struct *tsk; @@ -311,7 +311,7 @@ no_context: * interrupt or a critical region, and should only copy the information * from the master page table, nothing more. */ -static int +static notrace int do_translation_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { @@ -354,7 +354,7 @@ bad_area: * Some section permission faults need to be handled gracefully. * They can happen due to a __{get,put}_user during an oops. */ -static int +static notrace int do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { do_bad_area(addr, fsr, regs); @@ -364,7 +364,7 @@ do_sect_fault(unsigned long addr, unsign /* * This abort handler always returns "fault". */ -static int +static notrace int do_bad(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { return 1; @@ -419,7 +419,7 @@ static struct fsr_info { { do_bad, SIGBUS, 0, "unknown 31" } }; -void __init +void __init notrace hook_fault_code(int nr, int (*fn)(unsigned long, unsigned int, struct pt_regs *), int sig, const char *name) { @@ -433,7 +433,7 @@ hook_fault_code(int nr, int (*fn)(unsign /* * Dispatch a data abort to the relevant handler. */ -asmlinkage void __exception +asmlinkage void __exception notrace do_DataAbort(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { const struct fsr_info *inf = fsr_info + (fsr & 15) + ((fsr & (1 << 10)) >> 6); @@ -452,7 +452,7 @@ do_DataAbort(unsigned long addr, unsigne arm_notify_die("", regs, &info, fsr, 0); } -asmlinkage void __exception +asmlinkage void __exception notrace do_PrefetchAbort(unsigned long addr, struct pt_regs *regs) { do_translation_fault(addr, 0, regs); Index: linux/arch/powerpc/kernel/time.c =================================================================== --- linux.orig/arch/powerpc/kernel/time.c +++ linux/arch/powerpc/kernel/time.c @@ -882,6 +882,7 @@ void __init time_init(void) tb_ticks_per_jiffy = ppc_tb_freq / HZ; tb_ticks_per_sec = ppc_tb_freq; tb_ticks_per_usec = ppc_tb_freq / 1000000; + cpu_khz = ppc_tb_freq / 1000; tb_to_us = mulhwu_scale_factor(ppc_tb_freq, 1000000); calc_cputime_factors(); Index: linux/arch/ppc/boot/Makefile =================================================================== --- linux.orig/arch/ppc/boot/Makefile +++ linux/arch/ppc/boot/Makefile @@ -15,6 +15,15 @@ # KBUILD_CFLAGS used when building rest of boot (takes effect recursively) KBUILD_CFLAGS += -fno-builtin -D__BOOTER__ -Iarch/$(ARCH)/boot/include + +ifdef CONFIG_MCOUNT +# do not trace the boot loader +nullstring := +space := $(nullstring) # end of the line +pg_flag = $(nullstring) -pg # end of the line +CFLAGS := $(subst ${pg_flag},${space},${CFLAGS}) +endif + HOSTCFLAGS += -Iarch/$(ARCH)/boot/include BOOT_TARGETS = zImage zImage.initrd znetboot znetboot.initrd Index: linux/arch/x86/Kconfig =================================================================== --- linux.orig/arch/x86/Kconfig +++ linux/arch/x86/Kconfig @@ -1017,6 +1017,14 @@ config BOOT_IOREMAP depends on X86_32 && (((X86_SUMMIT || X86_GENERICARCH) && NUMA) || (X86 && EFI)) default y +# +# function tracing might turn this off: +# +config REGPARM + bool + depends on !MCOUNT + default y + config SECCOMP bool "Enable seccomp to safely compute untrusted bytecode" depends on PROC_FS Index: linux/arch/x86/Makefile_32 =================================================================== --- linux.orig/arch/x86/Makefile_32 +++ linux/arch/x86/Makefile_32 @@ -37,7 +37,7 @@ LDFLAGS_vmlinux := --emit-relocs endif CHECKFLAGS += -D__i386__ -KBUILD_CFLAGS += -pipe -msoft-float -mregparm=3 -freg-struct-return +KBUILD_CFLAGS += -pipe -msoft-float -freg-struct-return # prevent gcc from keeping the stack 16 byte aligned KBUILD_CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2) @@ -45,6 +45,8 @@ KBUILD_CFLAGS += $(call cc-option,-mpref # CPU-specific tuning. Anything which can be shared with UML should go here. include $(srctree)/arch/x86/Makefile_32.cpu +cflags-$(CONFIG_REGPARM) += -mregparm=3 + # temporary until string.h is fixed cflags-y += -ffreestanding Index: linux/arch/x86/boot/compressed/Makefile_32 =================================================================== --- linux.orig/arch/x86/boot/compressed/Makefile_32 +++ linux/arch/x86/boot/compressed/Makefile_32 @@ -9,6 +9,7 @@ targets := vmlinux vmlinux.bin vmlinux. EXTRA_AFLAGS := -traditional LDFLAGS_vmlinux := -T +#CFLAGS := -m32 -D__KERNEL__ -Iinclude -O2 -fno-strict-aliasing hostprogs-y := relocs KBUILD_CFLAGS := -m32 -D__KERNEL__ $(LINUX_INCLUDE) -O2 \ Index: linux/arch/x86/ia32/ia32entry.S =================================================================== --- linux.orig/arch/x86/ia32/ia32entry.S +++ linux/arch/x86/ia32/ia32entry.S @@ -132,7 +132,9 @@ sysenter_do_call: cmpl $(IA32_NR_syscalls-1),%eax ja ia32_badsys IA32_ARG_FIXUP 1 + TRACE_SYS_IA32_CALL call *ia32_sys_call_table(,%rax,8) + TRACE_SYS_RET movq %rax,RAX-ARGOFFSET(%rsp) GET_THREAD_INFO(%r10) cli @@ -237,7 +239,9 @@ cstar_do_call: cmpl $IA32_NR_syscalls-1,%eax ja ia32_badsys IA32_ARG_FIXUP 1 + TRACE_SYS_IA32_CALL call *ia32_sys_call_table(,%rax,8) + TRACE_SYS_RET movq %rax,RAX-ARGOFFSET(%rsp) GET_THREAD_INFO(%r10) cli @@ -328,8 +332,10 @@ ia32_do_syscall: cmpl $(IA32_NR_syscalls-1),%eax ja ia32_badsys IA32_ARG_FIXUP + TRACE_SYS_IA32_CALL call *ia32_sys_call_table(,%rax,8) # xxx: rip relative ia32_sysret: + TRACE_SYS_RET movq %rax,RAX-ARGOFFSET(%rsp) jmp int_ret_from_sys_call @@ -400,7 +406,7 @@ END(ia32_ptregs_common) .section .rodata,"a" .align 8 -ia32_sys_call_table: +ENTRY(ia32_sys_call_table) .quad sys_restart_syscall .quad sys_exit .quad stub32_fork @@ -726,4 +732,7 @@ ia32_sys_call_table: .quad compat_sys_timerfd .quad sys_eventfd .quad sys32_fallocate +#ifdef CONFIG_EVENT_TRACE + .globl ia32_syscall_end +#endif ia32_syscall_end: Index: linux/arch/x86/kernel/Makefile_32 =================================================================== --- linux.orig/arch/x86/kernel/Makefile_32 +++ linux/arch/x86/kernel/Makefile_32 @@ -23,6 +23,7 @@ obj-$(CONFIG_APM) += apm_32.o obj-$(CONFIG_X86_SMP) += smp_32.o smpboot_32.o tsc_sync.o obj-$(CONFIG_SMP) += smpcommon_32.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_32.o +obj-$(CONFIG_MCOUNT) += mcount-wrapper.o obj-$(CONFIG_X86_MPPARSE) += mpparse_32.o obj-$(CONFIG_X86_LOCAL_APIC) += apic_32.o nmi_32.o obj-$(CONFIG_X86_IO_APIC) += io_apic_32.o Index: linux/arch/x86/kernel/apic_32.c =================================================================== --- linux.orig/arch/x86/kernel/apic_32.c +++ linux/arch/x86/kernel/apic_32.c @@ -581,6 +581,8 @@ void fastcall smp_apic_timer_interrupt(s { struct pt_regs *old_regs = set_irq_regs(regs); +// trace_special(regs->eip, 1, 0); + /* * NOTE! We'd better ACK the irq immediately, * because timer handling can be slow. Index: linux/arch/x86/kernel/entry_32.S =================================================================== --- linux.orig/arch/x86/kernel/entry_32.S +++ linux/arch/x86/kernel/entry_32.S @@ -330,6 +330,11 @@ sysenter_past_esp: pushl %eax CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL +#ifdef CONFIG_EVENT_TRACE + pushl %edx; pushl %ecx; pushl %ebx; pushl %eax + call sys_call + popl %eax; popl %ebx; popl %ecx; popl %edx +#endif GET_THREAD_INFO(%ebp) /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ @@ -345,6 +350,11 @@ sysenter_past_esp: movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx jne syscall_exit_work +#ifdef CONFIG_EVENT_TRACE + pushl %eax + call sys_ret + popl %eax +#endif /* if something modifies registers it must also disable sysexit */ movl PT_EIP(%esp), %edx movl PT_OLDESP(%esp), %ecx @@ -368,6 +378,11 @@ ENTRY(system_call) pushl %eax # save orig_eax CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL +#ifdef CONFIG_EVENT_TRACE + pushl %edx; pushl %ecx; pushl %ebx; pushl %eax + call sys_call + popl %eax; popl %ebx; popl %ecx; popl %edx +#endif GET_THREAD_INFO(%ebp) # system call tracing in operation / emulation /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ Index: linux/arch/x86/kernel/entry_64.S =================================================================== --- linux.orig/arch/x86/kernel/entry_64.S +++ linux/arch/x86/kernel/entry_64.S @@ -53,6 +53,47 @@ .code64 +#ifdef CONFIG_EVENT_TRACE + +ENTRY(mcount) + cmpl $0, mcount_enabled + jz out + + push %rbp + mov %rsp,%rbp + + push %r11 + push %r10 + push %r9 + push %r8 + push %rdi + push %rsi + push %rdx + push %rcx + push %rax + + mov 0x0(%rbp),%rax + mov 0x8(%rbp),%rdi + mov 0x8(%rax),%rsi + + call __trace + + pop %rax + pop %rcx + pop %rdx + pop %rsi + pop %rdi + pop %r8 + pop %r9 + pop %r10 + pop %r11 + + pop %rbp +out: + ret + +#endif + #ifndef CONFIG_PREEMPT #define retint_kernel retint_restore_args #endif @@ -234,7 +275,9 @@ ENTRY(system_call) cmpq $__NR_syscall_max,%rax ja badsys movq %r10,%rcx + TRACE_SYS_CALL call *sys_call_table(,%rax,8) # XXX: rip relative + TRACE_SYS_RET movq %rax,RAX-ARGOFFSET(%rsp) /* * Syscall return path ending with SYSRET (fast path) @@ -317,7 +360,9 @@ tracesys: cmova %rcx,%rax ja 1f movq %r10,%rcx /* fixup for C */ + TRACE_SYS_CALL call *sys_call_table(,%rax,8) + TRACE_SYS_RET 1: movq %rax,RAX-ARGOFFSET(%rsp) /* Use IRET because user could have changed frame */ Index: linux/arch/x86/kernel/head64.c =================================================================== --- linux.orig/arch/x86/kernel/head64.c +++ linux/arch/x86/kernel/head64.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -46,7 +47,7 @@ static void __init copy_bootdata(char *r } } -void __init x86_64_start_kernel(char * real_mode_data) +void __init notrace x86_64_start_kernel(char * real_mode_data) { int i; Index: linux/arch/x86/kernel/hpet.c =================================================================== --- linux.orig/arch/x86/kernel/hpet.c +++ linux/arch/x86/kernel/hpet.c @@ -295,7 +295,7 @@ static int hpet_legacy_next_event(unsign /* * Clock source related code */ -static cycle_t read_hpet(void) +static cycle_t notrace read_hpet(void) { return (cycle_t)hpet_readl(HPET_COUNTER); } Index: linux/arch/x86/kernel/i8253.c =================================================================== --- linux.orig/arch/x86/kernel/i8253.c +++ linux/arch/x86/kernel/i8253.c @@ -125,7 +125,7 @@ void __init setup_pit_timer(void) * to just read by itself. So use jiffies to emulate a free * running counter: */ -static cycle_t pit_read(void) +static notrace cycle_t pit_read(void) { unsigned long flags; int count; Index: linux/arch/x86/kernel/irq_32.c =================================================================== --- linux.orig/arch/x86/kernel/irq_32.c +++ linux/arch/x86/kernel/irq_32.c @@ -66,7 +66,7 @@ static union irq_ctx *softirq_ctx[NR_CPU * SMP cross-CPU interrupts have their own specific * handlers). */ -fastcall unsigned int do_IRQ(struct pt_regs *regs) +fastcall notrace unsigned int do_IRQ(struct pt_regs *regs) { struct pt_regs *old_regs; /* high bit used in ret_from_ code */ @@ -85,6 +85,11 @@ fastcall unsigned int do_IRQ(struct pt_r old_regs = set_irq_regs(regs); irq_enter(); +#ifdef CONFIG_EVENT_TRACE + if (irq == trace_user_trigger_irq) + user_trace_start(); +#endif +// trace_special(regs->eip, irq, 0); #ifdef CONFIG_DEBUG_STACKOVERFLOW /* Debugging check for stack overflow: is there less than 1KB free? */ { Index: linux/arch/x86/kernel/irq_64.c =================================================================== --- linux.orig/arch/x86/kernel/irq_64.c +++ linux/arch/x86/kernel/irq_64.c @@ -149,6 +149,12 @@ asmlinkage unsigned int do_IRQ(struct pt irq_enter(); irq = __get_cpu_var(vector_irq)[vector]; +#ifdef CONFIG_EVENT_TRACE + if (irq == trace_user_trigger_irq) + user_trace_start(); +#endif + trace_special(regs->rip, irq, 0); + #ifdef CONFIG_DEBUG_STACKOVERFLOW stack_overflow_check(regs); #endif Index: linux/arch/x86/kernel/mcount-wrapper.S =================================================================== --- /dev/null +++ linux/arch/x86/kernel/mcount-wrapper.S @@ -0,0 +1,27 @@ +/* + * linux/arch/i386/mcount-wrapper.S + * + * Copyright (C) 2004 Ingo Molnar + */ + +.globl mcount +mcount: + + cmpl $0, mcount_enabled + jz out + + push %ebp + mov %esp, %ebp + pushl %eax + pushl %ecx + pushl %edx + + call __mcount + + popl %edx + popl %ecx + popl %eax + popl %ebp +out: + ret + Index: linux/arch/x86/kernel/setup64.c =================================================================== --- linux.orig/arch/x86/kernel/setup64.c +++ linux/arch/x86/kernel/setup64.c @@ -114,7 +114,7 @@ void __init setup_per_cpu_areas(void) } } -void pda_init(int cpu) +void notrace pda_init(int cpu) { struct x8664_pda *pda = cpu_pda(cpu); @@ -197,7 +197,7 @@ DEFINE_PER_CPU(struct orig_ist, orig_ist * 'CPU state barrier', nothing should get across. * A lot of state is already set up in PDA init. */ -void __cpuinit cpu_init (void) +void __cpuinit notrace cpu_init (void) { int cpu = stack_smp_processor_id(); struct tss_struct *t = &per_cpu(init_tss, cpu); Index: linux/arch/x86/kernel/smpboot_64.c =================================================================== --- linux.orig/arch/x86/kernel/smpboot_64.c +++ linux/arch/x86/kernel/smpboot_64.c @@ -317,7 +317,7 @@ static inline void set_cpu_sibling_map(i /* * Setup code on secondary processor (after comming out of the trampoline) */ -void __cpuinit start_secondary(void) +void __cpuinit notrace start_secondary(void) { /* * Dont put anything before smp_callin(), SMP Index: linux/arch/x86/kernel/stacktrace.c =================================================================== --- linux.orig/arch/x86/kernel/stacktrace.c +++ linux/arch/x86/kernel/stacktrace.c @@ -22,7 +22,7 @@ static int save_stack_stack(void *data, return -1; } -static void save_stack_address(void *data, unsigned long addr) +static void notrace save_stack_address(void *data, unsigned long addr) { struct stack_trace *trace = (struct stack_trace *)data; if (trace->skip > 0) { Index: linux/arch/x86/kernel/traps_32.c =================================================================== --- linux.orig/arch/x86/kernel/traps_32.c +++ linux/arch/x86/kernel/traps_32.c @@ -237,6 +237,7 @@ show_trace_log_lvl(struct task_struct *t { dump_trace(task, regs, stack, &print_trace_ops, log_lvl); printk("%s =======================\n", log_lvl); + print_traces(task); } void show_trace(struct task_struct *task, struct pt_regs *regs, Index: linux/arch/x86/kernel/traps_64.c =================================================================== --- linux.orig/arch/x86/kernel/traps_64.c +++ linux/arch/x86/kernel/traps_64.c @@ -318,6 +318,7 @@ print_trace_warning_symbol(void *data, c { print_symbol(msg, symbol); printk("\n"); + print_traces(tsk); } static void print_trace_warning(void *data, char *msg) Index: linux/arch/x86/kernel/tsc_32.c =================================================================== --- linux.orig/arch/x86/kernel/tsc_32.c +++ linux/arch/x86/kernel/tsc_32.c @@ -92,7 +92,7 @@ static inline void set_cyc2ns_scale(unsi /* * Scheduler clock - returns current time in nanosec units. */ -unsigned long long native_sched_clock(void) +unsigned notrace long long native_sched_clock(void) { unsigned long long this_offset; @@ -269,7 +269,7 @@ core_initcall(cpufreq_tsc); static unsigned long current_tsc_khz = 0; -static cycle_t read_tsc(void) +static notrace cycle_t read_tsc(void) { cycle_t ret; Index: linux/arch/x86/kernel/tsc_64.c =================================================================== --- linux.orig/arch/x86/kernel/tsc_64.c +++ linux/arch/x86/kernel/tsc_64.c @@ -248,13 +248,13 @@ __setup("notsc", notsc_setup); /* clock source code: */ -static cycle_t read_tsc(void) +static notrace cycle_t read_tsc(void) { cycle_t ret = (cycle_t)get_cycles_sync(); return ret; } -static cycle_t __vsyscall_fn vread_tsc(void) +static notrace cycle_t __vsyscall_fn vread_tsc(void) { cycle_t ret = (cycle_t)get_cycles_sync(); return ret; Index: linux/arch/x86/kernel/vsyscall_64.c =================================================================== --- linux.orig/arch/x86/kernel/vsyscall_64.c +++ linux/arch/x86/kernel/vsyscall_64.c @@ -42,7 +42,7 @@ #include #include -#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) +#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) notrace #define __syscall_clobber "r11","rcx","memory" #define __pa_vsymbol(x) \ ({unsigned long v; \ Index: linux/arch/x86/lib/delay_32.c =================================================================== --- linux.orig/arch/x86/lib/delay_32.c +++ linux/arch/x86/lib/delay_32.c @@ -24,7 +24,7 @@ #endif /* simple loop based delay: */ -static void delay_loop(unsigned long loops) +static notrace void delay_loop(unsigned long loops) { int d0; @@ -39,7 +39,7 @@ static void delay_loop(unsigned long loo } /* TSC based delay: */ -static void delay_tsc(unsigned long loops) +static notrace void delay_tsc(unsigned long loops) { unsigned long bclock, now; @@ -72,7 +72,7 @@ int read_current_timer(unsigned long *ti return -1; } -void __delay(unsigned long loops) +void notrace __delay(unsigned long loops) { delay_fn(loops); } Index: linux/arch/x86/mm/fault_32.c =================================================================== --- linux.orig/arch/x86/mm/fault_32.c +++ linux/arch/x86/mm/fault_32.c @@ -498,6 +498,7 @@ bad_area_nosemaphore: nr = (address - idt_descr.address) >> 3; if (nr == 6) { + stop_trace(); do_invalid_op(regs, 0); return; } Index: linux/arch/x86/mm/init_32.c =================================================================== --- linux.orig/arch/x86/mm/init_32.c +++ linux/arch/x86/mm/init_32.c @@ -200,7 +200,7 @@ static inline int page_kills_ppro(unsign return 0; } -int page_is_ram(unsigned long pagenr) +int notrace page_is_ram(unsigned long pagenr) { int i; unsigned long addr, end; Index: linux/arch/x86/power/cpu.c =================================================================== --- linux.orig/arch/x86/power/cpu.c +++ linux/arch/x86/power/cpu.c @@ -123,8 +123,9 @@ void __restore_processor_state(struct sa mcheck_init(&boot_cpu_data); } -void restore_processor_state(void) +void notrace restore_processor_state(void) { + trace_resume(); __restore_processor_state(&saved_context); } Index: linux/drivers/acpi/namespace/nsutils.c =================================================================== --- linux.orig/drivers/acpi/namespace/nsutils.c +++ linux/drivers/acpi/namespace/nsutils.c @@ -923,7 +923,7 @@ struct acpi_namespace_node *acpi_ns_get_ * ******************************************************************************/ -struct acpi_namespace_node *acpi_ns_get_next_valid_node(struct +struct acpi_namespace_node * notrace acpi_ns_get_next_valid_node(struct acpi_namespace_node *node) { Index: linux/drivers/acpi/namespace/nswalk.c =================================================================== --- linux.orig/drivers/acpi/namespace/nswalk.c +++ linux/drivers/acpi/namespace/nswalk.c @@ -65,7 +65,7 @@ ACPI_MODULE_NAME("nswalk") * within Scope is returned. * ******************************************************************************/ -struct acpi_namespace_node *acpi_ns_get_next_node(acpi_object_type type, struct acpi_namespace_node +struct acpi_namespace_node * notrace acpi_ns_get_next_node(acpi_object_type type, struct acpi_namespace_node *parent_node, struct acpi_namespace_node *child_node) { Index: linux/drivers/clocksource/acpi_pm.c =================================================================== --- linux.orig/drivers/clocksource/acpi_pm.c +++ linux/drivers/clocksource/acpi_pm.c @@ -30,13 +30,13 @@ */ u32 pmtmr_ioport __read_mostly; -static inline u32 read_pmtmr(void) +static notrace inline u32 read_pmtmr(void) { /* mask the output to 24 bits */ return inl(pmtmr_ioport) & ACPI_PM_MASK; } -u32 acpi_pm_read_verified(void) +u32 notrace acpi_pm_read_verified(void) { u32 v1 = 0, v2 = 0, v3 = 0; @@ -56,12 +56,12 @@ u32 acpi_pm_read_verified(void) return v2; } -static cycle_t acpi_pm_read_slow(void) +static notrace cycle_t acpi_pm_read_slow(void) { return (cycle_t)acpi_pm_read_verified(); } -static cycle_t acpi_pm_read(void) +static notrace cycle_t acpi_pm_read(void) { return (cycle_t)read_pmtmr(); } Index: linux/fs/proc/proc_misc.c =================================================================== --- linux.orig/fs/proc/proc_misc.c +++ linux/fs/proc/proc_misc.c @@ -653,6 +653,20 @@ static int execdomains_read_proc(char *p return proc_calc_metrics(page, start, off, count, eof, len); } +#ifdef CONFIG_EVENT_TRACE +extern struct seq_operations latency_trace_op; +static int latency_trace_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &latency_trace_op); +} +static struct file_operations proc_latency_trace_operations = { + .open = latency_trace_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; +#endif + #ifdef CONFIG_MAGIC_SYSRQ /* * writing 'C' to /proc/sysrq-trigger is like sysrq-C @@ -747,6 +761,9 @@ void __init proc_misc_init(void) #ifdef CONFIG_SCHEDSTATS create_seq_entry("schedstat", 0, &proc_schedstat_operations); #endif +#ifdef CONFIG_EVENT_TRACE + create_seq_entry("latency_trace", 0, &proc_latency_trace_operations); +#endif #ifdef CONFIG_PROC_KCORE proc_root_kcore = create_proc_entry("kcore", S_IRUSR, NULL); if (proc_root_kcore) { Index: linux/include/asm-arm/pgalloc.h =================================================================== --- linux.orig/include/asm-arm/pgalloc.h +++ linux/include/asm-arm/pgalloc.h @@ -109,7 +109,7 @@ static inline void __pmd_populate(pmd_t * * Ensure that we always set both PMD entries. */ -static inline void +static inline void notrace pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp, pte_t *ptep) { unsigned long pte_ptr = (unsigned long)ptep; @@ -122,7 +122,7 @@ pmd_populate_kernel(struct mm_struct *mm __pmd_populate(pmdp, __pa(pte_ptr) | _PAGE_KERNEL_TABLE); } -static inline void +static inline void notrace pmd_populate(struct mm_struct *mm, pmd_t *pmdp, struct page *ptep) { __pmd_populate(pmdp, page_to_pfn(ptep) << PAGE_SHIFT | _PAGE_USER_TABLE); Index: linux/include/asm-arm/timex.h =================================================================== --- linux.orig/include/asm-arm/timex.h +++ linux/include/asm-arm/timex.h @@ -16,9 +16,17 @@ typedef unsigned long cycles_t; +#ifndef mach_read_cycles + #define mach_read_cycles() (0) +#ifdef CONFIG_LATENCY_TIMING + #define mach_cycles_to_usecs(d) (d) + #define mach_usecs_to_cycles(d) (d) +#endif +#endif + static inline cycles_t get_cycles (void) { - return 0; + return mach_read_cycles(); } #endif Index: linux/include/asm-arm/unistd.h =================================================================== --- linux.orig/include/asm-arm/unistd.h +++ linux/include/asm-arm/unistd.h @@ -380,6 +380,10 @@ #define __NR_eventfd (__NR_SYSCALL_BASE+351) #define __NR_fallocate (__NR_SYSCALL_BASE+352) +#ifndef __ASSEMBLY__ +#define NR_syscalls (__NR_set_mempolicy + 1 - __NR_SYSCALL_BASE) +#endif + /* * The following SWIs are ARM private. */ Index: linux/include/asm-x86/calling.h =================================================================== --- linux.orig/include/asm-x86/calling.h +++ linux/include/asm-x86/calling.h @@ -160,3 +160,53 @@ .macro icebp .byte 0xf1 .endm + +/* + * latency-tracing helpers: + */ + + .macro TRACE_SYS_CALL + +#ifdef CONFIG_EVENT_TRACE + SAVE_ARGS + + mov %rdx, %rcx + mov %rsi, %rdx + mov %rdi, %rsi + mov %rax, %rdi + + call sys_call + + RESTORE_ARGS +#endif + .endm + + + .macro TRACE_SYS_IA32_CALL + +#ifdef CONFIG_EVENT_TRACE + SAVE_ARGS + + mov %rdx, %rcx + mov %rsi, %rdx + mov %rdi, %rsi + mov %rax, %rdi + + call sys_ia32_call + + RESTORE_ARGS +#endif + .endm + + .macro TRACE_SYS_RET + +#ifdef CONFIG_EVENT_TRACE + SAVE_ARGS + + mov %rax, %rdi + + call sys_ret + + RESTORE_ARGS +#endif + .endm Index: linux/include/asm-x86/irq_32.h =================================================================== --- linux.orig/include/asm-x86/irq_32.h +++ linux/include/asm-x86/irq_32.h @@ -41,7 +41,7 @@ extern int irqbalance_disable(char *str) extern void fixup_irqs(cpumask_t map); #endif -unsigned int do_IRQ(struct pt_regs *regs); +extern fastcall notrace unsigned int do_IRQ(struct pt_regs *regs); void init_IRQ(void); void __init native_init_IRQ(void); Index: linux/include/asm-x86/module_32.h =================================================================== --- linux.orig/include/asm-x86/module_32.h +++ linux/include/asm-x86/module_32.h @@ -64,12 +64,18 @@ struct mod_arch_specific #error unknown processor family #endif +#ifdef CONFIG_REGPARM +#define MODULE_REGPARM "REGPARM " +#else +#define MODULE_REGPARM "" +#endif + #ifdef CONFIG_4KSTACKS #define MODULE_STACKSIZE "4KSTACKS " #else #define MODULE_STACKSIZE "" #endif -#define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY MODULE_STACKSIZE +#define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY MODULE_REGPARM MODULE_STACKSIZE #endif /* _ASM_I386_MODULE_H */ Index: linux/include/asm-x86/processor_32.h =================================================================== --- linux.orig/include/asm-x86/processor_32.h +++ linux/include/asm-x86/processor_32.h @@ -134,7 +134,7 @@ extern void detect_ht(struct cpuinfo_x86 static inline void detect_ht(struct cpuinfo_x86 *c) {} #endif -static inline void native_cpuid(unsigned int *eax, unsigned int *ebx, +static inline void fastcall native_cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { /* ecx is often an input as well as an output. */ Index: linux/include/asm-x86/unistd_64.h =================================================================== --- linux.orig/include/asm-x86/unistd_64.h +++ linux/include/asm-x86/unistd_64.h @@ -11,6 +11,8 @@ * Note: holes are not allowed. */ +#define NR_syscalls (__NR_syscall_max+1) + /* at least 8 syscall per cacheline */ #define __NR_read 0 __SYSCALL(__NR_read, sys_read) Index: linux/include/linux/clocksource.h =================================================================== --- linux.orig/include/linux/clocksource.h +++ linux/include/linux/clocksource.h @@ -21,6 +21,9 @@ typedef u64 cycle_t; struct clocksource; +extern unsigned long preempt_max_latency; +extern unsigned long preempt_thresh; + /** * struct clocksource - hardware abstraction for a free running counter * Provides mostly state-free accessors to the underlying hardware. @@ -178,8 +181,20 @@ static inline cycle_t clocksource_read(s */ static inline s64 cyc2ns(struct clocksource *cs, cycle_t cycles) { - u64 ret = (u64)cycles; - ret = (ret * cs->mult) >> cs->shift; + return ((u64)cycles * cs->mult) >> cs->shift; +} + +/** + * ns2cyc - converts nanoseconds to clocksource cycles + * @cs: Pointer to clocksource + * @nsecs: Nanoseconds + */ +static inline cycles_t ns2cyc(struct clocksource *cs, u64 nsecs) +{ + cycles_t ret = nsecs << cs->shift; + + do_div(ret, cs->mult + 1); + return ret; } @@ -232,4 +247,8 @@ static inline void update_vsyscall_tz(vo } #endif +extern cycle_t get_monotonic_cycles(void); +extern unsigned long cycles_to_usecs(cycle_t); +extern cycle_t usecs_to_cycles(unsigned long); + #endif /* _LINUX_CLOCKSOURCE_H */ Index: linux/include/linux/kernel.h =================================================================== --- linux.orig/include/linux/kernel.h +++ linux/include/linux/kernel.h @@ -182,6 +182,8 @@ asmlinkage int printk(const char * fmt, extern int log_buf_get_len(void); extern int log_buf_read(int idx); extern int log_buf_copy(char *dest, int idx, int len); +extern void early_printk(const char *fmt, ...) + __attribute__ ((format (printf, 1, 2))); #else static inline int vprintk(const char *s, va_list args) __attribute__ ((format (printf, 1, 0))); Index: linux/include/linux/latency_hist.h =================================================================== --- /dev/null +++ linux/include/linux/latency_hist.h @@ -0,0 +1,32 @@ +/* + * kernel/latency_hist.h + * + * Add support for histograms of preemption-off latency and + * interrupt-off latency and wakeup latency, it depends on + * Real-Time Preemption Support. + * + * Copyright (C) 2005 MontaVista Software, Inc. + * Yi Yang + * + */ +#ifndef _LINUX_LATENCY_HIST_H_ +#define _LINUX_LATENCY_HIST_H_ + +enum { + INTERRUPT_LATENCY = 0, + PREEMPT_LATENCY, + WAKEUP_LATENCY +}; + +#define MAX_ENTRY_NUM 10240 +#define LATENCY_TYPE_NUM 3 + +#ifdef CONFIG_LATENCY_HIST +extern void latency_hist(int latency_type, int cpu, unsigned long latency); +# define latency_hist_flag 1 +#else +# define latency_hist(a,b,c) do { (void)(cpu); } while (0) +# define latency_hist_flag 0 +#endif /* CONFIG_LATENCY_HIST */ + +#endif /* ifndef _LINUX_LATENCY_HIST_H_ */ Index: linux/include/linux/linkage.h =================================================================== --- linux.orig/include/linux/linkage.h +++ linux/include/linux/linkage.h @@ -3,6 +3,8 @@ #include +#define notrace __attribute ((no_instrument_function)) + #ifdef __cplusplus #define CPP_ASMLINKAGE extern "C" #else Index: linux/include/linux/prctl.h =================================================================== --- linux.orig/include/linux/prctl.h +++ linux/include/linux/prctl.h @@ -3,6 +3,7 @@ /* Values to pass as first argument to prctl() */ +#define PR_SET_TRACING 0 /* Second arg is tracing on/off */ #define PR_SET_PDEATHSIG 1 /* Second arg is a signal */ #define PR_GET_PDEATHSIG 2 /* Second arg is a ptr to return the signal */ Index: linux/include/linux/preempt.h =================================================================== --- linux.orig/include/linux/preempt.h +++ linux/include/linux/preempt.h @@ -10,12 +10,26 @@ #include #include -#ifdef CONFIG_DEBUG_PREEMPT - extern void fastcall add_preempt_count(int val); - extern void fastcall sub_preempt_count(int val); +#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_CRITICAL_TIMING) + extern void notrace add_preempt_count(unsigned int val); + extern void notrace sub_preempt_count(unsigned int val); + extern void notrace mask_preempt_count(unsigned int mask); + extern void notrace unmask_preempt_count(unsigned int mask); #else # define add_preempt_count(val) do { preempt_count() += (val); } while (0) # define sub_preempt_count(val) do { preempt_count() -= (val); } while (0) +# define mask_preempt_count(mask) \ + do { preempt_count() |= (mask); } while (0) +# define unmask_preempt_count(mask) \ + do { preempt_count() &= ~(mask); } while (0) +#endif + +#ifdef CONFIG_CRITICAL_TIMING + extern void touch_critical_timing(void); + extern void stop_critical_timing(void); +#else +# define touch_critical_timing() do { } while (0) +# define stop_critical_timing() do { } while (0) #endif #define inc_preempt_count() add_preempt_count(1) Index: linux/include/linux/sched.h =================================================================== --- linux.orig/include/linux/sched.h +++ linux/include/linux/sched.h @@ -241,6 +241,7 @@ static inline void show_state(void) } extern void show_regs(struct pt_regs *); +extern void irq_show_regs_callback(int cpu, struct pt_regs *regs); /* * TASK is a pointer to the task whose backtrace we want to see (or NULL for current @@ -279,6 +280,107 @@ static inline void touch_all_softlockup_ } #endif +#if defined(CONFIG_PREEMPT_TRACE) || defined(CONFIG_EVENT_TRACE) + extern void print_traces(struct task_struct *task); +#else +# define print_traces(task) do { } while (0) +#endif + +#ifdef CONFIG_FRAME_POINTER +# ifndef CONFIG_ARM +# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) +# define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1)) +# define CALLER_ADDR2 ((unsigned long)__builtin_return_address(2)) +# define CALLER_ADDR3 ((unsigned long)__builtin_return_address(3)) +# define CALLER_ADDR4 ((unsigned long)__builtin_return_address(4)) +# define CALLER_ADDR5 ((unsigned long)__builtin_return_address(5)) +# else + extern unsigned long arm_return_addr(int level); +# define CALLER_ADDR0 arm_return_addr(0) +# define CALLER_ADDR1 arm_return_addr(1) +# define CALLER_ADDR2 arm_return_addr(2) +# define CALLER_ADDR3 arm_return_addr(3) +# define CALLER_ADDR4 arm_return_addr(4) +# define CALLER_ADDR5 arm_return_addr(5) +#endif +#else +# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) +# define CALLER_ADDR1 0UL +# define CALLER_ADDR2 0UL +# define CALLER_ADDR3 0UL +# define CALLER_ADDR4 0UL +# define CALLER_ADDR5 0UL +#endif + +#ifdef CONFIG_MCOUNT + extern void notrace mcount(void); +#else +# define mcount() do { } while (0) +#endif + +#ifdef CONFIG_EVENT_TRACE + extern int mcount_enabled, trace_enabled, trace_user_triggered, + trace_user_trigger_irq, trace_freerunning, trace_verbose, + trace_print_on_crash, trace_all_cpus, print_functions, + syscall_tracing, stackframe_tracing, trace_use_raw_cycles, + trace_all_runnable; + extern void notrace trace_special(unsigned long v1, unsigned long v2, unsigned long v3); + extern void notrace trace_special_pid(int pid, unsigned long v1, unsigned long v2); + extern void notrace trace_special_u64(unsigned long long v1, unsigned long v2); + extern void notrace trace_special_sym(void); + extern void stop_trace(void); +# define start_trace() do { trace_enabled = 1; } while (0) + extern void print_last_trace(void); + extern void nmi_trace(unsigned long eip, unsigned long parent_eip, + unsigned long flags); + extern long user_trace_start(void); + extern long user_trace_stop(void); + extern void trace_cmdline(void); + extern void init_tracer(void); + extern void trace_resume(void); +#else +# define mcount_enabled 0 +# define trace_enabled 0 +# define syscall_tracing 0 +# define stackframe_tracing 0 +# define trace_user_triggered 0 +# define trace_freerunning 0 +# define trace_all_cpus 0 +# define trace_verbose 0 +# define trace_special(v1,v2,v3) do { } while (0) +# define trace_special_pid(pid,v1,v2) do { } while (0) +# define trace_special_u64(v1,v2) do { } while (0) +# define trace_special_sym() do { } while (0) +# define stop_trace() do { } while (0) +# define start_trace() do { } while (0) +# define print_last_trace() do { } while (0) +# define nmi_trace(eip, parent_eip, flags) do { } while (0) +# define user_trace_start() do { } while (0) +# define user_trace_stop() do { } while (0) +# define trace_cmdline() do { } while (0) +# define init_tracer() do { } while (0) +# define trace_resume() do { } while (0) +#endif + +#ifdef CONFIG_WAKEUP_TIMING + extern int wakeup_timing; + extern void __trace_start_sched_wakeup(struct task_struct *p); + extern void trace_stop_sched_switched(struct task_struct *p); + extern void trace_change_sched_cpu(struct task_struct *p, int new_cpu); +#else +# define wakeup_timing 0 +# define __trace_start_sched_wakeup(p) do { } while (0) +# define trace_stop_sched_switched(p) do { } while (0) +# define trace_change_sched_cpu(p, cpu) do { } while (0) +#endif + +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING + extern void notrace time_hardirqs_on(unsigned long a0, unsigned long a1); + extern void notrace time_hardirqs_off(unsigned long a0, unsigned long a1); +#else +# define time_hardirqs_on(a0, a1) do { } while (0) +# define time_hardirqs_off(a0, a1) do { } while (0) +#endif /* Attach to any functions which should be ignored in wchan output. */ #define __sched __attribute__((__section__(".sched.text"))) @@ -1116,6 +1218,13 @@ struct task_struct { unsigned int lockdep_recursion; #endif +#define MAX_PREEMPT_TRACE 16 + +#ifdef CONFIG_PREEMPT_TRACE + unsigned long preempt_trace_eip[MAX_PREEMPT_TRACE]; + unsigned long preempt_trace_parent_eip[MAX_PREEMPT_TRACE]; +#endif + /* journalling filesystem info */ void *journal_info; Index: linux/init/main.c =================================================================== --- linux.orig/init/main.c +++ linux/init/main.c @@ -587,6 +587,8 @@ asmlinkage void __init start_kernel(void if (panic_later) panic(panic_later, panic_param); + init_tracer(); + lockdep_info(); /* Index: linux/kernel/Makefile =================================================================== --- linux.orig/kernel/Makefile +++ linux/kernel/Makefile @@ -43,6 +43,11 @@ obj-$(CONFIG_CPUSETS) += cpuset.o obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o obj-$(CONFIG_IKCONFIG) += configs.o obj-$(CONFIG_STOP_MACHINE) += stop_machine.o +obj-$(CONFIG_DEBUG_PREEMPT) += latency_trace.o +obj-$(CONFIG_WAKEUP_TIMING) += latency_trace.o +obj-$(CONFIG_EVENT_TRACE) += latency_trace.o +obj-$(CONFIG_CRITICAL_TIMING) += latency_trace.o +obj-$(CONFIG_LATENCY_HIST) += latency_hist.o obj-$(CONFIG_AUDIT) += audit.o auditfilter.o obj-$(CONFIG_AUDITSYSCALL) += auditsc.o obj-$(CONFIG_AUDIT_TREE) += audit_tree.o Index: linux/kernel/exit.c =================================================================== --- linux.orig/kernel/exit.c +++ linux/kernel/exit.c @@ -1043,6 +1043,7 @@ fastcall NORET_TYPE void do_exit(long co __free_pipe_info(tsk->splice_pipe); preempt_disable(); + trace_special_pid(tsk->pid, 0, 0); /* causes final put_task_struct in finish_task_switch(). */ tsk->state = TASK_DEAD; Index: linux/kernel/fork.c =================================================================== --- linux.orig/kernel/fork.c +++ linux/kernel/fork.c @@ -1010,7 +1010,7 @@ static struct task_struct *copy_process( rt_mutex_init_task(p); -#ifdef CONFIG_TRACE_IRQFLAGS +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_LOCKDEP) DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif Index: linux/kernel/latency_hist.c =================================================================== --- /dev/null +++ linux/kernel/latency_hist.c @@ -0,0 +1,267 @@ +/* + * kernel/latency_hist.c + * + * Add support for histograms of preemption-off latency and + * interrupt-off latency and wakeup latency, it depends on + * Real-Time Preemption Support. + * + * Copyright (C) 2005 MontaVista Software, Inc. + * Yi Yang + * + */ +#include +#include +#include +#include +#include +#include +#include + +typedef struct hist_data_struct { + atomic_t hist_mode; /* 0 log, 1 don't log */ + unsigned long min_lat; + unsigned long avg_lat; + unsigned long max_lat; + unsigned long long beyond_hist_bound_samples; + unsigned long long accumulate_lat; + unsigned long long total_samples; + unsigned long long hist_array[MAX_ENTRY_NUM]; +} hist_data_t; + +static struct proc_dir_entry * latency_hist_root = NULL; +static char * latency_hist_proc_dir_root = "latency_hist"; + +static char * percpu_proc_name = "CPU"; + +#ifdef CONFIG_INTERRUPT_OFF_HIST +static DEFINE_PER_CPU(hist_data_t, interrupt_off_hist); +static char * interrupt_off_hist_proc_dir = "interrupt_off_latency"; +#endif + +#ifdef CONFIG_PREEMPT_OFF_HIST +static DEFINE_PER_CPU(hist_data_t, preempt_off_hist); +static char * preempt_off_hist_proc_dir = "preempt_off_latency"; +#endif + +#ifdef CONFIG_WAKEUP_LATENCY_HIST +static DEFINE_PER_CPU(hist_data_t, wakeup_latency_hist); +static char * wakeup_latency_hist_proc_dir = "wakeup_latency"; +#endif + +static struct proc_dir_entry *entry[LATENCY_TYPE_NUM][NR_CPUS]; + +static inline u64 u64_div(u64 x, u64 y) +{ + do_div(x, y); + return x; +} + +void latency_hist(int latency_type, int cpu, unsigned long latency) +{ + hist_data_t * my_hist; + + if ((cpu < 0) || (cpu >= NR_CPUS) || (latency_type < INTERRUPT_LATENCY) + || (latency_type > WAKEUP_LATENCY) || (latency < 0)) + return; + + switch(latency_type) { +#ifdef CONFIG_INTERRUPT_OFF_HIST + case INTERRUPT_LATENCY: + my_hist = (hist_data_t *)&per_cpu(interrupt_off_hist, cpu); + break; +#endif + +#ifdef CONFIG_PREEMPT_OFF_HIST + case PREEMPT_LATENCY: + my_hist = (hist_data_t *)&per_cpu(preempt_off_hist, cpu); + break; +#endif + +#ifdef CONFIG_WAKEUP_LATENCY_HIST + case WAKEUP_LATENCY: + my_hist = (hist_data_t *)&per_cpu(wakeup_latency_hist, cpu); + break; +#endif + default: + return; + } + + if (atomic_read(&my_hist->hist_mode) == 0) + return; + + if (latency >= MAX_ENTRY_NUM) + my_hist->beyond_hist_bound_samples++; + else + my_hist->hist_array[latency]++; + + if (latency < my_hist->min_lat) + my_hist->min_lat = latency; + else if (latency > my_hist->max_lat) + my_hist->max_lat = latency; + + my_hist->total_samples++; + my_hist->accumulate_lat += latency; + my_hist->avg_lat = (unsigned long) u64_div(my_hist->accumulate_lat, + my_hist->total_samples); + return; +} + +static void *l_start(struct seq_file *m, loff_t * pos) +{ + loff_t *index_ptr = kmalloc(sizeof(loff_t), GFP_KERNEL); + loff_t index = *pos; + hist_data_t *my_hist = (hist_data_t *) m->private; + + if (!index_ptr) + return NULL; + + if (index == 0) { + atomic_dec(&my_hist->hist_mode); + seq_printf(m, "#Minimum latency: %lu microseconds.\n" + "#Average latency: %lu microseconds.\n" + "#Maximum latency: %lu microseconds.\n" + "#Total samples: %llu\n" + "#There are %llu samples greater or equal than %d microseconds\n" + "#usecs\t%16s\n" + , my_hist->min_lat + , my_hist->avg_lat + , my_hist->max_lat + , my_hist->total_samples + , my_hist->beyond_hist_bound_samples + , MAX_ENTRY_NUM, "samples"); + } + if (index >= MAX_ENTRY_NUM) + return NULL; + + *index_ptr = index; + return index_ptr; +} + +static void *l_next(struct seq_file *m, void *p, loff_t * pos) +{ + loff_t *index_ptr = p; + hist_data_t *my_hist = (hist_data_t *) m->private; + + if (++*pos >= MAX_ENTRY_NUM) { + atomic_inc(&my_hist->hist_mode); + return NULL; + } + *index_ptr = *pos; + return index_ptr; +} + +static void l_stop(struct seq_file *m, void *p) +{ + kfree(p); +} + +static int l_show(struct seq_file *m, void *p) +{ + int index = *(loff_t *) p; + hist_data_t *my_hist = (hist_data_t *) m->private; + + seq_printf(m, "%5d\t%16llu\n", index, my_hist->hist_array[index]); + return 0; +} + +static struct seq_operations latency_hist_seq_op = { + .start = l_start, + .next = l_next, + .stop = l_stop, + .show = l_show +}; + +static int latency_hist_seq_open(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *entry_ptr = NULL; + int ret, i, j, break_flags = 0; + struct seq_file *seq; + + entry_ptr = PDE(file->f_dentry->d_inode); + for (i = 0; i < LATENCY_TYPE_NUM; i++) { + for (j = 0; j < NR_CPUS; j++) { + if (entry[i][j] == NULL) + continue; + if (entry_ptr->low_ino == entry[i][j]->low_ino) { + break_flags = 1; + break; + } + } + if (break_flags == 1) + break; + } + ret = seq_open(file, &latency_hist_seq_op); + if (break_flags == 1) { + seq = (struct seq_file *)file->private_data; + seq->private = entry[i][j]->data; + } + return ret; +} + +static struct file_operations latency_hist_seq_fops = { + .open = latency_hist_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static __init int latency_hist_init(void) +{ + struct proc_dir_entry *tmp_parent_proc_dir; + int i = 0, len = 0; + hist_data_t *my_hist; + char procname[64]; + + latency_hist_root = proc_mkdir(latency_hist_proc_dir_root, NULL); + + +#ifdef CONFIG_INTERRUPT_OFF_HIST + tmp_parent_proc_dir = proc_mkdir(interrupt_off_hist_proc_dir, latency_hist_root); + for (i = 0; i < NR_CPUS; i++) { + len = sprintf(procname, "%s%d", percpu_proc_name, i); + procname[len] = '\0'; + entry[INTERRUPT_LATENCY][i] = + create_proc_entry(procname, 0, tmp_parent_proc_dir); + entry[INTERRUPT_LATENCY][i]->data = (void *)&per_cpu(interrupt_off_hist, i); + entry[INTERRUPT_LATENCY][i]->proc_fops = &latency_hist_seq_fops; + my_hist = (hist_data_t *) entry[INTERRUPT_LATENCY][i]->data; + atomic_set(&my_hist->hist_mode,1); + my_hist->min_lat = 0xFFFFFFFFUL; + } +#endif + +#ifdef CONFIG_PREEMPT_OFF_HIST + tmp_parent_proc_dir = proc_mkdir(preempt_off_hist_proc_dir, latency_hist_root); + for (i = 0; i < NR_CPUS; i++) { + len = sprintf(procname, "%s%d", percpu_proc_name, i); + procname[len] = '\0'; + entry[PREEMPT_LATENCY][i] = + create_proc_entry(procname, 0, tmp_parent_proc_dir); + entry[PREEMPT_LATENCY][i]->data = (void *)&per_cpu(preempt_off_hist, i); + entry[PREEMPT_LATENCY][i]->proc_fops = &latency_hist_seq_fops; + my_hist = (hist_data_t *) entry[PREEMPT_LATENCY][i]->data; + atomic_set(&my_hist->hist_mode,1); + my_hist->min_lat = 0xFFFFFFFFUL; + } +#endif + +#ifdef CONFIG_WAKEUP_LATENCY_HIST + tmp_parent_proc_dir = proc_mkdir(wakeup_latency_hist_proc_dir, latency_hist_root); + for (i = 0; i < NR_CPUS; i++) { + len = sprintf(procname, "%s%d", percpu_proc_name, i); + procname[len] = '\0'; + entry[WAKEUP_LATENCY][i] = + create_proc_entry(procname, 0, tmp_parent_proc_dir); + entry[WAKEUP_LATENCY][i]->data = (void *)&per_cpu(wakeup_latency_hist, i); + entry[WAKEUP_LATENCY][i]->proc_fops = &latency_hist_seq_fops; + my_hist = (hist_data_t *) entry[WAKEUP_LATENCY][i]->data; + atomic_set(&my_hist->hist_mode,1); + my_hist->min_lat = 0xFFFFFFFFUL; + } +#endif + return 0; + +} + +__initcall(latency_hist_init); + Index: linux/kernel/latency_trace.c =================================================================== --- /dev/null +++ linux/kernel/latency_trace.c @@ -0,0 +1,2799 @@ +/* + * kernel/latency_trace.c + * + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 William Lee Irwin III + */ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef DEFINE_RAW_SPINLOCK +# define DEFINE_RAW_SPINLOCK DEFINE_SPINLOCK +#endif + +#ifndef RAW_SPIN_LOCK_UNLOCKED +# define RAW_SPIN_LOCK_UNLOCKED SPIN_LOCK_UNLOCKED +#endif + +int trace_use_raw_cycles = 0; + +#define __raw_spinlock_t raw_spinlock_t +#define need_resched_delayed() 0 + +#ifdef CONFIG_EVENT_TRACE +/* + * Convert raw cycles to usecs. + * Note: this is not the 'clocksource cycles' value, it's the raw + * cycle counter cycles. We use GTOD to timestamp latency start/end + * points, but the trace entries inbetween are timestamped with + * get_cycles(). + */ +static unsigned long notrace cycles_to_us(cycle_t delta) +{ + if (!trace_use_raw_cycles) + return cycles_to_usecs(delta); +#ifdef CONFIG_X86 + do_div(delta, cpu_khz/1000+1); +#elif defined(CONFIG_PPC) + delta = mulhwu(tb_to_us, delta); +#elif defined(CONFIG_ARM) + delta = mach_cycles_to_usecs(delta); +#else + #error Implement cycles_to_usecs. +#endif + + return (unsigned long) delta; +} +#endif + +#ifndef irqs_off +# define irqs_off() 0 +#endif + +#ifndef DEBUG_WARN_ON +static inline int DEBUG_WARN_ON(int cond) +{ + WARN_ON(cond); + return 0; +} +#endif + +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING +# ifdef CONFIG_CRITICAL_PREEMPT_TIMING +# define irqs_off_preempt_count() preempt_count() +# else +# define irqs_off_preempt_count() 0 +# endif +#endif + +#ifdef CONFIG_WAKEUP_TIMING +struct sch_struct { + __raw_spinlock_t trace_lock; + struct task_struct *task; + int cpu; + struct cpu_trace *tr; +} ____cacheline_aligned_in_smp; + +static __cacheline_aligned_in_smp struct sch_struct sch = + { trace_lock: __RAW_SPIN_LOCK_UNLOCKED }; + +int wakeup_timing = 0; +#endif + +/* + * Track maximum latencies and save the trace: + */ + +/* + * trace_stop_sched_switched must not be called with runqueue locks held! + */ +static __cacheline_aligned_in_smp DECLARE_MUTEX(max_mutex); + +/* + * Sequence count - we record it when starting a measurement and + * skip the latency if the sequence has changed - some other section + * did a maximum and could disturb our measurement with serial console + * printouts, etc. Truly coinciding maximum latencies should be rare + * and what happens together happens separately as well, so this doesnt + * decrease the validity of the maximum found: + */ +static __cacheline_aligned_in_smp unsigned long max_sequence; + +enum trace_type +{ + __TRACE_FIRST_TYPE = 0, + + TRACE_FN, + TRACE_SPECIAL, + TRACE_SPECIAL_PID, + TRACE_SPECIAL_U64, + TRACE_SPECIAL_SYM, + TRACE_CMDLINE, + TRACE_SYSCALL, + TRACE_SYSRET, + + __TRACE_LAST_TYPE +}; + +enum trace_flag_type +{ + TRACE_FLAG_IRQS_OFF = 0x01, + TRACE_FLAG_NEED_RESCHED = 0x02, + TRACE_FLAG_NEED_RESCHED_DELAYED = 0x04, + TRACE_FLAG_HARDIRQ = 0x08, + TRACE_FLAG_SOFTIRQ = 0x10, + TRACE_FLAG_IRQS_HARD_OFF = 0x20, +}; + +/* + * Maximum preemption latency measured. Initialize to maximum, + * we clear it after bootup. + */ +#ifdef CONFIG_LATENCY_HIST +unsigned long preempt_max_latency = (cycle_t)0UL; +#else +unsigned long preempt_max_latency = (cycle_t)ULONG_MAX; +#endif + +unsigned long preempt_thresh; + +/* + * Should this new latency be reported/recorded? + */ +static int report_latency(cycle_t delta) +{ + if (latency_hist_flag && !trace_user_triggered) + return 1; + + if (preempt_thresh) { + if (delta < preempt_thresh) + return 0; + } else { + if (delta <= preempt_max_latency) + return 0; + } + return 1; +} + +#ifdef CONFIG_EVENT_TRACE + +/* + * Number of per-CPU trace entries: + */ +#define MAX_TRACE (65536UL*32UL) + +#define CMDLINE_BYTES 16 + +/* + * 32 bytes on 32-bit platforms: + */ +struct trace_entry { + char type; + char cpu; + char flags; + char preempt_count; // assumes PREEMPT_MASK is 8 bits or less + int pid; + cycle_t timestamp; + union { + struct { + unsigned long eip; + unsigned long parent_eip; + } fn; + struct { + unsigned long eip; + unsigned long v1, v2, v3; + } special; + struct { + unsigned char str[CMDLINE_BYTES]; + } cmdline; + struct { + unsigned long nr; // highest bit: compat call + unsigned long p1, p2, p3; + } syscall; + struct { + unsigned long ret; + } sysret; + struct { + unsigned long __pad3[4]; + } pad; + } u; +} __attribute__((packed)); + +#endif + +struct cpu_trace { + atomic_t disabled; + unsigned long trace_idx; + cycle_t preempt_timestamp; + unsigned long critical_start, critical_end; + unsigned long critical_sequence; + atomic_t underrun; + atomic_t overrun; + cycle_t cycles; + cycle_t last_cycles; + int early_warning; + int latency_type; + int cpu; + +#ifdef CONFIG_EVENT_TRACE + struct trace_entry *trace; + char comm[CMDLINE_BYTES]; + pid_t pid; + unsigned long uid; + unsigned long nice; + unsigned long policy; + unsigned long rt_priority; + unsigned long saved_latency; +#endif +#ifdef CONFIG_DEBUG_STACKOVERFLOW + unsigned long stack_check; +#endif +} ____cacheline_aligned_in_smp; + +static struct cpu_trace cpu_traces[NR_CPUS] ____cacheline_aligned_in_smp = +{ [0 ... NR_CPUS-1] = { +#ifdef CONFIG_DEBUG_STACKOVERFLOW + .stack_check = 1 +#endif + } }; + +static inline notrace cycle_t __now(int monotonic) +{ + cycles_t now; + + if (trace_use_raw_cycles && !monotonic) + now = get_cycles(); + else + now = get_monotonic_cycles(); + + return now; +} + +static notrace cycle_t now(struct cpu_trace *tr, int monotonic) +{ + cycles_t now, delta, last = tr->last_cycles; + + now = __now(monotonic); + + /* + * Protect against time warps: + */ + if (unlikely(now < last || !last)) + delta = 1; + else + delta = now - last; + + tr->last_cycles = now; + tr->cycles += delta; + + return tr->cycles; +} + +/* + * Resume callback - ignore any time spent resumed: + * (the clocksource readout might be unreliable anyway) + */ +void notrace trace_resume(void) +{ + struct cpu_trace *tr = cpu_traces + raw_smp_processor_id(); + + tr->last_cycles = 0; + mcount(); +} + +#ifdef CONFIG_EVENT_TRACE + +int trace_enabled = 1; +int syscall_tracing = 0; +int stackframe_tracing = 0; +int mcount_enabled = 0; +int trace_freerunning = 0; +int trace_print_on_crash = 0; +int trace_verbose = 0; +int trace_all_cpus = 1; +int print_functions = 0; +int trace_all_runnable = 0; + +/* + * user-triggered via gettimeofday(0,1)/gettimeofday(0,0) + */ +int trace_user_triggered = 0; +int trace_user_trigger_irq = -1; + +struct saved_trace_struct { + int cpu; + cycle_t first_timestamp, last_timestamp; + struct cpu_trace traces[NR_CPUS]; +} ____cacheline_aligned_in_smp; + +/* + * The current worst-case trace: + */ +static struct saved_trace_struct max_tr; + +/* + * /proc/latency_trace atomicity: + */ +static DECLARE_MUTEX(out_mutex); + +static struct saved_trace_struct out_tr; + +static void notrace printk_name(unsigned long eip) +{ + char namebuf[KSYM_NAME_LEN+1]; + unsigned long size, offset; + const char *sym_name; + char *modname; + + sym_name = kallsyms_lookup(eip, &size, &offset, &modname, namebuf); + if (sym_name) + printk("%s+%#lx/%#lx", sym_name, offset, size); + else + printk("<%08lx>", eip); +} + +#ifdef CONFIG_DEBUG_STACKOVERFLOW + +#ifndef STACK_WARN +# define STACK_WARN (THREAD_SIZE/8) +#endif + +#define MIN_STACK_NEEDED (sizeof(struct thread_info) + STACK_WARN) +#define MAX_STACK (THREAD_SIZE - sizeof(struct thread_info)) + +#if (defined(__i386__) || defined(__x86_64__)) && defined(CONFIG_FRAME_POINTER) +# define PRINT_EXACT_STACKFRAME +#endif + +#ifdef PRINT_EXACT_STACKFRAME +static unsigned long *worst_stack_bp; +#endif +static DEFINE_RAW_SPINLOCK(worst_stack_lock); +unsigned long worst_stack_left = THREAD_SIZE; +static unsigned long worst_stack_printed = THREAD_SIZE; +static char worst_stack_comm[TASK_COMM_LEN+1]; +static int worst_stack_pid; +static unsigned long worst_stack_sp; +static char worst_stack[THREAD_SIZE]; + +static notrace void fill_worst_stack(unsigned long stack_left) +{ + unsigned long flags; + + /* + * On x64, we must not read the PDA during early bootup: + */ +#ifdef CONFIG_X86_64 + if (system_state == SYSTEM_BOOTING) + return; +#endif + spin_lock_irqsave(&worst_stack_lock, flags); + if (likely(stack_left < worst_stack_left)) { + worst_stack_left = stack_left; + memcpy(worst_stack, current_thread_info(), THREAD_SIZE); + worst_stack_sp = (unsigned long)&stack_left; + memcpy(worst_stack_comm, current->comm, TASK_COMM_LEN); + worst_stack_pid = current->pid; +#ifdef PRINT_EXACT_STACKFRAME +# ifdef __i386__ + asm ("mov %%ebp, %0\n" :"=g"(worst_stack_bp)); +# elif defined(__x86_64__) + asm ("mov %%rbp, %0\n" :"=g"(worst_stack_bp)); +# else +# error Poke the author of above asm code lines ! +# endif +#endif + } + spin_unlock_irqrestore(&worst_stack_lock, flags); +} + +#ifdef PRINT_EXACT_STACKFRAME + +/* + * This takes a BP offset to point the BP back into the saved stack, + * the original stack might be long gone (but the stackframe within + * the saved copy still contains references to it). + */ +#define CONVERT_TO_SAVED_STACK(bp) \ + ((void *)worst_stack + ((unsigned long)bp & (THREAD_SIZE-1))) + +static void show_stackframe(void) +{ + unsigned long addr, frame_size, *bp, *prev_bp, sum = 0; + + bp = CONVERT_TO_SAVED_STACK(worst_stack_bp); + + while (bp[0]) { + addr = bp[1]; + if (!kernel_text_address(addr)) + break; + + prev_bp = bp; + bp = CONVERT_TO_SAVED_STACK((unsigned long *)bp[0]); + + frame_size = (bp - prev_bp) * sizeof(long); + + if (frame_size < THREAD_SIZE) { + printk("{ %4ld} ", frame_size); + sum += frame_size; + } else + printk("{=%4ld} ", sum); + + printk("[<%08lx>] ", addr); + printk_name(addr); + printk("\n"); + } +} + +#else + +static inline int valid_stack_ptr(void *p) +{ + return p > (void *)worst_stack && + p < (void *)worst_stack + THREAD_SIZE - 3; +} + +static void show_stackframe(void) +{ + unsigned long prev_frame, addr; + unsigned long *stack; + + prev_frame = (unsigned long)(worst_stack + + (worst_stack_sp & (THREAD_SIZE-1))); + stack = (unsigned long *)prev_frame; + + while (valid_stack_ptr(stack)) { + addr = *stack++; + if (__kernel_text_address(addr)) { + printk("(%4ld) ", (unsigned long)stack - prev_frame); + printk("[<%08lx>] ", addr); + print_symbol("%s\n", addr); + prev_frame = (unsigned long)stack; + } + if ((char *)stack >= worst_stack + THREAD_SIZE) + break; + } +} + +#endif + +static notrace void __print_worst_stack(void) +{ + unsigned long fill_ratio; + printk("----------------------------->\n"); + printk("| new stack fill maximum: %s/%d, %ld bytes (out of %ld bytes).\n", + worst_stack_comm, worst_stack_pid, + MAX_STACK-worst_stack_left, (long)MAX_STACK); + fill_ratio = (MAX_STACK-worst_stack_left)*100/(long)MAX_STACK; + printk("| Stack fill ratio: %02ld%%", fill_ratio); + if (fill_ratio >= 90) + printk(" - BUG: that's quite high, please report this!\n"); + else + printk(" - that's still OK, no need to report this.\n"); + printk("------------|\n"); + + show_stackframe(); + printk("<---------------------------\n\n"); +} + +static notrace void print_worst_stack(void) +{ + unsigned long flags; + + if (irqs_disabled() || preempt_count()) + return; + + spin_lock_irqsave(&worst_stack_lock, flags); + if (worst_stack_printed == worst_stack_left) { + spin_unlock_irqrestore(&worst_stack_lock, flags); + return; + } + worst_stack_printed = worst_stack_left; + spin_unlock_irqrestore(&worst_stack_lock, flags); + + __print_worst_stack(); +} + +static notrace void debug_stackoverflow(struct cpu_trace *tr) +{ + long stack_left; + + if (unlikely(tr->stack_check <= 0)) + return; + atomic_inc(&tr->disabled); + + /* Debugging check for stack overflow: is there less than 1KB free? */ +#ifdef __i386__ + __asm__ __volatile__("and %%esp,%0" : + "=r" (stack_left) : "0" (THREAD_SIZE - 1)); +#elif defined(__x86_64__) + __asm__ __volatile__("and %%rsp,%0" : + "=r" (stack_left) : "0" (THREAD_SIZE - 1)); +#else +# error Poke the author of above asm code lines ! +#endif + if (unlikely(stack_left < MIN_STACK_NEEDED)) { + tr->stack_check = 0; + printk(KERN_ALERT "BUG: stack overflow: only %ld bytes left! [%08lx...(%08lx-%08lx)]\n", + stack_left - sizeof(struct thread_info), + (long)&stack_left, + (long)current_thread_info(), + (long)current_thread_info() + THREAD_SIZE); + fill_worst_stack(stack_left); + __print_worst_stack(); + goto out; + } + if (unlikely(stack_left < worst_stack_left)) { + tr->stack_check--; + fill_worst_stack(stack_left); + print_worst_stack(); + tr->stack_check++; + } else + if (worst_stack_printed != worst_stack_left) { + tr->stack_check--; + print_worst_stack(); + tr->stack_check++; + } +out: + atomic_dec(&tr->disabled); +} + +#endif + +#ifdef CONFIG_EARLY_PRINTK +static void notrace early_printk_name(unsigned long eip) +{ + char namebuf[KSYM_NAME_LEN+1]; + unsigned long size, offset; + const char *sym_name; + char *modname; + + sym_name = kallsyms_lookup(eip, &size, &offset, &modname, namebuf); + if (sym_name) + early_printk("%s <%08lx>", sym_name, eip); + else + early_printk("<%08lx>", eip); +} + +static __raw_spinlock_t early_print_lock = __RAW_SPIN_LOCK_UNLOCKED; + +static void notrace early_print_entry(struct trace_entry *entry) +{ + int hardirq, softirq; + + __raw_spin_lock(&early_print_lock); + early_printk("%-5d ", entry->pid); + + early_printk("%d%c%c", + entry->cpu, + (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : + (entry->flags & TRACE_FLAG_IRQS_HARD_OFF) ? 'D' : '.', + (entry->flags & TRACE_FLAG_NEED_RESCHED_DELAYED) ? 'n' : + ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.')); + + hardirq = entry->flags & TRACE_FLAG_HARDIRQ; + softirq = entry->flags & TRACE_FLAG_SOFTIRQ; + if (hardirq && softirq) + early_printk("H"); + else { + if (hardirq) + early_printk("h"); + else { + if (softirq) + early_printk("s"); + else + early_printk("."); + } + } + + early_printk(":%d: ", entry->preempt_count); + + if (entry->type == TRACE_FN) { + early_printk_name(entry->u.fn.eip); + early_printk(" <= ("); + early_printk_name(entry->u.fn.parent_eip); + early_printk(")\n"); + } else { + /* special entries: */ + early_printk_name(entry->u.special.eip); + early_printk(": <%08lx> <%08lx> <%08lx>\n", + entry->u.special.v1, + entry->u.special.v2, + entry->u.special.v3); + } + __raw_spin_unlock(&early_print_lock); +} +#else +# define early_print_entry(x) do { } while(0) +#endif + +static void notrace +____trace(int cpu, enum trace_type type, struct cpu_trace *tr, + unsigned long eip, unsigned long parent_eip, + unsigned long v1, unsigned long v2, unsigned long v3, + unsigned long flags) +{ + struct trace_entry *entry; + unsigned long idx, idx_next; + cycle_t timestamp; + u32 pc; + +#ifdef CONFIG_DEBUG_PREEMPT +// WARN_ON(!atomic_read(&tr->disabled)); +#endif + if (!tr->critical_start && !trace_user_triggered && !trace_all_cpus && + !trace_print_on_crash && !print_functions) + goto out; + /* + * Allocate the next index. Make sure an NMI (or interrupt) + * has not taken it away. Potentially redo the timestamp as + * well to make sure the trace timestamps are in chronologic + * order. + */ +again: + idx = tr->trace_idx; + idx_next = idx + 1; + timestamp = now(tr, 0); + + if (unlikely((trace_freerunning || print_functions || atomic_read(&tr->underrun)) && + (idx_next >= MAX_TRACE) && !atomic_read(&tr->overrun))) { + atomic_inc(&tr->underrun); + idx_next = 0; + } + if (unlikely(idx >= MAX_TRACE)) { + atomic_inc(&tr->overrun); + goto out; + } +#ifdef __HAVE_ARCH_CMPXCHG + if (unlikely(cmpxchg(&tr->trace_idx, idx, idx_next) != idx)) { + if (idx_next == 0) + atomic_dec(&tr->underrun); + goto again; + } +#else +# ifdef CONFIG_SMP +# error CMPXCHG missing +# else + /* No worry, we are protected by the atomic_incr(&tr->disabled) + * in __trace further down + */ + tr->trace_idx = idx_next; +# endif +#endif + if (unlikely(idx_next != 0 && atomic_read(&tr->underrun))) + atomic_inc(&tr->underrun); + + pc = preempt_count(); + + if (unlikely(!tr->trace)) + goto out; + entry = tr->trace + idx; + entry->type = type; +#ifdef CONFIG_SMP + entry->cpu = cpu; +#endif + entry->flags = (irqs_off() ? TRACE_FLAG_IRQS_OFF : 0) | + (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_HARD_OFF : 0)| + ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | + ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | + (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) | + (need_resched_delayed() ? TRACE_FLAG_NEED_RESCHED_DELAYED : 0); + entry->preempt_count = pc & 0xff; + entry->pid = current->pid; + entry->timestamp = timestamp; + + switch (type) { + case TRACE_FN: + entry->u.fn.eip = eip; + entry->u.fn.parent_eip = parent_eip; + if (unlikely(print_functions && !in_interrupt())) + early_print_entry(entry); + break; + case TRACE_SPECIAL: + case TRACE_SPECIAL_PID: + case TRACE_SPECIAL_U64: + case TRACE_SPECIAL_SYM: + entry->u.special.eip = eip; + entry->u.special.v1 = v1; + entry->u.special.v2 = v2; + entry->u.special.v3 = v3; + if (unlikely(print_functions && !in_interrupt())) + early_print_entry(entry); + break; + case TRACE_SYSCALL: + entry->u.syscall.nr = eip; + entry->u.syscall.p1 = v1; + entry->u.syscall.p2 = v2; + entry->u.syscall.p3 = v3; + break; + case TRACE_SYSRET: + entry->u.sysret.ret = eip; + break; + case TRACE_CMDLINE: + memcpy(entry->u.cmdline.str, current->comm, CMDLINE_BYTES); + break; + default: + break; + } +out: + ; +} + +static inline void notrace +___trace(enum trace_type type, unsigned long eip, unsigned long parent_eip, + unsigned long v1, unsigned long v2, + unsigned long v3) +{ + struct cpu_trace *tr; + unsigned long flags; + int cpu; + + if (unlikely(trace_enabled <= 0)) + return; + +#if defined(CONFIG_DEBUG_STACKOVERFLOW) && defined(CONFIG_X86) + debug_stackoverflow(cpu_traces + raw_smp_processor_id()); +#endif + + raw_local_irq_save(flags); + cpu = raw_smp_processor_id(); + /* + * Trace on the CPU where the current highest-prio task + * is waiting to become runnable: + */ +#ifdef CONFIG_WAKEUP_TIMING + if (wakeup_timing && !trace_all_cpus && !trace_print_on_crash && + !print_functions) { + if (!sch.tr || cpu != sch.cpu) + goto out; + tr = sch.tr; + } else + tr = cpu_traces + cpu; +#else + tr = cpu_traces + cpu; +#endif + atomic_inc(&tr->disabled); + if (likely(atomic_read(&tr->disabled) == 1)) { +//#define DEBUG_STACK_POISON +#ifdef DEBUG_STACK_POISON + char stack; + + memset(&stack - 128, 0x34, 128); +#endif + ____trace(cpu, type, tr, eip, parent_eip, v1, v2, v3, flags); + } + atomic_dec(&tr->disabled); +#ifdef CONFIG_WAKEUP_TIMING +out: +#endif + raw_local_irq_restore(flags); +} + +/* + * Special, ad-hoc tracepoints: + */ +void notrace trace_special(unsigned long v1, unsigned long v2, unsigned long v3) +{ + ___trace(TRACE_SPECIAL, CALLER_ADDR0, 0, v1, v2, v3); +} + +EXPORT_SYMBOL(trace_special); + +void notrace trace_special_pid(int pid, unsigned long v1, unsigned long v2) +{ + ___trace(TRACE_SPECIAL_PID, CALLER_ADDR0, 0, pid, v1, v2); +} + +EXPORT_SYMBOL(trace_special_pid); + +void notrace trace_special_u64(unsigned long long v1, unsigned long v2) +{ + ___trace(TRACE_SPECIAL_U64, CALLER_ADDR0, 0, + (unsigned long) (v1 >> 32), (unsigned long) (v1 & 0xFFFFFFFF), + v2); +} + +EXPORT_SYMBOL(trace_special_u64); + +void notrace trace_special_sym(void) +{ +#define STACK_ENTRIES 8 + unsigned long entries[STACK_ENTRIES]; + struct stack_trace trace; + + if (!trace_enabled || !stackframe_tracing) + return; + + trace.entries = entries; + trace.skip = 3; + trace.max_entries = STACK_ENTRIES; + trace.nr_entries = 0; + + save_stack_trace(&trace); + /* + * clear out the rest: + */ + while (trace.nr_entries < trace.max_entries) + entries[trace.nr_entries++] = 0; + + ___trace(TRACE_SPECIAL_SYM, entries[0], 0, + entries[1], entries[2], entries[3]); + ___trace(TRACE_SPECIAL_SYM, entries[4], 0, + entries[5], entries[6], entries[7]); +} + +EXPORT_SYMBOL(trace_special_sym); + +/* + * Non-inlined function: + */ +void notrace __trace(unsigned long eip, unsigned long parent_eip) +{ + ___trace(TRACE_FN, eip, parent_eip, 0, 0, 0); +} + +#ifdef CONFIG_MCOUNT + +extern void mcount(void); + +EXPORT_SYMBOL(mcount); + +void notrace __mcount(void) +{ + ___trace(TRACE_FN, CALLER_ADDR1, CALLER_ADDR2, 0, 0, 0); +} + +#endif + +void notrace +sys_call(unsigned long nr, unsigned long p1, unsigned long p2, unsigned long p3) +{ + if (syscall_tracing) + ___trace(TRACE_SYSCALL, nr, 0, p1, p2, p3); +} + +#if defined(CONFIG_COMPAT) && defined(CONFIG_X86) + +void notrace +sys_ia32_call(unsigned long nr, unsigned long p1, unsigned long p2, + unsigned long p3) +{ + if (syscall_tracing) + ___trace(TRACE_SYSCALL, nr | 0x80000000, 0, p1, p2, p3); +} + +#endif + +void notrace sys_ret(unsigned long ret) +{ + if (syscall_tracing) + ___trace(TRACE_SYSRET, ret, 0, 0, 0, 0); +} + +static void notrace print_name(struct seq_file *m, unsigned long eip) +{ + char namebuf[KSYM_NAME_LEN+1]; + unsigned long size, offset; + const char *sym_name; + char *modname; + + /* + * Special trace values: + */ + if (((long)eip < 100000L) && ((long)eip > -100000L)) { + seq_printf(m, "(%5ld)", eip); + return; + } + sym_name = kallsyms_lookup(eip, &size, &offset, &modname, namebuf); + if (sym_name) + seq_puts(m, sym_name); + else + seq_printf(m, "<%08lx>", eip); +} + +static void notrace print_name_offset(struct seq_file *m, unsigned long eip) +{ + char namebuf[KSYM_NAME_LEN+1]; + unsigned long size, offset; + const char *sym_name; + char *modname; + + sym_name = kallsyms_lookup(eip, &size, &offset, &modname, namebuf); + if (sym_name) + seq_printf(m, "%s+%#lx/%#lx <%08lx>", + sym_name, offset, size, eip); + else + seq_printf(m, "<%08lx>", eip); +} + +static unsigned long out_sequence = -1; + +static int pid_to_cmdline_array[PID_MAX_DEFAULT+1]; + +static void notrace _trace_cmdline(int cpu, struct cpu_trace *tr) +{ + unsigned long flags; + + local_save_flags(flags); + ____trace(cpu, TRACE_CMDLINE, tr, 0, 0, 0, 0, 0, flags); +} + +void notrace trace_cmdline(void) +{ + ___trace(TRACE_CMDLINE, 0, 0, 0, 0, 0); +} + +static void construct_pid_to_cmdline(struct cpu_trace *tr) +{ + unsigned int i, j, entries, pid; + + if (tr->critical_sequence == out_sequence) + return; + out_sequence = tr->critical_sequence; + + memset(pid_to_cmdline_array, -1, sizeof(int) * (PID_MAX_DEFAULT + 1)); + + if (!tr->trace) + return; + + entries = min(tr->trace_idx, MAX_TRACE); + + for (i = 0; i < entries; i++) { + struct trace_entry *entry = tr->trace + i; + + if (entry->type != TRACE_CMDLINE) + continue; + pid = entry->pid; + if (pid < PID_MAX_DEFAULT) { + pid_to_cmdline_array[pid] = i; + /* + * Replace space with underline - makes it easier + * to process for tools: + */ + for (j = 0; j < CMDLINE_BYTES; j++) + if (entry->u.cmdline.str[j] == ' ') + entry->u.cmdline.str[j] = '_'; + } + } +} + +char *pid_to_cmdline(unsigned long pid) +{ + struct cpu_trace *tr = out_tr.traces + 0; + char *cmdline = "<...>"; + int idx; + + pid = min(pid, (unsigned long)PID_MAX_DEFAULT); + if (!pid) + return ""; + + if (pid_to_cmdline_array[pid] != -1) { + idx = pid_to_cmdline_array[pid]; + if (tr->trace[idx].type == TRACE_CMDLINE) + cmdline = tr->trace[idx].u.cmdline.str; + } + return cmdline; +} + +static void copy_trace(struct cpu_trace *save, struct cpu_trace *tr, int reorder) +{ + if (!save->trace || !tr->trace) + return; + /* free-running needs reordering */ + if (reorder && atomic_read(&tr->underrun)) { + int i, idx, idx0 = tr->trace_idx; + + for (i = 0; i < MAX_TRACE; i++) { + idx = (idx0 + i) % MAX_TRACE; + save->trace[i] = tr->trace[idx]; + } + save->trace_idx = MAX_TRACE; + } else { + save->trace_idx = tr->trace_idx; + + memcpy(save->trace, tr->trace, + min(save->trace_idx, MAX_TRACE) * + sizeof(struct trace_entry)); + } + save->underrun = tr->underrun; + save->overrun = tr->overrun; +} + + +struct block_idx { + int idx[NR_CPUS]; +}; + +/* + * return the trace entry (position) of the smallest-timestamp + * one (that is still in the valid idx range): + */ +static int min_idx(struct block_idx *bidx) +{ + cycle_t min_stamp = (cycle_t) -1; + struct trace_entry *entry; + int cpu, min_cpu = -1, idx; + + for_each_online_cpu(cpu) { + idx = bidx->idx[cpu]; + if (idx >= min(max_tr.traces[cpu].trace_idx, MAX_TRACE)) + continue; + if (idx >= MAX_TRACE*NR_CPUS) { + printk("huh: idx (%d) > %ld*%d!\n", idx, MAX_TRACE, + NR_CPUS); + WARN_ON(1); + break; + } + entry = max_tr.traces[cpu].trace + bidx->idx[cpu]; + if (entry->timestamp < min_stamp) { + min_cpu = cpu; + min_stamp = entry->timestamp; + } + } + + return min_cpu; +} + +/* + * This code is called to construct an output trace from + * the maximum trace. Having separate traces serves both + * atomicity (a new max might be saved while we are busy + * accessing /proc/latency_trace) and it is also used to + * delay the (expensive) sorting of the output trace by + * timestamps, in the trace_all_cpus case. + */ +static void update_out_trace(void) +{ + struct trace_entry *out_entry, *entry, *tmp; + cycle_t stamp, first_stamp, last_stamp; + struct block_idx bidx = { { 0, }, }; + struct cpu_trace *tmp_max, *tmp_out; + int cpu, sum, entries, underrun_sum, overrun_sum; + + /* + * For out_tr we only have the first array's trace entries + * allocated - and they have are larger on SMP to make room + * for all trace entries from all CPUs. + */ + tmp_out = out_tr.traces + 0; + tmp_max = max_tr.traces + max_tr.cpu; + /* + * Easier to copy this way. Note: the trace buffer is private + * to the output buffer, so preserve it: + */ + copy_trace(tmp_out, tmp_max, 0); + tmp = tmp_out->trace; + *tmp_out = *tmp_max; + tmp_out->trace = tmp; + + out_tr.cpu = max_tr.cpu; + + if (!tmp_out->trace) + return; + + out_entry = tmp_out->trace + 0; + + if (!trace_all_cpus) { + entries = min(tmp_out->trace_idx, MAX_TRACE); + if (!entries) + return; + out_tr.first_timestamp = tmp_out->trace[0].timestamp; + out_tr.last_timestamp = tmp_out->trace[entries-1].timestamp; + return; + } + /* + * Find the range of timestamps that are fully traced in + * all CPU traces. (since CPU traces can cover a variable + * range of time, we have to find the best range.) + */ + first_stamp = 0; + for_each_online_cpu(cpu) { + tmp_max = max_tr.traces + cpu; + stamp = tmp_max->trace[0].timestamp; + if (stamp > first_stamp) + first_stamp = stamp; + } + /* + * Save the timestamp range: + */ + tmp_max = max_tr.traces + max_tr.cpu; + entries = min(tmp_max->trace_idx, MAX_TRACE); + /* + * No saved trace yet? + */ + if (!entries) { + out_tr.traces[0].trace_idx = 0; + return; + } + + last_stamp = tmp_max->trace[entries-1].timestamp; + + if (last_stamp < first_stamp) { + WARN_ON(1); + + for_each_online_cpu(cpu) { + tmp_max = max_tr.traces + cpu; + entries = min(tmp_max->trace_idx, MAX_TRACE); + printk("CPU%d: %016Lx (%016Lx) ... #%d (%016Lx) %016Lx\n", + cpu, + tmp_max->trace[0].timestamp, + tmp_max->trace[1].timestamp, + entries, + tmp_max->trace[entries-2].timestamp, + tmp_max->trace[entries-1].timestamp); + } + tmp_max = max_tr.traces + max_tr.cpu; + entries = min(tmp_max->trace_idx, MAX_TRACE); + + printk("CPU%d entries: %d\n", max_tr.cpu, entries); + printk("first stamp: %016Lx\n", first_stamp); + printk(" last stamp: %016Lx\n", first_stamp); + } + +#if 0 + printk("first_stamp: %Ld [%016Lx]\n", first_stamp, first_stamp); + printk(" last_stamp: %Ld [%016Lx]\n", last_stamp, last_stamp); + printk(" +1 stamp: %Ld [%016Lx]\n", + tmp_max->trace[entries].timestamp, + tmp_max->trace[entries].timestamp); + printk(" +2 stamp: %Ld [%016Lx]\n", + tmp_max->trace[entries+1].timestamp, + tmp_max->trace[entries+1].timestamp); + printk(" delta: %Ld\n", last_stamp-first_stamp); + printk(" entries: %d\n", entries); +#endif + + out_tr.first_timestamp = first_stamp; + out_tr.last_timestamp = last_stamp; + + /* + * Fetch trace entries one by one, in increasing timestamp + * order. Start at first_stamp, stop at last_stamp: + */ + sum = 0; + for (;;) { + cpu = min_idx(&bidx); + if (cpu == -1) + break; + entry = max_tr.traces[cpu].trace + bidx.idx[cpu]; + if (entry->timestamp > last_stamp) + break; + + bidx.idx[cpu]++; + if (entry->timestamp < first_stamp) + continue; + *out_entry = *entry; + out_entry++; + sum++; + if (sum >= MAX_TRACE*NR_CPUS) { + printk("huh: sum (%d) > %ld*%d!\n", sum, MAX_TRACE, + NR_CPUS); + WARN_ON(1); + break; + } + } + + sum = 0; + underrun_sum = 0; + overrun_sum = 0; + for_each_online_cpu(cpu) { + sum += max_tr.traces[cpu].trace_idx; + underrun_sum += atomic_read(&max_tr.traces[cpu].underrun); + overrun_sum += atomic_read(&max_tr.traces[cpu].overrun); + } + tmp_out->trace_idx = sum; + atomic_set(&tmp_out->underrun, underrun_sum); + atomic_set(&tmp_out->overrun, overrun_sum); +} + +static void notrace print_help_header(struct seq_file *m) +{ + seq_puts(m, " _------=> CPU# \n"); + seq_puts(m, " / _-----=> irqs-off \n"); + seq_puts(m, " | / _----=> need-resched \n"); + seq_puts(m, " || / _---=> hardirq/softirq \n"); + seq_puts(m, " ||| / _--=> preempt-depth \n"); + seq_puts(m, " |||| / \n"); + seq_puts(m, " ||||| delay \n"); + seq_puts(m, " cmd pid ||||| time | caller \n"); + seq_puts(m, " \\ / ||||| \\ | / \n"); +} + +static void * notrace l_start(struct seq_file *m, loff_t *pos) +{ + loff_t n = *pos; + unsigned long entries; + struct cpu_trace *tr = out_tr.traces + 0; + + down(&out_mutex); + /* + * if the file is being read newly, update the output trace: + */ + if (!n) { + // TODO: use the sequence counter here to optimize + down(&max_mutex); + update_out_trace(); + up(&max_mutex); +#if 0 + if (!tr->trace_idx) { + up(&out_mutex); + return NULL; + } +#endif + construct_pid_to_cmdline(tr); + } + entries = min(tr->trace_idx, MAX_TRACE); + + if (!n) { + seq_printf(m, "preemption latency trace v1.1.5 on %s\n", + UTS_RELEASE); + seq_puts(m, "--------------------------------------------------------------------\n"); + seq_printf(m, " latency: %lu us, #%lu/%lu, CPU#%d | (M:%s VP:%d, KP:%d, SP:%d HP:%d", + cycles_to_usecs(tr->saved_latency), + entries, + (entries + atomic_read(&tr->underrun) + + atomic_read(&tr->overrun)), + out_tr.cpu, +#if defined(CONFIG_PREEMPT_NONE) + "server", +#elif defined(CONFIG_PREEMPT_VOLUNTARY) + "desktop", +#elif defined(CONFIG_PREEMPT_DESKTOP) + "preempt", +#else + "rt", +#endif + 0, 0, +#ifdef CONFIG_PREEMPT_SOFTIRQS + softirq_preemption +#else + 0 +#endif + , +#ifdef CONFIG_PREEMPT_HARDIRQS + hardirq_preemption +#else + 0 +#endif + ); +#ifdef CONFIG_SMP + seq_printf(m, " #P:%d)\n", num_online_cpus()); +#else + seq_puts(m, ")\n"); +#endif + seq_puts(m, " -----------------\n"); + seq_printf(m, " | task: %.16s %d (uid:%ld nice:%ld policy:%ld rt_prio:%ld)\n", + tr->comm, tr->pid, tr->uid, tr->nice, + tr->policy, tr->rt_priority); + seq_puts(m, " -----------------\n"); + if (trace_user_triggered) { + seq_puts(m, " => started at: "); + print_name_offset(m, tr->critical_start); + seq_puts(m, "\n => ended at: "); + print_name_offset(m, tr->critical_end); + seq_puts(m, "\n"); + } + seq_puts(m, "\n"); + + if (!trace_verbose) + print_help_header(m); + } + if (n >= entries || !tr->trace) + return NULL; + + return tr->trace + n; +} + +static void * notrace l_next(struct seq_file *m, void *p, loff_t *pos) +{ + struct cpu_trace *tr = out_tr.traces; + unsigned long entries = min(tr->trace_idx, MAX_TRACE); + + WARN_ON(!tr->trace); + + if (++*pos >= entries) { + if (*pos == entries) + seq_puts(m, "\n\nvim:ft=help\n"); + return NULL; + } + return tr->trace + *pos; +} + +static void notrace l_stop(struct seq_file *m, void *p) +{ + up(&out_mutex); +} + +static void print_timestamp(struct seq_file *m, unsigned long abs_usecs, + unsigned long rel_usecs) +{ + seq_printf(m, " %4ldus", abs_usecs); + if (rel_usecs > 100) + seq_puts(m, "!: "); + else if (rel_usecs > 1) + seq_puts(m, "+: "); + else + seq_puts(m, " : "); +} + +static void +print_timestamp_short(struct seq_file *m, unsigned long abs_usecs, + unsigned long rel_usecs) +{ + seq_printf(m, " %4ldus", abs_usecs); + if (rel_usecs > 100) + seq_putc(m, '!'); + else if (rel_usecs > 1) + seq_putc(m, '+'); + else + seq_putc(m, ' '); +} + +static void +print_generic(struct seq_file *m, struct trace_entry *entry) +{ + int hardirq, softirq; + + seq_printf(m, "%8.8s %-5d ", pid_to_cmdline(entry->pid), entry->pid); + seq_printf(m, "%d", entry->cpu); + seq_printf(m, "%c%c", + (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : + (entry->flags & TRACE_FLAG_IRQS_HARD_OFF) ? 'D' : '.', + (entry->flags & TRACE_FLAG_NEED_RESCHED_DELAYED) ? 'n' : + ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.')); + + hardirq = entry->flags & TRACE_FLAG_HARDIRQ; + softirq = entry->flags & TRACE_FLAG_SOFTIRQ; + if (hardirq && softirq) + seq_putc(m, 'H'); + else { + if (hardirq) + seq_putc(m, 'h'); + else { + if (softirq) + seq_putc(m, 's'); + else + seq_putc(m, '.'); + } + } + + if (entry->preempt_count) + seq_printf(m, "%x", entry->preempt_count); + else + seq_puts(m, "."); +} + + +static int notrace l_show_fn(struct seq_file *m, unsigned long trace_idx, + struct trace_entry *entry, struct trace_entry *entry0, + struct trace_entry *next_entry) +{ + unsigned long abs_usecs, rel_usecs; + + abs_usecs = cycles_to_us(entry->timestamp - entry0->timestamp); + rel_usecs = cycles_to_us(next_entry->timestamp - entry->timestamp); + + if (trace_verbose) { + seq_printf(m, "%16s %5d %d %d %08x %08lx [%016Lx] %ld.%03ldms (+%ld.%03ldms): ", + pid_to_cmdline(entry->pid), + entry->pid, entry->cpu, entry->flags, + entry->preempt_count, trace_idx, + entry->timestamp, abs_usecs/1000, + abs_usecs % 1000, rel_usecs/1000, rel_usecs % 1000); + print_name_offset(m, entry->u.fn.eip); + seq_puts(m, " ("); + print_name_offset(m, entry->u.fn.parent_eip); + seq_puts(m, ")\n"); + } else { + print_generic(m, entry); + print_timestamp(m, abs_usecs, rel_usecs); + print_name(m, entry->u.fn.eip); + seq_puts(m, " ("); + print_name(m, entry->u.fn.parent_eip); + seq_puts(m, ")\n"); + } + return 0; +} + +static int notrace l_show_special(struct seq_file *m, unsigned long trace_idx, + struct trace_entry *entry, struct trace_entry *entry0, + struct trace_entry *next_entry, int mode64) +{ + unsigned long abs_usecs, rel_usecs; + + abs_usecs = cycles_to_us(entry->timestamp - entry0->timestamp); + rel_usecs = cycles_to_us(next_entry->timestamp - entry->timestamp); + + print_generic(m, entry); + print_timestamp(m, abs_usecs, rel_usecs); + if (trace_verbose) + print_name_offset(m, entry->u.special.eip); + else + print_name(m, entry->u.special.eip); + + if (!mode64) { + /* + * For convenience, print small numbers in decimal: + */ + if (abs((int)entry->u.special.v1) < 100000) + seq_printf(m, " (%5ld ", entry->u.special.v1); + else + seq_printf(m, " (%lx ", entry->u.special.v1); + if (abs((int)entry->u.special.v2) < 100000) + seq_printf(m, "%5ld ", entry->u.special.v2); + else + seq_printf(m, "%lx ", entry->u.special.v2); + if (abs((int)entry->u.special.v3) < 100000) + seq_printf(m, "%5ld)\n", entry->u.special.v3); + else + seq_printf(m, "%lx)\n", entry->u.special.v3); + } else { + seq_printf(m, " (%13Ld %ld)\n", + ((u64)entry->u.special.v1 << 32) + + (u64)entry->u.special.v2, entry->u.special.v3); + } + return 0; +} + +static int notrace +l_show_special_pid(struct seq_file *m, unsigned long trace_idx, + struct trace_entry *entry, struct trace_entry *entry0, + struct trace_entry *next_entry) +{ + unsigned long abs_usecs, rel_usecs; + unsigned int pid; + + pid = entry->u.special.v1; + + abs_usecs = cycles_to_us(entry->timestamp - entry0->timestamp); + rel_usecs = cycles_to_us(next_entry->timestamp - entry->timestamp); + + print_generic(m, entry); + print_timestamp(m, abs_usecs, rel_usecs); + if (trace_verbose) + print_name_offset(m, entry->u.special.eip); + else + print_name(m, entry->u.special.eip); + seq_printf(m, " <%.8s %d> (%ld %ld)\n", + pid_to_cmdline(pid), pid, + entry->u.special.v2, entry->u.special.v3); + + return 0; +} + +static int notrace +l_show_special_sym(struct seq_file *m, unsigned long trace_idx, + struct trace_entry *entry, struct trace_entry *entry0, + struct trace_entry *next_entry, int mode64) +{ + unsigned long abs_usecs, rel_usecs; + + abs_usecs = cycles_to_us(entry->timestamp - entry0->timestamp); + rel_usecs = cycles_to_us(next_entry->timestamp - entry->timestamp); + + print_generic(m, entry); + print_timestamp(m, abs_usecs, rel_usecs); + if (trace_verbose) + print_name_offset(m, entry->u.special.eip); + else + print_name(m, entry->u.special.eip); + + seq_puts(m, "()<-"); + print_name(m, entry->u.special.v1); + seq_puts(m, "()<-"); + print_name(m, entry->u.special.v2); + seq_puts(m, "()<-"); + print_name(m, entry->u.special.v3); + seq_puts(m, "()\n"); + + return 0; +} + + +static int notrace l_show_cmdline(struct seq_file *m, unsigned long trace_idx, + struct trace_entry *entry, struct trace_entry *entry0, + struct trace_entry *next_entry) +{ + unsigned long abs_usecs, rel_usecs; + + if (!trace_verbose) + return 0; + + abs_usecs = cycles_to_us(entry->timestamp - entry0->timestamp); + rel_usecs = cycles_to_us(next_entry->timestamp - entry->timestamp); + + seq_printf(m, + "[ => %16s ] %ld.%03ldms (+%ld.%03ldms)\n", + entry->u.cmdline.str, + abs_usecs/1000, abs_usecs % 1000, + rel_usecs/1000, rel_usecs % 1000); + + return 0; +} + +extern unsigned long sys_call_table[NR_syscalls]; + +#if defined(CONFIG_COMPAT) && defined(CONFIG_X86) +extern unsigned long ia32_sys_call_table[], ia32_syscall_end[]; +#define IA32_NR_syscalls (ia32_syscall_end - ia32_sys_call_table) +#endif + +static int notrace l_show_syscall(struct seq_file *m, unsigned long trace_idx, + struct trace_entry *entry, struct trace_entry *entry0, + struct trace_entry *next_entry) +{ + unsigned long abs_usecs, rel_usecs; + unsigned long nr; + + abs_usecs = cycles_to_us(entry->timestamp - entry0->timestamp); + rel_usecs = cycles_to_us(next_entry->timestamp - entry->timestamp); + + print_generic(m, entry); + print_timestamp_short(m, abs_usecs, rel_usecs); + + seq_puts(m, "> "); + nr = entry->u.syscall.nr; +#if defined(CONFIG_COMPAT) && defined(CONFIG_X86) + if (nr & 0x80000000) { + nr &= ~0x80000000; + if (nr < IA32_NR_syscalls) + print_name(m, ia32_sys_call_table[nr]); + else + seq_printf(m, "", nr); + } else +#endif + if (nr < NR_syscalls) + print_name(m, sys_call_table[nr]); + else + seq_printf(m, "", nr); + +#ifdef CONFIG_64BIT + seq_printf(m, " (%016lx %016lx %016lx)\n", + entry->u.syscall.p1, entry->u.syscall.p2, entry->u.syscall.p3); +#else + seq_printf(m, " (%08lx %08lx %08lx)\n", + entry->u.syscall.p1, entry->u.syscall.p2, entry->u.syscall.p3); +#endif + + return 0; +} + +static int notrace l_show_sysret(struct seq_file *m, unsigned long trace_idx, + struct trace_entry *entry, struct trace_entry *entry0, + struct trace_entry *next_entry) +{ + unsigned long abs_usecs, rel_usecs; + + abs_usecs = cycles_to_us(entry->timestamp - entry0->timestamp); + rel_usecs = cycles_to_us(next_entry->timestamp - entry->timestamp); + + print_generic(m, entry); + print_timestamp_short(m, abs_usecs, rel_usecs); + + seq_printf(m, "< (%ld)\n", entry->u.sysret.ret); + + return 0; +} + + +static int notrace l_show(struct seq_file *m, void *p) +{ + struct cpu_trace *tr = out_tr.traces; + struct trace_entry *entry, *entry0, *next_entry; + unsigned long trace_idx; + + cond_resched(); + entry = p; + if (entry->timestamp < out_tr.first_timestamp) + return 0; + if (entry->timestamp > out_tr.last_timestamp) + return 0; + + entry0 = tr->trace; + trace_idx = entry - entry0; + + if (trace_idx + 1 < tr->trace_idx) + next_entry = entry + 1; + else + next_entry = entry; + + if (trace_verbose) + seq_printf(m, "(T%d/#%ld) ", entry->type, trace_idx); + + switch (entry->type) { + case TRACE_FN: + l_show_fn(m, trace_idx, entry, entry0, next_entry); + break; + case TRACE_SPECIAL: + l_show_special(m, trace_idx, entry, entry0, next_entry, 0); + break; + case TRACE_SPECIAL_PID: + l_show_special_pid(m, trace_idx, entry, entry0, next_entry); + break; + case TRACE_SPECIAL_U64: + l_show_special(m, trace_idx, entry, entry0, next_entry, 1); + break; + case TRACE_SPECIAL_SYM: + l_show_special_sym(m, trace_idx, entry, entry0, + next_entry, 1); + break; + case TRACE_CMDLINE: + l_show_cmdline(m, trace_idx, entry, entry0, next_entry); + break; + case TRACE_SYSCALL: + l_show_syscall(m, trace_idx, entry, entry0, next_entry); + break; + case TRACE_SYSRET: + l_show_sysret(m, trace_idx, entry, entry0, next_entry); + break; + default: + seq_printf(m, "unknown trace type %d\n", entry->type); + } + return 0; +} + +struct seq_operations latency_trace_op = { + .start = l_start, + .next = l_next, + .stop = l_stop, + .show = l_show +}; + +/* + * Copy the new maximum trace into the separate maximum-trace + * structure. (this way the maximum trace is permanently saved, + * for later retrieval via /proc/latency_trace) + */ +static void update_max_tr(struct cpu_trace *tr) +{ + struct cpu_trace *save; + int cpu, all_cpus = 0; + +#ifdef CONFIG_PREEMPT + WARN_ON(!preempt_count() && !irqs_disabled()); +#endif + + max_tr.cpu = tr->cpu; + save = max_tr.traces + tr->cpu; + + if ((wakeup_timing || trace_user_triggered || trace_print_on_crash || + print_functions) && trace_all_cpus) { + all_cpus = 1; + for_each_online_cpu(cpu) + atomic_inc(&cpu_traces[cpu].disabled); + } + + save->saved_latency = preempt_max_latency; + save->preempt_timestamp = tr->preempt_timestamp; + save->critical_start = tr->critical_start; + save->critical_end = tr->critical_end; + save->critical_sequence = tr->critical_sequence; + + memcpy(save->comm, current->comm, CMDLINE_BYTES); + save->pid = current->pid; + save->uid = current->uid; + save->nice = current->static_prio - 20 - MAX_RT_PRIO; + save->policy = current->policy; + save->rt_priority = current->rt_priority; + + if (all_cpus) { + for_each_online_cpu(cpu) { + copy_trace(max_tr.traces + cpu, cpu_traces + cpu, 1); + atomic_dec(&cpu_traces[cpu].disabled); + } + } else + copy_trace(save, tr, 1); +} + +#else /* !EVENT_TRACE */ + +static inline void notrace +____trace(int cpu, enum trace_type type, struct cpu_trace *tr, + unsigned long eip, unsigned long parent_eip, + unsigned long v1, unsigned long v2, unsigned long v3, + unsigned long flags) +{ +} + +static inline void notrace +___trace(enum trace_type type, unsigned long eip, unsigned long parent_eip, + unsigned long v1, unsigned long v2, + unsigned long v3) +{ +} + +static inline void notrace __trace(unsigned long eip, unsigned long parent_eip) +{ +} + +static inline void update_max_tr(struct cpu_trace *tr) +{ +} + +static inline void notrace _trace_cmdline(int cpu, struct cpu_trace *tr) +{ +} + +#endif + +static int setup_preempt_thresh(char *s) +{ + int thresh; + + get_option(&s, &thresh); + if (thresh > 0) { + preempt_thresh = usecs_to_cycles(thresh); + printk("Preemption threshold = %u us\n", thresh); + } + return 1; +} +__setup("preempt_thresh=", setup_preempt_thresh); + +static inline void notrace reset_trace_idx(int cpu, struct cpu_trace *tr) +{ + if (trace_all_cpus) + for_each_online_cpu(cpu) { + tr = cpu_traces + cpu; + tr->trace_idx = 0; + atomic_set(&tr->underrun, 0); + atomic_set(&tr->overrun, 0); + } + else{ + tr->trace_idx = 0; + atomic_set(&tr->underrun, 0); + atomic_set(&tr->overrun, 0); + } +} + +#ifdef CONFIG_CRITICAL_TIMING + +static void notrace +check_critical_timing(int cpu, struct cpu_trace *tr, unsigned long parent_eip) +{ + unsigned long latency, t0, t1; + cycle_t T0, T1, T2, delta; + unsigned long flags; + + if (trace_user_triggered) + return; + /* + * usecs conversion is slow so we try to delay the conversion + * as long as possible: + */ + T0 = tr->preempt_timestamp; + T1 = now(tr, 1); + delta = T1-T0; + + local_save_flags(flags); + + if (!report_latency(delta)) + goto out; + + ____trace(cpu, TRACE_FN, tr, CALLER_ADDR0, parent_eip, 0, 0, 0, flags); + /* + * Update the timestamp, because the trace entry above + * might change it (it can only get larger so the latency + * is fair to be reported): + */ + T2 = now(tr, 1); + + delta = T2-T0; + + latency = cycles_to_usecs(delta); + latency_hist(tr->latency_type, cpu, latency); + + if (latency_hist_flag) { + if (preempt_max_latency >= delta) + goto out; + } + + if (tr->critical_sequence != max_sequence || down_trylock(&max_mutex)) + goto out; + +#ifndef CONFIG_CRITICAL_LATENCY_HIST + if (!preempt_thresh && preempt_max_latency > delta) { + printk("bug: updating %016Lx > %016Lx?\n", + preempt_max_latency, delta); + printk(" [%016Lx %016Lx %016Lx]\n", T0, T1, T2); + } +#endif + + preempt_max_latency = delta; + t0 = cycles_to_usecs(T0); + t1 = cycles_to_usecs(T1); + + tr->critical_end = parent_eip; + + update_max_tr(tr); + +#ifndef CONFIG_CRITICAL_LATENCY_HIST + if (preempt_thresh) + printk("(%16s-%-5d|#%d): %lu us critical section " + "violates %lu us threshold.\n" + " => started at timestamp %lu: ", + current->comm, current->pid, + raw_smp_processor_id(), + latency, cycles_to_usecs(preempt_thresh), t0); + else + printk("(%16s-%-5d|#%d): new %lu us maximum-latency " + "critical section.\n => started at timestamp %lu: ", + current->comm, current->pid, + raw_smp_processor_id(), + latency, t0); + + print_symbol("<%s>\n", tr->critical_start); + printk(" => ended at timestamp %lu: ", t1); + print_symbol("<%s>\n", tr->critical_end); + dump_stack(); + t1 = cycles_to_usecs(now(tr, 1)); + printk(" => dump-end timestamp %lu\n\n", t1); +#endif + + max_sequence++; + + up(&max_mutex); + +out: + tr->critical_sequence = max_sequence; + tr->preempt_timestamp = now(tr, 1); + tr->early_warning = 0; + reset_trace_idx(cpu, tr); + _trace_cmdline(cpu, tr); + ____trace(cpu, TRACE_FN, tr, CALLER_ADDR0, parent_eip, 0, 0, 0, flags); +} + +void notrace touch_critical_timing(void) +{ + int cpu = raw_smp_processor_id(); + struct cpu_trace *tr = cpu_traces + cpu; + + if (!tr->critical_start || atomic_read(&tr->disabled) || + trace_user_triggered || wakeup_timing) + return; + + if (preempt_count() > 0 && tr->critical_start) { + atomic_inc(&tr->disabled); + check_critical_timing(cpu, tr, CALLER_ADDR0); + tr->critical_start = CALLER_ADDR0; + tr->critical_sequence = max_sequence; + atomic_dec(&tr->disabled); + } +} +EXPORT_SYMBOL(touch_critical_timing); + +void notrace stop_critical_timing(void) +{ + struct cpu_trace *tr = cpu_traces + raw_smp_processor_id(); + + tr->critical_start = 0; +} +EXPORT_SYMBOL(stop_critical_timing); + +static inline void notrace +__start_critical_timing(unsigned long eip, unsigned long parent_eip, + int latency_type) +{ + int cpu = raw_smp_processor_id(); + struct cpu_trace *tr = cpu_traces + cpu; + unsigned long flags; + + if (tr->critical_start || atomic_read(&tr->disabled) || + trace_user_triggered || wakeup_timing) + return; + + atomic_inc(&tr->disabled); + + tr->critical_sequence = max_sequence; + tr->preempt_timestamp = now(tr, 1); + tr->critical_start = eip; + reset_trace_idx(cpu, tr); + tr->latency_type = latency_type; + _trace_cmdline(cpu, tr); + + local_save_flags(flags); + ____trace(cpu, TRACE_FN, tr, eip, parent_eip, 0, 0, 0, flags); + + atomic_dec(&tr->disabled); +} + +static inline void notrace +__stop_critical_timing(unsigned long eip, unsigned long parent_eip) +{ + int cpu = raw_smp_processor_id(); + struct cpu_trace *tr = cpu_traces + cpu; + unsigned long flags; + + if (!tr->critical_start || atomic_read(&tr->disabled) || + trace_user_triggered || wakeup_timing) + return; + + atomic_inc(&tr->disabled); + local_save_flags(flags); + ____trace(cpu, TRACE_FN, tr, eip, parent_eip, 0, 0, 0, flags); + check_critical_timing(cpu, tr, eip); + tr->critical_start = 0; + atomic_dec(&tr->disabled); +} + +#endif + +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING + +#ifdef CONFIG_LOCKDEP + +void notrace time_hardirqs_on(unsigned long a0, unsigned long a1) +{ + unsigned long flags; + + local_save_flags(flags); + + if (!irqs_off_preempt_count() && irqs_disabled_flags(flags)) + __stop_critical_timing(a0, a1); +} + +void notrace time_hardirqs_off(unsigned long a0, unsigned long a1) +{ + unsigned long flags; + + local_save_flags(flags); + + if (!irqs_off_preempt_count() && irqs_disabled_flags(flags)) + __start_critical_timing(a0, a1, INTERRUPT_LATENCY); +} + +#else /* !CONFIG_LOCKDEP */ + +/* + * Dummy: + */ + +void early_boot_irqs_off(void) +{ +} + +void early_boot_irqs_on(void) +{ +} + +void trace_softirqs_on(unsigned long ip) +{ +} + +void trace_softirqs_off(unsigned long ip) +{ +} + +inline void print_irqtrace_events(struct task_struct *curr) +{ +} + +/* + * We are only interested in hardirq on/off events: + */ +void notrace trace_hardirqs_on(void) +{ + unsigned long flags; + + local_save_flags(flags); + + if (!irqs_off_preempt_count() && irqs_disabled_flags(flags)) + __stop_critical_timing(CALLER_ADDR0, 0 /* CALLER_ADDR1 */); +} + +EXPORT_SYMBOL(trace_hardirqs_on); + +void notrace trace_hardirqs_off(void) +{ + unsigned long flags; + + local_save_flags(flags); + + if (!irqs_off_preempt_count() && irqs_disabled_flags(flags)) + __start_critical_timing(CALLER_ADDR0, 0 /* CALLER_ADDR1 */, + INTERRUPT_LATENCY); +} + +EXPORT_SYMBOL(trace_hardirqs_off); + +#endif /* !CONFIG_LOCKDEP */ + +#endif /* CONFIG_CRITICAL_IRQSOFF_TIMING */ + +#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_CRITICAL_TIMING) + +static inline unsigned long get_parent_eip(void) +{ + unsigned long parent_eip = CALLER_ADDR1; + + if (in_lock_functions(parent_eip)) { + parent_eip = CALLER_ADDR2; + if (in_lock_functions(parent_eip)) + parent_eip = CALLER_ADDR3; + } + + return parent_eip; +} + +void notrace add_preempt_count(unsigned int val) +{ + unsigned long eip = CALLER_ADDR0; + unsigned long parent_eip = get_parent_eip(); + +#ifdef CONFIG_DEBUG_PREEMPT + /* + * Underflow? + */ + if (DEBUG_WARN_ON(((int)preempt_count() < 0))) + return; + /* + * Spinlock count overflowing soon? + */ + if (DEBUG_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10)) + return; +#endif + + preempt_count() += val; +#ifdef CONFIG_PREEMPT_TRACE + if (val <= 10) { + unsigned int idx = preempt_count() & PREEMPT_MASK; + if (idx < MAX_PREEMPT_TRACE) { + current->preempt_trace_eip[idx] = eip; + current->preempt_trace_parent_eip[idx] = parent_eip; + } + } +#endif +#ifdef CONFIG_CRITICAL_PREEMPT_TIMING + { +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING + unsigned long flags; + + local_save_flags(flags); + + if (!irqs_disabled_flags(flags)) +#endif + if (preempt_count() == val) + __start_critical_timing(eip, parent_eip, + PREEMPT_LATENCY); + } +#endif + (void)eip, (void)parent_eip; +} +EXPORT_SYMBOL(add_preempt_count); + +void notrace sub_preempt_count(unsigned int val) +{ +#ifdef CONFIG_DEBUG_PREEMPT + /* + * Underflow? + */ + if (DEBUG_WARN_ON(unlikely(val > preempt_count()))) + return; + /* + * Is the spinlock portion underflowing? + */ + if (DEBUG_WARN_ON((val < PREEMPT_MASK) && + !(preempt_count() & PREEMPT_MASK))) + return; +#endif + +#ifdef CONFIG_CRITICAL_PREEMPT_TIMING + { +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING + unsigned long flags; + + local_save_flags(flags); + + if (!irqs_disabled_flags(flags)) +#endif + if (preempt_count() == val) + __stop_critical_timing(CALLER_ADDR0, + CALLER_ADDR1); + } +#endif + preempt_count() -= val; +} + +EXPORT_SYMBOL(sub_preempt_count); + +void notrace mask_preempt_count(unsigned int mask) +{ + unsigned long eip = CALLER_ADDR0; + unsigned long parent_eip = get_parent_eip(); + + preempt_count() |= mask; + +#ifdef CONFIG_CRITICAL_PREEMPT_TIMING + { +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING + unsigned long flags; + + local_save_flags(flags); + + if (!irqs_disabled_flags(flags)) +#endif + if (preempt_count() == mask) + __start_critical_timing(eip, parent_eip, + PREEMPT_LATENCY); + } +#endif + (void) eip, (void) parent_eip; +} +EXPORT_SYMBOL(mask_preempt_count); + +void notrace unmask_preempt_count(unsigned int mask) +{ +#ifdef CONFIG_CRITICAL_PREEMPT_TIMING + { +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING + unsigned long flags; + + local_save_flags(flags); + + if (!irqs_disabled_flags(flags)) +#endif + if (preempt_count() == mask) + __stop_critical_timing(CALLER_ADDR0, + CALLER_ADDR1); + } +#endif + preempt_count() &= ~mask; +} +EXPORT_SYMBOL(unmask_preempt_count); + + +#endif + +/* + * Wakeup latency timing/tracing. We get upcalls from the scheduler + * when a task is being woken up and we time/trace it until it gets + * to a CPU - or an even-higher-prio task supercedes it. (in that + * case we throw away the currently traced task - we dont try to + * handle nesting, that simplifies things significantly) + */ +#ifdef CONFIG_WAKEUP_TIMING + +static void notrace +check_wakeup_timing(struct cpu_trace *tr, unsigned long parent_eip, + unsigned long *flags) +{ + int cpu = raw_smp_processor_id(); + unsigned long latency, t0, t1; + cycle_t T0, T1, delta; + + if (trace_user_triggered) + return; + + atomic_inc(&tr->disabled); + if (atomic_read(&tr->disabled) != 1) + goto out; + + T0 = tr->preempt_timestamp; + T1 = now(tr, 1); + /* + * Any wraparound or time warp and we are out: + */ + if (T0 > T1) + goto out; + delta = T1-T0; + + if (!report_latency(delta)) + goto out; + + ____trace(smp_processor_id(), TRACE_FN, tr, CALLER_ADDR0, parent_eip, + 0, 0, 0, *flags); + + latency = cycles_to_usecs(delta); + latency_hist(tr->latency_type, cpu, latency); + + if (latency_hist_flag) { + if (preempt_max_latency >= delta) + goto out; + } + + if (tr->critical_sequence != max_sequence || down_trylock(&max_mutex)) + goto out; + +#ifndef CONFIG_WAKEUP_LATENCY_HIST + if (!preempt_thresh && preempt_max_latency > delta) { + printk("bug2: updating %016lx > %016Lx?\n", + preempt_max_latency, delta); + printk(" [%016Lx %016Lx]\n", T0, T1); + } +#endif + + preempt_max_latency = delta; + t0 = cycles_to_usecs(T0); + t1 = cycles_to_usecs(T1); + tr->critical_end = parent_eip; + + update_max_tr(tr); + + atomic_dec(&tr->disabled); + __raw_spin_unlock(&sch.trace_lock); + local_irq_restore(*flags); + +#ifndef CONFIG_WAKEUP_LATENCY_HIST + if (preempt_thresh) + printk("(%16s-%-5d|#%d): %lu us wakeup latency " + "violates %lu us threshold.\n", + current->comm, current->pid, + raw_smp_processor_id(), latency, + cycles_to_usecs(preempt_thresh)); + else + printk("(%16s-%-5d|#%d): new %lu us maximum-latency " + "wakeup.\n", current->comm, current->pid, + raw_smp_processor_id(), latency); +#endif + + max_sequence++; + + up(&max_mutex); + + return; + +out: + atomic_dec(&tr->disabled); + __raw_spin_unlock(&sch.trace_lock); + local_irq_restore(*flags); +} + +/* + * Start wakeup latency tracing - called with the runqueue held + * and interrupts disabled: + */ +void __trace_start_sched_wakeup(struct task_struct *p) +{ + struct cpu_trace *tr; + int cpu; + + if (trace_user_triggered || !wakeup_timing) { + trace_special_pid(p->pid, p->prio, -1); + return; + } + + __raw_spin_lock(&sch.trace_lock); + if (sch.task && (sch.task->prio <= p->prio)) + goto out_unlock; + + /* + * New highest-prio task just woke up - start tracing: + */ + sch.task = p; + cpu = task_cpu(p); + sch.cpu = cpu; + /* + * We keep using this CPU's trace buffer even if the task + * gets migrated to another CPU. Tracing only happens on + * the CPU that 'owns' the highest-prio task so it's + * fundamentally single-threaded. + */ + sch.tr = tr = cpu_traces + cpu; + reset_trace_idx(cpu, tr); + +// if (!atomic_read(&tr->disabled)) { + atomic_inc(&tr->disabled); + tr->critical_sequence = max_sequence; + tr->preempt_timestamp = now(tr, 1); + tr->latency_type = WAKEUP_LATENCY; + tr->critical_start = CALLER_ADDR0; + _trace_cmdline(raw_smp_processor_id(), tr); + atomic_dec(&tr->disabled); +// } + + mcount(); + trace_special_pid(p->pid, p->prio, cpu); + trace_special_sym(); +out_unlock: + __raw_spin_unlock(&sch.trace_lock); +} + +void trace_stop_sched_switched(struct task_struct *p) +{ + struct cpu_trace *tr; + unsigned long flags; + + if (trace_user_triggered || !wakeup_timing) + return; + + local_irq_save(flags); + __raw_spin_lock(&sch.trace_lock); + if (p == sch.task) { + trace_special_pid(p->pid, p->prio, task_cpu(p)); + + sch.task = NULL; + tr = sch.tr; + sch.tr = NULL; + WARN_ON(!tr); + /* auto-unlocks the spinlock: */ + check_wakeup_timing(tr, CALLER_ADDR0, &flags); + } else { + if (sch.task) + trace_special_pid(sch.task->pid, sch.task->prio, + p->prio); + if (sch.task && (sch.task->prio >= p->prio)) + sch.task = NULL; + __raw_spin_unlock(&sch.trace_lock); + } + local_irq_restore(flags); +} + +void trace_change_sched_cpu(struct task_struct *p, int new_cpu) +{ + unsigned long flags; + + if (!wakeup_timing) + return; + + trace_special_pid(p->pid, task_cpu(p), new_cpu); + trace_special_sym(); + local_irq_save(flags); + __raw_spin_lock(&sch.trace_lock); + if (p == sch.task && task_cpu(p) != new_cpu) { + sch.cpu = new_cpu; + trace_special(task_cpu(p), new_cpu, 0); + } + __raw_spin_unlock(&sch.trace_lock); + local_irq_restore(flags); +} + +#endif + +#ifdef CONFIG_EVENT_TRACE + +long user_trace_start(void) +{ + struct cpu_trace *tr; + unsigned long flags; + int cpu; + + if (!trace_user_triggered || trace_print_on_crash || print_functions) + return -EINVAL; + + /* + * If the user has not yet reset the max latency after + * bootup then we assume that this was the intention + * (we wont get any tracing done otherwise): + */ + if (preempt_max_latency == (cycle_t)ULONG_MAX) + preempt_max_latency = 0; + + /* + * user_trace_start() might be called from hardirq + * context, if trace_user_triggered_irq is set, so + * be careful about locking: + */ + if (preempt_count() || irqs_disabled()) { + if (down_trylock(&max_mutex)) + return -EAGAIN; + } else + down(&max_mutex); + + local_irq_save(flags); + cpu = smp_processor_id(); + tr = cpu_traces + cpu; + +#ifdef CONFIG_WAKEUP_TIMING + if (wakeup_timing) { + __raw_spin_lock(&sch.trace_lock); + sch.task = current; + sch.cpu = cpu; + sch.tr = tr; + __raw_spin_unlock(&sch.trace_lock); + } +#endif + reset_trace_idx(cpu, tr); + + tr->critical_sequence = max_sequence; + tr->preempt_timestamp = now(tr, 1); + tr->critical_start = CALLER_ADDR0; + _trace_cmdline(cpu, tr); + mcount(); + + WARN_ON(!irqs_disabled()); + local_irq_restore(flags); + + up(&max_mutex); + + return 0; +} + +EXPORT_SYMBOL_GPL(user_trace_start); + +long user_trace_stop(void) +{ + unsigned long latency = 0, flags; + struct cpu_trace *tr; + cycle_t delta; + + if (!trace_user_triggered || trace_print_on_crash || print_functions) + return -EINVAL; + + local_irq_save(flags); + mcount(); + +#ifdef CONFIG_WAKEUP_TIMING + if (wakeup_timing) { + struct task_struct *t; + + __raw_spin_lock(&sch.trace_lock); + t = sch.task; + if (current != t) { + __raw_spin_unlock(&sch.trace_lock); + local_irq_restore(flags); + printk("wrong stop: curr: %s/%d[%d] => %p\n", + current->comm, current->pid, + task_thread_info(current)->cpu, t); + if (t) + printk("wrong stop: curr: %s/%d[%d]\n", + t->comm, t->pid, + task_thread_info(t)->cpu); + return -EINVAL; + } + sch.task = NULL; + tr = sch.tr; + sch.tr = NULL; + __raw_spin_unlock(&sch.trace_lock); + } else +#endif + tr = cpu_traces + smp_processor_id(); + + atomic_inc(&tr->disabled); + if (tr->preempt_timestamp) { + cycle_t T0, T1; + unsigned long long tmp0; + + T0 = tr->preempt_timestamp; + T1 = now(tr, 1); + tmp0 = preempt_max_latency; + if (T1 < T0) + T0 = T1; + delta = T1 - T0; + if (!report_latency(delta)) + goto out; + if (tr->critical_sequence != max_sequence || + down_trylock(&max_mutex)) + goto out; + + WARN_ON(!preempt_thresh && preempt_max_latency > delta); + + preempt_max_latency = delta; + update_max_tr(tr); + + latency = cycles_to_usecs(delta); + + max_sequence++; + up(&max_mutex); +out: + tr->preempt_timestamp = 0; + } + atomic_dec(&tr->disabled); + local_irq_restore(flags); + + if (latency) { + if (preempt_thresh) + printk("(%16s-%-5d|#%d): %lu us user-latency " + "violates %lu us threshold.\n", + current->comm, current->pid, + raw_smp_processor_id(), latency, + cycles_to_usecs(preempt_thresh)); + else + printk("(%16s-%-5d|#%d): new %lu us user-latency.\n", + current->comm, current->pid, + raw_smp_processor_id(), latency); + } + + return 0; +} + +EXPORT_SYMBOL(user_trace_stop); + +static int trace_print_cpu = -1; + +void notrace stop_trace(void) +{ + if (trace_print_on_crash && trace_print_cpu == -1) { + trace_enabled = -1; + trace_print_cpu = raw_smp_processor_id(); + } +} + +EXPORT_SYMBOL(stop_trace); + +static void print_entry(struct trace_entry *entry, struct trace_entry *entry0) +{ + unsigned long abs_usecs; + int hardirq, softirq; + + abs_usecs = cycles_to_us(entry->timestamp - entry0->timestamp); + + printk("%-5d ", entry->pid); + + printk("%d%c%c", + entry->cpu, + (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : + (entry->flags & TRACE_FLAG_IRQS_HARD_OFF) ? 'D' : '.', + (entry->flags & TRACE_FLAG_NEED_RESCHED_DELAYED) ? 'n' : + ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.')); + + hardirq = entry->flags & TRACE_FLAG_HARDIRQ; + softirq = entry->flags & TRACE_FLAG_SOFTIRQ; + if (hardirq && softirq) + printk("H"); + else { + if (hardirq) + printk("h"); + else { + if (softirq) + printk("s"); + else + printk("."); + } + } + + if (entry->preempt_count) + printk(":%x ", entry->preempt_count); + else + printk(":. "); + + printk("%ld.%03ldms: ", abs_usecs/1000, abs_usecs % 1000); + + switch (entry->type) { + case TRACE_FN: + printk_name(entry->u.fn.eip); + printk(" <= ("); + printk_name(entry->u.fn.parent_eip); + printk(")\n"); + break; + case TRACE_SPECIAL: + printk(" special: %lx %lx %lx\n", + entry->u.special.v1, entry->u.special.v2, + entry->u.special.v3); + break; + case TRACE_SPECIAL_U64: + printk(" spec64: %lx%08lx %lx\n", + entry->u.special.v1, entry->u.special.v2, + entry->u.special.v3); + break; + } +} + +/* + * Print the current trace at crash time. + * + * We print it backwards, so that the newest (most interesting) entries + * are printed first. + */ +void print_last_trace(void) +{ + unsigned int idx0, idx, i, cpu; + struct cpu_trace *tr; + struct trace_entry *entry0, *entry; + + preempt_disable(); + cpu = smp_processor_id(); + if (trace_enabled != -1 || trace_print_cpu != cpu || + !trace_print_on_crash) { + if (trace_print_on_crash) + printk("skipping trace printing on CPU#%d != %d\n", + cpu, trace_print_cpu); + preempt_enable(); + return; + } + + trace_print_on_crash = 0; + + tr = cpu_traces + cpu; + if (!tr->trace) + goto out; + + printk("Last %ld trace entries:\n", MAX_TRACE); + idx0 = tr->trace_idx; + printk("curr idx: %d\n", idx0); + if (idx0 >= MAX_TRACE) + idx0 = 0; + idx = idx0; + entry0 = tr->trace + idx0; + + for (i = 0; i < MAX_TRACE; i++) { + if (idx == 0) + idx = MAX_TRACE-1; + else + idx--; + entry = tr->trace + idx; + switch (entry->type) { + case TRACE_FN: + case TRACE_SPECIAL: + case TRACE_SPECIAL_U64: + print_entry(entry, entry0); + break; + } + } + printk("printed %ld entries\n", MAX_TRACE); +out: + preempt_enable(); +} + +#ifdef CONFIG_SMP +/* + * On SMP, try to 'peek' on other CPU's traces and record them + * in this CPU's trace. This way we get a rough idea about what's + * going on there, without the overhead of global tracing. + * + * (no need to make this PER_CPU, we bounce it around anyway.) + */ +unsigned long nmi_eips[NR_CPUS]; +unsigned long nmi_flags[NR_CPUS]; + +void notrace nmi_trace(unsigned long eip, unsigned long parent_eip, + unsigned long flags) +{ + int cpu, this_cpu = smp_processor_id(); + + __trace(eip, parent_eip); + + nmi_eips[this_cpu] = parent_eip; + nmi_flags[this_cpu] = flags; + for (cpu = 0; cpu < NR_CPUS; cpu++) + if (cpu_online(cpu) && cpu != this_cpu) { + __trace(eip, nmi_eips[cpu]); + __trace(eip, nmi_flags[cpu]); + } +} +#else +/* + * On UP, NMI tracing is quite simple: + */ +void notrace nmi_trace(unsigned long eip, unsigned long parent_eip, + unsigned long flags) +{ + __trace(eip, parent_eip); +} +#endif + +#endif + +#ifdef CONFIG_PREEMPT_TRACE + +static void print_preempt_trace(struct task_struct *task) +{ + unsigned int count = task_thread_info(task)->preempt_count; + unsigned int i, lim = count & PREEMPT_MASK; + if (lim >= MAX_PREEMPT_TRACE) + lim = MAX_PREEMPT_TRACE-1; + printk("---------------------------\n"); + printk("| preempt count: %08x ]\n", count); + printk("| %d-level deep critical section nesting:\n", lim); + printk("----------------------------------------\n"); + for (i = 1; i <= lim; i++) { + printk(".. [<%08lx>] .... ", task->preempt_trace_eip[i]); + print_symbol("%s\n", task->preempt_trace_eip[i]); + printk(".....[<%08lx>] .. ( <= ", + task->preempt_trace_parent_eip[i]); + print_symbol("%s)\n", task->preempt_trace_parent_eip[i]); + } + printk("\n"); +} + +#endif + +#if defined(CONFIG_PREEMPT_TRACE) || defined(CONFIG_EVENT_TRACE) +void print_traces(struct task_struct *task) +{ + if (!task) + task = current; + +#ifdef CONFIG_PREEMPT_TRACE + print_preempt_trace(task); +#endif +#ifdef CONFIG_EVENT_TRACE + print_last_trace(); +#endif +} +#endif + +#ifdef CONFIG_EVENT_TRACE +#ifndef ARCH_LOW_ADDRESS_LIMIT +#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL +#endif +void * __init tracer_alloc_bootmem(unsigned long size) +{ + void * ret; + + ret =__alloc_bootmem(size, SMP_CACHE_BYTES, ARCH_LOW_ADDRESS_LIMIT); + if (ret != NULL && ((unsigned long)ret) < ARCH_LOW_ADDRESS_LIMIT) { + free_bootmem(__pa(ret), size); + ret = __alloc_bootmem(size, + SMP_CACHE_BYTES, + __pa(MAX_DMA_ADDRESS)); + } + + return ret; +} + +/* + * Allocate all the per-CPU trace buffers and the + * save-maximum/save-output staging buffers: + */ +void __init init_tracer(void) +{ + unsigned long size, total_size = 0; + struct trace_entry *array; + struct cpu_trace *tr; + int cpu; + + printk("num_possible_cpus(): %d\n", num_possible_cpus()); + + size = sizeof(struct trace_entry)*MAX_TRACE; + + for_each_possible_cpu(cpu) { + tr = cpu_traces + cpu; + array = tracer_alloc_bootmem(size); + if (!array) { + printk(KERN_ERR + "CPU#%d: failed to allocate %ld bytes trace buffer!\n", + cpu, size); + } else { + printk(KERN_INFO + "CPU#%d: allocated %ld bytes trace buffer.\n", + cpu, size); + total_size += size; + } + tr->cpu = cpu; + tr->trace = array; + + array = tracer_alloc_bootmem(size); + if (!array) { + printk(KERN_ERR + "CPU#%d: failed to allocate %ld bytes max-trace buffer!\n", + cpu, size); + } else { + printk(KERN_INFO + "CPU#%d: allocated %ld bytes max-trace buffer.\n", + cpu, size); + total_size += size; + } + max_tr.traces[cpu].trace = array; + } + + /* + * The output trace buffer is a special one that only has + * trace entries for the first cpu-trace structure: + */ + size = sizeof(struct trace_entry)*MAX_TRACE*num_possible_cpus(); + array = tracer_alloc_bootmem(size); + if (!array) { + printk(KERN_ERR + "failed to allocate %ld bytes out-trace buffer!\n", + size); + } else { + printk(KERN_INFO "allocated %ld bytes out-trace buffer.\n", + size); + total_size += size; + } + out_tr.traces[0].trace = array; + printk(KERN_INFO + "tracer: a total of %ld bytes allocated.\n", + total_size); +} +#endif Index: linux/kernel/lockdep.c =================================================================== --- linux.orig/kernel/lockdep.c +++ linux/kernel/lockdep.c @@ -139,7 +139,7 @@ static int lock_contention_point(struct return i; } -static void lock_time_inc(struct lock_time *lt, s64 time) +static notrace void lock_time_inc(struct lock_time *lt, s64 time) { if (time > lt->max) lt->max = time; @@ -198,7 +198,7 @@ void clear_lock_stats(struct lock_class memset(class->contention_point, 0, sizeof(class->contention_point)); } -static struct lock_class_stats *get_lock_stats(struct lock_class *class) +static notrace struct lock_class_stats *get_lock_stats(struct lock_class *class) { return &get_cpu_var(lock_stats)[class - lock_classes]; } @@ -208,7 +208,7 @@ static void put_lock_stats(struct lock_c put_cpu_var(lock_stats); } -static void lock_release_holdtime(struct held_lock *hlock) +static notrace void lock_release_holdtime(struct held_lock *hlock) { struct lock_class_stats *stats; s64 holdtime; @@ -1036,7 +1036,7 @@ find_usage_forwards(struct lock_class *s * Return 1 otherwise and keep unchanged. * Return 0 on error. */ -static noinline int +static noinline notrace int find_usage_backwards(struct lock_class *source, unsigned int depth) { struct lock_list *entry; @@ -1586,7 +1586,7 @@ static inline int validate_chain(struct * We are building curr_chain_key incrementally, so double-check * it from scratch, to make sure that it's done correctly: */ -static void check_chain_key(struct task_struct *curr) +static void notrace check_chain_key(struct task_struct *curr) { #ifdef CONFIG_DEBUG_LOCKDEP struct held_lock *hlock, *prev_hlock = NULL; @@ -2009,7 +2009,7 @@ void early_boot_irqs_on(void) /* * Hardirqs will be enabled: */ -void trace_hardirqs_on(void) +void notrace trace_hardirqs_on(void) { struct task_struct *curr = current; unsigned long ip; @@ -2050,6 +2050,9 @@ void trace_hardirqs_on(void) curr->hardirq_enable_ip = ip; curr->hardirq_enable_event = ++curr->irq_events; debug_atomic_inc(&hardirqs_on_events); +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING + time_hardirqs_on(CALLER_ADDR0, 0 /* CALLER_ADDR1 */); +#endif } EXPORT_SYMBOL(trace_hardirqs_on); @@ -2057,7 +2060,7 @@ EXPORT_SYMBOL(trace_hardirqs_on); /* * Hardirqs were disabled: */ -void trace_hardirqs_off(void) +void notrace trace_hardirqs_off(void) { struct task_struct *curr = current; @@ -2075,6 +2078,9 @@ void trace_hardirqs_off(void) curr->hardirq_disable_ip = _RET_IP_; curr->hardirq_disable_event = ++curr->irq_events; debug_atomic_inc(&hardirqs_off_events); +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING + time_hardirqs_off(CALLER_ADDR0, 0 /* CALLER_ADDR1 */); +#endif } else debug_atomic_inc(&redundant_hardirqs_off); } @@ -2241,8 +2247,8 @@ static inline int separate_irq_context(s /* * Mark a lock with a usage bit, and validate the state transition: */ -static int mark_lock(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit new_bit) +static int notrace mark_lock(struct task_struct *curr, struct held_lock *this, + enum lock_usage_bit new_bit) { unsigned int new_mask = 1 << new_bit, ret = 1; @@ -2301,6 +2307,7 @@ static int mark_lock(struct task_struct * We must printk outside of the graph_lock: */ if (ret == 2) { + user_trace_stop(); printk("\nmarked lock as {%s}:\n", usage_str[new_bit]); print_lock(this); print_irqtrace_events(curr); @@ -2648,7 +2655,7 @@ __lock_release(struct lockdep_map *lock, /* * Check whether we follow the irq-flags state precisely: */ -static void check_flags(unsigned long flags) +static notrace void check_flags(unsigned long flags) { #if defined(CONFIG_DEBUG_LOCKDEP) && defined(CONFIG_TRACE_IRQFLAGS) if (!debug_locks) @@ -2680,8 +2687,9 @@ static void check_flags(unsigned long fl * We are not always called with irqs disabled - do that here, * and also avoid lockdep recursion: */ -void lock_acquire(struct lockdep_map *lock, unsigned int subclass, - int trylock, int read, int check, unsigned long ip) +void notrace +lock_acquire(struct lockdep_map *lock, unsigned int subclass, + int trylock, int read, int check, unsigned long ip) { unsigned long flags; @@ -2692,9 +2700,9 @@ void lock_acquire(struct lockdep_map *lo return; raw_local_irq_save(flags); + current->lockdep_recursion = 1; check_flags(flags); - current->lockdep_recursion = 1; __lock_acquire(lock, subclass, trylock, read, check, irqs_disabled_flags(flags), ip); current->lockdep_recursion = 0; @@ -2703,7 +2711,8 @@ void lock_acquire(struct lockdep_map *lo EXPORT_SYMBOL_GPL(lock_acquire); -void lock_release(struct lockdep_map *lock, int nested, unsigned long ip) +void notrace +lock_release(struct lockdep_map *lock, int nested, unsigned long ip) { unsigned long flags; @@ -2714,8 +2723,8 @@ void lock_release(struct lockdep_map *lo return; raw_local_irq_save(flags); - check_flags(flags); current->lockdep_recursion = 1; + check_flags(flags); __lock_release(lock, nested, ip); current->lockdep_recursion = 0; raw_local_irq_restore(flags); @@ -2863,7 +2872,7 @@ void lock_contended(struct lockdep_map * } EXPORT_SYMBOL_GPL(lock_contended); -void lock_acquired(struct lockdep_map *lock) +void notrace lock_acquired(struct lockdep_map *lock) { unsigned long flags; Index: linux/kernel/panic.c =================================================================== --- linux.orig/kernel/panic.c +++ linux/kernel/panic.c @@ -66,6 +66,8 @@ NORET_TYPE void panic(const char * fmt, unsigned long caller = (unsigned long) __builtin_return_address(0); #endif + stop_trace(); + /* * It's possible to come here directly from a panic-assertion and not * have preempt disabled. Some functions called from here want Index: linux/kernel/printk.c =================================================================== --- linux.orig/kernel/printk.c +++ linux/kernel/printk.c @@ -433,12 +433,29 @@ static void __call_console_drivers(unsig { struct console *con; + touch_critical_timing(); for (con = console_drivers; con; con = con->next) { if ((con->flags & CON_ENABLED) && con->write && (cpu_online(smp_processor_id()) || - (con->flags & CON_ANYTIME))) + (con->flags & CON_ANYTIME))) { + /* + * Disable tracing of printk details - it just + * clobbers the trace output with lots of + * repetitive lines (especially if console is + * on a serial line): + */ +#ifdef CONFIG_EVENT_TRACE + int trace_save = trace_enabled; + + trace_enabled = 0; + con->write(con, &LOG_BUF(start), end - start); + trace_enabled = trace_save; +#else con->write(con, &LOG_BUF(start), end - start); +#endif + } } + touch_critical_timing(); } static int __read_mostly ignore_loglevel; Index: linux/kernel/sched.c =================================================================== --- linux.orig/kernel/sched.c +++ linux/kernel/sched.c @@ -86,6 +86,10 @@ unsigned long long __attribute__((weak)) #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) +#define __PRIO(prio) \ + ((prio) <= 99 ? 199 - (prio) : (prio) - 120) + +#define PRIO(p) __PRIO((p)->prio) /* * 'User priority' is the nice value converted to something we * can work with better when scaling various scheduler parameters, @@ -368,6 +372,8 @@ static DEFINE_MUTEX(sched_hotcpu_mutex); static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) { rq->curr->sched_class->check_preempt_curr(rq, p); + if (p != rq->curr && p->prio < rq->curr->prio) + __trace_start_sched_wakeup(p); } static inline int cpu_of(struct rq *rq) @@ -1003,6 +1009,11 @@ static void deactivate_task(struct rq *r dequeue_task(rq, p, sleep); dec_nr_running(p, rq); + + if (sleep) { + trace_special_pid(p->pid, p->state == TASK_UNINTERRUPTIBLE, 0); + trace_special_sym(); + } } /** @@ -1024,6 +1035,7 @@ static inline void __set_task_cpu(struct { set_task_cfs_rq(p, cpu); #ifdef CONFIG_SMP + trace_change_sched_cpu(p, cpu); /* * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be * successfuly executed on another CPU. We must ensure that updates of @@ -1079,6 +1091,8 @@ void set_task_cpu(struct task_struct *p, schedstat_inc(p, se.nr_migrations); if (task_hot(p, old_rq->clock, NULL)) schedstat_inc(p, se.nr_forced2_migrations); + trace_special_pid(p->pid, old_cpu, new_cpu); + trace_special_sym(); } #endif p->se.vruntime -= old_cfsrq->min_vruntime - @@ -1626,6 +1640,8 @@ out_activate: else schedstat_inc(p, se.nr_wakeups_remote); update_rq_clock(rq); + trace_special_pid(p->pid, in_interrupt() != 0, sync); + trace_special_sym(); activate_task(rq, p, 1); check_preempt_curr(rq, p); success = 1; @@ -1640,14 +1656,19 @@ out: int fastcall wake_up_process(struct task_struct *p) { - return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | + int ret = try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); + mcount(); + return ret; } EXPORT_SYMBOL(wake_up_process); int fastcall wake_up_state(struct task_struct *p, unsigned int state) { - return try_to_wake_up(p, state, 0); + int ret = try_to_wake_up(p, state, 0); + + mcount(); + return ret; } /* @@ -1743,6 +1764,7 @@ void fastcall wake_up_new_task(struct ta p->prio = effective_prio(p); + trace_special_pid(p->pid, 0, 0); if (!p->sched_class->task_new || !current->se.on_rq) { activate_task(rq, p, 0); } else { @@ -1875,6 +1897,7 @@ static void finish_task_switch(struct rq finish_arch_switch(prev); finish_lock_switch(rq, prev); fire_sched_in_preempt_notifiers(current); + trace_stop_sched_switched(current); if (mm) mmdrop(mm); if (unlikely(prev_state == TASK_DEAD)) { @@ -1946,10 +1969,13 @@ context_switch(struct rq *rq, struct tas spin_release(&rq->lock.dep_map, 1, _THIS_IP_); #endif + trace_cmdline(); + /* Here we just switch the register state and the stack. */ switch_to(prev, next, prev); barrier(); + trace_special_pid(prev->pid, PRIO(prev), PRIO(current)); /* * this_rq must be evaluated again because prev may have moved * CPUs since it called schedule(), thus the 'rq' on its stack @@ -3482,41 +3508,39 @@ void scheduler_tick(void) #endif } -#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) +#if defined(CONFIG_EVENT_TRACE) && defined(CONFIG_DEBUG_RT_MUTEXES) -void fastcall add_preempt_count(int val) +static void trace_array(struct rt_prio_array *array) { - /* - * Underflow? - */ - if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) - return; - preempt_count() += val; - /* - * Spinlock count overflowing soon? - */ - DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= - PREEMPT_MASK - 10); + int i; + struct task_struct *p; + struct list_head *head, *tmp; + + for (i = 0; i < MAX_RT_PRIO; i++) { + head = array->queue + i; + if (list_empty(head)) { + WARN_ON(test_bit(i, array->bitmap)); + continue; + } + WARN_ON(!test_bit(i, array->bitmap)); + list_for_each(tmp, head) { + p = list_entry(tmp, struct task_struct, run_list); + trace_special_pid(p->pid, p->prio, PRIO(p)); + } + } } -EXPORT_SYMBOL(add_preempt_count); -void fastcall sub_preempt_count(int val) +static inline void trace_all_runnable_tasks(struct rq *rq) { - /* - * Underflow? - */ - if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) - return; - /* - * Is the spinlock portion underflowing? - */ - if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && - !(preempt_count() & PREEMPT_MASK))) - return; + if (trace_enabled) + trace_array(&rq->rt.active); +} - preempt_count() -= val; +#else + +static inline void trace_all_runnable_tasks(struct rq *rq) +{ } -EXPORT_SYMBOL(sub_preempt_count); #endif @@ -3643,6 +3667,8 @@ need_resched_nonpreemptible: prev->sched_class->put_prev_task(rq, prev); next = pick_next_task(rq, prev); + trace_all_runnable_tasks(rq); + sched_info_switch(prev, next); if (likely(prev != next)) { @@ -3651,8 +3677,10 @@ need_resched_nonpreemptible: ++*switch_count; context_switch(rq, prev, next); /* unlocks the rq */ - } else + } else { spin_unlock_irq(&rq->lock); + trace_stop_sched_switched(next); + } if (unlikely(reacquire_kernel_lock(current) < 0)) { cpu = smp_processor_id(); @@ -4040,6 +4068,7 @@ void rt_mutex_setprio(struct task_struct check_preempt_curr(rq, p); } } + task_rq_unlock(rq, &flags); } @@ -6830,6 +6859,7 @@ void __might_sleep(char *file, int line) if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) return; prev_jiffy = jiffies; + stop_trace(); printk(KERN_ERR "BUG: sleeping function called from invalid" " context at %s:%d\n", file, line); printk("in_atomic():%d, irqs_disabled():%d\n", Index: linux/kernel/sched_fair.c =================================================================== --- linux.orig/kernel/sched_fair.c +++ linux/kernel/sched_fair.c @@ -311,6 +311,9 @@ __update_curr(struct cfs_rq *cfs_rq, str schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); curr->sum_exec_runtime += delta_exec; + if (entity_is_task(curr)) + trace_special_pid(task_of(curr)->pid, delta_exec, 0); + schedstat_add(cfs_rq, exec_clock, delta_exec); delta_exec_weighted = delta_exec; if (unlikely(curr->load.weight != NICE_0_LOAD)) { Index: linux/kernel/sched_rt.c =================================================================== --- linux.orig/kernel/sched_rt.c +++ linux/kernel/sched_rt.c @@ -22,6 +22,8 @@ static void update_curr_rt(struct rq *rq schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec)); curr->se.sum_exec_runtime += delta_exec; + trace_special_pid(curr->pid, delta_exec, 0); + curr->se.exec_start = rq->clock; } Index: linux/kernel/softirq.c =================================================================== --- linux.orig/kernel/softirq.c +++ linux/kernel/softirq.c @@ -91,7 +91,7 @@ static inline void __local_bh_disable(un } #endif /* CONFIG_TRACE_IRQFLAGS */ -void local_bh_disable(void) +void notrace local_bh_disable(void) { __local_bh_disable((unsigned long)__builtin_return_address(0)); } @@ -129,7 +129,7 @@ void _local_bh_enable(void) EXPORT_SYMBOL(_local_bh_enable); -void local_bh_enable(void) +void notrace local_bh_enable(void) { #ifdef CONFIG_TRACE_IRQFLAGS unsigned long flags; @@ -163,7 +163,7 @@ void local_bh_enable(void) } EXPORT_SYMBOL(local_bh_enable); -void local_bh_enable_ip(unsigned long ip) +void notrace local_bh_enable_ip(unsigned long ip) { #ifdef CONFIG_TRACE_IRQFLAGS unsigned long flags; Index: linux/kernel/spinlock.c =================================================================== --- linux.orig/kernel/spinlock.c +++ linux/kernel/spinlock.c @@ -76,7 +76,7 @@ void __lockfunc _read_lock(rwlock_t *loc } EXPORT_SYMBOL(_read_lock); -unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) +unsigned long notrace __lockfunc _spin_lock_irqsave(spinlock_t *lock) { unsigned long flags; @@ -341,7 +341,7 @@ void __lockfunc _read_unlock(rwlock_t *l } EXPORT_SYMBOL(_read_unlock); -void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) +void notrace __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) { spin_release(&lock->dep_map, 1, _RET_IP_); _raw_spin_unlock(lock); Index: linux/kernel/sys.c =================================================================== --- linux.orig/kernel/sys.c +++ linux/kernel/sys.c @@ -1643,6 +1643,14 @@ asmlinkage long sys_prctl(int option, un { long error; +#ifdef CONFIG_EVENT_TRACE + if (option == PR_SET_TRACING) { + if (arg2) + return user_trace_start(); + return user_trace_stop(); + } +#endif + error = security_task_prctl(option, arg2, arg3, arg4, arg5); if (error) return error; Index: linux/kernel/sysctl.c =================================================================== --- linux.orig/kernel/sysctl.c +++ linux/kernel/sysctl.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -43,6 +44,7 @@ #include #include #include +#include #include #include #include @@ -346,6 +348,132 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, +#ifdef CONFIG_WAKEUP_TIMING + { + .ctl_name = CTL_UNNUMBERED, + .procname = "wakeup_timing", + .data = &wakeup_timing, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#if defined(CONFIG_WAKEUP_TIMING) || defined(CONFIG_EVENT_TRACE) + { + .ctl_name = CTL_UNNUMBERED, + .procname = "preempt_max_latency", + .data = &preempt_max_latency, + .maxlen = sizeof(preempt_max_latency), + .mode = 0644, + .proc_handler = &proc_doulongvec_minmax, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "preempt_thresh", + .data = &preempt_thresh, + .maxlen = sizeof(preempt_thresh), + .mode = 0644, + .proc_handler = &proc_doulongvec_minmax, + }, +#endif +#ifdef CONFIG_EVENT_TRACE + { + .ctl_name = CTL_UNNUMBERED, + .procname = "trace_enabled", + .data = &trace_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "syscall_tracing", + .data = &syscall_tracing, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "stackframe_tracing", + .data = &stackframe_tracing, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "mcount_enabled", + .data = &mcount_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "trace_user_triggered", + .data = &trace_user_triggered, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "trace_user_trigger_irq", + .data = &trace_user_trigger_irq, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "trace_freerunning", + .data = &trace_freerunning, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "trace_print_on_crash", + .data = &trace_print_on_crash, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "trace_verbose", + .data = &trace_verbose, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "trace_all_cpus", + .data = &trace_all_cpus, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "trace_use_raw_cycles", + .data = &trace_use_raw_cycles, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "trace_all_runnable", + .data = &trace_all_runnable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif { .ctl_name = KERN_CORE_USES_PID, .procname = "core_uses_pid", Index: linux/kernel/time/timekeeping.c =================================================================== --- linux.orig/kernel/time/timekeeping.c +++ linux/kernel/time/timekeeping.c @@ -104,6 +104,33 @@ static inline void __get_realtime_clock_ timespec_add_ns(ts, nsecs); } +cycle_t notrace get_monotonic_cycles(void) +{ + cycle_t cycle_now, cycle_delta; + + /* read clocksource: */ + cycle_now = clocksource_read(clock); + + /* calculate the delta since the last update_wall_time: */ + cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + + return clock->cycle_last + cycle_delta; +} + +unsigned long notrace cycles_to_usecs(cycle_t cycles) +{ + u64 ret = cyc2ns(clock, cycles); + + do_div(ret, 1000); + + return ret; +} + +cycle_t notrace usecs_to_cycles(unsigned long usecs) +{ + return ns2cyc(clock, (u64)usecs * 1000); +} + /** * getnstimeofday - Returns the time of day in a timespec * @ts: pointer to the timespec to be set Index: linux/lib/Kconfig.debug =================================================================== --- linux.orig/lib/Kconfig.debug +++ linux/lib/Kconfig.debug @@ -342,6 +342,192 @@ config STACKTRACE depends on DEBUG_KERNEL depends on STACKTRACE_SUPPORT +config PREEMPT_TRACE + bool + default y + depends on DEBUG_PREEMPT + +config EVENT_TRACE + bool "Kernel event tracing" + default y + depends on GENERIC_TIME + select FRAME_POINTER + select STACKTRACE + help + This option enables a kernel tracing mechanism that will track + certain kernel events such as system call entry and return, + IRQ entry, context-switching, etc. + + Run the scripts/trace-it utility on a kernel with this option + enabled for sample output. + +config FUNCTION_TRACE + bool "Kernel function call tracing" + default n + depends on !REORDER + select EVENT_TRACE + help + This option enables a kernel tracing mechanism that will track + precise function-call granularity kernel execution. Sample + output: + + pcscd-1772 0D..2 6867us : deactivate_task (-2 1) + pcscd-1772 0D..2 6867us : dequeue_task (deactivate_task) + -0 0D..2 6870us : __switch_to (__schedule) + -0 0D..2 6871us : __schedule (-2 20) + -0 0D..2 6871us : __lock_acquire (lock_acquire) + -0 0D..2 6872us : __spin_unlock_irq (__schedule) + + Run the scripts/trace-it sample utility on a kernel with this + option enabled to capture 1 second worth of events. + + (Note that kernel size and overhead increases noticeably + with this option enabled.) + +config WAKEUP_TIMING + bool "Wakeup latency timing" + depends on GENERIC_TIME + help + This option measures the time spent from a highprio thread being + woken up to it getting scheduled on a CPU, with microsecond + accuracy. + + The default measurement method is a maximum search, which is + disabled by default and can be runtime (re-)started via: + + echo 0 > /proc/sys/kernel/preempt_max_latency + +config LATENCY_TRACE + bool "Latency tracing" + default n + depends on LATENCY_TIMING && !REORDER && GENERIC_TIME + select FRAME_POINTER + select FUNCTION_TRACE + help + When this option is enabled then the last maximum latency timing + event's full trace can be found in /proc/latency_trace, in a + human-readable (or rather as some would say, in a + kernel-developer-readable) form. + + (Note that kernel size and overhead increases noticeably + with this option enabled.) + +config CRITICAL_PREEMPT_TIMING + bool "Non-preemptible critical section latency timing" + default n + depends on PREEMPT + depends on GENERIC_TIME + help + This option measures the time spent in preempt-off critical + sections, with microsecond accuracy. + + The default measurement method is a maximum search, which is + disabled by default and can be runtime (re-)started via: + + echo 0 > /proc/sys/kernel/preempt_max_latency + + (Note that kernel size and overhead increases with this option + enabled. This option and the irqs-off timing option can be + used together or separately.) + +config CRITICAL_IRQSOFF_TIMING + bool "Interrupts-off critical section latency timing" + default n + depends on GENERIC_TIME + select TRACE_IRQFLAGS + help + This option measures the time spent in irqs-off critical + sections, with microsecond accuracy. + + The default measurement method is a maximum search, which is + disabled by default and can be runtime (re-)started via: + + echo 0 > /proc/sys/kernel/preempt_max_latency + + (Note that kernel size and overhead increases with this option + enabled. This option and the preempt-off timing option can be + used together or separately.) + +config WAKEUP_LATENCY_HIST + bool "wakeup latency histogram" + default n + depends on WAKEUP_TIMING + help + This option logs all the wakeup latency timing to a big histogram + bucket, in the meanwhile, it also dummies up printk produced by + wakeup latency timing. + + The wakeup latency timing histogram can be viewed via: + + cat /proc/latency_hist/wakeup_latency/CPU* + + (Note: * presents CPU ID.) + +config PREEMPT_OFF_HIST + bool "non-preemptible critical section latency histogram" + default n + depends on CRITICAL_PREEMPT_TIMING + help + This option logs all the non-preemptible critical section latency + timing to a big histogram bucket, in the meanwhile, it also + dummies up printk produced by non-preemptible critical section + latency timing. + + The non-preemptible critical section latency timing histogram can + be viewed via: + + cat /proc/latency_hist/preempt_off_latency/CPU* + + (Note: * presents CPU ID.) + +config INTERRUPT_OFF_HIST + bool "interrupts-off critical section latency histogram" + default n + depends on CRITICAL_IRQSOFF_TIMING + help + This option logs all the interrupts-off critical section latency + timing to a big histogram bucket, in the meanwhile, it also + dummies up printk produced by interrupts-off critical section + latency timing. + + The interrupts-off critical section latency timing histogram can + be viewed via: + + cat /proc/latency_hist/interrupt_off_latency/CPU* + + (Note: * presents CPU ID.) + +config CRITICAL_TIMING + bool + default y + depends on CRITICAL_PREEMPT_TIMING || CRITICAL_IRQSOFF_TIMING + +config DEBUG_TRACE_IRQFLAGS + bool + default y + depends on CRITICAL_IRQSOFF_TIMING + +config LATENCY_TIMING + bool + default y + depends on WAKEUP_TIMING || CRITICAL_TIMING + select SYSCTL + +config CRITICAL_LATENCY_HIST + bool + default y + depends on PREEMPT_OFF_HIST || INTERRUPT_OFF_HIST + +config LATENCY_HIST + bool + default y + depends on WAKEUP_LATENCY_HIST || CRITICAL_LATENCY_HIST + +config MCOUNT + bool + depends on FUNCTION_TRACE + default y + config DEBUG_KOBJECT bool "kobject debugging" depends on DEBUG_KERNEL Index: linux/lib/debug_locks.c =================================================================== --- linux.orig/lib/debug_locks.c +++ linux/lib/debug_locks.c @@ -10,6 +10,7 @@ */ #include #include +#include #include #include #include @@ -36,7 +37,14 @@ int debug_locks_silent; int debug_locks_off(void) { if (xchg(&debug_locks, 0)) { +#ifdef CONFIG_DEBUG_RT_MUTEXES + if (spin_is_locked(¤t->pi_lock)) + spin_unlock(¤t->pi_lock); +#endif if (!debug_locks_silent) { + stop_trace(); + user_trace_stop(); + printk("stopped custom tracer.\n"); console_verbose(); return 1; } Index: linux/lib/spinlock_debug.c =================================================================== --- linux.orig/lib/spinlock_debug.c +++ linux/lib/spinlock_debug.c @@ -148,7 +148,7 @@ int _raw_spin_trylock(spinlock_t *lock) return ret; } -void _raw_spin_unlock(spinlock_t *lock) +void notrace _raw_spin_unlock(spinlock_t *lock) { debug_spin_unlock(lock); __raw_spin_unlock(&lock->raw_lock); Index: linux/scripts/Makefile =================================================================== --- linux.orig/scripts/Makefile +++ linux/scripts/Makefile @@ -7,6 +7,7 @@ # conmakehash: Create chartable # conmakehash: Create arrays for initializing the kernel console tables +hostprogs-$(CONFIG_EVENT_TRACE) += trace-it hostprogs-$(CONFIG_KALLSYMS) += kallsyms hostprogs-$(CONFIG_LOGO) += pnmtologo hostprogs-$(CONFIG_VT) += conmakehash Index: linux/scripts/trace-it.c =================================================================== --- /dev/null +++ linux/scripts/trace-it.c @@ -0,0 +1,79 @@ + +/* + * Copyright (C) 2005, Ingo Molnar + * + * user-triggered tracing. + * + * The -rt kernel has a built-in kernel tracer, which will trace + * all kernel function calls (and a couple of special events as well), + * by using a build-time gcc feature that instruments all kernel + * functions. + * + * The tracer is highly automated for a number of latency tracing purposes, + * but it can also be switched into 'user-triggered' mode, which is a + * half-automatic tracing mode where userspace apps start and stop the + * tracer. This file shows a dumb example how to turn user-triggered + * tracing on, and how to start/stop tracing. Note that if you do + * multiple start/stop sequences, the kernel will do a maximum search + * over their latencies, and will keep the trace of the largest latency + * in /proc/latency_trace. The maximums are also reported to the kernel + * log. (but can also be read from /proc/sys/kernel/preempt_max_latency) + * + * For the tracer to be activated, turn on CONFIG_EVENT_TRACING + * in the .config, rebuild the kernel and boot into it. The trace will + * get _alot_ more verbose if you also turn on CONFIG_FUNCTION_TRACING, + * every kernel function call will be put into the trace. Note that + * CONFIG_FUNCTION_TRACING has significant runtime overhead, so you dont + * want to use it for performance testing :) + */ + +#include +#include +#include +#include +#include +#include +#include + +int main (int argc, char **argv) +{ + int ret; + + if (getuid() != 0) { + fprintf(stderr, "needs to run as root.\n"); + exit(1); + } + ret = system("cat /proc/sys/kernel/mcount_enabled >/dev/null 2>/dev/null"); + if (ret) { + fprintf(stderr, "CONFIG_LATENCY_TRACING not enabled?\n"); + exit(1); + } + system("echo 1 > /proc/sys/kernel/trace_user_triggered"); + system("[ -e /proc/sys/kernel/wakeup_timing ] && echo 0 > /proc/sys/kernel/wakeup_timing"); + system("echo 1 > /proc/sys/kernel/trace_enabled"); + system("echo 1 > /proc/sys/kernel/mcount_enabled"); + system("echo 0 > /proc/sys/kernel/trace_freerunning"); + system("echo 0 > /proc/sys/kernel/trace_print_on_crash"); + system("echo 0 > /proc/sys/kernel/trace_verbose"); + system("echo 0 > /proc/sys/kernel/preempt_thresh 2>/dev/null"); + system("echo 0 > /proc/sys/kernel/preempt_max_latency 2>/dev/null"); + + // start tracing + if (prctl(0, 1)) { + fprintf(stderr, "trace-it: couldnt start tracing!\n"); + return 1; + } + usleep(10000000); + if (prctl(0, 0)) { + fprintf(stderr, "trace-it: couldnt stop tracing!\n"); + return 1; + } + + system("echo 0 > /proc/sys/kernel/trace_user_triggered"); + system("echo 0 > /proc/sys/kernel/trace_enabled"); + system("cat /proc/latency_trace"); + + return 0; +} + +