Makefile | 10 arch/arm/boot/compressed/head.S | 13 arch/arm/kernel/calls.S | 6 arch/arm/kernel/entry-common.S | 111 + arch/arm/kernel/fiq.c | 4 arch/arm/kernel/irq.c | 4 arch/arm/kernel/traps.c | 2 arch/arm/mm/copypage-v4mc.c | 4 arch/arm/mm/copypage-xscale.c | 4 arch/arm/mm/fault.c | 14 arch/i386/Kconfig | 2 arch/i386/boot/compressed/misc.c | 8 arch/i386/kernel/Makefile | 1 arch/i386/kernel/entry.S | 41 arch/i386/kernel/irq.c | 8 arch/i386/kernel/mcount-wrapper.S | 27 arch/ppc/boot/Makefile | 9 arch/ppc/kernel/entry.S | 82 + arch/x86_64/kernel/entry.S | 48 arch/x86_64/kernel/vsyscall.c | 2 fs/proc/proc_misc.c | 17 include/asm-arm/pgalloc.h | 4 include/asm-arm/timex.h | 10 include/asm-arm/unistd.h | 3 include/asm-i386/system.h | 8 include/linux/interrupt.h | 10 include/linux/latency_hist.h | 32 include/linux/linkage.h | 2 include/linux/preempt.h | 20 include/linux/sched.h | 86 + kernel/Makefile | 3 kernel/latency.c | 2390 ++++++++++++++++++++++++++++++++++++++ kernel/latency_hist.c | 267 ++++ kernel/sched.c | 103 + kernel/sysctl.c | 77 + kernel/time.c | 19 lib/Kconfig.debug | 149 ++ 37 files changed, 3530 insertions(+), 70 deletions(-) Index: linux-latency-tracing.q/Makefile =================================================================== --- linux-latency-tracing.q.orig/Makefile +++ linux-latency-tracing.q/Makefile @@ -519,10 +519,14 @@ CFLAGS += $(call add-align,CONFIG_CC_AL CFLAGS += $(call add-align,CONFIG_CC_ALIGN_LOOPS,-loops) CFLAGS += $(call add-align,CONFIG_CC_ALIGN_JUMPS,-jumps) -ifdef CONFIG_FRAME_POINTER -CFLAGS += -fno-omit-frame-pointer $(call cc-option,-fno-optimize-sibling-calls,) +ifdef CONFIG_MCOUNT +CFLAGS += -pg -fno-omit-frame-pointer $(call cc-option,-fno-optimize-sibling-calls,) else -CFLAGS += -fomit-frame-pointer + ifdef CONFIG_FRAME_POINTER + CFLAGS += -fno-omit-frame-pointer $(call cc-option,-fno-optimize-sibling-calls,) + else + CFLAGS += -fomit-frame-pointer + endif endif ifdef CONFIG_DEBUG_INFO Index: linux-latency-tracing.q/arch/arm/boot/compressed/head.S =================================================================== --- linux-latency-tracing.q.orig/arch/arm/boot/compressed/head.S +++ linux-latency-tracing.q/arch/arm/boot/compressed/head.S @@ -710,6 +710,19 @@ memdump: mov r12, r0 mov pc, r10 #endif +#ifdef CONFIG_MCOUNT +/* CONFIG_MCOUNT causes boot header to be built with -pg requiring this + * trampoline + */ + .text + .align 0 + .type mcount %function + .global mcount +mcount: + mov pc, lr @ just return +#endif + + reloc_end: .align Index: linux-latency-tracing.q/arch/arm/kernel/calls.S =================================================================== --- linux-latency-tracing.q.orig/arch/arm/kernel/calls.S +++ linux-latency-tracing.q/arch/arm/kernel/calls.S @@ -7,11 +7,8 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. * - * This file is included twice in entry-common.S + * NR_syscalls now defined in include/asm-arm/unistd.h - tglx */ -#ifndef NR_syscalls -#define NR_syscalls 328 -#else __syscall_start: /* 0 */ .long sys_restart_syscall @@ -341,4 +338,3 @@ __syscall_end: .rept NR_syscalls - (__syscall_end - __syscall_start) / 4 .long sys_ni_syscall .endr -#endif Index: linux-latency-tracing.q/arch/arm/kernel/entry-common.S =================================================================== --- linux-latency-tracing.q.orig/arch/arm/kernel/entry-common.S +++ linux-latency-tracing.q/arch/arm/kernel/entry-common.S @@ -3,6 +3,8 @@ * * Copyright (C) 2000 Russell King * + * LATENCY_TRACE/mcount support (C) 2005 Timesys john.cooper@timesys.com + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. @@ -87,8 +89,6 @@ ENTRY(ret_from_fork) b ret_slow_syscall -#include "calls.S" - /*============================================================================= * SWI handler *----------------------------------------------------------------------------- @@ -271,3 +271,110 @@ sys_mmap2: str r5, [sp, #4] b do_mmap2 #endif + +#ifdef CONFIG_FRAME_POINTER + +#ifdef CONFIG_MCOUNT +/* + * At the point where we are in mcount() we maintain the + * frame of the prologue code and keep the call to mcount() + * out of the stack frame list: + + saved pc <---\ caller of instrumented routine + saved lr | + ip/prev_sp | + fp -----^ | + : | + | + -> saved pc | instrumented routine + | saved lr | + | ip/prev_sp | + | fp ---------/ + | : + | + | mcount + | saved pc + | saved lr + | ip/prev sp + -- fp + r3 + r2 + r1 + sp-> r0 + : + */ + + .text + .align 0 + .type mcount %function + .global mcount + +/* gcc -pg generated FUNCTION_PROLOGUE references mcount() + * and has already created the stack frame invocation for + * the routine we have been called to instrument. We create + * a complete frame nevertheless, as we want to use the same + * call to mcount() from c code. + */ +mcount: + + ldr ip, =mcount_enabled @ leave early, if disabled + ldr ip, [ip] + cmp ip, #0 + moveq pc,lr + + mov ip, sp + stmdb sp!, {r0 - r3, fp, ip, lr, pc} @ create stack frame + + ldr r1, [fp, #-4] @ get lr (the return address + @ of the caller of the + @ instrumented function) + mov r0, lr @ get lr - (the return address + @ of the instrumented function) + + sub fp, ip, #4 @ point fp at this frame + + bl __trace +1: + ldmdb fp, {r0 - r3, fp, sp, pc} @ pop entry frame and return + +#endif + +/* ARM replacement for unsupported gcc __builtin_return_address(n) + * where 0 < n. n == 0 is supported here as well. + * + * Walk up the stack frame until the desired frame is found or a NULL + * fp is encountered, return NULL in the latter case. + * + * Note: it is possible under code optimization for the stack invocation + * of an ancestor function (level N) to be removed before calling a + * descendant function (level N+1). No easy means is available to deduce + * this scenario with the result being [for example] caller_addr(0) when + * called from level N+1 returning level N-1 rather than the expected + * level N. This optimization issue appears isolated to the case of + * a call to a level N+1 routine made at the tail end of a level N + * routine -- the level N frame is deleted and a simple branch is made + * to the level N+1 routine. + */ + + .text + .align 0 + .type arm_return_addr %function + .global arm_return_addr + +arm_return_addr: + mov ip, r0 + mov r0, fp +3: + cmp r0, #0 + beq 1f @ frame list hit end, bail + cmp ip, #0 + beq 2f @ reached desired frame + ldr r0, [r0, #-12] @ else continue, get next fp + sub ip, ip, #1 + b 3b +2: + ldr r0, [r0, #-4] @ get target return address +1: + mov pc, lr + +#endif Index: linux-latency-tracing.q/arch/arm/kernel/fiq.c =================================================================== --- linux-latency-tracing.q.orig/arch/arm/kernel/fiq.c +++ linux-latency-tracing.q/arch/arm/kernel/fiq.c @@ -88,7 +88,7 @@ void set_fiq_handler(void *start, unsign * disable irqs for the duration. Note - these functions are almost * entirely coded in assembly. */ -void __attribute__((naked)) set_fiq_regs(struct pt_regs *regs) +void notrace __attribute__((naked)) set_fiq_regs(struct pt_regs *regs) { register unsigned long tmp; asm volatile ( @@ -106,7 +106,7 @@ void __attribute__((naked)) set_fiq_regs : "r" (®s->ARM_r8), "I" (PSR_I_BIT | PSR_F_BIT | FIQ_MODE)); } -void __attribute__((naked)) get_fiq_regs(struct pt_regs *regs) +void notrace __attribute__((naked)) get_fiq_regs(struct pt_regs *regs) { register unsigned long tmp; asm volatile ( Index: linux-latency-tracing.q/arch/arm/kernel/irq.c =================================================================== --- linux-latency-tracing.q.orig/arch/arm/kernel/irq.c +++ linux-latency-tracing.q/arch/arm/kernel/irq.c @@ -533,10 +533,12 @@ static void do_pending_irqs(struct pt_re * come via this function. Instead, they should provide their * own 'handler' */ -asmlinkage void asm_do_IRQ(unsigned int irq, struct pt_regs *regs) +asmlinkage notrace void asm_do_IRQ(unsigned int irq, struct pt_regs *regs) { struct irqdesc *desc = irq_desc + irq; + trace_special(instruction_pointer(regs), irq, 0); + /* * Some hardware gives randomly wrong interrupts. Rather * than crashing, do something sensible. Index: linux-latency-tracing.q/arch/arm/kernel/traps.c =================================================================== --- linux-latency-tracing.q.orig/arch/arm/kernel/traps.c +++ linux-latency-tracing.q/arch/arm/kernel/traps.c @@ -177,6 +177,8 @@ void dump_stack(void) { #ifdef CONFIG_DEBUG_ERRORS __backtrace(); + print_traces(current); + show_held_locks(current); #endif } Index: linux-latency-tracing.q/arch/arm/mm/copypage-v4mc.c =================================================================== --- linux-latency-tracing.q.orig/arch/arm/mm/copypage-v4mc.c +++ linux-latency-tracing.q/arch/arm/mm/copypage-v4mc.c @@ -43,7 +43,7 @@ static DEFINE_SPINLOCK(minicache_lock); * instruction. If your processor does not supply this, you have to write your * own copy_user_page that does the right thing. */ -static void __attribute__((naked)) +static void notrace __attribute__((naked)) mc_copy_user_page(void *from, void *to) { asm volatile( @@ -82,7 +82,7 @@ void v4_mc_copy_user_page(void *kto, con /* * ARMv4 optimised clear_user_page */ -void __attribute__((naked)) +void notrace __attribute__((naked)) v4_mc_clear_user_page(void *kaddr, unsigned long vaddr) { asm volatile( Index: linux-latency-tracing.q/arch/arm/mm/copypage-xscale.c =================================================================== --- linux-latency-tracing.q.orig/arch/arm/mm/copypage-xscale.c +++ linux-latency-tracing.q/arch/arm/mm/copypage-xscale.c @@ -41,7 +41,7 @@ static DEFINE_SPINLOCK(minicache_lock); * Dcache aliasing issue. The writes will be forwarded to the write buffer, * and merged as appropriate. */ -static void __attribute__((naked)) +static void notrace __attribute__((naked)) mc_copy_user_page(void *from, void *to) { /* @@ -104,7 +104,7 @@ void xscale_mc_copy_user_page(void *kto, /* * XScale optimised clear_user_page */ -void __attribute__((naked)) +void notrace __attribute__((naked)) xscale_mc_clear_user_page(void *kaddr, unsigned long vaddr) { asm volatile( Index: linux-latency-tracing.q/arch/arm/mm/fault.c =================================================================== --- linux-latency-tracing.q.orig/arch/arm/mm/fault.c +++ linux-latency-tracing.q/arch/arm/mm/fault.c @@ -216,7 +216,7 @@ out: return fault; } -static int +static notrace int do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { struct task_struct *tsk; @@ -316,7 +316,7 @@ no_context: * interrupt or a critical region, and should only copy the information * from the master page table, nothing more. */ -static int +static notrace int do_translation_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { @@ -362,7 +362,7 @@ bad_area: * Some section permission faults need to be handled gracefully. * They can happen due to a __{get,put}_user during an oops. */ -static int +static notrace int do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { struct task_struct *tsk = current; @@ -373,7 +373,7 @@ do_sect_fault(unsigned long addr, unsign /* * This abort handler always returns "fault". */ -static int +static notrace int do_bad(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { return 1; @@ -428,7 +428,7 @@ static struct fsr_info { { do_bad, SIGBUS, 0, "unknown 31" } }; -void __init +void __init notrace hook_fault_code(int nr, int (*fn)(unsigned long, unsigned int, struct pt_regs *), int sig, const char *name) { @@ -442,7 +442,7 @@ hook_fault_code(int nr, int (*fn)(unsign /* * Dispatch a data abort to the relevant handler. */ -asmlinkage void +asmlinkage notrace void do_DataAbort(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { const struct fsr_info *inf = fsr_info + (fsr & 15) + ((fsr & (1 << 10)) >> 6); @@ -461,7 +461,7 @@ do_DataAbort(unsigned long addr, unsigne notify_die("", regs, &info, fsr, 0); } -asmlinkage void +asmlinkage notrace void do_PrefetchAbort(unsigned long addr, struct pt_regs *regs) { do_translation_fault(addr, 0, regs); Index: linux-latency-tracing.q/arch/i386/Kconfig =================================================================== --- linux-latency-tracing.q.orig/arch/i386/Kconfig +++ linux-latency-tracing.q/arch/i386/Kconfig @@ -619,7 +619,7 @@ config BOOT_IOREMAP config REGPARM bool "Use register arguments (EXPERIMENTAL)" - depends on EXPERIMENTAL + depends on EXPERIMENTAL && !MCOUNT default n help Compile the kernel with -mregparm=3. This uses a different ABI Index: linux-latency-tracing.q/arch/i386/boot/compressed/misc.c =================================================================== --- linux-latency-tracing.q.orig/arch/i386/boot/compressed/misc.c +++ linux-latency-tracing.q/arch/i386/boot/compressed/misc.c @@ -15,6 +15,12 @@ #include #include +#ifdef CONFIG_MCOUNT +void notrace mcount(void) +{ +} +#endif + /* * gzip declarations */ @@ -112,7 +118,7 @@ static long free_mem_end_ptr; #define INPLACE_MOVE_ROUTINE 0x1000 #define LOW_BUFFER_START 0x2000 #define LOW_BUFFER_MAX 0x90000 -#define HEAP_SIZE 0x3000 +#define HEAP_SIZE 0x4000 static unsigned int low_buffer_end, low_buffer_size; static int high_loaded =0; static uch *high_buffer_start /* = (uch *)(((ulg)&end) + HEAP_SIZE)*/; Index: linux-latency-tracing.q/arch/i386/kernel/Makefile =================================================================== --- linux-latency-tracing.q.orig/arch/i386/kernel/Makefile +++ linux-latency-tracing.q/arch/i386/kernel/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_MICROCODE) += microcode.o obj-$(CONFIG_APM) += apm.o obj-$(CONFIG_X86_SMP) += smp.o smpboot.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o +obj-$(CONFIG_MCOUNT) += mcount-wrapper.o obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o Index: linux-latency-tracing.q/arch/i386/kernel/entry.S =================================================================== --- linux-latency-tracing.q.orig/arch/i386/kernel/entry.S +++ linux-latency-tracing.q/arch/i386/kernel/entry.S @@ -200,6 +200,11 @@ sysenter_past_esp: pushl %eax SAVE_ALL +#ifdef CONFIG_LATENCY_TRACE + pushl %edx; pushl %ecx; pushl %ebx; pushl %eax + call sys_call + popl %eax; popl %ebx; popl %ecx; popl %edx +#endif GET_THREAD_INFO(%ebp) /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ @@ -213,6 +218,11 @@ sysenter_past_esp: movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx jne syscall_exit_work +#ifdef CONFIG_LATENCY_TRACE + pushl %eax + call sys_ret + popl %eax +#endif /* if something modifies registers it must also disable sysexit */ movl EIP(%esp), %edx movl OLDESP(%esp), %ecx @@ -225,6 +235,11 @@ sysenter_past_esp: ENTRY(system_call) pushl %eax # save orig_eax SAVE_ALL +#ifdef CONFIG_LATENCY_TRACE + pushl %edx; pushl %ecx; pushl %ebx; pushl %eax + call sys_call + popl %eax; popl %ebx; popl %ecx; popl %edx +#endif GET_THREAD_INFO(%ebp) # system call tracing in operation / emulation /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ @@ -254,6 +269,17 @@ restore_all: cmpl $((4 << 8) | 3), %eax je ldt_ss # returning to user-space with LDT SS restore_nocheck: +#if defined(CONFIG_CRITICAL_IRQSOFF_TIMING) || defined(CONFIG_LATENCY_TRACE) + pushl %eax +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING + call trace_irqs_on +#endif +#ifdef CONFIG_LATENCY_TRACE + call sys_ret +#endif + popl %eax +#endif +restore_nocheck_nmi: RESTORE_REGS addl $4, %esp 1: iret @@ -351,6 +377,11 @@ syscall_trace_entry: syscall_exit_work: testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl jz work_pending +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING + pushl %eax + call trace_irqs_on + popl %eax +#endif sti # could let do_syscall_trace() call # schedule() instead movl %esp, %eax @@ -412,9 +443,16 @@ ENTRY(irq_entries_start) vector=vector+1 .endr +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING +# define TRACE_IRQS_OFF call trace_irqs_off_lowlevel; +#else +# define TRACE_IRQS_OFF +#endif + ALIGN common_interrupt: SAVE_ALL + TRACE_IRQS_OFF movl %esp,%eax call do_IRQ jmp ret_from_intr @@ -423,6 +461,7 @@ common_interrupt: ENTRY(name) \ pushl $nr-256; \ SAVE_ALL \ + TRACE_IRQS_OFF \ movl %esp,%eax; \ call smp_/**/name; \ jmp ret_from_intr; @@ -552,7 +591,7 @@ nmi_stack_correct: xorl %edx,%edx # zero error code movl %esp,%eax # pt_regs pointer call do_nmi - jmp restore_all + jmp restore_nocheck_nmi nmi_stack_fixup: FIX_STACK(12,nmi_stack_correct, 1) Index: linux-latency-tracing.q/arch/i386/kernel/irq.c =================================================================== --- linux-latency-tracing.q.orig/arch/i386/kernel/irq.c +++ linux-latency-tracing.q/arch/i386/kernel/irq.c @@ -51,7 +51,7 @@ static union irq_ctx *softirq_ctx[NR_CPU * SMP cross-CPU interrupts have their own specific * handlers). */ -fastcall unsigned int do_IRQ(struct pt_regs *regs) +fastcall notrace unsigned int do_IRQ(struct pt_regs *regs) { /* high bits used in ret_from_ code */ int irq = regs->orig_eax & 0xff; @@ -59,8 +59,12 @@ fastcall unsigned int do_IRQ(struct pt_r union irq_ctx *curctx, *irqctx; u32 *isp; #endif - irq_enter(); +#ifdef CONFIG_LATENCY_TRACE + if (irq == trace_user_trigger_irq) + user_trace_start(); +#endif + trace_special(regs->eip, irq, 0); #ifdef CONFIG_DEBUG_STACKOVERFLOW /* Debugging check for stack overflow: is there less than 1KB free? */ { Index: linux-latency-tracing.q/arch/i386/kernel/mcount-wrapper.S =================================================================== --- /dev/null +++ linux-latency-tracing.q/arch/i386/kernel/mcount-wrapper.S @@ -0,0 +1,27 @@ +/* + * linux/arch/i386/mcount-wrapper.S + * + * Copyright (C) 2004 Ingo Molnar + */ + +.globl mcount +mcount: + + cmpl $0, mcount_enabled + jz out + + push %ebp + mov %esp, %ebp + pushl %eax + pushl %ecx + pushl %edx + + call __mcount + + popl %edx + popl %ecx + popl %eax + popl %ebp +out: + ret + Index: linux-latency-tracing.q/arch/ppc/boot/Makefile =================================================================== --- linux-latency-tracing.q.orig/arch/ppc/boot/Makefile +++ linux-latency-tracing.q/arch/ppc/boot/Makefile @@ -11,6 +11,15 @@ # CFLAGS += -fno-builtin -D__BOOTER__ -Iarch/$(ARCH)/boot/include + +ifdef CONFIG_MCOUNT +# do not trace the boot loader +nullstring := +space := $(nullstring) # end of the line +pg_flag = $(nullstring) -pg # end of the line +CFLAGS := $(subst ${pg_flag},${space},${CFLAGS}) +endif + HOSTCFLAGS += -Iarch/$(ARCH)/boot/include BOOT_TARGETS = zImage zImage.initrd znetboot znetboot.initrd Index: linux-latency-tracing.q/arch/ppc/kernel/entry.S =================================================================== --- linux-latency-tracing.q.orig/arch/ppc/kernel/entry.S +++ linux-latency-tracing.q/arch/ppc/kernel/entry.S @@ -1000,3 +1000,85 @@ machine_check_in_rtas: /* XXX load up BATs and panic */ #endif /* CONFIG_PPC_OF */ + +#ifdef CONFIG_MCOUNT + +/* + * mcount() is not the same as _mcount(). The callers of mcount() have a + * normal context. The callers of _mcount() do not have a stack frame and + * have not saved the "caller saves" registers. + */ +_GLOBAL(mcount) + stwu r1,-16(r1) + mflr r3 + lis r5,mcount_enabled@ha + lwz r5,mcount_enabled@l(r5) + stw r3,20(r1) + cmpwi r5,0 + beq 1f + /* r3 contains lr (eip), put parent lr (parent_eip) in r4 */ + lwz r4,16(r1) + lwz r4,4(r4) + bl __trace +1: + lwz r0,20(r1) + mtlr r0 + addi r1,r1,16 + blr + +/* + * The -pg flag, which is specified in the case of CONFIG_MCOUNT, causes the + * C compiler to add a call to _mcount() at the start of each function preamble, + * before the stack frame is created. An example of this preamble code is: + * + * mflr r0 + * lis r12,-16354 + * stw r0,4(r1) + * addi r0,r12,-19652 + * bl 0xc00034c8 <_mcount> + * mflr r0 + * stwu r1,-16(r1) + */ +_GLOBAL(_mcount) +#define M_STK_SIZE 48 + /* Would not expect to need to save cr, but glibc version of */ + /* _mcount() does, so cautiously saving it here too. */ + stwu r1,-M_STK_SIZE(r1) + stw r3, 12(r1) + stw r4, 16(r1) + stw r5, 20(r1) + stw r6, 24(r1) + mflr r3 /* will use as first arg to __trace() */ + mfcr r4 + lis r5,mcount_enabled@ha + lwz r5,mcount_enabled@l(r5) + cmpwi r5,0 + stw r3, 44(r1) /* lr */ + stw r4, 8(r1) /* cr */ + stw r7, 28(r1) + stw r8, 32(r1) + stw r9, 36(r1) + stw r10,40(r1) + beq 1f + /* r3 contains lr (eip), put parent lr (parent_eip) in r4 */ + lwz r4,M_STK_SIZE+4(r1) + bl __trace +1: + lwz r8, 8(r1) /* cr */ + lwz r9, 44(r1) /* lr */ + lwz r3, 12(r1) + lwz r4, 16(r1) + lwz r5, 20(r1) + mtcrf 0xff,r8 + mtctr r9 + lwz r0, 52(r1) + lwz r6, 24(r1) + lwz r7, 28(r1) + lwz r8, 32(r1) + lwz r9, 36(r1) + lwz r10,40(r1) + addi r1,r1,M_STK_SIZE + mtlr r0 + bctr + +#endif /* CONFIG_MCOUNT */ Index: linux-latency-tracing.q/arch/x86_64/kernel/entry.S =================================================================== --- linux-latency-tracing.q.orig/arch/x86_64/kernel/entry.S +++ linux-latency-tracing.q/arch/x86_64/kernel/entry.S @@ -48,6 +48,15 @@ #define retint_kernel retint_restore_args #endif +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING +# define CALL_TRACE_IRQS_ON \ + push %rbp; \ + mov %rsp, %rbp; \ + call trace_irqs_on; \ + leaveq +#else +# define CALL_TRACE_IRQS_ON +#endif /* * C code is not supposed to know about undefined top of stack. Every time * a C function with an pt_regs argument is called from the SYSCALL based @@ -593,6 +602,7 @@ retint_kernel: bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */ jnc retint_restore_args call preempt_schedule_irq + CALL_TRACE_IRQS_ON jmp exit_intr #endif CFI_ENDPROC @@ -1041,3 +1051,41 @@ ENTRY(call_softirq) CFI_ADJUST_CFA_OFFSET -8 ret CFI_ENDPROC + +#ifdef CONFIG_LATENCY_TRACE + +ENTRY(mcount) + cmpq $0, trace_enabled + jz out + + push %rbp + mov %rsp,%rbp + + push %r9 + push %r8 + push %rdi + push %rsi + push %rdx + push %rcx + push %rax + + mov 0x0(%rbp),%rax + mov 0x8(%rbp),%rdi + mov 0x8(%rax),%rsi + + call __trace + + pop %rax + pop %rcx + pop %rdx + pop %rsi + pop %rdi + pop %r8 + pop %r9 + + leaveq +out: + ret + +#endif + Index: linux-latency-tracing.q/arch/x86_64/kernel/vsyscall.c =================================================================== --- linux-latency-tracing.q.orig/arch/x86_64/kernel/vsyscall.c +++ linux-latency-tracing.q/arch/x86_64/kernel/vsyscall.c @@ -34,7 +34,7 @@ #include #include -#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) +#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) notrace #define force_inline __attribute__((always_inline)) inline int __sysctl_vsyscall __section_sysctl_vsyscall = 1; Index: linux-latency-tracing.q/fs/proc/proc_misc.c =================================================================== --- linux-latency-tracing.q.orig/fs/proc/proc_misc.c +++ linux-latency-tracing.q/fs/proc/proc_misc.c @@ -531,6 +531,20 @@ static int execdomains_read_proc(char *p return proc_calc_metrics(page, start, off, count, eof, len); } +#ifdef CONFIG_LATENCY_TRACE +extern struct seq_operations latency_trace_op; +static int latency_trace_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &latency_trace_op); +} +static struct file_operations proc_latency_trace_operations = { + .open = latency_trace_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; +#endif + #ifdef CONFIG_MAGIC_SYSRQ /* * writing 'C' to /proc/sysrq-trigger is like sysrq-C @@ -611,6 +625,9 @@ void __init proc_misc_init(void) #ifdef CONFIG_SCHEDSTATS create_seq_entry("schedstat", 0, &proc_schedstat_operations); #endif +#ifdef CONFIG_LATENCY_TRACE + create_seq_entry("latency_trace", 0, &proc_latency_trace_operations); +#endif #ifdef CONFIG_PROC_KCORE proc_root_kcore = create_proc_entry("kcore", S_IRUSR, NULL); if (proc_root_kcore) { Index: linux-latency-tracing.q/include/asm-arm/pgalloc.h =================================================================== --- linux-latency-tracing.q.orig/include/asm-arm/pgalloc.h +++ linux-latency-tracing.q/include/asm-arm/pgalloc.h @@ -102,7 +102,7 @@ static inline void __pmd_populate(pmd_t * * Ensure that we always set both PMD entries. */ -static inline void +static inline void notrace pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp, pte_t *ptep) { unsigned long pte_ptr = (unsigned long)ptep; @@ -115,7 +115,7 @@ pmd_populate_kernel(struct mm_struct *mm __pmd_populate(pmdp, __pa(pte_ptr) | _PAGE_KERNEL_TABLE); } -static inline void +static inline void notrace pmd_populate(struct mm_struct *mm, pmd_t *pmdp, struct page *ptep) { __pmd_populate(pmdp, page_to_pfn(ptep) << PAGE_SHIFT | _PAGE_USER_TABLE); Index: linux-latency-tracing.q/include/asm-arm/timex.h =================================================================== --- linux-latency-tracing.q.orig/include/asm-arm/timex.h +++ linux-latency-tracing.q/include/asm-arm/timex.h @@ -16,9 +16,17 @@ typedef unsigned long cycles_t; +#ifndef mach_read_cycles + #define mach_read_cycles() (0) +#ifdef CONFIG_LATENCY_TIMING + #define mach_cycles_to_usecs(d) (d) + #define mach_usecs_to_cycles(d) (d) +#endif +#endif + static inline cycles_t get_cycles (void) { - return 0; + return mach_read_cycles(); } #endif Index: linux-latency-tracing.q/include/asm-arm/unistd.h =================================================================== --- linux-latency-tracing.q.orig/include/asm-arm/unistd.h +++ linux-latency-tracing.q/include/asm-arm/unistd.h @@ -526,6 +526,9 @@ type name(type1 arg1, type2 arg2, type3 #define __ARCH_WANT_SYS_SIGPENDING #define __ARCH_WANT_SYS_SIGPROCMASK #define __ARCH_WANT_SYS_RT_SIGACTION + +#define NR_syscalls 328 + #endif #ifdef __KERNEL_SYSCALLS__ Index: linux-latency-tracing.q/include/asm-i386/system.h =================================================================== --- linux-latency-tracing.q.orig/include/asm-i386/system.h +++ linux-latency-tracing.q/include/asm-i386/system.h @@ -531,13 +531,17 @@ struct alt_instr { /* used when interrupts are already enabled or to shutdown the processor */ #define halt() __asm__ __volatile__("hlt": : :"memory") +#define irqs_disabled_flags(flags) \ +({ \ + !(flags & (1<<9)); \ +}) + #define irqs_disabled() \ ({ \ unsigned long flags; \ local_save_flags(flags); \ - !(flags & (1<<9)); \ + irqs_disabled_flags(flags); \ }) - /* For spinlocks etc */ #define local_irq_save(x) __asm__ __volatile__("pushfl ; popl %0 ; cli":"=g" (x): /* no input */ :"memory") Index: linux-latency-tracing.q/include/linux/interrupt.h =================================================================== --- linux-latency-tracing.q.orig/include/linux/interrupt.h +++ linux-latency-tracing.q/include/linux/interrupt.h @@ -292,4 +292,14 @@ extern int probe_irq_off(unsigned long); extern unsigned int probe_irq_mask(unsigned long); /* returns mask of ISA interrupts */ #endif +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING + extern void notrace trace_irqs_off_lowlevel(void); + extern void notrace trace_irqs_off(void); + extern void notrace trace_irqs_on(void); +#else +# define trace_irqs_off_lowlevel() do { } while (0) +# define trace_irqs_off() do { } while (0) +# define trace_irqs_on() do { } while (0) +#endif + #endif Index: linux-latency-tracing.q/include/linux/latency_hist.h =================================================================== --- /dev/null +++ linux-latency-tracing.q/include/linux/latency_hist.h @@ -0,0 +1,32 @@ +/* + * kernel/latency_hist.h + * + * Add support for histograms of preemption-off latency and + * interrupt-off latency and wakeup latency, it depends on + * Real-Time Preemption Support. + * + * Copyright (C) 2005 MontaVista Software, Inc. + * Yi Yang + * + */ +#ifndef _LINUX_LATENCY_HIST_H_ +#define _LINUX_LATENCY_HIST_H_ + +enum { + INTERRUPT_LATENCY = 0, + PREEMPT_LATENCY, + WAKEUP_LATENCY +}; + +#define MAX_ENTRY_NUM 10240 +#define LATENCY_TYPE_NUM 3 + +#ifdef CONFIG_LATENCY_HIST +extern void latency_hist(int latency_type, int cpu, unsigned long latency); +# define latency_hist_flag 1 +#else +# define latency_hist(a,b,c) do { (void)(cpu); } while (0) +# define latency_hist_flag 0 +#endif /* CONFIG_LATENCY_HIST */ + +#endif /* ifndef _LINUX_LATENCY_HIST_H_ */ Index: linux-latency-tracing.q/include/linux/linkage.h =================================================================== --- linux-latency-tracing.q.orig/include/linux/linkage.h +++ linux-latency-tracing.q/include/linux/linkage.h @@ -4,6 +4,8 @@ #include #include +#define notrace __attribute ((no_instrument_function)) + #ifdef __cplusplus #define CPP_ASMLINKAGE extern "C" #else Index: linux-latency-tracing.q/include/linux/preempt.h =================================================================== --- linux-latency-tracing.q.orig/include/linux/preempt.h +++ linux-latency-tracing.q/include/linux/preempt.h @@ -10,12 +10,26 @@ #include #include -#ifdef CONFIG_DEBUG_PREEMPT - extern void fastcall add_preempt_count(int val); - extern void fastcall sub_preempt_count(int val); +#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_CRITICAL_TIMING) + extern void notrace add_preempt_count(unsigned int val); + extern void notrace sub_preempt_count(unsigned int val); + extern void notrace mask_preempt_count(unsigned int mask); + extern void notrace unmask_preempt_count(unsigned int mask); #else # define add_preempt_count(val) do { preempt_count() += (val); } while (0) # define sub_preempt_count(val) do { preempt_count() -= (val); } while (0) +# define mask_preempt_count(mask) \ + do { preempt_count() |= (mask); } while (0) +# define unmask_preempt_count(mask) \ + do { preempt_count() &= ~(mask); } while (0) +#endif + +#ifdef CONFIG_CRITICAL_TIMING + extern void touch_critical_timing(void); + extern void stop_critical_timing(void); +#else +# define touch_critical_timing() do { } while (0) +# define stop_critical_timing() do { } while (0) #endif #define inc_preempt_count() add_preempt_count(1) Index: linux-latency-tracing.q/include/linux/sched.h =================================================================== --- linux-latency-tracing.q.orig/include/linux/sched.h +++ linux-latency-tracing.q/include/linux/sched.h @@ -39,6 +39,84 @@ struct exec_domain; +#if defined(CONFIG_PREEMPT_TRACE) || defined(CONFIG_LATENCY_TRACE) + extern void print_traces(struct task_struct *task); +#else +# define print_traces(task) do { } while (0) +#endif + +#ifdef CONFIG_FRAME_POINTER +# ifndef CONFIG_ARM +# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) +# define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1)) +# define CALLER_ADDR2 ((unsigned long)__builtin_return_address(2)) +# define CALLER_ADDR3 ((unsigned long)__builtin_return_address(3)) +# else + extern unsigned long arm_return_addr(int level); +# define CALLER_ADDR0 arm_return_addr(0) +# define CALLER_ADDR1 arm_return_addr(1) +# define CALLER_ADDR2 arm_return_addr(2) +# define CALLER_ADDR3 arm_return_addr(3) +#endif +#else +# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) +# define CALLER_ADDR1 0UL +# define CALLER_ADDR2 0UL +# define CALLER_ADDR3 0UL +#endif + +#ifdef CONFIG_MCOUNT + extern void notrace mcount(void); +#else +# define mcount() do { } while (0) +#endif + +#ifdef CONFIG_LATENCY_TRACE + extern int mcount_enabled, trace_enabled, trace_user_triggered, + trace_user_trigger_irq, trace_freerunning, trace_verbose, + trace_print_at_crash, trace_all_cpus; + extern void notrace trace_special(unsigned long v1, unsigned long v2, unsigned long v3); + extern void notrace trace_special_pid(int pid, unsigned long v1, unsigned long v2); + extern void notrace trace_special_u64(unsigned long long v1, unsigned long v2); + extern void stop_trace(void); + extern void print_last_trace(void); + extern void nmi_trace(unsigned long eip, unsigned long parent_eip, + unsigned long flags); + extern long user_trace_start(void); + extern long user_trace_stop(void); + extern void trace_cmdline(void); +#else +# define mcount_enabled 0 +# define trace_enabled 0 +# define trace_user_triggered 0 +# define trace_freerunning 0 +# define trace_all_cpus 0 +# define trace_verbose 0 +# define trace_special(v1,v2,v3) do { } while (0) +# define trace_special_pid(pid,v1,v2) do { } while (0) +# define trace_special_u64(v1,v2) do { } while (0) +# define stop_trace() do { } while (0) +# define print_last_trace() do { } while (0) +# define nmi_trace(eip, parent_eip, flags) do { } while (0) +# define user_trace_start() do { } while (0) +# define user_trace_stop() do { } while (0) +# define trace_cmdline() do { } while (0) +#endif + +extern int timeofday_API_hacks(void *tv, void *tz); + +#ifdef CONFIG_WAKEUP_TIMING + extern int wakeup_timing; + extern void __trace_start_sched_wakeup(struct task_struct *p); + extern void trace_stop_sched_switched(struct task_struct *p); + extern void trace_change_sched_cpu(struct task_struct *p, int new_cpu); +#else +# define wakeup_timing 0 +# define __trace_start_sched_wakeup(p) do { } while (0) +# define trace_stop_sched_switched(p) do { } while (0) +# define trace_change_sched_cpu(p, cpu) do { } while (0) +#endif + /* * cloning flags: */ @@ -820,6 +898,13 @@ struct task_struct { /* Protection of proc_dentry: nesting proc_lock, dcache_lock, write_lock_irq(&tasklist_lock); */ spinlock_t proc_lock; +#define MAX_PREEMPT_TRACE 16 + +#ifdef CONFIG_PREEMPT_TRACE + unsigned long preempt_trace_eip[MAX_PREEMPT_TRACE]; + unsigned long preempt_trace_parent_eip[MAX_PREEMPT_TRACE]; +#endif + /* journalling filesystem info */ void *journal_info; @@ -1349,6 +1434,7 @@ static inline unsigned int task_cpu(cons static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) { + trace_change_sched_cpu(p, cpu); task_thread_info(p)->cpu = cpu; } Index: linux-latency-tracing.q/kernel/Makefile =================================================================== --- linux-latency-tracing.q.orig/kernel/Makefile +++ linux-latency-tracing.q/kernel/Makefile @@ -9,6 +9,9 @@ obj-y = sched.o fork.o exec_domain.o rcupdate.o intermodule.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o +obj-$(CONFIG_DEBUG_PREEMPT) += latency.o +obj-$(CONFIG_LATENCY_TIMING) += latency.o +obj-$(CONFIG_LATENCY_HIST) += latency_hist.o obj-$(CONFIG_FUTEX) += futex.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += cpu.o spinlock.o Index: linux-latency-tracing.q/kernel/latency.c =================================================================== --- /dev/null +++ linux-latency-tracing.q/kernel/latency.c @@ -0,0 +1,2390 @@ +/* + * kernel/latency.c + * + * Copyright (C) 2004, 2005 Ingo Molnar + * Copyright (C) 2004 William Lee Irwin III + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef CONFIG_PREEMPT_RT +# define DEFINE_RAW_SPINLOCK DEFINE_SPINLOCK +# define raw_spinlock_t spinlock_t +# define raw_irqs_disabled irqs_disabled +# define irqs_off irqs_disabled +# define raw_irqs_disabled_flags irqs_disabled_flags +# define __raw_local_irq_save local_irq_save +# define __raw_local_irq_restore local_irq_restore +# define raw_local_irq_save local_irq_save +# define raw_local_save_flags local_save_flags +# define raw_local_irq_restore local_irq_restore +#endif + +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING +# ifdef CONFIG_CRITICAL_PREEMPT_TIMING +# define irqs_off_preempt_count() preempt_count() +# else +# define irqs_off_preempt_count() 0 +# endif +#endif + +#ifdef CONFIG_WAKEUP_TIMING +struct sch_struct { + raw_spinlock_t trace_lock; + struct task_struct *task; + int cpu; + struct cpu_trace *tr; +} ____cacheline_aligned_in_smp; + +static __cacheline_aligned_in_smp struct sch_struct sch = +#ifdef CONFIG_PREEMPT_RT + { trace_lock: RAW_SPIN_LOCK_UNLOCKED }; +#else + { trace_lock: SPIN_LOCK_UNLOCKED }; +#endif + +int wakeup_timing = 1; +#endif + +#ifdef CONFIG_LATENCY_TIMING + +/* + * Maximum preemption latency measured. Initialize to maximum, + * we clear it after bootup. + */ +#ifdef CONFIG_LATENCY_HIST +cycles_t preempt_max_latency = (cycles_t)0UL; +#else +cycles_t preempt_max_latency = (cycles_t)ULONG_MAX; +#endif + +static cycles_t preempt_thresh; + +/* + * Should this new latency be reported/recorded? + */ +static int report_latency(cycles_t delta) +{ + if (latency_hist_flag && !trace_user_triggered) + return 1; + + if (preempt_thresh) { + if (delta < preempt_thresh) + return 0; + } else { + if (delta <= preempt_max_latency) + return 0; + } + return 1; +} + +/* + * Track maximum latencies and save the trace: + */ + +/* + * trace_stop_sched_switched must not be called with runqueue locks held! + */ +static __cacheline_aligned_in_smp DECLARE_MUTEX(max_mutex); + +/* + * Sequence count - we record it when starting a measurement and + * skip the latency if the sequence has changed - some other section + * did a maximum and could disturb our measurement with serial console + * printouts, etc. Truly coinciding maximum latencies should be rare + * and what happens together happens separately as well, so this doesnt + * decrease the validity of the maximum found: + */ +static __cacheline_aligned_in_smp int max_sequence; + +enum trace_type +{ + __TRACE_FIRST_TYPE = 0, + + TRACE_FN, + TRACE_SPECIAL, + TRACE_SPECIAL_PID, + TRACE_SPECIAL_U64, + TRACE_CMDLINE, + TRACE_SYSCALL, + TRACE_SYSRET, + + __TRACE_LAST_TYPE +}; + +enum trace_flag_type +{ + TRACE_FLAG_IRQS_OFF = 0x01, + TRACE_FLAG_NEED_RESCHED = 0x02, + TRACE_FLAG_HARDIRQ = 0x04, + TRACE_FLAG_SOFTIRQ = 0x08, + TRACE_FLAG_IRQS_HARD_OFF = 0x10, +}; + + +#ifdef CONFIG_LATENCY_TRACE + +/* + * On DEBUG_PAGEALLOC && SMP there's not too much lowmem, so reduce + * the # of trace entries, or else we OOM on bootup. Same applies for + * ARM where we have only 4MB boot window for kernel text+data+bss. + * + * The large buffer allocates 8MB memory, which might also be more + * than the available memory on a small embedded box. This needs more + * thought for embedded devices and should be initialized at runtime + * under consideration of the available memory resources. + */ +#if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_SMP) && !defined(CONFIG_ARM) +# define MAX_TRACE (unsigned long)(8192*16-1) +#else +# define MAX_TRACE (unsigned long)(8192*2-1) +#endif + +#define CMDLINE_BYTES 16 + +/* + * 32 bytes on 32-bit platforms: + */ +struct trace_entry { + char type; + char cpu; + char flags; + char preempt_count; // assumes PREEMPT_MASK is 8 bits or less + int pid; + cycles_t timestamp; + union { + struct { + unsigned long eip; + unsigned long parent_eip; + } fn; + struct { + unsigned long eip; + unsigned long v1, v2, v3; + } special; + struct { + unsigned char str[CMDLINE_BYTES]; + } cmdline; + struct { + unsigned int nr; + unsigned long p1, p2, p3; + } syscall; + struct { + unsigned int ret; + } sysret; + struct { + int __pad3[4]; + } pad; + } u; +} __attribute__((packed)); + +#endif + +struct cpu_trace { + atomic_t disabled; + unsigned long trace_idx; + cycles_t preempt_timestamp; + unsigned long critical_start, critical_end; + int critical_sequence; + atomic_t overrun; + int early_warning; + int latency_type; + int cpu; + +#ifdef CONFIG_LATENCY_TRACE + struct trace_entry trace[MAX_TRACE]; + char comm[CMDLINE_BYTES]; + pid_t pid; + unsigned long uid; + unsigned long nice; + unsigned long policy; + unsigned long rt_priority; + unsigned long saved_latency; +#endif +#ifdef CONFIG_DEBUG_STACKOVERFLOW + unsigned long stack_check; +#endif +} ____cacheline_aligned_in_smp; + +static struct cpu_trace cpu_traces[NR_CPUS] ____cacheline_aligned_in_smp = +{ [0 ... NR_CPUS-1] = { +#ifdef CONFIG_DEBUG_STACKOVERFLOW + .stack_check = 1 +#endif + } }; + +static unsigned long notrace cycles_to_usecs(cycles_t delta) +{ +#ifdef CONFIG_X86 + do_div(delta, cpu_khz/1000+1); +#elif defined(CONFIG_PPC) + delta = mulhwu(tb_to_us, delta); +#elif defined(CONFIG_ARM) + delta = mach_cycles_to_usecs(delta); +#else + #error Implement cycles_to_usecs. +#endif + + return (unsigned long) delta; +} + +static cycles_t notrace usecs_to_cycles(unsigned long delta) +{ +#if defined(CONFIG_X86) || defined(CONFIG_PPC) + return (cycles_t) delta * (cycles_t) (cpu_khz/1000+1); +#elif defined(CONFIG_ARM) + return mach_usecs_to_cycles(delta); +#else + #error Implement usecs_to_cycles +#endif +} + +#ifdef CONFIG_LATENCY_TRACE + +int trace_enabled = 1; +int mcount_enabled = 1; +int trace_freerunning = 0; +int trace_print_at_crash = 0; +int trace_verbose = 0; +int trace_all_cpus = 0; + +/* + * user-triggered via gettimeofday(0,1)/gettimeofday(0,0) + */ +int trace_user_triggered = 0; +int trace_user_trigger_irq = -1; + +struct saved_trace_struct { + int cpu; + cycles_t first_timestamp, last_timestamp; + struct cpu_trace traces[NR_CPUS]; +} ____cacheline_aligned_in_smp; + +/* + * The current worst-case trace: + */ +static struct saved_trace_struct max_tr; + +/* + * /proc/latency_trace atomicity: + */ +static DECLARE_MUTEX(out_mutex); + +static struct saved_trace_struct out_tr; + +static void notrace printk_name(unsigned long eip) +{ + char namebuf[KSYM_NAME_LEN+1]; + unsigned long size, offset; + const char *sym_name; + char *modname; + + sym_name = kallsyms_lookup(eip, &size, &offset, &modname, namebuf); + if (sym_name) + printk("%s+%#lx/%#lx", sym_name, offset, size); + else + printk("<%08lx>", eip); +} + +#ifdef CONFIG_DEBUG_STACKOVERFLOW + +#define MIN_STACK_NEEDED (sizeof(struct thread_info) + STACK_WARN) +#define MAX_STACK (THREAD_SIZE - sizeof(struct thread_info)) + +#if (defined(__i386__) || defined(__x86_64__)) && defined(CONFIG_FRAME_POINTER) +# define PRINT_EXACT_STACKFRAME +#endif + +#ifdef PRINT_EXACT_STACKFRAME +static unsigned long *worst_stack_bp; +#endif +static DEFINE_RAW_SPINLOCK(worst_stack_lock); +unsigned long worst_stack_left = THREAD_SIZE; +static unsigned long worst_stack_printed = THREAD_SIZE; +static char worst_stack_comm[TASK_COMM_LEN+1]; +static int worst_stack_pid; +static unsigned long worst_stack_sp; +static char worst_stack[THREAD_SIZE]; + +static notrace void fill_worst_stack(unsigned long stack_left) +{ + unsigned long flags; + + /* + * On x64, we must not read the PDA during early bootup: + */ +#ifdef CONFIG_X86_64 + if (system_state == SYSTEM_BOOTING) + return; +#endif + spin_lock_irqsave(&worst_stack_lock, flags); + if (likely(stack_left < worst_stack_left)) { + worst_stack_left = stack_left; + memcpy(worst_stack, current_thread_info(), THREAD_SIZE); + worst_stack_sp = (unsigned long)&stack_left; + memcpy(worst_stack_comm, current->comm, TASK_COMM_LEN); + worst_stack_pid = current->pid; +#ifdef PRINT_EXACT_STACKFRAME +# ifdef __i386__ + asm ("mov %%ebp, %0\n" :"=g"(worst_stack_bp)); +# elif defined(__x86_64__) + asm ("mov %%rbp, %0\n" :"=g"(worst_stack_bp)); +# else +# error Poke the author of above asm code lines ! +# endif +#endif + } + spin_unlock_irqrestore(&worst_stack_lock, flags); +} + +#ifdef PRINT_EXACT_STACKFRAME + +/* + * This takes a BP offset to point the BP back into the saved stack, + * the original stack might be long gone (but the stackframe within + * the saved copy still contains references to it). + */ +#define CONVERT_TO_SAVED_STACK(bp) \ + ((void *)worst_stack + ((unsigned long)bp & (THREAD_SIZE-1))) + +static void show_stackframe(void) +{ + unsigned long addr, frame_size, *bp, *prev_bp, sum = 0; + + bp = CONVERT_TO_SAVED_STACK(worst_stack_bp); + + while (bp[0]) { + addr = bp[1]; + if (!kernel_text_address(addr)) + break; + + prev_bp = bp; + bp = CONVERT_TO_SAVED_STACK((unsigned long *)bp[0]); + + frame_size = (bp - prev_bp) * sizeof(long); + + if (frame_size < THREAD_SIZE) { + printk("{ %4ld} ", frame_size); + sum += frame_size; + } else + printk("{=%4ld} ", sum); + + printk("[<%08lx>] ", addr); + printk_name(addr); + printk("\n"); + } +} + +#else + +static inline int valid_stack_ptr(void *p) +{ + return p > (void *)worst_stack && + p < (void *)worst_stack + THREAD_SIZE - 3; +} + +static void show_stackframe(void) +{ + unsigned long prev_frame, addr; + unsigned long *stack; + + prev_frame = (unsigned long)(worst_stack + + (worst_stack_sp & (THREAD_SIZE-1))); + stack = (unsigned long *)prev_frame; + + while (valid_stack_ptr(stack)) { + addr = *stack++; + if (__kernel_text_address(addr)) { + printk("(%4ld) ", (unsigned long)stack - prev_frame); + printk("[<%08lx>] ", addr); + print_symbol("%s\n", addr); + prev_frame = (unsigned long)stack; + } + if ((char *)stack >= worst_stack + THREAD_SIZE) + break; + } +} + +#endif + +static notrace void __print_worst_stack(void) +{ + printk("----------------------------->\n"); + printk("| new stack-footprint maximum: %s/%d, %ld bytes (out of %ld bytes).\n", + worst_stack_comm, worst_stack_pid, + MAX_STACK-worst_stack_left, (long)MAX_STACK); + printk("------------|\n"); + + show_stackframe(); + printk("<---------------------------\n\n"); +} + +static notrace void print_worst_stack(void) +{ + unsigned long flags; + + if (raw_irqs_disabled()) + return; + + spin_lock_irqsave(&worst_stack_lock, flags); + if (worst_stack_printed == worst_stack_left) { + spin_unlock_irqrestore(&worst_stack_lock, flags); + return; + } + worst_stack_printed = worst_stack_left; + spin_unlock_irqrestore(&worst_stack_lock, flags); + + __print_worst_stack(); +} + +static notrace void debug_stackoverflow(struct cpu_trace *tr) +{ + long stack_left; + + if (unlikely(tr->stack_check <= 0)) + return; + atomic_inc(&tr->disabled); + + /* Debugging check for stack overflow: is there less than 1KB free? */ +#ifdef __i386__ + __asm__ __volatile__("and %%esp,%0" : + "=r" (stack_left) : "0" (THREAD_SIZE - 1)); +#elif defined(__x86_64__) + __asm__ __volatile__("and %%rsp,%0" : + "=r" (stack_left) : "0" (THREAD_SIZE - 1)); +#else +# error Poke the author of above asm code lines ! +#endif + if (unlikely(stack_left < MIN_STACK_NEEDED)) { + tr->stack_check = 0; + printk(KERN_ALERT "BUG: stack overflow: only %ld bytes left! [%08lx...(%08lx-%08lx)]\n", + stack_left - sizeof(struct thread_info), + (long)&stack_left, + (long)current_thread_info(), + (long)current_thread_info() + THREAD_SIZE); + fill_worst_stack(stack_left); + __print_worst_stack(); + goto out; + } + if (unlikely(stack_left < worst_stack_left)) { + tr->stack_check--; + fill_worst_stack(stack_left); + print_worst_stack(); + tr->stack_check++; + } else + if (worst_stack_printed != worst_stack_left) { + tr->stack_check--; + print_worst_stack(); + tr->stack_check++; + } +out: + atomic_dec(&tr->disabled); +} + +#endif + +static void notrace +____trace(int cpu, enum trace_type type, struct cpu_trace *tr, + unsigned long eip, unsigned long parent_eip, + unsigned long v1, unsigned long v2, unsigned long v3, + unsigned long flags) +{ + struct trace_entry *entry; + unsigned long idx, idx_next; + cycles_t timestamp; + u32 pc; + +#ifdef CONFIG_DEBUG_PREEMPT +// WARN_ON(!atomic_read(&tr->disabled)); +#endif + if (!tr->critical_start && !trace_user_triggered && !trace_all_cpus && !trace_print_at_crash) + goto out; + /* + * Allocate the next index. Make sure an NMI (or interrupt) + * has not taken it away. Potentially redo the timestamp as + * well to make sure the trace timestamps are in chronologic + * order. + */ +again: + idx = tr->trace_idx; + idx_next = idx + 1; + timestamp = get_cycles(); + + if (unlikely(trace_freerunning && (idx_next >= MAX_TRACE))) + idx_next = 0; + if (unlikely(idx_next >= MAX_TRACE)) { + atomic_inc(&tr->overrun); + goto out; + } +#ifdef __HAVE_ARCH_CMPXCHG + if (unlikely(cmpxchg(&tr->trace_idx, idx, idx_next) != idx)) + goto again; +#else +# ifdef CONFIG_SMP +# error CMPXHG missing +# else + /* No worry, we are protected by the atomic_incr(&tr->disabled) + * in __trace further down + */ + tr->trace_idx = idx_next; +# endif +#endif + pc = preempt_count(); + + entry = tr->trace + idx; + entry->type = type; +#ifdef CONFIG_SMP + entry->cpu = cpu; +#endif + entry->flags = (irqs_off() ? TRACE_FLAG_IRQS_OFF : 0) | + (raw_irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_HARD_OFF : 0)| + ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | + ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | + (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); + entry->preempt_count = pc & 0xff; + entry->pid = current->pid; + entry->timestamp = timestamp; + + switch (type) { + case TRACE_FN: + entry->u.fn.eip = eip; + entry->u.fn.parent_eip = parent_eip; + break; + case TRACE_SPECIAL: + case TRACE_SPECIAL_PID: + case TRACE_SPECIAL_U64: + entry->u.special.eip = eip; + entry->u.special.v1 = v1; + entry->u.special.v2 = v2; + entry->u.special.v3 = v3; + break; + case TRACE_SYSCALL: + entry->u.syscall.nr = eip; + entry->u.syscall.p1 = v1; + entry->u.syscall.p2 = v2; + entry->u.syscall.p3 = v3; + break; + case TRACE_SYSRET: + entry->u.sysret.ret = eip; + break; + case TRACE_CMDLINE: + memcpy(entry->u.cmdline.str, current->comm, CMDLINE_BYTES); + break; + default: + break; + } +out: + ; +} + +static inline void notrace +___trace(enum trace_type type, unsigned long eip, unsigned long parent_eip, + unsigned long v1, unsigned long v2, + unsigned long v3) +{ + struct cpu_trace *tr; + unsigned long flags; + int cpu; + + if (unlikely(trace_enabled <= 0)) + return; + +#if defined(CONFIG_DEBUG_STACKOVERFLOW) && defined(CONFIG_X86) + debug_stackoverflow(cpu_traces + raw_smp_processor_id()); +#endif + + __raw_local_irq_save(flags); + cpu = raw_smp_processor_id(); + /* + * Trace on the CPU where the current highest-prio task + * is waiting to become runnable: + */ +#ifdef CONFIG_WAKEUP_TIMING + if (wakeup_timing && !trace_all_cpus && !trace_print_at_crash) { + if (!sch.tr || cpu != sch.cpu) + goto out; + tr = sch.tr; + } else + tr = cpu_traces + cpu; +#else + tr = cpu_traces + cpu; +#endif + atomic_inc(&tr->disabled); + if (likely(atomic_read(&tr->disabled) == 1)) { +//#define DEBUG_STACK_POISON +#ifdef DEBUG_STACK_POISON + char stack; + + memset(&stack - 128, 0x34, 128); +#endif + ____trace(cpu, type, tr, eip, parent_eip, v1, v2, v3, flags); + } + atomic_dec(&tr->disabled); +#ifdef CONFIG_WAKEUP_TIMING +out: +#endif + __raw_local_irq_restore(flags); +} + +/* + * Special, ad-hoc tracepoints: + */ +void notrace trace_special(unsigned long v1, unsigned long v2, unsigned long v3) +{ + ___trace(TRACE_SPECIAL, CALLER_ADDR0, 0, v1, v2, v3); +} + +EXPORT_SYMBOL(trace_special); + +void notrace trace_special_pid(int pid, unsigned long v1, unsigned long v2) +{ + ___trace(TRACE_SPECIAL_PID, CALLER_ADDR0, 0, pid, v1, v2); +} + +EXPORT_SYMBOL(trace_special_pid); + +void notrace trace_special_u64(unsigned long long v1, unsigned long v2) +{ + ___trace(TRACE_SPECIAL_U64, CALLER_ADDR0, 0, + (unsigned long) (v1 >> 32), (unsigned long) (v1 & 0xFFFFFFFF), v2); +} + +EXPORT_SYMBOL(trace_special_u64); + +/* + * Non-inlined function: + */ +void notrace __trace(unsigned long eip, unsigned long parent_eip) +{ + ___trace(TRACE_FN, eip, parent_eip, 0, 0, 0); +} + +extern void mcount(void); + +EXPORT_SYMBOL(mcount); + +void notrace __mcount(void) +{ + ___trace(TRACE_FN, CALLER_ADDR1, CALLER_ADDR2, 0, 0, 0); +} + +void notrace +sys_call(int nr, unsigned long p1, unsigned long p2, unsigned long p3) +{ + ___trace(TRACE_SYSCALL, nr, 0, p1, p2, p3); +} + +void notrace sys_ret(int ret) +{ + ___trace(TRACE_SYSRET, ret, 0, 0, 0, 0); +} + +static void notrace print_name(struct seq_file *m, unsigned long eip) +{ + char namebuf[KSYM_NAME_LEN+1]; + unsigned long size, offset; + const char *sym_name; + char *modname; + + /* + * Special trace values: + */ + if (((long)eip < 10000L) && ((long)eip > -10000L)) { + seq_printf(m, "(%ld)", eip); + return; + } + sym_name = kallsyms_lookup(eip, &size, &offset, &modname, namebuf); + if (sym_name) + seq_puts(m, sym_name); + else + seq_printf(m, "<%08lx>", eip); +} + +static void notrace print_name_offset(struct seq_file *m, unsigned long eip) +{ + char namebuf[KSYM_NAME_LEN+1]; + unsigned long size, offset; + const char *sym_name; + char *modname; + + sym_name = kallsyms_lookup(eip, &size, &offset, &modname, namebuf); + if (sym_name) + seq_printf(m, "%s+%#lx/%#lx <%08lx>", + sym_name, offset, size, eip); + else + seq_printf(m, "<%08lx>", eip); +} + +static unsigned int out_sequence = -1; +static int pid_to_cmdline_array[PID_MAX_DEFAULT+1]; + +static void notrace _trace_cmdline(int cpu, struct cpu_trace *tr) +{ + unsigned long flags; + + raw_local_save_flags(flags); + ____trace(cpu, TRACE_CMDLINE, tr, 0, 0, 0, 0, 0, flags); +} + +void notrace trace_cmdline(void) +{ + ___trace(TRACE_CMDLINE, 0, 0, 0, 0, 0); +} + +static void construct_pid_to_cmdline(void) +{ + struct cpu_trace *tr = out_tr.traces; + unsigned int i, j, entries, pid; + + if (tr->critical_sequence == out_sequence) + return; + out_sequence = tr->critical_sequence; + + memset(pid_to_cmdline_array, -1, sizeof(int) * (PID_MAX_DEFAULT + 1)); + + entries = min(tr->trace_idx, MAX_TRACE-1); + + for (i = 0; i < entries; i++) { + struct trace_entry *entry = tr->trace + i; + + if (entry->type != TRACE_CMDLINE) + continue; + pid = entry->pid; + if (pid < PID_MAX_DEFAULT) { + pid_to_cmdline_array[pid] = i; + /* + * Replace space with underline - makes it easier + * to process for tools: + */ + for (j = 0; j < CMDLINE_BYTES; j++) + if (entry->u.cmdline.str[j] == ' ') + entry->u.cmdline.str[j] = '_'; + } + } +} + +char *pid_to_cmdline(unsigned long pid) +{ + struct cpu_trace *tr = out_tr.traces; + char *cmdline = "<...>"; + int idx; + + pid = min(pid, (unsigned long)PID_MAX_DEFAULT); + if (!pid) + return ""; + + if (pid_to_cmdline_array[pid] != -1) { + idx = pid_to_cmdline_array[pid]; + if (tr->trace[idx].type == TRACE_CMDLINE) + cmdline = tr->trace[idx].u.cmdline.str; + } + return cmdline; +} + +struct block_idx { + int idx[NR_CPUS]; +}; + +/* + * return the trace entry (position) of the smallest-timestamp + * one (that is still in the valid idx range): + */ +static int min_idx(struct block_idx *bidx) +{ + cycles_t min_stamp = (cycles_t) -1; + struct trace_entry *entry; + int cpu, min_cpu = -1, idx; + + for_each_online_cpu(cpu) { + idx = bidx->idx[cpu]; + if (idx >= min(max_tr.traces[cpu].trace_idx, MAX_TRACE-1)) + continue; + if (idx >= MAX_TRACE*NR_CPUS) { + printk("huh: idx (%d) > %ld*%d!\n", idx, MAX_TRACE, NR_CPUS); + WARN_ON(1); + break; + } + entry = max_tr.traces[cpu].trace + bidx->idx[cpu]; + if (entry->timestamp < min_stamp) { + min_cpu = cpu; + min_stamp = entry->timestamp; + } + } + + return min_cpu; +} + +/* + * This code is called to construct an output trace from + * the maximum trace. Having separate traces serves both + * atomicity (a new max might be saved while we are busy + * accessing /proc/latency_trace) and it is also used to + * delay the (expensive) sorting of the output trace by + * timestamps, in the trace_all_cpus case. + */ +static void update_out_trace(void) +{ + int cpu, sum, entries, overrun_sum; + struct cpu_trace *tmp_max, *tmp_out; + struct trace_entry *out_entry, *entry; + struct block_idx bidx = { { 0, }, }; + cycles_t stamp, first_stamp, last_stamp; + + /* + * Nasty trick. We might overflow the first array but + * there are NR_CPUS of them so we use it as a 'big' + * trace buffer. + */ + tmp_out = out_tr.traces + 0; + *tmp_out = max_tr.traces[max_tr.cpu]; + out_tr.cpu = max_tr.cpu; + out_entry = tmp_out->trace + 0; + + if (!trace_all_cpus) { + entries = min(tmp_out->trace_idx, MAX_TRACE-1); + if (!entries) + return; + out_tr.first_timestamp = tmp_out->trace[0].timestamp; + out_tr.last_timestamp = tmp_out->trace[entries-1].timestamp; + return; + } + /* + * Find the range of timestamps that are fully traced in + * all CPU traces. (since CPU traces can cover a variable + * range of time, we have to find the best range.) + */ + first_stamp = 0; + for_each_online_cpu(cpu) { + tmp_max = max_tr.traces + cpu; + stamp = tmp_max->trace[0].timestamp; + if (stamp > first_stamp) + first_stamp = stamp; + } + /* + * Save the timestamp range: + */ + tmp_max = max_tr.traces + max_tr.cpu; + entries = min(tmp_max->trace_idx, MAX_TRACE-1); + /* + * No saved trace yet? + */ + if (!entries) { + out_tr.traces[0].trace_idx = 0; + return; + } + + last_stamp = tmp_max->trace[entries-1].timestamp; + + if (last_stamp < first_stamp) { + WARN_ON(1); + + for_each_online_cpu(cpu) { + tmp_max = max_tr.traces + cpu; + entries = min(tmp_max->trace_idx, MAX_TRACE-1); + printk("CPU%d: %016Lx (%016Lx) ... #%d (%016Lx) %016Lx\n", cpu, + tmp_max->trace[0].timestamp, + tmp_max->trace[1].timestamp, + entries, + tmp_max->trace[entries-2].timestamp, + tmp_max->trace[entries-1].timestamp); + } + tmp_max = max_tr.traces + max_tr.cpu; + entries = min(tmp_max->trace_idx, MAX_TRACE-1); + + printk("CPU%d entries: %d\n", max_tr.cpu, entries); + printk("first stamp: %016Lx\n", first_stamp); + printk(" last stamp: %016Lx\n", first_stamp); + } + +#if 0 + printk("first_stamp: %Ld [%016Lx]\n", first_stamp, first_stamp); + printk(" last_stamp: %Ld [%016Lx]\n", last_stamp, last_stamp); + printk(" +1 stamp: %Ld [%016Lx]\n", + tmp_max->trace[entries].timestamp, + tmp_max->trace[entries].timestamp); + printk(" +2 stamp: %Ld [%016Lx]\n", + tmp_max->trace[entries+1].timestamp, + tmp_max->trace[entries+1].timestamp); + printk(" delta: %Ld\n", last_stamp-first_stamp); + printk(" entries: %d\n", entries); +#endif + + out_tr.first_timestamp = first_stamp; + out_tr.last_timestamp = last_stamp; + + /* + * Fetch trace entries one by one, in increasing timestamp + * order. Start at first_stamp, stop at last_stamp: + */ + sum = 0; + for (;;) { + cpu = min_idx(&bidx); + if (cpu == -1) + break; + entry = max_tr.traces[cpu].trace + bidx.idx[cpu]; + if (entry->timestamp > last_stamp) + break; + + bidx.idx[cpu]++; + if (entry->timestamp < first_stamp) + continue; + *out_entry = *entry; + out_entry++; + sum++; + if (sum >= MAX_TRACE*NR_CPUS) { + printk("huh: sum (%d) > %ld*%d!\n", sum, MAX_TRACE, NR_CPUS); + WARN_ON(1); + break; + } + } + + sum = 0; + overrun_sum = 0; + for_each_online_cpu(cpu) { + sum += max_tr.traces[cpu].trace_idx; + overrun_sum += atomic_read(&max_tr.traces[cpu].overrun); + } + tmp_out->trace_idx = sum; + atomic_set(&tmp_out->overrun, overrun_sum); +} + +static void notrace print_help_header(struct seq_file *m) +{ + seq_puts(m, " _------=> CPU# \n"); + seq_puts(m, " / _-----=> irqs-off \n"); + seq_puts(m, " | / _----=> need-resched \n"); + seq_puts(m, " || / _---=> hardirq/softirq \n"); + seq_puts(m, " ||| / _--=> preempt-depth \n"); + seq_puts(m, " |||| / \n"); + seq_puts(m, " ||||| delay \n"); + seq_puts(m, " cmd pid ||||| time | caller \n"); + seq_puts(m, " \\ / ||||| \\ | / \n"); +} + +static void * notrace l_start(struct seq_file *m, loff_t *pos) +{ + loff_t n = *pos; + unsigned long entries; + struct cpu_trace *tr; + + down(&out_mutex); + /* + * if the file is being read newly, update the output trace: + */ + if (!n) { + // TODO: use the sequence counter here to optimize + down(&max_mutex); + update_out_trace(); + up(&max_mutex); + if (!out_tr.traces[0].trace_idx) { + up(&out_mutex); + return NULL; + } + construct_pid_to_cmdline(); + } + tr = out_tr.traces; + entries = min(tr->trace_idx, MAX_TRACE-1); + + if (!n) { + seq_printf(m, "preemption latency trace v1.1.5 on %s\n", UTS_RELEASE); + seq_puts(m, "--------------------------------------------------------------------\n"); + seq_printf(m, " latency: %lu us, #%lu/%lu, CPU#%d | (M:%s VP:%d, KP:%d, SP:%d HP:%d", + cycles_to_usecs(tr->saved_latency), + entries, entries + atomic_read(&tr->overrun), + out_tr.cpu, +#if defined(CONFIG_PREEMPT_NONE) + "server", +#elif defined(CONFIG_PREEMPT_VOLUNTARY) + "desktop", +#elif defined(CONFIG_PREEMPT_DESKTOP) + "preempt", +#else + "rt", +#endif + 0, 0, + 0, 0); +#ifdef CONFIG_SMP + seq_printf(m, " #P:%d)\n", num_online_cpus()); +#else + seq_puts(m, ")\n"); +#endif + seq_puts(m, " -----------------\n"); + seq_printf(m, " | task: %.16s-%d (uid:%ld nice:%ld policy:%ld rt_prio:%ld)\n", + tr->comm, tr->pid, tr->uid, tr->nice, + tr->policy, tr->rt_priority); + seq_puts(m, " -----------------\n"); + if (trace_user_triggered) { + seq_puts(m, " => started at: "); + print_name_offset(m, tr->critical_start); + seq_puts(m, "\n => ended at: "); + print_name_offset(m, tr->critical_end); + seq_puts(m, "\n"); + } + seq_puts(m, "\n"); + + if (!trace_verbose) + print_help_header(m); + } + if (n >= entries) + return NULL; + + return tr->trace + n; +} + +static void * notrace l_next(struct seq_file *m, void *p, loff_t *pos) +{ + struct cpu_trace *tr = out_tr.traces; + unsigned long entries = min(tr->trace_idx, MAX_TRACE-1); + + if (++*pos >= entries) { + if (*pos == entries) + seq_puts(m, "\n\nvim:ft=help\n"); + return NULL; + } + return tr->trace + *pos; +} + +static void notrace l_stop(struct seq_file *m, void *p) +{ + up(&out_mutex); +} + +static void print_timestamp(struct seq_file *m, unsigned long abs_usecs, + unsigned long rel_usecs) +{ + seq_printf(m, " %4ldus", abs_usecs); + if (rel_usecs > 100) + seq_puts(m, "!: "); + else if (rel_usecs > 1) + seq_puts(m, "+: "); + else + seq_puts(m, " : "); +} + +static void +print_timestamp_short(struct seq_file *m, unsigned long abs_usecs, + unsigned long rel_usecs) +{ + seq_printf(m, " %4ldus", abs_usecs); + if (rel_usecs > 100) + seq_putc(m, '!'); + else if (rel_usecs > 1) + seq_putc(m, '+'); + else + seq_putc(m, ' '); +} + +static void +print_generic(struct seq_file *m, struct trace_entry *entry) +{ + int hardirq, softirq; + + seq_printf(m, "%8.8s-%-5d ", pid_to_cmdline(entry->pid), entry->pid); + seq_printf(m, "%d", entry->cpu); + seq_printf(m, "%c%c", + (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : + (entry->flags & TRACE_FLAG_IRQS_HARD_OFF) ? 'D' : '.', + (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'n' : '.'); + + hardirq = entry->flags & TRACE_FLAG_HARDIRQ; + softirq = entry->flags & TRACE_FLAG_SOFTIRQ; + if (hardirq && softirq) + seq_putc(m, 'H'); + else { + if (hardirq) + seq_putc(m, 'h'); + else { + if (softirq) + seq_putc(m, 's'); + else + seq_putc(m, '.'); + } + } + + if (entry->preempt_count) + seq_printf(m, "%x", entry->preempt_count); + else + seq_puts(m, "."); +} + + +static int notrace l_show_fn(struct seq_file *m, unsigned long trace_idx, + struct trace_entry *entry, struct trace_entry *entry0, + struct trace_entry *next_entry) +{ + unsigned long abs_usecs, rel_usecs; + + abs_usecs = cycles_to_usecs(entry->timestamp - entry0->timestamp); + rel_usecs = cycles_to_usecs(next_entry->timestamp - entry->timestamp); + + if (trace_verbose) { + seq_printf(m, "%16s %5d %d %d %08x %08lx [%016Lx] %ld.%03ldms (+%ld.%03ldms): ", + pid_to_cmdline(entry->pid), + entry->pid, entry->cpu, entry->flags, + entry->preempt_count, trace_idx, + entry->timestamp, abs_usecs/1000, + abs_usecs % 1000, rel_usecs/1000, rel_usecs % 1000); + print_name_offset(m, entry->u.fn.eip); + seq_puts(m, " ("); + print_name_offset(m, entry->u.fn.parent_eip); + seq_puts(m, ")\n"); + } else { + print_generic(m, entry); + print_timestamp(m, abs_usecs, rel_usecs); + print_name(m, entry->u.fn.eip); + seq_puts(m, " ("); + print_name(m, entry->u.fn.parent_eip); + seq_puts(m, ")\n"); + } + return 0; +} + +static int notrace l_show_special(struct seq_file *m, unsigned long trace_idx, + struct trace_entry *entry, struct trace_entry *entry0, + struct trace_entry *next_entry, int mode64) +{ + unsigned long abs_usecs, rel_usecs; + + abs_usecs = cycles_to_usecs(entry->timestamp - entry0->timestamp); + rel_usecs = cycles_to_usecs(next_entry->timestamp - entry->timestamp); + + print_generic(m, entry); + print_timestamp(m, abs_usecs, rel_usecs); + if (trace_verbose) + print_name_offset(m, entry->u.special.eip); + else + print_name(m, entry->u.special.eip); + + if (!mode64) { + seq_printf(m, " (%lx %lx %lx)\n", + entry->u.special.v1, entry->u.special.v2, entry->u.special.v3); + } else { + seq_printf(m, " (%lx%8lx %lx)\n", + entry->u.special.v1, entry->u.special.v2, entry->u.special.v3); + } + return 0; +} + +static int notrace +l_show_special_pid(struct seq_file *m, unsigned long trace_idx, + struct trace_entry *entry, struct trace_entry *entry0, + struct trace_entry *next_entry) +{ + unsigned long abs_usecs, rel_usecs; + unsigned int pid; + + pid = entry->u.special.v1; + + abs_usecs = cycles_to_usecs(entry->timestamp - entry0->timestamp); + rel_usecs = cycles_to_usecs(next_entry->timestamp - entry->timestamp); + + print_generic(m, entry); + print_timestamp(m, abs_usecs, rel_usecs); + if (trace_verbose) + print_name_offset(m, entry->u.special.eip); + else + print_name(m, entry->u.special.eip); + seq_printf(m, " <%.8s-%d> (%lx %lx)\n", + pid_to_cmdline(pid), pid, + entry->u.special.v2, entry->u.special.v3); + + return 0; +} + +static int notrace l_show_cmdline(struct seq_file *m, unsigned long trace_idx, + struct trace_entry *entry, struct trace_entry *entry0, + struct trace_entry *next_entry) +{ + unsigned long abs_usecs, rel_usecs; + + if (!trace_verbose) + return 0; + + abs_usecs = cycles_to_usecs(entry->timestamp - entry0->timestamp); + rel_usecs = cycles_to_usecs(next_entry->timestamp - entry->timestamp); + + seq_printf(m, + "[ => %16s ] %ld.%03ldms (+%ld.%03ldms)\n", + entry->u.cmdline.str, + abs_usecs/1000, abs_usecs % 1000, + rel_usecs/1000, rel_usecs % 1000); + + return 0; +} + +extern unsigned long sys_call_table[NR_syscalls]; + +static int notrace l_show_syscall(struct seq_file *m, unsigned long trace_idx, + struct trace_entry *entry, struct trace_entry *entry0, + struct trace_entry *next_entry) +{ + unsigned long abs_usecs, rel_usecs; + unsigned int nr; + + abs_usecs = cycles_to_usecs(entry->timestamp - entry0->timestamp); + rel_usecs = cycles_to_usecs(next_entry->timestamp - entry->timestamp); + + print_generic(m, entry); + print_timestamp_short(m, abs_usecs, rel_usecs); + + seq_puts(m, "> "); + nr = entry->u.syscall.nr; + if (nr < NR_syscalls) + print_name(m, sys_call_table[nr]); + else + seq_printf(m, "", nr); + + seq_printf(m, " (%08lx %08lx %08lx)\n", + entry->u.syscall.p1, entry->u.syscall.p2, entry->u.syscall.p3); + + return 0; +} + +static int notrace l_show_sysret(struct seq_file *m, unsigned long trace_idx, + struct trace_entry *entry, struct trace_entry *entry0, + struct trace_entry *next_entry) +{ + unsigned long abs_usecs, rel_usecs; + + abs_usecs = cycles_to_usecs(entry->timestamp - entry0->timestamp); + rel_usecs = cycles_to_usecs(next_entry->timestamp - entry->timestamp); + + print_generic(m, entry); + print_timestamp_short(m, abs_usecs, rel_usecs); + + seq_printf(m, "< (%d)\n", entry->u.sysret.ret); + + return 0; +} + + +static int notrace l_show(struct seq_file *m, void *p) +{ + struct cpu_trace *tr = out_tr.traces; + struct trace_entry *entry, *entry0, *next_entry; + unsigned long trace_idx; + + cond_resched(); + entry = p; + if (entry->timestamp < out_tr.first_timestamp) + return 0; + if (entry->timestamp > out_tr.last_timestamp) + return 0; + + entry0 = tr->trace; + trace_idx = entry - entry0; + + if (trace_idx + 1 < tr->trace_idx) + next_entry = entry + 1; + else + next_entry = entry; + + if (trace_verbose) + seq_printf(m, "(T%d/#%ld) ", entry->type, trace_idx); + + switch (entry->type) { + case TRACE_FN: + l_show_fn(m, trace_idx, entry, entry0, next_entry); + break; + case TRACE_SPECIAL: + l_show_special(m, trace_idx, entry, entry0, next_entry, 0); + break; + case TRACE_SPECIAL_PID: + l_show_special_pid(m, trace_idx, entry, entry0, next_entry); + break; + case TRACE_SPECIAL_U64: + l_show_special(m, trace_idx, entry, entry0, next_entry, 1); + break; + case TRACE_CMDLINE: + l_show_cmdline(m, trace_idx, entry, entry0, next_entry); + break; + case TRACE_SYSCALL: + l_show_syscall(m, trace_idx, entry, entry0, next_entry); + break; + case TRACE_SYSRET: + l_show_sysret(m, trace_idx, entry, entry0, next_entry); + break; + default: + seq_printf(m, "unknown trace type %d\n", entry->type); + } + return 0; +} + +struct seq_operations latency_trace_op = { + .start = l_start, + .next = l_next, + .stop = l_stop, + .show = l_show +}; + +static void copy_trace(struct cpu_trace *save, struct cpu_trace *tr) +{ + /* free-running needs reordering */ + if (trace_freerunning) { + int i, idx, idx0 = tr->trace_idx; + + for (i = 0; i < MAX_TRACE; i++) { + idx = (idx0 + i) % MAX_TRACE; + save->trace[i] = tr->trace[idx]; + } + save->trace_idx = MAX_TRACE-1; + } else { + save->trace_idx = tr->trace_idx; + + memcpy(save->trace, tr->trace, + min(save->trace_idx + 1, MAX_TRACE-1) * + sizeof(struct trace_entry)); + } + save->overrun = tr->overrun; +} + +/* + * Copy the new maximum trace into the separate maximum-trace + * structure. (this way the maximum trace is permanently saved, + * for later retrieval via /proc/latency_trace) + */ +static void update_max_tr(struct cpu_trace *tr) +{ + struct cpu_trace *save; + int cpu, all_cpus = 0; + +#ifdef CONFIG_PREEMPT + WARN_ON(!preempt_count() && !raw_irqs_disabled()); +#endif + + max_tr.cpu = tr->cpu; + save = max_tr.traces + tr->cpu; + + if ((wakeup_timing || trace_user_triggered || trace_print_at_crash) && + trace_all_cpus) { + all_cpus = 1; + for_each_online_cpu(cpu) + atomic_inc(&cpu_traces[cpu].disabled); + } + + save->saved_latency = preempt_max_latency; + save->preempt_timestamp = tr->preempt_timestamp; + save->critical_start = tr->critical_start; + save->critical_end = tr->critical_end; + save->critical_sequence = tr->critical_sequence; + + memcpy(save->comm, current->comm, CMDLINE_BYTES); + save->pid = current->pid; + save->uid = current->uid; + save->nice = current->static_prio - 20 - MAX_RT_PRIO; + save->policy = current->policy; + save->rt_priority = current->rt_priority; + + if (all_cpus) { + for_each_online_cpu(cpu) { + copy_trace(max_tr.traces + cpu, cpu_traces + cpu); + atomic_dec(&cpu_traces[cpu].disabled); + } + } else + copy_trace(save, tr); +} + +#else /* !LATENCY_TRACE */ + +static inline void notrace +____trace(int cpu, enum trace_type type, struct cpu_trace *tr, + unsigned long eip, unsigned long parent_eip, + unsigned long v1, unsigned long v2, unsigned long v3, + unsigned long flags) +{ +} + +static inline void notrace +___trace(enum trace_type type, unsigned long eip, unsigned long parent_eip, + unsigned long v1, unsigned long v2, + unsigned long v3) +{ +} + +static inline void notrace __trace(unsigned long eip, unsigned long parent_eip) +{ +} + +static inline void update_max_tr(struct cpu_trace *tr) +{ +} + +static inline void notrace _trace_cmdline(int cpu, struct cpu_trace *tr) +{ +} + +#endif + +static int setup_preempt_thresh(char *s) +{ + int thresh; + + get_option(&s, &thresh); + if (thresh > 0) { + preempt_thresh = usecs_to_cycles(thresh); + printk("Preemption threshold = %u us\n", thresh); + } + return 1; +} +__setup("preempt_thresh=", setup_preempt_thresh); + +static inline void notrace reset_trace_idx(int cpu, struct cpu_trace *tr) +{ + if (trace_all_cpus) + for_each_online_cpu(cpu) + cpu_traces[cpu].trace_idx = 0; + else + tr->trace_idx = 0; +} + +#ifdef CONFIG_CRITICAL_TIMING + +static void notrace +check_critical_timing(int cpu, struct cpu_trace *tr, unsigned long parent_eip) +{ + unsigned long latency, t0, t1; + cycles_t T0, T1, T2, delta; + unsigned long flags; + + if (trace_user_triggered) + return; + /* + * usecs conversion is slow so we try to delay the conversion + * as long as possible: + */ + T0 = tr->preempt_timestamp; + T1 = get_cycles(); + delta = T1-T0; + + raw_local_save_flags(flags); + + if (!report_latency(delta)) + goto out; + + ____trace(cpu, TRACE_FN, tr, CALLER_ADDR0, parent_eip, 0, 0, 0, flags); + /* + * Update the timestamp, because the trace entry above + * might change it (it can only get larger so the latency + * is fair to be reported): + */ + T2 = get_cycles(); + if (T2 < T1) + printk("bug: %016Lx < %016Lx!\n", T2, T1); + delta = T2-T0; + + latency = cycles_to_usecs(delta); + latency_hist(tr->latency_type, cpu, latency); + + if (latency_hist_flag) { + if (preempt_max_latency >= delta) + goto out; + } + + if (tr->critical_sequence != max_sequence || down_trylock(&max_mutex)) + goto out; + +#ifndef CONFIG_CRITICAL_LATENCY_HIST + if (!preempt_thresh && preempt_max_latency > delta) { + printk("bug: updating %016Lx > %016Lx?\n", + preempt_max_latency, delta); + printk(" [%016Lx %016Lx %016Lx]\n", T0, T1, T2); + } +#endif + + preempt_max_latency = delta; + t0 = cycles_to_usecs(T0); + t1 = cycles_to_usecs(T1); + + tr->critical_end = parent_eip; + + update_max_tr(tr); + +#ifndef CONFIG_CRITICAL_LATENCY_HIST + if (preempt_thresh) + printk("(%16s-%-5d|#%d): %lu us critical section " + "violates %lu us threshold.\n" + " => started at timestamp %lu: ", + current->comm, current->pid, + raw_smp_processor_id(), + latency, cycles_to_usecs(preempt_thresh), t0); + else + printk("(%16s-%-5d|#%d): new %lu us maximum-latency " + "critical section.\n => started at timestamp %lu: ", + current->comm, current->pid, + raw_smp_processor_id(), + latency, t0); + + print_symbol("<%s>\n", tr->critical_start); + printk(" => ended at timestamp %lu: ", t1); + print_symbol("<%s>\n", tr->critical_end); + dump_stack(); + t1 = cycles_to_usecs(get_cycles()); + printk(" => dump-end timestamp %lu\n\n", t1); +#endif + + max_sequence++; + + up(&max_mutex); + +out: + tr->critical_sequence = max_sequence; + tr->preempt_timestamp = get_cycles(); + tr->early_warning = 0; + reset_trace_idx(cpu, tr); + _trace_cmdline(cpu, tr); + ____trace(cpu, TRACE_FN, tr, CALLER_ADDR0, parent_eip, 0, 0, 0, flags); +} + +void notrace touch_critical_timing(void) +{ + int cpu = raw_smp_processor_id(); + struct cpu_trace *tr = cpu_traces + cpu; + + if (!tr->critical_start || atomic_read(&tr->disabled) || + trace_user_triggered || wakeup_timing) + return; + + if (preempt_count() > 0 && tr->critical_start) { + atomic_inc(&tr->disabled); + check_critical_timing(cpu, tr, CALLER_ADDR0); + tr->critical_start = CALLER_ADDR0; + tr->critical_sequence = max_sequence; + atomic_dec(&tr->disabled); + } +} +EXPORT_SYMBOL(touch_critical_timing); + +void notrace stop_critical_timing(void) +{ + struct cpu_trace *tr = cpu_traces + raw_smp_processor_id(); + + tr->critical_start = 0; +} +EXPORT_SYMBOL(stop_critical_timing); + +static inline void notrace +__start_critical_timing(unsigned long eip, unsigned long parent_eip, int latency_type) +{ + int cpu = raw_smp_processor_id(); + struct cpu_trace *tr = cpu_traces + cpu; + unsigned long flags; + + if (tr->critical_start || atomic_read(&tr->disabled) || + trace_user_triggered || wakeup_timing) + return; + + atomic_inc(&tr->disabled); + + tr->critical_sequence = max_sequence; + tr->preempt_timestamp = get_cycles(); + tr->critical_start = eip; + atomic_set(&tr->overrun, 0); + reset_trace_idx(cpu, tr); + tr->latency_type = latency_type; + _trace_cmdline(cpu, tr); + + raw_local_save_flags(flags); + ____trace(cpu, TRACE_FN, tr, eip, parent_eip, 0, 0, 0, flags); + + atomic_dec(&tr->disabled); +} + +static inline void notrace +__stop_critical_timing(unsigned long eip, unsigned long parent_eip) +{ + int cpu = raw_smp_processor_id(); + struct cpu_trace *tr = cpu_traces + cpu; + unsigned long flags; + + if (!tr->critical_start || atomic_read(&tr->disabled) || + trace_user_triggered || wakeup_timing) + return; + + atomic_inc(&tr->disabled); + raw_local_save_flags(flags); + ____trace(cpu, TRACE_FN, tr, eip, parent_eip, 0, 0, 0, flags); + check_critical_timing(cpu, tr, eip); + tr->critical_start = 0; + atomic_dec(&tr->disabled); +} + +#endif + +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING + +void notrace trace_irqs_off_lowlevel(void) +{ + unsigned long flags; + + raw_local_save_flags(flags); + + if (!irqs_off_preempt_count() && raw_irqs_disabled_flags(flags)) + __start_critical_timing(CALLER_ADDR0, 0, INTERRUPT_LATENCY); +} + +void notrace trace_irqs_off(void) +{ + unsigned long flags; + + raw_local_save_flags(flags); + + if (!irqs_off_preempt_count() && raw_irqs_disabled_flags(flags)) + __start_critical_timing(CALLER_ADDR0, CALLER_ADDR1, INTERRUPT_LATENCY); +} + +EXPORT_SYMBOL(trace_irqs_off); + +void notrace trace_irqs_on(void) +{ + unsigned long flags; + + raw_local_save_flags(flags); + + if (!irqs_off_preempt_count() && raw_irqs_disabled_flags(flags)) + __stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); + + __raw_local_irq_restore(flags); +} + +EXPORT_SYMBOL(trace_irqs_on); + +#endif + +#endif /* LATENCY_TIMING */ + +#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_CRITICAL_TIMING) + +static inline unsigned long get_parent_eip(void) +{ + unsigned long parent_eip = CALLER_ADDR1; + + if (in_lock_functions(parent_eip)) { + parent_eip = CALLER_ADDR2; + if (in_lock_functions(parent_eip)) + parent_eip = CALLER_ADDR3; + } + + return parent_eip; +} + +void notrace add_preempt_count_ti(struct thread_info *ti, unsigned int val) +{ + unsigned long eip = CALLER_ADDR0; + unsigned long parent_eip = get_parent_eip(); + +#ifdef CONFIG_DEBUG_PREEMPT + /* + * Underflow? + */ + BUG_ON(((int)ti->preempt_count < 0)); + /* + * Spinlock count overflowing soon? + */ + BUG_ON((ti->preempt_count & PREEMPT_MASK) >= PREEMPT_MASK-10); +#endif + + ti->preempt_count += val; +#ifdef CONFIG_PREEMPT_TRACE + if (val <= 10) { + unsigned int idx = ti->preempt_count & PREEMPT_MASK; + if (idx < MAX_PREEMPT_TRACE) { + current->preempt_trace_eip[idx] = eip; + current->preempt_trace_parent_eip[idx] = parent_eip; + } + } +#endif +#ifdef CONFIG_CRITICAL_PREEMPT_TIMING + { +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING + unsigned long flags; + + raw_local_save_flags(flags); + + if (!raw_irqs_disabled_flags(flags)) +#endif + if (preempt_count() == val) + __start_critical_timing(eip, parent_eip, PREEMPT_LATENCY); + } +#endif + (void)eip, (void)parent_eip; +} +EXPORT_SYMBOL(add_preempt_count_ti); + +void notrace add_preempt_count(unsigned int val) +{ + add_preempt_count_ti(current_thread_info(), val); +} + +EXPORT_SYMBOL(add_preempt_count); + +void notrace sub_preempt_count_ti(struct thread_info *ti, unsigned int val) +{ +#ifdef CONFIG_DEBUG_PREEMPT + /* + * Underflow? + */ + BUG_ON(unlikely(val > ti->preempt_count)); + + /* + * Is the spinlock portion underflowing? + */ + BUG_ON((val < PREEMPT_MASK) && !(ti->preempt_count & PREEMPT_MASK)); +#endif + +#ifdef CONFIG_CRITICAL_PREEMPT_TIMING + { +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING + unsigned long flags; + + raw_local_save_flags(flags); + + if (!raw_irqs_disabled_flags(flags)) +#endif + if (ti->preempt_count == val) + __stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); + } +#endif + ti->preempt_count -= val; +} + +EXPORT_SYMBOL(sub_preempt_count_ti); + +void notrace sub_preempt_count(unsigned int val) +{ + sub_preempt_count_ti(current_thread_info(), val); +} + +EXPORT_SYMBOL(sub_preempt_count); + +void notrace mask_preempt_count(unsigned int mask) +{ + unsigned long eip = CALLER_ADDR0; + unsigned long parent_eip = get_parent_eip(); + + preempt_count() |= mask; + +#ifdef CONFIG_CRITICAL_PREEMPT_TIMING + { +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING + unsigned long flags; + + raw_local_save_flags(flags); + + if (!raw_irqs_disabled_flags(flags)) +#endif + if (preempt_count() == mask) + __start_critical_timing(eip, parent_eip, PREEMPT_LATENCY); + } +#endif + (void) eip, (void) parent_eip; +} +EXPORT_SYMBOL(mask_preempt_count); + +void notrace unmask_preempt_count(unsigned int mask) +{ +#ifdef CONFIG_CRITICAL_PREEMPT_TIMING + { +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING + unsigned long flags; + + raw_local_save_flags(flags); + + if (!raw_irqs_disabled_flags(flags)) +#endif + if (preempt_count() == mask) + __stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); + } +#endif + preempt_count() &= ~mask; +} +EXPORT_SYMBOL(unmask_preempt_count); + + +#endif + +/* + * Wakeup latency timing/tracing. We get upcalls from the scheduler + * when a task is being woken up and we time/trace it until it gets + * to a CPU - or an even-higher-prio task supercedes it. (in that + * case we throw away the currently traced task - we dont try to + * handle nesting, that simplifies things significantly) + */ +#ifdef CONFIG_WAKEUP_TIMING + +static void notrace +check_wakeup_timing(struct cpu_trace *tr, unsigned long parent_eip) +{ + unsigned long latency, t0, t1, flags; + cycles_t T0, T1, T2, delta; + int cpu = raw_smp_processor_id(); + + if (trace_user_triggered) + return; + + atomic_inc(&tr->disabled); + if (atomic_read(&tr->disabled) != 1) + goto out; + + T0 = tr->preempt_timestamp; + T1 = get_cycles(); + /* + * maybe preempt_timestamp originated on another CPU, + * with a TSC drift: + */ + if (T0 > T1) + T0 = T1; + delta = T1-T0; + + if (!report_latency(delta)) + goto out; + + raw_local_save_flags(flags); + ____trace(smp_processor_id(), TRACE_FN, tr, CALLER_ADDR0, parent_eip, 0, 0, 0, flags); + T2 = get_cycles(); + if (T2 < T1) + printk("bug2: %016Lx < %016Lx!\n", T2, T1); + delta = T2-T0; + + latency = cycles_to_usecs(delta); + latency_hist(tr->latency_type, cpu, latency); + + if (latency_hist_flag) { + if (preempt_max_latency >= delta) + goto out; + } + + if (tr->critical_sequence != max_sequence || down_trylock(&max_mutex)) + goto out; + +#ifndef CONFIG_WAKEUP_LATENCY_HIST + if (!preempt_thresh && preempt_max_latency > delta) { + printk("bug2: updating %016Lx > %016Lx?\n", + preempt_max_latency, delta); + printk(" [%016Lx %016Lx %016Lx]\n", T0, T1, T2); + } +#endif + + preempt_max_latency = delta; + t0 = cycles_to_usecs(T0); + t1 = cycles_to_usecs(T1); + tr->critical_end = parent_eip; + + update_max_tr(tr); + +#ifndef CONFIG_WAKEUP_LATENCY_HIST + if (preempt_thresh) + printk("(%16s-%-5d|#%d): %lu us wakeup latency " + "violates %lu us threshold.\n", + current->comm, current->pid, + raw_smp_processor_id(), latency, + cycles_to_usecs(preempt_thresh)); + else + printk("(%16s-%-5d|#%d): new %lu us maximum-latency " + "wakeup.\n", current->comm, current->pid, + raw_smp_processor_id(), latency); +#endif + + max_sequence++; + + up(&max_mutex); + +out: + atomic_dec(&tr->disabled); +} + +/* + * Start wakeup latency tracing - called with the runqueue held + * and interrupts disabled: + */ +void __trace_start_sched_wakeup(struct task_struct *p) +{ + struct cpu_trace *tr; + int cpu; + + if (trace_user_triggered || !wakeup_timing) + return; + + spin_lock(&sch.trace_lock); + if (sch.task && (sch.task->prio >= p->prio)) + goto out_unlock; + + /* + * New highest-prio task just woke up - start tracing: + */ + sch.task = p; + cpu = task_cpu(p); + sch.cpu = cpu; + /* + * We keep using this CPU's trace buffer even if the task + * gets migrated to another CPU. Tracing only happens on + * the CPU that 'owns' the highest-prio task so it's + * fundamentally single-threaded. + */ + sch.tr = tr = cpu_traces + cpu; + reset_trace_idx(cpu, tr); + +// if (!atomic_read(&tr->disabled)) { + atomic_inc(&tr->disabled); + tr->critical_sequence = max_sequence; + tr->preempt_timestamp = get_cycles(); + tr->latency_type = WAKEUP_LATENCY; + tr->critical_start = CALLER_ADDR0; + atomic_set(&tr->overrun, 0); + _trace_cmdline(raw_smp_processor_id(), tr); + atomic_dec(&tr->disabled); +// } + + mcount(); + trace_special_pid(p->pid, p->prio, cpu); +out_unlock: + spin_unlock(&sch.trace_lock); +} + +void trace_stop_sched_switched(struct task_struct *p) +{ + struct cpu_trace *tr; + unsigned long flags; + + if (trace_user_triggered || !wakeup_timing) + return; + + raw_local_irq_save(flags); + spin_lock(&sch.trace_lock); + if (p == sch.task) { + trace_special_pid(p->pid, p->prio, task_cpu(p)); + + sch.task = NULL; + tr = sch.tr; + sch.tr = NULL; + WARN_ON(!tr); + /* + * Somewhat racy but safer - the printks within + * check_wakeup_timing() can call back into the + * wakup-timing code and deadlock: + */ +// atomic_inc(&tr->disabled); + preempt_disable(); + spin_unlock(&sch.trace_lock); + check_wakeup_timing(tr, CALLER_ADDR0); + preempt_enable(); +// atomic_dec(&tr->disabled); + } else { + if (sch.task) + trace_special_pid(sch.task->pid, sch.task->prio, p->prio); + if (sch.task && (sch.task->prio >= p->prio)) + sch.task = NULL; + spin_unlock(&sch.trace_lock); + } + raw_local_irq_restore(flags); +} + +void trace_change_sched_cpu(struct task_struct *p, int new_cpu) +{ + unsigned long flags; + + if (!wakeup_timing) + return; + + trace_special(task_cpu(p), task_cpu(p), new_cpu); + raw_local_irq_save(flags); + spin_lock(&sch.trace_lock); + if (p == sch.task && task_cpu(p) != new_cpu) { + sch.cpu = new_cpu; + trace_special(task_cpu(p), new_cpu, 0); + } + spin_unlock(&sch.trace_lock); + raw_local_irq_restore(flags); +} + +#endif + +#ifdef CONFIG_LATENCY_TRACE + +long user_trace_start(void) +{ + struct cpu_trace *tr; + unsigned long flags; + int cpu; + + if (!trace_user_triggered || trace_print_at_crash) + return -EINVAL; + + /* + * user_trace_start() might be called from hardirq + * context, if trace_user_triggered_irq is set, so + * be careful about locking: + */ + if (preempt_count()) { + if (down_trylock(&max_mutex)) + return -EAGAIN; + } else + down(&max_mutex); + + raw_local_irq_save(flags); + cpu = smp_processor_id(); + tr = cpu_traces + cpu; + +#ifdef CONFIG_WAKEUP_TIMING + if (wakeup_timing) { + spin_lock(&sch.trace_lock); + sch.task = current; + sch.cpu = cpu; + sch.tr = tr; + spin_unlock(&sch.trace_lock); + } +#endif + reset_trace_idx(cpu, tr); + + tr->critical_sequence = max_sequence; + tr->preempt_timestamp = get_cycles(); + tr->critical_start = CALLER_ADDR0; + atomic_set(&tr->overrun, 0); + _trace_cmdline(cpu, tr); + mcount(); + + WARN_ON(!raw_irqs_disabled()); + raw_local_irq_restore(flags); + + up(&max_mutex); + + return 0; +} + +EXPORT_SYMBOL_GPL(user_trace_start); + +long user_trace_stop(void) +{ + unsigned long latency, flags; + struct cpu_trace *tr; + cycles_t delta; + + + if (!trace_user_triggered || trace_print_at_crash) + return -EINVAL; + + preempt_disable(); + mcount(); + + raw_local_irq_save(flags); +#ifdef CONFIG_WAKEUP_TIMING + if (wakeup_timing) { + spin_lock(&sch.trace_lock); + if (current != sch.task) { + spin_unlock(&sch.trace_lock); + raw_local_irq_restore(flags); + preempt_enable(); + return -EINVAL; + } + sch.task = NULL; + tr = sch.tr; + sch.tr = NULL; + spin_unlock(&sch.trace_lock); + } else +#endif + tr = cpu_traces + smp_processor_id(); + + atomic_inc(&tr->disabled); + if (tr->preempt_timestamp) { + cycles_t T0, T1; + unsigned long long tmp0; + + T0 = tr->preempt_timestamp; + T1 = get_cycles(); + tmp0 = preempt_max_latency; + if (T1 < T0) + T0 = T1; + delta = T1 - T0; + if (!report_latency(delta)) + goto out; + if (tr->critical_sequence != max_sequence || + down_trylock(&max_mutex)) + goto out; + + if (!preempt_thresh && preempt_max_latency > delta) { + raw_local_irq_restore(flags); + printk("bug3: updating %016Lx > %016Lx [%016Lx]?\n", + preempt_max_latency, delta, tmp0); + printk(" [%016Lx %016Lx]\n", T0, T1); + raw_local_irq_save(flags); + } + + preempt_max_latency = delta; + update_max_tr(tr); + + latency = cycles_to_usecs(delta); + + raw_local_irq_restore(flags); + if (preempt_thresh) + printk("(%16s-%-5d|#%d): %lu us user-latency " + "violates %lu us threshold.\n", + current->comm, current->pid, + raw_smp_processor_id(), latency, + cycles_to_usecs(preempt_thresh)); + else + printk("(%16s-%-5d|#%d): new %lu us user-latency.\n", + current->comm, current->pid, + raw_smp_processor_id(), latency); + raw_local_irq_save(flags); + + max_sequence++; + up(&max_mutex); +out: + tr->preempt_timestamp = 0; + } + atomic_dec(&tr->disabled); + raw_local_irq_restore(flags); + preempt_enable(); + + return 0; +} + +EXPORT_SYMBOL(user_trace_stop); + +void stop_trace(void) +{ + if (trace_print_at_crash) + trace_enabled = -1; +} + +static void print_entry(struct trace_entry *entry, struct trace_entry *entry0, + struct trace_entry *next_entry) +{ + unsigned long abs_usecs, rel_usecs; + int hardirq, softirq; + + abs_usecs = cycles_to_usecs(entry->timestamp - entry0->timestamp); + rel_usecs = cycles_to_usecs(next_entry->timestamp - entry->timestamp); + + printk("%-5d ", entry->pid); + + printk("%c%c", + (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : + (entry->flags & TRACE_FLAG_IRQS_HARD_OFF) ? 'D' : '.', + (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'n' : '.'); + + hardirq = entry->flags & TRACE_FLAG_HARDIRQ; + softirq = entry->flags & TRACE_FLAG_SOFTIRQ; + if (hardirq && softirq) + printk("H"); + else { + if (hardirq) + printk("h"); + else { + if (softirq) + printk("s"); + else + printk("."); + } + } + + printk(":%d %ld.%03ldms: ", + entry->preempt_count, abs_usecs/1000, abs_usecs % 1000); + + printk_name(entry->u.fn.eip); + printk(" <= ("); + printk_name(entry->u.fn.parent_eip); + printk(")\n"); +} + +/* + * Print the current trace at crash time. + * + * We print it backwards, so that the newest (most interesting) entries + * are printed first. + */ +void print_last_trace(void) +{ + unsigned int idx0, idx, i; + struct cpu_trace *tr; + struct trace_entry *entry0, *entry, *next_entry; + + if (trace_enabled != -1 || !trace_print_at_crash) + return; + + trace_print_at_crash = 0; + + preempt_disable(); + tr = cpu_traces + smp_processor_id(); + + printk("Last %ld trace entries:\n", MAX_TRACE); + idx0 = tr->trace_idx; + printk("curr idx: %d\n", idx0); + if (idx0 >= MAX_TRACE) + idx0 = MAX_TRACE-1; + idx = idx0; + entry0 = tr->trace + idx0; + + for (i = 0; i < MAX_TRACE; i++) { + next_entry = tr->trace + idx; + if (idx == 0) + idx = MAX_TRACE-1; + else + idx--; + entry = tr->trace + idx; + if (entry->type == TRACE_FN) + print_entry(entry, entry0, next_entry); + } + printk("printed %ld entries\n", MAX_TRACE); + + preempt_enable(); +} + +#ifdef CONFIG_SMP +/* + * On SMP, try to 'peek' on other CPU's traces and record them + * in this CPU's trace. This way we get a rough idea about what's + * going on there, without the overhead of global tracing. + * + * (no need to make this PER_CPU, we bounce it around anyway.) + */ +unsigned long nmi_eips[NR_CPUS]; +unsigned long nmi_flags[NR_CPUS]; + +void notrace nmi_trace(unsigned long eip, unsigned long parent_eip, + unsigned long flags) +{ + int cpu, this_cpu = smp_processor_id(); + + __trace(eip, parent_eip); + + nmi_eips[this_cpu] = parent_eip; + nmi_flags[this_cpu] = flags; + for (cpu = 0; cpu < NR_CPUS; cpu++) + if (cpu_online(cpu) && cpu != this_cpu) { + __trace(eip, nmi_eips[cpu]); + __trace(eip, nmi_flags[cpu]); + } +} +#else +/* + * On UP, NMI tracing is quite simple: + */ +void notrace nmi_trace(unsigned long eip, unsigned long parent_eip, + unsigned long flags) +{ + __trace(eip, parent_eip); +} +#endif + +#endif + +#ifdef CONFIG_PREEMPT_TRACE + +static void print_preempt_trace(struct task_struct *task) +{ + unsigned int count = task->thread_info->preempt_count; + unsigned int i, lim = count & PREEMPT_MASK; + if (lim >= MAX_PREEMPT_TRACE) + lim = MAX_PREEMPT_TRACE-1; + printk("---------------------------\n"); + printk("| preempt count: %08x ]\n", count); + printk("| %d-level deep critical section nesting:\n", lim); + printk("----------------------------------------\n"); + for (i = 1; i <= lim; i++) { + printk(".. [<%08lx>] .... ", task->preempt_trace_eip[i]); + print_symbol("%s\n", task->preempt_trace_eip[i]); + printk(".....[<%08lx>] .. ( <= ", + task->preempt_trace_parent_eip[i]); + print_symbol("%s)\n", task->preempt_trace_parent_eip[i]); + } + printk("\n"); +} + +#endif + +#if defined(CONFIG_PREEMPT_TRACE) || defined(CONFIG_LATENCY_TRACE) +void print_traces(struct task_struct *task) +{ +#ifdef CONFIG_PREEMPT_TRACE + print_preempt_trace(task); +#endif +#ifdef CONFIG_LATENCY_TRACE + print_last_trace(); +#endif +} +#endif + +#ifdef CONFIG_LATENCY_TIMING + +static int preempt_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + cycles_t *max = data; + + return sprintf(page, "%ld\n", cycles_to_usecs(*max)); +} + +static int preempt_write_proc(struct file *file, const char __user *buffer, + unsigned long count, void *data) +{ + unsigned int c, done = 0, val, sum = 0; + cycles_t *max = data; + + while (count) { + if (get_user(c, buffer)) + return -EFAULT; + val = c - '0'; + buffer++; + done++; + count--; + if (c == 0 || c == '\n') + break; + if (val > 9) + return -EINVAL; + sum *= 10; + sum += val; + } + *max = usecs_to_cycles(sum); + return done; +} + +#define PROCNAME_PML "sys/kernel/preempt_max_latency" +#define PROCNAME_PT "sys/kernel/preempt_thresh" + +static __init int latency_init(void) +{ + struct proc_dir_entry *entry; + int cpu; + + for (cpu = 0; cpu < NR_CPUS; cpu++) + cpu_traces[cpu].cpu = cpu; + + if (!(entry = create_proc_entry(PROCNAME_PML, 0644, NULL))) + printk("latency_init(): can't create %s\n", PROCNAME_PML); + else { + entry->nlink = 1; + entry->data = &preempt_max_latency; + entry->read_proc = preempt_read_proc; + entry->write_proc = preempt_write_proc; + } + + if (!(entry = create_proc_entry(PROCNAME_PT, 0644, NULL))) + printk("latency_init(): can't create %s\n", PROCNAME_PT); + else { + entry->nlink = 1; + entry->data = &preempt_thresh; + entry->read_proc = preempt_read_proc; + entry->write_proc = preempt_write_proc; + } + return 0; +} +__initcall(latency_init); + +#endif Index: linux-latency-tracing.q/kernel/latency_hist.c =================================================================== --- /dev/null +++ linux-latency-tracing.q/kernel/latency_hist.c @@ -0,0 +1,267 @@ +/* + * kernel/latency_hist.c + * + * Add support for histograms of preemption-off latency and + * interrupt-off latency and wakeup latency, it depends on + * Real-Time Preemption Support. + * + * Copyright (C) 2005 MontaVista Software, Inc. + * Yi Yang + * + */ +#include +#include +#include +#include +#include +#include +#include + +typedef struct hist_data_struct { + atomic_t hist_mode; /* 0 log, 1 don't log */ + unsigned long min_lat; + unsigned long avg_lat; + unsigned long max_lat; + unsigned long long beyond_hist_bound_samples; + unsigned long long accumulate_lat; + unsigned long long total_samples; + unsigned long long hist_array[MAX_ENTRY_NUM]; +} hist_data_t; + +static struct proc_dir_entry * latency_hist_root = NULL; +static char * latency_hist_proc_dir_root = "latency_hist"; + +static char * percpu_proc_name = "CPU"; + +#ifdef CONFIG_INTERRUPT_OFF_HIST +static DEFINE_PER_CPU(hist_data_t, interrupt_off_hist); +static char * interrupt_off_hist_proc_dir = "interrupt_off_latency"; +#endif + +#ifdef CONFIG_PREEMPT_OFF_HIST +static DEFINE_PER_CPU(hist_data_t, preempt_off_hist); +static char * preempt_off_hist_proc_dir = "preempt_off_latency"; +#endif + +#ifdef CONFIG_WAKEUP_LATENCY_HIST +static DEFINE_PER_CPU(hist_data_t, wakeup_latency_hist); +static char * wakeup_latency_hist_proc_dir = "wakeup_latency"; +#endif + +static struct proc_dir_entry *entry[LATENCY_TYPE_NUM][NR_CPUS]; + +static inline u64 u64_div(u64 x, u64 y) +{ + do_div(x, y); + return x; +} + +void latency_hist(int latency_type, int cpu, unsigned long latency) +{ + hist_data_t * my_hist; + + if ((cpu < 0) || (cpu >= NR_CPUS) || (latency_type < INTERRUPT_LATENCY) + || (latency_type > WAKEUP_LATENCY) || (latency < 0)) + return; + + switch(latency_type) { +#ifdef CONFIG_INTERRUPT_OFF_HIST + case INTERRUPT_LATENCY: + my_hist = (hist_data_t *)&per_cpu(interrupt_off_hist, cpu); + break; +#endif + +#ifdef CONFIG_PREEMPT_OFF_HIST + case PREEMPT_LATENCY: + my_hist = (hist_data_t *)&per_cpu(preempt_off_hist, cpu); + break; +#endif + +#ifdef CONFIG_WAKEUP_LATENCY_HIST + case WAKEUP_LATENCY: + my_hist = (hist_data_t *)&per_cpu(wakeup_latency_hist, cpu); + break; +#endif + default: + return; + } + + if (atomic_read(&my_hist->hist_mode) == 0) + return; + + if (latency >= MAX_ENTRY_NUM) + my_hist->beyond_hist_bound_samples++; + else + my_hist->hist_array[latency]++; + + if (latency < my_hist->min_lat) + my_hist->min_lat = latency; + else if (latency > my_hist->max_lat) + my_hist->max_lat = latency; + + my_hist->total_samples++; + my_hist->accumulate_lat += latency; + my_hist->avg_lat = (unsigned long) u64_div(my_hist->accumulate_lat, + my_hist->total_samples); + return; +} + +static void *l_start(struct seq_file *m, loff_t * pos) +{ + loff_t *index_ptr = kmalloc(sizeof(loff_t), GFP_KERNEL); + loff_t index = *pos; + hist_data_t *my_hist = (hist_data_t *) m->private; + + if (!index_ptr) + return NULL; + + if (index == 0) { + atomic_dec(&my_hist->hist_mode); + seq_printf(m, "#Minimum latency: %lu microseconds.\n" + "#Average latency: %lu microseconds.\n" + "#Maximum latency: %lu microseconds.\n" + "#Total samples: %llu\n" + "#There are %llu samples greater or equal than %d microseconds\n" + "#usecs\t%16s\n" + , my_hist->min_lat + , my_hist->avg_lat + , my_hist->max_lat + , my_hist->total_samples + , my_hist->beyond_hist_bound_samples + , MAX_ENTRY_NUM, "samples"); + } + if (index >= MAX_ENTRY_NUM) + return NULL; + + *index_ptr = index; + return index_ptr; +} + +static void *l_next(struct seq_file *m, void *p, loff_t * pos) +{ + loff_t *index_ptr = p; + hist_data_t *my_hist = (hist_data_t *) m->private; + + if (++*pos >= MAX_ENTRY_NUM) { + atomic_inc(&my_hist->hist_mode); + return NULL; + } + *index_ptr = *pos; + return index_ptr; +} + +static void l_stop(struct seq_file *m, void *p) +{ + kfree(p); +} + +static int l_show(struct seq_file *m, void *p) +{ + int index = *(loff_t *) p; + hist_data_t *my_hist = (hist_data_t *) m->private; + + seq_printf(m, "%5d\t%16llu\n", index, my_hist->hist_array[index]); + return 0; +} + +static struct seq_operations latency_hist_seq_op = { + .start = l_start, + .next = l_next, + .stop = l_stop, + .show = l_show +}; + +static int latency_hist_seq_open(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *entry_ptr = NULL; + int ret, i, j, break_flags = 0; + struct seq_file *seq; + + entry_ptr = PDE(file->f_dentry->d_inode); + for (i = 0; i < LATENCY_TYPE_NUM; i++) { + for (j = 0; j < NR_CPUS; j++) { + if (entry[i][j] == NULL) + continue; + if (entry_ptr->low_ino == entry[i][j]->low_ino) { + break_flags = 1; + break; + } + } + if (break_flags == 1) + break; + } + ret = seq_open(file, &latency_hist_seq_op); + if (break_flags == 1) { + seq = (struct seq_file *)file->private_data; + seq->private = entry[i][j]->data; + } + return ret; +} + +static struct file_operations latency_hist_seq_fops = { + .open = latency_hist_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static __init int latency_hist_init(void) +{ + struct proc_dir_entry *tmp_parent_proc_dir; + int i = 0, len = 0; + hist_data_t *my_hist; + char procname[64]; + + latency_hist_root = proc_mkdir(latency_hist_proc_dir_root, NULL); + + +#ifdef CONFIG_INTERRUPT_OFF_HIST + tmp_parent_proc_dir = proc_mkdir(interrupt_off_hist_proc_dir, latency_hist_root); + for (i = 0; i < NR_CPUS; i++) { + len = sprintf(procname, "%s%d", percpu_proc_name, i); + procname[len] = '\0'; + entry[INTERRUPT_LATENCY][i] = + create_proc_entry(procname, 0, tmp_parent_proc_dir); + entry[INTERRUPT_LATENCY][i]->data = (void *)&per_cpu(interrupt_off_hist, i); + entry[INTERRUPT_LATENCY][i]->proc_fops = &latency_hist_seq_fops; + my_hist = (hist_data_t *) entry[INTERRUPT_LATENCY][i]->data; + atomic_set(&my_hist->hist_mode,1); + my_hist->min_lat = 0xFFFFFFFFUL; + } +#endif + +#ifdef CONFIG_PREEMPT_OFF_HIST + tmp_parent_proc_dir = proc_mkdir(preempt_off_hist_proc_dir, latency_hist_root); + for (i = 0; i < NR_CPUS; i++) { + len = sprintf(procname, "%s%d", percpu_proc_name, i); + procname[len] = '\0'; + entry[PREEMPT_LATENCY][i] = + create_proc_entry(procname, 0, tmp_parent_proc_dir); + entry[PREEMPT_LATENCY][i]->data = (void *)&per_cpu(preempt_off_hist, i); + entry[PREEMPT_LATENCY][i]->proc_fops = &latency_hist_seq_fops; + my_hist = (hist_data_t *) entry[PREEMPT_LATENCY][i]->data; + atomic_set(&my_hist->hist_mode,1); + my_hist->min_lat = 0xFFFFFFFFUL; + } +#endif + +#ifdef CONFIG_WAKEUP_LATENCY_HIST + tmp_parent_proc_dir = proc_mkdir(wakeup_latency_hist_proc_dir, latency_hist_root); + for (i = 0; i < NR_CPUS; i++) { + len = sprintf(procname, "%s%d", percpu_proc_name, i); + procname[len] = '\0'; + entry[WAKEUP_LATENCY][i] = + create_proc_entry(procname, 0, tmp_parent_proc_dir); + entry[WAKEUP_LATENCY][i]->data = (void *)&per_cpu(wakeup_latency_hist, i); + entry[WAKEUP_LATENCY][i]->proc_fops = &latency_hist_seq_fops; + my_hist = (hist_data_t *) entry[WAKEUP_LATENCY][i]->data; + atomic_set(&my_hist->hist_mode,1); + my_hist->min_lat = 0xFFFFFFFFUL; + } +#endif + return 0; + +} + +__initcall(latency_hist_init); + Index: linux-latency-tracing.q/kernel/sched.c =================================================================== --- linux-latency-tracing.q.orig/kernel/sched.c +++ linux-latency-tracing.q/kernel/sched.c @@ -715,11 +715,18 @@ static inline void dec_nr_running(task_t } #endif +static inline void trace_start_sched_wakeup(task_t *p, runqueue_t *rq) +{ + if (TASK_PREEMPTS_CURR(p, rq) && (p != rq->curr)) + __trace_start_sched_wakeup(p); +} + /* * __activate_task - move a task to the runqueue. */ static inline void __activate_task(task_t *p, runqueue_t *rq) { + trace_special_pid(p->pid, p->prio, rq->nr_running); enqueue_task(p, rq->active); inc_nr_running(p, rq); } @@ -731,6 +738,7 @@ static inline void __activate_idle_task( { enqueue_task_head(p, rq->active); inc_nr_running(p, rq); + WARN_ON(rt_task(p)); } static int recalc_task_prio(task_t *p, unsigned long long now) @@ -1381,8 +1389,10 @@ out_activate: * to be considered on this CPU.) */ if (!sync || cpu != this_cpu) { - if (TASK_PREEMPTS_CURR(p, rq)) + if (TASK_PREEMPTS_CURR(p, rq)) { resched_task(rq->curr); + trace_start_sched_wakeup(p, rq); + } } success = 1; @@ -1396,15 +1406,19 @@ out: int fastcall wake_up_process(task_t *p) { - return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | + int ret = try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); + mcount(); + return ret; } EXPORT_SYMBOL(wake_up_process); int fastcall wake_up_state(task_t *p, unsigned int state) { - return try_to_wake_up(p, state, 0); + int ret = try_to_wake_up(p, state, 0); + mcount(); + return ret; } /* @@ -1512,6 +1526,7 @@ void fastcall wake_up_new_task(task_t *p inc_nr_running(p, rq); } set_need_resched(); + trace_start_sched_wakeup(p, rq); } else /* Run child last */ __activate_task(p, rq); @@ -1633,6 +1648,7 @@ static inline void finish_task_switch(ru prev_task_flags = prev->flags; finish_arch_switch(prev); finish_lock_switch(rq, prev); + trace_stop_sched_switched(current); if (mm) mmdrop(mm); if (unlikely(prev_task_flags & PF_DEAD)) @@ -1679,6 +1695,8 @@ task_t * context_switch(runqueue_t *rq, rq->prev_mm = oldmm; } + trace_cmdline(); + /* Here we just switch the register state and the stack. */ switch_to(prev, next, prev); @@ -2795,13 +2813,13 @@ static inline unsigned long smt_slice(ta return p->time_slice * (100 - sd->per_cpu_gain) / 100; } -static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) +static int dependent_sleeper(int this_cpu, runqueue_t *this_rq) { struct sched_domain *tmp, *sd = NULL; cpumask_t sibling_map; prio_array_t *array; - int ret = 0, i; - task_t *p; + int ret = 0, i, stop_sched_switched = 0; + task_t *p = NULL; for_each_domain(this_cpu, tmp) if (tmp->flags & SD_SHARE_CPUPOWER) @@ -2863,6 +2881,12 @@ static inline int dependent_sleeper(int !TASK_PREEMPTS_CURR(p, smt_rq) && smt_slice(smt_curr, sd) > task_timeslice(p)) ret = 1; + if (ret) { + trace_special_pid(smt_curr->pid, smt_curr->prio, + smt_curr->static_prio); + trace_special_pid(p->pid, p->prio, p->static_prio); + stop_sched_switched = 1; + } check_smt_task: if ((!smt_curr->mm && smt_curr != smt_rq->idle) || @@ -2893,6 +2917,9 @@ check_smt_task: out_unlock: for_each_cpu_mask(i, sibling_map) spin_unlock(&cpu_rq(i)->lock); + if (p && stop_sched_switched) + trace_stop_sched_switched(p); + return ret; } #else @@ -2906,35 +2933,44 @@ static inline int dependent_sleeper(int } #endif -#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) +#if defined(CONFIG_LATENCY_TRACE) && defined(CONFIG_DEBUG_MUTEXES) -void fastcall add_preempt_count(int val) +static void trace_array(prio_array_t *array) { - /* - * Underflow? - */ - BUG_ON((preempt_count() < 0)); - preempt_count() += val; - /* - * Spinlock count overflowing soon? - */ - BUG_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); + int i; + task_t *p; + struct list_head *head, *tmp; + + for (i = 0; i < MAX_PRIO; i++) { + head = array->queue + i; + if (list_empty(head)) { + WARN_ON(test_bit(i, array->bitmap)); + continue; + } + WARN_ON(!test_bit(i, array->bitmap)); + list_for_each(tmp, head) { + p = list_entry(tmp, task_t, run_list); + trace_special_pid(p->pid, p->prio, + p->policy == SCHED_NORMAL ? + p->static_prio : + (MAX_RT_PRIO-1) - p->rt_priority); + } + } } -EXPORT_SYMBOL(add_preempt_count); -void fastcall sub_preempt_count(int val) +static inline void trace_all_runnable_tasks(runqueue_t *rq) +{ + if (trace_enabled) { + trace_array(rq->active); + trace_array(rq->expired); + } +} + +#else + +static inline void trace_all_runnable_tasks(runqueue_t *rq) { - /* - * Underflow? - */ - BUG_ON(val > preempt_count()); - /* - * Is the spinlock portion underflowing? - */ - BUG_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK)); - preempt_count() -= val; } -EXPORT_SYMBOL(sub_preempt_count); #endif @@ -2958,6 +2994,7 @@ asmlinkage void __sched schedule(void) * Otherwise, whine if we are scheduling when we should not be. */ if (likely(!current->exit_state)) { + stop_trace(); if (unlikely(in_atomic())) { printk(KERN_ERR "scheduling while atomic: " "%s/0x%08x/%d\n", @@ -3097,6 +3134,8 @@ switch_tasks: prev->sleep_avg = 0; prev->timestamp = prev->last_ran = now; + trace_all_runnable_tasks(rq); + sched_info_switch(prev, next); if (likely(prev != next)) { next->timestamp = now; @@ -3107,14 +3146,17 @@ switch_tasks: prepare_task_switch(rq, next); prev = context_switch(rq, prev, next); barrier(); + trace_special_pid(prev->pid, prev->prio, current->prio); /* * this_rq must be evaluated again because prev may have moved * CPUs since it called schedule(), thus the 'rq' on its stack * frame will be invalid. */ finish_task_switch(this_rq(), prev); - } else + } else { spin_unlock_irq(&rq->lock); + trace_stop_sched_switched(next); + } prev = current; if (unlikely(reacquire_kernel_lock(prev) < 0)) @@ -5660,6 +5702,7 @@ void __might_sleep(char *file, int line) if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) return; prev_jiffy = jiffies; + stop_trace(); printk(KERN_ERR "Debug: sleeping function called from invalid" " context at %s:%d\n", file, line); printk("in_atomic():%d, irqs_disabled():%d\n", Index: linux-latency-tracing.q/kernel/sysctl.c =================================================================== --- linux-latency-tracing.q.orig/kernel/sysctl.c +++ linux-latency-tracing.q/kernel/sysctl.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include @@ -278,6 +279,82 @@ static ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, +#ifdef CONFIG_WAKEUP_TIMING + { + .ctl_name = KERN_PANIC, + .procname = "wakeup_timing", + .data = &wakeup_timing, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#ifdef CONFIG_LATENCY_TRACE + { + .ctl_name = KERN_PANIC, + .procname = "trace_enabled", + .data = &trace_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_PANIC, + .procname = "mcount_enabled", + .data = &mcount_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_PANIC, + .procname = "trace_user_triggered", + .data = &trace_user_triggered, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_PANIC, + .procname = "trace_user_trigger_irq", + .data = &trace_user_trigger_irq, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_PANIC, + .procname = "trace_freerunning", + .data = &trace_freerunning, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_PANIC, + .procname = "trace_print_at_crash", + .data = &trace_print_at_crash, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_PANIC, + .procname = "trace_verbose", + .data = &trace_verbose, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_PANIC, + .procname = "trace_all_cpus", + .data = &trace_all_cpus, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif { .ctl_name = KERN_CORE_USES_PID, .procname = "core_uses_pid", Index: linux-latency-tracing.q/kernel/time.c =================================================================== --- linux-latency-tracing.q.orig/kernel/time.c +++ linux-latency-tracing.q/kernel/time.c @@ -97,8 +97,23 @@ asmlinkage long sys_stime(time_t __user #endif /* __ARCH_WANT_SYS_TIME */ +int timeofday_API_hacks(void *tv, void *tz) +{ +#ifdef CONFIG_LATENCY_TRACE + if (!tv && ((long)tz == 1)) + return user_trace_start(); + if (!tv && !tz) + return user_trace_stop(); +#endif + return 1; +} + asmlinkage long sys_gettimeofday(struct timeval __user *tv, struct timezone __user *tz) { + int ret = timeofday_API_hacks(tv, tz); + if (ret != 1) + return ret; + if (likely(tv != NULL)) { struct timeval ktv; do_gettimeofday(&ktv); @@ -184,6 +199,10 @@ asmlinkage long sys_settimeofday(struct struct timespec new_ts; struct timezone new_tz; + int ret = timeofday_API_hacks(tv, tz); + if (ret != 1) + return ret; + if (tv) { if (copy_from_user(&user_tv, tv, sizeof(*tv))) return -EFAULT; Index: linux-latency-tracing.q/lib/Kconfig.debug =================================================================== --- linux-latency-tracing.q.orig/lib/Kconfig.debug +++ linux-latency-tracing.q/lib/Kconfig.debug @@ -111,6 +111,146 @@ config DEBUG_SPINLOCK_SLEEP If you say Y here, various routines which may sleep will become very noisy if they are called with a spinlock held. +config WAKEUP_TIMING + bool "Wakeup latency timing" + default y + help + This option measures the time spent from a highprio thread being + woken up to it getting scheduled on a CPU, with microsecond + accuracy. + + The default measurement method is a maximum search, which is + disabled by default and can be runtime (re-)started via: + + echo 0 > /proc/sys/kernel/preempt_max_latency + +config WAKEUP_LATENCY_HIST + bool "wakeup latency histogram" + default n + depends on WAKEUP_TIMING + help + This option logs all the wakeup latency timing to a big histogram + bucket, in the meanwhile, it also dummies up printk produced by + wakeup latency timing. + + The wakeup latency timing histogram can be viewed via: + + cat /proc/latency_hist/wakeup_latency/CPU* + + (Note: * presents CPU ID.) + +config PREEMPT_TRACE + bool + default y + depends on DEBUG_PREEMPT + +config CRITICAL_PREEMPT_TIMING + bool "Non-preemptible critical section latency timing" + default n + depends on PREEMPT + help + This option measures the time spent in preempt-off critical + sections, with microsecond accuracy. + + The default measurement method is a maximum search, which is + disabled by default and can be runtime (re-)started via: + + echo 0 > /proc/sys/kernel/preempt_max_latency + + (Note that kernel size and overhead increases with this option + enabled. This option and the irqs-off timing option can be + used together or separately.) + +config PREEMPT_OFF_HIST + bool "non-preemptible critical section latency histogram" + default n + depends on CRITICAL_PREEMPT_TIMING + help + This option logs all the non-preemptible critical section latency + timing to a big histogram bucket, in the meanwhile, it also + dummies up printk produced by non-preemptible critical section + latency timing. + + The non-preemptible critical section latency timing histogram can + be viewed via: + + cat /proc/latency_hist/preempt_off_latency/CPU* + + (Note: * presents CPU ID.) + +config CRITICAL_IRQSOFF_TIMING + bool "Interrupts-off critical section latency timing" + default n + help + This option measures the time spent in irqs-off critical + sections, with microsecond accuracy. + + The default measurement method is a maximum search, which is + disabled by default and can be runtime (re-)started via: + + echo 0 > /proc/sys/kernel/preempt_max_latency + + (Note that kernel size and overhead increases with this option + enabled. This option and the preempt-off timing option can be + used together or separately.) + +config INTERRUPT_OFF_HIST + bool "interrupts-off critical section latency histogram" + default n + depends on CRITICAL_IRQSOFF_TIMING + help + This option logs all the interrupts-off critical section latency + timing to a big histogram bucket, in the meanwhile, it also + dummies up printk produced by interrupts-off critical section + latency timing. + + The interrupts-off critical section latency timing histogram can + be viewed via: + + cat /proc/latency_hist/interrupt_off_latency/CPU* + + (Note: * presents CPU ID.) + +config CRITICAL_TIMING + bool + default y + depends on CRITICAL_PREEMPT_TIMING || CRITICAL_IRQSOFF_TIMING + +config LATENCY_TIMING + bool + default y + depends on WAKEUP_TIMING || CRITICAL_TIMING + +config CRITICAL_LATENCY_HIST + bool + default y + depends on PREEMPT_OFF_HIST || INTERRUPT_OFF_HIST + +config LATENCY_HIST + bool + default y + depends on WAKEUP_LATENCY_HIST || CRITICAL_LATENCY_HIST + +config LATENCY_TRACE + bool "Latency tracing" + default n + depends on LATENCY_TIMING + help + This option enables a kernel tracing mechanism that will track + precise function-call granularity kernel execution during + wakeup paths or critical sections. When this option is enabled + then the last maximum latency timing event's full trace can be + found in /proc/latency_trace, in a human-readable (or rather as + some would say, in a kernel-developer-readable) form. + + (Note that kernel size and overhead increases noticeably + with this option enabled.) + +config MCOUNT + bool + depends on LATENCY_TRACE + default y + config DEBUG_KOBJECT bool "kobject debugging" depends on DEBUG_KERNEL @@ -176,9 +316,9 @@ config DEBUG_VM If unsure, say N. -config FRAME_POINTER +config USE_FRAME_POINTER bool "Compile the kernel with frame pointers" - depends on DEBUG_KERNEL && (X86 || CRIS || M68K || M68KNOMMU || FRV || UML) + depends on DEBUG_KERNEL && !MCOUNT && (X86 || CRIS || M68K || M68KNOMMU || FRV || UML) default y if DEBUG_INFO && UML help If you say Y here the resulting kernel image will be slightly larger @@ -186,6 +326,11 @@ config FRAME_POINTER some architectures or if you use external debuggers. If you don't debug the kernel, you can say N. +config FRAME_POINTER + bool + depends on USE_FRAME_POINTER || MCOUNT + default y + config RCU_TORTURE_TEST tristate "torture tests for RCU" depends on DEBUG_KERNEL