diff --git a/Documentation/device-mapper/thin-provisioning.txt b/Documentation/device-mapper/thin-provisioning.txt index 3370bc4..f5cfc62 100644 --- a/Documentation/device-mapper/thin-provisioning.txt +++ b/Documentation/device-mapper/thin-provisioning.txt @@ -287,6 +287,17 @@ iii) Messages the current transaction id is when you change it with this compare-and-swap message. + reserve_metadata_snap + + Reserve a copy of the data mapping btree for use by userland. + This allows userland to inspect the mappings as they were when + this message was executed. Use the pool's status command to + get the root block associated with the metadata snapshot. + + release_metadata_snap + + Release a previously reserved copy of the data mapping btree. + 'thin' target ------------- diff --git a/Documentation/x86/efi-stub.txt b/Documentation/x86/efi-stub.txt new file mode 100644 index 0000000..44e6bb6 --- /dev/null +++ b/Documentation/x86/efi-stub.txt @@ -0,0 +1,65 @@ + The EFI Boot Stub + --------------------------- + +On the x86 platform, a bzImage can masquerade as a PE/COFF image, +thereby convincing EFI firmware loaders to load it as an EFI +executable. The code that modifies the bzImage header, along with the +EFI-specific entry point that the firmware loader jumps to are +collectively known as the "EFI boot stub", and live in +arch/x86/boot/header.S and arch/x86/boot/compressed/eboot.c, +respectively. + +By using the EFI boot stub it's possible to boot a Linux kernel +without the use of a conventional EFI boot loader, such as grub or +elilo. Since the EFI boot stub performs the jobs of a boot loader, in +a certain sense it *IS* the boot loader. + +The EFI boot stub is enabled with the CONFIG_EFI_STUB kernel option. + + +**** How to install bzImage.efi + +The bzImage located in arch/x86/boot/bzImage must be copied to the EFI +System Partiion (ESP) and renamed with the extension ".efi". Without +the extension the EFI firmware loader will refuse to execute it. It's +not possible to execute bzImage.efi from the usual Linux file systems +because EFI firmware doesn't have support for them. + + +**** Passing kernel parameters from the EFI shell + +Arguments to the kernel can be passed after bzImage.efi, e.g. + + fs0:> bzImage.efi console=ttyS0 root=/dev/sda4 + + +**** The "initrd=" option + +Like most boot loaders, the EFI stub allows the user to specify +multiple initrd files using the "initrd=" option. This is the only EFI +stub-specific command line parameter, everything else is passed to the +kernel when it boots. + +The path to the initrd file must be an absolute path from the +beginning of the ESP, relative path names do not work. Also, the path +is an EFI-style path and directory elements must be separated with +backslashes (\). For example, given the following directory layout, + +fs0:> + Kernels\ + bzImage.efi + initrd-large.img + + Ramdisks\ + initrd-small.img + initrd-medium.img + +to boot with the initrd-large.img file if the current working +directory is fs0:\Kernels, the following command must be used, + + fs0:\Kernels> bzImage.efi initrd=\Kernels\initrd-large.img + +Notice how bzImage.efi can be specified with a relative path. That's +because the image we're executing is interpreted by the EFI shell, +which understands relative paths, whereas the rest of the command line +is passed to bzImage.efi. diff --git a/Makefile b/Makefile index dda21c3..0d718ed 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 3 -PATCHLEVEL = 4 +PATCHLEVEL = 5 SUBLEVEL = 0 -EXTRAVERSION = +EXTRAVERSION = -rc1 NAME = Saber-toothed Squirrel # *DOCUMENTATION* diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d700811..c70684f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1506,6 +1506,8 @@ config EFI_STUB This kernel feature allows a bzImage to be loaded directly by EFI firmware without the use of a bootloader. + See Documentation/x86/efi-stub.txt for more information. + config SECCOMP def_bool y prompt "Enable seccomp to safely compute untrusted bytecode" diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index 2c14e76..4e85f5f 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -16,6 +16,26 @@ static efi_system_table_t *sys_table; +static void efi_printk(char *str) +{ + char *s8; + + for (s8 = str; *s8; s8++) { + struct efi_simple_text_output_protocol *out; + efi_char16_t ch[2] = { 0 }; + + ch[0] = *s8; + out = (struct efi_simple_text_output_protocol *)sys_table->con_out; + + if (*s8 == '\n') { + efi_char16_t nl[2] = { '\r', 0 }; + efi_call_phys2(out->output_string, out, nl); + } + + efi_call_phys2(out->output_string, out, ch); + } +} + static efi_status_t __get_map(efi_memory_desc_t **map, unsigned long *map_size, unsigned long *desc_size) { @@ -531,8 +551,10 @@ static efi_status_t handle_ramdisks(efi_loaded_image_t *image, EFI_LOADER_DATA, nr_initrds * sizeof(*initrds), &initrds); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk("Failed to alloc mem for initrds\n"); goto fail; + } str = (char *)(unsigned long)hdr->cmd_line_ptr; for (i = 0; i < nr_initrds; i++) { @@ -575,32 +597,42 @@ static efi_status_t handle_ramdisks(efi_loaded_image_t *image, status = efi_call_phys3(boottime->handle_protocol, image->device_handle, &fs_proto, &io); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk("Failed to handle fs_proto\n"); goto free_initrds; + } status = efi_call_phys2(io->open_volume, io, &fh); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk("Failed to open volume\n"); goto free_initrds; + } } status = efi_call_phys5(fh->open, fh, &h, filename_16, EFI_FILE_MODE_READ, (u64)0); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk("Failed to open initrd file\n"); goto close_handles; + } initrd->handle = h; info_sz = 0; status = efi_call_phys4(h->get_info, h, &info_guid, &info_sz, NULL); - if (status != EFI_BUFFER_TOO_SMALL) + if (status != EFI_BUFFER_TOO_SMALL) { + efi_printk("Failed to get initrd info size\n"); goto close_handles; + } grow: status = efi_call_phys3(sys_table->boottime->allocate_pool, EFI_LOADER_DATA, info_sz, &info); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk("Failed to alloc mem for initrd info\n"); goto close_handles; + } status = efi_call_phys4(h->get_info, h, &info_guid, &info_sz, info); @@ -612,8 +644,10 @@ grow: file_sz = info->file_size; efi_call_phys1(sys_table->boottime->free_pool, info); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk("Failed to get initrd info\n"); goto close_handles; + } initrd->size = file_sz; initrd_total += file_sz; @@ -629,11 +663,14 @@ grow: */ status = high_alloc(initrd_total, 0x1000, &initrd_addr, hdr->initrd_addr_max); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk("Failed to alloc highmem for initrds\n"); goto close_handles; + } /* We've run out of free low memory. */ if (initrd_addr > hdr->initrd_addr_max) { + efi_printk("We've run out of free low memory\n"); status = EFI_INVALID_PARAMETER; goto free_initrd_total; } @@ -652,8 +689,10 @@ grow: status = efi_call_phys3(fh->read, initrds[j].handle, &chunksize, addr); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk("Failed to read initrd\n"); goto free_initrd_total; + } addr += chunksize; size -= chunksize; } @@ -674,7 +713,7 @@ free_initrd_total: low_free(initrd_total, initrd_addr); close_handles: - for (k = j; k < nr_initrds; k++) + for (k = j; k < i; k++) efi_call_phys1(fh->close, initrds[k].handle); free_initrds: efi_call_phys1(sys_table->boottime->free_pool, initrds); @@ -732,8 +771,10 @@ static efi_status_t make_boot_params(struct boot_params *boot_params, options_size++; /* NUL termination */ status = low_alloc(options_size, 1, &cmdline); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk("Failed to alloc mem for cmdline\n"); goto fail; + } s1 = (u8 *)(unsigned long)cmdline; s2 = (u16 *)options; @@ -895,12 +936,16 @@ struct boot_params *efi_main(void *handle, efi_system_table_t *_table) status = efi_call_phys3(sys_table->boottime->handle_protocol, handle, &proto, (void *)&image); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk("Failed to get handle for LOADED_IMAGE_PROTOCOL\n"); goto fail; + } status = low_alloc(0x4000, 1, (unsigned long *)&boot_params); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk("Failed to alloc lowmem for boot params\n"); goto fail; + } memset(boot_params, 0x0, 0x4000); @@ -933,8 +978,10 @@ struct boot_params *efi_main(void *handle, efi_system_table_t *_table) if (status != EFI_SUCCESS) { status = low_alloc(hdr->init_size, hdr->kernel_alignment, &start); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk("Failed to alloc mem for kernel\n"); goto fail; + } } hdr->code32_start = (__u32)start; @@ -945,19 +992,25 @@ struct boot_params *efi_main(void *handle, efi_system_table_t *_table) status = efi_call_phys3(sys_table->boottime->allocate_pool, EFI_LOADER_DATA, sizeof(*gdt), (void **)&gdt); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk("Failed to alloc mem for gdt structure\n"); goto fail; + } gdt->size = 0x800; status = low_alloc(gdt->size, 8, (unsigned long *)&gdt->address); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk("Failed to alloc mem for gdt\n"); goto fail; + } status = efi_call_phys3(sys_table->boottime->allocate_pool, EFI_LOADER_DATA, sizeof(*idt), (void **)&idt); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk("Failed to alloc mem for idt structure\n"); goto fail; + } idt->size = 0; idt->address = 0; diff --git a/arch/x86/boot/compressed/eboot.h b/arch/x86/boot/compressed/eboot.h index 3925166..3b6e156 100644 --- a/arch/x86/boot/compressed/eboot.h +++ b/arch/x86/boot/compressed/eboot.h @@ -58,4 +58,10 @@ struct efi_uga_draw_protocol { void *blt; }; +struct efi_simple_text_output_protocol { + void *reset; + void *output_string; + void *test_string; +}; + #endif /* BOOT_COMPRESSED_EBOOT_H */ diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 18d9005..b0767bc 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -34,7 +34,7 @@ #ifndef __ASSEMBLY__ extern void mcount(void); -extern int modifying_ftrace_code; +extern atomic_t modifying_ftrace_code; static inline unsigned long ftrace_call_adjust(unsigned long addr) { diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 82f29e7..6b9333b 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1101,14 +1101,20 @@ int is_debug_stack(unsigned long addr) addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ)); } +static DEFINE_PER_CPU(u32, debug_stack_use_ctr); + void debug_stack_set_zero(void) { + this_cpu_inc(debug_stack_use_ctr); load_idt((const struct desc_ptr *)&nmi_idt_descr); } void debug_stack_reset(void) { - load_idt((const struct desc_ptr *)&idt_descr); + if (WARN_ON(!this_cpu_read(debug_stack_use_ctr))) + return; + if (this_cpu_dec_return(debug_stack_use_ctr) == 0) + load_idt((const struct desc_ptr *)&idt_descr); } #else /* CONFIG_X86_64 */ diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 320852d..7d65133 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -191,6 +191,44 @@ ENDPROC(native_usergs_sysret64) .endm /* + * When dynamic function tracer is enabled it will add a breakpoint + * to all locations that it is about to modify, sync CPUs, update + * all the code, sync CPUs, then remove the breakpoints. In this time + * if lockdep is enabled, it might jump back into the debug handler + * outside the updating of the IST protection. (TRACE_IRQS_ON/OFF). + * + * We need to change the IDT table before calling TRACE_IRQS_ON/OFF to + * make sure the stack pointer does not get reset back to the top + * of the debug stack, and instead just reuses the current stack. + */ +#if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS) + +.macro TRACE_IRQS_OFF_DEBUG + call debug_stack_set_zero + TRACE_IRQS_OFF + call debug_stack_reset +.endm + +.macro TRACE_IRQS_ON_DEBUG + call debug_stack_set_zero + TRACE_IRQS_ON + call debug_stack_reset +.endm + +.macro TRACE_IRQS_IRETQ_DEBUG offset=ARGOFFSET + bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */ + jnc 1f + TRACE_IRQS_ON_DEBUG +1: +.endm + +#else +# define TRACE_IRQS_OFF_DEBUG TRACE_IRQS_OFF +# define TRACE_IRQS_ON_DEBUG TRACE_IRQS_ON +# define TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ +#endif + +/* * C code is not supposed to know about undefined top of stack. Every time * a C function with an pt_regs argument is called from the SYSCALL based * fast path FIXUP_TOP_OF_STACK is needed. @@ -1098,7 +1136,7 @@ ENTRY(\sym) subq $ORIG_RAX-R15, %rsp CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 call save_paranoid - TRACE_IRQS_OFF + TRACE_IRQS_OFF_DEBUG movq %rsp,%rdi /* pt_regs pointer */ xorl %esi,%esi /* no error code */ subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist) @@ -1393,7 +1431,7 @@ paranoidzeroentry machine_check *machine_check_vector(%rip) ENTRY(paranoid_exit) DEFAULT_FRAME DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF + TRACE_IRQS_OFF_DEBUG testl %ebx,%ebx /* swapgs needed? */ jnz paranoid_restore testl $3,CS(%rsp) @@ -1404,7 +1442,7 @@ paranoid_swapgs: RESTORE_ALL 8 jmp irq_return paranoid_restore: - TRACE_IRQS_IRETQ 0 + TRACE_IRQS_IRETQ_DEBUG 0 RESTORE_ALL 8 jmp irq_return paranoid_userspace: diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 32ff365..c3a7cb4 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -100,7 +100,7 @@ static const unsigned char *ftrace_nop_replace(void) } static int -ftrace_modify_code(unsigned long ip, unsigned const char *old_code, +ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code, unsigned const char *new_code) { unsigned char replaced[MCOUNT_INSN_SIZE]; @@ -141,7 +141,20 @@ int ftrace_make_nop(struct module *mod, old = ftrace_call_replace(ip, addr); new = ftrace_nop_replace(); - return ftrace_modify_code(rec->ip, old, new); + /* + * On boot up, and when modules are loaded, the MCOUNT_ADDR + * is converted to a nop, and will never become MCOUNT_ADDR + * again. This code is either running before SMP (on boot up) + * or before the code will ever be executed (module load). + * We do not want to use the breakpoint version in this case, + * just modify the code directly. + */ + if (addr == MCOUNT_ADDR) + return ftrace_modify_code_direct(rec->ip, old, new); + + /* Normal cases use add_brk_on_nop */ + WARN_ONCE(1, "invalid use of ftrace_make_nop"); + return -EINVAL; } int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) @@ -152,9 +165,47 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) old = ftrace_nop_replace(); new = ftrace_call_replace(ip, addr); - return ftrace_modify_code(rec->ip, old, new); + /* Should only be called when module is loaded */ + return ftrace_modify_code_direct(rec->ip, old, new); } +/* + * The modifying_ftrace_code is used to tell the breakpoint + * handler to call ftrace_int3_handler(). If it fails to + * call this handler for a breakpoint added by ftrace, then + * the kernel may crash. + * + * As atomic_writes on x86 do not need a barrier, we do not + * need to add smp_mb()s for this to work. It is also considered + * that we can not read the modifying_ftrace_code before + * executing the breakpoint. That would be quite remarkable if + * it could do that. Here's the flow that is required: + * + * CPU-0 CPU-1 + * + * atomic_inc(mfc); + * write int3s + * // implicit (r)mb + * if (atomic_read(mfc)) + * call ftrace_int3_handler() + * + * Then when we are finished: + * + * atomic_dec(mfc); + * + * If we hit a breakpoint that was not set by ftrace, it does not + * matter if ftrace_int3_handler() is called or not. It will + * simply be ignored. But it is crucial that a ftrace nop/caller + * breakpoint is handled. No other user should ever place a + * breakpoint on an ftrace nop/caller location. It must only + * be done by this code. + */ +atomic_t modifying_ftrace_code __read_mostly; + +static int +ftrace_modify_code(unsigned long ip, unsigned const char *old_code, + unsigned const char *new_code); + int ftrace_update_ftrace_func(ftrace_func_t func) { unsigned long ip = (unsigned long)(&ftrace_call); @@ -163,13 +214,17 @@ int ftrace_update_ftrace_func(ftrace_func_t func) memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE); new = ftrace_call_replace(ip, (unsigned long)func); + + /* See comment above by declaration of modifying_ftrace_code */ + atomic_inc(&modifying_ftrace_code); + ret = ftrace_modify_code(ip, old, new); + atomic_dec(&modifying_ftrace_code); + return ret; } -int modifying_ftrace_code __read_mostly; - /* * A breakpoint was added to the code address we are about to * modify, and this is the handle that will just skip over it. @@ -489,13 +544,46 @@ void ftrace_replace_code(int enable) } } +static int +ftrace_modify_code(unsigned long ip, unsigned const char *old_code, + unsigned const char *new_code) +{ + int ret; + + ret = add_break(ip, old_code); + if (ret) + goto out; + + run_sync(); + + ret = add_update_code(ip, new_code); + if (ret) + goto fail_update; + + run_sync(); + + ret = ftrace_write(ip, new_code, 1); + if (ret) { + ret = -EPERM; + goto out; + } + run_sync(); + out: + return ret; + + fail_update: + probe_kernel_write((void *)ip, &old_code[0], 1); + goto out; +} + void arch_ftrace_update_code(int command) { - modifying_ftrace_code++; + /* See comment above by declaration of modifying_ftrace_code */ + atomic_inc(&modifying_ftrace_code); ftrace_modify_all_code(command); - modifying_ftrace_code--; + atomic_dec(&modifying_ftrace_code); } int __init ftrace_dyn_arch_init(void *data) diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 9087527..a0b2f84 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -444,14 +444,16 @@ static inline void nmi_nesting_preprocess(struct pt_regs *regs) */ if (unlikely(is_debug_stack(regs->sp))) { debug_stack_set_zero(); - __get_cpu_var(update_debug_stack) = 1; + this_cpu_write(update_debug_stack, 1); } } static inline void nmi_nesting_postprocess(void) { - if (unlikely(__get_cpu_var(update_debug_stack))) + if (unlikely(this_cpu_read(update_debug_stack))) { debug_stack_reset(); + this_cpu_write(update_debug_stack, 0); + } } #endif diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 13b1990..c4c6a5c 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1211,12 +1211,6 @@ static long x32_arch_ptrace(struct task_struct *child, 0, sizeof(struct user_i387_struct), datap); - /* normal 64bit interface to access TLS data. - Works just like arch_prctl, except that the arguments - are reversed. */ - case PTRACE_ARCH_PRCTL: - return do_arch_prctl(child, data, addr); - default: return compat_ptrace_request(child, request, addr, data); } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ff08457..05b31d9 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -303,8 +303,12 @@ gp_in_kernel: dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_code) { #ifdef CONFIG_DYNAMIC_FTRACE - /* ftrace must be first, everything else may cause a recursive crash */ - if (unlikely(modifying_ftrace_code) && ftrace_int3_handler(regs)) + /* + * ftrace must be first, everything else may cause a recursive crash. + * See note by declaration of modifying_ftrace_code in ftrace.c + */ + if (unlikely(atomic_read(&modifying_ftrace_code)) && + ftrace_int3_handler(regs)) return; #endif #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 754f38f..638dae0 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -61,11 +62,11 @@ struct multipath { struct list_head list; struct dm_target *ti; - spinlock_t lock; - const char *hw_handler_name; char *hw_handler_params; + spinlock_t lock; + unsigned nr_priority_groups; struct list_head priority_groups; @@ -81,16 +82,17 @@ struct multipath { struct priority_group *next_pg; /* Switch to this PG if set */ unsigned repeat_count; /* I/Os left before calling PS again */ - unsigned queue_io; /* Must we queue all I/O? */ - unsigned queue_if_no_path; /* Queue I/O if last path fails? */ - unsigned saved_queue_if_no_path;/* Saved state during suspension */ + unsigned queue_io:1; /* Must we queue all I/O? */ + unsigned queue_if_no_path:1; /* Queue I/O if last path fails? */ + unsigned saved_queue_if_no_path:1; /* Saved state during suspension */ + unsigned pg_init_retries; /* Number of times to retry pg_init */ unsigned pg_init_count; /* Number of times pg_init called */ unsigned pg_init_delay_msecs; /* Number of msecs before pg_init retry */ + unsigned queue_size; struct work_struct process_queued_ios; struct list_head queued_ios; - unsigned queue_size; struct work_struct trigger_event; @@ -328,14 +330,18 @@ static void __choose_pgpath(struct multipath *m, size_t nr_bytes) /* * Loop through priority groups until we find a valid path. * First time we skip PGs marked 'bypassed'. - * Second time we only try the ones we skipped. + * Second time we only try the ones we skipped, but set + * pg_init_delay_retry so we do not hammer controllers. */ do { list_for_each_entry(pg, &m->priority_groups, list) { if (pg->bypassed == bypassed) continue; - if (!__choose_path_in_pg(m, pg, nr_bytes)) + if (!__choose_path_in_pg(m, pg, nr_bytes)) { + if (!bypassed) + m->pg_init_delay_retry = 1; return; + } } } while (bypassed--); @@ -481,9 +487,6 @@ static void process_queued_ios(struct work_struct *work) spin_lock_irqsave(&m->lock, flags); - if (!m->queue_size) - goto out; - if (!m->current_pgpath) __choose_pgpath(m, 0); @@ -496,7 +499,6 @@ static void process_queued_ios(struct work_struct *work) if (m->pg_init_required && !m->pg_init_in_progress && pgpath) __pg_init_all_paths(m); -out: spin_unlock_irqrestore(&m->lock, flags); if (!must_queue) dispatch_queued_ios(m); @@ -1517,11 +1519,16 @@ out: static int multipath_ioctl(struct dm_target *ti, unsigned int cmd, unsigned long arg) { - struct multipath *m = (struct multipath *) ti->private; - struct block_device *bdev = NULL; - fmode_t mode = 0; + struct multipath *m = ti->private; + struct block_device *bdev; + fmode_t mode; unsigned long flags; - int r = 0; + int r; + +again: + bdev = NULL; + mode = 0; + r = 0; spin_lock_irqsave(&m->lock, flags); @@ -1546,6 +1553,12 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd, if (!r && ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT) r = scsi_verify_blk_ioctl(NULL, cmd); + if (r == -EAGAIN && !fatal_signal_pending(current)) { + queue_work(kmultipathd, &m->process_queued_ios); + msleep(10); + goto again; + } + return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg); } @@ -1643,7 +1656,7 @@ out: *---------------------------------------------------------------*/ static struct target_type multipath_target = { .name = "multipath", - .version = {1, 3, 0}, + .version = {1, 4, 0}, .module = THIS_MODULE, .ctr = multipath_ctr, .dtr = multipath_dtr, diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 737d388..3e2907f 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -1082,12 +1082,89 @@ int dm_pool_get_metadata_transaction_id(struct dm_pool_metadata *pmd, return 0; } -static int __get_held_metadata_root(struct dm_pool_metadata *pmd, - dm_block_t *result) +static int __reserve_metadata_snap(struct dm_pool_metadata *pmd) +{ + int r, inc; + struct thin_disk_superblock *disk_super; + struct dm_block *copy, *sblock; + dm_block_t held_root; + + /* + * Copy the superblock. + */ + dm_sm_inc_block(pmd->metadata_sm, THIN_SUPERBLOCK_LOCATION); + r = dm_tm_shadow_block(pmd->tm, THIN_SUPERBLOCK_LOCATION, + &sb_validator, ©, &inc); + if (r) + return r; + + BUG_ON(!inc); + + held_root = dm_block_location(copy); + disk_super = dm_block_data(copy); + + if (le64_to_cpu(disk_super->held_root)) { + DMWARN("Pool metadata snapshot already exists: release this before taking another."); + + dm_tm_dec(pmd->tm, held_root); + dm_tm_unlock(pmd->tm, copy); + pmd->need_commit = 1; + + return -EBUSY; + } + + /* + * Wipe the spacemap since we're not publishing this. + */ + memset(&disk_super->data_space_map_root, 0, + sizeof(disk_super->data_space_map_root)); + memset(&disk_super->metadata_space_map_root, 0, + sizeof(disk_super->metadata_space_map_root)); + + /* + * Increment the data structures that need to be preserved. + */ + dm_tm_inc(pmd->tm, le64_to_cpu(disk_super->data_mapping_root)); + dm_tm_inc(pmd->tm, le64_to_cpu(disk_super->device_details_root)); + dm_tm_unlock(pmd->tm, copy); + + /* + * Write the held root into the superblock. + */ + r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, + &sb_validator, &sblock); + if (r) { + dm_tm_dec(pmd->tm, held_root); + pmd->need_commit = 1; + return r; + } + + disk_super = dm_block_data(sblock); + disk_super->held_root = cpu_to_le64(held_root); + dm_bm_unlock(sblock); + + pmd->need_commit = 1; + + return 0; +} + +int dm_pool_reserve_metadata_snap(struct dm_pool_metadata *pmd) +{ + int r; + + down_write(&pmd->root_lock); + r = __reserve_metadata_snap(pmd); + up_write(&pmd->root_lock); + + return r; +} + +static int __release_metadata_snap(struct dm_pool_metadata *pmd) { int r; struct thin_disk_superblock *disk_super; - struct dm_block *sblock; + struct dm_block *sblock, *copy; + dm_block_t held_root; r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, &sb_validator, &sblock); @@ -1095,18 +1172,65 @@ static int __get_held_metadata_root(struct dm_pool_metadata *pmd, return r; disk_super = dm_block_data(sblock); + held_root = le64_to_cpu(disk_super->held_root); + disk_super->held_root = cpu_to_le64(0); + pmd->need_commit = 1; + + dm_bm_unlock(sblock); + + if (!held_root) { + DMWARN("No pool metadata snapshot found: nothing to release."); + return -EINVAL; + } + + r = dm_tm_read_lock(pmd->tm, held_root, &sb_validator, ©); + if (r) + return r; + + disk_super = dm_block_data(copy); + dm_sm_dec_block(pmd->metadata_sm, le64_to_cpu(disk_super->data_mapping_root)); + dm_sm_dec_block(pmd->metadata_sm, le64_to_cpu(disk_super->device_details_root)); + dm_sm_dec_block(pmd->metadata_sm, held_root); + + return dm_tm_unlock(pmd->tm, copy); +} + +int dm_pool_release_metadata_snap(struct dm_pool_metadata *pmd) +{ + int r; + + down_write(&pmd->root_lock); + r = __release_metadata_snap(pmd); + up_write(&pmd->root_lock); + + return r; +} + +static int __get_metadata_snap(struct dm_pool_metadata *pmd, + dm_block_t *result) +{ + int r; + struct thin_disk_superblock *disk_super; + struct dm_block *sblock; + + r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, + &sb_validator, &sblock); + if (r) + return r; + + disk_super = dm_block_data(sblock); *result = le64_to_cpu(disk_super->held_root); return dm_bm_unlock(sblock); } -int dm_pool_get_held_metadata_root(struct dm_pool_metadata *pmd, - dm_block_t *result) +int dm_pool_get_metadata_snap(struct dm_pool_metadata *pmd, + dm_block_t *result) { int r; down_read(&pmd->root_lock); - r = __get_held_metadata_root(pmd, result); + r = __get_metadata_snap(pmd, result); up_read(&pmd->root_lock); return r; diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h index ed4725e..b88918c 100644 --- a/drivers/md/dm-thin-metadata.h +++ b/drivers/md/dm-thin-metadata.h @@ -90,11 +90,18 @@ int dm_pool_get_metadata_transaction_id(struct dm_pool_metadata *pmd, /* * Hold/get root for userspace transaction. + * + * The metadata snapshot is a copy of the current superblock (minus the + * space maps). Userland can access the data structures for READ + * operations only. A small performance hit is incurred by providing this + * copy of the metadata to userland due to extra copy-on-write operations + * on the metadata nodes. Release this as soon as you finish with it. */ -int dm_pool_hold_metadata_root(struct dm_pool_metadata *pmd); +int dm_pool_reserve_metadata_snap(struct dm_pool_metadata *pmd); +int dm_pool_release_metadata_snap(struct dm_pool_metadata *pmd); -int dm_pool_get_held_metadata_root(struct dm_pool_metadata *pmd, - dm_block_t *result); +int dm_pool_get_metadata_snap(struct dm_pool_metadata *pmd, + dm_block_t *result); /* * Actions on a single virtual device. diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index eb3d138..37fdaf8 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -111,7 +111,7 @@ struct cell_key { dm_block_t block; }; -struct cell { +struct dm_bio_prison_cell { struct hlist_node list; struct bio_prison *prison; struct cell_key key; @@ -141,6 +141,8 @@ static uint32_t calc_nr_buckets(unsigned nr_cells) return n; } +static struct kmem_cache *_cell_cache; + /* * @nr_cells should be the number of cells you want in use _concurrently_. * Don't confuse it with the number of distinct keys. @@ -157,8 +159,7 @@ static struct bio_prison *prison_create(unsigned nr_cells) return NULL; spin_lock_init(&prison->lock); - prison->cell_pool = mempool_create_kmalloc_pool(nr_cells, - sizeof(struct cell)); + prison->cell_pool = mempool_create_slab_pool(nr_cells, _cell_cache); if (!prison->cell_pool) { kfree(prison); return NULL; @@ -194,10 +195,10 @@ static int keys_equal(struct cell_key *lhs, struct cell_key *rhs) (lhs->block == rhs->block); } -static struct cell *__search_bucket(struct hlist_head *bucket, - struct cell_key *key) +static struct dm_bio_prison_cell *__search_bucket(struct hlist_head *bucket, + struct cell_key *key) { - struct cell *cell; + struct dm_bio_prison_cell *cell; struct hlist_node *tmp; hlist_for_each_entry(cell, tmp, bucket, list) @@ -214,12 +215,12 @@ static struct cell *__search_bucket(struct hlist_head *bucket, * Returns 1 if the cell was already held, 0 if @inmate is the new holder. */ static int bio_detain(struct bio_prison *prison, struct cell_key *key, - struct bio *inmate, struct cell **ref) + struct bio *inmate, struct dm_bio_prison_cell **ref) { int r = 1; unsigned long flags; uint32_t hash = hash_key(prison, key); - struct cell *cell, *cell2; + struct dm_bio_prison_cell *cell, *cell2; BUG_ON(hash > prison->nr_buckets); @@ -273,7 +274,7 @@ out: /* * @inmates must have been initialised prior to this call */ -static void __cell_release(struct cell *cell, struct bio_list *inmates) +static void __cell_release(struct dm_bio_prison_cell *cell, struct bio_list *inmates) { struct bio_prison *prison = cell->prison; @@ -287,7 +288,7 @@ static void __cell_release(struct cell *cell, struct bio_list *inmates) mempool_free(cell, prison->cell_pool); } -static void cell_release(struct cell *cell, struct bio_list *bios) +static void cell_release(struct dm_bio_prison_cell *cell, struct bio_list *bios) { unsigned long flags; struct bio_prison *prison = cell->prison; @@ -303,7 +304,7 @@ static void cell_release(struct cell *cell, struct bio_list *bios) * bio may be in the cell. This function releases the cell, and also does * a sanity check. */ -static void __cell_release_singleton(struct cell *cell, struct bio *bio) +static void __cell_release_singleton(struct dm_bio_prison_cell *cell, struct bio *bio) { BUG_ON(cell->holder != bio); BUG_ON(!bio_list_empty(&cell->bios)); @@ -311,7 +312,7 @@ static void __cell_release_singleton(struct cell *cell, struct bio *bio) __cell_release(cell, NULL); } -static void cell_release_singleton(struct cell *cell, struct bio *bio) +static void cell_release_singleton(struct dm_bio_prison_cell *cell, struct bio *bio) { unsigned long flags; struct bio_prison *prison = cell->prison; @@ -324,7 +325,8 @@ static void cell_release_singleton(struct cell *cell, struct bio *bio) /* * Sometimes we don't want the holder, just the additional bios. */ -static void __cell_release_no_holder(struct cell *cell, struct bio_list *inmates) +static void __cell_release_no_holder(struct dm_bio_prison_cell *cell, + struct bio_list *inmates) { struct bio_prison *prison = cell->prison; @@ -334,7 +336,8 @@ static void __cell_release_no_holder(struct cell *cell, struct bio_list *inmates mempool_free(cell, prison->cell_pool); } -static void cell_release_no_holder(struct cell *cell, struct bio_list *inmates) +static void cell_release_no_holder(struct dm_bio_prison_cell *cell, + struct bio_list *inmates) { unsigned long flags; struct bio_prison *prison = cell->prison; @@ -344,7 +347,7 @@ static void cell_release_no_holder(struct cell *cell, struct bio_list *inmates) spin_unlock_irqrestore(&prison->lock, flags); } -static void cell_error(struct cell *cell) +static void cell_error(struct dm_bio_prison_cell *cell) { struct bio_prison *prison = cell->prison; struct bio_list bios; @@ -491,7 +494,7 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b, * also provides the interface for creating and destroying internal * devices. */ -struct new_mapping; +struct dm_thin_new_mapping; struct pool_features { unsigned zero_new_blocks:1; @@ -537,7 +540,7 @@ struct pool { struct deferred_set shared_read_ds; struct deferred_set all_io_ds; - struct new_mapping *next_mapping; + struct dm_thin_new_mapping *next_mapping; mempool_t *mapping_pool; mempool_t *endio_hook_pool; }; @@ -630,11 +633,11 @@ static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev /*----------------------------------------------------------------*/ -struct endio_hook { +struct dm_thin_endio_hook { struct thin_c *tc; struct deferred_entry *shared_read_entry; struct deferred_entry *all_io_entry; - struct new_mapping *overwrite_mapping; + struct dm_thin_new_mapping *overwrite_mapping; }; static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master) @@ -647,7 +650,8 @@ static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master) bio_list_init(master); while ((bio = bio_list_pop(&bios))) { - struct endio_hook *h = dm_get_mapinfo(bio)->ptr; + struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr; + if (h->tc == tc) bio_endio(bio, DM_ENDIO_REQUEUE); else @@ -736,7 +740,7 @@ static void wake_worker(struct pool *pool) /* * Bio endio functions. */ -struct new_mapping { +struct dm_thin_new_mapping { struct list_head list; unsigned quiesced:1; @@ -746,7 +750,7 @@ struct new_mapping { struct thin_c *tc; dm_block_t virt_block; dm_block_t data_block; - struct cell *cell, *cell2; + struct dm_bio_prison_cell *cell, *cell2; int err; /* @@ -759,7 +763,7 @@ struct new_mapping { bio_end_io_t *saved_bi_end_io; }; -static void __maybe_add_mapping(struct new_mapping *m) +static void __maybe_add_mapping(struct dm_thin_new_mapping *m) { struct pool *pool = m->tc->pool; @@ -772,7 +776,7 @@ static void __maybe_add_mapping(struct new_mapping *m) static void copy_complete(int read_err, unsigned long write_err, void *context) { unsigned long flags; - struct new_mapping *m = context; + struct dm_thin_new_mapping *m = context; struct pool *pool = m->tc->pool; m->err = read_err || write_err ? -EIO : 0; @@ -786,8 +790,8 @@ static void copy_complete(int read_err, unsigned long write_err, void *context) static void overwrite_endio(struct bio *bio, int err) { unsigned long flags; - struct endio_hook *h = dm_get_mapinfo(bio)->ptr; - struct new_mapping *m = h->overwrite_mapping; + struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr; + struct dm_thin_new_mapping *m = h->overwrite_mapping; struct pool *pool = m->tc->pool; m->err = err; @@ -811,7 +815,7 @@ static void overwrite_endio(struct bio *bio, int err) /* * This sends the bios in the cell back to the deferred_bios list. */ -static void cell_defer(struct thin_c *tc, struct cell *cell, +static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell, dm_block_t data_block) { struct pool *pool = tc->pool; @@ -828,7 +832,7 @@ static void cell_defer(struct thin_c *tc, struct cell *cell, * Same as cell_defer above, except it omits one particular detainee, * a write bio that covers the block and has already been processed. */ -static void cell_defer_except(struct thin_c *tc, struct cell *cell) +static void cell_defer_except(struct thin_c *tc, struct dm_bio_prison_cell *cell) { struct bio_list bios; struct pool *pool = tc->pool; @@ -843,7 +847,7 @@ static void cell_defer_except(struct thin_c *tc, struct cell *cell) wake_worker(pool); } -static void process_prepared_mapping(struct new_mapping *m) +static void process_prepared_mapping(struct dm_thin_new_mapping *m) { struct thin_c *tc = m->tc; struct bio *bio; @@ -886,7 +890,7 @@ static void process_prepared_mapping(struct new_mapping *m) mempool_free(m, tc->pool->mapping_pool); } -static void process_prepared_discard(struct new_mapping *m) +static void process_prepared_discard(struct dm_thin_new_mapping *m) { int r; struct thin_c *tc = m->tc; @@ -909,11 +913,11 @@ static void process_prepared_discard(struct new_mapping *m) } static void process_prepared(struct pool *pool, struct list_head *head, - void (*fn)(struct new_mapping *)) + void (*fn)(struct dm_thin_new_mapping *)) { unsigned long flags; struct list_head maps; - struct new_mapping *m, *tmp; + struct dm_thin_new_mapping *m, *tmp; INIT_LIST_HEAD(&maps); spin_lock_irqsave(&pool->lock, flags); @@ -957,9 +961,9 @@ static int ensure_next_mapping(struct pool *pool) return pool->next_mapping ? 0 : -ENOMEM; } -static struct new_mapping *get_next_mapping(struct pool *pool) +static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool) { - struct new_mapping *r = pool->next_mapping; + struct dm_thin_new_mapping *r = pool->next_mapping; BUG_ON(!pool->next_mapping); @@ -971,11 +975,11 @@ static struct new_mapping *get_next_mapping(struct pool *pool) static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, struct dm_dev *origin, dm_block_t data_origin, dm_block_t data_dest, - struct cell *cell, struct bio *bio) + struct dm_bio_prison_cell *cell, struct bio *bio) { int r; struct pool *pool = tc->pool; - struct new_mapping *m = get_next_mapping(pool); + struct dm_thin_new_mapping *m = get_next_mapping(pool); INIT_LIST_HEAD(&m->list); m->quiesced = 0; @@ -997,7 +1001,8 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, * bio immediately. Otherwise we use kcopyd to clone the data first. */ if (io_overwrites_block(pool, bio)) { - struct endio_hook *h = dm_get_mapinfo(bio)->ptr; + struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr; + h->overwrite_mapping = m; m->bio = bio; save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); @@ -1025,7 +1030,7 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block, dm_block_t data_origin, dm_block_t data_dest, - struct cell *cell, struct bio *bio) + struct dm_bio_prison_cell *cell, struct bio *bio) { schedule_copy(tc, virt_block, tc->pool_dev, data_origin, data_dest, cell, bio); @@ -1033,18 +1038,18 @@ static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block, static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block, dm_block_t data_dest, - struct cell *cell, struct bio *bio) + struct dm_bio_prison_cell *cell, struct bio *bio) { schedule_copy(tc, virt_block, tc->origin_dev, virt_block, data_dest, cell, bio); } static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, - dm_block_t data_block, struct cell *cell, + dm_block_t data_block, struct dm_bio_prison_cell *cell, struct bio *bio) { struct pool *pool = tc->pool; - struct new_mapping *m = get_next_mapping(pool); + struct dm_thin_new_mapping *m = get_next_mapping(pool); INIT_LIST_HEAD(&m->list); m->quiesced = 1; @@ -1065,12 +1070,12 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, process_prepared_mapping(m); else if (io_overwrites_block(pool, bio)) { - struct endio_hook *h = dm_get_mapinfo(bio)->ptr; + struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr; + h->overwrite_mapping = m; m->bio = bio; save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); remap_and_issue(tc, bio, data_block); - } else { int r; struct dm_io_region to; @@ -1155,7 +1160,7 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result) */ static void retry_on_resume(struct bio *bio) { - struct endio_hook *h = dm_get_mapinfo(bio)->ptr; + struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr; struct thin_c *tc = h->tc; struct pool *pool = tc->pool; unsigned long flags; @@ -1165,7 +1170,7 @@ static void retry_on_resume(struct bio *bio) spin_unlock_irqrestore(&pool->lock, flags); } -static void no_space(struct cell *cell) +static void no_space(struct dm_bio_prison_cell *cell) { struct bio *bio; struct bio_list bios; @@ -1182,11 +1187,11 @@ static void process_discard(struct thin_c *tc, struct bio *bio) int r; unsigned long flags; struct pool *pool = tc->pool; - struct cell *cell, *cell2; + struct dm_bio_prison_cell *cell, *cell2; struct cell_key key, key2; dm_block_t block = get_bio_block(tc, bio); struct dm_thin_lookup_result lookup_result; - struct new_mapping *m; + struct dm_thin_new_mapping *m; build_virtual_key(tc->td, block, &key); if (bio_detain(tc->pool->prison, &key, bio, &cell)) @@ -1263,7 +1268,7 @@ static void process_discard(struct thin_c *tc, struct bio *bio) static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, struct cell_key *key, struct dm_thin_lookup_result *lookup_result, - struct cell *cell) + struct dm_bio_prison_cell *cell) { int r; dm_block_t data_block; @@ -1290,7 +1295,7 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio, dm_block_t block, struct dm_thin_lookup_result *lookup_result) { - struct cell *cell; + struct dm_bio_prison_cell *cell; struct pool *pool = tc->pool; struct cell_key key; @@ -1305,7 +1310,7 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio, if (bio_data_dir(bio) == WRITE) break_sharing(tc, bio, block, &key, lookup_result, cell); else { - struct endio_hook *h = dm_get_mapinfo(bio)->ptr; + struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr; h->shared_read_entry = ds_inc(&pool->shared_read_ds); @@ -1315,7 +1320,7 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio, } static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block, - struct cell *cell) + struct dm_bio_prison_cell *cell) { int r; dm_block_t data_block; @@ -1363,7 +1368,7 @@ static void process_bio(struct thin_c *tc, struct bio *bio) { int r; dm_block_t block = get_bio_block(tc, bio); - struct cell *cell; + struct dm_bio_prison_cell *cell; struct cell_key key; struct dm_thin_lookup_result lookup_result; @@ -1432,7 +1437,7 @@ static void process_deferred_bios(struct pool *pool) spin_unlock_irqrestore(&pool->lock, flags); while ((bio = bio_list_pop(&bios))) { - struct endio_hook *h = dm_get_mapinfo(bio)->ptr; + struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr; struct thin_c *tc = h->tc; /* @@ -1522,10 +1527,10 @@ static void thin_defer_bio(struct thin_c *tc, struct bio *bio) wake_worker(pool); } -static struct endio_hook *thin_hook_bio(struct thin_c *tc, struct bio *bio) +static struct dm_thin_endio_hook *thin_hook_bio(struct thin_c *tc, struct bio *bio) { struct pool *pool = tc->pool; - struct endio_hook *h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO); + struct dm_thin_endio_hook *h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO); h->tc = tc; h->shared_read_entry = NULL; @@ -1687,6 +1692,9 @@ static void __pool_destroy(struct pool *pool) kfree(pool); } +static struct kmem_cache *_new_mapping_cache; +static struct kmem_cache *_endio_hook_cache; + static struct pool *pool_create(struct mapped_device *pool_md, struct block_device *metadata_dev, unsigned long block_size, char **error) @@ -1755,16 +1763,16 @@ static struct pool *pool_create(struct mapped_device *pool_md, ds_init(&pool->all_io_ds); pool->next_mapping = NULL; - pool->mapping_pool = - mempool_create_kmalloc_pool(MAPPING_POOL_SIZE, sizeof(struct new_mapping)); + pool->mapping_pool = mempool_create_slab_pool(MAPPING_POOL_SIZE, + _new_mapping_cache); if (!pool->mapping_pool) { *error = "Error creating pool's mapping mempool"; err_p = ERR_PTR(-ENOMEM); goto bad_mapping_pool; } - pool->endio_hook_pool = - mempool_create_kmalloc_pool(ENDIO_HOOK_POOL_SIZE, sizeof(struct endio_hook)); + pool->endio_hook_pool = mempool_create_slab_pool(ENDIO_HOOK_POOL_SIZE, + _endio_hook_cache); if (!pool->endio_hook_pool) { *error = "Error creating pool's endio_hook mempool"; err_p = ERR_PTR(-ENOMEM); @@ -2276,6 +2284,36 @@ static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct po return 0; } +static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool) +{ + int r; + + r = check_arg_count(argc, 1); + if (r) + return r; + + r = dm_pool_reserve_metadata_snap(pool->pmd); + if (r) + DMWARN("reserve_metadata_snap message failed."); + + return r; +} + +static int process_release_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool) +{ + int r; + + r = check_arg_count(argc, 1); + if (r) + return r; + + r = dm_pool_release_metadata_snap(pool->pmd); + if (r) + DMWARN("release_metadata_snap message failed."); + + return r; +} + /* * Messages supported: * create_thin @@ -2283,6 +2321,8 @@ static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct po * delete * trim * set_transaction_id + * reserve_metadata_snap + * release_metadata_snap */ static int pool_message(struct dm_target *ti, unsigned argc, char **argv) { @@ -2302,6 +2342,12 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv) else if (!strcasecmp(argv[0], "set_transaction_id")) r = process_set_transaction_id_mesg(argc, argv, pool); + else if (!strcasecmp(argv[0], "reserve_metadata_snap")) + r = process_reserve_metadata_snap_mesg(argc, argv, pool); + + else if (!strcasecmp(argv[0], "release_metadata_snap")) + r = process_release_metadata_snap_mesg(argc, argv, pool); + else DMWARN("Unrecognised thin pool target message received: %s", argv[0]); @@ -2361,7 +2407,7 @@ static int pool_status(struct dm_target *ti, status_type_t type, if (r) return r; - r = dm_pool_get_held_metadata_root(pool->pmd, &held_root); + r = dm_pool_get_metadata_snap(pool->pmd, &held_root); if (r) return r; @@ -2457,7 +2503,7 @@ static struct target_type pool_target = { .name = "thin-pool", .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | DM_TARGET_IMMUTABLE, - .version = {1, 1, 0}, + .version = {1, 2, 0}, .module = THIS_MODULE, .ctr = pool_ctr, .dtr = pool_dtr, @@ -2613,9 +2659,9 @@ static int thin_endio(struct dm_target *ti, union map_info *map_context) { unsigned long flags; - struct endio_hook *h = map_context->ptr; + struct dm_thin_endio_hook *h = map_context->ptr; struct list_head work; - struct new_mapping *m, *tmp; + struct dm_thin_new_mapping *m, *tmp; struct pool *pool = h->tc->pool; if (h->shared_read_entry) { @@ -2755,7 +2801,32 @@ static int __init dm_thin_init(void) r = dm_register_target(&pool_target); if (r) - dm_unregister_target(&thin_target); + goto bad_pool_target; + + r = -ENOMEM; + + _cell_cache = KMEM_CACHE(dm_bio_prison_cell, 0); + if (!_cell_cache) + goto bad_cell_cache; + + _new_mapping_cache = KMEM_CACHE(dm_thin_new_mapping, 0); + if (!_new_mapping_cache) + goto bad_new_mapping_cache; + + _endio_hook_cache = KMEM_CACHE(dm_thin_endio_hook, 0); + if (!_endio_hook_cache) + goto bad_endio_hook_cache; + + return 0; + +bad_endio_hook_cache: + kmem_cache_destroy(_new_mapping_cache); +bad_new_mapping_cache: + kmem_cache_destroy(_cell_cache); +bad_cell_cache: + dm_unregister_target(&pool_target); +bad_pool_target: + dm_unregister_target(&thin_target); return r; } @@ -2764,6 +2835,10 @@ static void dm_thin_exit(void) { dm_unregister_target(&thin_target); dm_unregister_target(&pool_target); + + kmem_cache_destroy(_cell_cache); + kmem_cache_destroy(_new_mapping_cache); + kmem_cache_destroy(_endio_hook_cache); } module_init(dm_thin_init); diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c index 6f8d387..400fe14 100644 --- a/drivers/md/persistent-data/dm-transaction-manager.c +++ b/drivers/md/persistent-data/dm-transaction-manager.c @@ -249,6 +249,7 @@ int dm_tm_shadow_block(struct dm_transaction_manager *tm, dm_block_t orig, return r; } +EXPORT_SYMBOL_GPL(dm_tm_shadow_block); int dm_tm_read_lock(struct dm_transaction_manager *tm, dm_block_t b, struct dm_block_validator *v, @@ -259,6 +260,7 @@ int dm_tm_read_lock(struct dm_transaction_manager *tm, dm_block_t b, return dm_bm_read_lock(tm->bm, b, v, blk); } +EXPORT_SYMBOL_GPL(dm_tm_read_lock); int dm_tm_unlock(struct dm_transaction_manager *tm, struct dm_block *b) { diff --git a/drivers/net/ethernet/freescale/fec_mpc52xx.c b/drivers/net/ethernet/freescale/fec_mpc52xx.c index 97f947b..2933d08 100644 --- a/drivers/net/ethernet/freescale/fec_mpc52xx.c +++ b/drivers/net/ethernet/freescale/fec_mpc52xx.c @@ -437,7 +437,7 @@ static irqreturn_t mpc52xx_fec_rx_interrupt(int irq, void *dev_id) length = status & BCOM_FEC_RX_BD_LEN_MASK; skb_put(rskb, length - 4); /* length without CRC32 */ rskb->protocol = eth_type_trans(rskb, dev); - if (!skb_defer_rx_timestamp(skb)) + if (!skb_defer_rx_timestamp(rskb)) netif_rx(rskb); spin_lock(&priv->lock); diff --git a/drivers/net/ethernet/intel/e1000/e1000_main.c b/drivers/net/ethernet/intel/e1000/e1000_main.c index 95731c8..7483ca0 100644 --- a/drivers/net/ethernet/intel/e1000/e1000_main.c +++ b/drivers/net/ethernet/intel/e1000/e1000_main.c @@ -4080,7 +4080,7 @@ static bool e1000_clean_jumbo_rx_irq(struct e1000_adapter *adapter, spin_lock_irqsave(&adapter->stats_lock, irq_flags); e1000_tbi_adjust_stats(hw, &adapter->stats, - length, skb->data); + length, mapped); spin_unlock_irqrestore(&adapter->stats_lock, irq_flags); length--; diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.c b/drivers/net/ethernet/intel/e1000e/ich8lan.c index bbf70ba..238ab2f 100644 --- a/drivers/net/ethernet/intel/e1000e/ich8lan.c +++ b/drivers/net/ethernet/intel/e1000e/ich8lan.c @@ -165,14 +165,14 @@ #define I217_EEE_100_SUPPORTED (1 << 1) /* 100BaseTx EEE supported */ /* Intel Rapid Start Technology Support */ -#define I217_PROXY_CTRL PHY_REG(BM_WUC_PAGE, 70) +#define I217_PROXY_CTRL BM_PHY_REG(BM_WUC_PAGE, 70) #define I217_PROXY_CTRL_AUTO_DISABLE 0x0080 #define I217_SxCTRL PHY_REG(BM_PORT_CTRL_PAGE, 28) -#define I217_SxCTRL_MASK 0x1000 +#define I217_SxCTRL_ENABLE_LPI_RESET 0x1000 #define I217_CGFREG PHY_REG(772, 29) -#define I217_CGFREG_MASK 0x0002 +#define I217_CGFREG_ENABLE_MTA_RESET 0x0002 #define I217_MEMPWR PHY_REG(772, 26) -#define I217_MEMPWR_MASK 0x0010 +#define I217_MEMPWR_DISABLE_SMB_RELEASE 0x0010 /* Strapping Option Register - RO */ #define E1000_STRAP 0x0000C @@ -4089,12 +4089,12 @@ void e1000_suspend_workarounds_ich8lan(struct e1000_hw *hw) * power good. */ e1e_rphy_locked(hw, I217_SxCTRL, &phy_reg); - phy_reg |= I217_SxCTRL_MASK; + phy_reg |= I217_SxCTRL_ENABLE_LPI_RESET; e1e_wphy_locked(hw, I217_SxCTRL, phy_reg); /* Disable the SMB release on LCD reset. */ e1e_rphy_locked(hw, I217_MEMPWR, &phy_reg); - phy_reg &= ~I217_MEMPWR; + phy_reg &= ~I217_MEMPWR_DISABLE_SMB_RELEASE; e1e_wphy_locked(hw, I217_MEMPWR, phy_reg); } @@ -4103,7 +4103,7 @@ void e1000_suspend_workarounds_ich8lan(struct e1000_hw *hw) * Support */ e1e_rphy_locked(hw, I217_CGFREG, &phy_reg); - phy_reg |= I217_CGFREG_MASK; + phy_reg |= I217_CGFREG_ENABLE_MTA_RESET; e1e_wphy_locked(hw, I217_CGFREG, phy_reg); release: @@ -4176,7 +4176,7 @@ void e1000_resume_workarounds_pchlan(struct e1000_hw *hw) ret_val = e1e_rphy_locked(hw, I217_MEMPWR, &phy_reg); if (ret_val) goto release; - phy_reg |= I217_MEMPWR_MASK; + phy_reg |= I217_MEMPWR_DISABLE_SMB_RELEASE; e1e_wphy_locked(hw, I217_MEMPWR, phy_reg); /* Disable Proxy */ @@ -4186,7 +4186,7 @@ void e1000_resume_workarounds_pchlan(struct e1000_hw *hw) ret_val = e1e_rphy_locked(hw, I217_CGFREG, &phy_reg); if (ret_val) goto release; - phy_reg &= ~I217_CGFREG_MASK; + phy_reg &= ~I217_CGFREG_ENABLE_MTA_RESET; e1e_wphy_locked(hw, I217_CGFREG, phy_reg); release: if (ret_val) diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c index 1bcead1..842c8ce 100644 --- a/drivers/net/ethernet/mellanox/mlx4/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c @@ -617,7 +617,7 @@ static struct mlx4_cmd_info cmd_info[] = { .out_is_imm = false, .encode_slave_id = false, .verify = NULL, - .wrapper = NULL + .wrapper = mlx4_QUERY_FW_wrapper }, { .opcode = MLX4_CMD_QUERY_HCA, @@ -635,7 +635,7 @@ static struct mlx4_cmd_info cmd_info[] = { .out_is_imm = false, .encode_slave_id = false, .verify = NULL, - .wrapper = NULL + .wrapper = mlx4_QUERY_DEV_CAP_wrapper }, { .opcode = MLX4_CMD_QUERY_FUNC_CAP, diff --git a/drivers/net/ethernet/mellanox/mlx4/en_main.c b/drivers/net/ethernet/mellanox/mlx4/en_main.c index 988b242..69ba572 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_main.c @@ -136,13 +136,12 @@ static void mlx4_en_event(struct mlx4_dev *dev, void *endev_ptr, struct mlx4_en_dev *mdev = (struct mlx4_en_dev *) endev_ptr; struct mlx4_en_priv *priv; - if (!mdev->pndev[port]) - return; - - priv = netdev_priv(mdev->pndev[port]); switch (event) { case MLX4_DEV_EVENT_PORT_UP: case MLX4_DEV_EVENT_PORT_DOWN: + if (!mdev->pndev[port]) + return; + priv = netdev_priv(mdev->pndev[port]); /* To prevent races, we poll the link state in a separate task rather than changing it here */ priv->link_state = event; @@ -154,7 +153,10 @@ static void mlx4_en_event(struct mlx4_dev *dev, void *endev_ptr, break; default: - mlx4_warn(mdev, "Unhandled event: %d\n", event); + if (port < 1 || port > dev->caps.num_ports || + !mdev->pndev[port]) + return; + mlx4_warn(mdev, "Unhandled event %d for port %d\n", event, port); } } diff --git a/drivers/net/ethernet/mellanox/mlx4/eq.c b/drivers/net/ethernet/mellanox/mlx4/eq.c index 3b6f8ef..bce98d9 100644 --- a/drivers/net/ethernet/mellanox/mlx4/eq.c +++ b/drivers/net/ethernet/mellanox/mlx4/eq.c @@ -426,7 +426,7 @@ static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq) mlx4_dbg(dev, "FLR event for slave: %d\n", flr_slave); - if (flr_slave > dev->num_slaves) { + if (flr_slave >= dev->num_slaves) { mlx4_warn(dev, "Got FLR for unknown function: %d\n", flr_slave); diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c index 68f5cd6..9c83bb8 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.c +++ b/drivers/net/ethernet/mellanox/mlx4/fw.c @@ -412,7 +412,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) outbox = mailbox->buf; err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0, MLX4_CMD_QUERY_DEV_CAP, - MLX4_CMD_TIME_CLASS_A, !mlx4_is_slave(dev)); + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); if (err) goto out; @@ -590,8 +590,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) for (i = 1; i <= dev_cap->num_ports; ++i) { err = mlx4_cmd_box(dev, 0, mailbox->dma, i, 0, MLX4_CMD_QUERY_PORT, - MLX4_CMD_TIME_CLASS_B, - !mlx4_is_slave(dev)); + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); if (err) goto out; @@ -669,6 +668,28 @@ out: return err; } +int mlx4_QUERY_DEV_CAP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err = 0; + u8 field; + + err = mlx4_cmd_box(dev, 0, outbox->dma, 0, 0, MLX4_CMD_QUERY_DEV_CAP, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); + if (err) + return err; + + /* For guests, report Blueflame disabled */ + MLX4_GET(field, outbox->buf, QUERY_DEV_CAP_BF_OFFSET); + field &= 0x7f; + MLX4_PUT(outbox->buf, field, QUERY_DEV_CAP_BF_OFFSET); + + return 0; +} + int mlx4_QUERY_PORT_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, struct mlx4_cmd_mailbox *inbox, @@ -860,6 +881,9 @@ int mlx4_QUERY_FW(struct mlx4_dev *dev) ((fw_ver & 0xffff0000ull) >> 16) | ((fw_ver & 0x0000ffffull) << 16); + if (mlx4_is_slave(dev)) + goto out; + MLX4_GET(lg, outbox, QUERY_FW_PPF_ID); dev->caps.function = lg; @@ -927,6 +951,27 @@ out: return err; } +int mlx4_QUERY_FW_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + u8 *outbuf; + int err; + + outbuf = outbox->buf; + err = mlx4_cmd_box(dev, 0, outbox->dma, 0, 0, MLX4_CMD_QUERY_FW, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); + if (err) + return err; + + /* for slaves, zero out everything except FW version */ + outbuf[0] = outbuf[1] = 0; + memset(&outbuf[8], 0, QUERY_FW_OUT_SIZE - 8); + return 0; +} + static void get_board_id(void *vsd, char *board_id) { int i; diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index 2e024a6..ee6f4fe 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -142,12 +142,6 @@ struct mlx4_port_config { struct pci_dev *pdev; }; -static inline int mlx4_master_get_num_eqs(struct mlx4_dev *dev) -{ - return dev->caps.reserved_eqs + - MLX4_MFUNC_EQ_NUM * (dev->num_slaves + 1); -} - int mlx4_check_port_params(struct mlx4_dev *dev, enum mlx4_port_type *port_type) { @@ -217,6 +211,7 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) } dev->caps.num_ports = dev_cap->num_ports; + dev->phys_caps.num_phys_eqs = MLX4_MAX_EQ_NUM; for (i = 1; i <= dev->caps.num_ports; ++i) { dev->caps.vl_cap[i] = dev_cap->max_vl[i]; dev->caps.ib_mtu_cap[i] = dev_cap->ib_mtu[i]; @@ -435,12 +430,17 @@ static int mlx4_slave_cap(struct mlx4_dev *dev) mlx4_log_num_mgm_entry_size = hca_param.log_mc_entry_sz; memset(&dev_cap, 0, sizeof(dev_cap)); + dev->caps.max_qp_dest_rdma = 1 << hca_param.log_rd_per_qp; err = mlx4_dev_cap(dev, &dev_cap); if (err) { mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting.\n"); return err; } + err = mlx4_QUERY_FW(dev); + if (err) + mlx4_err(dev, "QUERY_FW command failed: could not get FW version.\n"); + page_size = ~dev->caps.page_size_cap + 1; mlx4_warn(dev, "HCA minimum page size:%d\n", page_size); if (page_size > PAGE_SIZE) { @@ -485,15 +485,15 @@ static int mlx4_slave_cap(struct mlx4_dev *dev) dev->caps.num_mgms = 0; dev->caps.num_amgms = 0; - for (i = 1; i <= dev->caps.num_ports; ++i) - dev->caps.port_mask[i] = dev->caps.port_type[i]; - if (dev->caps.num_ports > MLX4_MAX_PORTS) { mlx4_err(dev, "HCA has %d ports, but we only support %d, " "aborting.\n", dev->caps.num_ports, MLX4_MAX_PORTS); return -ENODEV; } + for (i = 1; i <= dev->caps.num_ports; ++i) + dev->caps.port_mask[i] = dev->caps.port_type[i]; + if (dev->caps.uar_page_size * (dev->caps.num_uars - dev->caps.reserved_uars) > pci_resource_len(dev->pdev, 2)) { @@ -504,18 +504,6 @@ static int mlx4_slave_cap(struct mlx4_dev *dev) return -ENODEV; } -#if 0 - mlx4_warn(dev, "sqp_demux:%d\n", dev->caps.sqp_demux); - mlx4_warn(dev, "num_uars:%d reserved_uars:%d uar region:0x%x bar2:0x%llx\n", - dev->caps.num_uars, dev->caps.reserved_uars, - dev->caps.uar_page_size * dev->caps.num_uars, - pci_resource_len(dev->pdev, 2)); - mlx4_warn(dev, "num_eqs:%d reserved_eqs:%d\n", dev->caps.num_eqs, - dev->caps.reserved_eqs); - mlx4_warn(dev, "num_pds:%d reserved_pds:%d slave_pd_shift:%d pd_base:%d\n", - dev->caps.num_pds, dev->caps.reserved_pds, - dev->caps.slave_pd_shift, dev->caps.pd_base); -#endif return 0; } @@ -810,9 +798,8 @@ static int mlx4_init_cmpt_table(struct mlx4_dev *dev, u64 cmpt_base, if (err) goto err_srq; - num_eqs = (mlx4_is_master(dev)) ? - roundup_pow_of_two(mlx4_master_get_num_eqs(dev)) : - dev->caps.num_eqs; + num_eqs = (mlx4_is_master(dev)) ? dev->phys_caps.num_phys_eqs : + dev->caps.num_eqs; err = mlx4_init_icm_table(dev, &priv->eq_table.cmpt_table, cmpt_base + ((u64) (MLX4_CMPT_TYPE_EQ * @@ -874,9 +861,8 @@ static int mlx4_init_icm(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap, } - num_eqs = (mlx4_is_master(dev)) ? - roundup_pow_of_two(mlx4_master_get_num_eqs(dev)) : - dev->caps.num_eqs; + num_eqs = (mlx4_is_master(dev)) ? dev->phys_caps.num_phys_eqs : + dev->caps.num_eqs; err = mlx4_init_icm_table(dev, &priv->eq_table.table, init_hca->eqc_base, dev_cap->eqc_entry_sz, num_eqs, num_eqs, 0, 0); diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h index 86b6e5a..e5d2022 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h @@ -1039,6 +1039,11 @@ int mlx4_init_resource_tracker(struct mlx4_dev *dev); void mlx4_free_resource_tracker(struct mlx4_dev *dev, enum mlx4_res_tracker_free_type type); +int mlx4_QUERY_FW_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); int mlx4_SET_PORT_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, struct mlx4_cmd_mailbox *inbox, @@ -1054,6 +1059,11 @@ int mlx4_CLOSE_PORT_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_cmd_mailbox *inbox, struct mlx4_cmd_mailbox *outbox, struct mlx4_cmd_info *cmd); +int mlx4_QUERY_DEV_CAP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); int mlx4_QUERY_PORT_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, struct mlx4_cmd_mailbox *inbox, diff --git a/drivers/net/ethernet/mellanox/mlx4/profile.c b/drivers/net/ethernet/mellanox/mlx4/profile.c index 06e5ade..b83bc92 100644 --- a/drivers/net/ethernet/mellanox/mlx4/profile.c +++ b/drivers/net/ethernet/mellanox/mlx4/profile.c @@ -126,7 +126,9 @@ u64 mlx4_make_profile(struct mlx4_dev *dev, profile[MLX4_RES_AUXC].num = request->num_qp; profile[MLX4_RES_SRQ].num = request->num_srq; profile[MLX4_RES_CQ].num = request->num_cq; - profile[MLX4_RES_EQ].num = min_t(unsigned, dev_cap->max_eqs, MAX_MSIX); + profile[MLX4_RES_EQ].num = mlx4_is_mfunc(dev) ? + dev->phys_caps.num_phys_eqs : + min_t(unsigned, dev_cap->max_eqs, MAX_MSIX); profile[MLX4_RES_DMPT].num = request->num_mpt; profile[MLX4_RES_CMPT].num = MLX4_NUM_CMPTS; profile[MLX4_RES_MTT].num = request->num_mtt * (1 << log_mtts_per_seg); @@ -215,9 +217,10 @@ u64 mlx4_make_profile(struct mlx4_dev *dev, init_hca->log_num_cqs = profile[i].log_num; break; case MLX4_RES_EQ: - dev->caps.num_eqs = profile[i].num; + dev->caps.num_eqs = roundup_pow_of_two(min_t(unsigned, dev_cap->max_eqs, + MAX_MSIX)); init_hca->eqc_base = profile[i].start; - init_hca->log_num_eqs = profile[i].log_num; + init_hca->log_num_eqs = ilog2(dev->caps.num_eqs); break; case MLX4_RES_DMPT: dev->caps.num_mpts = profile[i].num; diff --git a/drivers/net/ethernet/realtek/8139cp.c b/drivers/net/ethernet/realtek/8139cp.c index 5eef290..995d0cf 100644 --- a/drivers/net/ethernet/realtek/8139cp.c +++ b/drivers/net/ethernet/realtek/8139cp.c @@ -979,6 +979,17 @@ static void cp_init_hw (struct cp_private *cp) cpw32_f (MAC0 + 0, le32_to_cpu (*(__le32 *) (dev->dev_addr + 0))); cpw32_f (MAC0 + 4, le32_to_cpu (*(__le32 *) (dev->dev_addr + 4))); + cpw32_f(HiTxRingAddr, 0); + cpw32_f(HiTxRingAddr + 4, 0); + + ring_dma = cp->ring_dma; + cpw32_f(RxRingAddr, ring_dma & 0xffffffff); + cpw32_f(RxRingAddr + 4, (ring_dma >> 16) >> 16); + + ring_dma += sizeof(struct cp_desc) * CP_RX_RING_SIZE; + cpw32_f(TxRingAddr, ring_dma & 0xffffffff); + cpw32_f(TxRingAddr + 4, (ring_dma >> 16) >> 16); + cp_start_hw(cp); cpw8(TxThresh, 0x06); /* XXX convert magic num to a constant */ @@ -992,17 +1003,6 @@ static void cp_init_hw (struct cp_private *cp) cpw8(Config5, cpr8(Config5) & PMEStatus); - cpw32_f(HiTxRingAddr, 0); - cpw32_f(HiTxRingAddr + 4, 0); - - ring_dma = cp->ring_dma; - cpw32_f(RxRingAddr, ring_dma & 0xffffffff); - cpw32_f(RxRingAddr + 4, (ring_dma >> 16) >> 16); - - ring_dma += sizeof(struct cp_desc) * CP_RX_RING_SIZE; - cpw32_f(TxRingAddr, ring_dma & 0xffffffff); - cpw32_f(TxRingAddr + 4, (ring_dma >> 16) >> 16); - cpw16(MultiIntr, 0); cpw8_f(Cfg9346, Cfg9346_Lock); @@ -1636,7 +1636,7 @@ static void eeprom_cmd(void __iomem *ee_addr, int cmd, int cmd_len) static void eeprom_cmd_end(void __iomem *ee_addr) { - writeb (~EE_CS, ee_addr); + writeb(0, ee_addr); eeprom_delay (); } diff --git a/drivers/net/ethernet/realtek/8139too.c b/drivers/net/ethernet/realtek/8139too.c index 03df076..1d83565 100644 --- a/drivers/net/ethernet/realtek/8139too.c +++ b/drivers/net/ethernet/realtek/8139too.c @@ -1173,7 +1173,7 @@ static int __devinit read_eeprom (void __iomem *ioaddr, int location, int addr_l } /* Terminate the EEPROM access. */ - RTL_W8 (Cfg9346, ~EE_CS); + RTL_W8(Cfg9346, 0); eeprom_delay (); return retval; diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c index 00b4f56..9757ce3 100644 --- a/drivers/net/ethernet/realtek/r8169.c +++ b/drivers/net/ethernet/realtek/r8169.c @@ -6345,6 +6345,8 @@ static void __devexit rtl_remove_one(struct pci_dev *pdev) cancel_work_sync(&tp->wk.work); + netif_napi_del(&tp->napi); + unregister_netdev(dev); rtl_release_firmware(tp); @@ -6668,6 +6670,7 @@ out: return rc; err_out_msi_4: + netif_napi_del(&tp->napi); rtl_disable_msi(pdev, tp); iounmap(ioaddr); err_out_free_res_3: diff --git a/drivers/net/usb/mcs7830.c b/drivers/net/usb/mcs7830.c index add1064..03c2d8d 100644 --- a/drivers/net/usb/mcs7830.c +++ b/drivers/net/usb/mcs7830.c @@ -629,11 +629,31 @@ static int mcs7830_rx_fixup(struct usbnet *dev, struct sk_buff *skb) return skb->len > 0; } +static void mcs7830_status(struct usbnet *dev, struct urb *urb) +{ + u8 *buf = urb->transfer_buffer; + bool link; + + if (urb->actual_length < 16) + return; + + link = !(buf[1] & 0x20); + if (netif_carrier_ok(dev->net) != link) { + if (link) { + netif_carrier_on(dev->net); + usbnet_defer_kevent(dev, EVENT_LINK_RESET); + } else + netif_carrier_off(dev->net); + netdev_dbg(dev->net, "Link Status is: %d\n", link); + } +} + static const struct driver_info moschip_info = { .description = "MOSCHIP 7830/7832/7730 usb-NET adapter", .bind = mcs7830_bind, .rx_fixup = mcs7830_rx_fixup, - .flags = FLAG_ETHER, + .flags = FLAG_ETHER | FLAG_LINK_INTR, + .status = mcs7830_status, .in = 1, .out = 2, }; @@ -642,7 +662,8 @@ static const struct driver_info sitecom_info = { .description = "Sitecom LN-30 usb-NET adapter", .bind = mcs7830_bind, .rx_fixup = mcs7830_rx_fixup, - .flags = FLAG_ETHER, + .flags = FLAG_ETHER | FLAG_LINK_INTR, + .status = mcs7830_status, .in = 1, .out = 2, }; diff --git a/drivers/tty/amiserial.c b/drivers/tty/amiserial.c index 35819e3..6cc4358 100644 --- a/drivers/tty/amiserial.c +++ b/drivers/tty/amiserial.c @@ -1033,7 +1033,7 @@ static int get_serial_info(struct tty_struct *tty, struct serial_state *state, if (!retinfo) return -EFAULT; memset(&tmp, 0, sizeof(tmp)); - tty_lock(tty); + tty_lock(); tmp.line = tty->index; tmp.port = state->port; tmp.flags = state->tport.flags; @@ -1042,7 +1042,7 @@ static int get_serial_info(struct tty_struct *tty, struct serial_state *state, tmp.close_delay = state->tport.close_delay; tmp.closing_wait = state->tport.closing_wait; tmp.custom_divisor = state->custom_divisor; - tty_unlock(tty); + tty_unlock(); if (copy_to_user(retinfo,&tmp,sizeof(*retinfo))) return -EFAULT; return 0; @@ -1059,12 +1059,12 @@ static int set_serial_info(struct tty_struct *tty, struct serial_state *state, if (copy_from_user(&new_serial,new_info,sizeof(new_serial))) return -EFAULT; - tty_lock(tty); + tty_lock(); change_spd = ((new_serial.flags ^ port->flags) & ASYNC_SPD_MASK) || new_serial.custom_divisor != state->custom_divisor; if (new_serial.irq || new_serial.port != state->port || new_serial.xmit_fifo_size != state->xmit_fifo_size) { - tty_unlock(tty); + tty_unlock(); return -EINVAL; } @@ -1074,7 +1074,7 @@ static int set_serial_info(struct tty_struct *tty, struct serial_state *state, (new_serial.xmit_fifo_size != state->xmit_fifo_size) || ((new_serial.flags & ~ASYNC_USR_MASK) != (port->flags & ~ASYNC_USR_MASK))) { - tty_unlock(tty); + tty_unlock(); return -EPERM; } port->flags = ((port->flags & ~ASYNC_USR_MASK) | @@ -1084,7 +1084,7 @@ static int set_serial_info(struct tty_struct *tty, struct serial_state *state, } if (new_serial.baud_base < 9600) { - tty_unlock(tty); + tty_unlock(); return -EINVAL; } @@ -1116,7 +1116,7 @@ check_and_exit: } } else retval = startup(tty, state); - tty_unlock(tty); + tty_unlock(); return retval; } diff --git a/drivers/tty/cyclades.c b/drivers/tty/cyclades.c index 6984e1a..e61cabd 100644 --- a/drivers/tty/cyclades.c +++ b/drivers/tty/cyclades.c @@ -1599,7 +1599,7 @@ static int cy_open(struct tty_struct *tty, struct file *filp) * If the port is the middle of closing, bail out now */ if (tty_hung_up_p(filp) || (info->port.flags & ASYNC_CLOSING)) { - wait_event_interruptible_tty(tty, info->port.close_wait, + wait_event_interruptible_tty(info->port.close_wait, !(info->port.flags & ASYNC_CLOSING)); return (info->port.flags & ASYNC_HUP_NOTIFY) ? -EAGAIN: -ERESTARTSYS; } diff --git a/drivers/tty/n_r3964.c b/drivers/tty/n_r3964.c index 656ad93..5c6c314 100644 --- a/drivers/tty/n_r3964.c +++ b/drivers/tty/n_r3964.c @@ -1065,8 +1065,7 @@ static ssize_t r3964_read(struct tty_struct *tty, struct file *file, TRACE_L("read()"); - /* FIXME: should use a private lock */ - tty_lock(tty); + tty_lock(); pClient = findClient(pInfo, task_pid(current)); if (pClient) { @@ -1078,7 +1077,7 @@ static ssize_t r3964_read(struct tty_struct *tty, struct file *file, goto unlock; } /* block until there is a message: */ - wait_event_interruptible_tty(tty, pInfo->read_wait, + wait_event_interruptible_tty(pInfo->read_wait, (pMsg = remove_msg(pInfo, pClient))); } @@ -1108,7 +1107,7 @@ static ssize_t r3964_read(struct tty_struct *tty, struct file *file, } ret = -EPERM; unlock: - tty_unlock(tty); + tty_unlock(); return ret; } @@ -1157,7 +1156,7 @@ static ssize_t r3964_write(struct tty_struct *tty, struct file *file, pHeader->locks = 0; pHeader->owner = NULL; - tty_lock(tty); + tty_lock(); pClient = findClient(pInfo, task_pid(current)); if (pClient) { @@ -1176,7 +1175,7 @@ static ssize_t r3964_write(struct tty_struct *tty, struct file *file, add_tx_queue(pInfo, pHeader); trigger_transmit(pInfo); - tty_unlock(tty); + tty_unlock(); return 0; } diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c index 65c7c62..5505ffc 100644 --- a/drivers/tty/pty.c +++ b/drivers/tty/pty.c @@ -47,7 +47,6 @@ static void pty_close(struct tty_struct *tty, struct file *filp) wake_up_interruptible(&tty->read_wait); wake_up_interruptible(&tty->write_wait); tty->packet = 0; - /* Review - krefs on tty_link ?? */ if (!tty->link) return; tty->link->packet = 0; @@ -63,9 +62,9 @@ static void pty_close(struct tty_struct *tty, struct file *filp) mutex_unlock(&devpts_mutex); } #endif - tty_unlock(tty); + tty_unlock(); tty_vhangup(tty->link); - tty_lock(tty); + tty_lock(); } } @@ -623,27 +622,26 @@ static int ptmx_open(struct inode *inode, struct file *filp) return retval; /* find a device that is not in use. */ - mutex_lock(&devpts_mutex); + tty_lock(); index = devpts_new_index(inode); + tty_unlock(); if (index < 0) { retval = index; goto err_file; } - mutex_unlock(&devpts_mutex); - mutex_lock(&tty_mutex); + mutex_lock(&devpts_mutex); tty = tty_init_dev(ptm_driver, index); + mutex_unlock(&devpts_mutex); + tty_lock(); + mutex_unlock(&tty_mutex); if (IS_ERR(tty)) { retval = PTR_ERR(tty); goto out; } - /* The tty returned here is locked so we can safely - drop the mutex */ - mutex_unlock(&tty_mutex); - set_bit(TTY_PTY_LOCK, &tty->flags); /* LOCK THE SLAVE */ tty_add_file(tty, filp); @@ -656,17 +654,16 @@ static int ptmx_open(struct inode *inode, struct file *filp) if (retval) goto err_release; - tty_unlock(tty); + tty_unlock(); return 0; err_release: - tty_unlock(tty); + tty_unlock(); tty_release(inode, filp); return retval; out: - mutex_unlock(&tty_mutex); devpts_kill_index(inode, index); + tty_unlock(); err_file: - mutex_unlock(&devpts_mutex); tty_free_file(filp); return retval; } diff --git a/drivers/tty/serial/crisv10.c b/drivers/tty/serial/crisv10.c index 7264d4d..80b6b1b 100644 --- a/drivers/tty/serial/crisv10.c +++ b/drivers/tty/serial/crisv10.c @@ -3976,7 +3976,7 @@ block_til_ready(struct tty_struct *tty, struct file * filp, */ if (tty_hung_up_p(filp) || (info->flags & ASYNC_CLOSING)) { - wait_event_interruptible_tty(tty, info->close_wait, + wait_event_interruptible_tty(info->close_wait, !(info->flags & ASYNC_CLOSING)); #ifdef SERIAL_DO_RESTART if (info->flags & ASYNC_HUP_NOTIFY) @@ -4052,9 +4052,9 @@ block_til_ready(struct tty_struct *tty, struct file * filp, printk("block_til_ready blocking: ttyS%d, count = %d\n", info->line, info->count); #endif - tty_unlock(tty); + tty_unlock(); schedule(); - tty_lock(tty); + tty_lock(); } set_current_state(TASK_RUNNING); remove_wait_queue(&info->open_wait, &wait); @@ -4115,7 +4115,7 @@ rs_open(struct tty_struct *tty, struct file * filp) */ if (tty_hung_up_p(filp) || (info->flags & ASYNC_CLOSING)) { - wait_event_interruptible_tty(tty, info->close_wait, + wait_event_interruptible_tty(info->close_wait, !(info->flags & ASYNC_CLOSING)); #ifdef SERIAL_DO_RESTART return ((info->flags & ASYNC_HUP_NOTIFY) ? diff --git a/drivers/tty/synclink.c b/drivers/tty/synclink.c index 5ed0daa..593d40a 100644 --- a/drivers/tty/synclink.c +++ b/drivers/tty/synclink.c @@ -3338,9 +3338,9 @@ static int block_til_ready(struct tty_struct *tty, struct file * filp, printk("%s(%d):block_til_ready blocking on %s count=%d\n", __FILE__,__LINE__, tty->driver->name, port->count ); - tty_unlock(tty); + tty_unlock(); schedule(); - tty_lock(tty); + tty_lock(); } set_current_state(TASK_RUNNING); diff --git a/drivers/tty/synclink_gt.c b/drivers/tty/synclink_gt.c index 45b43f1..aa1debf 100644 --- a/drivers/tty/synclink_gt.c +++ b/drivers/tty/synclink_gt.c @@ -3336,9 +3336,9 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp, } DBGINFO(("%s block_til_ready wait\n", tty->driver->name)); - tty_unlock(tty); + tty_unlock(); schedule(); - tty_lock(tty); + tty_lock(); } set_current_state(TASK_RUNNING); diff --git a/drivers/tty/synclinkmp.c b/drivers/tty/synclinkmp.c index 4a1e4f0..a3dddc1 100644 --- a/drivers/tty/synclinkmp.c +++ b/drivers/tty/synclinkmp.c @@ -3357,9 +3357,9 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp, printk("%s(%d):%s block_til_ready() count=%d\n", __FILE__,__LINE__, tty->driver->name, port->count ); - tty_unlock(tty); + tty_unlock(); schedule(); - tty_lock(tty); + tty_lock(); } set_current_state(TASK_RUNNING); diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index 9e930c0..b425c79 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -185,7 +185,6 @@ void free_tty_struct(struct tty_struct *tty) put_device(tty->dev); kfree(tty->write_buf); tty_buffer_free_all(tty); - tty->magic = 0xDEADDEAD; kfree(tty); } @@ -574,7 +573,7 @@ void __tty_hangup(struct tty_struct *tty) } spin_unlock(&redirect_lock); - tty_lock(tty); + tty_lock(); /* some functions below drop BTM, so we need this bit */ set_bit(TTY_HUPPING, &tty->flags); @@ -667,7 +666,7 @@ void __tty_hangup(struct tty_struct *tty) clear_bit(TTY_HUPPING, &tty->flags); tty_ldisc_enable(tty); - tty_unlock(tty); + tty_unlock(); if (f) fput(f); @@ -1104,12 +1103,12 @@ void tty_write_message(struct tty_struct *tty, char *msg) { if (tty) { mutex_lock(&tty->atomic_write_lock); - tty_lock(tty); + tty_lock(); if (tty->ops->write && !test_bit(TTY_CLOSING, &tty->flags)) { - tty_unlock(tty); + tty_unlock(); tty->ops->write(tty, msg, strlen(msg)); } else - tty_unlock(tty); + tty_unlock(); tty_write_unlock(tty); } return; @@ -1404,7 +1403,6 @@ struct tty_struct *tty_init_dev(struct tty_driver *driver, int idx) } initialize_tty_struct(tty, driver, idx); - tty_lock(tty); retval = tty_driver_install_tty(driver, tty); if (retval < 0) goto err_deinit_tty; @@ -1417,11 +1415,9 @@ struct tty_struct *tty_init_dev(struct tty_driver *driver, int idx) retval = tty_ldisc_setup(tty, tty->link); if (retval) goto err_release_tty; - /* Return the tty locked so that it cannot vanish under the caller */ return tty; err_deinit_tty: - tty_unlock(tty); deinitialize_tty_struct(tty); free_tty_struct(tty); err_module_put: @@ -1430,7 +1426,6 @@ err_module_put: /* call the tty release_tty routine to clean out this slot */ err_release_tty: - tty_unlock(tty); printk_ratelimited(KERN_INFO "tty_init_dev: ldisc open failed, " "clearing slot %d\n", idx); release_tty(tty, idx); @@ -1633,7 +1628,7 @@ int tty_release(struct inode *inode, struct file *filp) if (tty_paranoia_check(tty, inode, __func__)) return 0; - tty_lock(tty); + tty_lock(); check_tty_count(tty, __func__); __tty_fasync(-1, filp, 0); @@ -1642,11 +1637,10 @@ int tty_release(struct inode *inode, struct file *filp) pty_master = (tty->driver->type == TTY_DRIVER_TYPE_PTY && tty->driver->subtype == PTY_TYPE_MASTER); devpts = (tty->driver->flags & TTY_DRIVER_DEVPTS_MEM) != 0; - /* Review: parallel close */ o_tty = tty->link; if (tty_release_checks(tty, o_tty, idx)) { - tty_unlock(tty); + tty_unlock(); return 0; } @@ -1658,7 +1652,7 @@ int tty_release(struct inode *inode, struct file *filp) if (tty->ops->close) tty->ops->close(tty, filp); - tty_unlock(tty); + tty_unlock(); /* * Sanity check: if tty->count is going to zero, there shouldn't be * any waiters on tty->read_wait or tty->write_wait. We test the @@ -1681,7 +1675,7 @@ int tty_release(struct inode *inode, struct file *filp) opens on /dev/tty */ mutex_lock(&tty_mutex); - tty_lock_pair(tty, o_tty); + tty_lock(); tty_closing = tty->count <= 1; o_tty_closing = o_tty && (o_tty->count <= (pty_master ? 1 : 0)); @@ -1712,7 +1706,7 @@ int tty_release(struct inode *inode, struct file *filp) printk(KERN_WARNING "%s: %s: read/write wait queue active!\n", __func__, tty_name(tty, buf)); - tty_unlock_pair(tty, o_tty); + tty_unlock(); mutex_unlock(&tty_mutex); schedule(); } @@ -1775,7 +1769,7 @@ int tty_release(struct inode *inode, struct file *filp) /* check whether both sides are closing ... */ if (!tty_closing || (o_tty && !o_tty_closing)) { - tty_unlock_pair(tty, o_tty); + tty_unlock(); return 0; } @@ -1788,16 +1782,14 @@ int tty_release(struct inode *inode, struct file *filp) tty_ldisc_release(tty, o_tty); /* * The release_tty function takes care of the details of clearing - * the slots and preserving the termios structure. The tty_unlock_pair - * should be safe as we keep a kref while the tty is locked (so the - * unlock never unlocks a freed tty). + * the slots and preserving the termios structure. */ release_tty(tty, idx); - tty_unlock_pair(tty, o_tty); /* Make this pty number available for reallocation */ if (devpts) devpts_kill_index(inode, idx); + tty_unlock(); return 0; } @@ -1901,9 +1893,6 @@ static struct tty_driver *tty_lookup_driver(dev_t device, struct file *filp, * Locking: tty_mutex protects tty, tty_lookup_driver and tty_init_dev. * tty->count should protect the rest. * ->siglock protects ->signal/->sighand - * - * Note: the tty_unlock/lock cases without a ref are only safe due to - * tty_mutex */ static int tty_open(struct inode *inode, struct file *filp) @@ -1927,7 +1916,8 @@ retry_open: retval = 0; mutex_lock(&tty_mutex); - /* This is protected by the tty_mutex */ + tty_lock(); + tty = tty_open_current_tty(device, filp); if (IS_ERR(tty)) { retval = PTR_ERR(tty); @@ -1948,19 +1938,17 @@ retry_open: } if (tty) { - tty_lock(tty); retval = tty_reopen(tty); - if (retval < 0) { - tty_unlock(tty); + if (retval) tty = ERR_PTR(retval); - } - } else /* Returns with the tty_lock held for now */ + } else tty = tty_init_dev(driver, index); mutex_unlock(&tty_mutex); if (driver) tty_driver_kref_put(driver); if (IS_ERR(tty)) { + tty_unlock(); retval = PTR_ERR(tty); goto err_file; } @@ -1989,7 +1977,7 @@ retry_open: printk(KERN_DEBUG "%s: error %d in opening %s...\n", __func__, retval, tty->name); #endif - tty_unlock(tty); /* need to call tty_release without BTM */ + tty_unlock(); /* need to call tty_release without BTM */ tty_release(inode, filp); if (retval != -ERESTARTSYS) return retval; @@ -2001,15 +1989,17 @@ retry_open: /* * Need to reset f_op in case a hangup happened. */ + tty_lock(); if (filp->f_op == &hung_up_tty_fops) filp->f_op = &tty_fops; + tty_unlock(); goto retry_open; } - tty_unlock(tty); + tty_unlock(); mutex_lock(&tty_mutex); - tty_lock(tty); + tty_lock(); spin_lock_irq(¤t->sighand->siglock); if (!noctty && current->signal->leader && @@ -2017,10 +2007,11 @@ retry_open: tty->session == NULL) __proc_set_tty(current, tty); spin_unlock_irq(¤t->sighand->siglock); - tty_unlock(tty); + tty_unlock(); mutex_unlock(&tty_mutex); return 0; err_unlock: + tty_unlock(); mutex_unlock(&tty_mutex); /* after locks to avoid deadlock */ if (!IS_ERR_OR_NULL(driver)) @@ -2103,13 +2094,10 @@ out: static int tty_fasync(int fd, struct file *filp, int on) { - struct tty_struct *tty = file_tty(filp); int retval; - - tty_lock(tty); + tty_lock(); retval = __tty_fasync(fd, filp, on); - tty_unlock(tty); - + tty_unlock(); return retval; } @@ -2946,7 +2934,6 @@ void initialize_tty_struct(struct tty_struct *tty, tty->pgrp = NULL; tty->overrun_time = jiffies; tty_buffer_init(tty); - mutex_init(&tty->legacy_mutex); mutex_init(&tty->termios_mutex); mutex_init(&tty->ldisc_mutex); init_waitqueue_head(&tty->write_wait); diff --git a/drivers/tty/tty_ldisc.c b/drivers/tty/tty_ldisc.c index ba8be39..9911eb6 100644 --- a/drivers/tty/tty_ldisc.c +++ b/drivers/tty/tty_ldisc.c @@ -568,7 +568,7 @@ int tty_set_ldisc(struct tty_struct *tty, int ldisc) if (IS_ERR(new_ldisc)) return PTR_ERR(new_ldisc); - tty_lock(tty); + tty_lock(); /* * We need to look at the tty locking here for pty/tty pairs * when both sides try to change in parallel. @@ -582,12 +582,12 @@ int tty_set_ldisc(struct tty_struct *tty, int ldisc) */ if (tty->ldisc->ops->num == ldisc) { - tty_unlock(tty); + tty_unlock(); tty_ldisc_put(new_ldisc); return 0; } - tty_unlock(tty); + tty_unlock(); /* * Problem: What do we do if this blocks ? * We could deadlock here @@ -595,7 +595,7 @@ int tty_set_ldisc(struct tty_struct *tty, int ldisc) tty_wait_until_sent(tty, 0); - tty_lock(tty); + tty_lock(); mutex_lock(&tty->ldisc_mutex); /* @@ -605,10 +605,10 @@ int tty_set_ldisc(struct tty_struct *tty, int ldisc) while (test_bit(TTY_LDISC_CHANGING, &tty->flags)) { mutex_unlock(&tty->ldisc_mutex); - tty_unlock(tty); + tty_unlock(); wait_event(tty_ldisc_wait, test_bit(TTY_LDISC_CHANGING, &tty->flags) == 0); - tty_lock(tty); + tty_lock(); mutex_lock(&tty->ldisc_mutex); } @@ -623,7 +623,7 @@ int tty_set_ldisc(struct tty_struct *tty, int ldisc) o_ldisc = tty->ldisc; - tty_unlock(tty); + tty_unlock(); /* * Make sure we don't change while someone holds a * reference to the line discipline. The TTY_LDISC bit @@ -650,7 +650,7 @@ int tty_set_ldisc(struct tty_struct *tty, int ldisc) retval = tty_ldisc_wait_idle(tty, 5 * HZ); - tty_lock(tty); + tty_lock(); mutex_lock(&tty->ldisc_mutex); /* handle wait idle failure locked */ @@ -665,7 +665,7 @@ int tty_set_ldisc(struct tty_struct *tty, int ldisc) clear_bit(TTY_LDISC_CHANGING, &tty->flags); mutex_unlock(&tty->ldisc_mutex); tty_ldisc_put(new_ldisc); - tty_unlock(tty); + tty_unlock(); return -EIO; } @@ -708,7 +708,7 @@ enable: if (o_work) schedule_work(&o_tty->buf.work); mutex_unlock(&tty->ldisc_mutex); - tty_unlock(tty); + tty_unlock(); return retval; } @@ -816,11 +816,11 @@ void tty_ldisc_hangup(struct tty_struct *tty) * need to wait for another function taking the BTM */ clear_bit(TTY_LDISC, &tty->flags); - tty_unlock(tty); + tty_unlock(); cancel_work_sync(&tty->buf.work); mutex_unlock(&tty->ldisc_mutex); retry: - tty_lock(tty); + tty_lock(); mutex_lock(&tty->ldisc_mutex); /* At this point we have a closed ldisc and we want to @@ -831,7 +831,7 @@ retry: if (atomic_read(&tty->ldisc->users) != 1) { char cur_n[TASK_COMM_LEN], tty_n[64]; long timeout = 3 * HZ; - tty_unlock(tty); + tty_unlock(); while (tty_ldisc_wait_idle(tty, timeout) == -EBUSY) { timeout = MAX_SCHEDULE_TIMEOUT; @@ -894,23 +894,6 @@ int tty_ldisc_setup(struct tty_struct *tty, struct tty_struct *o_tty) tty_ldisc_enable(tty); return 0; } - -static void tty_ldisc_kill(struct tty_struct *tty) -{ - mutex_lock(&tty->ldisc_mutex); - /* - * Now kill off the ldisc - */ - tty_ldisc_close(tty, tty->ldisc); - tty_ldisc_put(tty->ldisc); - /* Force an oops if we mess this up */ - tty->ldisc = NULL; - - /* Ensure the next open requests the N_TTY ldisc */ - tty_set_termios_ldisc(tty, N_TTY); - mutex_unlock(&tty->ldisc_mutex); -} - /** * tty_ldisc_release - release line discipline * @tty: tty being shut down @@ -929,19 +912,27 @@ void tty_ldisc_release(struct tty_struct *tty, struct tty_struct *o_tty) * race with the set_ldisc code path. */ - tty_unlock_pair(tty, o_tty); + tty_unlock(); tty_ldisc_halt(tty); tty_ldisc_flush_works(tty); - if (o_tty) { - tty_ldisc_halt(o_tty); - tty_ldisc_flush_works(o_tty); - } - tty_lock_pair(tty, o_tty); + tty_lock(); + mutex_lock(&tty->ldisc_mutex); + /* + * Now kill off the ldisc + */ + tty_ldisc_close(tty, tty->ldisc); + tty_ldisc_put(tty->ldisc); + /* Force an oops if we mess this up */ + tty->ldisc = NULL; + + /* Ensure the next open requests the N_TTY ldisc */ + tty_set_termios_ldisc(tty, N_TTY); + mutex_unlock(&tty->ldisc_mutex); - tty_ldisc_kill(tty); + /* This will need doing differently if we need to lock */ if (o_tty) - tty_ldisc_kill(o_tty); + tty_ldisc_release(o_tty, NULL); /* And the memory resources remaining (buffers, termios) will be disposed of when the kref hits zero */ diff --git a/drivers/tty/tty_mutex.c b/drivers/tty/tty_mutex.c index 67feac9..9ff986c 100644 --- a/drivers/tty/tty_mutex.c +++ b/drivers/tty/tty_mutex.c @@ -4,70 +4,29 @@ #include #include -/* Legacy tty mutex glue */ - -enum { - TTY_MUTEX_NORMAL, - TTY_MUTEX_NESTED, -}; +/* + * The 'big tty mutex' + * + * This mutex is taken and released by tty_lock() and tty_unlock(), + * replacing the older big kernel lock. + * It can no longer be taken recursively, and does not get + * released implicitly while sleeping. + * + * Don't use in new code. + */ +static DEFINE_MUTEX(big_tty_mutex); /* * Getting the big tty mutex. */ - -static void __lockfunc tty_lock_nested(struct tty_struct *tty, - unsigned int subclass) +void __lockfunc tty_lock(void) { - if (tty->magic != TTY_MAGIC) { - printk(KERN_ERR "L Bad %p\n", tty); - WARN_ON(1); - return; - } - tty_kref_get(tty); - mutex_lock_nested(&tty->legacy_mutex, subclass); -} - -void __lockfunc tty_lock(struct tty_struct *tty) -{ - return tty_lock_nested(tty, TTY_MUTEX_NORMAL); + mutex_lock(&big_tty_mutex); } EXPORT_SYMBOL(tty_lock); -void __lockfunc tty_unlock(struct tty_struct *tty) +void __lockfunc tty_unlock(void) { - if (tty->magic != TTY_MAGIC) { - printk(KERN_ERR "U Bad %p\n", tty); - WARN_ON(1); - return; - } - mutex_unlock(&tty->legacy_mutex); - tty_kref_put(tty); + mutex_unlock(&big_tty_mutex); } EXPORT_SYMBOL(tty_unlock); - -/* - * Getting the big tty mutex for a pair of ttys with lock ordering - * On a non pty/tty pair tty2 can be NULL which is just fine. - */ -void __lockfunc tty_lock_pair(struct tty_struct *tty, - struct tty_struct *tty2) -{ - if (tty < tty2) { - tty_lock(tty); - tty_lock_nested(tty2, TTY_MUTEX_NESTED); - } else { - if (tty2 && tty2 != tty) - tty_lock(tty2); - tty_lock_nested(tty, TTY_MUTEX_NESTED); - } -} -EXPORT_SYMBOL(tty_lock_pair); - -void __lockfunc tty_unlock_pair(struct tty_struct *tty, - struct tty_struct *tty2) -{ - tty_unlock(tty); - if (tty2 && tty2 != tty) - tty_unlock(tty2); -} -EXPORT_SYMBOL(tty_unlock_pair); diff --git a/drivers/tty/tty_port.c b/drivers/tty/tty_port.c index d9cca95..bf6e238 100644 --- a/drivers/tty/tty_port.c +++ b/drivers/tty/tty_port.c @@ -230,7 +230,7 @@ int tty_port_block_til_ready(struct tty_port *port, /* block if port is in the process of being closed */ if (tty_hung_up_p(filp) || port->flags & ASYNC_CLOSING) { - wait_event_interruptible_tty(tty, port->close_wait, + wait_event_interruptible_tty(port->close_wait, !(port->flags & ASYNC_CLOSING)); if (port->flags & ASYNC_HUP_NOTIFY) return -EAGAIN; @@ -296,9 +296,9 @@ int tty_port_block_til_ready(struct tty_port *port, retval = -ERESTARTSYS; break; } - tty_unlock(tty); + tty_unlock(); schedule(); - tty_lock(tty); + tty_lock(); } finish_wait(&port->open_wait, &wait); diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 6e27fa9..6a8f002 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -64,6 +64,7 @@ enum { MLX4_MAX_NUM_PF = 16, MLX4_MAX_NUM_VF = 64, MLX4_MFUNC_MAX = 80, + MLX4_MAX_EQ_NUM = 1024, MLX4_MFUNC_EQ_NUM = 4, MLX4_MFUNC_MAX_EQES = 8, MLX4_MFUNC_EQE_MASK = (MLX4_MFUNC_MAX_EQES - 1) @@ -239,6 +240,10 @@ static inline u64 mlx4_fw_ver(u64 major, u64 minor, u64 subminor) return (major << 32) | (minor << 16) | subminor; } +struct mlx4_phys_caps { + u32 num_phys_eqs; +}; + struct mlx4_caps { u64 fw_ver; u32 function; @@ -499,6 +504,7 @@ struct mlx4_dev { unsigned long flags; unsigned long num_slaves; struct mlx4_caps caps; + struct mlx4_phys_caps phys_caps; struct radix_tree_root qp_table_tree; u8 rev_id; char board_id[MLX4_BOARD_ID_LEN]; diff --git a/include/linux/tty.h b/include/linux/tty.h index 4990ef2..9f47ab5 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -268,7 +268,6 @@ struct tty_struct { struct mutex ldisc_mutex; struct tty_ldisc *ldisc; - struct mutex legacy_mutex; struct mutex termios_mutex; spinlock_t ctrl_lock; /* Termios values are protected by the termios mutex */ @@ -606,12 +605,8 @@ extern long vt_compat_ioctl(struct tty_struct *tty, /* tty_mutex.c */ /* functions for preparation of BKL removal */ -extern void __lockfunc tty_lock(struct tty_struct *tty); -extern void __lockfunc tty_unlock(struct tty_struct *tty); -extern void __lockfunc tty_lock_pair(struct tty_struct *tty, - struct tty_struct *tty2); -extern void __lockfunc tty_unlock_pair(struct tty_struct *tty, - struct tty_struct *tty2); +extern void __lockfunc tty_lock(void) __acquires(tty_lock); +extern void __lockfunc tty_unlock(void) __releases(tty_lock); /* * this shall be called only from where BTM is held (like close) @@ -626,9 +621,9 @@ extern void __lockfunc tty_unlock_pair(struct tty_struct *tty, static inline void tty_wait_until_sent_from_close(struct tty_struct *tty, long timeout) { - tty_unlock(tty); /* tty->ops->close holds the BTM, drop it while waiting */ + tty_unlock(); /* tty->ops->close holds the BTM, drop it while waiting */ tty_wait_until_sent(tty, timeout); - tty_lock(tty); + tty_lock(); } /* @@ -643,16 +638,16 @@ static inline void tty_wait_until_sent_from_close(struct tty_struct *tty, * * Do not use in new code. */ -#define wait_event_interruptible_tty(tty, wq, condition) \ +#define wait_event_interruptible_tty(wq, condition) \ ({ \ int __ret = 0; \ if (!(condition)) { \ - __wait_event_interruptible_tty(tty, wq, condition, __ret); \ + __wait_event_interruptible_tty(wq, condition, __ret); \ } \ __ret; \ }) -#define __wait_event_interruptible_tty(tty, wq, condition, ret) \ +#define __wait_event_interruptible_tty(wq, condition, ret) \ do { \ DEFINE_WAIT(__wait); \ \ @@ -661,9 +656,9 @@ do { \ if (condition) \ break; \ if (!signal_pending(current)) { \ - tty_unlock(tty); \ + tty_unlock(); \ schedule(); \ - tty_lock(tty); \ + tty_lock(); \ continue; \ } \ ret = -ERESTARTSYS; \ diff --git a/include/net/cipso_ipv4.h b/include/net/cipso_ipv4.h index 9808877..a7a683e 100644 --- a/include/net/cipso_ipv4.h +++ b/include/net/cipso_ipv4.h @@ -42,6 +42,7 @@ #include #include #include +#include /* known doi values */ #define CIPSO_V4_DOI_UNKNOWN 0x00000000 @@ -285,7 +286,33 @@ static inline int cipso_v4_skbuff_getattr(const struct sk_buff *skb, static inline int cipso_v4_validate(const struct sk_buff *skb, unsigned char **option) { - return -ENOSYS; + unsigned char *opt = *option; + unsigned char err_offset = 0; + u8 opt_len = opt[1]; + u8 opt_iter; + + if (opt_len < 8) { + err_offset = 1; + goto out; + } + + if (get_unaligned_be32(&opt[2]) == 0) { + err_offset = 2; + goto out; + } + + for (opt_iter = 6; opt_iter < opt_len;) { + if (opt[opt_iter + 1] > (opt_len - opt_iter)) { + err_offset = opt_iter + 1; + goto out; + } + opt_iter += opt[opt_iter + 1]; + } + +out: + *option = opt + err_offset; + return err_offset; + } #endif /* CONFIG_NETLABEL */ diff --git a/lib/dynamic_queue_limits.c b/lib/dynamic_queue_limits.c index 6ab4587..0777c5a 100644 --- a/lib/dynamic_queue_limits.c +++ b/lib/dynamic_queue_limits.c @@ -10,23 +10,27 @@ #include #include -#define POSDIFF(A, B) ((A) > (B) ? (A) - (B) : 0) +#define POSDIFF(A, B) ((int)((A) - (B)) > 0 ? (A) - (B) : 0) +#define AFTER_EQ(A, B) ((int)((A) - (B)) >= 0) /* Records completed count and recalculates the queue limit */ void dql_completed(struct dql *dql, unsigned int count) { unsigned int inprogress, prev_inprogress, limit; - unsigned int ovlimit, all_prev_completed, completed; + unsigned int ovlimit, completed, num_queued; + bool all_prev_completed; + + num_queued = ACCESS_ONCE(dql->num_queued); /* Can't complete more than what's in queue */ - BUG_ON(count > dql->num_queued - dql->num_completed); + BUG_ON(count > num_queued - dql->num_completed); completed = dql->num_completed + count; limit = dql->limit; - ovlimit = POSDIFF(dql->num_queued - dql->num_completed, limit); - inprogress = dql->num_queued - completed; + ovlimit = POSDIFF(num_queued - dql->num_completed, limit); + inprogress = num_queued - completed; prev_inprogress = dql->prev_num_queued - dql->num_completed; - all_prev_completed = POSDIFF(completed, dql->prev_num_queued); + all_prev_completed = AFTER_EQ(completed, dql->prev_num_queued); if ((ovlimit && !inprogress) || (dql->prev_ovlimit && all_prev_completed)) { @@ -104,7 +108,7 @@ void dql_completed(struct dql *dql, unsigned int count) dql->prev_ovlimit = ovlimit; dql->prev_last_obj_cnt = dql->last_obj_cnt; dql->num_completed = completed; - dql->prev_num_queued = dql->num_queued; + dql->prev_num_queued = num_queued; } EXPORT_SYMBOL(dql_completed); diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c index aa5d73b..d1820ff 100644 --- a/net/bluetooth/rfcomm/tty.c +++ b/net/bluetooth/rfcomm/tty.c @@ -710,9 +710,9 @@ static int rfcomm_tty_open(struct tty_struct *tty, struct file *filp) break; } - tty_unlock(tty); + tty_unlock(); schedule(); - tty_lock(tty); + tty_lock(); } set_current_state(TASK_RUNNING); remove_wait_queue(&dev->wait, &wait); diff --git a/net/core/sock.c b/net/core/sock.c index 653f8c0..9e5b71f 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1592,6 +1592,11 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, gfp_t gfp_mask; long timeo; int err; + int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; + + err = -EMSGSIZE; + if (npages > MAX_SKB_FRAGS) + goto failure; gfp_mask = sk->sk_allocation; if (gfp_mask & __GFP_WAIT) @@ -1610,14 +1615,12 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { skb = alloc_skb(header_len, gfp_mask); if (skb) { - int npages; int i; /* No pages, we're done... */ if (!data_len) break; - npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; skb->truesize += data_len; skb_shinfo(skb)->nr_frags = npages; for (i = 0; i < npages; i++) { diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 95e61596..f9ee741 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -377,7 +377,8 @@ struct dst_entry *inet_csk_route_req(struct sock *sk, flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, - sk->sk_protocol, inet_sk_flowi_flags(sk), + sk->sk_protocol, + inet_sk_flowi_flags(sk) & ~FLOWI_FLAG_PRECOW_METRICS, (opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr, ireq->loc_addr, ireq->rmt_port, inet_sk(sk)->inet_sport); security_req_classify_flow(req, flowi4_to_flowi(fl4)); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a43b87d..c8d28c4 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -824,7 +824,8 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, */ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, struct request_sock *req, - struct request_values *rvp) + struct request_values *rvp, + u16 queue_mapping) { const struct inet_request_sock *ireq = inet_rsk(req); struct flowi4 fl4; @@ -840,6 +841,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, if (skb) { __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr); + skb_set_queue_mapping(skb, queue_mapping); err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr, ireq->rmt_addr, ireq->opt); @@ -854,7 +856,7 @@ static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req, struct request_values *rvp) { TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); - return tcp_v4_send_synack(sk, NULL, req, rvp); + return tcp_v4_send_synack(sk, NULL, req, rvp, 0); } /* @@ -1422,7 +1424,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) tcp_rsk(req)->snt_synack = tcp_time_stamp; if (tcp_v4_send_synack(sk, dst, req, - (struct request_values *)&tmp_ext) || + (struct request_values *)&tmp_ext, + skb_get_queue_mapping(skb)) || want_cookie) goto drop_and_free; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 554d599..3a9aec2 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -476,7 +476,8 @@ out: static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req, - struct request_values *rvp) + struct request_values *rvp, + u16 queue_mapping) { struct inet6_request_sock *treq = inet6_rsk(req); struct ipv6_pinfo *np = inet6_sk(sk); @@ -513,6 +514,7 @@ static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req, __tcp_v6_send_check(skb, &treq->loc_addr, &treq->rmt_addr); fl6.daddr = treq->rmt_addr; + skb_set_queue_mapping(skb, queue_mapping); err = ip6_xmit(sk, skb, &fl6, opt, np->tclass); err = net_xmit_eval(err); } @@ -528,7 +530,7 @@ static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req, struct request_values *rvp) { TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); - return tcp_v6_send_synack(sk, req, rvp); + return tcp_v6_send_synack(sk, req, rvp, 0); } static void tcp_v6_reqsk_destructor(struct request_sock *req) @@ -1213,7 +1215,8 @@ have_isn: security_inet_conn_request(sk, skb, req); if (tcp_v6_send_synack(sk, req, - (struct request_values *)&tmp_ext) || + (struct request_values *)&tmp_ext, + skb_get_queue_mapping(skb)) || want_cookie) goto drop_and_free;