Index: linux/arch/i386/kernel/paravirt.c =================================================================== --- linux.orig/arch/i386/kernel/paravirt.c +++ linux/arch/i386/kernel/paravirt.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -33,6 +34,9 @@ #include #include +#include +#include + /* nop stub */ static void native_nop(void) { @@ -492,6 +496,7 @@ struct paravirt_ops paravirt_ops = { .patch = native_patch, .banner = default_banner, + .arch_setup = native_nop, .memory_setup = machine_specific_memory_setup, .get_wallclock = native_get_wallclock, @@ -566,4 +571,290 @@ struct paravirt_ops paravirt_ops = { .irq_enable_sysexit = native_irq_enable_sysexit, .iret = native_iret, }; -EXPORT_SYMBOL(paravirt_ops); + +/* + * These are exported to modules: + */ +struct paravirt_ops paravirt_mod_ops = { + .name = "bare hardware", + .paravirt_enabled = 0, + .kernel_rpl = 0, + + .patch = native_patch, + .banner = default_banner, + + .save_fl = native_save_fl, + .restore_fl = native_restore_fl, + .irq_disable = native_irq_disable, + .irq_enable = native_irq_enable, + + .cpuid = native_cpuid, + + .read_msr = native_read_msr, + .write_msr = native_write_msr, + + .read_tsc = native_read_tsc, + .read_pmc = native_read_pmc, + + .io_delay = native_io_delay, + .const_udelay = __const_udelay, + +#ifdef CONFIG_X86_LOCAL_APIC + .apic_write = native_apic_write, + .apic_write_atomic = native_apic_write_atomic, + .apic_read = native_apic_read, +#endif + + .flush_tlb_user = native_flush_tlb, + .flush_tlb_kernel = native_flush_tlb_global, + .flush_tlb_single = native_flush_tlb_single, +}; +EXPORT_SYMBOL(paravirt_mod_ops); + +/* + * KVM paravirtualization optimizations: + */ +int kvm_paravirt; + +/* + * No need for any "IO delay" on KVM: + */ +static void kvm_io_delay(void) +{ +} + +static DEFINE_PER_CPU(struct kvm_vcpu_para_state, para_state); + +/* + * Special, register-to-cr3 instruction based hypercall API + * variant to the KVM host. This utilizes the cr3 filter capability + * of the hardware - if this works out then no VM exit happens, + * if a VM exit happens then KVM will get the virtual address too. + */ +static void kvm_write_cr3(unsigned long guest_cr3) +{ + struct kvm_vcpu_para_state *para_state = &get_cpu_var(para_state); + struct kvm_cr3_cache *cache = ¶_state->cr3_cache; + int idx; + + /* + * Check the cache (maintained by the host) for a matching + * guest_cr3 => host_cr3 mapping. Use it if found: + */ + for (idx = 0; idx < cache->max_idx; idx++) { + if (cache->entry[idx].guest_cr3 == guest_cr3) { + /* + * Cache-hit: we load the cached host-CR3 value. + * This never causes any VM exit. (if it does then the + * hypervisor could do nothing with this instruction + * and the guest OS would be aborted) + */ + asm volatile("movl %0, %%cr3" + : : "r" (cache->entry[idx].host_cr3)); + goto out; + } + } + + /* + * Cache-miss. Load the guest-cr3 value into cr3, which will + * cause a VM exit to the hypervisor, which then loads the + * host cr3 value and updates the cr3_cache. 
+ */ + asm volatile("movl %0, %%cr3" : : "r" (guest_cr3)); +out: + put_cpu_var(para_state); +} + +/* + * Avoid the VM exit upon cr3 load by using the cached + * ->active_mm->pgd value: + */ +static void kvm_flush_tlb_user(void) +{ + kvm_write_cr3(__pa(current->active_mm->pgd)); +} + +static void kvm_flush_tlb_single(u32 addr) +{ + __native_flush_tlb_single(addr); +} +/* + * Disable global pages, do a flush, then enable global pages: + */ +static fastcall void kvm_flush_tlb_kernel(void) +{ + unsigned long orig_cr4 = read_cr4(); + + write_cr4(orig_cr4 & ~X86_CR4_PGE); + kvm_flush_tlb_user(); + write_cr4(orig_cr4); +} + +/* + * Simplified i8259A controller handling: + */ +static void mask_and_ack_kvm(unsigned int irq) +{ + unsigned int irqmask = 1 << irq; + unsigned long flags; + + spin_lock_irqsave(&i8259A_lock, flags); + cached_irq_mask |= irqmask; + + if (irq & 8) { + outb(cached_slave_mask, PIC_SLAVE_IMR); + outb(0x60+(irq&7),PIC_SLAVE_CMD);/* 'Specific EOI' to slave */ + outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); /* 'Specific EOI' to master-IRQ2 */ + } else { + outb(cached_master_mask, PIC_MASTER_IMR); + /* 'Specific EOI' to master: */ + outb(0x60+irq, PIC_MASTER_CMD); + } + spin_unlock_irqrestore(&i8259A_lock, flags); +} + +static void disable_kvm_irq(unsigned int irq) +{ + unsigned int mask = 1 << irq; + unsigned long flags; + + spin_lock_irqsave(&i8259A_lock, flags); + cached_irq_mask |= mask; + if (irq & 8) + outb(cached_slave_mask, PIC_SLAVE_IMR); + else + outb(cached_master_mask, PIC_MASTER_IMR); + spin_unlock_irqrestore(&i8259A_lock, flags); +} + +static void enable_kvm_irq(unsigned int irq) +{ + unsigned int mask = ~(1 << irq); + unsigned long flags; + + spin_lock_irqsave(&i8259A_lock, flags); + cached_irq_mask &= mask; + if (irq & 8) + outb(cached_slave_mask, PIC_SLAVE_IMR); + else + outb(cached_master_mask, PIC_MASTER_IMR); + spin_unlock_irqrestore(&i8259A_lock, flags); +} + +static struct irq_chip kvm_chip = { + .name = "XT-PIC", + .mask = disable_kvm_irq, + .disable = disable_kvm_irq, + .unmask = enable_kvm_irq, + .mask_ack = mask_and_ack_kvm, +}; + +static void __init kvm_init_IRQ(void) +{ + int i; + + printk("init KVM IRQ controller\n"); +#ifdef CONFIG_X86_LOCAL_APIC + init_bsp_APIC(); +#endif + init_8259A(0); + + for (i = 0; i < NR_IRQS; i++) { + irq_desc[i].status = IRQ_DISABLED; + irq_desc[i].action = NULL; + irq_desc[i].depth = 1; + + if (i < 16) { + /* + * 16 old-style INTA-cycle interrupts: + */ + set_irq_chip_and_handler_name(i, &kvm_chip, + handle_level_irq, "XT"); + } else { + /* + * 'high' PCI IRQs filled in on demand + */ + irq_desc[i].chip = &no_irq_chip; + } + } + + /* + * Cover the whole vector space, no vector can escape + * us. 
(some of these will be overridden and become + * 'special' SMP interrupts) + */ + for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { + int vector = FIRST_EXTERNAL_VECTOR + i; + if (i >= NR_IRQS) + break; + if (vector != SYSCALL_VECTOR) + set_intr_gate(vector, interrupt[i]); + } + + /* setup after call gates are initialised (usually add in + * the architecture specific gates) + */ + intr_init_hook(); + + irq_ctx_init(smp_processor_id()); +} + +int kvm_guest_register_para(int cpu) +{ + struct kvm_vcpu_para_state *para_state = &per_cpu(para_state, cpu); + unsigned long magic_val, ret; + + printk("kvm guest on VCPU#%d: trying to register para_state %p\n", + cpu, para_state); + printk("per_cpu_offset(%d): %08lx\n", cpu, per_cpu_offset(cpu)); + /* + * Move a magic (and otherwise invalid) value to + * cr3, and thus signal to KVM that we are entering + * paravirtualized mode: + */ + magic_val = KVM_API_MAGIC; + para_state->guest_version = KVM_PARA_API_VERSION; + para_state->host_version = -1; + para_state->size = sizeof(*para_state); + + asm volatile ("movl %0, %%cr3" + : "=&r" (ret) + : "a" (para_state), + "0" (magic_val) + ); + printk("kvm guest: host returned %ld\n", ret); + printk("kvm guest: host version: %d\n", para_state->host_version); + printk("kvm guest: cr3 cache size: %d\n", + para_state->cr3_cache.max_idx); + if (!ret) { + printk("kvm guest: registration with host failed.\n"); + return 0; + } + return 1; +} + +static int __init kvm_paravirt_setup(char *s) +{ + printk("KVM paravirtualization setup\n"); + if (sscanf(s, "%u", &kvm_paravirt) <= 0) + return 1; + if (!kvm_paravirt) + return 1; + + kvm_paravirt = kvm_guest_register_para(smp_processor_id()); + if (!kvm_paravirt) + return 1; + + printk("KVM paravirtualized: OK\n"); + + paravirt_ops.name = "KVM"; + paravirt_ops.io_delay = kvm_io_delay; + paravirt_ops.init_IRQ = kvm_init_IRQ; + paravirt_mod_ops.flush_tlb_user = kvm_flush_tlb_user; + paravirt_mod_ops.flush_tlb_kernel = kvm_flush_tlb_kernel; + paravirt_mod_ops.flush_tlb_single = kvm_flush_tlb_single; + paravirt_ops.write_cr3 = kvm_write_cr3; + + return 1; +} +__setup("kvm_paravirt=", kvm_paravirt_setup); Index: linux/drivers/kvm/kvm.h =================================================================== --- linux.orig/drivers/kvm/kvm.h +++ linux/drivers/kvm/kvm.h @@ -51,8 +51,8 @@ #define KVM_MAX_VCPUS 1 #define KVM_MEMORY_SLOTS 4 -#define KVM_NUM_MMU_PAGES 256 -#define KVM_MIN_FREE_MMU_PAGES 5 +#define KVM_NUM_MMU_PAGES 2048 +#define KVM_MIN_FREE_MMU_PAGES 10 #define KVM_REFILL_PAGES 25 #define FX_IMAGE_SIZE 512 @@ -165,7 +165,7 @@ struct kvm_mmu { int root_level; int shadow_root_level; - u64 *pae_root; + u64 *pae_root[KVM_CR3_CACHE_SIZE]; }; #define KVM_NR_MEM_OBJS 20 @@ -237,6 +237,10 @@ struct kvm_vcpu { unsigned long cr0; unsigned long cr2; unsigned long cr3; + struct kvm_vcpu_para_state *para_state; + unsigned int cr3_cache_idx; + unsigned int cr3_cache_limit; + gpa_t guest_cr3_gpa[KVM_CR3_CACHE_SIZE]; unsigned long cr4; unsigned long cr8; u64 pdptrs[4]; /* pae */ @@ -397,6 +401,8 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu) int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm_vcpu *vcpu, int slot); +void kvm_cr3_cache_clear(struct kvm_vcpu *vcpu); + hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa); #define HPA_MSB ((sizeof(hpa_t) * 8) - 1) #define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB) @@ -442,9 +448,9 @@ int emulator_set_dr(struct x86_emulate_c unsigned long value); void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); 
-void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr0); -void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr0); -void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr0); +void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); +void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); +void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8); void lmsw(struct kvm_vcpu *vcpu, unsigned long msw); int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); Index: linux/drivers/kvm/kvm_main.c =================================================================== --- linux.orig/drivers/kvm/kvm_main.c +++ linux/drivers/kvm/kvm_main.c @@ -445,7 +445,7 @@ EXPORT_SYMBOL_GPL(set_cr4); void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { if (is_long_mode(vcpu)) { - if ( cr3 & CR3_L_MODE_RESEVED_BITS) { + if (cr3 & CR3_L_MODE_RESEVED_BITS) { printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); inject_gp(vcpu); return; Index: linux/drivers/kvm/mmu.c =================================================================== --- linux.orig/drivers/kvm/mmu.c +++ linux/drivers/kvm/mmu.c @@ -779,7 +779,7 @@ static int nonpaging_map(struct kvm_vcpu static void mmu_free_roots(struct kvm_vcpu *vcpu) { - int i; + int i, j; struct kvm_mmu_page *page; #ifdef CONFIG_X86_64 @@ -793,21 +793,40 @@ static void mmu_free_roots(struct kvm_vc return; } #endif - for (i = 0; i < 4; ++i) { - hpa_t root = vcpu->mmu.pae_root[i]; + /* + * Skip to the next cr3 filter entry and free it (if it's occupied): + */ + vcpu->cr3_cache_idx++; + if (unlikely(vcpu->cr3_cache_idx >= vcpu->cr3_cache_limit)) + vcpu->cr3_cache_idx = 0; - ASSERT(VALID_PAGE(root)); - root &= PT64_BASE_ADDR_MASK; - page = page_header(root); - --page->root_count; - vcpu->mmu.pae_root[i] = INVALID_PAGE; + j = vcpu->cr3_cache_idx; + /* + * Clear the guest-visible entry: + */ + if (vcpu->para_state) { + vcpu->para_state->cr3_cache.entry[j].guest_cr3 = 0; + vcpu->para_state->cr3_cache.entry[j].host_cr3 = 0; + } + ASSERT(vcpu->mmu.pae_root[j]); + if (VALID_PAGE(vcpu->mmu.pae_root[j][0])) { + vcpu->guest_cr3_gpa[j] = INVALID_PAGE; + for (i = 0; i < 4; ++i) { + hpa_t root = vcpu->mmu.pae_root[j][i]; + + ASSERT(VALID_PAGE(root)); + root &= PT64_BASE_ADDR_MASK; + page = page_header(root); + --page->root_count; + vcpu->mmu.pae_root[j][i] = INVALID_PAGE; + } } vcpu->mmu.root_hpa = INVALID_PAGE; } static void mmu_alloc_roots(struct kvm_vcpu *vcpu) { - int i; + int i, j; gfn_t root_gfn; struct kvm_mmu_page *page; @@ -826,8 +845,10 @@ static void mmu_alloc_roots(struct kvm_v return; } #endif + + j = vcpu->cr3_cache_idx; for (i = 0; i < 4; ++i) { - hpa_t root = vcpu->mmu.pae_root[i]; + hpa_t root = vcpu->mmu.pae_root[j][i]; ASSERT(!VALID_PAGE(root)); if (vcpu->mmu.root_level == PT32E_ROOT_LEVEL) @@ -839,9 +860,14 @@ static void mmu_alloc_roots(struct kvm_v NULL); root = page->page_hpa; ++page->root_count; - vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK; + vcpu->mmu.pae_root[j][i] = root | PT_PRESENT_MASK; } - vcpu->mmu.root_hpa = __pa(vcpu->mmu.pae_root); + vcpu->mmu.root_hpa = __pa(vcpu->mmu.pae_root[j]); + /* + * Store the guest-side address too, we need it if a guest + * exits the VM, to rediscover what cr3 it changed to: + */ + vcpu->guest_cr3_gpa[j] = vcpu->cr3; } static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) @@ -874,7 +900,13 @@ static int nonpaging_page_fault(struct k static void nonpaging_free(struct kvm_vcpu *vcpu) { - mmu_free_roots(vcpu); + int j; + + /* + * This will cycle through all existing roots and free them: + */ + for (j = 0; j < 
KVM_CR3_CACHE_SIZE; j++) + mmu_free_roots(vcpu); } static int nonpaging_init_context(struct kvm_vcpu *vcpu) @@ -893,20 +925,17 @@ static int nonpaging_init_context(struct return 0; } -static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) -{ - ++kvm_stat.tlb_flush; - kvm_arch_ops->tlb_flush(vcpu); -} - static void paging_new_cr3(struct kvm_vcpu *vcpu) { pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3); + mmu_free_roots(vcpu); if (unlikely(vcpu->kvm->n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) kvm_mmu_free_some_pages(vcpu); mmu_alloc_roots(vcpu); - kvm_mmu_flush_tlb(vcpu); + /* + * Setting the cr3 will flush the TLB: + */ kvm_arch_ops->set_cr3(vcpu, vcpu->mmu.root_hpa); } @@ -1191,6 +1220,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_free_some_page static void free_mmu_pages(struct kvm_vcpu *vcpu) { struct kvm_mmu_page *page; + int j; while (!list_empty(&vcpu->kvm->active_mmu_pages)) { page = container_of(vcpu->kvm->active_mmu_pages.next, @@ -1204,13 +1234,17 @@ static void free_mmu_pages(struct kvm_vc __free_page(pfn_to_page(page->page_hpa >> PAGE_SHIFT)); page->page_hpa = INVALID_PAGE; } - free_page((unsigned long)vcpu->mmu.pae_root); + for (j = 0; j < KVM_CR3_CACHE_SIZE; j++) { + ASSERT(vcpu->mmu.pae_root[j]); + free_page((unsigned long)vcpu->mmu.pae_root[j]); + vcpu->mmu.pae_root[j] = NULL; + } } static int alloc_mmu_pages(struct kvm_vcpu *vcpu) { struct page *page; - int i; + int i, j; ASSERT(vcpu); @@ -1227,17 +1261,22 @@ static int alloc_mmu_pages(struct kvm_vc ++vcpu->kvm->n_free_mmu_pages; } - /* - * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. - * Therefore we need to allocate shadow page tables in the first - * 4GB of memory, which happens to fit the DMA32 zone. - */ - page = alloc_page(GFP_KERNEL | __GFP_DMA32); - if (!page) - goto error_1; - vcpu->mmu.pae_root = page_address(page); - for (i = 0; i < 4; ++i) - vcpu->mmu.pae_root[i] = INVALID_PAGE; + for (j = 0; j < KVM_CR3_CACHE_SIZE; j++) { + /* + * When emulating 32-bit mode, cr3 is only 32 bits even on + * x86_64. 
Therefore we need to allocate shadow page tables
+		 * in the first 4GB of memory, which happens to fit the DMA32
+		 * zone:
+		 */
+		page = alloc_page(GFP_KERNEL | __GFP_DMA32);
+		if (!page)
+			goto error_1;
+
+		ASSERT(!vcpu->mmu.pae_root[j]);
+		vcpu->mmu.pae_root[j] = page_address(page);
+		for (i = 0; i < 4; ++i)
+			vcpu->mmu.pae_root[j][i] = INVALID_PAGE;
+	}
 
 	return 0;
 
@@ -1341,15 +1380,19 @@ static void audit_mappings(struct kvm_vc
 {
-	int i;
+	int i, j;
 
-	if (vcpu->mmu.root_level == 4)
+	if (vcpu->mmu.root_level == 4) {
 		audit_mappings_page(vcpu, vcpu->mmu.root_hpa, 0, 4);
-	else
-		for (i = 0; i < 4; ++i)
-			if (vcpu->mmu.pae_root[i] & PT_PRESENT_MASK)
+		return;
+	}
+
+	for (j = 0; j < KVM_CR3_CACHE_SIZE; j++) {
+		for (i = 0; i < 4; ++i) {
+			if (vcpu->mmu.pae_root[j][i] & PT_PRESENT_MASK) {
+				audit_mappings_page(vcpu,
-						    vcpu->mmu.pae_root[i],
-						    i << 30,
-						    2);
+					vcpu->mmu.pae_root[j][i], i << 30, 2);
+			}
+		}
+	}
 }
 
 static int count_rmaps(struct kvm_vcpu *vcpu)
Index: linux/drivers/kvm/paging_tmpl.h
===================================================================
--- linux.orig/drivers/kvm/paging_tmpl.h
+++ linux/drivers/kvm/paging_tmpl.h
@@ -197,7 +197,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
 	shadow_addr = vcpu->mmu.root_hpa;
 	level = vcpu->mmu.shadow_root_level;
 	if (level == PT32E_ROOT_LEVEL) {
-		shadow_addr = vcpu->mmu.pae_root[(addr >> 30) & 3];
+		shadow_addr = vcpu->mmu.pae_root[vcpu->cr3_cache_idx][(addr >> 30) & 3];
 		shadow_addr &= PT64_BASE_ADDR_MASK;
 		--level;
 	}
Index: linux/drivers/kvm/vmx.c
===================================================================
--- linux.orig/drivers/kvm/vmx.c
+++ linux/drivers/kvm/vmx.c
@@ -788,9 +788,54 @@ static void vmx_set_cr0_no_modeswitch(st
 	vcpu->cr0 = cr0;
 }
 
-static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+static void print_area_in_hex(void *area, int size)
 {
-	vmcs_writel(GUEST_CR3, cr3);
+	unsigned char *data = area;
+	int i;
+
+	for (i = 0; i < size; i++, data++) {
+		if (!(i & 15))
+			printk("\n%p:", data);
+		printk(" %02x", *data);
+	}
+	printk("\n");
+}
+
+/*
+ * Clear the guest side of the cr3 cache:
+ */
+void kvm_cr3_cache_clear(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cr3_cache *cache;
+
+	if (!vcpu->para_state)
+		return;
+	cache = &vcpu->para_state->cr3_cache;
+	memset(cache->entry, 0, sizeof(cache->entry));
+}
+
+static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3_hpa)
+{
+	struct kvm_cr3_cache *cache;
+	int idx;
+
+	vmcs_writel(GUEST_CR3, cr3_hpa);
+	if (!vcpu->para_state)
+		return;
+
+	WARN_ON(vmcs_readl(GUEST_CR3) != vcpu->mmu.root_hpa);
+
+	idx = vcpu->cr3_cache_idx;
+	cache = &vcpu->para_state->cr3_cache;
+
+	/* NOTE: remove this check, in case of hostile guests: */
+	WARN_ON(cache->entry[idx].guest_cr3);
+	WARN_ON(cache->entry[idx].host_cr3);
+
+	cache->entry[idx].guest_cr3 = vcpu->cr3;
+	cache->entry[idx].host_cr3 = cr3_hpa;
+
+	vmcs_writel(CR3_TARGET_VALUE0 + idx*2, cr3_hpa);
 }
 
 static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
@@ -967,6 +1012,42 @@ static void seg_setup(int seg)
 }
 
 /*
+ * Set up the cr3 validity hardware cache:
+ */
+static int vmcs_setup_cr3_cache(struct kvm_vcpu *vcpu)
+{
+	unsigned int cr3_target_values, i;
+	u64 msr_val;
+
+	rdmsrl(MSR_IA32_VMX_MISC, msr_val);
+
+	printk("MSR_IA32_VMX_MISC: %016Lx\n", msr_val);
+
+	/*
+	 * 9 bits of "CR3 target values":
+	 */
+	cr3_target_values = (msr_val >> 16) & ((1 << 9) - 1);
+	printk(" cr3 target values: %d\n", cr3_target_values);
+	if (cr3_target_values > KVM_CR3_CACHE_SIZE) {
+		printk("KVM: limiting cr3 cache size from %d to %d\n",
+
cr3_target_values, KVM_CR3_CACHE_SIZE); + cr3_target_values = KVM_CR3_CACHE_SIZE; + } + + vcpu->cr3_cache_idx = 0; + vcpu->cr3_cache_limit = cr3_target_values; + /* + * Initialize. TODO: set this to guest physical memory. + */ + for (i = 0; i < cr3_target_values; i++) + vmcs_writel(CR3_TARGET_VALUE0 + i*2, -1UL); + + vmcs_write32(CR3_TARGET_COUNT, cr3_target_values); + + return 0; +} + +/* * Sets up the vmcs for emulated real mode. */ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) @@ -1070,7 +1151,10 @@ static int vmx_vcpu_setup(struct kvm_vcp vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR); vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); - vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ + + ret = vmcs_setup_cr3_cache(vcpu); + if (ret < 0) + goto out; vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */ vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */ @@ -1322,6 +1406,7 @@ static int handle_exception(struct kvm_v cr2 = vmcs_readl(EXIT_QUALIFICATION); spin_lock(&vcpu->kvm->lock); + kvm_cr3_cache_clear(vcpu); r = kvm_mmu_page_fault(vcpu, cr2, error_code); if (r < 0) { spin_unlock(&vcpu->kvm->lock); @@ -1447,6 +1532,44 @@ static int handle_io(struct kvm_vcpu *vc return 0; } +int vcpu_register_para(struct kvm_vcpu *vcpu, unsigned long para_state_gva) +{ + struct kvm_vcpu_para_state *para_state; + hpa_t para_state_hpa; + hva_t para_state_hva; + + printk("KVM: guest trying to enter paravirtual mode\n"); + printk(".... gva: %08lx\n", para_state_gva); + /* + * Needs to be page aligned: + */ + if (para_state_gva != PAGE_ALIGN(para_state_gva)) + return 0; + + para_state_hpa = gva_to_hpa(vcpu, para_state_gva); + printk(".... hpa: %08Lx\n", para_state_hpa); + if (is_error_hpa(para_state_hpa)) + return 0; + para_state_hva = (unsigned long)__va(para_state_hpa); + printk(".... hva: %08lx\n", para_state_hva); + para_state = (void *)para_state_hva; + printk(".... guest version: %d\n", para_state->guest_version); + printk(".... 
size: %d\n", para_state->size); + para_state->host_version = KVM_PARA_API_VERSION; + /* + * We cannot support guests that try to register themselves + * with a newer API version than the host supports: + */ + if (para_state->guest_version > KVM_PARA_API_VERSION) + return 0; + + para_state->cr3_cache.max_idx = vcpu->cr3_cache_limit; + printk("KVM: para guest successfully registered.\n"); + vcpu->para_state = para_state; + + return 1; +} + static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { u64 exit_qualification; @@ -1466,7 +1589,24 @@ static int handle_cr(struct kvm_vcpu *vc return 1; case 3: vcpu_load_rsp_rip(vcpu); - set_cr3(vcpu, vcpu->regs[reg]); + /* + * Special, otherwise invalid argument to movl reg, %cr3 + * drives the paravirtual-registration API: + */ + if (vcpu->regs[reg] == KVM_API_MAGIC && + reg != VCPU_REGS_RAX) { + vcpu->regs[reg] = vcpu_register_para(vcpu, + vcpu->regs[VCPU_REGS_RAX]); + } else { + set_cr3(vcpu, vcpu->regs[reg]); + /* + * Paravirtualized guests expect the + * host-cr3 address back in the same register + * where they provided the guest-cr3 address: + */ + if (vcpu->para_state) + vcpu->regs[reg] = vcpu->mmu.root_hpa; + } skip_emulated_instruction(vcpu); return 1; case 4: @@ -1684,48 +1824,43 @@ static int dm_request_for_irq_injection( (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF)); } -static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static void kvm_cr3_cache_sync(struct kvm_vcpu *vcpu) { - u8 fail; - u16 fs_sel, gs_sel, ldt_sel; - int fs_gs_ldt_reload_needed; - int r; + void *guest_cr3_hva; + hpa_t guest_cr3_hpa; + u64 *root; + int j; + + if (!vcpu->para_state) + return; + + guest_cr3_hpa = vmcs_readl(GUEST_CR3); -again: /* - * Set host fs and gs selectors. Unfortunately, 22.2.3 does not - * allow segment selectors with cpl > 0 or ti == 1. + * Are they in sync already? */ - fs_sel = read_fs(); - gs_sel = read_gs(); - ldt_sel = read_ldt(); - fs_gs_ldt_reload_needed = (fs_sel & 7) | (gs_sel & 7) | ldt_sel; - if (!fs_gs_ldt_reload_needed) { - vmcs_write16(HOST_FS_SELECTOR, fs_sel); - vmcs_write16(HOST_GS_SELECTOR, gs_sel); - } else { - vmcs_write16(HOST_FS_SELECTOR, 0); - vmcs_write16(HOST_GS_SELECTOR, 0); - } - -#ifdef CONFIG_X86_64 - vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE)); - vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE)); -#else - vmcs_writel(HOST_FS_BASE, segment_base(fs_sel)); - vmcs_writel(HOST_GS_BASE, segment_base(gs_sel)); -#endif + if (guest_cr3_hpa == vcpu->mmu.root_hpa) + return; - do_interrupt_requests(vcpu, kvm_run); + guest_cr3_hva = __va(guest_cr3_hpa); - if (vcpu->guest_debug.enabled) - kvm_guest_debug_pre(vcpu); + for (j = 0; j < vcpu->cr3_cache_limit; j++) { + root = vcpu->mmu.pae_root[j]; + WARN_ON(!root); + if (root != guest_cr3_hva) + continue; - fx_save(vcpu->host_fx_image); - fx_restore(vcpu->guest_fx_image); + vcpu->cr3 = vcpu->guest_cr3_gpa[j]; + vcpu->cr3_cache_idx = j; + vcpu->mmu.root_hpa = __pa(vcpu->mmu.pae_root[j]); + break; + } + WARN_ON(j == KVM_CR3_CACHE_SIZE); +} - save_msrs(vcpu->host_msrs, vcpu->nmsrs); - load_msrs(vcpu->guest_msrs, NR_BAD_MSRS); +static int __vmx_vcpu_run(struct kvm_vcpu *vcpu) +{ + u8 fail; asm ( /* Store host registers */ @@ -1846,6 +1981,64 @@ again: [cr2]"i"(offsetof(struct kvm_vcpu, cr2)) : "cc", "memory" ); + return fail; +} + +static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + u8 fail; + u16 fs_sel, gs_sel, ldt_sel; + int fs_gs_ldt_reload_needed; + int r; + +again: + /* + * Set host fs and gs selectors. 
Unfortunately, 22.2.3 does not
+	 * allow segment selectors with cpl > 0 or ti == 1.
+	 */
+	fs_sel = read_fs();
+	gs_sel = read_gs();
+	ldt_sel = read_ldt();
+	fs_gs_ldt_reload_needed = (fs_sel & 7) | (gs_sel & 7) | ldt_sel;
+	if (!fs_gs_ldt_reload_needed) {
+		vmcs_write16(HOST_FS_SELECTOR, fs_sel);
+		vmcs_write16(HOST_GS_SELECTOR, gs_sel);
+	} else {
+		vmcs_write16(HOST_FS_SELECTOR, 0);
+		vmcs_write16(HOST_GS_SELECTOR, 0);
+	}
+
+#ifdef CONFIG_X86_64
+	vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
+	vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
+#else
+	vmcs_writel(HOST_FS_BASE, segment_base(fs_sel));
+	vmcs_writel(HOST_GS_BASE, segment_base(gs_sel));
+#endif
+
+	do_interrupt_requests(vcpu, kvm_run);
+
+	if (vcpu->guest_debug.enabled)
+		kvm_guest_debug_pre(vcpu);
+
+	fx_save(vcpu->host_fx_image);
+	fx_restore(vcpu->guest_fx_image);
+
+	save_msrs(vcpu->host_msrs, vcpu->nmsrs);
+	load_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
+
+	WARN_ON(vmcs_readl(GUEST_CR3) != vcpu->mmu.root_hpa);
+
+	fail = __vmx_vcpu_run(vcpu);
+
+	/*
+	 * Figure out whether vcpu->cr3 needs updating because
+	 * the guest made use of the cr3 cache:
+	 */
+	kvm_cr3_cache_sync(vcpu);
+
+	WARN_ON(vmcs_readl(GUEST_CR3) != vcpu->mmu.root_hpa);
+
 	++kvm_stat.exits;
 	save_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
@@ -1910,6 +2103,7 @@ again:
 static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
 {
 	vmcs_writel(GUEST_CR3, vmcs_readl(GUEST_CR3));
+	kvm_cr3_cache_clear(vcpu);
 }
 
 static void vmx_inject_page_fault(struct kvm_vcpu *vcpu,
@@ -1939,7 +2133,7 @@ static void vmx_inject_page_fault(struct
 			     INTR_TYPE_EXCEPTION |
 			     INTR_INFO_DELIEVER_CODE_MASK |
 			     INTR_INFO_VALID_MASK);
-
+	kvm_cr3_cache_clear(vcpu);
 }
 
 static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
Index: linux/drivers/kvm/vmx.h
===================================================================
--- linux.orig/drivers/kvm/vmx.h
+++ linux/drivers/kvm/vmx.h
@@ -292,5 +292,6 @@ enum vmcs_field {
 #define MSR_IA32_VMX_PROCBASED_CTLS	0x482
 #define MSR_IA32_VMX_EXIT_CTLS		0x483
 #define MSR_IA32_VMX_ENTRY_CTLS		0x484
+#define MSR_IA32_VMX_MISC		0x485
 
 #endif
Index: linux/drivers/serial/8250.c
===================================================================
--- linux.orig/drivers/serial/8250.c
+++ linux/drivers/serial/8250.c
@@ -1371,7 +1371,7 @@ static irqreturn_t serial8250_interrupt(
 
 		l = l->next;
 
-		if (l == i->head && pass_counter++ > PASS_LIMIT) {
+		if (!paravirt_enabled() && l == i->head && pass_counter++ > PASS_LIMIT) {
 			/* If we hit this, we're dead.
*/ printk(KERN_ERR "serial8250: too much work for " "irq%d\n", irq); Index: linux/include/asm-i386/delay.h =================================================================== --- linux.orig/include/asm-i386/delay.h +++ linux/include/asm-i386/delay.h @@ -17,9 +17,9 @@ extern void __const_udelay(unsigned long extern void __delay(unsigned long loops); #if defined(CONFIG_PARAVIRT) && !defined(USE_REAL_TIME_DELAY) -#define udelay(n) paravirt_ops.const_udelay((n) * 0x10c7ul) +#define udelay(n) paravirt_mod_ops.const_udelay((n) * 0x10c7ul) -#define ndelay(n) paravirt_ops.const_udelay((n) * 5ul) +#define ndelay(n) paravirt_mod_ops.const_udelay((n) * 5ul) #else /* !PARAVIRT || USE_REAL_TIME_DELAY */ Index: linux/include/asm-i386/paravirt.h =================================================================== --- linux.orig/include/asm-i386/paravirt.h +++ linux/include/asm-i386/paravirt.h @@ -151,8 +151,9 @@ struct paravirt_ops __attribute__((__section__(".paravirtprobe"))) = fn extern struct paravirt_ops paravirt_ops; +extern struct paravirt_ops paravirt_mod_ops; -#define paravirt_enabled() (paravirt_ops.paravirt_enabled) +#define paravirt_enabled() (paravirt_mod_ops.paravirt_enabled) static inline void load_esp0(struct tss_struct *tss, struct thread_struct *thread) @@ -180,7 +181,7 @@ static inline void do_time_init(void) static inline void __cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { - paravirt_ops.cpuid(eax, ebx, ecx, edx); + paravirt_mod_ops.cpuid(eax, ebx, ecx, edx); } /* @@ -219,52 +220,52 @@ static inline void halt(void) #define rdmsr(msr,val1,val2) do { \ int _err; \ - u64 _l = paravirt_ops.read_msr(msr,&_err); \ + u64 _l = paravirt_mod_ops.read_msr(msr,&_err); \ val1 = (u32)_l; \ val2 = _l >> 32; \ } while(0) #define wrmsr(msr,val1,val2) do { \ u64 _l = ((u64)(val2) << 32) | (val1); \ - paravirt_ops.write_msr((msr), _l); \ + paravirt_mod_ops.write_msr((msr), _l); \ } while(0) #define rdmsrl(msr,val) do { \ int _err; \ - val = paravirt_ops.read_msr((msr),&_err); \ + val = paravirt_mod_ops.read_msr((msr),&_err); \ } while(0) -#define wrmsrl(msr,val) (paravirt_ops.write_msr((msr),(val))) +#define wrmsrl(msr,val) (paravirt_mod_ops.write_msr((msr),(val))) #define wrmsr_safe(msr,a,b) ({ \ u64 _l = ((u64)(b) << 32) | (a); \ - paravirt_ops.write_msr((msr),_l); \ + paravirt_mod_ops.write_msr((msr),_l); \ }) /* rdmsr with exception handling */ #define rdmsr_safe(msr,a,b) ({ \ int _err; \ - u64 _l = paravirt_ops.read_msr(msr,&_err); \ + u64 _l = paravirt_mod_ops.read_msr(msr,&_err); \ (*a) = (u32)_l; \ (*b) = _l >> 32; \ _err; }) #define rdtsc(low,high) do { \ - u64 _l = paravirt_ops.read_tsc(); \ + u64 _l = paravirt_mod_ops.read_tsc(); \ low = (u32)_l; \ high = _l >> 32; \ } while(0) #define rdtscl(low) do { \ - u64 _l = paravirt_ops.read_tsc(); \ + u64 _l = paravirt_mod_ops.read_tsc(); \ low = (int)_l; \ } while(0) -#define rdtscll(val) (val = paravirt_ops.read_tsc()) +#define rdtscll(val) (val = paravirt_mod_ops.read_tsc()) #define write_tsc(val1,val2) wrmsr(0x10, val1, val2) #define rdpmc(counter,low,high) do { \ - u64 _l = paravirt_ops.read_pmc(); \ + u64 _l = paravirt_mod_ops.read_pmc(); \ low = (u32)_l; \ high = _l >> 32; \ } while(0) @@ -287,11 +288,11 @@ static inline void halt(void) /* The paravirtualized I/O functions */ static inline void slow_down_io(void) { - paravirt_ops.io_delay(); + paravirt_mod_ops.io_delay(); #ifdef REALLY_SLOW_IO - paravirt_ops.io_delay(); - paravirt_ops.io_delay(); - paravirt_ops.io_delay(); + paravirt_mod_ops.io_delay(); + 
paravirt_mod_ops.io_delay(); + paravirt_mod_ops.io_delay(); #endif } @@ -301,24 +302,24 @@ static inline void slow_down_io(void) { */ static inline void apic_write(unsigned long reg, unsigned long v) { - paravirt_ops.apic_write(reg,v); + paravirt_mod_ops.apic_write(reg,v); } static inline void apic_write_atomic(unsigned long reg, unsigned long v) { - paravirt_ops.apic_write_atomic(reg,v); + paravirt_mod_ops.apic_write_atomic(reg,v); } static inline unsigned long apic_read(unsigned long reg) { - return paravirt_ops.apic_read(reg); + return paravirt_mod_ops.apic_read(reg); } #endif -#define __flush_tlb() paravirt_ops.flush_tlb_user() -#define __flush_tlb_global() paravirt_ops.flush_tlb_kernel() -#define __flush_tlb_single(addr) paravirt_ops.flush_tlb_single(addr) +#define __flush_tlb() paravirt_mod_ops.flush_tlb_user() +#define __flush_tlb_global() paravirt_mod_ops.flush_tlb_kernel() +#define __flush_tlb_single(addr) paravirt_mod_ops.flush_tlb_single(addr) static inline void set_pte(pte_t *ptep, pte_t pteval) { @@ -397,7 +398,7 @@ static inline unsigned long __raw_local_ "call *%1;" "popl %%edx; popl %%ecx", PARAVIRT_SAVE_FLAGS, CLBR_NONE) - : "=a"(f): "m"(paravirt_ops.save_fl) + : "=a"(f): "m"(paravirt_mod_ops.save_fl) : "memory", "cc"); return f; } @@ -408,7 +409,7 @@ static inline void raw_local_irq_restore "call *%1;" "popl %%edx; popl %%ecx", PARAVIRT_RESTORE_FLAGS, CLBR_EAX) - : "=a"(f) : "m" (paravirt_ops.restore_fl), "0"(f) + : "=a"(f) : "m" (paravirt_mod_ops.restore_fl), "0"(f) : "memory", "cc"); } @@ -418,7 +419,7 @@ static inline void raw_local_irq_disable "call *%0;" "popl %%edx; popl %%ecx", PARAVIRT_IRQ_DISABLE, CLBR_EAX) - : : "m" (paravirt_ops.irq_disable) + : : "m" (paravirt_mod_ops.irq_disable) : "memory", "eax", "cc"); } @@ -428,7 +429,7 @@ static inline void raw_local_irq_enable( "call *%0;" "popl %%edx; popl %%ecx", PARAVIRT_IRQ_ENABLE, CLBR_EAX) - : : "m" (paravirt_ops.irq_enable) + : : "m" (paravirt_mod_ops.irq_enable) : "memory", "eax", "cc"); } @@ -443,19 +444,19 @@ static inline unsigned long __raw_local_ PARAVIRT_SAVE_FLAGS_IRQ_DISABLE, CLBR_NONE) : "=a"(f) - : "m" (paravirt_ops.save_fl), - "m" (paravirt_ops.irq_disable) + : "m" (paravirt_mod_ops.save_fl), + "m" (paravirt_mod_ops.irq_disable) : "memory", "cc"); return f; } #define CLI_STRING paravirt_alt("pushl %%ecx; pushl %%edx;" \ - "call *paravirt_ops+%c[irq_disable];" \ + "call *paravirt_mod_ops+%c[irq_disable];" \ "popl %%edx; popl %%ecx", \ PARAVIRT_IRQ_DISABLE, CLBR_EAX) #define STI_STRING paravirt_alt("pushl %%ecx; pushl %%edx;" \ - "call *paravirt_ops+%c[irq_enable];" \ + "call *paravirt_mod_ops+%c[irq_enable];" \ "popl %%edx; popl %%ecx", \ PARAVIRT_IRQ_ENABLE, CLBR_EAX) #define CLI_STI_CLOBBERS , "%eax" @@ -484,13 +485,13 @@ static inline unsigned long __raw_local_ #define DISABLE_INTERRUPTS(clobbers) \ PARA_PATCH(PARAVIRT_IRQ_DISABLE, clobbers, \ pushl %ecx; pushl %edx; \ - call *paravirt_ops+PARAVIRT_irq_disable; \ + call *paravirt_mod_ops+PARAVIRT_irq_disable; \ popl %edx; popl %ecx) \ #define ENABLE_INTERRUPTS(clobbers) \ PARA_PATCH(PARAVIRT_IRQ_ENABLE, clobbers, \ pushl %ecx; pushl %edx; \ - call *%cs:paravirt_ops+PARAVIRT_irq_enable; \ + call *%cs:paravirt_mod_ops+PARAVIRT_irq_enable; \ popl %edx; popl %ecx) #define ENABLE_INTERRUPTS_SYSEXIT \ Index: linux/include/linux/kvm.h =================================================================== --- linux.orig/include/linux/kvm.h +++ linux/include/linux/kvm.h @@ -238,4 +238,44 @@ struct kvm_dirty_log { #define KVM_DUMP_VCPU _IOW(KVMIO, 250, int /* 
vcpu_slot */)
 
+
+#define KVM_CR3_CACHE_SIZE 4
+
+struct kvm_cr3_cache_entry {
+	u64 guest_cr3;
+	u64 host_cr3;
+};
+
+struct kvm_cr3_cache {
+	struct kvm_cr3_cache_entry entry[KVM_CR3_CACHE_SIZE];
+	u32 max_idx;
+};
+
+/*
+ * Per-VCPU descriptor area shared between guest and host. Writable by
+ * both guest and host. Registered with the host by the guest when
+ * a guest acknowledges paravirtual mode.
+ */
+struct kvm_vcpu_para_state {
+	/*
+	 * API version information for compatibility. If there's any support
+	 * mismatch (too old host trying to execute too new guest) then
+	 * the host will deny entry into paravirtual mode. Any other
+	 * combination (new host + old guest and new host + new guest)
+	 * is supposed to work - new host versions will support all old
+	 * guest API versions.
+	 */
+	u32 guest_version;
+	u32 host_version;
+	u32 size;
+	u32 __pad_00;
+
+	struct kvm_cr3_cache cr3_cache;
+
+} __attribute__ ((aligned(PAGE_SIZE)));
+
+#define KVM_PARA_API_VERSION 1
+
+#define KVM_API_MAGIC 0x87654321
+
 #endif
Index: linux/init/main.c
===================================================================
--- linux.orig/init/main.c
+++ linux/init/main.c
@@ -374,7 +374,11 @@ static void __init setup_per_cpu_areas(v
 	if (size < PERCPU_ENOUGH_ROOM)
 		size = PERCPU_ENOUGH_ROOM;
 #endif
-	ptr = alloc_bootmem(size * nr_possible_cpus);
+	/*
+	 * Align them to page size - just in case someone aligns
+	 * the per-CPU data to page size, that alignment should be preserved:
+	 */
+	ptr = alloc_bootmem_pages(size * nr_possible_cpus);
 
 	for_each_possible_cpu(i) {
 		__per_cpu_offset[i] = ptr - __per_cpu_start;
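
For illustration, the guest-side lookup that kvm_write_cr3() performs against the cr3 cache can be modeled in plain C outside the kernel. The sketch below is not part of the patch; it only assumes the kvm_cr3_cache layout added to include/linux/kvm.h above. The cr3_cache_lookup() helper and the sample cr3 values are made up for the example, and the mov-to-cr3 trap taken on a cache miss is only simulated.

/*
 * Illustration only, not part of the patch: a minimal user-space model
 * of the guest-side cr3-cache lookup done by kvm_write_cr3(). On a hit
 * the guest could load the cached host_cr3 without a VM exit; on a miss
 * the real guest loads guest_cr3 itself and lets the host refill the
 * cache. The structures mirror the kvm_cr3_cache layout from kvm.h.
 */
#include <stdio.h>
#include <stdint.h>

#define KVM_CR3_CACHE_SIZE 4

struct kvm_cr3_cache_entry {
	uint64_t guest_cr3;
	uint64_t host_cr3;
};

struct kvm_cr3_cache {
	struct kvm_cr3_cache_entry entry[KVM_CR3_CACHE_SIZE];
	uint32_t max_idx;
};

/*
 * Look up guest_cr3 in the host-maintained cache. Returns 1 and fills
 * *host_cr3 on a hit, 0 on a miss (where a real guest would trap).
 */
static int cr3_cache_lookup(const struct kvm_cr3_cache *cache,
			    uint64_t guest_cr3, uint64_t *host_cr3)
{
	unsigned int idx;

	for (idx = 0; idx < cache->max_idx; idx++) {
		if (cache->entry[idx].guest_cr3 == guest_cr3) {
			*host_cr3 = cache->entry[idx].host_cr3;
			return 1;	/* hit: no VM exit needed */
		}
	}
	return 0;			/* miss: would VM-exit to the host */
}

int main(void)
{
	struct kvm_cr3_cache cache = {
		.entry = { { 0x1000, 0xa000 }, { 0x2000, 0xb000 } },
		.max_idx = 2,
	};
	uint64_t host_cr3;

	if (cr3_cache_lookup(&cache, 0x2000, &host_cr3))
		printf("hit:  host cr3 %#llx\n", (unsigned long long)host_cr3);
	if (!cr3_cache_lookup(&cache, 0x3000, &host_cr3))
		printf("miss: would load guest cr3 and VM-exit\n");
	return 0;
}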