/* * GPL HEADER START * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * * GPL HEADER END * * Copyright 2011 various Linux Kernel contributors. * Copyright 2017 Joyent, Inc. * Copyright 2011 Joshua M. Clulow * Copyright 2011 Richard Lowe */ #include #include "kvm_bitops.h" #include "kvm_cache_regs.h" #include "kvm_x86impl.h" #include "kvm_host.h" #include "kvm_mmu.h" #include "msr-index.h" /* * When setting this variable to true it enables Two-Dimensional-Paging * where the hardware walks 2 page tables: * 1. the guest-virtual to guest-physical * 2. while doing 1. it walks guest-physical to host-physical * If the hardware supports that we don't need to do shadow paging. */ int tdp_enabled = 0; static int oos_shadow = 1; #define virt_to_page(addr) pfn_to_page(hat_getpfnum(kas.a_hat, addr)) #define PT_FIRST_AVAIL_BITS_SHIFT 9 #define PT64_SECOND_AVAIL_BITS_SHIFT 52 #define VALID_PAGE(x) ((x) != INVALID_PAGE) #define PT64_LEVEL_BITS 9 #define PT64_LEVEL_SHIFT(level) \ (PAGESHIFT + (level - 1) * PT64_LEVEL_BITS) #define PT64_LEVEL_MASK(level) \ (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level)) #define PT64_INDEX(address, level)\ (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) #define PT32_LEVEL_BITS 10 #define PT32_LEVEL_SHIFT(level) \ (PAGESHIFT + (level - 1) * PT32_LEVEL_BITS) #define PT32_LEVEL_MASK(level) \ (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level)) #define PT32_LVL_OFFSET_MASK(level) (PT32_BASE_ADDR_MASK & \ ((1ULL << (PAGESHIFT + (((level) - 1) * PT32_LEVEL_BITS))) - 1)) #define PT32_INDEX(address, level) \ (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) #define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(uint64_t)(PAGESIZE-1)) #define PT64_DIR_BASE_ADDR_MASK \ (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGESHIFT + PT64_LEVEL_BITS)) - 1)) #define PT64_LVL_ADDR_MASK(level) \ (PT64_BASE_ADDR_MASK & \ ~((1ULL << (PAGESHIFT + (((level) - 1) * PT64_LEVEL_BITS))) - 1)) #define PT64_LVL_OFFSET_MASK(level) (PT64_BASE_ADDR_MASK & \ ((1ULL << (PAGESHIFT + (((level) - 1) * PT64_LEVEL_BITS))) - 1)) #define PT32_BASE_ADDR_MASK PAGEMASK #define PT32_DIR_BASE_ADDR_MASK \ (PAGEMASK & ~((1ULL << (PAGESHIFT + PT32_LEVEL_BITS)) - 1)) #define PT32_LVL_ADDR_MASK(level) (PAGEMASK & \ ~((1ULL << (PAGESHIFT + (((level) - 1) * PT32_LEVEL_BITS))) - 1)) #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ | PT64_NX_MASK) #define RMAP_EXT 4 #define ACC_EXEC_MASK 1 #define ACC_WRITE_MASK PT_WRITABLE_MASK #define ACC_USER_MASK PT_USER_MASK #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) #define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) typedef struct kvm_rmap_desc { uint64_t *sptes[RMAP_EXT]; struct kvm_rmap_desc *more; } kvm_rmap_desc_t; typedef struct kvm_shadow_walk_iterator { uint64_t addr; hpa_t shadow_addr; uint64_t *sptep; int level; unsigned index; } kvm_shadow_walk_iterator_t; #define for_each_shadow_entry(_vcpu, _addr, _walker) \ for (shadow_walk_init(&(_walker), _vcpu, _addr); \ shadow_walk_okay(&(_walker), _vcpu); \ shadow_walk_next(&(_walker))) typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *, struct kvm_mmu_page *); struct kmem_cache *pte_chain_cache; struct kmem_cache *rmap_desc_cache; struct kmem_cache *mmu_page_header_cache; static uint64_t shadow_trap_nonpresent_pte; static uint64_t shadow_notrap_nonpresent_pte; static uint64_t shadow_base_present_pte; static uint64_t shadow_nx_mask; static uint64_t shadow_x_mask; /* mutual exclusive with nx_mask */ static uint64_t shadow_user_mask; static uint64_t shadow_accessed_mask; static uint64_t shadow_dirty_mask; static uint64_t rsvd_bits(int s, int e) { return (((1ULL << (e - s + 1)) - 1) << s); } void kvm_mmu_set_nonpresent_ptes(uint64_t trap_pte, uint64_t notrap_pte) { shadow_trap_nonpresent_pte = trap_pte; shadow_notrap_nonpresent_pte = notrap_pte; } void kvm_mmu_set_base_ptes(uint64_t base_pte) { shadow_base_present_pte = base_pte; } void kvm_mmu_set_mask_ptes(uint64_t user_mask, uint64_t accessed_mask, uint64_t dirty_mask, uint64_t nx_mask, uint64_t x_mask) { shadow_user_mask = user_mask; shadow_accessed_mask = accessed_mask; shadow_dirty_mask = dirty_mask; shadow_nx_mask = nx_mask; shadow_x_mask = x_mask; } static int is_write_protection(struct kvm_vcpu *vcpu) { return (kvm_read_cr0_bits(vcpu, X86_CR0_WP)); } static int is_cpuid_PSE36(void) { return (1); } static int is_nx(struct kvm_vcpu *vcpu) { return (vcpu->arch.efer & EFER_NX); } static int is_shadow_present_pte(uint64_t pte) { return (pte != shadow_trap_nonpresent_pte && pte != shadow_notrap_nonpresent_pte); } static int is_large_pte(uint64_t pte) { return (pte & PT_PAGE_SIZE_MASK); } static int is_writable_pte(unsigned long pte) { return (pte & PT_WRITABLE_MASK); } static int is_dirty_gpte(unsigned long pte) { return (pte & PT_DIRTY_MASK); } static int is_rmap_spte(uint64_t pte) { return (is_shadow_present_pte(pte)); } static int is_last_spte(uint64_t pte, int level) { if (level == PT_PAGE_TABLE_LEVEL) return (1); if (is_large_pte(pte)) return (1); return (0); } static pfn_t spte_to_pfn(uint64_t pte) { return ((pte & PT64_BASE_ADDR_MASK) >> PAGESHIFT); } static gfn_t pse36_gfn_delta(uint32_t gpte) { int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGESHIFT; return ((gpte & PT32_DIR_PSE36_MASK) << shift); } static void __set_spte(uint64_t *sptep, uint64_t spte) { *sptep = spte; } static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, struct kmem_cache *base_cache, int min) { caddr_t obj; if (cache->nobjs >= min) return (0); while (cache->nobjs < ARRAY_SIZE(cache->objects)) { obj = kmem_cache_alloc(base_cache, KM_SLEEP); cache->objects[cache->nobjs].kma_object = obj; cache->objects[cache->nobjs++].kpm_object = NULL; } return (0); } static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, int min) { page_t *page; if (cache->nobjs >= min) return (0); while (cache->nobjs < ARRAY_SIZE(cache->objects)) { page = alloc_page(KM_SLEEP, &cache->objects[cache->nobjs].kma_object); if (!page) return (-ENOMEM); cache->objects[cache->nobjs++].kpm_object = page_address(page); } return (0); } static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) { int r = 0; r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache, 4); if (r) goto out; r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache, 4); if (r) goto out; r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); if (r) goto out; r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache, mmu_page_header_cache, 4); out: return (r); } static void * mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, size_t size) { if (mc->objects[--mc->nobjs].kpm_object) return (mc->objects[mc->nobjs].kpm_object); else return (mc->objects[mc->nobjs].kma_object); } static struct kvm_objects mmu_memory_page_cache_alloc(struct kvm_mmu_memory_cache *mc, size_t size) { return (mc->objects[--mc->nobjs]); } static struct kvm_pte_chain * mmu_alloc_pte_chain(struct kvm_vcpu *vcpu) { return (mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache, sizeof (struct kvm_pte_chain))); } static void mmu_free_pte_chain(struct kvm_pte_chain *pc) { kmem_cache_free(pte_chain_cache, pc); } static struct kvm_rmap_desc * mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu) { return (mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache, sizeof (struct kvm_rmap_desc))); } static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd) { kmem_cache_free(rmap_desc_cache, rd); } /* * Return the pointer to the largepage write count for a given * gfn, handling slots that are not large page aligned. */ int * slot_largepage_idx(gfn_t gfn, struct kvm_memory_slot *slot, int level) { unsigned long idx; idx = (gfn / KVM_PAGES_PER_HPAGE(level)) - (slot->base_gfn / KVM_PAGES_PER_HPAGE(level)); return (&slot->lpage_info[level - 2][idx].write_count); } static void account_shadowed(struct kvm *kvm, gfn_t gfn) { struct kvm_memory_slot *slot; int *write_count; int i; gfn = unalias_gfn(kvm, gfn); slot = gfn_to_memslot_unaliased(kvm, gfn); for (i = PT_DIRECTORY_LEVEL; i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { write_count = slot_largepage_idx(gfn, slot, i); *write_count += 1; } } static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) { struct kvm_memory_slot *slot; int *write_count; int i; gfn = unalias_gfn(kvm, gfn); for (i = PT_DIRECTORY_LEVEL; i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { slot = gfn_to_memslot_unaliased(kvm, gfn); write_count = slot_largepage_idx(gfn, slot, i); *write_count -= 1; if (*write_count < 0) cmn_err(CE_WARN, "unaccount_shadowed: *write_count = %d (< 0)\n", *write_count); } } static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn, int level) { struct kvm_memory_slot *slot; int *largepage_idx; gfn = unalias_gfn(kvm, gfn); slot = gfn_to_memslot_unaliased(kvm, gfn); if (slot) { largepage_idx = slot_largepage_idx(gfn, slot, level); return (*largepage_idx); } return (1); } static int host_mapping_level(struct kvm *kvm, gfn_t gfn) { unsigned long page_size; int i, ret = 0; page_size = kvm_host_page_size(kvm, gfn); for (i = PT_PAGE_TABLE_LEVEL; i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) { if (page_size >= KVM_HPAGE_SIZE(i)) ret = i; else break; } return (ret); } static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) { struct kvm_memory_slot *slot; int host_level, level, max_level; slot = gfn_to_memslot(vcpu->kvm, large_gfn); if (slot && slot->dirty_bitmap) return (PT_PAGE_TABLE_LEVEL); host_level = host_mapping_level(vcpu->kvm, large_gfn); if (host_level == PT_PAGE_TABLE_LEVEL) return (host_level); max_level = kvm_x86_ops->get_lpage_level() < host_level ? kvm_x86_ops->get_lpage_level() : host_level; for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level) if (has_wrprotected_page(vcpu->kvm, large_gfn, level)) break; return (level - 1); } /* * Take gfn and return the reverse mapping to it. * Note: gfn must be unaliased before this function get called */ static unsigned long * gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) { struct kvm_memory_slot *slot; unsigned long idx; slot = gfn_to_memslot(kvm, gfn); if (level == PT_PAGE_TABLE_LEVEL) return (&slot->rmap[gfn - slot->base_gfn]); idx = (gfn / KVM_PAGES_PER_HPAGE(level)) - (slot->base_gfn / KVM_PAGES_PER_HPAGE(level)); return (&slot->lpage_info[level - 2][idx].rmap_pde); } /* * Reverse mapping data structures: * * If rmapp bit zero is zero, then rmapp point to the shadw page table entry * that points to page_address(page). * * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc * containing more mappings. * * Returns the number of rmap entries before the spte was added or zero if * the spte was not added. * */ static int rmap_add(struct kvm_vcpu *vcpu, uint64_t *spte, gfn_t gfn) { struct kvm_mmu_page *sp; struct kvm_rmap_desc *desc; unsigned long *rmapp; int i, count = 0; if (!is_rmap_spte(*spte)) return (count); gfn = unalias_gfn(vcpu->kvm, gfn); sp = page_header(vcpu->kvm, kvm_va2pa((caddr_t)spte)); sp->gfns[spte - sp->spt] = gfn; rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); if (!*rmapp) { *rmapp = (unsigned long)spte; } else if (!(*rmapp & 1)) { desc = mmu_alloc_rmap_desc(vcpu); desc->sptes[0] = (uint64_t *)*rmapp; desc->sptes[1] = spte; *rmapp = (unsigned long)desc | 1; } else { desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); while (desc->sptes[RMAP_EXT-1] && desc->more) { desc = desc->more; count += RMAP_EXT; } if (desc->sptes[RMAP_EXT-1]) { desc->more = mmu_alloc_rmap_desc(vcpu); desc = desc->more; } for (i = 0; desc->sptes[i]; i++) continue; desc->sptes[i] = spte; } return (count); } static void rmap_desc_remove_entry(unsigned long *rmapp, struct kvm_rmap_desc *desc, int i, struct kvm_rmap_desc *prev_desc) { int j; for (j = RMAP_EXT - 1; !desc->sptes[j] && j > i; --j) continue; desc->sptes[i] = desc->sptes[j]; desc->sptes[j] = NULL; if (j != 0) return; if (!prev_desc && !desc->more) { *rmapp = (unsigned long)desc->sptes[0]; } else { if (prev_desc) prev_desc->more = desc->more; else *rmapp = (unsigned long)desc->more | 1; } mmu_free_rmap_desc(desc); } static void rmap_remove(struct kvm *kvm, uint64_t *spte) { struct kvm_rmap_desc *desc; struct kvm_rmap_desc *prev_desc; struct kvm_mmu_page *sp; pfn_t pfn; unsigned long *rmapp; int i; if (!is_rmap_spte(*spte)) return; sp = page_header(kvm, kvm_va2pa((caddr_t)spte)); pfn = spte_to_pfn(*spte); if (*spte & shadow_accessed_mask) kvm_set_pfn_accessed(kvm, pfn); if (is_writable_pte(*spte)) kvm_set_pfn_dirty(pfn); rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level); if (!*rmapp) { panic("rmap_remove: %p %lx 0->BUG\n", spte, *spte); } else if (!(*rmapp & 1)) { if ((uint64_t *)*rmapp != spte) { panic("rmap_remove: %p %lx 1->BUG\n", spte, *spte); } *rmapp = 0; } else { desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); prev_desc = NULL; while (desc) { for (i = 0; i < RMAP_EXT && desc->sptes[i]; i++) { if (desc->sptes[i] == spte) { rmap_desc_remove_entry(rmapp, desc, i, prev_desc); return; } } prev_desc = desc; desc = desc->more; } panic("rmap_remove: %p %lx many->many\n", spte, *spte); } } static uint64_t * rmap_next(struct kvm *kvm, unsigned long *rmapp, uint64_t *spte) { struct kvm_rmap_desc *desc; struct kvm_rmap_desc *prev_desc; uint64_t *prev_spte; int i; if (!*rmapp) return (NULL); else if (!(*rmapp & 1)) { if (!spte) return ((uint64_t *)*rmapp); return (NULL); } desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); prev_desc = NULL; prev_spte = NULL; while (desc) { for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) { if (prev_spte == spte) return (desc->sptes[i]); prev_spte = desc->sptes[i]; } desc = desc->more; } return (NULL); } static int rmap_write_protect(struct kvm *kvm, uint64_t gfn) { unsigned long *rmapp; uint64_t *spte; int i, write_protected = 0; gfn = unalias_gfn(kvm, gfn); rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL); spte = rmap_next(kvm, rmapp, NULL); while (spte) { ASSERT(spte); ASSERT(*spte & PT_PRESENT_MASK); if (is_writable_pte(*spte)) { __set_spte(spte, *spte & ~PT_WRITABLE_MASK); write_protected = 1; } spte = rmap_next(kvm, rmapp, spte); } if (write_protected) { pfn_t pfn; spte = rmap_next(kvm, rmapp, NULL); pfn = spte_to_pfn(*spte); kvm_set_pfn_dirty(pfn); } /* check for huge page mappings */ for (i = PT_DIRECTORY_LEVEL; i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; i++) { rmapp = gfn_to_rmap(kvm, gfn, i); spte = rmap_next(kvm, rmapp, NULL); while (spte) { ASSERT(spte); ASSERT(*spte & PT_PRESENT_MASK); ASSERT((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) == (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); if (is_writable_pte(*spte)) { rmap_remove(kvm, spte); KVM_KSTAT_DEC(kvm, kvmks_lpages); __set_spte(spte, shadow_trap_nonpresent_pte); spte = NULL; write_protected = 1; } spte = rmap_next(kvm, rmapp, spte); } } return (write_protected); } static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) { kmem_free(sp->sptkma, PAGESIZE); kmem_free(sp->gfnskma, PAGESIZE); mutex_enter(&kvm->kvm_avllock); avl_remove(&kvm->kvm_avlmp, sp); mutex_exit(&kvm->kvm_avllock); list_remove(&kvm->arch.active_mmu_pages, sp); kmem_cache_free(mmu_page_header_cache, sp); ++kvm->arch.n_free_mmu_pages; } static unsigned kvm_page_table_hashfn(gfn_t gfn) { return (gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1)); } static void bitmap_zero(unsigned long *dst, int nbits) { int len = BITS_TO_LONGS(nbits) * sizeof (unsigned long); memset(dst, 0, len); } static struct kvm_mmu_page * kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, uint64_t *parent_pte) { struct kvm_mmu_page *sp; struct kvm_objects kobj; sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof (*sp)); kobj = mmu_memory_page_cache_alloc(&vcpu->arch.mmu_page_cache, PAGESIZE); sp->spt = kobj.kpm_object; sp->sptkma = kobj.kma_object; kobj = mmu_memory_page_cache_alloc(&vcpu->arch.mmu_page_cache, PAGESIZE); sp->gfns = kobj.kpm_object; sp->gfnskma = kobj.kma_object; sp->kmp_avlspt = (uintptr_t)virt_to_page((caddr_t)sp->spt); sp->vcpu = vcpu; mutex_enter(&vcpu->kvm->kvm_avllock); avl_add(&vcpu->kvm->kvm_avlmp, sp); mutex_exit(&vcpu->kvm->kvm_avllock); list_insert_head(&vcpu->kvm->arch.active_mmu_pages, sp); bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); sp->multimapped = 0; sp->parent_pte = parent_pte; --vcpu->kvm->arch.n_free_mmu_pages; return (sp); } static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, uint64_t *parent_pte) { struct kvm_pte_chain *pte_chain; struct list_t *node; int i; if (!sp->multimapped) { sp->parent_pte = NULL; return; } for (pte_chain = list_head(&sp->parent_ptes); pte_chain != NULL; pte_chain = list_next(&sp->parent_ptes, pte_chain)) { for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { if (!pte_chain->parent_ptes[i]) break; if (pte_chain->parent_ptes[i] != parent_pte) continue; while (i + 1 < NR_PTE_CHAIN_ENTRIES && pte_chain->parent_ptes[i + 1]) { pte_chain->parent_ptes[i] = pte_chain->parent_ptes[i + 1]; i++; } pte_chain->parent_ptes[i] = NULL; if (i == 0) { list_remove(&sp->parent_ptes, pte_chain); mmu_free_pte_chain(pte_chain); if (list_is_empty(&sp->parent_ptes)) { sp->multimapped = 0; sp->parent_pte = NULL; } } return; } } panic("We shouldn't make it here\n"); } static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, uint64_t *parent_pte) { struct kvm_pte_chain *pte_chain; struct hlist_node *node; int i; if (!parent_pte) return; if (!sp->multimapped) { uint64_t *old = sp->parent_pte; if (!old) { sp->parent_pte = parent_pte; return; } sp->multimapped = 1; pte_chain = mmu_alloc_pte_chain(vcpu); list_create(&sp->parent_ptes, sizeof (struct kvm_pte_chain), offsetof(struct kvm_pte_chain, link)); list_insert_head(&sp->parent_ptes, pte_chain); pte_chain->parent_ptes[0] = old; } for (pte_chain = list_head(&sp->parent_ptes); pte_chain != NULL; pte_chain = list_next(&sp->parent_ptes, pte_chain)) { if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1]) continue; for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { if (!pte_chain->parent_ptes[i]) { pte_chain->parent_ptes[i] = parent_pte; return; } } } pte_chain = mmu_alloc_pte_chain(vcpu); list_insert_head(&sp->parent_ptes, pte_chain); pte_chain->parent_ptes[0] = parent_pte; } static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, mmu_parent_walk_fn fn) { struct kvm_pte_chain *pte_chain; struct hlist_node *node; struct kvm_mmu_page *parent_sp; int i; if (!sp->multimapped && sp->parent_pte) { parent_sp = page_header(vcpu->kvm, kvm_va2pa((caddr_t)sp->parent_pte)); fn(vcpu, parent_sp); mmu_parent_walk(vcpu, parent_sp, fn); return; } for (pte_chain = list_head(&sp->parent_ptes); pte_chain != NULL; pte_chain = list_next(&sp->parent_ptes, pte_chain)) { for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { if (!pte_chain->parent_ptes[i]) break; parent_sp = page_header(vcpu->kvm, kvm_va2pa( (caddr_t)pte_chain->parent_ptes[i])); fn(vcpu, parent_sp); mmu_parent_walk(vcpu, parent_sp, fn); } } } static void kvm_mmu_update_unsync_bitmap(uint64_t *spte, struct kvm *kvm) { unsigned int index; struct kvm_mmu_page *sp = page_header(kvm, kvm_va2pa((caddr_t)spte)); index = spte - sp->spt; if (!__test_and_set_bit(index, sp->unsync_child_bitmap)) sp->unsync_children++; } static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp, struct kvm *kvm) { struct kvm_pte_chain *pte_chain; int i; if (!sp->parent_pte) return; if (!sp->multimapped) { kvm_mmu_update_unsync_bitmap(sp->parent_pte, kvm); return; } for (pte_chain = list_head(&sp->parent_ptes); pte_chain != NULL; pte_chain = list_next(&sp->parent_ptes, pte_chain)) { for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { if (!pte_chain->parent_ptes[i]) break; kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i], kvm); } } } static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { kvm_mmu_update_parents_unsync(sp, vcpu->kvm); return (1); } void kvm_mmu_mark_parents_unsync(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { mmu_parent_walk(vcpu, sp, unsync_walk_fn); kvm_mmu_update_parents_unsync(sp, vcpu->kvm); } static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { int i; for (i = 0; i < PT64_ENT_PER_PAGE; ++i) sp->spt[i] = shadow_trap_nonpresent_pte; } static int nonpaging_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { return (1); } static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva) {} #define KVM_PAGE_ARRAY_NR 16 typedef struct kvm_mmu_pages { struct mmu_page_and_offset { struct kvm_mmu_page *sp; unsigned int idx; } page[KVM_PAGE_ARRAY_NR]; unsigned int nr; } kvm_mmu_pages_t; #define for_each_unsync_children(bitmap, idx) \ for (idx = bt_getlowbit(bitmap, 0, 511); \ (idx != -1) && (idx < 512); \ idx = bt_getlowbit(bitmap, idx+1, 511)) static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp, int idx) { int i; if (sp->unsync) { for (i = 0; i < pvec->nr; i++) { if (pvec->page[i].sp == sp) return (0); } } pvec->page[pvec->nr].sp = sp; pvec->page[pvec->nr].idx = idx; pvec->nr++; return (pvec->nr == KVM_PAGE_ARRAY_NR); } static int __mmu_unsync_walk(struct kvm_mmu_page *sp, struct kvm_mmu_pages *pvec, struct kvm *kvm) { int i, ret, nr_unsync_leaf = 0; for_each_unsync_children(sp->unsync_child_bitmap, i) { uint64_t ent = sp->spt[i]; if (is_shadow_present_pte(ent) && !is_large_pte(ent)) { struct kvm_mmu_page *child; child = page_header(kvm, ent & PT64_BASE_ADDR_MASK); if (child->unsync_children) { if (mmu_pages_add(pvec, child, i)) return (-ENOSPC); ret = __mmu_unsync_walk(child, pvec, kvm); if (!ret) { __clear_bit(i, sp->unsync_child_bitmap); } else if (ret > 0) nr_unsync_leaf += ret; else return (ret); } if (child->unsync) { nr_unsync_leaf++; if (mmu_pages_add(pvec, child, i)) return (-ENOSPC); } } } if (bt_getlowbit(sp->unsync_child_bitmap, 0, 511) == -1) sp->unsync_children = 0; return (nr_unsync_leaf); } static int mmu_unsync_walk(struct kvm_mmu_page *sp, struct kvm_mmu_pages *pvec, struct kvm *kvm) { if (!sp->unsync_children) return (0); mmu_pages_add(pvec, sp, 0); return (__mmu_unsync_walk(sp, pvec, kvm)); } static struct kvm_mmu_page * kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) { unsigned index; list_t *bucket; struct kvm_mmu_page *sp; index = kvm_page_table_hashfn(gfn); bucket = &kvm->arch.mmu_page_hash[index]; for (sp = list_head(bucket); sp; sp = list_next(bucket, sp)) { if (sp->gfn == gfn && !sp->role.direct && !sp->role.invalid) { return (sp); } } return (NULL); } static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) { sp->unsync = 0; KVM_KSTAT_DEC(kvm, kvmks_mmu_unsync_page); } static int kvm_mmu_zap_page(struct kvm *, struct kvm_mmu_page *); static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { if (sp->role.glevels != vcpu->arch.mmu.root_level) { kvm_mmu_zap_page(vcpu->kvm, sp); return (1); } KVM_TRACE1(mmu__sync__page, struct kvm_mmu_page *, sp); if (rmap_write_protect(vcpu->kvm, sp->gfn)) kvm_flush_remote_tlbs(vcpu->kvm); kvm_unlink_unsync_page(vcpu->kvm, sp); if (vcpu->arch.mmu.sync_page(vcpu, sp)) { kvm_mmu_zap_page(vcpu->kvm, sp); return (1); } kvm_mmu_flush_tlb(vcpu); return (0); } typedef struct mmu_page_path { struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1]; unsigned int idx[PT64_ROOT_LEVEL-1]; } mmu_page_path_t; #define for_each_sp(pvec, sp, parents, i) \ for (i = mmu_pages_next(&pvec, &parents, -1), \ sp = pvec.page[i].sp; \ /*CSTYLED*/ \ i < pvec.nr && ({ sp = pvec.page[i].sp; 1; }); \ i = mmu_pages_next(&pvec, &parents, i)) static int mmu_pages_next(struct kvm_mmu_pages *pvec, struct mmu_page_path *parents, int i) { int n; for (n = i + 1; n < pvec->nr; n++) { struct kvm_mmu_page *sp = pvec->page[n].sp; if (sp->role.level == PT_PAGE_TABLE_LEVEL) { parents->idx[0] = pvec->page[n].idx; return (n); } parents->parent[sp->role.level-2] = sp; parents->idx[sp->role.level-1] = pvec->page[n].idx; } return (n); } static void mmu_pages_clear_parents(struct mmu_page_path *parents) { struct kvm_mmu_page *sp; unsigned int level = 0; do { unsigned int idx = parents->idx[level]; sp = parents->parent[level]; if (!sp) return; --sp->unsync_children; if ((int)sp->unsync_children < 0) cmn_err(CE_WARN, "mmu_pages_clear_parents: unsync_children (%d)\n", (int)sp->unsync_children); __clear_bit(idx, sp->unsync_child_bitmap); level++; } while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children); } static void kvm_mmu_pages_init(struct kvm_mmu_page *parent, struct mmu_page_path *parents, struct kvm_mmu_pages *pvec) { parents->parent[parent->role.level-1] = NULL; pvec->nr = 0; } static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *parent) { int i; struct kvm_mmu_page *sp; struct mmu_page_path parents; struct kvm_mmu_pages pages; kvm_mmu_pages_init(parent, &parents, &pages); while (mmu_unsync_walk(parent, &pages, vcpu->kvm)) { int protected = 0; for_each_sp(pages, sp, parents, i) protected |= rmap_write_protect(vcpu->kvm, sp->gfn); if (protected) kvm_flush_remote_tlbs(vcpu->kvm); for_each_sp(pages, sp, parents, i) { kvm_sync_page(vcpu, sp); mmu_pages_clear_parents(&parents); } mutex_enter(&vcpu->kvm->mmu_lock); kvm_mmu_pages_init(parent, &parents, &pages); mutex_exit(&vcpu->kvm->mmu_lock); } } struct kvm_mmu_page * kvm_mmu_get_page(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gaddr, unsigned level, int direct, unsigned access, uint64_t *parent_pte) { union kvm_mmu_page_role role; unsigned index; unsigned quadrant; list_t *bucket; struct kvm_mmu_page *sp, *nsp = NULL; struct hlist_node *node, *tmp; role = vcpu->arch.mmu.base_role; role.level = level; role.direct = direct; role.access = access; if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { quadrant = gaddr >> (PAGESHIFT + (PT64_PT_BITS * level)); quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; role.quadrant = quadrant; } index = kvm_page_table_hashfn(gfn); bucket = &vcpu->kvm->arch.mmu_page_hash[index]; for (sp = list_head(bucket); sp != NULL; sp = nsp) { nsp = list_next(bucket, sp); if (sp->gfn == gfn) { if (sp->unsync) if (kvm_sync_page(vcpu, sp)) continue; if (sp->role.word != role.word) continue; mmu_page_add_parent_pte(vcpu, sp, parent_pte); if (sp->unsync_children) { set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); kvm_mmu_mark_parents_unsync(vcpu, sp); } return (sp); } } KVM_KSTAT_INC(vcpu->kvm, kvmks_mmu_cache_miss); sp = kvm_mmu_alloc_page(vcpu, parent_pte); if (!sp) return (sp); sp->gfn = gfn; sp->role = role; list_insert_head(bucket, sp); if (!direct) { if (rmap_write_protect(vcpu->kvm, gfn)) kvm_flush_remote_tlbs(vcpu->kvm); account_shadowed(vcpu->kvm, gfn); } if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) vcpu->arch.mmu.prefetch_page(vcpu, sp); else nonpaging_prefetch_page(vcpu, sp); KVM_TRACE1(mmu__get__page, struct kvm_mmu_page *, sp); return (sp); } static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, struct kvm_vcpu *vcpu, uint64_t addr) { iterator->addr = addr; iterator->shadow_addr = vcpu->arch.mmu.root_hpa; iterator->level = vcpu->arch.mmu.shadow_root_level; if (iterator->level == PT32E_ROOT_LEVEL) { iterator->shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; iterator->shadow_addr &= PT64_BASE_ADDR_MASK; --iterator->level; if (!iterator->shadow_addr) iterator->level = 0; } } static int shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator, struct kvm_vcpu *vcpu) { if (iterator->level < PT_PAGE_TABLE_LEVEL) return (0); if (iterator->level == PT_PAGE_TABLE_LEVEL) { if (is_large_pte(*iterator->sptep)) return (0); } iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level); iterator->sptep = (uint64_t *)page_address(pfn_to_page((iterator->shadow_addr) >> PAGESHIFT)) + iterator->index; return (1); } static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) { iterator->shadow_addr = *iterator->sptep & PT64_BASE_ADDR_MASK; --iterator->level; } static void kvm_mmu_page_unlink_children(struct kvm *kvm, struct kvm_mmu_page *sp) { unsigned i; uint64_t *pt; uint64_t ent; pt = sp->spt; for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { ent = pt[i]; if (is_shadow_present_pte(ent)) { if (!is_last_spte(ent, sp->role.level)) { ent &= PT64_BASE_ADDR_MASK; mmu_page_remove_parent_pte(page_header(kvm, ent), &pt[i]); } else { rmap_remove(kvm, &pt[i]); } } pt[i] = shadow_trap_nonpresent_pte; } } static void kvm_mmu_put_page(struct kvm_mmu_page *sp, uint64_t *parent_pte) { mmu_page_remove_parent_pte(sp, parent_pte); } static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm) { int i; struct kvm_vcpu *vcpu; kvm_for_each_vcpu(i, vcpu, kvm) vcpu->arch.last_pte_updated = NULL; } static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) { uint64_t *parent_pte; while (sp->multimapped || sp->parent_pte) { if (!sp->multimapped) parent_pte = sp->parent_pte; else { struct kvm_pte_chain *chain; chain = list_head(&sp->parent_ptes); parent_pte = chain->parent_ptes[0]; } kvm_mmu_put_page(sp, parent_pte); __set_spte(parent_pte, shadow_trap_nonpresent_pte); } } static int mmu_zap_unsync_children(struct kvm *kvm, struct kvm_mmu_page *parent) { int i, zapped = 0; struct mmu_page_path parents; struct kvm_mmu_pages pages; if (parent->role.level == PT_PAGE_TABLE_LEVEL) return (0); kvm_mmu_pages_init(parent, &parents, &pages); while (mmu_unsync_walk(parent, &pages, kvm)) { struct kvm_mmu_page *sp; for_each_sp(pages, sp, parents, i) { kvm_mmu_zap_page(kvm, sp); mmu_pages_clear_parents(&parents); zapped++; } kvm_mmu_pages_init(parent, &parents, &pages); } return (zapped); } static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) { int ret; ret = mmu_zap_unsync_children(kvm, sp); kvm_mmu_page_unlink_children(kvm, sp); kvm_mmu_unlink_parents(kvm, sp); kvm_flush_remote_tlbs(kvm); if (!sp->role.invalid && !sp->role.direct) unaccount_shadowed(kvm, sp->gfn); if (sp->unsync) kvm_unlink_unsync_page(kvm, sp); if (!sp->root_count) { sp->hash_link.list_prev->list_next = sp->hash_link.list_next; sp->hash_link.list_next->list_prev = sp->hash_link.list_prev; sp->hash_link.list_prev = 0; sp->hash_link.list_next = 0; kvm_mmu_free_page(kvm, sp); } else { sp->role.invalid = 1; list_remove(&kvm->arch.active_mmu_pages, sp); list_insert_head(&kvm->arch.active_mmu_pages, sp); kvm_reload_remote_mmus(kvm); } kvm_mmu_reset_last_pte_updated(kvm); return (ret); } /* * Changing the number of mmu pages allocated to the vm * Note: if kvm_nr_mmu_pages is too small, you will get dead lock */ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) { int used_pages; used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages; used_pages = MAX(0, used_pages); /* for the time being, assume that address space will only grow */ /* larger. The following code will be added later. */ /* * If we set the number of mmu pages to be smaller be than the * number of actived pages , we must to free some mmu pages before we * change the value */ if (used_pages > kvm_nr_mmu_pages) { while (used_pages > kvm_nr_mmu_pages && !list_is_empty(&kvm->arch.active_mmu_pages)) { struct kvm_mmu_page *page; page = (struct kvm_mmu_page *) list_tail(&kvm->arch.active_mmu_pages); /* page removed by kvm_mmu_zap_page */ used_pages -= kvm_mmu_zap_page(kvm, page); used_pages--; } kvm_nr_mmu_pages = used_pages; kvm->arch.n_free_mmu_pages = 0; } else { kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages - kvm->arch.n_alloc_mmu_pages; } kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages; } static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) { unsigned index; list_t *bucket; struct kvm_mmu_page *sp, *nsp = NULL; int r; r = 0; index = kvm_page_table_hashfn(gfn); bucket = &kvm->arch.mmu_page_hash[index]; for (sp = list_head(bucket); sp; sp = nsp) { /* preserve link to next node in case we free this one */ nsp = list_next(bucket, sp); if (sp->gfn == gfn && !sp->role.direct) { r = 1; if (kvm_mmu_zap_page(kvm, sp)) nsp = list_head(bucket); } } return (r); } static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) { int slot = memslot_id(kvm, gfn); struct kvm_mmu_page *sp = page_header(kvm, kvm_va2pa(pte)); __set_bit(slot, sp->slot_bitmap); } static void mmu_convert_notrap(struct kvm_mmu_page *sp) { int i; uint64_t *pt = sp->spt; if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte) return; for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { if (pt[i] == shadow_notrap_nonpresent_pte) __set_spte(&pt[i], shadow_trap_nonpresent_pte); } } static int get_mtrr_type(struct mtrr_state_type *mtrr_state, uint64_t start, uint64_t end) { int i; uint64_t base, mask; uint8_t prev_match, curr_match; int num_var_ranges = KVM_NR_VAR_MTRR; if (!mtrr_state->enabled) return (0xFF); /* Make end inclusive end, instead of exclusive */ end--; /* Look in fixed ranges. Just return the type as per start */ if (mtrr_state->have_fixed && (start < 0x100000)) { int idx; if (start < 0x80000) { idx = 0; idx += (start >> 16); return (mtrr_state->fixed_ranges[idx]); } else if (start < 0xC0000) { idx = 1 * 8; idx += ((start - 0x80000) >> 14); return (mtrr_state->fixed_ranges[idx]); } else if (start < 0x1000000) { idx = 3 * 8; idx += ((start - 0xC0000) >> 12); return (mtrr_state->fixed_ranges[idx]); } } /* * Look in variable ranges * Look of multiple ranges matching this address and pick type * as per MTRR precedence */ if (!(mtrr_state->enabled & 2)) return (mtrr_state->def_type); prev_match = 0xFF; for (i = 0; i < num_var_ranges; ++i) { unsigned short start_state, end_state; if (!(mtrr_state->var_ranges[i].mask_lo & (1 << 11))) continue; base = (((uint64_t)mtrr_state->var_ranges[i].base_hi) << 32) + (mtrr_state->var_ranges[i].base_lo & PAGEMASK); mask = (((uint64_t)mtrr_state->var_ranges[i].mask_hi) << 32) + (mtrr_state->var_ranges[i].mask_lo & PAGEMASK); start_state = ((start & mask) == (base & mask)); end_state = ((end & mask) == (base & mask)); if (start_state != end_state) return (0xFE); if ((start & mask) != (base & mask)) continue; curr_match = mtrr_state->var_ranges[i].base_lo & 0xff; if (prev_match == 0xFF) { prev_match = curr_match; continue; } if (prev_match == MTRR_TYPE_UNCACHABLE || curr_match == MTRR_TYPE_UNCACHABLE) return (MTRR_TYPE_UNCACHABLE); if ((prev_match == MTRR_TYPE_WRBACK && curr_match == MTRR_TYPE_WRTHROUGH) || (prev_match == MTRR_TYPE_WRTHROUGH && curr_match == MTRR_TYPE_WRBACK)) { prev_match = MTRR_TYPE_WRTHROUGH; curr_match = MTRR_TYPE_WRTHROUGH; } if (prev_match != curr_match) return (MTRR_TYPE_UNCACHABLE); } if (prev_match != 0xFF) return (prev_match); return (mtrr_state->def_type); } uint8_t kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) { uint8_t mtrr; mtrr = get_mtrr_type(&vcpu->arch.mtrr_state, gfn << PAGESHIFT, (gfn << PAGESHIFT) + PAGESIZE); if (mtrr == 0xfe || mtrr == 0xff) mtrr = MTRR_TYPE_WRBACK; return (mtrr); } static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { unsigned index; list_t *bucket; struct kvm_mmu_page *s; index = kvm_page_table_hashfn(sp->gfn); bucket = &vcpu->kvm->arch.mmu_page_hash[index]; /* don't unsync if pagetable is shadowed with multiple roles */ /* XXX - need protection here(?) */ for (s = list_head(bucket); s; s = list_next(bucket, s)) { if (s->gfn != sp->gfn || s->role.direct) continue; if (s->role.word != sp->role.word) return (1); } KVM_KSTAT_INC(vcpu->kvm, kvmks_mmu_unsync_page); sp->unsync = 1; kvm_mmu_mark_parents_unsync(vcpu, sp); mmu_convert_notrap(sp); return (0); } static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, int can_unsync) { struct kvm_mmu_page *shadow; shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); if (shadow) { if (shadow->role.level != PT_PAGE_TABLE_LEVEL) return (1); if (shadow->unsync) return (0); if (can_unsync && oos_shadow) return (kvm_unsync_page(vcpu, shadow)); return (1); } return (0); } static int set_spte(struct kvm_vcpu *vcpu, uint64_t *sptep, unsigned pte_access, int user_fault, int write_fault, int dirty, int level, gfn_t gfn, pfn_t pfn, int speculative, int can_unsync, int reset_host_protection) { uint64_t spte; int ret = 0; /* * We don't set the accessed bit, since we sometimes want to see * whether the guest actually used the pte (in order to detect * demand paging). */ spte = shadow_base_present_pte | shadow_dirty_mask; if (!speculative) spte |= shadow_accessed_mask; if (!dirty) pte_access &= ~ACC_WRITE_MASK; if (pte_access & ACC_EXEC_MASK) spte |= shadow_x_mask; else spte |= shadow_nx_mask; if (pte_access & ACC_USER_MASK) spte |= shadow_user_mask; if (level > PT_PAGE_TABLE_LEVEL) spte |= PT_PAGE_SIZE_MASK; if (tdp_enabled) spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, kvm_is_mmio_pfn(pfn)); if (reset_host_protection) spte |= SPTE_HOST_WRITEABLE; spte |= (uint64_t)pfn << PAGESHIFT; if ((pte_access & ACC_WRITE_MASK) || (write_fault && !is_write_protection(vcpu) && !user_fault)) { if (level > PT_PAGE_TABLE_LEVEL && has_wrprotected_page(vcpu->kvm, gfn, level)) { ret = 1; spte = shadow_trap_nonpresent_pte; goto set_pte; } spte |= PT_WRITABLE_MASK; if (!tdp_enabled && !(pte_access & ACC_WRITE_MASK)) spte &= ~PT_USER_MASK; /* * Optimization: for pte sync, if spte was writable the hash * lookup is unnecessary (and expensive). Write protection * is responsibility of mmu_get_page / kvm_sync_page. * Same reasoning can be applied to dirty page accounting. */ if (!can_unsync && is_writable_pte(*sptep)) goto set_pte; if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { ret = 1; pte_access &= ~ACC_WRITE_MASK; if (is_writable_pte(spte)) spte &= ~PT_WRITABLE_MASK; } } if (pte_access & ACC_WRITE_MASK) mark_page_dirty(vcpu->kvm, gfn); set_pte: __set_spte(sptep, spte); return (ret); } static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, unsigned long data) { uint64_t *spte; int need_tlb_flush = 0; while ((spte = rmap_next(kvm, rmapp, NULL))) { if (!(*spte & PT_PRESENT_MASK)) { cmn_err(CE_PANIC, "kvm_unmap_rmapp: spte = %p, *spte = %lx\n", spte, *spte); } rmap_remove(kvm, spte); __set_spte(spte, shadow_trap_nonpresent_pte); need_tlb_flush = 1; } return (need_tlb_flush); } #define RMAP_RECYCLE_THRESHOLD 1000 static void rmap_recycle(struct kvm_vcpu *vcpu, uint64_t *spte, gfn_t gfn) { unsigned long *rmapp; struct kvm_mmu_page *sp; sp = page_header(vcpu->kvm, kvm_va2pa((caddr_t)spte)); gfn = unalias_gfn(vcpu->kvm, gfn); rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); kvm_unmap_rmapp(vcpu->kvm, rmapp, 0); kvm_flush_remote_tlbs(vcpu->kvm); } static void mmu_set_spte(struct kvm_vcpu *vcpu, uint64_t *sptep, unsigned pt_access, unsigned pte_access, int user_fault, int write_fault, int dirty, int *ptwrite, int level, gfn_t gfn, pfn_t pfn, int speculative, int reset_host_protection) { int was_rmapped = 0; int was_writable = is_writable_pte(*sptep); int rmap_count; if (is_rmap_spte(*sptep)) { /* * If we overwrite a PTE page pointer with a 2MB PMD, unlink * the parent of the now unreachable PTE. */ if (level > PT_PAGE_TABLE_LEVEL && !is_large_pte(*sptep)) { struct kvm_mmu_page *child; uint64_t pte = *sptep; child = page_header(vcpu->kvm, pte & PT64_BASE_ADDR_MASK); mmu_page_remove_parent_pte(child, sptep); __set_spte(sptep, shadow_trap_nonpresent_pte); kvm_flush_remote_tlbs(vcpu->kvm); } else if (pfn != spte_to_pfn(*sptep)) { rmap_remove(vcpu->kvm, sptep); __set_spte(sptep, shadow_trap_nonpresent_pte); kvm_flush_remote_tlbs(vcpu->kvm); } else was_rmapped = 1; } if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, dirty, level, gfn, pfn, speculative, 1, reset_host_protection)) { if (write_fault) *ptwrite = 1; kvm_x86_ops->tlb_flush(vcpu); } if (!was_rmapped && is_large_pte(*sptep)) KVM_KSTAT_INC(vcpu->kvm, kvmks_lpages); page_header_update_slot(vcpu->kvm, sptep, gfn); if (!was_rmapped) { rmap_count = rmap_add(vcpu, sptep, gfn); kvm_release_pfn_clean(pfn); if (rmap_count > RMAP_RECYCLE_THRESHOLD) rmap_recycle(vcpu, sptep, gfn); } else { if (was_writable) kvm_release_pfn_dirty(pfn); else kvm_release_pfn_clean(pfn); } if (speculative) { vcpu->arch.last_pte_updated = sptep; vcpu->arch.last_pte_gfn = gfn; } } static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) { } static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, int level, gfn_t gfn, pfn_t pfn) { struct kvm_shadow_walk_iterator iterator; struct kvm_mmu_page *sp; int pt_write = 0; gfn_t pseudo_gfn; for_each_shadow_entry(vcpu, (uint64_t)gfn << PAGESHIFT, iterator) { if (iterator.level == level) { mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, 0, write, 1, &pt_write, level, gfn, pfn, 0, 1); KVM_VCPU_KSTAT_INC(vcpu, kvmvs_pf_fixed); break; } if (*iterator.sptep == shadow_trap_nonpresent_pte) { pseudo_gfn = (iterator.addr & PT64_DIR_BASE_ADDR_MASK) >> PAGESHIFT; sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr, iterator.level - 1, 1, ACC_ALL, iterator.sptep); if (!sp) { cmn_err(CE_WARN, "nonpaging_map: ENOMEM\n"); kvm_release_pfn_clean(pfn); return (-ENOMEM); } __set_spte(iterator.sptep, kvm_va2pa((caddr_t)sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask | shadow_x_mask); } } return (pt_write); } static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) { int r; int level; pfn_t pfn; level = mapping_level(vcpu, gfn); /* * This path builds a PAE pagetable - so we can map 2mb pages at * maximum. Therefore check if the level is larger than that. */ if (level > PT_DIRECTORY_LEVEL) level = PT_DIRECTORY_LEVEL; gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); smp_rmb(); pfn = gfn_to_pfn(vcpu->kvm, gfn); /* mmio */ if (is_error_pfn(pfn)) { kvm_release_pfn_clean(pfn); return (1); } mutex_enter(&vcpu->kvm->mmu_lock); kvm_mmu_free_some_pages(vcpu); r = __direct_map(vcpu, v, write, level, gfn, pfn); mutex_exit(&vcpu->kvm->mmu_lock); return (r); out_unlock: mutex_exit(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); return (0); } static void mmu_free_roots(struct kvm_vcpu *vcpu) { int i; struct kvm_mmu_page *sp; if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; mutex_enter(&vcpu->kvm->mmu_lock); if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; sp = page_header(vcpu->kvm, root); --sp->root_count; if (!sp->root_count && sp->role.invalid) kvm_mmu_zap_page(vcpu->kvm, sp); vcpu->arch.mmu.root_hpa = INVALID_PAGE; mutex_exit(&vcpu->kvm->mmu_lock); return; } for (i = 0; i < 4; i++) { hpa_t root = vcpu->arch.mmu.pae_root[i]; if (root) { root &= PT64_BASE_ADDR_MASK; sp = page_header(vcpu->kvm, root); --sp->root_count; if (!sp->root_count && sp->role.invalid) kvm_mmu_zap_page(vcpu->kvm, sp); } vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; } mutex_exit(&vcpu->kvm->mmu_lock); vcpu->arch.mmu.root_hpa = INVALID_PAGE; } static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) { int ret = 0; if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) { set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); ret = 1; } return (ret); } static int mmu_alloc_roots(struct kvm_vcpu *vcpu) { int i; gfn_t root_gfn; struct kvm_mmu_page *sp; int direct = 0; uint64_t pdptr; root_gfn = vcpu->arch.cr3 >> PAGESHIFT; if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; ASSERT(!VALID_PAGE(root)); if (tdp_enabled) direct = 1; if (mmu_check_root(vcpu, root_gfn)) return (1); mutex_enter(&vcpu->kvm->mmu_lock); sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL, direct, ACC_ALL, NULL); root = kvm_va2pa((caddr_t)sp->spt); ++sp->root_count; mutex_exit(&vcpu->kvm->mmu_lock); vcpu->arch.mmu.root_hpa = root; return (0); } direct = !is_paging(vcpu); if (tdp_enabled) direct = 1; for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu.pae_root[i]; ASSERT(!VALID_PAGE(root)); if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { pdptr = kvm_pdptr_read(vcpu, i); if (!is_present_gpte(pdptr)) { vcpu->arch.mmu.pae_root[i] = 0; continue; } root_gfn = pdptr >> PAGESHIFT; } else if (vcpu->arch.mmu.root_level == 0) root_gfn = 0; if (mmu_check_root(vcpu, root_gfn)) return (1); mutex_enter(&vcpu->kvm->mmu_lock); sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL, direct, ACC_ALL, NULL); root = kvm_va2pa((caddr_t)sp->spt); ++sp->root_count; mutex_exit(&vcpu->kvm->mmu_lock); vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; } vcpu->arch.mmu.root_hpa = kvm_va2pa((caddr_t)vcpu->arch.mmu.pae_root); return (0); } static void mmu_sync_roots(struct kvm_vcpu *vcpu) { int i; struct kvm_mmu_page *sp; if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; sp = page_header(vcpu->kvm, root); mmu_sync_children(vcpu, sp); return; } for (i = 0; i < 4; i++) { hpa_t root = vcpu->arch.mmu.pae_root[i]; if (root && VALID_PAGE(root)) { root &= PT64_BASE_ADDR_MASK; sp = page_header(vcpu->kvm, root); mmu_sync_children(vcpu, sp); } } } void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) { mutex_enter(&vcpu->kvm->mmu_lock); mmu_sync_roots(vcpu); mutex_exit(&vcpu->kvm->mmu_lock); } static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, uint32_t access, uint32_t *error) { if (error) *error = 0; return (vaddr); } static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, uint32_t error_code) { gfn_t gfn; int r; r = mmu_topup_memory_caches(vcpu); if (r) return (r); ASSERT(vcpu); ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); gfn = gva >> PAGESHIFT; return (nonpaging_map(vcpu, gva & PAGEMASK, error_code & PFERR_WRITE_MASK, gfn)); } static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, uint32_t error_code) { pfn_t pfn; int r; int level; gfn_t gfn = gpa >> PAGESHIFT; ASSERT(vcpu); ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); r = mmu_topup_memory_caches(vcpu); if (r) return (r); level = mapping_level(vcpu, gfn); gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); smp_rmb(); pfn = gfn_to_pfn(vcpu->kvm, gfn); if (is_error_pfn(pfn)) { kvm_release_pfn_clean(pfn); return (1); } mutex_enter(&vcpu->kvm->mmu_lock); kvm_mmu_free_some_pages(vcpu); r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, level, gfn, pfn); mutex_exit(&vcpu->kvm->mmu_lock); return (r); out_unlock: mutex_exit(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); return (0); } static void nonpaging_free(struct kvm_vcpu *vcpu) { mmu_free_roots(vcpu); } static int nonpaging_init_context(struct kvm_vcpu *vcpu) { struct kvm_mmu *context = &vcpu->arch.mmu; context->new_cr3 = nonpaging_new_cr3; context->page_fault = nonpaging_page_fault; context->gva_to_gpa = nonpaging_gva_to_gpa; context->free = nonpaging_free; context->prefetch_page = nonpaging_prefetch_page; context->sync_page = nonpaging_sync_page; context->invlpg = nonpaging_invlpg; context->root_level = 0; context->shadow_root_level = PT32E_ROOT_LEVEL; context->root_hpa = INVALID_PAGE; return (0); } void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) { kvm_x86_ops->tlb_flush(vcpu); } static void paging_new_cr3(struct kvm_vcpu *vcpu) { cmn_err(CE_CONT, "!%s: cr3 %lx\n", __func__, vcpu->arch.cr3); mmu_free_roots(vcpu); } static void inject_page_fault(struct kvm_vcpu *vcpu, uint64_t addr, uint32_t err_code) { kvm_inject_page_fault(vcpu, addr, err_code); } static void paging_free(struct kvm_vcpu *vcpu) { nonpaging_free(vcpu); } static int is_rsvd_bits_set(struct kvm_vcpu *vcpu, uint64_t gpte, int level) { int bit7; bit7 = (gpte >> 7) & 1; return ((gpte & vcpu->arch.mmu.rsvd_bits_mask[bit7][level - 1]) != 0); } #define PTTYPE 64 #include "kvm_paging_tmpl.h" #undef PTTYPE #define PTTYPE 32 #include "kvm_paging_tmpl.h" #undef PTTYPE static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) { struct kvm_mmu *context = &vcpu->arch.mmu; int maxphyaddr = cpuid_maxphyaddr(vcpu); uint64_t exb_bit_rsvd = 0; if (!is_nx(vcpu)) exb_bit_rsvd = rsvd_bits(63, 63); switch (level) { case PT32_ROOT_LEVEL: /* no rsvd bits for 2 level 4K page table entries */ context->rsvd_bits_mask[0][1] = 0; context->rsvd_bits_mask[0][0] = 0; if (is_cpuid_PSE36()) /* 36bits PSE 4MB page */ context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21); else /* 32 bits PSE 4MB page */ context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0]; break; case PT32E_ROOT_LEVEL: context->rsvd_bits_mask[0][2] = rsvd_bits(maxphyaddr, 63) | rsvd_bits(7, 8) | rsvd_bits(1, 2); /* PDPTE */ context->rsvd_bits_mask[0][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 62); /* PDE */ context->rsvd_bits_mask[0][0] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 62); /* PTE */ context->rsvd_bits_mask[1][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 62) | rsvd_bits(13, 20); /* large page */ context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0]; break; case PT64_ROOT_LEVEL: context->rsvd_bits_mask[0][3] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8); context->rsvd_bits_mask[0][2] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8); context->rsvd_bits_mask[0][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51); context->rsvd_bits_mask[0][0] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51); context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3]; context->rsvd_bits_mask[1][2] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51) | rsvd_bits(13, 29); context->rsvd_bits_mask[1][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51) | rsvd_bits(13, 20); /* large page */ context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0]; break; } } static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) { struct kvm_mmu *context = &vcpu->arch.mmu; ASSERT(is_pae(vcpu)); context->new_cr3 = paging_new_cr3; context->page_fault = paging64_page_fault; context->gva_to_gpa = paging64_gva_to_gpa; context->prefetch_page = paging64_prefetch_page; context->sync_page = paging64_sync_page; context->invlpg = paging64_invlpg; context->free = paging_free; context->root_level = level; context->shadow_root_level = level; context->root_hpa = INVALID_PAGE; return (0); } static int paging64_init_context(struct kvm_vcpu *vcpu) { reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL); return (paging64_init_context_common(vcpu, PT64_ROOT_LEVEL)); } static int paging32_init_context(struct kvm_vcpu *vcpu) { struct kvm_mmu *context = &vcpu->arch.mmu; reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL); context->new_cr3 = paging_new_cr3; context->page_fault = paging32_page_fault; context->gva_to_gpa = paging32_gva_to_gpa; context->free = paging_free; context->prefetch_page = paging32_prefetch_page; context->sync_page = paging32_sync_page; context->invlpg = paging32_invlpg; context->root_level = PT32_ROOT_LEVEL; context->shadow_root_level = PT32E_ROOT_LEVEL; context->root_hpa = INVALID_PAGE; return (0); } static int paging32E_init_context(struct kvm_vcpu *vcpu) { reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL); return (paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL)); } static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) { struct kvm_mmu *context = &vcpu->arch.mmu; context->new_cr3 = nonpaging_new_cr3; context->page_fault = tdp_page_fault; context->free = nonpaging_free; context->prefetch_page = nonpaging_prefetch_page; context->sync_page = nonpaging_sync_page; context->invlpg = nonpaging_invlpg; context->shadow_root_level = kvm_x86_ops->get_tdp_level(); context->root_hpa = INVALID_PAGE; if (!is_paging(vcpu)) { context->gva_to_gpa = nonpaging_gva_to_gpa; context->root_level = 0; } else if (is_long_mode(vcpu)) { reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL); context->gva_to_gpa = paging64_gva_to_gpa; context->root_level = PT64_ROOT_LEVEL; } else if (is_pae(vcpu)) { reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL); context->gva_to_gpa = paging64_gva_to_gpa; context->root_level = PT32E_ROOT_LEVEL; } else { reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL); context->gva_to_gpa = paging32_gva_to_gpa; context->root_level = PT32_ROOT_LEVEL; } return (0); } static int init_kvm_softmmu(struct kvm_vcpu *vcpu) { int r; ASSERT(vcpu); ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); if (!is_paging(vcpu)) r = nonpaging_init_context(vcpu); else if (is_long_mode(vcpu)) r = paging64_init_context(vcpu); else if (is_pae(vcpu)) r = paging32E_init_context(vcpu); else r = paging32_init_context(vcpu); vcpu->arch.mmu.base_role.glevels = vcpu->arch.mmu.root_level; vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); return (r); } static int init_kvm_mmu(struct kvm_vcpu *vcpu) { vcpu->arch.update_pte.pfn = -1; /* bad_pfn */ if (tdp_enabled) return (init_kvm_tdp_mmu(vcpu)); else return (init_kvm_softmmu(vcpu)); return (0); } static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) { ASSERT(vcpu); if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) { vcpu->arch.mmu.free(vcpu); vcpu->arch.mmu.root_hpa = INVALID_PAGE; } } int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) { destroy_kvm_mmu(vcpu); return (init_kvm_mmu(vcpu)); } int kvm_mmu_load(struct kvm_vcpu *vcpu) { int r; r = mmu_topup_memory_caches(vcpu); if (r) goto out; mutex_enter(&vcpu->kvm->mmu_lock); kvm_mmu_free_some_pages(vcpu); mutex_exit(&vcpu->kvm->mmu_lock); r = mmu_alloc_roots(vcpu); mutex_enter(&vcpu->kvm->mmu_lock); mmu_sync_roots(vcpu); mutex_exit(&vcpu->kvm->mmu_lock); if (r) goto out; /* * set_cr3() should ensure TLB has been flushed */ kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); out: return (r); } void kvm_mmu_unload(struct kvm_vcpu *vcpu) { mmu_free_roots(vcpu); } static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, uint64_t *spte) { uint64_t pte; struct kvm_mmu_page *child; pte = *spte; if (is_shadow_present_pte(pte)) { if (is_last_spte(pte, sp->role.level)) { rmap_remove(vcpu->kvm, spte); } else { child = page_header(vcpu->kvm, pte & PT64_BASE_ADDR_MASK); mmu_page_remove_parent_pte(child, spte); } } __set_spte(spte, shadow_trap_nonpresent_pte); if (is_large_pte(pte)) KVM_KSTAT_DEC(vcpu->kvm, kvmks_lpages); } static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, uint64_t *spte, const void *new) { if (sp->role.level != PT_PAGE_TABLE_LEVEL) { KVM_KSTAT_INC(vcpu->kvm, kvmks_mmu_pte_zapped); return; } KVM_KSTAT_INC(vcpu->kvm, kvmks_mmu_pte_updated); if (sp->role.glevels == PT32_ROOT_LEVEL) paging32_update_pte(vcpu, sp, spte, new); else paging64_update_pte(vcpu, sp, spte, new); } static int need_remote_flush(uint64_t old, uint64_t new) { if (!is_shadow_present_pte(old)) return (0); if (!is_shadow_present_pte(new)) return (1); if ((old ^ new) & PT64_BASE_ADDR_MASK) return (1); old ^= PT64_NX_MASK; new ^= PT64_NX_MASK; return ((old & ~new & PT64_PERM_MASK) != 0); } static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, uint64_t old, uint64_t new) { if (need_remote_flush(old, new)) kvm_flush_remote_tlbs(vcpu->kvm); else kvm_mmu_flush_tlb(vcpu); } static int last_updated_pte_accessed(struct kvm_vcpu *vcpu) { uint64_t *spte = vcpu->arch.last_pte_updated; return (!!(spte && (*spte & shadow_accessed_mask))); } static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, const uint8_t *new, int bytes) { gfn_t gfn; int r; uint64_t gpte = 0; pfn_t pfn; if (bytes != 4 && bytes != 8) return; /* * Assume that the pte write on a page table of the same type * as the current vcpu paging mode. This is nearly always true * (might be false while changing modes). Note it is verified later * by update_pte(). */ if (is_pae(vcpu)) { /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ if ((bytes == 4) && (gpa % 4 == 0)) { r = kvm_read_guest(vcpu->kvm, gpa & ~(uint64_t)7, &gpte, 8); if (r) return; memcpy((void *)((uintptr_t)&gpte + (gpa % 8)), new, 4); } else if ((bytes == 8) && (gpa % 8 == 0)) { memcpy((void *)&gpte, new, 8); } } else { if ((bytes == 4) && (gpa % 4 == 0)) memcpy((void *)&gpte, new, 4); } if (!is_present_gpte(gpte)) return; gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGESHIFT; smp_rmb(); pfn = gfn_to_pfn(vcpu->kvm, gfn); if (is_error_pfn(pfn)) { kvm_release_pfn_clean(pfn); return; } vcpu->arch.update_pte.gfn = gfn; vcpu->arch.update_pte.pfn = pfn; } static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn) { uint64_t *spte = vcpu->arch.last_pte_updated; if (spte && vcpu->arch.last_pte_gfn == gfn && shadow_accessed_mask && !(*spte & shadow_accessed_mask) && is_shadow_present_pte(*spte)) set_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte); } void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, const uint8_t *new, int bytes, int guest_initiated) { gfn_t gfn = gpa >> PAGESHIFT; struct kvm_mmu_page *sp, *nsp = NULL; list_t *bucket; unsigned index; uint64_t entry, gentry; uint64_t *spte; unsigned offset = offset_in_page(gpa); unsigned pte_size; unsigned page_offset; unsigned misaligned; unsigned quadrant; int level; int flooded = 0; int npte; int r; mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes); mutex_enter(&vcpu->kvm->mmu_lock); kvm_mmu_access_page(vcpu, gfn); kvm_mmu_free_some_pages(vcpu); KVM_KSTAT_INC(vcpu->kvm, kvmks_mmu_pte_write); if (guest_initiated) { if (gfn == vcpu->arch.last_pt_write_gfn && !last_updated_pte_accessed(vcpu)) { ++vcpu->arch.last_pt_write_count; if (vcpu->arch.last_pt_write_count >= 3) flooded = 1; } else { vcpu->arch.last_pt_write_gfn = gfn; vcpu->arch.last_pt_write_count = 1; vcpu->arch.last_pte_updated = NULL; } } index = kvm_page_table_hashfn(gfn); bucket = &vcpu->kvm->arch.mmu_page_hash[index]; for (sp = list_head(bucket); sp; sp = nsp) { /* * Keep next list node pointer as we may free the current one */ nsp = list_next(bucket, sp); if (sp->gfn != gfn || sp->role.direct || sp->role.invalid) continue; pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); misaligned |= bytes < 4; if (misaligned || flooded) { /* * Misaligned accesses are too much trouble to fix * up; also, they usually indicate a page is not used * as a page table. * * If we're seeing too many writes to a page, * it may no longer be a page table, or we may be * forking, in which case it is better to unmap the * page. */ if (kvm_mmu_zap_page(vcpu->kvm, sp)) { /* * kvm_mmu_zap_page() freed page(s) from * somewhere in the list, so start walking * again from the head. */ nsp = list_head(bucket); } KVM_KSTAT_INC(vcpu->kvm, kvmks_mmu_flooded); continue; } page_offset = offset; level = sp->role.level; npte = 1; if (sp->role.glevels == PT32_ROOT_LEVEL) { page_offset <<= 1; /* 32->64 */ /* * A 32-bit pde maps 4MB while the shadow pdes map * only 2MB. So we need to double the offset again * and zap two pdes instead of one. */ if (level == PT32_ROOT_LEVEL) { page_offset &= ~7; /* kill rounding error */ page_offset <<= 1; npte = 2; } quadrant = page_offset >> PAGESHIFT; page_offset &= ~PAGEMASK; if (quadrant != sp->role.quadrant) continue; } spte = &sp->spt[page_offset / sizeof (*spte)]; if ((gpa & (pte_size - 1)) || (bytes < pte_size)) { gentry = 0; r = kvm_read_guest_atomic(vcpu->kvm, gpa & ~(uint64_t)(pte_size - 1), &gentry, pte_size); new = (const void *)&gentry; if (r < 0) new = NULL; } while (npte--) { entry = *spte; mmu_pte_write_zap_pte(vcpu, sp, spte); if (new) mmu_pte_write_new_pte(vcpu, sp, spte, new); mmu_pte_write_flush_tlb(vcpu, entry, *spte); ++spte; } } KVM_TRACE1(mmu__audit__post__pte, struct kvm_vcpu *, vcpu); mutex_exit(&vcpu->kvm->mmu_lock); if (!is_error_pfn(vcpu->arch.update_pte.pfn)) { kvm_release_pfn_clean(vcpu->arch.update_pte.pfn); vcpu->arch.update_pte.pfn = bad_pfn; } } int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) { gpa_t gpa; int r; if (tdp_enabled) return (0); gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); mutex_enter(&vcpu->kvm->mmu_lock); r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGESHIFT); mutex_exit(&vcpu->kvm->mmu_lock); return (r); } void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) { while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES && !list_is_empty(&vcpu->kvm->arch.active_mmu_pages)) { struct kvm_mmu_page *sp; sp = list_tail(&vcpu->kvm->arch.active_mmu_pages); kvm_mmu_zap_page(vcpu->kvm, sp); KVM_KSTAT_INC(vcpu->kvm, kvmks_mmu_recycled); } } int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, uint32_t error_code) { int r; enum emulation_result er; if ((r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code)) < 0) return (r); if (r == 0) return (1); if ((r = mmu_topup_memory_caches(vcpu)) != 0) return (r); er = emulate_instruction(vcpu, cr2, error_code, 0); switch (er) { case EMULATE_DONE: return (1); case EMULATE_DO_MMIO: KVM_VCPU_KSTAT_INC(vcpu, kvmvs_mmio_exits); return (0); case EMULATE_FAIL: vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; vcpu->run->internal.ndata = 0; return (0); default: panic("kvm_mmu_page_fault: unknown return " "from emulate_instruction: %x\n", er); } return (0); } void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) { vcpu->arch.mmu.invlpg(vcpu, gva); kvm_mmu_flush_tlb(vcpu); KVM_VCPU_KSTAT_INC(vcpu, kvmvs_invlpg); } void kvm_enable_tdp(void) { tdp_enabled = 1; } void kvm_disable_tdp(void) { tdp_enabled = 0; } static int alloc_mmu_pages(struct kvm_vcpu *vcpu) { page_t *page; int i; ASSERT(vcpu); /* * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. * Therefore we need to allocate shadow page tables in the first * 4GB of memory, which happens to fit the DMA32 zone. * XXX - for right now, ignore DMA32. need to use ddi_dma_mem_alloc * to address this issue... * XXX - also, don't need to allocate a full page, we'll look * at htable_t later on solaris. */ page = alloc_page(KM_SLEEP, &vcpu->arch.mmu.alloc_pae_root); if (!page) return (-ENOMEM); vcpu->arch.mmu.pae_root = (uint64_t *)page_address(page); for (i = 0; i < 4; ++i) vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; return (0); } int kvm_mmu_create(struct kvm_vcpu *vcpu) { int i; ASSERT(vcpu); ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); /* * We'll initialize hash lists here */ for (i = 0; i < KVM_NUM_MMU_PAGES; i++) list_create(&vcpu->kvm->arch.mmu_page_hash[i], sizeof (struct kvm_mmu_page), offsetof(struct kvm_mmu_page, hash_link)); return (alloc_mmu_pages(vcpu)); } int kvm_mmu_setup(struct kvm_vcpu *vcpu) { ASSERT(vcpu); return (init_kvm_mmu(vcpu)); } static void free_mmu_pages(struct kvm_vcpu *vcpu) { kmem_free(vcpu->arch.mmu.alloc_pae_root, PAGESIZE); } static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc, struct kmem_cache *cp) { while (mc->nobjs) kmem_cache_free(cp, mc->objects[--mc->nobjs].kma_object); } static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc) { while (mc->nobjs) kmem_free(mc->objects[--mc->nobjs].kma_object, PAGESIZE); } static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) { mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache); mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache); mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache); mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache, mmu_page_header_cache); } void kvm_mmu_destroy(struct kvm_vcpu *vcpu) { ASSERT(vcpu); destroy_kvm_mmu(vcpu); free_mmu_pages(vcpu); mmu_free_memory_caches(vcpu); } void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) { struct kvm_mmu_page *sp; for (sp = list_head(&kvm->arch.active_mmu_pages); sp != NULL; sp = list_next(&kvm->arch.active_mmu_pages, sp)) { int i; uint64_t *pt; if (!test_bit(slot, sp->slot_bitmap)) continue; pt = sp->spt; for (i = 0; i < PT64_ENT_PER_PAGE; i++) { /* avoid RMW */ if (pt[i] & PT_WRITABLE_MASK) pt[i] &= ~PT_WRITABLE_MASK; } } kvm_flush_remote_tlbs(kvm); } void kvm_mmu_zap_all(struct kvm *kvm) { struct kvm_mmu_page *sp, *nsp; /* * In the following loop, sp may be freed and deleted * from the list indirectly from kvm_mmu_zap_page. * So we hold onto the next element before zapping. */ mutex_enter(&kvm->mmu_lock); for (sp = list_head(&kvm->arch.active_mmu_pages); sp != NULL; sp = nsp) { nsp = list_next(&kvm->arch.active_mmu_pages, sp); if (kvm_mmu_zap_page(kvm, sp)) nsp = list_head(&kvm->arch.active_mmu_pages); } mutex_exit(&kvm->mmu_lock); kvm_flush_remote_tlbs(kvm); } void kvm_mmu_destroy_caches(void) { if (pte_chain_cache) kmem_cache_destroy(pte_chain_cache); if (rmap_desc_cache) kmem_cache_destroy(rmap_desc_cache); if (mmu_page_header_cache) kmem_cache_destroy(mmu_page_header_cache); } int kvm_mmu_module_init(void) { if ((pte_chain_cache = kmem_cache_create("kvm_pte_chain", sizeof (struct kvm_pte_chain), 0, zero_constructor, NULL, NULL, (void *)sizeof (struct kvm_pte_chain), NULL, 0)) == NULL) goto nomem; if ((rmap_desc_cache = kmem_cache_create("kvm_rmap_desc", sizeof (struct kvm_rmap_desc), 0, zero_constructor, NULL, NULL, (void *)sizeof (struct kvm_rmap_desc), NULL, 0)) == NULL) goto nomem; if ((mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", sizeof (struct kvm_mmu_page), 0, zero_constructor, NULL, NULL, (void *)sizeof (struct kvm_mmu_page), NULL, 0)) == NULL) goto nomem; return (0); nomem: kvm_mmu_destroy_caches(); return (ENOMEM); } /* * Caculate mmu pages needed for kvm. */ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm) { int i; unsigned int nr_mmu_pages; unsigned int nr_pages = 0; struct kvm_memslots *slots; mutex_enter(&kvm->memslots_lock); slots = kvm->memslots; for (i = 0; i < slots->nmemslots; i++) nr_pages += slots->memslots[i].npages; mutex_exit(&kvm->memslots_lock); nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000; nr_mmu_pages = MAX(nr_mmu_pages, (unsigned int)KVM_MIN_ALLOC_MMU_PAGES); return (nr_mmu_pages); } int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, uint64_t addr, uint64_t sptes[4]) { struct kvm_shadow_walk_iterator iterator; int nr_sptes = 0; mutex_enter(&vcpu->kvm->mmu_lock); for_each_shadow_entry(vcpu, addr, iterator) { sptes[iterator.level - 1] = *iterator.sptep; nr_sptes++; if (!is_shadow_present_pte(*iterator.sptep)) break; } mutex_exit(&vcpu->kvm->mmu_lock); return (nr_sptes); } void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) { if (vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES) __kvm_mmu_free_some_pages(vcpu); } int kvm_mmu_reload(struct kvm_vcpu *vcpu) { if (vcpu->arch.mmu.root_hpa != INVALID_PAGE) return (0); return (kvm_mmu_load(vcpu)); } int is_present_gpte(unsigned long pte) { return (pte & PT_PRESENT_MASK); } static struct kvm_mmu_page * page_private(kvm_t *kvmp, page_t *page) { kvm_mmu_page_t mp, *res; mp.kmp_avlspt = (uintptr_t)page; mutex_enter(&kvmp->kvm_avllock); res = avl_find(&kvmp->kvm_avlmp, &mp, NULL); mutex_exit(&kvmp->kvm_avllock); ASSERT(res != NULL); return (res); } struct kvm_mmu_page * page_header(kvm_t *kvmp, hpa_t shadow_page) { return (page_private(kvmp, pfn_to_page(shadow_page >> PAGESHIFT))); } int kvm_avlmmucmp(const void *arg1, const void *arg2) { const kvm_mmu_page_t *mp1 = arg1; const kvm_mmu_page_t *mp2 = arg2; if (mp1->kmp_avlspt > mp2->kmp_avlspt) return (1); if (mp1->kmp_avlspt < mp2->kmp_avlspt) return (-1); ASSERT(mp1->kmp_avlspt == mp2->kmp_avlspt); return (0); } inline page_t * compound_head(page_t *page) { return (page); } inline void get_page(page_t *page) { page = compound_head(page); } page_t * pfn_to_page(pfn_t pfn) { return (page_numtopp_nolock(pfn)); } page_t * alloc_page(int flag, void **kma_addr) { caddr_t page_addr; pfn_t pfn; page_t *pp; if ((page_addr = kmem_zalloc(PAGESIZE, flag)) == NULL) return ((page_t *)NULL); *kma_addr = page_addr; pp = page_numtopp_nolock(hat_getpfnum(kas.a_hat, page_addr)); return (pp); } /* * Often times we have pages that correspond to addresses that are in a users * virtual address space. Rather than trying to constantly map them in and out * of our address space we instead go through and use the kpm segment to * facilitate this for us. This always returns an address that is always in the * kernel's virtual address space. */ caddr_t page_address(page_t *page) { return (hat_kpm_mapin_pfn(page->p_pagenum)); }