- Support write protection of entire VM address space
- Split huge PMDs in migration mode
- Write protect dirty pages on dirty log read

Signed-off-by: Mario Smarduch <m.smarduch@xxxxxxxxxxx>
---
 arch/arm/kvm/mmu.c |  158 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 156 insertions(+), 2 deletions(-)

diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 7789857..502e776 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -56,6 +56,13 @@ static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
 	kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
 }
 
+void kvm_tlb_flush_vmid(struct kvm *kvm)
+{
+	/* per kvm_tlb_flush_vmid_ipa()'s description the IPA argument is irrelevant */
+	phys_addr_t ipa = 0;
+	kvm_tlb_flush_vmid_ipa(kvm, ipa);
+}
+
 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
 				  int min, int max)
 {
@@ -639,6 +646,143 @@ static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap)
 	return false;
 }
 
+/*
+ * Called when QEMU retrieves the dirty log; write protects the dirty pages
+ * again for the next dirty log retrieval.
+ */
+void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
+		struct kvm_memory_slot *slot,
+		gfn_t gfn_offset, unsigned long mask)
+{
+	phys_addr_t ipa;
+	pgd_t *pgdp = kvm->arch.pgd, *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte, new_pte;
+
+	while (mask) {
+		ipa = (slot->base_gfn + gfn_offset + __ffs(mask)) << PAGE_SHIFT;
+		pgd = pgdp + pgd_index(ipa);
+		if (!pgd_present(*pgd))
+			goto update_mask;
+		pud = pud_offset(pgd, ipa);
+		if (!pud_present(*pud))
+			goto update_mask;
+		pmd = pmd_offset(pud, ipa);
+		if (!pmd_present(*pmd))
+			goto update_mask;
+		pte = pte_offset_kernel(pmd, ipa);
+		if (!pte_present(*pte))
+			goto update_mask;
+		if ((*pte & L_PTE_S2_RDWR) == L_PTE_S2_RDONLY)
+			goto update_mask;
+		new_pte = pfn_pte(pte_pfn(*pte), PAGE_S2);
+		*pte = new_pte;
+update_mask:
+		mask &= mask - 1;
+	}
+}
+
+/*
+ * During migration, split PMDs into PTEs to track dirty pages at page granularity.
+ * Without splitting, even light guest execution prevents migration from converging.
+ */
+bool split_pmd(struct kvm *kvm, pmd_t *pmd, u64 addr)
+{
+	struct page *page;
+	pfn_t pfn = pmd_pfn(*pmd);
+	pte_t *pte, new_pte;
+	int i;
+
+	page = alloc_page(GFP_ATOMIC);	/* mmu_lock is held, so don't sleep */
+	if (page == NULL)
+		return false;
+
+	pte = page_address(page);
+	for (i = 0; i < PMD_SIZE/PAGE_SIZE; i++) {
+		new_pte = pfn_pte(pfn + i, PAGE_S2);
+		pte[i] = new_pte;
+	}
+	kvm_clean_pte(pte);
+	pmd_populate_kernel(NULL, pmd, pte);
+
+	/*
+	 * Flush the whole TLB for the VM, relying on hardware broadcast.
+	 */
+	kvm_tlb_flush_vmid(kvm);
+	get_page(virt_to_page(pte));
+	return true;
+}
+
+/*
+ * Called from QEMU when migration dirty logging is started. Write protects
+ * the currently mapped set of pages; subsequent writes fault and are write
+ * protected again when the dirty log is retrieved.
+ */
+
+void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte, new_pte;
+	pgd_t *pgdp = kvm->arch.pgd;
+	struct kvm_memory_slot *memslot = id_to_memslot(kvm->memslots, slot);
+	u64 start = memslot->base_gfn << PAGE_SHIFT;
+	u64 end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
+	u64 addr = start, addr1;
+
+	spin_lock(&kvm->mmu_lock);
+	kvm->arch.migration_in_progress = true;
+	while (addr < end) {
+		if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+			kvm_tlb_flush_vmid(kvm);
+			cond_resched_lock(&kvm->mmu_lock);
+		}
+
+		pgd = pgdp + pgd_index(addr);
+		if (!pgd_present(*pgd)) {
+			addr = pgd_addr_end(addr, end);
+			continue;
+		}
+
+		pud = pud_offset(pgd, addr);
+		if (pud_huge(*pud) || !pud_present(*pud)) {
+			addr = pud_addr_end(addr, end);
+			continue;
+		}
+
+		pmd = pmd_offset(pud, addr);
+		if (!pmd_present(*pmd)) {
+			addr = pmd_addr_end(addr, end);
+			continue;
+		}
+
+		if (kvm_pmd_huge(*pmd)) {
+			if (!split_pmd(kvm, pmd, addr)) {
+				kvm->arch.migration_in_progress = false;
+				break;	/* bail out; flush and unlock below */
+			}
+			addr = pmd_addr_end(addr, end);
+			continue;
+		}
+
+		pte = pte_offset_kernel(pmd, addr);
+		addr1 = addr;
+		addr += PAGE_SIZE;
+		if (!pte_present(*pte))
+			continue;
+
+		if ((*pte & L_PTE_S2_RDWR) == L_PTE_S2_RDONLY)
+			continue;
+
+		new_pte = pfn_pte(pte_pfn(*pte), PAGE_S2);
+		*pte = new_pte;
+	}
+	kvm_tlb_flush_vmid(kvm);
+	spin_unlock(&kvm->mmu_lock);
+}
+
 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			  struct kvm_memory_slot *memslot,
 			  unsigned long fault_status)
@@ -652,6 +796,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
 	struct vm_area_struct *vma;
 	pfn_t pfn;
+	bool migration_active = vcpu->kvm->arch.migration_in_progress;
 
 	write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu));
 	if (fault_status == FSC_PERM && !write_fault) {
@@ -705,10 +850,11 @@
 	spin_lock(&kvm->mmu_lock);
 	if (mmu_notifier_retry(kvm, mmu_seq))
 		goto out_unlock;
-	if (!hugetlb && !force_pte)
+	/* During migration don't rebuild huge pages */
+	if (!hugetlb && !force_pte && !migration_active)
 		hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
 
-	if (hugetlb) {
+	if (!migration_active && hugetlb) {
 		pmd_t new_pmd = pfn_pmd(pfn, PAGE_S2);
 		new_pmd = pmd_mkhuge(new_pmd);
 		if (writable) {
@@ -720,6 +866,12 @@
 	} else {
 		pte_t new_pte = pfn_pte(pfn, PAGE_S2);
 		if (writable) {
+			if (migration_active && hugetlb) {
+				/* recover the page-granular pfn from fault_ipa */
+				pfn += (fault_ipa >> PAGE_SHIFT) &
+					((1 << (PMD_SHIFT - PAGE_SHIFT)) - 1);
+				new_pte = pfn_pte(pfn, PAGE_S2);
+			}
 			kvm_set_s2pte_writable(&new_pte);
 			kvm_set_pfn_dirty(pfn);
 		}
@@ -727,6 +879,8 @@
 		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, false);
 	}
 
+	if (writable)
+		mark_page_dirty(kvm, gfn);
 
 out_unlock:
 	spin_unlock(&kvm->mmu_lock);
-- 
1.7.9.5
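
For reference, the loop in kvm_mmu_write_protect_pt_masked() above walks the
dirty bitmap word with the usual clear-lowest-set-bit idiom: __ffs(mask) gives
the offset of the next dirty gfn relative to gfn_offset, and mask &= mask - 1
clears that bit. Below is a minimal userspace sketch of the same walk, for
illustration only and not part of the patch; __builtin_ctzl() stands in for the
kernel's __ffs(), and the gfn values are made up.

#include <stdio.h>

typedef unsigned long long gfn_t;	/* stand-in for the kernel's gfn_t */

/* Visit every set bit of 'mask'; each bit is one dirty page in the slot. */
static void walk_dirty_mask(gfn_t base_gfn, gfn_t gfn_offset, unsigned long mask)
{
	while (mask) {
		/* __builtin_ctzl() = index of the lowest set bit, like __ffs() */
		gfn_t gfn = base_gfn + gfn_offset + __builtin_ctzl(mask);
		printf("would write-protect gfn 0x%llx\n", gfn);
		mask &= mask - 1;	/* clear the lowest set bit */
	}
}

int main(void)
{
	/* bits 0, 3 and 7 set -> gfns 0x100, 0x103 and 0x107 reported */
	walk_dirty_mask(0x100, 0, 0x89);
	return 0;
}

Because each set bit maps to exactly one page, only the PTEs reported dirty
since the previous dirty log read are touched.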