When KVM_DIRTY_LOG_INITIALLY_SET is enabled, we can report only those pages that have valid translation entries to userspace, so that userspace does not need to do a zero-check on the other pages during VM migration. On the Huawei Kunpeng 920 2.6GHz platform, I ran some tests on 128G Linux VMs with different page sizes. Time taken to enable dirty logging (the memory pressure is 127GB): Page size Before After 4K 1.8ms 341ms 2M 1.8ms 4ms 1G 1.8ms 2ms Time taken by migration (the memory pressure is 3GB and the migration bandwidth is 500MB/s): Page size Before After 4K 21s 6s 2M 21s 6s 1G 21s 7s Signed-off-by: Keqian Zhu <zhukeqian1@xxxxxxxxxx> --- virt/kvm/arm/mmu.c | 161 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 137 insertions(+), 24 deletions(-) diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index 6c84de442a0e..0c7a5faf8609 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -1413,34 +1413,85 @@ static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap) return false; } +enum s2_operation { + S2_OP_WP, /* write protect page tables */ + S2_OP_MD, /* mark dirty bitmap in memslot */ +}; + /** - * stage2_wp_ptes - write protect PMD range + * mark_range_dirty - mark a range of dirty bitmap + * @kvm: kvm instance for the VM + * @addr: range start address + * @end: range end address + * + * note: addr and end should belong to the same memslot.
+ */ +static void mark_range_dirty(struct kvm *kvm, + phys_addr_t addr, + phys_addr_t end) +{ + gfn_t gfn; + unsigned int start, nbits; + struct kvm_memory_slot *memslot = NULL; + + gfn = addr >> PAGE_SHIFT; + memslot = gfn_to_memslot(kvm, gfn); + + if (memslot && memslot->dirty_bitmap) { + start = gfn - memslot->base_gfn; + nbits = DIV_ROUND_UP(end, PAGE_SIZE) - gfn; + bitmap_set(memslot->dirty_bitmap, start, nbits); + } +} + +/** + * stage2_op_ptes - do an operation on PMD range + * @kvm: kvm instance for the VM + * @op: the operation wanted * @pmd: pointer to pmd entry * @addr: range start address * @end: range end address */ -static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end) +static void stage2_op_ptes(struct kvm *kvm, + enum s2_operation op, + pmd_t *pmd, + phys_addr_t addr, + phys_addr_t end) { pte_t *pte; pte = pte_offset_kernel(pmd, addr); do { - if (!pte_none(*pte)) { + if (pte_none(*pte)) + continue; + + switch (op) { + case S2_OP_WP: if (!kvm_s2pte_readonly(pte)) kvm_set_s2pte_readonly(pte); + break; + case S2_OP_MD: + mark_range_dirty(kvm, addr, addr + PAGE_SIZE); + break; + default: + break; } } while (pte++, addr += PAGE_SIZE, addr != end); } /** - * stage2_wp_pmds - write protect PUD range - * kvm: kvm instance for the VM + * stage2_op_pmds - do an operation on PUD range + * @kvm: kvm instance for the VM + * @op: the operation wanted * @pud: pointer to pud entry * @addr: range start address * @end: range end address */ -static void stage2_wp_pmds(struct kvm *kvm, pud_t *pud, - phys_addr_t addr, phys_addr_t end) +static void stage2_op_pmds(struct kvm *kvm, + enum s2_operation op, + pud_t *pud, + phys_addr_t addr, + phys_addr_t end) { pmd_t *pmd; phys_addr_t next; @@ -1449,25 +1500,40 @@ static void stage2_wp_pmds(struct kvm *kvm, pud_t *pud, do { next = stage2_pmd_addr_end(kvm, addr, end); - if (!pmd_none(*pmd)) { - if (pmd_thp_or_huge(*pmd)) { + if (pmd_none(*pmd)) + continue; + + if (pmd_thp_or_huge(*pmd)) { + switch (op) { + 
case S2_OP_WP: if (!kvm_s2pmd_readonly(pmd)) kvm_set_s2pmd_readonly(pmd); - } else { - stage2_wp_ptes(pmd, addr, next); + break; + case S2_OP_MD: + mark_range_dirty(kvm, addr, next); + break; + default: + break; } + } else { + stage2_op_ptes(kvm, op, pmd, addr, next); } } while (pmd++, addr = next, addr != end); } /** - * stage2_wp_puds - write protect PGD range + * stage2_op_puds - do an operation on PGD range + * @kvm: kvm instance for the VM + * @op: the operation wanted * @pgd: pointer to pgd entry * @addr: range start address * @end: range end address */ -static void stage2_wp_puds(struct kvm *kvm, pgd_t *pgd, - phys_addr_t addr, phys_addr_t end) +static void stage2_op_puds(struct kvm *kvm, + enum s2_operation op, + pgd_t *pgd, + phys_addr_t addr, + phys_addr_t end) { pud_t *pud; phys_addr_t next; @@ -1475,24 +1541,38 @@ static void stage2_wp_puds(struct kvm *kvm, pgd_t *pgd, pud = stage2_pud_offset(kvm, pgd, addr); do { next = stage2_pud_addr_end(kvm, addr, end); - if (!stage2_pud_none(kvm, *pud)) { - if (stage2_pud_huge(kvm, *pud)) { + if (stage2_pud_none(kvm, *pud)) + continue; + + if (stage2_pud_huge(kvm, *pud)) { + switch (op) { + case S2_OP_WP: if (!kvm_s2pud_readonly(pud)) kvm_set_s2pud_readonly(pud); - } else { - stage2_wp_pmds(kvm, pud, addr, next); + break; + case S2_OP_MD: + mark_range_dirty(kvm, addr, next); + break; + default: + break; } + } else { + stage2_op_pmds(kvm, op, pud, addr, next); } } while (pud++, addr = next, addr != end); } /** - * stage2_wp_range() - write protect stage2 memory region range + * stage2_op_range() - do an operation on stage2 memory region range * @kvm: The KVM pointer + * @op: The operation wanted * @addr: Start address of range * @end: End address of range */ -static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) +static void stage2_op_range(struct kvm *kvm, + enum s2_operation op, + phys_addr_t addr, + phys_addr_t end) { pgd_t *pgd; phys_addr_t next; @@ -1513,7 +1593,7 @@ static void 
stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) break; next = stage2_pgd_addr_end(kvm, addr, end); if (stage2_pgd_present(kvm, *pgd)) - stage2_wp_puds(kvm, pgd, addr, next); + stage2_op_puds(kvm, op, pgd, addr, next); } while (pgd++, addr = next, addr != end); } @@ -1543,11 +1623,44 @@ static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; spin_lock(&kvm->mmu_lock); - stage2_wp_range(kvm, start, end); + stage2_op_range(kvm, S2_OP_WP, start, end); spin_unlock(&kvm->mmu_lock); kvm_flush_remote_tlbs(kvm); } +/** + * kvm_mmu_md_memory_region() - mark dirty bitmap for memory slot + * @kvm: The KVM pointer + * @slot: The memory slot to mark dirty + * + * Called to mark dirty bitmap after memory region KVM_MEM_LOG_DIRTY_PAGES + * operation is called and kvm_dirty_log_manual_protect_and_init_set is + * true. After this function returns, a bit of dirty bitmap is set if its + * corresponding page table (including PUD, PMD and PTEs) is present. + * + * Afterwards read of dirty page log can be called and present PUD, PMD and + * PTEs can be write protected by userspace manually. + * + * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired, + * serializing operations for VM memory regions. 
+ */ +static void kvm_mmu_md_memory_region(struct kvm *kvm, int slot) +{ + struct kvm_memslots *slots = kvm_memslots(kvm); + struct kvm_memory_slot *memslot = id_to_memslot(slots, slot); + phys_addr_t start, end; + + if (WARN_ON_ONCE(!memslot)) + return; + + start = memslot->base_gfn << PAGE_SHIFT; + end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; + + spin_lock(&kvm->mmu_lock); + stage2_op_range(kvm, S2_OP_MD, start, end); + spin_unlock(&kvm->mmu_lock); +} + /** * kvm_mmu_write_protect_pt_masked() - write protect dirty pages * @kvm: The KVM pointer @@ -1567,7 +1680,7 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT; phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT; - stage2_wp_range(kvm, start, end); + stage2_op_range(kvm, S2_OP_WP, start, end); } /* @@ -2274,7 +2387,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, * write protect any pages because they're reported * as dirty here. */ - bitmap_set(new->dirty_bitmap, 0, new->npages); + kvm_mmu_md_memory_region(kvm, mem->slot); } } } -- 2.19.1