On 04/01/2017 21:43, Cao, Lei wrote:
> Introduce memory tracking data structures and implement the new ioctls
> and support functions.
> 
> Signed-off-by: Lei Cao <lei.cao@xxxxxxxxxxx>
> ---
>  include/linux/kvm_host.h |  25 ++++++
>  virt/kvm/kvm_main.c      | 220 +++++++++++++++++++++++++++++++++++++++++++++--
>  2 files changed, 239 insertions(+), 6 deletions(-)
> 
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 1c5190d..7a85b30 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -204,6 +204,22 @@ struct kvm_mmio_fragment {
>  	unsigned len;
>  };
>  
> +#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
> +struct dirty_gfn_t {
> +	__u32 slot; /* as_id | slot_id */
> +	__u32 pad;
> +	__u64 offset;
> +};
> +
> +struct gfn_list_t {
> +	__u32 dirty_index; /* where to put the next dirty GFN */
> +	__u32 avail_index; /* GFNs before this can be harvested */
> +	__u32 fetch_index; /* the next GFN to be harvested */
> +	__u32 pad;
> +	struct dirty_gfn_t dirty_gfns[0];
> +};

These are part of the userspace API, so they go in include/uapi/linux/kvm.h
and they should be included unconditionally.  Also, as mentioned (indirectly)
in my reply to the cover letter, the types for the userspace API should start
with "kvm_".

Please document in include/linux/kvm.h or virt/kvm/kvm_main.c the
requirements for enabling KVM_DIRTY_LOG_PAGE_OFFSET.  As far as I can see
there are at least the following:

- any memory accesses done by KVM should use kvm_vcpu_write_* instead of
  kvm_write_* if possible, otherwise the per-VM log will fill too fast

- you should provide kvm_arch_mmu_enable_log_dirty_pt_masked and you should
  not have a separate step to synchronize a hardware dirty bitmap with KVM's

The required size of the global log probably correlates to the number of
VCPUs, so what is a good rule of thumb to size the logs?  Could it be
something like "M times the number of VCPUs, plus N" (and document the M and
N of course)?

> @@ -1996,6 +2038,9 @@ int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
>  	r = __copy_to_user((void __user *)ghc->hva + offset, data, len);
>  	if (r)
>  		return -EFAULT;
> +#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
> +	kvm_mt_mark_page_dirty(kvm, ghc->memslot, NULL, gpa >> PAGE_SHIFT);
> +#endif

Please add the kvm and vcpu arguments to mark_page_dirty_in_slot, since
you're modifying all four calls to mark_page_dirty_in_slot itself.

Furthermore, mark_page_dirty_in_slot already has a lot of ingredients that
remove duplicated code:

- it checks for memslot != NULL

- it checks for memslot->dirty_bitmap

- the check for memslot->dirty_bitmap makes it unnecessary to check
  memslot->id against KVM_USER_MEM_SLOTS.
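Then the kvm_mt hook can live in mark_page_dirty_in_slot itself.  Untested
sketch, only to illustrate the idea; it assumes the current body of
mark_page_dirty_in_slot and reuses kvm_mt_mark_page_dirty from your patch,
with vcpu == NULL meaning "use the per-VM log":

static void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_vcpu *vcpu,
                                    struct kvm_memory_slot *memslot, gfn_t gfn)
{
        if (memslot && memslot->dirty_bitmap) {
                unsigned long rel_gfn = gfn - memslot->base_gfn;

#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
                /*
                 * dirty_bitmap is only allocated for user memslots, so no
                 * explicit KVM_USER_MEM_SLOTS check is needed here.
                 */
                kvm_mt_mark_page_dirty(kvm, memslot, vcpu, gfn);
#endif
                set_bit_le(rel_gfn, memslot->dirty_bitmap);
        }
}

with mark_page_dirty passing (kvm, NULL, ...) and kvm_vcpu_mark_page_dirty
passing (vcpu->kvm, vcpu, ...).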
>  	mark_page_dirty_in_slot(ghc->memslot, gpa >> PAGE_SHIFT);
>  
>  	return 0;
> @@ -2076,6 +2121,12 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
>  	struct kvm_memory_slot *memslot;
>  
>  	memslot = gfn_to_memslot(kvm, gfn);
> +#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
> +	if (memslot) {
> +		if (memslot->id >= 0 && memslot->id < KVM_USER_MEM_SLOTS)
> +			kvm_mt_mark_page_dirty(kvm, memslot, NULL, gfn);
> +	}
> +#endif
>  	mark_page_dirty_in_slot(memslot, gfn);
>  }
>  EXPORT_SYMBOL_GPL(mark_page_dirty);
> @@ -2085,6 +2136,13 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
>  	struct kvm_memory_slot *memslot;
>  
>  	memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
> +#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
> +	if (memslot) {
> +		if (memslot->id >= 0 && memslot->id < KVM_USER_MEM_SLOTS)
> +			kvm_mt_mark_page_dirty(vcpu->kvm, memslot,
> +					       vcpu, gfn);
> +	}
> +#endif
>  	mark_page_dirty_in_slot(memslot, gfn);
>  }
>  EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
> @@ -2377,6 +2435,32 @@ static const struct vm_operations_struct kvm_vcpu_vm_ops = {
>  
>  static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
>  {
> +#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
> +	struct kvm_vcpu *vcpu = file->private_data;
> +	unsigned long start, pfn, size;
> +	int ret;
> +	struct gfn_list_t *log;
> +
> +	if (vcpu->kvm->dirty_log_size) {
> +		size = vcpu->kvm->dirty_log_size;
> +		start = vma->vm_start +
> +			KVM_DIRTY_LOG_PAGE_OFFSET*PAGE_SIZE;
> +		log = vcpu->kvm->dirty_logs;
> +		pfn = page_to_pfn(virt_to_page((unsigned long)log));
> +		ret = remap_pfn_range(vma, start, pfn, size,
> +				      vma->vm_page_prot);
> +		if (ret)
> +			return ret;
> +		start = vma->vm_start +
> +			KVM_DIRTY_LOG_PAGE_OFFSET*PAGE_SIZE + size;
> +		log = vcpu->dirty_logs;
> +		pfn = page_to_pfn(virt_to_page((unsigned long)log));
> +		ret = remap_pfn_range(vma, start, pfn, size,
> +				      vma->vm_page_prot);
> +		if (ret)
> +			return ret;
> +	}

Please don't use remap_pfn_range; instead extend kvm_vcpu_fault.

For the VM-wide log, let's mmap the KVM file descriptor instead.  Please add
a .mmap callback to kvm_vm_fops and define a new vm_operations_struct
kvm_vm_vm_ops.

> +#endif
>  	vma->vm_ops = &kvm_vcpu_vm_ops;
>  	return 0;
>  }
> @@ -2946,19 +3030,143 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
>  }
>  
>  #ifdef KVM_DIRTY_LOG_PAGE_OFFSET
> +void kvm_mt_mark_page_dirty(struct kvm *kvm, struct kvm_memory_slot *slot,
> +			    struct kvm_vcpu *vcpu, gfn_t gfn)
> +{
> +	struct gfn_list_t *gfnlist;
> +	int slot_id;
> +	u32 as_id = 0;
> +	u64 offset;
> +
> +	if (!slot || !slot->dirty_bitmap || !kvm->dirty_log_size)
> +		return;
> +
> +	offset = gfn - slot->base_gfn;
> +
> +	if (test_bit_le(offset, slot->dirty_bitmap))
> +		return;
> +
> +	slot_id = slot->id;
> +
> +	if (vcpu)
> +		as_id = kvm_arch_vcpu_memslots_id(vcpu);
> +
> +	if (vcpu)
> +		gfnlist = vcpu->dirty_logs;
> +	else
> +		gfnlist = kvm->dirty_logs;
> +
> +	if (gfnlist->dirty_index == kvm->max_dirty_logs) {
> +		printk(KERN_ERR "dirty log overflow\n");

This should be at least ratelimited, but maybe it should even trigger a
WARN_ONCE.
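For example (untested), the overflow check could be folded into the warning
itself:

	if (WARN_ONCE(gfnlist->dirty_index == kvm->max_dirty_logs,
		      "kvm: dirty log overflow\n"))
		return;

or, if a full WARN is too heavy for something the guest can provoke, keep the
if and use pr_err_ratelimited instead of the plain printk.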
> +		return;
> +	}
> +
> +	if (!vcpu)
> +		spin_lock(&kvm->dirty_log_lock);
> +	gfnlist->dirty_gfns[gfnlist->dirty_index].slot =
> +		(as_id << 16) | slot_id;
> +	gfnlist->dirty_gfns[gfnlist->dirty_index].offset = offset;
> +	smp_wmb();
> +	gfnlist->dirty_index++;
> +	if (!vcpu)
> +		spin_unlock(&kvm->dirty_log_lock);
> +}
> +
>  static int kvm_mt_set_dirty_log_size(struct kvm *kvm, u32 size)
>  {
> -	return -EINVAL;
> +	struct page *page;
> +
> +	if (!size)
> +		return -EINVAL;
> +
> +	kvm->dirty_log_size = get_order(size) * PAGE_SIZE;
> +	kvm->max_dirty_logs = (kvm->dirty_log_size -
> +			       sizeof(struct gfn_list_t)) /
> +			      sizeof(struct dirty_gfn_t);
> +	page = alloc_pages(GFP_KERNEL | __GFP_ZERO, get_order(size));

Please use vmalloc.  We don't want userspace to trigger large-order
allocations.

> +	if (!page) {
> +		kvm_put_kvm(kvm);
> +		return -ENOMEM;
> +	}
> +	kvm->dirty_logs = page_address(page);
> +	spin_lock_init(&kvm->dirty_log_lock);
> +	return 0;
> +}
> +
> +static void kvm_mt_reset_gfn(struct kvm *kvm,
> +			     struct dirty_gfn_t *slot_offset)
> +{
> +	struct kvm_memory_slot *slot;
> +	int as_id, id;
> +
> +	as_id = slot_offset->slot >> 16;
> +	id = (u16)slot_offset->slot;
> +	slot = id_to_memslot(__kvm_memslots(kvm, as_id), id);
> +
> +	clear_bit_le(slot_offset->offset, slot->dirty_bitmap);
> +	kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, slot,
> +						slot_offset->offset, 1);

In case the guest has locality, would it be beneficial to check for
consecutive dirtied pages, and only call
kvm_arch_mmu_enable_log_dirty_pt_masked once?

Paolo

>  }
>  
>  static int kvm_mt_reset_all_gfns(struct kvm *kvm)
>  {
> -	return -EINVAL;
> +	int i, j;
> +	struct kvm_vcpu *vcpu;
> +	int cleared = 0;
> +
> +	if (!kvm->dirty_log_size)
> +		return -EINVAL;
> +
> +	spin_lock(&kvm->mmu_lock);
> +
> +	kvm_for_each_vcpu(i, vcpu, kvm) {
> +		for (j = 0;
> +		     j < vcpu->dirty_logs->dirty_index;
> +		     j++, cleared++)
> +			kvm_mt_reset_gfn(kvm,
> +					 &kvm->dirty_logs->dirty_gfns[j]);
> +		vcpu->dirty_logs->dirty_index = 0;
> +		vcpu->dirty_logs->avail_index = 0;
> +		vcpu->dirty_logs->fetch_index = 0;
> +	}
> +
> +	for (j = 0; j < kvm->dirty_logs->dirty_index; j++, cleared++)
> +		kvm_mt_reset_gfn(kvm, &kvm->dirty_logs->dirty_gfns[j]);
> +	kvm->dirty_logs->dirty_index = 0;
> +	kvm->dirty_logs->avail_index = 0;
> +	kvm->dirty_logs->fetch_index = 0;
> +
> +	spin_unlock(&kvm->mmu_lock);
> +
> +	if (cleared)
> +		kvm_flush_remote_tlbs(kvm);
> +
> +	return 0;
>  }
>  
>  static int kvm_mt_get_dirty_count(struct kvm *kvm, u32 *count)
>  {
> -	return -EINVAL;
> +	int i, avail = 0;
> +	struct kvm_vcpu *vcpu;
> +
> +	if (!kvm->dirty_log_size)
> +		return -EINVAL;
> +
> +	kvm_for_each_vcpu(i, vcpu, kvm) {
> +		vcpu->dirty_logs->avail_index =
> +			READ_ONCE(vcpu->dirty_logs->dirty_index);
> +		avail += vcpu->dirty_logs->avail_index -
> +			 vcpu->dirty_logs->fetch_index;
> +	}
> +
> +	kvm->dirty_logs->avail_index =
> +		READ_ONCE(kvm->dirty_logs->dirty_index);
> +	avail += kvm->dirty_logs->avail_index -
> +		 kvm->dirty_logs->fetch_index;
> +
> +	*count = avail;
> +
> +	return 0;
>  }
>  #endif /* KVM_DIRTY_LOG_PAGE_OFFSET*/
> 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html