Thanks, this is much nicer.  I have another generic comment on the ring
buffer implementation.  If you make the size a power of two, you can use
free running counters, i.e. avoid the modulo or mask operation every
time you write them.  For example

	reset_index = 0, dirty_index = 0	->   0 items
	reset_index = 0, dirty_index = 512	-> 512 items
	reset_index = 1, dirty_index = 512	-> 511 items

This also removes the need to leave a free item in the ring buffer.
Of course KVM_ENABLE_CAP_VM then needs to check that the size is a
power of two (a possible check is sketched below).

Also, please add some comments to kvm_gfn_ring.h's definition of
struct kvm_gfn_ring.  The ring buffer is split between struct
kvm_dirty_list and struct kvm_gfn_ring, so it's nice to have some
documentation and a reference to struct kvm_dirty_list.  Feel free to
harvest my review of v2 for some text.

On 03/02/2017 12:06, Cao, Lei wrote:
> +		cur_slot = READ_ONCE(list->dirty_gfns[gfnring->reset_index].slot);
> +		cur_offset = READ_ONCE(list->dirty_gfns[gfnring->reset_index].offset);
> +		mask = 1;
> +		count++;
> +		gfnring->reset_index = (gfnring->reset_index + 1) % gfnring->size;

This "(x + 1) % size" recurs often.  Let's make the size a power of two
and then write it like

	struct kvm_dirty_gfn *entry;

	...

	entry = &list->dirty_gfns[gfnring->reset_index & (gfnring->size - 1)];
	/*
	 * The ring buffer is shared with userspace, which might mmap
	 * it and concurrently modify slot and offset.  Userspace must
	 * not be trusted!  READ_ONCE prevents the compiler from reading
	 * the values again after they've been range-checked (the checks
	 * are in kvm_reset_dirty_gfn).
	 */
	smp_read_barrier_depends();
	cur_slot = READ_ONCE(entry->slot);
	cur_offset = READ_ONCE(entry->offset);
	gfnring->reset_index++;

> +	while (gfnring->reset_index != fetch) {
> +		next_slot =
> +			READ_ONCE(list->dirty_gfns[gfnring->reset_index].slot);
> +		next_offset =
> +			READ_ONCE(list->dirty_gfns[gfnring->reset_index].offset);
> +		if ((next_slot != cur_slot) ||
> +		    (next_offset < cur_offset) ||
> +		    ((next_offset - cur_offset) > (BITS_PER_LONG - 1))) {

No extra parentheses please.

Some optimization and cleanup is also possible:

	entry = &list->dirty_gfns[gfnring->reset_index & (gfnring->size - 1)];
	smp_read_barrier_depends();
	next_slot = READ_ONCE(entry->slot);
	next_offset = READ_ONCE(entry->offset);
	gfnring->reset_index++;

	/*
	 * Try to coalesce the reset operations when the guest is
	 * scanning pages in the same slot.
	 */
	if (next_slot == cur_slot) {
		int delta = next_offset - cur_offset;

		if (delta >= 0 && delta < BITS_PER_LONG) {
			mask |= 1ull << delta;
			continue;
		}

		/* Backwards visit, careful about overflows!  */
		if (delta >= -BITS_PER_LONG && delta < 0 &&
		    (mask << -delta >> -delta) == mask) {
			cur_offset = next_offset;
			mask = (mask << -delta) | 1;
			continue;
		}
	}

	kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask);
	cur_slot = next_slot;
	cur_offset = next_offset;
	mask = 1;

This should handle backwards writes in the guest more effectively
(e.g. large memmove).  Maybe not super common, but easy enough to
handle.

> +			kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask);
> +			cur_slot = next_slot;
> +			cur_offset = next_offset;
> +			mask = 1;
> +		} else
> +			mask |= (u64)1 << (next_offset - cur_offset);
> +		count++;
> +		gfnring->reset_index = (gfnring->reset_index + 1) %
> +			gfnring->size;
> +	}
> +	kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask);
> + [...]
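
By the way, the power-of-two check in KVM_ENABLE_CAP_VM itself is a
one-liner.  Something along these lines, assuming the requested size is
in bytes and the constraint applies to the number of entries
(is_power_of_2 is from linux/log2.h):

	u32 nent = size / sizeof(struct kvm_dirty_gfn);

	/* Free running counters only work with a power-of-two ring. */
	if (!is_power_of_2(nent))
		return -EINVAL;
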
> +#ifdef KVM_DIRTY_LOG_PAGE_OFFSET
> +	if (kvm->dirty_ring_size) {
> +		r = kvm_gfn_ring_alloc(&vcpu->dirty_ring,
> +				       kvm->dirty_ring_size);
> +		if (r) {
> +			kvm->dirty_ring_size = 0;
> +			goto fail_free_run;
> +		}
> +		vcpu->max_dirty_logs =
> +			(kvm->dirty_ring_size/sizeof(struct kvm_dirty_gfn))
> +			- 1 - kvm_cpu_dirty_log_size();
> +	}
> +#endif
> +
> +	kvm->dirty_ring_size = size;
> +	kvm->max_dirty_logs = (size/sizeof(struct kvm_dirty_gfn)) - 1;

Should there be some legroom, in case KVM writes several pages
consecutively?  I would feel safer if you had perhaps 16 empty pages
when exiting to the guest (and then "if (num == max)" becomes
"if (num > max)").

>
> +	if (vcpu) {
> +		gfnlist = &vcpu->dirty_ring;
> +		max = vcpu->max_dirty_logs;
> +	} else {
> +		gfnlist = &kvm->dirty_ring;
> +		max = kvm->max_dirty_logs;
> +	}

This suggests making the soft limit (max_dirty_logs) an argument of
kvm_gfn_ring_alloc.  kvm_gfn_ring_push can return 0 if the soft limit
is not reached, 1 if it is, and -EBUSY if the hard limit is reached.

>
> +
> +	if (((gfnring->dirty_index + 1) % gfnring->size) ==
> +	    gfnring->reset_index)
> +		return -EBUSY;

The spin lock can be taken in the error case too.  It makes it simpler
to understand the code, and the error case is obviously not hot.

> +	if (locked)
> +		spin_lock(&gfnring->lock);
> +
> +	list->dirty_gfns[gfnring->dirty_index].slot = slot;
> +	list->dirty_gfns[gfnring->dirty_index].offset = offset;
> +	smp_wmb();
> +	gfnring->dirty_index = (gfnring->dirty_index+1) % gfnring->size;
> +	num = (gfnring->dirty_index - gfnring->reset_index) % gfnring->size;

Using the free running counter technique, the invariants are clearer
and modulo operations do not clutter the code as much:

	if (locked)
		spin_lock(&gfnring->lock);

	num = (u16)(gfnring->dirty_index - gfnring->reset_index);
	if (num >= gfnring->size) {
		WARN_ON_ONCE(num > gfnring->size);
		r = -EBUSY;
		goto out;
	}

	entry = &list->dirty_gfns[gfnring->dirty_index & (gfnring->size - 1)];
	entry->slot = slot;
	entry->offset = offset;
	smp_wmb();
	gfnring->dirty_index++;
	num++;
	r = num >= gfnring->soft_limit;

out:
	if (locked)
		spin_unlock(&gfnring->lock);
	return r;

Paolo
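
P.S. To be concrete about the alloc change, I mean something like the
sketch below; the exact prototype and types are up to you, and the
soft_limit field is the one the push sketch above consults:

	/* in kvm_gfn_ring.h */
	struct kvm_gfn_ring {
		...
		u32 size;	/* number of entries, a power of two */
		u32 soft_limit;	/* "time to exit to userspace" threshold */
		...
	};

	int kvm_gfn_ring_alloc(struct kvm_gfn_ring *gfnring, u32 size,
			       u32 soft_limit);

With that, the vcpu/vm callers compute their respective limits once at
allocation time, and the max_dirty_logs fields and the lookup in the
"if (vcpu)" block above are no longer needed.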