Fuad Tabba <tabba@xxxxxxxxxx> writes:

>> > <snip>
>> >
>> > +/*
>> > + * Registers a callback to __folio_put(), so that gmem knows that the host does
>> > + * not have any references to the folio. It does that by setting the folio type
>> > + * to guestmem.
>> > + *
>> > + * Returns 0 if the host doesn't have any references, or -EAGAIN if the host
>> > + * has references, and the callback has been registered.
>>
>> Note this comment.
>>
>> > + *
>> > + * Must be called with the following locks held:
>> > + * - filemap (inode->i_mapping) invalidate_lock
>> > + * - folio lock
>> > + */
>> > +static int __gmem_register_callback(struct folio *folio, struct inode *inode, pgoff_t idx)
>> > +{
>> > +	struct xarray *mappable_offsets = &kvm_gmem_private(inode)->mappable_offsets;
>> > +	void *xval_guest = xa_mk_value(KVM_GMEM_GUEST_MAPPABLE);
>> > +	int refcount;
>> > +
>> > +	rwsem_assert_held_write_nolockdep(&inode->i_mapping->invalidate_lock);
>> > +	WARN_ON_ONCE(!folio_test_locked(folio));
>> > +
>> > +	if (folio_mapped(folio) || folio_test_guestmem(folio))
>> > +		return -EAGAIN;
>>
>> But here we return -EAGAIN and no callback was registered?
>
> This is intentional. If the folio is still mapped (i.e., its mapcount
> is elevated), then we cannot register the callback yet, so the
> host/vmm needs to unmap first, then try again. That said, I see the
> problem with the comment above, and I will clarify this.
>
>> > +
>> > +	/* Register a callback first. */
>> > +	__folio_set_guestmem(folio);
>> > +
>> > +	/*
>> > +	 * Check for references after setting the type to guestmem, to guard
>> > +	 * against potential races with the refcount being decremented later.
>> > +	 *
>> > +	 * At least one reference is expected because the folio is locked.
>> > +	 */
>> > +
>> > +	refcount = folio_ref_sub_return(folio, folio_nr_pages(folio));
>> > +	if (refcount == 1) {
>> > +		int r;
>> > +
>> > +		/* refcount isn't elevated, it's now faultable by the guest. */
>>
>> Again this seems racy, somebody could have just speculatively increased it.
>> Maybe we need to freeze here as well?
>
> A speculative increase here is ok I think (famous last words). The
> callback was registered before the check, therefore, such an increase
> would trigger the callback.
>
> Thanks,
> /fuad
>

I checked the callback (kvm_gmem_handle_folio_put()) and agree with you
that the mappability reset to KVM_GMEM_GUEST_MAPPABLE is handled
correctly, since kvm_gmem_handle_folio_put() doesn't assume anything
about the mappability state at callback time.

However, what if the new speculative reference writes to the page and
the guest goes on to fault/use the page?
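To make that concrete, the interleaving I'm worried about looks roughly
like this (only a sketch; the folio_try_get() on CPU 1 stands in for a
hypothetical lockless user, e.g. a page-cache/GUP-fast style lookup,
and I may be missing something that rules it out):

  CPU 0 (__gmem_register_callback)          CPU 1 (speculative host user)
  --------------------------------          -----------------------------
  __folio_set_guestmem(folio)
  folio_ref_sub_return(...) == 1
                                            folio_try_get(folio)
  xa_store(.., KVM_GMEM_GUEST_MAPPABLE)
  __kvm_gmem_restore_pending_folio(folio)
  guest faults the page and starts
  using it as private memory
                                            writes to the page contents
                                            folio_put(folio)

Whether or not that final folio_put() still goes through
kvm_gmem_handle_folio_put(), the host write on CPU 1 has already
happened while the guest is treating the page as private.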
>> > +		r = WARN_ON_ONCE(xa_err(xa_store(mappable_offsets, idx, xval_guest, GFP_KERNEL)));
>> > +		if (!r)
>> > +			__kvm_gmem_restore_pending_folio(folio);
>> > +
>> > +		return r;
>> > +	}
>> > +
>> > +	return -EAGAIN;
>> > +}
>> > +
>> > +int kvm_slot_gmem_register_callback(struct kvm_memory_slot *slot, gfn_t gfn)
>> > +{
>> > +	unsigned long pgoff = slot->gmem.pgoff + gfn - slot->base_gfn;
>> > +	struct inode *inode = file_inode(slot->gmem.file);
>> > +	struct folio *folio;
>> > +	int r;
>> > +
>> > +	filemap_invalidate_lock(inode->i_mapping);
>> > +
>> > +	folio = filemap_lock_folio(inode->i_mapping, pgoff);
>> > +	if (WARN_ON_ONCE(IS_ERR(folio))) {
>> > +		r = PTR_ERR(folio);
>> > +		goto out;
>> > +	}
>> > +
>> > +	r = __gmem_register_callback(folio, inode, pgoff);
>> > +
>> > +	folio_unlock(folio);
>> > +	folio_put(folio);
>> > +out:
>> > +	filemap_invalidate_unlock(inode->i_mapping);
>> > +
>> > +	return r;
>> > +}
>> > +
>> > +/*
>> > + * Callback function for __folio_put(), i.e., called when all references by the
>> > + * host to the folio have been dropped. This allows gmem to transition the state
>> > + * of the folio to mappable by the guest, and allows the hypervisor to continue
>> > + * transitioning its state to private, since the host cannot attempt to access
>> > + * it anymore.
>> > + */
>> > +void kvm_gmem_handle_folio_put(struct folio *folio)
>> > +{
>> > +	struct xarray *mappable_offsets;
>> > +	struct inode *inode;
>> > +	pgoff_t index;
>> > +	void *xval;
>> > +
>> > +	inode = folio->mapping->host;
>> > +	index = folio->index;
>> > +	mappable_offsets = &kvm_gmem_private(inode)->mappable_offsets;
>> > +	xval = xa_mk_value(KVM_GMEM_GUEST_MAPPABLE);
>> > +
>> > +	filemap_invalidate_lock(inode->i_mapping);
>> > +	__kvm_gmem_restore_pending_folio(folio);
>> > +	WARN_ON_ONCE(xa_err(xa_store(mappable_offsets, index, xval, GFP_KERNEL)));
>> > +	filemap_invalidate_unlock(inode->i_mapping);
>> > +}
>> > +
>> >  static bool gmem_is_mappable(struct inode *inode, pgoff_t pgoff)
>> >  {
>> >  	struct xarray *mappable_offsets = &kvm_gmem_private(inode)->mappable_offsets;
>>