On 09/02/2018 22:44, Benjamin Herrenschmidt wrote:
>> (Also, why is this code reinventing most of hva_to_pfn? Not asking you
>> to fix that of course, but perhaps there's something to improve in the
>> generic code too).
>
> I've been wondering the same thing, which led me to try to understand
> hva_to_pfn, which resulted in skull marks in the wall ;-)
>
> So hva_to_pfn does a complicated dance around 3 or 4 different variants
> of gup, and it's really not obvious why; the "intent" doesn't appear to
> be documented clearly. I think I somewhat figured out that it relates
> to the async page fault stuff, but even then I don't actually know what
> those are or what their purpose is :-)

Yeah, hva_to_pfn has all those arguments to tweak its behavior, but you
can treat it as essentially get_user_pages_unlocked:

- if async==false && atomic==false, you can ignore hva_to_pfn_fast
  (hva_to_pfn_fast uses __get_user_pages_fast);

- hva_to_pfn_slow then is essentially get_user_pages_unlocked.
  Optionally it is followed by __get_user_pages_fast, to get a writable
  mapping if the caller would like one but does not require it; you can
  ignore that too if you pass writable==NULL.
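In other words, for async==false, atomic==false and writable==NULL the
whole thing collapses to little more than this (an untested sketch, not
the literal code -- hva_to_pfn_reduced is a made-up name, and I'm
ignoring the VM_IO/VM_PFNMAP fallback through hva_to_pfn_remapped):

/* Rough equivalent of hva_to_pfn(addr, false, NULL, write_fault, NULL). */
static kvm_pfn_t hva_to_pfn_reduced(unsigned long addr, bool write_fault)
{
	struct page *page;
	unsigned int flags = FOLL_HWPOISON;
	int npages;

	if (write_fault)
		flags |= FOLL_WRITE;

	/* This is all that hva_to_pfn_slow really does in this case. */
	npages = get_user_pages_unlocked(addr, 1, &page, flags);
	if (npages == 1)
		return page_to_pfn(page);

	return npages == -EHWPOISON ? KVM_PFN_ERR_HWPOISON
				    : KVM_PFN_ERR_FAULT;
}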
PPC uses get_user_pages_fast; x86 cannot use it directly because we
need FOLL_HWPOISON, and get_user_pages_fast does not support it.
However, get_user_pages_fast is itself a combination of
__get_user_pages_fast and get_user_pages_unlocked.  So if it is useful
for PPC to use get_user_pages_fast, perhaps the generic code should go
down hva_to_pfn_fast unconditionally, even if async and atomic are both
false.

The other big difference is that you follow up with the PageHuge check
to get compound_head/compound_order.  This is a bit like
kvm_host_page_size, but different.

It would be fine for me to add a struct page ** argument to hva_to_pfn
if that is enough to remove the duplicate code in arch/powerpc/kvm/.
See the untested/uncompiled patch below the signature for some ideas.

Thanks,

Paolo

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index b4414842b023..a2c8824ae608 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1339,15 +1339,12 @@ static inline int check_user_page_hwpoison(unsigned long addr)
  * The atomic path to get the writable pfn which will be stored in @pfn,
  * true indicates success, otherwise false is returned.
  */
-static bool hva_to_pfn_fast(unsigned long addr, bool atomic, bool *async,
-			    bool write_fault, bool *writable, kvm_pfn_t *pfn)
+static struct page *__hva_to_page_fast(unsigned long addr, bool write_fault,
+				       bool *writable)
 {
 	struct page *page[1];
 	int npages;
 
-	if (!(async || atomic))
-		return false;
-
 	/*
 	 * Fast pin a writable pfn only if it is a write fault request
 	 * or the caller allows to map a writable pfn for a read fault
@@ -1357,23 +1354,21 @@ static bool hva_to_pfn_fast(unsigned long addr, bool atomic, bool *async,
 		return false;
 
 	npages = __get_user_pages_fast(addr, 1, 1, page);
-	if (npages == 1) {
-		*pfn = page_to_pfn(page[0]);
+	if (npages < 0)
+		return ERR_PTR(npages);
 
-		if (writable)
-			*writable = true;
-		return true;
-	}
+	if (writable)
+		*writable = true;
 
-	return false;
+	return page[0];
 }
 
 /*
  * The slow path to get the pfn of the specified host virtual address,
  * 1 indicates success, -errno is returned if error is detected.
  */
-static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
-			   bool *writable, kvm_pfn_t *pfn)
+static struct page *hva_to_page_unlocked(unsigned long addr, bool *async,
+					 bool write_fault, bool *writable)
 {
 	struct page *page[1];
 	int npages = 0;
@@ -1395,24 +1390,20 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
 
 		npages = get_user_pages_unlocked(addr, 1, page, flags);
 	}
-	if (npages != 1)
-		return npages;
+	if (npages < 0)
+		return ERR_PTR(npages);
 
 	/* map read fault as writable if possible */
 	if (unlikely(!write_fault) && writable) {
-		struct page *wpage[1];
-
-		npages = __get_user_pages_fast(addr, 1, 1, wpage);
-		if (npages == 1) {
-			*writable = true;
+		struct page *wpage;
+		wpage = __hva_to_page_fast(addr, write_fault, writable);
+		if (!IS_ERR(wpage)) {
 			put_page(page[0]);
-			page[0] = wpage[0];
+			return wpage;
 		}
-
-		npages = 1;
 	}
-	*pfn = page_to_pfn(page[0]);
-	return npages;
+
+	return page[0];
 }
 
 static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
@@ -1487,33 +1478,38 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
  * whether the mapping is writable.
  */
 static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
-			bool write_fault, bool *writable)
+			bool write_fault, bool *writable, struct page **p_page)
 {
 	struct vm_area_struct *vma;
 	kvm_pfn_t pfn = 0;
-	int npages, r;
+	struct page *page;
+	int r;
 
 	/* we can do it either atomically or asynchronously, not both */
 	BUG_ON(atomic && async);
 
-	if (hva_to_pfn_fast(addr, atomic, async, write_fault, writable, &pfn))
-		return pfn;
+	page = __hva_to_page_fast(addr, write_fault, writable);
+	if (!IS_ERR(page))
+		goto got_page;
 
 	if (atomic)
 		return KVM_PFN_ERR_FAULT;
 
-	npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn);
-	if (npages == 1)
-		return pfn;
+	page = hva_to_page_unlocked(addr, async, write_fault, writable);
+	if (!IS_ERR(page))
+		goto got_page;
 
 	down_read(&current->mm->mmap_sem);
-	if (npages == -EHWPOISON ||
+	/* FIXME, is check_user_page_hwpoison still needed? */
+	if (PTR_ERR(page) == -EHWPOISON ||
 	    (!async && check_user_page_hwpoison(addr))) {
-		pfn = KVM_PFN_ERR_HWPOISON;
-		goto exit;
+		up_read(&current->mm->mmap_sem);
+		return KVM_PFN_ERR_HWPOISON;
 	}
 
 retry:
+	if (p_page)
+		*p_page = NULL;
 	vma = find_vma_intersection(current->mm, addr, addr + 1);
 
 	if (vma == NULL)
@@ -1529,9 +1524,13 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
 		*async = true;
 		pfn = KVM_PFN_ERR_FAULT;
 	}
-exit:
 	up_read(&current->mm->mmap_sem);
 	return pfn;
+
+got_page:
+	if (p_page)
+		*p_page = page;
+	return page_to_pfn(page);
 }
 
 kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
@@ -1559,7 +1558,7 @@ kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
 	}
 
 	return hva_to_pfn(addr, atomic, async, write_fault,
-			  writable);
+			  writable, NULL);
 }
 EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
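On the consumer side, the idea is that the arch/powerpc/kvm/ fault
handler could then do something along these lines instead of calling
gup itself (equally untested and hypothetical: it assumes
__gfn_to_pfn_memslot also grows a matching struct page ** argument,
which the patch above does not add, and memslot/gfn/writing/write_ok
are stand-ins for whatever the caller already has):

	struct page *page = NULL;
	unsigned int shift = PAGE_SHIFT;
	kvm_pfn_t pfn;

	/* Hypothetical signature with the extra struct page ** argument. */
	pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
				   writing, &write_ok, &page);
	if (is_error_noslot_pfn(pfn))
		return -EFAULT;

	/*
	 * This would replace the open-coded PageHuge check: derive the
	 * host mapping size from the compound page instead of walking
	 * the page tables again.
	 */
	if (page && PageHuge(page))
		shift = PAGE_SHIFT + compound_order(compound_head(page));

That way the only PPC-specific part left is deriving the host mapping
size, and the pinning logic stays in one place.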