This teaches gup_fast and __gup_fast to re-enable irqs (and, where possible, call cond_resched()) after every BATCH_PAGES pages. This must be implemented by other archs as well, and it is a requirement before converting more get_user_pages() callers to get_user_pages_fast() as an optimization (instead of using get_user_pages_unlocked(), which would be slower). Signed-off-by: Andrea Arcangeli <aarcange@xxxxxxxxxx> --- arch/x86/mm/gup.c | 234 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 149 insertions(+), 85 deletions(-) diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 2ab183b..917d8c1 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -12,6 +12,12 @@ #include <asm/pgtable.h> +/* + * Keep irq disabled for no more than BATCH_PAGES pages. + * Matches PTRS_PER_PTE (or half in non-PAE kernels). + */ +#define BATCH_PAGES 512 + static inline pte_t gup_get_pte(pte_t *ptep) { #ifndef CONFIG_X86_PAE @@ -250,6 +256,40 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, return 1; } +static inline int __get_user_pages_fast_batch(unsigned long start, + unsigned long end, + int write, struct page **pages) +{ + struct mm_struct *mm = current->mm; + unsigned long next; + unsigned long flags; + pgd_t *pgdp; + int nr = 0; + + /* + * This doesn't prevent pagetable teardown, but does prevent + * the pagetables and pages from being freed on x86. + * + * So long as we atomically load page table pointers versus teardown + * (which we do on x86, with the above PAE exception), we can follow the + * address down to the the page and take a ref on it. + */ + local_irq_save(flags); + pgdp = pgd_offset(mm, start); + do { + pgd_t pgd = *pgdp; + + next = pgd_addr_end(start, end); + if (pgd_none(pgd)) + break; + if (!gup_pud_range(pgd, start, next, write, pages, &nr)) + break; + } while (pgdp++, start = next, start != end); + local_irq_restore(flags); + + return nr; +} + /* * Like get_user_pages_fast() except its IRQ-safe in that it won't fall * back to the regular GUP. 
@@ -257,31 +297,55 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, int __get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) { - struct mm_struct *mm = current->mm; - unsigned long addr, len, end; - unsigned long next; - unsigned long flags; - pgd_t *pgdp; - int nr = 0; + unsigned long len, end, batch_pages; + int nr, ret; start &= PAGE_MASK; - addr = start; len = (unsigned long) nr_pages << PAGE_SHIFT; end = start + len; + /* + * get_user_pages() handles nr_pages == 0 gracefully, but + * gup_fast starts walking the first pagetable in a do {} + * while() fashion so it's not robust to handle nr_pages == + * 0. There's no point in being permissive about end < start + * either. So this check verifies both nr_pages being non + * zero, and that "end" didn't overflow. + */ + VM_BUG_ON(end <= start); if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, (void __user *)start, len))) return 0; - /* - * XXX: batch / limit 'nr', to avoid large irq off latency - * needs some instrumenting to determine the common sizes used by - * important workloads (eg. DB2), and whether limiting the batch size - * will decrease performance. - * - * It seems like we're in the clear for the moment. Direct-IO is - * the main guy that batches up lots of get_user_pages, and even - * they are limited to 64-at-a-time which is not so many. 
- */ + ret = 0; + for (;;) { + batch_pages = nr_pages; + if (batch_pages > BATCH_PAGES && !irqs_disabled()) + batch_pages = BATCH_PAGES; + len = (unsigned long) batch_pages << PAGE_SHIFT; + end = start + len; + nr = __get_user_pages_fast_batch(start, end, write, pages); + VM_BUG_ON(nr > batch_pages); + nr_pages -= nr; + ret += nr; + if (!nr_pages || nr != batch_pages) + break; + start += len; + pages += batch_pages; + } + + return ret; +} + +static inline int get_user_pages_fast_batch(unsigned long start, + unsigned long end, + int write, struct page **pages) +{ + struct mm_struct *mm = current->mm; + unsigned long next; + pgd_t *pgdp; + int nr = 0; + unsigned long orig_start = start; + /* * This doesn't prevent pagetable teardown, but does prevent * the pagetables and pages from being freed on x86. @@ -290,18 +354,24 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, * (which we do on x86, with the above PAE exception), we can follow the * address down to the the page and take a ref on it. 
*/ - local_irq_save(flags); - pgdp = pgd_offset(mm, addr); + local_irq_disable(); + pgdp = pgd_offset(mm, start); do { pgd_t pgd = *pgdp; - next = pgd_addr_end(addr, end); - if (pgd_none(pgd)) + next = pgd_addr_end(start, end); + if (pgd_none(pgd)) { + VM_BUG_ON(nr >= (end-orig_start) >> PAGE_SHIFT); break; - if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) + } + if (!gup_pud_range(pgd, start, next, write, pages, &nr)) { + VM_BUG_ON(nr >= (end-orig_start) >> PAGE_SHIFT); break; - } while (pgdp++, addr = next, addr != end); - local_irq_restore(flags); + } + } while (pgdp++, start = next, start != end); + local_irq_enable(); + + cond_resched(); return nr; } @@ -326,80 +396,74 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) { struct mm_struct *mm = current->mm; - unsigned long addr, len, end; - unsigned long next; - pgd_t *pgdp; - int nr = 0; + unsigned long len, end, batch_pages; + int nr, ret; + unsigned long orig_start; start &= PAGE_MASK; - addr = start; + orig_start = start; len = (unsigned long) nr_pages << PAGE_SHIFT; end = start + len; - if (end < start) - goto slow_irqon; + /* + * get_user_pages() handles nr_pages == 0 gracefully, but + * gup_fast starts walking the first pagetable in a do {} + * while() fashion so it's not robust to handle nr_pages == + * 0. There's no point in being permissive about end < start + * either. So this check verifies both nr_pages being non + * zero, and that "end" didn't overflow. 
+ */ + VM_BUG_ON(end <= start); + nr = ret = 0; #ifdef CONFIG_X86_64 if (end >> __VIRTUAL_MASK_SHIFT) goto slow_irqon; #endif + for (;;) { + batch_pages = min(nr_pages, BATCH_PAGES); + len = (unsigned long) batch_pages << PAGE_SHIFT; + end = start + len; + nr = get_user_pages_fast_batch(start, end, write, pages); + VM_BUG_ON(nr > batch_pages); + nr_pages -= nr; + ret += nr; + if (!nr_pages) + break; + if (nr < batch_pages) + goto slow_irqon; + start += len; + pages += batch_pages; + } - /* - * XXX: batch / limit 'nr', to avoid large irq off latency - * needs some instrumenting to determine the common sizes used by - * important workloads (eg. DB2), and whether limiting the batch size - * will decrease performance. - * - * It seems like we're in the clear for the moment. Direct-IO is - * the main guy that batches up lots of get_user_pages, and even - * they are limited to 64-at-a-time which is not so many. - */ - /* - * This doesn't prevent pagetable teardown, but does prevent - * the pagetables and pages from being freed on x86. - * - * So long as we atomically load page table pointers versus teardown - * (which we do on x86, with the above PAE exception), we can follow the - * address down to the the page and take a ref on it. 
- */ - local_irq_disable(); - pgdp = pgd_offset(mm, addr); - do { - pgd_t pgd = *pgdp; - - next = pgd_addr_end(addr, end); - if (pgd_none(pgd)) - goto slow; - if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) - goto slow; - } while (pgdp++, addr = next, addr != end); - local_irq_enable(); - - VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT); - return nr; - - { - int ret; + VM_BUG_ON(ret != (end - orig_start) >> PAGE_SHIFT); + return ret; -slow: - local_irq_enable(); slow_irqon: - /* Try to get the remaining pages with get_user_pages */ - start += nr << PAGE_SHIFT; - pages += nr; - - ret = get_user_pages_unlocked(current, mm, start, - (end - start) >> PAGE_SHIFT, - write, 0, pages); - - /* Have to be a bit careful with return values */ - if (nr > 0) { - if (ret < 0) - ret = nr; - else - ret += nr; - } + /* Try to get the remaining pages with get_user_pages */ + start += nr << PAGE_SHIFT; + pages += nr; - return ret; + /* + * "nr" was the get_user_pages_fast_batch last retval, "ret" + * was the sum of all get_user_pages_fast_batch retvals, now + * "nr" becomes the sum of all get_user_pages_fast_batch + * retvals and "ret" will become the get_user_pages_unlocked + * retval. + */ + nr = ret; + + ret = get_user_pages_unlocked(current, mm, start, + (end - start) >> PAGE_SHIFT, + write, 0, pages); + + /* Have to be a bit careful with return values */ + if (nr > 0) { + if (ret < 0) + ret = nr; + else + ret += nr; } + + return ret; } -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html