On Mon, 5 August 2013 16:32:02 +0200, Vlastimil Babka wrote:
>
>  /*
> + * Munlock a batch of pages from the same zone
> + *
> + * The work is split to two main phases. First phase clears the Mlocked flag
> + * and attempts to isolate the pages, all under a single zone lru lock.
> + * The second phase finishes the munlock only for pages where isolation
> + * succeeded.
> + */
> +static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
> +{
> +	int i;
> +	int nr = pagevec_count(pvec);
> +
> +	/* Phase 1: page isolation */
> +	spin_lock_irq(&zone->lru_lock);
> +	for (i = 0; i < nr; i++) {
> +		struct page *page = pvec->pages[i];
> +
> +		if (TestClearPageMlocked(page)) {
> +			struct lruvec *lruvec;
> +			int lru;
> +
> +			/* we have disabled interrupts */
> +			__mod_zone_page_state(zone, NR_MLOCK, -1);
> +
> +			switch (__isolate_lru_page(page,
> +					ISOLATE_UNEVICTABLE)) {
> +			case 0:
> +				lruvec = mem_cgroup_page_lruvec(page, zone);
> +				lru = page_lru(page);
> +				del_page_from_lru_list(page, lruvec, lru);
> +				break;
> +
> +			case -EINVAL:
> +				__munlock_isolation_failed(page);
> +				goto skip_munlock;
> +
> +			default:
> +				BUG();
> +			}

On purely aesthetic grounds I don't like the switch too much. A bit
more serious is that you don't handle -EBUSY gracefully. I guess you
would have to mlock() the empty zero page to exercise this code path.
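Untested sketch of what I have in mind, replacing the switch with a
plain branch and treating -EBUSY (or anything else unexpected) like
-EINVAL instead of hitting BUG():

	if (__isolate_lru_page(page, ISOLATE_UNEVICTABLE) == 0) {
		struct lruvec *lruvec;

		lruvec = mem_cgroup_page_lruvec(page, zone);
		del_page_from_lru_list(page, lruvec, page_lru(page));
	} else {
		/* covers -EINVAL, -EBUSY and anything else */
		__munlock_isolation_failed(page);
		goto skip_munlock;
	}

Whether -EBUSY should also be counted via __munlock_isolation_failed()
is debatable, but skipping the munlock and releasing the pin seems
safer than BUG().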
> +		} else {
> +skip_munlock:
> +			/*
> +			 * We won't be munlocking this page in the next phase
> +			 * but we still need to release the follow_page_mask()
> +			 * pin.
> +			 */
> +			pvec->pages[i] = NULL;
> +			put_page(page);
> +		}
> +	}
> +	spin_unlock_irq(&zone->lru_lock);
> +
> +	/* Phase 2: page munlock and putback */
> +	for (i = 0; i < nr; i++) {
> +		struct page *page = pvec->pages[i];
> +
> +		if (unlikely(!page))
> +			continue;

Whenever I see likely() or unlikely() I wonder whether it really makes
a difference or whether it is just cargo-cult programming. My best
guess is that about half of them are cargo-cult.

> +		lock_page(page);
> +		__munlock_isolated_page(page);
> +		unlock_page(page);
> +		put_page(page); /* pin from follow_page_mask() */
> +	}
> +	pagevec_reinit(pvec);
> +}
> +
> +/*
>   * munlock_vma_pages_range() - munlock all pages in the vma range.
>   * @vma - vma containing range to be munlock()ed.
>   * @start - start address in @vma of the range
> @@ -230,11 +315,16 @@ static int __mlock_posix_error_return(long retval)
>  void munlock_vma_pages_range(struct vm_area_struct *vma,
>  		unsigned long start, unsigned long end)
>  {
> +	struct pagevec pvec;
> +	struct zone *zone = NULL;
> +
> +	pagevec_init(&pvec, 0);
>  	vma->vm_flags &= ~VM_LOCKED;
>
>  	while (start < end) {
>  		struct page *page;
>  		unsigned int page_mask, page_increm;
> +		struct zone *pagezone;
>
>  		/*
>  		 * Although FOLL_DUMP is intended for get_dump_page(),
> @@ -246,20 +336,47 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
>  		page = follow_page_mask(vma, start, FOLL_GET | FOLL_DUMP,
>  					&page_mask);
>  		if (page && !IS_ERR(page)) {
> -			lock_page(page);
> -			/*
> -			 * Any THP page found by follow_page_mask() may have
> -			 * gotten split before reaching munlock_vma_page(),
> -			 * so we need to recompute the page_mask here.
> -			 */
> -			page_mask = munlock_vma_page(page);
> -			unlock_page(page);
> -			put_page(page);
> +			pagezone = page_zone(page);
> +			/* The whole pagevec must be in the same zone */
> +			if (pagezone != zone) {
> +				if (pagevec_count(&pvec))
> +					__munlock_pagevec(&pvec, zone);
> +				zone = pagezone;
> +			}
> +			if (PageTransHuge(page)) {
> +				/*
> +				 * THP pages are not handled by pagevec due
> +				 * to their possible split (see below).
> +				 */
> +				if (pagevec_count(&pvec))
> +					__munlock_pagevec(&pvec, zone);

Should you re-initialize the pvec after this call?

> +				lock_page(page);
> +				/*
> +				 * Any THP page found by follow_page_mask() may
> +				 * have gotten split before reaching
> +				 * munlock_vma_page(), so we need to recompute
> +				 * the page_mask here.
> +				 */
> +				page_mask = munlock_vma_page(page);
> +				unlock_page(page);
> +				put_page(page); /* follow_page_mask() */
> +			} else {
> +				/*
> +				 * Non-huge pages are handled in batches
> +				 * via pagevec. The pin from
> +				 * follow_page_mask() prevents them from
> +				 * collapsing by THP.
> +				 */
> +				if (pagevec_add(&pvec, page) == 0)
> +					__munlock_pagevec(&pvec, zone);
> +			}
>  		}
>  		page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
>  		start += page_increm * PAGE_SIZE;
>  		cond_resched();
>  	}
> +	if (pagevec_count(&pvec))
> +		__munlock_pagevec(&pvec, zone);
>  }

The rest looks good to my untrained eyes.

Jörn

--
One of my most productive days was throwing away 1000 lines of code.
-- Ken Thompson.