On Fri, Apr 25, 2014 at 2:48 AM, Dan Streetman <ddstreet@xxxxxxxx> wrote: > On Wed, Apr 23, 2014 at 6:34 AM, Mel Gorman <mgorman@xxxxxxx> wrote: >> On Sat, Apr 12, 2014 at 05:00:53PM -0400, Dan Streetman wrote: >>> Replace the singly-linked list tracking active, i.e. swapon'ed, >>> swap_info_struct entries with a doubly-linked list using struct list_heads. >>> Simplify the logic iterating and manipulating the list of entries, >>> especially get_swap_page(), by using standard list_head functions, >>> and removing the highest priority iteration logic. >>> >>> The change fixes the bug: >>> https://lkml.org/lkml/2014/2/13/181 >>> in which different priority swap entries after the highest priority entry >>> are incorrectly used equally in pairs. The swap behavior is now as >>> advertised, i.e. different priority swap entries are used in order, and >>> equal priority swap targets are used concurrently. >>> >>> Signed-off-by: Dan Streetman <ddstreet@xxxxxxxx> >>> --- >>> include/linux/swap.h | 7 +-- >>> include/linux/swapfile.h | 2 +- >>> mm/frontswap.c | 13 ++-- >>> mm/swapfile.c | 156 +++++++++++++++++------------------------------ >>> 4 files changed, 63 insertions(+), 115 deletions(-) >>> >>> diff --git a/include/linux/swap.h b/include/linux/swap.h >>> index 3507115..96662d8 100644 >>> --- a/include/linux/swap.h >>> +++ b/include/linux/swap.h >>> @@ -214,8 +214,8 @@ struct percpu_cluster { >>> struct swap_info_struct { >>> unsigned long flags; /* SWP_USED etc: see above */ >>> signed short prio; /* swap priority of this type */ >>> + struct list_head list; /* entry in swap list */ >>> signed char type; /* strange name for an index */ >>> - signed char next; /* next type on the swap list */ >>> unsigned int max; /* extent of the swap_map */ >>> unsigned char *swap_map; /* vmalloc'ed array of usage counts */ >>> struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ >>> @@ -255,11 +255,6 @@ struct swap_info_struct { >>> struct swap_cluster_info discard_cluster_tail; /* list tail of discard clusters */ >>> }; >>> >>> -struct swap_list_t { >>> - int head; /* head of priority-ordered swapfile list */ >>> - int next; /* swapfile to be used next */ >>> -}; >>> - >>> /* linux/mm/workingset.c */ >>> void *workingset_eviction(struct address_space *mapping, struct page *page); >>> bool workingset_refault(void *shadow); >>> diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h >>> index e282624..2eab382 100644 >>> --- a/include/linux/swapfile.h >>> +++ b/include/linux/swapfile.h >>> @@ -6,7 +6,7 @@ >>> * want to expose them to the dozens of source files that include swap.h >>> */ >>> extern spinlock_t swap_lock; >>> -extern struct swap_list_t swap_list; >>> +extern struct list_head swap_list_head; >>> extern struct swap_info_struct *swap_info[]; >>> extern int try_to_unuse(unsigned int, bool, unsigned long); >>> >>> diff --git a/mm/frontswap.c b/mm/frontswap.c >>> index 1b24bdc..fae1160 100644 >>> --- a/mm/frontswap.c >>> +++ b/mm/frontswap.c >>> @@ -327,15 +327,12 @@ EXPORT_SYMBOL(__frontswap_invalidate_area); >>> >>> static unsigned long __frontswap_curr_pages(void) >>> { >>> - int type; >>> unsigned long totalpages = 0; >>> struct swap_info_struct *si = NULL; >>> >>> assert_spin_locked(&swap_lock); >>> - for (type = swap_list.head; type >= 0; type = si->next) { >>> - si = swap_info[type]; >>> + list_for_each_entry(si, &swap_list_head, list) >>> totalpages += atomic_read(&si->frontswap_pages); >>> - } >>> return totalpages; >>> } >>> >>> @@ -347,11 +344,9 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused, >>> int si_frontswap_pages; >>> unsigned long total_pages_to_unuse = total; >>> unsigned long pages = 0, pages_to_unuse = 0; >>> - int type; >>> >>> assert_spin_locked(&swap_lock); >>> - for (type = swap_list.head; type >= 0; type = si->next) { >>> - si = swap_info[type]; >>> + list_for_each_entry(si, &swap_list_head, list) { >>> si_frontswap_pages = atomic_read(&si->frontswap_pages); >>> if (total_pages_to_unuse < si_frontswap_pages) { >>> pages = pages_to_unuse = total_pages_to_unuse; >> >> The frontswap shrink code looks suspicious. If the target is smaller than >> the total number of frontswap pages then it does nothing. The callers >> appear to get this right at least. Similarly, if the first swapfile has >> fewer frontswap pages than the target then it does not unuse the target >> number of pages because it only handles one swap file. It's outside the >> scope of your patch to address this or wonder if xen balloon driver is >> really using it the way it's expected. > > I didn't look into the frontswap shrinking code, but I agree the > existing logic there doesn't look right. I'll review frontswap in > more detail to see if it needs changing here, unless anyone else gets > it to first :-) > FYI, I drop the frontswap_shrink code in a patch see: https://lkml.org/lkml/2014/1/27/98 > And as you said, it's outside the scope of this particular patch. > >> >> The old code scanned the files in priority order. Superficially this does >> not appear to but it actually does because you add the swap files to the >> list in priority order during swapon. If you do another revision it's >> worth adding a comment above swap_list_head that the list is ordered by >> priority and protected by the swap_lock. > > Yep you're right, I will add a comment to make clear it's also > priority ordered, and why. The only reason it needs to be priority > ordered is because swapoff has to adjust the auto-priority of any > following swap_info_structs when one is removed, that has an > automatically assigned (i.e. negative) priority. > >> >>> @@ -366,7 +361,7 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused, >>> } >>> vm_unacct_memory(pages); >>> *unused = pages_to_unuse; >>> - *swapid = type; >>> + *swapid = si->type; >>> ret = 0; >>> break; >>> } >>> @@ -413,7 +408,7 @@ void frontswap_shrink(unsigned long target_pages) >>> /* >>> * we don't want to hold swap_lock while doing a very >>> * lengthy try_to_unuse, but swap_list may change >>> - * so restart scan from swap_list.head each time >>> + * so restart scan from swap_list_head each time >>> */ >>> spin_lock(&swap_lock); >>> ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type); >>> diff --git a/mm/swapfile.c b/mm/swapfile.c >>> index 4a7f7e6..b958645 100644 >>> --- a/mm/swapfile.c >>> +++ b/mm/swapfile.c >>> @@ -51,14 +51,14 @@ atomic_long_t nr_swap_pages; >>> /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ >>> long total_swap_pages; >>> static int least_priority; >>> -static atomic_t highest_priority_index = ATOMIC_INIT(-1); >>> >>> static const char Bad_file[] = "Bad swap file entry "; >>> static const char Unused_file[] = "Unused swap file entry "; >>> static const char Bad_offset[] = "Bad swap offset entry "; >>> static const char Unused_offset[] = "Unused swap offset entry "; >>> >>> -struct swap_list_t swap_list = {-1, -1}; >>> +/* all active swap_info */ >>> +LIST_HEAD(swap_list_head); >>> >>> struct swap_info_struct *swap_info[MAX_SWAPFILES]; >>> >>> @@ -640,66 +640,50 @@ no_page: >>> >>> swp_entry_t get_swap_page(void) >>> { >>> - struct swap_info_struct *si; >>> + struct swap_info_struct *si, *next; >>> pgoff_t offset; >>> - int type, next; >>> - int wrapped = 0; >>> - int hp_index; >>> + struct list_head *tmp; >>> >>> spin_lock(&swap_lock); >>> if (atomic_long_read(&nr_swap_pages) <= 0) >>> goto noswap; >>> atomic_long_dec(&nr_swap_pages); >>> >>> - for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { >>> - hp_index = atomic_xchg(&highest_priority_index, -1); >>> - /* >>> - * highest_priority_index records current highest priority swap >>> - * type which just frees swap entries. If its priority is >>> - * higher than that of swap_list.next swap type, we use it. It >>> - * isn't protected by swap_lock, so it can be an invalid value >>> - * if the corresponding swap type is swapoff. We double check >>> - * the flags here. It's even possible the swap type is swapoff >>> - * and swapon again and its priority is changed. In such rare >>> - * case, low prority swap type might be used, but eventually >>> - * high priority swap will be used after several rounds of >>> - * swap. >>> - */ >>> - if (hp_index != -1 && hp_index != type && >>> - swap_info[type]->prio < swap_info[hp_index]->prio && >>> - (swap_info[hp_index]->flags & SWP_WRITEOK)) { >>> - type = hp_index; >>> - swap_list.next = type; >>> - } >>> - >>> - si = swap_info[type]; >>> - next = si->next; >>> - if (next < 0 || >>> - (!wrapped && si->prio != swap_info[next]->prio)) { >>> - next = swap_list.head; >>> - wrapped++; >>> - } >>> - >>> + list_for_each(tmp, &swap_list_head) { >>> + si = list_entry(tmp, typeof(*si), list); >>> spin_lock(&si->lock); >>> - if (!si->highest_bit) { >>> - spin_unlock(&si->lock); >>> - continue; >>> - } >>> - if (!(si->flags & SWP_WRITEOK)) { >>> + if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { >>> spin_unlock(&si->lock); >>> continue; >>> } >>> >>> - swap_list.next = next; >>> + /* >>> + * rotate the current swap_info that we're going to use >>> + * to after any other swap_info that have the same prio, >>> + * so that all equal-priority swap_info get used equally >>> + */ >>> + next = si; >>> + list_for_each_entry_continue(next, &swap_list_head, list) { >>> + if (si->prio != next->prio) >>> + break; >>> + list_rotate_left(&si->list); >>> + next = si; >>> + } >>> >> >> The list manipulations will be a lot of cache writes as the list is shuffled >> around. On slow storage I do not think this will be noticable but it may >> be noticable on faster swap devices that are SSD based. I've added Shaohua >> Li to the cc as he has been concerned with the performance of swap in the >> past. Shaohua, can you run this patchset through any of your test cases >> with the addition that multiple swap files are used to see if the cache >> writes are noticable? You'll need multiple swap files, some of which are >> at equal priority so the list shuffling logic is triggered. > > One performance improvement could be instead of rotating the current > entry past each following same-prio entry, just scan to the end of the > same-prio entries and move the current entry there; that would skip > the extra writes. Especially since this code will run for each > get_swap_page(), no need for any unnecessary writes. > >> >>> spin_unlock(&swap_lock); >>> /* This is called for allocating swap entry for cache */ >>> offset = scan_swap_map(si, SWAP_HAS_CACHE); >>> spin_unlock(&si->lock); >>> if (offset) >>> - return swp_entry(type, offset); >>> + return swp_entry(si->type, offset); >>> spin_lock(&swap_lock); >>> - next = swap_list.next; >>> + /* >>> + * shouldn't really have got here, but for some reason the >>> + * scan_swap_map came back empty for this swap_info. >>> + * Since we dropped the swap_lock, there may now be >>> + * non-full higher prio swap_infos; let's start over. >>> + */ >>> + tmp = &swap_list_head; >>> } >> >> Has this ever triggered? The number of swap pages was examined under the >> swap lock so no other process should have been iterating through the >> swap files. Once a candidate was found, the si lock was acquired for the >> swap scan map so nothing else should have raced with it. > > Well scan_swap_map() does drop the si->lock if it has any trouble at > all finding an offset to use, so I think it's possible that for a > nearly-full si multiple concurrent get_swap_page() calls could enter > scan_swap_map() with the same si, only some of them actually get pages > from the si and then the si becomes full, and the other threads in > scan_swap_map() see it's full and exit in failure. I can update the > code comment there to better indicate why it was reached, instead of > just saying "we shouldn't have got here" :-) > > It may also be worth trying to get a better indicator of "available" > swap_info_structs for use in get_swap_page(), either by looking at > something other than si->highest_bit and/or keeping the si out of the > prio_list until it's actually available for use, not just has a single > entry free. However, that probably won't be simple and might be > better as a separate patch to the rest of these changes. > >> >>> >>> atomic_long_inc(&nr_swap_pages); >>> @@ -766,27 +750,6 @@ out: >>> return NULL; >>> } >>> >>> -/* >>> - * This swap type frees swap entry, check if it is the highest priority swap >>> - * type which just frees swap entry. get_swap_page() uses >>> - * highest_priority_index to search highest priority swap type. The >>> - * swap_info_struct.lock can't protect us if there are multiple swap types >>> - * active, so we use atomic_cmpxchg. >>> - */ >>> -static void set_highest_priority_index(int type) >>> -{ >>> - int old_hp_index, new_hp_index; >>> - >>> - do { >>> - old_hp_index = atomic_read(&highest_priority_index); >>> - if (old_hp_index != -1 && >>> - swap_info[old_hp_index]->prio >= swap_info[type]->prio) >>> - break; >>> - new_hp_index = type; >>> - } while (atomic_cmpxchg(&highest_priority_index, >>> - old_hp_index, new_hp_index) != old_hp_index); >>> -} >>> - >>> static unsigned char swap_entry_free(struct swap_info_struct *p, >>> swp_entry_t entry, unsigned char usage) >>> { >>> @@ -830,7 +793,6 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, >>> p->lowest_bit = offset; >>> if (offset > p->highest_bit) >>> p->highest_bit = offset; >>> - set_highest_priority_index(p->type); >>> atomic_long_inc(&nr_swap_pages); >>> p->inuse_pages--; >>> frontswap_invalidate_page(p->type, offset); >>> @@ -1765,7 +1727,7 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, >>> unsigned char *swap_map, >>> struct swap_cluster_info *cluster_info) >>> { >>> - int i, prev; >>> + struct swap_info_struct *si; >>> >>> if (prio >= 0) >>> p->prio = prio; >>> @@ -1777,18 +1739,21 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, >>> atomic_long_add(p->pages, &nr_swap_pages); >>> total_swap_pages += p->pages; >>> >>> - /* insert swap space into swap_list: */ >>> - prev = -1; >>> - for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { >>> - if (p->prio >= swap_info[i]->prio) >>> - break; >>> - prev = i; >>> + assert_spin_locked(&swap_lock); >>> + BUG_ON(!list_empty(&p->list)); >>> + /* insert into swap list: */ >>> + list_for_each_entry(si, &swap_list_head, list) { >>> + if (p->prio >= si->prio) { >>> + list_add_tail(&p->list, &si->list); >>> + return; >>> + } >> >> An additional comment saying that it must be priority ordered for >> get_swap_page wouldn't kill. > > will do. > >> >>> } >>> - p->next = i; >>> - if (prev < 0) >>> - swap_list.head = swap_list.next = p->type; >>> - else >>> - swap_info[prev]->next = p->type; >>> + /* >>> + * this covers two cases: >>> + * 1) p->prio is less than all existing prio >>> + * 2) the swap list is empty >>> + */ >>> + list_add_tail(&p->list, &swap_list_head); >>> } >>> >>> static void enable_swap_info(struct swap_info_struct *p, int prio, >>> @@ -1823,8 +1788,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) >>> struct address_space *mapping; >>> struct inode *inode; >>> struct filename *pathname; >>> - int i, type, prev; >>> - int err; >>> + int err, found = 0; >>> unsigned int old_block_size; >>> >>> if (!capable(CAP_SYS_ADMIN)) >>> @@ -1842,17 +1806,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) >>> goto out; >>> >>> mapping = victim->f_mapping; >>> - prev = -1; >>> spin_lock(&swap_lock); >>> - for (type = swap_list.head; type >= 0; type = swap_info[type]->next) { >>> - p = swap_info[type]; >>> + list_for_each_entry(p, &swap_list_head, list) { >>> if (p->flags & SWP_WRITEOK) { >>> - if (p->swap_file->f_mapping == mapping) >>> + if (p->swap_file->f_mapping == mapping) { >>> + found = 1; >>> break; >>> + } >>> } >>> - prev = type; >>> } >>> - if (type < 0) { >>> + if (!found) { >>> err = -EINVAL; >>> spin_unlock(&swap_lock); >>> goto out_dput; >>> @@ -1864,20 +1827,15 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) >>> spin_unlock(&swap_lock); >>> goto out_dput; >>> } >>> - if (prev < 0) >>> - swap_list.head = p->next; >>> - else >>> - swap_info[prev]->next = p->next; >>> - if (type == swap_list.next) { >>> - /* just pick something that's safe... */ >>> - swap_list.next = swap_list.head; >>> - } >>> spin_lock(&p->lock); >>> if (p->prio < 0) { >>> - for (i = p->next; i >= 0; i = swap_info[i]->next) >>> - swap_info[i]->prio = p->prio--; >>> + struct swap_info_struct *si = p; >>> + list_for_each_entry_continue(si, &swap_list_head, list) { >>> + si->prio++; >>> + } >>> least_priority++; >>> } >>> + list_del_init(&p->list); >>> atomic_long_sub(p->pages, &nr_swap_pages); >>> total_swap_pages -= p->pages; >>> p->flags &= ~SWP_WRITEOK; >>> @@ -1885,7 +1843,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) >>> spin_unlock(&swap_lock); >>> >>> set_current_oom_origin(); >>> - err = try_to_unuse(type, false, 0); /* force all pages to be unused */ >>> + err = try_to_unuse(p->type, false, 0); /* force unuse all pages */ >>> clear_current_oom_origin(); >>> >>> if (err) { >>> @@ -1926,7 +1884,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) >>> frontswap_map = frontswap_map_get(p); >>> spin_unlock(&p->lock); >>> spin_unlock(&swap_lock); >>> - frontswap_invalidate_area(type); >>> + frontswap_invalidate_area(p->type); >>> frontswap_map_set(p, NULL); >>> mutex_unlock(&swapon_mutex); >>> free_percpu(p->percpu_cluster); >>> @@ -1935,7 +1893,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) >>> vfree(cluster_info); >>> vfree(frontswap_map); >>> /* Destroy swap account information */ >>> - swap_cgroup_swapoff(type); >>> + swap_cgroup_swapoff(p->type); >>> >>> inode = mapping->host; >>> if (S_ISBLK(inode->i_mode)) { >>> @@ -2142,8 +2100,8 @@ static struct swap_info_struct *alloc_swap_info(void) >>> */ >>> } >>> INIT_LIST_HEAD(&p->first_swap_extent.list); >>> + INIT_LIST_HEAD(&p->list); >>> p->flags = SWP_USED; >>> - p->next = -1; >>> spin_unlock(&swap_lock); >>> spin_lock_init(&p->lock); >>> >>> -- >>> 1.8.3.1 >>> >> >> -- >> Mel Gorman >> SUSE Labs > -- > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in > the body of a message to majordomo@xxxxxxxxxxxxxxx > More majordomo info at http://vger.kernel.org/majordomo-info.html > Please read the FAQ at http://www.tux.org/lkml/ -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>