On Tue, Jan 10, 2023 at 3:17 PM Nhat Pham <nphamcs@xxxxxxxxx> wrote:
>
> Currently, there is a race between zs_free() and zs_reclaim_page():
> zs_reclaim_page() finds a handle to an allocated object, but before the
> eviction happens, an independent zs_free() call to the same handle could
> come in and overwrite the object value stored at the handle with
> the last deferred handle. When zs_reclaim_page() finally gets to call
> the eviction handler, it will see an invalid object value (i.e. the
> previous deferred handle instead of the original object value).
>
> This race happens quite infrequently. We only managed to produce it with
> out-of-tree developmental code that triggers zsmalloc writeback with a
> much higher frequency than usual.
>
> This patch fixes this race by storing the deferred handle in the object
> header instead. We differentiate the deferred handle from the other two
> cases (handle for allocated object, and linkage for free object) with a
> new tag. If zspage reclamation succeeds, we will free these deferred
> handles by walking through the zspage objects. On the other hand, if
> zspage reclamation fails, we reconstruct the zspage freelist (with the
> deferred handle tag and allocated tag) before trying again with the
> reclamation.
>

Fixes: 9997bc017549 ("zsmalloc: implement writeback mechanism for zsmalloc")

I forgot to add the Fixes tag to this patch.

> Suggested-by: Johannes Weiner <hannes@xxxxxxxxxxx>
> Signed-off-by: Nhat Pham <nphamcs@xxxxxxxxx>
> ---
>  mm/zsmalloc.c | 237 +++++++++++++++++++++++++++++++++++++++++++-------
>  1 file changed, 205 insertions(+), 32 deletions(-)
>
> diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
> index 9445bee6b014..b4b40ef98703 100644
> --- a/mm/zsmalloc.c
> +++ b/mm/zsmalloc.c
> @@ -113,7 +113,23 @@
>   * have room for two bit at least.
>   */
>  #define OBJ_ALLOCATED_TAG 1
> -#define OBJ_TAG_BITS 1
> +
> +#ifdef CONFIG_ZPOOL
> +/*
> + * The second least-significant bit in the object's header identifies if the
> + * value stored at the header is a deferred handle from the last reclaim
> + * attempt.
> + *
> + * As noted above, this is valid because we have room for two bits.
> + */
> +#define OBJ_DEFERRED_HANDLE_TAG 2
> +#define OBJ_TAG_BITS 2
> +#define OBJ_TAG_MASK (OBJ_ALLOCATED_TAG | OBJ_DEFERRED_HANDLE_TAG)
> +#else
> +#define OBJ_TAG_BITS 1
> +#define OBJ_TAG_MASK OBJ_ALLOCATED_TAG
> +#endif /* CONFIG_ZPOOL */
> +
>  #define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)
>  #define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1)
>
> @@ -222,6 +238,12 @@ struct link_free {
> 		 * Handle of allocated object.
> 		 */
> 		unsigned long handle;
> +#ifdef CONFIG_ZPOOL
> +		/*
> +		 * Deferred handle of a reclaimed object.
> +		 */
> +		unsigned long deferred_handle;
> +#endif
> 	};
>  };
>
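Not part of the patch, just a self-check of the encoding: a handle
points into slab-allocated memory, so the low two bits of the word
stored in an object's header are free to carry both tags. A standalone
userspace sketch of the zs_free()-side tagging, with the macro values
copied from the hunk above and everything else mocked:

	#include <assert.h>
	#include <stdio.h>

	#define OBJ_ALLOCATED_TAG 1
	#define OBJ_DEFERRED_HANDLE_TAG 2
	#define OBJ_TAG_MASK (OBJ_ALLOCATED_TAG | OBJ_DEFERRED_HANDLE_TAG)

	int main(void)
	{
		/* mocked handle: slab pointers are at least 4-byte aligned */
		unsigned long handle = 0x1230UL;
		unsigned long word;

		/* zs_free() on an under-reclaim zspage: mark as deferred */
		word = (handle | OBJ_DEFERRED_HANDLE_TAG) & ~(unsigned long)OBJ_ALLOCATED_TAG;
		assert(word & OBJ_DEFERRED_HANDLE_TAG);
		assert(!(word & OBJ_ALLOCATED_TAG));

		/* stripping OBJ_TAG_MASK recovers the original handle */
		assert((word & ~(unsigned long)OBJ_TAG_MASK) == handle);
		printf("round-trip OK: %#lx\n", handle);
		return 0;
	}

This is also why OBJ_TAG_MASK has to cover both bits rather than just
the one being tested.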
> @@ -272,8 +294,6 @@ struct zspage {
> 	/* links the zspage to the lru list in the pool */
> 	struct list_head lru;
> 	bool under_reclaim;
> -	/* list of unfreed handles whose objects have been reclaimed */
> -	unsigned long *deferred_handles;
>  #endif
>
> 	struct zs_pool *pool;
> @@ -897,7 +917,8 @@ static unsigned long handle_to_obj(unsigned long handle)
> 	return *(unsigned long *)handle;
>  }
>
> -static bool obj_allocated(struct page *page, void *obj, unsigned long *phandle)
> +static bool obj_tagged(struct page *page, void *obj, unsigned long *phandle,
> +		int tag)
>  {
> 	unsigned long handle;
> 	struct zspage *zspage = get_zspage(page);
> @@ -908,13 +929,27 @@ static bool obj_allocated(struct page *page, void *obj, unsigned long *phandle)
> 	} else
> 		handle = *(unsigned long *)obj;
>
> -	if (!(handle & OBJ_ALLOCATED_TAG))
> +	if (!(handle & tag))
> 		return false;
>
> -	*phandle = handle & ~OBJ_ALLOCATED_TAG;
> +	/* Clear all tags before returning the handle */
> +	*phandle = handle & ~OBJ_TAG_MASK;
> 	return true;
>  }
>
> +static bool obj_allocated(struct page *page, void *obj, unsigned long *phandle)
> +{
> +	return obj_tagged(page, obj, phandle, OBJ_ALLOCATED_TAG);
> +}
> +
> +#ifdef CONFIG_ZPOOL
> +static bool obj_stores_deferred_handle(struct page *page, void *obj,
> +		unsigned long *phandle)
> +{
> +	return obj_tagged(page, obj, phandle, OBJ_DEFERRED_HANDLE_TAG);
> +}
> +#endif
> +
>  static void reset_page(struct page *page)
>  {
> 	__ClearPageMovable(page);
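A nice property of the obj_tagged() refactor worth noting: it tests
only the requested tag, but always clears the whole OBJ_TAG_MASK, so a
caller never sees a stray tag bit in the returned handle. A userspace
mock of just that decode logic (mock_obj_tagged() is my stand-in, not
the kernel function):

	#include <stdbool.h>
	#include <stdio.h>

	#define OBJ_ALLOCATED_TAG 1
	#define OBJ_DEFERRED_HANDLE_TAG 2
	#define OBJ_TAG_MASK (OBJ_ALLOCATED_TAG | OBJ_DEFERRED_HANDLE_TAG)

	static bool mock_obj_tagged(unsigned long word, unsigned long *phandle,
				    int tag)
	{
		if (!(word & tag))
			return false;
		/* clear all tags before returning the handle */
		*phandle = word & ~(unsigned long)OBJ_TAG_MASK;
		return true;
	}

	int main(void)
	{
		unsigned long h;
		unsigned long deferred = 0x2000UL | OBJ_DEFERRED_HANDLE_TAG;

		/* prints 0: a deferred slot is not an allocated object */
		printf("%d\n", mock_obj_tagged(deferred, &h, OBJ_ALLOCATED_TAG));
		/* prints 1 0x2000: matched, tags stripped */
		printf("%d %#lx\n",
		       mock_obj_tagged(deferred, &h, OBJ_DEFERRED_HANDLE_TAG), h);
		return 0;
	}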
> @@ -946,22 +981,36 @@ static int trylock_zspage(struct zspage *zspage)
>  }
>
>  #ifdef CONFIG_ZPOOL
> +static unsigned long find_deferred_handle_obj(struct size_class *class,
> +		struct page *page, int *obj_idx);
> +
>  /*
>   * Free all the deferred handles whose objects are freed in zs_free.
>   */
> -static void free_handles(struct zs_pool *pool, struct zspage *zspage)
> +static void free_handles(struct zs_pool *pool, struct size_class *class,
> +		struct zspage *zspage)
>  {
> -	unsigned long handle = (unsigned long)zspage->deferred_handles;
> +	int obj_idx = 0;
> +	struct page *page = get_first_page(zspage);
> +	unsigned long handle;
>
> -	while (handle) {
> -		unsigned long nxt_handle = handle_to_obj(handle);
> +	while (1) {
> +		handle = find_deferred_handle_obj(class, page, &obj_idx);
> +		if (!handle) {
> +			page = get_next_page(page);
> +			if (!page)
> +				break;
> +			obj_idx = 0;
> +			continue;
> +		}
>
> 		cache_free_handle(pool, handle);
> -		handle = nxt_handle;
> +		obj_idx++;
> 	}
>  }
>  #else
> -static inline void free_handles(struct zs_pool *pool, struct zspage *zspage) {}
> +static inline void free_handles(struct zs_pool *pool, struct size_class *class,
> +		struct zspage *zspage) {}
>  #endif
>
>  static void __free_zspage(struct zs_pool *pool, struct size_class *class,
> @@ -979,7 +1028,7 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class,
> 	VM_BUG_ON(fg != ZS_EMPTY);
>
> 	/* Free all deferred handles from zs_free */
> -	free_handles(pool, zspage);
> +	free_handles(pool, class, zspage);
>
> 	next = page = get_first_page(zspage);
> 	do {
> @@ -1067,7 +1116,6 @@ static void init_zspage(struct size_class *class, struct zspage *zspage)
>  #ifdef CONFIG_ZPOOL
> 	INIT_LIST_HEAD(&zspage->lru);
> 	zspage->under_reclaim = false;
> -	zspage->deferred_handles = NULL;
>  #endif
>
> 	set_freeobj(zspage, 0);
> @@ -1568,7 +1616,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
>  }
>  EXPORT_SYMBOL_GPL(zs_malloc);
>
> -static void obj_free(int class_size, unsigned long obj)
> +static void obj_free(int class_size, unsigned long obj, unsigned long *handle)
>  {
> 	struct link_free *link;
> 	struct zspage *zspage;
> @@ -1582,15 +1630,29 @@ static void obj_free(int class_size, unsigned long obj)
> 	zspage = get_zspage(f_page);
>
> 	vaddr = kmap_atomic(f_page);
> -
> -	/* Insert this object in containing zspage's freelist */
> 	link = (struct link_free *)(vaddr + f_offset);
> -	if (likely(!ZsHugePage(zspage)))
> -		link->next = get_freeobj(zspage) << OBJ_TAG_BITS;
> -	else
> -		f_page->index = 0;
> +
> +	if (handle) {
> +#ifdef CONFIG_ZPOOL
> +		/* Stores the (deferred) handle in the object's header */
> +		*handle |= OBJ_DEFERRED_HANDLE_TAG;
> +		*handle &= ~OBJ_ALLOCATED_TAG;
> +
> +		if (likely(!ZsHugePage(zspage)))
> +			link->deferred_handle = *handle;
> +		else
> +			f_page->index = *handle;
> +#endif
> +	} else {
> +		/* Insert this object in containing zspage's freelist */
> +		if (likely(!ZsHugePage(zspage)))
> +			link->next = get_freeobj(zspage) << OBJ_TAG_BITS;
> +		else
> +			f_page->index = 0;
> +		set_freeobj(zspage, f_objidx);
> +	}
> +
> 	kunmap_atomic(vaddr);
> -	set_freeobj(zspage, f_objidx);
> 	mod_zspage_inuse(zspage, -1);
>  }
>
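If I'm reading the new free_handles() right, it trades the old
linked-list walk for a linear scan over every object slot of the
zspage, which only runs when the zspage is torn down. A rough
userspace model of that walk (a flat array stands in for the zspage's
pages, and printf stands in for cache_free_handle()):

	#include <stdio.h>

	#define OBJ_DEFERRED_HANDLE_TAG 2UL
	#define OBJ_TAG_MASK 3UL

	int main(void)
	{
		/* mocked headers: 0 = free link, bit 1 set = deferred handle */
		unsigned long slots[] = {
			0, 0x1000UL | OBJ_DEFERRED_HANDLE_TAG,
			0, 0x2000UL | OBJ_DEFERRED_HANDLE_TAG,
		};
		int i;

		for (i = 0; i < 4; i++) {
			if (!(slots[i] & OBJ_DEFERRED_HANDLE_TAG))
				continue;
			/* cache_free_handle(pool, handle) in the real code */
			printf("freeing deferred handle %#lx\n",
			       slots[i] & ~OBJ_TAG_MASK);
		}
		return 0;
	}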
> @@ -1615,7 +1677,6 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
> 	zspage = get_zspage(f_page);
> 	class = zspage_class(pool, zspage);
>
> -	obj_free(class->size, obj);
> 	class_stat_dec(class, OBJ_USED, 1);
>
>  #ifdef CONFIG_ZPOOL
> @@ -1624,15 +1685,15 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
> 		 * Reclaim needs the handles during writeback. It'll free
> 		 * them along with the zspage when it's done with them.
> 		 *
> -		 * Record current deferred handle at the memory location
> -		 * whose address is given by handle.
> +		 * Record current deferred handle in the object's header.
> 		 */
> -		record_obj(handle, (unsigned long)zspage->deferred_handles);
> -		zspage->deferred_handles = (unsigned long *)handle;
> +		obj_free(class->size, obj, &handle);
> 		spin_unlock(&pool->lock);
> 		return;
> 	}
>  #endif
> +	obj_free(class->size, obj, NULL);
> +
> 	fullness = fix_fullness_group(class, zspage);
> 	if (fullness == ZS_EMPTY)
> 		free_zspage(pool, class, zspage);
> @@ -1713,11 +1774,11 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
>  }
>
>  /*
> - * Find alloced object in zspage from index object and
> + * Find object with a certain tag in zspage from index object and
>   * return handle.
>   */
> -static unsigned long find_alloced_obj(struct size_class *class,
> -					struct page *page, int *obj_idx)
> +static unsigned long find_tagged_obj(struct size_class *class,
> +					struct page *page, int *obj_idx, int tag)
>  {
> 	unsigned int offset;
> 	int index = *obj_idx;
> @@ -1728,7 +1789,7 @@ static unsigned long find_alloced_obj(struct size_class *class,
> 	offset += class->size * index;
>
> 	while (offset < PAGE_SIZE) {
> -		if (obj_allocated(page, addr + offset, &handle))
> +		if (obj_tagged(page, addr + offset, &handle, tag))
> 			break;
>
> 		offset += class->size;
> @@ -1742,6 +1803,28 @@ static unsigned long find_alloced_obj(struct size_class *class,
> 	return handle;
>  }
>
> +/*
> + * Find alloced object in zspage from index object and
> + * return handle.
> + */
> +static unsigned long find_alloced_obj(struct size_class *class,
> +					struct page *page, int *obj_idx)
> +{
> +	return find_tagged_obj(class, page, obj_idx, OBJ_ALLOCATED_TAG);
> +}
> +
> +#ifdef CONFIG_ZPOOL
> +/*
> + * Find object storing a deferred handle in header in zspage from index object
> + * and return handle.
> + */
> +static unsigned long find_deferred_handle_obj(struct size_class *class,
> +		struct page *page, int *obj_idx)
> +{
> +	return find_tagged_obj(class, page, obj_idx, OBJ_DEFERRED_HANDLE_TAG);
> +}
> +#endif
> +
>  struct zs_compact_control {
> 	/* Source spage for migration which could be a subpage of zspage */
> 	struct page *s_page;
> @@ -1784,7 +1867,7 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
> 		zs_object_copy(class, free_obj, used_obj);
> 		obj_idx++;
> 		record_obj(handle, free_obj);
> -		obj_free(class->size, used_obj);
> +		obj_free(class->size, used_obj, NULL);
> 	}
>
> 	/* Remember last position in this iteration */
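To spell out the core of the fix: the old code's record_obj(handle, ...)
wrote over the very word that a racing zs_reclaim_page() reads back
through handle_to_obj(). Now everything goes through obj_free(), and
the slab slot behind the handle is never touched. The two modes of the
new obj_free(), modeled in isolation in userspace (mock_obj_free() is
my stand-in; huge pages and kmap are omitted):

	#include <stdio.h>

	#define OBJ_ALLOCATED_TAG 1UL
	#define OBJ_DEFERRED_HANDLE_TAG 2UL
	#define OBJ_TAG_BITS 2

	static void mock_obj_free(unsigned long *header, unsigned long *handle,
				  unsigned int next_free)
	{
		if (handle) {
			/* defer: retag the handle and park it in the header */
			*handle |= OBJ_DEFERRED_HANDLE_TAG;
			*handle &= ~OBJ_ALLOCATED_TAG;
			*header = *handle;
		} else {
			/* normal free: the header becomes a freelist link */
			*header = next_free << OBJ_TAG_BITS;
		}
	}

	int main(void)
	{
		unsigned long header, handle = 0x1000UL | OBJ_ALLOCATED_TAG;

		mock_obj_free(&header, &handle, 0);
		printf("deferred: header=%#lx\n", header); /* 0x1002 */
		mock_obj_free(&header, NULL, 7);
		printf("freed:    header=%#lx\n", header); /* 0x1c */
		return 0;
	}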
> @@ -2478,6 +2561,90 @@ void zs_destroy_pool(struct zs_pool *pool)
>  EXPORT_SYMBOL_GPL(zs_destroy_pool);
>
>  #ifdef CONFIG_ZPOOL
> +static void restore_freelist(struct zs_pool *pool, struct size_class *class,
> +		struct zspage *zspage)
> +{
> +	unsigned int obj_idx = 0;
> +	unsigned long handle, off = 0; /* off is within-page offset */
> +	struct page *page = get_first_page(zspage);
> +	struct link_free *prev_free = NULL;
> +	void *prev_page_vaddr = NULL;
> +
> +	/* in case no free object found */
> +	set_freeobj(zspage, (unsigned int)(-1UL));
> +
> +	while (page) {
> +		void *vaddr = kmap_atomic(page);
> +		struct page *next_page;
> +
> +		while (off < PAGE_SIZE) {
> +			void *obj_addr = vaddr + off;
> +
> +			/* skip allocated object */
> +			if (obj_allocated(page, obj_addr, &handle)) {
> +				obj_idx++;
> +				off += class->size;
> +				continue;
> +			}
> +
> +			/* free deferred handle from reclaim attempt */
> +			if (obj_stores_deferred_handle(page, obj_addr, &handle))
> +				cache_free_handle(pool, handle);
> +
> +			if (prev_free)
> +				prev_free->next = obj_idx << OBJ_TAG_BITS;
> +			else /* first free object found */
> +				set_freeobj(zspage, obj_idx);
> +
> +			prev_free = (struct link_free *)vaddr + off / sizeof(*prev_free);
> +			/* if last free object in a previous page, need to unmap */
> +			if (prev_page_vaddr) {
> +				kunmap_atomic(prev_page_vaddr);
> +				prev_page_vaddr = NULL;
> +			}
> +
> +			obj_idx++;
> +			off += class->size;
> +		}
> +
> +		/*
> +		 * Handle the last (full or partial) object on this page.
> +		 */
> +		next_page = get_next_page(page);
> +		if (next_page) {
> +			if (!prev_free || prev_page_vaddr) {
> +				/*
> +				 * There is no free object in this page, so we can safely
> +				 * unmap it.
> +				 */
> +				kunmap_atomic(vaddr);
> +			} else {
> +				/* update prev_page_vaddr since prev_free is on this page */
> +				prev_page_vaddr = vaddr;
> +			}
> +		} else { /* this is the last page */
> +			if (prev_free) {
> +				/*
> +				 * Reset OBJ_TAG_BITS bit to last link to tell
> +				 * whether it's allocated object or not.
> +				 */
> +				prev_free->next = -1UL << OBJ_TAG_BITS;
> +			}
> +
> +			/* unmap previous page (if not done yet) */
> +			if (prev_page_vaddr) {
> +				kunmap_atomic(prev_page_vaddr);
> +				prev_page_vaddr = NULL;
> +			}
> +
> +			kunmap_atomic(vaddr);
> +		}
> +
> +		page = next_page;
> +		off %= PAGE_SIZE;
> +	}
> +}
> +
>  static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries)
>  {
> 	int i, obj_idx, ret = 0;
> @@ -2561,6 +2728,12 @@ static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries)
> 		return 0;
> 	}
>
> +	/*
> +	 * Eviction fails on one of the handles, so we need to restore zspage.
> +	 * We need to rebuild its freelist (and free stored deferred handles),
> +	 * put it back to the correct size class, and add it to the LRU list.
> +	 */
> +	restore_freelist(pool, class, zspage);
> 	putback_zspage(class, zspage);
> 	list_add(&zspage->lru, &pool->lru);
> 	unlock_zspage(zspage);
> --
> 2.30.2
>
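One last note, mostly for my own understanding of restore_freelist():
the freelist it rebuilds stores, in each free slot's header, the
obj_idx of the next free slot shifted left by OBJ_TAG_BITS, with
-1UL << OBJ_TAG_BITS as the terminator. In miniature (slot indices
assumed, userspace only):

	#include <stdio.h>

	#define OBJ_TAG_BITS 2

	int main(void)
	{
		unsigned long headers[4] = { 0 };

		/* suppose slots 1 and 3 survived reclaim as free objects */
		headers[1] = 3UL << OBJ_TAG_BITS;  /* link: next free is idx 3 */
		headers[3] = -1UL << OBJ_TAG_BITS; /* end-of-list marker */

		printf("slot 1 -> next idx %lu\n", headers[1] >> OBJ_TAG_BITS);
		printf("slot 3 -> %#lx (terminator)\n", headers[3]);
		return 0;
	}

Since neither link value has OBJ_ALLOCATED_TAG or
OBJ_DEFERRED_HANDLE_TAG set, obj_tagged() will correctly treat both
slots as free on the next reclaim attempt.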