Hello Chulmin,

On Mon, Apr 04, 2016 at 05:04:16PM +0900, Chulmin Kim wrote:
> On Mar 30, 2016 16:12, Minchan Kim wrote:
> >Currently, we rely on class->lock to prevent zspage destruction.
> >It was okay until now because the critical section is short, but
> >with run-time migration it could be long, so class->lock is not
> >a good approach any more.
> >
> >So, this patch introduces [un]freeze_zspage functions which
> >freeze allocated objects in the zspage by pinning their tags so
> >users cannot free those objects. With those functions, this patch
> >redesigns compaction.
> >
> >Those functions will be used for implementing zspage runtime
> >migration, too.
> >
> >Signed-off-by: Minchan Kim <minchan@xxxxxxxxxx>
> >---
> > mm/zsmalloc.c | 393 ++++++++++++++++++++++++++++++++++++++--------------------
> > 1 file changed, 257 insertions(+), 136 deletions(-)
> >
> >diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
> >index b11dcd718502..ac8ca7b10720 100644
> >--- a/mm/zsmalloc.c
> >+++ b/mm/zsmalloc.c
> >@@ -922,6 +922,13 @@ static unsigned long obj_to_head(struct size_class *class, struct page *page,
> > 	return *(unsigned long *)obj;
> > }
> >
> >+static inline int testpin_tag(unsigned long handle)
> >+{
> >+	unsigned long *ptr = (unsigned long *)handle;
> >+
> >+	return test_bit(HANDLE_PIN_BIT, ptr);
> >+}
> >+
> > static inline int trypin_tag(unsigned long handle)
> > {
> > 	unsigned long *ptr = (unsigned long *)handle;
> >@@ -949,8 +956,7 @@ static void reset_page(struct page *page)
> > 	page->freelist = NULL;
> > }
> >
> >-static void free_zspage(struct zs_pool *pool, struct size_class *class,
> >-			struct page *first_page)
> >+static void free_zspage(struct zs_pool *pool, struct page *first_page)
> > {
> > 	struct page *nextp, *tmp, *head_extra;
> >
> >@@ -973,11 +979,6 @@ static void free_zspage(struct zs_pool *pool, struct size_class *class,
> > 	}
> > 	reset_page(head_extra);
> > 	__free_page(head_extra);
> >-
> >-	zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
> >-			class->size, class->pages_per_zspage));
> >-	atomic_long_sub(class->pages_per_zspage,
> >-			&pool->pages_allocated);
> > }
> >
> > /* Initialize a newly allocated zspage */
> >@@ -1325,6 +1326,11 @@ static bool zspage_full(struct size_class *class, struct page *first_page)
> > 	return get_zspage_inuse(first_page) == class->objs_per_zspage;
> > }
> >
> >+static bool zspage_empty(struct size_class *class, struct page *first_page)
> >+{
> >+	return get_zspage_inuse(first_page) == 0;
> >+}
> >+
> > unsigned long zs_get_total_pages(struct zs_pool *pool)
> > {
> > 	return atomic_long_read(&pool->pages_allocated);
> >@@ -1455,7 +1461,6 @@ static unsigned long obj_malloc(struct size_class *class,
> > 	set_page_private(first_page, handle | OBJ_ALLOCATED_TAG);
> > 	kunmap_atomic(vaddr);
> > 	mod_zspage_inuse(first_page, 1);
> >-	zs_stat_inc(class, OBJ_USED, 1);
> >
> > 	obj = location_to_obj(m_page, obj);
> >
> >@@ -1510,6 +1515,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
> > 	}
> >
> > 	obj = obj_malloc(class, first_page, handle);
> >+	zs_stat_inc(class, OBJ_USED, 1);
> > 	/* Now move the zspage to another fullness group, if required */
> > 	fix_fullness_group(class, first_page);
> > 	record_obj(handle, obj);
> >@@ -1540,7 +1546,6 @@ static void obj_free(struct size_class *class, unsigned long obj)
> > 	kunmap_atomic(vaddr);
> > 	set_freeobj(first_page, f_objidx);
> > 	mod_zspage_inuse(first_page, -1);
> >-	zs_stat_dec(class, OBJ_USED, 1);
> > }
> >
> > void zs_free(struct zs_pool *pool, unsigned long handle)
> >@@ -1564,10 +1569,19 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
> >
> > 	spin_lock(&class->lock);
> > 	obj_free(class, obj);
> >+	zs_stat_dec(class, OBJ_USED, 1);
> > 	fullness = fix_fullness_group(class, first_page);
> >-	if (fullness == ZS_EMPTY)
> >-		free_zspage(pool, class, first_page);
> >+	if (fullness == ZS_EMPTY) {
> >+		zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
> >+				class->size, class->pages_per_zspage));
> >+		spin_unlock(&class->lock);
> >+		atomic_long_sub(class->pages_per_zspage,
> >+				&pool->pages_allocated);
> >+		free_zspage(pool, first_page);
> >+		goto out;
> >+	}
> > 	spin_unlock(&class->lock);
> >+out:
> > 	unpin_tag(handle);
> >
> > 	free_handle(pool, handle);
> >@@ -1637,127 +1651,66 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
> > 	kunmap_atomic(s_addr);
> > }
> >
> >-/*
> >- * Find alloced object in zspage from index object and
> >- * return handle.
> >- */
> >-static unsigned long find_alloced_obj(struct size_class *class,
> >-					struct page *page, int index)
> >+static unsigned long handle_from_obj(struct size_class *class,
> >+				struct page *first_page, int obj_idx)
> > {
> >-	unsigned long head;
> >-	int offset = 0;
> >-	unsigned long handle = 0;
> >-	void *addr = kmap_atomic(page);
> >-
> >-	if (!is_first_page(page))
> >-		offset = page->index;
> >-	offset += class->size * index;
> >-
> >-	while (offset < PAGE_SIZE) {
> >-		head = obj_to_head(class, page, addr + offset);
> >-		if (head & OBJ_ALLOCATED_TAG) {
> >-			handle = head & ~OBJ_ALLOCATED_TAG;
> >-			if (trypin_tag(handle))
> >-				break;
> >-			handle = 0;
> >-		}
> >+	struct page *page;
> >+	unsigned long offset_in_page;
> >+	void *addr;
> >+	unsigned long head, handle = 0;
> >
> >-		offset += class->size;
> >-		index++;
> >-	}
> >+	objidx_to_page_and_offset(class, first_page, obj_idx,
> >+				&page, &offset_in_page);
> >
> >+	addr = kmap_atomic(page);
> >+	head = obj_to_head(class, page, addr + offset_in_page);
> >+	if (head & OBJ_ALLOCATED_TAG)
> >+		handle = head & ~OBJ_ALLOCATED_TAG;
> > 	kunmap_atomic(addr);
> >+
> > 	return handle;
> > }
> >
> >-struct zs_compact_control {
> >-	/* Source page for migration which could be a subpage of zspage. */
> >-	struct page *s_page;
> >-	/* Destination page for migration which should be a first page
> >-	 * of zspage. */
> >-	struct page *d_page;
> >-	/* Starting object index within @s_page which used for live object
> >-	 * in the subpage. */
> >-	int index;
> >-};
> >-
> >-static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
> >-				struct zs_compact_control *cc)
> >+static int migrate_zspage(struct size_class *class, struct page *dst_page,
> >+				struct page *src_page)
> > {
> >-	unsigned long used_obj, free_obj;
> > 	unsigned long handle;
> >-	struct page *s_page = cc->s_page;
> >-	struct page *d_page = cc->d_page;
> >-	unsigned long index = cc->index;
> >-	int ret = 0;
> >+	unsigned long old_obj, new_obj;
> >+	int i;
> >+	int nr_migrated = 0;
> >
> >-	while (1) {
> >-		handle = find_alloced_obj(class, s_page, index);
> >-		if (!handle) {
> >-			s_page = get_next_page(s_page);
> >-			if (!s_page)
> >-				break;
> >-			index = 0;
> >+	for (i = 0; i < class->objs_per_zspage; i++) {
> >+		handle = handle_from_obj(class, src_page, i);
> >+		if (!handle)
> > 			continue;
> >-		}
> >-
> >-		/* Stop if there is no more space */
> >-		if (zspage_full(class, d_page)) {
> >-			unpin_tag(handle);
> >-			ret = -ENOMEM;
> >+		if (zspage_full(class, dst_page))
> > 			break;
> >-		}
> >-
> >-		used_obj = handle_to_obj(handle);
> >-		free_obj = obj_malloc(class, d_page, handle);
> >-		zs_object_copy(class, free_obj, used_obj);
> >-		index++;
> >+		old_obj = handle_to_obj(handle);
> >+		new_obj = obj_malloc(class, dst_page, handle);
> >+		zs_object_copy(class, new_obj, old_obj);
> >+		nr_migrated++;
> > 		/*
> > 		 * record_obj updates handle's value to free_obj and it will
> > 		 * invalidate lock bit(ie, HANDLE_PIN_BIT) of handle, which
> > 		 * breaks synchronization using pin_tag(e,g, zs_free) so
> > 		 * let's keep the lock bit.
> > 		 */
> >-		free_obj |= BIT(HANDLE_PIN_BIT);
> >-		record_obj(handle, free_obj);
> >-		unpin_tag(handle);
> >-		obj_free(class, used_obj);
> >+		new_obj |= BIT(HANDLE_PIN_BIT);
> >+		record_obj(handle, new_obj);
> >+		obj_free(class, old_obj);
> > 	}
> >-
> >-	/* Remember last position in this iteration */
> >-	cc->s_page = s_page;
> >-	cc->index = index;
> >-
> >-	return ret;
> >-}
> >-
> >-static struct page *isolate_target_page(struct size_class *class)
> >-{
> >-	int i;
> >-	struct page *page;
> >-
> >-	for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
> >-		page = class->fullness_list[i];
> >-		if (page) {
> >-			remove_zspage(class, i, page);
> >-			break;
> >-		}
> >-	}
> >-
> >-	return page;
> >+	return nr_migrated;
> > }
> >
> > /*
> >  * putback_zspage - add @first_page into right class's fullness list
> >- * @pool: target pool
> >  * @class: destination class
> >  * @first_page: target page
> >  *
> >  * Return @first_page's updated fullness_group
> >  */
> >-static enum fullness_group putback_zspage(struct zs_pool *pool,
> >-				struct size_class *class,
> >-				struct page *first_page)
> >+static enum fullness_group putback_zspage(struct size_class *class,
> >+				struct page *first_page)
> > {
> > 	enum fullness_group fullness;
> >
> >@@ -1768,17 +1721,155 @@ static enum fullness_group putback_zspage(struct zs_pool *pool,
> > 	return fullness;
> > }
> >
> >+/*
> >+ * freeze_zspage - freeze all objects in a zspage
> >+ * @class: size class of the page
> >+ * @first_page: first page of zspage
> >+ *
> >+ * Freeze all allocated objects in a zspage so objects couldn't be
> >+ * freed until unfreeze objects. It should be called under class->lock.
> >+ *
> >+ * RETURNS:
> >+ * the number of pinned objects
> >+ */
> >+static int freeze_zspage(struct size_class *class, struct page *first_page)
> >+{
> >+	unsigned long obj_idx;
> >+	struct page *obj_page;
> >+	unsigned long offset;
> >+	void *addr;
> >+	int nr_freeze = 0;
> >+
> >+	for (obj_idx = 0; obj_idx < class->objs_per_zspage; obj_idx++) {
> >+		unsigned long head;
> >+
> >+		objidx_to_page_and_offset(class, first_page, obj_idx,
> >+					&obj_page, &offset);
> >+		addr = kmap_atomic(obj_page);
> >+		head = obj_to_head(class, obj_page, addr + offset);
> >+		if (head & OBJ_ALLOCATED_TAG) {
> >+			unsigned long handle = head & ~OBJ_ALLOCATED_TAG;
> >+
> >+			if (!trypin_tag(handle)) {
> >+				kunmap_atomic(addr);
> >+				break;
> >+			}
> >+			nr_freeze++;
> >+		}
> >+		kunmap_atomic(addr);
> >+	}
> >+
> >+	return nr_freeze;
> >+}
> >+
> >+/*
> >+ * unfreeze_page - unfreeze objects freezed by freeze_zspage in a zspage
> >+ * @class: size class of the page
> >+ * @first_page: freezed zspage to unfreeze
> >+ * @nr_obj: the number of objects to unfreeze
> >+ *
> >+ * unfreeze objects in a zspage.
> >+ */
> >+static void unfreeze_zspage(struct size_class *class, struct page *first_page,
> >+				int nr_obj)
> >+{
> >+	unsigned long obj_idx;
> >+	struct page *obj_page;
> >+	unsigned long offset;
> >+	void *addr;
> >+	int nr_unfreeze = 0;
> >+
> >+	for (obj_idx = 0; obj_idx < class->objs_per_zspage &&
> >+			nr_unfreeze < nr_obj; obj_idx++) {
> >+		unsigned long head;
> >+
> >+		objidx_to_page_and_offset(class, first_page, obj_idx,
> >+					&obj_page, &offset);
> >+		addr = kmap_atomic(obj_page);
> >+		head = obj_to_head(class, obj_page, addr + offset);
> >+		if (head & OBJ_ALLOCATED_TAG) {
> >+			unsigned long handle = head & ~OBJ_ALLOCATED_TAG;
> >+
> >+			VM_BUG_ON(!testpin_tag(handle));
> >+			unpin_tag(handle);
> >+			nr_unfreeze++;
> >+		}
> >+		kunmap_atomic(addr);
> >+	}
> >+}
> >+
> >+/*
> >+ * isolate_source_page - isolate a zspage for migration source
> >+ * @class: size class of zspage for isolation
> >+ *
> >+ * Returns a zspage which are isolated from list so anyone can
> >+ * allocate a object from that page. As well, freeze all objects
> >+ * allocated in the zspage so anyone cannot access that objects
> >+ * (e.g., zs_map_object, zs_free).
> >+ */
> > static struct page *isolate_source_page(struct size_class *class)
> > {
> > 	int i;
> > 	struct page *page = NULL;
> >
> > 	for (i = ZS_ALMOST_EMPTY; i >= ZS_ALMOST_FULL; i--) {
> >+		int inuse, freezed;
> >+
> > 		page = class->fullness_list[i];
> > 		if (!page)
> > 			continue;
> >
> > 		remove_zspage(class, i, page);
> >+
> >+		inuse = get_zspage_inuse(page);
> >+		freezed = freeze_zspage(class, page);
> >+
> >+		if (inuse != freezed) {
> >+			unfreeze_zspage(class, page, freezed);
> >+			putback_zspage(class, page);
> >+			page = NULL;
> >+			continue;
> >+		}
> >+
> >+		break;
> >+	}
> >+
> >+	return page;
> >+}
> >+
> >+/*
> >+ * isolate_target_page - isolate a zspage for migration target
> >+ * @class: size class of zspage for isolation
> >+ *
> >+ * Returns a zspage which are isolated from list so anyone can
> >+ * allocate a object from that page. As well, freeze all objects
> >+ * allocated in the zspage so anyone cannot access that objects
> >+ * (e.g., zs_map_object, zs_free).
> >+ */
> >+static struct page *isolate_target_page(struct size_class *class)
> >+{
> >+	int i;
> >+	struct page *page;
> >+
> >+	for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
> >+		int inuse, freezed;
> >+
> >+		page = class->fullness_list[i];
> >+		if (!page)
> >+			continue;
> >+
> >+		remove_zspage(class, i, page);
> >+
> >+		inuse = get_zspage_inuse(page);
> >+		freezed = freeze_zspage(class, page);
> >+
> >+		if (inuse != freezed) {
> >+			unfreeze_zspage(class, page, freezed);
> >+			putback_zspage(class, page);
> >+			page = NULL;
> >+			continue;
> >+		}
> >+
> > 		break;
> > 	}
> >
> >@@ -1793,9 +1884,11 @@ static struct page *isolate_source_page(struct size_class *class)
> > static unsigned long zs_can_compact(struct size_class *class)
> > {
> > 	unsigned long obj_wasted;
> >+	unsigned long obj_allocated, obj_used;
> >
> >-	obj_wasted = zs_stat_get(class, OBJ_ALLOCATED) -
> >-		zs_stat_get(class, OBJ_USED);
> >+	obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
> >+	obj_used = zs_stat_get(class, OBJ_USED);
> >+	obj_wasted = obj_allocated - obj_used;
> >
> > 	obj_wasted /= get_maxobj_per_zspage(class->size,
> > 			class->pages_per_zspage);
> >@@ -1805,53 +1898,81 @@ static unsigned long zs_can_compact(struct size_class *class)
> >
> > static void __zs_compact(struct zs_pool *pool, struct size_class *class)
> > {
> >-	struct zs_compact_control cc;
> >-	struct page *src_page;
> >+	struct page *src_page = NULL;
> > 	struct page *dst_page = NULL;
> >
> >-	spin_lock(&class->lock);
> >-	while ((src_page = isolate_source_page(class))) {
> >+	while (1) {
> >+		int nr_migrated;
> >
> >-		if (!zs_can_compact(class))
> >+		spin_lock(&class->lock);
> >+		if (!zs_can_compact(class)) {
> >+			spin_unlock(&class->lock);
> > 			break;
> >+		}
> >
> >-		cc.index = 0;
> >-		cc.s_page = src_page;
> >+		/*
> >+		 * Isolate source page and freeze all objects in a zspage
> >+		 * to prevent zspage destroying.
> >+		 */
> >+		if (!src_page) {
> >+			src_page = isolate_source_page(class);
> >+			if (!src_page) {
> >+				spin_unlock(&class->lock);
> >+				break;
> >+			}
> >+		}
> >
> >-		while ((dst_page = isolate_target_page(class))) {
> >-			cc.d_page = dst_page;
> >-			/*
> >-			 * If there is no more space in dst_page, resched
> >-			 * and see if anyone had allocated another zspage.
> >-			 */
> >-			if (!migrate_zspage(pool, class, &cc))
> >+		/* Isolate target page and freeze all objects in the zspage */
> >+		if (!dst_page) {
> >+			dst_page = isolate_target_page(class);
> >+			if (!dst_page) {
> >+				spin_unlock(&class->lock);
> > 				break;
> >+			}
> >+		}
> >+		spin_unlock(&class->lock);
>
> (Sorry to delete individual recipients due to my compliance issues.)
>
> Hello, Minchan.
>
> Is it safe to unlock?
>
> (I assume that the system has 2 cores
> and a swap device is using a zsmalloc pool.)
> If a zs compact context is scheduled out after this "spin_unlock" line,
>
> CPU A (Swap In)                       CPU B (zs_free by killed process)
> ---------------------                 -------------------------
> ...
> spin_lock(&si->lock)
> ...
> # assume the handle is pinned by the zs_compact context.
> pin_tag(handle)  --> blocked
>
>                                       ...
>                                       spin_lock(&si->lock) --> blocked
>
> I think CPU A and CPU B may never be released.
> Am I missing something?

You didn't miss anything. It could deadlock.
swap_slot_free_notify has always been a real problem. :(
That's why I want to remove it.

I will think over how to handle it and send a fix in the next revision.

Thanks for the review!
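As a side note for readers following the scenario above: the pin that blocks CPU A
is a bit spinlock stored in the handle word itself (HANDLE_PIN_BIT), the same pin
that freeze_zspage() takes for every allocated object. Below is a minimal userspace
sketch of just those semantics. It is an illustration only, not the kernel code:
C11 atomics stand in for the kernel bitops, and the bit position and wrapper names
are assumed for the example.

/*
 * Userspace sketch of the HANDLE_PIN_BIT protocol (illustration only).
 * The handle word doubles as a bit spinlock: the "freezer" pins the object,
 * and a concurrent free/map path must take the same pin before touching it.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define HANDLE_PIN_BIT 0UL                     /* assumed bit position */

static _Atomic unsigned long handle_word;      /* stands in for *(unsigned long *)handle */

static bool trypin_tag(void)                   /* like bit_spin_trylock() */
{
        unsigned long old = atomic_fetch_or(&handle_word, 1UL << HANDLE_PIN_BIT);

        return !(old & (1UL << HANDLE_PIN_BIT));
}

static void pin_tag(void)                      /* like bit_spin_lock(): spins */
{
        while (!trypin_tag())
                ;                              /* CPU A in the report spins here */
}

static void unpin_tag(void)
{
        atomic_fetch_and(&handle_word, ~(1UL << HANDLE_PIN_BIT));
}

int main(void)
{
        pin_tag();                             /* "freeze": compaction pins the object */

        /* a concurrent free cannot get the pin while the object is frozen */
        printf("trypin while frozen:   %s\n", trypin_tag() ? "ok" : "blocked");

        unpin_tag();                           /* "unfreeze": free may proceed */
        printf("trypin after unfreeze: %s\n", trypin_tag() ? "ok" : "blocked");
        return 0;
}

The point of the report is the loop in pin_tag(): if the context holding the pin
cannot run again, a caller that already holds si->lock spins there indefinitely,
and anyone else queuing on si->lock is stuck behind it.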
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@xxxxxxxxx.  For more info on Linux MM,
see: http://www.linux-mm.org/ .