This patch adds user-selectable strategies that cache both incomplete ends of compressed physical clusters as a complement to in-place I/O in order to boost random read performance, at the cost of more memory than in-place I/O only. Signed-off-by: Gao Xiang <gaoxiang25@xxxxxxxxxx> --- fs/erofs/Kconfig | 38 ++++++++++ fs/erofs/internal.h | 17 +++++ fs/erofs/super.c | 15 ++++ fs/erofs/utils.c | 104 ++++++++++++++++++++++++++- fs/erofs/zdata.c | 171 ++++++++++++++++++++++++++++++++++++++++++++ fs/erofs/zdata.h | 9 +++ 6 files changed, 353 insertions(+), 1 deletion(-) diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index 45a81ebeb023..95affb10d4d1 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -114,3 +114,41 @@ config EROFS_FS_CLUSTER_PAGE_LIMIT less than 2. Otherwise, the image will be refused to mount on this kernel. +choice + prompt "EROFS Data Decompression mode" + depends on EROFS_FS_ZIP + default EROFS_FS_ZIP_CACHE_READAROUND + help + EROFS supports three options for decompression. + "In-place I/O Only" consumes the minimum memory + with the lowest random read performance. + + "Cached Decompression for readaround" consumes + the maximum memory with the highest random read performance. + + If unsure, select "Cached Decompression for readaround" + +config EROFS_FS_ZIP_CACHE_DISABLED + bool "In-place I/O Only" + help + Read compressed data into page cache and do in-place + I/O decompression directly. + +config EROFS_FS_ZIP_CACHE_READAHEAD + bool "Cached Decompression for readahead" + help + For each request, it caches the last compressed page + for further reading. + It still does in-place I/O for the rest of the compressed pages. + +config EROFS_FS_ZIP_CACHE_READAROUND + bool "Cached Decompression for readaround" + help + For each request, it caches the compressed pages at both ends + for further reading. + It still does in-place I/O for the rest of the compressed pages. + + Recommended for performance priority. 
+ +endchoice + diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 73892162f494..e8b0d65db1d1 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -51,6 +51,11 @@ struct erofs_fault_info { }; #endif +#ifdef CONFIG_EROFS_FS_ZIP_CACHE_READAROUND +#define EROFS_FS_HAS_MANAGED_CACHE (2) +#elif defined(CONFIG_EROFS_FS_ZIP_CACHE_READAHEAD) +#define EROFS_FS_HAS_MANAGED_CACHE (1) +#endif /* EROFS_SUPER_MAGIC_V1 to represent the whole file system */ #define EROFS_SUPER_MAGIC EROFS_SUPER_MAGIC_V1 @@ -73,6 +78,10 @@ struct erofs_sb_info { unsigned int shrinker_run_no; #endif + +#ifdef EROFS_FS_HAS_MANAGED_CACHE + struct inode *managed_cache; +#endif u32 blocks; u32 meta_blkaddr; #ifdef CONFIG_EROFS_FS_XATTR @@ -545,5 +554,13 @@ static inline int z_erofs_init_zip_subsystem(void) { return 0; } static inline void z_erofs_exit_zip_subsystem(void) {} #endif +#ifdef EROFS_FS_HAS_MANAGED_CACHE +struct inode *erofs_init_managed_cache(struct super_block *sb); +int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, + struct erofs_workgroup *egrp); +int erofs_try_to_free_cached_page(struct address_space *mapping, + struct page *page); +#endif + #endif diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 80e1a9b6d855..643adcbb46fc 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -350,6 +350,14 @@ static int erofs_read_super(struct super_block *sb, INIT_RADIX_TREE(&sbi->workstn_tree, GFP_ATOMIC); #endif +#ifdef EROFS_FS_HAS_MANAGED_CACHE + sbi->managed_cache = erofs_init_managed_cache(sb); + if (IS_ERR(sbi->managed_cache)) { + err = PTR_ERR(sbi->managed_cache); + goto err_init_managed_cache; + } +#endif + /* get the root inode */ inode = erofs_iget(sb, ROOT_NID(sbi), true); if (IS_ERR(inode)) { @@ -396,6 +404,10 @@ static int erofs_read_super(struct super_block *sb, dput(sb->s_root); sb->s_root = NULL; err_iget: +#ifdef EROFS_FS_HAS_MANAGED_CACHE + iput(sbi->managed_cache); +err_init_managed_cache: +#endif err_parseopt: err_sbread: sb->s_fs_info = 
NULL; @@ -421,6 +433,9 @@ static void erofs_put_super(struct super_block *sb) infoln("unmounted for %s", sbi->dev_name); __putname(sbi->dev_name); +#ifdef EROFS_FS_HAS_MANAGED_CACHE + iput(sbi->managed_cache); +#endif erofs_shrinker_unregister(sb); kfree(sbi); diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c index 53ee6daa3f70..eb161b31b8ee 100644 --- a/fs/erofs/utils.c +++ b/fs/erofs/utils.c @@ -145,6 +145,53 @@ int erofs_workgroup_put(struct erofs_workgroup *grp) return count; } +#ifdef EROFS_FS_HAS_MANAGED_CACHE +/* for cache-managed case, customized reclaim paths exist */ +static void erofs_workgroup_unfreeze_final(struct erofs_workgroup *grp) +{ + erofs_workgroup_unfreeze(grp, 0); + __erofs_workgroup_free(grp); +} + +static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi, + struct erofs_workgroup *grp, + bool cleanup) +{ + /* + * for managed cache enabled, the refcount of workgroups + * themselves could be < 0 (freezed). So there is no guarantee + * that all refcount > 0 if managed cache is enabled. + */ + if (!erofs_workgroup_try_to_freeze(grp, 1)) + return false; + + /* + * note that all cached pages should be unlinked + * before delete it from the radix tree. + * Otherwise some cached pages of an orphan old workgroup + * could be still linked after the new one is available. + */ + if (erofs_try_to_free_all_cached_pages(sbi, grp)) { + erofs_workgroup_unfreeze(grp, 1); + return false; + } + + /* + * it is impossible to fail after the workgroup is freezed, + * however in order to avoid some race conditions, add a + * DBG_BUGON to observe this in advance. + */ + DBG_BUGON(xa_untag_pointer(radix_tree_delete(&sbi->workstn_tree, + grp->index)) != grp); + + /* + * if managed cache is enable, the last refcount + * should indicate the related workstation. 
+ */ + erofs_workgroup_unfreeze_final(grp); + return true; +} +#else /* for nocache case, no customized reclaim path at all */ static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi, struct erofs_workgroup *grp, @@ -165,7 +212,7 @@ static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi, erofs_workgroup_put(grp); return true; } - +#endif unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi, unsigned long nr_shrink, @@ -312,3 +359,58 @@ void erofs_exit_shrinker(void) #endif +#ifdef EROFS_FS_HAS_MANAGED_CACHE +static const struct address_space_operations managed_cache_aops; + +struct inode *erofs_init_managed_cache(struct super_block *sb) +{ + struct inode *inode = new_inode(sb); + + if (unlikely(!inode)) + return ERR_PTR(-ENOMEM); + + set_nlink(inode, 1); + inode->i_size = OFFSET_MAX; + + inode->i_mapping->a_ops = &managed_cache_aops; + mapping_set_gfp_mask(inode->i_mapping, + GFP_NOFS | __GFP_HIGHMEM | __GFP_MOVABLE); + return inode; +} + +static int erofs_managed_cache_releasepage(struct page *page, gfp_t gfp_mask) +{ + int ret = 1; /* 0 - busy */ + struct address_space *const mapping = page->mapping; + + DBG_BUGON(!PageLocked(page)); + DBG_BUGON(mapping->a_ops != &managed_cache_aops); + + if (PagePrivate(page)) + ret = erofs_try_to_free_cached_page(mapping, page); + + return ret; +} + +static void erofs_managed_cache_invalidatepage(struct page *page, + unsigned int offset, + unsigned int length) +{ + const unsigned int stop = length + offset; + + DBG_BUGON(!PageLocked(page)); + + /* Check for potential overflow in debug mode */ + DBG_BUGON(stop > PAGE_SIZE || stop < length); + + if (offset == 0 && stop == PAGE_SIZE) + while (!erofs_managed_cache_releasepage(page, GFP_NOFS)) + cond_resched(); +} + +static const struct address_space_operations managed_cache_aops = { + .releasepage = erofs_managed_cache_releasepage, + .invalidatepage = erofs_managed_cache_invalidatepage, +}; +#endif + diff --git a/fs/erofs/zdata.c 
b/fs/erofs/zdata.c index bebbd45bf08e..591acd90fd9d 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -162,6 +162,118 @@ struct z_erofs_decompress_frontend { static struct page *z_pagemap_global[Z_EROFS_VMAP_GLOBAL_PAGES]; static DEFINE_MUTEX(z_pagemap_global_lock); +#ifdef EROFS_FS_HAS_MANAGED_CACHE +static void preload_compressed_pages(struct z_erofs_collector *clt, + struct address_space *mc, + enum z_erofs_cache_alloctype type, + struct list_head *pagepool) +{ + const struct z_erofs_pcluster *pcl = clt->pcl; + const unsigned int clusterpages = BIT(pcl->clusterbits); + struct page **pages = clt->compressedpages; + pgoff_t index = pcl->obj.index + (pages - pcl->compressed_pages); + bool standalone = true; + + if (clt->mode < COLLECT_PRIMARY_FOLLOWED) + return; + + for (; pages < pcl->compressed_pages + clusterpages; ++pages) { + struct page *page; + compressed_page_t t; + + /* the compressed page was loaded before */ + if (READ_ONCE(*pages)) + continue; + + page = find_get_page(mc, index); + + if (page) { + t = tag_compressed_page_justfound(page); + } else if (type == DELAYEDALLOC) { + t = tagptr_init(compressed_page_t, PAGE_UNALLOCATED); + } else { /* DONTALLOC */ + if (standalone) + clt->compressedpages = pages; + standalone = false; + continue; + } + + if (!cmpxchg_relaxed(pages, NULL, tagptr_cast_ptr(t))) + continue; + + if (page) + put_page(page); + } + + if (standalone) /* downgrade to PRIMARY_FOLLOWED_NOINPLACE */ + clt->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE; +} + +/* called by erofs_shrinker to get rid of all compressed_pages */ +int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, + struct erofs_workgroup *grp) +{ + struct z_erofs_pcluster *const pcl = + container_of(grp, struct z_erofs_pcluster, obj); + struct address_space *const mapping = MNGD_MAPPING(sbi); + const unsigned int clusterpages = BIT(pcl->clusterbits); + int i; + + /* + * refcount of workgroup is now freezed as 1, + * therefore no need to worry about available 
decompression users. + */ + for (i = 0; i < clusterpages; ++i) { + struct page *page = pcl->compressed_pages[i]; + + if (!page) + continue; + + /* block other users from reclaiming or migrating the page */ + if (!trylock_page(page)) + return -EBUSY; + + if (unlikely(page->mapping != mapping)) + continue; + + /* barrier is implied in the following 'unlock_page' */ + pcl->compressed_pages[i] = NULL; + set_page_private(page, 0); + ClearPagePrivate(page); + + unlock_page(page); + put_page(page); + } + return 0; +} + +int erofs_try_to_free_cached_page(struct address_space *mapping, + struct page *page) +{ + struct z_erofs_pcluster *const pcl = (void *)page_private(page); + const unsigned int clusterpages = BIT(pcl->clusterbits); + int ret = 0; /* 0 - busy */ + + if (erofs_workgroup_try_to_freeze(&pcl->obj, 1)) { + unsigned int i; + + for (i = 0; i < clusterpages; ++i) { + if (pcl->compressed_pages[i] == page) { + pcl->compressed_pages[i] = NULL; + ret = 1; + break; + } + } + erofs_workgroup_unfreeze(&pcl->obj, 1); + + if (ret) { + ClearPagePrivate(page); + put_page(page); + } + } + return ret; +} +#else static void preload_compressed_pages(struct z_erofs_collector *clt, struct address_space *mc, enum z_erofs_cache_alloctype type, @@ -169,6 +281,7 @@ static void preload_compressed_pages(struct z_erofs_collector *clt, { /* nowhere to load compressed pages from */ } +#endif /* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */ static inline bool try_inplace_io(struct z_erofs_collector *clt, @@ -440,6 +553,13 @@ static inline struct page *__stagingpage_alloc(struct list_head *pagepool, static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe, erofs_off_t la) { +#ifdef EROFS_FS_HAS_MANAGED_CACHE + if (fe->backmost) + return true; +#if (EROFS_FS_HAS_MANAGED_CACHE >= 2) + return la < fe->headoffset; +#endif +#endif return false; } @@ -1001,6 +1121,9 @@ static struct z_erofs_unzip_io *jobqueue_init(struct super_block *sb, /* define decompression jobqueue 
types */ enum { +#ifdef EROFS_FS_HAS_MANAGED_CACHE + JQ_BYPASS, +#endif JQ_SUBMIT, NR_JOBQUEUES, }; @@ -1011,12 +1134,56 @@ static void *jobqueueset_init(struct super_block *sb, struct z_erofs_unzip_io *fgq, bool forcefg) { +#ifdef EROFS_FS_HAS_MANAGED_CACHE + /* + * if managed cache is enabled, bypass jobqueue is needed, + * no need to read from device for all pclusters in this queue. + */ + q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, true); + qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head; +#endif + q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, forcefg); qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head; return tagptr_cast_ptr(tagptr_fold(tagptr1_t, q[JQ_SUBMIT], !forcefg)); } +#ifdef EROFS_FS_HAS_MANAGED_CACHE +static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, + z_erofs_next_pcluster_t qtail[], + z_erofs_next_pcluster_t owned_head) +{ + z_erofs_next_pcluster_t *const submit_qtail = qtail[JQ_SUBMIT]; + z_erofs_next_pcluster_t *const bypass_qtail = qtail[JQ_BYPASS]; + + DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED); + if (owned_head == Z_EROFS_PCLUSTER_TAIL) + owned_head = Z_EROFS_PCLUSTER_TAIL_CLOSED; + + WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL_CLOSED); + + WRITE_ONCE(*submit_qtail, owned_head); + WRITE_ONCE(*bypass_qtail, &pcl->next); + + qtail[JQ_BYPASS] = &pcl->next; +} + +static bool postsubmit_is_all_bypassed(struct z_erofs_unzip_io *q[], + unsigned int nr_bios, + bool force_fg) +{ + /* + * although background is preferred, no one is pending for submission. + * don't issue workqueue for decompression but drop it directly instead. 
+ */ + if (force_fg || nr_bios) + return false; + + kvfree(container_of(q[JQ_SUBMIT], struct z_erofs_unzip_io_sb, io)); + return true; +} +#else static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, z_erofs_next_pcluster_t qtail[], z_erofs_next_pcluster_t owned_head) @@ -1033,6 +1200,7 @@ static bool postsubmit_is_all_bypassed(struct z_erofs_unzip_io *q[], DBG_BUGON(!nr_bios); return false; } +#endif static bool z_erofs_vle_submit_all(struct super_block *sb, z_erofs_next_pcluster_t owned_head, @@ -1144,6 +1312,9 @@ static void z_erofs_submit_and_unzip(struct super_block *sb, pagepool, io, force_fg)) return; +#ifdef EROFS_FS_HAS_MANAGED_CACHE + z_erofs_vle_unzip_all(sb, &io[JQ_BYPASS], pagepool); +#endif if (!force_fg) return; diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index 53297d1811dd..ab22f994506f 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -101,9 +101,18 @@ struct z_erofs_unzip_io_sb { struct super_block *sb; }; +#ifdef EROFS_FS_HAS_MANAGED_CACHE +#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping) +static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi, + struct page *page) +{ + return page->mapping == MNGD_MAPPING(sbi); +} +#else #define MNGD_MAPPING(sbi) (NULL) static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi, struct page *page) { return false; } +#endif #define Z_EROFS_ONLINEPAGE_COUNT_BITS 2 #define Z_EROFS_ONLINEPAGE_COUNT_MASK ((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1) -- 2.17.1