From: Darrick J. Wong <djwong@xxxxxxxxxx> Split the block preallocation garbage collection work into per-AG work items so that we can take advantage of parallelization. Signed-off-by: Darrick J. Wong <djwong@xxxxxxxxxx> --- fs/xfs/xfs_icache.c | 47 +++++++++++++++++++++++++++++++++-------------- fs/xfs/xfs_mount.c | 3 +++ fs/xfs/xfs_mount.h | 5 +++-- fs/xfs/xfs_super.c | 26 ++++++++++++++++++++++++-- 4 files changed, 63 insertions(+), 18 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index d1179f3e9d1f..e78724440d87 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -943,12 +943,12 @@ xfs_inode_walk( */ static void xfs_queue_blockgc( - struct xfs_mount *mp) + struct xfs_perag *pag) { rcu_read_lock(); - if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_BLOCK_GC_TAG)) - queue_delayed_work(mp->m_blockgc_workqueue, - &mp->m_blockgc_work, + if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCK_GC_TAG)) + queue_delayed_work(pag->pag_mount->m_blockgc_workqueue, + &pag->pag_blockgc_work, msecs_to_jiffies(xfs_blockgc_secs * 1000)); rcu_read_unlock(); } @@ -983,25 +983,40 @@ void xfs_blockgc_worker( struct work_struct *work) { - struct xfs_mount *mp = container_of(to_delayed_work(work), - struct xfs_mount, m_blockgc_work); + struct xfs_perag *pag = container_of(to_delayed_work(work), + struct xfs_perag, pag_blockgc_work); int error; - if (!sb_start_write_trylock(mp->m_super)) + if (!sb_start_write_trylock(pag->pag_mount->m_super)) return; - error = xfs_blockgc_scan(mp, NULL); + + error = xfs_inode_walk_ag(pag, 0, xfs_blockgc_scan_inode, NULL, + XFS_ICI_BLOCK_GC_TAG); if (error) - xfs_info(mp, "preallocation gc worker failed, err=%d", error); - sb_end_write(mp->m_super); - xfs_queue_blockgc(mp); + xfs_info(pag->pag_mount, + "AG %u preallocation gc worker failed, err=%d", + pag->pag_agno, error); + sb_end_write(pag->pag_mount->m_super); + xfs_queue_blockgc(pag); } +#define for_each_perag_tag(mp, next_agno, pag, tag) \ + for ((next_agno) = 0, (pag) = xfs_perag_get_tag((mp), 0, (tag)); \ + (pag) != NULL; \ + (next_agno) = (pag)->pag_agno + 1, \ + xfs_perag_put(pag), \ + (pag) = xfs_perag_get_tag((mp), (next_agno), (tag))) + /* Disable post-EOF and CoW block auto-reclamation. */ void xfs_blockgc_stop( struct xfs_mount *mp) { - cancel_delayed_work_sync(&mp->m_blockgc_work); + struct xfs_perag *pag; + xfs_agnumber_t agno; + + for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCK_GC_TAG) + cancel_delayed_work_sync(&pag->pag_blockgc_work); } /* Enable post-EOF and CoW block auto-reclamation. */ @@ -1009,7 +1024,11 @@ void xfs_blockgc_start( struct xfs_mount *mp) { - xfs_queue_blockgc(mp); + struct xfs_perag *pag; + xfs_agnumber_t agno; + + for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCK_GC_TAG) + xfs_queue_blockgc(pag); } /* @@ -1546,7 +1565,7 @@ __xfs_inode_set_blocks_tag( spin_unlock(&ip->i_mount->m_perag_lock); /* kick off background trimming */ - xfs_queue_blockgc(ip->i_mount); + xfs_queue_blockgc(pag); trace_xfs_perag_set_blockgc(ip->i_mount, pag->pag_agno, -1, _RET_IP_); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index c3144cf5febe..ed96b48afb0a 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -126,6 +126,7 @@ __xfs_free_perag( { struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head); + ASSERT(!delayed_work_pending(&pag->pag_blockgc_work)); ASSERT(atomic_read(&pag->pag_ref) == 0); kmem_free(pag); } @@ -146,6 +147,7 @@ xfs_free_perag( spin_unlock(&mp->m_perag_lock); ASSERT(pag); ASSERT(atomic_read(&pag->pag_ref) == 0); + cancel_delayed_work_sync(&pag->pag_blockgc_work); xfs_iunlink_destroy(pag); xfs_buf_hash_destroy(pag); call_rcu(&pag->rcu_head, __xfs_free_perag); @@ -201,6 +203,7 @@ xfs_initialize_perag( pag->pag_agno = index; pag->pag_mount = mp; spin_lock_init(&pag->pag_ici_lock); + INIT_DELAYED_WORK(&pag->pag_blockgc_work, xfs_blockgc_worker); INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); error = xfs_buf_hash_init(pag); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index d9f32102514e..2bb2b5704805 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -177,8 +177,6 @@ typedef struct xfs_mount { uint64_t m_resblks_avail;/* available reserved blocks */ uint64_t m_resblks_save; /* reserved blks @ remount,ro */ struct delayed_work m_reclaim_work; /* background inode reclaim */ - struct delayed_work m_blockgc_work; /* background prealloc blocks - trimming */ struct xfs_kobj m_kobj; struct xfs_kobj m_error_kobj; struct xfs_kobj m_error_meta_kobj; @@ -367,6 +365,9 @@ typedef struct xfs_perag { /* Blocks reserved for the reverse mapping btree. */ struct xfs_ag_resv pag_rmapbt_resv; + /* background prealloc block trimming */ + struct delayed_work pag_blockgc_work; + /* reference count */ uint8_t pagf_refcount_level; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index f72c1f473025..9624e8a08509 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -516,7 +516,8 @@ xfs_init_mount_workqueues( goto out_destroy_cil; mp->m_blockgc_workqueue = alloc_workqueue("xfs-blockgc/%s", - WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_super->s_id); + WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_FREEZABLE, 0, + mp->m_super->s_id); if (!mp->m_blockgc_workqueue) goto out_destroy_reclaim; @@ -1387,6 +1388,26 @@ xfs_fs_validate_params( return 0; } +/* + * Constrain the number of threads that we start for background work. This + * is the estimated parallelism of the filesystem capped to the unbound work + * queue maximum. + * + * We can't set this when we allocate the workqueues because the thread count + * derives from AG count, and we can't know that until we're far enough through + * setup to read the superblock, which requires functioning workqueues. + */ +static inline void +xfs_configure_background_workqueues( + struct xfs_mount *mp) +{ + unsigned int threads = xfs_guess_metadata_threads(mp); + unsigned int max_active; + + max_active = min_t(unsigned int, threads, WQ_UNBOUND_MAX_ACTIVE); + workqueue_set_max_active(mp->m_blockgc_workqueue, max_active); +} + static int xfs_fs_fill_super( struct super_block *sb, @@ -1452,6 +1473,8 @@ xfs_fs_fill_super( if (error) goto out_free_sb; + xfs_configure_background_workqueues(mp); + error = xfs_setup_devices(mp); if (error) goto out_free_sb; @@ -1872,7 +1895,6 @@ static int xfs_init_fs_context( mutex_init(&mp->m_growlock); INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker); INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); - INIT_DELAYED_WORK(&mp->m_blockgc_work, xfs_blockgc_worker); mp->m_kobj.kobject.kset = xfs_kset; /* * We don't create the finobt per-ag space reservation until after log