v2 remove the architecture conditional. The AGF hang is caused when the process that holds the AGF buffer lock cannot get a worker. The allocation worker pool is blocked waiting to take the AGF buffer lock. Move the allocation worker call so that multiple calls to xfs_alloc_vextent() for a particular transaction are contained within a single worker. --- With the xfs_alloc_arg structure zeroed, the AGF hang occurs in xfs_bmap_btalloc() due to a secondary call to xfs_alloc_vextent(). These calls to xfs_alloc_vextent() try different strategies to allocate the extent if the previous allocation attempt failed. I still prefer this patch's approach. It also limits the number of worker context switches when xfs_alloc_vextent() is called multiple times within a transaction. The intent of the patch is to move the allocation worker as reasonably close to the xfs_trans_alloc() - xfs_trans_commit / xfs_trans_cancel() calls as possible. I have ported this patch to Linux 3.0.x. Linux 2.6.x will be the same as the Linux 3.0 port. This patch allows an easy addition of an architecture limit on the allocation worker for those that choose to do so. Signed-off-by: Mark Tinguely <tinguely@xxxxxxx> --- fs/xfs/xfs_alloc.c | 42 ------------------------------------- fs/xfs/xfs_alloc.h | 3 -- fs/xfs/xfs_bmap.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++-- fs/xfs/xfs_bmap.h | 16 ++++++++++++++ 4 files changed, 75 insertions(+), 46 deletions(-) Index: b/fs/xfs/xfs_alloc.c =================================================================== --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -2211,7 +2211,7 @@ xfs_alloc_read_agf( * group or loop over the allocation groups to find the result. 
*/ int /* error */ -__xfs_alloc_vextent( +xfs_alloc_vextent( xfs_alloc_arg_t *args) /* allocation argument structure */ { xfs_agblock_t agsize; /* allocation group size */ @@ -2421,46 +2421,6 @@ error0: return error; } -static void -xfs_alloc_vextent_worker( - struct work_struct *work) -{ - struct xfs_alloc_arg *args = container_of(work, - struct xfs_alloc_arg, work); - unsigned long pflags; - - /* we are in a transaction context here */ - current_set_flags_nested(&pflags, PF_FSTRANS); - - args->result = __xfs_alloc_vextent(args); - complete(args->done); - - current_restore_flags_nested(&pflags, PF_FSTRANS); -} - -/* - * Data allocation requests often come in with little stack to work on. Push - * them off to a worker thread so there is lots of stack to use. Metadata - * requests, OTOH, are generally from low stack usage paths, so avoid the - * context switch overhead here. - */ -int -xfs_alloc_vextent( - struct xfs_alloc_arg *args) -{ - DECLARE_COMPLETION_ONSTACK(done); - - if (!args->userdata) - return __xfs_alloc_vextent(args); - - - args->done = &done; - INIT_WORK_ONSTACK(&args->work, xfs_alloc_vextent_worker); - queue_work(xfs_alloc_wq, &args->work); - wait_for_completion(&done); - return args->result; -} - /* * Free an extent. 
* Just break up the extent address and hand off to xfs_free_ag_extent Index: b/fs/xfs/xfs_alloc.h =================================================================== --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -120,9 +120,6 @@ typedef struct xfs_alloc_arg { char isfl; /* set if is freelist blocks - !acctg */ char userdata; /* set if this is user data */ xfs_fsblock_t firstblock; /* io first block allocated */ - struct completion *done; - struct work_struct work; - int result; } xfs_alloc_arg_t; /* Index: b/fs/xfs/xfs_bmap.c =================================================================== --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -48,7 +48,6 @@ #include "xfs_vnodeops.h" #include "xfs_trace.h" - kmem_zone_t *xfs_bmap_free_item_zone; /* @@ -4820,7 +4819,7 @@ xfs_bmapi_convert_unwritten( * blocks then the call will fail (return NULLFSBLOCK in "firstblock"). */ int -xfs_bmapi_write( +__xfs_bmapi_write( struct xfs_trans *tp, /* transaction pointer */ struct xfs_inode *ip, /* incore inode */ xfs_fileoff_t bno, /* starting file offs. mapped */ @@ -5044,6 +5043,63 @@ error0: return error; } +static void +xfs_bmapi_write_worker( + struct work_struct *work) +{ + struct xfs_bmw_wkr *bw = container_of(work, + struct xfs_bmw_wkr, work); + unsigned long pflags; + + /* we are in a transaction context here */ + current_set_flags_nested(&pflags, PF_FSTRANS); + + bw->result = __xfs_bmapi_write(bw->tp, bw->ip, bw->bno, bw->len, + bw->flags, bw->firstblock, bw->total, + bw->mval, bw->nmap, bw->flist); + complete(bw->done); + + current_restore_flags_nested(&pflags, PF_FSTRANS); +} + +int +xfs_bmapi_write( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode */ + xfs_fileoff_t bno, /* starting file offs. mapped */ + xfs_filblks_t len, /* length to map in file */ + int flags, /* XFS_BMAPI_... */ + xfs_fsblock_t *firstblock, /* first allocated block + controls a.g. 
for allocs */ + xfs_extlen_t total, /* total blocks needed */ + struct xfs_bmbt_irec *mval, /* output: map values */ + int *nmap, /* i/o: mval size/count */ + struct xfs_bmap_free *flist) /* i/o: list extents to free */ +{ + struct xfs_bmw_wkr bw; + DECLARE_COMPLETION_ONSTACK(done); + + if (flags & XFS_BMAPI_METADATA) + return __xfs_bmapi_write(tp, ip, bno, len, flags, firstblock, + total, mval, nmap, flist); + /* initialize the worker argument list structure */ + bw.tp = tp; + bw.ip = ip; + bw.bno = bno; + bw.len = len; + bw.flags = flags; + bw.firstblock = firstblock; + bw.total = total; + bw.mval = mval; + bw.nmap = nmap; + bw.flist = flist; + bw.done = &done; + INIT_WORK_ONSTACK(&bw.work, xfs_bmapi_write_worker); + queue_work(xfs_alloc_wq, &bw.work); + wait_for_completion(&done); + return bw.result; +} + /* * Unmap (remove) blocks from a file. * If nexts is nonzero then the number of extents to remove is limited to Index: b/fs/xfs/xfs_bmap.h =================================================================== --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -135,6 +135,22 @@ typedef struct xfs_bmalloca { char conv; /* overwriting unwritten extents */ } xfs_bmalloca_t; +struct xfs_bmw_wkr { + struct xfs_trans *tp; /* transaction pointer */ + struct xfs_inode *ip; /* incore inode */ + xfs_fileoff_t bno; /* starting file offs. mapped */ + xfs_filblks_t len; /* length to map in file */ + int flags; /* XFS_BMAPI_... */ + xfs_fsblock_t *firstblock; /* first allocblock controls */ + xfs_extlen_t total; /* total blocks needed */ + struct xfs_bmbt_irec *mval; /* output: map values */ + int *nmap; /* i/o: mval size/count */ + struct xfs_bmap_free *flist; /* bmap freelist */ + struct completion *done; /* worker completion ptr */ + struct work_struct work; /* worker */ + int result; /* worker function result */ +} ; + /* * Flags for xfs_bmap_add_extent*. 
*/ _______________________________________________ xfs mailing list xfs@xxxxxxxxxxx http://oss.sgi.com/mailman/listinfo/xfs