Hi Carlos, On Fri, Aug 10, 2012 at 03:01:51PM -0300, Carlos Maiolino wrote: > While xfs_buftarg_shrink() is freeing buffers from the dispose list (filled with > buffers from lru list), there is a possibility to have xfs_buf_stale() racing > with it, and removing buffers from dispose list before xfs_buftarg_shrink() does > it. > > This happens because xfs_buftarg_shrink() handle the dispose list without > locking and the test condition in xfs_buf_stale() checks for the buffer being in > *any* list: > > if (!list_empty(&bp->b_lru) ) That's cruel and unusual punishment. > If the buffer happens to be on dispose list, this causes the buffer counter of > lru list (btp->bt_lru_nr) to be decremented twice (once in xfs_buftarg_shrink() > and another in xfs_buf_stale()) causing a wrong account usage of the lru list. > > This may cause xfs_buftarg_shrink() to return a wrong value to the memory > shrinker shrink_slab(), and such account error may also cause an underflowed > value to be returned; since the counter is lower than the current number of > items in the lru list, a decrement may happen when the counter is 0, causing > an underflow on the counter. > > The fix uses a new flag field (and a new buffer flag) to serialize buffer > handling during the shrink process. The new flag field has been designed to use > btp->bt_lru_lock/unlock instead of xfs_buf_lock/unlock mechanism. > > dchinner, sandeen, aquini and aris also deserve credits for this. > > Signed-off-by: Carlos Maiolino <cmaiolino@xxxxxxxxxx> > --- > fs/xfs/xfs_buf.c | 5 ++++- > fs/xfs/xfs_buf.h | 41 ++++++++++++++++++++++++----------------- > 2 files changed, 28 insertions(+), 18 deletions(-) > > diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c > index d7a9dd7..933b793 100644 > --- a/fs/xfs/xfs_buf.c > +++ b/fs/xfs/xfs_buf.c > @@ -96,6 +96,7 @@ xfs_buf_lru_add( > atomic_inc(&bp->b_hold); > list_add_tail(&bp->b_lru, &btp->bt_lru); > btp->bt_lru_nr++; > + bp->b_lru_flags &= ~_XBF_LRU_DISPOSE; > } > spin_unlock(&btp->bt_lru_lock); > } > @@ -154,7 +155,8 @@ xfs_buf_stale( > struct xfs_buftarg *btp = bp->b_target; > > spin_lock(&btp->bt_lru_lock); > - if (!list_empty(&bp->b_lru)) { > + if (!list_empty(&bp->b_lru) && > + !(bp->b_lru_flags & _XBF_LRU_DISPOSE)) { > list_del_init(&bp->b_lru); > btp->bt_lru_nr--; > atomic_dec(&bp->b_hold); > @@ -1501,6 +1503,7 @@ xfs_buftarg_shrink( > */ > list_move(&bp->b_lru, &dispose); > btp->bt_lru_nr--; > + bp->b_lru_flags |= _XBF_LRU_DISPOSE; > } > spin_unlock(&btp->bt_lru_lock); > > diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h > index d03b73b..7c0b6a0 100644 > --- a/fs/xfs/xfs_buf.h > +++ b/fs/xfs/xfs_buf.h > @@ -38,27 +38,28 @@ typedef enum { > XBRW_ZERO = 3, /* Zero target memory */ > } xfs_buf_rw_t; > > -#define XBF_READ (1 << 0) /* buffer intended for reading from device */ > -#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */ > -#define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */ > -#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */ > -#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ > -#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */ > +#define XBF_READ (1 << 0) /* buffer intended for reading from device */ > +#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */ > +#define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */ > +#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */ > +#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ > +#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */ > > /* I/O hints for the BIO layer */ > -#define XBF_SYNCIO (1 << 10)/* treat this buffer as synchronous I/O */ > -#define XBF_FUA (1 << 11)/* force cache write through mode */ > -#define XBF_FLUSH (1 << 12)/* flush the disk cache before a write */ > +#define XBF_SYNCIO (1 << 10)/* treat this buffer as synchronous I/O */ > +#define XBF_FUA (1 << 11)/* force cache write through mode */ > +#define XBF_FLUSH (1 << 12)/* flush the disk cache before a write */ > > /* flags used only as arguments to access routines */ > -#define XBF_TRYLOCK (1 << 16)/* lock requested, but do not wait */ > -#define XBF_UNMAPPED (1 << 17)/* do not map the buffer */ > +#define XBF_TRYLOCK (1 << 16)/* lock requested, but do not wait */ > +#define XBF_UNMAPPED (1 << 17)/* do not map the buffer */ > > /* flags used only internally */ > -#define _XBF_PAGES (1 << 20)/* backed by refcounted pages */ > -#define _XBF_KMEM (1 << 21)/* backed by heap memory */ > -#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */ > -#define _XBF_COMPOUND (1 << 23)/* compound buffer */ > +#define _XBF_PAGES (1 << 20)/* backed by refcounted pages */ > +#define _XBF_KMEM (1 << 21)/* backed by heap memory */ > +#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */ > +#define _XBF_COMPOUND (1 << 23)/* compound buffer */ > +#define _XBF_LRU_DISPOSE (1 << 24)/* buffer being discarded */ It's nice to have them lined up like that. > > typedef unsigned int xfs_buf_flags_t; > > @@ -72,12 +73,13 @@ typedef unsigned int xfs_buf_flags_t; > { XBF_SYNCIO, "SYNCIO" }, \ > { XBF_FUA, "FUA" }, \ > { XBF_FLUSH, "FLUSH" }, \ > - { XBF_TRYLOCK, "TRYLOCK" }, /* should never be set */\ > + { XBF_TRYLOCK, "TRYLOCK" }, /* should never be set */\ ...and you got rid of an extra space here. > { XBF_UNMAPPED, "UNMAPPED" }, /* ditto */\ > { _XBF_PAGES, "PAGES" }, \ > { _XBF_KMEM, "KMEM" }, \ > { _XBF_DELWRI_Q, "DELWRI_Q" }, \ > - { _XBF_COMPOUND, "COMPOUND" } > + { _XBF_COMPOUND, "COMPOUND" }, \ > + { _XBF_LRU_DISPOSE, "LRU_DISPOSE" } > > typedef struct xfs_buftarg { > dev_t bt_dev; > @@ -124,7 +126,12 @@ typedef struct xfs_buf { > xfs_buf_flags_t b_flags; /* status flags */ > struct semaphore b_sema; /* semaphore for lockables */ > > + /* > + * concurrent access to b_lru and b_lru_flags are protected by > + * bt_lru_lock and not by b_sema > + */ > struct list_head b_lru; /* lru list */ > + xfs_buf_flags_t b_lru_flags; /* internal lru status flags */ > wait_queue_head_t b_waiters; /* unpin waiters */ > struct list_head b_list; > struct xfs_perag *b_pag; /* contains rbtree root */ This looks pretty good to me. Looks like you've been careful about the locking. What was the symptom that led to the discovery of this problem? Reviewed-by: Ben Myers <bpm@xxxxxxx> Regards, Ben _______________________________________________ xfs mailing list xfs@xxxxxxxxxxx http://oss.sgi.com/mailman/listinfo/xfs