Re: [PATCH 3/3] ext4: optimize locking for end_io extent conversion

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi Ted,
On Mon, Oct 31, 2011 at 11:02 PM, Theodore Ts'o <tytso@xxxxxxx> wrote:
> Now that we are doing the locking correctly, we need to grab the
> i_completed_io_lock() twice per end_io.  We can clean this up by
> removing the structure from the i_complted_io_list, and use this as
> the locking mechanism to prevent ext4_flush_completed_IO() racing
> against ext4_end_io_work(), instead of clearing the
> EXT4_IO_END_UNWRITTEN in io->flag.
>
> In addition, if the ext4_convert_unwritten_extents() returns an error,
> we no longer keep the end_io structure on the linked list.  This
> doesn't help, because it tends to lock up the file system and wedges
> the system.  That's one way to call attention to the problem, but it
> doesn't help the overall robustness of the system.
>
> Signed-off-by: "Theodore Ts'o" <tytso@xxxxxxx>
> ---
>  fs/ext4/fsync.c   |    5 ++---
>  fs/ext4/page-io.c |   37 +++++++++++--------------------------
>  2 files changed, 13 insertions(+), 29 deletions(-)
>
> diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
> index 851ac5b..00a2cb7 100644
> --- a/fs/ext4/fsync.c
> +++ b/fs/ext4/fsync.c
> @@ -88,6 +88,7 @@ int ext4_flush_completed_IO(struct inode *inode)
>        while (!list_empty(&ei->i_completed_io_list)){
>                io = list_entry(ei->i_completed_io_list.next,
>                                ext4_io_end_t, list);
> +               list_del_init(&io->list);
>                /*
>                 * Calling ext4_end_io_nolock() to convert completed
>                 * IO to written.
> @@ -104,11 +105,9 @@ int ext4_flush_completed_IO(struct inode *inode)
>                 */
>                spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
>                ret = ext4_end_io_nolock(io);
> -               spin_lock_irqsave(&ei->i_completed_io_lock, flags);
>                if (ret < 0)
>                        ret2 = ret;
> -               else
> -                       list_del_init(&io->list);
> +               spin_lock_irqsave(&ei->i_completed_io_lock, flags);
>        }
>        spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
>        return (ret2 < 0) ? ret2 : 0;
> diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
> index 4fa1d70..7bacd27 100644
> --- a/fs/ext4/page-io.c
> +++ b/fs/ext4/page-io.c
> @@ -99,28 +99,21 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
>                   "list->prev 0x%p\n",
>                   io, inode->i_ino, io->list.next, io->list.prev);
>
> -       if (!(io->flag & EXT4_IO_END_UNWRITTEN))
> -               return ret;
> -
>        ret = ext4_convert_unwritten_extents(inode, offset, size);
>        if (ret < 0) {
> -               printk(KERN_EMERG "%s: failed to convert unwritten "
> -                       "extents to written extents, error is %d "
> -                       "io is still on inode %lu aio dio list\n",
> -                      __func__, ret, inode->i_ino);
> -               return ret;
> +               ext4_msg(inode->i_sb, KERN_EMERG,
> +                        "failed to convert unwritten extents to written "
> +                        "extents -- potential data loss!  "
> +                        "(inode %lu, offset %llu, size %d, error %d)",
> +                        inode->i_ino, offset, size, ret);
>        }
>
>        if (io->iocb)
>                aio_complete(io->iocb, io->result, 0);
> -       /* clear the DIO AIO unwritten flag */
> -       if (io->flag & EXT4_IO_END_UNWRITTEN) {
> -               io->flag &= ~EXT4_IO_END_UNWRITTEN;
> -               /* Wake up anyone waiting on unwritten extent conversion */
> -               if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten))
> -                       wake_up_all(ext4_ioend_wq(io->inode));
> -       }
>
> +       /* Wake up anyone waiting on unwritten extent conversion */
> +       if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten))
> +               wake_up_all(ext4_ioend_wq(io->inode));
>        return ret;
>  }
>
> @@ -133,16 +126,15 @@ static void ext4_end_io_work(struct work_struct *work)
>        struct inode            *inode = io->inode;
>        struct ext4_inode_info  *ei = EXT4_I(inode);
>        unsigned long           flags;
> -       int                     ret;
>
>        spin_lock_irqsave(&ei->i_completed_io_lock, flags);
>        if (list_empty(&io->list)) {
>                spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
>                goto free;
>        }
> -       spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
>
>        if (!mutex_trylock(&inode->i_mutex)) {
So here, we spin_lock first and then mutex_lock.
But in  ext4_flush_completed_IO, we mutex_lock first and
then spin_lock. Will lockdep complain about it?
Other than that, the patch looks good to me.

Thanks
Tao
> +               spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
>                /*
>                 * Requeue the work instead of waiting so that the work
>                 * items queued after this can be processed.
> @@ -159,16 +151,9 @@ static void ext4_end_io_work(struct work_struct *work)
>                io->flag |= EXT4_IO_END_QUEUED;
>                return;
>        }
> -       ret = ext4_end_io_nolock(io);
> -       if (ret < 0) {
> -               mutex_unlock(&inode->i_mutex);
> -               return;
> -       }
> -
> -       spin_lock_irqsave(&ei->i_completed_io_lock, flags);
> -       if (!list_empty(&io->list))
> -               list_del_init(&io->list);
> +       list_del_init(&io->list);
>        spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
> +       (void) ext4_end_io_nolock(io);
>        mutex_unlock(&inode->i_mutex);
>  free:
>        ext4_free_io_end(io);
> --
> 1.7.4.1.22.gec8e1.dirty
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Index of Archives]     [Reiser Filesystem Development]     [Ceph FS]     [Kernel Newbies]     [Security]     [Netfilter]     [Bugtraq]     [Linux FS]     [Yosemite National Park]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Samba]     [Device Mapper]     [Linux Media]

  Powered by Linux