From: Dave Chinner <dchinner@xxxxxxxxxx> The current xfs_file_aio_write code is a mess of locking shenanigans to handle the different locking requirements of buffered and direct IO. Start to clean this up by disentangling the direct IO path from the mess. Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx> --- fs/xfs/linux-2.6/xfs_file.c | 184 +++++++++++++++++++++++++++--------------- 1 files changed, 118 insertions(+), 66 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index 4608aab..d70a249 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -587,6 +587,116 @@ out_lock: return error; } +/* + * xfs_file_dio_aio_write - handle direct IO writes + * + * Lock the inode appropriately to prepare for and issue a direct IO write. + * By spearating it from the buffered write path we remove all the tricky to + * follow locking changes and looping. This also clearly indicates that XFS + * does not fall back to buffered IO in the direct IO write path. + * + * Returns with locks held indicated by @need_i_mutex and @iolock. + */ +STATIC ssize_t +xfs_file_dio_aio_write( + struct kiocb *iocb, + const struct iovec *iovp, + unsigned long nr_segs, + loff_t pos, + size_t ocount, + int *need_i_mutex, + int *iolock) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + ssize_t error = 0; + xfs_fsize_t new_size; + size_t count = ocount; + xfs_buftarg_t *target = XFS_IS_REALTIME_INODE(ip) ? + mp->m_rtdev_targp : mp->m_ddev_targp; + + *need_i_mutex = 0; + *iolock = 0; + if ((pos & target->bt_smask) || (count & target->bt_smask)) + return XFS_ERROR(-EINVAL); + + if (mapping->nrpages || pos > ip->i_size) { + *iolock = XFS_IOLOCK_EXCL; + *need_i_mutex = 1; + mutex_lock(&inode->i_mutex); + } else { + *iolock = XFS_IOLOCK_SHARED; + } + xfs_ilock(ip, XFS_ILOCK_EXCL|*iolock); + + error = generic_write_checks(file, &pos, &count, + S_ISBLK(inode->i_mode)); + if (error) { + xfs_iunlock(ip, XFS_ILOCK_EXCL|*iolock); + *iolock = 0; + return error; + } + + new_size = pos + count; + if (new_size > ip->i_size) + ip->i_new_size = new_size; + + if (likely(!(file->f_mode & FMODE_NOCMTIME))) + file_update_time(file); + + /* + * If the offset is beyond the size of the file, we have a couple of + * things to do. First, if there is already space allocated we need to + * either create holes or zero the disk or ... + * + * If there is a page where the previous size lands, we need to zero it + * out up to the new size. + */ + if (pos > ip->i_size) { + error = -xfs_zero_eof(ip, pos, ip->i_size); + if (error) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; + } + } + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + /* + * If we're writing the file then make sure to clear the setuid and + * setgid bits if the process is not being run by root. This keeps + * people from modifying setuid and setgid binaries. + */ + error = file_remove_suid(file); + if (unlikely(error)) + return error; + + if (mapping->nrpages) { + WARN_ON(*need_i_mutex == 0); + error = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1, + FI_REMAPF_LOCKED); + if (error) + return error; + } + + if (*need_i_mutex) { + /* demote the lock now the cached pages are gone */ + xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); + mutex_unlock(&inode->i_mutex); + *iolock = XFS_IOLOCK_SHARED; + *need_i_mutex = 0; + } + + trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); + error = generic_file_direct_write(iocb, iovp, + &nr_segs, pos, &iocb->ki_pos, count, ocount); + + /* No fallback to buffered IO on errors for XFS. */ + return error; +} + STATIC ssize_t xfs_file_aio_write( struct kiocb *iocb, @@ -628,19 +738,17 @@ xfs_file_aio_write( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; -relock: if (ioflags & IO_ISDIRECT) { - iolock = XFS_IOLOCK_SHARED; - need_i_mutex = 0; - } else { - iolock = XFS_IOLOCK_EXCL; - need_i_mutex = 1; - mutex_lock(&inode->i_mutex); + ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, + ocount, &need_i_mutex, &iolock); + goto done_io; } + iolock = XFS_IOLOCK_EXCL; + need_i_mutex = 1; + mutex_lock(&inode->i_mutex); xfs_ilock(ip, XFS_ILOCK_EXCL|iolock); -start: error = -generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (error) { @@ -648,26 +756,6 @@ start: goto out_unlock_mutex; } - if (ioflags & IO_ISDIRECT) { - xfs_buftarg_t *target = - XFS_IS_REALTIME_INODE(ip) ? - mp->m_rtdev_targp : mp->m_ddev_targp; - - if ((pos & target->bt_smask) || (count & target->bt_smask)) { - xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); - return XFS_ERROR(-EINVAL); - } - - if (!need_i_mutex && (mapping->nrpages || pos > ip->i_size)) { - xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); - iolock = XFS_IOLOCK_EXCL; - need_i_mutex = 1; - mutex_lock(&inode->i_mutex); - xfs_ilock(ip, XFS_ILOCK_EXCL|iolock); - goto start; - } - } - new_size = pos + count; if (new_size > ip->i_size) ip->i_new_size = new_size; @@ -706,44 +794,7 @@ start: /* We can write back this queue in page reclaim */ current->backing_dev_info = mapping->backing_dev_info; - if ((ioflags & IO_ISDIRECT)) { - if (mapping->nrpages) { - WARN_ON(need_i_mutex == 0); - error = xfs_flushinval_pages(ip, - (pos & PAGE_CACHE_MASK), - -1, FI_REMAPF_LOCKED); - if (error) - goto out_unlock_internal; - } - - if (need_i_mutex) { - /* demote the lock now the cached pages are gone */ - xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); - mutex_unlock(&inode->i_mutex); - - iolock = XFS_IOLOCK_SHARED; - need_i_mutex = 0; - } - - trace_xfs_file_direct_write(ip, count, iocb->ki_pos, ioflags); - ret = generic_file_direct_write(iocb, iovp, - &nr_segs, pos, &iocb->ki_pos, count, ocount); - - /* - * direct-io write to a hole: fall through to buffered I/O - * for completing the rest of the request. - */ - if (ret >= 0 && ret != count) { - XFS_STATS_ADD(xs_write_bytes, ret); - - pos += ret; - count -= ret; - - ioflags &= ~IO_ISDIRECT; - xfs_iunlock(ip, iolock); - goto relock; - } - } else { + if (!(ioflags & IO_ISDIRECT)) { int enospc = 0; ssize_t ret2 = 0; @@ -767,6 +818,7 @@ write_retry: current->backing_dev_info = NULL; +done_io: xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret); error = -ret; -- 1.7.2.3 _______________________________________________ xfs mailing list xfs@xxxxxxxxxxx http://oss.sgi.com/mailman/listinfo/xfs