On 01/21/2014 05:46 PM, Eric Sandeen wrote: > Some time ago, mkfs.xfs started picking the storage physical > sector size as the default filesystem "sector size" in order > to avoid RMW costs incurred by doing IOs at logical sector > size alignments. > > However, this means that for a filesystem made with i.e. > a 4k sector size on an "advanced format" 4k/512 disk, > 512-byte direct IOs are no longer allowed. This means > that XFS has essentially turned this AF drive into a hard > 4K device, from the filesystem on up. > > XFS's mkfs-specified "sector size" is really just controlling > the minimum size & alignment of filesystem metadata. > > There is no real need to tightly couple XFS's minimal > metadata size to the minimum allowed direct IO size; > XFS can continue doing metadata in optimal sizes, but > still allow smaller DIOs for apps which issue them, > for whatever reason. > > This patch adds a new field to the xfs_buftarg, so that > we now track 2 sizes: > > 1) The metadata sector size, which is the minimum unit and > alignment of IO which will be performed by metadata operations. > 2) The device logical sector size > > The first is used internally by the file system for metadata > alignment and IOs. > The second is used for the minimum allowed direct IO alignment. > > This has passed xfstests on filesystems made with 4k sectors, > including when run under the patch I sent to ignore > XFS_IOC_DIOINFO, and issue 512 DIOs anyway. I also directly > tested end of block behavior on preallocated, sparse, and > existing files when we do a 512 IO into a 4k file on a > 4k-sector filesystem, to be sure there were no unexpected > behaviors. > > Signed-off-by: Eric Sandeen <sandeen@xxxxxxxxxx> > --- Looks good to me... Reviewed-by: Brian Foster <bfoster@xxxxxxxxxx> > > diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c > index a526f8d..5175711 100644 > --- a/fs/xfs/xfs_buf.c > +++ b/fs/xfs/xfs_buf.c > @@ -1599,6 +1599,7 @@ xfs_setsize_buftarg( > unsigned int blocksize, > unsigned int sectorsize) > { > + /* Set up metadata sector size info */ > btp->bt_meta_sectorsize = sectorsize; > btp->bt_meta_sectormask = sectorsize - 1; > > @@ -1613,6 +1614,10 @@ xfs_setsize_buftarg( > return EINVAL; > } > > + /* Set up device logical sector size mask */ > + btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev); > + btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1; > + > return 0; > } > > diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h > index d5d88dd..9953395 100644 > --- a/fs/xfs/xfs_buf.h > +++ b/fs/xfs/xfs_buf.h > @@ -88,6 +88,19 @@ typedef unsigned int xfs_buf_flags_t; > */ > #define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */ > > +/* > + * The xfs_buftarg contains 2 notions of "sector size" - > + * > + * 1) The metadata sector size, which is the minimum unit and > + * alignment of IO which will be performed by metadata operations. > + * 2) The device logical sector size > + * > + * The first is specified at mkfs time, and is stored on-disk in the > + * superblock's sb_sectsize. > + * > + * The latter is derived from the underlying device, and controls direct IO > + * alignment constraints. > + */ > typedef struct xfs_buftarg { > dev_t bt_dev; > struct block_device *bt_bdev; > @@ -95,6 +108,8 @@ typedef struct xfs_buftarg { > struct xfs_mount *bt_mount; > unsigned int bt_meta_sectorsize; > size_t bt_meta_sectormask; > + size_t bt_logical_sectorsize; > + size_t bt_logical_sectormask; > > /* LRU control structures */ > struct shrinker bt_shrinker; > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c > index ef0c933..57725b4 100644 > --- a/fs/xfs/xfs_file.c > +++ b/fs/xfs/xfs_file.c > @@ -261,7 +261,8 @@ xfs_file_aio_read( > xfs_buftarg_t *target = > XFS_IS_REALTIME_INODE(ip) ? > mp->m_rtdev_targp : mp->m_ddev_targp; > - if ((pos | size) & target->bt_meta_sectormask) { > + /* DIO must be aligned to device logical sector size */ > + if ((pos | size) & target->bt_logical_sectormask) { > if (pos == i_size_read(inode)) > return 0; > return -XFS_ERROR(EINVAL); > @@ -641,9 +642,11 @@ xfs_file_dio_aio_write( > struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? > mp->m_rtdev_targp : mp->m_ddev_targp; > > - if ((pos | count) & target->bt_meta_sectormask) > + /* DIO must be aligned to device logical sector size */ > + if ((pos | count) & target->bt_logical_sectormask) > return -XFS_ERROR(EINVAL); > > + /* "unaligned" here means not aligned to a filesystem block */ > if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask)) > unaligned_io = 1; > > diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c > index 64ca8e9..f7ac335 100644 > --- a/fs/xfs/xfs_ioctl.c > +++ b/fs/xfs/xfs_ioctl.c > @@ -1587,7 +1587,7 @@ xfs_file_ioctl( > XFS_IS_REALTIME_INODE(ip) ? > mp->m_rtdev_targp : mp->m_ddev_targp; > > - da.d_mem = da.d_miniosz = target->bt_meta_sectorsize; > + da.d_mem = da.d_miniosz = target->bt_logical_sectorsize; > da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1); > > if (copy_to_user(arg, &da, sizeof(da))) > > _______________________________________________ > xfs mailing list > xfs@xxxxxxxxxxx > http://oss.sgi.com/mailman/listinfo/xfs > _______________________________________________ xfs mailing list xfs@xxxxxxxxxxx http://oss.sgi.com/mailman/listinfo/xfs