On Mon, Oct 19, 2015 at 02:27:13PM +1100, Dave Chinner wrote: > From: Dave Chinner <dchinner@xxxxxxxxxx> > > Both direct IO and DAX pass an offset and count into get_blocks that > will overflow a s64 variable when an IO goes into the last supported > block in a file (i.e. at offset 2^63 - 1FSB bytes). This can be seen > from the tracing: > > xfs_get_blocks_alloc: [...] offset 0x7ffffffffffff000 count 4096 > xfs_gbmap_direct: [...] offset 0x7ffffffffffff000 count 4096 > xfs_gbmap_direct_none:[...] offset 0x7ffffffffffff000 count 4096 > > 0x7ffffffffffff000 + 4096 = 0x8000000000000000, and hence that > overflows the s64 offset and we fail to detect the need for a > filesize update and an ioend is not allocated. > > This is *mostly* avoided for direct IO because such extending IOs > occur with full block allocation, and so the "IS_UNWRITTEN()" check > still evaluates as true and we get an ioend that way. However, doing > single sector extending IOs to this last block will expose the fact > that file size updates will not occur after the first allocating > direct IO as the overflow will then be exposed. > > There is one further complexity: the DAX page fault path also > exposes the same issue in block allocation. However, page faults > cannot extend the file size, so in this case we want to allocate the > block but do not want to allocate an ioend to enable file size > update at IO completion. Hence we now need to distinguish between > the direct IO patch allocation and dax fault path allocation to > avoid leaking ioend structures. > > Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx> > --- Reviewed-by: Brian Foster <bfoster@xxxxxxxxxx> > fs/xfs/xfs_aops.c | 50 ++++++++++++++++++++++++++++++++++++++++++++------ > fs/xfs/xfs_aops.h | 2 ++ > fs/xfs/xfs_file.c | 6 +++--- > 3 files changed, 49 insertions(+), 9 deletions(-) > > diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c > index e4fff58..366e41eb 100644 > --- a/fs/xfs/xfs_aops.c > +++ b/fs/xfs/xfs_aops.c > @@ -1259,13 +1259,28 @@ xfs_vm_releasepage( > * the DIO. There is only going to be one reference to the ioend and its life > * cycle is constrained by the DIO completion code. hence we don't need > * reference counting here. > + * > + * Note that for DIO, an IO to the highest supported file block offset (i.e. > + * 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64 > + * bit variable. Hence if we see this overflow, we have to assume that the IO is > + * extending the file size. We won't know for sure until IO completion is run > + * and the actual max write offset is communicated to the IO completion > + * routine. > + * > + * For DAX page faults, we are preparing to never see unwritten extents here, > + * nor should we ever extend the inode size. Hence we will soon have nothing to > + * do here for this case, ensuring we don't have to provide an IO completion > + * callback to free an ioend that we don't actually need for a fault into the > + * page at offset (2^63 - 1FSB) bytes. > */ > + > static void > xfs_map_direct( > struct inode *inode, > struct buffer_head *bh_result, > struct xfs_bmbt_irec *imap, > - xfs_off_t offset) > + xfs_off_t offset, > + bool dax_fault) > { > struct xfs_ioend *ioend; > xfs_off_t size = bh_result->b_size; > @@ -1278,6 +1293,16 @@ xfs_map_direct( > > trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap); > > + /* XXX: preparation for removing unwritten extents in DAX */ > +#if 0 > + if (dax_fault) { > + ASSERT(type == XFS_IO_OVERWRITE); > + trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type, > + imap); > + return; > + } > +#endif > + > if (bh_result->b_private) { > ioend = bh_result->b_private; > ASSERT(ioend->io_size > 0); > @@ -1292,7 +1317,8 @@ xfs_map_direct( > ioend->io_size, ioend->io_type, > imap); > } else if (type == XFS_IO_UNWRITTEN || > - offset + size > i_size_read(inode)) { > + offset + size > i_size_read(inode) || > + offset + size < 0) { > ioend = xfs_alloc_ioend(inode, type); > ioend->io_offset = offset; > ioend->io_size = size; > @@ -1354,7 +1380,8 @@ __xfs_get_blocks( > sector_t iblock, > struct buffer_head *bh_result, > int create, > - bool direct) > + bool direct, > + bool dax_fault) > { > struct xfs_inode *ip = XFS_I(inode); > struct xfs_mount *mp = ip->i_mount; > @@ -1467,7 +1494,8 @@ __xfs_get_blocks( > set_buffer_unwritten(bh_result); > /* direct IO needs special help */ > if (create && direct) > - xfs_map_direct(inode, bh_result, &imap, offset); > + xfs_map_direct(inode, bh_result, &imap, offset, > + dax_fault); > } > > /* > @@ -1514,7 +1542,7 @@ xfs_get_blocks( > struct buffer_head *bh_result, > int create) > { > - return __xfs_get_blocks(inode, iblock, bh_result, create, false); > + return __xfs_get_blocks(inode, iblock, bh_result, create, false, false); > } > > int > @@ -1524,7 +1552,17 @@ xfs_get_blocks_direct( > struct buffer_head *bh_result, > int create) > { > - return __xfs_get_blocks(inode, iblock, bh_result, create, true); > + return __xfs_get_blocks(inode, iblock, bh_result, create, true, false); > +} > + > +int > +xfs_get_blocks_dax_fault( > + struct inode *inode, > + sector_t iblock, > + struct buffer_head *bh_result, > + int create) > +{ > + return __xfs_get_blocks(inode, iblock, bh_result, create, true, true); > } > > static void > diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h > index 86afd1a..d39ba25 100644 > --- a/fs/xfs/xfs_aops.h > +++ b/fs/xfs/xfs_aops.h > @@ -58,6 +58,8 @@ int xfs_get_blocks(struct inode *inode, sector_t offset, > struct buffer_head *map_bh, int create); > int xfs_get_blocks_direct(struct inode *inode, sector_t offset, > struct buffer_head *map_bh, int create); > +int xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset, > + struct buffer_head *map_bh, int create); > void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate); > > extern void xfs_count_page_state(struct page *, int *, int *); > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c > index 2f7b6bd..7f873bc 100644 > --- a/fs/xfs/xfs_file.c > +++ b/fs/xfs/xfs_file.c > @@ -1508,7 +1508,7 @@ xfs_filemap_page_mkwrite( > xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); > > if (IS_DAX(inode)) { > - ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_direct, > + ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault, > xfs_end_io_dax_write); > } else { > ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks); > @@ -1543,7 +1543,7 @@ xfs_filemap_fault( > * changes to xfs_get_blocks_direct() to map unwritten extent > * ioend for conversion on read-only mappings. > */ > - ret = __dax_fault(vma, vmf, xfs_get_blocks_direct, NULL); > + ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault, NULL); > } else > ret = filemap_fault(vma, vmf); > xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); > @@ -1570,7 +1570,7 @@ xfs_filemap_pmd_fault( > sb_start_pagefault(inode->i_sb); > file_update_time(vma->vm_file); > xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); > - ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_direct, > + ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault, > xfs_end_io_dax_write); > xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); > sb_end_pagefault(inode->i_sb); > -- > 2.5.0 > > _______________________________________________ > xfs mailing list > xfs@xxxxxxxxxxx > http://oss.sgi.com/mailman/listinfo/xfs _______________________________________________ xfs mailing list xfs@xxxxxxxxxxx http://oss.sgi.com/mailman/listinfo/xfs