Add block device reservation infrastructure to XFS. This primarily consists of wrappers around the associated block device functions. This mechanism provides the ability to reserve, release and provision a set of blocks in the underlying block device. The mechanism enables the filesystem to adopt a block reservation model with the underlying device. In turn, this allows the filesystem to identify when the underlying device is out of space and propagate an error (-ENOSPC) gracefully before the device itself must handle the condition. The latter typically involves a read-only state transition and thus requires administrator intervention to resolve. Signed-off-by: Brian Foster <bfoster@xxxxxxxxxx> --- fs/xfs/Makefile | 1 + fs/xfs/xfs_mount.h | 5 + fs/xfs/xfs_thin.c | 273 +++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_thin.h | 9 ++ fs/xfs/xfs_trace.h | 27 ++++++ 5 files changed, 315 insertions(+) create mode 100644 fs/xfs/xfs_thin.c create mode 100644 fs/xfs/xfs_thin.h diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index f646391..92ea714 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -88,6 +88,7 @@ xfs-y += xfs_aops.o \ xfs_super.o \ xfs_symlink.o \ xfs_sysfs.o \ + xfs_thin.o \ xfs_trans.o \ xfs_xattr.o \ kmem.o \ diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index b570984..3696700 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -147,6 +147,11 @@ typedef struct xfs_mount { * to various other kinds of pain inflicted on the pNFS server. */ __uint32_t m_generation; + + bool m_thin_reserve; + struct mutex m_thin_res_lock; + uint32_t m_thin_sectpb; + sector_t m_thin_res; } xfs_mount_t; /* diff --git a/fs/xfs/xfs_thin.c b/fs/xfs/xfs_thin.c new file mode 100644 index 0000000..ce6f373 --- /dev/null +++ b/fs/xfs/xfs_thin.c @@ -0,0 +1,273 @@ +/* + * Copyright (c) 2016 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_dir2.h" +#include "xfs_ialloc.h" +#include "xfs_alloc.h" +#include "xfs_rtalloc.h" +#include "xfs_bmap.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_log.h" +#include "xfs_error.h" +#include "xfs_quota.h" +#include "xfs_fsops.h" +#include "xfs_trace.h" +#include "xfs_icache.h" +#include "xfs_sysfs.h" +/* XXX: above copied from xfs_mount.c */ +#include "xfs_thin.h" + +/* + * Notes/Issues: + * + * - Reservation support depends on the '-o discard' mount option so freed + * extents are returned to the pool. + * - The absolute reservation value API is potentially racy. We can cover our + * own reservations/provisions with a mutex, but a delta reservation API might + * be better. + * - Local reservation accounting is not necessarily correct/accurate. + * Reservation leakage has been reproduced, particularly in ENOSPC conditions. + * The discard mechanism to return blocks to dm-thin has not been totally + * reliable either, which means filling, removing and filling an fs causes + * some space to be lost. This can be worked around with fstrim for the time + * being. + * - The locking in xfs_mod_fdblocks() is not quite correct/safe. Sleeping from + * invalid context BUG()'s are expected. Needs to be reworked. + * - Worst case reservation means each XFS filesystem block is considered a new + * dm block allocation. This translates to a significant amount of space given + * larger dm block sizes. For example, 4k XFS blocks to 64k dm blocks means + * we'll hit ENOSPC sooner and more frequently than typically expected. + * - The above also means large fallocate requests are problematic. Need to find + * a workaround for this. Perhaps a reduced reservation is safe for known + * contiguous extents? E.g., xfs_bmapi_write() w/ nimaps = 1; + * - The xfs_mod_fdblocks() implementation means the XFS reserve pool blocks are + * also reserved from the thin pool. XFS defaults to 8192 reserve pool blocks + * in most cases, which translates to 512MB of reserved space. This can be + * tuned with: 'xfs_io -xc "resblks <blks>" <mnt>'. Note that insufficient + * reserves will result in errors in unexpected areas of code (e.g., page + * discards on writeback, inode unlinked list removal failures, etc.). + * - The existing xfs_reserve_blocks() implementation is flaky and does not + * correctly reserve in the event of xfs_mod_fdblocks() failure. This will + * likely require some fixes independent of this feature. It also may depend + * on some kind of (currently undefined) "query available reservation" or + * "perform partial reservation" API to support partial XFS reserved blocks + * allocation. + */ + +/* + * Convert an fsb count to a sector reservation. + */ +static inline sector_t +XFS_FSB_TO_SECT( + struct xfs_mount *mp, + xfs_fsblock_t fsb) +{ + sector_t bb; + + bb = fsb * mp->m_thin_sectpb; + return bb; +} + +/* + * Reserve blocks from the underlying block device. + */ +int +xfs_thin_reserve( + struct xfs_mount *mp, + xfs_fsblock_t fsb) +{ + int error; + sector_t bb; + + bb = XFS_FSB_TO_SECT(mp, fsb); + + mutex_lock(&mp->m_thin_res_lock); + + error = blk_reserve_space(mp->m_ddev_targp->bt_bdev, + mp->m_thin_res + bb); + if (error) { + if (error == -ENOSPC) + trace_xfs_thin_reserve_enospc(mp, mp->m_thin_res, bb); + goto out; + } + + trace_xfs_thin_reserve(mp, mp->m_thin_res, bb); + mp->m_thin_res += bb; + +out: + mutex_unlock(&mp->m_thin_res_lock); + return error; +} + +static int +__xfs_thin_unreserve( + struct xfs_mount *mp, + sector_t bb) +{ + int error; + + if (bb > mp->m_thin_res) { + WARN(1, "unres (%lu) exceeds current res (%lu)", bb, + mp->m_thin_res); + bb = mp->m_thin_res; + } + + error = blk_reserve_space(mp->m_ddev_targp->bt_bdev, + mp->m_thin_res - bb); + if (error) + return error;; + + trace_xfs_thin_unreserve(mp, mp->m_thin_res, bb); + mp->m_thin_res -= bb; + + return error; +} + +/* + * Release a reservation back to the block device. + */ +int +xfs_thin_unreserve( + struct xfs_mount *mp, + xfs_fsblock_t fsb) +{ + int error; + sector_t bb; + + bb = XFS_FSB_TO_SECT(mp, fsb); + + mutex_lock(&mp->m_thin_res_lock); + error = __xfs_thin_unreserve(mp, bb); + mutex_unlock(&mp->m_thin_res_lock); + + return error; +} + +/* + * Given a recently allocated extent, ask the block device to provision the + * underlying space. + */ +int +xfs_thin_provision( + struct xfs_mount *mp, + xfs_fsblock_t offset, + xfs_fsblock_t len) +{ + sector_t bbres; + sector_t bbstart, bblen; + int count; + int error; + + bbstart = XFS_FSB_TO_DADDR(mp, offset); + bbstart = round_down(bbstart, mp->m_thin_sectpb); + bblen = XFS_FSB_TO_BB(mp, len); + bblen = round_up(bblen, mp->m_thin_sectpb); + + bbres = XFS_FSB_TO_SECT(mp, len); + + mutex_lock(&mp->m_thin_res_lock); + + WARN_ON(bblen > mp->m_thin_res); + + /* + * XXX: alloc count here is kind of a hack. Need to find a local + * mechanism. Pass res to blk_provision_space? + */ + count = blk_provision_space(mp->m_ddev_targp->bt_bdev, bbstart, bblen); + if (count < 0) { + error = count; + goto out; + } + + trace_xfs_thin_provision(mp, count, bbres); + + /* + * Update the local reservation based on the blocks that were actually + * allocated and release the rest of the unused reservation. + */ + mp->m_thin_res -= count; + bbres -= count; + error = __xfs_thin_unreserve(mp, bbres); +out: + mutex_unlock(&mp->m_thin_res_lock); + return error; +} + +int +xfs_thin_init( + struct xfs_mount *mp) +{ + sector_t res1 = 0, res2 = 0; + int error = 0; + unsigned int io_opt; + + mp->m_thin_reserve = false; + + if (!(mp->m_flags & XFS_MOUNT_DISCARD)) + goto out; + + mutex_init(&mp->m_thin_res_lock); + + /* use optimal I/O size as dm-thin block size */ + io_opt = bdev_io_opt(mp->m_super->s_bdev); + if ((io_opt % BBSIZE) || (io_opt < mp->m_sb.sb_blocksize)) + goto out; + mp->m_thin_sectpb = io_opt / BBSIZE; + + /* + * Run some test calls to determine whether the block device has + * support. Note: res is in 512b sector units. + */ + error = xfs_thin_reserve(mp, 1); + if (error) + goto out; + + error = blk_get_reserved_space(mp->m_ddev_targp->bt_bdev, &res1); + if (error) + goto out; + + error = xfs_thin_unreserve(mp, 1); + if (error) + goto out; + + error = blk_get_reserved_space(mp->m_ddev_targp->bt_bdev, &res2); + if (error) + goto out; + + ASSERT(res1 >= 1 && res2 == 0); + mp->m_thin_reserve = true; +out: + xfs_notice(mp, "Thin pool reservation %s", mp->m_thin_reserve ? + "enabled" : "disabled"); + if (mp->m_thin_reserve) + xfs_notice(mp, "Thin reserve blocksize: %u sectors", + mp->m_thin_sectpb); + return 0; +} diff --git a/fs/xfs/xfs_thin.h b/fs/xfs/xfs_thin.h new file mode 100644 index 0000000..ce5a019 --- /dev/null +++ b/fs/xfs/xfs_thin.h @@ -0,0 +1,9 @@ +#ifndef __XFS_THIN_H__ +#define __XFS_THIN_H__ + +int xfs_thin_init(struct xfs_mount *); +int xfs_thin_reserve(struct xfs_mount *, xfs_fsblock_t); +int xfs_thin_unreserve(struct xfs_mount *, xfs_fsblock_t); +int xfs_thin_provision(struct xfs_mount *, xfs_fsblock_t, xfs_fsblock_t); + +#endif /* __XFS_THIN_H__ */ diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 391d797..01b0702 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -2185,6 +2185,33 @@ DEFINE_DISCARD_EVENT(xfs_discard_toosmall); DEFINE_DISCARD_EVENT(xfs_discard_exclude); DEFINE_DISCARD_EVENT(xfs_discard_busy); +DECLARE_EVENT_CLASS(xfs_thin_class, + TP_PROTO(struct xfs_mount *mp, sector_t total, sector_t res), + TP_ARGS(mp, total, res), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(sector_t, total) + __field(sector_t, res) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->total = total; + __entry->res = res; + ), + TP_printk("dev %d:%d total %lu res %lu", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->total, + __entry->res) +) + +#define DEFINE_THIN_EVENT(name) \ +DEFINE_EVENT(xfs_thin_class, name, \ + TP_PROTO(struct xfs_mount *mp, sector_t total, sector_t res), \ + TP_ARGS(mp, total, res)) +DEFINE_THIN_EVENT(xfs_thin_reserve); +DEFINE_THIN_EVENT(xfs_thin_reserve_enospc); +DEFINE_THIN_EVENT(xfs_thin_unreserve); +DEFINE_THIN_EVENT(xfs_thin_provision); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH -- 2.4.3 -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html