[RFC PATCH] ceph: fix ceph_fallocate() ignoring of FALLOC_FL_ALLOCATE_RANGE mode

Viacheslav Dubeyko <slava@xxxxxxxxxxx> · Tue, 18 Mar 2025 16:47:52 -0700

From: Viacheslav Dubeyko <Slava.Dubeyko@xxxxxxx>

The fio test reveals an issue when the file size is not aligned
to 4K (for example, 4122, 8600, 10K, etc.).
Steps to reproduce:

target_dir=/mnt/cephfs
report_dir=/report
size=100ki
nrfiles=10
pattern=0x74657374

fio --runtime=5M --rw=write --bs=4k --size=$size \
--nrfiles=$nrfiles --numjobs=16 --buffer_pattern=0x74657374 \
--iodepth=1 --direct=0 --ioengine=libaio --group_reporting \
--name=fiotest --directory=$target_dir \
--output $report_dir/sequential_write.log

fio --runtime=5M --verify_only --verify=pattern \
--verify_pattern=0x74657374 --size=$size --nrfiles=$nrfiles \
--numjobs=16 --bs=4k --iodepth=1 --direct=0 --name=fiotest \
--ioengine=libaio --group_reporting --verify_fatal=1 \
--verify_state_save=0 --directory=$target_dir \
--output $report_dir/verify_sequential_write.log

The essence of the issue is that the write phase calls
fallocate() to pre-allocate 10K of file size and then
writes only 8KB of data. However, the CephFS code
in ceph_fallocate() ignores the FALLOC_FL_ALLOCATE_RANGE
mode and, as a result, the file ends up being only 8K in size.
Consequently, the verification phase triggers weird behaviour
in the CephFS code: ceph_fallocate() is called again and the
file content is completely overwritten with garbage. Finally,
the verification phase fails because the file contains an
unexpected data pattern.

fio: got pattern 'd0', wanted '74'. Bad bits 3
fio: bad pattern block offset 0
pattern: verify failed at file /mnt/cephfs/fiotest.3.0 offset 0, length 2631490270 (requested block: offset=0, length=4096, flags=8)
fio: verify type mismatch (36969 media, 18 given)
fio: got pattern '25', wanted '74'. Bad bits 3
fio: bad pattern block offset 0
pattern: verify failed at file /mnt/cephfs/fiotest.4.0 offset 0, length 1694436820 (requested block: offset=0, length=4096, flags=8)
fio: verify type mismatch (6714 media, 18 given)

Expected state of the file:

hexdump -C ./fiotest.0.0
00000000 74 65 73 74 74 65 73 74 74 65 73 74 74 65 73 74 |testtesttesttest|
*
00002000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
*
00002190 00 00 00 00 00 00 00 00 |........|
00002198

Real state of the file:

head -n 2 ./fiotest.0.0
00000000 35 e0 28 cc 38 a0 99 16 06 9c 6a a9 f2 cd e9 0a |5.(.8.....j.....|
00000010 80 53 2a 07 09 e5 0d 15 70 4a 25 f7 0b 39 9d 18 |.S*.....pJ%..9..|

The patch reworks ceph_fallocate() by adding support for the
FALLOC_FL_ALLOCATE_RANGE mode. It also verifies that the new size
can be allocated by checking inode_newsize_ok(),
fsc->max_file_size, and ceph_quota_is_max_bytes_exceeded().
The invalidation and dirty-marking logic is moved into dedicated
helper functions.

There is one peculiarity in the case of the generic/103 test.
CephFS receives max_file_size from the MDS server, and it is 1TB
by default. As a result, generic/103 can fail if max_file_size
is smaller than the volume size:

generic/103 6s ... - output mismatch (see /home/slavad/XFSTESTS/xfstests-dev/results//generic/103.out.bad)
--- tests/generic/103.out 2025-02-25 13:05:32.494668258 -0800
+++ /home/slavad/XFSTESTS/xfstests-dev/results//generic/103.out.bad 2025-03-17 22:28:26.475750878 -0700
@@ -1,2 +1,3 @@
QA output created by 103
+fallocate: No space left on device
Silence is golden.

The solution is to set the max_file_size equal to volume size:

sudo ceph fs volume info cephfs
{
    "mon_addrs": [
        "192.168.1.213:6789",
        "192.168.1.212:6789",
        "192.168.1.195:6789"
    ],
    "pools": {
        "data": [
            {
                "avail": 7531994808320,
                "name": "cephfs_data",
                "used": 163955761152
            }
        ],
        "metadata": [
            {
                "avail": 7531994808320,
                "name": "cephfs_metadata",
                "used": 706346483
            }
        ]
    }
}

sudo ceph fs set cephfs max_file_size 7531994808320

sudo ./check generic/103
FSTYP         -- ceph
PLATFORM      -- Linux/x86_64 ceph-0005 6.14.0-rc5+ #82 SMP PREEMPT_DYNAMIC Tue Mar 18 14:12:08 PDT 2025
MKFS_OPTIONS  -- 192.168.1.212:6789:/scratch
MOUNT_OPTIONS -- -o name=admin 192.168.1.212:6789:/scratch /mnt/cephfs/scratch

generic/103 6s ...  8s
Ran: generic/103
Passed all 1 tests

Signed-off-by: Viacheslav Dubeyko <Slava.Dubeyko@xxxxxxx>
---
 fs/ceph/file.c | 114 +++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 91 insertions(+), 23 deletions(-)

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 851d70200c6b..7bf283eba29a 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -2655,24 +2655,64 @@ static int ceph_zero_objects(struct inode *inode, loff_t offset, loff_t length)
 	return ret;
 }
 
-static long ceph_fallocate(struct file *file, int mode,
+static inline
+void ceph_fallocate_mark_dirty(struct inode *inode,
+				struct ceph_cap_flush **prealloc_cf)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int dirty;
+
+	spin_lock(&ci->i_ceph_lock);
+	dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
+					prealloc_cf);
+	spin_unlock(&ci->i_ceph_lock);
+
+	if (dirty)
+		__mark_inode_dirty(inode, dirty);
+}
+
+static inline
+int ceph_fallocate_invalidate(struct inode *inode,
+				struct ceph_cap_flush **prealloc_cf,
 				loff_t offset, loff_t length)
+{
+	int ret = 0;
+
+	filemap_invalidate_lock(inode->i_mapping);
+	ceph_fscache_invalidate(inode, false);
+	ceph_zero_pagecache_range(inode, offset, length);
+	ret = ceph_zero_objects(inode, offset, length);
+	if (!ret)
+		ceph_fallocate_mark_dirty(inode, prealloc_cf);
+	filemap_invalidate_unlock(inode->i_mapping);
+
+	return ret;
+}
+
+static long ceph_fallocate(struct file *file, int mode,
+			   loff_t offset, loff_t length)
 {
 	struct ceph_file_info *fi = file->private_data;
 	struct inode *inode = file_inode(file);
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_cap_flush *prealloc_cf;
 	struct ceph_client *cl = ceph_inode_to_client(inode);
+	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
 	int want, got = 0;
-	int dirty;
-	int ret = 0;
 	loff_t endoff = 0;
 	loff_t size;
+	loff_t new_size;
+	int ret = 0;
 
 	doutc(cl, "%p %llx.%llx mode %x, offset %llu length %llu\n",
 	      inode, ceph_vinop(inode), mode, offset, length);
 
-	if (mode != (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+	if (mode == FALLOC_FL_ALLOCATE_RANGE ||
+	    mode == (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) {
+		/*
+		 * Supported modes. Continue logic.
+		 */
+	} else
 		return -EOPNOTSUPP;
 
 	if (!S_ISREG(inode->i_mode))
@@ -2687,18 +2727,35 @@ static long ceph_fallocate(struct file *file, int mode,
 
 	inode_lock(inode);
 
+	size = i_size_read(inode);
+	new_size = offset + length;
+
+	if (!(mode & FALLOC_FL_KEEP_SIZE) && new_size > size) {
+		ret = inode_newsize_ok(inode, new_size);
+		if (ret)
+			goto unlock;
+
+		if (new_size > max(size, fsc->max_file_size)) {
+			ret = -ENOSPC;
+			goto unlock;
+		}
+
+		if (ceph_quota_is_max_bytes_exceeded(inode, offset + length)) {
+			ret = -EDQUOT;
+			goto unlock;
+		}
+	}
+
 	if (ceph_snap(inode) != CEPH_NOSNAP) {
 		ret = -EROFS;
 		goto unlock;
 	}
 
-	size = i_size_read(inode);
-
-	/* Are we punching a hole beyond EOF? */
-	if (offset >= size)
-		goto unlock;
-	if ((offset + length) > size)
-		length = size - offset;
+	if ((mode & FALLOC_FL_KEEP_SIZE) || (mode & FALLOC_FL_PUNCH_HOLE)) {
+		/* Are we punching a hole beyond EOF? */
+		if (offset >= size)
+			goto unlock;
+	}
 
 	if (fi->fmode & CEPH_FILE_MODE_LAZY)
 		want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
@@ -2713,20 +2770,31 @@ static long ceph_fallocate(struct file *file, int mode,
 	if (ret)
 		goto put_caps;
 
-	filemap_invalidate_lock(inode->i_mapping);
-	ceph_fscache_invalidate(inode, false);
-	ceph_zero_pagecache_range(inode, offset, length);
-	ret = ceph_zero_objects(inode, offset, length);
+	if (mode & FALLOC_FL_PUNCH_HOLE) {
+		if ((offset + length) > size)
+			length = size - offset;
 
-	if (!ret) {
-		spin_lock(&ci->i_ceph_lock);
-		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
-					       &prealloc_cf);
-		spin_unlock(&ci->i_ceph_lock);
-		if (dirty)
-			__mark_inode_dirty(inode, dirty);
+		ret = ceph_fallocate_invalidate(inode, &prealloc_cf,
+						offset, length);
+	} else if (mode & FALLOC_FL_KEEP_SIZE) {
+		/*
+		 * If the FALLOC_FL_KEEP_SIZE flag is specified in mode,
+		 * then the file size will not be changed even
+		 * if offset+size is greater than the file size.
+		 */
+	} else {
+		/*
+		 * FALLOC_FL_ALLOCATE_RANGE case:
+		 * The default operation (i.e., mode is zero) of fallocate()
+		 * allocates the disk space within the range specified by
+		 * offset and size.  The file size will be changed if
+		 * offset+size is greater than the file size.
+		 */
+		if ((offset + length) > size) {
+			ceph_inode_set_size(inode, offset + length);
+			ceph_fallocate_mark_dirty(inode, &prealloc_cf);
+		}
 	}
-	filemap_invalidate_unlock(inode->i_mapping);
 
 put_caps:
 	ceph_put_cap_refs(ci, got);
-- 
2.48.0