Commit 23d0127096cb ("fs/sync.c: make sync_file_range(2) use WB_SYNC_NONE writeback") claims that sync_file_range(2) syscall was "created for userspace to be able to issue background writeout and so waiting for in-flight IO is undesirable there" and changes the writeback (back) to WB_SYNC_NONE. This claim is only partially true. It is true for users that use the flag SYNC_FILE_RANGE_WRITE by itself, as does PostgreSQL, the user that was the reason for changing to WB_SYNC_NONE writeback. However, that claim is not true for users that use that flag combination SYNC_FILE_RANGE_{WAIT_BEFORE|WRITE|_WAIT_AFTER}. Those users explicitly requested to wait for in-flight IO as well as to writeback of dirty pages. Re-brand that flag combination as SYNC_FILE_RANGE_WRITE_AND_WAIT and use the helper filemap_write_and_wait_range(), that uses WB_SYNC_ALL writeback, to perform the full range sync request. Fixes: 23d0127096cb ("fs/sync.c: make sync_file_range(2) use WB_SYNC_NONE") Acked-by: Jan Kara <jack@xxxxxxxx> Signed-off-by: Amir Goldstein <amir73il@xxxxxxxxx> --- Andrew, Oops found a braino in my patch. Here is v3. You may find the discussion on V1 here: https://lore.kernel.org/lkml/20190409114922.30095-1-amir73il@xxxxxxxxx/ Thanks, Amir. Changes since v2: - Return after filemap_write_and_wait_range() Changes since v1: - Remove non-guaranties of the API from commit message - Added ACK by Jan fs/sync.c | 24 +++++++++++++++++------- include/uapi/linux/fs.h | 3 +++ 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/fs/sync.c b/fs/sync.c index b54e0541ad89..3a923652d720 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -18,8 +18,8 @@ #include <linux/backing-dev.h> #include "internal.h" -#define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \ - SYNC_FILE_RANGE_WAIT_AFTER) +#define VALID_FLAGS (SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WRITE_AND_WAIT | \ + SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WAIT_AFTER) /* * Do the filesystem syncing work. For simple filesystems @@ -235,9 +235,9 @@ SYSCALL_DEFINE1(fdatasync, unsigned int, fd) } /* - * sys_sync_file_range() permits finely controlled syncing over a segment of + * ksys_sync_file_range() permits finely controlled syncing over a segment of * a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is - * zero then sys_sync_file_range() will operate from offset out to EOF. + * zero then ksys_sync_file_range() will operate from offset out to EOF. * * The flag bits are: * @@ -254,7 +254,7 @@ SYSCALL_DEFINE1(fdatasync, unsigned int, fd) * Useful combinations of the flag bits are: * * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE: ensures that all pages - * in the range which were dirty on entry to sys_sync_file_range() are placed + * in the range which were dirty on entry to ksys_sync_file_range() are placed * under writeout. This is a start-write-for-data-integrity operation. * * SYNC_FILE_RANGE_WRITE: start writeout of all dirty pages in the range which @@ -266,10 +266,13 @@ SYSCALL_DEFINE1(fdatasync, unsigned int, fd) * earlier SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE operation to wait * for that operation to complete and to return the result. * - * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER: + * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER + * (a.k.a. SYNC_FILE_RANGE_WRITE_AND_WAIT): * a traditional sync() operation. This is a write-for-data-integrity operation * which will ensure that all pages in the range which were dirty on entry to - * sys_sync_file_range() are committed to disk. + * ksys_sync_file_range() are written to disk. It should be noted that disk + * caches are not flushed by this call, so there are no guarantees here that the + * data will be available on disk after a crash. * * * SYNC_FILE_RANGE_WAIT_BEFORE and SYNC_FILE_RANGE_WAIT_AFTER will detect any @@ -338,6 +341,13 @@ int ksys_sync_file_range(int fd, loff_t offset, loff_t nbytes, mapping = f.file->f_mapping; ret = 0; + if ((flags & SYNC_FILE_RANGE_WRITE_AND_WAIT) == + SYNC_FILE_RANGE_WRITE_AND_WAIT) { + /* Unlike SYNC_FILE_RANGE_WRITE alone uses WB_SYNC_ALL */ + ret = filemap_write_and_wait_range(mapping, offset, endbyte); + goto out_put; + } + if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) { ret = file_fdatawait_range(f.file, offset, endbyte); if (ret < 0) diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 121e82ce296b..59c71fa8c553 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -320,6 +320,9 @@ struct fscrypt_key { #define SYNC_FILE_RANGE_WAIT_BEFORE 1 #define SYNC_FILE_RANGE_WRITE 2 #define SYNC_FILE_RANGE_WAIT_AFTER 4 +#define SYNC_FILE_RANGE_WRITE_AND_WAIT (SYNC_FILE_RANGE_WRITE | \ + SYNC_FILE_RANGE_WAIT_BEFORE | \ + SYNC_FILE_RANGE_WAIT_AFTER) /* * Flags for preadv2/pwritev2: -- 2.17.1