Hi folks,
Our consumer reports a behavior change between pre-iomap and iomap
direct io conversion:
If the system crashes after an appending write to a file open with
O_DIRECT | O_SYNC flag set, file i_size won't be updated even if
O_SYNC was marked before.
It can be reproduced by a test program in the attachment with
gcc -o repro repro.c && ./repro testfile && echo c > /proc/sysrq-trigger
After some analysis, we found that before iomap direct I/O conversion,
the timing was roughly (taking Linux 3.10 codebase as an example):
..
- ext4_file_dio_write
- __generic_file_aio_write
..
- ext4_direct_IO # generic_file_direct_write
- ext4_ext_direct_IO
- ext4_ind_direct_IO # final_size > inode->i_size
- ..
- ret = blockdev_direct_IO()
- i_size_write(inode, end) # orphan && ret > 0 &&
# end > inode->i_size
- ext4_mark_inode_dirty()
- ...
- generic_write_sync # handling O_SYNC
So the dirty inode meta will be committed into journal immediately
if O_SYNC is set. However, After commit 569342dc2485 ("ext4: move
inode extension/truncate code out from ->iomap_end() callback"),
the new behavior seems as below:
..
- ext4_dio_write_iter
- ext4_dio_write_checks # extend = 1
- iomap_dio_rw
- __iomap_dio_rw
- iomap_dio_complete
- generic_write_sync
- ext4_handle_inode_extension # extend = 1
So that i_size will be recorded only after generic_write_sync() is
called. So O_SYNC won't flush the update i_size to the disk.
On the other side, after a quick look of XFS side, it will record
i_size changes in xfs_dio_write_end_io() so it seems that it doesn't
have this problem.
Thanks,
Gao Xiang
#define _GNU_SOURCE
#include <unistd.h>
#include <stdio.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/types.h>
#include <linux/fs.h>
#include <fcntl.h>
#include <stdlib.h>
#include <errno.h>
#include <string.h>
#define PAGE_SIZE 4096
int test(char* file)
{
char* buf = NULL;
int ret = 0;
int i = 0;
posix_memalign((void**)(&buf), PAGE_SIZE, PAGE_SIZE);
memset(buf, 0, PAGE_SIZE);
for (i = 0 ; i < PAGE_SIZE ; ++i) buf[i] = i;
int fd = open(file, O_WRONLY|O_CREAT|O_DIRECT|O_SYNC);
if (fd == -1)
{
fprintf(stderr, "%s: %s\n", file, strerror(errno));
exit(1);
}
struct stat st;
ret = fstat(fd, &st);
if (ret != 0)
{
fprintf(stderr, "%s: %s\n", file, strerror(errno));
exit(1);
}
int offset = st.st_size;
ret = pwrite(fd, buf, PAGE_SIZE, offset);
if (ret != PAGE_SIZE)
{
fprintf(stderr, "write fail: ret %d %s\n", ret, strerror(errno));
}
close(fd);
return 0;
}
int main(int argc, char ** argv)
{
int ret = 0;
char file[1024] = {};
if (argc != 2)
{
fprintf(stderr, "usage: %s path-to-write\n", argv[0]);
exit(2);
}
snprintf(file, sizeof(file), "%s_%d", argv[1], 0);
test(file);
return 0;
}