These two patches address two issues I found while examining the behaviour of posix_fallocate() on a file which had recently been locked. posix_fallocate() allocates space in a file by writing bytes, so locking is important if sharing is possible. The first patch fixes a bug which can lead to data corruption. The second fixes a performance issue which can be demonstrated without the first patch, but is made more likely by that patch (it was the performance issue that caused me to start looking). The program below can be used to demonstrate both problems. Given a remote filesystem and 2 mount points, it mounts the filesystem twice using nosharecache so that it acts like two separate clients. It then creates a file and uses locking and posix_fallocate to trigger the behaviours. There are two threads. Each reads the first byte, increments it, and writes it out again. This is done with proper file locking, so the result should be '2'. With the current kernel it is '1'. because one of the posix_fallocate() corrupts a '1' making it '0'. Mount options can be specified, so it can be confirmed that the befaviour is the same for v3 and v4.1. If used with v4.2 to a server that supports ALLOCATE, the corruption doesn't happen. To see the performance issue, a network trace is required. Allocating 1 megabyte as this program does, doesn't take very long and it is hard to measure a performance difference. Allocating several gigabytes can make the difference quite noticeable. Thanks, NeilBrown --- NeilBrown (2): NFS: invalidate file size when taking a lock. NFS: Optimize fallocate by refreshing mapping when needed. fs/nfs/file.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) -- /* nfs-lock-allocate.c * Test for performing fallocate() after locking an NFS file. * The nfs client must revalidate the size after getting the * lock, and should merge all the 1-byte writes past EOF into * fewer large writes. */ #define _GNU_SOURCE #include <sys/types.h> #include <sys/socket.h> #include <unistd.h> #include <stdlib.h> #include <sys/mount.h> #include <sys/wait.h> #include <sys/fcntl.h> #include <stdio.h> static void setup(char *mnt, char *remote, char *options) { char *buf = NULL; int fd; asprintf(&buf, "/bin/mount %s %s -o %s,nosharecache", remote, mnt, options); system(buf); free(buf); } static void prepare(char *mnt, char *file) { char *buf = NULL; int fd; asprintf(&buf, "%s/%s", mnt, file); unlink(buf); fd = open(buf, O_RDWR|O_CREAT|O_TRUNC, 0600); close(fd); free(buf); } static void teardown(char *mnt) { umount(mnt); } static void test(int thread, char *mnt, char *file, int sync_fd) { /* thread 1 writes a zero the file. * thread 1 then gets the lock, allocates the file, and increments the byte. * thread 2 then gets the lock and increments the byte. */ char *path = NULL; int fd; char buf[100]; struct flock lk; asprintf(&path, "%s/%s", mnt, file); fd = open(path, O_RDWR, 0600); if (fd < 0) { perror(path); exit(1); } free(path); buf[0] = 0; if (thread == 1) { write(fd, buf, 1); fsync(fd); } if (thread == 2) { /* Allow thread 1 to get the lock first */ write(sync_fd, buf, 1); read(sync_fd, buf, 1); } lk.l_type = F_WRLCK; lk.l_whence = SEEK_SET; lk.l_start = 0; lk.l_len = 0; fcntl(fd, F_SETLKW, &lk); if (thread == 1) { /* Now thread 2 can wait for the lock */ write(sync_fd, buf, 1); read(sync_fd, buf, 1); } posix_fallocate(fd, 0, 1024*1024); lseek(fd, 0, 0); read(fd, buf, 1); lseek(fd, 0, 0); buf[0] += 1; write(fd, buf, 1); close(fd); } static int check(char *mnt, char *file) { char *path = NULL; char buf[100]; int fd; asprintf(&path, "%s/%s", mnt, file); fd = open(path, O_RDWR|O_CREAT, 0600); if (read(fd, buf, 1) != 1) { perror("read"); close(fd); return 2; } close(fd); if (buf[0] != 2) return 3; return 0; } int main(int argc, char *argv[]) { char *remote; char *options; char *mnt1; char *mnt2; char *file; int socks[2]; int ret; if (argc != 6) { fprintf(stderr, "Usage: %s server:/path mount-options /mntpoint1 /mntpoint2 file\n", argv[0]); exit(1); } remote = argv[1]; options = argv[2]; mnt1 = argv[3]; mnt2 = argv[4]; file = argv[5]; setup(mnt1, remote, options); setup(mnt2, remote, options); prepare(mnt1, file); socketpair(AF_UNIX, SOCK_STREAM, 0, socks); switch (fork()) { case -1: perror("fork"); exit(1); case 0: /* child */ test(1, mnt1, file, socks[0]); exit(1); break; default: test(2, mnt2, file, socks[1]); break; } wait(NULL); teardown(mnt1); teardown(mnt2); setup(mnt1, remote, options); ret = check(mnt1, file); teardown(mnt1); if (ret == 3) fprintf(stderr, "File was corrupted\n"); exit(ret); } -- To unsubscribe from this list: send the line "unsubscribe linux-nfs" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html