Ok, sorry for the long delay, I was distracted (and hoping that Al would come up with a patch). Anyway, attached is the patch I think we should do for this issue. It is fairly simple: - it adds a "f_pos_mutex" to the "struct file". - it adds a new FMODE_ATOMIC_POS flag to the file mode flags to mark things that need atomic f_pos updates - it makes the "struct fd" flags be two flags rather than one: the second flag is for "unlock f_pos_mutex when done" - it introduces "fd[get,put]_pos()" which gets the f_pos_mutex when required - it makes read/write/lseek use that. It's pretty damn straightforward, I think, and is minimally serializing. Al, comments? Yongzhi Pan, this is pretty much untested, but it's pretty simple and it does fix your test-case. Linus On Thu, Feb 20, 2014 at 9:14 AM, Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx> wrote: > Yes, I do think we violate POSIX here because of how we handle f_pos > (the earlier thread from 2006 you point to talks about the "thread > safe" part, the point here about the actual wording of "atomic" is a > separate issue). > > Long long ago we used to just pass in the pointer to f_pos directly, > and then the low-level write would update it all under the inode > semaphore, and all was good. > > And then we ended up having tons of problems with non-regular files > and drivers accessing f_pos and having nasty races with it because > they did *not* have any locking (and very fundamentally didn't want > any), and we broke the serialization of f_pos. We still do the *IO* > atomically, but yes, the f_pos access itself is outside the lock. > > Ho humm.. Al, any ideas of how to fix this?
fs/file_table.c | 1 + fs/open.c | 4 ++++ fs/read_write.c | 46 ++++++++++++++++++++++++++++++++++++---------- include/linux/file.h | 6 +++--- include/linux/fs.h | 6 +++++- 5 files changed, 49 insertions(+), 14 deletions(-) diff --git a/fs/file_table.c b/fs/file_table.c index 5fff9030be34..5b24008ea4f6 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -135,6 +135,7 @@ struct file *get_empty_filp(void) atomic_long_set(&f->f_count, 1); rwlock_init(&f->f_owner.lock); spin_lock_init(&f->f_lock); + mutex_init(&f->f_pos_lock); eventpoll_init_file(f); /* f->f_version: 0 */ return f; diff --git a/fs/open.c b/fs/open.c index 4b3e1edf2fe4..b9ed8b25c108 100644 --- a/fs/open.c +++ b/fs/open.c @@ -705,6 +705,10 @@ static int do_dentry_open(struct file *f, return 0; } + /* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */ + if (S_ISREG(inode->i_mode)) + f->f_mode |= FMODE_ATOMIC_POS; + f->f_op = fops_get(inode->i_fop); if (unlikely(WARN_ON(!f->f_op))) { error = -ENODEV; diff --git a/fs/read_write.c b/fs/read_write.c index edc5746a902a..9273ec92f07e 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -264,10 +264,36 @@ loff_t vfs_llseek(struct file *file, loff_t offset, int whence) } EXPORT_SYMBOL(vfs_llseek); +/* + * We only lock f_pos if we have threads (in which case "need_put" + * will be set) or if the file might be shared with another process + * (in which case the file count is elevated). + */ +static struct fd fdget_pos(int fd) +{ + struct fd f = fdget(fd); + struct file *file = f.file; + + if (file && (file->f_mode & FMODE_ATOMIC_POS)) { + if (f.need_put || file_count(file) > 1) { + f.need_pos_unlock = 1; + mutex_lock(&file->f_pos_lock); + } + } + return f; +} + +static void fdput_pos(struct fd f) +{ + if (f.need_pos_unlock) + mutex_unlock(&f.file->f_pos_lock); + fdput(f); +} + SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence) { off_t retval; - struct fd f = fdget(fd); + struct fd f = fdget_pos(fd); if (!f.file) return -EBADF; @@ -278,7 +304,7 @@ SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence) if (res != (loff_t)retval) retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */ } - fdput(f); + fdput_pos(f); return retval; } @@ -498,7 +524,7 @@ static inline void file_pos_write(struct file *file, loff_t pos) SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) { - struct fd f = fdget(fd); + struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; if (f.file) { @@ -506,7 +532,7 @@ SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) ret = vfs_read(f.file, buf, count, &pos); if (ret >= 0) file_pos_write(f.file, pos); - fdput(f); + fdput_pos(f); } return ret; } @@ -514,7 +540,7 @@ SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, size_t, count) { - struct fd f = fdget(fd); + struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; if (f.file) { @@ -522,7 +548,7 @@ SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, ret = vfs_write(f.file, buf, count, &pos); if (ret >= 0) file_pos_write(f.file, pos); - fdput(f); + fdput_pos(f); } return ret; @@ -797,7 +823,7 @@ EXPORT_SYMBOL(vfs_writev); SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen) { - struct fd f = fdget(fd); + struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; if (f.file) { @@ -805,7 +831,7 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec, ret = vfs_readv(f.file, vec, vlen, &pos); if (ret >= 0) file_pos_write(f.file, pos); - fdput(f); + fdput_pos(f); } if (ret > 0) @@ -817,7 +843,7 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec, SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen) { - struct fd f = fdget(fd); + struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; if (f.file) { @@ -825,7 +851,7 @@ SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec, ret = vfs_writev(f.file, vec, vlen, &pos); if (ret >= 0) file_pos_write(f.file, pos); - fdput(f); + fdput_pos(f); } if (ret > 0) diff --git a/include/linux/file.h b/include/linux/file.h index cbacf4faf447..636634a09c92 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -28,7 +28,7 @@ static inline void fput_light(struct file *file, int fput_needed) struct fd { struct file *file; - int need_put; + unsigned need_put:1, need_pos_unlock:1; }; static inline void fdput(struct fd fd) @@ -44,7 +44,7 @@ static inline struct fd fdget(unsigned int fd) { int b; struct file *f = fget_light(fd, &b); - return (struct fd){f,b}; + return (struct fd){f,b,0}; } extern struct file *fget_raw(unsigned int fd); @@ -54,7 +54,7 @@ static inline struct fd fdget_raw(unsigned int fd) { int b; struct file *f = fget_raw_light(fd, &b); - return (struct fd){f,b}; + return (struct fd){f,b,0}; } extern int f_dupfd(unsigned int from, struct file *file, unsigned flags); diff --git a/include/linux/fs.h b/include/linux/fs.h index 60829565e552..ebfde04bca06 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -123,6 +123,9 @@ typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset, /* File is opened with O_PATH; almost nothing can be done with it */ #define FMODE_PATH ((__force fmode_t)0x4000) +/* File needs atomic accesses to f_pos */ +#define FMODE_ATOMIC_POS ((__force fmode_t)0x8000) + /* File was opened by fanotify and shouldn't generate fanotify events */ #define FMODE_NONOTIFY ((__force fmode_t)0x1000000) @@ -780,13 +783,14 @@ struct file { const struct file_operations *f_op; /* - * Protects f_ep_links, f_flags, f_pos vs i_size in lseek SEEK_CUR. + * Protects f_ep_links, f_flags. * Must not be taken from IRQ context. */ spinlock_t f_lock; atomic_long_t f_count; unsigned int f_flags; fmode_t f_mode; + struct mutex f_pos_lock; loff_t f_pos; struct fown_struct f_owner; const struct cred *f_cred;