[PATCH 2/3] fs: Add O_ATOMIC support to direct IO

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



This adds the O_ATOMIC file flag (which requires O_DIRECT).  If
applications request atomic IO, the generic O_DIRECT code builds a list
of bios to represent any single O_DIRECT write() call.  The bios may
span discontiguous areas of the drive if the file is fragmented.

The bios are sent to submit_bio as a single unit, and we expect the
storage to do one of three things:

Fail each bio individually if the list is too large for atomic
completion.

Fail each bio individually if there are any errors during any write.

Complete each bio with success if every write is fully stable
on media.

This works with any filesystem that uses the generic O_DIRECT code for
bio submission (almost everyone except Btrfs).

Signed-off-by: Chris Mason <chris.mason@xxxxxxxxxxxx>
---
 fs/direct-io.c                   | 23 +++++++++++++++++++++--
 fs/fcntl.c                       | 14 +++++++++++---
 include/uapi/asm-generic/fcntl.h |  4 ++++
 3 files changed, 36 insertions(+), 5 deletions(-)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 160a548..6837418 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -120,6 +120,7 @@ struct dio {
 	struct inode *inode;
 	loff_t i_size;			/* i_size when submitted */
 	dio_iodone_t *end_io;		/* IO completion function */
+	struct bio_list atomic_bio;
 
 	void *private;			/* copy from map_bh.b_private */
 
@@ -409,14 +410,30 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
 	if (sdio->submit_io)
 		sdio->submit_io(dio->rw, bio, dio->inode,
 			       sdio->logical_offset_in_bio);
-	else
-		submit_bio(dio->rw, bio);
+	else {
+		/* atomic writes are collected for submission together */
+		if (dio->rw != READ &&
+		    (dio->iocb->ki_filp->f_flags & O_ATOMIC)) {
+			bio->bi_rw |= (REQ_ATOMIC | dio->rw);
+			bio_list_add(&dio->atomic_bio, bio);
+		} else {
+			/* everything else is sent directly */
+			submit_bio(dio->rw, bio);
+		}
+	}
 
 	sdio->bio = NULL;
 	sdio->boundary = 0;
 	sdio->logical_offset_in_bio = 0;
 }
 
+static inline void dio_bio_atomic_submit(struct dio *dio)
+{
+	struct bio *bio = bio_list_get(&dio->atomic_bio);
+	if (bio)
+		submit_bio(dio->rw, bio);
+}
+
 /*
  * Release any resources in case of a failure
  */
@@ -1173,6 +1190,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	 * care to only zero out what's needed.
 	 */
 	memset(dio, 0, offsetof(struct dio, pages));
+	bio_list_init(&dio->atomic_bio);
 
 	dio->flags = flags;
 	if (dio->flags & DIO_LOCKING) {
@@ -1318,6 +1336,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	if (sdio.bio)
 		dio_bio_submit(dio, &sdio);
 
+	dio_bio_atomic_submit(dio);
 	blk_finish_plug(&plug);
 
 	/*
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 65343c3..09f4c7a 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -26,7 +26,8 @@
 #include <asm/siginfo.h>
 #include <asm/uaccess.h>
 
-#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME)
+#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | \
+		    O_DIRECT | O_NOATIME | O_ATOMIC)
 
 static int setfl(int fd, struct file * filp, unsigned long arg)
 {
@@ -56,6 +57,12 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
 				return -EINVAL;
 	}
 
+	/* O_ATOMIC requires O_DIRECT */
+	if (arg & O_ATOMIC) {
+		if (!((arg | filp->f_flags) & O_DIRECT))
+			return -EINVAL;
+	}
+
 	if (filp->f_op && filp->f_op->check_flags)
 		error = filp->f_op->check_flags(arg);
 	if (error)
@@ -730,14 +737,15 @@ static int __init fcntl_init(void)
 	 * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
 	 * is defined as O_NONBLOCK on some platforms and not on others.
 	 */
-	BUILD_BUG_ON(20 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
+	BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
 		O_RDONLY	| O_WRONLY	| O_RDWR	|
 		O_CREAT		| O_EXCL	| O_NOCTTY	|
 		O_TRUNC		| O_APPEND	| /* O_NONBLOCK	| */
 		__O_SYNC	| O_DSYNC	| FASYNC	|
 		O_DIRECT	| O_LARGEFILE	| O_DIRECTORY	|
 		O_NOFOLLOW	| O_NOATIME	| O_CLOEXEC	|
-		__FMODE_EXEC	| O_PATH	| __O_TMPFILE
+		__FMODE_EXEC	| O_PATH	| __O_TMPFILE	|
+		O_ATOMIC
 		));
 
 	fasync_cache = kmem_cache_create("fasync_cache",
diff --git a/include/uapi/asm-generic/fcntl.h b/include/uapi/asm-generic/fcntl.h
index 95e46c8..00259df 100644
--- a/include/uapi/asm-generic/fcntl.h
+++ b/include/uapi/asm-generic/fcntl.h
@@ -88,6 +88,10 @@
 #define __O_TMPFILE	020000000
 #endif
 
+#ifndef O_ATOMIC
+#define O_ATOMIC	040000000	/* set to do atomic O_DIRECT writes */
+#endif
+
 /* a horrid kludge trying to make sure that this will fail on old kernels */
 #define O_TMPFILE (__O_TMPFILE | O_DIRECTORY)
 #define O_TMPFILE_MASK (__O_TMPFILE | O_DIRECTORY | O_CREAT)      
-- 
1.8.2

--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html




[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [Samba]     [Device Mapper]     [CEPH Development]
  Powered by Linux