[PATCHSET 0/9 ver7] exofs for 2.6.30 (really)

Boaz Harrosh <bharrosh@xxxxxxxxxxx> · Wed, 01 Apr 2009 17:05:21 +0300

Posting the final exofs patchset for the Linux 2.6.30
merge window.

Unless something critical is found, I will apply any new comments,
clean-ups, and enhancements on top of this code for the next
kernel. So please do comment away.

I have squashed all comments, changes, and fixes, including
Randy Dunlap's patch, into their respective patches below.
Attached is the grand diff from last time, with annotations
below the important changes explaining what was changed.

Thank you to everyone who participated and helped
to make this better code.

exofs is a fully usable and exportable filesystem by now.
As for stability, well, it has not been used that much.

These patches are available in the git repository at:

  git://git.open-osd.org/linux-open-osd.git for-linus

Boaz Harrosh (9):
      exofs: Kbuild, Headers and osd utils
      exofs: file and file_inode operations
      exofs: symlink_inode and fast_symlink_inode operations
      exofs: address_space_operations
      exofs: dir_inode and directory operations
      exofs: super_operations and file_system_type
      exofs: export_operations
      exofs: Documentation
      fs: Add exofs to Kernel build

 Documentation/filesystems/exofs.txt |  176 +++++
 fs/Kconfig                          |    2 +
 fs/Makefile                         |    1 +
 fs/exofs/BUGS                       |    3 +
 fs/exofs/Kbuild                     |   16 +
 fs/exofs/Kconfig                    |   13 +
 fs/exofs/common.h                   |  184 +++++
 fs/exofs/dir.c                      |  672 ++++++++++++++++++
 fs/exofs/exofs.h                    |  180 +++++
 fs/exofs/file.c                     |   87 +++
 fs/exofs/inode.c                    | 1303 +++++++++++++++++++++++++++++++++++
 fs/exofs/namei.c                    |  342 +++++++++
 fs/exofs/osd.c                      |  153 ++++
 fs/exofs/super.c                    |  584 ++++++++++++++++
 fs/exofs/symlink.c                  |   57 ++
 15 files changed, 3773 insertions(+), 0 deletions(-)
 create mode 100644 Documentation/filesystems/exofs.txt
 create mode 100644 fs/exofs/BUGS
 create mode 100644 fs/exofs/Kbuild
 create mode 100644 fs/exofs/Kconfig
 create mode 100644 fs/exofs/common.h
 create mode 100644 fs/exofs/dir.c
 create mode 100644 fs/exofs/exofs.h
 create mode 100644 fs/exofs/file.c
 create mode 100644 fs/exofs/inode.c
 create mode 100644 fs/exofs/namei.c
 create mode 100644 fs/exofs/osd.c
 create mode 100644 fs/exofs/super.c
 create mode 100644 fs/exofs/symlink.c

Boaz
See comments embedded inside the patch

git diff --stat -p oo-experimental HEAD -- fs/exofs/
 fs/exofs/common.h |    5 +--
 fs/exofs/dir.c    |    3 +-
 fs/exofs/exofs.h  |   14 +++-----
 fs/exofs/file.c   |   11 +++---
 fs/exofs/inode.c  |   89 ++++++++++++++++++++++++++++-------------------------
 fs/exofs/super.c  |   11 ++++--
 6 files changed, 69 insertions(+), 64 deletions(-)

diff --git a/fs/exofs/common.h b/fs/exofs/common.h
index 8a56338..24667ee 100644
--- a/fs/exofs/common.h
+++ b/fs/exofs/common.h
@@ -63,9 +63,8 @@
  * Object IDs 0, 1, and 2 are always in use (see above defines).
  */
 enum {
-	EXOFS_UINT64_MAX = (~0LL),
-	EXOFS_MAX_INO_ID = (sizeof(ino_t) * 8 == 64) ? EXOFS_UINT64_MAX :
-					(1LL << (sizeof(ino_t) * 8 - 1)),
+	EXOFS_MAX_INO_ID = (sizeof(ino_t) * 8 == 64) ? ULLONG_MAX :
+					(1ULL << (sizeof(ino_t) * 8ULL - 1ULL)),
 	EXOFS_MAX_ID	 = (EXOFS_MAX_INO_ID - 1 - EXOFS_OBJ_OFF),
 };

For now ULLONG_MAX == UINT64_MAX, until 128-bit machines come up.
Perhaps in the future I will submit a patch to kernel.h that adds
UINT64_MAX, UINT32_MAX, UINT16_MAX, and UINT8_MAX in a
platform-independent way.

diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index 55ebbb1..65b0c8c 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -46,6 +46,7 @@ static inline void exofs_put_page(struct page *page)
 	page_cache_release(page);
 }
 
+/* Accesses dir's inode->i_size must be called under inode lock */
 static inline unsigned long dir_pages(struct inode *inode)
 {
 	return (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
@@ -53,7 +54,7 @@ static inline unsigned long dir_pages(struct inode *inode)
 
 static unsigned exofs_last_byte(struct inode *inode, unsigned long page_nr)
 {
-	unsigned last_byte = inode->i_size;
+	loff_t last_byte = inode->i_size;
 
 	last_byte -= page_nr << PAGE_CACHE_SHIFT;
 	if (last_byte > PAGE_CACHE_SIZE)
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index da1397a..0fd4c78 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -47,7 +47,7 @@
 	printk(KERN_NOTICE "exofs @%s:%d: " fmt, __func__, __LINE__, ##a)
 #else
 #define EXOFS_DBGMSG(fmt, a...) \
-	do {} while (0)
+	do { if (0) printk(fmt, ##a); } while (0)
 #endif
 
 /* u64 has problems with printk this will cast it to unsigned long long */

Here I also let EXOFS_DBGMSG compile even when it is OFF, to catch compilation errors.

@@ -89,22 +89,22 @@ struct exofs_i_info {
 
 static inline int obj_2bcreated(struct exofs_i_info *oi)
 {
-	return test_bit(OBJ_2BCREATED, &(oi->i_flags));
+	return test_bit(OBJ_2BCREATED, &oi->i_flags);
 }
 
 static inline void set_obj_2bcreated(struct exofs_i_info *oi)
 {
-	set_bit(OBJ_2BCREATED, &(oi->i_flags));
+	set_bit(OBJ_2BCREATED, &oi->i_flags);
 }
 
 static inline int obj_created(struct exofs_i_info *oi)
 {
-	return test_bit(OBJ_CREATED, &(oi->i_flags));
+	return test_bit(OBJ_CREATED, &oi->i_flags);
 }
 
 static inline void set_obj_created(struct exofs_i_info *oi)
 {
-	set_bit(OBJ_CREATED, &(oi->i_flags));
+	set_bit(OBJ_CREATED, &oi->i_flags);
 }
 
 int __exofs_wait_obj_created(struct exofs_i_info *oi);
@@ -173,10 +173,6 @@ extern const struct address_space_operations exofs_aops;
 extern const struct inode_operations exofs_dir_inode_operations;
 extern const struct inode_operations exofs_special_inode_operations;
 
-/* super.c           */
-extern const struct super_operations exofs_sops;
-extern const struct export_operations exofs_export_ops;
-
 /* symlink.c         */
 extern const struct inode_operations exofs_symlink_inode_operations;
 extern const struct inode_operations exofs_fast_symlink_inode_operations;
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index 2712f68..6ed7fe4 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -45,17 +45,18 @@ static int exofs_release_file(struct inode *inode, struct file *filp)
 static int exofs_file_fsync(struct file *filp, struct dentry *dentry,
 			    int datasync)
 {
-	int ret1, ret2;
+	int ret;
 	struct address_space *mapping = filp->f_mapping;
 
-	ret1 = filemap_write_and_wait(mapping);
+	ret = filemap_write_and_wait(mapping);
+	if (ret)
+		return ret;
+
 	/*Note: file_fsync below also calles sync_blockdev, which is a no-op
 	 *      for exofs, but other then that it does sync_inode and
 	 *      sync_superblock which is what we need here.
 	 */
-	ret2 = file_fsync(filp, dentry, datasync);
-
-	return ret1 ? ret1 : ret2;
+	return file_fsync(filp, dentry, datasync);
 }
 
 static int exofs_flush(struct file *file, fl_owner_t id)

As noted by Andrew, timeouts here can kill you.

diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index b77c197..ba8d9fa 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -52,7 +52,7 @@ struct page_collect {
 	struct bio *bio;
 	unsigned nr_pages;
 	unsigned long length;
-	long pg_first;
+	loff_t pg_first; /* keep 64bit also in 32-arches */
 };
 
 static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
@@ -161,8 +161,9 @@ static void update_write_page(struct page *page, int ret)
 	end_page_writeback(page);
 }
 
-static int _readpage(struct page *page, bool is_sync);
-
+/* Called at the end of reads, to optionally unlock pages and update their
+ * status.
+ */
 static int __readpages_done(struct osd_request *or, struct page_collect *pcol,
 			    bool do_unlock)
 {
@@ -183,35 +184,30 @@ static int __readpages_done(struct osd_request *or, struct page_collect *pcol,
 		good_bytes = pcol->length - resid;
 
 	EXOFS_DBGMSG("readpages_done(0x%lx) good_bytes=0x%llx"
-		     " length=0x%zx nr_pages=%u\n",
+		     " length=0x%lx nr_pages=%u\n",
 		     pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
 		     pcol->nr_pages);
 
 	__bio_for_each_segment(bvec, pcol->bio, i, 0) {
 		struct page *page = bvec->bv_page;
 		struct inode *inode = page->mapping->host;
+		int page_stat;
 
 		if (inode != pcol->inode)
 			continue; /* osd might add more pages at end */
 
-		if ((length < good_bytes) || (i == 0)) {
-			ret = update_read_page(page, (i == 0) ? ret : 0);
-			if (do_unlock)
-				unlock_page(page);
-			EXOFS_DBGMSG("    readpages_done(0x%lx, 0x%lx)\n",
-				     inode->i_ino, page->index);
-		} else {
-			/* can not happen on single sync_readpage */
-			BUG_ON(!do_unlock);
+		if (likely(length < good_bytes))
+			page_stat = 0;
+		else
+			page_stat = ret;
 
-			/* try a single page read and only then it is
-			 * marked as SetPageError()
-			 */
-			EXOFS_ERR("    readpages_done(0x%lx, 0x%lx)"
-				  " bad_bytes\n", inode->i_ino, page->index);
-			_readpage(page, false);
-		}
+		EXOFS_DBGMSG("    readpages_done(0x%lx, 0x%lx) %s\n",
+			  inode->i_ino, page->index,
+			  page_stat ? "bad_bytes" : "good_bytes");
 
+		ret = update_read_page(page, page_stat);
+		if (do_unlock)
+			unlock_page(page);
 		length += bvec->bv_len;
 	}
 

What happened above was: Previously I wanted to retry, reading page-by-page
those pages that failed. (Doing BLOCK_PC commands kind of fails early
without retries.) This cannot work as it is, because the current code
issues all commands with GFP_KERNEL but the callback is called from interrupt.
(It can be fixed, but not so trivially.) Also, a more detailed analysis
should be performed on when we can retry and when we should not.

Leaving this on TODO status for now.
(It was actually failing in tests.)

@@ -220,6 +216,7 @@ static int __readpages_done(struct osd_request *or, struct page_collect *pcol,
 	return ret;
 }
 
+/* callback of async reads */
 static void readpages_done(struct osd_request *or, void *p)
 {
 	struct page_collect *pcol = p;
@@ -289,7 +286,7 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
 
 	atomic_inc(&pcol->sbi->s_curr_pending);
 
-	EXOFS_DBGMSG("read_exec obj=0x%llx start=0x%llx length=0x%zx\n",
+	EXOFS_DBGMSG("read_exec obj=0x%llx start=0x%llx length=0x%lx\n",
 		  obj.id, _LLU(i_start), pcol->length);
 
 	/* pages ownership was passed to pcol_copy */
@@ -305,6 +302,13 @@ err:
 	return ret;
 }
 
+/* readpage_strip is called either directly from readpage() or by the VFS from
+ * within read_cache_pages(), to add one more page to be read. It will try to
+ * collect as many contiguous pages as posible. If a discontinuity is
+ * encountered, or it runs out of resources, it will submit the previous segment
+ * and will start a new collection. Eventually caller must submit the last
+ * segment if present.
+ */
 static int readpage_strip(void *data, struct page *page)
 {
 	struct page_collect *pcol = data;
@@ -372,7 +376,7 @@ try_again:
 	ret = pcol_add_page(pcol, page, len);
 	if (ret) {
 		EXOFS_DBGMSG("Failed pcol_add_page pages[i]=%p "
-			  "this_len=0x%zx nr_pages=%u length=0x%zx\n",
+			  "this_len=0x%zx nr_pages=%u length=0x%lx\n",
 			  page, len, pcol->nr_pages, pcol->length);
 
 		/* split the request, and start again with current page */
@@ -436,8 +440,7 @@ static int exofs_readpage(struct file *file, struct page *page)
 	return _readpage(page, false);
 }
 
-static int exofs_writepage(struct page *page, struct writeback_control *wbc2);
-
+/* Callback for osd_write. All writes are asynchronouse */
 static void writepages_done(struct osd_request *or, void *p)
 {
 	struct page_collect *pcol = p;
@@ -460,32 +463,27 @@ static void writepages_done(struct osd_request *or, void *p)
 		good_bytes = pcol->length - resid;
 
 	EXOFS_DBGMSG("writepages_done(0x%lx) good_bytes=0x%llx"
-		     " length=0x%zx nr_pages=%u\n",
+		     " length=0x%lx nr_pages=%u\n",
 		     pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
 		     pcol->nr_pages);
 
 	__bio_for_each_segment(bvec, pcol->bio, i, 0) {
 		struct page *page = bvec->bv_page;
 		struct inode *inode = page->mapping->host;
+		int page_stat;
 
 		if (inode != pcol->inode)
 			continue; /* osd might add more pages to a bio */
 
-		if ((length < good_bytes) || (i == 0)) {
-			update_write_page(page, ret);
-			unlock_page(page);
-			EXOFS_DBGMSG("    writepages_done(0x%lx, 0x%lx)"
-				     " good_bytes ret=%d\n",
-				     inode->i_ino, page->index, ret);
-		} else {
-			/* try a single page write and only then it is
-			 * marked as SetPageError()
-			 */
-			EXOFS_ERR("    writepages_done(0x%lx, 0x%lx)"
-				  " bad_bytes\n", inode->i_ino, page->index);
+		if (likely(length < good_bytes))
+			page_stat = 0;
+		else
+			page_stat = ret;
 
-			exofs_writepage(page, NULL);
-		}
+		update_write_page(page, page_stat);
+		unlock_page(page);
+		EXOFS_DBGMSG("    writepages_done(0x%lx, 0x%lx) status=%d\n",
+			     inode->i_ino, page->index, page_stat);
 
 		length += bvec->bv_len;
 	}

Same thing as above but for writes.

@@ -532,7 +530,7 @@ static int write_exec(struct page_collect *pcol)
 	}
 
 	atomic_inc(&pcol->sbi->s_curr_pending);
-	EXOFS_DBGMSG("write_exec(0x%lx, 0x%lx) start=0x%llx length=0x%zx\n",
+	EXOFS_DBGMSG("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n",
 		  pcol->inode->i_ino, pcol->pg_first, _LLU(i_start),
 		  pcol->length);
 	/* pages ownership was passed to pcol_copy */
@@ -547,6 +545,13 @@ err:
 	return ret;
 }
 
+/* writepage_strip is called either directly from writepage() or by the VFS from
+ * within write_cache_pages(), to add one more page to be written to storage.
+ * It will try to collect as many contiguous pages as possible. If a
+ * discontinuity is encountered or it runs out of resources it will submit the
+ * previous segment and will start a new collection.
+ * Eventually caller must submit the last segment if present.
+ */
 static int writepage_strip(struct page *page,
 			   struct writeback_control *wbc_unused, void *data)
 {
@@ -609,7 +614,7 @@ try_again:
 	ret = pcol_add_page(pcol, page, len);
 	if (unlikely(ret)) {
 		EXOFS_DBGMSG("Failed pcol_add_page "
-			     "nr_pages=%u total_length=0x%zx\n",
+			     "nr_pages=%u total_length=0x%lx\n",
 			     pcol->nr_pages, pcol->length);
 
 		/* split the request, next loop will start again */
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 8a07d6d..9f1985e 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -194,6 +194,8 @@ static void destroy_inodecache(void)
 /******************************************************************************
  * SUPERBLOCK FUNCTIONS
  *****************************************************************************/
+static const struct super_operations exofs_sops;
+static const struct export_operations exofs_export_ops;
 
 /*
  * Write the superblock to the OSD
@@ -427,8 +429,8 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
 		ATTR_DEF(OSD_APAGE_PARTITION_INFORMATION,
 			OSD_ATTR_PI_USED_CAPACITY, sizeof(__be64)),
 	};
-	uint64_t capacity = ~0;
-	uint64_t used = ~0;
+	uint64_t capacity = ULLONG_MAX;
+	uint64_t used = ULLONG_MAX;
 	struct osd_request *or;
 	uint8_t cred_a[OSD_CAP_LEN];
 	int ret;
@@ -475,7 +477,7 @@ out:
 	return ret;
 }
 
-const struct super_operations exofs_sops = {
+static const struct super_operations exofs_sops = {
 	.alloc_inode    = exofs_alloc_inode,
 	.destroy_inode  = exofs_destroy_inode,
 	.write_inode    = exofs_write_inode,
@@ -488,6 +490,7 @@ const struct super_operations exofs_sops = {
 /******************************************************************************
  * EXPORT OPERATIONS
  *****************************************************************************/
+
 struct dentry *exofs_get_parent(struct dentry *child)
 {
 	unsigned long ino = exofs_parent_ino(child);
@@ -528,7 +531,7 @@ static struct dentry *exofs_fh_to_parent(struct super_block *sb,
 				    exofs_nfs_get_inode);
 }
 
-const struct export_operations exofs_export_ops = {
+static const struct export_operations exofs_export_ops = {
 	.fh_to_dentry = exofs_fh_to_dentry,
 	.fh_to_parent = exofs_fh_to_parent,
 	.get_parent = exofs_get_parent,