Re: [PATCH 13/13] orangefs: implement write through the page cache

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Fri, 2017-05-26 at 14:09 -0400, martin@xxxxxxxxxxxx wrote:
> On Thu, May 25, 2017 at 12:09:49PM -0400, Jeff Layton wrote:
> > On Mon, 2017-05-22 at 05:59 -0400, Martin Brandenburg wrote:
> > > With this and the last commit, OrangeFS is capable of writing through
> > > the page cache.  This should significantly increase performance of very
> > > small writes, since writeback will not be done for every write call.
> > > 
> > > However it is not appropriate for use with multiple clients due to the
> > > long writeback delay.
> > > 
> > > Signed-off-by: Martin Brandenburg <martin@xxxxxxxxxxxx>
> > > ---
> > >  fs/orangefs/file.c | 128 +++++++++++++++++++++++------------------------------
> > >  1 file changed, 56 insertions(+), 72 deletions(-)
> > > 
> > > diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
> > > index c03deea..3ab0a1f 100644
> > > --- a/fs/orangefs/file.c
> > > +++ b/fs/orangefs/file.c
> > > @@ -448,69 +448,11 @@ static ssize_t orangefs_file_read_iter(struct kiocb *iocb,
> > >  	return generic_file_read_iter(iocb, iter);
> > >  }
> > >  
> > > -static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *iter)
> > > +static ssize_t orangefs_file_write_iter(struct kiocb *iocb,
> > > +    struct iov_iter *iter)
> > >  {
> > > -	struct file *file = iocb->ki_filp;
> > > -	loff_t pos;
> > > -	ssize_t rc;
> > > -
> > > -	BUG_ON(iocb->private);
> > > -
> > > -	gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_write_iter\n");
> > > -
> > > -	inode_lock(file->f_mapping->host);
> > > -
> > > -	/* Make sure generic_write_checks sees an up to date inode size. */
> > > -	if (file->f_flags & O_APPEND) {
> > > -		rc = orangefs_inode_getattr(file->f_mapping->host, 0, 1,
> > > -		    STATX_SIZE);
> > > -		if (rc == -ESTALE)
> > > -			rc = -EIO;
> > > -		if (rc) {
> > > -			gossip_err("%s: orangefs_inode_getattr failed, "
> > > -			    "rc:%zd:.\n", __func__, rc);
> > > -			goto out;
> > > -		}
> > > -	}
> > > -
> > > -	if (file->f_pos > i_size_read(file->f_mapping->host))
> > > -		orangefs_i_size_write(file->f_mapping->host, file->f_pos);
> > > -
> > > -	rc = generic_write_checks(iocb, iter);
> > > -
> > > -	if (rc <= 0) {
> > > -		gossip_err("%s: generic_write_checks failed, rc:%zd:.\n",
> > > -			   __func__, rc);
> > > -		goto out;
> > > -	}
> > > -
> > > -	/*
> > > -	 * if we are appending, generic_write_checks would have updated
> > > -	 * pos to the end of the file, so we will wait till now to set
> > > -	 * pos...
> > > -	 */
> > > -	pos = *(&iocb->ki_pos);
> > > -
> > > -	rc = do_readv_writev(ORANGEFS_IO_WRITE,
> > > -			     file,
> > > -			     &pos,
> > > -			     iter);
> > > -	if (rc < 0) {
> > > -		gossip_err("%s: do_readv_writev failed, rc:%zd:.\n",
> > > -			   __func__, rc);
> > > -		goto out;
> > > -	}
> > > -
> > > -	iocb->ki_pos = pos;
> > >  	orangefs_stats.writes++;
> > > -
> > > -	if (pos > i_size_read(file->f_mapping->host))
> > > -		orangefs_i_size_write(file->f_mapping->host, pos);
> > > -
> > > -out:
> > > -
> > > -	inode_unlock(file->f_mapping->host);
> > > -	return rc;
> > > +	return generic_file_write_iter(iocb, iter);
> > >  }
> > >  
> > >  /*
> > > @@ -606,9 +548,8 @@ static int orangefs_file_release(struct inode *inode, struct file *file)
> > >  	orangefs_flush_inode(inode);
> > >  
> > >  	/*
> > > -	 * remove all associated inode pages from the page cache and
> > > -	 * readahead cache (if any); this forces an expensive refresh of
> > > -	 * data for the next caller of mmap (or 'get_block' accesses)
> > > +	 * remove all associated inode pages from the readahead cache
> > > +	 * (if any)
> > >  	 */
> > >  	if (file_inode(file) &&
> > >  	    file_inode(file)->i_mapping &&
> > > @@ -621,8 +562,6 @@ static int orangefs_file_release(struct inode *inode, struct file *file)
> > >  			gossip_debug(GOSSIP_INODE_DEBUG,
> > >  			    "flush_racache finished\n");
> > >  		}
> > > -		truncate_inode_pages(file_inode(file)->i_mapping,
> > > -				     0);
> > >  	}
> > >  	return 0;
> > >  }
> > > @@ -741,6 +680,40 @@ const struct file_operations orangefs_file_operations = {
> > >  	.fsync		= orangefs_fsync,
> > >  };
> > >  
> > > +static int orangefs_writepage(struct page *page,
> > > +    struct writeback_control *wbc)
> > > +{
> > > +	struct inode *inode = page->mapping->host;
> > > +	struct iov_iter iter;
> > > +	struct iovec iov;
> > > +	loff_t off;
> > > +	size_t len;
> > > +	ssize_t r;
> > > +	void *map;
> > > +
> > > +	off = page_offset(page);
> > > +	len = i_size_read(inode);
> > > +	if (off + PAGE_SIZE > len)
> > > +		len = len - off;
> > > +	else
> > > +		len = PAGE_SIZE;
> > > +
> > > +	map = kmap_atomic(page);
> > > +	iov.iov_base = map;
> > > +	iov.iov_len = len;
> > > +	iov_iter_init(&iter, WRITE, &iov, 1, len);
> > > +
> > > +	set_page_writeback(page);
> > > +
> > 
> > wait_for_direct_io can sleep, so you can't use kmap_atomic there.
> > Regular old kmap should be ok there though.
> > 
> > Also, you probably really don't want to use iovecs there, as those are
> > expected to deal in userland addresses and the kmap address won't be
> > one.
> > 
> > It may be cleaner to use a bio_vec there instead. You most likely
> > wouldn't need to kmap at all if you did that.
> 
> I tried that since that's what readpage does.
> 
> 	static int orangefs_writepage(struct page *page,
> 	    struct writeback_control *wbc)
> 	{
> 		struct inode *inode = page->mapping->host;
> 		struct iov_iter iter;
> 		struct bio_vec bv;
> 		loff_t off;
> 		size_t len;
> 		ssize_t r;
> 
> 		off = page_offset(page);
> 		len = i_size_read(inode);
> 		if (off + PAGE_SIZE > len)
> 			len = len - off;
> 		else
> 			len = PAGE_SIZE;
> 
> 		bv.bv_page = page;
> 		bv.bv_len = len;

You didn't set bv.bv_offset there.

> 		iov_iter_bvec(&iter, ITER_BVEC | WRITE, &bv, 1, len);
> 
> 		set_page_writeback(page);
> 
> 		r = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter,
> 		   len, 0);
> 		printk("orangefs_writepage wait_for_direct_io returned %d\n", (int)r);
> 
> 		end_page_writeback(page);
> 		unlock_page(page);
> 		return 0;
> 	}
> 
> But this produced:
> 
> 	[  125.739396] BUG: unable to handle kernel paging request at ffff880303833802
> 	[  125.739429] IP: __memcpy+0x12/0x20
> 	[  125.739439] PGD 2ec7067 
> 	[  125.739441] PUD 0 
> 
> 	[  125.739463] Oops: 0000 [#1] SMP
> 	[  125.739474] Modules linked in: nfsd auth_rpcgss oid_registry nfs_acl snd_hda_codec_realtek nfs snd_hda_codec_generic lockd grace fscache sunrpc adt7475 hwmon_vid nouveau video mxm_wmi wmi coretemp i2c_algo_bit kvm_intel drm_kms_helper ttm drm kvm snd_hda_intel snd_hda_codec irqbypass snd_hwdep evdev iTCO_wdt dcdbas iTCO_vendor_support pcspkr snd_hda_core serio_raw i2c_i801 snd_pcm snd_timer lpc_ich mfd_core i2c_core shpchp snd soundcore button acpi_cpufreq autofs4 ext4 crc16 jbd2 mbcache hid_generic usbhid hid sg sr_mod cdrom sd_mod usb_storage ata_generic ata_piix libata psmouse firewire_ohci scsi_mod r8169 firewire_core ehci_pci mii uhci_hcd ehci_hcd crc_itu_t usbcore usb_common
> 	[  125.739683] CPU: 1 PID: 111 Comm: kworker/u8:2 Not tainted 4.11.0+ #66
> 	[  125.739697] Hardware name: Dell Inc. Studio 540      /0M017G, BIOS 1.1.1 07/24/2009
> 	[  125.739718] Workqueue: writeback wb_workfn (flush-orangefs-1)
> 	[  125.739734] task: ffff88021d9bc100 task.stack: ffffc900016d8000
> 	[  125.739748] RIP: 0010:__memcpy+0x12/0x20
> 	[  125.739759] RSP: 0018:ffffc900016db800 EFLAGS: 00010246
> 	[  125.739773] RAX: ffff88020693d000 RBX: 0000000000001000 RCX: 0000000000000200
> 	[  125.739788] RDX: 0000000000000000 RSI: ffff880303833802 RDI: ffff88020693d000
> 	[  125.739803] RBP: ffffc900016db808 R08: 0000160000000000 R09: 00000000ffff8802
> 	[  125.739818] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000001000
> 	[  125.739834] R13: ffffc900016db990 R14: 0000000000000000 R15: ffff88020693e000
> 	[  125.739849] FS:  0000000000000000(0000) GS:ffff880227000000(0000) knlGS:0000000000000000
> 	[  125.739866] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> 	[  125.739879] CR2: ffff880303833802 CR3: 0000000220d97000 CR4: 00000000000406e0
> 	[  125.739895] Call Trace:
> 	[  125.739906]  ? memcpy_from_page+0x46/0x80
> 	[  125.739918]  copy_from_iter+0x1f1/0x380
> 	[  125.739929]  ? get+0x55/0x2a0
> 	[  125.739942]  copy_page_from_iter+0x166/0x250
> 	[  125.739957]  orangefs_bufmap_copy_from_iovec+0x77/0xc0
> 	[  125.739971]  wait_for_direct_io+0x1cb/0x500
> 	[  125.739987]  orangefs_writepage+0x76/0xb0
> 	[  125.740002]  __writepage+0x16/0x40
> 	[  125.740014]  write_cache_pages+0x266/0x5c0
> 	[  125.740027]  ? sched_clock_local+0x17/0x90
> 	[  125.740040]  ? wb_position_ratio+0x1f0/0x1f0
> 	[  125.740060]  generic_writepages+0x52/0x70
> 	[  125.740074]  do_writepages+0x2b/0x30
> 	[  125.740084]  ? do_writepages+0x2b/0x30
> 	[  125.740096]  __writeback_single_inode+0x61/0x6d0
> 	[  125.740109]  ? _raw_spin_unlock+0x27/0x30
> 	[  125.740122]  writeback_sb_inodes+0x2e4/0x680
> 	[  125.740141]  __writeback_inodes_wb+0x8f/0xc0
> 	[  125.740155]  wb_writeback+0x32c/0x4f0
> 	[  125.740171]  wb_workfn+0x9c/0x560
> 	[  125.740181]  ? wb_workfn+0x9c/0x560
> 	[  125.740194]  ? process_one_work+0x164/0x650
> 	[  125.740210]  process_one_work+0x1e1/0x650
> 	[  125.740225]  worker_thread+0x69/0x4a0
> 	[  125.740240]  kthread+0x127/0x160
> 	[  125.740250]  ? process_one_work+0x650/0x650
> 	[  125.740262]  ? kthread_create_on_node+0x40/0x40
> 	[  125.740275]  ret_from_fork+0x31/0x40
> 	[  125.740291] Code: 75 05 e8 72 fb ff ff 48 8b 43 60 48 2b 43 50 88 43 4e 5b 5d f3 c3 90 90 90 66 66 90 66 90 48 89 f8 48 89 d1 48 c1 e9 03 83 e2 07 <f3> 48 a5 89 d1 f3 a4 c3 66 0f 1f 44 00 00 48 89 f8 48 89 d1 f3 
> 	[  125.740413] RIP: __memcpy+0x12/0x20 RSP: ffffc900016db800
> 	[  125.740425] CR2: ffff880303833802
> 	[  125.740437] ---[ end trace 844fdaac4da7b68d ]---
> 
> > 
> > > +	r = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter,
> > > +	    len, 0);
> > > +	kunmap_atomic(map);
> > > +
> > 
> > When writeback fails for some reason, you'll also want to call
> > mapping_set_error to help ensure that those errors get reported (unless
> > you're tracking them on your own somehow). I don't see where that's
> > being done in a cursory glance at these patches, but I could have missed
> > it.
> > 
> > > +	end_page_writeback(page);
> > > +	unlock_page(page);
> > > +	return 0;
> > > +}
> > > +
> > > static int orangefs_readpage(struct file *file, struct page *page)
> > >  {
> > >  	int ret;
> > > @@ -786,6 +759,17 @@ static int orangefs_readpage(struct file *file, struct page *page)
> > >  	return ret;
> > >  }
> > >  
> > > +static int orangefs_write_end(struct file *file,
> > > +    struct address_space *mapping, loff_t pos, unsigned len,
> > > +    unsigned copied, struct page *page, void *fsdata)
> > > +{
> > > +	int r;
> > > +	r = simple_write_end(file, mapping, pos, len, copied, page,
> > > +	    fsdata);
> > > +	mark_inode_dirty_sync(file_inode(file));
> > > +	return r;
> > > +}
> > > +
> > >  static void orangefs_invalidatepage(struct page *page,
> > >  				 unsigned int offset,
> > >  				 unsigned int length)
> > > @@ -815,17 +799,17 @@ static ssize_t orangefs_direct_IO(struct kiocb *iocb,
> > >  {
> > >  	struct file *file = iocb->ki_filp;
> > >  	loff_t pos = *(&iocb->ki_pos);
> > > -	/*
> > > -	 * This cannot happen until write_iter becomes
> > > -	 * generic_file_write_iter.
> > > -	 */
> > > -	BUG_ON(iov_iter_rw(iter) != READ);
> > > -	return do_readv_writev(ORANGEFS_IO_READ, file, &pos, iter);
> > > +	return do_readv_writev(iocb->ki_flags & IOCB_WRITE ?
> > > +	    ORANGEFS_IO_WRITE : ORANGEFS_IO_READ, file, &pos, iter);
> > >  }
> > >  
> > >  /** ORANGEFS2 implementation of address space operations */
> > >  const struct address_space_operations orangefs_address_operations = {
> > > +	.writepage = orangefs_writepage,
> > >  	.readpage = orangefs_readpage,
> > > +	.set_page_dirty = __set_page_dirty_nobuffers,
> > > +	.write_begin = simple_write_begin,
> > > +	.write_end = orangefs_write_end,
> > >  	.invalidatepage = orangefs_invalidatepage,
> > >  	.releasepage = orangefs_releasepage,
> > >  	.direct_IO = orangefs_direct_IO,
> > 
> > -- 
> > Jeff Layton <jlayton@xxxxxxxxxx>

-- 
Jeff Layton <jlayton@xxxxxxxxxx>



[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [Samba]     [Device Mapper]     [CEPH Development]
  Powered by Linux