On Fri, 2017-05-26 at 14:09 -0400, martin@xxxxxxxxxxxx wrote: > On Thu, May 25, 2017 at 12:09:49PM -0400, Jeff Layton wrote: > > On Mon, 2017-05-22 at 05:59 -0400, Martin Brandenburg wrote: > > > With this and the last commit, OrangeFS is capable of writing through > > > the page cache. This should significantly increase performance of very > > > small writes, since writeback will not be done for every write call. > > > > > > However it is not appropriate for use with multiple clients due to the > > > long writeback delay. > > > > > > Signed-off-by: Martin Brandenburg <martin@xxxxxxxxxxxx> > > > --- > > > fs/orangefs/file.c | 128 +++++++++++++++++++++++------------------------------ > > > 1 file changed, 56 insertions(+), 72 deletions(-) > > > > > > diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c > > > index c03deea..3ab0a1f 100644 > > > --- a/fs/orangefs/file.c > > > +++ b/fs/orangefs/file.c > > > @@ -448,69 +448,11 @@ static ssize_t orangefs_file_read_iter(struct kiocb *iocb, > > > return generic_file_read_iter(iocb, iter); > > > } > > > > > > -static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *iter) > > > +static ssize_t orangefs_file_write_iter(struct kiocb *iocb, > > > + struct iov_iter *iter) > > > { > > > - struct file *file = iocb->ki_filp; > > > - loff_t pos; > > > - ssize_t rc; > > > - > > > - BUG_ON(iocb->private); > > > - > > > - gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_write_iter\n"); > > > - > > > - inode_lock(file->f_mapping->host); > > > - > > > - /* Make sure generic_write_checks sees an up to date inode size. */ > > > - if (file->f_flags & O_APPEND) { > > > - rc = orangefs_inode_getattr(file->f_mapping->host, 0, 1, > > > - STATX_SIZE); > > > - if (rc == -ESTALE) > > > - rc = -EIO; > > > - if (rc) { > > > - gossip_err("%s: orangefs_inode_getattr failed, " > > > - "rc:%zd:.\n", __func__, rc); > > > - goto out; > > > - } > > > - } > > > - > > > - if (file->f_pos > i_size_read(file->f_mapping->host)) > > > - orangefs_i_size_write(file->f_mapping->host, file->f_pos); > > > - > > > - rc = generic_write_checks(iocb, iter); > > > - > > > - if (rc <= 0) { > > > - gossip_err("%s: generic_write_checks failed, rc:%zd:.\n", > > > - __func__, rc); > > > - goto out; > > > - } > > > - > > > - /* > > > - * if we are appending, generic_write_checks would have updated > > > - * pos to the end of the file, so we will wait till now to set > > > - * pos... > > > - */ > > > - pos = *(&iocb->ki_pos); > > > - > > > - rc = do_readv_writev(ORANGEFS_IO_WRITE, > > > - file, > > > - &pos, > > > - iter); > > > - if (rc < 0) { > > > - gossip_err("%s: do_readv_writev failed, rc:%zd:.\n", > > > - __func__, rc); > > > - goto out; > > > - } > > > - > > > - iocb->ki_pos = pos; > > > orangefs_stats.writes++; > > > - > > > - if (pos > i_size_read(file->f_mapping->host)) > > > - orangefs_i_size_write(file->f_mapping->host, pos); > > > - > > > -out: > > > - > > > - inode_unlock(file->f_mapping->host); > > > - return rc; > > > + return generic_file_write_iter(iocb, iter); > > > } > > > > > > /* > > > @@ -606,9 +548,8 @@ static int orangefs_file_release(struct inode *inode, struct file *file) > > > orangefs_flush_inode(inode); > > > > > > /* > > > - * remove all associated inode pages from the page cache and > > > - * readahead cache (if any); this forces an expensive refresh of > > > - * data for the next caller of mmap (or 'get_block' accesses) > > > + * remove all associated inode pages from the readahead cache > > > + * (if any) > > > */ > > > if (file_inode(file) && > > > file_inode(file)->i_mapping && > > > @@ -621,8 +562,6 @@ static int orangefs_file_release(struct inode *inode, struct file *file) > > > gossip_debug(GOSSIP_INODE_DEBUG, > > > "flush_racache finished\n"); > > > } > > > - truncate_inode_pages(file_inode(file)->i_mapping, > > > - 0); > > > } > > > return 0; > > > } > > > @@ -741,6 +680,40 @@ const struct file_operations orangefs_file_operations = { > > > .fsync = orangefs_fsync, > > > }; > > > > > > +static int orangefs_writepage(struct page *page, > > > + struct writeback_control *wbc) > > > +{ > > > + struct inode *inode = page->mapping->host; > > > + struct iov_iter iter; > > > + struct iovec iov; > > > + loff_t off; > > > + size_t len; > > > + ssize_t r; > > > + void *map; > > > + > > > + off = page_offset(page); > > > + len = i_size_read(inode); > > > + if (off + PAGE_SIZE > len) > > > + len = len - off; > > > + else > > > + len = PAGE_SIZE; > > > + > > > + map = kmap_atomic(page); > > > + iov.iov_base = map; > > > + iov.iov_len = len; > > > + iov_iter_init(&iter, WRITE, &iov, 1, len); > > > + > > > + set_page_writeback(page); > > > + > > > > wait_for_direct_io can sleep, so you can't use kmap_atomic there. > > Regular old kmap should be ok there though. > > > > Also, you probably really don't want to use iovecs there, as those are > > expected to deal in userland addresses and the kmap address won't be > > one. > > > > It may be cleaner to use a bio_vec there instead. You most likely > > wouldn't need to kmap at all if you did that. > > I tried that since that's what readpage does. > > static int orangefs_writepage(struct page *page, > struct writeback_control *wbc) > { > struct inode *inode = page->mapping->host; > struct iov_iter iter; > struct bio_vec bv; > loff_t off; > size_t len; > ssize_t r; > > off = page_offset(page); > len = i_size_read(inode); > if (off + PAGE_SIZE > len) > len = len - off; > else > len = PAGE_SIZE; > > bv.bv_page = page; > bv.bv_len = len; You didn't set bv.bv_offset there. > iov_iter_bvec(&iter, ITER_BVEC | WRITE, &bv, 1, len); > > set_page_writeback(page); > > r = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, > len, 0); > printk("orangefs_writepage wait_for_direct_io returned %d\n", (int)r); > > end_page_writeback(page); > unlock_page(page); > return 0; > } > > But this produced: > > [ 125.739396] BUG: unable to handle kernel paging request at ffff880303833802 > [ 125.739429] IP: __memcpy+0x12/0x20 > [ 125.739439] PGD 2ec7067 > [ 125.739441] PUD 0 > > [ 125.739463] Oops: 0000 [#1] SMP > [ 125.739474] Modules linked in: nfsd auth_rpcgss oid_registry nfs_acl snd_hda_codec_realtek nfs snd_hda_codec_generic lockd grace fscache sunrpc adt7475 hwmon_vid nouveau video mxm_wmi wmi coretemp i2c_algo_bit kvm_intel drm_kms_helper ttm drm kvm snd_hda_intel snd_hda_codec irqbypass snd_hwdep evdev iTCO_wdt dcdbas iTCO_vendor_support pcspkr snd_hda_core serio_raw i2c_i801 snd_pcm snd_timer lpc_ich mfd_core i2c_core shpchp snd soundcore button acpi_cpufreq autofs4 ext4 crc16 jbd2 mbcache hid_generic usbhid hid sg sr_mod cdrom sd_mod usb_storage ata_generic ata_piix libata psmouse firewire_ohci scsi_mod r8169 firewire_core ehci_pci mii uhci_hcd ehci_hcd crc_itu_t usbcore usb_common > [ 125.739683] CPU: 1 PID: 111 Comm: kworker/u8:2 Not tainted 4.11.0+ #66 > [ 125.739697] Hardware name: Dell Inc. Studio 540 /0M017G, BIOS 1.1.1 07/24/2009 > [ 125.739718] Workqueue: writeback wb_workfn (flush-orangefs-1) > [ 125.739734] task: ffff88021d9bc100 task.stack: ffffc900016d8000 > [ 125.739748] RIP: 0010:__memcpy+0x12/0x20 > [ 125.739759] RSP: 0018:ffffc900016db800 EFLAGS: 00010246 > [ 125.739773] RAX: ffff88020693d000 RBX: 0000000000001000 RCX: 0000000000000200 > [ 125.739788] RDX: 0000000000000000 RSI: ffff880303833802 RDI: ffff88020693d000 > [ 125.739803] RBP: ffffc900016db808 R08: 0000160000000000 R09: 00000000ffff8802 > [ 125.739818] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000001000 > [ 125.739834] R13: ffffc900016db990 R14: 0000000000000000 R15: ffff88020693e000 > [ 125.739849] FS: 0000000000000000(0000) GS:ffff880227000000(0000) knlGS:0000000000000000 > [ 125.739866] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 > [ 125.739879] CR2: ffff880303833802 CR3: 0000000220d97000 CR4: 00000000000406e0 > [ 125.739895] Call Trace: > [ 125.739906] ? memcpy_from_page+0x46/0x80 > [ 125.739918] copy_from_iter+0x1f1/0x380 > [ 125.739929] ? get+0x55/0x2a0 > [ 125.739942] copy_page_from_iter+0x166/0x250 > [ 125.739957] orangefs_bufmap_copy_from_iovec+0x77/0xc0 > [ 125.739971] wait_for_direct_io+0x1cb/0x500 > [ 125.739987] orangefs_writepage+0x76/0xb0 > [ 125.740002] __writepage+0x16/0x40 > [ 125.740014] write_cache_pages+0x266/0x5c0 > [ 125.740027] ? sched_clock_local+0x17/0x90 > [ 125.740040] ? wb_position_ratio+0x1f0/0x1f0 > [ 125.740060] generic_writepages+0x52/0x70 > [ 125.740074] do_writepages+0x2b/0x30 > [ 125.740084] ? do_writepages+0x2b/0x30 > [ 125.740096] __writeback_single_inode+0x61/0x6d0 > [ 125.740109] ? _raw_spin_unlock+0x27/0x30 > [ 125.740122] writeback_sb_inodes+0x2e4/0x680 > [ 125.740141] __writeback_inodes_wb+0x8f/0xc0 > [ 125.740155] wb_writeback+0x32c/0x4f0 > [ 125.740171] wb_workfn+0x9c/0x560 > [ 125.740181] ? wb_workfn+0x9c/0x560 > [ 125.740194] ? process_one_work+0x164/0x650 > [ 125.740210] process_one_work+0x1e1/0x650 > [ 125.740225] worker_thread+0x69/0x4a0 > [ 125.740240] kthread+0x127/0x160 > [ 125.740250] ? process_one_work+0x650/0x650 > [ 125.740262] ? kthread_create_on_node+0x40/0x40 > [ 125.740275] ret_from_fork+0x31/0x40 > [ 125.740291] Code: 75 05 e8 72 fb ff ff 48 8b 43 60 48 2b 43 50 88 43 4e 5b 5d f3 c3 90 90 90 66 66 90 66 90 48 89 f8 48 89 d1 48 c1 e9 03 83 e2 07 <f3> 48 a5 89 d1 f3 a4 c3 66 0f 1f 44 00 00 48 89 f8 48 89 d1 f3 > [ 125.740413] RIP: __memcpy+0x12/0x20 RSP: ffffc900016db800 > [ 125.740425] CR2: ffff880303833802 > [ 125.740437] ---[ end trace 844fdaac4da7b68d ]--- > > > > > > + r = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, > > > + len, 0); > > > + kunmap_atomic(map); > > > + > > > > When writeback fails for some reason, you'll also want to call > > mapping_set_error to help ensure that those errors get reported (unless > > you're tracking them on your own somehow). I don't see where that's > > being done in a cursory glance at these patches, but I could have missed > > it. > > > > > + end_page_writeback(page); > > > + unlock_page(page); > > > + return 0; > > > +} > > > + > > > static int orangefs_readpage(struct file *file, struct page *page) > > > { > > > int ret; > > > @@ -786,6 +759,17 @@ static int orangefs_readpage(struct file *file, struct page *page) > > > return ret; > > > } > > > > > > +static int orangefs_write_end(struct file *file, > > > + struct address_space *mapping, loff_t pos, unsigned len, > > > + unsigned copied, struct page *page, void *fsdata) > > > +{ > > > + int r; > > > + r = simple_write_end(file, mapping, pos, len, copied, page, > > > + fsdata); > > > + mark_inode_dirty_sync(file_inode(file)); > > > + return r; > > > +} > > > + > > > static void orangefs_invalidatepage(struct page *page, > > > unsigned int offset, > > > unsigned int length) > > > @@ -815,17 +799,17 @@ static ssize_t orangefs_direct_IO(struct kiocb *iocb, > > > { > > > struct file *file = iocb->ki_filp; > > > loff_t pos = *(&iocb->ki_pos); > > > - /* > > > - * This cannot happen until write_iter becomes > > > - * generic_file_write_iter. > > > - */ > > > - BUG_ON(iov_iter_rw(iter) != READ); > > > - return do_readv_writev(ORANGEFS_IO_READ, file, &pos, iter); > > > + return do_readv_writev(iocb->ki_flags & IOCB_WRITE ? > > > + ORANGEFS_IO_WRITE : ORANGEFS_IO_READ, file, &pos, iter); > > > } > > > > > > /** ORANGEFS2 implementation of address space operations */ > > > const struct address_space_operations orangefs_address_operations = { > > > + .writepage = orangefs_writepage, > > > .readpage = orangefs_readpage, > > > + .set_page_dirty = __set_page_dirty_nobuffers, > > > + .write_begin = simple_write_begin, > > > + .write_end = orangefs_write_end, > > > .invalidatepage = orangefs_invalidatepage, > > > .releasepage = orangefs_releasepage, > > > .direct_IO = orangefs_direct_IO, > > > > -- > > Jeff Layton <jlayton@xxxxxxxxxx> -- Jeff Layton <jlayton@xxxxxxxxxx>