On Thu, May 25, 2017 at 12:09:49PM -0400, Jeff Layton wrote: > On Mon, 2017-05-22 at 05:59 -0400, Martin Brandenburg wrote: > > With this and the last commit, OrangeFS is capable of writing through > > the page cache. This should significantly increase performance of very > > small writes, since writeback will not be done for every write call. > > > > However it is not appropriate for use with multiple clients due to the > > long writeback delay. > > > > Signed-off-by: Martin Brandenburg <martin@xxxxxxxxxxxx> > > --- > > fs/orangefs/file.c | 128 +++++++++++++++++++++++------------------------------ > > 1 file changed, 56 insertions(+), 72 deletions(-) > > > > diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c > > index c03deea..3ab0a1f 100644 > > --- a/fs/orangefs/file.c > > +++ b/fs/orangefs/file.c > > @@ -448,69 +448,11 @@ static ssize_t orangefs_file_read_iter(struct kiocb *iocb, > > return generic_file_read_iter(iocb, iter); > > } > > > > -static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *iter) > > +static ssize_t orangefs_file_write_iter(struct kiocb *iocb, > > + struct iov_iter *iter) > > { > > - struct file *file = iocb->ki_filp; > > - loff_t pos; > > - ssize_t rc; > > - > > - BUG_ON(iocb->private); > > - > > - gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_write_iter\n"); > > - > > - inode_lock(file->f_mapping->host); > > - > > - /* Make sure generic_write_checks sees an up to date inode size. */ > > - if (file->f_flags & O_APPEND) { > > - rc = orangefs_inode_getattr(file->f_mapping->host, 0, 1, > > - STATX_SIZE); > > - if (rc == -ESTALE) > > - rc = -EIO; > > - if (rc) { > > - gossip_err("%s: orangefs_inode_getattr failed, " > > - "rc:%zd:.\n", __func__, rc); > > - goto out; > > - } > > - } > > - > > - if (file->f_pos > i_size_read(file->f_mapping->host)) > > - orangefs_i_size_write(file->f_mapping->host, file->f_pos); > > - > > - rc = generic_write_checks(iocb, iter); > > - > > - if (rc <= 0) { > > - gossip_err("%s: generic_write_checks failed, rc:%zd:.\n", > > - __func__, rc); > > - goto out; > > - } > > - > > - /* > > - * if we are appending, generic_write_checks would have updated > > - * pos to the end of the file, so we will wait till now to set > > - * pos... > > - */ > > - pos = *(&iocb->ki_pos); > > - > > - rc = do_readv_writev(ORANGEFS_IO_WRITE, > > - file, > > - &pos, > > - iter); > > - if (rc < 0) { > > - gossip_err("%s: do_readv_writev failed, rc:%zd:.\n", > > - __func__, rc); > > - goto out; > > - } > > - > > - iocb->ki_pos = pos; > > orangefs_stats.writes++; > > - > > - if (pos > i_size_read(file->f_mapping->host)) > > - orangefs_i_size_write(file->f_mapping->host, pos); > > - > > -out: > > - > > - inode_unlock(file->f_mapping->host); > > - return rc; > > + return generic_file_write_iter(iocb, iter); > > } > > > > /* > > @@ -606,9 +548,8 @@ static int orangefs_file_release(struct inode *inode, struct file *file) > > orangefs_flush_inode(inode); > > > > /* > > - * remove all associated inode pages from the page cache and > > - * readahead cache (if any); this forces an expensive refresh of > > - * data for the next caller of mmap (or 'get_block' accesses) > > + * remove all associated inode pages from the readahead cache > > + * (if any) > > */ > > if (file_inode(file) && > > file_inode(file)->i_mapping && > > @@ -621,8 +562,6 @@ static int orangefs_file_release(struct inode *inode, struct file *file) > > gossip_debug(GOSSIP_INODE_DEBUG, > > "flush_racache finished\n"); > > } > > - truncate_inode_pages(file_inode(file)->i_mapping, > > - 0); > > } > > return 0; > > } > > @@ -741,6 +680,40 @@ const struct file_operations orangefs_file_operations = { > > .fsync = orangefs_fsync, > > }; > > > > +static int orangefs_writepage(struct page *page, > > + struct writeback_control *wbc) > > +{ > > + struct inode *inode = page->mapping->host; > > + struct iov_iter iter; > > + struct iovec iov; > > + loff_t off; > > + size_t len; > > + ssize_t r; > > + void *map; > > + > > + off = page_offset(page); > > + len = i_size_read(inode); > > + if (off + PAGE_SIZE > len) > > + len = len - off; > > + else > > + len = PAGE_SIZE; > > + > > + map = kmap_atomic(page); > > + iov.iov_base = map; > > + iov.iov_len = len; > > + iov_iter_init(&iter, WRITE, &iov, 1, len); > > + > > + set_page_writeback(page); > > + > > wait_for_direct_io can sleep, so you can't use kmap_atomic there. > Regular old kmap should be ok there though. > > Also, you probably really don't want to use iovecs there, as those are > expected to deal in userland addresses and the kmap address won't be > one. > > It may be cleaner to use a bio_vec there instead. You most likely > wouldn't need to kmap at all if you did that. I tried that since that's what readpage does. static int orangefs_writepage(struct page *page, struct writeback_control *wbc) { struct inode *inode = page->mapping->host; struct iov_iter iter; struct bio_vec bv; loff_t off; size_t len; ssize_t r; off = page_offset(page); len = i_size_read(inode); if (off + PAGE_SIZE > len) len = len - off; else len = PAGE_SIZE; bv.bv_page = page; bv.bv_len = len; iov_iter_bvec(&iter, ITER_BVEC | WRITE, &bv, 1, len); set_page_writeback(page); r = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, len, 0); printk("orangefs_writepage wait_for_direct_io returned %d\n", (int)r); end_page_writeback(page); unlock_page(page); return 0; } But this produced: [ 125.739396] BUG: unable to handle kernel paging request at ffff880303833802 [ 125.739429] IP: __memcpy+0x12/0x20 [ 125.739439] PGD 2ec7067 [ 125.739441] PUD 0 [ 125.739463] Oops: 0000 [#1] SMP [ 125.739474] Modules linked in: nfsd auth_rpcgss oid_registry nfs_acl snd_hda_codec_realtek nfs snd_hda_codec_generic lockd grace fscache sunrpc adt7475 hwmon_vid nouveau video mxm_wmi wmi coretemp i2c_algo_bit kvm_intel drm_kms_helper ttm drm kvm snd_hda_intel snd_hda_codec irqbypass snd_hwdep evdev iTCO_wdt dcdbas iTCO_vendor_support pcspkr snd_hda_core serio_raw i2c_i801 snd_pcm snd_timer lpc_ich mfd_core i2c_core shpchp snd soundcore button acpi_cpufreq autofs4 ext4 crc16 jbd2 mbcache hid_generic usbhid hid sg sr_mod cdrom sd_mod usb_storage ata_generic ata_piix libata psmouse firewire_ohci scsi_mod r8169 firewire_core ehci_pci mii uhci_hcd ehci_hcd crc_itu_t usbcore usb_common [ 125.739683] CPU: 1 PID: 111 Comm: kworker/u8:2 Not tainted 4.11.0+ #66 [ 125.739697] Hardware name: Dell Inc. Studio 540 /0M017G, BIOS 1.1.1 07/24/2009 [ 125.739718] Workqueue: writeback wb_workfn (flush-orangefs-1) [ 125.739734] task: ffff88021d9bc100 task.stack: ffffc900016d8000 [ 125.739748] RIP: 0010:__memcpy+0x12/0x20 [ 125.739759] RSP: 0018:ffffc900016db800 EFLAGS: 00010246 [ 125.739773] RAX: ffff88020693d000 RBX: 0000000000001000 RCX: 0000000000000200 [ 125.739788] RDX: 0000000000000000 RSI: ffff880303833802 RDI: ffff88020693d000 [ 125.739803] RBP: ffffc900016db808 R08: 0000160000000000 R09: 00000000ffff8802 [ 125.739818] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000001000 [ 125.739834] R13: ffffc900016db990 R14: 0000000000000000 R15: ffff88020693e000 [ 125.739849] FS: 0000000000000000(0000) GS:ffff880227000000(0000) knlGS:0000000000000000 [ 125.739866] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 125.739879] CR2: ffff880303833802 CR3: 0000000220d97000 CR4: 00000000000406e0 [ 125.739895] Call Trace: [ 125.739906] ? memcpy_from_page+0x46/0x80 [ 125.739918] copy_from_iter+0x1f1/0x380 [ 125.739929] ? get+0x55/0x2a0 [ 125.739942] copy_page_from_iter+0x166/0x250 [ 125.739957] orangefs_bufmap_copy_from_iovec+0x77/0xc0 [ 125.739971] wait_for_direct_io+0x1cb/0x500 [ 125.739987] orangefs_writepage+0x76/0xb0 [ 125.740002] __writepage+0x16/0x40 [ 125.740014] write_cache_pages+0x266/0x5c0 [ 125.740027] ? sched_clock_local+0x17/0x90 [ 125.740040] ? wb_position_ratio+0x1f0/0x1f0 [ 125.740060] generic_writepages+0x52/0x70 [ 125.740074] do_writepages+0x2b/0x30 [ 125.740084] ? do_writepages+0x2b/0x30 [ 125.740096] __writeback_single_inode+0x61/0x6d0 [ 125.740109] ? _raw_spin_unlock+0x27/0x30 [ 125.740122] writeback_sb_inodes+0x2e4/0x680 [ 125.740141] __writeback_inodes_wb+0x8f/0xc0 [ 125.740155] wb_writeback+0x32c/0x4f0 [ 125.740171] wb_workfn+0x9c/0x560 [ 125.740181] ? wb_workfn+0x9c/0x560 [ 125.740194] ? process_one_work+0x164/0x650 [ 125.740210] process_one_work+0x1e1/0x650 [ 125.740225] worker_thread+0x69/0x4a0 [ 125.740240] kthread+0x127/0x160 [ 125.740250] ? process_one_work+0x650/0x650 [ 125.740262] ? kthread_create_on_node+0x40/0x40 [ 125.740275] ret_from_fork+0x31/0x40 [ 125.740291] Code: 75 05 e8 72 fb ff ff 48 8b 43 60 48 2b 43 50 88 43 4e 5b 5d f3 c3 90 90 90 66 66 90 66 90 48 89 f8 48 89 d1 48 c1 e9 03 83 e2 07 <f3> 48 a5 89 d1 f3 a4 c3 66 0f 1f 44 00 00 48 89 f8 48 89 d1 f3 [ 125.740413] RIP: __memcpy+0x12/0x20 RSP: ffffc900016db800 [ 125.740425] CR2: ffff880303833802 [ 125.740437] ---[ end trace 844fdaac4da7b68d ]--- > > > + r = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, > > + len, 0); > > + kunmap_atomic(map); > > + > > When writeback fails for some reason, you'll also want to call > mapping_set_error to help ensure that those errors get reported (unless > you're tracking them on your own somehow). I don't see where that's > being done in a cursory glance at these patches, but I could have missed > it. > > > + end_page_writeback(page); > > + unlock_page(page); > > + return 0; > > +} > > + > > static int orangefs_readpage(struct file *file, struct page *page) > > { > > int ret; > > @@ -786,6 +759,17 @@ static int orangefs_readpage(struct file *file, struct page *page) > > return ret; > > } > > > > +static int orangefs_write_end(struct file *file, > > + struct address_space *mapping, loff_t pos, unsigned len, > > + unsigned copied, struct page *page, void *fsdata) > > +{ > > + int r; > > + r = simple_write_end(file, mapping, pos, len, copied, page, > > + fsdata); > > + mark_inode_dirty_sync(file_inode(file)); > > + return r; > > +} > > + > > static void orangefs_invalidatepage(struct page *page, > > unsigned int offset, > > unsigned int length) > > @@ -815,17 +799,17 @@ static ssize_t orangefs_direct_IO(struct kiocb *iocb, > > { > > struct file *file = iocb->ki_filp; > > loff_t pos = *(&iocb->ki_pos); > > - /* > > - * This cannot happen until write_iter becomes > > - * generic_file_write_iter. > > - */ > > - BUG_ON(iov_iter_rw(iter) != READ); > > - return do_readv_writev(ORANGEFS_IO_READ, file, &pos, iter); > > + return do_readv_writev(iocb->ki_flags & IOCB_WRITE ? > > + ORANGEFS_IO_WRITE : ORANGEFS_IO_READ, file, &pos, iter); > > } > > > > /** ORANGEFS2 implementation of address space operations */ > > const struct address_space_operations orangefs_address_operations = { > > + .writepage = orangefs_writepage, > > .readpage = orangefs_readpage, > > + .set_page_dirty = __set_page_dirty_nobuffers, > > + .write_begin = simple_write_begin, > > + .write_end = orangefs_write_end, > > .invalidatepage = orangefs_invalidatepage, > > .releasepage = orangefs_releasepage, > > .direct_IO = orangefs_direct_IO, > > -- > Jeff Layton <jlayton@xxxxxxxxxx>