> In stream extension directory entry, the ValidDataLength field describes > how far into the data stream user data has been written, and the > DataLength field describes the file size. > > Signed-off-by: Yuezhang Mo <Yuezhang.Mo@xxxxxxxx> > Reviewed-by: Andy Wu <Andy.Wu@xxxxxxxx> > Reviewed-by: Aoyama Wataru <wataru.aoyama@xxxxxxxx> > --- > fs/exfat/exfat_fs.h | 2 + > fs/exfat/file.c | 122 +++++++++++++++++++++++++++++++++++++++++++- > fs/exfat/inode.c | 96 ++++++++++++++++++++++++++++------ > fs/exfat/namei.c | 6 +++ > 4 files changed, 207 insertions(+), 19 deletions(-) [snip] > +static ssize_t exfat_file_write_iter(struct kiocb *iocb, struct > +iov_iter *iter) { > + ssize_t ret; > + struct file *file = iocb->ki_filp; > + struct inode *inode = file_inode(file); > + struct exfat_inode_info *ei = EXFAT_I(inode); > + loff_t pos = iocb->ki_pos; > + loff_t valid_size; > + > + inode_lock(inode); > + > + valid_size = ei->valid_size; > + > + ret = generic_write_checks(iocb, iter); > + if (ret < 0) > + goto unlock; > + > + if (pos > valid_size) { > + ret = exfat_file_zeroed_range(file, valid_size, pos); > + if (ret < 0 && ret != -ENOSPC) { > + exfat_err(inode->i_sb, > + "write: fail to zero from %llu to %llu(%ld)", > + valid_size, pos, ret); > + } > + if (ret < 0) > + goto unlock; > + } > + > + ret = __generic_file_write_iter(iocb, iter); > + if (ret < 0) > + goto unlock; > + > + inode_unlock(inode); > + > + if (pos > valid_size && iocb_is_dsync(iocb)) { > + ssize_t err = vfs_fsync_range(file, valid_size, pos - 1, > + iocb->ki_flags & IOCB_SYNC); If there is a hole between valid_size and pos, it seems to call sync twice. Is there any reason to call separately? Why don't you call the vfs_fsync_range only once for the merged scope [valid_size:end]? > + if (err < 0) > + return err; > + } > + > + if (ret) > + ret = generic_write_sync(iocb, ret); > + > + return ret; > + > +unlock: > + inode_unlock(inode); > + > + return ret; > +} > + [snip] > @@ -75,8 +75,7 @@ int __exfat_write_inode(struct inode *inode, int sync) > if (ei->start_clu == EXFAT_EOF_CLUSTER) > on_disk_size = 0; > > - ep2->dentry.stream.valid_size = cpu_to_le64(on_disk_size); > - ep2->dentry.stream.size = ep2->dentry.stream.valid_size; > + ep2->dentry.stream.size = cpu_to_le64(on_disk_size); > if (on_disk_size) { > ep2->dentry.stream.flags = ei->flags; > ep2->dentry.stream.start_clu = cpu_to_le32(ei->start_clu); > @@ -85,6 +84,8 @@ int __exfat_write_inode(struct inode *inode, int sync) > ep2->dentry.stream.start_clu = EXFAT_FREE_CLUSTER; > } > > + ep2->dentry.stream.valid_size = cpu_to_le64(ei->valid_size); Is there any reason to not only change the value but also move the line down? > + > exfat_update_dir_chksum_with_entry_set(&es); > return exfat_put_dentry_set(&es, sync); } @@ -306,17 +307,25 @@ > static int exfat_get_block(struct inode *inode, sector_t iblock, > mapped_blocks = sbi->sect_per_clus - sec_offset; > max_blocks = min(mapped_blocks, max_blocks); > > - /* Treat newly added block / cluster */ > - if (iblock < last_block) > - create = 0; > - > - if (create || buffer_delay(bh_result)) { > - pos = EXFAT_BLK_TO_B((iblock + 1), sb); > + pos = EXFAT_BLK_TO_B((iblock + 1), sb); > + if ((create && iblock >= last_block) || buffer_delay(bh_result)) { > if (ei->i_size_ondisk < pos) > ei->i_size_ondisk = pos; > } > > + map_bh(bh_result, sb, phys); > + if (buffer_delay(bh_result)) > + clear_buffer_delay(bh_result); > + > if (create) { > + sector_t valid_blks; > + > + valid_blks = EXFAT_B_TO_BLK_ROUND_UP(ei->valid_size, sb); > + if (iblock < valid_blks && iblock + max_blocks >= valid_blks) > { > + max_blocks = valid_blks - iblock; > + goto done; > + } > + You removed the code for handling the case for (iblock < last_block). So, under all write call-flows, it could be buffer_new abnormally. It seems wrong. right? > err = exfat_map_new_buffer(ei, bh_result, pos); > if (err) { > exfat_fs_error(sb, [snip] > @@ -436,8 +485,20 @@ static ssize_t exfat_direct_IO(struct kiocb *iocb, > struct iov_iter *iter) > * condition of exfat_get_block() and ->truncate(). > */ > ret = blockdev_direct_IO(iocb, inode, iter, exfat_get_block); > - if (ret < 0 && (rw & WRITE)) > - exfat_write_failed(mapping, size); > + if (ret < 0) { > + if (rw & WRITE) > + exfat_write_failed(mapping, size); > + > + if (ret != -EIOCBQUEUED) > + return ret; > + } else > + size = pos + ret; > + > + if ((rw & READ) && pos < ei->valid_size && ei->valid_size < size) { > + iov_iter_revert(iter, size - ei->valid_size); > + iov_iter_zero(size - ei->valid_size, iter); > + } This approach causes unnecessary reads to the range after valid_size, right? But it looks very simple and clear. Hum... Do you have any plan to handle the before and after of valid_size separately?