> > /* > * Shift extent records to the left to cover a hole. > * > * The maximum number of extents to be shifted in a single operation > * is @count, and @current_ext keeps track of the current extent > * index we have shifted. If there is no hole to shift the extents > * into, then we abort immediately. > */ Thanks for your help. I will change this comment instead of original one. >> +int >> +xfs_bmap_shift_extents( >> + struct xfs_trans *tp, >> + struct xfs_inode *ip, >> + int *done, >> + xfs_fileoff_t start_fsb, >> + xfs_fileoff_t shift, > > Shift means ...? Number of extents to shift, a length, a number of > block, or something else? Ah, yes, shift_len would be a more proper name > >> + xfs_extnum_t *current_ext, >> + xfs_fsblock_t *firstblock, >> + struct xfs_bmap_free *flist, >> + int count) > > if count is the number of extents to shift, then it should be named > "num_exts" or something similar to describe what it is a count of. Right, I will change num_exts. > >> +{ >> + struct xfs_btree_cur *cur; >> + struct xfs_bmbt_rec_host *gotp; >> + struct xfs_bmbt_irec left; >> + struct xfs_mount *mp = ip->i_mount; >> + struct xfs_ifork *ifp; >> + xfs_extnum_t nexts = 0; >> + xfs_fileoff_t startoff; >> + int error = 0; >> + int i; >> + int whichfork = XFS_DATA_FORK; >> + int state; >> + int logflags; >> + xfs_filblks_t blockcount = 0; >> + >> + if (unlikely(XFS_TEST_ERROR( >> + (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && >> + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), >> + mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { >> + XFS_ERROR_REPORT("xfs_bmap_shift_extents", >> + XFS_ERRLEVEL_LOW, mp); >> + return XFS_ERROR(EFSCORRUPTED); >> + } >> + >> + if (XFS_FORCED_SHUTDOWN(mp)) >> + return XFS_ERROR(EIO); >> + >> + ifp = XFS_IFORK_PTR(ip, whichfork); >> + >> + if (!(ifp->if_flags & XFS_IFEXTENTS)) { >> + /* Read in all the extents */ >> + error = xfs_iread_extents(tp, ip, whichfork); >> + if (error) >> + return error; >> + } >> + >> + if (!*current_ext) { > > I had to do a double take on that, because I thought it was checking > for a null pointer at first. It's not, so at the start of the > function: > > ASSERT(current_ext != NULL); > > secondly, it's checking for a zero count, so make it clear in this > case: > > if (*current_ext == 0) { Okay, I will update like this. > .... >> + gotp = xfs_iext_bno_to_ext(ifp, start_fsb, current_ext); >> + /* >> + * gotp can be null in 2 cases: 1) if there are no extents >> + * or 2) start_fsb lies in a hole beyond which there are >> + * no extents. Either way, we are done. >> + */ >> + if (!gotp) { >> + *done = 1; >> + return 0; >> + } > > What does "gotp" mean in this context? Yes, it's the extent we got > from a lookup, but what extent is that? Is it the extent we are > shifting, the extent we are shifting it up against, or something > else? > >> + } >> + >> + /* We are going to change core inode */ >> + logflags = XFS_ILOG_CORE; >> + >> + if (ifp->if_flags & XFS_IFBROOT) { >> + cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); >> + cur->bc_private.b.firstblock = *firstblock; >> + cur->bc_private.b.flist = flist; >> + cur->bc_private.b.flags = 0; >> + } >> + else { >> + cur = NULL; >> + logflags |= XFS_ILOG_DEXT; >> + } >> + >> + while (nexts++ < count && >> + *current_ext < XFS_IFORK_NEXTENTS(ip, whichfork)) { >> + state = 0; >> + >> + gotp = xfs_iext_get_ext(ifp, *current_ext); >> + startoff = xfs_bmbt_get_startoff(gotp); >> + startoff -= shift; > > xfs_bmbt_get_all(gotp, &got); > > and then you can drop all the xfs_bmbt_get*() wrappers. Okay, I will check it. > >> + >> + /* >> + * Before shifting extent into hole, make sure that the hole >> + * is large enough to accomodate the shift. >> + */ >> + if (*current_ext) { >> + state |= BMAP_LEFT_VALID; >> + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, >> + *current_ext - 1), &left); >> + >> + if (isnullstartblock(left.br_startblock)) >> + state |= BMAP_LEFT_DELAY; >> + >> + if (startoff < left.br_startoff + left.br_blockcount) >> + error = XFS_ERROR(EFSCORRUPTED); > > Why is the filesystem corrupted if the shift we asked for is too > large for the hole in the file? I haven't seen any checks before > this that guarantee that the hole is big enough for the shift... we call xfs_free_file_space to free enough blocks for shifting. If still the space is not big enough will it be considered as fs corrupted? What error could we return in this case? > >> + >> + } else if (startoff > xfs_bmbt_get_startoff(gotp)) >> + /* Hole is at the start but not large enough */ >> + error = XFS_ERROR(EFSCORRUPTED); > > Same question.... > >> + >> + if (error) >> + goto del_cursor; >> + >> + /* Check if we can merge 2 adjacent extents */ >> + if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) && >> + left.br_startoff + left.br_blockcount == startoff && >> + left.br_startblock + left.br_blockcount == >> + xfs_bmbt_get_startblock(gotp) && >> + xfs_bmbt_get_state(gotp) == left.br_state && >> + left.br_blockcount + xfs_bmbt_get_blockcount(gotp) <= >> + MAXEXTLEN) { > > The indenting needs work here - whitespace gives lots of context > that is missing here: > > if ((state & BMAP_LEFT_VALID) && > !(state & BMAP_LEFT_DELAY) && > left.br_startoff + left.br_blockcount == startoff && > left.br_startblock + left.br_blockcount == > xfs_bmbt_get_startblock(gotp) && > xfs_bmbt_get_state(gotp) == left.br_state && > left.br_blockcount + xfs_bmbt_get_blockcount(gotp) <= > MAXEXTLEN) { > > And it can be simplified, too: > > if ((state & BMAP_LEFT_VALID) && > !(state & BMAP_LEFT_DELAY) && > > is exactly the same as: > > if (state == BMAP_LEFT_VALID && Right. I will update your points. > >> + blockcount = >> + left.br_blockcount + xfs_bmbt_get_blockcount(gotp); >> + state |= BMAP_LEFT_CONTIG; >> + xfs_iext_remove(ip, *current_ext, 1, 0); >> + XFS_IFORK_NEXT_SET(ip, whichfork >> + XFS_IFORK_NEXTENTS(ip, whichfork) - 1); > > Ok, so you remove and extent from the in-memory tree, but I don't > see where you remove it from the on-disk btree. Okay, I will add code to remove on-disk btree also. > >> + gotp = xfs_iext_get_ext(ifp, --*current_ext); > > xfs_bmbt_get_all(gotp, &got); > >> + } >> + >> + if (cur) { >> + error = xfs_bmbt_lookup_eq(cur, >> + xfs_bmbt_get_startoff(gotp), >> + xfs_bmbt_get_startblock(gotp), >> + xfs_bmbt_get_blockcount(gotp), >> + &i); >> + if (error) >> + goto del_cursor; >> + XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor); >> + } > > This needs to be done before merging extents so the cursor points at > the record that needs to be deleted from the btree when you merge > the extent records. i.e. you need to completely separate the extent > merge case from the update case for both the in-memory extent tree > update and the on-disk btree update.... Okay. > >> + >> return xfs_trans_commit(tp, 0); >> } >> >> + >> +/* >> + * xfs_collapse_file_space: Implements the FALLOC_FL_COLLAPSE_SPACE flag. >> + */ >> +int >> +xfs_collapse_file_space( >> + struct xfs_inode *ip, >> + loff_t offset, >> + loff_t len, >> + int attr_flags) >> +{ >> + int done = 0; >> + struct xfs_mount *mp = ip->i_mount; >> + uint resblks; >> + struct xfs_trans *tp; >> + int error; >> + xfs_extnum_t current_ext = 0; >> + struct xfs_bmap_free free_list; >> + xfs_fsblock_t first_block; >> + int committed; >> + xfs_fileoff_t start_fsb = XFS_B_TO_FSB(mp, offset + len); >> + xfs_fileoff_t shift_fsb = XFS_B_TO_FSB(mp, len); >> + >> + resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); > > Why do we need a stack variable for this? Ah. I will directly use it instead of stack varable. > >> + >> + /* >> + * The first thing we do is to free data blocks in the specified range >> + * by calling xfs_free_file_space(). It would also sync dirty data >> + * and invalidate page cache over the region on which collapse range >> + * is working. >> + */ >> + >> + error = xfs_free_file_space(ip, offset, len, attr_flags); >> + if (error) >> + return error; > > This separation of punching the hole and collapsing the range means > that the operation is not atomic w.r.t. concurrent IO, truncate or > other hole punch/preallocate operations if the XFS_IOLOCK_EXCL is > not held. Hence we need to ensure this operation is executed with > the correct locks held by the caller, and the correct flags passed > into the function. That is, we need these asserts before doing > anything else in this function: > > ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); > ASSERT((attr_flags & XFS_ATTR_NOLOCK) == XFS_ATTR_NOLOCK); > > This makes it clear that there's a bug in the function's locking in > the "out" case.... > Yes, right. I will check. >> + while (!error && !done) { >> + tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); >> + tp->t_flags |= XFS_TRANS_RESERVE; >> + /* >> + * We would need to reserve permanent block for transaction. >> + * This will come into picture when after shifting extent into >> + * hole we found that adjacent extents can be merged which >> + * may lead to freeing of a block during record update. >> + */ >> + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, 0); >> + if (error) { >> + ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp)); >> + xfs_trans_cancel(tp, 0); >> + break; >> + } >> + >> + xfs_ilock(ip, XFS_ILOCK_EXCL); >> + error = xfs_trans_reserve_quota(tp, mp, >> + ip->i_udquot, ip->i_gdquot, ip->i_pdquot, >> + resblks, 0, XFS_QMOPT_RES_REGBLKS); >> + if (error) >> + goto out; >> + >> + xfs_trans_ijoin(tp, ip, 0); >> + >> + xfs_bmap_init(&free_list, &first_block); >> + >> + /* >> + * We are using the write transaction in which max 2 bmbt >> + * updates are allowed >> + */ >> + error = xfs_bmap_shift_extents(tp, ip, &done, start_fsb, >> + shift_fsb, ¤t_ext, >> + &first_block, &free_list, 2); >> + if (error) >> + goto out; >> + >> + error = xfs_bmap_finish(&tp, &free_list, &committed); >> + if (error) >> + goto out; >> + >> + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); >> + xfs_iunlock(ip, XFS_ILOCK_EXCL); >> + } >> + >> + return error; >> + >> +out: >> + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); >> + xfs_iunlock(ip, XFS_IOLOCK_EXCL); > > That should be XFS_ILOCK_EXCL.... Yes :) > >> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c >> index 818c623..9c9c1ff 100644 >> --- a/fs/xfs/xfs_file.c >> +++ b/fs/xfs/xfs_file.c >> @@ -807,7 +807,8 @@ xfs_file_fallocate( >> int cmd = XFS_IOC_RESVSP; >> int attr_flags = XFS_ATTR_NOLOCK; >> >> - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) >> + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | >> + FALLOC_FL_COLLAPSE_RANGE)) >> return -EOPNOTSUPP; >> >> bf.l_whence = 0; >> @@ -819,10 +820,19 @@ xfs_file_fallocate( >> if (mode & FALLOC_FL_PUNCH_HOLE) >> cmd = XFS_IOC_UNRESVSP; >> >> - /* check the new inode size is valid before allocating */ >> - if (!(mode & FALLOC_FL_KEEP_SIZE) && >> - offset + len > i_size_read(inode)) { >> + /* Shrink size in case of FALLOC_FL_COLLAPSE_RANGE */ >> + if (mode & FALLOC_FL_COLLAPSE_RANGE) { >> + cmd = XFS_COLLAPSE_RANGE; >> + if ((offset + len) > i_size_read(inode)) >> + new_size = offset; > > That's an illegal case according to the higher layers. Don't handle > it here, replace it with: > > ASSERT(offset + len < i_size_read(inode)); Okay. > >> + else >> + new_size = i_size_read(inode) - len; > >> + } else if (!(mode & FALLOC_FL_KEEP_SIZE) && >> + offset + len > i_size_read(inode)) >> new_size = offset + len; >> + >> + /* check the new inode size is valid before allocating */ >> + if (new_size || mode & FALLOC_FL_COLLAPSE_RANGE) { > > That's a bit ugly. > > if (new_size != i_size_read(inode)) { > .... > > would be better, and it handles the case of the new size being zero. Right. Will update it. > >> error = inode_newsize_ok(inode, new_size); >> if (error) >> goto out_unlock; >> @@ -836,7 +846,7 @@ xfs_file_fallocate( >> goto out_unlock; >> >> /* Change file size if needed */ >> - if (new_size) { >> + if (new_size || mode & FALLOC_FL_COLLAPSE_RANGE) { >> struct iattr iattr; >> >> iattr.ia_valid = ATTR_SIZE; > > Same again. okay. > > >> diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h >> index 1edb5cc..99f5244 100644 >> --- a/fs/xfs/xfs_fs.h >> +++ b/fs/xfs/xfs_fs.h >> @@ -516,6 +516,12 @@ typedef struct xfs_swapext >> #define XFS_IOC_GETBMAPX _IOWR('X', 56, struct getbmap) >> #define XFS_IOC_ZERO_RANGE _IOW ('X', 57, struct xfs_flock64) >> #define XFS_IOC_FREE_EOFBLOCKS _IOR ('X', 58, struct xfs_eofblocks) >> +/* >> + * Although there is no ioctl implemented yet, we reserve an ioctl number for >> + * representing collapse range operation to avoid any possible collision in >> + * switch case of xfs_change_file_space. >> + */ >> +#define XFS_COLLAPSE_RANGE _IOW('X', 59, struct xfs_flock64) > > XFS_IOC_COLLAPSE_RANGE. Okay. Thanks for review! > > Cheers, > > Dave. > -- > Dave Chinner > david@xxxxxxxxxxxxx -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html