On Tue, 2014-03-04 at 10:16 -0500, Jeff Layton wrote: > On Mon, 3 Mar 2014 15:11:46 +0000 > Sachin Prabhu <sprabhu@xxxxxxxxxx> wrote: > > > Problem reported in Red Hat bz 1040329 for strict writes where we cache > > only when we hold oplock and write direct to the server when we don't. > > > > When we receive an oplock break, we first change the oplock value for > > the inode in cifsInodeInfo->oplock to indicate that we no longer hold > > the oplock before we enqueue a task to flush changes to the backing > > device. Once we have completed flushing the changes, we return the > > oplock to the server. > > > > There are 2 ways here where we can have data corruption > > 1) While we flush changes to the backing device as part of the oplock > > break, we can have processes write to the file. These writes check for > > the oplock, find none and attempt to write directly to the server. > > These direct writes made while we are flushing from cache could be > > overwritten by data being flushed from the cache causing data > > corruption. > > 2) While a thread runs in cifs_strict_writev, the machine could receive > > and process an oplock break after the thread has checked the oplock and > > found that it allows us to cache and before we have made changes to the > > cache. In that case, we end up with a dirty page in cache when we > > shouldn't have any. This will be flushed later and will overwrite all > > subsequent writes to the part of the file represented by this page. > > > > Before making any writes to the server, we need to confirm that we are > > not in the process of flushing data to the server and if we are, we > > should wait until the process is complete before we attempt the write. > > We should also wait for existing writes to complete before we process > > an oplock break request which changes oplock values. > > > > Signed-off-by: Sachin Prabhu <sprabhu@xxxxxxxxxx> > > --- > > fs/cifs/cifsfs.c | 14 +++++++++- > > fs/cifs/cifsglob.h | 9 +++++++ > > fs/cifs/cifsproto.h | 3 +++ > > fs/cifs/file.c | 32 ++++++++++++++++++++--- > > fs/cifs/misc.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++-- > > 5 files changed, 126 insertions(+), 6 deletions(-) > > > > diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c > > index 849f613..6d5cf79 100644 > > --- a/fs/cifs/cifsfs.c > > +++ b/fs/cifs/cifsfs.c > > @@ -253,6 +253,11 @@ cifs_alloc_inode(struct super_block *sb) > > cifs_set_oplock_level(cifs_inode, 0); > > cifs_inode->delete_pending = false; > > cifs_inode->invalid_mapping = false; > > + clear_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cifs_inode->flags); > > + clear_bit(CIFS_INODE_PENDING_WRITERS, &cifs_inode->flags); > > + clear_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cifs_inode->flags); > > + mutex_init(&cifs_inode->writers_lock); > > + cifs_inode->writers = 0; > > cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ > > cifs_inode->server_eof = 0; > > cifs_inode->uniqueid = 0; > > @@ -731,19 +736,26 @@ static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, > > unsigned long nr_segs, loff_t pos) > > { > > struct inode *inode = file_inode(iocb->ki_filp); > > + struct cifsInodeInfo *cinode = CIFS_I(inode); > > ssize_t written; > > int rc; > > > > + written = cifs_get_writer(cinode); > > + if (written) > > + return written; > > + > > written = generic_file_aio_write(iocb, iov, nr_segs, pos); > > > > if (CIFS_CACHE_WRITE(CIFS_I(inode))) > > - return written; > > + goto out; > > > > rc = filemap_fdatawrite(inode->i_mapping); > > if (rc) > > cifs_dbg(FYI, "cifs_file_aio_write: %d rc on %p inode\n", > > rc, inode); > > > > +out: > > + cifs_put_writer(cinode); > > return written; > > } > > > > diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h > > index f918a99..1a3a88d 100644 > > --- a/fs/cifs/cifsglob.h > > +++ b/fs/cifs/cifsglob.h > > @@ -1096,6 +1096,15 @@ struct cifsInodeInfo { > > unsigned int epoch; /* used to track lease state changes */ > > bool delete_pending; /* DELETE_ON_CLOSE is set */ > > bool invalid_mapping; /* pagecache is invalid */ > > + unsigned long flags; > > +#define CIFS_INODE_PENDING_OPLOCK_BREAK 0 /* oplock break in progress */ > > +#define CIFS_INODE_PENDING_WRITERS 1 /* Writing operations in progress */ > > +#define CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2 2 /* > > nit: please align the values for readability. Probably also good to > wrap them in parenthesis -- e.g. "(1)". > > > + * We have received a request to > > + * downgrade oplock to L2 oplock > > + */ > > + struct mutex writers_lock; > > I don't see any reason to use a mutex here. There are no critical > sections that sleep AFAICT. Why not use a spinlock which would be > smaller (4 bytes vs. 40 bytes on my x86_64 box)? > > In fact, you could even consider just using the i_lock for this (though > I'd understand if you wanted to use your own). > > > + unsigned int writers; /* Number of writers on this inode */ > > unsigned long time; /* jiffies of last update of inode */ > > u64 server_eof; /* current file size on server -- protected by i_lock */ > > u64 uniqueid; /* server inode number */ > > diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h > > index 2c29db6..fa41676 100644 > > --- a/fs/cifs/cifsproto.h > > +++ b/fs/cifs/cifsproto.h > > @@ -127,6 +127,9 @@ extern u64 cifs_UnixTimeToNT(struct timespec); > > extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, > > int offset); > > extern void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock); > > +extern int cifs_get_writer(struct cifsInodeInfo *cinode); > > +extern void cifs_put_writer(struct cifsInodeInfo *cinode); > > +extern void cifs_done_oplock_break(struct cifsInodeInfo *cinode); > > extern int cifs_unlock_range(struct cifsFileInfo *cfile, > > struct file_lock *flock, const unsigned int xid); > > extern int cifs_push_mandatory_locks(struct cifsFileInfo *cfile); > > diff --git a/fs/cifs/file.c b/fs/cifs/file.c > > index 5a5a872..5a94d7b 100644 > > --- a/fs/cifs/file.c > > +++ b/fs/cifs/file.c > > @@ -2580,12 +2580,20 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov, > > struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); > > ssize_t written; > > > > + written = cifs_get_writer(cinode); > > + if (written) > > + return written; > > + > > if (CIFS_CACHE_WRITE(cinode)) { > > if (cap_unix(tcon->ses) && > > (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) > > - && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) > > - return generic_file_aio_write(iocb, iov, nr_segs, pos); > > - return cifs_writev(iocb, iov, nr_segs, pos); > > + && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) { > > + written = generic_file_aio_write( > > + iocb, iov, nr_segs, pos); > > + goto out; > > + } > > + written = cifs_writev(iocb, iov, nr_segs, pos); > > + goto out; > > } > > /* > > * For non-oplocked files in strict cache mode we need to write the data > > @@ -2605,6 +2613,8 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov, > > inode); > > cinode->oplock = 0; > > } > > +out: > > + cifs_put_writer(cinode); > > return written; > > } > > > > @@ -3616,6 +3626,13 @@ static int cifs_launder_page(struct page *page) > > return rc; > > } > > > > +static int > > +cifs_oplock_break_wait(void *unused) > > +{ > > + schedule(); > > + return signal_pending(current) ? -ERESTARTSYS : 0; > > +} > > + > > This wait will just be used from cifs_oplock_break(), right? Since > that's a workqueue job, then we don't expect any signals there. I think > there's no need for checking signal_pending(). Just calling schedule() > is probably fine. > > > void cifs_oplock_break(struct work_struct *work) > > { > > struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo, > > @@ -3625,6 +3642,14 @@ void cifs_oplock_break(struct work_struct *work) > > struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); > > int rc = 0; > > > > + wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS, > > + cifs_oplock_break_wait, TASK_UNINTERRUPTIBLE); > > + > > + if (test_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cinode->flags)) > > + cifs_set_oplock_level(cinode, OPLOCK_READ); > > + else > > + cifs_set_oplock_level(cinode, 0); > > + > > if (!CIFS_CACHE_WRITE(cinode) && CIFS_CACHE_READ(cinode) && > > cifs_has_mand_locks(cinode)) { > > cifs_dbg(FYI, "Reset oplock to None for inode=%p due to mand locks\n", > > @@ -3661,6 +3686,7 @@ void cifs_oplock_break(struct work_struct *work) > > cinode); > > cifs_dbg(FYI, "Oplock release rc = %d\n", rc); > > } > > + cifs_done_oplock_break(cinode); > > } > > > > /* > > diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c > > index 2f9f379..55b26bd 100644 > > --- a/fs/cifs/misc.c > > +++ b/fs/cifs/misc.c > > @@ -466,8 +466,22 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) > > cifs_dbg(FYI, "file id match, oplock break\n"); > > pCifsInode = CIFS_I(netfile->dentry->d_inode); > > > > - cifs_set_oplock_level(pCifsInode, > > - pSMB->OplockLevel ? OPLOCK_READ : 0); > > + set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, > > + &pCifsInode->flags); > > + > > + /* > > + * Set flag if the server downgrades the oplock > > + * to L2 else clear. > > + */ > > + if (pSMB->OplockLevel) > > + set_bit( > > + CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, > > + &pCifsInode->flags); > > + else > > + clear_bit( > > + CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, > > + &pCifsInode->flags); > > + > > queue_work(cifsiod_wq, > > &netfile->oplock_break); > > netfile->oplock_break_cancelled = false; > > @@ -551,6 +565,62 @@ void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock) > > cinode->oplock = 0; > > } > > > > +static int > > +cifs_oplock_break_wait(void *unused) > > +{ > > + schedule(); > > + return signal_pending(current) ? -ERESTARTSYS : 0; > > +} > > + > > +/* > > + * We wait for oplock breaks to be processed before we attempt to perform > > + * writes. > > + */ > > +int cifs_get_writer(struct cifsInodeInfo *cinode) > > +{ > > + int rc; > > + > > +start: > > + rc = wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_OPLOCK_BREAK, > > + cifs_oplock_break_wait, TASK_KILLABLE); > > + if (rc) > > + return rc; > > + > > + mutex_lock(&cinode->writers_lock); > > + if (!cinode->writers) > > + set_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags); > > + cinode->writers++; > > + /* Check to see if we have started servicing an oplock break */ > > + if (test_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags)) { > > + cinode->writers--; > > + if (cinode->writers == 0) { > > + clear_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags); > > + wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS); > > + } > > + mutex_unlock(&cinode->writers_lock); > > + goto start; > > + } > > + mutex_unlock(&cinode->writers_lock); > > + return 0; > > +} > > + > > +void cifs_put_writer(struct cifsInodeInfo *cinode) > > +{ > > + mutex_lock(&cinode->writers_lock); > > + cinode->writers--; > > + if (cinode->writers == 0) { > > + clear_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags); > > + wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS); > > + } > > + mutex_unlock(&cinode->writers_lock); > > +} > > + > > +void cifs_done_oplock_break(struct cifsInodeInfo *cinode) > > +{ > > + clear_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags); > > + wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_OPLOCK_BREAK); > > +} > > + > > bool > > backup_cred(struct cifs_sb_info *cifs_sb) > > { > > Looks good otherwise, and definitely something we need to fix since > it's a data corruptor. The final version of this should probably go to > stable for that reason... Jeff, I've made the changes you mentioned and have posted the latest version to the list. I did however leave the spinlock writers_lock in there. Sachin Prabhu -- To unsubscribe from this list: send the line "unsubscribe linux-cifs" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html