When doing a synchronous write on an encrypted inode, we have no
guarantee that the caller is writing crypto block-aligned data. When
that happens, we must do a read/modify/write cycle.
First, expand the range to cover complete blocks. If we had to change
the original pos or length, issue a read to fill the first and/or last
pages, and fetch the version of the object from the result.
We then copy data into the pages as usual, encrypt the result and issue
a write prefixed by an assertion that the version hasn't changed. If
it has
changed then we restart the whole thing again.
If there is no object at that position in the file (-ENOENT), we prefix
the write on an exclusive create of the object instead.
Signed-off-by: Jeff Layton <jlayton@xxxxxxxxxx>
---
fs/ceph/file.c | 260 +++++++++++++++++++++++++++++++++++++++++++------
1 file changed, 228 insertions(+), 32 deletions(-)
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index a6305ad5519b..41766b2012e9 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1468,18 +1468,16 @@ ceph_sync_write(struct kiocb *iocb, struct
iov_iter *from, loff_t pos,
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
- struct ceph_vino vino;
+ struct ceph_osd_client *osdc = &fsc->client->osdc;
struct ceph_osd_request *req;
struct page **pages;
u64 len;
int num_pages;
int written = 0;
- int flags;
int ret;
bool check_caps = false;
struct timespec64 mtime = current_time(inode);
size_t count = iov_iter_count(from);
- size_t off;
if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
return -EROFS;
@@ -1499,70 +1497,267 @@ ceph_sync_write(struct kiocb *iocb, struct
iov_iter *from, loff_t pos,
if (ret < 0)
dout("invalidate_inode_pages2_range returned %d\n", ret);
- flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE;
-
while ((len = iov_iter_count(from)) > 0) {
size_t left;
int n;
+ u64 write_pos = pos;
+ u64 write_len = len;
+ u64 objnum, objoff;
+ u32 xlen;
+ u64 assert_ver;
+ bool rmw;
+ bool first, last;
+ struct iov_iter saved_iter = *from;
+ size_t off;
+
+ fscrypt_adjust_off_and_len(inode, &write_pos, &write_len);
+
+ /* clamp the length to the end of first object */
+ ceph_calc_file_object_mapping(&ci->i_layout, write_pos,
+ write_len, &objnum, &objoff,
+ &xlen);
+ write_len = xlen;
+
+ /* adjust len downward if it goes beyond current object */
+ if (pos + len > write_pos + write_len)
+ len = write_pos + write_len - pos;
- vino = ceph_vino(inode);
- req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
- vino, pos, &len, 0, 1,
- CEPH_OSD_OP_WRITE, flags, snapc,
- ci->i_truncate_seq,
- ci->i_truncate_size,
- false);
- if (IS_ERR(req)) {
- ret = PTR_ERR(req);
- break;
- }
+ /*
+ * If we had to adjust the length or position to align with a
+ * crypto block, then we must do a read/modify/write cycle. We
+ * use a version assertion to redrive the thing if something
+ * changes in between.
+ */
+ first = pos != write_pos;
+ last = (pos + len) != (write_pos + write_len);
+ rmw = first || last;
- /* FIXME: express in FSCRYPT_BLOCK_SIZE units */
- num_pages = calc_pages_for(pos, len);
+ /*
+ * The data is emplaced into the page as it would be if it
were in
+ * an array of pagecache pages.
+ */
+ num_pages = calc_pages_for(write_pos, write_len);
pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
if (IS_ERR(pages)) {
ret = PTR_ERR(pages);
- goto out;
+ break;
+ }
+
+ /* Do we need to preload the pages? */
+ if (rmw) {
+ u64 first_pos = write_pos;
+ u64 last_pos = (write_pos + write_len) -
CEPH_FSCRYPT_BLOCK_SIZE;
+ u64 read_len = CEPH_FSCRYPT_BLOCK_SIZE;
+
+ /* We should only need to do this for encrypted inodes */
+ WARN_ON_ONCE(!IS_ENCRYPTED(inode));
+
+ /* No need to do two reads if first and last blocks are
same */
+ if (first && last_pos == first_pos)
+ last = false;
+
+ /*
+ * Allocate a read request for one or two extents,
depending
+ * on how the request was aligned.
+ */
+ req = ceph_osdc_new_request(osdc, &ci->i_layout,
+ ci->i_vino, first ? first_pos : last_pos,
+ &read_len, 0, (first && last) ? 2 : 1,
+ CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
+ NULL, ci->i_truncate_seq,
+ ci->i_truncate_size, false);
+ if (IS_ERR(req)) {
+ ceph_release_page_vector(pages, num_pages);
+ ret = PTR_ERR(req);
+ break;
+ }
+
+ /* Something is misaligned! */
+ if (read_len != CEPH_FSCRYPT_BLOCK_SIZE) {
+ ret = -EIO;
+ break;
+ }
+
+ /* Add extent for first block? */
+ if (first)
+ osd_req_op_extent_osd_data_pages(req, 0, pages,
+ CEPH_FSCRYPT_BLOCK_SIZE,
+ offset_in_page(first_pos),
+ false, false);
+
+ /* Add extent for last block */
+ if (last) {
+ /* Init the other extent if first extent has been
used */
+ if (first) {
+ osd_req_op_extent_init(req, 1, CEPH_OSD_OP_READ,
+ last_pos, CEPH_FSCRYPT_BLOCK_SIZE,
+ ci->i_truncate_size,
+ ci->i_truncate_seq);
+ }
+
+ osd_req_op_extent_osd_data_pages(req, first ? 1 : 0,
+ &pages[num_pages - 1],
+ CEPH_FSCRYPT_BLOCK_SIZE,
+ offset_in_page(last_pos),
+ false, false);
+ }
+
+ ret = ceph_osdc_start_request(osdc, req, false);
+ if (!ret)
+ ret = ceph_osdc_wait_request(osdc, req);
+
+ /* FIXME: length field is wrong if there are 2 extents */
+ ceph_update_read_metrics(&fsc->mdsc->metric,
+ req->r_start_latency,
+ req->r_end_latency,
+ read_len, ret);
+
+ /* Ok if object is not already present */
+ if (ret == -ENOENT) {
+ /*
+ * If there is no object, then we can't assert
+ * on its version. Set it to 0, and we'll use an
+ * exclusive create instead.
+ */
+ ceph_osdc_put_request(req);
+ assert_ver = 0;
+ ret = 0;
+
+ /*
+ * zero out the soon-to-be uncopied parts of the
+ * first and last pages.
+ */
+ if (first)
+ zero_user_segment(pages[0], 0,