On 11/7/2023 8:38 PM, Keith Busch wrote:
> On Tue, Nov 07, 2023 at 03:55:14PM +0530, Kanchan Joshi wrote:
>> On 11/6/2023 8:32 PM, Keith Busch wrote:
>>> On Mon, Nov 06, 2023 at 11:18:03AM +0530, Kanchan Joshi wrote:
>>>> On 10/27/2023 11:49 PM, Keith Busch wrote:
>>>>> +	for (i = 0; i < nr_vecs; i = j) {
>>>>> +		size_t size = min_t(size_t, bytes, PAGE_SIZE - offs);
>>>>> +		struct folio *folio = page_folio(pages[i]);
>>>>> +
>>>>> +		bytes -= size;
>>>>> +		for (j = i + 1; j < nr_vecs; j++) {
>>>>> +			size_t next = min_t(size_t, PAGE_SIZE, bytes);
>>>>> +
>>>>> +			if (page_folio(pages[j]) != folio ||
>>>>> +			    pages[j] != pages[j - 1] + 1)
>>>>> +				break;
>>>>> +			unpin_user_page(pages[j]);
>>>>
>>>> Is this unpin correct here?
>>>
>>> Should be. The pages are bound to the folio, so this doesn't really
>>> unpin the user page. It just drops a reference, and the folio holds the
>>> final reference to the contiguous pages, which is released on
>>> completion.
>>
>> But the completion is still going to see multiple pages and not one
>> (folio). The bip_for_each_vec loop is going to drop the reference again.
>> I suspect it is not folio-aware.
>
> The completion unpins once per bvec, not individual pages. The setup
> creates multipage bvecs with only one pin remaining per bvec for all of
> the bvec's pages. If a page can't be merged into the current bvec, then
> that page is not unpinned and becomes the first page of the next bvec.

Here is a test program [2] that creates this scenario: a single 8KB+16b
read on a 4KB+8b formatted namespace. It prepares the meta-buffer from a
huge page so that it spans two regular 4K pages. With this, I see more
unpins than expected.

Note that I had also added [1] on top of your patch.

[1]
@@ -339,7 +367,22 @@ int bio_integrity_map_user(struct bio *bio, void __user *ubuf, unsigned int len,
 	memcpy(bip->bip_vec, bvec, folios * sizeof(*bvec));
 	if (bvec != stack_vec)
 		kfree(bvec);
+	// quick fix for completion
+	bip->bip_vcnt = folios;
+	bip->bip_iter.bi_size = len;
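To spell out the accounting I understand from the above (exactly one
pin left per multipage bvec), here is a rough sketch of what a
folio-aware completion-side unpin could look like. This is only my
reading, not something taken from the patch, and the helper name is
made up; the idea is to walk bip_vec[] once per bvec instead of going
through bip_for_each_vec(), which splits a multipage bvec back into
single-page segments:

/*
 * Sketch only: drop the one remaining pin that the mapping side left
 * on each (possibly multipage) bvec, i.e. unpin once per bvec rather
 * than once per page.
 */
static void bio_integrity_unpin_bvecs(struct bio_integrity_payload *bip)
{
	struct bio_vec *bv = bip->bip_vec;
	int i;

	for (i = 0; i < bip->bip_vcnt; i++)
		unpin_user_page(bv[i].bv_page);
}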
[2]

#define _GNU_SOURCE
#include <unistd.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <liburing.h>
#include <libnvme.h>

#define DEV		"/dev/ng0n1"
#define NSID		1
#define DBCNT		2
#define DATA_BUFLEN	(4096 * DBCNT)
#define OFFSET		0
#define LBA_SHIFT	12	/* This assumes 4K + 8b lba format */
#define MD_BUFLEN	(8 * DBCNT)
#define MD_OFFSET	(4096 - 8)
#define HP_SIZE		(2 * 2 * 1024 * 1024)	/* Two 2M pages */

#define APPTAG_MASK	(0xFFFF)
#define APPTAG		(0x8888)

void *alloc_meta_buf_hp()
{
	void *ptr;

	ptr = mmap(NULL, HP_SIZE, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	if (ptr == MAP_FAILED)
		return NULL;
	return ptr;
}

void free_meta_buf(void *ptr)
{
	munmap(ptr, HP_SIZE);
}

int main()
{
	struct io_uring ring;
	struct io_uring_cqe *cqe;
	struct io_uring_sqe *sqe;
	struct io_uring_params p = { };
	int fd, ret;
	struct nvme_uring_cmd *cmd;
	void *buffer, *md_buf;
	__u64 slba;
	__u16 nlb;

	ret = posix_memalign(&buffer, DATA_BUFLEN, DATA_BUFLEN);
	if (ret) {
		fprintf(stderr, "data buffer allocation failed: %d\n", ret);
		return 1;
	}
	memset(buffer, 'x', DATA_BUFLEN);

	md_buf = alloc_meta_buf_hp();
	if (!md_buf) {
		fprintf(stderr, "meta buffer allocation failed\n");
		return 1;
	}

	p.flags = IORING_SETUP_CQE32 | IORING_SETUP_SQE128;
	ret = io_uring_queue_init_params(4, &ring, &p);
	if (ret) {
		fprintf(stderr, "ring create failed: %d\n", ret);
		return 1;
	}

	fd = open(DEV, O_RDWR);
	if (fd < 0) {
		perror("file open");
		exit(1);
	}

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_read(sqe, fd, buffer, DATA_BUFLEN, OFFSET);
	sqe->cmd_op = NVME_URING_CMD_IO;
	sqe->opcode = IORING_OP_URING_CMD;
	sqe->user_data = 1234;

	cmd = (struct nvme_uring_cmd *)sqe->cmd;
	memset(cmd, 0, sizeof(struct nvme_uring_cmd));
	cmd->opcode = nvme_cmd_read;
	cmd->addr = (__u64)(uintptr_t)buffer;
	cmd->data_len = DATA_BUFLEN;
	cmd->nsid = NSID;

	slba = OFFSET >> LBA_SHIFT;
	nlb = (DATA_BUFLEN >> LBA_SHIFT) - 1;
	cmd->cdw10 = slba & 0xffffffff;
	cmd->cdw11 = slba >> 32;
	cmd->cdw12 = nlb;

	/* set the pract and prchk (Guard, App, RefTag) bits in cdw12 */
	//cmd->cdw12 |= 15 << 26;
	cmd->cdw12 |= 7 << 26;

	cmd->metadata = ((__u64)(uintptr_t)md_buf) + MD_OFFSET;
	cmd->metadata_len = MD_BUFLEN;

	/* reftag */
	cmd->cdw14 = (__u32)slba;
	/* apptag mask and apptag */
	cmd->cdw15 = APPTAG_MASK << 16 | APPTAG;

	ret = io_uring_submit(&ring);
	if (ret != 1) {
		fprintf(stderr, "submit got %d, wanted %d\n", ret, 1);
		goto err;
	}

	ret = io_uring_wait_cqe(&ring, &cqe);
	if (ret) {
		fprintf(stderr, "wait_cqe=%d\n", ret);
		goto err;
	}
	if (cqe->res != 0) {
		fprintf(stderr, "cqe res %d, wanted success\n", cqe->res);
		goto err;
	}
	io_uring_cqe_seen(&ring, cqe);

	free_meta_buf(md_buf);
	close(fd);
	io_uring_queue_exit(&ring);
	return 0;
err:
	if (fd != -1)
		close(fd);
	io_uring_queue_exit(&ring);
	return 1;
}
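In case it helps with reproducing: I build the above with something
along the lines of "gcc -O2 test.c -o repro -luring -lnvme" (liburing
and libnvme development headers installed), reserve hugepages via
/proc/sys/vm/nr_hugepages beforehand, and run it with enough privileges
to issue passthrough commands on /dev/ng0n1.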