This RFC provides a rough implementation of a mechanism to allow userspace to attach protection information (e.g. T10 DIF) data to a disk write and to receive the information alongside a disk read. There's a new "IO extension" interface wherein we define a structure (per zab's comments on the v2 series) io_extension that points to the the PI data buffer. These patches are against 3.14-rc7. NOTE: As far as I know this works, but this is just a refresh of last week's patchset to start the discussion at LSF, which was moved up to today. I've not done rigorous testing, hence the 'donotmerge'. The first patch is a little bit of code refactoring, as sent in by Gu Zheng. It seems to be queued up for 3.15, so I figured I might as well start from there. Patch #2 implements a generic IO extension interface so that we can receive a struct io_extension from userspace containing the structure size, a flag telling us which extensions we'd like to use (ie_has), and (eventually) extension data. There's a small framework for mapping ie_has bits to actual extensions. Patch #3 provides the plumbing to get the user's buffer all the way to the block integrity code. Due to the way that the code deals with the array of struct page*s that represent the PI buffer, there's an unfortunate requirement that no PI tuple may cross a page boundary. Given that so far DIF is only 8 or 16 bytes this isn't a problem yet. There's also no explicit fallback for the case where the user pages are not within a device's DMA range. This patch hooks into the IO extension interface. Patch #4 builds on the previous patch to allow userspace to send some flags along with the PI buffer. The integrity provider now has a "mod_user_buf_fn" hook that enables the provider to read the userspace flags and modify the PI buffer before submit_bio. For now, this means that T10/DIF provider can be told to patch any of the reference, app, or guard tags. This is useful for sending PI data with an IO request for a file on a filesystem, since the kernel can patch in the device's LBA later. Also it means that if you only care about, say, app tags, you can provide those and let the kernel take care of the crc and the LBA. I don't know if that's anyone's requirement, but there we are. Patch #5 provides a mechanism for integrity providers to advertise both the per-logical-block PI buffer size and the flags that can be passed to the mod_user_buf_fn hook. The advertisements can be found in sysfs, since that's where we present all the other PI details about a device. Patch #6 removes redundant code and modifies the tag get/set functions to follow the other new functions and kmap/unmap the PI buffer page(s) before messing with the PI buffers, instead of relying on pi_buf being a valid pointer. Eventually there will be a patch #7 that makes it so that IO extensions can be piped through the synchronous IO calls, but it was nowhere near ready when I sent this patchset. :( Comments and questions are, as always, welcome. There will be a session about this on the second day of LSF/MM, if I'm not mistaken. A sample program follows this message. $ cc -o prog prog.c $ ./prog -rw -pr -s 2048 /path/to/pi/device --D /* * Userspace DIX API test program * Licensed under GPLv2. Copyright 2014 Oracle. * * XXX: We don't query the kernel for this information like we should! */ #define _GNU_SOURCE #include <stdio.h> #include <libaio.h> #include <unistd.h> #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> #include <sys/uio.h> #include <errno.h> #include <stdlib.h> #include <stdint.h> #include <arpa/inet.h> #include <sys/ioctl.h> #include <linux/fs.h> #include <sys/syscall.h> #define GENERATE_GUARD (1) #define GENERATE_REF (2) #define GENERATE_APP (4) #define GENERATE_ALL (7) #define NR_IOS (1) #define NR_IOVS (2) #define NR_IOCB_EXTS (1) /* Stuff that should go in libaio.h */ #define IO_EXT_INVALID (0) #define IO_EXT_PI (1) /* protection info attached */ #define IOCB_FLAG_EXTENSIONS (1 << 1) #define __FIOEXT 040000000 struct io_extension { __u64 ie_size; __u64 ie_has; /* PI stuff */ __u64 ie_pi_buf; __u32 ie_pi_buflen; __u32 ie_pi_ret; __u32 ie_pi_flags; }; static void io_prep_extensions(struct iocb *iocb, struct io_extension *ext, unsigned int nr) { iocb->u.c.flags |= IOCB_FLAG_EXTENSIONS; iocb->u.c.__pad3 = (long long)ext; } static void io_prep_extension(struct io_extension *ext) { memset(ext, 0, sizeof(struct io_extension)); ext->ie_size = sizeof(*ext); } static void io_prep_extension_pi(struct io_extension *ext, void *buf, unsigned int buflen, unsigned int flags) { ext->ie_has |= IO_EXT_PI; ext->ie_pi_buf = (__u64)buf; ext->ie_pi_buflen = buflen; ext->ie_pi_flags = flags; } /* End stuff for libaio.h */ static void dump_buffer(char *buf, size_t len) { size_t off; char *p; for (p = buf; p < buf + len; p++) { off = p - buf; if (off % 32 == 0) { if (p != buf) printf("\n"); printf("%05zu:", off); } printf(" %02x", *p & 0xFF); } printf("\n"); } /* Table generated using the following polynomium: * x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x + 1 * gt: 0x8bb7 */ static const uint16_t t10_dif_crc_table[256] = { 0x0000, 0x8BB7, 0x9CD9, 0x176E, 0xB205, 0x39B2, 0x2EDC, 0xA56B, 0xEFBD, 0x640A, 0x7364, 0xF8D3, 0x5DB8, 0xD60F, 0xC161, 0x4AD6, 0x54CD, 0xDF7A, 0xC814, 0x43A3, 0xE6C8, 0x6D7F, 0x7A11, 0xF1A6, 0xBB70, 0x30C7, 0x27A9, 0xAC1E, 0x0975, 0x82C2, 0x95AC, 0x1E1B, 0xA99A, 0x222D, 0x3543, 0xBEF4, 0x1B9F, 0x9028, 0x8746, 0x0CF1, 0x4627, 0xCD90, 0xDAFE, 0x5149, 0xF422, 0x7F95, 0x68FB, 0xE34C, 0xFD57, 0x76E0, 0x618E, 0xEA39, 0x4F52, 0xC4E5, 0xD38B, 0x583C, 0x12EA, 0x995D, 0x8E33, 0x0584, 0xA0EF, 0x2B58, 0x3C36, 0xB781, 0xD883, 0x5334, 0x445A, 0xCFED, 0x6A86, 0xE131, 0xF65F, 0x7DE8, 0x373E, 0xBC89, 0xABE7, 0x2050, 0x853B, 0x0E8C, 0x19E2, 0x9255, 0x8C4E, 0x07F9, 0x1097, 0x9B20, 0x3E4B, 0xB5FC, 0xA292, 0x2925, 0x63F3, 0xE844, 0xFF2A, 0x749D, 0xD1F6, 0x5A41, 0x4D2F, 0xC698, 0x7119, 0xFAAE, 0xEDC0, 0x6677, 0xC31C, 0x48AB, 0x5FC5, 0xD472, 0x9EA4, 0x1513, 0x027D, 0x89CA, 0x2CA1, 0xA716, 0xB078, 0x3BCF, 0x25D4, 0xAE63, 0xB90D, 0x32BA, 0x97D1, 0x1C66, 0x0B08, 0x80BF, 0xCA69, 0x41DE, 0x56B0, 0xDD07, 0x786C, 0xF3DB, 0xE4B5, 0x6F02, 0x3AB1, 0xB106, 0xA668, 0x2DDF, 0x88B4, 0x0303, 0x146D, 0x9FDA, 0xD50C, 0x5EBB, 0x49D5, 0xC262, 0x6709, 0xECBE, 0xFBD0, 0x7067, 0x6E7C, 0xE5CB, 0xF2A5, 0x7912, 0xDC79, 0x57CE, 0x40A0, 0xCB17, 0x81C1, 0x0A76, 0x1D18, 0x96AF, 0x33C4, 0xB873, 0xAF1D, 0x24AA, 0x932B, 0x189C, 0x0FF2, 0x8445, 0x212E, 0xAA99, 0xBDF7, 0x3640, 0x7C96, 0xF721, 0xE04F, 0x6BF8, 0xCE93, 0x4524, 0x524A, 0xD9FD, 0xC7E6, 0x4C51, 0x5B3F, 0xD088, 0x75E3, 0xFE54, 0xE93A, 0x628D, 0x285B, 0xA3EC, 0xB482, 0x3F35, 0x9A5E, 0x11E9, 0x0687, 0x8D30, 0xE232, 0x6985, 0x7EEB, 0xF55C, 0x5037, 0xDB80, 0xCCEE, 0x4759, 0x0D8F, 0x8638, 0x9156, 0x1AE1, 0xBF8A, 0x343D, 0x2353, 0xA8E4, 0xB6FF, 0x3D48, 0x2A26, 0xA191, 0x04FA, 0x8F4D, 0x9823, 0x1394, 0x5942, 0xD2F5, 0xC59B, 0x4E2C, 0xEB47, 0x60F0, 0x779E, 0xFC29, 0x4BA8, 0xC01F, 0xD771, 0x5CC6, 0xF9AD, 0x721A, 0x6574, 0xEEC3, 0xA415, 0x2FA2, 0x38CC, 0xB37B, 0x1610, 0x9DA7, 0x8AC9, 0x017E, 0x1F65, 0x94D2, 0x83BC, 0x080B, 0xAD60, 0x26D7, 0x31B9, 0xBA0E, 0xF0D8, 0x7B6F, 0x6C01, 0xE7B6, 0x42DD, 0xC96A, 0xDE04, 0x55B3 }; uint16_t crc_t10dif(uint16_t crc, const unsigned char *buffer, uint32_t len) { unsigned int i; for (i = 0 ; i < len ; i++) crc = (crc << 8) ^ t10_dif_crc_table[((crc >> 8) ^ buffer[i]) & 0xff]; return crc; } struct sd_dif_tuple { uint16_t guard_tag; /* Checksum */ uint16_t app_tag; /* Opaque storage */ uint32_t ref_tag; /* Target LBA or indirect LBA */ }; static void stamp_pi_buffer(struct sd_dif_tuple *t, uint16_t csum, uint16_t tag, uint32_t sector) { t->guard_tag = htons(csum); t->app_tag = htons(tag); t->ref_tag = htonl(sector); } static void print_help(const char *progname) { printf("Usage: %s [OPTS] fname\n", progname); printf("-a Use this application tag\n"); printf("-d Do not use O_DIRECT\n"); printf("-o Read/write this many sectors into the device\n"); printf("-q Don't use AIO.\n"); printf("-r Use DIX to read\n"); printf("-s Allocate buffer of this many sectors\n"); printf("-w Use DIX to write\n"); printf("-z Do not use O_SYNC\n"); } int main(int argc, char *argv[]) { struct sd_dif_tuple *pi; int page_size = sysconf(_SC_PAGESIZE); io_context_t ioctx; struct io_event events[NR_IOS]; struct iocb iocbs[NR_IOS]; struct iocb *iocbps[NR_IOS]; void *buf, *buf2; unsigned char *p; void *mbuf, *mbuf2; int ret, fd, i; struct iovec iov[NR_IOVS]; struct io_extension iocb_ext[NR_IOCB_EXTS]; int opt; int dix_read = 0, dix_write = 0; unsigned int SECTOR_SIZE = 0; unsigned long long num_sectors = 8, BUF_SIZE; unsigned long long sector_offset = 256, BDEV_OFFSET; unsigned int APP_TAG = 0xEF53; unsigned int the_byte = 0x55; size_t pi_buflen; int o_direct = O_DIRECT; int o_sync = O_SYNC; int use_aio = 1; uint32_t pi_flags = 0; while ((opt = getopt(argc, argv, "b:zdrws:o:a:p:q")) != -1) { switch (opt) { case 'a': APP_TAG = strtoul(optarg, NULL, 0); break; case 'b': the_byte = strtoul(optarg, NULL, 0) & 0xFF; break; case 'd': o_direct = 0; break; case 'o': sector_offset = strtoull(optarg, NULL, 0); break; case 'p': for (i = 0; i < strlen(optarg); i++) switch (optarg[i]) { case 'a': pi_flags |= GENERATE_APP; break; case 'g': pi_flags |= GENERATE_GUARD; break; case 'r': pi_flags |= GENERATE_REF; break; default: print_help(argv[0]); return 2; } break; case 'r': dix_read = 1; break; case 's': num_sectors = strtoull(optarg, NULL, 0); break; case 'w': dix_write = 1; break; case 'z': o_sync = 0; break; case 'q': use_aio = 0; break; default: print_help(argv[0]); return 0; } } if (optind >= argc) { print_help(argv[0]); return 0; } if (dix_read) fprintf(stderr, "Using DIX read.\n"); if (dix_write) fprintf(stderr, "Using DIX write.\n"); fd = open(argv[optind], o_direct | o_sync | O_RDWR); if (fd < 0) { perror(argv[optind]); return 1; } /* For now, don't let non-block devices in */ SECTOR_SIZE = 512; if (ioctl(fd, BLKSSZGET, &SECTOR_SIZE)) { perror(argv[optind]); } pi_buflen = num_sectors * sizeof(struct sd_dif_tuple); BUF_SIZE = num_sectors * SECTOR_SIZE; BDEV_OFFSET = sector_offset * SECTOR_SIZE; fprintf(stderr, "sector=%d num_sectors=%llu pi_len=%zu pi_flag=0x%x\n", SECTOR_SIZE, num_sectors, pi_buflen, pi_flags); if (posix_memalign(&buf, page_size, BUF_SIZE) || posix_memalign(&buf2, page_size, BUF_SIZE) || posix_memalign(&mbuf, page_size, pi_buflen) || posix_memalign(&mbuf2, page_size, pi_buflen)) { perror("memalign"); return 1; } if (io_queue_init(2, &ioctx)) { perror("io_queue_init"); return 1; } /* Write everything out */ memset(mbuf, 0, pi_buflen); memset(buf, the_byte, BUF_SIZE); for (p = buf, i = 0, pi = mbuf; i < num_sectors; i++, pi++, p += SECTOR_SIZE) stamp_pi_buffer(pi, pi_flags & GENERATE_GUARD ? 0 : crc_t10dif(0, p, SECTOR_SIZE), pi_flags & GENERATE_APP ? 0 : APP_TAG, pi_flags & GENERATE_REF ? 0 : (BDEV_OFFSET / SECTOR_SIZE) + i); io_prep_extension(iocb_ext); io_prep_extension_pi(iocb_ext, mbuf, pi_buflen, pi_flags); iov[0].iov_base = buf; iov[0].iov_len = page_size; iov[1].iov_base = buf + page_size; iov[1].iov_len = BUF_SIZE - page_size; iocbps[0] = iocbs; io_prep_pwritev(iocbs, fd, iov, NR_IOVS, BDEV_OFFSET); if (dix_write) io_prep_extensions(iocbs, iocb_ext, NR_IOCB_EXTS); fprintf(stderr, "Writing %llu bytes\n", BUF_SIZE); if (use_aio) { ret = io_submit(ioctx, 1, iocbps); if (ret < 0) { errno = -ret; perror("io_submit"); return 1; } ret = io_getevents(ioctx, 1, 1, events, NULL); if (ret < 0) { errno = -ret; perror("io_getevents"); return 1; } if ((signed)events[0].res < 0) { errno = -((signed)events[0].res); perror("io_pwritev"); return 1; } } else { ret = fcntl(fd, F_GETFL); ret = fcntl(fd, F_SETFL, ret | __FIOEXT); if (ret) { perror("fcntl set"); return 1; } fprintf(stderr, "flags=0x%x iocb=%p\n", fcntl(fd, F_GETFL), iocb_ext); /* ret = pwritev(fd, iov, NR_IOVS, BDEV_OFFSET); */ ret = syscall(SYS_pwritev, fd, iov, NR_IOVS, BDEV_OFFSET, dix_write ? iocb_ext : NULL); if (ret < 0) { errno = -ret; perror("pwritev"); return 1; } } fprintf(stderr, "Wrote %lu bytes\n", events[0].res); /* Read everything back in */ memset(buf2, 0x00, BUF_SIZE); memset(mbuf2, 0x00, pi_buflen); io_prep_extension(iocb_ext); io_prep_extension_pi(iocb_ext, mbuf2, pi_buflen, 0); iov[0].iov_base = buf2; iov[0].iov_len = page_size; iov[1].iov_base = buf2 + page_size; iov[1].iov_len = BUF_SIZE - page_size; iocbps[0] = iocbs; io_prep_preadv(iocbs, fd, iov, NR_IOVS, BDEV_OFFSET); if (dix_read) io_prep_extensions(iocbs, iocb_ext, NR_IOCB_EXTS); fprintf(stderr, "Reading %llu bytes\n", BUF_SIZE); if (use_aio) { ret = io_submit(ioctx, 1, iocbps); if (ret < 0) { errno = -ret; perror("io_submit"); return 1; } ret = io_getevents(ioctx, 1, 1, events, NULL); if (ret < 0) { errno = -ret; perror("io_getevents"); return 1; } if ((signed)events[0].res < 0) { errno = -((signed)events[0].res); perror("io_preadv"); return 1; } } else { ret = fcntl(fd, F_GETFL); ret = fcntl(fd, F_SETFL, ret | __FIOEXT); if (ret) { perror("fcntl set"); return 1; } fprintf(stderr, "flags=0x%x\n", fcntl(fd, F_GETFL)); /* ret = preadv(fd, iov, NR_IOVS, BDEV_OFFSET, iocb_ext); */ ret = syscall(SYS_preadv, fd, iov, NR_IOVS, BDEV_OFFSET, dix_read ? iocb_ext : NULL); if (ret < 0) { errno = -ret; perror("preadv"); return 1; } } fprintf(stderr, "Read %lu bytes\n", events[0].res); /* Compare */ ret = 0; if (memcmp(buf, buf2, BUF_SIZE)) { ret = 2; fprintf(stderr, "Buffers do not match!\n"); } if (dix_read && dix_write) { fprintf(stderr, "write pi\n"); dump_buffer(mbuf, pi_buflen); fprintf(stderr, "read pi\n"); dump_buffer(mbuf2, pi_buflen); if(memcmp(mbuf, mbuf2, pi_buflen)) { ret = 2; fprintf(stderr, "DIX buffers do not match!\n"); } } else fprintf(stderr, "Need to pass -rw to compare DIX buffers!\n"); if (io_queue_release(ioctx)) { perror("io_queue_release"); return 1; } close(fd); if (!ret) fprintf(stderr, "Success.\n"); return ret; } -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>