This RFC provides a rough implementation of a mechanism to allow userspace to attach protection information (e.g. T10 DIF) data to a disk write and to receive the information alongside a disk read. The interface is an extension to the AIO interface: two new commands (IOCB_CMD_P{READ,WRITE}VM) are provided. The last struct iovec in the arg list is interpreted to point to a buffer containing a header, followed by the PI data. These patches are against 3.14-rc7. The first patch is a little bit of code refactoring, as sent in by Gu Zheng. It seems to be queued up for 3.15, so I figured I might as well start from there. Patch #2 provides the plumbing to get the user's buffer all the way to the block integrity code. I'm not quite sure if the approach I took (passing the results of get_user_pages around) actually works in all cases (such as the user's buffer being swapped out), but it survives a simple test. Due to the way that the code deals with the array of struct page*s that represent the PI buffer, there's an unfortunate requirement that no PI tuple may cross a page boundary. Given that so far DIF is only 8 or 16 bytes this isn't a problem... yet. There's also no explicit fallback for the case where the user pages are not within a device's DMA range. Patch #3 builds on the previous patch to allow userspace to send some flags along with the PI buffer. The integrity provider now has a "mod_user_buf_fn" hook that enables the provider to read the userspace flags and modify the PI buffer before submit_bio. For now, this means that the T10/DIF provider can be told to patch any of the reference, app, or guard tags. This is useful for sending PI data with an IO request for a file on a filesystem, since the kernel can patch in the device's LBA later. Also it means that if you only care about, say, app tags, you can provide those and let the kernel take care of the CRC and the LBA. I don't know if that's anyone's requirement, but there we are. 
Patch #4 provides a mechanism for integrity providers to advertise both the per-logical-block PI buffer size and the flags that can be passed to the mod_user_buf_fn hook. The advertisements can be found in sysfs, since that's where we present all the other PI details about a device. Patch #5 removes redundant code and modifies the tag get/set functions to follow the other new functions and kmap/unmap the PI buffer page(s) before messing with the PI buffers, instead of relying on pi_buf being a valid pointer. Comments and questions are, as always, welcome. There will be a session about this on the second day of LSF/MM, if I'm not mistaken. A sample program follows this message. $ cc -o prog prog.c $ ./prog -rw -p r -s 2048 /path/to/pi/device --D /* * Userspace DIX API test program * Licensed under GPLv2. Copyright 2014 Oracle. * * XXX: We don't query the kernel for this information like we should! */ #define _GNU_SOURCE #include <stdio.h> #include <libaio.h> #include <unistd.h> #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> #include <sys/uio.h> #include <errno.h> #include <stdlib.h> #include <stdint.h> #include <arpa/inet.h> #include <sys/ioctl.h> #include <linux/fs.h> #define IOCB_CMD_PREADVM (9) #define IOCB_CMD_PWRITEVM (10) #define GENERATE_GUARD (1) #define GENERATE_REF (2) #define GENERATE_APP (4) #define GENERATE_ALL (7) #define NR_IOS (1) static void dump_buffer(char *buf, size_t len) { size_t off; char *p; for (p = buf; p < buf + len; p++) { off = p - buf; if (off % 32 == 0) { if (p != buf) printf("\n"); printf("%05zu:", off); } printf(" %02x", *p & 0xFF); } printf("\n"); } /* Table generated using the following polynomium: * x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x + 1 * gt: 0x8bb7 */ static const uint16_t t10_dif_crc_table[256] = { 0x0000, 0x8BB7, 0x9CD9, 0x176E, 0xB205, 0x39B2, 0x2EDC, 0xA56B, 0xEFBD, 0x640A, 0x7364, 0xF8D3, 0x5DB8, 0xD60F, 0xC161, 0x4AD6, 0x54CD, 0xDF7A, 0xC814, 0x43A3, 0xE6C8, 0x6D7F, 0x7A11, 0xF1A6, 
0xBB70, 0x30C7, 0x27A9, 0xAC1E, 0x0975, 0x82C2, 0x95AC, 0x1E1B, 0xA99A, 0x222D, 0x3543, 0xBEF4, 0x1B9F, 0x9028, 0x8746, 0x0CF1, 0x4627, 0xCD90, 0xDAFE, 0x5149, 0xF422, 0x7F95, 0x68FB, 0xE34C, 0xFD57, 0x76E0, 0x618E, 0xEA39, 0x4F52, 0xC4E5, 0xD38B, 0x583C, 0x12EA, 0x995D, 0x8E33, 0x0584, 0xA0EF, 0x2B58, 0x3C36, 0xB781, 0xD883, 0x5334, 0x445A, 0xCFED, 0x6A86, 0xE131, 0xF65F, 0x7DE8, 0x373E, 0xBC89, 0xABE7, 0x2050, 0x853B, 0x0E8C, 0x19E2, 0x9255, 0x8C4E, 0x07F9, 0x1097, 0x9B20, 0x3E4B, 0xB5FC, 0xA292, 0x2925, 0x63F3, 0xE844, 0xFF2A, 0x749D, 0xD1F6, 0x5A41, 0x4D2F, 0xC698, 0x7119, 0xFAAE, 0xEDC0, 0x6677, 0xC31C, 0x48AB, 0x5FC5, 0xD472, 0x9EA4, 0x1513, 0x027D, 0x89CA, 0x2CA1, 0xA716, 0xB078, 0x3BCF, 0x25D4, 0xAE63, 0xB90D, 0x32BA, 0x97D1, 0x1C66, 0x0B08, 0x80BF, 0xCA69, 0x41DE, 0x56B0, 0xDD07, 0x786C, 0xF3DB, 0xE4B5, 0x6F02, 0x3AB1, 0xB106, 0xA668, 0x2DDF, 0x88B4, 0x0303, 0x146D, 0x9FDA, 0xD50C, 0x5EBB, 0x49D5, 0xC262, 0x6709, 0xECBE, 0xFBD0, 0x7067, 0x6E7C, 0xE5CB, 0xF2A5, 0x7912, 0xDC79, 0x57CE, 0x40A0, 0xCB17, 0x81C1, 0x0A76, 0x1D18, 0x96AF, 0x33C4, 0xB873, 0xAF1D, 0x24AA, 0x932B, 0x189C, 0x0FF2, 0x8445, 0x212E, 0xAA99, 0xBDF7, 0x3640, 0x7C96, 0xF721, 0xE04F, 0x6BF8, 0xCE93, 0x4524, 0x524A, 0xD9FD, 0xC7E6, 0x4C51, 0x5B3F, 0xD088, 0x75E3, 0xFE54, 0xE93A, 0x628D, 0x285B, 0xA3EC, 0xB482, 0x3F35, 0x9A5E, 0x11E9, 0x0687, 0x8D30, 0xE232, 0x6985, 0x7EEB, 0xF55C, 0x5037, 0xDB80, 0xCCEE, 0x4759, 0x0D8F, 0x8638, 0x9156, 0x1AE1, 0xBF8A, 0x343D, 0x2353, 0xA8E4, 0xB6FF, 0x3D48, 0x2A26, 0xA191, 0x04FA, 0x8F4D, 0x9823, 0x1394, 0x5942, 0xD2F5, 0xC59B, 0x4E2C, 0xEB47, 0x60F0, 0x779E, 0xFC29, 0x4BA8, 0xC01F, 0xD771, 0x5CC6, 0xF9AD, 0x721A, 0x6574, 0xEEC3, 0xA415, 0x2FA2, 0x38CC, 0xB37B, 0x1610, 0x9DA7, 0x8AC9, 0x017E, 0x1F65, 0x94D2, 0x83BC, 0x080B, 0xAD60, 0x26D7, 0x31B9, 0xBA0E, 0xF0D8, 0x7B6F, 0x6C01, 0xE7B6, 0x42DD, 0xC96A, 0xDE04, 0x55B3 }; uint16_t crc_t10dif(uint16_t crc, const unsigned char *buffer, uint32_t len) { unsigned int i; for (i = 0 ; i < len ; i++) crc = (crc << 8) ^ 
t10_dif_crc_table[((crc >> 8) ^ buffer[i]) & 0xff]; return crc; } struct sd_dif_tuple { uint16_t guard_tag; /* Checksum */ uint16_t app_tag; /* Opaque storage */ uint32_t ref_tag; /* Target LBA or indirect LBA */ }; static void stamp_pi_buffer(struct sd_dif_tuple *t, uint16_t csum, uint16_t tag, uint32_t sector) { t->guard_tag = htons(csum); t->app_tag = htons(tag); t->ref_tag = htonl(sector); } static void print_help(const char *progname) { printf("Usage: %s [OPTS] fname\n", progname); printf("-a Use this application tag\n"); printf("-d Do not use O_DIRECT\n"); printf("-o Read/write this many sectors into the device\n"); printf("-r Use DIX to read\n"); printf("-s Allocate buffer of this many sectors\n"); printf("-w Use DIX to write\n"); printf("-z Do not use O_SYNC\n"); } int main(int argc, char *argv[]) { struct sd_dif_tuple *pi; int page_size = sysconf(_SC_PAGESIZE); io_context_t ioctx; struct io_event events[NR_IOS]; struct iocb iocbs[NR_IOS]; struct iocb *iocbps[NR_IOS]; void *buf, *buf2; unsigned char *p; void *mbuf, *mbuf2; int ret, fd, i; struct iovec iov[3]; int opt; int dix_read = 0, dix_write = 0; unsigned int SECTOR_SIZE = 0; unsigned long long num_sectors = 8, BUF_SIZE; unsigned long long sector_offset = 256, BDEV_OFFSET; unsigned int APP_TAG = 0xEF53; unsigned int the_byte = 0x55; size_t pi_buflen; int o_direct = O_DIRECT; int o_sync = O_SYNC; uint32_t pi_flags = 0; while ((opt = getopt(argc, argv, "b:zdrws:o:a:p:")) != -1) { switch (opt) { case 'a': APP_TAG = strtoul(optarg, NULL, 0); break; case 'b': the_byte = strtoul(optarg, NULL, 0) & 0xFF; break; case 'd': o_direct = 0; break; case 'o': sector_offset = strtoull(optarg, NULL, 0); break; case 'p': for (i = 0; i < strlen(optarg); i++) switch (optarg[i]) { case 'a': pi_flags |= GENERATE_APP; break; case 'g': pi_flags |= GENERATE_GUARD; break; case 'r': pi_flags |= GENERATE_REF; break; default: print_help(argv[0]); return 2; } break; case 'r': dix_read = 1; break; case 's': num_sectors = 
strtoull(optarg, NULL, 0); break; case 'w': dix_write = 1; break; case 'z': o_sync = 0; break; default: print_help(argv[0]); return 0; } } if (optind >= argc) { print_help(argv[0]); return 0; } if (dix_read) fprintf(stderr, "Using DIX read.\n"); if (dix_write) fprintf(stderr, "Using DIX write.\n"); fd = open(argv[optind], o_direct | o_sync | O_RDWR); if (fd < 0) { perror(argv[optind]); return 1; } /* For now, don't let non-block devices in */ SECTOR_SIZE = 512; if (ioctl(fd, BLKSSZGET, &SECTOR_SIZE)) { perror(argv[optind]); } pi_buflen = (num_sectors + 1) * sizeof(struct sd_dif_tuple); BUF_SIZE = num_sectors * SECTOR_SIZE; BDEV_OFFSET = sector_offset * SECTOR_SIZE; fprintf(stderr, "sector=%d num_sectors=%llu pi_len=%zu pi_flag=0x%x\n", SECTOR_SIZE, num_sectors, pi_buflen, pi_flags); if (posix_memalign(&buf, page_size, BUF_SIZE) || posix_memalign(&buf2, page_size, BUF_SIZE) || posix_memalign(&mbuf, page_size, pi_buflen) || posix_memalign(&mbuf2, page_size, pi_buflen)) { perror("memalign"); return 1; } if (io_queue_init(2, &ioctx)) { perror("io_queue_init"); return 1; } /* Write everything out */ memcpy(mbuf, &pi_flags, sizeof(pi_flags)); memset(buf, the_byte, BUF_SIZE); for (p = buf, i = 0, pi = mbuf + sizeof(struct sd_dif_tuple); i < num_sectors; i++, pi++, p += SECTOR_SIZE) stamp_pi_buffer(pi, pi_flags & GENERATE_GUARD ? 0 : crc_t10dif(0, p, SECTOR_SIZE), pi_flags & GENERATE_APP ? 0 : APP_TAG, pi_flags & GENERATE_REF ? 0 : (BDEV_OFFSET / SECTOR_SIZE) + i); iov[0].iov_base = buf; iov[0].iov_len = page_size; iov[1].iov_base = buf + page_size; iov[1].iov_len = BUF_SIZE - page_size; iov[2].iov_base = mbuf; iov[2].iov_len = pi_buflen; iocbps[0] = iocbs; io_prep_pwritev(iocbs, fd, iov, (dix_write ? 
3 : 2), BDEV_OFFSET); if (dix_write) iocbs[0].aio_lio_opcode = IOCB_CMD_PWRITEVM; fprintf(stderr, "Writing %llu bytes\n", BUF_SIZE); ret = io_submit(ioctx, 1, iocbps); if (ret < 0) { errno = -ret; perror("io_submit"); return 1; } ret = io_getevents(ioctx, 1, 1, events, NULL); if (ret < 0) { errno = -ret; perror("io_getevents"); return 1; } if ((signed)events[0].res < 0) { errno = -((signed)events[0].res); perror("io_pwritev"); return 1; } fprintf(stderr, "Wrote %lu bytes\n", events[0].res); /* Read everything back in */ memset(buf2, 0x00, BUF_SIZE); memset(mbuf2, 0x00, pi_buflen); memcpy(mbuf2, &pi_flags, sizeof(pi_flags)); iov[0].iov_base = buf2; iov[0].iov_len = page_size; iov[1].iov_base = buf2 + page_size; iov[1].iov_len = BUF_SIZE - page_size; iov[2].iov_base = mbuf2; iov[2].iov_len = pi_buflen; iocbps[0] = iocbs; io_prep_preadv(iocbs, fd, iov, (dix_read ? 3 : 2), BDEV_OFFSET); if (dix_read) iocbs[0].aio_lio_opcode = IOCB_CMD_PREADVM; fprintf(stderr, "Reading %llu bytes\n", BUF_SIZE); ret = io_submit(ioctx, 1, iocbps); if (ret < 0) { errno = -ret; perror("io_submit"); return 1; } ret = io_getevents(ioctx, 1, 1, events, NULL); if (ret < 0) { errno = -ret; perror("io_getevents"); return 1; } if ((signed)events[0].res < 0) { errno = -((signed)events[0].res); perror("io_preadv"); return 1; } fprintf(stderr, "Read %lu bytes\n", events[0].res); /* Compare */ ret = 0; if (memcmp(buf, buf2, BUF_SIZE)) { ret = 2; fprintf(stderr, "Buffers do not match!\n"); } if (dix_read && dix_write) { fprintf(stderr, "write pi\n"); dump_buffer(mbuf, pi_buflen); fprintf(stderr, "read pi\n"); dump_buffer(mbuf2, pi_buflen); if(memcmp(mbuf, mbuf2, pi_buflen)) { ret = 2; fprintf(stderr, "DIX buffers do not match!\n"); } } else fprintf(stderr, "Need to pass -rw to compare DIX buffers!\n"); if (io_queue_release(ioctx)) { perror("io_queue_release"); return 1; } close(fd); if (!ret) fprintf(stderr, "Success.\n"); return ret; } -- To unsubscribe, send a message with 'unsubscribe linux-mm' in 
the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>