From: weixinwei <nkxiaowei@xxxxxxx>

Signed-off-by: Xinwei Wei <nkxiaowei@xxxxxxx>
---
 bcache.h |  86 ++++++++++++++++
 make.c   | 347 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 432 insertions(+), 1 deletion(-)

diff --git a/bcache.h b/bcache.h
index 61e4252..105979e 100644
--- a/bcache.h
+++ b/bcache.h
@@ -40,6 +40,92 @@ static const char bcache_magic[] = {
 #define BDEV_DATA_START_DEFAULT	16	/* sectors */
 #define SB_START		(SB_SECTOR * 512)
 
+
+#define ATA_OP_IDENTIFY		0xec
+#define ATA_OP_PIDENTIFY	0xa1
+
+/*
+ * Some useful ATA register bits
+ */
+enum {
+	ATA_USING_LBA		= (1 << 6),
+	ATA_STAT_DRQ		= (1 << 3),
+	ATA_STAT_ERR		= (1 << 0),
+};
+
+/*
+ * ATA PASS-THROUGH (16) CDB
+ */
+#define SG_ATA_16		0x85
+#define SG_ATA_16_LEN		16
+
+/*
+ * ATA Protocols
+ */
+#define SG_ATA_PROTO_PIO_IN	(4 << 1) /* PIO Data-in */
+
+enum {
+	/* No data is transferred */
+	SG_CDB2_TLEN_NODATA	= 0 << 0,
+	/* Transfer Length is found in the Feature field */
+	SG_CDB2_TLEN_FEAT	= 1 << 0,
+	/* Transfer Length is found in the Sector Count field */
+	SG_CDB2_TLEN_NSECT	= 2 << 0,
+
+	/* transfer units for Transfer Length are bytes */
+	SG_CDB2_TLEN_BYTES	= 0 << 2,
+	/* transfer units for Transfer Length are blocks */
+	SG_CDB2_TLEN_SECTORS	= 1 << 2,
+
+	/* data is transferred from the initiator to the target */
+	SG_CDB2_TDIR_TO_DEV	= 0 << 3,
+	/* indicate that data is transferred from the target to the initiator */
+	SG_CDB2_TDIR_FROM_DEV	= 1 << 3,
+
+	/* Check Condition */
+	SG_CDB2_CHECK_COND	= 1 << 5,
+};
+
+/*
+ * SCSI Architecture Model (SAM) Status codes. Taken from SAM-6
+ * T10/BSR INCITS 546 dated January 5, 2018.
+ */
+#define SAM_STAT_GOOD		0x00
+#define SG_CHECK_CONDITION	0x02
+#define SG_DRIVER_SENSE		0x08
+
+/*
+ * This is a slightly modified SCSI sense "descriptor" format header.
+ * The addition is to allow the 0x70 and 0x71 response codes. The idea
+ * is to place the salient data from either "fixed" or "descriptor" sense
+ * format into one structure to ease application processing.
+ *
+ * The original sense buffer should be kept around for those cases
+ * in which more information is required (e.g. the LBA of a MEDIUM ERROR).
+ */
+struct scsi_sense_hdr {		/* See SPC-3 section 4.5 */
+	uint8_t response_code;	/* permit: 0x0, 0x70, 0x71, 0x72, 0x73 */
+	uint8_t sense_key;
+	uint8_t asc;
+	uint8_t ascq;
+	uint8_t byte4;
+	uint8_t byte5;
+	uint8_t byte6;
+	uint8_t additional_length;	/* always 0 for fixed sense format */
+};
+
+/*
+ * SENSE KEYS
+ */
+
+#define SG_NO_SENSE		0x00
+#define SG_RECOVERED_ERROR	0x01
+
+/* NVME Admin commands */
+#define nvme_admin_identify	0x06
+
+#define NVME_IDENTIFY_DATA_SIZE	4096
+
 struct cache_sb {
 	uint64_t		csum;
 	uint64_t		offset;	/* sector where this sb was written */
diff --git a/make.c b/make.c
index e5e7464..3a7badb 100644
--- a/make.c
+++ b/make.c
@@ -31,10 +31,16 @@
 #include <sys/stat.h>
 #include <unistd.h>
 #include <uuid/uuid.h>
+#include <linux/hdreg.h>
+#include <asm/byteorder.h>
+#include <libgen.h>
 
 #include "bcache.h"
 #include "lib.h"
 
+#include <scsi/sg.h>
+#include <linux/nvme_ioctl.h>
+
 #define max(x, y) ({				\
 	typeof(x) _max1 = (x);			\
 	typeof(y) _max2 = (y);			\
@@ -179,6 +185,340 @@ const char * const cache_replacement_policies[] = {
 	NULL
 };
 
+/*
+ * Copy the response code, sense key, ASC and ASCQ (plus the additional
+ * length for descriptor format) from a raw SCSI sense buffer into *sshdr.
+ * Returns 0 on success, -1 if sense_buffer is NULL or not a 0x7x response.
+ */
+int scsi_normalize_sense(const uint8_t *sense_buffer,
+			 struct scsi_sense_hdr *sshdr)
+{
+	if (!sense_buffer)
+		goto err;
+
+	memset(sshdr, 0, sizeof(struct scsi_sense_hdr));
+
+	sshdr->response_code = (sense_buffer[0] & 0x7f);
+	if ((sshdr->response_code & 0x70) != 0x70)
+		goto err;
+
+	if (sshdr->response_code >= 0x72) {
+		/*
+		 * descriptor format
+		 */
+		sshdr->sense_key = (sense_buffer[1] & 0xf);
+		sshdr->asc = sense_buffer[2];
+		sshdr->ascq = sense_buffer[3];
+		sshdr->additional_length = sense_buffer[7];
+	} else {
+		/*
+		 * fixed format
+		 */
+		sshdr->sense_key = (sense_buffer[2] & 0xf);
+		sshdr->asc = sense_buffer[12];
+		sshdr->ascq = sense_buffer[13];
+	}
+
+	return 0;
+err:
+	return -1;
+}
+
+/*
+ * Issue the ATA command in args[0] (IDENTIFY or IDENTIFY PACKET) through an
+ * SG_IO ATA PASS-THROUGH (16) request; the 512 data bytes land at args + 4.
+ * Falls back to the legacy HDIO_DRIVE_CMD ioctl when SG_IO fails or reports
+ * an error. Returns 0 on success, otherwise the HDIO_DRIVE_CMD result.
+ */
+int query_identify(int fd, uint8_t *args)
+{
+#ifdef SG_IO
+	uint8_t cdb[SG_ATA_16_LEN] = { 0 };
+	uint8_t sensebuf[32] = { 0 }, *desc;
+	sg_io_hdr_t io_hdr = { 0 };
+	struct scsi_sense_hdr sshdr = { 0 };
+
+	/*
+	 * ATA PASS-THROUGH (16) CDB
+	 */
+	cdb[0] = SG_ATA_16;		/* OPERATION CODE (85h) */
+	cdb[1] = SG_ATA_PROTO_PIO_IN;	/* PIO Data-in */
+	/* no off.line or cc, read from dev,
+	 * block count in sector count field
+	 */
+	cdb[2] |= SG_CDB2_TLEN_NSECT;
+	cdb[2] |= SG_CDB2_TLEN_SECTORS;
+	cdb[2] |= SG_CDB2_TDIR_FROM_DEV;
+	cdb[13] = ATA_USING_LBA;	/* Device */
+	cdb[14] = args[0];		/* Command */
+
+	io_hdr.interface_id = 'S';
+	io_hdr.mx_sb_len = sizeof(sensebuf);
+	io_hdr.dxfer_direction = SG_DXFER_FROM_DEV;
+	io_hdr.dxfer_len = 512;
+	io_hdr.dxferp = args + 4;
+	io_hdr.cmdp = cdb;
+	io_hdr.cmd_len = SG_ATA_16_LEN;
+	io_hdr.sbp = sensebuf;
+	io_hdr.timeout = 15 * 1000;	/* msecs */
+
+	if (ioctl(fd, SG_IO, &io_hdr) == -1)
+		goto use_legacy_ioctl;
+
+	/* sense data available */
+	if (io_hdr.driver_status == SG_DRIVER_SENSE) {
+		desc = sensebuf + 8;
+		/* SG_DRIVER_SENSE is not an error */
+		io_hdr.driver_status &= ~SG_DRIVER_SENSE;
+		/* If we set cc then ATA pass-through will cause a
+		 * check condition even if no error. Filter that.
+		 */
+		if (io_hdr.status & SG_CHECK_CONDITION) {
+			scsi_normalize_sense(sensebuf, &sshdr);
+			if (sshdr.sense_key == SG_RECOVERED_ERROR &&
+			    sshdr.asc == 0 && sshdr.ascq == 0x1d)
+				io_hdr.status &= ~SG_CHECK_CONDITION;
+		}
+
+		/* return a few ATA registers */
+		if (sensebuf[0] == 0x72 &&	/* format is "descriptor" */
+		    desc[0] == 0x09) {		/* ATA Descriptor Return */
+			args[0] = desc[13];	/* Status */
+			args[1] = desc[3];	/* Error */
+			args[2] = desc[5];	/* Sector Count (0:7) */
+		}
+	}
+
+	if (io_hdr.status || io_hdr.host_status || io_hdr.driver_status)
+		goto use_legacy_ioctl;
+
+	return 0;
+
+use_legacy_ioctl:
+#endif
+	return ioctl(fd, HDIO_DRIVE_CMD, args);
+}
+
+/*
+ * Read the ATA IDENTIFY data of fd and report TRIM capabilities:
+ * *trim (TRIM supported), *trim_blocks (word 105 limit, only filled in when
+ * the DRAT bit is set) and *trim_rzat (reads return zeroes after TRIM).
+ * Returns 0 on success, -1 if both identify commands fail.
+ */
+int check_trim_supported(int fd,
+			 int *trim,
+			 int *trim_blocks,
+			 int *trim_rzat)
+{
+	*trim = *trim_blocks = *trim_rzat = 0;
+
+	uint8_t args[4 + 512] = { 0 };
+	uint16_t *identify;
+	int i;
+
+	args[0] = ATA_OP_IDENTIFY;
+	if (query_identify(fd, args)) {
+		memset(args, 0, sizeof(args));
+		args[0] = ATA_OP_PIDENTIFY;
+		if (query_identify(fd, args)) {
+			perror("HDIO_DRIVE_CMD(identify) failed");
+			goto err;
+		}
+	}
+
+	/* byte-swap the little-endian IDENTIFY data
+	 * to match byte-order on host CPU
+	 */
+	identify = (uint16_t *)(args + 4);
+	for (i = 0; i < (512 >> 1); ++i)
+		__le16_to_cpus(&identify[i]);
+
+	/* TRIM bit - Identify Device word 169 bit 0
+	 * DRAT bit - Identify Device word 69 bit 14
+	 * RZAT bit - Identify Device word 69 bit 5
+	 * Maximum number of 512-byte blocks per DATA SET MANAGEMENT command
+	 *          - Identify Device word 105
+	 *
+	 * If word 169 bit 0 is set to one and word 69 bit 14 is cleared to
+	 * zero, then the Trim function of the DATA SET MANAGEMENT command
+	 * (see 7.10.3.2) supports indeterminate read after trim behavior.
+	 * If word 169 bit 0 is set to one and word 69 bit 14 is set to one,
+	 * the Trim function of the DATA SET MANAGEMENT command supports
+	 * determinate read after trim behavior.
+	 * If word 169 bit 0 is cleared to zero,
+	 * then word 69 bit 14 is reserved.
+	 *
+	 * If word 69 bit 14 is set to one and word 69 bit 5 is set to one,
+	 * then a read operation after a Trim operation returns data from
+	 * trimmed LBAs as all words cleared to zero. If word 69 bit 14
+	 * is set to one and word 69 bit 5 is cleared to zero,
+	 * then a read operation after a Trim operation may have words
+	 * set to any value. If word 69 bit 14 is cleared to zero,
+	 * then word 69 bit 5 is reserved.
+	 *
+	 * See http://t13.org/Documents/UploadedDocuments/docs2009/e0
+	 * 9158r0-Trim_Clarifications.pdf for detail.
+	 */
+	const uint16_t trimd = 1 << 14;	/* deterministic read data after TRIM */
+	const uint16_t trimz = 1 << 5;	/* deterministic read ZEROs after TRIM */
+	if (identify[169] & 1 && identify[169] != 0xffff) {/* support TRIM ? */
+		*trim = 1;
+
+		if (identify[69] & trimd) {
+			if (identify[105] && identify[105] != 0xffff)
+				*trim_blocks = (int)identify[105];
+			if (identify[69] & trimz)
+				*trim_rzat = 1;
+		}
+	}
+
+	return 0;
+err:
+	return -1;
+}
+
+/*
+ * Send an NVMe Identify admin command with CDW10 = 1 and have the kernel
+ * store NVME_IDENTIFY_DATA_SIZE bytes of result in args. Returns the ioctl
+ * result, or -1 when NVME_IOCTL_ADMIN_CMD is not available at build time.
+ */
+int query_nvme_identify(int fd, uint8_t *args)
+{
+#ifdef NVME_IOCTL_ADMIN_CMD
+	struct nvme_passthru_cmd cmd = {
+		.opcode = nvme_admin_identify,
+		.nsid = 0,
+		.addr = (uint64_t)(uintptr_t)args,
+		.data_len = NVME_IDENTIFY_DATA_SIZE,
+		.cdw10 = 1,
+		.cdw11 = 0,
+	};
+
+	return ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd);
+#endif
+
+	return -1;
+}
+
+/*
+ * Set *trim from bit 2 of the 16-bit little-endian field at offset 0x208 of
+ * the NVMe identify data (ONCS, per the inline comment below).
+ * Returns 0 on success, -1 if the identify command fails.
+ */
+int check_nvme_trim_supported(int fd, int *trim)
+{
+	*trim = 0;
+
+	uint8_t args[NVME_IDENTIFY_DATA_SIZE] = { 0 };
+	uint16_t *oncs;
+
+	if (!query_nvme_identify(fd, args)) {
+		// offsetof(oncs, struct nvme_id_ctrl)
+		oncs = (uint16_t *)(args + 0x208);
+		__le16_to_cpus(&oncs[0]);
+		*trim = (oncs[0] & 0x4) >> 2;
+
+		return 0;
+	}
+
+	return -1;
+}
+
+/*
+ * Discard the whole block device behind fd with BLKDISCARD, after aligning
+ * the range to the sector size and clamping it to the device size.
+ * Errors are reported on stderr. Returns 0 on success, -1 on failure.
+ */
+int blkdiscard(int fd)
+{
+	uint64_t end, blksize, secsize, range[2];
+	int ssz = 0;	/* BLKSSZGET stores an int, not a 64-bit value */
+	struct stat sb;
+
+	range[0] = 0;
+	range[1] = ULLONG_MAX;
+
+	if (fstat(fd, &sb) == -1) {
+		perror("stat failed");
+		goto err;
+	}
+
+	if (!S_ISBLK(sb.st_mode)) {
+		fprintf(stderr, "is not a block device\n");
+		goto err;
+	}
+
+	if (ioctl(fd, BLKGETSIZE64, &blksize)) {
+		perror("BLKGETSIZE64 ioctl failed");
+		goto err;
+	}
+	if (ioctl(fd, BLKSSZGET, &ssz)) {
+		perror("BLKSSZGET ioctl failed");
+		goto err;
+	}
+	secsize = (uint64_t)ssz;
+
+	/* align range to the sector size */
+	range[0] = (range[0] + secsize - 1) & ~(secsize - 1);
+	range[1] &= ~(secsize - 1);
+
+	/* is the range end behind the end of the device ?*/
+	end = range[0] + range[1];
+	if (end < range[0] || end > blksize)
+		range[1] = blksize - range[0];
+
+	if (ioctl(fd, BLKDISCARD, &range)) {
+		perror("BLKDISCARD ioctl failed");
+		goto err;
+	}
+
+	return 0;
+err:
+	return -1;
+}
+
+/*
+ * Probe path/fd for TRIM support (NVMe identify for "nvme*" names, ATA
+ * identify with an NVMe fallback otherwise) and, if supported, discard the
+ * entire device. Progress and failures are printed.
+ */
+void trim_all_sectors(char *path, int fd)
+{
+	char *dev = basename(path);
+	int trim_supported = 0;
+	int trim_blocks = 0;
+	int trim_rzat = 0;
+	if (!strncmp(dev, "nvme", 4))
+		check_nvme_trim_supported(fd, &trim_supported);
+	else
+		if (check_trim_supported(fd,
+					 &trim_supported,
+					 &trim_blocks,
+					 &trim_rzat))
+			check_nvme_trim_supported(fd, &trim_supported);
+
+	if (trim_supported) {
+		printf("TRIM:\t\t\tData Set Management TRIM supported");
+		if (trim_blocks)
+			printf(" (limit %d block%s)",
+			       trim_blocks, trim_blocks > 1 ? "s" : "");
+		printf("\n");
+
+		if (trim_rzat) {
+			printf("RZAT:");
+			printf("\t\t\tDeterministic read ZEROs after TRIM\n");
+		}
+
+		printf("%s blkdiscard beginning...\n", path);
+		if (blkdiscard(fd))
+			/* blkdiscard() has already reported the cause */
+			fprintf(stderr, "%s blkdiscard failed\n", path);
+		else
+			printf("%s blkdiscard successfully\n", path);
+	} else
+		printf("%s Skipping blkdiscard\n", path);
+}
+
 static void write_sb(char *dev, unsigned int block_size,
 			unsigned int bucket_size,
 			bool writeback, bool discard, bool wipe_bcache,
@@ -354,6 +694,11 @@ static void write_sb(char *dev, unsigned int block_size,
 		       sb.nr_in_set,
 		       sb.nr_this_dev,
 		       sb.first_bucket);
+
+		/* Check whether the cache device supports TRIM;
+		 * if it does, discard all of its sectors.
+		 */
+		trim_all_sectors(dev, fd);
 		putchar('\n');
 	}
 
@@ -429,7 +774,7 @@ int make_bcache(int argc, char **argv)
 	unsigned int i, ncache_devices = 0, nbacking_devices = 0;
 	char *cache_devices[argc];
 	char *backing_devices[argc];
-	char label[SB_LABEL_SIZE];
+	char label[SB_LABEL_SIZE] = { 0 };
 	unsigned int block_size = 0, bucket_size = 1024;
 	int writeback = 0, discard = 0, wipe_bcache = 0, force = 0;
 	unsigned int cache_replacement_policy = 0;
-- 
2.21.0