On 6/3/22 6:46 AM, Christoph Hellwig wrote: > From a highlevel POV this looks good. I'll do a more detail review in > the next days once I find a little time. Any chance you could also > wire up the new ops for nvme so that we know the abtractions work right > even for exporting nvme as SCSI? No need to wire up nvmet for now, but > that would be the logical next step. I forgot to cc nvme. Doing that now. For the nvme list developers here the patchset discussed in this thread: https://lore.kernel.org/linux-scsi/20220603114645.GA14309@xxxxxx/T/#t For the patchset in this thread I only did the simple SCSI commands READ_KEYS and READ_RESERVATION. For nvme-only people, those commands just return the registered keys and the reservation info like the key and type. There is no controller/host ID type of info like in nvme's report reservation command. I did the basic commands because the apps I need to support don't use the more advanced SCSI command READ_FULL_STATUS which returns the equivalent of nvme's controller and host ID. I also hit issues with SCSI targets not implementing the command correctly. For example, the linux scsi target just got fixed last year and it only works by accident in some cases where we have 2 bugs that cancel each other out :) However, for nvme and for the interface we want to provide to userspace, do we want to implement an interface like READ_FULL_STATUS and report reservations where we report the host/controller/port info? If so, below is a patch I started. Notes: 1. I hit some issues with SCSI targets not reporting the IDs sometimes or sometimes they report it incorrectly. For nvme, it seems easier. SCSI has to handle a hand full of ways to report the ID where nvme has 2 ways to do the host ID. 2. I couldn't find a nvme device to test. Qemu and nvmet don't seem to support reservations. 3. The patch is only compile tested. It's based on a different patch I did. I'm just posting because you can see the pr_reservations_info data struct we could use for nvme and scsi if we want to report the ID info and keys/registrations in one command. diff --git a/drivers/md/dm.c b/drivers/md/dm.c index ca544dfc3210..161a715ab70a 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -3156,6 +3156,28 @@ static int dm_pr_read_reservation(struct block_device *bdev, return r; } +static int dm_pr_report_reservation(struct block_device *bdev, + struct pr_reservation_info *rsv_info, + int rsv_regs_len) +{ + struct mapped_device *md = bdev->bd_disk->private_data; + const struct pr_ops *ops; + int r, srcu_idx; + + r = dm_prepare_ioctl(md, &srcu_idx, &bdev); + if (r < 0) + goto out; + + ops = bdev->bd_disk->fops->pr_ops; + if (ops && ops->pr_report_reservation) + r = ops->pr_report_reservation(bdev, rsv_info, rsv_regs_len); + else + r = -EOPNOTSUPP; +out: + dm_unprepare_ioctl(md, srcu_idx); + return r; +} + static const struct pr_ops dm_pr_ops = { .pr_register = dm_pr_register, .pr_reserve = dm_pr_reserve, @@ -3163,7 +3185,8 @@ static const struct pr_ops dm_pr_ops = { .pr_preempt = dm_pr_preempt, .pr_clear = dm_pr_clear, .pr_read_keys = dm_pr_read_keys, - .pr_read_reservation = dm_pr_read_reservation, + .pr_read_reservation = dm_pr_read_reservation, + .pr_report_reservation = dm_pr_report_reservation, }; static const struct block_device_operations dm_blk_dops = { diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 71283aaf3e82..1f251b8f381d 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1770,6 +1770,101 @@ static int sd_pr_read_reservation(struct block_device *bdev, return 0; } +static int sd_pr_read_full_status(struct block_device *bdev, + struct pr_reservation_info *rsv_info, + int rsv_regs_len) +{ + int len, full_data_off, i, result, num_regs; + struct pr_registration_info *reg_info; + struct pr_keys keys_info; + u8 *full_data; + +retry: + result = sd_pr_read_keys(bdev, &keys_info, 0); + if (result) + return result; + + if (!keys_info.num_keys) + return 0; + + len = keys_info.num_keys * sizeof(*reg_info); + if (len >= rsv_regs_len) + return -EOVERFLOW; + len += 8; + + full_data = kzalloc(len, GFP_KERNEL); + if (!full_data) + return -ENOMEM; + + result = sd_pr_in_command(bdev, READ_FULL_STATUS, full_data, len); + if (result) { + /* + * TODO? - If it's not supported do we want to drop down + * to READ_KEYS/RESERVATION and just not fill out the port + * and transport ID info. + */ + return result; + } + + num_regs = get_unaligned_be32(&full_data[4]) / 8; + /* + * We can have fewer registrations if the target did the ALL_TG_PT + * format where it does not report every I_T nexus. If we now have + * more registrations then someone is doing PR out commands so try + * to get a bigger buffer. + */ + if (keys_info.num_keys < num_regs) { + kfree(full_data); + goto retry; + } + + rsv_info->generation = get_unaligned_be32(&full_data[0]); + rsv_info->num_registrations = num_regs; + + full_data_off = 8; + + for (i = 0; i < num_regs; i++) { + /* every reg should have the same type? */ + rsv_info->type = scsi_pr_type_to_block(full_data[13] & 0x0f); + + reg_info = &rsv_info->registrations[i]; + reg_info->key = get_unaligned_be64(&full_data[0]); + + if (full_data[12] & 0x01) + reg_info->flags |= PR_REG_INFO_FL_HOLDER; + if (full_data[12] & 0x02) + reg_info->flags |= PR_REG_INFO_FL_ALL_TG_PT; + + /* We use SCSI rel target port id for remote_id */ + reg_info->remote_id = get_unaligned_be16(&full_data[18]); + + /* We use SCSI transport ID as the local_id */ + reg_info->local_id_len = get_unaligned_be32(&full_data[20]); + if (!reg_info->local_id_len) + continue; + + /* + * TODO. Do we fail or operate like the SCSI spec and return + * what we have and user should know that they messed up + * and need to send a bigger buffer to get the rest of the + * data; + */ + full_data_off += 24; + if (full_data_off + reg_info->local_id_len >= rsv_regs_len) + return -EOVERFLOW; + /* + * TODO - we should put this in a easier to use format for + * users. + */ + memcpy(reg_info->local_id, &full_data[full_data_off], + reg_info->local_id_len); + full_data_off += reg_info->local_id_len; + } + + kfree(full_data); + return 0; +} + static int sd_pr_out_command(struct block_device *bdev, u8 sa, u64 key, u64 sa_key, u8 type, u8 flags) { @@ -1845,7 +1940,8 @@ static const struct pr_ops sd_pr_ops = { .pr_preempt = sd_pr_preempt, .pr_clear = sd_pr_clear, .pr_read_keys = sd_pr_read_keys, - .pr_read_reservation = sd_pr_read_reservation, + .pr_read_reservation = sd_pr_read_reservation, + .pr_report_reservation = sd_pr_read_full_status, }; static void scsi_disk_free_disk(struct gendisk *disk) diff --git a/include/linux/pr.h b/include/linux/pr.h index 21a8eb8b34b5..ec325a0ed33c 100644 --- a/include/linux/pr.h +++ b/include/linux/pr.h @@ -30,6 +30,9 @@ struct pr_ops { struct pr_keys *keys_info, int keys_info_len); int (*pr_read_reservation)(struct block_device *bdev, struct pr_held_reservation *rsv); + int (*pr_report_reservation)(struct block_device *bdev, + struct pr_reservation_info *rsv_info, + int rsv_regs_len); }; #endif /* LINUX_PR_H */ diff --git a/include/uapi/linux/pr.h b/include/uapi/linux/pr.h index ccc78cbf1221..66028d37ac5d 100644 --- a/include/uapi/linux/pr.h +++ b/include/uapi/linux/pr.h @@ -39,6 +39,35 @@ struct pr_clear { __u32 __pad; }; +/* Is reservation holder */ +#define PR_REG_INFO_FL_HOLDER (1 << 0) +/* + * Registration applies to all SCSI target ports accesed through initiator port + * at local_id. remote_id is not set in this case. + */ +#define PR_REG_INFO_FL_ALL_TG_PT (1 << 1) + +struct pr_registration_info { + __u64 key; + __u8 flags; + __u8 __pad; + /* NVMe Controler ID or SCSI relative target port id */ + __u16 remote_id; + __u32 __pad2; + /* local_id size in bytes. */ + __u32 local_id_len; + /* NVMe Host ID or SCSI Transport ID */ + char local_id[]; +}; + +struct pr_reservation_info { + __u32 generation; + __u16 num_registrations; + __u8 type; + __u8 __pad; + struct pr_registration_info registrations[]; +}; + #define PR_FL_IGNORE_KEY (1 << 0) /* ignore existing key */ #define IOC_PR_REGISTER _IOW('p', 200, struct pr_registration) -- dm-devel mailing list dm-devel@xxxxxxxxxx https://listman.redhat.com/mailman/listinfo/dm-devel