> -----Original Message----- > From: KY Srinivasan > Sent: Saturday, March 4, 2017 1:40 PM > To: 'James Bottomley' <jejb@xxxxxxxxxxxxxxxxxx>; Stephen Hemminger > <stephen@xxxxxxxxxxxxxxxxxx> > Cc: Hannes Reinecke <hare@xxxxxxx>; Christoph Hellwig <hch@xxxxxx>; Jens > Axboe <axboe@xxxxxxxxx>; Linus Torvalds <torvalds@linux- > foundation.org>; Martin K. Petersen <martin.petersen@xxxxxxxxxx>; > Dexuan Cui <decui@xxxxxxxxxxxxx>; Long Li <longli@xxxxxxxxxxxxx>; Josh > Poulson <jopoulso@xxxxxxxxxxxxx>; Adrian Suhov (Cloudbase Solutions SRL) > <v-adsuho@xxxxxxxxxxxxx>; linux-scsi@xxxxxxxxxxxxxxx; Haiyang Zhang > <haiyangz@xxxxxxxxxxxxx> > Subject: RE: [RFC] hv_storvsc: error handling. > > > > > -----Original Message----- > > From: James Bottomley [mailto:jejb@xxxxxxxxxxxxxxxxxx] > > Sent: Saturday, March 4, 2017 1:37 PM > > To: KY Srinivasan <kys@xxxxxxxxxxxxx>; Stephen Hemminger > > <stephen@xxxxxxxxxxxxxxxxxx> > > Cc: Hannes Reinecke <hare@xxxxxxx>; Christoph Hellwig <hch@xxxxxx>; > Jens > > Axboe <axboe@xxxxxxxxx>; Linus Torvalds <torvalds@linux- > > foundation.org>; Martin K. Petersen <martin.petersen@xxxxxxxxxx>; > > Dexuan Cui <decui@xxxxxxxxxxxxx>; Long Li <longli@xxxxxxxxxxxxx>; Josh > > Poulson <jopoulso@xxxxxxxxxxxxx>; Adrian Suhov (Cloudbase Solutions > SRL) > > <v-adsuho@xxxxxxxxxxxxx>; linux-scsi@xxxxxxxxxxxxxxx; Haiyang Zhang > > <haiyangz@xxxxxxxxxxxxx> > > Subject: Re: [RFC] hv_storvsc: error handling. > > > > On Sat, 2017-03-04 at 21:03 +0000, KY Srinivasan wrote: > > > > > > > -----Original Message----- > > > > From: Stephen Hemminger [mailto:stephen@xxxxxxxxxxxxxxxxxx] > > > > Sent: Friday, March 3, 2017 4:50 PM > > > > To: James Bottomley <James.Bottomley@xxxxxxxxxxxxxxxxxxxxx> > > > > Cc: Hannes Reinecke <hare@xxxxxxx>; Christoph Hellwig > <hch@xxxxxx>; > > > > James Bottomley <jejb@xxxxxxxxxxxxxxxxxx>; Jens Axboe > > > > <axboe@xxxxxxxxx>; Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>; > > > > Martin K. 
Petersen <martin.petersen@xxxxxxxxxx>; KY Srinivasan > > > > <kys@xxxxxxxxxxxxx>; Dexuan Cui <decui@xxxxxxxxxxxxx>; Long Li > > > > <longli@xxxxxxxxxxxxx>; Josh Poulson <jopoulso@xxxxxxxxxxxxx>; > > > > Adrian > > > > Suhov (Cloudbase Solutions SRL) <v-adsuho@xxxxxxxxxxxxx>; linux- > > > > scsi@xxxxxxxxxxxxxxx; Haiyang Zhang <haiyangz@xxxxxxxxxxxxx> > > > > Subject: [RFC] hv_storvsc: error handling. > > > > > > > > Needs more testing but this does fix the observed problem. > > > > > > > > From: Stephen Hemminger <sthemmin@xxxxxxxxxxxxx> > > > > > > > > Subject: [PATCH] hv_storvsc: fix error handling > > > > > > > > The Hyper-V storvsc SCSI driver was hiding all errors in INQUIRY > > > > and > > > > MODE_SENSE commands. This caused the scan process to incorrectly > > > > think > > > > devices were present and online. Also invalid LUN errors were not > > > > being handled correctly. > > > > > > > > This fixes problems booting a GEN2 VM on Hyper-V. It effectively > > > > reverts commit 4ed51a21c0f69 ("Staging: hv: storvsc: Fixup > > > > srb and scsi status for INQUIRY and MODE_SENSE") > > > > > > > > Signed-off-by: Stephen Hemminger <sthemmin@xxxxxxxxxxxxx> > > > > --- > > > > drivers/scsi/storvsc_drv.c | 48 ++++------------------------------ > > > > ------------ > > > > 1 file changed, 4 insertions(+), 44 deletions(-) > > > > > > > > diff --git a/drivers/scsi/storvsc_drv.c > > > > b/drivers/scsi/storvsc_drv.c > > > > index 638e5f427c90..8cc241fc54b8 100644 > > > > --- a/drivers/scsi/storvsc_drv.c > > > > +++ b/drivers/scsi/storvsc_drv.c > > > > @@ -543,28 +543,6 @@ static void storvsc_host_scan(struct > > > > work_struct > > > > *work) > > > > kfree(wrk); > > > > } > > > > > > > > -static void storvsc_remove_lun(struct work_struct *work) > > > > -{ > > > > - struct storvsc_scan_work *wrk; > > > > - struct scsi_device *sdev; > > > > - > > > > - wrk = container_of(work, struct storvsc_scan_work, work); > > > > - if (!scsi_host_get(wrk->host)) > > > > - goto done; > > > 
> - > > > > - sdev = scsi_device_lookup(wrk->host, 0, wrk->tgt_id, wrk > > > > ->lun); > > > > - > > > > - if (sdev) { > > > > - scsi_remove_device(sdev); > > > > - scsi_device_put(sdev); > > > > - } > > > > - scsi_host_put(wrk->host); > > > > - > > > > -done: > > > > - kfree(wrk); > > > > -} > > > > - > > > > - > > > > /* > > > > * We can get incoming messages from the host that are not in > > > > response to > > > > * messages that we have sent out. An example of this would be > > > > messages > > > > @@ -955,8 +933,7 @@ static void storvsc_handle_error(struct > > > > vmscsi_request *vm_srb, > > > > } > > > > break; > > > > case SRB_STATUS_INVALID_LUN: > > > > - do_work = true; > > > > - process_err_fn = storvsc_remove_lun; > > > > + set_host_byte(scmnd, DID_NO_CONNECT); > > > > break; > > > > case SRB_STATUS_ABORTED: > > > > if (vm_srb->srb_status & > > > > SRB_STATUS_AUTOSENSE_VALID > > > > && > > > > @@ -1050,32 +1027,15 @@ static void storvsc_on_io_completion(struct > > > > storvsc_device *stor_device, > > > > > > > > stor_pkt = &request->vstor_packet; > > > > > > > > - /* > > > > - * The current SCSI handling on the host side does > > > > - * not correctly handle: > > > > - * INQUIRY command with page code parameter set to 0x80 > > > > - * MODE_SENSE command with cmd[2] == 0x1c > > > > - * > > > > - * Setup srb and scsi status so this won't be fatal. > > > > - * We do this so we can distinguish truly fatal failues > > > > - * (srb status == 0x4) and off-line the device in that > > > > case. 
> > > > - */ > > > > - > > > > - if ((stor_pkt->vm_srb.cdb[0] == INQUIRY) || > > > > - (stor_pkt->vm_srb.cdb[0] == MODE_SENSE)) { > > > > - vstor_packet->vm_srb.scsi_status = 0; > > > > - vstor_packet->vm_srb.srb_status = > > > > SRB_STATUS_SUCCESS; > > > > - } > > > > - > > > > - > > > > /* Copy over the status...etc */ > > > > stor_pkt->vm_srb.scsi_status = vstor_packet > > > > ->vm_srb.scsi_status; > > > > stor_pkt->vm_srb.srb_status = vstor_packet > > > > ->vm_srb.srb_status; > > > > stor_pkt->vm_srb.sense_info_length = > > > > vstor_packet->vm_srb.sense_info_length; > > > > > > > > - if (vstor_packet->vm_srb.scsi_status != 0 || > > > > - vstor_packet->vm_srb.srb_status != SRB_STATUS_SUCCESS) > > > > + if (stor_pkt->vm_srb.cdb[0] != INQUIRY && > > > > + (vstor_packet->vm_srb.scsi_status != 0 || > > > > + vstor_packet->vm_srb.srb_status != > > > > SRB_STATUS_SUCCESS)) > > > > storvsc_log(device, STORVSC_LOGGING_WARN, > > > > "cmd 0x%x scsi status 0x%x srb status > > > > 0x%x\n", > > > > stor_pkt->vm_srb.cdb[0], > > > > -- > > > > > > This patch gets rid of the ability to "hot remove" LUNs. I don't > > > think that can be part of any > > > solution. The INQUIRY hack I put in a long time ago was to deal with > > > host bugs on prior versions of > > > Windows server. WS2016 should not be triggering this code. Stephen, > > > could you please test this patch - > > > a quick hack: > > > > > > From b97f24f224a71a6e745c42e5640045a553eb407c Mon Sep 17 00:00:00 > > > 2001 > > > From: K. Y. Srinivasan <kys@xxxxxxxxxxxxx> > > > Date: Sat, 4 Mar 2017 14:00:46 -0700 > > > Subject: [PATCH 1/1] scsi: storvsc: Fix a bug in LUN removal code > > > Reply-To: kys@xxxxxxxxxxxxx > > > > > > Signed-off-by: K. Y. 
Srinivasan <kys@xxxxxxxxxxxxx> > > > --- > > > drivers/scsi/storvsc_drv.c | 13 +++++++++++++ > > > 1 files changed, 13 insertions(+), 0 deletions(-) > > > > > > diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c > > > index 05526b7..27eb682 100644 > > > --- a/drivers/scsi/storvsc_drv.c > > > +++ b/drivers/scsi/storvsc_drv.c > > > @@ -885,6 +885,7 @@ static void storvsc_handle_error(struct > > > vmscsi_request *vm_srb, > > > struct storvsc_scan_work *wrk; > > > void (*process_err_fn)(struct work_struct *work); > > > bool do_work = false; > > > + struct scsi_device *sdev; > > > > > > switch (SRB_STATUS(vm_srb->srb_status)) { > > > case SRB_STATUS_ERROR: > > > @@ -911,6 +912,18 @@ static void storvsc_handle_error(struct > > > vmscsi_request *vm_srb, > > > } > > > break; > > > case SRB_STATUS_INVALID_LUN: > > > + if (!scsi_host_get(host)) { > > > + set_host_byte(scmnd, DID_NO_CONNECT); > > > + break; > > > + } > > > + > > > + sdev = scsi_device_lookup(wrk->host, 0, wrk->tgt_id, > > > wrk->lun); > > > + > > > + if (!sdev) { > > > + set_host_byte(scmnd, DID_NO_CONNECT); > > > + break; > > > + } > > > + > > > > You're now getting two references (one to the host and one to the > > device) that you don't put either in error handling or in > > storvsc_remove_lun(). > > > > Probably you should eliminate the scsi_host_get and scsi_device_lookup > > in storvsc_remove_lun() (making it basically remove device put device > > put host) and add a host put in the !sdev if above. > > Yes; this was a quick hack to prove a theory. This should work (I hope); I will > clean up and resend once > Stephen verifies. Stephen, please test this patch. I have addressed the ref count issue that James pointed out. From 6d57fb68c40165e61d84b6766b2263747d5e241d Mon Sep 17 00:00:00 2001 From: K. Y. Srinivasan <kys@xxxxxxxxxxxxx> Date: Sat, 4 Mar 2017 15:57:26 -0700 Subject: [PATCH 1/1] scsi: storvsc: Fix a bug in LUN removal code Reply-To: kys@xxxxxxxxxxxxx Signed-off-by: K. Y. 
Srinivasan <kys@xxxxxxxxxxxxx> --- drivers/scsi/storvsc_drv.c | 33 +++++++++++++++++++++++---------- 1 files changed, 23 insertions(+), 10 deletions(-) diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index 05526b7..2bd132d 100644 --- a/drivers/scsi/storvsc_drv.c +++ b/drivers/scsi/storvsc_drv.c @@ -478,6 +478,7 @@ struct hv_host_device { struct storvsc_scan_work { struct work_struct work; struct Scsi_Host *host; + struct scsi_device *sdev; u8 lun; u8 tgt_id; }; @@ -531,24 +532,20 @@ static void storvsc_host_scan(struct work_struct *work) kfree(wrk); } +/* + * On Entry we have a reference on both the host and the device. + * Drop them here. + */ static void storvsc_remove_lun(struct work_struct *work) { struct storvsc_scan_work *wrk; - struct scsi_device *sdev; wrk = container_of(work, struct storvsc_scan_work, work); - if (!scsi_host_get(wrk->host)) - goto done; - - sdev = scsi_device_lookup(wrk->host, 0, wrk->tgt_id, wrk->lun); - if (sdev) { - scsi_remove_device(sdev); - scsi_device_put(sdev); - } + scsi_remove_device(wrk->sdev); + scsi_device_put(wrk->sdev); scsi_host_put(wrk->host); -done: kfree(wrk); } @@ -885,6 +882,7 @@ static void storvsc_handle_error(struct vmscsi_request *vm_srb, struct storvsc_scan_work *wrk; void (*process_err_fn)(struct work_struct *work); bool do_work = false; + struct scsi_device *sdev; switch (SRB_STATUS(vm_srb->srb_status)) { case SRB_STATUS_ERROR: @@ -911,9 +909,23 @@ static void storvsc_handle_error(struct vmscsi_request *vm_srb, } break; case SRB_STATUS_INVALID_LUN: + if (!scsi_host_get(host)) + goto host_not_known; + + sdev = scsi_device_lookup(wrk->host, 0, wrk->tgt_id, wrk->lun); + + if (!sdev) + goto lun_not_known; + do_work = true; process_err_fn = storvsc_remove_lun; break; + +lun_not_known: + scsi_host_put(host); +host_not_known: + set_host_byte(scmnd, DID_NO_CONNECT); + break; case SRB_STATUS_ABORTED: if (vm_srb->srb_status & SRB_STATUS_AUTOSENSE_VALID && (asc == 0x2a) && (ascq == 0x9)) { @@ -939,6 
+951,7 @@ static void storvsc_handle_error(struct vmscsi_request *vm_srb, return; } + wrk->sdev = sdev; wrk->host = host; wrk->lun = vm_srb->lun; wrk->tgt_id = vm_srb->target_id; -- 1.7.1 > > K. Y > > > > James