Randy wrote: > -----Original Message----- > From: Randy Dunlap [mailto:randy.dunlap@xxxxxxxxxx] > Sent: Tuesday, February 10, 2009 4:55 PM > To: Miller, Mike (OS Dev) > Cc: Andrew Morton; Jens Axboe; LKML; LKML-scsi > Subject: Re: [PATCH 1/1] cciss: PCI power management reset for kexec > > Mike Miller wrote: > > Patch 1 of 1 > > > > This patch provides the better "kick-in-the-pants" on > driver load in a > > kexec'ed environment. > > > > I've successfully sanity tested the port in my lab. Randy, please > > apply and test. You seem to be able to bring out the worst in the > > driver. ;-) > > Hi Mike, > > I've booted this successfully (new kernel thru kexec) 4-5 > times successfully, which doesn't prove a whole lot since the > failure is intermittent. > Anyway, I'll continue to apply this patch in my daily kernel > testing... > > Thanks. Thanks for the update, Randy. > > > Author: Chip Coldwell <coldwell@xxxxxxxxxx> > > > > CCISS: Use PCI power management to reset the controller > > > > The kexec kernel resets the CCISS hardware in three steps: > > > > 1. Use PCI power management states to reset the controller > > in the kexec kernel. > > 2. Clear the MSI/MSI-X bits in PCI configuration space so > > that MSI initialization in the kexec kernel doesn't fail. > > 3. Use the CCISS "No-op" message to determine when the > > controller firmware has recovered from the PCI PM reset. > > > > Signed-off-by: Mike Miller <mike.miller@xxxxxx> > > > > > ---------------------------------------------------------------------- > > --------- diff --git a/drivers/block/cciss.c > b/drivers/block/cciss.c > > index 01e6938..ff4a105 100644 > > --- a/drivers/block/cciss.c > > +++ b/drivers/block/cciss.c > > @@ -3390,6 +3390,205 @@ static void free_hba(int i) > > kfree(p); > > } > > > > +/* Send a message CDB to the firmware. */ static __devinit int > > +cciss_message(struct pci_dev *pdev, unsigned char opcode, unsigned > > +char type) { > > + typedef struct { > > + CommandListHeader_struct CommandHeader; > > + RequestBlock_struct Request; > > + ErrDescriptor_struct ErrorDescriptor; > > + } Command; > > + static const size_t cmd_sz = sizeof(Command) + > sizeof(ErrorInfo_struct); > > + Command *cmd; > > + dma_addr_t paddr64; > > + uint32_t paddr32, tag; > > + void __iomem *vaddr; > > + int i, err; > > + > > + vaddr = ioremap_nocache(pci_resource_start(pdev, 0), > pci_resource_len(pdev, 0)); > > + if (vaddr == NULL) > > + return -ENOMEM; > > + > > + /* The Inbound Post Queue only accepts 32-bit physical > addresses for the > > + CCISS commands, so they must be allocated from the > lower 4GiB of > > + memory. */ > > + err = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK); > > + if (err) { > > + iounmap(vaddr); > > + return -ENOMEM; > > + } > > + > > + cmd = pci_alloc_consistent(pdev, cmd_sz, &paddr64); > > + if (cmd == NULL) { > > + iounmap(vaddr); > > + return -ENOMEM; > > + } > > + > > + /* This must fit, because of the 32-bit consistent DMA > mask. Also, > > + although there's no guarantee, we assume that the > address is at > > + least 4-byte aligned (most likely, it's page-aligned). */ > > + paddr32 = paddr64; > > + > > + cmd->CommandHeader.ReplyQueue = 0; > > + cmd->CommandHeader.SGList = 0; > > + cmd->CommandHeader.SGTotal = 0; > > + cmd->CommandHeader.Tag.lower = paddr32; > > + cmd->CommandHeader.Tag.upper = 0; > > + memset(&cmd->CommandHeader.LUN.LunAddrBytes, 0, 8); > > + > > + cmd->Request.CDBLen = 16; > > + cmd->Request.Type.Type = TYPE_MSG; > > + cmd->Request.Type.Attribute = ATTR_HEADOFQUEUE; > > + cmd->Request.Type.Direction = XFER_NONE; > > + cmd->Request.Timeout = 0; /* Don't time out */ > > + cmd->Request.CDB[0] = opcode; > > + cmd->Request.CDB[1] = type; > > + memset(&cmd->Request.CDB[2], 0, 14); /* the rest of the CDB is > > +reserved */ > > + > > + cmd->ErrorDescriptor.Addr.lower = paddr32 + sizeof(Command); > > + cmd->ErrorDescriptor.Addr.upper = 0; > > + cmd->ErrorDescriptor.Len = sizeof(ErrorInfo_struct); > > + > > + writel(paddr32, vaddr + SA5_REQUEST_PORT_OFFSET); > > + > > + for (i = 0; i < 10; i++) { > > + tag = readl(vaddr + SA5_REPLY_PORT_OFFSET); > > + if ((tag & ~3) == paddr32) > > + break; > > + schedule_timeout_uninterruptible(HZ); > > + } > > + > > + iounmap(vaddr); > > + > > + /* we leak the DMA buffer here ... no choice since the > controller could > > + still complete the command. */ > > + if (i == 10) { > > + printk(KERN_ERR "cciss: controller message > %02x:%02x timed out\n", > > + opcode, type); > > + return -ETIMEDOUT; > > + } > > + > > + pci_free_consistent(pdev, cmd_sz, cmd, paddr64); > > + > > + if (tag & 2) { > > + printk(KERN_ERR "cciss: controller message > %02x:%02x failed\n", > > + opcode, type); > > + return -EIO; > > + } > > + > > + printk(KERN_INFO "cciss: controller message %02x:%02x > succeeded\n", > > + opcode, type); > > + return 0; > > +} > > + > > +#define cciss_soft_reset_controller(p) cciss_message(p, 1, > 0) #define > > +cciss_noop(p) cciss_message(p, 3, 0) > > + > > +static __devinit int cciss_reset_msi(struct pci_dev *pdev) { > > +/* the #defines are stolen from drivers/pci/msi.h. */ > > +#define msi_control_reg(base) (base + PCI_MSI_FLAGS) > > +#define PCI_MSIX_FLAGS_ENABLE (1 << 15) > > + > > + int pos; > > + u16 control = 0; > > + > > + pos = pci_find_capability(pdev, PCI_CAP_ID_MSI); > > + if (pos) { > > + pci_read_config_word(pdev, > msi_control_reg(pos), &control); > > + if (control & PCI_MSI_FLAGS_ENABLE) { > > + printk(KERN_INFO "cciss: resetting MSI\n"); > > + pci_write_config_word(pdev, > msi_control_reg(pos), control & ~PCI_MSI_FLAGS_ENABLE); > > + } > > + } > > + > > + pos = pci_find_capability(pdev, PCI_CAP_ID_MSIX); > > + if (pos) { > > + pci_read_config_word(pdev, > msi_control_reg(pos), &control); > > + if (control & PCI_MSIX_FLAGS_ENABLE) { > > + printk(KERN_INFO "cciss: resetting MSI-X\n"); > > + pci_write_config_word(pdev, > msi_control_reg(pos), control & ~PCI_MSIX_FLAGS_ENABLE); > > + } > > + } > > + > > + return 0; > > +} > > + > > +/* This does a hard reset of the controller using PCI power > > +management > > + * states. */ > > +static __devinit int cciss_hard_reset_controller(struct pci_dev > > +*pdev) { > > + u16 pmcsr, saved_config_space[32]; > > + int i, pos; > > + > > + printk(KERN_INFO "cciss: using PCI PM to reset controller\n"); > > + > > + /* This is very nearly the same thing as > > + > > + pci_save_state(pci_dev); > > + pci_set_power_state(pci_dev, PCI_D3hot); > > + pci_set_power_state(pci_dev, PCI_D0); > > + pci_restore_state(pci_dev); > > + > > + but we can't use these nice canned kernel routines on > > + kexec, because they also check the MSI/MSI-X state in PCI > > + configuration space and do the wrong thing when it is > > + set/cleared. Also, the pci_save/restore_state functions > > + violate the ordering requirements for restoring the > > + configuration space from the CCISS document (see the > > + comment below). So we roll our own .... */ > > + > > + for (i = 0; i < 32; i++) > > + pci_read_config_word(pdev, 2*i, &saved_config_space[i]); > > + > > + pos = pci_find_capability(pdev, PCI_CAP_ID_PM); > > + if (pos == 0) { > > + printk(KERN_ERR "cciss_reset_controller: PCI PM > not supported\n"); > > + return -ENODEV; > > + } > > + > > + /* Quoting from the Open CISS Specification: "The Power > > + * Management Control/Status Register (CSR) controls the power > > + * state of the device. The normal operating state is D0, > > + * CSR=00h. The software off state is D3, CSR=03h. To reset > > + * the controller, place the interface device in D3 then to > > + * D0, this causes a secondary PCI reset which will reset the > > + * controller." */ > > + > > + /* enter the D3hot power management state */ > > + pci_read_config_word(pdev, pos + PCI_PM_CTRL, &pmcsr); > > + pmcsr &= ~PCI_PM_CTRL_STATE_MASK; > > + pmcsr |= PCI_D3hot; > > + pci_write_config_word(pdev, pos + PCI_PM_CTRL, pmcsr); > > + > > + set_current_state(TASK_UNINTERRUPTIBLE); > > + schedule_timeout(HZ >> 1); > > + > > + /* enter the D0 power management state */ > > + pmcsr &= ~PCI_PM_CTRL_STATE_MASK; > > + pmcsr |= PCI_D0; > > + pci_write_config_word(pdev, pos + PCI_PM_CTRL, pmcsr); > > + > > + set_current_state(TASK_UNINTERRUPTIBLE); > > + schedule_timeout(HZ >> 1); > > + > > + /* Restore the PCI configuration space. The Open CISS > > + * Specification says, "Restore the PCI Configuration > > + * Registers, offsets 00h through 60h. It is important to > > + * restore the command register, 16-bits at offset 04h, > > + * last. Do not restore the configuration status register, > > + * 16-bits at offset 06h." Note that the offset is 2*i. */ > > + for (i = 0; i < 32; i++) { > > + if (i == 2 || i == 3) > > + continue; > > + pci_write_config_word(pdev, 2*i, saved_config_space[i]); > > + } > > + wmb(); > > + pci_write_config_word(pdev, 4, saved_config_space[2]); > > + > > + return 0; > > +} > > + > > /* > > * This is it. Find all the controllers and register > them. I really hate > > * stealing all these major device numbers. > > @@ -3404,6 +3603,24 @@ static int __devinit > cciss_init_one(struct pci_dev *pdev, > > int dac, return_code; > > InquiryData_struct *inq_buff = NULL; > > > > + if (reset_devices) { > > + /* Reset the controller with a PCI power-cycle */ > > + if (cciss_hard_reset_controller(pdev) || > cciss_reset_msi(pdev)) > > + return -ENODEV; > > + > > + /* Some devices (notably the HP Smart Array 5i > Controller) > > + need a little pause here */ > > + schedule_timeout_uninterruptible(30*HZ); > > + > > + /* Now try to get the controller to respond to > a no-op */ > > + for (i=0; i<12; i++) { > > + if (cciss_noop(pdev) == 0) > > + break; > > + else > > + printk("cciss: no-op > failed%s\n", (i < 11 ? "; re-trying" : "")); > > + } > > + } > > + > > i = alloc_cciss_hba(); > > if (i < 0) > > return -1; > > > -- > ~Randy > -- To unsubscribe from this list: send the line "unsubscribe linux-scsi" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html