On Fri, Oct 30, 2015 at 04:35:54PM +1100, Alexey Kardashevskiy wrote: >On 10/26/2015 02:16 PM, Wei Yang wrote: >>When PF is EEH aware while VFs are not, VFs will be removed during EEH >>recovery. This is not supported in current code, while will leads to the VF >>lost. >> >>This patch fixes this by adding VFs back. VFs should be added back after PF >>get recovered properly. >> >>Signed-off-by: Wei Yang <weiyang@xxxxxxxxxxxxxxxxxx> >>Signed-off-by: Alexey Kardashevskiy <aik@xxxxxxxxx> > >btw please remove my "sob" from this patchset (here and 11/12 at least) as I >did not "sob" the upstream versions of these and I did not post them and >there is no public tree of mine with these patches. When I agree that the >patches are good to go, it will be "reviewed-by" or "acked-by". > Sure, I would obey this rule in the future. > >>--- >> arch/powerpc/include/asm/eeh.h | 6 ++++++ >> arch/powerpc/kernel/eeh_dev.c | 1 + >> arch/powerpc/kernel/eeh_driver.c | 30 +++++++++++++++++++++++------- >> 3 files changed, 30 insertions(+), 7 deletions(-) >> >>diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h >>index ea1f13c4..c529a23 100644 >>--- a/arch/powerpc/include/asm/eeh.h >>+++ b/arch/powerpc/include/asm/eeh.h >>@@ -127,6 +127,11 @@ static inline bool eeh_pe_passed(struct eeh_pe *pe) >> #define EEH_DEV_SYSFS (1 << 9) /* Sysfs created */ >> #define EEH_DEV_REMOVED (1 << 10) /* Removed permanently */ >> >>+struct eeh_rmv_data { >>+ struct list_head edev_list; >>+ int removed; >>+}; >>+ >> struct eeh_dev { >> int mode; /* EEH mode */ >> int class_code; /* Class code of the device */ >>@@ -139,6 +144,7 @@ struct eeh_dev { >> int af_cap; /* Saved AF capability */ >> struct eeh_pe *pe; /* Associated PE */ >> struct list_head list; /* Form link list in the PE */ >>+ struct list_head rmv_list; /* Record the removed edev */ >> struct pci_controller *phb; /* Associated PHB */ >> struct pci_dn *pdn; /* Associated PCI device node */ >> struct pci_dev *pdev; /* Associated PCI device */ >>diff --git a/arch/powerpc/kernel/eeh_dev.c b/arch/powerpc/kernel/eeh_dev.c >>index aabba94..7815095 100644 >>--- a/arch/powerpc/kernel/eeh_dev.c >>+++ b/arch/powerpc/kernel/eeh_dev.c >>@@ -67,6 +67,7 @@ void *eeh_dev_init(struct pci_dn *pdn, void *data) >> edev->pdn = pdn; >> edev->phb = phb; >> INIT_LIST_HEAD(&edev->list); >>+ INIT_LIST_HEAD(&edev->rmv_list); >> >> return NULL; >> } >>diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c >>index 99868e2..f2406b4 100644 >>--- a/arch/powerpc/kernel/eeh_driver.c >>+++ b/arch/powerpc/kernel/eeh_driver.c >>@@ -420,7 +420,8 @@ static void *eeh_rmv_device(void *data, void *userdata) >> struct pci_driver *driver; >> struct eeh_dev *edev = (struct eeh_dev *)data; >> struct pci_dev *dev = eeh_dev_to_pci_dev(edev); >>- int *removed = (int *)userdata; >>+ struct eeh_rmv_data *rmv_data = (struct eeh_rmv_data *)userdata; >>+ int *removed = rmv_data ? &rmv_data->removed : NULL; > > >You just touched @userdata/@removed in [10/12] and now you are touching it >again. > >It feels like this patch is better to be merged into [10/12], this will >reduce the noise about the userdata pointer changes passed into >eeh_pe_dev_traverse() and make more sense as "powerpc/eeh: Support error >recovery for VF PE" without adding VFs back is pretty useless. > Agree, will merge them. > > > >> struct pci_dn *pdn = eeh_dev_to_pdn(edev); >> >> /* >>@@ -467,6 +468,9 @@ static void *eeh_rmv_device(void *data, void *userdata) >> * required to plug the VF successfully. >> */ >> pdn->pe_number = IODA_INVALID_PE; >>+ >>+ if (rmv_data) >>+ list_add(&edev->rmv_list, &rmv_data->edev_list); >> } else { >> pci_lock_rescan_remove(); >> pci_stop_and_remove_bus_device(dev); >>@@ -585,11 +589,12 @@ int eeh_pe_reset_and_recover(struct eeh_pe *pe) >> * During the reset, udev might be invoked because those affected >> * PCI devices will be removed and then added. >> */ >>-static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus) >>+static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, >>+ struct eeh_rmv_data *rmv_data) >> { >> struct pci_bus *frozen_bus = eeh_pe_bus_get(pe); >> struct timeval tstamp; >>- int cnt, rc, removed = 0; >>+ int cnt, rc; >> struct eeh_dev *edev; >> >> /* pcibios will clear the counter; save the value */ >>@@ -612,7 +617,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus) >> pci_unlock_rescan_remove(); >> } >> } else if (frozen_bus) >>- eeh_pe_dev_traverse(pe, eeh_rmv_device, &removed); >>+ eeh_pe_dev_traverse(pe, eeh_rmv_device, rmv_data); >> >> /* >> * Reset the pci controller. (Asserts RST#; resets config space). >>@@ -659,7 +664,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus) >> eeh_add_virt_device(edev, NULL); >> else >> pcibios_add_pci_devices(bus); >>- } else if (frozen_bus && removed) { >>+ } else if (frozen_bus && rmv_data->removed) { >> pr_info("EEH: Sleep 5s ahead of partial hotplug\n"); >> ssleep(5); >> >>@@ -687,8 +692,10 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus) >> static void eeh_handle_normal_event(struct eeh_pe *pe) >> { >> struct pci_bus *frozen_bus; >>+ struct eeh_dev *edev, *tmp; >> int rc = 0; >> enum pci_ers_result result = PCI_ERS_RESULT_NONE; >>+ struct eeh_rmv_data rmv_data = {LIST_HEAD_INIT(rmv_data.edev_list), 0}; >> >> frozen_bus = eeh_pe_bus_get(pe); >> if (!frozen_bus) { >>@@ -735,7 +742,7 @@ static void eeh_handle_normal_event(struct eeh_pe *pe) >> */ >> if (result == PCI_ERS_RESULT_NONE) { >> pr_info("EEH: Reset with hotplug activity\n"); >>- rc = eeh_reset_device(pe, frozen_bus); >>+ rc = eeh_reset_device(pe, frozen_bus, NULL); >> if (rc) { >> pr_warn("%s: Unable to reset, err=%d\n", >> __func__, rc); >>@@ -787,7 +794,7 @@ static void eeh_handle_normal_event(struct eeh_pe *pe) >> /* If any device called out for a reset, then reset the slot */ >> if (result == PCI_ERS_RESULT_NEED_RESET) { >> pr_info("EEH: Reset without hotplug activity\n"); >>- rc = eeh_reset_device(pe, NULL); >>+ rc = eeh_reset_device(pe, NULL, &rmv_data); >> if (rc) { >> pr_warn("%s: Cannot reset, err=%d\n", >> __func__, rc); >>@@ -807,6 +814,15 @@ static void eeh_handle_normal_event(struct eeh_pe *pe) >> goto hard_fail; >> } >> >>+ /* >>+ * For those hot removed VFs, we should add back them after PF get >>+ * recovered properly. >>+ */ >>+ list_for_each_entry_safe(edev, tmp, &rmv_data.edev_list, rmv_list) { >>+ eeh_add_virt_device(edev, NULL); >>+ list_del(&edev->rmv_list); >>+ } >>+ >> /* Tell all device drivers that they can resume operations */ >> pr_info("EEH: Notify device driver to resume\n"); >> eeh_pe_dev_traverse(pe, eeh_report_resume, NULL); >> > > >-- >Alexey -- Richard Yang Help you, Help me -- To unsubscribe from this list: send the line "unsubscribe linux-pci" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html