Please see comments inline: On Wed, 2012-10-10 at 16:51 +0900, Takao Indoh wrote: > This patch resets PCIe devices at boot time by hot reset when > "reset_devices" is specified. > > > Signed-off-by: Takao Indoh <indou.takao at jp.fujitsu.com> > --- > arch/x86/include/asm/pci-direct.h | 1 > arch/x86/kernel/setup.c | 3 > arch/x86/pci/early.c | 299 ++++++++++++++++++++++++++++ > drivers/pci/pci.c | 18 - > include/linux/pci.h | 18 + > init/main.c | 4 > 6 files changed, 323 insertions(+), 20 deletions(-) > > diff --git a/arch/x86/include/asm/pci-direct.h b/arch/x86/include/asm/pci-direct.h > index b1e7a45..de30db2 100644 > --- a/arch/x86/include/asm/pci-direct.h > +++ b/arch/x86/include/asm/pci-direct.h > @@ -18,4 +18,5 @@ extern int early_pci_allowed(void); > extern unsigned int pci_early_dump_regs; > extern void early_dump_pci_device(u8 bus, u8 slot, u8 func); > extern void early_dump_pci_devices(void); > +extern void early_reset_pcie_devices(void); > #endif /* _ASM_X86_PCI_DIRECT_H */ > diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c > index f4b9b80..24b011c 100644 > --- a/arch/x86/kernel/setup.c > +++ b/arch/x86/kernel/setup.c > @@ -988,6 +988,9 @@ void __init setup_arch(char **cmdline_p) > generic_apic_probe(); > > early_quirks(); > +#ifdef CONFIG_PCI > + early_reset_pcie_devices(); > +#endif > > /* > * Read APIC and some other early information from ACPI tables. 
> diff --git a/arch/x86/pci/early.c b/arch/x86/pci/early.c > index d1067d5..584f16b 100644 > --- a/arch/x86/pci/early.c > +++ b/arch/x86/pci/early.c > @@ -1,5 +1,6 @@ > #include <linux/kernel.h> > #include <linux/pci.h> > +#include <linux/bootmem.h> > #include <asm/pci-direct.h> > #include <asm/io.h> > #include <asm/pci_x86.h> > @@ -109,3 +110,301 @@ void early_dump_pci_devices(void) > } > } > } > + > +struct save_config { > + u32 pci[16]; > + u16 pcie[PCI_EXP_SAVE_REGS]; > +}; > + > +struct devinfo { > + int pcie_pos; /* position of PCI Express capability */ > + int pcie_flags; /* PCI_EXP_FLAGS */ > + struct save_config *save; > +}; > + > +static struct save_config *save_cfg; > +static void __init pci_udelay(int loops) > +{ > + while (loops--) { > + /* Approximately 1 us */ > + native_io_delay(); > + } > +} > + > +/* Derived from drivers/pci/pci.c */ > +#define PCI_FIND_CAP_TTL 48 > +static int __init __pci_find_next_cap_ttl(u8 bus, u8 slot, u8 func, > + u8 pos, int cap, int *ttl) > +{ > + u8 id; > + > + while ((*ttl)--) { > + pos = read_pci_config_byte(bus, slot, func, pos); > + if (pos < 0x40) > + break; > + pos &= ~3; > + id = read_pci_config_byte(bus, slot, func, > + pos + PCI_CAP_LIST_ID); > + if (id == 0xff) > + break; > + if (id == cap) > + return pos; > + pos += PCI_CAP_LIST_NEXT; > + } > + return 0; > +} > + > +static int __init __pci_find_next_cap(u8 bus, u8 slot, u8 func, u8 pos, int cap) > +{ > + int ttl = PCI_FIND_CAP_TTL; > + > + return __pci_find_next_cap_ttl(bus, slot, func, pos, cap, &ttl); > +} > + > +static int __init __pci_bus_find_cap_start(u8 bus, u8 slot, u8 func, > + u8 hdr_type) > +{ > + u16 status; > + > + status = read_pci_config_16(bus, slot, func, PCI_STATUS); > + if (!(status & PCI_STATUS_CAP_LIST)) > + return 0; > + > + switch (hdr_type) { > + case PCI_HEADER_TYPE_NORMAL: > + case PCI_HEADER_TYPE_BRIDGE: > + return PCI_CAPABILITY_LIST; > + case PCI_HEADER_TYPE_CARDBUS: > + return PCI_CB_CAPABILITY_LIST; > + default: > + return 0; > + 
} > + > + return 0; > +} > + > +static int __init early_pci_find_capability(u8 bus, u8 slot, u8 func, int cap) > +{ > + int pos; > + u8 type = read_pci_config_byte(bus, slot, func, PCI_HEADER_TYPE); > + > + pos = __pci_bus_find_cap_start(bus, slot, func, type & 0x7f); > + if (pos) > + pos = __pci_find_next_cap(bus, slot, func, pos, cap); > + > + return pos; > +} > + > +static void __init do_reset(u8 bus, u8 slot, u8 func) > +{ > + u16 ctrl; > + > + printk(KERN_INFO "pci 0000:%02x:%02x.%d reset\n", bus, slot, func); > + > + /* Assert Secondary Bus Reset */ > + ctrl = read_pci_config_16(bus, slot, func, PCI_BRIDGE_CONTROL); > + ctrl |= PCI_BRIDGE_CTL_BUS_RESET; > + write_pci_config_16(bus, slot, func, PCI_BRIDGE_CONTROL, ctrl); > + > + pci_udelay(5000); > + > + /* De-assert Secondary Bus Reset */ > + ctrl &= ~PCI_BRIDGE_CTL_BUS_RESET; > + write_pci_config_16(bus, slot, func, PCI_BRIDGE_CONTROL, ctrl); > + > + pci_udelay(500000); This is 0.5 seconds. This will add up quickly on larger servers with multiple buses. Is 0.5 seconds required by the spec? aer_do_secondary_bus_reset() holds PCI_BRIDGE_CTL_BUS_RESET for 2 ms and then waits another 200 ms after de-asserting it. Still long, but less than half of the delay in the above code. 
> +} > + > +static void __init save_state(unsigned bus, unsigned slot, unsigned func, > + struct devinfo *info) > +{ > + int i; > + int pcie, flags, pcie_type; > + struct save_config *save; > + > + pcie = info->pcie_pos; > + flags = info->pcie_flags; > + pcie_type = (flags & PCI_EXP_FLAGS_TYPE) >> 4; > + save = info->save; > + > + printk(KERN_INFO "pci 0000:%02x:%02x.%d save state\n", bus, slot, func); > + > + for (i = 0; i < 16; i++) > + save->pci[i] = read_pci_config(bus, slot, func, i * 4); > + i = 0; > + if (pcie_cap_has_devctl(pcie_type, flags)) > + save->pcie[i++] = read_pci_config_16(bus, slot, func, > + pcie + PCI_EXP_DEVCTL); > + if (pcie_cap_has_lnkctl(pcie_type, flags)) > + save->pcie[i++] = read_pci_config_16(bus, slot, func, > + pcie + PCI_EXP_LNKCTL); > + if (pcie_cap_has_sltctl(pcie_type, flags)) > + save->pcie[i++] = read_pci_config_16(bus, slot, func, > + pcie + PCI_EXP_SLTCTL); > + if (pcie_cap_has_rtctl(pcie_type, flags)) > + save->pcie[i++] = read_pci_config_16(bus, slot, func, > + pcie + PCI_EXP_RTCTL); > + > + if ((flags & PCI_EXP_FLAGS_VERS) >= 2) { > + save->pcie[i++] = read_pci_config_16(bus, slot, func, > + pcie + PCI_EXP_DEVCTL2); > + save->pcie[i++] = read_pci_config_16(bus, slot, func, > + pcie + PCI_EXP_LNKCTL2); > + save->pcie[i++] = read_pci_config_16(bus, slot, func, > + pcie + PCI_EXP_SLTCTL2); > + } > +} > + > +static void __init restore_state(unsigned bus, unsigned slot, unsigned func, > + struct devinfo *info) > +{ > + int i = 0; > + int pcie, flags, pcie_type; > + struct save_config *save; > + > + pcie = info->pcie_pos; > + flags = info->pcie_flags; > + pcie_type = (flags & PCI_EXP_FLAGS_TYPE) >> 4; > + save = info->save; > + > + printk(KERN_INFO "pci 0000:%02x:%02x.%d restore state\n", > + bus, slot, func); > + > + if (pcie_cap_has_devctl(pcie_type, flags)) > + write_pci_config_16(bus, slot, func, > + pcie + PCI_EXP_DEVCTL, save->pcie[i++]); > + if (pcie_cap_has_lnkctl(pcie_type, flags)) > + write_pci_config_16(bus, slot, 
func, > + pcie + PCI_EXP_LNKCTL, save->pcie[i++]); > + if (pcie_cap_has_sltctl(pcie_type, flags)) > + write_pci_config_16(bus, slot, func, > + pcie + PCI_EXP_SLTCTL, save->pcie[i++]); > + if (pcie_cap_has_rtctl(pcie_type, flags)) > + write_pci_config_16(bus, slot, func, > + pcie + PCI_EXP_RTCTL, save->pcie[i++]); > + > + if ((flags & PCI_EXP_FLAGS_VERS) >= 2) { > + write_pci_config_16(bus, slot, func, > + pcie + PCI_EXP_DEVCTL2, save->pcie[i++]); > + write_pci_config_16(bus, slot, func, > + pcie + PCI_EXP_LNKCTL2, save->pcie[i++]); > + write_pci_config_16(bus, slot, func, > + pcie + PCI_EXP_SLTCTL2, save->pcie[i++]); > + } > + > + for (i = 15; i >= 0; i--) > + write_pci_config(bus, slot, func, i * 4, save->pci[i]); > +} > + > +static void __init reset_pcie_device(unsigned bus, unsigned slot, unsigned func) > +{ > + int f, count; > + int pcie, pcie_type; > + u8 type; > + u16 vendor, flags; > + u32 class; > + int secondary; > + struct devinfo child[8]; > + > + pcie = early_pci_find_capability(bus, slot, func, PCI_CAP_ID_EXP); > + if (!pcie) > + return; > + > + flags = read_pci_config_16(bus, slot, func, pcie + PCI_EXP_FLAGS); > + pcie_type = (flags & PCI_EXP_FLAGS_TYPE) >> 4; > + if ((pcie_type != PCI_EXP_TYPE_ROOT_PORT) && > + (pcie_type != PCI_EXP_TYPE_DOWNSTREAM)) > + return; > + > + type = read_pci_config_byte(bus, slot, func, PCI_HEADER_TYPE); > + if ((type & 0x7f) != PCI_HEADER_TYPE_BRIDGE) > + return; > + secondary = read_pci_config_byte(bus, slot, func, PCI_SECONDARY_BUS); > + memset(child, 0, sizeof(child)); > + for (count = 0, f = 0; f < 8; f++) { Can we use a constant instead of "8" in the loop here? There are a few other places in kernel code with very similar loops enumerating over PCI functions that again use "8" instead of a constant like PCI_MAX_FUNCTIONS. I would suggest we use a constant at least in the new code. 
> + vendor = read_pci_config_16(secondary, 0, f, PCI_VENDOR_ID); > + if (vendor == 0xffff) > + continue; > + > + pcie = early_pci_find_capability(secondary, 0, f, > + PCI_CAP_ID_EXP); > + if (!pcie) > + continue; > + > + flags = read_pci_config_16(secondary, 0, f, > + pcie + PCI_EXP_FLAGS); > + pcie_type = (flags & PCI_EXP_FLAGS_TYPE) >> 4; > + if ((pcie_type == PCI_EXP_TYPE_UPSTREAM) || > + (pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)) > + /* Don't reset switch, bridge */ > + return; > + > + class = read_pci_config(secondary, 0, f, PCI_CLASS_REVISION); > + if ((class >> 24) == PCI_BASE_CLASS_DISPLAY) > + /* Don't reset VGA device */ > + return; > + > + count++; > + child[f].pcie_pos = pcie; > + child[f].pcie_flags = flags; > + child[f].save = save_cfg + f; > + } > + > + if (!count) > + return; > + > + /* save */ > + for (f = 0; f < 8; f++) > + if (child[f].pcie_pos) > + save_state(secondary, 0, f, &child[f]); > + > + do_reset(bus, slot, func); > + > + /* restore */ > + for (f = 0; f < 8; f++) > + if (child[f].pcie_pos) > + restore_state(secondary, 0, f, &child[f]); > +} > + > +void __init early_reset_pcie_devices(void) > +{ > + unsigned bus, slot, func; > + int size; > + > + if (!early_pci_allowed() || !reset_devices) > + return; > + > + /* alloc space to save config */ > + size = sizeof(struct save_config)*8; Use a constant instead of "8", please. 
> + save_cfg = (struct save_config *)alloc_bootmem(size); > + if (save_cfg == NULL) { > + printk(KERN_ERR "reset_pcie: alloc_bootmem failed\n"); > + return; > + } > + > + for (bus = 0; bus < 256; bus++) { > + for (slot = 0; slot < 32; slot++) { > + for (func = 0; func < 8; func++) { > + u16 vendor; > + u8 type; > + vendor = read_pci_config_16(bus, slot, func, > + PCI_VENDOR_ID); > + > + if (vendor == 0xffff) > + continue; > + > + reset_pcie_device(bus, slot, func); > + > + if (func == 0) { > + type = read_pci_config_byte(bus, slot, > + func, > + PCI_HEADER_TYPE); > + if (!(type & 0x80)) > + break; > + } > + } > + } > + } > + > + free_bootmem(__pa(save_cfg), size); > +} > diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c > index ab4bf5a..a7a4125 100644 > --- a/drivers/pci/pci.c > +++ b/drivers/pci/pci.c > @@ -852,24 +852,6 @@ pci_power_t pci_choose_state(struct pci_dev *dev, pm_message_t state) > > EXPORT_SYMBOL(pci_choose_state); > > -#define PCI_EXP_SAVE_REGS 7 > - > -#define pcie_cap_has_devctl(type, flags) 1 > -#define pcie_cap_has_lnkctl(type, flags) \ > - ((flags & PCI_EXP_FLAGS_VERS) > 1 || \ > - (type == PCI_EXP_TYPE_ROOT_PORT || \ > - type == PCI_EXP_TYPE_ENDPOINT || \ > - type == PCI_EXP_TYPE_LEG_END)) > -#define pcie_cap_has_sltctl(type, flags) \ > - ((flags & PCI_EXP_FLAGS_VERS) > 1 || \ > - ((type == PCI_EXP_TYPE_ROOT_PORT) || \ > - (type == PCI_EXP_TYPE_DOWNSTREAM && \ > - (flags & PCI_EXP_FLAGS_SLOT)))) > -#define pcie_cap_has_rtctl(type, flags) \ > - ((flags & PCI_EXP_FLAGS_VERS) > 1 || \ > - (type == PCI_EXP_TYPE_ROOT_PORT || \ > - type == PCI_EXP_TYPE_RC_EC)) > - > static struct pci_cap_saved_state *pci_find_saved_cap( > struct pci_dev *pci_dev, char cap) > { > diff --git a/include/linux/pci.h b/include/linux/pci.h > index 5faa831..8e10401 100644 > --- a/include/linux/pci.h > +++ b/include/linux/pci.h > @@ -1790,5 +1790,23 @@ static inline struct eeh_dev *pci_dev_to_eeh_dev(struct pci_dev *pdev) > */ > struct pci_dev 
*pci_find_upstream_pcie_bridge(struct pci_dev *pdev); > > +#define PCI_EXP_SAVE_REGS 7 > + > +#define pcie_cap_has_devctl(type, flags) 1 > +#define pcie_cap_has_lnkctl(type, flags) \ > + ((flags & PCI_EXP_FLAGS_VERS) > 1 || \ > + (type == PCI_EXP_TYPE_ROOT_PORT || \ > + type == PCI_EXP_TYPE_ENDPOINT || \ > + type == PCI_EXP_TYPE_LEG_END)) > +#define pcie_cap_has_sltctl(type, flags) \ > + ((flags & PCI_EXP_FLAGS_VERS) > 1 || \ > + ((type == PCI_EXP_TYPE_ROOT_PORT) || \ > + (type == PCI_EXP_TYPE_DOWNSTREAM && \ > + (flags & PCI_EXP_FLAGS_SLOT)))) > +#define pcie_cap_has_rtctl(type, flags) \ > + ((flags & PCI_EXP_FLAGS_VERS) > 1 || \ > + (type == PCI_EXP_TYPE_ROOT_PORT || \ > + type == PCI_EXP_TYPE_RC_EC)) > + > #endif /* __KERNEL__ */ > #endif /* LINUX_PCI_H */ > diff --git a/init/main.c b/init/main.c > index b286730..ebaf067 100644 > --- a/init/main.c > +++ b/init/main.c > @@ -144,10 +144,10 @@ EXPORT_SYMBOL(reset_devices); > static int __init set_reset_devices(char *str) > { > reset_devices = 1; > - return 1; > + return 0; > } > > -__setup("reset_devices", set_reset_devices); > +early_param("reset_devices", set_reset_devices); > > static const char * argv_init[MAX_INIT_ARGS+2] = { "init", NULL, }; > const char * envp_init[MAX_INIT_ENVS+2] = { "HOME=/", "TERM=linux", NULL, }; > > > _______________________________________________ > kexec mailing list > kexec at lists.infradead.org > http://lists.infradead.org/mailman/listinfo/kexec We have been seeing problems with the kexec/kdump kernel for quite some time that are related to I/O devices not being quiesced before kexec. I had added code to clear the Bus Master bit to help stop runaway DMAs, which helped in many cases, but obviously not all. If resetting downstream ports helps stop runaway I/O from PCIe devices, I am all for this approach. This patch still doesn't do anything for old PCI devices, though. -- Khalid