On Sun, Aug 27, 2017 at 11:12:37PM -0400, Pingfan Liu wrote: > > > > > ----- Original Message ----- > > From: "Hari Bathini" <hbathini at linux.vnet.ibm.com> > > To: "Simon Horman" <horms at verge.net.au>, "Kexec-ml" <kexec at lists.infradead.org> > > Cc: "Ankit Kumar" <ankit at linux.vnet.ibm.com>, "Anshuman Khandual" <khandual at linux.vnet.ibm.com>, "Ananth N > > Mavinakayanahalli" <ananth at linux.vnet.ibm.com>, "Alistair Popple" <alistair at popple.id.au> > > Sent: Thursday, August 17, 2017 8:31:51 PM > > Subject: [PATCH] kexec-tools: ppc64: avoid adding coherent memory regions to crash memory ranges > > > > Accelerator devices like GPU and FPGA cards contain onboard memory. This > > onboard memory is represented as a memory only NUMA node, integrating it > > with core memory subsystem. Since, the link through which these devices > > are integrated to core memory goes down after a system crash and they are > > meant for user workloads, avoid adding coherent device memory regions to > > crash memory ranges. Without this change, makedumpfile tool tries to save > > unaccessible coherent device memory regions, crashing the system. > > > > Signed-off-by: Hari Bathini <hbathini at linux.vnet.ibm.com> > > --- > > kexec/arch/ppc64/crashdump-ppc64.c | 64 > > +++++++++++++++++++++++++++++++++++- > > kexec/arch/ppc64/kexec-ppc64.h | 1 + > > 2 files changed, 63 insertions(+), 2 deletions(-) > > > > diff --git a/kexec/arch/ppc64/crashdump-ppc64.c > > b/kexec/arch/ppc64/crashdump-ppc64.c > > index 13995bf..7ea3983 100644 > > --- a/kexec/arch/ppc64/crashdump-ppc64.c > > +++ b/kexec/arch/ppc64/crashdump-ppc64.c > > @@ -181,6 +181,53 @@ static int get_dyn_reconf_crash_memory_ranges(void) > > return 0; > > } > > > > +/* > > + * For a given memory node, check if it is mapped to system RAM or > > + * to onboard memory on accelerator device like GPU card or such. > > + */ > > +static int is_coherent_device_mem(const char *fname) > > +{ > > + char fpath[PATH_LEN]; > > + char buf[32]; > > + DIR *dmem; > > + FILE *file; > > + struct dirent *mentry; > > + int cnt, ret = 0; > > + > > + strcpy(fpath, fname); > > + if ((dmem = opendir(fpath)) == NULL) { > > + perror(fpath); > > + return -1; > > + } > > + > > + while ((mentry = readdir(dmem)) != NULL) { > > + if (strcmp(mentry->d_name, "compatible")) > > + continue; > > + > > + strcat(fpath, "/compatible"); > > + if ((file = fopen(fpath, "r")) == NULL) { > > + perror(fpath); > > + ret = -1; > > + break; > > + } > > + if ((cnt = fread(buf, 1, 32, file)) < 0) { > > + perror(fpath); > > + fclose(file); > > + ret = -1; > > + break; > > + } > > + if (!strncmp(buf, "ibm,coherent-device-memory", 26)) { > > + ret = 1; > > + break; This seems to leak file. > > + } > > + fclose(file); > > + } > > + > > + closedir(dmem); > > + return ret; > > +} > > + > > + > > /* Reads the appropriate file and retrieves the SYSTEM RAM regions for whom > > to > > * create Elf headers. Keeping it separate from get_memory_ranges() as > > * requirements are different in the case of normal kexec and crashdumps. > > @@ -196,12 +243,12 @@ static int get_crash_memory_ranges(struct memory_range > > **range, int *ranges) > > { > > > > char device_tree[256] = "/proc/device-tree/"; > > - char fname[256]; > > + char fname[PATH_LEN]; > > char buf[MAXBYTES]; > > DIR *dir, *dmem; > > FILE *file; > > struct dirent *dentry, *mentry; > > - int n, crash_rng_len = 0; > > + int n, ret, crash_rng_len = 0; > > unsigned long long start, end; > > int page_size; > > > > @@ -240,6 +287,19 @@ static int get_crash_memory_ranges(struct memory_range > > **range, int *ranges) > > continue; > > strcpy(fname, device_tree); > > strcat(fname, dentry->d_name); > > + > > + ret = is_coherent_device_mem(fname); > > + if (ret == -1) { > > + closedir(dir); > > + goto err; > > + } else if (ret == 1) { > > + /* > > + * Avoid adding this memory region as it is not > > + * mapped to system RAM. > > + */ > > + continue; > > + } > > + > > if ((dmem = opendir(fname)) == NULL) { > > perror(fname); > > closedir(dir); > > diff --git a/kexec/arch/ppc64/kexec-ppc64.h b/kexec/arch/ppc64/kexec-ppc64.h > > index 633ae77..434b4bf 100644 > > --- a/kexec/arch/ppc64/kexec-ppc64.h > > +++ b/kexec/arch/ppc64/kexec-ppc64.h > > @@ -1,6 +1,7 @@ > > #ifndef KEXEC_PPC64_H > > #define KEXEC_PPC64_H > > > > +#define PATH_LEN 256 > > #define MAXBYTES 128 > > #define MAX_LINE 160 > > #define CORE_TYPE_ELF32 1 > > > Tested-by: Pingfan Liu <piliu at redhat.com> The above not withstanding I have applied this patch with Pingfan's tag. Please post a follow-up patch as appropriate.