Hi Andrew, On 11/08/21 at 06:31pm, Andrew Morton wrote: > From: David Hildenbrand <david@xxxxxxxxxx> > Subject: proc/vmcore: convert oldmem_pfn_is_ram callback to more generic vmcore callbacks > > Let's support multiple registered callbacks, making sure that registering > vmcore callbacks cannot fail. Make the callback return a bool instead of > an int, handling how to deal with errors internally. Drop unused > HAVE_OLDMEM_PFN_IS_RAM. > > We soon want to make use of this infrastructure from other drivers: > virtio-mem, registering one callback for each virtio-mem device, to > prevent reading unplugged virtio-mem memory. > > Handle it via a generic vmcore_cb structure, prepared for future > extensions: for example, once we support virtio-mem on s390x where the > vmcore is completely constructed in the second kernel, we want to detect > and add plugged virtio-mem memory ranges to the vmcore in order for them > to get dumped properly. > > Handle corner cases that are unexpected and shouldn't happen in sane > setups: registering a callback after the vmcore has already been opened > (warn only) and unregistering a callback after the vmcore has already been > opened (warn and essentially read only zeroes from that point on). This is a nice improvement, thanks David. But I did not get time to review it yet. The overall idea is good, I would prefer to hold on the patches for some time and waiting for more review. Sorry for jumping in late. > > Link: https://lkml.kernel.org/r/20211005121430.30136-6-david@xxxxxxxxxx > Signed-off-by: David Hildenbrand <david@xxxxxxxxxx> > Cc: Baoquan He <bhe@xxxxxxxxxx> > Cc: Borislav Petkov <bp@xxxxxxxxx> > Cc: Boris Ostrovsky <boris.ostrovsky@xxxxxxxxxx> > Cc: Dave Young <dyoung@xxxxxxxxxx> > Cc: "H. Peter Anvin" <hpa@xxxxxxxxx> > Cc: Ingo Molnar <mingo@xxxxxxxxxx> > Cc: Jason Wang <jasowang@xxxxxxxxxx> > Cc: Juergen Gross <jgross@xxxxxxxx> > Cc: "Michael S. Tsirkin" <mst@xxxxxxxxxx> > Cc: Michal Hocko <mhocko@xxxxxxxx> > Cc: Mike Rapoport <rppt@xxxxxxxxxx> > Cc: Oscar Salvador <osalvador@xxxxxxx> > Cc: "Rafael J. Wysocki" <rafael.j.wysocki@xxxxxxxxx> > Cc: Stefano Stabellini <sstabellini@xxxxxxxxxx> > Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx> > Cc: Vivek Goyal <vgoyal@xxxxxxxxxx> > Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> > --- > > arch/x86/kernel/aperture_64.c | 13 +++- > arch/x86/xen/mmu_hvm.c | 11 ++- > fs/proc/vmcore.c | 101 ++++++++++++++++++++++---------- > include/linux/crash_dump.h | 26 +++++++- > 4 files changed, 112 insertions(+), 39 deletions(-) > > --- a/arch/x86/kernel/aperture_64.c~proc-vmcore-convert-oldmem_pfn_is_ram-callback-to-more-generic-vmcore-callbacks > +++ a/arch/x86/kernel/aperture_64.c > @@ -73,12 +73,23 @@ static int gart_mem_pfn_is_ram(unsigned > (pfn >= aperture_pfn_start + aperture_page_count)); > } > > +#ifdef CONFIG_PROC_VMCORE > +static bool gart_oldmem_pfn_is_ram(struct vmcore_cb *cb, unsigned long pfn) > +{ > + return !!gart_mem_pfn_is_ram(pfn); > +} > + > +static struct vmcore_cb gart_vmcore_cb = { > + .pfn_is_ram = gart_oldmem_pfn_is_ram, > +}; > +#endif > + > static void __init exclude_from_core(u64 aper_base, u32 aper_order) > { > aperture_pfn_start = aper_base >> PAGE_SHIFT; > aperture_page_count = (32 * 1024 * 1024) << aper_order >> PAGE_SHIFT; > #ifdef CONFIG_PROC_VMCORE > - WARN_ON(register_oldmem_pfn_is_ram(&gart_mem_pfn_is_ram)); > + register_vmcore_cb(&gart_vmcore_cb); > #endif > #ifdef CONFIG_PROC_KCORE > WARN_ON(register_mem_pfn_is_ram(&gart_mem_pfn_is_ram)); > --- a/arch/x86/xen/mmu_hvm.c~proc-vmcore-convert-oldmem_pfn_is_ram-callback-to-more-generic-vmcore-callbacks > +++ a/arch/x86/xen/mmu_hvm.c > @@ -12,10 +12,10 @@ > * The kdump kernel has to check whether a pfn of the crashed kernel > * was a ballooned page. vmcore is using this function to decide > * whether to access a pfn of the crashed kernel. > - * Returns 0 if the pfn is not backed by a RAM page, the caller may > + * Returns "false" if the pfn is not backed by a RAM page, the caller may > * handle the pfn special in this case. > */ > -static int xen_oldmem_pfn_is_ram(unsigned long pfn) > +static bool xen_vmcore_pfn_is_ram(struct vmcore_cb *cb, unsigned long pfn) > { > struct xen_hvm_get_mem_type a = { > .domid = DOMID_SELF, > @@ -24,10 +24,13 @@ static int xen_oldmem_pfn_is_ram(unsigne > > if (HYPERVISOR_hvm_op(HVMOP_get_mem_type, &a)) { > pr_warn_once("Unexpected HVMOP_get_mem_type failure\n"); > - return -ENXIO; > + return true; > } > return a.mem_type != HVMMEM_mmio_dm; > } > +static struct vmcore_cb xen_vmcore_cb = { > + .pfn_is_ram = xen_vmcore_pfn_is_ram, > +}; > #endif > > static void xen_hvm_exit_mmap(struct mm_struct *mm) > @@ -61,6 +64,6 @@ void __init xen_hvm_init_mmu_ops(void) > if (is_pagetable_dying_supported()) > pv_ops.mmu.exit_mmap = xen_hvm_exit_mmap; > #ifdef CONFIG_PROC_VMCORE > - WARN_ON(register_oldmem_pfn_is_ram(&xen_oldmem_pfn_is_ram)); > + register_vmcore_cb(&xen_vmcore_cb); > #endif > } > --- a/fs/proc/vmcore.c~proc-vmcore-convert-oldmem_pfn_is_ram-callback-to-more-generic-vmcore-callbacks > +++ a/fs/proc/vmcore.c > @@ -62,46 +62,75 @@ core_param(novmcoredd, vmcoredd_disabled > /* Device Dump Size */ > static size_t vmcoredd_orig_sz; > > -/* > - * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error > - * The called function has to take care of module refcounting. > - */ > -static int (*oldmem_pfn_is_ram)(unsigned long pfn); > - > -int register_oldmem_pfn_is_ram(int (*fn)(unsigned long pfn)) > -{ > - if (oldmem_pfn_is_ram) > - return -EBUSY; > - oldmem_pfn_is_ram = fn; > - return 0; > +static DECLARE_RWSEM(vmcore_cb_rwsem); > +/* List of registered vmcore callbacks. */ > +static LIST_HEAD(vmcore_cb_list); > +/* Whether we had a surprise unregistration of a callback. */ > +static bool vmcore_cb_unstable; > +/* Whether the vmcore has been opened once. */ > +static bool vmcore_opened; > + > +void register_vmcore_cb(struct vmcore_cb *cb) > +{ > + down_write(&vmcore_cb_rwsem); > + INIT_LIST_HEAD(&cb->next); > + list_add_tail(&cb->next, &vmcore_cb_list); > + /* > + * Registering a vmcore callback after the vmcore was opened is > + * very unusual (e.g., manual driver loading). > + */ > + if (vmcore_opened) > + pr_warn_once("Unexpected vmcore callback registration\n"); > + up_write(&vmcore_cb_rwsem); > } > -EXPORT_SYMBOL_GPL(register_oldmem_pfn_is_ram); > +EXPORT_SYMBOL_GPL(register_vmcore_cb); > > -void unregister_oldmem_pfn_is_ram(void) > +void unregister_vmcore_cb(struct vmcore_cb *cb) > { > - oldmem_pfn_is_ram = NULL; > - wmb(); > + down_write(&vmcore_cb_rwsem); > + list_del(&cb->next); > + /* > + * Unregistering a vmcore callback after the vmcore was opened is > + * very unusual (e.g., forced driver removal), but we cannot stop > + * unregistering. > + */ > + if (vmcore_opened) { > + pr_warn_once("Unexpected vmcore callback unregistration\n"); > + vmcore_cb_unstable = true; > + } > + up_write(&vmcore_cb_rwsem); > } > -EXPORT_SYMBOL_GPL(unregister_oldmem_pfn_is_ram); > +EXPORT_SYMBOL_GPL(unregister_vmcore_cb); > > static bool pfn_is_ram(unsigned long pfn) > { > - int (*fn)(unsigned long pfn); > - /* pfn is ram unless fn() checks pagetype */ > + struct vmcore_cb *cb; > bool ret = true; > > - /* > - * Ask hypervisor if the pfn is really ram. > - * A ballooned page contains no data and reading from such a page > - * will cause high load in the hypervisor. > - */ > - fn = oldmem_pfn_is_ram; > - if (fn) > - ret = !!fn(pfn); > + lockdep_assert_held_read(&vmcore_cb_rwsem); > + if (unlikely(vmcore_cb_unstable)) > + return false; > + > + list_for_each_entry(cb, &vmcore_cb_list, next) { > + if (unlikely(!cb->pfn_is_ram)) > + continue; > + ret = cb->pfn_is_ram(cb, pfn); > + if (!ret) > + break; > + } > > return ret; > } > > +static int open_vmcore(struct inode *inode, struct file *file) > +{ > + down_read(&vmcore_cb_rwsem); > + vmcore_opened = true; > + up_read(&vmcore_cb_rwsem); > + > + return 0; > +} > + > /* Reads a page from the oldmem device from given offset. */ > ssize_t read_from_oldmem(char *buf, size_t count, > u64 *ppos, int userbuf, > @@ -117,6 +146,7 @@ ssize_t read_from_oldmem(char *buf, size > offset = (unsigned long)(*ppos % PAGE_SIZE); > pfn = (unsigned long)(*ppos / PAGE_SIZE); > > + down_read(&vmcore_cb_rwsem); > do { > if (count > (PAGE_SIZE - offset)) > nr_bytes = PAGE_SIZE - offset; > @@ -136,8 +166,10 @@ ssize_t read_from_oldmem(char *buf, size > tmp = copy_oldmem_page(pfn, buf, nr_bytes, > offset, userbuf); > > - if (tmp < 0) > + if (tmp < 0) { > + up_read(&vmcore_cb_rwsem); > return tmp; > + } > } > *ppos += nr_bytes; > count -= nr_bytes; > @@ -147,6 +179,7 @@ ssize_t read_from_oldmem(char *buf, size > offset = 0; > } while (count); > > + up_read(&vmcore_cb_rwsem); > return read; > } > > @@ -537,14 +570,19 @@ static int vmcore_remap_oldmem_pfn(struc > unsigned long from, unsigned long pfn, > unsigned long size, pgprot_t prot) > { > + int ret; > + > /* > * Check if oldmem_pfn_is_ram was registered to avoid > * looping over all pages without a reason. > */ > - if (oldmem_pfn_is_ram) > - return remap_oldmem_pfn_checked(vma, from, pfn, size, prot); > + down_read(&vmcore_cb_rwsem); > + if (!list_empty(&vmcore_cb_list) || vmcore_cb_unstable) > + ret = remap_oldmem_pfn_checked(vma, from, pfn, size, prot); > else > - return remap_oldmem_pfn_range(vma, from, pfn, size, prot); > + ret = remap_oldmem_pfn_range(vma, from, pfn, size, prot); > + up_read(&vmcore_cb_rwsem); > + return ret; > } > > static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) > @@ -668,6 +706,7 @@ static int mmap_vmcore(struct file *file > #endif > > static const struct proc_ops vmcore_proc_ops = { > + .proc_open = open_vmcore, > .proc_read = read_vmcore, > .proc_lseek = default_llseek, > .proc_mmap = mmap_vmcore, > --- a/include/linux/crash_dump.h~proc-vmcore-convert-oldmem_pfn_is_ram-callback-to-more-generic-vmcore-callbacks > +++ a/include/linux/crash_dump.h > @@ -91,9 +91,29 @@ static inline void vmcore_unusable(void) > elfcorehdr_addr = ELFCORE_ADDR_ERR; > } > > -#define HAVE_OLDMEM_PFN_IS_RAM 1 > -extern int register_oldmem_pfn_is_ram(int (*fn)(unsigned long pfn)); > -extern void unregister_oldmem_pfn_is_ram(void); > +/** > + * struct vmcore_cb - driver callbacks for /proc/vmcore handling > + * @pfn_is_ram: check whether a PFN really is RAM and should be accessed when > + * reading the vmcore. Will return "true" if it is RAM or if the > + * callback cannot tell. If any callback returns "false", it's not > + * RAM and the page must not be accessed; zeroes should be > + * indicated in the vmcore instead. For example, a ballooned page > + * contains no data and reading from such a page will cause high > + * load in the hypervisor. > + * @next: List head to manage registered callbacks internally; initialized by > + * register_vmcore_cb(). > + * > + * vmcore callbacks allow drivers managing physical memory ranges to > + * coordinate with vmcore handling code, for example, to prevent accessing > + * physical memory ranges that should not be accessed when reading the vmcore, > + * although included in the vmcore header as memory ranges to dump. > + */ > +struct vmcore_cb { > + bool (*pfn_is_ram)(struct vmcore_cb *cb, unsigned long pfn); > + struct list_head next; > +}; > +extern void register_vmcore_cb(struct vmcore_cb *cb); > +extern void unregister_vmcore_cb(struct vmcore_cb *cb); > > #else /* !CONFIG_CRASH_DUMP */ > static inline bool is_kdump_kernel(void) { return 0; } > _ >