On Mon, 7 Feb 2022 11:56:11 +0100 Janosch Frank <frankja@xxxxxxxxxxxxx> wrote: > On 2/7/22 11:47, Claudio Imbrenda wrote: > > On Mon, 7 Feb 2022 11:02:28 +0100 > > Janosch Frank <frankja@xxxxxxxxxxxxx> wrote: > > > >> On 2/4/22 16:53, Claudio Imbrenda wrote: > >>> Refactor s390_reset_acc so that it can be reused in upcoming patches. > >>> > >>> We don't want to hold all the locks used in a walk_page_range for too > >>> long, and the destroy page UVC does take some time to complete. > >>> Therefore we quickly gather the pages to destroy, and then destroy them > >>> without holding all the locks. > >>> > >>> The new refactored function optionally allows to return early without > >>> completing if a fatal signal is pending (and return and appropriate > >>> error code). Two wrappers are provided to call the new function. > >>> > >>> Signed-off-by: Claudio Imbrenda <imbrenda@xxxxxxxxxxxxx> > >>> (dropping Janosch's Ack because of major changes to the patch) > >>> --- > >>> arch/s390/include/asm/gmap.h | 36 +++++++++++++- > >>> arch/s390/kvm/pv.c | 12 ++++- > >>> arch/s390/mm/gmap.c | 95 +++++++++++++++++++++++++----------- > >>> 3 files changed, 111 insertions(+), 32 deletions(-) > >>> > >>> diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h > >>> index 746e18bf8984..2a913014f63c 100644 > >>> --- a/arch/s390/include/asm/gmap.h > >>> +++ b/arch/s390/include/asm/gmap.h > >>> @@ -147,7 +147,41 @@ int gmap_mprotect_notify(struct gmap *, unsigned long start, > >>> void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long dirty_bitmap[4], > >>> unsigned long gaddr, unsigned long vmaddr); > >>> int gmap_mark_unmergeable(void); > >>> -void s390_reset_acc(struct mm_struct *mm); > >>> void s390_remove_old_asce(struct gmap *gmap); > >>> int s390_replace_asce(struct gmap *gmap); > >>> +void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns); > >>> +int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start, > >>> + unsigned long end, int interruptible); > >> > >> s/int/bool/ ? > > > > I don't know, I like the fact that the function could return > > different error codes in the future if needed. > > I meant the interruptible parameter or do you expect to have various > levels of interruptibility (is that even a word)? ooohhh, that one. I guess it could be a bool, yes > > > > >> > >>> + > >>> +/** > >>> + * s390_uv_destroy_range - Destroy a range of pages in the given mm. > >>> + * @mm the mm on which to operate on > >>> + * @start the start of the range > >>> + * @end the end of the range > >>> + * > >>> + * This call will call cond_sched, so it should not generate stalls, but it > >> > >> This function will call ? > > > > will fix > > > >> > >>> + * will otherwise only return when it completed. > >>> + */ > >>> +static inline void s390_uv_destroy_range(struct mm_struct *mm, unsigned long start, > >>> + unsigned long end) > >>> +{ > >>> + (void)__s390_uv_destroy_range(mm, start, end, 0); > >>> +} > >>> + > >>> +/** > >>> + * s390_uv_destroy_range_interruptibe - Destroy a range of pages in the > >> > >> interruptible > > > > will fix > > > >> > >>> + * given mm, but stop when a fatal signal is received. > >>> + * @mm the mm on which to operate on > >>> + * @start the start of the range > >>> + * @end the end of the range > >>> + * > >>> + * This call will call cond_sched, so it should not generate stalls. It > >>> + * will return -EINTR if a fatal signal is received, or 0 if the whole range > >>> + * has been destroyed. > >>> + */ > >>> +static inline int s390_uv_destroy_range_interruptible(struct mm_struct *mm, unsigned long start, > >>> + unsigned long end) > >>> +{ > >>> + return __s390_uv_destroy_range(mm, start, end, 1); > >>> +} > >>> #endif /* _ASM_S390_GMAP_H */ > >>> diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c > >>> index 3c59ef763dde..2ab22500e092 100644 > >>> --- a/arch/s390/kvm/pv.c > >>> +++ b/arch/s390/kvm/pv.c > >>> @@ -12,6 +12,8 @@ > >>> #include <asm/gmap.h> > >>> #include <asm/uv.h> > >>> #include <asm/mman.h> > >>> +#include <linux/pagewalk.h> > >>> +#include <linux/sched/mm.h> > >>> #include "kvm-s390.h" > >>> > >>> int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc) > >>> @@ -157,8 +159,14 @@ int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc) > >>> { > >>> int cc; > >>> > >>> - /* make all pages accessible before destroying the guest */ > >>> - s390_reset_acc(kvm->mm); > >>> + /* > >>> + * if the mm still has a mapping, make all its pages accessible > >>> + * before destroying the guest > >>> + */ > >>> + if (mmget_not_zero(kvm->mm)) { > >>> + s390_uv_destroy_range(kvm->mm, 0, TASK_SIZE); > >>> + mmput(kvm->mm); > >>> + } > >>> > >>> cc = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm), > >>> UVC_CMD_DESTROY_SEC_CONF, rc, rrc); > >>> diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c > >>> index ce6cac4463f2..6eb5acb4be3d 100644 > >>> --- a/arch/s390/mm/gmap.c > >>> +++ b/arch/s390/mm/gmap.c > >>> @@ -2676,44 +2676,81 @@ void s390_reset_cmma(struct mm_struct *mm) > >>> } > >>> EXPORT_SYMBOL_GPL(s390_reset_cmma); > >>> > >>> -/* > >>> - * make inaccessible pages accessible again > >>> - */ > >>> -static int __s390_reset_acc(pte_t *ptep, unsigned long addr, > >>> - unsigned long next, struct mm_walk *walk) > >>> +#define DESTROY_LOOP_THRESHOLD 32 > >>> + > >>> +struct reset_walk_state { > >>> + unsigned long next; > >>> + unsigned long count; > >>> + unsigned long pfns[DESTROY_LOOP_THRESHOLD]; > >>> +}; > >>> + > >>> +static int s390_gather_pages(pte_t *ptep, unsigned long addr, > >>> + unsigned long next, struct mm_walk *walk) > >>> { > >>> + struct reset_walk_state *p = walk->private; > >>> pte_t pte = READ_ONCE(*ptep); > >>> > >>> - /* There is a reference through the mapping */ > >>> - if (pte_present(pte)) > >>> - WARN_ON_ONCE(uv_destroy_owned_page(pte_val(pte) & PAGE_MASK)); > >>> - > >>> - return 0; > >>> + if (pte_present(pte)) { > >>> + /* we have a reference from the mapping, take an extra one */ > >>> + get_page(phys_to_page(pte_val(pte))); > >>> + p->pfns[p->count] = phys_to_pfn(pte_val(pte)); > >>> + p->next = next; > >>> + p->count++; > >>> + } > >>> + return p->count >= DESTROY_LOOP_THRESHOLD; > >>> } > >>> > >>> -static const struct mm_walk_ops reset_acc_walk_ops = { > >>> - .pte_entry = __s390_reset_acc, > >>> +static const struct mm_walk_ops gather_pages_ops = { > >>> + .pte_entry = s390_gather_pages, > >>> }; > >>> > >>> -#include <linux/sched/mm.h> > >>> -void s390_reset_acc(struct mm_struct *mm) > >>> +/* > >>> + * Call the Destroy secure page UVC on each page in the given array of PFNs. > >>> + * Each page needs to have an extra reference, which will be released here. > >>> + */ > >>> +void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns) > >>> { > >>> - if (!mm_is_protected(mm)) > >>> - return; > >>> - /* > >>> - * we might be called during > >>> - * reset: we walk the pages and clear > >>> - * close of all kvm file descriptors: we walk the pages and clear > >>> - * exit of process on fd closure: vma already gone, do nothing > >>> - */ > >>> - if (!mmget_not_zero(mm)) > >>> - return; > >>> - mmap_read_lock(mm); > >>> - walk_page_range(mm, 0, TASK_SIZE, &reset_acc_walk_ops, NULL); > >>> - mmap_read_unlock(mm); > >>> - mmput(mm); > >>> + unsigned long i; > >>> + > >>> + for (i = 0; i < count; i++) { > >>> + /* we always have an extra reference */ > >>> + uv_destroy_owned_page(pfn_to_phys(pfns[i])); > >>> + /* get rid of the extra reference */ > >>> + put_page(pfn_to_page(pfns[i])); > >>> + cond_resched(); > >>> + } > >>> +} > >>> +EXPORT_SYMBOL_GPL(s390_uv_destroy_pfns); > >>> + > >>> +/** > >>> + * __s390_uv_destroy_range - Walk the given range of the given address > >>> + * space, and call the destroy secure page UVC on each page. > >>> + * Optionally exit early if a fatal signal is pending. > >>> + * @mm the mm to operate on > >>> + * @start the start of the range > >>> + * @end the end of the range > >>> + * @interruptible if not 0, stop when a fatal signal is received > >>> + * Return: 0 on success, -EINTR if the function stopped before completing > >>> + */ > >>> +int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start, > >>> + unsigned long end, int interruptible) > >>> +{ > >>> + struct reset_walk_state state = { .next = start }; > >>> + int r = 1; > >>> + > >>> + while (r > 0) { > >>> + state.count = 0; > >>> + mmap_read_lock(mm); > >>> + r = walk_page_range(mm, state.next, end, &gather_pages_ops, &state); > >>> + mmap_read_unlock(mm); > >>> + cond_resched(); > >>> + s390_uv_destroy_pfns(state.count, state.pfns); > >>> + if (interruptible && fatal_signal_pending(current)) > >>> + return -EINTR; > >>> + } > >>> + return 0; > >>> } > >>> -EXPORT_SYMBOL_GPL(s390_reset_acc); > >>> +EXPORT_SYMBOL_GPL(__s390_uv_destroy_range); > >>> > >>> /** > >>> * s390_remove_old_asce - Remove the topmost level of page tables from the > >>> > >> > > >