Peter - not sure whether it's easy for you to make a simple adjustment to
this patch or if you want me to just send a v2, but I have to pop an
#ifdef CONFIG_MMU into the code.

I've indicated inline below where it's needed, but if it's a pain/not
possible for you to do a fixup in your tree I'm happy to just respin - let
me know what works for you.

Sorry about that!

On Thu, Nov 28, 2024 at 11:37:14AM +0000, Lorenzo Stoakes wrote:
> We are currently refactoring struct page to make it smaller, removing
> unneeded fields that correctly belong to struct folio.
>
> Two of those fields are page->index and page->mapping. Perf is currently
> making use of both of these, so this patch removes this usage as it turns
> out it is unnecessary.
>
> Perf establishes its own internally controlled memory-mapped pages using
> vm_ops hooks. The first page in the mapping is the read/write user control
> page, and the rest of the mapping consists of read-only pages.
>
> The VMA is backed by kernel memory either from the buddy allocator or
> vmalloc depending on configuration. It is intended to be mapped read/write,
> but because it has a page_mkwrite() hook, vma_wants_writenotify() indicates
> that it should be mapped read-only.
>
> When a write fault occurs, the provided page_mkwrite() hook,
> perf_mmap_fault() (doing double duty handling faults as well) uses the
> vmf->pgoff field to determine if this is the first page, allowing for the
> desired read/write first page, read-only rest mapping.
>
> For this to work the implementation has to carefully work around faulting
> logic. When a page is write-faulted, the fault() hook is called first, then
> its page_mkwrite() hook is called (to allow for dirty tracking in file
> systems).
>
> On fault we set the folio's mapping in perf_mmap_fault(); this is because
> when do_page_mkwrite() is subsequently invoked, it treats a missing mapping
> as an indicator that the fault should be retried.
>
> We also set the folio's index so, given the folio is being treated as faux
> user memory, it correctly references its offset within the VMA.
>
> This explains why the mapping and index fields are used - but it's not
> necessary.
>
> We preallocate pages when perf_mmap() is called for the first time via
> rb_alloc(), and further allocate auxiliary pages via rb_aux_alloc() as
> needed if the mapping requires it.
>
> This allocation is done in the f_ops->mmap() hook provided in perf_mmap(),
> and so we can instead simply map all the memory right away here - there's
> no point in handling (read) page faults when we don't demand-page nor need
> to be notified about them (perf does not).
>
> This patch therefore changes this logic to map everything when the mmap()
> hook is called, establishing a PFN map. It implements vm_ops->pfn_mkwrite()
> to provide the required read/write vs. read-only behaviour, which does not
> require the previously implemented workarounds.
>
> It makes sense semantically to establish a PFN map too - we are managing
> the pages internally and so it is appropriate to mark this as a special
> mapping.
>
> There should be no change to actual functionality as a result of this
> change.
>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@xxxxxxxxxx>
> ---
>  kernel/events/core.c        | 76 +++++++++++++++++++------------------
>  kernel/events/ring_buffer.c | 19 +---------
>  2 files changed, 40 insertions(+), 55 deletions(-)
>
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 5d4a54f50826..0754b070497f 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -6284,41 +6284,6 @@ void perf_event_update_userpage(struct perf_event *event)
>  }
>  EXPORT_SYMBOL_GPL(perf_event_update_userpage);
>
> -static vm_fault_t perf_mmap_fault(struct vm_fault *vmf)
> -{
> -	struct perf_event *event = vmf->vma->vm_file->private_data;
> -	struct perf_buffer *rb;
> -	vm_fault_t ret = VM_FAULT_SIGBUS;
> -
> -	if (vmf->flags & FAULT_FLAG_MKWRITE) {
> -		if (vmf->pgoff == 0)
> -			ret = 0;
> -		return ret;
> -	}
> -
> -	rcu_read_lock();
> -	rb = rcu_dereference(event->rb);
> -	if (!rb)
> -		goto unlock;
> -
> -	if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
> -		goto unlock;
> -
> -	vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
> -	if (!vmf->page)
> -		goto unlock;
> -
> -	get_page(vmf->page);
> -	vmf->page->mapping = vmf->vma->vm_file->f_mapping;
> -	vmf->page->index = vmf->pgoff;
> -
> -	ret = 0;
> -unlock:
> -	rcu_read_unlock();
> -
> -	return ret;
> -}
> -
>  static void ring_buffer_attach(struct perf_event *event,
>  			       struct perf_buffer *rb)
>  {
> @@ -6558,13 +6523,47 @@ static void perf_mmap_close(struct vm_area_struct *vma)
>  		ring_buffer_put(rb); /* could be last */
>  }
>
> +static vm_fault_t perf_mmap_pfn_mkwrite(struct vm_fault *vmf)
> +{
> +	/* The first page is the user control page, others are read-only. */
> +	return vmf->pgoff == 0 ? 0 : VM_FAULT_SIGBUS;
> +}
> +
>  static const struct vm_operations_struct perf_mmap_vmops = {
>  	.open		= perf_mmap_open,
>  	.close		= perf_mmap_close, /* non mergeable */
> -	.fault		= perf_mmap_fault,
> -	.page_mkwrite	= perf_mmap_fault,
> +	.pfn_mkwrite	= perf_mmap_pfn_mkwrite,
>  };
>
> +static int map_range(struct perf_buffer *rb, struct vm_area_struct *vma)
> +{
> +	unsigned long nr_pages = vma_pages(vma);
> +	int err = 0;
> +	unsigned long pgoff;
> +
> +	for (pgoff = 0; pgoff < nr_pages; pgoff++) {
> +		unsigned long va = vma->vm_start + PAGE_SIZE * pgoff;
> +		struct page *page = perf_mmap_to_page(rb, pgoff);
> +
> +		if (page == NULL) {
> +			err = -EINVAL;
> +			break;
> +		}
> +
> +		/* Map readonly, perf_mmap_pfn_mkwrite() called on write fault. */
> +		err = remap_pfn_range(vma, va, page_to_pfn(page), PAGE_SIZE,
> +				      vm_get_page_prot(vma->vm_flags & ~VM_SHARED));
> +		if (err)
> +			break;
> +	}
> +

Need a:

#ifdef CONFIG_MMU

> +	/* Clear any partial mappings on error. */
> +	if (err)
> +		zap_page_range_single(vma, vma->vm_start, nr_pages * PAGE_SIZE, NULL);

#endif

Here to work around the wonders of nommu :)

> +
> +	return err;
> +}
> +
>  static int perf_mmap(struct file *file, struct vm_area_struct *vma)
>  {
>  	struct perf_event *event = file->private_data;
> @@ -6783,6 +6782,9 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
>  	vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP);
>  	vma->vm_ops = &perf_mmap_vmops;
>
> +	if (!ret)
> +		ret = map_range(rb, vma);
> +
>  	if (event->pmu->event_mapped)
>  		event->pmu->event_mapped(event, vma->vm_mm);
>
> diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
> index 4f46f688d0d4..180509132d4b 100644
> --- a/kernel/events/ring_buffer.c
> +++ b/kernel/events/ring_buffer.c
> @@ -643,7 +643,6 @@ static void rb_free_aux_page(struct perf_buffer *rb, int idx)
>  	struct page *page = virt_to_page(rb->aux_pages[idx]);
>
>  	ClearPagePrivate(page);
> -	page->mapping = NULL;
>  	__free_page(page);
>  }
>
> @@ -819,7 +818,6 @@ static void perf_mmap_free_page(void *addr)
>  {
>  	struct page *page = virt_to_page(addr);
>
> -	page->mapping = NULL;
>  	__free_page(page);
>  }
>
> @@ -890,28 +888,13 @@ __perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff)
>  	return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE);
>  }
>
> -static void perf_mmap_unmark_page(void *addr)
> -{
> -	struct page *page = vmalloc_to_page(addr);
> -
> -	page->mapping = NULL;
> -}
> -
>  static void rb_free_work(struct work_struct *work)
>  {
>  	struct perf_buffer *rb;
> -	void *base;
> -	int i, nr;
>
>  	rb = container_of(work, struct perf_buffer, work);
> -	nr = data_page_nr(rb);
> -
> -	base = rb->user_page;
> -	/* The '<=' counts in the user page. */
> -	for (i = 0; i <= nr; i++)
> -		perf_mmap_unmark_page(base + (i * PAGE_SIZE));
>
> -	vfree(base);
> +	vfree(rb->user_page);
>  	kfree(rb);
>  }
>
> --
> 2.47.0
>
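
To spell out exactly what I mean, map_range() with the fixup applied would
end up looking like the below - the same code as in the patch above, just
with the guard added (untested sketch, but as far as I know
zap_page_range_single() simply isn't built for nommu, hence the #ifdef):

static int map_range(struct perf_buffer *rb, struct vm_area_struct *vma)
{
	unsigned long nr_pages = vma_pages(vma);
	int err = 0;
	unsigned long pgoff;

	for (pgoff = 0; pgoff < nr_pages; pgoff++) {
		unsigned long va = vma->vm_start + PAGE_SIZE * pgoff;
		struct page *page = perf_mmap_to_page(rb, pgoff);

		if (page == NULL) {
			err = -EINVAL;
			break;
		}

		/* Map readonly, perf_mmap_pfn_mkwrite() called on write fault. */
		err = remap_pfn_range(vma, va, page_to_pfn(page), PAGE_SIZE,
				      vm_get_page_prot(vma->vm_flags & ~VM_SHARED));
		if (err)
			break;
	}

#ifdef CONFIG_MMU
	/* Clear any partial mappings on error. */
	if (err)
		zap_page_range_single(vma, vma->vm_start, nr_pages * PAGE_SIZE, NULL);
#endif

	return err;
}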