On Mon, Mar 25, 2024 at 11:24:34PM +0800, Nanyong Sun wrote:
On 2024/3/14 7:32, David Rientjes wrote:
On Thu, 8 Feb 2024, Will Deacon wrote:
How about take a new lock with irq disabled during BBM, like:
+void vmemmap_update_pte(unsigned long addr, pte_t *ptep, pte_t pte)
+{
+ (NEW_LOCK);
+ pte_clear(&init_mm, addr, ptep);
+ flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
+ set_pte_at(&init_mm, addr, ptep, pte);
+ spin_unlock_irq(NEW_LOCK);
+}
I really think the only maintainable way to achieve this is to avoid the
possibility of a fault altogether.
Will
Nanyong, are you still actively working on making HVO possible on arm64?
This would yield a substantial memory savings on hosts that are largely
configured with hugetlbfs. In our case, the size of this hugetlbfs pool
is actually never changed after boot, but it sounds from the thread that
there was an idea to make HVO conditional on FEAT_BBM. Is this being
pursued?
If so, any testing help needed?
I'm afraid that FEAT_BBM may not solve the problem here
I think so too -- I came cross this while working on TAO [1].
[1] https://lore.kernel.org/20240229183436.4110845-4-yuzhao@xxxxxxxxxx/
because from Arm
ARM,
I see that FEAT_BBM is only used for changing block size. Therefore, in this
HVO feature,
it can work in the split PMD stage, that is, BBM can be avoided in
vmemmap_split_pmd,
but in the subsequent vmemmap_remap_pte, the Output address of PTE still
needs to be
changed. I'm afraid FEAT_BBM is not competent for this stage. Perhaps my
understanding
of ARM FEAT_BBM is wrong, and I hope someone can correct me.
Actually, the solution I first considered was to use the stop_machine
method, but we have
products that rely on /proc/sys/vm/nr_overcommit_hugepages to dynamically
use hugepages,
so I have to consider performance issues. If your product does not change
the amount of huge
pages after booting, using stop_machine() may be a feasible way.
So far, I still haven't come up with a good solution.
I do have a patch that's similar to stop_machine() -- it uses NMI IPIs
to pause/resume remote CPUs while the local one is doing BBM.
Note that the problem of updating vmemmap for struct page[], as I see
it, is beyond hugeTLB HVO. I think it impacts virtio-mem and memory
hot removal in general [2]. On arm64, we would need to support BBM on
vmemmap so that we can fix the problem with offlining memory (or to be
precise, unmapping offlined struct page[]), by mapping offlined struct
page[] to a read-only page of dummy struct page[], similar to
ZERO_PAGE(). (Or we would have to make extremely invasive changes to
the reader side, i.e., all speculative PFN walkers.)
In case you are interested in testing my approach, you can swap your
patch 2 with the following:
[2] https://lore.kernel.org/20240621213717.1099079-1-yuzhao@xxxxxxxxxx/
diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h
index 8ff5f2a2579e..1af1aa34a351 100644
--- a/arch/arm64/include/asm/pgalloc.h
+++ b/arch/arm64/include/asm/pgalloc.h
@@ -12,6 +12,7 @@
#include <asm/processor.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
+#include <asm/cpu.h>
#define __HAVE_ARCH_PGD_FREE
#define __HAVE_ARCH_PUD_FREE
@@ -137,4 +138,58 @@ pmd_populate(struct mm_struct *mm, pmd_t *pmdp, pgtable_t ptep)
__pmd_populate(pmdp, page_to_phys(ptep), PMD_TYPE_TABLE | PMD_TABLE_PXN);
}
+#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
+
+#define vmemmap_update_lock vmemmap_update_lock
+static inline void vmemmap_update_lock(void)
+{
+ cpus_read_lock();
+}
+
+#define vmemmap_update_unlock vmemmap_update_unlock
+static inline void vmemmap_update_unlock(void)
+{
+ cpus_read_unlock();
+}
+
+#define vmemmap_update_pte vmemmap_update_pte
+static inline void vmemmap_update_pte(unsigned long addr, pte_t *ptep, pte_t pte)
+{
+ preempt_disable();
+ pause_remote_cpus();
+
+ pte_clear(&init_mm, addr, ptep);
+ flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
+ set_pte_at(&init_mm, addr, ptep, pte);
+
+ resume_remote_cpus();
+ preempt_enable();
+}
+
+#define vmemmap_update_pmd vmemmap_update_pmd
+static inline void vmemmap_update_pmd(unsigned long addr, pmd_t *pmdp, pte_t *ptep)
+{
+ preempt_disable();
+ pause_remote_cpus();
+
+ pmd_clear(pmdp);
+ flush_tlb_kernel_range(addr, addr + PMD_SIZE);
+ pmd_populate_kernel(&init_mm, pmdp, ptep);
+
+ resume_remote_cpus();
+ preempt_enable();
+}
+
+#define vmemmap_flush_tlb_all vmemmap_flush_tlb_all
+static inline void vmemmap_flush_tlb_all(void)
+{
+}
+
+#define vmemmap_flush_tlb_range vmemmap_flush_tlb_range
+static inline void vmemmap_flush_tlb_range(unsigned long start, unsigned long end)
+{
+}
+
+#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */
+
#endif
diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h
index efb13112b408..544b15948b64 100644
--- a/arch/arm64/include/asm/smp.h
+++ b/arch/arm64/include/asm/smp.h
@@ -144,6 +144,9 @@ bool cpus_are_stuck_in_kernel(void);
extern void crash_smp_send_stop(void);
extern bool smp_crash_stop_failed(void);
+void pause_remote_cpus(void);
+void resume_remote_cpus(void);
+
#endif /* ifndef __ASSEMBLY__ */
#endif /* ifndef __ASM_SMP_H */
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 31c8b3094dd7..ae0a178db066 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -71,16 +71,25 @@ enum ipi_msg_type {
IPI_RESCHEDULE,
IPI_CALL_FUNC,
IPI_CPU_STOP,
+ IPI_CPU_PAUSE,
+#ifdef CONFIG_KEXEC_CORE
IPI_CPU_CRASH_STOP,
+#endif
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
IPI_TIMER,
+#endif
+#ifdef CONFIG_IRQ_WORK
IPI_IRQ_WORK,
+#endif
NR_IPI,
/*
* Any enum >= NR_IPI and < MAX_IPI is special and not tracable
* with trace_ipi_*
*/
IPI_CPU_BACKTRACE = NR_IPI,
+#ifdef CONFIG_KGDB
IPI_KGDB_ROUNDUP,
+#endif
MAX_IPI
};
@@ -771,9 +780,16 @@ static const char *ipi_types[NR_IPI] __tracepoint_string = {
[IPI_RESCHEDULE] = "Rescheduling interrupts",
[IPI_CALL_FUNC] = "Function call interrupts",
[IPI_CPU_STOP] = "CPU stop interrupts",
+ [IPI_CPU_PAUSE] = "CPU pause interrupts",
+#ifdef CONFIG_KEXEC_CORE
[IPI_CPU_CRASH_STOP] = "CPU stop (for crash dump) interrupts",
+#endif
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
[IPI_TIMER] = "Timer broadcast interrupts",
+#endif
+#ifdef CONFIG_IRQ_WORK
[IPI_IRQ_WORK] = "IRQ work interrupts",
+#endif
};
static void smp_cross_call(const struct cpumask *target, unsigned int ipinr);
@@ -832,6 +848,85 @@ void __noreturn panic_smp_self_stop(void)
local_cpu_stop();
}
+static DEFINE_SPINLOCK(cpu_pause_lock);
+static cpumask_t paused_cpus;
+static cpumask_t resumed_cpus;
+
+static void pause_local_cpu(void)
+{
+ int cpu = smp_processor_id();
+
+ cpumask_clear_cpu(cpu, &resumed_cpus);
+ /*
+ * Paired with pause_remote_cpus() to confirm that this CPU not only
+ * will be paused but also can be reliably resumed.
+ */
+ smp_wmb();
+ cpumask_set_cpu(cpu, &paused_cpus);
+ /* A typical example for sleep and wake-up functions. */
+ smp_mb();
+ while (!cpumask_test_cpu(cpu, &resumed_cpus)) {
+ wfe();
+ barrier();
+ }
+ barrier();
+ cpumask_clear_cpu(cpu, &paused_cpus);
+}
+
+void pause_remote_cpus(void)
+{
+ cpumask_t cpus_to_pause;
+
+ lockdep_assert_cpus_held();
+ lockdep_assert_preemption_disabled();
+
+ cpumask_copy(&cpus_to_pause, cpu_online_mask);
+ cpumask_clear_cpu(smp_processor_id(), &cpus_to_pause);
+
+ spin_lock(&cpu_pause_lock);
+
+ WARN_ON_ONCE(!cpumask_empty(&paused_cpus));
+
+ smp_cross_call(&cpus_to_pause, IPI_CPU_PAUSE);
+
+ while (!cpumask_equal(&cpus_to_pause, &paused_cpus)) {
+ cpu_relax();
+ barrier();
+ }
+ /*
+ * Paired pause_local_cpu() to confirm that all CPUs not only will be
+ * paused but also can be reliably resumed.
+ */
+ smp_rmb();
+ WARN_ON_ONCE(cpumask_intersects(&cpus_to_pause, &resumed_cpus));
+
+ spin_unlock(&cpu_pause_lock);
+}
+
+void resume_remote_cpus(void)
+{
+ cpumask_t cpus_to_resume;
+
+ lockdep_assert_cpus_held();
+ lockdep_assert_preemption_disabled();
+
+ cpumask_copy(&cpus_to_resume, cpu_online_mask);
+ cpumask_clear_cpu(smp_processor_id(), &cpus_to_resume);
+
+ spin_lock(&cpu_pause_lock);
+
+ cpumask_setall(&resumed_cpus);
+ /* A typical example for sleep and wake-up functions. */
+ smp_mb();
+ while (cpumask_intersects(&cpus_to_resume, &paused_cpus)) {
+ sev();
+ cpu_relax();
+ barrier();
+ }
+
+ spin_unlock(&cpu_pause_lock);
+}
+
#ifdef CONFIG_KEXEC_CORE
static atomic_t waiting_for_crash_ipi = ATOMIC_INIT(0);
#endif
@@ -911,6 +1006,11 @@ static void do_handle_IPI(int ipinr)
local_cpu_stop();
break;
+ case IPI_CPU_PAUSE:
+ pause_local_cpu();
+ break;
+
+#ifdef CONFIG_KEXEC_CORE
case IPI_CPU_CRASH_STOP:
if (IS_ENABLED(CONFIG_KEXEC_CORE)) {
ipi_cpu_crash_stop(cpu, get_irq_regs());
@@ -918,6 +1018,7 @@ static void do_handle_IPI(int ipinr)
unreachable();
}
break;
+#endif
#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
case IPI_TIMER:
@@ -939,9 +1040,11 @@ static void do_handle_IPI(int ipinr)
nmi_cpu_backtrace(get_irq_regs());
break;
+#ifdef CONFIG_KGDB
case IPI_KGDB_ROUNDUP:
kgdb_nmicallback(cpu, get_irq_regs());
break;
+#endif
default:
pr_crit("CPU%u: Unknown IPI message 0x%x\n", cpu, ipinr);
@@ -971,9 +1074,14 @@ static bool ipi_should_be_nmi(enum ipi_msg_type ipi)
switch (ipi) {
case IPI_CPU_STOP:
+ case IPI_CPU_PAUSE:
+#ifdef CONFIG_KEXEC_CORE
case IPI_CPU_CRASH_STOP:
+#endif
case IPI_CPU_BACKTRACE:
+#ifdef CONFIG_KGDB
case IPI_KGDB_ROUNDUP:
+#endif
return true;
default:
return false;
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 5113753f3ac9..da6f2a7d665e 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -46,6 +46,18 @@ struct vmemmap_remap_walk {
unsigned long flags;
};
+#ifndef vmemmap_update_lock
+static void vmemmap_update_lock(void)
+{
+}
+#endif
+
+#ifndef vmemmap_update_unlock
+static void vmemmap_update_unlock(void)
+{
+}
+#endif
+
#ifndef vmemmap_update_pmd
static inline void vmemmap_update_pmd(unsigned long addr,
pmd_t *pmdp, pte_t *ptep)
@@ -194,10 +206,12 @@ static int vmemmap_remap_range(unsigned long start, unsigned long end,
VM_BUG_ON(!PAGE_ALIGNED(start | end));
+ vmemmap_update_lock();
mmap_read_lock(&init_mm);
ret = walk_page_range_novma(&init_mm, start, end, &vmemmap_remap_ops,
NULL, walk);
mmap_read_unlock(&init_mm);
+ vmemmap_update_unlock();
if (ret)
return ret;
.