On 1/17/2023 11:15 PM, Raghavendra K T wrote:
On 1/17/2023 8:29 PM, Mel Gorman wrote:
Note that the cc list is excessive for the topic.
Thank you, Mel, for the review. Sorry for the long list (it came from
get_maintainer). I will trim the list for V2.
(trimming the list early)
[...]
Nice idea. Thanks again. I will take this as a base patch for expansion.
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f3f196e4d66d..3cebda5cc8a7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -620,6 +620,9 @@ static inline void vma_init(struct vm_area_struct
*vma, struct mm_struct *mm)
vma->vm_mm = mm;
vma->vm_ops = &dummy_vm_ops;
INIT_LIST_HEAD(&vma->anon_vma_chain);
+#ifdef CONFIG_NUMA_BALANCING
+ vma->numab = NULL;
+#endif
}
static inline void vma_set_anonymous(struct vm_area_struct *vma)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 3b8475007734..3c0cfdde33e0 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -526,6 +526,10 @@ struct anon_vma_name {
char name[];
};
+struct vma_numab {
+ unsigned long next_scan;
+};
+
/*
* This struct describes a virtual memory area. There is one of these
* per VM-area/task. A VM area is any part of the process virtual
memory
@@ -593,6 +597,9 @@ struct vm_area_struct {
#endif
#ifdef CONFIG_NUMA
struct mempolicy *vm_policy; /* NUMA policy for the VMA */
+#endif
+#ifdef CONFIG_NUMA_BALANCING
+ struct vma_numab *numab; /* NUMA Balancing state */
#endif
struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
} __randomize_layout;
diff --git a/kernel/fork.c b/kernel/fork.c
index 9f7fe3541897..2d34c484553d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -481,6 +481,9 @@ struct vm_area_struct *vm_area_dup(struct
vm_area_struct *orig)
void vm_area_free(struct vm_area_struct *vma)
{
+#ifdef CONFIG_NUMA_BALANCING
+ kfree(vma->numab);
+#endif
free_anon_vma_name(vma);
kmem_cache_free(vm_area_cachep, vma);
}
While running mmtests kernbench on a machine with 256 pCPUs, I hit a BUG()
(not otherwise reproducible in the normal boot flow):
[ 716.825398] kernel BUG at mm/slub.c:419!
[ 716.825736] invalid opcode: 0000 [#146] PREEMPT SMP NOPTI
[ 716.826042] CPU: 232 PID: 364844 Comm: cc1 Tainted: G D W
6.1.0-test-snp-host-a7065246cf78+ #44
[ 716.826345] Hardware name: Dell Inc. PowerEdge R6525/024PW1, BIOS
2.6.6 01/13/2022
[ 716.826645] RIP: 0010:__kmem_cache_free+0x2a4/0x2c0
[ 716.826941] Code: ff e9 32 ff ff ff 49 8b 47 08 f0 48 83 28 01 0f 85
9b fe ff ff 49 8b 47 08 4c 89 ff 48 8b 40 08 e8 a1 c5 cc 00 e9 86 fe ff
ff <0f> 0b 48 8b 15 63 d6 4d 01 e9 85 fd ff ff 66 66 2e 0f 1f 84 00 00
[ 716.827550] RSP: 0018:ffffb0b070547c28 EFLAGS: 00010246
[ 716.827865] RAX: ffff990fa6bf1310 RBX: ffff990fa6bf1310 RCX:
ffff990fa6bf1310
[ 716.828180] RDX: 00000000001000e8 RSI: 0000000000000000 RDI:
ffff98d000044200
[ 716.828503] RBP: ffffb0b070547c50 R08: ffff98d030f222e0 R09:
0000000000000001
[ 716.828821] R10: ffff990ff6d298b0 R11: ffff98d030f226a0 R12:
ffff98d000044200
[ 716.829139] R13: ffffd605c29afc40 R14: ffffffff9e89c20f R15:
ffffb0b070547d58
[ 716.829458] FS: 00007f05f4cebac0(0000) GS:ffff994e00800000(0000)
knlGS:0000000000000000
[ 716.829781] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 716.830105] CR2: 00007f05e9cbc002 CR3: 00000040eea7c005 CR4:
0000000000770ee0
[ 716.830432] PKRU: 55555554
[ 716.830749] Call Trace:
[ 716.831057] <TASK>
[ 716.831360] kfree+0x79/0x120
[ 716.831664] vm_area_free+0x1f/0x50
[ 716.831970] vma_expand+0x311/0x3e0
[ 716.832274] mmap_region+0x772/0x900
[ 716.832571] do_mmap+0x3c0/0x5e0
[ 716.832866] ? __this_cpu_preempt_check+0x13/0x20
[ 716.833165] ? security_mmap_file+0xa1/0xc0
[ 716.833458] vm_mmap_pgoff+0xd5/0x170
[ 716.833745] ksys_mmap_pgoff+0x46/0x210
[ 716.834022] __x64_sys_mmap+0x33/0x50
[ 716.834291] do_syscall_64+0x3b/0x90
[ 716.834549] entry_SYSCALL_64_after_hwframe+0x63/0xcd
[ 716.834806] RIP: 0033:0x7f05f471ebd7
[ 716.835054] Code: 00 00 00 89 ef e8 59 ae ff ff eb e4 e8 62 7b 01 00
66 90 f3 0f 1e fa 41 89 ca 41 f7 c1 ff 0f 00 00 75 10 b8 09 00 00 00 0f
05 <48> 3d 00 f0 ff ff 77 21 c3 48 8b 05 29 a2 0f 00 64 c7 00 16 00 00
[ 716.835567] RSP: 002b:00007fff24c27ae8 EFLAGS: 00000246 ORIG_RAX:
0000000000000009
[ 716.835826] RAX: ffffffffffffffda RBX: 0000000000200000 RCX:
00007f05f471ebd7
[ 716.836077] RDX: 0000000000000003 RSI: 0000000000200000 RDI:
0000000000000000
[ 716.836323] RBP: 0000000000000000 R08: 00000000ffffffff R09:
0000000000000000
[ 716.836567] R10: 0000000000000022 R11: 0000000000000246 R12:
0000000000000038
[ 716.836808] R13: 0000000000001fff R14: 0000000000000044 R15:
0000000000000048
[ 716.837049] </TASK>
[ 716.837285] Modules linked in: tls ipmi_ssif binfmt_misc
nls_iso8859_1 joydev input_leds intel_rapl_msr intel_rapl_common
amd64_edac edac_mce_amd hid_generic kvm_amd dell_smbios dcdbas wmi_bmof
dell_wmi_descriptor kvm usbhid hid ccp k10temp wmi ipmi_si ipmi_devintf
ipmi_msghandler acpi_power_meter mac_hid sch_fq_codel dm_multipath
scsi_dh_rdac scsi_dh_emc scsi_dh_alua msr efi_pstore ip_tables x_tables
autofs4 btrfs blake2b_generic zstd_compress raid10 raid456
async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq
libcrc32c raid1 raid0 multipath linear mgag200 drm_kms_helper
syscopyarea sysfillrect sysimgblt fb_sys_fops crct10dif_pclmul
i2c_algo_bit crc32_pclmul drm_shmem_helper ghash_clmulni_intel nvme
aesni_intel crypto_simd cryptd tg3 drm nvme_core megaraid_sas ahci
xhci_pci i2c_piix4 xhci_pci_renesas libahci
[ 716.839185] ---[ end trace 0000000000000000 ]---
It looks like we additionally have to handle numab initialization in the
vm_area_dup() code path. Something like the change below fixed it
(copy-pasted from a tty):
diff --git a/kernel/fork.c b/kernel/fork.c
index 08969f5aa38d..f5b2e41296c7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -475,12 +475,18 @@ struct vm_area_struct *vm_area_dup(struct
vm_area_struct *orig)
*new = data_race(*orig);
INIT_LIST_HEAD(&new->anon_vma_chain);
dup_anon_vma_name(orig, new);
+#ifdef CONFIG_NUMA_BALANCING
+ new->numab = NULL;
+#endif
}
return new;
}
Does this look okay? If so, I will fold it into the V2 spin (in the
vma_scan_delay patch), hoping you are okay with this change and do not
see any other changes required.
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c36aa54ae071..6a1cffdfc76b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3027,6 +3027,23 @@ static void task_numa_work(struct callback_head
*work)
if (!vma_is_accessible(vma))
continue;
+ /* Initialise new per-VMA NUMAB state. */
+ if (!vma->numab) {
+ vma->numab = kzalloc(sizeof(struct vma_numab), GFP_KERNEL);
+ if (!vma->numab)
+ continue;
+
+ vma->numab->next_scan = now +
+ msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
+ }
+
+ /*
+ * After the first scan is complete, delay the balancing scan
+ * for new VMAs.
+ */
+ if (mm->numa_scan_seq && time_before(jiffies,
vma->numab->next_scan))
+ continue;
+
do {
start = max(start, vma->vm_start);
end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);