This adds two different drivers inside drivers/char/ directory under two new kernel config options COHERENT_HOTPLUG_DEMO and COHERENT_MEMORY_DEMO. 1) coherent_hotplug_demo: Detects, hotplugs the coherent device memory 2) coherent_memory_demo: Exports debugfs interface for VMA migrations Signed-off-by: Anshuman Khandual <khandual@xxxxxxxxxxxxxxxxxx> --- drivers/char/Kconfig | 23 +++ drivers/char/Makefile | 2 + drivers/char/coherent_hotplug_demo.c | 133 ++++++++++++++ drivers/char/coherent_memory_demo.c | 337 +++++++++++++++++++++++++++++++++++ drivers/char/memory_online_sysfs.h | 148 +++++++++++++++ mm/migrate.c | 10 ++ 6 files changed, 653 insertions(+) create mode 100644 drivers/char/coherent_hotplug_demo.c create mode 100644 drivers/char/coherent_memory_demo.c create mode 100644 drivers/char/memory_online_sysfs.h diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig index dcc0973..22c538d 100644 --- a/drivers/char/Kconfig +++ b/drivers/char/Kconfig @@ -588,6 +588,29 @@ config TILE_SROM device appear much like a simple EEPROM, and knows how to partition a single ROM for multiple purposes. +config COHERENT_HOTPLUG_DEMO + tristate "Demo driver to test coherent memory node hotplug" + depends on PPC64 || COHERENT_DEVICE + default n + help + Say yes when you want to build a test driver to hotplug all + the coherent memory nodes present on the system. This driver + scans through the device tree, checks on "ibm,memory-device" + property device nodes and onlines its memory. When unloaded, + it goes through the list of memory ranges it onlined before + and offlines them one by one. If not sure, select N. + +config COHERENT_MEMORY_DEMO + tristate "Demo driver to test coherent memory node functionality" + depends on PPC64 || COHERENT_DEVICE + default n + help + Say yes when you want to build a test driver to demonstrate + the coherent memory functionalities, capabilities and probable + utilization. 
It also exports a debugfs file to accept inputs for + virtual address range migration for any process. If not sure, + select N. + source "drivers/char/xillybus/Kconfig" endmenu diff --git a/drivers/char/Makefile b/drivers/char/Makefile index 6e6c244..92fa338 100644 --- a/drivers/char/Makefile +++ b/drivers/char/Makefile @@ -60,3 +60,5 @@ js-rtc-y = rtc.o obj-$(CONFIG_TILE_SROM) += tile-srom.o obj-$(CONFIG_XILLYBUS) += xillybus/ obj-$(CONFIG_POWERNV_OP_PANEL) += powernv-op-panel.o +obj-$(CONFIG_COHERENT_HOTPLUG_DEMO) += coherent_hotplug_demo.o +obj-$(CONFIG_COHERENT_MEMORY_DEMO) += coherent_memory_demo.o diff --git a/drivers/char/coherent_hotplug_demo.c b/drivers/char/coherent_hotplug_demo.c new file mode 100644 index 0000000..3670081 --- /dev/null +++ b/drivers/char/coherent_hotplug_demo.c @@ -0,0 +1,133 @@ +/* + * Memory hotplug support for coherent memory nodes in runtime. + * + * Copyright (C) 2016, Reza Arbab, IBM Corporation. + * Copyright (C) 2016, Anshuman Khandual, IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ +#include <linux/of.h> +#include <linux/export.h> +#include <linux/spinlock.h> +#include <linux/init.h> +#include <linux/memblock.h> +#include <linux/module.h> +#include <linux/memory.h> +#include <linux/sizes.h> +#include <linux/bitops.h> +#include <linux/device.h> +#include <linux/fs.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/migrate.h> +#include <linux/memblock.h> +#include <linux/uaccess.h> + +#include <asm/mmu.h> +#include <asm/pgalloc.h> +#include "memory_online_sysfs.h" + +#define MAX_HOTADD_NODES 100 +phys_addr_t addr[MAX_HOTADD_NODES][2]; +int nr_addr; + +/* + * extern int memory_failure(unsigned long pfn, int trapno, int flags); + * extern int min_free_kbytes; + * extern int user_min_free_kbytes; + * + * extern unsigned long nr_kernel_pages; + * extern unsigned long nr_all_pages; + * extern unsigned long dma_reserve; + */ + +static void dump_core_vm_tunables(void) +{ +/* + * printk(":::::::: VM TUNABLES :::::::\n"); + * printk("[min_free_kbytes] %d\n", min_free_kbytes); + * printk("[user_min_free_kbytes] %d\n", user_min_free_kbytes); + * printk("[nr_kernel_pages] %ld\n", nr_kernel_pages); + * printk("[nr_all_pages] %ld\n", nr_all_pages); + * printk("[dma_reserve] %ld\n", dma_reserve); + */ +} + + + +static int online_coherent_memory(void) +{ + struct device_node *memory; + + nr_addr = 0; + disable_auto_online(); + dump_core_vm_tunables(); + for_each_compatible_node(memory, NULL, "ibm,memory-device") { + struct device_node *mem; + const __be64 *reg; + unsigned int len, ret; + phys_addr_t start, size; + + mem = of_parse_phandle(memory, "memory-region", 0); + if (!mem) { + pr_info("memory-region property not found\n"); + return -1; + } + + reg = of_get_property(mem, "reg", &len); + if (!reg || len <= 0) { + pr_info("memory-region property not found\n"); + return -1; + } + start = be64_to_cpu(*reg); + size = be64_to_cpu(*(reg + 1)); + pr_info("Coherent memory start %llx size %llx\n", start, size); + 
ret = memory_probe_store(start, size); + if (ret) + pr_info("probe faile\n"); + + ret = store_mem_state(start, size, "online_movable"); + if (ret) + pr_info("online_movable failed\n"); + + addr[nr_addr][0] = start; + addr[nr_addr][1] = size; + nr_addr++; + } + dump_core_vm_tunables(); + enable_auto_online(); + return 0; +} + +static int offline_coherent_memory(void) +{ + int i; + + for (i = 0; i < nr_addr; i++) + store_mem_state(addr[i][0], addr[i][1], "offline"); + return 0; +} + +static void __exit coherent_hotplug_exit(void) +{ + pr_info("%s\n", __func__); + offline_coherent_memory(); +} + +static int __init coherent_hotplug_init(void) +{ + pr_info("%s\n", __func__); + return online_coherent_memory(); +} +module_init(coherent_hotplug_init); +module_exit(coherent_hotplug_exit); +MODULE_LICENSE("GPL"); diff --git a/drivers/char/coherent_memory_demo.c b/drivers/char/coherent_memory_demo.c new file mode 100644 index 0000000..1dcd9f7 --- /dev/null +++ b/drivers/char/coherent_memory_demo.c @@ -0,0 +1,337 @@ +/* + * Demonstrating various aspects of the coherent memory. + * + * Copyright (C) 2016, Anshuman Khandual, IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ +#include <linux/of.h> +#include <linux/export.h> +#include <linux/spinlock.h> +#include <linux/init.h> +#include <linux/memblock.h> +#include <linux/module.h> +#include <linux/memory.h> +#include <linux/sizes.h> +#include <linux/bitops.h> +#include <linux/device.h> +#include <linux/fs.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/migrate.h> +#include <linux/memblock.h> +#include <linux/debugfs.h> +#include <linux/uaccess.h> + +#include <asm/mmu.h> +#include <asm/pgalloc.h> + +#define COHERENT_DEV_MAJOR 89 +#define COHERENT_DEV_NAME "coherent_memory" + +#define CRNT_NODE_NID1 1 +#define CRNT_NODE_NID2 2 +#define CRNT_NODE_NID3 3 + +#define RAM_CRNT_MIGRATE 1 +#define CRNT_RAM_MIGRATE 2 + +struct vma_map_info { + struct list_head list; + unsigned long nr_pages; + spinlock_t lock; +}; + +static void vma_map_info_init(struct vm_area_struct *vma) +{ + struct vma_map_info *info = kmalloc(sizeof(struct vma_map_info), + GFP_KERNEL); + + BUG_ON(!info); + INIT_LIST_HEAD(&info->list); + spin_lock_init(&info->lock); + vma->vm_private_data = info; + info->nr_pages = 0; +} + +static void coherent_vmops_open(struct vm_area_struct *vma) +{ + vma_map_info_init(vma); +} + +static void coherent_vmops_close(struct vm_area_struct *vma) +{ + struct vma_map_info *info = vma->vm_private_data; + + BUG_ON(!info); +again: + cond_resched(); + spin_lock(&info->lock); + while (info->nr_pages) { + struct page *page, *page2; + + list_for_each_entry_safe(page, page2, &info->list, lru) { + if (!trylock_page(page)) { + spin_unlock(&info->lock); + goto again; + } + + list_del_init(&page->lru); + info->nr_pages--; + unlock_page(page); + SetPageReclaim(page); + put_page(page); + } + spin_unlock(&info->lock); + cond_resched(); + spin_lock(&info->lock); + } + spin_unlock(&info->lock); + kfree(info); + vma->vm_private_data = NULL; +} + +static int coherent_vmops_fault(struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + struct vma_map_info *info; 
+ struct page *page; + static int coherent_node = CRNT_NODE_NID1; + + if (coherent_node == CRNT_NODE_NID1) + coherent_node = CRNT_NODE_NID2; + else + coherent_node = CRNT_NODE_NID1; + + page = alloc_pages_node(coherent_node, + GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0); + if (!page) + return VM_FAULT_SIGBUS; + + info = (struct vma_map_info *) vma->vm_private_data; + BUG_ON(!info); + spin_lock(&info->lock); + list_add(&page->lru, &info->list); + info->nr_pages++; + spin_unlock(&info->lock); + + page->index = vmf->pgoff; + get_page(page); + vmf->page = page; + return 0; +} + +static const struct vm_operations_struct coherent_memory_vmops = { + .open = coherent_vmops_open, + .close = coherent_vmops_close, + .fault = coherent_vmops_fault, +}; + +static int coherent_memory_mmap(struct file *file, struct vm_area_struct *vma) +{ + pr_info("Mmap opened (file: %lx vma: %lx)\n", + (unsigned long) file, (unsigned long) vma); + vma->vm_ops = &coherent_memory_vmops; + coherent_vmops_open(vma); + return 0; +} + +static int coherent_memory_open(struct inode *inode, struct file *file) +{ + pr_info("Device opened (inode: %lx file: %lx)\n", + (unsigned long) inode, (unsigned long) file); + return 0; +} + +static int coherent_memory_close(struct inode *inode, struct file *file) +{ + pr_info("Device closed (inode: %lx file: %lx)\n", + (unsigned long) inode, (unsigned long) file); + return 0; +} + +static void lru_ram_coherent_migrate(unsigned long addr) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + nodemask_t nmask; + LIST_HEAD(mlist); + + nodes_clear(nmask); + nodes_setall(nmask); + down_write(&mm->mmap_sem); + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if ((addr < vma->vm_start) || (addr > vma->vm_end)) + continue; + break; + } + up_write(&mm->mmap_sem); + if (!vma) { + pr_info("%s: No VMA found\n", __func__); + return; + } + migrate_virtual_range(current->pid, vma->vm_start, vma->vm_end, 2); +} + +static void lru_coherent_ram_migrate(unsigned long 
addr) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + nodemask_t nmask; + LIST_HEAD(mlist); + + nodes_clear(nmask); + nodes_setall(nmask); + down_write(&mm->mmap_sem); + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if ((addr < vma->vm_start) || (addr > vma->vm_end)) + continue; + break; + } + up_write(&mm->mmap_sem); + if (!vma) { + pr_info("%s: No VMA found\n", __func__); + return; + } + migrate_virtual_range(current->pid, vma->vm_start, vma->vm_end, 0); +} + +static long coherent_memory_ioctl(struct file *file, + unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case RAM_CRNT_MIGRATE: + lru_ram_coherent_migrate(arg); + break; + + case CRNT_RAM_MIGRATE: + lru_coherent_ram_migrate(arg); + break; + + default: + pr_info("%s Invalid ioctl() command: %d\n", __func__, cmd); + return -EINVAL; + } + return 0; +} + +static const struct file_operations fops = { + .mmap = coherent_memory_mmap, + .open = coherent_memory_open, + .release = coherent_memory_close, + .unlocked_ioctl = &coherent_memory_ioctl +}; + +static char kbuf[100]; /* Will store original user passed buffer */ +static char str[100]; /* Working copy for individual substring */ + +static u64 args[4]; +static u64 index; +static void convert_substring(const char *buf) +{ + u64 val = 0; + + if (kstrtou64(buf, 0, &val)) + pr_info("String conversion failed\n"); + + args[index] = val; + index++; +} + +static ssize_t coherent_debug_write(struct file *file, + const char __user *user_buf, + size_t count, loff_t *ppos) +{ + char *tmp, *tmp1; + size_t ret; + + memset(args, 0, sizeof(args)); + index = 0; + + ret = simple_write_to_buffer(kbuf, sizeof(kbuf), ppos, user_buf, count); + if (ret < 0) + return ret; + + kbuf[ret] = '\0'; + tmp = kbuf; + do { + tmp1 = strchr(tmp, ','); + if (tmp1) { + *tmp1 = '\0'; + strncpy(str, (const char *)tmp, strlen(tmp)); + convert_substring(str); + } else { + strncpy(str, (const char *)tmp, strlen(tmp)); + convert_substring(str); + break; + } + tmp = 
tmp1 + 1; + memset(str, 0, sizeof(str)); + } while (true); + migrate_virtual_range(args[0], args[1], args[2], args[3]); + return ret; +} + +static int coherent_debug_show(struct seq_file *m, void *v) +{ + seq_puts(m, "Expected Value: <pid,vaddr,size,nid>\n"); + return 0; +} + +static int coherent_debug_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, coherent_debug_show, NULL); +} + +static const struct file_operations coherent_debug_fops = { + .open = coherent_debug_open, + .write = coherent_debug_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static struct dentry *debugfile; + +static void coherent_memory_debugfs(void) +{ + + debugfile = debugfs_create_file("coherent_debug", 0644, NULL, NULL, + &coherent_debug_fops); + if (!debugfile) + pr_warn("Failed to create coherent_memory in debugfs"); +} + +static void __exit coherent_memory_exit(void) +{ + pr_info("%s\n", __func__); + debugfs_remove(debugfile); + unregister_chrdev(COHERENT_DEV_MAJOR, COHERENT_DEV_NAME); +} + +static int __init coherent_memory_init(void) +{ + int ret; + + pr_info("%s\n", __func__); + ret = register_chrdev(COHERENT_DEV_MAJOR, COHERENT_DEV_NAME, &fops); + if (ret < 0) { + pr_info("%s register_chrdev() failed\n", __func__); + return -1; + } + coherent_memory_debugfs(); + return 0; +} + +module_init(coherent_memory_init); +module_exit(coherent_memory_exit); +MODULE_LICENSE("GPL"); diff --git a/drivers/char/memory_online_sysfs.h b/drivers/char/memory_online_sysfs.h new file mode 100644 index 0000000..a5f022d --- /dev/null +++ b/drivers/char/memory_online_sysfs.h @@ -0,0 +1,148 @@ +/* + * Accessing sysfs interface for memory hotplug operation from + * inside the kernel. 
+ * + * Licensed under GPL V2 + */ +#ifndef __SYSFS_H +#define __SYSFS_H + +#include <linux/fs.h> +#include <linux/uaccess.h> + +#define AUTO_ONLINE_BLOCKS "/sys/devices/system/memory/auto_online_blocks" +#define BLOCK_SIZE_BYTES "/sys/devices/system/memory/block_size_bytes" +#define MEMORY_PROBE "/sys/devices/system/memory/probe" + +static ssize_t read_buf(char *filename, char *buf, ssize_t count) +{ + mm_segment_t old_fs; + struct file *filp; + loff_t pos = 0; + + if (!count) + return 0; + + old_fs = get_fs(); + set_fs(KERNEL_DS); + + filp = filp_open(filename, O_RDONLY, 0); + if (IS_ERR(filp)) { + count = PTR_ERR(filp); + goto err_open; + } + + count = vfs_read(filp, buf, count - 1, &pos); + buf[count] = '\0'; + + filp_close(filp, NULL); + +err_open: + set_fs(old_fs); + + return count; +} + +static unsigned long long read_0x(char *filename) +{ + unsigned long long ret; + char buf[32]; + + if (read_buf(filename, buf, 32) <= 0) + return 0; + + if (kstrtoull(buf, 16, &ret)) + return 0; + + return ret; +} + +static ssize_t write_buf(char *filename, char *buf) +{ + int ret; + mm_segment_t old_fs; + struct file *filp; + loff_t pos = 0; + + old_fs = get_fs(); + set_fs(KERNEL_DS); + + filp = filp_open(filename, O_WRONLY, 0); + if (IS_ERR(filp)) { + ret = PTR_ERR(filp); + goto err_open; + } + + ret = vfs_write(filp, buf, strlen(buf), &pos); + + filp_close(filp, NULL); + +err_open: + set_fs(old_fs); + + return ret; +} + +int memory_probe_store(phys_addr_t addr, phys_addr_t size) +{ + phys_addr_t block_sz = + read_0x(BLOCK_SIZE_BYTES); + long i; + + for (i = 0; i < size / block_sz; i++, addr += block_sz) { + char s[32]; + ssize_t count; + + snprintf(s, 32, "0x%llx", addr); + + count = write_buf(MEMORY_PROBE, s); + if (count < 0) + return count; + } + + return 0; +} + +int store_mem_state(phys_addr_t addr, phys_addr_t size, char *state) +{ + phys_addr_t block_sz = read_0x(BLOCK_SIZE_BYTES); + unsigned long start_block, end_block, i; + + start_block = addr / block_sz; + 
end_block = start_block + size / block_sz; + + for (i = end_block - 1; i >= start_block; i--) { + char filename[64]; + ssize_t count; + + snprintf(filename, 64, + "/sys/devices/system/memory/memory%ld/state", i); + + count = write_buf(filename, state); + if (count < 0) + return count; + } + + return 0; +} + +int disable_auto_online(void) +{ + int ret; + + ret = write_buf(AUTO_ONLINE_BLOCKS, "offline"); + if (ret) + return ret; + return 0; +} + +int enable_auto_online(void) +{ + int ret; + + ret = write_buf(AUTO_ONLINE_BLOCKS, "online"); + if (ret) + return ret; + return 0; +} +#endif diff --git a/mm/migrate.c b/mm/migrate.c index 06300bb..1fb2b19 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1405,6 +1405,7 @@ int migrate_virtual_range(int pid, unsigned long start, struct vm_area_struct *vma; nodemask_t nmask; int ret = -EINVAL; + bool found = false; LIST_HEAD(mlist); @@ -1414,6 +1415,7 @@ int migrate_virtual_range(int pid, unsigned long start, if ((!start) || (!end)) return -EINVAL; + pr_info("%s: %d %lx %lx %d: ", __func__, pid, start, end, nid); rcu_read_lock(); mm = find_task_by_vpid(pid)->mm; rcu_read_unlock(); @@ -1425,14 +1427,17 @@ int migrate_virtual_range(int pid, unsigned long start, if ((start < vma->vm_start) || (end > vma->vm_end)) continue; + found = true; ret = queue_pages_range(mm, start, end, &nmask, MPOL_MF_MOVE_ALL | MPOL_MF_DISCONTIG_OK, &mlist); if (ret) { + pr_info("queue_pages_range_failed\n"); putback_movable_pages(&mlist); break; } if (list_empty(&mlist)) { + pr_info("list_empty\n"); ret = -ENOMEM; break; } @@ -1440,12 +1445,17 @@ int migrate_virtual_range(int pid, unsigned long start, ret = migrate_pages(&mlist, new_node_page, NULL, nid, MIGRATE_SYNC, MR_COMPACTION); if (ret) { + pr_info("migration_failed\n"); putback_movable_pages(&mlist); } else { + pr_info("migration_passed\n"); if (isolated_cdm_node(nid)) mark_vma_cdm(vma); } } + if (!found) + pr_info("vma_missing\n"); + up_write(&mm->mmap_sem); return ret; } -- 2.1.0 -- To 
unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>