>From bc01c27e4ec3d97ea3e1b3b53bbe1938f6d20a34 Mon Sep 17 00:00:00 2001
From: Manish Sharma <manishrma@xxxxxxxxx>
Date: Thu, 4 Apr 2013 10:12:59 +0530
Subject: [PATCH 1/1] Control Page Cache The idea is to control/watch the page
cache pages wrt process & disk. The Control page cache
patch will show below details:- 1. Total pages in page
cache per device # cat /proc/pages_per_device
2.Threads allocating pages wrt disk # cat
/proc/threads_per_device 3.Total pages per device wrt
thread # cat /proc/<pid>/pages_per_device 4.Set
the maximum pages limit from device # cat
/proc/max_limit/<device> 5.Set the maximum thread limit
# cat /proc/<pid>/max_limit ToDo:- 1. Currently
we are killing the task that exceeds the limit, but a
proper approach is needed: the task should not be killed;
it should wait instead.
Signed-off-by: Manish Sharma <manishrma@xxxxxxxxx>
---
block/genhd.c | 1079 ++++++++++++++++++++++++++++++++++++++++++++-
block/partition-generic.c | 8 +
fs/proc/base.c | 163 +++++++
include/linux/genhd.h | 38 +-
include/linux/rmap.h | 4 +
include/linux/sched.h | 5 +
kernel/fork.c | 24 +
lib/Kconfig.debug | 10 +
mm/filemap.c | 11 +
mm/oom_kill.c | 17 +
mm/rmap.c | 97 +++-
11 files changed, 1453 insertions(+), 3 deletions(-)
diff --git a/block/genhd.c b/block/genhd.c
index 7dcfdd8..0840de4 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -42,6 +42,11 @@ static void disk_add_events(struct gendisk *disk);
static void disk_del_events(struct gendisk *disk);
static void disk_release_events(struct gendisk *disk);
+/* Control Page Cache */
+#ifdef CONFIG_CONTROL_PAGE_CACHE
+struct proc_dir_entry *proc_cpcfs;
+#endif /* CONFIG_CONTROL_PAGE_CACHE */
+
/**
* disk_get_part - get partition
* @disk: disk to look partition from
@@ -568,6 +573,1063 @@ exit:
disk_part_iter_exit(&piter);
}
+#ifdef CONFIG_CONTROL_PAGE_CACHE
+/**
+ * cpc_get_part_by_name - get partition by name
+ * @buf: name of partition
+ *
+ * It iterates over the partition and find out
+ * the requested partition and return the part
+ * structure.
+ *
+ * CONTEXT:
+ * RCU read locked. The returned partition pointer is valid only
+ * while preemption is disabled.
+ *
+ * RETURNS:
+ * Found partition on success, NULL is returned if no partition matches
+ */
+struct hd_struct *cpc_get_part_by_name(char *buf)
+{
+ char hdname[BDEVNAME_SIZE];
+ struct class_dev_iter iter;
+ struct device *dev;
+
+ class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
+ while ((dev = class_dev_iter_next(&iter))) {
+ struct gendisk *disk = dev_to_disk(dev);
+ struct disk_part_iter piter;
+ struct hd_struct *part;
+
+ /*
+ * Don't show empty devices or things that have been
+ * surpressed
+ */
+ if (get_capacity(disk) == 0 ||
+ (disk->flags & GENHD_FL_SUPPRESS_PARTITION_INFO))
+ continue;
+
+ disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0);
+ while ((part = disk_part_iter_next(&piter))) {
+ disk_name(disk, part->partno, hdname);
+ if (!strncmp(hdname, buf, strlen(buf))) {
+ disk_part_iter_exit(&piter);
+ class_dev_iter_exit(&iter);
+ return part;
+ }
+
+ }
+ disk_part_iter_exit(&piter);
+ }
+ class_dev_iter_exit(&iter);
+ return NULL;
+}
+
/**
 * proc_cpc_part_read - read handler for /proc/max_limit/<device>
 * @file: proc file; its dentry name identifies the partition
 * @buf:  user buffer (unused — nothing is copied to user space)
 * @len:  requested length (unused)
 * @ppos: position offset (unused)
 *
 * Looks up the partition named by the proc entry and prints its
 * max_limit to the kernel log.  Always returns 0 (EOF), so reading
 * this file yields no data in user space; the value only appears
 * in the kernel log.
 *
 * RETURNS:
 * 0
 */

static ssize_t proc_cpc_part_read(struct file *file, char __user *buf,
			size_t len, loff_t *ppos)
{
	struct hd_struct *part = NULL;

	part = cpc_get_part_by_name(file->f_path.dentry->d_iname);
	if (NULL == part)
		printk(KERN_EMERG"No part\n");
	else
		printk(KERN_EMERG"%ld\n", part->max_limit);
	return 0;

}
+
+/**
+ * proc_cpc_part_write - proc write for max limit of partition
+ * @file: file pointer
+ * @buf: user buffer pointer
+ * @count: count
+ * @ppos: position offset
+ *
+ * It gets the partition and updates the maximum limit of pages
+ *
+ * RETURNS:
+ * count on Success; Error on Failure
+ */
+static ssize_t proc_cpc_part_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct hd_struct *part = NULL;
+ char buffer[CPC_MAX_BUFFER] = {0};
+ long max_limit = -1;
+ int err = 0;
+
+ if (count > CPC_MAX_BUFFER) {
+ printk(KERN_EMERG"ERROR:Enter only int value %d ::\n", count);
+ return -EINVAL;
+ }
+ if ((NULL == file->f_path.dentry) && (NULL == file))
+ return -EINVAL;
+
+ part = cpc_get_part_by_name(file->f_path.dentry->d_iname);
+ if (NULL == part) {
+ printk(KERN_EMERG"No part\n");
+ return -EINVAL;
+ } else {
+ if (copy_from_user(buffer, buf, count)) {
+ printk(KERN_EMERG"ERROR:%d\n", count);
+ return -EFAULT;
+ }
+
+ err = strict_strtol(strstrip(buffer), 0, &max_limit);
+ if (err) {
+ printk(KERN_EMERG"ERROR:err %d \n", err);
+ return -EINVAL;
+ }
+ if (max_limit >= CPC_MAX_LIMIT) { /* 2GB */
+ /* Later we will make it upto the RAM value */
+ printk(KERN_EMERG"Entered value >= 2GB\n");
+ return -EINVAL;
+ } else {
+ part->max_limit = max_limit;
+ printk(KERN_EMERG"Entered value %ld\n" , part->max_limit);
+ }
+ }
+ return count;
+}
+
/* proc hooks for the per-partition /proc/max_limit/<device> entry */
static const struct file_operations proc_cpc_part_ops = {
	.read = proc_cpc_part_read,
	.write = proc_cpc_part_write,
};
+
/**
 * cpc_hash - map a pid onto a bucket of the per-partition hash table
 * @pid: process/thread id (callers pass pid > 0)
 *
 * RETURNS:
 * bucket index in [0, CPC_MAX_PID_PER_PART)
 */
int cpc_hash(int pid)
{
	return pid % CPC_MAX_PID_PER_PART;
}
+
/**
 * cpc_invoke_oom - kill the current task for exceeding a cache limit
 *
 * Invokes the CPC OOM path (cpc_oom_kill_task) on @current and logs
 * if the kill fails.  Per the patch TODO, killing is a stopgap —
 * ideally the task should wait instead.
 */
void cpc_invoke_oom(void)
{
	printk(KERN_EMERG"Invoking OOM on %d\n", current->pid);
	if (cpc_oom_kill_task(current))
		printk(KERN_EMERG"OOM failed on task\n");

}
+
+/**
+ * cpc_part_limit
+ * check the limit on partition
+ * @partition: Partition whose limit to be checked
+ *
+ * It checks the limit of the partition if the limit
+ * is reached which is set as per the user compared with
+ * the total pages fetched from the device.
+ *
+ * CONTEXT:
+ * read or write locked.
+ *
+ * RETURNS:
+ *
+ */
+int cpc_part_limit(struct hd_struct *part)
+{
+ int ret = 0;
+ if (NULL != part) {
+ /*read_lock_irq(&part->cpc_lock);*/
+ if ((part->max_limit <= part->total_pages) && \
+ (-1 != part->max_limit)) {
+ ret = 1;
+ }
+ /*read_unlock_irq(&part->cpc_lock); */
+ }
+ return ret;
+}
+
+/**
+ * cpc_get_part
+ * Get the partition from the mapping
+ * @mapping: Mapping of the file mapped page w.r.t disk.
+ *
+ * It checks the inode and get the parition from the block
+ * device.
+ * either the file is access from the partition or the
+ * raw disk is accessed both the cases are handled here
+ * if there is any other case we will observe the dump.
+ *
+ * CONTEXT:
+ *
+ * RETURNS:
+ *
+ */
+struct hd_struct *cpc_get_part(struct address_space *mapping)
+{
+ struct inode *inode = mapping->host;
+ struct hd_struct *hd = NULL;
+ struct block_device *bdev = NULL;
+
+ if (NULL != inode) {
+ if ((NULL != inode->i_sb->s_bdev) && (NULL != inode->i_sb)) {
+ bdev = inode->i_sb->s_bdev;
+ if (bdev->bd_part)
+ hd = bdev->bd_part;
+ return hd;
+ } else {
+ /* Case where super block is not yet filled so need
+ to go other way round
+ Cant use this case every time because the inode is
+ not of device always :)
+ here we are working on zeroth partition every time
+ */
+ if (NULL != inode->i_bdev) {
+ if (inode->i_bdev->bd_part) {
+ hd = inode->i_bdev->bd_part;
+ return hd;
+ }
+ }
+ }
+ }
+ /* Never Comes here */
+ /* dump_stack();*/
+ printk(KERN_EMERG"ERROR:Never comes here ::No Device Found\n");
+ return NULL;
+}
+
/**
 * cpc_check_limit - check the per-task page limit against a partition
 * @tsk: task being charged for a page
 * @hd:  partition the page comes from
 *
 * Compares the task's accounted page count against its max_limit, but
 * only when the partition itself is still under its own limit.  For a
 * thread, the accounting lives on the group leader.  A max_limit of
 * -1 means "unlimited".
 *
 * CONTEXT:
 * Called with the partition's cpc_lock held by the caller
 * (cpc_add_to_thread takes it as a write lock).
 *
 * RETURNS:
 * 0: limit crossed (caller is expected to invoke the OOM path)
 * 1: within limits — also returned deliberately when @tsk or @hd is
 *    NULL so that a missing argument never triggers OOM
 */
int cpc_check_limit(struct task_struct *tsk, struct hd_struct *hd)
{
	struct task_struct *task = NULL;
	if (!tsk || !hd) {
		printk(KERN_EMERG"NO tsk or partition [%s]\n", __FUNCTION__);
		/* if we return 0 here it will invoke OOM */
		return 1;
	}

	if (!cpc_part_limit(hd)) {
		/* group leader carries the counters for the whole group */
		if (tsk->pid == tsk->tgid) {
			if ((tsk->total_pages < tsk->max_limit) || (-1 == tsk->max_limit)) {
				return 1;
			}
		} else {
			task = tsk->group_leader;
			if (task) {
				if (pid_alive(task)) {
					/* pin the leader while reading its counters */
					get_task_struct(task);
					if ((task->total_pages < task->max_limit) || (-1 == task->max_limit)) {
						put_task_struct(task);
						return 1;
					}
					put_task_struct(task);
				}
			} else
				printk(KERN_EMERG"#### NO PARENT TASK\n");
		}
	}
	return 0;
}
+
+/**
+ * cpc_find_tgid
+ * Finds the tgid of the process in the current partition
+ * @tgid: task group leader id.
+ * @part: partition structure.
+ *
+ * It iterates over the list and checks if any cpc structure
+ * i.e entry exists for the pid(thread/process) with passed tgid
+ *
+ * CONTEXT:
+ *
+ * RETURNS:
+ * cpc structure of the process with tgid on Success
+ * NULL if didnt find the CPC (Control Page Cache)
+ *
+ */
+
+struct cpc_struct *cpc_find_tgid(int tgid, struct hd_struct *part)
+{
+ struct hlist_node *list;
+ struct cpc_struct *cpc = NULL;
+ struct hlist_head *pid_tbl = NULL;
+
+ /* read_lock_irq(&part->cpc_lock); */
+ pid_tbl = part->pid_tbl;
+
+ if (tgid <= 0 || (NULL == pid_tbl))
+ return NULL;
+
+ hlist_for_each_entry_rcu(cpc, list,
+ &pid_tbl[cpc_hash(tgid)], pid_list) {
+ if (cpc) {
+ if ((cpc->tgid == tgid) && (cpc->pid != cpc->tgid)) {
+ /* read_unlock_irq(&part->cpc_lock); */
+ return cpc;
+ }
+ }
+ }
+
+ /*read_unlock_irq(&part->cpc_lock);*/
+ return NULL;
+}
/**
 * cpc_find_pid - find the accounting entry for an exact pid on a partition
 * @pid:  pid (thread or process) to look up
 * @part: partition whose hash table to search
 *
 * Looks only in the bucket the pid hashes to and returns the entry
 * whose pid matches exactly.
 *
 * CONTEXT:
 * Caller must hold part->cpc_lock (read or write).
 *
 * RETURNS:
 * Matching cpc_struct on success, NULL if not found or on bad input.
 */
struct cpc_struct *cpc_find_pid(int pid, struct hd_struct *part)
{
	struct hlist_node *list = NULL;
	struct cpc_struct *cpc = NULL;
	struct hlist_head *pid_tbl = NULL;
	int p = cpc_hash(pid);

	pid_tbl = part->pid_tbl;
	if (pid <= 0 || (NULL == pid_tbl))
		return NULL;

	/* pre-3.9 hlist_for_each_entry_rcu: (tpos, pos, head, member) */
	hlist_for_each_entry_rcu(cpc, list,
			&pid_tbl[p], pid_list){
		if (cpc)
			if (cpc->pid == pid) {
				return cpc;
			}
	}

	return NULL;
}
+
+/**
+ * cpc_add_to_thread
+ * Account the pages for the task w.r.t the partition
+ * @mapping: file mapped page mapping.
+ * @tsk: task structure.
+ *
+ * This function gets the partition and finds if the cuurent
+ * task already have the cpc entry in its structure.
+ * If not then it creates the entry else it increments the
+ * accounted pages
+ *
+ * CONTEXT:
+ *
+ * RETURNS:
+ * -1: if failure
+ * 0 : if Success
+ *
+ */
+int cpc_add_to_thread(struct address_space *mapping, struct task_struct *tsk)
+{
+ struct cpc_struct *cpc = NULL;
+ struct hlist_head *pid_tbl = NULL;
+ struct hd_struct *part = cpc_get_part(mapping);
+ int ret = 0;
+ int pid, tgid;
+
+ if (!part || !tsk) {
+ printk(KERN_EMERG"Error improper value [%s]\n", __FUNCTION__);
+ ret = -1;
+ }
+
+ write_lock_irq(&part->cpc_lock);
+ if (!cpc_check_limit(tsk, part)) {
+ write_unlock_irq(&part->cpc_lock);
+ cpc_oom_kill_task(tsk);
+ return -1;
+ }
+
+ pid = tsk->pid;
+ tgid = tsk->tgid;
+
+ pid_tbl = part->pid_tbl;
+ if (!pid_tbl) {
+ write_unlock_irq(&part->cpc_lock);
+ printk(KERN_EMERG"Error improper value [%s]\n", __FUNCTION__);
+ return -1;
+ }
+ /* find the current pid entry */
+ cpc = cpc_find_pid(pid, part);
+ if (NULL == cpc) {
+ cpc = kzalloc(sizeof(struct cpc_struct), GFP_KERNEL);
+ if (NULL == cpc) {
+ write_unlock_irq(&part->cpc_lock);
+ return -ENOMEM;
+ }
+ cpc->pid = pid;
+ cpc->tgid = tgid;
+ cpc->actual_pages = 0;
+ hlist_add_head_rcu(&cpc->pid_list, &pid_tbl[cpc_hash(cpc->pid)]);
+ }
+
+ cpc->actual_pages++;
+ write_unlock_irq(&part->cpc_lock);
+
+#ifdef CPC_DBG_2
+ printk(KERN_EMERG"[%s] %d::%d(%d)\n", __FUNCTION__, part->partno, pid, cpc->actual_pages);
+#endif
+ return ret;
+}
+
+/**
+ * cpc_del_from_thread
+ * Remove the pages for the task w.r.t the partition
+ * @mapping: file mapped page mapping.
+ * @tsk: task structure.
+ *
+ * This function gets the partition and finds if the cuurent
+ * task already have the cpc entry in its structure.
+ * If not then it creates the entry else it decrements the
+ * accounted pages
+ *
+ * CONTEXT:
+ *
+ * RETURNS:
+ * -1: if failure
+ * 0 : if Success
+ *
+ */
+int cpc_del_from_thread(struct address_space *mapping, struct task_struct *tsk)
+{
+ struct cpc_struct *cpc = NULL;
+ struct hlist_head *pid_tbl = NULL;
+ struct hd_struct *part = NULL;
+ int pid, tgid;
+
+ if (!mapping || !tsk) {
+ return -1;
+ }
+
+ part = cpc_get_part(mapping);
+ if (!part) {
+ /*printk(KERN_EMERG"[%s]Error improper value \n",__FUNCTION__);*/
+ return -1;
+ }
+
+ write_lock_irq(&part->cpc_lock);
+ pid = tsk->pid;
+ tgid = tsk->tgid;
+
+ pid_tbl = part->pid_tbl;
+ if (!pid_tbl) {
+ /*printk(KERN_EMERG"[%s]Error improper value \n",__FUNCTION__); */
+ write_unlock_irq(&part->cpc_lock);
+ return -1;
+ }
+
+ /* find if the current pid is available */
+ cpc = cpc_find_pid(pid, part);
+ if (NULL == cpc) {
+ if (pid != tgid) {
+ /* and if its a thread check for its group
+ * leader pid if available in the partition
+ * it can be that the thread is removing the
+ * pages */
+ cpc = cpc_find_pid(tgid, part);
+ if (NULL == cpc) {
+ /* No group leader Still we again will
+ * look for any thread of the same
+ * group leader is available : who has the page
+ * from this device
+ */
+ cpc = cpc_find_tgid(tgid, part);
+ if (NULL == cpc) {
+ /* No CPC till yet wierd case then who has owned this
+ * We should not come here
+ * OR
+ * We might have the page from the dup that belongs
+ * to this partition and we are freeing it :O
+ */
+ goto unlock;
+ }
+ }
+ } else {
+ if (tsk->dup) {
+ tsk->dup--;
+ tsk->total_pages--;
+ goto unlock;
+ }
+ /* So we are the Process and we dont have dup
+ * pages with us so we search for any other pages
+ * of the threads in this partition list
+ * so here checking for tgid to be same as our pid
+ * this means it is in the other thread of same parent
+ * this is just accounting so be careful.
+ * as all threads have same mm.
+ */
+ cpc = cpc_find_tgid(pid, part);
+ if (NULL == cpc) {
+ /* Didn't find anything so
+ * we cant do much except escaping
+ */
+ goto unlock;
+ }
+ }
+ }
+ cpc->actual_pages--;
+ if (!cpc->actual_pages) {
+ hlist_del_rcu(&cpc->pid_list);
+ kfree(cpc);
+ }
+unlock:
+ write_unlock_irq(&part->cpc_lock);
+
+#ifdef CPC_DBG_2
+ printk(KERN_EMERG"[%s] %d::%d(%d)\n", __FUNCTION__, part->partno,\
+ pid, cpc->actual_pages);
+#endif
+
+ return 0;
+}
+
+/**
+ * cpc_shift_acct
+ * Shift the pages from the thread to its respt.
+ * process when the thread is getting killed.
+ * @tsk: task structure.
+ *
+ * This function iterate over the partition and adds
+ * the pages to the group leader of the thread in the
+ * specific parition in the respective list
+ * of the device partition;
+ * Removal is taken care by the cpc_del_partition
+ *
+ * CONTEXT:
+ *
+ * RETURNS:
+ * void:
+ *
+ */
+void cpc_shift_acct(struct task_struct *tsk)
+{
+ struct class_dev_iter iter;
+ struct disk_part_iter piter;
+ struct device *dev;
+ struct cpc_struct *cpc = NULL, *cpc_p = NULL;
+ struct hlist_head *pid_tbl = NULL;
+ int pid = 0;
+
+ if (tsk->pid == tsk->tgid) {
+ printk(KERN_EMERG"ERROR:Shifting req for Parent\n");
+ return ;
+ }
+ class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
+ while ((dev = class_dev_iter_next(&iter))) {
+ struct gendisk *disk = dev_to_disk(dev);
+ struct hd_struct *part;
+ /*
+ * Don't show empty devices or things that have been
+ * surpressed
+ */
+ if (get_capacity(disk) == 0 ||
+ (disk->flags & GENHD_FL_SUPPRESS_PARTITION_INFO))
+ continue;
+
+ disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0);
+ while ((part = disk_part_iter_next(&piter))) {
+ write_lock_irq(&part->cpc_lock);
+ pid_tbl = part->pid_tbl;
+ if (!pid_tbl) {
+ printk(KERN_EMERG"#####Error improper value [%s]\n", __FUNCTION__);
+ write_unlock_irq(&part->cpc_lock);
+ break;
+ }
+ cpc = cpc_find_pid(tsk->pid, part);
+ if (cpc) {
+ pid = tsk->tgid;
+ cpc_p = cpc_find_pid(pid, part);
+ if (NULL == cpc_p) {
+ write_unlock_irq(&part->cpc_lock);
+ continue;
+ }
+ cpc_p->actual_pages += cpc->actual_pages;
+ printk(KERN_EMERG"Shifting %d ::%d(%d) to %d(%d) \n", part->partno, tsk->pid,\
+ cpc->actual_pages, pid, cpc_p->actual_pages);
+ if (!cpc_p->actual_pages) {
+ hlist_del_rcu(&cpc->pid_list);
+ kfree(cpc);
+ cpc = NULL;
+ }
+ hlist_del_rcu(&cpc->pid_list);
+ kfree(cpc);
+ cpc = NULL;
+ }
+ write_unlock_irq(&part->cpc_lock);
+ }
+ disk_part_iter_exit(&piter);
+ }
+ class_dev_iter_exit(&iter);
+}
+
+/**
+ * cpc_inc
+ * Increment the total pages fetched from the
+ * respective partition in the page cache
+ * @mapping: File mapped page mapping of device.
+ *
+ * This function gets the partition from the mapping
+ * checks if the limit is there and then increment the
+ * total pages of the partition.
+ *
+ * CONTEXT:
+ *
+ * RETURNS:
+ * void:
+ *
+ */
+void cpc_inc(struct address_space *mapping)
+{
+ struct hd_struct *hd;
+ hd = cpc_get_part(mapping);
+ if (NULL != hd) {
+ read_lock_irq(&hd->cpc_lock);
+ if (cpc_part_limit(hd)) {
+ read_unlock_irq(&hd->cpc_lock);
+ cpc_invoke_oom();
+ }
+ read_unlock_irq(&hd->cpc_lock);
+ hd->total_pages += 1;
+ } else
+ printk(KERN_EMERG"##[%s]## No partition\n", __FUNCTION__);
+}
+/**
+ * cpc_dec
+ * Decrement the total pages freed from the
+ * respective partition from the page cache
+ * @mapping: File mapped page mapping of device.
+ *
+ * This function gets the partition from the mapping
+ * and then decrement the total pages of the partition.
+ *
+ * CONTEXT:
+ *
+ * RETURNS:
+ * void:
+ *
+ */
+void cpc_dec(struct address_space *mapping)
+{
+ struct hd_struct *hd;
+ hd = cpc_get_part(mapping);
+ if (NULL != hd) {
+ hd->total_pages -= 1;
+ } else
+ printk(KERN_EMERG"##[%s]## No partition\n", __FUNCTION__);
+
+}
+/**
+ * del_part_htbl
+ * This function calls before deleting the partition
+ * so cleaning up the rest. remove all of the stuff
+ * and exit.
+ * @part: Partition to delete.
+ *
+ * This function removed all the cpc entries and free the
+ * memory allocated for the hash tables etc
+ *
+ * CONTEXT:
+ *
+ * RETURNS:
+ * -1: Failure
+ * 0: Success
+ *
+ */
+int del_part_htbl(struct hd_struct *part)
+{
+ int htbl = 0;
+ char hdname[BDEVNAME_SIZE];
+ struct gendisk *disk;
+ struct hlist_head *pid_tbl = NULL;
+ struct hlist_node *list = NULL;
+ struct cpc_struct *cpc = NULL;
+
+ if (!part) {
+ printk(KERN_EMERG" Deleting part No part\n");
+ dump_stack();
+ return -1;
+ }
+
+ write_lock_irq(&part->cpc_lock);
+ disk = part_to_disk(part);
+ part->total_pages = 0;
+ part->max_limit = -1;
+
+
+ if (part->pid_tbl) {
+ pid_tbl = part->pid_tbl;
+ for (htbl = 0; htbl < CPC_MAX_PID_PER_PART; htbl++) {
+ hlist_for_each_entry_rcu(cpc, list,
+ &pid_tbl[htbl], pid_list){
+ if (cpc) {
+ hlist_del_rcu(&cpc->pid_list);
+ kfree(cpc);
+ cpc = NULL;
+ }
+ }
+ }
+ kfree(part->pid_tbl);
+ part->pid_tbl = NULL;
+ }
+
+ disk_name(disk, part->partno, hdname);
+ if (NULL != proc_cpcfs) {
+ remove_proc_entry(hdname, proc_cpcfs);
+ }
+ write_unlock_irq(&part->cpc_lock);
+
+ return 0;
+}
+
+/**
+ * init_part_htbl
+ * This function calls after creating the partition
+ * so that we can create the hash table entries and
+ * initialize the structures
+ * @part: Partition to init.
+ *
+ * This function initialize all the cpc entries and
+ * creates the hash table enteries in the parition.
+ *
+ * CONTEXT:
+ *
+ * RETURNS:
+ * void:
+ *
+ */
+void init_part_htbl(struct hd_struct *part)
+{
+ int htbl = 0;
+ char hdname[BDEVNAME_SIZE];
+ struct gendisk *disk;
+
+ if (!part) {
+ printk(KERN_EMERG"Will not come here\n");
+ dump_stack();
+ }
+
+ disk = part_to_disk(part);
+ part->total_pages = 0;
+ part->max_limit = -1;
+ part->pid_tbl = kmalloc(CPC_MAX_PID_PER_PART * sizeof(*(part->pid_tbl)), GFP_KERNEL);
+ if (!part->pid_tbl) {
+ printk(KERN_EMERG"[%s]ERROR: unable to allocate mem!\n", __FUNCTION__);
+ dump_stack();
+ }
+ for (htbl = 0; htbl < CPC_MAX_PID_PER_PART; htbl++)
+ INIT_HLIST_HEAD(&part->pid_tbl[htbl]);
+ rwlock_init(&part->cpc_lock);
+ disk_name(disk, part->partno, hdname);
+ if (proc_cpcfs == NULL) {
+ printk(KERN_EMERG"### unable to create proc entry [%s]\n", hdname);
+ }
+ proc_create(hdname, 0, proc_cpcfs, &proc_cpc_part_ops);
+
+}
/**
 * init_disk_htbl - initialise CPC state for every partition of a disk
 * @disk: gendisk whose partitions to initialise
 *
 * Iterates the partition table and sets up each partition's hash
 * table, lock and proc entry via init_part_htbl().
 *
 * part0 is skipped deliberately: add_disk() initialises it separately
 * with cpc_init_part(&disk->part0).
 *
 * RETURNS:
 * void
 */
void init_disk_htbl(struct gendisk *disk)
{

	struct disk_part_iter piter;
	struct hd_struct *part;

	disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0);
	while ((part = disk_part_iter_next(&piter))) {
		if (part == &disk->part0)
			continue;
		init_part_htbl(part);
	}
	disk_part_iter_exit(&piter);

}
/**
 * cpc_del_part - exported CPC partition-teardown hook
 * @part: partition being deleted
 *
 * Thin wrapper around del_part_htbl().  The partition is already
 * invalidated by the caller, so only the CPC bookkeeping (hash table,
 * proc entry, counters) remains to be freed.
 *
 * RETURNS:
 * void
 */
void cpc_del_part(struct hd_struct *part)
{
	del_part_htbl(part);
}
+
/**
 * cpc_init_part - exported CPC partition-init hook
 * @part: freshly created partition
 *
 * Thin wrapper around init_part_htbl().  (The original kernel-doc
 * header mistakenly named this function cpc_del_part.)
 *
 * RETURNS:
 * void
 */
void cpc_init_part(struct hd_struct *part)
{
	init_part_htbl(part);
}
+
+/**
+ * cpc_pages_per_device
+ * Displays the proc entries in the different case
+ * 1. /proc/pages_per_device
+ * 2. /proc/threads_per_device
+ * 3. /proc/<pid>/pages_per_device
+ * @pid: different in above three cases.
+ *
+ * This function iterate over the partitions and prints
+ * all the details depending upon the passed value.
+ * It displays the structures and the accounted pages for
+ * each process /thread w.r.t each device
+ *
+ * CONTEXT:
+ *
+ * RETURNS:
+ * void:
+ *
+ */
+void cpc_pages_per_device(int pid)
+{
+ struct class_dev_iter iter;
+ struct device *dev;
+ unsigned int actual_pages = 0;
+
+ if (0 == pid)
+ printk("Device Total pages\n");
+ else if (-1 == pid)
+ printk("Device Pid[Pages:Tgid,Pages] \n");
+ else
+ printk("Device Pages \n");
+
+ class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
+ while ((dev = class_dev_iter_next(&iter))) {
+ struct gendisk *disk = dev_to_disk(dev);
+ struct disk_part_iter piter;
+ struct hd_struct *part;
+ char buf[BDEVT_SIZE];
+ struct cpc_struct *cpc = NULL;
+ struct hlist_head *pid_tbl = NULL;
+ int cnt = 0;
+ int total_pages = 0;
+ struct task_struct *tsk = NULL;
+
+ /*
+ * Don't show empty devices or things that have been
+ * surpressed
+ */
+ if (get_capacity(disk) == 0 ||
+ (disk->flags & GENHD_FL_SUPPRESS_PARTITION_INFO))
+ continue;
+
+ /*
+ * Note, unlike /proc/partitions, I am showing the
+ * numbers in hex - the same format as the root=
+ * option takes.
+ */
+ disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0);
+ while ((part = disk_part_iter_next(&piter))) {
+ printk(" %s\t", disk_name(disk, part->partno, buf));
+ read_lock_irq(&part->cpc_lock);
+ pid_tbl = part->pid_tbl;
+
+ if (0 == pid) {
+ /* total pages for partition*/
+ printk("%u ", part->total_pages);
+ } else if (-1 == pid) {
+ /* all pids for partitions*/
+ struct hlist_node *pid_list;
+
+ if (NULL == pid_tbl)
+ printk("0 ");
+ else{
+ for (cnt = 0; cnt < CPC_MAX_PID_PER_PART; cnt++) {
+ hlist_for_each_entry(cpc, pid_list, &pid_tbl[cnt], pid_list) {
+ tsk = get_pid_task(find_get_pid(cpc->tgid), PIDTYPE_PID);
+ if (tsk) {
+ /*if (pid_alive(tsk)) {*/
+ total_pages = tsk->total_pages;
+ put_task_struct(tsk);
+ /*}*/
+ } else {
+ /*case where the thread/process has gone print 0 in this case*/
+ /*printk(KERN_EMERG"No parent %d:%d \n",cpc->pid, cpc->tgid);*/
+ }
+ /*printk("%d[%d:%d,%d] ",cpc->pid,cpc->actual_pages,cpc->tgid,total_pages);*/
+ printk("%d[%u:%d,%u] ", cpc->pid, cpc->actual_pages, cpc->tgid, total_pages);
+ total_pages = 0;
+ }
+ }
+ printk("0 ");
+ }
+ } else {
+ /* search and display only given pid for partition*/
+ if (NULL == pid_tbl)
+ printk("0 ");
+ else {
+ cpc = cpc_find_pid(pid, part);
+ if (NULL != cpc)
+ printk("%u ", cpc->actual_pages);
+ else
+ printk("%u ", actual_pages);
+ }
+ }
+ read_unlock_irq(&part->cpc_lock);
+ printk("\n");
+ }
+ disk_part_iter_exit(&piter);
+ }
+ class_dev_iter_exit(&iter);
+}
+/**
+ * proc_cpc_ppp_read
+ * proc entry for reading threads per device
+ * @file: file pointer
+ * @buf: user buffer pointer
+ * @count: count
+ * @ppos: position offset
+ *
+ * calls pages_per_device
+ *
+ * CONTEXT:
+ *
+ * RETURNS:
+ * size_t: count
+ *
+ */
+static ssize_t proc_cpc_ppp_read(struct file *file, char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ int pid = -1;
+ cpc_pages_per_device(pid);
+ return 0;
+
+}
+
+/**
+ * proc_cpc_ppd_read
+ * proc entry for reading pages per device
+ * @file: file pointer
+ * @buf: user buffer pointer
+ * @count: count
+ * @ppos: position offset
+ *
+ * calls pages_per_device
+ *
+ * CONTEXT:
+ *
+ * RETURNS:
+ * size_t: count
+ *
+ */
+static ssize_t proc_cpc_ppd_read(struct file *file, char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ int pid = 0;
+ cpc_pages_per_device(pid);
+ return 0;
+
+}
+
/* /proc/pages_per_device: total pages per partition */
static const struct file_operations proc_cpc_ppd_ops = {
	.read = proc_cpc_ppd_read,
};

/* /proc/threads_per_device: per-thread breakdown per partition */
static const struct file_operations proc_cpc_ppp_ops = {
	.read = proc_cpc_ppp_read,
};
+
+/* EXPORTED CPC FUNCTIONS*/
+EXPORT_SYMBOL(cpc_inc);
+EXPORT_SYMBOL(cpc_dec);
+EXPORT_SYMBOL(cpc_add_to_thread);
+EXPORT_SYMBOL(cpc_del_from_thread);
+EXPORT_SYMBOL(cpc_del_part);
+EXPORT_SYMBOL(cpc_init_part);
+EXPORT_SYMBOL(cpc_shift_acct);
+EXPORT_SYMBOL(cpc_pages_per_device);
+
+#endif /* CONFIG_CONTROL_PAGE_CACHE */
/**
* add_disk - add partitioning information to kernel list
* @disk: per-device partitioning information
@@ -613,6 +1675,12 @@ void add_disk(struct gendisk *disk)
blk_register_region(disk_devt(disk), disk->minors, NULL,
exact_match, exact_lock, disk);
+#ifdef CONFIG_CONTROL_PAGE_CACHE
+ /* part0 is special case when no partition table
+ * no page fetch from device before this point
+ */
+ cpc_init_part(&disk->part0);
+#endif /* CONFIG_CONTROL_PAGE_CACHE */
register_disk(disk);
blk_register_queue(disk);
@@ -645,7 +1713,9 @@ void del_gendisk(struct gendisk *disk)
delete_partition(disk, part->partno);
}
disk_part_iter_exit(&piter);
-
+#ifdef CONFIG_CONTROL_PAGE_CACHE
+ cpc_del_part(&disk->part0);
+#endif /*CONFIG_CONTROL_PAGE_CACHE */
invalidate_partition(disk, 0);
set_capacity(disk, 0);
disk->flags &= ~GENHD_FL_UP;
@@ -1198,6 +2268,13 @@ static const struct file_operations proc_diskstats_operations = {
static int __init proc_genhd_init(void)
{
+#ifdef CONFIG_CONTROL_PAGE_CACHE
+ proc_cpcfs = proc_mkdir("max_limit", NULL);
+ if (proc_cpcfs == NULL)
+ return -1;
+ proc_create("pages_per_device", 0, NULL, &proc_cpc_ppd_ops);
+ proc_create("threads_per_device", 0, NULL, &proc_cpc_ppp_ops);
+#endif /* CONFIG_CONTROL_PAGE_CACHE */
proc_create("diskstats", 0, NULL, &proc_diskstats_operations);
proc_create("partitions", 0, NULL, &proc_partitions_operations);
return 0;
diff --git a/block/partition-generic.c b/block/partition-generic.c
index 1cb4dec..c1b50bc 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -249,6 +249,10 @@ void delete_partition(struct gendisk *disk, int partno)
if (!part)
return;
+#ifdef CONFIG_CONTROL_PAGE_CACHE
+ cpc_del_part(part);
+#endif /* CONFIG_CONTROL_PAGE_CACHE*/
+
rcu_assign_pointer(ptbl->part[partno], NULL);
rcu_assign_pointer(ptbl->last_lookup, NULL);
kobject_put(part->holder_dir);
@@ -356,6 +360,10 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
if (!dev_get_uevent_suppress(ddev))
kobject_uevent(&pdev->kobj, KOBJ_ADD);
+#ifdef CONFIG_CONTROL_PAGE_CACHE
+ cpc_init_part(p);
+#endif /*CONFIG_CONTROL_PAGE_CACHE */
+
hd_ref_init(p);
return p;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 9b43ff7..d19fe5e 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -92,6 +92,8 @@
#include "internal.h"
#include "fd.h"
+#include <linux/genhd.h>
+#include <linux/backing-dev.h>
/* NOTE:
* Implementing inode permission operations in /proc is almost
* certainly an error. Permission checks need to happen during
@@ -2486,6 +2488,163 @@ static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
return err;
}
+#ifdef CONFIG_CONTROL_PAGE_CACHE
+/**
+ * proc_cpc_tsk_ppd_read
+ * Displays the pages per device w.r.t task
+ * @file: file pointer
+ * @buf: user buffer pointer
+ * @count: count
+ * @ppos: position offset
+ *
+ * This function prints the details of the task accounted pages
+ * wrt the device/partition calls cpc_pages_per_device
+ *
+ * CONTEXT:
+ *
+ * RETURNS:
+ * size_t: count
+ *
+ */
+static ssize_t proc_cpc_tsk_ppd_read(struct file *file, char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ struct inode *inode = file->f_path.dentry->d_inode;
+ struct task_struct *tsk = get_proc_task(inode);
+ if (!tsk)
+ return -ESRCH;
+ printk(KERN_EMERG"Task \t :%s\n", tsk->comm);
+ printk(KERN_EMERG"Pid \t :%d\n", tsk->pid);
+ printk(KERN_EMERG"Tgid \t :%d\n", tsk->tgid);
+ if (tsk->pid == tsk->tgid) {
+ printk(KERN_EMERG"total pages \t :%u\n", tsk->total_pages);
+ printk(KERN_EMERG"OOM limit \t :%ld\n", tsk->max_limit);
+ } else {
+ if (tsk->group_leader) {
+ if (!pid_alive(tsk->group_leader))
+ printk(KERN_EMERG"#### NO PARENT TASK\n");
+ get_task_struct(tsk->group_leader);
+ printk(KERN_EMERG"Ppid \t :%u \n", tsk->group_leader->pid);
+ printk(KERN_EMERG"total pages \t :%u \n", tsk->group_leader->total_pages);
+ printk(KERN_EMERG"OOM limit \t :%ld \n", tsk->group_leader->max_limit);
+ put_task_struct(tsk->group_leader);
+ }
+ }
+ cpc_pages_per_device(tsk->pid);
+ return 0;
+
+}
+
+/**
+ * proc_cpc_tsk_limit_write
+ * Update the task Max limit
+ * @file: file pointer
+ * @buf: user buffer pointer
+ * @count: count
+ * @ppos: position offset
+ *
+ * This function updates the max limit of pages fetched
+ * for the process
+ *
+ * CONTEXT:
+ *
+ * RETURNS:
+ * size_t: count
+ *
+ */
+static ssize_t proc_cpc_tsk_limit_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *offs)
+{
+ struct inode *inode = file->f_path.dentry->d_inode;
+ struct task_struct *task = get_proc_task(inode);
+ long max_limit = 0;
+ char buffer[CPC_MAX_BUFFER] = {0};
+ int err = 0;
+
+ if (!task)
+ return -ESRCH;
+
+ if (count > CPC_MAX_BUFFER) {
+ printk(KERN_EMERG"ERROR:Enter value less than 2GB %d ::\n", count);
+ return -EINVAL;
+ }
+
+ if (copy_from_user(buffer, buf, count)) {
+ printk(KERN_EMERG"ERROR:%d\n", count);
+ return -EFAULT;
+ }
+
+ err = strict_strtol(strstrip(buffer), 0, &max_limit);
+ if (err) {
+ printk(KERN_EMERG"ERROR:err %d\n", err);
+ return -EINVAL;
+ }
+ printk(KERN_EMERG"user buf %ld count %d\n", max_limit, count);
+
+ if (max_limit >= CPC_MAX_LIMIT) { /* 2GB*/
+ /* Later we will make it upto the RAM value */
+ printk(KERN_EMERG"Entered value > 2GB\n");
+ return -EINVAL;
+ } else {
+ if (task->pid == task->tgid) {
+ task->max_limit = max_limit;
+ } else {
+ if (task->group_leader) {
+ if (!pid_alive(task->group_leader))
+ printk(KERN_EMERG"#### NO PARENT TASK\n");
+ get_task_struct(task->group_leader);
+ task->group_leader->max_limit = max_limit;
+ put_task_struct(task->group_leader);
+ printk(KERN_EMERG"Entered value %ld\n", task->parent->max_limit);
+ }
+ }
+ }
+ return count;
+}
+/**
+ * proc_cpc_tsk_limit_read
+ * Reads the task Max limit
+ * @file: file pointer
+ * @buf: user buffer pointer
+ * @count: count
+ * @ppos: position offset
+ *
+ * This function reads the max limit of pages fetched
+ * for the process
+ *
+ * CONTEXT:
+ *
+ * RETURNS:
+ * size_t: count
+ *
+ */
+static ssize_t proc_cpc_tsk_limit_read(struct file *file, char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ struct inode *inode = file->f_path.dentry->d_inode;
+ struct task_struct *task = get_proc_task(inode);
+ if (!task)
+ return -ESRCH;
+ if (task->pid == task->tgid)
+ printk(KERN_EMERG"%ld\n", task->max_limit);
+ else {
+ if (task->group_leader) {
+ printk(KERN_EMERG"%ld\n", task->group_leader->max_limit);
+ }
+ }
+ return 0;
+}
+
/* /proc/<pid>/pages_per_device: per-task, per-device page accounting */
static const struct file_operations proc_cpc_tsk_ppd_ops = {
	.read = proc_cpc_tsk_ppd_read,
};
/* /proc/<pid>/max_limit: get/set the task's page-cache limit */
static const struct file_operations proc_cpc_tsk_limit_ops = {
	.read = proc_cpc_tsk_limit_read,
	.write = proc_cpc_tsk_limit_write,
};
+
+#endif /* CONFIG_CONTROL_PAGE_CACHE */
+
/*
* Thread groups
*/
@@ -2530,6 +2689,10 @@ static const struct pid_entry tgid_base_stuff[] = {
LNK("root", proc_root_link),
LNK("exe", proc_exe_link),
REG("mounts", S_IRUGO, proc_mounts_operations),
+#ifdef CONFIG_CONTROL_PAGE_CACHE
+ REG("pages_per_device", S_IRUGO, proc_cpc_tsk_ppd_ops),
+ REG("max_limit", S_IRUGO, proc_cpc_tsk_limit_ops),
+#endif /* CONFIG_CONTROL_PAGE_CACHE */
REG("mountinfo", S_IRUGO, proc_mountinfo_operations),
REG("mountstats", S_IRUSR, proc_mountstats_operations),
#ifdef CONFIG_PROC_PAGE_MONITOR
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 79b8bba..eff0e4b 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -99,6 +99,24 @@ struct partition_meta_info {
u8 volname[PARTITION_META_INFO_VOLNAMELTH];
};
+ #ifdef CONFIG_CONTROL_PAGE_CACHE
+ /* Enables the debugging information */
+ /*#define CPC_DBG 1*/
+ /*#define CPC_DBG_2 1*/
+ #define CPC_MAX_PID_PER_PART (100)
+ #define CPC_MAX_LIMIT 2147483647 /* 2GB*/
+ #define CPC_MAX_BUFFER (12)
+ /**
+ * Control Page Cache Structure
+ */
+ struct cpc_struct{
+ int pid;
+ int tgid;
+ unsigned int actual_pages;
+ struct hlist_node pid_list;
+ };
+ #endif /* CONFIG_CONTROL_PAGE_CACHE */
+
struct hd_struct {
sector_t start_sect;
/*
@@ -126,6 +144,13 @@ struct hd_struct {
#endif
atomic_t ref;
struct rcu_head rcu_head;
+#ifdef CONFIG_CONTROL_PAGE_CACHE
+ unsigned int total_pages;
+ long max_limit;
+ rwlock_t cpc_lock;
+ struct proc_dir_entry *proc_cpcfs;
+ struct hlist_head *pid_tbl;
+#endif /* CONFIG_CONTROL_PAGE_CACHE */
};
#define GENHD_FL_REMOVABLE 1
@@ -190,7 +215,7 @@ struct gendisk {
void *private_data;
int flags;
- struct device *driverfs_dev; // FIXME: remove
+ struct device *driverfs_dev; /* FIXME: remove*/
struct kobject *slave_dir;
struct timer_rand_state *random;
@@ -633,6 +658,17 @@ extern ssize_t part_fail_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count);
#endif /* CONFIG_FAIL_MAKE_REQUEST */
+#ifdef CONFIG_CONTROL_PAGE_CACHE
+int cpc_oom_kill_task(struct task_struct *p);
+void cpc_pages_per_device(int pid);
+void cpc_inc(struct address_space *mapping);
+void cpc_dec(struct address_space *mapping);
+int cpc_add_to_thread(struct address_space *mapping, struct task_struct *tsk);
+int cpc_del_from_thread(struct address_space *mapping, struct task_struct *tsk);
+void cpc_del_part(struct hd_struct *part);
+void cpc_init_part(struct hd_struct *part);
+void cpc_shift_acct(struct task_struct *tsk);
+#endif /* CONFIG_CONTROL_PAGE_CACHE */
static inline void hd_ref_init(struct hd_struct *part)
{
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index c20635c..d108506 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -173,10 +173,14 @@ void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *,
void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *,
unsigned long);
+#ifndef CONFIG_CONTROL_PAGE_CACHE
static inline void page_dup_rmap(struct page *page)
{
atomic_inc(&page->_mapcount);
}
+#else /* CONFIG_CONTROL_PAGE_CACHE */
+void page_dup_rmap(struct page *page);
+#endif /*CONFIG_CONTROL_PAGE_CACHE */
/*
* Called from mm/vmscan.c to handle paging out
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d211247..a0aceb2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1605,6 +1605,11 @@ struct task_struct {
#ifdef CONFIG_UPROBES
struct uprobe_task *utask;
#endif
+#ifdef CONFIG_CONTROL_PAGE_CACHE
+ long max_limit;
+ int total_pages;
+ int dup;
+#endif
};
/* Future-safe accessor for struct task_struct's cpus_allowed. */
diff --git a/kernel/fork.c b/kernel/fork.c
index c535f33..d693159 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -206,6 +206,10 @@ static void account_kernel_stack(struct thread_info *ti, int account)
void free_task(struct task_struct *tsk)
{
+#ifdef CONFIG_CONTROL_PAGE_CACHE
+ if (tsk->pid != tsk->tgid)
+ cpc_shift_acct(tsk);
+#endif /* CONFIG_CONTROL_PAGE_CACHE */
account_kernel_stack(tsk->stack, -1);
arch_release_thread_info(tsk->stack);
free_thread_info(tsk->stack);
@@ -1342,8 +1346,28 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->pid = pid_nr(pid);
p->tgid = p->pid;
+#ifdef CONFIG_CONTROL_PAGE_CACHE
+ /* While duplicating the mm we did not yet have
+ * the pid or the task_struct of the task being
+ * created; now that we do, transfer the pages
+ * the parent duplicated on its behalf
+ * (current->dup) into the child's total_pages
+ * and dup counters.
+ */
+ p->dup = 0;
+ p->max_limit = -1;
+ p->total_pages = current->dup;
+ p->dup = current->dup;
+ current->dup = 0;
+#endif /* CONFIG_CONTROL_PAGE_CACHE */
if (clone_flags & CLONE_THREAD)
p->tgid = current->tgid;
+#ifdef CPC_DBG
+ printk(KERN_EMERG"[%s] created %d by %d dup %d\n", __FUNCTION__,\
+ p->pid, current->pid, p->dup);
+ printk(KERN_EMERG"[%s] p=%s c=%s %d is %s\n", __FUNCTION__,\
+ current->comm, p->comm, p->pid, (clone_flags & CLONE_THREAD) ? "Thread" : "process");
+#endif
p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
/*
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 67604e5..6ecfede 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -763,6 +763,16 @@ config DEBUG_BUGVERBOSE
of the BUG call as well as the EIP and oops trace. This aids
debugging but costs about 70-100K of memory.
+config CONTROL_PAGE_CACHE
+ bool "Control Page Cache"
+ depends on DEBUG_KERNEL
+ help
+ Provide Page cache control for each device.
+ Accounting of Pages in Cache will be maintained.
+ proc interface will have the details (e.g. pages_per_device)
+ Max page cache limit will be set for each device and
+ once the limit is reached oom will be invoked for that device.
+
config DEBUG_INFO
bool "Compile the kernel with debug info"
depends on DEBUG_KERNEL
diff --git a/mm/filemap.c b/mm/filemap.c
index 83efee7..dc4d9d9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -34,6 +34,9 @@
#include <linux/memcontrol.h>
#include <linux/cleancache.h>
#include "internal.h"
+#ifdef CONFIG_CONTROL_PAGE_CACHE
+#include <linux/mm_inline.h>
+#endif /* CONFIG_CONTROL_PAGE_CACHE */
/*
* FIXME: remove all knowledge of the buffer layer from the core VM
@@ -128,6 +131,10 @@ void __delete_from_page_cache(struct page *page)
/* Leave page->index set: truncation lookup relies upon it */
mapping->nrpages--;
__dec_zone_page_state(page, NR_FILE_PAGES);
+#ifdef CONFIG_CONTROL_PAGE_CACHE
+ if (page_is_file_cache(page))
+ cpc_dec(mapping);
+#endif /*CONFIG_CONTROL_PAGE_CACHE */
if (PageSwapBacked(page))
__dec_zone_page_state(page, NR_SHMEM);
BUG_ON(page_mapped(page));
@@ -463,6 +470,10 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
if (likely(!error)) {
mapping->nrpages++;
__inc_zone_page_state(page, NR_FILE_PAGES);
+#ifdef CONFIG_CONTROL_PAGE_CACHE
+ if (page_is_file_cache(page))
+ cpc_inc(mapping);
+#endif /*CONFIG_CONTROL_PAGE_CACHE */
spin_unlock_irq(&mapping->tree_lock);
} else {
page->mapping = NULL;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 0399f14..9aff3a7 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -392,6 +392,23 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
dump_tasks(memcg, nodemask);
}
+#ifdef CONFIG_CONTROL_PAGE_CACHE
+/**
+ * cpc_oom_kill_task
+ * Invoke OOM on the task.
+ * @p: task structure.
+ *
+ * RETURNS:
+ * 0: on success
+ * negative errno: on failure (from do_send_sig_info)
+ *
+ */
+int cpc_oom_kill_task(struct task_struct *p)
+{
+ return do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
+}
+EXPORT_SYMBOL(cpc_oom_kill_task);
+#endif /* CONFIG_CONTROL_PAGE_CACHE */
#define K(x) ((x) << (PAGE_SHIFT-10))
/*
* Must be called while holding a reference to p, which will be released upon
diff --git a/mm/rmap.c b/mm/rmap.c
index 2c78f8c..1d19765 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -61,6 +61,10 @@
#include <asm/tlbflush.h>
#include "internal.h"
+#ifdef CONFIG_CONTROL_PAGE_CACHE
+#include <linux/genhd.h>
+#include <linux/mm_inline.h>
+#endif /* CONFIG_CONTROL_PAGE_CACHE */
static struct kmem_cache *anon_vma_cachep;
static struct kmem_cache *anon_vma_chain_cachep;
@@ -1116,7 +1120,59 @@ void page_add_file_rmap(struct page *page)
mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED);
}
mem_cgroup_end_update_page_stat(page, &locked, &flags);
+
+#ifdef CONFIG_CONTROL_PAGE_CACHE
+ if (page_is_file_cache(page)) {
+ if (current->pid == current->tgid) {
+ current->total_pages++;
+ } else {
+ if (current->group_leader) {
+ get_task_struct(current->group_leader);
+ current->group_leader->total_pages++;
+ put_task_struct(current->group_leader);
+ }
+ }
+ cpc_add_to_thread(page->mapping, current);
+#ifdef CPC_DBG
+ printk(KERN_EMERG"[%s]AddingA %d P[0x%x,C%d]\n", \
+ __FUNCTION__, current->pid, page_address(page), \
+ page_mapcount(page));
+#endif /*CPC_DBG */
+ }
+#endif /* CONFIG_CONTROL_PAGE_CACHE */
+}
+
+#ifdef CONFIG_CONTROL_PAGE_CACHE
+/**
+ * page_dup_rmap - duplicate pte mapping to a page
+ * @page: the page to add the mapping to
+ *
+ * Increments the page's _mapcount for the duplicated mapping.
+ *
+ * For copy_page_range only: minimal extract from page_add_file_rmap /
+ * page_add_anon_rmap, avoiding unnecessary tests (already checked) so it's
+ * quicker.
+ *
+ * The caller needs to hold the pte lock.
+ * Moved here from rmap.h (where it was a static inline) so that
+ * the CONFIG_CONTROL_PAGE_CACHE dup accounting below could be added.
+ */
+
+void page_dup_rmap(struct page *page)
+{
+
+ atomic_inc(&page->_mapcount);
+ if (page_is_file_cache(page)) {
+#ifdef CPC_DBG
+ printk(KERN_EMERG"[%s]DUP P[0x%x,C%d] %d\n",\
+ __FUNCTION__, page_address(page), page_mapcount(page),\
+ current->pid);
+#endif
+ current->dup++;
+ }
}
+#endif /* CONFIG_CONTROL_PAGE_CACHE */
+
/**
* page_remove_rmap - take down pte mapping from a page
@@ -1139,10 +1195,49 @@ void page_remove_rmap(struct page *page)
if (!anon)
mem_cgroup_begin_update_page_stat(page, &locked, &flags);
+#ifndef CONFIG_CONTROL_PAGE_CACHE
/* page still mapped by someone else? */
if (!atomic_add_negative(-1, &page->_mapcount))
goto out;
-
+#else /* CONFIG_CONTROL_PAGE_CACHE */
+ if (!atomic_add_negative(-1, &page->_mapcount)) {
+ if (page_is_file_cache(page)) {
+ if (current->pid == current->tgid) {
+ current->total_pages--;
+ } else {
+ if (current->group_leader) {
+ get_task_struct(current->group_leader);
+ current->group_leader->total_pages--;
+ put_task_struct(current->group_leader);
+ }
+ }
+ cpc_del_from_thread(page->mapping, current);
+#ifdef CPC_DBG
+ printk(KERN_EMERG"[%s]RemovingS %d P[0x%x,C%d]\n",\
+ __FUNCTION__, current->pid, page_address(page),\
+ page_mapcount(page));
+#endif /* CPC_DBG */
+ }
+ return;
+ }
+ if (page_is_file_cache(page)) {
+ if (current->pid == current->tgid) {
+ current->total_pages--;
+ } else {
+ if (current->group_leader) {
+ get_task_struct(current->group_leader);
+ current->group_leader->total_pages--;
+ put_task_struct(current->group_leader);
+ }
+ }
+ cpc_del_from_thread(page->mapping, current);
+#ifdef CPC_DBG
+ printk(KERN_EMERG"[%s]RemovingA %d P[0x%x,C%d]\n",\
+ __FUNCTION__, current->pid, page_address(page),\
+ page_mapcount(page));
+#endif /* CPC_DBG */
+ }
+#endif /* CONFIG_CONTROL_PAGE_CACHE */
/*
* Now that the last pte has gone, s390 must transfer dirty
* flag from storage key to struct page. We can usually skip
--
1.7.9.5