Serge E. Hallyn wrote: > Quoting Nadia.Derbey@xxxxxxxx (Nadia.Derbey@xxxxxxxx): > >>[PATCH 01/05] >> >>This patch adds the procfs facility needed to feed some data for the >>next syscall to be called. >> >>The effect of issuing >>echo "LONG<Y> <XX>" > /proc/self/task/<tid>/next_syscall_data >>is that <XX> will be stored in a new field of the task structure >>(next_syscall_data). This field, in turn will be taken as the data to feed >>next syscall that supports the feature. >> >><Y> is the number of values provided on the line. >>For the sake of simplicity it is now fixed to 1, but this can be extended as >>needed, in the future. >> >>This is particularly useful when restarting an application, as we need >>sometimes the syscalls to have a non-default behavior. >> >>Signed-off-by: Nadia Derbey <Nadia.Derbey@xxxxxxxx> >> >>--- >> fs/exec.c | 6 + >> fs/proc/base.c | 75 ++++++++++++++++++ >> include/linux/next_syscall_data.h | 35 ++++++++ >> include/linux/sched.h | 6 + >> kernel/Makefile | 3 >> kernel/exit.c | 4 + >> kernel/fork.c | 2 >> kernel/next_syscall_data.c | 151 ++++++++++++++++++++++++++++++++++++++ >> 8 files changed, 281 insertions(+), 1 deletion(-) >> >>Index: linux-2.6.26-rc5-mm3/include/linux/sched.h >>=================================================================== >>--- linux-2.6.26-rc5-mm3.orig/include/linux/sched.h 2008-06-25 17:10:38.000000000 +0200 >>+++ linux-2.6.26-rc5-mm3/include/linux/sched.h 2008-06-27 14:18:56.000000000 +0200 >>@@ -87,6 +87,7 @@ struct sched_param { >> #include <linux/task_io_accounting.h> >> #include <linux/kobject.h> >> #include <linux/latencytop.h> >>+#include <linux/next_syscall_data.h> >> >> #include <asm/processor.h> >> >>@@ -1312,6 +1313,11 @@ struct task_struct { >> int latency_record_count; >> struct latency_record latency_record[LT_SAVECOUNT]; >> #endif >>+ /* >>+ * If non-NULL indicates that next operation will be forced, e.g. >>+ * that next object to be created will have a predefined id. >>+ */ >>+ struct next_syscall_data *nsd; >> }; >> >> /* >>Index: linux-2.6.26-rc5-mm3/include/linux/next_syscall_data.h >>=================================================================== >>--- /dev/null 1970-01-01 00:00:00.000000000 +0000 >>+++ linux-2.6.26-rc5-mm3/include/linux/next_syscall_data.h 2008-07-01 10:25:48.000000000 +0200 >>@@ -0,0 +1,35 @@ >>+/* >>+ * include/linux/next_syscall_data.h >>+ * >>+ * Definitions to support fixed data for next syscall to be called. The >>+ * following is supported today: >>+ * . object creation with a predefined id. >>+ * >>+ */ >>+ >>+#ifndef _LINUX_NEXT_SYSCALL_DATA_H >>+#define _LINUX_NEXT_SYSCALL_DATA_H >>+ >>+#define NDATA 1 >>+ >>+/* >>+ * If this structure is pointed to by a task_struct, next syscall to be called >>+ * by the task will have a non-default behavior. >>+ * For example, it can be used to pre-set the id of the object to be created >>+ * by next syscall. >>+ */ >>+struct next_syscall_data { >>+ int ndata; >>+ long data[NDATA]; >>+}; >>+ >>+extern ssize_t get_next_syscall_data(struct task_struct *, char *, size_t); >>+extern int set_next_syscall_data(struct task_struct *, char *); >>+extern int reset_next_syscall_data(struct task_struct *); >>+ >>+static inline void exit_next_syscall_data(struct task_struct *tsk) >>+{ >>+ reset_next_syscall_data(tsk); >>+} >>+ >>+#endif /* _LINUX_NEXT_SYSCALL_DATA_H */ >>Index: linux-2.6.26-rc5-mm3/fs/proc/base.c >>=================================================================== >>--- linux-2.6.26-rc5-mm3.orig/fs/proc/base.c 2008-06-25 17:11:04.000000000 +0200 >>+++ linux-2.6.26-rc5-mm3/fs/proc/base.c 2008-07-01 09:09:30.000000000 +0200 >>@@ -1158,6 +1158,76 @@ static const struct file_operations proc >> }; >> #endif >> >>+static ssize_t next_syscall_data_read(struct file *file, char __user *buf, >>+ size_t count, loff_t *ppos) >>+{ >>+ struct task_struct *task; >>+ char *page; >>+ ssize_t length; >>+ >>+ task = get_proc_task(file->f_path.dentry->d_inode); >>+ if (!task) >>+ return -ESRCH; >>+ >>+ if (count >= PAGE_SIZE) >>+ count = PAGE_SIZE - 1; >>+ >>+ length = -ENOMEM; >>+ page = (char *) __get_free_page(GFP_TEMPORARY); >>+ if (!page) >>+ goto out; >>+ >>+ length = get_next_syscall_data(task, (char *) page, count); >>+ if (length >= 0) >>+ length = simple_read_from_buffer(buf, count, ppos, >>+ (char *)page, length); >>+ free_page((unsigned long) page); >>+ >>+out: >>+ put_task_struct(task); >>+ return length; >>+} >>+ >>+static ssize_t next_syscall_data_write(struct file *file, >>+ const char __user *buf, >>+ size_t count, loff_t *ppos) >>+{ >>+ struct inode *inode = file->f_path.dentry->d_inode; >>+ char *page; >>+ ssize_t length; >>+ >>+ if (pid_task(proc_pid(inode), PIDTYPE_PID) != current) >>+ return -EPERM; >>+ >>+ if (count >= PAGE_SIZE) >>+ count = PAGE_SIZE - 1; >>+ >>+ if (*ppos != 0) { >>+ /* No partial writes. */ >>+ return -EINVAL; >>+ } >>+ page = (char *)__get_free_page(GFP_TEMPORARY); >>+ if (!page) >>+ return -ENOMEM; >>+ length = -EFAULT; >>+ if (copy_from_user(page, buf, count)) >>+ goto out_free_page; >>+ >>+ page[count] = '\0'; >>+ >>+ length = set_next_syscall_data(current, page); >>+ if (!length) >>+ length = count; >>+ >>+out_free_page: >>+ free_page((unsigned long) page); >>+ return length; >>+} >>+ >>+static const struct file_operations proc_next_syscall_data_operations = { >>+ .read = next_syscall_data_read, >>+ .write = next_syscall_data_write, >>+}; >> >> #ifdef CONFIG_SCHED_DEBUG >> /* >>@@ -2853,6 +2923,11 @@ static const struct pid_entry tid_base_s >> #ifdef CONFIG_TASK_IO_ACCOUNTING >> INF("io", S_IRUGO, tid_io_accounting), >> #endif >>+ /* >>+ * NOTE that this file is not added into tgid_base_stuff[] since it >>+ * has to be specified on a per-thread basis. >>+ */ >>+ REG("next_syscall_data", S_IRUGO|S_IWUSR, next_syscall_data), >> }; >> >> static int proc_tid_base_readdir(struct file * filp, >>Index: linux-2.6.26-rc5-mm3/kernel/Makefile >>=================================================================== >>--- linux-2.6.26-rc5-mm3.orig/kernel/Makefile 2008-06-25 17:10:41.000000000 +0200 >>+++ linux-2.6.26-rc5-mm3/kernel/Makefile 2008-06-27 09:03:01.000000000 +0200 >>@@ -9,7 +9,8 @@ obj-y = sched.o fork.o exec_domain.o >> rcupdate.o extable.o params.o posix-timers.o \ >> kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ >> hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ >>- notifier.o ksysfs.o pm_qos_params.o sched_clock.o >>+ notifier.o ksysfs.o pm_qos_params.o sched_clock.o \ >>+ next_syscall_data.o >> >> CFLAGS_REMOVE_sched.o = -pg -mno-spe >> >>Index: linux-2.6.26-rc5-mm3/kernel/next_syscall_data.c >>=================================================================== >>--- /dev/null 1970-01-01 00:00:00.000000000 +0000 >>+++ linux-2.6.26-rc5-mm3/kernel/next_syscall_data.c 2008-07-01 10:39:43.000000000 +0200 >>@@ -0,0 +1,151 @@ >>+/* >>+ * linux/kernel/next_syscall_data.c >>+ * >>+ * >>+ * Provide the get_next_syscall_data() / set_next_syscall_data() routines >>+ * (called from fs/proc/base.c). >>+ * They allow to specify some particular data for the next syscall to be >>+ * called. >>+ * E.g. they can be used to specify the id for the next resource to be >>+ * allocated, instead of letting the allocator set it for us. >>+ */ >>+ >>+#include <linux/sched.h> >>+#include <linux/ctype.h> >>+ >>+ >>+ >>+ssize_t get_next_syscall_data(struct task_struct *task, char *buffer, >>+ size_t size) >>+{ >>+ struct next_syscall_data *nsd; >>+ char *bufptr = buffer; >>+ ssize_t rc, count = 0; >>+ int i; >>+ >>+ nsd = task->nsd; >>+ if (!nsd || !nsd->ndata) >>+ return snprintf(buffer, size, "UNSET\n"); >>+ >>+ count = snprintf(bufptr, size, "LONG%d ", nsd->ndata); >>+ >>+ for (i = 0; i < nsd->ndata - 1; i++) { >>+ rc = snprintf(&bufptr[count], size - count, "%ld ", >>+ nsd->data[i]); >>+ if (rc >= size - count) >>+ return -ENOMEM; >>+ count += rc; >>+ } >>+ >>+ rc = snprintf(&bufptr[count], size - count, "%ld\n", nsd->data[i]); >>+ if (rc >= size - count) >>+ return -ENOMEM; >>+ count += rc; >>+ >>+ return count; >>+} >>+ >>+static int fill_next_syscall_data(struct task_struct *task, int ndata, >>+ char *buffer) >>+{ >>+ char *token, *buff = buffer; >>+ char *end; >>+ struct next_syscall_data *nsd = task->nsd; >>+ int i; >>+ >>+ if (!nsd) { >>+ nsd = kmalloc(sizeof(*nsd), GFP_KERNEL); >>+ if (!nsd) >>+ return -ENOMEM; >>+ task->nsd = nsd; >>+ } >>+ >>+ nsd->ndata = ndata; >>+ >>+ i = 0; >>+ while ((token = strsep(&buff, " ")) != NULL && i < ndata) { >>+ long data; >>+ >>+ if (!*token) >>+ goto out_free; >>+ data = simple_strtol(token, &end, 0); >>+ if (end == token || (*end && !isspace(*end))) >>+ goto out_free; >>+ nsd->data[i] = data; >>+ i++; >>+ } >>+ >>+ if (i != ndata) >>+ goto out_free; >>+ >>+ return 0; >>+ >>+out_free: >>+ kfree(nsd); > Serge, Thanks for reviewing this so fast! > > Shouldn't you also reset task->nsd to NULL here? :-) Oh yes! > > >>+ return -EINVAL; >>+} >>+ >>+/* >>+ * Parses a line with the following format: >>+ * <x> <id0> ... <idx-1> >>+ * Currently, only x=1 is accepted. >>+ * Any trailing character on the line is skipped. >>+ */ >>+static int do_set_next_syscall_data(struct task_struct *task, char *nb, >>+ char *buffer) >>+{ >>+ int ndata; >>+ char *end; >>+ >>+ ndata = simple_strtol(nb, &end, 0); >>+ if (*end) >>+ return -EINVAL; >>+ >>+ if (ndata > NDATA) >>+ return -EINVAL; >>+ >>+ return fill_next_syscall_data(task, ndata, buffer); >>+} >>+ >>+int reset_next_syscall_data(struct task_struct *task) > > > Why have this return an int? It always returns 0, and callers ignore > the return value. > You're right, will change it to a void. > >>+{ >>+ struct next_syscall_data *nsd; >>+ >>+ nsd = task->nsd; >>+ if (!nsd) >>+ return 0; >>+ >>+ task->nsd = NULL; >>+ kfree(nsd); >>+ return 0; >>+} >>+ >>+#define LONG_STR "LONG" >>+#define RESET_STR "RESET" >>+ >>+/* >>+ * Parses a line written to /proc/self/task/<my_tid>/next_syscall_data. >>+ * this line has the following format: >>+ * LONG<x> id --> a sequence of id(s) is specified >>+ * currently, only x=1 is accepted >>+ */ >>+int set_next_syscall_data(struct task_struct *task, char *buffer) >>+{ >>+ char *token, *out = buffer; >>+ size_t sz; >>+ >>+ if (!out) >>+ return -EINVAL; >>+ >>+ token = strsep(&out, " "); >>+ >>+ sz = strlen(LONG_STR); >>+ >>+ if (!strncmp(token, LONG_STR, sz)) >>+ return do_set_next_syscall_data(task, token + sz, out); >>+ >>+ if (!strncmp(token, RESET_STR, strlen(RESET_STR))) >>+ return reset_next_syscall_data(task); >>+ >>+ return -EINVAL; >>+} >>Index: linux-2.6.26-rc5-mm3/kernel/fork.c >>=================================================================== >>--- linux-2.6.26-rc5-mm3.orig/kernel/fork.c 2008-06-25 17:10:41.000000000 +0200 >>+++ linux-2.6.26-rc5-mm3/kernel/fork.c 2008-07-01 10:25:46.000000000 +0200 >>@@ -1077,6 +1077,8 @@ static struct task_struct *copy_process( >> p->blocked_on = NULL; /* not blocked yet */ >> #endif >> >>+ p->nsd = NULL; /* no next syscall data is the default */ >>+ >> /* Perform scheduler related setup. Assign this task to a CPU. */ >> sched_fork(p, clone_flags); >> >>Index: linux-2.6.26-rc5-mm3/fs/exec.c >>=================================================================== >>--- linux-2.6.26-rc5-mm3.orig/fs/exec.c 2008-06-25 17:11:05.000000000 +0200 >>+++ linux-2.6.26-rc5-mm3/fs/exec.c 2008-06-27 14:53:08.000000000 +0200 >>@@ -1014,6 +1014,12 @@ int flush_old_exec(struct linux_binprm * >> flush_signal_handlers(current, 0); >> flush_old_files(current->files); >> >>+ /* >>+ * the next syscall data is not inherited across execve() >>+ */ >>+ if (unlikely(current->nsd)) >>+ reset_next_syscall_data(current); >>+ >> return 0; >> >> out: >>Index: linux-2.6.26-rc5-mm3/kernel/exit.c >>=================================================================== >>--- linux-2.6.26-rc5-mm3.orig/kernel/exit.c 2008-06-25 17:10:41.000000000 +0200 >>+++ linux-2.6.26-rc5-mm3/kernel/exit.c 2008-06-27 14:57:55.000000000 +0200 >>@@ -1069,6 +1069,10 @@ NORET_TYPE void do_exit(long code) >> >> proc_exit_connector(tsk); >> exit_notify(tsk, group_dead); >>+ >>+ if (unlikely(tsk->nsd)) >>+ exit_next_syscall_data(tsk); >>+ >> #ifdef CONFIG_NUMA >> mpol_put(tsk->mempolicy); >> tsk->mempolicy = NULL; >> >>-- > > > _______________________________________________ Containers mailing list Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx https://lists.linux-foundation.org/mailman/listinfo/containers