Signed-off-by: Nagarathnam Muthusamy <nagarathnam.muthusamy@xxxxxxxxxx>
---
arch/x86/entry/syscalls/syscall_32.tbl | 1 +
arch/x86/entry/syscalls/syscall_64.tbl | 2 +
fs/nsfs.c | 9 +-
fs/proc/namespaces.c | 1 +
include/linux/ns_common.h | 1 +
include/linux/pid_namespace.h | 3 +
include/linux/proc_ns.h | 1 +
include/linux/syscalls.h | 1 +
kernel/pid_namespace.c | 190 ++++++++++++++++++++++++++++++++-
kernel/sys_ni.c | 4 +
10 files changed, 208 insertions(+), 5 deletions(-)
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 448ac21..31bf798 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -391,3 +391,4 @@
382 i386 pkey_free sys_pkey_free
383 i386 statx sys_statx
384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl
+385 i386 translate_pid sys_translate_pid compat_sys_translate_pid
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 5aef183..89196c3 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -339,6 +339,7 @@
330 common pkey_alloc sys_pkey_alloc
331 common pkey_free sys_pkey_free
332 common statx sys_statx
+333 64 translate_pid sys_translate_pid
#
# x32-specific system call numbers start at 512 to avoid cache impact
@@ -380,3 +381,4 @@
545 x32 execveat compat_sys_execveat/ptregs
546 x32 preadv2 compat_sys_preadv64v2
547 x32 pwritev2 compat_sys_pwritev64v2
+548 x32 translate_pid compat_sys_translate_pid
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 36b0772..c635465 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -222,8 +222,13 @@ int ns_get_name(char *buf, size_t size, struct task_struct *task,
const char *name;
ns = ns_ops->get(task);
if (ns) {
- name = ns_ops->real_ns_name ? : ns_ops->name;
- res = snprintf(buf, size, "%s:[%u]", name, ns->inum);
+ if (!strcmp(ns_ops->name, "pidns_id")) {
+ res = snprintf(buf, size, "[%llu]",
+ (unsigned long long)ns->ns_id);
+ } else {
+ name = ns_ops->real_ns_name ? : ns_ops->name;
+ res = snprintf(buf, size, "%s:[%u]", name, ns->inum);
+ }
ns_ops->put(ns);
}
return res;
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 59b17e5..ac823ce 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -24,6 +24,7 @@
#endif
#ifdef CONFIG_PID_NS
&pidns_operations,
+ &pidns_id_operations,
&pidns_for_children_operations,
#endif
#ifdef CONFIG_USER_NS
diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h
index 5fbc400..6ca3d43 100644
--- a/include/linux/ns_common.h
+++ b/include/linux/ns_common.h
@@ -8,6 +8,7 @@ struct ns_common {
atomic_long_t stashed;
const struct proc_ns_operations *ops;
unsigned int inum;
+ u64 ns_id;
};
#endif
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index 49538b1..11d1d57 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -11,6 +11,7 @@
#include <linux/kref.h>
#include <linux/ns_common.h>
#include <linux/idr.h>
+#include <linux/list_bl.h>
struct fs_pin;
@@ -44,6 +45,8 @@ struct pid_namespace {
kgid_t pid_gid;
int hide_pid;
int reboot; /* group exit code if this pidns was rebooted */
+ struct hlist_bl_node node;
+ atomic_t lookups_pending;
struct ns_common ns;
} __randomize_layout;
diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
index d31cb62..861e38bd 100644
--- a/include/linux/proc_ns.h
+++ b/include/linux/proc_ns.h
@@ -28,6 +28,7 @@ struct proc_ns_operations {
extern const struct proc_ns_operations utsns_operations;
extern const struct proc_ns_operations ipcns_operations;
extern const struct proc_ns_operations pidns_operations;
+extern const struct proc_ns_operations pidns_id_operations;
extern const struct proc_ns_operations pidns_for_children_operations;
extern const struct proc_ns_operations userns_operations;
extern const struct proc_ns_operations mntns_operations;
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index a78186d..574349a 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -901,6 +901,7 @@ asmlinkage long sys_open_by_handle_at(int mountdirfd,
struct file_handle __user *handle,
int flags);
asmlinkage long sys_setns(int fd, int nstype);
+asmlinkage long sys_translate_pid(pid_t pid, u64 source, u64 target);
asmlinkage long sys_process_vm_readv(pid_t pid,
const struct iovec __user *lvec,
unsigned long liovcnt,
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 0b53eef..ff83aa8 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -22,6 +22,12 @@
#include <linux/sched/task.h>
#include <linux/sched/signal.h>
#include <linux/idr.h>
+#include <linux/random.h>
+#include <linux/compat.h>
+
+#define PID_NS_ID_HASH_BITS 9
+
+struct hlist_bl_head *pid_ns_hash;
struct pid_cache {
int nr_ids;
@@ -34,6 +40,13 @@ struct pid_cache {
static DEFINE_MUTEX(pid_caches_mutex);
static struct kmem_cache *pid_ns_cachep;
+/* Return the bucket of @hash that @key hashes to (PID_NS_ID_HASH_BITS wide). */
+static inline struct hlist_bl_head *
+pid_ns_hash_head(struct hlist_bl_head *hash, uint64_t key)
+{
+	return hash + hash_64(key, PID_NS_ID_HASH_BITS);
+}
+
/*
* creates the kmem cache to allocate pids from.
* @nr_ids: the number of numerical ids this pid will have to carry
@@ -93,12 +106,24 @@ static void dec_pid_namespaces(struct ucounts *ucounts)
dec_ucount(ucounts, UCOUNT_PID_NAMESPACES);
}
+static inline u64 get_namespace_id(void)
+{
+	u64 id;
+
+	do {
+		get_random_bytes(&id, sizeof(id));
+	} while (id == 0);	/* redraw: zero is never handed out */
+	return id;
+}
+
static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns,
struct pid_namespace *parent_pid_ns)
{
- struct pid_namespace *ns;
- unsigned int level = parent_pid_ns->level + 1;
+ struct pid_namespace *ns, *pt;
struct ucounts *ucounts;
+ struct hlist_bl_head *head;
+ struct hlist_bl_node *dup_node;
+ unsigned int level = parent_pid_ns->level + 1;
int err;
err = -EINVAL;
@@ -135,7 +160,24 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
ns->ucounts = ucounts;
ns->pid_allocated = PIDNS_ADDING;
INIT_WORK(&ns->proc_work, proc_cleanup_work);
-
+	/*
+	 * On an id clash drop the bucket lock and retry from scratch: the
+	 * replacement id may hash to a different bucket.
+	 */
+	do {
+		ns->ns.ns_id = get_namespace_id();
+		head = pid_ns_hash_head(pid_ns_hash, ns->ns.ns_id);
+		hlist_bl_lock(head);
+		hlist_bl_for_each_entry(pt, dup_node, head, node) {
+			if (ns->ns.ns_id == pt->ns.ns_id)
+				break;
+		}
+		/* dup_node is non-NULL only when the walk stopped on a clash */
+		if (dup_node)
+			hlist_bl_unlock(head);
+	} while (dup_node);
+	hlist_bl_add_head(&ns->node, head);
+	hlist_bl_unlock(head);
return ns;
out_free_idr:
@@ -159,6 +201,30 @@ static void delayed_free_pidns(struct rcu_head *p)
static void destroy_pid_namespace(struct pid_namespace *ns)
{
+ struct pid_namespace *ph;
+ struct hlist_bl_head *head;
+ struct hlist_bl_node *dup_node;
+
+ /*
+ * Remove the namespace structure from the hash table so
+ * no new lookups can start on it.
+ */
+ if (ns->ns.ns_id) {
+ head = pid_ns_hash_head(pid_ns_hash, ns->ns.ns_id);
+ hlist_bl_lock(head);
+ hlist_bl_for_each_entry(ph, dup_node, head, node) {
+ if (ns->ns.ns_id == ph->ns.ns_id) {
+ hlist_bl_del_init(&ph->node);
+ break;
+ }
+ }
+ hlist_bl_unlock(head);
+ }
+ /*
+ * Wait for pending lookups to complete.
+ */
+ while (atomic_read(&ns->lookups_pending))
+ cpu_relax();
ns_free_inum(&ns->ns);
idr_destroy(&ns->idr);
@@ -463,6 +529,17 @@ static struct user_namespace *pidns_owner(struct ns_common *ns)
.get_parent = pidns_get_parent,
};
+const struct proc_ns_operations pidns_id_operations = {
+	.name		= "pidns_id",	/* matched by name in ns_get_name() */
+	.real_ns_name	= "pid",
+	.type		= CLONE_NEWPID,
+	.owner		= pidns_owner,
+	.get_parent	= pidns_get_parent,
+	.get		= pidns_get,
+	.put		= pidns_put,
+	.install	= pidns_install,
+};
+
const struct proc_ns_operations pidns_for_children_operations = {
.name = "pid_for_children",
.real_ns_name = "pid",
@@ -474,9 +551,116 @@ static struct user_namespace *pidns_owner(struct ns_common *ns)
.get_parent = pidns_get_parent,
};
+/*
+ * pidns_lookup_pin - find a pid namespace by id and pin it for a lookup.
+ * @id: pid-ns id; zero selects init_pid_ns, which is used without locking
+ * @nsp: receives the namespace; only meaningful when 0 is returned
+ *
+ * The permission check and the ->lookups_pending increment happen under
+ * the hash bucket lock.  The caller undoes the pin with atomic_dec().
+ *
+ * Return 0 on success, -EINVAL if no namespace has @id, -ESRCH if it has
+ * no reaper task, or -EPERM if the caller may not inspect the reaper.
+ */
+static int pidns_lookup_pin(u64 id, struct pid_namespace **nsp)
+{
+	struct pid_namespace *ns = NULL, *pos;
+	struct hlist_bl_head *head = NULL;
+	struct hlist_bl_node *cursor;
+	int err = 0;
+
+	if (!id) {
+		ns = &init_pid_ns;
+	} else {
+		head = pid_ns_hash_head(pid_ns_hash, id);
+		hlist_bl_lock(head);
+		hlist_bl_for_each_entry(pos, cursor, head, node) {
+			if (pos->ns.ns_id == id) {
+				ns = pos;
+				break;
+			}
+		}
+	}
+
+	/*
+	 * NOTE(review): a namespace is hashed at creation; child_reaper is
+	 * presumably still NULL until a task is forked into it -- confirm.
+	 */
+	if (!ns)
+		err = -EINVAL;
+	else if (!ns->child_reaper)
+		err = -ESRCH;
+	else if (!ptrace_may_access(ns->child_reaper,
+				    PTRACE_MODE_READ_FSCREDS))
+		err = -EPERM;
+	else
+		atomic_inc(&ns->lookups_pending);
+	if (head)
+		hlist_bl_unlock(head);
+	*nsp = ns;
+	return err;
+}
+
+/*
+ * translate_pid - convert pid in source pid-ns into target pid-ns.
+ * @pid: pid for translation
+ * @source: pid-ns id, zero for the initial pid-ns
+ * @target: pid-ns id, zero for the initial pid-ns
+ *
+ * Return pid in @target pid-ns, zero if the task has no pid there, -ESRCH
+ * if @pid is not found in @source pid-ns or a pid-ns has no reaper task,
+ * -EINVAL if an id names no pid-ns, -EPERM if pid-ns access is denied.
+ */
+SYSCALL_DEFINE3(translate_pid, pid_t, pid, u64, source,
+		u64, target)
+{
+	struct pid_namespace *source_ns, *target_ns;
+	struct pid *struct_pid;
+	int ret;
+
+	ret = pidns_lookup_pin(source, &source_ns);
+	if (ret)
+		return ret;
+	ret = pidns_lookup_pin(target, &target_ns);
+	if (ret)
+		goto out_unpin_source;
+
+	rcu_read_lock();
+	struct_pid = find_pid_ns(pid, source_ns);
+	ret = struct_pid ? pid_nr_ns(struct_pid, target_ns) : -ESRCH;
+	rcu_read_unlock();
+	atomic_dec(&target_ns->lookups_pending);
+out_unpin_source:
+	atomic_dec(&source_ns->lookups_pending);
+	return ret;
+}
+
+#ifdef CONFIG_COMPAT
+/* Compat entry: each u64 id arrives as two u32 halves, ordered by endianness. */
+COMPAT_SYSCALL_DEFINE5(translate_pid, pid_t, pid, u32, s0, u32, s1,
+		       u32, t0, u32, t1)
+{
+#ifdef __BIG_ENDIAN
+	u64 source = ((u64)s0 << 32) | s1, target = ((u64)t0 << 32) | t1;
+#else
+	u64 source = ((u64)s1 << 32) | s0, target = ((u64)t1 << 32) | t0;
+#endif
+	return sys_translate_pid(pid, source, target);
+}
+#endif
+
static __init int pid_namespaces_init(void)
{
+	unsigned long i, bucket_count = 1UL << PID_NS_ID_HASH_BITS;
+
pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
+	pid_ns_hash = kmalloc_array(bucket_count, sizeof(*pid_ns_hash),
+				    GFP_KERNEL);
+	/* Same policy as SLAB_PANIC above: without the table boot cannot go on. */
+	if (!pid_ns_hash)
+		panic("Cannot allocate pid namespace id hash table\n");
+	for (i = 0; i < bucket_count; i++)
+		INIT_HLIST_BL_HEAD(&pid_ns_hash[i]);
#ifdef CONFIG_CHECKPOINT_RESTORE
register_sysctl_paths(kern_path, pid_ns_ctl_table);
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index b518976..467255f 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -259,3 +259,7 @@ asmlinkage long sys_ni_syscall(void)
cond_syscall(sys_pkey_mprotect);
cond_syscall(sys_pkey_alloc);
cond_syscall(sys_pkey_free);
+
+/* pid translation */
+cond_syscall(sys_translate_pid);
+cond_syscall(compat_sys_translate_pid);