On Fri, Mar 24, 2023 at 09:54:48AM +0100, Vlastimil Babka wrote: > On 3/24/23 01:18, Masami Hiramatsu (Google) wrote: > > Hi Beau, > > > > On Tue, 21 Feb 2023 13:11:43 -0800 > > Beau Belgrave <beaub@xxxxxxxxxxxxxxxxxxx> wrote: > > > >> Operators want to be able to ensure enough tracepoints exist on the > >> system for kernel components as well as for user components. Since there > >> are only up to 64K events, by default allow up to half to be used by > >> user events. > >> > >> Add a boot parameter (user_events_max=%d) and a kernel sysctl parameter > >> (kernel.user_events_max) to set a global limit that is honored among all > >> groups on the system. This ensures hard limits can be setup to prevent > >> user processes from consuming all event IDs on the system. > > > > sysctl is good to me, but would we really need the kernel parameter? > > The user_events starts using when user-space is up, so I think setting > > the limit with sysctl is enough. > > > > BTW, Vlastimil tried to add 'sysctl.*' kernel parameter support(*). If we > > need a kernel cmdline support, I think this is more generic way. But it > > seems the discussion has been stopped. > > It was actually merged in 5.8. So sysctl should be sufficient with that. > But maybe it's weird to start adding sysctls, when the rest of tracing > tunables is AFAIK under /sys/kernel/tracing/ ? > During the TraceFS meetings that Steven runs, I was asked to add both a boot parameter and a sysctl for user_events to limit the maximum number of events. To me, it seems that once user_events moves toward namespace awareness, a sysctl might be easier to use from within a namespace to turn knobs. I'm happy to change to whichever approach is preferred, but I want to see Steven and Masami agree on it before doing so. Steven, do you agree with Masami that we should move to just the sysctl? 
Thanks, -Beau > > > (*) https://patchwork.kernel.org/project/linux-mm/patch/20200427180433.7029-2-vbabka@xxxxxxx/ > > > > Thank you, > > > >> > >> Signed-off-by: Beau Belgrave <beaub@xxxxxxxxxxxxxxxxxxx> > >> --- > >> kernel/trace/trace_events_user.c | 59 ++++++++++++++++++++++++++++++++ > >> 1 file changed, 59 insertions(+) > >> > >> diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c > >> index 222f2eb59c7c..6a5ebe243999 100644 > >> --- a/kernel/trace/trace_events_user.c > >> +++ b/kernel/trace/trace_events_user.c > >> @@ -20,6 +20,7 @@ > >> #include <linux/types.h> > >> #include <linux/uaccess.h> > >> #include <linux/highmem.h> > >> +#include <linux/init.h> > >> #include <linux/user_events.h> > >> #include "trace.h" > >> #include "trace_dynevent.h" > >> @@ -61,6 +62,12 @@ struct user_event_group { > >> /* Group for init_user_ns mapping, top-most group */ > >> static struct user_event_group *init_group; > >> > >> +/* Max allowed events for the whole system */ > >> +static unsigned int max_user_events = 32768; > >> + > >> +/* Current number of events on the whole system */ > >> +static unsigned int current_user_events; > >> + > >> /* > >> * Stores per-event properties, as users register events > >> * within a file a user_event might be created if it does not > >> @@ -1241,6 +1248,8 @@ static int destroy_user_event(struct user_event *user) > >> { > >> int ret = 0; > >> > >> + lockdep_assert_held(&event_mutex); > >> + > >> /* Must destroy fields before call removal */ > >> user_event_destroy_fields(user); > >> > >> @@ -1257,6 +1266,11 @@ static int destroy_user_event(struct user_event *user) > >> kfree(EVENT_NAME(user)); > >> kfree(user); > >> > >> + if (current_user_events > 0) > >> + current_user_events--; > >> + else > >> + pr_alert("BUG: Bad current_user_events\n"); > >> + > >> return ret; > >> } > >> > >> @@ -1744,6 +1758,11 @@ static int user_event_parse(struct user_event_group *group, char *name, > >> > >> 
mutex_lock(&event_mutex); > >> > >> + if (current_user_events >= max_user_events) { > >> + ret = -EMFILE; > >> + goto put_user_lock; > >> + } > >> + > >> ret = user_event_trace_register(user); > >> > >> if (ret) > >> @@ -1755,6 +1774,7 @@ static int user_event_parse(struct user_event_group *group, char *name, > >> dyn_event_init(&user->devent, &user_event_dops); > >> dyn_event_add(&user->devent, &user->call); > >> hash_add(group->register_table, &user->node, key); > >> + current_user_events++; > >> > >> mutex_unlock(&event_mutex); > >> > >> @@ -2386,6 +2406,43 @@ static int create_user_tracefs(void) > >> return -ENODEV; > >> } > >> > >> +static int __init set_max_user_events(char *str) > >> +{ > >> + if (!str) > >> + return 0; > >> + > >> + if (kstrtouint(str, 0, &max_user_events)) > >> + return 0; > >> + > >> + return 1; > >> +} > >> +__setup("user_events_max=", set_max_user_events); > >> + > >> +static int set_max_user_events_sysctl(struct ctl_table *table, int write, > >> + void *buffer, size_t *lenp, loff_t *ppos) > >> +{ > >> + int ret; > >> + > >> + mutex_lock(&event_mutex); > >> + > >> + ret = proc_douintvec(table, write, buffer, lenp, ppos); > >> + > >> + mutex_unlock(&event_mutex); > >> + > >> + return ret; > >> +} > >> + > >> +static struct ctl_table user_event_sysctls[] = { > >> + { > >> + .procname = "user_events_max", > >> + .data = &max_user_events, > >> + .maxlen = sizeof(unsigned int), > >> + .mode = 0644, > >> + .proc_handler = set_max_user_events_sysctl, > >> + }, > >> + {} > >> +}; > >> + > >> static int __init trace_events_user_init(void) > >> { > >> int ret; > >> @@ -2415,6 +2472,8 @@ static int __init trace_events_user_init(void) > >> if (dyn_event_register(&user_event_dops)) > >> pr_warn("user_events could not register with dyn_events\n"); > >> > >> + register_sysctl_init("kernel", user_event_sysctls); > >> + > >> return 0; > >> } > >> > >> -- > >> 2.25.1 > >> > > > >