Hi Paul, Some silly doubts. On Mon, Sep 10, 2007 at 11:39:01AM -0700, Paul E. McKenney wrote: > Work in progress, not for inclusion. > > RCU priority boosting is needed when running a workload that might include > CPU-bound user tasks running at realtime priorities with preemptible RCU. > In this situation, RCU priority boosting is needed to avoid OOM. > > Please note that because Classic RCU does not permit RCU read-side > critical sections to be preempted, there is no need to boost the priority > of Classic RCU readers. Boosting the priority of a running process > does not make it run any faster, at least not on any hardware that I am > aware of. ;-) > > Signed-off-by: Paul E. McKenney <paulmck@xxxxxxxxxxxxxxxxxx> > --- > > include/linux/init_task.h | 13 > include/linux/rcupdate.h | 17 + > include/linux/rcupreempt.h | 20 + > include/linux/sched.h | 24 + > init/main.c | 1 > kernel/Kconfig.preempt | 14 - > kernel/fork.c | 6 > kernel/rcupreempt.c | 608 ++++++++++++++++++++++++++++++++++++++++++--- > kernel/rtmutex.c | 7 > kernel/sched.c | 5 > lib/Kconfig.debug | 34 ++ > 11 files changed, 703 insertions(+), 46 deletions(-) > > diff -urpNa -X dontdiff linux-2.6.22-E-hotplug/include/linux/init_task.h linux-2.6.22-F-boostrcu/include/linux/init_task.h > --- linux-2.6.22-E-hotplug/include/linux/init_task.h 2007-07-08 16:32:17.000000000 -0700 > +++ linux-2.6.22-F-boostrcu/include/linux/init_task.h 2007-08-31 14:09:02.000000000 -0700 > @@ -87,6 +87,17 @@ extern struct nsproxy init_nsproxy; > .signalfd_list = LIST_HEAD_INIT(sighand.signalfd_list), \ > } > > +#ifdef CONFIG_PREEMPT_RCU_BOOST > +#define INIT_RCU_BOOST_PRIO .rcu_prio = MAX_PRIO, > +#define INIT_PREEMPT_RCU_BOOST(tsk) \ > + .rcub_rbdp = NULL, \ > + .rcub_state = RCU_BOOST_IDLE, \ > + .rcub_entry = LIST_HEAD_INIT(tsk.rcub_entry), > +#else /* #ifdef CONFIG_PREEMPT_RCU_BOOST */ > +#define INIT_RCU_BOOST_PRIO > +#define INIT_PREEMPT_RCU_BOOST(tsk) > +#endif /* #else #ifdef CONFIG_PREEMPT_RCU_BOOST */ > + > extern struct group_info init_groups; > > #define INIT_STRUCT_PID { \ > @@ -125,6 +136,7 @@ extern struct group_info init_groups; > .prio = MAX_PRIO-20, \ > .static_prio = MAX_PRIO-20, \ > .normal_prio = MAX_PRIO-20, \ > + INIT_RCU_BOOST_PRIO \ > .policy = SCHED_NORMAL, \ > .cpus_allowed = CPU_MASK_ALL, \ > .mm = NULL, \ > @@ -169,6 +181,7 @@ extern struct group_info init_groups; > }, \ > INIT_TRACE_IRQFLAGS \ > INIT_LOCKDEP \ > + INIT_PREEMPT_RCU_BOOST(tsk) \ > } > > > diff -urpNa -X dontdiff linux-2.6.22-E-hotplug/include/linux/rcupdate.h linux-2.6.22-F-boostrcu/include/linux/rcupdate.h > --- linux-2.6.22-E-hotplug/include/linux/rcupdate.h 2007-08-24 11:03:22.000000000 -0700 > +++ linux-2.6.22-F-boostrcu/include/linux/rcupdate.h 2007-08-24 17:04:14.000000000 -0700 > @@ -252,5 +252,22 @@ static inline void rcu_qsctr_inc(int cpu > per_cpu(rcu_data_passed_quiesc, cpu) = 1; > } > > +#ifdef CONFIG_PREEMPT_RCU_BOOST > +extern void init_rcu_boost_late(void); > +extern void __rcu_preempt_boost(void); > +#define rcu_preempt_boost() /* cpp to avoid #include hell. 
*/ \ > + do { \ > + if (unlikely(current->rcu_read_lock_nesting > 0)) \ > + __rcu_preempt_boost(); \ > + } while (0) > +#else /* #ifdef CONFIG_PREEMPT_RCU_BOOST */ > +static inline void init_rcu_boost_late(void) > +{ > +} > +static inline void rcu_preempt_boost(void) > +{ > +} > +#endif /* #else #ifdef CONFIG_PREEMPT_RCU_BOOST */ > + > #endif /* __KERNEL__ */ > #endif /* __LINUX_RCUPDATE_H */ > diff -urpNa -X dontdiff linux-2.6.22-E-hotplug/include/linux/rcupreempt.h linux-2.6.22-F-boostrcu/include/linux/rcupreempt.h > --- linux-2.6.22-E-hotplug/include/linux/rcupreempt.h 2007-08-24 11:20:32.000000000 -0700 > +++ linux-2.6.22-F-boostrcu/include/linux/rcupreempt.h 2007-08-24 11:24:59.000000000 -0700 > @@ -42,6 +42,26 @@ > #include <linux/cpumask.h> > #include <linux/seqlock.h> > > +#ifdef CONFIG_PREEMPT_RCU_BOOST > +/* > + * Task state with respect to being RCU-boosted. This state is changed > + * by the task itself in response to the following three events: ^^^ > + * 1. Preemption (or block on lock) while in RCU read-side critical section. I am wondering, can a task block on a lock while in RCU read-side critical section? > + * 2. Outermost rcu_read_unlock() for blocked RCU read-side critical section. > + * Event #3. is missing? The state change from RCU_BOOST_BLOCKED to RCU_BOOSTED is not done by the task itself, but by the rcu_boost_task. No? > + * The RCU-boost task also updates the state when boosting priority. > + */ > +enum rcu_boost_state { > + RCU_BOOST_IDLE = 0, /* Not yet blocked if in RCU read-side. */ > + RCU_BOOST_BLOCKED = 1, /* Blocked from RCU read-side. */ > + RCU_BOOSTED = 2, /* Boosting complete. */ > + RCU_BOOST_INVALID = 3, /* For bogus state sightings. */ > +}; > + > +#define N_RCU_BOOST_STATE (RCU_BOOST_INVALID + 1) > + > +#endif /* #ifdef CONFIG_PREEMPT_RCU_BOOST */ > + > #define call_rcu_bh(head, rcu) call_rcu(head, rcu) > #define rcu_bh_qsctr_inc(cpu) do { } while (0) > #define __rcu_read_lock_bh() { rcu_read_lock(); local_bh_disable(); } > diff -urpNa -X dontdiff linux-2.6.22-E-hotplug/include/linux/sched.h linux-2.6.22-F-boostrcu/include/linux/sched.h > --- linux-2.6.22-E-hotplug/include/linux/sched.h 2007-08-24 11:00:39.000000000 -0700 > +++ linux-2.6.22-F-boostrcu/include/linux/sched.h 2007-08-24 17:07:01.000000000 -0700 > @@ -546,6 +546,22 @@ struct signal_struct { > #define is_rt_policy(p) ((p) != SCHED_NORMAL && (p) != SCHED_BATCH) > #define has_rt_policy(p) unlikely(is_rt_policy((p)->policy)) > > +#ifdef CONFIG_PREEMPT_RCU_BOOST > +#define set_rcu_prio(p, prio) /* cpp to avoid #include hell */ \ > + do { \ > + (p)->rcu_prio = (prio); \ > + } while (0) > +#define get_rcu_prio(p) (p)->rcu_prio /* cpp to avoid #include hell */ > +#else /* #ifdef CONFIG_PREEMPT_RCU_BOOST */ > +static inline void set_rcu_prio(struct task_struct *p, int prio) > +{ > +} > +static inline int get_rcu_prio(struct task_struct *p) > +{ > + return MAX_PRIO; > +} > +#endif /* #else #ifdef CONFIG_PREEMPT_RCU_BOOST */ > + > /* > * Some day this will be a full-fledged user tracking system.. 
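
Coming back to the rcu_boost_state comment further up: to make sure I am reading the state machine right (and to spell out the third event, since the comment lists only two), here is a tiny userspace model of who I think performs each transition. Only the state names come from the patch; the event functions and main() are mine, so please correct me if this is not what the code intends:

#include <stdio.h>

/*
 * Userspace model of the rcu_boost_state transitions as I read the
 * patch.  The task itself does transitions 1 and 3; the booster
 * kthread does transition 2.
 */
enum rcu_boost_state {
	RCU_BOOST_IDLE = 0,	/* not blocked in a read-side section */
	RCU_BOOST_BLOCKED = 1,	/* preempted/blocked inside rcu_read_lock() */
	RCU_BOOSTED = 2,	/* booster has raised our priority */
};

static enum rcu_boost_state state = RCU_BOOST_IDLE;

/* Event 1: the *task* is preempted (or blocks) inside a read-side section. */
static void event_preempted_in_read_side(void)
{
	if (state == RCU_BOOST_IDLE)
		state = RCU_BOOST_BLOCKED;	/* __rcu_preempt_boost() */
}

/* Event 2: the *booster* kthread boosts a sufficiently old blocked task. */
static void event_booster_boosts(void)
{
	if (state == RCU_BOOST_BLOCKED)
		state = RCU_BOOSTED;		/* rcu_boost_one_reader_list() */
}

/* Event 3: the *task* runs its outermost rcu_read_unlock(). */
static void event_outermost_unlock(void)
{
	state = RCU_BOOST_IDLE;			/* __rcu_read_unlock_unboost() */
}

int main(void)
{
	event_preempted_in_read_side();
	event_booster_boosts();
	event_outermost_unlock();
	printf("final state: %d (expect 0 == RCU_BOOST_IDLE)\n", state);
	return 0;
}

If that is accurate, perhaps the comment could enumerate event #3 and note explicitly that the BLOCKED->BOOSTED step is done by the booster task rather than by the task itself.
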
> */ > @@ -834,6 +850,9 @@ struct task_struct { > #endif > int load_weight; /* for niceness load balancing purposes */ > int prio, static_prio, normal_prio; > +#ifdef CONFIG_PREEMPT_RCU_BOOST > + int rcu_prio; > +#endif /* #ifdef CONFIG_PREEMPT_RCU_BOOST */ > struct list_head run_list; > struct prio_array *array; > > @@ -858,6 +877,11 @@ struct task_struct { > #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) > struct sched_info sched_info; > #endif > +#ifdef CONFIG_PREEMPT_RCU_BOOST > + struct rcu_boost_dat *rcub_rbdp; > + enum rcu_boost_state rcub_state; > + struct list_head rcub_entry; > +#endif /* #ifdef CONFIG_PREEMPT_RCU_BOOST */ > > struct list_head tasks; > /* > diff -urpNa -X dontdiff linux-2.6.22-E-hotplug/init/main.c linux-2.6.22-F-boostrcu/init/main.c > --- linux-2.6.22-E-hotplug/init/main.c 2007-07-08 16:32:17.000000000 -0700 > +++ linux-2.6.22-F-boostrcu/init/main.c 2007-08-24 11:24:59.000000000 -0700 > @@ -722,6 +722,7 @@ static void __init do_basic_setup(void) > driver_init(); > init_irq_proc(); > do_initcalls(); > + init_rcu_boost_late(); > } > > static void __init do_pre_smp_initcalls(void) > diff -urpNa -X dontdiff linux-2.6.22-E-hotplug/kernel/fork.c linux-2.6.22-F-boostrcu/kernel/fork.c > --- linux-2.6.22-E-hotplug/kernel/fork.c 2007-08-24 11:00:39.000000000 -0700 > +++ linux-2.6.22-F-boostrcu/kernel/fork.c 2007-08-24 11:24:59.000000000 -0700 > @@ -1036,6 +1036,12 @@ static struct task_struct *copy_process( > p->rcu_read_lock_nesting = 0; > p->rcu_flipctr_idx = 0; > #endif /* #ifdef CONFIG_PREEMPT_RCU */ > +#ifdef CONFIG_PREEMPT_RCU_BOOST > + p->rcu_prio = MAX_PRIO; > + p->rcub_rbdp = NULL; > + p->rcub_state = RCU_BOOST_IDLE; > + INIT_LIST_HEAD(&p->rcub_entry); > +#endif /* #ifdef CONFIG_PREEMPT_RCU_BOOST */ > p->vfork_done = NULL; > spin_lock_init(&p->alloc_lock); > > diff -urpNa -X dontdiff linux-2.6.22-E-hotplug/kernel/Kconfig.preempt linux-2.6.22-F-boostrcu/kernel/Kconfig.preempt > --- linux-2.6.22-E-hotplug/kernel/Kconfig.preempt 2007-08-24 11:00:39.000000000 -0700 > +++ linux-2.6.22-F-boostrcu/kernel/Kconfig.preempt 2007-08-24 11:24:59.000000000 -0700 > @@ -91,13 +91,13 @@ config PREEMPT_RCU > > endchoice > > -config RCU_TRACE > - bool "Enable tracing for RCU - currently stats in debugfs" > - select DEBUG_FS > - default y > +config PREEMPT_RCU_BOOST > + bool "Enable priority boosting of RCU read-side critical sections" > + depends on PREEMPT_RCU > + default n > help > - This option provides tracing in RCU which presents stats > - in debugfs for debugging RCU implementation. > + This option permits priority boosting of RCU read-side critical > + sections that have been preempted in order to prevent indefinite > + delay of grace periods in face of runaway non-realtime processes. > > - Say Y here if you want to enable RCU tracing > Say N if you are unsure. > diff -urpNa -X dontdiff linux-2.6.22-E-hotplug/kernel/rcupreempt.c linux-2.6.22-F-boostrcu/kernel/rcupreempt.c > --- linux-2.6.22-E-hotplug/kernel/rcupreempt.c 2007-08-24 11:20:32.000000000 -0700 > +++ linux-2.6.22-F-boostrcu/kernel/rcupreempt.c 2007-08-31 14:06:49.000000000 -0700 > @@ -51,6 +51,15 @@ > #include <linux/byteorder/swabb.h> > #include <linux/cpumask.h> > #include <linux/rcupreempt_trace.h> > +#include <linux/kthread.h> > + > +/* > + * Macro that prevents the compiler from reordering accesses, but does > + * absolutely -nothing- to prevent CPUs from reordering. This is used > + * only to mediate communication between mainline code and hardware > + * interrupt and NMI handlers. 
> + */ > +#define ORDERED_WRT_IRQ(x) (*(volatile typeof(x) *)&(x)) > > /* > * PREEMPT_RCU data structures. > @@ -82,6 +91,531 @@ static struct rcu_ctrlblk rcu_ctrlblk = > }; > static DEFINE_PER_CPU(int [2], rcu_flipctr) = { 0, 0 }; > > +#ifndef CONFIG_PREEMPT_RCU_BOOST > +static inline void init_rcu_boost_early(void) { } > +static inline void rcu_read_unlock_unboost(void) { } > +#else /* #ifndef CONFIG_PREEMPT_RCU_BOOST */ > + > +/* Defines possible event indices for ->rbs_stats[] (first index). */ > + > +#define RCU_BOOST_DAT_BLOCK 0 > +#define RCU_BOOST_DAT_BOOST 1 > +#define RCU_BOOST_DAT_UNLOCK 2 > +#define N_RCU_BOOST_DAT_EVENTS 3 > + > +/* RCU-boost per-CPU array element. */ > + > +struct rcu_boost_dat { > + spinlock_t rbs_lock; /* Protects state/CPU slice of structures. */ > + struct list_head rbs_toboost; > + struct list_head rbs_boosted; > + unsigned long rbs_blocked; > + unsigned long rbs_boost_attempt; > + unsigned long rbs_boost; > + unsigned long rbs_unlock; > + unsigned long rbs_unboosted; > +#ifdef CONFIG_PREEMPT_RCU_BOOST_STATS > + unsigned long rbs_stats[N_RCU_BOOST_DAT_EVENTS][N_RCU_BOOST_STATE]; > +#endif /* #ifdef CONFIG_PREEMPT_RCU_BOOST_STATS */ > +}; > +#define RCU_BOOST_ELEMENTS 4 > + > +static int rcu_boost_idx = -1; /* invalid value for early RCU use. */ > +static DEFINE_PER_CPU(struct rcu_boost_dat, rcu_boost_dat[RCU_BOOST_ELEMENTS]); > +static struct task_struct *rcu_boost_task; > + > +#ifdef CONFIG_PREEMPT_RCU_BOOST_STATS > + > +/* > + * Function to increment indicated ->rbs_stats[] element. > + */ > +static inline void rcu_boost_dat_stat(struct rcu_boost_dat *rbdp, > + int event, > + enum rcu_boost_state oldstate) > +{ > + if (oldstate >= RCU_BOOST_IDLE && oldstate <= RCU_BOOSTED) { > + rbdp->rbs_stats[event][oldstate]++; > + } else { > + rbdp->rbs_stats[event][RCU_BOOST_INVALID]++; > + } > +} > + > +static inline void rcu_boost_dat_stat_block(struct rcu_boost_dat *rbdp, > + enum rcu_boost_state oldstate) > +{ > + rcu_boost_dat_stat(rbdp, RCU_BOOST_DAT_BLOCK, oldstate); > +} > + > +static inline void rcu_boost_dat_stat_boost(struct rcu_boost_dat *rbdp, > + enum rcu_boost_state oldstate) > +{ > + rcu_boost_dat_stat(rbdp, RCU_BOOST_DAT_BOOST, oldstate); > +} > + > +static inline void rcu_boost_dat_stat_unlock(struct rcu_boost_dat *rbdp, > + enum rcu_boost_state oldstate) > +{ > + rcu_boost_dat_stat(rbdp, RCU_BOOST_DAT_UNLOCK, oldstate); > +} > + > +/* > + * Prefix for kprint() strings for periodic statistics messages. > + */ > +static char *rcu_boost_state_event[] = { > + "block: ", > + "boost: ", > + "unlock: ", > +}; > + > +/* > + * Indicators for numbers in kprint() strings. "!" indicates a state-event > + * pair that should not happen, while "?" indicates a state that should > + * not happen. > + */ > +static char *rcu_boost_state_error[] = { > + /*ibBe*/ > + " ?", /* block */ > + "! ?", /* boost */ > + "? ?", /* unlock */ > +}; > + > +/* > + * Print out RCU booster task statistics at the specified interval. > + */ > +static void rcu_boost_dat_stat_print(void) > +{ > + /* Three decimal digits per byte plus spacing per number and line. */ > + char buf[N_RCU_BOOST_STATE * (sizeof(long) * 3 + 2) + 2]; > + int cpu; > + int event; > + int i; > + static time_t lastprint; /* static implies 0 initial value. */ > + struct rcu_boost_dat *rbdp; > + int state; > + struct rcu_boost_dat sum; > + > + /* Wait a graceful interval between printk spamming. */ > + /* Note: time_after() dislikes time_t. 
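
A small aside on ORDERED_WRT_IRQ() quoted above: if I read it right, it is the usual volatile-cast idiom that forces the compiler to emit the load/store exactly where it is written, which is all you need when the other party is an interrupt/NMI handler on the same CPU. A throwaway userspace illustration (the SIGALRM business and wait loop are mine, only the macro is from the patch):

#include <signal.h>
#include <stdio.h>
#include <unistd.h>

#define ORDERED_WRT_IRQ(x) (*(volatile typeof(x) *)&(x))

static int handler_done;

static void handler(int sig)
{
	ORDERED_WRT_IRQ(handler_done) = 1;	/* store is not optimised away */
}

int main(void)
{
	signal(SIGALRM, handler);
	alarm(1);
	while (!ORDERED_WRT_IRQ(handler_done))
		;	/* cast forces a fresh load of handler_done each pass */
	printf("handler ran and the loop saw the store\n");
	return 0;
}

Without the cast, an optimising compiler could hoist the load out of the loop and spin on a stale register copy; with it, the loop terminates once the "interrupt" fires.
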
*/ > + > + if (xtime.tv_sec - lastprint < CONFIG_PREEMPT_RCU_BOOST_STATS_INTERVAL) > + return; > + > + /* Sum up the state/event-independent counters. */ > + > + sum.rbs_blocked = 0; > + sum.rbs_boost_attempt = 0; > + sum.rbs_boost = 0; > + sum.rbs_unlock = 0; > + sum.rbs_unboosted = 0; > + for_each_possible_cpu(cpu) > + for (i = 0; i < RCU_BOOST_ELEMENTS; i++) { > + rbdp = per_cpu(rcu_boost_dat, cpu); > + sum.rbs_blocked += rbdp[i].rbs_blocked; > + sum.rbs_boost_attempt += rbdp[i].rbs_boost_attempt; > + sum.rbs_boost += rbdp[i].rbs_boost; > + sum.rbs_unlock += rbdp[i].rbs_unlock; > + sum.rbs_unboosted += rbdp[i].rbs_unboosted; > + } > + > + /* Sum up the state/event-dependent counters. */ > + > + for (event = 0; event < N_RCU_BOOST_DAT_EVENTS; event++) > + for (state = 0; state < N_RCU_BOOST_STATE; state++) { > + sum.rbs_stats[event][state] = 0; > + for_each_possible_cpu(cpu) { > + for (i = 0; i < RCU_BOOST_ELEMENTS; i++) > + sum.rbs_stats[event][state] > + += per_cpu(rcu_boost_dat, > + cpu)[i].rbs_stats[event][state]; > + } > + } > + > + /* Print them out! */ > + > + printk(KERN_INFO > + "rcu_boost_dat: idx=%d " > + "b=%lu ul=%lu ub=%lu boost: a=%lu b=%lu\n", > + rcu_boost_idx, > + sum.rbs_blocked, sum.rbs_unlock, sum.rbs_unboosted, > + sum.rbs_boost_attempt, sum.rbs_boost); > + for (event = 0; event < N_RCU_BOOST_DAT_EVENTS; event++) { > + i = 0; > + for (state = 0; state < N_RCU_BOOST_STATE; state++) > + i += sprintf(&buf[i], " %ld%c", > + sum.rbs_stats[event][state], > + rcu_boost_state_error[event][state]); > + printk(KERN_INFO "rcu_boost_dat %s %s\n", > + rcu_boost_state_event[event], buf); > + } > + > + /* Go away and don't come back for awhile. */ > + > + lastprint = xtime.tv_sec; > +} > + > +#else /* #ifdef CONFIG_PREEMPT_RCU_BOOST_STATS */ > + > +static inline void rcu_boost_dat_stat_block(struct rcu_boost_dat *rbdp, > + enum rcu_boost_state oldstate) > +{ > +} > +static inline void rcu_boost_dat_stat_boost(struct rcu_boost_dat *rbdp, > + enum rcu_boost_state oldstate) > +{ > +} > +static inline void rcu_boost_dat_stat_unlock(struct rcu_boost_dat *rbdp, > + enum rcu_boost_state oldstate) > +{ > +} > +static void rcu_boost_dat_stat_print(void) > +{ > +} > + > +#endif /* #else #ifdef CONFIG_PREEMPT_RCU_BOOST_STATS */ > + > +/* > + * Initialize RCU-boost state. This happens early in the boot process, > + * when the scheduler does not yet exist. So don't try to use it. > + */ > +static void init_rcu_boost_early(void) > +{ > + struct rcu_boost_dat *rbdp; > + int cpu; > + int i; > + > + for_each_possible_cpu(cpu) { > + rbdp = per_cpu(rcu_boost_dat, cpu); > + for (i = 0; i < RCU_BOOST_ELEMENTS; i++) { > + spin_lock_init(&rbdp[i].rbs_lock); > + INIT_LIST_HEAD(&rbdp[i].rbs_toboost); > + INIT_LIST_HEAD(&rbdp[i].rbs_boosted); > + rbdp[i].rbs_blocked = 0; > + rbdp[i].rbs_boost_attempt = 0; > + rbdp[i].rbs_boost = 0; > + rbdp[i].rbs_unlock = 0; > + rbdp[i].rbs_unboosted = 0; > +#ifdef CONFIG_PREEMPT_RCU_BOOST_STATS > + { > + int j, k; > + > + for (j = 0; j < N_RCU_BOOST_DAT_EVENTS; j++) > + for (k = 0; k < N_RCU_BOOST_STATE; k++) > + rbdp[i].rbs_stats[j][k] = 0; > + } > +#endif /* #ifdef CONFIG_PREEMPT_RCU_BOOST_STATS */ > + } > + smp_wmb(); /* Make sure readers see above initialization. */ > + rcu_boost_idx = 0; /* Allow readers to access data. */ > + } > +} > + > +/* > + * Return the list on which the calling task should add itself, or > + * NULL if too early during initialization. 
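
The rcu_boost_idx = -1 sentinel together with the smp_wmb() before setting it to 0 in init_rcu_boost_early() looks like the usual initialise-then-publish pattern. Just to confirm my understanding, a minimal userspace sketch -- all names here are mine, and __sync_synchronize() is only a stand-in for smp_wmb():

#include <stdio.h>

#define NR_ELEMS 4

struct boost_elem {
	int initialized;		/* stand-in for the real per-CPU fields */
};

static struct boost_elem elems[NR_ELEMS];
static int boost_idx = -1;		/* "invalid value for early use" */

static void publish(void)
{
	int i;

	for (i = 0; i < NR_ELEMS; i++)
		elems[i].initialized = 1;
	__sync_synchronize();		/* userspace stand-in for smp_wmb() */
	boost_idx = 0;			/* readers may now use elems[] */
}

static struct boost_elem *get_elem(void)
{
	int idx = boost_idx;

	if (idx < 0)
		return NULL;		/* too early, as in rcu_rbd_new() */
	return &elems[idx];
}

int main(void)
{
	printf("before publish: %p\n", (void *)get_elem());	/* NULL */
	publish();
	printf("after publish:  %p\n", (void *)get_elem());	/* &elems[0] */
	return 0;
}
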
> + */ > +static inline struct rcu_boost_dat *rcu_rbd_new(void) > +{ > + int cpu = smp_processor_id(); /* locks used, so preemption OK. */ > + int idx = ORDERED_WRT_IRQ(rcu_boost_idx); > + > + if (unlikely(idx < 0)) > + return NULL; > + return &per_cpu(rcu_boost_dat, cpu)[idx]; > +} > + > +/* > + * Return the list from which to boost target tasks. > + * May only be invoked by the booster task, so guaranteed to > + * already be initialized. Use rcu_boost_dat element least recently > + * the destination for task blocking in RCU read-side critical sections. > + */ > +static inline struct rcu_boost_dat *rcu_rbd_boosting(int cpu) > +{ > + int idx = (rcu_boost_idx + 1) & (RCU_BOOST_ELEMENTS - 1); > + > + return &per_cpu(rcu_boost_dat, cpu)[idx]; > +} > + > +#define PREEMPT_RCU_BOOSTER_PRIO 49 /* Match curr_irq_prio manually. */ > + /* Administrators can always adjust */ > + /* via the /proc interface. */ > + > +/* > + * Boost the specified task from an RCU viewpoint. > + * Boost the target task to a priority just a bit less-favored than > + * that of the RCU-boost task, but boost to a realtime priority even > + * if the RCU-boost task is running at a non-realtime priority. > + * We check the priority of the RCU-boost task each time we boost > + * in case the sysadm manually changes the priority. > + */ > +static void rcu_boost_prio(struct task_struct *taskp) > +{ > + unsigned long flags; > + int rcuprio; > + > + spin_lock_irqsave(¤t->pi_lock, flags); > + rcuprio = rt_mutex_getprio(current) + 1; > + if (rcuprio >= MAX_USER_RT_PRIO) > + rcuprio = MAX_USER_RT_PRIO - 1; > + spin_unlock_irqrestore(¤t->pi_lock, flags); > + spin_lock_irqsave(&taskp->pi_lock, flags); > + if (taskp->rcu_prio != rcuprio) { > + taskp->rcu_prio = rcuprio; > + if (taskp->rcu_prio < taskp->prio) > + rt_mutex_setprio(taskp, taskp->rcu_prio); > + } > + spin_unlock_irqrestore(&taskp->pi_lock, flags); > +} > + > +/* > + * Unboost the specified task from an RCU viewpoint. > + */ > +static void rcu_unboost_prio(struct task_struct *taskp) > +{ > + int nprio; > + unsigned long flags; > + > + spin_lock_irqsave(&taskp->pi_lock, flags); > + taskp->rcu_prio = MAX_PRIO; > + nprio = rt_mutex_getprio(taskp); > + if (nprio > taskp->prio) > + rt_mutex_setprio(taskp, nprio); > + spin_unlock_irqrestore(&taskp->pi_lock, flags); > +} > + > +/* > + * Boost all of the RCU-reader tasks on the specified list. > + */ > +static void rcu_boost_one_reader_list(struct rcu_boost_dat *rbdp) > +{ > + LIST_HEAD(list); > + unsigned long flags; > + struct task_struct *taskp; > + > + /* > + * Splice both lists onto a local list. We will still > + * need to hold the lock when manipulating the local list > + * because tasks can remove themselves at any time. > + * The reason for splicing the rbs_boosted list is that > + * our priority may have changed, so reboosting may be > + * required. > + */ > + > + spin_lock_irqsave(&rbdp->rbs_lock, flags); > + list_splice_init(&rbdp->rbs_toboost, &list); > + list_splice_init(&rbdp->rbs_boosted, &list); > + while (!list_empty(&list)) { > + > + /* > + * Pause for a bit before boosting each task. > + * @@@FIXME: reduce/eliminate pausing in case of OOM. > + */ > + > + spin_unlock_irqrestore(&rbdp->rbs_lock, flags); > + schedule_timeout_uninterruptible(1); > + spin_lock_irqsave(&rbdp->rbs_lock, flags); > + > + /* > + * All tasks might have removed themselves while > + * we were waiting. Recheck list emptiness. > + */ > + > + if (list_empty(&list)) > + break; > + > + /* Remove first task in local list, count the attempt. 
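
If I have the arithmetic right, with the booster at SCHED_RR priority 49 the numbers below fall out of rcu_boost_prio()/rcu_unboost_prio(): readers end up one notch less favoured than the booster, and the clamp keeps them inside the user RT range. This is purely my own back-of-the-envelope check; the constants and the rt-priority-to-kernel-priority mapping are as I remember them from 2.6.22, and the helper name is mine:

#include <stdio.h>

#define MAX_USER_RT_PRIO	100
#define MAX_RT_PRIO		MAX_USER_RT_PRIO
#define MAX_PRIO		(MAX_RT_PRIO + 40)

/* Kernel-internal priority of a SCHED_RR task with the given rtprio. */
static int rt_to_kernel_prio(int rtprio)
{
	return MAX_RT_PRIO - 1 - rtprio;
}

int main(void)
{
	int booster_prio = rt_to_kernel_prio(49);	/* PREEMPT_RCU_BOOSTER_PRIO */
	int rcuprio = booster_prio + 1;			/* as in rcu_boost_prio() */

	if (rcuprio >= MAX_USER_RT_PRIO)
		rcuprio = MAX_USER_RT_PRIO - 1;

	printf("booster runs at kernel prio %d\n", booster_prio);	/* 50 */
	printf("boosted readers get rcu_prio %d\n", rcuprio);		/* 51 */
	printf("unboosted rcu_prio stays at MAX_PRIO = %d\n", MAX_PRIO); /* 140 */
	return 0;
}
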
*/ > + > + taskp = list_entry(list.next, typeof(*taskp), rcub_entry); > + list_del_init(&taskp->rcub_entry); > + rbdp->rbs_boost_attempt++; > + > + /* Ignore tasks in unexpected states. */ > + > + if (taskp->rcub_state == RCU_BOOST_IDLE) { > + list_add_tail(&taskp->rcub_entry, &rbdp->rbs_toboost); > + rcu_boost_dat_stat_boost(rbdp, taskp->rcub_state); > + continue; > + } > + > + /* Boost the task's priority. */ > + > + rcu_boost_prio(taskp); > + rbdp->rbs_boost++; > + rcu_boost_dat_stat_boost(rbdp, taskp->rcub_state); > + taskp->rcub_state = RCU_BOOSTED; > + list_add_tail(&taskp->rcub_entry, &rbdp->rbs_boosted); > + } > + spin_unlock_irqrestore(&rbdp->rbs_lock, flags); > +} > + > +/* > + * Priority-boost tasks stuck in RCU read-side critical sections as > + * needed (presumably rarely). > + */ > +static int rcu_booster(void *arg) > +{ > + int cpu; > + struct sched_param sp = { .sched_priority = PREEMPT_RCU_BOOSTER_PRIO, }; > + > + sched_setscheduler(current, SCHED_RR, &sp); > + current->flags |= PF_NOFREEZE; > + > + do { > + > + /* Advance the lists of tasks. */ > + > + rcu_boost_idx = (rcu_boost_idx + 1) % RCU_BOOST_ELEMENTS; > + for_each_possible_cpu(cpu) { > + > + /* > + * Boost all sufficiently aged readers. > + * Readers must first be preempted or block > + * on a mutex in an RCU read-side critical section, > + * then remain in that critical section for > + * RCU_BOOST_ELEMENTS-1 time intervals. > + * So most of the time we should end up doing > + * nothing. > + */ > + > + rcu_boost_one_reader_list(rcu_rbd_boosting(cpu)); > + > + /* > + * Large SMP systems may need to sleep sometimes > + * in this loop. Or have multiple RCU-boost tasks. > + */ > + } > + > + /* > + * Sleep to allow any unstalled RCU read-side critical > + * sections to age out of the list. @@@ FIXME: reduce, > + * adjust, or eliminate in case of OOM. > + */ > + > + schedule_timeout_uninterruptible(HZ); > + > + /* Print stats if enough time has passed. */ > + > + rcu_boost_dat_stat_print(); > + > + } while (!kthread_should_stop()); > + > + return 0; > +} > + > +/* > + * Perform the portions of RCU-boost initialization that require the > + * scheduler to be up and running. > + */ > +void init_rcu_boost_late(void) > +{ > + > + /* Spawn RCU-boost task. */ > + > + printk(KERN_INFO "Starting RCU priority booster\n"); > + rcu_boost_task = kthread_run(rcu_booster, NULL, "RCU Prio Booster"); > + if (IS_ERR(rcu_boost_task)) > + panic("Unable to create RCU Priority Booster, errno %ld\n", > + -PTR_ERR(rcu_boost_task)); > +} > + > +/* > + * Update task's RCU-boost state to reflect blocking in RCU read-side > + * critical section, so that the RCU-boost task can find it in case it > + * later needs its priority boosted. > + */ > +void __rcu_preempt_boost(void) > +{ > + struct rcu_boost_dat *rbdp; > + unsigned long flags; > + > + /* Identify list to place task on for possible later boosting. */ > + > + local_irq_save(flags); > + rbdp = rcu_rbd_new(); > + if (rbdp == NULL) { > + local_irq_restore(flags); > + printk(KERN_INFO > + "Preempted RCU read-side critical section too early.\n"); > + return; > + } > + spin_lock(&rbdp->rbs_lock); > + rbdp->rbs_blocked++; > + > + /* > + * Update state. We hold the lock and aren't yet on the list, > + * so the booster cannot mess with us yet. > + */ > + > + rcu_boost_dat_stat_block(rbdp, current->rcub_state); > + if (current->rcub_state != RCU_BOOST_IDLE) { > + > + /* > + * We have been here before, so just update stats. 
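
So, as I understand the aging: a reader that blocks is queued on the element at the current index (rcu_rbd_new()), the booster advances the index once per pass and boosts the element one past it (rcu_rbd_boosting()), hence a blocked reader is boosted only after RCU_BOOST_ELEMENTS-1 == 3 passes, i.e. roughly three of the one-second sleeps. A toy simulation of just the index arithmetic (all names mine):

#include <stdio.h>

#define RCU_BOOST_ELEMENTS 4

int main(void)
{
	int idx = 0;			/* rcu_boost_idx after early init */
	int blocked_elem = idx;		/* element our blocked reader was queued on */
	int boosting;
	int pass;

	for (pass = 1; pass <= 2 * RCU_BOOST_ELEMENTS; pass++) {
		idx = (idx + 1) % RCU_BOOST_ELEMENTS;		/* advance, as in rcu_booster() */
		boosting = (idx + 1) % RCU_BOOST_ELEMENTS;	/* as in rcu_rbd_boosting() */
		if (boosting == blocked_elem) {
			printf("reader boosted on pass %d\n", pass);	/* prints 3 */
			break;
		}
	}
	return 0;
}
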
> + * It may seem strange to do all this work just to > + * accumulate statistics, but this is such a > + * low-probability code path that we shouldn't care. > + * If it becomes a problem, it can be fixed. > + */ > + > + spin_unlock_irqrestore(&rbdp->rbs_lock, flags); > + return; > + } > + current->rcub_state = RCU_BOOST_BLOCKED; > + > + /* Now add ourselves to the list so that the booster can find us. */ > + > + list_add_tail(¤t->rcub_entry, &rbdp->rbs_toboost); > + current->rcub_rbdp = rbdp; > + spin_unlock_irqrestore(&rbdp->rbs_lock, flags); > +} > + > +/* > + * Do the list-removal and priority-unboosting "heavy lifting" when > + * required. > + */ > +static void __rcu_read_unlock_unboost(void) > +{ > + unsigned long flags; > + struct rcu_boost_dat *rbdp; > + > + /* Identify the list structure and acquire the corresponding lock. */ > + > + rbdp = current->rcub_rbdp; > + spin_lock_irqsave(&rbdp->rbs_lock, flags); > + > + /* Remove task from the list it was on. */ > + > + list_del_init(¤t->rcub_entry); > + rbdp->rbs_unlock++; > + current->rcub_rbdp = NULL; > + > + /* Record stats, unboost if needed, and update state. */ > + > + rcu_boost_dat_stat_unlock(rbdp, current->rcub_state); > + if (current->rcub_state == RCU_BOOSTED) { > + rcu_unboost_prio(current); > + rbdp->rbs_unboosted++; > + } > + current->rcub_state = RCU_BOOST_IDLE; > + spin_unlock_irqrestore(&rbdp->rbs_lock, flags); > +} > + > +/* > + * Do any state changes and unboosting needed for rcu_read_unlock(). > + * Pass any complex work on to __rcu_read_unlock_unboost(). > + * The vast majority of the time, no work will be needed, as preemption > + * and blocking within RCU read-side critical sections is comparatively > + * rare. > + */ > +static inline void rcu_read_unlock_unboost(void) > +{ > + > + if (unlikely(current->rcub_state != RCU_BOOST_IDLE)) > + __rcu_read_unlock_unboost(); > +} > + > +#endif /* #else #ifndef CONFIG_PREEMPT_RCU_BOOST */ > + > /* > * States for rcu_try_flip() and friends. > */ > @@ -128,14 +662,6 @@ static DEFINE_PER_CPU(enum rcu_mb_flag_v > static cpumask_t rcu_cpu_online_map = CPU_MASK_NONE; > > /* > - * Macro that prevents the compiler from reordering accesses, but does > - * absolutely -nothing- to prevent CPUs from reordering. This is used > - * only to mediate communication between mainline code and hardware > - * interrupt and NMI handlers. > - */ > -#define ORDERED_WRT_IRQ(x) (*(volatile typeof(x) *)&(x)) > - > -/* > * RCU_DATA_ME: find the current CPU's rcu_data structure. > * RCU_DATA_CPU: find the specified CPU's rcu_data structure. > */ > @@ -194,7 +720,7 @@ void __rcu_read_lock(void) > me->rcu_read_lock_nesting = nesting + 1; > > } else { > - unsigned long oldirq; > + unsigned long flags; > > /* > * Disable local interrupts to prevent the grace-period > @@ -203,7 +729,7 @@ void __rcu_read_lock(void) > * contain rcu_read_lock(). > */ > > - local_irq_save(oldirq); > + local_irq_save(flags); > > /* > * Outermost nesting of rcu_read_lock(), so increment > @@ -233,7 +759,7 @@ void __rcu_read_lock(void) > */ > > ORDERED_WRT_IRQ(me->rcu_flipctr_idx) = idx; > - local_irq_restore(oldirq); > + local_irq_restore(flags); > } > } > EXPORT_SYMBOL_GPL(__rcu_read_lock); > @@ -255,7 +781,7 @@ void __rcu_read_unlock(void) > me->rcu_read_lock_nesting = nesting - 1; > > } else { > - unsigned long oldirq; > + unsigned long flags; > > /* > * Disable local interrupts to prevent the grace-period > @@ -264,7 +790,7 @@ void __rcu_read_unlock(void) > * contain rcu_read_lock() and rcu_read_unlock(). 
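
To restate the fast path as I understand it: rcu_read_unlock_unboost() above does only the cheap rcub_state compare and punts to __rcu_read_unlock_unboost() when needed, and (from the hunks further down) it is reached from the outermost __rcu_read_unlock() and from rcu_advance_callbacks_rt(), so a task that never blocks inside a read-side critical section pays essentially nothing. A minimal model -- the state values are from the patch, everything else is mine:

#include <stdio.h>

enum { RCU_BOOST_IDLE, RCU_BOOST_BLOCKED, RCU_BOOSTED };

static int nesting;
static int rcub_state = RCU_BOOST_IDLE;

/* Stands in for __rcu_read_unlock_unboost(): list removal, unboost, ... */
static void slow_unboost_path(void)
{
	printf("slow path taken\n");
	rcub_state = RCU_BOOST_IDLE;
}

static void model_read_lock(void)
{
	nesting++;
}

static void model_read_unlock(void)
{
	/* Only the outermost unlock even looks at rcub_state. */
	if (--nesting == 0 && rcub_state != RCU_BOOST_IDLE)
		slow_unboost_path();
}

int main(void)
{
	model_read_lock();
	model_read_lock();
	model_read_unlock();		/* inner unlock: nothing extra */
	rcub_state = RCU_BOOST_BLOCKED;	/* pretend we were preempted inside */
	model_read_unlock();		/* outermost unlock: slow path runs once */
	return 0;
}
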
> */ > > - local_irq_save(oldirq); > + local_irq_save(flags); > > /* > * Outermost nesting of rcu_read_unlock(), so we must > @@ -305,7 +831,10 @@ void __rcu_read_unlock(void) > */ > > ORDERED_WRT_IRQ(__get_cpu_var(rcu_flipctr)[idx])--; > - local_irq_restore(oldirq); > + > + rcu_read_unlock_unboost(); > + > + local_irq_restore(flags); > } > } > EXPORT_SYMBOL_GPL(__rcu_read_unlock); > @@ -504,10 +1033,10 @@ rcu_try_flip_waitmb(void) > */ > static void rcu_try_flip(void) > { > - unsigned long oldirq; > + unsigned long flags; > > RCU_TRACE_ME(rcupreempt_trace_try_flip_1); > - if (unlikely(!spin_trylock_irqsave(&rcu_ctrlblk.fliplock, oldirq))) { > + if (unlikely(!spin_trylock_irqsave(&rcu_ctrlblk.fliplock, flags))) { > RCU_TRACE_ME(rcupreempt_trace_try_flip_e1); > return; > } > @@ -534,7 +1063,7 @@ static void rcu_try_flip(void) > if (rcu_try_flip_waitmb()) > rcu_try_flip_state = rcu_try_flip_idle_state; > } > - spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, oldirq); > + spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); > } > > /* > @@ -553,16 +1082,16 @@ static void rcu_check_mb(int cpu) > > void rcu_check_callbacks_rt(int cpu, int user) > { > - unsigned long oldirq; > + unsigned long flags; > struct rcu_data *rdp = RCU_DATA_CPU(cpu); > > rcu_check_mb(cpu); > if (rcu_ctrlblk.completed == rdp->completed) > rcu_try_flip(); > - spin_lock_irqsave(&rdp->lock, oldirq); > + spin_lock_irqsave(&rdp->lock, flags); > RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp); > __rcu_advance_callbacks(rdp); > - spin_unlock_irqrestore(&rdp->lock, oldirq); > + spin_unlock_irqrestore(&rdp->lock, flags); > } > > /* > @@ -571,18 +1100,19 @@ void rcu_check_callbacks_rt(int cpu, int > */ > void rcu_advance_callbacks_rt(int cpu, int user) > { > - unsigned long oldirq; > + unsigned long flags; > struct rcu_data *rdp = RCU_DATA_CPU(cpu); > > if (rcu_ctrlblk.completed == rdp->completed) { > rcu_try_flip(); > if (rcu_ctrlblk.completed == rdp->completed) > return; > + rcu_read_unlock_unboost(); > } > - spin_lock_irqsave(&rdp->lock, oldirq); > + spin_lock_irqsave(&rdp->lock, flags); > RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp); > __rcu_advance_callbacks(rdp); > - spin_unlock_irqrestore(&rdp->lock, oldirq); > + spin_unlock_irqrestore(&rdp->lock, flags); > } > > #ifdef CONFIG_HOTPLUG_CPU > @@ -601,24 +1131,24 @@ void rcu_offline_cpu_rt(int cpu) > { > int i; > struct rcu_head *list = NULL; > - unsigned long oldirq; > + unsigned long flags; > struct rcu_data *rdp = RCU_DATA_CPU(cpu); > struct rcu_head **tail = &list; > > /* Remove all callbacks from the newly dead CPU, retaining order. */ > > - spin_lock_irqsave(&rdp->lock, oldirq); > + spin_lock_irqsave(&rdp->lock, flags); > rcu_offline_cpu_rt_enqueue(rdp->donelist, rdp->donetail, list, tail); > for (i = GP_STAGES - 1; i >= 0; i--) > rcu_offline_cpu_rt_enqueue(rdp->waitlist[i], rdp->waittail[i], > list, tail); > rcu_offline_cpu_rt_enqueue(rdp->nextlist, rdp->nexttail, list, tail); > - spin_unlock_irqrestore(&rdp->lock, oldirq); > + spin_unlock_irqrestore(&rdp->lock, flags); > rdp->waitlistcount = 0; > > /* Disengage the newly dead CPU from grace-period computation. */ > > - spin_lock_irqsave(&rcu_ctrlblk.fliplock, oldirq); > + spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags); > rcu_check_mb(cpu); > if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) { > smp_mb(); /* Subsequent counter accesses must see new value */ > @@ -627,7 +1157,7 @@ void rcu_offline_cpu_rt(int cpu) > /* seen -after- acknowledgement. 
*/ > } > cpu_clear(cpu, rcu_cpu_online_map); > - spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, oldirq); > + spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); > > /* > * Place the removed callbacks on the current CPU's queue. > @@ -640,20 +1170,20 @@ void rcu_offline_cpu_rt(int cpu) > */ > > rdp = RCU_DATA_ME(); > - spin_lock_irqsave(&rdp->lock, oldirq); > + spin_lock_irqsave(&rdp->lock, flags); > *rdp->nexttail = list; > if (list) > rdp->nexttail = tail; > - spin_unlock_irqrestore(&rdp->lock, oldirq); > + spin_unlock_irqrestore(&rdp->lock, flags); > } > > void __devinit rcu_online_cpu_rt(int cpu) > { > - unsigned long oldirq; > + unsigned long flags; > > - spin_lock_irqsave(&rcu_ctrlblk.fliplock, oldirq); > + spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags); > cpu_set(cpu, rcu_cpu_online_map); > - spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, oldirq); > + spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); > } > > #else /* #ifdef CONFIG_HOTPLUG_CPU */ > @@ -695,12 +1225,12 @@ void rcu_process_callbacks_rt(struct sof > void fastcall call_rcu_preempt(struct rcu_head *head, > void (*func)(struct rcu_head *rcu)) > { > - unsigned long oldirq; > + unsigned long flags; > struct rcu_data *rdp; > > head->func = func; > head->next = NULL; > - local_irq_save(oldirq); > + local_irq_save(flags); > rdp = RCU_DATA_ME(); > spin_lock(&rdp->lock); > __rcu_advance_callbacks(rdp); > @@ -708,7 +1238,7 @@ void fastcall call_rcu_preempt(struct rc > rdp->nexttail = &head->next; > RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp); > spin_unlock(&rdp->lock); > - local_irq_restore(oldirq); > + local_irq_restore(flags); > } > EXPORT_SYMBOL_GPL(call_rcu_preempt); > > @@ -757,6 +1287,11 @@ int rcu_pending_rt(int cpu) > return 0; > } > > +/* > + * Initialize RCU. This is called very early in boot, so is restricted > + * to very simple operations. Don't even think about messing with anything > + * that involves the scheduler, as it doesn't exist yet. 
> + */ > void __init rcu_init_rt(void) > { > int cpu; > @@ -778,6 +1313,7 @@ void __init rcu_init_rt(void) > rdp->donelist = NULL; > rdp->donetail = &rdp->donelist; > } > + init_rcu_boost_early(); > } > > /* > diff -urpNa -X dontdiff linux-2.6.22-E-hotplug/kernel/rtmutex.c linux-2.6.22-F-boostrcu/kernel/rtmutex.c > --- linux-2.6.22-E-hotplug/kernel/rtmutex.c 2007-07-08 16:32:17.000000000 -0700 > +++ linux-2.6.22-F-boostrcu/kernel/rtmutex.c 2007-08-24 11:24:59.000000000 -0700 > @@ -111,11 +111,12 @@ static inline void mark_rt_mutex_waiters > */ > int rt_mutex_getprio(struct task_struct *task) > { > + int prio = min(task->normal_prio, get_rcu_prio(task)); > + > if (likely(!task_has_pi_waiters(task))) > - return task->normal_prio; > + return prio; > > - return min(task_top_pi_waiter(task)->pi_list_entry.prio, > - task->normal_prio); > + return min(task_top_pi_waiter(task)->pi_list_entry.prio, prio); > } > > /* > diff -urpNa -X dontdiff linux-2.6.22-E-hotplug/kernel/sched.c linux-2.6.22-F-boostrcu/kernel/sched.c > --- linux-2.6.22-E-hotplug/kernel/sched.c 2007-07-08 16:32:17.000000000 -0700 > +++ linux-2.6.22-F-boostrcu/kernel/sched.c 2007-08-24 11:24:59.000000000 -0700 > @@ -1702,6 +1702,7 @@ void fastcall sched_fork(struct task_str > * Make sure we do not leak PI boosting priority to the child: > */ > p->prio = current->normal_prio; > + set_rcu_prio(p, MAX_PRIO); > > INIT_LIST_HEAD(&p->run_list); > p->array = NULL; > @@ -1784,6 +1785,7 @@ void fastcall wake_up_new_task(struct ta > else { > p->prio = current->prio; > p->normal_prio = current->normal_prio; > + set_rcu_prio(p, MAX_PRIO); > list_add_tail(&p->run_list, ¤t->run_list); > p->array = current->array; > p->array->nr_active++; > @@ -3590,6 +3592,8 @@ asmlinkage void __sched schedule(void) > } > profile_hit(SCHED_PROFILING, __builtin_return_address(0)); > > + rcu_preempt_boost(); > + > need_resched: > preempt_disable(); > prev = current; > @@ -5060,6 +5064,7 @@ void __cpuinit init_idle(struct task_str > idle->sleep_avg = 0; > idle->array = NULL; > idle->prio = idle->normal_prio = MAX_PRIO; > + set_rcu_prio(idle, MAX_PRIO); > idle->state = TASK_RUNNING; > idle->cpus_allowed = cpumask_of_cpu(cpu); > set_task_cpu(idle, cpu); > diff -urpNa -X dontdiff linux-2.6.22-E-hotplug/lib/Kconfig.debug linux-2.6.22-F-boostrcu/lib/Kconfig.debug > --- linux-2.6.22-E-hotplug/lib/Kconfig.debug 2007-07-08 16:32:17.000000000 -0700 > +++ linux-2.6.22-F-boostrcu/lib/Kconfig.debug 2007-08-24 11:24:59.000000000 -0700 > @@ -391,6 +391,40 @@ config RCU_TORTURE_TEST > Say M if you want the RCU torture tests to build as a module. > Say N if you are unsure. > > +config RCU_TRACE > + bool "Enable tracing for RCU - currently stats in debugfs" > + select DEBUG_FS > + depends on DEBUG_KERNEL > + default y > + help > + This option provides tracing in RCU which presents stats > + in debugfs for debugging RCU implementation. > + > + Say Y here if you want to enable RCU tracing > + Say N if you are unsure. > + > +config PREEMPT_RCU_BOOST_STATS > + bool "Enable RCU priority-boosting statistic printing" > + depends on PREEMPT_RCU_BOOST > + depends on DEBUG_KERNEL > + default n > + help > + This option enables debug printk()s of RCU boost statistics, > + which are normally only used to debug RCU priority boost > + implementations. > + > + Say N if you are unsure. 
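
The rt_mutex_getprio() change above is neat: rcu_prio simply becomes one more input to the min(), so an unboosted task (rcu_prio == MAX_PRIO) behaves exactly as before, while a boosted reader's rcu_prio dominates any non-RT normal_prio. A tiny standalone check of what that means in numbers -- constants as in 2.6.22, the helper names are mine, and I model only the no-PI-waiters case:

#include <stdio.h>

#define MAX_PRIO 140

static int min_int(int a, int b)
{
	return a < b ? a : b;
}

/* Model of rt_mutex_getprio() for a task with no PI waiters. */
static int effective_prio(int normal_prio, int rcu_prio)
{
	return min_int(normal_prio, rcu_prio);
}

int main(void)
{
	printf("nice-0 task, not boosted: %d\n",
	       effective_prio(120, MAX_PRIO));	/* 120, unchanged */
	printf("nice-0 task, RCU-boosted: %d\n",
	       effective_prio(120, 51));	/* 51, now in the RT range */
	return 0;
}
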
> +
> +config PREEMPT_RCU_BOOST_STATS_INTERVAL
> +	int "RCU priority-boosting statistic printing interval (seconds)"
> +	depends on PREEMPT_RCU_BOOST_STATS
> +	default 100
> +	range 10 86400
> +	help
> +	  This option controls the timing of debug printk()s of RCU boost
> +	  statistics, which are normally only used to debug RCU priority
> +	  boost implementations.
> +
>  config LKDTM
>  	tristate "Linux Kernel Dump Test Tool Module"
>  	depends on DEBUG_KERNEL

-- 
Gautham R Shenoy
Linux Technology Center
IBM India.
"Freedom comes with a price tag of responsibility, which is still a
bargain, because Freedom is priceless!"