From 100eaa897b32405365ce13248c20fcbfd6e4a85d Mon Sep 17 00:00:00 2001 Message-Id: <100eaa897b32405365ce13248c20fcbfd6e4a85d.1528226387.git.Jim.Somerville@windriver.com> In-Reply-To: References: From: Thomas Gleixner Date: Tue, 26 May 2015 22:50:33 +0000 Subject: [PATCH 27/32] timer: Reduce timer migration overhead if disabled Eric reported that the timer_migration sysctl is not really nice performance-wise as it needs to check at every timer insertion whether the feature is enabled or not. Further, the check does not live in the timer code, so we have an extra function call which checks an extra cache line to figure out that it is disabled. We can do better and store that information in the per cpu (hr)timer bases. I pondered using a static key, but that's a nightmare to update from the nohz code and the timer base cache line is hot anyway when we select a timer base. The old logic enabled the timer migration unconditionally if CONFIG_NO_HZ was set even if nohz was disabled on the kernel command line. With this modification, we start off with migration disabled. The user-visible sysctl is still set to enabled. If the kernel switches to NOHZ, migration is enabled, provided the user did not disable it via the sysctl prior to the switch. If nohz=off is on the kernel command line, migration stays disabled no matter what. Before: 47.76% hog [.] main 14.84% [kernel] [k] _raw_spin_lock_irqsave 9.55% [kernel] [k] _raw_spin_unlock_irqrestore 6.71% [kernel] [k] mod_timer 6.24% [kernel] [k] lock_timer_base.isra.38 3.76% [kernel] [k] detach_if_pending 3.71% [kernel] [k] del_timer 2.50% [kernel] [k] internal_add_timer 1.51% [kernel] [k] get_nohz_timer_target 1.28% [kernel] [k] __internal_add_timer 0.78% [kernel] [k] timerfn 0.48% [kernel] [k] wake_up_nohz_cpu After: 48.10% hog [.] 
main 15.25% [kernel] [k] _raw_spin_lock_irqsave 9.76% [kernel] [k] _raw_spin_unlock_irqrestore 6.50% [kernel] [k] mod_timer 6.44% [kernel] [k] lock_timer_base.isra.38 3.87% [kernel] [k] detach_if_pending 3.80% [kernel] [k] del_timer 2.67% [kernel] [k] internal_add_timer 1.33% [kernel] [k] __internal_add_timer 0.73% [kernel] [k] timerfn 0.54% [kernel] [k] wake_up_nohz_cpu Reported-by: Eric Dumazet Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Paul McKenney Cc: Frederic Weisbecker Cc: Viresh Kumar Cc: John Stultz Cc: Joonwoo Park Cc: Wenbo Wang Link: http://lkml.kernel.org/r/20150526224512.127050787@linutronix.de Signed-off-by: Thomas Gleixner Signed-off-by: Alex Kozyrev Signed-off-by: Jim Somerville --- include/linux/hrtimer.h | 2 ++ include/linux/sched/sysctl.h | 12 -------- include/linux/timer.h | 8 +++++ kernel/hrtimer.c | 48 +++++++++++++++++------------- kernel/rcutree_plugin.h | 2 -- kernel/sched/core.c | 2 -- kernel/sysctl.c | 18 ++++++------ kernel/time/tick-internal.h | 14 +++++++++ kernel/time/tick-sched.c | 25 +++++++++------- kernel/time/timer_list.c | 3 +- kernel/timer.c | 70 ++++++++++++++++++++++++++++++++++++-------- 11 files changed, 133 insertions(+), 71 deletions(-) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index cd04b77..00d4c9b 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -175,6 +175,7 @@ enum hrtimer_base_type { * @clock_was_set: Sequence counter of clock was set events * Note that in RHEL7 clock_was_set is upstream's * clock_was_set_seq (KABI). 
+ * @migration_enabled: The migration of hrtimers to other cpus is enabled * @expires_next: absolute time of the next event which was scheduled * via clock_set_next_event() * @hres_active: State of high resolution mode @@ -189,6 +190,7 @@ struct hrtimer_cpu_base { raw_spinlock_t lock; unsigned int active_bases; unsigned int clock_was_set; /* clock_was_set_seq */ + bool migration_enabled; #ifdef CONFIG_HIGH_RES_TIMERS ktime_t expires_next; int hres_active; diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 4895484..02ab10e 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -54,24 +54,12 @@ extern unsigned int sysctl_numa_balancing_settle_count; extern unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_nr_migrate; extern unsigned int sysctl_sched_time_avg; -extern unsigned int sysctl_timer_migration; extern unsigned int sysctl_sched_shares_window; int sched_proc_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos); #endif -#ifdef CONFIG_SCHED_DEBUG -static inline unsigned int get_sysctl_timer_migration(void) -{ - return sysctl_timer_migration; -} -#else -static inline unsigned int get_sysctl_timer_migration(void) -{ - return 1; -} -#endif /* * control realtime throttling: diff --git a/include/linux/timer.h b/include/linux/timer.h index c37d9b9..8eb4558 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -249,6 +249,14 @@ extern void run_local_timers(void); struct hrtimer; extern enum hrtimer_restart it_real_fn(struct hrtimer *); +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +#include +extern unsigned int sysctl_timer_migration; +int timer_migration_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); +#endif + unsigned long __round_jiffies(unsigned long j, int cpu); unsigned long __round_jiffies_relative(unsigned long j, int cpu); unsigned long round_jiffies(unsigned 
long j); diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 40655c8..55444ab 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -168,19 +168,6 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, } } - -/* - * Get the preferred target CPU for NOHZ - */ -static int hrtimer_get_target(int this_cpu, int pinned) -{ -#ifdef CONFIG_NO_HZ_COMMON - if (!pinned && get_sysctl_timer_migration()) - return get_nohz_timer_target(); -#endif - return this_cpu; -} - /* * With HIGHRES=y we do not migrate the timer when it is expiring * before the next event on the target cpu because we cannot reprogram @@ -204,6 +191,24 @@ hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base) #endif } +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +static inline +struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, + int pinned) +{ + if (pinned || !base->migration_enabled) + return this_cpu_ptr(&hrtimer_bases); + return &per_cpu(hrtimer_bases, get_nohz_timer_target()); +} +#else +static inline +struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, + int pinned) +{ + return this_cpu_ptr(&hrtimer_bases); +} +#endif + /* * Switch the timer base to the current CPU when possible. 
*/ @@ -211,14 +216,13 @@ static inline struct hrtimer_clock_base * switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, int pinned) { + struct hrtimer_cpu_base *new_cpu_base, *this_base; struct hrtimer_clock_base *new_base; - struct hrtimer_cpu_base *new_cpu_base; - int this_cpu = smp_processor_id(); - int cpu = hrtimer_get_target(this_cpu, pinned); int basenum = base->index; + this_base = this_cpu_ptr(&hrtimer_bases); + new_cpu_base = get_target_base(this_base, pinned); again: - new_cpu_base = &per_cpu(hrtimer_bases, cpu); new_base = &new_cpu_base->clock_base[basenum]; if (base != new_base) { @@ -239,17 +243,19 @@ again: raw_spin_unlock(&base->cpu_base->lock); raw_spin_lock(&new_base->cpu_base->lock); - if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) { - cpu = this_cpu; + if (new_cpu_base != this_base && + hrtimer_check_target(timer, new_base)) { raw_spin_unlock(&new_base->cpu_base->lock); raw_spin_lock(&base->cpu_base->lock); + new_cpu_base = this_base; timer->base = base; goto again; } timer->base = new_base; } else { - if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) { - cpu = this_cpu; + if (new_cpu_base != this_base && + hrtimer_check_target(timer, new_base)) { + new_cpu_base = this_base; goto again; } } diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 8f410cc..7232ecb 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1502,8 +1502,6 @@ module_param(rcu_idle_gp_delay, int, 0644); static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY; module_param(rcu_idle_lazy_gp_delay, int, 0644); -extern int tick_nohz_active; - /* * Try to advance callbacks for all flavors of RCU on the current CPU. 
* Afterwards, if there are any callbacks ready for immediate invocation, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b2333b7..d82e745 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8824,8 +8824,6 @@ void __init sched_init_smp(void) } #endif /* CONFIG_SMP */ -const_debug unsigned int sysctl_timer_migration = 1; - int in_sched_functions(unsigned long addr) { return in_lock_functions(addr) || diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8516049..b435155 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -376,15 +376,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "timer_migration", - .data = &sysctl_timer_migration, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, - }, #ifdef CONFIG_SCHEDSTATS { .procname = "sched_schedstats", @@ -1200,6 +1191,15 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) + { + .procname = "timer_migration", + .data = &sysctl_timer_migration, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = timer_migration_handler, + }, +#endif { } }; diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index ecd2ff4..3ebdda4 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -165,3 +165,17 @@ extern void do_timer(unsigned long ticks); extern void update_wall_time(void); extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem); + +#ifdef CONFIG_NO_HZ_COMMON +extern unsigned long tick_nohz_active; +#else +#define tick_nohz_active (0) +#endif + +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +extern void timers_update_migration(void); +#else +static inline void timers_update_migration(void) { } +#endif + +DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); diff --git a/kernel/time/tick-sched.c 
b/kernel/time/tick-sched.c index 625c116..6c92920 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -412,7 +412,7 @@ void __init tick_nohz_init(void) /* * NO HZ enabled ? */ -int tick_nohz_active __read_mostly; +unsigned long tick_nohz_active __read_mostly; /* * Enable / Disable tickless mode */ @@ -973,6 +973,16 @@ static void tick_nohz_handler(struct clock_event_device *dev) tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); } +static inline void tick_nohz_activate(struct tick_sched *ts, int mode) +{ + if (!tick_nohz_enabled) + return; + ts->nohz_mode = mode; + /* One update is enough */ + if (!test_and_set_bit(0, &tick_nohz_active)) + timers_update_migration(); +} + /** * tick_nohz_switch_to_nohz - switch to nohz mode */ @@ -987,9 +997,6 @@ static void tick_nohz_switch_to_nohz(void) if (tick_switch_to_oneshot(tick_nohz_handler)) return; - tick_nohz_active = 1; - ts->nohz_mode = NOHZ_MODE_LOWRES; - /* * Recycle the hrtimer in ts, so we can share the * hrtimer_forward with the highres code. 
@@ -1001,6 +1008,7 @@ static void tick_nohz_switch_to_nohz(void) hrtimer_forward_now(&ts->sched_timer, tick_period); hrtimer_set_expires(&ts->sched_timer, next); tick_program_event(next, 1); + tick_nohz_activate(ts, NOHZ_MODE_LOWRES); } /* @@ -1052,6 +1060,7 @@ static inline void tick_check_nohz_this_cpu(void) static inline void tick_nohz_switch_to_nohz(void) { } static inline void tick_check_nohz_this_cpu(void) { } +static inline void tick_nohz_activate(struct tick_sched *ts, int mode) { } #endif /* CONFIG_NO_HZ_COMMON */ @@ -1137,13 +1146,7 @@ void tick_setup_sched_timer(void) hrtimer_forward(&ts->sched_timer, now, tick_period); hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED); - -#ifdef CONFIG_NO_HZ_COMMON - if (tick_nohz_enabled) { - ts->nohz_mode = NOHZ_MODE_HIGHRES; - tick_nohz_active = 1; - } -#endif + tick_nohz_activate(ts, NOHZ_MODE_HIGHRES); } #endif /* HIGH_RES_TIMERS */ diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 9174c0a..d7dd92a 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -20,6 +20,7 @@ #include +#include "tick-internal.h" struct timer_list_iter { int cpu; @@ -29,8 +30,6 @@ struct timer_list_iter { typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes); -DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); - /* * This allows printing both to /proc/timer_list and * to the console (on SysRq-Q): diff --git a/kernel/timer.c b/kernel/timer.c index dc85e24..4fcb630 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -49,6 +49,8 @@ #include #include +#include "time/tick-internal.h" + #define CREATE_TRACE_POINTS #include @@ -85,6 +87,7 @@ struct tvec_base { unsigned long next_timer; unsigned long active_timers; int cpu; + bool migration_enabled; struct tvec_root tv1; struct tvec tv2; struct tvec tv3; @@ -97,6 +100,58 @@ struct tvec_base boot_tvec_bases; EXPORT_SYMBOL(boot_tvec_bases); static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; +#if 
defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +unsigned int sysctl_timer_migration = 1; + +void timers_update_migration(void) +{ + bool on = sysctl_timer_migration && tick_nohz_active; + unsigned int cpu; + struct tvec_base *tvec_base = this_cpu_read(tvec_bases); + struct hrtimer_cpu_base *hrtimer_base = this_cpu_ptr(&hrtimer_bases); + + /* Avoid the loop, if nothing to update */ + if (tvec_base->migration_enabled == on) + return; + + for_each_possible_cpu(cpu) { + tvec_base = per_cpu(tvec_bases, cpu); + tvec_base->migration_enabled = on; + hrtimer_base = &per_cpu(hrtimer_bases, cpu); + hrtimer_base->migration_enabled = on; + } +} + +int timer_migration_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + static DEFINE_MUTEX(mutex); + int ret; + + mutex_lock(&mutex); + ret = proc_dointvec(table, write, buffer, lenp, ppos); + if (!ret && write) + timers_update_migration(); + mutex_unlock(&mutex); + return ret; +} + +static inline struct tvec_base *get_target_base(struct tvec_base *base, + int pinned) +{ + if (pinned || !base->migration_enabled) + return this_cpu_read(tvec_bases); + return per_cpu(tvec_bases, get_nohz_timer_target()); +} +#else +static inline struct tvec_base *get_target_base(struct tvec_base *base, + int pinned) +{ + return this_cpu_read(tvec_bases); +} +#endif + /* Functions below help us manage 'deferrable' flag */ static inline unsigned int tbase_get_deferrable(struct tvec_base *base) { @@ -793,11 +848,11 @@ static inline struct tvec_base *switch_timer_base(struct timer_list *timer, static inline int __mod_timer(struct timer_list *timer, unsigned long expires, - bool pending_only, int pinned) + bool pending_only, int pinned) { struct tvec_base *base, *new_base; unsigned long flags; - int ret = 0 , cpu; + int ret = 0; timer_stats_timer_set_start_info(timer); BUG_ON(!timer->function); @@ -810,16 +865,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, debug_activate(timer, 
expires); - preempt_disable_rt(); - cpu = smp_processor_id(); - -#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP) - if (!pinned && get_sysctl_timer_migration()) - cpu = get_nohz_timer_target(); -#endif - preempt_enable_rt(); - - new_base = per_cpu(tvec_bases, cpu); + new_base = get_target_base(base, pinned); if (base != new_base) { /* -- 1.8.3.1