ameliorate the load average accounting issues with dynticks (rhbz#650934)
parent 817eb060de
commit fc2adbbad5
@@ -756,6 +756,8 @@ Patch12417: tty-open-hangup-race-fixup.patch
 Patch12420: mm-page-allocator-adjust-the-per-cpu-counter-threshold-when-memory-is-low.patch
 Patch12421: mm-vmstat-use-a-single-setter-function-and-callback-for-adjusting-percpu-thresholds.patch
 
+Patch12430: sched-cure-more-NO_HZ-load-average-woes.patch
+
 %endif
 
 BuildRoot: %{_tmppath}/kernel-%{KVERREL}-root
@@ -1415,6 +1417,9 @@ ApplyPatch tty-open-hangup-race-fixup.patch
 ApplyPatch mm-page-allocator-adjust-the-per-cpu-counter-threshold-when-memory-is-low.patch
 ApplyPatch mm-vmstat-use-a-single-setter-function-and-callback-for-adjusting-percpu-thresholds.patch
 
+# rhbz#650934
+ApplyPatch sched-cure-more-NO_HZ-load-average-woes.patch
+
 # END OF PATCH APPLICATIONS
 
 %endif
@@ -2029,6 +2034,10 @@ fi
 #                 ||     ||
 
 %changelog
+* Wed Dec 08 2010 Kyle McMartin <kyle@redhat.com>
+- sched-cure-more-NO_HZ-load-average-woes.patch: fix some of the complaints
+  in 2.6.35+ about load average with dynticks. (rhbz#650934)
+
 * Sat Dec 04 2010 Kyle McMartin <kyle@redhat.com>
 - Enable C++ symbol demangling with perf by linking against libiberty.a,
   which is LGPL2.

sched-cure-more-NO_HZ-load-average-woes.patch (new file, 273 lines)

@@ -0,0 +1,273 @@
From bounces.tip@hpa.at.zytor.com Wed Dec  8 15:40:48 2010
From: tip-bot for Peter Zijlstra <a.p.zijlstra@chello.nl>
In-Reply-To: <1291129145.32004.874.camel@laptop>
References: <1291129145.32004.874.camel@laptop>
Subject: [tip:sched/urgent] sched: Cure more NO_HZ load average woes
Message-ID: <tip-0f004f5a696a9434b7214d0d3cbd0525ee77d428@git.kernel.org>
Git-Commit-ID: 0f004f5a696a9434b7214d0d3cbd0525ee77d428

Commit-ID:  0f004f5a696a9434b7214d0d3cbd0525ee77d428
Gitweb:     http://git.kernel.org/tip/0f004f5a696a9434b7214d0d3cbd0525ee77d428
Author:     Peter Zijlstra <a.p.zijlstra@chello.nl>
AuthorDate: Tue, 30 Nov 2010 19:48:45 +0100
Committer:  Ingo Molnar <mingo@elte.hu>
CommitDate: Wed, 8 Dec 2010 20:15:04 +0100

sched: Cure more NO_HZ load average woes

There's a long-running regression that proved difficult to fix, is
hitting certain people, and is rather annoying in its effects.

Damien reported that after 74f5187ac8 (sched: Cure load average vs
NO_HZ woes) his load average is unnaturally high; he also noted that
even with that patch reverted the load average numbers are not
correct.

The problem is that the previous patch only solved half the NO_HZ
problem: it addressed the part of going into NO_HZ mode, not of
coming out of NO_HZ mode. This patch implements that missing half.

When coming out of NO_HZ mode there are two important things to take
care of:

 - Folding the pending idle delta into the global active count.
 - Correctly aging the averages for the idle-duration.

So with this patch the NO_HZ interaction should be complete and
behaviour between CONFIG_NO_HZ=[yn] should be equivalent.

Furthermore, this patch slightly changes the load average computation
by adding a rounding term to the fixed point multiplication.
 | ||||
| Reported-by: Damien Wyart <damien.wyart@free.fr> | ||||
| Reported-by: Tim McGrath <tmhikaru@gmail.com> | ||||
| Tested-by: Damien Wyart <damien.wyart@free.fr> | ||||
| Tested-by: Orion Poplawski <orion@cora.nwra.com> | ||||
| Tested-by: Kyle McMartin <kyle@mcmartin.ca> | ||||
| Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> | ||||
| Cc: stable@kernel.org | ||||
| Cc: Chase Douglas <chase.douglas@canonical.com> | ||||
| LKML-Reference: <1291129145.32004.874.camel@laptop> | ||||
| Signed-off-by: Ingo Molnar <mingo@elte.hu> | ||||
| ---
 | ||||
|  include/linux/sched.h |    2 +- | ||||
|  kernel/sched.c        |  150 +++++++++++++++++++++++++++++++++++++++++++++---- | ||||
|  kernel/timer.c        |    2 +- | ||||
|  3 files changed, 141 insertions(+), 13 deletions(-) | ||||
| 
 | ||||
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2c79e92..2238745 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -143,7 +143,7 @@ extern unsigned long nr_iowait_cpu(int cpu);
 extern unsigned long this_cpu_load(void);
 
 
-extern void calc_global_load(void);
+extern void calc_global_load(unsigned long ticks);
 
 extern unsigned long get_parent_ip(unsigned long addr);
 
diff --git a/kernel/sched.c b/kernel/sched.c
index dc91a4d..6b7c26a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3119,6 +3119,15 @@ static long calc_load_fold_active(struct rq *this_rq)
 	return delta;
 }
 
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
+{
+	load *= exp;
+	load += active * (FIXED_1 - exp);
+	load += 1UL << (FSHIFT - 1);
+	return load >> FSHIFT;
+}
+
 #ifdef CONFIG_NO_HZ
 /*
  * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
@@ -3148,6 +3157,128 @@ static long calc_load_fold_idle(void)
 
 	return delta;
 }
+
+/**
+ * fixed_power_int - compute: x^n, in O(log n) time
+ *
+ * @x:         base of the power
+ * @frac_bits: fractional bits of @x
+ * @n:         power to raise @x to.
+ *
+ * By exploiting the relation between the definition of the natural power
+ * function: x^n := x*x*...*x (x multiplied by itself for n times), and
+ * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
+ * (where: n_i \elem {0, 1}, the binary vector representing n),
+ * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
+ * of course trivially computable in O(log_2 n), the length of our binary
+ * vector.
+ */
+static unsigned long
+fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
+{
+	unsigned long result = 1UL << frac_bits;
+
+	if (n) for (;;) {
+		if (n & 1) {
+			result *= x;
+			result += 1UL << (frac_bits - 1);
+			result >>= frac_bits;
+		}
+		n >>= 1;
+		if (!n)
+			break;
+		x *= x;
+		x += 1UL << (frac_bits - 1);
+		x >>= frac_bits;
+	}
+
+	return result;
+}
+
+/*
+ * a1 = a0 * e + a * (1 - e)
+ *
+ * a2 = a1 * e + a * (1 - e)
+ *    = (a0 * e + a * (1 - e)) * e + a * (1 - e)
+ *    = a0 * e^2 + a * (1 - e) * (1 + e)
+ *
+ * a3 = a2 * e + a * (1 - e)
+ *    = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
+ *    = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
+ *
+ *  ...
+ *
+ * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
+ *    = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
+ *    = a0 * e^n + a * (1 - e^n)
+ *
+ * [1] application of the geometric series:
+ *
+ *              n         1 - x^(n+1)
+ *     S_n := \Sum x^i = -------------
+ *             i=0          1 - x
+ */
+static unsigned long
+calc_load_n(unsigned long load, unsigned long exp,
+	    unsigned long active, unsigned int n)
+{
+
+	return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
+}
+
+/*
+ * NO_HZ can leave us missing all per-cpu ticks calling
+ * calc_load_account_active(), but since an idle CPU folds its delta into
+ * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
+ * in the pending idle delta if our idle period crossed a load cycle boundary.
+ *
+ * Once we've updated the global active value, we need to apply the exponential
+ * weights adjusted to the number of cycles missed.
+ */
+static void calc_global_nohz(unsigned long ticks)
+{
+	long delta, active, n;
+
+	if (time_before(jiffies, calc_load_update))
+		return;
+
+	/*
+	 * If we crossed a calc_load_update boundary, make sure to fold
+	 * any pending idle changes, the respective CPUs might have
+	 * missed the tick driven calc_load_account_active() update
+	 * due to NO_HZ.
+	 */
+	delta = calc_load_fold_idle();
+	if (delta)
+		atomic_long_add(delta, &calc_load_tasks);
+
+	/*
+	 * If we were idle for multiple load cycles, apply them.
+	 */
+	if (ticks >= LOAD_FREQ) {
+		n = ticks / LOAD_FREQ;
+
+		active = atomic_long_read(&calc_load_tasks);
+		active = active > 0 ? active * FIXED_1 : 0;
+
+		avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
+		avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
+		avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
+
+		calc_load_update += n * LOAD_FREQ;
+	}
+
+	/*
+	 * It's possible the remainder of the above division also crosses
+	 * a LOAD_FREQ period, the regular check in calc_global_load()
+	 * which comes after this will take care of that.
+	 *
+	 * Consider us being 11 ticks before a cycle completion, and us
+	 * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will
+	 * age us 4 cycles, and the test in calc_global_load() will
+	 * pick up the final one.
+	 */
+}
 #else
 static void calc_load_account_idle(struct rq *this_rq)
 {
@@ -3157,6 +3288,10 @@ static inline long calc_load_fold_idle(void)
 {
 	return 0;
 }
+
+static void calc_global_nohz(unsigned long ticks)
+{
+}
 #endif
 
 /**
@@ -3174,24 +3309,17 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
 	loads[2] = (avenrun[2] + offset) << shift;
 }
 
-static unsigned long
-calc_load(unsigned long load, unsigned long exp, unsigned long active)
-{
-	load *= exp;
-	load += active * (FIXED_1 - exp);
-	return load >> FSHIFT;
-}
-
 /*
  * calc_load - update the avenrun load estimates 10 ticks after the
  * CPUs have updated calc_load_tasks.
  */
-void calc_global_load(void)
+void calc_global_load(unsigned long ticks)
 {
-	unsigned long upd = calc_load_update + 10;
 	long active;
 
-	if (time_before(jiffies, upd))
+	calc_global_nohz(ticks);
+
+	if (time_before(jiffies, calc_load_update + 10))
 		return;
 
 	active = atomic_long_read(&calc_load_tasks);
diff --git a/kernel/timer.c b/kernel/timer.c
index 68a9ae7..7bd715f 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1319,7 +1319,7 @@ void do_timer(unsigned long ticks)
 {
	jiffies_64 += ticks;
	update_wall_time();
-	calc_global_load();
+	calc_global_load(ticks);
 }
 
 #ifdef __ARCH_WANT_SYS_ALARM
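
For illustration, a self-contained userspace sketch (not part of the patch;
the constants mirror include/linux/sched.h and the 12-cycle idle period is
an arbitrary example) that replays the n-cycle aging calc_global_nohz()
performs after a long idle:

#include <stdio.h>

#define FSHIFT	11
#define FIXED_1	(1UL << FSHIFT)
#define EXP_1	1884	/* 1/exp(5sec/1min) in fixed point */
#define EXP_5	2014	/* 1/exp(5sec/5min) */
#define EXP_15	2037	/* 1/exp(5sec/15min) */

/* The rounded fixed-point update added by the patch. */
static unsigned long
calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	load += 1UL << (FSHIFT - 1);
	return load >> FSHIFT;
}

/* x^n in fixed point via binary exponentiation: O(log n) multiplies. */
static unsigned long
fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
{
	unsigned long result = 1UL << frac_bits;

	if (n) for (;;) {
		if (n & 1) {
			result *= x;
			result += 1UL << (frac_bits - 1);
			result >>= frac_bits;
		}
		n >>= 1;
		if (!n)
			break;
		x *= x;
		x += 1UL << (frac_bits - 1);
		x >>= frac_bits;
	}
	return result;
}

/* an = a0 * e^n + a * (1 - e^n), per the geometric-series argument above. */
static unsigned long
calc_load_n(unsigned long load, unsigned long exp,
	    unsigned long active, unsigned int n)
{
	return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
}

int main(void)
{
	/* A 1-minute average of 2.00 aged across 12 idle load cycles. */
	unsigned long avn = 2 * FIXED_1;
	unsigned long aged = calc_load_n(avn, EXP_1, 0, 12);

	/* prints "avenrun[0]: 2.00 -> 0.73" */
	printf("avenrun[0]: %lu.%02lu -> %lu.%02lu\n",
	       avn >> FSHIFT, ((avn & (FIXED_1 - 1)) * 100) >> FSHIFT,
	       aged >> FSHIFT, ((aged & (FIXED_1 - 1)) * 100) >> FSHIFT);
	return 0;
}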