kswapd fixes from mmotm

parent 1c8b1fa25a
commit fe72140793
kernel.spec (14 lines changed)
@@ -51,7 +51,7 @@ Summary: The Linux kernel
 # For non-released -rc kernels, this will be prepended with "0.", so
 # for example a 3 here will become 0.3
 #
-%global baserelease 10
+%global baserelease 11
 %global fedora_build %{baserelease}
 
 # base_sublevel is the kernel version we're starting with and patching
@@ -753,6 +753,9 @@ Patch12415: tty-dont-allow-reopen-when-ldisc-is-changing.patch
 Patch12416: tty-ldisc-fix-open-flag-handling.patch
 Patch12417: tty-open-hangup-race-fixup.patch
 
+Patch12420: mm-page-allocator-adjust-the-per-cpu-counter-threshold-when-memory-is-low.patch
+Patch12421: mm-vmstat-use-a-single-setter-function-and-callback-for-adjusting-percpu-thresholds.patch
+
 %endif
 
 BuildRoot: %{_tmppath}/kernel-%{KVERREL}-root
@@ -1408,6 +1411,10 @@ ApplyPatch tty-dont-allow-reopen-when-ldisc-is-changing.patch
 ApplyPatch tty-ldisc-fix-open-flag-handling.patch
 ApplyPatch tty-open-hangup-race-fixup.patch
 
+# backport some fixes for kswapd from mmotm, rhbz#649694
+ApplyPatch mm-page-allocator-adjust-the-per-cpu-counter-threshold-when-memory-is-low.patch
+ApplyPatch mm-vmstat-use-a-single-setter-function-and-callback-for-adjusting-percpu-thresholds.patch
+
 # END OF PATCH APPLICATIONS
 
 %endif
@@ -2021,6 +2028,11 @@ fi
 # || ||
 
 %changelog
+* Thu Dec 02 2010 Kyle McMartin <kyle@redhat.com> 2.6.36.1-11
+- Grab some of Mel's fixes from -mmotm to hopefully sort out #649694.
+  They've been tested by a few on that bug on 2.6.35, but let's push
+  it out to a bigger audience.
+
 * Mon Nov 29 2010 Kyle McMartin <kyle@redhat.com>
 - PNP: log PNP resources, as we do for PCI [c1f3f281]
   should help us debug resource conflicts (requested by bjorn.)
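
For context on rhbz#649694, which the two patches below address: per-cpu vmstat deltas are folded into the global counters only when a CPU's local delta crosses stat_threshold, so a global reading such as NR_FREE_PAGES can be stale by up to num_online_cpus() * threshold pages - enough, on a large machine, for the min watermark to be breached while the estimate still looks fine. A minimal user-space sketch of that worst case (the numbers are illustrative; nothing here is kernel API):

#include <stdio.h>

/* Worst-case vmstat drift: every CPU holds a delta just under its
 * stat_threshold that has not yet been folded into the global counter. */
int main(void)
{
	int online_cpus = 64;     /* a large NUMA machine */
	int stat_threshold = 125; /* the maximum per-cpu threshold */

	long max_drift = (long)online_cpus * stat_threshold;
	long estimated_free = 10000; /* what the cheap global read reports */
	long min_watermark = 4000;

	printf("estimate %ld vs min watermark %ld: looks safe\n",
	       estimated_free, min_watermark);
	/* The true value may be lower by the whole drift, so the watermark
	 * can already be breached even though the check above passes. */
	printf("true free pages may be as low as %ld (drift up to %ld)\n",
	       estimated_free - max_drift, max_drift);
	return 0;
}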
mm-page-allocator-adjust-the-per-cpu-counter-threshold-when-memory-is-low.patch (new file)
@@ -0,0 +1,389 @@
From df43fae25437d7bc7dfff72599c1e825038b67cf Mon Sep 17 00:00:00 2001
From: Mel Gorman <mel@csn.ul.ie>
Date: Wed, 24 Nov 2010 22:18:23 -0500
Subject: [PATCH 1/2] mm: page allocator: Adjust the per-cpu counter threshold when memory is low

Commit aa45484 ("calculate a better estimate of NR_FREE_PAGES when memory
is low") noted that watermarks were based on the vmstat NR_FREE_PAGES. To
avoid synchronization overhead, these counters are maintained on a per-cpu
basis and drained both periodically and when the per-cpu delta rises above
a threshold. On large CPU systems, the difference between the estimated
and real value of NR_FREE_PAGES can be very high. The system can get into
a state where pages are allocated far below the min watermark, potentially
causing livelock issues. The commit solved the problem by taking a better
reading of NR_FREE_PAGES when memory was low.

Unfortunately, as reported by Shaohua Li, this accurate reading can consume
a large amount of CPU time on systems with many sockets due to cache line
bouncing. This patch takes a different approach. For large machines
where counter drift might be unsafe and while kswapd is awake, the per-cpu
thresholds for the target pgdat are reduced to limit the level of drift to
what should be a safe level. This incurs a performance penalty under heavy
memory pressure by a factor that depends on the workload and the machine,
but the machine should function correctly without accidentally exhausting
all memory on a node. There is an additional cost when kswapd wakes and
sleeps, but the event is not expected to be frequent - in Shaohua's test
case, only one sleep and wake event was recorded.

To ensure that kswapd wakes up, a safe version of zone_watermark_ok() is
introduced that takes a more accurate reading of NR_FREE_PAGES when called
from wakeup_kswapd, when deciding whether it is really safe to go back to
sleep in sleeping_prematurely() and when deciding if a zone is really
balanced or not in balance_pgdat(). We are still using an expensive
function but limiting how often it is called.

When the test case is reproduced, the time spent in the watermark functions
is reduced. The following reports the cumulative percentage of time spent
in the functions zone_nr_free_pages(), zone_watermark_ok(),
__zone_watermark_ok(), zone_watermark_ok_safe(),
zone_page_state_snapshot() and zone_page_state():

vanilla            11.6615%
disable-threshold   0.2584%

Reported-by: Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Reviewed-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
[http://userweb.kernel.org/~akpm/mmotm/broken-out/mm-page-allocator-adjust-the-per-cpu-counter-threshold-when-memory-is-low.patch]
---
 include/linux/mmzone.h |   10 ++-----
 include/linux/vmstat.h |    5 +++
 mm/mmzone.c            |   21 ---------------
 mm/page_alloc.c        |   35 +++++++++++++++++++-----
 mm/vmscan.c            |   23 +++++++-------
 mm/vmstat.c            |   68 +++++++++++++++++++++++++++++++++++++++++++++++-
 6 files changed, 115 insertions(+), 47 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 3984c4e..8d789d7 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -448,12 +448,6 @@ static inline int zone_is_oom_locked(const struct zone *zone)
 	return test_bit(ZONE_OOM_LOCKED, &zone->flags);
 }
 
-#ifdef CONFIG_SMP
-unsigned long zone_nr_free_pages(struct zone *zone);
-#else
-#define zone_nr_free_pages(zone) zone_page_state(zone, NR_FREE_PAGES)
-#endif /* CONFIG_SMP */
-
 /*
  * The "priority" of VM scanning is how much of the queues we will scan in one
  * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
@@ -651,7 +645,9 @@ typedef struct pglist_data {
 extern struct mutex zonelists_mutex;
 void build_all_zonelists(void *data);
 void wakeup_kswapd(struct zone *zone, int order);
-int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+		int classzone_idx, int alloc_flags);
+bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
 		int classzone_idx, int alloc_flags);
 enum memmap_context {
 	MEMMAP_EARLY,
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index eaaea37..e4cc21c 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -254,6 +254,8 @@ extern void dec_zone_state(struct zone *, enum zone_stat_item);
 extern void __dec_zone_state(struct zone *, enum zone_stat_item);
 
 void refresh_cpu_vm_stats(int);
+void reduce_pgdat_percpu_threshold(pg_data_t *pgdat);
+void restore_pgdat_percpu_threshold(pg_data_t *pgdat);
 #else /* CONFIG_SMP */
 
 /*
@@ -298,6 +300,9 @@ static inline void __dec_zone_page_state(struct page *page,
 #define dec_zone_page_state __dec_zone_page_state
 #define mod_zone_page_state __mod_zone_page_state
 
+static inline void reduce_pgdat_percpu_threshold(pg_data_t *pgdat) { }
+static inline void restore_pgdat_percpu_threshold(pg_data_t *pgdat) { }
+
 static inline void refresh_cpu_vm_stats(int cpu) { }
 #endif
 
diff --git a/mm/mmzone.c b/mm/mmzone.c
index e35bfb8..f5b7d17 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -87,24 +87,3 @@ int memmap_valid_within(unsigned long pfn,
 	return 1;
 }
 #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
-
-#ifdef CONFIG_SMP
-/* Called when a more accurate view of NR_FREE_PAGES is needed */
-unsigned long zone_nr_free_pages(struct zone *zone)
-{
-	unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES);
-
-	/*
-	 * While kswapd is awake, it is considered the zone is under some
-	 * memory pressure. Under pressure, there is a risk that
-	 * per-cpu-counter-drift will allow the min watermark to be breached
-	 * potentially causing a live-lock. While kswapd is awake and
-	 * free pages are low, get a better estimate for free pages
-	 */
-	if (nr_free_pages < zone->percpu_drift_mark &&
-			!waitqueue_active(&zone->zone_pgdat->kswapd_wait))
-		return zone_page_state_snapshot(zone, NR_FREE_PAGES);
-
-	return nr_free_pages;
-}
-#endif /* CONFIG_SMP */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f12ad18..0286150 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1454,24 +1454,24 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
 #endif /* CONFIG_FAIL_PAGE_ALLOC */
 
 /*
- * Return 1 if free pages are above 'mark'. This takes into account the order
+ * Return true if free pages are above 'mark'. This takes into account the order
  * of the allocation.
  */
-int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
-		      int classzone_idx, int alloc_flags)
+static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+		      int classzone_idx, int alloc_flags, long free_pages)
 {
 	/* free_pages my go negative - that's OK */
 	long min = mark;
-	long free_pages = zone_nr_free_pages(z) - (1 << order) + 1;
 	int o;
 
+	free_pages -= (1 << order) + 1;
 	if (alloc_flags & ALLOC_HIGH)
 		min -= min / 2;
 	if (alloc_flags & ALLOC_HARDER)
 		min -= min / 4;
 
 	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
-		return 0;
+		return false;
 	for (o = 0; o < order; o++) {
 		/* At the next order, this order's pages become unavailable */
 		free_pages -= z->free_area[o].nr_free << o;
@@ -1480,9 +1480,28 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 		min >>= 1;
 
 		if (free_pages <= min)
-			return 0;
+			return false;
 	}
-	return 1;
+	return true;
+}
+
+bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+		      int classzone_idx, int alloc_flags)
+{
+	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+					zone_page_state(z, NR_FREE_PAGES));
+}
+
+bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
+		      int classzone_idx, int alloc_flags)
+{
+	long free_pages = zone_page_state(z, NR_FREE_PAGES);
+
+	if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
+		free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
+
+	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+								free_pages);
 }
 
 #ifdef CONFIG_NUMA
@@ -2436,7 +2455,7 @@ void show_free_areas(void)
 			" all_unreclaimable? %s"
 			"\n",
 			zone->name,
-			K(zone_nr_free_pages(zone)),
+			K(zone_page_state(zone, NR_FREE_PAGES)),
 			K(min_wmark_pages(zone)),
 			K(low_wmark_pages(zone)),
 			K(high_wmark_pages(zone)),
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c5dfabf..3e71cb1 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2082,7 +2082,7 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
 		if (zone->all_unreclaimable)
 			continue;
 
-		if (!zone_watermark_ok(zone, order, high_wmark_pages(zone),
+		if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
 								0, 0))
 			return 1;
 	}
@@ -2169,7 +2169,7 @@ loop_again:
 				shrink_active_list(SWAP_CLUSTER_MAX, zone,
 							&sc, priority, 0);
 
-			if (!zone_watermark_ok(zone, order,
+			if (!zone_watermark_ok_safe(zone, order,
 					high_wmark_pages(zone), 0, 0)) {
 				end_zone = i;
 				break;
@@ -2215,7 +2215,7 @@ loop_again:
 			 * We put equal pressure on every zone, unless one
 			 * zone has way too many pages free already.
 			 */
-			if (!zone_watermark_ok(zone, order,
+			if (!zone_watermark_ok_safe(zone, order,
 					8*high_wmark_pages(zone), end_zone, 0))
 				shrink_zone(priority, zone, &sc);
 			reclaim_state->reclaimed_slab = 0;
@@ -2236,7 +2236,7 @@ loop_again:
 			    total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
 				sc.may_writepage = 1;
 
-			if (!zone_watermark_ok(zone, order,
+			if (!zone_watermark_ok_safe(zone, order,
 					high_wmark_pages(zone), end_zone, 0)) {
 				all_zones_ok = 0;
 				/*
@@ -2244,7 +2244,7 @@ loop_again:
 				 * means that we have a GFP_ATOMIC allocation
 				 * failure risk. Hurry up!
 				 */
-				if (!zone_watermark_ok(zone, order,
+				if (!zone_watermark_ok_safe(zone, order,
 					    min_wmark_pages(zone), end_zone, 0))
 					has_under_min_watermark_zone = 1;
 			}
@@ -2378,7 +2378,9 @@ static int kswapd(void *p)
 			 */
 			if (!sleeping_prematurely(pgdat, order, remaining)) {
 				trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
+				restore_pgdat_percpu_threshold(pgdat);
 				schedule();
+				reduce_pgdat_percpu_threshold(pgdat);
 			} else {
 				if (remaining)
 					count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
@@ -2417,16 +2419,17 @@ void wakeup_kswapd(struct zone *zone, int order)
 	if (!populated_zone(zone))
 		return;
 
-	pgdat = zone->zone_pgdat;
-	if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
+	if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
 		return;
+	pgdat = zone->zone_pgdat;
 	if (pgdat->kswapd_max_order < order)
 		pgdat->kswapd_max_order = order;
-	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
-	if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
-		return;
 	if (!waitqueue_active(&pgdat->kswapd_wait))
 		return;
+	if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
+		return;
+
+	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
 	wake_up_interruptible(&pgdat->kswapd_wait);
 }
 
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 355a9e6..4d7faeb 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -81,6 +81,30 @@ EXPORT_SYMBOL(vm_stat);
 
 #ifdef CONFIG_SMP
 
+static int calculate_pressure_threshold(struct zone *zone)
+{
+	int threshold;
+	int watermark_distance;
+
+	/*
+	 * As vmstats are not up to date, there is drift between the estimated
+	 * and real values. For high thresholds and a high number of CPUs, it
+	 * is possible for the min watermark to be breached while the estimated
+	 * value looks fine. The pressure threshold is a reduced value such
+	 * that even the maximum amount of drift will not accidentally breach
+	 * the min watermark
+	 */
+	watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
+	threshold = max(1, (int)(watermark_distance / num_online_cpus()));
+
+	/*
+	 * Maximum threshold is 125
+	 */
+	threshold = min(125, threshold);
+
+	return threshold;
+}
+
 static int calculate_threshold(struct zone *zone)
 {
 	int threshold;
@@ -159,6 +183,48 @@ static void refresh_zone_stat_thresholds(void)
 	}
 }
 
+void reduce_pgdat_percpu_threshold(pg_data_t *pgdat)
+{
+	struct zone *zone;
+	int cpu;
+	int threshold;
+	int i;
+
+	get_online_cpus();
+	for (i = 0; i < pgdat->nr_zones; i++) {
+		zone = &pgdat->node_zones[i];
+		if (!zone->percpu_drift_mark)
+			continue;
+
+		threshold = calculate_pressure_threshold(zone);
+		for_each_online_cpu(cpu)
+			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+							= threshold;
+	}
+	put_online_cpus();
+}
+
+void restore_pgdat_percpu_threshold(pg_data_t *pgdat)
+{
+	struct zone *zone;
+	int cpu;
+	int threshold;
+	int i;
+
+	get_online_cpus();
+	for (i = 0; i < pgdat->nr_zones; i++) {
+		zone = &pgdat->node_zones[i];
+		if (!zone->percpu_drift_mark)
+			continue;
+
+		threshold = calculate_threshold(zone);
+		for_each_online_cpu(cpu)
+			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+							= threshold;
+	}
+	put_online_cpus();
+}
+
 /*
  * For use when we know that interrupts are disabled.
  */
@@ -826,7 +892,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 		   "\n        scanned  %lu"
 		   "\n        spanned  %lu"
 		   "\n        present  %lu",
-		   zone_nr_free_pages(zone),
+		   zone_page_state(zone, NR_FREE_PAGES),
 		   min_wmark_pages(zone),
 		   low_wmark_pages(zone),
 		   high_wmark_pages(zone),
--
1.7.3.2
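
The core of this first patch is the split watermark check: callers pick how the free-page count is read. The fast path keeps using the cheap, possibly stale per-cpu estimate, while the _safe variant folds the per-cpu deltas back in, but only when the estimate has fallen below percpu_drift_mark. A stand-alone sketch of that shape, with simplified types and hypothetical names rather than the kernel's real structures:

#include <stdbool.h>
#include <stdio.h>

#define NCPUS 4

/* Simplified stand-in for a zone's free-page accounting. */
struct fake_zone {
	long global_free;         /* cheap, possibly stale counter */
	long percpu_delta[NCPUS]; /* per-cpu deltas not yet folded in */
	long percpu_drift_mark;   /* below this, the estimate is risky */
};

/* Cheap read: the global counter only (like zone_page_state()). */
static long free_estimate(const struct fake_zone *z)
{
	return z->global_free;
}

/* Expensive read: fold in every per-cpu delta
 * (like zone_page_state_snapshot()). */
static long free_snapshot(const struct fake_zone *z)
{
	long v = z->global_free;
	for (int cpu = 0; cpu < NCPUS; cpu++)
		v += z->percpu_delta[cpu];
	return v;
}

/* Both entry points funnel into one core check, as the patch does
 * with __zone_watermark_ok(). */
static bool watermark_ok_core(long free_pages, long mark)
{
	return free_pages > mark;
}

static bool watermark_ok(const struct fake_zone *z, long mark)
{
	return watermark_ok_core(free_estimate(z), mark);
}

static bool watermark_ok_safe(const struct fake_zone *z, long mark)
{
	long free_pages = free_estimate(z);

	/* Pay for the accurate read only when drift could matter. */
	if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
		free_pages = free_snapshot(z);
	return watermark_ok_core(free_pages, mark);
}

int main(void)
{
	/* Estimate says 900 free, but 200 of those pages were consumed
	 * and sit as unflushed per-cpu deltas of -50 each. */
	struct fake_zone z = { 900, { -50, -50, -50, -50 }, 1000 };

	printf("fast check (mark 800): %d\n", watermark_ok(&z, 800));      /* 1 */
	printf("safe check (mark 800): %d\n", watermark_ok_safe(&z, 800)); /* 0 */
	return 0;
}

The wakeup_kswapd() reordering in the same patch follows from this: the cheap cpuset and waitqueue checks now run before the watermark test, since that test has become the expensive step.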
mm-vmstat-use-a-single-setter-function-and-callback-for-adjusting-percpu-thresholds.patch (new file)
@@ -0,0 +1,167 @@
From 82e3d4969144377d13da97d511e849e8cf3e6dcc Mon Sep 17 00:00:00 2001
From: Mel Gorman <mel@csn.ul.ie>
Date: Wed, 24 Nov 2010 22:24:24 -0500
Subject: [PATCH 2/2] mm: vmstat: Use a single setter function and callback for adjusting percpu thresholds

reduce_pgdat_percpu_threshold() and restore_pgdat_percpu_threshold() exist
to adjust the per-cpu vmstat thresholds while kswapd is awake to avoid
errors due to counter drift. The functions duplicate some code so this
patch replaces them with a single set_pgdat_percpu_threshold() that takes
a callback function to calculate the desired threshold as a parameter.

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Reviewed-by: Christoph Lameter <cl@linux.com>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
[the various mmotm patches updating this were rolled up. --kyle]
[http://userweb.kernel.org/~akpm/mmotm/broken-out/mm-vmstat-use-a-single-setter-function-and-callback-for-adjusting-percpu-thresholds-fix-set_pgdat_percpu_threshold-dont-use-for_each_online_cpu.patch]
---
 include/linux/vmstat.h |   10 ++++++----
 mm/vmscan.c            |   19 +++++++++++++++++--
 mm/vmstat.c            |   36 +++++++-----------------------------
 3 files changed, 30 insertions(+), 35 deletions(-)

diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index e4cc21c..833e676 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -254,8 +254,11 @@ extern void dec_zone_state(struct zone *, enum zone_stat_item);
 extern void __dec_zone_state(struct zone *, enum zone_stat_item);
 
 void refresh_cpu_vm_stats(int);
-void reduce_pgdat_percpu_threshold(pg_data_t *pgdat);
-void restore_pgdat_percpu_threshold(pg_data_t *pgdat);
+
+int calculate_pressure_threshold(struct zone *zone);
+int calculate_normal_threshold(struct zone *zone);
+void set_pgdat_percpu_threshold(pg_data_t *pgdat,
+				int (*calculate_pressure)(struct zone *));
 #else /* CONFIG_SMP */
 
 /*
@@ -300,8 +303,7 @@ static inline void __dec_zone_page_state(struct page *page,
 #define dec_zone_page_state __dec_zone_page_state
 #define mod_zone_page_state __mod_zone_page_state
 
-static inline void reduce_pgdat_percpu_threshold(pg_data_t *pgdat) { }
-static inline void restore_pgdat_percpu_threshold(pg_data_t *pgdat) { }
+#define set_pgdat_percpu_threshold(pgdat, callback) { }
 
 static inline void refresh_cpu_vm_stats(int cpu) { }
 #endif
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 3e71cb1..ba39948 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2378,9 +2378,24 @@ static int kswapd(void *p)
 			 */
 			if (!sleeping_prematurely(pgdat, order, remaining)) {
 				trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
-				restore_pgdat_percpu_threshold(pgdat);
+
+				/*
+				 * vmstat counters are not perfectly
+				 * accurate and the estimated value
+				 * for counters such as NR_FREE_PAGES
+				 * can deviate from the true value by
+				 * nr_online_cpus * threshold. To
+				 * avoid the zone watermarks being
+				 * breached while under pressure, we
+				 * reduce the per-cpu vmstat threshold
+				 * while kswapd is awake and restore
+				 * them before going back to sleep.
+				 */
+				set_pgdat_percpu_threshold(pgdat,
+					calculate_normal_threshold);
 				schedule();
-				reduce_pgdat_percpu_threshold(pgdat);
+				set_pgdat_percpu_threshold(pgdat,
+					calculate_pressure_threshold);
 			} else {
 				if (remaining)
 					count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 4d7faeb..511c2c0 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -81,7 +81,7 @@ EXPORT_SYMBOL(vm_stat);
 
 #ifdef CONFIG_SMP
 
-static int calculate_pressure_threshold(struct zone *zone)
+int calculate_pressure_threshold(struct zone *zone)
 {
 	int threshold;
 	int watermark_distance;
@@ -105,7 +105,7 @@ static int calculate_pressure_threshold(struct zone *zone)
 	return threshold;
 }
 
-static int calculate_threshold(struct zone *zone)
+int calculate_normal_threshold(struct zone *zone)
 {
 	int threshold;
 	int mem;	/* memory in 128 MB units */
@@ -164,7 +164,7 @@ static void refresh_zone_stat_thresholds(void)
 	for_each_populated_zone(zone) {
 		unsigned long max_drift, tolerate_drift;
 
-		threshold = calculate_threshold(zone);
+		threshold = calculate_normal_threshold(zone);
 
 		for_each_online_cpu(cpu)
 			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
@@ -183,46 +183,24 @@ static void refresh_zone_stat_thresholds(void)
 	}
 }
 
-void reduce_pgdat_percpu_threshold(pg_data_t *pgdat)
+void set_pgdat_percpu_threshold(pg_data_t *pgdat,
+				int (*calculate_pressure)(struct zone *))
 {
 	struct zone *zone;
 	int cpu;
 	int threshold;
 	int i;
 
-	get_online_cpus();
-	for (i = 0; i < pgdat->nr_zones; i++) {
-		zone = &pgdat->node_zones[i];
-		if (!zone->percpu_drift_mark)
-			continue;
-
-		threshold = calculate_pressure_threshold(zone);
-		for_each_online_cpu(cpu)
-			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
-							= threshold;
-	}
-	put_online_cpus();
-}
-
-void restore_pgdat_percpu_threshold(pg_data_t *pgdat)
-{
-	struct zone *zone;
-	int cpu;
-	int threshold;
-	int i;
-
-	get_online_cpus();
 	for (i = 0; i < pgdat->nr_zones; i++) {
 		zone = &pgdat->node_zones[i];
 		if (!zone->percpu_drift_mark)
 			continue;
 
-		threshold = calculate_threshold(zone);
-		for_each_online_cpu(cpu)
+		threshold = (*calculate_pressure)(zone);
+		for_each_possible_cpu(cpu)
 			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
 							= threshold;
 	}
-	put_online_cpus();
 }
 
 /*
--
1.7.3.2
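
This second patch is a pure de-duplication: two near-identical loops that differed only in which threshold formula they invoked become one walker that takes the formula as a function pointer. A minimal sketch of that callback shape, with hypothetical names and made-up numbers standing in for the kernel's zone state:

#include <stdio.h>

/* Threshold while kswapd is awake: spread the gap between the low and
 * min watermarks across CPUs so total drift cannot breach the min
 * watermark; clamped to the documented maximum of 125. */
static int pressure_threshold(int watermark_distance, int online_cpus)
{
	int t = watermark_distance / online_cpus;
	if (t < 1)
		t = 1;
	return t < 125 ? t : 125;
}

/* Normal threshold: a stand-in for the zone-size-based formula used
 * when kswapd is asleep. */
static int normal_threshold(int watermark_distance, int online_cpus)
{
	(void)watermark_distance;
	(void)online_cpus;
	return 125;
}

/* One setter parameterized by policy, mirroring the shape of
 * set_pgdat_percpu_threshold(pgdat, callback). */
static void set_thresholds(int *per_zone, int nzones,
			   int (*calc)(int, int))
{
	for (int i = 0; i < nzones; i++)
		per_zone[i] = calc(500, 16); /* 500-page gap, 16 CPUs */
}

int main(void)
{
	int thresholds[3];

	set_thresholds(thresholds, 3, pressure_threshold); /* kswapd wakes */
	printf("pressure threshold: %d\n", thresholds[0]); /* 31 */

	set_thresholds(thresholds, 3, normal_threshold);   /* back to sleep */
	printf("normal threshold:   %d\n", thresholds[0]); /* 125 */
	return 0;
}

Note the rolled-up fix visible in the diff above: the kernel version walks for_each_possible_cpu() instead of for_each_online_cpu(), which is why the get_online_cpus()/put_online_cpus() pair could be dropped.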