444 lines
11 KiB
Diff
444 lines
11 KiB
Diff
|
From 3c549c7494e729a68b64ac5519bcf1506b24f945 Mon Sep 17 00:00:00 2001
|
||
|
From: Laurent Dufour <ldufour@linux.ibm.com>
|
||
|
Date: Wed, 25 Nov 2020 18:03:45 +0100
|
||
|
Subject: [PATCH 3/3] drmgr: introduce NUMA based LMB removal
|
||
|
|
||
|
When the NUMA topology can be read, all the LMBs found in the Device Tree
|
||
|
are linked to the corresponding node. LMBs not associated with a node are
|
||
|
considered as not used.
|
||
|
|
||
|
LMBs associated with CPU-less nodes are accounted separately because they will
|
||
|
be targeted first for removal. The LMBs are removed from the CPU-less nodes
|
||
|
to reach an average number of LMBs per CPU-less node.
|
||
|
|
||
|
Nodes with CPUs have a ratio indexed on their number of CPUs. The higher a
|
||
|
node's CPU count, the fewer LMBs will be removed. This way, nodes with a
|
||
|
high number of CPUs will get a higher amount of memory.
|
||
|
|
||
|
When an LMB can't be removed (because its memory can't be offlined by the
|
||
|
kernel), the LMB count for the node is decremented and the LMB is removed from
|
||
|
the node's LMB list. This way, it is no longer accounted as 'active' and the
|
||
|
removal operation will continue without taking it into account anymore.
|
||
|
|
||
|
The removal is done through the remove by DRC index API, allowing to remove
|
||
|
one LMB at a time. One future optimization would be to extend that API to
|
||
|
remove a linear range of LMBs each time.
|
||
|
|
||
|
If the NUMA topology can't be read, we fall back to the legacy removal
|
||
|
way.
|
||
|
|
||
|
Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com>
|
||
|
---
|
||
|
src/drmgr/drslot_chrp_mem.c | 335 +++++++++++++++++++++++++++++++++++-
|
||
|
src/drmgr/ofdt.h | 2 +
|
||
|
2 files changed, 336 insertions(+), 1 deletion(-)
|
||
|
|
||
|
diff --git a/src/drmgr/drslot_chrp_mem.c b/src/drmgr/drslot_chrp_mem.c
|
||
|
index 502aa3e9fff0..47d9f7b8ed90 100644
|
||
|
--- a/src/drmgr/drslot_chrp_mem.c
|
||
|
+++ b/src/drmgr/drslot_chrp_mem.c
|
||
|
@@ -31,12 +31,16 @@
|
||
|
#include "dr.h"
|
||
|
#include "ofdt.h"
|
||
|
#include "drmem.h"
|
||
|
+#include "common_numa.h"
|
||
|
|
||
|
static int block_sz_bytes = 0;
|
||
|
static char *state_strs[] = {"offline", "online"};
|
||
|
|
||
|
static char *usagestr = "-c mem {-a | -r} {-q <quantity> -p {variable_weight | ent_capacity} | {-q <quantity> | -s [<drc_name> | <drc_index>]}}";
|
||
|
|
||
|
+static struct numa_topology numa;
|
||
|
+static int numa_enabled = 0;
|
||
|
+
|
||
|
/**
|
||
|
* mem_usage
|
||
|
* @brief return usage string
|
||
|
@@ -306,6 +310,31 @@ get_mem_node_lmbs(struct lmb_list_head *lmb_list)
|
||
|
return rc;
|
||
|
}
|
||
|
|
||
|
+static int link_lmb_to_numa_node(struct dr_node *lmb)
|
||
|
+{
|
||
|
+ int nid;
|
||
|
+ struct numa_node *node;
|
||
|
+
|
||
|
+ nid = numa_aa_index_to_node(&numa, lmb->lmb_aa_index);
|
||
|
+ if (nid == NUMA_NO_NODE)
|
||
|
+ return 0;
|
||
|
+
|
||
|
+ node = numa_fetch_node(&numa, nid);
|
||
|
+ if (!node)
|
||
|
+ return -ENOMEM;
|
||
|
+
|
||
|
+ lmb->lmb_numa_next = node->lmbs;
|
||
|
+ node->lmbs = lmb;
|
||
|
+ node->n_lmbs++;
|
||
|
+
|
||
|
+ if (node->n_cpus)
|
||
|
+ numa.lmb_count++;
|
||
|
+ else
|
||
|
+ numa.cpuless_lmb_count++;
|
||
|
+
|
||
|
+ return 0;
|
||
|
+}
|
||
|
+
|
||
|
int add_lmb(struct lmb_list_head *lmb_list, uint32_t drc_index,
|
||
|
uint64_t address, uint64_t lmb_sz, uint32_t aa_index,
|
||
|
uint32_t flags)
|
||
|
@@ -324,6 +353,9 @@ int add_lmb(struct lmb_list_head *lmb_list, uint32_t drc_index,
|
||
|
lmb->lmb_address = address;
|
||
|
lmb->lmb_aa_index = aa_index;
|
||
|
|
||
|
+ if (numa_enabled && link_lmb_to_numa_node(lmb))
|
||
|
+ return -ENOMEM;
|
||
|
+
|
||
|
if (flags & DRMEM_ASSIGNED) {
|
||
|
int rc;
|
||
|
|
||
|
@@ -490,7 +522,7 @@ get_dynamic_reconfig_lmbs(struct lmb_list_head *lmb_list)
|
||
|
|
||
|
if (stat(DYNAMIC_RECONFIG_MEM_V1, &sbuf) == 0) {
|
||
|
rc = get_dynamic_reconfig_lmbs_v1(lmb_sz, lmb_list);
|
||
|
- } else if (is_lsslot_cmd &&
|
||
|
+ } else if ((is_lsslot_cmd || numa_enabled) &&
|
||
|
stat(DYNAMIC_RECONFIG_MEM_V2, &sbuf) == 0) {
|
||
|
rc = get_dynamic_reconfig_lmbs_v2(lmb_sz, lmb_list);
|
||
|
} else {
|
||
|
@@ -1424,11 +1456,312 @@ int valid_mem_options(void)
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
+static int remove_lmb_by_index(uint32_t drc_index)
|
||
|
+{
|
||
|
+ char cmdbuf[128];
|
||
|
+ int offset;
|
||
|
+
|
||
|
+ offset = sprintf(cmdbuf, "memory remove index 0x%x", drc_index);
|
||
|
+
|
||
|
+ return __do_kernel_dlpar(cmdbuf, offset, 1 /* Don't report error */);
|
||
|
+}
|
||
|
+
|
||
|
+static int remove_lmb_from_node(struct numa_node *node, uint32_t count)
|
||
|
+{
|
||
|
+ struct dr_node *lmb;
|
||
|
+ int err, done = 0, unlinked = 0;
|
||
|
+
|
||
|
+ say(DEBUG, "Try removing %d / %d LMBs from node %d\n",
|
||
|
+ count, node->n_lmbs, node->node_id);
|
||
|
+
|
||
|
+ for (lmb = node->lmbs; lmb && done < count; lmb = lmb->lmb_numa_next) {
|
||
|
+ unlinked ++;
|
||
|
+ err = remove_lmb_by_index(lmb->drc_index);
|
||
|
+ if (err)
|
||
|
+ say(WARN,"Can't remove LMB node:%d index:0x%x: %s\n",
|
||
|
+ node->node_id, lmb->drc_index, strerror(-err));
|
||
|
+ else
|
||
|
+ done++;
|
||
|
+ }
|
||
|
+
|
||
|
+ /*
|
||
|
+ * Decrement the node's LMB count by every LMB we attempted: whether
|
||
|
+ * or not the removal succeeded, it will not be tried again on that
|
||
|
+ * LMB.
|
||
|
+ */
|
||
|
+ node->n_lmbs -= unlinked;
|
||
|
+
|
||
|
+ /*
|
||
|
+ * Advance the node's LMB list past the ones we removed or tried to
|
||
|
+ * remove, so they are not processed again.
|
||
|
+ */
|
||
|
+ node->lmbs = lmb;
|
||
|
+
|
||
|
+ /* Update numa's LMB counters (node counters are handled below) */
|
||
|
+ if (node->n_cpus)
|
||
|
+ numa.lmb_count -= unlinked;
|
||
|
+ else
|
||
|
+ numa.cpuless_lmb_count -= unlinked;
|
||
|
+
|
||
|
+ if (!node->n_lmbs) {
|
||
|
+ node->ratio = 0; /* for sanity only */
|
||
|
+ if (node->n_cpus)
|
||
|
+ numa.cpu_count -= node->n_cpus;
|
||
|
+ else
|
||
|
+ numa.cpuless_node_count--;
|
||
|
+ }
|
||
|
+
|
||
|
+ say(INFO, "Removed %d LMBs from node %d\n", done, node->node_id);
|
||
|
+ return done;
|
||
|
+}
|
||
|
+
|
||
|
+#define min(a,b) ((a < b) ? a : b)
|
||
|
+
|
||
|
+static void update_cpuless_node_ratio(void)
|
||
|
+{
|
||
|
+ struct numa_node *node;
|
||
|
+ int nid;
|
||
|
+
|
||
|
+ /*
|
||
|
+ * Assumptions:
|
||
|
+ * 1. numa->cpuless_node_count is up to date
|
||
|
+ * 2. numa->cpuless_lmb_count is up to date
|
||
|
+ * Nodes with no memory and nodes with CPUs are ignored here.
|
||
|
+ */
|
||
|
+ numa_foreach_node(&numa, nid, node) {
|
||
|
+ if (node->n_cpus ||!node->n_lmbs)
|
||
|
+ continue;
|
||
|
+ node->ratio = (node->n_lmbs * 100) / numa.cpuless_lmb_count;
|
||
|
+ }
|
||
|
+}
|
||
|
+
|
||
|
+/*
|
||
|
+ * Remove LMBs from node without CPUs only.
|
||
|
+ * The more the node has LMBs, the more LMBs will be removed from it.
|
||
|
+ *
|
||
|
+ * We have to retry the operation multiple times because some LMB cannot be
|
||
|
+ * removed due to the page usage in the kernel. In that case, that LMB is no
|
||
|
+ * more taken in account and the node's LMB count is decremented, assuming that
|
||
|
+ * LMB is unremovable at this time. Thus each node's ratio has to be computed on
|
||
|
+ * each iteration. This is not a big deal: usually, there are not that many nodes.
|
||
|
+ */
|
||
|
+static int remove_cpuless_lmbs(uint32_t count)
|
||
|
+{
|
||
|
+ struct numa_node *node;
|
||
|
+ int nid;
|
||
|
+ uint32_t total = count, todo, done = 0, this_loop;
|
||
|
+
|
||
|
+ while (count) {
|
||
|
+ count = min(count, numa.cpuless_lmb_count);
|
||
|
+ if (!count)
|
||
|
+ break;
|
||
|
+
|
||
|
+ update_cpuless_node_ratio();
|
||
|
+
|
||
|
+ this_loop = 0;
|
||
|
+ numa_foreach_node(&numa, nid, node) {
|
||
|
+ if (!node->n_lmbs || node->n_cpus)
|
||
|
+ continue;
|
||
|
+
|
||
|
+ todo = (count * node->ratio) / 100;
|
||
|
+ todo = min(todo, node->n_lmbs);
|
||
|
+ /* Fix rounded value to 0 */
|
||
|
+ if (!todo && node->n_lmbs)
|
||
|
+ todo = (count - this_loop);
|
||
|
+
|
||
|
+ if (todo)
|
||
|
+ todo = remove_lmb_from_node(node, todo);
|
||
|
+
|
||
|
+ this_loop += todo;
|
||
|
+ done += todo;
|
||
|
+ if (done >= total)
|
||
|
+ break;
|
||
|
+ }
|
||
|
+
|
||
|
+ /* Don't continue if we didn't make any progress. */
|
||
|
+ if (!this_loop)
|
||
|
+ break;
|
||
|
+
|
||
|
+ count -= this_loop;
|
||
|
+ }
|
||
|
+
|
||
|
+ say(DEBUG, "%d / %d LMBs removed from the CPU less nodes\n",
|
||
|
+ done, total);
|
||
|
+ return done;
|
||
|
+}
|
||
|
+
|
||
|
+static void update_node_ratio(void)
|
||
|
+{
|
||
|
+ int nid;
|
||
|
+ struct numa_node *node, *n, **p;
|
||
|
+ uint32_t cpu_ratio, mem_ratio;
|
||
|
+
|
||
|
+ /*
|
||
|
+ * Assumptions:
|
||
|
+ * 1. numa->cpu_count is up to date
|
||
|
+ * 2. numa->lmb_count is up to date
|
||
|
+ * Nodes with no memory and nodes with no CPU are ignored here.
|
||
|
+ */
|
||
|
+
|
||
|
+ numa.ratio = NULL;
|
||
|
+ numa_foreach_node(&numa, nid, node) {
|
||
|
+ if (!node->n_lmbs || !node->n_cpus)
|
||
|
+ continue;
|
||
|
+ cpu_ratio = (node->n_cpus * 100) / numa.cpu_count;
|
||
|
+ mem_ratio = (node->n_lmbs * 100) / numa.lmb_count;
|
||
|
+
|
||
|
+ /* Say that CPU ratio is 90% of the ratio */
|
||
|
+ node->ratio = (cpu_ratio * 9 + mem_ratio) / 10;
|
||
|
+ }
|
||
|
+
|
||
|
+ /* Create an ordered link of the nodes */
|
||
|
+ numa_foreach_node(&numa, nid, node) {
|
||
|
+ if (!node->n_lmbs || !node->n_cpus)
|
||
|
+ continue;
|
||
|
+
|
||
|
+ p = &numa.ratio;
|
||
|
+ for (n = numa.ratio;
|
||
|
+ n && n->ratio < node->ratio; n = n->ratio_next)
|
||
|
+ p = &n->ratio_next;
|
||
|
+ *p = node;
|
||
|
+ node->ratio_next = n;
|
||
|
+ }
|
||
|
+}
|
||
|
+
|
||
|
+/*
|
||
|
+ * Remove LMBs from node with CPUs.
|
||
|
+ *
|
||
|
+ * The fewer CPUs a node has, the more memory will be removed from it.
|
||
|
+ *
|
||
|
+ * As for the CPU less nodes, we must iterate because some LMBs may not be
|
||
|
+ * removable at this time.
|
||
|
+ */
|
||
|
+static int remove_cpu_lmbs(uint32_t count)
|
||
|
+{
|
||
|
+ struct numa_node *node;
|
||
|
+ uint32_t total = count, todo, done = 0, this_loop;
|
||
|
+ uint32_t new_lmb_count;
|
||
|
+
|
||
|
+ while(count) {
|
||
|
+ count = min(count, numa.lmb_count);
|
||
|
+ if (!count)
|
||
|
+ break;
|
||
|
+
|
||
|
+ update_node_ratio();
|
||
|
+
|
||
|
+ new_lmb_count = numa.lmb_count - count;
|
||
|
+
|
||
|
+ this_loop = 0;
|
||
|
+ numa_foreach_node_by_ratio(&numa, node) {
|
||
|
+ if (!node->n_lmbs || !node->n_cpus)
|
||
|
+ continue;
|
||
|
+
|
||
|
+ todo = (new_lmb_count * node->ratio) / 100;
|
||
|
+ todo = node->n_lmbs - min(todo, node->n_lmbs);
|
||
|
+ todo = min(count, todo);
|
||
|
+
|
||
|
+ if (todo) {
|
||
|
+ todo = remove_lmb_from_node(node, todo);
|
||
|
+ count -= todo;
|
||
|
+ this_loop += todo;
|
||
|
+ }
|
||
|
+
|
||
|
+ if (!count)
|
||
|
+ break;
|
||
|
+ }
|
||
|
+
|
||
|
+ /* Don't continue if we didn't make any progress. */
|
||
|
+ if (!this_loop)
|
||
|
+ break;
|
||
|
+ done += this_loop;
|
||
|
+ }
|
||
|
+
|
||
|
+ say(DEBUG, "%d / %d LMBs removed from the CPU nodes\n",
|
||
|
+ done, total);
|
||
|
+ return done;
|
||
|
+}
|
||
|
+
|
||
|
+static void build_numa_topology(void)
|
||
|
+{
|
||
|
+ int rc;
|
||
|
+
|
||
|
+ rc = numa_get_topology(&numa);
|
||
|
+ if (rc)
|
||
|
+ return;
|
||
|
+
|
||
|
+ numa_enabled = 1;
|
||
|
+}
|
||
|
+
|
||
|
+static void clear_numa_lmb_links(void)
|
||
|
+{
|
||
|
+ int nid;
|
||
|
+ struct numa_node *node;
|
||
|
+
|
||
|
+ numa_foreach_node(&numa, nid, node)
|
||
|
+ node->lmbs = NULL;
|
||
|
+}
|
||
|
+
|
||
|
+static int numa_based_remove(uint32_t count)
|
||
|
+{
|
||
|
+ struct lmb_list_head *lmb_list;
|
||
|
+ struct numa_node *node;
|
||
|
+ int nid;
|
||
|
+ uint32_t done = 0;
|
||
|
+
|
||
|
+ /*
|
||
|
+ * Read the LMBs
|
||
|
+ * Link the LMBs to their node
|
||
|
+ * Update global counter
|
||
|
+ */
|
||
|
+ lmb_list = get_lmbs(LMB_NORMAL_SORT);
|
||
|
+ if (lmb_list == NULL) {
|
||
|
+ clear_numa_lmb_links();
|
||
|
+ return -1;
|
||
|
+ }
|
||
|
+
|
||
|
+ if (!numa.node_count) {
|
||
|
+ clear_numa_lmb_links();
|
||
|
+ free_lmbs(lmb_list);
|
||
|
+ return -EINVAL;
|
||
|
+ }
|
||
|
+
|
||
|
+ numa_foreach_node(&numa, nid, node) {
|
||
|
+ say(INFO, "node %4d %4d CPUs %8d LMBs\n",
|
||
|
+ nid, node->n_cpus, node->n_lmbs);
|
||
|
+ }
|
||
|
+
|
||
|
+ done += remove_cpuless_lmbs(count);
|
||
|
+ count -= done;
|
||
|
+
|
||
|
+ done += remove_cpu_lmbs(count);
|
||
|
+
|
||
|
+ report_resource_count(done);
|
||
|
+
|
||
|
+ clear_numa_lmb_links();
|
||
|
+ free_lmbs(lmb_list);
|
||
|
+ return 0;
|
||
|
+}
|
||
|
+
|
||
|
int do_mem_kernel_dlpar(void)
|
||
|
{
|
||
|
char cmdbuf[128];
|
||
|
int rc, offset;
|
||
|
|
||
|
+
|
||
|
+ if (usr_action == REMOVE && usr_drc_count) {
|
||
|
+ build_numa_topology();
|
||
|
+ if (numa_enabled) {
|
||
|
+ if (!numa_based_remove(usr_drc_count))
|
||
|
+ return 0;
|
||
|
+
|
||
|
+ /*
|
||
|
+ * If the NUMA based removal failed, lets try the legacy
|
||
|
+ * way.
|
||
|
+ */
|
||
|
+ say(WARN, "Can't do NUMA based removal operation.\n");
|
||
|
+ }
|
||
|
+ }
|
||
|
+
|
||
|
offset = sprintf(cmdbuf, "%s ", "memory");
|
||
|
|
||
|
switch (usr_action) {
|
||
|
diff --git a/src/drmgr/ofdt.h b/src/drmgr/ofdt.h
|
||
|
index 3850a77229b4..3c2840b2e0ee 100644
|
||
|
--- a/src/drmgr/ofdt.h
|
||
|
+++ b/src/drmgr/ofdt.h
|
||
|
@@ -92,6 +92,7 @@ struct dr_node {
|
||
|
uint32_t _lmb_aa_index;
|
||
|
struct mem_scn *_mem_scns;
|
||
|
struct of_node *_of_node;
|
||
|
+ struct dr_node *_numa_next;
|
||
|
} _smem;
|
||
|
|
||
|
#define lmb_address _node_u._smem._address
|
||
|
@@ -99,6 +100,7 @@ struct dr_node {
|
||
|
#define lmb_aa_index _node_u._smem._lmb_aa_index
|
||
|
#define lmb_mem_scns _node_u._smem._mem_scns
|
||
|
#define lmb_of_node _node_u._smem._of_node
|
||
|
+#define lmb_numa_next _node_u._smem._numa_next
|
||
|
|
||
|
struct hea_info {
|
||
|
uint _port_no;
|
||
|
--
|
||
|
2.29.2
|
||
|
|