merge upstream patches

This commit is contained in:
Than Ngo 2021-06-01 11:19:01 +02:00
parent e3f1a6d00a
commit f50e4f6aef
10 changed files with 1291 additions and 0 deletions

View File

@ -0,0 +1,87 @@
From 014e8ba4580c7917e258df084776c16079dc07ce Mon Sep 17 00:00:00 2001
From: Laurent Dufour <ldufour@linux.ibm.com>
Date: Tue, 24 Nov 2020 19:28:48 +0100
Subject: [PATCH 1/3] drmgr: don't open sysfs file for each command
The new __do_kernel_dlpar() API will be used in a later commit to remove
LMBs one at a time by DRC index. This avoids opening and closing the fd
each time.
The fd closing will now be done at the process exit time.
In addition, add an optional parameter to silently ignore some errors.
Also, change the log level of the "success" message to debug to match
the previous one saying "Trying.."
Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com>
---
src/drmgr/common.c | 22 +++++++++++++---------
src/drmgr/dr.h | 3 ++-
2 files changed, 15 insertions(+), 10 deletions(-)
diff --git a/src/drmgr/common.c b/src/drmgr/common.c
index 5e8135bcf77e..25d244cb2f57 100644
--- a/src/drmgr/common.c
+++ b/src/drmgr/common.c
@@ -1469,32 +1469,36 @@ int kernel_dlpar_exists(void)
* @param cmd command string to write to sysfs
* @returns 0 on success, !0 otherwise
*/
-int do_kernel_dlpar(const char *cmd, int cmdlen)
+int __do_kernel_dlpar(const char *cmd, int cmdlen, int silent_error)
{
- int fd, rc;
+ /* Opened on first use and kept open; closed at process exit. */
+ static int fd = -1;
+ int rc;
int my_errno;
say(DEBUG, "Initiating kernel DLPAR \"%s\"\n", cmd);
/* write to file */
- fd = open(SYSFS_DLPAR_FILE, O_WRONLY);
- if (fd <= 0) {
- say(ERROR, "Could not open %s to initiate DLPAR request\n",
- SYSFS_DLPAR_FILE);
- return -1;
+ if (fd == -1) {
+ fd = open(SYSFS_DLPAR_FILE, O_WRONLY);
+ /* open() only fails with -1; 0 is a valid descriptor */
+ if (fd < 0) {
+ say(ERROR, "Could not open %s to initiate DLPAR request\n",
+ SYSFS_DLPAR_FILE);
+ return -1;
+ }
}
rc = write(fd, cmd, cmdlen);
my_errno = errno;
- close(fd);
if (rc <= 0) {
+ /* errno is stale when write() returned 0, don't report it */
+ if (silent_error)
+ return (rc == 0 || my_errno == 0) ? -1 : -my_errno;
/* write does not set errno for rc == 0 */
say(ERROR, "Failed to write to %s: %s\n", SYSFS_DLPAR_FILE,
(rc == 0) ? "wrote 0 bytes" : strerror(my_errno));
return -1;
}
- say(INFO, "Success\n");
+ say(DEBUG, "Success\n");
return 0;
}
diff --git a/src/drmgr/dr.h b/src/drmgr/dr.h
index f171bfea73c3..00d2fffc9919 100644
--- a/src/drmgr/dr.h
+++ b/src/drmgr/dr.h
@@ -172,5 +172,6 @@ enum drc_type to_drc_type(const char *);
int handle_prrn(void);
int kernel_dlpar_exists(void);
-int do_kernel_dlpar(const char *, int);
+int __do_kernel_dlpar(const char *, int, int);
+#define do_kernel_dlpar(c, l) __do_kernel_dlpar(c, l, 0)
#endif
--
2.29.2

View File

@ -0,0 +1,40 @@
From 16469b696959aee4ce32d9f77483e1e3f192e82d Mon Sep 17 00:00:00 2001
From: Laurent Dufour <ldufour@linux.ibm.com>
Date: Fri, 16 Apr 2021 18:10:36 +0200
Subject: [PATCH] drmgr: fix remove by index operation
The commit e9f06531356f ("drmgr: introduce NUMA based LMB removal")
introduced special processing when NUMA is on and a remove by count
operation is done.
Unfortunately, that code is also triggered when doing a remove by index
operation (-s argument) because usr_drc_count is set to 1. As a
consequence the index constraint is not respected and any LMB can be
removed.
Add a check against usr_drc_index which is set when a remove by index
operation is done to ensure the numa removal code is not triggered in that
case.
Fixes: e9f06531356f ("drmgr: introduce NUMA based LMB removal")
Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com>
---
src/drmgr/drslot_chrp_mem.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/drmgr/drslot_chrp_mem.c b/src/drmgr/drslot_chrp_mem.c
index f17c94adc270..8db98bb9e9ea 100644
--- a/src/drmgr/drslot_chrp_mem.c
+++ b/src/drmgr/drslot_chrp_mem.c
@@ -1749,7 +1749,7 @@ int do_mem_kernel_dlpar(void)
int rc, offset;
- if (usr_action == REMOVE && usr_drc_count) {
+ if (usr_action == REMOVE && usr_drc_count && !usr_drc_index) {
build_numa_topology();
if (numa_enabled) {
if (!numa_based_remove(usr_drc_count))
--
2.31.1

View File

@ -0,0 +1,438 @@
From 88caa91a4c8f0ac2376da433f697bc6845595dac Mon Sep 17 00:00:00 2001
From: Laurent Dufour <ldufour@linux.ibm.com>
Date: Wed, 2 Dec 2020 16:10:57 +0100
Subject: [PATCH 2/3] drmgr: read the CPU NUMA topology
This will be used in the next commit to compute LMB removal based on the
NUMA topology.
The NUMA topology is read using the libnuma, so a dependency against it is
added in the configure file.
Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com>
---
Makefile.am | 5 +-
configure.ac | 4 +
src/drmgr/common_numa.c | 268 ++++++++++++++++++++++++++++++++++++++++
src/drmgr/common_numa.h | 83 +++++++++++++
4 files changed, 359 insertions(+), 1 deletion(-)
create mode 100644 src/drmgr/common_numa.c
create mode 100644 src/drmgr/common_numa.h
diff --git a/Makefile.am b/Makefile.am
index 2ff2232537df..31baaa74b353 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -155,6 +155,7 @@ src_drmgr_drmgr_SOURCES = \
src/drmgr/common_cpu.c \
src/drmgr/common_ofdt.c \
src/drmgr/common_pci.c \
+ src/drmgr/common_numa.c \
src/drmgr/drmgr.c \
src/drmgr/drmig_chrp_pmig.c \
src/drmgr/drslot_chrp_cpu.c \
@@ -171,13 +172,14 @@ noinst_HEADERS += \
src/drmgr/drcpu.h \
src/drmgr/dr.h \
src/drmgr/drmem.h \
+ src/drmgr/common_numa.h \
src/drmgr/drpci.h \
src/drmgr/rtas_calls.h \
src/drmgr/ofdt.h \
src/drmgr/rtas_calls.h \
src/drmgr/options.c
-src_drmgr_drmgr_LDADD = -lrtas
+src_drmgr_drmgr_LDADD = -lrtas -lnuma
src_drmgr_lsslot_SOURCES = \
src/drmgr/lsslot.c \
@@ -186,6 +188,7 @@ src_drmgr_lsslot_SOURCES = \
src/drmgr/common_cpu.c \
src/drmgr/common_pci.c \
src/drmgr/common_ofdt.c \
+ src/drmgr/common_numa.c \
src/drmgr/rtas_calls.c \
src/drmgr/drslot_chrp_mem.c \
$(pseries_platform_SOURCES)
diff --git a/configure.ac b/configure.ac
index de3c6758389a..0239754cc4f4 100644
--- a/configure.ac
+++ b/configure.ac
@@ -42,6 +42,10 @@ AC_CHECK_HEADER(zlib.h,
[AC_CHECK_LIB(z, inflate, [], [AC_MSG_FAILURE([zlib library is required for compilation])])],
[AC_MSG_FAILURE([zlib.h is required for compiliation])])
+AC_CHECK_HEADER(numa.h,
+ [AC_CHECK_LIB(numa, numa_available, [], [AC_MSG_FAILURE([numa library is required for compilation])])],
+ [AC_MSG_FAILURE([numa.h is required for compiliation])])
+
# check for librtas
AC_ARG_WITH([librtas],
[AS_HELP_STRING([--without-librtas],
diff --git a/src/drmgr/common_numa.c b/src/drmgr/common_numa.c
new file mode 100644
index 000000000000..5778769b25b6
--- /dev/null
+++ b/src/drmgr/common_numa.c
@@ -0,0 +1,268 @@
+/**
+ * @file common_numa.c
+ *
+ * Copyright (C) IBM Corporation 2020
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include <numa.h>
+
+#include "dr.h"
+#include "ofdt.h"
+#include "drmem.h" /* for DYNAMIC_RECONFIG_MEM */
+#include "common_numa.h"
+
+#define RTAS_DIRECTORY "/proc/device-tree/rtas"
+#define CHOSEN_DIRECTORY "/proc/device-tree/chosen"
+#define ASSOC_REF_POINTS "ibm,associativity-reference-points"
+#define ASSOC_LOOKUP_ARRAYS "ibm,associativity-lookup-arrays"
+#define ARCHITECTURE_VEC_5 "ibm,architecture-vec-5"
+
+/*
+ * Allocate and read a property, return the size.
+ * The read property is not converted to the host endianess.
+ */
+static int load_property(char *dir, char *prop, uint32_t **buf)
+{
+ int size;
+
+ size = get_property_size(dir, prop);
+ if (!size)
+ return -ENOENT;
+
+ *buf = zalloc(size);
+ if (!*buf) {
+ say(ERROR, "Could not allocate buffer read %s (%d bytes)\n",
+ prop, size);
+ return -ENOMEM;
+ }
+
+ if (get_property(dir, prop, *buf, size)) {
+ free(*buf);
+ say(ERROR, "Can't retrieve %s/%s\n", dir, prop);
+ return -EINVAL;
+ }
+
+ return size;
+}
+
+/*
+ * Get the minimal common depth, based on the form 1 of the ibm,associativ-
+ * ity-reference-points property. We only support that form.
+ *
+ * We should check that the "ibm,architecture-vec-5" property byte 5 bit 0
+ * has the value of one.
+ */
+static int get_min_common_depth(struct numa_topology *numa)
+{
+ int size;
+ uint32_t *p;
+ unsigned char val;
+
+ size = load_property(CHOSEN_DIRECTORY, ARCHITECTURE_VEC_5, &p);
+ if (size < 0)
+ return size;
+
+ /* PAPR byte start at 1 (and not 0) but there is the length field */
+ if (size < 6) {
+ report_unknown_error(__FILE__, __LINE__);
+ free(p);
+ return -EINVAL;
+ }
+ val = ((unsigned char *)p)[5];
+ free(p);
+
+ if (!(val & 0x80))
+ return -ENOTSUP;
+
+ size = load_property(RTAS_DIRECTORY, ASSOC_REF_POINTS, &p);
+ if (size <= 0)
+ return size;
+ if (size < sizeof(uint32_t)) {
+ report_unknown_error(__FILE__, __LINE__);
+ free(p);
+ return -EINVAL;
+ }
+
+ /* Get the first entry */
+ numa->min_common_depth = be32toh(*p);
+ free(p);
+ return 0;
+}
+
+/*
+ * Read the ibm,associativity-lookup-arrays property and record, for each
+ * array, the entry found at the min common depth in numa->aa.min_array.
+ *
+ * The first two cells of the property are the number of arrays and the size
+ * of each array; the arrays follow.
+ *
+ * Returns 0 on success, a negative errno value otherwise.
+ */
+static int get_assoc_arrays(struct numa_topology *numa)
+{
+ int size;
+ int rc;
+ uint32_t *prop, i;
+ struct assoc_arrays *aa = &numa->aa;
+
+ size = load_property(DYNAMIC_RECONFIG_MEM, ASSOC_LOOKUP_ARRAYS, &prop);
+ if (size < 0)
+ return size;
+
+ size /= sizeof(uint32_t);
+ if (size < 2) {
+ say(ERROR, "Could not find the associativity lookup arrays\n");
+ free(prop);
+ return -EINVAL;
+ }
+
+ aa->n_arrays = be32toh(prop[0]);
+ aa->array_sz = be32toh(prop[1]);
+
+ rc = -EINVAL;
+ if (numa->min_common_depth > aa->array_sz) {
+ say(ERROR, "Bad min common depth or associativity array size\n");
+ goto out_free;
+ }
+
+ /* Sanity check */
+ if (size != (aa->n_arrays * aa->array_sz + 2)) {
+ say(ERROR, "Bad size of the associativity lookup arrays\n");
+ goto out_free;
+ }
+
+ aa->min_array = zalloc(aa->n_arrays * sizeof(uint32_t));
+ /* An empty table is legitimate, only a failed allocation is an error */
+ if (aa->n_arrays && !aa->min_array) {
+ say(ERROR, "Could not allocate the associativity lookup table\n");
+ rc = -ENOMEM;
+ goto out_free;
+ }
+
+ /* Keep only the most significant value */
+ for (i = 0; i < aa->n_arrays; i++) {
+ /* +1: skip the 2 header cells, min_common_depth counts from 1 */
+ int prop_index = i * aa->array_sz + numa->min_common_depth + 1;
+
+ aa->min_array[i] = be32toh(prop[prop_index]);
+ }
+ rc = 0;
+
+out_free:
+ free(prop);
+ return rc;
+}
+
+/*
+ * Return the descriptor of node @nid, allocating and registering it in @numa
+ * the first time that node id is seen.
+ *
+ * Returns NULL if @nid is out of range or if the allocation failed.
+ */
+struct numa_node *numa_fetch_node(struct numa_topology *numa, int nid)
+{
+ struct numa_node *node;
+
+ /* nodes[] has MAX_NUMNODES entries: valid ids are 0..MAX_NUMNODES-1 */
+ if (nid < 0 || nid >= MAX_NUMNODES) {
+ report_unknown_error(__FILE__, __LINE__);
+ return NULL;
+ }
+
+ node = numa->nodes[nid];
+ if (node)
+ return node;
+
+ node = zalloc(sizeof(struct numa_node));
+ if (!node) {
+ say(ERROR, "Can't allocate a new node\n");
+ return NULL;
+ }
+
+ node->node_id = nid;
+
+ /* Track the id range so that iterations can skip unused slots */
+ if (!numa->node_count || nid < numa->node_min)
+ numa->node_min = nid;
+ if (nid > numa->node_max)
+ numa->node_max = nid;
+
+ numa->nodes[nid] = node;
+ numa->node_count++;
+
+ return node;
+}
+
+/*
+ * Read the number of CPU for each node using the libnuma to get the details
+ * from sysfs.
+ */
+static int read_numa_topology(struct numa_topology *numa)
+{
+ struct bitmask *cpus;
+ struct numa_node *node;
+ int rc, max_node, nid, i;
+
+ if (numa_available() < 0)
+ return -ENOENT;
+
+ max_node = numa_max_node();
+ if (max_node >= MAX_NUMNODES) {
+ say(ERROR, "Too many nodes %d (max:%d)\n",
+ max_node, MAX_NUMNODES);
+ return -EINVAL;
+ }
+
+ rc = 0;
+
+ /* In case of allocation error, the libnuma is calling exit() */
+ cpus = numa_allocate_cpumask();
+
+ for (nid = 0; nid <= max_node; nid++) {
+
+ if (!numa_bitmask_isbitset(numa_nodes_ptr, nid))
+ continue;
+
+ node = numa_fetch_node(numa, nid);
+ if (!node) {
+ rc = -ENOMEM;
+ break;
+ }
+
+ rc = numa_node_to_cpus(nid, cpus);
+ if (rc < 0)
+ break;
+
+ /* Count the CPUs in that node */
+ for (i = 0; i < cpus->size; i++)
+ if (numa_bitmask_isbitset(cpus, i))
+ node->n_cpus++;
+
+ numa->cpu_count += node->n_cpus;
+ }
+
+ numa_bitmask_free(cpus);
+
+ if (rc) {
+ numa_foreach_node(numa, nid, node)
+ node->n_cpus = 0;
+ numa->cpu_count = 0;
+ }
+
+ return rc;
+}
+
+int numa_get_topology(struct numa_topology *numa)
+{
+ int rc;
+
+ rc = get_min_common_depth(numa);
+ if (rc)
+ return rc;
+
+
+ rc = get_assoc_arrays(numa);
+ if (rc)
+ return rc;
+
+ rc = read_numa_topology(numa);
+ if (rc)
+ return rc;
+
+ if (!numa->node_count)
+ return -1;
+
+ return 0;
+}
diff --git a/src/drmgr/common_numa.h b/src/drmgr/common_numa.h
new file mode 100644
index 000000000000..4d0054926819
--- /dev/null
+++ b/src/drmgr/common_numa.h
@@ -0,0 +1,83 @@
+/**
+ * @file common_numa.h
+ *
+ * Copyright (C) IBM Corporation 2020
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#ifndef _NUMA_H_
+#define _NUMA_H_
+
+#define MAX_NUMNODES 256
+#define NUMA_NO_NODE -1
+
+struct numa_node {
+ int node_id;
+ unsigned int n_cpus;
+ unsigned int n_lmbs;
+ unsigned int ratio;
+ struct dr_node *lmbs; /* linked by lmb_numa_next */
+ struct numa_node *ratio_next;
+};
+
+struct assoc_arrays {
+ uint32_t n_arrays;
+ uint32_t array_sz;
+ uint32_t *min_array;
+};
+
+struct numa_topology {
+ unsigned int cpu_count;
+ unsigned int lmb_count;
+ unsigned int cpuless_node_count;
+ unsigned int cpuless_lmb_count;
+ unsigned int node_count, node_min, node_max;
+ struct numa_node *nodes[MAX_NUMNODES];
+ struct numa_node *ratio;
+ uint32_t min_common_depth;
+ struct assoc_arrays aa;
+};
+
+int numa_get_topology(struct numa_topology *numa);
+struct numa_node *numa_fetch_node(struct numa_topology *numa, int node_id);
+
+static inline int numa_aa_index_to_node(struct numa_topology *numa,
+ uint32_t aa_index)
+{
+ if (aa_index < numa->aa.n_arrays)
+ return numa->aa.min_array[aa_index];
+ return NUMA_NO_NODE;
+}
+
+static inline int next_node(struct numa_topology *numa, int nid,
+ struct numa_node **node)
+{
+ for (nid++; nid <= numa->node_max; nid++)
+ if (numa->nodes[nid]) {
+ *node = numa->nodes[nid];
+ break;
+ }
+ return nid;
+}
+
+/*
+ * Iterate over the allocated nodes in ascending node id order.
+ * Nothing is visited while no node has been registered: in that case
+ * nodes[node_min] is NULL and must not be handed to the loop body.
+ */
+#define numa_foreach_node(numa, nid, node) \
+ for (nid = (numa)->node_min, node = (numa)->nodes[nid]; \
+ (numa)->node_count && nid <= (numa)->node_max; \
+ nid = next_node(numa, nid, &(node)))
+
+#define numa_foreach_node_by_ratio(numa, node) \
+ for (node = (numa)->ratio; node; node = node->ratio_next)
+
+#endif /* _NUMA_H_ */
--
2.29.2

View File

@ -0,0 +1,443 @@
From 3c549c7494e729a68b64ac5519bcf1506b24f945 Mon Sep 17 00:00:00 2001
From: Laurent Dufour <ldufour@linux.ibm.com>
Date: Wed, 25 Nov 2020 18:03:45 +0100
Subject: [PATCH 3/3] drmgr: introduce NUMA based LMB removal
When the NUMA topology can be read, all the LMBs found in the Device Tree
are linked to the corresponding node. LMBs not associated with a node are
considered as not used.
LMBs associated with CPU-less nodes are accounted separately because they
will be targeted first for removal. The LMBs are removed from the CPU-less
nodes to reach an average number of LMBs per CPU-less node.
Nodes with CPUs have a ratio indexed on their number of CPUs. The more CPUs
a node has, the fewer LMBs will be removed from it. This way nodes with a
high number of CPUs will get a higher amount of memory.
When a LMB can't be removed (because its memory can't be offlined by the
kernel), the LMB count for node is decremented and the LMB is removed from
the node's LMB list. This way, it is no more accounted as 'active' and the
removal operation will continue without taking it in account anymore.
The removal is done through the remove by DRC index API, allowing to remove
a LMB at a time. One future optimization would be to extend that API to
remove a linear range of LMB each time.
If the NUMA topology can't be read, we fallback using the legacy remove
way.
Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com>
---
src/drmgr/drslot_chrp_mem.c | 335 +++++++++++++++++++++++++++++++++++-
src/drmgr/ofdt.h | 2 +
2 files changed, 336 insertions(+), 1 deletion(-)
diff --git a/src/drmgr/drslot_chrp_mem.c b/src/drmgr/drslot_chrp_mem.c
index 502aa3e9fff0..47d9f7b8ed90 100644
--- a/src/drmgr/drslot_chrp_mem.c
+++ b/src/drmgr/drslot_chrp_mem.c
@@ -31,12 +31,16 @@
#include "dr.h"
#include "ofdt.h"
#include "drmem.h"
+#include "common_numa.h"
static int block_sz_bytes = 0;
static char *state_strs[] = {"offline", "online"};
static char *usagestr = "-c mem {-a | -r} {-q <quantity> -p {variable_weight | ent_capacity} | {-q <quantity> | -s [<drc_name> | <drc_index>]}}";
+static struct numa_topology numa;
+static int numa_enabled = 0;
+
/**
* mem_usage
* @brief return usage string
@@ -306,6 +310,31 @@ get_mem_node_lmbs(struct lmb_list_head *lmb_list)
return rc;
}
+static int link_lmb_to_numa_node(struct dr_node *lmb)
+{
+ int nid;
+ struct numa_node *node;
+
+ nid = numa_aa_index_to_node(&numa, lmb->lmb_aa_index);
+ if (nid == NUMA_NO_NODE)
+ return 0;
+
+ node = numa_fetch_node(&numa, nid);
+ if (!node)
+ return -ENOMEM;
+
+ lmb->lmb_numa_next = node->lmbs;
+ node->lmbs = lmb;
+ node->n_lmbs++;
+
+ if (node->n_cpus)
+ numa.lmb_count++;
+ else
+ numa.cpuless_lmb_count++;
+
+ return 0;
+}
+
int add_lmb(struct lmb_list_head *lmb_list, uint32_t drc_index,
uint64_t address, uint64_t lmb_sz, uint32_t aa_index,
uint32_t flags)
@@ -324,6 +353,9 @@ int add_lmb(struct lmb_list_head *lmb_list, uint32_t drc_index,
lmb->lmb_address = address;
lmb->lmb_aa_index = aa_index;
+ if (numa_enabled && link_lmb_to_numa_node(lmb))
+ return -ENOMEM;
+
if (flags & DRMEM_ASSIGNED) {
int rc;
@@ -490,7 +522,7 @@ get_dynamic_reconfig_lmbs(struct lmb_list_head *lmb_list)
if (stat(DYNAMIC_RECONFIG_MEM_V1, &sbuf) == 0) {
rc = get_dynamic_reconfig_lmbs_v1(lmb_sz, lmb_list);
- } else if (is_lsslot_cmd &&
+ } else if ((is_lsslot_cmd || numa_enabled) &&
stat(DYNAMIC_RECONFIG_MEM_V2, &sbuf) == 0) {
rc = get_dynamic_reconfig_lmbs_v2(lmb_sz, lmb_list);
} else {
@@ -1424,11 +1456,312 @@ int valid_mem_options(void)
return 0;
}
+static int remove_lmb_by_index(uint32_t drc_index)
+{
+ char cmdbuf[128];
+ int offset;
+
+ offset = sprintf(cmdbuf, "memory remove index 0x%x", drc_index);
+
+ return __do_kernel_dlpar(cmdbuf, offset, 1 /* Don't report error */);
+}
+
+/*
+ * Try to remove up to @count LMBs from @node, walking its LMB list from the
+ * head. Every LMB tried, removed or not, is dropped from the node's list
+ * and from the node and global counters so it is not tried again.
+ *
+ * Returns the number of LMBs actually removed.
+ */
+static int remove_lmb_from_node(struct numa_node *node, uint32_t count)
+{
+ struct dr_node *lmb;
+ int err, done = 0, unlinked = 0;
+
+ say(DEBUG, "Try removing %d / %d LMBs from node %d\n",
+ count, node->n_lmbs, node->node_id);
+
+ for (lmb = node->lmbs; lmb && done < count; lmb = lmb->lmb_numa_next) {
+ unlinked ++;
+ err = remove_lmb_by_index(lmb->drc_index);
+ if (err)
+ say(WARN,"Can't remove LMB node:%d index:0x%x: %s\n",
+ node->node_id, lmb->drc_index, strerror(-err));
+ else
+ done++;
+ }
+
+ /*
+ * Decrement the node LMB's count since whatever is the success
+ * of the removal operation, it will not be tried again on that
+ * LMB.
+ */
+ node->n_lmbs -= unlinked;
+
+ /*
+ * Update the node's list of LMB to not process the one we removed or
+ * tried to removed again.
+ */
+ node->lmbs = lmb;
+
+ /*
+ * Update numa's LMB counters. The CPU less case must adjust the LMB
+ * counter, the node counter is handled below once the node is empty.
+ */
+ if (node->n_cpus)
+ numa.lmb_count -= unlinked;
+ else
+ numa.cpuless_lmb_count -= unlinked;
+
+ if (!node->n_lmbs) {
+ node->ratio = 0; /* for sanity only */
+ if (node->n_cpus)
+ numa.cpu_count -= node->n_cpus;
+ else
+ numa.cpuless_node_count--;
+ }
+
+ say(INFO, "Removed %d LMBs from node %d\n", done, node->node_id);
+ return done;
+}
+
+/* Arguments are parenthesized so that expressions can be passed safely. */
+#define min(a,b) (((a) < (b)) ? (a) : (b))
+
+static void update_cpuless_node_ratio(void)
+{
+ struct numa_node *node;
+ int nid;
+
+ /*
+ * Assumptions:
+ * 1. numa->cpuless_node_count is up to date
+ * 2. numa->cpuless_lmb_count is up to date
+ * Nodes with no memory and nodes with CPUs are ignored here.
+ */
+ numa_foreach_node(&numa, nid, node) {
+ if (node->n_cpus ||!node->n_lmbs)
+ continue;
+ node->ratio = (node->n_lmbs * 100) / numa.cpuless_lmb_count;
+ }
+}
+
+/*
+ * Remove LMBs from node without CPUs only.
+ * The more the node has LMBs, the more LMBs will be removed from it.
+ *
+ * We have to retry the operation multiple times because some LMB cannot be
+ * removed due to the page usage in the kernel. In that case, that LMB is no
+ * more taken in account and the node's LMB count is decremented, assuming that
+ * LMB is unremovable at this time. Thus each node's ratio has to be computed on
+ * each iteration. This is not a big deal, usually, there are not so much nodes.
+ */
+static int remove_cpuless_lmbs(uint32_t count)
+{
+ struct numa_node *node;
+ int nid;
+ uint32_t total = count, todo, done = 0, this_loop;
+
+ while (count) {
+ count = min(count, numa.cpuless_lmb_count);
+ if (!count)
+ break;
+
+ update_cpuless_node_ratio();
+
+ this_loop = 0;
+ numa_foreach_node(&numa, nid, node) {
+ if (!node->n_lmbs || node->n_cpus)
+ continue;
+
+ todo = (count * node->ratio) / 100;
+ todo = min(todo, node->n_lmbs);
+ /* Fix rounded value to 0 */
+ if (!todo && node->n_lmbs)
+ todo = (count - this_loop);
+
+ if (todo)
+ todo = remove_lmb_from_node(node, todo);
+
+ this_loop += todo;
+ done += todo;
+ if (done >= total)
+ break;
+ }
+
+ /* Don't continue if we didn't make any progress. */
+ if (!this_loop)
+ break;
+
+ count -= this_loop;
+ }
+
+ say(DEBUG, "%d / %d LMBs removed from the CPU less nodes\n",
+ done, total);
+ return done;
+}
+
+/*
+ * Recompute the ratio of each node having both CPUs and LMBs, then rebuild
+ * the numa.ratio list so that those nodes are linked by ascending ratio.
+ */
+static void update_node_ratio(void)
+{
+ int nid;
+ struct numa_node *node, *n, **p;
+ uint32_t cpu_ratio, mem_ratio;
+
+ /*
+ * Assumptions:
+ * 1. numa->cpu_count is up to date
+ * 2. numa->lmb_count is up to date
+ * Nodes with no memory and nodes with no CPU are ignored here.
+ */
+
+ numa.ratio = NULL;
+ numa_foreach_node(&numa, nid, node) {
+ if (!node->n_lmbs || !node->n_cpus)
+ continue;
+ /* Integer percentages of the global CPU and LMB counts */
+ cpu_ratio = (node->n_cpus * 100) / numa.cpu_count;
+ mem_ratio = (node->n_lmbs * 100) / numa.lmb_count;
+
+ /* Say that CPU ratio is 90% of the ratio */
+ node->ratio = (cpu_ratio * 9 + mem_ratio) / 10;
+ }
+
+ /* Create an ordered link of the nodes */
+ numa_foreach_node(&numa, nid, node) {
+ if (!node->n_lmbs || !node->n_cpus)
+ continue;
+
+ /*
+ * Sorted insertion: skip the entries having a lower ratio, p
+ * tracks the link to update and n the entry to insert before.
+ */
+ p = &numa.ratio;
+ for (n = numa.ratio;
+ n && n->ratio < node->ratio; n = n->ratio_next)
+ p = &n->ratio_next;
+ *p = node;
+ node->ratio_next = n;
+ }
+}
+
+/*
+ * Remove LMBs from node with CPUs.
+ *
+ * The less a node has CPU, the more memory will be removed from it.
+ *
+ * As for the CPU less nodes, we must iterate because some LMBs may not be
+ * removable at this time.
+ */
+static int remove_cpu_lmbs(uint32_t count)
+{
+ struct numa_node *node;
+ uint32_t total = count, todo, done = 0, this_loop;
+ uint32_t new_lmb_count;
+
+ while(count) {
+ count = min(count, numa.lmb_count);
+ if (!count)
+ break;
+
+ update_node_ratio();
+
+ new_lmb_count = numa.lmb_count - count;
+
+ this_loop = 0;
+ numa_foreach_node_by_ratio(&numa, node) {
+ if (!node->n_lmbs || !node->n_cpus)
+ continue;
+
+ todo = (new_lmb_count * node->ratio) / 100;
+ todo = node->n_lmbs - min(todo, node->n_lmbs);
+ todo = min(count, todo);
+
+ if (todo) {
+ todo = remove_lmb_from_node(node, todo);
+ count -= todo;
+ this_loop += todo;
+ }
+
+ if (!count)
+ break;
+ }
+
+ /* Don't continue if we didn't make any progress. */
+ if (!this_loop)
+ break;
+ done += this_loop;
+ }
+
+ say(DEBUG, "%d / %d LMBs removed from the CPU nodes\n",
+ done, total);
+ return done;
+}
+
+static void build_numa_topology(void)
+{
+ int rc;
+
+ rc = numa_get_topology(&numa);
+ if (rc)
+ return;
+
+ numa_enabled = 1;
+}
+
+static void clear_numa_lmb_links(void)
+{
+ int nid;
+ struct numa_node *node;
+
+ numa_foreach_node(&numa, nid, node)
+ node->lmbs = NULL;
+}
+
+static int numa_based_remove(uint32_t count)
+{
+ struct lmb_list_head *lmb_list;
+ struct numa_node *node;
+ int nid;
+ uint32_t done = 0;
+
+ /*
+ * Read the LMBs
+ * Link the LMBs to their node
+ * Update global counter
+ */
+ lmb_list = get_lmbs(LMB_NORMAL_SORT);
+ if (lmb_list == NULL) {
+ clear_numa_lmb_links();
+ return -1;
+ }
+
+ if (!numa.node_count) {
+ clear_numa_lmb_links();
+ free_lmbs(lmb_list);
+ return -EINVAL;
+ }
+
+ numa_foreach_node(&numa, nid, node) {
+ say(INFO, "node %4d %4d CPUs %8d LMBs\n",
+ nid, node->n_cpus, node->n_lmbs);
+ }
+
+ done += remove_cpuless_lmbs(count);
+ count -= done;
+
+ done += remove_cpu_lmbs(count);
+
+ report_resource_count(done);
+
+ clear_numa_lmb_links();
+ free_lmbs(lmb_list);
+ return 0;
+}
+
int do_mem_kernel_dlpar(void)
{
char cmdbuf[128];
int rc, offset;
+
+ if (usr_action == REMOVE && usr_drc_count) {
+ build_numa_topology();
+ if (numa_enabled) {
+ if (!numa_based_remove(usr_drc_count))
+ return 0;
+
+ /*
+ * If the NUMA based removal failed, lets try the legacy
+ * way.
+ */
+ say(WARN, "Can't do NUMA based removal operation.\n");
+ }
+ }
+
offset = sprintf(cmdbuf, "%s ", "memory");
switch (usr_action) {
diff --git a/src/drmgr/ofdt.h b/src/drmgr/ofdt.h
index 3850a77229b4..3c2840b2e0ee 100644
--- a/src/drmgr/ofdt.h
+++ b/src/drmgr/ofdt.h
@@ -92,6 +92,7 @@ struct dr_node {
uint32_t _lmb_aa_index;
struct mem_scn *_mem_scns;
struct of_node *_of_node;
+ struct dr_node *_numa_next;
} _smem;
#define lmb_address _node_u._smem._address
@@ -99,6 +100,7 @@ struct dr_node {
#define lmb_aa_index _node_u._smem._lmb_aa_index
#define lmb_mem_scns _node_u._smem._mem_scns
#define lmb_of_node _node_u._smem._of_node
+#define lmb_numa_next _node_u._smem._numa_next
struct hea_info {
uint _port_no;
--
2.29.2

View File

@ -0,0 +1,33 @@
commit 0b59d4a372aa266caa75f3b6a253b8f5aeaf3802
Author: Mingming Cao <mmc@linux.vnet.ibm.com>
Date: Mon Mar 1 19:34:29 2021 -0800
hcnmgr: Avoid cleanup of bond interface at boot time when no HNV exists
At boot time, hcn scans the device tree and discovers if there was a new
HNV being added while lpar was inactive. It also cleans up the old hnv
interfaces. This patch avoids cleaning up bonding interface when no HNV
network devices exists.
Signed-off-by: Mingming Cao <mmc@linux.vnet.ibm.com>
[tyreld: fixup commit log]
Signed-off-by: Tyrel Datwyler <tyreld@linux.ibm.com>
diff --git a/scripts/hcnmgr b/scripts/hcnmgr
index a76505e..c95edba 100644
--- a/scripts/hcnmgr
+++ b/scripts/hcnmgr
@@ -575,7 +575,13 @@ scanhcn() {
done
fi
+ # ${#HcnIds[@]} is the number of HNV ids found, nothing to clean up if none
+ if [ ${#HcnIds[@]} -eq 0 ]; then
+ hcnlog DEBUG "scanhcn: scan for hybrid virtual network finished"
+ return $E_SUCCESS
+ fi
+
# Next clean up dead connections left from orgitinal LPAR after inactive miration
+ # Only do this when the HNV ID array is not empty
# list of all HCN ids
ids="${HcnIds[*]}"

View File

@ -0,0 +1,32 @@
commit 1cb8bd89d6386c60e75c47d4a4452d3f130d5138
Author: Mingming Cao <mmc@linux.vnet.ibm.com>
Date: Fri Mar 12 14:18:18 2021 -0800
hcnmgr: Avoid using xargs to process NM show connections
When removing HNV bonding connections xargs can fail to process the output of
nmcli show properly.
Instead of piping into xargs fix this by using a loop to check for all related
bonding connections and remove them explicitly one by one.
Signed-off-by: Mingming Cao <mmc@linux.vnet.ibm.com>
[tyreld: fixed up commit log]
Signed-off-by: Tyrel Datwyler <tyreld@linux.ibm.com>
diff --git a/scripts/hcnmgr b/scripts/hcnmgr
index d66b5d1..30d31e7 100644
--- a/scripts/hcnmgr
+++ b/scripts/hcnmgr
@@ -377,7 +377,10 @@ rmhcn() {
fi
hcnlog INFO "rmhcn: delete bond $BONDNAME and slaves "
- nmcli -f NAME con show | grep "$BONDNAME" | xargs sudo nmcli con delete
+ for connection in $(nmcli -f NAME con show | grep "$BONDNAME"); do
+ hcnlog INFO "Delete bonding connection $connection"
+ nmcli con delete "$connection"
+ done
hcnlog DEBUG "rmhcn: exit"
return $E_SUCCESS
}

View File

@ -0,0 +1,26 @@
commit 366e17553ed647613668678c2d301d369038f41b
Author: Brahadambal Srinivasan <latha@linux.vnet.ibm.com>
Date: Thu Nov 12 19:00:47 2020 +0530
Update ppc64-cpu usage
'ppc64_cpu --help' doesn't list '--version' as an option. This patch
adds the option in the usage information of ppc64-cpu command.
Signed-off-by: Brahadambal Srinivasan <latha@linux.vnet.ibm.com>
Signed-off-by: Tyrel Datwyler <tyreld@linux.ibm.com>
diff --git a/src/ppc64_cpu.c b/src/ppc64_cpu.c
index 71f4720..2b0f66c 100644
--- a/src/ppc64_cpu.c
+++ b/src/ppc64_cpu.c
@@ -1195,7 +1195,8 @@ static void usage(void)
"ppc64_cpu --subcores-per-core # Get number of subcores per core\n"
"ppc64_cpu --subcores-per-core=X # Set subcores per core to X (1 or 4)\n"
"ppc64_cpu --threads-per-core # Get threads per core\n"
-"ppc64_cpu --info # Display system state information)\n");
+"ppc64_cpu --info # Display system state information\n"
+"ppc64_cpu --version # Display version of ppc64-cpu\n");
}
struct option longopts[] = {

View File

@ -0,0 +1,30 @@
commit d9bcb21179ccfea122f326aca4690afe0f7de0c6
Author: Mingming Cao <mmc@linux.vnet.ibm.com>
Date: Mon Mar 1 21:34:34 2021 -0800
hcnmgr: Wait for sysfs device ready when looking up device name
At the time of calling ofpathname to look up for devicename, wait
for sysfs device ready. Otherwise, the OS may be in the middle of device
renaming.
Signed-off-by: Mingming Cao <mmc@linux.vnet.ibm.com>
[tyreld: fixed up commit log]
Signed-off-by: Tyrel Datwyler <tyreld@linux.ibm.com>
diff --git a/scripts/hcnmgr b/scripts/hcnmgr
index c95edba..0d20e7d 100644
--- a/scripts/hcnmgr
+++ b/scripts/hcnmgr
@@ -241,7 +241,10 @@ get_dev_hcn() {
# Let's retry a few times.
while [ $wait != 0 ]; do
if DEVNAME=$(ofpathname -l "$(echo "$1" | sed -e "s/\/proc\/device-tree//")" 2>/dev/null); then
- break
+ if [ -e /sys/class/net/"$DEVNAME" ]; then
+ hcnlog DEBUG "ofpathname waiting for /sys/class/net device $DEVNAME ready"
+ break
+ fi
fi
hcnlog DEBUG "ofpathname return $?, devname is $DEVNAME rety counter $wait"

View File

@ -0,0 +1,30 @@
commit e25d71be411b610e5e889f8efaaf04b38c2d9ecb
Author: Mingming Cao <mmc@linux.vnet.ibm.com>
Date: Fri Mar 12 13:50:33 2021 -0800
hcnmgr: Avoid using ifcfg file for checking bonding interface status
When configuring migratable sr_iov into hybrid network, it checks if
there is an existing HNV using the presence of ifcfg file location. This
is not preferred as the location can be different on distros.
This patch fixes this by using NetworkManager nmcli.
Signed-off-by: Mingming Cao <mmc@linux.vnet.ibm.com>
[tyreld: fixed spelling]
Signed-off-by: Tyrel Datwyler <tyreld@linux.ibm.com>
diff --git a/scripts/hcnmgr b/scripts/hcnmgr
index 0d20e7d..d66b5d1 100644
--- a/scripts/hcnmgr
+++ b/scripts/hcnmgr
@@ -282,8 +282,7 @@ do_config_vdevice() {
hcnlog DEBUG "Check if there is bond $BONDNAME with hcn id $HCNID"
- hcnlog DEBUG "ifconfig file $IFCONFIG_PATH/ifconfig-$BONDNAME"
- if [ ! -e "$IFCONFIG_PATH/ifcfg-$BONDNAME" ]; then
+ if ! nmcli -f NAME con show --active | grep -q "$BONDNAME\s"; then
hcnlog INFO "nmcli con add type bond con-name $BONDNAME ifname $BONDNAME"
nmcli con add type bond con-name "$BONDNAME" ifname "$BONDNAME"

View File

@ -0,0 +1,132 @@
commit 97269d301797e23b75d0c7a5cb63ce280783f615
Author: Laurent Dufour <ldufour@linux.ibm.com>
Date: Thu Mar 4 14:51:38 2021 +0100
lparstat: add -x option for the security flavor
This allows user to get the security flavor settings for the LPAR.
The output is :
$ lparstat -x
Speculative Execution Mode : 1
Where the output number means
0 = Speculative execution fully enabled
1 = Speculative execution controls to mitigate user-to-kernel side-channel
attacks
2 = Speculative execution controls to mitigate user-to-kernel and
user-to-user side-channel attacks
In the case the running kernel is not exposing the security flavor in
/proc/powerpc/lparcfg, the output is:
$ lparstat -x
Speculative Execution Mode : -
Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com>
Signed-off-by: Tyrel Datwyler <tyreld@linux.ibm.com>
diff --git a/src/lparstat.c b/src/lparstat.c
index 23e4b85..00922c4 100644
--- a/src/lparstat.c
+++ b/src/lparstat.c
@@ -42,6 +42,7 @@
static bool o_legacy = false;
static bool o_scaled = false;
+static bool o_security = false;
static int threads_per_cpu;
static int cpus_in_system;
@@ -1152,6 +1153,15 @@ void print_scaled_output(int interval, int count)
} while (--count > 0);
}
+static void print_security_flavor(void)
+{
+ char value[64];
+ char *descr;
+
+ get_sysdata("security_flavor", &descr, value);
+ fprintf(stdout, "%-45s: %s\n", descr, value);
+}
+
static void usage(void)
{
printf("Usage: lparstat [ options ]\n\tlparstat <interval> [ count ]\n\n"
@@ -1159,6 +1169,7 @@ static void usage(void)
"\t-h, --help Show this message and exit.\n"
"\t-V, --version \tDisplay lparstat version information.\n"
"\t-i Lists details on the LPAR configuration.\n"
+ "\t-x Print the security mode settings for the LPAR.\n"
"\t-E Print SPURR metrics.\n"
"\t-l, --legacy Print the report in legacy format.\n"
"interval The interval parameter specifies the amount of time between each report.\n"
@@ -1184,7 +1195,7 @@ int main(int argc, char *argv[])
exit(1);
}
- while ((c = getopt_long(argc, argv, "iEVhl",
+ while ((c = getopt_long(argc, argv, "iEVhlx",
long_opts, &opt_index)) != -1) {
switch(c) {
case 'i':
@@ -1199,6 +1210,9 @@ int main(int argc, char *argv[])
case 'V':
printf("lparstat - %s\n", VERSION);
return 0;
+ case 'x':
+ o_security = true;
+ break;
case 'h':
usage();
return 0;
@@ -1223,6 +1237,8 @@ int main(int argc, char *argv[])
if (i_option)
print_iflag_data();
+ else if (o_security)
+ print_security_flavor();
else if (o_scaled) {
print_scaled_output(interval, count);
close_cpu_sysfs_fds(threads_in_system);
diff --git a/src/lparstat.h b/src/lparstat.h
index 9b7117f..26ed4ba 100644
--- a/src/lparstat.h
+++ b/src/lparstat.h
@@ -302,6 +302,10 @@ struct sysentry system_data[] = {
.descr = "Idle CPU value - SPURR",
.get = &get_cpu_idle_spurr},
+ /* Security flavor */
+ {.name = "security_flavor",
+ .descr = "Speculative Execution Mode"},
+
{.name[0] = '\0'},
};
diff -up powerpc-utils-1.3.8/man/lparstat.8.me powerpc-utils-1.3.8/man/lparstat.8
--- powerpc-utils-1.3.8/man/lparstat.8.me 2021-04-20 15:49:18.305532697 +0200
+++ powerpc-utils-1.3.8/man/lparstat.8 2021-04-20 15:52:04.703021972 +0200
@@ -209,6 +209,20 @@ The variable memory capacity weight of t
.TP
.SH
.TP
+\fB\-x\fR
+Display the LPAR security flavor mode
+.RS
+.TP
+.B 0
+Speculative execution fully enabled
+.TP
+.B 1
+Speculative execution controls to mitigate user-to-kernel side-channel attacks
+.TP
+.B 2
+Speculative execution controls to mitigate user-to-kernel and user-to-user side-channel attacks
+.RE
+.TP
\fB\-E\fR
Display Scaled Processor Utilization Resource Register(SPURR) based CPU utilization.
.RS