Resolves: #1873868, rebase to 1.3.9

This commit is contained in:
Than Ngo 2021-07-19 15:41:23 +02:00
parent 0ec56b561c
commit d325b1329b
23 changed files with 29 additions and 1855 deletions

1
.gitignore vendored
View File

@ -22,3 +22,4 @@ powerpc-utils-1.2.2.tar.gz
/powerpc-utils-1.3.6.tar.gz
/powerpc-utils-1.3.7.tar.gz
/powerpc-utils-1.3.8.tar.gz
/powerpc-utils-1.3.9.tar.gz

View File

@ -1,87 +0,0 @@
From 014e8ba4580c7917e258df084776c16079dc07ce Mon Sep 17 00:00:00 2001
From: Laurent Dufour <ldufour@linux.ibm.com>
Date: Tue, 24 Nov 2020 19:28:48 +0100
Subject: [PATCH 1/3] drmgr: don't open sysfs file for each command
The new __do_kernel_dlpar() API will be used in a later commit to remove
LMBs one at a time by DRC index. This avoids opening and closing the fd each
time.
The fd closing will now be done at the process exit time.
In addition, add an optional parameter to silently ignore some errors.
Also, change the log level of the "success" message to debug to match
the previous one saying "Trying.."
Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com>
---
src/drmgr/common.c | 22 +++++++++++++---------
src/drmgr/dr.h | 3 ++-
2 files changed, 15 insertions(+), 10 deletions(-)
diff --git a/src/drmgr/common.c b/src/drmgr/common.c
index 5e8135bcf77e..25d244cb2f57 100644
--- a/src/drmgr/common.c
+++ b/src/drmgr/common.c
@@ -1469,32 +1469,36 @@ int kernel_dlpar_exists(void)
* @param cmd command string to write to sysfs
* @returns 0 on success, !0 otherwise
*/
-int do_kernel_dlpar(const char *cmd, int cmdlen)
+int __do_kernel_dlpar(const char *cmd, int cmdlen, int silent_error)
{
- int fd, rc;
+ static int fd = -1;
+ int rc;
int my_errno;
say(DEBUG, "Initiating kernel DLPAR \"%s\"\n", cmd);
/* write to file */
- fd = open(SYSFS_DLPAR_FILE, O_WRONLY);
- if (fd <= 0) {
- say(ERROR, "Could not open %s to initiate DLPAR request\n",
- SYSFS_DLPAR_FILE);
- return -1;
+ if (fd == -1) {
+ fd = open(SYSFS_DLPAR_FILE, O_WRONLY);
+ if (fd <= 0) {
+ say(ERROR, "Could not open %s to initiate DLPAR request\n",
+ SYSFS_DLPAR_FILE);
+ return -1;
+ }
}
rc = write(fd, cmd, cmdlen);
my_errno = errno;
- close(fd);
if (rc <= 0) {
+ if (silent_error)
+ return (my_errno == 0) ? -1 : -my_errno;
/* write does not set errno for rc == 0 */
say(ERROR, "Failed to write to %s: %s\n", SYSFS_DLPAR_FILE,
(rc == 0) ? "wrote 0 bytes" : strerror(my_errno));
return -1;
}
- say(INFO, "Success\n");
+ say(DEBUG, "Success\n");
return 0;
}
diff --git a/src/drmgr/dr.h b/src/drmgr/dr.h
index f171bfea73c3..00d2fffc9919 100644
--- a/src/drmgr/dr.h
+++ b/src/drmgr/dr.h
@@ -172,5 +172,6 @@ enum drc_type to_drc_type(const char *);
int handle_prrn(void);
int kernel_dlpar_exists(void);
-int do_kernel_dlpar(const char *, int);
+int __do_kernel_dlpar(const char *, int, int);
+#define do_kernel_dlpar(c, l) __do_kernel_dlpar(c, l, 0)
#endif
--
2.29.2

View File

@ -1,40 +0,0 @@
From 16469b696959aee4ce32d9f77483e1e3f192e82d Mon Sep 17 00:00:00 2001
From: Laurent Dufour <ldufour@linux.ibm.com>
Date: Fri, 16 Apr 2021 18:10:36 +0200
Subject: [PATCH] drmgr: fix remove by index operation
The commit e9f06531356f ("drmgr: introduce NUMA based LMB removal")
introduced special processing when NUMA is on and a remove by count
operation is done.
Unfortunately, that code is also triggered when doing a remove by index
operation (-s argument) because usr_drc_count is set to 1. As a
consequence, the index constraint is not respected and any LMB can be
removed.
Add a check against usr_drc_index, which is set when a remove by index
operation is done, to ensure the NUMA removal code is not triggered in that
case.
Fixes: e9f06531356f ("drmgr: introduce NUMA based LMB removal")
Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com>
---
src/drmgr/drslot_chrp_mem.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/drmgr/drslot_chrp_mem.c b/src/drmgr/drslot_chrp_mem.c
index f17c94adc270..8db98bb9e9ea 100644
--- a/src/drmgr/drslot_chrp_mem.c
+++ b/src/drmgr/drslot_chrp_mem.c
@@ -1749,7 +1749,7 @@ int do_mem_kernel_dlpar(void)
int rc, offset;
- if (usr_action == REMOVE && usr_drc_count) {
+ if (usr_action == REMOVE && usr_drc_count && !usr_drc_index) {
build_numa_topology();
if (numa_enabled) {
if (!numa_based_remove(usr_drc_count))
--
2.31.1

View File

@ -1,78 +0,0 @@
From 21c860888425762468d339950518ab8b0940ecea Mon Sep 17 00:00:00 2001
From: Tyrel Datwyler <tyreld@linux.ibm.com>
Date: Mon, 5 Oct 2020 13:03:45 -0700
Subject: [PATCH] ofpathname: Use NVMe controller physical nsid
Linux creates logical block devices of the form nvmeXnYpZ such that X = the
controller, Y = namespace, and Z = partition. For example:
/dev/nvme0n1p1
The Linux numbering scheme for namespaces always starts at 1 and
increases monotonically regardless of the actual numbering scheme of the
namespaces as seen by the physical NVMe controller. Accordingly, the Open
Firmware path binding utilizes the namespace id as seen by the controller and
not necessarily the one given in the logical block device name.
As such, we need to use the "nsid" attribute in the sysfs entry for the logical
device to properly map back and forth from OF pathnames.
Signed-off-by: Tyrel Datwyler <tyreld@linux.ibm.com>
---
scripts/ofpathname | 24 ++++++++++++++++--------
1 file changed, 16 insertions(+), 8 deletions(-)
diff --git a/scripts/ofpathname b/scripts/ofpathname
index 2ceae25..41b8c5c 100755
--- a/scripts/ofpathname
+++ b/scripts/ofpathname
@@ -659,6 +659,11 @@ l2of_nvme()
devspec=`$CAT $PWD/device/devspec | tr -d '\000'`
if [[ -n $devspec ]]; then
found=1
+ if [[ -n $devnsid ]]; then
+ # Linux logical nsid might not match nvme controller nsid
+ goto_dir $dir "nsid"
+ devnsid=`$CAT $PWD/nsid | tr -d '\000'`
+ fi
break
fi
done
@@ -1609,16 +1614,9 @@ of2l_nvme()
local dir
local link
- for dir in `$FIND /sys/block -name "nvme*n$nsid"`; do
+ for dir in `$FIND /sys/class/nvme -name "nvme[0-9]*"`; do
cd $dir
- link=`get_link "device"` # points to nvme[0-9]+ (non-namespace)
- if [[ -n $link ]]; then
- cd $link
- else
- continue
- fi
-
link=`get_link "device"` # points to pci address dir
if [[ -n $link ]]; then
cd $link
@@ -1635,6 +1633,16 @@ of2l_nvme()
fi
done
+ for dir in `$FIND /sys/block -name "${LOGICAL_DEVNAME}n[0-9]*"`; do
+ cd $dir
+
+ local devnsid=`$CAT ./nsid 2>/dev/null`
+ if [[ $devnsid = $nsid ]]; then
+ LOGICAL_DEVNAME="${dir##*/}"
+ break
+ fi
+ done
+
if [[ -n $LOGICAL_DEVNAME ]] \
&& [[ -n $part ]]; then
--
1.8.3.1

View File

@ -1,438 +0,0 @@
From 88caa91a4c8f0ac2376da433f697bc6845595dac Mon Sep 17 00:00:00 2001
From: Laurent Dufour <ldufour@linux.ibm.com>
Date: Wed, 2 Dec 2020 16:10:57 +0100
Subject: [PATCH 2/3] drmgr: read the CPU NUMA topology
This will be used in the next commit to compute LMB removal based on the
NUMA topology.
The NUMA topology is read using the libnuma, so a dependency against it is
added in the configure file.
Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com>
---
Makefile.am | 5 +-
configure.ac | 4 +
src/drmgr/common_numa.c | 268 ++++++++++++++++++++++++++++++++++++++++
src/drmgr/common_numa.h | 83 +++++++++++++
4 files changed, 359 insertions(+), 1 deletion(-)
create mode 100644 src/drmgr/common_numa.c
create mode 100644 src/drmgr/common_numa.h
diff --git a/Makefile.am b/Makefile.am
index 2ff2232537df..31baaa74b353 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -155,6 +155,7 @@ src_drmgr_drmgr_SOURCES = \
src/drmgr/common_cpu.c \
src/drmgr/common_ofdt.c \
src/drmgr/common_pci.c \
+ src/drmgr/common_numa.c \
src/drmgr/drmgr.c \
src/drmgr/drmig_chrp_pmig.c \
src/drmgr/drslot_chrp_cpu.c \
@@ -171,13 +172,14 @@ noinst_HEADERS += \
src/drmgr/drcpu.h \
src/drmgr/dr.h \
src/drmgr/drmem.h \
+	src/drmgr/common_numa.h \
src/drmgr/drpci.h \
src/drmgr/rtas_calls.h \
src/drmgr/ofdt.h \
src/drmgr/rtas_calls.h \
src/drmgr/options.c
-src_drmgr_drmgr_LDADD = -lrtas
+src_drmgr_drmgr_LDADD = -lrtas -lnuma
src_drmgr_lsslot_SOURCES = \
src/drmgr/lsslot.c \
@@ -186,6 +188,7 @@ src_drmgr_lsslot_SOURCES = \
src/drmgr/common_cpu.c \
src/drmgr/common_pci.c \
src/drmgr/common_ofdt.c \
+ src/drmgr/common_numa.c \
src/drmgr/rtas_calls.c \
src/drmgr/drslot_chrp_mem.c \
$(pseries_platform_SOURCES)
diff --git a/configure.ac b/configure.ac
index de3c6758389a..0239754cc4f4 100644
--- a/configure.ac
+++ b/configure.ac
@@ -42,6 +42,10 @@ AC_CHECK_HEADER(zlib.h,
[AC_CHECK_LIB(z, inflate, [], [AC_MSG_FAILURE([zlib library is required for compilation])])],
[AC_MSG_FAILURE([zlib.h is required for compiliation])])
+AC_CHECK_HEADER(numa.h,
+ [AC_CHECK_LIB(numa, numa_available, [], [AC_MSG_FAILURE([numa library is required for compilation])])],
+ [AC_MSG_FAILURE([numa.h is required for compiliation])])
+
# check for librtas
AC_ARG_WITH([librtas],
[AS_HELP_STRING([--without-librtas],
diff --git a/src/drmgr/common_numa.c b/src/drmgr/common_numa.c
new file mode 100644
index 000000000000..5778769b25b6
--- /dev/null
+++ b/src/drmgr/common_numa.c
@@ -0,0 +1,268 @@
+/**
+ * @file common_numa.c
+ *
+ * Copyright (C) IBM Corporation 2020
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include <numa.h>
+
+#include "dr.h"
+#include "ofdt.h"
+#include "drmem.h" /* for DYNAMIC_RECONFIG_MEM */
+#include "common_numa.h"
+
+#define RTAS_DIRECTORY "/proc/device-tree/rtas"
+#define CHOSEN_DIRECTORY "/proc/device-tree/chosen"
+#define ASSOC_REF_POINTS "ibm,associativity-reference-points"
+#define ASSOC_LOOKUP_ARRAYS "ibm,associativity-lookup-arrays"
+#define ARCHITECTURE_VEC_5 "ibm,architecture-vec-5"
+
+/*
+ * Allocate and read a property, return the size.
+ * The read property is not converted to the host endianess.
+ */
+static int load_property(char *dir, char *prop, uint32_t **buf)
+{
+ int size;
+
+ size = get_property_size(dir, prop);
+ if (!size)
+ return -ENOENT;
+
+ *buf = zalloc(size);
+ if (!*buf) {
+ say(ERROR, "Could not allocate buffer read %s (%d bytes)\n",
+ prop, size);
+ return -ENOMEM;
+ }
+
+ if (get_property(dir, prop, *buf, size)) {
+ free(*buf);
+ say(ERROR, "Can't retrieve %s/%s\n", dir, prop);
+ return -EINVAL;
+ }
+
+ return size;
+}
+
+/*
+ * Get the minimal common depth, based on the form 1 of the ibm,associativ-
+ * ity-reference-points property. We only support that form.
+ *
+ * We should check that the "ibm,architecture-vec-5" property byte 5 bit 0
+ * has the value of one.
+ */
+static int get_min_common_depth(struct numa_topology *numa)
+{
+ int size;
+ uint32_t *p;
+ unsigned char val;
+
+ size = load_property(CHOSEN_DIRECTORY, ARCHITECTURE_VEC_5, &p);
+ if (size < 0)
+ return size;
+
+ /* PAPR byte start at 1 (and not 0) but there is the length field */
+ if (size < 6) {
+ report_unknown_error(__FILE__, __LINE__);
+ free(p);
+ return -EINVAL;
+ }
+ val = ((unsigned char *)p)[5];
+ free(p);
+
+ if (!(val & 0x80))
+ return -ENOTSUP;
+
+ size = load_property(RTAS_DIRECTORY, ASSOC_REF_POINTS, &p);
+ if (size <= 0)
+ return size;
+ if (size < sizeof(uint32_t)) {
+ report_unknown_error(__FILE__, __LINE__);
+ free(p);
+ return -EINVAL;
+ }
+
+ /* Get the first entry */
+ numa->min_common_depth = be32toh(*p);
+ free(p);
+ return 0;
+}
+
+/*
+ * Read the "ibm,associativity-lookup-arrays" property and keep, for each
+ * array, only the entry selected by numa->min_common_depth.
+ *
+ * The property is a sequence of big-endian 32-bit cells: the number of
+ * arrays, the size of each array, then the arrays themselves. The selected
+ * entries are stored in host byte order in numa->aa.min_array.
+ *
+ * Returns 0 on success, a negative errno value otherwise.
+ */
+static int get_assoc_arrays(struct numa_topology *numa)
+{
+	int size;
+	int rc;
+	uint32_t *prop, i;
+	struct assoc_arrays *aa = &numa->aa;
+
+	size = load_property(DYNAMIC_RECONFIG_MEM, ASSOC_LOOKUP_ARRAYS, &prop);
+	if (size < 0)
+		return size;
+
+	/* From here on, size counts 32-bit cells rather than bytes */
+	size /= sizeof(uint32_t);
+	if (size < 2) {
+		say(ERROR, "Could not find the associativity lookup arrays\n");
+		free(prop);
+		return -EINVAL;
+	}
+
+	aa->n_arrays = be32toh(prop[0]);
+	aa->array_sz = be32toh(prop[1]);
+
+	rc = -EINVAL;
+	if (numa->min_common_depth > aa->array_sz) {
+		say(ERROR, "Bad min common depth or associativity array size\n");
+		goto out_free;
+	}
+
+	/* Sanity check, done in 64 bits so that the product cannot wrap */
+	if ((uint64_t)size != (uint64_t)aa->n_arrays * aa->array_sz + 2) {
+		say(ERROR, "Bad size of the associativity lookup arrays\n");
+		goto out_free;
+	}
+
+	aa->min_array = zalloc(aa->n_arrays * sizeof(uint32_t));
+	if (!aa->min_array) {
+		say(ERROR, "Could not allocate the associativity array (%u entries)\n",
+		    aa->n_arrays);
+		rc = -ENOMEM;
+		goto out_free;
+	}
+
+	/* Keep only the most significant value */
+	for (i = 0; i < aa->n_arrays; i++) {
+		/* +1: skip the two header cells, depth being 1-based */
+		int prop_index = i * aa->array_sz + numa->min_common_depth + 1;
+
+		aa->min_array[i] = be32toh(prop[prop_index]);
+	}
+	rc = 0;
+
+out_free:
+	free(prop);
+	return rc;
+}
+
+/*
+ * Return the node structure for @nid, allocating and registering it in
+ * @numa on first use.
+ *
+ * When a node is allocated, the topology's node_min/node_max bounds and
+ * node_count are updated. Returns NULL if @nid is outside
+ * [0, MAX_NUMNODES) or if the allocation fails.
+ */
+struct numa_node *numa_fetch_node(struct numa_topology *numa, int nid)
+{
+	struct numa_node *node;
+
+	/* nodes[] has MAX_NUMNODES entries: valid ids are 0..MAX_NUMNODES-1 */
+	if (nid < 0 || nid >= MAX_NUMNODES) {
+		report_unknown_error(__FILE__, __LINE__);
+		return NULL;
+	}
+
+	node = numa->nodes[nid];
+	if (node)
+		return node;
+
+	node = zalloc(sizeof(struct numa_node));
+	if (!node) {
+		say(ERROR, "Can't allocate a new node\n");
+		return NULL;
+	}
+
+	node->node_id = nid;
+
+	/* The first registered node initializes node_min */
+	if (!numa->node_count || nid < numa->node_min)
+		numa->node_min = nid;
+	if (nid > numa->node_max)
+		numa->node_max = nid;
+
+	numa->nodes[nid] = node;
+	numa->node_count++;
+
+	return node;
+}
+
+/*
+ * Read the number of CPU for each node using the libnuma to get the details
+ * from sysfs.
+ */
+static int read_numa_topology(struct numa_topology *numa)
+{
+ struct bitmask *cpus;
+ struct numa_node *node;
+ int rc, max_node, nid, i;
+
+ if (numa_available() < 0)
+ return -ENOENT;
+
+ max_node = numa_max_node();
+ if (max_node >= MAX_NUMNODES) {
+ say(ERROR, "Too many nodes %d (max:%d)\n",
+ max_node, MAX_NUMNODES);
+ return -EINVAL;
+ }
+
+ rc = 0;
+
+ /* In case of allocation error, the libnuma is calling exit() */
+ cpus = numa_allocate_cpumask();
+
+ for (nid = 0; nid <= max_node; nid++) {
+
+ if (!numa_bitmask_isbitset(numa_nodes_ptr, nid))
+ continue;
+
+ node = numa_fetch_node(numa, nid);
+ if (!node) {
+ rc = -ENOMEM;
+ break;
+ }
+
+ rc = numa_node_to_cpus(nid, cpus);
+ if (rc < 0)
+ break;
+
+ /* Count the CPUs in that node */
+ for (i = 0; i < cpus->size; i++)
+ if (numa_bitmask_isbitset(cpus, i))
+ node->n_cpus++;
+
+ numa->cpu_count += node->n_cpus;
+ }
+
+ numa_bitmask_free(cpus);
+
+ if (rc) {
+ numa_foreach_node(numa, nid, node)
+ node->n_cpus = 0;
+ numa->cpu_count = 0;
+ }
+
+ return rc;
+}
+
+int numa_get_topology(struct numa_topology *numa)
+{
+ int rc;
+
+ rc = get_min_common_depth(numa);
+ if (rc)
+ return rc;
+
+
+ rc = get_assoc_arrays(numa);
+ if (rc)
+ return rc;
+
+ rc = read_numa_topology(numa);
+ if (rc)
+ return rc;
+
+ if (!numa->node_count)
+ return -1;
+
+ return 0;
+}
diff --git a/src/drmgr/common_numa.h b/src/drmgr/common_numa.h
new file mode 100644
index 000000000000..4d0054926819
--- /dev/null
+++ b/src/drmgr/common_numa.h
@@ -0,0 +1,83 @@
+/**
+ * @file numa.h
+ *
+ * Copyright (C) IBM Corporation 2020
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#ifndef _NUMA_H_
+#define _NUMA_H_
+
+#define MAX_NUMNODES 256
+#define NUMA_NO_NODE -1
+
+struct numa_node {
+ int node_id;
+ unsigned int n_cpus;
+ unsigned int n_lmbs;
+ unsigned int ratio;
+ struct dr_node *lmbs; /* linked by lmb_numa_next */
+ struct numa_node *ratio_next;
+};
+
+struct assoc_arrays {
+ uint32_t n_arrays;
+ uint32_t array_sz;
+ uint32_t *min_array;
+};
+
+struct numa_topology {
+ unsigned int cpu_count;
+ unsigned int lmb_count;
+ unsigned int cpuless_node_count;
+ unsigned int cpuless_lmb_count;
+ unsigned int node_count, node_min, node_max;
+ struct numa_node *nodes[MAX_NUMNODES];
+ struct numa_node *ratio;
+ uint32_t min_common_depth;
+ struct assoc_arrays aa;
+};
+
+int numa_get_topology(struct numa_topology *numa);
+struct numa_node *numa_fetch_node(struct numa_topology *numa, int node_id);
+
+static inline int numa_aa_index_to_node(struct numa_topology *numa,
+ uint32_t aa_index)
+{
+ if (aa_index < numa->aa.n_arrays)
+ return numa->aa.min_array[aa_index];
+ return NUMA_NO_NODE;
+}
+
+static inline int next_node(struct numa_topology *numa, int nid,
+ struct numa_node **node)
+{
+ for (nid++; nid <= numa->node_max; nid++)
+ if (numa->nodes[nid]) {
+ *node = numa->nodes[nid];
+ break;
+ }
+ return nid;
+}
+
+/*
+ * Iterate over every registered node of @numa in increasing node id order,
+ * setting @nid and @node for each one. The body is not entered while no node
+ * has been registered (node_count == 0), since nodes[node_min] is then NULL.
+ */
+#define numa_foreach_node(numa, nid, node)				\
+	for (nid = (numa)->node_min, node = (numa)->nodes[nid];	\
+	     (numa)->node_count && nid <= (numa)->node_max;		\
+	     nid = next_node(numa, nid, &(node)))
+
+#define numa_foreach_node_by_ratio(numa, node) \
+ for (node = (numa)->ratio; node; node = node->ratio_next)
+
+#endif /* _NUMA_H_ */
--
2.29.2

View File

@ -1,13 +0,0 @@
diff -up powerpc-utils-1.3.8/src/sys_ident.c.me powerpc-utils-1.3.8/src/sys_ident.c
--- powerpc-utils-1.3.8/src/sys_ident.c.me 2020-10-06 15:35:56.620350621 +0200
+++ powerpc-utils-1.3.8/src/sys_ident.c 2020-10-06 15:36:31.468849936 +0200
@@ -267,7 +267,8 @@ print_proc_sn_value(void)
"parameter from RTAS\n");
return 2;
}
- pos = strstr(buf, "uid=");
+ /* Ignore length field (first 2 bytes) */
+ pos = strstr(buf +2, "uid=");
if (pos == NULL) {
fprintf(stderr, "Parameter from RTAS does "
"not contain uid\n");

View File

@ -1,443 +0,0 @@
From 3c549c7494e729a68b64ac5519bcf1506b24f945 Mon Sep 17 00:00:00 2001
From: Laurent Dufour <ldufour@linux.ibm.com>
Date: Wed, 25 Nov 2020 18:03:45 +0100
Subject: [PATCH 3/3] drmgr: introduce NUMA based LMB removal
When the NUMA topology can be read, all the LMBs found in the Device Tree
are linked to the corresponding node. LMBs not associated with a node are
considered as not used.
LMBs associated with CPU-less nodes are accounted separately because they will
be targeted first for removal. The LMBs are removed from the CPU-less nodes
to reach an average number of LMBs per CPU-less node.
Nodes with CPUs have a ratio indexed on their number of CPUs. The more CPUs a
node has, the fewer LMBs will be removed from it. This way, nodes with a
high number of CPUs will get a higher amount of memory.
When a LMB can't be removed (because its memory can't be offlined by the
kernel), the LMB count for node is decremented and the LMB is removed from
the node's LMB list. This way, it is no more accounted as 'active' and the
removal operation will continue without taking it in account anymore.
The removal is done through the remove by DRC index API, allowing removal of
one LMB at a time. One future optimization would be to extend that API to
remove a linear range of LMBs each time.
If the NUMA topology can't be read, we fall back to the legacy removal
method.
Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com>
---
src/drmgr/drslot_chrp_mem.c | 335 +++++++++++++++++++++++++++++++++++-
src/drmgr/ofdt.h | 2 +
2 files changed, 336 insertions(+), 1 deletion(-)
diff --git a/src/drmgr/drslot_chrp_mem.c b/src/drmgr/drslot_chrp_mem.c
index 502aa3e9fff0..47d9f7b8ed90 100644
--- a/src/drmgr/drslot_chrp_mem.c
+++ b/src/drmgr/drslot_chrp_mem.c
@@ -31,12 +31,16 @@
#include "dr.h"
#include "ofdt.h"
#include "drmem.h"
+#include "common_numa.h"
static int block_sz_bytes = 0;
static char *state_strs[] = {"offline", "online"};
static char *usagestr = "-c mem {-a | -r} {-q <quantity> -p {variable_weight | ent_capacity} | {-q <quantity> | -s [<drc_name> | <drc_index>]}}";
+static struct numa_topology numa;
+static int numa_enabled = 0;
+
/**
* mem_usage
* @brief return usage string
@@ -306,6 +310,31 @@ get_mem_node_lmbs(struct lmb_list_head *lmb_list)
return rc;
}
+static int link_lmb_to_numa_node(struct dr_node *lmb)
+{
+ int nid;
+ struct numa_node *node;
+
+ nid = numa_aa_index_to_node(&numa, lmb->lmb_aa_index);
+ if (nid == NUMA_NO_NODE)
+ return 0;
+
+ node = numa_fetch_node(&numa, nid);
+ if (!node)
+ return -ENOMEM;
+
+ lmb->lmb_numa_next = node->lmbs;
+ node->lmbs = lmb;
+ node->n_lmbs++;
+
+ if (node->n_cpus)
+ numa.lmb_count++;
+ else
+ numa.cpuless_lmb_count++;
+
+ return 0;
+}
+
int add_lmb(struct lmb_list_head *lmb_list, uint32_t drc_index,
uint64_t address, uint64_t lmb_sz, uint32_t aa_index,
uint32_t flags)
@@ -324,6 +353,9 @@ int add_lmb(struct lmb_list_head *lmb_list, uint32_t drc_index,
lmb->lmb_address = address;
lmb->lmb_aa_index = aa_index;
+ if (numa_enabled && link_lmb_to_numa_node(lmb))
+ return -ENOMEM;
+
if (flags & DRMEM_ASSIGNED) {
int rc;
@@ -490,7 +522,7 @@ get_dynamic_reconfig_lmbs(struct lmb_list_head *lmb_list)
if (stat(DYNAMIC_RECONFIG_MEM_V1, &sbuf) == 0) {
rc = get_dynamic_reconfig_lmbs_v1(lmb_sz, lmb_list);
- } else if (is_lsslot_cmd &&
+ } else if ((is_lsslot_cmd || numa_enabled) &&
stat(DYNAMIC_RECONFIG_MEM_V2, &sbuf) == 0) {
rc = get_dynamic_reconfig_lmbs_v2(lmb_sz, lmb_list);
} else {
@@ -1424,11 +1456,312 @@ int valid_mem_options(void)
return 0;
}
+static int remove_lmb_by_index(uint32_t drc_index)
+{
+ char cmdbuf[128];
+ int offset;
+
+ offset = sprintf(cmdbuf, "memory remove index 0x%x", drc_index);
+
+ return __do_kernel_dlpar(cmdbuf, offset, 1 /* Don't report error */);
+}
+
+/*
+ * Try to remove up to @count LMBs from @node, walking the node's LMB list
+ * from its head.
+ *
+ * Every LMB that is tried, whether its removal succeeded or not, is dropped
+ * from the node's list and from the node and global counters so that it is
+ * not tried again.
+ *
+ * Returns the number of LMBs actually removed.
+ */
+static int remove_lmb_from_node(struct numa_node *node, uint32_t count)
+{
+	struct dr_node *lmb;
+	int err, done = 0, unlinked = 0;
+
+	say(DEBUG, "Try removing %d / %d LMBs from node %d\n",
+	    count, node->n_lmbs, node->node_id);
+
+	for (lmb = node->lmbs; lmb && done < count; lmb = lmb->lmb_numa_next) {
+		unlinked++;
+		err = remove_lmb_by_index(lmb->drc_index);
+		if (err)
+			say(WARN,"Can't remove LMB node:%d index:0x%x: %s\n",
+			    node->node_id, lmb->drc_index, strerror(-err));
+		else
+			done++;
+	}
+
+	/*
+	 * Decrement the node LMB's count since whatever is the success
+	 * of the removal operation, it will not be tried again on that
+	 * LMB.
+	 */
+	node->n_lmbs -= unlinked;
+
+	/*
+	 * Update the node's list of LMB to not process the one we removed or
+	 * tried to removed again.
+	 */
+	node->lmbs = lmb;
+
+	/*
+	 * Update numa's LMB counters. For a CPU less node this must be the
+	 * LMB counter, not the node counter: the CPU less ratio and the
+	 * removal loop both rely on cpuless_lmb_count being up to date.
+	 */
+	if (node->n_cpus)
+		numa.lmb_count -= unlinked;
+	else
+		numa.cpuless_lmb_count -= unlinked;
+
+	/* A node left without LMB no longer takes part in the ratios */
+	if (!node->n_lmbs) {
+		node->ratio = 0; /* for sanity only */
+		if (node->n_cpus)
+			numa.cpu_count -= node->n_cpus;
+		else
+			numa.cpuless_node_count--;
+	}
+
+	say(INFO, "Removed %d LMBs from node %d\n", done, node->node_id);
+	return done;
+}
+
+#define min(a,b) ((a < b) ? a : b)
+
+static void update_cpuless_node_ratio(void)
+{
+ struct numa_node *node;
+ int nid;
+
+ /*
+ * Assumptions:
+ * 1. numa->cpuless_node_count is up to date
+ * 2. numa->cpuless_lmb_count is up to date
+ * Nodes with no memory and nodes with CPUs are ignored here.
+ */
+ numa_foreach_node(&numa, nid, node) {
+ if (node->n_cpus ||!node->n_lmbs)
+ continue;
+ node->ratio = (node->n_lmbs * 100) / numa.cpuless_lmb_count;
+ }
+}
+
+/*
+ * Remove LMBs from node without CPUs only.
+ * The more the node has LMBs, the more LMBs will be removed from it.
+ *
+ * We have to retry the operation multiple times because some LMB cannot be
+ * removed due to the page usage in the kernel. In that case, that LMB is no
+ * more taken in account and the node's LMB count is decremented, assuming that
+ * LMB is unremovable at this time. Thus each node's ratio has to be computed on
+ * each iteration. This is not a big deal, usually, there are not so much nodes.
+ */
+static int remove_cpuless_lmbs(uint32_t count)
+{
+ struct numa_node *node;
+ int nid;
+ uint32_t total = count, todo, done = 0, this_loop;
+
+ while (count) {
+ count = min(count, numa.cpuless_lmb_count);
+ if (!count)
+ break;
+
+ update_cpuless_node_ratio();
+
+ this_loop = 0;
+ numa_foreach_node(&numa, nid, node) {
+ if (!node->n_lmbs || node->n_cpus)
+ continue;
+
+ todo = (count * node->ratio) / 100;
+ todo = min(todo, node->n_lmbs);
+ /* Fix rounded value to 0 */
+ if (!todo && node->n_lmbs)
+ todo = (count - this_loop);
+
+ if (todo)
+ todo = remove_lmb_from_node(node, todo);
+
+ this_loop += todo;
+ done += todo;
+ if (done >= total)
+ break;
+ }
+
+ /* Don't continue if we didn't make any progress. */
+ if (!this_loop)
+ break;
+
+ count -= this_loop;
+ }
+
+ say(DEBUG, "%d / %d LMBs removed from the CPU less nodes\n",
+ done, total);
+ return done;
+}
+
+static void update_node_ratio(void)
+{
+ int nid;
+ struct numa_node *node, *n, **p;
+ uint32_t cpu_ratio, mem_ratio;
+
+ /*
+ * Assumptions:
+ * 1. numa->cpu_count is up to date
+ * 2. numa->lmb_count is up to date
+ * Nodes with no memory and nodes with no CPU are ignored here.
+ */
+
+ numa.ratio = NULL;
+ numa_foreach_node(&numa, nid, node) {
+ if (!node->n_lmbs || !node->n_cpus)
+ continue;
+ cpu_ratio = (node->n_cpus * 100) / numa.cpu_count;
+ mem_ratio = (node->n_lmbs * 100) / numa.lmb_count;
+
+ /* Say that CPU ratio is 90% of the ratio */
+ node->ratio = (cpu_ratio * 9 + mem_ratio) / 10;
+ }
+
+ /* Create an ordered link of the nodes */
+ numa_foreach_node(&numa, nid, node) {
+ if (!node->n_lmbs || !node->n_cpus)
+ continue;
+
+ p = &numa.ratio;
+ for (n = numa.ratio;
+ n && n->ratio < node->ratio; n = n->ratio_next)
+ p = &n->ratio_next;
+ *p = node;
+ node->ratio_next = n;
+ }
+}
+
+/*
+ * Remove LMBs from node with CPUs.
+ *
+ * The less a node has CPU, the more memory will be removed from it.
+ *
+ * As for the CPU less nodes, we must iterate because some LMBs may not be
+ * removable at this time.
+ */
+static int remove_cpu_lmbs(uint32_t count)
+{
+ struct numa_node *node;
+ uint32_t total = count, todo, done = 0, this_loop;
+ uint32_t new_lmb_count;
+
+ while(count) {
+ count = min(count, numa.lmb_count);
+ if (!count)
+ break;
+
+ update_node_ratio();
+
+ new_lmb_count = numa.lmb_count - count;
+
+ this_loop = 0;
+ numa_foreach_node_by_ratio(&numa, node) {
+ if (!node->n_lmbs || !node->n_cpus)
+ continue;
+
+ todo = (new_lmb_count * node->ratio) / 100;
+ todo = node->n_lmbs - min(todo, node->n_lmbs);
+ todo = min(count, todo);
+
+ if (todo) {
+ todo = remove_lmb_from_node(node, todo);
+ count -= todo;
+ this_loop += todo;
+ }
+
+ if (!count)
+ break;
+ }
+
+ /* Don't continue if we didn't make any progress. */
+ if (!this_loop)
+ break;
+ done += this_loop;
+ }
+
+ say(DEBUG, "%d / %d LMBs removed from the CPU nodes\n",
+ done, total);
+ return done;
+}
+
+static void build_numa_topology(void)
+{
+ int rc;
+
+ rc = numa_get_topology(&numa);
+ if (rc)
+ return;
+
+ numa_enabled = 1;
+}
+
+static void clear_numa_lmb_links(void)
+{
+ int nid;
+ struct numa_node *node;
+
+ numa_foreach_node(&numa, nid, node)
+ node->lmbs = NULL;
+}
+
+static int numa_based_remove(uint32_t count)
+{
+ struct lmb_list_head *lmb_list;
+ struct numa_node *node;
+ int nid;
+ uint32_t done = 0;
+
+ /*
+ * Read the LMBs
+ * Link the LMBs to their node
+ * Update global counter
+ */
+ lmb_list = get_lmbs(LMB_NORMAL_SORT);
+ if (lmb_list == NULL) {
+ clear_numa_lmb_links();
+ return -1;
+ }
+
+ if (!numa.node_count) {
+ clear_numa_lmb_links();
+ free_lmbs(lmb_list);
+ return -EINVAL;
+ }
+
+ numa_foreach_node(&numa, nid, node) {
+ say(INFO, "node %4d %4d CPUs %8d LMBs\n",
+ nid, node->n_cpus, node->n_lmbs);
+ }
+
+ done += remove_cpuless_lmbs(count);
+ count -= done;
+
+ done += remove_cpu_lmbs(count);
+
+ report_resource_count(done);
+
+ clear_numa_lmb_links();
+ free_lmbs(lmb_list);
+ return 0;
+}
+
int do_mem_kernel_dlpar(void)
{
char cmdbuf[128];
int rc, offset;
+
+ if (usr_action == REMOVE && usr_drc_count) {
+ build_numa_topology();
+ if (numa_enabled) {
+ if (!numa_based_remove(usr_drc_count))
+ return 0;
+
+ /*
+ * If the NUMA based removal failed, lets try the legacy
+ * way.
+ */
+ say(WARN, "Can't do NUMA based removal operation.\n");
+ }
+ }
+
offset = sprintf(cmdbuf, "%s ", "memory");
switch (usr_action) {
diff --git a/src/drmgr/ofdt.h b/src/drmgr/ofdt.h
index 3850a77229b4..3c2840b2e0ee 100644
--- a/src/drmgr/ofdt.h
+++ b/src/drmgr/ofdt.h
@@ -92,6 +92,7 @@ struct dr_node {
uint32_t _lmb_aa_index;
struct mem_scn *_mem_scns;
struct of_node *_of_node;
+ struct dr_node *_numa_next;
} _smem;
#define lmb_address _node_u._smem._address
@@ -99,6 +100,7 @@ struct dr_node {
#define lmb_aa_index _node_u._smem._lmb_aa_index
#define lmb_mem_scns _node_u._smem._mem_scns
#define lmb_of_node _node_u._smem._of_node
+#define lmb_numa_next _node_u._smem._numa_next
struct hea_info {
uint _port_no;
--
2.29.2

View File

@ -1,33 +0,0 @@
commit 0b59d4a372aa266caa75f3b6a253b8f5aeaf3802
Author: Mingming Cao <mmc@linux.vnet.ibm.com>
Date: Mon Mar 1 19:34:29 2021 -0800
hcnmgr: Avoid cleanup of bond interface at boot time when no HNV exists
At boot time, hcn scans the device tree and discovers whether a new
HNV was added while the LPAR was inactive. It also cleans up the old HNV
interfaces. This patch avoids cleaning up the bonding interface when no HNV
network devices exist.
Signed-off-by: Mingming Cao <mmc@linux.vnet.ibm.com>
[tyreld: fixup commit log]
Signed-off-by: Tyrel Datwyler <tyreld@linux.ibm.com>
diff --git a/scripts/hcnmgr b/scripts/hcnmgr
index a76505e..c95edba 100644
--- a/scripts/hcnmgr
+++ b/scripts/hcnmgr
@@ -575,7 +575,13 @@ scanhcn() {
done
fi
+	# Nothing to clean up when the HcnIds array is empty.
+	# ${#HcnIds[@]} is the element count; ${HcnIds[@]} would expand to the
+	# elements themselves and break the numeric test.
+	if [ ${#HcnIds[@]} -eq 0 ]; then
+		hcnlog DEBUG "scanhcn: scan for hybrid virtual network finished"
+		return $E_SUCCESS
+	fi
+
# Next clean up dead connections left from orgitinal LPAR after inactive miration
+ # Only do this when the HNV ID array is not empty
# list of all HCN ids
ids="${HcnIds[*]}"

View File

@ -1,32 +0,0 @@
commit 1cb8bd89d6386c60e75c47d4a4452d3f130d5138
Author: Mingming Cao <mmc@linux.vnet.ibm.com>
Date: Fri Mar 12 14:18:18 2021 -0800
hcnmgr: Avoid using xargs to process NM show connections
When removing HNV bonding connections, xargs can fail to process the output of
nmcli show properly.
Instead of piping into xargs fix this by using a loop to check for all related
bonding connections and remove them explicitly one by one.
Signed-off-by: Mingming Cao <mmc@linux.vnet.ibm.com>
[tyreld: fixed up commit log]
Signed-off-by: Tyrel Datwyler <tyreld@linux.ibm.com>
diff --git a/scripts/hcnmgr b/scripts/hcnmgr
index d66b5d1..30d31e7 100644
--- a/scripts/hcnmgr
+++ b/scripts/hcnmgr
@@ -377,7 +377,10 @@ rmhcn() {
fi
hcnlog INFO "rmhcn: delete bond $BONDNAME and slaves "
- nmcli -f NAME con show | grep "$BONDNAME" | xargs sudo nmcli con delete
+ for connection in $(nmcli -f NAME con show | grep "$BONDNAME"); do
+ hcnlog INFO "Delete bonding connection $connection"
+ nmcli con delete "$connection"
+ done
hcnlog DEBUG "rmhcn: exit"
return $E_SUCCESS
}

View File

@ -1,26 +0,0 @@
commit 366e17553ed647613668678c2d301d369038f41b
Author: Brahadambal Srinivasan <latha@linux.vnet.ibm.com>
Date: Thu Nov 12 19:00:47 2020 +0530
Update ppc64-cpu usage
'ppc64_cpu --help' doesn't list '--version' as an option. This patch
adds the option in the usage information of ppc64-cpu command.
Signed-off-by: Brahadambal Srinivasan <latha@linux.vnet.ibm.com>
Signed-off-by: Tyrel Datwyler <tyreld@linux.ibm.com>
diff --git a/src/ppc64_cpu.c b/src/ppc64_cpu.c
index 71f4720..2b0f66c 100644
--- a/src/ppc64_cpu.c
+++ b/src/ppc64_cpu.c
@@ -1195,7 +1195,8 @@ static void usage(void)
"ppc64_cpu --subcores-per-core # Get number of subcores per core\n"
"ppc64_cpu --subcores-per-core=X # Set subcores per core to X (1 or 4)\n"
"ppc64_cpu --threads-per-core # Get threads per core\n"
-"ppc64_cpu --info # Display system state information)\n");
+"ppc64_cpu --info # Display system state information\n"
+"ppc64_cpu --version # Display version of ppc64-cpu\n");
}
struct option longopts[] = {

View File

@ -1,162 +0,0 @@
commit 4b2d10942e2d964ecc1fe58c9460c34993ff10be
Author: Mingming Cao <mmc@linux.vnet.ibm.com>
Date: Thu Nov 19 21:38:24 2020 -0800
Disable vnic as backup vdevice for migratable SR_IOV
In version 1.0 we only allow configuring ibmveth as the
migratable SR_IOV backup vdevice. When ibmvnic becomes
more stable we will enable vnic as backup.
Signed-off-by: Mingming Cao <mmc@linux.vnet.ibm.com>
Signed-off-by: Tyrel Datwyler <tyreld@linux.ibm.com>
diff --git a/scripts/hcnmgr b/scripts/hcnmgr
index 1135a85..0c09d8c 100644
--- a/scripts/hcnmgr
+++ b/scripts/hcnmgr
@@ -37,6 +37,8 @@ DRC_INDEX=0
DEVNAME=""
MODE=""
PHYSLOC=""
+VIO_TYPE=""
+VNIC_SPT=""
# Usage statements
usage() {
@@ -80,6 +82,7 @@ E_EPERM=1 # Platform not supported
E_BUSY=16 # Device busy
E_ENODEV=19 # Failed get device name
E_NOMODULE=5 # Failed to load bonding module
+E_INVAL_DEV=6 # Vdevice not supported
E_ENETUNREACH=101 # No network management command nmcli
#
@@ -93,19 +96,22 @@ err() {
case $eno in
"$E_INVAL")
- e_mesg="$HCNCMD:error code $eno, hybrid network ID HCNID does not exist"
+ e_mesg="$HCNCMD:error code $eno, Hybrid network ID HCNID does not exist"
+ ;;
+ "$E_INVAL_DEV")
+ e_mesg="$HCNCMD:error code $eno, Backing vdevice not supported"
;;
"$E_EPERM")
- e_mesg="$HCNCMD:error code $eno, platform is not supported"
+ e_mesg="$HCNCMD:error code $eno, Platform is not supported"
;;
"$E_BUSY")
- e_mesg="$HCNCMD:error code $eno, network device busy, no backup device"
+ e_mesg="$HCNCMD:error code $eno, Network device busy, no backup device"
;;
"$E_ENODEV")
- e_mesg="$HCNCMD:error code $eno, failed to find device or get device name"
+ e_mesg="$HCNCMD:error code $eno, Failed to find device or get device name"
;;
"$E_NOMODULE")
- e_mesg="$HCNCMD:error code $eno, failed to load bonding module"
+ e_mesg="$HCNCMD:error code $eno, Failed to load bonding module"
;;
"$E_ENETUNREACH")
e_mesg="$HCNCMD:error code $eno, nmcli command not installed"
@@ -147,6 +153,7 @@ hcnlog() {
esac
}
+
# function search_dev:
# Given DRX_INDEX, Search for device-tree, looking for migratable SR_IOV
# backend vnic or ibmveth device to configure hybrid network
@@ -171,6 +178,7 @@ search_dev() {
if [ -e "$dev"/ibm,hcn-id ] && get_dev_hcn "$dev"; then
hcnlog DEBUG "search_dev: found device "
hcnlog DEBUG "search_dev: exit"
+ VIO_TYPE="SRIOV"
return $E_SUCCESS
fi
done
@@ -184,6 +192,7 @@ search_dev() {
if [[ $index == "$1" ]]; then
hcnlog DEBUG "found matching drc_index $index in $dev"
if [ -e "$dev"/ibm,hcn-id ] && get_dev_hcn "$dev"; then
+ VIO_TYPE="VNIC"
hcnlog DEBUG "search_dev: found device "
hcnlog DEBUG "search_dev: exit"
return $E_SUCCESS
@@ -201,6 +210,7 @@ search_dev() {
if [ -e "$dev"/ibm,hcn-id ] && get_dev_hcn "$dev"; then
hcnlog DEBUG "search_dev: found device "
hcnlog DEBUG "search_dev: exit"
+ VIO_TYPE="L_LAN"
return $E_SUCCESS
fi
fi
@@ -340,6 +350,10 @@ do_config_vdevice() {
cfghcn() {
hcnlog DEBUG "cfghcn: enter $1"
search_dev "$1"
+ if [[ $VIO_TYPE == "VNIC" && $VNIC_SPT == "OFF" ]]; then
+ hcnlog WARN "Backing device $VIO_TYPE for Migratable VF is not supported in hcnmgr version $VERSION"
+ err $E_INVAL_DEV
+ fi
do_config_vdevice
return $E_SUCCESS
}
@@ -512,23 +526,10 @@ scanhcn() {
done
done
- hcnlog DEBUG "search vnic device with ibm,hcn-id propterty......"
+ hcnlog DEBUG "search ibmveth device with ibm,hcn-id propterty......"
# Look at every vNIC device with ibm,hcn-id propterty
# join or create bond for this hcnid if not exist, add vnic device as
# slave for this bond accosiated with hcnid, if not already to
- for dev in "$DT_PATH"/vdevice/vnic*; do
- [ -d "$dev" ] || continue
- if [ -e "$dev"/ibm,hcn-id ] && get_dev_hcn "$dev"; then
- hcnlog DEBUG "scanhcn found vnic device with hcnid "
- hcnlog INFO "scanhcn configure HCN and vnic device"
- do_config_vdevice
- fi
- done
-
- # Look at every veth device with ibm,hcn-id propterty
- # join or create bond for this hcnid if not exist, add ibmveth device as
- # slave for this bond accosiated with hcnid, if not already to
- hcnlog DEBUG "search ibmveth device with ibm,hcn-id propterty......"
for dev in "$DT_PATH"/vdevice/l-lan*; do
[ -d "$dev" ] || continue
if [ -e "$dev"/ibm,hcn-id ] && get_dev_hcn "$dev"; then
@@ -538,6 +539,21 @@ scanhcn() {
fi
done
+ if [[ $VNIC_SPT != "OFF" ]]; then
+ hcnlog DEBUG "search vnic device with ibm,hcn-id propterty......"
+ # Look at every vNIC device with ibm,hcn-id propterty
+ # join or create bond for this hcnid if not exist, add vnic device as
+ # slave for this bond accosiated with hcnid, if not already to
+ for dev in "$DT_PATH"/vdevice/vnic*; do
+ [ -d "$dev" ] || continue
+ if [ -e "$dev"/ibm,hcn-id ] && get_dev_hcn "$dev"; then
+ hcnlog DEBUG "scanhcn found vnic device with hcnid "
+ hcnlog INFO "scanhcn configure HCN and vnic device"
+ do_config_vdevice
+ fi
+ done
+ fi
+
# Next clean up dead connections left from orgitinal LPAR after inactive miration
# list of all HCN ids
@@ -583,6 +599,9 @@ fi
if ! nmcli --version >/dev/null 2>&1; then
err $E_ENETUNREACH
fi
+if [[ $VERSION == "1.0" ]]; then
+ VNIC_SPT="OFF"
+fi
#Validate bonding module is loaded
if ! lsmod | grep -q bonding; then

View File

@ -1,30 +0,0 @@
commit d9bcb21179ccfea122f326aca4690afe0f7de0c6
Author: Mingming Cao <mmc@linux.vnet.ibm.com>
Date: Mon Mar 1 21:34:34 2021 -0800
hcnmgr: Wait for sysfs device ready when looking up device name
When calling ofpathname to look up the device name, wait
for the sysfs device to be ready. Otherwise, the OS may still be in the
middle of renaming the device.
Signed-off-by: Mingming Cao <mmc@linux.vnet.ibm.com>
[tyreld: fixed up commit log]
Signed-off-by: Tyrel Datwyler <tyreld@linux.ibm.com>
diff --git a/scripts/hcnmgr b/scripts/hcnmgr
index c95edba..0d20e7d 100644
--- a/scripts/hcnmgr
+++ b/scripts/hcnmgr
@@ -241,7 +241,10 @@ get_dev_hcn() {
# Let's retry a few times.
while [ $wait != 0 ]; do
if DEVNAME=$(ofpathname -l "$(echo "$1" | sed -e "s/\/proc\/device-tree//")" 2>/dev/null); then
- break
+ if [ -e /sys/class/net/"$DEVNAME" ]; then
+ hcnlog DEBUG "ofpathname waiting for /sys/class/net device $DEVNAME ready"
+ break
+ fi
fi
hcnlog DEBUG "ofpathname return $?, devname is $DEVNAME rety counter $wait"

View File

@ -1,30 +0,0 @@
commit e25d71be411b610e5e889f8efaaf04b38c2d9ecb
Author: Mingming Cao <mmc@linux.vnet.ibm.com>
Date: Fri Mar 12 13:50:33 2021 -0800
hcnmgr: Avoid using ifcfg file for checking bonding interface status
When configuring a migratable SR_IOV device into a hybrid network, hcnmgr checks
whether there is an existing HNV by testing for the presence of an ifcfg file. This
is not preferred as the file location can differ between distros.
This patch fixes this by querying NetworkManager through nmcli instead.
Signed-off-by: Mingming Cao <mmc@linux.vnet.ibm.com>
[tyreld: fixed spelling]
Signed-off-by: Tyrel Datwyler <tyreld@linux.ibm.com>
diff --git a/scripts/hcnmgr b/scripts/hcnmgr
index 0d20e7d..d66b5d1 100644
--- a/scripts/hcnmgr
+++ b/scripts/hcnmgr
@@ -282,8 +282,7 @@ do_config_vdevice() {
hcnlog DEBUG "Check if there is bond $BONDNAME with hcn id $HCNID"
- hcnlog DEBUG "ifconfig file $IFCONFIG_PATH/ifconfig-$BONDNAME"
- if [ ! -e "$IFCONFIG_PATH/ifcfg-$BONDNAME" ]; then
+ if ! nmcli -f NAME con show --active | grep -q "$BONDNAME\s"; then
hcnlog INFO "nmcli con add type bond con-name $BONDNAME ifname $BONDNAME"
nmcli con add type bond con-name "$BONDNAME" ifname "$BONDNAME"

View File

@ -1,86 +0,0 @@
commit e51995667279164a6c8ce4c3ba0217fffda22d6d
Author: Mingming Cao <mmc@linux.vnet.ibm.com>
Date: Thu Nov 19 21:41:31 2020 -0800
Clean up dead network config interface after inactive migration
With inactive partition migration, we found that dead
network interfaces associated with the SR_IOV device from
the source LPAR still exist.
We need to clean up the old network interface
related to this devname from the source LPAR. Normally,
in the active partition migration case, this is done
when the HMC issues commands to the OS to remove the
VF from the hybrid network. With inactive
migration, the OS is not notified before migration
that the VF was removed, which leaves the dead network
interface belonging to the original MVF on the source LPAR
not cleaned up. This confuses the network
manager when it brings up the bondings with new MVFs at the destination
LPAR.
After inactive partition migration the same devname
could possibly be used at the destination LPAR.
It can be assigned to a different hybrid
network (different hcnid). At OS boot time,
the same devname but with a different hcnid will
be configured. However, the old network interface
associated with the same devname from the source LPAR
has not been cleaned up yet.
This patch fixes this by scanning at boot time and,
if a VF is found to have more than two bonding interfaces
or an old VF ifcfg, removing the expired one.
Signed-off-by: Mingming Cao <mmc@linux.vnet.ibm.com>
Signed-off-by: Tyrel Datwyler <tyreld@linux.ibm.com>
diff --git a/scripts/hcnmgr b/scripts/hcnmgr
index 0c09d8c..a76505e 100644
--- a/scripts/hcnmgr
+++ b/scripts/hcnmgr
@@ -518,6 +518,27 @@ scanhcn() {
[ -d "$dev" ] || continue
if [ -e "$dev"/ibm,hcn-id ] && get_dev_hcn "$dev"; then
hcnlog DEBUG "scanhcn found sr-iov device with hcnid "
+
+ # After online from inactive migration, destination
+ # LPAR may have same mvf devname but associated with different
+ # bonding than from source LPAR
+ # clean up expired bonding SR_IOV connections
+
+ for cfg in $(ls $IFCONFIG_PATH | grep "$DEVNAME" | grep "bond"); do
+ hid=$(echo "$cfg" | sed -e 's/ifcfg-//' | cut -d '-' -f 1 | sed -e 's/bond//')
+ if [ -e "$IFCONFIG_PATH/ifcfg-$DEVNAME" ]; then
+ rm "$IFCONFIG_PATH/ifcfg-$DEVNAME"
+ fi
+ if [[ $hid != "" && $hid != "$HCNID" ]] ; then
+ hcnlog INFO "Delete dead bonding slave ifcfg file $IFCONFIG_PATH/$cfg"
+ rm $IFCONFIG_PATH/"$cfg"
+ if nmcli -f NAME con show | grep -q "bond$hid-$DEVNAME\s"; then
+ hcnlog INFO "Delete dead bonding connection $connection"
+ nmcli con delete "bond$hid-$DEVNAME"
+ fi
+ fi
+ done
+
hcnlog INFO "scanhcn configure HCN and sr-iov device"
do_config_vdevice
# Save found HCN ids in array HcnIds
@@ -558,13 +579,12 @@ scanhcn() {
# list of all HCN ids
ids="${HcnIds[*]}"
-
# After inactive migration, LPAR may have old bonding connections
# with network device on original LPAR
- # clean up dead bonding connections
+ # clean up dead bonding connections
for connection in $(nmcli -f NAME con show | grep "${ids// /\\|}"); do
dev=$(echo "$connection" | cut -d '-' -f 2)
- if [ ! -e /sys/class/net/"$dev" ]; then
+ if [[ $dev != "NAME" && ! -e /sys/class/net/"$dev" ]]; then
hcnlog INFO "Delete dead bonding connection $connection"
nmcli con delete "$connection"
fi

View File

@ -1,94 +0,0 @@
commit f1ec5f04fdac3e87c3c85c2d85f79339d916e864
Author: Mingming Cao <mmc@linux.vnet.ibm.com>
Date: Thu Nov 19 21:30:40 2020 -0800
HNV fixes for qrydev and remove lsdevinfo
This patch fixes a few issues found during testing:
1) Fix qrydev checking for active interface error
2) Skip collecting lsdevinfo right before migration
When the LPAR has a large number of devices, this can
take quite a long time and flood the log.
3) Wait for the OS to be ready before looking up the device name when
configuring an HNV device. We need to give the OS more time to prepare the PCI
device after DLPAR so that it shows up for ofpathname(). We have
run into several cases where HNV issues the hcncfgdrc command
to configure the vdevices before ofpathname is able
to find the device name.
4) Currently hybrid network virtualization is only
supported on PowerVM LPARs. hcnmgr should exit
gracefully on other Power platforms instead of returning an error.
Signed-off-by: Mingming Cao <mmc@linux.vnet.ibm.com>
Signed-off-by: Tyrel Datwyler <tyreld@linux.ibm.com>
diff --git a/scripts/hcnmgr b/scripts/hcnmgr
index e699c6a..1135a85 100644
--- a/scripts/hcnmgr
+++ b/scripts/hcnmgr
@@ -218,7 +218,7 @@ search_dev() {
# $1 path to device-tree device
#
get_dev_hcn() {
- local wait=30
+ local wait=12
local dev=$1
hcnlog DEBUG "get_dev_hcn: enter $1"
@@ -235,7 +235,7 @@ get_dev_hcn() {
fi
hcnlog DEBUG "ofpathname return $?, devname is $DEVNAME rety counter $wait"
- sleep 1
+ sleep 15
((wait--))
if [[ $wait == 0 ]]; then
@@ -393,7 +393,7 @@ qrydev() {
BOND_PATH=$BOND_BASEPATH/$BONDNAME/bonding
hcnlog DEBUG "check if the network interface for this SR_IOV is not up, return success"
- if ! nmcli -f DEVICE con show --active | grep -q "$BONDNAME-$DEVNAME"; then
+ if ! nmcli -f DEVICE con show --active | grep -q "$DEVNAME"; then
hcnlog DEBUG "network connection $BONDNAME-$DEVNAME is inactive or nonexist"
hcnlog DEBUG "HCNID $HCNID devname $DEVNAME mode $MODE physloc $PHYSLOC"
hcnlog DEBUG "qryhcn: exit"
@@ -435,7 +435,6 @@ show_hcnstatus() {
nmcli connection show >>$LOG_FILE
nmcli device status >>$LOG_FILE
ip addr show >>$LOG_FILE
- lsdevinfo >>$LOG_FILE
}
#
@@ -569,10 +568,15 @@ exec &> >(tee -a $LOG_FILE)
NOW=$(date +"%m-%d-%Y %T")
echo "=======================$NOW============================"
+HCNCMD=$(basename "$0")
+hcnlog DEBUG "$HCNCMD enter"
+
#Validate this tool is running on powerpc platform
. "$PSERIES_PLATFORM"
if [ "$platform" != "$PLATFORM_PSERIES_LPAR" ]; then
- err $E_EPERM
+ hcnlog INFO "HNV is only supported on PowerVM LPAR"
+ hcnlog INFO "$HCNCMD exit"
+ exit 0
fi
#Validate NMCLI packages is install to manage networking
@@ -588,9 +592,6 @@ if ! lsmod | grep -q bonding; then
fi
fi
-HCNCMD=$(basename "$0")
-hcnlog DEBUG "$HCNCMD enter"
-
#getops for help and version
while getopts "sVhd:" arg; do
case "$arg" in

View File

@ -1,19 +0,0 @@
diff -up powerpc-utils-1.3.8/Makefile.am.me powerpc-utils-1.3.8/Makefile.am
--- powerpc-utils-1.3.8/Makefile.am.me 2020-09-04 11:34:16.575570705 +0200
+++ powerpc-utils-1.3.8/Makefile.am 2020-09-04 11:35:36.499111663 +0200
@@ -47,7 +47,14 @@ man_MANS = \
man/errinjct.8 \
man/vcpustat.8 \
man/rtas_dbg.8 \
- man/drmgr.8
+ man/drmgr.8 \
+ man/lsdevinfo.8 \
+ man/rtas_event_decode.8 \
+ man/ls-vdev.8 \
+ man/lsprop.8 \
+ man/ls-veth.8 \
+ man/nvsetenv.8 \
+ man/ls-vscsi.8
EXTRA_DIST += $(bin_SCRIPTS) $(sbin_SCRIPTS) $(man_MANS)

View File

@ -1,11 +0,0 @@
diff -up powerpc-utils-1.3.8/Makefile.am.me powerpc-utils-1.3.8/Makefile.am
--- powerpc-utils-1.3.8/Makefile.am.me 2021-05-31 15:24:26.031343026 +0200
+++ powerpc-utils-1.3.8/Makefile.am 2021-05-31 15:25:50.818195810 +0200
@@ -182,7 +182,6 @@ noinst_HEADERS += \
src/drmgr/drcpu.h \
src/drmgr/dr.h \
src/drmgr/drmem.h \
- src/drmgr/numa.h \
src/drmgr/drpci.h \
src/drmgr/rtas_calls.h \
src/drmgr/ofdt.h \

View File

@ -1,39 +0,0 @@
diff -up powerpc-utils-1.3.8/scripts/hcnmgr.me powerpc-utils-1.3.8/scripts/hcnmgr
--- powerpc-utils-1.3.8/scripts/hcnmgr.me 2021-04-29 11:52:12.557488115 +0200
+++ powerpc-utils-1.3.8/scripts/hcnmgr 2021-04-29 16:31:07.936124101 +0200
@@ -167,7 +167,7 @@ search_dev() {
# Look at pci ethernet devices
for pci_dev in "$DT_PATH"/pci*; do
[ -d "$pci_dev" ] || continue
- index=$(xxd -l 4 -p "$pci_dev"/ibm,my-drc-index)
+ index=$(od -tx -An --endian=big -N 4 "$pci_dev"/ibm,my-drc-index | tr -d ' ')
if [[ $index != "$1" ]]; then
continue
fi
@@ -188,7 +188,7 @@ search_dev() {
hcnlog DEBUG "search vnic device with drc_index $1"
for dev in "$DT_PATH"/vdevice/vnic*; do
[ -d "$dev" ] || continue
- index=$(xxd -l 4 -p "$dev"/ibm,my-drc-index)
+ index=$(od -tx -An --endian=big -N 4 "$dev"/ibm,my-drc-index | tr -d ' ')
if [[ $index == "$1" ]]; then
hcnlog DEBUG "found matching drc_index $index in $dev"
if [ -e "$dev"/ibm,hcn-id ] && get_dev_hcn "$dev"; then
@@ -204,7 +204,7 @@ search_dev() {
hcnlog DEBUG "search ibmveth device with drc_index $1"
for dev in "$DT_PATH"/vdevice/l-lan*; do
[ -d "$dev" ] || continue
- index=$(xxd -l 4 -p "$dev"/ibm,my-drc-index)
+ index=$(od -tx -An --endian=big -N 4 "$dev"/ibm,my-drc-index | tr -d ' ')
if [[ $index == "$1" ]]; then
hcnlog DEBUG "found matching drc_index $index in $dev"
if [ -e "$dev"/ibm,hcn-id ] && get_dev_hcn "$dev"; then
@@ -232,7 +232,7 @@ get_dev_hcn() {
local dev=$1
hcnlog DEBUG "get_dev_hcn: enter $1"
- HCNID=$(xxd -l 4 -p "$dev"/ibm,hcn-id)
+ HCNID=$(od -tx -An --endian=big -N 4 "$dev"/ibm,hcn-id | tr -d ' ')
MODE=$(tr -d '\0' <"$dev"/ibm,hcn-mode)
PHYSLOC=$(tr -d '\0' <"$dev"/ibm,loc-code)

View File

@ -0,0 +1,19 @@
diff -up powerpc-utils-1.3.9/Makefile.am.me powerpc-utils-1.3.9/Makefile.am
--- powerpc-utils-1.3.9/Makefile.am.me 2021-07-19 12:30:46.169839551 +0200
+++ powerpc-utils-1.3.9/Makefile.am 2021-07-19 12:33:10.336034675 +0200
@@ -48,7 +48,14 @@ man_MANS = \
man/vcpustat.8 \
man/rtas_dbg.8 \
man/drmgr.8 \
- man/lparnumascore.8
+ man/lparnumascore.8 \
+ man/lsdevinfo.8 \
+ man/rtas_event_decode.8 \
+ man/ls-vdev.8 \
+ man/lsprop.8 \
+ man/ls-veth.8 \
+ man/nvsetenv.8 \
+ man/ls-vscsi.8
EXTRA_DIST += $(bin_SCRIPTS) $(sbin_SCRIPTS) $(man_MANS)

View File

@ -1,132 +0,0 @@
commit 97269d301797e23b75d0c7a5cb63ce280783f615
Author: Laurent Dufour <ldufour@linux.ibm.com>
Date: Thu Mar 4 14:51:38 2021 +0100
lparstat: add -x option for the security flavor
This allows the user to get the security flavor settings for the LPAR.
The output is :
$ lparstat -x
Speculative Execution Mode : 1
Where the output number means
0 = Speculative execution fully enabled
1 = Speculative execution controls to mitigate user-to-kernel side-channel
attacks
2 = Speculative execution controls to mitigate user-to-kernel and
user-to-user side-channel attacks
In the case the running kernel is not exposing the security flavor in
/proc/powerpc/lparcfg, the output is:
$ lparstat -x
Speculative Execution Mode : -
Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com>
Signed-off-by: Tyrel Datwyler <tyreld@linux.ibm.com>
diff --git a/src/lparstat.c b/src/lparstat.c
index 23e4b85..00922c4 100644
--- a/src/lparstat.c
+++ b/src/lparstat.c
@@ -42,6 +42,7 @@
static bool o_legacy = false;
static bool o_scaled = false;
+static bool o_security = false;
static int threads_per_cpu;
static int cpus_in_system;
@@ -1152,6 +1153,15 @@ void print_scaled_output(int interval, int count)
} while (--count > 0);
}
+static void print_security_flavor(void)
+{
+ char value[64];
+ char *descr;
+
+ get_sysdata("security_flavor", &descr, value);
+ fprintf(stdout, "%-45s: %s\n", descr, value);
+}
+
static void usage(void)
{
printf("Usage: lparstat [ options ]\n\tlparstat <interval> [ count ]\n\n"
@@ -1159,6 +1169,7 @@ static void usage(void)
"\t-h, --help Show this message and exit.\n"
"\t-V, --version \tDisplay lparstat version information.\n"
"\t-i Lists details on the LPAR configuration.\n"
+ "\t-x Print the security mode settings for the LPAR.\n"
"\t-E Print SPURR metrics.\n"
"\t-l, --legacy Print the report in legacy format.\n"
"interval The interval parameter specifies the amount of time between each report.\n"
@@ -1184,7 +1195,7 @@ int main(int argc, char *argv[])
exit(1);
}
- while ((c = getopt_long(argc, argv, "iEVhl",
+ while ((c = getopt_long(argc, argv, "iEVhlx",
long_opts, &opt_index)) != -1) {
switch(c) {
case 'i':
@@ -1199,6 +1210,9 @@ int main(int argc, char *argv[])
case 'V':
printf("lparstat - %s\n", VERSION);
return 0;
+ case 'x':
+ o_security = true;
+ break;
case 'h':
usage();
return 0;
@@ -1223,6 +1237,8 @@ int main(int argc, char *argv[])
if (i_option)
print_iflag_data();
+ else if (o_security)
+ print_security_flavor();
else if (o_scaled) {
print_scaled_output(interval, count);
close_cpu_sysfs_fds(threads_in_system);
diff --git a/src/lparstat.h b/src/lparstat.h
index 9b7117f..26ed4ba 100644
--- a/src/lparstat.h
+++ b/src/lparstat.h
@@ -302,6 +302,10 @@ struct sysentry system_data[] = {
.descr = "Idle CPU value - SPURR",
.get = &get_cpu_idle_spurr},
+ /* Security flavor */
+ {.name = "security_flavor",
+ .descr = "Speculative Execution Mode"},
+
{.name[0] = '\0'},
};
diff -up powerpc-utils-1.3.8/man/lparstat.8.me powerpc-utils-1.3.8/man/lparstat.8
--- powerpc-utils-1.3.8/man/lparstat.8.me 2021-04-20 15:49:18.305532697 +0200
+++ powerpc-utils-1.3.8/man/lparstat.8 2021-04-20 15:52:04.703021972 +0200
@@ -209,6 +209,20 @@ The variable memory capacity weight of t
.TP
.SH
.TP
+\fB\-x\fR
+Display the LPAR security flavor mode
+.RS
+.TP
+.B 0
+Speculative execution fully enabled
+.TP
+.B 1
+Speculative execution controls to mitigate user-to-kernel side-channel attacks
+.TP
+.B 2
+Speculative execution controls to mitigate user-to-kernel and user-to-user side-channel attacks
+.RE
+.TP
\fB\-E\fR
Display Scaled Processor Utilization Resource Register(SPURR) based CPU utilization.
.RS

View File

@ -1,40 +0,0 @@
lparstat -E option reports the actual and normalized system utilization
based on the PURR/SPURR registers. Update the lparstat man page too with
the -E option details.
Reported-by: Pavithra Prakash <pavr...@in.ibm.com>
Signed-off-by: Kamalesh Babulal <kama...@linux.vnet.ibm.com>
---
Applies on top of next branch.
man/lparstat.8 | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/man/lparstat.8 b/man/lparstat.8
index 0f4c923aaef5..d00e42600165 100644
--- a/man/lparstat.8
+++ b/man/lparstat.8
@@ -209,6 +209,17 @@ The variable memory capacity weight of the LPAR.
.TP
.SH
.TP
+\fB\-E\fR
+Display Scaled Processor Utilization Resource Register(SPURR) based CPU utilization.
+.RS
+.RS
+Actual CPU utilization is based on Processor Utilization Resource Register(PURR).
+.RS
+.RE
+Normalized CPU utilization is based on Scaled Processor Utilization Resource Register(SPURR).
+.TP
+.SH
+.TP
\fB\-l, --legacy\fR
Display the report in legacy format.
.RS
base-commit: 60d9f54b13b75feee3fd7b25a92b24d0d97ea984
--
2.26.2

View File

@ -1,6 +1,6 @@
Name: powerpc-utils
Version: 1.3.8
Release: 9%{?dist}
Version: 1.3.9
Release: 1%{?dist}
Summary: PERL-based scripts for maintaining and servicing PowerPC systems
License: GPLv2
@ -26,29 +26,11 @@ Requires: perl(Data::Dumper)
Requires: %{name}-core = %{version}-%{release}
Patch1: powerpc-utils-1.3.8-man.patch
Patch2: powerpc-utils-1.3.8-makefile.patch
Patch2: powerpc-utils-1.3.9-makefile.patch
Patch3: powerpc-utils-1.3.5-pseries_platform-man.patch
Patch4: powerpc-utils-1.3.5-update_flash_nv.patch
Patch5: powerpc-utils-1.3.8-install-man.patch
Patch6: powerpc-utils-manpage-lparstat.patch
Patch7: powerpc-utils-1.3.8-hcnmgr.patch
Patch8: 0001-ofpathname-Use-NVMe-controller-physical-nsid.patch
Patch9: 0002-sys_ident--skip-length-field-from-search.patch
Patch10: powerpc-utils-1.3.8-f1ec5f04fdac3e87c3c85c2d85f79339d916e864.patch
Patch11: powerpc-utils-1.3.8-4b2d10942e2d964ecc1fe58c9460c34993ff10be.patch
Patch12: powerpc-utils-1.3.8-e51995667279164a6c8ce4c3ba0217fffda22d6d.patch
Patch14: powerpc-utils-lpartstat_x_option-97269d301797e23b75d0c7a5cb63ce280783f615.patch
Patch15: powerpc-utils-1.3.8-using-od.patch
Patch16: powerpc-utils-1.3.8-0b59d4a372aa266caa75f3b6a253b8f5aeaf3802.patch
Patch17: powerpc-utils-1.3.8-d9bcb21179ccfea122f326aca4690afe0f7de0c6.patch
Patch18: powerpc-utils-1.3.8-e25d71be411b610e5e889f8efaaf04b38c2d9ecb.patch
Patch19: powerpc-utils-1.3.8-1cb8bd89d6386c60e75c47d4a4452d3f130d5138.patch
Patch20: powerpc-utils-1.3.8-366e17553ed647613668678c2d301d369038f41b.patch
Patch21: 0001-drmgr-don-t-open-sysfs-file-for-each-command.patch
Patch22: 0002-drmgr-read-the-CPU-NUMA-topology.patch
Patch23: 0003-drmgr-introduce-NUMA-based-LMB-removal.patch
Patch24: 0001-drmgr-fix-remove-by-index-operation.patch
Patch25: powerpc-utils-1.3.8-numa-header.patch
%description
@ -189,6 +171,7 @@ systemctl daemon-reload >/dev/null 2>&1 || :
%{_sbindir}/ofpathname
%{_sbindir}/pseries_platform
%{_sbindir}/drmgr
%{_sbindir}/lparnumascore
%{_mandir}/man1/amsstat.1*
%{_mandir}/man5/lparcfg.5*
%{_mandir}/man8/activate_firmware.8*
@ -220,9 +203,13 @@ systemctl daemon-reload >/dev/null 2>&1 || :
%{_mandir}/man8/nvram.8*
%{_mandir}/man8/ofpathname.8*
%{_mandir}/man8/drmgr.8*
%{_mandir}/man8/lparnumascore.8*
%changelog
* Mon Jul 19 2021 Than Ngo <than@redhat.com> - 1.3.9-1
- Resolves: #1873868, rebase to 1.3.9
* Fri Jun 11 2021 Than Ngo <than@redhat.com> - 1.3.8-9
- Resolves: #1937038, New lparstat -x option to report the security flavor
- Use od instead xxd

View File

@ -1 +1 @@
SHA512 (powerpc-utils-1.3.8.tar.gz) = 7c9057131315d8fac6154c410562d9cd1807f76241bcc190566ece05b0ddd2ddbe1097749b7e644ccc9691641d40460a877368162e9b951fe790f53f638f635e
SHA512 (powerpc-utils-1.3.9.tar.gz) = db764b904b1f8b371fc94cb7334809a6973982b44e33862d2cd5a5d9fb2d38e2fa8dc06f27d698732fb0b4fb0ee86c5c62a3ceb1e08ffc2cedb21552ea4d0079