diff --git a/0001-drmgr-don-t-open-sysfs-file-for-each-command.patch b/0001-drmgr-don-t-open-sysfs-file-for-each-command.patch new file mode 100644 index 0000000..f6755c0 --- /dev/null +++ b/0001-drmgr-don-t-open-sysfs-file-for-each-command.patch @@ -0,0 +1,87 @@ +From 014e8ba4580c7917e258df084776c16079dc07ce Mon Sep 17 00:00:00 2001 +From: Laurent Dufour +Date: Tue, 24 Nov 2020 19:28:48 +0100 +Subject: [PATCH 1/3] drmgr: don't open sysfs file for each command + +The new __do_kernel_dlpar() API will be used in later commit to remove by +DRC Index LMB per LMB. This will avoiding opennig and closing the fd each +time. + +The fd closing will now be done at the process exit time. + +In addition add an optinal parameter to silently ignore some error. + +Also, change the log level of the "success" message to debug to match +the previous one saying "Trying.." + +Signed-off-by: Laurent Dufour +--- + src/drmgr/common.c | 22 +++++++++++++--------- + src/drmgr/dr.h | 3 ++- + 2 files changed, 15 insertions(+), 10 deletions(-) + +diff --git a/src/drmgr/common.c b/src/drmgr/common.c +index 5e8135bcf77e..25d244cb2f57 100644 +--- a/src/drmgr/common.c ++++ b/src/drmgr/common.c +@@ -1469,32 +1469,36 @@ int kernel_dlpar_exists(void) + * @param cmd command string to write to sysfs + * @returns 0 on success, !0 otherwise + */ +-int do_kernel_dlpar(const char *cmd, int cmdlen) ++int __do_kernel_dlpar(const char *cmd, int cmdlen, int silent_error) + { +- int fd, rc; ++ static int fd = -1; ++ int rc; + int my_errno; + + say(DEBUG, "Initiating kernel DLPAR \"%s\"\n", cmd); + + /* write to file */ +- fd = open(SYSFS_DLPAR_FILE, O_WRONLY); +- if (fd <= 0) { +- say(ERROR, "Could not open %s to initiate DLPAR request\n", +- SYSFS_DLPAR_FILE); +- return -1; ++ if (fd == -1) { ++ fd = open(SYSFS_DLPAR_FILE, O_WRONLY); ++ if (fd <= 0) { ++ say(ERROR, "Could not open %s to initiate DLPAR request\n", ++ SYSFS_DLPAR_FILE); ++ return -1; ++ } + } + + rc = write(fd, cmd, cmdlen); + my_errno = errno; 
+- close(fd); + if (rc <= 0) { ++ if (silent_error) ++ return (my_errno == 0) ? -1 : -my_errno; + /* write does not set errno for rc == 0 */ + say(ERROR, "Failed to write to %s: %s\n", SYSFS_DLPAR_FILE, + (rc == 0) ? "wrote 0 bytes" : strerror(my_errno)); + return -1; + } + +- say(INFO, "Success\n"); ++ say(DEBUG, "Success\n"); + return 0; + } + +diff --git a/src/drmgr/dr.h b/src/drmgr/dr.h +index f171bfea73c3..00d2fffc9919 100644 +--- a/src/drmgr/dr.h ++++ b/src/drmgr/dr.h +@@ -172,5 +172,6 @@ enum drc_type to_drc_type(const char *); + int handle_prrn(void); + + int kernel_dlpar_exists(void); +-int do_kernel_dlpar(const char *, int); ++int __do_kernel_dlpar(const char *, int, int); ++#define do_kernel_dlpar(c, l) __do_kernel_dlpar(c, l, 0) + #endif +-- +2.29.2 + diff --git a/0001-drmgr-fix-remove-by-index-operation.patch b/0001-drmgr-fix-remove-by-index-operation.patch new file mode 100644 index 0000000..3aba873 --- /dev/null +++ b/0001-drmgr-fix-remove-by-index-operation.patch @@ -0,0 +1,40 @@ +From 16469b696959aee4ce32d9f77483e1e3f192e82d Mon Sep 17 00:00:00 2001 +From: Laurent Dufour +Date: Fri, 16 Apr 2021 18:10:36 +0200 +Subject: [PATCH] drmgr: fix remove by index operation + +The commit e9f06531356f ("drmgr: introduce NUMA based LMB removal") +introduce a special processing when NUMA is on and the remove by count +operation is done. + +Unfortunately, that code is also triggered when doing a remove by index +operation (-s argument) becauses usr_drc_count is set to 1. As a +consequence the index constraint is not respected and any LMB can be +removed. + +Add a check agains usr_drc_index which is set when a remove by index +operation is done to ensure the numa removal code is not triggered in that +case. 
+ +Fixes: e9f06531356f ("drmgr: introduce NUMA based LMB removal") +Signed-off-by: Laurent Dufour +--- + src/drmgr/drslot_chrp_mem.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/src/drmgr/drslot_chrp_mem.c b/src/drmgr/drslot_chrp_mem.c +index f17c94adc270..8db98bb9e9ea 100644 +--- a/src/drmgr/drslot_chrp_mem.c ++++ b/src/drmgr/drslot_chrp_mem.c +@@ -1749,7 +1749,7 @@ int do_mem_kernel_dlpar(void) + int rc, offset; + + +- if (usr_action == REMOVE && usr_drc_count) { ++ if (usr_action == REMOVE && usr_drc_count && !usr_drc_index) { + build_numa_topology(); + if (numa_enabled) { + if (!numa_based_remove(usr_drc_count)) +-- +2.31.1 + diff --git a/0002-drmgr-read-the-CPU-NUMA-topology.patch b/0002-drmgr-read-the-CPU-NUMA-topology.patch new file mode 100644 index 0000000..4d516cf --- /dev/null +++ b/0002-drmgr-read-the-CPU-NUMA-topology.patch @@ -0,0 +1,438 @@ +From 88caa91a4c8f0ac2376da433f697bc6845595dac Mon Sep 17 00:00:00 2001 +From: Laurent Dufour +Date: Wed, 2 Dec 2020 16:10:57 +0100 +Subject: [PATCH 2/3] drmgr: read the CPU NUMA topology + +This will be used in the next commit to compute LMB removal based on the +NUMA topology. + +The NUMA topology is read using the libnuma, so a dependency against it is +added in the configure file. 
+ +Signed-off-by: Laurent Dufour +--- + Makefile.am | 5 +- + configure.ac | 4 + + src/drmgr/common_numa.c | 268 ++++++++++++++++++++++++++++++++++++++++ + src/drmgr/common_numa.h | 83 +++++++++++++ + 4 files changed, 359 insertions(+), 1 deletion(-) + create mode 100644 src/drmgr/common_numa.c + create mode 100644 src/drmgr/common_numa.h + +diff --git a/Makefile.am b/Makefile.am +index 2ff2232537df..31baaa74b353 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -155,6 +155,7 @@ src_drmgr_drmgr_SOURCES = \ + src/drmgr/common_cpu.c \ + src/drmgr/common_ofdt.c \ + src/drmgr/common_pci.c \ ++ src/drmgr/common_numa.c \ + src/drmgr/drmgr.c \ + src/drmgr/drmig_chrp_pmig.c \ + src/drmgr/drslot_chrp_cpu.c \ +@@ -171,13 +172,14 @@ noinst_HEADERS += \ + src/drmgr/drcpu.h \ + src/drmgr/dr.h \ + src/drmgr/drmem.h \ ++ src/drmgr/numa.h \ + src/drmgr/drpci.h \ + src/drmgr/rtas_calls.h \ + src/drmgr/ofdt.h \ + src/drmgr/rtas_calls.h \ + src/drmgr/options.c + +-src_drmgr_drmgr_LDADD = -lrtas ++src_drmgr_drmgr_LDADD = -lrtas -lnuma + + src_drmgr_lsslot_SOURCES = \ + src/drmgr/lsslot.c \ +@@ -186,6 +188,7 @@ src_drmgr_lsslot_SOURCES = \ + src/drmgr/common_cpu.c \ + src/drmgr/common_pci.c \ + src/drmgr/common_ofdt.c \ ++ src/drmgr/common_numa.c \ + src/drmgr/rtas_calls.c \ + src/drmgr/drslot_chrp_mem.c \ + $(pseries_platform_SOURCES) +diff --git a/configure.ac b/configure.ac +index de3c6758389a..0239754cc4f4 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -42,6 +42,10 @@ AC_CHECK_HEADER(zlib.h, + [AC_CHECK_LIB(z, inflate, [], [AC_MSG_FAILURE([zlib library is required for compilation])])], + [AC_MSG_FAILURE([zlib.h is required for compiliation])]) + ++AC_CHECK_HEADER(numa.h, ++ [AC_CHECK_LIB(numa, numa_available, [], [AC_MSG_FAILURE([numa library is required for compilation])])], ++ [AC_MSG_FAILURE([numa.h is required for compiliation])]) ++ + # check for librtas + AC_ARG_WITH([librtas], + [AS_HELP_STRING([--without-librtas], +diff --git a/src/drmgr/common_numa.c 
b/src/drmgr/common_numa.c +new file mode 100644 +index 000000000000..5778769b25b6 +--- /dev/null ++++ b/src/drmgr/common_numa.c +@@ -0,0 +1,268 @@ ++/** ++ * @file common_numa.c ++ * ++ * Copyright (C) IBM Corporation 2020 ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version 2 ++ * of the License, or (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. ++ */ ++ ++#include ++#include ++#include ++ ++#include "dr.h" ++#include "ofdt.h" ++#include "drmem.h" /* for DYNAMIC_RECONFIG_MEM */ ++#include "common_numa.h" ++ ++#define RTAS_DIRECTORY "/proc/device-tree/rtas" ++#define CHOSEN_DIRECTORY "/proc/device-tree/chosen" ++#define ASSOC_REF_POINTS "ibm,associativity-reference-points" ++#define ASSOC_LOOKUP_ARRAYS "ibm,associativity-lookup-arrays" ++#define ARCHITECTURE_VEC_5 "ibm,architecture-vec-5" ++ ++/* ++ * Allocate and read a property, return the size. ++ * The read property is not converted to the host endianess. 
++ */ ++static int load_property(char *dir, char *prop, uint32_t **buf) ++{ ++ int size; ++ ++ size = get_property_size(dir, prop); ++ if (!size) ++ return -ENOENT; ++ ++ *buf = zalloc(size); ++ if (!*buf) { ++ say(ERROR, "Could not allocate buffer read %s (%d bytes)\n", ++ prop, size); ++ return -ENOMEM; ++ } ++ ++ if (get_property(dir, prop, *buf, size)) { ++ free(*buf); ++ say(ERROR, "Can't retrieve %s/%s\n", dir, prop); ++ return -EINVAL; ++ } ++ ++ return size; ++} ++ ++/* ++ * Get the minimal common depth, based on the form 1 of the ibm,associativ- ++ * ity-reference-points property. We only support that form. ++ * ++ * We should check that the "ibm,architecture-vec-5" property byte 5 bit 0 ++ * has the value of one. ++ */ ++static int get_min_common_depth(struct numa_topology *numa) ++{ ++ int size; ++ uint32_t *p; ++ unsigned char val; ++ ++ size = load_property(CHOSEN_DIRECTORY, ARCHITECTURE_VEC_5, &p); ++ if (size < 0) ++ return size; ++ ++ /* PAPR byte start at 1 (and not 0) but there is the length field */ ++ if (size < 6) { ++ report_unknown_error(__FILE__, __LINE__); ++ free(p); ++ return -EINVAL; ++ } ++ val = ((unsigned char *)p)[5]; ++ free(p); ++ ++ if (!(val & 0x80)) ++ return -ENOTSUP; ++ ++ size = load_property(RTAS_DIRECTORY, ASSOC_REF_POINTS, &p); ++ if (size <= 0) ++ return size; ++ if (size < sizeof(uint32_t)) { ++ report_unknown_error(__FILE__, __LINE__); ++ free(p); ++ return -EINVAL; ++ } ++ ++ /* Get the first entry */ ++ numa->min_common_depth = be32toh(*p); ++ free(p); ++ return 0; ++} ++ ++static int get_assoc_arrays(struct numa_topology *numa) ++{ ++ int size; ++ int rc; ++ uint32_t *prop, i; ++ struct assoc_arrays *aa = &numa->aa; ++ ++ size = load_property(DYNAMIC_RECONFIG_MEM, ASSOC_LOOKUP_ARRAYS, &prop); ++ if (size < 0) ++ return size; ++ ++ size /= sizeof(uint32_t); ++ if (size < 2) { ++ say(ERROR, "Could not find the associativity lookup arrays\n"); ++ free(prop); ++ return -EINVAL; ++ } ++ ++ aa->n_arrays = be32toh(prop[0]); 
++ aa->array_sz = be32toh(prop[1]); ++ ++ rc = -EINVAL; ++ if (numa->min_common_depth > aa->array_sz) { ++ say(ERROR, "Bad min common depth or associativity array size\n"); ++ goto out_free; ++ } ++ ++ /* Sanity check */ ++ if (size != (aa->n_arrays * aa->array_sz + 2)) { ++ say(ERROR, "Bad size of the associativity lookup arrays\n"); ++ goto out_free; ++ } ++ ++ aa->min_array = zalloc(aa->n_arrays * sizeof(uint32_t)); ++ ++ /* Keep only the most significant value */ ++ for (i = 0; i < aa->n_arrays; i++) { ++ int prop_index = i * aa->array_sz + numa->min_common_depth + 1; ++ ++ aa->min_array[i] = be32toh(prop[prop_index]); ++ } ++ rc = 0; ++ ++out_free: ++ free(prop); ++ return rc; ++} ++ ++struct numa_node *numa_fetch_node(struct numa_topology *numa, int nid) ++{ ++ struct numa_node *node; ++ ++ if (nid < 0 || nid >= MAX_NUMNODES) { ++ report_unknown_error(__FILE__, __LINE__); ++ return NULL; ++ } ++ ++ node = numa->nodes[nid]; ++ if (node) ++ return node; ++ ++ node = zalloc(sizeof(struct numa_node)); ++ if (!node) { ++ say(ERROR, "Can't allocate a new node\n"); ++ return NULL; ++ } ++ ++ node->node_id = nid; ++ ++ if (!numa->node_count || nid < numa->node_min) ++ numa->node_min = nid; ++ if (nid > numa->node_max) ++ numa->node_max = nid; ++ ++ numa->nodes[nid] = node; ++ numa->node_count++; ++ ++ return node; ++} ++ ++/* ++ * Read the number of CPU for each node using the libnuma to get the details ++ * from sysfs.
++ */ ++static int read_numa_topology(struct numa_topology *numa) ++{ ++ struct bitmask *cpus; ++ struct numa_node *node; ++ int rc, max_node, nid, i; ++ ++ if (numa_available() < 0) ++ return -ENOENT; ++ ++ max_node = numa_max_node(); ++ if (max_node >= MAX_NUMNODES) { ++ say(ERROR, "Too many nodes %d (max:%d)\n", ++ max_node, MAX_NUMNODES); ++ return -EINVAL; ++ } ++ ++ rc = 0; ++ ++ /* In case of allocation error, the libnuma is calling exit() */ ++ cpus = numa_allocate_cpumask(); ++ ++ for (nid = 0; nid <= max_node; nid++) { ++ ++ if (!numa_bitmask_isbitset(numa_nodes_ptr, nid)) ++ continue; ++ ++ node = numa_fetch_node(numa, nid); ++ if (!node) { ++ rc = -ENOMEM; ++ break; ++ } ++ ++ rc = numa_node_to_cpus(nid, cpus); ++ if (rc < 0) ++ break; ++ ++ /* Count the CPUs in that node */ ++ for (i = 0; i < cpus->size; i++) ++ if (numa_bitmask_isbitset(cpus, i)) ++ node->n_cpus++; ++ ++ numa->cpu_count += node->n_cpus; ++ } ++ ++ numa_bitmask_free(cpus); ++ ++ if (rc) { ++ numa_foreach_node(numa, nid, node) ++ node->n_cpus = 0; ++ numa->cpu_count = 0; ++ } ++ ++ return rc; ++} ++ ++int numa_get_topology(struct numa_topology *numa) ++{ ++ int rc; ++ ++ rc = get_min_common_depth(numa); ++ if (rc) ++ return rc; ++ ++ ++ rc = get_assoc_arrays(numa); ++ if (rc) ++ return rc; ++ ++ rc = read_numa_topology(numa); ++ if (rc) ++ return rc; ++ ++ if (!numa->node_count) ++ return -1; ++ ++ return 0; ++} +diff --git a/src/drmgr/common_numa.h b/src/drmgr/common_numa.h +new file mode 100644 +index 000000000000..4d0054926819 +--- /dev/null ++++ b/src/drmgr/common_numa.h +@@ -0,0 +1,83 @@ ++/** ++ * @file numa.h ++ * ++ * Copyright (C) IBM Corporation 2020 ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version 2 ++ * of the License, or (at your option) any later version. 
++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. ++ */ ++#ifndef _NUMA_H_ ++#define _NUMA_H_ ++ ++#define MAX_NUMNODES 256 ++#define NUMA_NO_NODE -1 ++ ++struct numa_node { ++ int node_id; ++ unsigned int n_cpus; ++ unsigned int n_lmbs; ++ unsigned int ratio; ++ struct dr_node *lmbs; /* linked by lmb_numa_next */ ++ struct numa_node *ratio_next; ++}; ++ ++struct assoc_arrays { ++ uint32_t n_arrays; ++ uint32_t array_sz; ++ uint32_t *min_array; ++}; ++ ++struct numa_topology { ++ unsigned int cpu_count; ++ unsigned int lmb_count; ++ unsigned int cpuless_node_count; ++ unsigned int cpuless_lmb_count; ++ unsigned int node_count, node_min, node_max; ++ struct numa_node *nodes[MAX_NUMNODES]; ++ struct numa_node *ratio; ++ uint32_t min_common_depth; ++ struct assoc_arrays aa; ++}; ++ ++int numa_get_topology(struct numa_topology *numa); ++struct numa_node *numa_fetch_node(struct numa_topology *numa, int node_id); ++ ++static inline int numa_aa_index_to_node(struct numa_topology *numa, ++ uint32_t aa_index) ++{ ++ if (aa_index < numa->aa.n_arrays) ++ return numa->aa.min_array[aa_index]; ++ return NUMA_NO_NODE; ++} ++ ++static inline int next_node(struct numa_topology *numa, int nid, ++ struct numa_node **node) ++{ ++ for (nid++; nid <= numa->node_max; nid++) ++ if (numa->nodes[nid]) { ++ *node = numa->nodes[nid]; ++ break; ++ } ++ return nid; ++} ++ ++#define numa_foreach_node(numa, nid, node) \ ++ for (nid = (numa)->node_min, node = (numa)->nodes[nid]; \ ++ nid <= (numa)->node_max; \ ++ nid = next_node(numa, nid, &(node))) ++ ++#define 
numa_foreach_node_by_ratio(numa, node) \ ++ for (node = (numa)->ratio; node; node = node->ratio_next) ++ ++#endif /* _NUMA_H_ */ +-- +2.29.2 + diff --git a/0003-drmgr-introduce-NUMA-based-LMB-removal.patch b/0003-drmgr-introduce-NUMA-based-LMB-removal.patch new file mode 100644 index 0000000..ee7c006 --- /dev/null +++ b/0003-drmgr-introduce-NUMA-based-LMB-removal.patch @@ -0,0 +1,443 @@ +From 3c549c7494e729a68b64ac5519bcf1506b24f945 Mon Sep 17 00:00:00 2001 +From: Laurent Dufour +Date: Wed, 25 Nov 2020 18:03:45 +0100 +Subject: [PATCH 3/3] drmgr: introduce NUMA based LMB removal + +When the NUMA topology can be read, all the LMBs found in the Device Tree +are linked to the corresponding node. LMBs not associated with a node are +considered as not used. + +LMBs associated with CPU less nodes are accounted separately because they will +be targeted first to be removed. The LMBs are removed from the CPU less nodes +to reach an average number of LMBs per CPU less node. + +Nodes with CPUs have a ratio indexed on their number of CPUs. The more CPUs a +node has, the fewer LMBs will be removed from it. This way nodes with a +high number of CPUs will get a higher amount of memory. + +When a LMB can't be removed (because its memory can't be offlined by the +kernel), the LMB count for the node is decremented and the LMB is removed from +the node's LMB list. This way, it is no longer accounted as 'active' and the +removal operation will continue without taking it into account anymore. + +The removal is done through the remove by DRC index API, allowing one LMB +to be removed at a time. One future optimization would be to extend that API to +remove a linear range of LMBs each time. + +If the NUMA topology can't be read, we fall back to the legacy remove +way.
+ +Signed-off-by: Laurent Dufour +--- + src/drmgr/drslot_chrp_mem.c | 335 +++++++++++++++++++++++++++++++++++- + src/drmgr/ofdt.h | 2 + + 2 files changed, 336 insertions(+), 1 deletion(-) + +diff --git a/src/drmgr/drslot_chrp_mem.c b/src/drmgr/drslot_chrp_mem.c +index 502aa3e9fff0..47d9f7b8ed90 100644 +--- a/src/drmgr/drslot_chrp_mem.c ++++ b/src/drmgr/drslot_chrp_mem.c +@@ -31,12 +31,16 @@ + #include "dr.h" + #include "ofdt.h" + #include "drmem.h" ++#include "common_numa.h" + + static int block_sz_bytes = 0; + static char *state_strs[] = {"offline", "online"}; + + static char *usagestr = "-c mem {-a | -r} {-q -p {variable_weight | ent_capacity} | {-q | -s [ | ]}}"; + ++static struct numa_topology numa; ++static int numa_enabled = 0; ++ + /** + * mem_usage + * @brief return usage string +@@ -306,6 +310,31 @@ get_mem_node_lmbs(struct lmb_list_head *lmb_list) + return rc; + } + ++static int link_lmb_to_numa_node(struct dr_node *lmb) ++{ ++ int nid; ++ struct numa_node *node; ++ ++ nid = numa_aa_index_to_node(&numa, lmb->lmb_aa_index); ++ if (nid == NUMA_NO_NODE) ++ return 0; ++ ++ node = numa_fetch_node(&numa, nid); ++ if (!node) ++ return -ENOMEM; ++ ++ lmb->lmb_numa_next = node->lmbs; ++ node->lmbs = lmb; ++ node->n_lmbs++; ++ ++ if (node->n_cpus) ++ numa.lmb_count++; ++ else ++ numa.cpuless_lmb_count++; ++ ++ return 0; ++} ++ + int add_lmb(struct lmb_list_head *lmb_list, uint32_t drc_index, + uint64_t address, uint64_t lmb_sz, uint32_t aa_index, + uint32_t flags) +@@ -324,6 +353,9 @@ int add_lmb(struct lmb_list_head *lmb_list, uint32_t drc_index, + lmb->lmb_address = address; + lmb->lmb_aa_index = aa_index; + ++ if (numa_enabled && link_lmb_to_numa_node(lmb)) ++ return -ENOMEM; ++ + if (flags & DRMEM_ASSIGNED) { + int rc; + +@@ -490,7 +522,7 @@ get_dynamic_reconfig_lmbs(struct lmb_list_head *lmb_list) + + if (stat(DYNAMIC_RECONFIG_MEM_V1, &sbuf) == 0) { + rc = get_dynamic_reconfig_lmbs_v1(lmb_sz, lmb_list); +- } else if (is_lsslot_cmd && ++ } else if 
((is_lsslot_cmd || numa_enabled) && + stat(DYNAMIC_RECONFIG_MEM_V2, &sbuf) == 0) { + rc = get_dynamic_reconfig_lmbs_v2(lmb_sz, lmb_list); + } else { +@@ -1424,11 +1456,312 @@ int valid_mem_options(void) + return 0; + } + ++static int remove_lmb_by_index(uint32_t drc_index) ++{ ++ char cmdbuf[128]; ++ int offset; ++ ++ offset = sprintf(cmdbuf, "memory remove index 0x%x", drc_index); ++ ++ return __do_kernel_dlpar(cmdbuf, offset, 1 /* Don't report error */); ++} ++ ++static int remove_lmb_from_node(struct numa_node *node, uint32_t count) ++{ ++ struct dr_node *lmb; ++ int err, done = 0, unlinked = 0; ++ ++ say(DEBUG, "Try removing %d / %d LMBs from node %d\n", ++ count, node->n_lmbs, node->node_id); ++ ++ for (lmb = node->lmbs; lmb && done < count; lmb = lmb->lmb_numa_next) { ++ unlinked ++; ++ err = remove_lmb_by_index(lmb->drc_index); ++ if (err) ++ say(WARN,"Can't remove LMB node:%d index:0x%x: %s\n", ++ node->node_id, lmb->drc_index, strerror(-err)); ++ else ++ done++; ++ } ++ ++ /* ++ * Decrement the node LMB's count since whatever is the success ++ * of the removal operation, it will not be tried again on that ++ * LMB. ++ */ ++ node->n_lmbs -= unlinked; ++ ++ /* ++ * Update the node's list of LMB to not process the one we removed or ++ * tried to removed again. ++ */ ++ node->lmbs = lmb; ++ ++ /* Update numa's counters */ ++ if (node->n_cpus) ++ numa.lmb_count -= unlinked; ++ else ++ numa.cpuless_lmb_count -= unlinked; ++ ++ if (!node->n_lmbs) { ++ node->ratio = 0; /* for sanity only */ ++ if (node->n_cpus) ++ numa.cpu_count -= node->n_cpus; ++ else ++ numa.cpuless_node_count--; ++ } ++ ++ say(INFO, "Removed %d LMBs from node %d\n", done, node->node_id); ++ return done; ++} ++ ++#define min(a,b) ((a < b) ? a : b) ++ ++static void update_cpuless_node_ratio(void) ++{ ++ struct numa_node *node; ++ int nid; ++ ++ /* ++ * Assumptions: ++ * 1. numa->cpuless_node_count is up to date ++ * 2.
numa->cpuless_lmb_count is up to date ++ * Nodes with no memory and nodes with CPUs are ignored here. ++ */ ++ numa_foreach_node(&numa, nid, node) { ++ if (node->n_cpus ||!node->n_lmbs) ++ continue; ++ node->ratio = (node->n_lmbs * 100) / numa.cpuless_lmb_count; ++ } ++} ++ ++/* ++ * Remove LMBs from node without CPUs only. ++ * The more the node has LMBs, the more LMBs will be removed from it. ++ * ++ * We have to retry the operation multiple times because some LMB cannot be ++ * removed due to the page usage in the kernel. In that case, that LMB is no ++ * more taken in account and the node's LMB count is decremented, assuming that ++ * LMB is unremovable at this time. Thus each node's ratio has to be computed on ++ * each iteration. This is not a big deal, usually, there are not so much nodes. ++ */ ++static int remove_cpuless_lmbs(uint32_t count) ++{ ++ struct numa_node *node; ++ int nid; ++ uint32_t total = count, todo, done = 0, this_loop; ++ ++ while (count) { ++ count = min(count, numa.cpuless_lmb_count); ++ if (!count) ++ break; ++ ++ update_cpuless_node_ratio(); ++ ++ this_loop = 0; ++ numa_foreach_node(&numa, nid, node) { ++ if (!node->n_lmbs || node->n_cpus) ++ continue; ++ ++ todo = (count * node->ratio) / 100; ++ todo = min(todo, node->n_lmbs); ++ /* Fix rounded value to 0 */ ++ if (!todo && node->n_lmbs) ++ todo = (count - this_loop); ++ ++ if (todo) ++ todo = remove_lmb_from_node(node, todo); ++ ++ this_loop += todo; ++ done += todo; ++ if (done >= total) ++ break; ++ } ++ ++ /* Don't continue if we didn't make any progress. */ ++ if (!this_loop) ++ break; ++ ++ count -= this_loop; ++ } ++ ++ say(DEBUG, "%d / %d LMBs removed from the CPU less nodes\n", ++ done, total); ++ return done; ++} ++ ++static void update_node_ratio(void) ++{ ++ int nid; ++ struct numa_node *node, *n, **p; ++ uint32_t cpu_ratio, mem_ratio; ++ ++ /* ++ * Assumptions: ++ * 1. numa->cpu_count is up to date ++ * 2. 
numa->lmb_count is up to date ++ * Nodes with no memory and nodes with no CPU are ignored here. ++ */ ++ ++ numa.ratio = NULL; ++ numa_foreach_node(&numa, nid, node) { ++ if (!node->n_lmbs || !node->n_cpus) ++ continue; ++ cpu_ratio = (node->n_cpus * 100) / numa.cpu_count; ++ mem_ratio = (node->n_lmbs * 100) / numa.lmb_count; ++ ++ /* Say that CPU ratio is 90% of the ratio */ ++ node->ratio = (cpu_ratio * 9 + mem_ratio) / 10; ++ } ++ ++ /* Create an ordered link of the nodes */ ++ numa_foreach_node(&numa, nid, node) { ++ if (!node->n_lmbs || !node->n_cpus) ++ continue; ++ ++ p = &numa.ratio; ++ for (n = numa.ratio; ++ n && n->ratio < node->ratio; n = n->ratio_next) ++ p = &n->ratio_next; ++ *p = node; ++ node->ratio_next = n; ++ } ++} ++ ++/* ++ * Remove LMBs from node with CPUs. ++ * ++ * The less a node has CPU, the more memory will be removed from it. ++ * ++ * As for the CPU less nodes, we must iterate because some LMBs may not be ++ * removable at this time. ++ */ ++static int remove_cpu_lmbs(uint32_t count) ++{ ++ struct numa_node *node; ++ uint32_t total = count, todo, done = 0, this_loop; ++ uint32_t new_lmb_count; ++ ++ while(count) { ++ count = min(count, numa.lmb_count); ++ if (!count) ++ break; ++ ++ update_node_ratio(); ++ ++ new_lmb_count = numa.lmb_count - count; ++ ++ this_loop = 0; ++ numa_foreach_node_by_ratio(&numa, node) { ++ if (!node->n_lmbs || !node->n_cpus) ++ continue; ++ ++ todo = (new_lmb_count * node->ratio) / 100; ++ todo = node->n_lmbs - min(todo, node->n_lmbs); ++ todo = min(count, todo); ++ ++ if (todo) { ++ todo = remove_lmb_from_node(node, todo); ++ count -= todo; ++ this_loop += todo; ++ } ++ ++ if (!count) ++ break; ++ } ++ ++ /* Don't continue if we didn't make any progress. 
*/ ++ if (!this_loop) ++ break; ++ done += this_loop; ++ } ++ ++ say(DEBUG, "%d / %d LMBs removed from the CPU nodes\n", ++ done, total); ++ return done; ++} ++ ++static void build_numa_topology(void) ++{ ++ int rc; ++ ++ rc = numa_get_topology(&numa); ++ if (rc) ++ return; ++ ++ numa_enabled = 1; ++} ++ ++static void clear_numa_lmb_links(void) ++{ ++ int nid; ++ struct numa_node *node; ++ ++ numa_foreach_node(&numa, nid, node) ++ node->lmbs = NULL; ++} ++ ++static int numa_based_remove(uint32_t count) ++{ ++ struct lmb_list_head *lmb_list; ++ struct numa_node *node; ++ int nid; ++ uint32_t done = 0; ++ ++ /* ++ * Read the LMBs ++ * Link the LMBs to their node ++ * Update global counter ++ */ ++ lmb_list = get_lmbs(LMB_NORMAL_SORT); ++ if (lmb_list == NULL) { ++ clear_numa_lmb_links(); ++ return -1; ++ } ++ ++ if (!numa.node_count) { ++ clear_numa_lmb_links(); ++ free_lmbs(lmb_list); ++ return -EINVAL; ++ } ++ ++ numa_foreach_node(&numa, nid, node) { ++ say(INFO, "node %4d %4d CPUs %8d LMBs\n", ++ nid, node->n_cpus, node->n_lmbs); ++ } ++ ++ done += remove_cpuless_lmbs(count); ++ count -= done; ++ ++ done += remove_cpu_lmbs(count); ++ ++ report_resource_count(done); ++ ++ clear_numa_lmb_links(); ++ free_lmbs(lmb_list); ++ return 0; ++} ++ + int do_mem_kernel_dlpar(void) + { + char cmdbuf[128]; + int rc, offset; + ++ ++ if (usr_action == REMOVE && usr_drc_count) { ++ build_numa_topology(); ++ if (numa_enabled) { ++ if (!numa_based_remove(usr_drc_count)) ++ return 0; ++ ++ /* ++ * If the NUMA based removal failed, lets try the legacy ++ * way. 
++ */ ++ say(WARN, "Can't do NUMA based removal operation.\n"); ++ } ++ } ++ + offset = sprintf(cmdbuf, "%s ", "memory"); + + switch (usr_action) { +diff --git a/src/drmgr/ofdt.h b/src/drmgr/ofdt.h +index 3850a77229b4..3c2840b2e0ee 100644 +--- a/src/drmgr/ofdt.h ++++ b/src/drmgr/ofdt.h +@@ -92,6 +92,7 @@ struct dr_node { + uint32_t _lmb_aa_index; + struct mem_scn *_mem_scns; + struct of_node *_of_node; ++ struct dr_node *_numa_next; + } _smem; + + #define lmb_address _node_u._smem._address +@@ -99,6 +100,7 @@ struct dr_node { + #define lmb_aa_index _node_u._smem._lmb_aa_index + #define lmb_mem_scns _node_u._smem._mem_scns + #define lmb_of_node _node_u._smem._of_node ++#define lmb_numa_next _node_u._smem._numa_next + + struct hea_info { + uint _port_no; +-- +2.29.2 + diff --git a/powerpc-utils-1.3.8-0b59d4a372aa266caa75f3b6a253b8f5aeaf3802.patch b/powerpc-utils-1.3.8-0b59d4a372aa266caa75f3b6a253b8f5aeaf3802.patch new file mode 100644 index 0000000..dcc06b0 --- /dev/null +++ b/powerpc-utils-1.3.8-0b59d4a372aa266caa75f3b6a253b8f5aeaf3802.patch @@ -0,0 +1,33 @@ +commit 0b59d4a372aa266caa75f3b6a253b8f5aeaf3802 +Author: Mingming Cao +Date: Mon Mar 1 19:34:29 2021 -0800 + + hcnmgr: Avoid cleanup of bond interface at boot time when no HNV exists + + At boot time, hcn scans the device tree and discovers if there was a new + HNV being added while lpar was inactive. It also cleans up the old hnv + interfaces. This patch avoids cleaning up bonding interface when no HNV + network devices exists. 
+ + Signed-off-by: Mingming Cao + [tyreld: fixup commit log] + Signed-off-by: Tyrel Datwyler + +diff --git a/scripts/hcnmgr b/scripts/hcnmgr +index a76505e..c95edba 100644 +--- a/scripts/hcnmgr ++++ b/scripts/hcnmgr +@@ -575,7 +575,13 @@ scanhcn() { + done + fi + ++ if [ ${#HcnIds[@]} -eq 0 ]; then ++ hcnlog DEBUG "scanhcn: scan for hybrid virtual network finished" ++ return $E_SUCCESS ++ fi ++ + # Next clean up dead connections left from orgitinal LPAR after inactive miration ++ # Only do this when the HNV ID array is not empty + + # list of all HCN ids + ids="${HcnIds[*]}" diff --git a/powerpc-utils-1.3.8-1cb8bd89d6386c60e75c47d4a4452d3f130d5138.patch b/powerpc-utils-1.3.8-1cb8bd89d6386c60e75c47d4a4452d3f130d5138.patch new file mode 100644 index 0000000..bfcb88b --- /dev/null +++ b/powerpc-utils-1.3.8-1cb8bd89d6386c60e75c47d4a4452d3f130d5138.patch @@ -0,0 +1,32 @@ +commit 1cb8bd89d6386c60e75c47d4a4452d3f130d5138 +Author: Mingming Cao +Date: Fri Mar 12 14:18:18 2021 -0800 + + hcnmgr: Avoid using xargs to process NM show connections + + When removing HNV bonding connections xargs can fail to process the output of + nmcli show propererly. + + Instead of piping into xargs fix this by using a loop to check for all related + bonding connections and remove them explicitly one by one.
+ + Signed-off-by: Mingming Cao + [tyreld: fixed up commit log] + Signed-off-by: Tyrel Datwyler + +diff --git a/scripts/hcnmgr b/scripts/hcnmgr +index d66b5d1..30d31e7 100644 +--- a/scripts/hcnmgr ++++ b/scripts/hcnmgr +@@ -377,7 +377,10 @@ rmhcn() { + fi + + hcnlog INFO "rmhcn: delete bond $BONDNAME and slaves " +- nmcli -f NAME con show | grep "$BONDNAME" | xargs sudo nmcli con delete ++ for connection in $(nmcli -f NAME con show | grep "$BONDNAME"); do ++ hcnlog INFO "Delete bonding connection $connection" ++ nmcli con delete "$connection" ++ done + hcnlog DEBUG "rmhcn: exit" + return $E_SUCCESS + } diff --git a/powerpc-utils-1.3.8-366e17553ed647613668678c2d301d369038f41b.patch b/powerpc-utils-1.3.8-366e17553ed647613668678c2d301d369038f41b.patch new file mode 100644 index 0000000..87edddb --- /dev/null +++ b/powerpc-utils-1.3.8-366e17553ed647613668678c2d301d369038f41b.patch @@ -0,0 +1,26 @@ +commit 366e17553ed647613668678c2d301d369038f41b +Author: Brahadambal Srinivasan +Date: Thu Nov 12 19:00:47 2020 +0530 + + Update ppc64-cpu usage + + 'ppc64_cpu --help' doesn't list '--version' as an option. This patch + adds the option in the usage information of ppc64-cpu command. 
+ + Signed-off-by: Brahadambal Srinivasan + Signed-off-by: Tyrel Datwyler + +diff --git a/src/ppc64_cpu.c b/src/ppc64_cpu.c +index 71f4720..2b0f66c 100644 +--- a/src/ppc64_cpu.c ++++ b/src/ppc64_cpu.c +@@ -1195,7 +1195,8 @@ static void usage(void) + "ppc64_cpu --subcores-per-core # Get number of subcores per core\n" + "ppc64_cpu --subcores-per-core=X # Set subcores per core to X (1 or 4)\n" + "ppc64_cpu --threads-per-core # Get threads per core\n" +-"ppc64_cpu --info # Display system state information)\n"); ++"ppc64_cpu --info # Display system state information\n" ++"ppc64_cpu --version # Display version of ppc64-cpu\n"); + } + + struct option longopts[] = { diff --git a/powerpc-utils-1.3.8-d9bcb21179ccfea122f326aca4690afe0f7de0c6.patch b/powerpc-utils-1.3.8-d9bcb21179ccfea122f326aca4690afe0f7de0c6.patch new file mode 100644 index 0000000..9314916 --- /dev/null +++ b/powerpc-utils-1.3.8-d9bcb21179ccfea122f326aca4690afe0f7de0c6.patch @@ -0,0 +1,30 @@ +commit d9bcb21179ccfea122f326aca4690afe0f7de0c6 +Author: Mingming Cao +Date: Mon Mar 1 21:34:34 2021 -0800 + + hcnmgr: Wait for sysfs device ready when looking up device name + + At the time of calling ofpathname to look up for devicename, wait + for sysfs device ready. Otherwise, the OS may be in the middle of device + renaming. + + Signed-off-by: Mingming Cao + [tyreld: fixed up commit log] + Signed-off-by: Tyrel Datwyler + +diff --git a/scripts/hcnmgr b/scripts/hcnmgr +index c95edba..0d20e7d 100644 +--- a/scripts/hcnmgr ++++ b/scripts/hcnmgr +@@ -241,7 +241,10 @@ get_dev_hcn() { + # Let's retry a few times. 
+ while [ $wait != 0 ]; do + if DEVNAME=$(ofpathname -l "$(echo "$1" | sed -e "s/\/proc\/device-tree//")" 2>/dev/null); then +- break ++ if [ -e /sys/class/net/"$DEVNAME" ]; then ++ hcnlog DEBUG "ofpathname waiting for /sys/class/net device $DEVNAME ready" ++ break ++ fi + fi + + hcnlog DEBUG "ofpathname return $?, devname is $DEVNAME rety counter $wait" diff --git a/powerpc-utils-1.3.8-e25d71be411b610e5e889f8efaaf04b38c2d9ecb.patch b/powerpc-utils-1.3.8-e25d71be411b610e5e889f8efaaf04b38c2d9ecb.patch new file mode 100644 index 0000000..19ba99c --- /dev/null +++ b/powerpc-utils-1.3.8-e25d71be411b610e5e889f8efaaf04b38c2d9ecb.patch @@ -0,0 +1,30 @@ +commit e25d71be411b610e5e889f8efaaf04b38c2d9ecb +Author: Mingming Cao +Date: Fri Mar 12 13:50:33 2021 -0800 + + hcnmgr: Avoid using ifcfg file for checking bonding interface status + + When configuring migratable sr_iov into hybrid network, it checks if + there is an existing HNV using the presence of ifcfg file location. This + is not preferred as the location can be different on distros. + + This patch fixes this by using NetworkManager nmcli. + + Signed-off-by: Mingming Cao + [tyreld: fixed spelling] + Signed-off-by: Tyrel Datwyler + +diff --git a/scripts/hcnmgr b/scripts/hcnmgr +index 0d20e7d..d66b5d1 100644 +--- a/scripts/hcnmgr ++++ b/scripts/hcnmgr +@@ -282,8 +282,7 @@ do_config_vdevice() { + + hcnlog DEBUG "Check if there is bond $BONDNAME with hcn id $HCNID" + +- hcnlog DEBUG "ifconfig file $IFCONFIG_PATH/ifconfig-$BONDNAME" +- if [ ! -e "$IFCONFIG_PATH/ifcfg-$BONDNAME" ]; then ++ if ! 
nmcli -f NAME con show --active | grep -q "$BONDNAME\s"; then + hcnlog INFO "nmcli con add type bond con-name $BONDNAME ifname $BONDNAME" + nmcli con add type bond con-name "$BONDNAME" ifname "$BONDNAME" + diff --git a/powerpc-utils-lpartstat_x_option-97269d301797e23b75d0c7a5cb63ce280783f615.patch b/powerpc-utils-lpartstat_x_option-97269d301797e23b75d0c7a5cb63ce280783f615.patch new file mode 100644 index 0000000..64b17e7 --- /dev/null +++ b/powerpc-utils-lpartstat_x_option-97269d301797e23b75d0c7a5cb63ce280783f615.patch @@ -0,0 +1,132 @@ +commit 97269d301797e23b75d0c7a5cb63ce280783f615 +Author: Laurent Dufour +Date: Thu Mar 4 14:51:38 2021 +0100 + + lpartstat: add -x option for the security flavor + + This allows user to get the security flavor settings for the LPAR. + + The output is : + + $ lparstat -x + Speculative Execution Mode : 1 + + Where the output number means + 0 = Speculative execution fully enabled + 1 = Speculative execution controls to mitigate user-to-kernel side-channel + attacks + 2 = Speculative execution controls to mitigate user-to-kernel and + user-to-user side-channel attacks + + In the case the running kernel is not exposing the security flavor in + /proc/powerpc/lparcfg, the output is: + + $ lparstat -x + Speculative Execution Mode : - + + Signed-off-by: Laurent Dufour + Signed-off-by: Tyrel Datwyler + +diff --git a/src/lparstat.c b/src/lparstat.c +index 23e4b85..00922c4 100644 +--- a/src/lparstat.c ++++ b/src/lparstat.c +@@ -42,6 +42,7 @@ + + static bool o_legacy = false; + static bool o_scaled = false; ++static bool o_security = false; + + static int threads_per_cpu; + static int cpus_in_system; +@@ -1152,6 +1153,15 @@ void print_scaled_output(int interval, int count) + } while (--count > 0); + } + ++static void print_security_flavor(void) ++{ ++ char value[64]; ++ char *descr; ++ ++ get_sysdata("security_flavor", &descr, value); ++ fprintf(stdout, "%-45s: %s\n", descr, value); ++} ++ + static void usage(void) + { + printf("Usage: lparstat 
[ options ]\n\tlparstat [ count ]\n\n" +@@ -1159,6 +1169,7 @@ static void usage(void) + "\t-h, --help Show this message and exit.\n" + "\t-V, --version \tDisplay lparstat version information.\n" + "\t-i Lists details on the LPAR configuration.\n" ++ "\t-x Print the security mode settings for the LPAR.\n" + "\t-E Print SPURR metrics.\n" + "\t-l, --legacy Print the report in legacy format.\n" + "interval The interval parameter specifies the amount of time between each report.\n" +@@ -1184,7 +1195,7 @@ int main(int argc, char *argv[]) + exit(1); + } + +- while ((c = getopt_long(argc, argv, "iEVhl", ++ while ((c = getopt_long(argc, argv, "iEVhlx", + long_opts, &opt_index)) != -1) { + switch(c) { + case 'i': +@@ -1199,6 +1210,9 @@ int main(int argc, char *argv[]) + case 'V': + printf("lparstat - %s\n", VERSION); + return 0; ++ case 'x': ++ o_security = true; ++ break; + case 'h': + usage(); + return 0; +@@ -1223,6 +1237,8 @@ int main(int argc, char *argv[]) + + if (i_option) + print_iflag_data(); ++ else if (o_security) ++ print_security_flavor(); + else if (o_scaled) { + print_scaled_output(interval, count); + close_cpu_sysfs_fds(threads_in_system); +diff --git a/src/lparstat.h b/src/lparstat.h +index 9b7117f..26ed4ba 100644 +--- a/src/lparstat.h ++++ b/src/lparstat.h +@@ -302,6 +302,10 @@ struct sysentry system_data[] = { + .descr = "Idle CPU value - SPURR", + .get = &get_cpu_idle_spurr}, + ++ /* Security flavor */ ++ {.name = "security_flavor", ++ .descr = "Speculative Execution Mode"}, ++ + {.name[0] = '\0'}, + }; + +diff -up powerpc-utils-1.3.8/man/lparstat.8.me powerpc-utils-1.3.8/man/lparstat.8 +--- powerpc-utils-1.3.8/man/lparstat.8.me 2021-04-20 15:49:18.305532697 +0200 ++++ powerpc-utils-1.3.8/man/lparstat.8 2021-04-20 15:52:04.703021972 +0200 +@@ -209,6 +209,20 @@ The variable memory capacity weight of t + .TP + .SH + .TP ++\fB\-x\fR ++Display the LPAR security flavor mode ++.RS ++.TP ++.B 0 ++Speculative execution fully enabled ++.TP ++.B 1 ++Speculative 
execution controls to mitigate user-to-kernel side-channel attacks ++.TP ++.B 2 ++Speculative execution controls to mitigate user-to-kernel and user-to-user side-channel attacks ++.RE ++.TP + \fB\-E\fR + Display Scaled Processor Utilization Resource Register(SPURR) based CPU utilization. + .RS