rteval-loads/SOURCES/0008-stress-ng-add-checksum-sanity-check-on-bogo-ops-stat.patch

From 846f27b3d7bdfc3c2fc99fc6ddc6d51d7d822b80 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Mon, 18 May 2020 14:20:44 +0100
Subject: [PATCH 08/28] stress-ng: add checksum sanity check on bogo ops stats
 and run flag

ELISA request for run sanity check involves adding a duplicated bogo
ops and run flag in a different shared memory segment, hashing this
data and checking these with the stats at the end of a run. If any
corruption or run failures occur we have a mechanism of ensuring that
the measurements are sane with a hashed check on the data and comparing
the two separate copies.

Signed-off-by: Colin Ian King <colin.king@canonical.com>
---
 stress-ng.c | 115 ++++++++++++++++++++++++++++++++++++++++++++++++----
 stress-ng.h |  21 +++++++++-
 2 files changed, 125 insertions(+), 11 deletions(-)

diff --git a/stress-ng.c b/stress-ng.c
index 17e7377d6f5d..c57ab29b49e3 100644
--- a/stress-ng.c
+++ b/stress-ng.c
@@ -966,6 +966,16 @@ static const stress_help_t help_generic[] = {
 	{ NULL,		NULL,			NULL }
 };

+/*
+ *  stress_hash_checksum()
+ *	generate a hash of the checksum data
+ */
+static inline void stress_hash_checksum(stress_checksum_t *checksum)
+{
+	checksum->hash = stress_hash_jenkin((uint8_t *)&checksum->data,
+				sizeof(checksum->data));
+}
+
 /*
  *  stressor_name_find()
  *  	Find index into stressors by name
@@ -1651,6 +1661,7 @@ static void MLOCKED_TEXT stress_run(
 	double time_start, time_finish;
 	int32_t n_procs, j;
 	const int32_t total_procs = get_total_num_procs(procs_list);
+	stress_checksum_t *checksum = g_shared->checksums;

 	int32_t sched;

@@ -1659,12 +1670,11 @@ static void MLOCKED_TEXT stress_run(
 	long sched_runtime = -1;
 	long sched_deadline = -1;

-
 	wait_flag = true;
 	time_start = stress_time_now();
 	pr_dbg("starting stressors\n");
 	for (n_procs = 0; n_procs < total_procs; n_procs++) {
-		for (g_proc_current = procs_list; g_proc_current; g_proc_current = g_proc_current->next) {
+		for (g_proc_current = procs_list; g_proc_current; g_proc_current = g_proc_current->next, checksum++) {
 			if (g_opt_timeout && (stress_time_now() - time_start > g_opt_timeout))
 				goto abort;

@@ -1765,9 +1775,16 @@ again:
 							.page_size = stress_get_pagesize(),
 						};

+						(void)memset(checksum, 0, sizeof(*checksum));
 						rc = g_proc_current->stressor->info->stressor(&args);
 						pr_fail_check(&rc);
-						stats->run_ok = (rc == EXIT_SUCCESS);
+						if (rc == EXIT_SUCCESS) {
+							stats->run_ok = true;
+							checksum->data.run_ok = true;
+						}
+						stats->checksum = checksum;
+						checksum->data.counter = *args.counter;
+						stress_hash_checksum(checksum);
 					}
 #if defined(STRESS_PERF_STATS) && defined(HAVE_LINUX_PERF_EVENT_H)
 					if (g_opt_flags & OPT_FLAGS_PERF_STATS) {
@@ -1878,6 +1895,65 @@ static int show_stressors(void)
 	return 0;
 }

+/*
+ *  metrics_check()
+ *	as per ELISA request, sanity check bogo ops and run flag
+ *	to see if corruption occurred and print failure messages
+ *	and set *success to false if hash and data is dubious.
+ */
+static void metrics_check(bool *success)
+{
+	stress_proc_info_t *pi;
+	bool ok = true;
+
+	for (pi = procs_head; pi; pi = pi->next) {
+		int32_t j;
+
+		for (j = 0; j < pi->started_procs; j++) {
+			const stress_proc_stats_t *const stats = pi->stats[j];
+			const stress_checksum_t *checksum = stats->checksum;
+			stress_checksum_t stats_checksum;
+
+			if (checksum == NULL) {
+				pr_fail("%s instance %d unexpected null checksum data\n",
+					pi->stressor->name, j);
+				ok = false;
+				continue;
+			}
+
+			(void)memset(&stats_checksum, 0, sizeof(stats_checksum));
+			stats_checksum.data.counter = stats->counter;
+			stats_checksum.data.run_ok = stats->run_ok;
+			stress_hash_checksum(&stats_checksum);
+
+			if (stats->counter != checksum->data.counter) {
+				pr_fail("%s instance %d corrupted bogo-ops counter, %" PRIu64 " vs %" PRIu64 "\n",
+					pi->stressor->name, j,
+					stats->counter, checksum->data.counter);
+				ok = false;
+			}
+			if (stats->run_ok != checksum->data.run_ok) {
+				pr_fail("%s instance %d corrupted run flag, %d vs %d\n",
+					pi->stressor->name, j,
+					stats->run_ok, checksum->data.run_ok);
+				ok = false;
+			}
+			if (stats_checksum.hash != checksum->hash) {
+				pr_fail("%s instance %d hash error in bogo-ops counter and run flag, %" PRIu32 " vs %" PRIu32 "\n",
+					pi->stressor->name, j,
+					stats_checksum.hash, checksum->hash);
+				ok = false;
+			}
+		}
+	}
+	if (ok) {
+		pr_dbg("metrics check: all stressor metrics validated and sane\n");
+	} else {
+		pr_fail("metrics check: stressor metrics corrupted, data is compromised\n");
+		*success = false;
+	}
+}
+
 /*
  *  metrics_dump()
  *	output metrics
@@ -2093,10 +2169,11 @@ static void log_system_info(void)
  *	that is marked read-only to stop accidental smashing
  *	from a run-away stack expansion
  */
-static inline void stress_map_shared(const size_t len)
+static inline void stress_map_shared(const size_t num_procs)
 {
 	const size_t page_size = stress_get_pagesize();
-	const size_t sz = (len + (page_size << 1)) & ~(page_size - 1);
+	size_t len = sizeof(stress_shared_t) + (sizeof(stress_proc_stats_t) * num_procs);
+	size_t sz = (len + (page_size << 1)) & ~(page_size - 1);
 #if defined(HAVE_MPROTECT)
 	void *last_page;
 #endif
@@ -2104,7 +2181,7 @@ static inline void stress_map_shared(const size_t len)
 	g_shared = (stress_shared_t *)mmap(NULL, sz, PROT_READ | PROT_WRITE,
 		MAP_SHARED | MAP_ANON, -1, 0);
 	if (g_shared == MAP_FAILED) {
-		pr_err("Cannot mmap to shared memory region: errno=%d (%s)\n",
+		pr_err("Cannot mmap to shared memory region, errno=%d (%s)\n",
 			errno, strerror(errno));
 		free_procs();
 		exit(EXIT_FAILURE);
@@ -2137,6 +2214,25 @@ static inline void stress_map_shared(const size_t len)
 			g_shared->length -= sz;
 	}
 #endif
+
+	/*
+	 *  copy of checksums and run data in a different shared
+	 *  memory segment so that we can sanity check these for
+	 *  any form of corruption
+	 */
+	len = sizeof(stress_checksum_t) * STRESS_PROCS_MAX;
+	sz = (len + page_size) & ~(page_size - 1);
+	g_shared->checksums = (stress_checksum_t *)mmap(NULL, sz,
+		PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);
+	if (g_shared->checksums == MAP_FAILED) {
+		pr_err("Cannot mmap checksums, errno=%d (%s)\n",
+			errno, strerror(errno));
+		(void)munmap((void *)g_shared, g_shared->length);
+		free_procs();
+		exit(EXIT_FAILURE);
+	}
+	(void)memset(g_shared->checksums, 0, sz);
+	g_shared->checksums_length = sz;
 }

 /*
@@ -2145,6 +2241,7 @@ static inline void stress_map_shared(const size_t len)
  */
 void stress_unmap_shared(void)
 {
+	(void)munmap((void *)g_shared->checksums, g_shared->checksums_length);
 	(void)munmap((void *)g_shared, g_shared->length);
 }

@@ -2785,7 +2882,6 @@ static inline void stress_mlock_executable(void)
 int main(int argc, char **argv, char **envp)
 {
 	double duration = 0.0;			/* stressor run time in secs */
-	size_t len;
 	bool success = true, resource_success = true;
 	FILE *yaml;				/* YAML output file */
 	char *yaml_filename;			/* YAML file name */
@@ -2997,8 +3093,7 @@ int main(int argc, char **argv, char **envp)
 	 *  Allocate shared memory segment for shared data
 	 *  across all the child stressors
 	 */
-	len = sizeof(stress_shared_t) + (sizeof(stress_proc_stats_t) * get_total_num_procs(procs_head));
-	stress_map_shared(len);
+	stress_map_shared(get_total_num_procs(procs_head));

 	/*
 	 *  Setup spinlocks
@@ -3076,6 +3171,8 @@ int main(int argc, char **argv, char **envp)
 	if (g_opt_flags & OPT_FLAGS_METRICS)
 		metrics_dump(yaml, ticks_per_sec);

+	metrics_check(&success);
+
 #if defined(STRESS_PERF_STATS) && defined(HAVE_LINUX_PERF_EVENT_H)
 	/*
 	 *  Dump perf statistics
diff --git a/stress-ng.h b/stress-ng.h
index 85b2beccf051..54986499503d 100644
--- a/stress-ng.h
+++ b/stress-ng.h
@@ -907,6 +907,20 @@ typedef enum {

 typedef struct stress_proc_info *stress_pproc_info_t;

+/*
+ *  Per ELISA request, we have a duplicated counter
+ *  and run_ok flag in a different shared memory region
+ *  so we can sanity check these just in case the stats
+ *  have got corrupted.
+ */
+typedef struct {
+	struct {
+		uint64_t counter;	/* Copy of stats counter */
+		bool     run_ok;	/* Copy of run_ok */
+	} data;
+	uint32_t	hash;		/* Hash of data */
+} stress_checksum_t;
+
 /* settings for storing opt arg parsed data */
 typedef struct stress_setting {
 	struct stress_setting *next;	/* next setting in list */
@@ -1838,6 +1852,7 @@ typedef struct {
 	stress_tz_t tz;			/* thermal zones */
 #endif
 	bool run_ok;			/* true if stressor exited OK */
+	stress_checksum_t *checksum;	/* pointer to checksum data */
 } stress_proc_stats_t;

 #define	STRESS_WARN_HASH_MAX		(128)
@@ -1889,6 +1904,8 @@ typedef struct {
 	uint32_t softlockup_count;			/* Atomic counter of softlock children */
 #endif
 	uint8_t  str_shared[STR_SHARED_SIZE];		/* str copying buffer */
+	stress_checksum_t *checksums;			/* per stressor counter checksum */
+	size_t	checksums_length;			/* size of checksums mapping */
 	stress_proc_stats_t stats[0];			/* Shared statistics */
 } stress_shared_t;

@@ -3125,7 +3142,7 @@ typedef struct {
 	const char *name;		/* name of stress test */
 } stress_t;

-/* Per process information */
+/* Per stressor process information */
 typedef struct stress_proc_info {
 	struct stress_proc_info *next;	/* next proc info struct in list */
 	struct stress_proc_info *prev;	/* prev proc info struct in list */
@@ -3133,7 +3150,7 @@ typedef struct stress_proc_info {
 	pid_t	*pids;			/* process id */
 	stress_proc_stats_t **stats;	/* process proc stats info */
 	int32_t started_procs;		/* count of started processes */
-	int32_t num_procs;		/* number of process per stressor */
+	int32_t num_procs;		/* number of processes per stressor */
 	uint64_t bogo_ops;		/* number of bogo ops */
 } stress_proc_info_t;

--
2.21.3