294 lines
9.7 KiB
Diff
294 lines
9.7 KiB
Diff
|
From 846f27b3d7bdfc3c2fc99fc6ddc6d51d7d822b80 Mon Sep 17 00:00:00 2001
|
||
|
From: Colin Ian King <colin.king@canonical.com>
|
||
|
Date: Mon, 18 May 2020 14:20:44 +0100
|
||
|
Subject: [PATCH 08/28] stress-ng: add checksum sanity check on bogo ops stats
|
||
|
and run flag
|
||
|
|
||
|
The ELISA request for a run sanity check involves keeping a duplicate
|
||
|
of the bogo ops counter and run flag in a different shared memory
|
||
|
segment, hashing this data and checking it against the stats at the
|
||
|
end of a run. If any corruption or run failure occurs we have a
|
||
|
mechanism for ensuring that the measurements are sane: a hash check
|
||
|
on the data and a comparison of the two separate copies.
|
||
|
|
||
|
Signed-off-by: Colin Ian King <colin.king@canonical.com>
|
||
|
---
|
||
|
stress-ng.c | 115 ++++++++++++++++++++++++++++++++++++++++++++++++----
|
||
|
stress-ng.h | 21 +++++++++-
|
||
|
2 files changed, 125 insertions(+), 11 deletions(-)
|
||
|
|
||
|
diff --git a/stress-ng.c b/stress-ng.c
|
||
|
index 17e7377d6f5d..c57ab29b49e3 100644
|
||
|
--- a/stress-ng.c
|
||
|
+++ b/stress-ng.c
|
||
|
@@ -966,6 +966,16 @@ static const stress_help_t help_generic[] = {
|
||
|
{ NULL, NULL, NULL }
|
||
|
};
|
||
|
|
||
|
+/*
|
||
|
+ * stress_hash_checksum()
|
||
|
+ * generate a hash of the checksum data
|
||
|
+ */
|
||
|
+static inline void stress_hash_checksum(stress_checksum_t *checksum)
|
||
|
+{
|
||
|
+ checksum->hash = stress_hash_jenkin((uint8_t *)&checksum->data,
|
||
|
+ sizeof(checksum->data));
|
||
|
+}
|
||
|
+
|
||
|
/*
|
||
|
* stressor_name_find()
|
||
|
* Find index into stressors by name
|
||
|
@@ -1651,6 +1661,7 @@ static void MLOCKED_TEXT stress_run(
|
||
|
double time_start, time_finish;
|
||
|
int32_t n_procs, j;
|
||
|
const int32_t total_procs = get_total_num_procs(procs_list);
|
||
|
+ stress_checksum_t *checksum = g_shared->checksums;
|
||
|
|
||
|
int32_t sched;
|
||
|
|
||
|
@@ -1659,12 +1670,11 @@ static void MLOCKED_TEXT stress_run(
|
||
|
long sched_runtime = -1;
|
||
|
long sched_deadline = -1;
|
||
|
|
||
|
-
|
||
|
wait_flag = true;
|
||
|
time_start = stress_time_now();
|
||
|
pr_dbg("starting stressors\n");
|
||
|
for (n_procs = 0; n_procs < total_procs; n_procs++) {
|
||
|
- for (g_proc_current = procs_list; g_proc_current; g_proc_current = g_proc_current->next) {
|
||
|
+ for (g_proc_current = procs_list; g_proc_current; g_proc_current = g_proc_current->next, checksum++) {
|
||
|
if (g_opt_timeout && (stress_time_now() - time_start > g_opt_timeout))
|
||
|
goto abort;
|
||
|
|
||
|
@@ -1765,9 +1775,16 @@ again:
|
||
|
.page_size = stress_get_pagesize(),
|
||
|
};
|
||
|
|
||
|
+ (void)memset(checksum, 0, sizeof(*checksum));
|
||
|
rc = g_proc_current->stressor->info->stressor(&args);
|
||
|
pr_fail_check(&rc);
|
||
|
- stats->run_ok = (rc == EXIT_SUCCESS);
|
||
|
+ if (rc == EXIT_SUCCESS) {
|
||
|
+ stats->run_ok = true;
|
||
|
+ checksum->data.run_ok = true;
|
||
|
+ }
|
||
|
+ stats->checksum = checksum;
|
||
|
+ checksum->data.counter = *args.counter;
|
||
|
+ stress_hash_checksum(checksum);
|
||
|
}
|
||
|
#if defined(STRESS_PERF_STATS) && defined(HAVE_LINUX_PERF_EVENT_H)
|
||
|
if (g_opt_flags & OPT_FLAGS_PERF_STATS) {
|
||
|
@@ -1878,6 +1895,65 @@ static int show_stressors(void)
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
+/*
|
||
|
+ * metrics_check()
|
||
|
+ * as per ELISA request, sanity check bogo ops and run flag
|
||
|
+ * to see if corruption occurred and print failure messages
|
||
|
+ * and set *success to false if the hash or data are dubious.
|
||
|
+ */
|
||
|
+static void metrics_check(bool *success)
|
||
|
+{
|
||
|
+ stress_proc_info_t *pi;
|
||
|
+ bool ok = true;
|
||
|
+
|
||
|
+ for (pi = procs_head; pi; pi = pi->next) {
|
||
|
+ int32_t j;
|
||
|
+
|
||
|
+ for (j = 0; j < pi->started_procs; j++) {
|
||
|
+ const stress_proc_stats_t *const stats = pi->stats[j];
|
||
|
+ const stress_checksum_t *checksum = stats->checksum;
|
||
|
+ stress_checksum_t stats_checksum;
|
||
|
+
|
||
|
+ if (checksum == NULL) {
|
||
|
+ pr_fail("%s instance %d unexpected null checksum data\n",
|
||
|
+ pi->stressor->name, j);
|
||
|
+ ok = false;
|
||
|
+ continue;
|
||
|
+ }
|
||
|
+
|
||
|
+ (void)memset(&stats_checksum, 0, sizeof(stats_checksum));
|
||
|
+ stats_checksum.data.counter = stats->counter;
|
||
|
+ stats_checksum.data.run_ok = stats->run_ok;
|
||
|
+ stress_hash_checksum(&stats_checksum);
|
||
|
+
|
||
|
+ if (stats->counter != checksum->data.counter) {
|
||
|
+ pr_fail("%s instance %d corrupted bogo-ops counter, %" PRIu64 " vs %" PRIu64 "\n",
|
||
|
+ pi->stressor->name, j,
|
||
|
+ stats->counter, checksum->data.counter);
|
||
|
+ ok = false;
|
||
|
+ }
|
||
|
+ if (stats->run_ok != checksum->data.run_ok) {
|
||
|
+ pr_fail("%s instance %d corrupted run flag, %d vs %d\n",
|
||
|
+ pi->stressor->name, j,
|
||
|
+ stats->run_ok, checksum->data.run_ok);
|
||
|
+ ok = false;
|
||
|
+ }
|
||
|
+ if (stats_checksum.hash != checksum->hash) {
|
||
|
+ pr_fail("%s instance %d hash error in bogo-ops counter and run flag, %" PRIu32 " vs %" PRIu32 "\n",
|
||
|
+ pi->stressor->name, j,
|
||
|
+ stats_checksum.hash, checksum->hash);
|
||
|
+ ok = false;
|
||
|
+ }
|
||
|
+ }
|
||
|
+ }
|
||
|
+ if (ok) {
|
||
|
+ pr_dbg("metrics check: all stressor metrics validated and sane\n");
|
||
|
+ } else {
|
||
|
+ pr_fail("metrics check: stressor metrics corrupted, data is compromised\n");
|
||
|
+ *success = false;
|
||
|
+ }
|
||
|
+}
|
||
|
+
|
||
|
/*
|
||
|
* metrics_dump()
|
||
|
* output metrics
|
||
|
@@ -2093,10 +2169,11 @@ static void log_system_info(void)
|
||
|
* that is marked read-only to stop accidental smashing
|
||
|
* from a run-away stack expansion
|
||
|
*/
|
||
|
-static inline void stress_map_shared(const size_t len)
|
||
|
+static inline void stress_map_shared(const size_t num_procs)
|
||
|
{
|
||
|
const size_t page_size = stress_get_pagesize();
|
||
|
- const size_t sz = (len + (page_size << 1)) & ~(page_size - 1);
|
||
|
+ size_t len = sizeof(stress_shared_t) + (sizeof(stress_proc_stats_t) * num_procs);
|
||
|
+ size_t sz = (len + (page_size << 1)) & ~(page_size - 1);
|
||
|
#if defined(HAVE_MPROTECT)
|
||
|
void *last_page;
|
||
|
#endif
|
||
|
@@ -2104,7 +2181,7 @@ static inline void stress_map_shared(const size_t len)
|
||
|
g_shared = (stress_shared_t *)mmap(NULL, sz, PROT_READ | PROT_WRITE,
|
||
|
MAP_SHARED | MAP_ANON, -1, 0);
|
||
|
if (g_shared == MAP_FAILED) {
|
||
|
- pr_err("Cannot mmap to shared memory region: errno=%d (%s)\n",
|
||
|
+ pr_err("Cannot mmap to shared memory region, errno=%d (%s)\n",
|
||
|
errno, strerror(errno));
|
||
|
free_procs();
|
||
|
exit(EXIT_FAILURE);
|
||
|
@@ -2137,6 +2214,25 @@ static inline void stress_map_shared(const size_t len)
|
||
|
g_shared->length -= sz;
|
||
|
}
|
||
|
#endif
|
||
|
+
|
||
|
+ /*
|
||
|
+ * copy of checksums and run data in a different shared
|
||
|
+ * memory segment so that we can sanity check these for
|
||
|
+ * any form of corruption
|
||
|
+ */
|
||
|
+ len = sizeof(stress_checksum_t) * STRESS_PROCS_MAX;
|
||
|
+ sz = (len + page_size) & ~(page_size - 1);
|
||
|
+ g_shared->checksums = (stress_checksum_t *)mmap(NULL, sz,
|
||
|
+ PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);
|
||
|
+ if (g_shared->checksums == MAP_FAILED) {
|
||
|
+ pr_err("Cannot mmap checksums, errno=%d (%s)\n",
|
||
|
+ errno, strerror(errno));
|
||
|
+ (void)munmap((void *)g_shared, g_shared->length);
|
||
|
+ free_procs();
|
||
|
+ exit(EXIT_FAILURE);
|
||
|
+ }
|
||
|
+ (void)memset(g_shared->checksums, 0, sz);
|
||
|
+ g_shared->checksums_length = sz;
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
@@ -2145,6 +2241,7 @@ static inline void stress_map_shared(const size_t len)
|
||
|
*/
|
||
|
void stress_unmap_shared(void)
|
||
|
{
|
||
|
+ (void)munmap((void *)g_shared->checksums, g_shared->checksums_length);
|
||
|
(void)munmap((void *)g_shared, g_shared->length);
|
||
|
}
|
||
|
|
||
|
@@ -2785,7 +2882,6 @@ static inline void stress_mlock_executable(void)
|
||
|
int main(int argc, char **argv, char **envp)
|
||
|
{
|
||
|
double duration = 0.0; /* stressor run time in secs */
|
||
|
- size_t len;
|
||
|
bool success = true, resource_success = true;
|
||
|
FILE *yaml; /* YAML output file */
|
||
|
char *yaml_filename; /* YAML file name */
|
||
|
@@ -2997,8 +3093,7 @@ int main(int argc, char **argv, char **envp)
|
||
|
* Allocate shared memory segment for shared data
|
||
|
* across all the child stressors
|
||
|
*/
|
||
|
- len = sizeof(stress_shared_t) + (sizeof(stress_proc_stats_t) * get_total_num_procs(procs_head));
|
||
|
- stress_map_shared(len);
|
||
|
+ stress_map_shared(get_total_num_procs(procs_head));
|
||
|
|
||
|
/*
|
||
|
* Setup spinlocks
|
||
|
@@ -3076,6 +3171,8 @@ int main(int argc, char **argv, char **envp)
|
||
|
if (g_opt_flags & OPT_FLAGS_METRICS)
|
||
|
metrics_dump(yaml, ticks_per_sec);
|
||
|
|
||
|
+ metrics_check(&success);
|
||
|
+
|
||
|
#if defined(STRESS_PERF_STATS) && defined(HAVE_LINUX_PERF_EVENT_H)
|
||
|
/*
|
||
|
* Dump perf statistics
|
||
|
diff --git a/stress-ng.h b/stress-ng.h
|
||
|
index 85b2beccf051..54986499503d 100644
|
||
|
--- a/stress-ng.h
|
||
|
+++ b/stress-ng.h
|
||
|
@@ -907,6 +907,20 @@ typedef enum {
|
||
|
|
||
|
typedef struct stress_proc_info *stress_pproc_info_t;
|
||
|
|
||
|
+/*
|
||
|
+ * Per ELISA request, we have a duplicated counter
|
||
|
+ * and run_ok flag in a different shared memory region
|
||
|
+ * so we can sanity check these just in case the stats
|
||
|
+ * have got corrupted.
|
||
|
+ */
|
||
|
+typedef struct {
|
||
|
+ struct {
|
||
|
+ uint64_t counter; /* Copy of stats counter */
|
||
|
+ bool run_ok; /* Copy of run_ok */
|
||
|
+ } data;
|
||
|
+ uint32_t hash; /* Hash of data */
|
||
|
+} stress_checksum_t;
|
||
|
+
|
||
|
/* settings for storing opt arg parsed data */
|
||
|
typedef struct stress_setting {
|
||
|
struct stress_setting *next; /* next setting in list */
|
||
|
@@ -1838,6 +1852,7 @@ typedef struct {
|
||
|
stress_tz_t tz; /* thermal zones */
|
||
|
#endif
|
||
|
bool run_ok; /* true if stressor exited OK */
|
||
|
+ stress_checksum_t *checksum; /* pointer to checksum data */
|
||
|
} stress_proc_stats_t;
|
||
|
|
||
|
#define STRESS_WARN_HASH_MAX (128)
|
||
|
@@ -1889,6 +1904,8 @@ typedef struct {
|
||
|
uint32_t softlockup_count; /* Atomic counter of softlock children */
|
||
|
#endif
|
||
|
uint8_t str_shared[STR_SHARED_SIZE]; /* str copying buffer */
|
||
|
+ stress_checksum_t *checksums; /* per stressor counter checksum */
|
||
|
+ size_t checksums_length; /* size of checksums mapping */
|
||
|
stress_proc_stats_t stats[0]; /* Shared statistics */
|
||
|
} stress_shared_t;
|
||
|
|
||
|
@@ -3125,7 +3142,7 @@ typedef struct {
|
||
|
const char *name; /* name of stress test */
|
||
|
} stress_t;
|
||
|
|
||
|
-/* Per process information */
|
||
|
+/* Per stressor process information */
|
||
|
typedef struct stress_proc_info {
|
||
|
struct stress_proc_info *next; /* next proc info struct in list */
|
||
|
struct stress_proc_info *prev; /* prev proc info struct in list */
|
||
|
@@ -3133,7 +3150,7 @@ typedef struct stress_proc_info {
|
||
|
pid_t *pids; /* process id */
|
||
|
stress_proc_stats_t **stats; /* process proc stats info */
|
||
|
int32_t started_procs; /* count of started processes */
|
||
|
- int32_t num_procs; /* number of process per stressor */
|
||
|
+ int32_t num_procs; /* number of processes per stressor */
|
||
|
uint64_t bogo_ops; /* number of bogo ops */
|
||
|
} stress_proc_info_t;
|
||
|
|
||
|
--
|
||
|
2.21.3
|
||
|
|