From 846f27b3d7bdfc3c2fc99fc6ddc6d51d7d822b80 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 18 May 2020 14:20:44 +0100 Subject: [PATCH 08/28] stress-ng: add checksum sanity check on bogo ops stats and run flag The ELISA request for a run sanity check involves adding a duplicate of the bogo ops counter and run flag in a different shared memory segment, hashing this data and checking it against the stats at the end of a run. If any corruption or run failure occurs, we have a mechanism for ensuring that the measurements are sane: a hashed check on the data and a comparison of the two separate copies. Signed-off-by: Colin Ian King --- stress-ng.c | 115 ++++++++++++++++++++++++++++++++++++++++++++++++---- stress-ng.h | 21 +++++++++- 2 files changed, 125 insertions(+), 11 deletions(-) diff --git a/stress-ng.c b/stress-ng.c index 17e7377d6f5d..c57ab29b49e3 100644 --- a/stress-ng.c +++ b/stress-ng.c @@ -966,6 +966,16 @@ static const stress_help_t help_generic[] = { { NULL, NULL, NULL } }; +/* + * stress_hash_checksum() + * generate a hash of the checksum data + */ +static inline void stress_hash_checksum(stress_checksum_t *checksum) +{ + checksum->hash = stress_hash_jenkin((uint8_t *)&checksum->data, + sizeof(checksum->data)); +} + /* * stressor_name_find() * Find index into stressors by name @@ -1651,6 +1661,7 @@ static void MLOCKED_TEXT stress_run( double time_start, time_finish; int32_t n_procs, j; const int32_t total_procs = get_total_num_procs(procs_list); + stress_checksum_t *checksum = g_shared->checksums; int32_t sched; @@ -1659,12 +1670,11 @@ static void MLOCKED_TEXT stress_run( long sched_runtime = -1; long sched_deadline = -1; - wait_flag = true; time_start = stress_time_now(); pr_dbg("starting stressors\n"); for (n_procs = 0; n_procs < total_procs; n_procs++) { - for (g_proc_current = procs_list; g_proc_current; g_proc_current = g_proc_current->next) { + for (g_proc_current = procs_list; g_proc_current; g_proc_current = g_proc_current->next, checksum++) { if (g_opt_timeout
&& (stress_time_now() - time_start > g_opt_timeout)) goto abort; @@ -1765,9 +1775,16 @@ again: .page_size = stress_get_pagesize(), }; + (void)memset(checksum, 0, sizeof(*checksum)); rc = g_proc_current->stressor->info->stressor(&args); pr_fail_check(&rc); - stats->run_ok = (rc == EXIT_SUCCESS); + if (rc == EXIT_SUCCESS) { + stats->run_ok = true; + checksum->data.run_ok = true; + } + stats->checksum = checksum; + checksum->data.counter = *args.counter; + stress_hash_checksum(checksum); } #if defined(STRESS_PERF_STATS) && defined(HAVE_LINUX_PERF_EVENT_H) if (g_opt_flags & OPT_FLAGS_PERF_STATS) { @@ -1878,6 +1895,65 @@ static int show_stressors(void) return 0; } +/* + * metrics_check() + * as per ELISA request, sanity check bogo ops and run flag + * to see if corruption occurred and print failure messages + * and set *success to false if the hash or data is dubious. + */ +static void metrics_check(bool *success) +{ + stress_proc_info_t *pi; + bool ok = true; + + for (pi = procs_head; pi; pi = pi->next) { + int32_t j; + + for (j = 0; j < pi->started_procs; j++) { + const stress_proc_stats_t *const stats = pi->stats[j]; + const stress_checksum_t *checksum = stats->checksum; + stress_checksum_t stats_checksum; + + if (checksum == NULL) { + pr_fail("%s instance %d unexpected null checksum data\n", + pi->stressor->name, j); + ok = false; + continue; + } + + (void)memset(&stats_checksum, 0, sizeof(stats_checksum)); + stats_checksum.data.counter = stats->counter; + stats_checksum.data.run_ok = stats->run_ok; + stress_hash_checksum(&stats_checksum); + + if (stats->counter != checksum->data.counter) { + pr_fail("%s instance %d corrupted bogo-ops counter, %" PRIu64 " vs %" PRIu64 "\n", + pi->stressor->name, j, + stats->counter, checksum->data.counter); + ok = false; + } + if (stats->run_ok != checksum->data.run_ok) { + pr_fail("%s instance %d corrupted run flag, %d vs %d\n", + pi->stressor->name, j, + stats->run_ok, checksum->data.run_ok); + ok = false; + } + if
(stats_checksum.hash != checksum->hash) { + pr_fail("%s instance %d hash error in bogo-ops counter and run flag, %" PRIu32 " vs %" PRIu32 "\n", + pi->stressor->name, j, + stats_checksum.hash, checksum->hash); + ok = false; + } + } + } + if (ok) { + pr_dbg("metrics check: all stressor metrics validated and sane\n"); + } else { + pr_fail("metrics check: stressor metrics corrupted, data is compromised\n"); + *success = false; + } +} + /* * metrics_dump() * output metrics @@ -2093,10 +2169,11 @@ static void log_system_info(void) * that is marked read-only to stop accidental smashing * from a run-away stack expansion */ -static inline void stress_map_shared(const size_t len) +static inline void stress_map_shared(const size_t num_procs) { const size_t page_size = stress_get_pagesize(); - const size_t sz = (len + (page_size << 1)) & ~(page_size - 1); + size_t len = sizeof(stress_shared_t) + (sizeof(stress_proc_stats_t) * num_procs); + size_t sz = (len + (page_size << 1)) & ~(page_size - 1); #if defined(HAVE_MPROTECT) void *last_page; #endif @@ -2104,7 +2181,7 @@ static inline void stress_map_shared(const size_t len) g_shared = (stress_shared_t *)mmap(NULL, sz, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0); if (g_shared == MAP_FAILED) { - pr_err("Cannot mmap to shared memory region: errno=%d (%s)\n", + pr_err("Cannot mmap to shared memory region, errno=%d (%s)\n", errno, strerror(errno)); free_procs(); exit(EXIT_FAILURE); @@ -2137,6 +2214,25 @@ static inline void stress_map_shared(const size_t len) g_shared->length -= sz; } #endif + + /* + * copy of checksums and run data in a different shared + * memory segment so that we can sanity check these for + * any form of corruption + */ + len = sizeof(stress_checksum_t) * STRESS_PROCS_MAX; + sz = (len + page_size) & ~(page_size - 1); + g_shared->checksums = (stress_checksum_t *)mmap(NULL, sz, + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0); + if (g_shared->checksums == MAP_FAILED) { + pr_err("Cannot mmap 
checksums, errno=%d (%s)\n", + errno, strerror(errno)); + (void)munmap((void *)g_shared, g_shared->length); + free_procs(); + exit(EXIT_FAILURE); + } + (void)memset(g_shared->checksums, 0, sz); + g_shared->checksums_length = sz; } /* @@ -2145,6 +2241,7 @@ static inline void stress_map_shared(const size_t len) */ void stress_unmap_shared(void) { + (void)munmap((void *)g_shared->checksums, g_shared->checksums_length); (void)munmap((void *)g_shared, g_shared->length); } @@ -2785,7 +2882,6 @@ static inline void stress_mlock_executable(void) int main(int argc, char **argv, char **envp) { double duration = 0.0; /* stressor run time in secs */ - size_t len; bool success = true, resource_success = true; FILE *yaml; /* YAML output file */ char *yaml_filename; /* YAML file name */ @@ -2997,8 +3093,7 @@ int main(int argc, char **argv, char **envp) * Allocate shared memory segment for shared data * across all the child stressors */ - len = sizeof(stress_shared_t) + (sizeof(stress_proc_stats_t) * get_total_num_procs(procs_head)); - stress_map_shared(len); + stress_map_shared(get_total_num_procs(procs_head)); /* * Setup spinlocks @@ -3076,6 +3171,8 @@ int main(int argc, char **argv, char **envp) if (g_opt_flags & OPT_FLAGS_METRICS) metrics_dump(yaml, ticks_per_sec); + metrics_check(&success); + #if defined(STRESS_PERF_STATS) && defined(HAVE_LINUX_PERF_EVENT_H) /* * Dump perf statistics diff --git a/stress-ng.h b/stress-ng.h index 85b2beccf051..54986499503d 100644 --- a/stress-ng.h +++ b/stress-ng.h @@ -907,6 +907,20 @@ typedef enum { typedef struct stress_proc_info *stress_pproc_info_t; +/* + * Per ELISA request, we have a duplicated counter + * and run_ok flag in a different shared memory region + * so we can sanity check these just in case the stats + * have got corrupted. 
+ */ +typedef struct { + struct { + uint64_t counter; /* Copy of stats counter */ + bool run_ok; /* Copy of run_ok */ + } data; + uint32_t hash; /* Hash of data */ +} stress_checksum_t; + /* settings for storing opt arg parsed data */ typedef struct stress_setting { struct stress_setting *next; /* next setting in list */ @@ -1838,6 +1852,7 @@ typedef struct { stress_tz_t tz; /* thermal zones */ #endif bool run_ok; /* true if stressor exited OK */ + stress_checksum_t *checksum; /* pointer to checksum data */ } stress_proc_stats_t; #define STRESS_WARN_HASH_MAX (128) @@ -1889,6 +1904,8 @@ typedef struct { uint32_t softlockup_count; /* Atomic counter of softlock children */ #endif uint8_t str_shared[STR_SHARED_SIZE]; /* str copying buffer */ + stress_checksum_t *checksums; /* per stressor counter checksum */ + size_t checksums_length; /* size of checksums mapping */ stress_proc_stats_t stats[0]; /* Shared statistics */ } stress_shared_t; @@ -3125,7 +3142,7 @@ typedef struct { const char *name; /* name of stress test */ } stress_t; -/* Per process information */ +/* Per stressor process information */ typedef struct stress_proc_info { struct stress_proc_info *next; /* next proc info struct in list */ struct stress_proc_info *prev; /* prev proc info struct in list */ @@ -3133,7 +3150,7 @@ typedef struct stress_proc_info { pid_t *pids; /* process id */ stress_proc_stats_t **stats; /* process proc stats info */ int32_t started_procs; /* count of started processes */ - int32_t num_procs; /* number of process per stressor */ + int32_t num_procs; /* number of processes per stressor */ uint64_t bogo_ops; /* number of bogo ops */ } stress_proc_info_t; -- 2.21.3