From 17751da7bb90ee60dd36ec90f13122557903c3c0 Mon Sep 17 00:00:00 2001 From: Anubhav Shelat Date: Fri, 21 Jun 2024 16:03:27 -0400 Subject: [PATCH] Add options for timerlat tracing in rteval, and store timerlat summary in rteval summary xml file. Convert base64 data to text before wrapping. Add sysreport to gating tests and check the exit status. Resolves: RHEL-44438 Signed-off-by: Anubhav Shelat --- ...erlat-Add-timerlat-tracing-to-rteval.patch | 526 ++++++++++++++++++ ...onvert-base64-data-to-text-before-wr.patch | 65 +++ rteval.spec | 12 +- tests/scripts/run_tests.sh | 6 + 4 files changed, 608 insertions(+), 1 deletion(-) create mode 100644 0001-rteval-timerlat-Add-timerlat-tracing-to-rteval.patch create mode 100644 0002-rteval-sysstat-Convert-base64-data-to-text-before-wr.patch diff --git a/0001-rteval-timerlat-Add-timerlat-tracing-to-rteval.patch b/0001-rteval-timerlat-Add-timerlat-tracing-to-rteval.patch new file mode 100644 index 0000000..097d329 --- /dev/null +++ b/0001-rteval-timerlat-Add-timerlat-tracing-to-rteval.patch @@ -0,0 +1,526 @@ +From 5909521e06ed92ea60ffb247b25ca86c232f71bf Mon Sep 17 00:00:00 2001 +From: John Kacur +Date: Thu, 20 Jun 2024 21:34:24 -0400 +Subject: [PATCH 1/3] rteval: timerlat: Add timerlat tracing to rteval + +This patch adds tracing from timerlat to rteval. + +These two options are added + + --timerlat-stoptrace USEC + Stop trace if thread latency higher than USEC + --timerlat-trace FILE + File to save trace to + +This also stores the summary from rtla timerlat in the rteval xml file. + +Signed-off-by: John Kacur +--- + rteval/modules/measurement/timerlat.py | 213 ++++++++++++++++++++++++- + rteval/rteval_text.xsl | 181 ++++++++++++++++++++- + 2 files changed, 389 insertions(+), 5 deletions(-) + +diff --git a/rteval/modules/measurement/timerlat.py b/rteval/modules/measurement/timerlat.py +index e8345fab1ad7..45adec1b33e1 100644 +--- a/rteval/modules/measurement/timerlat.py ++++ b/rteval/modules/measurement/timerlat.py +@@ -16,6 +16,7 @@ from rteval.modules import rtevalModulePrototype + from rteval.systopology import cpuinfo, SysTopology + from rteval.cpulist_utils import expand_cpulist, collapse_cpulist + ++ + class TLRunData: + ''' class to store instance data from a timerlat run ''' + def __init__(self, coreid, datatype, priority, logfnc): +@@ -188,6 +189,15 @@ class Timerlat(rtevalModulePrototype): + self.__cpus = [str(c) for c in expand_cpulist(self.__cpulist)] + self.__numcores = len(self.__cpus) + ++ # Has tracing been triggered ++ self.__stoptrace = False ++ # This stores the output from rtla ++ self.__posttrace = "" ++ # Stop Trace Data ++ self.__stdata = {} ++ # Stop Trace Cpu ++ self.stcpu = -1 ++ + self.__timerlat_out = None + self.__timerlat_err = None + self.__started = False +@@ -218,10 +228,21 @@ class Timerlat(rtevalModulePrototype): + self.__cmd = ['rtla', 'timerlat', 'hist', '-P', f'f:{int(self.__priority)}', '-u'] + self.__cmd.append(f'-c{self.__cpulist}') + self.__cmd.append(f'-E{self.__buckets}') ++ ++ if self.__cfg.stoptrace: ++ self.__cmd.append(f"-T{int(self.__cfg.stoptrace)}") ++ ++ if self.__cfg.trace: ++ if not self.__cfg.stoptrace: ++ self._log(Log.WARN, f'Ignoring trace={self.__cfg.trace}, because stoptrace not invoked') ++ else: ++ self.__cmd.append(f'-t={self.__cfg.trace}') ++ + self._log(Log.DEBUG, f'self.__cmd = {self.__cmd}') + self.__timerlat_out = tempfile.SpooledTemporaryFile(mode='w+b') + self.__timerlat_err = tempfile.SpooledTemporaryFile(mode='w+b') + ++ + def _WorkloadTask(self): + if self.__started: + return +@@ -252,6 +273,10 @@ class Timerlat(rtevalModulePrototype): + os.kill(self.__timerlat_process.pid, signal.SIGINT) + time.sleep(2) + ++ blocking_thread_detected = False ++ softirq_interference_detected = False ++ irq_interference_detected = False ++ + # Parse histogram output + self.__timerlat_out.seek(0) + for line in self.__timerlat_out: +@@ -261,6 +286,113 @@ class Timerlat(rtevalModulePrototype): + if not line: + continue + ++ # Parsing if stoptrace has been invoked ++ if self.__stoptrace: ++ self.__posttrace += line ++ line = line.strip() ++ fields = line.split() ++ if line.startswith("##") and fields[1] == "CPU": ++ self.stcpu = int(fields[2]) ++ self._log(Log.DEBUG, f"self.stcpu = {self.stcpu}") ++ self.__stdata[self.stcpu] = {} ++ continue ++ if self.stcpu == -1: ++ self._log(Log.WARN, "Stop trace has been invoked, but a stop cpu has not been identified.") ++ continue ++ if line.startswith('------------------'): ++ blocking_thread_detected = False ++ softirq_interference_detected = False ++ irq_interference_detected = False ++ continue ++ if line.startswith("Thread latency:"): ++ thread_latency_percent = fields[-1].strip('()%') ++ self._log(Log.DEBUG, f"thread_latency_percent = {thread_latency_percent}") ++ thread_latency = fields[-3] ++ self._log(Log.DEBUG, f"thread_latency = {thread_latency}") ++ self.__stdata[self.stcpu]["Thread_latency"] = (thread_latency, thread_latency_percent) ++ elif line.startswith("Previous IRQ interference"): ++ self._log(Log.DEBUG, f'Previous_IRQ_interference = {fields[-2]}') ++ self.__stdata[self.stcpu]["Previous_IRQ_interference"] = fields[-2] ++ elif line.startswith("IRQ handler delay:"): ++ irq_handler_delay_percent = fields[-2].strip('(') ++ irq_handler_delay = fields[-4] ++ # Do we have (exit from idle)? ++ if fields[3] == '(exit': ++ field_name = "IRQ_handler_delay_exit_from_idle" ++ else: ++ field_name = "IRQ_handler_delay" ++ self._log(Log.DEBUG, f"{field_name} = {irq_handler_delay}") ++ self._log(Log.DEBUG, f"{field_name}_percent = {irq_handler_delay_percent}") ++ self.__stdata[self.stcpu][field_name] = (irq_handler_delay, irq_handler_delay_percent) ++ elif line.startswith("IRQ latency:"): ++ self._log(Log.DEBUG, f"irq_latency = {fields[-2]}") ++ self.__stdata[self.stcpu]["IRQ_latency"] = fields[-2] ++ elif line.startswith("Timerlat IRQ duration"): ++ timerlat_irq_duration_percent = fields[-2].strip('(') ++ self._log(Log.DEBUG, f"timerlat_irq_duration_percent = {timerlat_irq_duration_percent}") ++ timerlat_irq_duration = fields[-4] ++ self._log(Log.DEBUG, f"timerlat_irq_duration = {timerlat_irq_duration}") ++ self.__stdata[self.stcpu]["Timerlat_IRQ_duration"] = (timerlat_irq_duration, timerlat_irq_duration_percent) ++ elif line.startswith("Blocking thread:"): ++ blocking_thread_percent = fields[-2].strip('(') ++ self._log(Log.DEBUG, f"blocking_thread_percent = {blocking_thread_percent}") ++ blocking_thread = fields[-4] ++ self._log(Log.DEBUG, f"blocking_thread = {blocking_thread}") ++ self.__stdata[self.stcpu]["Blocking_Thread"] = (blocking_thread, blocking_thread_percent) ++ blocking_thread_detected = True ++ irq_interference_detected = False ++ softirq_interference_detected = False ++ elif line.startswith("IRQ interference"): ++ irq_interference_percent = fields[-2].strip('(') ++ self._log(Log.DEBUG, f"irq_interference_percent = {irq_interference_percent}") ++ irq_interference = fields[-4] ++ self._log(Log.DEBUG, f"irq_interference = {irq_interference}") ++ self.__stdata[self.stcpu]["IRQ_interference"] = (irq_interference, irq_interference_percent) ++ blocking_thread_detected = False ++ irq_interference_detected = True ++ softirq_interference_detected = False ++ elif line.startswith("Softirq interference"): ++ softirq_interference_percent = fields[-2].strip('(') ++ self._log(Log.DEBUG, f"softirq_interference_percent = {softirq_interference_percent}") ++ softirq_interference = fields[-4] ++ self._log(Log.DEBUG, f"softirq_interference = {softirq_interference}") ++ self.__stdata[self.stcpu]["Softirq_interference"] = (softirq_interference, softirq_interference_percent) ++ blocking_thread_detected = False ++ irq_interference_detected = False ++ softirq_interference_detected = True ++ elif blocking_thread_detected: ++ self._log(Log.DEBUG, f'line={line}') ++ blocking_thread = " ".join(fields[0:-2]) ++ self._log(Log.DEBUG, f"blocking_thread = {blocking_thread}") ++ blocking_threadus = fields[-2] ++ self._log(Log.DEBUG, f"blocking_threadus = {blocking_threadus}") ++ self.__stdata[self.stcpu].setdefault("blocking_thread", []) ++ self.__stdata[self.stcpu]["blocking_thread"] += [(blocking_thread, blocking_threadus)] ++ elif softirq_interference_detected: ++ softirq = " ".join(fields[0:-2]) ++ softirq_latency = fields[-2] ++ self._log(Log.DEBUG, f'softirq = {softirq}') ++ self._log(Log.DEBUG, f'softirq_latency = {softirq_latency}') ++ self.__stdata[self.stcpu].setdefault("softirq_interference", []) ++ self.__stdata[self.stcpu]["softirq_interference"] += [(softirq, softirq_latency)] ++ elif irq_interference_detected: ++ irq_interference_name = " ".join(fields[0:-2]) ++ irq_interference_latency = fields[-2] ++ self._log(Log.DEBUG, f'irq_interference = {irq_interference_name}, latency = {irq_interference_latency}') ++ self.__stdata[self.stcpu].setdefault("irq_interference", []) ++ self.__stdata[self.stcpu]["irq_interference"] += [(irq_interference_name, irq_interference_latency)] ++ elif line.startswith("Max timerlat IRQ latency"): ++ self._log(Log.DEBUG, f"line={line}") ++ max_timerlat_irq_latency = fields[-5] ++ self._log(Log.DEBUG, f"max_timerlat_irq_latency = {max_timerlat_irq_latency}") ++ max_timerlat_cpu = int(fields[-1]) ++ self._log(Log.DEBUG, f"max_timerlat_cpu = {max_timerlat_cpu}") ++ self.__stdata.setdefault(max_timerlat_cpu, {}) ++ self.__stdata[max_timerlat_cpu]["Max_timerlat_IRQ_latency_from_idle"] = max_timerlat_irq_latency ++ else: ++ self._log(Log.DEBUG, f'line = {line}') ++ continue ++ + if line.startswith('#'): + if line.startswith('# Duration:'): + duration = line.split()[2] +@@ -285,9 +417,13 @@ class Timerlat(rtevalModulePrototype): + elif line.startswith('max:'): + #print(line) + continue ++ elif line.startswith('rtla timerlat hit stop tracing'): ++ self.__stoptrace = True ++ self.__posttrace += line ++ continue + else: +- pass + #print(line) ++ pass + + vals = line.split() + if not vals: +@@ -301,12 +437,18 @@ class Timerlat(rtevalModulePrototype): + continue + + for i, core in enumerate(self.__cpus): +- self.__timerlatdata[core].bucket(index, int(vals[i*3+1]), ++ # There might not be a count on every cpu if tracing invoked ++ if i*3 + 1 >= len(vals): ++ self.__timerlatdata[core].bucket(index, 0, 0, 0) ++ self.__timerlatdata['system'].bucket(index, 0, 0, 0) ++ else: ++ self.__timerlatdata[core].bucket(index, int(vals[i*3+1]), + int(vals[i*3+2]), + int(vals[i*3+3])) +- self.__timerlatdata['system'].bucket(index, int(vals[i*3+1]), ++ self.__timerlatdata['system'].bucket(index, int(vals[i*3+1]), + int(vals[i*3+2]), + int(vals[i*3+3])) ++ + # Generate statistics for each RunData object + for n in list(self.__timerlatdata.keys()): + self.__timerlatdata[n].reduce() +@@ -320,6 +462,62 @@ class Timerlat(rtevalModulePrototype): + rep_n = libxml2.newNode('timerlat') + rep_n.newProp('command_line', ' '.join(self.__cmd)) + ++ stoptrace_invoked_n = libxml2.newNode('stoptrace_invoked') ++ if self.stcpu != -1: ++ stoptrace_invoked_n.newProp("invoked", "true") ++ else: ++ stoptrace_invoked_n.newProp("invoked", "") ++ rep_n.addChild(stoptrace_invoked_n) ++ ++ if self.stcpu != -1: ++ self._log(Log.DEBUG, f'self.__stdata = {self.__stdata}') ++ for cpu in self.__stdata: ++ # This is Max timerlat IRQ latency from idle ++ # With no other data from that cpu, so don't create a ++ # stoptrace_report for this ++ if len(self.__stdata[cpu]) == 1: ++ continue ++ stoptrace_n = libxml2.newNode('stoptrace_report') ++ stoptrace_n.newProp("CPU", str(cpu)) ++ for k, v in self.__stdata[cpu].items(): ++ self._log(Log.DEBUG, f"cpu={cpu}, k={k}, v={v}") ++ if isinstance(v, tuple): ++ latency = str(v[0]) ++ percent = str(v[1]) ++ cpu_n = stoptrace_n.newTextChild(None, str(k), None) ++ n = cpu_n.newTextChild(None, "latency", latency) ++ n.newProp('unit', 'us') ++ ++ n = cpu_n.newTextChild(None, "latency_percent", percent) ++ n.newProp('unit', '%') ++ elif isinstance(v, list): ++ if k in ("blocking_thread", "softirq_interference", "irq_interference"): ++ for name, latency in v: ++ cpu_n = stoptrace_n.newTextChild(None, k, None) ++ n = cpu_n.newTextChild(None, "name", name) ++ n = cpu_n.newTextChild(None, "latency", latency) ++ n.newProp('unit', 'us') ++ else: ++ if k == "Max_timerlat_IRQ_latency_from_idle": ++ continue ++ cpu_n = stoptrace_n.newTextChild(None, str(k), str(v)) ++ cpu_n.newProp('unit', 'us') ++ rep_n.addChild(stoptrace_n) ++ ++ self._log(Log.DEBUG, f'timerlat: posttrace = \n{self.__posttrace}') ++ self._log(Log.DEBUG, 'timerlat: posttrace END') ++ for cpu in self.__stdata: ++ for k, v in self.__stdata[cpu].items(): ++ if isinstance(v, tuple): ++ continue ++ if k == "Max_timerlat_IRQ_latency_from_idle": ++ max_timerlat_n = libxml2.newNode('max_timerlat_report') ++ max_timerlat_n.newProp("CPU", str(cpu)) ++ cpu_n = max_timerlat_n.newTextChild(None, k, str(v)) ++ cpu_n.newProp('unit', 'us') ++ rep_n.addChild(max_timerlat_n) ++ return rep_n ++ + rep_n.addChild(self.__timerlatdata['system'].MakeReport()) + for thr in self.__cpus: + if str(thr) not in self.__timerlatdata: +@@ -341,7 +539,13 @@ def ModuleParameters(): + "metavar": "PRIO" }, + "buckets": {"descr": "Number of buckets", + "default": 3500, +- "metavar": "NUM"} ++ "metavar": "NUM" }, ++ "stoptrace": {"descr": "Stop trace if thread latency higher than USEC", ++ "default": None, ++ "metavar": "USEC" }, ++ "trace": {"descr": "File to save trace to", ++ "default": None, ++ "metavar": "FILE" }, + } + + def create(params, logger): +@@ -367,6 +571,7 @@ if __name__ == '__main__': + + cfg_tl = cfg.GetSection('timerlat') + cfg_tl.cpulist = collapse_cpulist(SysTopology().online_cpus()) ++ cfg_tl.stoptrace=50 + + RUNTIME = 10 + +diff --git a/rteval/rteval_text.xsl b/rteval/rteval_text.xsl +index 2f03bda0bb55..70777354efa5 100644 +--- a/rteval/rteval_text.xsl ++++ b/rteval/rteval_text.xsl +@@ -316,14 +316,26 @@ + + + +- ++ ++ rtla timerlat hit stop tracing ++ ++ ++ ++ ++ + ++ ++ + System: + + ++ + ++ ++ + Statistics: + ++ + + + +@@ -470,4 +482,171 @@ + + + ++ ++ ++ ## CPU ++ ++ hit stop tracing, analyzing it ## ++ ++ ++ ++ ++ ++ Previous IRQ interference: up to ++ ++ ++ ++ ++ ++ ++ ++ ++ IRQ handler delay: ++ ++ ++ ++ ( ++ ++ ++ ++ ) ++ ++ ++ ++ ++ IRQ handler delay: (exit from idle) ++ ++ ++ ++ ( ++ ++ ++ ++ ) ++ ++ ++ ++ IRQ latency: ++ ++ ++ ++ ++ ++ ++ ++ Timerlat IRQ duration: ++ ++ ++ ++ ++ ( ++ ++ ++ ++ ) ++ ++ ++ ++ Blocking thread: ++ ++ ++ ++ ++ ( ++ ++ ++ ++ ) ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ Softirq interference: ++ ++ ++ ++ ++ ( ++ ++ ++ ++ ) ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ IRQ interference: ++ ++ ++ ++ ++ ( ++ ++ ++ ++ ) ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ -------------------------------------------------------------------------------- ++ Thread latency: ++ ++ ++ ++ ++ ( ++ ++ ++ ++ ) ++ ++ ++ ++ ++ ++ ++ ++ Max timerlat IRQ latency from idle: ++ ++ ++ ++ in cpu ++ ++ ++ ++ ++ + +-- +2.45.2 + diff --git a/0002-rteval-sysstat-Convert-base64-data-to-text-before-wr.patch b/0002-rteval-sysstat-Convert-base64-data-to-text-before-wr.patch new file mode 100644 index 0000000..266e660 --- /dev/null +++ b/0002-rteval-sysstat-Convert-base64-data-to-text-before-wr.patch @@ -0,0 +1,65 @@ +From ca90d5aa7ae2ff6dac124c710fceadae028b5f4a Mon Sep 17 00:00:00 2001 +From: Crystal Wood +Date: Thu, 20 Jun 2024 21:18:05 -0500 +Subject: [PATCH 2/3] rteval: sysstat: Convert base64 data to text before + wrapping + +As of Python 3, b64encode() returns data, not a string, causing this: + +Traceback (most recent call last): + File "/home/crwood/git/rteval/./rteval-cmd", line 413, in + ec = rteval.Measure() + ^^^^^^^^^^^^^^^^ + File "/home/crwood/git/rteval/rteval/__init__.py", line 246, in Measure + self._report(measure_start, self.__rtevcfg.xslt_report) + File "/home/crwood/git/rteval/rteval/rtevalReport.py", line 63, in _report + self.__xmlreport.AppendXMLnodes(self._measuremods.MakeReport()) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/crwood/git/rteval/rteval/modules/measurement/__init__.py", line 190, in MakeReport + mprep_n = mp.MakeReport() + ^^^^^^^^^^^^^^^ + File "/home/crwood/git/rteval/rteval/modules/measurement/__init__.py", line 62, in MakeReport + rep_n = RtEvalModules.MakeReport(self) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/crwood/git/rteval/rteval/modules/__init__.py", line 559, in MakeReport + modrep_n = mod.MakeReport() + ^^^^^^^^^^^^^^^^ + File "/home/crwood/git/rteval/rteval/modules/measurement/sysstat.py", line 83, in MakeReport + data_n = rep_n.newTextChild(None, 'data', "\n"+"\n".join(textwrap.wrap(data, 75))+"\n") + ^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/lib64/python3.12/textwrap.py", line 384, in wrap + return w.wrap(text) + ^^^^^^^^^^^^ + File "/usr/lib64/python3.12/textwrap.py", line 356, in wrap + chunks = self._split_chunks(text) + ^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/lib64/python3.12/textwrap.py", line 342, in _split_chunks + text = self._munge_whitespace(text) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/lib64/python3.12/textwrap.py", line 153, in _munge_whitespace + text = text.translate(self.unicode_whitespace_trans) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +TypeError: a bytes-like object is required, not 'dict' + +Signed-off-by: Crystal Wood +Signed-off-by: John Kacur +--- + rteval/modules/measurement/sysstat.py | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/rteval/modules/measurement/sysstat.py b/rteval/modules/measurement/sysstat.py +index 57194a2b5f45..d4646c1646f4 100644 +--- a/rteval/modules/measurement/sysstat.py ++++ b/rteval/modules/measurement/sysstat.py +@@ -79,7 +79,7 @@ class sysstat(rtevalModulePrototype): + fp = open(self.__datafile, "rb") + compr = bz2.BZ2Compressor(9) + cmpr = compr.compress(fp.read()) +- data = base64.b64encode(cmpr + compr.flush()) ++ data = base64.b64encode(cmpr + compr.flush()).decode('utf-8') + data_n = rep_n.newTextChild(None, 'data', "\n"+"\n".join(textwrap.wrap(data, 75))+"\n") + data_n.newProp('contents', 'sysstat/sar binary data') + data_n.newProp('encoding', 'base64') +-- +2.45.2 + diff --git a/rteval.spec b/rteval.spec index a97f39e..ed89600 100644 --- a/rteval.spec +++ b/rteval.spec @@ -1,6 +1,6 @@ Name: rteval Version: 3.8 -Release: 4%{?dist} +Release: 5%{?dist} Summary: Utility to evaluate system suitability for RT Linux Group: Development/Tools @@ -35,6 +35,8 @@ BuildArch: noarch # Patches Patch1: 0001-Updated-rteval-man-page.patch +Patch2: 0001-rteval-timerlat-Add-timerlat-tracing-to-rteval.patch +Patch3: 0002-rteval-sysstat-Convert-base64-data-to-text-before-wr.patch %description The rteval script is a utility for measuring various aspects of @@ -48,6 +50,8 @@ to the screen. %prep %setup -q %patch1 -p1 +%patch2 -p1 +%patch3 -p1 %build %{__python3} setup.py build @@ -68,6 +72,12 @@ to the screen. %{_bindir}/rteval %changelog +* Fri Jun 21 2024 Anubhav Shelat - 3.8-5 +- Add options for timerlat tracing in rteval, and store timerlat summary in rteval summary xml file. +- Convert base64 data to text before wrapping. +- Add sysreport to gating tests and check the exit status. +Resolves: RHEL-44438 + * Mon Jun 17 2024 John Kacur - 3.8-4 - Add rtla as a Requires to provide timerlat Resolves: RHEL-38306 diff --git a/tests/scripts/run_tests.sh b/tests/scripts/run_tests.sh index 9a2c9df..b58fada 100644 --- a/tests/scripts/run_tests.sh +++ b/tests/scripts/run_tests.sh @@ -17,4 +17,10 @@ if [[ $? != 0 ]]; then exit 2 fi +sudo rteval -s + +if [[ $? != 0]]; then + exit 3 +fi + exit 0