rteval/0001-rteval-timerlat-Add-timerlat-tracing-to-rteval.patch
Anubhav Shelat 52023fa87f Add options for timerlat tracing in rteval, and store timerlat summary in rteval summary xml file.
Convert base64 data to text before wrapping.
Add sysreport to gating tests and check the exit status.
Resolves: RHEL-44438

Signed-off-by: Anubhav Shelat <ashelat@redhat.com>
2024-06-24 09:35:30 -04:00

527 lines
23 KiB
Diff

From 5909521e06ed92ea60ffb247b25ca86c232f71bf Mon Sep 17 00:00:00 2001
From: John Kacur <jkacur@redhat.com>
Date: Thu, 20 Jun 2024 21:34:24 -0400
Subject: [PATCH 1/3] rteval: timerlat: Add timerlat tracing to rteval
This patch adds tracing from timerlat to rteval.
These two options are added
--timerlat-stoptrace USEC
Stop trace if thread latency higher than USEC
--timerlat-trace FILE
File to save trace to
This also stores the summary from rtla timerlat in the rteval xml file.
Signed-off-by: John Kacur <jkacur@redhat.com>
---
rteval/modules/measurement/timerlat.py | 213 ++++++++++++++++++++++++-
rteval/rteval_text.xsl | 181 ++++++++++++++++++++-
2 files changed, 389 insertions(+), 5 deletions(-)
diff --git a/rteval/modules/measurement/timerlat.py b/rteval/modules/measurement/timerlat.py
index e8345fab1ad7..45adec1b33e1 100644
--- a/rteval/modules/measurement/timerlat.py
+++ b/rteval/modules/measurement/timerlat.py
@@ -16,6 +16,7 @@ from rteval.modules import rtevalModulePrototype
from rteval.systopology import cpuinfo, SysTopology
from rteval.cpulist_utils import expand_cpulist, collapse_cpulist
+
class TLRunData:
''' class to store instance data from a timerlat run '''
def __init__(self, coreid, datatype, priority, logfnc):
@@ -188,6 +189,15 @@ class Timerlat(rtevalModulePrototype):
self.__cpus = [str(c) for c in expand_cpulist(self.__cpulist)]
self.__numcores = len(self.__cpus)
+ # Has tracing been triggered
+ self.__stoptrace = False
+ # This stores the output from rtla
+ self.__posttrace = ""
+ # Stop Trace Data
+ self.__stdata = {}
+ # Stop Trace Cpu
+ self.stcpu = -1
+
self.__timerlat_out = None
self.__timerlat_err = None
self.__started = False
@@ -218,10 +228,21 @@ class Timerlat(rtevalModulePrototype):
self.__cmd = ['rtla', 'timerlat', 'hist', '-P', f'f:{int(self.__priority)}', '-u']
self.__cmd.append(f'-c{self.__cpulist}')
self.__cmd.append(f'-E{self.__buckets}')
+
+ if self.__cfg.stoptrace:
+ self.__cmd.append(f"-T{int(self.__cfg.stoptrace)}")
+
+ if self.__cfg.trace:
+ if not self.__cfg.stoptrace:
+ self._log(Log.WARN, f'Ignoring trace={self.__cfg.trace}, because stoptrace not invoked')
+ else:
+ self.__cmd.append(f'-t={self.__cfg.trace}')
+
self._log(Log.DEBUG, f'self.__cmd = {self.__cmd}')
self.__timerlat_out = tempfile.SpooledTemporaryFile(mode='w+b')
self.__timerlat_err = tempfile.SpooledTemporaryFile(mode='w+b')
+
def _WorkloadTask(self):
if self.__started:
return
@@ -252,6 +273,10 @@ class Timerlat(rtevalModulePrototype):
os.kill(self.__timerlat_process.pid, signal.SIGINT)
time.sleep(2)
+ blocking_thread_detected = False
+ softirq_interference_detected = False
+ irq_interference_detected = False
+
# Parse histogram output
self.__timerlat_out.seek(0)
for line in self.__timerlat_out:
@@ -261,6 +286,113 @@ class Timerlat(rtevalModulePrototype):
if not line:
continue
+ # Parsing if stoptrace has been invoked
+ if self.__stoptrace:
+ self.__posttrace += line
+ line = line.strip()
+ fields = line.split()
+ if line.startswith("##") and fields[1] == "CPU":
+ self.stcpu = int(fields[2])
+ self._log(Log.DEBUG, f"self.stcpu = {self.stcpu}")
+ self.__stdata[self.stcpu] = {}
+ continue
+ if self.stcpu == -1:
+ self._log(Log.WARN, "Stop trace has been invoked, but a stop cpu has not been identified.")
+ continue
+ if line.startswith('------------------'):
+ blocking_thread_detected = False
+ softirq_interference_detected = False
+ irq_interference_detected = False
+ continue
+ if line.startswith("Thread latency:"):
+ thread_latency_percent = fields[-1].strip('()%')
+ self._log(Log.DEBUG, f"thread_latency_percent = {thread_latency_percent}")
+ thread_latency = fields[-3]
+ self._log(Log.DEBUG, f"thread_latency = {thread_latency}")
+ self.__stdata[self.stcpu]["Thread_latency"] = (thread_latency, thread_latency_percent)
+ elif line.startswith("Previous IRQ interference"):
+ self._log(Log.DEBUG, f'Previous_IRQ_interference = {fields[-2]}')
+ self.__stdata[self.stcpu]["Previous_IRQ_interference"] = fields[-2]
+ elif line.startswith("IRQ handler delay:"):
+ irq_handler_delay_percent = fields[-2].strip('(')
+ irq_handler_delay = fields[-4]
+ # Do we have (exit from idle)?
+ if fields[3] == '(exit':
+ field_name = "IRQ_handler_delay_exit_from_idle"
+ else:
+ field_name = "IRQ_handler_delay"
+ self._log(Log.DEBUG, f"{field_name} = {irq_handler_delay}")
+ self._log(Log.DEBUG, f"{field_name}_percent = {irq_handler_delay_percent}")
+ self.__stdata[self.stcpu][field_name] = (irq_handler_delay, irq_handler_delay_percent)
+ elif line.startswith("IRQ latency:"):
+ self._log(Log.DEBUG, f"irq_latency = {fields[-2]}")
+ self.__stdata[self.stcpu]["IRQ_latency"] = fields[-2]
+ elif line.startswith("Timerlat IRQ duration"):
+ timerlat_irq_duration_percent = fields[-2].strip('(')
+ self._log(Log.DEBUG, f"timerlat_irq_duration_percent = {timerlat_irq_duration_percent}")
+ timerlat_irq_duration = fields[-4]
+ self._log(Log.DEBUG, f"timerlat_irq_duration = {timerlat_irq_duration}")
+ self.__stdata[self.stcpu]["Timerlat_IRQ_duration"] = (timerlat_irq_duration, timerlat_irq_duration_percent)
+ elif line.startswith("Blocking thread:"):
+ blocking_thread_percent = fields[-2].strip('(')
+ self._log(Log.DEBUG, f"blocking_thread_percent = {blocking_thread_percent}")
+ blocking_thread = fields[-4]
+ self._log(Log.DEBUG, f"blocking_thread = {blocking_thread}")
+ self.__stdata[self.stcpu]["Blocking_Thread"] = (blocking_thread, blocking_thread_percent)
+ blocking_thread_detected = True
+ irq_interference_detected = False
+ softirq_interference_detected = False
+ elif line.startswith("IRQ interference"):
+ irq_interference_percent = fields[-2].strip('(')
+ self._log(Log.DEBUG, f"irq_interference_percent = {irq_interference_percent}")
+ irq_interference = fields[-4]
+ self._log(Log.DEBUG, f"irq_interference = {irq_interference}")
+ self.__stdata[self.stcpu]["IRQ_interference"] = (irq_interference, irq_interference_percent)
+ blocking_thread_detected = False
+ irq_interference_detected = True
+ softirq_interference_detected = False
+ elif line.startswith("Softirq interference"):
+ softirq_interference_percent = fields[-2].strip('(')
+ self._log(Log.DEBUG, f"softirq_interference_percent = {softirq_interference_percent}")
+ softirq_interference = fields[-4]
+ self._log(Log.DEBUG, f"softirq_interference = {softirq_interference}")
+ self.__stdata[self.stcpu]["Softirq_interference"] = (softirq_interference, softirq_interference_percent)
+ blocking_thread_detected = False
+ irq_interference_detected = False
+ softirq_interference_detected = True
+ elif blocking_thread_detected:
+ self._log(Log.DEBUG, f'line={line}')
+ blocking_thread = " ".join(fields[0:-2])
+ self._log(Log.DEBUG, f"blocking_thread = {blocking_thread}")
+ blocking_threadus = fields[-2]
+ self._log(Log.DEBUG, f"blocking_threadus = {blocking_threadus}")
+ self.__stdata[self.stcpu].setdefault("blocking_thread", [])
+ self.__stdata[self.stcpu]["blocking_thread"] += [(blocking_thread, blocking_threadus)]
+ elif softirq_interference_detected:
+ softirq = " ".join(fields[0:-2])
+ softirq_latency = fields[-2]
+ self._log(Log.DEBUG, f'softirq = {softirq}')
+ self._log(Log.DEBUG, f'softirq_latency = {softirq_latency}')
+ self.__stdata[self.stcpu].setdefault("softirq_interference", [])
+ self.__stdata[self.stcpu]["softirq_interference"] += [(softirq, softirq_latency)]
+ elif irq_interference_detected:
+ irq_interference_name = " ".join(fields[0:-2])
+ irq_interference_latency = fields[-2]
+ self._log(Log.DEBUG, f'irq_interference = {irq_interference_name}, latency = {irq_interference_latency}')
+ self.__stdata[self.stcpu].setdefault("irq_interference", [])
+ self.__stdata[self.stcpu]["irq_interference"] += [(irq_interference_name, irq_interference_latency)]
+ elif line.startswith("Max timerlat IRQ latency"):
+ self._log(Log.DEBUG, f"line={line}")
+ max_timerlat_irq_latency = fields[-5]
+ self._log(Log.DEBUG, f"max_timerlat_irq_latency = {max_timerlat_irq_latency}")
+ max_timerlat_cpu = int(fields[-1])
+ self._log(Log.DEBUG, f"max_timerlat_cpu = {max_timerlat_cpu}")
+ self.__stdata.setdefault(max_timerlat_cpu, {})
+ self.__stdata[max_timerlat_cpu]["Max_timerlat_IRQ_latency_from_idle"] = max_timerlat_irq_latency
+ else:
+ self._log(Log.DEBUG, f'line = {line}')
+ continue
+
if line.startswith('#'):
if line.startswith('# Duration:'):
duration = line.split()[2]
@@ -285,9 +417,13 @@ class Timerlat(rtevalModulePrototype):
elif line.startswith('max:'):
#print(line)
continue
+ elif line.startswith('rtla timerlat hit stop tracing'):
+ self.__stoptrace = True
+ self.__posttrace += line
+ continue
else:
- pass
#print(line)
+ pass
vals = line.split()
if not vals:
@@ -301,12 +437,18 @@ class Timerlat(rtevalModulePrototype):
continue
for i, core in enumerate(self.__cpus):
- self.__timerlatdata[core].bucket(index, int(vals[i*3+1]),
+ # There might not be a count on every cpu if tracing invoked
+ if i*3 + 1 >= len(vals):
+ self.__timerlatdata[core].bucket(index, 0, 0, 0)
+ self.__timerlatdata['system'].bucket(index, 0, 0, 0)
+ else:
+ self.__timerlatdata[core].bucket(index, int(vals[i*3+1]),
int(vals[i*3+2]),
int(vals[i*3+3]))
- self.__timerlatdata['system'].bucket(index, int(vals[i*3+1]),
+ self.__timerlatdata['system'].bucket(index, int(vals[i*3+1]),
int(vals[i*3+2]),
int(vals[i*3+3]))
+
# Generate statistics for each RunData object
for n in list(self.__timerlatdata.keys()):
self.__timerlatdata[n].reduce()
@@ -320,6 +462,62 @@ class Timerlat(rtevalModulePrototype):
rep_n = libxml2.newNode('timerlat')
rep_n.newProp('command_line', ' '.join(self.__cmd))
+ stoptrace_invoked_n = libxml2.newNode('stoptrace_invoked')
+ if self.stcpu != -1:
+ stoptrace_invoked_n.newProp("invoked", "true")
+ else:
+ stoptrace_invoked_n.newProp("invoked", "")
+ rep_n.addChild(stoptrace_invoked_n)
+
+ if self.stcpu != -1:
+ self._log(Log.DEBUG, f'self.__stdata = {self.__stdata}')
+ for cpu in self.__stdata:
+ # This is Max timerlat IRQ latency from idle
+ # With no other data from that cpu, so don't create a
+ # stoptrace_report for this
+ if len(self.__stdata[cpu]) == 1:
+ continue
+ stoptrace_n = libxml2.newNode('stoptrace_report')
+ stoptrace_n.newProp("CPU", str(cpu))
+ for k, v in self.__stdata[cpu].items():
+ self._log(Log.DEBUG, f"cpu={cpu}, k={k}, v={v}")
+ if isinstance(v, tuple):
+ latency = str(v[0])
+ percent = str(v[1])
+ cpu_n = stoptrace_n.newTextChild(None, str(k), None)
+ n = cpu_n.newTextChild(None, "latency", latency)
+ n.newProp('unit', 'us')
+
+ n = cpu_n.newTextChild(None, "latency_percent", percent)
+ n.newProp('unit', '%')
+ elif isinstance(v, list):
+ if k in ("blocking_thread", "softirq_interference", "irq_interference"):
+ for name, latency in v:
+ cpu_n = stoptrace_n.newTextChild(None, k, None)
+ n = cpu_n.newTextChild(None, "name", name)
+ n = cpu_n.newTextChild(None, "latency", latency)
+ n.newProp('unit', 'us')
+ else:
+ if k == "Max_timerlat_IRQ_latency_from_idle":
+ continue
+ cpu_n = stoptrace_n.newTextChild(None, str(k), str(v))
+ cpu_n.newProp('unit', 'us')
+ rep_n.addChild(stoptrace_n)
+
+ self._log(Log.DEBUG, f'timerlat: posttrace = \n{self.__posttrace}')
+ self._log(Log.DEBUG, 'timerlat: posttrace END')
+ for cpu in self.__stdata:
+ for k, v in self.__stdata[cpu].items():
+ if isinstance(v, tuple):
+ continue
+ if k == "Max_timerlat_IRQ_latency_from_idle":
+ max_timerlat_n = libxml2.newNode('max_timerlat_report')
+ max_timerlat_n.newProp("CPU", str(cpu))
+ cpu_n = max_timerlat_n.newTextChild(None, k, str(v))
+ cpu_n.newProp('unit', 'us')
+ rep_n.addChild(max_timerlat_n)
+ return rep_n
+
rep_n.addChild(self.__timerlatdata['system'].MakeReport())
for thr in self.__cpus:
if str(thr) not in self.__timerlatdata:
@@ -341,7 +539,13 @@ def ModuleParameters():
"metavar": "PRIO" },
"buckets": {"descr": "Number of buckets",
"default": 3500,
- "metavar": "NUM"}
+ "metavar": "NUM" },
+ "stoptrace": {"descr": "Stop trace if thread latency higher than USEC",
+ "default": None,
+ "metavar": "USEC" },
+ "trace": {"descr": "File to save trace to",
+ "default": None,
+ "metavar": "FILE" },
}
def create(params, logger):
@@ -367,6 +571,7 @@ if __name__ == '__main__':
cfg_tl = cfg.GetSection('timerlat')
cfg_tl.cpulist = collapse_cpulist(SysTopology().online_cpus())
+ cfg_tl.stoptrace=50
RUNTIME = 10
diff --git a/rteval/rteval_text.xsl b/rteval/rteval_text.xsl
index 2f03bda0bb55..70777354efa5 100644
--- a/rteval/rteval_text.xsl
+++ b/rteval/rteval_text.xsl
@@ -316,14 +316,26 @@
<xsl:value-of select="@command_line"/>
<xsl:text>&#10;&#10;</xsl:text>
- <xsl:apply-templates select="abort_report"/>
+ <xsl:if test="stoptrace_invoked">
+ <xsl:text>rtla timerlat hit stop tracing</xsl:text>
+ <xsl:text>
+</xsl:text>
+ <xsl:apply-templates select="stoptrace_report"/>
+ <xsl:apply-templates select="max_timerlat_report"/>
+ </xsl:if>
+ <!-- Make sure the description is available before printing System: -->
+ <xsl:if test="system/@description">
<xsl:text> System: </xsl:text>
<xsl:value-of select="system/@description"/>
<xsl:text>&#10;</xsl:text>
+ </xsl:if>
+ <!-- If stoptrace_invoked is true, no Statistics are available -->
+ <xsl:if test="stoptrace_invoked != true">
<xsl:text> Statistics: &#10;</xsl:text>
<xsl:apply-templates select="system/statistics"/>
+ </xsl:if>
<!-- Add CPU core info and stats-->
<xsl:apply-templates select="core">
@@ -470,4 +482,171 @@
</xsl:if>
</xsl:template>
+ <!-- Format posttrace information if present -->
+ <xsl:template match="stoptrace_report">
+ <xsl:text>## CPU </xsl:text>
+ <xsl:value-of select="@CPU"/>
+ <xsl:text> hit stop tracing, analyzing it ##</xsl:text>
+ <xsl:text>
+</xsl:text>
+
+
+ <xsl:if test="Previous_IRQ_interference">
+ <xsl:text>Previous IRQ interference: up to </xsl:text>
+ <xsl:value-of select="Previous_IRQ_interference"/>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="Previous_IRQ_interference/@unit"/>
+ <xsl:text>
+</xsl:text>
+ </xsl:if>
+
+ <xsl:if test="IRQ_handler_delay">
+ <xsl:text>IRQ handler delay: </xsl:text>
+ <xsl:value-of select="IRQ_handler_delay/latency"/>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="IRQ_handler_delay/latency/@unit"/>
+ <xsl:text> (</xsl:text>
+ <xsl:value-of select="IRQ_handler_delay/latency_percent"/>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="IRQ_handler_delay/latency_percent/@unit"/>
+ <xsl:text>)
+</xsl:text>
+ </xsl:if>
+
+ <xsl:if test="IRQ_handler_delay_exit_from_idle">
+ <xsl:text>IRQ handler delay: (exit from idle) </xsl:text>
+ <xsl:value-of select="IRQ_handler_delay_exit_from_idle/latency"/>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="IRQ_handler_delay_exit_from_idle/latency/@unit"/>
+ <xsl:text> (</xsl:text>
+ <xsl:value-of select="IRQ_handler_delay_exit_from_idle/latency_percent"/>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="IRQ_handler_delay_exit_from_idle/latency_percent/@unit"/>
+ <xsl:text>)
+</xsl:text>
+ </xsl:if>
+
+ <xsl:text>IRQ latency:</xsl:text>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="IRQ_latency"/>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="IRQ_latency/@unit"/>
+ <xsl:text>
+</xsl:text>
+
+ <xsl:text>Timerlat IRQ duration:</xsl:text>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="Timerlat_IRQ_duration/latency"/>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="Timerlat_IRQ_duration/latency/@unit"/>
+ <xsl:text> (</xsl:text>
+ <xsl:value-of select="Timerlat_IRQ_duration/latency_percent"/>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="Timerlat_IRQ_duration/latency_percent/@unit"/>
+ <xsl:text>)
+</xsl:text>
+
+ <xsl:if test="Blocking_Thread">
+ <xsl:text>Blocking thread:</xsl:text>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="Blocking_Thread/latency"/>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="Blocking_Thread/latency/@unit"/>
+ <xsl:text> (</xsl:text>
+ <xsl:value-of select="Blocking_Thread/latency_percent"/>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="Blocking_Thread/latency_percent/@unit"/>
+ <xsl:text>)
+</xsl:text>
+ </xsl:if>
+
+ <xsl:for-each select="blocking_thread">
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="name"/>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="latency"/>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="latency/@unit"/>
+ <xsl:text>
+</xsl:text>
+ </xsl:for-each>
+
+ <xsl:if test="Softirq_interference">
+ <xsl:text>Softirq interference:</xsl:text>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="Softirq_interference/latency"/>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="Softirq_interference/latency/@unit"/>
+ <xsl:text> (</xsl:text>
+ <xsl:value-of select="Softirq_interference/latency_percent"/>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="Softirq_interference/latency_percent/@unit"/>
+ <xsl:text>)
+</xsl:text>
+ </xsl:if>
+
+ <xsl:for-each select="softirq_interference">
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="name"/>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="latency"/>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="latency/@unit"/>
+ <xsl:text>
+</xsl:text>
+ </xsl:for-each>
+
+ <xsl:if test="IRQ_interference">
+ <xsl:text>IRQ interference:</xsl:text>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="IRQ_interference/latency"/>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="IRQ_interference/latency/@unit"/>
+ <xsl:text> (</xsl:text>
+ <xsl:value-of select="IRQ_interference/latency_percent"/>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="IRQ_interference/latency_percent/@unit"/>
+ <xsl:text>)
+</xsl:text>
+ </xsl:if>
+
+ <xsl:for-each select="irq_interference">
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="name"/>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="latency"/>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="latency/@unit"/>
+ <xsl:text>
+</xsl:text>
+ </xsl:for-each>
+
+ <xsl:text>--------------------------------------------------------------------------------</xsl:text>
+ <xsl:text>Thread latency:</xsl:text>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="Thread_latency/latency"/>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="Thread_latency/latency/@unit"/>
+ <xsl:text> (</xsl:text>
+ <xsl:value-of select="Thread_latency/latency_percent"/>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="Thread_latency/latency_percent/@unit"/>
+ <xsl:text>)
+
+</xsl:text>
+ </xsl:template>
+
+ <xsl:template match="max_timerlat_report">
+ <xsl:text>
+</xsl:text>
+ <xsl:text>Max timerlat IRQ latency from idle:</xsl:text>
+ <xsl:value-of select="Max_timerlat_IRQ_latency_from_idle"/>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="Max_timerlat_IRQ_latency_from_idle/@unit"/>
+ <xsl:text> in cpu </xsl:text>
+ <xsl:value-of select="@CPU"/>
+ <xsl:text>
+</xsl:text>
+ </xsl:template>
+
</xsl:stylesheet>
--
2.45.2