rteval/0001-rteval-timerlat-Add-timerlat-tracing-to-rteval.patch

527 lines
23 KiB
Diff
Raw Normal View History

From 5909521e06ed92ea60ffb247b25ca86c232f71bf Mon Sep 17 00:00:00 2001
From: John Kacur <jkacur@redhat.com>
Date: Thu, 20 Jun 2024 21:34:24 -0400
Subject: [PATCH 1/3] rteval: timerlat: Add timerlat tracing to rteval
This patch adds tracing from timerlat to rteval.
These two options are added
--timerlat-stoptrace USEC
Stop trace if thread latency higher than USEC
--timerlat-trace FILE
File to save trace to
This also stores the summary from rtla timerlat in the rteval xml file.
Signed-off-by: John Kacur <jkacur@redhat.com>
---
rteval/modules/measurement/timerlat.py | 213 ++++++++++++++++++++++++-
rteval/rteval_text.xsl | 181 ++++++++++++++++++++-
2 files changed, 389 insertions(+), 5 deletions(-)
diff --git a/rteval/modules/measurement/timerlat.py b/rteval/modules/measurement/timerlat.py
index e8345fab1ad7..45adec1b33e1 100644
--- a/rteval/modules/measurement/timerlat.py
+++ b/rteval/modules/measurement/timerlat.py
@@ -16,6 +16,7 @@ from rteval.modules import rtevalModulePrototype
from rteval.systopology import cpuinfo, SysTopology
from rteval.cpulist_utils import expand_cpulist, collapse_cpulist
+
class TLRunData:
''' class to store instance data from a timerlat run '''
def __init__(self, coreid, datatype, priority, logfnc):
@@ -188,6 +189,15 @@ class Timerlat(rtevalModulePrototype):
self.__cpus = [str(c) for c in expand_cpulist(self.__cpulist)]
self.__numcores = len(self.__cpus)
+ # Has tracing been triggered
+ self.__stoptrace = False
+ # This stores the output from rtla
+ self.__posttrace = ""
+ # Stop Trace Data
+ self.__stdata = {}
+ # Stop Trace Cpu
+ self.stcpu = -1
+
self.__timerlat_out = None
self.__timerlat_err = None
self.__started = False
@@ -218,10 +228,21 @@ class Timerlat(rtevalModulePrototype):
self.__cmd = ['rtla', 'timerlat', 'hist', '-P', f'f:{int(self.__priority)}', '-u']
self.__cmd.append(f'-c{self.__cpulist}')
self.__cmd.append(f'-E{self.__buckets}')
+
+ if self.__cfg.stoptrace:
+ self.__cmd.append(f"-T{int(self.__cfg.stoptrace)}")
+
+ if self.__cfg.trace:
+ if not self.__cfg.stoptrace:
+ self._log(Log.WARN, f'Ignoring trace={self.__cfg.trace}, because stoptrace not invoked')
+ else:
+ self.__cmd.append(f'-t={self.__cfg.trace}')
+
self._log(Log.DEBUG, f'self.__cmd = {self.__cmd}')
self.__timerlat_out = tempfile.SpooledTemporaryFile(mode='w+b')
self.__timerlat_err = tempfile.SpooledTemporaryFile(mode='w+b')
+
def _WorkloadTask(self):
if self.__started:
return
@@ -252,6 +273,10 @@ class Timerlat(rtevalModulePrototype):
os.kill(self.__timerlat_process.pid, signal.SIGINT)
time.sleep(2)
+ blocking_thread_detected = False
+ softirq_interference_detected = False
+ irq_interference_detected = False
+
# Parse histogram output
self.__timerlat_out.seek(0)
for line in self.__timerlat_out:
@@ -261,6 +286,113 @@ class Timerlat(rtevalModulePrototype):
if not line:
continue
+ # Parsing if stoptrace has been invoked
+ if self.__stoptrace:
+ self.__posttrace += line
+ line = line.strip()
+ fields = line.split()
+ if line.startswith("##") and fields[1] == "CPU":
+ self.stcpu = int(fields[2])
+ self._log(Log.DEBUG, f"self.stcpu = {self.stcpu}")
+ self.__stdata[self.stcpu] = {}
+ continue
+ if self.stcpu == -1:
+ self._log(Log.WARN, "Stop trace has been invoked, but a stop cpu has not been identified.")
+ continue
+ if line.startswith('------------------'):
+ blocking_thread_detected = False
+ softirq_interference_detected = False
+ irq_interference_detected = False
+ continue
+ if line.startswith("Thread latency:"):
+ thread_latency_percent = fields[-1].strip('()%')
+ self._log(Log.DEBUG, f"thread_latency_percent = {thread_latency_percent}")
+ thread_latency = fields[-3]
+ self._log(Log.DEBUG, f"thread_latency = {thread_latency}")
+ self.__stdata[self.stcpu]["Thread_latency"] = (thread_latency, thread_latency_percent)
+ elif line.startswith("Previous IRQ interference"):
+ self._log(Log.DEBUG, f'Previous_IRQ_interference = {fields[-2]}')
+ self.__stdata[self.stcpu]["Previous_IRQ_interference"] = fields[-2]
+ elif line.startswith("IRQ handler delay:"):
+ irq_handler_delay_percent = fields[-2].strip('(')
+ irq_handler_delay = fields[-4]
+ # Do we have (exit from idle)?
+ if fields[3] == '(exit':
+ field_name = "IRQ_handler_delay_exit_from_idle"
+ else:
+ field_name = "IRQ_handler_delay"
+ self._log(Log.DEBUG, f"{field_name} = {irq_handler_delay}")
+ self._log(Log.DEBUG, f"{field_name}_percent = {irq_handler_delay_percent}")
+ self.__stdata[self.stcpu][field_name] = (irq_handler_delay, irq_handler_delay_percent)
+ elif line.startswith("IRQ latency:"):
+ self._log(Log.DEBUG, f"irq_latency = {fields[-2]}")
+ self.__stdata[self.stcpu]["IRQ_latency"] = fields[-2]
+ elif line.startswith("Timerlat IRQ duration"):
+ timerlat_irq_duration_percent = fields[-2].strip('(')
+ self._log(Log.DEBUG, f"timerlat_irq_duration_percent = {timerlat_irq_duration_percent}")
+ timerlat_irq_duration = fields[-4]
+ self._log(Log.DEBUG, f"timerlat_irq_duration = {timerlat_irq_duration}")
+ self.__stdata[self.stcpu]["Timerlat_IRQ_duration"] = (timerlat_irq_duration, timerlat_irq_duration_percent)
+ elif line.startswith("Blocking thread:"):
+ blocking_thread_percent = fields[-2].strip('(')
+ self._log(Log.DEBUG, f"blocking_thread_percent = {blocking_thread_percent}")
+ blocking_thread = fields[-4]
+ self._log(Log.DEBUG, f"blocking_thread = {blocking_thread}")
+ self.__stdata[self.stcpu]["Blocking_Thread"] = (blocking_thread, blocking_thread_percent)
+ blocking_thread_detected = True
+ irq_interference_detected = False
+ softirq_interference_detected = False
+ elif line.startswith("IRQ interference"):
+ irq_interference_percent = fields[-2].strip('(')
+ self._log(Log.DEBUG, f"irq_interference_percent = {irq_interference_percent}")
+ irq_interference = fields[-4]
+ self._log(Log.DEBUG, f"irq_interference = {irq_interference}")
+ self.__stdata[self.stcpu]["IRQ_interference"] = (irq_interference, irq_interference_percent)
+ blocking_thread_detected = False
+ irq_interference_detected = True
+ softirq_interference_detected = False
+ elif line.startswith("Softirq interference"):
+ softirq_interference_percent = fields[-2].strip('(')
+ self._log(Log.DEBUG, f"softirq_interference_percent = {softirq_interference_percent}")
+ softirq_interference = fields[-4]
+ self._log(Log.DEBUG, f"softirq_interference = {softirq_interference}")
+ self.__stdata[self.stcpu]["Softirq_interference"] = (softirq_interference, softirq_interference_percent)
+ blocking_thread_detected = False
+ irq_interference_detected = False
+ softirq_interference_detected = True
+ elif blocking_thread_detected:
+ self._log(Log.DEBUG, f'line={line}')
+ blocking_thread = " ".join(fields[0:-2])
+ self._log(Log.DEBUG, f"blocking_thread = {blocking_thread}")
+ blocking_threadus = fields[-2]
+ self._log(Log.DEBUG, f"blocking_threadus = {blocking_threadus}")
+ self.__stdata[self.stcpu].setdefault("blocking_thread", [])
+ self.__stdata[self.stcpu]["blocking_thread"] += [(blocking_thread, blocking_threadus)]
+ elif softirq_interference_detected:
+ softirq = " ".join(fields[0:-2])
+ softirq_latency = fields[-2]
+ self._log(Log.DEBUG, f'softirq = {softirq}')
+ self._log(Log.DEBUG, f'softirq_latency = {softirq_latency}')
+ self.__stdata[self.stcpu].setdefault("softirq_interference", [])
+ self.__stdata[self.stcpu]["softirq_interference"] += [(softirq, softirq_latency)]
+ elif irq_interference_detected:
+ irq_interference_name = " ".join(fields[0:-2])
+ irq_interference_latency = fields[-2]
+ self._log(Log.DEBUG, f'irq_interference = {irq_interference_name}, latency = {irq_interference_latency}')
+ self.__stdata[self.stcpu].setdefault("irq_interference", [])
+ self.__stdata[self.stcpu]["irq_interference"] += [(irq_interference_name, irq_interference_latency)]
+ elif line.startswith("Max timerlat IRQ latency"):
+ self._log(Log.DEBUG, f"line={line}")
+ max_timerlat_irq_latency = fields[-5]
+ self._log(Log.DEBUG, f"max_timerlat_irq_latency = {max_timerlat_irq_latency}")
+ max_timerlat_cpu = int(fields[-1])
+ self._log(Log.DEBUG, f"max_timerlat_cpu = {max_timerlat_cpu}")
+ self.__stdata.setdefault(max_timerlat_cpu, {})
+ self.__stdata[max_timerlat_cpu]["Max_timerlat_IRQ_latency_from_idle"] = max_timerlat_irq_latency
+ else:
+ self._log(Log.DEBUG, f'line = {line}')
+ continue
+
if line.startswith('#'):
if line.startswith('# Duration:'):
duration = line.split()[2]
@@ -285,9 +417,13 @@ class Timerlat(rtevalModulePrototype):
elif line.startswith('max:'):
#print(line)
continue
+ elif line.startswith('rtla timerlat hit stop tracing'):
+ self.__stoptrace = True
+ self.__posttrace += line
+ continue
else:
- pass
#print(line)
+ pass
vals = line.split()
if not vals:
@@ -301,12 +437,18 @@ class Timerlat(rtevalModulePrototype):
continue
for i, core in enumerate(self.__cpus):
- self.__timerlatdata[core].bucket(index, int(vals[i*3+1]),
+ # There might not be a count on every cpu if tracing invoked
+ if i*3 + 1 >= len(vals):
+ self.__timerlatdata[core].bucket(index, 0, 0, 0)
+ self.__timerlatdata['system'].bucket(index, 0, 0, 0)
+ else:
+ self.__timerlatdata[core].bucket(index, int(vals[i*3+1]),
int(vals[i*3+2]),
int(vals[i*3+3]))
- self.__timerlatdata['system'].bucket(index, int(vals[i*3+1]),
+ self.__timerlatdata['system'].bucket(index, int(vals[i*3+1]),
int(vals[i*3+2]),
int(vals[i*3+3]))
+
# Generate statistics for each RunData object
for n in list(self.__timerlatdata.keys()):
self.__timerlatdata[n].reduce()
@@ -320,6 +462,62 @@ class Timerlat(rtevalModulePrototype):
rep_n = libxml2.newNode('timerlat')
rep_n.newProp('command_line', ' '.join(self.__cmd))
+ stoptrace_invoked_n = libxml2.newNode('stoptrace_invoked')
+ if self.stcpu != -1:
+ stoptrace_invoked_n.newProp("invoked", "true")
+ else:
+ stoptrace_invoked_n.newProp("invoked", "")
+ rep_n.addChild(stoptrace_invoked_n)
+
+ if self.stcpu != -1:
+ self._log(Log.DEBUG, f'self.__stdata = {self.__stdata}')
+ for cpu in self.__stdata:
+ # This is Max timerlat IRQ latency from idle
+ # With no other data from that cpu, so don't create a
+ # stoptrace_report for this
+ if len(self.__stdata[cpu]) == 1:
+ continue
+ stoptrace_n = libxml2.newNode('stoptrace_report')
+ stoptrace_n.newProp("CPU", str(cpu))
+ for k, v in self.__stdata[cpu].items():
+ self._log(Log.DEBUG, f"cpu={cpu}, k={k}, v={v}")
+ if isinstance(v, tuple):
+ latency = str(v[0])
+ percent = str(v[1])
+ cpu_n = stoptrace_n.newTextChild(None, str(k), None)
+ n = cpu_n.newTextChild(None, "latency", latency)
+ n.newProp('unit', 'us')
+
+ n = cpu_n.newTextChild(None, "latency_percent", percent)
+ n.newProp('unit', '%')
+ elif isinstance(v, list):
+ if k in ("blocking_thread", "softirq_interference", "irq_interference"):
+ for name, latency in v:
+ cpu_n = stoptrace_n.newTextChild(None, k, None)
+ n = cpu_n.newTextChild(None, "name", name)
+ n = cpu_n.newTextChild(None, "latency", latency)
+ n.newProp('unit', 'us')
+ else:
+ if k == "Max_timerlat_IRQ_latency_from_idle":
+ continue
+ cpu_n = stoptrace_n.newTextChild(None, str(k), str(v))
+ cpu_n.newProp('unit', 'us')
+ rep_n.addChild(stoptrace_n)
+
+ self._log(Log.DEBUG, f'timerlat: posttrace = \n{self.__posttrace}')
+ self._log(Log.DEBUG, 'timerlat: posttrace END')
+ for cpu in self.__stdata:
+ for k, v in self.__stdata[cpu].items():
+ if isinstance(v, tuple):
+ continue
+ if k == "Max_timerlat_IRQ_latency_from_idle":
+ max_timerlat_n = libxml2.newNode('max_timerlat_report')
+ max_timerlat_n.newProp("CPU", str(cpu))
+ cpu_n = max_timerlat_n.newTextChild(None, k, str(v))
+ cpu_n.newProp('unit', 'us')
+ rep_n.addChild(max_timerlat_n)
+ return rep_n
+
rep_n.addChild(self.__timerlatdata['system'].MakeReport())
for thr in self.__cpus:
if str(thr) not in self.__timerlatdata:
@@ -341,7 +539,13 @@ def ModuleParameters():
"metavar": "PRIO" },
"buckets": {"descr": "Number of buckets",
"default": 3500,
- "metavar": "NUM"}
+ "metavar": "NUM" },
+ "stoptrace": {"descr": "Stop trace if thread latency higher than USEC",
+ "default": None,
+ "metavar": "USEC" },
+ "trace": {"descr": "File to save trace to",
+ "default": None,
+ "metavar": "FILE" },
}
def create(params, logger):
@@ -367,6 +571,7 @@ if __name__ == '__main__':
cfg_tl = cfg.GetSection('timerlat')
cfg_tl.cpulist = collapse_cpulist(SysTopology().online_cpus())
+ cfg_tl.stoptrace=50
RUNTIME = 10
diff --git a/rteval/rteval_text.xsl b/rteval/rteval_text.xsl
index 2f03bda0bb55..70777354efa5 100644
--- a/rteval/rteval_text.xsl
+++ b/rteval/rteval_text.xsl
@@ -316,14 +316,26 @@
<xsl:value-of select="@command_line"/>
<xsl:text>&#10;&#10;</xsl:text>
- <xsl:apply-templates select="abort_report"/>
+ <xsl:if test="stoptrace_invoked">
+ <xsl:text>rtla timerlat hit stop tracing</xsl:text>
+ <xsl:text>
+</xsl:text>
+ <xsl:apply-templates select="stoptrace_report"/>
+ <xsl:apply-templates select="max_timerlat_report"/>
+ </xsl:if>
+ <!-- Make sure the description is available before printing System: -->
+ <xsl:if test="system/@description">
<xsl:text> System: </xsl:text>
<xsl:value-of select="system/@description"/>
<xsl:text>&#10;</xsl:text>
+ </xsl:if>
+ <!-- If stoptrace_invoked is true, no Statistics are available -->
+ <xsl:if test="stoptrace_invoked != true">
<xsl:text> Statistics: &#10;</xsl:text>
<xsl:apply-templates select="system/statistics"/>
+ </xsl:if>
<!-- Add CPU core info and stats-->
<xsl:apply-templates select="core">
@@ -470,4 +482,171 @@
</xsl:if>
</xsl:template>
+ <!-- Format posttrace information if present -->
+ <xsl:template match="stoptrace_report">
+ <xsl:text>## CPU </xsl:text>
+ <xsl:value-of select="@CPU"/>
+ <xsl:text> hit stop tracing, analyzing it ##</xsl:text>
+ <xsl:text>
+</xsl:text>
+
+
+ <xsl:if test="Previous_IRQ_interference">
+ <xsl:text>Previous IRQ interference: up to </xsl:text>
+ <xsl:value-of select="Previous_IRQ_interference"/>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="Previous_IRQ_interference/@unit"/>
+ <xsl:text>
+</xsl:text>
+ </xsl:if>
+
+ <xsl:if test="IRQ_handler_delay">
+ <xsl:text>IRQ handler delay: </xsl:text>
+ <xsl:value-of select="IRQ_handler_delay/latency"/>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="IRQ_handler_delay/latency/@unit"/>
+ <xsl:text> (</xsl:text>
+ <xsl:value-of select="IRQ_handler_delay/latency_percent"/>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="IRQ_handler_delay/latency_percent/@unit"/>
+ <xsl:text>)
+</xsl:text>
+ </xsl:if>
+
+ <xsl:if test="IRQ_handler_delay_exit_from_idle">
+ <xsl:text>IRQ handler delay: (exit from idle) </xsl:text>
+ <xsl:value-of select="IRQ_handler_delay_exit_from_idle/latency"/>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="IRQ_handler_delay_exit_from_idle/latency/@unit"/>
+ <xsl:text> (</xsl:text>
+ <xsl:value-of select="IRQ_handler_delay_exit_from_idle/latency_percent"/>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="IRQ_handler_delay_exit_from_idle/latency_percent/@unit"/>
+ <xsl:text>)
+</xsl:text>
+ </xsl:if>
+
+ <xsl:text>IRQ latency:</xsl:text>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="IRQ_latency"/>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="IRQ_latency/@unit"/>
+ <xsl:text>
+</xsl:text>
+
+ <xsl:text>Timerlat IRQ duration:</xsl:text>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="Timerlat_IRQ_duration/latency"/>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="Timerlat_IRQ_duration/latency/@unit"/>
+ <xsl:text> (</xsl:text>
+ <xsl:value-of select="Timerlat_IRQ_duration/latency_percent"/>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="Timerlat_IRQ_duration/latency_percent/@unit"/>
+ <xsl:text>)
+</xsl:text>
+
+ <xsl:if test="Blocking_Thread">
+ <xsl:text>Blocking thread:</xsl:text>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="Blocking_Thread/latency"/>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="Blocking_Thread/latency/@unit"/>
+ <xsl:text> (</xsl:text>
+ <xsl:value-of select="Blocking_Thread/latency_percent"/>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="Blocking_Thread/latency_percent/@unit"/>
+ <xsl:text>)
+</xsl:text>
+ </xsl:if>
+
+ <xsl:for-each select="blocking_thread">
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="name"/>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="latency"/>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="latency/@unit"/>
+ <xsl:text>
+</xsl:text>
+ </xsl:for-each>
+
+ <xsl:if test="Softirq_interference">
+ <xsl:text>Softirq interference:</xsl:text>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="Softirq_interference/latency"/>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="Softirq_interference/latency/@unit"/>
+ <xsl:text> (</xsl:text>
+ <xsl:value-of select="Softirq_interference/latency_percent"/>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="Softirq_interference/latency_percent/@unit"/>
+ <xsl:text>)
+</xsl:text>
+ </xsl:if>
+
+ <xsl:for-each select="softirq_interference">
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="name"/>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="latency"/>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="latency/@unit"/>
+ <xsl:text>
+</xsl:text>
+ </xsl:for-each>
+
+ <xsl:if test="IRQ_interference">
+ <xsl:text>IRQ interference:</xsl:text>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="IRQ_interference/latency"/>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="IRQ_interference/latency/@unit"/>
+ <xsl:text> (</xsl:text>
+ <xsl:value-of select="IRQ_interference/latency_percent"/>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="IRQ_interference/latency_percent/@unit"/>
+ <xsl:text>)
+</xsl:text>
+ </xsl:if>
+
+ <xsl:for-each select="irq_interference">
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="name"/>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="latency"/>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="latency/@unit"/>
+ <xsl:text>
+</xsl:text>
+ </xsl:for-each>
+
+ <xsl:text>--------------------------------------------------------------------------------</xsl:text>
+ <xsl:text>Thread latency:</xsl:text>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="Thread_latency/latency"/>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="Thread_latency/latency/@unit"/>
+ <xsl:text> (</xsl:text>
+ <xsl:value-of select="Thread_latency/latency_percent"/>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="Thread_latency/latency_percent/@unit"/>
+ <xsl:text>)
+
+</xsl:text>
+ </xsl:template>
+
+ <xsl:template match="max_timerlat_report">
+ <xsl:text>
+</xsl:text>
+ <xsl:text>Max timerlat IRQ latency from idle:</xsl:text>
+ <xsl:value-of select="Max_timerlat_IRQ_latency_from_idle"/>
+ <xsl:text> </xsl:text>
+ <xsl:value-of select="Max_timerlat_IRQ_latency_from_idle/@unit"/>
+ <xsl:text> in cpu </xsl:text>
+ <xsl:value-of select="@CPU"/>
+ <xsl:text>
+</xsl:text>
+ </xsl:template>
+
</xsl:stylesheet>
--
2.45.2