rteval/0001-rteval-timerlat-Add-timerlat-tracing-to-rteval.patch

From 5909521e06ed92ea60ffb247b25ca86c232f71bf Mon Sep 17 00:00:00 2001
From: John Kacur <jkacur@redhat.com>
Date: Thu, 20 Jun 2024 21:34:24 -0400
Subject: [PATCH 1/3] rteval: timerlat: Add timerlat tracing to rteval

This patch adds tracing from timerlat to rteval.

These two options are added

  --timerlat-stoptrace USEC
                        Stop trace if thread latency higher than USEC
  --timerlat-trace FILE
                        File to save trace to

This also stores the summary from rtla timerlat in the rteval xml file.

Signed-off-by: John Kacur <jkacur@redhat.com>
---
 rteval/modules/measurement/timerlat.py | 213 ++++++++++++++++++++++++-
 rteval/rteval_text.xsl                 | 181 ++++++++++++++++++++-
 2 files changed, 389 insertions(+), 5 deletions(-)

diff --git a/rteval/modules/measurement/timerlat.py b/rteval/modules/measurement/timerlat.py
index e8345fab1ad7..45adec1b33e1 100644
--- a/rteval/modules/measurement/timerlat.py
+++ b/rteval/modules/measurement/timerlat.py
@@ -16,6 +16,7 @@ from rteval.modules import rtevalModulePrototype
 from rteval.systopology import cpuinfo, SysTopology
 from rteval.cpulist_utils import expand_cpulist, collapse_cpulist

+
 class TLRunData:
     ''' class to store instance data from a timerlat run '''
     def __init__(self, coreid, datatype, priority, logfnc):
@@ -188,6 +189,15 @@ class Timerlat(rtevalModulePrototype):
         self.__cpus = [str(c) for c in expand_cpulist(self.__cpulist)]
         self.__numcores = len(self.__cpus)

+        # Has tracing been triggered
+        self.__stoptrace = False
+        # This stores the output from rtla
+        self.__posttrace = ""
+        # Stop Trace Data
+        self.__stdata = {}
+        # Stop Trace Cpu
+        self.stcpu = -1
+
         self.__timerlat_out = None
         self.__timerlat_err = None
         self.__started = False
@@ -218,10 +228,21 @@ class Timerlat(rtevalModulePrototype):
         self.__cmd = ['rtla', 'timerlat', 'hist', '-P', f'f:{int(self.__priority)}', '-u']
         self.__cmd.append(f'-c{self.__cpulist}')
         self.__cmd.append(f'-E{self.__buckets}')
+
+        if self.__cfg.stoptrace:
+            self.__cmd.append(f"-T{int(self.__cfg.stoptrace)}")
+
+        if self.__cfg.trace:
+            if not self.__cfg.stoptrace:
+                self._log(Log.WARN, f'Ignoring trace={self.__cfg.trace}, because stoptrace not invoked')
+            else:
+                self.__cmd.append(f'-t={self.__cfg.trace}')
+
         self._log(Log.DEBUG, f'self.__cmd = {self.__cmd}')
         self.__timerlat_out = tempfile.SpooledTemporaryFile(mode='w+b')
         self.__timerlat_err = tempfile.SpooledTemporaryFile(mode='w+b')

+
     def _WorkloadTask(self):
         if self.__started:
             return
@@ -252,6 +273,10 @@ class Timerlat(rtevalModulePrototype):
             os.kill(self.__timerlat_process.pid, signal.SIGINT)
             time.sleep(2)

+        blocking_thread_detected = False
+        softirq_interference_detected = False
+        irq_interference_detected = False
+
         # Parse histogram output
         self.__timerlat_out.seek(0)
         for line in self.__timerlat_out:
@@ -261,6 +286,113 @@ class Timerlat(rtevalModulePrototype):
             if not line:
                 continue

+            # Parsing if stoptrace has been invoked
+            if self.__stoptrace:
+                self.__posttrace += line
+                line = line.strip()
+                fields = line.split()
+                if line.startswith("##") and fields[1] == "CPU":
+                    self.stcpu = int(fields[2])
+                    self._log(Log.DEBUG, f"self.stcpu = {self.stcpu}")
+                    self.__stdata[self.stcpu] = {}
+                    continue
+                if self.stcpu == -1:
+                    self._log(Log.WARN, "Stop trace has been invoked, but a stop cpu has not been identified.")
+                    continue
+                if line.startswith('------------------'):
+                    blocking_thread_detected = False
+                    softirq_interference_detected = False
+                    irq_interference_detected = False
+                    continue
+                if line.startswith("Thread latency:"):
+                    thread_latency_percent = fields[-1].strip('()%')
+                    self._log(Log.DEBUG, f"thread_latency_percent = {thread_latency_percent}")
+                    thread_latency = fields[-3]
+                    self._log(Log.DEBUG, f"thread_latency = {thread_latency}")
+                    self.__stdata[self.stcpu]["Thread_latency"] = (thread_latency, thread_latency_percent)
+                elif line.startswith("Previous IRQ interference"):
+                    self._log(Log.DEBUG, f'Previous_IRQ_interference = {fields[-2]}')
+                    self.__stdata[self.stcpu]["Previous_IRQ_interference"] = fields[-2]
+                elif line.startswith("IRQ handler delay:"):
+                    irq_handler_delay_percent = fields[-2].strip('(')
+                    irq_handler_delay = fields[-4]
+                    # Do we have (exit from idle)?
+                    if fields[3] == '(exit':
+                        field_name = "IRQ_handler_delay_exit_from_idle"
+                    else:
+                        field_name = "IRQ_handler_delay"
+                    self._log(Log.DEBUG, f"{field_name} = {irq_handler_delay}")
+                    self._log(Log.DEBUG, f"{field_name}_percent = {irq_handler_delay_percent}")
+                    self.__stdata[self.stcpu][field_name] = (irq_handler_delay, irq_handler_delay_percent)
+                elif line.startswith("IRQ latency:"):
+                    self._log(Log.DEBUG, f"irq_latency = {fields[-2]}")
+                    self.__stdata[self.stcpu]["IRQ_latency"] = fields[-2]
+                elif line.startswith("Timerlat IRQ duration"):
+                    timerlat_irq_duration_percent = fields[-2].strip('(')
+                    self._log(Log.DEBUG, f"timerlat_irq_duration_percent = {timerlat_irq_duration_percent}")
+                    timerlat_irq_duration = fields[-4]
+                    self._log(Log.DEBUG, f"timerlat_irq_duration = {timerlat_irq_duration}")
+                    self.__stdata[self.stcpu]["Timerlat_IRQ_duration"] = (timerlat_irq_duration, timerlat_irq_duration_percent)
+                elif line.startswith("Blocking thread:"):
+                    blocking_thread_percent = fields[-2].strip('(')
+                    self._log(Log.DEBUG, f"blocking_thread_percent = {blocking_thread_percent}")
+                    blocking_thread = fields[-4]
+                    self._log(Log.DEBUG, f"blocking_thread = {blocking_thread}")
+                    self.__stdata[self.stcpu]["Blocking_Thread"] = (blocking_thread, blocking_thread_percent)
+                    blocking_thread_detected = True
+                    irq_interference_detected = False
+                    softirq_interference_detected = False
+                elif line.startswith("IRQ interference"):
+                    irq_interference_percent = fields[-2].strip('(')
+                    self._log(Log.DEBUG, f"irq_interference_percent = {irq_interference_percent}")
+                    irq_interference = fields[-4]
+                    self._log(Log.DEBUG, f"irq_interference = {irq_interference}")
+                    self.__stdata[self.stcpu]["IRQ_interference"] = (irq_interference, irq_interference_percent)
+                    blocking_thread_detected = False
+                    irq_interference_detected = True
+                    softirq_interference_detected = False
+                elif line.startswith("Softirq interference"):
+                    softirq_interference_percent = fields[-2].strip('(')
+                    self._log(Log.DEBUG, f"softirq_interference_percent = {softirq_interference_percent}")
+                    softirq_interference = fields[-4]
+                    self._log(Log.DEBUG, f"softirq_interference = {softirq_interference}")
+                    self.__stdata[self.stcpu]["Softirq_interference"] = (softirq_interference, softirq_interference_percent)
+                    blocking_thread_detected = False
+                    irq_interference_detected = False
+                    softirq_interference_detected = True
+                elif blocking_thread_detected:
+                    self._log(Log.DEBUG, f'line={line}')
+                    blocking_thread = " ".join(fields[0:-2])
+                    self._log(Log.DEBUG, f"blocking_thread = {blocking_thread}")
+                    blocking_threadus = fields[-2]
+                    self._log(Log.DEBUG, f"blocking_threadus = {blocking_threadus}")
+                    self.__stdata[self.stcpu].setdefault("blocking_thread", [])
+                    self.__stdata[self.stcpu]["blocking_thread"] += [(blocking_thread, blocking_threadus)]
+                elif softirq_interference_detected:
+                    softirq = " ".join(fields[0:-2])
+                    softirq_latency = fields[-2]
+                    self._log(Log.DEBUG, f'softirq = {softirq}')
+                    self._log(Log.DEBUG, f'softirq_latency = {softirq_latency}')
+                    self.__stdata[self.stcpu].setdefault("softirq_interference", [])
+                    self.__stdata[self.stcpu]["softirq_interference"] += [(softirq, softirq_latency)]
+                elif irq_interference_detected:
+                    irq_interference_name = " ".join(fields[0:-2])
+                    irq_interference_latency = fields[-2]
+                    self._log(Log.DEBUG, f'irq_interference = {irq_interference_name}, latency = {irq_interference_latency}')
+                    self.__stdata[self.stcpu].setdefault("irq_interference", [])
+                    self.__stdata[self.stcpu]["irq_interference"] += [(irq_interference_name, irq_interference_latency)]
+                elif line.startswith("Max timerlat IRQ latency"):
+                    self._log(Log.DEBUG, f"line={line}")
+                    max_timerlat_irq_latency = fields[-5]
+                    self._log(Log.DEBUG, f"max_timerlat_irq_latency = {max_timerlat_irq_latency}")
+                    max_timerlat_cpu = int(fields[-1])
+                    self._log(Log.DEBUG, f"max_timerlat_cpu = {max_timerlat_cpu}")
+                    self.__stdata.setdefault(max_timerlat_cpu, {})
+                    self.__stdata[max_timerlat_cpu]["Max_timerlat_IRQ_latency_from_idle"] = max_timerlat_irq_latency
+                else:
+                    self._log(Log.DEBUG, f'line = {line}')
+                continue
+
             if line.startswith('#'):
                 if line.startswith('# Duration:'):
                     duration = line.split()[2]
@@ -285,9 +417,13 @@ class Timerlat(rtevalModulePrototype):
             elif line.startswith('max:'):
                 #print(line)
                 continue
+            elif line.startswith('rtla timerlat hit stop tracing'):
+                self.__stoptrace = True
+                self.__posttrace += line
+                continue
             else:
-                pass
                 #print(line)
+                pass

             vals = line.split()
             if not vals:
@@ -301,12 +437,18 @@ class Timerlat(rtevalModulePrototype):
                 continue

             for i, core in enumerate(self.__cpus):
-                self.__timerlatdata[core].bucket(index, int(vals[i*3+1]),
+                # There might not be a count on every cpu if tracing invoked
+                if i*3 + 1 >= len(vals):
+                    self.__timerlatdata[core].bucket(index, 0, 0, 0)
+                    self.__timerlatdata['system'].bucket(index, 0, 0, 0)
+                else:
+                    self.__timerlatdata[core].bucket(index, int(vals[i*3+1]),
                                                  int(vals[i*3+2]),
                                                  int(vals[i*3+3]))
-                self.__timerlatdata['system'].bucket(index, int(vals[i*3+1]),
+                    self.__timerlatdata['system'].bucket(index, int(vals[i*3+1]),
                                                  int(vals[i*3+2]),
                                                  int(vals[i*3+3]))
+
         # Generate statistics for each RunData object
         for n in list(self.__timerlatdata.keys()):
             self.__timerlatdata[n].reduce()
@@ -320,6 +462,62 @@ class Timerlat(rtevalModulePrototype):
         rep_n = libxml2.newNode('timerlat')
         rep_n.newProp('command_line', ' '.join(self.__cmd))

+        stoptrace_invoked_n = libxml2.newNode('stoptrace_invoked')
+        if self.stcpu != -1:
+            stoptrace_invoked_n.newProp("invoked", "true")
+        else:
+            stoptrace_invoked_n.newProp("invoked", "")
+        rep_n.addChild(stoptrace_invoked_n)
+
+        if self.stcpu != -1:
+            self._log(Log.DEBUG, f'self.__stdata = {self.__stdata}')
+            for cpu in self.__stdata:
+                # This is  Max timerlat IRQ latency from idle
+                # With no other data from that cpu, so don't create a
+                # stoptrace_report for this
+                if len(self.__stdata[cpu]) == 1:
+                    continue
+                stoptrace_n = libxml2.newNode('stoptrace_report')
+                stoptrace_n.newProp("CPU", str(cpu))
+                for k, v in self.__stdata[cpu].items():
+                    self._log(Log.DEBUG, f"cpu={cpu}, k={k}, v={v}")
+                    if isinstance(v, tuple):
+                        latency = str(v[0])
+                        percent = str(v[1])
+                        cpu_n = stoptrace_n.newTextChild(None, str(k), None)
+                        n = cpu_n.newTextChild(None, "latency", latency)
+                        n.newProp('unit', 'us')
+
+                        n = cpu_n.newTextChild(None, "latency_percent", percent)
+                        n.newProp('unit', '%')
+                    elif isinstance(v, list):
+                        if k in ("blocking_thread", "softirq_interference", "irq_interference"):
+                            for name, latency in v:
+                                cpu_n = stoptrace_n.newTextChild(None, k, None)
+                                n = cpu_n.newTextChild(None, "name", name)
+                                n = cpu_n.newTextChild(None, "latency", latency)
+                                n.newProp('unit', 'us')
+                    else:
+                        if k == "Max_timerlat_IRQ_latency_from_idle":
+                            continue
+                        cpu_n = stoptrace_n.newTextChild(None, str(k), str(v))
+                        cpu_n.newProp('unit', 'us')
+                rep_n.addChild(stoptrace_n)
+
+            self._log(Log.DEBUG, f'timerlat: posttrace = \n{self.__posttrace}')
+            self._log(Log.DEBUG, 'timerlat: posttrace END')
+            for cpu in self.__stdata:
+                for k, v in self.__stdata[cpu].items():
+                    if isinstance(v, tuple):
+                        continue
+                    if k == "Max_timerlat_IRQ_latency_from_idle":
+                        max_timerlat_n = libxml2.newNode('max_timerlat_report')
+                        max_timerlat_n.newProp("CPU", str(cpu))
+                        cpu_n = max_timerlat_n.newTextChild(None, k, str(v))
+                        cpu_n.newProp('unit', 'us')
+                        rep_n.addChild(max_timerlat_n)
+            return rep_n
+
         rep_n.addChild(self.__timerlatdata['system'].MakeReport())
         for thr in self.__cpus:
             if str(thr) not in self.__timerlatdata:
@@ -341,7 +539,13 @@ def ModuleParameters():
                          "metavar": "PRIO" },
             "buckets":  {"descr": "Number of buckets",
                          "default": 3500,
-                         "metavar": "NUM"}
+                         "metavar": "NUM" },
+            "stoptrace": {"descr": "Stop trace if thread latency higher than USEC",
+                          "default": None,
+                          "metavar": "USEC" },
+            "trace":    {"descr": "File to save trace to",
+                         "default": None,
+                         "metavar": "FILE" },
            }

 def create(params, logger):
@@ -367,6 +571,7 @@ if __name__ == '__main__':

     cfg_tl = cfg.GetSection('timerlat')
     cfg_tl.cpulist = collapse_cpulist(SysTopology().online_cpus())
+    cfg_tl.stoptrace=50

     RUNTIME = 10

diff --git a/rteval/rteval_text.xsl b/rteval/rteval_text.xsl
index 2f03bda0bb55..70777354efa5 100644
--- a/rteval/rteval_text.xsl
+++ b/rteval/rteval_text.xsl
@@ -316,14 +316,26 @@
     <xsl:value-of select="@command_line"/>
     <xsl:text>&#10;&#10;</xsl:text>

-    <xsl:apply-templates select="abort_report"/>
+    <xsl:if test="stoptrace_invoked">
+     <xsl:text>rtla timerlat hit stop tracing</xsl:text>
+     <xsl:text>
+</xsl:text>
+    <xsl:apply-templates select="stoptrace_report"/>
+    <xsl:apply-templates select="max_timerlat_report"/>
+    </xsl:if>

+    <!-- Make sure the description is available before printing System: -->
+    <xsl:if test="system/@description">
     <xsl:text>          System:  </xsl:text>
     <xsl:value-of select="system/@description"/>
     <xsl:text>&#10;</xsl:text>
+    </xsl:if>

+    <!-- If stoptrace_invoked is true, no Statistics are available -->
+    <xsl:if test="stoptrace_invoked != true">
     <xsl:text>          Statistics: &#10;</xsl:text>
     <xsl:apply-templates select="system/statistics"/>
+    </xsl:if>

     <!-- Add CPU core info and stats-->
     <xsl:apply-templates select="core">
@@ -470,4 +482,171 @@
       </xsl:if>
   </xsl:template>

+  <!-- Format posttrace information if present -->
+  <xsl:template match="stoptrace_report">
+     <xsl:text>## CPU </xsl:text>
+     <xsl:value-of select="@CPU"/>
+     <xsl:text> hit stop tracing, analyzing it ##</xsl:text>
+     <xsl:text>
+</xsl:text>
+
+
+     <xsl:if test="Previous_IRQ_interference">
+     <xsl:text>Previous IRQ interference:			 up to       </xsl:text>
+     <xsl:value-of select="Previous_IRQ_interference"/>
+     <xsl:text> </xsl:text>
+     <xsl:value-of select="Previous_IRQ_interference/@unit"/>
+     <xsl:text>
+</xsl:text>
+     </xsl:if>
+
+     <xsl:if test="IRQ_handler_delay">
+     <xsl:text>IRQ handler delay:		                	</xsl:text>
+     <xsl:value-of select="IRQ_handler_delay/latency"/>
+     <xsl:text> </xsl:text>
+     <xsl:value-of select="IRQ_handler_delay/latency/@unit"/>
+     <xsl:text> (</xsl:text>
+     <xsl:value-of select="IRQ_handler_delay/latency_percent"/>
+     <xsl:text> </xsl:text>
+     <xsl:value-of select="IRQ_handler_delay/latency_percent/@unit"/>
+     <xsl:text>)
+</xsl:text>
+     </xsl:if>
+
+     <xsl:if test="IRQ_handler_delay_exit_from_idle">
+     <xsl:text>IRQ handler delay:		(exit from idle)	</xsl:text>
+     <xsl:value-of select="IRQ_handler_delay_exit_from_idle/latency"/>
+     <xsl:text> </xsl:text>
+     <xsl:value-of select="IRQ_handler_delay_exit_from_idle/latency/@unit"/>
+     <xsl:text> (</xsl:text>
+     <xsl:value-of select="IRQ_handler_delay_exit_from_idle/latency_percent"/>
+     <xsl:text> </xsl:text>
+     <xsl:value-of select="IRQ_handler_delay_exit_from_idle/latency_percent/@unit"/>
+     <xsl:text>)
+</xsl:text>
+     </xsl:if>
+
+     <xsl:text>IRQ latency:</xsl:text>
+     <xsl:text>						</xsl:text>
+     <xsl:value-of select="IRQ_latency"/>
+     <xsl:text> </xsl:text>
+     <xsl:value-of select="IRQ_latency/@unit"/>
+     <xsl:text>
+</xsl:text>
+
+     <xsl:text>Timerlat IRQ duration:</xsl:text>
+     <xsl:text>					</xsl:text>
+     <xsl:value-of select="Timerlat_IRQ_duration/latency"/>
+     <xsl:text> </xsl:text>
+     <xsl:value-of select="Timerlat_IRQ_duration/latency/@unit"/>
+     <xsl:text> (</xsl:text>
+     <xsl:value-of select="Timerlat_IRQ_duration/latency_percent"/>
+     <xsl:text> </xsl:text>
+     <xsl:value-of select="Timerlat_IRQ_duration/latency_percent/@unit"/>
+     <xsl:text>)
+</xsl:text>
+
+     <xsl:if test="Blocking_Thread">
+     <xsl:text>Blocking thread:</xsl:text>
+     <xsl:text>					</xsl:text>
+     <xsl:value-of select="Blocking_Thread/latency"/>
+     <xsl:text> </xsl:text>
+     <xsl:value-of select="Blocking_Thread/latency/@unit"/>
+     <xsl:text> (</xsl:text>
+     <xsl:value-of select="Blocking_Thread/latency_percent"/>
+     <xsl:text> </xsl:text>
+     <xsl:value-of select="Blocking_Thread/latency_percent/@unit"/>
+     <xsl:text>)
+</xsl:text>
+     </xsl:if>
+
+     <xsl:for-each select="blocking_thread">
+	     <xsl:text>			</xsl:text>
+	     <xsl:value-of select="name"/>
+	     <xsl:text>			</xsl:text>
+	     <xsl:value-of select="latency"/>
+             <xsl:text> </xsl:text>
+	     <xsl:value-of select="latency/@unit"/>
+             <xsl:text>
+</xsl:text>
+     </xsl:for-each>
+
+     <xsl:if test="Softirq_interference">
+     <xsl:text>Softirq interference:</xsl:text>
+     <xsl:text>					</xsl:text>
+     <xsl:value-of select="Softirq_interference/latency"/>
+     <xsl:text> </xsl:text>
+     <xsl:value-of select="Softirq_interference/latency/@unit"/>
+     <xsl:text> (</xsl:text>
+     <xsl:value-of select="Softirq_interference/latency_percent"/>
+     <xsl:text> </xsl:text>
+     <xsl:value-of select="Softirq_interference/latency_percent/@unit"/>
+     <xsl:text>)
+</xsl:text>
+     </xsl:if>
+
+     <xsl:for-each select="softirq_interference">
+	     <xsl:text>			</xsl:text>
+	     <xsl:value-of select="name"/>
+	     <xsl:text>			</xsl:text>
+	     <xsl:value-of select="latency"/>
+             <xsl:text> </xsl:text>
+	     <xsl:value-of select="latency/@unit"/>
+             <xsl:text>
+</xsl:text>
+     </xsl:for-each>
+
+     <xsl:if test="IRQ_interference">
+     <xsl:text>IRQ interference:</xsl:text>
+     <xsl:text>					</xsl:text>
+     <xsl:value-of select="IRQ_interference/latency"/>
+     <xsl:text> </xsl:text>
+     <xsl:value-of select="IRQ_interference/latency/@unit"/>
+     <xsl:text> (</xsl:text>
+     <xsl:value-of select="IRQ_interference/latency_percent"/>
+     <xsl:text> </xsl:text>
+     <xsl:value-of select="IRQ_interference/latency_percent/@unit"/>
+     <xsl:text>)
+</xsl:text>
+     </xsl:if>
+
+     <xsl:for-each select="irq_interference">
+	     <xsl:text>			</xsl:text>
+	     <xsl:value-of select="name"/>
+	     <xsl:text>			</xsl:text>
+	     <xsl:value-of select="latency"/>
+             <xsl:text> </xsl:text>
+	     <xsl:value-of select="latency/@unit"/>
+             <xsl:text>
+</xsl:text>
+     </xsl:for-each>
+
+     <xsl:text>--------------------------------------------------------------------------------</xsl:text>
+     <xsl:text>Thread latency:</xsl:text>
+     <xsl:text>						</xsl:text>
+     <xsl:value-of select="Thread_latency/latency"/>
+     <xsl:text> </xsl:text>
+     <xsl:value-of select="Thread_latency/latency/@unit"/>
+     <xsl:text> (</xsl:text>
+     <xsl:value-of select="Thread_latency/latency_percent"/>
+     <xsl:text> </xsl:text>
+     <xsl:value-of select="Thread_latency/latency_percent/@unit"/>
+     <xsl:text>)
+
+</xsl:text>
+  </xsl:template>
+
+  <xsl:template match="max_timerlat_report">
+     <xsl:text>
+</xsl:text>
+     <xsl:text>Max timerlat IRQ latency from idle:</xsl:text>
+     <xsl:value-of select="Max_timerlat_IRQ_latency_from_idle"/>
+     <xsl:text> </xsl:text>
+     <xsl:value-of select="Max_timerlat_IRQ_latency_from_idle/@unit"/>
+     <xsl:text> in cpu </xsl:text>
+     <xsl:value-of select="@CPU"/>
+     <xsl:text>
+</xsl:text>
+  </xsl:template>
+
 </xsl:stylesheet>
--
2.45.2