From 5909521e06ed92ea60ffb247b25ca86c232f71bf Mon Sep 17 00:00:00 2001 From: John Kacur Date: Thu, 20 Jun 2024 21:34:24 -0400 Subject: [PATCH 1/4] rteval: timerlat: Add timerlat tracing to rteval This patch adds tracing from timerlat to rteval. These two options are added --timerlat-stoptrace USEC Stop trace if thread latency higher than USEC --timerlat-trace FILE File to save trace to This also stores the summary from rtla timerlat in the rteval xml file. Signed-off-by: John Kacur --- rteval/modules/measurement/timerlat.py | 213 ++++++++++++++++++++++++- rteval/rteval_text.xsl | 181 ++++++++++++++++++++- 2 files changed, 389 insertions(+), 5 deletions(-) diff --git a/rteval/modules/measurement/timerlat.py b/rteval/modules/measurement/timerlat.py index e8345fab1ad7..45adec1b33e1 100644 --- a/rteval/modules/measurement/timerlat.py +++ b/rteval/modules/measurement/timerlat.py @@ -16,6 +16,7 @@ from rteval.modules import rtevalModulePrototype from rteval.systopology import cpuinfo, SysTopology from rteval.cpulist_utils import expand_cpulist, collapse_cpulist + class TLRunData: ''' class to store instance data from a timerlat run ''' def __init__(self, coreid, datatype, priority, logfnc): @@ -188,6 +189,15 @@ class Timerlat(rtevalModulePrototype): self.__cpus = [str(c) for c in expand_cpulist(self.__cpulist)] self.__numcores = len(self.__cpus) + # Has tracing been triggered + self.__stoptrace = False + # This stores the output from rtla + self.__posttrace = "" + # Stop Trace Data + self.__stdata = {} + # Stop Trace Cpu + self.stcpu = -1 + self.__timerlat_out = None self.__timerlat_err = None self.__started = False @@ -218,10 +228,21 @@ class Timerlat(rtevalModulePrototype): self.__cmd = ['rtla', 'timerlat', 'hist', '-P', f'f:{int(self.__priority)}', '-u'] self.__cmd.append(f'-c{self.__cpulist}') self.__cmd.append(f'-E{self.__buckets}') + + if self.__cfg.stoptrace: + self.__cmd.append(f"-T{int(self.__cfg.stoptrace)}") + + if self.__cfg.trace: + if not self.__cfg.stoptrace: + self._log(Log.WARN, f'Ignoring trace={self.__cfg.trace}, because stoptrace not invoked') + else: + self.__cmd.append(f'-t={self.__cfg.trace}') + self._log(Log.DEBUG, f'self.__cmd = {self.__cmd}') self.__timerlat_out = tempfile.SpooledTemporaryFile(mode='w+b') self.__timerlat_err = tempfile.SpooledTemporaryFile(mode='w+b') + def _WorkloadTask(self): if self.__started: return @@ -252,6 +273,10 @@ class Timerlat(rtevalModulePrototype): os.kill(self.__timerlat_process.pid, signal.SIGINT) time.sleep(2) + blocking_thread_detected = False + softirq_interference_detected = False + irq_interference_detected = False + # Parse histogram output self.__timerlat_out.seek(0) for line in self.__timerlat_out: @@ -261,6 +286,113 @@ class Timerlat(rtevalModulePrototype): if not line: continue + # Parsing if stoptrace has been invoked + if self.__stoptrace: + self.__posttrace += line + line = line.strip() + fields = line.split() + if line.startswith("##") and fields[1] == "CPU": + self.stcpu = int(fields[2]) + self._log(Log.DEBUG, f"self.stcpu = {self.stcpu}") + self.__stdata[self.stcpu] = {} + continue + if self.stcpu == -1: + self._log(Log.WARN, "Stop trace has been invoked, but a stop cpu has not been identified.") + continue + if line.startswith('------------------'): + blocking_thread_detected = False + softirq_interference_detected = False + irq_interference_detected = False + continue + if line.startswith("Thread latency:"): + thread_latency_percent = fields[-1].strip('()%') + self._log(Log.DEBUG, f"thread_latency_percent = {thread_latency_percent}") + thread_latency = fields[-3] + self._log(Log.DEBUG, f"thread_latency = {thread_latency}") + self.__stdata[self.stcpu]["Thread_latency"] = (thread_latency, thread_latency_percent) + elif line.startswith("Previous IRQ interference"): + self._log(Log.DEBUG, f'Previous_IRQ_interference = {fields[-2]}') + self.__stdata[self.stcpu]["Previous_IRQ_interference"] = fields[-2] + elif line.startswith("IRQ handler delay:"): + irq_handler_delay_percent = fields[-2].strip('(') + irq_handler_delay = fields[-4] + # Do we have (exit from idle)? + if fields[3] == '(exit': + field_name = "IRQ_handler_delay_exit_from_idle" + else: + field_name = "IRQ_handler_delay" + self._log(Log.DEBUG, f"{field_name} = {irq_handler_delay}") + self._log(Log.DEBUG, f"{field_name}_percent = {irq_handler_delay_percent}") + self.__stdata[self.stcpu][field_name] = (irq_handler_delay, irq_handler_delay_percent) + elif line.startswith("IRQ latency:"): + self._log(Log.DEBUG, f"irq_latency = {fields[-2]}") + self.__stdata[self.stcpu]["IRQ_latency"] = fields[-2] + elif line.startswith("Timerlat IRQ duration"): + timerlat_irq_duration_percent = fields[-2].strip('(') + self._log(Log.DEBUG, f"timerlat_irq_duration_percent = {timerlat_irq_duration_percent}") + timerlat_irq_duration = fields[-4] + self._log(Log.DEBUG, f"timerlat_irq_duration = {timerlat_irq_duration}") + self.__stdata[self.stcpu]["Timerlat_IRQ_duration"] = (timerlat_irq_duration, timerlat_irq_duration_percent) + elif line.startswith("Blocking thread:"): + blocking_thread_percent = fields[-2].strip('(') + self._log(Log.DEBUG, f"blocking_thread_percent = {blocking_thread_percent}") + blocking_thread = fields[-4] + self._log(Log.DEBUG, f"blocking_thread = {blocking_thread}") + self.__stdata[self.stcpu]["Blocking_Thread"] = (blocking_thread, blocking_thread_percent) + blocking_thread_detected = True + irq_interference_detected = False + softirq_interference_detected = False + elif line.startswith("IRQ interference"): + irq_interference_percent = fields[-2].strip('(') + self._log(Log.DEBUG, f"irq_interference_percent = {irq_interference_percent}") + irq_interference = fields[-4] + self._log(Log.DEBUG, f"irq_interference = {irq_interference}") + self.__stdata[self.stcpu]["IRQ_interference"] = (irq_interference, irq_interference_percent) + blocking_thread_detected = False + irq_interference_detected = True + softirq_interference_detected = False + elif line.startswith("Softirq interference"): + softirq_interference_percent = fields[-2].strip('(') + self._log(Log.DEBUG, f"softirq_interference_percent = {softirq_interference_percent}") + softirq_interference = fields[-4] + self._log(Log.DEBUG, f"softirq_interference = {softirq_interference}") + self.__stdata[self.stcpu]["Softirq_interference"] = (softirq_interference, softirq_interference_percent) + blocking_thread_detected = False + irq_interference_detected = False + softirq_interference_detected = True + elif blocking_thread_detected: + self._log(Log.DEBUG, f'line={line}') + blocking_thread = " ".join(fields[0:-2]) + self._log(Log.DEBUG, f"blocking_thread = {blocking_thread}") + blocking_threadus = fields[-2] + self._log(Log.DEBUG, f"blocking_threadus = {blocking_threadus}") + self.__stdata[self.stcpu].setdefault("blocking_thread", []) + self.__stdata[self.stcpu]["blocking_thread"] += [(blocking_thread, blocking_threadus)] + elif softirq_interference_detected: + softirq = " ".join(fields[0:-2]) + softirq_latency = fields[-2] + self._log(Log.DEBUG, f'softirq = {softirq}') + self._log(Log.DEBUG, f'softirq_latency = {softirq_latency}') + self.__stdata[self.stcpu].setdefault("softirq_interference", []) + self.__stdata[self.stcpu]["softirq_interference"] += [(softirq, softirq_latency)] + elif irq_interference_detected: + irq_interference_name = " ".join(fields[0:-2]) + irq_interference_latency = fields[-2] + self._log(Log.DEBUG, f'irq_interference = {irq_interference_name}, latency = {irq_interference_latency}') + self.__stdata[self.stcpu].setdefault("irq_interference", []) + self.__stdata[self.stcpu]["irq_interference"] += [(irq_interference_name, irq_interference_latency)] + elif line.startswith("Max timerlat IRQ latency"): + self._log(Log.DEBUG, f"line={line}") + max_timerlat_irq_latency = fields[-5] + self._log(Log.DEBUG, f"max_timerlat_irq_latency = {max_timerlat_irq_latency}") + max_timerlat_cpu = int(fields[-1]) + self._log(Log.DEBUG, f"max_timerlat_cpu = {max_timerlat_cpu}") + self.__stdata.setdefault(max_timerlat_cpu, {}) + self.__stdata[max_timerlat_cpu]["Max_timerlat_IRQ_latency_from_idle"] = max_timerlat_irq_latency + else: + self._log(Log.DEBUG, f'line = {line}') + continue + if line.startswith('#'): if line.startswith('# Duration:'): duration = line.split()[2] @@ -285,9 +417,13 @@ class Timerlat(rtevalModulePrototype): elif line.startswith('max:'): #print(line) continue + elif line.startswith('rtla timerlat hit stop tracing'): + self.__stoptrace = True + self.__posttrace += line + continue else: - pass #print(line) + pass vals = line.split() if not vals: @@ -301,12 +437,18 @@ class Timerlat(rtevalModulePrototype): continue for i, core in enumerate(self.__cpus): - self.__timerlatdata[core].bucket(index, int(vals[i*3+1]), + # There might not be a count on every cpu if tracing invoked + if i*3 + 1 >= len(vals): + self.__timerlatdata[core].bucket(index, 0, 0, 0) + self.__timerlatdata['system'].bucket(index, 0, 0, 0) + else: + self.__timerlatdata[core].bucket(index, int(vals[i*3+1]), int(vals[i*3+2]), int(vals[i*3+3])) - self.__timerlatdata['system'].bucket(index, int(vals[i*3+1]), + self.__timerlatdata['system'].bucket(index, int(vals[i*3+1]), int(vals[i*3+2]), int(vals[i*3+3])) + # Generate statistics for each RunData object for n in list(self.__timerlatdata.keys()): self.__timerlatdata[n].reduce() @@ -320,6 +462,62 @@ class Timerlat(rtevalModulePrototype): rep_n = libxml2.newNode('timerlat') rep_n.newProp('command_line', ' '.join(self.__cmd)) + stoptrace_invoked_n = libxml2.newNode('stoptrace_invoked') + if self.stcpu != -1: + stoptrace_invoked_n.newProp("invoked", "true") + else: + stoptrace_invoked_n.newProp("invoked", "") + rep_n.addChild(stoptrace_invoked_n) + + if self.stcpu != -1: + self._log(Log.DEBUG, f'self.__stdata = {self.__stdata}') + for cpu in self.__stdata: + # This is Max timerlat IRQ latency from idle + # With no other data from that cpu, so don't create a + # stoptrace_report for this + if len(self.__stdata[cpu]) == 1: + continue + stoptrace_n = libxml2.newNode('stoptrace_report') + stoptrace_n.newProp("CPU", str(cpu)) + for k, v in self.__stdata[cpu].items(): + self._log(Log.DEBUG, f"cpu={cpu}, k={k}, v={v}") + if isinstance(v, tuple): + latency = str(v[0]) + percent = str(v[1]) + cpu_n = stoptrace_n.newTextChild(None, str(k), None) + n = cpu_n.newTextChild(None, "latency", latency) + n.newProp('unit', 'us') + + n = cpu_n.newTextChild(None, "latency_percent", percent) + n.newProp('unit', '%') + elif isinstance(v, list): + if k in ("blocking_thread", "softirq_interference", "irq_interference"): + for name, latency in v: + cpu_n = stoptrace_n.newTextChild(None, k, None) + n = cpu_n.newTextChild(None, "name", name) + n = cpu_n.newTextChild(None, "latency", latency) + n.newProp('unit', 'us') + else: + if k == "Max_timerlat_IRQ_latency_from_idle": + continue + cpu_n = stoptrace_n.newTextChild(None, str(k), str(v)) + cpu_n.newProp('unit', 'us') + rep_n.addChild(stoptrace_n) + + self._log(Log.DEBUG, f'timerlat: posttrace = \n{self.__posttrace}') + self._log(Log.DEBUG, 'timerlat: posttrace END') + for cpu in self.__stdata: + for k, v in self.__stdata[cpu].items(): + if isinstance(v, tuple): + continue + if k == "Max_timerlat_IRQ_latency_from_idle": + max_timerlat_n = libxml2.newNode('max_timerlat_report') + max_timerlat_n.newProp("CPU", str(cpu)) + cpu_n = max_timerlat_n.newTextChild(None, k, str(v)) + cpu_n.newProp('unit', 'us') + rep_n.addChild(max_timerlat_n) + return rep_n + rep_n.addChild(self.__timerlatdata['system'].MakeReport()) for thr in self.__cpus: if str(thr) not in self.__timerlatdata: @@ -341,7 +539,13 @@ def ModuleParameters(): "metavar": "PRIO" }, "buckets": {"descr": "Number of buckets", "default": 3500, - "metavar": "NUM"} + "metavar": "NUM" }, + "stoptrace": {"descr": "Stop trace if thread latency higher than USEC", + "default": None, + "metavar": "USEC" }, + "trace": {"descr": "File to save trace to", + "default": None, + "metavar": "FILE" }, } def create(params, logger): @@ -367,6 +571,7 @@ if __name__ == '__main__': cfg_tl = cfg.GetSection('timerlat') cfg_tl.cpulist = collapse_cpulist(SysTopology().online_cpus()) + cfg_tl.stoptrace=50 RUNTIME = 10 diff --git a/rteval/rteval_text.xsl b/rteval/rteval_text.xsl index 2f03bda0bb55..70777354efa5 100644 --- a/rteval/rteval_text.xsl +++ b/rteval/rteval_text.xsl @@ -316,14 +316,26 @@ - + + rtla timerlat hit stop tracing + + + + + + + System: + + + Statistics: + @@ -470,4 +482,171 @@ + + + ## CPU + + hit stop tracing, analyzing it ## + + + + + + Previous IRQ interference: up to + + + + + + + + + IRQ handler delay: + + + + ( + + + + ) + + + + + IRQ handler delay: (exit from idle) + + + + ( + + + + ) + + + + IRQ latency: + + + + + + + + Timerlat IRQ duration: + + + + + ( + + + + ) + + + + Blocking thread: + + + + + ( + + + + ) + + + + + + + + + + + + + + + + Softirq interference: + + + + + ( + + + + ) + + + + + + + + + + + + + + + + IRQ interference: + + + + + ( + + + + ) + + + + + + + + + + + + + + + -------------------------------------------------------------------------------- + Thread latency: + + + + + ( + + + + ) + + + + + + + + Max timerlat IRQ latency from idle: + + + + in cpu + + + + + -- 2.45.2