51663f5738
* pmdabcc: resolve compilation issues of some bcc PMDA modules on aarch64, ppc64le and s390x * pmdabpf: support arm and powerpc architectures Resolves: rhbz#2024982 Resolves: rhbz#2024980
1434 lines
50 KiB
Diff
1434 lines
50 KiB
Diff
commit 14ffcd934e1c5099b471f4e73da32d1b32bac7e6
|
|
Author: Andreas Gerstmayr <agerstmayr@redhat.com>
|
|
Date: Mon Dec 13 20:10:40 2021 +0100
|
|
|
|
pmdabcc: sync bcc PMDA modules with upstream bcc tools
|
|
|
|
diff --git a/src/pmdas/bcc/modules/execsnoop.bpf b/src/pmdas/bcc/modules/execsnoop.bpf
|
|
index f69200773..aa755b3a1 100644
|
|
--- a/src/pmdas/bcc/modules/execsnoop.bpf
|
|
+++ b/src/pmdas/bcc/modules/execsnoop.bpf
|
|
@@ -4,40 +4,57 @@
|
|
#include <uapi/linux/ptrace.h>
|
|
#include <linux/sched.h>
|
|
#include <linux/fs.h>
|
|
+
|
|
#define ARGSIZE 128
|
|
+
|
|
enum event_type {
|
|
EVENT_ARG,
|
|
EVENT_RET,
|
|
};
|
|
+
|
|
struct data_t {
|
|
u32 pid; // PID as in the userspace term (i.e. task->tgid in kernel)
|
|
u32 ppid; // Parent PID as in the userspace term (i.e task->real_parent->tgid in kernel)
|
|
+ u32 uid;
|
|
char comm[TASK_COMM_LEN];
|
|
enum event_type type;
|
|
char argv[ARGSIZE];
|
|
int retval;
|
|
};
|
|
+
|
|
BPF_PERF_OUTPUT(events);
|
|
+
|
|
static int __submit_arg(struct pt_regs *ctx, void *ptr, struct data_t *data)
|
|
{
|
|
- bpf_probe_read(data->argv, sizeof(data->argv), ptr);
|
|
+ bpf_probe_read_user(data->argv, sizeof(data->argv), ptr);
|
|
events.perf_submit(ctx, data, sizeof(struct data_t));
|
|
return 1;
|
|
}
|
|
+
|
|
static int submit_arg(struct pt_regs *ctx, void *ptr, struct data_t *data)
|
|
{
|
|
const char *argp = NULL;
|
|
- bpf_probe_read(&argp, sizeof(argp), ptr);
|
|
+ bpf_probe_read_user(&argp, sizeof(argp), ptr);
|
|
if (argp) {
|
|
return __submit_arg(ctx, (void *)(argp), data);
|
|
}
|
|
return 0;
|
|
}
|
|
+
|
|
int syscall__execve(struct pt_regs *ctx,
|
|
const char __user *filename,
|
|
const char __user *const __user *__argv,
|
|
const char __user *const __user *__envp)
|
|
{
|
|
+
|
|
+ u32 uid = bpf_get_current_uid_gid() & 0xffffffff;
|
|
+
|
|
+ UID_FILTER
|
|
+
|
|
+ if (container_should_be_filtered()) {
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
// create data here and pass to submit_arg to save stack space (#555)
|
|
struct data_t data = {};
|
|
struct task_struct *task;
|
|
@@ -52,25 +69,37 @@ int syscall__execve(struct pt_regs *ctx,
|
|
|
|
bpf_get_current_comm(&data.comm, sizeof(data.comm));
|
|
data.type = EVENT_ARG;
|
|
+
|
|
__submit_arg(ctx, (void *)filename, &data);
|
|
+
|
|
// skip first arg, as we submitted filename
|
|
#pragma unroll
|
|
for (int i = 1; i < MAXARG; i++) {
|
|
if (submit_arg(ctx, (void *)&__argv[i], &data) == 0)
|
|
goto out;
|
|
}
|
|
+
|
|
// handle truncated argument list
|
|
char ellipsis[] = "...";
|
|
__submit_arg(ctx, (void *)ellipsis, &data);
|
|
out:
|
|
return 0;
|
|
}
|
|
+
|
|
int do_ret_sys_execve(struct pt_regs *ctx)
|
|
{
|
|
+ if (container_should_be_filtered()) {
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
struct data_t data = {};
|
|
struct task_struct *task;
|
|
|
|
+ u32 uid = bpf_get_current_uid_gid() & 0xffffffff;
|
|
+ UID_FILTER
|
|
+
|
|
data.pid = bpf_get_current_pid_tgid() >> 32;
|
|
+ data.uid = uid;
|
|
|
|
task = (struct task_struct *)bpf_get_current_task();
|
|
// Some kernels, like Ubuntu 4.13.0-generic, return 0
|
|
@@ -82,5 +111,6 @@ int do_ret_sys_execve(struct pt_regs *ctx)
|
|
data.type = EVENT_RET;
|
|
data.retval = PT_REGS_RC(ctx);
|
|
events.perf_submit(ctx, &data, sizeof(data));
|
|
+
|
|
return 0;
|
|
}
|
|
diff --git a/src/pmdas/bcc/modules/execsnoop.python b/src/pmdas/bcc/modules/execsnoop.python
|
|
index 54382fa9b..1127cc471 100644
|
|
--- a/src/pmdas/bcc/modules/execsnoop.python
|
|
+++ b/src/pmdas/bcc/modules/execsnoop.python
|
|
@@ -44,20 +44,6 @@ MODULE = 'execsnoop'
|
|
BASENS = 'proc.exec.'
|
|
units_none = pmUnits(0, 0, 0, 0, 0, 0)
|
|
|
|
-TASK_COMM_LEN = 16 # linux/sched.h
|
|
-ARGSIZE = 128 # should match #define in execsnoop.bpf
|
|
-
|
|
-class Data(ct.Structure):
|
|
- """ execsnoop data struct """
|
|
- _fields_ = [
|
|
- ("pid", ct.c_uint),
|
|
- ("ppid", ct.c_uint),
|
|
- ("comm", ct.c_char * TASK_COMM_LEN),
|
|
- ("type", ct.c_int),
|
|
- ("argv", ct.c_char * ARGSIZE),
|
|
- ("retval", ct.c_int),
|
|
- ]
|
|
-
|
|
class EventType(object):
|
|
""" Event type """
|
|
EVENT_ARG = 0
|
|
@@ -137,7 +123,7 @@ class PCPBCCModule(PCPBCCBase):
|
|
|
|
def handle_event(self, _cpu, data, _size):
|
|
""" Event handler """
|
|
- event = ct.cast(data, ct.POINTER(Data)).contents
|
|
+ event = self.bpf["events"].event(data)
|
|
skip = False
|
|
|
|
if event.type == EventType.EVENT_ARG:
|
|
@@ -145,9 +131,9 @@ class PCPBCCModule(PCPBCCBase):
|
|
elif event.type == EventType.EVENT_RET:
|
|
if event.retval != 0 and not self.include_failed:
|
|
skip = True
|
|
- if self.command and not re.search(self.command, event.comm):
|
|
+ if self.command and not re.search(bytes(self.command), event.comm):
|
|
skip = True
|
|
- if self.args and not re.search(self.args, b" ".join(self.argv_cache[event.pid])):
|
|
+ if self.args and not re.search(bytes(self.args), b" ".join(self.argv_cache[event.pid])):
|
|
skip = True
|
|
|
|
if not skip:
|
|
@@ -177,10 +163,14 @@ class PCPBCCModule(PCPBCCBase):
|
|
|
|
self.bpf_text = self.bpf_text.replace("MAXARG", str(self.max_args))
|
|
|
|
+ bpf_text = self.bpf_text
|
|
+ bpf_text = bpf_text.replace('UID_FILTER', '')
|
|
+ bpf_text = bpf_text.replace('container_should_be_filtered()', '0')
|
|
+
|
|
if self.debug:
|
|
- self.log("BPF to be compiled:\n" + self.bpf_text.strip())
|
|
+ self.log("BPF to be compiled:\n" + bpf_text.strip())
|
|
|
|
- self.bpf = BPF(text=self.bpf_text)
|
|
+ self.bpf = BPF(text=bpf_text)
|
|
execve_fnname = self.get_syscall_fnname("execve")
|
|
self.bpf.attach_kprobe(event=execve_fnname, fn_name="syscall__execve")
|
|
self.bpf.attach_kretprobe(event=execve_fnname, fn_name="do_ret_sys_execve")
|
|
diff --git a/src/pmdas/bcc/modules/pcpbcc.python b/src/pmdas/bcc/modules/pcpbcc.python
|
|
index 0555dc33f..62783b7fc 100644
|
|
--- a/src/pmdas/bcc/modules/pcpbcc.python
|
|
+++ b/src/pmdas/bcc/modules/pcpbcc.python
|
|
@@ -14,6 +14,7 @@
|
|
""" PCP BCC PMDA module base class """
|
|
|
|
import re
|
|
+import platform
|
|
import ctypes as ct
|
|
from os import kill, listdir, path
|
|
from collections import OrderedDict
|
|
@@ -348,6 +349,16 @@ class PCPBCCBase(object):
|
|
""" Returns BCC version as an int tuple (for comparisons) """
|
|
return tuple(map(int, PCPBCCBase.bcc_version().split('.')))
|
|
|
|
+ @staticmethod
|
|
+ def kernel_version():
|
|
+ """Returns the kernel version"""
|
|
+ version_str = platform.release()
|
|
+ m = re.match(r'^(\d+)\.(\d+)\.(\d+)', version_str)
|
|
+ if m:
|
|
+ return tuple(map(int, m.groups()))
|
|
+ else:
|
|
+ return (0, 0, 0)
|
|
+
|
|
def perf_buffer_poller(self):
|
|
""" BPF poller """
|
|
try:
|
|
diff --git a/src/pmdas/bcc/modules/runqlat.python b/src/pmdas/bcc/modules/runqlat.python
|
|
index 27007c7e5..1c6c6b4b0 100644
|
|
--- a/src/pmdas/bcc/modules/runqlat.python
|
|
+++ b/src/pmdas/bcc/modules/runqlat.python
|
|
@@ -30,7 +30,11 @@ from modules.pcpbcc import PCPBCCBase
|
|
#
|
|
# BPF program
|
|
#
|
|
-bpf_src = "modules/runqlat.bpf"
|
|
+is_support_raw_tp = BPF.support_raw_tracepoint()
|
|
+if is_support_raw_tp:
|
|
+ bpf_src = "modules/runqlat_tp.bpf"
|
|
+else:
|
|
+ bpf_src = "modules/runqlat_kp.bpf"
|
|
|
|
#
|
|
# PCP BCC PMDA constants
|
|
@@ -59,6 +63,7 @@ class PCPBCCModule(PCPBCCBase):
|
|
self.proc_filter = self.config.get(MODULE, opt)
|
|
self.update_pids(self.get_proc_info(self.proc_filter))
|
|
|
|
+ self.log("Using BPF source file %s." % bpf_src)
|
|
self.log("Initialized.")
|
|
|
|
def metrics(self):
|
|
@@ -89,7 +94,23 @@ class PCPBCCModule(PCPBCCBase):
|
|
with open(path.dirname(__file__) + '/../' + bpf_src) as src:
|
|
self.bpf_text = src.read()
|
|
|
|
+ # BPF.kernel_struct_has_field requires BCC v0.23.0
|
|
+ # use kernel version check as alternative
|
|
+ # pylint: disable=no-member
|
|
+ if (
|
|
+ hasattr(BPF, "kernel_struct_has_field")
|
|
+ and BPF.kernel_struct_has_field(b"task_struct", b"__state") == 1
|
|
+ ) or self.kernel_version() >= (5, 14, 0):
|
|
+ self.bpf_text = self.bpf_text.replace('STATE_FIELD', '__state')
|
|
+ else:
|
|
+ self.bpf_text = self.bpf_text.replace('STATE_FIELD', 'state')
|
|
+
|
|
self.bpf_text = self.bpf_text.replace("FILTER", "PID_CHECK")
|
|
+ self.bpf_text = self.bpf_text.replace('FACTOR', 'delta /= 1000;')
|
|
+
|
|
+ self.bpf_text = self.bpf_text.replace('STORAGE', 'BPF_HISTOGRAM(dist);')
|
|
+ self.bpf_text = self.bpf_text.replace('STORE',
|
|
+ 'dist.increment(bpf_log2l(delta));')
|
|
|
|
if not self.pids and self.proc_filter and self.proc_refresh:
|
|
self.log("No process to attach found, activation postponed.")
|
|
@@ -102,9 +123,11 @@ class PCPBCCModule(PCPBCCBase):
|
|
|
|
self.reset_cache()
|
|
self.bpf = BPF(text=bpf_text)
|
|
- self.bpf.attach_kprobe(event="ttwu_do_wakeup", fn_name="trace_ttwu_do_wakeup")
|
|
- self.bpf.attach_kprobe(event="wake_up_new_task", fn_name="trace_wake_up_new_task")
|
|
- self.bpf.attach_kprobe(event_re=r"^finish_task_switch$|^finish_task_switch\.isra\.\d$", fn_name="trace_run")
|
|
+ if not is_support_raw_tp:
|
|
+ self.bpf.attach_kprobe(event="ttwu_do_wakeup", fn_name="trace_ttwu_do_wakeup")
|
|
+ self.bpf.attach_kprobe(event="wake_up_new_task", fn_name="trace_wake_up_new_task")
|
|
+ self.bpf.attach_kprobe(event_re=r"^finish_task_switch$|^finish_task_switch\.isra\.\d$",
|
|
+ fn_name="trace_run")
|
|
self.log("Compiled.")
|
|
except Exception as error: # pylint: disable=broad-except
|
|
self.bpf = None
|
|
diff --git a/src/pmdas/bcc/modules/runqlat.bpf b/src/pmdas/bcc/modules/runqlat_kp.bpf
|
|
similarity index 54%
|
|
rename from src/pmdas/bcc/modules/runqlat.bpf
|
|
rename to src/pmdas/bcc/modules/runqlat_kp.bpf
|
|
index a3664a035..dd643d600 100644
|
|
--- a/src/pmdas/bcc/modules/runqlat.bpf
|
|
+++ b/src/pmdas/bcc/modules/runqlat_kp.bpf
|
|
@@ -5,6 +5,7 @@
|
|
#include <linux/sched.h>
|
|
#include <linux/nsproxy.h>
|
|
#include <linux/pid_namespace.h>
|
|
+#include <linux/init_task.h>
|
|
|
|
typedef struct pid_key {
|
|
u64 id; // work around
|
|
@@ -17,7 +18,7 @@ typedef struct pidns_key {
|
|
} pidns_key_t;
|
|
|
|
BPF_HASH(start, u32);
|
|
-BPF_HISTOGRAM(dist);
|
|
+STORAGE
|
|
|
|
struct rq;
|
|
|
|
@@ -31,6 +32,45 @@ static int trace_enqueue(u32 tgid, u32 pid)
|
|
return 0;
|
|
}
|
|
|
|
+static __always_inline unsigned int pid_namespace(struct task_struct *task)
|
|
+{
|
|
+
|
|
+/* pids[] was removed from task_struct since commit 2c4704756cab7cfa031ada4dab361562f0e357c0
|
|
+ * Using the macro INIT_PID_LINK as a conditional judgment.
|
|
+ */
|
|
+#ifdef INIT_PID_LINK
|
|
+ struct pid_link pids;
|
|
+ unsigned int level;
|
|
+ struct upid upid;
|
|
+ struct ns_common ns;
|
|
+
|
|
+ /* get the pid namespace by following task_active_pid_ns(),
|
|
+ * pid->numbers[pid->level].ns
|
|
+ */
|
|
+ bpf_probe_read_kernel(&pids, sizeof(pids), &task->pids[PIDTYPE_PID]);
|
|
+ bpf_probe_read_kernel(&level, sizeof(level), &pids.pid->level);
|
|
+ bpf_probe_read_kernel(&upid, sizeof(upid), &pids.pid->numbers[level]);
|
|
+ bpf_probe_read_kernel(&ns, sizeof(ns), &upid.ns->ns);
|
|
+
|
|
+ return ns.inum;
|
|
+#else
|
|
+ struct pid *pid;
|
|
+ unsigned int level;
|
|
+ struct upid upid;
|
|
+ struct ns_common ns;
|
|
+
|
|
+ /* get the pid namespace by following task_active_pid_ns(),
|
|
+ * pid->numbers[pid->level].ns
|
|
+ */
|
|
+ bpf_probe_read_kernel(&pid, sizeof(pid), &task->thread_pid);
|
|
+ bpf_probe_read_kernel(&level, sizeof(level), &pid->level);
|
|
+ bpf_probe_read_kernel(&upid, sizeof(upid), &pid->numbers[level]);
|
|
+ bpf_probe_read_kernel(&ns, sizeof(ns), &upid.ns->ns);
|
|
+
|
|
+ return ns.inum;
|
|
+#endif
|
|
+}
|
|
+
|
|
int trace_wake_up_new_task(struct pt_regs *ctx, struct task_struct *p)
|
|
{
|
|
return trace_enqueue(p->tgid, p->pid);
|
|
@@ -48,7 +88,7 @@ int trace_run(struct pt_regs *ctx, struct task_struct *prev)
|
|
u32 pid, tgid;
|
|
|
|
// ivcsw: treat like an enqueue event and store timestamp
|
|
- if (prev->state == TASK_RUNNING) {
|
|
+ if (prev->STATE_FIELD == TASK_RUNNING) {
|
|
tgid = prev->tgid;
|
|
pid = prev->pid;
|
|
if (!(FILTER || pid == 0)) {
|
|
@@ -69,10 +109,10 @@ int trace_run(struct pt_regs *ctx, struct task_struct *prev)
|
|
return 0; // missed enqueue
|
|
}
|
|
delta = bpf_ktime_get_ns() - *tsp;
|
|
- delta /= 1000;
|
|
+ FACTOR
|
|
|
|
// store as histogram
|
|
- dist.increment(bpf_log2l(delta));
|
|
+ STORE
|
|
|
|
start.delete(&pid);
|
|
return 0;
|
|
diff --git a/src/pmdas/bcc/modules/runqlat_tp.bpf b/src/pmdas/bcc/modules/runqlat_tp.bpf
|
|
new file mode 100644
|
|
index 000000000..f0e9ce69b
|
|
--- /dev/null
|
|
+++ b/src/pmdas/bcc/modules/runqlat_tp.bpf
|
|
@@ -0,0 +1,124 @@
|
|
+// Copyright 2016 Netflix, Inc.
|
|
+// Licensed under the Apache License, Version 2.0 (the "License")
|
|
+
|
|
+#include <uapi/linux/ptrace.h>
|
|
+#include <linux/sched.h>
|
|
+#include <linux/nsproxy.h>
|
|
+#include <linux/pid_namespace.h>
|
|
+#include <linux/init_task.h>
|
|
+
|
|
+typedef struct pid_key {
|
|
+ u64 id; // work around
|
|
+ u64 slot;
|
|
+} pid_key_t;
|
|
+
|
|
+typedef struct pidns_key {
|
|
+ u64 id; // work around
|
|
+ u64 slot;
|
|
+} pidns_key_t;
|
|
+
|
|
+BPF_HASH(start, u32);
|
|
+STORAGE
|
|
+
|
|
+struct rq;
|
|
+
|
|
+// record enqueue timestamp
|
|
+static int trace_enqueue(u32 tgid, u32 pid)
|
|
+{
|
|
+ if (FILTER || pid == 0)
|
|
+ return 0;
|
|
+ u64 ts = bpf_ktime_get_ns();
|
|
+ start.update(&pid, &ts);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static __always_inline unsigned int pid_namespace(struct task_struct *task)
|
|
+{
|
|
+
|
|
+/* pids[] was removed from task_struct since commit 2c4704756cab7cfa031ada4dab361562f0e357c0
|
|
+ * Using the macro INIT_PID_LINK as a conditional judgment.
|
|
+ */
|
|
+#ifdef INIT_PID_LINK
|
|
+ struct pid_link pids;
|
|
+ unsigned int level;
|
|
+ struct upid upid;
|
|
+ struct ns_common ns;
|
|
+
|
|
+ /* get the pid namespace by following task_active_pid_ns(),
|
|
+ * pid->numbers[pid->level].ns
|
|
+ */
|
|
+ bpf_probe_read_kernel(&pids, sizeof(pids), &task->pids[PIDTYPE_PID]);
|
|
+ bpf_probe_read_kernel(&level, sizeof(level), &pids.pid->level);
|
|
+ bpf_probe_read_kernel(&upid, sizeof(upid), &pids.pid->numbers[level]);
|
|
+ bpf_probe_read_kernel(&ns, sizeof(ns), &upid.ns->ns);
|
|
+
|
|
+ return ns.inum;
|
|
+#else
|
|
+ struct pid *pid;
|
|
+ unsigned int level;
|
|
+ struct upid upid;
|
|
+ struct ns_common ns;
|
|
+
|
|
+ /* get the pid namespace by following task_active_pid_ns(),
|
|
+ * pid->numbers[pid->level].ns
|
|
+ */
|
|
+ bpf_probe_read_kernel(&pid, sizeof(pid), &task->thread_pid);
|
|
+ bpf_probe_read_kernel(&level, sizeof(level), &pid->level);
|
|
+ bpf_probe_read_kernel(&upid, sizeof(upid), &pid->numbers[level]);
|
|
+ bpf_probe_read_kernel(&ns, sizeof(ns), &upid.ns->ns);
|
|
+
|
|
+ return ns.inum;
|
|
+#endif
|
|
+}
|
|
+
|
|
+RAW_TRACEPOINT_PROBE(sched_wakeup)
|
|
+{
|
|
+ // TP_PROTO(struct task_struct *p)
|
|
+ struct task_struct *p = (struct task_struct *)ctx->args[0];
|
|
+ return trace_enqueue(p->tgid, p->pid);
|
|
+}
|
|
+
|
|
+RAW_TRACEPOINT_PROBE(sched_wakeup_new)
|
|
+{
|
|
+ // TP_PROTO(struct task_struct *p)
|
|
+ struct task_struct *p = (struct task_struct *)ctx->args[0];
|
|
+ return trace_enqueue(p->tgid, p->pid);
|
|
+}
|
|
+
|
|
+RAW_TRACEPOINT_PROBE(sched_switch)
|
|
+{
|
|
+ // TP_PROTO(bool preempt, struct task_struct *prev, struct task_struct *next)
|
|
+ struct task_struct *prev = (struct task_struct *)ctx->args[1];
|
|
+ struct task_struct *next = (struct task_struct *)ctx->args[2];
|
|
+ u32 pid, tgid;
|
|
+
|
|
+ // ivcsw: treat like an enqueue event and store timestamp
|
|
+ if (prev->STATE_FIELD == TASK_RUNNING) {
|
|
+ tgid = prev->tgid;
|
|
+ pid = prev->pid;
|
|
+ if (!(FILTER || pid == 0)) {
|
|
+ u64 ts = bpf_ktime_get_ns();
|
|
+ start.update(&pid, &ts);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ tgid = next->tgid;
|
|
+ pid = next->pid;
|
|
+ if (FILTER || pid == 0)
|
|
+ return 0;
|
|
+ u64 *tsp, delta;
|
|
+
|
|
+ // fetch timestamp and calculate delta
|
|
+ tsp = start.lookup(&pid);
|
|
+ if (tsp == 0) {
|
|
+ return 0; // missed enqueue
|
|
+ }
|
|
+ delta = bpf_ktime_get_ns() - *tsp;
|
|
+ FACTOR
|
|
+
|
|
+ // store as histogram
|
|
+ STORE
|
|
+
|
|
+ start.delete(&pid);
|
|
+ return 0;
|
|
+}
|
|
diff --git a/src/pmdas/bcc/modules/tcplife.python b/src/pmdas/bcc/modules/tcplife.python
|
|
index 0c6f17c36..02c693a6a 100644
|
|
--- a/src/pmdas/bcc/modules/tcplife.python
|
|
+++ b/src/pmdas/bcc/modules/tcplife.python
|
|
@@ -37,16 +37,11 @@ from modules.pcpbcc import PCPBCCBase
|
|
#
|
|
# BPF program
|
|
#
|
|
-bpf_src = "modules/tcplife.bpf"
|
|
-# Compat with kernel < 4.16, bcc < 0.6
|
|
-TRACEFS = "/sys/kernel/debug/tracing"
|
|
-bpf_src_old_tp = "modules/tcplife_old_tp.bpf"
|
|
-bpf_src_old_kb = "modules/tcplife_old_kb.bpf"
|
|
-if not path.exists(TRACEFS + "/events/sock/inet_sock_set_state"):
|
|
- if path.exists(TRACEFS + "/events/tcp/tcp_set_state"):
|
|
- bpf_src = bpf_src_old_tp
|
|
- else:
|
|
- bpf_src = bpf_src_old_kb
|
|
+if BPF.tracepoint_exists("sock", "inet_sock_set_state"):
|
|
+ bpf_src = "modules/tcplife_tp.bpf"
|
|
+else:
|
|
+ bpf_src = "modules/tcplife_kp.bpf"
|
|
+
|
|
|
|
#
|
|
# PCP BCC PMDA constants
|
|
@@ -57,35 +52,6 @@ units_bytes = pmUnits(1, 0, 0, PM_SPACE_BYTE, 0, 0)
|
|
units_usecs = pmUnits(0, 1, 0, 0, PM_TIME_USEC, 0)
|
|
units_none = pmUnits(0, 0, 0, 0, 0, 0)
|
|
|
|
-TASK_COMM_LEN = 16 # linux/sched.h
|
|
-
|
|
-class Data_ipv4(ct.Structure):
|
|
- """ IPv4 data struct """
|
|
- _fields_ = [
|
|
- ("ts_us", ct.c_ulonglong),
|
|
- ("pid", ct.c_ulonglong),
|
|
- ("saddr", ct.c_ulonglong),
|
|
- ("daddr", ct.c_ulonglong),
|
|
- ("ports", ct.c_ulonglong),
|
|
- ("rx_b", ct.c_ulonglong),
|
|
- ("tx_b", ct.c_ulonglong),
|
|
- ("span_us", ct.c_ulonglong),
|
|
- ("task", ct.c_char * TASK_COMM_LEN)
|
|
- ]
|
|
-
|
|
-class Data_ipv6(ct.Structure):
|
|
- """ IPv6 data struct """
|
|
- _fields_ = [
|
|
- ("ts_us", ct.c_ulonglong),
|
|
- ("pid", ct.c_ulonglong),
|
|
- ("saddr", (ct.c_ulonglong * 2)),
|
|
- ("daddr", (ct.c_ulonglong * 2)),
|
|
- ("ports", ct.c_ulonglong),
|
|
- ("rx_b", ct.c_ulonglong),
|
|
- ("tx_b", ct.c_ulonglong),
|
|
- ("span_us", ct.c_ulonglong),
|
|
- ("task", ct.c_char * TASK_COMM_LEN)
|
|
- ]
|
|
|
|
#
|
|
# PCP BCC Module
|
|
@@ -129,24 +95,22 @@ class PCPBCCModule(PCPBCCBase):
|
|
self.lock = Lock()
|
|
self.thread = None
|
|
|
|
- # Compat with kernel < 4.16
|
|
self.log("Using BPF source file %s." % bpf_src)
|
|
|
|
# Exit hard if impossible to continue
|
|
- if self.bcc_version() == "0.6.1" and bpf_src == bpf_src_old_kb:
|
|
- raise RuntimeError("BCC 0.6.1 bug makes it incompatible with this module "
|
|
- "on kernel < 4.15.")
|
|
+ if self.bcc_version_tuple() < (0, 6, 1):
|
|
+ raise RuntimeError("BCC 0.6.1+ is required for this module.")
|
|
|
|
self.log("Initialized.")
|
|
|
|
def handle_ip_event(self, data, version):
|
|
""" IP event handler """
|
|
if version == 4:
|
|
- event = ct.cast(data, ct.POINTER(Data_ipv4)).contents
|
|
+ event = self.bpf["ipv4_events"].event(data)
|
|
laddr = inet_ntop(AF_INET, pack("I", event.saddr))
|
|
daddr = inet_ntop(AF_INET, pack("I", event.daddr))
|
|
else:
|
|
- event = ct.cast(data, ct.POINTER(Data_ipv6)).contents
|
|
+ event = self.bpf["ipv6_events"].event(data)
|
|
laddr = inet_ntop(AF_INET6, event.saddr)
|
|
daddr = inet_ntop(AF_INET6, event.daddr)
|
|
|
|
@@ -205,31 +169,25 @@ class PCPBCCModule(PCPBCCBase):
|
|
if not self.bpf_text:
|
|
with open(path.dirname(__file__) + '/../' + bpf_src) as src:
|
|
self.bpf_text = src.read()
|
|
- # Compat with bcc < 0.6
|
|
- self.log("Testing BCC compatilibility, possible errors below are safe to ignore.")
|
|
- try:
|
|
- test_txt = self.bpf_text.replace("// NEW: ", "").replace("FILTER_PID", "")
|
|
- test_bpf = BPF(text=test_txt)
|
|
- test_bpf.cleanup()
|
|
- self.bpf_text = self.bpf_text.replace("// NEW: ", "")
|
|
- except Exception: # pylint: disable=broad-except
|
|
- self.bpf_text = self.bpf_text.replace("// OLD: ", "")
|
|
- self.log("Tested BCC compatilibility, possible errors above are safe to ignore.")
|
|
|
|
if self.dports:
|
|
filterp = " && ".join(["dport != %d" % port for port in self.dports])
|
|
filter_txt = "if (%s) { birth.delete(&sk); return 0; }" % filterp
|
|
- self.bpf_text = self.bpf_text.replace("//FILTER_DPORT", filter_txt)
|
|
+ self.bpf_text = self.bpf_text.replace("FILTER_DPORT", filter_txt)
|
|
if self.lports:
|
|
filterp = " && ".join(["lport != %d" % port for port in self.lports])
|
|
filter_txt = "if (%s) { birth.delete(&sk); return 0; }" % filterp
|
|
- self.bpf_text = self.bpf_text.replace("//FILTER_LPORT", filter_txt)
|
|
+ self.bpf_text = self.bpf_text.replace("FILTER_LPORT", filter_txt)
|
|
|
|
if not self.pids and self.proc_filter and self.proc_refresh:
|
|
self.log("No process to attach found, activation postponed.")
|
|
return
|
|
|
|
bpf_text = self.apply_pid_filter(self.bpf_text, self.pids, False)
|
|
+ bpf_text = bpf_text.replace('FILTER_PID', '')
|
|
+ bpf_text = bpf_text.replace('FILTER_DPORT', '')
|
|
+ bpf_text = bpf_text.replace('FILTER_LPORT', '')
|
|
+ bpf_text = bpf_text.replace('FILTER_FAMILY', '')
|
|
|
|
if self.debug:
|
|
self.log("BPF to be compiled:\n" + bpf_text.strip())
|
|
diff --git a/src/pmdas/bcc/modules/tcplife_old_kb.bpf b/src/pmdas/bcc/modules/tcplife_kp.bpf
|
|
similarity index 81%
|
|
rename from src/pmdas/bcc/modules/tcplife_old_kb.bpf
|
|
rename to src/pmdas/bcc/modules/tcplife_kp.bpf
|
|
index eed01941a..5486c6a37 100644
|
|
--- a/src/pmdas/bcc/modules/tcplife_old_kb.bpf
|
|
+++ b/src/pmdas/bcc/modules/tcplife_kp.bpf
|
|
@@ -2,7 +2,6 @@
|
|
// Licensed under the Apache License, Version 2.0 (the "License")
|
|
|
|
#include <uapi/linux/ptrace.h>
|
|
-#define KBUILD_MODNAME "pcpbcctcplife"
|
|
#include <linux/tcp.h>
|
|
#include <net/sock.h>
|
|
#include <bcc/proto.h>
|
|
@@ -11,11 +10,10 @@ BPF_HASH(birth, struct sock *, u64);
|
|
|
|
// separate data structs for ipv4 and ipv6
|
|
struct ipv4_data_t {
|
|
- // XXX: switch some to u32's when supported
|
|
u64 ts_us;
|
|
- u64 pid;
|
|
- u64 saddr;
|
|
- u64 daddr;
|
|
+ u32 pid;
|
|
+ u32 saddr;
|
|
+ u32 daddr;
|
|
u64 ports;
|
|
u64 rx_b;
|
|
u64 tx_b;
|
|
@@ -26,7 +24,7 @@ BPF_PERF_OUTPUT(ipv4_events);
|
|
|
|
struct ipv6_data_t {
|
|
u64 ts_us;
|
|
- u64 pid;
|
|
+ u32 pid;
|
|
unsigned __int128 saddr;
|
|
unsigned __int128 daddr;
|
|
u64 ports;
|
|
@@ -49,12 +47,12 @@ int kprobe__tcp_set_state(struct pt_regs *ctx, struct sock *sk, int state)
|
|
|
|
// lport is either used in a filter here, or later
|
|
u16 lport = sk->__sk_common.skc_num;
|
|
- //FILTER_LPORT
|
|
+ FILTER_LPORT
|
|
|
|
// dport is either used in a filter here, or later
|
|
u16 dport = sk->__sk_common.skc_dport;
|
|
dport = ntohs(dport);
|
|
- //FILTER_DPORT
|
|
+ FILTER_DPORT
|
|
|
|
/*
|
|
* This tool includes PID and comm context. It's best effort, and may
|
|
@@ -74,6 +72,9 @@ int kprobe__tcp_set_state(struct pt_regs *ctx, struct sock *sk, int state)
|
|
* sets ESTABLISHED without a tcp_set_state() call. Until we know
|
|
* that for sure, match all early states to increase chances a
|
|
* timestamp is set.
|
|
+ * Note that this needs to be set before the PID filter later on,
|
|
+ * since the PID isn't reliable for these early stages, so we must
|
|
+ * save all timestamps and do the PID filter later when we can.
|
|
*/
|
|
u64 ts = bpf_ktime_get_ns();
|
|
birth.update(&sk, &ts);
|
|
@@ -101,7 +102,7 @@ int kprobe__tcp_set_state(struct pt_regs *ctx, struct sock *sk, int state)
|
|
delta_us = (bpf_ktime_get_ns() - *tsp) / 1000;
|
|
birth.delete(&sk);
|
|
|
|
- // fetch possible cached data
|
|
+ // fetch possible cached data, and filter
|
|
struct id_t *mep;
|
|
mep = whoami.lookup(&sk);
|
|
if (mep != 0)
|
|
@@ -116,9 +117,13 @@ int kprobe__tcp_set_state(struct pt_regs *ctx, struct sock *sk, int state)
|
|
|
|
u16 family = sk->__sk_common.skc_family;
|
|
|
|
+ FILTER_FAMILY
|
|
+
|
|
if (family == AF_INET) {
|
|
- struct ipv4_data_t data4 = {.span_us = delta_us,
|
|
- .rx_b = rx_b, .tx_b = tx_b};
|
|
+ struct ipv4_data_t data4 = {};
|
|
+ data4.span_us = delta_us;
|
|
+ data4.rx_b = rx_b;
|
|
+ data4.tx_b = tx_b;
|
|
data4.ts_us = bpf_ktime_get_ns() / 1000;
|
|
data4.saddr = sk->__sk_common.skc_rcv_saddr;
|
|
data4.daddr = sk->__sk_common.skc_daddr;
|
|
@@ -128,17 +133,19 @@ int kprobe__tcp_set_state(struct pt_regs *ctx, struct sock *sk, int state)
|
|
if (mep == 0) {
|
|
bpf_get_current_comm(&data4.task, sizeof(data4.task));
|
|
} else {
|
|
- bpf_probe_read(&data4.task, sizeof(data4.task), (void *)mep->task);
|
|
+ bpf_probe_read_kernel(&data4.task, sizeof(data4.task), (void *)mep->task);
|
|
}
|
|
ipv4_events.perf_submit(ctx, &data4, sizeof(data4));
|
|
|
|
} else /* 6 */ {
|
|
- struct ipv6_data_t data6 = {.span_us = delta_us,
|
|
- .rx_b = rx_b, .tx_b = tx_b};
|
|
+ struct ipv6_data_t data6 = {};
|
|
+ data6.span_us = delta_us;
|
|
+ data6.rx_b = rx_b;
|
|
+ data6.tx_b = tx_b;
|
|
data6.ts_us = bpf_ktime_get_ns() / 1000;
|
|
- bpf_probe_read(&data6.saddr, sizeof(data6.saddr),
|
|
+ bpf_probe_read_kernel(&data6.saddr, sizeof(data6.saddr),
|
|
sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
|
|
- bpf_probe_read(&data6.daddr, sizeof(data6.daddr),
|
|
+ bpf_probe_read_kernel(&data6.daddr, sizeof(data6.daddr),
|
|
sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32);
|
|
// a workaround until data6 compiles with separate lport/dport
|
|
data6.ports = dport + ((0ULL + lport) << 32);
|
|
@@ -146,7 +153,7 @@ int kprobe__tcp_set_state(struct pt_regs *ctx, struct sock *sk, int state)
|
|
if (mep == 0) {
|
|
bpf_get_current_comm(&data6.task, sizeof(data6.task));
|
|
} else {
|
|
- bpf_probe_read(&data6.task, sizeof(data6.task), (void *)mep->task);
|
|
+ bpf_probe_read_kernel(&data6.task, sizeof(data6.task), (void *)mep->task);
|
|
}
|
|
ipv6_events.perf_submit(ctx, &data6, sizeof(data6));
|
|
}
|
|
diff --git a/src/pmdas/bcc/modules/tcplife_old_tp.bpf b/src/pmdas/bcc/modules/tcplife_old_tp.bpf
|
|
deleted file mode 100644
|
|
index a7c9c625c..000000000
|
|
--- a/src/pmdas/bcc/modules/tcplife_old_tp.bpf
|
|
+++ /dev/null
|
|
@@ -1,166 +0,0 @@
|
|
-// Copyright 2016 Netflix, Inc.
|
|
-// Licensed under the Apache License, Version 2.0 (the "License")
|
|
-
|
|
-#include <uapi/linux/ptrace.h>
|
|
-#define KBUILD_MODNAME "pcpbcctcplife"
|
|
-#include <linux/tcp.h>
|
|
-#include <net/sock.h>
|
|
-#include <bcc/proto.h>
|
|
-
|
|
-BPF_HASH(birth, struct sock *, u64);
|
|
-
|
|
-// separate data structs for ipv4 and ipv6
|
|
-struct ipv4_data_t {
|
|
- // XXX: switch some to u32's when supported
|
|
- u64 ts_us;
|
|
- u64 pid;
|
|
- u64 saddr;
|
|
- u64 daddr;
|
|
- u64 ports;
|
|
- u64 rx_b;
|
|
- u64 tx_b;
|
|
- u64 span_us;
|
|
- char task[TASK_COMM_LEN];
|
|
-};
|
|
-BPF_PERF_OUTPUT(ipv4_events);
|
|
-
|
|
-struct ipv6_data_t {
|
|
- u64 ts_us;
|
|
- u64 pid;
|
|
- unsigned __int128 saddr;
|
|
- unsigned __int128 daddr;
|
|
- u64 ports;
|
|
- u64 rx_b;
|
|
- u64 tx_b;
|
|
- u64 span_us;
|
|
- char task[TASK_COMM_LEN];
|
|
-};
|
|
-BPF_PERF_OUTPUT(ipv6_events);
|
|
-
|
|
-struct id_t {
|
|
- u32 pid;
|
|
- char task[TASK_COMM_LEN];
|
|
-};
|
|
-BPF_HASH(whoami, struct sock *, struct id_t);
|
|
-
|
|
-TRACEPOINT_PROBE(tcp, tcp_set_state)
|
|
-{
|
|
- u32 pid = bpf_get_current_pid_tgid() >> 32;
|
|
- // sk is mostly used as a UUID, once for skc_family, and two tcp stats:
|
|
- struct sock *sk = (struct sock *)args->skaddr;
|
|
-
|
|
- // lport is either used in a filter here, or later
|
|
- u16 lport = args->sport;
|
|
- //FILTER_LPORT
|
|
-
|
|
- // dport is either used in a filter here, or later
|
|
- u16 dport = args->dport;
|
|
- //FILTER_DPORT
|
|
-
|
|
- /*
|
|
- * This tool includes PID and comm context. It's best effort, and may
|
|
- * be wrong in some situations. It currently works like this:
|
|
- * - record timestamp on any state < TCP_FIN_WAIT1
|
|
- * - cache task context on:
|
|
- * TCP_SYN_SENT: tracing from client
|
|
- * TCP_LAST_ACK: client-closed from server
|
|
- * - do output on TCP_CLOSE:
|
|
- * fetch task context if cached, or use current task
|
|
- */
|
|
-
|
|
- // capture birth time
|
|
- if (args->newstate < TCP_FIN_WAIT1) {
|
|
- /*
|
|
- * Matching just ESTABLISHED may be sufficient, provided no code-path
|
|
- * sets ESTABLISHED without a tcp_set_state() call. Until we know
|
|
- * that for sure, match all early states to increase chances a
|
|
- * timestamp is set.
|
|
- * Note that this needs to be set before the PID filter later on,
|
|
- * since the PID isn't reliable for these early stages, so we must
|
|
- * save all timestamps and do the PID filter later when we can.
|
|
- */
|
|
- u64 ts = bpf_ktime_get_ns();
|
|
- birth.update(&sk, &ts);
|
|
- }
|
|
-
|
|
- // record PID & comm on SYN_SENT
|
|
- if (args->newstate == TCP_SYN_SENT || args->newstate == TCP_LAST_ACK) {
|
|
- // now we can PID filter, both here and a little later on for CLOSE
|
|
- FILTER_PID
|
|
- struct id_t me = {.pid = pid};
|
|
- bpf_get_current_comm(&me.task, sizeof(me.task));
|
|
- whoami.update(&sk, &me);
|
|
- }
|
|
-
|
|
- if (args->newstate != TCP_CLOSE)
|
|
- return 0;
|
|
-
|
|
- // calculate lifespan
|
|
- u64 *tsp, delta_us;
|
|
- tsp = birth.lookup(&sk);
|
|
- if (tsp == 0) {
|
|
- whoami.delete(&sk); // may not exist
|
|
- return 0; // missed create
|
|
- }
|
|
- delta_us = (bpf_ktime_get_ns() - *tsp) / 1000;
|
|
- birth.delete(&sk);
|
|
-
|
|
- // fetch possible cached data, and filter
|
|
- struct id_t *mep;
|
|
- mep = whoami.lookup(&sk);
|
|
- if (mep != 0)
|
|
- pid = mep->pid;
|
|
- FILTER_PID
|
|
-
|
|
- // get throughput stats. see tcp_get_info().
|
|
- u64 rx_b = 0, tx_b = 0, sport = 0;
|
|
- struct tcp_sock *tp = (struct tcp_sock *)sk;
|
|
- // OLD: bpf_probe_read(&rx_b, sizeof(rx_b), &tp->bytes_received);
|
|
- // OLD: bpf_probe_read(&tx_b, sizeof(tx_b), &tp->bytes_acked);
|
|
- // NEW: rx_b = tp->bytes_received;
|
|
- // NEW: tx_b = tp->bytes_acked;
|
|
-
|
|
- u16 family = 0;
|
|
- // OLD: bpf_probe_read(&family, sizeof(family), &sk->__sk_common.skc_family);
|
|
- // NEW: family = sk->__sk_common.skc_family;
|
|
-
|
|
- if (family == AF_INET) {
|
|
-
|
|
- struct ipv4_data_t data4 = {.span_us = delta_us,
|
|
- .rx_b = rx_b, .tx_b = tx_b};
|
|
- data4.ts_us = bpf_ktime_get_ns() / 1000;
|
|
- bpf_probe_read(&data4.saddr, sizeof(u32), args->saddr);
|
|
- bpf_probe_read(&data4.daddr, sizeof(u32), args->daddr);
|
|
- // a workaround until data4 compiles with separate lport/dport
|
|
- data4.ports = dport + ((0ULL + lport) << 32);
|
|
- data4.pid = pid;
|
|
-
|
|
- if (mep == 0) {
|
|
- bpf_get_current_comm(&data4.task, sizeof(data4.task));
|
|
- } else {
|
|
- bpf_probe_read(&data4.task, sizeof(data4.task), (void *)mep->task);
|
|
- }
|
|
- ipv4_events.perf_submit(args, &data4, sizeof(data4));
|
|
-
|
|
- } else /* 6 */ {
|
|
- struct ipv6_data_t data6 = {.span_us = delta_us,
|
|
- .rx_b = rx_b, .tx_b = tx_b};
|
|
- data6.ts_us = bpf_ktime_get_ns() / 1000;
|
|
- bpf_probe_read(&data6.saddr, sizeof(data6.saddr), args->saddr_v6);
|
|
- bpf_probe_read(&data6.daddr, sizeof(data6.daddr), args->saddr_v6);
|
|
- // a workaround until data6 compiles with separate lport/dport
|
|
- data6.ports = dport + ((0ULL + lport) << 32);
|
|
- data6.pid = pid;
|
|
- if (mep == 0) {
|
|
- bpf_get_current_comm(&data6.task, sizeof(data6.task));
|
|
- } else {
|
|
- bpf_probe_read(&data6.task, sizeof(data6.task), (void *)mep->task);
|
|
- }
|
|
- ipv6_events.perf_submit(args, &data6, sizeof(data6));
|
|
- }
|
|
-
|
|
- if (mep != 0)
|
|
- whoami.delete(&sk);
|
|
-
|
|
- return 0;
|
|
-}
|
|
diff --git a/src/pmdas/bcc/modules/tcplife.bpf b/src/pmdas/bcc/modules/tcplife_tp.bpf
|
|
similarity index 80%
|
|
rename from src/pmdas/bcc/modules/tcplife.bpf
|
|
rename to src/pmdas/bcc/modules/tcplife_tp.bpf
|
|
index 19ca8d740..2b16b98e7 100644
|
|
--- a/src/pmdas/bcc/modules/tcplife.bpf
|
|
+++ b/src/pmdas/bcc/modules/tcplife_tp.bpf
|
|
@@ -2,7 +2,6 @@
|
|
// Licensed under the Apache License, Version 2.0 (the "License")
|
|
|
|
#include <uapi/linux/ptrace.h>
|
|
-#define KBUILD_MODNAME "pcpbcctcplife"
|
|
#include <linux/tcp.h>
|
|
#include <net/sock.h>
|
|
#include <bcc/proto.h>
|
|
@@ -11,11 +10,10 @@ BPF_HASH(birth, struct sock *, u64);
|
|
|
|
// separate data structs for ipv4 and ipv6
|
|
struct ipv4_data_t {
|
|
- // XXX: switch some to u32's when supported
|
|
u64 ts_us;
|
|
- u64 pid;
|
|
- u64 saddr;
|
|
- u64 daddr;
|
|
+ u32 pid;
|
|
+ u32 saddr;
|
|
+ u32 daddr;
|
|
u64 ports;
|
|
u64 rx_b;
|
|
u64 tx_b;
|
|
@@ -26,7 +24,7 @@ BPF_PERF_OUTPUT(ipv4_events);
|
|
|
|
struct ipv6_data_t {
|
|
u64 ts_us;
|
|
- u64 pid;
|
|
+ u32 pid;
|
|
unsigned __int128 saddr;
|
|
unsigned __int128 daddr;
|
|
u64 ports;
|
|
@@ -54,11 +52,11 @@ TRACEPOINT_PROBE(sock, inet_sock_set_state)
|
|
|
|
// lport is either used in a filter here, or later
|
|
u16 lport = args->sport;
|
|
- //FILTER_LPORT
|
|
+ FILTER_LPORT
|
|
|
|
// dport is either used in a filter here, or later
|
|
u16 dport = args->dport;
|
|
- //FILTER_DPORT
|
|
+ FILTER_DPORT
|
|
|
|
/*
|
|
* This tool includes PID and comm context. It's best effort, and may
|
|
@@ -115,20 +113,23 @@ TRACEPOINT_PROBE(sock, inet_sock_set_state)
|
|
pid = mep->pid;
|
|
FILTER_PID
|
|
|
|
+ u16 family = args->family;
|
|
+ FILTER_FAMILY
|
|
+
|
|
// get throughput stats. see tcp_get_info().
|
|
u64 rx_b = 0, tx_b = 0, sport = 0;
|
|
struct tcp_sock *tp = (struct tcp_sock *)sk;
|
|
- // OLD: bpf_probe_read(&rx_b, sizeof(rx_b), &tp->bytes_received);
|
|
- // OLD: bpf_probe_read(&tx_b, sizeof(tx_b), &tp->bytes_acked);
|
|
- // NEW: rx_b = tp->bytes_received;
|
|
- // NEW: tx_b = tp->bytes_acked;
|
|
+ rx_b = tp->bytes_received;
|
|
+ tx_b = tp->bytes_acked;
|
|
|
|
if (args->family == AF_INET) {
|
|
- struct ipv4_data_t data4 = {.span_us = delta_us,
|
|
- .rx_b = rx_b, .tx_b = tx_b};
|
|
+ struct ipv4_data_t data4 = {};
|
|
+ data4.span_us = delta_us;
|
|
+ data4.rx_b = rx_b;
|
|
+ data4.tx_b = tx_b;
|
|
data4.ts_us = bpf_ktime_get_ns() / 1000;
|
|
- bpf_probe_read(&data4.saddr, sizeof(u32), args->saddr);
|
|
- bpf_probe_read(&data4.daddr, sizeof(u32), args->daddr);
|
|
+ __builtin_memcpy(&data4.saddr, args->saddr, sizeof(data4.saddr));
|
|
+ __builtin_memcpy(&data4.daddr, args->daddr, sizeof(data4.daddr));
|
|
// a workaround until data4 compiles with separate lport/dport
|
|
data4.ports = dport + ((0ULL + lport) << 32);
|
|
data4.pid = pid;
|
|
@@ -136,23 +137,25 @@ TRACEPOINT_PROBE(sock, inet_sock_set_state)
|
|
if (mep == 0) {
|
|
bpf_get_current_comm(&data4.task, sizeof(data4.task));
|
|
} else {
|
|
- bpf_probe_read(&data4.task, sizeof(data4.task), (void *)mep->task);
|
|
+ bpf_probe_read_kernel(&data4.task, sizeof(data4.task), (void *)mep->task);
|
|
}
|
|
ipv4_events.perf_submit(args, &data4, sizeof(data4));
|
|
|
|
} else /* 6 */ {
|
|
- struct ipv6_data_t data6 = {.span_us = delta_us,
|
|
- .rx_b = rx_b, .tx_b = tx_b};
|
|
+ struct ipv6_data_t data6 = {};
|
|
+ data6.span_us = delta_us;
|
|
+ data6.rx_b = rx_b;
|
|
+ data6.tx_b = tx_b;
|
|
data6.ts_us = bpf_ktime_get_ns() / 1000;
|
|
- bpf_probe_read(&data6.saddr, sizeof(data6.saddr), args->saddr_v6);
|
|
- bpf_probe_read(&data6.daddr, sizeof(data6.daddr), args->saddr_v6);
|
|
+ __builtin_memcpy(&data6.saddr, args->saddr_v6, sizeof(data6.saddr));
|
|
+ __builtin_memcpy(&data6.daddr, args->daddr_v6, sizeof(data6.daddr));
|
|
// a workaround until data6 compiles with separate lport/dport
|
|
data6.ports = dport + ((0ULL + lport) << 32);
|
|
data6.pid = pid;
|
|
if (mep == 0) {
|
|
bpf_get_current_comm(&data6.task, sizeof(data6.task));
|
|
} else {
|
|
- bpf_probe_read(&data6.task, sizeof(data6.task), (void *)mep->task);
|
|
+ bpf_probe_read_kernel(&data6.task, sizeof(data6.task), (void *)mep->task);
|
|
}
|
|
ipv6_events.perf_submit(args, &data6, sizeof(data6));
|
|
}
|
|
diff --git a/src/pmdas/bcc/modules/tcpperpid.python b/src/pmdas/bcc/modules/tcpperpid.python
|
|
index 3cb2cfcfd..0096929a6 100644
|
|
--- a/src/pmdas/bcc/modules/tcpperpid.python
|
|
+++ b/src/pmdas/bcc/modules/tcpperpid.python
|
|
@@ -32,16 +32,10 @@ from modules.pcpbcc import PCPBCCBase
|
|
#
|
|
# BPF program
|
|
#
|
|
-bpf_src = "modules/tcplife.bpf"
|
|
-# Compat with kernel < 4.16, bcc < 0.6
|
|
-TRACEFS = "/sys/kernel/debug/tracing"
|
|
-bpf_src_old_tp = "modules/tcplife_old_tp.bpf"
|
|
-bpf_src_old_kb = "modules/tcplife_old_kb.bpf"
|
|
-if not path.exists(TRACEFS + "/events/sock/inet_sock_set_state"):
|
|
- if path.exists(TRACEFS + "/events/tcp/tcp_set_state"):
|
|
- bpf_src = bpf_src_old_tp
|
|
- else:
|
|
- bpf_src = bpf_src_old_kb
|
|
+if BPF.tracepoint_exists("sock", "inet_sock_set_state"):
|
|
+ bpf_src = "modules/tcplife_tp.bpf"
|
|
+else:
|
|
+ bpf_src = "modules/tcplife_kp.bpf"
|
|
|
|
# Alternative, "high resolution" BPF
|
|
bpf_highres = "modules/tcptop.bpf"
|
|
@@ -53,36 +47,6 @@ MODULE = 'tcpperpid'
|
|
BASENS = 'proc.io.net.total.'
|
|
units_bytes = pmUnits(1, 0, 0, PM_SPACE_BYTE, 0, 0)
|
|
|
|
-TASK_COMM_LEN = 16 # linux/sched.h
|
|
-
|
|
-class Data_ipv4(ct.Structure):
|
|
- """ IPv4 data struct """
|
|
- _fields_ = [
|
|
- ("ts_us", ct.c_ulonglong),
|
|
- ("pid", ct.c_ulonglong),
|
|
- ("saddr", ct.c_ulonglong),
|
|
- ("daddr", ct.c_ulonglong),
|
|
- ("ports", ct.c_ulonglong),
|
|
- ("rx_b", ct.c_ulonglong),
|
|
- ("tx_b", ct.c_ulonglong),
|
|
- ("span_us", ct.c_ulonglong),
|
|
- ("task", ct.c_char * TASK_COMM_LEN)
|
|
- ]
|
|
-
|
|
-class Data_ipv6(ct.Structure):
|
|
- """ IPv6 data struct """
|
|
- _fields_ = [
|
|
- ("ts_us", ct.c_ulonglong),
|
|
- ("pid", ct.c_ulonglong),
|
|
- ("saddr", (ct.c_ulonglong * 2)),
|
|
- ("daddr", (ct.c_ulonglong * 2)),
|
|
- ("ports", ct.c_ulonglong),
|
|
- ("rx_b", ct.c_ulonglong),
|
|
- ("tx_b", ct.c_ulonglong),
|
|
- ("span_us", ct.c_ulonglong),
|
|
- ("task", ct.c_char * TASK_COMM_LEN)
|
|
- ]
|
|
-
|
|
#
|
|
# PCP BCC Module
|
|
#
|
|
@@ -133,15 +97,14 @@ class PCPBCCModule(PCPBCCBase):
|
|
self.log("Using BPF source file %s." % src)
|
|
|
|
# Exit hard if impossible to continue
|
|
- if self.bcc_version() == "0.6.1" and src == bpf_src_old_kb and not self.highres:
|
|
- raise RuntimeError("BCC 0.6.1 bug makes it incompatible with this module "
|
|
- "on kernel < 4.15 in non-highres mode.")
|
|
+ if self.bcc_version_tuple() < (0, 6, 1) and not self.highres:
|
|
+ raise RuntimeError("BCC 0.6.1+ is required for this module in non-highres mode.")
|
|
|
|
self.log("Initialized.")
|
|
|
|
def handle_ipv4_event(self, _cpu, data, _size):
|
|
""" IPv4 event handler """
|
|
- event = ct.cast(data, ct.POINTER(Data_ipv4)).contents
|
|
+ event = self.bpf["ipv4_events"].event(data)
|
|
pid = str(event.pid).zfill(6)
|
|
self.lock.acquire()
|
|
if pid not in self.ipv4_stats:
|
|
@@ -153,7 +116,7 @@ class PCPBCCModule(PCPBCCBase):
|
|
|
|
def handle_ipv6_event(self, _cpu, data, _size):
|
|
""" IPv6 event handler """
|
|
- event = ct.cast(data, ct.POINTER(Data_ipv6)).contents
|
|
+ event = self.bpf["ipv6_events"].event(data)
|
|
pid = str(event.pid).zfill(6)
|
|
self.lock.acquire()
|
|
if pid not in self.ipv6_stats:
|
|
@@ -199,31 +162,25 @@ class PCPBCCModule(PCPBCCBase):
|
|
self.bpf_text = src.read()
|
|
if self.highres:
|
|
self.bpf_text = self.bpf_text.replace("FILTER", "FILTER_PID")
|
|
- # Compat with bcc < 0.6
|
|
- self.log("Testing BCC compatilibility, possible errors below are safe to ignore.")
|
|
- try:
|
|
- test_txt = self.bpf_text.replace("// NEW: ", "").replace("FILTER_PID", "")
|
|
- test_bpf = BPF(text=test_txt)
|
|
- test_bpf.cleanup()
|
|
- self.bpf_text = self.bpf_text.replace("// NEW: ", "")
|
|
- except Exception: # pylint: disable=broad-except
|
|
- self.bpf_text = self.bpf_text.replace("// OLD: ", "")
|
|
- self.log("Tested BCC compatilibility, possible errors above are safe to ignore.")
|
|
|
|
if self.dports:
|
|
filterp = " && ".join(["dport != %d" % port for port in self.dports])
|
|
filter_txt = "if (%s) { birth.delete(&sk); return 0; }" % filterp
|
|
- self.bpf_text = self.bpf_text.replace("//FILTER_DPORT", filter_txt)
|
|
+ self.bpf_text = self.bpf_text.replace("FILTER_DPORT", filter_txt)
|
|
if self.lports:
|
|
filterp = " && ".join(["lport != %d" % port for port in self.lports])
|
|
filter_txt = "if (%s) { birth.delete(&sk); return 0; }" % filterp
|
|
- self.bpf_text = self.bpf_text.replace("//FILTER_LPORT", filter_txt)
|
|
+ self.bpf_text = self.bpf_text.replace("FILTER_LPORT", filter_txt)
|
|
|
|
if not self.pids and self.proc_filter and self.proc_refresh:
|
|
self.log("No process to attach found, activation postponed.")
|
|
return
|
|
|
|
bpf_text = self.apply_pid_filter(self.bpf_text, self.pids, False)
|
|
+ bpf_text = bpf_text.replace('FILTER_PID', '')
|
|
+ bpf_text = bpf_text.replace('FILTER_DPORT', '')
|
|
+ bpf_text = bpf_text.replace('FILTER_LPORT', '')
|
|
+ bpf_text = bpf_text.replace('FILTER_FAMILY', '')
|
|
|
|
if self.debug:
|
|
self.log("BPF to be compiled:\n" + bpf_text.strip())
|
|
diff --git a/src/pmdas/bcc/modules/tcptop.bpf b/src/pmdas/bcc/modules/tcptop.bpf
|
|
index 349ee1529..c1fed7aef 100644
|
|
--- a/src/pmdas/bcc/modules/tcptop.bpf
|
|
+++ b/src/pmdas/bcc/modules/tcptop.bpf
|
|
@@ -4,6 +4,7 @@
|
|
#include <uapi/linux/ptrace.h>
|
|
#include <net/sock.h>
|
|
#include <bcc/proto.h>
|
|
+
|
|
struct ipv4_key_t {
|
|
u32 pid;
|
|
u32 saddr;
|
|
@@ -13,25 +14,32 @@ struct ipv4_key_t {
|
|
};
|
|
BPF_HASH(ipv4_send_bytes, struct ipv4_key_t);
|
|
BPF_HASH(ipv4_recv_bytes, struct ipv4_key_t);
|
|
+
|
|
struct ipv6_key_t {
|
|
+ unsigned __int128 saddr;
|
|
+ unsigned __int128 daddr;
|
|
u32 pid;
|
|
- // workaround until unsigned __int128 support:
|
|
- u64 saddr0;
|
|
- u64 saddr1;
|
|
- u64 daddr0;
|
|
- u64 daddr1;
|
|
u16 lport;
|
|
u16 dport;
|
|
+ u64 __pad__;
|
|
};
|
|
BPF_HASH(ipv6_send_bytes, struct ipv6_key_t);
|
|
BPF_HASH(ipv6_recv_bytes, struct ipv6_key_t);
|
|
+
|
|
int kprobe__tcp_sendmsg(struct pt_regs *ctx, struct sock *sk,
|
|
struct msghdr *msg, size_t size)
|
|
{
|
|
- u32 pid = bpf_get_current_pid_tgid();
|
|
- FILTER
|
|
+ if (container_should_be_filtered()) {
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
+ u32 pid = bpf_get_current_pid_tgid() >> 32;
|
|
+ FILTER_PID
|
|
+
|
|
u16 dport = 0, family = sk->__sk_common.skc_family;
|
|
- u64 *val, zero = 0;
|
|
+
|
|
+ FILTER_FAMILY
|
|
+
|
|
if (family == AF_INET) {
|
|
struct ipv4_key_t ipv4_key = {.pid = pid};
|
|
ipv4_key.saddr = sk->__sk_common.skc_rcv_saddr;
|
|
@@ -39,31 +47,24 @@ int kprobe__tcp_sendmsg(struct pt_regs *ctx, struct sock *sk,
|
|
ipv4_key.lport = sk->__sk_common.skc_num;
|
|
dport = sk->__sk_common.skc_dport;
|
|
ipv4_key.dport = ntohs(dport);
|
|
- val = ipv4_send_bytes.lookup_or_init(&ipv4_key, &zero);
|
|
- if (val) {
|
|
- (*val) += size;
|
|
- }
|
|
+ ipv4_send_bytes.increment(ipv4_key, size);
|
|
+
|
|
} else if (family == AF_INET6) {
|
|
struct ipv6_key_t ipv6_key = {.pid = pid};
|
|
- bpf_probe_read(&ipv6_key.saddr0, sizeof(ipv6_key.saddr0),
|
|
- &sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32[0]);
|
|
- bpf_probe_read(&ipv6_key.saddr1, sizeof(ipv6_key.saddr1),
|
|
- &sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32[2]);
|
|
- bpf_probe_read(&ipv6_key.daddr0, sizeof(ipv6_key.daddr0),
|
|
- &sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32[0]);
|
|
- bpf_probe_read(&ipv6_key.daddr1, sizeof(ipv6_key.daddr1),
|
|
- &sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32[2]);
|
|
+ bpf_probe_read_kernel(&ipv6_key.saddr, sizeof(ipv6_key.saddr),
|
|
+ &sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
|
|
+ bpf_probe_read_kernel(&ipv6_key.daddr, sizeof(ipv6_key.daddr),
|
|
+ &sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32);
|
|
ipv6_key.lport = sk->__sk_common.skc_num;
|
|
dport = sk->__sk_common.skc_dport;
|
|
ipv6_key.dport = ntohs(dport);
|
|
- val = ipv6_send_bytes.lookup_or_init(&ipv6_key, &zero);
|
|
- if (val) {
|
|
- (*val) += size;
|
|
- }
|
|
+ ipv6_send_bytes.increment(ipv6_key, size);
|
|
}
|
|
// else drop
|
|
+
|
|
return 0;
|
|
}
|
|
+
|
|
/*
|
|
* tcp_recvmsg() would be obvious to trace, but is less suitable because:
|
|
* - we'd need to trace both entry and return, to have both sock and size
|
|
@@ -72,12 +73,21 @@ int kprobe__tcp_sendmsg(struct pt_regs *ctx, struct sock *sk,
|
|
*/
|
|
int kprobe__tcp_cleanup_rbuf(struct pt_regs *ctx, struct sock *sk, int copied)
|
|
{
|
|
- u32 pid = bpf_get_current_pid_tgid();
|
|
- FILTER
|
|
+ if (container_should_be_filtered()) {
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
+ u32 pid = bpf_get_current_pid_tgid() >> 32;
|
|
+ FILTER_PID
|
|
+
|
|
u16 dport = 0, family = sk->__sk_common.skc_family;
|
|
u64 *val, zero = 0;
|
|
+
|
|
if (copied <= 0)
|
|
return 0;
|
|
+
|
|
+ FILTER_FAMILY
|
|
+
|
|
if (family == AF_INET) {
|
|
struct ipv4_key_t ipv4_key = {.pid = pid};
|
|
ipv4_key.saddr = sk->__sk_common.skc_rcv_saddr;
|
|
@@ -85,28 +95,20 @@ int kprobe__tcp_cleanup_rbuf(struct pt_regs *ctx, struct sock *sk, int copied)
|
|
ipv4_key.lport = sk->__sk_common.skc_num;
|
|
dport = sk->__sk_common.skc_dport;
|
|
ipv4_key.dport = ntohs(dport);
|
|
- val = ipv4_recv_bytes.lookup_or_init(&ipv4_key, &zero);
|
|
- if (val) {
|
|
- (*val) += copied;
|
|
- }
|
|
+ ipv4_recv_bytes.increment(ipv4_key, copied);
|
|
+
|
|
} else if (family == AF_INET6) {
|
|
struct ipv6_key_t ipv6_key = {.pid = pid};
|
|
- bpf_probe_read(&ipv6_key.saddr0, sizeof(ipv6_key.saddr0),
|
|
- &sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32[0]);
|
|
- bpf_probe_read(&ipv6_key.saddr1, sizeof(ipv6_key.saddr1),
|
|
- &sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32[2]);
|
|
- bpf_probe_read(&ipv6_key.daddr0, sizeof(ipv6_key.daddr0),
|
|
- &sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32[0]);
|
|
- bpf_probe_read(&ipv6_key.daddr1, sizeof(ipv6_key.daddr1),
|
|
- &sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32[2]);
|
|
+ bpf_probe_read_kernel(&ipv6_key.saddr, sizeof(ipv6_key.saddr),
|
|
+ &sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
|
|
+ bpf_probe_read_kernel(&ipv6_key.daddr, sizeof(ipv6_key.daddr),
|
|
+ &sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32);
|
|
ipv6_key.lport = sk->__sk_common.skc_num;
|
|
dport = sk->__sk_common.skc_dport;
|
|
ipv6_key.dport = ntohs(dport);
|
|
- val = ipv6_recv_bytes.lookup_or_init(&ipv6_key, &zero);
|
|
- if (val) {
|
|
- (*val) += copied;
|
|
- }
|
|
+ ipv6_recv_bytes.increment(ipv6_key, copied);
|
|
}
|
|
// else drop
|
|
+
|
|
return 0;
|
|
}
|
|
diff --git a/src/pmdas/bcc/modules/tcptop.python b/src/pmdas/bcc/modules/tcptop.python
|
|
index 45063dff3..db1c1da15 100644
|
|
--- a/src/pmdas/bcc/modules/tcptop.python
|
|
+++ b/src/pmdas/bcc/modules/tcptop.python
|
|
@@ -120,13 +120,14 @@ class PCPBCCModule(PCPBCCBase):
|
|
with open(path.dirname(__file__) + '/../' + bpf_src) as src:
|
|
self.bpf_text = src.read()
|
|
|
|
- self.bpf_text = self.bpf_text.replace("FILTER", "FILTER_PID")
|
|
-
|
|
if not self.pids and self.proc_filter and self.proc_refresh:
|
|
self.log("No process to attach found, activation postponed.")
|
|
return
|
|
|
|
bpf_text = self.apply_pid_filter(self.bpf_text, self.pids, False)
|
|
+ bpf_text = bpf_text.replace('FILTER_PID', '')
|
|
+ bpf_text = bpf_text.replace('FILTER_FAMILY', '')
|
|
+ bpf_text = bpf_text.replace('container_should_be_filtered()', '0')
|
|
|
|
if self.debug:
|
|
self.log("BPF to be compiled:\n" + bpf_text.strip())
|
|
@@ -155,21 +156,31 @@ class PCPBCCModule(PCPBCCBase):
|
|
|
|
@staticmethod
|
|
def ipv4_table_to_dict(table):
|
|
- """ Build hashable dict from IPv4 BPF table """
|
|
- return {TCPSessionKey(pid=k.pid,
|
|
- laddr=inet_ntop(AF_INET, pack("I", k.saddr)),
|
|
- lport=k.lport,
|
|
- daddr=inet_ntop(AF_INET, pack("I", k.daddr)),
|
|
- dport=k.dport):v.value for k, v in table.items()}
|
|
+ """Build hashable dict from IPv4 BPF table"""
|
|
+ return {
|
|
+ TCPSessionKey(
|
|
+ pid=k.pid,
|
|
+ laddr=inet_ntop(AF_INET, pack("I", k.saddr)),
|
|
+ lport=k.lport,
|
|
+ daddr=inet_ntop(AF_INET, pack("I", k.daddr)),
|
|
+ dport=k.dport,
|
|
+ ): v.value
|
|
+ for k, v in table.items()
|
|
+ }
|
|
|
|
@staticmethod
|
|
def ipv6_table_to_dict(table):
|
|
- """ Build hashable dict from IPv6 BPF table """
|
|
- return {TCPSessionKey(pid=k.pid,
|
|
- laddr=inet_ntop(AF_INET6, pack("QQ", k.saddr0, k.saddr1)),
|
|
- lport=k.lport,
|
|
- daddr=inet_ntop(AF_INET6, pack("QQ", k.daddr0, k.daddr1)),
|
|
- dport=k.dport):v.value for k, v in table.items()}
|
|
+ """Build hashable dict from IPv6 BPF table"""
|
|
+ return {
|
|
+ TCPSessionKey(
|
|
+ pid=k.pid,
|
|
+ laddr=inet_ntop(AF_INET6, k.saddr),
|
|
+ lport=k.lport,
|
|
+ daddr=inet_ntop(AF_INET6, k.daddr),
|
|
+ dport=k.dport,
|
|
+ ): v.value
|
|
+ for k, v in table.items()
|
|
+ }
|
|
|
|
def refresh_stats(self):
|
|
""" Refresh statistics from BPF table """
|
|
commit d45ce8e85035cc95ba897cd19967fad6d5d741be (cherry-picked)
|
|
Author: Andreas Gerstmayr <agerstmayr@redhat.com>
|
|
Date: Wed Dec 15 08:03:40 2021 +0100
|
|
|
|
qa: update qa/1118 to add new log output of runqlat bcc module
|
|
|
|
diff --git a/qa/1118 b/qa/1118
|
|
index 4123495b5..bcaec0a0d 100755
|
|
--- a/qa/1118
|
|
+++ b/qa/1118
|
|
@@ -21,12 +21,19 @@ _label_filter()
|
|
grep '"0-1"' | grep '"statistic":"histogram"' | grep '"lower_bound":0' | grep 'upper_bound":1' > /dev/null && echo 'OK'
|
|
}
|
|
|
|
+_install_filter()
|
|
+{
|
|
+ sed \
|
|
+ -e "s/Using BPF source file .\+/Using BPF source file X/g" \
|
|
+ #end
|
|
+}
|
|
+
|
|
_prepare_pmda bcc
|
|
trap "_pmdabcc_cleanup; exit \$status" 0 1 2 3 15
|
|
_stop_auto_restart pmcd
|
|
|
|
# real QA test starts here
|
|
-cat <<EOF | _pmdabcc_install
|
|
+cat <<EOF | _pmdabcc_install | _install_filter
|
|
# Installed by PCP QA test $seq on `date`
|
|
[pmda]
|
|
modules = runqlat
|
|
diff --git a/qa/1118.out b/qa/1118.out
|
|
index 16a9fa4b4..e74f97964 100644
|
|
--- a/qa/1118.out
|
|
+++ b/qa/1118.out
|
|
@@ -9,6 +9,7 @@ Info: runqlat
|
|
Info: Modules configured.
|
|
Info: Initializing modules:
|
|
Info: runqlat
|
|
+Info: runqlat: Using BPF source file X
|
|
Info: runqlat: Initialized.
|
|
Info: Modules initialized.
|
|
Info: Registering metrics:
|
|
@@ -25,6 +26,7 @@ Info: runqlat
|
|
Info: Modules configured.
|
|
Info: Initializing modules:
|
|
Info: runqlat
|
|
+Info: runqlat: Using BPF source file X
|
|
Info: runqlat: Initialized.
|
|
Info: Modules initialized.
|
|
Info: Registering metrics:
|