pcp/SOURCES/redhat-bugzilla-2024982.patch

1434 lines
50 KiB
Diff

commit 14ffcd934e1c5099b471f4e73da32d1b32bac7e6
Author: Andreas Gerstmayr <agerstmayr@redhat.com>
Date: Mon Dec 13 20:10:40 2021 +0100
pmdabcc: sync bcc PMDA modules with upstream bcc tools
diff --git a/src/pmdas/bcc/modules/execsnoop.bpf b/src/pmdas/bcc/modules/execsnoop.bpf
index f69200773..aa755b3a1 100644
--- a/src/pmdas/bcc/modules/execsnoop.bpf
+++ b/src/pmdas/bcc/modules/execsnoop.bpf
@@ -4,40 +4,57 @@
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>
#include <linux/fs.h>
+
#define ARGSIZE 128
+
enum event_type {
EVENT_ARG,
EVENT_RET,
};
+
struct data_t {
u32 pid; // PID as in the userspace term (i.e. task->tgid in kernel)
u32 ppid; // Parent PID as in the userspace term (i.e task->real_parent->tgid in kernel)
+ u32 uid;
char comm[TASK_COMM_LEN];
enum event_type type;
char argv[ARGSIZE];
int retval;
};
+
BPF_PERF_OUTPUT(events);
+
static int __submit_arg(struct pt_regs *ctx, void *ptr, struct data_t *data)
{
- bpf_probe_read(data->argv, sizeof(data->argv), ptr);
+ bpf_probe_read_user(data->argv, sizeof(data->argv), ptr);
events.perf_submit(ctx, data, sizeof(struct data_t));
return 1;
}
+
static int submit_arg(struct pt_regs *ctx, void *ptr, struct data_t *data)
{
const char *argp = NULL;
- bpf_probe_read(&argp, sizeof(argp), ptr);
+ bpf_probe_read_user(&argp, sizeof(argp), ptr);
if (argp) {
return __submit_arg(ctx, (void *)(argp), data);
}
return 0;
}
+
int syscall__execve(struct pt_regs *ctx,
const char __user *filename,
const char __user *const __user *__argv,
const char __user *const __user *__envp)
{
+
+ u32 uid = bpf_get_current_uid_gid() & 0xffffffff;
+
+ UID_FILTER
+
+ if (container_should_be_filtered()) {
+ return 0;
+ }
+
// create data here and pass to submit_arg to save stack space (#555)
struct data_t data = {};
struct task_struct *task;
@@ -52,25 +69,37 @@ int syscall__execve(struct pt_regs *ctx,
bpf_get_current_comm(&data.comm, sizeof(data.comm));
data.type = EVENT_ARG;
+
__submit_arg(ctx, (void *)filename, &data);
+
// skip first arg, as we submitted filename
#pragma unroll
for (int i = 1; i < MAXARG; i++) {
if (submit_arg(ctx, (void *)&__argv[i], &data) == 0)
goto out;
}
+
// handle truncated argument list
char ellipsis[] = "...";
__submit_arg(ctx, (void *)ellipsis, &data);
out:
return 0;
}
+
int do_ret_sys_execve(struct pt_regs *ctx)
{
+ if (container_should_be_filtered()) {
+ return 0;
+ }
+
struct data_t data = {};
struct task_struct *task;
+ u32 uid = bpf_get_current_uid_gid() & 0xffffffff;
+ UID_FILTER
+
data.pid = bpf_get_current_pid_tgid() >> 32;
+ data.uid = uid;
task = (struct task_struct *)bpf_get_current_task();
// Some kernels, like Ubuntu 4.13.0-generic, return 0
@@ -82,5 +111,6 @@ int do_ret_sys_execve(struct pt_regs *ctx)
data.type = EVENT_RET;
data.retval = PT_REGS_RC(ctx);
events.perf_submit(ctx, &data, sizeof(data));
+
return 0;
}
diff --git a/src/pmdas/bcc/modules/execsnoop.python b/src/pmdas/bcc/modules/execsnoop.python
index 54382fa9b..1127cc471 100644
--- a/src/pmdas/bcc/modules/execsnoop.python
+++ b/src/pmdas/bcc/modules/execsnoop.python
@@ -44,20 +44,6 @@ MODULE = 'execsnoop'
BASENS = 'proc.exec.'
units_none = pmUnits(0, 0, 0, 0, 0, 0)
-TASK_COMM_LEN = 16 # linux/sched.h
-ARGSIZE = 128 # should match #define in execsnoop.bpf
-
-class Data(ct.Structure):
- """ execsnoop data struct """
- _fields_ = [
- ("pid", ct.c_uint),
- ("ppid", ct.c_uint),
- ("comm", ct.c_char * TASK_COMM_LEN),
- ("type", ct.c_int),
- ("argv", ct.c_char * ARGSIZE),
- ("retval", ct.c_int),
- ]
-
class EventType(object):
""" Event type """
EVENT_ARG = 0
@@ -137,7 +123,7 @@ class PCPBCCModule(PCPBCCBase):
def handle_event(self, _cpu, data, _size):
""" Event handler """
- event = ct.cast(data, ct.POINTER(Data)).contents
+ event = self.bpf["events"].event(data)
skip = False
if event.type == EventType.EVENT_ARG:
@@ -145,9 +131,9 @@ class PCPBCCModule(PCPBCCBase):
elif event.type == EventType.EVENT_RET:
if event.retval != 0 and not self.include_failed:
skip = True
- if self.command and not re.search(self.command, event.comm):
+ if self.command and not re.search(bytes(self.command), event.comm):
skip = True
- if self.args and not re.search(self.args, b" ".join(self.argv_cache[event.pid])):
+ if self.args and not re.search(bytes(self.args), b" ".join(self.argv_cache[event.pid])):
skip = True
if not skip:
@@ -177,10 +163,14 @@ class PCPBCCModule(PCPBCCBase):
self.bpf_text = self.bpf_text.replace("MAXARG", str(self.max_args))
+ bpf_text = self.bpf_text
+ bpf_text = bpf_text.replace('UID_FILTER', '')
+ bpf_text = bpf_text.replace('container_should_be_filtered()', '0')
+
if self.debug:
- self.log("BPF to be compiled:\n" + self.bpf_text.strip())
+ self.log("BPF to be compiled:\n" + bpf_text.strip())
- self.bpf = BPF(text=self.bpf_text)
+ self.bpf = BPF(text=bpf_text)
execve_fnname = self.get_syscall_fnname("execve")
self.bpf.attach_kprobe(event=execve_fnname, fn_name="syscall__execve")
self.bpf.attach_kretprobe(event=execve_fnname, fn_name="do_ret_sys_execve")
diff --git a/src/pmdas/bcc/modules/pcpbcc.python b/src/pmdas/bcc/modules/pcpbcc.python
index 0555dc33f..62783b7fc 100644
--- a/src/pmdas/bcc/modules/pcpbcc.python
+++ b/src/pmdas/bcc/modules/pcpbcc.python
@@ -14,6 +14,7 @@
""" PCP BCC PMDA module base class """
import re
+import platform
import ctypes as ct
from os import kill, listdir, path
from collections import OrderedDict
@@ -348,6 +349,16 @@ class PCPBCCBase(object):
""" Returns BCC version as an int tuple (for comparisons) """
return tuple(map(int, PCPBCCBase.bcc_version().split('.')))
+ @staticmethod
+ def kernel_version():
+ """Returns the kernel version as an int tuple (major, minor, patch), or (0, 0, 0) if it cannot be parsed"""
+ version_str = platform.release()
+ m = re.match(r'^(\d+)\.(\d+)\.(\d+)', version_str)
+ if m:
+ return tuple(map(int, m.groups()))
+ else:
+ return (0, 0, 0)
+
def perf_buffer_poller(self):
""" BPF poller """
try:
diff --git a/src/pmdas/bcc/modules/runqlat.python b/src/pmdas/bcc/modules/runqlat.python
index 27007c7e5..1c6c6b4b0 100644
--- a/src/pmdas/bcc/modules/runqlat.python
+++ b/src/pmdas/bcc/modules/runqlat.python
@@ -30,7 +30,11 @@ from modules.pcpbcc import PCPBCCBase
#
# BPF program
#
-bpf_src = "modules/runqlat.bpf"
+is_support_raw_tp = BPF.support_raw_tracepoint()
+if is_support_raw_tp:
+ bpf_src = "modules/runqlat_tp.bpf"
+else:
+ bpf_src = "modules/runqlat_kp.bpf"
#
# PCP BCC PMDA constants
@@ -59,6 +63,7 @@ class PCPBCCModule(PCPBCCBase):
self.proc_filter = self.config.get(MODULE, opt)
self.update_pids(self.get_proc_info(self.proc_filter))
+ self.log("Using BPF source file %s." % bpf_src)
self.log("Initialized.")
def metrics(self):
@@ -89,7 +94,23 @@ class PCPBCCModule(PCPBCCBase):
with open(path.dirname(__file__) + '/../' + bpf_src) as src:
self.bpf_text = src.read()
+ # BPF.kernel_struct_has_field requires BCC v0.23.0
+ # use kernel version check as alternative
+ # pylint: disable=no-member
+ if (
+ hasattr(BPF, "kernel_struct_has_field")
+ and BPF.kernel_struct_has_field(b"task_struct", b"__state") == 1
+ ) or self.kernel_version() >= (5, 14, 0):
+ self.bpf_text = self.bpf_text.replace('STATE_FIELD', '__state')
+ else:
+ self.bpf_text = self.bpf_text.replace('STATE_FIELD', 'state')
+
self.bpf_text = self.bpf_text.replace("FILTER", "PID_CHECK")
+ self.bpf_text = self.bpf_text.replace('FACTOR', 'delta /= 1000;')
+
+ self.bpf_text = self.bpf_text.replace('STORAGE', 'BPF_HISTOGRAM(dist);')
+ self.bpf_text = self.bpf_text.replace('STORE',
+ 'dist.increment(bpf_log2l(delta));')
if not self.pids and self.proc_filter and self.proc_refresh:
self.log("No process to attach found, activation postponed.")
@@ -102,9 +123,11 @@ class PCPBCCModule(PCPBCCBase):
self.reset_cache()
self.bpf = BPF(text=bpf_text)
- self.bpf.attach_kprobe(event="ttwu_do_wakeup", fn_name="trace_ttwu_do_wakeup")
- self.bpf.attach_kprobe(event="wake_up_new_task", fn_name="trace_wake_up_new_task")
- self.bpf.attach_kprobe(event_re=r"^finish_task_switch$|^finish_task_switch\.isra\.\d$", fn_name="trace_run")
+ if not is_support_raw_tp:
+ self.bpf.attach_kprobe(event="ttwu_do_wakeup", fn_name="trace_ttwu_do_wakeup")
+ self.bpf.attach_kprobe(event="wake_up_new_task", fn_name="trace_wake_up_new_task")
+ self.bpf.attach_kprobe(event_re=r"^finish_task_switch$|^finish_task_switch\.isra\.\d$",
+ fn_name="trace_run")
self.log("Compiled.")
except Exception as error: # pylint: disable=broad-except
self.bpf = None
diff --git a/src/pmdas/bcc/modules/runqlat.bpf b/src/pmdas/bcc/modules/runqlat_kp.bpf
similarity index 54%
rename from src/pmdas/bcc/modules/runqlat.bpf
rename to src/pmdas/bcc/modules/runqlat_kp.bpf
index a3664a035..dd643d600 100644
--- a/src/pmdas/bcc/modules/runqlat.bpf
+++ b/src/pmdas/bcc/modules/runqlat_kp.bpf
@@ -5,6 +5,7 @@
#include <linux/sched.h>
#include <linux/nsproxy.h>
#include <linux/pid_namespace.h>
+#include <linux/init_task.h>
typedef struct pid_key {
u64 id; // work around
@@ -17,7 +18,7 @@ typedef struct pidns_key {
} pidns_key_t;
BPF_HASH(start, u32);
-BPF_HISTOGRAM(dist);
+STORAGE
struct rq;
@@ -31,6 +32,45 @@ static int trace_enqueue(u32 tgid, u32 pid)
return 0;
}
+static __always_inline unsigned int pid_namespace(struct task_struct *task)
+{
+
+/* task_struct.pids[] was removed in commit 2c4704756cab7cfa031ada4dab361562f0e357c0.
+ * The INIT_PID_LINK macro is tested below to select the code path matching whether pids[] still exists.
+ */
+#ifdef INIT_PID_LINK
+ struct pid_link pids;
+ unsigned int level;
+ struct upid upid;
+ struct ns_common ns;
+
+ /* get the pid namespace by following task_active_pid_ns(),
+ * pid->numbers[pid->level].ns
+ */
+ bpf_probe_read_kernel(&pids, sizeof(pids), &task->pids[PIDTYPE_PID]);
+ bpf_probe_read_kernel(&level, sizeof(level), &pids.pid->level);
+ bpf_probe_read_kernel(&upid, sizeof(upid), &pids.pid->numbers[level]);
+ bpf_probe_read_kernel(&ns, sizeof(ns), &upid.ns->ns);
+
+ return ns.inum;
+#else
+ struct pid *pid;
+ unsigned int level;
+ struct upid upid;
+ struct ns_common ns;
+
+ /* get the pid namespace by following task_active_pid_ns(),
+ * pid->numbers[pid->level].ns
+ */
+ bpf_probe_read_kernel(&pid, sizeof(pid), &task->thread_pid);
+ bpf_probe_read_kernel(&level, sizeof(level), &pid->level);
+ bpf_probe_read_kernel(&upid, sizeof(upid), &pid->numbers[level]);
+ bpf_probe_read_kernel(&ns, sizeof(ns), &upid.ns->ns);
+
+ return ns.inum;
+#endif
+}
+
int trace_wake_up_new_task(struct pt_regs *ctx, struct task_struct *p)
{
return trace_enqueue(p->tgid, p->pid);
@@ -48,7 +88,7 @@ int trace_run(struct pt_regs *ctx, struct task_struct *prev)
u32 pid, tgid;
// ivcsw: treat like an enqueue event and store timestamp
- if (prev->state == TASK_RUNNING) {
+ if (prev->STATE_FIELD == TASK_RUNNING) {
tgid = prev->tgid;
pid = prev->pid;
if (!(FILTER || pid == 0)) {
@@ -69,10 +109,10 @@ int trace_run(struct pt_regs *ctx, struct task_struct *prev)
return 0; // missed enqueue
}
delta = bpf_ktime_get_ns() - *tsp;
- delta /= 1000;
+ FACTOR
// store as histogram
- dist.increment(bpf_log2l(delta));
+ STORE
start.delete(&pid);
return 0;
diff --git a/src/pmdas/bcc/modules/runqlat_tp.bpf b/src/pmdas/bcc/modules/runqlat_tp.bpf
new file mode 100644
index 000000000..f0e9ce69b
--- /dev/null
+++ b/src/pmdas/bcc/modules/runqlat_tp.bpf
@@ -0,0 +1,124 @@
+// Copyright 2016 Netflix, Inc.
+// Licensed under the Apache License, Version 2.0 (the "License")
+
+#include <uapi/linux/ptrace.h>
+#include <linux/sched.h>
+#include <linux/nsproxy.h>
+#include <linux/pid_namespace.h>
+#include <linux/init_task.h>
+
+typedef struct pid_key {
+ u64 id; // work around
+ u64 slot;
+} pid_key_t;
+
+typedef struct pidns_key {
+ u64 id; // work around
+ u64 slot;
+} pidns_key_t;
+
+BPF_HASH(start, u32);
+STORAGE
+
+struct rq;
+
+// record enqueue timestamp
+static int trace_enqueue(u32 tgid, u32 pid)
+{
+ if (FILTER || pid == 0)
+ return 0;
+ u64 ts = bpf_ktime_get_ns();
+ start.update(&pid, &ts);
+ return 0;
+}
+
+static __always_inline unsigned int pid_namespace(struct task_struct *task)
+{
+
+/* task_struct.pids[] was removed in commit 2c4704756cab7cfa031ada4dab361562f0e357c0.
+ * The INIT_PID_LINK macro is tested below to select the code path matching whether pids[] still exists.
+ */
+#ifdef INIT_PID_LINK
+ struct pid_link pids;
+ unsigned int level;
+ struct upid upid;
+ struct ns_common ns;
+
+ /* get the pid namespace by following task_active_pid_ns(),
+ * pid->numbers[pid->level].ns
+ */
+ bpf_probe_read_kernel(&pids, sizeof(pids), &task->pids[PIDTYPE_PID]);
+ bpf_probe_read_kernel(&level, sizeof(level), &pids.pid->level);
+ bpf_probe_read_kernel(&upid, sizeof(upid), &pids.pid->numbers[level]);
+ bpf_probe_read_kernel(&ns, sizeof(ns), &upid.ns->ns);
+
+ return ns.inum;
+#else
+ struct pid *pid;
+ unsigned int level;
+ struct upid upid;
+ struct ns_common ns;
+
+ /* get the pid namespace by following task_active_pid_ns(),
+ * pid->numbers[pid->level].ns
+ */
+ bpf_probe_read_kernel(&pid, sizeof(pid), &task->thread_pid);
+ bpf_probe_read_kernel(&level, sizeof(level), &pid->level);
+ bpf_probe_read_kernel(&upid, sizeof(upid), &pid->numbers[level]);
+ bpf_probe_read_kernel(&ns, sizeof(ns), &upid.ns->ns);
+
+ return ns.inum;
+#endif
+}
+
+RAW_TRACEPOINT_PROBE(sched_wakeup)
+{
+ // TP_PROTO(struct task_struct *p)
+ struct task_struct *p = (struct task_struct *)ctx->args[0];
+ return trace_enqueue(p->tgid, p->pid);
+}
+
+RAW_TRACEPOINT_PROBE(sched_wakeup_new)
+{
+ // TP_PROTO(struct task_struct *p)
+ struct task_struct *p = (struct task_struct *)ctx->args[0];
+ return trace_enqueue(p->tgid, p->pid);
+}
+
+RAW_TRACEPOINT_PROBE(sched_switch)
+{
+ // TP_PROTO(bool preempt, struct task_struct *prev, struct task_struct *next)
+ struct task_struct *prev = (struct task_struct *)ctx->args[1];
+ struct task_struct *next = (struct task_struct *)ctx->args[2];
+ u32 pid, tgid;
+
+ // ivcsw: treat like an enqueue event and store timestamp
+ if (prev->STATE_FIELD == TASK_RUNNING) {
+ tgid = prev->tgid;
+ pid = prev->pid;
+ if (!(FILTER || pid == 0)) {
+ u64 ts = bpf_ktime_get_ns();
+ start.update(&pid, &ts);
+ }
+ }
+
+ tgid = next->tgid;
+ pid = next->pid;
+ if (FILTER || pid == 0)
+ return 0;
+ u64 *tsp, delta;
+
+ // fetch timestamp and calculate delta
+ tsp = start.lookup(&pid);
+ if (tsp == 0) {
+ return 0; // missed enqueue
+ }
+ delta = bpf_ktime_get_ns() - *tsp;
+ FACTOR
+
+ // store as histogram
+ STORE
+
+ start.delete(&pid);
+ return 0;
+}
diff --git a/src/pmdas/bcc/modules/tcplife.python b/src/pmdas/bcc/modules/tcplife.python
index 0c6f17c36..02c693a6a 100644
--- a/src/pmdas/bcc/modules/tcplife.python
+++ b/src/pmdas/bcc/modules/tcplife.python
@@ -37,16 +37,11 @@ from modules.pcpbcc import PCPBCCBase
#
# BPF program
#
-bpf_src = "modules/tcplife.bpf"
-# Compat with kernel < 4.16, bcc < 0.6
-TRACEFS = "/sys/kernel/debug/tracing"
-bpf_src_old_tp = "modules/tcplife_old_tp.bpf"
-bpf_src_old_kb = "modules/tcplife_old_kb.bpf"
-if not path.exists(TRACEFS + "/events/sock/inet_sock_set_state"):
- if path.exists(TRACEFS + "/events/tcp/tcp_set_state"):
- bpf_src = bpf_src_old_tp
- else:
- bpf_src = bpf_src_old_kb
+if BPF.tracepoint_exists("sock", "inet_sock_set_state"):
+ bpf_src = "modules/tcplife_tp.bpf"
+else:
+ bpf_src = "modules/tcplife_kp.bpf"
+
#
# PCP BCC PMDA constants
@@ -57,35 +52,6 @@ units_bytes = pmUnits(1, 0, 0, PM_SPACE_BYTE, 0, 0)
units_usecs = pmUnits(0, 1, 0, 0, PM_TIME_USEC, 0)
units_none = pmUnits(0, 0, 0, 0, 0, 0)
-TASK_COMM_LEN = 16 # linux/sched.h
-
-class Data_ipv4(ct.Structure):
- """ IPv4 data struct """
- _fields_ = [
- ("ts_us", ct.c_ulonglong),
- ("pid", ct.c_ulonglong),
- ("saddr", ct.c_ulonglong),
- ("daddr", ct.c_ulonglong),
- ("ports", ct.c_ulonglong),
- ("rx_b", ct.c_ulonglong),
- ("tx_b", ct.c_ulonglong),
- ("span_us", ct.c_ulonglong),
- ("task", ct.c_char * TASK_COMM_LEN)
- ]
-
-class Data_ipv6(ct.Structure):
- """ IPv6 data struct """
- _fields_ = [
- ("ts_us", ct.c_ulonglong),
- ("pid", ct.c_ulonglong),
- ("saddr", (ct.c_ulonglong * 2)),
- ("daddr", (ct.c_ulonglong * 2)),
- ("ports", ct.c_ulonglong),
- ("rx_b", ct.c_ulonglong),
- ("tx_b", ct.c_ulonglong),
- ("span_us", ct.c_ulonglong),
- ("task", ct.c_char * TASK_COMM_LEN)
- ]
#
# PCP BCC Module
@@ -129,24 +95,22 @@ class PCPBCCModule(PCPBCCBase):
self.lock = Lock()
self.thread = None
- # Compat with kernel < 4.16
self.log("Using BPF source file %s." % bpf_src)
# Exit hard if impossible to continue
- if self.bcc_version() == "0.6.1" and bpf_src == bpf_src_old_kb:
- raise RuntimeError("BCC 0.6.1 bug makes it incompatible with this module "
- "on kernel < 4.15.")
+ if self.bcc_version_tuple() < (0, 6, 1):
+ raise RuntimeError("BCC 0.6.1+ is required for this module.")
self.log("Initialized.")
def handle_ip_event(self, data, version):
""" IP event handler """
if version == 4:
- event = ct.cast(data, ct.POINTER(Data_ipv4)).contents
+ event = self.bpf["ipv4_events"].event(data)
laddr = inet_ntop(AF_INET, pack("I", event.saddr))
daddr = inet_ntop(AF_INET, pack("I", event.daddr))
else:
- event = ct.cast(data, ct.POINTER(Data_ipv6)).contents
+ event = self.bpf["ipv6_events"].event(data)
laddr = inet_ntop(AF_INET6, event.saddr)
daddr = inet_ntop(AF_INET6, event.daddr)
@@ -205,31 +169,25 @@ class PCPBCCModule(PCPBCCBase):
if not self.bpf_text:
with open(path.dirname(__file__) + '/../' + bpf_src) as src:
self.bpf_text = src.read()
- # Compat with bcc < 0.6
- self.log("Testing BCC compatilibility, possible errors below are safe to ignore.")
- try:
- test_txt = self.bpf_text.replace("// NEW: ", "").replace("FILTER_PID", "")
- test_bpf = BPF(text=test_txt)
- test_bpf.cleanup()
- self.bpf_text = self.bpf_text.replace("// NEW: ", "")
- except Exception: # pylint: disable=broad-except
- self.bpf_text = self.bpf_text.replace("// OLD: ", "")
- self.log("Tested BCC compatilibility, possible errors above are safe to ignore.")
if self.dports:
filterp = " && ".join(["dport != %d" % port for port in self.dports])
filter_txt = "if (%s) { birth.delete(&sk); return 0; }" % filterp
- self.bpf_text = self.bpf_text.replace("//FILTER_DPORT", filter_txt)
+ self.bpf_text = self.bpf_text.replace("FILTER_DPORT", filter_txt)
if self.lports:
filterp = " && ".join(["lport != %d" % port for port in self.lports])
filter_txt = "if (%s) { birth.delete(&sk); return 0; }" % filterp
- self.bpf_text = self.bpf_text.replace("//FILTER_LPORT", filter_txt)
+ self.bpf_text = self.bpf_text.replace("FILTER_LPORT", filter_txt)
if not self.pids and self.proc_filter and self.proc_refresh:
self.log("No process to attach found, activation postponed.")
return
bpf_text = self.apply_pid_filter(self.bpf_text, self.pids, False)
+ bpf_text = bpf_text.replace('FILTER_PID', '')
+ bpf_text = bpf_text.replace('FILTER_DPORT', '')
+ bpf_text = bpf_text.replace('FILTER_LPORT', '')
+ bpf_text = bpf_text.replace('FILTER_FAMILY', '')
if self.debug:
self.log("BPF to be compiled:\n" + bpf_text.strip())
diff --git a/src/pmdas/bcc/modules/tcplife_old_kb.bpf b/src/pmdas/bcc/modules/tcplife_kp.bpf
similarity index 81%
rename from src/pmdas/bcc/modules/tcplife_old_kb.bpf
rename to src/pmdas/bcc/modules/tcplife_kp.bpf
index eed01941a..5486c6a37 100644
--- a/src/pmdas/bcc/modules/tcplife_old_kb.bpf
+++ b/src/pmdas/bcc/modules/tcplife_kp.bpf
@@ -2,7 +2,6 @@
// Licensed under the Apache License, Version 2.0 (the "License")
#include <uapi/linux/ptrace.h>
-#define KBUILD_MODNAME "pcpbcctcplife"
#include <linux/tcp.h>
#include <net/sock.h>
#include <bcc/proto.h>
@@ -11,11 +10,10 @@ BPF_HASH(birth, struct sock *, u64);
// separate data structs for ipv4 and ipv6
struct ipv4_data_t {
- // XXX: switch some to u32's when supported
u64 ts_us;
- u64 pid;
- u64 saddr;
- u64 daddr;
+ u32 pid;
+ u32 saddr;
+ u32 daddr;
u64 ports;
u64 rx_b;
u64 tx_b;
@@ -26,7 +24,7 @@ BPF_PERF_OUTPUT(ipv4_events);
struct ipv6_data_t {
u64 ts_us;
- u64 pid;
+ u32 pid;
unsigned __int128 saddr;
unsigned __int128 daddr;
u64 ports;
@@ -49,12 +47,12 @@ int kprobe__tcp_set_state(struct pt_regs *ctx, struct sock *sk, int state)
// lport is either used in a filter here, or later
u16 lport = sk->__sk_common.skc_num;
- //FILTER_LPORT
+ FILTER_LPORT
// dport is either used in a filter here, or later
u16 dport = sk->__sk_common.skc_dport;
dport = ntohs(dport);
- //FILTER_DPORT
+ FILTER_DPORT
/*
* This tool includes PID and comm context. It's best effort, and may
@@ -74,6 +72,9 @@ int kprobe__tcp_set_state(struct pt_regs *ctx, struct sock *sk, int state)
* sets ESTABLISHED without a tcp_set_state() call. Until we know
* that for sure, match all early states to increase chances a
* timestamp is set.
+ * Note that this needs to be set before the PID filter later on,
+ * since the PID isn't reliable for these early stages, so we must
+ * save all timestamps and do the PID filter later when we can.
*/
u64 ts = bpf_ktime_get_ns();
birth.update(&sk, &ts);
@@ -101,7 +102,7 @@ int kprobe__tcp_set_state(struct pt_regs *ctx, struct sock *sk, int state)
delta_us = (bpf_ktime_get_ns() - *tsp) / 1000;
birth.delete(&sk);
- // fetch possible cached data
+ // fetch possible cached data, and filter
struct id_t *mep;
mep = whoami.lookup(&sk);
if (mep != 0)
@@ -116,9 +117,13 @@ int kprobe__tcp_set_state(struct pt_regs *ctx, struct sock *sk, int state)
u16 family = sk->__sk_common.skc_family;
+ FILTER_FAMILY
+
if (family == AF_INET) {
- struct ipv4_data_t data4 = {.span_us = delta_us,
- .rx_b = rx_b, .tx_b = tx_b};
+ struct ipv4_data_t data4 = {};
+ data4.span_us = delta_us;
+ data4.rx_b = rx_b;
+ data4.tx_b = tx_b;
data4.ts_us = bpf_ktime_get_ns() / 1000;
data4.saddr = sk->__sk_common.skc_rcv_saddr;
data4.daddr = sk->__sk_common.skc_daddr;
@@ -128,17 +133,19 @@ int kprobe__tcp_set_state(struct pt_regs *ctx, struct sock *sk, int state)
if (mep == 0) {
bpf_get_current_comm(&data4.task, sizeof(data4.task));
} else {
- bpf_probe_read(&data4.task, sizeof(data4.task), (void *)mep->task);
+ bpf_probe_read_kernel(&data4.task, sizeof(data4.task), (void *)mep->task);
}
ipv4_events.perf_submit(ctx, &data4, sizeof(data4));
} else /* 6 */ {
- struct ipv6_data_t data6 = {.span_us = delta_us,
- .rx_b = rx_b, .tx_b = tx_b};
+ struct ipv6_data_t data6 = {};
+ data6.span_us = delta_us;
+ data6.rx_b = rx_b;
+ data6.tx_b = tx_b;
data6.ts_us = bpf_ktime_get_ns() / 1000;
- bpf_probe_read(&data6.saddr, sizeof(data6.saddr),
+ bpf_probe_read_kernel(&data6.saddr, sizeof(data6.saddr),
sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
- bpf_probe_read(&data6.daddr, sizeof(data6.daddr),
+ bpf_probe_read_kernel(&data6.daddr, sizeof(data6.daddr),
sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32);
// a workaround until data6 compiles with separate lport/dport
data6.ports = dport + ((0ULL + lport) << 32);
@@ -146,7 +153,7 @@ int kprobe__tcp_set_state(struct pt_regs *ctx, struct sock *sk, int state)
if (mep == 0) {
bpf_get_current_comm(&data6.task, sizeof(data6.task));
} else {
- bpf_probe_read(&data6.task, sizeof(data6.task), (void *)mep->task);
+ bpf_probe_read_kernel(&data6.task, sizeof(data6.task), (void *)mep->task);
}
ipv6_events.perf_submit(ctx, &data6, sizeof(data6));
}
diff --git a/src/pmdas/bcc/modules/tcplife_old_tp.bpf b/src/pmdas/bcc/modules/tcplife_old_tp.bpf
deleted file mode 100644
index a7c9c625c..000000000
--- a/src/pmdas/bcc/modules/tcplife_old_tp.bpf
+++ /dev/null
@@ -1,166 +0,0 @@
-// Copyright 2016 Netflix, Inc.
-// Licensed under the Apache License, Version 2.0 (the "License")
-
-#include <uapi/linux/ptrace.h>
-#define KBUILD_MODNAME "pcpbcctcplife"
-#include <linux/tcp.h>
-#include <net/sock.h>
-#include <bcc/proto.h>
-
-BPF_HASH(birth, struct sock *, u64);
-
-// separate data structs for ipv4 and ipv6
-struct ipv4_data_t {
- // XXX: switch some to u32's when supported
- u64 ts_us;
- u64 pid;
- u64 saddr;
- u64 daddr;
- u64 ports;
- u64 rx_b;
- u64 tx_b;
- u64 span_us;
- char task[TASK_COMM_LEN];
-};
-BPF_PERF_OUTPUT(ipv4_events);
-
-struct ipv6_data_t {
- u64 ts_us;
- u64 pid;
- unsigned __int128 saddr;
- unsigned __int128 daddr;
- u64 ports;
- u64 rx_b;
- u64 tx_b;
- u64 span_us;
- char task[TASK_COMM_LEN];
-};
-BPF_PERF_OUTPUT(ipv6_events);
-
-struct id_t {
- u32 pid;
- char task[TASK_COMM_LEN];
-};
-BPF_HASH(whoami, struct sock *, struct id_t);
-
-TRACEPOINT_PROBE(tcp, tcp_set_state)
-{
- u32 pid = bpf_get_current_pid_tgid() >> 32;
- // sk is mostly used as a UUID, once for skc_family, and two tcp stats:
- struct sock *sk = (struct sock *)args->skaddr;
-
- // lport is either used in a filter here, or later
- u16 lport = args->sport;
- //FILTER_LPORT
-
- // dport is either used in a filter here, or later
- u16 dport = args->dport;
- //FILTER_DPORT
-
- /*
- * This tool includes PID and comm context. It's best effort, and may
- * be wrong in some situations. It currently works like this:
- * - record timestamp on any state < TCP_FIN_WAIT1
- * - cache task context on:
- * TCP_SYN_SENT: tracing from client
- * TCP_LAST_ACK: client-closed from server
- * - do output on TCP_CLOSE:
- * fetch task context if cached, or use current task
- */
-
- // capture birth time
- if (args->newstate < TCP_FIN_WAIT1) {
- /*
- * Matching just ESTABLISHED may be sufficient, provided no code-path
- * sets ESTABLISHED without a tcp_set_state() call. Until we know
- * that for sure, match all early states to increase chances a
- * timestamp is set.
- * Note that this needs to be set before the PID filter later on,
- * since the PID isn't reliable for these early stages, so we must
- * save all timestamps and do the PID filter later when we can.
- */
- u64 ts = bpf_ktime_get_ns();
- birth.update(&sk, &ts);
- }
-
- // record PID & comm on SYN_SENT
- if (args->newstate == TCP_SYN_SENT || args->newstate == TCP_LAST_ACK) {
- // now we can PID filter, both here and a little later on for CLOSE
- FILTER_PID
- struct id_t me = {.pid = pid};
- bpf_get_current_comm(&me.task, sizeof(me.task));
- whoami.update(&sk, &me);
- }
-
- if (args->newstate != TCP_CLOSE)
- return 0;
-
- // calculate lifespan
- u64 *tsp, delta_us;
- tsp = birth.lookup(&sk);
- if (tsp == 0) {
- whoami.delete(&sk); // may not exist
- return 0; // missed create
- }
- delta_us = (bpf_ktime_get_ns() - *tsp) / 1000;
- birth.delete(&sk);
-
- // fetch possible cached data, and filter
- struct id_t *mep;
- mep = whoami.lookup(&sk);
- if (mep != 0)
- pid = mep->pid;
- FILTER_PID
-
- // get throughput stats. see tcp_get_info().
- u64 rx_b = 0, tx_b = 0, sport = 0;
- struct tcp_sock *tp = (struct tcp_sock *)sk;
- // OLD: bpf_probe_read(&rx_b, sizeof(rx_b), &tp->bytes_received);
- // OLD: bpf_probe_read(&tx_b, sizeof(tx_b), &tp->bytes_acked);
- // NEW: rx_b = tp->bytes_received;
- // NEW: tx_b = tp->bytes_acked;
-
- u16 family = 0;
- // OLD: bpf_probe_read(&family, sizeof(family), &sk->__sk_common.skc_family);
- // NEW: family = sk->__sk_common.skc_family;
-
- if (family == AF_INET) {
-
- struct ipv4_data_t data4 = {.span_us = delta_us,
- .rx_b = rx_b, .tx_b = tx_b};
- data4.ts_us = bpf_ktime_get_ns() / 1000;
- bpf_probe_read(&data4.saddr, sizeof(u32), args->saddr);
- bpf_probe_read(&data4.daddr, sizeof(u32), args->daddr);
- // a workaround until data4 compiles with separate lport/dport
- data4.ports = dport + ((0ULL + lport) << 32);
- data4.pid = pid;
-
- if (mep == 0) {
- bpf_get_current_comm(&data4.task, sizeof(data4.task));
- } else {
- bpf_probe_read(&data4.task, sizeof(data4.task), (void *)mep->task);
- }
- ipv4_events.perf_submit(args, &data4, sizeof(data4));
-
- } else /* 6 */ {
- struct ipv6_data_t data6 = {.span_us = delta_us,
- .rx_b = rx_b, .tx_b = tx_b};
- data6.ts_us = bpf_ktime_get_ns() / 1000;
- bpf_probe_read(&data6.saddr, sizeof(data6.saddr), args->saddr_v6);
- bpf_probe_read(&data6.daddr, sizeof(data6.daddr), args->saddr_v6);
- // a workaround until data6 compiles with separate lport/dport
- data6.ports = dport + ((0ULL + lport) << 32);
- data6.pid = pid;
- if (mep == 0) {
- bpf_get_current_comm(&data6.task, sizeof(data6.task));
- } else {
- bpf_probe_read(&data6.task, sizeof(data6.task), (void *)mep->task);
- }
- ipv6_events.perf_submit(args, &data6, sizeof(data6));
- }
-
- if (mep != 0)
- whoami.delete(&sk);
-
- return 0;
-}
diff --git a/src/pmdas/bcc/modules/tcplife.bpf b/src/pmdas/bcc/modules/tcplife_tp.bpf
similarity index 80%
rename from src/pmdas/bcc/modules/tcplife.bpf
rename to src/pmdas/bcc/modules/tcplife_tp.bpf
index 19ca8d740..2b16b98e7 100644
--- a/src/pmdas/bcc/modules/tcplife.bpf
+++ b/src/pmdas/bcc/modules/tcplife_tp.bpf
@@ -2,7 +2,6 @@
// Licensed under the Apache License, Version 2.0 (the "License")
#include <uapi/linux/ptrace.h>
-#define KBUILD_MODNAME "pcpbcctcplife"
#include <linux/tcp.h>
#include <net/sock.h>
#include <bcc/proto.h>
@@ -11,11 +10,10 @@ BPF_HASH(birth, struct sock *, u64);
// separate data structs for ipv4 and ipv6
struct ipv4_data_t {
- // XXX: switch some to u32's when supported
u64 ts_us;
- u64 pid;
- u64 saddr;
- u64 daddr;
+ u32 pid;
+ u32 saddr;
+ u32 daddr;
u64 ports;
u64 rx_b;
u64 tx_b;
@@ -26,7 +24,7 @@ BPF_PERF_OUTPUT(ipv4_events);
struct ipv6_data_t {
u64 ts_us;
- u64 pid;
+ u32 pid;
unsigned __int128 saddr;
unsigned __int128 daddr;
u64 ports;
@@ -54,11 +52,11 @@ TRACEPOINT_PROBE(sock, inet_sock_set_state)
// lport is either used in a filter here, or later
u16 lport = args->sport;
- //FILTER_LPORT
+ FILTER_LPORT
// dport is either used in a filter here, or later
u16 dport = args->dport;
- //FILTER_DPORT
+ FILTER_DPORT
/*
* This tool includes PID and comm context. It's best effort, and may
@@ -115,20 +113,23 @@ TRACEPOINT_PROBE(sock, inet_sock_set_state)
pid = mep->pid;
FILTER_PID
+ u16 family = args->family;
+ FILTER_FAMILY
+
// get throughput stats. see tcp_get_info().
u64 rx_b = 0, tx_b = 0, sport = 0;
struct tcp_sock *tp = (struct tcp_sock *)sk;
- // OLD: bpf_probe_read(&rx_b, sizeof(rx_b), &tp->bytes_received);
- // OLD: bpf_probe_read(&tx_b, sizeof(tx_b), &tp->bytes_acked);
- // NEW: rx_b = tp->bytes_received;
- // NEW: tx_b = tp->bytes_acked;
+ rx_b = tp->bytes_received;
+ tx_b = tp->bytes_acked;
if (args->family == AF_INET) {
- struct ipv4_data_t data4 = {.span_us = delta_us,
- .rx_b = rx_b, .tx_b = tx_b};
+ struct ipv4_data_t data4 = {};
+ data4.span_us = delta_us;
+ data4.rx_b = rx_b;
+ data4.tx_b = tx_b;
data4.ts_us = bpf_ktime_get_ns() / 1000;
- bpf_probe_read(&data4.saddr, sizeof(u32), args->saddr);
- bpf_probe_read(&data4.daddr, sizeof(u32), args->daddr);
+ __builtin_memcpy(&data4.saddr, args->saddr, sizeof(data4.saddr));
+ __builtin_memcpy(&data4.daddr, args->daddr, sizeof(data4.daddr));
// a workaround until data4 compiles with separate lport/dport
data4.ports = dport + ((0ULL + lport) << 32);
data4.pid = pid;
@@ -136,23 +137,25 @@ TRACEPOINT_PROBE(sock, inet_sock_set_state)
if (mep == 0) {
bpf_get_current_comm(&data4.task, sizeof(data4.task));
} else {
- bpf_probe_read(&data4.task, sizeof(data4.task), (void *)mep->task);
+ bpf_probe_read_kernel(&data4.task, sizeof(data4.task), (void *)mep->task);
}
ipv4_events.perf_submit(args, &data4, sizeof(data4));
} else /* 6 */ {
- struct ipv6_data_t data6 = {.span_us = delta_us,
- .rx_b = rx_b, .tx_b = tx_b};
+ struct ipv6_data_t data6 = {};
+ data6.span_us = delta_us;
+ data6.rx_b = rx_b;
+ data6.tx_b = tx_b;
data6.ts_us = bpf_ktime_get_ns() / 1000;
- bpf_probe_read(&data6.saddr, sizeof(data6.saddr), args->saddr_v6);
- bpf_probe_read(&data6.daddr, sizeof(data6.daddr), args->saddr_v6);
+ __builtin_memcpy(&data6.saddr, args->saddr_v6, sizeof(data6.saddr));
+ __builtin_memcpy(&data6.daddr, args->daddr_v6, sizeof(data6.daddr));
// a workaround until data6 compiles with separate lport/dport
data6.ports = dport + ((0ULL + lport) << 32);
data6.pid = pid;
if (mep == 0) {
bpf_get_current_comm(&data6.task, sizeof(data6.task));
} else {
- bpf_probe_read(&data6.task, sizeof(data6.task), (void *)mep->task);
+ bpf_probe_read_kernel(&data6.task, sizeof(data6.task), (void *)mep->task);
}
ipv6_events.perf_submit(args, &data6, sizeof(data6));
}
diff --git a/src/pmdas/bcc/modules/tcpperpid.python b/src/pmdas/bcc/modules/tcpperpid.python
index 3cb2cfcfd..0096929a6 100644
--- a/src/pmdas/bcc/modules/tcpperpid.python
+++ b/src/pmdas/bcc/modules/tcpperpid.python
@@ -32,16 +32,10 @@ from modules.pcpbcc import PCPBCCBase
#
# BPF program
#
-bpf_src = "modules/tcplife.bpf"
-# Compat with kernel < 4.16, bcc < 0.6
-TRACEFS = "/sys/kernel/debug/tracing"
-bpf_src_old_tp = "modules/tcplife_old_tp.bpf"
-bpf_src_old_kb = "modules/tcplife_old_kb.bpf"
-if not path.exists(TRACEFS + "/events/sock/inet_sock_set_state"):
- if path.exists(TRACEFS + "/events/tcp/tcp_set_state"):
- bpf_src = bpf_src_old_tp
- else:
- bpf_src = bpf_src_old_kb
+if BPF.tracepoint_exists("sock", "inet_sock_set_state"):
+ bpf_src = "modules/tcplife_tp.bpf"
+else:
+ bpf_src = "modules/tcplife_kp.bpf"
# Alternative, "high resolution" BPF
bpf_highres = "modules/tcptop.bpf"
@@ -53,36 +47,6 @@ MODULE = 'tcpperpid'
BASENS = 'proc.io.net.total.'
units_bytes = pmUnits(1, 0, 0, PM_SPACE_BYTE, 0, 0)
-TASK_COMM_LEN = 16 # linux/sched.h
-
-class Data_ipv4(ct.Structure):
- """ IPv4 data struct """
- _fields_ = [
- ("ts_us", ct.c_ulonglong),
- ("pid", ct.c_ulonglong),
- ("saddr", ct.c_ulonglong),
- ("daddr", ct.c_ulonglong),
- ("ports", ct.c_ulonglong),
- ("rx_b", ct.c_ulonglong),
- ("tx_b", ct.c_ulonglong),
- ("span_us", ct.c_ulonglong),
- ("task", ct.c_char * TASK_COMM_LEN)
- ]
-
-class Data_ipv6(ct.Structure):
- """ IPv6 data struct """
- _fields_ = [
- ("ts_us", ct.c_ulonglong),
- ("pid", ct.c_ulonglong),
- ("saddr", (ct.c_ulonglong * 2)),
- ("daddr", (ct.c_ulonglong * 2)),
- ("ports", ct.c_ulonglong),
- ("rx_b", ct.c_ulonglong),
- ("tx_b", ct.c_ulonglong),
- ("span_us", ct.c_ulonglong),
- ("task", ct.c_char * TASK_COMM_LEN)
- ]
-
#
# PCP BCC Module
#
@@ -133,15 +97,14 @@ class PCPBCCModule(PCPBCCBase):
self.log("Using BPF source file %s." % src)
# Exit hard if impossible to continue
- if self.bcc_version() == "0.6.1" and src == bpf_src_old_kb and not self.highres:
- raise RuntimeError("BCC 0.6.1 bug makes it incompatible with this module "
- "on kernel < 4.15 in non-highres mode.")
+ if self.bcc_version_tuple() < (0, 6, 1) and not self.highres:
+ raise RuntimeError("BCC 0.6.1+ is required for this module in non-highres mode.")
self.log("Initialized.")
def handle_ipv4_event(self, _cpu, data, _size):
""" IPv4 event handler """
- event = ct.cast(data, ct.POINTER(Data_ipv4)).contents
+ event = self.bpf["ipv4_events"].event(data)
pid = str(event.pid).zfill(6)
self.lock.acquire()
if pid not in self.ipv4_stats:
@@ -153,7 +116,7 @@ class PCPBCCModule(PCPBCCBase):
def handle_ipv6_event(self, _cpu, data, _size):
""" IPv6 event handler """
- event = ct.cast(data, ct.POINTER(Data_ipv6)).contents
+ event = self.bpf["ipv6_events"].event(data)
pid = str(event.pid).zfill(6)
self.lock.acquire()
if pid not in self.ipv6_stats:
@@ -199,31 +162,25 @@ class PCPBCCModule(PCPBCCBase):
self.bpf_text = src.read()
- if self.highres:
- self.bpf_text = self.bpf_text.replace("FILTER", "FILTER_PID")
- # Compat with bcc < 0.6
- self.log("Testing BCC compatilibility, possible errors below are safe to ignore.")
- try:
- test_txt = self.bpf_text.replace("// NEW: ", "").replace("FILTER_PID", "")
- test_bpf = BPF(text=test_txt)
- test_bpf.cleanup()
- self.bpf_text = self.bpf_text.replace("// NEW: ", "")
- except Exception: # pylint: disable=broad-except
- self.bpf_text = self.bpf_text.replace("// OLD: ", "")
- self.log("Tested BCC compatilibility, possible errors above are safe to ignore.")
if self.dports:
filterp = " && ".join(["dport != %d" % port for port in self.dports])
filter_txt = "if (%s) { birth.delete(&sk); return 0; }" % filterp
- self.bpf_text = self.bpf_text.replace("//FILTER_DPORT", filter_txt)
+ self.bpf_text = self.bpf_text.replace("FILTER_DPORT", filter_txt)
if self.lports:
filterp = " && ".join(["lport != %d" % port for port in self.lports])
filter_txt = "if (%s) { birth.delete(&sk); return 0; }" % filterp
- self.bpf_text = self.bpf_text.replace("//FILTER_LPORT", filter_txt)
+ self.bpf_text = self.bpf_text.replace("FILTER_LPORT", filter_txt)
if not self.pids and self.proc_filter and self.proc_refresh:
self.log("No process to attach found, activation postponed.")
return
bpf_text = self.apply_pid_filter(self.bpf_text, self.pids, False)
bpf_text = bpf_text.replace('FILTER_PID', '')
bpf_text = bpf_text.replace('FILTER_DPORT', '')
bpf_text = bpf_text.replace('FILTER_LPORT', '')
bpf_text = bpf_text.replace('FILTER_FAMILY', '')
+ # highres mode compiles tcptop.bpf, which calls container_should_be_filtered()
+ bpf_text = bpf_text.replace('container_should_be_filtered()', '0')
if self.debug:
self.log("BPF to be compiled:\n" + bpf_text.strip())
diff --git a/src/pmdas/bcc/modules/tcptop.bpf b/src/pmdas/bcc/modules/tcptop.bpf
index 349ee1529..c1fed7aef 100644
--- a/src/pmdas/bcc/modules/tcptop.bpf
+++ b/src/pmdas/bcc/modules/tcptop.bpf
@@ -4,6 +4,7 @@
#include <uapi/linux/ptrace.h>
#include <net/sock.h>
#include <bcc/proto.h>
+
struct ipv4_key_t {
u32 pid;
u32 saddr;
@@ -13,25 +14,32 @@ struct ipv4_key_t {
};
BPF_HASH(ipv4_send_bytes, struct ipv4_key_t);
BPF_HASH(ipv4_recv_bytes, struct ipv4_key_t);
+
struct ipv6_key_t {
+ unsigned __int128 saddr;
+ unsigned __int128 daddr;
u32 pid;
- // workaround until unsigned __int128 support:
- u64 saddr0;
- u64 saddr1;
- u64 daddr0;
- u64 daddr1;
u16 lport;
u16 dport;
+ u64 __pad__;
};
BPF_HASH(ipv6_send_bytes, struct ipv6_key_t);
BPF_HASH(ipv6_recv_bytes, struct ipv6_key_t);
+
int kprobe__tcp_sendmsg(struct pt_regs *ctx, struct sock *sk,
struct msghdr *msg, size_t size)
{
- u32 pid = bpf_get_current_pid_tgid();
- FILTER
+ if (container_should_be_filtered()) {
+ return 0;
+ }
+
+ u32 pid = bpf_get_current_pid_tgid() >> 32;
+ FILTER_PID
+
u16 dport = 0, family = sk->__sk_common.skc_family;
- u64 *val, zero = 0;
+
+ FILTER_FAMILY
+
if (family == AF_INET) {
struct ipv4_key_t ipv4_key = {.pid = pid};
ipv4_key.saddr = sk->__sk_common.skc_rcv_saddr;
@@ -39,31 +47,24 @@ int kprobe__tcp_sendmsg(struct pt_regs *ctx, struct sock *sk,
ipv4_key.lport = sk->__sk_common.skc_num;
dport = sk->__sk_common.skc_dport;
ipv4_key.dport = ntohs(dport);
- val = ipv4_send_bytes.lookup_or_init(&ipv4_key, &zero);
- if (val) {
- (*val) += size;
- }
+ ipv4_send_bytes.increment(ipv4_key, size);
+
} else if (family == AF_INET6) {
struct ipv6_key_t ipv6_key = {.pid = pid};
- bpf_probe_read(&ipv6_key.saddr0, sizeof(ipv6_key.saddr0),
- &sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32[0]);
- bpf_probe_read(&ipv6_key.saddr1, sizeof(ipv6_key.saddr1),
- &sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32[2]);
- bpf_probe_read(&ipv6_key.daddr0, sizeof(ipv6_key.daddr0),
- &sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32[0]);
- bpf_probe_read(&ipv6_key.daddr1, sizeof(ipv6_key.daddr1),
- &sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32[2]);
+ bpf_probe_read_kernel(&ipv6_key.saddr, sizeof(ipv6_key.saddr),
+ &sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
+ bpf_probe_read_kernel(&ipv6_key.daddr, sizeof(ipv6_key.daddr),
+ &sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32);
ipv6_key.lport = sk->__sk_common.skc_num;
dport = sk->__sk_common.skc_dport;
ipv6_key.dport = ntohs(dport);
- val = ipv6_send_bytes.lookup_or_init(&ipv6_key, &zero);
- if (val) {
- (*val) += size;
- }
+ ipv6_send_bytes.increment(ipv6_key, size);
}
// else drop
+
return 0;
}
+
/*
* tcp_recvmsg() would be obvious to trace, but is less suitable because:
* - we'd need to trace both entry and return, to have both sock and size
@@ -72,12 +73,21 @@ int kprobe__tcp_sendmsg(struct pt_regs *ctx, struct sock *sk,
*/
int kprobe__tcp_cleanup_rbuf(struct pt_regs *ctx, struct sock *sk, int copied)
{
- u32 pid = bpf_get_current_pid_tgid();
- FILTER
+ if (container_should_be_filtered()) {
+ return 0;
+ }
+
+ u32 pid = bpf_get_current_pid_tgid() >> 32;
+ FILTER_PID
+
u16 dport = 0, family = sk->__sk_common.skc_family;
u64 *val, zero = 0;
+
if (copied <= 0)
return 0;
+
+ FILTER_FAMILY
+
if (family == AF_INET) {
struct ipv4_key_t ipv4_key = {.pid = pid};
ipv4_key.saddr = sk->__sk_common.skc_rcv_saddr;
@@ -85,28 +95,20 @@ int kprobe__tcp_cleanup_rbuf(struct pt_regs *ctx, struct sock *sk, int copied)
ipv4_key.lport = sk->__sk_common.skc_num;
dport = sk->__sk_common.skc_dport;
ipv4_key.dport = ntohs(dport);
- val = ipv4_recv_bytes.lookup_or_init(&ipv4_key, &zero);
- if (val) {
- (*val) += copied;
- }
+ ipv4_recv_bytes.increment(ipv4_key, copied);
+
} else if (family == AF_INET6) {
struct ipv6_key_t ipv6_key = {.pid = pid};
- bpf_probe_read(&ipv6_key.saddr0, sizeof(ipv6_key.saddr0),
- &sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32[0]);
- bpf_probe_read(&ipv6_key.saddr1, sizeof(ipv6_key.saddr1),
- &sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32[2]);
- bpf_probe_read(&ipv6_key.daddr0, sizeof(ipv6_key.daddr0),
- &sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32[0]);
- bpf_probe_read(&ipv6_key.daddr1, sizeof(ipv6_key.daddr1),
- &sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32[2]);
+ bpf_probe_read_kernel(&ipv6_key.saddr, sizeof(ipv6_key.saddr),
+ &sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
+ bpf_probe_read_kernel(&ipv6_key.daddr, sizeof(ipv6_key.daddr),
+ &sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32);
ipv6_key.lport = sk->__sk_common.skc_num;
dport = sk->__sk_common.skc_dport;
ipv6_key.dport = ntohs(dport);
- val = ipv6_recv_bytes.lookup_or_init(&ipv6_key, &zero);
- if (val) {
- (*val) += copied;
- }
+ ipv6_recv_bytes.increment(ipv6_key, copied);
}
// else drop
+
return 0;
}
diff --git a/src/pmdas/bcc/modules/tcptop.python b/src/pmdas/bcc/modules/tcptop.python
index 45063dff3..db1c1da15 100644
--- a/src/pmdas/bcc/modules/tcptop.python
+++ b/src/pmdas/bcc/modules/tcptop.python
@@ -120,13 +120,14 @@ class PCPBCCModule(PCPBCCBase):
with open(path.dirname(__file__) + '/../' + bpf_src) as src:
self.bpf_text = src.read()
- self.bpf_text = self.bpf_text.replace("FILTER", "FILTER_PID")
-
if not self.pids and self.proc_filter and self.proc_refresh:
self.log("No process to attach found, activation postponed.")
return
bpf_text = self.apply_pid_filter(self.bpf_text, self.pids, False)
+ bpf_text = bpf_text.replace('FILTER_PID', '')
+ bpf_text = bpf_text.replace('FILTER_FAMILY', '')
+ bpf_text = bpf_text.replace('container_should_be_filtered()', '0')
if self.debug:
self.log("BPF to be compiled:\n" + bpf_text.strip())
@@ -155,21 +156,31 @@ class PCPBCCModule(PCPBCCBase):
@staticmethod
def ipv4_table_to_dict(table):
- """ Build hashable dict from IPv4 BPF table """
- return {TCPSessionKey(pid=k.pid,
- laddr=inet_ntop(AF_INET, pack("I", k.saddr)),
- lport=k.lport,
- daddr=inet_ntop(AF_INET, pack("I", k.daddr)),
- dport=k.dport):v.value for k, v in table.items()}
+ """Build hashable dict from IPv4 BPF table"""
+ return {
+ TCPSessionKey(
+ pid=k.pid,
+ laddr=inet_ntop(AF_INET, pack("I", k.saddr)),
+ lport=k.lport,
+ daddr=inet_ntop(AF_INET, pack("I", k.daddr)),
+ dport=k.dport,
+ ): v.value
+ for k, v in table.items()
+ }
@staticmethod
def ipv6_table_to_dict(table):
- """ Build hashable dict from IPv6 BPF table """
- return {TCPSessionKey(pid=k.pid,
- laddr=inet_ntop(AF_INET6, pack("QQ", k.saddr0, k.saddr1)),
- lport=k.lport,
- daddr=inet_ntop(AF_INET6, pack("QQ", k.daddr0, k.daddr1)),
- dport=k.dport):v.value for k, v in table.items()}
+ """Build hashable dict from IPv6 BPF table"""
+ return {
+ TCPSessionKey(
+ pid=k.pid,
+ laddr=inet_ntop(AF_INET6, k.saddr),
+ lport=k.lport,
+ daddr=inet_ntop(AF_INET6, k.daddr),
+ dport=k.dport,
+ ): v.value
+ for k, v in table.items()
+ }
def refresh_stats(self):
""" Refresh statistics from BPF table """
commit d45ce8e85035cc95ba897cd19967fad6d5d741be (cherry-picked)
Author: Andreas Gerstmayr <agerstmayr@redhat.com>
Date: Wed Dec 15 08:03:40 2021 +0100
qa: update qa/1118 to add new log output of runqlat bcc module
diff --git a/qa/1118 b/qa/1118
index 4123495b5..bcaec0a0d 100755
--- a/qa/1118
+++ b/qa/1118
@@ -21,12 +21,19 @@ _label_filter()
grep '"0-1"' | grep '"statistic":"histogram"' | grep '"lower_bound":0' | grep 'upper_bound":1' > /dev/null && echo 'OK'
}
+_install_filter()
+{
+ sed \
+ -e "s/Using BPF source file .\+/Using BPF source file X/g" \
+ #end
+}
+
_prepare_pmda bcc
trap "_pmdabcc_cleanup; exit \$status" 0 1 2 3 15
_stop_auto_restart pmcd
# real QA test starts here
-cat <<EOF | _pmdabcc_install
+cat <<EOF | _pmdabcc_install | _install_filter
# Installed by PCP QA test $seq on `date`
[pmda]
modules = runqlat
diff --git a/qa/1118.out b/qa/1118.out
index 16a9fa4b4..e74f97964 100644
--- a/qa/1118.out
+++ b/qa/1118.out
@@ -9,6 +9,7 @@ Info: runqlat
Info: Modules configured.
Info: Initializing modules:
Info: runqlat
+Info: runqlat: Using BPF source file X
Info: runqlat: Initialized.
Info: Modules initialized.
Info: Registering metrics:
@@ -25,6 +26,7 @@ Info: runqlat
Info: Modules configured.
Info: Initializing modules:
Info: runqlat
+Info: runqlat: Using BPF source file X
Info: runqlat: Initialized.
Info: Modules initialized.
Info: Registering metrics: