commit 14ffcd934e1c5099b471f4e73da32d1b32bac7e6 Author: Andreas Gerstmayr Date: Mon Dec 13 20:10:40 2021 +0100 pmdabcc: sync bcc PMDA modules with upstream bcc tools diff --git a/src/pmdas/bcc/modules/execsnoop.bpf b/src/pmdas/bcc/modules/execsnoop.bpf index f69200773..aa755b3a1 100644 --- a/src/pmdas/bcc/modules/execsnoop.bpf +++ b/src/pmdas/bcc/modules/execsnoop.bpf @@ -4,40 +4,57 @@ #include #include #include + #define ARGSIZE 128 + enum event_type { EVENT_ARG, EVENT_RET, }; + struct data_t { u32 pid; // PID as in the userspace term (i.e. task->tgid in kernel) u32 ppid; // Parent PID as in the userspace term (i.e task->real_parent->tgid in kernel) + u32 uid; char comm[TASK_COMM_LEN]; enum event_type type; char argv[ARGSIZE]; int retval; }; + BPF_PERF_OUTPUT(events); + static int __submit_arg(struct pt_regs *ctx, void *ptr, struct data_t *data) { - bpf_probe_read(data->argv, sizeof(data->argv), ptr); + bpf_probe_read_user(data->argv, sizeof(data->argv), ptr); events.perf_submit(ctx, data, sizeof(struct data_t)); return 1; } + static int submit_arg(struct pt_regs *ctx, void *ptr, struct data_t *data) { const char *argp = NULL; - bpf_probe_read(&argp, sizeof(argp), ptr); + bpf_probe_read_user(&argp, sizeof(argp), ptr); if (argp) { return __submit_arg(ctx, (void *)(argp), data); } return 0; } + int syscall__execve(struct pt_regs *ctx, const char __user *filename, const char __user *const __user *__argv, const char __user *const __user *__envp) { + + u32 uid = bpf_get_current_uid_gid() & 0xffffffff; + + UID_FILTER + + if (container_should_be_filtered()) { + return 0; + } + // create data here and pass to submit_arg to save stack space (#555) struct data_t data = {}; struct task_struct *task; @@ -52,25 +69,37 @@ int syscall__execve(struct pt_regs *ctx, bpf_get_current_comm(&data.comm, sizeof(data.comm)); data.type = EVENT_ARG; + __submit_arg(ctx, (void *)filename, &data); + // skip first arg, as we submitted filename #pragma unroll for (int i = 1; i < MAXARG; i++) { if (submit_arg(ctx, (void *)&__argv[i], &data) == 0) goto out; } + // handle truncated argument list char ellipsis[] = "..."; __submit_arg(ctx, (void *)ellipsis, &data); out: return 0; } + int do_ret_sys_execve(struct pt_regs *ctx) { + if (container_should_be_filtered()) { + return 0; + } + struct data_t data = {}; struct task_struct *task; + u32 uid = bpf_get_current_uid_gid() & 0xffffffff; + UID_FILTER + data.pid = bpf_get_current_pid_tgid() >> 32; + data.uid = uid; task = (struct task_struct *)bpf_get_current_task(); // Some kernels, like Ubuntu 4.13.0-generic, return 0 @@ -82,5 +111,6 @@ int do_ret_sys_execve(struct pt_regs *ctx) data.type = EVENT_RET; data.retval = PT_REGS_RC(ctx); events.perf_submit(ctx, &data, sizeof(data)); + return 0; } diff --git a/src/pmdas/bcc/modules/execsnoop.python b/src/pmdas/bcc/modules/execsnoop.python index 54382fa9b..1127cc471 100644 --- a/src/pmdas/bcc/modules/execsnoop.python +++ b/src/pmdas/bcc/modules/execsnoop.python @@ -44,20 +44,6 @@ MODULE = 'execsnoop' BASENS = 'proc.exec.' units_none = pmUnits(0, 0, 0, 0, 0, 0) -TASK_COMM_LEN = 16 # linux/sched.h -ARGSIZE = 128 # should match #define in execsnoop.bpf - -class Data(ct.Structure): - """ execsnoop data struct """ - _fields_ = [ - ("pid", ct.c_uint), - ("ppid", ct.c_uint), - ("comm", ct.c_char * TASK_COMM_LEN), - ("type", ct.c_int), - ("argv", ct.c_char * ARGSIZE), - ("retval", ct.c_int), - ] - class EventType(object): """ Event type """ EVENT_ARG = 0 @@ -137,7 +123,7 @@ class PCPBCCModule(PCPBCCBase): def handle_event(self, _cpu, data, _size): """ Event handler """ - event = ct.cast(data, ct.POINTER(Data)).contents + event = self.bpf["events"].event(data) skip = False if event.type == EventType.EVENT_ARG: @@ -145,9 +131,9 @@ class PCPBCCModule(PCPBCCBase): elif event.type == EventType.EVENT_RET: if event.retval != 0 and not self.include_failed: skip = True - if self.command and not re.search(self.command, event.comm): + if self.command and not re.search(bytes(self.command), event.comm): skip = True - if self.args and not re.search(self.args, b" ".join(self.argv_cache[event.pid])): + if self.args and not re.search(bytes(self.args), b" ".join(self.argv_cache[event.pid])): skip = True if not skip: @@ -177,10 +163,14 @@ class PCPBCCModule(PCPBCCBase): self.bpf_text = self.bpf_text.replace("MAXARG", str(self.max_args)) + bpf_text = self.bpf_text + bpf_text = bpf_text.replace('UID_FILTER', '') + bpf_text = bpf_text.replace('container_should_be_filtered()', '0') + if self.debug: - self.log("BPF to be compiled:\n" + self.bpf_text.strip()) + self.log("BPF to be compiled:\n" + bpf_text.strip()) - self.bpf = BPF(text=self.bpf_text) + self.bpf = BPF(text=bpf_text) execve_fnname = self.get_syscall_fnname("execve") self.bpf.attach_kprobe(event=execve_fnname, fn_name="syscall__execve") self.bpf.attach_kretprobe(event=execve_fnname, fn_name="do_ret_sys_execve") diff --git a/src/pmdas/bcc/modules/pcpbcc.python b/src/pmdas/bcc/modules/pcpbcc.python index 0555dc33f..62783b7fc 100644 --- a/src/pmdas/bcc/modules/pcpbcc.python +++ b/src/pmdas/bcc/modules/pcpbcc.python @@ -14,6 +14,7 @@ """ PCP BCC PMDA module base class """ import re +import platform import ctypes as ct from os import kill, listdir, path from collections import OrderedDict @@ -348,6 +349,16 @@ class PCPBCCBase(object): """ Returns BCC version as an int tuple (for comparisons) """ return tuple(map(int, PCPBCCBase.bcc_version().split('.'))) + @staticmethod + def kernel_version(): + """Returns the kernel version""" + version_str = platform.release() + m = re.match(r'^(\d+)\.(\d+)\.(\d+)', version_str) + if m: + return tuple(map(int, m.groups())) + else: + return (0, 0, 0) + def perf_buffer_poller(self): """ BPF poller """ try: diff --git a/src/pmdas/bcc/modules/runqlat.python b/src/pmdas/bcc/modules/runqlat.python index 27007c7e5..1c6c6b4b0 100644 --- a/src/pmdas/bcc/modules/runqlat.python +++ b/src/pmdas/bcc/modules/runqlat.python @@ -30,7 +30,11 @@ from modules.pcpbcc import PCPBCCBase # # BPF program # -bpf_src = "modules/runqlat.bpf" +is_support_raw_tp = BPF.support_raw_tracepoint() +if is_support_raw_tp: + bpf_src = "modules/runqlat_tp.bpf" +else: + bpf_src = "modules/runqlat_kp.bpf" # # PCP BCC PMDA constants @@ -59,6 +63,7 @@ class PCPBCCModule(PCPBCCBase): self.proc_filter = self.config.get(MODULE, opt) self.update_pids(self.get_proc_info(self.proc_filter)) + self.log("Using BPF source file %s." % bpf_src) self.log("Initialized.") def metrics(self): @@ -89,7 +94,23 @@ class PCPBCCModule(PCPBCCBase): with open(path.dirname(__file__) + '/../' + bpf_src) as src: self.bpf_text = src.read() + # BPF.kernel_struct_has_field requires BCC v0.23.0 + # use kernel version check as alternative + # pylint: disable=no-member + if ( + hasattr(BPF, "kernel_struct_has_field") + and BPF.kernel_struct_has_field(b"task_struct", b"__state") == 1 + ) or self.kernel_version() >= (5, 14, 0): + self.bpf_text = self.bpf_text.replace('STATE_FIELD', '__state') + else: + self.bpf_text = self.bpf_text.replace('STATE_FIELD', 'state') + self.bpf_text = self.bpf_text.replace("FILTER", "PID_CHECK") + self.bpf_text = self.bpf_text.replace('FACTOR', 'delta /= 1000;') + + self.bpf_text = self.bpf_text.replace('STORAGE', 'BPF_HISTOGRAM(dist);') + self.bpf_text = self.bpf_text.replace('STORE', + 'dist.increment(bpf_log2l(delta));') if not self.pids and self.proc_filter and self.proc_refresh: self.log("No process to attach found, activation postponed.") @@ -102,9 +123,11 @@ class PCPBCCModule(PCPBCCBase): self.reset_cache() self.bpf = BPF(text=bpf_text) - self.bpf.attach_kprobe(event="ttwu_do_wakeup", fn_name="trace_ttwu_do_wakeup") - self.bpf.attach_kprobe(event="wake_up_new_task", fn_name="trace_wake_up_new_task") - self.bpf.attach_kprobe(event_re=r"^finish_task_switch$|^finish_task_switch\.isra\.\d$", fn_name="trace_run") + if not is_support_raw_tp: + self.bpf.attach_kprobe(event="ttwu_do_wakeup", fn_name="trace_ttwu_do_wakeup") + self.bpf.attach_kprobe(event="wake_up_new_task", fn_name="trace_wake_up_new_task") + self.bpf.attach_kprobe(event_re=r"^finish_task_switch$|^finish_task_switch\.isra\.\d$", + fn_name="trace_run") self.log("Compiled.") except Exception as error: # pylint: disable=broad-except self.bpf = None diff --git a/src/pmdas/bcc/modules/runqlat.bpf b/src/pmdas/bcc/modules/runqlat_kp.bpf similarity index 54% rename from src/pmdas/bcc/modules/runqlat.bpf rename to src/pmdas/bcc/modules/runqlat_kp.bpf index a3664a035..dd643d600 100644 --- a/src/pmdas/bcc/modules/runqlat.bpf +++ b/src/pmdas/bcc/modules/runqlat_kp.bpf @@ -5,6 +5,7 @@ #include #include #include +#include typedef struct pid_key { u64 id; // work around @@ -17,7 +18,7 @@ typedef struct pidns_key { } pidns_key_t; BPF_HASH(start, u32); -BPF_HISTOGRAM(dist); +STORAGE struct rq; @@ -31,6 +32,45 @@ static int trace_enqueue(u32 tgid, u32 pid) return 0; } +static __always_inline unsigned int pid_namespace(struct task_struct *task) +{ + +/* pids[] was removed from task_struct since commit 2c4704756cab7cfa031ada4dab361562f0e357c0 + * Using the macro INIT_PID_LINK as a conditional judgment. + */ +#ifdef INIT_PID_LINK + struct pid_link pids; + unsigned int level; + struct upid upid; + struct ns_common ns; + + /* get the pid namespace by following task_active_pid_ns(), + * pid->numbers[pid->level].ns + */ + bpf_probe_read_kernel(&pids, sizeof(pids), &task->pids[PIDTYPE_PID]); + bpf_probe_read_kernel(&level, sizeof(level), &pids.pid->level); + bpf_probe_read_kernel(&upid, sizeof(upid), &pids.pid->numbers[level]); + bpf_probe_read_kernel(&ns, sizeof(ns), &upid.ns->ns); + + return ns.inum; +#else + struct pid *pid; + unsigned int level; + struct upid upid; + struct ns_common ns; + + /* get the pid namespace by following task_active_pid_ns(), + * pid->numbers[pid->level].ns + */ + bpf_probe_read_kernel(&pid, sizeof(pid), &task->thread_pid); + bpf_probe_read_kernel(&level, sizeof(level), &pid->level); + bpf_probe_read_kernel(&upid, sizeof(upid), &pid->numbers[level]); + bpf_probe_read_kernel(&ns, sizeof(ns), &upid.ns->ns); + + return ns.inum; +#endif +} + int trace_wake_up_new_task(struct pt_regs *ctx, struct task_struct *p) { return trace_enqueue(p->tgid, p->pid); @@ -48,7 +88,7 @@ int trace_run(struct pt_regs *ctx, struct task_struct *prev) u32 pid, tgid; // ivcsw: treat like an enqueue event and store timestamp - if (prev->state == TASK_RUNNING) { + if (prev->STATE_FIELD == TASK_RUNNING) { tgid = prev->tgid; pid = prev->pid; if (!(FILTER || pid == 0)) { @@ -69,10 +109,10 @@ int trace_run(struct pt_regs *ctx, struct task_struct *prev) return 0; // missed enqueue } delta = bpf_ktime_get_ns() - *tsp; - delta /= 1000; + FACTOR // store as histogram - dist.increment(bpf_log2l(delta)); + STORE start.delete(&pid); return 0; diff --git a/src/pmdas/bcc/modules/runqlat_tp.bpf b/src/pmdas/bcc/modules/runqlat_tp.bpf new file mode 100644 index 000000000..f0e9ce69b --- /dev/null +++ b/src/pmdas/bcc/modules/runqlat_tp.bpf @@ -0,0 +1,124 @@ +// Copyright 2016 Netflix, Inc. +// Licensed under the Apache License, Version 2.0 (the "License") + +#include +#include +#include +#include +#include + +typedef struct pid_key { + u64 id; // work around + u64 slot; +} pid_key_t; + +typedef struct pidns_key { + u64 id; // work around + u64 slot; +} pidns_key_t; + +BPF_HASH(start, u32); +STORAGE + +struct rq; + +// record enqueue timestamp +static int trace_enqueue(u32 tgid, u32 pid) +{ + if (FILTER || pid == 0) + return 0; + u64 ts = bpf_ktime_get_ns(); + start.update(&pid, &ts); + return 0; +} + +static __always_inline unsigned int pid_namespace(struct task_struct *task) +{ + +/* pids[] was removed from task_struct since commit 2c4704756cab7cfa031ada4dab361562f0e357c0 + * Using the macro INIT_PID_LINK as a conditional judgment. + */ +#ifdef INIT_PID_LINK + struct pid_link pids; + unsigned int level; + struct upid upid; + struct ns_common ns; + + /* get the pid namespace by following task_active_pid_ns(), + * pid->numbers[pid->level].ns + */ + bpf_probe_read_kernel(&pids, sizeof(pids), &task->pids[PIDTYPE_PID]); + bpf_probe_read_kernel(&level, sizeof(level), &pids.pid->level); + bpf_probe_read_kernel(&upid, sizeof(upid), &pids.pid->numbers[level]); + bpf_probe_read_kernel(&ns, sizeof(ns), &upid.ns->ns); + + return ns.inum; +#else + struct pid *pid; + unsigned int level; + struct upid upid; + struct ns_common ns; + + /* get the pid namespace by following task_active_pid_ns(), + * pid->numbers[pid->level].ns + */ + bpf_probe_read_kernel(&pid, sizeof(pid), &task->thread_pid); + bpf_probe_read_kernel(&level, sizeof(level), &pid->level); + bpf_probe_read_kernel(&upid, sizeof(upid), &pid->numbers[level]); + bpf_probe_read_kernel(&ns, sizeof(ns), &upid.ns->ns); + + return ns.inum; +#endif +} + +RAW_TRACEPOINT_PROBE(sched_wakeup) +{ + // TP_PROTO(struct task_struct *p) + struct task_struct *p = (struct task_struct *)ctx->args[0]; + return trace_enqueue(p->tgid, p->pid); +} + +RAW_TRACEPOINT_PROBE(sched_wakeup_new) +{ + // TP_PROTO(struct task_struct *p) + struct task_struct *p = (struct task_struct *)ctx->args[0]; + return trace_enqueue(p->tgid, p->pid); +} + +RAW_TRACEPOINT_PROBE(sched_switch) +{ + // TP_PROTO(bool preempt, struct task_struct *prev, struct task_struct *next) + struct task_struct *prev = (struct task_struct *)ctx->args[1]; + struct task_struct *next = (struct task_struct *)ctx->args[2]; + u32 pid, tgid; + + // ivcsw: treat like an enqueue event and store timestamp + if (prev->STATE_FIELD == TASK_RUNNING) { + tgid = prev->tgid; + pid = prev->pid; + if (!(FILTER || pid == 0)) { + u64 ts = bpf_ktime_get_ns(); + start.update(&pid, &ts); + } + } + + tgid = next->tgid; + pid = next->pid; + if (FILTER || pid == 0) + return 0; + u64 *tsp, delta; + + // fetch timestamp and calculate delta + tsp = start.lookup(&pid); + if (tsp == 0) { + return 0; // missed enqueue + } + delta = bpf_ktime_get_ns() - *tsp; + FACTOR + + // store as histogram + STORE + + start.delete(&pid); + return 0; +} diff --git a/src/pmdas/bcc/modules/tcplife.python b/src/pmdas/bcc/modules/tcplife.python index 0c6f17c36..02c693a6a 100644 --- a/src/pmdas/bcc/modules/tcplife.python +++ b/src/pmdas/bcc/modules/tcplife.python @@ -37,16 +37,11 @@ from modules.pcpbcc import PCPBCCBase # # BPF program # -bpf_src = "modules/tcplife.bpf" -# Compat with kernel < 4.16, bcc < 0.6 -TRACEFS = "/sys/kernel/debug/tracing" -bpf_src_old_tp = "modules/tcplife_old_tp.bpf" -bpf_src_old_kb = "modules/tcplife_old_kb.bpf" -if not path.exists(TRACEFS + "/events/sock/inet_sock_set_state"): - if path.exists(TRACEFS + "/events/tcp/tcp_set_state"): - bpf_src = bpf_src_old_tp - else: - bpf_src = bpf_src_old_kb +if BPF.tracepoint_exists("sock", "inet_sock_set_state"): + bpf_src = "modules/tcplife_tp.bpf" +else: + bpf_src = "modules/tcplife_kp.bpf" + # # PCP BCC PMDA constants @@ -57,35 +52,6 @@ units_bytes = pmUnits(1, 0, 0, PM_SPACE_BYTE, 0, 0) units_usecs = pmUnits(0, 1, 0, 0, PM_TIME_USEC, 0) units_none = pmUnits(0, 0, 0, 0, 0, 0) -TASK_COMM_LEN = 16 # linux/sched.h - -class Data_ipv4(ct.Structure): - """ IPv4 data struct """ - _fields_ = [ - ("ts_us", ct.c_ulonglong), - ("pid", ct.c_ulonglong), - ("saddr", ct.c_ulonglong), - ("daddr", ct.c_ulonglong), - ("ports", ct.c_ulonglong), - ("rx_b", ct.c_ulonglong), - ("tx_b", ct.c_ulonglong), - ("span_us", ct.c_ulonglong), - ("task", ct.c_char * TASK_COMM_LEN) - ] - -class Data_ipv6(ct.Structure): - """ IPv6 data struct """ - _fields_ = [ - ("ts_us", ct.c_ulonglong), - ("pid", ct.c_ulonglong), - ("saddr", (ct.c_ulonglong * 2)), - ("daddr", (ct.c_ulonglong * 2)), - ("ports", ct.c_ulonglong), - ("rx_b", ct.c_ulonglong), - ("tx_b", ct.c_ulonglong), - ("span_us", ct.c_ulonglong), - ("task", ct.c_char * TASK_COMM_LEN) - ] # # PCP BCC Module @@ -129,24 +95,22 @@ class PCPBCCModule(PCPBCCBase): self.lock = Lock() self.thread = None - # Compat with kernel < 4.16 self.log("Using BPF source file %s." % bpf_src) # Exit hard if impossible to continue - if self.bcc_version() == "0.6.1" and bpf_src == bpf_src_old_kb: - raise RuntimeError("BCC 0.6.1 bug makes it incompatible with this module " - "on kernel < 4.15.") + if self.bcc_version_tuple() < (0, 6, 1): + raise RuntimeError("BCC 0.6.1+ is required for this module.") self.log("Initialized.") def handle_ip_event(self, data, version): """ IP event handler """ if version == 4: - event = ct.cast(data, ct.POINTER(Data_ipv4)).contents + event = self.bpf["ipv4_events"].event(data) laddr = inet_ntop(AF_INET, pack("I", event.saddr)) daddr = inet_ntop(AF_INET, pack("I", event.daddr)) else: - event = ct.cast(data, ct.POINTER(Data_ipv6)).contents + event = self.bpf["ipv6_events"].event(data) laddr = inet_ntop(AF_INET6, event.saddr) daddr = inet_ntop(AF_INET6, event.daddr) @@ -205,31 +169,25 @@ class PCPBCCModule(PCPBCCBase): if not self.bpf_text: with open(path.dirname(__file__) + '/../' + bpf_src) as src: self.bpf_text = src.read() - # Compat with bcc < 0.6 - self.log("Testing BCC compatilibility, possible errors below are safe to ignore.") - try: - test_txt = self.bpf_text.replace("// NEW: ", "").replace("FILTER_PID", "") - test_bpf = BPF(text=test_txt) - test_bpf.cleanup() - self.bpf_text = self.bpf_text.replace("// NEW: ", "") - except Exception: # pylint: disable=broad-except - self.bpf_text = self.bpf_text.replace("// OLD: ", "") - self.log("Tested BCC compatilibility, possible errors above are safe to ignore.") if self.dports: filterp = " && ".join(["dport != %d" % port for port in self.dports]) filter_txt = "if (%s) { birth.delete(&sk); return 0; }" % filterp - self.bpf_text = self.bpf_text.replace("//FILTER_DPORT", filter_txt) + self.bpf_text = self.bpf_text.replace("FILTER_DPORT", filter_txt) if self.lports: filterp = " && ".join(["lport != %d" % port for port in self.lports]) filter_txt = "if (%s) { birth.delete(&sk); return 0; }" % filterp - self.bpf_text = self.bpf_text.replace("//FILTER_LPORT", filter_txt) + self.bpf_text = self.bpf_text.replace("FILTER_LPORT", filter_txt) if not self.pids and self.proc_filter and self.proc_refresh: self.log("No process to attach found, activation postponed.") return bpf_text = self.apply_pid_filter(self.bpf_text, self.pids, False) + bpf_text = bpf_text.replace('FILTER_PID', '') + bpf_text = bpf_text.replace('FILTER_DPORT', '') + bpf_text = bpf_text.replace('FILTER_LPORT', '') + bpf_text = bpf_text.replace('FILTER_FAMILY', '') if self.debug: self.log("BPF to be compiled:\n" + bpf_text.strip()) diff --git a/src/pmdas/bcc/modules/tcplife_old_kb.bpf b/src/pmdas/bcc/modules/tcplife_kp.bpf similarity index 81% rename from src/pmdas/bcc/modules/tcplife_old_kb.bpf rename to src/pmdas/bcc/modules/tcplife_kp.bpf index eed01941a..5486c6a37 100644 --- a/src/pmdas/bcc/modules/tcplife_old_kb.bpf +++ b/src/pmdas/bcc/modules/tcplife_kp.bpf @@ -2,7 +2,6 @@ // Licensed under the Apache License, Version 2.0 (the "License") #include -#define KBUILD_MODNAME "pcpbcctcplife" #include #include #include @@ -11,11 +10,10 @@ BPF_HASH(birth, struct sock *, u64); // separate data structs for ipv4 and ipv6 struct ipv4_data_t { - // XXX: switch some to u32's when supported u64 ts_us; - u64 pid; - u64 saddr; - u64 daddr; + u32 pid; + u32 saddr; + u32 daddr; u64 ports; u64 rx_b; u64 tx_b; @@ -26,7 +24,7 @@ BPF_PERF_OUTPUT(ipv4_events); struct ipv6_data_t { u64 ts_us; - u64 pid; + u32 pid; unsigned __int128 saddr; unsigned __int128 daddr; u64 ports; @@ -49,12 +47,12 @@ int kprobe__tcp_set_state(struct pt_regs *ctx, struct sock *sk, int state) // lport is either used in a filter here, or later u16 lport = sk->__sk_common.skc_num; - //FILTER_LPORT + FILTER_LPORT // dport is either used in a filter here, or later u16 dport = sk->__sk_common.skc_dport; dport = ntohs(dport); - //FILTER_DPORT + FILTER_DPORT /* * This tool includes PID and comm context. It's best effort, and may @@ -74,6 +72,9 @@ int kprobe__tcp_set_state(struct pt_regs *ctx, struct sock *sk, int state) * sets ESTABLISHED without a tcp_set_state() call. Until we know * that for sure, match all early states to increase chances a * timestamp is set. + * Note that this needs to be set before the PID filter later on, + * since the PID isn't reliable for these early stages, so we must + * save all timestamps and do the PID filter later when we can. */ u64 ts = bpf_ktime_get_ns(); birth.update(&sk, &ts); @@ -101,7 +102,7 @@ int kprobe__tcp_set_state(struct pt_regs *ctx, struct sock *sk, int state) delta_us = (bpf_ktime_get_ns() - *tsp) / 1000; birth.delete(&sk); - // fetch possible cached data + // fetch possible cached data, and filter struct id_t *mep; mep = whoami.lookup(&sk); if (mep != 0) @@ -116,9 +117,13 @@ int kprobe__tcp_set_state(struct pt_regs *ctx, struct sock *sk, int state) u16 family = sk->__sk_common.skc_family; + FILTER_FAMILY + if (family == AF_INET) { - struct ipv4_data_t data4 = {.span_us = delta_us, - .rx_b = rx_b, .tx_b = tx_b}; + struct ipv4_data_t data4 = {}; + data4.span_us = delta_us; + data4.rx_b = rx_b; + data4.tx_b = tx_b; data4.ts_us = bpf_ktime_get_ns() / 1000; data4.saddr = sk->__sk_common.skc_rcv_saddr; data4.daddr = sk->__sk_common.skc_daddr; @@ -128,17 +133,19 @@ int kprobe__tcp_set_state(struct pt_regs *ctx, struct sock *sk, int state) if (mep == 0) { bpf_get_current_comm(&data4.task, sizeof(data4.task)); } else { - bpf_probe_read(&data4.task, sizeof(data4.task), (void *)mep->task); + bpf_probe_read_kernel(&data4.task, sizeof(data4.task), (void *)mep->task); } ipv4_events.perf_submit(ctx, &data4, sizeof(data4)); } else /* 6 */ { - struct ipv6_data_t data6 = {.span_us = delta_us, - .rx_b = rx_b, .tx_b = tx_b}; + struct ipv6_data_t data6 = {}; + data6.span_us = delta_us; + data6.rx_b = rx_b; + data6.tx_b = tx_b; data6.ts_us = bpf_ktime_get_ns() / 1000; - bpf_probe_read(&data6.saddr, sizeof(data6.saddr), + bpf_probe_read_kernel(&data6.saddr, sizeof(data6.saddr), sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32); - bpf_probe_read(&data6.daddr, sizeof(data6.daddr), + bpf_probe_read_kernel(&data6.daddr, sizeof(data6.daddr), sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32); // a workaround until data6 compiles with separate lport/dport data6.ports = dport + ((0ULL + lport) << 32); @@ -146,7 +153,7 @@ int kprobe__tcp_set_state(struct pt_regs *ctx, struct sock *sk, int state) if (mep == 0) { bpf_get_current_comm(&data6.task, sizeof(data6.task)); } else { - bpf_probe_read(&data6.task, sizeof(data6.task), (void *)mep->task); + bpf_probe_read_kernel(&data6.task, sizeof(data6.task), (void *)mep->task); } ipv6_events.perf_submit(ctx, &data6, sizeof(data6)); } diff --git a/src/pmdas/bcc/modules/tcplife_old_tp.bpf b/src/pmdas/bcc/modules/tcplife_old_tp.bpf deleted file mode 100644 index a7c9c625c..000000000 --- a/src/pmdas/bcc/modules/tcplife_old_tp.bpf +++ /dev/null @@ -1,166 +0,0 @@ -// Copyright 2016 Netflix, Inc. -// Licensed under the Apache License, Version 2.0 (the "License") - -#include -#define KBUILD_MODNAME "pcpbcctcplife" -#include -#include -#include - -BPF_HASH(birth, struct sock *, u64); - -// separate data structs for ipv4 and ipv6 -struct ipv4_data_t { - // XXX: switch some to u32's when supported - u64 ts_us; - u64 pid; - u64 saddr; - u64 daddr; - u64 ports; - u64 rx_b; - u64 tx_b; - u64 span_us; - char task[TASK_COMM_LEN]; -}; -BPF_PERF_OUTPUT(ipv4_events); - -struct ipv6_data_t { - u64 ts_us; - u64 pid; - unsigned __int128 saddr; - unsigned __int128 daddr; - u64 ports; - u64 rx_b; - u64 tx_b; - u64 span_us; - char task[TASK_COMM_LEN]; -}; -BPF_PERF_OUTPUT(ipv6_events); - -struct id_t { - u32 pid; - char task[TASK_COMM_LEN]; -}; -BPF_HASH(whoami, struct sock *, struct id_t); - -TRACEPOINT_PROBE(tcp, tcp_set_state) -{ - u32 pid = bpf_get_current_pid_tgid() >> 32; - // sk is mostly used as a UUID, once for skc_family, and two tcp stats: - struct sock *sk = (struct sock *)args->skaddr; - - // lport is either used in a filter here, or later - u16 lport = args->sport; - //FILTER_LPORT - - // dport is either used in a filter here, or later - u16 dport = args->dport; - //FILTER_DPORT - - /* - * This tool includes PID and comm context. It's best effort, and may - * be wrong in some situations. It currently works like this: - * - record timestamp on any state < TCP_FIN_WAIT1 - * - cache task context on: - * TCP_SYN_SENT: tracing from client - * TCP_LAST_ACK: client-closed from server - * - do output on TCP_CLOSE: - * fetch task context if cached, or use current task - */ - - // capture birth time - if (args->newstate < TCP_FIN_WAIT1) { - /* - * Matching just ESTABLISHED may be sufficient, provided no code-path - * sets ESTABLISHED without a tcp_set_state() call. Until we know - * that for sure, match all early states to increase chances a - * timestamp is set. - * Note that this needs to be set before the PID filter later on, - * since the PID isn't reliable for these early stages, so we must - * save all timestamps and do the PID filter later when we can. - */ - u64 ts = bpf_ktime_get_ns(); - birth.update(&sk, &ts); - } - - // record PID & comm on SYN_SENT - if (args->newstate == TCP_SYN_SENT || args->newstate == TCP_LAST_ACK) { - // now we can PID filter, both here and a little later on for CLOSE - FILTER_PID - struct id_t me = {.pid = pid}; - bpf_get_current_comm(&me.task, sizeof(me.task)); - whoami.update(&sk, &me); - } - - if (args->newstate != TCP_CLOSE) - return 0; - - // calculate lifespan - u64 *tsp, delta_us; - tsp = birth.lookup(&sk); - if (tsp == 0) { - whoami.delete(&sk); // may not exist - return 0; // missed create - } - delta_us = (bpf_ktime_get_ns() - *tsp) / 1000; - birth.delete(&sk); - - // fetch possible cached data, and filter - struct id_t *mep; - mep = whoami.lookup(&sk); - if (mep != 0) - pid = mep->pid; - FILTER_PID - - // get throughput stats. see tcp_get_info(). - u64 rx_b = 0, tx_b = 0, sport = 0; - struct tcp_sock *tp = (struct tcp_sock *)sk; - // OLD: bpf_probe_read(&rx_b, sizeof(rx_b), &tp->bytes_received); - // OLD: bpf_probe_read(&tx_b, sizeof(tx_b), &tp->bytes_acked); - // NEW: rx_b = tp->bytes_received; - // NEW: tx_b = tp->bytes_acked; - - u16 family = 0; - // OLD: bpf_probe_read(&family, sizeof(family), &sk->__sk_common.skc_family); - // NEW: family = sk->__sk_common.skc_family; - - if (family == AF_INET) { - - struct ipv4_data_t data4 = {.span_us = delta_us, - .rx_b = rx_b, .tx_b = tx_b}; - data4.ts_us = bpf_ktime_get_ns() / 1000; - bpf_probe_read(&data4.saddr, sizeof(u32), args->saddr); - bpf_probe_read(&data4.daddr, sizeof(u32), args->daddr); - // a workaround until data4 compiles with separate lport/dport - data4.ports = dport + ((0ULL + lport) << 32); - data4.pid = pid; - - if (mep == 0) { - bpf_get_current_comm(&data4.task, sizeof(data4.task)); - } else { - bpf_probe_read(&data4.task, sizeof(data4.task), (void *)mep->task); - } - ipv4_events.perf_submit(args, &data4, sizeof(data4)); - - } else /* 6 */ { - struct ipv6_data_t data6 = {.span_us = delta_us, - .rx_b = rx_b, .tx_b = tx_b}; - data6.ts_us = bpf_ktime_get_ns() / 1000; - bpf_probe_read(&data6.saddr, sizeof(data6.saddr), args->saddr_v6); - bpf_probe_read(&data6.daddr, sizeof(data6.daddr), args->saddr_v6); - // a workaround until data6 compiles with separate lport/dport - data6.ports = dport + ((0ULL + lport) << 32); - data6.pid = pid; - if (mep == 0) { - bpf_get_current_comm(&data6.task, sizeof(data6.task)); - } else { - bpf_probe_read(&data6.task, sizeof(data6.task), (void *)mep->task); - } - ipv6_events.perf_submit(args, &data6, sizeof(data6)); - } - - if (mep != 0) - whoami.delete(&sk); - - return 0; -} diff --git a/src/pmdas/bcc/modules/tcplife.bpf b/src/pmdas/bcc/modules/tcplife_tp.bpf similarity index 80% rename from src/pmdas/bcc/modules/tcplife.bpf rename to src/pmdas/bcc/modules/tcplife_tp.bpf index 19ca8d740..2b16b98e7 100644 --- a/src/pmdas/bcc/modules/tcplife.bpf +++ b/src/pmdas/bcc/modules/tcplife_tp.bpf @@ -2,7 +2,6 @@ // Licensed under the Apache License, Version 2.0 (the "License") #include -#define KBUILD_MODNAME "pcpbcctcplife" #include #include #include @@ -11,11 +10,10 @@ BPF_HASH(birth, struct sock *, u64); // separate data structs for ipv4 and ipv6 struct ipv4_data_t { - // XXX: switch some to u32's when supported u64 ts_us; - u64 pid; - u64 saddr; - u64 daddr; + u32 pid; + u32 saddr; + u32 daddr; u64 ports; u64 rx_b; u64 tx_b; @@ -26,7 +24,7 @@ BPF_PERF_OUTPUT(ipv4_events); struct ipv6_data_t { u64 ts_us; - u64 pid; + u32 pid; unsigned __int128 saddr; unsigned __int128 daddr; u64 ports; @@ -54,11 +52,11 @@ TRACEPOINT_PROBE(sock, inet_sock_set_state) // lport is either used in a filter here, or later u16 lport = args->sport; - //FILTER_LPORT + FILTER_LPORT // dport is either used in a filter here, or later u16 dport = args->dport; - //FILTER_DPORT + FILTER_DPORT /* * This tool includes PID and comm context. It's best effort, and may @@ -115,20 +113,23 @@ TRACEPOINT_PROBE(sock, inet_sock_set_state) pid = mep->pid; FILTER_PID + u16 family = args->family; + FILTER_FAMILY + // get throughput stats. see tcp_get_info(). u64 rx_b = 0, tx_b = 0, sport = 0; struct tcp_sock *tp = (struct tcp_sock *)sk; - // OLD: bpf_probe_read(&rx_b, sizeof(rx_b), &tp->bytes_received); - // OLD: bpf_probe_read(&tx_b, sizeof(tx_b), &tp->bytes_acked); - // NEW: rx_b = tp->bytes_received; - // NEW: tx_b = tp->bytes_acked; + rx_b = tp->bytes_received; + tx_b = tp->bytes_acked; if (args->family == AF_INET) { - struct ipv4_data_t data4 = {.span_us = delta_us, - .rx_b = rx_b, .tx_b = tx_b}; + struct ipv4_data_t data4 = {}; + data4.span_us = delta_us; + data4.rx_b = rx_b; + data4.tx_b = tx_b; data4.ts_us = bpf_ktime_get_ns() / 1000; - bpf_probe_read(&data4.saddr, sizeof(u32), args->saddr); - bpf_probe_read(&data4.daddr, sizeof(u32), args->daddr); + __builtin_memcpy(&data4.saddr, args->saddr, sizeof(data4.saddr)); + __builtin_memcpy(&data4.daddr, args->daddr, sizeof(data4.daddr)); // a workaround until data4 compiles with separate lport/dport data4.ports = dport + ((0ULL + lport) << 32); data4.pid = pid; @@ -136,23 +137,25 @@ TRACEPOINT_PROBE(sock, inet_sock_set_state) if (mep == 0) { bpf_get_current_comm(&data4.task, sizeof(data4.task)); } else { - bpf_probe_read(&data4.task, sizeof(data4.task), (void *)mep->task); + bpf_probe_read_kernel(&data4.task, sizeof(data4.task), (void *)mep->task); } ipv4_events.perf_submit(args, &data4, sizeof(data4)); } else /* 6 */ { - struct ipv6_data_t data6 = {.span_us = delta_us, - .rx_b = rx_b, .tx_b = tx_b}; + struct ipv6_data_t data6 = {}; + data6.span_us = delta_us; + data6.rx_b = rx_b; + data6.tx_b = tx_b; data6.ts_us = bpf_ktime_get_ns() / 1000; - bpf_probe_read(&data6.saddr, sizeof(data6.saddr), args->saddr_v6); - bpf_probe_read(&data6.daddr, sizeof(data6.daddr), args->saddr_v6); + __builtin_memcpy(&data6.saddr, args->saddr_v6, sizeof(data6.saddr)); + __builtin_memcpy(&data6.daddr, args->daddr_v6, sizeof(data6.daddr)); // a workaround until data6 compiles with separate lport/dport data6.ports = dport + ((0ULL + lport) << 32); data6.pid = pid; if (mep == 0) { bpf_get_current_comm(&data6.task, sizeof(data6.task)); } else { - bpf_probe_read(&data6.task, sizeof(data6.task), (void *)mep->task); + bpf_probe_read_kernel(&data6.task, sizeof(data6.task), (void *)mep->task); } ipv6_events.perf_submit(args, &data6, sizeof(data6)); } diff --git a/src/pmdas/bcc/modules/tcpperpid.python b/src/pmdas/bcc/modules/tcpperpid.python index 3cb2cfcfd..0096929a6 100644 --- a/src/pmdas/bcc/modules/tcpperpid.python +++ b/src/pmdas/bcc/modules/tcpperpid.python @@ -32,16 +32,10 @@ from modules.pcpbcc import PCPBCCBase # # BPF program # -bpf_src = "modules/tcplife.bpf" -# Compat with kernel < 4.16, bcc < 0.6 -TRACEFS = "/sys/kernel/debug/tracing" -bpf_src_old_tp = "modules/tcplife_old_tp.bpf" -bpf_src_old_kb = "modules/tcplife_old_kb.bpf" -if not path.exists(TRACEFS + "/events/sock/inet_sock_set_state"): - if path.exists(TRACEFS + "/events/tcp/tcp_set_state"): - bpf_src = bpf_src_old_tp - else: - bpf_src = bpf_src_old_kb +if BPF.tracepoint_exists("sock", "inet_sock_set_state"): + bpf_src = "modules/tcplife_tp.bpf" +else: + bpf_src = "modules/tcplife_kp.bpf" # Alternative, "high resolution" BPF bpf_highres = "modules/tcptop.bpf" @@ -53,36 +47,6 @@ MODULE = 'tcpperpid' BASENS = 'proc.io.net.total.' units_bytes = pmUnits(1, 0, 0, PM_SPACE_BYTE, 0, 0) -TASK_COMM_LEN = 16 # linux/sched.h - -class Data_ipv4(ct.Structure): - """ IPv4 data struct """ - _fields_ = [ - ("ts_us", ct.c_ulonglong), - ("pid", ct.c_ulonglong), - ("saddr", ct.c_ulonglong), - ("daddr", ct.c_ulonglong), - ("ports", ct.c_ulonglong), - ("rx_b", ct.c_ulonglong), - ("tx_b", ct.c_ulonglong), - ("span_us", ct.c_ulonglong), - ("task", ct.c_char * TASK_COMM_LEN) - ] - -class Data_ipv6(ct.Structure): - """ IPv6 data struct """ - _fields_ = [ - ("ts_us", ct.c_ulonglong), - ("pid", ct.c_ulonglong), - ("saddr", (ct.c_ulonglong * 2)), - ("daddr", (ct.c_ulonglong * 2)), - ("ports", ct.c_ulonglong), - ("rx_b", ct.c_ulonglong), - ("tx_b", ct.c_ulonglong), - ("span_us", ct.c_ulonglong), - ("task", ct.c_char * TASK_COMM_LEN) - ] - # # PCP BCC Module # @@ -133,15 +97,14 @@ class PCPBCCModule(PCPBCCBase): self.log("Using BPF source file %s." % src) # Exit hard if impossible to continue - if self.bcc_version() == "0.6.1" and src == bpf_src_old_kb and not self.highres: - raise RuntimeError("BCC 0.6.1 bug makes it incompatible with this module " - "on kernel < 4.15 in non-highres mode.") + if self.bcc_version_tuple() < (0, 6, 1) and not self.highres: + raise RuntimeError("BCC 0.6.1+ is required for this module in non-highres mode.") self.log("Initialized.") def handle_ipv4_event(self, _cpu, data, _size): """ IPv4 event handler """ - event = ct.cast(data, ct.POINTER(Data_ipv4)).contents + event = self.bpf["ipv4_events"].event(data) pid = str(event.pid).zfill(6) self.lock.acquire() if pid not in self.ipv4_stats: @@ -153,7 +116,7 @@ class PCPBCCModule(PCPBCCBase): def handle_ipv6_event(self, _cpu, data, _size): """ IPv6 event handler """ - event = ct.cast(data, ct.POINTER(Data_ipv6)).contents + event = self.bpf["ipv6_events"].event(data) pid = str(event.pid).zfill(6) self.lock.acquire() if pid not in self.ipv6_stats: @@ -199,31 +162,25 @@ class PCPBCCModule(PCPBCCBase): self.bpf_text = src.read() if self.highres: self.bpf_text = self.bpf_text.replace("FILTER", "FILTER_PID") - # Compat with bcc < 0.6 - self.log("Testing BCC compatilibility, possible errors below are safe to ignore.") - try: - test_txt = self.bpf_text.replace("// NEW: ", "").replace("FILTER_PID", "") - test_bpf = BPF(text=test_txt) - test_bpf.cleanup() - self.bpf_text = self.bpf_text.replace("// NEW: ", "") - except Exception: # pylint: disable=broad-except - self.bpf_text = self.bpf_text.replace("// OLD: ", "") - self.log("Tested BCC compatilibility, possible errors above are safe to ignore.") if self.dports: filterp = " && ".join(["dport != %d" % port for port in self.dports]) filter_txt = "if (%s) { birth.delete(&sk); return 0; }" % filterp - self.bpf_text = self.bpf_text.replace("//FILTER_DPORT", filter_txt) + self.bpf_text = self.bpf_text.replace("FILTER_DPORT", filter_txt) if self.lports: filterp = " && ".join(["lport != %d" % port for port in self.lports]) filter_txt = "if (%s) { birth.delete(&sk); return 0; }" % filterp - self.bpf_text = self.bpf_text.replace("//FILTER_LPORT", filter_txt) + self.bpf_text = self.bpf_text.replace("FILTER_LPORT", filter_txt) if not self.pids and self.proc_filter and self.proc_refresh: self.log("No process to attach found, activation postponed.") return bpf_text = self.apply_pid_filter(self.bpf_text, self.pids, False) + bpf_text = bpf_text.replace('FILTER_PID', '') + bpf_text = bpf_text.replace('FILTER_DPORT', '') + bpf_text = bpf_text.replace('FILTER_LPORT', '') + bpf_text = bpf_text.replace('FILTER_FAMILY', '') if self.debug: self.log("BPF to be compiled:\n" + bpf_text.strip()) diff --git a/src/pmdas/bcc/modules/tcptop.bpf b/src/pmdas/bcc/modules/tcptop.bpf index 349ee1529..c1fed7aef 100644 --- a/src/pmdas/bcc/modules/tcptop.bpf +++ b/src/pmdas/bcc/modules/tcptop.bpf @@ -4,6 +4,7 @@ #include #include #include + struct ipv4_key_t { u32 pid; u32 saddr; @@ -13,25 +14,32 @@ struct ipv4_key_t { }; BPF_HASH(ipv4_send_bytes, struct ipv4_key_t); BPF_HASH(ipv4_recv_bytes, struct ipv4_key_t); + struct ipv6_key_t { + unsigned __int128 saddr; + unsigned __int128 daddr; u32 pid; - // workaround until unsigned __int128 support: - u64 saddr0; - u64 saddr1; - u64 daddr0; - u64 daddr1; u16 lport; u16 dport; + u64 __pad__; }; BPF_HASH(ipv6_send_bytes, struct ipv6_key_t); BPF_HASH(ipv6_recv_bytes, struct ipv6_key_t); + int kprobe__tcp_sendmsg(struct pt_regs *ctx, struct sock *sk, struct msghdr *msg, size_t size) { - u32 pid = bpf_get_current_pid_tgid(); - FILTER + if (container_should_be_filtered()) { + return 0; + } + + u32 pid = bpf_get_current_pid_tgid() >> 32; + FILTER_PID + u16 dport = 0, family = sk->__sk_common.skc_family; - u64 *val, zero = 0; + + FILTER_FAMILY + if (family == AF_INET) { struct ipv4_key_t ipv4_key = {.pid = pid}; ipv4_key.saddr = sk->__sk_common.skc_rcv_saddr; @@ -39,31 +47,24 @@ int kprobe__tcp_sendmsg(struct pt_regs *ctx, struct sock *sk, ipv4_key.lport = sk->__sk_common.skc_num; dport = sk->__sk_common.skc_dport; ipv4_key.dport = ntohs(dport); - val = ipv4_send_bytes.lookup_or_init(&ipv4_key, &zero); - if (val) { - (*val) += size; - } + ipv4_send_bytes.increment(ipv4_key, size); + } else if (family == AF_INET6) { struct ipv6_key_t ipv6_key = {.pid = pid}; - bpf_probe_read(&ipv6_key.saddr0, sizeof(ipv6_key.saddr0), - &sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32[0]); - bpf_probe_read(&ipv6_key.saddr1, sizeof(ipv6_key.saddr1), - &sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32[2]); - bpf_probe_read(&ipv6_key.daddr0, sizeof(ipv6_key.daddr0), - &sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32[0]); - bpf_probe_read(&ipv6_key.daddr1, sizeof(ipv6_key.daddr1), - &sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32[2]); + bpf_probe_read_kernel(&ipv6_key.saddr, sizeof(ipv6_key.saddr), + &sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32); + bpf_probe_read_kernel(&ipv6_key.daddr, sizeof(ipv6_key.daddr), + &sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32); ipv6_key.lport = sk->__sk_common.skc_num; dport = sk->__sk_common.skc_dport; ipv6_key.dport = ntohs(dport); - val = ipv6_send_bytes.lookup_or_init(&ipv6_key, &zero); - if (val) { - (*val) += size; - } + ipv6_send_bytes.increment(ipv6_key, size); } // else drop + return 0; } + /* * tcp_recvmsg() would be obvious to trace, but is less suitable because: * - we'd need to trace both entry and return, to have both sock and size @@ -72,12 +73,21 @@ int kprobe__tcp_sendmsg(struct pt_regs *ctx, struct sock *sk, */ int kprobe__tcp_cleanup_rbuf(struct pt_regs *ctx, struct sock *sk, int copied) { - u32 pid = bpf_get_current_pid_tgid(); - FILTER + if (container_should_be_filtered()) { + return 0; + } + + u32 pid = bpf_get_current_pid_tgid() >> 32; + FILTER_PID + u16 dport = 0, family = sk->__sk_common.skc_family; u64 *val, zero = 0; + if (copied <= 0) return 0; + + FILTER_FAMILY + if (family == AF_INET) { struct ipv4_key_t ipv4_key = {.pid = pid}; ipv4_key.saddr = sk->__sk_common.skc_rcv_saddr; @@ -85,28 +95,20 @@ int kprobe__tcp_cleanup_rbuf(struct pt_regs *ctx, struct sock *sk, int copied) ipv4_key.lport = sk->__sk_common.skc_num; dport = sk->__sk_common.skc_dport; ipv4_key.dport = ntohs(dport); - val = ipv4_recv_bytes.lookup_or_init(&ipv4_key, &zero); - if (val) { - (*val) += copied; - } + ipv4_recv_bytes.increment(ipv4_key, copied); + } else if (family == AF_INET6) { struct ipv6_key_t ipv6_key = {.pid = pid}; - bpf_probe_read(&ipv6_key.saddr0, sizeof(ipv6_key.saddr0), - &sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32[0]); - bpf_probe_read(&ipv6_key.saddr1, sizeof(ipv6_key.saddr1), - &sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32[2]); - bpf_probe_read(&ipv6_key.daddr0, sizeof(ipv6_key.daddr0), - &sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32[0]); - bpf_probe_read(&ipv6_key.daddr1, sizeof(ipv6_key.daddr1), - &sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32[2]); + bpf_probe_read_kernel(&ipv6_key.saddr, sizeof(ipv6_key.saddr), + &sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32); + bpf_probe_read_kernel(&ipv6_key.daddr, sizeof(ipv6_key.daddr), + &sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32); ipv6_key.lport = sk->__sk_common.skc_num; dport = sk->__sk_common.skc_dport; ipv6_key.dport = ntohs(dport); - val = ipv6_recv_bytes.lookup_or_init(&ipv6_key, &zero); - if (val) { - (*val) += copied; - } + ipv6_recv_bytes.increment(ipv6_key, copied); } // else drop + return 0; } diff --git a/src/pmdas/bcc/modules/tcptop.python b/src/pmdas/bcc/modules/tcptop.python index 45063dff3..db1c1da15 100644 --- a/src/pmdas/bcc/modules/tcptop.python +++ b/src/pmdas/bcc/modules/tcptop.python @@ -120,13 +120,14 @@ class PCPBCCModule(PCPBCCBase): with open(path.dirname(__file__) + '/../' + bpf_src) as src: self.bpf_text = src.read() - self.bpf_text = self.bpf_text.replace("FILTER", "FILTER_PID") - if not self.pids and self.proc_filter and self.proc_refresh: self.log("No process to attach found, activation postponed.") return bpf_text = self.apply_pid_filter(self.bpf_text, self.pids, False) + bpf_text = bpf_text.replace('FILTER_PID', '') + bpf_text = bpf_text.replace('FILTER_FAMILY', '') + bpf_text = bpf_text.replace('container_should_be_filtered()', '0') if self.debug: self.log("BPF to be compiled:\n" + bpf_text.strip()) @@ -155,21 +156,31 @@ class PCPBCCModule(PCPBCCBase): @staticmethod def ipv4_table_to_dict(table): - """ Build hashable dict from IPv4 BPF table """ - return {TCPSessionKey(pid=k.pid, - laddr=inet_ntop(AF_INET, pack("I", k.saddr)), - lport=k.lport, - daddr=inet_ntop(AF_INET, pack("I", k.daddr)), - dport=k.dport):v.value for k, v in table.items()} + """Build hashable dict from IPv4 BPF table""" + return { + TCPSessionKey( + pid=k.pid, + laddr=inet_ntop(AF_INET, pack("I", k.saddr)), + lport=k.lport, + daddr=inet_ntop(AF_INET, pack("I", k.daddr)), + dport=k.dport, + ): v.value + for k, v in table.items() + } @staticmethod def ipv6_table_to_dict(table): - """ Build hashable dict from IPv6 BPF table """ - return {TCPSessionKey(pid=k.pid, - laddr=inet_ntop(AF_INET6, pack("QQ", k.saddr0, k.saddr1)), - lport=k.lport, - daddr=inet_ntop(AF_INET6, pack("QQ", k.daddr0, k.daddr1)), - dport=k.dport):v.value for k, v in table.items()} + """Build hashable dict from IPv6 BPF table""" + return { + TCPSessionKey( + pid=k.pid, + laddr=inet_ntop(AF_INET6, k.saddr), + lport=k.lport, + daddr=inet_ntop(AF_INET6, k.daddr), + dport=k.dport, + ): v.value + for k, v in table.items() + } def refresh_stats(self): """ Refresh statistics from BPF table """ commit d45ce8e85035cc95ba897cd19967fad6d5d741be (cherry-picked) Author: Andreas Gerstmayr Date: Wed Dec 15 08:03:40 2021 +0100 qa: update qa/1118 to add new log output of runqlat bcc module diff --git a/qa/1118 b/qa/1118 index 4123495b5..bcaec0a0d 100755 --- a/qa/1118 +++ b/qa/1118 @@ -21,12 +21,19 @@ _label_filter() grep '"0-1"' | grep '"statistic":"histogram"' | grep '"lower_bound":0' | grep 'upper_bound":1' > /dev/null && echo 'OK' } +_install_filter() +{ + sed \ + -e "s/Using BPF source file .\+/Using BPF source file X/g" \ + #end +} + _prepare_pmda bcc trap "_pmdabcc_cleanup; exit \$status" 0 1 2 3 15 _stop_auto_restart pmcd # real QA test starts here -cat <