Rebase bcc to v0.29.1 and enable libbpf-tools on s390x

Also fix bpf-bindsnoop and ttysnoop, and sync libbpf.

Resolves: bz#2253688
Resolves: bz#2249458
Jerome Marchand 2024-02-05 14:52:12 +01:00
parent eae885cfcc
commit 110e48716f
10 changed files with 986 additions and 1425 deletions

.gitignore

@@ -21,3 +21,4 @@
/bcc-0.26.0.tar.gz
/bcc-0.27.0.tar.gz
/bcc-0.28.0.tar.gz
+/bcc-0.29.1.tar.gz


@@ -0,0 +1,132 @@
From 89126c7452c29736d38dc072a952b0b0c831fade Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Mon, 29 Jan 2024 16:13:30 -0800
Subject: [PATCH] Fix ttysnoop.py with newer kernels
Jerome Marchand reported that ttysnoop.py won't work properly
with newer kernels (#4884). I did some investigation and found
that some kernel data structure change caused verification failure.
The failure is caused by the following:
; kvec = from->kvec;
// R1=ptr_iov_iter()
15: (79) r1 = *(u64 *)(r1 +16) ; R1_w=scalar()
; count = kvec->iov_len;
16: (bf) r2 = r1 ; R1_w=scalar(id=1) R2_w=scalar(id=1)
17: (07) r2 += 8 ; R2_w=scalar()
18: (05) goto pc+3
;
22: (79) r2 = *(u64 *)(r2 +0)
R2 invalid mem access 'scalar'
So basically, loading 'iov_iter + 16' returns a scalar, but the verifier
expects it to be a pointer.
In v6.4, we have
struct iovec
{
void __user *iov_base; /* BSD uses caddr_t (1003.1g requires void *) */
__kernel_size_t iov_len; /* Must be size_t (1003.1g) */
};
struct iov_iter {
u8 iter_type;
bool copy_mc;
bool nofault;
bool data_source;
bool user_backed;
union {
size_t iov_offset;
int last_offset;
};
union {
struct iovec __ubuf_iovec;
struct {
union {
const struct iovec *__iov;
const struct kvec *kvec;
const struct bio_vec *bvec;
struct xarray *xarray;
struct pipe_inode_info *pipe;
void __user *ubuf;
};
size_t count;
};
};
union {
unsigned long nr_segs;
struct {
unsigned int head;
unsigned int start_head;
};
loff_t xarray_start;
};
};
The kernel traversal chain will be
"struct iov_iter" -> "struct iovec __ubuf_iovec" -> "void __user *iov_base".
Since the "iov_base" type is a ptr to void, the kernel considers the
loaded value as a scalar which caused verification failure.
But for older kernels like 5.19, we do not have this issue.
struct iovec
{
void __user *iov_base; /* BSD uses caddr_t (1003.1g requires void *) */
__kernel_size_t iov_len; /* Must be size_t (1003.1g) */
};
struct iov_iter {
u8 iter_type;
bool nofault;
bool data_source;
bool user_backed;
size_t iov_offset;
size_t count;
union {
const struct iovec *iov;
const struct kvec *kvec;
const struct bio_vec *bvec;
struct xarray *xarray;
struct pipe_inode_info *pipe;
void __user *ubuf;
};
union {
unsigned long nr_segs;
struct {
unsigned int head;
unsigned int start_head;
};
loff_t xarray_start;
};
};
The kernel traversal chain will be
"struct iov_iter" -> "const struct iovec *iov"
Note that "const struct iovec *iov" is used since it is the *first* member
inside the union. The traversal stops once we hit a pointer.
So the kernel verifier returns a 'struct iovec' object (untrusted, cannot
be used as a parameter to a call) and verifier can proceed.
To fix the problem, let us use bpf_probe_read_kernel() instead
so ttysnoop.py can continue to work with newer kernels.
Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
---
tools/ttysnoop.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/tools/ttysnoop.py b/tools/ttysnoop.py
index 77f97b7c..aca09db4 100755
--- a/tools/ttysnoop.py
+++ b/tools/ttysnoop.py
@@ -162,8 +162,8 @@ PROBE_TTY_WRITE
*/
case CASE_ITER_IOVEC_NAME:
kvec = from->kvec;
- buf = kvec->iov_base;
- count = kvec->iov_len;
+ bpf_probe_read_kernel(&buf, sizeof(buf), &kvec->iov_base);
+ bpf_probe_read_kernel(&count, sizeof(count), &kvec->iov_len);
break;
CASE_ITER_UBUF_TEXT
/* TODO: Support more type */
--
2.43.0
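For reference, the pattern the patch lands on can be sketched outside the diff as well. This is a minimal, hypothetical BCC-style fragment (not part of the patch; it borrows the from, buf and count names from the probe above): since the value loaded from the iov_iter is only usable as an opaque scalar on newer kernels, both fields are copied with bpf_probe_read_kernel() instead of being dereferenced directly.

/* Sketch only: read through a pointer the verifier treats as a scalar by
 * copying the fields with bpf_probe_read_kernel(). */
const struct kvec *kvec = from->kvec;
const char *buf = NULL;
size_t count = 0;

bpf_probe_read_kernel(&buf, sizeof(buf), &kvec->iov_base);
bpf_probe_read_kernel(&count, sizeof(count), &kvec->iov_len);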


@@ -0,0 +1,727 @@
From c0691e35cd65d5400f0b792d5eba81f8eae236dc Mon Sep 17 00:00:00 2001
From: yonghong-song <ys114321@gmail.com>
Date: Tue, 30 Jan 2024 09:14:30 -0800
Subject: [PATCH] Sync with latest libbpf repo (#4889)
Sync with latest libbpf repo.
The top libbpf commit is:
3b0973892891 sync: remove NETDEV_XSK_FLAGS_MASK which is not in bpf/bpf-next anymore
Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
---
introspection/bps.c | 1 +
src/cc/compat/linux/virtual_bpf.h | 368 ++++++++++++++++++++++++++----
src/cc/libbpf | 2 +-
3 files changed, 326 insertions(+), 45 deletions(-)
diff --git a/introspection/bps.c b/introspection/bps.c
index 3956fbf2..8cdef54a 100644
--- a/introspection/bps.c
+++ b/introspection/bps.c
@@ -48,6 +48,7 @@ static const char * const prog_type_strings[] = {
[BPF_PROG_TYPE_LSM] = "lsm",
[BPF_PROG_TYPE_SK_LOOKUP] = "sk_lookup",
[BPF_PROG_TYPE_SYSCALL] = "syscall",
+ [BPF_PROG_TYPE_NETFILTER] = "netfilter",
};
static const char * const map_type_strings[] = {
diff --git a/src/cc/compat/linux/virtual_bpf.h b/src/cc/compat/linux/virtual_bpf.h
index a182123e..fcabe71a 100644
--- a/src/cc/compat/linux/virtual_bpf.h
+++ b/src/cc/compat/linux/virtual_bpf.h
@@ -20,6 +20,7 @@ R"********(
/* ld/ldx fields */
#define BPF_DW 0x18 /* double word (64-bit) */
+#define BPF_MEMSX 0x80 /* load with sign extension */
#define BPF_ATOMIC 0xc0 /* atomic memory ops - op type in immediate */
#define BPF_XADD 0xc0 /* exclusive add - legacy name */
@@ -847,6 +848,36 @@ union bpf_iter_link_info {
* Returns zero on success. On error, -1 is returned and *errno*
* is set appropriately.
*
+ * BPF_TOKEN_CREATE
+ * Description
+ * Create BPF token with embedded information about what
+ * BPF-related functionality it allows:
+ * - a set of allowed bpf() syscall commands;
+ * - a set of allowed BPF map types to be created with
+ * BPF_MAP_CREATE command, if BPF_MAP_CREATE itself is allowed;
+ * - a set of allowed BPF program types and BPF program attach
+ * types to be loaded with BPF_PROG_LOAD command, if
+ * BPF_PROG_LOAD itself is allowed.
+ *
+ * BPF token is created (derived) from an instance of BPF FS,
+ * assuming it has necessary delegation mount options specified.
+ * This BPF token can be passed as an extra parameter to various
+ * bpf() syscall commands to grant BPF subsystem functionality to
+ * unprivileged processes.
+ *
+ * When created, BPF token is "associated" with the owning
+ * user namespace of BPF FS instance (super block) that it was
+ * derived from, and subsequent BPF operations performed with
+ * BPF token would be performing capabilities checks (i.e.,
+ * CAP_BPF, CAP_PERFMON, CAP_NET_ADMIN, CAP_SYS_ADMIN) within
+ * that user namespace. Without BPF token, such capabilities
+ * have to be granted in init user namespace, making bpf()
+ * syscall incompatible with user namespace, for the most part.
+ *
+ * Return
+ * A new file descriptor (a nonnegative integer), or -1 if an
+ * error occurred (in which case, *errno* is set appropriately).
+ *
* NOTES
* eBPF objects (maps and programs) can be shared between processes.
*
@@ -901,6 +932,8 @@ enum bpf_cmd {
BPF_ITER_CREATE,
BPF_LINK_DETACH,
BPF_PROG_BIND_MAP,
+ BPF_TOKEN_CREATE,
+ __MAX_BPF_CMD,
};
enum bpf_map_type {
@@ -932,7 +965,14 @@ enum bpf_map_type {
*/
BPF_MAP_TYPE_CGROUP_STORAGE = BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED,
BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
- BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
+ BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE_DEPRECATED,
+ /* BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE is available to bpf programs
+ * attaching to a cgroup. The new mechanism (BPF_MAP_TYPE_CGRP_STORAGE +
+ * local percpu kptr) supports all BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE
+ * functionality and more. So mark * BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE
+ * deprecated.
+ */
+ BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE = BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE_DEPRECATED,
BPF_MAP_TYPE_QUEUE,
BPF_MAP_TYPE_STACK,
BPF_MAP_TYPE_SK_STORAGE,
@@ -944,6 +984,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_BLOOM_FILTER,
BPF_MAP_TYPE_USER_RINGBUF,
BPF_MAP_TYPE_CGRP_STORAGE,
+ __MAX_BPF_MAP_TYPE
};
/* Note that tracing related programs such as
@@ -987,6 +1028,8 @@ enum bpf_prog_type {
BPF_PROG_TYPE_LSM,
BPF_PROG_TYPE_SK_LOOKUP,
BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
+ BPF_PROG_TYPE_NETFILTER,
+ __MAX_BPF_PROG_TYPE
};
enum bpf_attach_type {
@@ -1035,6 +1078,17 @@ enum bpf_attach_type {
BPF_TRACE_KPROBE_MULTI,
BPF_LSM_CGROUP,
BPF_STRUCT_OPS,
+ BPF_NETFILTER,
+ BPF_TCX_INGRESS,
+ BPF_TCX_EGRESS,
+ BPF_TRACE_UPROBE_MULTI,
+ BPF_CGROUP_UNIX_CONNECT,
+ BPF_CGROUP_UNIX_SENDMSG,
+ BPF_CGROUP_UNIX_RECVMSG,
+ BPF_CGROUP_UNIX_GETPEERNAME,
+ BPF_CGROUP_UNIX_GETSOCKNAME,
+ BPF_NETKIT_PRIMARY,
+ BPF_NETKIT_PEER,
__MAX_BPF_ATTACH_TYPE
};
@@ -1051,8 +1105,23 @@ enum bpf_link_type {
BPF_LINK_TYPE_PERF_EVENT = 7,
BPF_LINK_TYPE_KPROBE_MULTI = 8,
BPF_LINK_TYPE_STRUCT_OPS = 9,
+ BPF_LINK_TYPE_NETFILTER = 10,
+ BPF_LINK_TYPE_TCX = 11,
+ BPF_LINK_TYPE_UPROBE_MULTI = 12,
+ BPF_LINK_TYPE_NETKIT = 13,
+ __MAX_BPF_LINK_TYPE,
+};
+
+#define MAX_BPF_LINK_TYPE __MAX_BPF_LINK_TYPE
- MAX_BPF_LINK_TYPE,
+enum bpf_perf_event_type {
+ BPF_PERF_EVENT_UNSPEC = 0,
+ BPF_PERF_EVENT_UPROBE = 1,
+ BPF_PERF_EVENT_URETPROBE = 2,
+ BPF_PERF_EVENT_KPROBE = 3,
+ BPF_PERF_EVENT_KRETPROBE = 4,
+ BPF_PERF_EVENT_TRACEPOINT = 5,
+ BPF_PERF_EVENT_EVENT = 6,
};
/* cgroup-bpf attach flags used in BPF_PROG_ATTACH command
@@ -1101,7 +1170,12 @@ enum bpf_link_type {
*/
#define BPF_F_ALLOW_OVERRIDE (1U << 0)
#define BPF_F_ALLOW_MULTI (1U << 1)
+/* Generic attachment flags. */
#define BPF_F_REPLACE (1U << 2)
+#define BPF_F_BEFORE (1U << 3)
+#define BPF_F_AFTER (1U << 4)
+#define BPF_F_ID (1U << 5)
+#define BPF_F_LINK BPF_F_LINK /* 1 << 13 */
/* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the
* verifier will perform strict alignment checking as if the kernel
@@ -1163,10 +1237,27 @@ enum bpf_link_type {
*/
#define BPF_F_XDP_DEV_BOUND_ONLY (1U << 6)
+/* The verifier internal test flag. Behavior is undefined */
+#define BPF_F_TEST_REG_INVARIANTS (1U << 7)
+
/* link_create.kprobe_multi.flags used in LINK_CREATE command for
* BPF_TRACE_KPROBE_MULTI attach type to create return probe.
*/
-#define BPF_F_KPROBE_MULTI_RETURN (1U << 0)
+enum {
+ BPF_F_KPROBE_MULTI_RETURN = (1U << 0)
+};
+
+/* link_create.uprobe_multi.flags used in LINK_CREATE command for
+ * BPF_TRACE_UPROBE_MULTI attach type to create return probe.
+ */
+enum {
+ BPF_F_UPROBE_MULTI_RETURN = (1U << 0)
+};
+
+/* link_create.netfilter.flags used in LINK_CREATE command for
+ * BPF_PROG_TYPE_NETFILTER to enable IP packet defragmentation.
+ */
+#define BPF_F_NETFILTER_IP_DEFRAG (1U << 0)
/* When BPF ldimm64's insn[0].src_reg != 0 then this can have
* the following extensions:
@@ -1271,6 +1362,15 @@ enum {
/* Create a map that will be registered/unregesitered by the backed bpf_link */
BPF_F_LINK = (1U << 13),
+
+/* Get path from provided FD in BPF_OBJ_PIN/BPF_OBJ_GET commands */
+ BPF_F_PATH_FD = (1U << 14),
+
+/* Flag for value_type_btf_obj_fd, the fd is available */
+ BPF_F_VTYPE_BTF_OBJ_FD = (1U << 15),
+
+/* BPF token FD is passed in a corresponding command's token_fd field */
+ BPF_F_TOKEN_FD = (1U << 16),
};
/* Flags for BPF_PROG_QUERY. */
@@ -1344,6 +1444,15 @@ union bpf_attr {
* to using 5 hash functions).
*/
__u64 map_extra;
+
+ __s32 value_type_btf_obj_fd; /* fd pointing to a BTF
+ * type data for
+ * btf_vmlinux_value_type_id.
+ */
+ /* BPF token FD to use with BPF_MAP_CREATE operation.
+ * If provided, map_flags should have BPF_F_TOKEN_FD flag set.
+ */
+ __s32 map_token_fd;
};
struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
@@ -1413,23 +1522,39 @@ union bpf_attr {
* truncated), or smaller (if log buffer wasn't filled completely).
*/
__u32 log_true_size;
+ /* BPF token FD to use with BPF_PROG_LOAD operation.
+ * If provided, prog_flags should have BPF_F_TOKEN_FD flag set.
+ */
+ __s32 prog_token_fd;
};
struct { /* anonymous struct used by BPF_OBJ_* commands */
__aligned_u64 pathname;
__u32 bpf_fd;
__u32 file_flags;
+ /* Same as dirfd in openat() syscall; see openat(2)
+ * manpage for details of path FD and pathname semantics;
+ * path_fd should accompanied by BPF_F_PATH_FD flag set in
+ * file_flags field, otherwise it should be set to zero;
+ * if BPF_F_PATH_FD flag is not set, AT_FDCWD is assumed.
+ */
+ __s32 path_fd;
};
struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */
- __u32 target_fd; /* container object to attach to */
- __u32 attach_bpf_fd; /* eBPF program to attach */
+ union {
+ __u32 target_fd; /* target object to attach to or ... */
+ __u32 target_ifindex; /* target ifindex */
+ };
+ __u32 attach_bpf_fd;
__u32 attach_type;
__u32 attach_flags;
- __u32 replace_bpf_fd; /* previously attached eBPF
- * program to replace if
- * BPF_F_REPLACE is used
- */
+ __u32 replace_bpf_fd;
+ union {
+ __u32 relative_fd;
+ __u32 relative_id;
+ };
+ __u64 expected_revision;
};
struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */
@@ -1475,16 +1600,26 @@ union bpf_attr {
} info;
struct { /* anonymous struct used by BPF_PROG_QUERY command */
- __u32 target_fd; /* container object to query */
+ union {
+ __u32 target_fd; /* target object to query or ... */
+ __u32 target_ifindex; /* target ifindex */
+ };
__u32 attach_type;
__u32 query_flags;
__u32 attach_flags;
__aligned_u64 prog_ids;
- __u32 prog_cnt;
+ union {
+ __u32 prog_cnt;
+ __u32 count;
+ };
+ __u32 :32;
/* output: per-program attach_flags.
* not allowed to be set during effective query.
*/
__aligned_u64 prog_attach_flags;
+ __aligned_u64 link_ids;
+ __aligned_u64 link_attach_flags;
+ __u64 revision;
} query;
struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */
@@ -1503,6 +1638,11 @@ union bpf_attr {
* truncated), or smaller (if log buffer wasn't filled completely).
*/
__u32 btf_log_true_size;
+ __u32 btf_flags;
+ /* BPF token FD to use with BPF_BTF_LOAD operation.
+ * If provided, btf_flags should have BPF_F_TOKEN_FD flag set.
+ */
+ __s32 btf_token_fd;
};
struct {
@@ -1527,13 +1667,13 @@ union bpf_attr {
__u32 map_fd; /* struct_ops to attach */
};
union {
- __u32 target_fd; /* object to attach to */
- __u32 target_ifindex; /* target ifindex */
+ __u32 target_fd; /* target object to attach to or ... */
+ __u32 target_ifindex; /* target ifindex */
};
__u32 attach_type; /* attach type */
__u32 flags; /* extra flags */
union {
- __u32 target_btf_id; /* btf_id of target to attach to */
+ __u32 target_btf_id; /* btf_id of target to attach to */
struct {
__aligned_u64 iter_info; /* extra bpf_iter_link_info */
__u32 iter_info_len; /* iter_info length */
@@ -1561,6 +1701,35 @@ union bpf_attr {
*/
__u64 cookie;
} tracing;
+ struct {
+ __u32 pf;
+ __u32 hooknum;
+ __s32 priority;
+ __u32 flags;
+ } netfilter;
+ struct {
+ union {
+ __u32 relative_fd;
+ __u32 relative_id;
+ };
+ __u64 expected_revision;
+ } tcx;
+ struct {
+ __aligned_u64 path;
+ __aligned_u64 offsets;
+ __aligned_u64 ref_ctr_offsets;
+ __aligned_u64 cookies;
+ __u32 cnt;
+ __u32 flags;
+ __u32 pid;
+ } uprobe_multi;
+ struct {
+ union {
+ __u32 relative_fd;
+ __u32 relative_id;
+ };
+ __u64 expected_revision;
+ } netkit;
};
} link_create;
@@ -1604,6 +1773,11 @@ union bpf_attr {
__u32 flags; /* extra flags */
} prog_bind_map;
+ struct { /* struct used by BPF_TOKEN_CREATE command */
+ __u32 flags;
+ __u32 bpffs_fd;
+ } token_create;
+
} __attribute__((aligned(8)));
/* The description below is an attempt at providing documentation to eBPF
@@ -1879,7 +2053,9 @@ union bpf_attr {
* performed again, if the helper is used in combination with
* direct packet access.
* Return
- * 0 on success, or a negative error in case of failure.
+ * 0 on success, or a negative error in case of failure. Positive
+ * error indicates a potential drop or congestion in the target
+ * device. The particular positive error codes are not defined.
*
* u64 bpf_get_current_pid_tgid(void)
* Description
@@ -2612,8 +2788,8 @@ union bpf_attr {
* *bpf_socket* should be one of the following:
*
* * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**.
- * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**
- * and **BPF_CGROUP_INET6_CONNECT**.
+ * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**,
+ * **BPF_CGROUP_INET6_CONNECT** and **BPF_CGROUP_UNIX_CONNECT**.
*
* This helper actually implements a subset of **setsockopt()**.
* It supports the following *level*\ s:
@@ -2851,8 +3027,8 @@ union bpf_attr {
* *bpf_socket* should be one of the following:
*
* * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**.
- * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**
- * and **BPF_CGROUP_INET6_CONNECT**.
+ * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**,
+ * **BPF_CGROUP_INET6_CONNECT** and **BPF_CGROUP_UNIX_CONNECT**.
*
* This helper actually implements a subset of **getsockopt()**.
* It supports the same set of *optname*\ s that is supported by
@@ -3160,6 +3336,10 @@ union bpf_attr {
* **BPF_FIB_LOOKUP_DIRECT**
* Do a direct table lookup vs full lookup using FIB
* rules.
+ * **BPF_FIB_LOOKUP_TBID**
+ * Used with BPF_FIB_LOOKUP_DIRECT.
+ * Use the routing table ID present in *params*->tbid
+ * for the fib lookup.
* **BPF_FIB_LOOKUP_OUTPUT**
* Perform lookup from an egress perspective (default is
* ingress).
@@ -3168,6 +3348,11 @@ union bpf_attr {
* and *params*->smac will not be set as output. A common
* use case is to call **bpf_redirect_neigh**\ () after
* doing **bpf_fib_lookup**\ ().
+ * **BPF_FIB_LOOKUP_SRC**
+ * Derive and set source IP addr in *params*->ipv{4,6}_src
+ * for the nexthop. If the src addr cannot be derived,
+ * **BPF_FIB_LKUP_RET_NO_SRC_ADDR** is returned. In this
+ * case, *params*->dmac and *params*->smac are not set either.
*
* *ctx* is either **struct xdp_md** for XDP programs or
* **struct sk_buff** tc cls_act programs.
@@ -4137,9 +4322,6 @@ union bpf_attr {
* **-EOPNOTSUPP** if the operation is not supported, for example
* a call from outside of TC ingress.
*
- * **-ESOCKTNOSUPPORT** if the socket type is not supported
- * (reuseport).
- *
* long bpf_sk_assign(struct bpf_sk_lookup *ctx, struct bpf_sock *sk, u64 flags)
* Description
* Helper is overloaded depending on BPF program type. This
@@ -4404,6 +4586,8 @@ union bpf_attr {
* long bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 flags)
* Description
* Return a user or a kernel stack in bpf program provided buffer.
+ * Note: the user stack will only be populated if the *task* is
+ * the current task; all other tasks will return -EOPNOTSUPP.
* To achieve this, the helper needs *task*, which is a valid
* pointer to **struct task_struct**. To store the stacktrace, the
* bpf program provides *buf* with a nonnegative *size*.
@@ -4415,6 +4599,7 @@ union bpf_attr {
*
* **BPF_F_USER_STACK**
* Collect a user space stack instead of a kernel stack.
+ * The *task* must be the current task.
* **BPF_F_USER_BUILD_ID**
* Collect buildid+offset instead of ips for user stack,
* only valid if **BPF_F_USER_STACK** is also specified.
@@ -4718,9 +4903,9 @@ union bpf_attr {
* going through the CPU's backlog queue.
*
* The *flags* argument is reserved and must be 0. The helper is
- * currently only supported for tc BPF program types at the ingress
- * hook and for veth device types. The peer device must reside in a
- * different network namespace.
+ * currently only supported for tc BPF program types at the
+ * ingress hook and for veth and netkit target device types. The
+ * peer device must reside in a different network namespace.
* Return
* The helper returns **TC_ACT_REDIRECT** on success or
* **TC_ACT_SHOT** on error.
@@ -5003,6 +5188,8 @@ union bpf_attr {
* **BPF_F_TIMER_ABS**
* Start the timer in absolute expire value instead of the
* default relative one.
+ * **BPF_F_TIMER_CPU_PIN**
+ * Timer will be pinned to the CPU of the caller.
*
* Return
* 0 on success.
@@ -5022,9 +5209,14 @@ union bpf_attr {
* u64 bpf_get_func_ip(void *ctx)
* Description
* Get address of the traced function (for tracing and kprobe programs).
+ *
+ * When called for kprobe program attached as uprobe it returns
+ * probe address for both entry and return uprobe.
+ *
* Return
- * Address of the traced function.
+ * Address of the traced function for kprobe.
* 0 for kprobes placed within the function (not at the entry).
+ * Address of the probe for uprobe and return uprobe.
*
* u64 bpf_get_attach_cookie(void *ctx)
* Description
@@ -6165,6 +6357,19 @@ struct bpf_sock_tuple {
};
};
+/* (Simplified) user return codes for tcx prog type.
+ * A valid tcx program must return one of these defined values. All other
+ * return codes are reserved for future use. Must remain compatible with
+ * their TC_ACT_* counter-parts. For compatibility in behavior, unknown
+ * return codes are mapped to TCX_NEXT.
+ */
+enum tcx_action_base {
+ TCX_NEXT = -1,
+ TCX_PASS = 0,
+ TCX_DROP = 2,
+ TCX_REDIRECT = 7,
+};
+
struct bpf_xdp_sock {
__u32 queue_id;
};
@@ -6346,7 +6551,7 @@ struct bpf_map_info {
__u32 btf_id;
__u32 btf_key_type_id;
__u32 btf_value_type_id;
- __u32 :32; /* alignment pad */
+ __u32 btf_vmlinux_id;
__u64 map_extra;
} __attribute__((aligned(8)));
@@ -6411,6 +6616,69 @@ struct bpf_link_info {
struct {
__u32 map_id;
} struct_ops;
+ struct {
+ __u32 pf;
+ __u32 hooknum;
+ __s32 priority;
+ __u32 flags;
+ } netfilter;
+ struct {
+ __aligned_u64 addrs;
+ __u32 count; /* in/out: kprobe_multi function count */
+ __u32 flags;
+ __u64 missed;
+ __aligned_u64 cookies;
+ } kprobe_multi;
+ struct {
+ __aligned_u64 path;
+ __aligned_u64 offsets;
+ __aligned_u64 ref_ctr_offsets;
+ __aligned_u64 cookies;
+ __u32 path_size; /* in/out: real path size on success, including zero byte */
+ __u32 count; /* in/out: uprobe_multi offsets/ref_ctr_offsets/cookies count */
+ __u32 flags;
+ __u32 pid;
+ } uprobe_multi;
+ struct {
+ __u32 type; /* enum bpf_perf_event_type */
+ __u32 :32;
+ union {
+ struct {
+ __aligned_u64 file_name; /* in/out */
+ __u32 name_len;
+ __u32 offset; /* offset from file_name */
+ __u64 cookie;
+ } uprobe; /* BPF_PERF_EVENT_UPROBE, BPF_PERF_EVENT_URETPROBE */
+ struct {
+ __aligned_u64 func_name; /* in/out */
+ __u32 name_len;
+ __u32 offset; /* offset from func_name */
+ __u64 addr;
+ __u64 missed;
+ __u64 cookie;
+ } kprobe; /* BPF_PERF_EVENT_KPROBE, BPF_PERF_EVENT_KRETPROBE */
+ struct {
+ __aligned_u64 tp_name; /* in/out */
+ __u32 name_len;
+ __u32 :32;
+ __u64 cookie;
+ } tracepoint; /* BPF_PERF_EVENT_TRACEPOINT */
+ struct {
+ __u64 config;
+ __u32 type;
+ __u32 :32;
+ __u64 cookie;
+ } event; /* BPF_PERF_EVENT_EVENT */
+ };
+ } perf_event;
+ struct {
+ __u32 ifindex;
+ __u32 attach_type;
+ } tcx;
+ struct {
+ __u32 ifindex;
+ __u32 attach_type;
+ } netkit;
};
} __attribute__((aligned(8)));
@@ -6707,6 +6975,7 @@ enum {
BPF_TCP_LISTEN,
BPF_TCP_CLOSING, /* Now a valid state */
BPF_TCP_NEW_SYN_RECV,
+ BPF_TCP_BOUND_INACTIVE,
BPF_TCP_MAX_STATES /* Leave at the end! */
};
@@ -6808,6 +7077,8 @@ enum {
BPF_FIB_LOOKUP_DIRECT = (1U << 0),
BPF_FIB_LOOKUP_OUTPUT = (1U << 1),
BPF_FIB_LOOKUP_SKIP_NEIGH = (1U << 2),
+ BPF_FIB_LOOKUP_TBID = (1U << 3),
+ BPF_FIB_LOOKUP_SRC = (1U << 4),
};
enum {
@@ -6820,6 +7091,7 @@ enum {
BPF_FIB_LKUP_RET_UNSUPP_LWT, /* fwd requires encapsulation */
BPF_FIB_LKUP_RET_NO_NEIGH, /* no neighbor entry for nh */
BPF_FIB_LKUP_RET_FRAG_NEEDED, /* fragmentation required to fwd */
+ BPF_FIB_LKUP_RET_NO_SRC_ADDR, /* failed to derive IP src addr */
};
struct bpf_fib_lookup {
@@ -6854,6 +7126,9 @@ struct bpf_fib_lookup {
__u32 rt_metric;
};
+ /* input: source address to consider for lookup
+ * output: source address result from lookup
+ */
union {
__be32 ipv4_src;
__u32 ipv6_src[4]; /* in6_addr; network order */
@@ -6868,9 +7143,19 @@ struct bpf_fib_lookup {
__u32 ipv6_dst[4]; /* in6_addr; network order */
};
- /* output */
- __be16 h_vlan_proto;
- __be16 h_vlan_TCI;
+ union {
+ struct {
+ /* output */
+ __be16 h_vlan_proto;
+ __be16 h_vlan_TCI;
+ };
+ /* input: when accompanied with the
+ * 'BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_TBID` flags, a
+ * specific routing table to use for the fib lookup.
+ */
+ __u32 tbid;
+ };
+
__u8 smac[6]; /* ETH_ALEN */
__u8 dmac[6]; /* ETH_ALEN */
};
@@ -6956,38 +7241,31 @@ struct bpf_spin_lock {
};
struct bpf_timer {
- __u64 :64;
- __u64 :64;
+ __u64 __opaque[2];
} __attribute__((aligned(8)));
struct bpf_dynptr {
- __u64 :64;
- __u64 :64;
+ __u64 __opaque[2];
} __attribute__((aligned(8)));
struct bpf_list_head {
- __u64 :64;
- __u64 :64;
+ __u64 __opaque[2];
} __attribute__((aligned(8)));
struct bpf_list_node {
- __u64 :64;
- __u64 :64;
+ __u64 __opaque[3];
} __attribute__((aligned(8)));
struct bpf_rb_root {
- __u64 :64;
- __u64 :64;
+ __u64 __opaque[2];
} __attribute__((aligned(8)));
struct bpf_rb_node {
- __u64 :64;
- __u64 :64;
- __u64 :64;
+ __u64 __opaque[4];
} __attribute__((aligned(8)));
struct bpf_refcount {
- __u32 :32;
+ __u32 __opaque[1];
} __attribute__((aligned(4)));
struct bpf_sysctl {
@@ -7143,9 +7421,11 @@ struct bpf_core_relo {
* Flags to control bpf_timer_start() behaviour.
* - BPF_F_TIMER_ABS: Timeout passed is absolute time, by default it is
* relative to current time.
+ * - BPF_F_TIMER_CPU_PIN: Timer will be pinned to the CPU of the caller.
*/
enum {
BPF_F_TIMER_ABS = (1ULL << 0),
+ BPF_F_TIMER_CPU_PIN = (1ULL << 1),
};
/* BPF numbers iterator state */
--
2.43.0
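As a side note on the new uapi above: the token_create struct added to union bpf_attr carries only flags and bpffs_fd, so a caller would drive BPF_TOKEN_CREATE through the raw bpf(2) syscall roughly as follows. This is a hedged sketch, not part of the bcc change; it assumes a linux/bpf.h that already contains the BPF_TOKEN_CREATE additions shown in this patch, and bpf_token_create_fd is a hypothetical helper name.

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Sketch: derive a BPF token from an already-opened BPF FS mount fd. */
static int bpf_token_create_fd(int bpffs_fd)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.token_create.bpffs_fd = bpffs_fd;
	attr.token_create.flags = 0;

	/* Returns a new token fd, or -1 with errno set. */
	return syscall(__NR_bpf, BPF_TOKEN_CREATE, &attr, sizeof(attr));
}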


@@ -1,31 +0,0 @@
From 0973fd70c1c50e57a3db0b09e239b1d1fd3f1c55 Mon Sep 17 00:00:00 2001
From: Jerome Marchand <jmarchan@redhat.com>
Date: Fri, 21 Jul 2023 16:10:18 +0200
Subject: [PATCH] Use bpf_obj_get_info_by_fd() instead of
bpf_btf_get_info_by_fd()
The libbpf version in rawhide doesn't have the typed
bpf_*_get_info_by_fd().
---
src/cc/libbpf.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/cc/libbpf.c b/src/cc/libbpf.c
index 360fd81d..a3e34da2 100644
--- a/src/cc/libbpf.c
+++ b/src/cc/libbpf.c
@@ -727,9 +727,9 @@ static int find_btf_id(const char *module_name, const char *func_name,
info.name = ptr_to_u64(name);
info.name_len = sizeof(name);
- err = bpf_btf_get_info_by_fd(fd, &info, &len);
+ err = bpf_obj_get_info_by_fd(fd, &info, &len);
if (err) {
- fprintf(stderr, "bpf_btf_get_info_by_fd failed: %d\n", err);
+ fprintf(stderr, "bpf_obj_get_info_by_fd failed: %d\n", err);
goto err_out;
}
--
2.41.0
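For context, both spellings resolve to the same kernel query: bpf_btf_get_info_by_fd() is libbpf's typed wrapper around the generic info call, so on libbpf builds that do carry it the two are interchangeable. A hedged sketch of the generic form used by the (now dropped) workaround, reusing identifiers visible in the hunk above:

/* Sketch only: query BTF object info through the untyped libbpf helper.
 * 'fd' stands for the module BTF fd obtained earlier in find_btf_id(). */
struct bpf_btf_info info = {};
__u32 len = sizeof(info);
char name[64] = {};
int err;

info.name = ptr_to_u64(name);   /* ptr_to_u64() is bcc's local helper */
info.name_len = sizeof(name);

err = bpf_obj_get_info_by_fd(fd, &info, &len);
if (err)
	fprintf(stderr, "bpf_obj_get_info_by_fd failed: %d\n", err);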


@@ -10,7 +10,7 @@
%endif
%endif
-%ifarch x86_64 ppc64 ppc64le aarch64
+%ifarch x86_64 ppc64 ppc64le aarch64 s390x
%bcond_without libbpf_tools
%else
%bcond_with libbpf_tools
@@ -24,16 +24,15 @@
Name: bcc
-Version: 0.28.0
-Release: 3%{?dist}
+Version: 0.29.1
+Release: 1%{?dist}
Summary: BPF Compiler Collection (BCC)
License: Apache-2.0
URL: https://github.com/iovisor/bcc
Source0: %{url}/archive/v%{version}/%{name}-%{version}.tar.gz
-Patch0: Use-bpf_obj_get_info_by_fd-instead-of-bpf_btf_get_in.patch
-Patch1: libbpf-tools-add-block_io_-start-done-tracepoints-su.patch
-Patch2: tools-Add-support-for-the-new-block_io_-tracepoints.patch
-Patch3: tool-slabratetop-add-definition-of-freelist_aba_t.patch
+Patch0: libbpf-tools-Fix-bindsnoop-for-kernel-v6.6.patch
+Patch1: Fix-ttysnoop.py-with-newer-kernels.patch
+Patch2: Sync-with-latest-libbpf-repo-4889.patch
# Arches will be included as upstream support is added and dependencies are
# satisfied in the respective arches
@@ -242,6 +241,11 @@ cp -a libbpf-tools/tmp-install/bin/* %{buildroot}/%{_sbindir}/
%endif
%changelog
+* Mon Feb 05 2024 Jerome Marchand <jmarchan@redhat.com> - 0.29.1-1
+- Rebase to the latest release version (#2253688)
+- Enable libbpf-tools on s390x (#2249458)
+- Misc 0.29.1 fixes
* Tue Jan 23 2024 Fedora Release Engineering <releng@fedoraproject.org> - 0.28.0-3
- Rebuilt for https://fedoraproject.org/wiki/Fedora_40_Mass_Rebuild


@@ -0,0 +1,114 @@
From abf7b251c1461dcbe0c1e75d1d0da71662c9fae1 Mon Sep 17 00:00:00 2001
From: Hengqi Chen <hengqi.chen@gmail.com>
Date: Sun, 17 Dec 2023 11:27:10 +0000
Subject: [PATCH] libbpf-tools: Fix bindsnoop for kernel v6.6+
The freebind field in struct inet_sock is gone in recent kernel
versions due to some kernel refactoring work ([0]). The change
breaks the bindsnoop tool. Fix it in a CO-RE way.
This should close #4838.
[0]: https://lore.kernel.org/all/20230816081547.1272409-1-edumazet@google.com/
Signed-off-by: Hengqi Chen <hengqi.chen@gmail.com>
---
libbpf-tools/bindsnoop.bpf.c | 8 +++--
libbpf-tools/core_fixes.bpf.h | 56 +++++++++++++++++++++++++++++++++++
2 files changed, 61 insertions(+), 3 deletions(-)
diff --git a/libbpf-tools/bindsnoop.bpf.c b/libbpf-tools/bindsnoop.bpf.c
index 41dce942..ead19c67 100644
--- a/libbpf-tools/bindsnoop.bpf.c
+++ b/libbpf-tools/bindsnoop.bpf.c
@@ -5,7 +5,9 @@
#include <bpf/bpf_core_read.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_endian.h>
+
#include "bindsnoop.h"
+#include "core_fixes.bpf.h"
#define MAX_ENTRIES 10240
#define MAX_PORTS 1024
@@ -85,9 +87,9 @@ static int probe_exit(struct pt_regs *ctx, short ver)
if (filter_by_port && !port)
goto cleanup;
- opts.fields.freebind = BPF_CORE_READ_BITFIELD_PROBED(inet_sock, freebind);
- opts.fields.transparent = BPF_CORE_READ_BITFIELD_PROBED(inet_sock, transparent);
- opts.fields.bind_address_no_port = BPF_CORE_READ_BITFIELD_PROBED(inet_sock, bind_address_no_port);
+ opts.fields.freebind = get_inet_sock_freebind(inet_sock);
+ opts.fields.transparent = get_inet_sock_transparent(inet_sock);
+ opts.fields.bind_address_no_port = get_inet_sock_bind_address_no_port(inet_sock);
opts.fields.reuseaddress = BPF_CORE_READ_BITFIELD_PROBED(sock, __sk_common.skc_reuse);
opts.fields.reuseport = BPF_CORE_READ_BITFIELD_PROBED(sock, __sk_common.skc_reuseport);
event.opts = opts.data;
diff --git a/libbpf-tools/core_fixes.bpf.h b/libbpf-tools/core_fixes.bpf.h
index 84cb7f18..a4c84c02 100644
--- a/libbpf-tools/core_fixes.bpf.h
+++ b/libbpf-tools/core_fixes.bpf.h
@@ -249,4 +249,60 @@ static __always_inline __u64 get_sock_ident(struct sock *sk)
return (__u64)sk;
}
+/**
+ * During kernel 6.6 development cycle, several bitfields in struct inet_sock gone,
+ * they are placed in inet_sock::inet_flags instead ([0]).
+ *
+ * References:
+ * [0]: https://lore.kernel.org/all/20230816081547.1272409-1-edumazet@google.com/
+ */
+struct inet_sock___o {
+ __u8 freebind: 1;
+ __u8 transparent: 1;
+ __u8 bind_address_no_port: 1;
+};
+
+enum {
+ INET_FLAGS_FREEBIND___x = 11,
+ INET_FLAGS_TRANSPARENT___x = 15,
+ INET_FLAGS_BIND_ADDRESS_NO_PORT___x = 18,
+};
+
+struct inet_sock___x {
+ unsigned long inet_flags;
+};
+
+static __always_inline __u8 get_inet_sock_freebind(void *inet_sock)
+{
+ unsigned long inet_flags;
+
+ if (bpf_core_field_exists(struct inet_sock___o, freebind))
+ return BPF_CORE_READ_BITFIELD_PROBED((struct inet_sock___o *)inet_sock, freebind);
+
+ inet_flags = BPF_CORE_READ((struct inet_sock___x *)inet_sock, inet_flags);
+ return (1 << INET_FLAGS_FREEBIND___x) & inet_flags ? 1 : 0;
+}
+
+static __always_inline __u8 get_inet_sock_transparent(void *inet_sock)
+{
+ unsigned long inet_flags;
+
+ if (bpf_core_field_exists(struct inet_sock___o, transparent))
+ return BPF_CORE_READ_BITFIELD_PROBED((struct inet_sock___o *)inet_sock, transparent);
+
+ inet_flags = BPF_CORE_READ((struct inet_sock___x *)inet_sock, inet_flags);
+ return (1 << INET_FLAGS_TRANSPARENT___x) & inet_flags ? 1 : 0;
+}
+
+static __always_inline __u8 get_inet_sock_bind_address_no_port(void *inet_sock)
+{
+ unsigned long inet_flags;
+
+ if (bpf_core_field_exists(struct inet_sock___o, bind_address_no_port))
+ return BPF_CORE_READ_BITFIELD_PROBED((struct inet_sock___o *)inet_sock, bind_address_no_port);
+
+ inet_flags = BPF_CORE_READ((struct inet_sock___x *)inet_sock, inet_flags);
+ return (1 << INET_FLAGS_BIND_ADDRESS_NO_PORT___x) & inet_flags ? 1 : 0;
+}
+
#endif /* __CORE_FIXES_BPF_H */
--
2.43.0
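The inet_sock___o/inet_sock___x structs above rely on libbpf's CO-RE "flavor" convention: everything from the triple underscore onward is stripped when the struct name is matched against kernel BTF, so both flavors relocate against struct inet_sock, and bpf_core_field_exists() selects whichever layout the running kernel actually has. A compressed, purely illustrative sketch of the same pattern on a made-up struct (foo, my_field and get_my_field are hypothetical, not from the patch), assuming the usual vmlinux.h + bpf_core_read.h includes used by the libbpf-tools:

#include "vmlinux.h"
#include <bpf/bpf_core_read.h>

/* Illustrative only: a field that moved from a standalone member into a
 * flags word between kernel versions, handled with CO-RE flavors. */
struct foo___old { int my_field; };          /* hypothetical old layout */
struct foo___new { unsigned long flags; };   /* hypothetical new layout */

static __always_inline int get_my_field(void *foo)
{
	if (bpf_core_field_exists(struct foo___old, my_field))
		return BPF_CORE_READ((struct foo___old *)foo, my_field);
	return BPF_CORE_READ((struct foo___new *)foo, flags) & 1;
}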


@@ -1,476 +0,0 @@
From e1dfbe2d09583205acca1d1b5b09caefb460f2fd Mon Sep 17 00:00:00 2001
From: mickey_zhu <mickey_zhu@realsil.com.cn>
Date: Tue, 27 Jun 2023 16:32:44 +0800
Subject: [PATCH 1/2] libbpf-tools: add block_io_{start,done} tracepoints
support to bio tools
Some bio tools fail to kprobe blk_account_io_{start,done} after v5.17,
because those functions became inlined, see [0]. To fix this issue, the
block_io_{start,done} tracepoints were introduced in the kernel, see [1].
Update the related bio tools to support the new tracepoints, and also
simplify attach.
[0] Kernel commit 450b7879e345 (block: move blk_account_io_{start,done} to blk-mq.c)
[1] Kernel commit 5a80bd075f3b (block: introduce block_io_start/block_io_done tracepoints)
Change-Id: I62b957abd7ce2901eb114bd57c78938e4f083e4d
Signed-off-by: Mickey Zhu <mickey_zhu@realsil.com.cn>
---
libbpf-tools/biosnoop.bpf.c | 9 ++++
libbpf-tools/biosnoop.c | 78 +++++++++++++--------------------
libbpf-tools/biostacks.bpf.c | 46 +++++++++++++------
libbpf-tools/biostacks.c | 85 +++++++++++++++++++++---------------
libbpf-tools/biotop.bpf.c | 44 +++++++++++++++++--
libbpf-tools/biotop.c | 59 ++++++++++++++++---------
6 files changed, 199 insertions(+), 122 deletions(-)
diff --git a/libbpf-tools/biosnoop.bpf.c b/libbpf-tools/biosnoop.bpf.c
index b791555f..fcc5c5ce 100644
--- a/libbpf-tools/biosnoop.bpf.c
+++ b/libbpf-tools/biosnoop.bpf.c
@@ -76,6 +76,15 @@ int BPF_PROG(blk_account_io_start, struct request *rq)
return trace_pid(rq);
}
+SEC("tp_btf/block_io_start")
+int BPF_PROG(block_io_start, struct request *rq)
+{
+ if (filter_cg && !bpf_current_task_under_cgroup(&cgroup_map, 0))
+ return 0;
+
+ return trace_pid(rq);
+}
+
SEC("kprobe/blk_account_io_merge_bio")
int BPF_KPROBE(blk_account_io_merge_bio, struct request *rq)
{
diff --git a/libbpf-tools/biosnoop.c b/libbpf-tools/biosnoop.c
index 21773729..f9468900 100644
--- a/libbpf-tools/biosnoop.c
+++ b/libbpf-tools/biosnoop.c
@@ -212,6 +212,16 @@ void handle_lost_events(void *ctx, int cpu, __u64 lost_cnt)
fprintf(stderr, "lost %llu events on CPU #%d\n", lost_cnt, cpu);
}
+static void blk_account_io_set_attach_target(struct biosnoop_bpf *obj)
+{
+ if (fentry_can_attach("blk_account_io_start", NULL))
+ bpf_program__set_attach_target(obj->progs.blk_account_io_start,
+ 0, "blk_account_io_start");
+ else
+ bpf_program__set_attach_target(obj->progs.blk_account_io_start,
+ 0, "__blk_account_io_start");
+}
+
int main(int argc, char **argv)
{
const struct partition *partition;
@@ -260,12 +270,23 @@ int main(int argc, char **argv)
obj->rodata->filter_cg = env.cg;
obj->rodata->min_ns = env.min_lat_ms * 1000000;
- if (fentry_can_attach("blk_account_io_start", NULL))
- bpf_program__set_attach_target(obj->progs.blk_account_io_start, 0,
- "blk_account_io_start");
- else
- bpf_program__set_attach_target(obj->progs.blk_account_io_start, 0,
- "__blk_account_io_start");
+ if (tracepoint_exists("block", "block_io_start"))
+ bpf_program__set_autoload(obj->progs.blk_account_io_start, false);
+ else {
+ bpf_program__set_autoload(obj->progs.block_io_start, false);
+ blk_account_io_set_attach_target(obj);
+ }
+
+ ksyms = ksyms__load();
+ if (!ksyms) {
+ fprintf(stderr, "failed to load kallsyms\n");
+ goto cleanup;
+ }
+ if (!ksyms__get_symbol(ksyms, "blk_account_io_merge_bio"))
+ bpf_program__set_autoload(obj->progs.blk_account_io_merge_bio, false);
+
+ if (!env.queued)
+ bpf_program__set_autoload(obj->progs.block_rq_insert, false);
err = biosnoop_bpf__load(obj);
if (err) {
@@ -288,48 +309,9 @@ int main(int argc, char **argv)
}
}
- obj->links.blk_account_io_start = bpf_program__attach(obj->progs.blk_account_io_start);
- if (!obj->links.blk_account_io_start) {
- err = -errno;
- fprintf(stderr, "failed to attach blk_account_io_start: %s\n",
- strerror(-err));
- goto cleanup;
- }
- ksyms = ksyms__load();
- if (!ksyms) {
- err = -ENOMEM;
- fprintf(stderr, "failed to load kallsyms\n");
- goto cleanup;
- }
- if (ksyms__get_symbol(ksyms, "blk_account_io_merge_bio")) {
- obj->links.blk_account_io_merge_bio =
- bpf_program__attach(obj->progs.blk_account_io_merge_bio);
- if (!obj->links.blk_account_io_merge_bio) {
- err = -errno;
- fprintf(stderr, "failed to attach blk_account_io_merge_bio: %s\n",
- strerror(-err));
- goto cleanup;
- }
- }
- if (env.queued) {
- obj->links.block_rq_insert =
- bpf_program__attach(obj->progs.block_rq_insert);
- if (!obj->links.block_rq_insert) {
- err = -errno;
- fprintf(stderr, "failed to attach block_rq_insert: %s\n", strerror(-err));
- goto cleanup;
- }
- }
- obj->links.block_rq_issue = bpf_program__attach(obj->progs.block_rq_issue);
- if (!obj->links.block_rq_issue) {
- err = -errno;
- fprintf(stderr, "failed to attach block_rq_issue: %s\n", strerror(-err));
- goto cleanup;
- }
- obj->links.block_rq_complete = bpf_program__attach(obj->progs.block_rq_complete);
- if (!obj->links.block_rq_complete) {
- err = -errno;
- fprintf(stderr, "failed to attach block_rq_complete: %s\n", strerror(-err));
+ err = biosnoop_bpf__attach(obj);
+ if (err) {
+ fprintf(stderr, "failed to attach BPF programs: %d\n", err);
goto cleanup;
}
diff --git a/libbpf-tools/biostacks.bpf.c b/libbpf-tools/biostacks.bpf.c
index c3950910..0ca69880 100644
--- a/libbpf-tools/biostacks.bpf.c
+++ b/libbpf-tools/biostacks.bpf.c
@@ -67,20 +67,8 @@ int trace_start(void *ctx, struct request *rq, bool merge_bio)
return 0;
}
-SEC("fentry/blk_account_io_start")
-int BPF_PROG(blk_account_io_start, struct request *rq)
-{
- return trace_start(ctx, rq, false);
-}
-
-SEC("kprobe/blk_account_io_merge_bio")
-int BPF_KPROBE(blk_account_io_merge_bio, struct request *rq)
-{
- return trace_start(ctx, rq, true);
-}
-
-SEC("fentry/blk_account_io_done")
-int BPF_PROG(blk_account_io_done, struct request *rq)
+static __always_inline
+int trace_done(void *ctx, struct request *rq)
{
u64 slot, ts = bpf_ktime_get_ns();
struct internal_rqinfo *i_rqinfop;
@@ -110,4 +98,34 @@ int BPF_PROG(blk_account_io_done, struct request *rq)
return 0;
}
+SEC("kprobe/blk_account_io_merge_bio")
+int BPF_KPROBE(blk_account_io_merge_bio, struct request *rq)
+{
+ return trace_start(ctx, rq, true);
+}
+
+SEC("fentry/blk_account_io_start")
+int BPF_PROG(blk_account_io_start, struct request *rq)
+{
+ return trace_start(ctx, rq, false);
+}
+
+SEC("fentry/blk_account_io_done")
+int BPF_PROG(blk_account_io_done, struct request *rq)
+{
+ return trace_done(ctx, rq);
+}
+
+SEC("tp_btf/block_io_start")
+int BPF_PROG(block_io_start, struct request *rq)
+{
+ return trace_start(ctx, rq, false);
+}
+
+SEC("tp_btf/block_io_done")
+int BPF_PROG(block_io_done, struct request *rq)
+{
+ return trace_done(ctx, rq);
+}
+
char LICENSE[] SEC("license") = "GPL";
diff --git a/libbpf-tools/biostacks.c b/libbpf-tools/biostacks.c
index e1878d1f..e7875f76 100644
--- a/libbpf-tools/biostacks.c
+++ b/libbpf-tools/biostacks.c
@@ -128,6 +128,39 @@ void print_map(struct ksyms *ksyms, struct partitions *partitions, int fd)
return;
}
+static bool has_block_io_tracepoints(void)
+{
+ return tracepoint_exists("block", "block_io_start") &&
+ tracepoint_exists("block", "block_io_done");
+}
+
+static void disable_block_io_tracepoints(struct biostacks_bpf *obj)
+{
+ bpf_program__set_autoload(obj->progs.block_io_start, false);
+ bpf_program__set_autoload(obj->progs.block_io_done, false);
+}
+
+static void disable_blk_account_io_fentry(struct biostacks_bpf *obj)
+{
+ bpf_program__set_autoload(obj->progs.blk_account_io_start, false);
+ bpf_program__set_autoload(obj->progs.blk_account_io_done, false);
+}
+
+static void blk_account_io_set_attach_target(struct biostacks_bpf *obj)
+{
+ if (fentry_can_attach("blk_account_io_start", NULL)) {
+ bpf_program__set_attach_target(obj->progs.blk_account_io_start,
+ 0, "blk_account_io_start");
+ bpf_program__set_attach_target(obj->progs.blk_account_io_done,
+ 0, "blk_account_io_done");
+ } else {
+ bpf_program__set_attach_target(obj->progs.blk_account_io_start,
+ 0, "__blk_account_io_start");
+ bpf_program__set_attach_target(obj->progs.blk_account_io_done,
+ 0, "__blk_account_io_done");
+ }
+}
+
int main(int argc, char **argv)
{
struct partitions *partitions = NULL;
@@ -172,50 +205,30 @@ int main(int argc, char **argv)
obj->rodata->targ_ms = env.milliseconds;
- if (fentry_can_attach("blk_account_io_start", NULL)) {
- bpf_program__set_attach_target(obj->progs.blk_account_io_start, 0,
- "blk_account_io_start");
- bpf_program__set_attach_target(obj->progs.blk_account_io_done, 0,
- "blk_account_io_done");
- } else {
- bpf_program__set_attach_target(obj->progs.blk_account_io_start, 0,
- "__blk_account_io_start");
- bpf_program__set_attach_target(obj->progs.blk_account_io_done, 0,
- "__blk_account_io_done");
- }
-
- err = biostacks_bpf__load(obj);
- if (err) {
- fprintf(stderr, "failed to load BPF object: %d\n", err);
- goto cleanup;
+ if (has_block_io_tracepoints())
+ disable_blk_account_io_fentry(obj);
+ else {
+ disable_block_io_tracepoints(obj);
+ blk_account_io_set_attach_target(obj);
}
- obj->links.blk_account_io_start = bpf_program__attach(obj->progs.blk_account_io_start);
- if (!obj->links.blk_account_io_start) {
- err = -errno;
- fprintf(stderr, "failed to attach blk_account_io_start: %s\n", strerror(-err));
- goto cleanup;
- }
ksyms = ksyms__load();
if (!ksyms) {
fprintf(stderr, "failed to load kallsyms\n");
goto cleanup;
}
- if (ksyms__get_symbol(ksyms, "blk_account_io_merge_bio")) {
- obj->links.blk_account_io_merge_bio =
- bpf_program__attach(obj->progs.blk_account_io_merge_bio);
- if (!obj->links.blk_account_io_merge_bio) {
- err = -errno;
- fprintf(stderr, "failed to attach blk_account_io_merge_bio: %s\n",
- strerror(-err));
- goto cleanup;
- }
+ if (!ksyms__get_symbol(ksyms, "blk_account_io_merge_bio"))
+ bpf_program__set_autoload(obj->progs.blk_account_io_merge_bio, false);
+
+ err = biostacks_bpf__load(obj);
+ if (err) {
+ fprintf(stderr, "failed to load BPF object: %d\n", err);
+ goto cleanup;
}
- obj->links.blk_account_io_done = bpf_program__attach(obj->progs.blk_account_io_done);
- if (!obj->links.blk_account_io_done) {
- err = -errno;
- fprintf(stderr, "failed to attach blk_account_io_done: %s\n",
- strerror(-err));
+
+ err = biostacks_bpf__attach(obj);
+ if (err) {
+ fprintf(stderr, "failed to attach BPF programs: %d\n", err);
goto cleanup;
}
diff --git a/libbpf-tools/biotop.bpf.c b/libbpf-tools/biotop.bpf.c
index 226e32d3..07631378 100644
--- a/libbpf-tools/biotop.bpf.c
+++ b/libbpf-tools/biotop.bpf.c
@@ -30,8 +30,8 @@ struct {
__type(value, struct val_t);
} counts SEC(".maps");
-SEC("kprobe")
-int BPF_KPROBE(blk_account_io_start, struct request *req)
+static __always_inline
+int trace_start(struct request *req)
{
struct who_t who = {};
@@ -56,8 +56,8 @@ int BPF_KPROBE(blk_mq_start_request, struct request *req)
return 0;
}
-SEC("kprobe")
-int BPF_KPROBE(blk_account_io_done, struct request *req, u64 now)
+static __always_inline
+int trace_done(struct request *req)
{
struct val_t *valp, zero = {};
struct info_t info = {};
@@ -103,4 +103,40 @@ int BPF_KPROBE(blk_account_io_done, struct request *req, u64 now)
return 0;
}
+SEC("kprobe/blk_account_io_start")
+int BPF_KPROBE(blk_account_io_start, struct request *req)
+{
+ return trace_start(req);
+}
+
+SEC("kprobe/blk_account_io_done")
+int BPF_KPROBE(blk_account_io_done, struct request *req)
+{
+ return trace_done(req);
+}
+
+SEC("kprobe/__blk_account_io_start")
+int BPF_KPROBE(__blk_account_io_start, struct request *req)
+{
+ return trace_start(req);
+}
+
+SEC("kprobe/__blk_account_io_done")
+int BPF_KPROBE(__blk_account_io_done, struct request *req)
+{
+ return trace_done(req);
+}
+
+SEC("tp_btf/block_io_start")
+int BPF_PROG(block_io_start, struct request *req)
+{
+ return trace_start(req);
+}
+
+SEC("tp_btf/block_io_done")
+int BPF_PROG(block_io_done, struct request *req)
+{
+ return trace_done(req);
+}
+
char LICENSE[] SEC("license") = "GPL";
diff --git a/libbpf-tools/biotop.c b/libbpf-tools/biotop.c
index 75484281..5b3a7cf3 100644
--- a/libbpf-tools/biotop.c
+++ b/libbpf-tools/biotop.c
@@ -354,6 +354,38 @@ static int print_stat(struct biotop_bpf *obj)
return err;
}
+static bool has_block_io_tracepoints(void)
+{
+ return tracepoint_exists("block", "block_io_start") &&
+ tracepoint_exists("block", "block_io_done");
+}
+
+static void disable_block_io_tracepoints(struct biotop_bpf *obj)
+{
+ bpf_program__set_autoload(obj->progs.block_io_start, false);
+ bpf_program__set_autoload(obj->progs.block_io_done, false);
+}
+
+static void disable_blk_account_io_kprobes(struct biotop_bpf *obj)
+{
+ bpf_program__set_autoload(obj->progs.blk_account_io_start, false);
+ bpf_program__set_autoload(obj->progs.blk_account_io_done, false);
+ bpf_program__set_autoload(obj->progs.__blk_account_io_start, false);
+ bpf_program__set_autoload(obj->progs.__blk_account_io_done, false);
+}
+
+static void blk_account_io_set_autoload(struct biotop_bpf *obj,
+ struct ksyms *ksyms)
+{
+ if (!ksyms__get_symbol(ksyms, "__blk_account_io_start")) {
+ bpf_program__set_autoload(obj->progs.__blk_account_io_start, false);
+ bpf_program__set_autoload(obj->progs.__blk_account_io_done, false);
+ } else {
+ bpf_program__set_autoload(obj->progs.blk_account_io_start, false);
+ bpf_program__set_autoload(obj->progs.blk_account_io_done, false);
+ }
+}
+
int main(int argc, char **argv)
{
static const struct argp argp = {
@@ -386,32 +418,19 @@ int main(int argc, char **argv)
goto cleanup;
}
+ if (has_block_io_tracepoints())
+ disable_blk_account_io_kprobes(obj);
+ else {
+ disable_block_io_tracepoints(obj);
+ blk_account_io_set_autoload(obj, ksyms);
+ }
+
err = biotop_bpf__load(obj);
if (err) {
warn("failed to load BPF object: %d\n", err);
goto cleanup;
}
- if (ksyms__get_symbol(ksyms, "__blk_account_io_start"))
- obj->links.blk_account_io_start = bpf_program__attach_kprobe(obj->progs.blk_account_io_start, false, "__blk_account_io_start");
- else
- obj->links.blk_account_io_start = bpf_program__attach_kprobe(obj->progs.blk_account_io_start, false, "blk_account_io_start");
-
- if (!obj->links.blk_account_io_start) {
- warn("failed to load attach blk_account_io_start\n");
- goto cleanup;
- }
-
- if (ksyms__get_symbol(ksyms, "__blk_account_io_done"))
- obj->links.blk_account_io_done = bpf_program__attach_kprobe(obj->progs.blk_account_io_done, false, "__blk_account_io_done");
- else
- obj->links.blk_account_io_done = bpf_program__attach_kprobe(obj->progs.blk_account_io_done, false, "blk_account_io_done");
-
- if (!obj->links.blk_account_io_done) {
- warn("failed to load attach blk_account_io_done\n");
- goto cleanup;
- }
-
err = biotop_bpf__attach(obj);
if (err) {
warn("failed to attach BPF programs: %d\n", err);
--
2.41.0
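The runtime selection this (now upstream) patch introduces boils down to one pattern shared by the three tools: probe for the block_io_* tracepoints, disable whichever program variants are not usable before loading the skeleton, and let the skeleton's attach call handle the rest. A condensed sketch of that flow using the biostacks names from the diff above (load_and_attach is a hypothetical wrapper, error handling trimmed):

/* Condensed sketch of the selection logic shown in the biostacks diff. */
static int load_and_attach(struct biostacks_bpf *obj)
{
	int err;

	if (tracepoint_exists("block", "block_io_start") &&
	    tracepoint_exists("block", "block_io_done")) {
		/* tp_btf programs are usable; drop the fentry fallbacks */
		bpf_program__set_autoload(obj->progs.blk_account_io_start, false);
		bpf_program__set_autoload(obj->progs.blk_account_io_done, false);
	} else {
		bpf_program__set_autoload(obj->progs.block_io_start, false);
		bpf_program__set_autoload(obj->progs.block_io_done, false);
		blk_account_io_set_attach_target(obj);  /* (__)blk_account_io_* */
	}

	err = biostacks_bpf__load(obj);
	return err ?: biostacks_bpf__attach(obj);
}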


@@ -1 +1 @@
-SHA512 (bcc-0.28.0.tar.gz) = 792ce93dba64b1f87390b2602dcaeba04ac8b2863652b06eb9a907b93bc6137a944b856cc6fa9c7a38671c89814740967561ca4f3b29c267babca7dc5e78aa02
+SHA512 (bcc-0.29.1.tar.gz) = 9e60130ea602e19e6c6f88a8c17023cea5daf4c5bcc7af8816e9f5c662341136eb449a3fdf870ffad215495ac3bf895115c0d968d92ce79ebe2899b3e2464d24


@@ -1,55 +0,0 @@
From 59a1fccfc78482af189150b7937b21244f34e48a Mon Sep 17 00:00:00 2001
From: Jerome Marchand <jmarchan@redhat.com>
Date: Thu, 3 Aug 2023 16:11:50 +0200
Subject: [PATCH] tool/slabratetop: add definition of freelist_aba_t
With recent kernels containing commit 6801be4f2653 ("slub: Replace
cmpxchg_double()"), slabratetop fails to compile with the following
error:
In file included from /virtual/main.c:86:
include/linux/slub_def.h:56:3: error: unknown type name 'freelist_aba_t'
freelist_aba_t freelist_tid;
^
2 warnings and 1 error generated.
Traceback (most recent call last):
File "/usr/share/bcc/tools/slabratetop", line 187, in <module>
b = BPF(text=bpf_text)
^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.12/site-packages/bcc/__init__.py", line 479, in __init__
raise Exception("Failed to compile BPF module %s" % (src_file or "<text>"))
Exception: Failed to compile BPF module <text>
Adding the definition of freelist_aba_t fixes the issue.
---
tools/slabratetop.py | 14 ++++++++++++++
1 file changed, 14 insertions(+)
diff --git a/tools/slabratetop.py b/tools/slabratetop.py
index 8fbcac5e..8a7d486e 100755
--- a/tools/slabratetop.py
+++ b/tools/slabratetop.py
@@ -141,6 +141,20 @@ static inline void *slab_address(const struct slab *slab)
return NULL;
}
+#ifdef CONFIG_64BIT
+typedef __uint128_t freelist_full_t;
+#else
+typedef u64 freelist_full_t;
+#endif
+
+typedef union {
+ struct {
+ void *freelist;
+ unsigned long counter;
+ };
+ freelist_full_t full;
+} freelist_aba_t;
+
#ifdef CONFIG_SLUB
#include <linux/slub_def.h>
#else
--
2.41.0


@@ -1,855 +0,0 @@
From 53ef33b5ad42e6a4baa37821119199f2d846beff Mon Sep 17 00:00:00 2001
From: Jerome Marchand <jmarchan@redhat.com>
Date: Thu, 27 Jul 2023 18:19:18 +0200
Subject: [PATCH 2/2] tools: Add support for the new block_io_* tracepoints
The bio tools currently depend on the blk_account_io_done/start
functions, which can be inlined. To fix that, a couple of tracepoints
have been added upstream (block:block_io_start/done). This patch adds
support for those tracepoints when they are available.
Unfortunately, the bio tools rely on data that is not available to
the tracepoints (mostly the struct request). So the tracepoints can't
be used as a drop-in replacement for blk_account_io_*. The main
difference is that we can't use the struct request as the hash key
anymore, so the tools now use the pair (dev_t, sector) for that
purpose.
For the biolatency tool, the -F option is disabled when only the
tracepoints are available because the flags are not all accessible
from the tracepoints. Otherwise, all features of the tools should
remain.
Closes #4261
Signed-off-by: Jerome Marchand <jmarchan@redhat.com>
---
tools/biolatency.py | 166 ++++++++++++++++++++++++++++--------
tools/biosnoop.py | 200 +++++++++++++++++++++++++++++++++-----------
tools/biotop.py | 108 +++++++++++++++++++-----
3 files changed, 371 insertions(+), 103 deletions(-)
diff --git a/tools/biolatency.py b/tools/biolatency.py
index 8fe43a7c..03b48a4c 100755
--- a/tools/biolatency.py
+++ b/tools/biolatency.py
@@ -11,6 +11,7 @@
#
# 20-Sep-2015 Brendan Gregg Created this.
# 31-Mar-2022 Rocky Xing Added disk filter support.
+# 01-Aug-2023 Jerome Marchand Added support for block tracepoints
from __future__ import print_function
from bcc import BPF
@@ -72,7 +73,7 @@ bpf_text = """
#include <linux/blk-mq.h>
typedef struct disk_key {
- char disk[DISK_NAME_LEN];
+ dev_t dev;
u64 slot;
} disk_key_t;
@@ -86,26 +87,70 @@ typedef struct ext_val {
u64 count;
} ext_val_t;
-BPF_HASH(start, struct request *);
+struct tp_args {
+ u64 __unused__;
+ dev_t dev;
+ sector_t sector;
+ unsigned int nr_sector;
+ unsigned int bytes;
+ char rwbs[8];
+ char comm[16];
+ char cmd[];
+};
+
+struct start_key {
+ dev_t dev;
+ u32 _pad;
+ sector_t sector;
+ CMD_FLAGS
+};
+
+BPF_HASH(start, struct start_key);
STORAGE
+static dev_t ddevt(struct gendisk *disk) {
+ return (disk->major << 20) | disk->first_minor;
+}
+
// time block I/O
-int trace_req_start(struct pt_regs *ctx, struct request *req)
+static int __trace_req_start(struct start_key key)
{
DISK_FILTER
u64 ts = bpf_ktime_get_ns();
- start.update(&req, &ts);
+ start.update(&key, &ts);
return 0;
}
+int trace_req_start(struct pt_regs *ctx, struct request *req)
+{
+ struct start_key key = {
+ .dev = ddevt(req->__RQ_DISK__),
+ .sector = req->__sector
+ };
+
+ SET_FLAGS
+
+ return __trace_req_start(key);
+}
+
+int trace_req_start_tp(struct tp_args *args)
+{
+ struct start_key key = {
+ .dev = args->dev,
+ .sector = args->sector
+ };
+
+ return __trace_req_start(key);
+}
+
// output
-int trace_req_done(struct pt_regs *ctx, struct request *req)
+static int __trace_req_done(struct start_key key)
{
u64 *tsp, delta;
// fetch timestamp and calculate delta
- tsp = start.lookup(&req);
+ tsp = start.lookup(&key);
if (tsp == 0) {
return 0; // missed issue
}
@@ -116,9 +161,31 @@ int trace_req_done(struct pt_regs *ctx, struct request *req)
// store as histogram
STORE
- start.delete(&req);
+ start.delete(&key);
return 0;
}
+
+int trace_req_done(struct pt_regs *ctx, struct request *req)
+{
+ struct start_key key = {
+ .dev = ddevt(req->__RQ_DISK__),
+ .sector = req->__sector
+ };
+
+ SET_FLAGS
+
+ return __trace_req_done(key);
+}
+
+int trace_req_done_tp(struct tp_args *args)
+{
+ struct start_key key = {
+ .dev = args->dev,
+ .sector = args->sector
+ };
+
+ return __trace_req_done(key);
+}
"""
# code substitutions
@@ -134,21 +201,18 @@ store_str = ""
if args.disks:
storage_str += "BPF_HISTOGRAM(dist, disk_key_t);"
disks_str = """
- disk_key_t key = {.slot = bpf_log2l(delta)};
- void *__tmp = (void *)req->__RQ_DISK__->disk_name;
- bpf_probe_read(&key.disk, sizeof(key.disk), __tmp);
- dist.atomic_increment(key);
+ disk_key_t dkey = {};
+ dkey.dev = key.dev;
+ dkey.slot = bpf_log2l(delta);
+ dist.atomic_increment(dkey);
"""
- if BPF.kernel_struct_has_field(b'request', b'rq_disk') == 1:
- store_str += disks_str.replace('__RQ_DISK__', 'rq_disk')
- else:
- store_str += disks_str.replace('__RQ_DISK__', 'q->disk')
+ store_str += disks_str
elif args.flags:
storage_str += "BPF_HISTOGRAM(dist, flag_key_t);"
store_str += """
- flag_key_t key = {.slot = bpf_log2l(delta)};
- key.flags = req->cmd_flags;
- dist.atomic_increment(key);
+ flag_key_t fkey = {.slot = bpf_log2l(delta)};
+ fkey.flags = key.flags;
+ dist.atomic_increment(fkey);
"""
else:
storage_str += "BPF_HISTOGRAM(dist);"
@@ -161,21 +225,13 @@ store_str = ""
exit(1)
stat_info = os.stat(disk_path)
- major = os.major(stat_info.st_rdev)
- minor = os.minor(stat_info.st_rdev)
-
- disk_field_str = ""
- if BPF.kernel_struct_has_field(b'request', b'rq_disk') == 1:
- disk_field_str = 'req->rq_disk'
- else:
- disk_field_str = 'req->q->disk'
+ dev = os.major(stat_info.st_rdev) << 20 | os.minor(stat_info.st_rdev)
disk_filter_str = """
- struct gendisk *disk = %s;
- if (!(disk->major == %d && disk->first_minor == %d)) {
+ if(key.dev != %s) {
return 0;
}
- """ % (disk_field_str, major, minor)
+ """ % (dev)
bpf_text = bpf_text.replace('DISK_FILTER', disk_filter_str)
else:
@@ -194,6 +250,16 @@ store_str = ""
bpf_text = bpf_text.replace("STORAGE", storage_str)
bpf_text = bpf_text.replace("STORE", store_str)
+if BPF.kernel_struct_has_field(b'request', b'rq_disk') == 1:
+ bpf_text = bpf_text.replace('__RQ_DISK__', 'rq_disk')
+else:
+ bpf_text = bpf_text.replace('__RQ_DISK__', 'q->disk')
+if args.flags:
+ bpf_text = bpf_text.replace('CMD_FLAGS', 'u64 flags;')
+ bpf_text = bpf_text.replace('SET_FLAGS', 'key.flags = req->cmd_flags;')
+else:
+ bpf_text = bpf_text.replace('CMD_FLAGS', '')
+ bpf_text = bpf_text.replace('SET_FLAGS', '')
if debug or args.ebpf:
print(bpf_text)
@@ -205,25 +271,53 @@ b = BPF(text=bpf_text)
if args.queued:
if BPF.get_kprobe_functions(b'__blk_account_io_start'):
b.attach_kprobe(event="__blk_account_io_start", fn_name="trace_req_start")
- else:
+ elif BPF.get_kprobe_functions(b'blk_account_io_start'):
b.attach_kprobe(event="blk_account_io_start", fn_name="trace_req_start")
+ else:
+ if args.flags:
+ # Some flags are accessible in the rwbs field (RAHEAD, SYNC and META)
+ # but other aren't. Disable the -F option for tracepoint for now.
+ print("ERROR: blk_account_io_start probe not available. Can't use -F.")
+ exit()
+ b.attach_tracepoint(tp="block:block_io_start", fn_name="trace_req_start_tp")
else:
if BPF.get_kprobe_functions(b'blk_start_request'):
b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start")
+
if BPF.get_kprobe_functions(b'__blk_account_io_done'):
b.attach_kprobe(event="__blk_account_io_done", fn_name="trace_req_done")
-else:
+elif BPF.get_kprobe_functions(b'blk_account_io_done'):
b.attach_kprobe(event="blk_account_io_done", fn_name="trace_req_done")
+else:
+ if args.flags:
+ print("ERROR: blk_account_io_done probe not available. Can't use -F.")
+ exit()
+ b.attach_tracepoint(tp="block:block_io_done", fn_name="trace_req_done_tp")
+
if not args.json:
print("Tracing block device I/O... Hit Ctrl-C to end.")
-def disk_print(s):
- disk = s.decode('utf-8', 'replace')
- if not disk:
- disk = "<unknown>"
- return disk
+# cache disk major,minor -> diskname
+diskstats = "/proc/diskstats"
+disklookup = {}
+with open(diskstats) as stats:
+ for line in stats:
+ a = line.split()
+ disklookup[a[0] + "," + a[1]] = a[2]
+
+def disk_print(d):
+ major = d >> 20
+ minor = d & ((1 << 20) - 1)
+
+ disk = str(major) + "," + str(minor)
+ if disk in disklookup:
+ diskname = disklookup[disk]
+ else:
+ diskname = "?"
+
+ return diskname
# see blk_fill_rwbs():
req_opf = {
diff --git a/tools/biosnoop.py b/tools/biosnoop.py
index 33703233..f0fef98b 100755
--- a/tools/biosnoop.py
+++ b/tools/biosnoop.py
@@ -14,6 +14,7 @@
# 11-Feb-2016 Allan McAleavy updated for BPF_PERF_OUTPUT
# 21-Jun-2022 Rocky Xing Added disk filter support.
# 13-Oct-2022 Rocky Xing Added support for displaying block I/O pattern.
+# 01-Aug-2023 Jerome Marchand Added support for block tracepoints
from __future__ import print_function
from bcc import BPF
@@ -64,6 +65,24 @@ struct val_t {
char name[TASK_COMM_LEN];
};
+struct tp_args {
+ u64 __unused__;
+ dev_t dev;
+ sector_t sector;
+ unsigned int nr_sector;
+ unsigned int bytes;
+ char rwbs[8];
+ char comm[16];
+ char cmd[];
+};
+
+struct hash_key {
+ dev_t dev;
+ u32 rwflag;
+ sector_t sector;
+};
+
+
#ifdef INCLUDE_PATTERN
struct sector_key_t {
u32 dev_major;
@@ -79,6 +98,7 @@ enum bio_pattern {
struct data_t {
u32 pid;
+ u32 dev;
u64 rwflag;
u64 delta;
u64 qdelta;
@@ -88,7 +108,6 @@ struct data_t {
enum bio_pattern pattern;
#endif
u64 ts;
- char disk_name[DISK_NAME_LEN];
char name[TASK_COMM_LEN];
};
@@ -96,12 +115,45 @@ struct data_t {
BPF_HASH(last_sectors, struct sector_key_t, u64);
#endif
-BPF_HASH(start, struct request *, struct start_req_t);
-BPF_HASH(infobyreq, struct request *, struct val_t);
+BPF_HASH(start, struct hash_key, struct start_req_t);
+BPF_HASH(infobyreq, struct hash_key, struct val_t);
BPF_PERF_OUTPUT(events);
+static dev_t ddevt(struct gendisk *disk) {
+ return (disk->major << 20) | disk->first_minor;
+}
+
+/*
+ * The following deals with a kernel version change (in mainline 4.7, although
+ * it may be backported to earlier kernels) with how block request write flags
+ * are tested. We handle both pre- and post-change versions here. Please avoid
+ * kernel version tests like this as much as possible: they inflate the code,
+ * test, and maintenance burden.
+ */
+static int get_rwflag(u32 cmd_flags) {
+#ifdef REQ_WRITE
+ return !!(cmd_flags & REQ_WRITE);
+#elif defined(REQ_OP_SHIFT)
+ return !!((cmd_flags >> REQ_OP_SHIFT) == REQ_OP_WRITE);
+#else
+ return !!((cmd_flags & REQ_OP_MASK) == REQ_OP_WRITE);
+#endif
+}
+
+#define RWBS_LEN 8
+
+static int get_rwflag_tp(char *rwbs) {
+ for (int i = 0; i < RWBS_LEN; i++) {
+ if (rwbs[i] == 'W')
+ return 1;
+ if (rwbs[i] == '\\0')
+ return 0;
+ }
+ return 0;
+}
+
// cache PID and comm by-req
-int trace_pid_start(struct pt_regs *ctx, struct request *req)
+static int __trace_pid_start(struct hash_key key)
{
DISK_FILTER
@@ -113,47 +165,76 @@ int trace_pid_start(struct pt_regs *ctx, struct request *req)
if (##QUEUE##) {
val.ts = bpf_ktime_get_ns();
}
- infobyreq.update(&req, &val);
+ infobyreq.update(&key, &val);
}
return 0;
}
+
+int trace_pid_start(struct pt_regs *ctx, struct request *req)
+{
+ struct hash_key key = {
+ .dev = ddevt(req->__RQ_DISK__),
+ .rwflag = get_rwflag(req->cmd_flags),
+ .sector = req->__sector
+ };
+
+ return __trace_pid_start(key);
+}
+
+int trace_pid_start_tp(struct tp_args *args)
+{
+ struct hash_key key = {
+ .dev = args->dev,
+ .rwflag = get_rwflag_tp(args->rwbs),
+ .sector = args->sector
+ };
+
+ return __trace_pid_start(key);
+}
+
// time block I/O
int trace_req_start(struct pt_regs *ctx, struct request *req)
{
+ struct hash_key key = {
+ .dev = ddevt(req->__RQ_DISK__),
+ .rwflag = get_rwflag(req->cmd_flags),
+ .sector = req->__sector
+ };
+
DISK_FILTER
struct start_req_t start_req = {
.ts = bpf_ktime_get_ns(),
.data_len = req->__data_len
};
- start.update(&req, &start_req);
+ start.update(&key, &start_req);
return 0;
}
// output
-int trace_req_completion(struct pt_regs *ctx, struct request *req)
+static int __trace_req_completion(void *ctx, struct hash_key key)
{
struct start_req_t *startp;
struct val_t *valp;
struct data_t data = {};
- struct gendisk *rq_disk;
+ //struct gendisk *rq_disk;
u64 ts;
// fetch timestamp and calculate delta
- startp = start.lookup(&req);
+ startp = start.lookup(&key);
if (startp == 0) {
// missed tracing issue
return 0;
}
ts = bpf_ktime_get_ns();
- rq_disk = req->__RQ_DISK__;
+ //rq_disk = req->__RQ_DISK__;
data.delta = ts - startp->ts;
data.ts = ts / 1000;
data.qdelta = 0;
data.len = startp->data_len;
- valp = infobyreq.lookup(&req);
+ valp = infobyreq.lookup(&key);
if (valp == 0) {
data.name[0] = '?';
data.name[1] = 0;
@@ -162,10 +243,9 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
data.qdelta = startp->ts - valp->ts;
}
data.pid = valp->pid;
- data.sector = req->__sector;
+ data.sector = key.sector;
+ data.dev = key.dev;
bpf_probe_read_kernel(&data.name, sizeof(data.name), valp->name);
- bpf_probe_read_kernel(&data.disk_name, sizeof(data.disk_name),
- rq_disk->disk_name);
}
#ifdef INCLUDE_PATTERN
@@ -174,8 +254,8 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
u64 *sector, last_sector;
struct sector_key_t sector_key = {
- .dev_major = rq_disk->major,
- .dev_minor = rq_disk->first_minor
+ .dev_major = key.dev >> 20,
+ .dev_minor = key.dev & ((1 << 20) - 1)
};
sector = last_sectors.lookup(&sector_key);
@@ -187,27 +267,36 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
last_sectors.update(&sector_key, &last_sector);
#endif
-/*
- * The following deals with a kernel version change (in mainline 4.7, although
- * it may be backported to earlier kernels) with how block request write flags
- * are tested. We handle both pre- and post-change versions here. Please avoid
- * kernel version tests like this as much as possible: they inflate the code,
- * test, and maintenance burden.
- */
-#ifdef REQ_WRITE
- data.rwflag = !!(req->cmd_flags & REQ_WRITE);
-#elif defined(REQ_OP_SHIFT)
- data.rwflag = !!((req->cmd_flags >> REQ_OP_SHIFT) == REQ_OP_WRITE);
-#else
- data.rwflag = !!((req->cmd_flags & REQ_OP_MASK) == REQ_OP_WRITE);
-#endif
+ data.rwflag = key.rwflag;
events.perf_submit(ctx, &data, sizeof(data));
- start.delete(&req);
- infobyreq.delete(&req);
+ start.delete(&key);
+ infobyreq.delete(&key);
return 0;
}
+
+int trace_req_completion(struct pt_regs *ctx, struct request *req)
+{
+ struct hash_key key = {
+ .dev = ddevt(req->__RQ_DISK__),
+ .rwflag = get_rwflag(req->cmd_flags),
+ .sector = req->__sector
+ };
+
+ return __trace_req_completion(ctx, key);
+}
+
+int trace_req_completion_tp(struct tp_args *args)
+{
+ struct hash_key key = {
+ .dev = args->dev,
+ .rwflag = get_rwflag_tp(args->rwbs),
+ .sector = args->sector
+ };
+
+ return __trace_req_completion(args, key);
+}
"""
if args.queue:
bpf_text = bpf_text.replace('##QUEUE##', '1')
@@ -225,21 +314,13 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
exit(1)
stat_info = os.stat(disk_path)
- major = os.major(stat_info.st_rdev)
- minor = os.minor(stat_info.st_rdev)
-
- disk_field_str = ""
- if BPF.kernel_struct_has_field(b'request', b'rq_disk') == 1:
- disk_field_str = 'req->rq_disk'
- else:
- disk_field_str = 'req->q->disk'
+ dev = os.major(stat_info.st_rdev) << 20 | os.minor(stat_info.st_rdev)
disk_filter_str = """
- struct gendisk *disk = %s;
- if (!(disk->major == %d && disk->first_minor == %d)) {
+ if(key.dev != %s) {
return 0;
}
- """ % (disk_field_str, major, minor)
+ """ % (dev)
bpf_text = bpf_text.replace('DISK_FILTER', disk_filter_str)
else:
@@ -254,15 +335,19 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
b = BPF(text=bpf_text)
if BPF.get_kprobe_functions(b'__blk_account_io_start'):
b.attach_kprobe(event="__blk_account_io_start", fn_name="trace_pid_start")
-else:
+elif BPF.get_kprobe_functions(b'blk_account_io_start'):
b.attach_kprobe(event="blk_account_io_start", fn_name="trace_pid_start")
+else:
+ b.attach_tracepoint(tp="block:block_io_start", fn_name="trace_pid_start_tp")
if BPF.get_kprobe_functions(b'blk_start_request'):
b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start")
if BPF.get_kprobe_functions(b'__blk_account_io_done'):
b.attach_kprobe(event="__blk_account_io_done", fn_name="trace_req_completion")
-else:
+elif BPF.get_kprobe_functions(b'blk_account_io_done'):
b.attach_kprobe(event="blk_account_io_done", fn_name="trace_req_completion")
+else:
+ b.attach_tracepoint(tp="block:block_io_done", fn_name="trace_req_completion_tp")
# header
print("%-11s %-14s %-7s %-9s %-1s %-10s %-7s" % ("TIME(s)", "COMM", "PID",
@@ -273,6 +358,27 @@ print("%-11s %-14s %-7s %-9s %-1s %-10s %-7s" % ("TIME(s)", "COMM", "PID",
print("%7s " % ("QUE(ms)"), end="")
print("%7s" % "LAT(ms)")
+
+# cache disk major,minor -> diskname
+diskstats = "/proc/diskstats"
+disklookup = {}
+with open(diskstats) as stats:
+ for line in stats:
+ a = line.split()
+ disklookup[a[0] + "," + a[1]] = a[2]
+
+def disk_print(d):
+ major = d >> 20
+ minor = d & ((1 << 20) - 1)
+
+ disk = str(major) + "," + str(minor)
+ if disk in disklookup:
+ diskname = disklookup[disk]
+ else:
+ diskname = "<unknown>"
+
+ return diskname
+
rwflg = ""
pattern = ""
start_ts = 0
@@ -297,9 +403,7 @@ P_RANDOM = 2
delta = float(event.ts) - start_ts
- disk_name = event.disk_name.decode('utf-8', 'replace')
- if not disk_name:
- disk_name = '<unknown>'
+ disk_name = disk_print(event.dev)
print("%-11.6f %-14.14s %-7s %-9s %-1s %-10s %-7s" % (
delta / 1000000, event.name.decode('utf-8', 'replace'), event.pid,
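The attach logic above repeats the same pattern for each event: probe __blk_account_io_start, fall back to blk_account_io_start, and only then to the block:block_io_start / block_io_done tracepoints that newer kernels provide. A small sketch of that fallback factored into a helper, using only the bcc calls already shown; attach_with_fallback is a hypothetical name, not something the patch adds:

from bcc import BPF

def attach_with_fallback(b, kprobe_candidates, tp, kprobe_fn, tp_fn):
    # Prefer a kprobe on the first accounting helper that still exists
    # (the symbol was renamed and later removed over kernel versions),
    # otherwise fall back to the block tracepoint.
    for sym in kprobe_candidates:
        if BPF.get_kprobe_functions(sym.encode()):
            b.attach_kprobe(event=sym, fn_name=kprobe_fn)
            return
    b.attach_tracepoint(tp=tp, fn_name=tp_fn)

# e.g., mirroring the biosnoop attach block above:
# attach_with_fallback(b, ["__blk_account_io_start", "blk_account_io_start"],
#                      "block:block_io_start",
#                      "trace_pid_start", "trace_pid_start_tp")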
diff --git a/tools/biotop.py b/tools/biotop.py
index fcdd373f..2620983a 100755
--- a/tools/biotop.py
+++ b/tools/biotop.py
@@ -14,6 +14,7 @@
#
# 06-Feb-2016 Brendan Gregg Created this.
# 17-Mar-2022 Rocky Xing Added PID filter support.
+# 01-Aug-2023 Jerome Marchand Added support for block tracepoints
from __future__ import print_function
from bcc import BPF
@@ -88,14 +89,35 @@ struct val_t {
u32 io;
};
-BPF_HASH(start, struct request *, struct start_req_t);
-BPF_HASH(whobyreq, struct request *, struct who_t);
+struct tp_args {
+ u64 __unused__;
+ dev_t dev;
+ sector_t sector;
+ unsigned int nr_sector;
+ unsigned int bytes;
+ char rwbs[8];
+ char comm[16];
+ char cmd[];
+};
+
+struct hash_key {
+ dev_t dev;
+ u32 _pad;
+ sector_t sector;
+};
+
+BPF_HASH(start, struct hash_key, struct start_req_t);
+BPF_HASH(whobyreq, struct hash_key, struct who_t);
BPF_HASH(counts, struct info_t, struct val_t);
+static dev_t ddevt(struct gendisk *disk) {
+ return (disk->major << 20) | disk->first_minor;
+}
+
// cache PID and comm by-req
-int trace_pid_start(struct pt_regs *ctx, struct request *req)
+static int __trace_pid_start(struct hash_key key)
{
- struct who_t who = {};
+ struct who_t who;
u32 pid;
if (bpf_get_current_comm(&who.name, sizeof(who.name)) == 0) {
@@ -104,30 +126,54 @@ int trace_pid_start(struct pt_regs *ctx, struct request *req)
return 0;
who.pid = pid;
- whobyreq.update(&req, &who);
+ whobyreq.update(&key, &who);
}
return 0;
}
+int trace_pid_start(struct pt_regs *ctx, struct request *req)
+{
+ struct hash_key key = {
+ .dev = ddevt(req->__RQ_DISK__),
+ .sector = req->__sector
+ };
+
+ return __trace_pid_start(key);
+}
+
+int trace_pid_start_tp(struct tp_args *args)
+{
+ struct hash_key key = {
+ .dev = args->dev,
+ .sector = args->sector
+ };
+
+ return __trace_pid_start(key);
+}
+
// time block I/O
int trace_req_start(struct pt_regs *ctx, struct request *req)
{
+ struct hash_key key = {
+ .dev = ddevt(req->__RQ_DISK__),
+ .sector = req->__sector
+ };
struct start_req_t start_req = {
.ts = bpf_ktime_get_ns(),
.data_len = req->__data_len
};
- start.update(&req, &start_req);
+ start.update(&key, &start_req);
return 0;
}
// output
-int trace_req_completion(struct pt_regs *ctx, struct request *req)
+static int __trace_req_completion(struct hash_key key)
{
struct start_req_t *startp;
// fetch timestamp and calculate delta
- startp = start.lookup(&req);
+ startp = start.lookup(&key);
if (startp == 0) {
return 0; // missed tracing issue
}
@@ -135,12 +181,12 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
struct who_t *whop;
u32 pid;
- whop = whobyreq.lookup(&req);
+ whop = whobyreq.lookup(&key);
pid = whop != 0 ? whop->pid : 0;
if (FILTER_PID) {
- start.delete(&req);
+ start.delete(&key);
if (whop != 0) {
- whobyreq.delete(&req);
+ whobyreq.delete(&key);
}
return 0;
}
@@ -150,8 +196,8 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
// setup info_t key
struct info_t info = {};
- info.major = req->__RQ_DISK__->major;
- info.minor = req->__RQ_DISK__->first_minor;
+ info.major = key.dev >> 20;
+ info.minor = key.dev & ((1 << 20) - 1);
/*
* The following deals with a kernel version change (in mainline 4.7, although
* it may be backported to earlier kernels) with how block request write flags
@@ -159,13 +205,13 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
* kernel version tests like this as much as possible: they inflate the code,
* test, and maintenance burden.
*/
-#ifdef REQ_WRITE
+/*#ifdef REQ_WRITE
info.rwflag = !!(req->cmd_flags & REQ_WRITE);
#elif defined(REQ_OP_SHIFT)
info.rwflag = !!((req->cmd_flags >> REQ_OP_SHIFT) == REQ_OP_WRITE);
#else
info.rwflag = !!((req->cmd_flags & REQ_OP_MASK) == REQ_OP_WRITE);
-#endif
+#endif*/
if (whop == 0) {
// missed pid who, save stats as pid 0
@@ -183,11 +229,31 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
valp->io++;
}
- start.delete(&req);
- whobyreq.delete(&req);
+ start.delete(&key);
+ whobyreq.delete(&key);
return 0;
}
+
+int trace_req_completion(struct pt_regs *ctx, struct request *req)
+{
+ struct hash_key key = {
+ .dev = ddevt(req->__RQ_DISK__),
+ .sector = req->__sector
+ };
+
+ return __trace_req_completion(key);
+}
+
+int trace_req_completion_tp(struct tp_args *args)
+{
+ struct hash_key key = {
+ .dev = args->dev,
+ .sector = args->sector
+ };
+
+ return __trace_req_completion(key);
+}
"""
if args.ebpf:
@@ -207,15 +273,19 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
b = BPF(text=bpf_text)
if BPF.get_kprobe_functions(b'__blk_account_io_start'):
b.attach_kprobe(event="__blk_account_io_start", fn_name="trace_pid_start")
-else:
+elif BPF.get_kprobe_functions(b'blk_account_io_start'):
b.attach_kprobe(event="blk_account_io_start", fn_name="trace_pid_start")
+else:
+ b.attach_tracepoint(tp="block:block_io_start", fn_name="trace_pid_start_tp")
if BPF.get_kprobe_functions(b'blk_start_request'):
b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start")
if BPF.get_kprobe_functions(b'__blk_account_io_done'):
b.attach_kprobe(event="__blk_account_io_done", fn_name="trace_req_completion")
-else:
+elif BPF.get_kprobe_functions(b'blk_account_io_done'):
b.attach_kprobe(event="blk_account_io_done", fn_name="trace_req_completion")
+else:
+ b.attach_tracepoint(tp="block:block_io_done", fn_name="trace_req_completion_tp")
print('Tracing... Output every %d secs. Hit Ctrl-C to end' % interval)
--
2.41.0
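The tp_args structs added to biosnoop and biotop hand-mirror the layout of the block:block_io_start and block_io_done tracepoints (the leading u64 __unused__ stands in for the common trace header fields). A short sketch for checking that layout on a running kernel, assuming tracefs is mounted and the kernel is recent enough to expose these tracepoints:

import os

def tracepoint_format(category, event):
    # Read the event's format description from tracefs, trying both of the
    # usual mount points.
    for root in ("/sys/kernel/tracing", "/sys/kernel/debug/tracing"):
        path = os.path.join(root, "events", category, event, "format")
        if os.path.exists(path):
            with open(path) as f:
                return f.read()
    raise FileNotFoundError("no format file for %s:%s" % (category, event))

if __name__ == "__main__":
    print(tracepoint_format("block", "block_io_start"))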