diff --git a/.gitignore b/.gitignore
index a16e598..7041d2f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,3 +21,4 @@
 /bcc-0.26.0.tar.gz
 /bcc-0.27.0.tar.gz
 /bcc-0.28.0.tar.gz
+/bcc-0.29.1.tar.gz
diff --git a/Fix-ttysnoop.py-with-newer-kernels.patch b/Fix-ttysnoop.py-with-newer-kernels.patch
new file mode 100644
index 0000000..437eaf5
--- /dev/null
+++ b/Fix-ttysnoop.py-with-newer-kernels.patch
@@ -0,0 +1,132 @@
+From 89126c7452c29736d38dc072a952b0b0c831fade Mon Sep 17 00:00:00 2001
+From: Yonghong Song
+Date: Mon, 29 Jan 2024 16:13:30 -0800
+Subject: [PATCH] Fix ttysnoop.py with newer kernels
+
+Jerome Marchand reported that ttysnoop.py won't work properly
+with newer kernels (#4884). I did some investigation and found
+that a kernel data structure change caused a verification failure.
+The failure is caused by the following:
+  ; kvec = from->kvec;
+  // R1=ptr_iov_iter()
+  15: (79) r1 = *(u64 *)(r1 +16) ; R1_w=scalar()
+  ; count = kvec->iov_len;
+  16: (bf) r2 = r1 ; R1_w=scalar(id=1) R2_w=scalar(id=1)
+  17: (07) r2 += 8 ; R2_w=scalar()
+  18: (05) goto pc+3
+  ;
+  22: (79) r2 = *(u64 *)(r2 +0)
+  R2 invalid mem access 'scalar'
+
+So basically, loading 'iov_iter + 16' returns a scalar but the verifier
+expects it to be a pointer.
+
+In v6.4, we have
+  struct iovec
+  {
+	void __user *iov_base; /* BSD uses caddr_t (1003.1g requires void *) */
+	__kernel_size_t iov_len; /* Must be size_t (1003.1g) */
+  };
+  struct iov_iter {
+	u8 iter_type;
+	bool copy_mc;
+	bool nofault;
+	bool data_source;
+	bool user_backed;
+	union {
+		size_t iov_offset;
+		int last_offset;
+	};
+	union {
+		struct iovec __ubuf_iovec;
+		struct {
+			union {
+				const struct iovec *__iov;
+				const struct kvec *kvec;
+				const struct bio_vec *bvec;
+				struct xarray *xarray;
+				struct pipe_inode_info *pipe;
+				void __user *ubuf;
+			};
+			size_t count;
+		};
+	};
+	union {
+		unsigned long nr_segs;
+		struct {
+			unsigned int head;
+			unsigned int start_head;
+		};
+		loff_t xarray_start;
+	};
+  };
+
+The kernel traversal chain will be
+  "struct iov_iter" -> "struct iovec __ubuf_iovec" -> "void __user *iov_base".
+Since the "iov_base" type is a ptr to void, the kernel considers the
+loaded value a scalar, which causes the verification failure.
+
+But for old kernels like 5.19, we do not have this issue.
+  struct iovec
+  {
+	void __user *iov_base; /* BSD uses caddr_t (1003.1g requires void *) */
+	__kernel_size_t iov_len; /* Must be size_t (1003.1g) */
+  };
+  struct iov_iter {
+	u8 iter_type;
+	bool nofault;
+	bool data_source;
+	bool user_backed;
+	size_t iov_offset;
+	size_t count;
+	union {
+		const struct iovec *iov;
+		const struct kvec *kvec;
+		const struct bio_vec *bvec;
+		struct xarray *xarray;
+		struct pipe_inode_info *pipe;
+		void __user *ubuf;
+	};
+	union {
+		unsigned long nr_segs;
+		struct {
+			unsigned int head;
+			unsigned int start_head;
+		};
+		loff_t xarray_start;
+	};
+  };
+
+The kernel traversal chain will be
+  "struct iov_iter" -> "const struct iovec *iov"
+Note that "const struct iovec *iov" is used since it is the *first* member
+inside the union. The traversal stops once we hit a pointer.
+So the kernel verifier returns a 'struct iovec' object (untrusted, cannot
+be used as a parameter to a call) and the verifier can proceed.
+
+To fix the problem, let us use bpf_probe_read_kernel() instead
+so ttysnoop.py can continue to work with newer kernels.
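+
+To illustrate the pattern (a sketch, not part of the fix itself):
+a direct dereference such as
+
+	buf = kvec->iov_base;
+
+asks the verifier to track the loaded value as a typed pointer,
+which is exactly what fails once the BTF walk ends at the
+'void __user *iov_base' member. Copying the same bytes with
+
+	bpf_probe_read_kernel(&buf, sizeof(buf), &kvec->iov_base);
+
+needs only a valid kernel address to read from; the destination is
+treated as plain initialized data, so no pointer tracking is
+required and the program verifies on both old and new kernels.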
+
+Signed-off-by: Yonghong Song
+---
+ tools/ttysnoop.py | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/tools/ttysnoop.py b/tools/ttysnoop.py
+index 77f97b7c..aca09db4 100755
+--- a/tools/ttysnoop.py
++++ b/tools/ttysnoop.py
+@@ -162,8 +162,8 @@ PROBE_TTY_WRITE
+ */
+ case CASE_ITER_IOVEC_NAME:
+ kvec = from->kvec;
+- buf = kvec->iov_base;
+- count = kvec->iov_len;
++ bpf_probe_read_kernel(&buf, sizeof(buf), &kvec->iov_base);
++ bpf_probe_read_kernel(&count, sizeof(count), &kvec->iov_len);
+ break;
+ CASE_ITER_UBUF_TEXT
+ /* TODO: Support more type */
+--
+2.43.0
+
diff --git a/Sync-with-latest-libbpf-repo-4889.patch b/Sync-with-latest-libbpf-repo-4889.patch
new file mode 100644
index 0000000..9b57902
--- /dev/null
+++ b/Sync-with-latest-libbpf-repo-4889.patch
@@ -0,0 +1,727 @@
+From c0691e35cd65d5400f0b792d5eba81f8eae236dc Mon Sep 17 00:00:00 2001
+From: yonghong-song
+Date: Tue, 30 Jan 2024 09:14:30 -0800
+Subject: [PATCH] Sync with latest libbpf repo (#4889)
+
+Sync with latest libbpf repo.
+The top libbpf commit is:
+  3b0973892891 sync: remove NETDEV_XSK_FLAGS_MASK which is not in bpf/bpf-next anymore
+
+Signed-off-by: Yonghong Song
+---
+ introspection/bps.c | 1 +
+ src/cc/compat/linux/virtual_bpf.h | 368 ++++++++++++++++++++++++++----
+ src/cc/libbpf | 2 +-
+ 3 files changed, 326 insertions(+), 45 deletions(-)
+
+diff --git a/introspection/bps.c b/introspection/bps.c
+index 3956fbf2..8cdef54a 100644
+--- a/introspection/bps.c
++++ b/introspection/bps.c
+@@ -48,6 +48,7 @@ static const char * const prog_type_strings[] = {
+ [BPF_PROG_TYPE_LSM] = "lsm",
+ [BPF_PROG_TYPE_SK_LOOKUP] = "sk_lookup",
+ [BPF_PROG_TYPE_SYSCALL] = "syscall",
++ [BPF_PROG_TYPE_NETFILTER] = "netfilter",
+ };
+
+ static const char * const map_type_strings[] = {
+diff --git a/src/cc/compat/linux/virtual_bpf.h b/src/cc/compat/linux/virtual_bpf.h
+index a182123e..fcabe71a 100644
+--- a/src/cc/compat/linux/virtual_bpf.h
++++ b/src/cc/compat/linux/virtual_bpf.h
+@@ -20,6 +20,7 @@ R"********(
+
+ /* ld/ldx fields */
+ #define BPF_DW 0x18 /* double word (64-bit) */
++#define BPF_MEMSX 0x80 /* load with sign extension */
+ #define BPF_ATOMIC 0xc0 /* atomic memory ops - op type in immediate */
+ #define BPF_XADD 0xc0 /* exclusive add - legacy name */
+
+@@ -847,6 +848,36 @@ union bpf_iter_link_info {
+ * Returns zero on success. On error, -1 is returned and *errno*
+ * is set appropriately.
+ *
++ * BPF_TOKEN_CREATE
++ * Description
++ * Create BPF token with embedded information about what
++ * BPF-related functionality it allows:
++ * - a set of allowed bpf() syscall commands;
++ * - a set of allowed BPF map types to be created with
++ * BPF_MAP_CREATE command, if BPF_MAP_CREATE itself is allowed;
++ * - a set of allowed BPF program types and BPF program attach
++ * types to be loaded with BPF_PROG_LOAD command, if
++ * BPF_PROG_LOAD itself is allowed.
++ *
++ * BPF token is created (derived) from an instance of BPF FS,
++ * assuming it has necessary delegation mount options specified.
++ * This BPF token can be passed as an extra parameter to various
++ * bpf() syscall commands to grant BPF subsystem functionality to
++ * unprivileged processes.
++ *
++ * When created, BPF token is "associated" with the owning
++ * user namespace of BPF FS instance (super block) that it was
++ * derived from, and subsequent BPF operations performed with
++ * BPF token would be performing capabilities checks (i.e.,
++ * CAP_BPF, CAP_PERFMON, CAP_NET_ADMIN, CAP_SYS_ADMIN) within
++ * that user namespace.
Without BPF token, such capabilities ++ * have to be granted in init user namespace, making bpf() ++ * syscall incompatible with user namespace, for the most part. ++ * ++ * Return ++ * A new file descriptor (a nonnegative integer), or -1 if an ++ * error occurred (in which case, *errno* is set appropriately). ++ * + * NOTES + * eBPF objects (maps and programs) can be shared between processes. + * +@@ -901,6 +932,8 @@ enum bpf_cmd { + BPF_ITER_CREATE, + BPF_LINK_DETACH, + BPF_PROG_BIND_MAP, ++ BPF_TOKEN_CREATE, ++ __MAX_BPF_CMD, + }; + + enum bpf_map_type { +@@ -932,7 +965,14 @@ enum bpf_map_type { + */ + BPF_MAP_TYPE_CGROUP_STORAGE = BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED, + BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, +- BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, ++ BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE_DEPRECATED, ++ /* BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE is available to bpf programs ++ * attaching to a cgroup. The new mechanism (BPF_MAP_TYPE_CGRP_STORAGE + ++ * local percpu kptr) supports all BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE ++ * functionality and more. So mark * BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE ++ * deprecated. ++ */ ++ BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE = BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE_DEPRECATED, + BPF_MAP_TYPE_QUEUE, + BPF_MAP_TYPE_STACK, + BPF_MAP_TYPE_SK_STORAGE, +@@ -944,6 +984,7 @@ enum bpf_map_type { + BPF_MAP_TYPE_BLOOM_FILTER, + BPF_MAP_TYPE_USER_RINGBUF, + BPF_MAP_TYPE_CGRP_STORAGE, ++ __MAX_BPF_MAP_TYPE + }; + + /* Note that tracing related programs such as +@@ -987,6 +1028,8 @@ enum bpf_prog_type { + BPF_PROG_TYPE_LSM, + BPF_PROG_TYPE_SK_LOOKUP, + BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */ ++ BPF_PROG_TYPE_NETFILTER, ++ __MAX_BPF_PROG_TYPE + }; + + enum bpf_attach_type { +@@ -1035,6 +1078,17 @@ enum bpf_attach_type { + BPF_TRACE_KPROBE_MULTI, + BPF_LSM_CGROUP, + BPF_STRUCT_OPS, ++ BPF_NETFILTER, ++ BPF_TCX_INGRESS, ++ BPF_TCX_EGRESS, ++ BPF_TRACE_UPROBE_MULTI, ++ BPF_CGROUP_UNIX_CONNECT, ++ BPF_CGROUP_UNIX_SENDMSG, ++ BPF_CGROUP_UNIX_RECVMSG, ++ BPF_CGROUP_UNIX_GETPEERNAME, ++ BPF_CGROUP_UNIX_GETSOCKNAME, ++ BPF_NETKIT_PRIMARY, ++ BPF_NETKIT_PEER, + __MAX_BPF_ATTACH_TYPE + }; + +@@ -1051,8 +1105,23 @@ enum bpf_link_type { + BPF_LINK_TYPE_PERF_EVENT = 7, + BPF_LINK_TYPE_KPROBE_MULTI = 8, + BPF_LINK_TYPE_STRUCT_OPS = 9, ++ BPF_LINK_TYPE_NETFILTER = 10, ++ BPF_LINK_TYPE_TCX = 11, ++ BPF_LINK_TYPE_UPROBE_MULTI = 12, ++ BPF_LINK_TYPE_NETKIT = 13, ++ __MAX_BPF_LINK_TYPE, ++}; ++ ++#define MAX_BPF_LINK_TYPE __MAX_BPF_LINK_TYPE + +- MAX_BPF_LINK_TYPE, ++enum bpf_perf_event_type { ++ BPF_PERF_EVENT_UNSPEC = 0, ++ BPF_PERF_EVENT_UPROBE = 1, ++ BPF_PERF_EVENT_URETPROBE = 2, ++ BPF_PERF_EVENT_KPROBE = 3, ++ BPF_PERF_EVENT_KRETPROBE = 4, ++ BPF_PERF_EVENT_TRACEPOINT = 5, ++ BPF_PERF_EVENT_EVENT = 6, + }; + + /* cgroup-bpf attach flags used in BPF_PROG_ATTACH command +@@ -1101,7 +1170,12 @@ enum bpf_link_type { + */ + #define BPF_F_ALLOW_OVERRIDE (1U << 0) + #define BPF_F_ALLOW_MULTI (1U << 1) ++/* Generic attachment flags. */ + #define BPF_F_REPLACE (1U << 2) ++#define BPF_F_BEFORE (1U << 3) ++#define BPF_F_AFTER (1U << 4) ++#define BPF_F_ID (1U << 5) ++#define BPF_F_LINK BPF_F_LINK /* 1 << 13 */ + + /* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the + * verifier will perform strict alignment checking as if the kernel +@@ -1163,10 +1237,27 @@ enum bpf_link_type { + */ + #define BPF_F_XDP_DEV_BOUND_ONLY (1U << 6) + ++/* The verifier internal test flag. 
Behavior is undefined */ ++#define BPF_F_TEST_REG_INVARIANTS (1U << 7) ++ + /* link_create.kprobe_multi.flags used in LINK_CREATE command for + * BPF_TRACE_KPROBE_MULTI attach type to create return probe. + */ +-#define BPF_F_KPROBE_MULTI_RETURN (1U << 0) ++enum { ++ BPF_F_KPROBE_MULTI_RETURN = (1U << 0) ++}; ++ ++/* link_create.uprobe_multi.flags used in LINK_CREATE command for ++ * BPF_TRACE_UPROBE_MULTI attach type to create return probe. ++ */ ++enum { ++ BPF_F_UPROBE_MULTI_RETURN = (1U << 0) ++}; ++ ++/* link_create.netfilter.flags used in LINK_CREATE command for ++ * BPF_PROG_TYPE_NETFILTER to enable IP packet defragmentation. ++ */ ++#define BPF_F_NETFILTER_IP_DEFRAG (1U << 0) + + /* When BPF ldimm64's insn[0].src_reg != 0 then this can have + * the following extensions: +@@ -1271,6 +1362,15 @@ enum { + + /* Create a map that will be registered/unregesitered by the backed bpf_link */ + BPF_F_LINK = (1U << 13), ++ ++/* Get path from provided FD in BPF_OBJ_PIN/BPF_OBJ_GET commands */ ++ BPF_F_PATH_FD = (1U << 14), ++ ++/* Flag for value_type_btf_obj_fd, the fd is available */ ++ BPF_F_VTYPE_BTF_OBJ_FD = (1U << 15), ++ ++/* BPF token FD is passed in a corresponding command's token_fd field */ ++ BPF_F_TOKEN_FD = (1U << 16), + }; + + /* Flags for BPF_PROG_QUERY. */ +@@ -1344,6 +1444,15 @@ union bpf_attr { + * to using 5 hash functions). + */ + __u64 map_extra; ++ ++ __s32 value_type_btf_obj_fd; /* fd pointing to a BTF ++ * type data for ++ * btf_vmlinux_value_type_id. ++ */ ++ /* BPF token FD to use with BPF_MAP_CREATE operation. ++ * If provided, map_flags should have BPF_F_TOKEN_FD flag set. ++ */ ++ __s32 map_token_fd; + }; + + struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ +@@ -1413,23 +1522,39 @@ union bpf_attr { + * truncated), or smaller (if log buffer wasn't filled completely). + */ + __u32 log_true_size; ++ /* BPF token FD to use with BPF_PROG_LOAD operation. ++ * If provided, prog_flags should have BPF_F_TOKEN_FD flag set. ++ */ ++ __s32 prog_token_fd; + }; + + struct { /* anonymous struct used by BPF_OBJ_* commands */ + __aligned_u64 pathname; + __u32 bpf_fd; + __u32 file_flags; ++ /* Same as dirfd in openat() syscall; see openat(2) ++ * manpage for details of path FD and pathname semantics; ++ * path_fd should accompanied by BPF_F_PATH_FD flag set in ++ * file_flags field, otherwise it should be set to zero; ++ * if BPF_F_PATH_FD flag is not set, AT_FDCWD is assumed. ++ */ ++ __s32 path_fd; + }; + + struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */ +- __u32 target_fd; /* container object to attach to */ +- __u32 attach_bpf_fd; /* eBPF program to attach */ ++ union { ++ __u32 target_fd; /* target object to attach to or ... */ ++ __u32 target_ifindex; /* target ifindex */ ++ }; ++ __u32 attach_bpf_fd; + __u32 attach_type; + __u32 attach_flags; +- __u32 replace_bpf_fd; /* previously attached eBPF +- * program to replace if +- * BPF_F_REPLACE is used +- */ ++ __u32 replace_bpf_fd; ++ union { ++ __u32 relative_fd; ++ __u32 relative_id; ++ }; ++ __u64 expected_revision; + }; + + struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */ +@@ -1475,16 +1600,26 @@ union bpf_attr { + } info; + + struct { /* anonymous struct used by BPF_PROG_QUERY command */ +- __u32 target_fd; /* container object to query */ ++ union { ++ __u32 target_fd; /* target object to query or ... 
*/ ++ __u32 target_ifindex; /* target ifindex */ ++ }; + __u32 attach_type; + __u32 query_flags; + __u32 attach_flags; + __aligned_u64 prog_ids; +- __u32 prog_cnt; ++ union { ++ __u32 prog_cnt; ++ __u32 count; ++ }; ++ __u32 :32; + /* output: per-program attach_flags. + * not allowed to be set during effective query. + */ + __aligned_u64 prog_attach_flags; ++ __aligned_u64 link_ids; ++ __aligned_u64 link_attach_flags; ++ __u64 revision; + } query; + + struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */ +@@ -1503,6 +1638,11 @@ union bpf_attr { + * truncated), or smaller (if log buffer wasn't filled completely). + */ + __u32 btf_log_true_size; ++ __u32 btf_flags; ++ /* BPF token FD to use with BPF_BTF_LOAD operation. ++ * If provided, btf_flags should have BPF_F_TOKEN_FD flag set. ++ */ ++ __s32 btf_token_fd; + }; + + struct { +@@ -1527,13 +1667,13 @@ union bpf_attr { + __u32 map_fd; /* struct_ops to attach */ + }; + union { +- __u32 target_fd; /* object to attach to */ +- __u32 target_ifindex; /* target ifindex */ ++ __u32 target_fd; /* target object to attach to or ... */ ++ __u32 target_ifindex; /* target ifindex */ + }; + __u32 attach_type; /* attach type */ + __u32 flags; /* extra flags */ + union { +- __u32 target_btf_id; /* btf_id of target to attach to */ ++ __u32 target_btf_id; /* btf_id of target to attach to */ + struct { + __aligned_u64 iter_info; /* extra bpf_iter_link_info */ + __u32 iter_info_len; /* iter_info length */ +@@ -1561,6 +1701,35 @@ union bpf_attr { + */ + __u64 cookie; + } tracing; ++ struct { ++ __u32 pf; ++ __u32 hooknum; ++ __s32 priority; ++ __u32 flags; ++ } netfilter; ++ struct { ++ union { ++ __u32 relative_fd; ++ __u32 relative_id; ++ }; ++ __u64 expected_revision; ++ } tcx; ++ struct { ++ __aligned_u64 path; ++ __aligned_u64 offsets; ++ __aligned_u64 ref_ctr_offsets; ++ __aligned_u64 cookies; ++ __u32 cnt; ++ __u32 flags; ++ __u32 pid; ++ } uprobe_multi; ++ struct { ++ union { ++ __u32 relative_fd; ++ __u32 relative_id; ++ }; ++ __u64 expected_revision; ++ } netkit; + }; + } link_create; + +@@ -1604,6 +1773,11 @@ union bpf_attr { + __u32 flags; /* extra flags */ + } prog_bind_map; + ++ struct { /* struct used by BPF_TOKEN_CREATE command */ ++ __u32 flags; ++ __u32 bpffs_fd; ++ } token_create; ++ + } __attribute__((aligned(8))); + + /* The description below is an attempt at providing documentation to eBPF +@@ -1879,7 +2053,9 @@ union bpf_attr { + * performed again, if the helper is used in combination with + * direct packet access. + * Return +- * 0 on success, or a negative error in case of failure. ++ * 0 on success, or a negative error in case of failure. Positive ++ * error indicates a potential drop or congestion in the target ++ * device. The particular positive error codes are not defined. + * + * u64 bpf_get_current_pid_tgid(void) + * Description +@@ -2612,8 +2788,8 @@ union bpf_attr { + * *bpf_socket* should be one of the following: + * + * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. +- * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** +- * and **BPF_CGROUP_INET6_CONNECT**. ++ * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**, ++ * **BPF_CGROUP_INET6_CONNECT** and **BPF_CGROUP_UNIX_CONNECT**. + * + * This helper actually implements a subset of **setsockopt()**. + * It supports the following *level*\ s: +@@ -2851,8 +3027,8 @@ union bpf_attr { + * *bpf_socket* should be one of the following: + * + * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. 
+- * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** +- * and **BPF_CGROUP_INET6_CONNECT**. ++ * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**, ++ * **BPF_CGROUP_INET6_CONNECT** and **BPF_CGROUP_UNIX_CONNECT**. + * + * This helper actually implements a subset of **getsockopt()**. + * It supports the same set of *optname*\ s that is supported by +@@ -3160,6 +3336,10 @@ union bpf_attr { + * **BPF_FIB_LOOKUP_DIRECT** + * Do a direct table lookup vs full lookup using FIB + * rules. ++ * **BPF_FIB_LOOKUP_TBID** ++ * Used with BPF_FIB_LOOKUP_DIRECT. ++ * Use the routing table ID present in *params*->tbid ++ * for the fib lookup. + * **BPF_FIB_LOOKUP_OUTPUT** + * Perform lookup from an egress perspective (default is + * ingress). +@@ -3168,6 +3348,11 @@ union bpf_attr { + * and *params*->smac will not be set as output. A common + * use case is to call **bpf_redirect_neigh**\ () after + * doing **bpf_fib_lookup**\ (). ++ * **BPF_FIB_LOOKUP_SRC** ++ * Derive and set source IP addr in *params*->ipv{4,6}_src ++ * for the nexthop. If the src addr cannot be derived, ++ * **BPF_FIB_LKUP_RET_NO_SRC_ADDR** is returned. In this ++ * case, *params*->dmac and *params*->smac are not set either. + * + * *ctx* is either **struct xdp_md** for XDP programs or + * **struct sk_buff** tc cls_act programs. +@@ -4137,9 +4322,6 @@ union bpf_attr { + * **-EOPNOTSUPP** if the operation is not supported, for example + * a call from outside of TC ingress. + * +- * **-ESOCKTNOSUPPORT** if the socket type is not supported +- * (reuseport). +- * + * long bpf_sk_assign(struct bpf_sk_lookup *ctx, struct bpf_sock *sk, u64 flags) + * Description + * Helper is overloaded depending on BPF program type. This +@@ -4404,6 +4586,8 @@ union bpf_attr { + * long bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 flags) + * Description + * Return a user or a kernel stack in bpf program provided buffer. ++ * Note: the user stack will only be populated if the *task* is ++ * the current task; all other tasks will return -EOPNOTSUPP. + * To achieve this, the helper needs *task*, which is a valid + * pointer to **struct task_struct**. To store the stacktrace, the + * bpf program provides *buf* with a nonnegative *size*. +@@ -4415,6 +4599,7 @@ union bpf_attr { + * + * **BPF_F_USER_STACK** + * Collect a user space stack instead of a kernel stack. ++ * The *task* must be the current task. + * **BPF_F_USER_BUILD_ID** + * Collect buildid+offset instead of ips for user stack, + * only valid if **BPF_F_USER_STACK** is also specified. +@@ -4718,9 +4903,9 @@ union bpf_attr { + * going through the CPU's backlog queue. + * + * The *flags* argument is reserved and must be 0. The helper is +- * currently only supported for tc BPF program types at the ingress +- * hook and for veth device types. The peer device must reside in a +- * different network namespace. ++ * currently only supported for tc BPF program types at the ++ * ingress hook and for veth and netkit target device types. The ++ * peer device must reside in a different network namespace. + * Return + * The helper returns **TC_ACT_REDIRECT** on success or + * **TC_ACT_SHOT** on error. +@@ -5003,6 +5188,8 @@ union bpf_attr { + * **BPF_F_TIMER_ABS** + * Start the timer in absolute expire value instead of the + * default relative one. ++ * **BPF_F_TIMER_CPU_PIN** ++ * Timer will be pinned to the CPU of the caller. + * + * Return + * 0 on success. 
+@@ -5022,9 +5209,14 @@ union bpf_attr { + * u64 bpf_get_func_ip(void *ctx) + * Description + * Get address of the traced function (for tracing and kprobe programs). ++ * ++ * When called for kprobe program attached as uprobe it returns ++ * probe address for both entry and return uprobe. ++ * + * Return +- * Address of the traced function. ++ * Address of the traced function for kprobe. + * 0 for kprobes placed within the function (not at the entry). ++ * Address of the probe for uprobe and return uprobe. + * + * u64 bpf_get_attach_cookie(void *ctx) + * Description +@@ -6165,6 +6357,19 @@ struct bpf_sock_tuple { + }; + }; + ++/* (Simplified) user return codes for tcx prog type. ++ * A valid tcx program must return one of these defined values. All other ++ * return codes are reserved for future use. Must remain compatible with ++ * their TC_ACT_* counter-parts. For compatibility in behavior, unknown ++ * return codes are mapped to TCX_NEXT. ++ */ ++enum tcx_action_base { ++ TCX_NEXT = -1, ++ TCX_PASS = 0, ++ TCX_DROP = 2, ++ TCX_REDIRECT = 7, ++}; ++ + struct bpf_xdp_sock { + __u32 queue_id; + }; +@@ -6346,7 +6551,7 @@ struct bpf_map_info { + __u32 btf_id; + __u32 btf_key_type_id; + __u32 btf_value_type_id; +- __u32 :32; /* alignment pad */ ++ __u32 btf_vmlinux_id; + __u64 map_extra; + } __attribute__((aligned(8))); + +@@ -6411,6 +6616,69 @@ struct bpf_link_info { + struct { + __u32 map_id; + } struct_ops; ++ struct { ++ __u32 pf; ++ __u32 hooknum; ++ __s32 priority; ++ __u32 flags; ++ } netfilter; ++ struct { ++ __aligned_u64 addrs; ++ __u32 count; /* in/out: kprobe_multi function count */ ++ __u32 flags; ++ __u64 missed; ++ __aligned_u64 cookies; ++ } kprobe_multi; ++ struct { ++ __aligned_u64 path; ++ __aligned_u64 offsets; ++ __aligned_u64 ref_ctr_offsets; ++ __aligned_u64 cookies; ++ __u32 path_size; /* in/out: real path size on success, including zero byte */ ++ __u32 count; /* in/out: uprobe_multi offsets/ref_ctr_offsets/cookies count */ ++ __u32 flags; ++ __u32 pid; ++ } uprobe_multi; ++ struct { ++ __u32 type; /* enum bpf_perf_event_type */ ++ __u32 :32; ++ union { ++ struct { ++ __aligned_u64 file_name; /* in/out */ ++ __u32 name_len; ++ __u32 offset; /* offset from file_name */ ++ __u64 cookie; ++ } uprobe; /* BPF_PERF_EVENT_UPROBE, BPF_PERF_EVENT_URETPROBE */ ++ struct { ++ __aligned_u64 func_name; /* in/out */ ++ __u32 name_len; ++ __u32 offset; /* offset from func_name */ ++ __u64 addr; ++ __u64 missed; ++ __u64 cookie; ++ } kprobe; /* BPF_PERF_EVENT_KPROBE, BPF_PERF_EVENT_KRETPROBE */ ++ struct { ++ __aligned_u64 tp_name; /* in/out */ ++ __u32 name_len; ++ __u32 :32; ++ __u64 cookie; ++ } tracepoint; /* BPF_PERF_EVENT_TRACEPOINT */ ++ struct { ++ __u64 config; ++ __u32 type; ++ __u32 :32; ++ __u64 cookie; ++ } event; /* BPF_PERF_EVENT_EVENT */ ++ }; ++ } perf_event; ++ struct { ++ __u32 ifindex; ++ __u32 attach_type; ++ } tcx; ++ struct { ++ __u32 ifindex; ++ __u32 attach_type; ++ } netkit; + }; + } __attribute__((aligned(8))); + +@@ -6707,6 +6975,7 @@ enum { + BPF_TCP_LISTEN, + BPF_TCP_CLOSING, /* Now a valid state */ + BPF_TCP_NEW_SYN_RECV, ++ BPF_TCP_BOUND_INACTIVE, + + BPF_TCP_MAX_STATES /* Leave at the end! 
*/ + }; +@@ -6808,6 +7077,8 @@ enum { + BPF_FIB_LOOKUP_DIRECT = (1U << 0), + BPF_FIB_LOOKUP_OUTPUT = (1U << 1), + BPF_FIB_LOOKUP_SKIP_NEIGH = (1U << 2), ++ BPF_FIB_LOOKUP_TBID = (1U << 3), ++ BPF_FIB_LOOKUP_SRC = (1U << 4), + }; + + enum { +@@ -6820,6 +7091,7 @@ enum { + BPF_FIB_LKUP_RET_UNSUPP_LWT, /* fwd requires encapsulation */ + BPF_FIB_LKUP_RET_NO_NEIGH, /* no neighbor entry for nh */ + BPF_FIB_LKUP_RET_FRAG_NEEDED, /* fragmentation required to fwd */ ++ BPF_FIB_LKUP_RET_NO_SRC_ADDR, /* failed to derive IP src addr */ + }; + + struct bpf_fib_lookup { +@@ -6854,6 +7126,9 @@ struct bpf_fib_lookup { + __u32 rt_metric; + }; + ++ /* input: source address to consider for lookup ++ * output: source address result from lookup ++ */ + union { + __be32 ipv4_src; + __u32 ipv6_src[4]; /* in6_addr; network order */ +@@ -6868,9 +7143,19 @@ struct bpf_fib_lookup { + __u32 ipv6_dst[4]; /* in6_addr; network order */ + }; + +- /* output */ +- __be16 h_vlan_proto; +- __be16 h_vlan_TCI; ++ union { ++ struct { ++ /* output */ ++ __be16 h_vlan_proto; ++ __be16 h_vlan_TCI; ++ }; ++ /* input: when accompanied with the ++ * 'BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_TBID` flags, a ++ * specific routing table to use for the fib lookup. ++ */ ++ __u32 tbid; ++ }; ++ + __u8 smac[6]; /* ETH_ALEN */ + __u8 dmac[6]; /* ETH_ALEN */ + }; +@@ -6956,38 +7241,31 @@ struct bpf_spin_lock { + }; + + struct bpf_timer { +- __u64 :64; +- __u64 :64; ++ __u64 __opaque[2]; + } __attribute__((aligned(8))); + + struct bpf_dynptr { +- __u64 :64; +- __u64 :64; ++ __u64 __opaque[2]; + } __attribute__((aligned(8))); + + struct bpf_list_head { +- __u64 :64; +- __u64 :64; ++ __u64 __opaque[2]; + } __attribute__((aligned(8))); + + struct bpf_list_node { +- __u64 :64; +- __u64 :64; ++ __u64 __opaque[3]; + } __attribute__((aligned(8))); + + struct bpf_rb_root { +- __u64 :64; +- __u64 :64; ++ __u64 __opaque[2]; + } __attribute__((aligned(8))); + + struct bpf_rb_node { +- __u64 :64; +- __u64 :64; +- __u64 :64; ++ __u64 __opaque[4]; + } __attribute__((aligned(8))); + + struct bpf_refcount { +- __u32 :32; ++ __u32 __opaque[1]; + } __attribute__((aligned(4))); + + struct bpf_sysctl { +@@ -7143,9 +7421,11 @@ struct bpf_core_relo { + * Flags to control bpf_timer_start() behaviour. + * - BPF_F_TIMER_ABS: Timeout passed is absolute time, by default it is + * relative to current time. ++ * - BPF_F_TIMER_CPU_PIN: Timer will be pinned to the CPU of the caller. + */ + enum { + BPF_F_TIMER_ABS = (1ULL << 0), ++ BPF_F_TIMER_CPU_PIN = (1ULL << 1), + }; + + /* BPF numbers iterator state */ +-- +2.43.0 + diff --git a/Use-bpf_obj_get_info_by_fd-instead-of-bpf_btf_get_in.patch b/Use-bpf_obj_get_info_by_fd-instead-of-bpf_btf_get_in.patch deleted file mode 100644 index b8b1725..0000000 --- a/Use-bpf_obj_get_info_by_fd-instead-of-bpf_btf_get_in.patch +++ /dev/null @@ -1,31 +0,0 @@ -From 0973fd70c1c50e57a3db0b09e239b1d1fd3f1c55 Mon Sep 17 00:00:00 2001 -From: Jerome Marchand -Date: Fri, 21 Jul 2023 16:10:18 +0200 -Subject: [PATCH] Use bpf_obj_get_info_by_fd() instead of - bpf_btf_get_info_by_fd() - -The libbpf version in rawhide doesn't have the typed -bpf_*_get_info_by_fd(). 
----
- src/cc/libbpf.c | 4 ++--
- 1 file changed, 2 insertions(+), 2 deletions(-)
-
-diff --git a/src/cc/libbpf.c b/src/cc/libbpf.c
-index 360fd81d..a3e34da2 100644
---- a/src/cc/libbpf.c
-+++ b/src/cc/libbpf.c
-@@ -727,9 +727,9 @@ static int find_btf_id(const char *module_name, const char *func_name,
- info.name = ptr_to_u64(name);
- info.name_len = sizeof(name);
-
-- err = bpf_btf_get_info_by_fd(fd, &info, &len);
-+ err = bpf_obj_get_info_by_fd(fd, &info, &len);
- if (err) {
-- fprintf(stderr, "bpf_btf_get_info_by_fd failed: %d\n", err);
-+ fprintf(stderr, "bpf_obj_get_info_by_fd failed: %d\n", err);
- goto err_out;
- }
-
---
-2.41.0
-
diff --git a/bcc.spec b/bcc.spec
index 6ec482d..f115ac1 100644
--- a/bcc.spec
+++ b/bcc.spec
@@ -10,7 +10,7 @@
 %endif
 %endif
 
-%ifarch x86_64 ppc64 ppc64le aarch64
+%ifarch x86_64 ppc64 ppc64le aarch64 s390x
 %bcond_without libbpf_tools
 %else
 %bcond_with libbpf_tools
 %endif
@@ -24,16 +24,15 @@
 Name: bcc
-Version: 0.28.0
-Release: 3%{?dist}
+Version: 0.29.1
+Release: 1%{?dist}
 Summary: BPF Compiler Collection (BCC)
 License: Apache-2.0
 URL: https://github.com/iovisor/bcc
 Source0: %{url}/archive/v%{version}/%{name}-%{version}.tar.gz
-Patch0: Use-bpf_obj_get_info_by_fd-instead-of-bpf_btf_get_in.patch
-Patch1: libbpf-tools-add-block_io_-start-done-tracepoints-su.patch
-Patch2: tools-Add-support-for-the-new-block_io_-tracepoints.patch
-Patch3: tool-slabratetop-add-definition-of-freelist_aba_t.patch
+Patch0: libbpf-tools-Fix-bindsnoop-for-kernel-v6.6.patch
+Patch1: Fix-ttysnoop.py-with-newer-kernels.patch
+Patch2: Sync-with-latest-libbpf-repo-4889.patch
 
 # Arches will be included as upstream support is added and dependencies are
 # satisfied in the respective arches
@@ -242,6 +241,11 @@
 cp -a libbpf-tools/tmp-install/bin/* %{buildroot}/%{_sbindir}/
 %endif
 
 %changelog
+* Mon Feb 05 2024 Jerome Marchand - 0.29.1-1
+- Rebase to the latest release version (#2253688)
+- Enable libbpf-tools on s390x (#2249458)
+- Misc 0.29.1 fixes
+
 * Tue Jan 23 2024 Fedora Release Engineering - 0.28.0-3
 - Rebuilt for https://fedoraproject.org/wiki/Fedora_40_Mass_Rebuild
diff --git a/libbpf-tools-Fix-bindsnoop-for-kernel-v6.6.patch b/libbpf-tools-Fix-bindsnoop-for-kernel-v6.6.patch
new file mode 100644
index 0000000..5002141
--- /dev/null
+++ b/libbpf-tools-Fix-bindsnoop-for-kernel-v6.6.patch
@@ -0,0 +1,114 @@
+From abf7b251c1461dcbe0c1e75d1d0da71662c9fae1 Mon Sep 17 00:00:00 2001
+From: Hengqi Chen
+Date: Sun, 17 Dec 2023 11:27:10 +0000
+Subject: [PATCH] libbpf-tools: Fix bindsnoop for kernel v6.6+
+
+The freebind field in struct inet_sock is gone in recent kernel
+versions due to kernel refactoring work ([0]). The change
+breaks the bindsnoop tool. Fix it in a CO-RE way.
+
+This should close #4838.
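+
+The fix uses the CO-RE "flavor" idiom; condensed here from the
+core_fixes.bpf.h helpers below, both candidate layouts are declared
+locally with a triple-underscore suffix and the right one is picked
+at load time:
+
+	if (bpf_core_field_exists(struct inet_sock___o, freebind))
+		/* old layout: read the bitfield directly */
+		return BPF_CORE_READ_BITFIELD_PROBED(
+			(struct inet_sock___o *)inet_sock, freebind);
+	/* v6.6+ layout: the flag moved into inet_sock::inet_flags */
+	inet_flags = BPF_CORE_READ((struct inet_sock___x *)inet_sock, inet_flags);
+	return (1 << INET_FLAGS_FREEBIND___x) & inet_flags ? 1 : 0;
+
+libbpf ignores everything from the "___" onward when matching local
+types against kernel BTF, so both flavors relocate against the real
+struct inet_sock.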
+
+ [0]: https://lore.kernel.org/all/20230816081547.1272409-1-edumazet@google.com/
+
+Signed-off-by: Hengqi Chen
+---
+ libbpf-tools/bindsnoop.bpf.c | 8 +++--
+ libbpf-tools/core_fixes.bpf.h | 56 +++++++++++++++++++++++++++++++++++
+ 2 files changed, 61 insertions(+), 3 deletions(-)
+
+diff --git a/libbpf-tools/bindsnoop.bpf.c b/libbpf-tools/bindsnoop.bpf.c
+index 41dce942..ead19c67 100644
+--- a/libbpf-tools/bindsnoop.bpf.c
++++ b/libbpf-tools/bindsnoop.bpf.c
+@@ -5,7 +5,9 @@
+ #include
+ #include
+ #include
++
+ #include "bindsnoop.h"
++#include "core_fixes.bpf.h"
+
+ #define MAX_ENTRIES 10240
+ #define MAX_PORTS 1024
+@@ -85,9 +87,9 @@ static int probe_exit(struct pt_regs *ctx, short ver)
+ if (filter_by_port && !port)
+ goto cleanup;
+
+- opts.fields.freebind = BPF_CORE_READ_BITFIELD_PROBED(inet_sock, freebind);
+- opts.fields.transparent = BPF_CORE_READ_BITFIELD_PROBED(inet_sock, transparent);
+- opts.fields.bind_address_no_port = BPF_CORE_READ_BITFIELD_PROBED(inet_sock, bind_address_no_port);
++ opts.fields.freebind = get_inet_sock_freebind(inet_sock);
++ opts.fields.transparent = get_inet_sock_transparent(inet_sock);
++ opts.fields.bind_address_no_port = get_inet_sock_bind_address_no_port(inet_sock);
+ opts.fields.reuseaddress = BPF_CORE_READ_BITFIELD_PROBED(sock, __sk_common.skc_reuse);
+ opts.fields.reuseport = BPF_CORE_READ_BITFIELD_PROBED(sock, __sk_common.skc_reuseport);
+ event.opts = opts.data;
+diff --git a/libbpf-tools/core_fixes.bpf.h b/libbpf-tools/core_fixes.bpf.h
+index 84cb7f18..a4c84c02 100644
+--- a/libbpf-tools/core_fixes.bpf.h
++++ b/libbpf-tools/core_fixes.bpf.h
+@@ -249,4 +249,60 @@ static __always_inline __u64 get_sock_ident(struct sock *sk)
+ return (__u64)sk;
+ }
+
++/**
++ * During the kernel 6.6 development cycle, several bitfields in struct inet_sock
++ * went away; they are placed in inet_sock::inet_flags instead ([0]).
++ *
++ * References:
++ * [0]: https://lore.kernel.org/all/20230816081547.1272409-1-edumazet@google.com/
++ */
++struct inet_sock___o {
++ __u8 freebind: 1;
++ __u8 transparent: 1;
++ __u8 bind_address_no_port: 1;
++};
++
++enum {
++ INET_FLAGS_FREEBIND___x = 11,
++ INET_FLAGS_TRANSPARENT___x = 15,
++ INET_FLAGS_BIND_ADDRESS_NO_PORT___x = 18,
++};
++
++struct inet_sock___x {
++ unsigned long inet_flags;
++};
++
++static __always_inline __u8 get_inet_sock_freebind(void *inet_sock)
++{
++ unsigned long inet_flags;
++
++ if (bpf_core_field_exists(struct inet_sock___o, freebind))
++ return BPF_CORE_READ_BITFIELD_PROBED((struct inet_sock___o *)inet_sock, freebind);
++
++ inet_flags = BPF_CORE_READ((struct inet_sock___x *)inet_sock, inet_flags);
++ return (1 << INET_FLAGS_FREEBIND___x) & inet_flags ? 1 : 0;
++}
++
++static __always_inline __u8 get_inet_sock_transparent(void *inet_sock)
++{
++ unsigned long inet_flags;
++
++ if (bpf_core_field_exists(struct inet_sock___o, transparent))
++ return BPF_CORE_READ_BITFIELD_PROBED((struct inet_sock___o *)inet_sock, transparent);
++
++ inet_flags = BPF_CORE_READ((struct inet_sock___x *)inet_sock, inet_flags);
++ return (1 << INET_FLAGS_TRANSPARENT___x) & inet_flags ? 1 : 0;
++}
++
++static __always_inline __u8 get_inet_sock_bind_address_no_port(void *inet_sock)
++{
++ unsigned long inet_flags;
++
++ if (bpf_core_field_exists(struct inet_sock___o, bind_address_no_port))
++ return BPF_CORE_READ_BITFIELD_PROBED((struct inet_sock___o *)inet_sock, bind_address_no_port);
++
++ inet_flags = BPF_CORE_READ((struct inet_sock___x *)inet_sock, inet_flags);
++ return (1 << INET_FLAGS_BIND_ADDRESS_NO_PORT___x) & inet_flags ?
1 : 0; ++} ++ + #endif /* __CORE_FIXES_BPF_H */ +-- +2.43.0 + diff --git a/libbpf-tools-add-block_io_-start-done-tracepoints-su.patch b/libbpf-tools-add-block_io_-start-done-tracepoints-su.patch deleted file mode 100644 index f3468f4..0000000 --- a/libbpf-tools-add-block_io_-start-done-tracepoints-su.patch +++ /dev/null @@ -1,476 +0,0 @@ -From e1dfbe2d09583205acca1d1b5b09caefb460f2fd Mon Sep 17 00:00:00 2001 -From: mickey_zhu -Date: Tue, 27 Jun 2023 16:32:44 +0800 -Subject: [PATCH 1/2] libbpf-tools: add block_io_{start,done} tracepoints - support to bio tools - -Some bio tools fail to kprobe blk_account_io_{start,done} after v5.17, -because they become inlined, see [0]. To fix this issue, tracepoints -blick_io_{start,done} are introcuded in kernel, see[1]. - -Update related bio tools to support new tracepoints, and also simplify -attach. - -[0] Kernel commit 450b7879e345 (block: move blk_account_io_{start,done} to blk-mq.c) -[1] Kernel commit 5a80bd075f3b (block: introduce block_io_start/block_io_done tracepoints) - -Change-Id: I62b957abd7ce2901eb114bd57c78938e4f083e4d -Signed-off-by: Mickey Zhu ---- - libbpf-tools/biosnoop.bpf.c | 9 ++++ - libbpf-tools/biosnoop.c | 78 +++++++++++++-------------------- - libbpf-tools/biostacks.bpf.c | 46 +++++++++++++------ - libbpf-tools/biostacks.c | 85 +++++++++++++++++++++--------------- - libbpf-tools/biotop.bpf.c | 44 +++++++++++++++++-- - libbpf-tools/biotop.c | 59 ++++++++++++++++--------- - 6 files changed, 199 insertions(+), 122 deletions(-) - -diff --git a/libbpf-tools/biosnoop.bpf.c b/libbpf-tools/biosnoop.bpf.c -index b791555f..fcc5c5ce 100644 ---- a/libbpf-tools/biosnoop.bpf.c -+++ b/libbpf-tools/biosnoop.bpf.c -@@ -76,6 +76,15 @@ int BPF_PROG(blk_account_io_start, struct request *rq) - return trace_pid(rq); - } - -+SEC("tp_btf/block_io_start") -+int BPF_PROG(block_io_start, struct request *rq) -+{ -+ if (filter_cg && !bpf_current_task_under_cgroup(&cgroup_map, 0)) -+ return 0; -+ -+ return trace_pid(rq); -+} -+ - SEC("kprobe/blk_account_io_merge_bio") - int BPF_KPROBE(blk_account_io_merge_bio, struct request *rq) - { -diff --git a/libbpf-tools/biosnoop.c b/libbpf-tools/biosnoop.c -index 21773729..f9468900 100644 ---- a/libbpf-tools/biosnoop.c -+++ b/libbpf-tools/biosnoop.c -@@ -212,6 +212,16 @@ void handle_lost_events(void *ctx, int cpu, __u64 lost_cnt) - fprintf(stderr, "lost %llu events on CPU #%d\n", lost_cnt, cpu); - } - -+static void blk_account_io_set_attach_target(struct biosnoop_bpf *obj) -+{ -+ if (fentry_can_attach("blk_account_io_start", NULL)) -+ bpf_program__set_attach_target(obj->progs.blk_account_io_start, -+ 0, "blk_account_io_start"); -+ else -+ bpf_program__set_attach_target(obj->progs.blk_account_io_start, -+ 0, "__blk_account_io_start"); -+} -+ - int main(int argc, char **argv) - { - const struct partition *partition; -@@ -260,12 +270,23 @@ int main(int argc, char **argv) - obj->rodata->filter_cg = env.cg; - obj->rodata->min_ns = env.min_lat_ms * 1000000; - -- if (fentry_can_attach("blk_account_io_start", NULL)) -- bpf_program__set_attach_target(obj->progs.blk_account_io_start, 0, -- "blk_account_io_start"); -- else -- bpf_program__set_attach_target(obj->progs.blk_account_io_start, 0, -- "__blk_account_io_start"); -+ if (tracepoint_exists("block", "block_io_start")) -+ bpf_program__set_autoload(obj->progs.blk_account_io_start, false); -+ else { -+ bpf_program__set_autoload(obj->progs.block_io_start, false); -+ blk_account_io_set_attach_target(obj); -+ } -+ -+ ksyms = ksyms__load(); -+ if (!ksyms) { -+ fprintf(stderr, 
"failed to load kallsyms\n"); -+ goto cleanup; -+ } -+ if (!ksyms__get_symbol(ksyms, "blk_account_io_merge_bio")) -+ bpf_program__set_autoload(obj->progs.blk_account_io_merge_bio, false); -+ -+ if (!env.queued) -+ bpf_program__set_autoload(obj->progs.block_rq_insert, false); - - err = biosnoop_bpf__load(obj); - if (err) { -@@ -288,48 +309,9 @@ int main(int argc, char **argv) - } - } - -- obj->links.blk_account_io_start = bpf_program__attach(obj->progs.blk_account_io_start); -- if (!obj->links.blk_account_io_start) { -- err = -errno; -- fprintf(stderr, "failed to attach blk_account_io_start: %s\n", -- strerror(-err)); -- goto cleanup; -- } -- ksyms = ksyms__load(); -- if (!ksyms) { -- err = -ENOMEM; -- fprintf(stderr, "failed to load kallsyms\n"); -- goto cleanup; -- } -- if (ksyms__get_symbol(ksyms, "blk_account_io_merge_bio")) { -- obj->links.blk_account_io_merge_bio = -- bpf_program__attach(obj->progs.blk_account_io_merge_bio); -- if (!obj->links.blk_account_io_merge_bio) { -- err = -errno; -- fprintf(stderr, "failed to attach blk_account_io_merge_bio: %s\n", -- strerror(-err)); -- goto cleanup; -- } -- } -- if (env.queued) { -- obj->links.block_rq_insert = -- bpf_program__attach(obj->progs.block_rq_insert); -- if (!obj->links.block_rq_insert) { -- err = -errno; -- fprintf(stderr, "failed to attach block_rq_insert: %s\n", strerror(-err)); -- goto cleanup; -- } -- } -- obj->links.block_rq_issue = bpf_program__attach(obj->progs.block_rq_issue); -- if (!obj->links.block_rq_issue) { -- err = -errno; -- fprintf(stderr, "failed to attach block_rq_issue: %s\n", strerror(-err)); -- goto cleanup; -- } -- obj->links.block_rq_complete = bpf_program__attach(obj->progs.block_rq_complete); -- if (!obj->links.block_rq_complete) { -- err = -errno; -- fprintf(stderr, "failed to attach block_rq_complete: %s\n", strerror(-err)); -+ err = biosnoop_bpf__attach(obj); -+ if (err) { -+ fprintf(stderr, "failed to attach BPF programs: %d\n", err); - goto cleanup; - } - -diff --git a/libbpf-tools/biostacks.bpf.c b/libbpf-tools/biostacks.bpf.c -index c3950910..0ca69880 100644 ---- a/libbpf-tools/biostacks.bpf.c -+++ b/libbpf-tools/biostacks.bpf.c -@@ -67,20 +67,8 @@ int trace_start(void *ctx, struct request *rq, bool merge_bio) - return 0; - } - --SEC("fentry/blk_account_io_start") --int BPF_PROG(blk_account_io_start, struct request *rq) --{ -- return trace_start(ctx, rq, false); --} -- --SEC("kprobe/blk_account_io_merge_bio") --int BPF_KPROBE(blk_account_io_merge_bio, struct request *rq) --{ -- return trace_start(ctx, rq, true); --} -- --SEC("fentry/blk_account_io_done") --int BPF_PROG(blk_account_io_done, struct request *rq) -+static __always_inline -+int trace_done(void *ctx, struct request *rq) - { - u64 slot, ts = bpf_ktime_get_ns(); - struct internal_rqinfo *i_rqinfop; -@@ -110,4 +98,34 @@ int BPF_PROG(blk_account_io_done, struct request *rq) - return 0; - } - -+SEC("kprobe/blk_account_io_merge_bio") -+int BPF_KPROBE(blk_account_io_merge_bio, struct request *rq) -+{ -+ return trace_start(ctx, rq, true); -+} -+ -+SEC("fentry/blk_account_io_start") -+int BPF_PROG(blk_account_io_start, struct request *rq) -+{ -+ return trace_start(ctx, rq, false); -+} -+ -+SEC("fentry/blk_account_io_done") -+int BPF_PROG(blk_account_io_done, struct request *rq) -+{ -+ return trace_done(ctx, rq); -+} -+ -+SEC("tp_btf/block_io_start") -+int BPF_PROG(block_io_start, struct request *rq) -+{ -+ return trace_start(ctx, rq, false); -+} -+ -+SEC("tp_btf/block_io_done") -+int BPF_PROG(block_io_done, struct request *rq) -+{ -+ return 
trace_done(ctx, rq); -+} -+ - char LICENSE[] SEC("license") = "GPL"; -diff --git a/libbpf-tools/biostacks.c b/libbpf-tools/biostacks.c -index e1878d1f..e7875f76 100644 ---- a/libbpf-tools/biostacks.c -+++ b/libbpf-tools/biostacks.c -@@ -128,6 +128,39 @@ void print_map(struct ksyms *ksyms, struct partitions *partitions, int fd) - return; - } - -+static bool has_block_io_tracepoints(void) -+{ -+ return tracepoint_exists("block", "block_io_start") && -+ tracepoint_exists("block", "block_io_done"); -+} -+ -+static void disable_block_io_tracepoints(struct biostacks_bpf *obj) -+{ -+ bpf_program__set_autoload(obj->progs.block_io_start, false); -+ bpf_program__set_autoload(obj->progs.block_io_done, false); -+} -+ -+static void disable_blk_account_io_fentry(struct biostacks_bpf *obj) -+{ -+ bpf_program__set_autoload(obj->progs.blk_account_io_start, false); -+ bpf_program__set_autoload(obj->progs.blk_account_io_done, false); -+} -+ -+static void blk_account_io_set_attach_target(struct biostacks_bpf *obj) -+{ -+ if (fentry_can_attach("blk_account_io_start", NULL)) { -+ bpf_program__set_attach_target(obj->progs.blk_account_io_start, -+ 0, "blk_account_io_start"); -+ bpf_program__set_attach_target(obj->progs.blk_account_io_done, -+ 0, "blk_account_io_done"); -+ } else { -+ bpf_program__set_attach_target(obj->progs.blk_account_io_start, -+ 0, "__blk_account_io_start"); -+ bpf_program__set_attach_target(obj->progs.blk_account_io_done, -+ 0, "__blk_account_io_done"); -+ } -+} -+ - int main(int argc, char **argv) - { - struct partitions *partitions = NULL; -@@ -172,50 +205,30 @@ int main(int argc, char **argv) - - obj->rodata->targ_ms = env.milliseconds; - -- if (fentry_can_attach("blk_account_io_start", NULL)) { -- bpf_program__set_attach_target(obj->progs.blk_account_io_start, 0, -- "blk_account_io_start"); -- bpf_program__set_attach_target(obj->progs.blk_account_io_done, 0, -- "blk_account_io_done"); -- } else { -- bpf_program__set_attach_target(obj->progs.blk_account_io_start, 0, -- "__blk_account_io_start"); -- bpf_program__set_attach_target(obj->progs.blk_account_io_done, 0, -- "__blk_account_io_done"); -- } -- -- err = biostacks_bpf__load(obj); -- if (err) { -- fprintf(stderr, "failed to load BPF object: %d\n", err); -- goto cleanup; -+ if (has_block_io_tracepoints()) -+ disable_blk_account_io_fentry(obj); -+ else { -+ disable_block_io_tracepoints(obj); -+ blk_account_io_set_attach_target(obj); - } - -- obj->links.blk_account_io_start = bpf_program__attach(obj->progs.blk_account_io_start); -- if (!obj->links.blk_account_io_start) { -- err = -errno; -- fprintf(stderr, "failed to attach blk_account_io_start: %s\n", strerror(-err)); -- goto cleanup; -- } - ksyms = ksyms__load(); - if (!ksyms) { - fprintf(stderr, "failed to load kallsyms\n"); - goto cleanup; - } -- if (ksyms__get_symbol(ksyms, "blk_account_io_merge_bio")) { -- obj->links.blk_account_io_merge_bio = -- bpf_program__attach(obj->progs.blk_account_io_merge_bio); -- if (!obj->links.blk_account_io_merge_bio) { -- err = -errno; -- fprintf(stderr, "failed to attach blk_account_io_merge_bio: %s\n", -- strerror(-err)); -- goto cleanup; -- } -+ if (!ksyms__get_symbol(ksyms, "blk_account_io_merge_bio")) -+ bpf_program__set_autoload(obj->progs.blk_account_io_merge_bio, false); -+ -+ err = biostacks_bpf__load(obj); -+ if (err) { -+ fprintf(stderr, "failed to load BPF object: %d\n", err); -+ goto cleanup; - } -- obj->links.blk_account_io_done = bpf_program__attach(obj->progs.blk_account_io_done); -- if (!obj->links.blk_account_io_done) { -- err = 
-errno; -- fprintf(stderr, "failed to attach blk_account_io_done: %s\n", -- strerror(-err)); -+ -+ err = biostacks_bpf__attach(obj); -+ if (err) { -+ fprintf(stderr, "failed to attach BPF programs: %d\n", err); - goto cleanup; - } - -diff --git a/libbpf-tools/biotop.bpf.c b/libbpf-tools/biotop.bpf.c -index 226e32d3..07631378 100644 ---- a/libbpf-tools/biotop.bpf.c -+++ b/libbpf-tools/biotop.bpf.c -@@ -30,8 +30,8 @@ struct { - __type(value, struct val_t); - } counts SEC(".maps"); - --SEC("kprobe") --int BPF_KPROBE(blk_account_io_start, struct request *req) -+static __always_inline -+int trace_start(struct request *req) - { - struct who_t who = {}; - -@@ -56,8 +56,8 @@ int BPF_KPROBE(blk_mq_start_request, struct request *req) - return 0; - } - --SEC("kprobe") --int BPF_KPROBE(blk_account_io_done, struct request *req, u64 now) -+static __always_inline -+int trace_done(struct request *req) - { - struct val_t *valp, zero = {}; - struct info_t info = {}; -@@ -103,4 +103,40 @@ int BPF_KPROBE(blk_account_io_done, struct request *req, u64 now) - return 0; - } - -+SEC("kprobe/blk_account_io_start") -+int BPF_KPROBE(blk_account_io_start, struct request *req) -+{ -+ return trace_start(req); -+} -+ -+SEC("kprobe/blk_account_io_done") -+int BPF_KPROBE(blk_account_io_done, struct request *req) -+{ -+ return trace_done(req); -+} -+ -+SEC("kprobe/__blk_account_io_start") -+int BPF_KPROBE(__blk_account_io_start, struct request *req) -+{ -+ return trace_start(req); -+} -+ -+SEC("kprobe/__blk_account_io_done") -+int BPF_KPROBE(__blk_account_io_done, struct request *req) -+{ -+ return trace_done(req); -+} -+ -+SEC("tp_btf/block_io_start") -+int BPF_PROG(block_io_start, struct request *req) -+{ -+ return trace_start(req); -+} -+ -+SEC("tp_btf/block_io_done") -+int BPF_PROG(block_io_done, struct request *req) -+{ -+ return trace_done(req); -+} -+ - char LICENSE[] SEC("license") = "GPL"; -diff --git a/libbpf-tools/biotop.c b/libbpf-tools/biotop.c -index 75484281..5b3a7cf3 100644 ---- a/libbpf-tools/biotop.c -+++ b/libbpf-tools/biotop.c -@@ -354,6 +354,38 @@ static int print_stat(struct biotop_bpf *obj) - return err; - } - -+static bool has_block_io_tracepoints(void) -+{ -+ return tracepoint_exists("block", "block_io_start") && -+ tracepoint_exists("block", "block_io_done"); -+} -+ -+static void disable_block_io_tracepoints(struct biotop_bpf *obj) -+{ -+ bpf_program__set_autoload(obj->progs.block_io_start, false); -+ bpf_program__set_autoload(obj->progs.block_io_done, false); -+} -+ -+static void disable_blk_account_io_kprobes(struct biotop_bpf *obj) -+{ -+ bpf_program__set_autoload(obj->progs.blk_account_io_start, false); -+ bpf_program__set_autoload(obj->progs.blk_account_io_done, false); -+ bpf_program__set_autoload(obj->progs.__blk_account_io_start, false); -+ bpf_program__set_autoload(obj->progs.__blk_account_io_done, false); -+} -+ -+static void blk_account_io_set_autoload(struct biotop_bpf *obj, -+ struct ksyms *ksyms) -+{ -+ if (!ksyms__get_symbol(ksyms, "__blk_account_io_start")) { -+ bpf_program__set_autoload(obj->progs.__blk_account_io_start, false); -+ bpf_program__set_autoload(obj->progs.__blk_account_io_done, false); -+ } else { -+ bpf_program__set_autoload(obj->progs.blk_account_io_start, false); -+ bpf_program__set_autoload(obj->progs.blk_account_io_done, false); -+ } -+} -+ - int main(int argc, char **argv) - { - static const struct argp argp = { -@@ -386,32 +418,19 @@ int main(int argc, char **argv) - goto cleanup; - } - -+ if (has_block_io_tracepoints()) -+ disable_blk_account_io_kprobes(obj); 
-+ else { -+ disable_block_io_tracepoints(obj); -+ blk_account_io_set_autoload(obj, ksyms); -+ } -+ - err = biotop_bpf__load(obj); - if (err) { - warn("failed to load BPF object: %d\n", err); - goto cleanup; - } - -- if (ksyms__get_symbol(ksyms, "__blk_account_io_start")) -- obj->links.blk_account_io_start = bpf_program__attach_kprobe(obj->progs.blk_account_io_start, false, "__blk_account_io_start"); -- else -- obj->links.blk_account_io_start = bpf_program__attach_kprobe(obj->progs.blk_account_io_start, false, "blk_account_io_start"); -- -- if (!obj->links.blk_account_io_start) { -- warn("failed to load attach blk_account_io_start\n"); -- goto cleanup; -- } -- -- if (ksyms__get_symbol(ksyms, "__blk_account_io_done")) -- obj->links.blk_account_io_done = bpf_program__attach_kprobe(obj->progs.blk_account_io_done, false, "__blk_account_io_done"); -- else -- obj->links.blk_account_io_done = bpf_program__attach_kprobe(obj->progs.blk_account_io_done, false, "blk_account_io_done"); -- -- if (!obj->links.blk_account_io_done) { -- warn("failed to load attach blk_account_io_done\n"); -- goto cleanup; -- } -- - err = biotop_bpf__attach(obj); - if (err) { - warn("failed to attach BPF programs: %d\n", err); --- -2.41.0 - diff --git a/sources b/sources index def4132..ab57345 100644 --- a/sources +++ b/sources @@ -1 +1 @@ -SHA512 (bcc-0.28.0.tar.gz) = 792ce93dba64b1f87390b2602dcaeba04ac8b2863652b06eb9a907b93bc6137a944b856cc6fa9c7a38671c89814740967561ca4f3b29c267babca7dc5e78aa02 +SHA512 (bcc-0.29.1.tar.gz) = 9e60130ea602e19e6c6f88a8c17023cea5daf4c5bcc7af8816e9f5c662341136eb449a3fdf870ffad215495ac3bf895115c0d968d92ce79ebe2899b3e2464d24 diff --git a/tool-slabratetop-add-definition-of-freelist_aba_t.patch b/tool-slabratetop-add-definition-of-freelist_aba_t.patch deleted file mode 100644 index fcbf1e5..0000000 --- a/tool-slabratetop-add-definition-of-freelist_aba_t.patch +++ /dev/null @@ -1,55 +0,0 @@ -From 59a1fccfc78482af189150b7937b21244f34e48a Mon Sep 17 00:00:00 2001 -From: Jerome Marchand -Date: Thu, 3 Aug 2023 16:11:50 +0200 -Subject: [PATCH] tool/slabratetop: add definition of freelist_aba_t - -With recent kernel containing the commit 6801be4f2653 ("slub: Replace -cmpxchg_double()"), slabratetop fails to compiles with the following -error: - -In file included from /virtual/main.c:86: -include/linux/slub_def.h:56:3: error: unknown type name 'freelist_aba_t' - freelist_aba_t freelist_tid; - ^ -2 warnings and 1 error generated. -Traceback (most recent call last): - File "/usr/share/bcc/tools/slabratetop", line 187, in - b = BPF(text=bpf_text) - ^^^^^^^^^^^^^^^^^^ - File "/usr/lib/python3.12/site-packages/bcc/__init__.py", line 479, in __init__ - raise Exception("Failed to compile BPF module %s" % (src_file or "")) -Exception: Failed to compile BPF module - -Adding the definition of freelist_aba_t fixes the issue. 
---- - tools/slabratetop.py | 14 ++++++++++++++ - 1 file changed, 14 insertions(+) - -diff --git a/tools/slabratetop.py b/tools/slabratetop.py -index 8fbcac5e..8a7d486e 100755 ---- a/tools/slabratetop.py -+++ b/tools/slabratetop.py -@@ -141,6 +141,20 @@ static inline void *slab_address(const struct slab *slab) - return NULL; - } - -+#ifdef CONFIG_64BIT -+typedef __uint128_t freelist_full_t; -+#else -+typedef u64 freelist_full_t; -+#endif -+ -+typedef union { -+ struct { -+ void *freelist; -+ unsigned long counter; -+ }; -+ freelist_full_t full; -+} freelist_aba_t; -+ - #ifdef CONFIG_SLUB - #include - #else --- -2.41.0 - diff --git a/tools-Add-support-for-the-new-block_io_-tracepoints.patch b/tools-Add-support-for-the-new-block_io_-tracepoints.patch deleted file mode 100644 index e8dfa78..0000000 --- a/tools-Add-support-for-the-new-block_io_-tracepoints.patch +++ /dev/null @@ -1,855 +0,0 @@ -From 53ef33b5ad42e6a4baa37821119199f2d846beff Mon Sep 17 00:00:00 2001 -From: Jerome Marchand -Date: Thu, 27 Jul 2023 18:19:18 +0200 -Subject: [PATCH 2/2] tools: Add support for the new block_io_* tracepoints - -The bio tools currently depends on blk_account_io_done/start functions -that can be inlined. To fix that, a couple of tracepoints have been -added upstream (block:block_io_start/done). This patch add the support -for those tracepoints when they are available. - -Unfortunately, the bio tools relies on data that is not available to -the tracepoints (mostly the struct request). So the tracepoints can't -be used as drop in replacement for blk_account_io_*. Main difference, -is that we can't use the struct request as the hash key anymore, so it -now uses the couple (dev_t, sector) for that purpose. - -For the biolatency tool, the -F option is disabled when only the -tracepoints are available because the flags are not all accessible -from the tracepoints. Otherwise, all features of the tools should -remain. - -Closes #4261 - -Signed-off-by: Jerome Marchand ---- - tools/biolatency.py | 166 ++++++++++++++++++++++++++++-------- - tools/biosnoop.py | 200 +++++++++++++++++++++++++++++++++----------- - tools/biotop.py | 108 +++++++++++++++++++----- - 3 files changed, 371 insertions(+), 103 deletions(-) - -diff --git a/tools/biolatency.py b/tools/biolatency.py -index 8fe43a7c..03b48a4c 100755 ---- a/tools/biolatency.py -+++ b/tools/biolatency.py -@@ -11,6 +11,7 @@ - # - # 20-Sep-2015 Brendan Gregg Created this. - # 31-Mar-2022 Rocky Xing Added disk filter support. 
-+# 01-Aug-2023 Jerome Marchand Added support for block tracepoints
- 
- from __future__ import print_function
- from bcc import BPF
-@@ -72,7 +73,7 @@ bpf_text = """
- #include <linux/blkdev.h>
- 
- typedef struct disk_key {
--    char disk[DISK_NAME_LEN];
-+    dev_t dev;
-     u64 slot;
- } disk_key_t;
- 
-@@ -86,26 +87,70 @@ typedef struct ext_val {
-     u64 count;
- } ext_val_t;
- 
--BPF_HASH(start, struct request *);
-+struct tp_args {
-+    u64 __unused__;
-+    dev_t dev;
-+    sector_t sector;
-+    unsigned int nr_sector;
-+    unsigned int bytes;
-+    char rwbs[8];
-+    char comm[16];
-+    char cmd[];
-+};
-+
-+struct start_key {
-+    dev_t dev;
-+    u32 _pad;
-+    sector_t sector;
-+    CMD_FLAGS
-+};
-+
-+BPF_HASH(start, struct start_key);
- STORAGE
- 
-+static dev_t ddevt(struct gendisk *disk) {
-+    return (disk->major << 20) | disk->first_minor;
-+}
-+
- // time block I/O
--int trace_req_start(struct pt_regs *ctx, struct request *req)
-+static int __trace_req_start(struct start_key key)
- {
-     DISK_FILTER
- 
-     u64 ts = bpf_ktime_get_ns();
--    start.update(&req, &ts);
-+    start.update(&key, &ts);
-     return 0;
- }
- 
-+int trace_req_start(struct pt_regs *ctx, struct request *req)
-+{
-+    struct start_key key = {
-+        .dev = ddevt(req->__RQ_DISK__),
-+        .sector = req->__sector
-+    };
-+
-+    SET_FLAGS
-+
-+    return __trace_req_start(key);
-+}
-+
-+int trace_req_start_tp(struct tp_args *args)
-+{
-+    struct start_key key = {
-+        .dev = args->dev,
-+        .sector = args->sector
-+    };
-+
-+    return __trace_req_start(key);
-+}
-+
- // output
--int trace_req_done(struct pt_regs *ctx, struct request *req)
-+static int __trace_req_done(struct start_key key)
- {
-     u64 *tsp, delta;
- 
-     // fetch timestamp and calculate delta
--    tsp = start.lookup(&req);
-+    tsp = start.lookup(&key);
-     if (tsp == 0) {
-         return 0;   // missed issue
-     }
-@@ -116,9 +161,31 @@ int trace_req_done(struct pt_regs *ctx, struct request *req)
-     // store as histogram
-     STORE
- 
--    start.delete(&req);
-+    start.delete(&key);
-     return 0;
- }
-+
-+int trace_req_done(struct pt_regs *ctx, struct request *req)
-+{
-+    struct start_key key = {
-+        .dev = ddevt(req->__RQ_DISK__),
-+        .sector = req->__sector
-+    };
-+
-+    SET_FLAGS
-+
-+    return __trace_req_done(key);
-+}
-+
-+int trace_req_done_tp(struct tp_args *args)
-+{
-+    struct start_key key = {
-+        .dev = args->dev,
-+        .sector = args->sector
-+    };
-+
-+    return __trace_req_done(key);
-+}
- """
- 
- # code substitutions
-@@ -134,21 +201,18 @@ store_str = ""
- if args.disks:
-     storage_str += "BPF_HISTOGRAM(dist, disk_key_t);"
-     disks_str = """
--    disk_key_t key = {.slot = bpf_log2l(delta)};
--    void *__tmp = (void *)req->__RQ_DISK__->disk_name;
--    bpf_probe_read(&key.disk, sizeof(key.disk), __tmp);
--    dist.atomic_increment(key);
-+    disk_key_t dkey = {};
-+    dkey.dev = key.dev;
-+    dkey.slot = bpf_log2l(delta);
-+    dist.atomic_increment(dkey);
-     """
--    if BPF.kernel_struct_has_field(b'request', b'rq_disk') == 1:
--        store_str += disks_str.replace('__RQ_DISK__', 'rq_disk')
--    else:
--        store_str += disks_str.replace('__RQ_DISK__', 'q->disk')
-+    store_str += disks_str
- elif args.flags:
-     storage_str += "BPF_HISTOGRAM(dist, flag_key_t);"
-     store_str += """
--    flag_key_t key = {.slot = bpf_log2l(delta)};
--    key.flags = req->cmd_flags;
--    dist.atomic_increment(key);
-+    flag_key_t fkey = {.slot = bpf_log2l(delta)};
-+    fkey.flags = key.flags;
-+    dist.atomic_increment(fkey);
-     """
- else:
-     storage_str += "BPF_HISTOGRAM(dist);"
-@@ -161,21 +225,13 @@ store_str = ""
-         exit(1)
- 
-     stat_info = os.stat(disk_path)
--    major = os.major(stat_info.st_rdev)
--    minor = os.minor(stat_info.st_rdev)
--
--    disk_field_str = ""
--    if BPF.kernel_struct_has_field(b'request', b'rq_disk') == 1:
--        disk_field_str = 'req->rq_disk'
--    else:
--        disk_field_str = 'req->q->disk'
-+    dev = os.major(stat_info.st_rdev) << 20 | os.minor(stat_info.st_rdev)
- 
-     disk_filter_str = """
--    struct gendisk *disk = %s;
--    if (!(disk->major == %d && disk->first_minor == %d)) {
-+    if(key.dev != %s) {
-         return 0;
-     }
--    """ % (disk_field_str, major, minor)
-+    """ % (dev)
- 
-     bpf_text = bpf_text.replace('DISK_FILTER', disk_filter_str)
- else:
-@@ -194,6 +250,16 @@ store_str = ""
- 
- bpf_text = bpf_text.replace("STORAGE", storage_str)
- bpf_text = bpf_text.replace("STORE", store_str)
-+if BPF.kernel_struct_has_field(b'request', b'rq_disk') == 1:
-+    bpf_text = bpf_text.replace('__RQ_DISK__', 'rq_disk')
-+else:
-+    bpf_text = bpf_text.replace('__RQ_DISK__', 'q->disk')
-+if args.flags:
-+    bpf_text = bpf_text.replace('CMD_FLAGS', 'u64 flags;')
-+    bpf_text = bpf_text.replace('SET_FLAGS', 'key.flags = req->cmd_flags;')
-+else:
-+    bpf_text = bpf_text.replace('CMD_FLAGS', '')
-+    bpf_text = bpf_text.replace('SET_FLAGS', '')
- 
- if debug or args.ebpf:
-     print(bpf_text)
-@@ -205,25 +271,53 @@ b = BPF(text=bpf_text)
- if args.queued:
-     if BPF.get_kprobe_functions(b'__blk_account_io_start'):
-         b.attach_kprobe(event="__blk_account_io_start", fn_name="trace_req_start")
--    else:
-+    elif BPF.get_kprobe_functions(b'blk_account_io_start'):
-         b.attach_kprobe(event="blk_account_io_start", fn_name="trace_req_start")
-+    else:
-+        if args.flags:
-+            # Some flags are accessible in the rwbs field (RAHEAD, SYNC and META)
-+            # but others aren't. Disable the -F option for tracepoint for now.
-+            print("ERROR: blk_account_io_start probe not available. Can't use -F.")
-+            exit()
-+        b.attach_tracepoint(tp="block:block_io_start", fn_name="trace_req_start_tp")
- else:
-     if BPF.get_kprobe_functions(b'blk_start_request'):
-         b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
-     b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start")
-+
- if BPF.get_kprobe_functions(b'__blk_account_io_done'):
-     b.attach_kprobe(event="__blk_account_io_done", fn_name="trace_req_done")
--else:
-+elif BPF.get_kprobe_functions(b'blk_account_io_done'):
-     b.attach_kprobe(event="blk_account_io_done", fn_name="trace_req_done")
-+else:
-+    if args.flags:
-+        print("ERROR: blk_account_io_done probe not available. Can't use -F.")
-+        exit()
-+    b.attach_tracepoint(tp="block:block_io_done", fn_name="trace_req_done_tp")
-+
- 
- if not args.json:
-     print("Tracing block device I/O... Hit Ctrl-C to end.")
- 
--def disk_print(s):
--    disk = s.decode('utf-8', 'replace')
--    if not disk:
--        disk = ""
--    return disk
-+# cache disk major,minor -> diskname
-+diskstats = "/proc/diskstats"
-+disklookup = {}
-+with open(diskstats) as stats:
-+    for line in stats:
-+        a = line.split()
-+        disklookup[a[0] + "," + a[1]] = a[2]
-+
-+def disk_print(d):
-+    major = d >> 20
-+    minor = d & ((1 << 20) - 1)
-+
-+    disk = str(major) + "," + str(minor)
-+    if disk in disklookup:
-+        diskname = disklookup[disk]
-+    else:
-+        diskname = "?"
-+
-+    return diskname
- 
- # see blk_fill_rwbs():
- req_opf = {
-diff --git a/tools/biosnoop.py b/tools/biosnoop.py
-index 33703233..f0fef98b 100755
---- a/tools/biosnoop.py
-+++ b/tools/biosnoop.py
-@@ -14,6 +14,7 @@
- # 11-Feb-2016 Allan McAleavy updated for BPF_PERF_OUTPUT
- # 21-Jun-2022 Rocky Xing Added disk filter support.
- # 13-Oct-2022 Rocky Xing Added support for displaying block I/O pattern.
-+# 01-Aug-2023 Jerome Marchand Added support for block tracepoints
- 
- from __future__ import print_function
- from bcc import BPF
-@@ -64,6 +65,24 @@ struct val_t {
-     char name[TASK_COMM_LEN];
- };
- 
-+struct tp_args {
-+    u64 __unused__;
-+    dev_t dev;
-+    sector_t sector;
-+    unsigned int nr_sector;
-+    unsigned int bytes;
-+    char rwbs[8];
-+    char comm[16];
-+    char cmd[];
-+};
-+
-+struct hash_key {
-+    dev_t dev;
-+    u32 rwflag;
-+    sector_t sector;
-+};
-+
-+
- #ifdef INCLUDE_PATTERN
- struct sector_key_t {
-     u32 dev_major;
-@@ -79,6 +98,7 @@ enum bio_pattern {
- 
- struct data_t {
-     u32 pid;
-+    u32 dev;
-     u64 rwflag;
-     u64 delta;
-     u64 qdelta;
-@@ -88,7 +108,6 @@ struct data_t {
-     enum bio_pattern pattern;
- #endif
-     u64 ts;
--    char disk_name[DISK_NAME_LEN];
-     char name[TASK_COMM_LEN];
- };
- 
-@@ -96,12 +115,45 @@ struct data_t {
- BPF_HASH(last_sectors, struct sector_key_t, u64);
- #endif
- 
--BPF_HASH(start, struct request *, struct start_req_t);
--BPF_HASH(infobyreq, struct request *, struct val_t);
-+BPF_HASH(start, struct hash_key, struct start_req_t);
-+BPF_HASH(infobyreq, struct hash_key, struct val_t);
- BPF_PERF_OUTPUT(events);
- 
-+static dev_t ddevt(struct gendisk *disk) {
-+    return (disk->major << 20) | disk->first_minor;
-+}
-+
-+/*
-+ * The following deals with a kernel version change (in mainline 4.7, although
-+ * it may be backported to earlier kernels) with how block request write flags
-+ * are tested. We handle both pre- and post-change versions here. Please avoid
-+ * kernel version tests like this as much as possible: they inflate the code,
-+ * test, and maintenance burden.
-+ */
-+static int get_rwflag(u32 cmd_flags) {
-+#ifdef REQ_WRITE
-+    return !!(cmd_flags & REQ_WRITE);
-+#elif defined(REQ_OP_SHIFT)
-+    return !!((cmd_flags >> REQ_OP_SHIFT) == REQ_OP_WRITE);
-+#else
-+    return !!((cmd_flags & REQ_OP_MASK) == REQ_OP_WRITE);
-+#endif
-+}
-+
-+#define RWBS_LEN 8
-+
-+static int get_rwflag_tp(char *rwbs) {
-+    for (int i = 0; i < RWBS_LEN; i++) {
-+        if (rwbs[i] == 'W')
-+            return 1;
-+        if (rwbs[i] == '\\0')
-+            return 0;
-+    }
-+    return 0;
-+}
-+
- // cache PID and comm by-req
--int trace_pid_start(struct pt_regs *ctx, struct request *req)
-+static int __trace_pid_start(struct hash_key key)
- {
-     DISK_FILTER
- 
-@@ -113,47 +165,76 @@ int trace_pid_start(struct pt_regs *ctx, struct request *req)
-         if (##QUEUE##) {
-             val.ts = bpf_ktime_get_ns();
-         }
--        infobyreq.update(&req, &val);
-+        infobyreq.update(&key, &val);
-     }
-     return 0;
- }
- 
-+
-+int trace_pid_start(struct pt_regs *ctx, struct request *req)
-+{
-+    struct hash_key key = {
-+        .dev = ddevt(req->__RQ_DISK__),
-+        .rwflag = get_rwflag(req->cmd_flags),
-+        .sector = req->__sector
-+    };
-+
-+    return __trace_pid_start(key);
-+}
-+
-+int trace_pid_start_tp(struct tp_args *args)
-+{
-+    struct hash_key key = {
-+        .dev = args->dev,
-+        .rwflag = get_rwflag_tp(args->rwbs),
-+        .sector = args->sector
-+    };
-+
-+    return __trace_pid_start(key);
-+}
-+
- // time block I/O
- int trace_req_start(struct pt_regs *ctx, struct request *req)
- {
-+    struct hash_key key = {
-+        .dev = ddevt(req->__RQ_DISK__),
-+        .rwflag = get_rwflag(req->cmd_flags),
-+        .sector = req->__sector
-+    };
-+
-     DISK_FILTER
- 
-     struct start_req_t start_req = {
-         .ts = bpf_ktime_get_ns(),
-         .data_len = req->__data_len
-     };
--    start.update(&req, &start_req);
-+    start.update(&key, &start_req);
-     return 0;
- }
- 
- // output
--int trace_req_completion(struct pt_regs *ctx, struct request *req)
-+static int __trace_req_completion(void *ctx, struct hash_key key)
- {
-     struct start_req_t *startp;
-     struct val_t *valp;
-     struct data_t data = {};
--    struct gendisk *rq_disk;
-+    //struct gendisk *rq_disk;
-     u64 ts;
- 
-     // fetch timestamp and calculate delta
--    startp = start.lookup(&req);
-+    startp = start.lookup(&key);
-     if (startp == 0) {
-         // missed tracing issue
-         return 0;
-     }
-     ts = bpf_ktime_get_ns();
--    rq_disk = req->__RQ_DISK__;
-+    //rq_disk = req->__RQ_DISK__;
-     data.delta = ts - startp->ts;
-     data.ts = ts / 1000;
-     data.qdelta = 0;
-     data.len = startp->data_len;
- 
--    valp = infobyreq.lookup(&req);
-+    valp = infobyreq.lookup(&key);
-     if (valp == 0) {
-         data.name[0] = '?';
-         data.name[1] = 0;
-@@ -162,10 +243,9 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
-             data.qdelta = startp->ts - valp->ts;
-         }
-         data.pid = valp->pid;
--        data.sector = req->__sector;
-+        data.sector = key.sector;
-+        data.dev = key.dev;
-         bpf_probe_read_kernel(&data.name, sizeof(data.name), valp->name);
--        bpf_probe_read_kernel(&data.disk_name, sizeof(data.disk_name),
--                              rq_disk->disk_name);
-     }
- 
- #ifdef INCLUDE_PATTERN
-@@ -174,8 +254,8 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
-     u64 *sector, last_sector;
- 
-     struct sector_key_t sector_key = {
--        .dev_major = rq_disk->major,
--        .dev_minor = rq_disk->first_minor
-+        .dev_major = key.dev >> 20,
-+        .dev_minor = key.dev & ((1 << 20) - 1)
-     };
- 
-     sector = last_sectors.lookup(&sector_key);
-@@ -187,27 +267,36 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
-     last_sectors.update(&sector_key, &last_sector);
- #endif
- 
--/*
-- * The following deals with a kernel version change (in mainline 4.7, although
-- * it may be backported to earlier kernels) with how block request write flags
-- * are tested. We handle both pre- and post-change versions here. Please avoid
-- * kernel version tests like this as much as possible: they inflate the code,
-- * test, and maintenance burden.
-- */
--#ifdef REQ_WRITE
--    data.rwflag = !!(req->cmd_flags & REQ_WRITE);
--#elif defined(REQ_OP_SHIFT)
--    data.rwflag = !!((req->cmd_flags >> REQ_OP_SHIFT) == REQ_OP_WRITE);
--#else
--    data.rwflag = !!((req->cmd_flags & REQ_OP_MASK) == REQ_OP_WRITE);
--#endif
-+    data.rwflag = key.rwflag;
- 
-     events.perf_submit(ctx, &data, sizeof(data));
--    start.delete(&req);
--    infobyreq.delete(&req);
-+    start.delete(&key);
-+    infobyreq.delete(&key);
- 
-     return 0;
- }
-+
-+int trace_req_completion(struct pt_regs *ctx, struct request *req)
-+{
-+    struct hash_key key = {
-+        .dev = ddevt(req->__RQ_DISK__),
-+        .rwflag = get_rwflag(req->cmd_flags),
-+        .sector = req->__sector
-+    };
-+
-+    return __trace_req_completion(ctx, key);
-+}
-+
-+int trace_req_completion_tp(struct tp_args *args)
-+{
-+    struct hash_key key = {
-+        .dev = args->dev,
-+        .rwflag = get_rwflag_tp(args->rwbs),
-+        .sector = args->sector
-+    };
-+
-+    return __trace_req_completion(args, key);
-+}
- """
- if args.queue:
-     bpf_text = bpf_text.replace('##QUEUE##', '1')
-@@ -225,21 +314,13 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
-         exit(1)
- 
-     stat_info = os.stat(disk_path)
--    major = os.major(stat_info.st_rdev)
--    minor = os.minor(stat_info.st_rdev)
--
--    disk_field_str = ""
--    if BPF.kernel_struct_has_field(b'request', b'rq_disk') == 1:
--        disk_field_str = 'req->rq_disk'
--    else:
--        disk_field_str = 'req->q->disk'
-+    dev = os.major(stat_info.st_rdev) << 20 | os.minor(stat_info.st_rdev)
- 
-     disk_filter_str = """
--    struct gendisk *disk = %s;
--    if (!(disk->major == %d && disk->first_minor == %d)) {
-+    if(key.dev != %s) {
-         return 0;
-     }
--    """ % (disk_field_str, major, minor)
-+    """ % (dev)
- 
-     bpf_text = bpf_text.replace('DISK_FILTER', disk_filter_str)
- else:
-@@ -254,15 +335,19 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
- b = BPF(text=bpf_text)
- if BPF.get_kprobe_functions(b'__blk_account_io_start'):
-     b.attach_kprobe(event="__blk_account_io_start", fn_name="trace_pid_start")
--else:
-+elif BPF.get_kprobe_functions(b'blk_account_io_start'):
-     b.attach_kprobe(event="blk_account_io_start", fn_name="trace_pid_start")
-+else:
-+    b.attach_tracepoint(tp="block:block_io_start", fn_name="trace_pid_start_tp")
- if BPF.get_kprobe_functions(b'blk_start_request'):
-     b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
- b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start")
- if BPF.get_kprobe_functions(b'__blk_account_io_done'):
-     b.attach_kprobe(event="__blk_account_io_done", fn_name="trace_req_completion")
--else:
-+elif BPF.get_kprobe_functions(b'blk_account_io_done'):
-     b.attach_kprobe(event="blk_account_io_done", fn_name="trace_req_completion")
-+else:
-+    b.attach_tracepoint(tp="block:block_io_done", fn_name="trace_req_completion_tp")
- 
- # header
- print("%-11s %-14s %-7s %-9s %-1s %-10s %-7s" % ("TIME(s)", "COMM", "PID",
-@@ -273,6 +358,27 @@ print("%-11s %-14s %-7s %-9s %-1s %-10s %-7s" % ("TIME(s)", "COMM", "PID",
-     print("%7s " % ("QUE(ms)"), end="")
- print("%7s" % "LAT(ms)")
- 
-+
-+# cache disk major,minor -> diskname
-+diskstats = "/proc/diskstats"
-+disklookup = {}
-+with open(diskstats) as stats:
-+    for line in stats:
-+        a = line.split()
-+        disklookup[a[0] + "," + a[1]] = a[2]
-+
-+def disk_print(d):
-+    major = d >> 20
-+    minor = d & ((1 << 20) - 1)
-+
-+    disk = str(major) + "," + str(minor)
-+    if disk in disklookup:
-+        diskname = disklookup[disk]
-+    else:
-+        diskname = ""
-+
-+    return diskname
-+
- rwflg = ""
- pattern = ""
- start_ts = 0
-@@ -297,9 +403,7 @@ P_RANDOM = 2
- 
-     delta = float(event.ts) - start_ts
- 
--    disk_name = event.disk_name.decode('utf-8', 'replace')
--    if not disk_name:
--        disk_name = ''
-+    disk_name = disk_print(event.dev)
- 
-     print("%-11.6f %-14.14s %-7s %-9s %-1s %-10s %-7s" % (
-         delta / 1000000, event.name.decode('utf-8', 'replace'), event.pid,
-diff --git a/tools/biotop.py b/tools/biotop.py
-index fcdd373f..2620983a 100755
---- a/tools/biotop.py
-+++ b/tools/biotop.py
-@@ -14,6 +14,7 @@
- #
- # 06-Feb-2016 Brendan Gregg Created this.
- # 17-Mar-2022 Rocky Xing Added PID filter support.
-+# 01-Aug-2023 Jerome Marchand Added support for block tracepoints
- 
- from __future__ import print_function
- from bcc import BPF
-@@ -88,14 +89,35 @@ struct val_t {
-     u32 io;
- };
- 
--BPF_HASH(start, struct request *, struct start_req_t);
--BPF_HASH(whobyreq, struct request *, struct who_t);
-+struct tp_args {
-+    u64 __unused__;
-+    dev_t dev;
-+    sector_t sector;
-+    unsigned int nr_sector;
-+    unsigned int bytes;
-+    char rwbs[8];
-+    char comm[16];
-+    char cmd[];
-+};
-+
-+struct hash_key {
-+    dev_t dev;
-+    u32 _pad;
-+    sector_t sector;
-+};
-+
-+BPF_HASH(start, struct hash_key, struct start_req_t);
-+BPF_HASH(whobyreq, struct hash_key, struct who_t);
- BPF_HASH(counts, struct info_t, struct val_t);
- 
-+static dev_t ddevt(struct gendisk *disk) {
-+    return (disk->major << 20) | disk->first_minor;
-+}
-+
- // cache PID and comm by-req
--int trace_pid_start(struct pt_regs *ctx, struct request *req)
-+static int __trace_pid_start(struct hash_key key)
- {
--    struct who_t who = {};
-+    struct who_t who;
-     u32 pid;
- 
-     if (bpf_get_current_comm(&who.name, sizeof(who.name)) == 0) {
-@@ -104,30 +126,54 @@ int trace_pid_start(struct pt_regs *ctx, struct request *req)
-             return 0;
- 
-         who.pid = pid;
--        whobyreq.update(&req, &who);
-+        whobyreq.update(&key, &who);
-     }
- 
-     return 0;
- }
- 
-+int trace_pid_start(struct pt_regs *ctx, struct request *req)
-+{
-+    struct hash_key key = {
-+        .dev = ddevt(req->__RQ_DISK__),
-+        .sector = req->__sector
-+    };
-+
-+    return __trace_pid_start(key);
-+}
-+
-+int trace_pid_start_tp(struct tp_args *args)
-+{
-+    struct hash_key key = {
-+        .dev = args->dev,
-+        .sector = args->sector
-+    };
-+
-+    return __trace_pid_start(key);
-+}
-+
- // time block I/O
- int trace_req_start(struct pt_regs *ctx, struct request *req)
- {
-+    struct hash_key key = {
-+        .dev = ddevt(req->__RQ_DISK__),
-+        .sector = req->__sector
-+    };
-     struct start_req_t start_req = {
-         .ts = bpf_ktime_get_ns(),
-         .data_len = req->__data_len
-     };
--    start.update(&req, &start_req);
-+    start.update(&key, &start_req);
-     return 0;
- }
- 
- // output
--int trace_req_completion(struct pt_regs *ctx, struct request *req)
-+static int __trace_req_completion(struct hash_key key)
- {
-     struct start_req_t *startp;
- 
-     // fetch timestamp and calculate delta
--    startp = start.lookup(&req);
-+    startp = start.lookup(&key);
-     if (startp == 0) {
-         return 0;    // missed tracing issue
-     }
-@@ -135,12 +181,12 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
-     struct who_t *whop;
-     u32 pid;
- 
--    whop = whobyreq.lookup(&req);
-+    whop = whobyreq.lookup(&key);
-     pid = whop != 0 ? whop->pid : 0;
-     if (FILTER_PID) {
--        start.delete(&req);
-+        start.delete(&key);
-         if (whop != 0) {
--            whobyreq.delete(&req);
-+            whobyreq.delete(&key);
-         }
-         return 0;
-     }
-@@ -150,8 +196,8 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
- 
-     // setup info_t key
-     struct info_t info = {};
--    info.major = req->__RQ_DISK__->major;
--    info.minor = req->__RQ_DISK__->first_minor;
-+    info.major = key.dev >> 20;
-+    info.minor = key.dev & ((1 << 20) - 1);
- /*
-  * The following deals with a kernel version change (in mainline 4.7, although
-  * it may be backported to earlier kernels) with how block request write flags
-  * are tested. We handle both pre- and post-change versions here. Please avoid
-@@ -159,13 +205,13 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
-  * kernel version tests like this as much as possible: they inflate the code,
-  * test, and maintenance burden.
-  */
--#ifdef REQ_WRITE
-+/*#ifdef REQ_WRITE
-     info.rwflag = !!(req->cmd_flags & REQ_WRITE);
- #elif defined(REQ_OP_SHIFT)
-     info.rwflag = !!((req->cmd_flags >> REQ_OP_SHIFT) == REQ_OP_WRITE);
- #else
-     info.rwflag = !!((req->cmd_flags & REQ_OP_MASK) == REQ_OP_WRITE);
--#endif
-+#endif*/
- 
-     if (whop == 0) {
-         // missed pid who, save stats as pid 0
-@@ -183,11 +229,31 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
-         valp->io++;
-     }
- 
--    start.delete(&req);
--    whobyreq.delete(&req);
-+    start.delete(&key);
-+    whobyreq.delete(&key);
- 
-     return 0;
- }
-+
-+int trace_req_completion(struct pt_regs *ctx, struct request *req)
-+{
-+    struct hash_key key = {
-+        .dev = ddevt(req->__RQ_DISK__),
-+        .sector = req->__sector
-+    };
-+
-+    return __trace_req_completion(key);
-+}
-+
-+int trace_req_completion_tp(struct tp_args *args)
-+{
-+    struct hash_key key = {
-+        .dev = args->dev,
-+        .sector = args->sector
-+    };
-+
-+    return __trace_req_completion(key);
-+}
- """
- 
- if args.ebpf:
-@@ -207,15 +273,19 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
- b = BPF(text=bpf_text)
- if BPF.get_kprobe_functions(b'__blk_account_io_start'):
-     b.attach_kprobe(event="__blk_account_io_start", fn_name="trace_pid_start")
--else:
-+elif BPF.get_kprobe_functions(b'blk_account_io_start'):
-     b.attach_kprobe(event="blk_account_io_start", fn_name="trace_pid_start")
-+else:
-+    b.attach_tracepoint(tp="block:block_io_start", fn_name="trace_pid_start_tp")
- if BPF.get_kprobe_functions(b'blk_start_request'):
-     b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
- b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start")
- if BPF.get_kprobe_functions(b'__blk_account_io_done'):
-     b.attach_kprobe(event="__blk_account_io_done", fn_name="trace_req_completion")
--else:
-+elif BPF.get_kprobe_functions(b'blk_account_io_done'):
-     b.attach_kprobe(event="blk_account_io_done", fn_name="trace_req_completion")
-+else:
-+    b.attach_tracepoint(tp="block:block_io_done", fn_name="trace_req_completion_tp")
- 
- print('Tracing... Output every %d secs. Hit Ctrl-C to end' % interval)
- 
--- 
-2.41.0
-
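
The dropped backport above keys every BPF map by a packed device number instead of a struct request pointer: ddevt() puts the major number in the high bits and first_minor in the low 20 bits (matching the kernel's MINORBITS split), and the rewritten disk_print() reverses that against a /proc/diskstats cache. A minimal standalone sketch of the convention, in Python like the tools themselves; pack_dev, unpack_dev, and the /dev/sda device node are illustrative, not names from the patch:

import os

MINORBITS = 20  # ddevt() packs dev as (major << 20) | first_minor

def pack_dev(major, minor):
    # same packing as the BPF-side ddevt() helper
    return (major << MINORBITS) | minor

def unpack_dev(dev):
    # same decoding as the user-side disk_print() helper
    return dev >> MINORBITS, dev & ((1 << MINORBITS) - 1)

# the same "major,minor" -> name cache the patched tools build at startup
disklookup = {}
with open("/proc/diskstats") as stats:
    for line in stats:
        a = line.split()
        disklookup[a[0] + "," + a[1]] = a[2]

st = os.stat("/dev/sda")  # placeholder device node
dev = pack_dev(os.major(st.st_rdev), os.minor(st.st_rdev))
major, minor = unpack_dev(dev)
print(disklookup.get("%d,%d" % (major, minor), "?"))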
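
All three tools pick their attach points with the same three-step fallback the hunks above repeat: try the double-underscore kprobe symbol, then the older one, and only then attach the block:block_io_start / block:block_io_done tracepoints, which export dev and sector directly. A condensed sketch of that chain, assuming a loaded BPF object b whose program defines both a kprobe handler (trace_pid_start) and a tracepoint handler (trace_pid_start_tp); attach_start_probe is an illustrative name:

from bcc import BPF

def attach_start_probe(b):
    # prefer the kprobes; fall back to the tracepoint on kernels
    # where neither kprobe symbol is available
    if BPF.get_kprobe_functions(b'__blk_account_io_start'):
        b.attach_kprobe(event="__blk_account_io_start", fn_name="trace_pid_start")
    elif BPF.get_kprobe_functions(b'blk_account_io_start'):
        b.attach_kprobe(event="blk_account_io_start", fn_name="trace_pid_start")
    else:
        b.attach_tracepoint(tp="block:block_io_start", fn_name="trace_pid_start_tp")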