import CS bcc-0.28.0-5.el9

eabdullin 2024-03-28 09:48:15 +00:00
parent 598b42e0b2
commit 2f4d557103
20 changed files with 122847 additions and 1022 deletions


@@ -1 +1 @@
7c555629ea7bbd69fa22827076c11113cf2af104 SOURCES/bcc-0.26.0.tar.gz
8ce0ccb0724da475f127d62acc10a88569956474 SOURCES/bcc-0.28.0.tar.gz

.gitignore

@@ -1 +1 @@
SOURCES/bcc-0.26.0.tar.gz
SOURCES/bcc-0.28.0.tar.gz


@@ -1,58 +0,0 @@
From 64f9c355a62f78000270d025b479b7eeba7349e9 Mon Sep 17 00:00:00 2001
From: Jerome Marchand <jmarchan@redhat.com>
Date: Wed, 11 Jan 2023 16:46:32 +0100
Subject: [PATCH] killsnoop: add missing -s and -T options to the synopsis
The -s option is missing from the synopsis of the killsnoop manpage,
the example file, and the comment at the top of the tool itself.
Also, the -T option is missing from the example file.
Signed-off-by: Jerome Marchand <jmarchan@redhat.com>
---
man/man8/killsnoop.8 | 2 +-
tools/killsnoop.py | 2 +-
tools/killsnoop_example.txt | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/man/man8/killsnoop.8 b/man/man8/killsnoop.8
index 3f63d2ee..cb2a975e 100644
--- a/man/man8/killsnoop.8
+++ b/man/man8/killsnoop.8
@@ -2,7 +2,7 @@
.SH NAME
killsnoop \- Trace signals issued by the kill() syscall. Uses Linux eBPF/bcc.
.SH SYNOPSIS
-.B killsnoop [\-h] [\-x] [-p PID] [-T PID]
+.B killsnoop [\-h] [\-x] [-p PID] [-T PID] [-s SIGNAL]
.SH DESCRIPTION
killsnoop traces the kill() syscall, to show signals sent via this method. This
may be useful to troubleshoot failing applications, where an unknown mechanism
diff --git a/tools/killsnoop.py b/tools/killsnoop.py
index c0166f1d..9cce8dcc 100755
--- a/tools/killsnoop.py
+++ b/tools/killsnoop.py
@@ -4,7 +4,7 @@
# killsnoop Trace signals issued by the kill() syscall.
# For Linux, uses BCC, eBPF. Embedded C.
#
-# USAGE: killsnoop [-h] [-x] [-p PID] [-T PID]
+# USAGE: killsnoop [-h] [-x] [-p PID] [-T PID] [-s SIGNAL]
#
# Copyright (c) 2015 Brendan Gregg.
# Licensed under the Apache License, Version 2.0 (the "License")
diff --git a/tools/killsnoop_example.txt b/tools/killsnoop_example.txt
index 904fe6ef..97c3ad70 100644
--- a/tools/killsnoop_example.txt
+++ b/tools/killsnoop_example.txt
@@ -19,7 +19,7 @@ The second line showed the same signal sent, this time resulting in a -3
USAGE message:
# ./killsnoop -h
-usage: killsnoop [-h] [-x] [-p PID]
+usage: killsnoop [-h] [-x] [-p PID] [-T PID] [-s SIGNAL]
Trace signals issued by the kill() syscall
--
2.39.2


@@ -1,363 +0,0 @@
From 34f77c4aaaa039fd2ef3d51b8b61db30fc34912f Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Wed, 19 Apr 2023 23:46:53 -0700
Subject: [PATCH] sync with latest libbpf repo
Sync libbpf submodule up to the following commit:
44b0bc9ad70a ci: Regenerate latest vmlinux.h for old kernel CI tests.
Signed-off-by: Yonghong Song <yhs@fb.com>
---
src/cc/compat/linux/virtual_bpf.h | 141 ++++++++++++++++++++++++++----
src/cc/export/helpers.h | 6 +-
2 files changed, 126 insertions(+), 21 deletions(-)
diff --git a/src/cc/compat/linux/virtual_bpf.h b/src/cc/compat/linux/virtual_bpf.h
index be3a4627..a182123e 100644
--- a/src/cc/compat/linux/virtual_bpf.h
+++ b/src/cc/compat/linux/virtual_bpf.h
@@ -1034,6 +1034,7 @@ enum bpf_attach_type {
BPF_PERF_EVENT,
BPF_TRACE_KPROBE_MULTI,
BPF_LSM_CGROUP,
+ BPF_STRUCT_OPS,
__MAX_BPF_ATTACH_TYPE
};
@@ -1109,7 +1110,7 @@ enum bpf_link_type {
*/
#define BPF_F_STRICT_ALIGNMENT (1U << 0)
-/* If BPF_F_ANY_ALIGNMENT is used in BPF_PROF_LOAD command, the
+/* If BPF_F_ANY_ALIGNMENT is used in BPF_PROG_LOAD command, the
* verifier will allow any alignment whatsoever. On platforms
* with strict alignment requirements for loads ands stores (such
* as sparc and mips) the verifier validates that all loads and
@@ -1157,6 +1158,11 @@ enum bpf_link_type {
*/
#define BPF_F_XDP_HAS_FRAGS (1U << 5)
+/* If BPF_F_XDP_DEV_BOUND_ONLY is used in BPF_PROG_LOAD command, the loaded
+ * program becomes device-bound but can access XDP metadata.
+ */
+#define BPF_F_XDP_DEV_BOUND_ONLY (1U << 6)
+
/* link_create.kprobe_multi.flags used in LINK_CREATE command for
* BPF_TRACE_KPROBE_MULTI attach type to create return probe.
*/
@@ -1262,6 +1268,9 @@ enum {
/* Create a map that is suitable to be an inner map with dynamic max entries */
BPF_F_INNER_MAP = (1U << 12),
+
+/* Create a map that will be registered/unregesitered by the backed bpf_link */
+ BPF_F_LINK = (1U << 13),
};
/* Flags for BPF_PROG_QUERY. */
@@ -1399,6 +1408,11 @@ union bpf_attr {
__aligned_u64 fd_array; /* array of FDs */
__aligned_u64 core_relos;
__u32 core_relo_rec_size; /* sizeof(struct bpf_core_relo) */
+ /* output: actual total log contents size (including termintaing zero).
+ * It could be both larger than original log_size (if log was
+ * truncated), or smaller (if log buffer wasn't filled completely).
+ */
+ __u32 log_true_size;
};
struct { /* anonymous struct used by BPF_OBJ_* commands */
@@ -1484,6 +1498,11 @@ union bpf_attr {
__u32 btf_size;
__u32 btf_log_size;
__u32 btf_log_level;
+ /* output: actual total log contents size (including termintaing zero).
+ * It could be both larger than original log_size (if log was
+ * truncated), or smaller (if log buffer wasn't filled completely).
+ */
+ __u32 btf_log_true_size;
};
struct {
@@ -1503,7 +1522,10 @@ union bpf_attr {
} task_fd_query;
struct { /* struct used by BPF_LINK_CREATE command */
- __u32 prog_fd; /* eBPF program to attach */
+ union {
+ __u32 prog_fd; /* eBPF program to attach */
+ __u32 map_fd; /* struct_ops to attach */
+ };
union {
__u32 target_fd; /* object to attach to */
__u32 target_ifindex; /* target ifindex */
@@ -1544,12 +1566,23 @@ union bpf_attr {
struct { /* struct used by BPF_LINK_UPDATE command */
__u32 link_fd; /* link fd */
- /* new program fd to update link with */
- __u32 new_prog_fd;
+ union {
+ /* new program fd to update link with */
+ __u32 new_prog_fd;
+ /* new struct_ops map fd to update link with */
+ __u32 new_map_fd;
+ };
__u32 flags; /* extra flags */
- /* expected link's program fd; is specified only if
- * BPF_F_REPLACE flag is set in flags */
- __u32 old_prog_fd;
+ union {
+ /* expected link's program fd; is specified only if
+ * BPF_F_REPLACE flag is set in flags.
+ */
+ __u32 old_prog_fd;
+ /* expected link's map fd; is specified only
+ * if BPF_F_REPLACE flag is set.
+ */
+ __u32 old_map_fd;
+ };
} link_update;
struct {
@@ -1643,17 +1676,17 @@ union bpf_attr {
* Description
* This helper is a "printk()-like" facility for debugging. It
* prints a message defined by format *fmt* (of size *fmt_size*)
- * to file *\/sys/kernel/debug/tracing/trace* from DebugFS, if
+ * to file *\/sys/kernel/tracing/trace* from TraceFS, if
* available. It can take up to three additional **u64**
* arguments (as an eBPF helpers, the total number of arguments is
* limited to five).
*
* Each time the helper is called, it appends a line to the trace.
- * Lines are discarded while *\/sys/kernel/debug/tracing/trace* is
- * open, use *\/sys/kernel/debug/tracing/trace_pipe* to avoid this.
+ * Lines are discarded while *\/sys/kernel/tracing/trace* is
+ * open, use *\/sys/kernel/tracing/trace_pipe* to avoid this.
* The format of the trace is customizable, and the exact output
* one will get depends on the options set in
- * *\/sys/kernel/debug/tracing/trace_options* (see also the
+ * *\/sys/kernel/tracing/trace_options* (see also the
* *README* file under the same directory). However, it usually
* defaults to something like:
*
@@ -2002,6 +2035,9 @@ union bpf_attr {
* sending the packet. This flag was added for GRE
* encapsulation, but might be used with other protocols
* as well in the future.
+ * **BPF_F_NO_TUNNEL_KEY**
+ * Add a flag to tunnel metadata indicating that no tunnel
+ * key should be set in the resulting tunnel header.
*
* Here is a typical usage on the transmit path:
*
@@ -2645,6 +2681,11 @@ union bpf_attr {
* Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the
* L2 type as Ethernet.
*
+ * * **BPF_F_ADJ_ROOM_DECAP_L3_IPV4**,
+ * **BPF_F_ADJ_ROOM_DECAP_L3_IPV6**:
+ * Indicate the new IP header version after decapsulating the outer
+ * IP header. Used when the inner and outer IP versions are different.
+ *
* A call to this helper is susceptible to change the underlying
* packet buffer. Therefore, at load time, all checks on pointers
* previously done by the verifier are invalidated and must be
@@ -2789,7 +2830,7 @@ union bpf_attr {
*
* long bpf_perf_prog_read_value(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, u32 buf_size)
* Description
- * For en eBPF program attached to a perf event, retrieve the
+ * For an eBPF program attached to a perf event, retrieve the
* value of the event counter associated to *ctx* and store it in
* the structure pointed by *buf* and of size *buf_size*. Enabled
* and running times are also stored in the structure (see
@@ -3122,6 +3163,11 @@ union bpf_attr {
* **BPF_FIB_LOOKUP_OUTPUT**
* Perform lookup from an egress perspective (default is
* ingress).
+ * **BPF_FIB_LOOKUP_SKIP_NEIGH**
+ * Skip the neighbour table lookup. *params*->dmac
+ * and *params*->smac will not be set as output. A common
+ * use case is to call **bpf_redirect_neigh**\ () after
+ * doing **bpf_fib_lookup**\ ().
*
* *ctx* is either **struct xdp_md** for XDP programs or
* **struct sk_buff** tc cls_act programs.
@@ -4952,6 +4998,12 @@ union bpf_attr {
* different maps if key/value layout matches across maps.
* Every bpf_timer_set_callback() can have different callback_fn.
*
+ * *flags* can be one of:
+ *
+ * **BPF_F_TIMER_ABS**
+ * Start the timer in absolute expire value instead of the
+ * default relative one.
+ *
* Return
* 0 on success.
* **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier
@@ -5294,7 +5346,7 @@ union bpf_attr {
* Return
* Nothing. Always succeeds.
*
- * long bpf_dynptr_read(void *dst, u32 len, struct bpf_dynptr *src, u32 offset, u64 flags)
+ * long bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr *src, u32 offset, u64 flags)
* Description
* Read *len* bytes from *src* into *dst*, starting from *offset*
* into *src*.
@@ -5304,22 +5356,36 @@ union bpf_attr {
* of *src*'s data, -EINVAL if *src* is an invalid dynptr or if
* *flags* is not 0.
*
- * long bpf_dynptr_write(struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags)
+ * long bpf_dynptr_write(const struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags)
* Description
* Write *len* bytes from *src* into *dst*, starting from *offset*
* into *dst*.
- * *flags* is currently unused.
+ *
+ * *flags* must be 0 except for skb-type dynptrs.
+ *
+ * For skb-type dynptrs:
+ * * All data slices of the dynptr are automatically
+ * invalidated after **bpf_dynptr_write**\ (). This is
+ * because writing may pull the skb and change the
+ * underlying packet buffer.
+ *
+ * * For *flags*, please see the flags accepted by
+ * **bpf_skb_store_bytes**\ ().
* Return
* 0 on success, -E2BIG if *offset* + *len* exceeds the length
* of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst*
- * is a read-only dynptr or if *flags* is not 0.
+ * is a read-only dynptr or if *flags* is not correct. For skb-type dynptrs,
+ * other errors correspond to errors returned by **bpf_skb_store_bytes**\ ().
*
- * void *bpf_dynptr_data(struct bpf_dynptr *ptr, u32 offset, u32 len)
+ * void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u32 offset, u32 len)
* Description
* Get a pointer to the underlying dynptr data.
*
* *len* must be a statically known value. The returned data slice
* is invalidated whenever the dynptr is invalidated.
+ *
+ * skb and xdp type dynptrs may not use bpf_dynptr_data. They should
+ * instead use bpf_dynptr_slice and bpf_dynptr_slice_rdwr.
* Return
* Pointer to the underlying dynptr data, NULL if the dynptr is
* read-only, if the dynptr is invalid, or if the offset and length
@@ -5415,7 +5481,7 @@ union bpf_attr {
* Drain samples from the specified user ring buffer, and invoke
* the provided callback for each such sample:
*
- * long (\*callback_fn)(struct bpf_dynptr \*dynptr, void \*ctx);
+ * long (\*callback_fn)(const struct bpf_dynptr \*dynptr, void \*ctx);
*
* If **callback_fn** returns 0, the helper will continue to try
* and drain the next sample, up to a maximum of
@@ -5765,6 +5831,7 @@ enum {
BPF_F_ZERO_CSUM_TX = (1ULL << 1),
BPF_F_DONT_FRAGMENT = (1ULL << 2),
BPF_F_SEQ_NUMBER = (1ULL << 3),
+ BPF_F_NO_TUNNEL_KEY = (1ULL << 4),
};
/* BPF_FUNC_skb_get_tunnel_key flags. */
@@ -5804,6 +5871,8 @@ enum {
BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4),
BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5),
BPF_F_ADJ_ROOM_ENCAP_L2_ETH = (1ULL << 6),
+ BPF_F_ADJ_ROOM_DECAP_L3_IPV4 = (1ULL << 7),
+ BPF_F_ADJ_ROOM_DECAP_L3_IPV6 = (1ULL << 8),
};
enum {
@@ -6339,6 +6408,9 @@ struct bpf_link_info {
struct {
__u32 ifindex;
} xdp;
+ struct {
+ __u32 map_id;
+ } struct_ops;
};
} __attribute__((aligned(8)));
@@ -6735,6 +6807,7 @@ struct bpf_raw_tracepoint_args {
enum {
BPF_FIB_LOOKUP_DIRECT = (1U << 0),
BPF_FIB_LOOKUP_OUTPUT = (1U << 1),
+ BPF_FIB_LOOKUP_SKIP_NEIGH = (1U << 2),
};
enum {
@@ -6902,6 +6975,21 @@ struct bpf_list_node {
__u64 :64;
} __attribute__((aligned(8)));
+struct bpf_rb_root {
+ __u64 :64;
+ __u64 :64;
+} __attribute__((aligned(8)));
+
+struct bpf_rb_node {
+ __u64 :64;
+ __u64 :64;
+ __u64 :64;
+} __attribute__((aligned(8)));
+
+struct bpf_refcount {
+ __u32 :32;
+} __attribute__((aligned(4)));
+
struct bpf_sysctl {
__u32 write; /* Sysctl is being read (= 0) or written (= 1).
* Allows 1,2,4-byte read, but no write.
@@ -7051,5 +7139,22 @@ struct bpf_core_relo {
enum bpf_core_relo_kind kind;
};
+/*
+ * Flags to control bpf_timer_start() behaviour.
+ * - BPF_F_TIMER_ABS: Timeout passed is absolute time, by default it is
+ * relative to current time.
+ */
+enum {
+ BPF_F_TIMER_ABS = (1ULL << 0),
+};
+
+/* BPF numbers iterator state */
+struct bpf_iter_num {
+ /* opaque iterator state; having __u64 here allows to preserve correct
+ * alignment requirements in vmlinux.h, generated from BTF
+ */
+ __u64 __opaque[1];
+} __attribute__((aligned(8)));
+
#endif /* _UAPI__LINUX_BPF_H__ */
)********"
diff --git a/src/cc/export/helpers.h b/src/cc/export/helpers.h
index d7b869e0..e989440a 100644
--- a/src/cc/export/helpers.h
+++ b/src/cc/export/helpers.h
@@ -1006,13 +1006,13 @@ static void (*bpf_ringbuf_submit_dynptr)(struct bpf_dynptr *ptr, __u64 flags) =
(void *)BPF_FUNC_ringbuf_submit_dynptr;
static void (*bpf_ringbuf_discard_dynptr)(struct bpf_dynptr *ptr, __u64 flags) =
(void *)BPF_FUNC_ringbuf_discard_dynptr;
-static long (*bpf_dynptr_read)(void *dst, __u32 len, struct bpf_dynptr *src, __u32 offset,
+static long (*bpf_dynptr_read)(void *dst, __u32 len, const struct bpf_dynptr *src, __u32 offset,
__u64 flags) =
(void *)BPF_FUNC_dynptr_read;
-static long (*bpf_dynptr_write)(struct bpf_dynptr *dst, __u32 offset, void *src, __u32 len,
+static long (*bpf_dynptr_write)(const struct bpf_dynptr *dst, __u32 offset, void *src, __u32 len,
__u64 flags) =
(void *)BPF_FUNC_dynptr_write;
-static void *(*bpf_dynptr_data)(struct bpf_dynptr *ptr, __u32 offset, __u32 len) =
+static void *(*bpf_dynptr_data)(const struct bpf_dynptr *ptr, __u32 offset, __u32 len) =
(void *)BPF_FUNC_dynptr_data;
static __s64 (*bpf_tcp_raw_gen_syncookie_ipv4)(struct iphdr *iph, struct tcphdr *th,
__u32 th_len) =
--
2.41.0


@@ -1,71 +0,0 @@
From cc35f70515cb0f3b8032b8fb68f9f37a844e74c8 Mon Sep 17 00:00:00 2001
From: Rong Tao <rongtao@cestc.cn>
Date: Fri, 10 Feb 2023 23:28:55 +0800
Subject: [PATCH] tools/compactsnoop.py: Fix raw_tracepoint Invalid argument
error
Kernel commit abd4349ff9b8 ("mm: compaction: cleanup the compaction trace
events") changed the arguments of 'mm_compaction_begin' from (start_pfn,
migrate_pfn, free_pfn, end_pfn, sync) to (cc, start_pfn, end_pfn, sync),
and changed the arguments of 'mm_compaction_end' from (start_pfn,
migrate_pfn, free_pfn, end_pfn, sync, ret) to (cc, start_pfn, end_pfn,
sync, ret).
Replacing RAW_TRACEPOINT_PROBE with TRACEPOINT_PROBE solves this problem
and guarantees compatibility.
$ sudo ./compactsnoop.py
bpf_attach_raw_tracepoint (mm_compaction_begin): Invalid argument
Traceback (most recent call last):
File "/home/sdb/Git/bcc/tools/./compactsnoop.py", line 292, in <module>
b = BPF(text=bpf_text)
^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.11/site-packages/bcc/__init__.py", line 483, in __init__
self._trace_autoload()
File "/usr/lib/python3.11/site-packages/bcc/__init__.py", line 1462, in _trace_autoload
self.attach_raw_tracepoint(tp=tp, fn_name=fn.name)
File "/usr/lib/python3.11/site-packages/bcc/__init__.py", line 1055, in attach_raw_tracepoint
raise Exception("Failed to attach BPF to raw tracepoint")
Exception: Failed to attach BPF to raw tracepoint
Signed-off-by: Rong Tao <rongtao@cestc.cn>
---
tools/compactsnoop.py | 13 ++++---------
1 file changed, 4 insertions(+), 9 deletions(-)
diff --git a/tools/compactsnoop.py b/tools/compactsnoop.py
index 2643e8ed..2b395dec 100755
--- a/tools/compactsnoop.py
+++ b/tools/compactsnoop.py
@@ -237,11 +237,9 @@ RAW_TRACEPOINT_PROBE(mm_compaction_suitable)
return 0;
}
-RAW_TRACEPOINT_PROBE(mm_compaction_begin)
+TRACEPOINT_PROBE(compaction, mm_compaction_begin)
{
- // TP_PROTO(unsigned long zone_start, unsigned long migrate_pfn,
- // unsigned long free_pfn, unsigned long zone_end, bool sync)
- bool sync = (bool)ctx->args[4];
+ bool sync = args->sync;
u64 id = bpf_get_current_pid_tgid();
struct val_t *valp = start.lookup(&id);
@@ -255,12 +253,9 @@ RAW_TRACEPOINT_PROBE(mm_compaction_begin)
return 0;
}
-RAW_TRACEPOINT_PROBE(mm_compaction_end)
+TRACEPOINT_PROBE(compaction, mm_compaction_end)
{
- // TP_PROTO(unsigned long zone_start, unsigned long migrate_pfn,
- // unsigned long free_pfn, unsigned long zone_end, bool sync,
- // int status)
- submit_event(ctx, ctx->args[5]);
+ submit_event(args, args->status);
return 0;
}
"""
--
2.39.2
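
The switch works because bcc's TRACEPOINT_PROBE generates a typed `args` struct from the tracepoint's format file, so fields are read by name rather than by position in TP_PROTO, which is what broke the RAW_TRACEPOINT_PROBE version. A minimal, self-contained sketch of that pattern (illustrative only, not part of the patch):

from bcc import BPF

prog = """
TRACEPOINT_PROBE(compaction, mm_compaction_begin)
{
    // `args` is generated from the tracepoint format file, so the field is
    // looked up by name and keeps working if the kernel reorders TP_PROTO.
    bpf_trace_printk("mm_compaction_begin: sync=%d\\n", args->sync);
    return 0;
}
"""

b = BPF(text=prog)
b.trace_print()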


@@ -1,76 +0,0 @@
From 0e9384ec4c88d2da2d23475f58ec9bff7eb48639 Mon Sep 17 00:00:00 2001
From: Jerome Marchand <jmarchan@redhat.com>
Date: Tue, 25 Apr 2023 16:04:05 +0200
Subject: [PATCH] tools/deadlock: Add an option to set the maximum number of
stack traces
Commit 77f5252d ("tools/deadlock: support specifies maxnum of threads
and edge cases (#3455)") allows setting the maximum number of threads
and edge cases in order to reduce the memory usage of the deadlock
tool. It however left the size of the stack traces map fixed. Its
current size, 640k (actually rounded up to 1M), takes 1 GB of vmalloced
kernel memory.
This patch adds an option to make the maximum number of stack traces
user defined. It also sets the default value to 64k, in line with the
current defaults for the number of edge cases and threads.
It fixes the following issue on systems with limited memory resources:
could not open bpf map: stack_traces, error: Cannot allocate memory
Traceback (most recent call last):
File "/tmp/./deadlock.py", line 577, in <module>
main()
File "/tmp/./deadlock.py", line 489, in main
bpf = BPF(text=text)
File "/usr/lib/python3.9/site-packages/bcc/__init__.py", line 479, in __init__
raise Exception("Failed to compile BPF module %s" % (src_file or "<text>"))
Exception: Failed to compile BPF module <text>
Signed-off-by: Jerome Marchand <jmarchan@redhat.com>
---
tools/deadlock.c | 2 +-
tools/deadlock.py | 8 ++++++++
2 files changed, 9 insertions(+), 1 deletion(-)
diff --git a/tools/deadlock.c b/tools/deadlock.c
index 006dc121..6ae405ba 100644
--- a/tools/deadlock.c
+++ b/tools/deadlock.c
@@ -60,7 +60,7 @@ struct thread_created_leaf_t {
BPF_HASH(thread_to_parent, u32, struct thread_created_leaf_t);
// Stack traces when threads are created and when mutexes are locked/unlocked.
-BPF_STACK_TRACE(stack_traces, 655360);
+BPF_STACK_TRACE(stack_traces, MAX_TRACES);
// The first argument to the user space function we are tracing
// is a pointer to the mutex M held by thread T.
diff --git a/tools/deadlock.py b/tools/deadlock.py
index 12de099f..f7eb4ce0 100755
--- a/tools/deadlock.py
+++ b/tools/deadlock.py
@@ -467,6 +467,13 @@ import time
help='Specifies the maximum number of edge cases that can be recorded. '
'default 65536. Note. 88 bytes per edge case.'
)
+ parser.add_argument(
+ '-s', '--stacktraces', type=int, default=65536,
+ help='Specifies the maximum number of stack traces that can be recorded. '
+ 'This number is rounded up to the next power of two.'
+ 'default 65536. Note. 1 kbytes vmalloced per stack trace.'
+ )
+
args = parser.parse_args()
if not args.binary:
try:
@@ -479,6 +486,7 @@ import time
text = f.read()
text = text.replace('MAX_THREADS', str(args.threads));
text = text.replace('MAX_EDGES', str(args.edges));
+ text = text.replace('MAX_TRACES', str(args.stacktraces));
bpf = BPF(text=text)
# Trace where threads are created
--
2.39.2
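
A minimal sketch of the substitution pattern the patch extends (placeholder and numbers are illustrative): deadlock.py sizes its BPF maps by replacing placeholders in the program text before compiling, and at roughly 1 KB of vmalloced memory per stack trace the 65536-entry default costs about 64 MB instead of the ~1 GB needed for 640k+ entries.

from bcc import BPF

bpf_source = """
BPF_STACK_TRACE(stack_traces, MAX_TRACES);
"""

max_traces = 65536  # user-supplied; rounded up to the next power of two
text = bpf_source.replace("MAX_TRACES", str(max_traces))
b = BPF(text=text)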


@@ -1,57 +0,0 @@
From 29f0fa3693d679102680fece9ed5e606e291c5fa Mon Sep 17 00:00:00 2001
From: Jerome Marchand <jmarchan@redhat.com>
Date: Fri, 7 Apr 2023 14:30:54 +0200
Subject: [PATCH] tools/funcslower: fix printing of folded stacks
When trying to print folded stacks, funcslower tries to join bytes to a
string. Let's perform that operation with bytes only, and decode
before printing.
Also, decode symbol names before printing for the default stack
format, to avoid unsightly b'xxx' output.
It fixes the following error:
Exception ignored on calling ctypes callback function: <function PerfEventArray._open_perf_buffer.<locals>.raw_cb_ at 0x7f200541e5e0>
Traceback (most recent call last):
File "/usr/lib/python3.9/site-packages/bcc/table.py", line 982, in raw_cb_
callback(cpu, data, size)
File "/usr/share/bcc/tools/funcslower", line 340, in print_event
print_stack(event)
File "/usr/share/bcc/tools/funcslower", line 324, in print_stack
print("%s %d" % (";".join(line), 1))
TypeError: sequence item 1: expected str instance, bytes found
Signed-off-by: Jerome Marchand <jmarchan@redhat.com>
---
tools/funcslower.py | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/tools/funcslower.py b/tools/funcslower.py
index 6df7f24c..4b3798a0 100755
--- a/tools/funcslower.py
+++ b/tools/funcslower.py
@@ -317,17 +317,17 @@ earliest_ts = 0
# print folded stack output
user_stack = list(user_stack)
kernel_stack = list(kernel_stack)
- line = [event.comm.decode('utf-8', 'replace')] + \
+ line = [event.comm] + \
[b.sym(addr, event.tgid_pid) for addr in reversed(user_stack)] + \
(do_delimiter and ["-"] or []) + \
[b.ksym(addr) for addr in reversed(kernel_stack)]
- print("%s %d" % (";".join(line), 1))
+ print("%s %d" % (b';'.join(line).decode('utf-8', 'replace'), 1))
else:
# print default multi-line stack output.
for addr in kernel_stack:
- print(" %s" % b.ksym(addr))
+ print(" %s" % b.ksym(addr).decode('utf-8', 'replace'))
for addr in user_stack:
- print(" %s" % b.sym(addr, event.tgid_pid))
+ print(" %s" % b.sym(addr, event.tgid_pid).decode('utf-8', 'replace'))
def print_event(cpu, data, size):
event = b["events"].event(data)
--
2.39.2
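
The underlying Python 3 issue is easy to reproduce outside the tool: b.sym() and b.ksym() return bytes, so mixing them with a decoded str in ";".join() raises the TypeError shown above. A standalone sketch with made-up frame names:

comm = b"nginx"
frames = [b"do_sys_open", b"__x64_sys_openat"]

# Buggy pattern: a str separator joining a list that still contains bytes.
# ";".join([comm.decode()] + frames)  # TypeError: sequence item 1: expected str instance, bytes found

# Fixed pattern: keep every element as bytes, join with b';', decode once for printing.
line = [comm] + frames
print("%s %d" % (b";".join(line).decode("utf-8", "replace"), 1))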


@@ -1,85 +0,0 @@
From 9965f8397950d8aa1bc1a5decbc2250d0627a798 Mon Sep 17 00:00:00 2001
From: Rong Tao <rongtao@cestc.cn>
Date: Fri, 10 Feb 2023 22:16:56 +0800
Subject: [PATCH] tools/nfsslower.py: Fix uninitialized struct pad error
The verifier is unhappy if the data struct's padding (_pad_) is not initialized; see [0][1].
$ sudo ./nfsslower.py
...
; bpf_perf_event_output(ctx, (void *)bpf_pseudo_fd(1, -2), CUR_CPU_IDENTIFIER, &data, sizeof(data));
83: (79) r1 = *(u64 *)(r10 -144) ; R1_w=ctx(off=0,imm=0) R10=fp0
84: (18) r3 = 0xffffffff ; R3_w=4294967295
86: (b7) r5 = 96 ; R5_w=96
87: (85) call bpf_perf_event_output#25
invalid indirect read from stack R4 off -136+92 size 96
processed 84 insns (limit 1000000) max_states_per_insn 0 total_states 4 peak_states 4 mark_read 4
...
raise Exception("Failed to load BPF program %s: %s" %
Exception: Failed to load BPF program b'raw_tracepoint__nfs_commit_done': Permission denied
[0] https://github.com/iovisor/bcc/issues/2623
[1] https://github.com/iovisor/bcc/pull/4453
Signed-off-by: Rong Tao <rongtao@cestc.cn>
---
tools/nfsslower.py | 29 +++++++++++++++++++++--------
1 file changed, 21 insertions(+), 8 deletions(-)
diff --git a/tools/nfsslower.py b/tools/nfsslower.py
index 34756f72..99f63f0f 100755
--- a/tools/nfsslower.py
+++ b/tools/nfsslower.py
@@ -195,8 +195,11 @@ static int trace_exit(struct pt_regs *ctx, int type)
// populate output struct
u32 size = PT_REGS_RC(ctx);
- struct data_t data = {.type = type, .size = size, .delta_us = delta_us,
- .pid = pid};
+ struct data_t data = {};
+ data.type = type;
+ data.size = size;
+ data.delta_us = delta_us;
+ data.pid = pid;
data.ts_us = ts / 1000;
data.offset = valp->offset;
bpf_get_current_comm(&data.task, sizeof(data.task));
@@ -280,9 +283,14 @@ RAW_TRACEPOINT_PROBE(nfs_commit_done)
u64 ts = bpf_ktime_get_ns();
u64 delta_us = (ts - cp->ts) / 1000;
u32 pid = bpf_get_current_pid_tgid() >> 32;
- struct data_t data = {.type = TRACE_COMMIT, .offset = cp->offset,
- .size = cp->count, .ts_us = ts/1000, .delta_us = delta_us,
- .pid = pid};
+
+ struct data_t data = {};
+ data.type = TRACE_COMMIT;
+ data.offset = cp->offset;
+ data.size = cp->count;
+ data.ts_us = ts/1000;
+ data.delta_us = delta_us;
+ data.pid = pid;
commitinfo.delete(&key);
bpf_get_current_comm(&data.task, sizeof(data.task));
@@ -325,9 +333,14 @@ int trace_nfs_commit_done(struct pt_regs *ctx, void *task, void *calldata)
u64 ts = bpf_ktime_get_ns();
u64 delta_us = (ts - cp->ts) / 1000;
u32 pid = bpf_get_current_pid_tgid() >> 32;
- struct data_t data = {.type = TRACE_COMMIT, .offset = cp->offset,
- .size = cp->count, .ts_us = ts/1000, .delta_us = delta_us,
- .pid = pid};
+
+ struct data_t data = {};
+ data.type = TRACE_COMMIT;
+ data.offset = cp->offset;
+ data.size = cp->count;
+ data.ts_us = ts/1000;
+ data.delta_us = delta_us;
+ data.pid = pid;
commitinfo.delete(&key);
bpf_get_current_comm(&data.task, sizeof(data.task));
--
2.39.2
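
The fix works because bpf_perf_event_output() makes the verifier check that every byte of the output buffer, including compiler-inserted padding, has been initialized; an empty initializer zeroes the whole struct, whereas designated initializers may leave padding holes untouched and trigger "invalid indirect read from stack". A minimal sketch of the pattern (the struct and probe below are illustrative, not nfsslower's real data_t):

from bcc import BPF

prog = """
#include <uapi/linux/ptrace.h>

struct data_t {
    u32 pid;        // 4 bytes, followed by a 4-byte padding hole...
    u64 ts_us;      // ...because this field needs 8-byte alignment
    char task[16];
};

BPF_PERF_OUTPUT(events);

int trace_entry(struct pt_regs *ctx)
{
    struct data_t data = {};   // zero-fills the fields *and* the padding
    data.pid = bpf_get_current_pid_tgid() >> 32;
    data.ts_us = bpf_ktime_get_ns() / 1000;
    bpf_get_current_comm(&data.task, sizeof(data.task));
    events.perf_submit(ctx, &data, sizeof(data));
    return 0;
}
"""

b = BPF(text=prog)
b.attach_kprobe(event=b.get_syscall_fnname("openat"), fn_name="trace_entry")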


@@ -1,159 +0,0 @@
From 02fce045ce02fe81d8649ce63ce81d5cdf3e3a72 Mon Sep 17 00:00:00 2001
From: Rong Tao <rongtao@cestc.cn>
Date: Mon, 30 Jan 2023 17:39:35 +0800
Subject: [PATCH] tools/readahead: Fix: Failed to attach BPF program
entry__do_page_cache_readahead
since commit 56a4d67c264e("mm/readahead: Switch to page_cache_ra_order") switch
do_page_cache_ra() to page_cache_ra_order() (v5.17), and commit bb3c579e25e5
("mm/filemap: Add filemap_alloc_folio") swap __page_cache_alloc() to
filemap_alloc_folio() (since v5.15)
Reprocude the error(fedora37, 6.1.7-200.fc37.aarch64):
$ sudo ./readahead.py
cannot attach kprobe, probe entry may not exist
Traceback (most recent call last):
File "/home/rongtao/Git/bcc/tools/./readahead.py", line 159, in <module>
b.attach_kprobe(event=ra_event, fn_name="entry__do_page_cache_readahead")
File "/usr/lib/python3.11/site-packages/bcc/__init__.py", line 840, in attach_kprobe
raise Exception("Failed to attach BPF program %s to kprobe %s" %
Exception: Failed to attach BPF program b'entry__do_page_cache_readahead' to kprobe b'do_page_cache_ra'
Signed-off-by: Rong Tao <rongtao@cestc.cn>
---
tools/readahead.py | 69 +++++++++++++++++++++++++++++++++++++---------
1 file changed, 56 insertions(+), 13 deletions(-)
diff --git a/tools/readahead.py b/tools/readahead.py
index f2afdcb3..adad2ea8 100755
--- a/tools/readahead.py
+++ b/tools/readahead.py
@@ -12,6 +12,7 @@
#
# 20-Aug-2020 Suchakra Sharma Ported from bpftrace to BCC
# 17-Sep-2021 Hengqi Chen Migrated to kfunc
+# 30-Jan-2023 Rong Tao Support more kfunc/kprobe, introduce folio
from __future__ import print_function
from bcc import BPF
@@ -38,6 +39,7 @@ args = parser.parse_args()
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/mm_types.h>
+#include <linux/mm.h>
BPF_HASH(flag, u32, u8); // used to track if we are in do_page_cache_readahead()
BPF_HASH(birth, struct page*, u64); // used to track timestamps of cache alloc'ed page
@@ -65,7 +67,7 @@ int exit__do_page_cache_readahead(struct pt_regs *ctx) {
int exit__page_cache_alloc(struct pt_regs *ctx) {
u32 pid;
u64 ts;
- struct page *retval = (struct page*) PT_REGS_RC(ctx);
+ struct page *retval = (struct page*) GET_RETVAL_PAGE;
u32 zero = 0; // static key for accessing pages[0]
pid = bpf_get_current_pid_tgid();
u8 *f = flag.lookup(&pid);
@@ -111,6 +113,23 @@ KRETFUNC_PROBE(RA_FUNC)
return 0;
}
+KFUNC_PROBE(mark_page_accessed, struct page *arg0)
+{
+ u64 ts, delta;
+ u32 zero = 0; // static key for accessing pages[0]
+ u64 *bts = birth.lookup(&arg0);
+
+ if (bts != NULL) {
+ delta = bpf_ktime_get_ns() - *bts;
+ dist.atomic_increment(bpf_log2l(delta/1000000));
+ pages.atomic_increment(zero, -1);
+ birth.delete(&arg0); // remove the entry from hashmap
+ }
+ return 0;
+}
+"""
+
+bpf_text_kfunc_cache_alloc_ret_page = """
KRETFUNC_PROBE(__page_cache_alloc, gfp_t gfp, struct page *retval)
{
u64 ts;
@@ -125,18 +144,22 @@ KRETFUNC_PROBE(__page_cache_alloc, gfp_t gfp, struct page *retval)
}
return 0;
}
+"""
-KFUNC_PROBE(mark_page_accessed, struct page *arg0)
+bpf_text_kfunc_cache_alloc_ret_folio = """
+KRETFUNC_PROBE(filemap_alloc_folio, gfp_t gfp, unsigned int order,
+ struct folio *retval)
{
- u64 ts, delta;
+ u64 ts;
u32 zero = 0; // static key for accessing pages[0]
- u64 *bts = birth.lookup(&arg0);
+ u32 pid = bpf_get_current_pid_tgid();
+ u8 *f = flag.lookup(&pid);
+ struct page *page = folio_page(retval, 0);
- if (bts != NULL) {
- delta = bpf_ktime_get_ns() - *bts;
- dist.atomic_increment(bpf_log2l(delta/1000000));
- pages.atomic_increment(zero, -1);
- birth.delete(&arg0); // remove the entry from hashmap
+ if (f != NULL && *f == 1) {
+ ts = bpf_ktime_get_ns();
+ birth.update(&page, &ts);
+ pages.atomic_increment(zero);
}
return 0;
}
@@ -145,20 +168,40 @@ KFUNC_PROBE(mark_page_accessed, struct page *arg0)
if BPF.support_kfunc():
if BPF.get_kprobe_functions(b"__do_page_cache_readahead"):
ra_func = "__do_page_cache_readahead"
- else:
+ elif BPF.get_kprobe_functions(b"do_page_cache_ra"):
ra_func = "do_page_cache_ra"
+ elif BPF.get_kprobe_functions(b"page_cache_ra_order"):
+ ra_func = "page_cache_ra_order"
+ else:
+ print("Not found any kfunc.")
+ exit()
bpf_text += bpf_text_kfunc.replace("RA_FUNC", ra_func)
+ if BPF.get_kprobe_functions(b"__page_cache_alloc"):
+ bpf_text += bpf_text_kfunc_cache_alloc_ret_page
+ else:
+ bpf_text += bpf_text_kfunc_cache_alloc_ret_folio
b = BPF(text=bpf_text)
else:
bpf_text += bpf_text_kprobe
- b = BPF(text=bpf_text)
if BPF.get_kprobe_functions(b"__do_page_cache_readahead"):
ra_event = "__do_page_cache_readahead"
- else:
+ elif BPF.get_kprobe_functions(b"do_page_cache_ra"):
ra_event = "do_page_cache_ra"
+ elif BPF.get_kprobe_functions(b"page_cache_ra_order"):
+ ra_event = "page_cache_ra_order"
+ else:
+ print("Not found any kprobe.")
+ exit()
+ if BPF.get_kprobe_functions(b"__page_cache_alloc"):
+ cache_func = "__page_cache_alloc"
+ bpf_text = bpf_text.replace('GET_RETVAL_PAGE', 'PT_REGS_RC(ctx)')
+ else:
+ cache_func = "filemap_alloc_folio"
+ bpf_text = bpf_text.replace('GET_RETVAL_PAGE', 'folio_page((struct folio *)PT_REGS_RC(ctx), 0)')
+ b = BPF(text=bpf_text)
b.attach_kprobe(event=ra_event, fn_name="entry__do_page_cache_readahead")
b.attach_kretprobe(event=ra_event, fn_name="exit__do_page_cache_readahead")
- b.attach_kretprobe(event="__page_cache_alloc", fn_name="exit__page_cache_alloc")
+ b.attach_kretprobe(event=cache_func, fn_name="exit__page_cache_alloc")
b.attach_kprobe(event="mark_page_accessed", fn_name="entry_mark_page_accessed")
# header
--
2.39.1
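
The probe selection above boils down to "attach to whichever readahead entry point this kernel actually exports". A small sketch of that fallback pattern, using the same candidate function names as the patch (which one exists depends on the running kernel):

from bcc import BPF

candidates = [b"__do_page_cache_readahead", b"do_page_cache_ra", b"page_cache_ra_order"]
ra_event = next((name.decode() for name in candidates if BPF.get_kprobe_functions(name)), None)
if ra_event is None:
    raise SystemExit("no readahead entry point found on this kernel")
print("would attach kprobe/kretprobe to", ra_event)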


@@ -1,140 +0,0 @@
From 533db3453a09695f79368792cdd5fbe2ddeaa55e Mon Sep 17 00:00:00 2001
From: Rong Tao <rongtao@cestc.cn>
Date: Sun, 22 Jan 2023 15:44:46 +0800
Subject: [PATCH] tools/slabratetop: Fix error: incomplete definition of type
'struct slab'
Kernel commit 40f3bf0cb04c ("mm: Convert struct page to struct slab in functions
used by other subsystems") introduced the slab_address() function, and commit 6e48a966dfd1
("mm/kasan: Convert to struct folio and struct slab") made linux/kasan.h add a
dependency on the slab struct. This leads to the following problems:
$ sudo ./slabratetop.py
In file included from /virtual/main.c:13:
include/linux/slub_def.h:162:26: warning: call to undeclared function 'slab_address';
ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
void *object = x - (x - slab_address(slab)) % cache->size;
^
include/linux/slub_def.h:162:46: error: invalid operands to binary expression ('void *' and 'unsigned int')
void *object = x - (x - slab_address(slab)) % cache->size;
~~~~~~~~~~~~~~~~~~~~~~~~ ^ ~~~~~~~~~~~
include/linux/slub_def.h:164:8: error: incomplete definition of type 'struct slab'
(slab->objects - 1) * cache->size;
~~~~^
include/linux/kasan.h:13:8: note: forward declaration of 'struct slab'
struct slab;
^
...
At first, I wanted to fix this with a kernel patch [1]; however, since bcc is a
downstream project of the kernel, this issue should be solved inside the bcc
project. This was agreed by the kernel maintainer and bcc maintainer @yonghong-song.
This solution was provided by @yonghong-song [0].
[0] https://github.com/iovisor/bcc/issues/4438
[1] https://lore.kernel.org/all/tencent_ABA832E296819D1053D6C625ADCAF76BC706@qq.com/
Signed-off-by: Rong Tao <rongtao@cestc.cn>
Signed-off-by: Yonghong Song <yhs@fb.com>
---
tools/slabratetop.py | 76 ++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 76 insertions(+)
diff --git a/tools/slabratetop.py b/tools/slabratetop.py
index ac44b2bd..8fbcac5e 100755
--- a/tools/slabratetop.py
+++ b/tools/slabratetop.py
@@ -14,6 +14,9 @@
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 15-Oct-2016 Brendan Gregg Created this.
+# 23-Jan-2023 Rong Tao Introduce kernel internal data structure and
+# functions to temporarily solve problem for
+# >=5.16(TODO: fix this workaround)
from __future__ import print_function
from bcc import BPF
@@ -65,6 +68,79 @@ bpf_text = """
// 5.9, but it does not hurt to have it here for versions 5.4 to 5.8.
struct memcg_cache_params {};
+// introduce kernel interval slab structure and slab_address() function, solved
+// 'undefined' error for >=5.16. TODO: we should fix this workaround if BCC
+// framework support BTF/CO-RE.
+struct slab {
+ unsigned long __page_flags;
+
+#if defined(CONFIG_SLAB)
+
+ struct kmem_cache *slab_cache;
+ union {
+ struct {
+ struct list_head slab_list;
+ void *freelist; /* array of free object indexes */
+ void *s_mem; /* first object */
+ };
+ struct rcu_head rcu_head;
+ };
+ unsigned int active;
+
+#elif defined(CONFIG_SLUB)
+
+ struct kmem_cache *slab_cache;
+ union {
+ struct {
+ union {
+ struct list_head slab_list;
+#ifdef CONFIG_SLUB_CPU_PARTIAL
+ struct {
+ struct slab *next;
+ int slabs; /* Nr of slabs left */
+ };
+#endif
+ };
+ /* Double-word boundary */
+ void *freelist; /* first free object */
+ union {
+ unsigned long counters;
+ struct {
+ unsigned inuse:16;
+ unsigned objects:15;
+ unsigned frozen:1;
+ };
+ };
+ };
+ struct rcu_head rcu_head;
+ };
+ unsigned int __unused;
+
+#elif defined(CONFIG_SLOB)
+
+ struct list_head slab_list;
+ void *__unused_1;
+ void *freelist; /* first free block */
+ long units;
+ unsigned int __unused_2;
+
+#else
+#error "Unexpected slab allocator configured"
+#endif
+
+ atomic_t __page_refcount;
+#ifdef CONFIG_MEMCG
+ unsigned long memcg_data;
+#endif
+};
+
+// slab_address() will not be used, and NULL will be returned directly, which
+// can avoid adaptation of different kernel versions
+static inline void *slab_address(const struct slab *slab)
+{
+ return NULL;
+}
+
#ifdef CONFIG_SLUB
#include <linux/slub_def.h>
#else
--
2.39.1


@@ -0,0 +1,66 @@
From 63808fbdcb70ce2e858db0a42e7e3eeec153d5b6 Mon Sep 17 00:00:00 2001
From: Abhishek Dubey <adubey@linux.ibm.com>
Date: Wed, 20 Sep 2023 10:37:38 -0400
Subject: [PATCH 4/4] Adding memory zones for Power server
Config PPC_BOOK3S_64 skips setting ZONE_DMA for
server processors. NORMAL and MOVABLE zones are
available on Power.
Signed-off-by: Abhishek Dubey <adubey@linux.ibm.com>
---
tools/compactsnoop.py | 28 +++++++++++++++++++---------
1 file changed, 19 insertions(+), 9 deletions(-)
diff --git a/tools/compactsnoop.py b/tools/compactsnoop.py
index 2b395dec..1a476aad 100755
--- a/tools/compactsnoop.py
+++ b/tools/compactsnoop.py
@@ -260,11 +260,12 @@ TRACEPOINT_PROBE(compaction, mm_compaction_end)
}
"""
-if platform.machine() != 'x86_64':
+if platform.machine() != 'x86_64' and platform.machine() != 'ppc64le':
print("""
- Currently only support x86_64 servers, if you want to use it on
- other platforms, please refer include/linux/mmzone.h to modify
- zone_idex_to_str to get the right zone type
+ Currently only support x86_64 and power servers, if you want
+ to use it on other platforms(including power embedded processors),
+ please refer include/linux/mmzone.h to modify zone_idex_to_str to
+ get the right zone type
""")
exit()
@@ -296,13 +297,22 @@ initial_ts = 0
# from include/linux/mmzone.h
# NOTICE: consider only x86_64 servers
zone_type = {
- 0: "ZONE_DMA",
- 1: "ZONE_DMA32",
- 2: "ZONE_NORMAL",
+ 'x86_64':
+ {
+ 0: "ZONE_DMA",
+ 1: "ZONE_DMA32",
+ 2: "ZONE_NORMAL"
+ },
+ # Zones in Power server only
+ 'ppc64le':
+ {
+ 0: "ZONE_NORMAL",
+ 1: "ZONE_MOVABLE"
+ }
}
- if idx in zone_type:
- return zone_type[idx]
+ if idx in zone_type[platform.machine()]:
+ return zone_type[platform.machine()][idx]
else:
return str(idx)
--
2.43.0
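
A condensed sketch of the per-architecture lookup the patch introduces (zone indices come from include/linux/mmzone.h and depend on the kernel configuration; the helper name here is illustrative):

import platform

zone_type = {
    "x86_64":  {0: "ZONE_DMA", 1: "ZONE_DMA32", 2: "ZONE_NORMAL"},
    "ppc64le": {0: "ZONE_NORMAL", 1: "ZONE_MOVABLE"},  # Power server zones only
}

def zone_idx_to_str(idx):
    return zone_type.get(platform.machine(), {}).get(idx, str(idx))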


@@ -0,0 +1,45 @@
From e6493835a28c08c45fd374e70dba7aa66f700d08 Mon Sep 17 00:00:00 2001
From: Abhishek Dubey <adubey@linux.ibm.com>
Date: Tue, 14 Nov 2023 03:54:19 -0500
Subject: [PATCH 2/4] Fixing pvalloc memleak test
A request to allocate 30K bytes using pvalloc() results
in allocating 3*64Kb (on a 64Kb page-size system). The assertion
expects the leak to be 30Kb, whereas the leaked memory is much more
due to pvalloc's implementation for Power.
Signed-off-by: Abhishek Dubey <adubey@linux.ibm.com>
---
tests/python/test_tools_memleak.py | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/tests/python/test_tools_memleak.py b/tests/python/test_tools_memleak.py
index cae7e35d..4e921a0c 100755
--- a/tests/python/test_tools_memleak.py
+++ b/tests/python/test_tools_memleak.py
@@ -3,6 +3,7 @@
from unittest import main, skipUnless, TestCase
from utils import kernel_version_ge
import os
+import platform
import subprocess
import sys
import tempfile
@@ -102,7 +103,13 @@ TOOLS_DIR = "/bcc/tools/"
self.assertEqual(cfg.leaking_amount, self.run_leaker("memalign"))
def test_pvalloc(self):
- self.assertEqual(cfg.leaking_amount, self.run_leaker("pvalloc"))
+ # pvalloc's implementation for power invokes mmap(), which adjusts the
+ # allocated size to meet pvalloc's constraints. Actual leaked memory
+ # could be more than requested, hence assertLessEqual.
+ if platform.machine() == 'ppc64le':
+ self.assertLessEqual(cfg.leaking_amount, self.run_leaker("pvalloc"))
+ else:
+ self.assertEqual(cfg.leaking_amount, self.run_leaker("pvalloc"))
def test_aligned_alloc(self):
self.assertEqual(cfg.leaking_amount, self.run_leaker("aligned_alloc"))
--
2.43.0


@@ -0,0 +1,41 @@
From a5d86850e3bfeaa23ef4c82dccb9288a2cd42a27 Mon Sep 17 00:00:00 2001
From: Abhishek Dubey <adubey@linux.ibm.com>
Date: Mon, 11 Sep 2023 05:10:36 -0400
Subject: [PATCH 3/4] Skipping USDT tests for Power processor
Support for the Power processor is absent from the folly package,
so skip the USDT tests that depend on folly.
Signed-off-by: Abhishek Dubey <adubey@linux.ibm.com>
---
tests/python/CMakeLists.txt | 14 ++++++++------
1 file changed, 8 insertions(+), 6 deletions(-)
diff --git a/tests/python/CMakeLists.txt b/tests/python/CMakeLists.txt
index a42a16ce..81a547f0 100644
--- a/tests/python/CMakeLists.txt
+++ b/tests/python/CMakeLists.txt
@@ -71,12 +71,14 @@ add_test(NAME py_test_tools_smoke WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
COMMAND ${TEST_WRAPPER} py_test_tools_smoke sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_tools_smoke.py)
add_test(NAME py_test_tools_memleak WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
COMMAND ${TEST_WRAPPER} py_test_tools_memleak sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_tools_memleak.py)
-add_test(NAME py_test_usdt WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
- COMMAND ${TEST_WRAPPER} py_test_usdt sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_usdt.py)
-add_test(NAME py_test_usdt2 WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
- COMMAND ${TEST_WRAPPER} py_test_usdt2 sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_usdt2.py)
-add_test(NAME py_test_usdt3 WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
- COMMAND ${TEST_WRAPPER} py_test_usdt3 sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_usdt3.py)
+if(NOT(CMAKE_SYSTEM_PROCESSOR STREQUAL "ppc64le" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "ppc64"))
+ add_test(NAME py_test_usdt WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+ COMMAND ${TEST_WRAPPER} py_test_usdt sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_usdt.py)
+ add_test(NAME py_test_usdt2 WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+ COMMAND ${TEST_WRAPPER} py_test_usdt2 sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_usdt2.py)
+ add_test(NAME py_test_usdt3 WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+ COMMAND ${TEST_WRAPPER} py_test_usdt3 sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_usdt3.py)
+endif()
add_test(NAME py_test_license WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
COMMAND ${TEST_WRAPPER} py_test_license sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_license.py)
add_test(NAME py_test_free_bcc_memory WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
--
2.43.0

File diff suppressed because it is too large


@@ -0,0 +1,476 @@
From 60860bf3a400dcf72b4026fb2973803cfb12ccf1 Mon Sep 17 00:00:00 2001
From: mickey_zhu <mickey_zhu@realsil.com.cn>
Date: Tue, 27 Jun 2023 16:32:44 +0800
Subject: [PATCH] libbpf-tools: add block_io_{start,done} tracepoints support
to bio tools
Some bio tools fail to kprobe blk_account_io_{start,done} after v5.17,
because these functions became inlined, see [0]. To fix this issue, the
tracepoints block_io_{start,done} were introduced in the kernel, see [1].
Update the related bio tools to support the new tracepoints, and also
simplify attaching.
[0] Kernel commit 450b7879e345 (block: move blk_account_io_{start,done} to blk-mq.c)
[1] Kernel commit 5a80bd075f3b (block: introduce block_io_start/block_io_done tracepoints)
Change-Id: I62b957abd7ce2901eb114bd57c78938e4f083e4d
Signed-off-by: Mickey Zhu <mickey_zhu@realsil.com.cn>
---
libbpf-tools/biosnoop.bpf.c | 9 ++++
libbpf-tools/biosnoop.c | 78 +++++++++++++--------------------
libbpf-tools/biostacks.bpf.c | 46 +++++++++++++------
libbpf-tools/biostacks.c | 85 +++++++++++++++++++++---------------
libbpf-tools/biotop.bpf.c | 44 +++++++++++++++++--
libbpf-tools/biotop.c | 59 ++++++++++++++++---------
6 files changed, 199 insertions(+), 122 deletions(-)
diff --git a/libbpf-tools/biosnoop.bpf.c b/libbpf-tools/biosnoop.bpf.c
index b791555f..fcc5c5ce 100644
--- a/libbpf-tools/biosnoop.bpf.c
+++ b/libbpf-tools/biosnoop.bpf.c
@@ -76,6 +76,15 @@ int BPF_PROG(blk_account_io_start, struct request *rq)
return trace_pid(rq);
}
+SEC("tp_btf/block_io_start")
+int BPF_PROG(block_io_start, struct request *rq)
+{
+ if (filter_cg && !bpf_current_task_under_cgroup(&cgroup_map, 0))
+ return 0;
+
+ return trace_pid(rq);
+}
+
SEC("kprobe/blk_account_io_merge_bio")
int BPF_KPROBE(blk_account_io_merge_bio, struct request *rq)
{
diff --git a/libbpf-tools/biosnoop.c b/libbpf-tools/biosnoop.c
index 21773729..f9468900 100644
--- a/libbpf-tools/biosnoop.c
+++ b/libbpf-tools/biosnoop.c
@@ -212,6 +212,16 @@ void handle_lost_events(void *ctx, int cpu, __u64 lost_cnt)
fprintf(stderr, "lost %llu events on CPU #%d\n", lost_cnt, cpu);
}
+static void blk_account_io_set_attach_target(struct biosnoop_bpf *obj)
+{
+ if (fentry_can_attach("blk_account_io_start", NULL))
+ bpf_program__set_attach_target(obj->progs.blk_account_io_start,
+ 0, "blk_account_io_start");
+ else
+ bpf_program__set_attach_target(obj->progs.blk_account_io_start,
+ 0, "__blk_account_io_start");
+}
+
int main(int argc, char **argv)
{
const struct partition *partition;
@@ -260,12 +270,23 @@ int main(int argc, char **argv)
obj->rodata->filter_cg = env.cg;
obj->rodata->min_ns = env.min_lat_ms * 1000000;
- if (fentry_can_attach("blk_account_io_start", NULL))
- bpf_program__set_attach_target(obj->progs.blk_account_io_start, 0,
- "blk_account_io_start");
- else
- bpf_program__set_attach_target(obj->progs.blk_account_io_start, 0,
- "__blk_account_io_start");
+ if (tracepoint_exists("block", "block_io_start"))
+ bpf_program__set_autoload(obj->progs.blk_account_io_start, false);
+ else {
+ bpf_program__set_autoload(obj->progs.block_io_start, false);
+ blk_account_io_set_attach_target(obj);
+ }
+
+ ksyms = ksyms__load();
+ if (!ksyms) {
+ fprintf(stderr, "failed to load kallsyms\n");
+ goto cleanup;
+ }
+ if (!ksyms__get_symbol(ksyms, "blk_account_io_merge_bio"))
+ bpf_program__set_autoload(obj->progs.blk_account_io_merge_bio, false);
+
+ if (!env.queued)
+ bpf_program__set_autoload(obj->progs.block_rq_insert, false);
err = biosnoop_bpf__load(obj);
if (err) {
@@ -288,48 +309,9 @@ int main(int argc, char **argv)
}
}
- obj->links.blk_account_io_start = bpf_program__attach(obj->progs.blk_account_io_start);
- if (!obj->links.blk_account_io_start) {
- err = -errno;
- fprintf(stderr, "failed to attach blk_account_io_start: %s\n",
- strerror(-err));
- goto cleanup;
- }
- ksyms = ksyms__load();
- if (!ksyms) {
- err = -ENOMEM;
- fprintf(stderr, "failed to load kallsyms\n");
- goto cleanup;
- }
- if (ksyms__get_symbol(ksyms, "blk_account_io_merge_bio")) {
- obj->links.blk_account_io_merge_bio =
- bpf_program__attach(obj->progs.blk_account_io_merge_bio);
- if (!obj->links.blk_account_io_merge_bio) {
- err = -errno;
- fprintf(stderr, "failed to attach blk_account_io_merge_bio: %s\n",
- strerror(-err));
- goto cleanup;
- }
- }
- if (env.queued) {
- obj->links.block_rq_insert =
- bpf_program__attach(obj->progs.block_rq_insert);
- if (!obj->links.block_rq_insert) {
- err = -errno;
- fprintf(stderr, "failed to attach block_rq_insert: %s\n", strerror(-err));
- goto cleanup;
- }
- }
- obj->links.block_rq_issue = bpf_program__attach(obj->progs.block_rq_issue);
- if (!obj->links.block_rq_issue) {
- err = -errno;
- fprintf(stderr, "failed to attach block_rq_issue: %s\n", strerror(-err));
- goto cleanup;
- }
- obj->links.block_rq_complete = bpf_program__attach(obj->progs.block_rq_complete);
- if (!obj->links.block_rq_complete) {
- err = -errno;
- fprintf(stderr, "failed to attach block_rq_complete: %s\n", strerror(-err));
+ err = biosnoop_bpf__attach(obj);
+ if (err) {
+ fprintf(stderr, "failed to attach BPF programs: %d\n", err);
goto cleanup;
}
diff --git a/libbpf-tools/biostacks.bpf.c b/libbpf-tools/biostacks.bpf.c
index c3950910..0ca69880 100644
--- a/libbpf-tools/biostacks.bpf.c
+++ b/libbpf-tools/biostacks.bpf.c
@@ -67,20 +67,8 @@ int trace_start(void *ctx, struct request *rq, bool merge_bio)
return 0;
}
-SEC("fentry/blk_account_io_start")
-int BPF_PROG(blk_account_io_start, struct request *rq)
-{
- return trace_start(ctx, rq, false);
-}
-
-SEC("kprobe/blk_account_io_merge_bio")
-int BPF_KPROBE(blk_account_io_merge_bio, struct request *rq)
-{
- return trace_start(ctx, rq, true);
-}
-
-SEC("fentry/blk_account_io_done")
-int BPF_PROG(blk_account_io_done, struct request *rq)
+static __always_inline
+int trace_done(void *ctx, struct request *rq)
{
u64 slot, ts = bpf_ktime_get_ns();
struct internal_rqinfo *i_rqinfop;
@@ -110,4 +98,34 @@ int BPF_PROG(blk_account_io_done, struct request *rq)
return 0;
}
+SEC("kprobe/blk_account_io_merge_bio")
+int BPF_KPROBE(blk_account_io_merge_bio, struct request *rq)
+{
+ return trace_start(ctx, rq, true);
+}
+
+SEC("fentry/blk_account_io_start")
+int BPF_PROG(blk_account_io_start, struct request *rq)
+{
+ return trace_start(ctx, rq, false);
+}
+
+SEC("fentry/blk_account_io_done")
+int BPF_PROG(blk_account_io_done, struct request *rq)
+{
+ return trace_done(ctx, rq);
+}
+
+SEC("tp_btf/block_io_start")
+int BPF_PROG(block_io_start, struct request *rq)
+{
+ return trace_start(ctx, rq, false);
+}
+
+SEC("tp_btf/block_io_done")
+int BPF_PROG(block_io_done, struct request *rq)
+{
+ return trace_done(ctx, rq);
+}
+
char LICENSE[] SEC("license") = "GPL";
diff --git a/libbpf-tools/biostacks.c b/libbpf-tools/biostacks.c
index e1878d1f..e7875f76 100644
--- a/libbpf-tools/biostacks.c
+++ b/libbpf-tools/biostacks.c
@@ -128,6 +128,39 @@ void print_map(struct ksyms *ksyms, struct partitions *partitions, int fd)
return;
}
+static bool has_block_io_tracepoints(void)
+{
+ return tracepoint_exists("block", "block_io_start") &&
+ tracepoint_exists("block", "block_io_done");
+}
+
+static void disable_block_io_tracepoints(struct biostacks_bpf *obj)
+{
+ bpf_program__set_autoload(obj->progs.block_io_start, false);
+ bpf_program__set_autoload(obj->progs.block_io_done, false);
+}
+
+static void disable_blk_account_io_fentry(struct biostacks_bpf *obj)
+{
+ bpf_program__set_autoload(obj->progs.blk_account_io_start, false);
+ bpf_program__set_autoload(obj->progs.blk_account_io_done, false);
+}
+
+static void blk_account_io_set_attach_target(struct biostacks_bpf *obj)
+{
+ if (fentry_can_attach("blk_account_io_start", NULL)) {
+ bpf_program__set_attach_target(obj->progs.blk_account_io_start,
+ 0, "blk_account_io_start");
+ bpf_program__set_attach_target(obj->progs.blk_account_io_done,
+ 0, "blk_account_io_done");
+ } else {
+ bpf_program__set_attach_target(obj->progs.blk_account_io_start,
+ 0, "__blk_account_io_start");
+ bpf_program__set_attach_target(obj->progs.blk_account_io_done,
+ 0, "__blk_account_io_done");
+ }
+}
+
int main(int argc, char **argv)
{
struct partitions *partitions = NULL;
@@ -172,50 +205,30 @@ int main(int argc, char **argv)
obj->rodata->targ_ms = env.milliseconds;
- if (fentry_can_attach("blk_account_io_start", NULL)) {
- bpf_program__set_attach_target(obj->progs.blk_account_io_start, 0,
- "blk_account_io_start");
- bpf_program__set_attach_target(obj->progs.blk_account_io_done, 0,
- "blk_account_io_done");
- } else {
- bpf_program__set_attach_target(obj->progs.blk_account_io_start, 0,
- "__blk_account_io_start");
- bpf_program__set_attach_target(obj->progs.blk_account_io_done, 0,
- "__blk_account_io_done");
- }
-
- err = biostacks_bpf__load(obj);
- if (err) {
- fprintf(stderr, "failed to load BPF object: %d\n", err);
- goto cleanup;
+ if (has_block_io_tracepoints())
+ disable_blk_account_io_fentry(obj);
+ else {
+ disable_block_io_tracepoints(obj);
+ blk_account_io_set_attach_target(obj);
}
- obj->links.blk_account_io_start = bpf_program__attach(obj->progs.blk_account_io_start);
- if (!obj->links.blk_account_io_start) {
- err = -errno;
- fprintf(stderr, "failed to attach blk_account_io_start: %s\n", strerror(-err));
- goto cleanup;
- }
ksyms = ksyms__load();
if (!ksyms) {
fprintf(stderr, "failed to load kallsyms\n");
goto cleanup;
}
- if (ksyms__get_symbol(ksyms, "blk_account_io_merge_bio")) {
- obj->links.blk_account_io_merge_bio =
- bpf_program__attach(obj->progs.blk_account_io_merge_bio);
- if (!obj->links.blk_account_io_merge_bio) {
- err = -errno;
- fprintf(stderr, "failed to attach blk_account_io_merge_bio: %s\n",
- strerror(-err));
- goto cleanup;
- }
+ if (!ksyms__get_symbol(ksyms, "blk_account_io_merge_bio"))
+ bpf_program__set_autoload(obj->progs.blk_account_io_merge_bio, false);
+
+ err = biostacks_bpf__load(obj);
+ if (err) {
+ fprintf(stderr, "failed to load BPF object: %d\n", err);
+ goto cleanup;
}
- obj->links.blk_account_io_done = bpf_program__attach(obj->progs.blk_account_io_done);
- if (!obj->links.blk_account_io_done) {
- err = -errno;
- fprintf(stderr, "failed to attach blk_account_io_done: %s\n",
- strerror(-err));
+
+ err = biostacks_bpf__attach(obj);
+ if (err) {
+ fprintf(stderr, "failed to attach BPF programs: %d\n", err);
goto cleanup;
}
diff --git a/libbpf-tools/biotop.bpf.c b/libbpf-tools/biotop.bpf.c
index 226e32d3..07631378 100644
--- a/libbpf-tools/biotop.bpf.c
+++ b/libbpf-tools/biotop.bpf.c
@@ -30,8 +30,8 @@ struct {
__type(value, struct val_t);
} counts SEC(".maps");
-SEC("kprobe")
-int BPF_KPROBE(blk_account_io_start, struct request *req)
+static __always_inline
+int trace_start(struct request *req)
{
struct who_t who = {};
@@ -56,8 +56,8 @@ int BPF_KPROBE(blk_mq_start_request, struct request *req)
return 0;
}
-SEC("kprobe")
-int BPF_KPROBE(blk_account_io_done, struct request *req, u64 now)
+static __always_inline
+int trace_done(struct request *req)
{
struct val_t *valp, zero = {};
struct info_t info = {};
@@ -103,4 +103,40 @@ int BPF_KPROBE(blk_account_io_done, struct request *req, u64 now)
return 0;
}
+SEC("kprobe/blk_account_io_start")
+int BPF_KPROBE(blk_account_io_start, struct request *req)
+{
+ return trace_start(req);
+}
+
+SEC("kprobe/blk_account_io_done")
+int BPF_KPROBE(blk_account_io_done, struct request *req)
+{
+ return trace_done(req);
+}
+
+SEC("kprobe/__blk_account_io_start")
+int BPF_KPROBE(__blk_account_io_start, struct request *req)
+{
+ return trace_start(req);
+}
+
+SEC("kprobe/__blk_account_io_done")
+int BPF_KPROBE(__blk_account_io_done, struct request *req)
+{
+ return trace_done(req);
+}
+
+SEC("tp_btf/block_io_start")
+int BPF_PROG(block_io_start, struct request *req)
+{
+ return trace_start(req);
+}
+
+SEC("tp_btf/block_io_done")
+int BPF_PROG(block_io_done, struct request *req)
+{
+ return trace_done(req);
+}
+
char LICENSE[] SEC("license") = "GPL";
diff --git a/libbpf-tools/biotop.c b/libbpf-tools/biotop.c
index 75484281..5b3a7cf3 100644
--- a/libbpf-tools/biotop.c
+++ b/libbpf-tools/biotop.c
@@ -354,6 +354,38 @@ static int print_stat(struct biotop_bpf *obj)
return err;
}
+static bool has_block_io_tracepoints(void)
+{
+ return tracepoint_exists("block", "block_io_start") &&
+ tracepoint_exists("block", "block_io_done");
+}
+
+static void disable_block_io_tracepoints(struct biotop_bpf *obj)
+{
+ bpf_program__set_autoload(obj->progs.block_io_start, false);
+ bpf_program__set_autoload(obj->progs.block_io_done, false);
+}
+
+static void disable_blk_account_io_kprobes(struct biotop_bpf *obj)
+{
+ bpf_program__set_autoload(obj->progs.blk_account_io_start, false);
+ bpf_program__set_autoload(obj->progs.blk_account_io_done, false);
+ bpf_program__set_autoload(obj->progs.__blk_account_io_start, false);
+ bpf_program__set_autoload(obj->progs.__blk_account_io_done, false);
+}
+
+static void blk_account_io_set_autoload(struct biotop_bpf *obj,
+ struct ksyms *ksyms)
+{
+ if (!ksyms__get_symbol(ksyms, "__blk_account_io_start")) {
+ bpf_program__set_autoload(obj->progs.__blk_account_io_start, false);
+ bpf_program__set_autoload(obj->progs.__blk_account_io_done, false);
+ } else {
+ bpf_program__set_autoload(obj->progs.blk_account_io_start, false);
+ bpf_program__set_autoload(obj->progs.blk_account_io_done, false);
+ }
+}
+
int main(int argc, char **argv)
{
static const struct argp argp = {
@@ -386,32 +418,19 @@ int main(int argc, char **argv)
goto cleanup;
}
+ if (has_block_io_tracepoints())
+ disable_blk_account_io_kprobes(obj);
+ else {
+ disable_block_io_tracepoints(obj);
+ blk_account_io_set_autoload(obj, ksyms);
+ }
+
err = biotop_bpf__load(obj);
if (err) {
warn("failed to load BPF object: %d\n", err);
goto cleanup;
}
- if (ksyms__get_symbol(ksyms, "__blk_account_io_start"))
- obj->links.blk_account_io_start = bpf_program__attach_kprobe(obj->progs.blk_account_io_start, false, "__blk_account_io_start");
- else
- obj->links.blk_account_io_start = bpf_program__attach_kprobe(obj->progs.blk_account_io_start, false, "blk_account_io_start");
-
- if (!obj->links.blk_account_io_start) {
- warn("failed to load attach blk_account_io_start\n");
- goto cleanup;
- }
-
- if (ksyms__get_symbol(ksyms, "__blk_account_io_done"))
- obj->links.blk_account_io_done = bpf_program__attach_kprobe(obj->progs.blk_account_io_done, false, "__blk_account_io_done");
- else
- obj->links.blk_account_io_done = bpf_program__attach_kprobe(obj->progs.blk_account_io_done, false, "blk_account_io_done");
-
- if (!obj->links.blk_account_io_done) {
- warn("failed to load attach blk_account_io_done\n");
- goto cleanup;
- }
-
err = biotop_bpf__attach(obj);
if (err) {
warn("failed to attach BPF programs: %d\n", err);
--
2.41.0
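
The runtime choice made above with bpf_program__set_autoload() follows a simple rule: if the kernel exposes the block:block_io_start/block_io_done tracepoints, use them; otherwise fall back to the blk_account_io_* attach points, which may be inlined on newer kernels. A rough sketch of the detection step, checking tracefs directly (the helper and paths are assumptions for illustration, not libbpf-tools code):

import os

def tracepoint_exists(category, event):
    # A tracepoint is available if its directory exists under tracefs.
    for base in ("/sys/kernel/tracing", "/sys/kernel/debug/tracing"):
        if os.path.isdir(os.path.join(base, "events", category, event)):
            return True
    return False

use_tracepoints = (tracepoint_exists("block", "block_io_start") and
                   tracepoint_exists("block", "block_io_done"))
print("using", "block_io_* tracepoints" if use_tracepoints else "blk_account_io_* probes")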


@@ -0,0 +1,855 @@
From 2e758b65231f976c67a0aad791aabc7927ea7086 Mon Sep 17 00:00:00 2001
From: Jerome Marchand <jmarchan@redhat.com>
Date: Thu, 27 Jul 2023 18:19:18 +0200
Subject: [PATCH] tools: Add support for the new block_io_* tracepoints
The bio tools currently depend on the blk_account_io_done/start functions,
which can be inlined. To fix that, a couple of tracepoints have been
added upstream (block:block_io_start/done). This patch adds support
for those tracepoints when they are available.
Unfortunately, the bio tools rely on data that is not available to
the tracepoints (mostly the struct request). So the tracepoints can't
be used as a drop-in replacement for blk_account_io_*. The main difference
is that we can't use the struct request as the hash key anymore, so the
tools now use the pair (dev_t, sector) for that purpose.
For the biolatency tool, the -F option is disabled when only the
tracepoints are available because the flags are not all accessible
from the tracepoints. Otherwise, all features of the tools should
remain.
Closes #4261
Signed-off-by: Jerome Marchand <jmarchan@redhat.com>
---
tools/biolatency.py | 166 ++++++++++++++++++++++++++++--------
tools/biosnoop.py | 200 +++++++++++++++++++++++++++++++++-----------
tools/biotop.py | 108 +++++++++++++++++++-----
3 files changed, 371 insertions(+), 103 deletions(-)
diff --git a/tools/biolatency.py b/tools/biolatency.py
index 8fe43a7c..03b48a4c 100755
--- a/tools/biolatency.py
+++ b/tools/biolatency.py
@@ -11,6 +11,7 @@
#
# 20-Sep-2015 Brendan Gregg Created this.
# 31-Mar-2022 Rocky Xing Added disk filter support.
+# 01-Aug-2023 Jerome Marchand Added support for block tracepoints
from __future__ import print_function
from bcc import BPF
@@ -72,7 +73,7 @@ bpf_text = """
#include <linux/blk-mq.h>
typedef struct disk_key {
- char disk[DISK_NAME_LEN];
+ dev_t dev;
u64 slot;
} disk_key_t;
@@ -86,26 +87,70 @@ typedef struct ext_val {
u64 count;
} ext_val_t;
-BPF_HASH(start, struct request *);
+struct tp_args {
+ u64 __unused__;
+ dev_t dev;
+ sector_t sector;
+ unsigned int nr_sector;
+ unsigned int bytes;
+ char rwbs[8];
+ char comm[16];
+ char cmd[];
+};
+
+struct start_key {
+ dev_t dev;
+ u32 _pad;
+ sector_t sector;
+ CMD_FLAGS
+};
+
+BPF_HASH(start, struct start_key);
STORAGE
+static dev_t ddevt(struct gendisk *disk) {
+ return (disk->major << 20) | disk->first_minor;
+}
+
// time block I/O
-int trace_req_start(struct pt_regs *ctx, struct request *req)
+static int __trace_req_start(struct start_key key)
{
DISK_FILTER
u64 ts = bpf_ktime_get_ns();
- start.update(&req, &ts);
+ start.update(&key, &ts);
return 0;
}
+int trace_req_start(struct pt_regs *ctx, struct request *req)
+{
+ struct start_key key = {
+ .dev = ddevt(req->__RQ_DISK__),
+ .sector = req->__sector
+ };
+
+ SET_FLAGS
+
+ return __trace_req_start(key);
+}
+
+int trace_req_start_tp(struct tp_args *args)
+{
+ struct start_key key = {
+ .dev = args->dev,
+ .sector = args->sector
+ };
+
+ return __trace_req_start(key);
+}
+
// output
-int trace_req_done(struct pt_regs *ctx, struct request *req)
+static int __trace_req_done(struct start_key key)
{
u64 *tsp, delta;
// fetch timestamp and calculate delta
- tsp = start.lookup(&req);
+ tsp = start.lookup(&key);
if (tsp == 0) {
return 0; // missed issue
}
@@ -116,9 +161,31 @@ int trace_req_done(struct pt_regs *ctx, struct request *req)
// store as histogram
STORE
- start.delete(&req);
+ start.delete(&key);
return 0;
}
+
+int trace_req_done(struct pt_regs *ctx, struct request *req)
+{
+ struct start_key key = {
+ .dev = ddevt(req->__RQ_DISK__),
+ .sector = req->__sector
+ };
+
+ SET_FLAGS
+
+ return __trace_req_done(key);
+}
+
+int trace_req_done_tp(struct tp_args *args)
+{
+ struct start_key key = {
+ .dev = args->dev,
+ .sector = args->sector
+ };
+
+ return __trace_req_done(key);
+}
"""
# code substitutions
@@ -134,21 +201,18 @@ store_str = ""
if args.disks:
storage_str += "BPF_HISTOGRAM(dist, disk_key_t);"
disks_str = """
- disk_key_t key = {.slot = bpf_log2l(delta)};
- void *__tmp = (void *)req->__RQ_DISK__->disk_name;
- bpf_probe_read(&key.disk, sizeof(key.disk), __tmp);
- dist.atomic_increment(key);
+ disk_key_t dkey = {};
+ dkey.dev = key.dev;
+ dkey.slot = bpf_log2l(delta);
+ dist.atomic_increment(dkey);
"""
- if BPF.kernel_struct_has_field(b'request', b'rq_disk') == 1:
- store_str += disks_str.replace('__RQ_DISK__', 'rq_disk')
- else:
- store_str += disks_str.replace('__RQ_DISK__', 'q->disk')
+ store_str += disks_str
elif args.flags:
storage_str += "BPF_HISTOGRAM(dist, flag_key_t);"
store_str += """
- flag_key_t key = {.slot = bpf_log2l(delta)};
- key.flags = req->cmd_flags;
- dist.atomic_increment(key);
+ flag_key_t fkey = {.slot = bpf_log2l(delta)};
+ fkey.flags = key.flags;
+ dist.atomic_increment(fkey);
"""
else:
storage_str += "BPF_HISTOGRAM(dist);"
@@ -161,21 +225,13 @@ store_str = ""
exit(1)
stat_info = os.stat(disk_path)
- major = os.major(stat_info.st_rdev)
- minor = os.minor(stat_info.st_rdev)
-
- disk_field_str = ""
- if BPF.kernel_struct_has_field(b'request', b'rq_disk') == 1:
- disk_field_str = 'req->rq_disk'
- else:
- disk_field_str = 'req->q->disk'
+ dev = os.major(stat_info.st_rdev) << 20 | os.minor(stat_info.st_rdev)
disk_filter_str = """
- struct gendisk *disk = %s;
- if (!(disk->major == %d && disk->first_minor == %d)) {
+ if(key.dev != %s) {
return 0;
}
- """ % (disk_field_str, major, minor)
+ """ % (dev)
bpf_text = bpf_text.replace('DISK_FILTER', disk_filter_str)
else:
@@ -194,6 +250,16 @@ store_str = ""
bpf_text = bpf_text.replace("STORAGE", storage_str)
bpf_text = bpf_text.replace("STORE", store_str)
+if BPF.kernel_struct_has_field(b'request', b'rq_disk') == 1:
+ bpf_text = bpf_text.replace('__RQ_DISK__', 'rq_disk')
+else:
+ bpf_text = bpf_text.replace('__RQ_DISK__', 'q->disk')
+if args.flags:
+ bpf_text = bpf_text.replace('CMD_FLAGS', 'u64 flags;')
+ bpf_text = bpf_text.replace('SET_FLAGS', 'key.flags = req->cmd_flags;')
+else:
+ bpf_text = bpf_text.replace('CMD_FLAGS', '')
+ bpf_text = bpf_text.replace('SET_FLAGS', '')
if debug or args.ebpf:
print(bpf_text)
@@ -205,25 +271,53 @@ b = BPF(text=bpf_text)
if args.queued:
if BPF.get_kprobe_functions(b'__blk_account_io_start'):
b.attach_kprobe(event="__blk_account_io_start", fn_name="trace_req_start")
- else:
+ elif BPF.get_kprobe_functions(b'blk_account_io_start'):
b.attach_kprobe(event="blk_account_io_start", fn_name="trace_req_start")
+ else:
+ if args.flags:
+ # Some flags are accessible in the rwbs field (RAHEAD, SYNC and META)
+ # but others aren't. Disable the -F option for tracepoints for now.
+ print("ERROR: blk_account_io_start probe not available. Can't use -F.")
+ exit()
+ b.attach_tracepoint(tp="block:block_io_start", fn_name="trace_req_start_tp")
else:
if BPF.get_kprobe_functions(b'blk_start_request'):
b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start")
+
if BPF.get_kprobe_functions(b'__blk_account_io_done'):
b.attach_kprobe(event="__blk_account_io_done", fn_name="trace_req_done")
-else:
+elif BPF.get_kprobe_functions(b'blk_account_io_done'):
b.attach_kprobe(event="blk_account_io_done", fn_name="trace_req_done")
+else:
+ if args.flags:
+ print("ERROR: blk_account_io_done probe not available. Can't use -F.")
+ exit()
+ b.attach_tracepoint(tp="block:block_io_done", fn_name="trace_req_done_tp")
+
if not args.json:
print("Tracing block device I/O... Hit Ctrl-C to end.")
-def disk_print(s):
- disk = s.decode('utf-8', 'replace')
- if not disk:
- disk = "<unknown>"
- return disk
+# cache disk major,minor -> diskname
+diskstats = "/proc/diskstats"
+disklookup = {}
+with open(diskstats) as stats:
+ for line in stats:
+ a = line.split()
+ disklookup[a[0] + "," + a[1]] = a[2]
+
+def disk_print(d):
+ major = d >> 20
+ minor = d & ((1 << 20) - 1)
+
+ disk = str(major) + "," + str(minor)
+ if disk in disklookup:
+ diskname = disklookup[disk]
+ else:
+ diskname = "?"
+
+ return diskname
# see blk_fill_rwbs():
req_opf = {
diff --git a/tools/biosnoop.py b/tools/biosnoop.py
index 33703233..f0fef98b 100755
--- a/tools/biosnoop.py
+++ b/tools/biosnoop.py
@@ -14,6 +14,7 @@
# 11-Feb-2016 Allan McAleavy updated for BPF_PERF_OUTPUT
# 21-Jun-2022 Rocky Xing Added disk filter support.
# 13-Oct-2022 Rocky Xing Added support for displaying block I/O pattern.
+# 01-Aug-2023 Jerome Marchand Added support for block tracepoints
from __future__ import print_function
from bcc import BPF
@@ -64,6 +65,24 @@ struct val_t {
char name[TASK_COMM_LEN];
};
+struct tp_args {
+ u64 __unused__;
+ dev_t dev;
+ sector_t sector;
+ unsigned int nr_sector;
+ unsigned int bytes;
+ char rwbs[8];
+ char comm[16];
+ char cmd[];
+};
+
+struct hash_key {
+ dev_t dev;
+ u32 rwflag;
+ sector_t sector;
+};
+
+
#ifdef INCLUDE_PATTERN
struct sector_key_t {
u32 dev_major;
@@ -79,6 +98,7 @@ enum bio_pattern {
struct data_t {
u32 pid;
+ u32 dev;
u64 rwflag;
u64 delta;
u64 qdelta;
@@ -88,7 +108,6 @@ struct data_t {
enum bio_pattern pattern;
#endif
u64 ts;
- char disk_name[DISK_NAME_LEN];
char name[TASK_COMM_LEN];
};
@@ -96,12 +115,45 @@ struct data_t {
BPF_HASH(last_sectors, struct sector_key_t, u64);
#endif
-BPF_HASH(start, struct request *, struct start_req_t);
-BPF_HASH(infobyreq, struct request *, struct val_t);
+BPF_HASH(start, struct hash_key, struct start_req_t);
+BPF_HASH(infobyreq, struct hash_key, struct val_t);
BPF_PERF_OUTPUT(events);
+static dev_t ddevt(struct gendisk *disk) {
+ return (disk->major << 20) | disk->first_minor;
+}
+
+/*
+ * The following deals with a kernel version change (in mainline 4.7, although
+ * it may be backported to earlier kernels) with how block request write flags
+ * are tested. We handle both pre- and post-change versions here. Please avoid
+ * kernel version tests like this as much as possible: they inflate the code,
+ * test, and maintenance burden.
+ */
+static int get_rwflag(u32 cmd_flags) {
+#ifdef REQ_WRITE
+ return !!(cmd_flags & REQ_WRITE);
+#elif defined(REQ_OP_SHIFT)
+ return !!((cmd_flags >> REQ_OP_SHIFT) == REQ_OP_WRITE);
+#else
+ return !!((cmd_flags & REQ_OP_MASK) == REQ_OP_WRITE);
+#endif
+}
+
+#define RWBS_LEN 8
+
+static int get_rwflag_tp(char *rwbs) {
+ for (int i = 0; i < RWBS_LEN; i++) {
+ if (rwbs[i] == 'W')
+ return 1;
+ if (rwbs[i] == '\\0')
+ return 0;
+ }
+ return 0;
+}
+
// cache PID and comm by-req
-int trace_pid_start(struct pt_regs *ctx, struct request *req)
+static int __trace_pid_start(struct hash_key key)
{
DISK_FILTER
@@ -113,47 +165,76 @@ int trace_pid_start(struct pt_regs *ctx, struct request *req)
if (##QUEUE##) {
val.ts = bpf_ktime_get_ns();
}
- infobyreq.update(&req, &val);
+ infobyreq.update(&key, &val);
}
return 0;
}
+
+int trace_pid_start(struct pt_regs *ctx, struct request *req)
+{
+ struct hash_key key = {
+ .dev = ddevt(req->__RQ_DISK__),
+ .rwflag = get_rwflag(req->cmd_flags),
+ .sector = req->__sector
+ };
+
+ return __trace_pid_start(key);
+}
+
+int trace_pid_start_tp(struct tp_args *args)
+{
+ struct hash_key key = {
+ .dev = args->dev,
+ .rwflag = get_rwflag_tp(args->rwbs),
+ .sector = args->sector
+ };
+
+ return __trace_pid_start(key);
+}
+
// time block I/O
int trace_req_start(struct pt_regs *ctx, struct request *req)
{
+ struct hash_key key = {
+ .dev = ddevt(req->__RQ_DISK__),
+ .rwflag = get_rwflag(req->cmd_flags),
+ .sector = req->__sector
+ };
+
DISK_FILTER
struct start_req_t start_req = {
.ts = bpf_ktime_get_ns(),
.data_len = req->__data_len
};
- start.update(&req, &start_req);
+ start.update(&key, &start_req);
return 0;
}
// output
-int trace_req_completion(struct pt_regs *ctx, struct request *req)
+static int __trace_req_completion(void *ctx, struct hash_key key)
{
struct start_req_t *startp;
struct val_t *valp;
struct data_t data = {};
- struct gendisk *rq_disk;
+ //struct gendisk *rq_disk;
u64 ts;
// fetch timestamp and calculate delta
- startp = start.lookup(&req);
+ startp = start.lookup(&key);
if (startp == 0) {
// missed tracing issue
return 0;
}
ts = bpf_ktime_get_ns();
- rq_disk = req->__RQ_DISK__;
+ //rq_disk = req->__RQ_DISK__;
data.delta = ts - startp->ts;
data.ts = ts / 1000;
data.qdelta = 0;
data.len = startp->data_len;
- valp = infobyreq.lookup(&req);
+ valp = infobyreq.lookup(&key);
if (valp == 0) {
data.name[0] = '?';
data.name[1] = 0;
@@ -162,10 +243,9 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
data.qdelta = startp->ts - valp->ts;
}
data.pid = valp->pid;
- data.sector = req->__sector;
+ data.sector = key.sector;
+ data.dev = key.dev;
bpf_probe_read_kernel(&data.name, sizeof(data.name), valp->name);
- bpf_probe_read_kernel(&data.disk_name, sizeof(data.disk_name),
- rq_disk->disk_name);
}
#ifdef INCLUDE_PATTERN
@@ -174,8 +254,8 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
u64 *sector, last_sector;
struct sector_key_t sector_key = {
- .dev_major = rq_disk->major,
- .dev_minor = rq_disk->first_minor
+ .dev_major = key.dev >> 20,
+ .dev_minor = key.dev & ((1 << 20) - 1)
};
sector = last_sectors.lookup(&sector_key);
@@ -187,27 +267,36 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
last_sectors.update(&sector_key, &last_sector);
#endif
-/*
- * The following deals with a kernel version change (in mainline 4.7, although
- * it may be backported to earlier kernels) with how block request write flags
- * are tested. We handle both pre- and post-change versions here. Please avoid
- * kernel version tests like this as much as possible: they inflate the code,
- * test, and maintenance burden.
- */
-#ifdef REQ_WRITE
- data.rwflag = !!(req->cmd_flags & REQ_WRITE);
-#elif defined(REQ_OP_SHIFT)
- data.rwflag = !!((req->cmd_flags >> REQ_OP_SHIFT) == REQ_OP_WRITE);
-#else
- data.rwflag = !!((req->cmd_flags & REQ_OP_MASK) == REQ_OP_WRITE);
-#endif
+ data.rwflag = key.rwflag;
events.perf_submit(ctx, &data, sizeof(data));
- start.delete(&req);
- infobyreq.delete(&req);
+ start.delete(&key);
+ infobyreq.delete(&key);
return 0;
}
+
+int trace_req_completion(struct pt_regs *ctx, struct request *req)
+{
+ struct hash_key key = {
+ .dev = ddevt(req->__RQ_DISK__),
+ .rwflag = get_rwflag(req->cmd_flags),
+ .sector = req->__sector
+ };
+
+ return __trace_req_completion(ctx, key);
+}
+
+int trace_req_completion_tp(struct tp_args *args)
+{
+ struct hash_key key = {
+ .dev = args->dev,
+ .rwflag = get_rwflag_tp(args->rwbs),
+ .sector = args->sector
+ };
+
+ return __trace_req_completion(args, key);
+}
"""
if args.queue:
bpf_text = bpf_text.replace('##QUEUE##', '1')
@@ -225,21 +314,13 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
exit(1)
stat_info = os.stat(disk_path)
- major = os.major(stat_info.st_rdev)
- minor = os.minor(stat_info.st_rdev)
-
- disk_field_str = ""
- if BPF.kernel_struct_has_field(b'request', b'rq_disk') == 1:
- disk_field_str = 'req->rq_disk'
- else:
- disk_field_str = 'req->q->disk'
+ dev = os.major(stat_info.st_rdev) << 20 | os.minor(stat_info.st_rdev)
disk_filter_str = """
- struct gendisk *disk = %s;
- if (!(disk->major == %d && disk->first_minor == %d)) {
+ if(key.dev != %s) {
return 0;
}
- """ % (disk_field_str, major, minor)
+ """ % (dev)
bpf_text = bpf_text.replace('DISK_FILTER', disk_filter_str)
else:
@@ -254,15 +335,19 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
b = BPF(text=bpf_text)
if BPF.get_kprobe_functions(b'__blk_account_io_start'):
b.attach_kprobe(event="__blk_account_io_start", fn_name="trace_pid_start")
-else:
+elif BPF.get_kprobe_functions(b'blk_account_io_start'):
b.attach_kprobe(event="blk_account_io_start", fn_name="trace_pid_start")
+else:
+ b.attach_tracepoint(tp="block:block_io_start", fn_name="trace_pid_start_tp")
if BPF.get_kprobe_functions(b'blk_start_request'):
b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start")
if BPF.get_kprobe_functions(b'__blk_account_io_done'):
b.attach_kprobe(event="__blk_account_io_done", fn_name="trace_req_completion")
-else:
+elif BPF.get_kprobe_functions(b'blk_account_io_done'):
b.attach_kprobe(event="blk_account_io_done", fn_name="trace_req_completion")
+else:
+ b.attach_tracepoint(tp="block:block_io_done", fn_name="trace_req_completion_tp")
# header
print("%-11s %-14s %-7s %-9s %-1s %-10s %-7s" % ("TIME(s)", "COMM", "PID",
@@ -273,6 +358,27 @@ print("%-11s %-14s %-7s %-9s %-1s %-10s %-7s" % ("TIME(s)", "COMM", "PID",
print("%7s " % ("QUE(ms)"), end="")
print("%7s" % "LAT(ms)")
+
+# cache disk major,minor -> diskname
+diskstats = "/proc/diskstats"
+disklookup = {}
+with open(diskstats) as stats:
+ for line in stats:
+ a = line.split()
+ disklookup[a[0] + "," + a[1]] = a[2]
+
+def disk_print(d):
+ major = d >> 20
+ minor = d & ((1 << 20) - 1)
+
+ disk = str(major) + "," + str(minor)
+ if disk in disklookup:
+ diskname = disklookup[disk]
+ else:
+ diskname = "<unknown>"
+
+ return diskname
+
rwflg = ""
pattern = ""
start_ts = 0
@@ -297,9 +403,7 @@ P_RANDOM = 2
delta = float(event.ts) - start_ts
- disk_name = event.disk_name.decode('utf-8', 'replace')
- if not disk_name:
- disk_name = '<unknown>'
+ disk_name = disk_print(event.dev)
print("%-11.6f %-14.14s %-7s %-9s %-1s %-10s %-7s" % (
delta / 1000000, event.name.decode('utf-8', 'replace'), event.pid,
diff --git a/tools/biotop.py b/tools/biotop.py
index fcdd373f..2620983a 100755
--- a/tools/biotop.py
+++ b/tools/biotop.py
@@ -14,6 +14,7 @@
#
# 06-Feb-2016 Brendan Gregg Created this.
# 17-Mar-2022 Rocky Xing Added PID filter support.
+# 01-Aug-2023 Jerome Marchand Added support for block tracepoints
from __future__ import print_function
from bcc import BPF
@@ -88,14 +89,35 @@ struct val_t {
u32 io;
};
-BPF_HASH(start, struct request *, struct start_req_t);
-BPF_HASH(whobyreq, struct request *, struct who_t);
+struct tp_args {
+ u64 __unused__;
+ dev_t dev;
+ sector_t sector;
+ unsigned int nr_sector;
+ unsigned int bytes;
+ char rwbs[8];
+ char comm[16];
+ char cmd[];
+};
+
+struct hash_key {
+ dev_t dev;
+ u32 _pad;
+ sector_t sector;
+};
+
+BPF_HASH(start, struct hash_key, struct start_req_t);
+BPF_HASH(whobyreq, struct hash_key, struct who_t);
BPF_HASH(counts, struct info_t, struct val_t);
+static dev_t ddevt(struct gendisk *disk) {
+ return (disk->major << 20) | disk->first_minor;
+}
+
// cache PID and comm by-req
-int trace_pid_start(struct pt_regs *ctx, struct request *req)
+static int __trace_pid_start(struct hash_key key)
{
- struct who_t who = {};
+ struct who_t who;
u32 pid;
if (bpf_get_current_comm(&who.name, sizeof(who.name)) == 0) {
@@ -104,30 +126,54 @@ int trace_pid_start(struct pt_regs *ctx, struct request *req)
return 0;
who.pid = pid;
- whobyreq.update(&req, &who);
+ whobyreq.update(&key, &who);
}
return 0;
}
+int trace_pid_start(struct pt_regs *ctx, struct request *req)
+{
+ struct hash_key key = {
+ .dev = ddevt(req->__RQ_DISK__),
+ .sector = req->__sector
+ };
+
+ return __trace_pid_start(key);
+}
+
+int trace_pid_start_tp(struct tp_args *args)
+{
+ struct hash_key key = {
+ .dev = args->dev,
+ .sector = args->sector
+ };
+
+ return __trace_pid_start(key);
+}
+
// time block I/O
int trace_req_start(struct pt_regs *ctx, struct request *req)
{
+ struct hash_key key = {
+ .dev = ddevt(req->__RQ_DISK__),
+ .sector = req->__sector
+ };
struct start_req_t start_req = {
.ts = bpf_ktime_get_ns(),
.data_len = req->__data_len
};
- start.update(&req, &start_req);
+ start.update(&key, &start_req);
return 0;
}
// output
-int trace_req_completion(struct pt_regs *ctx, struct request *req)
+static int __trace_req_completion(struct hash_key key)
{
struct start_req_t *startp;
// fetch timestamp and calculate delta
- startp = start.lookup(&req);
+ startp = start.lookup(&key);
if (startp == 0) {
return 0; // missed tracing issue
}
@@ -135,12 +181,12 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
struct who_t *whop;
u32 pid;
- whop = whobyreq.lookup(&req);
+ whop = whobyreq.lookup(&key);
pid = whop != 0 ? whop->pid : 0;
if (FILTER_PID) {
- start.delete(&req);
+ start.delete(&key);
if (whop != 0) {
- whobyreq.delete(&req);
+ whobyreq.delete(&key);
}
return 0;
}
@@ -150,8 +196,8 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
// setup info_t key
struct info_t info = {};
- info.major = req->__RQ_DISK__->major;
- info.minor = req->__RQ_DISK__->first_minor;
+ info.major = key.dev >> 20;
+ info.minor = key.dev & ((1 << 20) - 1);
/*
* The following deals with a kernel version change (in mainline 4.7, although
* it may be backported to earlier kernels) with how block request write flags
@@ -159,13 +205,13 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
* kernel version tests like this as much as possible: they inflate the code,
* test, and maintenance burden.
*/
-#ifdef REQ_WRITE
+/*#ifdef REQ_WRITE
info.rwflag = !!(req->cmd_flags & REQ_WRITE);
#elif defined(REQ_OP_SHIFT)
info.rwflag = !!((req->cmd_flags >> REQ_OP_SHIFT) == REQ_OP_WRITE);
#else
info.rwflag = !!((req->cmd_flags & REQ_OP_MASK) == REQ_OP_WRITE);
-#endif
+#endif*/
if (whop == 0) {
// missed pid who, save stats as pid 0
@@ -183,11 +229,31 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
valp->io++;
}
- start.delete(&req);
- whobyreq.delete(&req);
+ start.delete(&key);
+ whobyreq.delete(&key);
return 0;
}
+
+int trace_req_completion(struct pt_regs *ctx, struct request *req)
+{
+ struct hash_key key = {
+ .dev = ddevt(req->__RQ_DISK__),
+ .sector = req->__sector
+ };
+
+ return __trace_req_completion(key);
+}
+
+int trace_req_completion_tp(struct tp_args *args)
+{
+ struct hash_key key = {
+ .dev = args->dev,
+ .sector = args->sector
+ };
+
+ return __trace_req_completion(key);
+}
"""
if args.ebpf:
@@ -207,15 +273,19 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
b = BPF(text=bpf_text)
if BPF.get_kprobe_functions(b'__blk_account_io_start'):
b.attach_kprobe(event="__blk_account_io_start", fn_name="trace_pid_start")
-else:
+elif BPF.get_kprobe_functions(b'blk_account_io_start'):
b.attach_kprobe(event="blk_account_io_start", fn_name="trace_pid_start")
+else:
+ b.attach_tracepoint(tp="block:block_io_start", fn_name="trace_pid_start_tp")
if BPF.get_kprobe_functions(b'blk_start_request'):
b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start")
if BPF.get_kprobe_functions(b'__blk_account_io_done'):
b.attach_kprobe(event="__blk_account_io_done", fn_name="trace_req_completion")
-else:
+elif BPF.get_kprobe_functions(b'blk_account_io_done'):
b.attach_kprobe(event="blk_account_io_done", fn_name="trace_req_completion")
+else:
+ b.attach_tracepoint(tp="block:block_io_done", fn_name="trace_req_completion_tp")
print('Tracing... Output every %d secs. Hit Ctrl-C to end' % interval)
--
2.41.0
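
The key change across all three tools is replacing the struct request pointer with a (dev, sector) pair: dev packs the major and minor numbers as (major << 20) | minor, and disk_print() reverses that packing to look names up in /proc/diskstats. A small worked example of that encoding, assuming a Linux host (the 8,0 device numbers are purely illustrative):

    def ddevt(major, minor):
        # same packing the patched tools compute from struct gendisk
        return (major << 20) | minor

    def major_minor(dev):
        return dev >> 20, dev & ((1 << 20) - 1)

    dev = ddevt(8, 0)                  # 8,0 is conventionally sda; illustrative only
    assert major_minor(dev) == (8, 0)

    # disk_print()-style reverse lookup through /proc/diskstats
    disklookup = {}
    with open("/proc/diskstats") as stats:
        for line in stats:
            a = line.split()
            disklookup[a[0] + "," + a[1]] = a[2]
    print(disklookup.get("%d,%d" % major_minor(dev), "?"))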

View File

@ -0,0 +1,156 @@
From 0d1a67ba9490aabbb874819d8d07b1868c8c2b1d Mon Sep 17 00:00:00 2001
From: Jerome Marchand <jmarchan@redhat.com>
Date: Wed, 1 Feb 2023 17:30:03 +0100
Subject: [PATCH 2/2] tools/tcpstates: fix IPv6 journal
When logging an IPv6 state change, journal_fields tries to pack
event.saddr and event.daddr, which are not integers in this case, to
present a bytes-like object to socket.inet_ntop. This can be fixed by
giving [sd]addr a similar type for IPv4 and IPv6. Making both an
array of u32 solves the issue by presenting a bytes-like object
directly to inet_ntop, without the need for the struct packing stage.
The similar behavior also makes it easier to factor the IPv4 and
IPv6 code.
It solves the following error:
/usr/share/bcc/tools/tcpstates -Y
SKADDR C-PID C-COMM LADDR LPORT RADDR RPORT OLDSTATE -> NEWSTATE MS
ffff8b2e83e56180 0 swapper/9 :: 22 :: 0 LISTEN -> SYN_RECV 0.000
Exception ignored on calling ctypes callback function: <function PerfEventArray._open_perf_buffer.<locals>.raw_cb_ at 0x7f894c8d7f70>
Traceback (most recent call last):
File "/usr/lib/python3.9/site-packages/bcc/table.py", line 982, in raw_cb_
callback(cpu, data, size)
File "/usr/share/bcc/tools/tcpstates", line 419, in print_ipv6_event
journal.send(**journal_fields(event, AF_INET6))
File "/usr/share/bcc/tools/tcpstates", line 348, in journal_fields
'OBJECT_' + addr_pfx + '_SOURCE_ADDRESS': inet_ntop(addr_family, pack("I", event.saddr)),
struct.error: required argument is not an integer
ffff8b2e83e56180 0 swapper/9 2620:52:0:2580:5054:ff:fe6b:6f1f 22 2620:52:0:2b11:2f5e:407d:b35d:4663 60396 SYN_RECV -> ESTABLISHED 0.010
Exception ignored on calling ctypes callback function: <function PerfEventArray._open_perf_buffer.<locals>.raw_cb_ at 0x7f894c8d7f70>
Traceback (most recent call last):
File "/usr/lib/python3.9/site-packages/bcc/table.py", line 982, in raw_cb_
callback(cpu, data, size)
File "/usr/share/bcc/tools/tcpstates", line 419, in print_ipv6_event
journal.send(**journal_fields(event, AF_INET6))
File "/usr/share/bcc/tools/tcpstates", line 348, in journal_fields
'OBJECT_' + addr_pfx + '_SOURCE_ADDRESS': inet_ntop(addr_family, pack("I", event.saddr)),
struct.error: required argument is not an integer
Signed-off-by: Jerome Marchand <jmarchan@redhat.com>
---
tools/tcpstates.py | 55 +++++++++++++++++-----------------------------
1 file changed, 20 insertions(+), 35 deletions(-)
diff --git a/tools/tcpstates.py b/tools/tcpstates.py
index 9b2ccfa4..6c845c9b 100755
--- a/tools/tcpstates.py
+++ b/tools/tcpstates.py
@@ -19,7 +19,6 @@ from __future__ import print_function
from bcc import BPF
import argparse
from socket import inet_ntop, AF_INET, AF_INET6
-from struct import pack
from time import strftime, time
from os import getuid
@@ -78,8 +77,8 @@ BPF_HASH(last, struct sock *, u64);
struct ipv4_data_t {
u64 ts_us;
u64 skaddr;
- u32 saddr;
- u32 daddr;
+ u32 saddr[1];
+ u32 daddr[1];
u64 span_us;
u32 pid;
u16 lport;
@@ -93,8 +92,8 @@ BPF_PERF_OUTPUT(ipv4_events);
struct ipv6_data_t {
u64 ts_us;
u64 skaddr;
- unsigned __int128 saddr;
- unsigned __int128 daddr;
+ u32 saddr[4];
+ u32 daddr[4];
u64 span_us;
u32 pid;
u16 lport;
@@ -350,9 +349,9 @@ format_string = ("%-16x %-5d %-10.10s %s%-15s %-5d %-15s %-5d %-11s " +
'OBJECT_PID': str(event.pid),
'OBJECT_COMM': event.task.decode('utf-8', 'replace'),
# Custom fields, aka "stuff we sort of made up".
- 'OBJECT_' + addr_pfx + '_SOURCE_ADDRESS': inet_ntop(addr_family, pack("I", event.saddr)),
+ 'OBJECT_' + addr_pfx + '_SOURCE_ADDRESS': inet_ntop(addr_family, event.saddr),
'OBJECT_TCP_SOURCE_PORT': str(event.lport),
- 'OBJECT_' + addr_pfx + '_DESTINATION_ADDRESS': inet_ntop(addr_family, pack("I", event.daddr)),
+ 'OBJECT_' + addr_pfx + '_DESTINATION_ADDRESS': inet_ntop(addr_family, event.daddr),
'OBJECT_TCP_DESTINATION_PORT': str(event.dport),
'OBJECT_TCP_OLD_STATE': tcpstate2str(event.oldstate),
'OBJECT_TCP_NEW_STATE': tcpstate2str(event.newstate),
@@ -373,8 +372,7 @@ format_string = ("%-16x %-5d %-10.10s %s%-15s %-5d %-15s %-5d %-11s " +
return fields
# process event
-def print_ipv4_event(cpu, data, size):
- event = b["ipv4_events"].event(data)
+def print_event(event, addr_family):
global start_ts
if args.time:
if args.csv:
@@ -389,39 +387,26 @@ format_string = ("%-16x %-5d %-10.10s %s%-15s %-5d %-15s %-5d %-11s " +
print("%.6f," % delta_s, end="")
else:
print("%-9.6f " % delta_s, end="")
+ if addr_family == AF_INET:
+ version = "4"
+ else:
+ version = "6"
print(format_string % (event.skaddr, event.pid, event.task.decode('utf-8', 'replace'),
- "4" if args.wide or args.csv else "",
- inet_ntop(AF_INET, pack("I", event.saddr)), event.lport,
- inet_ntop(AF_INET, pack("I", event.daddr)), event.dport,
+ version if args.wide or args.csv else "",
+ inet_ntop(addr_family, event.saddr), event.lport,
+ inet_ntop(addr_family, event.daddr), event.dport,
tcpstate2str(event.oldstate), tcpstate2str(event.newstate),
float(event.span_us) / 1000))
if args.journal:
- journal.send(**journal_fields(event, AF_INET))
+ journal.send(**journal_fields(event, addr_family))
+
+def print_ipv4_event(cpu, data, size):
+ event = b["ipv4_events"].event(data)
+ print_event(event, AF_INET)
def print_ipv6_event(cpu, data, size):
event = b["ipv6_events"].event(data)
- global start_ts
- if args.time:
- if args.csv:
- print("%s," % strftime("%H:%M:%S"), end="")
- else:
- print("%-8s " % strftime("%H:%M:%S"), end="")
- if args.timestamp:
- if start_ts == 0:
- start_ts = event.ts_us
- delta_s = (float(event.ts_us) - start_ts) / 1000000
- if args.csv:
- print("%.6f," % delta_s, end="")
- else:
- print("%-9.6f " % delta_s, end="")
- print(format_string % (event.skaddr, event.pid, event.task.decode('utf-8', 'replace'),
- "6" if args.wide or args.csv else "",
- inet_ntop(AF_INET6, event.saddr), event.lport,
- inet_ntop(AF_INET6, event.daddr), event.dport,
- tcpstate2str(event.oldstate), tcpstate2str(event.newstate),
- float(event.span_us) / 1000))
- if args.journal:
- journal.send(**journal_fields(event, AF_INET6))
+ print_event(event, AF_INET6)
# initialize BPF
b = BPF(text=bpf_text)
--
2.41.0
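
The fix works because socket.inet_ntop() accepts any bytes-like object of the right length, and a ctypes u32 array (which is how BCC surfaces the new saddr/daddr fields) already exposes such a buffer, so no struct.pack() step is needed. A small illustration, assuming a little-endian host for the sample values:

    import ctypes
    from socket import inet_ntop, AF_INET, AF_INET6
    from struct import pack

    # Old scheme: a plain integer field has to be packed into bytes first.
    saddr_int = 0x0100007f                      # 127.0.0.1 in little-endian host order
    print(inet_ntop(AF_INET, pack("I", saddr_int)))

    # New scheme: u32 arrays are already bytes-like, for IPv4 and IPv6 alike.
    saddr4 = (ctypes.c_uint32 * 1)(0x0100007f)
    saddr6 = (ctypes.c_uint32 * 4)(0, 0, 0, 0x0100007f)  # arbitrary sample address
    print(inet_ntop(AF_INET, saddr4))
    print(inet_ntop(AF_INET6, saddr6))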

View File

@ -0,0 +1,144 @@
From 53b89f35e8970beef55046c1bf035264f110f06d Mon Sep 17 00:00:00 2001
From: hejun01 <hejun01@corp.netease.com>
Date: Thu, 29 Jun 2023 20:24:07 +0800
Subject: [PATCH 1/2] tools/tcpstates: fix context ptr modified error
Introduce a local variable, tcp_newstate, to avoid LLVM optimization
of args->newstate, which would otherwise cause the context ptr args to
be modified.
Split event.ports into lport and dport.
Switch the type of the TCP states from unsigned int to int.
---
tools/tcpstates.py | 47 +++++++++++++++++++++++++---------------------
1 file changed, 26 insertions(+), 21 deletions(-)
diff --git a/tools/tcpstates.py b/tools/tcpstates.py
index 89f3638c..9b2ccfa4 100755
--- a/tools/tcpstates.py
+++ b/tools/tcpstates.py
@@ -82,9 +82,10 @@ struct ipv4_data_t {
u32 daddr;
u64 span_us;
u32 pid;
- u32 ports;
- u32 oldstate;
- u32 newstate;
+ u16 lport;
+ u16 dport;
+ int oldstate;
+ int newstate;
char task[TASK_COMM_LEN];
};
BPF_PERF_OUTPUT(ipv4_events);
@@ -96,9 +97,10 @@ struct ipv6_data_t {
unsigned __int128 daddr;
u64 span_us;
u32 pid;
- u32 ports;
- u32 oldstate;
- u32 newstate;
+ u16 lport;
+ u16 dport;
+ int oldstate;
+ int newstate;
char task[TASK_COMM_LEN];
};
BPF_PERF_OUTPUT(ipv6_events);
@@ -132,6 +134,9 @@ TRACEPOINT_PROBE(sock, inet_sock_set_state)
u16 family = args->family;
FILTER_FAMILY
+ // workaround to avoid llvm optimization which will cause context ptr args modified
+ int tcp_newstate = args->newstate;
+
if (args->family == AF_INET) {
struct ipv4_data_t data4 = {
.span_us = delta_us,
@@ -141,8 +146,8 @@ TRACEPOINT_PROBE(sock, inet_sock_set_state)
data4.ts_us = bpf_ktime_get_ns() / 1000;
__builtin_memcpy(&data4.saddr, args->saddr, sizeof(data4.saddr));
__builtin_memcpy(&data4.daddr, args->daddr, sizeof(data4.daddr));
- // a workaround until data4 compiles with separate lport/dport
- data4.ports = dport + ((0ULL + lport) << 16);
+ data4.lport = lport;
+ data4.dport = dport;
data4.pid = pid;
bpf_get_current_comm(&data4.task, sizeof(data4.task));
@@ -157,14 +162,14 @@ TRACEPOINT_PROBE(sock, inet_sock_set_state)
data6.ts_us = bpf_ktime_get_ns() / 1000;
__builtin_memcpy(&data6.saddr, args->saddr_v6, sizeof(data6.saddr));
__builtin_memcpy(&data6.daddr, args->daddr_v6, sizeof(data6.daddr));
- // a workaround until data6 compiles with separate lport/dport
- data6.ports = dport + ((0ULL + lport) << 16);
+ data6.lport = lport;
+ data6.dport = dport;
data6.pid = pid;
bpf_get_current_comm(&data6.task, sizeof(data6.task));
ipv6_events.perf_submit(args, &data6, sizeof(data6));
}
- if (args->newstate == TCP_CLOSE) {
+ if (tcp_newstate == TCP_CLOSE) {
last.delete(&sk);
} else {
u64 ts = bpf_ktime_get_ns();
@@ -210,8 +215,8 @@ int kprobe__tcp_set_state(struct pt_regs *ctx, struct sock *sk, int state)
data4.ts_us = bpf_ktime_get_ns() / 1000;
data4.saddr = sk->__sk_common.skc_rcv_saddr;
data4.daddr = sk->__sk_common.skc_daddr;
- // a workaround until data4 compiles with separate lport/dport
- data4.ports = dport + ((0ULL + lport) << 16);
+ data4.lport = lport;
+ data4.dport = dport;
data4.pid = pid;
bpf_get_current_comm(&data4.task, sizeof(data4.task));
@@ -228,8 +233,8 @@ int kprobe__tcp_set_state(struct pt_regs *ctx, struct sock *sk, int state)
sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
bpf_probe_read_kernel(&data6.daddr, sizeof(data6.daddr),
sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32);
- // a workaround until data6 compiles with separate lport/dport
- data6.ports = dport + ((0ULL + lport) << 16);
+ data6.lport = lport;
+ data6.dport = dport;
data6.pid = pid;
bpf_get_current_comm(&data6.task, sizeof(data6.task));
ipv6_events.perf_submit(ctx, &data6, sizeof(data6));
@@ -346,9 +351,9 @@ format_string = ("%-16x %-5d %-10.10s %s%-15s %-5d %-15s %-5d %-11s " +
'OBJECT_COMM': event.task.decode('utf-8', 'replace'),
# Custom fields, aka "stuff we sort of made up".
'OBJECT_' + addr_pfx + '_SOURCE_ADDRESS': inet_ntop(addr_family, pack("I", event.saddr)),
- 'OBJECT_TCP_SOURCE_PORT': str(event.ports >> 16),
+ 'OBJECT_TCP_SOURCE_PORT': str(event.lport),
'OBJECT_' + addr_pfx + '_DESTINATION_ADDRESS': inet_ntop(addr_family, pack("I", event.daddr)),
- 'OBJECT_TCP_DESTINATION_PORT': str(event.ports & 0xffff),
+ 'OBJECT_TCP_DESTINATION_PORT': str(event.dport),
'OBJECT_TCP_OLD_STATE': tcpstate2str(event.oldstate),
'OBJECT_TCP_NEW_STATE': tcpstate2str(event.newstate),
'OBJECT_TCP_SPAN_TIME': str(event.span_us)
@@ -386,8 +391,8 @@ format_string = ("%-16x %-5d %-10.10s %s%-15s %-5d %-15s %-5d %-11s " +
print("%-9.6f " % delta_s, end="")
print(format_string % (event.skaddr, event.pid, event.task.decode('utf-8', 'replace'),
"4" if args.wide or args.csv else "",
- inet_ntop(AF_INET, pack("I", event.saddr)), event.ports >> 16,
- inet_ntop(AF_INET, pack("I", event.daddr)), event.ports & 0xffff,
+ inet_ntop(AF_INET, pack("I", event.saddr)), event.lport,
+ inet_ntop(AF_INET, pack("I", event.daddr)), event.dport,
tcpstate2str(event.oldstate), tcpstate2str(event.newstate),
float(event.span_us) / 1000))
if args.journal:
@@ -411,8 +416,8 @@ format_string = ("%-16x %-5d %-10.10s %s%-15s %-5d %-15s %-5d %-11s " +
print("%-9.6f " % delta_s, end="")
print(format_string % (event.skaddr, event.pid, event.task.decode('utf-8', 'replace'),
"6" if args.wide or args.csv else "",
- inet_ntop(AF_INET6, event.saddr), event.ports >> 16,
- inet_ntop(AF_INET6, event.daddr), event.ports & 0xffff,
+ inet_ntop(AF_INET6, event.saddr), event.lport,
+ inet_ntop(AF_INET6, event.daddr), event.dport,
tcpstate2str(event.oldstate), tcpstate2str(event.newstate),
float(event.span_us) / 1000))
if args.journal:
--
2.41.0
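
Besides the context-pointer workaround, the patch retires the single "ports" field that packed both port numbers into one u32, which the journal and print paths then had to unpack with shifts and masks. A quick sanity check of the old encoding being removed, with illustrative values:

    lport, dport = 22, 60396
    ports = dport + (lport << 16)   # the old packed u32 "ports" field
    assert ports >> 16 == lport     # how the source port was recovered
    assert ports & 0xffff == dport  # how the destination port was recovered
    print(lport, dport, hex(ports))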

View File

@ -0,0 +1,53 @@
From 88274e83ca1a61699741d5b1d5499beb64cac753 Mon Sep 17 00:00:00 2001
From: Jerome Marchand <jmarchan@redhat.com>
Date: Mon, 16 Oct 2023 19:41:29 +0200
Subject: [PATCH] tools/trace: don't raise an exception in a ctype callback
To exit the tool when the maximum number of events is reached (-M
option), the tool currently calls exit(), which raises a SystemExit
exception. The handling of exceptions from a ctypes callback doesn't
seem straightforward and depends on the Python version.
This patch avoids the issue altogether by using a global variable
instead.
Closes #3049
Signed-off-by: Jerome Marchand <jmarchan@redhat.com>
---
tools/trace.py | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/tools/trace.py b/tools/trace.py
index 9c7cca71..2aa096fa 100755
--- a/tools/trace.py
+++ b/tools/trace.py
@@ -43,6 +43,7 @@ import sys
build_id_enabled = False
aggregate = False
symcount = {}
+ done = False
@classmethod
def configure(cls, args):
@@ -635,7 +636,7 @@ BPF_PERF_OUTPUT(%s);
if self.aggregate:
self.print_aggregate_events()
sys.stdout.flush()
- exit()
+ Probe.done = True;
def attach(self, bpf, verbose):
if len(self.library) == 0:
@@ -895,7 +896,7 @@ trace -s /lib/x86_64-linux-gnu/libc.so.6,/bin/ping 'p:c:inet_pton' -U
"-" if not all_probes_trivial else ""))
sys.stdout.flush()
- while True:
+ while not Probe.done:
self.bpf.perf_buffer_poll()
def run(self):
--
2.41.0
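
The pattern used here is generic: a callback invoked from C via ctypes should not try to stop the program by raising, because the exception never propagates out of the callback; instead it sets a flag that the polling loop tests. A self-contained sketch of the same idea, with a counter standing in for real perf events (the names below are illustrative, not bcc APIs):

    done = False
    max_events = 5
    seen = 0

    def on_event(cpu, data, size):
        # In bcc this would be registered with open_perf_buffer(); raising here
        # would only produce "Exception ignored in ctypes callback", so we just
        # flip a module-level flag and let the main loop exit on its own.
        global done, seen
        seen += 1
        if seen >= max_events:
            done = True

    while not done:
        on_event(0, None, 0)        # stand-in for b.perf_buffer_poll()
    print("stopped after", seen, "events")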

View File

@ -10,7 +10,7 @@
%endif
%endif
%ifarch x86_64 ppc64 ppc64le aarch64
%ifarch x86_64 ppc64 ppc64le aarch64 s390x
%bcond_without libbpf_tools
%else
%bcond_with libbpf_tools
@ -24,20 +24,21 @@
Name: bcc
Version: 0.26.0
Release: 4%{?dist}
Version: 0.28.0
Release: 5%{?dist}
Summary: BPF Compiler Collection (BCC)
License: ASL 2.0
URL: https://github.com/iovisor/bcc
Source0: %{url}/archive/v%{version}/%{name}-%{version}.tar.gz
Patch0: %%{name}-%%{version}-tools-nfsslower.py-Fix-uninitialized-struct-pad-erro.patch
Patch1: %%{name}-%%{version}-tools-slabratetop-Fix-error-incomplete-definition-of.patch
Patch2: %%{name}-%%{version}-tools-readahead-Fix-Failed-to-attach-BPF-program-ent.patch
Patch3: %%{name}-%%{version}-tools-compactsnoop.py-Fix-raw_tracepoint-Invalid-arg.patch
Patch4: %%{name}-%%{version}-killsnoop-add-missing-s-and-T-options-to-the-synopsi.patch
Patch5: %%{name}-%%{version}-tools-funcslower-fix-printing-of-folded-stacks.patch
Patch6: %%{name}-%%{version}-tools-deadlock-Add-an-option-to-set-the-maximum-numb.patch
Patch7: %%{name}-%%{version}-sync-with-latest-libbpf-repo.patch
Patch0: %%{name}-%%{version}-tools-tcpstates-fix-context-ptr-modified-error.patch
Patch1: %%{name}-%%{version}-tools-tcpstates-fix-IPv6-journal.patch
Patch2: %%{name}-%%{version}-tools-Add-support-for-the-new-block_io_-tracepoints.patch
Patch3: %%{name}-%%{version}-tools-trace-don-t-raise-an-exception-in-a-ctype-call.patch
Patch4: %%{name}-%%{version}-libbpf-tools-add-block_io_-start-done-tracepoints-su.patch
Patch5: %%{name}-%%{version}-libbpf-tools-Add-s390x-support.patch
Patch6: %%{name}-%%{version}-Fixing-pvalloc-memleak-test.patch
Patch7: %%{name}-%%{version}-Skipping-USDT-tests-for-Power-processor.patch
Patch8: %%{name}-%%{version}-Adding-memory-zones-for-Power-server.patch
# Arches will be included as upstream support is added and dependencies are
# satisfied in the respective arches
@ -261,6 +262,29 @@ cp -a libbpf-tools/tmp-install/bin/* %{buildroot}/%{_sbindir}/
%endif
%changelog
* Wed Dec 13 2023 Jerome Marchand <jmarchan@redhat.com> - 0.28.0-5
- Fix libbpf bio tools (RHEL-19368)
- Add S390x support to libbpf-tools (RHEL-16325)
- Power enhancements (RHEL-11477)
* Tue Nov 21 2023 Jerome Marchand <jmarchan@redhat.com> - 0.28.0-4
- Rebuild with LLVM 17 in the side tag (RHEL-10591)
* Tue Nov 21 2023 Jerome Marchand <jmarchan@redhat.com> - 0.28.0-3
- Rebuild with LLVM 17 (RHEL-10591)
* Mon Nov 06 2023 Jerome Marchand <jmarchan@redhat.com> - 0.28.0-2
- Fix trace tool (RHEL-8605)
* Mon Oct 23 2023 Jerome Marchand <jmarchan@redhat.com> - 0.28.0-1
- Rebase to v0.28.0 (RHEL-9976)
- Rebuild with LLVM 17 (RHEL-10591)
- Fix bpf-biosnoop out of bound access (RHEL-8664)
- Fix kvmexit missing VM exit reasons and statistics (RHEL-8702)
- Fix multi-word array type handling (RHEL-8674)
- Fix tcpstates -Y (RHEL-8490)
- Fix bio tools (RHEL-8553)
* Wed Aug 09 2023 Jerome Marchand <jmarchan@redhat.com> - 0.26.0-4
- Fix tcpretrans (rhbz#2226967)