Rebase to the latest version

Rebase bcc to bcc-0.28.0

Also fix bio tools and slabratetop

Resolves: bz#2218440
Resolves: bz#2184370
Resolves: bz#2217179

Signed-off-by: Jerome Marchand <jmarchan@redhat.com>
Jerome Marchand 2023-08-10 15:24:06 +02:00
parent 030dab2cb0
commit 2cce202bb4
9 changed files with 1430 additions and 226979 deletions

.gitignore

@@ -20,3 +20,4 @@
/bcc-0.25.0.tar.gz
/bcc-0.26.0.tar.gz
/bcc-0.27.0.tar.gz
+/bcc-0.28.0.tar.gz

Updating-Powerpc-vmlinux-headers-from-Linux-kernel-6.patch (file diff suppressed because it is too large)

Use-bpf_obj_get_info_by_fd-instead-of-bpf_btf_get_in.patch

@@ -0,0 +1,31 @@
From 0973fd70c1c50e57a3db0b09e239b1d1fd3f1c55 Mon Sep 17 00:00:00 2001
From: Jerome Marchand <jmarchan@redhat.com>
Date: Fri, 21 Jul 2023 16:10:18 +0200
Subject: [PATCH] Use bpf_obj_get_info_by_fd() instead of
bpf_btf_get_info_by_fd()
The libbpf version in Rawhide doesn't provide the typed
bpf_*_get_info_by_fd() helpers yet.
---
src/cc/libbpf.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/cc/libbpf.c b/src/cc/libbpf.c
index 360fd81d..a3e34da2 100644
--- a/src/cc/libbpf.c
+++ b/src/cc/libbpf.c
@@ -727,9 +727,9 @@ static int find_btf_id(const char *module_name, const char *func_name,
info.name = ptr_to_u64(name);
info.name_len = sizeof(name);
- err = bpf_btf_get_info_by_fd(fd, &info, &len);
+ err = bpf_obj_get_info_by_fd(fd, &info, &len);
if (err) {
- fprintf(stderr, "bpf_btf_get_info_by_fd failed: %d\n", err);
+ fprintf(stderr, "bpf_obj_get_info_by_fd failed: %d\n", err);
goto err_out;
}
--
2.41.0
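
For readers comparing the two APIs: both calls fill a struct bpf_btf_info for a BTF object fd, and the typed wrapper only adds compile-time type checking over the generic accessor. A minimal sketch, assuming a valid BTF fd and installed libbpf headers; get_btf_name() is an illustrative helper, not code from the patch, and the explicit cast stands in for the patch's ptr_to_u64():

#include <string.h>
#include <bpf/bpf.h>    /* bpf_obj_get_info_by_fd() */
#include <linux/bpf.h>  /* struct bpf_btf_info */

/* Fetch the name of a BTF object by fd, using only the generic accessor
 * that every libbpf release provides. */
static int get_btf_name(int btf_fd, char *name, __u32 name_sz)
{
	struct bpf_btf_info info;
	__u32 len = sizeof(info);

	memset(&info, 0, sizeof(info));
	info.name = (__u64)(unsigned long)name;
	info.name_len = name_sz;

	/* On libbpf >= 1.2 the typed form is equivalent:
	 *   return bpf_btf_get_info_by_fd(btf_fd, &info, &len);
	 */
	return bpf_obj_get_info_by_fd(btf_fd, &info, &len);
}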

bcc.spec

@@ -24,14 +24,16 @@
Name: bcc
-Version: 0.27.0
-Release: 4%{?dist}
+Version: 0.28.0
+Release: 1%{?dist}
Summary: BPF Compiler Collection (BCC)
License: Apache-2.0
URL: https://github.com/iovisor/bcc
Source0: %{url}/archive/v%{version}/%{name}-%{version}.tar.gz
-Patch0: Updating-Powerpc-vmlinux-headers-from-Linux-kernel-6.patch
-Patch1: sync-with-latest-libbpf-repo.patch
+Patch0: Use-bpf_obj_get_info_by_fd-instead-of-bpf_btf_get_in.patch
+Patch1: libbpf-tools-add-block_io_-start-done-tracepoints-su.patch
+Patch2: tools-Add-support-for-the-new-block_io_-tracepoints.patch
+Patch3: tool-slabratetop-add-definition-of-freelist_aba_t.patch
# Arches will be included as upstream support is added and dependencies are
# satisfied in the respective arches
@@ -240,6 +242,11 @@ cp -a libbpf-tools/tmp-install/bin/* %{buildroot}/%{_sbindir}/
%endif
%changelog
+* Thu Aug 10 2023 Jerome Marchand <jmarchan@redhat.com> - 0.28.0-1
+- Rebase to the latest release version (#2218440)
+- Fix bio tools (#2184370)
+- Fix slabratetop
* Wed Jul 19 2023 Fedora Release Engineering <releng@fedoraproject.org> - 0.27.0-4
- Rebuilt for https://fedoraproject.org/wiki/Fedora_39_Mass_Rebuild

libbpf-tools-add-block_io_-start-done-tracepoints-su.patch

@@ -0,0 +1,476 @@
From e1dfbe2d09583205acca1d1b5b09caefb460f2fd Mon Sep 17 00:00:00 2001
From: mickey_zhu <mickey_zhu@realsil.com.cn>
Date: Tue, 27 Jun 2023 16:32:44 +0800
Subject: [PATCH 1/2] libbpf-tools: add block_io_{start,done} tracepoints
support to bio tools
Some bio tools fail to kprobe blk_account_io_{start,done} after v5.17,
because those functions became inlined, see [0]. To fix this issue, the
tracepoints block_io_{start,done} were introduced in the kernel, see [1].
Update the related bio tools to support the new tracepoints, and also
simplify how the programs are attached.
[0] Kernel commit 450b7879e345 (block: move blk_account_io_{start,done} to blk-mq.c)
[1] Kernel commit 5a80bd075f3b (block: introduce block_io_start/block_io_done tracepoints)
Change-Id: I62b957abd7ce2901eb114bd57c78938e4f083e4d
Signed-off-by: Mickey Zhu <mickey_zhu@realsil.com.cn>
---
libbpf-tools/biosnoop.bpf.c | 9 ++++
libbpf-tools/biosnoop.c | 78 +++++++++++++--------------------
libbpf-tools/biostacks.bpf.c | 46 +++++++++++++------
libbpf-tools/biostacks.c | 85 +++++++++++++++++++++---------------
libbpf-tools/biotop.bpf.c | 44 +++++++++++++++++--
libbpf-tools/biotop.c | 59 ++++++++++++++++---------
6 files changed, 199 insertions(+), 122 deletions(-)
diff --git a/libbpf-tools/biosnoop.bpf.c b/libbpf-tools/biosnoop.bpf.c
index b791555f..fcc5c5ce 100644
--- a/libbpf-tools/biosnoop.bpf.c
+++ b/libbpf-tools/biosnoop.bpf.c
@@ -76,6 +76,15 @@ int BPF_PROG(blk_account_io_start, struct request *rq)
return trace_pid(rq);
}
+SEC("tp_btf/block_io_start")
+int BPF_PROG(block_io_start, struct request *rq)
+{
+ if (filter_cg && !bpf_current_task_under_cgroup(&cgroup_map, 0))
+ return 0;
+
+ return trace_pid(rq);
+}
+
SEC("kprobe/blk_account_io_merge_bio")
int BPF_KPROBE(blk_account_io_merge_bio, struct request *rq)
{
diff --git a/libbpf-tools/biosnoop.c b/libbpf-tools/biosnoop.c
index 21773729..f9468900 100644
--- a/libbpf-tools/biosnoop.c
+++ b/libbpf-tools/biosnoop.c
@@ -212,6 +212,16 @@ void handle_lost_events(void *ctx, int cpu, __u64 lost_cnt)
fprintf(stderr, "lost %llu events on CPU #%d\n", lost_cnt, cpu);
}
+static void blk_account_io_set_attach_target(struct biosnoop_bpf *obj)
+{
+ if (fentry_can_attach("blk_account_io_start", NULL))
+ bpf_program__set_attach_target(obj->progs.blk_account_io_start,
+ 0, "blk_account_io_start");
+ else
+ bpf_program__set_attach_target(obj->progs.blk_account_io_start,
+ 0, "__blk_account_io_start");
+}
+
int main(int argc, char **argv)
{
const struct partition *partition;
@@ -260,12 +270,23 @@ int main(int argc, char **argv)
obj->rodata->filter_cg = env.cg;
obj->rodata->min_ns = env.min_lat_ms * 1000000;
- if (fentry_can_attach("blk_account_io_start", NULL))
- bpf_program__set_attach_target(obj->progs.blk_account_io_start, 0,
- "blk_account_io_start");
- else
- bpf_program__set_attach_target(obj->progs.blk_account_io_start, 0,
- "__blk_account_io_start");
+ if (tracepoint_exists("block", "block_io_start"))
+ bpf_program__set_autoload(obj->progs.blk_account_io_start, false);
+ else {
+ bpf_program__set_autoload(obj->progs.block_io_start, false);
+ blk_account_io_set_attach_target(obj);
+ }
+
+ ksyms = ksyms__load();
+ if (!ksyms) {
+ fprintf(stderr, "failed to load kallsyms\n");
+ goto cleanup;
+ }
+ if (!ksyms__get_symbol(ksyms, "blk_account_io_merge_bio"))
+ bpf_program__set_autoload(obj->progs.blk_account_io_merge_bio, false);
+
+ if (!env.queued)
+ bpf_program__set_autoload(obj->progs.block_rq_insert, false);
err = biosnoop_bpf__load(obj);
if (err) {
@@ -288,48 +309,9 @@ int main(int argc, char **argv)
}
}
- obj->links.blk_account_io_start = bpf_program__attach(obj->progs.blk_account_io_start);
- if (!obj->links.blk_account_io_start) {
- err = -errno;
- fprintf(stderr, "failed to attach blk_account_io_start: %s\n",
- strerror(-err));
- goto cleanup;
- }
- ksyms = ksyms__load();
- if (!ksyms) {
- err = -ENOMEM;
- fprintf(stderr, "failed to load kallsyms\n");
- goto cleanup;
- }
- if (ksyms__get_symbol(ksyms, "blk_account_io_merge_bio")) {
- obj->links.blk_account_io_merge_bio =
- bpf_program__attach(obj->progs.blk_account_io_merge_bio);
- if (!obj->links.blk_account_io_merge_bio) {
- err = -errno;
- fprintf(stderr, "failed to attach blk_account_io_merge_bio: %s\n",
- strerror(-err));
- goto cleanup;
- }
- }
- if (env.queued) {
- obj->links.block_rq_insert =
- bpf_program__attach(obj->progs.block_rq_insert);
- if (!obj->links.block_rq_insert) {
- err = -errno;
- fprintf(stderr, "failed to attach block_rq_insert: %s\n", strerror(-err));
- goto cleanup;
- }
- }
- obj->links.block_rq_issue = bpf_program__attach(obj->progs.block_rq_issue);
- if (!obj->links.block_rq_issue) {
- err = -errno;
- fprintf(stderr, "failed to attach block_rq_issue: %s\n", strerror(-err));
- goto cleanup;
- }
- obj->links.block_rq_complete = bpf_program__attach(obj->progs.block_rq_complete);
- if (!obj->links.block_rq_complete) {
- err = -errno;
- fprintf(stderr, "failed to attach block_rq_complete: %s\n", strerror(-err));
+ err = biosnoop_bpf__attach(obj);
+ if (err) {
+ fprintf(stderr, "failed to attach BPF programs: %d\n", err);
goto cleanup;
}
diff --git a/libbpf-tools/biostacks.bpf.c b/libbpf-tools/biostacks.bpf.c
index c3950910..0ca69880 100644
--- a/libbpf-tools/biostacks.bpf.c
+++ b/libbpf-tools/biostacks.bpf.c
@@ -67,20 +67,8 @@ int trace_start(void *ctx, struct request *rq, bool merge_bio)
return 0;
}
-SEC("fentry/blk_account_io_start")
-int BPF_PROG(blk_account_io_start, struct request *rq)
-{
- return trace_start(ctx, rq, false);
-}
-
-SEC("kprobe/blk_account_io_merge_bio")
-int BPF_KPROBE(blk_account_io_merge_bio, struct request *rq)
-{
- return trace_start(ctx, rq, true);
-}
-
-SEC("fentry/blk_account_io_done")
-int BPF_PROG(blk_account_io_done, struct request *rq)
+static __always_inline
+int trace_done(void *ctx, struct request *rq)
{
u64 slot, ts = bpf_ktime_get_ns();
struct internal_rqinfo *i_rqinfop;
@@ -110,4 +98,34 @@ int BPF_PROG(blk_account_io_done, struct request *rq)
return 0;
}
+SEC("kprobe/blk_account_io_merge_bio")
+int BPF_KPROBE(blk_account_io_merge_bio, struct request *rq)
+{
+ return trace_start(ctx, rq, true);
+}
+
+SEC("fentry/blk_account_io_start")
+int BPF_PROG(blk_account_io_start, struct request *rq)
+{
+ return trace_start(ctx, rq, false);
+}
+
+SEC("fentry/blk_account_io_done")
+int BPF_PROG(blk_account_io_done, struct request *rq)
+{
+ return trace_done(ctx, rq);
+}
+
+SEC("tp_btf/block_io_start")
+int BPF_PROG(block_io_start, struct request *rq)
+{
+ return trace_start(ctx, rq, false);
+}
+
+SEC("tp_btf/block_io_done")
+int BPF_PROG(block_io_done, struct request *rq)
+{
+ return trace_done(ctx, rq);
+}
+
char LICENSE[] SEC("license") = "GPL";
diff --git a/libbpf-tools/biostacks.c b/libbpf-tools/biostacks.c
index e1878d1f..e7875f76 100644
--- a/libbpf-tools/biostacks.c
+++ b/libbpf-tools/biostacks.c
@@ -128,6 +128,39 @@ void print_map(struct ksyms *ksyms, struct partitions *partitions, int fd)
return;
}
+static bool has_block_io_tracepoints(void)
+{
+ return tracepoint_exists("block", "block_io_start") &&
+ tracepoint_exists("block", "block_io_done");
+}
+
+static void disable_block_io_tracepoints(struct biostacks_bpf *obj)
+{
+ bpf_program__set_autoload(obj->progs.block_io_start, false);
+ bpf_program__set_autoload(obj->progs.block_io_done, false);
+}
+
+static void disable_blk_account_io_fentry(struct biostacks_bpf *obj)
+{
+ bpf_program__set_autoload(obj->progs.blk_account_io_start, false);
+ bpf_program__set_autoload(obj->progs.blk_account_io_done, false);
+}
+
+static void blk_account_io_set_attach_target(struct biostacks_bpf *obj)
+{
+ if (fentry_can_attach("blk_account_io_start", NULL)) {
+ bpf_program__set_attach_target(obj->progs.blk_account_io_start,
+ 0, "blk_account_io_start");
+ bpf_program__set_attach_target(obj->progs.blk_account_io_done,
+ 0, "blk_account_io_done");
+ } else {
+ bpf_program__set_attach_target(obj->progs.blk_account_io_start,
+ 0, "__blk_account_io_start");
+ bpf_program__set_attach_target(obj->progs.blk_account_io_done,
+ 0, "__blk_account_io_done");
+ }
+}
+
int main(int argc, char **argv)
{
struct partitions *partitions = NULL;
@@ -172,50 +205,30 @@ int main(int argc, char **argv)
obj->rodata->targ_ms = env.milliseconds;
- if (fentry_can_attach("blk_account_io_start", NULL)) {
- bpf_program__set_attach_target(obj->progs.blk_account_io_start, 0,
- "blk_account_io_start");
- bpf_program__set_attach_target(obj->progs.blk_account_io_done, 0,
- "blk_account_io_done");
- } else {
- bpf_program__set_attach_target(obj->progs.blk_account_io_start, 0,
- "__blk_account_io_start");
- bpf_program__set_attach_target(obj->progs.blk_account_io_done, 0,
- "__blk_account_io_done");
- }
-
- err = biostacks_bpf__load(obj);
- if (err) {
- fprintf(stderr, "failed to load BPF object: %d\n", err);
- goto cleanup;
+ if (has_block_io_tracepoints())
+ disable_blk_account_io_fentry(obj);
+ else {
+ disable_block_io_tracepoints(obj);
+ blk_account_io_set_attach_target(obj);
}
- obj->links.blk_account_io_start = bpf_program__attach(obj->progs.blk_account_io_start);
- if (!obj->links.blk_account_io_start) {
- err = -errno;
- fprintf(stderr, "failed to attach blk_account_io_start: %s\n", strerror(-err));
- goto cleanup;
- }
ksyms = ksyms__load();
if (!ksyms) {
fprintf(stderr, "failed to load kallsyms\n");
goto cleanup;
}
- if (ksyms__get_symbol(ksyms, "blk_account_io_merge_bio")) {
- obj->links.blk_account_io_merge_bio =
- bpf_program__attach(obj->progs.blk_account_io_merge_bio);
- if (!obj->links.blk_account_io_merge_bio) {
- err = -errno;
- fprintf(stderr, "failed to attach blk_account_io_merge_bio: %s\n",
- strerror(-err));
- goto cleanup;
- }
+ if (!ksyms__get_symbol(ksyms, "blk_account_io_merge_bio"))
+ bpf_program__set_autoload(obj->progs.blk_account_io_merge_bio, false);
+
+ err = biostacks_bpf__load(obj);
+ if (err) {
+ fprintf(stderr, "failed to load BPF object: %d\n", err);
+ goto cleanup;
}
- obj->links.blk_account_io_done = bpf_program__attach(obj->progs.blk_account_io_done);
- if (!obj->links.blk_account_io_done) {
- err = -errno;
- fprintf(stderr, "failed to attach blk_account_io_done: %s\n",
- strerror(-err));
+
+ err = biostacks_bpf__attach(obj);
+ if (err) {
+ fprintf(stderr, "failed to attach BPF programs: %d\n", err);
goto cleanup;
}
diff --git a/libbpf-tools/biotop.bpf.c b/libbpf-tools/biotop.bpf.c
index 226e32d3..07631378 100644
--- a/libbpf-tools/biotop.bpf.c
+++ b/libbpf-tools/biotop.bpf.c
@@ -30,8 +30,8 @@ struct {
__type(value, struct val_t);
} counts SEC(".maps");
-SEC("kprobe")
-int BPF_KPROBE(blk_account_io_start, struct request *req)
+static __always_inline
+int trace_start(struct request *req)
{
struct who_t who = {};
@@ -56,8 +56,8 @@ int BPF_KPROBE(blk_mq_start_request, struct request *req)
return 0;
}
-SEC("kprobe")
-int BPF_KPROBE(blk_account_io_done, struct request *req, u64 now)
+static __always_inline
+int trace_done(struct request *req)
{
struct val_t *valp, zero = {};
struct info_t info = {};
@@ -103,4 +103,40 @@ int BPF_KPROBE(blk_account_io_done, struct request *req, u64 now)
return 0;
}
+SEC("kprobe/blk_account_io_start")
+int BPF_KPROBE(blk_account_io_start, struct request *req)
+{
+ return trace_start(req);
+}
+
+SEC("kprobe/blk_account_io_done")
+int BPF_KPROBE(blk_account_io_done, struct request *req)
+{
+ return trace_done(req);
+}
+
+SEC("kprobe/__blk_account_io_start")
+int BPF_KPROBE(__blk_account_io_start, struct request *req)
+{
+ return trace_start(req);
+}
+
+SEC("kprobe/__blk_account_io_done")
+int BPF_KPROBE(__blk_account_io_done, struct request *req)
+{
+ return trace_done(req);
+}
+
+SEC("tp_btf/block_io_start")
+int BPF_PROG(block_io_start, struct request *req)
+{
+ return trace_start(req);
+}
+
+SEC("tp_btf/block_io_done")
+int BPF_PROG(block_io_done, struct request *req)
+{
+ return trace_done(req);
+}
+
char LICENSE[] SEC("license") = "GPL";
diff --git a/libbpf-tools/biotop.c b/libbpf-tools/biotop.c
index 75484281..5b3a7cf3 100644
--- a/libbpf-tools/biotop.c
+++ b/libbpf-tools/biotop.c
@@ -354,6 +354,38 @@ static int print_stat(struct biotop_bpf *obj)
return err;
}
+static bool has_block_io_tracepoints(void)
+{
+ return tracepoint_exists("block", "block_io_start") &&
+ tracepoint_exists("block", "block_io_done");
+}
+
+static void disable_block_io_tracepoints(struct biotop_bpf *obj)
+{
+ bpf_program__set_autoload(obj->progs.block_io_start, false);
+ bpf_program__set_autoload(obj->progs.block_io_done, false);
+}
+
+static void disable_blk_account_io_kprobes(struct biotop_bpf *obj)
+{
+ bpf_program__set_autoload(obj->progs.blk_account_io_start, false);
+ bpf_program__set_autoload(obj->progs.blk_account_io_done, false);
+ bpf_program__set_autoload(obj->progs.__blk_account_io_start, false);
+ bpf_program__set_autoload(obj->progs.__blk_account_io_done, false);
+}
+
+static void blk_account_io_set_autoload(struct biotop_bpf *obj,
+ struct ksyms *ksyms)
+{
+ if (!ksyms__get_symbol(ksyms, "__blk_account_io_start")) {
+ bpf_program__set_autoload(obj->progs.__blk_account_io_start, false);
+ bpf_program__set_autoload(obj->progs.__blk_account_io_done, false);
+ } else {
+ bpf_program__set_autoload(obj->progs.blk_account_io_start, false);
+ bpf_program__set_autoload(obj->progs.blk_account_io_done, false);
+ }
+}
+
int main(int argc, char **argv)
{
static const struct argp argp = {
@@ -386,32 +418,19 @@ int main(int argc, char **argv)
goto cleanup;
}
+ if (has_block_io_tracepoints())
+ disable_blk_account_io_kprobes(obj);
+ else {
+ disable_block_io_tracepoints(obj);
+ blk_account_io_set_autoload(obj, ksyms);
+ }
+
err = biotop_bpf__load(obj);
if (err) {
warn("failed to load BPF object: %d\n", err);
goto cleanup;
}
- if (ksyms__get_symbol(ksyms, "__blk_account_io_start"))
- obj->links.blk_account_io_start = bpf_program__attach_kprobe(obj->progs.blk_account_io_start, false, "__blk_account_io_start");
- else
- obj->links.blk_account_io_start = bpf_program__attach_kprobe(obj->progs.blk_account_io_start, false, "blk_account_io_start");
-
- if (!obj->links.blk_account_io_start) {
- warn("failed to load attach blk_account_io_start\n");
- goto cleanup;
- }
-
- if (ksyms__get_symbol(ksyms, "__blk_account_io_done"))
- obj->links.blk_account_io_done = bpf_program__attach_kprobe(obj->progs.blk_account_io_done, false, "__blk_account_io_done");
- else
- obj->links.blk_account_io_done = bpf_program__attach_kprobe(obj->progs.blk_account_io_done, false, "blk_account_io_done");
-
- if (!obj->links.blk_account_io_done) {
- warn("failed to load attach blk_account_io_done\n");
- goto cleanup;
- }
-
err = biotop_bpf__attach(obj);
if (err) {
warn("failed to attach BPF programs: %d\n", err);
--
2.41.0
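
The same selection logic recurs across biosnoop, biostacks and biotop above. Reduced to its core, and assuming the generated biosnoop skeleton plus the tracepoint_exists()/fentry_can_attach() helpers from libbpf-tools/trace_helpers.h, the pattern is roughly this sketch:

/* Prefer the block_io_* tracepoints when the kernel has them; otherwise
 * fall back to fentry on blk_account_io_start, which became
 * __blk_account_io_start when the accounting helpers were inlined in
 * v5.17. Programs must be deselected with set_autoload() before the
 * skeleton's __load() call. */
static void choose_bio_probes(struct biosnoop_bpf *obj)
{
	if (tracepoint_exists("block", "block_io_start")) {
		bpf_program__set_autoload(obj->progs.blk_account_io_start, false);
	} else {
		bpf_program__set_autoload(obj->progs.block_io_start, false);
		bpf_program__set_attach_target(obj->progs.blk_account_io_start, 0,
			fentry_can_attach("blk_account_io_start", NULL) ?
				"blk_account_io_start" : "__blk_account_io_start");
	}
}

With every program resolved before load, the per-program bpf_program__attach() calls collapse into a single biosnoop_bpf__attach(), which is what most of the deletions in this patch amount to.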

sources

@@ -1 +1 @@
-SHA512 (bcc-0.27.0.tar.gz) = 16df9f42444bcac3be967a43ba4183349b71e75c370957f518977051968277f9ffa8a5e3dfdb2f3bdc9b6b59b575ed82e694f5504ebc74bc0ca4cf3a4b753bfd
+SHA512 (bcc-0.28.0.tar.gz) = 792ce93dba64b1f87390b2602dcaeba04ac8b2863652b06eb9a907b93bc6137a944b856cc6fa9c7a38671c89814740967561ca4f3b29c267babca7dc5e78aa02

sync-with-latest-libbpf-repo.patch

@ -1,364 +0,0 @@
From 70e879960428f5726067db93f7c742c8b39d4886 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Wed, 19 Apr 2023 23:46:53 -0700
Subject: [PATCH] sync with latest libbpf repo
Sync libbpf submodule up to the following commit:
44b0bc9ad70a ci: Regenerate latest vmlinux.h for old kernel CI tests.
Signed-off-by: Yonghong Song <yhs@fb.com>
---
src/cc/compat/linux/virtual_bpf.h | 141 ++++++++++++++++++++++++++----
src/cc/export/helpers.h | 6 +-
src/cc/libbpf | 2 +-
3 files changed, 127 insertions(+), 22 deletions(-)
diff --git a/src/cc/compat/linux/virtual_bpf.h b/src/cc/compat/linux/virtual_bpf.h
index be3a4627..a182123e 100644
--- a/src/cc/compat/linux/virtual_bpf.h
+++ b/src/cc/compat/linux/virtual_bpf.h
@@ -1034,6 +1034,7 @@ enum bpf_attach_type {
BPF_PERF_EVENT,
BPF_TRACE_KPROBE_MULTI,
BPF_LSM_CGROUP,
+ BPF_STRUCT_OPS,
__MAX_BPF_ATTACH_TYPE
};
@@ -1109,7 +1110,7 @@ enum bpf_link_type {
*/
#define BPF_F_STRICT_ALIGNMENT (1U << 0)
-/* If BPF_F_ANY_ALIGNMENT is used in BPF_PROF_LOAD command, the
+/* If BPF_F_ANY_ALIGNMENT is used in BPF_PROG_LOAD command, the
* verifier will allow any alignment whatsoever. On platforms
* with strict alignment requirements for loads ands stores (such
* as sparc and mips) the verifier validates that all loads and
@@ -1157,6 +1158,11 @@ enum bpf_link_type {
*/
#define BPF_F_XDP_HAS_FRAGS (1U << 5)
+/* If BPF_F_XDP_DEV_BOUND_ONLY is used in BPF_PROG_LOAD command, the loaded
+ * program becomes device-bound but can access XDP metadata.
+ */
+#define BPF_F_XDP_DEV_BOUND_ONLY (1U << 6)
+
/* link_create.kprobe_multi.flags used in LINK_CREATE command for
* BPF_TRACE_KPROBE_MULTI attach type to create return probe.
*/
@@ -1262,6 +1268,9 @@ enum {
/* Create a map that is suitable to be an inner map with dynamic max entries */
BPF_F_INNER_MAP = (1U << 12),
+
+/* Create a map that will be registered/unregesitered by the backed bpf_link */
+ BPF_F_LINK = (1U << 13),
};
/* Flags for BPF_PROG_QUERY. */
@@ -1399,6 +1408,11 @@ union bpf_attr {
__aligned_u64 fd_array; /* array of FDs */
__aligned_u64 core_relos;
__u32 core_relo_rec_size; /* sizeof(struct bpf_core_relo) */
+ /* output: actual total log contents size (including termintaing zero).
+ * It could be both larger than original log_size (if log was
+ * truncated), or smaller (if log buffer wasn't filled completely).
+ */
+ __u32 log_true_size;
};
struct { /* anonymous struct used by BPF_OBJ_* commands */
@@ -1484,6 +1498,11 @@ union bpf_attr {
__u32 btf_size;
__u32 btf_log_size;
__u32 btf_log_level;
+ /* output: actual total log contents size (including termintaing zero).
+ * It could be both larger than original log_size (if log was
+ * truncated), or smaller (if log buffer wasn't filled completely).
+ */
+ __u32 btf_log_true_size;
};
struct {
@@ -1503,7 +1522,10 @@ union bpf_attr {
} task_fd_query;
struct { /* struct used by BPF_LINK_CREATE command */
- __u32 prog_fd; /* eBPF program to attach */
+ union {
+ __u32 prog_fd; /* eBPF program to attach */
+ __u32 map_fd; /* struct_ops to attach */
+ };
union {
__u32 target_fd; /* object to attach to */
__u32 target_ifindex; /* target ifindex */
@@ -1544,12 +1566,23 @@ union bpf_attr {
struct { /* struct used by BPF_LINK_UPDATE command */
__u32 link_fd; /* link fd */
- /* new program fd to update link with */
- __u32 new_prog_fd;
+ union {
+ /* new program fd to update link with */
+ __u32 new_prog_fd;
+ /* new struct_ops map fd to update link with */
+ __u32 new_map_fd;
+ };
__u32 flags; /* extra flags */
- /* expected link's program fd; is specified only if
- * BPF_F_REPLACE flag is set in flags */
- __u32 old_prog_fd;
+ union {
+ /* expected link's program fd; is specified only if
+ * BPF_F_REPLACE flag is set in flags.
+ */
+ __u32 old_prog_fd;
+ /* expected link's map fd; is specified only
+ * if BPF_F_REPLACE flag is set.
+ */
+ __u32 old_map_fd;
+ };
} link_update;
struct {
@@ -1643,17 +1676,17 @@ union bpf_attr {
* Description
* This helper is a "printk()-like" facility for debugging. It
* prints a message defined by format *fmt* (of size *fmt_size*)
- * to file *\/sys/kernel/debug/tracing/trace* from DebugFS, if
+ * to file *\/sys/kernel/tracing/trace* from TraceFS, if
* available. It can take up to three additional **u64**
* arguments (as an eBPF helpers, the total number of arguments is
* limited to five).
*
* Each time the helper is called, it appends a line to the trace.
- * Lines are discarded while *\/sys/kernel/debug/tracing/trace* is
- * open, use *\/sys/kernel/debug/tracing/trace_pipe* to avoid this.
+ * Lines are discarded while *\/sys/kernel/tracing/trace* is
+ * open, use *\/sys/kernel/tracing/trace_pipe* to avoid this.
* The format of the trace is customizable, and the exact output
* one will get depends on the options set in
- * *\/sys/kernel/debug/tracing/trace_options* (see also the
+ * *\/sys/kernel/tracing/trace_options* (see also the
* *README* file under the same directory). However, it usually
* defaults to something like:
*
@@ -2002,6 +2035,9 @@ union bpf_attr {
* sending the packet. This flag was added for GRE
* encapsulation, but might be used with other protocols
* as well in the future.
+ * **BPF_F_NO_TUNNEL_KEY**
+ * Add a flag to tunnel metadata indicating that no tunnel
+ * key should be set in the resulting tunnel header.
*
* Here is a typical usage on the transmit path:
*
@@ -2645,6 +2681,11 @@ union bpf_attr {
* Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the
* L2 type as Ethernet.
*
+ * * **BPF_F_ADJ_ROOM_DECAP_L3_IPV4**,
+ * **BPF_F_ADJ_ROOM_DECAP_L3_IPV6**:
+ * Indicate the new IP header version after decapsulating the outer
+ * IP header. Used when the inner and outer IP versions are different.
+ *
* A call to this helper is susceptible to change the underlying
* packet buffer. Therefore, at load time, all checks on pointers
* previously done by the verifier are invalidated and must be
@@ -2789,7 +2830,7 @@ union bpf_attr {
*
* long bpf_perf_prog_read_value(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, u32 buf_size)
* Description
- * For en eBPF program attached to a perf event, retrieve the
+ * For an eBPF program attached to a perf event, retrieve the
* value of the event counter associated to *ctx* and store it in
* the structure pointed by *buf* and of size *buf_size*. Enabled
* and running times are also stored in the structure (see
@@ -3122,6 +3163,11 @@ union bpf_attr {
* **BPF_FIB_LOOKUP_OUTPUT**
* Perform lookup from an egress perspective (default is
* ingress).
+ * **BPF_FIB_LOOKUP_SKIP_NEIGH**
+ * Skip the neighbour table lookup. *params*->dmac
+ * and *params*->smac will not be set as output. A common
+ * use case is to call **bpf_redirect_neigh**\ () after
+ * doing **bpf_fib_lookup**\ ().
*
* *ctx* is either **struct xdp_md** for XDP programs or
* **struct sk_buff** tc cls_act programs.
@@ -4952,6 +4998,12 @@ union bpf_attr {
* different maps if key/value layout matches across maps.
* Every bpf_timer_set_callback() can have different callback_fn.
*
+ * *flags* can be one of:
+ *
+ * **BPF_F_TIMER_ABS**
+ * Start the timer in absolute expire value instead of the
+ * default relative one.
+ *
* Return
* 0 on success.
* **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier
@@ -5294,7 +5346,7 @@ union bpf_attr {
* Return
* Nothing. Always succeeds.
*
- * long bpf_dynptr_read(void *dst, u32 len, struct bpf_dynptr *src, u32 offset, u64 flags)
+ * long bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr *src, u32 offset, u64 flags)
* Description
* Read *len* bytes from *src* into *dst*, starting from *offset*
* into *src*.
@@ -5304,22 +5356,36 @@ union bpf_attr {
* of *src*'s data, -EINVAL if *src* is an invalid dynptr or if
* *flags* is not 0.
*
- * long bpf_dynptr_write(struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags)
+ * long bpf_dynptr_write(const struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags)
* Description
* Write *len* bytes from *src* into *dst*, starting from *offset*
* into *dst*.
- * *flags* is currently unused.
+ *
+ * *flags* must be 0 except for skb-type dynptrs.
+ *
+ * For skb-type dynptrs:
+ * * All data slices of the dynptr are automatically
+ * invalidated after **bpf_dynptr_write**\ (). This is
+ * because writing may pull the skb and change the
+ * underlying packet buffer.
+ *
+ * * For *flags*, please see the flags accepted by
+ * **bpf_skb_store_bytes**\ ().
* Return
* 0 on success, -E2BIG if *offset* + *len* exceeds the length
* of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst*
- * is a read-only dynptr or if *flags* is not 0.
+ * is a read-only dynptr or if *flags* is not correct. For skb-type dynptrs,
+ * other errors correspond to errors returned by **bpf_skb_store_bytes**\ ().
*
- * void *bpf_dynptr_data(struct bpf_dynptr *ptr, u32 offset, u32 len)
+ * void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u32 offset, u32 len)
* Description
* Get a pointer to the underlying dynptr data.
*
* *len* must be a statically known value. The returned data slice
* is invalidated whenever the dynptr is invalidated.
+ *
+ * skb and xdp type dynptrs may not use bpf_dynptr_data. They should
+ * instead use bpf_dynptr_slice and bpf_dynptr_slice_rdwr.
* Return
* Pointer to the underlying dynptr data, NULL if the dynptr is
* read-only, if the dynptr is invalid, or if the offset and length
@@ -5415,7 +5481,7 @@ union bpf_attr {
* Drain samples from the specified user ring buffer, and invoke
* the provided callback for each such sample:
*
- * long (\*callback_fn)(struct bpf_dynptr \*dynptr, void \*ctx);
+ * long (\*callback_fn)(const struct bpf_dynptr \*dynptr, void \*ctx);
*
* If **callback_fn** returns 0, the helper will continue to try
* and drain the next sample, up to a maximum of
@@ -5765,6 +5831,7 @@ enum {
BPF_F_ZERO_CSUM_TX = (1ULL << 1),
BPF_F_DONT_FRAGMENT = (1ULL << 2),
BPF_F_SEQ_NUMBER = (1ULL << 3),
+ BPF_F_NO_TUNNEL_KEY = (1ULL << 4),
};
/* BPF_FUNC_skb_get_tunnel_key flags. */
@@ -5804,6 +5871,8 @@ enum {
BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4),
BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5),
BPF_F_ADJ_ROOM_ENCAP_L2_ETH = (1ULL << 6),
+ BPF_F_ADJ_ROOM_DECAP_L3_IPV4 = (1ULL << 7),
+ BPF_F_ADJ_ROOM_DECAP_L3_IPV6 = (1ULL << 8),
};
enum {
@@ -6339,6 +6408,9 @@ struct bpf_link_info {
struct {
__u32 ifindex;
} xdp;
+ struct {
+ __u32 map_id;
+ } struct_ops;
};
} __attribute__((aligned(8)));
@@ -6735,6 +6807,7 @@ struct bpf_raw_tracepoint_args {
enum {
BPF_FIB_LOOKUP_DIRECT = (1U << 0),
BPF_FIB_LOOKUP_OUTPUT = (1U << 1),
+ BPF_FIB_LOOKUP_SKIP_NEIGH = (1U << 2),
};
enum {
@@ -6902,6 +6975,21 @@ struct bpf_list_node {
__u64 :64;
} __attribute__((aligned(8)));
+struct bpf_rb_root {
+ __u64 :64;
+ __u64 :64;
+} __attribute__((aligned(8)));
+
+struct bpf_rb_node {
+ __u64 :64;
+ __u64 :64;
+ __u64 :64;
+} __attribute__((aligned(8)));
+
+struct bpf_refcount {
+ __u32 :32;
+} __attribute__((aligned(4)));
+
struct bpf_sysctl {
__u32 write; /* Sysctl is being read (= 0) or written (= 1).
* Allows 1,2,4-byte read, but no write.
@@ -7051,5 +7139,22 @@ struct bpf_core_relo {
enum bpf_core_relo_kind kind;
};
+/*
+ * Flags to control bpf_timer_start() behaviour.
+ * - BPF_F_TIMER_ABS: Timeout passed is absolute time, by default it is
+ * relative to current time.
+ */
+enum {
+ BPF_F_TIMER_ABS = (1ULL << 0),
+};
+
+/* BPF numbers iterator state */
+struct bpf_iter_num {
+ /* opaque iterator state; having __u64 here allows to preserve correct
+ * alignment requirements in vmlinux.h, generated from BTF
+ */
+ __u64 __opaque[1];
+} __attribute__((aligned(8)));
+
#endif /* _UAPI__LINUX_BPF_H__ */
)********"
diff --git a/src/cc/export/helpers.h b/src/cc/export/helpers.h
index 3873c4a4..dae050bb 100644
--- a/src/cc/export/helpers.h
+++ b/src/cc/export/helpers.h
@@ -1006,13 +1006,13 @@ static void (*bpf_ringbuf_submit_dynptr)(struct bpf_dynptr *ptr, __u64 flags) =
(void *)BPF_FUNC_ringbuf_submit_dynptr;
static void (*bpf_ringbuf_discard_dynptr)(struct bpf_dynptr *ptr, __u64 flags) =
(void *)BPF_FUNC_ringbuf_discard_dynptr;
-static long (*bpf_dynptr_read)(void *dst, __u32 len, struct bpf_dynptr *src, __u32 offset,
+static long (*bpf_dynptr_read)(void *dst, __u32 len, const struct bpf_dynptr *src, __u32 offset,
__u64 flags) =
(void *)BPF_FUNC_dynptr_read;
-static long (*bpf_dynptr_write)(struct bpf_dynptr *dst, __u32 offset, void *src, __u32 len,
+static long (*bpf_dynptr_write)(const struct bpf_dynptr *dst, __u32 offset, void *src, __u32 len,
__u64 flags) =
(void *)BPF_FUNC_dynptr_write;
-static void *(*bpf_dynptr_data)(struct bpf_dynptr *ptr, __u32 offset, __u32 len) =
+static void *(*bpf_dynptr_data)(const struct bpf_dynptr *ptr, __u32 offset, __u32 len) =
(void *)BPF_FUNC_dynptr_data;
static __s64 (*bpf_tcp_raw_gen_syncookie_ipv4)(struct iphdr *iph, struct tcphdr *th,
__u32 th_len) =
--
2.39.2

tool-slabratetop-add-definition-of-freelist_aba_t.patch

@@ -0,0 +1,55 @@
From 59a1fccfc78482af189150b7937b21244f34e48a Mon Sep 17 00:00:00 2001
From: Jerome Marchand <jmarchan@redhat.com>
Date: Thu, 3 Aug 2023 16:11:50 +0200
Subject: [PATCH] tool/slabratetop: add definition of freelist_aba_t
With recent kernels containing commit 6801be4f2653 ("slub: Replace
cmpxchg_double()"), slabratetop fails to compile with the following
error:
In file included from /virtual/main.c:86:
include/linux/slub_def.h:56:3: error: unknown type name 'freelist_aba_t'
freelist_aba_t freelist_tid;
^
2 warnings and 1 error generated.
Traceback (most recent call last):
File "/usr/share/bcc/tools/slabratetop", line 187, in <module>
b = BPF(text=bpf_text)
^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.12/site-packages/bcc/__init__.py", line 479, in __init__
raise Exception("Failed to compile BPF module %s" % (src_file or "<text>"))
Exception: Failed to compile BPF module <text>
Adding the definition of freelist_aba_t fixes the issue.
---
tools/slabratetop.py | 14 ++++++++++++++
1 file changed, 14 insertions(+)
diff --git a/tools/slabratetop.py b/tools/slabratetop.py
index 8fbcac5e..8a7d486e 100755
--- a/tools/slabratetop.py
+++ b/tools/slabratetop.py
@@ -141,6 +141,20 @@ static inline void *slab_address(const struct slab *slab)
return NULL;
}
+#ifdef CONFIG_64BIT
+typedef __uint128_t freelist_full_t;
+#else
+typedef u64 freelist_full_t;
+#endif
+
+typedef union {
+ struct {
+ void *freelist;
+ unsigned long counter;
+ };
+ freelist_full_t full;
+} freelist_aba_t;
+
#ifdef CONFIG_SLUB
#include <linux/slub_def.h>
#else
--
2.41.0
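
To see the layout the added union provides, here is a minimal standalone reduction of its 64-bit branch (the CONFIG_64BIT selection is resolved by hand; __uint128_t is a GCC/Clang builtin on 64-bit targets, so this sketch only builds there):

#include <stdio.h>

typedef __uint128_t freelist_full_t;	/* the CONFIG_64BIT case */

typedef union {
	struct {			/* C11 anonymous struct */
		void *freelist;
		unsigned long counter;
	};
	freelist_full_t full;		/* both words viewed as one 128-bit value */
} freelist_aba_t;

int main(void)
{
	/* 8-byte pointer + 8-byte counter overlay the 16-byte 'full' view */
	printf("sizeof(freelist_aba_t) = %zu\n", sizeof(freelist_aba_t));
	return 0;
}

The kernel presumably uses the 'full' view for its double-word cmpxchg on the freelist; for the BPF tool only the type definition matters, since slub_def.h merely needs it to compile.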

tools-Add-support-for-the-new-block_io_-tracepoints.patch

@@ -0,0 +1,855 @@
From 53ef33b5ad42e6a4baa37821119199f2d846beff Mon Sep 17 00:00:00 2001
From: Jerome Marchand <jmarchan@redhat.com>
Date: Thu, 27 Jul 2023 18:19:18 +0200
Subject: [PATCH 2/2] tools: Add support for the new block_io_* tracepoints
The bio tools currently depend on the blk_account_io_start/done
functions, which can be inlined. To fix that, a couple of tracepoints
have been added upstream (block:block_io_start/done). This patch adds
support for those tracepoints when they are available.
Unfortunately, the bio tools rely on data that is not available to the
tracepoints (mostly the struct request), so the tracepoints can't be
used as a drop-in replacement for blk_account_io_*. The main difference
is that we can't use the struct request as the hash key anymore, so the
tools now use the pair (dev_t, sector) for that purpose.
For the biolatency tool, the -F option is disabled when only the
tracepoints are available, because not all flags are accessible from
the tracepoints. Otherwise, all features of the tools remain.
Closes #4261
Signed-off-by: Jerome Marchand <jmarchan@redhat.com>
---
tools/biolatency.py | 166 ++++++++++++++++++++++++++++--------
tools/biosnoop.py | 200 +++++++++++++++++++++++++++++++++-----------
tools/biotop.py | 108 +++++++++++++++++++-----
3 files changed, 371 insertions(+), 103 deletions(-)
diff --git a/tools/biolatency.py b/tools/biolatency.py
index 8fe43a7c..03b48a4c 100755
--- a/tools/biolatency.py
+++ b/tools/biolatency.py
@@ -11,6 +11,7 @@
#
# 20-Sep-2015 Brendan Gregg Created this.
# 31-Mar-2022 Rocky Xing Added disk filter support.
+# 01-Aug-2023 Jerome Marchand Added support for block tracepoints
from __future__ import print_function
from bcc import BPF
@@ -72,7 +73,7 @@ bpf_text = """
#include <linux/blk-mq.h>
typedef struct disk_key {
- char disk[DISK_NAME_LEN];
+ dev_t dev;
u64 slot;
} disk_key_t;
@@ -86,26 +87,70 @@ typedef struct ext_val {
u64 count;
} ext_val_t;
-BPF_HASH(start, struct request *);
+struct tp_args {
+ u64 __unused__;
+ dev_t dev;
+ sector_t sector;
+ unsigned int nr_sector;
+ unsigned int bytes;
+ char rwbs[8];
+ char comm[16];
+ char cmd[];
+};
+
+struct start_key {
+ dev_t dev;
+ u32 _pad;
+ sector_t sector;
+ CMD_FLAGS
+};
+
+BPF_HASH(start, struct start_key);
STORAGE
+static dev_t ddevt(struct gendisk *disk) {
+ return (disk->major << 20) | disk->first_minor;
+}
+
// time block I/O
-int trace_req_start(struct pt_regs *ctx, struct request *req)
+static int __trace_req_start(struct start_key key)
{
DISK_FILTER
u64 ts = bpf_ktime_get_ns();
- start.update(&req, &ts);
+ start.update(&key, &ts);
return 0;
}
+int trace_req_start(struct pt_regs *ctx, struct request *req)
+{
+ struct start_key key = {
+ .dev = ddevt(req->__RQ_DISK__),
+ .sector = req->__sector
+ };
+
+ SET_FLAGS
+
+ return __trace_req_start(key);
+}
+
+int trace_req_start_tp(struct tp_args *args)
+{
+ struct start_key key = {
+ .dev = args->dev,
+ .sector = args->sector
+ };
+
+ return __trace_req_start(key);
+}
+
// output
-int trace_req_done(struct pt_regs *ctx, struct request *req)
+static int __trace_req_done(struct start_key key)
{
u64 *tsp, delta;
// fetch timestamp and calculate delta
- tsp = start.lookup(&req);
+ tsp = start.lookup(&key);
if (tsp == 0) {
return 0; // missed issue
}
@@ -116,9 +161,31 @@ int trace_req_done(struct pt_regs *ctx, struct request *req)
// store as histogram
STORE
- start.delete(&req);
+ start.delete(&key);
return 0;
}
+
+int trace_req_done(struct pt_regs *ctx, struct request *req)
+{
+ struct start_key key = {
+ .dev = ddevt(req->__RQ_DISK__),
+ .sector = req->__sector
+ };
+
+ SET_FLAGS
+
+ return __trace_req_done(key);
+}
+
+int trace_req_done_tp(struct tp_args *args)
+{
+ struct start_key key = {
+ .dev = args->dev,
+ .sector = args->sector
+ };
+
+ return __trace_req_done(key);
+}
"""
# code substitutions
@@ -134,21 +201,18 @@ store_str = ""
if args.disks:
storage_str += "BPF_HISTOGRAM(dist, disk_key_t);"
disks_str = """
- disk_key_t key = {.slot = bpf_log2l(delta)};
- void *__tmp = (void *)req->__RQ_DISK__->disk_name;
- bpf_probe_read(&key.disk, sizeof(key.disk), __tmp);
- dist.atomic_increment(key);
+ disk_key_t dkey = {};
+ dkey.dev = key.dev;
+ dkey.slot = bpf_log2l(delta);
+ dist.atomic_increment(dkey);
"""
- if BPF.kernel_struct_has_field(b'request', b'rq_disk') == 1:
- store_str += disks_str.replace('__RQ_DISK__', 'rq_disk')
- else:
- store_str += disks_str.replace('__RQ_DISK__', 'q->disk')
+ store_str += disks_str
elif args.flags:
storage_str += "BPF_HISTOGRAM(dist, flag_key_t);"
store_str += """
- flag_key_t key = {.slot = bpf_log2l(delta)};
- key.flags = req->cmd_flags;
- dist.atomic_increment(key);
+ flag_key_t fkey = {.slot = bpf_log2l(delta)};
+ fkey.flags = key.flags;
+ dist.atomic_increment(fkey);
"""
else:
storage_str += "BPF_HISTOGRAM(dist);"
@@ -161,21 +225,13 @@ store_str = ""
exit(1)
stat_info = os.stat(disk_path)
- major = os.major(stat_info.st_rdev)
- minor = os.minor(stat_info.st_rdev)
-
- disk_field_str = ""
- if BPF.kernel_struct_has_field(b'request', b'rq_disk') == 1:
- disk_field_str = 'req->rq_disk'
- else:
- disk_field_str = 'req->q->disk'
+ dev = os.major(stat_info.st_rdev) << 20 | os.minor(stat_info.st_rdev)
disk_filter_str = """
- struct gendisk *disk = %s;
- if (!(disk->major == %d && disk->first_minor == %d)) {
+ if(key.dev != %s) {
return 0;
}
- """ % (disk_field_str, major, minor)
+ """ % (dev)
bpf_text = bpf_text.replace('DISK_FILTER', disk_filter_str)
else:
@@ -194,6 +250,16 @@ store_str = ""
bpf_text = bpf_text.replace("STORAGE", storage_str)
bpf_text = bpf_text.replace("STORE", store_str)
+if BPF.kernel_struct_has_field(b'request', b'rq_disk') == 1:
+ bpf_text = bpf_text.replace('__RQ_DISK__', 'rq_disk')
+else:
+ bpf_text = bpf_text.replace('__RQ_DISK__', 'q->disk')
+if args.flags:
+ bpf_text = bpf_text.replace('CMD_FLAGS', 'u64 flags;')
+ bpf_text = bpf_text.replace('SET_FLAGS', 'key.flags = req->cmd_flags;')
+else:
+ bpf_text = bpf_text.replace('CMD_FLAGS', '')
+ bpf_text = bpf_text.replace('SET_FLAGS', '')
if debug or args.ebpf:
print(bpf_text)
@@ -205,25 +271,53 @@ b = BPF(text=bpf_text)
if args.queued:
if BPF.get_kprobe_functions(b'__blk_account_io_start'):
b.attach_kprobe(event="__blk_account_io_start", fn_name="trace_req_start")
- else:
+ elif BPF.get_kprobe_functions(b'blk_account_io_start'):
b.attach_kprobe(event="blk_account_io_start", fn_name="trace_req_start")
+ else:
+ if args.flags:
+ # Some flags are accessible in the rwbs field (RAHEAD, SYNC and META)
+ # but other aren't. Disable the -F option for tracepoint for now.
+ print("ERROR: blk_account_io_start probe not available. Can't use -F.")
+ exit()
+ b.attach_tracepoint(tp="block:block_io_start", fn_name="trace_req_start_tp")
else:
if BPF.get_kprobe_functions(b'blk_start_request'):
b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start")
+
if BPF.get_kprobe_functions(b'__blk_account_io_done'):
b.attach_kprobe(event="__blk_account_io_done", fn_name="trace_req_done")
-else:
+elif BPF.get_kprobe_functions(b'blk_account_io_done'):
b.attach_kprobe(event="blk_account_io_done", fn_name="trace_req_done")
+else:
+ if args.flags:
+ print("ERROR: blk_account_io_done probe not available. Can't use -F.")
+ exit()
+ b.attach_tracepoint(tp="block:block_io_done", fn_name="trace_req_done_tp")
+
if not args.json:
print("Tracing block device I/O... Hit Ctrl-C to end.")
-def disk_print(s):
- disk = s.decode('utf-8', 'replace')
- if not disk:
- disk = "<unknown>"
- return disk
+# cache disk major,minor -> diskname
+diskstats = "/proc/diskstats"
+disklookup = {}
+with open(diskstats) as stats:
+ for line in stats:
+ a = line.split()
+ disklookup[a[0] + "," + a[1]] = a[2]
+
+def disk_print(d):
+ major = d >> 20
+ minor = d & ((1 << 20) - 1)
+
+ disk = str(major) + "," + str(minor)
+ if disk in disklookup:
+ diskname = disklookup[disk]
+ else:
+ diskname = "?"
+
+ return diskname
# see blk_fill_rwbs():
req_opf = {
diff --git a/tools/biosnoop.py b/tools/biosnoop.py
index 33703233..f0fef98b 100755
--- a/tools/biosnoop.py
+++ b/tools/biosnoop.py
@@ -14,6 +14,7 @@
# 11-Feb-2016 Allan McAleavy updated for BPF_PERF_OUTPUT
# 21-Jun-2022 Rocky Xing Added disk filter support.
# 13-Oct-2022 Rocky Xing Added support for displaying block I/O pattern.
+# 01-Aug-2023 Jerome Marchand Added support for block tracepoints
from __future__ import print_function
from bcc import BPF
@@ -64,6 +65,24 @@ struct val_t {
char name[TASK_COMM_LEN];
};
+struct tp_args {
+ u64 __unused__;
+ dev_t dev;
+ sector_t sector;
+ unsigned int nr_sector;
+ unsigned int bytes;
+ char rwbs[8];
+ char comm[16];
+ char cmd[];
+};
+
+struct hash_key {
+ dev_t dev;
+ u32 rwflag;
+ sector_t sector;
+};
+
+
#ifdef INCLUDE_PATTERN
struct sector_key_t {
u32 dev_major;
@@ -79,6 +98,7 @@ enum bio_pattern {
struct data_t {
u32 pid;
+ u32 dev;
u64 rwflag;
u64 delta;
u64 qdelta;
@@ -88,7 +108,6 @@ struct data_t {
enum bio_pattern pattern;
#endif
u64 ts;
- char disk_name[DISK_NAME_LEN];
char name[TASK_COMM_LEN];
};
@@ -96,12 +115,45 @@ struct data_t {
BPF_HASH(last_sectors, struct sector_key_t, u64);
#endif
-BPF_HASH(start, struct request *, struct start_req_t);
-BPF_HASH(infobyreq, struct request *, struct val_t);
+BPF_HASH(start, struct hash_key, struct start_req_t);
+BPF_HASH(infobyreq, struct hash_key, struct val_t);
BPF_PERF_OUTPUT(events);
+static dev_t ddevt(struct gendisk *disk) {
+ return (disk->major << 20) | disk->first_minor;
+}
+
+/*
+ * The following deals with a kernel version change (in mainline 4.7, although
+ * it may be backported to earlier kernels) with how block request write flags
+ * are tested. We handle both pre- and post-change versions here. Please avoid
+ * kernel version tests like this as much as possible: they inflate the code,
+ * test, and maintenance burden.
+ */
+static int get_rwflag(u32 cmd_flags) {
+#ifdef REQ_WRITE
+ return !!(cmd_flags & REQ_WRITE);
+#elif defined(REQ_OP_SHIFT)
+ return !!((cmd_flags >> REQ_OP_SHIFT) == REQ_OP_WRITE);
+#else
+ return !!((cmd_flags & REQ_OP_MASK) == REQ_OP_WRITE);
+#endif
+}
+
+#define RWBS_LEN 8
+
+static int get_rwflag_tp(char *rwbs) {
+ for (int i = 0; i < RWBS_LEN; i++) {
+ if (rwbs[i] == 'W')
+ return 1;
+ if (rwbs[i] == '\\0')
+ return 0;
+ }
+ return 0;
+}
+
// cache PID and comm by-req
-int trace_pid_start(struct pt_regs *ctx, struct request *req)
+static int __trace_pid_start(struct hash_key key)
{
DISK_FILTER
@@ -113,47 +165,76 @@ int trace_pid_start(struct pt_regs *ctx, struct request *req)
if (##QUEUE##) {
val.ts = bpf_ktime_get_ns();
}
- infobyreq.update(&req, &val);
+ infobyreq.update(&key, &val);
}
return 0;
}
+
+int trace_pid_start(struct pt_regs *ctx, struct request *req)
+{
+ struct hash_key key = {
+ .dev = ddevt(req->__RQ_DISK__),
+ .rwflag = get_rwflag(req->cmd_flags),
+ .sector = req->__sector
+ };
+
+ return __trace_pid_start(key);
+}
+
+int trace_pid_start_tp(struct tp_args *args)
+{
+ struct hash_key key = {
+ .dev = args->dev,
+ .rwflag = get_rwflag_tp(args->rwbs),
+ .sector = args->sector
+ };
+
+ return __trace_pid_start(key);
+}
+
// time block I/O
int trace_req_start(struct pt_regs *ctx, struct request *req)
{
+ struct hash_key key = {
+ .dev = ddevt(req->__RQ_DISK__),
+ .rwflag = get_rwflag(req->cmd_flags),
+ .sector = req->__sector
+ };
+
DISK_FILTER
struct start_req_t start_req = {
.ts = bpf_ktime_get_ns(),
.data_len = req->__data_len
};
- start.update(&req, &start_req);
+ start.update(&key, &start_req);
return 0;
}
// output
-int trace_req_completion(struct pt_regs *ctx, struct request *req)
+static int __trace_req_completion(void *ctx, struct hash_key key)
{
struct start_req_t *startp;
struct val_t *valp;
struct data_t data = {};
- struct gendisk *rq_disk;
+ //struct gendisk *rq_disk;
u64 ts;
// fetch timestamp and calculate delta
- startp = start.lookup(&req);
+ startp = start.lookup(&key);
if (startp == 0) {
// missed tracing issue
return 0;
}
ts = bpf_ktime_get_ns();
- rq_disk = req->__RQ_DISK__;
+ //rq_disk = req->__RQ_DISK__;
data.delta = ts - startp->ts;
data.ts = ts / 1000;
data.qdelta = 0;
data.len = startp->data_len;
- valp = infobyreq.lookup(&req);
+ valp = infobyreq.lookup(&key);
if (valp == 0) {
data.name[0] = '?';
data.name[1] = 0;
@@ -162,10 +243,9 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
data.qdelta = startp->ts - valp->ts;
}
data.pid = valp->pid;
- data.sector = req->__sector;
+ data.sector = key.sector;
+ data.dev = key.dev;
bpf_probe_read_kernel(&data.name, sizeof(data.name), valp->name);
- bpf_probe_read_kernel(&data.disk_name, sizeof(data.disk_name),
- rq_disk->disk_name);
}
#ifdef INCLUDE_PATTERN
@@ -174,8 +254,8 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
u64 *sector, last_sector;
struct sector_key_t sector_key = {
- .dev_major = rq_disk->major,
- .dev_minor = rq_disk->first_minor
+ .dev_major = key.dev >> 20,
+ .dev_minor = key.dev & ((1 << 20) - 1)
};
sector = last_sectors.lookup(&sector_key);
@@ -187,27 +267,36 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
last_sectors.update(&sector_key, &last_sector);
#endif
-/*
- * The following deals with a kernel version change (in mainline 4.7, although
- * it may be backported to earlier kernels) with how block request write flags
- * are tested. We handle both pre- and post-change versions here. Please avoid
- * kernel version tests like this as much as possible: they inflate the code,
- * test, and maintenance burden.
- */
-#ifdef REQ_WRITE
- data.rwflag = !!(req->cmd_flags & REQ_WRITE);
-#elif defined(REQ_OP_SHIFT)
- data.rwflag = !!((req->cmd_flags >> REQ_OP_SHIFT) == REQ_OP_WRITE);
-#else
- data.rwflag = !!((req->cmd_flags & REQ_OP_MASK) == REQ_OP_WRITE);
-#endif
+ data.rwflag = key.rwflag;
events.perf_submit(ctx, &data, sizeof(data));
- start.delete(&req);
- infobyreq.delete(&req);
+ start.delete(&key);
+ infobyreq.delete(&key);
return 0;
}
+
+int trace_req_completion(struct pt_regs *ctx, struct request *req)
+{
+ struct hash_key key = {
+ .dev = ddevt(req->__RQ_DISK__),
+ .rwflag = get_rwflag(req->cmd_flags),
+ .sector = req->__sector
+ };
+
+ return __trace_req_completion(ctx, key);
+}
+
+int trace_req_completion_tp(struct tp_args *args)
+{
+ struct hash_key key = {
+ .dev = args->dev,
+ .rwflag = get_rwflag_tp(args->rwbs),
+ .sector = args->sector
+ };
+
+ return __trace_req_completion(args, key);
+}
"""
if args.queue:
bpf_text = bpf_text.replace('##QUEUE##', '1')
@@ -225,21 +314,13 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
exit(1)
stat_info = os.stat(disk_path)
- major = os.major(stat_info.st_rdev)
- minor = os.minor(stat_info.st_rdev)
-
- disk_field_str = ""
- if BPF.kernel_struct_has_field(b'request', b'rq_disk') == 1:
- disk_field_str = 'req->rq_disk'
- else:
- disk_field_str = 'req->q->disk'
+ dev = os.major(stat_info.st_rdev) << 20 | os.minor(stat_info.st_rdev)
disk_filter_str = """
- struct gendisk *disk = %s;
- if (!(disk->major == %d && disk->first_minor == %d)) {
+ if(key.dev != %s) {
return 0;
}
- """ % (disk_field_str, major, minor)
+ """ % (dev)
bpf_text = bpf_text.replace('DISK_FILTER', disk_filter_str)
else:
@@ -254,15 +335,19 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
b = BPF(text=bpf_text)
if BPF.get_kprobe_functions(b'__blk_account_io_start'):
b.attach_kprobe(event="__blk_account_io_start", fn_name="trace_pid_start")
-else:
+elif BPF.get_kprobe_functions(b'blk_account_io_start'):
b.attach_kprobe(event="blk_account_io_start", fn_name="trace_pid_start")
+else:
+ b.attach_tracepoint(tp="block:block_io_start", fn_name="trace_pid_start_tp")
if BPF.get_kprobe_functions(b'blk_start_request'):
b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start")
if BPF.get_kprobe_functions(b'__blk_account_io_done'):
b.attach_kprobe(event="__blk_account_io_done", fn_name="trace_req_completion")
-else:
+elif BPF.get_kprobe_functions(b'blk_account_io_done'):
b.attach_kprobe(event="blk_account_io_done", fn_name="trace_req_completion")
+else:
+ b.attach_tracepoint(tp="block:block_io_done", fn_name="trace_req_completion_tp")
# header
print("%-11s %-14s %-7s %-9s %-1s %-10s %-7s" % ("TIME(s)", "COMM", "PID",
@@ -273,6 +358,27 @@ print("%-11s %-14s %-7s %-9s %-1s %-10s %-7s" % ("TIME(s)", "COMM", "PID",
print("%7s " % ("QUE(ms)"), end="")
print("%7s" % "LAT(ms)")
+
+# cache disk major,minor -> diskname
+diskstats = "/proc/diskstats"
+disklookup = {}
+with open(diskstats) as stats:
+ for line in stats:
+ a = line.split()
+ disklookup[a[0] + "," + a[1]] = a[2]
+
+def disk_print(d):
+ major = d >> 20
+ minor = d & ((1 << 20) - 1)
+
+ disk = str(major) + "," + str(minor)
+ if disk in disklookup:
+ diskname = disklookup[disk]
+ else:
+ diskname = "<unknown>"
+
+ return diskname
+
rwflg = ""
pattern = ""
start_ts = 0
@@ -297,9 +403,7 @@ P_RANDOM = 2
delta = float(event.ts) - start_ts
- disk_name = event.disk_name.decode('utf-8', 'replace')
- if not disk_name:
- disk_name = '<unknown>'
+ disk_name = disk_print(event.dev)
print("%-11.6f %-14.14s %-7s %-9s %-1s %-10s %-7s" % (
delta / 1000000, event.name.decode('utf-8', 'replace'), event.pid,
diff --git a/tools/biotop.py b/tools/biotop.py
index fcdd373f..2620983a 100755
--- a/tools/biotop.py
+++ b/tools/biotop.py
@@ -14,6 +14,7 @@
#
# 06-Feb-2016 Brendan Gregg Created this.
# 17-Mar-2022 Rocky Xing Added PID filter support.
+# 01-Aug-2023 Jerome Marchand Added support for block tracepoints
from __future__ import print_function
from bcc import BPF
@@ -88,14 +89,35 @@ struct val_t {
u32 io;
};
-BPF_HASH(start, struct request *, struct start_req_t);
-BPF_HASH(whobyreq, struct request *, struct who_t);
+struct tp_args {
+ u64 __unused__;
+ dev_t dev;
+ sector_t sector;
+ unsigned int nr_sector;
+ unsigned int bytes;
+ char rwbs[8];
+ char comm[16];
+ char cmd[];
+};
+
+struct hash_key {
+ dev_t dev;
+ u32 _pad;
+ sector_t sector;
+};
+
+BPF_HASH(start, struct hash_key, struct start_req_t);
+BPF_HASH(whobyreq, struct hash_key, struct who_t);
BPF_HASH(counts, struct info_t, struct val_t);
+static dev_t ddevt(struct gendisk *disk) {
+ return (disk->major << 20) | disk->first_minor;
+}
+
// cache PID and comm by-req
-int trace_pid_start(struct pt_regs *ctx, struct request *req)
+static int __trace_pid_start(struct hash_key key)
{
- struct who_t who = {};
+ struct who_t who;
u32 pid;
if (bpf_get_current_comm(&who.name, sizeof(who.name)) == 0) {
@@ -104,30 +126,54 @@ int trace_pid_start(struct pt_regs *ctx, struct request *req)
return 0;
who.pid = pid;
- whobyreq.update(&req, &who);
+ whobyreq.update(&key, &who);
}
return 0;
}
+int trace_pid_start(struct pt_regs *ctx, struct request *req)
+{
+ struct hash_key key = {
+ .dev = ddevt(req->__RQ_DISK__),
+ .sector = req->__sector
+ };
+
+ return __trace_pid_start(key);
+}
+
+int trace_pid_start_tp(struct tp_args *args)
+{
+ struct hash_key key = {
+ .dev = args->dev,
+ .sector = args->sector
+ };
+
+ return __trace_pid_start(key);
+}
+
// time block I/O
int trace_req_start(struct pt_regs *ctx, struct request *req)
{
+ struct hash_key key = {
+ .dev = ddevt(req->__RQ_DISK__),
+ .sector = req->__sector
+ };
struct start_req_t start_req = {
.ts = bpf_ktime_get_ns(),
.data_len = req->__data_len
};
- start.update(&req, &start_req);
+ start.update(&key, &start_req);
return 0;
}
// output
-int trace_req_completion(struct pt_regs *ctx, struct request *req)
+static int __trace_req_completion(struct hash_key key)
{
struct start_req_t *startp;
// fetch timestamp and calculate delta
- startp = start.lookup(&req);
+ startp = start.lookup(&key);
if (startp == 0) {
return 0; // missed tracing issue
}
@@ -135,12 +181,12 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
struct who_t *whop;
u32 pid;
- whop = whobyreq.lookup(&req);
+ whop = whobyreq.lookup(&key);
pid = whop != 0 ? whop->pid : 0;
if (FILTER_PID) {
- start.delete(&req);
+ start.delete(&key);
if (whop != 0) {
- whobyreq.delete(&req);
+ whobyreq.delete(&key);
}
return 0;
}
@@ -150,8 +196,8 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
// setup info_t key
struct info_t info = {};
- info.major = req->__RQ_DISK__->major;
- info.minor = req->__RQ_DISK__->first_minor;
+ info.major = key.dev >> 20;
+ info.minor = key.dev & ((1 << 20) - 1);
/*
* The following deals with a kernel version change (in mainline 4.7, although
* it may be backported to earlier kernels) with how block request write flags
@@ -159,13 +205,13 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
* kernel version tests like this as much as possible: they inflate the code,
* test, and maintenance burden.
*/
-#ifdef REQ_WRITE
+/*#ifdef REQ_WRITE
info.rwflag = !!(req->cmd_flags & REQ_WRITE);
#elif defined(REQ_OP_SHIFT)
info.rwflag = !!((req->cmd_flags >> REQ_OP_SHIFT) == REQ_OP_WRITE);
#else
info.rwflag = !!((req->cmd_flags & REQ_OP_MASK) == REQ_OP_WRITE);
-#endif
+#endif*/
if (whop == 0) {
// missed pid who, save stats as pid 0
@@ -183,11 +229,31 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
valp->io++;
}
- start.delete(&req);
- whobyreq.delete(&req);
+ start.delete(&key);
+ whobyreq.delete(&key);
return 0;
}
+
+int trace_req_completion(struct pt_regs *ctx, struct request *req)
+{
+ struct hash_key key = {
+ .dev = ddevt(req->__RQ_DISK__),
+ .sector = req->__sector
+ };
+
+ return __trace_req_completion(key);
+}
+
+int trace_req_completion_tp(struct tp_args *args)
+{
+ struct hash_key key = {
+ .dev = args->dev,
+ .sector = args->sector
+ };
+
+ return __trace_req_completion(key);
+}
"""
if args.ebpf:
@@ -207,15 +273,19 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
b = BPF(text=bpf_text)
if BPF.get_kprobe_functions(b'__blk_account_io_start'):
b.attach_kprobe(event="__blk_account_io_start", fn_name="trace_pid_start")
-else:
+elif BPF.get_kprobe_functions(b'blk_account_io_start'):
b.attach_kprobe(event="blk_account_io_start", fn_name="trace_pid_start")
+else:
+ b.attach_tracepoint(tp="block:block_io_start", fn_name="trace_pid_start_tp")
if BPF.get_kprobe_functions(b'blk_start_request'):
b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start")
if BPF.get_kprobe_functions(b'__blk_account_io_done'):
b.attach_kprobe(event="__blk_account_io_done", fn_name="trace_req_completion")
-else:
+elif BPF.get_kprobe_functions(b'blk_account_io_done'):
b.attach_kprobe(event="blk_account_io_done", fn_name="trace_req_completion")
+else:
+ b.attach_tracepoint(tp="block:block_io_done", fn_name="trace_req_completion_tp")
print('Tracing... Output every %d secs. Hit Ctrl-C to end' % interval)
--
2.41.0
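
One final note on the (dev_t, sector) key used throughout these tools: ddevt() packs the device number with the minor in the low 20 bits, the same split the kernel's MKDEV()/MAJOR()/MINOR() macros use, which is what lets the Python side decompose it again for the /proc/diskstats name lookup. A small standalone sketch with illustrative values:

#include <stdio.h>

/* Same packing as ddevt() in the patch: major above bit 20, minor below. */
static unsigned int pack_dev(unsigned int major, unsigned int first_minor)
{
	return (major << 20) | first_minor;
}

int main(void)
{
	unsigned int dev = pack_dev(8, 2);	/* 8:2 is typically sda2 */
	printf("dev=%u -> major=%u minor=%u\n",
	       dev, dev >> 20, dev & ((1u << 20) - 1));
	return 0;
}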