Rebase bcc to the latest version

Rebase bcc to v0.28.0 and rebuild it with LLVM 17. The rebase fixes
the following issues:
 - bpf-biosnoop out-of-bounds access
 - kvmexit missing VM exit reasons and statistics
 - multi-word array type handling

Also fix the tcpstates -Y issue and the bio tools on non-x86_64 architectures.

Resolves: RHEL-9976
Resolves: RHEL-8664
Resolves: RHEL-8702
Resolves: RHEL-8674
Resolves: RHEL-8490
Resolves: RHEL-10591
Resolves: RHEL-8553

Signed-off-by: Jerome Marchand <jmarchan@redhat.com>
This commit is contained in:
Jerome Marchand 2023-10-17 18:00:03 +02:00
parent 6436833ca5
commit 2053933c95
18 changed files with 1171 additions and 1515 deletions

1
.gitignore vendored
View File

@ -17,3 +17,4 @@
/bcc-0.24.0.tar.gz
/bcc-0.25.0.tar.gz
/bcc-0.26.0.tar.gz
/bcc-0.28.0.tar.gz

View File

@ -1,77 +0,0 @@
From c17a12ac030c5d9c812e611f8132570af0e795af Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Sat, 13 Aug 2022 17:50:07 -0700
Subject: [PATCH 1/2] Fix bpf_pseudo_fd() type conversion error
With llvm15 and llvm16, the following command line
sudo ./trace.py 'smp_call_function_single "%K", arg1'
will cause error:
/virtual/main.c:60:36: error: incompatible integer to pointer conversion passing 'u64'
(aka 'unsigned long long') to parameter of type 'void *' [-Wint-conversion]
bpf_perf_event_output(ctx, bpf_pseudo_fd(1, -1), CUR_CPU_IDENTIFIER, &__data, sizeof(__data));
^~~~~~~~~~~~~~~~~~~~
1 error generated.
Failed to compile BPF module <text>
In helpers.h, we have
u64 bpf_pseudo_fd(u64, u64) asm("llvm.bpf.pseudo");
Apparently, <= llvm14 can tolerate u64 -> 'void *' conversion, but
llvm15 by default will cause an error.
Let us explicitly convert bpf_pseudo_fd to 'void *' to avoid
such errors.
Signed-off-by: Yonghong Song <yhs@fb.com>
---
src/cc/frontends/clang/b_frontend_action.cc | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/src/cc/frontends/clang/b_frontend_action.cc b/src/cc/frontends/clang/b_frontend_action.cc
index a4e05b16..dbeba3e4 100644
--- a/src/cc/frontends/clang/b_frontend_action.cc
+++ b/src/cc/frontends/clang/b_frontend_action.cc
@@ -957,7 +957,7 @@ bool BTypeVisitor::VisitCallExpr(CallExpr *Call) {
string arg0 = rewriter_.getRewrittenText(expansionRange(Call->getArg(0)->getSourceRange()));
string args_other = rewriter_.getRewrittenText(expansionRange(SourceRange(GET_BEGINLOC(Call->getArg(1)),
GET_ENDLOC(Call->getArg(2)))));
- txt = "bpf_perf_event_output(" + arg0 + ", bpf_pseudo_fd(1, " + fd + ")";
+ txt = "bpf_perf_event_output(" + arg0 + ", (void *)bpf_pseudo_fd(1, " + fd + ")";
txt += ", CUR_CPU_IDENTIFIER, " + args_other + ")";
// e.g.
@@ -986,7 +986,7 @@ bool BTypeVisitor::VisitCallExpr(CallExpr *Call) {
string meta_len = rewriter_.getRewrittenText(expansionRange(Call->getArg(3)->getSourceRange()));
txt = "bpf_perf_event_output(" +
skb + ", " +
- "bpf_pseudo_fd(1, " + fd + "), " +
+ "(void *)bpf_pseudo_fd(1, " + fd + "), " +
"((__u64)" + skb_len + " << 32) | BPF_F_CURRENT_CPU, " +
meta + ", " +
meta_len + ");";
@@ -1006,12 +1006,12 @@ bool BTypeVisitor::VisitCallExpr(CallExpr *Call) {
string keyp = rewriter_.getRewrittenText(expansionRange(Call->getArg(1)->getSourceRange()));
string flag = rewriter_.getRewrittenText(expansionRange(Call->getArg(2)->getSourceRange()));
txt = "bpf_" + string(memb_name) + "(" + ctx + ", " +
- "bpf_pseudo_fd(1, " + fd + "), " + keyp + ", " + flag + ");";
+ "(void *)bpf_pseudo_fd(1, " + fd + "), " + keyp + ", " + flag + ");";
} else if (memb_name == "ringbuf_output") {
string name = string(Ref->getDecl()->getName());
string args = rewriter_.getRewrittenText(expansionRange(SourceRange(GET_BEGINLOC(Call->getArg(0)),
GET_ENDLOC(Call->getArg(2)))));
- txt = "bpf_ringbuf_output(bpf_pseudo_fd(1, " + fd + ")";
+ txt = "bpf_ringbuf_output((void *)bpf_pseudo_fd(1, " + fd + ")";
txt += ", " + args + ")";
// e.g.
@@ -1033,7 +1033,7 @@ bool BTypeVisitor::VisitCallExpr(CallExpr *Call) {
} else if (memb_name == "ringbuf_reserve") {
string name = string(Ref->getDecl()->getName());
string arg0 = rewriter_.getRewrittenText(expansionRange(Call->getArg(0)->getSourceRange()));
- txt = "bpf_ringbuf_reserve(bpf_pseudo_fd(1, " + fd + ")";
+ txt = "bpf_ringbuf_reserve((void *)bpf_pseudo_fd(1, " + fd + ")";
txt += ", " + arg0 + ", 0)"; // Flags in reserve are meaningless
} else if (memb_name == "ringbuf_discard") {
string name = string(Ref->getDecl()->getName());
--
2.38.1

View File

@ -1,96 +0,0 @@
From 9ae3908ae38b3e8d8e36a52c0e5664c453d4c015 Mon Sep 17 00:00:00 2001
From: Jerome Marchand <jmarchan@redhat.com>
Date: Wed, 26 Oct 2022 14:41:54 +0200
Subject: [PATCH 2/2] Fix clang 15 int to pointer conversion errors
Since version 15, clang issues an error for implicit conversion of an
integer to a pointer, which breaks several tools. This patch adds explicit
pointer casts where needed.
Fixes the following errors:
/virtual/main.c:37:18: error: incompatible integer to pointer conversion initializing 'struct request *' with an expression of type 'unsigned long' [-Wint-conversion]
struct request *req = ctx->di;
^ ~~~~~~~
/virtual/main.c:49:18: error: incompatible integer to pointer conversion initializing 'struct request *' with an expression of type 'unsigned long' [-Wint-conversion]
struct request *req = ctx->di;
^ ~~~~~~~
2 errors generated.
/virtual/main.c:73:19: error: incompatible integer to pointer conversion initializing 'struct pt_regs *' with an expression of type 'unsigned long' [-Wint-conversion]
struct pt_regs * __ctx = ctx->di;
^ ~~~~~~~
/virtual/main.c:100:240: error: incompatible integer to pointer conversion passing 'u64' (aka 'unsigned long long') to parameter of type 'const void *' [-Wint-conversion]
data.ppid = ({ typeof(pid_t) _val; __builtin_memset(&_val, 0, sizeof(_val)); bpf_probe_read(&_val, sizeof(_val), (u64)&({ typeof(struct task_struct *) _val; __builtin_memset(&_val, 0, sizeof(_val)); bpf_probe_read(&_val, sizeof(_val), (u64)&task->real_parent); _val; })->tgid); _val; });
^~~~~~~~~~~~~~~~~~~~~~~
/virtual/main.c:100:118: error: incompatible integer to pointer conversion passing 'u64' (aka 'unsigned long long') to parameter of type 'const void *' [-Wint-conversion]
data.ppid = ({ typeof(pid_t) _val; __builtin_memset(&_val, 0, sizeof(_val)); bpf_probe_read(&_val, sizeof(_val), (u64)&({ typeof(struct task_struct *) _val; __builtin_memset(&_val, 0, sizeof(_val)); bpf_probe_read(&_val, sizeof(_val), (u64)&task->real_parent); _val; })->tgid); _val; });
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Signed-off-by: Jerome Marchand <jmarchan@redhat.com>
---
src/cc/frontends/clang/b_frontend_action.cc | 18 +++++++++---------
1 file changed, 9 insertions(+), 9 deletions(-)
diff --git a/src/cc/frontends/clang/b_frontend_action.cc b/src/cc/frontends/clang/b_frontend_action.cc
index dbeba3e4..c0582464 100644
--- a/src/cc/frontends/clang/b_frontend_action.cc
+++ b/src/cc/frontends/clang/b_frontend_action.cc
@@ -517,9 +517,9 @@ bool ProbeVisitor::VisitUnaryOperator(UnaryOperator *E) {
string pre, post;
pre = "({ typeof(" + E->getType().getAsString() + ") _val; __builtin_memset(&_val, 0, sizeof(_val));";
if (cannot_fall_back_safely)
- pre += " bpf_probe_read_kernel(&_val, sizeof(_val), (u64)";
+ pre += " bpf_probe_read_kernel(&_val, sizeof(_val), (void *)";
else
- pre += " bpf_probe_read(&_val, sizeof(_val), (u64)";
+ pre += " bpf_probe_read(&_val, sizeof(_val), (void *)";
post = "); _val; })";
rewriter_.ReplaceText(expansionLoc(E->getOperatorLoc()), 1, pre);
rewriter_.InsertTextAfterToken(expansionLoc(GET_ENDLOC(sub)), post);
@@ -581,9 +581,9 @@ bool ProbeVisitor::VisitMemberExpr(MemberExpr *E) {
string pre, post;
pre = "({ typeof(" + E->getType().getAsString() + ") _val; __builtin_memset(&_val, 0, sizeof(_val));";
if (cannot_fall_back_safely)
- pre += " bpf_probe_read_kernel(&_val, sizeof(_val), (u64)&";
+ pre += " bpf_probe_read_kernel(&_val, sizeof(_val), (void *)&";
else
- pre += " bpf_probe_read(&_val, sizeof(_val), (u64)&";
+ pre += " bpf_probe_read(&_val, sizeof(_val), (void *)&";
post = rhs + "); _val; })";
rewriter_.InsertText(expansionLoc(GET_BEGINLOC(E)), pre);
rewriter_.ReplaceText(expansionRange(SourceRange(member, GET_ENDLOC(E))), post);
@@ -635,9 +635,9 @@ bool ProbeVisitor::VisitArraySubscriptExpr(ArraySubscriptExpr *E) {
pre = "({ typeof(" + E->getType().getAsString() + ") _val; __builtin_memset(&_val, 0, sizeof(_val));";
if (cannot_fall_back_safely)
- pre += " bpf_probe_read_kernel(&_val, sizeof(_val), (u64)((";
+ pre += " bpf_probe_read_kernel(&_val, sizeof(_val), (void *)((";
else
- pre += " bpf_probe_read(&_val, sizeof(_val), (u64)((";
+ pre += " bpf_probe_read(&_val, sizeof(_val), (void *)((";
if (isMemberDereference(base)) {
pre += "&";
// If the base of the array subscript is a member dereference, we'll rewrite
@@ -747,8 +747,8 @@ void BTypeVisitor::genParamDirectAssign(FunctionDecl *D, string& preamble,
arg->addAttr(UnavailableAttr::CreateImplicit(C, "ptregs"));
size_t d = idx - 1;
const char *reg = calling_conv_regs[d];
- preamble += " " + text + " = " + fn_args_[0]->getName().str() + "->" +
- string(reg) + ";";
+ preamble += " " + text + " = (" + arg->getType().getAsString() + ")" +
+ fn_args_[0]->getName().str() + "->" + string(reg) + ";";
}
}
}
@@ -762,7 +762,7 @@ void BTypeVisitor::genParamIndirectAssign(FunctionDecl *D, string& preamble,
if (idx == 0) {
new_ctx = "__" + arg->getName().str();
- preamble += " struct pt_regs * " + new_ctx + " = " +
+ preamble += " struct pt_regs * " + new_ctx + " = (void *)" +
arg->getName().str() + "->" +
string(calling_conv_regs[0]) + ";";
} else {
--
2.38.1

View File

@ -1,222 +0,0 @@
From 2f6565681e627d11dde0177503100669df020684 Mon Sep 17 00:00:00 2001
From: Jerome Marchand <jmarchan@redhat.com>
Date: Sun, 28 Aug 2022 07:44:01 +0200
Subject: [PATCH] Fix some documentation issues (#4197)
* compactsnoop-manpage: fix the name of the tool in the NAME section
In its manpage, the compactsnoop tool is called compactstall in the NAME
section. I don't know where that name comes from, but it should be
compactsnoop.
* dirtop-manpage: use '-d' option in the EXAMPLES section
The mandatory '-d' option of dirtop is missing in the EXAMPLES
section. Copy it from the usage message. Also remove '.py' suffixes.
* funclatency-manpage: fix typo in one of the examples
There is a spurious colon in one of the manpage examples. Remove it.
* tools/killsnoop: add '-s' option in the synopsis of the example file
Commit 33c8b1ac ("Update man page and example file") added the '-s' option
to the manpage and an example in the example file, but missed the
synopsis in the latter case.
* trace-manpage: add missing options (-c,-n,-f and -B) to the synopsis
Copy the full synopsis from the usage message.
* tcptracer-manpage: add missing '-t' option in the manpage
Add '-t' option to the synopsis and description.
* tcpsubnet-manpage: remove '--ebpf' option from the manpage
This option is explicitly suppressed in argparse and no manpage of
other tools mentions it.
* manpages: remove '.py' suffix from the synopsis of some *snoop tools
Other manpages don't show the suffix, nor do the usage messages.
---
man/man8/bindsnoop.8 | 2 +-
man/man8/compactsnoop.8 | 4 ++--
man/man8/dirtop.8 | 8 ++++----
man/man8/drsnoop.8 | 2 +-
man/man8/funclatency.8 | 2 +-
man/man8/opensnoop.8 | 2 +-
man/man8/tcpsubnet.8 | 5 +----
man/man8/tcptracer.8 | 5 ++++-
man/man8/trace.8 | 6 ++++--
tools/killsnoop_example.txt | 2 ++
10 files changed, 21 insertions(+), 17 deletions(-)
diff --git a/man/man8/bindsnoop.8 b/man/man8/bindsnoop.8
index f8fa1850..0eb42ccb 100644
--- a/man/man8/bindsnoop.8
+++ b/man/man8/bindsnoop.8
@@ -2,7 +2,7 @@
.SH NAME
bindsnoop \- Trace bind() system calls.
.SH SYNOPSIS
-.B bindsnoop.py [\fB-h\fP] [\fB-w\fP] [\fB-t\fP] [\fB-p\fP PID] [\fB-P\fP PORT] [\fB-E\fP] [\fB-U\fP] [\fB-u\fP UID] [\fB--count\fP] [\fB--cgroupmap MAP\fP] [\fB--mntnsmap MNTNSMAP\fP]
+.B bindsnoop [\fB-h\fP] [\fB-w\fP] [\fB-t\fP] [\fB-p\fP PID] [\fB-P\fP PORT] [\fB-E\fP] [\fB-U\fP] [\fB-u\fP UID] [\fB--count\fP] [\fB--cgroupmap MAP\fP] [\fB--mntnsmap MNTNSMAP\fP]
.SH DESCRIPTION
bindsnoop reports socket options set before the bind call that would impact this system call behavior.
.PP
diff --git a/man/man8/compactsnoop.8 b/man/man8/compactsnoop.8
index a2933d7a..e9cde0ce 100644
--- a/man/man8/compactsnoop.8
+++ b/man/man8/compactsnoop.8
@@ -1,8 +1,8 @@
.TH compactsnoop 8 "2019-11-1" "USER COMMANDS"
.SH NAME
-compactstall \- Trace compact zone events. Uses Linux eBPF/bcc.
+compactsnoop \- Trace compact zone events. Uses Linux eBPF/bcc.
.SH SYNOPSIS
-.B compactsnoop.py [\-h] [\-T] [\-p PID] [\-d DURATION] [\-K] [\-e]
+.B compactsnoop [\-h] [\-T] [\-p PID] [\-d DURATION] [\-K] [\-e]
.SH DESCRIPTION
compactsnoop traces the compact zone events, showing which processes are
allocing pages with memory compaction. This can be useful for discovering
diff --git a/man/man8/dirtop.8 b/man/man8/dirtop.8
index cc61a676..eaa0c0c4 100644
--- a/man/man8/dirtop.8
+++ b/man/man8/dirtop.8
@@ -55,19 +55,19 @@ Number of interval summaries.
.TP
Summarize block device I/O by directory, 1 second screen refresh:
#
-.B dirtop.py
+.B dirtop -d '/hdfs/uuid/*/yarn'
.TP
Don't clear the screen, and top 8 rows only:
#
-.B dirtop.py -Cr 8
+.B dirtop -d '/hdfs/uuid/*/yarn' -Cr 8
.TP
5 second summaries, 10 times only:
#
-.B dirtop.py 5 10
+.B dirtop -d '/hdfs/uuid/*/yarn' 5 10
.TP
Report read & write IOs generated in mutliple yarn and data directories:
#
-.B dirtop.py -d '/hdfs/uuid/*/yarn,/hdfs/uuid/*/data'
+.B dirtop -d '/hdfs/uuid/*/yarn,/hdfs/uuid/*/data'
.SH FIELDS
.TP
loadavg:
diff --git a/man/man8/drsnoop.8 b/man/man8/drsnoop.8
index 90ca901f..8fb3789a 100644
--- a/man/man8/drsnoop.8
+++ b/man/man8/drsnoop.8
@@ -2,7 +2,7 @@
.SH NAME
drsnoop \- Trace direct reclaim events. Uses Linux eBPF/bcc.
.SH SYNOPSIS
-.B drsnoop.py [\-h] [\-T] [\-U] [\-p PID] [\-t TID] [\-u UID] [\-d DURATION] [-n name] [-v]
+.B drsnoop [\-h] [\-T] [\-U] [\-p PID] [\-t TID] [\-u UID] [\-d DURATION] [-n name] [-v]
.SH DESCRIPTION
drsnoop trace direct reclaim events, showing which processes are allocing pages
with direct reclaiming. This can be useful for discovering when allocstall (/p-
diff --git a/man/man8/funclatency.8 b/man/man8/funclatency.8
index 9012b832..f96f6098 100644
--- a/man/man8/funclatency.8
+++ b/man/man8/funclatency.8
@@ -89,7 +89,7 @@ Print the BPF program (for debugging purposes).
.TP
Time vfs_read() for process ID 181 only:
#
-.B funclatency \-p 181 vfs_read:
+.B funclatency \-p 181 vfs_read
.TP
Time both vfs_fstat() and vfs_fstatat() calls, by use of a wildcard:
#
diff --git a/man/man8/opensnoop.8 b/man/man8/opensnoop.8
index fee83263..d1888772 100644
--- a/man/man8/opensnoop.8
+++ b/man/man8/opensnoop.8
@@ -2,7 +2,7 @@
.SH NAME
opensnoop \- Trace open() syscalls. Uses Linux eBPF/bcc.
.SH SYNOPSIS
-.B opensnoop.py [\-h] [\-T] [\-U] [\-x] [\-p PID] [\-t TID] [\-u UID]
+.B opensnoop [\-h] [\-T] [\-U] [\-x] [\-p PID] [\-t TID] [\-u UID]
[\-d DURATION] [\-n NAME] [\-e] [\-f FLAG_FILTER]
[--cgroupmap MAPPATH] [--mntnsmap MAPPATH]
.SH DESCRIPTION
diff --git a/man/man8/tcpsubnet.8 b/man/man8/tcpsubnet.8
index 525b8082..ad5f1be1 100644
--- a/man/man8/tcpsubnet.8
+++ b/man/man8/tcpsubnet.8
@@ -2,7 +2,7 @@
.SH NAME
tcpsubnet \- Summarize and aggregate IPv4 TCP traffic by subnet.
.SH SYNOPSIS
-.B tcpsubnet [\-h] [\-v] [\--ebpf] [\-J] [\-f FORMAT] [\-i INTERVAL] [subnets]
+.B tcpsubnet [\-h] [\-v] [\-J] [\-f FORMAT] [\-i INTERVAL] [subnets]
.SH DESCRIPTION
This tool summarizes and aggregates IPv4 TCP sent to the subnets
passed in argument and prints to stdout on a fixed interval.
@@ -35,9 +35,6 @@ Interval between updates, seconds (default 1).
Format output units. Supported values are bkmBKM. When using
kmKM the output will be rounded to floor.
.TP
-\--ebpf
-Prints the BPF program.
-.TP
subnets
Comma separated list of subnets. Traffic will be categorized
in theses subnets. Order matters.
diff --git a/man/man8/tcptracer.8 b/man/man8/tcptracer.8
index 59240f4b..19a6164d 100644
--- a/man/man8/tcptracer.8
+++ b/man/man8/tcptracer.8
@@ -2,7 +2,7 @@
.SH NAME
tcptracer \- Trace TCP established connections. Uses Linux eBPF/bcc.
.SH SYNOPSIS
-.B tcptracer [\-h] [\-v] [\-p PID] [\-N NETNS] [\-\-cgroupmap MAPPATH] [--mntnsmap MAPPATH] [\-4 | \-6]
+.B tcptracer [\-h] [\-v] [-t] [\-p PID] [\-N NETNS] [\-\-cgroupmap MAPPATH] [--mntnsmap MAPPATH] [\-4 | \-6]
.SH DESCRIPTION
This tool traces established TCP connections that open and close while tracing,
and prints a line of output per connect, accept and close events. This includes
@@ -23,6 +23,9 @@ Print usage message.
\-v
Print full lines, with long event type names and network namespace numbers.
.TP
+\-t
+Include timestamp on output
+.TP
\-p PID
Trace this process ID only (filtered in-kernel).
.TP
diff --git a/man/man8/trace.8 b/man/man8/trace.8
index c4417e5f..64a5e799 100644
--- a/man/man8/trace.8
+++ b/man/man8/trace.8
@@ -2,9 +2,11 @@
.SH NAME
trace \- Trace a function and print its arguments or return value, optionally evaluating a filter. Uses Linux eBPF/bcc.
.SH SYNOPSIS
-.B trace [-h] [-b BUFFER_PAGES] [-p PID] [-L TID] [--uid UID] [-v] [-Z STRING_SIZE] [-S] [-s SYM_FILE_LIST]
- [-M MAX_EVENTS] [-t] [-u] [-T] [-C] [-K] [-U] [-a] [-I header] [-A]
+.B trace [-h] [-b BUFFER_PAGES] [-p PID] [-L TID] [--uid UID] [-v] [-Z STRING_SIZE] [-S] [-M MAX_EVENTS] [-t]
+ [-u] [-T] [-C] [-c CGROUP_PATH] [-n NAME] [-f MSG_FILTER] [-B] [-s SYM_FILE_LIST] [-K] [-U] [-a]
+ [-I header] [-A]
probe [probe ...]
+
.SH DESCRIPTION
trace probes functions you specify and displays trace messages if a particular
condition is met. You can control the message format to display function
diff --git a/tools/killsnoop_example.txt b/tools/killsnoop_example.txt
index 7746f2a0..038d09c6 100644
--- a/tools/killsnoop_example.txt
+++ b/tools/killsnoop_example.txt
@@ -27,6 +27,8 @@ Trace signals issued by the kill() syscall
-h, --help show this help message and exit
-x, --failed only show failed kill syscalls
-p PID, --pid PID trace this PID only
+ -s SIGNAL, --signal SIGNAL
+ trace this signal only
examples:
./killsnoop # trace all kill() signals
--
2.38.1

View File

@ -1,100 +0,0 @@
From 2e14fbaf9105e0b504f243ffc6d7d5a16e13a2a7 Mon Sep 17 00:00:00 2001
From: Alan Maguire <alan.maguire@oracle.com>
Date: Fri, 14 Oct 2022 13:01:58 +0000
Subject: [PATCH] bcc: support building with external libbpf package and older
uapi linux/bpf.h
When building bcc with a relatively new packaged libbpf (0.8.1)
and -DCMAKE_USE_LIBBPF_PACKAGE:BOOL=TRUE, multiple compilation
failures are encountered due to the fact that the system uapi header
in /usr/include/linux/bpf.h is not very recent (this is often
the case for distros, which sync it via a kernel headers
package quite conservatively due to use by glibc).
With libbpf built via git submodule, the uapi header included in
the libbpf package is used, so here a similar approach is proposed
for the external package build. Instead of having to sync
another file the already present compat/linux/virtual_bpf.h
is used; we copy it to compat/linux/bpf.h (eliminating the
string prefix/suffix on first/last lines).
From there, we ensure that places that assume the presence of
the libbpf git submodule point at compat/ as a location to
find the uapi header.
Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
---
examples/cpp/CMakeLists.txt | 4 ++++
introspection/CMakeLists.txt | 4 ++++
src/cc/CMakeLists.txt | 6 ++++++
tests/cc/CMakeLists.txt | 4 ++++
4 files changed, 18 insertions(+)
diff --git a/examples/cpp/CMakeLists.txt b/examples/cpp/CMakeLists.txt
index 801e6bad..8d09ae11 100644
--- a/examples/cpp/CMakeLists.txt
+++ b/examples/cpp/CMakeLists.txt
@@ -4,7 +4,11 @@
include_directories(${PROJECT_BINARY_DIR}/src/cc)
include_directories(${PROJECT_SOURCE_DIR}/src/cc)
include_directories(${PROJECT_SOURCE_DIR}/src/cc/api)
+if (CMAKE_USE_LIBBPF_PACKAGE AND LIBBPF_FOUND)
+include_directories(${PROJECT_SOURCE_DIR}/src/cc/compat)
+else()
include_directories(${PROJECT_SOURCE_DIR}/src/cc/libbpf/include/uapi)
+endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC")
diff --git a/introspection/CMakeLists.txt b/introspection/CMakeLists.txt
index dcbe69a3..ce2d03dc 100644
--- a/introspection/CMakeLists.txt
+++ b/introspection/CMakeLists.txt
@@ -3,7 +3,11 @@
include_directories(${PROJECT_SOURCE_DIR}/src/cc)
include_directories(${PROJECT_SOURCE_DIR}/src/cc/api)
+if (CMAKE_USE_LIBBPF_PACKAGE AND LIBBPF_FOUND)
+include_directories(${PROJECT_SOURCE_DIR}/src/cc/compat)
+else()
include_directories(${PROJECT_SOURCE_DIR}/src/cc/libbpf/include/uapi)
+endif()
option(INSTALL_INTROSPECTION "Install BPF introspection tools" ON)
option(BPS_LINK_RT "Pass -lrt to linker when linking bps tool" ON)
diff --git a/src/cc/CMakeLists.txt b/src/cc/CMakeLists.txt
index ffe8feec..c7f53530 100644
--- a/src/cc/CMakeLists.txt
+++ b/src/cc/CMakeLists.txt
@@ -15,6 +15,12 @@ endif (LIBDEBUGINFOD_FOUND)
# todo: if check for kernel version
if (CMAKE_USE_LIBBPF_PACKAGE AND LIBBPF_FOUND)
include_directories(${LIBBPF_INCLUDE_DIRS})
+ # create up-to-date linux/bpf.h from virtual_bpf.h (remove string wrapper);
+ # when libbpf is built as a submodule we use its version of linux/bpf.h
+ # so this does similar for the libbpf package, removing reliance on the
+ # system uapi header which can be out of date.
+ execute_process(COMMAND sh -c "cd ${CMAKE_CURRENT_SOURCE_DIR}/compat/linux && grep -ve '\\*\\*\\*\\*' virtual_bpf.h > bpf.h")
+ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/compat)
else()
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/libbpf/include)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/libbpf/include/uapi)
diff --git a/tests/cc/CMakeLists.txt b/tests/cc/CMakeLists.txt
index 677867d7..47056455 100644
--- a/tests/cc/CMakeLists.txt
+++ b/tests/cc/CMakeLists.txt
@@ -3,7 +3,11 @@
include_directories(${PROJECT_SOURCE_DIR}/src/cc)
include_directories(${PROJECT_SOURCE_DIR}/src/cc/api)
+if (CMAKE_USE_LIBBPF_PACKAGE AND LIBBPF_FOUND)
+include_directories(${PROJECT_SOURCE_DIR}/src/cc/compat)
+else()
include_directories(${PROJECT_SOURCE_DIR}/src/cc/libbpf/include/uapi)
+endif()
include_directories(${PROJECT_SOURCE_DIR}/tests/python/include)
add_executable(test_static test_static.c)
--
2.37.3

View File

@ -1,58 +0,0 @@
From 64f9c355a62f78000270d025b479b7eeba7349e9 Mon Sep 17 00:00:00 2001
From: Jerome Marchand <jmarchan@redhat.com>
Date: Wed, 11 Jan 2023 16:46:32 +0100
Subject: [PATCH] killsnoop: add missing -s and -T options to the synopsis
The -s option is missing from the synopsis of the killsnoop manpage,
example file and the comment on top of the tool itself.
Also, -T option is missing from the example file.
Signed-off-by: Jerome Marchand <jmarchan@redhat.com>
---
man/man8/killsnoop.8 | 2 +-
tools/killsnoop.py | 2 +-
tools/killsnoop_example.txt | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/man/man8/killsnoop.8 b/man/man8/killsnoop.8
index 3f63d2ee..cb2a975e 100644
--- a/man/man8/killsnoop.8
+++ b/man/man8/killsnoop.8
@@ -2,7 +2,7 @@
.SH NAME
killsnoop \- Trace signals issued by the kill() syscall. Uses Linux eBPF/bcc.
.SH SYNOPSIS
-.B killsnoop [\-h] [\-x] [-p PID] [-T PID]
+.B killsnoop [\-h] [\-x] [-p PID] [-T PID] [-s SIGNAL]
.SH DESCRIPTION
killsnoop traces the kill() syscall, to show signals sent via this method. This
may be useful to troubleshoot failing applications, where an unknown mechanism
diff --git a/tools/killsnoop.py b/tools/killsnoop.py
index c0166f1d..9cce8dcc 100755
--- a/tools/killsnoop.py
+++ b/tools/killsnoop.py
@@ -4,7 +4,7 @@
# killsnoop Trace signals issued by the kill() syscall.
# For Linux, uses BCC, eBPF. Embedded C.
#
-# USAGE: killsnoop [-h] [-x] [-p PID] [-T PID]
+# USAGE: killsnoop [-h] [-x] [-p PID] [-T PID] [-s SIGNAL]
#
# Copyright (c) 2015 Brendan Gregg.
# Licensed under the Apache License, Version 2.0 (the "License")
diff --git a/tools/killsnoop_example.txt b/tools/killsnoop_example.txt
index 904fe6ef..97c3ad70 100644
--- a/tools/killsnoop_example.txt
+++ b/tools/killsnoop_example.txt
@@ -19,7 +19,7 @@ The second line showed the same signal sent, this time resulting in a -3
USAGE message:
# ./killsnoop -h
-usage: killsnoop [-h] [-x] [-p PID]
+usage: killsnoop [-h] [-x] [-p PID] [-T PID] [-s SIGNAL]
Trace signals issued by the kill() syscall
--
2.39.2

View File

@ -1,363 +0,0 @@
From 34f77c4aaaa039fd2ef3d51b8b61db30fc34912f Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Wed, 19 Apr 2023 23:46:53 -0700
Subject: [PATCH] sync with latest libbpf repo
Sync libbpf submodule up to the following commit:
44b0bc9ad70a ci: Regenerate latest vmlinux.h for old kernel CI tests.
Signed-off-by: Yonghong Song <yhs@fb.com>
---
src/cc/compat/linux/virtual_bpf.h | 141 ++++++++++++++++++++++++++----
src/cc/export/helpers.h | 6 +-
2 files changed, 126 insertions(+), 21 deletions(-)
diff --git a/src/cc/compat/linux/virtual_bpf.h b/src/cc/compat/linux/virtual_bpf.h
index be3a4627..a182123e 100644
--- a/src/cc/compat/linux/virtual_bpf.h
+++ b/src/cc/compat/linux/virtual_bpf.h
@@ -1034,6 +1034,7 @@ enum bpf_attach_type {
BPF_PERF_EVENT,
BPF_TRACE_KPROBE_MULTI,
BPF_LSM_CGROUP,
+ BPF_STRUCT_OPS,
__MAX_BPF_ATTACH_TYPE
};
@@ -1109,7 +1110,7 @@ enum bpf_link_type {
*/
#define BPF_F_STRICT_ALIGNMENT (1U << 0)
-/* If BPF_F_ANY_ALIGNMENT is used in BPF_PROF_LOAD command, the
+/* If BPF_F_ANY_ALIGNMENT is used in BPF_PROG_LOAD command, the
* verifier will allow any alignment whatsoever. On platforms
* with strict alignment requirements for loads ands stores (such
* as sparc and mips) the verifier validates that all loads and
@@ -1157,6 +1158,11 @@ enum bpf_link_type {
*/
#define BPF_F_XDP_HAS_FRAGS (1U << 5)
+/* If BPF_F_XDP_DEV_BOUND_ONLY is used in BPF_PROG_LOAD command, the loaded
+ * program becomes device-bound but can access XDP metadata.
+ */
+#define BPF_F_XDP_DEV_BOUND_ONLY (1U << 6)
+
/* link_create.kprobe_multi.flags used in LINK_CREATE command for
* BPF_TRACE_KPROBE_MULTI attach type to create return probe.
*/
@@ -1262,6 +1268,9 @@ enum {
/* Create a map that is suitable to be an inner map with dynamic max entries */
BPF_F_INNER_MAP = (1U << 12),
+
+/* Create a map that will be registered/unregesitered by the backed bpf_link */
+ BPF_F_LINK = (1U << 13),
};
/* Flags for BPF_PROG_QUERY. */
@@ -1399,6 +1408,11 @@ union bpf_attr {
__aligned_u64 fd_array; /* array of FDs */
__aligned_u64 core_relos;
__u32 core_relo_rec_size; /* sizeof(struct bpf_core_relo) */
+ /* output: actual total log contents size (including termintaing zero).
+ * It could be both larger than original log_size (if log was
+ * truncated), or smaller (if log buffer wasn't filled completely).
+ */
+ __u32 log_true_size;
};
struct { /* anonymous struct used by BPF_OBJ_* commands */
@@ -1484,6 +1498,11 @@ union bpf_attr {
__u32 btf_size;
__u32 btf_log_size;
__u32 btf_log_level;
+ /* output: actual total log contents size (including termintaing zero).
+ * It could be both larger than original log_size (if log was
+ * truncated), or smaller (if log buffer wasn't filled completely).
+ */
+ __u32 btf_log_true_size;
};
struct {
@@ -1503,7 +1522,10 @@ union bpf_attr {
} task_fd_query;
struct { /* struct used by BPF_LINK_CREATE command */
- __u32 prog_fd; /* eBPF program to attach */
+ union {
+ __u32 prog_fd; /* eBPF program to attach */
+ __u32 map_fd; /* struct_ops to attach */
+ };
union {
__u32 target_fd; /* object to attach to */
__u32 target_ifindex; /* target ifindex */
@@ -1544,12 +1566,23 @@ union bpf_attr {
struct { /* struct used by BPF_LINK_UPDATE command */
__u32 link_fd; /* link fd */
- /* new program fd to update link with */
- __u32 new_prog_fd;
+ union {
+ /* new program fd to update link with */
+ __u32 new_prog_fd;
+ /* new struct_ops map fd to update link with */
+ __u32 new_map_fd;
+ };
__u32 flags; /* extra flags */
- /* expected link's program fd; is specified only if
- * BPF_F_REPLACE flag is set in flags */
- __u32 old_prog_fd;
+ union {
+ /* expected link's program fd; is specified only if
+ * BPF_F_REPLACE flag is set in flags.
+ */
+ __u32 old_prog_fd;
+ /* expected link's map fd; is specified only
+ * if BPF_F_REPLACE flag is set.
+ */
+ __u32 old_map_fd;
+ };
} link_update;
struct {
@@ -1643,17 +1676,17 @@ union bpf_attr {
* Description
* This helper is a "printk()-like" facility for debugging. It
* prints a message defined by format *fmt* (of size *fmt_size*)
- * to file *\/sys/kernel/debug/tracing/trace* from DebugFS, if
+ * to file *\/sys/kernel/tracing/trace* from TraceFS, if
* available. It can take up to three additional **u64**
* arguments (as an eBPF helpers, the total number of arguments is
* limited to five).
*
* Each time the helper is called, it appends a line to the trace.
- * Lines are discarded while *\/sys/kernel/debug/tracing/trace* is
- * open, use *\/sys/kernel/debug/tracing/trace_pipe* to avoid this.
+ * Lines are discarded while *\/sys/kernel/tracing/trace* is
+ * open, use *\/sys/kernel/tracing/trace_pipe* to avoid this.
* The format of the trace is customizable, and the exact output
* one will get depends on the options set in
- * *\/sys/kernel/debug/tracing/trace_options* (see also the
+ * *\/sys/kernel/tracing/trace_options* (see also the
* *README* file under the same directory). However, it usually
* defaults to something like:
*
@@ -2002,6 +2035,9 @@ union bpf_attr {
* sending the packet. This flag was added for GRE
* encapsulation, but might be used with other protocols
* as well in the future.
+ * **BPF_F_NO_TUNNEL_KEY**
+ * Add a flag to tunnel metadata indicating that no tunnel
+ * key should be set in the resulting tunnel header.
*
* Here is a typical usage on the transmit path:
*
@@ -2645,6 +2681,11 @@ union bpf_attr {
* Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the
* L2 type as Ethernet.
*
+ * * **BPF_F_ADJ_ROOM_DECAP_L3_IPV4**,
+ * **BPF_F_ADJ_ROOM_DECAP_L3_IPV6**:
+ * Indicate the new IP header version after decapsulating the outer
+ * IP header. Used when the inner and outer IP versions are different.
+ *
* A call to this helper is susceptible to change the underlying
* packet buffer. Therefore, at load time, all checks on pointers
* previously done by the verifier are invalidated and must be
@@ -2789,7 +2830,7 @@ union bpf_attr {
*
* long bpf_perf_prog_read_value(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, u32 buf_size)
* Description
- * For en eBPF program attached to a perf event, retrieve the
+ * For an eBPF program attached to a perf event, retrieve the
* value of the event counter associated to *ctx* and store it in
* the structure pointed by *buf* and of size *buf_size*. Enabled
* and running times are also stored in the structure (see
@@ -3122,6 +3163,11 @@ union bpf_attr {
* **BPF_FIB_LOOKUP_OUTPUT**
* Perform lookup from an egress perspective (default is
* ingress).
+ * **BPF_FIB_LOOKUP_SKIP_NEIGH**
+ * Skip the neighbour table lookup. *params*->dmac
+ * and *params*->smac will not be set as output. A common
+ * use case is to call **bpf_redirect_neigh**\ () after
+ * doing **bpf_fib_lookup**\ ().
*
* *ctx* is either **struct xdp_md** for XDP programs or
* **struct sk_buff** tc cls_act programs.
@@ -4952,6 +4998,12 @@ union bpf_attr {
* different maps if key/value layout matches across maps.
* Every bpf_timer_set_callback() can have different callback_fn.
*
+ * *flags* can be one of:
+ *
+ * **BPF_F_TIMER_ABS**
+ * Start the timer in absolute expire value instead of the
+ * default relative one.
+ *
* Return
* 0 on success.
* **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier
@@ -5294,7 +5346,7 @@ union bpf_attr {
* Return
* Nothing. Always succeeds.
*
- * long bpf_dynptr_read(void *dst, u32 len, struct bpf_dynptr *src, u32 offset, u64 flags)
+ * long bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr *src, u32 offset, u64 flags)
* Description
* Read *len* bytes from *src* into *dst*, starting from *offset*
* into *src*.
@@ -5304,22 +5356,36 @@ union bpf_attr {
* of *src*'s data, -EINVAL if *src* is an invalid dynptr or if
* *flags* is not 0.
*
- * long bpf_dynptr_write(struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags)
+ * long bpf_dynptr_write(const struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags)
* Description
* Write *len* bytes from *src* into *dst*, starting from *offset*
* into *dst*.
- * *flags* is currently unused.
+ *
+ * *flags* must be 0 except for skb-type dynptrs.
+ *
+ * For skb-type dynptrs:
+ * * All data slices of the dynptr are automatically
+ * invalidated after **bpf_dynptr_write**\ (). This is
+ * because writing may pull the skb and change the
+ * underlying packet buffer.
+ *
+ * * For *flags*, please see the flags accepted by
+ * **bpf_skb_store_bytes**\ ().
* Return
* 0 on success, -E2BIG if *offset* + *len* exceeds the length
* of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst*
- * is a read-only dynptr or if *flags* is not 0.
+ * is a read-only dynptr or if *flags* is not correct. For skb-type dynptrs,
+ * other errors correspond to errors returned by **bpf_skb_store_bytes**\ ().
*
- * void *bpf_dynptr_data(struct bpf_dynptr *ptr, u32 offset, u32 len)
+ * void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u32 offset, u32 len)
* Description
* Get a pointer to the underlying dynptr data.
*
* *len* must be a statically known value. The returned data slice
* is invalidated whenever the dynptr is invalidated.
+ *
+ * skb and xdp type dynptrs may not use bpf_dynptr_data. They should
+ * instead use bpf_dynptr_slice and bpf_dynptr_slice_rdwr.
* Return
* Pointer to the underlying dynptr data, NULL if the dynptr is
* read-only, if the dynptr is invalid, or if the offset and length
@@ -5415,7 +5481,7 @@ union bpf_attr {
* Drain samples from the specified user ring buffer, and invoke
* the provided callback for each such sample:
*
- * long (\*callback_fn)(struct bpf_dynptr \*dynptr, void \*ctx);
+ * long (\*callback_fn)(const struct bpf_dynptr \*dynptr, void \*ctx);
*
* If **callback_fn** returns 0, the helper will continue to try
* and drain the next sample, up to a maximum of
@@ -5765,6 +5831,7 @@ enum {
BPF_F_ZERO_CSUM_TX = (1ULL << 1),
BPF_F_DONT_FRAGMENT = (1ULL << 2),
BPF_F_SEQ_NUMBER = (1ULL << 3),
+ BPF_F_NO_TUNNEL_KEY = (1ULL << 4),
};
/* BPF_FUNC_skb_get_tunnel_key flags. */
@@ -5804,6 +5871,8 @@ enum {
BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4),
BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5),
BPF_F_ADJ_ROOM_ENCAP_L2_ETH = (1ULL << 6),
+ BPF_F_ADJ_ROOM_DECAP_L3_IPV4 = (1ULL << 7),
+ BPF_F_ADJ_ROOM_DECAP_L3_IPV6 = (1ULL << 8),
};
enum {
@@ -6339,6 +6408,9 @@ struct bpf_link_info {
struct {
__u32 ifindex;
} xdp;
+ struct {
+ __u32 map_id;
+ } struct_ops;
};
} __attribute__((aligned(8)));
@@ -6735,6 +6807,7 @@ struct bpf_raw_tracepoint_args {
enum {
BPF_FIB_LOOKUP_DIRECT = (1U << 0),
BPF_FIB_LOOKUP_OUTPUT = (1U << 1),
+ BPF_FIB_LOOKUP_SKIP_NEIGH = (1U << 2),
};
enum {
@@ -6902,6 +6975,21 @@ struct bpf_list_node {
__u64 :64;
} __attribute__((aligned(8)));
+struct bpf_rb_root {
+ __u64 :64;
+ __u64 :64;
+} __attribute__((aligned(8)));
+
+struct bpf_rb_node {
+ __u64 :64;
+ __u64 :64;
+ __u64 :64;
+} __attribute__((aligned(8)));
+
+struct bpf_refcount {
+ __u32 :32;
+} __attribute__((aligned(4)));
+
struct bpf_sysctl {
__u32 write; /* Sysctl is being read (= 0) or written (= 1).
* Allows 1,2,4-byte read, but no write.
@@ -7051,5 +7139,22 @@ struct bpf_core_relo {
enum bpf_core_relo_kind kind;
};
+/*
+ * Flags to control bpf_timer_start() behaviour.
+ * - BPF_F_TIMER_ABS: Timeout passed is absolute time, by default it is
+ * relative to current time.
+ */
+enum {
+ BPF_F_TIMER_ABS = (1ULL << 0),
+};
+
+/* BPF numbers iterator state */
+struct bpf_iter_num {
+ /* opaque iterator state; having __u64 here allows to preserve correct
+ * alignment requirements in vmlinux.h, generated from BTF
+ */
+ __u64 __opaque[1];
+} __attribute__((aligned(8)));
+
#endif /* _UAPI__LINUX_BPF_H__ */
)********"
diff --git a/src/cc/export/helpers.h b/src/cc/export/helpers.h
index d7b869e0..e989440a 100644
--- a/src/cc/export/helpers.h
+++ b/src/cc/export/helpers.h
@@ -1006,13 +1006,13 @@ static void (*bpf_ringbuf_submit_dynptr)(struct bpf_dynptr *ptr, __u64 flags) =
(void *)BPF_FUNC_ringbuf_submit_dynptr;
static void (*bpf_ringbuf_discard_dynptr)(struct bpf_dynptr *ptr, __u64 flags) =
(void *)BPF_FUNC_ringbuf_discard_dynptr;
-static long (*bpf_dynptr_read)(void *dst, __u32 len, struct bpf_dynptr *src, __u32 offset,
+static long (*bpf_dynptr_read)(void *dst, __u32 len, const struct bpf_dynptr *src, __u32 offset,
__u64 flags) =
(void *)BPF_FUNC_dynptr_read;
-static long (*bpf_dynptr_write)(struct bpf_dynptr *dst, __u32 offset, void *src, __u32 len,
+static long (*bpf_dynptr_write)(const struct bpf_dynptr *dst, __u32 offset, void *src, __u32 len,
__u64 flags) =
(void *)BPF_FUNC_dynptr_write;
-static void *(*bpf_dynptr_data)(struct bpf_dynptr *ptr, __u32 offset, __u32 len) =
+static void *(*bpf_dynptr_data)(const struct bpf_dynptr *ptr, __u32 offset, __u32 len) =
(void *)BPF_FUNC_dynptr_data;
static __s64 (*bpf_tcp_raw_gen_syncookie_ipv4)(struct iphdr *iph, struct tcphdr *th,
__u32 th_len) =
--
2.41.0

View File

@ -1,71 +0,0 @@
From cc35f70515cb0f3b8032b8fb68f9f37a844e74c8 Mon Sep 17 00:00:00 2001
From: Rong Tao <rongtao@cestc.cn>
Date: Fri, 10 Feb 2023 23:28:55 +0800
Subject: [PATCH] tools/compactsnoop.py: Fix raw_tracepoint Invalid argument
error
Kernel commit abd4349ff9b8 ("mm: compaction: cleanup the compaction trace
events") changed the arguments of 'mm_compaction_begin' from (start_pfn,
migrate_pfn, free_pfn, end_pfn, sync) to (cc, start_pfn, end_pfn, sync),
and changed the arguments of 'mm_compaction_end' from (start_pfn,
migrate_pfn, free_pfn, end_pfn, sync, ret) to (cc, start_pfn, end_pfn,
sync, ret).
Replacing RAW_TRACEPOINT_PROBE with TRACEPOINT_PROBE solves this problem
and guarantees compatibility.
$ sudo ./compactsnoop.py
bpf_attach_raw_tracepoint (mm_compaction_begin): Invalid argument
Traceback (most recent call last):
File "/home/sdb/Git/bcc/tools/./compactsnoop.py", line 292, in <module>
b = BPF(text=bpf_text)
^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.11/site-packages/bcc/__init__.py", line 483, in __init__
self._trace_autoload()
File "/usr/lib/python3.11/site-packages/bcc/__init__.py", line 1462, in _trace_autoload
self.attach_raw_tracepoint(tp=tp, fn_name=fn.name)
File "/usr/lib/python3.11/site-packages/bcc/__init__.py", line 1055, in attach_raw_tracepoint
raise Exception("Failed to attach BPF to raw tracepoint")
Exception: Failed to attach BPF to raw tracepoint
Signed-off-by: Rong Tao <rongtao@cestc.cn>
---
tools/compactsnoop.py | 13 ++++---------
1 file changed, 4 insertions(+), 9 deletions(-)
diff --git a/tools/compactsnoop.py b/tools/compactsnoop.py
index 2643e8ed..2b395dec 100755
--- a/tools/compactsnoop.py
+++ b/tools/compactsnoop.py
@@ -237,11 +237,9 @@ RAW_TRACEPOINT_PROBE(mm_compaction_suitable)
return 0;
}
-RAW_TRACEPOINT_PROBE(mm_compaction_begin)
+TRACEPOINT_PROBE(compaction, mm_compaction_begin)
{
- // TP_PROTO(unsigned long zone_start, unsigned long migrate_pfn,
- // unsigned long free_pfn, unsigned long zone_end, bool sync)
- bool sync = (bool)ctx->args[4];
+ bool sync = args->sync;
u64 id = bpf_get_current_pid_tgid();
struct val_t *valp = start.lookup(&id);
@@ -255,12 +253,9 @@ RAW_TRACEPOINT_PROBE(mm_compaction_begin)
return 0;
}
-RAW_TRACEPOINT_PROBE(mm_compaction_end)
+TRACEPOINT_PROBE(compaction, mm_compaction_end)
{
- // TP_PROTO(unsigned long zone_start, unsigned long migrate_pfn,
- // unsigned long free_pfn, unsigned long zone_end, bool sync,
- // int status)
- submit_event(ctx, ctx->args[5]);
+ submit_event(args, args->status);
return 0;
}
"""
--
2.39.2

View File

@ -1,76 +0,0 @@
From 0e9384ec4c88d2da2d23475f58ec9bff7eb48639 Mon Sep 17 00:00:00 2001
From: Jerome Marchand <jmarchan@redhat.com>
Date: Tue, 25 Apr 2023 16:04:05 +0200
Subject: [PATCH] tools/deadlock: Add an option to set the maximum number of
stack traces
Commit 77f5252d ("tools/deadlock: support specifies maxnum of threads
and edge cases (#3455)") allows setting the maximum number of threads
and edge cases in order to reduce the memory usage of the deadlock
tool. It however leaves the size of the map of stack traces fixed. Its
current size, 640k (actually rounded up to 1M), takes 1GB of vmalloced
kernel memory.
This patch adds an option to make the maximum number of stack traces
user defined. It also sets the default value to 64k, in line with the
current default for the number of edge cases and threads.
It fixes the following issue on systems with limited memory resources:
could not open bpf map: stack_traces, error: Cannot allocate memory
Traceback (most recent call last):
File "/tmp/./deadlock.py", line 577, in <module>
main()
File "/tmp/./deadlock.py", line 489, in main
bpf = BPF(text=text)
File "/usr/lib/python3.9/site-packages/bcc/__init__.py", line 479, in __init__
raise Exception("Failed to compile BPF module %s" % (src_file or "<text>"))
Exception: Failed to compile BPF module <text>
Signed-off-by: Jerome Marchand <jmarchan@redhat.com>
---
tools/deadlock.c | 2 +-
tools/deadlock.py | 8 ++++++++
2 files changed, 9 insertions(+), 1 deletion(-)
diff --git a/tools/deadlock.c b/tools/deadlock.c
index 006dc121..6ae405ba 100644
--- a/tools/deadlock.c
+++ b/tools/deadlock.c
@@ -60,7 +60,7 @@ struct thread_created_leaf_t {
BPF_HASH(thread_to_parent, u32, struct thread_created_leaf_t);
// Stack traces when threads are created and when mutexes are locked/unlocked.
-BPF_STACK_TRACE(stack_traces, 655360);
+BPF_STACK_TRACE(stack_traces, MAX_TRACES);
// The first argument to the user space function we are tracing
// is a pointer to the mutex M held by thread T.
diff --git a/tools/deadlock.py b/tools/deadlock.py
index 12de099f..f7eb4ce0 100755
--- a/tools/deadlock.py
+++ b/tools/deadlock.py
@@ -467,6 +467,13 @@ import time
help='Specifies the maximum number of edge cases that can be recorded. '
'default 65536. Note. 88 bytes per edge case.'
)
+ parser.add_argument(
+ '-s', '--stacktraces', type=int, default=65536,
+ help='Specifies the maximum number of stack traces that can be recorded. '
+ 'This number is rounded up to the next power of two.'
+ 'default 65536. Note. 1 kbytes vmalloced per stack trace.'
+ )
+
args = parser.parse_args()
if not args.binary:
try:
@@ -479,6 +486,7 @@ import time
text = f.read()
text = text.replace('MAX_THREADS', str(args.threads));
text = text.replace('MAX_EDGES', str(args.edges));
+ text = text.replace('MAX_TRACES', str(args.stacktraces));
bpf = BPF(text=text)
# Trace where threads are created
--
2.39.2

View File

@ -1,57 +0,0 @@
From 29f0fa3693d679102680fece9ed5e606e291c5fa Mon Sep 17 00:00:00 2001
From: Jerome Marchand <jmarchan@redhat.com>
Date: Fri, 7 Apr 2023 14:30:54 +0200
Subject: [PATCH] tools/funcslower: fix printing of folded stacks
When trying to print folded stack, funcslower tries to join bytes to a
string. Let's perform that operation with bytes only, and decode
before printing.
Also, decode symbols name before printing for the default stack
format, to avoid unsightly b'xxx' output.
It fixes the following error:
Exception ignored on calling ctypes callback function: <function PerfEventArray._open_perf_buffer.<locals>.raw_cb_ at 0x7f200541e5e0>
Traceback (most recent call last):
File "/usr/lib/python3.9/site-packages/bcc/table.py", line 982, in raw_cb_
callback(cpu, data, size)
File "/usr/share/bcc/tools/funcslower", line 340, in print_event
print_stack(event)
File "/usr/share/bcc/tools/funcslower", line 324, in print_stack
print("%s %d" % (";".join(line), 1))
TypeError: sequence item 1: expected str instance, bytes found
Signed-off-by: Jerome Marchand <jmarchan@redhat.com>
---
tools/funcslower.py | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/tools/funcslower.py b/tools/funcslower.py
index 6df7f24c..4b3798a0 100755
--- a/tools/funcslower.py
+++ b/tools/funcslower.py
@@ -317,17 +317,17 @@ earliest_ts = 0
# print folded stack output
user_stack = list(user_stack)
kernel_stack = list(kernel_stack)
- line = [event.comm.decode('utf-8', 'replace')] + \
+ line = [event.comm] + \
[b.sym(addr, event.tgid_pid) for addr in reversed(user_stack)] + \
(do_delimiter and ["-"] or []) + \
[b.ksym(addr) for addr in reversed(kernel_stack)]
- print("%s %d" % (";".join(line), 1))
+ print("%s %d" % (b';'.join(line).decode('utf-8', 'replace'), 1))
else:
# print default multi-line stack output.
for addr in kernel_stack:
- print(" %s" % b.ksym(addr))
+ print(" %s" % b.ksym(addr).decode('utf-8', 'replace'))
for addr in user_stack:
- print(" %s" % b.sym(addr, event.tgid_pid))
+ print(" %s" % b.sym(addr, event.tgid_pid).decode('utf-8', 'replace'))
def print_event(cpu, data, size):
event = b["events"].event(data)
--
2.39.2

View File

@ -1,85 +0,0 @@
From 9965f8397950d8aa1bc1a5decbc2250d0627a798 Mon Sep 17 00:00:00 2001
From: Rong Tao <rongtao@cestc.cn>
Date: Fri, 10 Feb 2023 22:16:56 +0800
Subject: [PATCH] tools/nfsslower.py: Fix uninitialized struct pad error
The verifier is unhappy, if data struct _pad_ is not initialized, see [0][1].
$ sudo ./nfsslower.py
...
; bpf_perf_event_output(ctx, (void *)bpf_pseudo_fd(1, -2), CUR_CPU_IDENTIFIER, &data, sizeof(data));
83: (79) r1 = *(u64 *)(r10 -144) ; R1_w=ctx(off=0,imm=0) R10=fp0
84: (18) r3 = 0xffffffff ; R3_w=4294967295
86: (b7) r5 = 96 ; R5_w=96
87: (85) call bpf_perf_event_output#25
invalid indirect read from stack R4 off -136+92 size 96
processed 84 insns (limit 1000000) max_states_per_insn 0 total_states 4 peak_states 4 mark_read 4
...
raise Exception("Failed to load BPF program %s: %s" %
Exception: Failed to load BPF program b'raw_tracepoint__nfs_commit_done': Permission denied
[0] https://github.com/iovisor/bcc/issues/2623
[1] https://github.com/iovisor/bcc/pull/4453
Signed-off-by: Rong Tao <rongtao@cestc.cn>
---
tools/nfsslower.py | 29 +++++++++++++++++++++--------
1 file changed, 21 insertions(+), 8 deletions(-)
diff --git a/tools/nfsslower.py b/tools/nfsslower.py
index 34756f72..99f63f0f 100755
--- a/tools/nfsslower.py
+++ b/tools/nfsslower.py
@@ -195,8 +195,11 @@ static int trace_exit(struct pt_regs *ctx, int type)
// populate output struct
u32 size = PT_REGS_RC(ctx);
- struct data_t data = {.type = type, .size = size, .delta_us = delta_us,
- .pid = pid};
+ struct data_t data = {};
+ data.type = type;
+ data.size = size;
+ data.delta_us = delta_us;
+ data.pid = pid;
data.ts_us = ts / 1000;
data.offset = valp->offset;
bpf_get_current_comm(&data.task, sizeof(data.task));
@@ -280,9 +283,14 @@ RAW_TRACEPOINT_PROBE(nfs_commit_done)
u64 ts = bpf_ktime_get_ns();
u64 delta_us = (ts - cp->ts) / 1000;
u32 pid = bpf_get_current_pid_tgid() >> 32;
- struct data_t data = {.type = TRACE_COMMIT, .offset = cp->offset,
- .size = cp->count, .ts_us = ts/1000, .delta_us = delta_us,
- .pid = pid};
+
+ struct data_t data = {};
+ data.type = TRACE_COMMIT;
+ data.offset = cp->offset;
+ data.size = cp->count;
+ data.ts_us = ts/1000;
+ data.delta_us = delta_us;
+ data.pid = pid;
commitinfo.delete(&key);
bpf_get_current_comm(&data.task, sizeof(data.task));
@@ -325,9 +333,14 @@ int trace_nfs_commit_done(struct pt_regs *ctx, void *task, void *calldata)
u64 ts = bpf_ktime_get_ns();
u64 delta_us = (ts - cp->ts) / 1000;
u32 pid = bpf_get_current_pid_tgid() >> 32;
- struct data_t data = {.type = TRACE_COMMIT, .offset = cp->offset,
- .size = cp->count, .ts_us = ts/1000, .delta_us = delta_us,
- .pid = pid};
+
+ struct data_t data = {};
+ data.type = TRACE_COMMIT;
+ data.offset = cp->offset;
+ data.size = cp->count;
+ data.ts_us = ts/1000;
+ data.delta_us = delta_us;
+ data.pid = pid;
commitinfo.delete(&key);
bpf_get_current_comm(&data.task, sizeof(data.task));
--
2.39.2

View File

@ -1,159 +0,0 @@
From 02fce045ce02fe81d8649ce63ce81d5cdf3e3a72 Mon Sep 17 00:00:00 2001
From: Rong Tao <rongtao@cestc.cn>
Date: Mon, 30 Jan 2023 17:39:35 +0800
Subject: [PATCH] tools/readahead: Fix: Failed to attach BPF program
entry__do_page_cache_readahead
Commit 56a4d67c264e ("mm/readahead: Switch to page_cache_ra_order") switched
do_page_cache_ra() to page_cache_ra_order() (since v5.17), and commit bb3c579e25e5
("mm/filemap: Add filemap_alloc_folio") replaced __page_cache_alloc() with
filemap_alloc_folio() (since v5.15), so the functions this tool probes may no longer exist.
Reproduce the error (fedora37, 6.1.7-200.fc37.aarch64):
$ sudo ./readahead.py
cannot attach kprobe, probe entry may not exist
Traceback (most recent call last):
File "/home/rongtao/Git/bcc/tools/./readahead.py", line 159, in <module>
b.attach_kprobe(event=ra_event, fn_name="entry__do_page_cache_readahead")
File "/usr/lib/python3.11/site-packages/bcc/__init__.py", line 840, in attach_kprobe
raise Exception("Failed to attach BPF program %s to kprobe %s" %
Exception: Failed to attach BPF program b'entry__do_page_cache_readahead' to kprobe b'do_page_cache_ra'
Signed-off-by: Rong Tao <rongtao@cestc.cn>
---
tools/readahead.py | 69 +++++++++++++++++++++++++++++++++++++---------
1 file changed, 56 insertions(+), 13 deletions(-)
diff --git a/tools/readahead.py b/tools/readahead.py
index f2afdcb3..adad2ea8 100755
--- a/tools/readahead.py
+++ b/tools/readahead.py
@@ -12,6 +12,7 @@
#
# 20-Aug-2020 Suchakra Sharma Ported from bpftrace to BCC
# 17-Sep-2021 Hengqi Chen Migrated to kfunc
+# 30-Jan-2023 Rong Tao Support more kfunc/kprobe, introduce folio
from __future__ import print_function
from bcc import BPF
@@ -38,6 +39,7 @@ args = parser.parse_args()
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/mm_types.h>
+#include <linux/mm.h>
BPF_HASH(flag, u32, u8); // used to track if we are in do_page_cache_readahead()
BPF_HASH(birth, struct page*, u64); // used to track timestamps of cache alloc'ed page
@@ -65,7 +67,7 @@ int exit__do_page_cache_readahead(struct pt_regs *ctx) {
int exit__page_cache_alloc(struct pt_regs *ctx) {
u32 pid;
u64 ts;
- struct page *retval = (struct page*) PT_REGS_RC(ctx);
+ struct page *retval = (struct page*) GET_RETVAL_PAGE;
u32 zero = 0; // static key for accessing pages[0]
pid = bpf_get_current_pid_tgid();
u8 *f = flag.lookup(&pid);
@@ -111,6 +113,23 @@ KRETFUNC_PROBE(RA_FUNC)
return 0;
}
+KFUNC_PROBE(mark_page_accessed, struct page *arg0)
+{
+ u64 ts, delta;
+ u32 zero = 0; // static key for accessing pages[0]
+ u64 *bts = birth.lookup(&arg0);
+
+ if (bts != NULL) {
+ delta = bpf_ktime_get_ns() - *bts;
+ dist.atomic_increment(bpf_log2l(delta/1000000));
+ pages.atomic_increment(zero, -1);
+ birth.delete(&arg0); // remove the entry from hashmap
+ }
+ return 0;
+}
+"""
+
+bpf_text_kfunc_cache_alloc_ret_page = """
KRETFUNC_PROBE(__page_cache_alloc, gfp_t gfp, struct page *retval)
{
u64 ts;
@@ -125,18 +144,22 @@ KRETFUNC_PROBE(__page_cache_alloc, gfp_t gfp, struct page *retval)
}
return 0;
}
+"""
-KFUNC_PROBE(mark_page_accessed, struct page *arg0)
+bpf_text_kfunc_cache_alloc_ret_folio = """
+KRETFUNC_PROBE(filemap_alloc_folio, gfp_t gfp, unsigned int order,
+ struct folio *retval)
{
- u64 ts, delta;
+ u64 ts;
u32 zero = 0; // static key for accessing pages[0]
- u64 *bts = birth.lookup(&arg0);
+ u32 pid = bpf_get_current_pid_tgid();
+ u8 *f = flag.lookup(&pid);
+ struct page *page = folio_page(retval, 0);
- if (bts != NULL) {
- delta = bpf_ktime_get_ns() - *bts;
- dist.atomic_increment(bpf_log2l(delta/1000000));
- pages.atomic_increment(zero, -1);
- birth.delete(&arg0); // remove the entry from hashmap
+ if (f != NULL && *f == 1) {
+ ts = bpf_ktime_get_ns();
+ birth.update(&page, &ts);
+ pages.atomic_increment(zero);
}
return 0;
}
@@ -145,20 +168,40 @@ KFUNC_PROBE(mark_page_accessed, struct page *arg0)
if BPF.support_kfunc():
if BPF.get_kprobe_functions(b"__do_page_cache_readahead"):
ra_func = "__do_page_cache_readahead"
- else:
+ elif BPF.get_kprobe_functions(b"do_page_cache_ra"):
ra_func = "do_page_cache_ra"
+ elif BPF.get_kprobe_functions(b"page_cache_ra_order"):
+ ra_func = "page_cache_ra_order"
+ else:
+ print("Not found any kfunc.")
+ exit()
bpf_text += bpf_text_kfunc.replace("RA_FUNC", ra_func)
+ if BPF.get_kprobe_functions(b"__page_cache_alloc"):
+ bpf_text += bpf_text_kfunc_cache_alloc_ret_page
+ else:
+ bpf_text += bpf_text_kfunc_cache_alloc_ret_folio
b = BPF(text=bpf_text)
else:
bpf_text += bpf_text_kprobe
- b = BPF(text=bpf_text)
if BPF.get_kprobe_functions(b"__do_page_cache_readahead"):
ra_event = "__do_page_cache_readahead"
- else:
+ elif BPF.get_kprobe_functions(b"do_page_cache_ra"):
ra_event = "do_page_cache_ra"
+ elif BPF.get_kprobe_functions(b"page_cache_ra_order"):
+ ra_event = "page_cache_ra_order"
+ else:
+ print("Not found any kprobe.")
+ exit()
+ if BPF.get_kprobe_functions(b"__page_cache_alloc"):
+ cache_func = "__page_cache_alloc"
+ bpf_text = bpf_text.replace('GET_RETVAL_PAGE', 'PT_REGS_RC(ctx)')
+ else:
+ cache_func = "filemap_alloc_folio"
+ bpf_text = bpf_text.replace('GET_RETVAL_PAGE', 'folio_page((struct folio *)PT_REGS_RC(ctx), 0)')
+ b = BPF(text=bpf_text)
b.attach_kprobe(event=ra_event, fn_name="entry__do_page_cache_readahead")
b.attach_kretprobe(event=ra_event, fn_name="exit__do_page_cache_readahead")
- b.attach_kretprobe(event="__page_cache_alloc", fn_name="exit__page_cache_alloc")
+ b.attach_kretprobe(event=cache_func, fn_name="exit__page_cache_alloc")
b.attach_kprobe(event="mark_page_accessed", fn_name="entry_mark_page_accessed")
# header
--
2.39.1

View File

@ -1,140 +0,0 @@
From 533db3453a09695f79368792cdd5fbe2ddeaa55e Mon Sep 17 00:00:00 2001
From: Rong Tao <rongtao@cestc.cn>
Date: Sun, 22 Jan 2023 15:44:46 +0800
Subject: [PATCH] tools/slabratetop: Fix error: incomplete definition of type
'struct slab'
Kernel commit 40f3bf0cb04c ("mm: Convert struct page to struct slab in functions
used by other subsystems") introduced the slab_address() function, and commit
6e48a966dfd1 ("mm/kasan: Convert to struct folio and struct slab") made
linux/kasan.h depend on the slab struct. This leads to the following problems:
$ sudo ./slabratetop.py
In file included from /virtual/main.c:13:
include/linux/slub_def.h:162:26: warning: call to undeclared function 'slab_address';
ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
void *object = x - (x - slab_address(slab)) % cache->size;
^
include/linux/slub_def.h:162:46: error: invalid operands to binary expression ('void *' and 'unsigned int')
void *object = x - (x - slab_address(slab)) % cache->size;
~~~~~~~~~~~~~~~~~~~~~~~~ ^ ~~~~~~~~~~~
include/linux/slub_def.h:164:8: error: incomplete definition of type 'struct slab'
(slab->objects - 1) * cache->size;
~~~~^
include/linux/kasan.h:13:8: note: forward declaration of 'struct slab'
struct slab;
^
...
At first, I wanted to fix this with a kernel patch [1]. However, as bcc is a
downstream project of the kernel, this issue should be solved inside the bcc
project. This was agreed on by the kernel maintainer and bcc maintainer @yonghong-song.
This solution was provided by @yonghong-song [0].
[0] https://github.com/iovisor/bcc/issues/4438
[1] https://lore.kernel.org/all/tencent_ABA832E296819D1053D6C625ADCAF76BC706@qq.com/
Signed-off-by: Rong Tao <rongtao@cestc.cn>
Signed-off-by: Yonghong Song <yhs@fb.com>
---
tools/slabratetop.py | 76 ++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 76 insertions(+)
diff --git a/tools/slabratetop.py b/tools/slabratetop.py
index ac44b2bd..8fbcac5e 100755
--- a/tools/slabratetop.py
+++ b/tools/slabratetop.py
@@ -14,6 +14,9 @@
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 15-Oct-2016 Brendan Gregg Created this.
+# 23-Jan-2023 Rong Tao Introduce kernel internal data structure and
+# functions to temporarily solve problem for
+# >=5.16(TODO: fix this workaround)
from __future__ import print_function
from bcc import BPF
@@ -65,6 +68,79 @@ bpf_text = """
// 5.9, but it does not hurt to have it here for versions 5.4 to 5.8.
struct memcg_cache_params {};
+// introduce kernel interval slab structure and slab_address() function, solved
+// 'undefined' error for >=5.16. TODO: we should fix this workaround if BCC
+// framework support BTF/CO-RE.
+struct slab {
+ unsigned long __page_flags;
+
+#if defined(CONFIG_SLAB)
+
+ struct kmem_cache *slab_cache;
+ union {
+ struct {
+ struct list_head slab_list;
+ void *freelist; /* array of free object indexes */
+ void *s_mem; /* first object */
+ };
+ struct rcu_head rcu_head;
+ };
+ unsigned int active;
+
+#elif defined(CONFIG_SLUB)
+
+ struct kmem_cache *slab_cache;
+ union {
+ struct {
+ union {
+ struct list_head slab_list;
+#ifdef CONFIG_SLUB_CPU_PARTIAL
+ struct {
+ struct slab *next;
+ int slabs; /* Nr of slabs left */
+ };
+#endif
+ };
+ /* Double-word boundary */
+ void *freelist; /* first free object */
+ union {
+ unsigned long counters;
+ struct {
+ unsigned inuse:16;
+ unsigned objects:15;
+ unsigned frozen:1;
+ };
+ };
+ };
+ struct rcu_head rcu_head;
+ };
+ unsigned int __unused;
+
+#elif defined(CONFIG_SLOB)
+
+ struct list_head slab_list;
+ void *__unused_1;
+ void *freelist; /* first free block */
+ long units;
+ unsigned int __unused_2;
+
+#else
+#error "Unexpected slab allocator configured"
+#endif
+
+ atomic_t __page_refcount;
+#ifdef CONFIG_MEMCG
+ unsigned long memcg_data;
+#endif
+};
+
+// slab_address() will not be used, and NULL will be returned directly, which
+// can avoid adaptation of different kernel versions
+static inline void *slab_address(const struct slab *slab)
+{
+ return NULL;
+}
+
#ifdef CONFIG_SLUB
#include <linux/slub_def.h>
#else
--
2.39.1

View File

@ -0,0 +1,855 @@
From 2e758b65231f976c67a0aad791aabc7927ea7086 Mon Sep 17 00:00:00 2001
From: Jerome Marchand <jmarchan@redhat.com>
Date: Thu, 27 Jul 2023 18:19:18 +0200
Subject: [PATCH] tools: Add support for the new block_io_* tracepoints
The bio tools currently depend on the blk_account_io_done/start functions,
which can be inlined. To fix that, a couple of tracepoints have been
added upstream (block:block_io_start/done). This patch adds support
for those tracepoints when they are available.
Unfortunately, the bio tools rely on data that is not available to
the tracepoints (mostly the struct request), so the tracepoints can't
be used as a drop-in replacement for blk_account_io_*. The main difference
is that we can't use the struct request as the hash key anymore, so the
tools now use the pair (dev_t, sector) for that purpose.
For the biolatency tool, the -F option is disabled when only the
tracepoints are available because the flags are not all accessible
from the tracepoints. Otherwise, all features of the tools should
remain available.
Closes #4261
Signed-off-by: Jerome Marchand <jmarchan@redhat.com>
---
tools/biolatency.py | 166 ++++++++++++++++++++++++++++--------
tools/biosnoop.py | 200 +++++++++++++++++++++++++++++++++-----------
tools/biotop.py | 108 +++++++++++++++++++-----
3 files changed, 371 insertions(+), 103 deletions(-)
diff --git a/tools/biolatency.py b/tools/biolatency.py
index 8fe43a7c..03b48a4c 100755
--- a/tools/biolatency.py
+++ b/tools/biolatency.py
@@ -11,6 +11,7 @@
#
# 20-Sep-2015 Brendan Gregg Created this.
# 31-Mar-2022 Rocky Xing Added disk filter support.
+# 01-Aug-2023 Jerome Marchand Added support for block tracepoints
from __future__ import print_function
from bcc import BPF
@@ -72,7 +73,7 @@ bpf_text = """
#include <linux/blk-mq.h>
typedef struct disk_key {
- char disk[DISK_NAME_LEN];
+ dev_t dev;
u64 slot;
} disk_key_t;
@@ -86,26 +87,70 @@ typedef struct ext_val {
u64 count;
} ext_val_t;
-BPF_HASH(start, struct request *);
+struct tp_args {
+ u64 __unused__;
+ dev_t dev;
+ sector_t sector;
+ unsigned int nr_sector;
+ unsigned int bytes;
+ char rwbs[8];
+ char comm[16];
+ char cmd[];
+};
+
+struct start_key {
+ dev_t dev;
+ u32 _pad;
+ sector_t sector;
+ CMD_FLAGS
+};
+
+BPF_HASH(start, struct start_key);
STORAGE
+static dev_t ddevt(struct gendisk *disk) {
+ return (disk->major << 20) | disk->first_minor;
+}
+
// time block I/O
-int trace_req_start(struct pt_regs *ctx, struct request *req)
+static int __trace_req_start(struct start_key key)
{
DISK_FILTER
u64 ts = bpf_ktime_get_ns();
- start.update(&req, &ts);
+ start.update(&key, &ts);
return 0;
}
+int trace_req_start(struct pt_regs *ctx, struct request *req)
+{
+ struct start_key key = {
+ .dev = ddevt(req->__RQ_DISK__),
+ .sector = req->__sector
+ };
+
+ SET_FLAGS
+
+ return __trace_req_start(key);
+}
+
+int trace_req_start_tp(struct tp_args *args)
+{
+ struct start_key key = {
+ .dev = args->dev,
+ .sector = args->sector
+ };
+
+ return __trace_req_start(key);
+}
+
// output
-int trace_req_done(struct pt_regs *ctx, struct request *req)
+static int __trace_req_done(struct start_key key)
{
u64 *tsp, delta;
// fetch timestamp and calculate delta
- tsp = start.lookup(&req);
+ tsp = start.lookup(&key);
if (tsp == 0) {
return 0; // missed issue
}
@@ -116,9 +161,31 @@ int trace_req_done(struct pt_regs *ctx, struct request *req)
// store as histogram
STORE
- start.delete(&req);
+ start.delete(&key);
return 0;
}
+
+int trace_req_done(struct pt_regs *ctx, struct request *req)
+{
+ struct start_key key = {
+ .dev = ddevt(req->__RQ_DISK__),
+ .sector = req->__sector
+ };
+
+ SET_FLAGS
+
+ return __trace_req_done(key);
+}
+
+int trace_req_done_tp(struct tp_args *args)
+{
+ struct start_key key = {
+ .dev = args->dev,
+ .sector = args->sector
+ };
+
+ return __trace_req_done(key);
+}
"""
# code substitutions
@@ -134,21 +201,18 @@ store_str = ""
if args.disks:
storage_str += "BPF_HISTOGRAM(dist, disk_key_t);"
disks_str = """
- disk_key_t key = {.slot = bpf_log2l(delta)};
- void *__tmp = (void *)req->__RQ_DISK__->disk_name;
- bpf_probe_read(&key.disk, sizeof(key.disk), __tmp);
- dist.atomic_increment(key);
+ disk_key_t dkey = {};
+ dkey.dev = key.dev;
+ dkey.slot = bpf_log2l(delta);
+ dist.atomic_increment(dkey);
"""
- if BPF.kernel_struct_has_field(b'request', b'rq_disk') == 1:
- store_str += disks_str.replace('__RQ_DISK__', 'rq_disk')
- else:
- store_str += disks_str.replace('__RQ_DISK__', 'q->disk')
+ store_str += disks_str
elif args.flags:
storage_str += "BPF_HISTOGRAM(dist, flag_key_t);"
store_str += """
- flag_key_t key = {.slot = bpf_log2l(delta)};
- key.flags = req->cmd_flags;
- dist.atomic_increment(key);
+ flag_key_t fkey = {.slot = bpf_log2l(delta)};
+ fkey.flags = key.flags;
+ dist.atomic_increment(fkey);
"""
else:
storage_str += "BPF_HISTOGRAM(dist);"
@@ -161,21 +225,13 @@ store_str = ""
exit(1)
stat_info = os.stat(disk_path)
- major = os.major(stat_info.st_rdev)
- minor = os.minor(stat_info.st_rdev)
-
- disk_field_str = ""
- if BPF.kernel_struct_has_field(b'request', b'rq_disk') == 1:
- disk_field_str = 'req->rq_disk'
- else:
- disk_field_str = 'req->q->disk'
+ dev = os.major(stat_info.st_rdev) << 20 | os.minor(stat_info.st_rdev)
disk_filter_str = """
- struct gendisk *disk = %s;
- if (!(disk->major == %d && disk->first_minor == %d)) {
+ if(key.dev != %s) {
return 0;
}
- """ % (disk_field_str, major, minor)
+ """ % (dev)
bpf_text = bpf_text.replace('DISK_FILTER', disk_filter_str)
else:
@@ -194,6 +250,16 @@ store_str = ""
bpf_text = bpf_text.replace("STORAGE", storage_str)
bpf_text = bpf_text.replace("STORE", store_str)
+if BPF.kernel_struct_has_field(b'request', b'rq_disk') == 1:
+ bpf_text = bpf_text.replace('__RQ_DISK__', 'rq_disk')
+else:
+ bpf_text = bpf_text.replace('__RQ_DISK__', 'q->disk')
+if args.flags:
+ bpf_text = bpf_text.replace('CMD_FLAGS', 'u64 flags;')
+ bpf_text = bpf_text.replace('SET_FLAGS', 'key.flags = req->cmd_flags;')
+else:
+ bpf_text = bpf_text.replace('CMD_FLAGS', '')
+ bpf_text = bpf_text.replace('SET_FLAGS', '')
if debug or args.ebpf:
print(bpf_text)
@@ -205,25 +271,53 @@ b = BPF(text=bpf_text)
if args.queued:
if BPF.get_kprobe_functions(b'__blk_account_io_start'):
b.attach_kprobe(event="__blk_account_io_start", fn_name="trace_req_start")
- else:
+ elif BPF.get_kprobe_functions(b'blk_account_io_start'):
b.attach_kprobe(event="blk_account_io_start", fn_name="trace_req_start")
+ else:
+ if args.flags:
+ # Some flags are accessible in the rwbs field (RAHEAD, SYNC and META)
+ # but others aren't. Disable the -F option for tracepoints for now.
+ print("ERROR: blk_account_io_start probe not available. Can't use -F.")
+ exit()
+ b.attach_tracepoint(tp="block:block_io_start", fn_name="trace_req_start_tp")
else:
if BPF.get_kprobe_functions(b'blk_start_request'):
b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start")
+
if BPF.get_kprobe_functions(b'__blk_account_io_done'):
b.attach_kprobe(event="__blk_account_io_done", fn_name="trace_req_done")
-else:
+elif BPF.get_kprobe_functions(b'blk_account_io_done'):
b.attach_kprobe(event="blk_account_io_done", fn_name="trace_req_done")
+else:
+ if args.flags:
+ print("ERROR: blk_account_io_done probe not available. Can't use -F.")
+ exit()
+ b.attach_tracepoint(tp="block:block_io_done", fn_name="trace_req_done_tp")
+
if not args.json:
print("Tracing block device I/O... Hit Ctrl-C to end.")
-def disk_print(s):
- disk = s.decode('utf-8', 'replace')
- if not disk:
- disk = "<unknown>"
- return disk
+# cache disk major,minor -> diskname
+diskstats = "/proc/diskstats"
+disklookup = {}
+with open(diskstats) as stats:
+ for line in stats:
+ a = line.split()
+ disklookup[a[0] + "," + a[1]] = a[2]
+
+def disk_print(d):
+ major = d >> 20
+ minor = d & ((1 << 20) - 1)
+
+ disk = str(major) + "," + str(minor)
+ if disk in disklookup:
+ diskname = disklookup[disk]
+ else:
+ diskname = "?"
+
+ return diskname
# see blk_fill_rwbs():
req_opf = {
diff --git a/tools/biosnoop.py b/tools/biosnoop.py
index 33703233..f0fef98b 100755
--- a/tools/biosnoop.py
+++ b/tools/biosnoop.py
@@ -14,6 +14,7 @@
# 11-Feb-2016 Allan McAleavy updated for BPF_PERF_OUTPUT
# 21-Jun-2022 Rocky Xing Added disk filter support.
# 13-Oct-2022 Rocky Xing Added support for displaying block I/O pattern.
+# 01-Aug-2023 Jerome Marchand Added support for block tracepoints
from __future__ import print_function
from bcc import BPF
@@ -64,6 +65,24 @@ struct val_t {
char name[TASK_COMM_LEN];
};
+struct tp_args {
+ u64 __unused__;
+ dev_t dev;
+ sector_t sector;
+ unsigned int nr_sector;
+ unsigned int bytes;
+ char rwbs[8];
+ char comm[16];
+ char cmd[];
+};
+
+struct hash_key {
+ dev_t dev;
+ u32 rwflag;
+ sector_t sector;
+};
+
+
#ifdef INCLUDE_PATTERN
struct sector_key_t {
u32 dev_major;
@@ -79,6 +98,7 @@ enum bio_pattern {
struct data_t {
u32 pid;
+ u32 dev;
u64 rwflag;
u64 delta;
u64 qdelta;
@@ -88,7 +108,6 @@ struct data_t {
enum bio_pattern pattern;
#endif
u64 ts;
- char disk_name[DISK_NAME_LEN];
char name[TASK_COMM_LEN];
};
@@ -96,12 +115,45 @@ struct data_t {
BPF_HASH(last_sectors, struct sector_key_t, u64);
#endif
-BPF_HASH(start, struct request *, struct start_req_t);
-BPF_HASH(infobyreq, struct request *, struct val_t);
+BPF_HASH(start, struct hash_key, struct start_req_t);
+BPF_HASH(infobyreq, struct hash_key, struct val_t);
BPF_PERF_OUTPUT(events);
+static dev_t ddevt(struct gendisk *disk) {
+ return (disk->major << 20) | disk->first_minor;
+}
+
+/*
+ * The following deals with a kernel version change (in mainline 4.7, although
+ * it may be backported to earlier kernels) with how block request write flags
+ * are tested. We handle both pre- and post-change versions here. Please avoid
+ * kernel version tests like this as much as possible: they inflate the code,
+ * test, and maintenance burden.
+ */
+static int get_rwflag(u32 cmd_flags) {
+#ifdef REQ_WRITE
+ return !!(cmd_flags & REQ_WRITE);
+#elif defined(REQ_OP_SHIFT)
+ return !!((cmd_flags >> REQ_OP_SHIFT) == REQ_OP_WRITE);
+#else
+ return !!((cmd_flags & REQ_OP_MASK) == REQ_OP_WRITE);
+#endif
+}
+
+#define RWBS_LEN 8
+
+static int get_rwflag_tp(char *rwbs) {
+ for (int i = 0; i < RWBS_LEN; i++) {
+ if (rwbs[i] == 'W')
+ return 1;
+ if (rwbs[i] == '\\0')
+ return 0;
+ }
+ return 0;
+}
+
// cache PID and comm by-req
-int trace_pid_start(struct pt_regs *ctx, struct request *req)
+static int __trace_pid_start(struct hash_key key)
{
DISK_FILTER
@@ -113,47 +165,76 @@ int trace_pid_start(struct pt_regs *ctx, struct request *req)
if (##QUEUE##) {
val.ts = bpf_ktime_get_ns();
}
- infobyreq.update(&req, &val);
+ infobyreq.update(&key, &val);
}
return 0;
}
+
+int trace_pid_start(struct pt_regs *ctx, struct request *req)
+{
+ struct hash_key key = {
+ .dev = ddevt(req->__RQ_DISK__),
+ .rwflag = get_rwflag(req->cmd_flags),
+ .sector = req->__sector
+ };
+
+ return __trace_pid_start(key);
+}
+
+int trace_pid_start_tp(struct tp_args *args)
+{
+ struct hash_key key = {
+ .dev = args->dev,
+ .rwflag = get_rwflag_tp(args->rwbs),
+ .sector = args->sector
+ };
+
+ return __trace_pid_start(key);
+}
+
// time block I/O
int trace_req_start(struct pt_regs *ctx, struct request *req)
{
+ struct hash_key key = {
+ .dev = ddevt(req->__RQ_DISK__),
+ .rwflag = get_rwflag(req->cmd_flags),
+ .sector = req->__sector
+ };
+
DISK_FILTER
struct start_req_t start_req = {
.ts = bpf_ktime_get_ns(),
.data_len = req->__data_len
};
- start.update(&req, &start_req);
+ start.update(&key, &start_req);
return 0;
}
// output
-int trace_req_completion(struct pt_regs *ctx, struct request *req)
+static int __trace_req_completion(void *ctx, struct hash_key key)
{
struct start_req_t *startp;
struct val_t *valp;
struct data_t data = {};
- struct gendisk *rq_disk;
+ //struct gendisk *rq_disk;
u64 ts;
// fetch timestamp and calculate delta
- startp = start.lookup(&req);
+ startp = start.lookup(&key);
if (startp == 0) {
// missed tracing issue
return 0;
}
ts = bpf_ktime_get_ns();
- rq_disk = req->__RQ_DISK__;
+ //rq_disk = req->__RQ_DISK__;
data.delta = ts - startp->ts;
data.ts = ts / 1000;
data.qdelta = 0;
data.len = startp->data_len;
- valp = infobyreq.lookup(&req);
+ valp = infobyreq.lookup(&key);
if (valp == 0) {
data.name[0] = '?';
data.name[1] = 0;
@@ -162,10 +243,9 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
data.qdelta = startp->ts - valp->ts;
}
data.pid = valp->pid;
- data.sector = req->__sector;
+ data.sector = key.sector;
+ data.dev = key.dev;
bpf_probe_read_kernel(&data.name, sizeof(data.name), valp->name);
- bpf_probe_read_kernel(&data.disk_name, sizeof(data.disk_name),
- rq_disk->disk_name);
}
#ifdef INCLUDE_PATTERN
@@ -174,8 +254,8 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
u64 *sector, last_sector;
struct sector_key_t sector_key = {
- .dev_major = rq_disk->major,
- .dev_minor = rq_disk->first_minor
+ .dev_major = key.dev >> 20,
+ .dev_minor = key.dev & ((1 << 20) - 1)
};
sector = last_sectors.lookup(&sector_key);
@@ -187,27 +267,36 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
last_sectors.update(&sector_key, &last_sector);
#endif
-/*
- * The following deals with a kernel version change (in mainline 4.7, although
- * it may be backported to earlier kernels) with how block request write flags
- * are tested. We handle both pre- and post-change versions here. Please avoid
- * kernel version tests like this as much as possible: they inflate the code,
- * test, and maintenance burden.
- */
-#ifdef REQ_WRITE
- data.rwflag = !!(req->cmd_flags & REQ_WRITE);
-#elif defined(REQ_OP_SHIFT)
- data.rwflag = !!((req->cmd_flags >> REQ_OP_SHIFT) == REQ_OP_WRITE);
-#else
- data.rwflag = !!((req->cmd_flags & REQ_OP_MASK) == REQ_OP_WRITE);
-#endif
+ data.rwflag = key.rwflag;
events.perf_submit(ctx, &data, sizeof(data));
- start.delete(&req);
- infobyreq.delete(&req);
+ start.delete(&key);
+ infobyreq.delete(&key);
return 0;
}
+
+int trace_req_completion(struct pt_regs *ctx, struct request *req)
+{
+ struct hash_key key = {
+ .dev = ddevt(req->__RQ_DISK__),
+ .rwflag = get_rwflag(req->cmd_flags),
+ .sector = req->__sector
+ };
+
+ return __trace_req_completion(ctx, key);
+}
+
+int trace_req_completion_tp(struct tp_args *args)
+{
+ struct hash_key key = {
+ .dev = args->dev,
+ .rwflag = get_rwflag_tp(args->rwbs),
+ .sector = args->sector
+ };
+
+ return __trace_req_completion(args, key);
+}
"""
if args.queue:
bpf_text = bpf_text.replace('##QUEUE##', '1')
@@ -225,21 +314,13 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
exit(1)
stat_info = os.stat(disk_path)
- major = os.major(stat_info.st_rdev)
- minor = os.minor(stat_info.st_rdev)
-
- disk_field_str = ""
- if BPF.kernel_struct_has_field(b'request', b'rq_disk') == 1:
- disk_field_str = 'req->rq_disk'
- else:
- disk_field_str = 'req->q->disk'
+ dev = os.major(stat_info.st_rdev) << 20 | os.minor(stat_info.st_rdev)
disk_filter_str = """
- struct gendisk *disk = %s;
- if (!(disk->major == %d && disk->first_minor == %d)) {
+ if(key.dev != %s) {
return 0;
}
- """ % (disk_field_str, major, minor)
+ """ % (dev)
bpf_text = bpf_text.replace('DISK_FILTER', disk_filter_str)
else:
@@ -254,15 +335,19 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
b = BPF(text=bpf_text)
if BPF.get_kprobe_functions(b'__blk_account_io_start'):
b.attach_kprobe(event="__blk_account_io_start", fn_name="trace_pid_start")
-else:
+elif BPF.get_kprobe_functions(b'blk_account_io_start'):
b.attach_kprobe(event="blk_account_io_start", fn_name="trace_pid_start")
+else:
+ b.attach_tracepoint(tp="block:block_io_start", fn_name="trace_pid_start_tp")
if BPF.get_kprobe_functions(b'blk_start_request'):
b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start")
if BPF.get_kprobe_functions(b'__blk_account_io_done'):
b.attach_kprobe(event="__blk_account_io_done", fn_name="trace_req_completion")
-else:
+elif BPF.get_kprobe_functions(b'blk_account_io_done'):
b.attach_kprobe(event="blk_account_io_done", fn_name="trace_req_completion")
+else:
+ b.attach_tracepoint(tp="block:block_io_done", fn_name="trace_req_completion_tp")
# header
print("%-11s %-14s %-7s %-9s %-1s %-10s %-7s" % ("TIME(s)", "COMM", "PID",
@@ -273,6 +358,27 @@ print("%-11s %-14s %-7s %-9s %-1s %-10s %-7s" % ("TIME(s)", "COMM", "PID",
print("%7s " % ("QUE(ms)"), end="")
print("%7s" % "LAT(ms)")
+
+# cache disk major,minor -> diskname
+diskstats = "/proc/diskstats"
+disklookup = {}
+with open(diskstats) as stats:
+ for line in stats:
+ a = line.split()
+ disklookup[a[0] + "," + a[1]] = a[2]
+
+def disk_print(d):
+ major = d >> 20
+ minor = d & ((1 << 20) - 1)
+
+ disk = str(major) + "," + str(minor)
+ if disk in disklookup:
+ diskname = disklookup[disk]
+ else:
+ diskname = "<unknown>"
+
+ return diskname
+
rwflg = ""
pattern = ""
start_ts = 0
@@ -297,9 +403,7 @@ P_RANDOM = 2
delta = float(event.ts) - start_ts
- disk_name = event.disk_name.decode('utf-8', 'replace')
- if not disk_name:
- disk_name = '<unknown>'
+ disk_name = disk_print(event.dev)
print("%-11.6f %-14.14s %-7s %-9s %-1s %-10s %-7s" % (
delta / 1000000, event.name.decode('utf-8', 'replace'), event.pid,
diff --git a/tools/biotop.py b/tools/biotop.py
index fcdd373f..2620983a 100755
--- a/tools/biotop.py
+++ b/tools/biotop.py
@@ -14,6 +14,7 @@
#
# 06-Feb-2016 Brendan Gregg Created this.
# 17-Mar-2022 Rocky Xing Added PID filter support.
+# 01-Aug-2023 Jerome Marchand Added support for block tracepoints
from __future__ import print_function
from bcc import BPF
@@ -88,14 +89,35 @@ struct val_t {
u32 io;
};
-BPF_HASH(start, struct request *, struct start_req_t);
-BPF_HASH(whobyreq, struct request *, struct who_t);
+struct tp_args {
+ u64 __unused__;
+ dev_t dev;
+ sector_t sector;
+ unsigned int nr_sector;
+ unsigned int bytes;
+ char rwbs[8];
+ char comm[16];
+ char cmd[];
+};
+
+struct hash_key {
+ dev_t dev;
+ u32 _pad;
+ sector_t sector;
+};
+
+BPF_HASH(start, struct hash_key, struct start_req_t);
+BPF_HASH(whobyreq, struct hash_key, struct who_t);
BPF_HASH(counts, struct info_t, struct val_t);
+static dev_t ddevt(struct gendisk *disk) {
+ return (disk->major << 20) | disk->first_minor;
+}
+
// cache PID and comm by-req
-int trace_pid_start(struct pt_regs *ctx, struct request *req)
+static int __trace_pid_start(struct hash_key key)
{
- struct who_t who = {};
+ struct who_t who;
u32 pid;
if (bpf_get_current_comm(&who.name, sizeof(who.name)) == 0) {
@@ -104,30 +126,54 @@ int trace_pid_start(struct pt_regs *ctx, struct request *req)
return 0;
who.pid = pid;
- whobyreq.update(&req, &who);
+ whobyreq.update(&key, &who);
}
return 0;
}
+int trace_pid_start(struct pt_regs *ctx, struct request *req)
+{
+ struct hash_key key = {
+ .dev = ddevt(req->__RQ_DISK__),
+ .sector = req->__sector
+ };
+
+ return __trace_pid_start(key);
+}
+
+int trace_pid_start_tp(struct tp_args *args)
+{
+ struct hash_key key = {
+ .dev = args->dev,
+ .sector = args->sector
+ };
+
+ return __trace_pid_start(key);
+}
+
// time block I/O
int trace_req_start(struct pt_regs *ctx, struct request *req)
{
+ struct hash_key key = {
+ .dev = ddevt(req->__RQ_DISK__),
+ .sector = req->__sector
+ };
struct start_req_t start_req = {
.ts = bpf_ktime_get_ns(),
.data_len = req->__data_len
};
- start.update(&req, &start_req);
+ start.update(&key, &start_req);
return 0;
}
// output
-int trace_req_completion(struct pt_regs *ctx, struct request *req)
+static int __trace_req_completion(struct hash_key key)
{
struct start_req_t *startp;
// fetch timestamp and calculate delta
- startp = start.lookup(&req);
+ startp = start.lookup(&key);
if (startp == 0) {
return 0; // missed tracing issue
}
@@ -135,12 +181,12 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
struct who_t *whop;
u32 pid;
- whop = whobyreq.lookup(&req);
+ whop = whobyreq.lookup(&key);
pid = whop != 0 ? whop->pid : 0;
if (FILTER_PID) {
- start.delete(&req);
+ start.delete(&key);
if (whop != 0) {
- whobyreq.delete(&req);
+ whobyreq.delete(&key);
}
return 0;
}
@@ -150,8 +196,8 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
// setup info_t key
struct info_t info = {};
- info.major = req->__RQ_DISK__->major;
- info.minor = req->__RQ_DISK__->first_minor;
+ info.major = key.dev >> 20;
+ info.minor = key.dev & ((1 << 20) - 1);
/*
* The following deals with a kernel version change (in mainline 4.7, although
* it may be backported to earlier kernels) with how block request write flags
@@ -159,13 +205,13 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
* kernel version tests like this as much as possible: they inflate the code,
* test, and maintenance burden.
*/
-#ifdef REQ_WRITE
+/*#ifdef REQ_WRITE
info.rwflag = !!(req->cmd_flags & REQ_WRITE);
#elif defined(REQ_OP_SHIFT)
info.rwflag = !!((req->cmd_flags >> REQ_OP_SHIFT) == REQ_OP_WRITE);
#else
info.rwflag = !!((req->cmd_flags & REQ_OP_MASK) == REQ_OP_WRITE);
-#endif
+#endif*/
if (whop == 0) {
// missed pid who, save stats as pid 0
@@ -183,11 +229,31 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
valp->io++;
}
- start.delete(&req);
- whobyreq.delete(&req);
+ start.delete(&key);
+ whobyreq.delete(&key);
return 0;
}
+
+int trace_req_completion(struct pt_regs *ctx, struct request *req)
+{
+ struct hash_key key = {
+ .dev = ddevt(req->__RQ_DISK__),
+ .sector = req->__sector
+ };
+
+ return __trace_req_completion(key);
+}
+
+int trace_req_completion_tp(struct tp_args *args)
+{
+ struct hash_key key = {
+ .dev = args->dev,
+ .sector = args->sector
+ };
+
+ return __trace_req_completion(key);
+}
"""
if args.ebpf:
@@ -207,15 +273,19 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
b = BPF(text=bpf_text)
if BPF.get_kprobe_functions(b'__blk_account_io_start'):
b.attach_kprobe(event="__blk_account_io_start", fn_name="trace_pid_start")
-else:
+elif BPF.get_kprobe_functions(b'blk_account_io_start'):
b.attach_kprobe(event="blk_account_io_start", fn_name="trace_pid_start")
+else:
+ b.attach_tracepoint(tp="block:block_io_start", fn_name="trace_pid_start_tp")
if BPF.get_kprobe_functions(b'blk_start_request'):
b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start")
if BPF.get_kprobe_functions(b'__blk_account_io_done'):
b.attach_kprobe(event="__blk_account_io_done", fn_name="trace_req_completion")
-else:
+elif BPF.get_kprobe_functions(b'blk_account_io_done'):
b.attach_kprobe(event="blk_account_io_done", fn_name="trace_req_completion")
+else:
+ b.attach_tracepoint(tp="block:block_io_done", fn_name="trace_req_completion_tp")
print('Tracing... Output every %d secs. Hit Ctrl-C to end' % interval)
--
2.41.0

View File

@ -0,0 +1,156 @@
From 0d1a67ba9490aabbb874819d8d07b1868c8c2b1d Mon Sep 17 00:00:00 2001
From: Jerome Marchand <jmarchan@redhat.com>
Date: Wed, 1 Feb 2023 17:30:03 +0100
Subject: [PATCH 2/2] tools/tcpstates: fix IPv6 journal
When logging an IPv6 state change, journal_fields tries to pack
event.saddr and event.daddr, which are not integers in this case, to
present a bytes-like object to socket.inet_ntop. This can be fixed by
having a similar type for [sd]addr for IPv4 and IPv6. Making both an
array of u32 solves the issue by presenting a bytes-like object
directly to inet_ntop, without the need for the struct packing stage.
Also, the now-similar behavior makes it easier to factor out the code
common to IPv4 and IPv6.
It solves the following error:
/usr/share/bcc/tools/tcpstates -Y
SKADDR C-PID C-COMM LADDR LPORT RADDR RPORT OLDSTATE -> NEWSTATE MS
ffff8b2e83e56180 0 swapper/9 :: 22 :: 0 LISTEN -> SYN_RECV 0.000
Exception ignored on calling ctypes callback function: <function PerfEventArray._open_perf_buffer.<locals>.raw_cb_ at 0x7f894c8d7f70>
Traceback (most recent call last):
File "/usr/lib/python3.9/site-packages/bcc/table.py", line 982, in raw_cb_
callback(cpu, data, size)
File "/usr/share/bcc/tools/tcpstates", line 419, in print_ipv6_event
journal.send(**journal_fields(event, AF_INET6))
File "/usr/share/bcc/tools/tcpstates", line 348, in journal_fields
'OBJECT_' + addr_pfx + '_SOURCE_ADDRESS': inet_ntop(addr_family, pack("I", event.saddr)),
struct.error: required argument is not an integer
ffff8b2e83e56180 0 swapper/9 2620:52:0:2580:5054:ff:fe6b:6f1f 22 2620:52:0:2b11:2f5e:407d:b35d:4663 60396 SYN_RECV -> ESTABLISHED 0.010
Exception ignored on calling ctypes callback function: <function PerfEventArray._open_perf_buffer.<locals>.raw_cb_ at 0x7f894c8d7f70>
Traceback (most recent call last):
File "/usr/lib/python3.9/site-packages/bcc/table.py", line 982, in raw_cb_
callback(cpu, data, size)
File "/usr/share/bcc/tools/tcpstates", line 419, in print_ipv6_event
journal.send(**journal_fields(event, AF_INET6))
File "/usr/share/bcc/tools/tcpstates", line 348, in journal_fields
'OBJECT_' + addr_pfx + '_SOURCE_ADDRESS': inet_ntop(addr_family, pack("I", event.saddr)),
struct.error: required argument is not an integer
Signed-off-by: Jerome Marchand <jmarchan@redhat.com>
---
tools/tcpstates.py | 55 +++++++++++++++++-----------------------------
1 file changed, 20 insertions(+), 35 deletions(-)
diff --git a/tools/tcpstates.py b/tools/tcpstates.py
index 9b2ccfa4..6c845c9b 100755
--- a/tools/tcpstates.py
+++ b/tools/tcpstates.py
@@ -19,7 +19,6 @@ from __future__ import print_function
from bcc import BPF
import argparse
from socket import inet_ntop, AF_INET, AF_INET6
-from struct import pack
from time import strftime, time
from os import getuid
@@ -78,8 +77,8 @@ BPF_HASH(last, struct sock *, u64);
struct ipv4_data_t {
u64 ts_us;
u64 skaddr;
- u32 saddr;
- u32 daddr;
+ u32 saddr[1];
+ u32 daddr[1];
u64 span_us;
u32 pid;
u16 lport;
@@ -93,8 +92,8 @@ BPF_PERF_OUTPUT(ipv4_events);
struct ipv6_data_t {
u64 ts_us;
u64 skaddr;
- unsigned __int128 saddr;
- unsigned __int128 daddr;
+ u32 saddr[4];
+ u32 daddr[4];
u64 span_us;
u32 pid;
u16 lport;
@@ -350,9 +349,9 @@ format_string = ("%-16x %-5d %-10.10s %s%-15s %-5d %-15s %-5d %-11s " +
'OBJECT_PID': str(event.pid),
'OBJECT_COMM': event.task.decode('utf-8', 'replace'),
# Custom fields, aka "stuff we sort of made up".
- 'OBJECT_' + addr_pfx + '_SOURCE_ADDRESS': inet_ntop(addr_family, pack("I", event.saddr)),
+ 'OBJECT_' + addr_pfx + '_SOURCE_ADDRESS': inet_ntop(addr_family, event.saddr),
'OBJECT_TCP_SOURCE_PORT': str(event.lport),
- 'OBJECT_' + addr_pfx + '_DESTINATION_ADDRESS': inet_ntop(addr_family, pack("I", event.daddr)),
+ 'OBJECT_' + addr_pfx + '_DESTINATION_ADDRESS': inet_ntop(addr_family, event.daddr),
'OBJECT_TCP_DESTINATION_PORT': str(event.dport),
'OBJECT_TCP_OLD_STATE': tcpstate2str(event.oldstate),
'OBJECT_TCP_NEW_STATE': tcpstate2str(event.newstate),
@@ -373,8 +372,7 @@ format_string = ("%-16x %-5d %-10.10s %s%-15s %-5d %-15s %-5d %-11s " +
return fields
# process event
-def print_ipv4_event(cpu, data, size):
- event = b["ipv4_events"].event(data)
+def print_event(event, addr_family):
global start_ts
if args.time:
if args.csv:
@@ -389,39 +387,26 @@ format_string = ("%-16x %-5d %-10.10s %s%-15s %-5d %-15s %-5d %-11s " +
print("%.6f," % delta_s, end="")
else:
print("%-9.6f " % delta_s, end="")
+ if addr_family == AF_INET:
+ version = "4"
+ else:
+ version = "6"
print(format_string % (event.skaddr, event.pid, event.task.decode('utf-8', 'replace'),
- "4" if args.wide or args.csv else "",
- inet_ntop(AF_INET, pack("I", event.saddr)), event.lport,
- inet_ntop(AF_INET, pack("I", event.daddr)), event.dport,
+ version if args.wide or args.csv else "",
+ inet_ntop(addr_family, event.saddr), event.lport,
+ inet_ntop(addr_family, event.daddr), event.dport,
tcpstate2str(event.oldstate), tcpstate2str(event.newstate),
float(event.span_us) / 1000))
if args.journal:
- journal.send(**journal_fields(event, AF_INET))
+ journal.send(**journal_fields(event, addr_family))
+
+def print_ipv4_event(cpu, data, size):
+ event = b["ipv4_events"].event(data)
+ print_event(event, AF_INET)
def print_ipv6_event(cpu, data, size):
event = b["ipv6_events"].event(data)
- global start_ts
- if args.time:
- if args.csv:
- print("%s," % strftime("%H:%M:%S"), end="")
- else:
- print("%-8s " % strftime("%H:%M:%S"), end="")
- if args.timestamp:
- if start_ts == 0:
- start_ts = event.ts_us
- delta_s = (float(event.ts_us) - start_ts) / 1000000
- if args.csv:
- print("%.6f," % delta_s, end="")
- else:
- print("%-9.6f " % delta_s, end="")
- print(format_string % (event.skaddr, event.pid, event.task.decode('utf-8', 'replace'),
- "6" if args.wide or args.csv else "",
- inet_ntop(AF_INET6, event.saddr), event.lport,
- inet_ntop(AF_INET6, event.daddr), event.dport,
- tcpstate2str(event.oldstate), tcpstate2str(event.newstate),
- float(event.span_us) / 1000))
- if args.journal:
- journal.send(**journal_fields(event, AF_INET6))
+ print_event(event, AF_INET6)
# initialize BPF
b = BPF(text=bpf_text)
--
2.41.0

View File

@ -0,0 +1,144 @@
From 53b89f35e8970beef55046c1bf035264f110f06d Mon Sep 17 00:00:00 2001
From: hejun01 <hejun01@corp.netease.com>
Date: Thu, 29 Jun 2023 20:24:07 +0800
Subject: [PATCH 1/2] tools/tcpstates: fix context ptr modified error
Introduce a local variable, tcp_newstate,
to avoid an llvm optimization of args->newstate
that causes the context pointer args to be modified.
Split event.ports into lport and dport.
Switch the type of the TCP state from unsigned int to int.
---
tools/tcpstates.py | 47 +++++++++++++++++++++++++---------------------
1 file changed, 26 insertions(+), 21 deletions(-)
diff --git a/tools/tcpstates.py b/tools/tcpstates.py
index 89f3638c..9b2ccfa4 100755
--- a/tools/tcpstates.py
+++ b/tools/tcpstates.py
@@ -82,9 +82,10 @@ struct ipv4_data_t {
u32 daddr;
u64 span_us;
u32 pid;
- u32 ports;
- u32 oldstate;
- u32 newstate;
+ u16 lport;
+ u16 dport;
+ int oldstate;
+ int newstate;
char task[TASK_COMM_LEN];
};
BPF_PERF_OUTPUT(ipv4_events);
@@ -96,9 +97,10 @@ struct ipv6_data_t {
unsigned __int128 daddr;
u64 span_us;
u32 pid;
- u32 ports;
- u32 oldstate;
- u32 newstate;
+ u16 lport;
+ u16 dport;
+ int oldstate;
+ int newstate;
char task[TASK_COMM_LEN];
};
BPF_PERF_OUTPUT(ipv6_events);
@@ -132,6 +134,9 @@ TRACEPOINT_PROBE(sock, inet_sock_set_state)
u16 family = args->family;
FILTER_FAMILY
+ // workaround to avoid llvm optimization which will cause context ptr args modified
+ int tcp_newstate = args->newstate;
+
if (args->family == AF_INET) {
struct ipv4_data_t data4 = {
.span_us = delta_us,
@@ -141,8 +146,8 @@ TRACEPOINT_PROBE(sock, inet_sock_set_state)
data4.ts_us = bpf_ktime_get_ns() / 1000;
__builtin_memcpy(&data4.saddr, args->saddr, sizeof(data4.saddr));
__builtin_memcpy(&data4.daddr, args->daddr, sizeof(data4.daddr));
- // a workaround until data4 compiles with separate lport/dport
- data4.ports = dport + ((0ULL + lport) << 16);
+ data4.lport = lport;
+ data4.dport = dport;
data4.pid = pid;
bpf_get_current_comm(&data4.task, sizeof(data4.task));
@@ -157,14 +162,14 @@ TRACEPOINT_PROBE(sock, inet_sock_set_state)
data6.ts_us = bpf_ktime_get_ns() / 1000;
__builtin_memcpy(&data6.saddr, args->saddr_v6, sizeof(data6.saddr));
__builtin_memcpy(&data6.daddr, args->daddr_v6, sizeof(data6.daddr));
- // a workaround until data6 compiles with separate lport/dport
- data6.ports = dport + ((0ULL + lport) << 16);
+ data6.lport = lport;
+ data6.dport = dport;
data6.pid = pid;
bpf_get_current_comm(&data6.task, sizeof(data6.task));
ipv6_events.perf_submit(args, &data6, sizeof(data6));
}
- if (args->newstate == TCP_CLOSE) {
+ if (tcp_newstate == TCP_CLOSE) {
last.delete(&sk);
} else {
u64 ts = bpf_ktime_get_ns();
@@ -210,8 +215,8 @@ int kprobe__tcp_set_state(struct pt_regs *ctx, struct sock *sk, int state)
data4.ts_us = bpf_ktime_get_ns() / 1000;
data4.saddr = sk->__sk_common.skc_rcv_saddr;
data4.daddr = sk->__sk_common.skc_daddr;
- // a workaround until data4 compiles with separate lport/dport
- data4.ports = dport + ((0ULL + lport) << 16);
+ data4.lport = lport;
+ data4.dport = dport;
data4.pid = pid;
bpf_get_current_comm(&data4.task, sizeof(data4.task));
@@ -228,8 +233,8 @@ int kprobe__tcp_set_state(struct pt_regs *ctx, struct sock *sk, int state)
sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
bpf_probe_read_kernel(&data6.daddr, sizeof(data6.daddr),
sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32);
- // a workaround until data6 compiles with separate lport/dport
- data6.ports = dport + ((0ULL + lport) << 16);
+ data6.lport = lport;
+ data6.dport = dport;
data6.pid = pid;
bpf_get_current_comm(&data6.task, sizeof(data6.task));
ipv6_events.perf_submit(ctx, &data6, sizeof(data6));
@@ -346,9 +351,9 @@ format_string = ("%-16x %-5d %-10.10s %s%-15s %-5d %-15s %-5d %-11s " +
'OBJECT_COMM': event.task.decode('utf-8', 'replace'),
# Custom fields, aka "stuff we sort of made up".
'OBJECT_' + addr_pfx + '_SOURCE_ADDRESS': inet_ntop(addr_family, pack("I", event.saddr)),
- 'OBJECT_TCP_SOURCE_PORT': str(event.ports >> 16),
+ 'OBJECT_TCP_SOURCE_PORT': str(event.lport),
'OBJECT_' + addr_pfx + '_DESTINATION_ADDRESS': inet_ntop(addr_family, pack("I", event.daddr)),
- 'OBJECT_TCP_DESTINATION_PORT': str(event.ports & 0xffff),
+ 'OBJECT_TCP_DESTINATION_PORT': str(event.dport),
'OBJECT_TCP_OLD_STATE': tcpstate2str(event.oldstate),
'OBJECT_TCP_NEW_STATE': tcpstate2str(event.newstate),
'OBJECT_TCP_SPAN_TIME': str(event.span_us)
@@ -386,8 +391,8 @@ format_string = ("%-16x %-5d %-10.10s %s%-15s %-5d %-15s %-5d %-11s " +
print("%-9.6f " % delta_s, end="")
print(format_string % (event.skaddr, event.pid, event.task.decode('utf-8', 'replace'),
"4" if args.wide or args.csv else "",
- inet_ntop(AF_INET, pack("I", event.saddr)), event.ports >> 16,
- inet_ntop(AF_INET, pack("I", event.daddr)), event.ports & 0xffff,
+ inet_ntop(AF_INET, pack("I", event.saddr)), event.lport,
+ inet_ntop(AF_INET, pack("I", event.daddr)), event.dport,
tcpstate2str(event.oldstate), tcpstate2str(event.newstate),
float(event.span_us) / 1000))
if args.journal:
@@ -411,8 +416,8 @@ format_string = ("%-16x %-5d %-10.10s %s%-15s %-5d %-15s %-5d %-11s " +
print("%-9.6f " % delta_s, end="")
print(format_string % (event.skaddr, event.pid, event.task.decode('utf-8', 'replace'),
"6" if args.wide or args.csv else "",
- inet_ntop(AF_INET6, event.saddr), event.ports >> 16,
- inet_ntop(AF_INET6, event.daddr), event.ports & 0xffff,
+ inet_ntop(AF_INET6, event.saddr), event.lport,
+ inet_ntop(AF_INET6, event.daddr), event.dport,
tcpstate2str(event.oldstate), tcpstate2str(event.newstate),
float(event.span_us) / 1000))
if args.journal:
--
2.41.0

View File

@ -24,20 +24,15 @@
Name: bcc
Version: 0.26.0
Release: 4%{?dist}
Version: 0.28.0
Release: 1%{?dist}
Summary: BPF Compiler Collection (BCC)
License: ASL 2.0
URL: https://github.com/iovisor/bcc
Source0: %{url}/archive/v%{version}/%{name}-%{version}.tar.gz
Patch0: %%{name}-%%{version}-tools-nfsslower.py-Fix-uninitialized-struct-pad-erro.patch
Patch1: %%{name}-%%{version}-tools-slabratetop-Fix-error-incomplete-definition-of.patch
Patch2: %%{name}-%%{version}-tools-readahead-Fix-Failed-to-attach-BPF-program-ent.patch
Patch3: %%{name}-%%{version}-tools-compactsnoop.py-Fix-raw_tracepoint-Invalid-arg.patch
Patch4: %%{name}-%%{version}-killsnoop-add-missing-s-and-T-options-to-the-synopsi.patch
Patch5: %%{name}-%%{version}-tools-funcslower-fix-printing-of-folded-stacks.patch
Patch6: %%{name}-%%{version}-tools-deadlock-Add-an-option-to-set-the-maximum-numb.patch
Patch7: %%{name}-%%{version}-sync-with-latest-libbpf-repo.patch
Patch0: %%{name}-%%{version}-tools-tcpstates-fix-context-ptr-modified-error.patch
Patch1: %%{name}-%%{version}-tools-tcpstates-fix-IPv6-journal.patch
Patch2: %%{name}-%%{version}-tools-Add-support-for-the-new-block_io_-tracepoints.patch
# Arches will be included as upstream support is added and dependencies are
# satisfied in the respective arches
@ -261,6 +256,15 @@ cp -a libbpf-tools/tmp-install/bin/* %{buildroot}/%{_sbindir}/
%endif
%changelog
* Mon Oct 23 2023 Jerome Marchand <jmarchan@redhat.com> - 0.28.0-1
- Rebase to v0.28.0 (RHEL-9976)
- Rebuild with LLVM 17 (RHEL-10591)
- Fix bpf-biosnoop out of bound access (RHEL-8664)
- Fix kvmexit missing VM exit reasons and statistics (RHEL-8702)
- Fix multi-word array type handling (RHEL-8674)
- Fix tcpstates -Y (RHEL-8490)
- Fix bio tools (RHEL-8553)
* Wed Aug 09 2023 Jerome Marchand <jmarchan@redhat.com> - 0.26.0-4
- Fix tcpretrans (rhbz#2226967)

View File

@ -1 +1 @@
SHA512 (bcc-0.26.0.tar.gz) = 394872a5780cc7651c91b584ccc13f18f64585b5843364433c042d9ded70faaf15a2e1125d51498508427b089f5bf826f13004d15a1892aada1a5f228a2a8adb
SHA512 (bcc-0.28.0.tar.gz) = 792ce93dba64b1f87390b2602dcaeba04ac8b2863652b06eb9a907b93bc6137a944b856cc6fa9c7a38671c89814740967561ca4f3b29c267babca7dc5e78aa02