bcc/bcc-0.28.0-tools-Add-support-for-the-new-block_io_-tracepoints.patch
Jerome Marchand 2053933c95 Rebase bcc to the latest version
Rebase bcc to v0.28.0 and rebuild it with LLVM 17. The rebase fixes
the following issues:
 - bpf-biosnoop out of bound access
 - kvmexit missing VM exit reasons and statistics
 - multi-word array type handling

Also fix tcpstates -Y issue and bio tools on non x86_64 arches.

Resolves: RHEL-9976
Resolves: RHEL-8664
Resolves: RHEL-8702
Resolves: RHEL-8674
Resolves: RHEL-8490
Resolves: RHEL-10591
Resolves: RHEL-8553

Signed-off-by: Jerome Marchand <jmarchan@redhat.com>
2023-10-23 17:11:26 +02:00

856 lines
24 KiB
Diff

From 2e758b65231f976c67a0aad791aabc7927ea7086 Mon Sep 17 00:00:00 2001
From: Jerome Marchand <jmarchan@redhat.com>
Date: Thu, 27 Jul 2023 18:19:18 +0200
Subject: [PATCH] tools: Add support for the new block_io_* tracepoints
The bio tools currently depends on blk_account_io_done/start functions
that can be inlined. To fix that, a couple of tracepoints have been
added upstream (block:block_io_start/done). This patch add the support
for those tracepoints when they are available.
Unfortunately, the bio tools relies on data that is not available to
the tracepoints (mostly the struct request). So the tracepoints can't
be used as drop in replacement for blk_account_io_*. Main difference,
is that we can't use the struct request as the hash key anymore, so it
now uses the couple (dev_t, sector) for that purpose.
For the biolatency tool, the -F option is disabled when only the
tracepoints are available because the flags are not all accessible
from the tracepoints. Otherwise, all features of the tools should
remain.
Closes #4261
Signed-off-by: Jerome Marchand <jmarchan@redhat.com>
---
tools/biolatency.py | 166 ++++++++++++++++++++++++++++--------
tools/biosnoop.py | 200 +++++++++++++++++++++++++++++++++-----------
tools/biotop.py | 108 +++++++++++++++++++-----
3 files changed, 371 insertions(+), 103 deletions(-)
diff --git a/tools/biolatency.py b/tools/biolatency.py
index 8fe43a7c..03b48a4c 100755
--- a/tools/biolatency.py
+++ b/tools/biolatency.py
@@ -11,6 +11,7 @@
#
# 20-Sep-2015 Brendan Gregg Created this.
# 31-Mar-2022 Rocky Xing Added disk filter support.
+# 01-Aug-2023 Jerome Marchand Added support for block tracepoints
from __future__ import print_function
from bcc import BPF
@@ -72,7 +73,7 @@ bpf_text = """
#include <linux/blk-mq.h>
typedef struct disk_key {
- char disk[DISK_NAME_LEN];
+ dev_t dev;
u64 slot;
} disk_key_t;
@@ -86,26 +87,70 @@ typedef struct ext_val {
u64 count;
} ext_val_t;
-BPF_HASH(start, struct request *);
+struct tp_args {
+ u64 __unused__;
+ dev_t dev;
+ sector_t sector;
+ unsigned int nr_sector;
+ unsigned int bytes;
+ char rwbs[8];
+ char comm[16];
+ char cmd[];
+};
+
+struct start_key {
+ dev_t dev;
+ u32 _pad;
+ sector_t sector;
+ CMD_FLAGS
+};
+
+BPF_HASH(start, struct start_key);
STORAGE
+static dev_t ddevt(struct gendisk *disk) {
+ return (disk->major << 20) | disk->first_minor;
+}
+
// time block I/O
-int trace_req_start(struct pt_regs *ctx, struct request *req)
+static int __trace_req_start(struct start_key key)
{
DISK_FILTER
u64 ts = bpf_ktime_get_ns();
- start.update(&req, &ts);
+ start.update(&key, &ts);
return 0;
}
+int trace_req_start(struct pt_regs *ctx, struct request *req)
+{
+ struct start_key key = {
+ .dev = ddevt(req->__RQ_DISK__),
+ .sector = req->__sector
+ };
+
+ SET_FLAGS
+
+ return __trace_req_start(key);
+}
+
+int trace_req_start_tp(struct tp_args *args)
+{
+ struct start_key key = {
+ .dev = args->dev,
+ .sector = args->sector
+ };
+
+ return __trace_req_start(key);
+}
+
// output
-int trace_req_done(struct pt_regs *ctx, struct request *req)
+static int __trace_req_done(struct start_key key)
{
u64 *tsp, delta;
// fetch timestamp and calculate delta
- tsp = start.lookup(&req);
+ tsp = start.lookup(&key);
if (tsp == 0) {
return 0; // missed issue
}
@@ -116,9 +161,31 @@ int trace_req_done(struct pt_regs *ctx, struct request *req)
// store as histogram
STORE
- start.delete(&req);
+ start.delete(&key);
return 0;
}
+
+int trace_req_done(struct pt_regs *ctx, struct request *req)
+{
+ struct start_key key = {
+ .dev = ddevt(req->__RQ_DISK__),
+ .sector = req->__sector
+ };
+
+ SET_FLAGS
+
+ return __trace_req_done(key);
+}
+
+int trace_req_done_tp(struct tp_args *args)
+{
+ struct start_key key = {
+ .dev = args->dev,
+ .sector = args->sector
+ };
+
+ return __trace_req_done(key);
+}
"""
# code substitutions
@@ -134,21 +201,18 @@ store_str = ""
if args.disks:
storage_str += "BPF_HISTOGRAM(dist, disk_key_t);"
disks_str = """
- disk_key_t key = {.slot = bpf_log2l(delta)};
- void *__tmp = (void *)req->__RQ_DISK__->disk_name;
- bpf_probe_read(&key.disk, sizeof(key.disk), __tmp);
- dist.atomic_increment(key);
+ disk_key_t dkey = {};
+ dkey.dev = key.dev;
+ dkey.slot = bpf_log2l(delta);
+ dist.atomic_increment(dkey);
"""
- if BPF.kernel_struct_has_field(b'request', b'rq_disk') == 1:
- store_str += disks_str.replace('__RQ_DISK__', 'rq_disk')
- else:
- store_str += disks_str.replace('__RQ_DISK__', 'q->disk')
+ store_str += disks_str
elif args.flags:
storage_str += "BPF_HISTOGRAM(dist, flag_key_t);"
store_str += """
- flag_key_t key = {.slot = bpf_log2l(delta)};
- key.flags = req->cmd_flags;
- dist.atomic_increment(key);
+ flag_key_t fkey = {.slot = bpf_log2l(delta)};
+ fkey.flags = key.flags;
+ dist.atomic_increment(fkey);
"""
else:
storage_str += "BPF_HISTOGRAM(dist);"
@@ -161,21 +225,13 @@ store_str = ""
exit(1)
stat_info = os.stat(disk_path)
- major = os.major(stat_info.st_rdev)
- minor = os.minor(stat_info.st_rdev)
-
- disk_field_str = ""
- if BPF.kernel_struct_has_field(b'request', b'rq_disk') == 1:
- disk_field_str = 'req->rq_disk'
- else:
- disk_field_str = 'req->q->disk'
+ dev = os.major(stat_info.st_rdev) << 20 | os.minor(stat_info.st_rdev)
disk_filter_str = """
- struct gendisk *disk = %s;
- if (!(disk->major == %d && disk->first_minor == %d)) {
+ if(key.dev != %s) {
return 0;
}
- """ % (disk_field_str, major, minor)
+ """ % (dev)
bpf_text = bpf_text.replace('DISK_FILTER', disk_filter_str)
else:
@@ -194,6 +250,16 @@ store_str = ""
bpf_text = bpf_text.replace("STORAGE", storage_str)
bpf_text = bpf_text.replace("STORE", store_str)
+if BPF.kernel_struct_has_field(b'request', b'rq_disk') == 1:
+ bpf_text = bpf_text.replace('__RQ_DISK__', 'rq_disk')
+else:
+ bpf_text = bpf_text.replace('__RQ_DISK__', 'q->disk')
+if args.flags:
+ bpf_text = bpf_text.replace('CMD_FLAGS', 'u64 flags;')
+ bpf_text = bpf_text.replace('SET_FLAGS', 'key.flags = req->cmd_flags;')
+else:
+ bpf_text = bpf_text.replace('CMD_FLAGS', '')
+ bpf_text = bpf_text.replace('SET_FLAGS', '')
if debug or args.ebpf:
print(bpf_text)
@@ -205,25 +271,53 @@ b = BPF(text=bpf_text)
if args.queued:
if BPF.get_kprobe_functions(b'__blk_account_io_start'):
b.attach_kprobe(event="__blk_account_io_start", fn_name="trace_req_start")
- else:
+ elif BPF.get_kprobe_functions(b'blk_account_io_start'):
b.attach_kprobe(event="blk_account_io_start", fn_name="trace_req_start")
+ else:
+ if args.flags:
+ # Some flags are accessible in the rwbs field (RAHEAD, SYNC and META)
+ # but other aren't. Disable the -F option for tracepoint for now.
+ print("ERROR: blk_account_io_start probe not available. Can't use -F.")
+ exit()
+ b.attach_tracepoint(tp="block:block_io_start", fn_name="trace_req_start_tp")
else:
if BPF.get_kprobe_functions(b'blk_start_request'):
b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start")
+
if BPF.get_kprobe_functions(b'__blk_account_io_done'):
b.attach_kprobe(event="__blk_account_io_done", fn_name="trace_req_done")
-else:
+elif BPF.get_kprobe_functions(b'blk_account_io_done'):
b.attach_kprobe(event="blk_account_io_done", fn_name="trace_req_done")
+else:
+ if args.flags:
+ print("ERROR: blk_account_io_done probe not available. Can't use -F.")
+ exit()
+ b.attach_tracepoint(tp="block:block_io_done", fn_name="trace_req_done_tp")
+
if not args.json:
print("Tracing block device I/O... Hit Ctrl-C to end.")
-def disk_print(s):
- disk = s.decode('utf-8', 'replace')
- if not disk:
- disk = "<unknown>"
- return disk
+# cache disk major,minor -> diskname
+diskstats = "/proc/diskstats"
+disklookup = {}
+with open(diskstats) as stats:
+ for line in stats:
+ a = line.split()
+ disklookup[a[0] + "," + a[1]] = a[2]
+
+def disk_print(d):
+ major = d >> 20
+ minor = d & ((1 << 20) - 1)
+
+ disk = str(major) + "," + str(minor)
+ if disk in disklookup:
+ diskname = disklookup[disk]
+ else:
+ diskname = "?"
+
+ return diskname
# see blk_fill_rwbs():
req_opf = {
diff --git a/tools/biosnoop.py b/tools/biosnoop.py
index 33703233..f0fef98b 100755
--- a/tools/biosnoop.py
+++ b/tools/biosnoop.py
@@ -14,6 +14,7 @@
# 11-Feb-2016 Allan McAleavy updated for BPF_PERF_OUTPUT
# 21-Jun-2022 Rocky Xing Added disk filter support.
# 13-Oct-2022 Rocky Xing Added support for displaying block I/O pattern.
+# 01-Aug-2023 Jerome Marchand Added support for block tracepoints
from __future__ import print_function
from bcc import BPF
@@ -64,6 +65,24 @@ struct val_t {
char name[TASK_COMM_LEN];
};
+struct tp_args {
+ u64 __unused__;
+ dev_t dev;
+ sector_t sector;
+ unsigned int nr_sector;
+ unsigned int bytes;
+ char rwbs[8];
+ char comm[16];
+ char cmd[];
+};
+
+struct hash_key {
+ dev_t dev;
+ u32 rwflag;
+ sector_t sector;
+};
+
+
#ifdef INCLUDE_PATTERN
struct sector_key_t {
u32 dev_major;
@@ -79,6 +98,7 @@ enum bio_pattern {
struct data_t {
u32 pid;
+ u32 dev;
u64 rwflag;
u64 delta;
u64 qdelta;
@@ -88,7 +108,6 @@ struct data_t {
enum bio_pattern pattern;
#endif
u64 ts;
- char disk_name[DISK_NAME_LEN];
char name[TASK_COMM_LEN];
};
@@ -96,12 +115,45 @@ struct data_t {
BPF_HASH(last_sectors, struct sector_key_t, u64);
#endif
-BPF_HASH(start, struct request *, struct start_req_t);
-BPF_HASH(infobyreq, struct request *, struct val_t);
+BPF_HASH(start, struct hash_key, struct start_req_t);
+BPF_HASH(infobyreq, struct hash_key, struct val_t);
BPF_PERF_OUTPUT(events);
+static dev_t ddevt(struct gendisk *disk) {
+ return (disk->major << 20) | disk->first_minor;
+}
+
+/*
+ * The following deals with a kernel version change (in mainline 4.7, although
+ * it may be backported to earlier kernels) with how block request write flags
+ * are tested. We handle both pre- and post-change versions here. Please avoid
+ * kernel version tests like this as much as possible: they inflate the code,
+ * test, and maintenance burden.
+ */
+static int get_rwflag(u32 cmd_flags) {
+#ifdef REQ_WRITE
+ return !!(cmd_flags & REQ_WRITE);
+#elif defined(REQ_OP_SHIFT)
+ return !!((cmd_flags >> REQ_OP_SHIFT) == REQ_OP_WRITE);
+#else
+ return !!((cmd_flags & REQ_OP_MASK) == REQ_OP_WRITE);
+#endif
+}
+
+#define RWBS_LEN 8
+
+static int get_rwflag_tp(char *rwbs) {
+ for (int i = 0; i < RWBS_LEN; i++) {
+ if (rwbs[i] == 'W')
+ return 1;
+ if (rwbs[i] == '\\0')
+ return 0;
+ }
+ return 0;
+}
+
// cache PID and comm by-req
-int trace_pid_start(struct pt_regs *ctx, struct request *req)
+static int __trace_pid_start(struct hash_key key)
{
DISK_FILTER
@@ -113,47 +165,76 @@ int trace_pid_start(struct pt_regs *ctx, struct request *req)
if (##QUEUE##) {
val.ts = bpf_ktime_get_ns();
}
- infobyreq.update(&req, &val);
+ infobyreq.update(&key, &val);
}
return 0;
}
+
+int trace_pid_start(struct pt_regs *ctx, struct request *req)
+{
+ struct hash_key key = {
+ .dev = ddevt(req->__RQ_DISK__),
+ .rwflag = get_rwflag(req->cmd_flags),
+ .sector = req->__sector
+ };
+
+ return __trace_pid_start(key);
+}
+
+int trace_pid_start_tp(struct tp_args *args)
+{
+ struct hash_key key = {
+ .dev = args->dev,
+ .rwflag = get_rwflag_tp(args->rwbs),
+ .sector = args->sector
+ };
+
+ return __trace_pid_start(key);
+}
+
// time block I/O
int trace_req_start(struct pt_regs *ctx, struct request *req)
{
+ struct hash_key key = {
+ .dev = ddevt(req->__RQ_DISK__),
+ .rwflag = get_rwflag(req->cmd_flags),
+ .sector = req->__sector
+ };
+
DISK_FILTER
struct start_req_t start_req = {
.ts = bpf_ktime_get_ns(),
.data_len = req->__data_len
};
- start.update(&req, &start_req);
+ start.update(&key, &start_req);
return 0;
}
// output
-int trace_req_completion(struct pt_regs *ctx, struct request *req)
+static int __trace_req_completion(void *ctx, struct hash_key key)
{
struct start_req_t *startp;
struct val_t *valp;
struct data_t data = {};
- struct gendisk *rq_disk;
+ //struct gendisk *rq_disk;
u64 ts;
// fetch timestamp and calculate delta
- startp = start.lookup(&req);
+ startp = start.lookup(&key);
if (startp == 0) {
// missed tracing issue
return 0;
}
ts = bpf_ktime_get_ns();
- rq_disk = req->__RQ_DISK__;
+ //rq_disk = req->__RQ_DISK__;
data.delta = ts - startp->ts;
data.ts = ts / 1000;
data.qdelta = 0;
data.len = startp->data_len;
- valp = infobyreq.lookup(&req);
+ valp = infobyreq.lookup(&key);
if (valp == 0) {
data.name[0] = '?';
data.name[1] = 0;
@@ -162,10 +243,9 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
data.qdelta = startp->ts - valp->ts;
}
data.pid = valp->pid;
- data.sector = req->__sector;
+ data.sector = key.sector;
+ data.dev = key.dev;
bpf_probe_read_kernel(&data.name, sizeof(data.name), valp->name);
- bpf_probe_read_kernel(&data.disk_name, sizeof(data.disk_name),
- rq_disk->disk_name);
}
#ifdef INCLUDE_PATTERN
@@ -174,8 +254,8 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
u64 *sector, last_sector;
struct sector_key_t sector_key = {
- .dev_major = rq_disk->major,
- .dev_minor = rq_disk->first_minor
+ .dev_major = key.dev >> 20,
+ .dev_minor = key.dev & ((1 << 20) - 1)
};
sector = last_sectors.lookup(&sector_key);
@@ -187,27 +267,36 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
last_sectors.update(&sector_key, &last_sector);
#endif
-/*
- * The following deals with a kernel version change (in mainline 4.7, although
- * it may be backported to earlier kernels) with how block request write flags
- * are tested. We handle both pre- and post-change versions here. Please avoid
- * kernel version tests like this as much as possible: they inflate the code,
- * test, and maintenance burden.
- */
-#ifdef REQ_WRITE
- data.rwflag = !!(req->cmd_flags & REQ_WRITE);
-#elif defined(REQ_OP_SHIFT)
- data.rwflag = !!((req->cmd_flags >> REQ_OP_SHIFT) == REQ_OP_WRITE);
-#else
- data.rwflag = !!((req->cmd_flags & REQ_OP_MASK) == REQ_OP_WRITE);
-#endif
+ data.rwflag = key.rwflag;
events.perf_submit(ctx, &data, sizeof(data));
- start.delete(&req);
- infobyreq.delete(&req);
+ start.delete(&key);
+ infobyreq.delete(&key);
return 0;
}
+
+int trace_req_completion(struct pt_regs *ctx, struct request *req)
+{
+ struct hash_key key = {
+ .dev = ddevt(req->__RQ_DISK__),
+ .rwflag = get_rwflag(req->cmd_flags),
+ .sector = req->__sector
+ };
+
+ return __trace_req_completion(ctx, key);
+}
+
+int trace_req_completion_tp(struct tp_args *args)
+{
+ struct hash_key key = {
+ .dev = args->dev,
+ .rwflag = get_rwflag_tp(args->rwbs),
+ .sector = args->sector
+ };
+
+ return __trace_req_completion(args, key);
+}
"""
if args.queue:
bpf_text = bpf_text.replace('##QUEUE##', '1')
@@ -225,21 +314,13 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
exit(1)
stat_info = os.stat(disk_path)
- major = os.major(stat_info.st_rdev)
- minor = os.minor(stat_info.st_rdev)
-
- disk_field_str = ""
- if BPF.kernel_struct_has_field(b'request', b'rq_disk') == 1:
- disk_field_str = 'req->rq_disk'
- else:
- disk_field_str = 'req->q->disk'
+ dev = os.major(stat_info.st_rdev) << 20 | os.minor(stat_info.st_rdev)
disk_filter_str = """
- struct gendisk *disk = %s;
- if (!(disk->major == %d && disk->first_minor == %d)) {
+ if(key.dev != %s) {
return 0;
}
- """ % (disk_field_str, major, minor)
+ """ % (dev)
bpf_text = bpf_text.replace('DISK_FILTER', disk_filter_str)
else:
@@ -254,15 +335,19 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
b = BPF(text=bpf_text)
if BPF.get_kprobe_functions(b'__blk_account_io_start'):
b.attach_kprobe(event="__blk_account_io_start", fn_name="trace_pid_start")
-else:
+elif BPF.get_kprobe_functions(b'blk_account_io_start'):
b.attach_kprobe(event="blk_account_io_start", fn_name="trace_pid_start")
+else:
+ b.attach_tracepoint(tp="block:block_io_start", fn_name="trace_pid_start_tp")
if BPF.get_kprobe_functions(b'blk_start_request'):
b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start")
if BPF.get_kprobe_functions(b'__blk_account_io_done'):
b.attach_kprobe(event="__blk_account_io_done", fn_name="trace_req_completion")
-else:
+elif BPF.get_kprobe_functions(b'blk_account_io_done'):
b.attach_kprobe(event="blk_account_io_done", fn_name="trace_req_completion")
+else:
+ b.attach_tracepoint(tp="block:block_io_done", fn_name="trace_req_completion_tp")
# header
print("%-11s %-14s %-7s %-9s %-1s %-10s %-7s" % ("TIME(s)", "COMM", "PID",
@@ -273,6 +358,27 @@ print("%-11s %-14s %-7s %-9s %-1s %-10s %-7s" % ("TIME(s)", "COMM", "PID",
print("%7s " % ("QUE(ms)"), end="")
print("%7s" % "LAT(ms)")
+
+# cache disk major,minor -> diskname
+diskstats = "/proc/diskstats"
+disklookup = {}
+with open(diskstats) as stats:
+ for line in stats:
+ a = line.split()
+ disklookup[a[0] + "," + a[1]] = a[2]
+
+def disk_print(d):
+ major = d >> 20
+ minor = d & ((1 << 20) - 1)
+
+ disk = str(major) + "," + str(minor)
+ if disk in disklookup:
+ diskname = disklookup[disk]
+ else:
+ diskname = "<unknown>"
+
+ return diskname
+
rwflg = ""
pattern = ""
start_ts = 0
@@ -297,9 +403,7 @@ P_RANDOM = 2
delta = float(event.ts) - start_ts
- disk_name = event.disk_name.decode('utf-8', 'replace')
- if not disk_name:
- disk_name = '<unknown>'
+ disk_name = disk_print(event.dev)
print("%-11.6f %-14.14s %-7s %-9s %-1s %-10s %-7s" % (
delta / 1000000, event.name.decode('utf-8', 'replace'), event.pid,
diff --git a/tools/biotop.py b/tools/biotop.py
index fcdd373f..2620983a 100755
--- a/tools/biotop.py
+++ b/tools/biotop.py
@@ -14,6 +14,7 @@
#
# 06-Feb-2016 Brendan Gregg Created this.
# 17-Mar-2022 Rocky Xing Added PID filter support.
+# 01-Aug-2023 Jerome Marchand Added support for block tracepoints
from __future__ import print_function
from bcc import BPF
@@ -88,14 +89,35 @@ struct val_t {
u32 io;
};
-BPF_HASH(start, struct request *, struct start_req_t);
-BPF_HASH(whobyreq, struct request *, struct who_t);
+struct tp_args {
+ u64 __unused__;
+ dev_t dev;
+ sector_t sector;
+ unsigned int nr_sector;
+ unsigned int bytes;
+ char rwbs[8];
+ char comm[16];
+ char cmd[];
+};
+
+struct hash_key {
+ dev_t dev;
+ u32 _pad;
+ sector_t sector;
+};
+
+BPF_HASH(start, struct hash_key, struct start_req_t);
+BPF_HASH(whobyreq, struct hash_key, struct who_t);
BPF_HASH(counts, struct info_t, struct val_t);
+static dev_t ddevt(struct gendisk *disk) {
+ return (disk->major << 20) | disk->first_minor;
+}
+
// cache PID and comm by-req
-int trace_pid_start(struct pt_regs *ctx, struct request *req)
+static int __trace_pid_start(struct hash_key key)
{
- struct who_t who = {};
+ struct who_t who;
u32 pid;
if (bpf_get_current_comm(&who.name, sizeof(who.name)) == 0) {
@@ -104,30 +126,54 @@ int trace_pid_start(struct pt_regs *ctx, struct request *req)
return 0;
who.pid = pid;
- whobyreq.update(&req, &who);
+ whobyreq.update(&key, &who);
}
return 0;
}
+int trace_pid_start(struct pt_regs *ctx, struct request *req)
+{
+ struct hash_key key = {
+ .dev = ddevt(req->__RQ_DISK__),
+ .sector = req->__sector
+ };
+
+ return __trace_pid_start(key);
+}
+
+int trace_pid_start_tp(struct tp_args *args)
+{
+ struct hash_key key = {
+ .dev = args->dev,
+ .sector = args->sector
+ };
+
+ return __trace_pid_start(key);
+}
+
// time block I/O
int trace_req_start(struct pt_regs *ctx, struct request *req)
{
+ struct hash_key key = {
+ .dev = ddevt(req->__RQ_DISK__),
+ .sector = req->__sector
+ };
struct start_req_t start_req = {
.ts = bpf_ktime_get_ns(),
.data_len = req->__data_len
};
- start.update(&req, &start_req);
+ start.update(&key, &start_req);
return 0;
}
// output
-int trace_req_completion(struct pt_regs *ctx, struct request *req)
+static int __trace_req_completion(struct hash_key key)
{
struct start_req_t *startp;
// fetch timestamp and calculate delta
- startp = start.lookup(&req);
+ startp = start.lookup(&key);
if (startp == 0) {
return 0; // missed tracing issue
}
@@ -135,12 +181,12 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
struct who_t *whop;
u32 pid;
- whop = whobyreq.lookup(&req);
+ whop = whobyreq.lookup(&key);
pid = whop != 0 ? whop->pid : 0;
if (FILTER_PID) {
- start.delete(&req);
+ start.delete(&key);
if (whop != 0) {
- whobyreq.delete(&req);
+ whobyreq.delete(&key);
}
return 0;
}
@@ -150,8 +196,8 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
// setup info_t key
struct info_t info = {};
- info.major = req->__RQ_DISK__->major;
- info.minor = req->__RQ_DISK__->first_minor;
+ info.major = key.dev >> 20;
+ info.minor = key.dev & ((1 << 20) - 1);
/*
* The following deals with a kernel version change (in mainline 4.7, although
* it may be backported to earlier kernels) with how block request write flags
@@ -159,13 +205,13 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
* kernel version tests like this as much as possible: they inflate the code,
* test, and maintenance burden.
*/
-#ifdef REQ_WRITE
+/*#ifdef REQ_WRITE
info.rwflag = !!(req->cmd_flags & REQ_WRITE);
#elif defined(REQ_OP_SHIFT)
info.rwflag = !!((req->cmd_flags >> REQ_OP_SHIFT) == REQ_OP_WRITE);
#else
info.rwflag = !!((req->cmd_flags & REQ_OP_MASK) == REQ_OP_WRITE);
-#endif
+#endif*/
if (whop == 0) {
// missed pid who, save stats as pid 0
@@ -183,11 +229,31 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
valp->io++;
}
- start.delete(&req);
- whobyreq.delete(&req);
+ start.delete(&key);
+ whobyreq.delete(&key);
return 0;
}
+
+int trace_req_completion(struct pt_regs *ctx, struct request *req)
+{
+ struct hash_key key = {
+ .dev = ddevt(req->__RQ_DISK__),
+ .sector = req->__sector
+ };
+
+ return __trace_req_completion(key);
+}
+
+int trace_req_completion_tp(struct tp_args *args)
+{
+ struct hash_key key = {
+ .dev = args->dev,
+ .sector = args->sector
+ };
+
+ return __trace_req_completion(key);
+}
"""
if args.ebpf:
@@ -207,15 +273,19 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
b = BPF(text=bpf_text)
if BPF.get_kprobe_functions(b'__blk_account_io_start'):
b.attach_kprobe(event="__blk_account_io_start", fn_name="trace_pid_start")
-else:
+elif BPF.get_kprobe_functions(b'blk_account_io_start'):
b.attach_kprobe(event="blk_account_io_start", fn_name="trace_pid_start")
+else:
+ b.attach_tracepoint(tp="block:block_io_start", fn_name="trace_pid_start_tp")
if BPF.get_kprobe_functions(b'blk_start_request'):
b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start")
if BPF.get_kprobe_functions(b'__blk_account_io_done'):
b.attach_kprobe(event="__blk_account_io_done", fn_name="trace_req_completion")
-else:
+elif BPF.get_kprobe_functions(b'blk_account_io_done'):
b.attach_kprobe(event="blk_account_io_done", fn_name="trace_req_completion")
+else:
+ b.attach_tracepoint(tp="block:block_io_done", fn_name="trace_req_completion_tp")
print('Tracing... Output every %d secs. Hit Ctrl-C to end' % interval)
--
2.41.0