856 lines
24 KiB
Diff
856 lines
24 KiB
Diff
|
From 2e758b65231f976c67a0aad791aabc7927ea7086 Mon Sep 17 00:00:00 2001
|
||
|
From: Jerome Marchand <jmarchan@redhat.com>
|
||
|
Date: Thu, 27 Jul 2023 18:19:18 +0200
|
||
|
Subject: [PATCH] tools: Add support for the new block_io_* tracepoints
|
||
|
|
||
|
The bio tools currently depend on the blk_account_io_done/start functions
|
||
|
that can be inlined. To fix that, a couple of tracepoints have been
|
||
|
added upstream (block:block_io_start/done). This patch adds support
|
||
|
for those tracepoints when they are available.
|
||
|
|
||
|
Unfortunately, the bio tools rely on data that is not available to
|
||
|
the tracepoints (mostly the struct request). So the tracepoints can't
|
||
|
be used as a drop-in replacement for blk_account_io_*. The main difference
|
||
|
is that we can't use the struct request as the hash key anymore, so it
|
||
|
now uses the pair (dev_t, sector) for that purpose.
|
||
|
|
||
|
For the biolatency tool, the -F option is disabled when only the
|
||
|
tracepoints are available because the flags are not all accessible
|
||
|
from the tracepoints. Otherwise, all features of the tools should
|
||
|
remain.
|
||
|
|
||
|
Closes #4261
|
||
|
|
||
|
Signed-off-by: Jerome Marchand <jmarchan@redhat.com>
|
||
|
---
|
||
|
tools/biolatency.py | 166 ++++++++++++++++++++++++++++--------
|
||
|
tools/biosnoop.py | 200 +++++++++++++++++++++++++++++++++-----------
|
||
|
tools/biotop.py | 108 +++++++++++++++++++-----
|
||
|
3 files changed, 371 insertions(+), 103 deletions(-)
|
||
|
|
||
|
diff --git a/tools/biolatency.py b/tools/biolatency.py
|
||
|
index 8fe43a7c..03b48a4c 100755
|
||
|
--- a/tools/biolatency.py
|
||
|
+++ b/tools/biolatency.py
|
||
|
@@ -11,6 +11,7 @@
|
||
|
#
|
||
|
# 20-Sep-2015 Brendan Gregg Created this.
|
||
|
# 31-Mar-2022 Rocky Xing Added disk filter support.
|
||
|
+# 01-Aug-2023 Jerome Marchand Added support for block tracepoints
|
||
|
|
||
|
from __future__ import print_function
|
||
|
from bcc import BPF
|
||
|
@@ -72,7 +73,7 @@ bpf_text = """
|
||
|
#include <linux/blk-mq.h>
|
||
|
|
||
|
typedef struct disk_key {
|
||
|
- char disk[DISK_NAME_LEN];
|
||
|
+ dev_t dev;
|
||
|
u64 slot;
|
||
|
} disk_key_t;
|
||
|
|
||
|
@@ -86,26 +87,70 @@ typedef struct ext_val {
|
||
|
u64 count;
|
||
|
} ext_val_t;
|
||
|
|
||
|
-BPF_HASH(start, struct request *);
|
||
|
+struct tp_args {
|
||
|
+ u64 __unused__;
|
||
|
+ dev_t dev;
|
||
|
+ sector_t sector;
|
||
|
+ unsigned int nr_sector;
|
||
|
+ unsigned int bytes;
|
||
|
+ char rwbs[8];
|
||
|
+ char comm[16];
|
||
|
+ char cmd[];
|
||
|
+};
|
||
|
+
|
||
|
+struct start_key {
|
||
|
+ dev_t dev;
|
||
|
+ u32 _pad;
|
||
|
+ sector_t sector;
|
||
|
+ CMD_FLAGS
|
||
|
+};
|
||
|
+
|
||
|
+BPF_HASH(start, struct start_key);
|
||
|
STORAGE
|
||
|
|
||
|
+static dev_t ddevt(struct gendisk *disk) {
|
||
|
+ return (disk->major << 20) | disk->first_minor;
|
||
|
+}
|
||
|
+
|
||
|
// time block I/O
|
||
|
-int trace_req_start(struct pt_regs *ctx, struct request *req)
|
||
|
+static int __trace_req_start(struct start_key key)
|
||
|
{
|
||
|
DISK_FILTER
|
||
|
|
||
|
u64 ts = bpf_ktime_get_ns();
|
||
|
- start.update(&req, &ts);
|
||
|
+ start.update(&key, &ts);
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
+int trace_req_start(struct pt_regs *ctx, struct request *req)
|
||
|
+{
|
||
|
+ struct start_key key = {
|
||
|
+ .dev = ddevt(req->__RQ_DISK__),
|
||
|
+ .sector = req->__sector
|
||
|
+ };
|
||
|
+
|
||
|
+ SET_FLAGS
|
||
|
+
|
||
|
+ return __trace_req_start(key);
|
||
|
+}
|
||
|
+
|
||
|
+int trace_req_start_tp(struct tp_args *args)
|
||
|
+{
|
||
|
+ struct start_key key = {
|
||
|
+ .dev = args->dev,
|
||
|
+ .sector = args->sector
|
||
|
+ };
|
||
|
+
|
||
|
+ return __trace_req_start(key);
|
||
|
+}
|
||
|
+
|
||
|
// output
|
||
|
-int trace_req_done(struct pt_regs *ctx, struct request *req)
|
||
|
+static int __trace_req_done(struct start_key key)
|
||
|
{
|
||
|
u64 *tsp, delta;
|
||
|
|
||
|
// fetch timestamp and calculate delta
|
||
|
- tsp = start.lookup(&req);
|
||
|
+ tsp = start.lookup(&key);
|
||
|
if (tsp == 0) {
|
||
|
return 0; // missed issue
|
||
|
}
|
||
|
@@ -116,9 +161,31 @@ int trace_req_done(struct pt_regs *ctx, struct request *req)
|
||
|
// store as histogram
|
||
|
STORE
|
||
|
|
||
|
- start.delete(&req);
|
||
|
+ start.delete(&key);
|
||
|
return 0;
|
||
|
}
|
||
|
+
|
||
|
+int trace_req_done(struct pt_regs *ctx, struct request *req)
|
||
|
+{
|
||
|
+ struct start_key key = {
|
||
|
+ .dev = ddevt(req->__RQ_DISK__),
|
||
|
+ .sector = req->__sector
|
||
|
+ };
|
||
|
+
|
||
|
+ SET_FLAGS
|
||
|
+
|
||
|
+ return __trace_req_done(key);
|
||
|
+}
|
||
|
+
|
||
|
+int trace_req_done_tp(struct tp_args *args)
|
||
|
+{
|
||
|
+ struct start_key key = {
|
||
|
+ .dev = args->dev,
|
||
|
+ .sector = args->sector
|
||
|
+ };
|
||
|
+
|
||
|
+ return __trace_req_done(key);
|
||
|
+}
|
||
|
"""
|
||
|
|
||
|
# code substitutions
|
||
|
@@ -134,21 +201,18 @@ store_str = ""
|
||
|
if args.disks:
|
||
|
storage_str += "BPF_HISTOGRAM(dist, disk_key_t);"
|
||
|
disks_str = """
|
||
|
- disk_key_t key = {.slot = bpf_log2l(delta)};
|
||
|
- void *__tmp = (void *)req->__RQ_DISK__->disk_name;
|
||
|
- bpf_probe_read(&key.disk, sizeof(key.disk), __tmp);
|
||
|
- dist.atomic_increment(key);
|
||
|
+ disk_key_t dkey = {};
|
||
|
+ dkey.dev = key.dev;
|
||
|
+ dkey.slot = bpf_log2l(delta);
|
||
|
+ dist.atomic_increment(dkey);
|
||
|
"""
|
||
|
- if BPF.kernel_struct_has_field(b'request', b'rq_disk') == 1:
|
||
|
- store_str += disks_str.replace('__RQ_DISK__', 'rq_disk')
|
||
|
- else:
|
||
|
- store_str += disks_str.replace('__RQ_DISK__', 'q->disk')
|
||
|
+ store_str += disks_str
|
||
|
elif args.flags:
|
||
|
storage_str += "BPF_HISTOGRAM(dist, flag_key_t);"
|
||
|
store_str += """
|
||
|
- flag_key_t key = {.slot = bpf_log2l(delta)};
|
||
|
- key.flags = req->cmd_flags;
|
||
|
- dist.atomic_increment(key);
|
||
|
+ flag_key_t fkey = {.slot = bpf_log2l(delta)};
|
||
|
+ fkey.flags = key.flags;
|
||
|
+ dist.atomic_increment(fkey);
|
||
|
"""
|
||
|
else:
|
||
|
storage_str += "BPF_HISTOGRAM(dist);"
|
||
|
@@ -161,21 +225,13 @@ store_str = ""
|
||
|
exit(1)
|
||
|
|
||
|
stat_info = os.stat(disk_path)
|
||
|
- major = os.major(stat_info.st_rdev)
|
||
|
- minor = os.minor(stat_info.st_rdev)
|
||
|
-
|
||
|
- disk_field_str = ""
|
||
|
- if BPF.kernel_struct_has_field(b'request', b'rq_disk') == 1:
|
||
|
- disk_field_str = 'req->rq_disk'
|
||
|
- else:
|
||
|
- disk_field_str = 'req->q->disk'
|
||
|
+ dev = os.major(stat_info.st_rdev) << 20 | os.minor(stat_info.st_rdev)
|
||
|
|
||
|
disk_filter_str = """
|
||
|
- struct gendisk *disk = %s;
|
||
|
- if (!(disk->major == %d && disk->first_minor == %d)) {
|
||
|
+ if(key.dev != %s) {
|
||
|
return 0;
|
||
|
}
|
||
|
- """ % (disk_field_str, major, minor)
|
||
|
+ """ % (dev)
|
||
|
|
||
|
bpf_text = bpf_text.replace('DISK_FILTER', disk_filter_str)
|
||
|
else:
|
||
|
@@ -194,6 +250,16 @@ store_str = ""
|
||
|
|
||
|
bpf_text = bpf_text.replace("STORAGE", storage_str)
|
||
|
bpf_text = bpf_text.replace("STORE", store_str)
|
||
|
+if BPF.kernel_struct_has_field(b'request', b'rq_disk') == 1:
|
||
|
+ bpf_text = bpf_text.replace('__RQ_DISK__', 'rq_disk')
|
||
|
+else:
|
||
|
+ bpf_text = bpf_text.replace('__RQ_DISK__', 'q->disk')
|
||
|
+if args.flags:
|
||
|
+ bpf_text = bpf_text.replace('CMD_FLAGS', 'u64 flags;')
|
||
|
+ bpf_text = bpf_text.replace('SET_FLAGS', 'key.flags = req->cmd_flags;')
|
||
|
+else:
|
||
|
+ bpf_text = bpf_text.replace('CMD_FLAGS', '')
|
||
|
+ bpf_text = bpf_text.replace('SET_FLAGS', '')
|
||
|
|
||
|
if debug or args.ebpf:
|
||
|
print(bpf_text)
|
||
|
@@ -205,25 +271,53 @@ b = BPF(text=bpf_text)
|
||
|
if args.queued:
|
||
|
if BPF.get_kprobe_functions(b'__blk_account_io_start'):
|
||
|
b.attach_kprobe(event="__blk_account_io_start", fn_name="trace_req_start")
|
||
|
- else:
|
||
|
+ elif BPF.get_kprobe_functions(b'blk_account_io_start'):
|
||
|
b.attach_kprobe(event="blk_account_io_start", fn_name="trace_req_start")
|
||
|
+ else:
|
||
|
+ if args.flags:
|
||
|
+ # Some flags are accessible in the rwbs field (RAHEAD, SYNC and META)
|
||
|
+ # but others aren't. Disable the -F option for tracepoints for now.
|
||
|
+ print("ERROR: blk_account_io_start probe not available. Can't use -F.")
|
||
|
+ exit()
|
||
|
+ b.attach_tracepoint(tp="block:block_io_start", fn_name="trace_req_start_tp")
|
||
|
else:
|
||
|
if BPF.get_kprobe_functions(b'blk_start_request'):
|
||
|
b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
|
||
|
b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start")
|
||
|
+
|
||
|
if BPF.get_kprobe_functions(b'__blk_account_io_done'):
|
||
|
b.attach_kprobe(event="__blk_account_io_done", fn_name="trace_req_done")
|
||
|
-else:
|
||
|
+elif BPF.get_kprobe_functions(b'blk_account_io_done'):
|
||
|
b.attach_kprobe(event="blk_account_io_done", fn_name="trace_req_done")
|
||
|
+else:
|
||
|
+ if args.flags:
|
||
|
+ print("ERROR: blk_account_io_done probe not available. Can't use -F.")
|
||
|
+ exit()
|
||
|
+ b.attach_tracepoint(tp="block:block_io_done", fn_name="trace_req_done_tp")
|
||
|
+
|
||
|
|
||
|
if not args.json:
|
||
|
print("Tracing block device I/O... Hit Ctrl-C to end.")
|
||
|
|
||
|
-def disk_print(s):
|
||
|
- disk = s.decode('utf-8', 'replace')
|
||
|
- if not disk:
|
||
|
- disk = "<unknown>"
|
||
|
- return disk
|
||
|
+# cache disk major,minor -> diskname
|
||
|
+diskstats = "/proc/diskstats"
|
||
|
+disklookup = {}
|
||
|
+with open(diskstats) as stats:
|
||
|
+ for line in stats:
|
||
|
+ a = line.split()
|
||
|
+ disklookup[a[0] + "," + a[1]] = a[2]
|
||
|
+
|
||
|
+def disk_print(d):
|
||
|
+ major = d >> 20
|
||
|
+ minor = d & ((1 << 20) - 1)
|
||
|
+
|
||
|
+ disk = str(major) + "," + str(minor)
|
||
|
+ if disk in disklookup:
|
||
|
+ diskname = disklookup[disk]
|
||
|
+ else:
|
||
|
+ diskname = "?"
|
||
|
+
|
||
|
+ return diskname
|
||
|
|
||
|
# see blk_fill_rwbs():
|
||
|
req_opf = {
|
||
|
diff --git a/tools/biosnoop.py b/tools/biosnoop.py
|
||
|
index 33703233..f0fef98b 100755
|
||
|
--- a/tools/biosnoop.py
|
||
|
+++ b/tools/biosnoop.py
|
||
|
@@ -14,6 +14,7 @@
|
||
|
# 11-Feb-2016 Allan McAleavy updated for BPF_PERF_OUTPUT
|
||
|
# 21-Jun-2022 Rocky Xing Added disk filter support.
|
||
|
# 13-Oct-2022 Rocky Xing Added support for displaying block I/O pattern.
|
||
|
+# 01-Aug-2023 Jerome Marchand Added support for block tracepoints
|
||
|
|
||
|
from __future__ import print_function
|
||
|
from bcc import BPF
|
||
|
@@ -64,6 +65,24 @@ struct val_t {
|
||
|
char name[TASK_COMM_LEN];
|
||
|
};
|
||
|
|
||
|
+struct tp_args {
|
||
|
+ u64 __unused__;
|
||
|
+ dev_t dev;
|
||
|
+ sector_t sector;
|
||
|
+ unsigned int nr_sector;
|
||
|
+ unsigned int bytes;
|
||
|
+ char rwbs[8];
|
||
|
+ char comm[16];
|
||
|
+ char cmd[];
|
||
|
+};
|
||
|
+
|
||
|
+struct hash_key {
|
||
|
+ dev_t dev;
|
||
|
+ u32 rwflag;
|
||
|
+ sector_t sector;
|
||
|
+};
|
||
|
+
|
||
|
+
|
||
|
#ifdef INCLUDE_PATTERN
|
||
|
struct sector_key_t {
|
||
|
u32 dev_major;
|
||
|
@@ -79,6 +98,7 @@ enum bio_pattern {
|
||
|
|
||
|
struct data_t {
|
||
|
u32 pid;
|
||
|
+ u32 dev;
|
||
|
u64 rwflag;
|
||
|
u64 delta;
|
||
|
u64 qdelta;
|
||
|
@@ -88,7 +108,6 @@ struct data_t {
|
||
|
enum bio_pattern pattern;
|
||
|
#endif
|
||
|
u64 ts;
|
||
|
- char disk_name[DISK_NAME_LEN];
|
||
|
char name[TASK_COMM_LEN];
|
||
|
};
|
||
|
|
||
|
@@ -96,12 +115,45 @@ struct data_t {
|
||
|
BPF_HASH(last_sectors, struct sector_key_t, u64);
|
||
|
#endif
|
||
|
|
||
|
-BPF_HASH(start, struct request *, struct start_req_t);
|
||
|
-BPF_HASH(infobyreq, struct request *, struct val_t);
|
||
|
+BPF_HASH(start, struct hash_key, struct start_req_t);
|
||
|
+BPF_HASH(infobyreq, struct hash_key, struct val_t);
|
||
|
BPF_PERF_OUTPUT(events);
|
||
|
|
||
|
+static dev_t ddevt(struct gendisk *disk) {
|
||
|
+ return (disk->major << 20) | disk->first_minor;
|
||
|
+}
|
||
|
+
|
||
|
+/*
|
||
|
+ * The following deals with a kernel version change (in mainline 4.7, although
|
||
|
+ * it may be backported to earlier kernels) with how block request write flags
|
||
|
+ * are tested. We handle both pre- and post-change versions here. Please avoid
|
||
|
+ * kernel version tests like this as much as possible: they inflate the code,
|
||
|
+ * test, and maintenance burden.
|
||
|
+ */
|
||
|
+static int get_rwflag(u32 cmd_flags) {
|
||
|
+#ifdef REQ_WRITE
|
||
|
+ return !!(cmd_flags & REQ_WRITE);
|
||
|
+#elif defined(REQ_OP_SHIFT)
|
||
|
+ return !!((cmd_flags >> REQ_OP_SHIFT) == REQ_OP_WRITE);
|
||
|
+#else
|
||
|
+ return !!((cmd_flags & REQ_OP_MASK) == REQ_OP_WRITE);
|
||
|
+#endif
|
||
|
+}
|
||
|
+
|
||
|
+#define RWBS_LEN 8
|
||
|
+
|
||
|
+static int get_rwflag_tp(char *rwbs) {
|
||
|
+ for (int i = 0; i < RWBS_LEN; i++) {
|
||
|
+ if (rwbs[i] == 'W')
|
||
|
+ return 1;
|
||
|
+ if (rwbs[i] == '\\0')
|
||
|
+ return 0;
|
||
|
+ }
|
||
|
+ return 0;
|
||
|
+}
|
||
|
+
|
||
|
// cache PID and comm by-req
|
||
|
-int trace_pid_start(struct pt_regs *ctx, struct request *req)
|
||
|
+static int __trace_pid_start(struct hash_key key)
|
||
|
{
|
||
|
DISK_FILTER
|
||
|
|
||
|
@@ -113,47 +165,76 @@ int trace_pid_start(struct pt_regs *ctx, struct request *req)
|
||
|
if (##QUEUE##) {
|
||
|
val.ts = bpf_ktime_get_ns();
|
||
|
}
|
||
|
- infobyreq.update(&req, &val);
|
||
|
+ infobyreq.update(&key, &val);
|
||
|
}
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
+
|
||
|
+int trace_pid_start(struct pt_regs *ctx, struct request *req)
|
||
|
+{
|
||
|
+ struct hash_key key = {
|
||
|
+ .dev = ddevt(req->__RQ_DISK__),
|
||
|
+ .rwflag = get_rwflag(req->cmd_flags),
|
||
|
+ .sector = req->__sector
|
||
|
+ };
|
||
|
+
|
||
|
+ return __trace_pid_start(key);
|
||
|
+}
|
||
|
+
|
||
|
+int trace_pid_start_tp(struct tp_args *args)
|
||
|
+{
|
||
|
+ struct hash_key key = {
|
||
|
+ .dev = args->dev,
|
||
|
+ .rwflag = get_rwflag_tp(args->rwbs),
|
||
|
+ .sector = args->sector
|
||
|
+ };
|
||
|
+
|
||
|
+ return __trace_pid_start(key);
|
||
|
+}
|
||
|
+
|
||
|
// time block I/O
|
||
|
int trace_req_start(struct pt_regs *ctx, struct request *req)
|
||
|
{
|
||
|
+ struct hash_key key = {
|
||
|
+ .dev = ddevt(req->__RQ_DISK__),
|
||
|
+ .rwflag = get_rwflag(req->cmd_flags),
|
||
|
+ .sector = req->__sector
|
||
|
+ };
|
||
|
+
|
||
|
DISK_FILTER
|
||
|
|
||
|
struct start_req_t start_req = {
|
||
|
.ts = bpf_ktime_get_ns(),
|
||
|
.data_len = req->__data_len
|
||
|
};
|
||
|
- start.update(&req, &start_req);
|
||
|
+ start.update(&key, &start_req);
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
// output
|
||
|
-int trace_req_completion(struct pt_regs *ctx, struct request *req)
|
||
|
+static int __trace_req_completion(void *ctx, struct hash_key key)
|
||
|
{
|
||
|
struct start_req_t *startp;
|
||
|
struct val_t *valp;
|
||
|
struct data_t data = {};
|
||
|
- struct gendisk *rq_disk;
|
||
|
+ //struct gendisk *rq_disk;
|
||
|
u64 ts;
|
||
|
|
||
|
// fetch timestamp and calculate delta
|
||
|
- startp = start.lookup(&req);
|
||
|
+ startp = start.lookup(&key);
|
||
|
if (startp == 0) {
|
||
|
// missed tracing issue
|
||
|
return 0;
|
||
|
}
|
||
|
ts = bpf_ktime_get_ns();
|
||
|
- rq_disk = req->__RQ_DISK__;
|
||
|
+ //rq_disk = req->__RQ_DISK__;
|
||
|
data.delta = ts - startp->ts;
|
||
|
data.ts = ts / 1000;
|
||
|
data.qdelta = 0;
|
||
|
data.len = startp->data_len;
|
||
|
|
||
|
- valp = infobyreq.lookup(&req);
|
||
|
+ valp = infobyreq.lookup(&key);
|
||
|
if (valp == 0) {
|
||
|
data.name[0] = '?';
|
||
|
data.name[1] = 0;
|
||
|
@@ -162,10 +243,9 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
|
||
|
data.qdelta = startp->ts - valp->ts;
|
||
|
}
|
||
|
data.pid = valp->pid;
|
||
|
- data.sector = req->__sector;
|
||
|
+ data.sector = key.sector;
|
||
|
+ data.dev = key.dev;
|
||
|
bpf_probe_read_kernel(&data.name, sizeof(data.name), valp->name);
|
||
|
- bpf_probe_read_kernel(&data.disk_name, sizeof(data.disk_name),
|
||
|
- rq_disk->disk_name);
|
||
|
}
|
||
|
|
||
|
#ifdef INCLUDE_PATTERN
|
||
|
@@ -174,8 +254,8 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
|
||
|
u64 *sector, last_sector;
|
||
|
|
||
|
struct sector_key_t sector_key = {
|
||
|
- .dev_major = rq_disk->major,
|
||
|
- .dev_minor = rq_disk->first_minor
|
||
|
+ .dev_major = key.dev >> 20,
|
||
|
+ .dev_minor = key.dev & ((1 << 20) - 1)
|
||
|
};
|
||
|
|
||
|
sector = last_sectors.lookup(&sector_key);
|
||
|
@@ -187,27 +267,36 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
|
||
|
last_sectors.update(&sector_key, &last_sector);
|
||
|
#endif
|
||
|
|
||
|
-/*
|
||
|
- * The following deals with a kernel version change (in mainline 4.7, although
|
||
|
- * it may be backported to earlier kernels) with how block request write flags
|
||
|
- * are tested. We handle both pre- and post-change versions here. Please avoid
|
||
|
- * kernel version tests like this as much as possible: they inflate the code,
|
||
|
- * test, and maintenance burden.
|
||
|
- */
|
||
|
-#ifdef REQ_WRITE
|
||
|
- data.rwflag = !!(req->cmd_flags & REQ_WRITE);
|
||
|
-#elif defined(REQ_OP_SHIFT)
|
||
|
- data.rwflag = !!((req->cmd_flags >> REQ_OP_SHIFT) == REQ_OP_WRITE);
|
||
|
-#else
|
||
|
- data.rwflag = !!((req->cmd_flags & REQ_OP_MASK) == REQ_OP_WRITE);
|
||
|
-#endif
|
||
|
+ data.rwflag = key.rwflag;
|
||
|
|
||
|
events.perf_submit(ctx, &data, sizeof(data));
|
||
|
- start.delete(&req);
|
||
|
- infobyreq.delete(&req);
|
||
|
+ start.delete(&key);
|
||
|
+ infobyreq.delete(&key);
|
||
|
|
||
|
return 0;
|
||
|
}
|
||
|
+
|
||
|
+int trace_req_completion(struct pt_regs *ctx, struct request *req)
|
||
|
+{
|
||
|
+ struct hash_key key = {
|
||
|
+ .dev = ddevt(req->__RQ_DISK__),
|
||
|
+ .rwflag = get_rwflag(req->cmd_flags),
|
||
|
+ .sector = req->__sector
|
||
|
+ };
|
||
|
+
|
||
|
+ return __trace_req_completion(ctx, key);
|
||
|
+}
|
||
|
+
|
||
|
+int trace_req_completion_tp(struct tp_args *args)
|
||
|
+{
|
||
|
+ struct hash_key key = {
|
||
|
+ .dev = args->dev,
|
||
|
+ .rwflag = get_rwflag_tp(args->rwbs),
|
||
|
+ .sector = args->sector
|
||
|
+ };
|
||
|
+
|
||
|
+ return __trace_req_completion(args, key);
|
||
|
+}
|
||
|
"""
|
||
|
if args.queue:
|
||
|
bpf_text = bpf_text.replace('##QUEUE##', '1')
|
||
|
@@ -225,21 +314,13 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
|
||
|
exit(1)
|
||
|
|
||
|
stat_info = os.stat(disk_path)
|
||
|
- major = os.major(stat_info.st_rdev)
|
||
|
- minor = os.minor(stat_info.st_rdev)
|
||
|
-
|
||
|
- disk_field_str = ""
|
||
|
- if BPF.kernel_struct_has_field(b'request', b'rq_disk') == 1:
|
||
|
- disk_field_str = 'req->rq_disk'
|
||
|
- else:
|
||
|
- disk_field_str = 'req->q->disk'
|
||
|
+ dev = os.major(stat_info.st_rdev) << 20 | os.minor(stat_info.st_rdev)
|
||
|
|
||
|
disk_filter_str = """
|
||
|
- struct gendisk *disk = %s;
|
||
|
- if (!(disk->major == %d && disk->first_minor == %d)) {
|
||
|
+ if(key.dev != %s) {
|
||
|
return 0;
|
||
|
}
|
||
|
- """ % (disk_field_str, major, minor)
|
||
|
+ """ % (dev)
|
||
|
|
||
|
bpf_text = bpf_text.replace('DISK_FILTER', disk_filter_str)
|
||
|
else:
|
||
|
@@ -254,15 +335,19 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
|
||
|
b = BPF(text=bpf_text)
|
||
|
if BPF.get_kprobe_functions(b'__blk_account_io_start'):
|
||
|
b.attach_kprobe(event="__blk_account_io_start", fn_name="trace_pid_start")
|
||
|
-else:
|
||
|
+elif BPF.get_kprobe_functions(b'blk_account_io_start'):
|
||
|
b.attach_kprobe(event="blk_account_io_start", fn_name="trace_pid_start")
|
||
|
+else:
|
||
|
+ b.attach_tracepoint(tp="block:block_io_start", fn_name="trace_pid_start_tp")
|
||
|
if BPF.get_kprobe_functions(b'blk_start_request'):
|
||
|
b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
|
||
|
b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start")
|
||
|
if BPF.get_kprobe_functions(b'__blk_account_io_done'):
|
||
|
b.attach_kprobe(event="__blk_account_io_done", fn_name="trace_req_completion")
|
||
|
-else:
|
||
|
+elif BPF.get_kprobe_functions(b'blk_account_io_done'):
|
||
|
b.attach_kprobe(event="blk_account_io_done", fn_name="trace_req_completion")
|
||
|
+else:
|
||
|
+ b.attach_tracepoint(tp="block:block_io_done", fn_name="trace_req_completion_tp")
|
||
|
|
||
|
# header
|
||
|
print("%-11s %-14s %-7s %-9s %-1s %-10s %-7s" % ("TIME(s)", "COMM", "PID",
|
||
|
@@ -273,6 +358,27 @@ print("%-11s %-14s %-7s %-9s %-1s %-10s %-7s" % ("TIME(s)", "COMM", "PID",
|
||
|
print("%7s " % ("QUE(ms)"), end="")
|
||
|
print("%7s" % "LAT(ms)")
|
||
|
|
||
|
+
|
||
|
+# cache disk major,minor -> diskname
|
||
|
+diskstats = "/proc/diskstats"
|
||
|
+disklookup = {}
|
||
|
+with open(diskstats) as stats:
|
||
|
+ for line in stats:
|
||
|
+ a = line.split()
|
||
|
+ disklookup[a[0] + "," + a[1]] = a[2]
|
||
|
+
|
||
|
+def disk_print(d):
|
||
|
+ major = d >> 20
|
||
|
+ minor = d & ((1 << 20) - 1)
|
||
|
+
|
||
|
+ disk = str(major) + "," + str(minor)
|
||
|
+ if disk in disklookup:
|
||
|
+ diskname = disklookup[disk]
|
||
|
+ else:
|
||
|
+ diskname = "<unknown>"
|
||
|
+
|
||
|
+ return diskname
|
||
|
+
|
||
|
rwflg = ""
|
||
|
pattern = ""
|
||
|
start_ts = 0
|
||
|
@@ -297,9 +403,7 @@ P_RANDOM = 2
|
||
|
|
||
|
delta = float(event.ts) - start_ts
|
||
|
|
||
|
- disk_name = event.disk_name.decode('utf-8', 'replace')
|
||
|
- if not disk_name:
|
||
|
- disk_name = '<unknown>'
|
||
|
+ disk_name = disk_print(event.dev)
|
||
|
|
||
|
print("%-11.6f %-14.14s %-7s %-9s %-1s %-10s %-7s" % (
|
||
|
delta / 1000000, event.name.decode('utf-8', 'replace'), event.pid,
|
||
|
diff --git a/tools/biotop.py b/tools/biotop.py
|
||
|
index fcdd373f..2620983a 100755
|
||
|
--- a/tools/biotop.py
|
||
|
+++ b/tools/biotop.py
|
||
|
@@ -14,6 +14,7 @@
|
||
|
#
|
||
|
# 06-Feb-2016 Brendan Gregg Created this.
|
||
|
# 17-Mar-2022 Rocky Xing Added PID filter support.
|
||
|
+# 01-Aug-2023 Jerome Marchand Added support for block tracepoints
|
||
|
|
||
|
from __future__ import print_function
|
||
|
from bcc import BPF
|
||
|
@@ -88,14 +89,35 @@ struct val_t {
|
||
|
u32 io;
|
||
|
};
|
||
|
|
||
|
-BPF_HASH(start, struct request *, struct start_req_t);
|
||
|
-BPF_HASH(whobyreq, struct request *, struct who_t);
|
||
|
+struct tp_args {
|
||
|
+ u64 __unused__;
|
||
|
+ dev_t dev;
|
||
|
+ sector_t sector;
|
||
|
+ unsigned int nr_sector;
|
||
|
+ unsigned int bytes;
|
||
|
+ char rwbs[8];
|
||
|
+ char comm[16];
|
||
|
+ char cmd[];
|
||
|
+};
|
||
|
+
|
||
|
+struct hash_key {
|
||
|
+ dev_t dev;
|
||
|
+ u32 _pad;
|
||
|
+ sector_t sector;
|
||
|
+};
|
||
|
+
|
||
|
+BPF_HASH(start, struct hash_key, struct start_req_t);
|
||
|
+BPF_HASH(whobyreq, struct hash_key, struct who_t);
|
||
|
BPF_HASH(counts, struct info_t, struct val_t);
|
||
|
|
||
|
+static dev_t ddevt(struct gendisk *disk) {
|
||
|
+ return (disk->major << 20) | disk->first_minor;
|
||
|
+}
|
||
|
+
|
||
|
// cache PID and comm by-req
|
||
|
-int trace_pid_start(struct pt_regs *ctx, struct request *req)
|
||
|
+static int __trace_pid_start(struct hash_key key)
|
||
|
{
|
||
|
- struct who_t who = {};
|
||
|
+ struct who_t who;
|
||
|
u32 pid;
|
||
|
|
||
|
if (bpf_get_current_comm(&who.name, sizeof(who.name)) == 0) {
|
||
|
@@ -104,30 +126,54 @@ int trace_pid_start(struct pt_regs *ctx, struct request *req)
|
||
|
return 0;
|
||
|
|
||
|
who.pid = pid;
|
||
|
- whobyreq.update(&req, &who);
|
||
|
+ whobyreq.update(&key, &who);
|
||
|
}
|
||
|
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
+int trace_pid_start(struct pt_regs *ctx, struct request *req)
|
||
|
+{
|
||
|
+ struct hash_key key = {
|
||
|
+ .dev = ddevt(req->__RQ_DISK__),
|
||
|
+ .sector = req->__sector
|
||
|
+ };
|
||
|
+
|
||
|
+ return __trace_pid_start(key);
|
||
|
+}
|
||
|
+
|
||
|
+int trace_pid_start_tp(struct tp_args *args)
|
||
|
+{
|
||
|
+ struct hash_key key = {
|
||
|
+ .dev = args->dev,
|
||
|
+ .sector = args->sector
|
||
|
+ };
|
||
|
+
|
||
|
+ return __trace_pid_start(key);
|
||
|
+}
|
||
|
+
|
||
|
// time block I/O
|
||
|
int trace_req_start(struct pt_regs *ctx, struct request *req)
|
||
|
{
|
||
|
+ struct hash_key key = {
|
||
|
+ .dev = ddevt(req->__RQ_DISK__),
|
||
|
+ .sector = req->__sector
|
||
|
+ };
|
||
|
struct start_req_t start_req = {
|
||
|
.ts = bpf_ktime_get_ns(),
|
||
|
.data_len = req->__data_len
|
||
|
};
|
||
|
- start.update(&req, &start_req);
|
||
|
+ start.update(&key, &start_req);
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
// output
|
||
|
-int trace_req_completion(struct pt_regs *ctx, struct request *req)
|
||
|
+static int __trace_req_completion(struct hash_key key)
|
||
|
{
|
||
|
struct start_req_t *startp;
|
||
|
|
||
|
// fetch timestamp and calculate delta
|
||
|
- startp = start.lookup(&req);
|
||
|
+ startp = start.lookup(&key);
|
||
|
if (startp == 0) {
|
||
|
return 0; // missed tracing issue
|
||
|
}
|
||
|
@@ -135,12 +181,12 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
|
||
|
struct who_t *whop;
|
||
|
u32 pid;
|
||
|
|
||
|
- whop = whobyreq.lookup(&req);
|
||
|
+ whop = whobyreq.lookup(&key);
|
||
|
pid = whop != 0 ? whop->pid : 0;
|
||
|
if (FILTER_PID) {
|
||
|
- start.delete(&req);
|
||
|
+ start.delete(&key);
|
||
|
if (whop != 0) {
|
||
|
- whobyreq.delete(&req);
|
||
|
+ whobyreq.delete(&key);
|
||
|
}
|
||
|
return 0;
|
||
|
}
|
||
|
@@ -150,8 +196,8 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
|
||
|
|
||
|
// setup info_t key
|
||
|
struct info_t info = {};
|
||
|
- info.major = req->__RQ_DISK__->major;
|
||
|
- info.minor = req->__RQ_DISK__->first_minor;
|
||
|
+ info.major = key.dev >> 20;
|
||
|
+ info.minor = key.dev & ((1 << 20) - 1);
|
||
|
/*
|
||
|
* The following deals with a kernel version change (in mainline 4.7, although
|
||
|
* it may be backported to earlier kernels) with how block request write flags
|
||
|
@@ -159,13 +205,13 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
|
||
|
* kernel version tests like this as much as possible: they inflate the code,
|
||
|
* test, and maintenance burden.
|
||
|
*/
|
||
|
-#ifdef REQ_WRITE
|
||
|
+/*#ifdef REQ_WRITE
|
||
|
info.rwflag = !!(req->cmd_flags & REQ_WRITE);
|
||
|
#elif defined(REQ_OP_SHIFT)
|
||
|
info.rwflag = !!((req->cmd_flags >> REQ_OP_SHIFT) == REQ_OP_WRITE);
|
||
|
#else
|
||
|
info.rwflag = !!((req->cmd_flags & REQ_OP_MASK) == REQ_OP_WRITE);
|
||
|
-#endif
|
||
|
+#endif*/
|
||
|
|
||
|
if (whop == 0) {
|
||
|
// missed pid who, save stats as pid 0
|
||
|
@@ -183,11 +229,31 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
|
||
|
valp->io++;
|
||
|
}
|
||
|
|
||
|
- start.delete(&req);
|
||
|
- whobyreq.delete(&req);
|
||
|
+ start.delete(&key);
|
||
|
+ whobyreq.delete(&key);
|
||
|
|
||
|
return 0;
|
||
|
}
|
||
|
+
|
||
|
+int trace_req_completion(struct pt_regs *ctx, struct request *req)
|
||
|
+{
|
||
|
+ struct hash_key key = {
|
||
|
+ .dev = ddevt(req->__RQ_DISK__),
|
||
|
+ .sector = req->__sector
|
||
|
+ };
|
||
|
+
|
||
|
+ return __trace_req_completion(key);
|
||
|
+}
|
||
|
+
|
||
|
+int trace_req_completion_tp(struct tp_args *args)
|
||
|
+{
|
||
|
+ struct hash_key key = {
|
||
|
+ .dev = args->dev,
|
||
|
+ .sector = args->sector
|
||
|
+ };
|
||
|
+
|
||
|
+ return __trace_req_completion(key);
|
||
|
+}
|
||
|
"""
|
||
|
|
||
|
if args.ebpf:
|
||
|
@@ -207,15 +273,19 @@ int trace_req_completion(struct pt_regs *ctx, struct request *req)
|
||
|
b = BPF(text=bpf_text)
|
||
|
if BPF.get_kprobe_functions(b'__blk_account_io_start'):
|
||
|
b.attach_kprobe(event="__blk_account_io_start", fn_name="trace_pid_start")
|
||
|
-else:
|
||
|
+elif BPF.get_kprobe_functions(b'blk_account_io_start'):
|
||
|
b.attach_kprobe(event="blk_account_io_start", fn_name="trace_pid_start")
|
||
|
+else:
|
||
|
+ b.attach_tracepoint(tp="block:block_io_start", fn_name="trace_pid_start_tp")
|
||
|
if BPF.get_kprobe_functions(b'blk_start_request'):
|
||
|
b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
|
||
|
b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start")
|
||
|
if BPF.get_kprobe_functions(b'__blk_account_io_done'):
|
||
|
b.attach_kprobe(event="__blk_account_io_done", fn_name="trace_req_completion")
|
||
|
-else:
|
||
|
+elif BPF.get_kprobe_functions(b'blk_account_io_done'):
|
||
|
b.attach_kprobe(event="blk_account_io_done", fn_name="trace_req_completion")
|
||
|
+else:
|
||
|
+ b.attach_tracepoint(tp="block:block_io_done", fn_name="trace_req_completion_tp")
|
||
|
|
||
|
print('Tracing... Output every %d secs. Hit Ctrl-C to end' % interval)
|
||
|
|
||
|
--
|
||
|
2.41.0
|
||
|
|