// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
// Copyright (c) 2021 Facebook
// Copyright (c) 2021 Google
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

#define MAX_LEVELS  10  // max cgroup hierarchy level: arbitrary
#define MAX_EVENTS  32  // max events per cgroup: arbitrary

// NOTE: many of the maps and global data will be modified before loading
//       from userspace (the perf tool) using the skeleton helpers.
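//       In particular, max_entries of the maps below and the num_events,
//       num_cpus and use_cgroup_v2 constants are expected to be set to
//       match the monitored events, CPUs and cgroups before load.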

// single set of global perf events to measure
struct {
	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(int));
	__uint(max_entries, 1);
} events SEC(".maps");

// from cgroup id to event index
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64));
	__uint(value_size, sizeof(__u32));
	__uint(max_entries, 1);
} cgrp_idx SEC(".maps");

// per-cpu event snapshots to calculate delta
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(struct bpf_perf_event_value));
} prev_readings SEC(".maps");

// aggregated event values for each cgroup (per-cpu)
// will be read from the user-space
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(struct bpf_perf_event_value));
} cgrp_readings SEC(".maps");

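/*
 * The ___new/___old suffix is libbpf's CO-RE "flavor" convention: the part
 * after the triple underscore is ignored during relocation, so both structs
 * below match the kernel's struct cgroup.  bpf_core_field_exists() is then
 * used to pick whichever layout the running kernel actually provides.
 */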
/* new kernel cgroup definition */
struct cgroup___new {
	int level;
	struct cgroup *ancestors[];
} __attribute__((preserve_access_index));

/* old kernel cgroup definition */
struct cgroup___old {
	int level;
	u64 ancestor_ids[];
} __attribute__((preserve_access_index));

const volatile __u32 num_events = 1;
const volatile __u32 num_cpus = 1;
const volatile int use_cgroup_v2 = 0;

int enabled = 0;
int perf_subsys_id = -1;

static inline __u64 get_cgroup_v1_ancestor_id(struct cgroup *cgrp, int level)
{
	/* recast pointer to capture new type for compiler */
	struct cgroup___new *cgrp_new = (void *)cgrp;

	if (bpf_core_field_exists(cgrp_new->ancestors)) {
		return BPF_CORE_READ(cgrp_new, ancestors[level], kn, id);
	} else {
		/* recast pointer to capture old type for compiler */
		struct cgroup___old *cgrp_old = (void *)cgrp;

		return BPF_CORE_READ(cgrp_old, ancestor_ids[level]);
	}
}

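/*
 * Walk the perf_event cgroup hierarchy of the current task (cgroup v1) and
 * translate each ancestor's cgroup id into a map index via cgrp_idx.  Only
 * cgroups the user asked to monitor have an entry there, so the rest are
 * skipped.  Returns the number of matched cgroups (at most 'size').
 */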
static inline int get_cgroup_v1_idx(__u32 *cgrps, int size)
{
	struct task_struct *p = (void *)bpf_get_current_task();
	struct cgroup *cgrp;
	register int i = 0;
	__u32 *elem;
	int level;
	int cnt;

	if (perf_subsys_id == -1) {
#if __has_builtin(__builtin_preserve_enum_value)
		perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
						     perf_event_cgrp_id);
#else
		perf_subsys_id = perf_event_cgrp_id;
#endif
	}
	cgrp = BPF_CORE_READ(p, cgroups, subsys[perf_subsys_id], cgroup);
	level = BPF_CORE_READ(cgrp, level);

	for (cnt = 0; i < MAX_LEVELS; i++) {
		__u64 cgrp_id;

		if (i > level)
			break;

		// convert cgroup-id to a map index
		cgrp_id = get_cgroup_v1_ancestor_id(cgrp, i);
		elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id);
		if (!elem)
			continue;

		cgrps[cnt++] = *elem;
		if (cnt == size)
			break;
	}

	return cnt;
}

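/*
 * Same as above, but for cgroup v2: bpf_get_current_ancestor_cgroup_id()
 * walks the unified hierarchy directly and there is nothing left to read
 * once it returns 0.
 */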
static inline int get_cgroup_v2_idx(__u32 *cgrps, int size)
{
	register int i = 0;
	__u32 *elem;
	int cnt;

	for (cnt = 0; i < MAX_LEVELS; i++) {
		__u64 cgrp_id = bpf_get_current_ancestor_cgroup_id(i);

		if (cgrp_id == 0)
			break;

		// convert cgroup-id to a map index
		elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id);
		if (!elem)
			continue;

		cgrps[cnt++] = *elem;
		if (cnt == size)
			break;
	}

	return cnt;
}

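/*
 * Read the current value of every perf event on this CPU, compute the delta
 * against the per-cpu snapshot in prev_readings, and add that delta to the
 * cgrp_readings slot of each monitored ancestor cgroup of the current task.
 * Deltas are only accumulated while 'enabled' is set by the perf tool.
 */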
static int bperf_cgroup_count(void)
{
	register __u32 idx = 0;  // to have it in a register to pass BPF verifier
	register int c = 0;
	struct bpf_perf_event_value val, delta, *prev_val, *cgrp_val;
	__u32 cpu = bpf_get_smp_processor_id();
	__u32 cgrp_idx[MAX_LEVELS];
	int cgrp_cnt;
	__u32 key, cgrp;
	long err;

	if (use_cgroup_v2)
		cgrp_cnt = get_cgroup_v2_idx(cgrp_idx, MAX_LEVELS);
	else
		cgrp_cnt = get_cgroup_v1_idx(cgrp_idx, MAX_LEVELS);

	for ( ; idx < MAX_EVENTS; idx++) {
		if (idx == num_events)
			break;

		// XXX: do not pass idx directly (for verifier)
		key = idx;
		// this is per-cpu array for diff
		prev_val = bpf_map_lookup_elem(&prev_readings, &key);
		if (!prev_val) {
			val.counter = val.enabled = val.running = 0;
			bpf_map_update_elem(&prev_readings, &key, &val, BPF_ANY);

			prev_val = bpf_map_lookup_elem(&prev_readings, &key);
			if (!prev_val)
				continue;
		}

		// read from global perf_event array
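		// the 'events' array holds num_events * num_cpus entries,
		// grouped by event index, so this picks this CPU's counter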
		key = idx * num_cpus + cpu;
		err = bpf_perf_event_read_value(&events, key, &val, sizeof(val));
		if (err)
			continue;

		if (enabled) {
			delta.counter = val.counter - prev_val->counter;
			delta.enabled = val.enabled - prev_val->enabled;
			delta.running = val.running - prev_val->running;

			for (c = 0; c < MAX_LEVELS; c++) {
				if (c == cgrp_cnt)
					break;

				cgrp = cgrp_idx[c];

				// aggregate the result by cgroup
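				// cgrp_readings keeps one per-cpu slot per
				// (cgroup index, event index) pair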
				key = cgrp * num_events + idx;
				cgrp_val = bpf_map_lookup_elem(&cgrp_readings, &key);
				if (cgrp_val) {
					cgrp_val->counter += delta.counter;
					cgrp_val->enabled += delta.enabled;
					cgrp_val->running += delta.running;
				} else {
					bpf_map_update_elem(&cgrp_readings, &key,
							    &delta, BPF_ANY);
				}
			}
		}

		*prev_val = val;
	}
	return 0;
}

// This will be attached to cgroup-switches event for each cpu
SEC("perf_event")
int BPF_PROG(on_cgrp_switch)
{
	return bperf_cgroup_count();
}

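// This can be run by the perf tool on each CPU to force an immediate
// reading of the counters (e.g. right before collecting results).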
| SEC("raw_tp/sched_switch")
 | |
| int BPF_PROG(trigger_read)
 | |
| {
 | |
| 	return bperf_cgroup_count();
 | |
| }
 | |
| 
 | |
| char LICENSE[] SEC("license") = "Dual BSD/GPL";
 |