468 lines
		
	
	
		
			18 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
			
		
		
	
	
			468 lines
		
	
	
		
			18 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
 | 
						|
Performance Counters for Linux
 | 
						|
------------------------------
 | 
						|
 | 
						|
Performance counters are special hardware registers available on most modern
 | 
						|
CPUs. These registers count the number of certain types of hw events: such
 | 
						|
as instructions executed, cache misses suffered, or branches mis-predicted -
 | 
						|
without slowing down the kernel or applications. These registers can also
 | 
						|
trigger interrupts when a threshold number of events have passed - and can
 | 
						|
thus be used to profile the code that runs on that CPU.
 | 
						|
 | 
						|
The Linux Performance Counter subsystem provides an abstraction of these
 | 
						|
hardware capabilities. It provides per task and per CPU counters, counter
 | 
						|
groups, and it provides event capabilities on top of those.  It
 | 
						|
provides "virtual" 64-bit counters, regardless of the width of the
 | 
						|
underlying hardware counters.
 | 
						|
 | 
						|
Performance counters are accessed via special file descriptors.
 | 
						|
There's one file descriptor per virtual counter used.
 | 
						|
 | 
						|
The special file descriptor is opened via the sys_perf_event_open()
 | 
						|
system call:
 | 
						|
 | 
						|
   int sys_perf_event_open(struct perf_event_attr *hw_event_uptr,
 | 
						|
			     pid_t pid, int cpu, int group_fd,
 | 
						|
			     unsigned long flags);
 | 
						|
 | 
						|
The syscall returns the new fd. The fd can be used via the normal
 | 
						|
VFS system calls: read() can be used to read the counter, fcntl()
 | 
						|
can be used to set the blocking mode, etc.
 | 
						|
 | 
						|
Multiple counters can be kept open at a time, and the counters
 | 
						|
can be poll()ed.
 | 
						|
 | 
						|
When creating a new counter fd, 'perf_event_attr' is:
 | 
						|
 | 
						|
struct perf_event_attr {
 | 
						|
        /*
 | 
						|
         * The MSB of the config word signifies if the rest contains cpu
 | 
						|
         * specific (raw) counter configuration data, if unset, the next
 | 
						|
         * 7 bits are an event type and the rest of the bits are the event
 | 
						|
         * identifier.
 | 
						|
         */
 | 
						|
        __u64                   config;
 | 
						|
 | 
						|
        __u64                   irq_period;
 | 
						|
        __u32                   record_type;
 | 
						|
        __u32                   read_format;
 | 
						|
 | 
						|
        __u64                   disabled       :  1, /* off by default        */
 | 
						|
                                inherit        :  1, /* children inherit it   */
 | 
						|
                                pinned         :  1, /* must always be on PMU */
 | 
						|
                                exclusive      :  1, /* only group on PMU     */
 | 
						|
                                exclude_user   :  1, /* don't count user      */
 | 
						|
                                exclude_kernel :  1, /* ditto kernel          */
 | 
						|
                                exclude_hv     :  1, /* ditto hypervisor      */
 | 
						|
                                exclude_idle   :  1, /* don't count when idle */
 | 
						|
                                mmap           :  1, /* include mmap data     */
 | 
						|
                                munmap         :  1, /* include munmap data   */
 | 
						|
                                comm           :  1, /* include comm data     */
 | 
						|
 | 
						|
                                __reserved_1   : 52;
 | 
						|
 | 
						|
        __u32                   extra_config_len;
 | 
						|
        __u32                   wakeup_events;  /* wakeup every n events */
 | 
						|
 | 
						|
        __u64                   __reserved_2;
 | 
						|
        __u64                   __reserved_3;
 | 
						|
};
 | 
						|
 | 
						|
The 'config' field specifies what the counter should count.  It
 | 
						|
is divided into 3 bit-fields:
 | 
						|
 | 
						|
raw_type: 1 bit   (most significant bit)	0x8000_0000_0000_0000
 | 
						|
type:	  7 bits  (next most significant)	0x7f00_0000_0000_0000
 | 
						|
event_id: 56 bits (least significant)		0x00ff_ffff_ffff_ffff
 | 
						|
 | 
						|
If 'raw_type' is 1, then the counter will count a hardware event
 | 
						|
specified by the remaining 63 bits of 'config'.  The encoding is
 | 
						|
machine-specific.
 | 
						|
 | 
						|
If 'raw_type' is 0, then the 'type' field says what kind of counter
 | 
						|
this is, with the following encoding:
 | 
						|
 | 
						|
enum perf_type_id {
 | 
						|
	PERF_TYPE_HARDWARE		= 0,
 | 
						|
	PERF_TYPE_SOFTWARE		= 1,
 | 
						|
	PERF_TYPE_TRACEPOINT		= 2,
 | 
						|
};
 | 
						|
 | 
						|
A counter of PERF_TYPE_HARDWARE will count the hardware event
 | 
						|
specified by 'event_id':
 | 
						|
 | 
						|
/*
 | 
						|
 * Generalized performance counter event types, used by the hw_event.event_id
 | 
						|
 * parameter of the sys_perf_event_open() syscall:
 | 
						|
 */
 | 
						|
enum perf_hw_id {
 | 
						|
	/*
 | 
						|
	 * Common hardware events, generalized by the kernel:
 | 
						|
	 */
 | 
						|
	PERF_COUNT_HW_CPU_CYCLES		= 0,
 | 
						|
	PERF_COUNT_HW_INSTRUCTIONS		= 1,
 | 
						|
	PERF_COUNT_HW_CACHE_REFERENCES		= 2,
 | 
						|
	PERF_COUNT_HW_CACHE_MISSES		= 3,
 | 
						|
	PERF_COUNT_HW_BRANCH_INSTRUCTIONS	= 4,
 | 
						|
	PERF_COUNT_HW_BRANCH_MISSES		= 5,
 | 
						|
	PERF_COUNT_HW_BUS_CYCLES		= 6,
 | 
						|
};
 | 
						|
 | 
						|
These are standardized types of events that work relatively uniformly
 | 
						|
on all CPUs that implement Performance Counters support under Linux,
 | 
						|
although there may be variations (e.g., different CPUs might count
 | 
						|
cache references and misses at different levels of the cache hierarchy).
 | 
						|
If a CPU is not able to count the selected event, then the system call
 | 
						|
will return -EINVAL.
 | 
						|
 | 
						|
More hw_event_types are supported as well, but they are CPU-specific
 | 
						|
and accessed as raw events.  For example, to count "External bus
 | 
						|
cycles while bus lock signal asserted" events on Intel Core CPUs, pass
 | 
						|
in a 0x4064 event_id value and set hw_event.raw_type to 1.
 | 
						|
 | 
						|
A counter of type PERF_TYPE_SOFTWARE will count one of the available
 | 
						|
software events, selected by 'event_id':
 | 
						|
 | 
						|
/*
 | 
						|
 * Special "software" counters provided by the kernel, even if the hardware
 | 
						|
 * does not support performance counters. These counters measure various
 | 
						|
 * physical and sw events of the kernel (and allow the profiling of them as
 | 
						|
 * well):
 | 
						|
 */
 | 
						|
enum perf_sw_ids {
 | 
						|
	PERF_COUNT_SW_CPU_CLOCK		= 0,
 | 
						|
	PERF_COUNT_SW_TASK_CLOCK	= 1,
 | 
						|
	PERF_COUNT_SW_PAGE_FAULTS	= 2,
 | 
						|
	PERF_COUNT_SW_CONTEXT_SWITCHES	= 3,
 | 
						|
	PERF_COUNT_SW_CPU_MIGRATIONS	= 4,
 | 
						|
	PERF_COUNT_SW_PAGE_FAULTS_MIN	= 5,
 | 
						|
	PERF_COUNT_SW_PAGE_FAULTS_MAJ	= 6,
 | 
						|
	PERF_COUNT_SW_ALIGNMENT_FAULTS	= 7,
 | 
						|
	PERF_COUNT_SW_EMULATION_FAULTS	= 8,
 | 
						|
};
 | 
						|
 | 
						|
Counters of the type PERF_TYPE_TRACEPOINT are available when the ftrace event
 | 
						|
tracer is available, and event_id values can be obtained from
 | 
						|
/debug/tracing/events/*/*/id
 | 
						|
 | 
						|
 | 
						|
Counters come in two flavours: counting counters and sampling
 | 
						|
counters.  A "counting" counter is one that is used for counting the
 | 
						|
number of events that occur, and is characterised by having
 | 
						|
irq_period = 0.
 | 
						|
 | 
						|
 | 
						|
A read() on a counter returns the current value of the counter and possible
 | 
						|
additional values as specified by 'read_format'; each value is a u64 (8 bytes)
 | 
						|
in size.
 | 
						|
 | 
						|
/*
 | 
						|
 * Bits that can be set in hw_event.read_format to request that
 | 
						|
 * reads on the counter should return the indicated quantities,
 | 
						|
 * in increasing order of bit value, after the counter value.
 | 
						|
 */
 | 
						|
enum perf_event_read_format {
 | 
						|
        PERF_FORMAT_TOTAL_TIME_ENABLED  =  1,
 | 
						|
        PERF_FORMAT_TOTAL_TIME_RUNNING  =  2,
 | 
						|
};
 | 
						|
 | 
						|
Using these additional values one can establish the overcommit ratio for a
 | 
						|
particular counter allowing one to take the round-robin scheduling effect
 | 
						|
into account.
 | 
						|
 | 
						|
 | 
						|
A "sampling" counter is one that is set up to generate an interrupt
 | 
						|
every N events, where N is given by 'irq_period'.  A sampling counter
 | 
						|
has irq_period > 0. The record_type controls what data is recorded on each
 | 
						|
interrupt:
 | 
						|
 | 
						|
/*
 | 
						|
 * Bits that can be set in hw_event.record_type to request information
 | 
						|
 * in the overflow packets.
 | 
						|
 */
 | 
						|
enum perf_event_record_format {
 | 
						|
        PERF_RECORD_IP          = 1U << 0,
 | 
						|
        PERF_RECORD_TID         = 1U << 1,
 | 
						|
        PERF_RECORD_TIME        = 1U << 2,
 | 
						|
        PERF_RECORD_ADDR        = 1U << 3,
 | 
						|
        PERF_RECORD_GROUP       = 1U << 4,
 | 
						|
        PERF_RECORD_CALLCHAIN   = 1U << 5,
 | 
						|
};
 | 
						|
 | 
						|
Such (and other) events will be recorded in a ring-buffer, which is
 | 
						|
available to user-space using mmap() (see below).
 | 
						|
 | 
						|
The 'disabled' bit specifies whether the counter starts out disabled
 | 
						|
or enabled.  If it is initially disabled, it can be enabled by ioctl
 | 
						|
or prctl (see below).
 | 
						|
 | 
						|
The 'inherit' bit, if set, specifies that this counter should count
 | 
						|
events on descendant tasks as well as the task specified.  This only
 | 
						|
applies to new descendants, not to any existing descendants at the
 | 
						|
time the counter is created (nor to any new descendants of existing
 | 
						|
descendants).
 | 
						|
 | 
						|
The 'pinned' bit, if set, specifies that the counter should always be
 | 
						|
on the CPU if at all possible.  It only applies to hardware counters
 | 
						|
and only to group leaders.  If a pinned counter cannot be put onto the
 | 
						|
CPU (e.g. because there are not enough hardware counters or because of
 | 
						|
a conflict with some other event), then the counter goes into an
 | 
						|
'error' state, where reads return end-of-file (i.e. read() returns 0)
 | 
						|
until the counter is subsequently enabled or disabled.
 | 
						|
 | 
						|
The 'exclusive' bit, if set, specifies that when this counter's group
 | 
						|
is on the CPU, it should be the only group using the CPU's counters.
 | 
						|
In future, this will allow sophisticated monitoring programs to supply
 | 
						|
extra configuration information via 'extra_config_len' to exploit
 | 
						|
advanced features of the CPU's Performance Monitor Unit (PMU) that are
 | 
						|
not otherwise accessible and that might disrupt other hardware
 | 
						|
counters.
 | 
						|
 | 
						|
The 'exclude_user', 'exclude_kernel' and 'exclude_hv' bits provide a
 | 
						|
way to request that counting of events be restricted to times when the
 | 
						|
CPU is in user, kernel and/or hypervisor mode.
 | 
						|
 | 
						|
Furthermore the 'exclude_host' and 'exclude_guest' bits provide a way
 | 
						|
to request counting of events restricted to guest and host contexts when
 | 
						|
using Linux as the hypervisor.
 | 
						|
 | 
						|
The 'mmap' and 'munmap' bits allow recording of PROT_EXEC mmap/munmap
 | 
						|
operations, these can be used to relate userspace IP addresses to actual
 | 
						|
code, even after the mapping (or even the whole process) is gone,
 | 
						|
these events are recorded in the ring-buffer (see below).
 | 
						|
 | 
						|
The 'comm' bit allows tracking of process comm data on process creation.
 | 
						|
This too is recorded in the ring-buffer (see below).
 | 
						|
 | 
						|
The 'pid' parameter to the sys_perf_event_open() system call allows the
 | 
						|
counter to be specific to a task:
 | 
						|
 | 
						|
 pid == 0: if the pid parameter is zero, the counter is attached to the
 | 
						|
 current task.
 | 
						|
 | 
						|
 pid > 0: the counter is attached to a specific task (if the current task
 | 
						|
 has sufficient privilege to do so)
 | 
						|
 | 
						|
 pid < 0: all tasks are counted (per cpu counters)
 | 
						|
 | 
						|
The 'cpu' parameter allows a counter to be made specific to a CPU:
 | 
						|
 | 
						|
 cpu >= 0: the counter is restricted to a specific CPU
 | 
						|
 cpu == -1: the counter counts on all CPUs
 | 
						|
 | 
						|
(Note: the combination of 'pid == -1' and 'cpu == -1' is not valid.)
 | 
						|
 | 
						|
A 'pid > 0' and 'cpu == -1' counter is a per task counter that counts
 | 
						|
events of that task and 'follows' that task to whatever CPU the task
 | 
						|
gets scheduled to. Per task counters can be created by any user, for
 | 
						|
their own tasks.
 | 
						|
 | 
						|
A 'pid == -1' and 'cpu == x' counter is a per CPU counter that counts
 | 
						|
all events on CPU-x. Per CPU counters need CAP_PERFMON or CAP_SYS_ADMIN
 | 
						|
privilege.
 | 
						|
 | 
						|
The 'flags' parameter is currently unused and must be zero.
 | 
						|
 | 
						|
The 'group_fd' parameter allows counter "groups" to be set up.  A
 | 
						|
counter group has one counter which is the group "leader".  The leader
 | 
						|
is created first, with group_fd = -1 in the sys_perf_event_open call
 | 
						|
that creates it.  The rest of the group members are created
 | 
						|
subsequently, with group_fd giving the fd of the group leader.
 | 
						|
(A single counter on its own is created with group_fd = -1 and is
 | 
						|
considered to be a group with only 1 member.)
 | 
						|
 | 
						|
A counter group is scheduled onto the CPU as a unit, that is, it will
 | 
						|
only be put onto the CPU if all of the counters in the group can be
 | 
						|
put onto the CPU.  This means that the values of the member counters
 | 
						|
can be meaningfully compared, added, divided (to get ratios), etc.,
 | 
						|
with each other, since they have counted events for the same set of
 | 
						|
executed instructions.
 | 
						|
 | 
						|
 | 
						|
As stated, asynchronous events, such as counter overflow or PROT_EXEC mmap
 | 
						|
tracking are logged into a ring-buffer. This ring-buffer is created and
 | 
						|
accessed through mmap().
 | 
						|
 | 
						|
The mmap size should be 1+2^n pages, where the first page is a meta-data page
 | 
						|
(struct perf_event_mmap_page) that contains various bits of information such
 | 
						|
as where the ring-buffer head is.
 | 
						|
 | 
						|
/*
 | 
						|
 * Structure of the page that can be mapped via mmap
 | 
						|
 */
 | 
						|
struct perf_event_mmap_page {
 | 
						|
        __u32   version;                /* version number of this structure */
 | 
						|
        __u32   compat_version;         /* lowest version this is compat with */
 | 
						|
 | 
						|
        /*
 | 
						|
         * Bits needed to read the hw counters in user-space.
 | 
						|
         *
 | 
						|
         *   u32 seq;
 | 
						|
         *   s64 count;
 | 
						|
         *
 | 
						|
         *   do {
 | 
						|
         *     seq = pc->lock;
 | 
						|
         *
 | 
						|
         *     barrier();
 | 
						|
         *     if (pc->index) {
 | 
						|
         *       count = pmc_read(pc->index - 1);
 | 
						|
         *       count += pc->offset;
 | 
						|
         *     } else
 | 
						|
         *       goto regular_read;
 | 
						|
         *
 | 
						|
         *     barrier();
 | 
						|
         *   } while (pc->lock != seq);
 | 
						|
         *
 | 
						|
         * NOTE: for obvious reasons this only works on self-monitoring
 | 
						|
         *       processes.
 | 
						|
         */
 | 
						|
        __u32   lock;                   /* seqlock for synchronization */
 | 
						|
        __u32   index;                  /* hardware counter identifier */
 | 
						|
        __s64   offset;                 /* add to hardware counter value */
 | 
						|
 | 
						|
        /*
 | 
						|
         * Control data for the mmap() data buffer.
 | 
						|
         *
 | 
						|
         * User-space reading this value should issue an rmb(), on SMP capable
 | 
						|
         * platforms, after reading this value -- see perf_event_wakeup().
 | 
						|
         */
 | 
						|
        __u32   data_head;              /* head in the data section */
 | 
						|
};
 | 
						|
 | 
						|
NOTE: the hw-counter userspace bits are arch specific and are currently only
 | 
						|
      implemented on powerpc.
 | 
						|
 | 
						|
The following 2^n pages are the ring-buffer which contains events of the form:
 | 
						|
 | 
						|
#define PERF_RECORD_MISC_KERNEL          (1 << 0)
 | 
						|
#define PERF_RECORD_MISC_USER            (1 << 1)
 | 
						|
#define PERF_RECORD_MISC_OVERFLOW        (1 << 2)
 | 
						|
 | 
						|
struct perf_event_header {
 | 
						|
        __u32   type;
 | 
						|
        __u16   misc;
 | 
						|
        __u16   size;
 | 
						|
};
 | 
						|
 | 
						|
enum perf_event_type {
 | 
						|
 | 
						|
        /*
 | 
						|
         * The MMAP events record the PROT_EXEC mappings so that we can
 | 
						|
         * correlate userspace IPs to code. They have the following structure:
 | 
						|
         *
 | 
						|
         * struct {
 | 
						|
         *      struct perf_event_header        header;
 | 
						|
         *
 | 
						|
         *      u32                             pid, tid;
 | 
						|
         *      u64                             addr;
 | 
						|
         *      u64                             len;
 | 
						|
         *      u64                             pgoff;
 | 
						|
         *      char                            filename[];
 | 
						|
         * };
 | 
						|
         */
 | 
						|
        PERF_RECORD_MMAP                 = 1,
 | 
						|
        PERF_RECORD_MUNMAP               = 2,
 | 
						|
 | 
						|
        /*
 | 
						|
         * struct {
 | 
						|
         *      struct perf_event_header        header;
 | 
						|
         *
 | 
						|
         *      u32                             pid, tid;
 | 
						|
         *      char                            comm[];
 | 
						|
         * };
 | 
						|
         */
 | 
						|
        PERF_RECORD_COMM                 = 3,
 | 
						|
 | 
						|
        /*
 | 
						|
         * When header.misc & PERF_RECORD_MISC_OVERFLOW the event_type field
 | 
						|
         * will be PERF_RECORD_*
 | 
						|
         *
 | 
						|
         * struct {
 | 
						|
         *      struct perf_event_header        header;
 | 
						|
         *
 | 
						|
         *      { u64                   ip;       } && PERF_RECORD_IP
 | 
						|
         *      { u32                   pid, tid; } && PERF_RECORD_TID
 | 
						|
         *      { u64                   time;     } && PERF_RECORD_TIME
 | 
						|
         *      { u64                   addr;     } && PERF_RECORD_ADDR
 | 
						|
         *
 | 
						|
         *      { u64                   nr;
 | 
						|
         *        { u64 event, val; }   cnt[nr];  } && PERF_RECORD_GROUP
 | 
						|
         *
 | 
						|
         *      { u16                   nr,
 | 
						|
         *                              hv,
 | 
						|
         *                              kernel,
 | 
						|
         *                              user;
 | 
						|
         *        u64                   ips[nr];  } && PERF_RECORD_CALLCHAIN
 | 
						|
         * };
 | 
						|
         */
 | 
						|
};
 | 
						|
 | 
						|
NOTE: PERF_RECORD_CALLCHAIN is arch specific and currently only implemented
 | 
						|
      on x86.
 | 
						|
 | 
						|
Notification of new events is possible through poll()/select()/epoll() and
 | 
						|
fcntl() managing signals.
 | 
						|
 | 
						|
Normally a notification is generated for every page filled, however one can
 | 
						|
additionally set perf_event_attr.wakeup_events to generate one every
 | 
						|
so many counter overflow events.
 | 
						|
 | 
						|
Future work will include a splice() interface to the ring-buffer.
 | 
						|
 | 
						|
 | 
						|
Counters can be enabled and disabled in two ways: via ioctl and via
 | 
						|
prctl.  When a counter is disabled, it doesn't count or generate
 | 
						|
events but does continue to exist and maintain its count value.
 | 
						|
 | 
						|
An individual counter can be enabled with
 | 
						|
 | 
						|
	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
 | 
						|
 | 
						|
or disabled with
 | 
						|
 | 
						|
	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
 | 
						|
 | 
						|
For a counter group, pass PERF_IOC_FLAG_GROUP as the third argument.
 | 
						|
Enabling or disabling the leader of a group enables or disables the
 | 
						|
whole group; that is, while the group leader is disabled, none of the
 | 
						|
counters in the group will count.  Enabling or disabling a member of a
 | 
						|
group other than the leader only affects that counter - disabling a
 | 
						|
non-leader stops that counter from counting but doesn't affect any
 | 
						|
other counter.
 | 
						|
 | 
						|
Additionally, non-inherited overflow counters can use
 | 
						|
 | 
						|
	ioctl(fd, PERF_EVENT_IOC_REFRESH, nr);
 | 
						|
 | 
						|
to enable a counter for 'nr' events, after which it gets disabled again.
 | 
						|
 | 
						|
A process can enable or disable all the counter groups that are
 | 
						|
attached to it, using prctl:
 | 
						|
 | 
						|
	prctl(PR_TASK_PERF_EVENTS_ENABLE);
 | 
						|
 | 
						|
	prctl(PR_TASK_PERF_EVENTS_DISABLE);
 | 
						|
 | 
						|
This applies to all counters on the current process, whether created
 | 
						|
by this process or by another, and doesn't affect any counters that
 | 
						|
this process has created on other processes.  It only enables or
 | 
						|
disables the group leaders, not any other members in the groups.
 | 
						|
 | 
						|
 | 
						|
Arch requirements
 | 
						|
-----------------
 | 
						|
 | 
						|
If your architecture does not have hardware performance metrics, you can
 | 
						|
still use the generic software counters based on hrtimers for sampling.
 | 
						|
 | 
						|
So to start with, in order to add HAVE_PERF_EVENTS to your Kconfig, you
 | 
						|
will need at least this:
 | 
						|
	- asm/perf_event.h - a basic stub will suffice at first
 | 
						|
	- support for atomic64 types (and associated helper functions)
 | 
						|
 | 
						|
If your architecture does have hardware capabilities, you can override the
 | 
						|
weak stub hw_perf_event_init() to register hardware counters.
 | 
						|
 | 
						|
Architectures that have d-cache aliasing issues, such as Sparc and ARM,
 | 
						|
should select PERF_USE_VMALLOC in order to avoid these for perf mmap().
 |