papi/papi-arm64fastread.patch

commit 9a1f2d897f4086bc1d60102de984c849445b5e97
Author: Masahiko, Yamada <yamada.masahiko@fujitsu.com>
Date: Tue Feb 21 19:18:40 2023 +0900
PAPI_read performance improvement for the arm64 processor
We developed a PAPI_read performance improvement for the arm64 processor by
porting direct user-space PMU register access from libperf into the papi
library, so that this path no longer depends on libperf.
A workaround has been implemented that stores the counter value at the time of
reset and subtracts that stored value from the counter value read at the next
read: once reset processing has been called, the value of pc->offset is treated
as 0 and only the counter value read from the PMU counter register is referenced.
Without this workaround, the counters FAILED with negative values in the
multiplex+reset tests sdsc2-mpx and sdsc4-mpx; the other tests showed no problem.
To apply the workaround only during reset, the _pe_reset function call sets
reset_flag and the next _pe_start function call clears it; the workaround thus
takes effect whenever the mmap_read_self function is called between a _pe_reset
call and the next _pe_start call.
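As a rough sketch, the bookkeeping behind the workaround looks like this (the
struct and function names are illustrative only; the patch keeps the equivalent
fields in pe_control_t and performs the adjustment inside mmap_read_self):

#include <stdint.h>

/* Illustrative state; the patch stores the equivalent fields */
/* (reset_flag, reset_counts[]) in pe_control_t.              */
typedef struct {
	uint64_t reset_count;  /* raw PMU value captured at _pe_reset */
	int reset_flag;        /* 1 between _pe_reset and the next _pe_start */
} fast_read_state;

/* _pe_reset: remember the raw counter value and arm the flag. */
static void on_reset(fast_read_state *st, uint64_t raw_pmc)
{
	st->reset_count = raw_pmc;
	st->reset_flag = 1;
}

/* _pe_start: kernel and user state agree again, disarm the flag. */
static void on_start(fast_read_state *st)
{
	st->reset_count = 0;
	st->reset_flag = 0;
}

/* Read path: after a reset, treat pc->offset as 0 and subtract the  */
/* stored value (sign extension to pmc_width omitted for brevity).   */
static uint64_t adjusted_count(const fast_read_state *st,
                               uint64_t pc_offset, uint64_t raw_pmc)
{
	if (st->reset_flag)
		return raw_pmc - st->reset_count;
	return pc_offset + raw_pmc;
}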
Direct PMU register access from user space is switched OFF and ON by changing
the kernel variable "/proc/sys/kernel/perf_user_access".
Setting PMU register direct access from user space to OFF:
$ echo 0 > /proc/sys/kernel/perf_user_access
$ cat /proc/sys/kernel/perf_user_access
0
Setting PMU register direct access from user space to ON:
$ echo 1 > /proc/sys/kernel/perf_user_access
$ cat /proc/sys/kernel/perf_user_access
1
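Writing to /proc/sys/kernel/perf_user_access requires root privileges. To make
the setting persist across reboots, the standard sysctl drop-in mechanism can
be used (the file name below is arbitrary):
$ echo 'kernel.perf_user_access = 1' > /etc/sysctl.d/99-perf-user-access.conf
$ sysctl --system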
The execution results of the papi_cost command confirm that PAPI_read
performance improved as expected.
Improvement from switching PMU register direct access from user space from
OFF to ON:
Total cost for PAPI_read (2 counters) over 1000000 iterations
min cycles: 689 -> 28
max cycles: 3876 -> 1323
mean cycles: 724.471979 -> 28.888076
Total cost for PAPI_read_ts (2 counters) over 1000000 iterations
min cycles: 693 -> 29
max cycles: 4066 -> 3718
mean cycles: 726.753003 -> 29.977226
Total cost for PAPI_read (1 derived_[add|sub] counter) over 1000000 iterations
min cycles: 698 -> 28
max cycles: 7406 -> 2346
mean cycles: 728.527079 -> 28.880691
Signed-off-by: Masahiko, Yamada <yamada.masahiko@fujitsu.com>
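For context, the hot path being optimized is an ordinary PAPI read loop; a
minimal example of the kind of program that benefits (assuming PAPI is
installed and PAPI_TOT_INS is available on the machine) is:

#include <stdio.h>
#include <stdlib.h>
#include <papi.h>

int main(void)
{
	int es = PAPI_NULL;
	long long v[1];

	if (PAPI_library_init(PAPI_VER_CURRENT) != PAPI_VER_CURRENT)
		exit(1);
	if (PAPI_create_eventset(&es) != PAPI_OK ||
	    PAPI_add_event(es, PAPI_TOT_INS) != PAPI_OK ||
	    PAPI_start(es) != PAPI_OK)
		exit(1);

	/* With perf_user_access=1, each PAPI_read below is served  */
	/* by a user-space PMU register read instead of a syscall.  */
	for (int i = 0; i < 1000000; i++)
		if (PAPI_read(es, v) != PAPI_OK)
			exit(1);

	if (PAPI_stop(es, v) != PAPI_OK)
		exit(1);
	printf("PAPI_TOT_INS: %lld\n", v[0]);
	return 0;
}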
diff --git a/src/components/perf_event/perf_event.c b/src/components/perf_event/perf_event.c
index b4877d18e..331288c55 100644
--- a/src/components/perf_event/perf_event.c
+++ b/src/components/perf_event/perf_event.c
@@ -682,6 +682,12 @@ set_up_mmap( pe_control_t *ctl, int evt_idx)
+/* Request user access for arm64 */
+static inline void arm64_request_user_access(struct perf_event_attr *hw_event)
+{
+ hw_event->config1=0x2; /* Request user access */
+}
+
/* Open all events in the control state */
static int
open_pe_events( pe_context_t *ctx, pe_control_t *ctl )
@@ -735,6 +741,11 @@ open_pe_events( pe_context_t *ctx, pe_control_t *ctl )
if (( i == 0 ) || (ctl->multiplexed)) {
ctl->events[i].attr.pinned = !ctl->multiplexed;
ctl->events[i].attr.disabled = 1;
+#if defined(__aarch64__)
+ if (_perf_event_vector.cmp_info.fast_counter_read) {
+ arm64_request_user_access(&ctl->events[i].attr);
+ }
+#endif
ctl->events[i].group_leader_fd=-1;
ctl->events[i].attr.read_format = get_read_format(
ctl->multiplexed,
@@ -743,6 +754,11 @@ open_pe_events( pe_context_t *ctx, pe_control_t *ctl )
} else {
ctl->events[i].attr.pinned=0;
ctl->events[i].attr.disabled = 0;
+#if defined(__aarch64__)
+ if (_perf_event_vector.cmp_info.fast_counter_read) {
+ arm64_request_user_access(&ctl->events[i].attr);
+ }
+#endif
ctl->events[i].group_leader_fd=ctl->events[0].event_fd;
ctl->events[i].attr.read_format = get_read_format(
ctl->multiplexed,
@@ -1047,8 +1063,16 @@ _pe_reset( hwd_context_t *ctx, hwd_control_state_t *ctl )
/* We need to reset all of the events, not just the group leaders */
for( i = 0; i < pe_ctl->num_events; i++ ) {
- ret = ioctl( pe_ctl->events[i].event_fd,
- PERF_EVENT_IOC_RESET, NULL );
+ if (_perf_event_vector.cmp_info.fast_counter_read) {
+ ret = ioctl( pe_ctl->events[i].event_fd,
+ PERF_EVENT_IOC_RESET, NULL );
+ pe_ctl->reset_counts[i] = mmap_read_reset_count(
+ pe_ctl->events[i].mmap_buf);
+ pe_ctl->reset_flag = 1;
+ } else {
+ ret = ioctl( pe_ctl->events[i].event_fd,
+ PERF_EVENT_IOC_RESET, NULL );
+ }
if ( ret == -1 ) {
PAPIERROR("ioctl(%d, PERF_EVENT_IOC_RESET, NULL) "
"returned error, Linux says: %s",
@@ -1119,6 +1143,8 @@ _pe_rdpmc_read( hwd_context_t *ctx, hwd_control_state_t *ctl,
for ( i = 0; i < pe_ctl->num_events; i++ ) {
count = mmap_read_self(pe_ctl->events[i].mmap_buf,
+ pe_ctl->reset_flag,
+ pe_ctl->reset_counts[i],
&enabled,&running);
if (count==0xffffffffffffffffULL) {
@@ -1438,6 +1464,10 @@ _pe_start( hwd_context_t *ctx, hwd_control_state_t *ctl )
pe_ctl->events[i].event_fd);
ret=ioctl( pe_ctl->events[i].event_fd,
PERF_EVENT_IOC_ENABLE, NULL) ;
+ if (_perf_event_vector.cmp_info.fast_counter_read) {
+ pe_ctl->reset_counts[i] = 0LL;
+ pe_ctl->reset_flag = 0;
+ }
/* ioctls always return -1 on failure */
if (ret == -1) {
@@ -2297,6 +2327,29 @@ _pe_shutdown_component( void ) {
}
+#if defined(__aarch64__)
+/* Check access PMU counter from User space for arm64 support */
+static int _pe_detect_arm64_access(void) {
+
+ FILE *fff;
+ int perf_user_access;
+ int retval;
+
+ fff=fopen("/proc/sys/kernel/perf_user_access","r");
+ if (fff==NULL) {
+ return 0;
+ }
+
+ /* 1 means you can access PMU counter from User space */
+ /* 0 means you can not access PMU counter from User space */
+ retval=fscanf(fff,"%d",&perf_user_access);
+ if (retval!=1) fprintf(stderr,"Error reading /proc/sys/kernel/perf_user_access\n");
+ fclose(fff);
+
+ return perf_user_access;
+}
+#endif
+
/* Check the mmap page for rdpmc support */
static int _pe_detect_rdpmc(void) {
@@ -2305,10 +2358,13 @@ static int _pe_detect_rdpmc(void) {
void *addr;
struct perf_event_mmap_page *our_mmap;
int page_size=getpagesize();
+#if defined(__aarch64__)
+ int retval;
+#endif
-#if defined(__i386__) || defined (__x86_64__)
+#if defined(__i386__) || defined (__x86_64__) || defined(__aarch64__)
#else
- /* We only support rdpmc on x86 for now */
+ /* We support rdpmc on x86 and arm64 for now */
return 0;
#endif
@@ -2318,12 +2374,23 @@ static int _pe_detect_rdpmc(void) {
return 0;
}
+#if defined(__aarch64__)
+ /* Detect if we can use PMU counter from User space for arm64 */
+ retval = _pe_detect_arm64_access();
+ if (retval == 0) {
+ return 0;
+ }
+#endif
+
/* Create a fake instructions event so we can read a mmap page */
memset(&pe,0,sizeof(struct perf_event_attr));
pe.type=PERF_TYPE_HARDWARE;
pe.size=sizeof(struct perf_event_attr);
pe.config=PERF_COUNT_HW_INSTRUCTIONS;
+#if defined(__aarch64__)
+ arm64_request_user_access(&pe);
+#endif
pe.exclude_kernel=1;
pe.disabled=1;
diff --git a/src/components/perf_event/perf_event_lib.h b/src/components/perf_event/perf_event_lib.h
index 0c50ab9f0..cfba8ac49 100644
--- a/src/components/perf_event/perf_event_lib.h
+++ b/src/components/perf_event/perf_event_lib.h
@@ -36,6 +36,8 @@ typedef struct {
pid_t tid; /* thread we are monitoring */
pe_event_info_t events[PERF_EVENT_MAX_MPX_COUNTERS];
long long counts[PERF_EVENT_MAX_MPX_COUNTERS];
+ unsigned int reset_flag;
+ long long reset_counts[PERF_EVENT_MAX_MPX_COUNTERS];
} pe_control_t;
diff --git a/src/components/perf_event/perf_helpers.h b/src/components/perf_event/perf_helpers.h
index 92dca4fd0..097286865 100644
--- a/src/components/perf_event/perf_helpers.h
+++ b/src/components/perf_event/perf_helpers.h
@@ -29,6 +29,74 @@ sys_perf_event_open( struct perf_event_attr *hw_event,
return ret;
}
+
+/*
+ * We define u64 as uint64_t for every architecture
+ * so that we can print it with "%"PRIx64 without getting warnings.
+ *
+ * typedef __u64 u64;
+ * typedef __s64 s64;
+ */
+typedef uint64_t u64;
+typedef int64_t s64;
+
+typedef __u32 u32;
+typedef __s32 s32;
+
+typedef __u16 u16;
+typedef __s16 s16;
+
+typedef __u8 u8;
+typedef __s8 s8;
+
+
+#ifdef __SIZEOF_INT128__
+static inline u64 mul_u64_u32_shr(u64 a, u32 b, unsigned int shift)
+{
+ return (u64)(((unsigned __int128)a * b) >> shift);
+}
+
+#else
+
+#ifdef __i386__
+static inline u64 mul_u32_u32(u32 a, u32 b)
+{
+ u32 high, low;
+
+ asm ("mull %[b]" : "=a" (low), "=d" (high)
+ : [a] "a" (a), [b] "rm" (b) );
+
+ return low | ((u64)high) << 32;
+}
+#else
+static inline u64 mul_u32_u32(u32 a, u32 b)
+{
+ return (u64)a * b;
+}
+#endif
+
+static inline u64 mul_u64_u32_shr(u64 a, u32 b, unsigned int shift)
+{
+ u32 ah, al;
+ u64 ret;
+
+ al = a;
+ ah = a >> 32;
+
+ ret = mul_u32_u32(al, b) >> shift;
+ if (ah)
+ ret += mul_u32_u32(ah, b) << (32 - shift);
+
+ return ret;
+}
+
+#endif /* __SIZEOF_INT128__ */
+
+#ifndef ARRAY_SIZE
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
+#endif
+
+
#if defined(__x86_64__) || defined(__i386__)
@@ -52,19 +120,140 @@ static inline unsigned long long rdpmc(unsigned int counter) {
#define barrier() __asm__ volatile("" ::: "memory")
+
+#elif defined(__aarch64__)
+
+/* Indirect stringification. Doing two levels allows the parameter to be a
+ * macro itself. For example, compile with -DFOO=bar, __stringify(FOO)
+ * converts to "bar".
+ */
+
+#define __stringify_1(x...) #x
+#define __stringify(x...) __stringify_1(x)
+
+#define read_sysreg(r) ({ \
+ u64 __val; \
+ asm volatile("mrs %0, " __stringify(r) : "=r" (__val)); \
+ __val; \
+})
+
+static u64 read_pmccntr(void)
+{
+ return read_sysreg(pmccntr_el0);
+}
+
+#define PMEVCNTR_READ(idx) \
+ static u64 read_pmevcntr_##idx(void) { \
+ return read_sysreg(pmevcntr##idx##_el0); \
+ }
+
+PMEVCNTR_READ(0);
+PMEVCNTR_READ(1);
+PMEVCNTR_READ(2);
+PMEVCNTR_READ(3);
+PMEVCNTR_READ(4);
+PMEVCNTR_READ(5);
+PMEVCNTR_READ(6);
+PMEVCNTR_READ(7);
+PMEVCNTR_READ(8);
+PMEVCNTR_READ(9);
+PMEVCNTR_READ(10);
+PMEVCNTR_READ(11);
+PMEVCNTR_READ(12);
+PMEVCNTR_READ(13);
+PMEVCNTR_READ(14);
+PMEVCNTR_READ(15);
+PMEVCNTR_READ(16);
+PMEVCNTR_READ(17);
+PMEVCNTR_READ(18);
+PMEVCNTR_READ(19);
+PMEVCNTR_READ(20);
+PMEVCNTR_READ(21);
+PMEVCNTR_READ(22);
+PMEVCNTR_READ(23);
+PMEVCNTR_READ(24);
+PMEVCNTR_READ(25);
+PMEVCNTR_READ(26);
+PMEVCNTR_READ(27);
+PMEVCNTR_READ(28);
+PMEVCNTR_READ(29);
+PMEVCNTR_READ(30);
+
+/*
+ * Read a value direct from PMEVCNTR<idx>
+ */
+static u64 rdpmc(unsigned int counter)
+{
+ static u64 (* const read_f[])(void) = {
+ read_pmevcntr_0,
+ read_pmevcntr_1,
+ read_pmevcntr_2,
+ read_pmevcntr_3,
+ read_pmevcntr_4,
+ read_pmevcntr_5,
+ read_pmevcntr_6,
+ read_pmevcntr_7,
+ read_pmevcntr_8,
+ read_pmevcntr_9,
+ read_pmevcntr_10,
+ read_pmevcntr_11,
+ read_pmevcntr_12,
+ read_pmevcntr_13,
+ read_pmevcntr_14,
+ read_pmevcntr_15,
+ read_pmevcntr_16,
+ read_pmevcntr_17,
+ read_pmevcntr_18,
+ read_pmevcntr_19,
+ read_pmevcntr_20,
+ read_pmevcntr_21,
+ read_pmevcntr_22,
+ read_pmevcntr_23,
+ read_pmevcntr_24,
+ read_pmevcntr_25,
+ read_pmevcntr_26,
+ read_pmevcntr_27,
+ read_pmevcntr_28,
+ read_pmevcntr_29,
+ read_pmevcntr_30,
+ read_pmccntr
+ };
+
+ if (counter < ARRAY_SIZE(read_f))
+ return (read_f[counter])();
+
+ return 0;
+}
+
+static u64 rdtsc(void) { return read_sysreg(cntvct_el0); }
+
+#define barrier() asm volatile("dmb ish" : : : "memory")
+
+#endif
+
+#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__)
+
+static inline u64 adjust_cap_usr_time_short(u64 a, u64 b, u64 c)
+{
+ u64 ret;
+ ret = b + ((a - b) & c);
+ return ret;
+}
+
/* based on the code in include/uapi/linux/perf_event.h */
static inline unsigned long long mmap_read_self(void *addr,
+ int user_reset_flag,
+ unsigned long long reset,
unsigned long long *en,
unsigned long long *ru) {
struct perf_event_mmap_page *pc = addr;
- uint32_t seq, time_mult, time_shift, index, width;
+ uint32_t seq, time_mult = 0, time_shift = 0, index, width;
int64_t count;
uint64_t enabled, running;
- uint64_t cyc, time_offset;
+ uint64_t cyc = 0, time_offset = 0, time_cycles = 0, time_mask = ~0ULL;
int64_t pmc = 0;
- uint64_t quot, rem;
uint64_t delta = 0;
@@ -96,12 +285,11 @@ static inline unsigned long long mmap_read_self(void *addr,
time_mult = pc->time_mult;
time_shift = pc->time_shift;
- quot=(cyc>>time_shift);
- rem = cyc & (((uint64_t)1 << time_shift) - 1);
- delta = time_offset + (quot * time_mult) +
- ((rem * time_mult) >> time_shift);
+ if (pc->cap_user_time_short) {
+ time_cycles = pc->time_cycles;
+ time_mask = pc->time_mask;
+ }
}
- enabled+=delta;
/* actually do the measurement */
@@ -116,8 +304,9 @@ static inline unsigned long long mmap_read_self(void *addr,
/* numbers which break if an IOC_RESET is done */
width = pc->pmc_width;
count = pc->offset;
- count<<=(64-width);
- count>>=(64-width);
+ if (user_reset_flag == 1) {
+ count = 0;
+ }
/* Ugh, libpfm4 perf_event.h has cap_usr_rdpmc */
/* while actual perf_event.h has cap_user_rdpmc */
@@ -130,14 +319,14 @@ static inline unsigned long long mmap_read_self(void *addr,
pmc = rdpmc(index-1);
/* sign extend result */
+ if (user_reset_flag == 1) {
+ pmc-=reset;
+ }
pmc<<=(64-width);
pmc>>=(64-width);
/* add current count into the existing kernel count */
count+=pmc;
-
- /* Only adjust if index is valid */
- running+=delta;
} else {
/* Falling back because rdpmc not supported */
/* for this event. */
@@ -148,14 +337,66 @@ static inline unsigned long long mmap_read_self(void *addr,
} while (pc->lock != seq);
+ if (enabled != running) {
+
+ /* Adjust for cap_usr_time_short, a nop if not */
+ cyc = adjust_cap_usr_time_short(cyc, time_cycles, time_mask);
+
+ delta = time_offset + mul_u64_u32_shr(cyc, time_mult, time_shift);
+
+ enabled+=delta;
+ if (index)
+ /* Only adjust if index is valid */
+ running+=delta;
+ }
+
if (en) *en=enabled;
if (ru) *ru=running;
return count;
}
+static inline unsigned long long mmap_read_reset_count(void *addr) {
+
+ struct perf_event_mmap_page *pc = addr;
+ uint32_t seq, index;
+ uint64_t count = 0;
+
+ if (pc == NULL) {
+ return count;
+ }
+
+ do {
+ /* The barrier ensures we get the most up to date */
+ /* version of the pc->lock variable */
+
+ seq=pc->lock;
+ barrier();
+
+ /* actually do the measurement */
+
+ /* Ugh, libpfm4 perf_event.h has cap_usr_rdpmc */
+ /* while actual perf_event.h has cap_user_rdpmc */
+
+ /* Index of register to read */
+ /* 0 means stopped/not-active */
+ /* Need to subtract 1 to get actual index to rdpmc() */
+ index = pc->index;
+
+ if (pc->cap_usr_rdpmc && index) {
+ /* Read counter value */
+ count = rdpmc(index-1);
+ }
+ barrier();
+
+ } while (pc->lock != seq);
+
+ return count;
+}
+
#else
static inline unsigned long long mmap_read_self(void *addr,
+ int user_reset_flag,
unsigned long long *en,
unsigned long long *ru) {
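The time-scaling change in the diff above replaces the open-coded quot/rem
arithmetic with the kernel-style mul_u64_u32_shr helper, which computes
(a * b) >> shift without needing the full 128-bit product on the
split-multiply path. A standalone sanity check of that fallback against the
__int128 version (assuming a compiler that provides __int128) might look like:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Split-multiply fallback, as in the patch (exact for shift <= 32). */
static uint64_t mul_u64_u32_shr(uint64_t a, uint32_t b, unsigned int shift)
{
	uint32_t ah = a >> 32, al = (uint32_t)a;
	uint64_t ret = ((uint64_t)al * b) >> shift;
	if (ah)
		ret += ((uint64_t)ah * b) << (32 - shift);
	return ret;
}

int main(void)
{
	uint64_t cyc = 0x123456789abcdefULL;  /* arbitrary test value */
	uint32_t mult = 0x15555555;           /* plausible time_mult  */
	unsigned int shift = 31;              /* plausible time_shift */

	uint64_t ref = (uint64_t)(((unsigned __int128)cyc * mult) >> shift);
	assert(mul_u64_u32_shr(cyc, mult, shift) == ref);
	printf("ok: delta scaling matches: %llu\n", (unsigned long long)ref);
	return 0;
}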
commit 693dd5c014d1f0b9a3eae63de051389ed8eb338b
Author: Giuseppe Congiu <gcongiu@icl.utk.edu>
Date: Tue Feb 21 07:46:14 2023 -0500
perf_event: bug fix in mmap_read_self
Commit 9a1f2d897 broke the perf_event component for POWER CPUs: the fallback
mmap_read_self function was missing one argument. This patch restores the
missing argument.
diff --git a/src/components/perf_event/perf_helpers.h b/src/components/perf_event/perf_helpers.h
index 097286865..7ad3524f0 100644
--- a/src/components/perf_event/perf_helpers.h
+++ b/src/components/perf_event/perf_helpers.h
@@ -397,6 +397,7 @@ static inline unsigned long long mmap_read_reset_count(void *addr) {
#else
static inline unsigned long long mmap_read_self(void *addr,
int user_reset_flag,
+ unsigned long long reset,
unsigned long long *en,
unsigned long long *ru) {
commit 1b3e75b7f11c7e2b7c590948216d6aaeec299010
Author: Giuseppe Congiu <gcongiu@icl.utk.edu>
Date: Tue Feb 21 14:21:03 2023 +0100
perf_event: add missing mmap_read_reset_count for non default cpus
Power cpus do not have a version of mmap_read_reset_count. Implement the
missing function.
diff --git a/src/components/perf_event/perf_helpers.h b/src/components/perf_event/perf_helpers.h
index 7ad3524f0..73e82c8ae 100644
--- a/src/components/perf_event/perf_helpers.h
+++ b/src/components/perf_event/perf_helpers.h
@@ -409,6 +409,11 @@ static inline unsigned long long mmap_read_self(void *addr,
return (unsigned long long)(-1);
}
+static inline unsigned long long mmap_read_reset_count(void *addr __attribute__((unused))) {
+
+ return (unsigned long long)(-1);
+}
+
#endif
/* These functions are based on builtin-record.c in the */
commit 37d0c77b7b4d00a958dff50dc715cf63e0cd6084
Author: Giuseppe Congiu <gcongiu@icl.utk.edu>
Date: Tue Feb 21 14:22:53 2023 +0100
perf_event: use unused attribute in mmap_read_self
diff --git a/src/components/perf_event/perf_helpers.h b/src/components/perf_event/perf_helpers.h
index 73e82c8ae..59c8a2fc8 100644
--- a/src/components/perf_event/perf_helpers.h
+++ b/src/components/perf_event/perf_helpers.h
@@ -395,16 +395,11 @@ static inline unsigned long long mmap_read_reset_count(void *addr) {
}
#else
-static inline unsigned long long mmap_read_self(void *addr,
- int user_reset_flag,
- unsigned long long reset,
- unsigned long long *en,
- unsigned long long *ru) {
-
- (void)addr;
-
- *en=0;
- *ru=0;
+static inline unsigned long long mmap_read_self(void *addr __attribute__((unused)),
+ int user_reset_flag __attribute__((unused)),
+ unsigned long long reset __attribute__((unused)),
+ unsigned long long *en __attribute__((unused)),
+ unsigned long long *ru __attribute__((unused))) {
return (unsigned long long)(-1);
}
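With these fixes in place, an application can check at run time whether the
perf_event component advertises the fast read path; a minimal check (assuming
component index 0 is the perf_event component) is:

#include <stdio.h>
#include <papi.h>

int main(void)
{
	if (PAPI_library_init(PAPI_VER_CURRENT) != PAPI_VER_CURRENT)
		return 1;

	/* Component 0 is normally the perf_event component. */
	const PAPI_component_info_t *ci = PAPI_get_component_info(0);
	if (ci == NULL)
		return 1;

	printf("%s: fast_counter_read = %d\n", ci->name,
	       ci->fast_counter_read);
	return 0;
}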