From 1983fd9790b5767608940c33efb63c7fdea2be07 Mon Sep 17 00:00:00 2001
From: William Cohen
Date: Thu, 27 Apr 2023 15:44:25 -0400
Subject: [PATCH] Improve aarch64 read speed.

Resolves: #2161146
---
 papi-arm64fastread.patch | 637 +++++++++++++++++++++++++++++++++++++++
 papi-fastread.patch      |  83 +++++
 papi.spec                |  13 +-
 3 files changed, 730 insertions(+), 3 deletions(-)
 create mode 100644 papi-arm64fastread.patch
 create mode 100644 papi-fastread.patch

diff --git a/papi-arm64fastread.patch b/papi-arm64fastread.patch
new file mode 100644
index 0000000..986743b
--- /dev/null
+++ b/papi-arm64fastread.patch
@@ -0,0 +1,637 @@
+commit 9a1f2d897f4086bc1d60102de984c849445b5e97
+Author: Masahiko, Yamada
+Date: Tue Feb 21 19:18:40 2023 +0900
+
+ PAPI_read performance improvement for the arm64 processor
+
+ PAPI_read on arm64 is made faster by porting the direct user-space PMU
+ register access code from libperf into the papi library, so that libperf
+ itself is no longer needed.
+
+ A workaround stores the counter value at the time of reset and subtracts
+ that stored value from the counter value read on the next read.
+ When reset processing is called, the value of pc->offset is cleared to 0,
+ and only the counter value read from the PMU counter is referenced.
+ With this workaround the multiplex+reset tests no longer FAIL with
+ negative counter values, except for sdsc2-mpx and sdsc4-mpx.
+ So that the workaround is applied only after a reset, the _pe_reset
+ function sets reset_flag and the next _pe_start call clears it.
+ The workaround takes effect when the mmap_read_self function is called
+ between a _pe_reset call and the next _pe_start call.
+
+ Direct PMU register access from user space is switched OFF or ON by
+ changing the kernel variable "/proc/sys/kernel/perf_user_access".
+
+ Setting PMU register direct access from user space OFF
+ $ echo 0 > /proc/sys/kernel/perf_user_access
+ $ cat /proc/sys/kernel/perf_user_access
+ 0
+
+ Setting PMU register direct access from user space ON
+ $ echo 1 > /proc/sys/kernel/perf_user_access
+ $ cat /proc/sys/kernel/perf_user_access
+ 1
+
+ The papi_cost results below show the expected improvement in the
+ performance of PAPI_read.
+ + Improvement effect of switching PMU register direct access from user space + from OFF to ON + + Total cost for PAPI_read (2 counters) over 1000000 iterations + min cycles: 689 -> 28 + max cycles: 3876 -> 1323 + mean cycles: 724.471979 -> 28.888076 + + Total cost for PAPI_read_ts (2 counters) over 1000000 iterations + min cycles: 693 -> 29 + max cycles: 4066 -> 3718 + mean cycles: 726.753003 -> 29.977226 + + Total cost for PAPI_read (1 derived_[add|sub] counter) over 1000000 iterations + min cycles: 698 -> 28 + max cycles: 7406 -> 2346 + mean cycles: 728.527079 -> 28.880691 + + Signed-off-by: Masahiko, Yamada + +diff --git a/src/components/perf_event/perf_event.c b/src/components/perf_event/perf_event.c +index b4877d18e..331288c55 100644 +--- a/src/components/perf_event/perf_event.c ++++ b/src/components/perf_event/perf_event.c +@@ -682,6 +682,12 @@ set_up_mmap( pe_control_t *ctl, int evt_idx) + + + ++/* Request user access for arm64 */ ++static inline void arm64_request_user_access(struct perf_event_attr *hw_event) ++{ ++ hw_event->config1=0x2; /* Request user access */ ++} ++ + /* Open all events in the control state */ + static int + open_pe_events( pe_context_t *ctx, pe_control_t *ctl ) +@@ -735,6 +741,11 @@ open_pe_events( pe_context_t *ctx, pe_control_t *ctl ) + if (( i == 0 ) || (ctl->multiplexed)) { + ctl->events[i].attr.pinned = !ctl->multiplexed; + ctl->events[i].attr.disabled = 1; ++#if defined(__aarch64__) ++ if (_perf_event_vector.cmp_info.fast_counter_read) { ++ arm64_request_user_access(&ctl->events[i].attr); ++ } ++#endif + ctl->events[i].group_leader_fd=-1; + ctl->events[i].attr.read_format = get_read_format( + ctl->multiplexed, +@@ -743,6 +754,11 @@ open_pe_events( pe_context_t *ctx, pe_control_t *ctl ) + } else { + ctl->events[i].attr.pinned=0; + ctl->events[i].attr.disabled = 0; ++#if defined(__aarch64__) ++ if (_perf_event_vector.cmp_info.fast_counter_read) { ++ arm64_request_user_access(&ctl->events[i].attr); ++ } ++#endif + ctl->events[i].group_leader_fd=ctl->events[0].event_fd; + ctl->events[i].attr.read_format = get_read_format( + ctl->multiplexed, +@@ -1047,8 +1063,16 @@ _pe_reset( hwd_context_t *ctx, hwd_control_state_t *ctl ) + + /* We need to reset all of the events, not just the group leaders */ + for( i = 0; i < pe_ctl->num_events; i++ ) { +- ret = ioctl( pe_ctl->events[i].event_fd, +- PERF_EVENT_IOC_RESET, NULL ); ++ if (_perf_event_vector.cmp_info.fast_counter_read) { ++ ret = ioctl( pe_ctl->events[i].event_fd, ++ PERF_EVENT_IOC_RESET, NULL ); ++ pe_ctl->reset_counts[i] = mmap_read_reset_count( ++ pe_ctl->events[i].mmap_buf); ++ pe_ctl->reset_flag = 1; ++ } else { ++ ret = ioctl( pe_ctl->events[i].event_fd, ++ PERF_EVENT_IOC_RESET, NULL ); ++ } + if ( ret == -1 ) { + PAPIERROR("ioctl(%d, PERF_EVENT_IOC_RESET, NULL) " + "returned error, Linux says: %s", +@@ -1119,6 +1143,8 @@ _pe_rdpmc_read( hwd_context_t *ctx, hwd_control_state_t *ctl, + for ( i = 0; i < pe_ctl->num_events; i++ ) { + + count = mmap_read_self(pe_ctl->events[i].mmap_buf, ++ pe_ctl->reset_flag, ++ pe_ctl->reset_counts[i], + &enabled,&running); + + if (count==0xffffffffffffffffULL) { +@@ -1438,6 +1464,10 @@ _pe_start( hwd_context_t *ctx, hwd_control_state_t *ctl ) + pe_ctl->events[i].event_fd); + ret=ioctl( pe_ctl->events[i].event_fd, + PERF_EVENT_IOC_ENABLE, NULL) ; ++ if (_perf_event_vector.cmp_info.fast_counter_read) { ++ pe_ctl->reset_counts[i] = 0LL; ++ pe_ctl->reset_flag = 0; ++ } + + /* ioctls always return -1 on failure */ + if (ret == -1) { +@@ -2297,6 +2327,29 @@ 
_pe_shutdown_component( void ) { + } + + ++#if defined(__aarch64__) ++/* Check access PMU counter from User space for arm64 support */ ++static int _pe_detect_arm64_access(void) { ++ ++ FILE *fff; ++ int perf_user_access; ++ int retval; ++ ++ fff=fopen("/proc/sys/kernel/perf_user_access","r"); ++ if (fff==NULL) { ++ return 0; ++ } ++ ++ /* 1 means you can access PMU counter from User space */ ++ /* 0 means you can not access PMU counter from User space */ ++ retval=fscanf(fff,"%d",&perf_user_access); ++ if (retval!=1) fprintf(stderr,"Error reading /proc/sys/kernel/perf_user_access\n"); ++ fclose(fff); ++ ++ return perf_user_access; ++} ++#endif ++ + /* Check the mmap page for rdpmc support */ + static int _pe_detect_rdpmc(void) { + +@@ -2305,10 +2358,13 @@ static int _pe_detect_rdpmc(void) { + void *addr; + struct perf_event_mmap_page *our_mmap; + int page_size=getpagesize(); ++#if defined(__aarch64__) ++ int retval; ++#endif + +-#if defined(__i386__) || defined (__x86_64__) ++#if defined(__i386__) || defined (__x86_64__) || defined(__aarch64__) + #else +- /* We only support rdpmc on x86 for now */ ++ /* We support rdpmc on x86 and arm64 for now */ + return 0; + #endif + +@@ -2318,12 +2374,23 @@ static int _pe_detect_rdpmc(void) { + return 0; + } + ++#if defined(__aarch64__) ++ /* Detect if we can use PMU counter from User space for arm64 */ ++ retval = _pe_detect_arm64_access(); ++ if (retval == 0) { ++ return 0; ++ } ++#endif ++ + /* Create a fake instructions event so we can read a mmap page */ + memset(&pe,0,sizeof(struct perf_event_attr)); + + pe.type=PERF_TYPE_HARDWARE; + pe.size=sizeof(struct perf_event_attr); + pe.config=PERF_COUNT_HW_INSTRUCTIONS; ++#if defined(__aarch64__) ++ arm64_request_user_access(&pe); ++#endif + pe.exclude_kernel=1; + pe.disabled=1; + +diff --git a/src/components/perf_event/perf_event_lib.h b/src/components/perf_event/perf_event_lib.h +index 0c50ab9f0..cfba8ac49 100644 +--- a/src/components/perf_event/perf_event_lib.h ++++ b/src/components/perf_event/perf_event_lib.h +@@ -36,6 +36,8 @@ typedef struct { + pid_t tid; /* thread we are monitoring */ + pe_event_info_t events[PERF_EVENT_MAX_MPX_COUNTERS]; + long long counts[PERF_EVENT_MAX_MPX_COUNTERS]; ++ unsigned int reset_flag; ++ long long reset_counts[PERF_EVENT_MAX_MPX_COUNTERS]; + } pe_control_t; + + +diff --git a/src/components/perf_event/perf_helpers.h b/src/components/perf_event/perf_helpers.h +index 92dca4fd0..097286865 100644 +--- a/src/components/perf_event/perf_helpers.h ++++ b/src/components/perf_event/perf_helpers.h +@@ -29,6 +29,74 @@ sys_perf_event_open( struct perf_event_attr *hw_event, + return ret; + } + ++ ++/* ++ * We define u64 as uint64_t for every architecture ++ * so that we can print it with "%"PRIx64 without getting warnings. 
++ * ++ * typedef __u64 u64; ++ * typedef __s64 s64; ++ */ ++typedef uint64_t u64; ++typedef int64_t s64; ++ ++typedef __u32 u32; ++typedef __s32 s32; ++ ++typedef __u16 u16; ++typedef __s16 s16; ++ ++typedef __u8 u8; ++typedef __s8 s8; ++ ++ ++#ifdef __SIZEOF_INT128__ ++static inline u64 mul_u64_u32_shr(u64 a, u32 b, unsigned int shift) ++{ ++ return (u64)(((unsigned __int128)a * b) >> shift); ++} ++ ++#else ++ ++#ifdef __i386__ ++static inline u64 mul_u32_u32(u32 a, u32 b) ++{ ++ u32 high, low; ++ ++ asm ("mull %[b]" : "=a" (low), "=d" (high) ++ : [a] "a" (a), [b] "rm" (b) ); ++ ++ return low | ((u64)high) << 32; ++} ++#else ++static inline u64 mul_u32_u32(u32 a, u32 b) ++{ ++ return (u64)a * b; ++} ++#endif ++ ++static inline u64 mul_u64_u32_shr(u64 a, u32 b, unsigned int shift) ++{ ++ u32 ah, al; ++ u64 ret; ++ ++ al = a; ++ ah = a >> 32; ++ ++ ret = mul_u32_u32(al, b) >> shift; ++ if (ah) ++ ret += mul_u32_u32(ah, b) << (32 - shift); ++ ++ return ret; ++} ++ ++#endif /* __SIZEOF_INT128__ */ ++ ++#ifndef ARRAY_SIZE ++#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) ++#endif ++ ++ + #if defined(__x86_64__) || defined(__i386__) + + +@@ -52,19 +120,140 @@ static inline unsigned long long rdpmc(unsigned int counter) { + + #define barrier() __asm__ volatile("" ::: "memory") + ++ ++#elif defined(__aarch64__) ++ ++/* Indirect stringification. Doing two levels allows the parameter to be a ++ * macro itself. For example, compile with -DFOO=bar, __stringify(FOO) ++ * converts to "bar". ++ */ ++ ++#define __stringify_1(x...) #x ++#define __stringify(x...) __stringify_1(x) ++ ++#define read_sysreg(r) ({ \ ++ u64 __val; \ ++ asm volatile("mrs %0, " __stringify(r) : "=r" (__val)); \ ++ __val; \ ++}) ++ ++static u64 read_pmccntr(void) ++{ ++ return read_sysreg(pmccntr_el0); ++} ++ ++#define PMEVCNTR_READ(idx) \ ++ static u64 read_pmevcntr_##idx(void) { \ ++ return read_sysreg(pmevcntr##idx##_el0); \ ++ } ++ ++PMEVCNTR_READ(0); ++PMEVCNTR_READ(1); ++PMEVCNTR_READ(2); ++PMEVCNTR_READ(3); ++PMEVCNTR_READ(4); ++PMEVCNTR_READ(5); ++PMEVCNTR_READ(6); ++PMEVCNTR_READ(7); ++PMEVCNTR_READ(8); ++PMEVCNTR_READ(9); ++PMEVCNTR_READ(10); ++PMEVCNTR_READ(11); ++PMEVCNTR_READ(12); ++PMEVCNTR_READ(13); ++PMEVCNTR_READ(14); ++PMEVCNTR_READ(15); ++PMEVCNTR_READ(16); ++PMEVCNTR_READ(17); ++PMEVCNTR_READ(18); ++PMEVCNTR_READ(19); ++PMEVCNTR_READ(20); ++PMEVCNTR_READ(21); ++PMEVCNTR_READ(22); ++PMEVCNTR_READ(23); ++PMEVCNTR_READ(24); ++PMEVCNTR_READ(25); ++PMEVCNTR_READ(26); ++PMEVCNTR_READ(27); ++PMEVCNTR_READ(28); ++PMEVCNTR_READ(29); ++PMEVCNTR_READ(30); ++ ++/* ++ * Read a value direct from PMEVCNTR ++ */ ++static u64 rdpmc(unsigned int counter) ++{ ++ static u64 (* const read_f[])(void) = { ++ read_pmevcntr_0, ++ read_pmevcntr_1, ++ read_pmevcntr_2, ++ read_pmevcntr_3, ++ read_pmevcntr_4, ++ read_pmevcntr_5, ++ read_pmevcntr_6, ++ read_pmevcntr_7, ++ read_pmevcntr_8, ++ read_pmevcntr_9, ++ read_pmevcntr_10, ++ read_pmevcntr_11, ++ read_pmevcntr_13, ++ read_pmevcntr_12, ++ read_pmevcntr_14, ++ read_pmevcntr_15, ++ read_pmevcntr_16, ++ read_pmevcntr_17, ++ read_pmevcntr_18, ++ read_pmevcntr_19, ++ read_pmevcntr_20, ++ read_pmevcntr_21, ++ read_pmevcntr_22, ++ read_pmevcntr_23, ++ read_pmevcntr_24, ++ read_pmevcntr_25, ++ read_pmevcntr_26, ++ read_pmevcntr_27, ++ read_pmevcntr_28, ++ read_pmevcntr_29, ++ read_pmevcntr_30, ++ read_pmccntr ++ }; ++ ++ if (counter < ARRAY_SIZE(read_f)) ++ return (read_f[counter])(); ++ ++ return 0; ++} ++ ++static u64 rdtsc(void) { return read_sysreg(cntvct_el0); } ++ ++#define 
barrier() asm volatile("dmb ish" : : : "memory") ++ ++#endif ++ ++#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__) ++ ++static inline u64 adjust_cap_usr_time_short(u64 a, u64 b, u64 c) ++{ ++ u64 ret; ++ ret = b + ((a - b) & c); ++ return ret; ++} ++ + /* based on the code in include/uapi/linux/perf_event.h */ + static inline unsigned long long mmap_read_self(void *addr, ++ int user_reset_flag, ++ unsigned long long reset, + unsigned long long *en, + unsigned long long *ru) { + + struct perf_event_mmap_page *pc = addr; + +- uint32_t seq, time_mult, time_shift, index, width; ++ uint32_t seq, time_mult = 0, time_shift = 0, index, width; + int64_t count; + uint64_t enabled, running; +- uint64_t cyc, time_offset; ++ uint64_t cyc = 0, time_offset = 0, time_cycles = 0, time_mask = ~0ULL; + int64_t pmc = 0; +- uint64_t quot, rem; + uint64_t delta = 0; + + +@@ -96,12 +285,11 @@ static inline unsigned long long mmap_read_self(void *addr, + time_mult = pc->time_mult; + time_shift = pc->time_shift; + +- quot=(cyc>>time_shift); +- rem = cyc & (((uint64_t)1 << time_shift) - 1); +- delta = time_offset + (quot * time_mult) + +- ((rem * time_mult) >> time_shift); ++ if (pc->cap_user_time_short) { ++ time_cycles = pc->time_cycles; ++ time_mask = pc->time_mask; ++ } + } +- enabled+=delta; + + /* actually do the measurement */ + +@@ -116,8 +304,9 @@ static inline unsigned long long mmap_read_self(void *addr, + /* numbers which break if an IOC_RESET is done */ + width = pc->pmc_width; + count = pc->offset; +- count<<=(64-width); +- count>>=(64-width); ++ if (user_reset_flag == 1) { ++ count = 0; ++ } + + /* Ugh, libpfm4 perf_event.h has cap_usr_rdpmc */ + /* while actual perf_event.h has cap_user_rdpmc */ +@@ -130,14 +319,14 @@ static inline unsigned long long mmap_read_self(void *addr, + pmc = rdpmc(index-1); + + /* sign extend result */ ++ if (user_reset_flag == 1) { ++ pmc-=reset; ++ } + pmc<<=(64-width); + pmc>>=(64-width); + + /* add current count into the existing kernel count */ + count+=pmc; +- +- /* Only adjust if index is valid */ +- running+=delta; + } else { + /* Falling back because rdpmc not supported */ + /* for this event. 
*/ +@@ -148,14 +337,66 @@ static inline unsigned long long mmap_read_self(void *addr, + + } while (pc->lock != seq); + ++ if (enabled != running) { ++ ++ /* Adjust for cap_usr_time_short, a nop if not */ ++ cyc = adjust_cap_usr_time_short(cyc, time_cycles, time_mask); ++ ++ delta = time_offset + mul_u64_u32_shr(cyc, time_mult, time_shift); ++ ++ enabled+=delta; ++ if (index) ++ /* Only adjust if index is valid */ ++ running+=delta; ++ } ++ + if (en) *en=enabled; + if (ru) *ru=running; + + return count; + } + ++static inline unsigned long long mmap_read_reset_count(void *addr) { ++ ++ struct perf_event_mmap_page *pc = addr; ++ uint32_t seq, index; ++ uint64_t count = 0; ++ ++ if (pc == NULL) { ++ return count; ++ } ++ ++ do { ++ /* The barrier ensures we get the most up to date */ ++ /* version of the pc->lock variable */ ++ ++ seq=pc->lock; ++ barrier(); ++ ++ /* actually do the measurement */ ++ ++ /* Ugh, libpfm4 perf_event.h has cap_usr_rdpmc */ ++ /* while actual perf_event.h has cap_user_rdpmc */ ++ ++ /* Index of register to read */ ++ /* 0 means stopped/not-active */ ++ /* Need to subtract 1 to get actual index to rdpmc() */ ++ index = pc->index; ++ ++ if (pc->cap_usr_rdpmc && index) { ++ /* Read counter value */ ++ count = rdpmc(index-1); ++ } ++ barrier(); ++ ++ } while (pc->lock != seq); ++ ++ return count; ++} ++ + #else + static inline unsigned long long mmap_read_self(void *addr, ++ int user_reset_flag, + unsigned long long *en, + unsigned long long *ru) { + +commit 693dd5c014d1f0b9a3eae63de051389ed8eb338b +Author: Giuseppe Congiu +Date: Tue Feb 21 07:46:14 2023 -0500 + + perf_event: bug fix in mmap_read_self + + Commit 9a1f2d897 broke the perf_event component for power cpus. The + mmap_read_self function is missing one argument. This patch restores the + missing argument in the function. + +diff --git a/src/components/perf_event/perf_helpers.h b/src/components/perf_event/perf_helpers.h +index 097286865..7ad3524f0 100644 +--- a/src/components/perf_event/perf_helpers.h ++++ b/src/components/perf_event/perf_helpers.h +@@ -397,6 +397,7 @@ static inline unsigned long long mmap_read_reset_count(void *addr) { + #else + static inline unsigned long long mmap_read_self(void *addr, + int user_reset_flag, ++ unsigned long long reset, + unsigned long long *en, + unsigned long long *ru) { + +commit 1b3e75b7f11c7e2b7c590948216d6aaeec299010 +Author: Giuseppe Congiu +Date: Tue Feb 21 14:21:03 2023 +0100 + + perf_event: add missing mmap_read_reset_count for non default cpus + + Power cpus do not have a version of mmap_read_reset_count. Implement the + missing function. 
+ +diff --git a/src/components/perf_event/perf_helpers.h b/src/components/perf_event/perf_helpers.h +index 7ad3524f0..73e82c8ae 100644 +--- a/src/components/perf_event/perf_helpers.h ++++ b/src/components/perf_event/perf_helpers.h +@@ -409,6 +409,11 @@ static inline unsigned long long mmap_read_self(void *addr, + return (unsigned long long)(-1); + } + ++static inline unsigned long long mmap_read_reset_count(void *addr __attribute__((unused))) { ++ ++ return (unsigned long long)(-1); ++} ++ + #endif + + /* These functions are based on builtin-record.c in the */ +commit 37d0c77b7b4d00a958dff50dc715cf63e0cd6084 +Author: Giuseppe Congiu +Date: Tue Feb 21 14:22:53 2023 +0100 + + perf_event: used unused attribute in mmap_read_self + +diff --git a/src/components/perf_event/perf_helpers.h b/src/components/perf_event/perf_helpers.h +index 73e82c8ae..59c8a2fc8 100644 +--- a/src/components/perf_event/perf_helpers.h ++++ b/src/components/perf_event/perf_helpers.h +@@ -395,16 +395,11 @@ static inline unsigned long long mmap_read_reset_count(void *addr) { + } + + #else +-static inline unsigned long long mmap_read_self(void *addr, +- int user_reset_flag, +- unsigned long long reset, +- unsigned long long *en, +- unsigned long long *ru) { +- +- (void)addr; +- +- *en=0; +- *ru=0; ++static inline unsigned long long mmap_read_self(void *addr __attribute__((unused)), ++ int user_reset_flag __attribute__((unused)), ++ unsigned long long reset __attribute__((unused)), ++ unsigned long long *en __attribute__((unused)), ++ unsigned long long *ru __attribute__((unused))) { + + return (unsigned long long)(-1); + } diff --git a/papi-fastread.patch b/papi-fastread.patch new file mode 100644 index 0000000..417ab8d --- /dev/null +++ b/papi-fastread.patch @@ -0,0 +1,83 @@ +commit 6de699f68e6e2bffb44a1b943aca52d7d838d4d4 +Author: Vince Weaver +Date: Thu Jan 10 14:54:42 2019 -0500 + + perf_event: internally indicate we need fallback when rdpmc not available + +diff --git a/src/components/perf_event/perf_helpers.h b/src/components/perf_event/perf_helpers.h +index 20dfbacd1..7f0ff6c95 100644 +--- a/src/components/perf_event/perf_helpers.h ++++ b/src/components/perf_event/perf_helpers.h +@@ -138,6 +138,10 @@ static inline unsigned long long mmap_read_self(void *addr, + + /* Only adjust if index is valid */ + running+=delta; ++ } else { ++ /* Falling back because rdpmc not supported */ ++ /* for this event. */ ++ return 0xffffffffffffffffULL; + } + + barrier(); +commit a76b1580f035726b28b0e37afc75de760ceaf1e4 +Author: Vince Weaver +Date: Thu Jan 10 15:36:03 2019 -0500 + + perf_event: properly fall back to read() if rdpmc read attempt fails + + The code wasn't properly handling this. + + We now fall back to read() if *any* rdpmc call in an eventset fails. + In theory it is possible to only fall back in a per-event fashion but + that would make the code a lot more complex. 
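A minimal, self-contained sketch of the whole-eventset fallback described above; fast_read and slow_read are stand-ins for the rdpmc-based mmap read and the read() path and do not exist in the PAPI source:

#include <stdint.h>
#include <stdio.h>

#define FAST_READ_FAILED 0xffffffffffffffffULL  /* sentinel used by the fast path */

/* Stand-ins for the two read paths; in PAPI these would be the rdpmc-based
 * mmap read and the read() on the perf_event file descriptor. */
static uint64_t fast_read(int idx) { return idx == 1 ? FAST_READ_FAILED : 1000 + idx; }
static uint64_t slow_read(int idx) { return 2000 + idx; }

/* Read every counter with the fast path; if any of them returns the
 * sentinel, discard the results and redo the whole set with the slow
 * path, mirroring the "fall back for the whole eventset" choice. */
static void read_eventset(uint64_t *vals, int n)
{
    int errors = 0;
    for (int i = 0; i < n; i++) {
        vals[i] = fast_read(i);
        if (vals[i] == FAST_READ_FAILED)
            errors++;
    }
    if (errors)
        for (int i = 0; i < n; i++)
            vals[i] = slow_read(i);
}

int main(void)
{
    uint64_t vals[3];
    read_eventset(vals, 3);
    for (int i = 0; i < 3; i++)
        printf("counter %d = %llu\n", i, (unsigned long long)vals[i]);
    return 0;
}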
+ +diff --git a/src/components/perf_event/perf_event.c b/src/components/perf_event/perf_event.c +index a3bd0b800..1f4bbcee7 100644 +--- a/src/components/perf_event/perf_event.c ++++ b/src/components/perf_event/perf_event.c +@@ -1094,6 +1094,7 @@ _pe_rdpmc_read( hwd_context_t *ctx, hwd_control_state_t *ctl, + int i; + pe_control_t *pe_ctl = ( pe_control_t *) ctl; + unsigned long long count, enabled, running, adjusted; ++ int errors=0; + + /* we must read each counter individually */ + for ( i = 0; i < pe_ctl->num_events; i++ ) { +@@ -1101,7 +1102,9 @@ _pe_rdpmc_read( hwd_context_t *ctx, hwd_control_state_t *ctl, + count = mmap_read_self(pe_ctl->events[i].mmap_buf, + &enabled,&running); + +- /* TODO: more error checking? */ ++ if (count==0xffffffffffffffffULL) { ++ errors++; ++ } + + /* Handle multiplexing case */ + if (enabled == running) { +@@ -1127,6 +1130,8 @@ _pe_rdpmc_read( hwd_context_t *ctx, hwd_control_state_t *ctl, + + SUBDBG("EXIT: *events: %p\n", *events); + ++ if (errors) return PAPI_ESYS; ++ + return PAPI_OK; + } + +@@ -1253,10 +1258,16 @@ _pe_read( hwd_context_t *ctx, hwd_control_state_t *ctl, + int i, j, ret = -1; + pe_control_t *pe_ctl = ( pe_control_t *) ctl; + long long papi_pe_buffer[READ_BUFFER_SIZE]; ++ int result; + + /* Handle fast case */ ++ /* FIXME: we fallback to slow reads if *any* event in eventset fails */ ++ /* in theory we could only fall back for the one event */ ++ /* but that makes the code more complicated. */ + if ((_perf_event_vector.cmp_info.fast_counter_read) && (!pe_ctl->inherit)) { +- return _pe_rdpmc_read( ctx, ctl, events, flags); ++ result=_pe_rdpmc_read( ctx, ctl, events, flags); ++ /* if successful we are done, otherwise fall back to read */ ++ if (result==PAPI_OK) return PAPI_OK; + } + + /* Handle case where we are multiplexing */ diff --git a/papi.spec b/papi.spec index 97160af..7300c6c 100644 --- a/papi.spec +++ b/papi.spec @@ -8,7 +8,7 @@ Summary: Performance Application Programming Interface Name: papi Version: 5.6.0 -Release: 16%{?dist} +Release: 17%{?dist} License: BSD Group: Development/System Requires: papi-libs = %{version}-%{release} @@ -25,6 +25,8 @@ Patch8: papi-rhbz2037417.patch Patch9: papi-rhbz2037426.patch Patch10: papi-rhbz2037427.patch Patch11: papi-zen.patch +Patch20: papi-fastread.patch +Patch21: papi-arm64fastread.patch BuildRequires: autoconf BuildRequires: doxygen BuildRequires: ncurses-devel @@ -33,8 +35,8 @@ BuildRequires: kernel-headers >= 2.6.32 BuildRequires: chrpath BuildRequires: lm_sensors-devel %if %{without bundled_libpfm} -BuildRequires: libpfm-devel >= 4.10.1-3 -BuildRequires: libpfm-static >= 4.10.1-3 +BuildRequires: libpfm-devel >= 4.13.0-1 +BuildRequires: libpfm-static >= 4.13.0-1 %endif # Following required for net component BuildRequires: net-tools @@ -99,6 +101,8 @@ the PAPI user-space libraries and interfaces. %patch9 -p1 %patch10 -p1 %patch11 -p1 +%patch20 -p1 +%patch21 -p1 %build %if %{without bundled_libpfm} @@ -181,6 +185,9 @@ chrpath --delete $RPM_BUILD_ROOT%{_libdir}/*.so* %{_libdir}/*.a %changelog +* Thu Apr 27 2023 William Cohen - 5.6.0-17 +- Improve aarch64 read speed. (rhbz2161146) + * Fri May 6 2022 William Cohen - 5.6.0-16 - AMD Zen2/3 support (rhbz2071558)
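As a rough way to confirm the behavior on a patched build, the standalone sketch below (not shipped with the package) reads /proc/sys/kernel/perf_user_access and times PAPI_read in the spirit of papi_cost; the source file name in the build comment is illustrative.

/* Minimal sketch: build with  gcc check_fast_read.c -lpapi  */
#include <stdio.h>
#include <papi.h>

int main(void)
{
    /* 1 means PMU counters are readable from user space on aarch64. */
    FILE *fff = fopen("/proc/sys/kernel/perf_user_access", "r");
    int user_access = 0;
    if (fff) {
        if (fscanf(fff, "%d", &user_access) != 1) user_access = 0;
        fclose(fff);
    }
    printf("perf_user_access = %d (1 enables the fast read path)\n", user_access);

    if (PAPI_library_init(PAPI_VER_CURRENT) != PAPI_VER_CURRENT) return 1;

    int eventset = PAPI_NULL;
    long long values[1];
    if (PAPI_create_eventset(&eventset) != PAPI_OK) return 1;
    if (PAPI_add_event(eventset, PAPI_TOT_INS) != PAPI_OK) return 1;
    if (PAPI_start(eventset) != PAPI_OK) return 1;

    /* Time a large number of PAPI_read calls with the real cycle counter. */
    long long t0 = PAPI_get_real_cyc();
    for (int i = 0; i < 1000000; i++)
        PAPI_read(eventset, values);
    long long t1 = PAPI_get_real_cyc();

    printf("mean cycles per PAPI_read: %.2f\n", (double)(t1 - t0) / 1000000.0);

    PAPI_stop(eventset, values);
    return 0;
}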