From be501ad0473feec7b66330c05e0d890015837e1a Mon Sep 17 00:00:00 2001 From: eabdullin Date: Thu, 21 Sep 2023 19:48:59 +0000 Subject: [PATCH] import CS papi-6.0.0-15.el9 --- SOURCES/papi-701eventupdate.patch | 796 ++++++++++++++++++++++++++++++ SOURCES/papi-a64fx.patch | 34 -- SOURCES/papi-arm64fastread.patch | 637 ++++++++++++++++++++++++ SOURCES/papi-thread_init.patch | 106 ++++ SPECS/papi.spec | 23 +- 5 files changed, 1557 insertions(+), 39 deletions(-) create mode 100644 SOURCES/papi-701eventupdate.patch delete mode 100644 SOURCES/papi-a64fx.patch create mode 100644 SOURCES/papi-arm64fastread.patch create mode 100644 SOURCES/papi-thread_init.patch diff --git a/SOURCES/papi-701eventupdate.patch b/SOURCES/papi-701eventupdate.patch new file mode 100644 index 0000000..46d592b --- /dev/null +++ b/SOURCES/papi-701eventupdate.patch @@ -0,0 +1,796 @@ +commit ae449f73abd0849f05ab3e1f3a64bde0c670c645 +Author: Anthony +Date: Fri Jul 17 12:05:14 2020 -0400 + + Separated the cache preset events of AMD Zen1 and Zen2 and added some more. + +diff --git a/src/papi_events.csv b/src/papi_events.csv +index 8e96adfbd..2325bd4dc 100644 +--- a/src/papi_events.csv ++++ b/src/papi_events.csv +@@ -397,7 +397,6 @@ PRESET,PAPI_FSQ_INS,NOT_DERIVED,RETIRED_SSE_AVX_OPERATIONS:SINGLE_DIV_OPS:DOUBLE + # + CPU,amd64_fam17h + CPU,amd64_fam17h_zen1 +-CPU,amd64_fam17h_zen2 + # + PRESET,PAPI_TOT_INS,NOT_DERIVED,RETIRED_INSTRUCTIONS + PRESET,PAPI_TOT_CYC,NOT_DERIVED,CYCLES_NOT_IN_HALT +@@ -434,6 +433,27 @@ PRESET,PAPI_FML_INS,NOT_DERIVED,RETIRED_SSE_AVX_OPERATIONS:SP_MULT_FLOPS:DP_MULT + PRESET,PAPI_FAD_INS,NOT_DERIVED,RETIRED_SSE_AVX_OPERATIONS:SP_ADD_SUB_FLOPS:DP_ADD_SUB_FLOPS + PRESET,PAPI_FDV_INS,NOT_DERIVED,RETIRED_SSE_AVX_OPERATIONS:SP_DIV_FLOPS:DP_DIV_FLOPS,NOTE,"Counts both divide and square root instructions" + PRESET,PAPI_FSQ_INS,NOT_DERIVED,RETIRED_SSE_AVX_OPERATIONS:SP_DIV_FLOPS:DP_DIV_FLOPS,NOTE,"Counts both divide and square root instructions" ++# Events discovered via CAT ++PRESET,PAPI_L2_DCM,NOT_DERIVED,CORE_TO_L2_CACHEABLE_REQUEST_ACCESS_STATUS:LS_RD_BLK_C ++PRESET,PAPI_L2_DCR,NOT_DERIVED,REQUESTS_TO_L2_GROUP1:RD_BLK_L ++PRESET,PAPI_L2_DCH,NOT_DERIVED,CORE_TO_L2_CACHEABLE_REQUEST_ACCESS_STATUS:LS_RD_BLK_L_HIT_X ++# ++# ++CPU,amd64_fam17h_zen2 ++# Events copied from zen1 that also exist on zen2 ++PRESET,PAPI_TLB_DM,NOT_DERIVED,L1_DTLB_MISS:TLB_RELOAD_1G_L2_MISS:TLB_RELOAD_2M_L2_MISS:TLB_RELOAD_COALESCED_PAGE_MISS:TLB_RELOAD_4K_L2_MISS:TLB_RELOAD_1G_L2_HIT:TLB_RELOAD_2M_L2_HIT:TLB_RELOAD_COALESCED_PAGE_HIT:TLB_RELOAD_4K_L2_HIT ++PRESET,PAPI_TLB_IM,DERIVED_ADD,L1_ITLB_MISS_L2_ITLB_HIT,L1_ITLB_MISS_L2_ITLB_MISS:IF1G:IF2M:IF4K ++PRESET,PAPI_BR_TKN,NOT_DERIVED,RETIRED_TAKEN_BRANCH_INSTRUCTIONS ++PRESET,PAPI_BR_MSP,NOT_DERIVED,RETIRED_BRANCH_INSTRUCTIONS_MISPREDICTED ++PRESET,PAPI_TOT_INS,NOT_DERIVED,RETIRED_INSTRUCTIONS ++PRESET,PAPI_BR_INS,NOT_DERIVED,RETIRED_BRANCH_INSTRUCTIONS ++PRESET,PAPI_TOT_CYC,NOT_DERIVED,CYCLES_NOT_IN_HALT ++# Events discovered via CAT ++PRESET,PAPI_L1_DCA,NOT_DERIVED,perf::PERF_COUNT_HW_CACHE_L1D:ACCESS ++PRESET,PAPI_L2_DCM,NOT_DERIVED,CORE_TO_L2_CACHEABLE_REQUEST_ACCESS_STATUS:LS_RD_BLK_C ++PRESET,PAPI_L2_DCR,NOT_DERIVED,REQUESTS_TO_L2_GROUP1:RD_BLK_L ++PRESET,PAPI_L2_DCH,NOT_DERIVED,CORE_TO_L2_CACHEABLE_REQUEST_ACCESS_STATUS:LS_RD_BLK_L_HIT_X ++ + # + # + CPU,Intel architectural PMU +@@ -1877,6 +1897,21 @@ PRESET,PAPI_L2_DCR,NOT_DERIVED,L2D_CACHE_RD + PRESET,PAPI_L2_DCW,NOT_DERIVED,L2D_CACHE_WR + PRESET,PAPI_L2_LDM,NOT_DERIVED,L2D_CACHE_REFILL_RD + ++######################### ++# ARM Fujitsu A64FX # ++######################### ++CPU,arm_a64fx ++# ++PRESET,PAPI_TOT_INS,NOT_DERIVED,INST_RETIRED ++PRESET,PAPI_TOT_CYC,NOT_DERIVED,CPU_CYCLES ++PRESET,PAPI_FP_INS,NOT_DERIVED,VFP_SPEC ++PRESET,PAPI_VEC_INS,NOT_DERIVED,ASE_SPEC ++PRESET,PAPI_L1_DCM,NOT_DERIVED,L1D_CACHE_REFILL ++PRESET,PAPI_L1_ICA,NOT_DERIVED,L1I_CACHE ++PRESET,PAPI_L1_ICM,NOT_DERIVED,L1I_CACHE_REFILL ++PRESET,PAPI_L2_DCH,NOT_DERIVED,L2D_CACHE ++PRESET,PAPI_L2_DCM,NOT_DERIVED,L2D_CACHE_REFILL ++ + # + CPU,mips_74k + # +commit ccc22b5dda46fea8933d99950c3e30b5298cdd1d +Author: Heike Jagode +Date: Thu Sep 24 13:33:38 2020 -0400 + + Added presets for floating-point operations (FP_OPS, DP_OPS, SP_OPS) + for AMD zen2. + + PPR (under section 2.1.15.3. -- https://www.amd.com/system/files/TechDocs/54945_3.03_ppr_ZP_B2_pub.zip) + explains that FLOP events require MergeEvent support, which was included + in the 5.6 kernel. + + ===>>> Hence, a kernel version 5.6 or greater is required. + + NOTE: without the MergeEvent support in the kernel, + there is no guarantee that the SSE/AVX FLOP + events produce any useful data whatsoever. + + These events have been tested and verified for + scalar flops, SSE, AVX, and FMA: + + (1) for one AVX instruction (e.g. _mm256_add_pd()), + the RETIRED_SSE_AVX_FLOPS:ADD_SUB_FLOPS event returns + a count of 4 (in the case of double precision), and + a count of 8 (in the case of single precision). + + (2) for one AVX FMA instruction (e.g. _mm256_macc_pd()), + the RETIRED_SSE_AVX_FLOPS:MAC_FLOPS event returns + a count of 8 (in the case of double precision), and + a count of 16 (in the case of single precision). + + (3) for one SSE instruction (e.g. _mm_mul_pd()), + the RETIRED_SSE_AVX_FLOPS:MULT_FLOPS event returns + a count of 2 (in the case of double precision), and + a count of 4 (in the case of single precision). + +diff --git a/src/papi_events.csv b/src/papi_events.csv +index 2325bd4dc..2ff3e4d16 100644 +--- a/src/papi_events.csv ++++ b/src/papi_events.csv +@@ -454,8 +454,19 @@ PRESET,PAPI_L2_DCM,NOT_DERIVED,CORE_TO_L2_CACHEABLE_REQUEST_ACCESS_STATUS:LS_RD_ + PRESET,PAPI_L2_DCR,NOT_DERIVED,REQUESTS_TO_L2_GROUP1:RD_BLK_L + PRESET,PAPI_L2_DCH,NOT_DERIVED,CORE_TO_L2_CACHEABLE_REQUEST_ACCESS_STATUS:LS_RD_BLK_L_HIT_X + +-# +-# ++# New FLOP event on zen2 ++# PPR (under section 2.1.15.3. -- ++# https://www.amd.com/system/files/TechDocs/54945_3.03_ppr_ZP_B2_pub.zip) ++# explains that FLOP events require MergeEvent support, which was included ++# in the 5.6 kernel. ++# Hence, a kernel version 5.6 or greater is required. ++# NOTE: without the MergeEvent support in the kernel, there is no guarantee ++# that this SSE/AVX FLOP event produces any useful data whatsoever. ++PRESET,PAPI_FP_OPS,NOT_DERIVED,RETIRED_SSE_AVX_FLOPS:ANY ++PRESET,PAPI_DP_OPS,NOT_DERIVED,RETIRED_SSE_AVX_FLOPS:ANY ++PRESET,PAPI_SP_OPS,NOT_DERIVED,RETIRED_SSE_AVX_FLOPS:ANY ++ ++ + CPU,Intel architectural PMU + CPU,ix86arch + # +commit 35f93252a6e222299c03f2c94912334488e76b02 +Author: Heike Jagode +Date: Thu Sep 24 18:40:59 2020 -0400 + + Added presets for floating-point instructions (FP_INS, VEC_DP, VEC_SP) + for AMD zen2. + + For unoptimized code (like native MMM), these events may include + non-numeric floating-point instructions, e.g. MOVSD: move or merge + scalar double-precision floating-point value instructions. + + Tested with: + 1) SSE double: _mm_mul_pd / _mm_add_pd + 2) SSE single: _mm_mul_ps / _mm_add_ps + 3) AVX double: _mm256_mul_pd / _mm256_add_pd + 4) AVX single: _mm256_mul_ps / _mm256_add_ps + 5) FMA double: _mm256_macc_pd + 6) FMA single: _mm256_macc_pd + +diff --git a/src/papi_events.csv b/src/papi_events.csv +index 2ff3e4d16..60a64564d 100644 +--- a/src/papi_events.csv ++++ b/src/papi_events.csv +@@ -465,6 +465,11 @@ PRESET,PAPI_L2_DCH,NOT_DERIVED,CORE_TO_L2_CACHEABLE_REQUEST_ACCESS_STATUS:LS_RD_ + PRESET,PAPI_FP_OPS,NOT_DERIVED,RETIRED_SSE_AVX_FLOPS:ANY + PRESET,PAPI_DP_OPS,NOT_DERIVED,RETIRED_SSE_AVX_FLOPS:ANY + PRESET,PAPI_SP_OPS,NOT_DERIVED,RETIRED_SSE_AVX_FLOPS:ANY ++# Floating-point instructions (including non-numeric floating-point instructions, ++# e.g. Move or Merge Scalar Double-Precision Floating-Point values) ++PAPI_FP_INS,NOT_DERIVED,RETIRED_MMX_FP_INSTRUCTIONS:SSE_INSTR:MMX_INSTR:X87_INSTR ++PAPI_VEC_DP,NOT_DERIVED,RETIRED_MMX_FP_INSTRUCTIONS:SSE_INSTR:MMX_INSTR:X87_INSTR ++PAPI_VEC_SP,NOT_DERIVED,RETIRED_MMX_FP_INSTRUCTIONS:SSE_INSTR:MMX_INSTR:X87_INSTR + + + CPU,Intel architectural PMU +commit 344f6493425d865577508ff32b6f65516b1b4394 +Author: Heike Jagode +Date: Thu Sep 24 19:03:31 2020 -0400 + + Added missing 'PRESET' to csv file. + +diff --git a/src/papi_events.csv b/src/papi_events.csv +index 60a64564d..724d520f0 100644 +--- a/src/papi_events.csv ++++ b/src/papi_events.csv +@@ -467,9 +467,9 @@ PRESET,PAPI_DP_OPS,NOT_DERIVED,RETIRED_SSE_AVX_FLOPS:ANY + PRESET,PAPI_SP_OPS,NOT_DERIVED,RETIRED_SSE_AVX_FLOPS:ANY + # Floating-point instructions (including non-numeric floating-point instructions, + # e.g. Move or Merge Scalar Double-Precision Floating-Point values) +-PAPI_FP_INS,NOT_DERIVED,RETIRED_MMX_FP_INSTRUCTIONS:SSE_INSTR:MMX_INSTR:X87_INSTR +-PAPI_VEC_DP,NOT_DERIVED,RETIRED_MMX_FP_INSTRUCTIONS:SSE_INSTR:MMX_INSTR:X87_INSTR +-PAPI_VEC_SP,NOT_DERIVED,RETIRED_MMX_FP_INSTRUCTIONS:SSE_INSTR:MMX_INSTR:X87_INSTR ++PRESET,PAPI_FP_INS,NOT_DERIVED,RETIRED_MMX_FP_INSTRUCTIONS:SSE_INSTR:MMX_INSTR:X87_INSTR ++PRESET,PAPI_VEC_DP,NOT_DERIVED,RETIRED_MMX_FP_INSTRUCTIONS:SSE_INSTR:MMX_INSTR:X87_INSTR ++PRESET,PAPI_VEC_SP,NOT_DERIVED,RETIRED_MMX_FP_INSTRUCTIONS:SSE_INSTR:MMX_INSTR:X87_INSTR + + + CPU,Intel architectural PMU +commit 4616aa717c5301a9a478876661eb8ac1f18c0333 +Author: Heike Jagode +Date: Thu Oct 8 11:36:23 2020 -0400 + + For zen2, since FP_OPS counts both single- and double-prec operations + correctly, we don't need to confuse the user with additional + DP_OPS and SP_OPS events. So, I'm taking them out. + + Same applies for events counting FP instructions. + +diff --git a/src/papi_events.csv b/src/papi_events.csv +index 724d520f0..9ebf557e1 100644 +--- a/src/papi_events.csv ++++ b/src/papi_events.csv +@@ -463,13 +463,20 @@ PRESET,PAPI_L2_DCH,NOT_DERIVED,CORE_TO_L2_CACHEABLE_REQUEST_ACCESS_STATUS:LS_RD_ + # NOTE: without the MergeEvent support in the kernel, there is no guarantee + # that this SSE/AVX FLOP event produces any useful data whatsoever. + PRESET,PAPI_FP_OPS,NOT_DERIVED,RETIRED_SSE_AVX_FLOPS:ANY +-PRESET,PAPI_DP_OPS,NOT_DERIVED,RETIRED_SSE_AVX_FLOPS:ANY +-PRESET,PAPI_SP_OPS,NOT_DERIVED,RETIRED_SSE_AVX_FLOPS:ANY ++# Since FP_OPS counts both single- and double-prec operations ++# correctly, we don't need to confuse the user with additional ++# DP_OPS and SP_OPS events. So, I'm taking them out. ++#PRESET,PAPI_DP_OPS,NOT_DERIVED,RETIRED_SSE_AVX_FLOPS:ANY ++#PRESET,PAPI_SP_OPS,NOT_DERIVED,RETIRED_SSE_AVX_FLOPS:ANY ++# + # Floating-point instructions (including non-numeric floating-point instructions, + # e.g. Move or Merge Scalar Double-Precision Floating-Point values) + PRESET,PAPI_FP_INS,NOT_DERIVED,RETIRED_MMX_FP_INSTRUCTIONS:SSE_INSTR:MMX_INSTR:X87_INSTR +-PRESET,PAPI_VEC_DP,NOT_DERIVED,RETIRED_MMX_FP_INSTRUCTIONS:SSE_INSTR:MMX_INSTR:X87_INSTR +-PRESET,PAPI_VEC_SP,NOT_DERIVED,RETIRED_MMX_FP_INSTRUCTIONS:SSE_INSTR:MMX_INSTR:X87_INSTR ++# Since FP_INS counts both single- and double-prec instuctions ++# correctly, we don't need to confuse the user with additional ++# VEC_DP and VEC_SP events. So, I'm taking them out. ++#PRESET,PAPI_VEC_DP,NOT_DERIVED,RETIRED_MMX_FP_INSTRUCTIONS:SSE_INSTR:MMX_INSTR:X87_INSTR ++#PRESET,PAPI_VEC_SP,NOT_DERIVED,RETIRED_MMX_FP_INSTRUCTIONS:SSE_INSTR:MMX_INSTR:X87_INSTR + + + CPU,Intel architectural PMU +commit 274219e85ba8adcd2e9c78507adf7edb05b71daa +Author: Sebastian Mobo +Date: Thu Oct 8 13:40:21 2020 -0400 + + Added instruction-cache preset events for the Zen2. + + Signed-off-by: Anthony + +diff --git a/src/papi_events.csv b/src/papi_events.csv +index 9ebf557e1..fd75f9371 100644 +--- a/src/papi_events.csv ++++ b/src/papi_events.csv +@@ -453,7 +453,12 @@ PRESET,PAPI_L1_DCA,NOT_DERIVED,perf::PERF_COUNT_HW_CACHE_L1D:ACCESS + PRESET,PAPI_L2_DCM,NOT_DERIVED,CORE_TO_L2_CACHEABLE_REQUEST_ACCESS_STATUS:LS_RD_BLK_C + PRESET,PAPI_L2_DCR,NOT_DERIVED,REQUESTS_TO_L2_GROUP1:RD_BLK_L + PRESET,PAPI_L2_DCH,NOT_DERIVED,CORE_TO_L2_CACHEABLE_REQUEST_ACCESS_STATUS:LS_RD_BLK_L_HIT_X +- ++# ++PRESET,PAPI_L1_ICM,NOT_DERIVED,REQUESTS_TO_L2_GROUP1:CACHEABLE_IC_READ ++# ++PRESET,PAPI_L2_ICR,NOT_DERIVED,REQUESTS_TO_L2_GROUP1:CACHEABLE_IC_READ ++PRESET,PAPI_L2_ICM,NOT_DERIVED,CORE_TO_L2_CACHEABLE_REQUEST_ACCESS_STATUS:IC_FILL_MISS ++PRESET,PAPI_L2_ICH,NOT_DERIVED,CORE_TO_L2_CACHEABLE_REQUEST_ACCESS_STATUS:IC_FILL_HIT_X:IC_FILL_HIT_S + # New FLOP event on zen2 + # PPR (under section 2.1.15.3. -- + # https://www.amd.com/system/files/TechDocs/54945_3.03_ppr_ZP_B2_pub.zip) +commit b87ac4beda096086e0040f8ec1b44c4791a9739c +Author: Masahiko, Yamada +Date: Mon Dec 14 14:06:22 2020 +0900 + + Corrected typo for A64FX support (PAPI_L2_DCH is a typo of PAPI_L2_DCA) + +diff --git a/src/papi_events.csv b/src/papi_events.csv +index fd75f9371..164f05641 100644 +--- a/src/papi_events.csv ++++ b/src/papi_events.csv +@@ -1937,7 +1937,7 @@ PRESET,PAPI_VEC_INS,NOT_DERIVED,ASE_SPEC + PRESET,PAPI_L1_DCM,NOT_DERIVED,L1D_CACHE_REFILL + PRESET,PAPI_L1_ICA,NOT_DERIVED,L1I_CACHE + PRESET,PAPI_L1_ICM,NOT_DERIVED,L1I_CACHE_REFILL +-PRESET,PAPI_L2_DCH,NOT_DERIVED,L2D_CACHE ++PRESET,PAPI_L2_DCA,NOT_DERIVED,L2D_CACHE + PRESET,PAPI_L2_DCM,NOT_DERIVED,L2D_CACHE_REFILL + + # +commit 869864f813f0681b5c9a4b65de2135c8708a2afb +Author: Masahiko, Yamada +Date: Mon Dec 14 19:34:59 2020 +0900 + + Add or modify various A64FX support events, including floating point events (PAPI_FP_OPS, PAPI_SP_OPS, PAPI_DP_OPS). + +diff --git a/src/papi_events.csv b/src/papi_events.csv +index 164f05641..9192b1041 100644 +--- a/src/papi_events.csv ++++ b/src/papi_events.csv +@@ -1930,15 +1930,46 @@ PRESET,PAPI_L2_LDM,NOT_DERIVED,L2D_CACHE_REFILL_RD + ######################### + CPU,arm_a64fx + # ++PRESET,PAPI_PRF_DM,DERIVED_SUB,L2D_CACHE_REFILL_PRF,L2D_CACHE_MIBMCH_PRF ++PRESET,PAPI_MEM_SCY,NOT_DERIVED,LD_COMP_WAIT_L2_MISS ++PRESET,PAPI_STL_ICY,DERIVED_ADD,STALL_FRONTEND,STALL_BACKEND ++PRESET,PAPI_STL_CCY,NOT_DERIVED,0INST_COMMIT ++PRESET,PAPI_FUL_CCY,DERIVED_SUB,CPU_CYCLES,0INST_COMMIT,1INST_COMMIT,2INST_COMMIT,3INST_COMMIT,4INST_COMMIT ++PRESET,PAPI_HW_INT,DERIVED_ADD,EXC_IRQ,EXC_FIQ ++PRESET,PAPI_BR_MSP,NOT_DERIVED,BR_MIS_PRED ++PRESET,PAPI_BR_PRC,DERIVED_SUB,BR_PRED,BR_MIS_PRED ++PRESET,PAPI_FMA_INS,NOT_DERIVED,FP_FMA_SPEC + PRESET,PAPI_TOT_INS,NOT_DERIVED,INST_RETIRED + PRESET,PAPI_TOT_CYC,NOT_DERIVED,CPU_CYCLES + PRESET,PAPI_FP_INS,NOT_DERIVED,VFP_SPEC ++PRESET,PAPI_LD_INS,NOT_DERIVED,LD_SPEC ++PRESET,PAPI_SR_INS,NOT_DERIVED,ST_SPEC ++PRESET,PAPI_BR_INS,NOT_DERIVED,BR_PRED + PRESET,PAPI_VEC_INS,NOT_DERIVED,ASE_SPEC ++PRESET,PAPI_RES_STL,NOT_DERIVED,STALL_BACKEND ++PRESET,PAPI_LST_INS,NOT_DERIVED,LDST_SPEC ++PRESET,PAPI_SYC_INS,DERIVED_ADD,ISB_SPEC,DSB_SPEC,DMB_SPEC ++PRESET,PAPI_L1_DCA,NOT_DERIVED,L1D_CACHE ++PRESET,PAPI_L1_DCH,DERIVED_SUB,L1D_CACHE,L1D_CACHE_REFILL + PRESET,PAPI_L1_DCM,NOT_DERIVED,L1D_CACHE_REFILL + PRESET,PAPI_L1_ICA,NOT_DERIVED,L1I_CACHE ++PRESET,PAPI_L1_ICH,DERIVED_SUB,L1I_CACHE,L1I_CACHE_REFILL + PRESET,PAPI_L1_ICM,NOT_DERIVED,L1I_CACHE_REFILL ++PRESET,PAPI_L1_TCA,DERIVED_ADD,L1D_CACHE,L1I_CACHE ++PRESET,PAPI_L1_TCH,DERIVED_POSTFIX,N0|N1|-|N2|+|N3|-|,L1D_CACHE,L1D_CACHE_REFILL,L1I_CACHE,L1I_CACHE_REFILL ++PRESET,PAPI_L1_TCM,DERIVED_ADD,L1D_CACHE_REFILL,L1I_CACHE_REFILL + PRESET,PAPI_L2_DCA,NOT_DERIVED,L2D_CACHE +-PRESET,PAPI_L2_DCM,NOT_DERIVED,L2D_CACHE_REFILL ++PRESET,PAPI_L2_DCH,DERIVED_POSTFIX,N0|N1|-|N2|+|N3|+|,L2D_CACHE,L2D_CACHE_REFILL,L2D_SWAP_DM,L2D_CACHE_MIBMCH_PRF ++PRESET,PAPI_L2_DCM,DERIVED_SUB,L2D_CACHE_REFILL,L2D_SWAP_DM,L2D_CACHE_MIBMCH_PRF ++PRESET,PAPI_L2_TCA,NOT_DERIVED,L2D_CACHE ++PRESET,PAPI_L2_TCH,DERIVED_POSTFIX,N0|N1|-|N2|+|N3|+|,L2D_CACHE,L2D_CACHE_REFILL,L2D_SWAP_DM,L2D_CACHE_MIBMCH_PRF ++PRESET,PAPI_L2_TCM,DERIVED_SUB,L2D_CACHE_REFILL,L2D_SWAP_DM,L2D_CACHE_MIBMCH_PRF ++PRESET,PAPI_TLB_DM,NOT_DERIVED,L2D_TLB_REFILL ++PRESET,PAPI_TLB_IM,NOT_DERIVED,L2I_TLB_REFILL ++PRESET,PAPI_TLB_TL,DERIVED_ADD,L2D_TLB_REFILL,L2I_TLB_REFILL ++PRESET,PAPI_FP_OPS,DERIVED_POSTFIX,N0|512|128|/|*|N1|+|,FP_SCALE_OPS_SPEC,FP_FIXED_OPS_SPEC ++PRESET,PAPI_SP_OPS,DERIVED_POSTFIX,N0|512|128|/|*|N1|+|,FP_SP_SCALE_OPS_SPEC,FP_SP_FIXED_OPS_SPEC ++PRESET,PAPI_DP_OPS,DERIVED_POSTFIX,N0|512|128|/|*|N1|+|,FP_DP_SCALE_OPS_SPEC,FP_DP_FIXED_OPS_SPEC + + # + CPU,mips_74k +commit 7a3c22763ef2ba00a2b8cb069c3501f35ecb13de +Author: Masahiko, Yamada +Date: Tue Dec 15 13:43:43 2020 +0900 + + modify PAPI_FP_INS and PAPI_VEC_INS for A64FX supports + +diff --git a/src/papi_events.csv b/src/papi_events.csv +index 9192b1041..7b4ceb674 100644 +--- a/src/papi_events.csv ++++ b/src/papi_events.csv +@@ -1941,11 +1941,11 @@ PRESET,PAPI_BR_PRC,DERIVED_SUB,BR_PRED,BR_MIS_PRED + PRESET,PAPI_FMA_INS,NOT_DERIVED,FP_FMA_SPEC + PRESET,PAPI_TOT_INS,NOT_DERIVED,INST_RETIRED + PRESET,PAPI_TOT_CYC,NOT_DERIVED,CPU_CYCLES +-PRESET,PAPI_FP_INS,NOT_DERIVED,VFP_SPEC ++PRESET,PAPI_FP_INS,NOT_DERIVED,FP_SPEC + PRESET,PAPI_LD_INS,NOT_DERIVED,LD_SPEC + PRESET,PAPI_SR_INS,NOT_DERIVED,ST_SPEC + PRESET,PAPI_BR_INS,NOT_DERIVED,BR_PRED +-PRESET,PAPI_VEC_INS,NOT_DERIVED,ASE_SPEC ++PRESET,PAPI_VEC_INS,NOT_DERIVED,SIMD_INST_RETIRED + PRESET,PAPI_RES_STL,NOT_DERIVED,STALL_BACKEND + PRESET,PAPI_LST_INS,NOT_DERIVED,LDST_SPEC + PRESET,PAPI_SYC_INS,DERIVED_ADD,ISB_SPEC,DSB_SPEC,DMB_SPEC +commit 530d4763fb8e6dd52109387bd58c8c1305fd6b63 +Author: Masahiko, Yamada +Date: Fri Feb 12 15:01:21 2021 +0900 + + remove PAPI_L1_DCA and PAPI_L1_DCH for a64fx + + There seems to be a problem with PAPI_L1_DCA and PAPI_L1_DCH for a64fx that prefetch overcounts. + I delete (comment out) PAPI_L1_DCA and PAPI_L1_DCH for a64fx from the papi_events.csv file. + I will issue the pullrequest again once I have identified how to handle the overcount. + +diff --git a/src/papi_events.csv b/src/papi_events.csv +index 7b4ceb674..0f5ec8344 100644 +--- a/src/papi_events.csv ++++ b/src/papi_events.csv +@@ -1949,8 +1949,8 @@ PRESET,PAPI_VEC_INS,NOT_DERIVED,SIMD_INST_RETIRED + PRESET,PAPI_RES_STL,NOT_DERIVED,STALL_BACKEND + PRESET,PAPI_LST_INS,NOT_DERIVED,LDST_SPEC + PRESET,PAPI_SYC_INS,DERIVED_ADD,ISB_SPEC,DSB_SPEC,DMB_SPEC +-PRESET,PAPI_L1_DCA,NOT_DERIVED,L1D_CACHE +-PRESET,PAPI_L1_DCH,DERIVED_SUB,L1D_CACHE,L1D_CACHE_REFILL ++#PRESET,PAPI_L1_DCA,NOT_DERIVED,L1D_CACHE ++#PRESET,PAPI_L1_DCH,DERIVED_SUB,L1D_CACHE,L1D_CACHE_REFILL + PRESET,PAPI_L1_DCM,NOT_DERIVED,L1D_CACHE_REFILL + PRESET,PAPI_L1_ICA,NOT_DERIVED,L1I_CACHE + PRESET,PAPI_L1_ICH,DERIVED_SUB,L1I_CACHE,L1I_CACHE_REFILL +commit 340f68940234f2db181147fc249907b4f1293e62 +Author: Masahiko, Yamada +Date: Tue Feb 16 17:16:24 2021 +0900 + + remove PAPI_L1_TCA and PAPI_L1_TCH for a64fx + + PAPI_L1_TCA and PAPI_L1_TCH for a64fx measure L1D_CACHE just like PAPI_L1_DCA and PAPI_L1_DCH, + so I delete (comment out) PAPI_L1_TCA and PAPI_L1_TCH for a64fx from the papi_events.csv file. + +diff --git a/src/papi_events.csv b/src/papi_events.csv +index 0f5ec8344..4ef647959 100644 +--- a/src/papi_events.csv ++++ b/src/papi_events.csv +@@ -1955,8 +1955,8 @@ PRESET,PAPI_L1_DCM,NOT_DERIVED,L1D_CACHE_REFILL + PRESET,PAPI_L1_ICA,NOT_DERIVED,L1I_CACHE + PRESET,PAPI_L1_ICH,DERIVED_SUB,L1I_CACHE,L1I_CACHE_REFILL + PRESET,PAPI_L1_ICM,NOT_DERIVED,L1I_CACHE_REFILL +-PRESET,PAPI_L1_TCA,DERIVED_ADD,L1D_CACHE,L1I_CACHE +-PRESET,PAPI_L1_TCH,DERIVED_POSTFIX,N0|N1|-|N2|+|N3|-|,L1D_CACHE,L1D_CACHE_REFILL,L1I_CACHE,L1I_CACHE_REFILL ++#PRESET,PAPI_L1_TCA,DERIVED_ADD,L1D_CACHE,L1I_CACHE ++#PRESET,PAPI_L1_TCH,DERIVED_POSTFIX,N0|N1|-|N2|+|N3|-|,L1D_CACHE,L1D_CACHE_REFILL,L1I_CACHE,L1I_CACHE_REFILL + PRESET,PAPI_L1_TCM,DERIVED_ADD,L1D_CACHE_REFILL,L1I_CACHE_REFILL + PRESET,PAPI_L2_DCA,NOT_DERIVED,L2D_CACHE + PRESET,PAPI_L2_DCH,DERIVED_POSTFIX,N0|N1|-|N2|+|N3|+|,L2D_CACHE,L2D_CACHE_REFILL,L2D_SWAP_DM,L2D_CACHE_MIBMCH_PRF +commit 02f34baafb868d183f21bebfd3c46574847b9929 +Author: Swarup Sahoo +Date: Tue May 18 02:51:56 2021 +0530 + + Added AMD Zen3 preset events. Refer section 2.1.17.2 of PPR for AMD family 19h model 01h, https://www.amd.com/system/files/TechDocs/55898_pub.zip + + Signed-off-by: Swarup Sahoo + +diff --git a/src/papi_events.csv b/src/papi_events.csv +index 4ef647959..d9e9da8a3 100644 +--- a/src/papi_events.csv ++++ b/src/papi_events.csv +@@ -482,6 +482,33 @@ PRESET,PAPI_FP_INS,NOT_DERIVED,RETIRED_MMX_FP_INSTRUCTIONS:SSE_INSTR:MMX_INSTR:X + # VEC_DP and VEC_SP events. So, I'm taking them out. + #PRESET,PAPI_VEC_DP,NOT_DERIVED,RETIRED_MMX_FP_INSTRUCTIONS:SSE_INSTR:MMX_INSTR:X87_INSTR + #PRESET,PAPI_VEC_SP,NOT_DERIVED,RETIRED_MMX_FP_INSTRUCTIONS:SSE_INSTR:MMX_INSTR:X87_INSTR ++# ++# ++CPU,amd64_fam19h_zen3 ++PRESET,PAPI_TOT_INS,NOT_DERIVED,RETIRED_INSTRUCTIONS ++PRESET,PAPI_TOT_CYC,NOT_DERIVED,CYCLES_NOT_IN_HALT ++PRESET,PAPI_BR_INS,NOT_DERIVED,RETIRED_BRANCH_INSTRUCTIONS ++PRESET,PAPI_BR_TKN,NOT_DERIVED,RETIRED_TAKEN_BRANCH_INSTRUCTIONS ++PRESET,PAPI_BR_MSP,NOT_DERIVED,RETIRED_BRANCH_INSTRUCTIONS_MISPREDICTED ++PRESET,PAPI_TLB_DM,NOT_DERIVED, L1_DTLB_MISS:TLB_RELOAD_1G_L2_MISS:TLB_RELOAD_2M_L2_MISS:TLB_RELOAD_COALESCED_PAGE_MISS:TLB_RELOAD_4K_L2_MISS:TLB_RELOAD_1G_L2_HIT:TLB_RELOAD_2M_L2_HIT:TLB_RELOAD_COALESCED_PAGE_HIT:TLB_RELOAD_4K_L2_HIT ++PRESET,PAPI_TLB_IM,DERIVED_ADD,L1_ITLB_MISS_L2_ITLB_HIT,L1_ITLB_MISS_L2_ITLB_MISS:COALESCED4K:IF1G:IF2M:IF4K ++PRESET,PAPI_L1_DCA,NOT_DERIVED,LS_DISPATCH:LD_ST_DISPATCH:STORE_DISPATCH:LD_DISPATCH ++PRESET,PAPI_L1_DCM,NOT_DERIVED,REQUESTS_TO_L2_GROUP1:RD_BLK_L:RD_BLK_X:LS_RD_BLK_C_S:CHANGE_TO_X ++PRESET,PAPI_L2_DCM,NOT_DERIVED,CORE_TO_L2_CACHEABLE_REQUEST_ACCESS_STATUS:LS_RD_BLK_C ++PRESET,PAPI_L2_DCR,NOT_DERIVED,REQUESTS_TO_L2_GROUP1:RD_BLK_L:RD_BLK_X:LS_RD_BLK_C_S:CHANGE_TO_X ++PRESET,PAPI_L2_DCH,NOT_DERIVED,CORE_TO_L2_CACHEABLE_REQUEST_ACCESS_STATUS:LS_RD_BLK_C_S:LS_RD_BLK_L_HIT_X:LS_RD_BLK_L_HIT_S:LS_RD_BLK_X ++PRESET,PAPI_L2_ICR,NOT_DERIVED,REQUESTS_TO_L2_GROUP1:CACHEABLE_IC_READ ++PRESET,PAPI_L2_ICA,NOT_DERIVED,REQUESTS_TO_L2_GROUP1:CACHEABLE_IC_READ ++PRESET,PAPI_L2_ICM,NOT_DERIVED,CORE_TO_L2_CACHEABLE_REQUEST_ACCESS_STATUS:IC_FILL_MISS ++PRESET,PAPI_L2_ICH,NOT_DERIVED,CORE_TO_L2_CACHEABLE_REQUEST_ACCESS_STATUS:IC_FILL_HIT_X:IC_FILL_HIT_S ++# RETIRED_SSE_AVX_FLOPS requires MergeEvent support. ++PRESET,PAPI_VEC_INS,NOT_DERIVED,RETIRED_SSE_AVX_FLOPS:ANY ++PRESET,PAPI_FP_INS,NOT_DERIVED,RETIRED_SSE_AVX_FLOPS:ANY ++PRESET,PAPI_FP_OPS,NOT_DERIVED,RETIRED_SSE_AVX_FLOPS:ANY ++PRESET,PAPI_FML_INS,NOT_DERIVED,RETIRED_SSE_AVX_FLOPS:MULT_FLOPS ++PRESET,PAPI_FAD_INS,NOT_DERIVED,RETIRED_SSE_AVX_FLOPS:ADD_SUB_FLOPS ++PRESET,PAPI_FDV_INS,NOT_DERIVED,RETIRED_SSE_AVX_FLOPS:DIV_FLOPS ++PRESET,PAPI_FSQ_INS,NOT_DERIVED,RETIRED_SSE_AVX_FLOPS:DIV_FLOPS + + + CPU,Intel architectural PMU +commit 6964aa356fa606f320c7b871123aceb5c1f21999 +Author: Masahiko, Yamada +Date: Tue Aug 24 14:17:29 2021 +0900 + + Fix the PAPI_FUL_CCY setting for a64fx + + In a64fx, the maximum number of instruction commits is 4, so the following setting was incorrect. + PAPI_FUL_CCY=CPU_CYCLES-0INST_COMMIT-1INST_COMMIT-2INST_COMMIT-3INST_COMMIT-4INST_COMMIT + + The correct settings are:. + PAPI_FUL_CCY=CPU_CYCLES-0INST_COMMIT-1INST_COMMIT-2INST_COMMIT-3INST_COMMIT + +diff --git a/src/papi_events.csv b/src/papi_events.csv +index 4ef647959..74deb712f 100644 +--- a/src/papi_events.csv ++++ b/src/papi_events.csv +@@ -1934,7 +1934,7 @@ PRESET,PAPI_PRF_DM,DERIVED_SUB,L2D_CACHE_REFILL_PRF,L2D_CACHE_MIBMCH_PRF + PRESET,PAPI_MEM_SCY,NOT_DERIVED,LD_COMP_WAIT_L2_MISS + PRESET,PAPI_STL_ICY,DERIVED_ADD,STALL_FRONTEND,STALL_BACKEND + PRESET,PAPI_STL_CCY,NOT_DERIVED,0INST_COMMIT +-PRESET,PAPI_FUL_CCY,DERIVED_SUB,CPU_CYCLES,0INST_COMMIT,1INST_COMMIT,2INST_COMMIT,3INST_COMMIT,4INST_COMMIT ++PRESET,PAPI_FUL_CCY,DERIVED_SUB,CPU_CYCLES,0INST_COMMIT,1INST_COMMIT,2INST_COMMIT,3INST_COMMIT + PRESET,PAPI_HW_INT,DERIVED_ADD,EXC_IRQ,EXC_FIQ + PRESET,PAPI_BR_MSP,NOT_DERIVED,BR_MIS_PRED + PRESET,PAPI_BR_PRC,DERIVED_SUB,BR_PRED,BR_MIS_PRED +commit fbf3b9e3d17c4ec4bd7e33410c44fc5aed57e36f +Author: Masahiko, Yamada +Date: Fri Mar 4 15:41:30 2022 +0900 + + Add PAPI idle-related preset events for a64fx + + For a64fx, add four PAPI idle-related preset events + (PAPI_BRU_IDL/PAPI_FXU_IDL/PAPI_FPU_IDL/PAPI_LSU_IDL). + + PAPI_BRU_IDL = BR_COMP_WAIT + PAPI_FXU_IDL = EU_COMP_WAIT - FL_COMP_WAIT + PAPI_FPU_IDL = FL_COMP_WAIT + PAPI_LSU_IDL = LD_COMP_WAIT + + The specifications of BR_COMP_WAIT, EU_COMP_WAIT, FL_COMP_WAIT, + and LD_COMP_WAIT can be found in the "14.4. Cycle Accounting" + on A64FX_Microarchitecture_Manual_en_1.5.pdf at the following URL:. + https://github.com/fujitsu/A64FX/blob/master/doc + + Signed-off-by: Masahiko, Yamada + +diff --git a/src/papi_events.csv b/src/papi_events.csv +index 74deb712f..1cd498e91 100644 +--- a/src/papi_events.csv ++++ b/src/papi_events.csv +@@ -1935,6 +1935,10 @@ PRESET,PAPI_MEM_SCY,NOT_DERIVED,LD_COMP_WAIT_L2_MISS + PRESET,PAPI_STL_ICY,DERIVED_ADD,STALL_FRONTEND,STALL_BACKEND + PRESET,PAPI_STL_CCY,NOT_DERIVED,0INST_COMMIT + PRESET,PAPI_FUL_CCY,DERIVED_SUB,CPU_CYCLES,0INST_COMMIT,1INST_COMMIT,2INST_COMMIT,3INST_COMMIT ++PRESET,PAPI_BRU_IDL,NOT_DERIVED,BR_COMP_WAIT ++PRESET,PAPI_FXU_IDL,DERIVED_SUB,EU_COMP_WAIT,FL_COMP_WAIT ++PRESET,PAPI_FPU_IDL,NOT_DERIVED,FL_COMP_WAIT ++PRESET,PAPI_LSU_IDL,NOT_DERIVED,LD_COMP_WAIT + PRESET,PAPI_HW_INT,DERIVED_ADD,EXC_IRQ,EXC_FIQ + PRESET,PAPI_BR_MSP,NOT_DERIVED,BR_MIS_PRED + PRESET,PAPI_BR_PRC,DERIVED_SUB,BR_PRED,BR_MIS_PRED +commit 3c5364839f583185c1e8dca58d5fe36c9ec82876 +Author: Daniel Barry +Date: Tue Aug 30 23:17:30 2022 +0000 + + papi_avail: add presets for Intel Ice Lake SP + + Define preset events for the Intel Ice Lake SP processor. + These presets have been verified using the Counter Analysis Toolkit benchmarks. + + These changes have been tested on the Intel Ice Lake architecture. + +diff --git a/src/papi_events.csv b/src/papi_events.csv +index a013f58af..8f23e030c 100644 +--- a/src/papi_events.csv ++++ b/src/papi_events.csv +@@ -929,6 +929,63 @@ PRESET,PAPI_CA_ITV,NOT_DERIVED,OFFCORE_RESPONSE_0:SNP_HIT_WITH_FWD + + # End of hsw,bdw,skl,clx list + # ++ ++# Intel Ice Lake SP events ++CPU,icx ++PRESET,PAPI_TOT_CYC,NOT_DERIVED,CPU_CLK_UNHALTED:THREAD_P ++PRESET,PAPI_TOT_INS,NOT_DERIVED,INST_RETIRED:ANY_P ++PRESET,PAPI_REF_CYC,NOT_DERIVED,UNHALTED_REFERENCE_CYCLES ++# Loads and stores ++PRESET,PAPI_LD_INS,NOT_DERIVED,MEM_INST_RETIRED:ALL_LOADS ++PRESET,PAPI_SR_INS,NOT_DERIVED,MEM_INST_RETIRED:ALL_STORES ++PRESET,PAPI_LST_INS,DERIVED_ADD,MEM_INST_RETIRED:ALL_LOADS,MEM_INST_RETIRED:ALL_STORES ++# L1 cache ++PRESET,PAPI_L1_ICM,NOT_DERIVED,L2_RQSTS:ALL_CODE_RD ++PRESET,PAPI_L1_DCM,NOT_DERIVED,L1D:REPLACEMENT ++PRESET,PAPI_L1_TCM,DERIVED_ADD,L1D:REPLACEMENT,L2_RQSTS:ALL_CODE_RD ++# L2 cache ++PRESET,PAPI_L2_DCA,NOT_DERIVED,L2_RQSTS:ALL_DEMAND_REFERENCES ++PRESET,PAPI_L2_DCR,NOT_DERIVED,L2_RQSTS:ALL_DEMAND_DATA_RD ++PRESET,PAPI_L2_ICH,NOT_DERIVED,L2_RQSTS:CODE_RD_HIT ++PRESET,PAPI_L2_ICM,NOT_DERIVED,L2_RQSTS:CODE_RD_MISS ++PRESET,PAPI_L2_ICR,NOT_DERIVED,L2_RQSTS:ALL_CODE_RD ++#PRESET,PAPI_L2_TCH,NOT_DERIVED,MEM_LOAD_UOPS_RETIRED:L2_HIT ++#PRESET,PAPI_L2_TCM,NOT_DERIVED,MEM_LOAD_UOPS_RETIRED:L2_MISS ++PRESET,PAPI_L2_DCM,DERIVED_SUB,LLC_REFERENCES,L2_RQSTS:CODE_RD_MISS ++PRESET,PAPI_L2_ICA,NOT_DERIVED,L2_RQSTS:ALL_CODE_RD ++#PRESET,PAPI_L2_LDH,NOT_DERIVED,L2_RQSTS:DEMAND_DATA_RD_HIT ++PRESET,PAPI_L2_LDM,NOT_DERIVED,L2_RQSTS:DEMAND_DATA_RD_MISS ++PRESET,PAPI_L2_TCA,DERIVED_ADD,L2_RQSTS:ALL_DEMAND_REFERENCES,L2_RQSTS:ALL_CODE_RD ++PRESET,PAPI_L2_TCM,NOT_DERIVED,LLC_REFERENCES ++PRESET,PAPI_L2_TCR,DERIVED_ADD,L2_RQSTS:ALL_DEMAND_DATA_RD,L2_RQSTS:ALL_CODE_RD ++# L3 cache ++PRESET,PAPI_L3_DCA,DERIVED_SUB,LLC_REFERENCES,L2_RQSTS:CODE_RD_MISS ++PRESET,PAPI_L3_DCR,NOT_DERIVED,OFFCORE_REQUESTS:DEMAND_DATA_RD ++PRESET,PAPI_L3_ICA,NOT_DERIVED,L2_RQSTS:CODE_RD_MISS ++PRESET,PAPI_L3_ICR,NOT_DERIVED,L2_RQSTS:CODE_RD_MISS ++#PRESET,PAPI_L3_LDH,NOT_DERIVED,MEM_LOAD_UOPS_RETIRED:L3_HIT ++PRESET,PAPI_L3_LDM,NOT_DERIVED,MEM_LOAD_RETIRED:L3_MISS ++PRESET,PAPI_L3_TCA,NOT_DERIVED,LLC_REFERENCES ++PRESET,PAPI_L3_TCM,NOT_DERIVED,LLC_MISSES ++# SMP ++PRESET,PAPI_CA_SHR,NOT_DERIVED,OFFCORE_REQUESTS:ALL_DATA_RD ++# Branches ++PRESET,PAPI_BR_UCN,DERIVED_SUB,BR_INST_RETIRED:ALL_BRANCHES,BR_INST_RETIRED:COND ++PRESET,PAPI_BR_CN,NOT_DERIVED,BR_INST_RETIRED:COND ++PRESET,PAPI_BR_TKN,NOT_DERIVED,BR_INST_RETIRED:COND_TAKEN ++PRESET,PAPI_BR_NTK,NOT_DERIVED,BR_INST_RETIRED:COND_NTAKEN ++PRESET,PAPI_BR_MSP,NOT_DERIVED,BR_MISP_RETIRED:COND ++PRESET,PAPI_BR_PRC,DERIVED_SUB,BR_INST_RETIRED:COND,BR_MISP_RETIRED:COND ++PRESET,PAPI_BR_INS,NOT_DERIVED,BR_INST_RETIRED:ALL_BRANCHES ++#FLOPs ++# PAPI_DP_OPS = FP_ARITH:SCALAR_DOUBLE + 2*FP_ARITH:128B_PACKED_DOUBLE + 4*256B_PACKED_DOUBLE + 8*512B_PACKED_DOUBLE ++PRESET,PAPI_DP_OPS,DERIVED_POSTFIX,N0|N1|2|*|+|N2|4|*|+|N3|8|*|+|,FP_ARITH:SCALAR_DOUBLE,FP_ARITH:128B_PACKED_DOUBLE,FP_ARITH:256B_PACKED_DOUBLE,FP_ARITH:512B_PACKED_DOUBLE ++# PAPI_SP_OPS = FP_ARITH:SCALAR_SINGLE + 4*FP_ARITH:128B_PACKED_SINGLE + 8*256B_PACKED_SINGLE + 16*512B_PACKED_SINGLE ++PRESET,PAPI_SP_OPS,DERIVED_POSTFIX,N0|N1|4|*|+|N2|8|*|+|N3|16|*|+|,FP_ARITH:SCALAR_SINGLE,FP_ARITH:128B_PACKED_SINGLE,FP_ARITH:256B_PACKED_SINGLE,FP_ARITH:512B_PACKED_SINGLE ++PRESET,PAPI_VEC_DP,DERIVED_POSTFIX,N0|N1|N2|N3|+|+|+|,FP_ARITH:SCALAR_DOUBLE,FP_ARITH:128B_PACKED_DOUBLE,FP_ARITH:256B_PACKED_DOUBLE,FP_ARITH:512B_PACKED_DOUBLE ++PRESET,PAPI_VEC_SP,DERIVED_POSTFIX,N0|N1|N2|N3|+|+|+|,FP_ARITH:SCALAR_SINGLE,FP_ARITH:128B_PACKED_SINGLE,FP_ARITH:256B_PACKED_SINGLE,FP_ARITH:512B_PACKED_SINGLE ++# End of icx list ++ + # + # Intel MIC / Xeon-Phi / Knights Landing + # Intel Knights Mill +commit d4da29b07befb9f7c11e351dbfef835b74cdd67a +Author: John Linford +Date: Mon Mar 20 17:11:37 2023 -0500 + + Add minimal events for Arm Neoverse N1 + +diff --git a/src/papi_events.csv b/src/papi_events.csv +index 8f23e030c..a4d5a9756 100644 +--- a/src/papi_events.csv ++++ b/src/papi_events.csv +@@ -2059,6 +2059,41 @@ PRESET,PAPI_FP_OPS,DERIVED_POSTFIX,N0|512|128|/|*|N1|+|,FP_SCALE_OPS_SPEC,FP_FIX + PRESET,PAPI_SP_OPS,DERIVED_POSTFIX,N0|512|128|/|*|N1|+|,FP_SP_SCALE_OPS_SPEC,FP_SP_FIXED_OPS_SPEC + PRESET,PAPI_DP_OPS,DERIVED_POSTFIX,N0|512|128|/|*|N1|+|,FP_DP_SCALE_OPS_SPEC,FP_DP_FIXED_OPS_SPEC + ++######################### ++# ARM Neoverse N1 # ++######################### ++CPU,arm_n1 ++# ++PRESET,PAPI_TOT_INS,NOT_DERIVED,INST_RETIRED ++PRESET,PAPI_TOT_CYC,NOT_DERIVED,CPU_CYCLES ++PRESET,PAPI_FP_INS,NOT_DERIVED,VFP_SPEC ++PRESET,PAPI_VEC_INS,NOT_DERIVED,ASE_SPEC ++PRESET,PAPI_BR_INS,NOT_DERIVED,BR_RETIRED ++PRESET,PAPI_BR_PRC,DERIVED_SUB,BR_PRED,BR_MIS_PRED ++PRESET,PAPI_BR_MSP,NOT_DERIVED,BR_MIS_PRED ++PRESET,PAPI_BR_INS,NOT_DERIVED,BR_PRED ++PRESET,PAPI_LD_INS,NOT_DERIVED,LD_SPEC ++PRESET,PAPI_SR_INS,NOT_DERIVED,ST_SPEC ++PRESET,PAPI_LST_INS,DERIVED_ADD,LD_SPEC,ST_SPEC ++PRESET,PAPI_L1_DCA,NOT_DERIVED,L1D_CACHE ++PRESET,PAPI_L1_DCM,NOT_DERIVED,L1D_CACHE_REFILL ++PRESET,PAPI_L1_DCR,NOT_DERIVED,L1D_CACHE_RD ++PRESET,PAPI_L1_DCW,NOT_DERIVED,L1D_CACHE_WR ++PRESET,PAPI_L1_ICA,NOT_DERIVED,L1I_CACHE_ACCESS ++PRESET,PAPI_L1_ICH,DERIVED_SUB,L1I_CACHE_ACCESS,L1I_CACHE_REFILL ++PRESET,PAPI_L1_ICM,NOT_DERIVED,L1I_CACHE_REFILL ++PRESET,PAPI_L2_TCA,NOT_DERIVED,L2D_CACHE_ACCESS ++PRESET,PAPI_L2_DCA,DERIVED_ADD,L2D_CACHE_RD,L2D_CACHE_WR ++PRESET,PAPI_L2_DCM,NOT_DERIVED,L2D_CACHE_REFILL ++PRESET,PAPI_L2_DCR,NOT_DERIVED,L2D_CACHE_RD ++PRESET,PAPI_L2_DCW,NOT_DERIVED,L2D_CACHE_WR ++PRESET,PAPI_L2_LDM,NOT_DERIVED,L2D_CACHE_REFILL_RD ++PRESET,PAPI_STL_ICY,DERIVED_ADD,STALL_FRONTEND,STALL_BACKEND ++PRESET,PAPI_RES_STL,NOT_DERIVED,STALL_BACKEND ++PRESET,PAPI_HW_INT,DERIVED_ADD,EXC_IRQ,EXC_FIQ ++PRESET,PAPI_SYC_INS,DERIVED_ADD,ISB_SPEC,DSB_SPEC,DMB_SPEC ++PRESET,PAPI_TLB_DM,NOT_DERIVED,L2D_TLB_REFILL ++ + # + CPU,mips_74k + # +commit 88e686f877abcf19c5f50d4e23cbf8ea920a40b6 +Author: John Linford +Date: Mon Mar 20 14:54:41 2023 -0500 + + Add minimal events for Arm Neoverse V1 + +diff --git a/src/papi_events.csv b/src/papi_events.csv +index a4d5a9756..207d6d1db 100644 +--- a/src/papi_events.csv ++++ b/src/papi_events.csv +@@ -2094,6 +2094,41 @@ PRESET,PAPI_HW_INT,DERIVED_ADD,EXC_IRQ,EXC_FIQ + PRESET,PAPI_SYC_INS,DERIVED_ADD,ISB_SPEC,DSB_SPEC,DMB_SPEC + PRESET,PAPI_TLB_DM,NOT_DERIVED,L2D_TLB_REFILL + ++######################### ++# ARM Neoverse V1 # ++######################### ++CPU,arm_v1 ++# ++PRESET,PAPI_TOT_INS,NOT_DERIVED,INST_RETIRED ++PRESET,PAPI_TOT_CYC,NOT_DERIVED,CPU_CYCLES ++PRESET,PAPI_FP_INS,NOT_DERIVED,VFP_SPEC ++PRESET,PAPI_VEC_INS,DERIVED_ADD,SVE_INST_SPEC,ASE_INST_SPEC ++PRESET,PAPI_BR_INS,NOT_DERIVED,BR_RETIRED ++PRESET,PAPI_BR_PRC,DERIVED_SUB,BR_PRED,BR_MIS_PRED ++PRESET,PAPI_BR_MSP,NOT_DERIVED,BR_MIS_PRED ++PRESET,PAPI_BR_INS,NOT_DERIVED,BR_PRED ++PRESET,PAPI_LD_INS,NOT_DERIVED,LD_SPEC ++PRESET,PAPI_SR_INS,NOT_DERIVED,ST_SPEC ++PRESET,PAPI_LST_INS,DERIVED_ADD,LD_SPEC,ST_SPEC ++PRESET,PAPI_L1_DCA,NOT_DERIVED,L1D_CACHE ++PRESET,PAPI_L1_DCM,NOT_DERIVED,L1D_CACHE_REFILL ++PRESET,PAPI_L1_DCR,NOT_DERIVED,L1D_CACHE_RD ++PRESET,PAPI_L1_DCW,NOT_DERIVED,L1D_CACHE_WR ++PRESET,PAPI_L1_ICA,NOT_DERIVED,L1I_CACHE_ACCESS ++PRESET,PAPI_L1_ICH,DERIVED_SUB,L1I_CACHE_ACCESS,L1I_CACHE_REFILL ++PRESET,PAPI_L1_ICM,NOT_DERIVED,L1I_CACHE_REFILL ++PRESET,PAPI_L2_TCA,NOT_DERIVED,L2D_CACHE_ACCESS ++PRESET,PAPI_L2_DCA,DERIVED_ADD,L2D_CACHE_RD,L2D_CACHE_WR ++PRESET,PAPI_L2_DCM,NOT_DERIVED,L2D_CACHE_REFILL ++PRESET,PAPI_L2_DCR,NOT_DERIVED,L2D_CACHE_RD ++PRESET,PAPI_L2_DCW,NOT_DERIVED,L2D_CACHE_WR ++PRESET,PAPI_L2_LDM,NOT_DERIVED,L2D_CACHE_REFILL_RD ++PRESET,PAPI_STL_ICY,DERIVED_ADD,STALL_FRONTEND,STALL_BACKEND ++PRESET,PAPI_RES_STL,NOT_DERIVED,STALL_BACKEND ++PRESET,PAPI_HW_INT,DERIVED_ADD,EXC_IRQ,EXC_FIQ ++PRESET,PAPI_SYC_INS,DERIVED_ADD,ISB_SPEC,DSB_SPEC,DMB_SPEC ++PRESET,PAPI_TLB_DM,NOT_DERIVED,L2D_TLB_REFILL ++ + # + CPU,mips_74k + # +commit e911f951115bb551925c5b07e7f5b721d5fe3bbe +Author: John Linford +Date: Mon Mar 20 17:14:18 2023 -0500 + + Add minimal events for Arm Neoverse N2 + +diff --git a/src/papi_events.csv b/src/papi_events.csv +index 207d6d1db..d27d956c1 100644 +--- a/src/papi_events.csv ++++ b/src/papi_events.csv +@@ -2094,6 +2094,41 @@ PRESET,PAPI_HW_INT,DERIVED_ADD,EXC_IRQ,EXC_FIQ + PRESET,PAPI_SYC_INS,DERIVED_ADD,ISB_SPEC,DSB_SPEC,DMB_SPEC + PRESET,PAPI_TLB_DM,NOT_DERIVED,L2D_TLB_REFILL + ++######################### ++# ARM Neoverse N2 # ++######################### ++CPU,arm_n2 ++# ++PRESET,PAPI_TOT_INS,NOT_DERIVED,INST_RETIRED ++PRESET,PAPI_TOT_CYC,NOT_DERIVED,CPU_CYCLES ++PRESET,PAPI_FP_INS,NOT_DERIVED,VFP_SPEC ++PRESET,PAPI_VEC_INS,DERIVED_ADD,SVE_INST_SPEC,ASE_INST_SPEC ++PRESET,PAPI_BR_INS,NOT_DERIVED,BR_RETIRED ++PRESET,PAPI_BR_PRC,DERIVED_SUB,BR_PRED,BR_MIS_PRED ++PRESET,PAPI_BR_MSP,NOT_DERIVED,BR_MIS_PRED ++PRESET,PAPI_BR_INS,NOT_DERIVED,BR_PRED ++PRESET,PAPI_LD_INS,NOT_DERIVED,LD_SPEC ++PRESET,PAPI_SR_INS,NOT_DERIVED,ST_SPEC ++PRESET,PAPI_LST_INS,DERIVED_ADD,LD_SPEC,ST_SPEC ++PRESET,PAPI_L1_DCA,NOT_DERIVED,L1D_CACHE ++PRESET,PAPI_L1_DCM,NOT_DERIVED,L1D_CACHE_REFILL ++PRESET,PAPI_L1_DCR,NOT_DERIVED,L1D_CACHE_RD ++PRESET,PAPI_L1_DCW,NOT_DERIVED,L1D_CACHE_WR ++PRESET,PAPI_L1_ICA,NOT_DERIVED,L1I_CACHE_ACCESS ++PRESET,PAPI_L1_ICH,DERIVED_SUB,L1I_CACHE_ACCESS,L1I_CACHE_REFILL ++PRESET,PAPI_L1_ICM,NOT_DERIVED,L1I_CACHE_REFILL ++PRESET,PAPI_L2_TCA,NOT_DERIVED,L2D_CACHE_ACCESS ++PRESET,PAPI_L2_DCA,DERIVED_ADD,L2D_CACHE_RD,L2D_CACHE_WR ++PRESET,PAPI_L2_DCM,NOT_DERIVED,L2D_CACHE_REFILL ++PRESET,PAPI_L2_DCR,NOT_DERIVED,L2D_CACHE_RD ++PRESET,PAPI_L2_DCW,NOT_DERIVED,L2D_CACHE_WR ++PRESET,PAPI_L2_LDM,NOT_DERIVED,L2D_CACHE_REFILL_RD ++PRESET,PAPI_STL_ICY,DERIVED_ADD,STALL_FRONTEND,STALL_BACKEND ++PRESET,PAPI_RES_STL,NOT_DERIVED,STALL_BACKEND ++PRESET,PAPI_HW_INT,DERIVED_ADD,EXC_IRQ,EXC_FIQ ++PRESET,PAPI_SYC_INS,DERIVED_ADD,ISB_SPEC,DSB_SPEC,DMB_SPEC ++PRESET,PAPI_TLB_DM,NOT_DERIVED,L2D_TLB_REFILL ++ + ######################### + # ARM Neoverse V1 # + ######################### +commit 05dc580247cb18fca882a33d8e356d79032d2ed1 +Author: John Linford +Date: Mon Mar 20 17:08:35 2023 -0500 + + Add minimal events for Arm Neoverse V2 + +diff --git a/src/papi_events.csv b/src/papi_events.csv +index d27d956c1..549e337c7 100644 +--- a/src/papi_events.csv ++++ b/src/papi_events.csv +@@ -2164,6 +2164,41 @@ PRESET,PAPI_HW_INT,DERIVED_ADD,EXC_IRQ,EXC_FIQ + PRESET,PAPI_SYC_INS,DERIVED_ADD,ISB_SPEC,DSB_SPEC,DMB_SPEC + PRESET,PAPI_TLB_DM,NOT_DERIVED,L2D_TLB_REFILL + ++######################### ++# ARM Neoverse V2 # ++######################### ++CPU,arm_v2 ++# ++PRESET,PAPI_TOT_INS,NOT_DERIVED,INST_RETIRED ++PRESET,PAPI_TOT_CYC,NOT_DERIVED,CPU_CYCLES ++PRESET,PAPI_FP_INS,NOT_DERIVED,VFP_SPEC ++PRESET,PAPI_VEC_INS,DERIVED_ADD,SVE_INST_SPEC,ASE_INST_SPEC ++PRESET,PAPI_BR_INS,NOT_DERIVED,BR_RETIRED ++PRESET,PAPI_BR_PRC,DERIVED_SUB,BR_PRED,BR_MIS_PRED ++PRESET,PAPI_BR_MSP,NOT_DERIVED,BR_MIS_PRED ++PRESET,PAPI_BR_INS,NOT_DERIVED,BR_PRED ++PRESET,PAPI_LD_INS,NOT_DERIVED,LD_SPEC ++PRESET,PAPI_SR_INS,NOT_DERIVED,ST_SPEC ++PRESET,PAPI_LST_INS,DERIVED_ADD,LD_SPEC,ST_SPEC ++PRESET,PAPI_L1_DCA,NOT_DERIVED,L1D_CACHE ++PRESET,PAPI_L1_DCM,NOT_DERIVED,L1D_CACHE_REFILL ++PRESET,PAPI_L1_DCR,NOT_DERIVED,L1D_CACHE_RD ++PRESET,PAPI_L1_DCW,NOT_DERIVED,L1D_CACHE_WR ++PRESET,PAPI_L1_ICA,NOT_DERIVED,L1I_CACHE_ACCESS ++PRESET,PAPI_L1_ICH,DERIVED_SUB,L1I_CACHE_ACCESS,L1I_CACHE_REFILL ++PRESET,PAPI_L1_ICM,NOT_DERIVED,L1I_CACHE_REFILL ++PRESET,PAPI_L2_TCA,NOT_DERIVED,L2D_CACHE_ACCESS ++PRESET,PAPI_L2_DCA,DERIVED_ADD,L2D_CACHE_RD,L2D_CACHE_WR ++PRESET,PAPI_L2_DCM,NOT_DERIVED,L2D_CACHE_REFILL ++PRESET,PAPI_L2_DCR,NOT_DERIVED,L2D_CACHE_RD ++PRESET,PAPI_L2_DCW,NOT_DERIVED,L2D_CACHE_WR ++PRESET,PAPI_L2_LDM,NOT_DERIVED,L2D_CACHE_REFILL_RD ++PRESET,PAPI_STL_ICY,DERIVED_ADD,STALL_FRONTEND,STALL_BACKEND ++PRESET,PAPI_RES_STL,NOT_DERIVED,STALL_BACKEND ++PRESET,PAPI_HW_INT,DERIVED_ADD,EXC_IRQ,EXC_FIQ ++PRESET,PAPI_SYC_INS,DERIVED_ADD,ISB_SPEC,DSB_SPEC,DMB_SPEC ++PRESET,PAPI_TLB_DM,NOT_DERIVED,L2D_TLB_REFILL ++ + # + CPU,mips_74k + # diff --git a/SOURCES/papi-a64fx.patch b/SOURCES/papi-a64fx.patch deleted file mode 100644 index f4d1132..0000000 --- a/SOURCES/papi-a64fx.patch +++ /dev/null @@ -1,34 +0,0 @@ -commit 9a44d82928ed17ba2ff21eb88b89c5829d0ea30e -Author: Steve Kaufmann -Date: Wed Jun 24 14:08:08 2020 -0400 - - Added PAPI preset support for Fujitsu A64FX. - - Signed-off-by: Heike Jagode - -diff --git a/src/papi_events.csv b/src/papi_events.csv -index 8e96adfbd..1b5c15542 100644 ---- a/src/papi_events.csv -+++ b/src/papi_events.csv -@@ -1877,6 +1877,21 @@ PRESET,PAPI_L2_DCR,NOT_DERIVED,L2D_CACHE_RD - PRESET,PAPI_L2_DCW,NOT_DERIVED,L2D_CACHE_WR - PRESET,PAPI_L2_LDM,NOT_DERIVED,L2D_CACHE_REFILL_RD - -+######################### -+# ARM Fujitsu A64FX # -+######################### -+CPU,arm_a64fx -+# -+PRESET,PAPI_TOT_INS,NOT_DERIVED,INST_RETIRED -+PRESET,PAPI_TOT_CYC,NOT_DERIVED,CPU_CYCLES -+PRESET,PAPI_FP_INS,NOT_DERIVED,VFP_SPEC -+PRESET,PAPI_VEC_INS,NOT_DERIVED,ASE_SPEC -+PRESET,PAPI_L1_DCM,NOT_DERIVED,L1D_CACHE_REFILL -+PRESET,PAPI_L1_ICA,NOT_DERIVED,L1I_CACHE -+PRESET,PAPI_L1_ICM,NOT_DERIVED,L1I_CACHE_REFILL -+PRESET,PAPI_L2_DCH,NOT_DERIVED,L2D_CACHE -+PRESET,PAPI_L2_DCM,NOT_DERIVED,L2D_CACHE_REFILL -+ - # - CPU,mips_74k - # diff --git a/SOURCES/papi-arm64fastread.patch b/SOURCES/papi-arm64fastread.patch new file mode 100644 index 0000000..986743b --- /dev/null +++ b/SOURCES/papi-arm64fastread.patch @@ -0,0 +1,637 @@ +commit 9a1f2d897f4086bc1d60102de984c849445b5e97 +Author: Masahiko, Yamada +Date: Tue Feb 21 19:18:40 2023 +0900 + + PAPI_read performance improvement for the arm64 processor + + We developed PAPI_read performance improvements for the arm64 processor + with a plan to port direct user space PMU register access processing from + libperf to the papi library without using libperf. + + The workaround has been implemented that stores the counter value at the + time of reset and subtracts the counter value at the time of reset from + the read counter value at the next read. + When reset processing is called, the value of pc->offset is cleared to 0, + and only the counter value read from the PMU counter is referenced. + There was no problem with the counters FAILED with negative values during + the multiplex+reset test, except for sdsc2-mpx and sdsc4-mpx. + To apply the workaround only during reset, the _pe_reset function call sets + the reset_flag and the next _pe_start function call clears the reset_flag. + The workaround works if the mmap_read_self function is called between calls + to the _pe_reset function and the next call to the _pe_start function. + + Switching PMU register direct access from user space from OFF to ON is done by + changing the setting of the kernel variable "/proc/sys/kernel/perf_user_access". + + Setting PMU Register Direct Access from User Space Off + $ echo 0 > /proc/sys/kernel/perf_user_access + $ cat /proc/sys/kernel/perf_user_access + 0 + + Setting PMU Register Direct Access from User Space ON + $ echo 1 > /proc/sys/kernel/perf_user_access + $ cat /proc/sys/kernel/perf_user_access + 1 + + Performance of PAPI_read has been improved as expected from the execution + result of the papi_cost command. + + Improvement effect of switching PMU register direct access from user space + from OFF to ON + + Total cost for PAPI_read (2 counters) over 1000000 iterations + min cycles: 689 -> 28 + max cycles: 3876 -> 1323 + mean cycles: 724.471979 -> 28.888076 + + Total cost for PAPI_read_ts (2 counters) over 1000000 iterations + min cycles: 693 -> 29 + max cycles: 4066 -> 3718 + mean cycles: 726.753003 -> 29.977226 + + Total cost for PAPI_read (1 derived_[add|sub] counter) over 1000000 iterations + min cycles: 698 -> 28 + max cycles: 7406 -> 2346 + mean cycles: 728.527079 -> 28.880691 + + Signed-off-by: Masahiko, Yamada + +diff --git a/src/components/perf_event/perf_event.c b/src/components/perf_event/perf_event.c +index b4877d18e..331288c55 100644 +--- a/src/components/perf_event/perf_event.c ++++ b/src/components/perf_event/perf_event.c +@@ -682,6 +682,12 @@ set_up_mmap( pe_control_t *ctl, int evt_idx) + + + ++/* Request user access for arm64 */ ++static inline void arm64_request_user_access(struct perf_event_attr *hw_event) ++{ ++ hw_event->config1=0x2; /* Request user access */ ++} ++ + /* Open all events in the control state */ + static int + open_pe_events( pe_context_t *ctx, pe_control_t *ctl ) +@@ -735,6 +741,11 @@ open_pe_events( pe_context_t *ctx, pe_control_t *ctl ) + if (( i == 0 ) || (ctl->multiplexed)) { + ctl->events[i].attr.pinned = !ctl->multiplexed; + ctl->events[i].attr.disabled = 1; ++#if defined(__aarch64__) ++ if (_perf_event_vector.cmp_info.fast_counter_read) { ++ arm64_request_user_access(&ctl->events[i].attr); ++ } ++#endif + ctl->events[i].group_leader_fd=-1; + ctl->events[i].attr.read_format = get_read_format( + ctl->multiplexed, +@@ -743,6 +754,11 @@ open_pe_events( pe_context_t *ctx, pe_control_t *ctl ) + } else { + ctl->events[i].attr.pinned=0; + ctl->events[i].attr.disabled = 0; ++#if defined(__aarch64__) ++ if (_perf_event_vector.cmp_info.fast_counter_read) { ++ arm64_request_user_access(&ctl->events[i].attr); ++ } ++#endif + ctl->events[i].group_leader_fd=ctl->events[0].event_fd; + ctl->events[i].attr.read_format = get_read_format( + ctl->multiplexed, +@@ -1047,8 +1063,16 @@ _pe_reset( hwd_context_t *ctx, hwd_control_state_t *ctl ) + + /* We need to reset all of the events, not just the group leaders */ + for( i = 0; i < pe_ctl->num_events; i++ ) { +- ret = ioctl( pe_ctl->events[i].event_fd, +- PERF_EVENT_IOC_RESET, NULL ); ++ if (_perf_event_vector.cmp_info.fast_counter_read) { ++ ret = ioctl( pe_ctl->events[i].event_fd, ++ PERF_EVENT_IOC_RESET, NULL ); ++ pe_ctl->reset_counts[i] = mmap_read_reset_count( ++ pe_ctl->events[i].mmap_buf); ++ pe_ctl->reset_flag = 1; ++ } else { ++ ret = ioctl( pe_ctl->events[i].event_fd, ++ PERF_EVENT_IOC_RESET, NULL ); ++ } + if ( ret == -1 ) { + PAPIERROR("ioctl(%d, PERF_EVENT_IOC_RESET, NULL) " + "returned error, Linux says: %s", +@@ -1119,6 +1143,8 @@ _pe_rdpmc_read( hwd_context_t *ctx, hwd_control_state_t *ctl, + for ( i = 0; i < pe_ctl->num_events; i++ ) { + + count = mmap_read_self(pe_ctl->events[i].mmap_buf, ++ pe_ctl->reset_flag, ++ pe_ctl->reset_counts[i], + &enabled,&running); + + if (count==0xffffffffffffffffULL) { +@@ -1438,6 +1464,10 @@ _pe_start( hwd_context_t *ctx, hwd_control_state_t *ctl ) + pe_ctl->events[i].event_fd); + ret=ioctl( pe_ctl->events[i].event_fd, + PERF_EVENT_IOC_ENABLE, NULL) ; ++ if (_perf_event_vector.cmp_info.fast_counter_read) { ++ pe_ctl->reset_counts[i] = 0LL; ++ pe_ctl->reset_flag = 0; ++ } + + /* ioctls always return -1 on failure */ + if (ret == -1) { +@@ -2297,6 +2327,29 @@ _pe_shutdown_component( void ) { + } + + ++#if defined(__aarch64__) ++/* Check access PMU counter from User space for arm64 support */ ++static int _pe_detect_arm64_access(void) { ++ ++ FILE *fff; ++ int perf_user_access; ++ int retval; ++ ++ fff=fopen("/proc/sys/kernel/perf_user_access","r"); ++ if (fff==NULL) { ++ return 0; ++ } ++ ++ /* 1 means you can access PMU counter from User space */ ++ /* 0 means you can not access PMU counter from User space */ ++ retval=fscanf(fff,"%d",&perf_user_access); ++ if (retval!=1) fprintf(stderr,"Error reading /proc/sys/kernel/perf_user_access\n"); ++ fclose(fff); ++ ++ return perf_user_access; ++} ++#endif ++ + /* Check the mmap page for rdpmc support */ + static int _pe_detect_rdpmc(void) { + +@@ -2305,10 +2358,13 @@ static int _pe_detect_rdpmc(void) { + void *addr; + struct perf_event_mmap_page *our_mmap; + int page_size=getpagesize(); ++#if defined(__aarch64__) ++ int retval; ++#endif + +-#if defined(__i386__) || defined (__x86_64__) ++#if defined(__i386__) || defined (__x86_64__) || defined(__aarch64__) + #else +- /* We only support rdpmc on x86 for now */ ++ /* We support rdpmc on x86 and arm64 for now */ + return 0; + #endif + +@@ -2318,12 +2374,23 @@ static int _pe_detect_rdpmc(void) { + return 0; + } + ++#if defined(__aarch64__) ++ /* Detect if we can use PMU counter from User space for arm64 */ ++ retval = _pe_detect_arm64_access(); ++ if (retval == 0) { ++ return 0; ++ } ++#endif ++ + /* Create a fake instructions event so we can read a mmap page */ + memset(&pe,0,sizeof(struct perf_event_attr)); + + pe.type=PERF_TYPE_HARDWARE; + pe.size=sizeof(struct perf_event_attr); + pe.config=PERF_COUNT_HW_INSTRUCTIONS; ++#if defined(__aarch64__) ++ arm64_request_user_access(&pe); ++#endif + pe.exclude_kernel=1; + pe.disabled=1; + +diff --git a/src/components/perf_event/perf_event_lib.h b/src/components/perf_event/perf_event_lib.h +index 0c50ab9f0..cfba8ac49 100644 +--- a/src/components/perf_event/perf_event_lib.h ++++ b/src/components/perf_event/perf_event_lib.h +@@ -36,6 +36,8 @@ typedef struct { + pid_t tid; /* thread we are monitoring */ + pe_event_info_t events[PERF_EVENT_MAX_MPX_COUNTERS]; + long long counts[PERF_EVENT_MAX_MPX_COUNTERS]; ++ unsigned int reset_flag; ++ long long reset_counts[PERF_EVENT_MAX_MPX_COUNTERS]; + } pe_control_t; + + +diff --git a/src/components/perf_event/perf_helpers.h b/src/components/perf_event/perf_helpers.h +index 92dca4fd0..097286865 100644 +--- a/src/components/perf_event/perf_helpers.h ++++ b/src/components/perf_event/perf_helpers.h +@@ -29,6 +29,74 @@ sys_perf_event_open( struct perf_event_attr *hw_event, + return ret; + } + ++ ++/* ++ * We define u64 as uint64_t for every architecture ++ * so that we can print it with "%"PRIx64 without getting warnings. ++ * ++ * typedef __u64 u64; ++ * typedef __s64 s64; ++ */ ++typedef uint64_t u64; ++typedef int64_t s64; ++ ++typedef __u32 u32; ++typedef __s32 s32; ++ ++typedef __u16 u16; ++typedef __s16 s16; ++ ++typedef __u8 u8; ++typedef __s8 s8; ++ ++ ++#ifdef __SIZEOF_INT128__ ++static inline u64 mul_u64_u32_shr(u64 a, u32 b, unsigned int shift) ++{ ++ return (u64)(((unsigned __int128)a * b) >> shift); ++} ++ ++#else ++ ++#ifdef __i386__ ++static inline u64 mul_u32_u32(u32 a, u32 b) ++{ ++ u32 high, low; ++ ++ asm ("mull %[b]" : "=a" (low), "=d" (high) ++ : [a] "a" (a), [b] "rm" (b) ); ++ ++ return low | ((u64)high) << 32; ++} ++#else ++static inline u64 mul_u32_u32(u32 a, u32 b) ++{ ++ return (u64)a * b; ++} ++#endif ++ ++static inline u64 mul_u64_u32_shr(u64 a, u32 b, unsigned int shift) ++{ ++ u32 ah, al; ++ u64 ret; ++ ++ al = a; ++ ah = a >> 32; ++ ++ ret = mul_u32_u32(al, b) >> shift; ++ if (ah) ++ ret += mul_u32_u32(ah, b) << (32 - shift); ++ ++ return ret; ++} ++ ++#endif /* __SIZEOF_INT128__ */ ++ ++#ifndef ARRAY_SIZE ++#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) ++#endif ++ ++ + #if defined(__x86_64__) || defined(__i386__) + + +@@ -52,19 +120,140 @@ static inline unsigned long long rdpmc(unsigned int counter) { + + #define barrier() __asm__ volatile("" ::: "memory") + ++ ++#elif defined(__aarch64__) ++ ++/* Indirect stringification. Doing two levels allows the parameter to be a ++ * macro itself. For example, compile with -DFOO=bar, __stringify(FOO) ++ * converts to "bar". ++ */ ++ ++#define __stringify_1(x...) #x ++#define __stringify(x...) __stringify_1(x) ++ ++#define read_sysreg(r) ({ \ ++ u64 __val; \ ++ asm volatile("mrs %0, " __stringify(r) : "=r" (__val)); \ ++ __val; \ ++}) ++ ++static u64 read_pmccntr(void) ++{ ++ return read_sysreg(pmccntr_el0); ++} ++ ++#define PMEVCNTR_READ(idx) \ ++ static u64 read_pmevcntr_##idx(void) { \ ++ return read_sysreg(pmevcntr##idx##_el0); \ ++ } ++ ++PMEVCNTR_READ(0); ++PMEVCNTR_READ(1); ++PMEVCNTR_READ(2); ++PMEVCNTR_READ(3); ++PMEVCNTR_READ(4); ++PMEVCNTR_READ(5); ++PMEVCNTR_READ(6); ++PMEVCNTR_READ(7); ++PMEVCNTR_READ(8); ++PMEVCNTR_READ(9); ++PMEVCNTR_READ(10); ++PMEVCNTR_READ(11); ++PMEVCNTR_READ(12); ++PMEVCNTR_READ(13); ++PMEVCNTR_READ(14); ++PMEVCNTR_READ(15); ++PMEVCNTR_READ(16); ++PMEVCNTR_READ(17); ++PMEVCNTR_READ(18); ++PMEVCNTR_READ(19); ++PMEVCNTR_READ(20); ++PMEVCNTR_READ(21); ++PMEVCNTR_READ(22); ++PMEVCNTR_READ(23); ++PMEVCNTR_READ(24); ++PMEVCNTR_READ(25); ++PMEVCNTR_READ(26); ++PMEVCNTR_READ(27); ++PMEVCNTR_READ(28); ++PMEVCNTR_READ(29); ++PMEVCNTR_READ(30); ++ ++/* ++ * Read a value direct from PMEVCNTR ++ */ ++static u64 rdpmc(unsigned int counter) ++{ ++ static u64 (* const read_f[])(void) = { ++ read_pmevcntr_0, ++ read_pmevcntr_1, ++ read_pmevcntr_2, ++ read_pmevcntr_3, ++ read_pmevcntr_4, ++ read_pmevcntr_5, ++ read_pmevcntr_6, ++ read_pmevcntr_7, ++ read_pmevcntr_8, ++ read_pmevcntr_9, ++ read_pmevcntr_10, ++ read_pmevcntr_11, ++ read_pmevcntr_13, ++ read_pmevcntr_12, ++ read_pmevcntr_14, ++ read_pmevcntr_15, ++ read_pmevcntr_16, ++ read_pmevcntr_17, ++ read_pmevcntr_18, ++ read_pmevcntr_19, ++ read_pmevcntr_20, ++ read_pmevcntr_21, ++ read_pmevcntr_22, ++ read_pmevcntr_23, ++ read_pmevcntr_24, ++ read_pmevcntr_25, ++ read_pmevcntr_26, ++ read_pmevcntr_27, ++ read_pmevcntr_28, ++ read_pmevcntr_29, ++ read_pmevcntr_30, ++ read_pmccntr ++ }; ++ ++ if (counter < ARRAY_SIZE(read_f)) ++ return (read_f[counter])(); ++ ++ return 0; ++} ++ ++static u64 rdtsc(void) { return read_sysreg(cntvct_el0); } ++ ++#define barrier() asm volatile("dmb ish" : : : "memory") ++ ++#endif ++ ++#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__) ++ ++static inline u64 adjust_cap_usr_time_short(u64 a, u64 b, u64 c) ++{ ++ u64 ret; ++ ret = b + ((a - b) & c); ++ return ret; ++} ++ + /* based on the code in include/uapi/linux/perf_event.h */ + static inline unsigned long long mmap_read_self(void *addr, ++ int user_reset_flag, ++ unsigned long long reset, + unsigned long long *en, + unsigned long long *ru) { + + struct perf_event_mmap_page *pc = addr; + +- uint32_t seq, time_mult, time_shift, index, width; ++ uint32_t seq, time_mult = 0, time_shift = 0, index, width; + int64_t count; + uint64_t enabled, running; +- uint64_t cyc, time_offset; ++ uint64_t cyc = 0, time_offset = 0, time_cycles = 0, time_mask = ~0ULL; + int64_t pmc = 0; +- uint64_t quot, rem; + uint64_t delta = 0; + + +@@ -96,12 +285,11 @@ static inline unsigned long long mmap_read_self(void *addr, + time_mult = pc->time_mult; + time_shift = pc->time_shift; + +- quot=(cyc>>time_shift); +- rem = cyc & (((uint64_t)1 << time_shift) - 1); +- delta = time_offset + (quot * time_mult) + +- ((rem * time_mult) >> time_shift); ++ if (pc->cap_user_time_short) { ++ time_cycles = pc->time_cycles; ++ time_mask = pc->time_mask; ++ } + } +- enabled+=delta; + + /* actually do the measurement */ + +@@ -116,8 +304,9 @@ static inline unsigned long long mmap_read_self(void *addr, + /* numbers which break if an IOC_RESET is done */ + width = pc->pmc_width; + count = pc->offset; +- count<<=(64-width); +- count>>=(64-width); ++ if (user_reset_flag == 1) { ++ count = 0; ++ } + + /* Ugh, libpfm4 perf_event.h has cap_usr_rdpmc */ + /* while actual perf_event.h has cap_user_rdpmc */ +@@ -130,14 +319,14 @@ static inline unsigned long long mmap_read_self(void *addr, + pmc = rdpmc(index-1); + + /* sign extend result */ ++ if (user_reset_flag == 1) { ++ pmc-=reset; ++ } + pmc<<=(64-width); + pmc>>=(64-width); + + /* add current count into the existing kernel count */ + count+=pmc; +- +- /* Only adjust if index is valid */ +- running+=delta; + } else { + /* Falling back because rdpmc not supported */ + /* for this event. */ +@@ -148,14 +337,66 @@ static inline unsigned long long mmap_read_self(void *addr, + + } while (pc->lock != seq); + ++ if (enabled != running) { ++ ++ /* Adjust for cap_usr_time_short, a nop if not */ ++ cyc = adjust_cap_usr_time_short(cyc, time_cycles, time_mask); ++ ++ delta = time_offset + mul_u64_u32_shr(cyc, time_mult, time_shift); ++ ++ enabled+=delta; ++ if (index) ++ /* Only adjust if index is valid */ ++ running+=delta; ++ } ++ + if (en) *en=enabled; + if (ru) *ru=running; + + return count; + } + ++static inline unsigned long long mmap_read_reset_count(void *addr) { ++ ++ struct perf_event_mmap_page *pc = addr; ++ uint32_t seq, index; ++ uint64_t count = 0; ++ ++ if (pc == NULL) { ++ return count; ++ } ++ ++ do { ++ /* The barrier ensures we get the most up to date */ ++ /* version of the pc->lock variable */ ++ ++ seq=pc->lock; ++ barrier(); ++ ++ /* actually do the measurement */ ++ ++ /* Ugh, libpfm4 perf_event.h has cap_usr_rdpmc */ ++ /* while actual perf_event.h has cap_user_rdpmc */ ++ ++ /* Index of register to read */ ++ /* 0 means stopped/not-active */ ++ /* Need to subtract 1 to get actual index to rdpmc() */ ++ index = pc->index; ++ ++ if (pc->cap_usr_rdpmc && index) { ++ /* Read counter value */ ++ count = rdpmc(index-1); ++ } ++ barrier(); ++ ++ } while (pc->lock != seq); ++ ++ return count; ++} ++ + #else + static inline unsigned long long mmap_read_self(void *addr, ++ int user_reset_flag, + unsigned long long *en, + unsigned long long *ru) { + +commit 693dd5c014d1f0b9a3eae63de051389ed8eb338b +Author: Giuseppe Congiu +Date: Tue Feb 21 07:46:14 2023 -0500 + + perf_event: bug fix in mmap_read_self + + Commit 9a1f2d897 broke the perf_event component for power cpus. The + mmap_read_self function is missing one argument. This patch restores the + missing argument in the function. + +diff --git a/src/components/perf_event/perf_helpers.h b/src/components/perf_event/perf_helpers.h +index 097286865..7ad3524f0 100644 +--- a/src/components/perf_event/perf_helpers.h ++++ b/src/components/perf_event/perf_helpers.h +@@ -397,6 +397,7 @@ static inline unsigned long long mmap_read_reset_count(void *addr) { + #else + static inline unsigned long long mmap_read_self(void *addr, + int user_reset_flag, ++ unsigned long long reset, + unsigned long long *en, + unsigned long long *ru) { + +commit 1b3e75b7f11c7e2b7c590948216d6aaeec299010 +Author: Giuseppe Congiu +Date: Tue Feb 21 14:21:03 2023 +0100 + + perf_event: add missing mmap_read_reset_count for non default cpus + + Power cpus do not have a version of mmap_read_reset_count. Implement the + missing function. + +diff --git a/src/components/perf_event/perf_helpers.h b/src/components/perf_event/perf_helpers.h +index 7ad3524f0..73e82c8ae 100644 +--- a/src/components/perf_event/perf_helpers.h ++++ b/src/components/perf_event/perf_helpers.h +@@ -409,6 +409,11 @@ static inline unsigned long long mmap_read_self(void *addr, + return (unsigned long long)(-1); + } + ++static inline unsigned long long mmap_read_reset_count(void *addr __attribute__((unused))) { ++ ++ return (unsigned long long)(-1); ++} ++ + #endif + + /* These functions are based on builtin-record.c in the */ +commit 37d0c77b7b4d00a958dff50dc715cf63e0cd6084 +Author: Giuseppe Congiu +Date: Tue Feb 21 14:22:53 2023 +0100 + + perf_event: used unused attribute in mmap_read_self + +diff --git a/src/components/perf_event/perf_helpers.h b/src/components/perf_event/perf_helpers.h +index 73e82c8ae..59c8a2fc8 100644 +--- a/src/components/perf_event/perf_helpers.h ++++ b/src/components/perf_event/perf_helpers.h +@@ -395,16 +395,11 @@ static inline unsigned long long mmap_read_reset_count(void *addr) { + } + + #else +-static inline unsigned long long mmap_read_self(void *addr, +- int user_reset_flag, +- unsigned long long reset, +- unsigned long long *en, +- unsigned long long *ru) { +- +- (void)addr; +- +- *en=0; +- *ru=0; ++static inline unsigned long long mmap_read_self(void *addr __attribute__((unused)), ++ int user_reset_flag __attribute__((unused)), ++ unsigned long long reset __attribute__((unused)), ++ unsigned long long *en __attribute__((unused)), ++ unsigned long long *ru __attribute__((unused))) { + + return (unsigned long long)(-1); + } diff --git a/SOURCES/papi-thread_init.patch b/SOURCES/papi-thread_init.patch new file mode 100644 index 0000000..0e790c6 --- /dev/null +++ b/SOURCES/papi-thread_init.patch @@ -0,0 +1,106 @@ +commit 3625bdbad9fd57d1cdb1e5615854545167d4adcb +Author: Anthony Castaldo +Date: Wed Aug 26 17:18:29 2020 -0400 + + This modifies PAPI_library_init() to initialize components in two classes, + separated by the initialization of the papi thread structure. The first class + is those that need no thread structure, currently everything but perf_event and + perf_event_uncore. Following the init of the threading structure, we init the + second class (perf_event and perf_event_uncore) that DOES need the thread + structure to successfully init_component(). This required a change to + _papi_hwi_init_global(), to add an argument to distinguish which class it + should initialize. + +diff --git a/src/papi.c b/src/papi.c +index 33cc2993..ed75af49 100644 +--- a/src/papi.c ++++ b/src/papi.c +@@ -1151,7 +1151,23 @@ PAPI_library_init( int version ) + papi_return( init_retval ); + } + +- /* Initialize thread globals, including the main threads */ ++ /* Initialize component globals EXCEPT for perf_event, perf_event_uncore. ++ * To avoid race conditions, these components use the thread local storage ++ * construct initialized by _papi_hwi_init_global_threads(), from within ++ * their init_component(). So these must have init_component() run AFTER ++ * _papi_hwi_init_global_threads. Other components demand that init threads ++ * run AFTER init_component(), which sets up globals they need. ++ */ ++ ++ tmp = _papi_hwi_init_global( 0 ); /* Selector 0 to skip perf_event, perf_event_uncore */ ++ if ( tmp ) { ++ init_retval = tmp; ++ _papi_hwi_shutdown_global_internal( ); ++ _in_papi_library_init_cnt--; ++ papi_return( init_retval ); ++ } ++ ++ /* Initialize thread globals, including the main threads */ + + tmp = _papi_hwi_init_global_threads( ); + if ( tmp ) { +@@ -1161,9 +1177,9 @@ PAPI_library_init( int version ) + papi_return( init_retval ); + } + +- /* Initialize component globals */ ++ /* Initialize perf_event, perf_event_uncore components */ + +- tmp = _papi_hwi_init_global( ); ++ tmp = _papi_hwi_init_global( 1 ); /* Selector 1 for only perf_event, perf_event_uncore */ + if ( tmp ) { + init_retval = tmp; + _papi_hwi_shutdown_global_internal( ); +diff --git a/src/papi_internal.c b/src/papi_internal.c +index 5a1ccd43..e6dd319c 100644 +--- a/src/papi_internal.c ++++ b/src/papi_internal.c +@@ -1928,11 +1928,13 @@ int papi_num_components = ( sizeof ( _papi_hwd ) / sizeof ( *_papi_hwd ) ) - 1; + * Routine that initializes all available components. + * A component is available if a pointer to its info vector + * appears in the NULL terminated_papi_hwd table. ++ * Modified to accept an arg: 0=do not init perf_event or ++ * perf_event_uncore. 1=init ONLY perf_event or perf_event_uncore. + */ + int +-_papi_hwi_init_global( void ) ++_papi_hwi_init_global( int PE_OR_PEU ) + { +- int retval, i = 0; ++ int retval, is_pe_peu, i = 0; + + retval = _papi_hwi_innoculate_os_vector( &_papi_os_vector ); + if ( retval != PAPI_OK ) { +@@ -1940,14 +1942,16 @@ _papi_hwi_init_global( void ) + } + + while ( _papi_hwd[i] ) { +- ++ is_pe_peu = 0; ++ if (strcmp(_papi_hwd[i]->cmp_info.name, "perf_event") == 0) is_pe_peu=1; ++ if (strcmp(_papi_hwd[i]->cmp_info.name, "perf_event_uncore") == 0) is_pe_peu=1; + retval = _papi_hwi_innoculate_vector( _papi_hwd[i] ); + if ( retval != PAPI_OK ) { + return retval; + } + + /* We can be disabled by user before init */ +- if (!_papi_hwd[i]->cmp_info.disabled) { ++ if (!_papi_hwd[i]->cmp_info.disabled && (PE_OR_PEU == is_pe_peu)) { + retval = _papi_hwd[i]->init_component( i ); + _papi_hwd[i]->cmp_info.disabled=retval; + +diff --git a/src/papi_internal.h b/src/papi_internal.h +index 6492fea4..e0f5acd7 100644 +--- a/src/papi_internal.h ++++ b/src/papi_internal.h +@@ -467,7 +467,7 @@ int _papi_hwi_read( hwd_context_t * context, EventSetInfo_t * ESI, + long long *values ); + int _papi_hwi_cleanup_eventset( EventSetInfo_t * ESI ); + int _papi_hwi_convert_eventset_to_multiplex( _papi_int_multiplex_t * mpx ); +-int _papi_hwi_init_global( void ); ++int _papi_hwi_init_global( int PE_OR_PEU ); + int _papi_hwi_init_global_internal( void ); + int _papi_hwi_init_os(void); + void _papi_hwi_init_errors(void); diff --git a/SPECS/papi.spec b/SPECS/papi.spec index bb51b3b..80dd15c 100644 --- a/SPECS/papi.spec +++ b/SPECS/papi.spec @@ -11,7 +11,7 @@ Summary: Performance Application Programming Interface Name: papi Version: 6.0.0 -Release: 12%{?dist} +Release: 15%{?dist} License: BSD Requires: papi-libs = %{version}-%{release} URL: http://icl.cs.utk.edu/papi/ @@ -22,11 +22,13 @@ URL: http://icl.cs.utk.edu/papi/ # so when papi is rebased to a newer version it can be used as is. Source0: http://icl.cs.utk.edu/projects/papi/downloads/%{name}-%{version}-noiozone.tar.gz Patch1: papi-python3.patch -Patch2: papi-a64fx.patch Patch4: papi-config.patch Patch5: papi-nostatic.patch Patch6: papi-lto.patch Patch7: papi-rhbz1923967.patch +Patch21: papi-arm64fastread.patch +Patch31: papi-701eventupdate.patch +Patch40: papi-thread_init.patch BuildRequires: make BuildRequires: autoconf BuildRequires: doxygen @@ -36,9 +38,9 @@ BuildRequires: kernel-headers >= 2.6.32 BuildRequires: chrpath BuildRequires: lm_sensors-devel %if %{without bundled_libpfm} -BuildRequires: libpfm-devel >= 4.6.0-1 +BuildRequires: libpfm-devel >= 4.13.0-1 %if %{with_static} -BuildRequires: libpfm-static >= 4.6.0-1 +BuildRequires: libpfm-static >= 4.13.0-1 %endif %endif # Following required for net component @@ -95,11 +97,13 @@ the PAPI user-space libraries and interfaces. %prep %setup -q %patch1 -p1 -b .python3 -%patch2 -p1 -b .a64fx %patch4 -p1 %patch5 -p1 %patch6 -p1 %patch7 -p1 +%patch21 -p1 +%patch31 -p1 +%patch40 -p1 %build @@ -192,6 +196,15 @@ chrpath --delete $RPM_BUILD_ROOT%{_libdir}/*.so* %endif %changelog +* Fri Jun 16 2023 William Cohen - 6.0.0-15 +- Address thread initialization order. (RHBZ#2215582) + +* Thu May 4 2023 William Cohen - 6.0.0-14 +- Update papi event presets (RHBZ#2111923, RHBZ#2111942, RHBZ#2111947) + +* Thu Apr 27 2023 William Cohen - 6.0.0-13 +- Improve aarch64 read speed. (rhbz2186927) + * Thu May 26 2022 William Cohen - 6.0.0-12 - Disable problematic IBM Power9 events. (RHBZ#1923967)