commit 20890adcb59a1c1648cb70be65332c03a3781e1a Author: Anthony Castaldo Date: Thu Jan 16 16:43:51 2020 -0500 Added two machine types to papi_events.csv to be in line with libpfm4 update to support amd64_fam17h_zen1 and zen2. diff --git a/src/papi_events.csv b/src/papi_events.csv index 97446ad2c..8e96adfbd 100644 --- a/src/papi_events.csv +++ b/src/papi_events.csv @@ -396,6 +396,8 @@ PRESET,PAPI_FSQ_INS,NOT_DERIVED,RETIRED_SSE_AVX_OPERATIONS:SINGLE_DIV_OPS:DOUBLE # # CPU,amd64_fam17h +CPU,amd64_fam17h_zen1 +CPU,amd64_fam17h_zen2 # PRESET,PAPI_TOT_INS,NOT_DERIVED,RETIRED_INSTRUCTIONS PRESET,PAPI_TOT_CYC,NOT_DERIVED,CYCLES_NOT_IN_HALT commit ae449f73abd0849f05ab3e1f3a64bde0c670c645 Author: Anthony Date: Fri Jul 17 12:05:14 2020 -0400 Separated the cache preset events of AMD Zen1 and Zen2 and added some more. diff --git a/src/papi_events.csv b/src/papi_events.csv index 8e96adfbd..2325bd4dc 100644 --- a/src/papi_events.csv +++ b/src/papi_events.csv @@ -397,7 +397,6 @@ PRESET,PAPI_FSQ_INS,NOT_DERIVED,RETIRED_SSE_AVX_OPERATIONS:SINGLE_DIV_OPS:DOUBLE # CPU,amd64_fam17h CPU,amd64_fam17h_zen1 -CPU,amd64_fam17h_zen2 # PRESET,PAPI_TOT_INS,NOT_DERIVED,RETIRED_INSTRUCTIONS PRESET,PAPI_TOT_CYC,NOT_DERIVED,CYCLES_NOT_IN_HALT @@ -434,6 +433,27 @@ PRESET,PAPI_FML_INS,NOT_DERIVED,RETIRED_SSE_AVX_OPERATIONS:SP_MULT_FLOPS:DP_MULT PRESET,PAPI_FAD_INS,NOT_DERIVED,RETIRED_SSE_AVX_OPERATIONS:SP_ADD_SUB_FLOPS:DP_ADD_SUB_FLOPS PRESET,PAPI_FDV_INS,NOT_DERIVED,RETIRED_SSE_AVX_OPERATIONS:SP_DIV_FLOPS:DP_DIV_FLOPS,NOTE,"Counts both divide and square root instructions" PRESET,PAPI_FSQ_INS,NOT_DERIVED,RETIRED_SSE_AVX_OPERATIONS:SP_DIV_FLOPS:DP_DIV_FLOPS,NOTE,"Counts both divide and square root instructions" +# Events discovered via CAT +PRESET,PAPI_L2_DCM,NOT_DERIVED,CORE_TO_L2_CACHEABLE_REQUEST_ACCESS_STATUS:LS_RD_BLK_C +PRESET,PAPI_L2_DCR,NOT_DERIVED,REQUESTS_TO_L2_GROUP1:RD_BLK_L +PRESET,PAPI_L2_DCH,NOT_DERIVED,CORE_TO_L2_CACHEABLE_REQUEST_ACCESS_STATUS:LS_RD_BLK_L_HIT_X +# +# 
+CPU,amd64_fam17h_zen2 +# Events copied from zen1 that also exist on zen2 +PRESET,PAPI_TLB_DM,NOT_DERIVED,L1_DTLB_MISS:TLB_RELOAD_1G_L2_MISS:TLB_RELOAD_2M_L2_MISS:TLB_RELOAD_COALESCED_PAGE_MISS:TLB_RELOAD_4K_L2_MISS:TLB_RELOAD_1G_L2_HIT:TLB_RELOAD_2M_L2_HIT:TLB_RELOAD_COALESCED_PAGE_HIT:TLB_RELOAD_4K_L2_HIT +PRESET,PAPI_TLB_IM,DERIVED_ADD,L1_ITLB_MISS_L2_ITLB_HIT,L1_ITLB_MISS_L2_ITLB_MISS:IF1G:IF2M:IF4K +PRESET,PAPI_BR_TKN,NOT_DERIVED,RETIRED_TAKEN_BRANCH_INSTRUCTIONS +PRESET,PAPI_BR_MSP,NOT_DERIVED,RETIRED_BRANCH_INSTRUCTIONS_MISPREDICTED +PRESET,PAPI_TOT_INS,NOT_DERIVED,RETIRED_INSTRUCTIONS +PRESET,PAPI_BR_INS,NOT_DERIVED,RETIRED_BRANCH_INSTRUCTIONS +PRESET,PAPI_TOT_CYC,NOT_DERIVED,CYCLES_NOT_IN_HALT +# Events discovered via CAT +PRESET,PAPI_L1_DCA,NOT_DERIVED,perf::PERF_COUNT_HW_CACHE_L1D:ACCESS +PRESET,PAPI_L2_DCM,NOT_DERIVED,CORE_TO_L2_CACHEABLE_REQUEST_ACCESS_STATUS:LS_RD_BLK_C +PRESET,PAPI_L2_DCR,NOT_DERIVED,REQUESTS_TO_L2_GROUP1:RD_BLK_L +PRESET,PAPI_L2_DCH,NOT_DERIVED,CORE_TO_L2_CACHEABLE_REQUEST_ACCESS_STATUS:LS_RD_BLK_L_HIT_X + # # CPU,Intel architectural PMU commit ccc22b5dda46fea8933d99950c3e30b5298cdd1d Author: Heike Jagode Date: Thu Sep 24 13:33:38 2020 -0400 Added presets for floating-point operations (FP_OPS, DP_OPS, SP_OPS) for AMD zen2. PPR (under section 2.1.15.3. -- https://www.amd.com/system/files/TechDocs/54945_3.03_ppr_ZP_B2_pub.zip) explains that FLOP events require MergeEvent support, which was included in the 5.6 kernel. ===>>> Hence, a kernel version 5.6 or greater is required. NOTE: without the MergeEvent support in the kernel, there is no guarantee that the SSE/AVX FLOP events produce any useful data whatsoever. These events have been tested and verified for scalar flops, SSE, AVX, and FMA: (1) for one AVX instruction (e.g. _mm256_add_pd()), the RETIRED_SSE_AVX_FLOPS:ADD_SUB_FLOPS event returns a count of 4 (in the case of double precision), and a count of 8 (in the case of single precision). (2) for one AVX FMA instruction (e.g. 
_mm256_macc_pd()), the RETIRED_SSE_AVX_FLOPS:MAC_FLOPS event returns a count of 8 (in the case of double precision), and a count of 16 (in the case of single precision). (3) for one SSE instruction (e.g. _mm_mul_pd()), the RETIRED_SSE_AVX_FLOPS:MULT_FLOPS event returns a count of 2 (in the case of double precision), and a count of 4 (in the case of single precision). diff --git a/src/papi_events.csv b/src/papi_events.csv index 2325bd4dc..2ff3e4d16 100644 --- a/src/papi_events.csv +++ b/src/papi_events.csv @@ -454,8 +454,19 @@ PRESET,PAPI_L2_DCM,NOT_DERIVED,CORE_TO_L2_CACHEABLE_REQUEST_ACCESS_STATUS:LS_RD_ PRESET,PAPI_L2_DCR,NOT_DERIVED,REQUESTS_TO_L2_GROUP1:RD_BLK_L PRESET,PAPI_L2_DCH,NOT_DERIVED,CORE_TO_L2_CACHEABLE_REQUEST_ACCESS_STATUS:LS_RD_BLK_L_HIT_X -# -# +# New FLOP event on zen2 +# PPR (under section 2.1.15.3. -- +# https://www.amd.com/system/files/TechDocs/54945_3.03_ppr_ZP_B2_pub.zip) +# explains that FLOP events require MergeEvent support, which was included +# in the 5.6 kernel. +# Hence, a kernel version 5.6 or greater is required. +# NOTE: without the MergeEvent support in the kernel, there is no guarantee +# that this SSE/AVX FLOP event produces any useful data whatsoever. +PRESET,PAPI_FP_OPS,NOT_DERIVED,RETIRED_SSE_AVX_FLOPS:ANY +PRESET,PAPI_DP_OPS,NOT_DERIVED,RETIRED_SSE_AVX_FLOPS:ANY +PRESET,PAPI_SP_OPS,NOT_DERIVED,RETIRED_SSE_AVX_FLOPS:ANY + + CPU,Intel architectural PMU CPU,ix86arch # commit 35f93252a6e222299c03f2c94912334488e76b02 Author: Heike Jagode Date: Thu Sep 24 18:40:59 2020 -0400 Added presets for floating-point instructions (FP_INS, VEC_DP, VEC_SP) for AMD zen2. For unoptimized code (like native MMM), these events may include non-numeric floating-point instructions, e.g. MOVSD: move or merge scalar double-precision floating-point value instructions. 
Tested with: 1) SSE double: _mm_mul_pd / _mm_add_pd 2) SSE single: _mm_mul_ps / _mm_add_ps 3) AVX double: _mm256_mul_pd / _mm256_add_pd 4) AVX single: _mm256_mul_ps / _mm256_add_ps 5) FMA double: _mm256_macc_pd 6) FMA single: _mm256_macc_ps diff --git a/src/papi_events.csv b/src/papi_events.csv index 2ff3e4d16..60a64564d 100644 --- a/src/papi_events.csv +++ b/src/papi_events.csv @@ -465,6 +465,11 @@ PRESET,PAPI_L2_DCH,NOT_DERIVED,CORE_TO_L2_CACHEABLE_REQUEST_ACCESS_STATUS:LS_RD_ PRESET,PAPI_FP_OPS,NOT_DERIVED,RETIRED_SSE_AVX_FLOPS:ANY PRESET,PAPI_DP_OPS,NOT_DERIVED,RETIRED_SSE_AVX_FLOPS:ANY PRESET,PAPI_SP_OPS,NOT_DERIVED,RETIRED_SSE_AVX_FLOPS:ANY +# Floating-point instructions (including non-numeric floating-point instructions, +# e.g. Move or Merge Scalar Double-Precision Floating-Point values) +PAPI_FP_INS,NOT_DERIVED,RETIRED_MMX_FP_INSTRUCTIONS:SSE_INSTR:MMX_INSTR:X87_INSTR +PAPI_VEC_DP,NOT_DERIVED,RETIRED_MMX_FP_INSTRUCTIONS:SSE_INSTR:MMX_INSTR:X87_INSTR +PAPI_VEC_SP,NOT_DERIVED,RETIRED_MMX_FP_INSTRUCTIONS:SSE_INSTR:MMX_INSTR:X87_INSTR CPU,Intel architectural PMU commit 344f6493425d865577508ff32b6f65516b1b4394 Author: Heike Jagode Date: Thu Sep 24 19:03:31 2020 -0400 Added missing 'PRESET' to csv file. diff --git a/src/papi_events.csv b/src/papi_events.csv index 60a64564d..724d520f0 100644 --- a/src/papi_events.csv +++ b/src/papi_events.csv @@ -467,9 +467,9 @@ PRESET,PAPI_DP_OPS,NOT_DERIVED,RETIRED_SSE_AVX_FLOPS:ANY PRESET,PAPI_SP_OPS,NOT_DERIVED,RETIRED_SSE_AVX_FLOPS:ANY # Floating-point instructions (including non-numeric floating-point instructions, # e.g. 
Move or Merge Scalar Double-Precision Floating-Point values) -PAPI_FP_INS,NOT_DERIVED,RETIRED_MMX_FP_INSTRUCTIONS:SSE_INSTR:MMX_INSTR:X87_INSTR -PAPI_VEC_DP,NOT_DERIVED,RETIRED_MMX_FP_INSTRUCTIONS:SSE_INSTR:MMX_INSTR:X87_INSTR -PAPI_VEC_SP,NOT_DERIVED,RETIRED_MMX_FP_INSTRUCTIONS:SSE_INSTR:MMX_INSTR:X87_INSTR +PRESET,PAPI_FP_INS,NOT_DERIVED,RETIRED_MMX_FP_INSTRUCTIONS:SSE_INSTR:MMX_INSTR:X87_INSTR +PRESET,PAPI_VEC_DP,NOT_DERIVED,RETIRED_MMX_FP_INSTRUCTIONS:SSE_INSTR:MMX_INSTR:X87_INSTR +PRESET,PAPI_VEC_SP,NOT_DERIVED,RETIRED_MMX_FP_INSTRUCTIONS:SSE_INSTR:MMX_INSTR:X87_INSTR CPU,Intel architectural PMU commit 4616aa717c5301a9a478876661eb8ac1f18c0333 Author: Heike Jagode Date: Thu Oct 8 11:36:23 2020 -0400 For zen2, since FP_OPS counts both single- and double-prec operations correctly, we don't need to confuse the user with additional DP_OPS and SP_OPS events. So, I'm taking them out. Same applies for events counting FP instructions. diff --git a/src/papi_events.csv b/src/papi_events.csv index 724d520f0..9ebf557e1 100644 --- a/src/papi_events.csv +++ b/src/papi_events.csv @@ -463,13 +463,20 @@ PRESET,PAPI_L2_DCH,NOT_DERIVED,CORE_TO_L2_CACHEABLE_REQUEST_ACCESS_STATUS:LS_RD_ # NOTE: without the MergeEvent support in the kernel, there is no guarantee # that this SSE/AVX FLOP event produces any useful data whatsoever. PRESET,PAPI_FP_OPS,NOT_DERIVED,RETIRED_SSE_AVX_FLOPS:ANY -PRESET,PAPI_DP_OPS,NOT_DERIVED,RETIRED_SSE_AVX_FLOPS:ANY -PRESET,PAPI_SP_OPS,NOT_DERIVED,RETIRED_SSE_AVX_FLOPS:ANY +# Since FP_OPS counts both single- and double-prec operations +# correctly, we don't need to confuse the user with additional +# DP_OPS and SP_OPS events. So, I'm taking them out. +#PRESET,PAPI_DP_OPS,NOT_DERIVED,RETIRED_SSE_AVX_FLOPS:ANY +#PRESET,PAPI_SP_OPS,NOT_DERIVED,RETIRED_SSE_AVX_FLOPS:ANY +# # Floating-point instructions (including non-numeric floating-point instructions, # e.g. 
Move or Merge Scalar Double-Precision Floating-Point values) PRESET,PAPI_FP_INS,NOT_DERIVED,RETIRED_MMX_FP_INSTRUCTIONS:SSE_INSTR:MMX_INSTR:X87_INSTR -PRESET,PAPI_VEC_DP,NOT_DERIVED,RETIRED_MMX_FP_INSTRUCTIONS:SSE_INSTR:MMX_INSTR:X87_INSTR -PRESET,PAPI_VEC_SP,NOT_DERIVED,RETIRED_MMX_FP_INSTRUCTIONS:SSE_INSTR:MMX_INSTR:X87_INSTR +# Since FP_INS counts both single- and double-prec instructions +# correctly, we don't need to confuse the user with additional +# VEC_DP and VEC_SP events. So, I'm taking them out. +#PRESET,PAPI_VEC_DP,NOT_DERIVED,RETIRED_MMX_FP_INSTRUCTIONS:SSE_INSTR:MMX_INSTR:X87_INSTR +#PRESET,PAPI_VEC_SP,NOT_DERIVED,RETIRED_MMX_FP_INSTRUCTIONS:SSE_INSTR:MMX_INSTR:X87_INSTR CPU,Intel architectural PMU commit 274219e85ba8adcd2e9c78507adf7edb05b71daa Author: Sebastian Mobo Date: Thu Oct 8 13:40:21 2020 -0400 Added instruction-cache preset events for the Zen2. Signed-off-by: Anthony diff --git a/src/papi_events.csv b/src/papi_events.csv index 9ebf557e1..fd75f9371 100644 --- a/src/papi_events.csv +++ b/src/papi_events.csv @@ -453,7 +453,12 @@ PRESET,PAPI_L1_DCA,NOT_DERIVED,perf::PERF_COUNT_HW_CACHE_L1D:ACCESS PRESET,PAPI_L2_DCM,NOT_DERIVED,CORE_TO_L2_CACHEABLE_REQUEST_ACCESS_STATUS:LS_RD_BLK_C PRESET,PAPI_L2_DCR,NOT_DERIVED,REQUESTS_TO_L2_GROUP1:RD_BLK_L PRESET,PAPI_L2_DCH,NOT_DERIVED,CORE_TO_L2_CACHEABLE_REQUEST_ACCESS_STATUS:LS_RD_BLK_L_HIT_X - +# +PRESET,PAPI_L1_ICM,NOT_DERIVED,REQUESTS_TO_L2_GROUP1:CACHEABLE_IC_READ +# +PRESET,PAPI_L2_ICR,NOT_DERIVED,REQUESTS_TO_L2_GROUP1:CACHEABLE_IC_READ +PRESET,PAPI_L2_ICM,NOT_DERIVED,CORE_TO_L2_CACHEABLE_REQUEST_ACCESS_STATUS:IC_FILL_MISS +PRESET,PAPI_L2_ICH,NOT_DERIVED,CORE_TO_L2_CACHEABLE_REQUEST_ACCESS_STATUS:IC_FILL_HIT_X:IC_FILL_HIT_S # New FLOP event on zen2 # PPR (under section 2.1.15.3. -- # https://www.amd.com/system/files/TechDocs/54945_3.03_ppr_ZP_B2_pub.zip) commit 02f34baafb868d183f21bebfd3c46574847b9929 Author: Swarup Sahoo Date: Tue May 18 02:51:56 2021 +0530 Added AMD Zen3 preset events. 
Refer section 2.1.17.2 of PPR for AMD family 19h model 01h, https://www.amd.com/system/files/TechDocs/55898_pub.zip Signed-off-by: Swarup Sahoo diff --git a/src/papi_events.csv b/src/papi_events.csv index 4ef647959..d9e9da8a3 100644 --- a/src/papi_events.csv +++ b/src/papi_events.csv @@ -482,6 +482,33 @@ PRESET,PAPI_FP_INS,NOT_DERIVED,RETIRED_MMX_FP_INSTRUCTIONS:SSE_INSTR:MMX_INSTR:X # VEC_DP and VEC_SP events. So, I'm taking them out. #PRESET,PAPI_VEC_DP,NOT_DERIVED,RETIRED_MMX_FP_INSTRUCTIONS:SSE_INSTR:MMX_INSTR:X87_INSTR #PRESET,PAPI_VEC_SP,NOT_DERIVED,RETIRED_MMX_FP_INSTRUCTIONS:SSE_INSTR:MMX_INSTR:X87_INSTR +# +# +CPU,amd64_fam19h_zen3 +PRESET,PAPI_TOT_INS,NOT_DERIVED,RETIRED_INSTRUCTIONS +PRESET,PAPI_TOT_CYC,NOT_DERIVED,CYCLES_NOT_IN_HALT +PRESET,PAPI_BR_INS,NOT_DERIVED,RETIRED_BRANCH_INSTRUCTIONS +PRESET,PAPI_BR_TKN,NOT_DERIVED,RETIRED_TAKEN_BRANCH_INSTRUCTIONS +PRESET,PAPI_BR_MSP,NOT_DERIVED,RETIRED_BRANCH_INSTRUCTIONS_MISPREDICTED +PRESET,PAPI_TLB_DM,NOT_DERIVED, L1_DTLB_MISS:TLB_RELOAD_1G_L2_MISS:TLB_RELOAD_2M_L2_MISS:TLB_RELOAD_COALESCED_PAGE_MISS:TLB_RELOAD_4K_L2_MISS:TLB_RELOAD_1G_L2_HIT:TLB_RELOAD_2M_L2_HIT:TLB_RELOAD_COALESCED_PAGE_HIT:TLB_RELOAD_4K_L2_HIT +PRESET,PAPI_TLB_IM,DERIVED_ADD,L1_ITLB_MISS_L2_ITLB_HIT,L1_ITLB_MISS_L2_ITLB_MISS:COALESCED4K:IF1G:IF2M:IF4K +PRESET,PAPI_L1_DCA,NOT_DERIVED,LS_DISPATCH:LD_ST_DISPATCH:STORE_DISPATCH:LD_DISPATCH +PRESET,PAPI_L1_DCM,NOT_DERIVED,REQUESTS_TO_L2_GROUP1:RD_BLK_L:RD_BLK_X:LS_RD_BLK_C_S:CHANGE_TO_X +PRESET,PAPI_L2_DCM,NOT_DERIVED,CORE_TO_L2_CACHEABLE_REQUEST_ACCESS_STATUS:LS_RD_BLK_C +PRESET,PAPI_L2_DCR,NOT_DERIVED,REQUESTS_TO_L2_GROUP1:RD_BLK_L:RD_BLK_X:LS_RD_BLK_C_S:CHANGE_TO_X +PRESET,PAPI_L2_DCH,NOT_DERIVED,CORE_TO_L2_CACHEABLE_REQUEST_ACCESS_STATUS:LS_RD_BLK_C_S:LS_RD_BLK_L_HIT_X:LS_RD_BLK_L_HIT_S:LS_RD_BLK_X +PRESET,PAPI_L2_ICR,NOT_DERIVED,REQUESTS_TO_L2_GROUP1:CACHEABLE_IC_READ +PRESET,PAPI_L2_ICA,NOT_DERIVED,REQUESTS_TO_L2_GROUP1:CACHEABLE_IC_READ 
+PRESET,PAPI_L2_ICM,NOT_DERIVED,CORE_TO_L2_CACHEABLE_REQUEST_ACCESS_STATUS:IC_FILL_MISS +PRESET,PAPI_L2_ICH,NOT_DERIVED,CORE_TO_L2_CACHEABLE_REQUEST_ACCESS_STATUS:IC_FILL_HIT_X:IC_FILL_HIT_S +# RETIRED_SSE_AVX_FLOPS requires MergeEvent support. +PRESET,PAPI_VEC_INS,NOT_DERIVED,RETIRED_SSE_AVX_FLOPS:ANY +PRESET,PAPI_FP_INS,NOT_DERIVED,RETIRED_SSE_AVX_FLOPS:ANY +PRESET,PAPI_FP_OPS,NOT_DERIVED,RETIRED_SSE_AVX_FLOPS:ANY +PRESET,PAPI_FML_INS,NOT_DERIVED,RETIRED_SSE_AVX_FLOPS:MULT_FLOPS +PRESET,PAPI_FAD_INS,NOT_DERIVED,RETIRED_SSE_AVX_FLOPS:ADD_SUB_FLOPS +PRESET,PAPI_FDV_INS,NOT_DERIVED,RETIRED_SSE_AVX_FLOPS:DIV_FLOPS +PRESET,PAPI_FSQ_INS,NOT_DERIVED,RETIRED_SSE_AVX_FLOPS:DIV_FLOPS CPU,Intel architectural PMU