forked from rpms/glibc

Compare commits

No commits in common. "a10s" and "c8" have entirely different histories.

7  .gitignore  vendored

@@ -1,6 +1 @@
-# Release tarballs.
-/glibc-*.tar.[gx]z
-# Generated (source) RPMs.
-/*.rpm
-# Expanded source trees.
-/glibc-*/
+SOURCES/glibc-2.28.tar.xz

1  .glibc.metadata  Normal file

@@ -0,0 +1 @@
+ccb5dc9e51a9884df8488f86982439d47b283b2a SOURCES/glibc-2.28.tar.xz
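
This metadata line is how dist-git pins the lookaside-cached tarball: a SHA-1 digest followed by the path the tarball is fetched to. As a rough illustration of the check this enables — a minimal sketch assuming OpenSSL's SHA1_* API (build with -lcrypto); the real dist-git client tooling performs this verification itself — the digest can be recomputed and compared:

/* verify_metadata.c: recompute the SHA-1 of the source tarball and
   compare it against the digest recorded in .glibc.metadata.
   Illustrative sketch only, not part of the repository.  */
#include <openssl/sha.h>
#include <stdio.h>
#include <string.h>

int main (void)
{
  const char *expected = "ccb5dc9e51a9884df8488f86982439d47b283b2a";
  const char *path = "SOURCES/glibc-2.28.tar.xz";
  unsigned char buf[65536], md[SHA_DIGEST_LENGTH];
  char hex[2 * SHA_DIGEST_LENGTH + 1];
  SHA_CTX ctx;
  size_t n;

  FILE *f = fopen (path, "rb");
  if (f == NULL)
    {
      perror (path);
      return 2;
    }
  SHA1_Init (&ctx);
  while ((n = fread (buf, 1, sizeof buf, f)) > 0)
    SHA1_Update (&ctx, buf, n);
  fclose (f);
  SHA1_Final (md, &ctx);

  for (int i = 0; i < SHA_DIGEST_LENGTH; ++i)
    sprintf (hex + 2 * i, "%02x", md[i]);
  printf ("%s  %s\n", hex, path);
  return strcmp (hex, expected) == 0 ? 0 : 1;  /* 0 when the digest matches */
}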
							
								
								
									
11823  ChangeLog.old

File diff suppressed because it is too large.

README.scripts

@@ -1,2 +0,0 @@
-All of the useful glibc maintainer scripts are located at:
-https://pagure.io/glibc-maintainer-scripts
							
								
								
									
6058  SOURCES/ChangeLog.old  Normal file

File diff suppressed because it is too large.
							
							
								
								
									
496  SOURCES/SUPPORTED  Normal file

@@ -0,0 +1,496 @@
+# This file names the currently supported and somewhat tested locales.
+# If you have any additions please file a glibc bug report.
+SUPPORTED-LOCALES=\
+C.UTF-8/UTF-8 \
+aa_DJ.UTF-8/UTF-8 \
+aa_DJ/ISO-8859-1 \
+aa_ER/UTF-8 \
+aa_ER@saaho/UTF-8 \
+aa_ET/UTF-8 \
+af_ZA.UTF-8/UTF-8 \
+af_ZA/ISO-8859-1 \
+agr_PE/UTF-8 \
+ak_GH/UTF-8 \
+am_ET/UTF-8 \
+an_ES.UTF-8/UTF-8 \
+an_ES/ISO-8859-15 \
+anp_IN/UTF-8 \
+ar_AE.UTF-8/UTF-8 \
+ar_AE/ISO-8859-6 \
+ar_BH.UTF-8/UTF-8 \
+ar_BH/ISO-8859-6 \
+ar_DZ.UTF-8/UTF-8 \
+ar_DZ/ISO-8859-6 \
+ar_EG.UTF-8/UTF-8 \
+ar_EG/ISO-8859-6 \
+ar_IN/UTF-8 \
+ar_IQ.UTF-8/UTF-8 \
+ar_IQ/ISO-8859-6 \
+ar_JO.UTF-8/UTF-8 \
+ar_JO/ISO-8859-6 \
+ar_KW.UTF-8/UTF-8 \
+ar_KW/ISO-8859-6 \
+ar_LB.UTF-8/UTF-8 \
+ar_LB/ISO-8859-6 \
+ar_LY.UTF-8/UTF-8 \
+ar_LY/ISO-8859-6 \
+ar_MA.UTF-8/UTF-8 \
+ar_MA/ISO-8859-6 \
+ar_OM.UTF-8/UTF-8 \
+ar_OM/ISO-8859-6 \
+ar_QA.UTF-8/UTF-8 \
+ar_QA/ISO-8859-6 \
+ar_SA.UTF-8/UTF-8 \
+ar_SA/ISO-8859-6 \
+ar_SD.UTF-8/UTF-8 \
+ar_SD/ISO-8859-6 \
+ar_SS/UTF-8 \
+ar_SY.UTF-8/UTF-8 \
+ar_SY/ISO-8859-6 \
+ar_TN.UTF-8/UTF-8 \
+ar_TN/ISO-8859-6 \
+ar_YE.UTF-8/UTF-8 \
+ar_YE/ISO-8859-6 \
+ayc_PE/UTF-8 \
+az_AZ/UTF-8 \
+az_IR/UTF-8 \
+as_IN/UTF-8 \
+ast_ES.UTF-8/UTF-8 \
+ast_ES/ISO-8859-15 \
+be_BY.UTF-8/UTF-8 \
+be_BY/CP1251 \
+be_BY@latin/UTF-8 \
+bem_ZM/UTF-8 \
+ber_DZ/UTF-8 \
+ber_MA/UTF-8 \
+bg_BG.UTF-8/UTF-8 \
+bg_BG/CP1251 \
+bhb_IN.UTF-8/UTF-8 \
+bho_IN/UTF-8 \
+bho_NP/UTF-8 \
+bi_VU/UTF-8 \
+bn_BD/UTF-8 \
+bn_IN/UTF-8 \
+bo_CN/UTF-8 \
+bo_IN/UTF-8 \
+br_FR.UTF-8/UTF-8 \
+br_FR/ISO-8859-1 \
+br_FR@euro/ISO-8859-15 \
+brx_IN/UTF-8 \
+bs_BA.UTF-8/UTF-8 \
+bs_BA/ISO-8859-2 \
+byn_ER/UTF-8 \
+ca_AD.UTF-8/UTF-8 \
+ca_AD/ISO-8859-15 \
+ca_ES.UTF-8/UTF-8 \
+ca_ES/ISO-8859-1 \
+ca_ES@euro/ISO-8859-15 \
+ca_ES@valencia/UTF-8 \
+ca_FR.UTF-8/UTF-8 \
+ca_FR/ISO-8859-15 \
+ca_IT.UTF-8/UTF-8 \
+ca_IT/ISO-8859-15 \
+ce_RU/UTF-8 \
+chr_US/UTF-8 \
+cmn_TW/UTF-8 \
+crh_UA/UTF-8 \
+cs_CZ.UTF-8/UTF-8 \
+cs_CZ/ISO-8859-2 \
+csb_PL/UTF-8 \
+cv_RU/UTF-8 \
+cy_GB.UTF-8/UTF-8 \
+cy_GB/ISO-8859-14 \
+da_DK.UTF-8/UTF-8 \
+da_DK/ISO-8859-1 \
+da_DK.ISO-8859-15/ISO-8859-15 \
+de_AT.UTF-8/UTF-8 \
+de_AT/ISO-8859-1 \
+de_AT@euro/ISO-8859-15 \
+de_BE.UTF-8/UTF-8 \
+de_BE/ISO-8859-1 \
+de_BE@euro/ISO-8859-15 \
+de_CH.UTF-8/UTF-8 \
+de_CH/ISO-8859-1 \
+de_DE.UTF-8/UTF-8 \
+de_DE/ISO-8859-1 \
+de_DE@euro/ISO-8859-15 \
+de_IT.UTF-8/UTF-8 \
+de_IT/ISO-8859-1 \
+de_LI.UTF-8/UTF-8 \
+de_LU.UTF-8/UTF-8 \
+de_LU/ISO-8859-1 \
+de_LU@euro/ISO-8859-15 \
+doi_IN/UTF-8 \
+dsb_DE/UTF-8 \
+dv_MV/UTF-8 \
+dz_BT/UTF-8 \
+el_GR.UTF-8/UTF-8 \
+el_GR/ISO-8859-7 \
+el_GR@euro/ISO-8859-7 \
+el_CY.UTF-8/UTF-8 \
+el_CY/ISO-8859-7 \
+en_AG/UTF-8 \
+en_AU.UTF-8/UTF-8 \
+en_AU/ISO-8859-1 \
+en_BW.UTF-8/UTF-8 \
+en_BW/ISO-8859-1 \
+en_CA.UTF-8/UTF-8 \
+en_CA/ISO-8859-1 \
+en_DK.UTF-8/UTF-8 \
+en_DK/ISO-8859-1 \
+en_GB.UTF-8/UTF-8 \
+en_GB/ISO-8859-1 \
+en_GB.ISO-8859-15/ISO-8859-15 \
+en_HK.UTF-8/UTF-8 \
+en_HK/ISO-8859-1 \
+en_IE.UTF-8/UTF-8 \
+en_IE/ISO-8859-1 \
+en_IE@euro/ISO-8859-15 \
+en_IL/UTF-8 \
+en_IN/UTF-8 \
+en_NG/UTF-8 \
+en_NZ.UTF-8/UTF-8 \
+en_NZ/ISO-8859-1 \
+en_PH.UTF-8/UTF-8 \
+en_PH/ISO-8859-1 \
+en_SC.UTF-8/UTF-8 \
+en_SG.UTF-8/UTF-8 \
+en_SG/ISO-8859-1 \
+en_US.UTF-8/UTF-8 \
+en_US/ISO-8859-1 \
+en_US.ISO-8859-15/ISO-8859-15 \
+en_US@ampm/UTF-8 \
+en_US.UTF-8@ampm/UTF-8 \
+en_ZA.UTF-8/UTF-8 \
+en_ZA/ISO-8859-1 \
+en_ZM/UTF-8 \
+en_ZW.UTF-8/UTF-8 \
+en_ZW/ISO-8859-1 \
+eo/UTF-8 \
+es_AR.UTF-8/UTF-8 \
+es_AR/ISO-8859-1 \
+es_BO.UTF-8/UTF-8 \
+es_BO/ISO-8859-1 \
+es_CL.UTF-8/UTF-8 \
+es_CL/ISO-8859-1 \
+es_CO.UTF-8/UTF-8 \
+es_CO/ISO-8859-1 \
+es_CR.UTF-8/UTF-8 \
+es_CR/ISO-8859-1 \
+es_CU/UTF-8 \
+es_DO.UTF-8/UTF-8 \
+es_DO/ISO-8859-1 \
+es_EC.UTF-8/UTF-8 \
+es_EC/ISO-8859-1 \
+es_ES.UTF-8/UTF-8 \
+es_ES/ISO-8859-1 \
+es_ES@euro/ISO-8859-15 \
+es_GT.UTF-8/UTF-8 \
+es_GT/ISO-8859-1 \
+es_HN.UTF-8/UTF-8 \
+es_HN/ISO-8859-1 \
+es_MX.UTF-8/UTF-8 \
+es_MX/ISO-8859-1 \
+es_NI.UTF-8/UTF-8 \
+es_NI/ISO-8859-1 \
+es_PA.UTF-8/UTF-8 \
+es_PA/ISO-8859-1 \
+es_PE.UTF-8/UTF-8 \
+es_PE/ISO-8859-1 \
+es_PR.UTF-8/UTF-8 \
+es_PR/ISO-8859-1 \
+es_PY.UTF-8/UTF-8 \
+es_PY/ISO-8859-1 \
+es_SV.UTF-8/UTF-8 \
+es_SV/ISO-8859-1 \
+es_US.UTF-8/UTF-8 \
+es_US/ISO-8859-1 \
+es_UY.UTF-8/UTF-8 \
+es_UY/ISO-8859-1 \
+es_VE.UTF-8/UTF-8 \
+es_VE/ISO-8859-1 \
+et_EE.UTF-8/UTF-8 \
+et_EE/ISO-8859-1 \
+et_EE.ISO-8859-15/ISO-8859-15 \
+eu_ES.UTF-8/UTF-8 \
+eu_ES/ISO-8859-1 \
+eu_ES@euro/ISO-8859-15 \
+fa_IR/UTF-8 \
+ff_SN/UTF-8 \
+fi_FI.UTF-8/UTF-8 \
+fi_FI/ISO-8859-1 \
+fi_FI@euro/ISO-8859-15 \
+fil_PH/UTF-8 \
+fo_FO.UTF-8/UTF-8 \
+fo_FO/ISO-8859-1 \
+fr_BE.UTF-8/UTF-8 \
+fr_BE/ISO-8859-1 \
+fr_BE@euro/ISO-8859-15 \
+fr_CA.UTF-8/UTF-8 \
+fr_CA/ISO-8859-1 \
+fr_CH.UTF-8/UTF-8 \
+fr_CH/ISO-8859-1 \
+fr_FR.UTF-8/UTF-8 \
+fr_FR/ISO-8859-1 \
+fr_FR@euro/ISO-8859-15 \
+fr_LU.UTF-8/UTF-8 \
+fr_LU/ISO-8859-1 \
+fr_LU@euro/ISO-8859-15 \
+fur_IT/UTF-8 \
+fy_NL/UTF-8 \
+fy_DE/UTF-8 \
+ga_IE.UTF-8/UTF-8 \
+ga_IE/ISO-8859-1 \
+ga_IE@euro/ISO-8859-15 \
+gd_GB.UTF-8/UTF-8 \
+gd_GB/ISO-8859-15 \
+gez_ER/UTF-8 \
+gez_ER@abegede/UTF-8 \
+gez_ET/UTF-8 \
+gez_ET@abegede/UTF-8 \
+gl_ES.UTF-8/UTF-8 \
+gl_ES/ISO-8859-1 \
+gl_ES@euro/ISO-8859-15 \
+gu_IN/UTF-8 \
+gv_GB.UTF-8/UTF-8 \
+gv_GB/ISO-8859-1 \
+ha_NG/UTF-8 \
+hak_TW/UTF-8 \
+he_IL.UTF-8/UTF-8 \
+he_IL/ISO-8859-8 \
+hi_IN/UTF-8 \
+hif_FJ/UTF-8 \
+hne_IN/UTF-8 \
+hr_HR.UTF-8/UTF-8 \
+hr_HR/ISO-8859-2 \
+hsb_DE/ISO-8859-2 \
+hsb_DE.UTF-8/UTF-8 \
+ht_HT/UTF-8 \
+hu_HU.UTF-8/UTF-8 \
+hu_HU/ISO-8859-2 \
+hy_AM/UTF-8 \
+hy_AM.ARMSCII-8/ARMSCII-8 \
+ia_FR/UTF-8 \
+id_ID.UTF-8/UTF-8 \
+id_ID/ISO-8859-1 \
+ig_NG/UTF-8 \
+ik_CA/UTF-8 \
+is_IS.UTF-8/UTF-8 \
+is_IS/ISO-8859-1 \
+it_CH.UTF-8/UTF-8 \
+it_CH/ISO-8859-1 \
+it_IT.UTF-8/UTF-8 \
+it_IT/ISO-8859-1 \
+it_IT@euro/ISO-8859-15 \
+iu_CA/UTF-8 \
+ja_JP.EUC-JP/EUC-JP \
+ja_JP.UTF-8/UTF-8 \
+ka_GE.UTF-8/UTF-8 \
+ka_GE/GEORGIAN-PS \
+kab_DZ/UTF-8 \
+kk_KZ.UTF-8/UTF-8 \
+kk_KZ/PT154 \
+kl_GL.UTF-8/UTF-8 \
+kl_GL/ISO-8859-1 \
+km_KH/UTF-8 \
+kn_IN/UTF-8 \
+ko_KR.EUC-KR/EUC-KR \
+ko_KR.UTF-8/UTF-8 \
+kok_IN/UTF-8 \
+ks_IN/UTF-8 \
+ks_IN@devanagari/UTF-8 \
+ku_TR.UTF-8/UTF-8 \
+ku_TR/ISO-8859-9 \
+kw_GB.UTF-8/UTF-8 \
+kw_GB/ISO-8859-1 \
+ky_KG/UTF-8 \
+lb_LU/UTF-8 \
+lg_UG.UTF-8/UTF-8 \
+lg_UG/ISO-8859-10 \
+li_BE/UTF-8 \
+li_NL/UTF-8 \
+lij_IT/UTF-8 \
+ln_CD/UTF-8 \
+lo_LA/UTF-8 \
+lt_LT.UTF-8/UTF-8 \
+lt_LT/ISO-8859-13 \
+lv_LV.UTF-8/UTF-8 \
+lv_LV/ISO-8859-13 \
+lzh_TW/UTF-8 \
+mag_IN/UTF-8 \
+mai_IN/UTF-8 \
+mai_NP/UTF-8 \
+mfe_MU/UTF-8 \
+mg_MG.UTF-8/UTF-8 \
+mg_MG/ISO-8859-15 \
+mhr_RU/UTF-8 \
+mi_NZ.UTF-8/UTF-8 \
+mi_NZ/ISO-8859-13 \
+miq_NI/UTF-8 \
+mjw_IN/UTF-8 \
+mk_MK.UTF-8/UTF-8 \
+mk_MK/ISO-8859-5 \
+ml_IN/UTF-8 \
+mn_MN/UTF-8 \
+mni_IN/UTF-8 \
+mr_IN/UTF-8 \
+ms_MY.UTF-8/UTF-8 \
+ms_MY/ISO-8859-1 \
+mt_MT.UTF-8/UTF-8 \
+mt_MT/ISO-8859-3 \
+my_MM/UTF-8 \
+nan_TW/UTF-8 \
+nan_TW@latin/UTF-8 \
+nb_NO.UTF-8/UTF-8 \
+nb_NO/ISO-8859-1 \
+nds_DE/UTF-8 \
+nds_NL/UTF-8 \
+ne_NP/UTF-8 \
+nhn_MX/UTF-8 \
+niu_NU/UTF-8 \
+niu_NZ/UTF-8 \
+nl_AW/UTF-8 \
+nl_BE.UTF-8/UTF-8 \
+nl_BE/ISO-8859-1 \
+nl_BE@euro/ISO-8859-15 \
+nl_NL.UTF-8/UTF-8 \
+nl_NL/ISO-8859-1 \
+nl_NL@euro/ISO-8859-15 \
+nn_NO.UTF-8/UTF-8 \
+nn_NO/ISO-8859-1 \
+nr_ZA/UTF-8 \
+nso_ZA/UTF-8 \
+oc_FR.UTF-8/UTF-8 \
+oc_FR/ISO-8859-1 \
+om_ET/UTF-8 \
+om_KE.UTF-8/UTF-8 \
+om_KE/ISO-8859-1 \
+or_IN/UTF-8 \
+os_RU/UTF-8 \
+pa_IN/UTF-8 \
+pa_PK/UTF-8 \
+pap_AW/UTF-8 \
+pap_CW/UTF-8 \
+pl_PL.UTF-8/UTF-8 \
+pl_PL/ISO-8859-2 \
+ps_AF/UTF-8 \
+pt_BR.UTF-8/UTF-8 \
+pt_BR/ISO-8859-1 \
+pt_PT.UTF-8/UTF-8 \
+pt_PT/ISO-8859-1 \
+pt_PT@euro/ISO-8859-15 \
+quz_PE/UTF-8 \
+raj_IN/UTF-8 \
+ro_RO.UTF-8/UTF-8 \
+ro_RO/ISO-8859-2 \
+ru_RU.KOI8-R/KOI8-R \
+ru_RU.UTF-8/UTF-8 \
+ru_RU/ISO-8859-5 \
+ru_UA.UTF-8/UTF-8 \
+ru_UA/KOI8-U \
+rw_RW/UTF-8 \
+sa_IN/UTF-8 \
+sah_RU/UTF-8 \
+sat_IN/UTF-8 \
+sc_IT/UTF-8 \
+sd_IN/UTF-8 \
+sd_IN@devanagari/UTF-8 \
+se_NO/UTF-8 \
+sgs_LT/UTF-8 \
+shn_MM/UTF-8 \
+shs_CA/UTF-8 \
+si_LK/UTF-8 \
+sid_ET/UTF-8 \
+sk_SK.UTF-8/UTF-8 \
+sk_SK/ISO-8859-2 \
+sl_SI.UTF-8/UTF-8 \
+sl_SI/ISO-8859-2 \
+sm_WS/UTF-8 \
+so_DJ.UTF-8/UTF-8 \
+so_DJ/ISO-8859-1 \
+so_ET/UTF-8 \
+so_KE.UTF-8/UTF-8 \
+so_KE/ISO-8859-1 \
+so_SO.UTF-8/UTF-8 \
+so_SO/ISO-8859-1 \
+sq_AL.UTF-8/UTF-8 \
+sq_AL/ISO-8859-1 \
+sq_MK/UTF-8 \
+sr_ME/UTF-8 \
+sr_RS/UTF-8 \
+sr_RS@latin/UTF-8 \
+ss_ZA/UTF-8 \
+st_ZA.UTF-8/UTF-8 \
+st_ZA/ISO-8859-1 \
+sv_FI.UTF-8/UTF-8 \
+sv_FI/ISO-8859-1 \
+sv_FI@euro/ISO-8859-15 \
+sv_SE.UTF-8/UTF-8 \
+sv_SE/ISO-8859-1 \
+sv_SE.ISO-8859-15/ISO-8859-15 \
+sw_KE/UTF-8 \
+sw_TZ/UTF-8 \
+szl_PL/UTF-8 \
+ta_IN/UTF-8 \
+ta_LK/UTF-8 \
+tcy_IN.UTF-8/UTF-8 \
+te_IN/UTF-8 \
+tg_TJ.UTF-8/UTF-8 \
+tg_TJ/KOI8-T \
+th_TH.UTF-8/UTF-8 \
+th_TH/TIS-620 \
+the_NP/UTF-8 \
+ti_ER/UTF-8 \
+ti_ET/UTF-8 \
+tig_ER/UTF-8 \
+tk_TM/UTF-8 \
+tl_PH.UTF-8/UTF-8 \
+tl_PH/ISO-8859-1 \
+tn_ZA/UTF-8 \
+to_TO/UTF-8 \
+tpi_PG/UTF-8 \
+tr_CY.UTF-8/UTF-8 \
+tr_CY/ISO-8859-9 \
+tr_TR.UTF-8/UTF-8 \
+tr_TR/ISO-8859-9 \
+ts_ZA/UTF-8 \
+tt_RU/UTF-8 \
+tt_RU@iqtelif/UTF-8 \
+ug_CN/UTF-8 \
+uk_UA.UTF-8/UTF-8 \
+uk_UA/KOI8-U \
+unm_US/UTF-8 \
+ur_IN/UTF-8 \
+ur_PK/UTF-8 \
+uz_UZ.UTF-8/UTF-8 \
+uz_UZ/ISO-8859-1 \
+uz_UZ@cyrillic/UTF-8 \
+ve_ZA/UTF-8 \
+vi_VN/UTF-8 \
+wa_BE/ISO-8859-1 \
+wa_BE@euro/ISO-8859-15 \
+wa_BE.UTF-8/UTF-8 \
+wae_CH/UTF-8 \
+wal_ET/UTF-8 \
+wo_SN/UTF-8 \
+xh_ZA.UTF-8/UTF-8 \
+xh_ZA/ISO-8859-1 \
+yi_US.UTF-8/UTF-8 \
+yi_US/CP1255 \
+yo_NG/UTF-8 \
+yue_HK/UTF-8 \
+yuw_PG/UTF-8 \
+zh_CN.GB18030/GB18030 \
+zh_CN.GBK/GBK \
+zh_CN.UTF-8/UTF-8 \
+zh_CN/GB2312 \
+zh_HK.UTF-8/UTF-8 \
+zh_HK/BIG5-HKSCS \
+zh_SG.UTF-8/UTF-8 \
+zh_SG.GBK/GBK \
+zh_SG/GB2312 \
+zh_TW.EUC-TW/EUC-TW \
+zh_TW.UTF-8/UTF-8 \
+zh_TW/BIG5 \
+zu_ZA.UTF-8/UTF-8 \
+zu_ZA/ISO-8859-1 \
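
Every entry above is a locale-name/charmap pair joined by a slash, and the trailing backslashes are line continuations building up the SUPPORTED-LOCALES make variable. A minimal sketch of how one entry decomposes into the pieces a localedef invocation would take (split_supported is a hypothetical helper written for illustration, not glibc code):

/* Split a SUPPORTED entry such as "de_DE.UTF-8/UTF-8 \" into the locale
   name and the charmap, roughly the decomposition the glibc build makes
   before running localedef.  Illustration only.  */
#include <stdio.h>
#include <string.h>

static int split_supported (const char *entry, char *name, char *charmap)
{
  const char *slash = strchr (entry, '/');
  if (slash == NULL)
    return -1;                               /* malformed entry */
  size_t nlen = (size_t) (slash - entry);
  memcpy (name, entry, nlen);
  name[nlen] = '\0';
  size_t clen = strcspn (slash + 1, " \\\n"); /* stop before the " \" tail */
  memcpy (charmap, slash + 1, clen);
  charmap[clen] = '\0';
  return 0;
}

int main (void)
{
  char name[64], charmap[64];
  if (split_supported ("de_DE.UTF-8/UTF-8 \\", name, charmap) == 0)
    {
      /* The input definition is the name minus any ".charset@modifier".
	 Prints: localedef -f UTF-8 -i de_DE de_DE.UTF-8 */
      size_t ilen = strcspn (name, ".@");
      printf ("localedef -f %s -i %.*s %s\n", charmap, (int) ilen, name, name);
    }
  return 0;
}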
							
								
								
									
862  SOURCES/build-locale-archive.c  Normal file

@@ -0,0 +1,862 @@
+#define _GNU_SOURCE
+#include <assert.h>
+#include <dirent.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <locale.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <getopt.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include "../locale/hashval.h"
+#define __LC_LAST 13
+#include "../locale/locarchive.h"
+#include "../crypt/md5.h"
+
+const char *alias_file = DATADIR "/locale/locale.alias";
+const char *locar_file = PREFIX "/lib/locale/locale-archive";
+const char *tmpl_file = PREFIX "/lib/locale/locale-archive.tmpl";
+const char *loc_path = PREFIX "/lib/locale/";
+/* Flags set by `--verbose` option.  */
+int be_quiet = 1;
+int verbose = 0;
+int max_locarchive_open_retry = 10;
+const char *output_prefix;
+
+/* Endianness should have been taken care of by localedef.  We don't need to do
+   additional swapping.  We need this variable exported however, since
+   locarchive.c uses it to determine if it needs to swap endianness of a value
+   before writing to or reading from the archive.  */
+bool swap_endianness_p = false;
+
+static const char *locnames[] =
+  {
+#define DEFINE_CATEGORY(category, category_name, items, a) \
+  [category] = category_name,
+#include "../locale/categories.def"
+#undef  DEFINE_CATEGORY
+  };
+
+static int
+is_prime (unsigned long candidate)
+{
+  /* No even number and none less than 10 will be passed here.  */
+  unsigned long int divn = 3;
+  unsigned long int sq = divn * divn;
+
+  while (sq < candidate && candidate % divn != 0)
+    {
+      ++divn;
+      sq += 4 * divn;
+      ++divn;
+    }
+
+  return candidate % divn != 0;
+}
+
+unsigned long
+next_prime (unsigned long seed)
+{
+  /* Make it definitely odd.  */
+  seed |= 1;
+
+  while (!is_prime (seed))
+    seed += 2;
+
+  return seed;
+}
+
+void
+error (int status, int errnum, const char *message, ...)
+{
+  va_list args;
+
+  va_start (args, message);
+  fflush (stdout);
+  fprintf (stderr, "%s: ", program_invocation_name);
+  vfprintf (stderr, message, args);
+  va_end (args);
+  if (errnum)
+    fprintf (stderr, ": %s", strerror (errnum));
+  putc ('\n', stderr);
+  fflush (stderr);
+  if (status)
+    exit (errnum == EROFS ? 0 : status);
+}
+
+void *
+xmalloc (size_t size)
+{
+  void *p = malloc (size);
+  if (p == NULL)
+    error (EXIT_FAILURE, errno, "could not allocate %zd bytes of memory", size);
+  return p;
+}
+
+static void
+open_tmpl_archive (struct locarhandle *ah)
+{
+  struct stat64 st;
+  int fd;
+  struct locarhead head;
+  const char *archivefname = ah->fname == NULL ? tmpl_file : ah->fname;
+
+  /* Open the archive.  We must have exclusive write access.  */
+  fd = open64 (archivefname, O_RDONLY);
+  if (fd == -1)
+    error (EXIT_FAILURE, errno, "cannot open locale archive template file \"%s\"",
+	   archivefname);
+
+  if (fstat64 (fd, &st) < 0)
+    error (EXIT_FAILURE, errno, "cannot stat locale archive template file \"%s\"",
+	   archivefname);
+
+  /* Read the header.  */
+  if (TEMP_FAILURE_RETRY (read (fd, &head, sizeof (head))) != sizeof (head))
+    error (EXIT_FAILURE, errno, "cannot read archive header");
+
+  ah->fd = fd;
+  ah->mmaped = (head.sumhash_offset
+		+ head.sumhash_size * sizeof (struct sumhashent));
+  if (ah->mmaped > (unsigned long) st.st_size)
+    error (EXIT_FAILURE, 0, "locale archive template file truncated");
+  ah->mmaped = st.st_size;
+  ah->reserved = st.st_size;
+
+  /* Now we know how large the administrative information part is.
+     Map all of it.  */
+  ah->addr = mmap64 (NULL, ah->mmaped, PROT_READ, MAP_SHARED, fd, 0);
+  if (ah->addr == MAP_FAILED)
+    error (EXIT_FAILURE, errno, "cannot map archive header");
+}
+
+/* Open the locale archive.  */
+extern void open_archive (struct locarhandle *ah, bool readonly);
+
+/* Close the locale archive.  */
+extern void close_archive (struct locarhandle *ah);
+
+/* Add given locale data to the archive.  */
+extern int add_locale_to_archive (struct locarhandle *ah, const char *name,
+				  locale_data_t data, bool replace);
+
+extern void add_alias (struct locarhandle *ah, const char *alias,
+		       bool replace, const char *oldname,
+		       uint32_t *locrec_offset_p);
+
+extern struct namehashent *
+insert_name (struct locarhandle *ah,
+	     const char *name, size_t name_len, bool replace);
+
+struct nameent
+{
+  char *name;
+  struct locrecent *locrec;
+};
+
+struct dataent
+{
+  const unsigned char *sum;
+  uint32_t file_offset;
+};
+
+static int
+nameentcmp (const void *a, const void *b)
+{
+  struct locrecent *la = ((const struct nameent *) a)->locrec;
+  struct locrecent *lb = ((const struct nameent *) b)->locrec;
+  uint32_t start_a = -1, end_a = 0;
+  uint32_t start_b = -1, end_b = 0;
+  int cnt;
+
+  for (cnt = 0; cnt < __LC_LAST; ++cnt)
+    if (cnt != LC_ALL)
+      {
+	if (la->record[cnt].offset < start_a)
+	  start_a = la->record[cnt].offset;
+	if (la->record[cnt].offset + la->record[cnt].len > end_a)
+	  end_a = la->record[cnt].offset + la->record[cnt].len;
+      }
+  assert (start_a != (uint32_t)-1);
+  assert (end_a != 0);
+
+  for (cnt = 0; cnt < __LC_LAST; ++cnt)
+    if (cnt != LC_ALL)
+      {
+	if (lb->record[cnt].offset < start_b)
+	  start_b = lb->record[cnt].offset;
+	if (lb->record[cnt].offset + lb->record[cnt].len > end_b)
+	  end_b = lb->record[cnt].offset + lb->record[cnt].len;
+      }
+  assert (start_b != (uint32_t)-1);
+  assert (end_b != 0);
+
+  if (start_a != start_b)
+    return (int)start_a - (int)start_b;
+  return (int)end_a - (int)end_b;
+}
+
+static int
+dataentcmp (const void *a, const void *b)
+{
+  if (((const struct dataent *) a)->file_offset
+      < ((const struct dataent *) b)->file_offset)
+    return -1;
+
+  if (((const struct dataent *) a)->file_offset
+      > ((const struct dataent *) b)->file_offset)
+    return 1;
+
+  return 0;
+}
+
+static int
+sumsearchfn (const void *key, const void *ent)
+{
+  uint32_t keyn = *(uint32_t *)key;
+  uint32_t entn = ((struct dataent *)ent)->file_offset;
+
+  if (keyn < entn)
+    return -1;
+  if (keyn > entn)
+    return 1;
+  return 0;
+}
+
+static void
+compute_data (struct locarhandle *ah, struct nameent *name, size_t sumused,
+	      struct dataent *files, locale_data_t data)
+{
+  int cnt;
+  struct locrecent *locrec = name->locrec;
+  struct dataent *file;
+  data[LC_ALL].addr = ((char *) ah->addr) + locrec->record[LC_ALL].offset;
+  data[LC_ALL].size = locrec->record[LC_ALL].len;
+  for (cnt = 0; cnt < __LC_LAST; ++cnt)
+    if (cnt != LC_ALL)
+      {
+	data[cnt].addr = ((char *) ah->addr) + locrec->record[cnt].offset;
+	data[cnt].size = locrec->record[cnt].len;
+	if (data[cnt].addr >= data[LC_ALL].addr
+	    && data[cnt].addr + data[cnt].size
+	       <= data[LC_ALL].addr + data[LC_ALL].size)
+	  __md5_buffer (data[cnt].addr, data[cnt].size, data[cnt].sum);
+	else
+	  {
+	    file = bsearch (&locrec->record[cnt].offset, files, sumused,
+			    sizeof (*files), sumsearchfn);
+	    if (file == NULL)
+	      error (EXIT_FAILURE, 0, "inconsistent template file");
+	    memcpy (data[cnt].sum, file->sum, sizeof (data[cnt].sum));
+	  }
+      }
+}
+
+static int
+fill_archive (struct locarhandle *tmpl_ah,
+	      const char *fname,
+	      size_t install_langs_count, char *install_langs_list[],
+	      size_t nlist, char *list[],
+	      const char *primary)
+{
+  struct locarhandle ah;
+  struct locarhead *head;
+  int result = 0;
+  struct nameent *names;
+  struct namehashent *namehashtab;
+  size_t cnt, used;
+  struct dataent *files;
+  struct sumhashent *sumhashtab;
+  size_t sumused;
+  struct locrecent *primary_locrec = NULL;
+  struct nameent *primary_nameent = NULL;
+
+  head = tmpl_ah->addr;
+  names = (struct nameent *) malloc (head->namehash_used
+				     * sizeof (struct nameent));
+  files = (struct dataent *) malloc (head->sumhash_used
+				     * sizeof (struct dataent));
+  if (names == NULL || files == NULL)
+    error (EXIT_FAILURE, errno, "could not allocate tables");
+
+  namehashtab = (struct namehashent *) ((char *) tmpl_ah->addr
+					+ head->namehash_offset);
+  sumhashtab = (struct sumhashent *) ((char *) tmpl_ah->addr
+				      + head->sumhash_offset);
+
+  for (cnt = used = 0; cnt < head->namehash_size; ++cnt)
+    if (namehashtab[cnt].locrec_offset != 0)
+      {
+	char * name;
+	int i;
+	assert (used < head->namehash_used);
+        name = tmpl_ah->addr + namehashtab[cnt].name_offset;
+        if (install_langs_count == 0)
+          {
+	    /* Always install the entry.  */
+            names[used].name = name;
+            names[used++].locrec
+                = (struct locrecent *) ((char *) tmpl_ah->addr +
+                                        namehashtab[cnt].locrec_offset);
+          }
+        else
+          {
+	    /* Only install the entry if the user asked for it via
+	       --install-langs.  */
+            for (i = 0; i < install_langs_count; i++)
+              {
+		/* Add one for "_" and one for the null terminator.  */
+		size_t len = strlen (install_langs_list[i]) + 2;
+		char *install_lang = (char *)xmalloc (len);
+                strcpy (install_lang, install_langs_list[i]);
+                if (strchr (install_lang, '_') == NULL)
+                  strcat (install_lang, "_");
+                if (strncmp (name, install_lang, strlen (install_lang)) == 0)
+                  {
+                    names[used].name = name;
+                    names[used++].locrec
+		      = (struct locrecent *) ((char *)tmpl_ah->addr
+					      + namehashtab[cnt].locrec_offset);
+                  }
+		free (install_lang);
+              }
+          }
+      }
+
+  /* Sort the names.  */
+  qsort (names, used, sizeof (struct nameent), nameentcmp);
+
+  for (cnt = sumused = 0; cnt < head->sumhash_size; ++cnt)
+    if (sumhashtab[cnt].file_offset != 0)
+      {
+	assert (sumused < head->sumhash_used);
+	files[sumused].sum = (const unsigned char *) sumhashtab[cnt].sum;
+	files[sumused++].file_offset = sumhashtab[cnt].file_offset;
+      }
+
+  /* Sort by file locations.  */
+  qsort (files, sumused, sizeof (struct dataent), dataentcmp);
+
+  /* Open the archive.  This call never returns if we cannot
+     successfully open the archive.  */
+  ah.fname = NULL;
+  if (fname != NULL)
+    ah.fname = fname;
+  open_archive (&ah, false);
+
+  if (primary != NULL)
+    {
+      for (cnt = 0; cnt < used; ++cnt)
+	if (strcmp (names[cnt].name, primary) == 0)
+	  break;
+      if (cnt < used)
+	{
+	  locale_data_t data;
+
+	  compute_data (tmpl_ah, &names[cnt], sumused, files, data);
+	  result |= add_locale_to_archive (&ah, primary, data, 0);
+	  primary_locrec = names[cnt].locrec;
+	  primary_nameent = &names[cnt];
+	}
+    }
+
+  for (cnt = 0; cnt < used; ++cnt)
+    if (&names[cnt] == primary_nameent)
+      continue;
+    else if ((cnt > 0 && names[cnt - 1].locrec == names[cnt].locrec)
+	     || names[cnt].locrec == primary_locrec)
+      {
+	const char *oldname;
+	struct namehashent *namehashent;
+	uint32_t locrec_offset;
+
+	if (names[cnt].locrec == primary_locrec)
+	  oldname = primary;
+	else
+	  oldname = names[cnt - 1].name;
+	namehashent = insert_name (&ah, oldname, strlen (oldname), true);
+	assert (namehashent->name_offset != 0);
+	assert (namehashent->locrec_offset != 0);
+	locrec_offset = namehashent->locrec_offset;
+	add_alias (&ah, names[cnt].name, 0, oldname, &locrec_offset);
+      }
+    else
+      {
+	locale_data_t data;
+
+	compute_data (tmpl_ah, &names[cnt], sumused, files, data);
+	result |= add_locale_to_archive (&ah, names[cnt].name, data, 0);
+      }
+
+  while (nlist-- > 0)
+    {
+      const char *fname = *list++;
+      size_t fnamelen = strlen (fname);
+      struct stat64 st;
+      DIR *dirp;
+      struct dirent64 *d;
+      int seen;
+      locale_data_t data;
+      int cnt;
+
+      /* First see whether this really is a directory and whether it
+	 contains all the required locale category files.  */
+      if (stat64 (fname, &st) < 0)
+	{
+	  error (0, 0, "stat of \"%s\" failed: %s: ignored", fname,
+		 strerror (errno));
+	  continue;
+	}
+      if (!S_ISDIR (st.st_mode))
+	{
+	  error (0, 0, "\"%s\" is no directory; ignored", fname);
+	  continue;
+	}
+
+      dirp = opendir (fname);
+      if (dirp == NULL)
+	{
+	  error (0, 0, "cannot open directory \"%s\": %s: ignored",
+		 fname, strerror (errno));
+	  continue;
+	}
+
+      seen = 0;
+      while ((d = readdir64 (dirp)) != NULL)
+	{
+	  for (cnt = 0; cnt < __LC_LAST; ++cnt)
+	    if (cnt != LC_ALL)
+	      if (strcmp (d->d_name, locnames[cnt]) == 0)
+		{
+		  unsigned char d_type;
+
+		  /* We have an object of the required name.  If it's
+		     a directory we have to look at a file with the
+		     prefix "SYS_".  Otherwise we have found what we
+		     are looking for.  */
+#ifdef _DIRENT_HAVE_D_TYPE
+		  d_type = d->d_type;
+
+		  if (d_type != DT_REG)
+#endif
+		    {
+		      char fullname[fnamelen + 2 * strlen (d->d_name) + 7];
+
+#ifdef _DIRENT_HAVE_D_TYPE
+		      if (d_type == DT_UNKNOWN || d_type == DT_LNK)
+#endif
+			{
+			  strcpy (stpcpy (stpcpy (fullname, fname), "/"),
+				  d->d_name);
+
+			  if (stat64 (fullname, &st) == -1)
+			    /* We cannot stat the file, ignore it.  */
+			    break;
+
+			  d_type = IFTODT (st.st_mode);
+			}
+
+		      if (d_type == DT_DIR)
+			{
+			  /* We have to do more tests.  The file is a
+			     directory and it therefore must contain a
+			     regular file with the same name except a
+			     "SYS_" prefix.  */
+			  char *t = stpcpy (stpcpy (fullname, fname), "/");
+			  strcpy (stpcpy (stpcpy (t, d->d_name), "/SYS_"),
+				  d->d_name);
+
+			  if (stat64 (fullname, &st) == -1)
+			    /* There is no SYS_* file or we cannot
+			       access it.  */
+			    break;
+
+			  d_type = IFTODT (st.st_mode);
+			}
+		    }
+
+		  /* If we found a regular file (eventually after
+		     following a symlink) we are successful.  */
+		  if (d_type == DT_REG)
+		    ++seen;
+		  break;
+		}
+	}
+
+      closedir (dirp);
+
+      if (seen != __LC_LAST - 1)
+	{
+	  /* We don't have all locale category files.  Ignore the name.  */
+	  error (0, 0, "incomplete set of locale files in \"%s\"",
+		 fname);
+	  continue;
+	}
+
+      /* Add the files to the archive.  To do this we first compute
+	 sizes and the MD5 sums of all the files.  */
+      for (cnt = 0; cnt < __LC_LAST; ++cnt)
+	if (cnt != LC_ALL)
+	  {
+	    char fullname[fnamelen + 2 * strlen (locnames[cnt]) + 7];
+	    int fd;
+
+	    strcpy (stpcpy (stpcpy (fullname, fname), "/"), locnames[cnt]);
+	    fd = open64 (fullname, O_RDONLY);
+	    if (fd == -1 || fstat64 (fd, &st) == -1)
+	      {
+		/* Cannot read the file.  */
+		if (fd != -1)
+		  close (fd);
+		break;
+	      }
+
+	    if (S_ISDIR (st.st_mode))
+	      {
+		char *t;
+		close (fd);
+		t = stpcpy (stpcpy (fullname, fname), "/");
+		strcpy (stpcpy (stpcpy (t, locnames[cnt]), "/SYS_"),
+			locnames[cnt]);
+
+		fd = open64 (fullname, O_RDONLY);
+		if (fd == -1 || fstat64 (fd, &st) == -1
+		    || !S_ISREG (st.st_mode))
+		  {
+		    if (fd != -1)
+		      close (fd);
+		    break;
+		  }
+	      }
+
+	    /* Map the file.  */
+	    data[cnt].addr = mmap64 (NULL, st.st_size, PROT_READ, MAP_SHARED,
+				     fd, 0);
+	    if (data[cnt].addr == MAP_FAILED)
+	      {
+		/* Cannot map it.  */
+		close (fd);
+		break;
+	      }
+
+	    data[cnt].size = st.st_size;
+	    __md5_buffer (data[cnt].addr, st.st_size, data[cnt].sum);
+
+	    /* We don't need the file descriptor anymore.  */
+	    close (fd);
+	  }
+
+      if (cnt != __LC_LAST)
+	{
+	  while (cnt-- > 0)
+	    if (cnt != LC_ALL)
+	      munmap (data[cnt].addr, data[cnt].size);
+
+	  error (0, 0, "cannot read all files in \"%s\": ignored", fname);
+
+	  continue;
+	}
+
+      result |= add_locale_to_archive (&ah, basename (fname), data, 0);
+
+      for (cnt = 0; cnt < __LC_LAST; ++cnt)
+	if (cnt != LC_ALL)
+	  munmap (data[cnt].addr, data[cnt].size);
+    }
+
+  /* We are done.  */
+  close_archive (&ah);
+
+  return result;
+}
+
+void usage()
+{
+  printf ("\
+Usage: build-locale-archive [OPTION]... [TEMPLATE-FILE] [ARCHIVE-FILE]\n\
+ Builds a locale archive from a template file.\n\
+ Options:\n\
+  -h, --help                 Print this usage message.\n\
+  -v, --verbose              Verbose execution.\n\
+  -l, --install-langs=LIST   Only include locales given in LIST into the\n\
+                             locale archive.  LIST is a colon separated list\n\
+                             of locale prefixes, for example \"de:en:ja\".\n\
+                             The special argument \"all\" means to install\n\
+                             all languages and it must be present by itself.\n\
+                             If \"all\" is present with any other language it\n\
+                             will be treated as the name of a locale.\n\
+                             If the --install-langs option is missing, all\n\
+                             locales are installed. The colon separated list\n\
+                             can contain any strings matching the beginning of\n\
+                             locale names.\n\
+                             If a string does not contain a \"_\", it is added.\n\
+                             Examples:\n\
+                               --install-langs=\"en\"\n\
+                                 installs en_US, en_US.iso88591,\n\
+                                 en_US.iso885915, en_US.utf8,\n\
+                                 en_GB ...\n\
+                               --install-langs=\"en_US.utf8\"\n\
+                                 installs only en_US.utf8.\n\
+                               --install-langs=\"ko\"\n\
+                                 installs ko_KR, ko_KR.euckr,\n\
+                                 ko_KR.utf8 but *not* kok_IN\n\
+                                 because \"ko\" does not contain\n\
+                                 \"_\" and it is silently added\n\
+                               --install-langs=\"ko:kok\"\n\
+                                 installs ko_KR, ko_KR.euckr,\n\
+                                 ko_KR.utf8, kok_IN, and\n\
+                                 kok_IN.utf8.\n\
+                               --install-langs=\"POSIX\" will\n\
+                                 install *no* locales at all\n\
+                                 because POSIX matches none of\n\
+                                 the locales. Actually, any string\n\
+                                 matching nothing will do that.\n\
+                                 POSIX and C will always be\n\
+                                 available because they are\n\
+                                 builtin.\n\
+                             Aliases are installed as well,\n\
+                             i.e. --install-langs=\"de\"\n\
+                             will install not only every locale starting with\n\
+                             \"de\" but also the aliases \"deutsch\"\n\
+                             and \"german\" although the latter does not\n\
+                             start with \"de\".\n\
+\n\
+  If the arguments TEMPLATE-FILE and ARCHIVE-FILE are not given, the locations\n\
+  where the installed glibc expects these files are used by default.\n\
+");
+}
+
+int main (int argc, char *argv[])
+{
+  char path[4096];
+  DIR *dirp;
+  struct dirent64 *d;
+  struct stat64 st;
+  char *list[16384], *primary;
+  char *lang;
+  int install_langs_count = 0;
+  int i;
+  char *install_langs_arg, *ila_start;
+  char **install_langs_list = NULL;
+  unsigned int cnt = 0;
+  struct locarhandle tmpl_ah;
+  char *new_locar_fname = NULL;
+  size_t loc_path_len = strlen (loc_path);
+
+  while (1)
+    {
+      int c;
+
+      static struct option long_options[] =
+        {
+            {"help",            no_argument,       0, 'h'},
+            {"verbose",         no_argument,       0, 'v'},
+            {"install-langs",   required_argument, 0, 'l'},
+            {0, 0, 0, 0}
+        };
+      /* getopt_long stores the option index here. */
+      int option_index = 0;
+
+      c = getopt_long (argc, argv, "vhl:",
+                       long_options, &option_index);
+
+      /* Detect the end of the options. */
+      if (c == -1)
+        break;
+
+      switch (c)
+        {
+        case 0:
+          printf ("unknown option %s", long_options[option_index].name);
+          if (optarg)
+            printf (" with arg %s", optarg);
+          printf ("\n");
+          usage ();
+          exit (1);
+
+        case 'v':
+          verbose = 1;
+          be_quiet = 0;
+          break;
+
+        case 'h':
+          usage ();
+          exit (0);
+
+        case 'l':
+          install_langs_arg = ila_start = strdup (optarg);
+          /* If the argument to --install-langs is "all", do
+             not limit the list of languages to install and install
+             them all.  We do not support installing a single locale
+	     called "all".  */
+#define MAGIC_INSTALL_ALL "all"
+          if (install_langs_arg != NULL
+	      && install_langs_arg[0] != '\0'
+	      && !(strncmp(install_langs_arg, MAGIC_INSTALL_ALL,
+			   strlen(MAGIC_INSTALL_ALL)) == 0
+		   && strlen (install_langs_arg) == 3))
+            {
+	      /* Count the number of languages we will install.  */
+              while (true)
+                {
+                  lang = strtok(install_langs_arg, ":;,");
+                  if (lang == NULL)
+                    break;
+                  install_langs_count++;
+                  install_langs_arg = NULL;
+                }
+	      free (ila_start);
+
+	      /* Reject an entire string made up of delimiters.  */
+	      if (install_langs_count == 0)
+		break;
+
+	      /* Copy the list.  */
+	      install_langs_list = (char **)xmalloc (sizeof(char *) * install_langs_count);
+	      install_langs_arg = ila_start = strdup (optarg);
+	      install_langs_count = 0;
+	      while (true)
+                {
+                  lang = strtok(install_langs_arg, ":;,");
+                  if (lang == NULL)
+                    break;
+                  install_langs_list[install_langs_count] = lang;
+		  install_langs_count++;
+                  install_langs_arg = NULL;
+                }
+            }
+          break;
+
+        case '?':
+          /* getopt_long already printed an error message. */
+          usage ();
+          exit (0);
+
+        default:
+          abort ();
+        }
+    }
+  tmpl_ah.fname = NULL;
+  if (optind < argc)
+    tmpl_ah.fname = argv[optind];
+  if (optind + 1 < argc)
+    new_locar_fname = argv[optind + 1];
+  if (verbose)
+    {
+      if (tmpl_ah.fname)
+        printf("input archive file specified on command line: %s\n",
+               tmpl_ah.fname);
+      else
+        printf("using default input archive file.\n");
+      if (new_locar_fname)
+        printf("output archive file specified on command line: %s\n",
+               new_locar_fname);
+      else
+        printf("using default output archive file.\n");
+    }
+
+  dirp = opendir (loc_path);
+  if (dirp == NULL)
+    error (EXIT_FAILURE, errno, "cannot open directory \"%s\"", loc_path);
+
+  open_tmpl_archive (&tmpl_ah);
+
+  if (new_locar_fname)
+    unlink (new_locar_fname);
+  else
+    unlink (locar_file);
+  primary = getenv ("LC_ALL");
+  if (primary == NULL)
+    primary = getenv ("LANG");
+  if (primary != NULL)
+    {
+      if (strncmp (primary, "ja", 2) != 0
+	  && strncmp (primary, "ko", 2) != 0
+	  && strncmp (primary, "zh", 2) != 0)
+	{
+	  char *ptr = malloc (strlen (primary) + strlen (".utf8") + 1), *p, *q;
+	  /* This leads to invalid locales sometimes:
+	     de_DE.iso885915@euro -> de_DE.utf8@euro */
+	  if (ptr != NULL)
+	    {
+	      p = ptr;
+	      q = primary;
+	      while (*q && *q != '.' && *q != '@')
+		*p++ = *q++;
+	      if (*q == '.')
+		while (*q && *q != '@')
+		  q++;
+	      p = stpcpy (p, ".utf8");
+	      strcpy (p, q);
+	      primary = ptr;
+	    }
+	  else
+	    primary = NULL;
+	}
+    }
+
+  memcpy (path, loc_path, loc_path_len);
+
+  while ((d = readdir64 (dirp)) != NULL)
+    {
+      if (strcmp (d->d_name, ".") == 0 || strcmp (d->d_name, "..") == 0)
+	continue;
+      if (strchr (d->d_name, '_') == NULL)
+	continue;
+
+      size_t d_name_len = strlen (d->d_name);
+      if (loc_path_len + d_name_len + 1 > sizeof (path))
+	{
+	  error (0, 0, "too long filename \"%s\"", d->d_name);
+	  continue;
+	}
+
+      memcpy (path + loc_path_len, d->d_name, d_name_len + 1);
+      if (stat64 (path, &st) < 0)
+	{
+	  error (0, errno, "cannot stat \"%s\"", path);
+	  continue;
+	}
+      if (! S_ISDIR (st.st_mode))
+	continue;
+      if (cnt == 16384)
+	{
+	  error (0, 0, "too many directories in \"%s\"", loc_path);
+	  break;
+	}
+      list[cnt] = strdup (path);
+      if (list[cnt] == NULL)
+	{
+	  error (0, errno, "cannot add file to list \"%s\"", path);
+	  continue;
+	}
+      if (primary != NULL && cnt > 0 && strcmp (primary, d->d_name) == 0)
+	{
+	  char *p = list[0];
+	  list[0] = list[cnt];
+	  list[cnt] = p;
+	}
+      cnt++;
+    }
+  closedir (dirp);
+  /* Store the archive to the file specified as the second argument on the
+     command line or the default locale archive.  */
+  fill_archive (&tmpl_ah, new_locar_fname,
+                install_langs_count, install_langs_list,
+                cnt, list, primary);
+  close_archive (&tmpl_ah);
+  truncate (tmpl_file, 0);
+  if (install_langs_count > 0)
+    {
+      free (ila_start);
+      free (install_langs_list);
+    }
+  char *tz_argv[] = { "/usr/sbin/tzdata-update", NULL };
+  execve (tz_argv[0], (char *const *)tz_argv, (char *const *)&tz_argv[1]);
+  exit (0);
+}
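
One detail of fill_archive above worth calling out: an --install-langs argument without an underscore gets one appended before the prefix comparison, which is why "ko" selects ko_KR.* but not kok_IN. A condensed, self-contained restatement of that matching rule (lang_matches is an illustrative helper, not part of the file above):

/* Restates the --install-langs prefix rule from fill_archive: append
   '_' when the requested language has none, then match by prefix.
   Illustration only.  */
#include <stdio.h>
#include <string.h>

static int lang_matches (const char *locale, const char *lang)
{
  char pattern[64];
  snprintf (pattern, sizeof pattern, "%s%s", lang,
            strchr (lang, '_') == NULL ? "_" : "");
  return strncmp (locale, pattern, strlen (pattern)) == 0;
}

int main (void)
{
  printf ("%d\n", lang_matches ("ko_KR.utf8", "ko"));  /* 1: "ko_" prefix */
  printf ("%d\n", lang_matches ("kok_IN", "ko"));      /* 0: "ko_" != "kok" */
  printf ("%d\n", lang_matches ("kok_IN", "kok"));     /* 1: "kok_" prefix */
  return 0;
}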
							
								
								
									
112  SOURCES/glibc-RHEL-10481.patch  Normal file

@@ -0,0 +1,112 @@
|  | commit 849274d48fc59bfa6db3c713c8ced8026b20f3b7 | ||||||
|  | Author: Florian Weimer <fweimer@redhat.com> | ||||||
|  | Date:   Thu Nov 16 19:55:35 2023 +0100 | ||||||
|  | 
 | ||||||
|  |     elf: Fix force_first handling in dlclose (bug 30981) | ||||||
|  |      | ||||||
|  |     The force_first parameter was ineffective because the dlclose'd | ||||||
|  |     object was not necessarily the first in the maps array.  Also | ||||||
|  |     enable force_first handling unconditionally, regardless of namespace. | ||||||
|  |     The initial object in a namespace should be destructed first, too. | ||||||
|  |      | ||||||
|  |     The _dl_sort_maps_dfs function had early returns for relocation | ||||||
|  |     dependency processing which broke force_first handling, too, and | ||||||
|  |     this is fixed in this change as well. | ||||||
|  |      | ||||||
|  |     Reviewed-by: Adhemerval Zanella  <adhemerval.zanella@linaro.org> | ||||||
|  | 
 | ||||||
|  | diff --git a/elf/dl-close.c b/elf/dl-close.c
 | ||||||
|  | index 66524b6708c59f29..8107c2d5f6ad2bc6 100644
 | ||||||
|  | --- a/elf/dl-close.c
 | ||||||
|  | +++ b/elf/dl-close.c
 | ||||||
|  | @@ -182,6 +182,16 @@ _dl_close_worker (struct link_map *map, bool force)
 | ||||||
|  |      } | ||||||
|  |    assert (idx == nloaded); | ||||||
|  |   | ||||||
|  | +  /* Put the dlclose'd map first, so that its destructor runs first.
 | ||||||
|  | +     The map variable is NULL after a retry.  */
 | ||||||
|  | +  if (map != NULL)
 | ||||||
|  | +    {
 | ||||||
|  | +      maps[map->l_idx] = maps[0];
 | ||||||
|  | +      maps[map->l_idx]->l_idx = map->l_idx;
 | ||||||
|  | +      maps[0] = map;
 | ||||||
|  | +      maps[0]->l_idx = 0;
 | ||||||
|  | +    }
 | ||||||
|  | +
 | ||||||
|  |    /* Keep track of the lowest index link map we have covered already.  */ | ||||||
|  |    int done_index = -1; | ||||||
|  |    while (++done_index < nloaded) | ||||||
|  | @@ -255,9 +265,10 @@ _dl_close_worker (struct link_map *map, bool force)
 | ||||||
|  |  	  } | ||||||
|  |      } | ||||||
|  |   | ||||||
|  | -  /* Sort the entries.  We can skip looking for the binary itself which is
 | ||||||
|  | -     at the front of the search list for the main namespace.  */
 | ||||||
|  | -  _dl_sort_maps (maps, nloaded, (nsid == LM_ID_BASE), true);
 | ||||||
|  | +  /* Sort the entries.  Unless retrying, the maps[0] object (the
 | ||||||
|  | +     original argument to dlclose) needs to remain first, so that its
 | ||||||
|  | +     destructor runs first.  */
 | ||||||
|  | +  _dl_sort_maps (maps, nloaded, /* force_first */ map != NULL, true);
 | ||||||
|  |   | ||||||
|  |    /* Call all termination functions at once.  */ | ||||||
|  |    bool unload_any = false; | ||||||
|  | @@ -768,7 +779,11 @@ _dl_close_worker (struct link_map *map, bool force)
 | ||||||
|  |    /* Recheck if we need to retry, release the lock.  */ | ||||||
|  |   out: | ||||||
|  |    if (dl_close_state == rerun) | ||||||
|  | -    goto retry;
 | ||||||
|  | +    {
 | ||||||
|  | +      /* The map may have been deallocated.  */
 | ||||||
|  | +      map = NULL;
 | ||||||
|  | +      goto retry;
 | ||||||
|  | +    }
 | ||||||
|  |   | ||||||
|  |    dl_close_state = not_pending; | ||||||
|  |  } | ||||||
|  | diff --git a/elf/dl-sort-maps.c b/elf/dl-sort-maps.c
 | ||||||
|  | index aeb79b40b45054c0..c17ac325eca658ef 100644
 | ||||||
|  | --- a/elf/dl-sort-maps.c
 | ||||||
|  | +++ b/elf/dl-sort-maps.c
 | ||||||
|  | @@ -260,13 +260,12 @@ _dl_sort_maps_dfs (struct link_map **maps, unsigned int nmaps,
 | ||||||
|  |  	     The below memcpy is not needed in the do_reldeps case here, | ||||||
|  |  	     since we wrote back to maps[] during DFS traversal.  */ | ||||||
|  |  	  if (maps_head == maps) | ||||||
|  | -	    return;
 | ||||||
|  | +	    break;
 | ||||||
|  |  	} | ||||||
|  |        assert (maps_head == maps); | ||||||
|  | -      return;
 | ||||||
|  |      } | ||||||
|  | -
 | ||||||
|  | -  memcpy (maps, rpo, sizeof (struct link_map *) * nmaps);
 | ||||||
|  | +  else
 | ||||||
|  | +    memcpy (maps, rpo, sizeof (struct link_map *) * nmaps);
 | ||||||
|  |   | ||||||
|  |    /* Skipping the first object at maps[0] is not valid in general, | ||||||
|  |       since traversing along object dependency-links may "find" that | ||||||
|  | diff --git a/elf/dso-sort-tests-1.def b/elf/dso-sort-tests-1.def
 | ||||||
|  | index 4bf9052db16fb352..cf6453e9eb85ac65 100644
 | ||||||
|  | --- a/elf/dso-sort-tests-1.def
 | ||||||
|  | +++ b/elf/dso-sort-tests-1.def
 | ||||||
|  | @@ -56,14 +56,16 @@ output: b>a>{}<a<b
 | ||||||
|  |  # relocation(dynamic) dependencies. While this is technically unspecified, the | ||||||
|  |  # presumed reasonable practical behavior is for the destructor order to respect | ||||||
|  |  # the static DT_NEEDED links (here this means the a->b->c->d order). | ||||||
|  | -# The older dynamic_sort=1 algorithm does not achieve this, while the DFS-based
 | ||||||
|  | -# dynamic_sort=2 algorithm does, although it is still arguable whether going
 | ||||||
|  | -# beyond spec to do this is the right thing to do.
 | ||||||
|  | +# The older dynamic_sort=1 algorithm originally did not achieve this,
 | ||||||
|  | +# but this was a bug in the way _dl_sort_maps was called from _dl_close_worker,
 | ||||||
|  | +# effectively disabling proper force_first handling.
 | ||||||
|  | +# The new dynamic_sort=2 algorithm shows the effect of the simpler force_first
 | ||||||
|  | +# handling: the a object is simply moved to the front.
 | ||||||
|  |  # The below expected outputs are what the two algorithms currently produce | ||||||
|  |  # respectively, for regression testing purposes. | ||||||
|  |  tst-bz15311: {+a;+e;+f;+g;+d;%d;-d;-g;-f;-e;-a};a->b->c->d;d=>[ba];c=>a;b=>e=>a;c=>f=>b;d=>g=>c | ||||||
|  | -output(glibc.rtld.dynamic_sort=1): {+a[d>c>b>a>];+e[e>];+f[f>];+g[g>];+d[];%d(b(e(a()))a()g(c(a()f(b(e(a()))))));-d[];-g[];-f[];-e[];-a[<a<c<d<g<f<b<e];}
 | ||||||
|  | -output(glibc.rtld.dynamic_sort=2): {+a[d>c>b>a>];+e[e>];+f[f>];+g[g>];+d[];%d(b(e(a()))a()g(c(a()f(b(e(a()))))));-d[];-g[];-f[];-e[];-a[<g<f<a<b<c<d<e];}
 | ||||||
|  | +output(glibc.rtld.dynamic_sort=1): {+a[d>c>b>a>];+e[e>];+f[f>];+g[g>];+d[];%d(b(e(a()))a()g(c(a()f(b(e(a()))))));-d[];-g[];-f[];-e[];-a[<a<b<c<d<g<f<e];}
 | ||||||
|  | +output(glibc.rtld.dynamic_sort=2): {+a[d>c>b>a>];+e[e>];+f[f>];+g[g>];+d[];%d(b(e(a()))a()g(c(a()f(b(e(a()))))));-d[];-g[];-f[];-e[];-a[<a<g<f<b<c<d<e];}
 | ||||||
|  |   | ||||||
|  |  # Test that even in the presence of dependency loops involving dlopen'ed | ||||||
|  |  # object, that object is initialized last (and not unloaded prematurely). | ||||||
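The core of the dl-close.c hunk above is a swap-to-front move that keeps each map's stored index consistent with its position in the array, so the dlclose'd object sorts (and destructs) first. A rough standalone illustration, with a simplified struct standing in for struct link_map:

#include <stdio.h>

struct map { const char *name; int l_idx; };

/* Move M to maps[0] by swapping, updating each element's recorded
   index to match its new position -- the invariant the dl-close.c
   hunk above maintains.  */
static void
move_to_front (struct map **maps, struct map *m)
{
  maps[m->l_idx] = maps[0];
  maps[m->l_idx]->l_idx = m->l_idx;
  maps[0] = m;
  maps[0]->l_idx = 0;
}

int
main (void)
{
  struct map a = { "a", 0 }, b = { "b", 1 }, c = { "c", 2 };
  struct map *maps[] = { &a, &b, &c };
  move_to_front (maps, &c);     /* as if dlclose was called on "c" */
  for (int i = 0; i < 3; i++)
    printf ("%s idx=%d\n", maps[i]->name, maps[i]->l_idx);
  /* Prints c/b/a: the dlclose'd object is now first, so its
     destructor runs first once _dl_sort_maps honors force_first.  */
  return 0;
}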
SOURCES/glibc-RHEL-105326.patch (new file, 236 lines)
							| @ -0,0 +1,236 @@ | |||||||
|  | commit 7ea06e994093fa0bcca0d0ee2c1db271d8d7885d | ||||||
|  | Author: Florian Weimer <fweimer@redhat.com> | ||||||
|  | Date:   Mon Jul 21 21:43:49 2025 +0200 | ||||||
|  | 
 | ||||||
|  |     posix: Fix double-free after allocation failure in regcomp (bug 33185) | ||||||
|  | 
 | ||||||
|  |     If a memory allocation failure occurs during bracket expression | ||||||
|  |     parsing in regcomp, a double-free error may result. | ||||||
|  | 
 | ||||||
|  |     Reported-by: Anastasia Belova <abelova@astralinux.ru> | ||||||
|  |     Co-authored-by: Paul Eggert <eggert@cs.ucla.edu> | ||||||
|  |     Reviewed-by: Andreas K. Huettel <dilfridge@gentoo.org> | ||||||
|  | 
 | ||||||
|  | Conflicts: | ||||||
|  | 	posix/Makefile | ||||||
|  | 	  (tests list not reformatted/sorted downstream) | ||||||
|  | 	posix/tst-regcomp-bracket-free.c | ||||||
|  | 	  (missing strerrorname_np downstream) | ||||||
|  | 
 | ||||||
|  | diff --git a/posix/Makefile b/posix/Makefile
 | ||||||
|  | index 83162123f9c927a0..42a0290370b40fd9 100644
 | ||||||
|  | --- a/posix/Makefile
 | ||||||
|  | +++ b/posix/Makefile
 | ||||||
|  | @@ -96,7 +96,7 @@ tests		:= test-errno tstgetopt testfnm runtests runptests \
 | ||||||
|  |  		   tst-posix_fadvise tst-posix_fadvise64 \ | ||||||
|  |  		   tst-sysconf-empty-chroot tst-glob_symlinks tst-fexecve \ | ||||||
|  |  		   tst-glob-tilde test-ssize-max tst-spawn4 bug-regex37 \ | ||||||
|  | -		   bug-regex38 tst-regcomp-truncated
 | ||||||
|  | +		   bug-regex38 tst-regcomp-truncated tst-regcomp-bracket-free
 | ||||||
|  |  tests-internal	:= bug-regex5 bug-regex20 bug-regex33 \ | ||||||
|  |  		   tst-rfc3484 tst-rfc3484-2 tst-rfc3484-3 \ | ||||||
|  |  		   tst-glob_lstat_compat tst-spawn4-compat | ||||||
|  | diff --git a/posix/regcomp.c b/posix/regcomp.c
 | ||||||
|  | index 545d188468c376e7..b737b22da8703d6c 100644
 | ||||||
|  | --- a/posix/regcomp.c
 | ||||||
|  | +++ b/posix/regcomp.c
 | ||||||
|  | @@ -3375,6 +3375,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
 | ||||||
|  |      { | ||||||
|  |  #ifdef RE_ENABLE_I18N | ||||||
|  |        free_charset (mbcset); | ||||||
|  | +      mbcset = NULL;
 | ||||||
|  |  #endif | ||||||
|  |        /* Build a tree for simple bracket.  */ | ||||||
|  |        br_token.type = SIMPLE_BRACKET; | ||||||
|  | @@ -3390,7 +3391,8 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
 | ||||||
|  |   parse_bracket_exp_free_return: | ||||||
|  |    re_free (sbcset); | ||||||
|  |  #ifdef RE_ENABLE_I18N | ||||||
|  | -  free_charset (mbcset);
 | ||||||
|  | +  if (__glibc_likely (mbcset != NULL))
 | ||||||
|  | +    free_charset (mbcset);
 | ||||||
|  |  #endif /* RE_ENABLE_I18N */ | ||||||
|  |    return NULL; | ||||||
|  |  } | ||||||
|  | diff --git a/posix/tst-regcomp-bracket-free.c b/posix/tst-regcomp-bracket-free.c
 | ||||||
|  | new file mode 100644 | ||||||
|  | index 0000000000000000..e6041ddaeba3045c
 | ||||||
|  | --- /dev/null
 | ||||||
|  | +++ b/posix/tst-regcomp-bracket-free.c
 | ||||||
|  | @@ -0,0 +1,176 @@
 | ||||||
|  | +/* Test regcomp bracket parsing with injected allocation failures (bug 33185).
 | ||||||
|  | +   Copyright (C) 2025 Free Software Foundation, Inc.
 | ||||||
|  | +   This file is part of the GNU C Library.
 | ||||||
|  | +
 | ||||||
|  | +   The GNU C Library is free software; you can redistribute it and/or
 | ||||||
|  | +   modify it under the terms of the GNU Lesser General Public
 | ||||||
|  | +   License as published by the Free Software Foundation; either
 | ||||||
|  | +   version 2.1 of the License, or (at your option) any later version.
 | ||||||
|  | +
 | ||||||
|  | +   The GNU C Library is distributed in the hope that it will be useful,
 | ||||||
|  | +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 | ||||||
|  | +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 | ||||||
|  | +   Lesser General Public License for more details.
 | ||||||
|  | +
 | ||||||
|  | +   You should have received a copy of the GNU Lesser General Public
 | ||||||
|  | +   License along with the GNU C Library; if not, see
 | ||||||
|  | +   <https://www.gnu.org/licenses/>.  */
 | ||||||
|  | +
 | ||||||
|  | +/* This test invokes regcomp multiple times, failing one memory
 | ||||||
|  | +   allocation in each call.  The function call should fail with
 | ||||||
|  | +   REG_ESPACE (or succeed if it can recover from the allocation
 | ||||||
|  | +   failure).  Previously, there was a double-free bug.  */
 | ||||||
|  | +
 | ||||||
|  | +#include <errno.h>
 | ||||||
|  | +#include <regex.h>
 | ||||||
|  | +#include <stdio.h>
 | ||||||
|  | +#include <string.h>
 | ||||||
|  | +#include <support/check.h>
 | ||||||
|  | +#include <support/namespace.h>
 | ||||||
|  | +#include <support/support.h>
 | ||||||
|  | +
 | ||||||
|  | +/* Data structure allocated via MAP_SHARED, so that writes from the
 | ||||||
|  | +   subprocess are visible.  */
 | ||||||
|  | +struct shared_data
 | ||||||
|  | +{
 | ||||||
|  | +  /* Number of tracked allocations performed so far.  */
 | ||||||
|  | +  volatile unsigned int allocation_count;
 | ||||||
|  | +
 | ||||||
|  | +  /* If this number is reached, one allocation fails.  */
 | ||||||
|  | +  volatile unsigned int failing_allocation;
 | ||||||
|  | +
 | ||||||
|  | +  /* The subprocess stores the expected name here.  */
 | ||||||
|  | +  char name[100];
 | ||||||
|  | +};
 | ||||||
|  | +
 | ||||||
|  | +/* Allocation count in shared mapping.  */
 | ||||||
|  | +static struct shared_data *shared;
 | ||||||
|  | +
 | ||||||
|  | +/* Returns true if a failure should be injected for this allocation.  */
 | ||||||
|  | +static bool
 | ||||||
|  | +fail_this_allocation (void)
 | ||||||
|  | +{
 | ||||||
|  | +  if (shared != NULL)
 | ||||||
|  | +    {
 | ||||||
|  | +      unsigned int count = shared->allocation_count;
 | ||||||
|  | +      shared->allocation_count = count + 1;
 | ||||||
|  | +      return count == shared->failing_allocation;
 | ||||||
|  | +    }
 | ||||||
|  | +  else
 | ||||||
|  | +    return false;
 | ||||||
|  | +}
 | ||||||
|  | +
 | ||||||
|  | +/* Failure-injecting wrappers for allocation functions used by glibc.  */
 | ||||||
|  | +
 | ||||||
|  | +void *
 | ||||||
|  | +malloc (size_t size)
 | ||||||
|  | +{
 | ||||||
|  | +  if (fail_this_allocation ())
 | ||||||
|  | +    {
 | ||||||
|  | +      errno = ENOMEM;
 | ||||||
|  | +      return NULL;
 | ||||||
|  | +    }
 | ||||||
|  | +  extern __typeof (malloc) __libc_malloc;
 | ||||||
|  | +  return __libc_malloc (size);
 | ||||||
|  | +}
 | ||||||
|  | +
 | ||||||
|  | +void *
 | ||||||
|  | +calloc (size_t a, size_t b)
 | ||||||
|  | +{
 | ||||||
|  | +  if (fail_this_allocation ())
 | ||||||
|  | +    {
 | ||||||
|  | +      errno = ENOMEM;
 | ||||||
|  | +      return NULL;
 | ||||||
|  | +    }
 | ||||||
|  | +  extern __typeof (calloc) __libc_calloc;
 | ||||||
|  | +  return __libc_calloc (a, b);
 | ||||||
|  | +}
 | ||||||
|  | +
 | ||||||
|  | +void *
 | ||||||
|  | +realloc (void *ptr, size_t size)
 | ||||||
|  | +{
 | ||||||
|  | +  if (fail_this_allocation ())
 | ||||||
|  | +    {
 | ||||||
|  | +      errno = ENOMEM;
 | ||||||
|  | +      return NULL;
 | ||||||
|  | +    }
 | ||||||
|  | +  extern __typeof (realloc) __libc_realloc;
 | ||||||
|  | +  return __libc_realloc (ptr, size);
 | ||||||
|  | +}
 | ||||||
|  | +
 | ||||||
|  | +/* No-op subprocess to verify that support_isolate_in_subprocess does
 | ||||||
|  | +   not perform any heap allocations.  */
 | ||||||
|  | +static void
 | ||||||
|  | +no_op (void *ignored)
 | ||||||
|  | +{
 | ||||||
|  | +}
 | ||||||
|  | +
 | ||||||
|  | +/* Perform a regcomp call in a subprocess.  Used to count its
 | ||||||
|  | +   allocations.  */
 | ||||||
|  | +static void
 | ||||||
|  | +initialize (void *regexp1)
 | ||||||
|  | +{
 | ||||||
|  | +  const char *regexp = regexp1;
 | ||||||
|  | +
 | ||||||
|  | +  shared->allocation_count = 0;
 | ||||||
|  | +
 | ||||||
|  | +  regex_t reg;
 | ||||||
|  | +  TEST_COMPARE (regcomp (&reg, regexp, 0), 0);
 | ||||||
|  | +}
 | ||||||
|  | +
 | ||||||
|  | +/* Perform regcomp in a subprocess with fault injection.  */
 | ||||||
|  | +static void
 | ||||||
|  | +test_in_subprocess (void *regexp1)
 | ||||||
|  | +{
 | ||||||
|  | +  const char *regexp = regexp1;
 | ||||||
|  | +  unsigned int inject_at = shared->failing_allocation;
 | ||||||
|  | +
 | ||||||
|  | +  regex_t reg;
 | ||||||
|  | +  int ret = regcomp (&reg, regexp, 0);
 | ||||||
|  | +
 | ||||||
|  | +  if (ret != 0)
 | ||||||
|  | +    {
 | ||||||
|  | +      TEST_COMPARE (ret, REG_ESPACE);
 | ||||||
|  | +      printf ("info: allocation %u failure results in return value %d,"
 | ||||||
|  | +              " error %s (%d)\n",
 | ||||||
|  | +              inject_at, ret, strerror (errno), errno);
 | ||||||
|  | +    }
 | ||||||
|  | +}
 | ||||||
|  | +
 | ||||||
|  | +static int
 | ||||||
|  | +do_test (void)
 | ||||||
|  | +{
 | ||||||
|  | +  char regexp[] = "[:alpha:]";
 | ||||||
|  | +
 | ||||||
|  | +  shared = support_shared_allocate (sizeof (*shared));
 | ||||||
|  | +
 | ||||||
|  | +  /* Disable fault injection.  */
 | ||||||
|  | +  shared->failing_allocation = ~0U;
 | ||||||
|  | +
 | ||||||
|  | +  support_isolate_in_subprocess (no_op, NULL);
 | ||||||
|  | +  TEST_COMPARE (shared->allocation_count, 0);
 | ||||||
|  | +
 | ||||||
|  | +  support_isolate_in_subprocess (initialize, regexp);
 | ||||||
|  | +
 | ||||||
|  | +  /* The number of allocations in the successful case, plus some
 | ||||||
|  | +     slack.  Once the number of expected allocations is exceeded,
 | ||||||
|  | +     injecting further failures does not make a difference.  */
 | ||||||
|  | +  unsigned int maximum_allocation_count = shared->allocation_count;
 | ||||||
|  | +  printf ("info: successful call performs %u allocations\n",
 | ||||||
|  | +          maximum_allocation_count);
 | ||||||
|  | +  maximum_allocation_count += 10;
 | ||||||
|  | +
 | ||||||
|  | +  for (unsigned int inject_at = 0; inject_at <= maximum_allocation_count;
 | ||||||
|  | +       ++inject_at)
 | ||||||
|  | +    {
 | ||||||
|  | +      shared->allocation_count = 0;
 | ||||||
|  | +      shared->failing_allocation = inject_at;
 | ||||||
|  | +      support_isolate_in_subprocess (test_in_subprocess, regexp);
 | ||||||
|  | +    }
 | ||||||
|  | +
 | ||||||
|  | +  support_shared_free (shared);
 | ||||||
|  | +
 | ||||||
|  | +  return 0;
 | ||||||
|  | +}
 | ||||||
|  | +
 | ||||||
|  | +#include <support/test-driver.c>
 | ||||||
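Stripped of the regex internals, the fix itself is the standard free-and-clear idiom: NULL the pointer at the early free so the shared cleanup path can guard against a second free. A generic sketch of the pattern (the names charset and parse_something are illustrative, not the glibc internals):

#include <stdlib.h>

struct charset { int *data; };

static void
free_charset (struct charset *cs)
{
  if (cs != NULL)
    free (cs->data);
  free (cs);
}

static int
parse_something (void)
{
  struct charset *mbcset = calloc (1, sizeof *mbcset);
  char *work = malloc (64);
  if (mbcset == NULL || work == NULL)
    goto free_return;

  /* Success path: the charset is consumed early...  */
  free_charset (mbcset);
  mbcset = NULL;            /* ...so clear it to prevent a second free.  */

  free (work);
  return 0;

 free_return:
  free (work);
  if (mbcset != NULL)       /* Guard: may have been freed already.  */
    free_charset (mbcset);
  return -1;
}

int
main (void)
{
  return parse_something ();
}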
SOURCES/glibc-RHEL-1192.patch (new file, 83 lines)
							| @ -0,0 +1,83 @@ | |||||||
|  | commit c00b984fcd53f679ca2dafcd1aee2c89836e6e73 | ||||||
|  | Author: Florian Weimer <fweimer@redhat.com> | ||||||
|  | Date:   Tue Aug 29 08:28:31 2023 +0200 | ||||||
|  | 
 | ||||||
|  |     nscd: Skip unusable entries in first pass in prune_cache (bug 30800) | ||||||
|  |      | ||||||
|  |     Previously, if an entry was marked unusable for any reason, but had | ||||||
|  |     not timed out yet, the assert would trigger. | ||||||
|  |      | ||||||
|  |     One way to get into such a state is if a data change is detected | ||||||
|  |     during re-validation of an entry.  This causes the entry to be marked | ||||||
|  |     as not usable.  If nscd exits soon after that, the clock jumps | ||||||
|  |     backwards, and nscd is then restarted, the cache re-validation run | ||||||
|  |     after startup triggers the removed assert. | ||||||
|  |      | ||||||
|  |     The change is more complicated than just the removal of the assert | ||||||
|  |     because entries marked as not usable should be garbage-collected in | ||||||
|  |     the second pass.  To make this happen, it is necessary to update some | ||||||
|  |     book-keeping data. | ||||||
|  |      | ||||||
|  |     Reviewed-by: DJ Delorie <dj@redhat.com> | ||||||
|  | 
 | ||||||
|  | diff --git a/nscd/cache.c b/nscd/cache.c
 | ||||||
|  | index efe4214d953edb30..2fd3f78ebb567bbe 100644
 | ||||||
|  | --- a/nscd/cache.c
 | ||||||
|  | +++ b/nscd/cache.c
 | ||||||
|  | @@ -371,8 +371,11 @@ prune_cache (struct database_dyn *table, time_t now, int fd)
 | ||||||
|  |  		       serv2str[runp->type], str, dh->timeout); | ||||||
|  |  	    } | ||||||
|  |   | ||||||
|  | -	  /* Check whether the entry timed out.  */
 | ||||||
|  | -	  if (dh->timeout < now)
 | ||||||
|  | +	  /* Check whether the entry timed out.  Timed out entries
 | ||||||
|  | +	     will be revalidated.  For unusable records, it is still
 | ||||||
|  | +	     necessary to record that the bucket needs to be scanned
 | ||||||
|  | +	     again below.  */
 | ||||||
|  | +	  if (dh->timeout < now || !dh->usable)
 | ||||||
|  |  	    { | ||||||
|  |  	      /* This hash bucket could contain entries which need to | ||||||
|  |  		 be looked at.  */ | ||||||
|  | @@ -384,7 +387,7 @@ prune_cache (struct database_dyn *table, time_t now, int fd)
 | ||||||
|  |  	      /* We only have to look at the data of the first entries | ||||||
|  |  		 since the count information is kept in the data part | ||||||
|  |  		 which is shared.  */ | ||||||
|  | -	      if (runp->first)
 | ||||||
|  | +	      if (runp->first && dh->usable)
 | ||||||
|  |  		{ | ||||||
|  |   | ||||||
|  |  		  /* At this point there are two choices: we reload the | ||||||
|  | @@ -400,9 +403,6 @@ prune_cache (struct database_dyn *table, time_t now, int fd)
 | ||||||
|  |  		    { | ||||||
|  |  		      /* Remove the value.  */ | ||||||
|  |  		      dh->usable = false; | ||||||
|  | -
 | ||||||
|  | -		      /* We definitely have some garbage entries now.  */
 | ||||||
|  | -		      any = true;
 | ||||||
|  |  		    } | ||||||
|  |  		  else | ||||||
|  |  		    { | ||||||
|  | @@ -414,18 +414,15 @@ prune_cache (struct database_dyn *table, time_t now, int fd)
 | ||||||
|  |   | ||||||
|  |  		      time_t timeout = readdfcts[runp->type] (table, runp, dh); | ||||||
|  |  		      next_timeout = MIN (next_timeout, timeout); | ||||||
|  | -
 | ||||||
|  | -		      /* If the entry has been replaced, we might need
 | ||||||
|  | -			 cleanup.  */
 | ||||||
|  | -		      any |= !dh->usable;
 | ||||||
|  |  		    } | ||||||
|  |  		} | ||||||
|  | +
 | ||||||
|  | +	      /* If the entry has been replaced, we might need cleanup.  */
 | ||||||
|  | +	      any |= !dh->usable;
 | ||||||
|  |  	    } | ||||||
|  |  	  else | ||||||
|  | -	    {
 | ||||||
|  | -	      assert (dh->usable);
 | ||||||
|  | -	      next_timeout = MIN (next_timeout, dh->timeout);
 | ||||||
|  | -	    }
 | ||||||
|  | +	    /* Entry has not timed out and is usable.  */
 | ||||||
|  | +	    next_timeout = MIN (next_timeout, dh->timeout);
 | ||||||
|  |   | ||||||
|  |  	  run = runp->next; | ||||||
|  |  	} | ||||||
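The shape of the fixed prune_cache is a mark-then-sweep first pass whose bookkeeping flag must be set for entries that are already unusable, not only for entries that fail revalidation right now. A simplified sketch of that first pass follows; the entry type and the revalidate stub are illustrative, not the nscd data structures:

#include <stdbool.h>
#include <stddef.h>
#include <time.h>

struct entry { time_t timeout; bool usable; };

/* Stand-in for nscd's per-type revalidation; nonzero means the
   underlying data changed and the entry cannot be kept.  */
static int revalidate (struct entry *e) { (void) e; return 1; }

/* First pass: returns true if the second (collection) pass must run.  */
static bool
prune_first_pass (struct entry *e, size_t n, time_t now, time_t *next)
{
  bool any = false;
  for (size_t i = 0; i < n; i++)
    {
      /* Timed-out entries are revalidated; already-unusable entries
         must still be flagged for collection, even before timeout.  */
      if (e[i].timeout < now || !e[i].usable)
        {
          if (e[i].usable && revalidate (&e[i]) != 0)
            e[i].usable = false;
          any |= !e[i].usable;
        }
      else if (e[i].timeout < *next)
        *next = e[i].timeout;   /* Entry is alive: track next wakeup.  */
    }
  return any;
}

int
main (void)
{
  struct entry e[] = { { 100, false }, { 200, true } };
  time_t next = 1000;
  return prune_first_pass (e, 2, 50, &next) ? 0 : 1;
}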
SOURCES/glibc-RHEL-13720-1.patch (new file, 72 lines)
							| @ -0,0 +1,72 @@ | |||||||
|  | commit 2aa0974d2573441bffd596b07bff8698b1f2f18c | ||||||
|  | Author: Florian Weimer <fweimer@redhat.com> | ||||||
|  | Date:   Fri Oct 20 14:29:50 2023 +0200 | ||||||
|  | 
 | ||||||
|  |     elf: ldconfig should skip temporary files created by package managers | ||||||
|  |      | ||||||
|  |     This avoids crashes due to partially written files after a package | ||||||
|  |     update is interrupted. | ||||||
|  |      | ||||||
|  |     Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org> | ||||||
|  | 
 | ||||||
|  | Conflicts: | ||||||
|  | 	elf/ldconfig.c | ||||||
|  | 	  (missing alloca removal downstream) | ||||||
|  | 
 | ||||||
|  | diff --git a/elf/ldconfig.c b/elf/ldconfig.c
 | ||||||
|  | index 8c66d7e5426d8cc4..51de08f91fbaf093 100644
 | ||||||
|  | --- a/elf/ldconfig.c
 | ||||||
|  | +++ b/elf/ldconfig.c
 | ||||||
|  | @@ -771,6 +771,31 @@ struct dlib_entry
 | ||||||
|  |    struct dlib_entry *next; | ||||||
|  |  }; | ||||||
|  |   | ||||||
|  | +/* Skip some temporary DSO files.  These files may be partially written
 | ||||||
|  | +   and lead to ldconfig crashes when examined.  */
 | ||||||
|  | +static bool
 | ||||||
|  | +skip_dso_based_on_name (const char *name, size_t len)
 | ||||||
|  | +{
 | ||||||
|  | +  /* Skip temporary files created by the prelink program.  Files with
 | ||||||
|  | +     names like these are never really DSOs we want to look at.  */
 | ||||||
|  | +  if (len >= sizeof (".#prelink#") - 1)
 | ||||||
|  | +    {
 | ||||||
|  | +      if (strcmp (name + len - sizeof (".#prelink#") + 1,
 | ||||||
|  | +		  ".#prelink#") == 0)
 | ||||||
|  | +	return true;
 | ||||||
|  | +      if (len >= sizeof (".#prelink#.XXXXXX") - 1
 | ||||||
|  | +	  && memcmp (name + len - sizeof (".#prelink#.XXXXXX")
 | ||||||
|  | +		     + 1, ".#prelink#.", sizeof (".#prelink#.") - 1) == 0)
 | ||||||
|  | +	return true;
 | ||||||
|  | +    }
 | ||||||
|  | +  /* Skip temporary files created by RPM.  */
 | ||||||
|  | +  if (memchr (name, len, ';') != NULL)
 | ||||||
|  | +    return true;
 | ||||||
|  | +  /* Skip temporary files created by dpkg.  */
 | ||||||
|  | +  if (len > 4 && memcmp (name + len - 4, ".tmp", 4) == 0)
 | ||||||
|  | +    return true;
 | ||||||
|  | +  return false;
 | ||||||
|  | +}
 | ||||||
|  |   | ||||||
|  |  static void | ||||||
|  |  search_dir (const struct dir_entry *entry) | ||||||
|  | @@ -849,18 +874,8 @@ search_dir (const struct dir_entry *entry)
 | ||||||
|  |  	continue; | ||||||
|  |   | ||||||
|  |        size_t len = strlen (direntry->d_name); | ||||||
|  | -      /* Skip temporary files created by the prelink program.  Files with
 | ||||||
|  | -	 names like these are never really DSOs we want to look at.  */
 | ||||||
|  | -      if (len >= sizeof (".#prelink#") - 1)
 | ||||||
|  | -	{
 | ||||||
|  | -	  if (strcmp (direntry->d_name + len - sizeof (".#prelink#") + 1,
 | ||||||
|  | -		      ".#prelink#") == 0)
 | ||||||
|  | -	    continue;
 | ||||||
|  | -	  if (len >= sizeof (".#prelink#.XXXXXX") - 1
 | ||||||
|  | -	      && memcmp (direntry->d_name + len - sizeof (".#prelink#.XXXXXX")
 | ||||||
|  | -			 + 1, ".#prelink#.", sizeof (".#prelink#.") - 1) == 0)
 | ||||||
|  | -	    continue;
 | ||||||
|  | -	}
 | ||||||
|  | +      if (skip_dso_based_on_name (direntry->d_name, len))
 | ||||||
|  | +	continue;
 | ||||||
|  |        len += strlen (entry->path) + 2; | ||||||
|  |        if (len > file_name_len) | ||||||
|  |  	{ | ||||||
SOURCES/glibc-RHEL-13720-2.patch (new file, 61 lines)
							| @ -0,0 +1,61 @@ | |||||||
|  | commit cfb5a97a93ea656e3b2263e42142a4032986d9ba | ||||||
|  | Author: Florian Weimer <fweimer@redhat.com> | ||||||
|  | Date:   Mon Oct 23 12:53:16 2023 +0200 | ||||||
|  | 
 | ||||||
|  |     ldconfig: Fixes for skipping temporary files. | ||||||
|  |      | ||||||
|  |     Arguments to a memchr call were swapped, causing incorrect skipping | ||||||
|  |     of files. | ||||||
|  |      | ||||||
|  |     Files related to dpkg have different names: they actually end in | ||||||
|  |     .dpkg-new and .dpkg-tmp, not .tmp as I mistakenly assumed. | ||||||
|  |      | ||||||
|  |     Fixes commit 2aa0974d2573441bffd59 ("elf: ldconfig should skip | ||||||
|  |     temporary files created by package managers"). | ||||||
|  | 
 | ||||||
|  | diff --git a/elf/ldconfig.c b/elf/ldconfig.c
 | ||||||
|  | index 51de08f91fbaf093..fb19dd68d41c07a4 100644
 | ||||||
|  | --- a/elf/ldconfig.c
 | ||||||
|  | +++ b/elf/ldconfig.c
 | ||||||
|  | @@ -771,6 +771,17 @@ struct dlib_entry
 | ||||||
|  |    struct dlib_entry *next; | ||||||
|  |  }; | ||||||
|  |   | ||||||
|  | +/* Return true if the N bytes at NAME end with the characters in
 | ||||||
|  | +   the string SUFFIX.  (NAME[N + 1] does not have to be a null byte.)
 | ||||||
|  | +   Expected to be called with a string literal for SUFFIX.  */
 | ||||||
|  | +static inline bool
 | ||||||
|  | +endswithn (const char *name, size_t n, const char *suffix)
 | ||||||
|  | +{
 | ||||||
|  | +  return (n >= strlen (suffix)
 | ||||||
|  | +	  && memcmp (name + n - strlen (suffix), suffix,
 | ||||||
|  | +		     strlen (suffix)) == 0);
 | ||||||
|  | +}
 | ||||||
|  | +
 | ||||||
|  |  /* Skip some temporary DSO files.  These files may be partially written | ||||||
|  |     and lead to ldconfig crashes when examined.  */ | ||||||
|  |  static bool | ||||||
|  | @@ -780,8 +791,7 @@ skip_dso_based_on_name (const char *name, size_t len)
 | ||||||
|  |       names like these are never really DSOs we want to look at.  */ | ||||||
|  |    if (len >= sizeof (".#prelink#") - 1) | ||||||
|  |      { | ||||||
|  | -      if (strcmp (name + len - sizeof (".#prelink#") + 1,
 | ||||||
|  | -		  ".#prelink#") == 0)
 | ||||||
|  | +      if (endswithn (name, len, ".#prelink#"))
 | ||||||
|  |  	return true; | ||||||
|  |        if (len >= sizeof (".#prelink#.XXXXXX") - 1 | ||||||
|  |  	  && memcmp (name + len - sizeof (".#prelink#.XXXXXX") | ||||||
|  | @@ -789,10 +799,11 @@ skip_dso_based_on_name (const char *name, size_t len)
 | ||||||
|  |  	return true; | ||||||
|  |      } | ||||||
|  |    /* Skip temporary files created by RPM.  */ | ||||||
|  | -  if (memchr (name, len, ';') != NULL)
 | ||||||
|  | +  if (memchr (name, ';', len) != NULL)
 | ||||||
|  |      return true; | ||||||
|  |    /* Skip temporary files created by dpkg.  */ | ||||||
|  | -  if (len > 4 && memcmp (name + len - 4, ".tmp", 4) == 0)
 | ||||||
|  | +  if (endswithn (name, len, ".dpkg-new")
 | ||||||
|  | +      || endswithn (name, len, ".dpkg-tmp"))
 | ||||||
|  |      return true; | ||||||
|  |    return false; | ||||||
|  |  } | ||||||
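With both commits applied, the skip logic reduces to a small, testable helper. The sketch below restates it outside ldconfig (prelink cases omitted for brevity; the harness is ours), with the corrected argument order memchr (ptr, byte, count):

#include <assert.h>
#include <stdbool.h>
#include <string.h>

/* Return true if the N bytes at NAME end with the characters of
   SUFFIX; NAME need not be null-terminated at NAME[N].  */
static inline bool
endswithn (const char *name, size_t n, const char *suffix)
{
  return (n >= strlen (suffix)
          && memcmp (name + n - strlen (suffix), suffix,
                     strlen (suffix)) == 0);
}

static bool
skip_temp_file (const char *name, size_t len)
{
  /* RPM temporary files contain ';'.  Note memchr (ptr, byte, count):
     the bug fixed above had the last two arguments swapped.  */
  if (memchr (name, ';', len) != NULL)
    return true;
  /* dpkg temporary files.  */
  return (endswithn (name, len, ".dpkg-new")
          || endswithn (name, len, ".dpkg-tmp"));
}

int
main (void)
{
  assert (skip_temp_file ("libc.so.6;67af2a4", 17));
  assert (skip_temp_file ("libfoo.so.1.dpkg-new", 20));
  assert (!skip_temp_file ("libfoo.so.1", 11));
  return 0;
}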
SOURCES/glibc-RHEL-15696-1.patch (new file, 259 lines)
							| @ -0,0 +1,259 @@ | |||||||
|  | From 97700a34f36721b11a754cf37a1cc40695ece1fd Mon Sep 17 00:00:00 2001 | ||||||
|  | From: "H.J. Lu" <hjl.tools@gmail.com> | ||||||
|  | Date: Mon, 21 Jan 2019 11:23:59 -0800 | ||||||
|  | Subject: [PATCH] x86-64 memchr/wmemchr: Properly handle the length parameter | ||||||
|  |  [BZ# 24097] | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | On x32, the size_t parameter may be passed in the lower 32 bits of a | ||||||
|  | 64-bit register with the non-zero upper 32 bits.  The string/memory | ||||||
|  | functions written in assembly can only use the lower 32 bits of a | ||||||
|  | 64-bit register as length or must clear the upper 32 bits before using | ||||||
|  | the full 64-bit register for length. | ||||||
|  | 
 | ||||||
|  | This patch fixes memchr/wmemchr for x32.  Tested on x86-64 and x32.  On | ||||||
|  | x86-64, libc.so is the same with and without the fix. | ||||||
|  | 
 | ||||||
|  | 	[BZ# 24097] | ||||||
|  | 	CVE-2019-6488 | ||||||
|  | 	* sysdeps/x86_64/memchr.S: Use RDX_LP for length.  Clear the | ||||||
|  | 	upper 32 bits of RDX register. | ||||||
|  | 	* sysdeps/x86_64/multiarch/memchr-avx2.S: Likewise. | ||||||
|  | 	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memchr and | ||||||
|  | 	tst-size_t-wmemchr. | ||||||
|  | 	* sysdeps/x86_64/x32/test-size_t.h: New file. | ||||||
|  | 	* sysdeps/x86_64/x32/tst-size_t-memchr.c: Likewise. | ||||||
|  | 	* sysdeps/x86_64/x32/tst-size_t-wmemchr.c: Likewise. | ||||||
|  | ---
 | ||||||
|  |  sysdeps/x86_64/memchr.S                 | 10 ++-- | ||||||
|  |  sysdeps/x86_64/multiarch/memchr-avx2.S  |  8 ++- | ||||||
|  |  sysdeps/x86_64/x32/Makefile             |  8 +++ | ||||||
|  |  sysdeps/x86_64/x32/test-size_t.h        | 35 ++++++++++++ | ||||||
|  |  sysdeps/x86_64/x32/tst-size_t-memchr.c  | 72 +++++++++++++++++++++++++ | ||||||
|  |  sysdeps/x86_64/x32/tst-size_t-wmemchr.c | 20 +++++++ | ||||||
|  |  6 files changed, 148 insertions(+), 5 deletions(-) | ||||||
|  |  create mode 100644 sysdeps/x86_64/x32/test-size_t.h | ||||||
|  |  create mode 100644 sysdeps/x86_64/x32/tst-size_t-memchr.c | ||||||
|  |  create mode 100644 sysdeps/x86_64/x32/tst-size_t-wmemchr.c | ||||||
|  | 
 | ||||||
|  | Conflicts: | ||||||
|  | 	ChangeLog | ||||||
|  | 	(removed) | ||||||
|  | 	NEWS | ||||||
|  | 	(removed) | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
 | ||||||
|  | index feef5d4f..cb320257 100644
 | ||||||
|  | --- a/sysdeps/x86_64/memchr.S
 | ||||||
|  | +++ b/sysdeps/x86_64/memchr.S
 | ||||||
|  | @@ -34,12 +34,16 @@ ENTRY(MEMCHR)
 | ||||||
|  |  	mov	%edi, %ecx | ||||||
|  |   | ||||||
|  |  #ifdef USE_AS_WMEMCHR | ||||||
|  | -	test	%rdx, %rdx
 | ||||||
|  | +	test	%RDX_LP, %RDX_LP
 | ||||||
|  |  	jz	L(return_null) | ||||||
|  | -	shl	$2, %rdx
 | ||||||
|  | +	shl	$2, %RDX_LP
 | ||||||
|  |  #else | ||||||
|  | +# ifdef __ILP32__
 | ||||||
|  | +	/* Clear the upper 32 bits.  */
 | ||||||
|  | +	movl	%edx, %edx
 | ||||||
|  | +# endif
 | ||||||
|  |  	punpcklbw %xmm1, %xmm1 | ||||||
|  | -	test	%rdx, %rdx
 | ||||||
|  | +	test	%RDX_LP, %RDX_LP
 | ||||||
|  |  	jz	L(return_null) | ||||||
|  |  	punpcklbw %xmm1, %xmm1 | ||||||
|  |  #endif | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
 | ||||||
|  | index 5f5e7725..c81da19b 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/memchr-avx2.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
 | ||||||
|  | @@ -40,16 +40,20 @@
 | ||||||
|  |  ENTRY (MEMCHR) | ||||||
|  |  # ifndef USE_AS_RAWMEMCHR | ||||||
|  |  	/* Check for zero length.  */ | ||||||
|  | -	testq	%rdx, %rdx
 | ||||||
|  | +	test	%RDX_LP, %RDX_LP
 | ||||||
|  |  	jz	L(null) | ||||||
|  |  # endif | ||||||
|  |  	movl	%edi, %ecx | ||||||
|  |  	/* Broadcast CHAR to YMM0.  */ | ||||||
|  |  	vmovd	%esi, %xmm0 | ||||||
|  |  # ifdef USE_AS_WMEMCHR | ||||||
|  | -	shl	$2, %rdx
 | ||||||
|  | +	shl	$2, %RDX_LP
 | ||||||
|  |  	vpbroadcastd %xmm0, %ymm0 | ||||||
|  |  # else | ||||||
|  | +#  ifdef __ILP32__
 | ||||||
|  | +	/* Clear the upper 32 bits.  */
 | ||||||
|  | +	movl	%edx, %edx
 | ||||||
|  | +#  endif
 | ||||||
|  |  	vpbroadcastb %xmm0, %ymm0 | ||||||
|  |  # endif | ||||||
|  |  	/* Check if we may cross page boundary with one vector load.  */ | ||||||
|  | diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
 | ||||||
|  | index f2ebc24f..7d528889 100644
 | ||||||
|  | --- a/sysdeps/x86_64/x32/Makefile
 | ||||||
|  | +++ b/sysdeps/x86_64/x32/Makefile
 | ||||||
|  | @@ -4,3 +4,11 @@ ifeq ($(subdir),math)
 | ||||||
|  |  # 64-bit llround.  Add -fno-builtin-lround to silence the compiler. | ||||||
|  |  CFLAGS-s_llround.c += -fno-builtin-lround | ||||||
|  |  endif | ||||||
|  | +
 | ||||||
|  | +ifeq ($(subdir),string)
 | ||||||
|  | +tests += tst-size_t-memchr
 | ||||||
|  | +endif
 | ||||||
|  | +
 | ||||||
|  | +ifeq ($(subdir),wcsmbs)
 | ||||||
|  | +tests += tst-size_t-wmemchr
 | ||||||
|  | +endif
 | ||||||
|  | diff --git a/sysdeps/x86_64/x32/test-size_t.h b/sysdeps/x86_64/x32/test-size_t.h
 | ||||||
|  | new file mode 100644 | ||||||
|  | index 00000000..78a94086
 | ||||||
|  | --- /dev/null
 | ||||||
|  | +++ b/sysdeps/x86_64/x32/test-size_t.h
 | ||||||
|  | @@ -0,0 +1,35 @@
 | ||||||
|  | +/* Test string/memory functions with size_t in the lower 32 bits of
 | ||||||
|  | +   64-bit register.
 | ||||||
|  | +   Copyright (C) 2019 Free Software Foundation, Inc.
 | ||||||
|  | +   This file is part of the GNU C Library.
 | ||||||
|  | +
 | ||||||
|  | +   The GNU C Library is free software; you can redistribute it and/or
 | ||||||
|  | +   modify it under the terms of the GNU Lesser General Public
 | ||||||
|  | +   License as published by the Free Software Foundation; either
 | ||||||
|  | +   version 2.1 of the License, or (at your option) any later version.
 | ||||||
|  | +
 | ||||||
|  | +   The GNU C Library is distributed in the hope that it will be useful,
 | ||||||
|  | +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 | ||||||
|  | +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 | ||||||
|  | +   Lesser General Public License for more details.
 | ||||||
|  | +
 | ||||||
|  | +   You should have received a copy of the GNU Lesser General Public
 | ||||||
|  | +   License along with the GNU C Library; if not, see
 | ||||||
|  | +   <http://www.gnu.org/licenses/>.  */
 | ||||||
|  | +
 | ||||||
|  | +#define TEST_MAIN
 | ||||||
|  | +#include <string/test-string.h>
 | ||||||
|  | +
 | ||||||
|  | +/* On x32, parameter_t may be passed in a 64-bit register with the LEN
 | ||||||
|  | +   field in the lower 32 bits.  When the LEN field of 64-bit register
 | ||||||
|  | +   is passed to string/memory function as the size_t parameter, only
 | ||||||
|  | +   the lower 32 bits can be used.  */
 | ||||||
|  | +typedef struct
 | ||||||
|  | +{
 | ||||||
|  | +  union
 | ||||||
|  | +    {
 | ||||||
|  | +      size_t len;
 | ||||||
|  | +      void (*fn) (void);
 | ||||||
|  | +    };
 | ||||||
|  | +  void *p;
 | ||||||
|  | +} parameter_t;
 | ||||||
|  | diff --git a/sysdeps/x86_64/x32/tst-size_t-memchr.c b/sysdeps/x86_64/x32/tst-size_t-memchr.c
 | ||||||
|  | new file mode 100644 | ||||||
|  | index 00000000..29a3daf1
 | ||||||
|  | --- /dev/null
 | ||||||
|  | +++ b/sysdeps/x86_64/x32/tst-size_t-memchr.c
 | ||||||
|  | @@ -0,0 +1,72 @@
 | ||||||
|  | +/* Test memchr with size_t in the lower 32 bits of 64-bit register.
 | ||||||
|  | +   Copyright (C) 2019 Free Software Foundation, Inc.
 | ||||||
|  | +   This file is part of the GNU C Library.
 | ||||||
|  | +
 | ||||||
|  | +   The GNU C Library is free software; you can redistribute it and/or
 | ||||||
|  | +   modify it under the terms of the GNU Lesser General Public
 | ||||||
|  | +   License as published by the Free Software Foundation; either
 | ||||||
|  | +   version 2.1 of the License, or (at your option) any later version.
 | ||||||
|  | +
 | ||||||
|  | +   The GNU C Library is distributed in the hope that it will be useful,
 | ||||||
|  | +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 | ||||||
|  | +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 | ||||||
|  | +   Lesser General Public License for more details.
 | ||||||
|  | +
 | ||||||
|  | +   You should have received a copy of the GNU Lesser General Public
 | ||||||
|  | +   License along with the GNU C Library; if not, see
 | ||||||
|  | +   <http://www.gnu.org/licenses/>.  */
 | ||||||
|  | +
 | ||||||
|  | +#ifndef WIDE
 | ||||||
|  | +# define TEST_NAME "memchr"
 | ||||||
|  | +#else
 | ||||||
|  | +# define TEST_NAME "wmemchr"
 | ||||||
|  | +#endif /* WIDE */
 | ||||||
|  | +#include "test-size_t.h"
 | ||||||
|  | +
 | ||||||
|  | +#ifndef WIDE
 | ||||||
|  | +# define MEMCHR memchr
 | ||||||
|  | +# define CHAR char
 | ||||||
|  | +# define UCHAR unsigned char
 | ||||||
|  | +#else
 | ||||||
|  | +# include <wchar.h>
 | ||||||
|  | +# define MEMCHR wmemchr
 | ||||||
|  | +# define CHAR wchar_t
 | ||||||
|  | +# define UCHAR wchar_t
 | ||||||
|  | +#endif /* WIDE */
 | ||||||
|  | +
 | ||||||
|  | +IMPL (MEMCHR, 1)
 | ||||||
|  | +
 | ||||||
|  | +typedef CHAR * (*proto_t) (const CHAR*, int, size_t);
 | ||||||
|  | +
 | ||||||
|  | +static CHAR *
 | ||||||
|  | +__attribute__ ((noinline, noclone))
 | ||||||
|  | +do_memchr (parameter_t a, parameter_t b)
 | ||||||
|  | +{
 | ||||||
|  | +  return CALL (&b, a.p, (uintptr_t) b.p, a.len);
 | ||||||
|  | +}
 | ||||||
|  | +
 | ||||||
|  | +static int
 | ||||||
|  | +test_main (void)
 | ||||||
|  | +{
 | ||||||
|  | +  test_init ();
 | ||||||
|  | +
 | ||||||
|  | +  parameter_t src = { { page_size / sizeof (CHAR) }, buf2 };
 | ||||||
|  | +  parameter_t c = { { 0 }, (void *) (uintptr_t) 0x12 };
 | ||||||
|  | +
 | ||||||
|  | +  int ret = 0;
 | ||||||
|  | +  FOR_EACH_IMPL (impl, 0)
 | ||||||
|  | +    {
 | ||||||
|  | +      c.fn = impl->fn;
 | ||||||
|  | +      CHAR *res = do_memchr (src, c);
 | ||||||
|  | +      if (res)
 | ||||||
|  | +	{
 | ||||||
|  | +	  error (0, 0, "Wrong result in function %s: %p != NULL",
 | ||||||
|  | +		 impl->name, res);
 | ||||||
|  | +	  ret = 1;
 | ||||||
|  | +	}
 | ||||||
|  | +    }
 | ||||||
|  | +
 | ||||||
|  | +  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
 | ||||||
|  | +}
 | ||||||
|  | +
 | ||||||
|  | +#include <support/test-driver.c>
 | ||||||
|  | diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemchr.c b/sysdeps/x86_64/x32/tst-size_t-wmemchr.c
 | ||||||
|  | new file mode 100644 | ||||||
|  | index 00000000..877801d6
 | ||||||
|  | --- /dev/null
 | ||||||
|  | +++ b/sysdeps/x86_64/x32/tst-size_t-wmemchr.c
 | ||||||
|  | @@ -0,0 +1,20 @@
 | ||||||
|  | +/* Test wmemchr with size_t in the lower 32 bits of 64-bit register.
 | ||||||
|  | +   Copyright (C) 2019 Free Software Foundation, Inc.
 | ||||||
|  | +   This file is part of the GNU C Library.
 | ||||||
|  | +
 | ||||||
|  | +   The GNU C Library is free software; you can redistribute it and/or
 | ||||||
|  | +   modify it under the terms of the GNU Lesser General Public
 | ||||||
|  | +   License as published by the Free Software Foundation; either
 | ||||||
|  | +   version 2.1 of the License, or (at your option) any later version.
 | ||||||
|  | +
 | ||||||
|  | +   The GNU C Library is distributed in the hope that it will be useful,
 | ||||||
|  | +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 | ||||||
|  | +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 | ||||||
|  | +   Lesser General Public License for more details.
 | ||||||
|  | +
 | ||||||
|  | +   You should have received a copy of the GNU Lesser General Public
 | ||||||
|  | +   License along with the GNU C Library; if not, see
 | ||||||
|  | +   <http://www.gnu.org/licenses/>.  */
 | ||||||
|  | +
 | ||||||
|  | +#define WIDE 1
 | ||||||
|  | +#include "tst-size_t-memchr.c"
 | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
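The test's parameter_t union is the interesting part. On x32 the whole 8-byte struct travels in a single 64-bit register, len in the low half and the pointer p in the high half, so an implementation that reads the full register as the length sees an enormous value. A condensed sketch of the same setup; it compiles and runs anywhere, but only exercises the bug on x32:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef struct
{
  union
  {
    size_t len;               /* Low 32 bits of the register on x32.  */
    void (*fn) (void);
  };
  void *p;                    /* High 32 bits of the register on x32.  */
} parameter_t;

static char *
__attribute__ ((noinline, noclone))
do_memchr (parameter_t a, parameter_t b)
{
  /* A correct memchr must use only the 32 significant bits of
     a.len on x32, despite garbage above them.  */
  return memchr (a.p, (uintptr_t) b.p, a.len);
}

int
main (void)
{
  static char buf[4096];      /* Contains no 0x12 byte.  */
  parameter_t src = { { sizeof buf }, buf };
  parameter_t c = { { 0 }, (void *) (uintptr_t) 0x12 };
  char *res = do_memchr (src, c);
  puts (res == NULL ? "PASS" : "FAIL");
  return res != NULL;
}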
SOURCES/glibc-RHEL-15696-10.patch (new file, 41 lines)
							| @ -0,0 +1,41 @@ | |||||||
|  | From ddf0992cf57a93200e0c782e2a94d0733a5a0b87 Mon Sep 17 00:00:00 2001 | ||||||
|  | From: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  | Date: Sun, 9 Jan 2022 16:02:21 -0600 | ||||||
|  | Subject: [PATCH] x86: Fix __wcsncmp_avx2 in strcmp-avx2.S [BZ# 28755] | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | Fixes [BZ# 28755] for wcsncmp by redirecting length >= 2^56 to | ||||||
|  | __wcscmp_avx2. For x86_64 this covers the entire address range so any | ||||||
|  | length larger could not possibly be used to bound `s1` or `s2`. | ||||||
|  | 
 | ||||||
|  | test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass. | ||||||
|  | 
 | ||||||
|  | Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  | ---
 | ||||||
|  |  sysdeps/x86_64/multiarch/strcmp-avx2.S | 10 ++++++++++ | ||||||
|  |  1 file changed, 10 insertions(+) | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
 | ||||||
|  | index 156c1949..8fb8eedc 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
 | ||||||
|  | @@ -83,6 +83,16 @@ ENTRY (STRCMP)
 | ||||||
|  |  	je	L(char0) | ||||||
|  |  	jb	L(zero) | ||||||
|  |  #  ifdef USE_AS_WCSCMP | ||||||
|  | +#  ifndef __ILP32__
 | ||||||
|  | +	movq	%rdx, %rcx
 | ||||||
|  | +	/* Check if length could overflow when multiplied by
 | ||||||
|  | +	   sizeof(wchar_t). Checking top 8 bits will cover all potential
 | ||||||
|  | +	   overflow cases as well as redirect cases where it is impossible for
 | ||||||
|  | +	   the length to bound a valid memory region. In these cases just use
 | ||||||
|  | +	   'wcscmp'.  */
 | ||||||
|  | +	shrq	$56, %rcx
 | ||||||
|  | +	jnz	__wcscmp_avx2
 | ||||||
|  | +#  endif
 | ||||||
|  |  	/* Convert units: from wide to byte char.  */ | ||||||
|  |  	shl	$2, %RDX_LP | ||||||
|  |  #  endif | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
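The guard is sound because on x86-64 a length with any of the top eight bits set cannot bound a real buffer: multiplying it by sizeof (wchar_t) would exceed the address space, so the strings must be terminated by a null wide character inside the range and an unbounded wcscmp gives the same answer. The equivalent check in C (my_wcsncmp is an illustrative wrapper, not the glibc entry point):

#include <wchar.h>

int
my_wcsncmp (const wchar_t *s1, const wchar_t *s2, size_t n)
{
  /* Lengths >= 2^56 cannot describe valid memory once scaled by
     sizeof (wchar_t), so the bound is irrelevant: fall back to the
     unbounded comparison, as the patch does with __wcscmp_avx2.  */
  if ((n >> 56) != 0)
    return wcscmp (s1, s2);
  return wcsncmp (s1, s2, n);
}

int
main (void)
{
  return my_wcsncmp (L"abc", L"abd", (size_t) 1 << 60) < 0 ? 0 : 1;
}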
SOURCES/glibc-RHEL-15696-100.patch (new file, 257 lines)
							| @ -0,0 +1,257 @@ | |||||||
|  | From 244b415d386487521882debb845a040a4758cb18 Mon Sep 17 00:00:00 2001 | ||||||
|  | From: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  | Date: Fri, 25 Mar 2022 17:13:33 -0500 | ||||||
|  | Subject: [PATCH] x86: Small improvements for wcslen | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | Just a few QOL changes. | ||||||
|  |     1. Prefer `add` over `lea` as there are more execution units it | ||||||
|  |        can run on. | ||||||
|  |     2. Don't break macro-fusion between `test` and `jcc` | ||||||
|  |     3. Reduce code size by removing gratuitous padding bytes (-90 | ||||||
|  |        bytes). | ||||||
|  | 
 | ||||||
|  | geometric_mean(N=20) of all benchmarks New / Original: 0.959 | ||||||
|  | 
 | ||||||
|  | All string/memory tests pass. | ||||||
|  | Reviewed-by: H.J. Lu <hjl.tools@gmail.com> | ||||||
|  | ---
 | ||||||
|  |  sysdeps/x86_64/wcslen.S | 86 ++++++++++++++++++++--------------------- | ||||||
|  |  1 file changed, 41 insertions(+), 45 deletions(-) | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86_64/wcslen.S b/sysdeps/x86_64/wcslen.S
 | ||||||
|  | index 9f5f7232..254bb030 100644
 | ||||||
|  | --- a/sysdeps/x86_64/wcslen.S
 | ||||||
|  | +++ b/sysdeps/x86_64/wcslen.S
 | ||||||
|  | @@ -41,82 +41,82 @@ ENTRY (__wcslen)
 | ||||||
|  |  	pxor	%xmm0, %xmm0 | ||||||
|  |   | ||||||
|  |  	lea	32(%rdi), %rax | ||||||
|  | -	lea	16(%rdi), %rcx
 | ||||||
|  | +	addq	$16, %rdi
 | ||||||
|  |  	and	$-16, %rax | ||||||
|  |   | ||||||
|  |  	pcmpeqd	(%rax), %xmm0 | ||||||
|  |  	pmovmskb %xmm0, %edx | ||||||
|  |  	pxor	%xmm1, %xmm1 | ||||||
|  | +	addq	$16, %rax
 | ||||||
|  |  	test	%edx, %edx | ||||||
|  | -	lea	16(%rax), %rax
 | ||||||
|  |  	jnz	L(exit) | ||||||
|  |   | ||||||
|  |  	pcmpeqd	(%rax), %xmm1 | ||||||
|  |  	pmovmskb %xmm1, %edx | ||||||
|  |  	pxor	%xmm2, %xmm2 | ||||||
|  | +	addq	$16, %rax
 | ||||||
|  |  	test	%edx, %edx | ||||||
|  | -	lea	16(%rax), %rax
 | ||||||
|  |  	jnz	L(exit) | ||||||
|  |   | ||||||
|  |  	pcmpeqd	(%rax), %xmm2 | ||||||
|  |  	pmovmskb %xmm2, %edx | ||||||
|  |  	pxor	%xmm3, %xmm3 | ||||||
|  | +	addq	$16, %rax
 | ||||||
|  |  	test	%edx, %edx | ||||||
|  | -	lea	16(%rax), %rax
 | ||||||
|  |  	jnz	L(exit) | ||||||
|  |   | ||||||
|  |  	pcmpeqd	(%rax), %xmm3 | ||||||
|  |  	pmovmskb %xmm3, %edx | ||||||
|  | +	addq	$16, %rax
 | ||||||
|  |  	test	%edx, %edx | ||||||
|  | -	lea	16(%rax), %rax
 | ||||||
|  |  	jnz	L(exit) | ||||||
|  |   | ||||||
|  |  	pcmpeqd	(%rax), %xmm0 | ||||||
|  |  	pmovmskb %xmm0, %edx | ||||||
|  | +	addq	$16, %rax
 | ||||||
|  |  	test	%edx, %edx | ||||||
|  | -	lea	16(%rax), %rax
 | ||||||
|  |  	jnz	L(exit) | ||||||
|  |   | ||||||
|  |  	pcmpeqd	(%rax), %xmm1 | ||||||
|  |  	pmovmskb %xmm1, %edx | ||||||
|  | +	addq	$16, %rax
 | ||||||
|  |  	test	%edx, %edx | ||||||
|  | -	lea	16(%rax), %rax
 | ||||||
|  |  	jnz	L(exit) | ||||||
|  |   | ||||||
|  |  	pcmpeqd	(%rax), %xmm2 | ||||||
|  |  	pmovmskb %xmm2, %edx | ||||||
|  | +	addq	$16, %rax
 | ||||||
|  |  	test	%edx, %edx | ||||||
|  | -	lea	16(%rax), %rax
 | ||||||
|  |  	jnz	L(exit) | ||||||
|  |   | ||||||
|  |  	pcmpeqd	(%rax), %xmm3 | ||||||
|  |  	pmovmskb %xmm3, %edx | ||||||
|  | +	addq	$16, %rax
 | ||||||
|  |  	test	%edx, %edx | ||||||
|  | -	lea	16(%rax), %rax
 | ||||||
|  |  	jnz	L(exit) | ||||||
|  |   | ||||||
|  |  	pcmpeqd	(%rax), %xmm0 | ||||||
|  |  	pmovmskb %xmm0, %edx | ||||||
|  | +	addq	$16, %rax
 | ||||||
|  |  	test	%edx, %edx | ||||||
|  | -	lea	16(%rax), %rax
 | ||||||
|  |  	jnz	L(exit) | ||||||
|  |   | ||||||
|  |  	pcmpeqd	(%rax), %xmm1 | ||||||
|  |  	pmovmskb %xmm1, %edx | ||||||
|  | +	addq	$16, %rax
 | ||||||
|  |  	test	%edx, %edx | ||||||
|  | -	lea	16(%rax), %rax
 | ||||||
|  |  	jnz	L(exit) | ||||||
|  |   | ||||||
|  |  	pcmpeqd	(%rax), %xmm2 | ||||||
|  |  	pmovmskb %xmm2, %edx | ||||||
|  | +	addq	$16, %rax
 | ||||||
|  |  	test	%edx, %edx | ||||||
|  | -	lea	16(%rax), %rax
 | ||||||
|  |  	jnz	L(exit) | ||||||
|  |   | ||||||
|  |  	pcmpeqd	(%rax), %xmm3 | ||||||
|  |  	pmovmskb %xmm3, %edx | ||||||
|  | +	addq	$16, %rax
 | ||||||
|  |  	test	%edx, %edx | ||||||
|  | -	lea	16(%rax), %rax
 | ||||||
|  |  	jnz	L(exit) | ||||||
|  |   | ||||||
|  |  	and	$-0x40, %rax | ||||||
|  | @@ -133,104 +133,100 @@ L(aligned_64_loop):
 | ||||||
|  |  	pminub	%xmm0, %xmm2 | ||||||
|  |  	pcmpeqd	%xmm3, %xmm2 | ||||||
|  |  	pmovmskb %xmm2, %edx | ||||||
|  | +	addq	$64, %rax
 | ||||||
|  |  	test	%edx, %edx | ||||||
|  | -	lea	64(%rax), %rax
 | ||||||
|  |  	jz	L(aligned_64_loop) | ||||||
|  |   | ||||||
|  |  	pcmpeqd	-64(%rax), %xmm3 | ||||||
|  |  	pmovmskb %xmm3, %edx | ||||||
|  | +    addq	$48, %rdi
 | ||||||
|  |  	test	%edx, %edx | ||||||
|  | -	lea	48(%rcx), %rcx
 | ||||||
|  |  	jnz	L(exit) | ||||||
|  |   | ||||||
|  |  	pcmpeqd	%xmm1, %xmm3 | ||||||
|  |  	pmovmskb %xmm3, %edx | ||||||
|  | +    addq	$-16, %rdi
 | ||||||
|  |  	test	%edx, %edx | ||||||
|  | -	lea	-16(%rcx), %rcx
 | ||||||
|  |  	jnz	L(exit) | ||||||
|  |   | ||||||
|  |  	pcmpeqd	-32(%rax), %xmm3 | ||||||
|  |  	pmovmskb %xmm3, %edx | ||||||
|  | +    addq	$-16, %rdi
 | ||||||
|  |  	test	%edx, %edx | ||||||
|  | -	lea	-16(%rcx), %rcx
 | ||||||
|  |  	jnz	L(exit) | ||||||
|  |   | ||||||
|  |  	pcmpeqd	%xmm6, %xmm3 | ||||||
|  |  	pmovmskb %xmm3, %edx | ||||||
|  | +    addq	$-16, %rdi
 | ||||||
|  |  	test	%edx, %edx | ||||||
|  | -	lea	-16(%rcx), %rcx
 | ||||||
|  | -	jnz	L(exit)
 | ||||||
|  | -
 | ||||||
|  | -	jmp	L(aligned_64_loop)
 | ||||||
|  | +	jz	L(aligned_64_loop)
 | ||||||
|  |   | ||||||
|  |  	.p2align 4 | ||||||
|  |  L(exit): | ||||||
|  | -	sub	%rcx, %rax
 | ||||||
|  | +	sub	%rdi, %rax
 | ||||||
|  |  	shr	$2, %rax | ||||||
|  |  	test	%dl, %dl | ||||||
|  |  	jz	L(exit_high) | ||||||
|  |   | ||||||
|  | -	mov	%dl, %cl
 | ||||||
|  | -	and	$15, %cl
 | ||||||
|  | +	andl	$15, %edx
 | ||||||
|  |  	jz	L(exit_1) | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  | -	.p2align 4
 | ||||||
|  | +	/* No align here. Naturally aligned % 16 == 1.  */
 | ||||||
|  |  L(exit_high): | ||||||
|  | -	mov	%dh, %ch
 | ||||||
|  | -	and	$15, %ch
 | ||||||
|  | +	andl	$(15 << 8), %edx
 | ||||||
|  |  	jz	L(exit_3) | ||||||
|  |  	add	$2, %rax | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  | -	.p2align 4
 | ||||||
|  | +	.p2align 3
 | ||||||
|  |  L(exit_1): | ||||||
|  |  	add	$1, %rax | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  | -	.p2align 4
 | ||||||
|  | +	.p2align 3
 | ||||||
|  |  L(exit_3): | ||||||
|  |  	add	$3, %rax | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  | -	.p2align 4
 | ||||||
|  | +	.p2align 3
 | ||||||
|  |  L(exit_tail0): | ||||||
|  | -	xor	%rax, %rax
 | ||||||
|  | +	xorl	%eax, %eax
 | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  | -	.p2align 4
 | ||||||
|  | +	.p2align 3
 | ||||||
|  |  L(exit_tail1): | ||||||
|  | -	mov	$1, %rax
 | ||||||
|  | +	movl	$1, %eax
 | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  | -	.p2align 4
 | ||||||
|  | +	.p2align 3
 | ||||||
|  |  L(exit_tail2): | ||||||
|  | -	mov	$2, %rax
 | ||||||
|  | +	movl	$2, %eax
 | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  | -	.p2align 4
 | ||||||
|  | +	.p2align 3
 | ||||||
|  |  L(exit_tail3): | ||||||
|  | -	mov	$3, %rax
 | ||||||
|  | +	movl	$3, %eax
 | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  | -	.p2align 4
 | ||||||
|  | +	.p2align 3
 | ||||||
|  |  L(exit_tail4): | ||||||
|  | -	mov	$4, %rax
 | ||||||
|  | +	movl	$4, %eax
 | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  | -	.p2align 4
 | ||||||
|  | +	.p2align 3
 | ||||||
|  |  L(exit_tail5): | ||||||
|  | -	mov	$5, %rax
 | ||||||
|  | +	movl	$5, %eax
 | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  | -	.p2align 4
 | ||||||
|  | +	.p2align 3
 | ||||||
|  |  L(exit_tail6): | ||||||
|  | -	mov	$6, %rax
 | ||||||
|  | +	movl	$6, %eax
 | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  | -	.p2align 4
 | ||||||
|  | +	.p2align 3
 | ||||||
|  |  L(exit_tail7): | ||||||
|  | -	mov	$7, %rax
 | ||||||
|  | +	movl	$7, %eax
 | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  |  END (__wcslen) | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
SOURCES/glibc-RHEL-15696-101.patch (new file, 964 lines)
							| @ -0,0 +1,964 @@ | |||||||
|  | From 7cbc03d03091d5664060924789afe46d30a5477e Mon Sep 17 00:00:00 2001 | ||||||
|  | From: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  | Date: Fri, 15 Apr 2022 12:28:00 -0500 | ||||||
|  | Subject: [PATCH] x86: Remove memcmp-sse4.S | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | Code didn't actually use any sse4 instructions since `ptest` was | ||||||
|  | removed in: | ||||||
|  | 
 | ||||||
|  | commit 2f9062d7171850451e6044ef78d91ff8c017b9c0 | ||||||
|  | Author: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  | Date:   Wed Nov 10 16:18:56 2021 -0600 | ||||||
|  | 
 | ||||||
|  |     x86: Shrink memcmp-sse4.S code size | ||||||
|  | 
 | ||||||
|  | The new memcmp-sse2 implementation is also faster. | ||||||
|  | 
 | ||||||
|  | geometric_mean(N=20) of page cross cases SSE2 / SSE4: 0.905 | ||||||
|  | 
 | ||||||
|  | Note there are two regressions preferring SSE2 for Size = 1 and Size = | ||||||
|  | 65. | ||||||
|  | 
 | ||||||
|  | Size = 1: | ||||||
|  | size, align0, align1, ret, New Time/Old Time | ||||||
|  |    1,      1,      1,   0,               1.2 | ||||||
|  |    1,      1,      1,   1,             1.197 | ||||||
|  |    1,      1,      1,  -1,               1.2 | ||||||
|  | 
 | ||||||
|  | This is intentional. Size == 1 is significantly less hot based on | ||||||
|  | profiles of GCC11 and Python3 than sizes [4, 8] (which are made | ||||||
|  | hotter). | ||||||
|  | 
 | ||||||
|  | Python3 Size = 1        -> 13.64% | ||||||
|  | Python3 Size = [4, 8]   -> 60.92% | ||||||
|  | 
 | ||||||
|  | GCC11   Size = 1        ->  1.29% | ||||||
|  | GCC11   Size = [4, 8]   -> 33.86% | ||||||
|  | 
 | ||||||
|  | size, align0, align1, ret, New Time/Old Time | ||||||
|  |    4,      4,      4,   0,             0.622 | ||||||
|  |    4,      4,      4,   1,             0.797 | ||||||
|  |    4,      4,      4,  -1,             0.805 | ||||||
|  |    5,      5,      5,   0,             0.623 | ||||||
|  |    5,      5,      5,   1,             0.777 | ||||||
|  |    5,      5,      5,  -1,             0.802 | ||||||
|  |    6,      6,      6,   0,             0.625 | ||||||
|  |    6,      6,      6,   1,             0.813 | ||||||
|  |    6,      6,      6,  -1,             0.788 | ||||||
|  |    7,      7,      7,   0,             0.625 | ||||||
|  |    7,      7,      7,   1,             0.799 | ||||||
|  |    7,      7,      7,  -1,             0.795 | ||||||
|  |    8,      8,      8,   0,             0.625 | ||||||
|  |    8,      8,      8,   1,             0.848 | ||||||
|  |    8,      8,      8,  -1,             0.914 | ||||||
|  |    9,      9,      9,   0,             0.625 | ||||||
|  | 
 | ||||||
|  | Size = 65: | ||||||
|  | size, align0, align1, ret, New Time/Old Time | ||||||
|  |   65,      0,      0,   0,             1.103 | ||||||
|  |   65,      0,      0,   1,             1.216 | ||||||
|  |   65,      0,      0,  -1,             1.227 | ||||||
|  |   65,     65,      0,   0,             1.091 | ||||||
|  |   65,      0,     65,   1,              1.19 | ||||||
|  |   65,     65,     65,  -1,             1.215 | ||||||
|  | 
 | ||||||
|  | This is because A) the checks in the range [65, 96] are now unrolled 2x | ||||||
|  | and B) smaller values <= 16 are now given a hotter path. By | ||||||
|  | contrast, the SSE4 version has a branch for Size = 80. The unrolled | ||||||
|  | version gets better performance for returns which need both | ||||||
|  | comparisons. | ||||||
|  | 
 | ||||||
|  | size, align0, align1, ret, New Time/Old Time | ||||||
|  |  128,      4,      8,   0,             0.858 | ||||||
|  |  128,      4,      8,   1,             0.879 | ||||||
|  |  128,      4,      8,  -1,             0.888 | ||||||
|  | 
 | ||||||
|  | As well, outside of microbenchmark environments, where branches are | ||||||
|  | not fully predictable, the branch will have a real cost. | ||||||
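|  |  | ||||||
|  | As a rough C sketch of what the hot [4, 8] byte path boils down to | ||||||
|  | (illustrative only, not the glibc source; assumes a little-endian | ||||||
|  | host and the GCC/Clang __builtin_bswap32 builtin): | ||||||
|  |  | ||||||
|  |     #include <stdint.h> | ||||||
|  |     #include <string.h> | ||||||
|  |  | ||||||
|  |     /* Branchless compare for 4 <= n <= 8: two overlapping 4-byte | ||||||
|  |        loads, byte-swapped so lexicographic order equals integer | ||||||
|  |        order, merged into one 64-bit comparison.  */ | ||||||
|  |     static int | ||||||
|  |     memcmp_4to8 (const unsigned char *a, const unsigned char *b, | ||||||
|  |                  size_t n) | ||||||
|  |     { | ||||||
|  |       uint32_t a0, b0, a1, b1; | ||||||
|  |       memcpy (&a0, a, 4);          /* first four bytes */ | ||||||
|  |       memcpy (&b0, b, 4); | ||||||
|  |       memcpy (&a1, a + n - 4, 4);  /* last four bytes, may overlap */ | ||||||
|  |       memcpy (&b1, b + n - 4, 4); | ||||||
|  |       uint64_t x = ((uint64_t) __builtin_bswap32 (a0) << 32) | ||||||
|  |                    | __builtin_bswap32 (a1); | ||||||
|  |       uint64_t y = ((uint64_t) __builtin_bswap32 (b0) << 32) | ||||||
|  |                    | __builtin_bswap32 (b1); | ||||||
|  |       return (x > y) - (x < y);    /* sign of the first difference */ | ||||||
|  |     } | ||||||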
|  | Reviewed-by: H.J. Lu <hjl.tools@gmail.com> | ||||||
|  | ---
 | ||||||
|  |  sysdeps/x86_64/multiarch/Makefile          |   2 - | ||||||
|  |  sysdeps/x86_64/multiarch/ifunc-impl-list.c |   4 - | ||||||
|  |  sysdeps/x86_64/multiarch/ifunc-memcmp.h    |   4 - | ||||||
|  |  sysdeps/x86_64/multiarch/memcmp-sse4.S     | 804 --------------------- | ||||||
|  |  4 files changed, 814 deletions(-) | ||||||
|  |  delete mode 100644 sysdeps/x86_64/multiarch/memcmp-sse4.S | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
 | ||||||
|  | index bca82e38..b503e4b8 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/Makefile
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/Makefile
 | ||||||
|  | @@ -11,7 +11,6 @@ sysdep_routines += \
 | ||||||
|  |    memcmp-avx2-movbe-rtm \ | ||||||
|  |    memcmp-evex-movbe \ | ||||||
|  |    memcmp-sse2 \ | ||||||
|  | -  memcmp-sse4 \
 | ||||||
|  |    memcmp-ssse3 \ | ||||||
|  |    memcpy-ssse3 \ | ||||||
|  |    memcpy-ssse3-back \ | ||||||
|  | @@ -174,7 +173,6 @@ sysdep_routines += \
 | ||||||
|  |    wmemcmp-avx2-movbe-rtm \ | ||||||
|  |    wmemcmp-c \ | ||||||
|  |    wmemcmp-evex-movbe \ | ||||||
|  | -  wmemcmp-sse4 \
 | ||||||
|  |    wmemcmp-ssse3 \ | ||||||
|  |  # sysdep_routines | ||||||
|  |  endif | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 | ||||||
|  | index 14314367..450a2917 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 | ||||||
|  | @@ -78,8 +78,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 | ||||||
|  |  			       && CPU_FEATURE_USABLE (BMI2) | ||||||
|  |  			       && CPU_FEATURE_USABLE (MOVBE)), | ||||||
|  |  			      __memcmp_evex_movbe) | ||||||
|  | -	      IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1),
 | ||||||
|  | -			      __memcmp_sse4_1)
 | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3), | ||||||
|  |  			      __memcmp_ssse3) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2)) | ||||||
|  | @@ -824,8 +822,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 | ||||||
|  |  			       && CPU_FEATURE_USABLE (BMI2) | ||||||
|  |  			       && CPU_FEATURE_USABLE (MOVBE)), | ||||||
|  |  			      __wmemcmp_evex_movbe) | ||||||
|  | -	      IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1),
 | ||||||
|  | -			      __wmemcmp_sse4_1)
 | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3), | ||||||
|  |  			      __wmemcmp_ssse3) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2)) | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
 | ||||||
|  | index 690dffe8..0bc47a7f 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
 | ||||||
|  | @@ -21,7 +21,6 @@
 | ||||||
|  |   | ||||||
|  |  extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; | ||||||
|  |  extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; | ||||||
|  | -extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
 | ||||||
|  |  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden; | ||||||
|  |  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden; | ||||||
|  |  extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden; | ||||||
|  | @@ -47,9 +46,6 @@ IFUNC_SELECTOR (void)
 | ||||||
|  |  	return OPTIMIZE (avx2_movbe); | ||||||
|  |      } | ||||||
|  |   | ||||||
|  | -  if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
 | ||||||
|  | -    return OPTIMIZE (sse4_1);
 | ||||||
|  | -
 | ||||||
|  |    if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)) | ||||||
|  |      return OPTIMIZE (ssse3); | ||||||
|  |   | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
 | ||||||
|  | deleted file mode 100644 | ||||||
|  | index 50060006..00000000
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
 | ||||||
|  | +++ /dev/null
 | ||||||
|  | @@ -1,804 +0,0 @@
 | ||||||
|  | -/* memcmp with SSE4.1, wmemcmp with SSE4.1
 | ||||||
|  | -   Copyright (C) 2010-2018 Free Software Foundation, Inc.
 | ||||||
|  | -   Contributed by Intel Corporation.
 | ||||||
|  | -   This file is part of the GNU C Library.
 | ||||||
|  | -
 | ||||||
|  | -   The GNU C Library is free software; you can redistribute it and/or
 | ||||||
|  | -   modify it under the terms of the GNU Lesser General Public
 | ||||||
|  | -   License as published by the Free Software Foundation; either
 | ||||||
|  | -   version 2.1 of the License, or (at your option) any later version.
 | ||||||
|  | -
 | ||||||
|  | -   The GNU C Library is distributed in the hope that it will be useful,
 | ||||||
|  | -   but WITHOUT ANY WARRANTY; without even the implied warranty of
 | ||||||
|  | -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 | ||||||
|  | -   Lesser General Public License for more details.
 | ||||||
|  | -
 | ||||||
|  | -   You should have received a copy of the GNU Lesser General Public
 | ||||||
|  | -   License along with the GNU C Library; if not, see
 | ||||||
|  | -   <http://www.gnu.org/licenses/>.  */
 | ||||||
|  | -
 | ||||||
|  | -#if IS_IN (libc)
 | ||||||
|  | -
 | ||||||
|  | -# include <sysdep.h>
 | ||||||
|  | -
 | ||||||
|  | -# ifndef MEMCMP
 | ||||||
|  | -#  define MEMCMP	__memcmp_sse4_1
 | ||||||
|  | -# endif
 | ||||||
|  | -
 | ||||||
|  | -#ifdef USE_AS_WMEMCMP
 | ||||||
|  | -# define CMPEQ	pcmpeqd
 | ||||||
|  | -# define CHAR_SIZE	4
 | ||||||
|  | -#else
 | ||||||
|  | -# define CMPEQ	pcmpeqb
 | ||||||
|  | -# define CHAR_SIZE	1
 | ||||||
|  | -#endif
 | ||||||
|  | -
 | ||||||
|  | -
 | ||||||
|  | -/* Warning!
 | ||||||
|  | -           wmemcmp has to use SIGNED comparison for elements.
 | ||||||
|  | -           memcmp has to use UNSIGNED comparison for elemnts.
 | ||||||
|  | -*/
 | ||||||
|  | -
 | ||||||
|  | -	.section .text.sse4.1,"ax",@progbits
 | ||||||
|  | -ENTRY (MEMCMP)
 | ||||||
|  | -# ifdef USE_AS_WMEMCMP
 | ||||||
|  | -	shl	$2, %RDX_LP
 | ||||||
|  | -# elif defined __ILP32__
 | ||||||
|  | -	/* Clear the upper 32 bits.  */
 | ||||||
|  | -	mov	%edx, %edx
 | ||||||
|  | -# endif
 | ||||||
|  | -	cmp	$79, %RDX_LP
 | ||||||
|  | -	ja	L(79bytesormore)
 | ||||||
|  | -
 | ||||||
|  | -	cmp	$CHAR_SIZE, %RDX_LP
 | ||||||
|  | -	jbe	L(firstbyte)
 | ||||||
|  | -
 | ||||||
|  | -	/* N in (CHAR_SIZE, 79) bytes.  */
 | ||||||
|  | -	cmpl	$32, %edx
 | ||||||
|  | -	ja	L(more_32_bytes)
 | ||||||
|  | -
 | ||||||
|  | -	cmpl	$16, %edx
 | ||||||
|  | -	jae	L(16_to_32_bytes)
 | ||||||
|  | -
 | ||||||
|  | -# ifndef USE_AS_WMEMCMP
 | ||||||
|  | -	cmpl	$8, %edx
 | ||||||
|  | -	jae	L(8_to_16_bytes)
 | ||||||
|  | -
 | ||||||
|  | -	cmpl	$4, %edx
 | ||||||
|  | -	jb	L(2_to_3_bytes)
 | ||||||
|  | -
 | ||||||
|  | -	movl	(%rdi), %eax
 | ||||||
|  | -	movl	(%rsi), %ecx
 | ||||||
|  | -
 | ||||||
|  | -	bswap	%eax
 | ||||||
|  | -	bswap	%ecx
 | ||||||
|  | -
 | ||||||
|  | -	shlq	$32, %rax
 | ||||||
|  | -	shlq	$32, %rcx
 | ||||||
|  | -
 | ||||||
|  | -	movl	-4(%rdi, %rdx), %edi
 | ||||||
|  | -	movl	-4(%rsi, %rdx), %esi
 | ||||||
|  | -
 | ||||||
|  | -	bswap	%edi
 | ||||||
|  | -	bswap	%esi
 | ||||||
|  | -
 | ||||||
|  | -	orq	%rdi, %rax
 | ||||||
|  | -	orq	%rsi, %rcx
 | ||||||
|  | -	subq	%rcx, %rax
 | ||||||
|  | -	cmovne	%edx, %eax
 | ||||||
|  | -	sbbl	%ecx, %ecx
 | ||||||
|  | -	orl	%ecx, %eax
 | ||||||
|  | -	ret
 | ||||||
|  | -
 | ||||||
|  | -	.p2align 4,, 8
 | ||||||
|  | -L(2_to_3_bytes):
 | ||||||
|  | -	movzwl	(%rdi), %eax
 | ||||||
|  | -	movzwl	(%rsi), %ecx
 | ||||||
|  | -	shll	$8, %eax
 | ||||||
|  | -	shll	$8, %ecx
 | ||||||
|  | -	bswap	%eax
 | ||||||
|  | -	bswap	%ecx
 | ||||||
|  | -	movzbl	-1(%rdi, %rdx), %edi
 | ||||||
|  | -	movzbl	-1(%rsi, %rdx), %esi
 | ||||||
|  | -	orl	%edi, %eax
 | ||||||
|  | -	orl	%esi, %ecx
 | ||||||
|  | -	subl	%ecx, %eax
 | ||||||
|  | -	ret
 | ||||||
|  | -
 | ||||||
|  | -	.p2align 4,, 8
 | ||||||
|  | -L(8_to_16_bytes):
 | ||||||
|  | -	movq	(%rdi), %rax
 | ||||||
|  | -	movq	(%rsi), %rcx
 | ||||||
|  | -
 | ||||||
|  | -	bswap	%rax
 | ||||||
|  | -	bswap	%rcx
 | ||||||
|  | -
 | ||||||
|  | -	subq	%rcx, %rax
 | ||||||
|  | -	jne	L(8_to_16_bytes_done)
 | ||||||
|  | -
 | ||||||
|  | -	movq	-8(%rdi, %rdx), %rax
 | ||||||
|  | -	movq	-8(%rsi, %rdx), %rcx
 | ||||||
|  | -
 | ||||||
|  | -	bswap	%rax
 | ||||||
|  | -	bswap	%rcx
 | ||||||
|  | -
 | ||||||
|  | -	subq	%rcx, %rax
 | ||||||
|  | -
 | ||||||
|  | -L(8_to_16_bytes_done):
 | ||||||
|  | -	cmovne	%edx, %eax
 | ||||||
|  | -	sbbl	%ecx, %ecx
 | ||||||
|  | -	orl	%ecx, %eax
 | ||||||
|  | -	ret
 | ||||||
|  | -# else
 | ||||||
|  | -	xorl	%eax, %eax
 | ||||||
|  | -	movl	(%rdi), %ecx
 | ||||||
|  | -	cmpl	(%rsi), %ecx
 | ||||||
|  | -	jne	L(8_to_16_bytes_done)
 | ||||||
|  | -	movl	4(%rdi), %ecx
 | ||||||
|  | -	cmpl	4(%rsi), %ecx
 | ||||||
|  | -	jne	L(8_to_16_bytes_done)
 | ||||||
|  | -	movl	-4(%rdi, %rdx), %ecx
 | ||||||
|  | -	cmpl	-4(%rsi, %rdx), %ecx
 | ||||||
|  | -	jne	L(8_to_16_bytes_done)
 | ||||||
|  | -	ret
 | ||||||
|  | -# endif
 | ||||||
|  | -
 | ||||||
|  | -	.p2align 4,, 3
 | ||||||
|  | -L(ret_zero):
 | ||||||
|  | -	xorl	%eax, %eax
 | ||||||
|  | -L(zero):
 | ||||||
|  | -	ret
 | ||||||
|  | -
 | ||||||
|  | -	.p2align 4,, 8
 | ||||||
|  | -L(firstbyte):
 | ||||||
|  | -	jb	L(ret_zero)
 | ||||||
|  | -# ifdef USE_AS_WMEMCMP
 | ||||||
|  | -	xorl	%eax, %eax
 | ||||||
|  | -	movl	(%rdi), %ecx
 | ||||||
|  | -	cmpl	(%rsi), %ecx
 | ||||||
|  | -	je	L(zero)
 | ||||||
|  | -L(8_to_16_bytes_done):
 | ||||||
|  | -	setg	%al
 | ||||||
|  | -	leal	-1(%rax, %rax), %eax
 | ||||||
|  | -# else
 | ||||||
|  | -	movzbl	(%rdi), %eax
 | ||||||
|  | -	movzbl	(%rsi), %ecx
 | ||||||
|  | -	sub	%ecx, %eax
 | ||||||
|  | -# endif
 | ||||||
|  | -	ret
 | ||||||
|  | -
 | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(vec_return_begin_48):
 | ||||||
|  | -	addq	$16, %rdi
 | ||||||
|  | -	addq	$16, %rsi
 | ||||||
|  | -L(vec_return_begin_32):
 | ||||||
|  | -	bsfl	%eax, %eax
 | ||||||
|  | -# ifdef USE_AS_WMEMCMP
 | ||||||
|  | -	movl	32(%rdi, %rax), %ecx
 | ||||||
|  | -	xorl	%edx, %edx
 | ||||||
|  | -	cmpl	32(%rsi, %rax), %ecx
 | ||||||
|  | -	setg	%dl
 | ||||||
|  | -	leal	-1(%rdx, %rdx), %eax
 | ||||||
|  | -# else
 | ||||||
|  | -	movzbl	32(%rsi, %rax), %ecx
 | ||||||
|  | -	movzbl	32(%rdi, %rax), %eax
 | ||||||
|  | -	subl	%ecx, %eax
 | ||||||
|  | -# endif
 | ||||||
|  | -	ret
 | ||||||
|  | -
 | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(vec_return_begin_16):
 | ||||||
|  | -	addq	$16, %rdi
 | ||||||
|  | -	addq	$16, %rsi
 | ||||||
|  | -L(vec_return_begin):
 | ||||||
|  | -	bsfl	%eax, %eax
 | ||||||
|  | -# ifdef USE_AS_WMEMCMP
 | ||||||
|  | -	movl	(%rdi, %rax), %ecx
 | ||||||
|  | -	xorl	%edx, %edx
 | ||||||
|  | -	cmpl	(%rsi, %rax), %ecx
 | ||||||
|  | -	setg	%dl
 | ||||||
|  | -	leal	-1(%rdx, %rdx), %eax
 | ||||||
|  | -# else
 | ||||||
|  | -	movzbl	(%rsi, %rax), %ecx
 | ||||||
|  | -	movzbl	(%rdi, %rax), %eax
 | ||||||
|  | -	subl	%ecx, %eax
 | ||||||
|  | -# endif
 | ||||||
|  | -	ret
 | ||||||
|  | -
 | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(vec_return_end_16):
 | ||||||
|  | -	subl	$16, %edx
 | ||||||
|  | -L(vec_return_end):
 | ||||||
|  | -	bsfl	%eax, %eax
 | ||||||
|  | -	addl	%edx, %eax
 | ||||||
|  | -# ifdef USE_AS_WMEMCMP
 | ||||||
|  | -	movl	-16(%rdi, %rax), %ecx
 | ||||||
|  | -	xorl	%edx, %edx
 | ||||||
|  | -	cmpl	-16(%rsi, %rax), %ecx
 | ||||||
|  | -	setg	%dl
 | ||||||
|  | -	leal	-1(%rdx, %rdx), %eax
 | ||||||
|  | -# else
 | ||||||
|  | -	movzbl	-16(%rsi, %rax), %ecx
 | ||||||
|  | -	movzbl	-16(%rdi, %rax), %eax
 | ||||||
|  | -	subl	%ecx, %eax
 | ||||||
|  | -# endif
 | ||||||
|  | -	ret
 | ||||||
|  | -
 | ||||||
|  | -	.p2align 4,, 8
 | ||||||
|  | -L(more_32_bytes):
 | ||||||
|  | -	movdqu	(%rdi), %xmm0
 | ||||||
|  | -	movdqu	(%rsi), %xmm1
 | ||||||
|  | -	CMPEQ	%xmm0, %xmm1
 | ||||||
|  | -	pmovmskb %xmm1, %eax
 | ||||||
|  | -	incw	%ax
 | ||||||
|  | -	jnz	L(vec_return_begin)
 | ||||||
|  | -
 | ||||||
|  | -	movdqu	16(%rdi), %xmm0
 | ||||||
|  | -	movdqu	16(%rsi), %xmm1
 | ||||||
|  | -	CMPEQ	%xmm0, %xmm1
 | ||||||
|  | -	pmovmskb %xmm1, %eax
 | ||||||
|  | -	incw	%ax
 | ||||||
|  | -	jnz	L(vec_return_begin_16)
 | ||||||
|  | -
 | ||||||
|  | -	cmpl	$64, %edx
 | ||||||
|  | -	jbe	L(32_to_64_bytes)
 | ||||||
|  | -	movdqu	32(%rdi), %xmm0
 | ||||||
|  | -	movdqu	32(%rsi), %xmm1
 | ||||||
|  | -	CMPEQ	%xmm0, %xmm1
 | ||||||
|  | -	pmovmskb %xmm1, %eax
 | ||||||
|  | -	incw	%ax
 | ||||||
|  | -	jnz	L(vec_return_begin_32)
 | ||||||
|  | -
 | ||||||
|  | -	.p2align 4,, 6
 | ||||||
|  | -L(32_to_64_bytes):
 | ||||||
|  | -	movdqu	-32(%rdi, %rdx), %xmm0
 | ||||||
|  | -	movdqu	-32(%rsi, %rdx), %xmm1
 | ||||||
|  | -	CMPEQ	%xmm0, %xmm1
 | ||||||
|  | -	pmovmskb %xmm1, %eax
 | ||||||
|  | -	incw	%ax
 | ||||||
|  | -	jnz	L(vec_return_end_16)
 | ||||||
|  | -
 | ||||||
|  | -	movdqu	-16(%rdi, %rdx), %xmm0
 | ||||||
|  | -	movdqu	-16(%rsi, %rdx), %xmm1
 | ||||||
|  | -	CMPEQ	%xmm0, %xmm1
 | ||||||
|  | -	pmovmskb %xmm1, %eax
 | ||||||
|  | -	incw	%ax
 | ||||||
|  | -	jnz	L(vec_return_end)
 | ||||||
|  | -	ret
 | ||||||
|  | -
 | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(16_to_32_bytes):
 | ||||||
|  | -	movdqu	(%rdi), %xmm0
 | ||||||
|  | -	movdqu	(%rsi), %xmm1
 | ||||||
|  | -	CMPEQ	%xmm0, %xmm1
 | ||||||
|  | -	pmovmskb %xmm1, %eax
 | ||||||
|  | -	incw	%ax
 | ||||||
|  | -	jnz	L(vec_return_begin)
 | ||||||
|  | -
 | ||||||
|  | -	movdqu	-16(%rdi, %rdx), %xmm0
 | ||||||
|  | -	movdqu	-16(%rsi, %rdx), %xmm1
 | ||||||
|  | -	CMPEQ	%xmm0, %xmm1
 | ||||||
|  | -	pmovmskb %xmm1, %eax
 | ||||||
|  | -	incw	%ax
 | ||||||
|  | -	jnz	L(vec_return_end)
 | ||||||
|  | -	ret
 | ||||||
|  | -
 | ||||||
|  | -
 | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(79bytesormore):
 | ||||||
|  | -	movdqu	(%rdi), %xmm0
 | ||||||
|  | -	movdqu	(%rsi), %xmm1
 | ||||||
|  | -	CMPEQ	%xmm0, %xmm1
 | ||||||
|  | -	pmovmskb %xmm1, %eax
 | ||||||
|  | -	incw	%ax
 | ||||||
|  | -	jnz	L(vec_return_begin)
 | ||||||
|  | -
 | ||||||
|  | -
 | ||||||
|  | -	mov	%rsi, %rcx
 | ||||||
|  | -	and	$-16, %rsi
 | ||||||
|  | -	add	$16, %rsi
 | ||||||
|  | -	sub	%rsi, %rcx
 | ||||||
|  | -
 | ||||||
|  | -	sub	%rcx, %rdi
 | ||||||
|  | -	add	%rcx, %rdx
 | ||||||
|  | -	test	$0xf, %rdi
 | ||||||
|  | -	jz	L(2aligned)
 | ||||||
|  | -
 | ||||||
|  | -	cmp	$128, %rdx
 | ||||||
|  | -	ja	L(128bytesormore)
 | ||||||
|  | -
 | ||||||
|  | -	.p2align 4,, 6
 | ||||||
|  | -L(less128bytes):
 | ||||||
|  | -	movdqu	(%rdi), %xmm1
 | ||||||
|  | -	CMPEQ	(%rsi), %xmm1
 | ||||||
|  | -	pmovmskb %xmm1, %eax
 | ||||||
|  | -	incw	%ax
 | ||||||
|  | -	jnz	L(vec_return_begin)
 | ||||||
|  | -
 | ||||||
|  | -	movdqu	16(%rdi), %xmm1
 | ||||||
|  | -	CMPEQ	16(%rsi), %xmm1
 | ||||||
|  | -	pmovmskb %xmm1, %eax
 | ||||||
|  | -	incw	%ax
 | ||||||
|  | -	jnz	L(vec_return_begin_16)
 | ||||||
|  | -
 | ||||||
|  | -	movdqu	32(%rdi), %xmm1
 | ||||||
|  | -	CMPEQ	32(%rsi), %xmm1
 | ||||||
|  | -	pmovmskb %xmm1, %eax
 | ||||||
|  | -	incw	%ax
 | ||||||
|  | -	jnz	L(vec_return_begin_32)
 | ||||||
|  | -
 | ||||||
|  | -	movdqu	48(%rdi), %xmm1
 | ||||||
|  | -	CMPEQ	48(%rsi), %xmm1
 | ||||||
|  | -	pmovmskb %xmm1, %eax
 | ||||||
|  | -	incw	%ax
 | ||||||
|  | -	jnz	L(vec_return_begin_48)
 | ||||||
|  | -
 | ||||||
|  | -	cmp	$96, %rdx
 | ||||||
|  | -	jb	L(32_to_64_bytes)
 | ||||||
|  | -
 | ||||||
|  | -	addq	$64, %rdi
 | ||||||
|  | -	addq	$64, %rsi
 | ||||||
|  | -	subq	$64, %rdx
 | ||||||
|  | -
 | ||||||
|  | -	.p2align 4,, 6
 | ||||||
|  | -L(last_64_bytes):
 | ||||||
|  | -	movdqu	(%rdi), %xmm1
 | ||||||
|  | -	CMPEQ	(%rsi), %xmm1
 | ||||||
|  | -	pmovmskb %xmm1, %eax
 | ||||||
|  | -	incw	%ax
 | ||||||
|  | -	jnz	L(vec_return_begin)
 | ||||||
|  | -
 | ||||||
|  | -	movdqu	16(%rdi), %xmm1
 | ||||||
|  | -	CMPEQ	16(%rsi), %xmm1
 | ||||||
|  | -	pmovmskb %xmm1, %eax
 | ||||||
|  | -	incw	%ax
 | ||||||
|  | -	jnz	L(vec_return_begin_16)
 | ||||||
|  | -
 | ||||||
|  | -	movdqu	-32(%rdi, %rdx), %xmm0
 | ||||||
|  | -	movdqu	-32(%rsi, %rdx), %xmm1
 | ||||||
|  | -	CMPEQ	%xmm0, %xmm1
 | ||||||
|  | -	pmovmskb %xmm1, %eax
 | ||||||
|  | -	incw	%ax
 | ||||||
|  | -	jnz	L(vec_return_end_16)
 | ||||||
|  | -
 | ||||||
|  | -	movdqu	-16(%rdi, %rdx), %xmm0
 | ||||||
|  | -	movdqu	-16(%rsi, %rdx), %xmm1
 | ||||||
|  | -	CMPEQ	%xmm0, %xmm1
 | ||||||
|  | -	pmovmskb %xmm1, %eax
 | ||||||
|  | -	incw	%ax
 | ||||||
|  | -	jnz	L(vec_return_end)
 | ||||||
|  | -	ret
 | ||||||
|  | -
 | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(128bytesormore):
 | ||||||
|  | -	cmp	$256, %rdx
 | ||||||
|  | -	ja	L(unaligned_loop)
 | ||||||
|  | -L(less256bytes):
 | ||||||
|  | -	movdqu	(%rdi), %xmm1
 | ||||||
|  | -	CMPEQ	(%rsi), %xmm1
 | ||||||
|  | -	pmovmskb %xmm1, %eax
 | ||||||
|  | -	incw	%ax
 | ||||||
|  | -	jnz	L(vec_return_begin)
 | ||||||
|  | -
 | ||||||
|  | -	movdqu	16(%rdi), %xmm1
 | ||||||
|  | -	CMPEQ	16(%rsi), %xmm1
 | ||||||
|  | -	pmovmskb %xmm1, %eax
 | ||||||
|  | -	incw	%ax
 | ||||||
|  | -	jnz	L(vec_return_begin_16)
 | ||||||
|  | -
 | ||||||
|  | -	movdqu	32(%rdi), %xmm1
 | ||||||
|  | -	CMPEQ	32(%rsi), %xmm1
 | ||||||
|  | -	pmovmskb %xmm1, %eax
 | ||||||
|  | -	incw	%ax
 | ||||||
|  | -	jnz	L(vec_return_begin_32)
 | ||||||
|  | -
 | ||||||
|  | -	movdqu	48(%rdi), %xmm1
 | ||||||
|  | -	CMPEQ	48(%rsi), %xmm1
 | ||||||
|  | -	pmovmskb %xmm1, %eax
 | ||||||
|  | -	incw	%ax
 | ||||||
|  | -	jnz	L(vec_return_begin_48)
 | ||||||
|  | -
 | ||||||
|  | -	addq	$64, %rdi
 | ||||||
|  | -	addq	$64, %rsi
 | ||||||
|  | -
 | ||||||
|  | -	movdqu	(%rdi), %xmm1
 | ||||||
|  | -	CMPEQ	(%rsi), %xmm1
 | ||||||
|  | -	pmovmskb %xmm1, %eax
 | ||||||
|  | -	incw	%ax
 | ||||||
|  | -	jnz	L(vec_return_begin)
 | ||||||
|  | -
 | ||||||
|  | -	movdqu	16(%rdi), %xmm1
 | ||||||
|  | -	CMPEQ	16(%rsi), %xmm1
 | ||||||
|  | -	pmovmskb %xmm1, %eax
 | ||||||
|  | -	incw	%ax
 | ||||||
|  | -	jnz	L(vec_return_begin_16)
 | ||||||
|  | -
 | ||||||
|  | -	movdqu	32(%rdi), %xmm1
 | ||||||
|  | -	CMPEQ	32(%rsi), %xmm1
 | ||||||
|  | -	pmovmskb %xmm1, %eax
 | ||||||
|  | -	incw	%ax
 | ||||||
|  | -	jnz	L(vec_return_begin_32)
 | ||||||
|  | -
 | ||||||
|  | -	movdqu	48(%rdi), %xmm1
 | ||||||
|  | -	CMPEQ	48(%rsi), %xmm1
 | ||||||
|  | -	pmovmskb %xmm1, %eax
 | ||||||
|  | -	incw	%ax
 | ||||||
|  | -	jnz	L(vec_return_begin_48)
 | ||||||
|  | -
 | ||||||
|  | -	addq	$-128, %rdx
 | ||||||
|  | -	subq	$-64, %rsi
 | ||||||
|  | -	subq	$-64, %rdi
 | ||||||
|  | -
 | ||||||
|  | -	cmp	$64, %rdx
 | ||||||
|  | -	ja	L(less128bytes)
 | ||||||
|  | -
 | ||||||
|  | -	cmp	$32, %rdx
 | ||||||
|  | -	ja	L(last_64_bytes)
 | ||||||
|  | -
 | ||||||
|  | -	movdqu	-32(%rdi, %rdx), %xmm0
 | ||||||
|  | -	movdqu	-32(%rsi, %rdx), %xmm1
 | ||||||
|  | -	CMPEQ	%xmm0, %xmm1
 | ||||||
|  | -	pmovmskb %xmm1, %eax
 | ||||||
|  | -	incw	%ax
 | ||||||
|  | -	jnz	L(vec_return_end_16)
 | ||||||
|  | -
 | ||||||
|  | -	movdqu	-16(%rdi, %rdx), %xmm0
 | ||||||
|  | -	movdqu	-16(%rsi, %rdx), %xmm1
 | ||||||
|  | -	CMPEQ	%xmm0, %xmm1
 | ||||||
|  | -	pmovmskb %xmm1, %eax
 | ||||||
|  | -	incw	%ax
 | ||||||
|  | -	jnz	L(vec_return_end)
 | ||||||
|  | -	ret
 | ||||||
|  | -
 | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(unaligned_loop):
 | ||||||
|  | -# ifdef DATA_CACHE_SIZE_HALF
 | ||||||
|  | -	mov	$DATA_CACHE_SIZE_HALF, %R8_LP
 | ||||||
|  | -# else
 | ||||||
|  | -	mov	__x86_data_cache_size_half(%rip), %R8_LP
 | ||||||
|  | -# endif
 | ||||||
|  | -	movq	%r8, %r9
 | ||||||
|  | -	addq	%r8, %r8
 | ||||||
|  | -	addq	%r9, %r8
 | ||||||
|  | -	cmpq	%r8, %rdx
 | ||||||
|  | -	ja	L(L2_L3_cache_unaligned)
 | ||||||
|  | -	sub	$64, %rdx
 | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(64bytesormore_loop):
 | ||||||
|  | -	movdqu	(%rdi), %xmm0
 | ||||||
|  | -	movdqu	16(%rdi), %xmm1
 | ||||||
|  | -	movdqu	32(%rdi), %xmm2
 | ||||||
|  | -	movdqu	48(%rdi), %xmm3
 | ||||||
|  | -
 | ||||||
|  | -	CMPEQ	(%rsi), %xmm0
 | ||||||
|  | -	CMPEQ	16(%rsi), %xmm1
 | ||||||
|  | -	CMPEQ	32(%rsi), %xmm2
 | ||||||
|  | -	CMPEQ	48(%rsi), %xmm3
 | ||||||
|  | -
 | ||||||
|  | -	pand	%xmm0, %xmm1
 | ||||||
|  | -	pand	%xmm2, %xmm3
 | ||||||
|  | -	pand	%xmm1, %xmm3
 | ||||||
|  | -
 | ||||||
|  | -	pmovmskb %xmm3, %eax
 | ||||||
|  | -	incw	%ax
 | ||||||
|  | -	jnz	L(64bytesormore_loop_end)
 | ||||||
|  | -
 | ||||||
|  | -	add	$64, %rsi
 | ||||||
|  | -	add	$64, %rdi
 | ||||||
|  | -	sub	$64, %rdx
 | ||||||
|  | -	ja	L(64bytesormore_loop)
 | ||||||
|  | -
 | ||||||
|  | -	.p2align 4,, 6
 | ||||||
|  | -L(loop_tail):
 | ||||||
|  | -	addq	%rdx, %rdi
 | ||||||
|  | -	movdqu	(%rdi), %xmm0
 | ||||||
|  | -	movdqu	16(%rdi), %xmm1
 | ||||||
|  | -	movdqu	32(%rdi), %xmm2
 | ||||||
|  | -	movdqu	48(%rdi), %xmm3
 | ||||||
|  | -
 | ||||||
|  | -	addq	%rdx, %rsi
 | ||||||
|  | -	movdqu	(%rsi), %xmm4
 | ||||||
|  | -	movdqu	16(%rsi), %xmm5
 | ||||||
|  | -	movdqu	32(%rsi), %xmm6
 | ||||||
|  | -	movdqu	48(%rsi), %xmm7
 | ||||||
|  | -
 | ||||||
|  | -	CMPEQ	%xmm4, %xmm0
 | ||||||
|  | -	CMPEQ	%xmm5, %xmm1
 | ||||||
|  | -	CMPEQ	%xmm6, %xmm2
 | ||||||
|  | -	CMPEQ	%xmm7, %xmm3
 | ||||||
|  | -
 | ||||||
|  | -	pand	%xmm0, %xmm1
 | ||||||
|  | -	pand	%xmm2, %xmm3
 | ||||||
|  | -	pand	%xmm1, %xmm3
 | ||||||
|  | -
 | ||||||
|  | -	pmovmskb %xmm3, %eax
 | ||||||
|  | -	incw	%ax
 | ||||||
|  | -	jnz	L(64bytesormore_loop_end)
 | ||||||
|  | -	ret
 | ||||||
|  | -
 | ||||||
|  | -L(L2_L3_cache_unaligned):
 | ||||||
|  | -	subq	$64, %rdx
 | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(L2_L3_unaligned_128bytes_loop):
 | ||||||
|  | -	prefetchnta 0x1c0(%rdi)
 | ||||||
|  | -	prefetchnta 0x1c0(%rsi)
 | ||||||
|  | -
 | ||||||
|  | -	movdqu	(%rdi), %xmm0
 | ||||||
|  | -	movdqu	16(%rdi), %xmm1
 | ||||||
|  | -	movdqu	32(%rdi), %xmm2
 | ||||||
|  | -	movdqu	48(%rdi), %xmm3
 | ||||||
|  | -
 | ||||||
|  | -	CMPEQ	(%rsi), %xmm0
 | ||||||
|  | -	CMPEQ	16(%rsi), %xmm1
 | ||||||
|  | -	CMPEQ	32(%rsi), %xmm2
 | ||||||
|  | -	CMPEQ	48(%rsi), %xmm3
 | ||||||
|  | -
 | ||||||
|  | -	pand	%xmm0, %xmm1
 | ||||||
|  | -	pand	%xmm2, %xmm3
 | ||||||
|  | -	pand	%xmm1, %xmm3
 | ||||||
|  | -
 | ||||||
|  | -	pmovmskb %xmm3, %eax
 | ||||||
|  | -	incw	%ax
 | ||||||
|  | -	jnz	L(64bytesormore_loop_end)
 | ||||||
|  | -
 | ||||||
|  | -	add	$64, %rsi
 | ||||||
|  | -	add	$64, %rdi
 | ||||||
|  | -	sub	$64, %rdx
 | ||||||
|  | -	ja	L(L2_L3_unaligned_128bytes_loop)
 | ||||||
|  | -	jmp	L(loop_tail)
 | ||||||
|  | -
 | ||||||
|  | -
 | ||||||
|  | -	/* This case is for machines which are sensitive for unaligned
 | ||||||
|  | -	 * instructions.  */
 | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(2aligned):
 | ||||||
|  | -	cmp	$128, %rdx
 | ||||||
|  | -	ja	L(128bytesormorein2aligned)
 | ||||||
|  | -L(less128bytesin2aligned):
 | ||||||
|  | -	movdqa	(%rdi), %xmm1
 | ||||||
|  | -	CMPEQ	(%rsi), %xmm1
 | ||||||
|  | -	pmovmskb %xmm1, %eax
 | ||||||
|  | -	incw	%ax
 | ||||||
|  | -	jnz	L(vec_return_begin)
 | ||||||
|  | -
 | ||||||
|  | -	movdqa	16(%rdi), %xmm1
 | ||||||
|  | -	CMPEQ	16(%rsi), %xmm1
 | ||||||
|  | -	pmovmskb %xmm1, %eax
 | ||||||
|  | -	incw	%ax
 | ||||||
|  | -	jnz	L(vec_return_begin_16)
 | ||||||
|  | -
 | ||||||
|  | -	movdqa	32(%rdi), %xmm1
 | ||||||
|  | -	CMPEQ	32(%rsi), %xmm1
 | ||||||
|  | -	pmovmskb %xmm1, %eax
 | ||||||
|  | -	incw	%ax
 | ||||||
|  | -	jnz	L(vec_return_begin_32)
 | ||||||
|  | -
 | ||||||
|  | -	movdqa	48(%rdi), %xmm1
 | ||||||
|  | -	CMPEQ	48(%rsi), %xmm1
 | ||||||
|  | -	pmovmskb %xmm1, %eax
 | ||||||
|  | -	incw	%ax
 | ||||||
|  | -	jnz	L(vec_return_begin_48)
 | ||||||
|  | -
 | ||||||
|  | -	cmp	$96, %rdx
 | ||||||
|  | -	jb	L(32_to_64_bytes)
 | ||||||
|  | -
 | ||||||
|  | -	addq	$64, %rdi
 | ||||||
|  | -	addq	$64, %rsi
 | ||||||
|  | -	subq	$64, %rdx
 | ||||||
|  | -
 | ||||||
|  | -	.p2align 4,, 6
 | ||||||
|  | -L(aligned_last_64_bytes):
 | ||||||
|  | -	movdqa	(%rdi), %xmm1
 | ||||||
|  | -	CMPEQ	(%rsi), %xmm1
 | ||||||
|  | -	pmovmskb %xmm1, %eax
 | ||||||
|  | -	incw	%ax
 | ||||||
|  | -	jnz	L(vec_return_begin)
 | ||||||
|  | -
 | ||||||
|  | -	movdqa	16(%rdi), %xmm1
 | ||||||
|  | -	CMPEQ	16(%rsi), %xmm1
 | ||||||
|  | -	pmovmskb %xmm1, %eax
 | ||||||
|  | -	incw	%ax
 | ||||||
|  | -	jnz	L(vec_return_begin_16)
 | ||||||
|  | -
 | ||||||
|  | -	movdqu	-32(%rdi, %rdx), %xmm0
 | ||||||
|  | -	movdqu	-32(%rsi, %rdx), %xmm1
 | ||||||
|  | -	CMPEQ	%xmm0, %xmm1
 | ||||||
|  | -	pmovmskb %xmm1, %eax
 | ||||||
|  | -	incw	%ax
 | ||||||
|  | -	jnz	L(vec_return_end_16)
 | ||||||
|  | -
 | ||||||
|  | -	movdqu	-16(%rdi, %rdx), %xmm0
 | ||||||
|  | -	movdqu	-16(%rsi, %rdx), %xmm1
 | ||||||
|  | -	CMPEQ	%xmm0, %xmm1
 | ||||||
|  | -	pmovmskb %xmm1, %eax
 | ||||||
|  | -	incw	%ax
 | ||||||
|  | -	jnz	L(vec_return_end)
 | ||||||
|  | -	ret
 | ||||||
|  | -
 | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(128bytesormorein2aligned):
 | ||||||
|  | -	cmp	$256, %rdx
 | ||||||
|  | -	ja	L(aligned_loop)
 | ||||||
|  | -L(less256bytesin2alinged):
 | ||||||
|  | -	movdqa	(%rdi), %xmm1
 | ||||||
|  | -	CMPEQ	(%rsi), %xmm1
 | ||||||
|  | -	pmovmskb %xmm1, %eax
 | ||||||
|  | -	incw	%ax
 | ||||||
|  | -	jnz	L(vec_return_begin)
 | ||||||
|  | -
 | ||||||
|  | -	movdqa	16(%rdi), %xmm1
 | ||||||
|  | -	CMPEQ	16(%rsi), %xmm1
 | ||||||
|  | -	pmovmskb %xmm1, %eax
 | ||||||
|  | -	incw	%ax
 | ||||||
|  | -	jnz	L(vec_return_begin_16)
 | ||||||
|  | -
 | ||||||
|  | -	movdqa	32(%rdi), %xmm1
 | ||||||
|  | -	CMPEQ	32(%rsi), %xmm1
 | ||||||
|  | -	pmovmskb %xmm1, %eax
 | ||||||
|  | -	incw	%ax
 | ||||||
|  | -	jnz	L(vec_return_begin_32)
 | ||||||
|  | -
 | ||||||
|  | -	movdqa	48(%rdi), %xmm1
 | ||||||
|  | -	CMPEQ	48(%rsi), %xmm1
 | ||||||
|  | -	pmovmskb %xmm1, %eax
 | ||||||
|  | -	incw	%ax
 | ||||||
|  | -	jnz	L(vec_return_begin_48)
 | ||||||
|  | -
 | ||||||
|  | -	addq	$64, %rdi
 | ||||||
|  | -	addq	$64, %rsi
 | ||||||
|  | -
 | ||||||
|  | -	movdqa	(%rdi), %xmm1
 | ||||||
|  | -	CMPEQ	(%rsi), %xmm1
 | ||||||
|  | -	pmovmskb %xmm1, %eax
 | ||||||
|  | -	incw	%ax
 | ||||||
|  | -	jnz	L(vec_return_begin)
 | ||||||
|  | -
 | ||||||
|  | -	movdqa	16(%rdi), %xmm1
 | ||||||
|  | -	CMPEQ	16(%rsi), %xmm1
 | ||||||
|  | -	pmovmskb %xmm1, %eax
 | ||||||
|  | -	incw	%ax
 | ||||||
|  | -	jnz	L(vec_return_begin_16)
 | ||||||
|  | -
 | ||||||
|  | -	movdqa	32(%rdi), %xmm1
 | ||||||
|  | -	CMPEQ	32(%rsi), %xmm1
 | ||||||
|  | -	pmovmskb %xmm1, %eax
 | ||||||
|  | -	incw	%ax
 | ||||||
|  | -	jnz	L(vec_return_begin_32)
 | ||||||
|  | -
 | ||||||
|  | -	movdqa	48(%rdi), %xmm1
 | ||||||
|  | -	CMPEQ	48(%rsi), %xmm1
 | ||||||
|  | -	pmovmskb %xmm1, %eax
 | ||||||
|  | -	incw	%ax
 | ||||||
|  | -	jnz	L(vec_return_begin_48)
 | ||||||
|  | -
 | ||||||
|  | -	addq	$-128, %rdx
 | ||||||
|  | -	subq	$-64, %rsi
 | ||||||
|  | -	subq	$-64, %rdi
 | ||||||
|  | -
 | ||||||
|  | -	cmp	$64, %rdx
 | ||||||
|  | -	ja	L(less128bytesin2aligned)
 | ||||||
|  | -
 | ||||||
|  | -	cmp	$32, %rdx
 | ||||||
|  | -	ja	L(aligned_last_64_bytes)
 | ||||||
|  | -
 | ||||||
|  | -	movdqu	-32(%rdi, %rdx), %xmm0
 | ||||||
|  | -	movdqu	-32(%rsi, %rdx), %xmm1
 | ||||||
|  | -	CMPEQ	%xmm0, %xmm1
 | ||||||
|  | -	pmovmskb %xmm1, %eax
 | ||||||
|  | -	incw	%ax
 | ||||||
|  | -	jnz	L(vec_return_end_16)
 | ||||||
|  | -
 | ||||||
|  | -	movdqu	-16(%rdi, %rdx), %xmm0
 | ||||||
|  | -	movdqu	-16(%rsi, %rdx), %xmm1
 | ||||||
|  | -	CMPEQ	%xmm0, %xmm1
 | ||||||
|  | -	pmovmskb %xmm1, %eax
 | ||||||
|  | -	incw	%ax
 | ||||||
|  | -	jnz	L(vec_return_end)
 | ||||||
|  | -	ret
 | ||||||
|  | -
 | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(aligned_loop):
 | ||||||
|  | -# ifdef DATA_CACHE_SIZE_HALF
 | ||||||
|  | -	mov	$DATA_CACHE_SIZE_HALF, %R8_LP
 | ||||||
|  | -# else
 | ||||||
|  | -	mov	__x86_data_cache_size_half(%rip), %R8_LP
 | ||||||
|  | -# endif
 | ||||||
|  | -	movq	%r8, %r9
 | ||||||
|  | -	addq	%r8, %r8
 | ||||||
|  | -	addq	%r9, %r8
 | ||||||
|  | -	cmpq	%r8, %rdx
 | ||||||
|  | -	ja	L(L2_L3_cache_aligned)
 | ||||||
|  | -
 | ||||||
|  | -	sub	$64, %rdx
 | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(64bytesormore_loopin2aligned):
 | ||||||
|  | -	movdqa	(%rdi), %xmm0
 | ||||||
|  | -	movdqa	16(%rdi), %xmm1
 | ||||||
|  | -	movdqa	32(%rdi), %xmm2
 | ||||||
|  | -	movdqa	48(%rdi), %xmm3
 | ||||||
|  | -
 | ||||||
|  | -	CMPEQ	(%rsi), %xmm0
 | ||||||
|  | -	CMPEQ	16(%rsi), %xmm1
 | ||||||
|  | -	CMPEQ	32(%rsi), %xmm2
 | ||||||
|  | -	CMPEQ	48(%rsi), %xmm3
 | ||||||
|  | -
 | ||||||
|  | -	pand	%xmm0, %xmm1
 | ||||||
|  | -	pand	%xmm2, %xmm3
 | ||||||
|  | -	pand	%xmm1, %xmm3
 | ||||||
|  | -
 | ||||||
|  | -	pmovmskb %xmm3, %eax
 | ||||||
|  | -	incw	%ax
 | ||||||
|  | -	jnz	L(64bytesormore_loop_end)
 | ||||||
|  | -	add	$64, %rsi
 | ||||||
|  | -	add	$64, %rdi
 | ||||||
|  | -	sub	$64, %rdx
 | ||||||
|  | -	ja	L(64bytesormore_loopin2aligned)
 | ||||||
|  | -	jmp	L(loop_tail)
 | ||||||
|  | -
 | ||||||
|  | -L(L2_L3_cache_aligned):
 | ||||||
|  | -	subq	$64, %rdx
 | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(L2_L3_aligned_128bytes_loop):
 | ||||||
|  | -	prefetchnta 0x1c0(%rdi)
 | ||||||
|  | -	prefetchnta 0x1c0(%rsi)
 | ||||||
|  | -	movdqa	(%rdi), %xmm0
 | ||||||
|  | -	movdqa	16(%rdi), %xmm1
 | ||||||
|  | -	movdqa	32(%rdi), %xmm2
 | ||||||
|  | -	movdqa	48(%rdi), %xmm3
 | ||||||
|  | -
 | ||||||
|  | -	CMPEQ	(%rsi), %xmm0
 | ||||||
|  | -	CMPEQ	16(%rsi), %xmm1
 | ||||||
|  | -	CMPEQ	32(%rsi), %xmm2
 | ||||||
|  | -	CMPEQ	48(%rsi), %xmm3
 | ||||||
|  | -
 | ||||||
|  | -	pand	%xmm0, %xmm1
 | ||||||
|  | -	pand	%xmm2, %xmm3
 | ||||||
|  | -	pand	%xmm1, %xmm3
 | ||||||
|  | -
 | ||||||
|  | -	pmovmskb %xmm3, %eax
 | ||||||
|  | -	incw	%ax
 | ||||||
|  | -	jnz	L(64bytesormore_loop_end)
 | ||||||
|  | -
 | ||||||
|  | -	addq	$64, %rsi
 | ||||||
|  | -	addq	$64, %rdi
 | ||||||
|  | -	subq	$64, %rdx
 | ||||||
|  | -	ja	L(L2_L3_aligned_128bytes_loop)
 | ||||||
|  | -	jmp	L(loop_tail)
 | ||||||
|  | -
 | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(64bytesormore_loop_end):
 | ||||||
|  | -	pmovmskb %xmm0, %ecx
 | ||||||
|  | -	incw	%cx
 | ||||||
|  | -	jnz	L(loop_end_ret)
 | ||||||
|  | -
 | ||||||
|  | -	pmovmskb %xmm1, %ecx
 | ||||||
|  | -	notw	%cx
 | ||||||
|  | -	sall	$16, %ecx
 | ||||||
|  | -	jnz	L(loop_end_ret)
 | ||||||
|  | -
 | ||||||
|  | -	pmovmskb %xmm2, %ecx
 | ||||||
|  | -	notw	%cx
 | ||||||
|  | -	shlq	$32, %rcx
 | ||||||
|  | -	jnz	L(loop_end_ret)
 | ||||||
|  | -
 | ||||||
|  | -	addq	$48, %rdi
 | ||||||
|  | -	addq	$48, %rsi
 | ||||||
|  | -	movq	%rax, %rcx
 | ||||||
|  | -
 | ||||||
|  | -	.p2align 4,, 6
 | ||||||
|  | -L(loop_end_ret):
 | ||||||
|  | -	bsfq	%rcx, %rcx
 | ||||||
|  | -# ifdef USE_AS_WMEMCMP
 | ||||||
|  | -	movl	(%rdi, %rcx), %eax
 | ||||||
|  | -	xorl	%edx, %edx
 | ||||||
|  | -	cmpl	(%rsi, %rcx), %eax
 | ||||||
|  | -	setg	%dl
 | ||||||
|  | -	leal	-1(%rdx, %rdx), %eax
 | ||||||
|  | -# else
 | ||||||
|  | -	movzbl	(%rdi, %rcx), %eax
 | ||||||
|  | -	movzbl	(%rsi, %rcx), %ecx
 | ||||||
|  | -	subl	%ecx, %eax
 | ||||||
|  | -# endif
 | ||||||
|  | -	ret
 | ||||||
|  | -END (MEMCMP)
 | ||||||
|  | -#endif
 | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
							
								
								
									
SOURCES/glibc-RHEL-15696-102.patch (new file, 263 lines)
									
								
							| @ -0,0 +1,263 @@ | |||||||
|  | From 23102686ec67b856a2d4fd25ddaa1c0b8d175c4f Mon Sep 17 00:00:00 2001 | ||||||
|  | From: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  | Date: Fri, 15 Apr 2022 12:28:01 -0500 | ||||||
|  | Subject: [PATCH] x86: Cleanup page cross code in memcmp-avx2-movbe.S | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | The old code was both inefficient and wasted code size. The new code | ||||||
|  | is 62 bytes smaller with comparable or better performance in the page | ||||||
|  | cross case. | ||||||
|  | 
 | ||||||
|  | geometric_mean(N=20) of page cross cases New / Original: 0.960 | ||||||
|  | 
 | ||||||
|  | size, align0, align1, ret, New Time/Old Time | ||||||
|  |    1,   4095,      0,   0,             1.001 | ||||||
|  |    1,   4095,      0,   1,             0.999 | ||||||
|  |    1,   4095,      0,  -1,               1.0 | ||||||
|  |    2,   4094,      0,   0,               1.0 | ||||||
|  |    2,   4094,      0,   1,               1.0 | ||||||
|  |    2,   4094,      0,  -1,               1.0 | ||||||
|  |    3,   4093,      0,   0,               1.0 | ||||||
|  |    3,   4093,      0,   1,               1.0 | ||||||
|  |    3,   4093,      0,  -1,               1.0 | ||||||
|  |    4,   4092,      0,   0,             0.987 | ||||||
|  |    4,   4092,      0,   1,               1.0 | ||||||
|  |    4,   4092,      0,  -1,               1.0 | ||||||
|  |    5,   4091,      0,   0,             0.984 | ||||||
|  |    5,   4091,      0,   1,             1.002 | ||||||
|  |    5,   4091,      0,  -1,             1.005 | ||||||
|  |    6,   4090,      0,   0,             0.993 | ||||||
|  |    6,   4090,      0,   1,             1.001 | ||||||
|  |    6,   4090,      0,  -1,             1.003 | ||||||
|  |    7,   4089,      0,   0,             0.991 | ||||||
|  |    7,   4089,      0,   1,               1.0 | ||||||
|  |    7,   4089,      0,  -1,             1.001 | ||||||
|  |    8,   4088,      0,   0,             0.875 | ||||||
|  |    8,   4088,      0,   1,             0.881 | ||||||
|  |    8,   4088,      0,  -1,             0.888 | ||||||
|  |    9,   4087,      0,   0,             0.872 | ||||||
|  |    9,   4087,      0,   1,             0.879 | ||||||
|  |    9,   4087,      0,  -1,             0.883 | ||||||
|  |   10,   4086,      0,   0,             0.878 | ||||||
|  |   10,   4086,      0,   1,             0.886 | ||||||
|  |   10,   4086,      0,  -1,             0.873 | ||||||
|  |   11,   4085,      0,   0,             0.878 | ||||||
|  |   11,   4085,      0,   1,             0.881 | ||||||
|  |   11,   4085,      0,  -1,             0.879 | ||||||
|  |   12,   4084,      0,   0,             0.873 | ||||||
|  |   12,   4084,      0,   1,             0.889 | ||||||
|  |   12,   4084,      0,  -1,             0.875 | ||||||
|  |   13,   4083,      0,   0,             0.873 | ||||||
|  |   13,   4083,      0,   1,             0.863 | ||||||
|  |   13,   4083,      0,  -1,             0.863 | ||||||
|  |   14,   4082,      0,   0,             0.838 | ||||||
|  |   14,   4082,      0,   1,             0.869 | ||||||
|  |   14,   4082,      0,  -1,             0.877 | ||||||
|  |   15,   4081,      0,   0,             0.841 | ||||||
|  |   15,   4081,      0,   1,             0.869 | ||||||
|  |   15,   4081,      0,  -1,             0.876 | ||||||
|  |   16,   4080,      0,   0,             0.988 | ||||||
|  |   16,   4080,      0,   1,              0.99 | ||||||
|  |   16,   4080,      0,  -1,             0.989 | ||||||
|  |   17,   4079,      0,   0,             0.978 | ||||||
|  |   17,   4079,      0,   1,             0.981 | ||||||
|  |   17,   4079,      0,  -1,              0.98 | ||||||
|  |   18,   4078,      0,   0,             0.981 | ||||||
|  |   18,   4078,      0,   1,              0.98 | ||||||
|  |   18,   4078,      0,  -1,             0.985 | ||||||
|  |   19,   4077,      0,   0,             0.977 | ||||||
|  |   19,   4077,      0,   1,             0.979 | ||||||
|  |   19,   4077,      0,  -1,             0.986 | ||||||
|  |   20,   4076,      0,   0,             0.977 | ||||||
|  |   20,   4076,      0,   1,             0.986 | ||||||
|  |   20,   4076,      0,  -1,             0.984 | ||||||
|  |   21,   4075,      0,   0,             0.977 | ||||||
|  |   21,   4075,      0,   1,             0.983 | ||||||
|  |   21,   4075,      0,  -1,             0.988 | ||||||
|  |   22,   4074,      0,   0,             0.983 | ||||||
|  |   22,   4074,      0,   1,             0.994 | ||||||
|  |   22,   4074,      0,  -1,             0.993 | ||||||
|  |   23,   4073,      0,   0,              0.98 | ||||||
|  |   23,   4073,      0,   1,             0.992 | ||||||
|  |   23,   4073,      0,  -1,             0.995 | ||||||
|  |   24,   4072,      0,   0,             0.989 | ||||||
|  |   24,   4072,      0,   1,             0.989 | ||||||
|  |   24,   4072,      0,  -1,             0.991 | ||||||
|  |   25,   4071,      0,   0,              0.99 | ||||||
|  |   25,   4071,      0,   1,             0.999 | ||||||
|  |   25,   4071,      0,  -1,             0.996 | ||||||
|  |   26,   4070,      0,   0,             0.993 | ||||||
|  |   26,   4070,      0,   1,             0.995 | ||||||
|  |   26,   4070,      0,  -1,             0.998 | ||||||
|  |   27,   4069,      0,   0,             0.993 | ||||||
|  |   27,   4069,      0,   1,             0.999 | ||||||
|  |   27,   4069,      0,  -1,               1.0 | ||||||
|  |   28,   4068,      0,   0,             0.997 | ||||||
|  |   28,   4068,      0,   1,               1.0 | ||||||
|  |   28,   4068,      0,  -1,             0.999 | ||||||
|  |   29,   4067,      0,   0,             0.996 | ||||||
|  |   29,   4067,      0,   1,             0.999 | ||||||
|  |   29,   4067,      0,  -1,             0.999 | ||||||
|  |   30,   4066,      0,   0,             0.991 | ||||||
|  |   30,   4066,      0,   1,             1.001 | ||||||
|  |   30,   4066,      0,  -1,             0.999 | ||||||
|  |   31,   4065,      0,   0,             0.988 | ||||||
|  |   31,   4065,      0,   1,             0.998 | ||||||
|  |   31,   4065,      0,  -1,             0.998 | ||||||
|  | Reviewed-by: H.J. Lu <hjl.tools@gmail.com> | ||||||
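|  |  | ||||||
|  | The reworked L(between_2_3) fallback below packs the first two bytes | ||||||
|  | big-endian and shifts them right one bit so the plain 32-bit | ||||||
|  | subtraction at the end can never overflow. Roughly, in C | ||||||
|  | (illustrative only, not the glibc source): | ||||||
|  |  | ||||||
|  |     #include <stdint.h> | ||||||
|  |     #include <stddef.h> | ||||||
|  |  | ||||||
|  |     /* Branchless compare for n in [2, 3].  Both packed values stay | ||||||
|  |        below 2^31, so the signed result of x - y is exact.  */ | ||||||
|  |     static int | ||||||
|  |     memcmp_2to3 (const unsigned char *a, const unsigned char *b, | ||||||
|  |                  size_t n) | ||||||
|  |     { | ||||||
|  |       uint32_t x = (((uint32_t) a[0] << 24) | ||||||
|  |                     | ((uint32_t) a[1] << 16)) >> 1; | ||||||
|  |       uint32_t y = (((uint32_t) b[0] << 24) | ||||||
|  |                     | ((uint32_t) b[1] << 16)) >> 1; | ||||||
|  |       x |= a[n - 1];               /* last byte as the tiebreaker */ | ||||||
|  |       y |= b[n - 1]; | ||||||
|  |       return (int) (x - y); | ||||||
|  |     } | ||||||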
|  | ---
 | ||||||
|  |  sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 98 ++++++++++++-------- | ||||||
|  |  1 file changed, 61 insertions(+), 37 deletions(-) | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
 | ||||||
|  | index 16fc673e..99258cf5 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
 | ||||||
|  | @@ -429,22 +429,21 @@ L(page_cross_less_vec):
 | ||||||
|  |  # ifndef USE_AS_WMEMCMP | ||||||
|  |  	cmpl	$8, %edx | ||||||
|  |  	jae	L(between_8_15) | ||||||
|  | +	/* Fall through for [4, 7].  */
 | ||||||
|  |  	cmpl	$4, %edx | ||||||
|  | -	jae	L(between_4_7)
 | ||||||
|  | +	jb	L(between_2_3)
 | ||||||
|  |   | ||||||
|  | -	/* Load as big endian to avoid branches.  */
 | ||||||
|  | -	movzwl	(%rdi), %eax
 | ||||||
|  | -	movzwl	(%rsi), %ecx
 | ||||||
|  | -	shll	$8, %eax
 | ||||||
|  | -	shll	$8, %ecx
 | ||||||
|  | -	bswap	%eax
 | ||||||
|  | -	bswap	%ecx
 | ||||||
|  | -	movzbl	-1(%rdi, %rdx), %edi
 | ||||||
|  | -	movzbl	-1(%rsi, %rdx), %esi
 | ||||||
|  | -	orl	%edi, %eax
 | ||||||
|  | -	orl	%esi, %ecx
 | ||||||
|  | -	/* Subtraction is okay because the upper 8 bits are zero.  */
 | ||||||
|  | -	subl	%ecx, %eax
 | ||||||
|  | +	movbe	(%rdi), %eax
 | ||||||
|  | +	movbe	(%rsi), %ecx
 | ||||||
|  | +	shlq	$32, %rax
 | ||||||
|  | +	shlq	$32, %rcx
 | ||||||
|  | +	movbe	-4(%rdi, %rdx), %edi
 | ||||||
|  | +	movbe	-4(%rsi, %rdx), %esi
 | ||||||
|  | +	orq	%rdi, %rax
 | ||||||
|  | +	orq	%rsi, %rcx
 | ||||||
|  | +	subq	%rcx, %rax
 | ||||||
|  | +	/* Fast path for return zero.  */
 | ||||||
|  | +	jnz	L(ret_nonzero)
 | ||||||
|  |  	/* No ymm register was touched.  */ | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  | @@ -457,9 +456,33 @@ L(one_or_less):
 | ||||||
|  |  	/* No ymm register was touched.  */ | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  | +	.p2align 4,, 5
 | ||||||
|  | +L(ret_nonzero):
 | ||||||
|  | +	sbbl	%eax, %eax
 | ||||||
|  | +	orl	$1, %eax
 | ||||||
|  | +	/* No ymm register was touched.  */
 | ||||||
|  | +	ret
 | ||||||
|  | +
 | ||||||
|  | +	.p2align 4,, 2
 | ||||||
|  | +L(zero):
 | ||||||
|  | +	xorl	%eax, %eax
 | ||||||
|  | +	/* No ymm register was touched.  */
 | ||||||
|  | +	ret
 | ||||||
|  | +
 | ||||||
|  |  	.p2align 4 | ||||||
|  |  L(between_8_15): | ||||||
|  | -# endif
 | ||||||
|  | +	movbe	(%rdi), %rax
 | ||||||
|  | +	movbe	(%rsi), %rcx
 | ||||||
|  | +	subq	%rcx, %rax
 | ||||||
|  | +	jnz	L(ret_nonzero)
 | ||||||
|  | +	movbe	-8(%rdi, %rdx), %rax
 | ||||||
|  | +	movbe	-8(%rsi, %rdx), %rcx
 | ||||||
|  | +	subq	%rcx, %rax
 | ||||||
|  | +	/* Fast path for return zero.  */
 | ||||||
|  | +	jnz	L(ret_nonzero)
 | ||||||
|  | +	/* No ymm register was touched.  */
 | ||||||
|  | +	ret
 | ||||||
|  | +# else
 | ||||||
|  |  	/* If USE_AS_WMEMCMP fall through into 8-15 byte case.  */ | ||||||
|  |  	vmovq	(%rdi), %xmm1 | ||||||
|  |  	vmovq	(%rsi), %xmm2 | ||||||
|  | @@ -475,16 +498,13 @@ L(between_8_15):
 | ||||||
|  |  	VPCMPEQ	%xmm1, %xmm2, %xmm2 | ||||||
|  |  	vpmovmskb %xmm2, %eax | ||||||
|  |  	subl	$0xffff, %eax | ||||||
|  | +	/* Fast path for return zero.  */
 | ||||||
|  |  	jnz	L(return_vec_0) | ||||||
|  |  	/* No ymm register was touched.  */ | ||||||
|  |  	ret | ||||||
|  | +# endif
 | ||||||
|  |   | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(zero):
 | ||||||
|  | -	xorl	%eax, %eax
 | ||||||
|  | -	ret
 | ||||||
|  | -
 | ||||||
|  | -	.p2align 4
 | ||||||
|  | +	.p2align 4,, 10
 | ||||||
|  |  L(between_16_31): | ||||||
|  |  	/* From 16 to 31 bytes.  No branch when size == 16.  */ | ||||||
|  |  	vmovdqu	(%rsi), %xmm2 | ||||||
|  | @@ -501,11 +521,17 @@ L(between_16_31):
 | ||||||
|  |  	VPCMPEQ	(%rdi), %xmm2, %xmm2 | ||||||
|  |  	vpmovmskb %xmm2, %eax | ||||||
|  |  	subl	$0xffff, %eax | ||||||
|  | +	/* Fast path for return zero.  */
 | ||||||
|  |  	jnz	L(return_vec_0) | ||||||
|  |  	/* No ymm register was touched.  */ | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  |  # ifdef USE_AS_WMEMCMP | ||||||
|  | +	.p2align 4,, 2
 | ||||||
|  | +L(zero):
 | ||||||
|  | +	xorl	%eax, %eax
 | ||||||
|  | +	ret
 | ||||||
|  | +
 | ||||||
|  |  	.p2align 4 | ||||||
|  |  L(one_or_less): | ||||||
|  |  	jb	L(zero) | ||||||
|  | @@ -520,22 +546,20 @@ L(one_or_less):
 | ||||||
|  |  # else | ||||||
|  |   | ||||||
|  |  	.p2align 4 | ||||||
|  | -L(between_4_7):
 | ||||||
|  | -	/* Load as big endian with overlapping movbe to avoid branches.
 | ||||||
|  | -	 */
 | ||||||
|  | -	movbe	(%rdi), %eax
 | ||||||
|  | -	movbe	(%rsi), %ecx
 | ||||||
|  | -	shlq	$32, %rax
 | ||||||
|  | -	shlq	$32, %rcx
 | ||||||
|  | -	movbe	-4(%rdi, %rdx), %edi
 | ||||||
|  | -	movbe	-4(%rsi, %rdx), %esi
 | ||||||
|  | -	orq	%rdi, %rax
 | ||||||
|  | -	orq	%rsi, %rcx
 | ||||||
|  | -	subq	%rcx, %rax
 | ||||||
|  | -	jz	L(zero_4_7)
 | ||||||
|  | -	sbbl	%eax, %eax
 | ||||||
|  | -	orl	$1, %eax
 | ||||||
|  | -L(zero_4_7):
 | ||||||
|  | +L(between_2_3):
 | ||||||
|  | +	/* Load as big endian to avoid branches.  */
 | ||||||
|  | +	movzwl	(%rdi), %eax
 | ||||||
|  | +	movzwl	(%rsi), %ecx
 | ||||||
|  | +	bswap	%eax
 | ||||||
|  | +	bswap	%ecx
 | ||||||
|  | +	shrl	%eax
 | ||||||
|  | +	shrl	%ecx
 | ||||||
|  | +	movzbl	-1(%rdi, %rdx), %edi
 | ||||||
|  | +	movzbl	-1(%rsi, %rdx), %esi
 | ||||||
|  | +	orl	%edi, %eax
 | ||||||
|  | +	orl	%esi, %ecx
 | ||||||
|  | +	/* Subtraction is okay because the upper bit is zero.  */
 | ||||||
|  | +	subl	%ecx, %eax
 | ||||||
|  |  	/* No ymm register was touched.  */ | ||||||
|  |  	ret | ||||||
|  |  # endif | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
							
								
								
									
SOURCES/glibc-RHEL-15696-103.patch (new file, 876 lines)
									
								
							| @ -0,0 +1,876 @@ | |||||||
|  | From 5307aa9c1800f36a64c183c091c9af392c1fa75c Mon Sep 17 00:00:00 2001 | ||||||
|  | From: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  | Date: Thu, 21 Apr 2022 20:52:28 -0500 | ||||||
|  | Subject: [PATCH] x86: Optimize {str|wcs}rchr-sse2 | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | The new code unrolls the main loop slightly without adding too much | ||||||
|  | overhead and minimizes the comparisons for the search CHAR. | ||||||
|  | 
 | ||||||
|  | Geometric Mean of all benchmarks New / Old: 0.741 | ||||||
|  | See email for all results. | ||||||
|  | 
 | ||||||
|  | Full xcheck passes on x86_64 with and without multiarch enabled. | ||||||
|  | Reviewed-by: H.J. Lu <hjl.tools@gmail.com> | ||||||
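|  |  | ||||||
|  | Stripped of vectorization, the strategy is a single forward scan that | ||||||
|  | remembers the most recent match and resolves it once the null | ||||||
|  | terminator is found. Roughly, in C (illustrative only, not the glibc | ||||||
|  | source): | ||||||
|  |  | ||||||
|  |     #include <stddef.h> | ||||||
|  |  | ||||||
|  |     /* Scalar sketch of strrchr: one pass, keep the latest match. | ||||||
|  |        Searching for '\0' correctly returns the terminator.  */ | ||||||
|  |     static char * | ||||||
|  |     strrchr_sketch (const char *s, int c) | ||||||
|  |     { | ||||||
|  |       const char *last = NULL; | ||||||
|  |       do | ||||||
|  |         { | ||||||
|  |           if (*s == (char) c) | ||||||
|  |             last = s; | ||||||
|  |         } | ||||||
|  |       while (*s++ != '\0'); | ||||||
|  |       return (char *) last; | ||||||
|  |     } | ||||||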
|  | ---
 | ||||||
|  |  sysdeps/x86_64/multiarch/strrchr-sse2.S |   2 +- | ||||||
|  |  sysdeps/x86_64/multiarch/wcsrchr-sse2.S |   3 +- | ||||||
|  |  sysdeps/x86_64/strrchr.S                | 510 +++++++++++++++--------- | ||||||
|  |  sysdeps/x86_64/wcsrchr.S                | 266 +----------- | ||||||
|  |  4 files changed, 338 insertions(+), 443 deletions(-) | ||||||
|  | 
 | ||||||
|  | Conflicts: | ||||||
|  | 	sysdeps/x86_64/wcsrchr.S | ||||||
|  | 	(copyright header) | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S
 | ||||||
|  | index 0ec76fe9..6bb1284b 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/strrchr-sse2.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S
 | ||||||
|  | @@ -17,7 +17,7 @@
 | ||||||
|  |     <http://www.gnu.org/licenses/>.  */ | ||||||
|  |   | ||||||
|  |  #if IS_IN (libc) | ||||||
|  | -# define strrchr __strrchr_sse2
 | ||||||
|  | +# define STRRCHR __strrchr_sse2
 | ||||||
|  |   | ||||||
|  |  # undef weak_alias | ||||||
|  |  # define weak_alias(strrchr, rindex) | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
 | ||||||
|  | index d015e953..f26d53b5 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
 | ||||||
|  | @@ -17,7 +17,6 @@
 | ||||||
|  |     <http://www.gnu.org/licenses/>.  */ | ||||||
|  |   | ||||||
|  |  #if IS_IN (libc) | ||||||
|  | -# define wcsrchr __wcsrchr_sse2
 | ||||||
|  | +# define STRRCHR	__wcsrchr_sse2
 | ||||||
|  |  #endif | ||||||
|  | -
 | ||||||
|  |  #include "../wcsrchr.S" | ||||||
|  | diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
 | ||||||
|  | index aca98e7e..a58cc220 100644
 | ||||||
|  | --- a/sysdeps/x86_64/strrchr.S
 | ||||||
|  | +++ b/sysdeps/x86_64/strrchr.S
 | ||||||
|  | @@ -19,210 +19,360 @@
 | ||||||
|  |   | ||||||
|  |  #include <sysdep.h> | ||||||
|  |   | ||||||
|  | +#ifndef STRRCHR
 | ||||||
|  | +# define STRRCHR	strrchr
 | ||||||
|  | +#endif
 | ||||||
|  | +
 | ||||||
|  | +#ifdef USE_AS_WCSRCHR
 | ||||||
|  | +# define PCMPEQ	pcmpeqd
 | ||||||
|  | +# define CHAR_SIZE	4
 | ||||||
|  | +# define PMINU	pminud
 | ||||||
|  | +#else
 | ||||||
|  | +# define PCMPEQ	pcmpeqb
 | ||||||
|  | +# define CHAR_SIZE	1
 | ||||||
|  | +# define PMINU	pminub
 | ||||||
|  | +#endif
 | ||||||
|  | +
 | ||||||
|  | +#define PAGE_SIZE	4096
 | ||||||
|  | +#define VEC_SIZE	16
 | ||||||
|  | +
 | ||||||
|  |  	.text | ||||||
|  | -ENTRY (strrchr)
 | ||||||
|  | -	movd	%esi, %xmm1
 | ||||||
|  | +ENTRY(STRRCHR)
 | ||||||
|  | +	movd	%esi, %xmm0
 | ||||||
|  |  	movq	%rdi, %rax | ||||||
|  | -	andl	$4095, %eax
 | ||||||
|  | -	punpcklbw	%xmm1, %xmm1
 | ||||||
|  | -	cmpq	$4032, %rax
 | ||||||
|  | -	punpcklwd	%xmm1, %xmm1
 | ||||||
|  | -	pshufd	$0, %xmm1, %xmm1
 | ||||||
|  | +	andl	$(PAGE_SIZE - 1), %eax
 | ||||||
|  | +#ifndef USE_AS_WCSRCHR
 | ||||||
|  | +	punpcklbw %xmm0, %xmm0
 | ||||||
|  | +	punpcklwd %xmm0, %xmm0
 | ||||||
|  | +#endif
 | ||||||
|  | +	pshufd	$0, %xmm0, %xmm0
 | ||||||
|  | +	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 | ||||||
|  |  	ja	L(cross_page) | ||||||
|  | -	movdqu	(%rdi), %xmm0
 | ||||||
|  | +
 | ||||||
|  | +L(cross_page_continue):
 | ||||||
|  | +	movups	(%rdi), %xmm1
 | ||||||
|  |  	pxor	%xmm2, %xmm2 | ||||||
|  | -	movdqa	%xmm0, %xmm3
 | ||||||
|  | -	pcmpeqb	%xmm1, %xmm0
 | ||||||
|  | -	pcmpeqb	%xmm2, %xmm3
 | ||||||
|  | -	pmovmskb	%xmm0, %ecx
 | ||||||
|  | -	pmovmskb	%xmm3, %edx
 | ||||||
|  | -	testq	%rdx, %rdx
 | ||||||
|  | -	je	L(next_48_bytes)
 | ||||||
|  | -	leaq	-1(%rdx), %rax
 | ||||||
|  | -	xorq	%rdx, %rax
 | ||||||
|  | -	andq	%rcx, %rax
 | ||||||
|  | -	je	L(exit)
 | ||||||
|  | -	bsrq	%rax, %rax
 | ||||||
|  | +	PCMPEQ	%xmm1, %xmm2
 | ||||||
|  | +	pmovmskb %xmm2, %ecx
 | ||||||
|  | +	testl	%ecx, %ecx
 | ||||||
|  | +	jz	L(aligned_more)
 | ||||||
|  | +
 | ||||||
|  | +	PCMPEQ	%xmm0, %xmm1
 | ||||||
|  | +	pmovmskb %xmm1, %eax
 | ||||||
|  | +	leal	-1(%rcx), %edx
 | ||||||
|  | +	xorl	%edx, %ecx
 | ||||||
|  | +	andl	%ecx, %eax
 | ||||||
|  | +	jz	L(ret0)
 | ||||||
|  | +	bsrl	%eax, %eax
 | ||||||
|  |  	addq	%rdi, %rax | ||||||
|  | +	/* We are off by 3 for wcsrchr if search CHAR is non-zero. If
 | ||||||
|  | +	   search CHAR is zero we are correct. Either way `andq
 | ||||||
|  | +	   -CHAR_SIZE, %rax` gets the correct result.  */
 | ||||||
|  | +#ifdef USE_AS_WCSRCHR
 | ||||||
|  | +	andq	$-CHAR_SIZE, %rax
 | ||||||
|  | +#endif
 | ||||||
|  | +L(ret0):
 | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  | +	/* Returns for first vec x1/x2 have hard coded backward search
 | ||||||
|  | +	   path for earlier matches.  */
 | ||||||
|  |  	.p2align 4 | ||||||
|  | -L(next_48_bytes):
 | ||||||
|  | -	movdqu	16(%rdi), %xmm4
 | ||||||
|  | -	movdqa	%xmm4, %xmm5
 | ||||||
|  | -	movdqu	32(%rdi), %xmm3
 | ||||||
|  | -	pcmpeqb	%xmm1, %xmm4
 | ||||||
|  | -	pcmpeqb	%xmm2, %xmm5
 | ||||||
|  | -	movdqu	48(%rdi), %xmm0
 | ||||||
|  | -	pmovmskb	%xmm5, %edx
 | ||||||
|  | -	movdqa	%xmm3, %xmm5
 | ||||||
|  | -	pcmpeqb	%xmm1, %xmm3
 | ||||||
|  | -	pcmpeqb	%xmm2, %xmm5
 | ||||||
|  | -	pcmpeqb	%xmm0, %xmm2
 | ||||||
|  | -	salq	$16, %rdx
 | ||||||
|  | -	pmovmskb	%xmm3, %r8d
 | ||||||
|  | -	pmovmskb	%xmm5, %eax
 | ||||||
|  | -	pmovmskb	%xmm2, %esi
 | ||||||
|  | -	salq	$32, %r8
 | ||||||
|  | -	salq	$32, %rax
 | ||||||
|  | -	pcmpeqb	%xmm1, %xmm0
 | ||||||
|  | -	orq	%rdx, %rax
 | ||||||
|  | -	movq	%rsi, %rdx
 | ||||||
|  | -	pmovmskb	%xmm4, %esi
 | ||||||
|  | -	salq	$48, %rdx
 | ||||||
|  | -	salq	$16, %rsi
 | ||||||
|  | -	orq	%r8, %rsi
 | ||||||
|  | -	orq	%rcx, %rsi
 | ||||||
|  | -	pmovmskb	%xmm0, %ecx
 | ||||||
|  | -	salq	$48, %rcx
 | ||||||
|  | -	orq	%rcx, %rsi
 | ||||||
|  | -	orq	%rdx, %rax
 | ||||||
|  | -	je	L(loop_header2)
 | ||||||
|  | -	leaq	-1(%rax), %rcx
 | ||||||
|  | -	xorq	%rax, %rcx
 | ||||||
|  | -	andq	%rcx, %rsi
 | ||||||
|  | -	je	L(exit)
 | ||||||
|  | -	bsrq	%rsi, %rsi
 | ||||||
|  | -	leaq	(%rdi,%rsi), %rax
 | ||||||
|  | +L(first_vec_x0_test):
 | ||||||
|  | +	PCMPEQ	%xmm0, %xmm1
 | ||||||
|  | +	pmovmskb %xmm1, %eax
 | ||||||
|  | +	testl	%eax, %eax
 | ||||||
|  | +	jz	L(ret0)
 | ||||||
|  | +	bsrl	%eax, %eax
 | ||||||
|  | +	addq	%r8, %rax
 | ||||||
|  | +#ifdef USE_AS_WCSRCHR
 | ||||||
|  | +	andq	$-CHAR_SIZE, %rax
 | ||||||
|  | +#endif
 | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  |  	.p2align 4 | ||||||
|  | -L(loop_header2):
 | ||||||
|  | -	testq	%rsi, %rsi
 | ||||||
|  | -	movq	%rdi, %rcx
 | ||||||
|  | -	je	L(no_c_found)
 | ||||||
|  | -L(loop_header):
 | ||||||
|  | -	addq	$64, %rdi
 | ||||||
|  | -	pxor	%xmm7, %xmm7
 | ||||||
|  | -	andq	$-64, %rdi
 | ||||||
|  | -	jmp	L(loop_entry)
 | ||||||
|  | +L(first_vec_x1):
 | ||||||
|  | +	PCMPEQ	%xmm0, %xmm2
 | ||||||
|  | +	pmovmskb %xmm2, %eax
 | ||||||
|  | +	leal	-1(%rcx), %edx
 | ||||||
|  | +	xorl	%edx, %ecx
 | ||||||
|  | +	andl	%ecx, %eax
 | ||||||
|  | +	jz	L(first_vec_x0_test)
 | ||||||
|  | +	bsrl	%eax, %eax
 | ||||||
|  | +	leaq	(VEC_SIZE)(%rdi, %rax), %rax
 | ||||||
|  | +#ifdef USE_AS_WCSRCHR
 | ||||||
|  | +	andq	$-CHAR_SIZE, %rax
 | ||||||
|  | +#endif
 | ||||||
|  | +	ret
 | ||||||
|  |   | ||||||
|  |  	.p2align 4 | ||||||
|  | -L(loop64):
 | ||||||
|  | -	testq	%rdx, %rdx
 | ||||||
-	cmovne	%rdx, %rsi
-	cmovne	%rdi, %rcx
-	addq	$64, %rdi
-L(loop_entry):
-	movdqa	32(%rdi), %xmm3
-	pxor	%xmm6, %xmm6
-	movdqa	48(%rdi), %xmm2
-	movdqa	%xmm3, %xmm0
-	movdqa	16(%rdi), %xmm4
-	pminub	%xmm2, %xmm0
-	movdqa	(%rdi), %xmm5
-	pminub	%xmm4, %xmm0
-	pminub	%xmm5, %xmm0
-	pcmpeqb	%xmm7, %xmm0
-	pmovmskb	%xmm0, %eax
-	movdqa	%xmm5, %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb	%xmm0, %r9d
-	movdqa	%xmm4, %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb	%xmm0, %edx
-	movdqa	%xmm3, %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	salq	$16, %rdx
-	pmovmskb	%xmm0, %r10d
-	movdqa	%xmm2, %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	salq	$32, %r10
-	orq	%r10, %rdx
-	pmovmskb	%xmm0, %r8d
-	orq	%r9, %rdx
-	salq	$48, %r8
-	orq	%r8, %rdx
+L(first_vec_x1_test):
+	PCMPEQ	%xmm0, %xmm2
+	pmovmskb %xmm2, %eax
 	testl	%eax, %eax
-	je	L(loop64)
-	pcmpeqb	%xmm6, %xmm4
-	pcmpeqb	%xmm6, %xmm3
-	pcmpeqb	%xmm6, %xmm5
-	pmovmskb	%xmm4, %eax
-	pmovmskb	%xmm3, %r10d
-	pcmpeqb	%xmm6, %xmm2
-	pmovmskb	%xmm5, %r9d
-	salq	$32, %r10
-	salq	$16, %rax
-	pmovmskb	%xmm2, %r8d
-	orq	%r10, %rax
-	orq	%r9, %rax
-	salq	$48, %r8
-	orq	%r8, %rax
-	leaq	-1(%rax), %r8
-	xorq	%rax, %r8
-	andq	%r8, %rdx
-	cmovne	%rdi, %rcx
-	cmovne	%rdx, %rsi
-	bsrq	%rsi, %rsi
-	leaq	(%rcx,%rsi), %rax
+	jz	L(first_vec_x0_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4
+L(first_vec_x2):
+	PCMPEQ	%xmm0, %xmm3
+	pmovmskb %xmm3, %eax
+	leal	-1(%rcx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(first_vec_x1_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4
+L(aligned_more):
+	/* Save original pointer if match was in VEC 0.  */
+	movq	%rdi, %r8
+	andq	$-VEC_SIZE, %rdi
+
+	movaps	VEC_SIZE(%rdi), %xmm2
+	pxor	%xmm3, %xmm3
+	PCMPEQ	%xmm2, %xmm3
+	pmovmskb %xmm3, %ecx
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x1)
+
+	movaps	(VEC_SIZE * 2)(%rdi), %xmm3
+	pxor	%xmm4, %xmm4
+	PCMPEQ	%xmm3, %xmm4
+	pmovmskb %xmm4, %ecx
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x2)
+
+	addq	$VEC_SIZE, %rdi
+	/* Save pointer again before realigning.  */
+	movq	%rdi, %rsi
+	andq	$-(VEC_SIZE * 2), %rdi
+	.p2align 4
+L(first_loop):
+	/* Do 2x VEC at a time.  */
+	movaps	(VEC_SIZE * 2)(%rdi), %xmm4
+	movaps	(VEC_SIZE * 3)(%rdi), %xmm5
+	/* Since SSE2 has no pminud, wcsrchr needs separate logic for
+	   detecting zero. Note: if this is found to be a bottleneck it
+	   may be worth adding an SSE4.1 wcsrchr implementation.  */
+#ifdef USE_AS_WCSRCHR
+	movaps	%xmm5, %xmm6
+	pxor	%xmm8, %xmm8
+
+	PCMPEQ	%xmm8, %xmm5
+	PCMPEQ	%xmm4, %xmm8
+	por	%xmm5, %xmm8
+#else
+	movaps	%xmm5, %xmm6
+	PMINU	%xmm4, %xmm5
+#endif
+
+	movaps	%xmm4, %xmm9
+	PCMPEQ	%xmm0, %xmm4
+	PCMPEQ	%xmm0, %xmm6
+	movaps	%xmm6, %xmm7
+	por	%xmm4, %xmm6
+#ifndef USE_AS_WCSRCHR
+	pxor	%xmm8, %xmm8
+	PCMPEQ	%xmm5, %xmm8
+#endif
+	pmovmskb %xmm8, %ecx
+	pmovmskb %xmm6, %eax
+
+	addq	$(VEC_SIZE * 2), %rdi
+	/* Use `addl` 1) so we can undo it with `subl` and 2) it can
+	   macro-fuse with `jz`.  */
+	addl	%ecx, %eax
+	jz	L(first_loop)
+
+	/* Check if there is a zero match.  */
+	testl	%ecx, %ecx
+	jz	L(second_loop_match)
+
+	/* Check if there was a match in the last iteration.  */
+	subl	%ecx, %eax
+	jnz	L(new_match)
+
+L(first_loop_old_match):
+	PCMPEQ	%xmm0, %xmm2
+	PCMPEQ	%xmm0, %xmm3
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm3, %eax
+	addl	%eax, %ecx
+	jz	L(first_vec_x0_test)
+	/* NB: We could move this shift to before the branch and save a
+	   bit of code size / performance on the fall through. The
+	   branch leads to the null case which generally seems hotter
+	   than char in first 3x VEC.  */
+	sall	$16, %eax
+	orl	%ecx, %eax
+
+	bsrl	%eax, %eax
+	addq	%rsi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4
+L(new_match):
+	pxor	%xmm6, %xmm6
+	PCMPEQ	%xmm9, %xmm6
+	pmovmskb %xmm6, %eax
+	sall	$16, %ecx
+	orl	%eax, %ecx
+
+	/* We can't reuse either of the old comparisons: since we mask
+	   off zeros after the first zero (instead of using the full
+	   comparison) we can't guarantee no interference between a
+	   match after the end of the string and a valid match.  */
+	pmovmskb %xmm4, %eax
+	pmovmskb %xmm7, %edx
+	sall	$16, %edx
+	orl	%edx, %eax
+
+	leal	-1(%ecx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(first_loop_old_match)
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
 	ret
 
+	/* Save minimum state for getting the most recent match. We can
+	   throw out all previous work.  */
 	.p2align 4
-L(no_c_found):
-	movl	$1, %esi
-	xorl	%ecx, %ecx
-	jmp	L(loop_header)
+L(second_loop_match):
+	movq	%rdi, %rsi
+	movaps	%xmm4, %xmm2
+	movaps	%xmm7, %xmm3
 
 	.p2align 4
-L(exit):
-	xorl	%eax, %eax
+L(second_loop):
+	movaps	(VEC_SIZE * 2)(%rdi), %xmm4
+	movaps	(VEC_SIZE * 3)(%rdi), %xmm5
+	/* Since SSE2 has no pminud, wcsrchr needs separate logic for
+	   detecting zero. Note: if this is found to be a bottleneck it
+	   may be worth adding an SSE4.1 wcsrchr implementation.  */
+#ifdef USE_AS_WCSRCHR
+	movaps	%xmm5, %xmm6
+	pxor	%xmm8, %xmm8
+
+	PCMPEQ	%xmm8, %xmm5
+	PCMPEQ	%xmm4, %xmm8
+	por	%xmm5, %xmm8
+#else
+	movaps	%xmm5, %xmm6
+	PMINU	%xmm4, %xmm5
+#endif
+
+	movaps	%xmm4, %xmm9
+	PCMPEQ	%xmm0, %xmm4
+	PCMPEQ	%xmm0, %xmm6
+	movaps	%xmm6, %xmm7
+	por	%xmm4, %xmm6
+#ifndef USE_AS_WCSRCHR
+	pxor	%xmm8, %xmm8
+	PCMPEQ	%xmm5, %xmm8
+#endif
+
+	pmovmskb %xmm8, %ecx
+	pmovmskb %xmm6, %eax
+
+	addq	$(VEC_SIZE * 2), %rdi
+	/* Either null term or new occurrence of CHAR.  */
+	addl	%ecx, %eax
+	jz	L(second_loop)
+
+	/* No null term so it must be a new occurrence of CHAR.  */
+	testl	%ecx, %ecx
+	jz	L(second_loop_match)
+
+
+	subl	%ecx, %eax
+	jnz	L(second_loop_new_match)
+
+L(second_loop_old_match):
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm3, %eax
+	sall	$16, %eax
+	orl	%ecx, %eax
+	bsrl	%eax, %eax
+	addq	%rsi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
 	ret
 
 	.p2align 4
+L(second_loop_new_match):
+	pxor	%xmm6, %xmm6
+	PCMPEQ	%xmm9, %xmm6
+	pmovmskb %xmm6, %eax
+	sall	$16, %ecx
+	orl	%eax, %ecx
+
+	/* We can't reuse either of the old comparisons: since we mask
+	   off zeros after the first zero (instead of using the full
+	   comparison) we can't guarantee no interference between a
+	   match after the end of the string and a valid match.  */
+	pmovmskb %xmm4, %eax
+	pmovmskb %xmm7, %edx
+	sall	$16, %edx
+	orl	%edx, %eax
+
+	leal	-1(%ecx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(second_loop_old_match)
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4,, 4
 L(cross_page):
-	movq	%rdi, %rax
-	pxor	%xmm0, %xmm0
-	andq	$-64, %rax
-	movdqu	(%rax), %xmm5
-	movdqa	%xmm5, %xmm6
-	movdqu	16(%rax), %xmm4
-	pcmpeqb	%xmm1, %xmm5
-	pcmpeqb	%xmm0, %xmm6
-	movdqu	32(%rax), %xmm3
-	pmovmskb	%xmm6, %esi
-	movdqa	%xmm4, %xmm6
-	movdqu	48(%rax), %xmm2
-	pcmpeqb	%xmm1, %xmm4
-	pcmpeqb	%xmm0, %xmm6
-	pmovmskb	%xmm6, %edx
-	movdqa	%xmm3, %xmm6
-	pcmpeqb	%xmm1, %xmm3
-	pcmpeqb	%xmm0, %xmm6
-	pcmpeqb	%xmm2, %xmm0
-	salq	$16, %rdx
-	pmovmskb	%xmm3, %r9d
-	pmovmskb	%xmm6, %r8d
-	pmovmskb	%xmm0, %ecx
-	salq	$32, %r9
-	salq	$32, %r8
-	pcmpeqb	%xmm1, %xmm2
-	orq	%r8, %rdx
-	salq	$48, %rcx
-	pmovmskb	%xmm5, %r8d
-	orq	%rsi, %rdx
-	pmovmskb	%xmm4, %esi
-	orq	%rcx, %rdx
-	pmovmskb	%xmm2, %ecx
-	salq	$16, %rsi
-	salq	$48, %rcx
-	orq	%r9, %rsi
-	orq	%r8, %rsi
-	orq	%rcx, %rsi
+	movq	%rdi, %rsi
+	andq	$-VEC_SIZE, %rsi
+	movaps	(%rsi), %xmm1
+	pxor	%xmm2, %xmm2
+	PCMPEQ	%xmm1, %xmm2
+	pmovmskb %xmm2, %edx
 	movl	%edi, %ecx
-	subl	%eax, %ecx
-	shrq	%cl, %rdx
-	shrq	%cl, %rsi
-	testq	%rdx, %rdx
-	je	L(loop_header2)
-	leaq	-1(%rdx), %rax
-	xorq	%rdx, %rax
-	andq	%rax, %rsi
-	je	L(exit)
-	bsrq	%rsi, %rax
+	andl	$(VEC_SIZE - 1), %ecx
+	sarl	%cl, %edx
+	jz	L(cross_page_continue)
+	PCMPEQ	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	sarl	%cl, %eax
+	leal	-1(%rdx), %ecx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret1)
+	bsrl	%eax, %eax
 	addq	%rdi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+L(ret1):
 	ret
-END (strrchr)
+END(STRRCHR)
 
-weak_alias (strrchr, rindex)
-libc_hidden_builtin_def (strrchr)
+#ifndef USE_AS_WCSRCHR
+	weak_alias (STRRCHR, rindex)
+	libc_hidden_builtin_def (STRRCHR)
+#endif
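An editorial aside on the hunk above (not part of the patch): the recurring `leal -1(%rcx), %edx; xorl %edx, %ecx; andl %ecx, %eax` sequence is the classic x ^ (x - 1) trick for keeping only the CHAR matches at or before the first zero byte. A minimal C sketch, assuming pmovmskb-style bitmasks; the function name is illustrative, not glibc's:

/* Keep only the CHAR matches at or before the first NUL of a block.
   zmask/cmask are pmovmskb-style masks: bit i set means byte i of the
   block is a NUL / a CHAR match respectively.  */
static unsigned int
in_range_matches (unsigned int zmask, unsigned int cmask)
{
  /* zmask ^ (zmask - 1) sets every bit up to and including the lowest
     set bit of zmask; when zmask == 0 it is all-ones, so nothing is
     dropped.  BMI1's blsmsk computes this in one instruction, which is
     what the AVX2/EVEX patches below use.  */
  return cmask & (zmask ^ (zmask - 1));
}

For example, with a NUL at byte 2 and CHAR matches at bytes 1 and 5, in_range_matches (0x04, 0x22) yields 0x02: the match past the terminator is discarded before bsr picks the last survivor.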
diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
index 2f388537..ae3cfa7d 100644
--- a/sysdeps/x86_64/wcsrchr.S
+++ b/sysdeps/x86_64/wcsrchr.S
@@ -17,266 +17,12 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
 
-	.text
-ENTRY (wcsrchr)
+#define USE_AS_WCSRCHR	1
+#define NO_PMINU	1
 
-	movd	%rsi, %xmm1
-	mov	%rdi, %rcx
-	punpckldq %xmm1, %xmm1
-	pxor	%xmm2, %xmm2
-	punpckldq %xmm1, %xmm1
-	and	$63, %rcx
-	cmp	$48, %rcx
-	ja	L(crosscache)
+#ifndef STRRCHR
+# define STRRCHR	wcsrchr
+#endif
 
-	movdqu	(%rdi), %xmm0
-	pcmpeqd	%xmm0, %xmm2
-	pcmpeqd	%xmm1, %xmm0
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm0, %rax
-	add	$16, %rdi
-
-	test	%rax, %rax
-	jnz	L(unaligned_match1)
-
-	test	%rcx, %rcx
-	jnz	L(return_null)
-
-	and	$-16, %rdi
-	xor	%r8, %r8
-	jmp	L(loop)
-
-	.p2align 4
-L(unaligned_match1):
-	test	%rcx, %rcx
-	jnz	L(prolog_find_zero_1)
-
-	mov	%rax, %r8
-	mov	%rdi, %rsi
-	and	$-16, %rdi
-	jmp	L(loop)
-
-	.p2align 4
-L(crosscache):
-	and	$15, %rcx
-	and	$-16, %rdi
-	pxor	%xmm3, %xmm3
-	movdqa	(%rdi), %xmm0
-	pcmpeqd	%xmm0, %xmm3
-	pcmpeqd	%xmm1, %xmm0
-	pmovmskb %xmm3, %rdx
-	pmovmskb %xmm0, %rax
-	shr	%cl, %rdx
-	shr	%cl, %rax
-	add	$16, %rdi
-
-	test	%rax, %rax
-	jnz	L(unaligned_match)
-
-	test	%rdx, %rdx
-	jnz	L(return_null)
-
-	xor	%r8, %r8
-	jmp	L(loop)
-
-	.p2align 4
-L(unaligned_match):
-	test	%rdx, %rdx
-	jnz	L(prolog_find_zero)
-
-	mov	%rax, %r8
-	lea	(%rdi, %rcx), %rsi
-
-/* Loop start on aligned string.  */
-	.p2align 4
-L(loop):
-	movdqa	(%rdi), %xmm0
-	pcmpeqd	%xmm0, %xmm2
-	add	$16, %rdi
-	pcmpeqd	%xmm1, %xmm0
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm0, %rax
-	or	%rax, %rcx
-	jnz	L(matches)
-
-	movdqa	(%rdi), %xmm3
-	pcmpeqd	%xmm3, %xmm2
-	add	$16, %rdi
-	pcmpeqd	%xmm1, %xmm3
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm3, %rax
-	or	%rax, %rcx
-	jnz	L(matches)
-
-	movdqa	(%rdi), %xmm4
-	pcmpeqd	%xmm4, %xmm2
-	add	$16, %rdi
-	pcmpeqd	%xmm1, %xmm4
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm4, %rax
-	or	%rax, %rcx
-	jnz	L(matches)
-
-	movdqa	(%rdi), %xmm5
-	pcmpeqd	%xmm5, %xmm2
-	add	$16, %rdi
-	pcmpeqd	%xmm1, %xmm5
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm5, %rax
-	or	%rax, %rcx
-	jz	L(loop)
-
-	.p2align 4
-L(matches):
-	test	%rax, %rax
-	jnz	L(match)
-L(return_value):
-	test	%r8, %r8
-	jz	L(return_null)
-	mov	%r8, %rax
-	mov	%rsi, %rdi
-
-	test	$15 << 4, %ah
-	jnz	L(match_fourth_wchar)
-	test	%ah, %ah
-	jnz	L(match_third_wchar)
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(match):
-	pmovmskb %xmm2, %rcx
-	test	%rcx, %rcx
-	jnz	L(find_zero)
-	mov	%rax, %r8
-	mov	%rdi, %rsi
-	jmp	L(loop)
-
-	.p2align 4
-L(find_zero):
-	test	$15, %cl
-	jnz	L(find_zero_in_first_wchar)
-	test	%cl, %cl
-	jnz	L(find_zero_in_second_wchar)
-	test	$15, %ch
-	jnz	L(find_zero_in_third_wchar)
-
-	and	$1 << 13 - 1, %rax
-	jz	L(return_value)
-
-	test	$15 << 4, %ah
-	jnz	L(match_fourth_wchar)
-	test	%ah, %ah
-	jnz	L(match_third_wchar)
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(find_zero_in_first_wchar):
-	test	$1, %rax
-	jz	L(return_value)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(find_zero_in_second_wchar):
-	and	$1 << 5 - 1, %rax
-	jz	L(return_value)
-
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(find_zero_in_third_wchar):
-	and	$1 << 9 - 1, %rax
-	jz	L(return_value)
-
-	test	%ah, %ah
-	jnz	L(match_third_wchar)
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(prolog_find_zero):
-	add	%rcx, %rdi
-	mov     %rdx, %rcx
-L(prolog_find_zero_1):
-	test	$15, %cl
-	jnz	L(prolog_find_zero_in_first_wchar)
-	test	%cl, %cl
-	jnz	L(prolog_find_zero_in_second_wchar)
-	test	$15, %ch
-	jnz	L(prolog_find_zero_in_third_wchar)
-
-	and	$1 << 13 - 1, %rax
-	jz	L(return_null)
-
-	test	$15 << 4, %ah
-	jnz	L(match_fourth_wchar)
-	test	%ah, %ah
-	jnz	L(match_third_wchar)
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(prolog_find_zero_in_first_wchar):
-	test	$1, %rax
-	jz	L(return_null)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(prolog_find_zero_in_second_wchar):
-	and	$1 << 5 - 1, %rax
-	jz	L(return_null)
-
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(prolog_find_zero_in_third_wchar):
-	and	$1 << 9 - 1, %rax
-	jz	L(return_null)
-
-	test	%ah, %ah
-	jnz	L(match_third_wchar)
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(match_second_wchar):
-	lea	-12(%rdi), %rax
-	ret
-
-	.p2align 4
-L(match_third_wchar):
-	lea	-8(%rdi), %rax
-	ret
-
-	.p2align 4
-L(match_fourth_wchar):
-	lea	-4(%rdi), %rax
-	ret
-
-	.p2align 4
-L(return_null):
-	xor	%rax, %rax
-	ret
-
-END (wcsrchr)
+#include "../strrchr.S"
-- 
GitLab
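Another editorial sketch before the next patch (not part of it): the PMINU/NO_PMINU split above boils down to how a zero character is detected across two vectors at once. An unsigned byte-wise min has a zero lane exactly where either input does, so a single compare against zero covers both vectors; SSE2 only provides the byte-wide pminub, not pminud, hence the separate wcsrchr path. A sketch with SSE2 intrinsics; the function name is illustrative:

#include <emmintrin.h>

/* Nonzero iff either 16-byte vector contains a zero byte.  Folding the
   two vectors with pminub first halves the zero-detection compares in
   the 2x-VEC loops above.  */
static int
has_zero_byte_2x (__m128i a, __m128i b)
{
  __m128i folded = _mm_min_epu8 (a, b);
  __m128i zeros = _mm_cmpeq_epi8 (folded, _mm_setzero_si128 ());
  return _mm_movemask_epi8 (zeros);
}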
SOURCES/glibc-RHEL-15696-104.patch (new file, 501 lines)
@@ -0,0 +1,501 @@
From df7e295d18ffa34f629578c0017a9881af7620f6 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Thu, 21 Apr 2022 20:52:29 -0500
Subject: [PATCH] x86: Optimize {str|wcs}rchr-avx2
Content-type: text/plain; charset=UTF-8

The new code unrolls the main loop slightly without adding too much
overhead and minimizes the comparisons for the search CHAR.

Geometric Mean of all benchmarks New / Old: 0.832
See email for all results.

Full xcheck passes on x86_64 with and without multiarch enabled.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
 sysdeps/x86_64/multiarch/strrchr-avx2.S | 426 +++++++++++++++---------
 1 file changed, 269 insertions(+), 157 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
index c949410b..3d26fad4 100644
--- a/sysdeps/x86_64/multiarch/strrchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S
@@ -27,9 +27,13 @@
 # ifdef USE_AS_WCSRCHR
 #  define VPBROADCAST	vpbroadcastd
 #  define VPCMPEQ	vpcmpeqd
+#  define VPMIN	vpminud
+#  define CHAR_SIZE	4
 # else
 #  define VPBROADCAST	vpbroadcastb
 #  define VPCMPEQ	vpcmpeqb
+#  define VPMIN	vpminub
+#  define CHAR_SIZE	1
 # endif
 
 # ifndef VZEROUPPER
@@ -41,196 +45,304 @@
 # endif
 
 # define VEC_SIZE	32
+# define PAGE_SIZE	4096
 
-	.section SECTION(.text),"ax",@progbits
-ENTRY (STRRCHR)
-	movd	%esi, %xmm4
-	movl	%edi, %ecx
+	.section SECTION(.text), "ax", @progbits
+ENTRY(STRRCHR)
+	movd	%esi, %xmm7
+	movl	%edi, %eax
 	/* Broadcast CHAR to YMM4.  */
-	VPBROADCAST %xmm4, %ymm4
+	VPBROADCAST %xmm7, %ymm7
 	vpxor	%xmm0, %xmm0, %xmm0
 
-	/* Check if we may cross page boundary with one vector load.  */
-	andl	$(2 * VEC_SIZE - 1), %ecx
-	cmpl	$VEC_SIZE, %ecx
-	ja	L(cros_page_boundary)
+	/* Shift here instead of `andl` to save code size (saves a fetch
+	   block).  */
+	sall	$20, %eax
+	cmpl	$((PAGE_SIZE - VEC_SIZE) << 20), %eax
+	ja	L(cross_page)
 
+L(page_cross_continue):
 	vmovdqu	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
-	vpmovmskb %ymm3, %eax
-	addq	$VEC_SIZE, %rdi
+	/* Check end of string match.  */
+	VPCMPEQ	%ymm1, %ymm0, %ymm6
+	vpmovmskb %ymm6, %ecx
+	testl	%ecx, %ecx
+	jz	L(aligned_more)
+
+	/* Only check match with search CHAR if needed.  */
+	VPCMPEQ	%ymm1, %ymm7, %ymm1
+	vpmovmskb %ymm1, %eax
+	/* Check if match before first zero.  */
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret0)
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+	/* We are off by 3 for wcsrchr if search CHAR is non-zero. If
+	   search CHAR is zero we are correct. Either way `andq
+	   -CHAR_SIZE, %rax` gets the correct result.  */
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+L(ret0):
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
+
+	/* Returns for first vec x1/x2 have hard coded backward search
+	   path for earlier matches.  */
+	.p2align 4,, 10
+L(first_vec_x1):
+	VPCMPEQ	%ymm2, %ymm7, %ymm6
+	vpmovmskb %ymm6, %eax
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jnz	L(first_vec_x1_return)
+
+	.p2align 4,, 4
+L(first_vec_x0_test):
+	VPCMPEQ	%ymm1, %ymm7, %ymm6
+	vpmovmskb %ymm6, %eax
+	testl	%eax, %eax
+	jz	L(ret1)
+	bsrl	%eax, %eax
+	addq	%r8, %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+L(ret1):
+	VZEROUPPER_RETURN
 
+	.p2align 4,, 10
+L(first_vec_x0_x1_test):
+	VPCMPEQ	%ymm2, %ymm7, %ymm6
+	vpmovmskb %ymm6, %eax
+	/* Check ymm2 for search CHAR match. If no match then check ymm1
+	   before returning.  */
 	testl	%eax, %eax
-	jnz	L(first_vec)
+	jz	L(first_vec_x0_test)
+	.p2align 4,, 4
+L(first_vec_x1_return):
+	bsrl	%eax, %eax
+	leaq	1(%rdi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+	VZEROUPPER_RETURN
 
-	testl	%ecx, %ecx
-	jnz	L(return_null)
 
-	andq	$-VEC_SIZE, %rdi
-	xorl	%edx, %edx
-	jmp	L(aligned_loop)
+	.p2align 4,, 10
+L(first_vec_x2):
+	VPCMPEQ	%ymm3, %ymm7, %ymm6
+	vpmovmskb %ymm6, %eax
+	blsmskl	%ecx, %ecx
+	/* If no in-range search CHAR match in ymm3 then need to check
+	   ymm1/ymm2 for an earlier match (we delay checking search
+	   CHAR matches until needed).  */
+	andl	%ecx, %eax
+	jz	L(first_vec_x0_x1_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE + 1)(%rdi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+	VZEROUPPER_RETURN
+
 
 	.p2align 4
-L(first_vec):
-	/* Check if there is a nul CHAR.  */
+L(aligned_more):
+	/* Save original pointer if match was in VEC 0.  */
+	movq	%rdi, %r8
+
+	/* Align src.  */
+	orq	$(VEC_SIZE - 1), %rdi
+	vmovdqu	1(%rdi), %ymm2
+	VPCMPEQ	%ymm2, %ymm0, %ymm6
+	vpmovmskb %ymm6, %ecx
 	testl	%ecx, %ecx
-	jnz	L(char_and_nul_in_first_vec)
+	jnz	L(first_vec_x1)
 
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
-	movq	%rdi, %rsi
-	andq	$-VEC_SIZE, %rdi
-	jmp	L(aligned_loop)
+	vmovdqu	(VEC_SIZE + 1)(%rdi), %ymm3
+	VPCMPEQ	%ymm3, %ymm0, %ymm6
+	vpmovmskb %ymm6, %ecx
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x2)
 
+	/* Save pointer again before realigning.  */
+	movq	%rdi, %rsi
+	addq	$(VEC_SIZE + 1), %rdi
+	andq	$-(VEC_SIZE * 2), %rdi
 	.p2align 4
-L(cros_page_boundary):
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %edx
-	vpmovmskb %ymm3, %eax
-	shrl	%cl, %edx
-	shrl	%cl, %eax
-	addq	$VEC_SIZE, %rdi
-
-	/* Check if there is a CHAR.  */
+L(first_aligned_loop):
+	/* Do 2x VEC at a time. Any more and the cost of finding the
+	   match outweighs the loop benefit.  */
+	vmovdqa	(VEC_SIZE * 0)(%rdi), %ymm4
+	vmovdqa	(VEC_SIZE * 1)(%rdi), %ymm5
+
+	VPCMPEQ	%ymm4, %ymm7, %ymm6
+	VPMIN	%ymm4, %ymm5, %ymm8
+	VPCMPEQ	%ymm5, %ymm7, %ymm10
+	vpor	%ymm6, %ymm10, %ymm5
+	VPCMPEQ	%ymm8, %ymm0, %ymm8
+	vpor	%ymm5, %ymm8, %ymm9
+
+	vpmovmskb %ymm9, %eax
+	addq	$(VEC_SIZE * 2), %rdi
+	/* No zero or search CHAR.  */
 	testl	%eax, %eax
-	jnz	L(found_char)
-
-	testl	%edx, %edx
-	jnz	L(return_null)
+	jz	L(first_aligned_loop)
 
-	jmp	L(aligned_loop)
-
-	.p2align 4
-L(found_char):
-	testl	%edx, %edx
-	jnz	L(char_and_nul)
+	/* If no zero CHAR then go to second loop (this allows us to
+	   throw away all prior work).  */
+	vpmovmskb %ymm8, %ecx
+	testl	%ecx, %ecx
+	jz	L(second_aligned_loop_prep)
 
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
-	leaq	(%rdi, %rcx), %rsi
+	/* Search char could be zero so we need to get the true match.
+	 */
+	vpmovmskb %ymm5, %eax
+	testl	%eax, %eax
+	jnz	L(first_aligned_loop_return)
 
-	.p2align 4
-L(aligned_loop):
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	addq	$VEC_SIZE, %rdi
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
-	vpmovmskb %ymm3, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
-
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	add	$VEC_SIZE, %rdi
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
+	.p2align 4,, 4
+L(first_vec_x1_or_x2):
+	VPCMPEQ	%ymm3, %ymm7, %ymm3
+	VPCMPEQ	%ymm2, %ymm7, %ymm2
 	vpmovmskb %ymm3, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
-
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	addq	$VEC_SIZE, %rdi
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
-	vpmovmskb %ymm3, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
-
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	addq	$VEC_SIZE, %rdi
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
-	vpmovmskb %ymm3, %eax
-	orl	%eax, %ecx
-	jz	L(aligned_loop)
-
-	.p2align 4
-L(char_nor_null):
-	/* Find a CHAR or a nul CHAR in a loop.  */
-	testl	%eax, %eax
-	jnz	L(match)
-L(return_value):
-	testl	%edx, %edx
-	jz	L(return_null)
-	movl	%edx, %eax
-	movq	%rsi, %rdi
+	vpmovmskb %ymm2, %edx
+	/* Use add for macro-fusion.  */
+	addq	%rax, %rdx
+	jz	L(first_vec_x0_test)
+	/* NB: We could move this shift to before the branch and save a
+	   bit of code size / performance on the fall through. The
+	   branch leads to the null case which generally seems hotter
+	   than char in first 3x VEC.  */
+	salq	$32, %rax
+	addq	%rdx, %rax
+	bsrq	%rax, %rax
+	leaq	1(%rsi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+	VZEROUPPER_RETURN
 
+	.p2align 4,, 8
+L(first_aligned_loop_return):
+	VPCMPEQ	%ymm4, %ymm0, %ymm4
+	vpmovmskb %ymm4, %edx
+	salq	$32, %rcx
+	orq	%rdx, %rcx
+
+	vpmovmskb %ymm10, %eax
+	vpmovmskb %ymm6, %edx
+	salq	$32, %rax
+	orq	%rdx, %rax
+	blsmskq	%rcx, %rcx
+	andq	%rcx, %rax
+	jz	L(first_vec_x1_or_x2)
+
+	bsrq	%rax, %rax
+	leaq	-(VEC_SIZE * 2)(%rdi, %rax), %rax
 # ifdef USE_AS_WCSRCHR
-	/* Keep the first bit for each matching CHAR for bsr.  */
-	andl	$0x11111111, %eax
+	andq	$-CHAR_SIZE, %rax
 # endif
-	bsrl	%eax, %eax
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
-L(return_vzeroupper):
-	ZERO_UPPER_VEC_REGISTERS_RETURN
+	VZEROUPPER_RETURN
 
+	/* Search char cannot be zero.  */
 	.p2align 4
-L(match):
-	/* Find a CHAR.  Check if there is a nul CHAR.  */
-	vpmovmskb %ymm2, %ecx
-	testl	%ecx, %ecx
-	jnz	L(find_nul)
-
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
+L(second_aligned_loop_set_furthest_match):
+	/* Save VEC and pointer from most recent match.  */
+L(second_aligned_loop_prep):
 	movq	%rdi, %rsi
-	jmp	L(aligned_loop)
+	vmovdqu	%ymm6, %ymm2
+	vmovdqu	%ymm10, %ymm3
 
 	.p2align 4
-L(find_nul):
-# ifdef USE_AS_WCSRCHR
-	/* Keep the first bit for each matching CHAR for bsr.  */
-	andl	$0x11111111, %ecx
-	andl	$0x11111111, %eax
-# endif
-	/* Mask out any matching bits after the nul CHAR.  */
-	movl	%ecx, %r8d
-	subl	$1, %r8d
-	xorl	%ecx, %r8d
-	andl	%r8d, %eax
+L(second_aligned_loop):
+	/* Search 2x VEC at a time.  */
+	vmovdqa	(VEC_SIZE * 0)(%rdi), %ymm4
+	vmovdqa	(VEC_SIZE * 1)(%rdi), %ymm5
+
+	VPCMPEQ	%ymm4, %ymm7, %ymm6
+	VPMIN	%ymm4, %ymm5, %ymm1
+	VPCMPEQ	%ymm5, %ymm7, %ymm10
+	vpor	%ymm6, %ymm10, %ymm5
+	VPCMPEQ	%ymm1, %ymm0, %ymm1
+	vpor	%ymm5, %ymm1, %ymm9
+
+	vpmovmskb %ymm9, %eax
+	addq	$(VEC_SIZE * 2), %rdi
 	testl	%eax, %eax
-	/* If there is no CHAR here, return the remembered one.  */
-	jz	L(return_value)
-	bsrl	%eax, %eax
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(char_and_nul):
-	/* Find both a CHAR and a nul CHAR.  */
-	addq	%rcx, %rdi
-	movl	%edx, %ecx
-L(char_and_nul_in_first_vec):
-# ifdef USE_AS_WCSRCHR
-	/* Keep the first bit for each matching CHAR for bsr.  */
-	andl	$0x11111111, %ecx
-	andl	$0x11111111, %eax
-# endif
-	/* Mask out any matching bits after the nul CHAR.  */
-	movl	%ecx, %r8d
-	subl	$1, %r8d
-	xorl	%ecx, %r8d
-	andl	%r8d, %eax
+	jz	L(second_aligned_loop)
+	vpmovmskb %ymm1, %ecx
+	testl	%ecx, %ecx
+	jz	L(second_aligned_loop_set_furthest_match)
+	vpmovmskb %ymm5, %eax
 	testl	%eax, %eax
-	/* Return null pointer if the nul CHAR comes first.  */
-	jz	L(return_null)
-	bsrl	%eax, %eax
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
+	jnz	L(return_new_match)
+
+	/* This is the hot path. We know CHAR is inbounds and that
+	   ymm3/ymm2 have the latest match.  */
+	.p2align 4,, 4
+L(return_old_match):
+	vpmovmskb %ymm3, %eax
+	vpmovmskb %ymm2, %edx
+	salq	$32, %rax
+	orq	%rdx, %rax
+	bsrq	%rax, %rax
+	/* Search char cannot be zero so safe to just use lea for
+	   wcsrchr.  */
+	leaq	(VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rsi, %rax), %rax
 	VZEROUPPER_RETURN
 
-	.p2align 4
-L(return_null):
-	xorl	%eax, %eax
+	/* Last iteration also potentially has a match.  */
+	.p2align 4,, 8
+L(return_new_match):
+	VPCMPEQ	%ymm4, %ymm0, %ymm4
+	vpmovmskb %ymm4, %edx
+	salq	$32, %rcx
+	orq	%rdx, %rcx
+
+	vpmovmskb %ymm10, %eax
+	vpmovmskb %ymm6, %edx
+	salq	$32, %rax
+	orq	%rdx, %rax
+	blsmskq	%rcx, %rcx
+	andq	%rcx, %rax
+	jz	L(return_old_match)
+	bsrq	%rax, %rax
+	/* Search char cannot be zero so safe to just use lea for
+	   wcsrchr.  */
+	leaq	(VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rdi, %rax), %rax
 	VZEROUPPER_RETURN
 
-END (STRRCHR)
+	.p2align 4,, 4
+L(cross_page):
+	movq	%rdi, %rsi
+	andq	$-VEC_SIZE, %rsi
+	vmovdqu	(%rsi), %ymm1
+	VPCMPEQ	%ymm1, %ymm0, %ymm6
+	vpmovmskb %ymm6, %ecx
+	/* Shift out zero CHAR matches that are before the beginning of
+	   src (rdi).  */
+	shrxl	%edi, %ecx, %ecx
+	testl	%ecx, %ecx
+	jz	L(page_cross_continue)
+	VPCMPEQ	%ymm1, %ymm7, %ymm1
+	vpmovmskb %ymm1, %eax
+
+	/* Shift out search CHAR matches that are before the beginning of
+	   src (rdi).  */
+	shrxl	%edi, %eax, %eax
+	blsmskl	%ecx, %ecx
+	/* Check if any search CHAR match is in range.  */
+	andl	%ecx, %eax
+	jz	L(ret2)
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+L(ret2):
+	VZEROUPPER_RETURN
+END(STRRCHR)
 #endif
-- 
GitLab
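Before the EVEX variant, one last editorial sketch (not glibc code) of the control flow all three rewrites share: scan blockwise, remember only the base and match mask of the most recent block containing a CHAR match, and defer locating the exact byte until the terminator appears. Written in C under simplifying assumptions (the loads are presumed non-faulting, which the real code guarantees by aligning and special-casing page crossings); all names are illustrative:

#include <emmintrin.h>
#include <stddef.h>

static char *
strrchr_sketch (const char *s, int c_in)
{
  __m128i c = _mm_set1_epi8 ((char) c_in);
  const char *last_base = NULL;	/* base of last block with a match */
  unsigned int last_mask = 0;	/* CHAR-match bits in that block */
  for (;; s += 16)
    {
      __m128i v = _mm_loadu_si128 ((const __m128i *) s);
      unsigned int zmask
	= _mm_movemask_epi8 (_mm_cmpeq_epi8 (v, _mm_setzero_si128 ()));
      unsigned int cmask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (v, c));
      if (zmask == 0)
	{
	  /* Hot path: no terminator yet, so bookkeeping only.  */
	  if (cmask != 0)
	    {
	      last_base = s;
	      last_mask = cmask;
	    }
	  continue;
	}
      /* Terminator found: drop matches past it (the blsmsk idiom),
	 take the highest surviving bit, else fall back to the
	 remembered block.  */
      cmask &= zmask ^ (zmask - 1);
      if (cmask != 0)
	return (char *) (s + 31 - __builtin_clz (cmask));
      if (last_base != NULL)
	return (char *) (last_base + 31 - __builtin_clz (last_mask));
      return NULL;
    }
}

The assembly keeps this bookkeeping in registers (%rsi plus the saved match vectors) and, once a first match is known, restarts a tighter second loop, but the return logic is the same; note that strrchr (s, '\0') correctly returns the terminator because blsmsk keeps the bit of the first NUL itself.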
SOURCES/glibc-RHEL-15696-105.patch (new file, 558 lines)
@@ -0,0 +1,558 @@
|  | From c966099cdc3e0fdf92f63eac09b22fa7e5f5f02d Mon Sep 17 00:00:00 2001 | ||||||
|  | From: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  | Date: Thu, 21 Apr 2022 20:52:30 -0500 | ||||||
|  | Subject: [PATCH] x86: Optimize {str|wcs}rchr-evex | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | The new code unrolls the main loop slightly without adding too much | ||||||
|  | overhead and minimizes the comparisons for the search CHAR. | ||||||
|  | 
 | ||||||
|  | Geometric Mean of all benchmarks New / Old: 0.755 | ||||||
|  | See email for all results. | ||||||
|  | 
 | ||||||
|  | Full xcheck passes on x86_64 with and without multiarch enabled. | ||||||
|  | Reviewed-by: H.J. Lu <hjl.tools@gmail.com> | ||||||
|  | ---
 | ||||||
|  |  sysdeps/x86_64/multiarch/strrchr-evex.S | 471 +++++++++++++++--------- | ||||||
|  |  1 file changed, 290 insertions(+), 181 deletions(-) | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S
 | ||||||
|  | index f920b5a5..f5b6d755 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/strrchr-evex.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/strrchr-evex.S
 | ||||||
|  | @@ -24,242 +24,351 @@
 | ||||||
|  |  #  define STRRCHR	__strrchr_evex | ||||||
|  |  # endif | ||||||
|  |   | ||||||
|  | -# define VMOVU		vmovdqu64
 | ||||||
|  | -# define VMOVA		vmovdqa64
 | ||||||
|  | +# define VMOVU	vmovdqu64
 | ||||||
|  | +# define VMOVA	vmovdqa64
 | ||||||
|  |   | ||||||
|  |  # ifdef USE_AS_WCSRCHR | ||||||
|  | +#  define SHIFT_REG	esi
 | ||||||
|  | +
 | ||||||
|  | +#  define kunpck	kunpckbw
 | ||||||
|  | +#  define kmov_2x	kmovd
 | ||||||
|  | +#  define maskz_2x	ecx
 | ||||||
|  | +#  define maskm_2x	eax
 | ||||||
|  | +#  define CHAR_SIZE	4
 | ||||||
|  | +#  define VPMIN	vpminud
 | ||||||
|  | +#  define VPTESTN	vptestnmd
 | ||||||
|  |  #  define VPBROADCAST	vpbroadcastd | ||||||
|  | -#  define VPCMP		vpcmpd
 | ||||||
|  | -#  define SHIFT_REG	r8d
 | ||||||
|  | +#  define VPCMP	vpcmpd
 | ||||||
|  |  # else | ||||||
|  | +#  define SHIFT_REG	edi
 | ||||||
|  | +
 | ||||||
|  | +#  define kunpck	kunpckdq
 | ||||||
|  | +#  define kmov_2x	kmovq
 | ||||||
|  | +#  define maskz_2x	rcx
 | ||||||
|  | +#  define maskm_2x	rax
 | ||||||
|  | +
 | ||||||
|  | +#  define CHAR_SIZE	1
 | ||||||
|  | +#  define VPMIN	vpminub
 | ||||||
|  | +#  define VPTESTN	vptestnmb
 | ||||||
|  |  #  define VPBROADCAST	vpbroadcastb | ||||||
|  | -#  define VPCMP		vpcmpb
 | ||||||
|  | -#  define SHIFT_REG	ecx
 | ||||||
|  | +#  define VPCMP	vpcmpb
 | ||||||
|  |  # endif | ||||||
|  |   | ||||||
|  |  # define XMMZERO	xmm16 | ||||||
|  |  # define YMMZERO	ymm16 | ||||||
|  |  # define YMMMATCH	ymm17 | ||||||
|  | -# define YMM1		ymm18
 | ||||||
|  | +# define YMMSAVE	ymm18
 | ||||||
|  | +
 | ||||||
|  | +# define YMM1	ymm19
 | ||||||
|  | +# define YMM2	ymm20
 | ||||||
|  | +# define YMM3	ymm21
 | ||||||
|  | +# define YMM4	ymm22
 | ||||||
|  | +# define YMM5	ymm23
 | ||||||
|  | +# define YMM6	ymm24
 | ||||||
|  | +# define YMM7	ymm25
 | ||||||
|  | +# define YMM8	ymm26
 | ||||||
|  |   | ||||||
|  | -# define VEC_SIZE	32
 | ||||||
|  |   | ||||||
|  | -	.section .text.evex,"ax",@progbits
 | ||||||
|  | -ENTRY (STRRCHR)
 | ||||||
|  | -	movl	%edi, %ecx
 | ||||||
|  | +# define VEC_SIZE	32
 | ||||||
|  | +# define PAGE_SIZE	4096
 | ||||||
|  | +	.section .text.evex, "ax", @progbits
 | ||||||
|  | +ENTRY(STRRCHR)
 | ||||||
|  | +	movl	%edi, %eax
 | ||||||
|  |  	/* Broadcast CHAR to YMMMATCH.  */ | ||||||
|  |  	VPBROADCAST %esi, %YMMMATCH | ||||||
|  |   | ||||||
|  | -	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
 | ||||||
|  | -
 | ||||||
|  | -	/* Check if we may cross page boundary with one vector load.  */
 | ||||||
|  | -	andl	$(2 * VEC_SIZE - 1), %ecx
 | ||||||
|  | -	cmpl	$VEC_SIZE, %ecx
 | ||||||
|  | -	ja	L(cros_page_boundary)
 | ||||||
|  | +	andl	$(PAGE_SIZE - 1), %eax
 | ||||||
|  | +	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 | ||||||
|  | +	jg	L(cross_page_boundary)
 | ||||||
|  |   | ||||||
|  | +L(page_cross_continue):
 | ||||||
|  |  	VMOVU	(%rdi), %YMM1 | ||||||
|  | -
 | ||||||
|  | -	/* Each bit in K0 represents a null byte in YMM1.  */
 | ||||||
|  | -	VPCMP	$0, %YMMZERO, %YMM1, %k0
 | ||||||
|  | -	/* Each bit in K1 represents a CHAR in YMM1.  */
 | ||||||
|  | -	VPCMP	$0, %YMMMATCH, %YMM1, %k1
 | ||||||
|  | +	/* k0 has a 1 for each zero CHAR in YMM1.  */
 | ||||||
|  | +	VPTESTN	%YMM1, %YMM1, %k0
 | ||||||
|  |  	kmovd	%k0, %ecx | ||||||
|  | -	kmovd	%k1, %eax
 | ||||||
|  | -
 | ||||||
|  | -	addq	$VEC_SIZE, %rdi
 | ||||||
|  | -
 | ||||||
|  | -	testl	%eax, %eax
 | ||||||
|  | -	jnz	L(first_vec)
 | ||||||
|  | -
 | ||||||
|  |  	testl	%ecx, %ecx | ||||||
|  | -	jnz	L(return_null)
 | ||||||
|  | -
 | ||||||
|  | -	andq	$-VEC_SIZE, %rdi
 | ||||||
|  | -	xorl	%edx, %edx
 | ||||||
|  | -	jmp	L(aligned_loop)
 | ||||||
|  | -
 | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(first_vec):
 | ||||||
|  | -	/* Check if there is a null byte.  */
 | ||||||
|  | -	testl	%ecx, %ecx
 | ||||||
|  | -	jnz	L(char_and_nul_in_first_vec)
 | ||||||
|  | -
 | ||||||
|  | -	/* Remember the match and keep searching.  */
 | ||||||
|  | -	movl	%eax, %edx
 | ||||||
|  | -	movq	%rdi, %rsi
 | ||||||
|  | -	andq	$-VEC_SIZE, %rdi
 | ||||||
|  | -	jmp	L(aligned_loop)
 | ||||||
|  | -
 | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(cros_page_boundary):
 | ||||||
|  | -	andl	$(VEC_SIZE - 1), %ecx
 | ||||||
|  | -	andq	$-VEC_SIZE, %rdi
 | ||||||
|  | +	jz	L(aligned_more)
 | ||||||
|  | +	/* fallthrough: zero CHAR in first VEC.  */
 | ||||||
|  |   | ||||||
|  | +	/* K1 has a 1 for each search CHAR match in YMM1.  */
 | ||||||
|  | +	VPCMP	$0, %YMMMATCH, %YMM1, %k1
 | ||||||
|  | +	kmovd	%k1, %eax
 | ||||||
|  | +	/* Build mask up until first zero CHAR (used to mask of
 | ||||||
|  | +	   potential search CHAR matches past the end of the string).
 | ||||||
|  | +	 */
 | ||||||
|  | +	blsmskl	%ecx, %ecx
 | ||||||
|  | +	andl	%ecx, %eax
 | ||||||
|  | +	jz	L(ret0)
 | ||||||
|  | +	/* Get last match (the `andl` removed any out of bounds
 | ||||||
|  | +	   matches).  */
 | ||||||
|  | +	bsrl	%eax, %eax
 | ||||||
|  |  # ifdef USE_AS_WCSRCHR | ||||||
|  | -	/* NB: Divide shift count by 4 since each bit in K1 represent 4
 | ||||||
|  | -	   bytes.  */
 | ||||||
|  | -	movl	%ecx, %SHIFT_REG
 | ||||||
|  | -	sarl	$2, %SHIFT_REG
 | ||||||
|  | +	leaq	(%rdi, %rax, CHAR_SIZE), %rax
 | ||||||
|  | +# else
 | ||||||
|  | +	addq	%rdi, %rax
 | ||||||
|  |  # endif | ||||||
|  | +L(ret0):
 | ||||||
|  | +	ret
 | ||||||
|  |   | ||||||
|  | -	VMOVA	(%rdi), %YMM1
 | ||||||
|  | -
 | ||||||
|  | -	/* Each bit in K0 represents a null byte in YMM1.  */
 | ||||||
|  | -	VPCMP	$0, %YMMZERO, %YMM1, %k0
 | ||||||
|  | -	/* Each bit in K1 represents a CHAR in YMM1.  */
 | ||||||
|  | +	/* Returns for first vec x1/x2/x3 have hard coded backward
 | ||||||
|  | +	   search path for earlier matches.  */
 | ||||||
|  | +	.p2align 4,, 6
 | ||||||
|  | +L(first_vec_x1):
 | ||||||
|  | +	VPCMP	$0, %YMMMATCH, %YMM2, %k1
 | ||||||
|  | +	kmovd	%k1, %eax
 | ||||||
|  | +	blsmskl	%ecx, %ecx
 | ||||||
|  | +	/* eax non-zero if search CHAR in range.  */
 | ||||||
|  | +	andl	%ecx, %eax
 | ||||||
|  | +	jnz	L(first_vec_x1_return)
 | ||||||
|  | +
 | ||||||
|  | +	/* fallthrough: no match in YMM2 then need to check for earlier
 | ||||||
|  | +	   matches (in YMM1).  */
 | ||||||
|  | +	.p2align 4,, 4
 | ||||||
|  | +L(first_vec_x0_test):
 | ||||||
|  |  	VPCMP	$0, %YMMMATCH, %YMM1, %k1 | ||||||
|  | -	kmovd	%k0, %edx
 | ||||||
|  |  	kmovd	%k1, %eax | ||||||
|  | -
 | ||||||
|  | -	shrxl	%SHIFT_REG, %edx, %edx
 | ||||||
|  | -	shrxl	%SHIFT_REG, %eax, %eax
 | ||||||
|  | -	addq	$VEC_SIZE, %rdi
 | ||||||
|  | -
 | ||||||
|  | -	/* Check if there is a CHAR.  */
 | ||||||
|  |  	testl	%eax, %eax | ||||||
|  | -	jnz	L(found_char)
 | ||||||
|  | -
 | ||||||
|  | -	testl	%edx, %edx
 | ||||||
|  | -	jnz	L(return_null)
 | ||||||
|  | -
 | ||||||
|  | -	jmp	L(aligned_loop)
 | ||||||
|  | -
 | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(found_char):
 | ||||||
|  | -	testl	%edx, %edx
 | ||||||
|  | -	jnz	L(char_and_nul)
 | ||||||
|  | -
 | ||||||
|  | -	/* Remember the match and keep searching.  */
 | ||||||
|  | -	movl	%eax, %edx
 | ||||||
|  | -	leaq	(%rdi, %rcx), %rsi
 | ||||||
|  | +	jz	L(ret1)
 | ||||||
|  | +	bsrl	%eax, %eax
 | ||||||
|  | +# ifdef USE_AS_WCSRCHR
 | ||||||
|  | +	leaq	(%rsi, %rax, CHAR_SIZE), %rax
 | ||||||
|  | +# else
 | ||||||
|  | +	addq	%rsi, %rax
 | ||||||
|  | +# endif
 | ||||||
|  | +L(ret1):
 | ||||||
|  | +	ret
 | ||||||
|  |   | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(aligned_loop):
 | ||||||
|  | -	VMOVA	(%rdi), %YMM1
 | ||||||
|  | -	addq	$VEC_SIZE, %rdi
 | ||||||
|  | +	.p2align 4,, 10
 | ||||||
|  | +L(first_vec_x1_or_x2):
 | ||||||
|  | +	VPCMP	$0, %YMM3, %YMMMATCH, %k3
 | ||||||
|  | +	VPCMP	$0, %YMM2, %YMMMATCH, %k2
 | ||||||
|  | +	/* K2 and K3 have 1 for any search CHAR match. Test if any
 | ||||||
|  | +	   matches between either of them. Otherwise check YMM1.  */
 | ||||||
|  | +	kortestd %k2, %k3
 | ||||||
|  | +	jz	L(first_vec_x0_test)
 | ||||||
|  | +
 | ||||||
|  | +	/* Guranteed that YMM2 and YMM3 are within range so merge the
 | ||||||
|  | +	   two bitmasks then get last result.  */
 | ||||||
|  | +	kunpck	%k2, %k3, %k3
 | ||||||
|  | +	kmovq	%k3, %rax
 | ||||||
|  | +	bsrq	%rax, %rax
 | ||||||
|  | +	leaq	(VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax
 | ||||||
|  | +	ret
 | ||||||
|  |   | ||||||
|  | -	/* Each bit in K0 represents a null byte in YMM1.  */
 | ||||||
|  | -	VPCMP	$0, %YMMZERO, %YMM1, %k0
 | ||||||
|  | -	/* Each bit in K1 represents a CHAR in YMM1.  */
 | ||||||
|  | -	VPCMP	$0, %YMMMATCH, %YMM1, %k1
 | ||||||
|  | -	kmovd	%k0, %ecx
 | ||||||
|  | +	.p2align 4,, 6
 | ||||||
|  | +L(first_vec_x3):
 | ||||||
|  | +	VPCMP	$0, %YMMMATCH, %YMM4, %k1
 | ||||||
|  |  	kmovd	%k1, %eax | ||||||
|  | -	orl	%eax, %ecx
 | ||||||
|  | -	jnz	L(char_nor_null)
 | ||||||
|  | +	blsmskl	%ecx, %ecx
 | ||||||
|  | +	/* If no search CHAR match in range check YMM1/YMM2/YMM3.  */
 | ||||||
|  | +	andl	%ecx, %eax
 | ||||||
|  | +	jz	L(first_vec_x1_or_x2)
 | ||||||
|  | +	bsrl	%eax, %eax
 | ||||||
|  | +	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
 | ||||||
|  | +	ret
 | ||||||
|  |   | ||||||
|  | -	VMOVA	(%rdi), %YMM1
 | ||||||
|  | -	add	$VEC_SIZE, %rdi
 | ||||||
|  | +	.p2align 4,, 6
 | ||||||
|  | +L(first_vec_x0_x1_test):
 | ||||||
|  | +	VPCMP	$0, %YMMMATCH, %YMM2, %k1
 | ||||||
|  | +	kmovd	%k1, %eax
 | ||||||
|  | +	/* Check YMM2 for last match first. If no match try YMM1.  */
 | ||||||
|  | +	testl	%eax, %eax
 | ||||||
|  | +	jz	L(first_vec_x0_test)
 | ||||||
|  | +	.p2align 4,, 4
 | ||||||
|  | +L(first_vec_x1_return):
 | ||||||
|  | +	bsrl	%eax, %eax
 | ||||||
|  | +	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
 | ||||||
|  | +	ret
 | ||||||
|  |   | ||||||
|  | -	/* Each bit in K0 represents a null byte in YMM1.  */
 | ||||||
|  | -	VPCMP	$0, %YMMZERO, %YMM1, %k0
 | ||||||
|  | -	/* Each bit in K1 represents a CHAR in YMM1.  */
 | ||||||
|  | -	VPCMP	$0, %YMMMATCH, %YMM1, %k1
 | ||||||
|  | -	kmovd	%k0, %ecx
 | ||||||
|  | +	.p2align 4,, 10
 | ||||||
|  | +L(first_vec_x2):
 | ||||||
|  | +	VPCMP	$0, %YMMMATCH, %YMM3, %k1
 | ||||||
|  |  	kmovd	%k1, %eax | ||||||
|  | -	orl	%eax, %ecx
 | ||||||
|  | -	jnz	L(char_nor_null)
 | ||||||
|  | +	blsmskl	%ecx, %ecx
 | ||||||
|  | +	/* Check YMM3 for last match first. If no match try YMM2/YMM1.
 | ||||||
|  | +	 */
 | ||||||
|  | +	andl	%ecx, %eax
 | ||||||
|  | +	jz	L(first_vec_x0_x1_test)
 | ||||||
|  | +	bsrl	%eax, %eax
 | ||||||
|  | +	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 | ||||||
|  | +	ret
 | ||||||
|  |   | ||||||
|  | -	VMOVA	(%rdi), %YMM1
 | ||||||
|  | -	addq	$VEC_SIZE, %rdi
 | ||||||
|  |   | ||||||
|  | -	/* Each bit in K0 represents a null byte in YMM1.  */
 | ||||||
|  | -	VPCMP	$0, %YMMZERO, %YMM1, %k0
 | ||||||
|  | -	/* Each bit in K1 represents a CHAR in YMM1.  */
 | ||||||
|  | -	VPCMP	$0, %YMMMATCH, %YMM1, %k1
 | ||||||
|  | +	.p2align 4
 | ||||||
|  | +L(aligned_more):
 | ||||||
|  | +	/* Need to keep original pointer incase YMM1 has last match.  */
 | ||||||
|  | +	movq	%rdi, %rsi
 | ||||||
|  | +	andq	$-VEC_SIZE, %rdi
 | ||||||
|  | +	VMOVU	VEC_SIZE(%rdi), %YMM2
 | ||||||
|  | +	VPTESTN	%YMM2, %YMM2, %k0
 | ||||||
|  |  	kmovd	%k0, %ecx | ||||||
|  | -	kmovd	%k1, %eax
 | ||||||
|  | -	orl	%eax, %ecx
 | ||||||
|  | -	jnz	L(char_nor_null)
 | ||||||
|  | +	testl	%ecx, %ecx
 | ||||||
|  | +	jnz	L(first_vec_x1)
 | ||||||
|  |   | ||||||
|  | -	VMOVA	(%rdi), %YMM1
 | ||||||
|  | -	addq	$VEC_SIZE, %rdi
 | ||||||
|  | +	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM3
 | ||||||
|  | +	VPTESTN	%YMM3, %YMM3, %k0
 | ||||||
|  | +	kmovd	%k0, %ecx
 | ||||||
|  | +	testl	%ecx, %ecx
 | ||||||
|  | +	jnz	L(first_vec_x2)
 | ||||||
|  |   | ||||||
|  | -	/* Each bit in K0 represents a null byte in YMM1.  */
 | ||||||
|  | -	VPCMP	$0, %YMMZERO, %YMM1, %k0
 | ||||||
|  | -	/* Each bit in K1 represents a CHAR in YMM1.  */
 | ||||||
|  | -	VPCMP	$0, %YMMMATCH, %YMM1, %k1
 | ||||||
|  | +	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM4
 | ||||||
|  | +	VPTESTN	%YMM4, %YMM4, %k0
 | ||||||
|  |  	kmovd	%k0, %ecx | ||||||
|  | -	kmovd	%k1, %eax
 | ||||||
|  | -	orl	%eax, %ecx
 | ||||||
|  | -	jz	L(aligned_loop)
 | ||||||
|  | +	movq	%rdi, %r8
 | ||||||
|  | +	testl	%ecx, %ecx
 | ||||||
|  | +	jnz	L(first_vec_x3)
 | ||||||
|  |   | ||||||
|  | +	andq	$-(VEC_SIZE * 2), %rdi
 | ||||||
|  |  	.p2align 4 | ||||||
|  | -L(char_nor_null):
 | ||||||
|  | -	/* Find a CHAR or a null byte in a loop.  */
 | ||||||
|  | +L(first_aligned_loop):
 | ||||||
|  | +	/* Preserve YMM1, YMM2, YMM3, and YMM4 until we can gurantee
 | ||||||
|  | +	   they don't store a match.  */
 | ||||||
|  | +	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM5
 | ||||||
|  | +	VMOVA	(VEC_SIZE * 5)(%rdi), %YMM6
 | ||||||
|  | +
 | ||||||
|  | +	VPCMP	$0, %YMM5, %YMMMATCH, %k2
 | ||||||
|  | +	vpxord	%YMM6, %YMMMATCH, %YMM7
 | ||||||
|  | +
 | ||||||
|  | +	VPMIN	%YMM5, %YMM6, %YMM8
 | ||||||
|  | +	VPMIN	%YMM8, %YMM7, %YMM7
 | ||||||
|  | +
 | ||||||
|  | +	VPTESTN	%YMM7, %YMM7, %k1
 | ||||||
|  | +	subq	$(VEC_SIZE * -2), %rdi
 | ||||||
|  | +	kortestd %k1, %k2
 | ||||||
|  | +	jz	L(first_aligned_loop)
 | ||||||
|  | +
 | ||||||
|  | +	VPCMP	$0, %YMM6, %YMMMATCH, %k3
 | ||||||
|  | +	VPTESTN	%YMM8, %YMM8, %k1
 | ||||||
|  | +	ktestd	%k1, %k1
 | ||||||
|  | +	jz	L(second_aligned_loop_prep)
 | ||||||
|  | +
 | ||||||
|  | +	kortestd %k2, %k3
 | ||||||
|  | +	jnz	L(return_first_aligned_loop)
 | ||||||
|  | +
 | ||||||
|  | +	.p2align 4,, 6
 | ||||||
|  | +L(first_vec_x1_or_x2_or_x3):
 | ||||||
|  | +	VPCMP	$0, %YMM4, %YMMMATCH, %k4
 | ||||||
|  | +	kmovd	%k4, %eax
 | ||||||
|  |  	testl	%eax, %eax | ||||||
|  | -	jnz	L(match)
 | ||||||
|  | -L(return_value):
 | ||||||
|  | -	testl	%edx, %edx
 | ||||||
|  | -	jz	L(return_null)
 | ||||||
|  | -	movl	%edx, %eax
 | ||||||
|  | -	movq	%rsi, %rdi
 | ||||||
|  | +	jz	L(first_vec_x1_or_x2)
 | ||||||
|  |  	bsrl	%eax, %eax | ||||||
|  | -# ifdef USE_AS_WCSRCHR
 | ||||||
|  | -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
 | ||||||
|  | -	leaq	-VEC_SIZE(%rdi, %rax, 4), %rax
 | ||||||
|  | -# else
 | ||||||
|  | -	leaq	-VEC_SIZE(%rdi, %rax), %rax
 | ||||||
|  | -# endif
 | ||||||
|  | +	leaq	(VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax
 | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(match):
 | ||||||
|  | -	/* Find a CHAR.  Check if there is a null byte.  */
 | ||||||
|  | -	kmovd	%k0, %ecx
 | ||||||
|  | -	testl	%ecx, %ecx
 | ||||||
|  | -	jnz	L(find_nul)
 | ||||||
|  | +	.p2align 4,, 8
 | ||||||
|  | +L(return_first_aligned_loop):
 | ||||||
|  | +	VPTESTN	%YMM5, %YMM5, %k0
 | ||||||
|  | +	kunpck	%k0, %k1, %k0
 | ||||||
|  | +	kmov_2x	%k0, %maskz_2x
 | ||||||
|  | +
 | ||||||
|  | +	blsmsk	%maskz_2x, %maskz_2x
 | ||||||
|  | +	kunpck	%k2, %k3, %k3
 | ||||||
|  | +	kmov_2x	%k3, %maskm_2x
 | ||||||
|  | +	and	%maskz_2x, %maskm_2x
 | ||||||
|  | +	jz	L(first_vec_x1_or_x2_or_x3)
 | ||||||
|  |   | ||||||
|  | -	/* Remember the match and keep searching.  */
 | ||||||
|  | -	movl	%eax, %edx
 | ||||||
|  | +	bsr	%maskm_2x, %maskm_2x
 | ||||||
|  | +	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 | ||||||
|  | +	ret
 | ||||||
|  | +
 | ||||||
|  | +	.p2align 4
 | ||||||
|  | +	/* We can throw away the work done for the first 4x checks here
 | ||||||
|  | +	   as we have a later match. This is the 'fast' path persay.
 | ||||||
|  | +	 */
 | ||||||
|  | +L(second_aligned_loop_prep):
 | ||||||
|  | +L(second_aligned_loop_set_furthest_match):
 | ||||||
|  |  	movq	%rdi, %rsi | ||||||
|  | -	jmp	L(aligned_loop)
 | ||||||
|  | +	kunpck	%k2, %k3, %k4
 | ||||||
|  |   | ||||||
|  |  	.p2align 4 | ||||||
|  | -L(find_nul):
 | ||||||
|  | -	/* Mask out any matching bits after the null byte.  */
 | ||||||
|  | -	movl	%ecx, %r8d
 | ||||||
|  | -	subl	$1, %r8d
 | ||||||
|  | -	xorl	%ecx, %r8d
 | ||||||
|  | -	andl	%r8d, %eax
 | ||||||
|  | -	testl	%eax, %eax
 | ||||||
|  | -	/* If there is no CHAR here, return the remembered one.  */
 | ||||||
|  | -	jz	L(return_value)
 | ||||||
|  | -	bsrl	%eax, %eax
 | ||||||
|  | +L(second_aligned_loop):
 | ||||||
|  | +	VMOVU	(VEC_SIZE * 4)(%rdi), %YMM1
 | ||||||
|  | +	VMOVU	(VEC_SIZE * 5)(%rdi), %YMM2
 | ||||||
|  | +
 | ||||||
|  | +	VPCMP	$0, %YMM1, %YMMMATCH, %k2
 | ||||||
|  | +	vpxord	%YMM2, %YMMMATCH, %YMM3
 | ||||||
|  | +
 | ||||||
|  | +	VPMIN	%YMM1, %YMM2, %YMM4
 | ||||||
|  | +	VPMIN	%YMM3, %YMM4, %YMM3
 | ||||||
|  | +
 | ||||||
|  | +	VPTESTN	%YMM3, %YMM3, %k1
 | ||||||
|  | +	subq	$(VEC_SIZE * -2), %rdi
 | ||||||
|  | +	kortestd %k1, %k2
 | ||||||
|  | +	jz	L(second_aligned_loop)
 | ||||||
|  | +
 | ||||||
|  | +	VPCMP	$0, %YMM2, %YMMMATCH, %k3
 | ||||||
|  | +	VPTESTN	%YMM4, %YMM4, %k1
 | ||||||
|  | +	ktestd	%k1, %k1
 | ||||||
|  | +	jz	L(second_aligned_loop_set_furthest_match)
 | ||||||
|  | +
 | ||||||
|  | +	kortestd %k2, %k3
 | ||||||
|  | +	/* branch here because there is a significant advantage in terms
 | ||||||
|  | +	   of the output dependency chain in using edx.  */
 | ||||||
|  | +	jnz	L(return_new_match)
 | ||||||
|  | +L(return_old_match):
 | ||||||
|  | +	kmovq	%k4, %rax
 | ||||||
|  | +	bsrq	%rax, %rax
 | ||||||
|  | +	leaq	(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %rax
 | ||||||
|  | +	ret
 | ||||||
|  | +
 | ||||||
|  | +L(return_new_match):
 | ||||||
|  | +	VPTESTN	%YMM1, %YMM1, %k0
 | ||||||
|  | +	kunpck	%k0, %k1, %k0
 | ||||||
|  | +	kmov_2x	%k0, %maskz_2x
 | ||||||
|  | +
 | ||||||
|  | +	blsmsk	%maskz_2x, %maskz_2x
 | ||||||
|  | +	kunpck	%k2, %k3, %k3
 | ||||||
|  | +	kmov_2x	%k3, %maskm_2x
 | ||||||
|  | +	and	%maskz_2x, %maskm_2x
 | ||||||
|  | +	jz	L(return_old_match)
 | ||||||
|  | +
 | ||||||
|  | +	bsr	%maskm_2x, %maskm_2x
 | ||||||
|  | +	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 | ||||||
|  | +	ret
 | ||||||
|  | +
 | ||||||
|  | +L(cross_page_boundary):
 | ||||||
|  | +	/* eax contains all the page offset bits of src (rdi). `xor rdi,
 | ||||||
|  | +	   rax` sets a pointer with all page offset bits cleared, so an
 | ||||||
|  | +	   offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
 | ||||||
|  | +	   before page cross (guaranteed to be safe to read). Doing this
 | ||||||
|  | +	   as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
 | ||||||
|  | +	   a bit of code size.  */
 | ||||||
|  | +	xorq	%rdi, %rax
 | ||||||
|  | +	VMOVU	(PAGE_SIZE - VEC_SIZE)(%rax), %YMM1
 | ||||||
|  | +	VPTESTN	%YMM1, %YMM1, %k0
 | ||||||
|  | +	kmovd	%k0, %ecx
 | ||||||
|  | +
 | ||||||
|  | +	/* Shift out zero CHAR matches that are before the beginning of
 | ||||||
|  | +	   src (rdi).  */
 | ||||||
|  |  # ifdef USE_AS_WCSRCHR | ||||||
|  | -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
 | ||||||
|  | -	leaq	-VEC_SIZE(%rdi, %rax, 4), %rax
 | ||||||
|  | -# else
 | ||||||
|  | -	leaq	-VEC_SIZE(%rdi, %rax), %rax
 | ||||||
|  | +	movl	%edi, %esi
 | ||||||
|  | +	andl	$(VEC_SIZE - 1), %esi
 | ||||||
|  | +	shrl	$2, %esi
 | ||||||
|  |  # endif | ||||||
|  | -	ret
 | ||||||
|  | +	shrxl	%SHIFT_REG, %ecx, %ecx
 | ||||||
|  |   | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(char_and_nul):
 | ||||||
|  | -	/* Find both a CHAR and a null byte.  */
 | ||||||
|  | -	addq	%rcx, %rdi
 | ||||||
|  | -	movl	%edx, %ecx
 | ||||||
|  | -L(char_and_nul_in_first_vec):
 | ||||||
|  | -	/* Mask out any matching bits after the null byte.  */
 | ||||||
|  | -	movl	%ecx, %r8d
 | ||||||
|  | -	subl	$1, %r8d
 | ||||||
|  | -	xorl	%ecx, %r8d
 | ||||||
|  | -	andl	%r8d, %eax
 | ||||||
|  | -	testl	%eax, %eax
 | ||||||
|  | -	/* Return null pointer if the null byte comes first.  */
 | ||||||
|  | -	jz	L(return_null)
 | ||||||
|  | +	testl	%ecx, %ecx
 | ||||||
|  | +	jz	L(page_cross_continue)
 | ||||||
|  | +
 | ||||||
|  | +	/* Found zero CHAR so need to test for search CHAR.  */
 | ||||||
|  | +	VPCMP	$0, %YMMMATCH, %YMM1, %k1
 | ||||||
|  | +	kmovd	%k1, %eax
 | ||||||
|  | +	/* Shift out search CHAR matches that are before the beginning of
 | ||||||
|  | +	   src (rdi).  */
 | ||||||
|  | +	shrxl	%SHIFT_REG, %eax, %eax
 | ||||||
|  | +
 | ||||||
|  | +	/* Check if any search CHAR match in range.  */
 | ||||||
|  | +	blsmskl	%ecx, %ecx
 | ||||||
|  | +	andl	%ecx, %eax
 | ||||||
|  | +	jz	L(ret3)
 | ||||||
|  |  	bsrl	%eax, %eax | ||||||
|  |  # ifdef USE_AS_WCSRCHR | ||||||
|  | -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
 | ||||||
|  | -	leaq	-VEC_SIZE(%rdi, %rax, 4), %rax
 | ||||||
|  | +	leaq	(%rdi, %rax, CHAR_SIZE), %rax
 | ||||||
|  |  # else | ||||||
|  | -	leaq	-VEC_SIZE(%rdi, %rax), %rax
 | ||||||
|  | +	addq	%rdi, %rax
 | ||||||
|  |  # endif | ||||||
|  | +L(ret3):
 | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(return_null):
 | ||||||
|  | -	xorl	%eax, %eax
 | ||||||
|  | -	ret
 | ||||||
|  | -
 | ||||||
|  | -END (STRRCHR)
 | ||||||
|  | +END(STRRCHR)
 | ||||||
|  |  #endif | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
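The tail of this patch implements the classic strrchr strategy: remember the furthest match found so far and keep scanning, two vectors at a time, until the null terminator shows up. For orientation, a minimal scalar sketch of the same logic (illustrative only, not glibc's actual fallback implementation):

    #include <stddef.h>

    /* Scalar sketch of strrchr: remember the last occurrence of C seen
       so far and keep scanning until the null byte.  The EVEX loop
       above does the same per pair of vectors, saving candidate match
       masks (k2/k3) and discarding them whenever a later block also
       holds a match.  */
    char *
    strrchr_ref (const char *s, int c)
    {
      const char *last = NULL;
      do
        if (*s == (char) c)
          last = s;              /* Remember the match, keep going.  */
      while (*s++ != '\0');      /* Stop only after the terminator.  */
      return (char *) last;
    }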
SOURCES/glibc-RHEL-15696-106.patch | 73 lines (new file)
							| @ -0,0 +1,73 @@ | |||||||
|  | From 911c63a51c690dd1a97dfc587097277029baf00f Mon Sep 17 00:00:00 2001 | ||||||
|  | From: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  | Date: Wed, 27 Apr 2022 15:13:02 -0500 | ||||||
|  | Subject: [PATCH] sysdeps: Add 'get_fast_jitter' interface in fast-jitter.h | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | 'get_fast_jitter' is meant to be used purely for performance | ||||||
|  | purposes. In all cases where it's used, it should be acceptable to get no | ||||||
|  | randomness (see default case). An example use case is in setting | ||||||
|  | jitter for retries between threads at a lock. There is a | ||||||
|  | performance benefit to having jitter, but only if the jitter can | ||||||
|  | be generated very quickly and ultimately there is no serious issue | ||||||
|  | if no jitter is generated. | ||||||
|  | 
 | ||||||
|  | The implementation generally uses 'HP_TIMING_NOW' iff it is | ||||||
|  | inlined (avoiding any potential syscall paths). | ||||||
|  | Reviewed-by: H.J. Lu <hjl.tools@gmail.com> | ||||||
|  | ---
 | ||||||
|  |  sysdeps/generic/fast-jitter.h | 42 +++++++++++++++++++++++++++++++++++ | ||||||
|  |  1 file changed, 42 insertions(+) | ||||||
|  |  create mode 100644 sysdeps/generic/fast-jitter.h | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/generic/fast-jitter.h b/sysdeps/generic/fast-jitter.h
 | ||||||
|  | new file mode 100644 | ||||||
|  | index 00000000..4dd53e34
 | ||||||
|  | --- /dev/null
 | ||||||
|  | +++ b/sysdeps/generic/fast-jitter.h
 | ||||||
|  | @@ -0,0 +1,42 @@
 | ||||||
|  | +/* Fallback for fast jitter just return 0.
 | ||||||
|  | +   Copyright (C) 2019-2022 Free Software Foundation, Inc.
 | ||||||
|  | +   This file is part of the GNU C Library.
 | ||||||
|  | +
 | ||||||
|  | +   The GNU C Library is free software; you can redistribute it and/or
 | ||||||
|  | +   modify it under the terms of the GNU Lesser General Public
 | ||||||
|  | +   License as published by the Free Software Foundation; either
 | ||||||
|  | +   version 2.1 of the License, or (at your option) any later version.
 | ||||||
|  | +
 | ||||||
|  | +   The GNU C Library is distributed in the hope that it will be useful,
 | ||||||
|  | +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 | ||||||
|  | +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 | ||||||
|  | +   Lesser General Public License for more details.
 | ||||||
|  | +
 | ||||||
|  | +   You should have received a copy of the GNU Lesser General Public
 | ||||||
|  | +   License along with the GNU C Library; if not, see
 | ||||||
|  | +   <https://www.gnu.org/licenses/>.  */
 | ||||||
|  | +
 | ||||||
|  | +#ifndef _FAST_JITTER_H
 | ||||||
|  | +# define _FAST_JITTER_H
 | ||||||
|  | +
 | ||||||
|  | +# include <stdint.h>
 | ||||||
|  | +# include <hp-timing.h>
 | ||||||
|  | +
 | ||||||
|  | +/* Baseline: just return 0.  We could create jitter using a clock or
 | ||||||
|  | +   'random_bits' but that may imply a syscall and the goal of
 | ||||||
|  | +   'get_fast_jitter' is minimal overhead "randomness" when such
 | ||||||
|  | +   randomness helps performance.  Adding high overhead to the function
 | ||||||
|  | +   defeats the purpose.  */
 | ||||||
|  | +static inline uint32_t
 | ||||||
|  | +get_fast_jitter (void)
 | ||||||
|  | +{
 | ||||||
|  | +# if HP_TIMING_INLINE
 | ||||||
|  | +  hp_timing_t jitter;
 | ||||||
|  | +  HP_TIMING_NOW (jitter);
 | ||||||
|  | +  return (uint32_t) jitter;
 | ||||||
|  | +# else
 | ||||||
|  | +  return 0;
 | ||||||
|  | +# endif
 | ||||||
|  | +}
 | ||||||
|  | +
 | ||||||
|  | +#endif
 | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
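A usage sketch (the helper below is hypothetical; the real consumer is the mutex backoff patch that follows): callers typically mask the timestamp-derived value into a power-of-two range, so a constant 0 from the fallback simply makes the jitter term vanish.

    #include <fast-jitter.h>

    /* Hypothetical consumer: pick a spin count in [base, 2*base),
       assuming base is a power of two.  When HP_TIMING_INLINE is
       unavailable, get_fast_jitter () returns 0 and the jitter
       disappears -- which is acceptable by design.  */
    static inline int
    jittered_spin_count (int base)
    {
      return base + (int) (get_fast_jitter () & (unsigned) (base - 1));
    }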
SOURCES/glibc-RHEL-15696-107.patch | 226 lines (new file)
							| @ -0,0 +1,226 @@ | |||||||
|  | From 8162147872491bb5b48e91543b19c49a29ae6b6d Mon Sep 17 00:00:00 2001 | ||||||
|  | From: Wangyang Guo <wangyang.guo@intel.com> | ||||||
|  | Date: Fri, 6 May 2022 01:50:10 +0000 | ||||||
|  | Subject: [PATCH] nptl: Add backoff mechanism to spinlock loop | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | When multiple threads are waiting for the lock at the same time, once the | ||||||
|  | lock owner releases it, the waiters all see the lock available and try to lock, | ||||||
|  | which may cause an expensive CAS storm. | ||||||
|  | 
 | ||||||
|  | Binary exponential backoff with random jitter is introduced. As the number | ||||||
|  | of try-lock attempts increases, it is more likely that a larger number of | ||||||
|  | threads are competing for the adaptive mutex lock, so the wait time grows | ||||||
|  | exponentially. Random jitter is also added to avoid synchronized try-lock | ||||||
|  | attempts from other threads. | ||||||
|  | 
 | ||||||
|  | v2: Remove read-check before try-lock for performance. | ||||||
|  | 
 | ||||||
|  | v3: | ||||||
|  | 1. Restore read-check since it works well on some platforms. | ||||||
|  | 2. Make backoff arch dependent, and enable it for x86_64. | ||||||
|  | 3. Limit max backoff to reduce latency in large critical section. | ||||||
|  | 
 | ||||||
|  | v4: Fix strict-prototypes error in sysdeps/nptl/pthread_mutex_backoff.h | ||||||
|  | 
 | ||||||
|  | v5: Commit log updated for regression in large critical section. | ||||||
|  | 
 | ||||||
|  | Result of pthread-mutex-locks bench | ||||||
|  | 
 | ||||||
|  | Test Platform: Xeon 8280L (2 socket, 112 CPUs in total) | ||||||
|  | First Row: thread number | ||||||
|  | First Col: critical section length | ||||||
|  | Values: backoff vs upstream, time based, low is better | ||||||
|  | 
 | ||||||
|  | non-critical-length: 1 | ||||||
|  | 	1	2	4	8	16	32	64	112	140 | ||||||
|  | 0	0.99	0.58	0.52	0.49	0.43	0.44	0.46	0.52	0.54 | ||||||
|  | 1	0.98	0.43	0.56	0.50	0.44	0.45	0.50	0.56	0.57 | ||||||
|  | 2	0.99	0.41	0.57	0.51	0.45	0.47	0.48	0.60	0.61 | ||||||
|  | 4	0.99	0.45	0.59	0.53	0.48	0.49	0.52	0.64	0.65 | ||||||
|  | 8	1.00	0.66	0.71	0.63	0.56	0.59	0.66	0.72	0.71 | ||||||
|  | 16	0.97	0.78	0.91	0.73	0.67	0.70	0.79	0.80	0.80 | ||||||
|  | 32	0.95	1.17	0.98	0.87	0.82	0.86	0.89	0.90	0.90 | ||||||
|  | 64	0.96	0.95	1.01	1.01	0.98	1.00	1.03	0.99	0.99 | ||||||
|  | 128	0.99	1.01	1.01	1.17	1.08	1.12	1.02	0.97	1.02 | ||||||
|  | 
 | ||||||
|  | non-critical-length: 32 | ||||||
|  | 	1	2	4	8	16	32	64	112	140 | ||||||
|  | 0	1.03	0.97	0.75	0.65	0.58	0.58	0.56	0.70	0.70 | ||||||
|  | 1	0.94	0.95	0.76	0.65	0.58	0.58	0.61	0.71	0.72 | ||||||
|  | 2	0.97	0.96	0.77	0.66	0.58	0.59	0.62	0.74	0.74 | ||||||
|  | 4	0.99	0.96	0.78	0.66	0.60	0.61	0.66	0.76	0.77 | ||||||
|  | 8	0.99	0.99	0.84	0.70	0.64	0.66	0.71	0.80	0.80 | ||||||
|  | 16	0.98	0.97	0.95	0.76	0.70	0.73	0.81	0.85	0.84 | ||||||
|  | 32	1.04	1.12	1.04	0.89	0.82	0.86	0.93	0.91	0.91 | ||||||
|  | 64	0.99	1.15	1.07	1.00	0.99	1.01	1.05	0.99	0.99 | ||||||
|  | 128	1.00	1.21	1.20	1.22	1.25	1.31	1.12	1.10	0.99 | ||||||
|  | 
 | ||||||
|  | non-critical-length: 128 | ||||||
|  | 	1	2	4	8	16	32	64	112	140 | ||||||
|  | 0	1.02	1.00	0.99	0.67	0.61	0.61	0.61	0.74	0.73 | ||||||
|  | 1	0.95	0.99	1.00	0.68	0.61	0.60	0.60	0.74	0.74 | ||||||
|  | 2	1.00	1.04	1.00	0.68	0.59	0.61	0.65	0.76	0.76 | ||||||
|  | 4	1.00	0.96	0.98	0.70	0.63	0.63	0.67	0.78	0.77 | ||||||
|  | 8	1.01	1.02	0.89	0.73	0.65	0.67	0.71	0.81	0.80 | ||||||
|  | 16	0.99	0.96	0.96	0.79	0.71	0.73	0.80	0.84	0.84 | ||||||
|  | 32	0.99	0.95	1.05	0.89	0.84	0.85	0.94	0.92	0.91 | ||||||
|  | 64	1.00	0.99	1.16	1.04	1.00	1.02	1.06	0.99	0.99 | ||||||
|  | 128	1.00	1.06	0.98	1.14	1.39	1.26	1.08	1.02	0.98 | ||||||
|  | 
 | ||||||
|  | There is a regression for large critical sections, but the adaptive mutex | ||||||
|  | is aimed at "quick" locks. Small critical sections are more common when | ||||||
|  | users choose to use an adaptive pthread_mutex. | ||||||
|  | 
 | ||||||
|  | Signed-off-by: Wangyang Guo <wangyang.guo@intel.com> | ||||||
|  | Reviewed-by: H.J. Lu <hjl.tools@gmail.com> | ||||||
|  | 
 | ||||||
|  | Conflicts: | ||||||
|  | 	pthreadP.h | ||||||
|  | 	(had been moved) | ||||||
|  | 	nptl/pthread_mutex_lock.c | ||||||
|  | 	(max_adaptive_count renamed) | ||||||
|  | 
 | ||||||
|  | ---
 | ||||||
|  |  nptl/pthreadP.h                             |  1 + | ||||||
|  |  nptl/pthread_mutex_lock.c                   | 16 +++++++-- | ||||||
|  |  sysdeps/nptl/pthread_mutex_backoff.h        | 35 ++++++++++++++++++ | ||||||
|  |  sysdeps/x86_64/nptl/pthread_mutex_backoff.h | 39 +++++++++++++++++++++ | ||||||
|  |  4 files changed, 89 insertions(+), 2 deletions(-) | ||||||
|  |  create mode 100644 sysdeps/nptl/pthread_mutex_backoff.h | ||||||
|  |  create mode 100644 sysdeps/x86_64/nptl/pthread_mutex_backoff.h | ||||||
|  | 
 | ||||||
|  | diff --git a/nptl/pthreadP.h b/nptl/pthreadP.h
 | ||||||
|  | index 7ddc166c..1550e3b6 100644
 | ||||||
|  | --- a/nptl/pthreadP.h
 | ||||||
|  | +++ b/nptl/pthreadP.h
 | ||||||
|  | @@ -33,6 +33,7 @@
 | ||||||
|  |  #include <kernel-features.h> | ||||||
|  |  #include <errno.h> | ||||||
|  |  #include <internal-signals.h> | ||||||
|  | +#include <pthread_mutex_backoff.h>
 | ||||||
|  |   | ||||||
|  |   | ||||||
|  |  /* Atomic operations on TLS memory.  */ | ||||||
|  | diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
 | ||||||
|  | index d96a9933..c7770fc9 100644
 | ||||||
|  | --- a/nptl/pthread_mutex_lock.c
 | ||||||
|  | +++ b/nptl/pthread_mutex_lock.c
 | ||||||
|  | @@ -133,14 +133,26 @@ __pthread_mutex_lock (pthread_mutex_t *mutex)
 | ||||||
|  |  	  int cnt = 0; | ||||||
|  |  	  int max_cnt = MIN (MAX_ADAPTIVE_COUNT, | ||||||
|  |  			     mutex->__data.__spins * 2 + 10); | ||||||
|  | +	  int spin_count, exp_backoff = 1;
 | ||||||
|  | +	  unsigned int jitter = get_jitter ();
 | ||||||
|  |  	  do | ||||||
|  |  	    { | ||||||
|  | -	      if (cnt++ >= max_cnt)
 | ||||||
|  | +	      /* In each loop, spin count is exponential backoff plus
 | ||||||
|  | +		 random jitter, random range is [0, exp_backoff-1].  */
 | ||||||
|  | +	      spin_count = exp_backoff + (jitter & (exp_backoff - 1));
 | ||||||
|  | +	      cnt += spin_count;
 | ||||||
|  | +	      if (cnt >= max_cnt)
 | ||||||
|  |  		{ | ||||||
|  | +		  /* If cnt exceeds max spin count, just go to wait
 | ||||||
|  | +		     queue.  */
 | ||||||
|  |  		  LLL_MUTEX_LOCK (mutex); | ||||||
|  |  		  break; | ||||||
|  |  		} | ||||||
|  | -	      atomic_spin_nop ();
 | ||||||
|  | +	      do
 | ||||||
|  | +		atomic_spin_nop ();
 | ||||||
|  | +	      while (--spin_count > 0);
 | ||||||
|  | +	      /* Prepare for next loop.  */
 | ||||||
|  | +	      exp_backoff = get_next_backoff (exp_backoff);
 | ||||||
|  |  	    } | ||||||
|  |  	  while (LLL_MUTEX_READ_LOCK (mutex) != 0 | ||||||
|  |  		 || LLL_MUTEX_TRYLOCK (mutex) != 0); | ||||||
|  | diff --git a/sysdeps/nptl/pthread_mutex_backoff.h b/sysdeps/nptl/pthread_mutex_backoff.h
 | ||||||
|  | new file mode 100644 | ||||||
|  | index 00000000..5b26c22a
 | ||||||
|  | --- /dev/null
 | ||||||
|  | +++ b/sysdeps/nptl/pthread_mutex_backoff.h
 | ||||||
|  | @@ -0,0 +1,35 @@
 | ||||||
|  | +/* Pthread mutex backoff configuration.
 | ||||||
|  | +   Copyright (C) 2022 Free Software Foundation, Inc.
 | ||||||
|  | +   This file is part of the GNU C Library.
 | ||||||
|  | +
 | ||||||
|  | +   The GNU C Library is free software; you can redistribute it and/or
 | ||||||
|  | +   modify it under the terms of the GNU Lesser General Public
 | ||||||
|  | +   License as published by the Free Software Foundation; either
 | ||||||
|  | +   version 2.1 of the License, or (at your option) any later version.
 | ||||||
|  | +
 | ||||||
|  | +   The GNU C Library is distributed in the hope that it will be useful,
 | ||||||
|  | +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 | ||||||
|  | +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 | ||||||
|  | +   Lesser General Public License for more details.
 | ||||||
|  | +
 | ||||||
|  | +   You should have received a copy of the GNU Lesser General Public
 | ||||||
|  | +   License along with the GNU C Library; if not, see
 | ||||||
|  | +   <https://www.gnu.org/licenses/>.  */
 | ||||||
|  | +#ifndef _PTHREAD_MUTEX_BACKOFF_H
 | ||||||
|  | +#define _PTHREAD_MUTEX_BACKOFF_H 1
 | ||||||
|  | +
 | ||||||
|  | +static inline unsigned int
 | ||||||
|  | +get_jitter (void)
 | ||||||
|  | +{
 | ||||||
|  | +  /* Arch dependent random jitter, return 0 disables random.  */
 | ||||||
|  | +  return 0;
 | ||||||
|  | +}
 | ||||||
|  | +
 | ||||||
|  | +static inline int
 | ||||||
|  | +get_next_backoff (int backoff)
 | ||||||
|  | +{
 | ||||||
|  | +  /* Next backoff, return 1 disables mutex backoff.  */
 | ||||||
|  | +  return 1;
 | ||||||
|  | +}
 | ||||||
|  | +
 | ||||||
|  | +#endif
 | ||||||
|  | diff --git a/sysdeps/x86_64/nptl/pthread_mutex_backoff.h b/sysdeps/x86_64/nptl/pthread_mutex_backoff.h
 | ||||||
|  | new file mode 100644 | ||||||
|  | index 00000000..ec74c3d9
 | ||||||
|  | --- /dev/null
 | ||||||
|  | +++ b/sysdeps/x86_64/nptl/pthread_mutex_backoff.h
 | ||||||
|  | @@ -0,0 +1,39 @@
 | ||||||
|  | +/* Pthread mutex backoff configuration.
 | ||||||
|  | +   Copyright (C) 2022 Free Software Foundation, Inc.
 | ||||||
|  | +   This file is part of the GNU C Library.
 | ||||||
|  | +
 | ||||||
|  | +   The GNU C Library is free software; you can redistribute it and/or
 | ||||||
|  | +   modify it under the terms of the GNU Lesser General Public
 | ||||||
|  | +   License as published by the Free Software Foundation; either
 | ||||||
|  | +   version 2.1 of the License, or (at your option) any later version.
 | ||||||
|  | +
 | ||||||
|  | +   The GNU C Library is distributed in the hope that it will be useful,
 | ||||||
|  | +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 | ||||||
|  | +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 | ||||||
|  | +   Lesser General Public License for more details.
 | ||||||
|  | +
 | ||||||
|  | +   You should have received a copy of the GNU Lesser General Public
 | ||||||
|  | +   License along with the GNU C Library; if not, see
 | ||||||
|  | +   <https://www.gnu.org/licenses/>.  */
 | ||||||
|  | +#ifndef _PTHREAD_MUTEX_BACKOFF_H
 | ||||||
|  | +#define _PTHREAD_MUTEX_BACKOFF_H 1
 | ||||||
|  | +
 | ||||||
|  | +#include <fast-jitter.h>
 | ||||||
|  | +
 | ||||||
|  | +static inline unsigned int
 | ||||||
|  | +get_jitter (void)
 | ||||||
|  | +{
 | ||||||
|  | +  return get_fast_jitter ();
 | ||||||
|  | +}
 | ||||||
|  | +
 | ||||||
|  | +#define MAX_BACKOFF 16
 | ||||||
|  | +
 | ||||||
|  | +static inline int
 | ||||||
|  | +get_next_backoff (int backoff)
 | ||||||
|  | +{
 | ||||||
|  | +  /* Binary exponential backoff. Limiting max backoff
 | ||||||
|  | +     can reduce latency in large critical sections.  */
 | ||||||
|  | +  return (backoff < MAX_BACKOFF) ? backoff << 1 : backoff;
 | ||||||
|  | +}
 | ||||||
|  | +
 | ||||||
|  | +#endif
 | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
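Read as straight C, with the context of pthread_mutex_lock.c filled in from the hunk above (LLL_MUTEX_* and atomic_spin_nop are the existing nptl macros), the patched spin loop becomes:

    int cnt = 0;
    int max_cnt = MIN (MAX_ADAPTIVE_COUNT,
                       mutex->__data.__spins * 2 + 10);
    int spin_count, exp_backoff = 1;
    unsigned int jitter = get_jitter ();
    do
      {
        /* Spin count is the exponential backoff plus random jitter
           drawn from [0, exp_backoff - 1].  */
        spin_count = exp_backoff + (jitter & (exp_backoff - 1));
        cnt += spin_count;
        if (cnt >= max_cnt)
          {
            /* Too much spinning: go to the wait queue.  */
            LLL_MUTEX_LOCK (mutex);
            break;
          }
        do
          atomic_spin_nop ();
        while (--spin_count > 0);
        /* Prepare for the next round; capped at MAX_BACKOFF on
           x86_64, a constant 1 in the generic fallback.  */
        exp_backoff = get_next_backoff (exp_backoff);
      }
    while (LLL_MUTEX_READ_LOCK (mutex) != 0
           || LLL_MUTEX_TRYLOCK (mutex) != 0);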
SOURCES/glibc-RHEL-15696-108.patch | 55 lines (new file)
							| @ -0,0 +1,55 @@ | |||||||
|  | From c6272098323153db373f2986c67786ea8c85f1cf Mon Sep 17 00:00:00 2001 | ||||||
|  | From: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  | Date: Tue, 15 Feb 2022 08:18:15 -0600 | ||||||
|  | Subject: [PATCH] x86: Fallback {str|wcs}cmp RTM in the ncmp overflow case [BZ | ||||||
|  |  #28896] | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | In the overflow fallback strncmp-avx2-rtm and wcsncmp-avx2-rtm would | ||||||
|  | call strcmp-avx2 and wcscmp-avx2 respectively. These have no | ||||||
|  | checks around vzeroupper and would trigger spurious | ||||||
|  | aborts. This commit fixes that. | ||||||
|  | 
 | ||||||
|  | test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass on | ||||||
|  | AVX2 machines with and without RTM. | ||||||
|  | 
 | ||||||
|  | Co-authored-by: H.J. Lu <hjl.tools@gmail.com> | ||||||
|  | ---
 | ||||||
|  |  sysdeps/x86_64/multiarch/strcmp-avx2.S | 8 ++------ | ||||||
|  |  1 file changed, 2 insertions(+), 6 deletions(-) | ||||||
|  | 
 | ||||||
|  | Conflicts: | ||||||
|  | 	sysdeps/x86_64/multiarch/strcmp-avx2.S | ||||||
|  | 	(split into two patches due to upstream bug differences) | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
 | ||||||
|  | index 28cc98b6..e267c6cb 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
 | ||||||
|  | @@ -345,10 +345,10 @@ L(one_or_less):
 | ||||||
|  |  	movq	%LOCALE_REG, %rdx | ||||||
|  |  #  endif | ||||||
|  |  	jb	L(ret_zero) | ||||||
|  | -#  ifdef USE_AS_WCSCMP
 | ||||||
|  |  	/* 'nbe' covers the case where length is negative (large | ||||||
|  |  	   unsigned).  */ | ||||||
|  | -	jnbe	__wcscmp_avx2
 | ||||||
|  | +	jnbe	OVERFLOW_STRCMP
 | ||||||
|  | +#  ifdef USE_AS_WCSCMP
 | ||||||
|  |  	movl	(%rdi), %edx | ||||||
|  |  	xorl	%eax, %eax | ||||||
|  |  	cmpl	(%rsi), %edx | ||||||
|  | @@ -357,10 +357,6 @@ L(one_or_less):
 | ||||||
|  |  	negl	%eax | ||||||
|  |  	orl	$1, %eax | ||||||
|  |  #  else | ||||||
|  | -	/* 'nbe' covers the case where length is negative (large
 | ||||||
|  | -	   unsigned).  */
 | ||||||
|  | -
 | ||||||
|  | -	jnbe	__strcmp_avx2
 | ||||||
|  |  	movzbl	(%rdi), %eax | ||||||
|  |  	movzbl	(%rsi), %ecx | ||||||
|  |  	TOLOWER_gpr (%rax, %eax) | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
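The overflow case being guarded follows from strncmp taking a size_t length: a "negative" length is a huge unsigned value that can never be exhausted, so the call must behave like plain strcmp. A stand-alone check of that contract:

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    int
    main (void)
    {
      /* With n == SIZE_MAX the limit is unreachable, so strncmp must
         agree in sign with strcmp -- which is why the RTM variants
         need an overflow fallback (OVERFLOW_STRCMP) that preserves
         the vzeroupper handling.  */
      int a = strncmp ("abc", "abd", SIZE_MAX);
      int b = strcmp ("abc", "abd");
      assert ((a < 0) == (b < 0) && (a > 0) == (b > 0));
      return 0;
    }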
SOURCES/glibc-RHEL-15696-109.patch | 60 lines (new file)
							| @ -0,0 +1,60 @@ | |||||||
|  | From 259a17cc98058d2576511201f85d28cb5d9de2a2 Mon Sep 17 00:00:00 2001 | ||||||
|  | From: Stefan Liebler <stli@linux.ibm.com> | ||||||
|  | Date: Mon, 28 Jun 2021 13:01:07 +0200 | ||||||
|  | Subject: s390x: Update math: redirect roundeven function | ||||||
|  | 
 | ||||||
|  | After recent commit | ||||||
|  | 447954a206837b5f153869cfeeeab44631c3fac9 | ||||||
|  | "math: redirect roundeven function", building on | ||||||
|  | s390x fails with: | ||||||
|  | Error: symbol `__roundevenl' is already defined | ||||||
|  | 
 | ||||||
|  | Similar to the aarch64/riscv fix, this patch redirects the | ||||||
|  | target-specific functions for s390x: | ||||||
|  | commit 3213ed770cbc5821920d16caa93c85e92dd7b9f6 | ||||||
|  | "Update math: redirect roundeven function" | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/s390/fpu/s_roundeven.c b/sysdeps/s390/fpu/s_roundeven.c
 | ||||||
|  | index 40b07e054b..0773adfed0 100644
 | ||||||
|  | --- a/sysdeps/s390/fpu/s_roundeven.c
 | ||||||
|  | +++ b/sysdeps/s390/fpu/s_roundeven.c
 | ||||||
|  | @@ -18,6 +18,7 @@
 | ||||||
|  |     <https://www.gnu.org/licenses/>.  */ | ||||||
|  |   | ||||||
|  |  #ifdef HAVE_S390_MIN_Z196_ZARCH_ASM_SUPPORT | ||||||
|  | +# define NO_MATH_REDIRECT
 | ||||||
|  |  # include <math.h> | ||||||
|  |  # include <libm-alias-double.h> | ||||||
|  |   | ||||||
|  | @@ -31,7 +32,6 @@ __roundeven (double x)
 | ||||||
|  |    __asm__ ("fidbra %0,4,%1,4" : "=f" (y) : "f" (x)); | ||||||
|  |    return y; | ||||||
|  |  } | ||||||
|  | -hidden_def (__roundeven)
 | ||||||
|  |  libm_alias_double (__roundeven, roundeven) | ||||||
|  |   | ||||||
|  |  #else | ||||||
|  | diff --git a/sysdeps/s390/fpu/s_roundevenf.c b/sysdeps/s390/fpu/s_roundevenf.c
 | ||||||
|  | index d2fbf3d2b6..289785bc4a 100644
 | ||||||
|  | --- a/sysdeps/s390/fpu/s_roundevenf.c
 | ||||||
|  | +++ b/sysdeps/s390/fpu/s_roundevenf.c
 | ||||||
|  | @@ -18,6 +18,7 @@
 | ||||||
|  |     <https://www.gnu.org/licenses/>.  */ | ||||||
|  |   | ||||||
|  |  #ifdef HAVE_S390_MIN_Z196_ZARCH_ASM_SUPPORT | ||||||
|  | +# define NO_MATH_REDIRECT
 | ||||||
|  |  # include <math.h> | ||||||
|  |  # include <libm-alias-float.h> | ||||||
|  |   | ||||||
|  | diff --git a/sysdeps/s390/fpu/s_roundevenl.c b/sysdeps/s390/fpu/s_roundevenl.c
 | ||||||
|  | index 29ab7a8616..94b6459ab4 100644
 | ||||||
|  | --- a/sysdeps/s390/fpu/s_roundevenl.c
 | ||||||
|  | +++ b/sysdeps/s390/fpu/s_roundevenl.c
 | ||||||
|  | @@ -18,6 +18,7 @@
 | ||||||
|  |     <https://www.gnu.org/licenses/>.  */ | ||||||
|  |   | ||||||
|  |  #ifdef HAVE_S390_MIN_Z196_ZARCH_ASM_SUPPORT | ||||||
|  | +# define NO_MATH_REDIRECT
 | ||||||
|  |  # include <math.h> | ||||||
|  |  # include <math_private.h> | ||||||
|  |  # include <libm-alias-ldouble.h> | ||||||
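For context, roundeven implements IEEE 754 roundTiesToEven: halfway cases round to the nearest even value, unlike round, which rounds them away from zero. A quick demonstration (assumes glibc 2.25 or later, where roundeven was added; _GNU_SOURCE may be needed to expose the declaration):

    #define _GNU_SOURCE
    #include <math.h>
    #include <stdio.h>

    int
    main (void)
    {
      /* Ties go to the even neighbour: 0.5 -> 0, 1.5 -> 2, 2.5 -> 2,
         whereas round (2.5) would give 3.  */
      printf ("%.1f %.1f %.1f\n",
              roundeven (0.5), roundeven (1.5), roundeven (2.5));
      return 0;
    }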
SOURCES/glibc-RHEL-15696-11.patch | 74 lines (new file)
							| @ -0,0 +1,74 @@ | |||||||
|  | From 1da50d4bda07f04135dca39f40e79fc9eabed1f8 Mon Sep 17 00:00:00 2001 | ||||||
|  | From: "H.J. Lu" <hjl.tools@gmail.com> | ||||||
|  | Date: Fri, 26 Feb 2021 05:36:59 -0800 | ||||||
|  | Subject: [PATCH] x86: Set Prefer_No_VZEROUPPER and add Prefer_AVX2_STRCMP | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | 1. Set Prefer_No_VZEROUPPER if RTM is usable to avoid RTM abort triggered | ||||||
|  | by VZEROUPPER inside a transactionally executing RTM region. | ||||||
|  | 2. Since to compare 2 32-byte strings, 256-bit EVEX strcmp requires 2 | ||||||
|  | loads, 3 VPCMPs and 2 KORDs while AVX2 strcmp requires 1 load, 2 VPCMPEQs, | ||||||
|  | 1 VPMINU and 1 VPMOVMSKB, AVX2 strcmp is faster than EVEX strcmp.  Add | ||||||
|  | Prefer_AVX2_STRCMP to prefer AVX2 strcmp family functions. | ||||||
|  | ---
 | ||||||
|  |  sysdeps/x86/cpu-features.c                    | 20 +++++++++++++++++-- | ||||||
|  |  sysdeps/x86/cpu-tunables.c                    |  2 ++ | ||||||
|  |  ...cpu-features-preferred_feature_index_1.def |  1 + | ||||||
|  |  3 files changed, 21 insertions(+), 2 deletions(-) | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
 | ||||||
|  | index 91042505..3610ee5c 100644
 | ||||||
|  | --- a/sysdeps/x86/cpu-features.c
 | ||||||
|  | +++ b/sysdeps/x86/cpu-features.c
 | ||||||
|  | @@ -524,8 +524,24 @@ init_cpu_features (struct cpu_features *cpu_features)
 | ||||||
|  |  	cpu_features->preferred[index_arch_Prefer_No_VZEROUPPER] | ||||||
|  |  	  |= bit_arch_Prefer_No_VZEROUPPER; | ||||||
|  |        else | ||||||
|  | -	cpu_features->preferred[index_arch_Prefer_No_AVX512]
 | ||||||
|  | -	  |= bit_arch_Prefer_No_AVX512;
 | ||||||
|  | +	{
 | ||||||
|  | +	  cpu_features->preferred[index_arch_Prefer_No_AVX512]
 | ||||||
|  | +	    |= bit_arch_Prefer_No_AVX512;
 | ||||||
|  | +
 | ||||||
|  | +	  /* Avoid RTM abort triggered by VZEROUPPER inside a
 | ||||||
|  | +	     transactionally executing RTM region.  */
 | ||||||
|  | +	  if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
 | ||||||
|  | +	    cpu_features->preferred[index_arch_Prefer_No_VZEROUPPER]
 | ||||||
|  | +	      |= bit_arch_Prefer_No_VZEROUPPER;
 | ||||||
|  | +
 | ||||||
|  | +	  /* Since to compare 2 32-byte strings, 256-bit EVEX strcmp
 | ||||||
|  | +	     requires 2 loads, 3 VPCMPs and 2 KORDs while AVX2 strcmp
 | ||||||
|  | +	     requires 1 load, 2 VPCMPEQs, 1 VPMINU and 1 VPMOVMSKB,
 | ||||||
|  | +	     AVX2 strcmp is faster than EVEX strcmp.  */
 | ||||||
|  | +	  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
 | ||||||
|  | +	    cpu_features->preferred[index_arch_Prefer_AVX2_STRCMP]
 | ||||||
|  | +	      |= bit_arch_Prefer_AVX2_STRCMP;
 | ||||||
|  | +	}
 | ||||||
|  |      } | ||||||
|  |    /* This spells out "AuthenticAMD".  */ | ||||||
|  |    else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65) | ||||||
|  | diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
 | ||||||
|  | index 3173b2b9..73adbaba 100644
 | ||||||
|  | --- a/sysdeps/x86/cpu-tunables.c
 | ||||||
|  | +++ b/sysdeps/x86/cpu-tunables.c
 | ||||||
|  | @@ -239,6 +239,8 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
 | ||||||
|  |  	      CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features, | ||||||
|  |  						Fast_Copy_Backward, | ||||||
|  |  						disable, 18); | ||||||
|  | +	      CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH
 | ||||||
|  | +		(n, cpu_features, Prefer_AVX2_STRCMP, AVX2, disable, 18);
 | ||||||
|  |  	    } | ||||||
|  |  	  break; | ||||||
|  |  	case 19: | ||||||
|  | diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
 | ||||||
|  | index 17a5cc42..4ca70b40 100644
 | ||||||
|  | --- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
 | ||||||
|  | +++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
 | ||||||
|  | @@ -32,3 +32,4 @@ BIT (Prefer_ERMS)
 | ||||||
|  |  BIT (Prefer_FSRM) | ||||||
|  |  BIT (Prefer_No_AVX512) | ||||||
|  |  BIT (MathVec_Prefer_No_AVX512) | ||||||
|  | +BIT (Prefer_AVX2_STRCMP)
 | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
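How the new bit gets consumed: later patches in this series teach the string-function ifunc selectors to skip the EVEX variant when Prefer_AVX2_STRCMP is set. Roughly, as a sketch of the eventual strcmp selector (not part of this patch):

    /* Sketch of the eventual strcmp IFUNC selection order: EVEX only
       when AVX512VL/BW are usable and AVX2 is not preferred, RTM gets
       its own variant, and plain AVX2 is avoided when vzeroupper is
       undesirable.  */
    if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
        && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
      {
        if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
            && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
            && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP))
          return OPTIMIZE (evex);

        if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
          return OPTIMIZE (avx2_rtm);

        if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
          return OPTIMIZE (avx2);
      }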
SOURCES/glibc-RHEL-15696-110.patch | 26 lines (new file)
							| @ -0,0 +1,26 @@ | |||||||
|  | From 3213ed770cbc5821920d16caa93c85e92dd7b9f6 Mon Sep 17 00:00:00 2001 | ||||||
|  | From: "H.J. Lu" <hjl.tools@gmail.com> | ||||||
|  | Date: Wed, 23 Jun 2021 13:29:41 -0700 | ||||||
|  | Subject: Update math: redirect roundeven function | ||||||
|  | 
 | ||||||
|  | Redirect target specific roundeven functions for aarch64, ldbl-128ibm | ||||||
|  | and riscv. | ||||||
|  | 
 | ||||||
|  | Conflicts: | ||||||
|  | 	sysdeps/aarch64/* | ||||||
|  | 	(not needed) | ||||||
|  | 	sysdeps/riscv/* | ||||||
|  | 	(not supported) | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c b/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c
 | ||||||
|  | index 6701970f4a..90eecf496b 100644
 | ||||||
|  | --- a/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c
 | ||||||
|  | +++ b/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c
 | ||||||
|  | @@ -17,6 +17,7 @@
 | ||||||
|  |     License along with the GNU C Library; if not, see | ||||||
|  |     <http://www.gnu.org/licenses/>.  */ | ||||||
|  |   | ||||||
|  | +#define NO_MATH_REDIRECT
 | ||||||
|  |  #include <math.h> | ||||||
|  |  #include <math_private.h> | ||||||
|  |   | ||||||
SOURCES/glibc-RHEL-15696-12.patch | 3410 lines (new file; diff suppressed because it is too large)
SOURCES/glibc-RHEL-15696-13.patch | 1488 lines (new file; diff suppressed because it is too large)
SOURCES/glibc-RHEL-15696-14.patch | 242 lines (new file)
							| @ -0,0 +1,242 @@ | |||||||
|  | From 63ad43566f7a25d140dc723598aeb441ad657eed Mon Sep 17 00:00:00 2001 | ||||||
|  | From: "H.J. Lu" <hjl.tools@gmail.com> | ||||||
|  | Date: Fri, 5 Mar 2021 06:46:08 -0800 | ||||||
|  | Subject: [PATCH] x86-64: Add memmove family functions with 256-bit EVEX | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | Update ifunc-memmove.h to select the function optimized with 256-bit EVEX | ||||||
|  | instructions using YMM16-YMM31 registers to avoid RTM abort with usable | ||||||
|  | AVX512VL since VZEROUPPER isn't needed at function exit. | ||||||
|  | ---
 | ||||||
|  |  sysdeps/x86_64/multiarch/Makefile             |  1 + | ||||||
|  |  sysdeps/x86_64/multiarch/ifunc-impl-list.c    | 36 +++++++++++++++++++ | ||||||
|  |  sysdeps/x86_64/multiarch/ifunc-memmove.h      | 21 +++++++++-- | ||||||
|  |  .../multiarch/memmove-evex-unaligned-erms.S   | 33 +++++++++++++++++ | ||||||
|  |  .../multiarch/memmove-vec-unaligned-erms.S    | 24 ++++++++----- | ||||||
|  |  5 files changed, 104 insertions(+), 11 deletions(-) | ||||||
|  |  create mode 100644 sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
 | ||||||
|  | index 46783cd1..4563fc56 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/Makefile
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/Makefile
 | ||||||
|  | @@ -41,6 +41,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
 | ||||||
|  |  		   memset-avx2-unaligned-erms \ | ||||||
|  |  		   memset-avx512-unaligned-erms \ | ||||||
|  |  		   memchr-evex \ | ||||||
|  | +		   memmove-evex-unaligned-erms \
 | ||||||
|  |  		   memrchr-evex \ | ||||||
|  |  		   rawmemchr-evex \ | ||||||
|  |  		   stpcpy-evex \ | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 | ||||||
|  | index 082e4da3..6bd3abfc 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 | ||||||
|  | @@ -80,6 +80,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, __memmove_chk, | ||||||
|  |  			      CPU_FEATURE_USABLE (AVX), | ||||||
|  |  			      __memmove_chk_avx_unaligned_erms) | ||||||
|  | +	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
 | ||||||
|  | +			      CPU_FEATURE_USABLE (AVX512VL),
 | ||||||
|  | +			      __memmove_chk_evex_unaligned)
 | ||||||
|  | +	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
 | ||||||
|  | +			      CPU_FEATURE_USABLE (AVX512VL),
 | ||||||
|  | +			      __memmove_chk_evex_unaligned_erms)
 | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, __memmove_chk, | ||||||
|  |  			      CPU_FEATURE_USABLE (SSSE3), | ||||||
|  |  			      __memmove_chk_ssse3_back) | ||||||
|  | @@ -102,6 +108,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, memmove, | ||||||
|  |  			      CPU_FEATURE_USABLE (AVX), | ||||||
|  |  			      __memmove_avx_unaligned_erms) | ||||||
|  | +	      IFUNC_IMPL_ADD (array, i, memmove,
 | ||||||
|  | +			      CPU_FEATURE_USABLE (AVX512VL),
 | ||||||
|  | +			      __memmove_evex_unaligned)
 | ||||||
|  | +	      IFUNC_IMPL_ADD (array, i, memmove,
 | ||||||
|  | +			      CPU_FEATURE_USABLE (AVX512VL),
 | ||||||
|  | +			      __memmove_evex_unaligned_erms)
 | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, memmove, | ||||||
|  |  			      CPU_FEATURE_USABLE (AVX512F), | ||||||
|  |  			      __memmove_avx512_no_vzeroupper) | ||||||
|  | @@ -565,6 +577,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, | ||||||
|  |  			      CPU_FEATURE_USABLE (AVX), | ||||||
|  |  			      __memcpy_chk_avx_unaligned_erms) | ||||||
|  | +	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
 | ||||||
|  | +			      CPU_FEATURE_USABLE (AVX512VL),
 | ||||||
|  | +			      __memcpy_chk_evex_unaligned)
 | ||||||
|  | +	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
 | ||||||
|  | +			      CPU_FEATURE_USABLE (AVX512VL),
 | ||||||
|  | +			      __memcpy_chk_evex_unaligned_erms)
 | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, | ||||||
|  |  			      CPU_FEATURE_USABLE (SSSE3), | ||||||
|  |  			      __memcpy_chk_ssse3_back) | ||||||
|  | @@ -587,6 +605,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, memcpy, | ||||||
|  |  			      CPU_FEATURE_USABLE (AVX), | ||||||
|  |  			      __memcpy_avx_unaligned_erms) | ||||||
|  | +	      IFUNC_IMPL_ADD (array, i, memcpy,
 | ||||||
|  | +			      CPU_FEATURE_USABLE (AVX512VL),
 | ||||||
|  | +			      __memcpy_evex_unaligned)
 | ||||||
|  | +	      IFUNC_IMPL_ADD (array, i, memcpy,
 | ||||||
|  | +			      CPU_FEATURE_USABLE (AVX512VL),
 | ||||||
|  | +			      __memcpy_evex_unaligned_erms)
 | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3), | ||||||
|  |  			      __memcpy_ssse3_back) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3), | ||||||
|  | @@ -623,6 +647,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, | ||||||
|  |  			      CPU_FEATURE_USABLE (AVX), | ||||||
|  |  			      __mempcpy_chk_avx_unaligned_erms) | ||||||
|  | +	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
 | ||||||
|  | +			      CPU_FEATURE_USABLE (AVX512VL),
 | ||||||
|  | +			      __mempcpy_chk_evex_unaligned)
 | ||||||
|  | +	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
 | ||||||
|  | +			      CPU_FEATURE_USABLE (AVX512VL),
 | ||||||
|  | +			      __mempcpy_chk_evex_unaligned_erms)
 | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, | ||||||
|  |  			      CPU_FEATURE_USABLE (SSSE3), | ||||||
|  |  			      __mempcpy_chk_ssse3_back) | ||||||
|  | @@ -654,6 +684,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, mempcpy, | ||||||
|  |  			      CPU_FEATURE_USABLE (AVX), | ||||||
|  |  			      __mempcpy_avx_unaligned_erms) | ||||||
|  | +	      IFUNC_IMPL_ADD (array, i, mempcpy,
 | ||||||
|  | +			      CPU_FEATURE_USABLE (AVX512VL),
 | ||||||
|  | +			      __mempcpy_evex_unaligned)
 | ||||||
|  | +	      IFUNC_IMPL_ADD (array, i, mempcpy,
 | ||||||
|  | +			      CPU_FEATURE_USABLE (AVX512VL),
 | ||||||
|  | +			      __mempcpy_evex_unaligned_erms)
 | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3), | ||||||
|  |  			      __mempcpy_ssse3_back) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3), | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
 | ||||||
|  | index 5e5f0299..6f8bce5f 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
 | ||||||
|  | @@ -29,6 +29,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden;
 | ||||||
|  |  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden; | ||||||
|  |  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms) | ||||||
|  |    attribute_hidden; | ||||||
|  | +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
 | ||||||
|  | +  attribute_hidden;
 | ||||||
|  | +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
 | ||||||
|  | +  attribute_hidden;
 | ||||||
|  |  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) | ||||||
|  |    attribute_hidden; | ||||||
|  |  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms) | ||||||
|  | @@ -59,10 +63,21 @@ IFUNC_SELECTOR (void)
 | ||||||
|  |   | ||||||
|  |    if (CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) | ||||||
|  |      { | ||||||
|  | -      if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
 | ||||||
|  | -	return OPTIMIZE (avx_unaligned_erms);
 | ||||||
|  | +      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
 | ||||||
|  | +	{
 | ||||||
|  | +	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
 | ||||||
|  | +	    return OPTIMIZE (evex_unaligned_erms);
 | ||||||
|  | +
 | ||||||
|  | +	  return OPTIMIZE (evex_unaligned);
 | ||||||
|  | +	}
 | ||||||
|  | +
 | ||||||
|  | +      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
 | ||||||
|  | +	{
 | ||||||
|  | +	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
 | ||||||
|  | +	    return OPTIMIZE (avx_unaligned_erms);
 | ||||||
|  |   | ||||||
|  | -      return OPTIMIZE (avx_unaligned);
 | ||||||
|  | +	  return OPTIMIZE (avx_unaligned);
 | ||||||
|  | +	}
 | ||||||
|  |      } | ||||||
|  |   | ||||||
|  |    if (!CPU_FEATURE_USABLE_P (cpu_features, SSSE3) | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
 | ||||||
|  | new file mode 100644 | ||||||
|  | index 00000000..0cbce8f9
 | ||||||
|  | --- /dev/null
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
 | ||||||
|  | @@ -0,0 +1,33 @@
 | ||||||
|  | +#if IS_IN (libc)
 | ||||||
|  | +# define VEC_SIZE	32
 | ||||||
|  | +# define XMM0		xmm16
 | ||||||
|  | +# define XMM1		xmm17
 | ||||||
|  | +# define YMM0		ymm16
 | ||||||
|  | +# define YMM1		ymm17
 | ||||||
|  | +# define VEC0		ymm16
 | ||||||
|  | +# define VEC1		ymm17
 | ||||||
|  | +# define VEC2		ymm18
 | ||||||
|  | +# define VEC3		ymm19
 | ||||||
|  | +# define VEC4		ymm20
 | ||||||
|  | +# define VEC5		ymm21
 | ||||||
|  | +# define VEC6		ymm22
 | ||||||
|  | +# define VEC7		ymm23
 | ||||||
|  | +# define VEC8		ymm24
 | ||||||
|  | +# define VEC9		ymm25
 | ||||||
|  | +# define VEC10		ymm26
 | ||||||
|  | +# define VEC11		ymm27
 | ||||||
|  | +# define VEC12		ymm28
 | ||||||
|  | +# define VEC13		ymm29
 | ||||||
|  | +# define VEC14		ymm30
 | ||||||
|  | +# define VEC15		ymm31
 | ||||||
|  | +# define VEC(i)		VEC##i
 | ||||||
|  | +# define VMOVNT		vmovntdq
 | ||||||
|  | +# define VMOVU		vmovdqu64
 | ||||||
|  | +# define VMOVA		vmovdqa64
 | ||||||
|  | +# define VZEROUPPER
 | ||||||
|  | +
 | ||||||
|  | +# define SECTION(p)		p##.evex
 | ||||||
|  | +# define MEMMOVE_SYMBOL(p,s)	p##_evex_##s
 | ||||||
|  | +
 | ||||||
|  | +# include "memmove-vec-unaligned-erms.S"
 | ||||||
|  | +#endif
 | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
 | ||||||
|  | index 274aa1c7..08e21692 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
 | ||||||
|  | @@ -48,6 +48,14 @@
 | ||||||
|  |  # define MEMMOVE_CHK_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s) | ||||||
|  |  #endif | ||||||
|  |   | ||||||
|  | +#ifndef XMM0
 | ||||||
|  | +# define XMM0				xmm0
 | ||||||
|  | +#endif
 | ||||||
|  | +
 | ||||||
|  | +#ifndef YMM0
 | ||||||
|  | +# define YMM0				ymm0
 | ||||||
|  | +#endif
 | ||||||
|  | +
 | ||||||
|  |  #ifndef VZEROUPPER | ||||||
|  |  # if VEC_SIZE > 16 | ||||||
|  |  #  define VZEROUPPER vzeroupper | ||||||
|  | @@ -277,20 +285,20 @@ L(less_vec):
 | ||||||
|  |  #if VEC_SIZE > 32 | ||||||
|  |  L(between_32_63): | ||||||
|  |  	/* From 32 to 63.  No branch when size == 32.  */ | ||||||
|  | -	vmovdqu	(%rsi), %ymm0
 | ||||||
|  | -	vmovdqu	-32(%rsi,%rdx), %ymm1
 | ||||||
|  | -	vmovdqu	%ymm0, (%rdi)
 | ||||||
|  | -	vmovdqu	%ymm1, -32(%rdi,%rdx)
 | ||||||
|  | +	VMOVU	(%rsi), %YMM0
 | ||||||
|  | +	VMOVU	-32(%rsi,%rdx), %YMM1
 | ||||||
|  | +	VMOVU	%YMM0, (%rdi)
 | ||||||
|  | +	VMOVU	%YMM1, -32(%rdi,%rdx)
 | ||||||
|  |  	VZEROUPPER | ||||||
|  |  	ret | ||||||
|  |  #endif | ||||||
|  |  #if VEC_SIZE > 16 | ||||||
|  |  	/* From 16 to 31.  No branch when size == 16.  */ | ||||||
|  |  L(between_16_31): | ||||||
|  | -	vmovdqu	(%rsi), %xmm0
 | ||||||
|  | -	vmovdqu	-16(%rsi,%rdx), %xmm1
 | ||||||
|  | -	vmovdqu	%xmm0, (%rdi)
 | ||||||
|  | -	vmovdqu	%xmm1, -16(%rdi,%rdx)
 | ||||||
|  | +	VMOVU	(%rsi), %XMM0
 | ||||||
|  | +	VMOVU	-16(%rsi,%rdx), %XMM1
 | ||||||
|  | +	VMOVU	%XMM0, (%rdi)
 | ||||||
|  | +	VMOVU	%XMM1, -16(%rdi,%rdx)
 | ||||||
|  |  	ret | ||||||
|  |  #endif | ||||||
|  |  L(between_8_15): | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
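Why the empty VZEROUPPER definition in the new wrapper is safe: EVEX-encoded instructions that touch only ymm16-ymm31 never dirty the legacy AVX upper state, so there is no SSE/AVX transition penalty to clean up and no vzeroupper to abort a transactionally executing RTM region. An illustrative contrast (hand-written, not code from the patch):

    /* VEX code using ymm0-15 must clean up before returning:  */
    vmovdqu    (%rsi), %ymm0
    vmovdqu    %ymm0, (%rdi)
    vzeroupper    /* Needed to avoid transition penalties; under RTM
                     this very instruction can abort the transaction.  */
    ret

    /* The same copy via EVEX and ymm16-31 needs no vzeroupper:  */
    vmovdqu64  (%rsi), %ymm16
    vmovdqu64  %ymm16, (%rdi)
    ret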
SOURCES/glibc-RHEL-15696-15.patch | 254 lines (new file)
							| @ -0,0 +1,254 @@ | |||||||
|  | From 1b968b6b9b3aac702ac2f133e0dd16cfdbb415ee Mon Sep 17 00:00:00 2001 | ||||||
|  | From: "H.J. Lu" <hjl.tools@gmail.com> | ||||||
|  | Date: Fri, 5 Mar 2021 07:15:03 -0800 | ||||||
|  | Subject: [PATCH] x86-64: Add memset family functions with 256-bit EVEX | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | Update ifunc-memset.h/ifunc-wmemset.h to select the function optimized | ||||||
|  | with 256-bit EVEX instructions using YMM16-YMM31 registers to avoid RTM | ||||||
|  | abort with usable AVX512VL and AVX512BW since VZEROUPPER isn't needed at | ||||||
|  | function exit. | ||||||
|  | ---
 | ||||||
|  |  sysdeps/x86_64/multiarch/Makefile             |  1 + | ||||||
|  |  sysdeps/x86_64/multiarch/ifunc-impl-list.c    | 22 +++++++++++++++++ | ||||||
|  |  sysdeps/x86_64/multiarch/ifunc-memset.h       | 24 +++++++++++++++---- | ||||||
|  |  sysdeps/x86_64/multiarch/ifunc-wmemset.h      | 13 ++++++---- | ||||||
|  |  .../multiarch/memset-evex-unaligned-erms.S    | 24 +++++++++++++++++++ | ||||||
|  |  .../multiarch/memset-vec-unaligned-erms.S     | 20 +++++++++++----- | ||||||
|  |  6 files changed, 90 insertions(+), 14 deletions(-) | ||||||
|  |  create mode 100644 sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
 | ||||||
|  | index 4563fc56..1cc0a10e 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/Makefile
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/Makefile
 | ||||||
|  | @@ -43,6 +43,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
 | ||||||
|  |  		   memchr-evex \ | ||||||
|  |  		   memmove-evex-unaligned-erms \ | ||||||
|  |  		   memrchr-evex \ | ||||||
|  | +		   memset-evex-unaligned-erms \
 | ||||||
|  |  		   rawmemchr-evex \ | ||||||
|  |  		   stpcpy-evex \ | ||||||
|  |  		   stpncpy-evex \ | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 | ||||||
|  | index 6bd3abfc..7cf83485 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 | ||||||
|  | @@ -160,6 +160,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, __memset_chk, | ||||||
|  |  			      CPU_FEATURE_USABLE (AVX2), | ||||||
|  |  			      __memset_chk_avx2_unaligned_erms) | ||||||
|  | +	      IFUNC_IMPL_ADD (array, i, __memset_chk,
 | ||||||
|  | +			      (CPU_FEATURE_USABLE (AVX512VL)
 | ||||||
|  | +			       && CPU_FEATURE_USABLE (AVX512BW)),
 | ||||||
|  | +			      __memset_chk_evex_unaligned)
 | ||||||
|  | +	      IFUNC_IMPL_ADD (array, i, __memset_chk,
 | ||||||
|  | +			      (CPU_FEATURE_USABLE (AVX512VL)
 | ||||||
|  | +			       && CPU_FEATURE_USABLE (AVX512BW)),
 | ||||||
|  | +			      __memset_chk_evex_unaligned_erms)
 | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, __memset_chk, | ||||||
|  |  			      CPU_FEATURE_USABLE (AVX512F), | ||||||
|  |  			      __memset_chk_avx512_unaligned_erms) | ||||||
|  | @@ -185,6 +193,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, memset, | ||||||
|  |  			      CPU_FEATURE_USABLE (AVX2), | ||||||
|  |  			      __memset_avx2_unaligned_erms) | ||||||
|  | +	      IFUNC_IMPL_ADD (array, i, memset,
 | ||||||
|  | +			      (CPU_FEATURE_USABLE (AVX512VL)
 | ||||||
|  | +			       && CPU_FEATURE_USABLE (AVX512BW)),
 | ||||||
|  | +			      __memset_evex_unaligned)
 | ||||||
|  | +	      IFUNC_IMPL_ADD (array, i, memset,
 | ||||||
|  | +			      (CPU_FEATURE_USABLE (AVX512VL)
 | ||||||
|  | +			       && CPU_FEATURE_USABLE (AVX512BW)),
 | ||||||
|  | +			      __memset_evex_unaligned_erms)
 | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, memset, | ||||||
|  |  			      CPU_FEATURE_USABLE (AVX512F), | ||||||
|  |  			      __memset_avx512_unaligned_erms) | ||||||
|  | @@ -555,6 +571,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, wmemset, | ||||||
|  |  			      CPU_FEATURE_USABLE (AVX2), | ||||||
|  |  			      __wmemset_avx2_unaligned) | ||||||
|  | +	      IFUNC_IMPL_ADD (array, i, wmemset,
 | ||||||
|  | +			      CPU_FEATURE_USABLE (AVX512VL),
 | ||||||
|  | +			      __wmemset_evex_unaligned)
 | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, wmemset, | ||||||
|  |  			      CPU_FEATURE_USABLE (AVX512F), | ||||||
|  |  			      __wmemset_avx512_unaligned)) | ||||||
|  | @@ -723,6 +742,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, __wmemset_chk, | ||||||
|  |  			      CPU_FEATURE_USABLE (AVX2), | ||||||
|  |  			      __wmemset_chk_avx2_unaligned) | ||||||
|  | +	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
 | ||||||
|  | +			      CPU_FEATURE_USABLE (AVX512VL),
 | ||||||
|  | +			      __wmemset_chk_evex_unaligned)
 | ||||||
 	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
 			      CPU_FEATURE_USABLE (AVX512F),
 			      __wmemset_chk_avx512_unaligned))
diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
index 708bd72e..6f31f4dc 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memset.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
@@ -27,6 +27,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms)
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms)
   attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
+  attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned)
   attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms)
@@ -56,10 +60,22 @@ IFUNC_SELECTOR (void)
 
   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
     {
-      if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
-	return OPTIMIZE (avx2_unaligned_erms);
-      else
-	return OPTIMIZE (avx2_unaligned);
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+	{
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	    return OPTIMIZE (evex_unaligned_erms);
+
+	  return OPTIMIZE (evex_unaligned);
+	}
+
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+	{
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	    return OPTIMIZE (avx2_unaligned_erms);
+
+	  return OPTIMIZE (avx2_unaligned);
+	}
     }
 
   if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
index eb242210..9290c4bf 100644
--- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h
+++ b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
@@ -20,6 +20,7 @@
 
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden;
 
 static inline void *
@@ -27,14 +28,18 @@ IFUNC_SELECTOR (void)
 {
   const struct cpu_features* cpu_features = __get_cpu_features ();
 
-  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
-      && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
     {
       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
-	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
+	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
+	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
 	return OPTIMIZE (avx512_unaligned);
-      else
+
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
+	return OPTIMIZE (evex_unaligned);
+
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
 	return OPTIMIZE (avx2_unaligned);
     }
 
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
new file mode 100644
index 00000000..ae0a4d6e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
@@ -0,0 +1,24 @@
+#if IS_IN (libc)
+# define VEC_SIZE	32
+# define XMM0		xmm16
+# define YMM0		ymm16
+# define VEC0		ymm16
+# define VEC(i)		VEC##i
+# define VMOVU		vmovdqu64
+# define VMOVA		vmovdqa64
+# define VZEROUPPER
+
+# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+  movq r, %rax; \
+  vpbroadcastb d, %VEC0
+
+# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+  movq r, %rax; \
+  vpbroadcastd d, %VEC0
+
+# define SECTION(p)		p##.evex
+# define MEMSET_SYMBOL(p,s)	p##_evex_##s
+# define WMEMSET_SYMBOL(p,s)	p##_evex_##s
+
+# include "memset-vec-unaligned-erms.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 9a0fd818..71e91a8f 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -34,6 +34,14 @@
 # define WMEMSET_CHK_SYMBOL(p,s)	WMEMSET_SYMBOL(p, s)
 #endif
 
+#ifndef XMM0
+# define XMM0				xmm0
+#endif
+
+#ifndef YMM0
+# define YMM0				ymm0
+#endif
+
 #ifndef VZEROUPPER
 # if VEC_SIZE > 16
 #  define VZEROUPPER			vzeroupper
@@ -67,7 +75,7 @@
 ENTRY (__bzero)
 	mov	%RDI_LP, %RAX_LP /* Set return value.  */
 	mov	%RSI_LP, %RDX_LP /* Set n.  */
-	pxor	%xmm0, %xmm0
+	pxor	%XMM0, %XMM0
 	jmp	L(entry_from_bzero)
 END (__bzero)
 weak_alias (__bzero, bzero)
@@ -223,7 +231,7 @@ L(less_vec):
 	cmpb	$16, %dl
 	jae	L(between_16_31)
 # endif
-	MOVQ	%xmm0, %rcx
+	MOVQ	%XMM0, %rcx
 	cmpb	$8, %dl
 	jae	L(between_8_15)
 	cmpb	$4, %dl
@@ -238,16 +246,16 @@ L(less_vec):
 # if VEC_SIZE > 32
 	/* From 32 to 63.  No branch when size == 32.  */
 L(between_32_63):
-	vmovdqu	%ymm0, -32(%rdi,%rdx)
-	vmovdqu	%ymm0, (%rdi)
+	VMOVU	%YMM0, -32(%rdi,%rdx)
+	VMOVU	%YMM0, (%rdi)
 	VZEROUPPER
 	ret
 # endif
 # if VEC_SIZE > 16
 	/* From 16 to 31.  No branch when size == 16.  */
 L(between_16_31):
-	vmovdqu	%xmm0, -16(%rdi,%rdx)
-	vmovdqu	%xmm0, (%rdi)
+	VMOVU	%XMM0, -16(%rdi,%rdx)
+	VMOVU	%XMM0, (%rdi)
 	VZEROUPPER
 	ret
 # endif
-- 
GitLab

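The IFUNC_SELECTOR changes above follow the GNU indirect-function (ifunc)
pattern: a resolver runs once when the dynamic linker binds the symbol,
inspects CPU features, and returns the implementation that every later call
will use.  Here is a minimal stand-alone sketch of that dispatch idea for
ELF/GCC targets, using __builtin_cpu_supports in place of glibc's internal
cpu_features structure; the my_memset_* names are hypothetical, not glibc
symbols:

#include <stddef.h>
#include <string.h>

/* Hypothetical variants; the real ones carry the vectorized bodies.
   Both delegate to memset here purely for brevity.  */
static void *
my_memset_generic (void *d, int c, size_t n)
{
  return memset (d, c, n);
}

static void *
my_memset_avx2 (void *d, int c, size_t n)
{
  return memset (d, c, n);
}

/* Resolver: runs once at symbol binding, like glibc's IFUNC_SELECTOR.  */
static void *(*resolve_my_memset (void)) (void *, int, size_t)
{
  __builtin_cpu_init ();	/* required before __builtin_cpu_supports */
  if (__builtin_cpu_supports ("avx2"))
    return my_memset_avx2;
  return my_memset_generic;
}

void *my_memset (void *, int, size_t)
     __attribute__ ((ifunc ("resolve_my_memset")));

Because the choice is made once at binding time, each call pays only an
indirect jump, which is why adding the EVEX variants above touches just the
selector headers and the implementation list.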
SOURCES/glibc-RHEL-15696-16.patch  (new file, 561 lines)
@@ -0,0 +1,561 @@
From 91264fe3577fe887b4860923fa6142b5274c8965 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Fri, 5 Mar 2021 07:20:28 -0800
Subject: [PATCH] x86-64: Add memcmp family functions with 256-bit EVEX
Content-type: text/plain; charset=UTF-8

Update ifunc-memcmp.h to select the function optimized with 256-bit EVEX
instructions using YMM16-YMM31 registers to avoid RTM abort with usable
AVX512VL, AVX512BW and MOVBE since VZEROUPPER isn't needed at function
exit.
---
 sysdeps/x86_64/multiarch/Makefile             |   4 +-
 sysdeps/x86_64/multiarch/ifunc-impl-list.c    |  10 +
 sysdeps/x86_64/multiarch/ifunc-memcmp.h       |  13 +-
 sysdeps/x86_64/multiarch/memcmp-evex-movbe.S  | 440 ++++++++++++++++++
 sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S |   4 +
 5 files changed, 467 insertions(+), 4 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
 create mode 100644 sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 1cc0a10e..9d79b138 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -41,6 +41,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
 		   memset-avx2-unaligned-erms \
 		   memset-avx512-unaligned-erms \
 		   memchr-evex \
+		   memcmp-evex-movbe \
 		   memmove-evex-unaligned-erms \
 		   memrchr-evex \
 		   memset-evex-unaligned-erms \
@@ -81,7 +82,8 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
 		   wcsncmp-evex \
 		   wcsnlen-evex \
 		   wcsrchr-evex \
-		   wmemchr-evex
+		   wmemchr-evex \
+		   wmemcmp-evex-movbe
 endif
 
 ifeq ($(subdir),debug)
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 7cf83485..c8da910e 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -56,6 +56,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      (CPU_FEATURE_USABLE (AVX2)
 			       && CPU_FEATURE_USABLE (MOVBE)),
 			      __memcmp_avx2_movbe)
+	      IFUNC_IMPL_ADD (array, i, memcmp,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (MOVBE)),
+			      __memcmp_evex_movbe)
 	      IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1),
 			      __memcmp_sse4_1)
 	      IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3),
@@ -558,6 +563,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      (CPU_FEATURE_USABLE (AVX2)
 			       && CPU_FEATURE_USABLE (MOVBE)),
 			      __wmemcmp_avx2_movbe)
+	      IFUNC_IMPL_ADD (array, i, wmemcmp,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (MOVBE)),
+			      __wmemcmp_evex_movbe)
 	      IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1),
 			      __wmemcmp_sse4_1)
 	      IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3),
diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
index 6c1f3153..3ca1f0a6 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
@@ -23,17 +23,24 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden;
 
 static inline void *
 IFUNC_SELECTOR (void)
 {
   const struct cpu_features* cpu_features = __get_cpu_features ();
 
-  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
-      && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
       && CPU_FEATURE_USABLE_P (cpu_features, MOVBE)
       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
-    return OPTIMIZE (avx2_movbe);
+    {
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+	return OPTIMIZE (evex_movbe);
+
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+	return OPTIMIZE (avx2_movbe);
+    }
 
   if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
     return OPTIMIZE (sse4_1);
diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
new file mode 100644
index 00000000..9c093972
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
@@ -0,0 +1,440 @@
+/* memcmp/wmemcmp optimized with 256-bit EVEX instructions.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+/* memcmp/wmemcmp is implemented as:
+   1. For size from 2 to 7 bytes, load as big endian with movbe and bswap
+      to avoid branches.
+   2. Use overlapping compare to avoid branch.
+   3. Use vector compare when size >= 4 bytes for memcmp or size >= 8
+      bytes for wmemcmp.
+   4. If size is 8 * VEC_SIZE or less, unroll the loop.
+   5. Compare 4 * VEC_SIZE at a time with the aligned first memory
+      area.
+   6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
+   7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
+   8. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
+
+# include <sysdep.h>
+
+# ifndef MEMCMP
+#  define MEMCMP	__memcmp_evex_movbe
+# endif
+
+# define VMOVU		vmovdqu64
+
+# ifdef USE_AS_WMEMCMP
+#  define VPCMPEQ	vpcmpeqd
+# else
+#  define VPCMPEQ	vpcmpeqb
+# endif
+
+# define XMM1		xmm17
+# define XMM2		xmm18
+# define YMM1		ymm17
+# define YMM2		ymm18
+# define YMM3		ymm19
+# define YMM4		ymm20
+# define YMM5		ymm21
+# define YMM6		ymm22
+
+# define VEC_SIZE 32
+# ifdef USE_AS_WMEMCMP
+#  define VEC_MASK 0xff
+#  define XMM_MASK 0xf
+# else
+#  define VEC_MASK 0xffffffff
+#  define XMM_MASK 0xffff
+# endif
+
+/* Warning!
+           wmemcmp has to use SIGNED comparison for elements.
+           memcmp has to use UNSIGNED comparison for elements.
+*/
+
+	.section .text.evex,"ax",@progbits
+ENTRY (MEMCMP)
+# ifdef USE_AS_WMEMCMP
+	shl	$2, %RDX_LP
+# elif defined __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%edx, %edx
+# endif
+	cmp	$VEC_SIZE, %RDX_LP
+	jb	L(less_vec)
+
+	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
+	VMOVU	(%rsi), %YMM2
+	VPCMPEQ (%rdi), %YMM2, %k1
+	kmovd	%k1, %eax
+	subl    $VEC_MASK, %eax
+	jnz	L(first_vec)
+
+	cmpq	$(VEC_SIZE * 2), %rdx
+	jbe	L(last_vec)
+
+	/* More than 2 * VEC.  */
+	cmpq	$(VEC_SIZE * 8), %rdx
+	ja	L(more_8x_vec)
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jb	L(last_4x_vec)
+
+	/* From 4 * VEC to 8 * VEC, inclusively. */
+	VMOVU	(%rsi), %YMM1
+	VPCMPEQ (%rdi), %YMM1, %k1
+
+	VMOVU	VEC_SIZE(%rsi), %YMM2
+	VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
+
+	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
+	VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
+
+	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
+	VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
+
+	kandd	%k1, %k2, %k5
+	kandd	%k3, %k4, %k6
+	kandd	%k5, %k6, %k6
+
+	kmovd	%k6, %eax
+	cmpl	$VEC_MASK, %eax
+	jne	L(4x_vec_end)
+
+	leaq	-(4 * VEC_SIZE)(%rdi, %rdx), %rdi
+	leaq	-(4 * VEC_SIZE)(%rsi, %rdx), %rsi
+	VMOVU	(%rsi), %YMM1
+	VPCMPEQ (%rdi), %YMM1, %k1
+
+	VMOVU	VEC_SIZE(%rsi), %YMM2
+	VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
+	kandd	%k1, %k2, %k5
+
+	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
+	VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
+	kandd	%k3, %k5, %k5
+
+	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
+	VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
+	kandd	%k4, %k5, %k5
+
+	kmovd	%k5, %eax
+	cmpl	$VEC_MASK, %eax
+	jne	L(4x_vec_end)
+	xorl	%eax, %eax
+	ret
+
+	.p2align 4
+L(last_2x_vec):
+	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
+	VMOVU	(%rsi), %YMM2
+	VPCMPEQ (%rdi), %YMM2, %k2
+	kmovd	%k2, %eax
+	subl    $VEC_MASK, %eax
+	jnz	L(first_vec)
+
+L(last_vec):
+	/* Use overlapping loads to avoid branches.  */
+	leaq	-VEC_SIZE(%rdi, %rdx), %rdi
+	leaq	-VEC_SIZE(%rsi, %rdx), %rsi
+	VMOVU	(%rsi), %YMM2
+	VPCMPEQ (%rdi), %YMM2, %k2
+	kmovd	%k2, %eax
+	subl    $VEC_MASK, %eax
+	jnz	L(first_vec)
+	ret
+
+	.p2align 4
+L(first_vec):
+	/* A byte or int32 is different within 16 or 32 bytes.  */
+	tzcntl	%eax, %ecx
+# ifdef USE_AS_WMEMCMP
+	xorl	%eax, %eax
+	movl	(%rdi, %rcx, 4), %edx
+	cmpl	(%rsi, %rcx, 4), %edx
+L(wmemcmp_return):
+	setl	%al
+	negl	%eax
+	orl	$1, %eax
+# else
+	movzbl	(%rdi, %rcx), %eax
+	movzbl	(%rsi, %rcx), %edx
+	sub	%edx, %eax
+# endif
+	ret
+
+# ifdef USE_AS_WMEMCMP
+	.p2align 4
+L(4):
+	xorl	%eax, %eax
+	movl	(%rdi), %edx
+	cmpl	(%rsi), %edx
+	jne	L(wmemcmp_return)
+	ret
+# else
+	.p2align 4
+L(between_4_7):
+	/* Load as big endian with overlapping movbe to avoid branches.  */
+	movbe	(%rdi), %eax
+	movbe	(%rsi), %ecx
+	shlq	$32, %rax
+	shlq	$32, %rcx
+	movbe	-4(%rdi, %rdx), %edi
+	movbe	-4(%rsi, %rdx), %esi
+	orq	%rdi, %rax
+	orq	%rsi, %rcx
+	subq	%rcx, %rax
+	je	L(exit)
+	sbbl	%eax, %eax
+	orl	$1, %eax
+	ret
+
+	.p2align 4
+L(exit):
+	ret
+
+	.p2align 4
+L(between_2_3):
+	/* Load as big endian to avoid branches.  */
+	movzwl	(%rdi), %eax
+	movzwl	(%rsi), %ecx
+	shll	$8, %eax
+	shll	$8, %ecx
+	bswap	%eax
+	bswap	%ecx
+	movb	-1(%rdi, %rdx), %al
+	movb	-1(%rsi, %rdx), %cl
+	/* Subtraction is okay because the upper 8 bits are zero.  */
+	subl	%ecx, %eax
+	ret
+
+	.p2align 4
+L(1):
+	movzbl	(%rdi), %eax
+	movzbl	(%rsi), %ecx
+	subl	%ecx, %eax
+	ret
+# endif
+
+	.p2align 4
+L(zero):
+	xorl	%eax, %eax
+	ret
+
+	.p2align 4
+L(less_vec):
+# ifdef USE_AS_WMEMCMP
+	/* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes.  */
+	cmpb	$4, %dl
+	je	L(4)
+	jb	L(zero)
+# else
+	cmpb	$1, %dl
+	je	L(1)
+	jb	L(zero)
+	cmpb	$4, %dl
+	jb	L(between_2_3)
+	cmpb	$8, %dl
+	jb	L(between_4_7)
+# endif
+	cmpb	$16, %dl
+	jae	L(between_16_31)
+	/* It is between 8 and 15 bytes.  */
+	vmovq	(%rdi), %XMM1
+	vmovq	(%rsi), %XMM2
+	VPCMPEQ %XMM1, %XMM2, %k2
+	kmovw	%k2, %eax
+	subl    $XMM_MASK, %eax
+	jnz	L(first_vec)
+	/* Use overlapping loads to avoid branches.  */
+	leaq	-8(%rdi, %rdx), %rdi
+	leaq	-8(%rsi, %rdx), %rsi
+	vmovq	(%rdi), %XMM1
+	vmovq	(%rsi), %XMM2
+	VPCMPEQ %XMM1, %XMM2, %k2
+	kmovw	%k2, %eax
+	subl    $XMM_MASK, %eax
+	jnz	L(first_vec)
+	ret
+
+	.p2align 4
+L(between_16_31):
+	/* From 16 to 31 bytes.  No branch when size == 16.  */
+	VMOVU	(%rsi), %XMM2
+	VPCMPEQ (%rdi), %XMM2, %k2
+	kmovw	%k2, %eax
+	subl    $XMM_MASK, %eax
+	jnz	L(first_vec)
+
+	/* Use overlapping loads to avoid branches.  */
+	leaq	-16(%rdi, %rdx), %rdi
+	leaq	-16(%rsi, %rdx), %rsi
+	VMOVU	(%rsi), %XMM2
+	VPCMPEQ (%rdi), %XMM2, %k2
+	kmovw	%k2, %eax
+	subl    $XMM_MASK, %eax
+	jnz	L(first_vec)
+	ret
+
+	.p2align 4
+L(more_8x_vec):
+	/* More than 8 * VEC.  Check the first VEC.  */
+	VMOVU	(%rsi), %YMM2
+	VPCMPEQ (%rdi), %YMM2, %k2
+	kmovd	%k2, %eax
+	subl    $VEC_MASK, %eax
+	jnz	L(first_vec)
+
+	/* Align the first memory area for aligned loads in the loop.
+	   Compute how much the first memory area is misaligned.  */
+	movq	%rdi, %rcx
+	andl	$(VEC_SIZE - 1), %ecx
+	/* Get the negative of offset for alignment.  */
+	subq	$VEC_SIZE, %rcx
+	/* Adjust the second memory area.  */
+	subq	%rcx, %rsi
+	/* Adjust the first memory area which should be aligned now.  */
+	subq	%rcx, %rdi
+	/* Adjust length.  */
+	addq	%rcx, %rdx
+
+L(loop_4x_vec):
+	/* Compare 4 * VEC at a time forward.  */
+	VMOVU	(%rsi), %YMM1
+	VPCMPEQ (%rdi), %YMM1, %k1
+
+	VMOVU	VEC_SIZE(%rsi), %YMM2
+	VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
+	kandd	%k2, %k1, %k5
+
+	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
+	VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
+	kandd	%k3, %k5, %k5
+
+	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
+	VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
+	kandd	%k4, %k5, %k5
+
+	kmovd	%k5, %eax
+	cmpl	$VEC_MASK, %eax
+	jne	L(4x_vec_end)
+
+	addq	$(VEC_SIZE * 4), %rdi
+	addq	$(VEC_SIZE * 4), %rsi
+
+	subq	$(VEC_SIZE * 4), %rdx
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jae	L(loop_4x_vec)
+
+	/* Less than 4 * VEC.  */
+	cmpq	$VEC_SIZE, %rdx
+	jbe	L(last_vec)
+	cmpq	$(VEC_SIZE * 2), %rdx
+	jbe	L(last_2x_vec)
+
+L(last_4x_vec):
+	/* From 2 * VEC to 4 * VEC. */
+	VMOVU	(%rsi), %YMM2
+	VPCMPEQ (%rdi), %YMM2, %k2
+	kmovd	%k2, %eax
+	subl    $VEC_MASK, %eax
+	jnz	L(first_vec)
+
+	addq	$VEC_SIZE, %rdi
+	addq	$VEC_SIZE, %rsi
+	VMOVU	(%rsi), %YMM2
+	VPCMPEQ (%rdi), %YMM2, %k2
+	kmovd	%k2, %eax
+	subl    $VEC_MASK, %eax
+	jnz	L(first_vec)
+
+	/* Use overlapping loads to avoid branches.  */
+	leaq	-(3 * VEC_SIZE)(%rdi, %rdx), %rdi
+	leaq	-(3 * VEC_SIZE)(%rsi, %rdx), %rsi
+	VMOVU	(%rsi), %YMM2
+	VPCMPEQ (%rdi), %YMM2, %k2
+	kmovd	%k2, %eax
+	subl    $VEC_MASK, %eax
+	jnz	L(first_vec)
+
+	addq	$VEC_SIZE, %rdi
+	addq	$VEC_SIZE, %rsi
+	VMOVU	(%rsi), %YMM2
+	VPCMPEQ (%rdi), %YMM2, %k2
+	kmovd	%k2, %eax
+	subl    $VEC_MASK, %eax
+	jnz	L(first_vec)
+	ret
+
+	.p2align 4
+L(4x_vec_end):
+	kmovd	%k1, %eax
+	subl	$VEC_MASK, %eax
+	jnz	L(first_vec)
+	kmovd	%k2, %eax
+	subl	$VEC_MASK, %eax
+	jnz	L(first_vec_x1)
+	kmovd	%k3, %eax
+	subl	$VEC_MASK, %eax
+	jnz	L(first_vec_x2)
+	kmovd	%k4, %eax
+	subl	$VEC_MASK, %eax
+	tzcntl	%eax, %ecx
+# ifdef USE_AS_WMEMCMP
+	xorl	%eax, %eax
+	movl	(VEC_SIZE * 3)(%rdi, %rcx, 4), %edx
+	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, 4), %edx
+	jmp	L(wmemcmp_return)
+# else
+	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
+	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
+	sub	%edx, %eax
+# endif
+	ret
+
+	.p2align 4
+L(first_vec_x1):
+	tzcntl	%eax, %ecx
+# ifdef USE_AS_WMEMCMP
+	xorl	%eax, %eax
+	movl	VEC_SIZE(%rdi, %rcx, 4), %edx
+	cmpl	VEC_SIZE(%rsi, %rcx, 4), %edx
+	jmp	L(wmemcmp_return)
+# else
+	movzbl	VEC_SIZE(%rdi, %rcx), %eax
+	movzbl	VEC_SIZE(%rsi, %rcx), %edx
+	sub	%edx, %eax
+# endif
+	ret
+
+	.p2align 4
+L(first_vec_x2):
+	tzcntl	%eax, %ecx
+# ifdef USE_AS_WMEMCMP
+	xorl	%eax, %eax
+	movl	(VEC_SIZE * 2)(%rdi, %rcx, 4), %edx
+	cmpl	(VEC_SIZE * 2)(%rsi, %rcx, 4), %edx
+	jmp	L(wmemcmp_return)
+# else
+	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
+	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
+	sub	%edx, %eax
+# endif
+	ret
+END (MEMCMP)
+#endif
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S
new file mode 100644
index 00000000..4726d74a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S
@@ -0,0 +1,4 @@
+#define MEMCMP __wmemcmp_evex_movbe
+#define USE_AS_WMEMCMP 1
+
+#include "memcmp-evex-movbe.S"
-- 
GitLab

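The "load as big endian with overlapping movbe" trick in L(between_4_7)
above is easier to follow in C.  This is an illustrative sketch of the same
idea, not glibc code, and it assumes a little-endian host with GCC/Clang
builtins: two overlapping 4-byte loads per buffer, byte-swapped so that an
unsigned integer comparison matches lexicographic byte order, removing all
data-dependent branches for lengths 4 through 7:

#include <stdint.h>
#include <string.h>

static int
memcmp_4to7 (const void *s1, const void *s2, size_t n)
{
  const unsigned char *a = s1, *b = s2;
  uint32_t a_head, a_tail, b_head, b_tail;

  memcpy (&a_head, a, 4);
  memcpy (&b_head, b, 4);
  memcpy (&a_tail, a + n - 4, 4);	/* overlaps the head when n < 8 */
  memcpy (&b_tail, b + n - 4, 4);

  /* Big-endian view: head bytes in the high half, tail in the low half,
     exactly like the movbe/shlq/orq sequence in the assembly.  */
  uint64_t x = ((uint64_t) __builtin_bswap32 (a_head) << 32)
	       | __builtin_bswap32 (a_tail);
  uint64_t y = ((uint64_t) __builtin_bswap32 (b_head) << 32)
	       | __builtin_bswap32 (b_tail);

  return (x > y) - (x < y);
}

For the vector paths the k-mask compares play the same role: kandd folds
four VPCMPEQ results into one mask, so a single cmpl against VEC_MASK
decides whether any byte in a 4 * VEC block differed.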
SOURCES/glibc-RHEL-15696-17.patch  (new file, 2568 lines)
(File diff suppressed because it is too large)
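The next patch adds the test side of the RTM story.  The harness it
installs (tst-string-rtm.h, shown in full below) reduces to the probe
sketched here: run a string function inside an _xbegin/_xend transaction
and count how often the transaction aborts.  This stand-alone sketch
assumes an RTM-capable CPU and a GCC build with -mrtm; the real tests gate
on CPU_FEATURE_USABLE (RTM) instead of assuming support:

#include <immintrin.h>
#include <stdio.h>
#include <string.h>

#define LOOP 3000

static char buf[1024];

int
main (void)
{
  unsigned int aborts = 0;
  for (unsigned int i = 0; i < LOOP; i++)
    {
      if (_xbegin () == _XBEGIN_STARTED)
	{
	  /* If the selected memset ends in VZEROUPPER, the transaction
	     aborts and control returns to _xbegin with an abort code
	     instead of _XBEGIN_STARTED, landing in the else branch.  */
	  memset (buf, 'a', sizeof buf);
	  _xend ();
	}
      else
	++aborts;
    }
  printf ("TSX abort rate: %.2f%%\n", 100.0 * aborts / LOOP);
  return 0;
}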
SOURCES/glibc-RHEL-15696-18.patch  (new file, 735 lines)
@@ -0,0 +1,735 @@
From 4bd660be40967cd69072f69ebc2ad32bfcc1f206 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Tue, 23 Feb 2021 06:33:10 -0800
Subject: [PATCH] x86: Add string/memory function tests in RTM region
Content-type: text/plain; charset=UTF-8

At function exit, AVX optimized string/memory functions have VZEROUPPER
which triggers RTM abort.  When such functions are called inside a
transactionally executing RTM region, RTM abort causes severe performance
degradation.  Add tests to verify that string/memory functions won't
cause RTM abort in RTM region.
---
 sysdeps/x86/Makefile          | 23 +++++++++++
 sysdeps/x86/tst-memchr-rtm.c  | 54 ++++++++++++++++++++++++++
 sysdeps/x86/tst-memcmp-rtm.c  | 52 +++++++++++++++++++++++++
 sysdeps/x86/tst-memmove-rtm.c | 53 ++++++++++++++++++++++++++
 sysdeps/x86/tst-memrchr-rtm.c | 54 ++++++++++++++++++++++++++
 sysdeps/x86/tst-memset-rtm.c  | 45 ++++++++++++++++++++++
 sysdeps/x86/tst-strchr-rtm.c  | 54 ++++++++++++++++++++++++++
 sysdeps/x86/tst-strcpy-rtm.c  | 53 ++++++++++++++++++++++++++
 sysdeps/x86/tst-string-rtm.h  | 72 +++++++++++++++++++++++++++++++++++
 sysdeps/x86/tst-strlen-rtm.c  | 53 ++++++++++++++++++++++++++
 sysdeps/x86/tst-strncmp-rtm.c | 52 +++++++++++++++++++++++++
 sysdeps/x86/tst-strrchr-rtm.c | 53 ++++++++++++++++++++++++++
 12 files changed, 618 insertions(+)
 create mode 100644 sysdeps/x86/tst-memchr-rtm.c
 create mode 100644 sysdeps/x86/tst-memcmp-rtm.c
 create mode 100644 sysdeps/x86/tst-memmove-rtm.c
 create mode 100644 sysdeps/x86/tst-memrchr-rtm.c
 create mode 100644 sysdeps/x86/tst-memset-rtm.c
 create mode 100644 sysdeps/x86/tst-strchr-rtm.c
 create mode 100644 sysdeps/x86/tst-strcpy-rtm.c
 create mode 100644 sysdeps/x86/tst-string-rtm.h
 create mode 100644 sysdeps/x86/tst-strlen-rtm.c
 create mode 100644 sysdeps/x86/tst-strncmp-rtm.c
 create mode 100644 sysdeps/x86/tst-strrchr-rtm.c

diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
index 59e928e9..5be71ada 100644
--- a/sysdeps/x86/Makefile
+++ b/sysdeps/x86/Makefile
@@ -17,6 +17,29 @@ endif
 
 ifeq ($(subdir),string)
 sysdep_routines += cacheinfo
+
+tests += \
+  tst-memchr-rtm \
+  tst-memcmp-rtm \
+  tst-memmove-rtm \
+  tst-memrchr-rtm \
+  tst-memset-rtm \
+  tst-strchr-rtm \
+  tst-strcpy-rtm \
+  tst-strlen-rtm \
+  tst-strncmp-rtm \
+  tst-strrchr-rtm
+
+CFLAGS-tst-memchr-rtm.c += -mrtm
+CFLAGS-tst-memcmp-rtm.c += -mrtm
+CFLAGS-tst-memmove-rtm.c += -mrtm
+CFLAGS-tst-memrchr-rtm.c += -mrtm
+CFLAGS-tst-memset-rtm.c += -mrtm
+CFLAGS-tst-strchr-rtm.c += -mrtm
+CFLAGS-tst-strcpy-rtm.c += -mrtm
+CFLAGS-tst-strlen-rtm.c += -mrtm
+CFLAGS-tst-strncmp-rtm.c += -mrtm
+CFLAGS-tst-strrchr-rtm.c += -mrtm
 endif
 
 ifneq ($(enable-cet),no)
diff --git a/sysdeps/x86/tst-memchr-rtm.c b/sysdeps/x86/tst-memchr-rtm.c
new file mode 100644
index 00000000..e4749401
--- /dev/null
+++ b/sysdeps/x86/tst-memchr-rtm.c
@@ -0,0 +1,54 @@
+/* Test case for memchr inside a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+  memset (string1, 'a', STRING_SIZE);
+  string1[100] = 'c';
+  string1[STRING_SIZE - 100] = 'c';
+  char *p = memchr (string1, 'c', STRING_SIZE);
+  if (p == &string1[100])
+    return EXIT_SUCCESS;
+  else
+    return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+  char *p = memchr (string1, 'c', STRING_SIZE);
+  if (p == &string1[100])
+    return 0;
+  else
+    return 1;
+}
+
+static int
+do_test (void)
+{
+  return do_test_1 ("memchr", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-memcmp-rtm.c b/sysdeps/x86/tst-memcmp-rtm.c
new file mode 100644
index 00000000..e4c8a623
--- /dev/null
+++ b/sysdeps/x86/tst-memcmp-rtm.c
@@ -0,0 +1,52 @@
+/* Test case for memcmp inside a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+char string2[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+  memset (string1, 'a', STRING_SIZE);
+  memset (string2, 'a', STRING_SIZE);
+  if (memcmp (string1, string2, STRING_SIZE) == 0)
+    return EXIT_SUCCESS;
+  else
+    return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+  if (memcmp (string1, string2, STRING_SIZE) == 0)
+    return 0;
+  else
+    return 1;
+}
+
+static int
+do_test (void)
+{
+  return do_test_1 ("memcmp", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-memmove-rtm.c b/sysdeps/x86/tst-memmove-rtm.c
new file mode 100644
index 00000000..4bf97ef1
--- /dev/null
+++ b/sysdeps/x86/tst-memmove-rtm.c
@@ -0,0 +1,53 @@
+/* Test case for memmove inside a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+char string2[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+  memset (string1, 'a', STRING_SIZE);
+  if (memmove (string2, string1, STRING_SIZE) == string2
+      && memcmp (string2, string1, STRING_SIZE) == 0)
+    return EXIT_SUCCESS;
+  else
+    return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+  if (memmove (string2, string1, STRING_SIZE) == string2
+      && memcmp (string2, string1, STRING_SIZE) == 0)
+    return 0;
+  else
+    return 1;
+}
+
+static int
+do_test (void)
+{
+  return do_test_1 ("memmove", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-memrchr-rtm.c b/sysdeps/x86/tst-memrchr-rtm.c
new file mode 100644
index 00000000..a57a5a8e
--- /dev/null
+++ b/sysdeps/x86/tst-memrchr-rtm.c
@@ -0,0 +1,54 @@
+/* Test case for memrchr inside a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+  memset (string1, 'a', STRING_SIZE);
+  string1[100] = 'c';
+  string1[STRING_SIZE - 100] = 'c';
+  char *p = memrchr (string1, 'c', STRING_SIZE);
+  if (p == &string1[STRING_SIZE - 100])
+    return EXIT_SUCCESS;
+  else
+    return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+  char *p = memrchr (string1, 'c', STRING_SIZE);
+  if (p == &string1[STRING_SIZE - 100])
+    return 0;
+  else
+    return 1;
+}
+
+static int
+do_test (void)
+{
+  return do_test_1 ("memrchr", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-memset-rtm.c b/sysdeps/x86/tst-memset-rtm.c
new file mode 100644
index 00000000..bf343a4d
--- /dev/null
+++ b/sysdeps/x86/tst-memset-rtm.c
@@ -0,0 +1,45 @@
+/* Test case for memset inside a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+  memset (string1, 'a', STRING_SIZE);
+  return EXIT_SUCCESS;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+  memset (string1, 'a', STRING_SIZE);
+  return 0;
+}
+
+static int
+do_test (void)
+{
+  return do_test_1 ("memset", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-strchr-rtm.c b/sysdeps/x86/tst-strchr-rtm.c
new file mode 100644
index 00000000..a82e29c0
--- /dev/null
+++ b/sysdeps/x86/tst-strchr-rtm.c
@@ -0,0 +1,54 @@
+/* Test case for strchr inside a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+  memset (string1, 'a', STRING_SIZE - 1);
+  string1[100] = 'c';
+  string1[STRING_SIZE - 100] = 'c';
+  char *p = strchr (string1, 'c');
+  if (p == &string1[100])
+    return EXIT_SUCCESS;
+  else
+    return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+  char *p = strchr (string1, 'c');
+  if (p == &string1[100])
+    return 0;
+  else
+    return 1;
+}
+
+static int
+do_test (void)
+{
+  return do_test_1 ("strchr", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-strcpy-rtm.c b/sysdeps/x86/tst-strcpy-rtm.c
new file mode 100644
index 00000000..2b2a583f
--- /dev/null
+++ b/sysdeps/x86/tst-strcpy-rtm.c
@@ -0,0 +1,53 @@
+/* Test case for strcpy inside a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+char string2[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+  memset (string1, 'a', STRING_SIZE - 1);
+  if (strcpy (string2, string1) == string2
+      && strcmp (string2, string1) == 0)
+    return EXIT_SUCCESS;
+  else
+    return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+  if (strcpy (string2, string1) == string2
+      && strcmp (string2, string1) == 0)
+    return 0;
+  else
+    return 1;
+}
+
+static int
+do_test (void)
+{
+  return do_test_1 ("strcpy", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-string-rtm.h b/sysdeps/x86/tst-string-rtm.h
new file mode 100644
index 00000000..d2470afa
--- /dev/null
+++ b/sysdeps/x86/tst-string-rtm.h
@@ -0,0 +1,72 @@
+/* Test string function in a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <string.h>
+#include <x86intrin.h>
+#include <sys/platform/x86.h>
+#include <support/check.h>
+#include <support/test-driver.h>
+
+static int
+do_test_1 (const char *name, unsigned int loop, int (*prepare) (void),
+	   int (*function) (void))
+{
+  if (!CPU_FEATURE_USABLE (RTM))
+    return EXIT_UNSUPPORTED;
+
+  int status = prepare ();
+  if (status != EXIT_SUCCESS)
+    return status;
+
+  unsigned int i;
+  unsigned int naborts = 0;
+  unsigned int failed = 0;
+  for (i = 0; i < loop; i++)
+    {
+      failed |= function ();
+      if (_xbegin() == _XBEGIN_STARTED)
+	{
+	  failed |= function ();
+	  _xend();
+	}
+      else
+	{
+	  failed |= function ();
+	  ++naborts;
+	}
+    }
+
+  if (failed)
+    FAIL_EXIT1 ("%s() failed", name);
+
+  if (naborts)
+    {
+      /* NB: Low single digit (<= 5%) noise-level aborts are normal for
+	 TSX.  */
+      double rate = 100 * ((double) naborts) / ((double) loop);
+      if (rate > 5)
+	FAIL_EXIT1 ("TSX abort rate: %.2f%% (%d out of %d)",
+		    rate, naborts, loop);
+    }
+
+  return EXIT_SUCCESS;
+}
+
+static int do_test (void);
+
+#include <support/test-driver.c>
diff --git a/sysdeps/x86/tst-strlen-rtm.c b/sysdeps/x86/tst-strlen-rtm.c
new file mode 100644
index 00000000..0dcf14db
--- /dev/null
+++ b/sysdeps/x86/tst-strlen-rtm.c
@@ -0,0 +1,53 @@
+/* Test case for strlen inside a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+  memset (string1, 'a', STRING_SIZE - 1);
+  string1[STRING_SIZE - 100] = '\0';
+  size_t len = strlen (string1);
+  if (len == STRING_SIZE - 100)
+    return EXIT_SUCCESS;
+  else
+    return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+  size_t len = strlen (string1);
+  if (len == STRING_SIZE - 100)
+    return 0;
+  else
+    return 1;
+}
+
+static int
+do_test (void)
+{
+  return do_test_1 ("strlen", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
new file mode 100644
index 00000000..236ad951
--- /dev/null
+++ b/sysdeps/x86/tst-strncmp-rtm.c
@@ -0,0 +1,52 @@
+/* Test case for strncmp inside a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+char string2[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+  memset (string1, 'a', STRING_SIZE - 1);
+  memset (string2, 'a', STRING_SIZE - 1);
+  if (strncmp (string1, string2, STRING_SIZE) == 0)
+    return EXIT_SUCCESS;
+  else
+    return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+  if (strncmp (string1, string2, STRING_SIZE) == 0)
+    return 0;
+  else
+    return 1;
+}
+
+static int
+do_test (void)
+{
+  return do_test_1 ("strncmp", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-strrchr-rtm.c b/sysdeps/x86/tst-strrchr-rtm.c
new file mode 100644
index 00000000..e32bfaf5
--- /dev/null
+++ b/sysdeps/x86/tst-strrchr-rtm.c
@@ -0,0 +1,53 @@
+/* Test case for strrchr inside a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
 | ||||||
|  | +
 | ||||||
|  | +   The GNU C Library is distributed in the hope that it will be useful,
 | ||||||
|  | +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 | ||||||
|  | +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 | ||||||
|  | +   Lesser General Public License for more details.
 | ||||||
|  | +
 | ||||||
|  | +   You should have received a copy of the GNU Lesser General Public
 | ||||||
|  | +   License along with the GNU C Library; if not, see
 | ||||||
|  | +   <https://www.gnu.org/licenses/>.  */
 | ||||||
|  | +
 | ||||||
|  | +#include <tst-string-rtm.h>
 | ||||||
|  | +
 | ||||||
|  | +#define LOOP 3000
 | ||||||
|  | +#define STRING_SIZE 1024
 | ||||||
|  | +char string1[STRING_SIZE];
 | ||||||
|  | +
 | ||||||
|  | +__attribute__ ((noinline, noclone))
 | ||||||
|  | +static int
 | ||||||
|  | +prepare (void)
 | ||||||
|  | +{
 | ||||||
|  | +  memset (string1, 'a', STRING_SIZE - 1);
 | ||||||
|  | +  string1[STRING_SIZE - 100] = 'c';
 | ||||||
|  | +  char *p = strrchr (string1, 'c');
 | ||||||
|  | +  if (p == &string1[STRING_SIZE - 100])
 | ||||||
|  | +    return EXIT_SUCCESS;
 | ||||||
|  | +  else
 | ||||||
|  | +    return EXIT_FAILURE;
 | ||||||
|  | +}
 | ||||||
|  | +
 | ||||||
|  | +__attribute__ ((noinline, noclone))
 | ||||||
|  | +static int
 | ||||||
|  | +function (void)
 | ||||||
|  | +{
 | ||||||
|  | +  char *p = strrchr (string1, 'c');
 | ||||||
|  | +  if (p == &string1[STRING_SIZE - 100])
 | ||||||
|  | +    return 0;
 | ||||||
|  | +  else
 | ||||||
|  | +    return 1;
 | ||||||
|  | +}
 | ||||||
|  | +
 | ||||||
|  | +static int
 | ||||||
|  | +do_test (void)
 | ||||||
|  | +{
 | ||||||
|  | +  return do_test_1 ("strrchr", LOOP, prepare, function);
 | ||||||
|  | +}
 | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
SOURCES/glibc-RHEL-15696-19.patch (Normal file, 148 lines)
							| @ -0,0 +1,148 @@ | |||||||
|  | From 4e2d8f352774b56078c34648b14a2412c38384f4 Mon Sep 17 00:00:00 2001 | ||||||
|  | From: "H.J. Lu" <hjl.tools@gmail.com> | ||||||
|  | Date: Sun, 7 Mar 2021 09:44:18 -0800 | ||||||
|  | Subject: [PATCH] x86-64: Use ZMM16-ZMM31 in AVX512 memset family functions | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | Update ifunc-memset.h/ifunc-wmemset.h to select the function optimized | ||||||
|  | with AVX512 instructions using ZMM16-ZMM31 registers to avoid RTM abort | ||||||
|  | with usable AVX512VL and AVX512BW since VZEROUPPER isn't needed at | ||||||
|  | function exit. | ||||||
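A minimal standalone C sketch of the selection order the hunks below install. It uses GCC's __builtin_cpu_supports instead of glibc's internal CPU_FEATURE_USABLE macros, and the function bodies are stand-ins for the real ifunc targets; glibc additionally distinguishes an ERMS flavour, which the builtin cannot probe.

#include <stdio.h>

/* Illustrative stand-ins for the real ifunc targets.  */
static void memset_avx512_unaligned (void)     { puts ("avx512_unaligned"); }
static void memset_avx512_no_vzeroupper (void) { puts ("avx512_no_vzeroupper"); }

int
main (void)
{
  /* Same shape as the patched IFUNC_SELECTOR: prefer the ZMM16-ZMM31
     variant whenever AVX512VL and AVX512BW are usable (no VZEROUPPER
     at exit, hence no RTM abort), otherwise fall back to the legacy
     no-vzeroupper variant.  */
  if (__builtin_cpu_supports ("avx512vl")
      && __builtin_cpu_supports ("avx512bw"))
    memset_avx512_unaligned ();
  else
    memset_avx512_no_vzeroupper ();
  return 0;
}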
|  | ---
 | ||||||
|  |  sysdeps/x86_64/multiarch/ifunc-impl-list.c       | 14 +++++++++----- | ||||||
|  |  sysdeps/x86_64/multiarch/ifunc-memset.h          | 13 ++++++++----- | ||||||
|  |  sysdeps/x86_64/multiarch/ifunc-wmemset.h         | 12 ++++++------ | ||||||
|  |  .../multiarch/memset-avx512-unaligned-erms.S     | 16 ++++++++-------- | ||||||
|  |  4 files changed, 31 insertions(+), 24 deletions(-) | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 | ||||||
|  | index c1efeec0..d969a156 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 | ||||||
|  | @@ -211,10 +211,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 | ||||||
|  |  			       && CPU_FEATURE_USABLE (AVX512BW)), | ||||||
|  |  			      __memset_chk_evex_unaligned_erms) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, __memset_chk, | ||||||
|  | -			      CPU_FEATURE_USABLE (AVX512F),
 | ||||||
|  | +			      (CPU_FEATURE_USABLE (AVX512VL)
 | ||||||
|  | +			       && CPU_FEATURE_USABLE (AVX512BW)),
 | ||||||
|  |  			      __memset_chk_avx512_unaligned_erms) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, __memset_chk, | ||||||
|  | -			      CPU_FEATURE_USABLE (AVX512F),
 | ||||||
|  | +			      (CPU_FEATURE_USABLE (AVX512VL)
 | ||||||
|  | +			       && CPU_FEATURE_USABLE (AVX512BW)),
 | ||||||
|  |  			      __memset_chk_avx512_unaligned) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, __memset_chk, | ||||||
|  |  			      CPU_FEATURE_USABLE (AVX512F), | ||||||
|  | @@ -252,10 +254,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 | ||||||
|  |  			       && CPU_FEATURE_USABLE (AVX512BW)), | ||||||
|  |  			      __memset_evex_unaligned_erms) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, memset, | ||||||
|  | -			      CPU_FEATURE_USABLE (AVX512F),
 | ||||||
|  | +			      (CPU_FEATURE_USABLE (AVX512VL)
 | ||||||
|  | +			       && CPU_FEATURE_USABLE (AVX512BW)),
 | ||||||
|  |  			      __memset_avx512_unaligned_erms) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, memset, | ||||||
|  | -			      CPU_FEATURE_USABLE (AVX512F),
 | ||||||
|  | +			      (CPU_FEATURE_USABLE (AVX512VL)
 | ||||||
|  | +			       && CPU_FEATURE_USABLE (AVX512BW)),
 | ||||||
|  |  			      __memset_avx512_unaligned) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, memset, | ||||||
|  |  			      CPU_FEATURE_USABLE (AVX512F), | ||||||
|  | @@ -719,7 +723,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 | ||||||
|  |  			      CPU_FEATURE_USABLE (AVX512VL), | ||||||
|  |  			      __wmemset_evex_unaligned) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, wmemset, | ||||||
|  | -			      CPU_FEATURE_USABLE (AVX512F),
 | ||||||
|  | +			      CPU_FEATURE_USABLE (AVX512VL),
 | ||||||
|  |  			      __wmemset_avx512_unaligned)) | ||||||
|  |   | ||||||
|  |  #ifdef SHARED | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
 | ||||||
|  | index 6f3375cc..19795938 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/ifunc-memset.h
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
 | ||||||
|  | @@ -53,13 +53,16 @@ IFUNC_SELECTOR (void)
 | ||||||
|  |    if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F) | ||||||
|  |        && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)) | ||||||
|  |      { | ||||||
|  | -      if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
 | ||||||
|  | -	return OPTIMIZE (avx512_no_vzeroupper);
 | ||||||
|  | +      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
 | ||||||
|  | +	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
 | ||||||
|  | +	{
 | ||||||
|  | +	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
 | ||||||
|  | +	    return OPTIMIZE (avx512_unaligned_erms);
 | ||||||
|  |   | ||||||
|  | -      if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
 | ||||||
|  | -	return OPTIMIZE (avx512_unaligned_erms);
 | ||||||
|  | +	  return OPTIMIZE (avx512_unaligned);
 | ||||||
|  | +	}
 | ||||||
|  |   | ||||||
|  | -      return OPTIMIZE (avx512_unaligned);
 | ||||||
|  | +      return OPTIMIZE (avx512_no_vzeroupper);
 | ||||||
|  |      } | ||||||
|  |   | ||||||
|  |    if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)) | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
 | ||||||
|  | index bdc94c6c..98c5d406 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
 | ||||||
|  | @@ -33,13 +33,13 @@ IFUNC_SELECTOR (void)
 | ||||||
|  |    if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) | ||||||
|  |        && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) | ||||||
|  |      { | ||||||
|  | -      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
 | ||||||
|  | -	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
 | ||||||
|  | -	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
 | ||||||
|  | -	return OPTIMIZE (avx512_unaligned);
 | ||||||
|  | -
 | ||||||
|  |        if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)) | ||||||
|  | -	return OPTIMIZE (evex_unaligned);
 | ||||||
|  | +	{
 | ||||||
|  | +	  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
 | ||||||
|  | +	    return OPTIMIZE (avx512_unaligned);
 | ||||||
|  | +
 | ||||||
|  | +	  return OPTIMIZE (evex_unaligned);
 | ||||||
|  | +	}
 | ||||||
|  |   | ||||||
|  |        if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) | ||||||
|  |  	return OPTIMIZE (avx2_unaligned_rtm); | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
 | ||||||
|  | index 0783979c..22e7b187 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
 | ||||||
|  | @@ -1,22 +1,22 @@
 | ||||||
|  |  #if IS_IN (libc) | ||||||
|  |  # define VEC_SIZE	64 | ||||||
|  | -# define VEC(i)		zmm##i
 | ||||||
|  | +# define XMM0		xmm16
 | ||||||
|  | +# define YMM0		ymm16
 | ||||||
|  | +# define VEC0		zmm16
 | ||||||
|  | +# define VEC(i)		VEC##i
 | ||||||
|  |  # define VMOVU		vmovdqu64 | ||||||
|  |  # define VMOVA		vmovdqa64 | ||||||
|  | +# define VZEROUPPER
 | ||||||
|  |   | ||||||
|  |  # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ | ||||||
|  | -  vmovd d, %xmm0; \
 | ||||||
|  |    movq r, %rax; \ | ||||||
|  | -  vpbroadcastb %xmm0, %xmm0; \
 | ||||||
|  | -  vpbroadcastq %xmm0, %zmm0
 | ||||||
|  | +  vpbroadcastb d, %VEC0
 | ||||||
|  |   | ||||||
|  |  # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ | ||||||
|  | -  vmovd d, %xmm0; \
 | ||||||
|  |    movq r, %rax; \ | ||||||
|  | -  vpbroadcastd %xmm0, %xmm0; \
 | ||||||
|  | -  vpbroadcastq %xmm0, %zmm0
 | ||||||
|  | +  vpbroadcastd d, %VEC0
 | ||||||
|  |   | ||||||
|  | -# define SECTION(p)		p##.avx512
 | ||||||
|  | +# define SECTION(p)		p##.evex512
 | ||||||
|  |  # define MEMSET_SYMBOL(p,s)	p##_avx512_##s | ||||||
|  |  # define WMEMSET_SYMBOL(p,s)	p##_avx512_##s | ||||||
|  |   | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
SOURCES/glibc-RHEL-15696-2.patch (Normal file, 230 lines)
							| @ -0,0 +1,230 @@ | |||||||
|  | From b304fc201d2f6baf52ea790df8643e99772243cd Mon Sep 17 00:00:00 2001 | ||||||
|  | From: "H.J. Lu" <hjl.tools@gmail.com> | ||||||
|  | Date: Mon, 21 Jan 2019 11:25:56 -0800 | ||||||
|  | Subject: [PATCH] x86-64 memcmp/wmemcmp: Properly handle the length parameter | ||||||
|  |  [BZ# 24097] | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | On x32, the size_t parameter may be passed in the lower 32 bits of a | ||||||
|  | 64-bit register with the non-zero upper 32 bits.  The string/memory | ||||||
|  | functions written in assembly can only use the lower 32 bits of a | ||||||
|  | 64-bit register as length or must clear the upper 32 bits before using | ||||||
|  | the full 64-bit register for length. | ||||||
|  | 
 | ||||||
|  | This patch fixes memcmp/wmemcmp for x32.  Tested on x86-64 and x32.  On | ||||||
|  | x86-64, libc.so is the same with and without the fix. | ||||||
|  | 
 | ||||||
|  | 	[BZ# 24097] | ||||||
|  | 	CVE-2019-6488 | ||||||
|  | 	* sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S: Use RDX_LP for | ||||||
|  | 	length.  Clear the upper 32 bits of RDX register. | ||||||
|  | 	* sysdeps/x86_64/multiarch/memcmp-sse4.S: Likewise. | ||||||
|  | 	* sysdeps/x86_64/multiarch/memcmp-ssse3.S: Likewise. | ||||||
|  | 	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcmp and | ||||||
|  | 	tst-size_t-wmemcmp. | ||||||
|  | 	* sysdeps/x86_64/x32/tst-size_t-memcmp.c: New file. | ||||||
|  | 	* sysdeps/x86_64/x32/tst-size_t-wmemcmp.c: Likewise. | ||||||
|  | ---
 | ||||||
|  |  sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S |  7 +- | ||||||
|  |  sysdeps/x86_64/multiarch/memcmp-sse4.S       |  9 ++- | ||||||
|  |  sysdeps/x86_64/multiarch/memcmp-ssse3.S      |  7 +- | ||||||
|  |  sysdeps/x86_64/x32/Makefile                  |  4 +- | ||||||
|  |  sysdeps/x86_64/x32/tst-size_t-memcmp.c       | 76 ++++++++++++++++++++ | ||||||
|  |  sysdeps/x86_64/x32/tst-size_t-wmemcmp.c      | 20 ++++++ | ||||||
|  |  6 files changed, 114 insertions(+), 9 deletions(-) | ||||||
|  |  create mode 100644 sysdeps/x86_64/x32/tst-size_t-memcmp.c | ||||||
|  |  create mode 100644 sysdeps/x86_64/x32/tst-size_t-wmemcmp.c | ||||||
|  | 
 | ||||||
|  | Conflicts: | ||||||
|  | 	ChangeLog | ||||||
|  | 	(removed) | ||||||
|  | 
 | ||||||
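The failure mode is easiest to see as plain integer truncation: under the x32 ILP32 ABI a 32-bit size_t arrives in a 64-bit register whose upper half may hold stale bits, and the "movl %edx, %edx" the patch inserts is the assembly spelling of the cast below. A standalone sketch with made-up values:

#include <stdint.h>
#include <stdio.h>

/* C equivalent of "movl %edx, %edx": writing a 32-bit register
   implicitly zeroes bits 32-63, so the length is truncated to its
   low 32 bits before the assembly uses the full register.  */
static uint64_t
clear_upper_32 (uint64_t reg)
{
  return (uint32_t) reg;
}

int
main (void)
{
  /* Illustrative value: a 16-byte length with garbage above bit 31.  */
  uint64_t rdx = 0xdeadbeef00000010ULL;

  printf ("raw register:  %#llx\n", (unsigned long long) rdx);
  printf ("usable length: %#llx\n",
	  (unsigned long long) clear_upper_32 (rdx));	/* 0x10 */
  return 0;
}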
|  | diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
 | ||||||
|  | index 30f764c3..e3a35b89 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
 | ||||||
|  | @@ -58,9 +58,12 @@
 | ||||||
|  |  	.section .text.avx,"ax",@progbits | ||||||
|  |  ENTRY (MEMCMP) | ||||||
|  |  # ifdef USE_AS_WMEMCMP | ||||||
|  | -	shl	$2, %rdx
 | ||||||
|  | +	shl	$2, %RDX_LP
 | ||||||
|  | +# elif defined __ILP32__
 | ||||||
|  | +	/* Clear the upper 32 bits.  */
 | ||||||
|  | +	movl	%edx, %edx
 | ||||||
|  |  # endif | ||||||
|  | -	cmpq	$VEC_SIZE, %rdx
 | ||||||
|  | +	cmp	$VEC_SIZE, %RDX_LP
 | ||||||
|  |  	jb	L(less_vec) | ||||||
|  |   | ||||||
|  |  	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */ | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
 | ||||||
|  | index 8e164f2c..302900f5 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S
 | ||||||
|  | @@ -42,13 +42,16 @@
 | ||||||
|  |  	.section .text.sse4.1,"ax",@progbits | ||||||
|  |  ENTRY (MEMCMP) | ||||||
|  |  # ifdef USE_AS_WMEMCMP | ||||||
|  | -	shl	$2, %rdx
 | ||||||
|  | +	shl	$2, %RDX_LP
 | ||||||
|  | +# elif defined __ILP32__
 | ||||||
|  | +	/* Clear the upper 32 bits.  */
 | ||||||
|  | +	mov	%edx, %edx
 | ||||||
|  |  # endif | ||||||
|  |  	pxor	%xmm0, %xmm0 | ||||||
|  | -	cmp	$79, %rdx
 | ||||||
|  | +	cmp	$79, %RDX_LP
 | ||||||
|  |  	ja	L(79bytesormore) | ||||||
|  |  # ifndef USE_AS_WMEMCMP | ||||||
|  | -	cmp	$1, %rdx
 | ||||||
|  | +	cmp	$1, %RDX_LP
 | ||||||
|  |  	je	L(firstbyte) | ||||||
|  |  # endif | ||||||
|  |  	add	%rdx, %rsi | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
 | ||||||
|  | index 6f76c641..69d030fc 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/memcmp-ssse3.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
 | ||||||
|  | @@ -33,9 +33,12 @@
 | ||||||
|  |  	atom_text_section | ||||||
|  |  ENTRY (MEMCMP) | ||||||
|  |  # ifdef USE_AS_WMEMCMP | ||||||
|  | -	shl	$2, %rdx
 | ||||||
|  | -	test	%rdx, %rdx
 | ||||||
|  | +	shl	$2, %RDX_LP
 | ||||||
|  | +	test	%RDX_LP, %RDX_LP
 | ||||||
|  |  	jz	L(equal) | ||||||
|  | +# elif defined __ILP32__
 | ||||||
|  | +	/* Clear the upper 32 bits.  */
 | ||||||
|  | +	mov	%edx, %edx
 | ||||||
|  |  # endif | ||||||
|  |  	mov	%rdx, %rcx | ||||||
|  |  	mov	%rdi, %rdx | ||||||
|  | diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
 | ||||||
|  | index 7d528889..ddec7f04 100644
 | ||||||
|  | --- a/sysdeps/x86_64/x32/Makefile
 | ||||||
|  | +++ b/sysdeps/x86_64/x32/Makefile
 | ||||||
|  | @@ -6,9 +6,9 @@ CFLAGS-s_llround.c += -fno-builtin-lround
 | ||||||
|  |  endif | ||||||
|  |   | ||||||
|  |  ifeq ($(subdir),string) | ||||||
|  | -tests += tst-size_t-memchr
 | ||||||
|  | +tests += tst-size_t-memchr tst-size_t-memcmp
 | ||||||
|  |  endif | ||||||
|  |   | ||||||
|  |  ifeq ($(subdir),wcsmbs) | ||||||
|  | -tests += tst-size_t-wmemchr
 | ||||||
|  | +tests += tst-size_t-wmemchr tst-size_t-wmemcmp
 | ||||||
|  |  endif | ||||||
|  | diff --git a/sysdeps/x86_64/x32/tst-size_t-memcmp.c b/sysdeps/x86_64/x32/tst-size_t-memcmp.c
 | ||||||
|  | new file mode 100644 | ||||||
|  | index 00000000..9bd6fdb4
 | ||||||
|  | --- /dev/null
 | ||||||
|  | +++ b/sysdeps/x86_64/x32/tst-size_t-memcmp.c
 | ||||||
|  | @@ -0,0 +1,76 @@
 | ||||||
|  | +/* Test memcmp with size_t in the lower 32 bits of 64-bit register.
 | ||||||
|  | +   Copyright (C) 2019 Free Software Foundation, Inc.
 | ||||||
|  | +   This file is part of the GNU C Library.
 | ||||||
|  | +
 | ||||||
|  | +   The GNU C Library is free software; you can redistribute it and/or
 | ||||||
|  | +   modify it under the terms of the GNU Lesser General Public
 | ||||||
|  | +   License as published by the Free Software Foundation; either
 | ||||||
|  | +   version 2.1 of the License, or (at your option) any later version.
 | ||||||
|  | +
 | ||||||
|  | +   The GNU C Library is distributed in the hope that it will be useful,
 | ||||||
|  | +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 | ||||||
|  | +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 | ||||||
|  | +   Lesser General Public License for more details.
 | ||||||
|  | +
 | ||||||
|  | +   You should have received a copy of the GNU Lesser General Public
 | ||||||
|  | +   License along with the GNU C Library; if not, see
 | ||||||
|  | +   <http://www.gnu.org/licenses/>.  */
 | ||||||
|  | +
 | ||||||
|  | +#define TEST_MAIN
 | ||||||
|  | +#ifdef WIDE
 | ||||||
|  | +# define TEST_NAME "wmemcmp"
 | ||||||
|  | +#else
 | ||||||
|  | +# define TEST_NAME "memcmp"
 | ||||||
|  | +#endif
 | ||||||
|  | +
 | ||||||
|  | +#include "test-size_t.h"
 | ||||||
|  | +
 | ||||||
|  | +#ifdef WIDE
 | ||||||
|  | +# include <inttypes.h>
 | ||||||
|  | +# include <wchar.h>
 | ||||||
|  | +
 | ||||||
|  | +# define MEMCMP wmemcmp
 | ||||||
|  | +# define CHAR wchar_t
 | ||||||
|  | +#else
 | ||||||
|  | +# define MEMCMP memcmp
 | ||||||
|  | +# define CHAR char
 | ||||||
|  | +#endif
 | ||||||
|  | +
 | ||||||
|  | +IMPL (MEMCMP, 1)
 | ||||||
|  | +
 | ||||||
|  | +typedef int (*proto_t) (const CHAR *, const CHAR *, size_t);
 | ||||||
|  | +
 | ||||||
|  | +static int
 | ||||||
|  | +__attribute__ ((noinline, noclone))
 | ||||||
|  | +do_memcmp (parameter_t a, parameter_t b)
 | ||||||
|  | +{
 | ||||||
|  | +  return CALL (&b, a.p, b.p, a.len);
 | ||||||
|  | +}
 | ||||||
|  | +
 | ||||||
|  | +static int
 | ||||||
|  | +test_main (void)
 | ||||||
|  | +{
 | ||||||
|  | +  test_init ();
 | ||||||
|  | +
 | ||||||
|  | +  parameter_t dest = { { page_size / sizeof (CHAR) }, buf1 };
 | ||||||
|  | +  parameter_t src = { { 0 }, buf2 };
 | ||||||
|  | +
 | ||||||
|  | +  memcpy (buf1, buf2, page_size);
 | ||||||
|  | +
 | ||||||
|  | +  int ret = 0;
 | ||||||
|  | +  FOR_EACH_IMPL (impl, 0)
 | ||||||
|  | +    {
 | ||||||
|  | +      src.fn = impl->fn;
 | ||||||
|  | +      int res = do_memcmp (dest, src);
 | ||||||
|  | +      if (res)
 | ||||||
|  | +	{
 | ||||||
|  | +	  error (0, 0, "Wrong result in function %s: %i != 0",
 | ||||||
|  | +		 impl->name, res);
 | ||||||
|  | +	  ret = 1;
 | ||||||
|  | +	}
 | ||||||
|  | +    }
 | ||||||
|  | +
 | ||||||
|  | +  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
 | ||||||
|  | +}
 | ||||||
|  | +
 | ||||||
|  | +#include <support/test-driver.c>
 | ||||||
|  | diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c b/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c
 | ||||||
|  | new file mode 100644 | ||||||
|  | index 00000000..e8b5ffd0
 | ||||||
|  | --- /dev/null
 | ||||||
|  | +++ b/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c
 | ||||||
|  | @@ -0,0 +1,20 @@
 | ||||||
|  | +/* Test wmemcmp with size_t in the lower 32 bits of 64-bit register.
 | ||||||
|  | +   Copyright (C) 2019 Free Software Foundation, Inc.
 | ||||||
|  | +   This file is part of the GNU C Library.
 | ||||||
|  | +
 | ||||||
|  | +   The GNU C Library is free software; you can redistribute it and/or
 | ||||||
|  | +   modify it under the terms of the GNU Lesser General Public
 | ||||||
|  | +   License as published by the Free Software Foundation; either
 | ||||||
|  | +   version 2.1 of the License, or (at your option) any later version.
 | ||||||
|  | +
 | ||||||
|  | +   The GNU C Library is distributed in the hope that it will be useful,
 | ||||||
|  | +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 | ||||||
|  | +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 | ||||||
|  | +   Lesser General Public License for more details.
 | ||||||
|  | +
 | ||||||
|  | +   You should have received a copy of the GNU Lesser General Public
 | ||||||
|  | +   License along with the GNU C Library; if not, see
 | ||||||
|  | +   <http://www.gnu.org/licenses/>.  */
 | ||||||
|  | +
 | ||||||
|  | +#define WIDE 1
 | ||||||
|  | +#include "tst-size_t-memcmp.c"
 | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
SOURCES/glibc-RHEL-15696-20.patch (Normal file, 164 lines)
							| @ -0,0 +1,164 @@ | |||||||
|  | From e4fda4631017e49d4ee5a2755db34289b6860fa4 Mon Sep 17 00:00:00 2001 | ||||||
|  | From: "H.J. Lu" <hjl.tools@gmail.com> | ||||||
|  | Date: Sun, 7 Mar 2021 09:45:23 -0800 | ||||||
|  | Subject: [PATCH] x86-64: Use ZMM16-ZMM31 in AVX512 memmove family functions | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | Update ifunc-memmove.h to select the function optimized with AVX512 | ||||||
|  | instructions using ZMM16-ZMM31 registers to avoid RTM abort with usable | ||||||
|  | AVX512VL since VZEROUPPER isn't needed at function exit. | ||||||
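The register renaming in memmove-avx512-unaligned-erms.S below rests on a token-pasting indirection: VEC(i) expands to VEC##i, and each VECn is separately mapped to a high register, so the shared memmove-vec-unaligned-erms.S template picks up zmm16-zmm31 with no further edits. A stripped-down preprocessor sketch, with strings standing in for register operands:

#include <stdio.h>

/* Same indirection the .S file uses: VEC(i) -> VEC##i -> zmm16+i.  */
#define VEC0	"zmm16"
#define VEC1	"zmm17"
#define VEC2	"zmm18"
#define VEC(i)	VEC##i

int
main (void)
{
  /* VEC (0) pastes to VEC0, which expands to "zmm16".  */
  printf ("%s %s %s\n", VEC (0), VEC (1), VEC (2));
  return 0;
}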
|  | ---
 | ||||||
|  |  sysdeps/x86_64/multiarch/ifunc-impl-list.c    | 24 +++++++++--------- | ||||||
|  |  sysdeps/x86_64/multiarch/ifunc-memmove.h      | 12 +++++---- | ||||||
|  |  .../multiarch/memmove-avx512-unaligned-erms.S | 25 +++++++++++++++++-- | ||||||
|  |  3 files changed, 42 insertions(+), 19 deletions(-) | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 | ||||||
|  | index d969a156..fec384f6 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 | ||||||
|  | @@ -83,10 +83,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 | ||||||
|  |  			      CPU_FEATURE_USABLE (AVX512F), | ||||||
|  |  			      __memmove_chk_avx512_no_vzeroupper) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, __memmove_chk, | ||||||
|  | -			      CPU_FEATURE_USABLE (AVX512F),
 | ||||||
|  | +			      CPU_FEATURE_USABLE (AVX512VL),
 | ||||||
|  |  			      __memmove_chk_avx512_unaligned) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, __memmove_chk, | ||||||
|  | -			      CPU_FEATURE_USABLE (AVX512F),
 | ||||||
|  | +			      CPU_FEATURE_USABLE (AVX512VL),
 | ||||||
|  |  			      __memmove_chk_avx512_unaligned_erms) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, __memmove_chk, | ||||||
|  |  			      CPU_FEATURE_USABLE (AVX), | ||||||
|  | @@ -148,10 +148,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 | ||||||
|  |  			      CPU_FEATURE_USABLE (AVX512F), | ||||||
|  |  			      __memmove_avx512_no_vzeroupper) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, memmove, | ||||||
|  | -			      CPU_FEATURE_USABLE (AVX512F),
 | ||||||
|  | +			      CPU_FEATURE_USABLE (AVX512VL),
 | ||||||
|  |  			      __memmove_avx512_unaligned) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, memmove, | ||||||
|  | -			      CPU_FEATURE_USABLE (AVX512F),
 | ||||||
|  | +			      CPU_FEATURE_USABLE (AVX512VL),
 | ||||||
|  |  			      __memmove_avx512_unaligned_erms) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3), | ||||||
|  |  			      __memmove_ssse3_back) | ||||||
|  | @@ -733,10 +733,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 | ||||||
|  |  			      CPU_FEATURE_USABLE (AVX512F), | ||||||
|  |  			      __memcpy_chk_avx512_no_vzeroupper) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, | ||||||
|  | -			      CPU_FEATURE_USABLE (AVX512F),
 | ||||||
|  | +			      CPU_FEATURE_USABLE (AVX512VL),
 | ||||||
|  |  			      __memcpy_chk_avx512_unaligned) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, | ||||||
|  | -			      CPU_FEATURE_USABLE (AVX512F),
 | ||||||
|  | +			      CPU_FEATURE_USABLE (AVX512VL),
 | ||||||
|  |  			      __memcpy_chk_avx512_unaligned_erms) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, | ||||||
|  |  			      CPU_FEATURE_USABLE (AVX), | ||||||
|  | @@ -802,10 +802,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 | ||||||
|  |  			      CPU_FEATURE_USABLE (AVX512F), | ||||||
|  |  			      __memcpy_avx512_no_vzeroupper) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, memcpy, | ||||||
|  | -			      CPU_FEATURE_USABLE (AVX512F),
 | ||||||
|  | +			      CPU_FEATURE_USABLE (AVX512VL),
 | ||||||
|  |  			      __memcpy_avx512_unaligned) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, memcpy, | ||||||
|  | -			      CPU_FEATURE_USABLE (AVX512F),
 | ||||||
|  | +			      CPU_FEATURE_USABLE (AVX512VL),
 | ||||||
|  |  			      __memcpy_avx512_unaligned_erms) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, memcpy, 1, | ||||||
|  | @@ -819,10 +819,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 | ||||||
|  |  			      CPU_FEATURE_USABLE (AVX512F), | ||||||
|  |  			      __mempcpy_chk_avx512_no_vzeroupper) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, | ||||||
|  | -			      CPU_FEATURE_USABLE (AVX512F),
 | ||||||
|  | +			      CPU_FEATURE_USABLE (AVX512VL),
 | ||||||
|  |  			      __mempcpy_chk_avx512_unaligned) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, | ||||||
|  | -			      CPU_FEATURE_USABLE (AVX512F),
 | ||||||
|  | +			      CPU_FEATURE_USABLE (AVX512VL),
 | ||||||
|  |  			      __mempcpy_chk_avx512_unaligned_erms) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, | ||||||
|  |  			      CPU_FEATURE_USABLE (AVX), | ||||||
|  | @@ -864,10 +864,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 | ||||||
|  |  			      CPU_FEATURE_USABLE (AVX512F), | ||||||
|  |  			      __mempcpy_avx512_no_vzeroupper) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, mempcpy, | ||||||
|  | -			      CPU_FEATURE_USABLE (AVX512F),
 | ||||||
|  | +			      CPU_FEATURE_USABLE (AVX512VL),
 | ||||||
|  |  			      __mempcpy_avx512_unaligned) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, mempcpy, | ||||||
|  | -			      CPU_FEATURE_USABLE (AVX512F),
 | ||||||
|  | +			      CPU_FEATURE_USABLE (AVX512VL),
 | ||||||
|  |  			      __mempcpy_avx512_unaligned_erms) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, mempcpy, | ||||||
|  |  			      CPU_FEATURE_USABLE (AVX), | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
 | ||||||
|  | index fa09b9fb..014e95c7 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
 | ||||||
|  | @@ -56,13 +56,15 @@ IFUNC_SELECTOR (void)
 | ||||||
|  |    if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F) | ||||||
|  |        && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)) | ||||||
|  |      { | ||||||
|  | -      if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
 | ||||||
|  | -	return OPTIMIZE (avx512_no_vzeroupper);
 | ||||||
|  | +      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
 | ||||||
|  | +	{
 | ||||||
|  | +	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
 | ||||||
|  | +	    return OPTIMIZE (avx512_unaligned_erms);
 | ||||||
|  |   | ||||||
|  | -      if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
 | ||||||
|  | -	return OPTIMIZE (avx512_unaligned_erms);
 | ||||||
|  | +	  return OPTIMIZE (avx512_unaligned);
 | ||||||
|  | +	}
 | ||||||
|  |   | ||||||
|  | -      return OPTIMIZE (avx512_unaligned);
 | ||||||
|  | +      return OPTIMIZE (avx512_no_vzeroupper);
 | ||||||
|  |      } | ||||||
|  |   | ||||||
|  |    if (CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
 | ||||||
|  | index aac1515c..848848ab 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
 | ||||||
|  | @@ -1,11 +1,32 @@
 | ||||||
|  |  #if IS_IN (libc) | ||||||
|  |  # define VEC_SIZE	64 | ||||||
|  | -# define VEC(i)		zmm##i
 | ||||||
|  | +# define XMM0		xmm16
 | ||||||
|  | +# define XMM1		xmm17
 | ||||||
|  | +# define YMM0		ymm16
 | ||||||
|  | +# define YMM1		ymm17
 | ||||||
|  | +# define VEC0		zmm16
 | ||||||
|  | +# define VEC1		zmm17
 | ||||||
|  | +# define VEC2		zmm18
 | ||||||
|  | +# define VEC3		zmm19
 | ||||||
|  | +# define VEC4		zmm20
 | ||||||
|  | +# define VEC5		zmm21
 | ||||||
|  | +# define VEC6		zmm22
 | ||||||
|  | +# define VEC7		zmm23
 | ||||||
|  | +# define VEC8		zmm24
 | ||||||
|  | +# define VEC9		zmm25
 | ||||||
|  | +# define VEC10		zmm26
 | ||||||
|  | +# define VEC11		zmm27
 | ||||||
|  | +# define VEC12		zmm28
 | ||||||
|  | +# define VEC13		zmm29
 | ||||||
|  | +# define VEC14		zmm30
 | ||||||
|  | +# define VEC15		zmm31
 | ||||||
|  | +# define VEC(i)		VEC##i
 | ||||||
|  |  # define VMOVNT		vmovntdq | ||||||
|  |  # define VMOVU		vmovdqu64 | ||||||
|  |  # define VMOVA		vmovdqa64 | ||||||
|  | +# define VZEROUPPER
 | ||||||
|  |   | ||||||
|  | -# define SECTION(p)		p##.avx512
 | ||||||
|  | +# define SECTION(p)		p##.evex512
 | ||||||
|  |  # define MEMMOVE_SYMBOL(p,s)	p##_avx512_##s | ||||||
|  |   | ||||||
|  |  # include "memmove-vec-unaligned-erms.S" | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
SOURCES/glibc-RHEL-15696-21.patch (Normal file, 71 lines)
							| @ -0,0 +1,71 @@ | |||||||
|  | From 595c22ecd8e87a27fd19270ed30fdbae9ad25426 Mon Sep 17 00:00:00 2001 | ||||||
|  | From: Sunil K Pandey <skpgkp2@gmail.com> | ||||||
|  | Date: Thu, 1 Apr 2021 15:47:04 -0700 | ||||||
|  | Subject: [PATCH] x86-64: Fix ifdef indentation in strlen-evex.S | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | Fix some indentations of ifdef in file strlen-evex.S which are off by 1 | ||||||
|  | and confusing to read. | ||||||
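For reference, the convention being restored: glibc indents preprocessor directives one space per nesting level after the '#', so a directive nested two conditionals deep reads "#  ifdef". A compilable sketch of the intended shape (macro names borrowed from the surrounding patches):

/* One extra space after '#' per level of nesting, which is what the
   patch restores in strlen-evex.S.  */
#define USE_AS_WCSLEN

#ifdef USE_AS_WCSLEN
# ifndef CHAR_SIZE
   /* Two levels deep: two spaces after '#'.  */
#  define CHAR_SIZE 4
# endif
#endif

int
main (void)
{
  return CHAR_SIZE - 4;	/* exits 0 when the macro expanded */
}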
|  | ---
 | ||||||
|  |  sysdeps/x86_64/multiarch/strlen-evex.S | 16 ++++++++-------- | ||||||
|  |  1 file changed, 8 insertions(+), 8 deletions(-) | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S
 | ||||||
|  | index cd022509..05838190 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/strlen-evex.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/strlen-evex.S
 | ||||||
|  | @@ -276,10 +276,10 @@ L(last_2x_vec):
 | ||||||
|  |  	.p2align 4 | ||||||
|  |  L(first_vec_x0_check): | ||||||
|  |  	tzcntl	%eax, %eax | ||||||
|  | -# ifdef USE_AS_WCSLEN
 | ||||||
|  | +#  ifdef USE_AS_WCSLEN
 | ||||||
|  |  	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */ | ||||||
|  |  	sall	$2, %eax | ||||||
|  | -# endif
 | ||||||
|  | +#  endif
 | ||||||
|  |  	/* Check the end of data.  */ | ||||||
|  |  	cmpq	%rax, %rsi | ||||||
|  |  	jbe	L(max) | ||||||
|  | @@ -293,10 +293,10 @@ L(first_vec_x0_check):
 | ||||||
|  |  	.p2align 4 | ||||||
|  |  L(first_vec_x1_check): | ||||||
|  |  	tzcntl	%eax, %eax | ||||||
|  | -# ifdef USE_AS_WCSLEN
 | ||||||
|  | +#  ifdef USE_AS_WCSLEN
 | ||||||
|  |  	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */ | ||||||
|  |  	sall	$2, %eax | ||||||
|  | -# endif
 | ||||||
|  | +#  endif
 | ||||||
|  |  	/* Check the end of data.  */ | ||||||
|  |  	cmpq	%rax, %rsi | ||||||
|  |  	jbe	L(max) | ||||||
|  | @@ -311,10 +311,10 @@ L(first_vec_x1_check):
 | ||||||
|  |  	.p2align 4 | ||||||
|  |  L(first_vec_x2_check): | ||||||
|  |  	tzcntl	%eax, %eax | ||||||
|  | -# ifdef USE_AS_WCSLEN
 | ||||||
|  | +#  ifdef USE_AS_WCSLEN
 | ||||||
|  |  	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */ | ||||||
|  |  	sall	$2, %eax | ||||||
|  | -# endif
 | ||||||
|  | +#  endif
 | ||||||
|  |  	/* Check the end of data.  */ | ||||||
|  |  	cmpq	%rax, %rsi | ||||||
|  |  	jbe	L(max) | ||||||
|  | @@ -329,10 +329,10 @@ L(first_vec_x2_check):
 | ||||||
|  |  	.p2align 4 | ||||||
|  |  L(first_vec_x3_check): | ||||||
|  |  	tzcntl	%eax, %eax | ||||||
|  | -# ifdef USE_AS_WCSLEN
 | ||||||
|  | +#  ifdef USE_AS_WCSLEN
 | ||||||
|  |  	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */ | ||||||
|  |  	sall	$2, %eax | ||||||
|  | -# endif
 | ||||||
|  | +#  endif
 | ||||||
|  |  	/* Check the end of data.  */ | ||||||
|  |  	cmpq	%rax, %rsi | ||||||
|  |  	jbe	L(max) | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
SOURCES/glibc-RHEL-15696-22.patch (Normal file, 51 lines)
							| @ -0,0 +1,51 @@ | |||||||
|  | From 55bf411b451c13f0fb7ff3d3bf9a820020b45df1 Mon Sep 17 00:00:00 2001 | ||||||
|  | From: "H.J. Lu" <hjl.tools@gmail.com> | ||||||
|  | Date: Mon, 19 Apr 2021 07:07:21 -0700 | ||||||
|  | Subject: [PATCH] x86-64: Require BMI2 for __strlen_evex and __strnlen_evex | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | Since __strlen_evex and __strnlen_evex added by | ||||||
|  | 
 | ||||||
|  | commit 1fd8c163a83d96ace1ff78fa6bac7aee084f6f77 | ||||||
|  | Author: H.J. Lu <hjl.tools@gmail.com> | ||||||
|  | Date:   Fri Mar 5 06:24:52 2021 -0800 | ||||||
|  | 
 | ||||||
|  |     x86-64: Add ifunc-avx2.h functions with 256-bit EVEX | ||||||
|  | 
 | ||||||
|  | use sarx: | ||||||
|  | 
 | ||||||
|  | c4 e2 6a f7 c0       	sarx   %edx,%eax,%eax | ||||||
|  | 
 | ||||||
|  | require BMI2 for __strlen_evex and __strnlen_evex in ifunc-impl-list.c. | ||||||
|  | ifunc-avx2.h already requires BMI2 for EVEX implementation. | ||||||
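The patched ifunc-impl-list entries encode exactly this precondition: __strlen_evex emits sarx, a BMI2 instruction, so it may only be advertised on CPUs reporting BMI2. A standalone sketch of the runtime check; glibc uses its own CPU_FEATURE_USABLE machinery, while GCC's builtin is used here so the example compiles on its own:

#include <stdio.h>

int
main (void)
{
  if (__builtin_cpu_supports ("bmi2"))
    puts ("BMI2 usable: __strlen_evex may be selected");
  else
    puts ("no BMI2: fall back to a non-BMI2 strlen");
  return 0;
}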
|  | ---
 | ||||||
|  |  sysdeps/x86_64/multiarch/ifunc-impl-list.c | 6 ++++-- | ||||||
|  |  1 file changed, 4 insertions(+), 2 deletions(-) | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 | ||||||
|  | index fec384f6..cbfc1a5d 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 | ||||||
|  | @@ -293,7 +293,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 | ||||||
|  |  			      __strlen_avx2_rtm) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, strlen, | ||||||
|  |  			      (CPU_FEATURE_USABLE (AVX512VL) | ||||||
|  | -			       && CPU_FEATURE_USABLE (AVX512BW)),
 | ||||||
|  | +			       && CPU_FEATURE_USABLE (AVX512BW)
 | ||||||
|  | +			       && CPU_FEATURE_USABLE (BMI2)),
 | ||||||
|  |  			      __strlen_evex) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2)) | ||||||
|  |   | ||||||
|  | @@ -308,7 +309,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 | ||||||
|  |  			      __strnlen_avx2_rtm) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, strnlen, | ||||||
|  |  			      (CPU_FEATURE_USABLE (AVX512VL) | ||||||
|  | -			       && CPU_FEATURE_USABLE (AVX512BW)),
 | ||||||
|  | +			       && CPU_FEATURE_USABLE (AVX512BW)
 | ||||||
|  | +			       && CPU_FEATURE_USABLE (BMI2)),
 | ||||||
|  |  			      __strnlen_evex) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2)) | ||||||
|  |   | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
SOURCES/glibc-RHEL-15696-23.patch (Normal file, 584 lines)
							| @ -0,0 +1,584 @@ | |||||||
|  | From acfd088a1963ba51cd83c78f95c0ab25ead79e04 Mon Sep 17 00:00:00 2001 | ||||||
|  | From: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  | Date: Mon, 3 May 2021 03:01:58 -0400 | ||||||
|  | Subject: [PATCH] x86: Optimize memchr-avx2.S | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | No bug. This commit optimizes memchr-avx2.S. The optimizations include | ||||||
|  | replacing some branches with cmovcc, avoiding some branches entirely | ||||||
|  | in the less_4x_vec case, making the page cross logic less strict, | ||||||
|  | and saving a few instructions in the loop return path. test-memchr, | ||||||
|  | test-rawmemchr, and test-wmemchr are all passing. | ||||||
|  | 
 | ||||||
|  | Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  | Reviewed-by: H.J. Lu <hjl.tools@gmail.com> | ||||||
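A C rendering of the cmovcc idea from the message above; the real change is in assembly, and the function names here are illustrative. Compilers typically lower the ternary form to a conditional move on x86-64, much as the rewritten L(first_vec_x0) path in the diff selects zero with cmovle instead of branching.

#include <assert.h>
#include <stddef.h>

/* Branchy tail check: only compute the match pointer when the match
   index is in bounds.  */
static const char *
tail_branch (const char *base, size_t match_idx, size_t len)
{
  if (match_idx >= len)
    return NULL;
  return base + match_idx;
}

/* Branchless form: always form the candidate pointer, then select
   NULL; the ternary usually becomes cmovcc.  */
static const char *
tail_cmov (const char *base, size_t match_idx, size_t len)
{
  const char *candidate = base + match_idx;
  return match_idx < len ? candidate : NULL;
}

int
main (void)
{
  const char buf[16] = "abcdefg";
  assert (tail_branch (buf, 3, 8) == tail_cmov (buf, 3, 8));
  assert (tail_branch (buf, 9, 8) == NULL && tail_cmov (buf, 9, 8) == NULL);
  return 0;
}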
|  | ---
 | ||||||
|  |  sysdeps/x86_64/multiarch/memchr-avx2.S | 425 ++++++++++++++----------- | ||||||
|  |  1 file changed, 247 insertions(+), 178 deletions(-) | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
 | ||||||
|  | index cf893e77..b377f22e 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/memchr-avx2.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
 | ||||||
|  | @@ -26,8 +26,22 @@
 | ||||||
|  |   | ||||||
|  |  # ifdef USE_AS_WMEMCHR | ||||||
|  |  #  define VPCMPEQ	vpcmpeqd | ||||||
|  | +#  define VPBROADCAST	vpbroadcastd
 | ||||||
|  | +#  define CHAR_SIZE	4
 | ||||||
|  |  # else | ||||||
|  |  #  define VPCMPEQ	vpcmpeqb | ||||||
|  | +#  define VPBROADCAST	vpbroadcastb
 | ||||||
|  | +#  define CHAR_SIZE	1
 | ||||||
|  | +# endif
 | ||||||
|  | +
 | ||||||
|  | +# ifdef USE_AS_RAWMEMCHR
 | ||||||
|  | +#  define ERAW_PTR_REG	ecx
 | ||||||
|  | +#  define RRAW_PTR_REG	rcx
 | ||||||
|  | +#  define ALGN_PTR_REG	rdi
 | ||||||
|  | +# else
 | ||||||
|  | +#  define ERAW_PTR_REG	edi
 | ||||||
|  | +#  define RRAW_PTR_REG	rdi
 | ||||||
|  | +#  define ALGN_PTR_REG	rcx
 | ||||||
|  |  # endif | ||||||
|  |   | ||||||
|  |  # ifndef VZEROUPPER | ||||||
|  | @@ -39,6 +53,7 @@
 | ||||||
|  |  # endif | ||||||
|  |   | ||||||
|  |  # define VEC_SIZE 32 | ||||||
|  | +# define PAGE_SIZE 4096
 | ||||||
|  |   | ||||||
|  |  	.section SECTION(.text),"ax",@progbits | ||||||
|  |  ENTRY (MEMCHR) | ||||||
|  | @@ -47,295 +62,349 @@ ENTRY (MEMCHR)
 | ||||||
|  |  	test	%RDX_LP, %RDX_LP | ||||||
|  |  	jz	L(null) | ||||||
|  |  # endif | ||||||
|  | -	movl	%edi, %ecx
 | ||||||
|  | -	/* Broadcast CHAR to YMM0.  */
 | ||||||
|  | -	vmovd	%esi, %xmm0
 | ||||||
|  |  # ifdef USE_AS_WMEMCHR | ||||||
|  |  	shl	$2, %RDX_LP | ||||||
|  | -	vpbroadcastd %xmm0, %ymm0
 | ||||||
|  |  # else | ||||||
|  |  #  ifdef __ILP32__ | ||||||
|  |  	/* Clear the upper 32 bits.  */ | ||||||
|  |  	movl	%edx, %edx | ||||||
|  |  #  endif | ||||||
|  | -	vpbroadcastb %xmm0, %ymm0
 | ||||||
|  |  # endif | ||||||
|  | +	/* Broadcast CHAR to YMMMATCH.  */
 | ||||||
|  | +	vmovd	%esi, %xmm0
 | ||||||
|  | +	VPBROADCAST %xmm0, %ymm0
 | ||||||
|  |  	/* Check if we may cross page boundary with one vector load.  */ | ||||||
|  | -	andl	$(2 * VEC_SIZE - 1), %ecx
 | ||||||
|  | -	cmpl	$VEC_SIZE, %ecx
 | ||||||
|  | -	ja	L(cros_page_boundary)
 | ||||||
|  | +	movl	%edi, %eax
 | ||||||
|  | +	andl	$(PAGE_SIZE - 1), %eax
 | ||||||
|  | +	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 | ||||||
|  | +	ja	L(cross_page_boundary)
 | ||||||
|  |   | ||||||
|  |  	/* Check the first VEC_SIZE bytes.  */ | ||||||
|  | -	VPCMPEQ (%rdi), %ymm0, %ymm1
 | ||||||
|  | +	VPCMPEQ	(%rdi), %ymm0, %ymm1
 | ||||||
|  |  	vpmovmskb %ymm1, %eax | ||||||
|  | -	testl	%eax, %eax
 | ||||||
|  | -
 | ||||||
|  |  # ifndef USE_AS_RAWMEMCHR | ||||||
|  | -	jnz	L(first_vec_x0_check)
 | ||||||
|  | -	/* Adjust length and check the end of data.  */
 | ||||||
|  | -	subq	$VEC_SIZE, %rdx
 | ||||||
|  | -	jbe	L(zero)
 | ||||||
|  | -# else
 | ||||||
|  | -	jnz	L(first_vec_x0)
 | ||||||
|  | +	/* If length < CHAR_PER_VEC handle special.  */
 | ||||||
|  | +	cmpq	$VEC_SIZE, %rdx
 | ||||||
|  | +	jbe	L(first_vec_x0)
 | ||||||
|  |  # endif | ||||||
|  | -
 | ||||||
|  | -	/* Align data for aligned loads in the loop.  */
 | ||||||
|  | -	addq	$VEC_SIZE, %rdi
 | ||||||
|  | -	andl	$(VEC_SIZE - 1), %ecx
 | ||||||
|  | -	andq	$-VEC_SIZE, %rdi
 | ||||||
|  | +	testl	%eax, %eax
 | ||||||
|  | +	jz	L(aligned_more)
 | ||||||
|  | +	tzcntl	%eax, %eax
 | ||||||
|  | +	addq	%rdi, %rax
 | ||||||
|  | +	VZEROUPPER_RETURN
 | ||||||
|  |   | ||||||
|  |  # ifndef USE_AS_RAWMEMCHR | ||||||
|  | -	/* Adjust length.  */
 | ||||||
|  | -	addq	%rcx, %rdx
 | ||||||
|  | +	.p2align 5
 | ||||||
|  | +L(first_vec_x0):
 | ||||||
|  | +	/* Check if first match was before length.  */
 | ||||||
|  | +	tzcntl	%eax, %eax
 | ||||||
|  | +	xorl	%ecx, %ecx
 | ||||||
|  | +	cmpl	%eax, %edx
 | ||||||
|  | +	leaq	(%rdi, %rax), %rax
 | ||||||
|  | +	cmovle	%rcx, %rax
 | ||||||
|  | +	VZEROUPPER_RETURN
 | ||||||
|  |   | ||||||
|  | -	subq	$(VEC_SIZE * 4), %rdx
 | ||||||
|  | -	jbe	L(last_4x_vec_or_less)
 | ||||||
|  | +L(null):
 | ||||||
|  | +	xorl	%eax, %eax
 | ||||||
|  | +	ret
 | ||||||
|  |  # endif | ||||||
|  | -	jmp	L(more_4x_vec)
 | ||||||
|  | -
 | ||||||
|  |  	.p2align 4 | ||||||
|  | -L(cros_page_boundary):
 | ||||||
|  | -	andl	$(VEC_SIZE - 1), %ecx
 | ||||||
|  | -	andq	$-VEC_SIZE, %rdi
 | ||||||
|  | -	VPCMPEQ (%rdi), %ymm0, %ymm1
 | ||||||
|  | +L(cross_page_boundary):
 | ||||||
|  | +	/* Save pointer before aligning as its original value is necessary
 | ||||||
|  | +	   for computing the return address if a byte is found or adjusting length
 | ||||||
|  | +	   if it is not and this is memchr.  */
 | ||||||
|  | +	movq	%rdi, %rcx
 | ||||||
|  | +	/* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and
 | ||||||
|  | +	   rdi for rawmemchr.  */
 | ||||||
|  | +	orq	$(VEC_SIZE - 1), %ALGN_PTR_REG
 | ||||||
|  | +	VPCMPEQ	-(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
 | ||||||
|  |  	vpmovmskb %ymm1, %eax | ||||||
|  | +# ifndef USE_AS_RAWMEMCHR
 | ||||||
|  | +	/* Calculate length until end of page (length checked for a
 | ||||||
|  | +	   match).  */
 | ||||||
|  | +	leaq	1(%ALGN_PTR_REG), %rsi
 | ||||||
|  | +	subq	%RRAW_PTR_REG, %rsi
 | ||||||
|  | +# endif
 | ||||||
|  |  	/* Remove the leading bytes.  */ | ||||||
|  | -	sarl	%cl, %eax
 | ||||||
|  | -	testl	%eax, %eax
 | ||||||
|  | -	jz	L(aligned_more)
 | ||||||
|  | -	tzcntl	%eax, %eax
 | ||||||
|  | +	sarxl	%ERAW_PTR_REG, %eax, %eax
 | ||||||
|  |  # ifndef USE_AS_RAWMEMCHR | ||||||
|  |  	/* Check the end of data.  */ | ||||||
|  | -	cmpq	%rax, %rdx
 | ||||||
|  | -	jbe	L(zero)
 | ||||||
|  | +	cmpq	%rsi, %rdx
 | ||||||
|  | +	jbe	L(first_vec_x0)
 | ||||||
|  |  # endif | ||||||
|  | -	addq	%rdi, %rax
 | ||||||
|  | -	addq	%rcx, %rax
 | ||||||
|  | +	testl	%eax, %eax
 | ||||||
|  | +	jz	L(cross_page_continue)
 | ||||||
|  | +	tzcntl	%eax, %eax
 | ||||||
|  | +	addq	%RRAW_PTR_REG, %rax
 | ||||||
|  |  L(return_vzeroupper): | ||||||
|  |  	ZERO_UPPER_VEC_REGISTERS_RETURN | ||||||
|  |   | ||||||
|  |  	.p2align 4 | ||||||
|  | -L(aligned_more):
 | ||||||
|  | -# ifndef USE_AS_RAWMEMCHR
 | ||||||
|  | -        /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
 | ||||||
|  | -	   instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
 | ||||||
|  | -	   overflow.  */
 | ||||||
|  | -	negq	%rcx
 | ||||||
|  | -	addq	$VEC_SIZE, %rcx
 | ||||||
|  | +L(first_vec_x1):
 | ||||||
|  | +	tzcntl	%eax, %eax
 | ||||||
|  | +	incq	%rdi
 | ||||||
|  | +	addq	%rdi, %rax
 | ||||||
|  | +	VZEROUPPER_RETURN
 | ||||||
|  |   | ||||||
|  | -	/* Check the end of data.  */
 | ||||||
|  | -	subq	%rcx, %rdx
 | ||||||
|  | -	jbe	L(zero)
 | ||||||
|  | -# endif
 | ||||||
|  | +	.p2align 4
 | ||||||
|  | +L(first_vec_x2):
 | ||||||
|  | +	tzcntl	%eax, %eax
 | ||||||
|  | +	addq	$(VEC_SIZE + 1), %rdi
 | ||||||
|  | +	addq	%rdi, %rax
 | ||||||
|  | +	VZEROUPPER_RETURN
 | ||||||
|  | +
 | ||||||
|  | +	.p2align 4
 | ||||||
|  | +L(first_vec_x3):
 | ||||||
|  | +	tzcntl	%eax, %eax
 | ||||||
|  | +	addq	$(VEC_SIZE * 2 + 1), %rdi
 | ||||||
|  | +	addq	%rdi, %rax
 | ||||||
|  | +	VZEROUPPER_RETURN
 | ||||||
|  |   | ||||||
|  | -	addq	$VEC_SIZE, %rdi
 | ||||||
|  |   | ||||||
|  | -# ifndef USE_AS_RAWMEMCHR
 | ||||||
|  | -	subq	$(VEC_SIZE * 4), %rdx
 | ||||||
|  | -	jbe	L(last_4x_vec_or_less)
 | ||||||
|  | -# endif
 | ||||||
|  | +	.p2align 4
 | ||||||
|  | +L(first_vec_x4):
 | ||||||
|  | +	tzcntl	%eax, %eax
 | ||||||
|  | +	addq	$(VEC_SIZE * 3 + 1), %rdi
 | ||||||
|  | +	addq	%rdi, %rax
 | ||||||
|  | +	VZEROUPPER_RETURN
 | ||||||
|  |   | ||||||
|  | -L(more_4x_vec):
 | ||||||
|  | +	.p2align 4
 | ||||||
|  | +L(aligned_more):
 | ||||||
|  |  	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time | ||||||
|  |  	   since data is only aligned to VEC_SIZE.  */ | ||||||
|  | -	VPCMPEQ (%rdi), %ymm0, %ymm1
 | ||||||
|  | -	vpmovmskb %ymm1, %eax
 | ||||||
|  | -	testl	%eax, %eax
 | ||||||
|  | -	jnz	L(first_vec_x0)
 | ||||||
|  |   | ||||||
|  | -	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
 | ||||||
|  | +# ifndef USE_AS_RAWMEMCHR
 | ||||||
|  | +L(cross_page_continue):
 | ||||||
|  | +	/* Align data to VEC_SIZE - 1.  */
 | ||||||
|  | +	xorl	%ecx, %ecx
 | ||||||
|  | +	subl	%edi, %ecx
 | ||||||
|  | +	orq	$(VEC_SIZE - 1), %rdi
 | ||||||
|  | +	/* esi is for adjusting length to see if near the end.  */
 | ||||||
|  | +	leal	(VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
 | ||||||
|  | +# else
 | ||||||
|  | +	orq	$(VEC_SIZE - 1), %rdi
 | ||||||
|  | +L(cross_page_continue):
 | ||||||
|  | +# endif
 | ||||||
|  | +	/* Load first VEC regardless.  */
 | ||||||
|  | +	VPCMPEQ	1(%rdi), %ymm0, %ymm1
 | ||||||
|  |  	vpmovmskb %ymm1, %eax | ||||||
|  | +# ifndef USE_AS_RAWMEMCHR
 | ||||||
|  | +	/* Adjust length. If near end handle specially.  */
 | ||||||
|  | +	subq	%rsi, %rdx
 | ||||||
|  | +	jbe	L(last_4x_vec_or_less)
 | ||||||
|  | +# endif
 | ||||||
|  |  	testl	%eax, %eax | ||||||
|  |  	jnz	L(first_vec_x1) | ||||||
|  |   | ||||||
|  | -	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
 | ||||||
|  | +	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
 | ||||||
|  |  	vpmovmskb %ymm1, %eax | ||||||
|  |  	testl	%eax, %eax | ||||||
|  |  	jnz	L(first_vec_x2) | ||||||
|  |   | ||||||
|  | -	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
 | ||||||
|  | +	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
 | ||||||
|  |  	vpmovmskb %ymm1, %eax | ||||||
|  |  	testl	%eax, %eax | ||||||
|  |  	jnz	L(first_vec_x3) | ||||||
|  |   | ||||||
|  | -	addq	$(VEC_SIZE * 4), %rdi
 | ||||||
|  | +	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
 | ||||||
|  | +	vpmovmskb %ymm1, %eax
 | ||||||
|  | +	testl	%eax, %eax
 | ||||||
|  | +	jnz	L(first_vec_x4)
 | ||||||
|  |   | ||||||
|  |  # ifndef USE_AS_RAWMEMCHR | ||||||
|  | +	/* Check if at last VEC_SIZE * 4 length.  */
 | ||||||
|  |  	subq	$(VEC_SIZE * 4), %rdx | ||||||
|  | -	jbe	L(last_4x_vec_or_less)
 | ||||||
|  | -# endif
 | ||||||
|  | -
 | ||||||
|  | -	/* Align data to 4 * VEC_SIZE.  */
 | ||||||
|  | -	movq	%rdi, %rcx
 | ||||||
|  | -	andl	$(4 * VEC_SIZE - 1), %ecx
 | ||||||
|  | -	andq	$-(4 * VEC_SIZE), %rdi
 | ||||||
|  | -
 | ||||||
|  | -# ifndef USE_AS_RAWMEMCHR
 | ||||||
|  | -	/* Adjust length.  */
 | ||||||
|  | +	jbe	L(last_4x_vec_or_less_cmpeq)
 | ||||||
|  | +	/* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
 | ||||||
|  | +	   length.  */
 | ||||||
|  | +	incq	%rdi
 | ||||||
|  | +	movl	%edi, %ecx
 | ||||||
|  | +	orq	$(VEC_SIZE * 4 - 1), %rdi
 | ||||||
|  | +	andl	$(VEC_SIZE * 4 - 1), %ecx
 | ||||||
|  |  	addq	%rcx, %rdx | ||||||
|  | +# else
 | ||||||
|  | +	/* Align data to VEC_SIZE * 4 - 1 for loop.  */
 | ||||||
|  | +	incq	%rdi
 | ||||||
|  | +	orq	$(VEC_SIZE * 4 - 1), %rdi
 | ||||||
|  |  # endif | ||||||
|  |   | ||||||
|  | +	/* Compare 4 * VEC at a time forward.  */
 | ||||||
|  |  	.p2align 4 | ||||||
|  |  L(loop_4x_vec): | ||||||
|  | -	/* Compare 4 * VEC at a time forward.  */
 | ||||||
-	VPCMPEQ (%rdi), %ymm0, %ymm1
-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
-
+	VPCMPEQ	1(%rdi), %ymm0, %ymm1
+	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm2
+	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm3
+	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm4
 	vpor	%ymm1, %ymm2, %ymm5
 	vpor	%ymm3, %ymm4, %ymm6
 	vpor	%ymm5, %ymm6, %ymm5
 
-	vpmovmskb %ymm5, %eax
-	testl	%eax, %eax
-	jnz	L(4x_vec_end)
-
-	addq	$(VEC_SIZE * 4), %rdi
-
+	vpmovmskb %ymm5, %ecx
 # ifdef USE_AS_RAWMEMCHR
-	jmp	L(loop_4x_vec)
+	subq	$-(VEC_SIZE * 4), %rdi
+	testl	%ecx, %ecx
+	jz	L(loop_4x_vec)
 # else
-	subq	$(VEC_SIZE * 4), %rdx
-	ja	L(loop_4x_vec)
+	testl	%ecx, %ecx
+	jnz	L(loop_4x_vec_end)
 
-L(last_4x_vec_or_less):
-	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
-	addl	$(VEC_SIZE * 2), %edx
-	jle	L(last_2x_vec)
+	subq	$-(VEC_SIZE * 4), %rdi
 
-	VPCMPEQ (%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
+	subq	$(VEC_SIZE * 4), %rdx
+	ja	L(loop_4x_vec)
 
-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+	/* Fall through into less than 4 remaining vectors of length case.
+	 */
+	VPCMPEQ	(VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
+	.p2align 4
+L(last_4x_vec_or_less):
+	/* Check if first VEC contained match.  */
 	testl	%eax, %eax
-	jnz	L(first_vec_x1)
+	jnz	L(first_vec_x1_check)
 
-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
+	/* If remaining length > VEC_SIZE * 2.  */
+	addl	$(VEC_SIZE * 2), %edx
+	jg	L(last_4x_vec)
 
-	jnz	L(first_vec_x2_check)
-	subl	$VEC_SIZE, %edx
-	jle	L(zero)
+L(last_2x_vec):
+	/* If remaining length < VEC_SIZE.  */
+	addl	$VEC_SIZE, %edx
+	jle	L(zero_end)
 
-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+	/* Check VEC2 and compare any match with remaining length.  */
+	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-
-	jnz	L(first_vec_x3_check)
-	xorl	%eax, %eax
+	tzcntl	%eax, %eax
+	cmpl	%eax, %edx
+	jbe	L(set_zero_end)
+	addq	$(VEC_SIZE + 1), %rdi
+	addq	%rdi, %rax
+L(zero_end):
 	VZEROUPPER_RETURN
 
 	.p2align 4
-L(last_2x_vec):
-	addl	$(VEC_SIZE * 2), %edx
-	VPCMPEQ (%rdi), %ymm0, %ymm1
+L(loop_4x_vec_end):
+# endif
+	/* rawmemchr will fall through into this if match was found in
+	   loop.  */
+
 	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
+	jnz	L(last_vec_x1_return)
 
-	jnz	L(first_vec_x0_check)
-	subl	$VEC_SIZE, %edx
-	jle	L(zero)
-
-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
+	vpmovmskb %ymm2, %eax
 	testl	%eax, %eax
-	jnz	L(first_vec_x1_check)
-	xorl	%eax, %eax
-	VZEROUPPER_RETURN
+	jnz	L(last_vec_x2_return)
 
-	.p2align 4
-L(first_vec_x0_check):
-	tzcntl	%eax, %eax
-	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
+	vpmovmskb %ymm3, %eax
+	/* Combine VEC3 matches (eax) with VEC4 matches (ecx).  */
+	salq	$32, %rcx
+	orq	%rcx, %rax
+	tzcntq	%rax, %rax
+# ifdef USE_AS_RAWMEMCHR
+	subq	$(VEC_SIZE * 2 - 1), %rdi
+# else
+	subq	$-(VEC_SIZE * 2 + 1), %rdi
+# endif
 	addq	%rdi, %rax
 	VZEROUPPER_RETURN
+# ifndef USE_AS_RAWMEMCHR
 
 	.p2align 4
 L(first_vec_x1_check):
 	tzcntl	%eax, %eax
-	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
-	addq	$VEC_SIZE, %rax
+	/* Adjust length.  */
+	subl	$-(VEC_SIZE * 4), %edx
+	/* Check if match within remaining length.  */
+	cmpl	%eax, %edx
+	jbe	L(set_zero_end)
+	incq	%rdi
 	addq	%rdi, %rax
 	VZEROUPPER_RETURN
+	.p2align 4
+L(set_zero_end):
+	xorl	%eax, %eax
+	VZEROUPPER_RETURN
+# endif
 
 	.p2align 4
-L(first_vec_x2_check):
+L(last_vec_x1_return):
 	tzcntl	%eax, %eax
-	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
-	addq	$(VEC_SIZE * 2), %rax
+# ifdef USE_AS_RAWMEMCHR
+	subq	$(VEC_SIZE * 4 - 1), %rdi
+# else
+	incq	%rdi
+# endif
 	addq	%rdi, %rax
 	VZEROUPPER_RETURN
 
 	.p2align 4
-L(first_vec_x3_check):
+L(last_vec_x2_return):
 	tzcntl	%eax, %eax
-	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
-	addq	$(VEC_SIZE * 3), %rax
+# ifdef USE_AS_RAWMEMCHR
+	subq	$(VEC_SIZE * 3 - 1), %rdi
+# else
+	subq	$-(VEC_SIZE + 1), %rdi
+# endif
 	addq	%rdi, %rax
 	VZEROUPPER_RETURN
 
+# ifndef USE_AS_RAWMEMCHR
 	.p2align 4
-L(zero):
-	xorl	%eax, %eax
-	jmp     L(return_vzeroupper)
+L(last_4x_vec_or_less_cmpeq):
+	VPCMPEQ	(VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	subq	$-(VEC_SIZE * 4), %rdi
+	/* Check first VEC regardless.  */
+	testl	%eax, %eax
+	jnz	L(first_vec_x1_check)
 
+	/* If remaining length <= CHAR_PER_VEC * 2.  */
+	addl	$(VEC_SIZE * 2), %edx
+	jle	L(last_2x_vec)
 	.p2align 4
-L(null):
-	xorl	%eax, %eax
-	ret
-# endif
+L(last_4x_vec):
+	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x2_return)
 
-	.p2align 4
-L(first_vec_x0):
-	tzcntl	%eax, %eax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
+	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
 
-	.p2align 4
-L(first_vec_x1):
-	tzcntl	%eax, %eax
-	addq	$VEC_SIZE, %rax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
+	/* Create mask for possible matches within remaining length.  */
+	movq	$-1, %rcx
+	bzhiq	%rdx, %rcx, %rcx
 
-	.p2align 4
-L(first_vec_x2):
+	/* Test matches in data against length match.  */
+	andl	%ecx, %eax
+	jnz	L(last_vec_x3)
+
+	/* if remaining length <= VEC_SIZE * 3 (Note this is after
+	   remaining length was found to be > VEC_SIZE * 2.  */
+	subl	$VEC_SIZE, %edx
+	jbe	L(zero_end2)
+
+	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	/* Shift remaining length mask for last VEC.  */
+	shrq	$32, %rcx
+	andl	%ecx, %eax
+	jz	L(zero_end2)
 	tzcntl	%eax, %eax
-	addq	$(VEC_SIZE * 2), %rax
+	addq	$(VEC_SIZE * 3 + 1), %rdi
 	addq	%rdi, %rax
+L(zero_end2):
 	VZEROUPPER_RETURN
 
 	.p2align 4
-L(4x_vec_end):
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
-	vpmovmskb %ymm2, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x1)
-	vpmovmskb %ymm3, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x2)
-	vpmovmskb %ymm4, %eax
-	testl	%eax, %eax
-L(first_vec_x3):
+L(last_vec_x3):
 	tzcntl	%eax, %eax
-	addq	$(VEC_SIZE * 3), %rax
+	subq	$-(VEC_SIZE * 2 + 1), %rdi
 	addq	%rdi, %rax
 	VZEROUPPER_RETURN
+# endif
 
 END (MEMCHR)
 #endif
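The L(last_4x_vec) path above replaces the old per-vector end-of-data compares with a single validity mask: movq $-1 / bzhiq builds a mask of the bits below the remaining length, and ANDing it against the compare masks discards any match past the limit before one tzcnt picks the winner. A minimal C sketch of the same idea using AVX2/BMI intrinsics follows; the function name and the remaining <= 64 precondition are illustrative assumptions, not glibc code.

#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

/* Illustrative sketch (not glibc code): find byte 'c' in the next two
   32-byte vectors at 's', honoring 'remaining' <= 64 valid bytes.
   Like the assembly, it reads a full 64 bytes; the real code justifies
   the over-read via alignment.  Build with -mavx2 -mbmi -mbmi2.  */
static const char *
check_last_2x_vec (const char *s, int c, size_t remaining)
{
  const __m256i match = _mm256_set1_epi8 ((char) c);
  uint64_t lo = (uint32_t) _mm256_movemask_epi8
    (_mm256_cmpeq_epi8 (_mm256_loadu_si256 ((const __m256i *) s), match));
  uint64_t hi = (uint32_t) _mm256_movemask_epi8
    (_mm256_cmpeq_epi8 (_mm256_loadu_si256 ((const __m256i *) (s + 32)),
			match));
  /* Combine both 32-bit match masks, as the salq $32 / orq above does.  */
  uint64_t eq = lo | (hi << 32);
  /* _bzhi_u64 (~0, remaining) keeps only bits below 'remaining', so
     matches past the length limit vanish without one compare-and-branch
     per vector (the bzhiq above).  */
  uint64_t hits = eq & _bzhi_u64 (~0ULL, (unsigned int) remaining);
  return hits ? s + _tzcnt_u64 (hits) : NULL;
}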
-- 
GitLab

SOURCES/glibc-RHEL-15696-24.patch (new file, 388 lines)
@@ -0,0 +1,388 @@
From 645a158978f9520e74074e8c14047503be4db0f0 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Wed, 9 Jun 2021 16:25:32 -0400
Subject: [PATCH] x86: Fix overflow bug with wmemchr-sse2 and wmemchr-avx2 [BZ
 #27974]
Content-type: text/plain; charset=UTF-8

This commit fixes the bug mentioned in the previous commit.

The previous implementations of wmemchr in these files relied
on n * sizeof(wchar_t) which was not guranteed by the standard.

The new overflow tests added in the previous commit now
pass (As well as all the other tests).

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
 sysdeps/x86_64/memchr.S                | 77 +++++++++++++++++++-------
 sysdeps/x86_64/multiarch/memchr-avx2.S | 58 +++++++++++++------
 2 files changed, 98 insertions(+), 37 deletions(-)

diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
index cb320257..24f9a0c5 100644
--- a/sysdeps/x86_64/memchr.S
+++ b/sysdeps/x86_64/memchr.S
@@ -21,9 +21,11 @@
 #ifdef USE_AS_WMEMCHR
 # define MEMCHR		wmemchr
 # define PCMPEQ		pcmpeqd
+# define CHAR_PER_VEC	4
 #else
 # define MEMCHR		memchr
 # define PCMPEQ		pcmpeqb
+# define CHAR_PER_VEC	16
 #endif
 
 /* fast SSE2 version with using pmaxub and 64 byte loop */
@@ -33,15 +35,14 @@ ENTRY(MEMCHR)
 	movd	%esi, %xmm1
 	mov	%edi, %ecx
 
+#ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%edx, %edx
+#endif
 #ifdef USE_AS_WMEMCHR
 	test	%RDX_LP, %RDX_LP
 	jz	L(return_null)
-	shl	$2, %RDX_LP
 #else
-# ifdef __ILP32__
-	/* Clear the upper 32 bits.  */
-	movl	%edx, %edx
-# endif
 	punpcklbw %xmm1, %xmm1
 	test	%RDX_LP, %RDX_LP
 	jz	L(return_null)
@@ -60,13 +61,16 @@ ENTRY(MEMCHR)
 	test	%eax, %eax
 
 	jnz	L(matches_1)
-	sub	$16, %rdx
+	sub	$CHAR_PER_VEC, %rdx
 	jbe	L(return_null)
 	add	$16, %rdi
 	and	$15, %ecx
 	and	$-16, %rdi
+#ifdef USE_AS_WMEMCHR
+	shr	$2, %ecx
+#endif
 	add	%rcx, %rdx
-	sub	$64, %rdx
+	sub	$(CHAR_PER_VEC * 4), %rdx
 	jbe	L(exit_loop)
 	jmp	L(loop_prolog)
 
@@ -77,16 +81,21 @@ L(crosscache):
 	movdqa	(%rdi), %xmm0
 
 	PCMPEQ	%xmm1, %xmm0
-/* Check if there is a match.  */
+	/* Check if there is a match.  */
 	pmovmskb %xmm0, %eax
-/* Remove the leading bytes.  */
+	/* Remove the leading bytes.  */
 	sar	%cl, %eax
 	test	%eax, %eax
 	je	L(unaligned_no_match)
-/* Check which byte is a match.  */
+	/* Check which byte is a match.  */
 	bsf	%eax, %eax
-
+#ifdef USE_AS_WMEMCHR
+	mov	%eax, %esi
+	shr	$2, %esi
+	sub	%rsi, %rdx
+#else
 	sub	%rax, %rdx
+#endif
 	jbe	L(return_null)
 	add	%rdi, %rax
 	add	%rcx, %rax
@@ -94,15 +103,18 @@ L(crosscache):
 
 	.p2align 4
 L(unaligned_no_match):
-        /* "rcx" is less than 16.  Calculate "rdx + rcx - 16" by using
+	/* "rcx" is less than 16.  Calculate "rdx + rcx - 16" by using
 	   "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void
 	   possible addition overflow.  */
 	neg	%rcx
 	add	$16, %rcx
+#ifdef USE_AS_WMEMCHR
+	shr	$2, %ecx
+#endif
 	sub	%rcx, %rdx
 	jbe	L(return_null)
 	add	$16, %rdi
-	sub	$64, %rdx
+	sub	$(CHAR_PER_VEC * 4), %rdx
 	jbe	L(exit_loop)
 
 	.p2align 4
@@ -135,7 +147,7 @@ L(loop_prolog):
 	test	$0x3f, %rdi
 	jz	L(align64_loop)
 
-	sub	$64, %rdx
+	sub	$(CHAR_PER_VEC * 4), %rdx
 	jbe	L(exit_loop)
 
 	movdqa	(%rdi), %xmm0
@@ -167,11 +179,14 @@ L(loop_prolog):
 	mov	%rdi, %rcx
 	and	$-64, %rdi
 	and	$63, %ecx
+#ifdef USE_AS_WMEMCHR
+	shr	$2, %ecx
+#endif
 	add	%rcx, %rdx
 
 	.p2align 4
 L(align64_loop):
-	sub	$64, %rdx
+	sub	$(CHAR_PER_VEC * 4), %rdx
 	jbe	L(exit_loop)
 	movdqa	(%rdi), %xmm0
 	movdqa	16(%rdi), %xmm2
@@ -218,7 +233,7 @@ L(align64_loop):
 
 	.p2align 4
 L(exit_loop):
-	add	$32, %edx
+	add	$(CHAR_PER_VEC * 2), %edx
 	jle	L(exit_loop_32)
 
 	movdqa	(%rdi), %xmm0
@@ -238,7 +253,7 @@ L(exit_loop):
 	pmovmskb %xmm3, %eax
 	test	%eax, %eax
 	jnz	L(matches32_1)
-	sub	$16, %edx
+	sub	$CHAR_PER_VEC, %edx
 	jle	L(return_null)
 
 	PCMPEQ	48(%rdi), %xmm1
@@ -250,13 +265,13 @@ L(exit_loop):
 
 	.p2align 4
 L(exit_loop_32):
-	add	$32, %edx
+	add	$(CHAR_PER_VEC * 2), %edx
 	movdqa	(%rdi), %xmm0
 	PCMPEQ	%xmm1, %xmm0
 	pmovmskb %xmm0, %eax
 	test	%eax, %eax
 	jnz	L(matches_1)
-	sub	$16, %edx
+	sub	$CHAR_PER_VEC, %edx
 	jbe	L(return_null)
 
 	PCMPEQ	16(%rdi), %xmm1
@@ -293,7 +308,13 @@ L(matches32):
 	.p2align 4
 L(matches_1):
 	bsf	%eax, %eax
+#ifdef USE_AS_WMEMCHR
+	mov	%eax, %esi
+	shr	$2, %esi
+	sub	%rsi, %rdx
+#else
 	sub	%rax, %rdx
+#endif
 	jbe	L(return_null)
 	add	%rdi, %rax
 	ret
@@ -301,7 +322,13 @@ L(matches_1):
 	.p2align 4
 L(matches16_1):
 	bsf	%eax, %eax
+#ifdef USE_AS_WMEMCHR
+	mov	%eax, %esi
+	shr	$2, %esi
+	sub	%rsi, %rdx
+#else
 	sub	%rax, %rdx
+#endif
 	jbe	L(return_null)
 	lea	16(%rdi, %rax), %rax
 	ret
@@ -309,7 +336,13 @@ L(matches16_1):
 	.p2align 4
 L(matches32_1):
 	bsf	%eax, %eax
+#ifdef USE_AS_WMEMCHR
+	mov	%eax, %esi
+	shr	$2, %esi
+	sub	%rsi, %rdx
+#else
 	sub	%rax, %rdx
+#endif
 	jbe	L(return_null)
 	lea	32(%rdi, %rax), %rax
 	ret
@@ -317,7 +350,13 @@ L(matches32_1):
 	.p2align 4
 L(matches48_1):
 	bsf	%eax, %eax
+#ifdef USE_AS_WMEMCHR
+	mov	%eax, %esi
+	shr	$2, %esi
+	sub	%rsi, %rdx
+#else
 	sub	%rax, %rdx
+#endif
 	jbe	L(return_null)
 	lea	48(%rdi, %rax), %rax
 	ret
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
index b377f22e..16027abb 100644
--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
@@ -54,21 +54,19 @@
 
 # define VEC_SIZE 32
 # define PAGE_SIZE 4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
 
 	.section SECTION(.text),"ax",@progbits
 ENTRY (MEMCHR)
 # ifndef USE_AS_RAWMEMCHR
 	/* Check for zero length.  */
-	test	%RDX_LP, %RDX_LP
-	jz	L(null)
-# endif
-# ifdef USE_AS_WMEMCHR
-	shl	$2, %RDX_LP
-# else
 #  ifdef __ILP32__
-	/* Clear the upper 32 bits.  */
-	movl	%edx, %edx
+	/* Clear upper bits.  */
+	and	%RDX_LP, %RDX_LP
+#  else
+	test	%RDX_LP, %RDX_LP
 #  endif
+	jz	L(null)
 # endif
 	/* Broadcast CHAR to YMMMATCH.  */
 	vmovd	%esi, %xmm0
@@ -84,7 +82,7 @@ ENTRY (MEMCHR)
 	vpmovmskb %ymm1, %eax
 # ifndef USE_AS_RAWMEMCHR
 	/* If length < CHAR_PER_VEC handle special.  */
-	cmpq	$VEC_SIZE, %rdx
+	cmpq	$CHAR_PER_VEC, %rdx
 	jbe	L(first_vec_x0)
 # endif
 	testl	%eax, %eax
@@ -98,6 +96,10 @@ ENTRY (MEMCHR)
 L(first_vec_x0):
 	/* Check if first match was before length.  */
 	tzcntl	%eax, %eax
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Multiply length by 4 to get byte count.  */
+	sall	$2, %edx
+#  endif
 	xorl	%ecx, %ecx
 	cmpl	%eax, %edx
 	leaq	(%rdi, %rax), %rax
@@ -110,12 +112,12 @@ L(null):
 # endif
 	.p2align 4
 L(cross_page_boundary):
-	/* Save pointer before aligning as its original value is necessary
-	   for computer return address if byte is found or adjusting length
-	   if it is not and this is memchr.  */
+	/* Save pointer before aligning as its original value is
+	   necessary for computer return address if byte is found or
+	   adjusting length if it is not and this is memchr.  */
 	movq	%rdi, %rcx
-	/* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and
-	   rdi for rawmemchr.  */
+	/* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr
+	   and rdi for rawmemchr.  */
 	orq	$(VEC_SIZE - 1), %ALGN_PTR_REG
 	VPCMPEQ	-(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
@@ -124,6 +126,10 @@ L(cross_page_boundary):
 	   match).  */
 	leaq	1(%ALGN_PTR_REG), %rsi
 	subq	%RRAW_PTR_REG, %rsi
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+	shrl	$2, %esi
+#  endif
 # endif
 	/* Remove the leading bytes.  */
 	sarxl	%ERAW_PTR_REG, %eax, %eax
@@ -181,6 +187,10 @@ L(cross_page_continue):
 	orq	$(VEC_SIZE - 1), %rdi
 	/* esi is for adjusting length to see if near the end.  */
 	leal	(VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %esi
+#  endif
 # else
 	orq	$(VEC_SIZE - 1), %rdi
 L(cross_page_continue):
@@ -213,7 +223,7 @@ L(cross_page_continue):
 
 # ifndef USE_AS_RAWMEMCHR
 	/* Check if at last VEC_SIZE * 4 length.  */
-	subq	$(VEC_SIZE * 4), %rdx
+	subq	$(CHAR_PER_VEC * 4), %rdx
 	jbe	L(last_4x_vec_or_less_cmpeq)
 	/* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
 	   length.  */
@@ -221,6 +231,10 @@ L(cross_page_continue):
 	movl	%edi, %ecx
 	orq	$(VEC_SIZE * 4 - 1), %rdi
 	andl	$(VEC_SIZE * 4 - 1), %ecx
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %ecx
+#  endif
 	addq	%rcx, %rdx
 # else
 	/* Align data to VEC_SIZE * 4 - 1 for loop.  */
@@ -250,15 +264,19 @@ L(loop_4x_vec):
 
 	subq	$-(VEC_SIZE * 4), %rdi
 
-	subq	$(VEC_SIZE * 4), %rdx
+	subq	$(CHAR_PER_VEC * 4), %rdx
 	ja	L(loop_4x_vec)
 
-	/* Fall through into less than 4 remaining vectors of length case.
-	 */
+	/* Fall through into less than 4 remaining vectors of length
+	   case.  */
 	VPCMPEQ	(VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
 	.p2align 4
 L(last_4x_vec_or_less):
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Multiply length by 4 to get byte count.  */
+	sall	$2, %edx
+#  endif
 	/* Check if first VEC contained match.  */
 	testl	%eax, %eax
 	jnz	L(first_vec_x1_check)
@@ -355,6 +373,10 @@ L(last_vec_x2_return):
 L(last_4x_vec_or_less_cmpeq):
 	VPCMPEQ	(VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Multiply length by 4 to get byte count.  */
+	sall	$2, %edx
+#  endif
 	subq	$-(VEC_SIZE * 4), %rdi
 	/* Check first VEC regardless.  */
 	testl	%eax, %eax
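The overflow this patch fixes is easy to state in C: the removed code scaled the element count into bytes up front (shl $2, %RDX_LP), so the byte count wraps for large n, while the fixed code keeps all length arithmetic in wchar_t units (CHAR_PER_VEC). A hedged scalar sketch of the two shapes follows; both functions are illustrative stand-ins, not the SIMD code.

#include <stddef.h>
#include <wchar.h>

/* Buggy shape (illustrative): n * sizeof (wchar_t) wraps for large n.
   With n = SIZE_MAX / 4 + 2, nbytes is 4, so only one wchar_t is ever
   checked even though the caller asked for a huge count.  */
wchar_t *
wmemchr_by_bytes (const wchar_t *s, wchar_t c, size_t n)
{
  size_t nbytes = n * sizeof (wchar_t);	/* may overflow */
  for (size_t i = 0; i < nbytes; i += sizeof (wchar_t))
    if (s[i / sizeof (wchar_t)] == c)
      return (wchar_t *) &s[i / sizeof (wchar_t)];
  return NULL;
}

/* Fixed shape: keep the counter in wchar_t units, as the patch does
   with CHAR_PER_VEC, so n never needs scaling and cannot wrap.  */
wchar_t *
wmemchr_by_chars (const wchar_t *s, wchar_t c, size_t n)
{
  for (size_t i = 0; i < n; i++)
    if (s[i] == c)
      return (wchar_t *) &s[i];
  return NULL;
}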
-- 
GitLab

SOURCES/glibc-RHEL-15696-25.patch (new file, 767 lines)
@@ -0,0 +1,767 @@
From aaa23c35071537e2dcf5807e956802ed215210aa Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon, 19 Apr 2021 19:36:07 -0400
Subject: [PATCH] x86: Optimize strlen-avx2.S
Content-type: text/plain; charset=UTF-8

No bug. This commit optimizes strlen-avx2.S. The optimizations are
mostly small things but they add up to roughly 10-30% performance
improvement for strlen. The results for strnlen are bit more
ambiguous. test-strlen, test-strnlen, test-wcslen, and test-wcsnlen
are all passing.

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
 sysdeps/x86_64/multiarch/ifunc-impl-list.c |  16 +-
 sysdeps/x86_64/multiarch/strlen-avx2.S     | 532 +++++++++++++--------
 2 files changed, 334 insertions(+), 214 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index cbfc1a5d..f1a6460a 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -285,10 +285,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/x86_64/multiarch/strlen.c.  */
   IFUNC_IMPL (i, name, strlen,
 	      IFUNC_IMPL_ADD (array, i, strlen,
-			      CPU_FEATURE_USABLE (AVX2),
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)),
 			      __strlen_avx2)
 	      IFUNC_IMPL_ADD (array, i, strlen,
 			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
 			       && CPU_FEATURE_USABLE (RTM)),
 			      __strlen_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, strlen,
@@ -301,10 +303,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/x86_64/multiarch/strnlen.c.  */
   IFUNC_IMPL (i, name, strnlen,
 	      IFUNC_IMPL_ADD (array, i, strnlen,
-			      CPU_FEATURE_USABLE (AVX2),
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)),
 			      __strnlen_avx2)
 	      IFUNC_IMPL_ADD (array, i, strnlen,
 			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
 			       && CPU_FEATURE_USABLE (RTM)),
 			      __strnlen_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, strnlen,
@@ -640,10 +644,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/x86_64/multiarch/wcslen.c.  */
   IFUNC_IMPL (i, name, wcslen,
 	      IFUNC_IMPL_ADD (array, i, wcslen,
-			      CPU_FEATURE_USABLE (AVX2),
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)),
 			      __wcslen_avx2)
 	      IFUNC_IMPL_ADD (array, i, wcslen,
 			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
 			       && CPU_FEATURE_USABLE (RTM)),
 			      __wcslen_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, wcslen,
@@ -656,10 +662,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/x86_64/multiarch/wcsnlen.c.  */
   IFUNC_IMPL (i, name, wcsnlen,
 	      IFUNC_IMPL_ADD (array, i, wcsnlen,
-			      CPU_FEATURE_USABLE (AVX2),
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)),
 			      __wcsnlen_avx2)
 	      IFUNC_IMPL_ADD (array, i, wcsnlen,
 			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
 			       && CPU_FEATURE_USABLE (RTM)),
 			      __wcsnlen_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, wcsnlen,
diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
index 82826e10..be8a5db5 100644
--- a/sysdeps/x86_64/multiarch/strlen-avx2.S
+++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
@@ -27,9 +27,11 @@
 # ifdef USE_AS_WCSLEN
 #  define VPCMPEQ	vpcmpeqd
 #  define VPMINU	vpminud
+#  define CHAR_SIZE	4
 # else
 #  define VPCMPEQ	vpcmpeqb
 #  define VPMINU	vpminub
+#  define CHAR_SIZE	1
 # endif
 
 # ifndef VZEROUPPER
@@ -41,349 +43,459 @@
 # endif
 
 # define VEC_SIZE 32
+# define PAGE_SIZE 4096
 
 	.section SECTION(.text),"ax",@progbits
ENTRY (STRLEN)
 # ifdef USE_AS_STRNLEN
-	/* Check for zero length.  */
+	/* Check zero length.  */
 	test	%RSI_LP, %RSI_LP
 	jz	L(zero)
+	/* Store max len in R8_LP before adjusting if using WCSLEN.  */
+	mov	%RSI_LP, %R8_LP
 #  ifdef USE_AS_WCSLEN
 	shl	$2, %RSI_LP
 #  elif defined __ILP32__
 	/* Clear the upper 32 bits.  */
 	movl	%esi, %esi
 #  endif
-	mov	%RSI_LP, %R8_LP
 # endif
-	movl	%edi, %ecx
+	movl	%edi, %eax
 	movq	%rdi, %rdx
 	vpxor	%xmm0, %xmm0, %xmm0
-
+	/* Clear high bits from edi. Only keeping bits relevant to page
+	   cross check.  */
+	andl	$(PAGE_SIZE - 1), %eax
 	/* Check if we may cross page boundary with one vector load.  */
-	andl	$(2 * VEC_SIZE - 1), %ecx
-	cmpl	$VEC_SIZE, %ecx
-	ja	L(cros_page_boundary)
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(cross_page_boundary)
 
 	/* Check the first VEC_SIZE bytes.  */
-	VPCMPEQ (%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-
+	VPCMPEQ	(%rdi), %ymm0, %ymm1
+	vpmovmskb	%ymm1, %eax
 # ifdef USE_AS_STRNLEN
-	jnz	L(first_vec_x0_check)
-	/* Adjust length and check the end of data.  */
-	subq	$VEC_SIZE, %rsi
-	jbe	L(max)
-# else
-	jnz	L(first_vec_x0)
+	/* If length < VEC_SIZE handle special.  */
+	cmpq	$VEC_SIZE, %rsi
+	jbe	L(first_vec_x0)
 # endif
-
-	/* Align data for aligned loads in the loop.  */
-	addq	$VEC_SIZE, %rdi
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
+	/* If empty continue to aligned_more. Otherwise return bit
+	   position of first match.  */
+	testl	%eax, %eax
+	jz	L(aligned_more)
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WCSLEN
+	shrl	$2, %eax
+# endif
+	VZEROUPPER_RETURN
 
 # ifdef USE_AS_STRNLEN
-	/* Adjust length.  */
-	addq	%rcx, %rsi
+L(zero):
+	xorl	%eax, %eax
+	ret
 
-	subq	$(VEC_SIZE * 4), %rsi
-	jbe	L(last_4x_vec_or_less)
+	.p2align 4
+L(first_vec_x0):
+	/* Set bit for max len so that tzcnt will return min of max len
+	   and position of first match.  */
+	btsq	%rsi, %rax
+	tzcntl	%eax, %eax
+#  ifdef USE_AS_WCSLEN
+	shrl	$2, %eax
+#  endif
+	VZEROUPPER_RETURN
 # endif
-	jmp	L(more_4x_vec)
 
 	.p2align 4
-L(cros_page_boundary):
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
-	VPCMPEQ (%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	/* Remove the leading bytes.  */
-	sarl	%cl, %eax
-	testl	%eax, %eax
-	jz	L(aligned_more)
+L(first_vec_x1):
 	tzcntl	%eax, %eax
+	/* Safe to use 32 bit instructions as these are only called for
+	   size = [1, 159].  */
 # ifdef USE_AS_STRNLEN
-	/* Check the end of data.  */
-	cmpq	%rax, %rsi
-	jbe	L(max)
+	/* Use ecx which was computed earlier to compute correct value.
+	 */
+	subl	$(VEC_SIZE * 4 + 1), %ecx
+	addl	%ecx, %eax
+# else
+	subl	%edx, %edi
+	incl	%edi
+	addl	%edi, %eax
 # endif
-	addq	%rdi, %rax
-	addq	%rcx, %rax
-	subq	%rdx, %rax
 # ifdef USE_AS_WCSLEN
-	shrq	$2, %rax
+	shrl	$2, %eax
 # endif
-L(return_vzeroupper):
-	ZERO_UPPER_VEC_REGISTERS_RETURN
+	VZEROUPPER_RETURN
 
 	.p2align 4
-L(aligned_more):
+L(first_vec_x2):
+	tzcntl	%eax, %eax
+	/* Safe to use 32 bit instructions as these are only called for
+	   size = [1, 159].  */
 # ifdef USE_AS_STRNLEN
-        /* "rcx" is less than VEC_SIZE.  Calculate "rdx + rcx - VEC_SIZE"
-	    with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE"
-	    to void possible addition overflow.  */
-	negq	%rcx
-	addq	$VEC_SIZE, %rcx
-
-	/* Check the end of data.  */
-	subq	%rcx, %rsi
-	jbe	L(max)
+	/* Use ecx which was computed earlier to compute correct value.
+	 */
+	subl	$(VEC_SIZE * 3 + 1), %ecx
+	addl	%ecx, %eax
+# else
+	subl	%edx, %edi
+	addl	$(VEC_SIZE + 1), %edi
+	addl	%edi, %eax
 # endif
+# ifdef USE_AS_WCSLEN
+	shrl	$2, %eax
+# endif
+	VZEROUPPER_RETURN
 
-	addq	$VEC_SIZE, %rdi
+	.p2align 4
+L(first_vec_x3):
+	tzcntl	%eax, %eax
+	/* Safe to use 32 bit instructions as these are only called for
+	   size = [1, 159].  */
+# ifdef USE_AS_STRNLEN
+	/* Use ecx which was computed earlier to compute correct value.
+	 */
+	subl	$(VEC_SIZE * 2 + 1), %ecx
+	addl	%ecx, %eax
+# else
+	subl	%edx, %edi
+	addl	$(VEC_SIZE * 2 + 1), %edi
+	addl	%edi, %eax
+# endif
+# ifdef USE_AS_WCSLEN
+	shrl	$2, %eax
+# endif
+	VZEROUPPER_RETURN
 
+	.p2align 4
+L(first_vec_x4):
+	tzcntl	%eax, %eax
+	/* Safe to use 32 bit instructions as these are only called for
+	   size = [1, 159].  */
 # ifdef USE_AS_STRNLEN
-	subq	$(VEC_SIZE * 4), %rsi
-	jbe	L(last_4x_vec_or_less)
+	/* Use ecx which was computed earlier to compute correct value.
+	 */
+	subl	$(VEC_SIZE + 1), %ecx
+	addl	%ecx, %eax
+# else
+	subl	%edx, %edi
+	addl	$(VEC_SIZE * 3 + 1), %edi
+	addl	%edi, %eax
 # endif
+# ifdef USE_AS_WCSLEN
+	shrl	$2, %eax
+# endif
+	VZEROUPPER_RETURN
 
-L(more_4x_vec):
+	.p2align 5
+L(aligned_more):
+	/* Align data to VEC_SIZE - 1. This is the same number of
+	   instructions as using andq with -VEC_SIZE but saves 4 bytes of
+	   code on the x4 check.  */
+	orq	$(VEC_SIZE - 1), %rdi
+L(cross_page_continue):
 	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
 	   since data is only aligned to VEC_SIZE.  */
-	VPCMPEQ (%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
-
-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
+# ifdef USE_AS_STRNLEN
+	/* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
+	   it simplies the logic in last_4x_vec_or_less.  */
+	leaq	(VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
+	subq	%rdx, %rcx
+# endif
+	/* Load first VEC regardless.  */
+	VPCMPEQ	1(%rdi), %ymm0, %ymm1
+# ifdef USE_AS_STRNLEN
+	/* Adjust length. If near end handle specially.  */
+	subq	%rcx, %rsi
+	jb	L(last_4x_vec_or_less)
+# endif
+	vpmovmskb	%ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x1)
 
-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
+	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb	%ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x2)
 
-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
+	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb	%ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x3)
 
-	addq	$(VEC_SIZE * 4), %rdi
-
-# ifdef USE_AS_STRNLEN
-	subq	$(VEC_SIZE * 4), %rsi
-	jbe	L(last_4x_vec_or_less)
-# endif
-
-	/* Align data to 4 * VEC_SIZE.  */
-	movq	%rdi, %rcx
-	andl	$(4 * VEC_SIZE - 1), %ecx
-	andq	$-(4 * VEC_SIZE), %rdi
+	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb	%ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x4)
 
+	/* Align data to VEC_SIZE * 4 - 1.  */
 # ifdef USE_AS_STRNLEN
-	/* Adjust length.  */
+	/* Before adjusting length check if at last VEC_SIZE * 4.  */
+	cmpq	$(VEC_SIZE * 4 - 1), %rsi
+	jbe	L(last_4x_vec_or_less_load)
+	incq	%rdi
+	movl	%edi, %ecx
+	orq	$(VEC_SIZE * 4 - 1), %rdi
+	andl	$(VEC_SIZE * 4 - 1), %ecx
+	/* Readjust length.  */
 	addq	%rcx, %rsi
+# else
+	incq	%rdi
+	orq	$(VEC_SIZE * 4 - 1), %rdi
 # endif
-
+	/* Compare 4 * VEC at a time forward.  */
 	.p2align 4
 L(loop_4x_vec):
-	/* Compare 4 * VEC at a time forward.  */
-	vmovdqa (%rdi), %ymm1
-	vmovdqa	VEC_SIZE(%rdi), %ymm2
-	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm3
-	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm4
-	VPMINU	%ymm1, %ymm2, %ymm5
-	VPMINU	%ymm3, %ymm4, %ymm6
-	VPMINU	%ymm5, %ymm6, %ymm5
-
-	VPCMPEQ	%ymm5, %ymm0, %ymm5
-	vpmovmskb %ymm5, %eax
-	testl	%eax, %eax
-	jnz	L(4x_vec_end)
-
-	addq	$(VEC_SIZE * 4), %rdi
-
-# ifndef USE_AS_STRNLEN
-	jmp	L(loop_4x_vec)
-# else
+# ifdef USE_AS_STRNLEN
+	/* Break if at end of length.  */
 	subq	$(VEC_SIZE * 4), %rsi
-	ja	L(loop_4x_vec)
-
-L(last_4x_vec_or_less):
-	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
-	addl	$(VEC_SIZE * 2), %esi
-	jle	L(last_2x_vec)
+	jb	L(last_4x_vec_or_less_cmpeq)
+# endif
+	/* Save some code size by microfusing VPMINU with the load. Since
+	   the matches in ymm2/ymm4 can only be returned if there where no
+	   matches in ymm1/ymm3 respectively there is no issue with overlap.
+	 */
+	vmovdqa	1(%rdi), %ymm1
+	VPMINU	(VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
+	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm3
+	VPMINU	(VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4
+
+	VPMINU	%ymm2, %ymm4, %ymm5
+	VPCMPEQ	%ymm5, %ymm0, %ymm5
+	vpmovmskb	%ymm5, %ecx
 
-	VPCMPEQ (%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
+	subq	$-(VEC_SIZE * 4), %rdi
+	testl	%ecx, %ecx
+	jz	L(loop_4x_vec)
 
-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x1)
 
-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
+	VPCMPEQ	%ymm1, %ymm0, %ymm1
+	vpmovmskb	%ymm1, %eax
+	subq	%rdx, %rdi
 	testl	%eax, %eax
+	jnz	L(last_vec_return_x0)
 
-	jnz	L(first_vec_x2_check)
-	subl	$VEC_SIZE, %esi
-	jle	L(max)
-
-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
+	VPCMPEQ	%ymm2, %ymm0, %ymm2
+	vpmovmskb	%ymm2, %eax
 	testl	%eax, %eax
-
-	jnz	L(first_vec_x3_check)
-	movq	%r8, %rax
-#  ifdef USE_AS_WCSLEN
+	jnz	L(last_vec_return_x1)
+
+	/* Combine last 2 VEC.  */
+	VPCMPEQ	%ymm3, %ymm0, %ymm3
+	vpmovmskb	%ymm3, %eax
+	/* rcx has combined result from all 4 VEC. It will only be used if
+	   the first 3 other VEC all did not contain a match.  */
+	salq	$32, %rcx
+	orq	%rcx, %rax
+	tzcntq	%rax, %rax
+	subq	$(VEC_SIZE * 2 - 1), %rdi
+	addq	%rdi, %rax
+# ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
-#  endif
+# endif
 	VZEROUPPER_RETURN
 
+
+# ifdef USE_AS_STRNLEN
 	.p2align 4
-L(last_2x_vec):
-	addl	$(VEC_SIZE * 2), %esi
-	VPCMPEQ (%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
+L(last_4x_vec_or_less_load):
+	/* Depending on entry adjust rdi / prepare first VEC in ymm1.  */
+	subq	$-(VEC_SIZE * 4), %rdi
+L(last_4x_vec_or_less_cmpeq):
+	VPCMPEQ	1(%rdi), %ymm0, %ymm1
+L(last_4x_vec_or_less):
 
-	jnz	L(first_vec_x0_check)
-	subl	$VEC_SIZE, %esi
-	jle	L(max)
+	vpmovmskb	%ymm1, %eax
+	/* If remaining length > VEC_SIZE * 2. This works if esi is off by
+	   VEC_SIZE * 4.  */
+	testl	$(VEC_SIZE * 2), %esi
+	jnz	L(last_4x_vec)
 
-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
+	/* length may have been negative or positive by an offset of
+	   VEC_SIZE * 4 depending on where this was called from. This fixes
+	   that.  */
+	andl	$(VEC_SIZE * 4 - 1), %esi
 	testl	%eax, %eax
-	jnz	L(first_vec_x1_check)
-	movq	%r8, %rax
-#  ifdef USE_AS_WCSLEN
-	shrq	$2, %rax
-#  endif
-	VZEROUPPER_RETURN
+	jnz	L(last_vec_x1_check)
 
-	.p2align 4
-L(first_vec_x0_check):
+	subl	$VEC_SIZE, %esi
+	jb	L(max)
+
+	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb	%ymm1, %eax
 	tzcntl	%eax, %eax
 	/* Check the end of data.  */
-	cmpq	%rax, %rsi
-	jbe	L(max)
+	cmpl	%eax, %esi
+	jb	L(max)
+	subq	%rdx, %rdi
+	addl	$(VEC_SIZE + 1), %eax
 	addq	%rdi, %rax
-	subq	%rdx, %rax
 #  ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
 #  endif
 	VZEROUPPER_RETURN
+# endif
 
 	.p2align 4
-L(first_vec_x1_check):
+L(last_vec_return_x0):
 	tzcntl	%eax, %eax
-	/* Check the end of data.  */
-	cmpq	%rax, %rsi
-	jbe	L(max)
-	addq	$VEC_SIZE, %rax
+	subq	$(VEC_SIZE * 4 - 1), %rdi
 	addq	%rdi, %rax
-	subq	%rdx, %rax
-#  ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
-#  endif
+# endif
 	VZEROUPPER_RETURN
 
 	.p2align 4
-L(first_vec_x2_check):
+L(last_vec_return_x1):
 	tzcntl	%eax, %eax
-	/* Check the end of data.  */
-	cmpq	%rax, %rsi
-	jbe	L(max)
-	addq	$(VEC_SIZE * 2), %rax
+	subq	$(VEC_SIZE * 3 - 1), %rdi
 	addq	%rdi, %rax
-	subq	%rdx, %rax
-#  ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
-#  endif
+# endif
 	VZEROUPPER_RETURN
 
+# ifdef USE_AS_STRNLEN
 	.p2align 4
-L(first_vec_x3_check):
+L(last_vec_x1_check):
+
 	tzcntl	%eax, %eax
 	/* Check the end of data.  */
-	cmpq	%rax, %rsi
-	jbe	L(max)
-	addq	$(VEC_SIZE * 3), %rax
+	cmpl	%eax, %esi
+	jb	L(max)
+	subq	%rdx, %rdi
+	incl	%eax
 	addq	%rdi, %rax
-	subq	%rdx, %rax
 #  ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
 #  endif
 	VZEROUPPER_RETURN
 
-	.p2align 4
 L(max):
 	movq	%r8, %rax
+	VZEROUPPER_RETURN
+
+	.p2align 4
+L(last_4x_vec):
+	/* Test first 2x VEC normally.  */
+	testl	%eax, %eax
+	jnz	L(last_vec_x1)
+
+	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb	%ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x2)
+
+	/* Normalize length.  */
+	andl	$(VEC_SIZE * 4 - 1), %esi
+	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb	%ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x3)
+
+	subl	$(VEC_SIZE * 3), %esi
+	jb	L(max)
+
+	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb	%ymm1, %eax
+	tzcntl	%eax, %eax
+	/* Check the end of data.  */
+	cmpl	%eax, %esi
+	jb	L(max)
+	subq	%rdx, %rdi
+	addl	$(VEC_SIZE * 3 + 1), %eax
+	addq	%rdi, %rax
 #  ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
 #  endif
 	VZEROUPPER_RETURN
 
-	.p2align 4
-L(zero):
-	xorl	%eax, %eax
-	ret
-# endif
 
 	.p2align 4
-L(first_vec_x0):
+L(last_vec_x1):
+	/* essentially duplicates of first_vec_x1 but use 64 bit
+	   instructions.  */
 	tzcntl	%eax, %eax
+	subq	%rdx, %rdi
+	incl	%eax
 	addq	%rdi, %rax
-	subq	%rdx, %rax
-# ifdef USE_AS_WCSLEN
+#  ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
-# endif
+#  endif
 	VZEROUPPER_RETURN
 
 	.p2align 4
-L(first_vec_x1):
+L(last_vec_x2):
+	/* essentially duplicates of first_vec_x1 but use 64 bit
+	   instructions.  */
 	tzcntl	%eax, %eax
-	addq	$VEC_SIZE, %rax
+	subq	%rdx, %rdi
+	addl	$(VEC_SIZE + 1), %eax
 	addq	%rdi, %rax
-	subq	%rdx, %rax
-# ifdef USE_AS_WCSLEN
+#  ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
-# endif
+#  endif
 	VZEROUPPER_RETURN
 
 	.p2align 4
-L(first_vec_x2):
+L(last_vec_x3):
 	tzcntl	%eax, %eax
-	addq	$(VEC_SIZE * 2), %rax
+	subl	$(VEC_SIZE * 2), %esi
+	/* Check the end of data.  */
+	cmpl	%eax, %esi
+	jb	L(max_end)
+	subq	%rdx, %rdi
+	addl	$(VEC_SIZE * 2 + 1), %eax
 	addq	%rdi, %rax
-	subq	%rdx, %rax
-# ifdef USE_AS_WCSLEN
+#  ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
-# endif
+#  endif
+	VZEROUPPER_RETURN
+L(max_end):
+	movq	%r8, %rax
 	VZEROUPPER_RETURN
+# endif
 
+	/* Cold case for crossing page with first load.	 */
 	.p2align 4
-L(4x_vec_end):
-	VPCMPEQ	%ymm1, %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
-	VPCMPEQ %ymm2, %ymm0, %ymm2
-	vpmovmskb %ymm2, %eax
+L(cross_page_boundary):
+	/* Align data to VEC_SIZE - 1.  */
+	orq	$(VEC_SIZE - 1), %rdi
+	VPCMPEQ	-(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb	%ymm1, %eax
+	/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
+	   so no need to manually mod rdx.  */
+	sarxl	%edx, %eax, %eax
+# ifdef USE_AS_STRNLEN
 	testl	%eax, %eax
-	jnz	L(first_vec_x1)
-	VPCMPEQ %ymm3, %ymm0, %ymm3
-	vpmovmskb %ymm3, %eax
+	jnz	L(cross_page_less_vec)
+	leaq	1(%rdi), %rcx
+	subq	%rdx, %rcx
+	/* Check length.  */
+	cmpq	%rsi, %rcx
+	jb	L(cross_page_continue)
+	movq	%r8, %rax
+# else
 	testl	%eax, %eax
-	jnz	L(first_vec_x2)
-	VPCMPEQ %ymm4, %ymm0, %ymm4
-	vpmovmskb %ymm4, %eax
-L(first_vec_x3):
+	jz	L(cross_page_continue)
 	tzcntl	%eax, %eax
-	addq	$(VEC_SIZE * 3), %rax
-	addq	%rdi, %rax
-	subq	%rdx, %rax
-# ifdef USE_AS_WCSLEN
-	shrq	$2, %rax
+#  ifdef USE_AS_WCSLEN
+	shrl	$2, %eax
+#  endif
 # endif
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
+
+# ifdef USE_AS_STRNLEN
+	.p2align 4
+L(cross_page_less_vec):
+	tzcntl	%eax, %eax
+	cmpq	%rax, %rsi
+	cmovb	%esi, %eax
+#  ifdef USE_AS_WCSLEN
+	shrl	$2, %eax
+#  endif
 	VZEROUPPER_RETURN
+# endif
 
 END (STRLEN)
 #endif
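One of the small wins above is L(first_vec_x0): instead of comparing the match position against the strnlen bound and branching, btsq sets bit maxlen in the match mask so a single tzcnt yields min(position of first NUL, maxlen). A short C sketch of the trick follows; the helper name is hypothetical and it assumes maxlen < 64, which holds in the assembly since maxlen <= VEC_SIZE there.

#include <immintrin.h>
#include <stdint.h>

/* Illustrative helper (not glibc's): 'match_mask' has bit i set when
   byte i of the first vector is NUL.  Setting bit 'maxlen' first makes
   one tzcnt return min(first NUL position, maxlen), removing the
   separate compare-and-branch against the length bound.  */
static unsigned int
bounded_first_match (uint64_t match_mask, unsigned int maxlen)
{
  match_mask |= 1ULL << maxlen;		/* btsq %rsi, %rax */
  return (unsigned int) _tzcnt_u64 (match_mask);
}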
-- 
GitLab

SOURCES/glibc-RHEL-15696-26.patch (new file, 701 lines)
@@ -0,0 +1,701 @@
From 2a76821c3081d2c0231ecd2618f52662cb48fccd Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon, 3 May 2021 03:03:19 -0400
Subject: [PATCH] x86: Optimize memchr-evex.S
Content-type: text/plain; charset=UTF-8

No bug. This commit optimizes memchr-evex.S. The optimizations include
replacing some branches with cmovcc, avoiding some branches entirely
in the less_4x_vec case, making the page cross logic less strict,
saving some ALU in the alignment process, and most importantly
increasing ILP in the 4x loop. test-memchr, test-rawmemchr, and
test-wmemchr are all passing.

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
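The cmovcc substitution mentioned here shows up in the new L(first_vec_x0) below (xorl %ecx, %ecx; cmpl; leaq; cmovle %rcx, %rax): the candidate pointer is computed unconditionally and NULL is selected without a taken/not-taken branch. A hedged C sketch of the pattern; the helper is illustrative, not glibc's code, and whether a compiler lowers the ternary to cmov is its own choice, though this shape typically does:

#include <stddef.h>

/* Branchless select: the match only counts if it lies within 'len';
   otherwise return NULL.  Mirrors "cmovle %rcx, %rax".  */
static const char *
select_match (const char *base, unsigned int first_match, size_t len)
{
  const char *hit = base + first_match;
  return first_match < len ? hit : NULL;
}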
|  | ---
 | ||||||
|  |  sysdeps/x86_64/multiarch/memchr-evex.S | 547 +++++++++++++++---------- | ||||||
|  |  1 file changed, 322 insertions(+), 225 deletions(-) | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
 | ||||||
|  | index 6dd5d67b..81d5cd64 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/memchr-evex.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/memchr-evex.S
 | ||||||
|  | @@ -26,14 +26,28 @@
 | ||||||
|  |   | ||||||
|  |  # ifdef USE_AS_WMEMCHR | ||||||
|  |  #  define VPBROADCAST	vpbroadcastd | ||||||
|  | -#  define VPCMP		vpcmpd
 | ||||||
|  | -#  define SHIFT_REG	r8d
 | ||||||
|  | +#  define VPMINU	vpminud
 | ||||||
|  | +#  define VPCMP	vpcmpd
 | ||||||
|  | +#  define VPCMPEQ	vpcmpeqd
 | ||||||
|  | +#  define CHAR_SIZE	4
 | ||||||
|  |  # else | ||||||
|  |  #  define VPBROADCAST	vpbroadcastb | ||||||
|  | -#  define VPCMP		vpcmpb
 | ||||||
|  | -#  define SHIFT_REG	ecx
 | ||||||
|  | +#  define VPMINU	vpminub
 | ||||||
|  | +#  define VPCMP	vpcmpb
 | ||||||
|  | +#  define VPCMPEQ	vpcmpeqb
 | ||||||
|  | +#  define CHAR_SIZE	1
 | ||||||
|  |  # endif | ||||||
|  |   | ||||||
|  | +# ifdef USE_AS_RAWMEMCHR
 | ||||||
|  | +#  define RAW_PTR_REG	rcx
 | ||||||
|  | +#  define ALGN_PTR_REG	rdi
 | ||||||
|  | +# else
 | ||||||
|  | +#  define RAW_PTR_REG	rdi
 | ||||||
|  | +#  define ALGN_PTR_REG	rcx
 | ||||||
|  | +# endif
 | ||||||
|  | +
 | ||||||
|  | +# define XMMZERO	xmm23
 | ||||||
|  | +# define YMMZERO	ymm23
 | ||||||
|  |  # define XMMMATCH	xmm16 | ||||||
|  |  # define YMMMATCH	ymm16 | ||||||
|  |  # define YMM1		ymm17 | ||||||
|  | @@ -44,6 +58,8 @@
 | ||||||
|  |  # define YMM6		ymm22 | ||||||
|  |   | ||||||
|  |  # define VEC_SIZE 32 | ||||||
|  | +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
 | ||||||
|  | +# define PAGE_SIZE 4096
 | ||||||
|  |   | ||||||
|  |  	.section .text.evex,"ax",@progbits | ||||||
|  |  ENTRY (MEMCHR) | ||||||
|  | @@ -51,11 +67,7 @@ ENTRY (MEMCHR)
 | ||||||
|  |  	/* Check for zero length.  */ | ||||||
|  |  	test	%RDX_LP, %RDX_LP | ||||||
|  |  	jz	L(zero) | ||||||
|  | -# endif
 | ||||||
|  | -	movl	%edi, %ecx
 | ||||||
|  | -# ifdef USE_AS_WMEMCHR
 | ||||||
|  | -	shl	$2, %RDX_LP
 | ||||||
|  | -# else
 | ||||||
|  | +
 | ||||||
|  |  #  ifdef __ILP32__ | ||||||
|  |  	/* Clear the upper 32 bits.  */ | ||||||
|  |  	movl	%edx, %edx | ||||||
|  | @@ -64,318 +76,403 @@ ENTRY (MEMCHR)
 | ||||||
|  |  	/* Broadcast CHAR to YMMMATCH.  */ | ||||||
|  |  	VPBROADCAST %esi, %YMMMATCH | ||||||
|  |  	/* Check if we may cross page boundary with one vector load.  */ | ||||||
|  | -	andl	$(2 * VEC_SIZE - 1), %ecx
 | ||||||
|  | -	cmpl	$VEC_SIZE, %ecx
 | ||||||
|  | -	ja	L(cros_page_boundary)
 | ||||||
|  | +	movl	%edi, %eax
 | ||||||
|  | +	andl	$(PAGE_SIZE - 1), %eax
 | ||||||
|  | +	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 | ||||||
|  | +	ja	L(cross_page_boundary)
 | ||||||
|  |   | ||||||
|  |  	/* Check the first VEC_SIZE bytes.  */ | ||||||
|  | -	VPCMP	$0, (%rdi), %YMMMATCH, %k1
 | ||||||
|  | -	kmovd	%k1, %eax
 | ||||||
|  | -	testl	%eax, %eax
 | ||||||
|  | -
 | ||||||
|  | +	VPCMP	$0, (%rdi), %YMMMATCH, %k0
 | ||||||
|  | +	kmovd	%k0, %eax
 | ||||||
|  |  # ifndef USE_AS_RAWMEMCHR | ||||||
|  | -	jnz	L(first_vec_x0_check)
 | ||||||
|  | -	/* Adjust length and check the end of data.  */
 | ||||||
|  | -	subq	$VEC_SIZE, %rdx
 | ||||||
|  | -	jbe	L(zero)
 | ||||||
|  | +	/* If length < CHAR_PER_VEC handle special.  */
 | ||||||
|  | +	cmpq	$CHAR_PER_VEC, %rdx
 | ||||||
|  | +	jbe	L(first_vec_x0)
 | ||||||
|  | +# endif
 | ||||||
|  | +	testl	%eax, %eax
 | ||||||
|  | +	jz	L(aligned_more)
 | ||||||
|  | +	tzcntl	%eax, %eax
 | ||||||
|  | +# ifdef USE_AS_WMEMCHR
 | ||||||
|  | +	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
 | ||||||
|  | +	leaq	(%rdi, %rax, CHAR_SIZE), %rax
 | ||||||
|  |  # else | ||||||
|  | -	jnz	L(first_vec_x0)
 | ||||||
|  | +	addq	%rdi, %rax
 | ||||||
|  |  # endif | ||||||
|  | -
 | ||||||
|  | -	/* Align data for aligned loads in the loop.  */
 | ||||||
|  | -	addq	$VEC_SIZE, %rdi
 | ||||||
|  | -	andl	$(VEC_SIZE - 1), %ecx
 | ||||||
|  | -	andq	$-VEC_SIZE, %rdi
 | ||||||
|  | +	ret
 | ||||||
|  |   | ||||||
|  |  # ifndef USE_AS_RAWMEMCHR | ||||||
|  | -	/* Adjust length.  */
 | ||||||
|  | -	addq	%rcx, %rdx
 | ||||||
|  | -
 | ||||||
|  | -	subq	$(VEC_SIZE * 4), %rdx
 | ||||||
|  | -	jbe	L(last_4x_vec_or_less)
 | ||||||
|  | -# endif
 | ||||||
|  | -	jmp	L(more_4x_vec)
 | ||||||
|  | +L(zero):
 | ||||||
|  | +	xorl	%eax, %eax
 | ||||||
|  | +	ret
 | ||||||
|  |   | ||||||
|  | +	.p2align 5
 | ||||||
|  | +L(first_vec_x0):
 | ||||||
|  | +	/* Check if first match was before length.  */
 | ||||||
|  | +	tzcntl	%eax, %eax
 | ||||||
|  | +	xorl	%ecx, %ecx
 | ||||||
|  | +	cmpl	%eax, %edx
 | ||||||
|  | +	leaq	(%rdi, %rax, CHAR_SIZE), %rax
 | ||||||
|  | +	cmovle	%rcx, %rax
 | ||||||
|  | +	ret
 | ||||||
|  | +# else
 | ||||||
|  | +	/* NB: first_vec_x0 is 17 bytes which will leave
 | ||||||
|  | +	   cross_page_boundary (which is relatively cold) close enough
 | ||||||
|  | +	   to ideal alignment. So only realign L(cross_page_boundary) if
 | ||||||
|  | +	   rawmemchr.  */
 | ||||||
|  |  	.p2align 4 | ||||||
|  | -L(cros_page_boundary):
 | ||||||
|  | -	andl	$(VEC_SIZE - 1), %ecx
 | ||||||
|  | +# endif
 | ||||||
|  | +L(cross_page_boundary):
 | ||||||
|  | +	/* Save pointer before aligning as its original value is
 | ||||||
|  | +	   necessary for computing the return address if byte is found or
 | ||||||
|  | +	   adjusting length if it is not and this is memchr.  */
 | ||||||
|  | +	movq	%rdi, %rcx
 | ||||||
|  | +	/* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi
 | ||||||
|  | +	   for rawmemchr.  */
 | ||||||
|  | +	andq	$-VEC_SIZE, %ALGN_PTR_REG
 | ||||||
|  | +	VPCMP	$0, (%ALGN_PTR_REG), %YMMMATCH, %k0
 | ||||||
|  | +	kmovd	%k0, %r8d
 | ||||||
|  |  # ifdef USE_AS_WMEMCHR | ||||||
|  | -	/* NB: Divide shift count by 4 since each bit in K1 represent 4
 | ||||||
|  | +	/* NB: Divide shift count by 4 since each bit in K0 represent 4
 | ||||||
|  |  	   bytes.  */ | ||||||
|  | -	movl	%ecx, %SHIFT_REG
 | ||||||
|  | -	sarl	$2, %SHIFT_REG
 | ||||||
|  | +	sarl	$2, %eax
 | ||||||
|  | +# endif
 | ||||||
|  | +# ifndef USE_AS_RAWMEMCHR
 | ||||||
|  | +	movl	$(PAGE_SIZE / CHAR_SIZE), %esi
 | ||||||
|  | +	subl	%eax, %esi
 | ||||||
|  |  # endif | ||||||
|  | -	andq	$-VEC_SIZE, %rdi
 | ||||||
|  | -	VPCMP	$0, (%rdi), %YMMMATCH, %k1
 | ||||||
|  | -	kmovd	%k1, %eax
 | ||||||
|  | -	/* Remove the leading bytes.  */
 | ||||||
|  | -	sarxl	%SHIFT_REG, %eax, %eax
 | ||||||
|  | -	testl	%eax, %eax
 | ||||||
|  | -	jz	L(aligned_more)
 | ||||||
|  | -	tzcntl	%eax, %eax
 | ||||||
|  |  # ifdef USE_AS_WMEMCHR | ||||||
|  | -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
 | ||||||
|  | -	sall	$2, %eax
 | ||||||
|  | +	andl	$(CHAR_PER_VEC - 1), %eax
 | ||||||
|  |  # endif | ||||||
|  | +	/* Remove the leading bytes.  */
 | ||||||
|  | +	sarxl	%eax, %r8d, %eax
 | ||||||
|  |  # ifndef USE_AS_RAWMEMCHR | ||||||
|  |  	/* Check the end of data.  */ | ||||||
|  | -	cmpq	%rax, %rdx
 | ||||||
|  | -	jbe	L(zero)
 | ||||||
|  | +	cmpq	%rsi, %rdx
 | ||||||
|  | +	jbe	L(first_vec_x0)
 | ||||||
|  | +# endif
 | ||||||
|  | +	testl	%eax, %eax
 | ||||||
|  | +	jz	L(cross_page_continue)
 | ||||||
|  | +	tzcntl	%eax, %eax
 | ||||||
|  | +# ifdef USE_AS_WMEMCHR
 | ||||||
|  | +	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
 | ||||||
|  | +	leaq	(%RAW_PTR_REG, %rax, CHAR_SIZE), %rax
 | ||||||
|  | +# else
 | ||||||
|  | +	addq	%RAW_PTR_REG, %rax
 | ||||||
|  |  # endif | ||||||
|  | -	addq	%rdi, %rax
 | ||||||
|  | -	addq	%rcx, %rax
 | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  |  	.p2align 4 | ||||||
|  | -L(aligned_more):
 | ||||||
|  | -# ifndef USE_AS_RAWMEMCHR
 | ||||||
|  | -        /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
 | ||||||
|  | -	   instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
 | ||||||
|  | -	   overflow.  */
 | ||||||
|  | -	negq	%rcx
 | ||||||
|  | -	addq	$VEC_SIZE, %rcx
 | ||||||
|  | +L(first_vec_x1):
 | ||||||
|  | +	tzcntl	%eax, %eax
 | ||||||
|  | +	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
 | ||||||
|  | +	ret
 | ||||||
|  |   | ||||||
|  | -	/* Check the end of data.  */
 | ||||||
|  | -	subq	%rcx, %rdx
 | ||||||
|  | -	jbe	L(zero)
 | ||||||
|  | -# endif
 | ||||||
|  | +	.p2align 4
 | ||||||
|  | +L(first_vec_x2):
 | ||||||
|  | +	tzcntl	%eax, %eax
 | ||||||
|  | +	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 | ||||||
|  | +	ret
 | ||||||
|  |   | ||||||
|  | -	addq	$VEC_SIZE, %rdi
 | ||||||
|  | +	.p2align 4
 | ||||||
|  | +L(first_vec_x3):
 | ||||||
|  | +	tzcntl	%eax, %eax
 | ||||||
|  | +	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
 | ||||||
|  | +	ret
 | ||||||
|  |   | ||||||
|  | -# ifndef USE_AS_RAWMEMCHR
 | ||||||
|  | -	subq	$(VEC_SIZE * 4), %rdx
 | ||||||
|  | -	jbe	L(last_4x_vec_or_less)
 | ||||||
|  | -# endif
 | ||||||
|  | +	.p2align 4
 | ||||||
|  | +L(first_vec_x4):
 | ||||||
|  | +	tzcntl	%eax, %eax
 | ||||||
|  | +	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
 | ||||||
|  | +	ret
 | ||||||
|  |   | ||||||
|  | -L(more_4x_vec):
 | ||||||
|  | +	.p2align 5
 | ||||||
|  | +L(aligned_more):
 | ||||||
|  |  	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time | ||||||
|  |  	   since data is only aligned to VEC_SIZE.  */ | ||||||
|  | -	VPCMP	$0, (%rdi), %YMMMATCH, %k1
 | ||||||
|  | -	kmovd	%k1, %eax
 | ||||||
|  | -	testl	%eax, %eax
 | ||||||
|  | -	jnz	L(first_vec_x0)
 | ||||||
|  |   | ||||||
|  | -	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
 | ||||||
|  | -	kmovd	%k1, %eax
 | ||||||
|  | +# ifndef USE_AS_RAWMEMCHR
 | ||||||
|  | +	/* Align data to VEC_SIZE.  */
 | ||||||
|  | +L(cross_page_continue):
 | ||||||
|  | +	xorl	%ecx, %ecx
 | ||||||
|  | +	subl	%edi, %ecx
 | ||||||
|  | +	andq	$-VEC_SIZE, %rdi
 | ||||||
|  | +	/* esi is for adjusting length to see if near the end.  */
 | ||||||
|  | +	leal	(VEC_SIZE * 5)(%rdi, %rcx), %esi
 | ||||||
|  | +#  ifdef USE_AS_WMEMCHR
 | ||||||
|  | +	/* NB: Divide bytes by 4 to get the wchar_t count.  */
 | ||||||
|  | +	sarl	$2, %esi
 | ||||||
|  | +#  endif
 | ||||||
|  | +# else
 | ||||||
|  | +	andq	$-VEC_SIZE, %rdi
 | ||||||
|  | +L(cross_page_continue):
 | ||||||
|  | +# endif
 | ||||||
|  | +	/* Load first VEC regardless.  */
 | ||||||
|  | +	VPCMP	$0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
 | ||||||
|  | +	kmovd	%k0, %eax
 | ||||||
|  | +# ifndef USE_AS_RAWMEMCHR
 | ||||||
|  | +	/* Adjust length. If near end handle specially.  */
 | ||||||
|  | +	subq	%rsi, %rdx
 | ||||||
|  | +	jbe	L(last_4x_vec_or_less)
 | ||||||
|  | +# endif
 | ||||||
|  |  	testl	%eax, %eax | ||||||
|  |  	jnz	L(first_vec_x1) | ||||||
|  |   | ||||||
|  | -	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
 | ||||||
|  | -	kmovd	%k1, %eax
 | ||||||
|  | +	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
 | ||||||
|  | +	kmovd	%k0, %eax
 | ||||||
|  |  	testl	%eax, %eax | ||||||
|  |  	jnz	L(first_vec_x2) | ||||||
|  |   | ||||||
|  | -	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
 | ||||||
|  | -	kmovd	%k1, %eax
 | ||||||
|  | +	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
 | ||||||
|  | +	kmovd	%k0, %eax
 | ||||||
|  |  	testl	%eax, %eax | ||||||
|  |  	jnz	L(first_vec_x3) | ||||||
|  |   | ||||||
|  | -	addq	$(VEC_SIZE * 4), %rdi
 | ||||||
|  | +	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
 | ||||||
|  | +	kmovd	%k0, %eax
 | ||||||
|  | +	testl	%eax, %eax
 | ||||||
|  | +	jnz	L(first_vec_x4)
 | ||||||
|  | +
 | ||||||
|  |   | ||||||
|  |  # ifndef USE_AS_RAWMEMCHR | ||||||
|  | -	subq	$(VEC_SIZE * 4), %rdx
 | ||||||
|  | -	jbe	L(last_4x_vec_or_less)
 | ||||||
|  | -# endif
 | ||||||
|  | +	/* Check if at last CHAR_PER_VEC * 4 length.  */
 | ||||||
|  | +	subq	$(CHAR_PER_VEC * 4), %rdx
 | ||||||
|  | +	jbe	L(last_4x_vec_or_less_cmpeq)
 | ||||||
|  | +	addq	$VEC_SIZE, %rdi
 | ||||||
|  |   | ||||||
|  | -	/* Align data to 4 * VEC_SIZE.  */
 | ||||||
|  | -	movq	%rdi, %rcx
 | ||||||
|  | -	andl	$(4 * VEC_SIZE - 1), %ecx
 | ||||||
|  | +	/* Align data to VEC_SIZE * 4 for the loop and readjust length.
 | ||||||
|  | +	 */
 | ||||||
|  | +#  ifdef USE_AS_WMEMCHR
 | ||||||
|  | +	movl	%edi, %ecx
 | ||||||
|  |  	andq	$-(4 * VEC_SIZE), %rdi | ||||||
|  | -
 | ||||||
|  | -# ifndef USE_AS_RAWMEMCHR
 | ||||||
|  | -	/* Adjust length.  */
 | ||||||
|  | +	andl	$(VEC_SIZE * 4 - 1), %ecx
 | ||||||
|  | +	/* NB: Divide bytes by 4 to get the wchar_t count.  */
 | ||||||
|  | +	sarl	$2, %ecx
 | ||||||
|  |  	addq	%rcx, %rdx | ||||||
|  | +#  else
 | ||||||
|  | +	addq	%rdi, %rdx
 | ||||||
|  | +	andq	$-(4 * VEC_SIZE), %rdi
 | ||||||
|  | +	subq	%rdi, %rdx
 | ||||||
|  | +#  endif
 | ||||||
|  | +# else
 | ||||||
|  | +	addq	$VEC_SIZE, %rdi
 | ||||||
|  | +	andq	$-(4 * VEC_SIZE), %rdi
 | ||||||
|  |  # endif | ||||||
|  |   | ||||||
|  | +	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
 | ||||||
|  | +
 | ||||||
|  | +	/* Compare 4 * VEC at a time forward.  */
 | ||||||
|  |  	.p2align 4 | ||||||
|  |  L(loop_4x_vec): | ||||||
|  | -	/* Compare 4 * VEC at a time forward.  */
 | ||||||
|  | -	VPCMP	$0, (%rdi), %YMMMATCH, %k1
 | ||||||
|  | -	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k2
 | ||||||
|  | -	kord	%k1, %k2, %k5
 | ||||||
|  | -	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
 | ||||||
|  | -	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
 | ||||||
|  | -
 | ||||||
|  | -	kord	%k3, %k4, %k6
 | ||||||
|  | -	kortestd %k5, %k6
 | ||||||
|  | -	jnz	L(4x_vec_end)
 | ||||||
|  | -
 | ||||||
|  | -	addq	$(VEC_SIZE * 4), %rdi
 | ||||||
|  | -
 | ||||||
|  | +	/* It would be possible to save some instructions using 4x VPCMP
 | ||||||
|  | +	   but bottleneck on port 5 makes it not worth it.  */
 | ||||||
|  | +	VPCMP	$4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
 | ||||||
|  | +	/* xor will set bytes match esi to zero.  */
 | ||||||
|  | +	vpxorq	(VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
 | ||||||
|  | +	vpxorq	(VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
 | ||||||
|  | +	VPCMP	$0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
 | ||||||
|  | +	/* Reduce VEC2 / VEC3 with min and VEC1 with zero mask.  */
 | ||||||
|  | +	VPMINU	%YMM2, %YMM3, %YMM3 {%k1} {z}
 | ||||||
|  | +	VPCMP	$0, %YMM3, %YMMZERO, %k2
 | ||||||
|  |  # ifdef USE_AS_RAWMEMCHR | ||||||
|  | -	jmp	L(loop_4x_vec)
 | ||||||
|  | +	subq	$-(VEC_SIZE * 4), %rdi
 | ||||||
|  | +	kortestd %k2, %k3
 | ||||||
|  | +	jz	L(loop_4x_vec)
 | ||||||
|  |  # else | ||||||
|  | -	subq	$(VEC_SIZE * 4), %rdx
 | ||||||
|  | +	kortestd %k2, %k3
 | ||||||
|  | +	jnz	L(loop_4x_vec_end)
 | ||||||
|  | +
 | ||||||
|  | +	subq	$-(VEC_SIZE * 4), %rdi
 | ||||||
|  | +
 | ||||||
|  | +	subq	$(CHAR_PER_VEC * 4), %rdx
 | ||||||
|  |  	ja	L(loop_4x_vec) | ||||||
|  |   | ||||||
|  | +	/* Fall through into less than 4 remaining vectors of length case.
 | ||||||
|  | +	 */
 | ||||||
|  | +	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
 | ||||||
|  | +	kmovd	%k0, %eax
 | ||||||
|  | +	addq	$(VEC_SIZE * 3), %rdi
 | ||||||
|  | +	.p2align 4
 | ||||||
|  |  L(last_4x_vec_or_less): | ||||||
|  | -	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
 | ||||||
|  | -	addl	$(VEC_SIZE * 2), %edx
 | ||||||
|  | -	jle	L(last_2x_vec)
 | ||||||
|  | -
 | ||||||
|  | -	VPCMP	$0, (%rdi), %YMMMATCH, %k1
 | ||||||
|  | -	kmovd	%k1, %eax
 | ||||||
|  | +	/* Check if first VEC contained match.  */
 | ||||||
|  |  	testl	%eax, %eax | ||||||
|  | -	jnz	L(first_vec_x0)
 | ||||||
|  | +	jnz	L(first_vec_x1_check)
 | ||||||
|  |   | ||||||
|  | -	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
 | ||||||
|  | -	kmovd	%k1, %eax
 | ||||||
|  | -	testl	%eax, %eax
 | ||||||
|  | -	jnz	L(first_vec_x1)
 | ||||||
|  | +	/* If remaining length > CHAR_PER_VEC * 2.  */
 | ||||||
|  | +	addl	$(CHAR_PER_VEC * 2), %edx
 | ||||||
|  | +	jg	L(last_4x_vec)
 | ||||||
|  |   | ||||||
|  | -	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
 | ||||||
|  | -	kmovd	%k1, %eax
 | ||||||
|  | -	testl	%eax, %eax
 | ||||||
|  | +L(last_2x_vec):
 | ||||||
|  | +	/* If remaining length < CHAR_PER_VEC.  */
 | ||||||
|  | +	addl	$CHAR_PER_VEC, %edx
 | ||||||
|  | +	jle	L(zero_end)
 | ||||||
|  |   | ||||||
|  | -	jnz	L(first_vec_x2_check)
 | ||||||
|  | -	subl	$VEC_SIZE, %edx
 | ||||||
|  | -	jle	L(zero)
 | ||||||
|  | +	/* Check VEC2 and compare any match with remaining length.  */
 | ||||||
|  | +	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
 | ||||||
|  | +	kmovd	%k0, %eax
 | ||||||
|  | +	tzcntl	%eax, %eax
 | ||||||
|  | +	cmpl	%eax, %edx
 | ||||||
|  | +	jbe	L(set_zero_end)
 | ||||||
|  | +	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 | ||||||
|  | +L(zero_end):
 | ||||||
|  | +	ret
 | ||||||
|  |   | ||||||
|  | -	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
 | ||||||
|  | -	kmovd	%k1, %eax
 | ||||||
|  | -	testl	%eax, %eax
 | ||||||
|  |   | ||||||
|  | -	jnz	L(first_vec_x3_check)
 | ||||||
|  | +	.p2align 4
 | ||||||
|  | +L(first_vec_x1_check):
 | ||||||
|  | +	tzcntl	%eax, %eax
 | ||||||
|  | +	/* Adjust length.  */
 | ||||||
|  | +	subl	$-(CHAR_PER_VEC * 4), %edx
 | ||||||
|  | +	/* Check if match within remaining length.  */
 | ||||||
|  | +	cmpl	%eax, %edx
 | ||||||
|  | +	jbe	L(set_zero_end)
 | ||||||
|  | +	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
 | ||||||
|  | +	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
 | ||||||
|  | +	ret
 | ||||||
|  | +L(set_zero_end):
 | ||||||
|  |  	xorl	%eax, %eax | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  |  	.p2align 4 | ||||||
|  | -L(last_2x_vec):
 | ||||||
|  | -	addl	$(VEC_SIZE * 2), %edx
 | ||||||
|  | -	VPCMP	$0, (%rdi), %YMMMATCH, %k1
 | ||||||
|  | +L(loop_4x_vec_end):
 | ||||||
|  | +# endif
 | ||||||
|  | +	/* rawmemchr will fall through into this if match was found in
 | ||||||
|  | +	   loop.  */
 | ||||||
|  | +
 | ||||||
|  | +	/* k1 holds the NOT of the matches with VEC1.  */
 | ||||||
|  |  	kmovd	%k1, %eax | ||||||
|  | -	testl	%eax, %eax
 | ||||||
|  | +# ifdef USE_AS_WMEMCHR
 | ||||||
|  | +	subl	$((1 << CHAR_PER_VEC) - 1), %eax
 | ||||||
|  | +# else
 | ||||||
|  | +	incl	%eax
 | ||||||
|  | +# endif
 | ||||||
|  | +	jnz	L(last_vec_x1_return)
 | ||||||
|  |   | ||||||
|  | -	jnz	L(first_vec_x0_check)
 | ||||||
|  | -	subl	$VEC_SIZE, %edx
 | ||||||
|  | -	jle	L(zero)
 | ||||||
|  | +	VPCMP	$0, %YMM2, %YMMZERO, %k0
 | ||||||
|  | +	kmovd	%k0, %eax
 | ||||||
|  | +	testl	%eax, %eax
 | ||||||
|  | +	jnz	L(last_vec_x2_return)
 | ||||||
|  |   | ||||||
|  | -	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
 | ||||||
|  | -	kmovd	%k1, %eax
 | ||||||
|  | +	kmovd	%k2, %eax
 | ||||||
|  |  	testl	%eax, %eax | ||||||
|  | -	jnz	L(first_vec_x1_check)
 | ||||||
|  | -	xorl	%eax, %eax
 | ||||||
|  | -	ret
 | ||||||
|  | +	jnz	L(last_vec_x3_return)
 | ||||||
|  |   | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(first_vec_x0_check):
 | ||||||
|  | +	kmovd	%k3, %eax
 | ||||||
|  |  	tzcntl	%eax, %eax | ||||||
|  | -# ifdef USE_AS_WMEMCHR
 | ||||||
|  | -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
 | ||||||
|  | -	sall	$2, %eax
 | ||||||
|  | +# ifdef USE_AS_RAWMEMCHR
 | ||||||
|  | +	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
 | ||||||
|  | +# else
 | ||||||
|  | +	leaq	(VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax
 | ||||||
|  |  # endif | ||||||
|  | -	/* Check the end of data.  */
 | ||||||
|  | -	cmpq	%rax, %rdx
 | ||||||
|  | -	jbe	L(zero)
 | ||||||
|  | -	addq	%rdi, %rax
 | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  |  	.p2align 4 | ||||||
|  | -L(first_vec_x1_check):
 | ||||||
|  | +L(last_vec_x1_return):
 | ||||||
|  |  	tzcntl	%eax, %eax | ||||||
|  | -# ifdef USE_AS_WMEMCHR
 | ||||||
|  | -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
 | ||||||
|  | -	sall	$2, %eax
 | ||||||
|  | -# endif
 | ||||||
|  | -	/* Check the end of data.  */
 | ||||||
|  | -	cmpq	%rax, %rdx
 | ||||||
|  | -	jbe	L(zero)
 | ||||||
|  | -	addq	$VEC_SIZE, %rax
 | ||||||
|  | +# ifdef USE_AS_RAWMEMCHR
 | ||||||
|  | +#  ifdef USE_AS_WMEMCHR
 | ||||||
|  | +	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
 | ||||||
|  | +	leaq	(%rdi, %rax, CHAR_SIZE), %rax
 | ||||||
|  | +#  else
 | ||||||
|  |  	addq	%rdi, %rax | ||||||
|  | -	ret
 | ||||||
|  | -
 | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(first_vec_x2_check):
 | ||||||
|  | -	tzcntl	%eax, %eax
 | ||||||
|  | -# ifdef USE_AS_WMEMCHR
 | ||||||
|  | -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
 | ||||||
|  | -	sall	$2, %eax
 | ||||||
|  | +#  endif
 | ||||||
|  | +# else
 | ||||||
|  | +	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
 | ||||||
|  | +	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
 | ||||||
|  |  # endif | ||||||
|  | -	/* Check the end of data.  */
 | ||||||
|  | -	cmpq	%rax, %rdx
 | ||||||
|  | -	jbe	L(zero)
 | ||||||
|  | -	addq	$(VEC_SIZE * 2), %rax
 | ||||||
|  | -	addq	%rdi, %rax
 | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  |  	.p2align 4 | ||||||
|  | -L(first_vec_x3_check):
 | ||||||
|  | +L(last_vec_x2_return):
 | ||||||
|  |  	tzcntl	%eax, %eax | ||||||
|  | -# ifdef USE_AS_WMEMCHR
 | ||||||
|  | -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
 | ||||||
|  | -	sall	$2, %eax
 | ||||||
|  | +# ifdef USE_AS_RAWMEMCHR
 | ||||||
|  | +	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
 | ||||||
|  | +	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
 | ||||||
|  | +# else
 | ||||||
|  | +	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
 | ||||||
|  | +	leaq	(VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax
 | ||||||
|  |  # endif | ||||||
|  | -	/* Check the end of data.  */
 | ||||||
|  | -	cmpq	%rax, %rdx
 | ||||||
|  | -	jbe	L(zero)
 | ||||||
|  | -	addq	$(VEC_SIZE * 3), %rax
 | ||||||
|  | -	addq	%rdi, %rax
 | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  |  	.p2align 4 | ||||||
|  | -L(zero):
 | ||||||
|  | -	xorl	%eax, %eax
 | ||||||
|  | -	ret
 | ||||||
|  | -# endif
 | ||||||
|  | -
 | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(first_vec_x0):
 | ||||||
|  | +L(last_vec_x3_return):
 | ||||||
|  |  	tzcntl	%eax, %eax | ||||||
|  | -# ifdef USE_AS_WMEMCHR
 | ||||||
|  | -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
 | ||||||
|  | -	leaq	(%rdi, %rax, 4), %rax
 | ||||||
|  | +# ifdef USE_AS_RAWMEMCHR
 | ||||||
|  | +	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
 | ||||||
|  | +	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 | ||||||
|  |  # else | ||||||
|  | -	addq	%rdi, %rax
 | ||||||
|  | +	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
 | ||||||
|  | +	leaq	(VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax
 | ||||||
|  |  # endif | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  | +
 | ||||||
|  | +# ifndef USE_AS_RAWMEMCHR
 | ||||||
|  | +L(last_4x_vec_or_less_cmpeq):
 | ||||||
|  | +	VPCMP	$0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
 | ||||||
|  | +	kmovd	%k0, %eax
 | ||||||
|  | +	subq	$-(VEC_SIZE * 4), %rdi
 | ||||||
|  | +	/* Check first VEC regardless.  */
 | ||||||
|  | +	testl	%eax, %eax
 | ||||||
|  | +	jnz	L(first_vec_x1_check)
 | ||||||
|  | +
 | ||||||
|  | +	/* If remaining length <= CHAR_PER_VEC * 2.  */
 | ||||||
|  | +	addl	$(CHAR_PER_VEC * 2), %edx
 | ||||||
|  | +	jle	L(last_2x_vec)
 | ||||||
|  | +
 | ||||||
|  |  	.p2align 4 | ||||||
|  | -L(first_vec_x1):
 | ||||||
|  | +L(last_4x_vec):
 | ||||||
|  | +	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
 | ||||||
|  | +	kmovd	%k0, %eax
 | ||||||
|  | +	testl	%eax, %eax
 | ||||||
|  | +	jnz	L(last_vec_x2)
 | ||||||
|  | +
 | ||||||
|  | +
 | ||||||
|  | +	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
 | ||||||
|  | +	kmovd	%k0, %eax
 | ||||||
|  | +	/* Create mask for possible matches within remaining length.  */
 | ||||||
|  | +#  ifdef USE_AS_WMEMCHR
 | ||||||
|  | +	movl	$((1 << (CHAR_PER_VEC * 2)) - 1), %ecx
 | ||||||
|  | +	bzhil	%edx, %ecx, %ecx
 | ||||||
|  | +#  else
 | ||||||
|  | +	movq	$-1, %rcx
 | ||||||
|  | +	bzhiq	%rdx, %rcx, %rcx
 | ||||||
|  | +#  endif
 | ||||||
|  | +	/* Test matches in data against length match.  */
 | ||||||
|  | +	andl	%ecx, %eax
 | ||||||
|  | +	jnz	L(last_vec_x3)
 | ||||||
|  | +
 | ||||||
|  | +	/* If remaining length <= CHAR_PER_VEC * 3 (note this is after
 | ||||||
|  | +	   remaining length was found to be > CHAR_PER_VEC * 2).  */
 | ||||||
|  | +	subl	$CHAR_PER_VEC, %edx
 | ||||||
|  | +	jbe	L(zero_end2)
 | ||||||
|  | +
 | ||||||
|  | +
 | ||||||
|  | +	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
 | ||||||
|  | +	kmovd	%k0, %eax
 | ||||||
|  | +	/* Shift remaining length mask for last VEC.  */
 | ||||||
|  | +#  ifdef USE_AS_WMEMCHR
 | ||||||
|  | +	shrl	$CHAR_PER_VEC, %ecx
 | ||||||
|  | +#  else
 | ||||||
|  | +	shrq	$CHAR_PER_VEC, %rcx
 | ||||||
|  | +#  endif
 | ||||||
|  | +	andl	%ecx, %eax
 | ||||||
|  | +	jz	L(zero_end2)
 | ||||||
|  |  	tzcntl	%eax, %eax | ||||||
|  | -# ifdef USE_AS_WMEMCHR
 | ||||||
|  | -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
 | ||||||
|  | -	leaq	VEC_SIZE(%rdi, %rax, 4), %rax
 | ||||||
|  | -# else
 | ||||||
|  | -	addq	$VEC_SIZE, %rax
 | ||||||
|  | -	addq	%rdi, %rax
 | ||||||
|  | -# endif
 | ||||||
|  | +	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
 | ||||||
|  | +L(zero_end2):
 | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(first_vec_x2):
 | ||||||
|  | +L(last_vec_x2):
 | ||||||
|  |  	tzcntl	%eax, %eax | ||||||
|  | -# ifdef USE_AS_WMEMCHR
 | ||||||
|  | -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
 | ||||||
|  | -	leaq	(VEC_SIZE * 2)(%rdi, %rax, 4), %rax
 | ||||||
|  | -# else
 | ||||||
|  | -	addq	$(VEC_SIZE * 2), %rax
 | ||||||
|  | -	addq	%rdi, %rax
 | ||||||
|  | -# endif
 | ||||||
|  | +	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  |  	.p2align 4 | ||||||
|  | -L(4x_vec_end):
 | ||||||
|  | -	kmovd	%k1, %eax
 | ||||||
|  | -	testl	%eax, %eax
 | ||||||
|  | -	jnz	L(first_vec_x0)
 | ||||||
|  | -	kmovd	%k2, %eax
 | ||||||
|  | -	testl	%eax, %eax
 | ||||||
|  | -	jnz	L(first_vec_x1)
 | ||||||
|  | -	kmovd	%k3, %eax
 | ||||||
|  | -	testl	%eax, %eax
 | ||||||
|  | -	jnz	L(first_vec_x2)
 | ||||||
|  | -	kmovd	%k4, %eax
 | ||||||
|  | -	testl	%eax, %eax
 | ||||||
|  | -L(first_vec_x3):
 | ||||||
|  | +L(last_vec_x3):
 | ||||||
|  |  	tzcntl	%eax, %eax | ||||||
|  | -# ifdef USE_AS_WMEMCHR
 | ||||||
|  | -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
 | ||||||
|  | -	leaq	(VEC_SIZE * 3)(%rdi, %rax, 4), %rax
 | ||||||
|  | -# else
 | ||||||
|  | -	addq	$(VEC_SIZE * 3), %rax
 | ||||||
|  | -	addq	%rdi, %rax
 | ||||||
|  | -# endif
 | ||||||
|  | +	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
 | ||||||
|  |  	ret | ||||||
|  | +# endif
 | ||||||
|  |   | ||||||
|  |  END (MEMCHR) | ||||||
|  |  #endif | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
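
The cmovcc change the commit message refers to is easiest to see in isolation. Below is a hypothetical C analogue (not part of the patch; all names are invented) of what the new L(first_vec_x0) path does: compute the candidate return value unconditionally, then select between it and NULL with a conditional move instead of a branch on the match position.

    #include <stddef.h>

    /* Sketch only.  'mask' is the kmovd result, assumed nonzero here;
       'len' is the remaining length in characters.  */
    static const char *
    first_vec_x0_analogue (const char *p, unsigned int mask, size_t len)
    {
      size_t pos = (size_t) __builtin_ctz (mask); /* tzcntl %eax, %eax */
      const char *hit = p + pos;                  /* leaq (%rdi,%rax)  */
      /* Both values exist; the predicate picks one -- no branch.  */
      return pos < len ? hit : NULL;              /* cmovle            */
    }

Whether a compiler actually emits cmov for the ternary is its choice; the hand-written assembly makes the selection explicit.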
							
								
								
									
SOURCES/glibc-RHEL-15696-27.patch (new file, 30 lines)
| @ -0,0 +1,30 @@ | |||||||
|  | From 6ea916adfa0ab9af6e7dc6adcf6f977dfe017835 Mon Sep 17 00:00:00 2001 | ||||||
|  | From: Alice Xu <alice.d.xu@gmail.com> | ||||||
|  | Date: Fri, 7 May 2021 19:03:21 -0700 | ||||||
|  | Subject: [PATCH] x86-64: Fix an unknown vector operation in memchr-evex.S | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | An unknown vector operation occurred in commit 2a76821c308. Fixed it | ||||||
|  | by writing "ymm{k1}{z}" instead of "ymm {k1} {z}". | ||||||
|  | 
 | ||||||
|  | Reviewed-by: H.J. Lu <hjl.tools@gmail.com> | ||||||
|  | ---
 | ||||||
|  |  sysdeps/x86_64/multiarch/memchr-evex.S | 2 +- | ||||||
|  |  1 file changed, 1 insertion(+), 1 deletion(-) | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
 | ||||||
|  | index 81d5cd64..f3fdad4f 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/memchr-evex.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/memchr-evex.S
 | ||||||
|  | @@ -271,7 +271,7 @@ L(loop_4x_vec):
 | ||||||
|  |  	vpxorq	(VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3 | ||||||
|  |  	VPCMP	$0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3 | ||||||
|  |  	/* Reduce VEC2 / VEC3 with min and VEC1 with zero mask.  */ | ||||||
|  | -	VPMINU	%YMM2, %YMM3, %YMM3 {%k1} {z}
 | ||||||
|  | +	VPMINU	%YMM2, %YMM3, %YMM3{%k1}{z}
 | ||||||
|  |  	VPCMP	$0, %YMM3, %YMMZERO, %k2 | ||||||
|  |  # ifdef USE_AS_RAWMEMCHR | ||||||
|  |  	subq	$-(VEC_SIZE * 4), %rdi | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
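
For readers unfamiliar with AVX-512 syntax: the merge/zero mask is written as a suffix on the destination operand, and older assemblers only parse it when it is attached to the register with no intervening spaces, which is all this one-character fix is about. A hypothetical stand-alone probe (not from the patch) for the accepted spelling:

    /* probe.c -- build-time check only; running it needs
       AVX512VL+AVX512BW hardware.  */
    int
    main (void)
    {
      /* Basic asm block: '%' is literal here, no escaping needed.  */
      __asm__ ("vpminub %ymm2, %ymm3, %ymm3{%k1}{z}");
      return 0;
    }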
							
								
								
									
SOURCES/glibc-RHEL-15696-28.patch (new file, 566 lines)
| @ -0,0 +1,566 @@ | |||||||
|  | From a0db678071c60b6c47c468d231dd0b3694ba7a98 Mon Sep 17 00:00:00 2001 | ||||||
|  | From: "H.J. Lu" <hjl.tools@gmail.com> | ||||||
|  | Date: Tue, 22 Jun 2021 20:42:10 -0700 | ||||||
|  | Subject: [PATCH] x86-64: Move strlen.S to multiarch/strlen-vec.S | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | Since strlen.S contains SSE2 version of strlen/strnlen and SSE4.1 | ||||||
|  | version of wcslen/wcsnlen, move strlen.S to multiarch/strlen-vec.S | ||||||
|  | and include multiarch/strlen-vec.S from SSE2 and SSE4.1 variants. | ||||||
|  | This also removes the unused symbols, __GI___strlen_sse2 and | ||||||
|  | __GI___wcsnlen_sse4_1. | ||||||
|  | ---
 | ||||||
|  |  sysdeps/x86_64/multiarch/strlen-sse2.S    |   2 +- | ||||||
|  |  sysdeps/x86_64/multiarch/strlen-vec.S     | 257 ++++++++++++++++++++++ | ||||||
|  |  sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S |   2 +- | ||||||
|  |  sysdeps/x86_64/strlen.S                   | 243 +------------------- | ||||||
|  |  4 files changed, 262 insertions(+), 242 deletions(-) | ||||||
|  |  create mode 100644 sysdeps/x86_64/multiarch/strlen-vec.S | ||||||
|  | 
 | ||||||
|  | Conflicts: | ||||||
|  | 	sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S | ||||||
|  | 	(Copyright dates, URL) | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/strlen-sse2.S b/sysdeps/x86_64/multiarch/strlen-sse2.S
 | ||||||
|  | index 7bc57b8d..449c8a7f 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/strlen-sse2.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/strlen-sse2.S
 | ||||||
|  | @@ -20,4 +20,4 @@
 | ||||||
|  |  # define strlen __strlen_sse2 | ||||||
|  |  #endif | ||||||
|  |   | ||||||
|  | -#include "../strlen.S"
 | ||||||
|  | +#include "strlen-vec.S"
 | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
 | ||||||
|  | new file mode 100644 | ||||||
|  | index 00000000..8f660bb9
 | ||||||
|  | --- /dev/null
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/strlen-vec.S
 | ||||||
|  | @@ -0,0 +1,257 @@
 | ||||||
|  | +/* SSE2 version of strlen and SSE4.1 version of wcslen.
 | ||||||
|  | +   Copyright (C) 2012-2021 Free Software Foundation, Inc.
 | ||||||
|  | +   This file is part of the GNU C Library.
 | ||||||
|  | +
 | ||||||
|  | +   The GNU C Library is free software; you can redistribute it and/or
 | ||||||
|  | +   modify it under the terms of the GNU Lesser General Public
 | ||||||
|  | +   License as published by the Free Software Foundation; either
 | ||||||
|  | +   version 2.1 of the License, or (at your option) any later version.
 | ||||||
|  | +
 | ||||||
|  | +   The GNU C Library is distributed in the hope that it will be useful,
 | ||||||
|  | +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 | ||||||
|  | +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 | ||||||
|  | +   Lesser General Public License for more details.
 | ||||||
|  | +
 | ||||||
|  | +   You should have received a copy of the GNU Lesser General Public
 | ||||||
|  | +   License along with the GNU C Library; if not, see
 | ||||||
|  | +   <https://www.gnu.org/licenses/>.  */
 | ||||||
|  | +
 | ||||||
|  | +#include <sysdep.h>
 | ||||||
|  | +
 | ||||||
|  | +#ifdef AS_WCSLEN
 | ||||||
|  | +# define PMINU		pminud
 | ||||||
|  | +# define PCMPEQ		pcmpeqd
 | ||||||
|  | +# define SHIFT_RETURN	shrq $2, %rax
 | ||||||
|  | +#else
 | ||||||
|  | +# define PMINU		pminub
 | ||||||
|  | +# define PCMPEQ		pcmpeqb
 | ||||||
|  | +# define SHIFT_RETURN
 | ||||||
|  | +#endif
 | ||||||
|  | +
 | ||||||
|  | +/* Long lived register in strlen(s), strnlen(s, n) are:
 | ||||||
|  | +
 | ||||||
|  | +	%xmm3 - zero
 | ||||||
|  | +	%rdi   - s
 | ||||||
|  | +	%r10  (s+n) & (~(64-1))
 | ||||||
|  | +	%r11   s+n
 | ||||||
|  | +*/
 | ||||||
|  | +
 | ||||||
|  | +
 | ||||||
|  | +.text
 | ||||||
|  | +ENTRY(strlen)
 | ||||||
|  | +
 | ||||||
|  | +/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx.  */
 | ||||||
|  | +#define FIND_ZERO	\
 | ||||||
|  | +	PCMPEQ	(%rax), %xmm0;	\
 | ||||||
|  | +	PCMPEQ	16(%rax), %xmm1;	\
 | ||||||
|  | +	PCMPEQ	32(%rax), %xmm2;	\
 | ||||||
|  | +	PCMPEQ	48(%rax), %xmm3;	\
 | ||||||
|  | +	pmovmskb	%xmm0, %esi;	\
 | ||||||
|  | +	pmovmskb	%xmm1, %edx;	\
 | ||||||
|  | +	pmovmskb	%xmm2, %r8d;	\
 | ||||||
|  | +	pmovmskb	%xmm3, %ecx;	\
 | ||||||
|  | +	salq	$16, %rdx;	\
 | ||||||
|  | +	salq	$16, %rcx;	\
 | ||||||
|  | +	orq	%rsi, %rdx;	\
 | ||||||
|  | +	orq	%r8, %rcx;	\
 | ||||||
|  | +	salq	$32, %rcx;	\
 | ||||||
|  | +	orq	%rcx, %rdx;
 | ||||||
|  | +
 | ||||||
|  | +#ifdef AS_STRNLEN
 | ||||||
|  | +/* Do not read anything when n==0.  */
 | ||||||
|  | +	test	%RSI_LP, %RSI_LP
 | ||||||
|  | +	jne	L(n_nonzero)
 | ||||||
|  | +	xor	%rax, %rax
 | ||||||
|  | +	ret
 | ||||||
|  | +L(n_nonzero):
 | ||||||
|  | +# ifdef AS_WCSLEN
 | ||||||
|  | +	shl	$2, %RSI_LP
 | ||||||
|  | +# endif
 | ||||||
|  | +
 | ||||||
|  | +/* Initialize long lived registers.  */
 | ||||||
|  | +
 | ||||||
|  | +	add	%RDI_LP, %RSI_LP
 | ||||||
|  | +	mov	%RSI_LP, %R10_LP
 | ||||||
|  | +	and	$-64, %R10_LP
 | ||||||
|  | +	mov	%RSI_LP, %R11_LP
 | ||||||
|  | +#endif
 | ||||||
|  | +
 | ||||||
|  | +	pxor	%xmm0, %xmm0
 | ||||||
|  | +	pxor	%xmm1, %xmm1
 | ||||||
|  | +	pxor	%xmm2, %xmm2
 | ||||||
|  | +	pxor	%xmm3, %xmm3
 | ||||||
|  | +	movq	%rdi, %rax
 | ||||||
|  | +	movq	%rdi, %rcx
 | ||||||
|  | +	andq	$4095, %rcx
 | ||||||
|  | +/* Offsets 4032-4047 will be aligned into 4032 thus fit into page.  */
 | ||||||
|  | +	cmpq	$4047, %rcx
 | ||||||
|  | +/* We cannot unify this branching as it would be ~6 cycles slower.  */
 | ||||||
|  | +	ja	L(cross_page)
 | ||||||
|  | +
 | ||||||
|  | +#ifdef AS_STRNLEN
 | ||||||
|  | +/* Test if end is among first 64 bytes.  */
 | ||||||
|  | +# define STRNLEN_PROLOG	\
 | ||||||
|  | +	mov	%r11, %rsi;	\
 | ||||||
|  | +	subq	%rax, %rsi;	\
 | ||||||
|  | +	andq	$-64, %rax;	\
 | ||||||
|  | +	testq	$-64, %rsi;	\
 | ||||||
|  | +	je	L(strnlen_ret)
 | ||||||
|  | +#else
 | ||||||
|  | +# define STRNLEN_PROLOG  andq $-64, %rax;
 | ||||||
|  | +#endif
 | ||||||
|  | +
 | ||||||
|  | +/* Ignore bits in mask that come before start of string.  */
 | ||||||
|  | +#define PROLOG(lab)	\
 | ||||||
|  | +	movq	%rdi, %rcx;	\
 | ||||||
|  | +	xorq	%rax, %rcx;	\
 | ||||||
|  | +	STRNLEN_PROLOG;	\
 | ||||||
|  | +	sarq	%cl, %rdx;	\
 | ||||||
|  | +	test	%rdx, %rdx;	\
 | ||||||
|  | +	je	L(lab);	\
 | ||||||
|  | +	bsfq	%rdx, %rax;	\
 | ||||||
|  | +	SHIFT_RETURN;		\
 | ||||||
|  | +	ret
 | ||||||
|  | +
 | ||||||
|  | +#ifdef AS_STRNLEN
 | ||||||
|  | +	andq	$-16, %rax
 | ||||||
|  | +	FIND_ZERO
 | ||||||
|  | +#else
 | ||||||
|  | +	/* Test first 16 bytes unaligned.  */
 | ||||||
|  | +	movdqu	(%rax), %xmm4
 | ||||||
|  | +	PCMPEQ	%xmm0, %xmm4
 | ||||||
|  | +	pmovmskb	%xmm4, %edx
 | ||||||
|  | +	test	%edx, %edx
 | ||||||
|  | +	je 	L(next48_bytes)
 | ||||||
|  | +	bsf	%edx, %eax /* If eax is zeroed 16bit bsf can be used.  */
 | ||||||
|  | +	SHIFT_RETURN
 | ||||||
|  | +	ret
 | ||||||
|  | +
 | ||||||
|  | +L(next48_bytes):
 | ||||||
|  | +/* Same as FIND_ZERO except we do not check first 16 bytes.  */
 | ||||||
|  | +	andq	$-16, %rax
 | ||||||
|  | +	PCMPEQ 16(%rax), %xmm1
 | ||||||
|  | +	PCMPEQ 32(%rax), %xmm2
 | ||||||
|  | +	PCMPEQ 48(%rax), %xmm3
 | ||||||
|  | +	pmovmskb	%xmm1, %edx
 | ||||||
|  | +	pmovmskb	%xmm2, %r8d
 | ||||||
|  | +	pmovmskb	%xmm3, %ecx
 | ||||||
|  | +	salq	$16, %rdx
 | ||||||
|  | +	salq	$16, %rcx
 | ||||||
|  | +	orq	%r8, %rcx
 | ||||||
|  | +	salq	$32, %rcx
 | ||||||
|  | +	orq	%rcx, %rdx
 | ||||||
|  | +#endif
 | ||||||
|  | +
 | ||||||
|  | +	/* When no zero byte is found xmm1-3 are zero so we do not have to
 | ||||||
|  | +	   zero them.  */
 | ||||||
|  | +	PROLOG(loop)
 | ||||||
|  | +
 | ||||||
|  | +	.p2align 4
 | ||||||
|  | +L(cross_page):
 | ||||||
|  | +	andq	$-64, %rax
 | ||||||
|  | +	FIND_ZERO
 | ||||||
|  | +	PROLOG(loop_init)
 | ||||||
|  | +
 | ||||||
|  | +#ifdef AS_STRNLEN
 | ||||||
|  | +/* We must do this check to correctly handle strnlen (s, -1).  */
 | ||||||
|  | +L(strnlen_ret):
 | ||||||
|  | +	bts	%rsi, %rdx
 | ||||||
|  | +	sarq	%cl, %rdx
 | ||||||
|  | +	test	%rdx, %rdx
 | ||||||
|  | +	je	L(loop_init)
 | ||||||
|  | +	bsfq	%rdx, %rax
 | ||||||
|  | +	SHIFT_RETURN
 | ||||||
|  | +	ret
 | ||||||
|  | +#endif
 | ||||||
|  | +	.p2align 4
 | ||||||
|  | +L(loop_init):
 | ||||||
|  | +	pxor	%xmm1, %xmm1
 | ||||||
|  | +	pxor	%xmm2, %xmm2
 | ||||||
|  | +	pxor	%xmm3, %xmm3
 | ||||||
|  | +#ifdef AS_STRNLEN
 | ||||||
|  | +	.p2align 4
 | ||||||
|  | +L(loop):
 | ||||||
|  | +
 | ||||||
|  | +	addq	$64, %rax
 | ||||||
|  | +	cmpq	%rax, %r10
 | ||||||
|  | +	je	L(exit_end)
 | ||||||
|  | +
 | ||||||
|  | +	movdqa	(%rax), %xmm0
 | ||||||
|  | +	PMINU	16(%rax), %xmm0
 | ||||||
|  | +	PMINU	32(%rax), %xmm0
 | ||||||
|  | +	PMINU	48(%rax), %xmm0
 | ||||||
|  | +	PCMPEQ	%xmm3, %xmm0
 | ||||||
|  | +	pmovmskb	%xmm0, %edx
 | ||||||
|  | +	testl	%edx, %edx
 | ||||||
|  | +	jne	L(exit)
 | ||||||
|  | +	jmp	L(loop)
 | ||||||
|  | +
 | ||||||
|  | +	.p2align 4
 | ||||||
|  | +L(exit_end):
 | ||||||
|  | +	cmp	%rax, %r11
 | ||||||
|  | +	je	L(first) /* Do not read when end is at page boundary.  */
 | ||||||
|  | +	pxor	%xmm0, %xmm0
 | ||||||
|  | +	FIND_ZERO
 | ||||||
|  | +
 | ||||||
|  | +L(first):
 | ||||||
|  | +	bts	%r11, %rdx
 | ||||||
|  | +	bsfq	%rdx, %rdx
 | ||||||
|  | +	addq	%rdx, %rax
 | ||||||
|  | +	subq	%rdi, %rax
 | ||||||
|  | +	SHIFT_RETURN
 | ||||||
|  | +	ret
 | ||||||
|  | +
 | ||||||
|  | +	.p2align 4
 | ||||||
|  | +L(exit):
 | ||||||
|  | +	pxor	%xmm0, %xmm0
 | ||||||
|  | +	FIND_ZERO
 | ||||||
|  | +
 | ||||||
|  | +	bsfq	%rdx, %rdx
 | ||||||
|  | +	addq	%rdx, %rax
 | ||||||
|  | +	subq	%rdi, %rax
 | ||||||
|  | +	SHIFT_RETURN
 | ||||||
|  | +	ret
 | ||||||
|  | +
 | ||||||
|  | +#else
 | ||||||
|  | +
 | ||||||
|  | +	/* Main loop.  Unrolled twice to improve L2 cache performance on core2.  */
 | ||||||
|  | +	.p2align 4
 | ||||||
|  | +L(loop):
 | ||||||
|  | +
 | ||||||
|  | +	movdqa	64(%rax), %xmm0
 | ||||||
|  | +	PMINU	80(%rax), %xmm0
 | ||||||
|  | +	PMINU	96(%rax), %xmm0
 | ||||||
|  | +	PMINU	112(%rax), %xmm0
 | ||||||
|  | +	PCMPEQ	%xmm3, %xmm0
 | ||||||
|  | +	pmovmskb	%xmm0, %edx
 | ||||||
|  | +	testl	%edx, %edx
 | ||||||
|  | +	jne	L(exit64)
 | ||||||
|  | +
 | ||||||
|  | +	subq	$-128, %rax
 | ||||||
|  | +
 | ||||||
|  | +	movdqa	(%rax), %xmm0
 | ||||||
|  | +	PMINU	16(%rax), %xmm0
 | ||||||
|  | +	PMINU	32(%rax), %xmm0
 | ||||||
|  | +	PMINU	48(%rax), %xmm0
 | ||||||
|  | +	PCMPEQ	%xmm3, %xmm0
 | ||||||
|  | +	pmovmskb	%xmm0, %edx
 | ||||||
|  | +	testl	%edx, %edx
 | ||||||
|  | +	jne	L(exit0)
 | ||||||
|  | +	jmp	L(loop)
 | ||||||
|  | +
 | ||||||
|  | +	.p2align 4
 | ||||||
|  | +L(exit64):
 | ||||||
|  | +	addq	$64, %rax
 | ||||||
|  | +L(exit0):
 | ||||||
|  | +	pxor	%xmm0, %xmm0
 | ||||||
|  | +	FIND_ZERO
 | ||||||
|  | +
 | ||||||
|  | +	bsfq	%rdx, %rdx
 | ||||||
|  | +	addq	%rdx, %rax
 | ||||||
|  | +	subq	%rdi, %rax
 | ||||||
|  | +	SHIFT_RETURN
 | ||||||
|  | +	ret
 | ||||||
|  | +
 | ||||||
|  | +#endif
 | ||||||
|  | +
 | ||||||
|  | +END(strlen)
 | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
 | ||||||
|  | index a8cab0cb..5fa51fe0 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
 | ||||||
|  | @@ -2,4 +2,4 @@
 | ||||||
|  |  #define AS_STRNLEN | ||||||
|  |  #define strlen	__wcsnlen_sse4_1 | ||||||
|  |   | ||||||
|  | -#include "../strlen.S"
 | ||||||
|  | +#include "strlen-vec.S"
 | ||||||
|  | diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
 | ||||||
|  | index f845f3d4..ad047d84 100644
 | ||||||
|  | --- a/sysdeps/x86_64/strlen.S
 | ||||||
|  | +++ b/sysdeps/x86_64/strlen.S
 | ||||||
|  | @@ -1,5 +1,5 @@
 | ||||||
|  | -/* SSE2 version of strlen/wcslen.
 | ||||||
|  | -   Copyright (C) 2012-2018 Free Software Foundation, Inc.
 | ||||||
|  | +/* SSE2 version of strlen.
 | ||||||
|  | +   Copyright (C) 2021 Free Software Foundation, Inc.
 | ||||||
|  |     This file is part of the GNU C Library. | ||||||
|  |   | ||||||
|  |     The GNU C Library is free software; you can redistribute it and/or | ||||||
|  | @@ -16,243 +16,6 @@
 | ||||||
|  |     License along with the GNU C Library; if not, see | ||||||
|  |     <http://www.gnu.org/licenses/>.  */ | ||||||
|  |   | ||||||
|  | -#include <sysdep.h>
 | ||||||
|  | +#include "multiarch/strlen-vec.S"
 | ||||||
|  |   | ||||||
|  | -#ifdef AS_WCSLEN
 | ||||||
|  | -# define PMINU		pminud
 | ||||||
|  | -# define PCMPEQ		pcmpeqd
 | ||||||
|  | -# define SHIFT_RETURN	shrq $2, %rax
 | ||||||
|  | -#else
 | ||||||
|  | -# define PMINU		pminub
 | ||||||
|  | -# define PCMPEQ		pcmpeqb
 | ||||||
|  | -# define SHIFT_RETURN
 | ||||||
|  | -#endif
 | ||||||
|  | -
 | ||||||
|  | -/* Long lived register in strlen(s), strnlen(s, n) are:
 | ||||||
|  | -
 | ||||||
|  | -	%xmm3 - zero
 | ||||||
|  | -	%rdi   - s
 | ||||||
|  | -	%r10  (s+n) & (~(64-1))
 | ||||||
|  | -	%r11   s+n
 | ||||||
|  | -*/
 | ||||||
|  | -
 | ||||||
|  | -
 | ||||||
|  | -.text
 | ||||||
|  | -ENTRY(strlen)
 | ||||||
|  | -
 | ||||||
|  | -/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx.  */
 | ||||||
|  | -#define FIND_ZERO	\
 | ||||||
|  | -	PCMPEQ	(%rax), %xmm0;	\
 | ||||||
|  | -	PCMPEQ	16(%rax), %xmm1;	\
 | ||||||
|  | -	PCMPEQ	32(%rax), %xmm2;	\
 | ||||||
|  | -	PCMPEQ	48(%rax), %xmm3;	\
 | ||||||
|  | -	pmovmskb	%xmm0, %esi;	\
 | ||||||
|  | -	pmovmskb	%xmm1, %edx;	\
 | ||||||
|  | -	pmovmskb	%xmm2, %r8d;	\
 | ||||||
|  | -	pmovmskb	%xmm3, %ecx;	\
 | ||||||
|  | -	salq	$16, %rdx;	\
 | ||||||
|  | -	salq	$16, %rcx;	\
 | ||||||
|  | -	orq	%rsi, %rdx;	\
 | ||||||
|  | -	orq	%r8, %rcx;	\
 | ||||||
|  | -	salq	$32, %rcx;	\
 | ||||||
|  | -	orq	%rcx, %rdx;
 | ||||||
|  | -
 | ||||||
|  | -#ifdef AS_STRNLEN
 | ||||||
|  | -/* Do not read anything when n==0.  */
 | ||||||
|  | -	test	%RSI_LP, %RSI_LP
 | ||||||
|  | -	jne	L(n_nonzero)
 | ||||||
|  | -	xor	%rax, %rax
 | ||||||
|  | -	ret
 | ||||||
|  | -L(n_nonzero):
 | ||||||
|  | -# ifdef AS_WCSLEN
 | ||||||
|  | -	shl	$2, %RSI_LP
 | ||||||
|  | -# endif
 | ||||||
|  | -
 | ||||||
|  | -/* Initialize long lived registers.  */
 | ||||||
|  | -
 | ||||||
|  | -	add	%RDI_LP, %RSI_LP
 | ||||||
|  | -	mov	%RSI_LP, %R10_LP
 | ||||||
|  | -	and	$-64, %R10_LP
 | ||||||
|  | -	mov	%RSI_LP, %R11_LP
 | ||||||
|  | -#endif
 | ||||||
|  | -
 | ||||||
|  | -	pxor	%xmm0, %xmm0
 | ||||||
|  | -	pxor	%xmm1, %xmm1
 | ||||||
|  | -	pxor	%xmm2, %xmm2
 | ||||||
|  | -	pxor	%xmm3, %xmm3
 | ||||||
|  | -	movq	%rdi, %rax
 | ||||||
|  | -	movq	%rdi, %rcx
 | ||||||
|  | -	andq	$4095, %rcx
 | ||||||
|  | -/* Offsets 4032-4047 will be aligned into 4032 thus fit into page.  */
 | ||||||
|  | -	cmpq	$4047, %rcx
 | ||||||
|  | -/* We cannot unify this branching as it would be ~6 cycles slower.  */
 | ||||||
|  | -	ja	L(cross_page)
 | ||||||
|  | -
 | ||||||
|  | -#ifdef AS_STRNLEN
 | ||||||
|  | -/* Test if end is among first 64 bytes.  */
 | ||||||
|  | -# define STRNLEN_PROLOG	\
 | ||||||
|  | -	mov	%r11, %rsi;	\
 | ||||||
|  | -	subq	%rax, %rsi;	\
 | ||||||
|  | -	andq	$-64, %rax;	\
 | ||||||
|  | -	testq	$-64, %rsi;	\
 | ||||||
|  | -	je	L(strnlen_ret)
 | ||||||
|  | -#else
 | ||||||
|  | -# define STRNLEN_PROLOG  andq $-64, %rax;
 | ||||||
|  | -#endif
 | ||||||
|  | -
 | ||||||
|  | -/* Ignore bits in mask that come before start of string.  */
 | ||||||
|  | -#define PROLOG(lab)	\
 | ||||||
|  | -	movq	%rdi, %rcx;	\
 | ||||||
|  | -	xorq	%rax, %rcx;	\
 | ||||||
|  | -	STRNLEN_PROLOG;	\
 | ||||||
|  | -	sarq	%cl, %rdx;	\
 | ||||||
|  | -	test	%rdx, %rdx;	\
 | ||||||
|  | -	je	L(lab);	\
 | ||||||
|  | -	bsfq	%rdx, %rax;	\
 | ||||||
|  | -	SHIFT_RETURN;		\
 | ||||||
|  | -	ret
 | ||||||
|  | -
 | ||||||
|  | -#ifdef AS_STRNLEN
 | ||||||
|  | -	andq	$-16, %rax
 | ||||||
|  | -	FIND_ZERO
 | ||||||
|  | -#else
 | ||||||
|  | -	/* Test first 16 bytes unaligned.  */
 | ||||||
|  | -	movdqu	(%rax), %xmm4
 | ||||||
|  | -	PCMPEQ	%xmm0, %xmm4
 | ||||||
|  | -	pmovmskb	%xmm4, %edx
 | ||||||
|  | -	test	%edx, %edx
 | ||||||
|  | -	je 	L(next48_bytes)
 | ||||||
|  | -	bsf	%edx, %eax /* If eax is zeroed 16bit bsf can be used.  */
 | ||||||
|  | -	SHIFT_RETURN
 | ||||||
|  | -	ret
 | ||||||
|  | -
 | ||||||
|  | -L(next48_bytes):
 | ||||||
|  | -/* Same as FIND_ZERO except we do not check first 16 bytes.  */
 | ||||||
|  | -	andq	$-16, %rax
 | ||||||
|  | -	PCMPEQ 16(%rax), %xmm1
 | ||||||
|  | -	PCMPEQ 32(%rax), %xmm2
 | ||||||
|  | -	PCMPEQ 48(%rax), %xmm3
 | ||||||
|  | -	pmovmskb	%xmm1, %edx
 | ||||||
|  | -	pmovmskb	%xmm2, %r8d
 | ||||||
|  | -	pmovmskb	%xmm3, %ecx
 | ||||||
|  | -	salq	$16, %rdx
 | ||||||
|  | -	salq	$16, %rcx
 | ||||||
|  | -	orq	%r8, %rcx
 | ||||||
|  | -	salq	$32, %rcx
 | ||||||
|  | -	orq	%rcx, %rdx
 | ||||||
|  | -#endif
 | ||||||
|  | -
 | ||||||
|  | -	/* When no zero byte is found xmm1-3 are zero so we do not have to
 | ||||||
|  | -	   zero them.  */
 | ||||||
|  | -	PROLOG(loop)
 | ||||||
|  | -
 | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(cross_page):
 | ||||||
|  | -	andq	$-64, %rax
 | ||||||
|  | -	FIND_ZERO
 | ||||||
|  | -	PROLOG(loop_init)
 | ||||||
|  | -
 | ||||||
|  | -#ifdef AS_STRNLEN
 | ||||||
|  | -/* We must do this check to correctly handle strnlen (s, -1).  */
 | ||||||
|  | -L(strnlen_ret):
 | ||||||
|  | -	bts	%rsi, %rdx
 | ||||||
|  | -	sarq	%cl, %rdx
 | ||||||
|  | -	test	%rdx, %rdx
 | ||||||
|  | -	je	L(loop_init)
 | ||||||
|  | -	bsfq	%rdx, %rax
 | ||||||
|  | -	SHIFT_RETURN
 | ||||||
|  | -	ret
 | ||||||
|  | -#endif
 | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(loop_init):
 | ||||||
|  | -	pxor	%xmm1, %xmm1
 | ||||||
|  | -	pxor	%xmm2, %xmm2
 | ||||||
|  | -	pxor	%xmm3, %xmm3
 | ||||||
|  | -#ifdef AS_STRNLEN
 | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(loop):
 | ||||||
|  | -
 | ||||||
|  | -	addq	$64, %rax
 | ||||||
|  | -	cmpq	%rax, %r10
 | ||||||
|  | -	je	L(exit_end)
 | ||||||
|  | -
 | ||||||
|  | -	movdqa	(%rax), %xmm0
 | ||||||
|  | -	PMINU	16(%rax), %xmm0
 | ||||||
|  | -	PMINU	32(%rax), %xmm0
 | ||||||
|  | -	PMINU	48(%rax), %xmm0
 | ||||||
|  | -	PCMPEQ	%xmm3, %xmm0
 | ||||||
|  | -	pmovmskb	%xmm0, %edx
 | ||||||
|  | -	testl	%edx, %edx
 | ||||||
|  | -	jne	L(exit)
 | ||||||
|  | -	jmp	L(loop)
 | ||||||
|  | -
 | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(exit_end):
 | ||||||
|  | -	cmp	%rax, %r11
 | ||||||
|  | -	je	L(first) /* Do not read when end is at page boundary.  */
 | ||||||
|  | -	pxor	%xmm0, %xmm0
 | ||||||
|  | -	FIND_ZERO
 | ||||||
|  | -
 | ||||||
|  | -L(first):
 | ||||||
|  | -	bts	%r11, %rdx
 | ||||||
|  | -	bsfq	%rdx, %rdx
 | ||||||
|  | -	addq	%rdx, %rax
 | ||||||
|  | -	subq	%rdi, %rax
 | ||||||
|  | -	SHIFT_RETURN
 | ||||||
|  | -	ret
 | ||||||
|  | -
 | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(exit):
 | ||||||
|  | -	pxor	%xmm0, %xmm0
 | ||||||
|  | -	FIND_ZERO
 | ||||||
|  | -
 | ||||||
|  | -	bsfq	%rdx, %rdx
 | ||||||
|  | -	addq	%rdx, %rax
 | ||||||
|  | -	subq	%rdi, %rax
 | ||||||
|  | -	SHIFT_RETURN
 | ||||||
|  | -	ret
 | ||||||
|  | -
 | ||||||
|  | -#else
 | ||||||
|  | -
 | ||||||
|  | -	/* Main loop.  Unrolled twice to improve L2 cache performance on core2.  */
 | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(loop):
 | ||||||
|  | -
 | ||||||
|  | -	movdqa	64(%rax), %xmm0
 | ||||||
|  | -	PMINU	80(%rax), %xmm0
 | ||||||
|  | -	PMINU	96(%rax), %xmm0
 | ||||||
|  | -	PMINU	112(%rax), %xmm0
 | ||||||
|  | -	PCMPEQ	%xmm3, %xmm0
 | ||||||
|  | -	pmovmskb	%xmm0, %edx
 | ||||||
|  | -	testl	%edx, %edx
 | ||||||
|  | -	jne	L(exit64)
 | ||||||
|  | -
 | ||||||
|  | -	subq	$-128, %rax
 | ||||||
|  | -
 | ||||||
|  | -	movdqa	(%rax), %xmm0
 | ||||||
|  | -	PMINU	16(%rax), %xmm0
 | ||||||
|  | -	PMINU	32(%rax), %xmm0
 | ||||||
|  | -	PMINU	48(%rax), %xmm0
 | ||||||
|  | -	PCMPEQ	%xmm3, %xmm0
 | ||||||
|  | -	pmovmskb	%xmm0, %edx
 | ||||||
|  | -	testl	%edx, %edx
 | ||||||
|  | -	jne	L(exit0)
 | ||||||
|  | -	jmp	L(loop)
 | ||||||
|  | -
 | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(exit64):
 | ||||||
|  | -	addq	$64, %rax
 | ||||||
|  | -L(exit0):
 | ||||||
|  | -	pxor	%xmm0, %xmm0
 | ||||||
|  | -	FIND_ZERO
 | ||||||
|  | -
 | ||||||
|  | -	bsfq	%rdx, %rdx
 | ||||||
|  | -	addq	%rdx, %rax
 | ||||||
|  | -	subq	%rdi, %rax
 | ||||||
|  | -	SHIFT_RETURN
 | ||||||
|  | -	ret
 | ||||||
|  | -
 | ||||||
|  | -#endif
 | ||||||
|  | -
 | ||||||
|  | -END(strlen)
 | ||||||
|  |  libc_hidden_builtin_def (strlen) | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
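
The move is safe because strlen-vec.S is one parameterized body: AS_WCSLEN swaps the byte ops (pminub/pcmpeqb) for dword ops (pminud/pcmpeqd) and adds the shrq $2 on return, AS_STRNLEN enables the length handling, and each wrapper just renames the entry point before including it. A minimal C analogue of that build pattern (hypothetical names, scalar body for brevity):

    #include <stddef.h>

    /* One shared body, compiled once per variant.  A wrapper file does
       e.g. "#define AS_WCSLEN" and "#define LEN_NAME my_wcslen" before
       including this, mirroring strlen-sse2.S and wcsnlen-sse4_1.S in
       the patch above.  */
    #ifdef AS_WCSLEN
    # define ELEM wchar_t
    #else
    # define ELEM char
    #endif
    #ifndef LEN_NAME
    # define LEN_NAME my_strlen   /* default variant */
    #endif

    size_t
    LEN_NAME (const ELEM *s)
    {
      const ELEM *p = s;
      while (*p != 0)
        p++;
      return (size_t) (p - s);
    }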
							
								
								
									
SOURCES/glibc-RHEL-15696-29.patch (new file, 181 lines)
| @ -0,0 +1,181 @@ | |||||||
|  | From 6f573a27b6c8b4236445810a44660612323f5a73 Mon Sep 17 00:00:00 2001 | ||||||
|  | From: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  | Date: Wed, 23 Jun 2021 01:19:34 -0400 | ||||||
|  | Subject: [PATCH] x86-64: Add wcslen optimize for sse4.1 | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | No bug. This commit adds the ifunc / build infrastructure | ||||||
|  | necessary for wcslen to prefer the sse4.1 implementation | ||||||
|  | in strlen-vec.S. test-wcslen.c is passing. | ||||||
|  | 
 | ||||||
|  | Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  | Reviewed-by: H.J. Lu <hjl.tools@gmail.com> | ||||||
|  | ---
 | ||||||
|  |  sysdeps/x86_64/multiarch/Makefile          |  4 +- | ||||||
|  |  sysdeps/x86_64/multiarch/ifunc-impl-list.c |  3 ++ | ||||||
|  |  sysdeps/x86_64/multiarch/ifunc-wcslen.h    | 52 ++++++++++++++++++++++ | ||||||
|  |  sysdeps/x86_64/multiarch/wcslen-sse4_1.S   |  4 ++ | ||||||
|  |  sysdeps/x86_64/multiarch/wcslen.c          |  2 +- | ||||||
|  |  sysdeps/x86_64/multiarch/wcsnlen.c         | 34 +------------- | ||||||
|  |  6 files changed, 63 insertions(+), 36 deletions(-) | ||||||
|  |  create mode 100644 sysdeps/x86_64/multiarch/ifunc-wcslen.h | ||||||
|  |  create mode 100644 sysdeps/x86_64/multiarch/wcslen-sse4_1.S | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
 | ||||||
|  | index 491c7698..65fde4eb 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/Makefile
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/Makefile
 | ||||||
|  | @@ -93,8 +93,8 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
 | ||||||
|  |  		   wcscpy-ssse3 wcscpy-c \ | ||||||
|  |  		   wcschr-sse2 wcschr-avx2 \ | ||||||
|  |  		   wcsrchr-sse2 wcsrchr-avx2 \ | ||||||
|  | -		   wcsnlen-sse4_1 wcsnlen-c \
 | ||||||
|  | -		   wcslen-sse2 wcslen-avx2 wcsnlen-avx2 \
 | ||||||
|  | +		   wcslen-sse2 wcslen-sse4_1 wcslen-avx2 \
 | ||||||
|  | +		   wcsnlen-c wcsnlen-sse4_1 wcsnlen-avx2 \
 | ||||||
|  |  		   wcschr-avx2-rtm \ | ||||||
|  |  		   wcscmp-avx2-rtm \ | ||||||
|  |  		   wcslen-avx2-rtm \ | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 | ||||||
|  | index f1a6460a..580913ca 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 | ||||||
|  | @@ -657,6 +657,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 | ||||||
|  |  			       && CPU_FEATURE_USABLE (AVX512BW) | ||||||
|  |  			       && CPU_FEATURE_USABLE (BMI2)), | ||||||
|  |  			      __wcslen_evex) | ||||||
|  | +	      IFUNC_IMPL_ADD (array, i, wcsnlen,
 | ||||||
|  | +			      CPU_FEATURE_USABLE (SSE4_1),
 | ||||||
|  | +			      __wcsnlen_sse4_1)
 | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_sse2)) | ||||||
|  |   | ||||||
|  |    /* Support sysdeps/x86_64/multiarch/wcsnlen.c.  */ | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/ifunc-wcslen.h b/sysdeps/x86_64/multiarch/ifunc-wcslen.h
 | ||||||
|  | new file mode 100644 | ||||||
|  | index 00000000..39e33473
 | ||||||
|  | --- /dev/null
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/ifunc-wcslen.h
 | ||||||
|  | @@ -0,0 +1,52 @@
 | ||||||
|  | +/* Common definition for ifunc selections for wcslen and wcsnlen
 | ||||||
|  | +   All versions must be listed in ifunc-impl-list.c.
 | ||||||
|  | +   Copyright (C) 2017-2021 Free Software Foundation, Inc.
 | ||||||
|  | +   This file is part of the GNU C Library.
 | ||||||
|  | +
 | ||||||
|  | +   The GNU C Library is free software; you can redistribute it and/or
 | ||||||
|  | +   modify it under the terms of the GNU Lesser General Public
 | ||||||
|  | +   License as published by the Free Software Foundation; either
 | ||||||
|  | +   version 2.1 of the License, or (at your option) any later version.
 | ||||||
|  | +
 | ||||||
|  | +   The GNU C Library is distributed in the hope that it will be useful,
 | ||||||
|  | +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 | ||||||
|  | +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 | ||||||
|  | +   Lesser General Public License for more details.
 | ||||||
|  | +
 | ||||||
|  | +   You should have received a copy of the GNU Lesser General Public
 | ||||||
|  | +   License along with the GNU C Library; if not, see
 | ||||||
|  | +   <https://www.gnu.org/licenses/>.  */
 | ||||||
|  | +
 | ||||||
|  | +#include <init-arch.h>
 | ||||||
|  | +
 | ||||||
|  | +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
 | ||||||
|  | +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
 | ||||||
|  | +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
 | ||||||
|  | +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
 | ||||||
|  | +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
 | ||||||
|  | +
 | ||||||
|  | +static inline void *
 | ||||||
|  | +IFUNC_SELECTOR (void)
 | ||||||
|  | +{
 | ||||||
|  | +  const struct cpu_features* cpu_features = __get_cpu_features ();
 | ||||||
|  | +
 | ||||||
|  | +  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
 | ||||||
|  | +      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
 | ||||||
|  | +      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
 | ||||||
|  | +    {
 | ||||||
|  | +      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
 | ||||||
|  | +	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
 | ||||||
|  | +	return OPTIMIZE (evex);
 | ||||||
|  | +
 | ||||||
|  | +      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
 | ||||||
|  | +	return OPTIMIZE (avx2_rtm);
 | ||||||
|  | +
 | ||||||
|  | +      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
 | ||||||
|  | +	return OPTIMIZE (avx2);
 | ||||||
|  | +    }
 | ||||||
|  | +
 | ||||||
|  | +  if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
 | ||||||
|  | +    return OPTIMIZE (sse4_1);
 | ||||||
|  | +
 | ||||||
|  | +  return OPTIMIZE (sse2);
 | ||||||
|  | +}
 | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/wcslen-sse4_1.S b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
 | ||||||
|  | new file mode 100644 | ||||||
|  | index 00000000..7e62621a
 | ||||||
|  | --- /dev/null
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
 | ||||||
|  | @@ -0,0 +1,4 @@
 | ||||||
|  | +#define AS_WCSLEN
 | ||||||
|  | +#define strlen	__wcslen_sse4_1
 | ||||||
|  | +
 | ||||||
|  | +#include "strlen-vec.S"
 | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/wcslen.c b/sysdeps/x86_64/multiarch/wcslen.c
 | ||||||
|  | index 6d06e47c..3b04b75b 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/wcslen.c
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/wcslen.c
 | ||||||
|  | @@ -24,7 +24,7 @@
 | ||||||
|  |  # undef __wcslen | ||||||
|  |   | ||||||
|  |  # define SYMBOL_NAME wcslen | ||||||
|  | -# include "ifunc-avx2.h"
 | ||||||
|  | +# include "ifunc-wcslen.h"
 | ||||||
|  |   | ||||||
|  |  libc_ifunc_redirected (__redirect_wcslen, __wcslen, IFUNC_SELECTOR ()); | ||||||
|  |  weak_alias (__wcslen, wcslen); | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/wcsnlen.c b/sysdeps/x86_64/multiarch/wcsnlen.c
 | ||||||
|  | index 20b731ae..06736410 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/wcsnlen.c
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/wcsnlen.c
 | ||||||
|  | @@ -24,39 +24,7 @@
 | ||||||
|  |  # undef __wcsnlen | ||||||
|  |   | ||||||
|  |  # define SYMBOL_NAME wcsnlen | ||||||
|  | -# include <init-arch.h>
 | ||||||
|  | -
 | ||||||
|  | -extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
 | ||||||
|  | -extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
 | ||||||
|  | -extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
 | ||||||
|  | -extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
 | ||||||
|  | -extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
 | ||||||
|  | -
 | ||||||
|  | -static inline void *
 | ||||||
|  | -IFUNC_SELECTOR (void)
 | ||||||
|  | -{
 | ||||||
|  | -  const struct cpu_features* cpu_features = __get_cpu_features ();
 | ||||||
|  | -
 | ||||||
|  | -  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
 | ||||||
|  | -      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
 | ||||||
|  | -    {
 | ||||||
|  | -      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
 | ||||||
|  | -	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
 | ||||||
|  | -	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
 | ||||||
|  | -	return OPTIMIZE (evex);
 | ||||||
|  | -
 | ||||||
|  | -      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
 | ||||||
|  | -	return OPTIMIZE (avx2_rtm);
 | ||||||
|  | -
 | ||||||
|  | -      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
 | ||||||
|  | -	return OPTIMIZE (avx2);
 | ||||||
|  | -    }
 | ||||||
|  | -
 | ||||||
|  | -  if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
 | ||||||
|  | -    return OPTIMIZE (sse4_1);
 | ||||||
|  | -
 | ||||||
|  | -  return OPTIMIZE (sse2);
 | ||||||
|  | -}
 | ||||||
|  | +# include "ifunc-wcslen.h"
 | ||||||
|  |   | ||||||
|  |  libc_ifunc_redirected (__redirect_wcsnlen, __wcsnlen, IFUNC_SELECTOR ()); | ||||||
|  |  weak_alias (__wcsnlen, wcsnlen); | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
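The selector shown above (duplicated until now in wcslen.c and wcsnlen.c, and consolidated here into ifunc-wcslen.h) is glibc's IFUNC dispatch: a resolver the dynamic linker runs once at load time to pick the best implementation for the running CPU. As a rough standalone sketch of the same idea, assuming GCC on an ELF target and using the compiler's generic ifunc attribute instead of glibc's REDIRECT_NAME/OPTIMIZE macros:

#include <stddef.h>
#include <wchar.h>

/* Portable fallback implementation.  */
static size_t
my_wcslen_generic (const wchar_t *s)
{
  size_t n = 0;
  while (s[n] != L'\0')
    n++;
  return n;
}

/* Stand-in for a vectorized variant.  */
static size_t
my_wcslen_avx2 (const wchar_t *s)
{
  return my_wcslen_generic (s);
}

/* Resolver: runs before the first call; its return value is bound
   into the GOT, so the feature test happens exactly once.  */
static size_t (*resolve_my_wcslen (void)) (const wchar_t *)
{
  __builtin_cpu_init ();	/* resolvers can run before constructors */
  return __builtin_cpu_supports ("avx2")
	 ? my_wcslen_avx2 : my_wcslen_generic;
}

size_t my_wcslen (const wchar_t *s)
     __attribute__ ((ifunc ("resolve_my_wcslen")));

The real selector orders its tests so that the EVEX (AVX512VL/AVX512BW) variant wins over the RTM-safe AVX2 variant, which in turn wins over plain AVX2, matching the cascade of CPU_FEATURE_USABLE_P checks in the hunk above.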
							
								
								
									
396  SOURCES/glibc-RHEL-15696-3.patch  Normal file
							| @ -0,0 +1,396 @@ | |||||||
|  | From 231c56760c1e2ded21ad96bbb860b1f08c556c7a Mon Sep 17 00:00:00 2001 | ||||||
|  | From: "H.J. Lu" <hjl.tools@gmail.com> | ||||||
|  | Date: Mon, 21 Jan 2019 11:27:25 -0800 | ||||||
|  | Subject: [PATCH] x86-64 memcpy: Properly handle the length parameter [BZ# | ||||||
|  |  24097] | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | On x32, the size_t parameter may be passed in the lower 32 bits of a | ||||||
|  | 64-bit register with non-zero upper 32 bits.  The string/memory | ||||||
|  | functions written in assembly can only use the lower 32 bits of a | ||||||
|  | 64-bit register as length or must clear the upper 32 bits before using | ||||||
|  | the full 64-bit register for length. | ||||||
|  | 
 | ||||||
|  | This patch fixes memcpy for x32.  Tested on x86-64 and x32.  On x86-64, | ||||||
|  | libc.so is the same with and without the fix. | ||||||
|  | 
 | ||||||
|  | 	[BZ# 24097] | ||||||
|  | 	CVE-2019-6488 | ||||||
|  | 	* sysdeps/x86_64/multiarch/memcpy-ssse3-back.S: Use RDX_LP for | ||||||
|  | 	length.  Clear the upper 32 bits of RDX register. | ||||||
|  | 	* sysdeps/x86_64/multiarch/memcpy-ssse3.S: Likewise. | ||||||
|  | 	* sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S: | ||||||
|  | 	Likewise. | ||||||
|  | 	* sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S: | ||||||
|  | 	Likewise. | ||||||
|  | 	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcpy. | ||||||
|  | 	* sysdeps/x86_64/x32/tst-size_t-memcpy.c: New file. | ||||||
|  | ---
 | ||||||
|  |  sysdeps/x86_64/multiarch/memcpy-ssse3-back.S  | 17 ++++-- | ||||||
|  |  sysdeps/x86_64/multiarch/memcpy-ssse3.S       | 17 ++++-- | ||||||
|  |  .../multiarch/memmove-avx512-no-vzeroupper.S  | 16 +++-- | ||||||
|  |  .../multiarch/memmove-vec-unaligned-erms.S    | 54 +++++++++-------- | ||||||
|  |  sysdeps/x86_64/x32/Makefile                   |  2 +- | ||||||
|  |  sysdeps/x86_64/x32/tst-size_t-memcpy.c        | 58 +++++++++++++++++++ | ||||||
|  |  6 files changed, 122 insertions(+), 42 deletions(-) | ||||||
|  |  create mode 100644 sysdeps/x86_64/x32/tst-size_t-memcpy.c | ||||||
|  | 
 | ||||||
|  | Conflicts: | ||||||
|  | 	ChangeLog | ||||||
|  | 	(removed) | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
 | ||||||
|  | index 3cd11233..568eebd3 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
 | ||||||
|  | @@ -45,28 +45,33 @@
 | ||||||
|  |  	.section .text.ssse3,"ax",@progbits | ||||||
|  |  #if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE | ||||||
|  |  ENTRY (MEMPCPY_CHK) | ||||||
|  | -	cmpq	%rdx, %rcx
 | ||||||
|  | +	cmp	%RDX_LP, %RCX_LP
 | ||||||
|  |  	jb	HIDDEN_JUMPTARGET (__chk_fail) | ||||||
|  |  END (MEMPCPY_CHK) | ||||||
|  |   | ||||||
|  |  ENTRY (MEMPCPY) | ||||||
|  | -	movq	%rdi, %rax
 | ||||||
|  | -	addq	%rdx, %rax
 | ||||||
|  | +	mov	%RDI_LP, %RAX_LP
 | ||||||
|  | +	add	%RDX_LP, %RAX_LP
 | ||||||
|  |  	jmp	L(start) | ||||||
|  |  END (MEMPCPY) | ||||||
|  |  #endif | ||||||
|  |   | ||||||
|  |  #if !defined USE_AS_BCOPY | ||||||
|  |  ENTRY (MEMCPY_CHK) | ||||||
|  | -	cmpq	%rdx, %rcx
 | ||||||
|  | +	cmp	%RDX_LP, %RCX_LP
 | ||||||
|  |  	jb	HIDDEN_JUMPTARGET (__chk_fail) | ||||||
|  |  END (MEMCPY_CHK) | ||||||
|  |  #endif | ||||||
|  |   | ||||||
|  |  ENTRY (MEMCPY) | ||||||
|  | -	mov	%rdi, %rax
 | ||||||
|  | +	mov	%RDI_LP, %RAX_LP
 | ||||||
|  |  #ifdef USE_AS_MEMPCPY | ||||||
|  | -	add	%rdx, %rax
 | ||||||
|  | +	add	%RDX_LP, %RAX_LP
 | ||||||
|  | +#endif
 | ||||||
|  | +
 | ||||||
|  | +#ifdef __ILP32__
 | ||||||
|  | +	/* Clear the upper 32 bits.  */
 | ||||||
|  | +	mov	%edx, %edx
 | ||||||
|  |  #endif | ||||||
|  |   | ||||||
|  |  #ifdef USE_AS_MEMMOVE | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
 | ||||||
|  | index 0240bfa3..0bd5ee99 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
 | ||||||
|  | @@ -45,28 +45,33 @@
 | ||||||
|  |  	.section .text.ssse3,"ax",@progbits | ||||||
|  |  #if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE | ||||||
|  |  ENTRY (MEMPCPY_CHK) | ||||||
|  | -	cmpq	%rdx, %rcx
 | ||||||
|  | +	cmp	%RDX_LP, %RCX_LP
 | ||||||
|  |  	jb	HIDDEN_JUMPTARGET (__chk_fail) | ||||||
|  |  END (MEMPCPY_CHK) | ||||||
|  |   | ||||||
|  |  ENTRY (MEMPCPY) | ||||||
|  | -	movq	%rdi, %rax
 | ||||||
|  | -	addq	%rdx, %rax
 | ||||||
|  | +	mov	%RDI_LP, %RAX_LP
 | ||||||
|  | +	add	%RDX_LP, %RAX_LP
 | ||||||
|  |  	jmp	L(start) | ||||||
|  |  END (MEMPCPY) | ||||||
|  |  #endif | ||||||
|  |   | ||||||
|  |  #if !defined USE_AS_BCOPY | ||||||
|  |  ENTRY (MEMCPY_CHK) | ||||||
|  | -	cmpq	%rdx, %rcx
 | ||||||
|  | +	cmp	%RDX_LP, %RCX_LP
 | ||||||
|  |  	jb	HIDDEN_JUMPTARGET (__chk_fail) | ||||||
|  |  END (MEMCPY_CHK) | ||||||
|  |  #endif | ||||||
|  |   | ||||||
|  |  ENTRY (MEMCPY) | ||||||
|  | -	mov	%rdi, %rax
 | ||||||
|  | +	mov	%RDI_LP, %RAX_LP
 | ||||||
|  |  #ifdef USE_AS_MEMPCPY | ||||||
|  | -	add	%rdx, %rax
 | ||||||
|  | +	add	%RDX_LP, %RAX_LP
 | ||||||
|  | +#endif
 | ||||||
|  | +
 | ||||||
|  | +#ifdef __ILP32__
 | ||||||
|  | +	/* Clear the upper 32 bits.  */
 | ||||||
|  | +	mov	%edx, %edx
 | ||||||
|  |  #endif | ||||||
|  |   | ||||||
|  |  #ifdef USE_AS_MEMMOVE | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
 | ||||||
|  | index effc3ac2..6ca2bbc9 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
 | ||||||
|  | @@ -24,27 +24,31 @@
 | ||||||
|  |   | ||||||
|  |  	.section .text.avx512,"ax",@progbits | ||||||
|  |  ENTRY (__mempcpy_chk_avx512_no_vzeroupper) | ||||||
|  | -	cmpq	%rdx, %rcx
 | ||||||
|  | +	cmp	%RDX_LP, %RCX_LP
 | ||||||
|  |  	jb	HIDDEN_JUMPTARGET (__chk_fail) | ||||||
|  |  END (__mempcpy_chk_avx512_no_vzeroupper) | ||||||
|  |   | ||||||
|  |  ENTRY (__mempcpy_avx512_no_vzeroupper) | ||||||
|  | -	movq	%rdi, %rax
 | ||||||
|  | -	addq	%rdx, %rax
 | ||||||
|  | +	mov	%RDI_LP, %RAX_LP
 | ||||||
|  | +	add	%RDX_LP, %RAX_LP
 | ||||||
|  |  	jmp	L(start) | ||||||
|  |  END (__mempcpy_avx512_no_vzeroupper) | ||||||
|  |   | ||||||
|  |  ENTRY (__memmove_chk_avx512_no_vzeroupper) | ||||||
|  | -	cmpq	%rdx, %rcx
 | ||||||
|  | +	cmp	%RDX_LP, %RCX_LP
 | ||||||
|  |  	jb	HIDDEN_JUMPTARGET (__chk_fail) | ||||||
|  |  END (__memmove_chk_avx512_no_vzeroupper) | ||||||
|  |   | ||||||
|  |  ENTRY (__memmove_avx512_no_vzeroupper) | ||||||
|  | -	mov	%rdi, %rax
 | ||||||
|  | +	mov	%RDI_LP, %RAX_LP
 | ||||||
|  |  # ifdef USE_AS_MEMPCPY | ||||||
|  | -	add	%rdx, %rax
 | ||||||
|  | +	add	%RDX_LP, %RAX_LP
 | ||||||
|  |  # endif | ||||||
|  |  L(start): | ||||||
|  | +# ifdef __ILP32__
 | ||||||
|  | +	/* Clear the upper 32 bits.  */
 | ||||||
|  | +	mov	%edx, %edx
 | ||||||
|  | +# endif
 | ||||||
|  |  	lea	(%rsi, %rdx), %rcx | ||||||
|  |  	lea	(%rdi, %rdx), %r9 | ||||||
|  |  	cmp	$512, %rdx | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
 | ||||||
|  | index c952576c..274aa1c7 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
 | ||||||
|  | @@ -95,20 +95,20 @@
 | ||||||
|  |  	.section SECTION(.text),"ax",@progbits | ||||||
|  |  #if defined SHARED && IS_IN (libc) | ||||||
|  |  ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned)) | ||||||
|  | -	cmpq	%rdx, %rcx
 | ||||||
|  | +	cmp	%RDX_LP, %RCX_LP
 | ||||||
|  |  	jb	HIDDEN_JUMPTARGET (__chk_fail) | ||||||
|  |  END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned)) | ||||||
|  |  #endif | ||||||
|  |   | ||||||
|  |  ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned)) | ||||||
|  | -	movq	%rdi, %rax
 | ||||||
|  | -	addq	%rdx, %rax
 | ||||||
|  | +	mov	%RDI_LP, %RAX_LP
 | ||||||
|  | +	add	%RDX_LP, %RAX_LP
 | ||||||
|  |  	jmp	L(start) | ||||||
|  |  END (MEMPCPY_SYMBOL (__mempcpy, unaligned)) | ||||||
|  |   | ||||||
|  |  #if defined SHARED && IS_IN (libc) | ||||||
|  |  ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned)) | ||||||
|  | -	cmpq	%rdx, %rcx
 | ||||||
|  | +	cmp	%RDX_LP, %RCX_LP
 | ||||||
|  |  	jb	HIDDEN_JUMPTARGET (__chk_fail) | ||||||
|  |  END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned)) | ||||||
|  |  #endif | ||||||
|  | @@ -116,9 +116,13 @@ END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
 | ||||||
|  |  ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned)) | ||||||
|  |  	movq	%rdi, %rax | ||||||
|  |  L(start): | ||||||
|  | -	cmpq	$VEC_SIZE, %rdx
 | ||||||
|  | +# ifdef __ILP32__
 | ||||||
|  | +	/* Clear the upper 32 bits.  */
 | ||||||
|  | +	movl	%edx, %edx
 | ||||||
|  | +# endif
 | ||||||
|  | +	cmp	$VEC_SIZE, %RDX_LP
 | ||||||
|  |  	jb	L(less_vec) | ||||||
|  | -	cmpq	$(VEC_SIZE * 2), %rdx
 | ||||||
|  | +	cmp	$(VEC_SIZE * 2), %RDX_LP
 | ||||||
|  |  	ja	L(more_2x_vec) | ||||||
|  |  #if !defined USE_MULTIARCH || !IS_IN (libc) | ||||||
|  |  L(last_2x_vec): | ||||||
|  | @@ -138,38 +142,38 @@ END (MEMMOVE_SYMBOL (__memmove, unaligned))
 | ||||||
|  |   | ||||||
|  |  # if VEC_SIZE == 16 | ||||||
|  |  ENTRY (__mempcpy_chk_erms) | ||||||
|  | -	cmpq	%rdx, %rcx
 | ||||||
|  | +	cmp	%RDX_LP, %RCX_LP
 | ||||||
|  |  	jb	HIDDEN_JUMPTARGET (__chk_fail) | ||||||
|  |  END (__mempcpy_chk_erms) | ||||||
|  |   | ||||||
|  |  /* Only used to measure performance of REP MOVSB.  */ | ||||||
|  |  ENTRY (__mempcpy_erms) | ||||||
|  | -	movq	%rdi, %rax
 | ||||||
|  | +	mov	%RDI_LP, %RAX_LP
 | ||||||
|  |  	/* Skip zero length.  */ | ||||||
|  | -	testq	%rdx, %rdx
 | ||||||
|  | +	test	%RDX_LP, %RDX_LP
 | ||||||
|  |  	jz	2f | ||||||
|  | -	addq	%rdx, %rax
 | ||||||
|  | +	add	%RDX_LP, %RAX_LP
 | ||||||
|  |  	jmp	L(start_movsb) | ||||||
|  |  END (__mempcpy_erms) | ||||||
|  |   | ||||||
|  |  ENTRY (__memmove_chk_erms) | ||||||
|  | -	cmpq	%rdx, %rcx
 | ||||||
|  | +	cmp	%RDX_LP, %RCX_LP
 | ||||||
|  |  	jb	HIDDEN_JUMPTARGET (__chk_fail) | ||||||
|  |  END (__memmove_chk_erms) | ||||||
|  |   | ||||||
|  |  ENTRY (__memmove_erms) | ||||||
|  |  	movq	%rdi, %rax | ||||||
|  |  	/* Skip zero length.  */ | ||||||
|  | -	testq	%rdx, %rdx
 | ||||||
|  | +	test	%RDX_LP, %RDX_LP
 | ||||||
|  |  	jz	2f | ||||||
|  |  L(start_movsb): | ||||||
|  | -	movq	%rdx, %rcx
 | ||||||
|  | -	cmpq	%rsi, %rdi
 | ||||||
|  | +	mov	%RDX_LP, %RCX_LP
 | ||||||
|  | +	cmp	%RSI_LP, %RDI_LP
 | ||||||
|  |  	jb	1f | ||||||
|  |  	/* Source == destination is less common.  */ | ||||||
|  |  	je	2f | ||||||
|  | -	leaq	(%rsi,%rcx), %rdx
 | ||||||
|  | -	cmpq	%rdx, %rdi
 | ||||||
|  | +	lea	(%rsi,%rcx), %RDX_LP
 | ||||||
|  | +	cmp	%RDX_LP, %RDI_LP
 | ||||||
|  |  	jb	L(movsb_backward) | ||||||
|  |  1: | ||||||
|  |  	rep movsb | ||||||
|  | @@ -189,20 +193,20 @@ strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
 | ||||||
|  |   | ||||||
|  |  # ifdef SHARED | ||||||
|  |  ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms)) | ||||||
|  | -	cmpq	%rdx, %rcx
 | ||||||
|  | +	cmp	%RDX_LP, %RCX_LP
 | ||||||
|  |  	jb	HIDDEN_JUMPTARGET (__chk_fail) | ||||||
|  |  END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms)) | ||||||
|  |  # endif | ||||||
|  |   | ||||||
|  |  ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms)) | ||||||
|  | -	movq	%rdi, %rax
 | ||||||
|  | -	addq	%rdx, %rax
 | ||||||
|  | +	mov	%RDI_LP, %RAX_LP
 | ||||||
|  | +	add	%RDX_LP, %RAX_LP
 | ||||||
|  |  	jmp	L(start_erms) | ||||||
|  |  END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms)) | ||||||
|  |   | ||||||
|  |  # ifdef SHARED | ||||||
|  |  ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) | ||||||
|  | -	cmpq	%rdx, %rcx
 | ||||||
|  | +	cmp	%RDX_LP, %RCX_LP
 | ||||||
|  |  	jb	HIDDEN_JUMPTARGET (__chk_fail) | ||||||
|  |  END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) | ||||||
|  |  # endif | ||||||
|  | @@ -210,9 +214,13 @@ END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
 | ||||||
|  |  ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) | ||||||
|  |  	movq	%rdi, %rax | ||||||
|  |  L(start_erms): | ||||||
|  | -	cmpq	$VEC_SIZE, %rdx
 | ||||||
|  | +# ifdef __ILP32__
 | ||||||
|  | +	/* Clear the upper 32 bits.  */
 | ||||||
|  | +	movl	%edx, %edx
 | ||||||
|  | +# endif
 | ||||||
|  | +	cmp	$VEC_SIZE, %RDX_LP
 | ||||||
|  |  	jb	L(less_vec) | ||||||
|  | -	cmpq	$(VEC_SIZE * 2), %rdx
 | ||||||
|  | +	cmp	$(VEC_SIZE * 2), %RDX_LP
 | ||||||
|  |  	ja	L(movsb_more_2x_vec) | ||||||
|  |  L(last_2x_vec): | ||||||
|  |  	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE. */ | ||||||
|  | @@ -236,7 +244,7 @@ L(movsb):
 | ||||||
|  |  	/* Avoid slow backward REP MOVSB.  */ | ||||||
|  |  	jb	L(more_8x_vec_backward) | ||||||
|  |  1: | ||||||
|  | -	movq	%rdx, %rcx
 | ||||||
|  | +	mov	%RDX_LP, %RCX_LP
 | ||||||
|  |  	rep movsb | ||||||
|  |  L(nop): | ||||||
|  |  	ret | ||||||
|  | diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
 | ||||||
|  | index ddec7f04..2fe1e5ac 100644
 | ||||||
|  | --- a/sysdeps/x86_64/x32/Makefile
 | ||||||
|  | +++ b/sysdeps/x86_64/x32/Makefile
 | ||||||
|  | @@ -6,7 +6,7 @@ CFLAGS-s_llround.c += -fno-builtin-lround
 | ||||||
|  |  endif | ||||||
|  |   | ||||||
|  |  ifeq ($(subdir),string) | ||||||
|  | -tests += tst-size_t-memchr tst-size_t-memcmp
 | ||||||
|  | +tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy
 | ||||||
|  |  endif | ||||||
|  |   | ||||||
|  |  ifeq ($(subdir),wcsmbs) | ||||||
|  | diff --git a/sysdeps/x86_64/x32/tst-size_t-memcpy.c b/sysdeps/x86_64/x32/tst-size_t-memcpy.c
 | ||||||
|  | new file mode 100644 | ||||||
|  | index 00000000..66b71e17
 | ||||||
|  | --- /dev/null
 | ||||||
|  | +++ b/sysdeps/x86_64/x32/tst-size_t-memcpy.c
 | ||||||
|  | @@ -0,0 +1,58 @@
 | ||||||
|  | +/* Test memcpy with size_t in the lower 32 bits of 64-bit register.
 | ||||||
|  | +   Copyright (C) 2019 Free Software Foundation, Inc.
 | ||||||
|  | +   This file is part of the GNU C Library.
 | ||||||
|  | +
 | ||||||
|  | +   The GNU C Library is free software; you can redistribute it and/or
 | ||||||
|  | +   modify it under the terms of the GNU Lesser General Public
 | ||||||
|  | +   License as published by the Free Software Foundation; either
 | ||||||
|  | +   version 2.1 of the License, or (at your option) any later version.
 | ||||||
|  | +
 | ||||||
|  | +   The GNU C Library is distributed in the hope that it will be useful,
 | ||||||
|  | +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 | ||||||
|  | +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 | ||||||
|  | +   Lesser General Public License for more details.
 | ||||||
|  | +
 | ||||||
|  | +   You should have received a copy of the GNU Lesser General Public
 | ||||||
|  | +   License along with the GNU C Library; if not, see
 | ||||||
|  | +   <http://www.gnu.org/licenses/>.  */
 | ||||||
|  | +
 | ||||||
|  | +#define TEST_NAME "memcpy"
 | ||||||
|  | +#include "test-size_t.h"
 | ||||||
|  | +
 | ||||||
|  | +IMPL (memcpy, 1)
 | ||||||
|  | +
 | ||||||
|  | +typedef void *(*proto_t) (void *, const void *, size_t);
 | ||||||
|  | +
 | ||||||
|  | +static void *
 | ||||||
|  | +__attribute__ ((noinline, noclone))
 | ||||||
|  | +do_memcpy (parameter_t a, parameter_t b)
 | ||||||
|  | +{
 | ||||||
|  | +  return CALL (&b, a.p, b.p, a.len);
 | ||||||
|  | +}
 | ||||||
|  | +
 | ||||||
|  | +static int
 | ||||||
|  | +test_main (void)
 | ||||||
|  | +{
 | ||||||
|  | +  test_init ();
 | ||||||
|  | +
 | ||||||
|  | +  parameter_t dest = { { page_size }, buf1 };
 | ||||||
|  | +  parameter_t src = { { 0 }, buf2 };
 | ||||||
|  | +
 | ||||||
|  | +  int ret = 0;
 | ||||||
|  | +  FOR_EACH_IMPL (impl, 0)
 | ||||||
|  | +    {
 | ||||||
|  | +      src.fn = impl->fn;
 | ||||||
|  | +      do_memcpy (dest, src);
 | ||||||
|  | +      int res = memcmp (dest.p, src.p, dest.len);
 | ||||||
|  | +      if (res)
 | ||||||
|  | +	{
 | ||||||
|  | +	  error (0, 0, "Wrong result in function %s: %i != 0",
 | ||||||
|  | +		 impl->name, res);
 | ||||||
|  | +	  ret = 1;
 | ||||||
|  | +	}
 | ||||||
|  | +    }
 | ||||||
|  | +
 | ||||||
|  | +  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
 | ||||||
|  | +}
 | ||||||
|  | +
 | ||||||
|  | +#include <support/test-driver.c>
 | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
							
								
								
									
497  SOURCES/glibc-RHEL-15696-30.patch  Normal file
							| @ -0,0 +1,497 @@ | |||||||
|  | From a775a7a3eb1e85b54af0b4ee5ff4dcf66772a1fb Mon Sep 17 00:00:00 2001 | ||||||
|  | From: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  | Date: Wed, 23 Jun 2021 01:56:29 -0400 | ||||||
|  | Subject: [PATCH] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 [BZ | ||||||
|  |  #27974] | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | This commit fixes the bug mentioned in the previous commit. | ||||||
|  | 
 | ||||||
|  | The previous implementations of wcsnlen in these files relied on | ||||||
|  | maxlen * sizeof(wchar_t) not overflowing, which is not guaranteed by the standard. | ||||||
|  | 
 | ||||||
|  | The new overflow tests added in the previous commit now | ||||||
|  | pass (as well as all the other tests). | ||||||
|  | 
 | ||||||
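At the C level the hazard looks like the hedged sketch below (an illustration, not glibc code). The assembly fix further down tests the top two bits of maxlen with `sar $62` and branches to plain __wcslen_sse4_1 whenever maxlen * sizeof(wchar_t) would wrap; it re-checks after adding the base pointer as well.

/* Why maxlen * sizeof(wchar_t) is unsafe without a check: with
   maxlen near SIZE_MAX the product wraps modulo 2^64, yielding a
   tiny byte budget and a wrong wcsnlen result.  */
#include <stdint.h>
#include <stddef.h>
#include <wchar.h>

static size_t
safe_byte_budget (size_t maxlen)
{
  if (maxlen > SIZE_MAX / sizeof (wchar_t))
    /* Product would overflow.  The patch falls back to wcslen here:
       if the program is well defined at all, a null terminator must
       exist in valid memory, so the bound is irrelevant.  */
    return SIZE_MAX;
  return maxlen * sizeof (wchar_t);
}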
|  | Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  | Reviewed-by: H.J. Lu <hjl.tools@gmail.com> | ||||||
|  | ---
 | ||||||
|  |  sysdeps/x86_64/multiarch/strlen-avx2.S | 130 ++++++++++++++++++------- | ||||||
|  |  sysdeps/x86_64/multiarch/strlen-vec.S  |  15 ++- | ||||||
|  |  2 files changed, 107 insertions(+), 38 deletions(-) | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
 | ||||||
|  | index be8a5db5..37688966 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/strlen-avx2.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
 | ||||||
|  | @@ -44,21 +44,21 @@
 | ||||||
|  |   | ||||||
|  |  # define VEC_SIZE 32 | ||||||
|  |  # define PAGE_SIZE 4096 | ||||||
|  | +# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
 | ||||||
|  |   | ||||||
|  |  	.section SECTION(.text),"ax",@progbits | ||||||
|  |  ENTRY (STRLEN) | ||||||
|  |  # ifdef USE_AS_STRNLEN | ||||||
|  |  	/* Check zero length.  */ | ||||||
|  | +#  ifdef __ILP32__
 | ||||||
|  | +	/* Clear upper bits.  */
 | ||||||
|  | +	and	%RSI_LP, %RSI_LP
 | ||||||
|  | +#  else
 | ||||||
|  |  	test	%RSI_LP, %RSI_LP | ||||||
|  | +#  endif
 | ||||||
|  |  	jz	L(zero) | ||||||
|  |  	/* Store max len in R8_LP before adjusting if using WCSLEN.  */ | ||||||
|  |  	mov	%RSI_LP, %R8_LP | ||||||
|  | -#  ifdef USE_AS_WCSLEN
 | ||||||
|  | -	shl	$2, %RSI_LP
 | ||||||
|  | -#  elif defined __ILP32__
 | ||||||
|  | -	/* Clear the upper 32 bits.  */
 | ||||||
|  | -	movl	%esi, %esi
 | ||||||
|  | -#  endif
 | ||||||
|  |  # endif | ||||||
|  |  	movl	%edi, %eax | ||||||
|  |  	movq	%rdi, %rdx | ||||||
|  | @@ -72,10 +72,10 @@ ENTRY (STRLEN)
 | ||||||
|  |   | ||||||
|  |  	/* Check the first VEC_SIZE bytes.  */ | ||||||
|  |  	VPCMPEQ	(%rdi), %ymm0, %ymm1 | ||||||
|  | -	vpmovmskb	%ymm1, %eax
 | ||||||
|  | +	vpmovmskb %ymm1, %eax
 | ||||||
|  |  # ifdef USE_AS_STRNLEN | ||||||
|  |  	/* If length < VEC_SIZE handle special.  */ | ||||||
|  | -	cmpq	$VEC_SIZE, %rsi
 | ||||||
|  | +	cmpq	$CHAR_PER_VEC, %rsi
 | ||||||
|  |  	jbe	L(first_vec_x0) | ||||||
|  |  # endif | ||||||
|  |  	/* If empty continue to aligned_more. Otherwise return bit | ||||||
|  | @@ -84,6 +84,7 @@ ENTRY (STRLEN)
 | ||||||
|  |  	jz	L(aligned_more) | ||||||
|  |  	tzcntl	%eax, %eax | ||||||
|  |  # ifdef USE_AS_WCSLEN | ||||||
|  | +	/* NB: Divide bytes by 4 to get wchar_t count.  */
 | ||||||
|  |  	shrl	$2, %eax | ||||||
|  |  # endif | ||||||
|  |  	VZEROUPPER_RETURN | ||||||
|  | @@ -97,9 +98,14 @@ L(zero):
 | ||||||
|  |  L(first_vec_x0): | ||||||
|  |  	/* Set bit for max len so that tzcnt will return min of max len | ||||||
|  |  	   and position of first match.  */ | ||||||
|  | +#  ifdef USE_AS_WCSLEN
 | ||||||
|  | +	/* NB: Multiply length by 4 to get byte count.  */
 | ||||||
|  | +	sall	$2, %esi
 | ||||||
|  | +#  endif
 | ||||||
|  |  	btsq	%rsi, %rax | ||||||
|  |  	tzcntl	%eax, %eax | ||||||
|  |  #  ifdef USE_AS_WCSLEN | ||||||
|  | +	/* NB: Divide bytes by 4 to get wchar_t count.  */
 | ||||||
|  |  	shrl	$2, %eax | ||||||
|  |  #  endif | ||||||
|  |  	VZEROUPPER_RETURN | ||||||
|  | @@ -113,14 +119,19 @@ L(first_vec_x1):
 | ||||||
|  |  # ifdef USE_AS_STRNLEN | ||||||
|  |  	/* Use ecx which was computed earlier to compute correct value. | ||||||
|  |  	 */ | ||||||
|  | +#  ifdef USE_AS_WCSLEN
 | ||||||
|  | +	leal	-(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax
 | ||||||
|  | +#  else
 | ||||||
|  |  	subl	$(VEC_SIZE * 4 + 1), %ecx | ||||||
|  |  	addl	%ecx, %eax | ||||||
|  | +#  endif
 | ||||||
|  |  # else | ||||||
|  |  	subl	%edx, %edi | ||||||
|  |  	incl	%edi | ||||||
|  |  	addl	%edi, %eax | ||||||
|  |  # endif | ||||||
|  |  # ifdef USE_AS_WCSLEN | ||||||
|  | +	/* NB: Divide bytes by 4 to get wchar_t count.  */
 | ||||||
|  |  	shrl	$2, %eax | ||||||
|  |  # endif | ||||||
|  |  	VZEROUPPER_RETURN | ||||||
|  | @@ -133,14 +144,19 @@ L(first_vec_x2):
 | ||||||
|  |  # ifdef USE_AS_STRNLEN | ||||||
|  |  	/* Use ecx which was computed earlier to compute correct value. | ||||||
|  |  	 */ | ||||||
|  | +#  ifdef USE_AS_WCSLEN
 | ||||||
|  | +	leal	-(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax
 | ||||||
|  | +#  else
 | ||||||
|  |  	subl	$(VEC_SIZE * 3 + 1), %ecx | ||||||
|  |  	addl	%ecx, %eax | ||||||
|  | +#  endif
 | ||||||
|  |  # else | ||||||
|  |  	subl	%edx, %edi | ||||||
|  |  	addl	$(VEC_SIZE + 1), %edi | ||||||
|  |  	addl	%edi, %eax | ||||||
|  |  # endif | ||||||
|  |  # ifdef USE_AS_WCSLEN | ||||||
|  | +	/* NB: Divide bytes by 4 to get wchar_t count.  */
 | ||||||
|  |  	shrl	$2, %eax | ||||||
|  |  # endif | ||||||
|  |  	VZEROUPPER_RETURN | ||||||
|  | @@ -153,14 +169,19 @@ L(first_vec_x3):
 | ||||||
|  |  # ifdef USE_AS_STRNLEN | ||||||
|  |  	/* Use ecx which was computed earlier to compute correct value. | ||||||
|  |  	 */ | ||||||
|  | +#  ifdef USE_AS_WCSLEN
 | ||||||
|  | +	leal	-(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax
 | ||||||
|  | +#  else
 | ||||||
|  |  	subl	$(VEC_SIZE * 2 + 1), %ecx | ||||||
|  |  	addl	%ecx, %eax | ||||||
|  | +#  endif
 | ||||||
|  |  # else | ||||||
|  |  	subl	%edx, %edi | ||||||
|  |  	addl	$(VEC_SIZE * 2 + 1), %edi | ||||||
|  |  	addl	%edi, %eax | ||||||
|  |  # endif | ||||||
|  |  # ifdef USE_AS_WCSLEN | ||||||
|  | +	/* NB: Divide bytes by 4 to get wchar_t count.  */
 | ||||||
|  |  	shrl	$2, %eax | ||||||
|  |  # endif | ||||||
|  |  	VZEROUPPER_RETURN | ||||||
|  | @@ -173,14 +194,19 @@ L(first_vec_x4):
 | ||||||
|  |  # ifdef USE_AS_STRNLEN | ||||||
|  |  	/* Use ecx which was computed earlier to compute correct value. | ||||||
|  |  	 */ | ||||||
|  | +#  ifdef USE_AS_WCSLEN
 | ||||||
|  | +	leal	-(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax
 | ||||||
|  | +#  else
 | ||||||
|  |  	subl	$(VEC_SIZE + 1), %ecx | ||||||
|  |  	addl	%ecx, %eax | ||||||
|  | +#  endif
 | ||||||
|  |  # else | ||||||
|  |  	subl	%edx, %edi | ||||||
|  |  	addl	$(VEC_SIZE * 3 + 1), %edi | ||||||
|  |  	addl	%edi, %eax | ||||||
|  |  # endif | ||||||
|  |  # ifdef USE_AS_WCSLEN | ||||||
|  | +	/* NB: Divide bytes by 4 to get wchar_t count.  */
 | ||||||
|  |  	shrl	$2, %eax | ||||||
|  |  # endif | ||||||
|  |  	VZEROUPPER_RETURN | ||||||
|  | @@ -195,10 +221,14 @@ L(cross_page_continue):
 | ||||||
|  |  	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time | ||||||
|  |  	   since data is only aligned to VEC_SIZE.  */ | ||||||
|  |  # ifdef USE_AS_STRNLEN | ||||||
|  | -	/* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
 | ||||||
|  | -	   it simplies the logic in last_4x_vec_or_less.  */
 | ||||||
|  | +	/* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
 | ||||||
|  | +	   because it simplies the logic in last_4x_vec_or_less.  */
 | ||||||
|  |  	leaq	(VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx | ||||||
|  |  	subq	%rdx, %rcx | ||||||
|  | +#  ifdef USE_AS_WCSLEN
 | ||||||
|  | +	/* NB: Divide bytes by 4 to get the wchar_t count.  */
 | ||||||
|  | +	sarl	$2, %ecx
 | ||||||
|  | +#  endif
 | ||||||
|  |  # endif | ||||||
|  |  	/* Load first VEC regardless.  */ | ||||||
|  |  	VPCMPEQ	1(%rdi), %ymm0, %ymm1 | ||||||
|  | @@ -207,34 +237,38 @@ L(cross_page_continue):
 | ||||||
|  |  	subq	%rcx, %rsi | ||||||
|  |  	jb	L(last_4x_vec_or_less) | ||||||
|  |  # endif | ||||||
|  | -	vpmovmskb	%ymm1, %eax
 | ||||||
|  | +	vpmovmskb %ymm1, %eax
 | ||||||
|  |  	testl	%eax, %eax | ||||||
|  |  	jnz	L(first_vec_x1) | ||||||
|  |   | ||||||
|  |  	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 | ||||||
|  | -	vpmovmskb	%ymm1, %eax
 | ||||||
|  | +	vpmovmskb %ymm1, %eax
 | ||||||
|  |  	testl	%eax, %eax | ||||||
|  |  	jnz	L(first_vec_x2) | ||||||
|  |   | ||||||
|  |  	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 | ||||||
|  | -	vpmovmskb	%ymm1, %eax
 | ||||||
|  | +	vpmovmskb %ymm1, %eax
 | ||||||
|  |  	testl	%eax, %eax | ||||||
|  |  	jnz	L(first_vec_x3) | ||||||
|  |   | ||||||
|  |  	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 | ||||||
|  | -	vpmovmskb	%ymm1, %eax
 | ||||||
|  | +	vpmovmskb %ymm1, %eax
 | ||||||
|  |  	testl	%eax, %eax | ||||||
|  |  	jnz	L(first_vec_x4) | ||||||
|  |   | ||||||
|  |  	/* Align data to VEC_SIZE * 4 - 1.  */ | ||||||
|  |  # ifdef USE_AS_STRNLEN | ||||||
|  |  	/* Before adjusting length check if at last VEC_SIZE * 4.  */ | ||||||
|  | -	cmpq	$(VEC_SIZE * 4 - 1), %rsi
 | ||||||
|  | +	cmpq	$(CHAR_PER_VEC * 4 - 1), %rsi
 | ||||||
|  |  	jbe	L(last_4x_vec_or_less_load) | ||||||
|  |  	incq	%rdi | ||||||
|  |  	movl	%edi, %ecx | ||||||
|  |  	orq	$(VEC_SIZE * 4 - 1), %rdi | ||||||
|  |  	andl	$(VEC_SIZE * 4 - 1), %ecx | ||||||
|  | +#  ifdef USE_AS_WCSLEN
 | ||||||
|  | +	/* NB: Divide bytes by 4 to get the wchar_t count.  */
 | ||||||
|  | +	sarl	$2, %ecx
 | ||||||
|  | +#  endif
 | ||||||
|  |  	/* Readjust length.  */ | ||||||
|  |  	addq	%rcx, %rsi | ||||||
|  |  # else | ||||||
|  | @@ -246,13 +280,13 @@ L(cross_page_continue):
 | ||||||
|  |  L(loop_4x_vec): | ||||||
|  |  # ifdef USE_AS_STRNLEN | ||||||
|  |  	/* Break if at end of length.  */ | ||||||
|  | -	subq	$(VEC_SIZE * 4), %rsi
 | ||||||
|  | +	subq	$(CHAR_PER_VEC * 4), %rsi
 | ||||||
|  |  	jb	L(last_4x_vec_or_less_cmpeq) | ||||||
|  |  # endif | ||||||
|  | -	/* Save some code size by microfusing VPMINU with the load. Since
 | ||||||
|  | -	   the matches in ymm2/ymm4 can only be returned if there where no
 | ||||||
|  | -	   matches in ymm1/ymm3 respectively there is no issue with overlap.
 | ||||||
|  | -	 */
 | ||||||
|  | +	/* Save some code size by microfusing VPMINU with the load.
 | ||||||
|  | +	   Since the matches in ymm2/ymm4 can only be returned if there
 | ||||||
|  | +	   where no matches in ymm1/ymm3 respectively there is no issue
 | ||||||
|  | +	   with overlap.  */
 | ||||||
|  |  	vmovdqa	1(%rdi), %ymm1 | ||||||
|  |  	VPMINU	(VEC_SIZE + 1)(%rdi), %ymm1, %ymm2 | ||||||
|  |  	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm3 | ||||||
|  | @@ -260,7 +294,7 @@ L(loop_4x_vec):
 | ||||||
|  |   | ||||||
|  |  	VPMINU	%ymm2, %ymm4, %ymm5 | ||||||
|  |  	VPCMPEQ	%ymm5, %ymm0, %ymm5 | ||||||
|  | -	vpmovmskb	%ymm5, %ecx
 | ||||||
|  | +	vpmovmskb %ymm5, %ecx
 | ||||||
|  |   | ||||||
|  |  	subq	$-(VEC_SIZE * 4), %rdi | ||||||
|  |  	testl	%ecx, %ecx | ||||||
|  | @@ -268,27 +302,28 @@ L(loop_4x_vec):
 | ||||||
|  |   | ||||||
|  |   | ||||||
|  |  	VPCMPEQ	%ymm1, %ymm0, %ymm1 | ||||||
|  | -	vpmovmskb	%ymm1, %eax
 | ||||||
|  | +	vpmovmskb %ymm1, %eax
 | ||||||
|  |  	subq	%rdx, %rdi | ||||||
|  |  	testl	%eax, %eax | ||||||
|  |  	jnz	L(last_vec_return_x0) | ||||||
|  |   | ||||||
|  |  	VPCMPEQ	%ymm2, %ymm0, %ymm2 | ||||||
|  | -	vpmovmskb	%ymm2, %eax
 | ||||||
|  | +	vpmovmskb %ymm2, %eax
 | ||||||
|  |  	testl	%eax, %eax | ||||||
|  |  	jnz	L(last_vec_return_x1) | ||||||
|  |   | ||||||
|  |  	/* Combine last 2 VEC.  */ | ||||||
|  |  	VPCMPEQ	%ymm3, %ymm0, %ymm3 | ||||||
|  | -	vpmovmskb	%ymm3, %eax
 | ||||||
|  | -	/* rcx has combined result from all 4 VEC. It will only be used if
 | ||||||
|  | -	   the first 3 other VEC all did not contain a match.  */
 | ||||||
|  | +	vpmovmskb %ymm3, %eax
 | ||||||
|  | +	/* rcx has combined result from all 4 VEC. It will only be used
 | ||||||
|  | +	   if the first 3 other VEC all did not contain a match.  */
 | ||||||
|  |  	salq	$32, %rcx | ||||||
|  |  	orq	%rcx, %rax | ||||||
|  |  	tzcntq	%rax, %rax | ||||||
|  |  	subq	$(VEC_SIZE * 2 - 1), %rdi | ||||||
|  |  	addq	%rdi, %rax | ||||||
|  |  # ifdef USE_AS_WCSLEN | ||||||
|  | +	/* NB: Divide bytes by 4 to get wchar_t count.  */
 | ||||||
|  |  	shrq	$2, %rax | ||||||
|  |  # endif | ||||||
|  |  	VZEROUPPER_RETURN | ||||||
|  | @@ -297,15 +332,19 @@ L(loop_4x_vec):
 | ||||||
|  |  # ifdef USE_AS_STRNLEN | ||||||
|  |  	.p2align 4 | ||||||
|  |  L(last_4x_vec_or_less_load): | ||||||
|  | -	/* Depending on entry adjust rdi / prepare first VEC in ymm1.  */
 | ||||||
|  | +	/* Depending on entry adjust rdi / prepare first VEC in ymm1.
 | ||||||
|  | +	 */
 | ||||||
|  |  	subq	$-(VEC_SIZE * 4), %rdi | ||||||
|  |  L(last_4x_vec_or_less_cmpeq): | ||||||
|  |  	VPCMPEQ	1(%rdi), %ymm0, %ymm1 | ||||||
|  |  L(last_4x_vec_or_less): | ||||||
|  | -
 | ||||||
|  | -	vpmovmskb	%ymm1, %eax
 | ||||||
|  | -	/* If remaining length > VEC_SIZE * 2. This works if esi is off by
 | ||||||
|  | -	   VEC_SIZE * 4.  */
 | ||||||
|  | +#  ifdef USE_AS_WCSLEN
 | ||||||
|  | +	/* NB: Multiply length by 4 to get byte count.  */
 | ||||||
|  | +	sall	$2, %esi
 | ||||||
|  | +#  endif
 | ||||||
|  | +	vpmovmskb %ymm1, %eax
 | ||||||
|  | +	/* If remaining length > VEC_SIZE * 2. This works if esi is off
 | ||||||
|  | +	   by VEC_SIZE * 4.  */
 | ||||||
|  |  	testl	$(VEC_SIZE * 2), %esi | ||||||
|  |  	jnz	L(last_4x_vec) | ||||||
|  |   | ||||||
|  | @@ -320,7 +359,7 @@ L(last_4x_vec_or_less):
 | ||||||
|  |  	jb	L(max) | ||||||
|  |   | ||||||
|  |  	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 | ||||||
|  | -	vpmovmskb	%ymm1, %eax
 | ||||||
|  | +	vpmovmskb %ymm1, %eax
 | ||||||
|  |  	tzcntl	%eax, %eax | ||||||
|  |  	/* Check the end of data.  */ | ||||||
|  |  	cmpl	%eax, %esi | ||||||
|  | @@ -329,6 +368,7 @@ L(last_4x_vec_or_less):
 | ||||||
|  |  	addl	$(VEC_SIZE + 1), %eax | ||||||
|  |  	addq	%rdi, %rax | ||||||
|  |  #  ifdef USE_AS_WCSLEN | ||||||
|  | +	/* NB: Divide bytes by 4 to get wchar_t count.  */
 | ||||||
|  |  	shrq	$2, %rax | ||||||
|  |  #  endif | ||||||
|  |  	VZEROUPPER_RETURN | ||||||
|  | @@ -340,6 +380,7 @@ L(last_vec_return_x0):
 | ||||||
|  |  	subq	$(VEC_SIZE * 4 - 1), %rdi | ||||||
|  |  	addq	%rdi, %rax | ||||||
|  |  # ifdef USE_AS_WCSLEN | ||||||
|  | +	/* NB: Divide bytes by 4 to get wchar_t count.  */
 | ||||||
|  |  	shrq	$2, %rax | ||||||
|  |  # endif | ||||||
|  |  	VZEROUPPER_RETURN | ||||||
|  | @@ -350,6 +391,7 @@ L(last_vec_return_x1):
 | ||||||
|  |  	subq	$(VEC_SIZE * 3 - 1), %rdi | ||||||
|  |  	addq	%rdi, %rax | ||||||
|  |  # ifdef USE_AS_WCSLEN | ||||||
|  | +	/* NB: Divide bytes by 4 to get wchar_t count.  */
 | ||||||
|  |  	shrq	$2, %rax | ||||||
|  |  # endif | ||||||
|  |  	VZEROUPPER_RETURN | ||||||
|  | @@ -366,6 +408,7 @@ L(last_vec_x1_check):
 | ||||||
|  |  	incl	%eax | ||||||
|  |  	addq	%rdi, %rax | ||||||
|  |  #  ifdef USE_AS_WCSLEN | ||||||
|  | +	/* NB: Divide bytes by 4 to get wchar_t count.  */
 | ||||||
|  |  	shrq	$2, %rax | ||||||
|  |  #  endif | ||||||
|  |  	VZEROUPPER_RETURN | ||||||
|  | @@ -381,14 +424,14 @@ L(last_4x_vec):
 | ||||||
|  |  	jnz	L(last_vec_x1) | ||||||
|  |   | ||||||
|  |  	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 | ||||||
|  | -	vpmovmskb	%ymm1, %eax
 | ||||||
|  | +	vpmovmskb %ymm1, %eax
 | ||||||
|  |  	testl	%eax, %eax | ||||||
|  |  	jnz	L(last_vec_x2) | ||||||
|  |   | ||||||
|  |  	/* Normalize length.  */ | ||||||
|  |  	andl	$(VEC_SIZE * 4 - 1), %esi | ||||||
|  |  	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 | ||||||
|  | -	vpmovmskb	%ymm1, %eax
 | ||||||
|  | +	vpmovmskb %ymm1, %eax
 | ||||||
|  |  	testl	%eax, %eax | ||||||
|  |  	jnz	L(last_vec_x3) | ||||||
|  |   | ||||||
|  | @@ -396,7 +439,7 @@ L(last_4x_vec):
 | ||||||
|  |  	jb	L(max) | ||||||
|  |   | ||||||
|  |  	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 | ||||||
|  | -	vpmovmskb	%ymm1, %eax
 | ||||||
|  | +	vpmovmskb %ymm1, %eax
 | ||||||
|  |  	tzcntl	%eax, %eax | ||||||
|  |  	/* Check the end of data.  */ | ||||||
|  |  	cmpl	%eax, %esi | ||||||
|  | @@ -405,6 +448,7 @@ L(last_4x_vec):
 | ||||||
|  |  	addl	$(VEC_SIZE * 3 + 1), %eax | ||||||
|  |  	addq	%rdi, %rax | ||||||
|  |  #  ifdef USE_AS_WCSLEN | ||||||
|  | +	/* NB: Divide bytes by 4 to get wchar_t count.  */
 | ||||||
|  |  	shrq	$2, %rax | ||||||
|  |  #  endif | ||||||
|  |  	VZEROUPPER_RETURN | ||||||
|  | @@ -419,6 +463,7 @@ L(last_vec_x1):
 | ||||||
|  |  	incl	%eax | ||||||
|  |  	addq	%rdi, %rax | ||||||
|  |  #  ifdef USE_AS_WCSLEN | ||||||
|  | +	/* NB: Divide bytes by 4 to get wchar_t count.  */
 | ||||||
|  |  	shrq	$2, %rax | ||||||
|  |  #  endif | ||||||
|  |  	VZEROUPPER_RETURN | ||||||
|  | @@ -432,6 +477,7 @@ L(last_vec_x2):
 | ||||||
|  |  	addl	$(VEC_SIZE + 1), %eax | ||||||
|  |  	addq	%rdi, %rax | ||||||
|  |  #  ifdef USE_AS_WCSLEN | ||||||
|  | +	/* NB: Divide bytes by 4 to get wchar_t count.  */
 | ||||||
|  |  	shrq	$2, %rax | ||||||
|  |  #  endif | ||||||
|  |  	VZEROUPPER_RETURN | ||||||
|  | @@ -447,6 +493,7 @@ L(last_vec_x3):
 | ||||||
|  |  	addl	$(VEC_SIZE * 2 + 1), %eax | ||||||
|  |  	addq	%rdi, %rax | ||||||
|  |  #  ifdef USE_AS_WCSLEN | ||||||
|  | +	/* NB: Divide bytes by 4 to get wchar_t count.  */
 | ||||||
|  |  	shrq	$2, %rax | ||||||
|  |  #  endif | ||||||
|  |  	VZEROUPPER_RETURN | ||||||
|  | @@ -455,13 +502,13 @@ L(max_end):
 | ||||||
|  |  	VZEROUPPER_RETURN | ||||||
|  |  # endif | ||||||
|  |   | ||||||
|  | -	/* Cold case for crossing page with first load.	 */
 | ||||||
|  | +	/* Cold case for crossing page with first load.  */
 | ||||||
|  |  	.p2align 4 | ||||||
|  |  L(cross_page_boundary): | ||||||
|  |  	/* Align data to VEC_SIZE - 1.  */ | ||||||
|  |  	orq	$(VEC_SIZE - 1), %rdi | ||||||
|  |  	VPCMPEQ	-(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1 | ||||||
|  | -	vpmovmskb	%ymm1, %eax
 | ||||||
|  | +	vpmovmskb %ymm1, %eax
 | ||||||
|  |  	/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT | ||||||
|  |  	   so no need to manually mod rdx.  */ | ||||||
|  |  	sarxl	%edx, %eax, %eax | ||||||
|  | @@ -470,6 +517,10 @@ L(cross_page_boundary):
 | ||||||
|  |  	jnz	L(cross_page_less_vec) | ||||||
|  |  	leaq	1(%rdi), %rcx | ||||||
|  |  	subq	%rdx, %rcx | ||||||
|  | +#  ifdef USE_AS_WCSLEN
 | ||||||
|  | +	/* NB: Divide bytes by 4 to get wchar_t count.  */
 | ||||||
|  | +	shrl	$2, %ecx
 | ||||||
|  | +#  endif
 | ||||||
|  |  	/* Check length.  */ | ||||||
|  |  	cmpq	%rsi, %rcx | ||||||
|  |  	jb	L(cross_page_continue) | ||||||
|  | @@ -479,6 +530,7 @@ L(cross_page_boundary):
 | ||||||
|  |  	jz	L(cross_page_continue) | ||||||
|  |  	tzcntl	%eax, %eax | ||||||
|  |  #  ifdef USE_AS_WCSLEN | ||||||
|  | +	/* NB: Divide length by 4 to get wchar_t count.  */
 | ||||||
|  |  	shrl	$2, %eax | ||||||
|  |  #  endif | ||||||
|  |  # endif | ||||||
|  | @@ -489,6 +541,10 @@ L(return_vzeroupper):
 | ||||||
|  |  	.p2align 4 | ||||||
|  |  L(cross_page_less_vec): | ||||||
|  |  	tzcntl	%eax, %eax | ||||||
|  | +#  ifdef USE_AS_WCSLEN
 | ||||||
|  | +	/* NB: Multiply length by 4 to get byte count.  */
 | ||||||
|  | +	sall	$2, %esi
 | ||||||
|  | +#  endif
 | ||||||
|  |  	cmpq	%rax, %rsi | ||||||
|  |  	cmovb	%esi, %eax | ||||||
|  |  #  ifdef USE_AS_WCSLEN | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
 | ||||||
|  | index 8f660bb9..439e486a 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/strlen-vec.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/strlen-vec.S
 | ||||||
|  | @@ -65,12 +65,25 @@ ENTRY(strlen)
 | ||||||
|  |  	ret | ||||||
|  |  L(n_nonzero): | ||||||
|  |  # ifdef AS_WCSLEN | ||||||
|  | -	shl	$2, %RSI_LP
 | ||||||
|  | +/* Check for overflow from maxlen * sizeof(wchar_t). If it would
 | ||||||
|  | +   overflow the only way this program doesn't have undefined behavior 
 | ||||||
|  | +   is if there is a null terminator in valid memory so wcslen will 
 | ||||||
|  | +   suffice.  */
 | ||||||
|  | +	mov	%RSI_LP, %R10_LP
 | ||||||
|  | +	sar	$62, %R10_LP
 | ||||||
|  | +	test	%R10_LP, %R10_LP
 | ||||||
|  | +	jnz	__wcslen_sse4_1
 | ||||||
|  | +	sal	$2, %RSI_LP
 | ||||||
|  |  # endif | ||||||
|  |   | ||||||
|  | +
 | ||||||
|  |  /* Initialize long lived registers.  */ | ||||||
|  |   | ||||||
|  |  	add	%RDI_LP, %RSI_LP | ||||||
|  | +# ifdef AS_WCSLEN
 | ||||||
|  | +/* Check for overflow again from s + maxlen * sizeof(wchar_t).  */
 | ||||||
|  | +	jbe	__wcslen_sse4_1
 | ||||||
|  | +# endif
 | ||||||
|  |  	mov	%RSI_LP, %R10_LP | ||||||
|  |  	and	$-64, %R10_LP | ||||||
|  |  	mov	%RSI_LP, %R11_LP | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
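A side note on the unit bookkeeping behind the many "NB: Divide bytes by 4" comments above (a hedged sketch, not glibc code): the vector loops track positions in bytes, while wcslen's contract is in wchar_t units, so lengths are scaled at the entry and exit points.

/* sizeof(wchar_t) is 4 on this target, so a 32-byte vector covers
   32 characters for strlen but only 8 (CHAR_PER_VEC) for wcslen;
   byte offsets become character counts with a right shift by 2 and
   character counts become byte offsets with a left shift by 2.  */
#include <stddef.h>
#include <wchar.h>

static size_t
wchar_count_from_bytes (size_t byte_offset)
{
  return byte_offset / sizeof (wchar_t);	/* shrq $2, %rax */
}

static size_t
bytes_from_wchar_count (size_t nchars)
{
  return nchars * sizeof (wchar_t);	/* sall $2, %esi */
}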
							
								
								
									
745  SOURCES/glibc-RHEL-15696-31.patch  Normal file
							| @ -0,0 +1,745 @@ | |||||||
|  | From 4ba65586847751372520a36757c17f114588794e Mon Sep 17 00:00:00 2001 | ||||||
|  | From: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  | Date: Mon, 19 Apr 2021 19:36:06 -0400 | ||||||
|  | Subject: [PATCH] x86: Optimize strlen-evex.S | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | No bug. This commit optimizes strlen-evex.S. The | ||||||
|  | optimizations are mostly small things but they add up to roughly | ||||||
|  | 10-30% performance improvement for strlen. The results for strnlen are a | ||||||
|  | bit more ambiguous. test-strlen, test-strnlen, test-wcslen, and | ||||||
|  | test-wcsnlen are all passing. | ||||||
|  | 
 | ||||||
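One recurring trick in the rewritten code deserves a gloss; the C rendering below is a hedged illustration (not the patch's code) of the `btsq %rsi, %rax` / tzcnt sequence, which folds the strnlen bound into the match scan so no separate length compare-and-branch is needed.

/* Assumes maxlen < 64.  Setting a sentinel bit at position maxlen in
   the match mask makes one trailing-zero count return
   min(first-match position, maxlen): a real match below the sentinel
   is found first, otherwise the count stops at the sentinel.  */
#include <stdint.h>

static unsigned
len_or_first_match (uint64_t match_mask, unsigned maxlen)
{
  match_mask |= 1ULL << maxlen;	/* btsq %rsi, %rax */
  return (unsigned) __builtin_ctzll (match_mask);	/* tzcnt */
}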
|  | Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  | ---
 | ||||||
|  |  sysdeps/x86_64/multiarch/strlen-evex.S | 581 ++++++++++++++----------- | ||||||
|  |  1 file changed, 317 insertions(+), 264 deletions(-) | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S
 | ||||||
|  | index 05838190..4bf6874b 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/strlen-evex.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/strlen-evex.S
 | ||||||
|  | @@ -29,11 +29,13 @@
 | ||||||
|  |  # ifdef USE_AS_WCSLEN | ||||||
|  |  #  define VPCMP		vpcmpd | ||||||
|  |  #  define VPMINU	vpminud | ||||||
|  | -#  define SHIFT_REG	r9d
 | ||||||
|  | +#  define SHIFT_REG ecx
 | ||||||
|  | +#  define CHAR_SIZE	4
 | ||||||
|  |  # else | ||||||
|  |  #  define VPCMP		vpcmpb | ||||||
|  |  #  define VPMINU	vpminub | ||||||
|  | -#  define SHIFT_REG	ecx
 | ||||||
|  | +#  define SHIFT_REG edx
 | ||||||
|  | +#  define CHAR_SIZE	1
 | ||||||
|  |  # endif | ||||||
|  |   | ||||||
|  |  # define XMMZERO	xmm16 | ||||||
|  | @@ -46,132 +48,165 @@
 | ||||||
|  |  # define YMM6		ymm22 | ||||||
|  |   | ||||||
|  |  # define VEC_SIZE 32 | ||||||
|  | +# define PAGE_SIZE 4096
 | ||||||
|  | +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
 | ||||||
|  |   | ||||||
|  |  	.section .text.evex,"ax",@progbits | ||||||
|  |  ENTRY (STRLEN) | ||||||
|  |  # ifdef USE_AS_STRNLEN | ||||||
|  | -	/* Check for zero length.  */
 | ||||||
|  | +	/* Check zero length.  */
 | ||||||
|  |  	test	%RSI_LP, %RSI_LP | ||||||
|  |  	jz	L(zero) | ||||||
|  | -#  ifdef USE_AS_WCSLEN
 | ||||||
|  | -	shl	$2, %RSI_LP
 | ||||||
|  | -#  elif defined __ILP32__
 | ||||||
|  | +#  ifdef __ILP32__
 | ||||||
|  |  	/* Clear the upper 32 bits.  */ | ||||||
|  |  	movl	%esi, %esi | ||||||
|  |  #  endif | ||||||
|  |  	mov	%RSI_LP, %R8_LP | ||||||
|  |  # endif | ||||||
|  | -	movl	%edi, %ecx
 | ||||||
|  | -	movq	%rdi, %rdx
 | ||||||
|  | +	movl	%edi, %eax
 | ||||||
|  |  	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO | ||||||
|  | -
 | ||||||
|  | +	/* Clear high bits from edi. Only keeping bits relevant to page
 | ||||||
|  | +	   cross check.  */
 | ||||||
|  | +	andl	$(PAGE_SIZE - 1), %eax
 | ||||||
|  |  	/* Check if we may cross page boundary with one vector load.  */ | ||||||
|  | -	andl	$(2 * VEC_SIZE - 1), %ecx
 | ||||||
|  | -	cmpl	$VEC_SIZE, %ecx
 | ||||||
|  | -	ja	L(cros_page_boundary)
 | ||||||
|  | +	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 | ||||||
|  | +	ja	L(cross_page_boundary)
 | ||||||
|  |   | ||||||
|  |  	/* Check the first VEC_SIZE bytes.  Each bit in K0 represents a | ||||||
|  |  	   null byte.  */ | ||||||
|  |  	VPCMP	$0, (%rdi), %YMMZERO, %k0 | ||||||
|  |  	kmovd	%k0, %eax | ||||||
|  | -	testl	%eax, %eax
 | ||||||
|  | -
 | ||||||
|  |  # ifdef USE_AS_STRNLEN | ||||||
|  | -	jnz	L(first_vec_x0_check)
 | ||||||
|  | -	/* Adjust length and check the end of data.  */
 | ||||||
|  | -	subq	$VEC_SIZE, %rsi
 | ||||||
|  | -	jbe	L(max)
 | ||||||
|  | -# else
 | ||||||
|  | -	jnz	L(first_vec_x0)
 | ||||||
|  | +	/* If length < CHAR_PER_VEC handle special.  */
 | ||||||
|  | +	cmpq	$CHAR_PER_VEC, %rsi
 | ||||||
|  | +	jbe	L(first_vec_x0)
 | ||||||
|  |  # endif | ||||||
|  | -
 | ||||||
|  | -	/* Align data for aligned loads in the loop.  */
 | ||||||
|  | -	addq	$VEC_SIZE, %rdi
 | ||||||
|  | -	andl	$(VEC_SIZE - 1), %ecx
 | ||||||
|  | -	andq	$-VEC_SIZE, %rdi
 | ||||||
|  | -
 | ||||||
|  | +	testl	%eax, %eax
 | ||||||
|  | +	jz	L(aligned_more)
 | ||||||
|  | +	tzcntl	%eax, %eax
 | ||||||
|  | +	ret
 | ||||||
|  |  # ifdef USE_AS_STRNLEN | ||||||
|  | -	/* Adjust length.  */
 | ||||||
|  | -	addq	%rcx, %rsi
 | ||||||
|  | +L(zero):
 | ||||||
|  | +	xorl	%eax, %eax
 | ||||||
|  | +	ret
 | ||||||
|  |   | ||||||
|  | -	subq	$(VEC_SIZE * 4), %rsi
 | ||||||
|  | -	jbe	L(last_4x_vec_or_less)
 | ||||||
|  | +	.p2align 4
 | ||||||
|  | +L(first_vec_x0):
 | ||||||
|  | +	/* Set bit for max len so that tzcnt will return min of max len
 | ||||||
|  | +	   and position of first match.  */
 | ||||||
|  | +	btsq	%rsi, %rax
 | ||||||
|  | +	tzcntl	%eax, %eax
 | ||||||
|  | +	ret
 | ||||||
|  |  # endif | ||||||
|  | -	jmp	L(more_4x_vec)
 | ||||||
|  |   | ||||||
|  |  	.p2align 4 | ||||||
|  | -L(cros_page_boundary):
 | ||||||
|  | -	andl	$(VEC_SIZE - 1), %ecx
 | ||||||
|  | -	andq	$-VEC_SIZE, %rdi
 | ||||||
|  | -
 | ||||||
|  | -# ifdef USE_AS_WCSLEN
 | ||||||
|  | -	/* NB: Divide shift count by 4 since each bit in K0 represent 4
 | ||||||
|  | -	   bytes.  */
 | ||||||
|  | -	movl	%ecx, %SHIFT_REG
 | ||||||
|  | -	sarl	$2, %SHIFT_REG
 | ||||||
|  | +L(first_vec_x1):
 | ||||||
|  | +	tzcntl	%eax, %eax
 | ||||||
|  | +	/* Safe to use 32 bit instructions as these are only called for
 | ||||||
|  | +	   size = [1, 159].  */
 | ||||||
|  | +# ifdef USE_AS_STRNLEN
 | ||||||
|  | +	/* Use ecx which was computed earlier to compute correct value.
 | ||||||
|  | +	 */
 | ||||||
|  | +	leal	-(CHAR_PER_VEC * 4 + 1)(%rcx, %rax), %eax
 | ||||||
|  | +# else
 | ||||||
|  | +	subl	%edx, %edi
 | ||||||
|  | +#  ifdef USE_AS_WCSLEN
 | ||||||
|  | +	/* NB: Divide bytes by 4 to get the wchar_t count.  */
 | ||||||
|  | +	sarl	$2, %edi
 | ||||||
|  | +#  endif
 | ||||||
|  | +	leal	CHAR_PER_VEC(%rdi, %rax), %eax
 | ||||||
|  |  # endif | ||||||
|  | -	VPCMP	$0, (%rdi), %YMMZERO, %k0
 | ||||||
|  | -	kmovd	%k0, %eax
 | ||||||
|  | +	ret
 | ||||||
|  |   | ||||||
|  | -	/* Remove the leading bytes.  */
 | ||||||
|  | -	sarxl	%SHIFT_REG, %eax, %eax
 | ||||||
|  | -	testl	%eax, %eax
 | ||||||
|  | -	jz	L(aligned_more)
 | ||||||
|  | +	.p2align 4
 | ||||||
|  | +L(first_vec_x2):
 | ||||||
|  |  	tzcntl	%eax, %eax | ||||||
|  | -# ifdef USE_AS_WCSLEN
 | ||||||
|  | -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
 | ||||||
|  | -	sall	$2, %eax
 | ||||||
|  | -# endif
 | ||||||
|  | +	/* Safe to use 32 bit instructions as these are only called for
 | ||||||
|  | +	   size = [1, 159].  */
 | ||||||
|  |  # ifdef USE_AS_STRNLEN | ||||||
|  | -	/* Check the end of data.  */
 | ||||||
|  | -	cmpq	%rax, %rsi
 | ||||||
|  | -	jbe	L(max)
 | ||||||
|  | -# endif
 | ||||||
|  | -	addq	%rdi, %rax
 | ||||||
|  | -	addq	%rcx, %rax
 | ||||||
|  | -	subq	%rdx, %rax
 | ||||||
|  | -# ifdef USE_AS_WCSLEN
 | ||||||
|  | -	shrq	$2, %rax
 | ||||||
|  | +	/* Use ecx which was computed earlier to compute correct value.
 | ||||||
|  | +	 */
 | ||||||
|  | +	leal	-(CHAR_PER_VEC * 3 + 1)(%rcx, %rax), %eax
 | ||||||
|  | +# else
 | ||||||
|  | +	subl	%edx, %edi
 | ||||||
|  | +#  ifdef USE_AS_WCSLEN
 | ||||||
|  | +	/* NB: Divide bytes by 4 to get the wchar_t count.  */
 | ||||||
|  | +	sarl	$2, %edi
 | ||||||
|  | +#  endif
 | ||||||
|  | +	leal	(CHAR_PER_VEC * 2)(%rdi, %rax), %eax
 | ||||||
|  |  # endif | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  |  	.p2align 4 | ||||||
|  | -L(aligned_more):
 | ||||||
|  | +L(first_vec_x3):
 | ||||||
|  | +	tzcntl	%eax, %eax
 | ||||||
|  | +	/* Safe to use 32 bit instructions as these are only called for
 | ||||||
|  | +	   size = [1, 159].  */
 | ||||||
|  |  # ifdef USE_AS_STRNLEN | ||||||
|  | -        /* "rcx" is less than VEC_SIZE.  Calculate "rdx + rcx - VEC_SIZE"
 | ||||||
|  | -	    with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE"
 | ||||||
|  | -	    to void possible addition overflow.  */
 | ||||||
|  | -	negq	%rcx
 | ||||||
|  | -	addq	$VEC_SIZE, %rcx
 | ||||||
|  | -
 | ||||||
|  | -	/* Check the end of data.  */
 | ||||||
|  | -	subq	%rcx, %rsi
 | ||||||
|  | -	jbe	L(max)
 | ||||||
|  | +	/* Use ecx which was computed earlier to compute correct value.
 | ||||||
|  | +	 */
 | ||||||
|  | +	leal	-(CHAR_PER_VEC * 2 + 1)(%rcx, %rax), %eax
 | ||||||
|  | +# else
 | ||||||
|  | +	subl	%edx, %edi
 | ||||||
|  | +#  ifdef USE_AS_WCSLEN
 | ||||||
|  | +	/* NB: Divide bytes by 4 to get the wchar_t count.  */
 | ||||||
|  | +	sarl	$2, %edi
 | ||||||
|  | +#  endif
 | ||||||
|  | +	leal	(CHAR_PER_VEC * 3)(%rdi, %rax), %eax
 | ||||||
|  |  # endif | ||||||
|  | +	ret
 | ||||||
|  |   | ||||||
|  | -	addq	$VEC_SIZE, %rdi
 | ||||||
|  | -
 | ||||||
|  | +	.p2align 4
 | ||||||
|  | +L(first_vec_x4):
 | ||||||
|  | +	tzcntl	%eax, %eax
 | ||||||
|  | +	/* Safe to use 32 bit instructions as these are only called for
 | ||||||
|  | +	   size = [1, 159].  */
 | ||||||
|  |  # ifdef USE_AS_STRNLEN | ||||||
|  | -	subq	$(VEC_SIZE * 4), %rsi
 | ||||||
|  | -	jbe	L(last_4x_vec_or_less)
 | ||||||
|  | +	/* Use ecx which was computed earlier to compute correct value.
 | ||||||
|  | +	 */
 | ||||||
|  | +	leal	-(CHAR_PER_VEC + 1)(%rcx, %rax), %eax
 | ||||||
|  | +# else
 | ||||||
|  | +	subl	%edx, %edi
 | ||||||
|  | +#  ifdef USE_AS_WCSLEN
 | ||||||
|  | +	/* NB: Divide bytes by 4 to get the wchar_t count.  */
 | ||||||
|  | +	sarl	$2, %edi
 | ||||||
|  | +#  endif
 | ||||||
|  | +	leal	(CHAR_PER_VEC * 4)(%rdi, %rax), %eax
 | ||||||
|  |  # endif | ||||||
|  | +	ret
 | ||||||
|  |   | ||||||
|  | -L(more_4x_vec):
 | ||||||
|  | +	.p2align 5
 | ||||||
|  | +L(aligned_more):
 | ||||||
|  | +	movq	%rdi, %rdx
 | ||||||
|  | +	/* Align data to VEC_SIZE.  */
 | ||||||
|  | +	andq	$-(VEC_SIZE), %rdi
 | ||||||
|  | +L(cross_page_continue):
 | ||||||
|  |  	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time | ||||||
|  |  	   since data is only aligned to VEC_SIZE.  */ | ||||||
|  | -	VPCMP	$0, (%rdi), %YMMZERO, %k0
 | ||||||
|  | -	kmovd	%k0, %eax
 | ||||||
|  | -	testl	%eax, %eax
 | ||||||
|  | -	jnz	L(first_vec_x0)
 | ||||||
|  | -
 | ||||||
|  | +# ifdef USE_AS_STRNLEN
 | ||||||
|  | +	/* + CHAR_SIZE because it simplies the logic in
 | ||||||
|  | +	   last_4x_vec_or_less.  */
 | ||||||
|  | +	leaq	(VEC_SIZE * 5 + CHAR_SIZE)(%rdi), %rcx
 | ||||||
|  | +	subq	%rdx, %rcx
 | ||||||
|  | +#  ifdef USE_AS_WCSLEN
 | ||||||
|  | +	/* NB: Divide bytes by 4 to get the wchar_t count.  */
 | ||||||
|  | +	sarl	$2, %ecx
 | ||||||
|  | +#  endif
 | ||||||
|  | +# endif
 | ||||||
|  | +	/* Load first VEC regardless.  */
 | ||||||
|  |  	VPCMP	$0, VEC_SIZE(%rdi), %YMMZERO, %k0 | ||||||
|  | +# ifdef USE_AS_STRNLEN
 | ||||||
|  | +	/* Adjust length. If near end handle specially.  */
 | ||||||
|  | +	subq	%rcx, %rsi
 | ||||||
|  | +	jb	L(last_4x_vec_or_less)
 | ||||||
|  | +# endif
 | ||||||
|  |  	kmovd	%k0, %eax | ||||||
|  |  	testl	%eax, %eax | ||||||
|  |  	jnz	L(first_vec_x1) | ||||||
|  |   | ||||||
|  |  	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0 | ||||||
|  |  	kmovd	%k0, %eax | ||||||
|  | -	testl	%eax, %eax
 | ||||||
|  | +	test	%eax, %eax
 | ||||||
|  |  	jnz	L(first_vec_x2) | ||||||
|  |   | ||||||
|  |  	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0 | ||||||
|  | @@ -179,258 +214,276 @@ L(more_4x_vec):
 | ||||||
|  |  	testl	%eax, %eax | ||||||
|  |  	jnz	L(first_vec_x3) | ||||||
|  |   | ||||||
|  | -	addq	$(VEC_SIZE * 4), %rdi
 | ||||||
|  | -
 | ||||||
|  | -# ifdef USE_AS_STRNLEN
 | ||||||
|  | -	subq	$(VEC_SIZE * 4), %rsi
 | ||||||
|  | -	jbe	L(last_4x_vec_or_less)
 | ||||||
|  | -# endif
 | ||||||
|  | -
 | ||||||
|  | -	/* Align data to 4 * VEC_SIZE.  */
 | ||||||
|  | -	movq	%rdi, %rcx
 | ||||||
|  | -	andl	$(4 * VEC_SIZE - 1), %ecx
 | ||||||
|  | -	andq	$-(4 * VEC_SIZE), %rdi
 | ||||||
|  | +	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
 | ||||||
|  | +	kmovd	%k0, %eax
 | ||||||
|  | +	testl	%eax, %eax
 | ||||||
|  | +	jnz	L(first_vec_x4)
 | ||||||
|  |   | ||||||
|  | +	addq	$VEC_SIZE, %rdi
 | ||||||
|  |  # ifdef USE_AS_STRNLEN | ||||||
|  | -	/* Adjust length.  */
 | ||||||
|  | +	/* Check if at last VEC_SIZE * 4 length.  */
 | ||||||
|  | +	cmpq	$(CHAR_PER_VEC * 4 - 1), %rsi
 | ||||||
|  | +	jbe	L(last_4x_vec_or_less_load)
 | ||||||
|  | +	movl	%edi, %ecx
 | ||||||
|  | +	andl	$(VEC_SIZE * 4 - 1), %ecx
 | ||||||
|  | +#  ifdef USE_AS_WCSLEN
 | ||||||
|  | +	/* NB: Divide bytes by 4 to get the wchar_t count.  */
 | ||||||
|  | +	sarl	$2, %ecx
 | ||||||
|  | +#  endif
 | ||||||
|  | +	/* Readjust length.  */
 | ||||||
|  |  	addq	%rcx, %rsi | ||||||
|  |  # endif | ||||||
|  | +	/* Align data to VEC_SIZE * 4.  */
 | ||||||
|  | +	andq	$-(VEC_SIZE * 4), %rdi
 | ||||||
|  |   | ||||||
|  | +	/* Compare 4 * VEC at a time forward.  */
 | ||||||
|  |  	.p2align 4 | ||||||
|  |  L(loop_4x_vec): | ||||||
|  | -	/* Compare 4 * VEC at a time forward.  */
 | ||||||
|  | -	VMOVA	(%rdi), %YMM1
 | ||||||
|  | -	VMOVA	VEC_SIZE(%rdi), %YMM2
 | ||||||
|  | -	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM3
 | ||||||
|  | -	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM4
 | ||||||
|  | -
 | ||||||
|  | -	VPMINU	%YMM1, %YMM2, %YMM5
 | ||||||
|  | -	VPMINU	%YMM3, %YMM4, %YMM6
 | ||||||
|  | +	/* Load first VEC regardless.  */
 | ||||||
|  | +	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
 | ||||||
|  | +# ifdef USE_AS_STRNLEN
 | ||||||
|  | +	/* Break if at end of length.  */
 | ||||||
|  | +	subq	$(CHAR_PER_VEC * 4), %rsi
 | ||||||
|  | +	jb	L(last_4x_vec_or_less_cmpeq)
 | ||||||
|  | +# endif
 | ||||||
|  | +	/* Save some code size by microfusing VPMINU with the load. Since
 | ||||||
|  | +	   the matches in ymm2/ymm4 can only be returned if there where no
 | ||||||
|  | +	   matches in ymm1/ymm3 respectively there is no issue with overlap.
 | ||||||
|  | +	 */
 | ||||||
|  | +	VPMINU	(VEC_SIZE * 5)(%rdi), %YMM1, %YMM2
 | ||||||
|  | +	VMOVA	(VEC_SIZE * 6)(%rdi), %YMM3
 | ||||||
|  | +	VPMINU	(VEC_SIZE * 7)(%rdi), %YMM3, %YMM4
 | ||||||
|  | +
 | ||||||
|  | +	VPCMP	$0, %YMM2, %YMMZERO, %k0
 | ||||||
|  | +	VPCMP	$0, %YMM4, %YMMZERO, %k1
 | ||||||
|  | +	subq	$-(VEC_SIZE * 4), %rdi
 | ||||||
|  | +	kortestd	%k0, %k1
 | ||||||
|  | +	jz	L(loop_4x_vec)
 | ||||||
|  | +
 | ||||||
|  | +	/* Check if end was in first half.  */
 | ||||||
|  | +	kmovd	%k0, %eax
 | ||||||
|  | +	subq	%rdx, %rdi
 | ||||||
|  | +# ifdef USE_AS_WCSLEN
 | ||||||
|  | +	shrq	$2, %rdi
 | ||||||
|  | +# endif
 | ||||||
|  | +	testl	%eax, %eax
 | ||||||
|  | +	jz	L(second_vec_return)
 | ||||||
|  |   | ||||||
|  | -	VPMINU	%YMM5, %YMM6, %YMM5
 | ||||||
|  | -	VPCMP	$0, %YMM5, %YMMZERO, %k0
 | ||||||
|  | -	ktestd	%k0, %k0
 | ||||||
|  | -	jnz	L(4x_vec_end)
 | ||||||
|  | +	VPCMP	$0, %YMM1, %YMMZERO, %k2
 | ||||||
|  | +	kmovd	%k2, %edx
 | ||||||
|  | +	/* Combine VEC1 matches (edx) with VEC2 matches (eax).  */
 | ||||||
|  | +# ifdef USE_AS_WCSLEN
 | ||||||
|  | +	sall	$CHAR_PER_VEC, %eax
 | ||||||
|  | +	orl	%edx, %eax
 | ||||||
|  | +	tzcntl	%eax, %eax
 | ||||||
|  | +# else
 | ||||||
|  | +	salq	$CHAR_PER_VEC, %rax
 | ||||||
|  | +	orq	%rdx, %rax
 | ||||||
|  | +	tzcntq	%rax, %rax
 | ||||||
|  | +# endif
 | ||||||
|  | +	addq	%rdi, %rax
 | ||||||
|  | +	ret
 | ||||||
|  |   | ||||||
|  | -	addq	$(VEC_SIZE * 4), %rdi
 | ||||||
|  |   | ||||||
|  | -# ifndef USE_AS_STRNLEN
 | ||||||
|  | -	jmp	L(loop_4x_vec)
 | ||||||
|  | -# else
 | ||||||
|  | -	subq	$(VEC_SIZE * 4), %rsi
 | ||||||
|  | -	ja	L(loop_4x_vec)
 | ||||||
|  | +# ifdef USE_AS_STRNLEN
 | ||||||
|  |   | ||||||
|  | +L(last_4x_vec_or_less_load):
 | ||||||
|  | +	/* Depending on entry adjust rdi / prepare first VEC in YMM1.  */
 | ||||||
|  | +	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
 | ||||||
|  | +L(last_4x_vec_or_less_cmpeq):
 | ||||||
|  | +	VPCMP	$0, %YMM1, %YMMZERO, %k0
 | ||||||
|  | +	addq	$(VEC_SIZE * 3), %rdi
 | ||||||
|  |  L(last_4x_vec_or_less): | ||||||
|  | -	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
 | ||||||
|  | -	addl	$(VEC_SIZE * 2), %esi
 | ||||||
|  | -	jle	L(last_2x_vec)
 | ||||||
|  | -
 | ||||||
|  | -	VPCMP	$0, (%rdi), %YMMZERO, %k0
 | ||||||
|  |  	kmovd	%k0, %eax | ||||||
|  | +	/* If remaining length > VEC_SIZE * 2. This works if esi is off by
 | ||||||
|  | +	   VEC_SIZE * 4.  */
 | ||||||
|  | +	testl	$(CHAR_PER_VEC * 2), %esi
 | ||||||
|  | +	jnz	L(last_4x_vec)
 | ||||||
|  | +
 | ||||||
|  | +	/* length may have been negative or positive by an offset of
 | ||||||
|  | +	   CHAR_PER_VEC * 4 depending on where this was called from. This
 | ||||||
|  | +	   fixes that.  */
 | ||||||
|  | +	andl	$(CHAR_PER_VEC * 4 - 1), %esi
 | ||||||
|  |  	testl	%eax, %eax | ||||||
|  | -	jnz	L(first_vec_x0)
 | ||||||
|  | +	jnz	L(last_vec_x1_check)
 | ||||||
|  |   | ||||||
|  | -	VPCMP	$0, VEC_SIZE(%rdi), %YMMZERO, %k0
 | ||||||
|  | -	kmovd	%k0, %eax
 | ||||||
|  | -	testl	%eax, %eax
 | ||||||
|  | -	jnz	L(first_vec_x1)
 | ||||||
|  | +	/* Check the end of data.  */
 | ||||||
|  | +	subl	$CHAR_PER_VEC, %esi
 | ||||||
|  | +	jb	L(max)
 | ||||||
|  |   | ||||||
|  |  	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0 | ||||||
|  |  	kmovd	%k0, %eax | ||||||
|  | -	testl	%eax, %eax
 | ||||||
|  | -	jnz	L(first_vec_x2_check)
 | ||||||
|  | -	subl	$VEC_SIZE, %esi
 | ||||||
|  | -	jle	L(max)
 | ||||||
|  | +	tzcntl	%eax, %eax
 | ||||||
|  | +	/* Check the end of data.  */
 | ||||||
|  | +	cmpl	%eax, %esi
 | ||||||
|  | +	jb	L(max)
 | ||||||
|  |   | ||||||
|  | -	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
 | ||||||
|  | -	kmovd	%k0, %eax
 | ||||||
|  | -	testl	%eax, %eax
 | ||||||
|  | -	jnz	L(first_vec_x3_check)
 | ||||||
|  | +	subq	%rdx, %rdi
 | ||||||
|  | +#  ifdef USE_AS_WCSLEN
 | ||||||
|  | +	/* NB: Divide bytes by 4 to get the wchar_t count.  */
 | ||||||
|  | +	sarq	$2, %rdi
 | ||||||
|  | +#  endif
 | ||||||
|  | +	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
 | ||||||
|  | +	ret
 | ||||||
|  | +L(max):
 | ||||||
|  |  	movq	%r8, %rax | ||||||
|  | +	ret
 | ||||||
|  | +# endif
 | ||||||
|  | +
 | ||||||
|  | +	/* Placed here in strnlen so that the jcc L(last_4x_vec_or_less)
 | ||||||
|  | +	   in the 4x VEC loop can use 2 byte encoding.  */
 | ||||||
|  | +	.p2align 4
 | ||||||
|  | +L(second_vec_return):
 | ||||||
|  | +	VPCMP	$0, %YMM3, %YMMZERO, %k0
 | ||||||
|  | +	/* Combine YMM3 matches (k0) with YMM4 matches (k1).  */
 | ||||||
|  | +# ifdef USE_AS_WCSLEN
 | ||||||
|  | +	kunpckbw	%k0, %k1, %k0
 | ||||||
|  | +	kmovd	%k0, %eax
 | ||||||
|  | +	tzcntl	%eax, %eax
 | ||||||
|  | +# else
 | ||||||
|  | +	kunpckdq	%k0, %k1, %k0
 | ||||||
|  | +	kmovq	%k0, %rax
 | ||||||
|  | +	tzcntq	%rax, %rax
 | ||||||
|  | +# endif
 | ||||||
|  | +	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
 | ||||||
|  | +	ret
 | ||||||
|  | +
 | ||||||
|  | +
 | ||||||
|  | +# ifdef USE_AS_STRNLEN
 | ||||||
|  | +L(last_vec_x1_check):
 | ||||||
|  | +	tzcntl	%eax, %eax
 | ||||||
|  | +	/* Check the end of data.  */
 | ||||||
|  | +	cmpl	%eax, %esi
 | ||||||
|  | +	jb	L(max)
 | ||||||
|  | +	subq	%rdx, %rdi
 | ||||||
|  |  #  ifdef USE_AS_WCSLEN | ||||||
|  | -	shrq	$2, %rax
 | ||||||
|  | +	/* NB: Divide bytes by 4 to get the wchar_t count.  */
 | ||||||
|  | +	sarq	$2, %rdi
 | ||||||
|  |  #  endif | ||||||
|  | +	leaq	(CHAR_PER_VEC)(%rdi, %rax), %rax
 | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  |  	.p2align 4 | ||||||
|  | -L(last_2x_vec):
 | ||||||
|  | -	addl	$(VEC_SIZE * 2), %esi
 | ||||||
|  | +L(last_4x_vec):
 | ||||||
|  | +	/* Test first 2x VEC normally.  */
 | ||||||
|  | +	testl	%eax, %eax
 | ||||||
|  | +	jnz	L(last_vec_x1)
 | ||||||
|  |   | ||||||
|  | -	VPCMP	$0, (%rdi), %YMMZERO, %k0
 | ||||||
|  | +	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
 | ||||||
|  |  	kmovd	%k0, %eax | ||||||
|  |  	testl	%eax, %eax | ||||||
|  | -	jnz	L(first_vec_x0_check)
 | ||||||
|  | -	subl	$VEC_SIZE, %esi
 | ||||||
|  | -	jle	L(max)
 | ||||||
|  | +	jnz	L(last_vec_x2)
 | ||||||
|  |   | ||||||
|  | -	VPCMP	$0, VEC_SIZE(%rdi), %YMMZERO, %k0
 | ||||||
|  | +	/* Normalize length.  */
 | ||||||
|  | +	andl	$(CHAR_PER_VEC * 4 - 1), %esi
 | ||||||
|  | +	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
 | ||||||
|  |  	kmovd	%k0, %eax | ||||||
|  |  	testl	%eax, %eax | ||||||
|  | -	jnz	L(first_vec_x1_check)
 | ||||||
|  | -	movq	%r8, %rax
 | ||||||
|  | -#  ifdef USE_AS_WCSLEN
 | ||||||
|  | -	shrq	$2, %rax
 | ||||||
|  | -#  endif
 | ||||||
|  | -	ret
 | ||||||
|  | +	jnz	L(last_vec_x3)
 | ||||||
|  |   | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(first_vec_x0_check):
 | ||||||
|  | +	/* Check the end of data.  */
 | ||||||
|  | +	subl	$(CHAR_PER_VEC * 3), %esi
 | ||||||
|  | +	jb	L(max)
 | ||||||
|  | +
 | ||||||
|  | +	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
 | ||||||
|  | +	kmovd	%k0, %eax
 | ||||||
|  |  	tzcntl	%eax, %eax | ||||||
|  | -#  ifdef USE_AS_WCSLEN
 | ||||||
|  | -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
 | ||||||
|  | -	sall	$2, %eax
 | ||||||
|  | -#  endif
 | ||||||
|  |  	/* Check the end of data.  */ | ||||||
|  | -	cmpq	%rax, %rsi
 | ||||||
|  | -	jbe	L(max)
 | ||||||
|  | -	addq	%rdi, %rax
 | ||||||
|  | -	subq	%rdx, %rax
 | ||||||
|  | +	cmpl	%eax, %esi
 | ||||||
|  | +	jb	L(max_end)
 | ||||||
|  | +
 | ||||||
|  | +	subq	%rdx, %rdi
 | ||||||
|  |  #  ifdef USE_AS_WCSLEN | ||||||
|  | -	shrq	$2, %rax
 | ||||||
|  | +	/* NB: Divide bytes by 4 to get the wchar_t count.  */
 | ||||||
|  | +	sarq	$2, %rdi
 | ||||||
|  |  #  endif | ||||||
|  | +	leaq	(CHAR_PER_VEC * 4)(%rdi, %rax), %rax
 | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  |  	.p2align 4 | ||||||
|  | -L(first_vec_x1_check):
 | ||||||
|  | +L(last_vec_x1):
 | ||||||
|  |  	tzcntl	%eax, %eax | ||||||
|  | +	subq	%rdx, %rdi
 | ||||||
|  |  #  ifdef USE_AS_WCSLEN | ||||||
|  | -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
 | ||||||
|  | -	sall	$2, %eax
 | ||||||
|  | -#  endif
 | ||||||
|  | -	/* Check the end of data.  */
 | ||||||
|  | -	cmpq	%rax, %rsi
 | ||||||
|  | -	jbe	L(max)
 | ||||||
|  | -	addq	$VEC_SIZE, %rax
 | ||||||
|  | -	addq	%rdi, %rax
 | ||||||
|  | -	subq	%rdx, %rax
 | ||||||
|  | -#  ifdef USE_AS_WCSLEN
 | ||||||
|  | -	shrq	$2, %rax
 | ||||||
|  | +	/* NB: Divide bytes by 4 to get the wchar_t count.  */
 | ||||||
|  | +	sarq	$2, %rdi
 | ||||||
|  |  #  endif | ||||||
|  | +	leaq	(CHAR_PER_VEC)(%rdi, %rax), %rax
 | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  |  	.p2align 4 | ||||||
|  | -L(first_vec_x2_check):
 | ||||||
|  | +L(last_vec_x2):
 | ||||||
|  |  	tzcntl	%eax, %eax | ||||||
|  | +	subq	%rdx, %rdi
 | ||||||
|  |  #  ifdef USE_AS_WCSLEN | ||||||
|  | -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
 | ||||||
|  | -	sall	$2, %eax
 | ||||||
|  | -#  endif
 | ||||||
|  | -	/* Check the end of data.  */
 | ||||||
|  | -	cmpq	%rax, %rsi
 | ||||||
|  | -	jbe	L(max)
 | ||||||
|  | -	addq	$(VEC_SIZE * 2), %rax
 | ||||||
|  | -	addq	%rdi, %rax
 | ||||||
|  | -	subq	%rdx, %rax
 | ||||||
|  | -#  ifdef USE_AS_WCSLEN
 | ||||||
|  | -	shrq	$2, %rax
 | ||||||
|  | +	/* NB: Divide bytes by 4 to get the wchar_t count.  */
 | ||||||
|  | +	sarq	$2, %rdi
 | ||||||
|  |  #  endif | ||||||
|  | +	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
 | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  |  	.p2align 4 | ||||||
|  | -L(first_vec_x3_check):
 | ||||||
|  | +L(last_vec_x3):
 | ||||||
|  |  	tzcntl	%eax, %eax | ||||||
|  | -#  ifdef USE_AS_WCSLEN
 | ||||||
|  | -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
 | ||||||
|  | -	sall	$2, %eax
 | ||||||
|  | -#  endif
 | ||||||
|  | +	subl	$(CHAR_PER_VEC * 2), %esi
 | ||||||
|  |  	/* Check the end of data.  */ | ||||||
|  | -	cmpq	%rax, %rsi
 | ||||||
|  | -	jbe	L(max)
 | ||||||
|  | -	addq	$(VEC_SIZE * 3), %rax
 | ||||||
|  | -	addq	%rdi, %rax
 | ||||||
|  | -	subq	%rdx, %rax
 | ||||||
|  | +	cmpl	%eax, %esi
 | ||||||
|  | +	jb	L(max_end)
 | ||||||
|  | +	subq	%rdx, %rdi
 | ||||||
|  |  #  ifdef USE_AS_WCSLEN | ||||||
|  | -	shrq	$2, %rax
 | ||||||
|  | +	/* NB: Divide bytes by 4 to get the wchar_t count.  */
 | ||||||
|  | +	sarq	$2, %rdi
 | ||||||
|  |  #  endif | ||||||
|  | +	leaq	(CHAR_PER_VEC * 3)(%rdi, %rax), %rax
 | ||||||
|  |  	ret | ||||||
|  | -
 | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(max):
 | ||||||
|  | +L(max_end):
 | ||||||
|  |  	movq	%r8, %rax | ||||||
|  | -#  ifdef USE_AS_WCSLEN
 | ||||||
|  | -	shrq	$2, %rax
 | ||||||
|  | -#  endif
 | ||||||
|  | -	ret
 | ||||||
|  | -
 | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(zero):
 | ||||||
|  | -	xorl	%eax, %eax
 | ||||||
|  |  	ret | ||||||
|  |  # endif | ||||||
|  |   | ||||||
|  | +	/* Cold case for crossing page with first load.	 */
 | ||||||
|  |  	.p2align 4 | ||||||
|  | -L(first_vec_x0):
 | ||||||
|  | -	tzcntl	%eax, %eax
 | ||||||
|  | -# ifdef USE_AS_WCSLEN
 | ||||||
|  | -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
 | ||||||
|  | -	sall	$2, %eax
 | ||||||
|  | -# endif
 | ||||||
|  | -	addq	%rdi, %rax
 | ||||||
|  | -	subq	%rdx, %rax
 | ||||||
|  | +L(cross_page_boundary):
 | ||||||
|  | +	movq	%rdi, %rdx
 | ||||||
|  | +	/* Align data to VEC_SIZE.  */
 | ||||||
|  | +	andq	$-VEC_SIZE, %rdi
 | ||||||
|  | +	VPCMP	$0, (%rdi), %YMMZERO, %k0
 | ||||||
|  | +	kmovd	%k0, %eax
 | ||||||
|  | +	/* Remove the leading bytes.  */
 | ||||||
|  |  # ifdef USE_AS_WCSLEN | ||||||
|  | -	shrq	$2, %rax
 | ||||||
|  | +	/* NB: Divide shift count by 4 since each bit in K0 represents 4
 | ||||||
|  | +	   bytes.  */
 | ||||||
|  | +	movl	%edx, %ecx
 | ||||||
|  | +	shrl	$2, %ecx
 | ||||||
|  | +	andl	$(CHAR_PER_VEC - 1), %ecx
 | ||||||
|  |  # endif | ||||||
|  | -	ret
 | ||||||
|  | -
 | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(first_vec_x1):
 | ||||||
|  | +	/* SHIFT_REG is ecx for USE_AS_WCSLEN and edx otherwise.  */
 | ||||||
|  | +	sarxl	%SHIFT_REG, %eax, %eax
 | ||||||
|  | +	testl	%eax, %eax
 | ||||||
|  | +# ifndef USE_AS_STRNLEN
 | ||||||
|  | +	jz	L(cross_page_continue)
 | ||||||
|  |  	tzcntl	%eax, %eax | ||||||
|  | -# ifdef USE_AS_WCSLEN
 | ||||||
|  | -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
 | ||||||
|  | -	sall	$2, %eax
 | ||||||
|  | -# endif
 | ||||||
|  | -	addq	$VEC_SIZE, %rax
 | ||||||
|  | -	addq	%rdi, %rax
 | ||||||
|  | -	subq	%rdx, %rax
 | ||||||
|  | -# ifdef USE_AS_WCSLEN
 | ||||||
|  | -	shrq	$2, %rax
 | ||||||
|  | -# endif
 | ||||||
|  |  	ret | ||||||
|  | -
 | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(first_vec_x2):
 | ||||||
|  | -	tzcntl	%eax, %eax
 | ||||||
|  | -# ifdef USE_AS_WCSLEN
 | ||||||
|  | -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
 | ||||||
|  | -	sall	$2, %eax
 | ||||||
|  | -# endif
 | ||||||
|  | -	addq	$(VEC_SIZE * 2), %rax
 | ||||||
|  | -	addq	%rdi, %rax
 | ||||||
|  | -	subq	%rdx, %rax
 | ||||||
|  | -# ifdef USE_AS_WCSLEN
 | ||||||
|  | -	shrq	$2, %rax
 | ||||||
|  | -# endif
 | ||||||
|  | +# else
 | ||||||
|  | +	jnz	L(cross_page_less_vec)
 | ||||||
|  | +#  ifndef USE_AS_WCSLEN
 | ||||||
|  | +	movl	%edx, %ecx
 | ||||||
|  | +	andl	$(CHAR_PER_VEC - 1), %ecx
 | ||||||
|  | +#  endif
 | ||||||
|  | +	movl	$CHAR_PER_VEC, %eax
 | ||||||
|  | +	subl	%ecx, %eax
 | ||||||
|  | +	/* Check the end of data.  */
 | ||||||
|  | +	cmpq	%rax, %rsi
 | ||||||
|  | +	ja	L(cross_page_continue)
 | ||||||
|  | +	movl	%esi, %eax
 | ||||||
|  |  	ret | ||||||
|  | -
 | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(4x_vec_end):
 | ||||||
|  | -	VPCMP	$0, %YMM1, %YMMZERO, %k0
 | ||||||
|  | -	kmovd	%k0, %eax
 | ||||||
|  | -	testl	%eax, %eax
 | ||||||
|  | -	jnz	L(first_vec_x0)
 | ||||||
|  | -	VPCMP	$0, %YMM2, %YMMZERO, %k1
 | ||||||
|  | -	kmovd	%k1, %eax
 | ||||||
|  | -	testl	%eax, %eax
 | ||||||
|  | -	jnz	L(first_vec_x1)
 | ||||||
|  | -	VPCMP	$0, %YMM3, %YMMZERO, %k2
 | ||||||
|  | -	kmovd	%k2, %eax
 | ||||||
|  | -	testl	%eax, %eax
 | ||||||
|  | -	jnz	L(first_vec_x2)
 | ||||||
|  | -	VPCMP	$0, %YMM4, %YMMZERO, %k3
 | ||||||
|  | -	kmovd	%k3, %eax
 | ||||||
|  | -L(first_vec_x3):
 | ||||||
|  | +L(cross_page_less_vec):
 | ||||||
|  |  	tzcntl	%eax, %eax | ||||||
|  | -# ifdef USE_AS_WCSLEN
 | ||||||
|  | -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
 | ||||||
|  | -	sall	$2, %eax
 | ||||||
|  | -# endif
 | ||||||
|  | -	addq	$(VEC_SIZE * 3), %rax
 | ||||||
|  | -	addq	%rdi, %rax
 | ||||||
|  | -	subq	%rdx, %rax
 | ||||||
|  | -# ifdef USE_AS_WCSLEN
 | ||||||
|  | -	shrq	$2, %rax
 | ||||||
|  | -# endif
 | ||||||
|  | +	/* Select min of length and position of first null.  */
 | ||||||
|  | +	cmpq	%rax, %rsi
 | ||||||
|  | +	cmovb	%esi, %eax
 | ||||||
|  |  	ret | ||||||
|  | +# endif
 | ||||||
|  |   | ||||||
|  |  END (STRLEN) | ||||||
|  |  #endif | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
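The tail of the strlen-evex rewrite above returns a position by stacking two per-vector match masks before a single tzcnt. Below is a minimal C sketch of that trick, not glibc code: mask1, mask2, and chars_per_vec are hypothetical stand-ins for the k-register masks and CHAR_PER_VEC in the assembly, and the caller must already know a match exists (the role of kortestd), since a trailing-zero count of zero is undefined.

    #include <stdint.h>

    /* Index of the first NUL across two consecutive vectors, one bit per
       character.  Shifting mask2 above mask1 lets one trailing-zero count
       scan both vectors, mirroring the sal/or/tzcnt sequence above.  */
    static inline unsigned int
    first_nul_index (uint32_t mask1, uint32_t mask2, unsigned int chars_per_vec)
    {
      uint64_t combined = ((uint64_t) mask2 << chars_per_vec) | mask1;
      return (unsigned int) __builtin_ctzll (combined);  /* caller ensures combined != 0 */
    }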
							
								
								
									
SOURCES/glibc-RHEL-15696-32.patch  (158 lines, Normal file)
									
								
							| @ -0,0 +1,158 @@ | |||||||
|  | From ea8e465a6b8d0f26c72bcbe453a854de3abf68ec Mon Sep 17 00:00:00 2001 | ||||||
|  | From: "H.J. Lu" <hjl.tools@gmail.com> | ||||||
|  | Date: Wed, 30 Jun 2021 10:47:06 -0700 | ||||||
|  | Subject: [PATCH] x86: Check RTM_ALWAYS_ABORT for RTM [BZ #28033] | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | From | ||||||
|  | 
 | ||||||
|  | https://www.intel.com/content/www/us/en/support/articles/000059422/processors.html | ||||||
|  | 
 | ||||||
|  | * Intel TSX will be disabled by default. | ||||||
|  | * The processor will force abort all Restricted Transactional Memory (RTM) | ||||||
|  |   transactions by default. | ||||||
|  | * A new CPUID bit CPUID.07H.0H.EDX[11](RTM_ALWAYS_ABORT) will be enumerated, | ||||||
|  |   which is set to indicate to updated software that the loaded microcode is | ||||||
|  |   forcing RTM abort. | ||||||
|  | * On processors that enumerate support for RTM, the CPUID enumeration bits | ||||||
|  |   for Intel TSX (CPUID.07H.0H.EBX[11] and CPUID.07H.0H.EBX[4]) continue to | ||||||
|  |   be set by default after microcode update. | ||||||
|  | * Workloads that benefited from Intel TSX might experience a change | ||||||
|  |   in performance. | ||||||
|  | * System software may use a new bit in Model-Specific Register (MSR) 0x10F | ||||||
|  |   TSX_FORCE_ABORT[TSX_CPUID_CLEAR] functionality to clear the Hardware Lock | ||||||
|  |   Elision (HLE) and RTM bits to indicate to software that Intel TSX is | ||||||
|  |   disabled. | ||||||
|  | 
 | ||||||
|  | 1. Add RTM_ALWAYS_ABORT to CPUID features. | ||||||
|  | 2. Set RTM usable only if RTM_ALWAYS_ABORT isn't set.  This skips the | ||||||
|  | string/tst-memchr-rtm etc. testcases on the affected processors, which | ||||||
|  | always fail after a microcode update. | ||||||
|  | 3. Check RTM feature, instead of usability, against /proc/cpuinfo. | ||||||
|  | 
 | ||||||
|  | This fixes BZ #28033. | ||||||
|  | ---
 | ||||||
|  |  manual/platform.texi                    | 3 +++ | ||||||
|  |  sysdeps/x86/cpu-features.c              | 5 ++++- | ||||||
|  |  sysdeps/x86/sys/platform/x86.h          | 6 +++--- | ||||||
|  |  sysdeps/x86/tst-cpu-features-supports.c | 2 +- | ||||||
|  |  sysdeps/x86/tst-get-cpu-features.c      | 2 ++ | ||||||
|  |  5 files changed, 13 insertions(+), 5 deletions(-) | ||||||
|  | 
 | ||||||
|  | Conflicts: | ||||||
|  | 	sysdeps/x86/bits/platform/x86.h | ||||||
|  | 	(doesn't exist) | ||||||
|  | 	sysdeps/x86/sys/platform/x86.h | ||||||
|  | 	(account for lack of upstream renames) | ||||||
|  | 
 | ||||||
|  | diff --git a/manual/platform.texi b/manual/platform.texi
 | ||||||
|  | index 8fec2933..b7e8aef7 100644
 | ||||||
|  | --- a/manual/platform.texi
 | ||||||
|  | +++ b/manual/platform.texi
 | ||||||
|  | @@ -510,6 +510,9 @@ capability.
 | ||||||
|  |  @item | ||||||
|  |  @code{RTM} -- RTM instruction extensions. | ||||||
|  |   | ||||||
|  | +@item
 | ||||||
|  | +@code{RTM_ALWAYS_ABORT} -- Transactions always abort, making RTM unusable.
 | ||||||
|  | +
 | ||||||
|  |  @item | ||||||
|  |  @code{SDBG} -- IA32_DEBUG_INTERFACE MSR for silicon debug. | ||||||
|  |   | ||||||
|  | diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
 | ||||||
|  | index 3610ee5c..4889f062 100644
 | ||||||
|  | --- a/sysdeps/x86/cpu-features.c
 | ||||||
|  | +++ b/sysdeps/x86/cpu-features.c
 | ||||||
|  | @@ -74,7 +74,6 @@ update_usable (struct cpu_features *cpu_features)
 | ||||||
|  |    CPU_FEATURE_SET_USABLE (cpu_features, HLE); | ||||||
|  |    CPU_FEATURE_SET_USABLE (cpu_features, BMI2); | ||||||
|  |    CPU_FEATURE_SET_USABLE (cpu_features, ERMS); | ||||||
|  | -  CPU_FEATURE_SET_USABLE (cpu_features, RTM);
 | ||||||
|  |    CPU_FEATURE_SET_USABLE (cpu_features, RDSEED); | ||||||
|  |    CPU_FEATURE_SET_USABLE (cpu_features, ADX); | ||||||
|  |    CPU_FEATURE_SET_USABLE (cpu_features, CLFLUSHOPT); | ||||||
|  | @@ -90,6 +89,7 @@ update_usable (struct cpu_features *cpu_features)
 | ||||||
|  |    CPU_FEATURE_SET_USABLE (cpu_features, MOVDIRI); | ||||||
|  |    CPU_FEATURE_SET_USABLE (cpu_features, MOVDIR64B); | ||||||
|  |    CPU_FEATURE_SET_USABLE (cpu_features, FSRM); | ||||||
|  | +  CPU_FEATURE_SET_USABLE (cpu_features, RTM_ALWAYS_ABORT);
 | ||||||
|  |    CPU_FEATURE_SET_USABLE (cpu_features, SERIALIZE); | ||||||
|  |    CPU_FEATURE_SET_USABLE (cpu_features, TSXLDTRK); | ||||||
|  |    CPU_FEATURE_SET_USABLE (cpu_features, LAHF64_SAHF64); | ||||||
|  | @@ -779,6 +779,9 @@ no_cpuid:
 | ||||||
|  |      GLRO(dl_platform) = "i586"; | ||||||
|  |  #endif | ||||||
|  |   | ||||||
|  | +  if (!CPU_FEATURES_CPU_P (cpu_features, RTM_ALWAYS_ABORT))
 | ||||||
|  | +    CPU_FEATURE_SET_USABLE (cpu_features, RTM);
 | ||||||
|  | +
 | ||||||
|  |  #if CET_ENABLED | ||||||
|  |  # if HAVE_TUNABLES | ||||||
|  |    TUNABLE_GET (x86_ibt, tunable_val_t *, | ||||||
|  | diff --git a/sysdeps/x86/sys/platform/x86.h b/sysdeps/x86/sys/platform/x86.h
 | ||||||
|  | index e5cc7c68..7a434926 100644
 | ||||||
|  | --- a/sysdeps/x86/sys/platform/x86.h
 | ||||||
|  | +++ b/sysdeps/x86/sys/platform/x86.h
 | ||||||
|  | @@ -247,7 +247,7 @@ extern const struct cpu_features *__x86_get_cpu_features (unsigned int)
 | ||||||
|  |  #define bit_cpu_AVX512_VP2INTERSECT (1u << 8) | ||||||
|  |  #define bit_cpu_INDEX_7_EDX_9	(1u << 9) | ||||||
|  |  #define bit_cpu_MD_CLEAR	(1u << 10) | ||||||
|  | -#define bit_cpu_INDEX_7_EDX_11	(1u << 11)
 | ||||||
|  | +#define bit_cpu_RTM_ALWAYS_ABORT (1u << 11)
 | ||||||
|  |  #define bit_cpu_INDEX_7_EDX_12	(1u << 12) | ||||||
|  |  #define bit_cpu_INDEX_7_EDX_13	(1u << 13) | ||||||
|  |  #define bit_cpu_SERIALIZE	(1u << 14) | ||||||
|  | @@ -471,7 +471,7 @@ extern const struct cpu_features *__x86_get_cpu_features (unsigned int)
 | ||||||
|  |  #define index_cpu_AVX512_VP2INTERSECT COMMON_CPUID_INDEX_7 | ||||||
|  |  #define index_cpu_INDEX_7_EDX_9	COMMON_CPUID_INDEX_7 | ||||||
|  |  #define index_cpu_MD_CLEAR	COMMON_CPUID_INDEX_7 | ||||||
|  | -#define index_cpu_INDEX_7_EDX_11 COMMON_CPUID_INDEX_7
 | ||||||
|  | +#define index_cpu_RTM_ALWAYS_ABORT COMMON_CPUID_INDEX_7
 | ||||||
|  |  #define index_cpu_INDEX_7_EDX_12 COMMON_CPUID_INDEX_7 | ||||||
|  |  #define index_cpu_INDEX_7_EDX_13 COMMON_CPUID_INDEX_7 | ||||||
|  |  #define index_cpu_SERIALIZE	COMMON_CPUID_INDEX_7 | ||||||
|  | @@ -695,7 +695,7 @@ extern const struct cpu_features *__x86_get_cpu_features (unsigned int)
 | ||||||
|  |  #define reg_AVX512_VP2INTERSECT	edx | ||||||
|  |  #define reg_INDEX_7_EDX_9	edx | ||||||
|  |  #define reg_MD_CLEAR		edx | ||||||
|  | -#define reg_INDEX_7_EDX_11	edx
 | ||||||
|  | +#define reg_RTM_ALWAYS_ABORT	edx
 | ||||||
|  |  #define reg_INDEX_7_EDX_12	edx | ||||||
|  |  #define reg_INDEX_7_EDX_13	edx | ||||||
|  |  #define reg_SERIALIZE		edx | ||||||
|  | diff --git a/sysdeps/x86/tst-cpu-features-supports.c b/sysdeps/x86/tst-cpu-features-supports.c
 | ||||||
|  | index 287cf01f..8100a319 100644
 | ||||||
|  | --- a/sysdeps/x86/tst-cpu-features-supports.c
 | ||||||
|  | +++ b/sysdeps/x86/tst-cpu-features-supports.c
 | ||||||
|  | @@ -152,7 +152,7 @@ do_test (int argc, char **argv)
 | ||||||
|  |    fails += CHECK_SUPPORTS (rdpid, RDPID); | ||||||
|  |    fails += CHECK_SUPPORTS (rdrnd, RDRAND); | ||||||
|  |    fails += CHECK_SUPPORTS (rdseed, RDSEED); | ||||||
|  | -  fails += CHECK_SUPPORTS (rtm, RTM);
 | ||||||
|  | +  fails += CHECK_CPU_SUPPORTS (rtm, RTM);
 | ||||||
|  |    fails += CHECK_SUPPORTS (serialize, SERIALIZE); | ||||||
|  |    fails += CHECK_SUPPORTS (sha, SHA); | ||||||
|  |    fails += CHECK_CPU_SUPPORTS (shstk, SHSTK); | ||||||
|  | diff --git a/sysdeps/x86/tst-get-cpu-features.c b/sysdeps/x86/tst-get-cpu-features.c
 | ||||||
|  | index 2763deb6..0717e5d8 100644
 | ||||||
|  | --- a/sysdeps/x86/tst-get-cpu-features.c
 | ||||||
|  | +++ b/sysdeps/x86/tst-get-cpu-features.c
 | ||||||
|  | @@ -183,6 +183,7 @@ do_test (void)
 | ||||||
|  |    CHECK_CPU_FEATURE (UINTR); | ||||||
|  |    CHECK_CPU_FEATURE (AVX512_VP2INTERSECT); | ||||||
|  |    CHECK_CPU_FEATURE (MD_CLEAR); | ||||||
|  | +  CHECK_CPU_FEATURE (RTM_ALWAYS_ABORT);
 | ||||||
|  |    CHECK_CPU_FEATURE (SERIALIZE); | ||||||
|  |    CHECK_CPU_FEATURE (HYBRID); | ||||||
|  |    CHECK_CPU_FEATURE (TSXLDTRK); | ||||||
|  | @@ -344,6 +345,7 @@ do_test (void)
 | ||||||
|  |    CHECK_CPU_FEATURE_USABLE (FSRM); | ||||||
|  |    CHECK_CPU_FEATURE_USABLE (AVX512_VP2INTERSECT); | ||||||
|  |    CHECK_CPU_FEATURE_USABLE (MD_CLEAR); | ||||||
|  | +  CHECK_CPU_FEATURE_USABLE (RTM_ALWAYS_ABORT);
 | ||||||
|  |    CHECK_CPU_FEATURE_USABLE (SERIALIZE); | ||||||
|  |    CHECK_CPU_FEATURE_USABLE (HYBRID); | ||||||
|  |    CHECK_CPU_FEATURE_USABLE (TSXLDTRK); | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
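A user-space sketch of the gate this patch introduces, not glibc code: it assumes GCC/clang's <cpuid.h> helper __get_cpuid_count and the bit positions named above (CPUID.07H.0H.EBX[11] = RTM, CPUID.07H.0H.EDX[11] = RTM_ALWAYS_ABORT).

    #include <cpuid.h>
    #include <stdbool.h>

    /* RTM is worth using only when the microcode is not forcing every
       transaction to abort, which mirrors the ordering the patch
       installs in cpu-features.c.  */
    static bool
    rtm_actually_usable (void)
    {
      unsigned int eax, ebx, ecx, edx;
      if (!__get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx))
        return false;
      bool rtm = (ebx >> 11) & 1;
      bool rtm_always_abort = (edx >> 11) & 1;
      return rtm && !rtm_always_abort;
    }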
							
								
								
									
SOURCES/glibc-RHEL-15696-33.patch  (51 lines, Normal file)
									
								
							| @ -0,0 +1,51 @@ | |||||||
|  | From 0679442defedf7e52a94264975880ab8674736b2 Mon Sep 17 00:00:00 2001 | ||||||
|  | From: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  | Date: Thu, 8 Jul 2021 16:13:19 -0400 | ||||||
|  | Subject: [PATCH] x86: Remove wcsnlen-sse4_1 from wcslen ifunc-impl-list [BZ | ||||||
|  |  #28064] | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | The following commit | ||||||
|  | 
 | ||||||
|  | commit 6f573a27b6c8b4236445810a44660612323f5a73 | ||||||
|  | Author: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  | Date:   Wed Jun 23 01:19:34 2021 -0400 | ||||||
|  | 
 | ||||||
|  |     x86-64: Add wcslen optimize for sse4.1 | ||||||
|  | 
 | ||||||
|  | Added wcsnlen-sse4.1 to the wcslen ifunc implementation list and did | ||||||
|  | not add wcslen-sse4.1 to the wcslen ifunc implementation list. This commit | ||||||
|  | fixes that by removing wcsnlen-sse4.1 from the wcslen ifunc | ||||||
|  | implementation list and adding wcslen-sse4.1 to the ifunc | ||||||
|  | implementation list. | ||||||
|  | 
 | ||||||
|  | Testing: | ||||||
|  | test-wcslen.c, test-rsi-wcslen.c, and test-rsi-strlen.c are passing as | ||||||
|  | well as all other tests in wcsmbs and string. | ||||||
|  | 
 | ||||||
|  | Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  | Reviewed-by: H.J. Lu <hjl.tools@gmail.com> | ||||||
|  | ---
 | ||||||
|  |  sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 ++-- | ||||||
|  |  1 file changed, 2 insertions(+), 2 deletions(-) | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 | ||||||
|  | index 580913ca..695cdba6 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 | ||||||
|  | @@ -657,9 +657,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 | ||||||
|  |  			       && CPU_FEATURE_USABLE (AVX512BW) | ||||||
|  |  			       && CPU_FEATURE_USABLE (BMI2)), | ||||||
|  |  			      __wcslen_evex) | ||||||
|  | -	      IFUNC_IMPL_ADD (array, i, wcsnlen,
 | ||||||
|  | +	      IFUNC_IMPL_ADD (array, i, wcslen,
 | ||||||
|  |  			      CPU_FEATURE_USABLE (SSE4_1), | ||||||
|  | -			      __wcsnlen_sse4_1)
 | ||||||
|  | +			      __wcslen_sse4_1)
 | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_sse2)) | ||||||
|  |   | ||||||
|  |    /* Support sysdeps/x86_64/multiarch/wcsnlen.c.  */ | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
							
								
								
									
SOURCES/glibc-RHEL-15696-34.patch  (135 lines, Normal file)
									
								
							| @ -0,0 +1,135 @@ | |||||||
|  | From c6272098323153db373f2986c67786ea8c85f1cf Mon Sep 17 00:00:00 2001 | ||||||
|  | From: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  | Date: Tue, 15 Feb 2022 08:18:15 -0600 | ||||||
|  | Subject: [PATCH] x86: Fallback {str|wcs}cmp RTM in the ncmp overflow case [BZ | ||||||
|  |  #28896] | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | In the overflow fallback strncmp-avx2-rtm and wcsncmp-avx2-rtm would | ||||||
|  | call strcmp-avx2 and wcscmp-avx2 respectively. These calls had | ||||||
|  | no checks around vzeroupper and would trigger spurious | ||||||
|  | aborts. This commit fixes that. | ||||||
|  | 
 | ||||||
|  | test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass on | ||||||
|  | AVX2 machines with and without RTM. | ||||||
|  | 
 | ||||||
|  | Co-authored-by: H.J. Lu <hjl.tools@gmail.com> | ||||||
|  | ---
 | ||||||
|  |  sysdeps/x86/Makefile                        |  2 +- | ||||||
|  |  sysdeps/x86/tst-strncmp-rtm.c               | 17 ++++++++++++++++- | ||||||
|  |  sysdeps/x86_64/multiarch/strcmp-avx2.S      |  2 +- | ||||||
|  |  sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S |  1 + | ||||||
|  |  sysdeps/x86_64/multiarch/strncmp-avx2.S     |  1 + | ||||||
|  |  sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S |  2 +- | ||||||
|  |  sysdeps/x86_64/multiarch/wcsncmp-avx2.S     |  2 +- | ||||||
|  |  7 files changed, 22 insertions(+), 5 deletions(-) | ||||||
|  | 
 | ||||||
|  | Conflicts: | ||||||
|  | 	sysdeps/x86_64/multiarch/strcmp-avx2.S | ||||||
|  | 	(split into two patches due to upstream bug differences) | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
 | ||||||
|  | index 5be71ada..2d814915 100644
 | ||||||
|  | --- a/sysdeps/x86/Makefile
 | ||||||
|  | +++ b/sysdeps/x86/Makefile
 | ||||||
|  | @@ -38,7 +38,7 @@ CFLAGS-tst-memset-rtm.c += -mrtm
 | ||||||
|  |  CFLAGS-tst-strchr-rtm.c += -mrtm | ||||||
|  |  CFLAGS-tst-strcpy-rtm.c += -mrtm | ||||||
|  |  CFLAGS-tst-strlen-rtm.c += -mrtm | ||||||
|  | -CFLAGS-tst-strncmp-rtm.c += -mrtm
 | ||||||
|  | +CFLAGS-tst-strncmp-rtm.c += -mrtm -Wno-error
 | ||||||
|  |  CFLAGS-tst-strrchr-rtm.c += -mrtm | ||||||
|  |  endif | ||||||
|  |   | ||||||
|  | diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
 | ||||||
|  | index 236ad951..4d0004b5 100644
 | ||||||
|  | --- a/sysdeps/x86/tst-strncmp-rtm.c
 | ||||||
|  | +++ b/sysdeps/x86/tst-strncmp-rtm.c
 | ||||||
|  | @@ -16,6 +16,7 @@
 | ||||||
|  |     License along with the GNU C Library; if not, see | ||||||
|  |     <https://www.gnu.org/licenses/>.  */ | ||||||
|  |   | ||||||
|  | +#include <stdint.h>
 | ||||||
|  |  #include <tst-string-rtm.h> | ||||||
|  |   | ||||||
|  |  #define LOOP 3000 | ||||||
|  | @@ -45,8 +46,22 @@ function (void)
 | ||||||
|  |      return 1; | ||||||
|  |  } | ||||||
|  |   | ||||||
|  | +__attribute__ ((noinline, noclone))
 | ||||||
|  | +static int
 | ||||||
|  | +function_overflow (void)
 | ||||||
|  | +{
 | ||||||
|  | +  if (strncmp (string1, string2, SIZE_MAX) == 0)
 | ||||||
|  | +    return 0;
 | ||||||
|  | +  else
 | ||||||
|  | +    return 1;
 | ||||||
|  | +}
 | ||||||
|  | +
 | ||||||
|  |  static int | ||||||
|  |  do_test (void) | ||||||
|  |  { | ||||||
|  | -  return do_test_1 ("strncmp", LOOP, prepare, function);
 | ||||||
|  | +  int status = do_test_1 ("strncmp", LOOP, prepare, function);
 | ||||||
|  | +  if (status != EXIT_SUCCESS)
 | ||||||
|  | +    return status;
 | ||||||
|  | +  status = do_test_1 ("strncmp", LOOP, prepare, function_overflow);
 | ||||||
|  | +  return status;
 | ||||||
|  |  } | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
 | ||||||
|  | index 5d1c9d90..433ae047 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
 | ||||||
|  | @@ -95,7 +95,7 @@ ENTRY (STRCMP)
 | ||||||
|  |  	   length to bound a valid memory region. In these cases just use | ||||||
|  |  	   'wcscmp'.  */ | ||||||
|  |  	shrq	$56, %rcx | ||||||
|  | -	jnz	__wcscmp_avx2
 | ||||||
|  | +	jnz	OVERFLOW_STRCMP
 | ||||||
|  |  #  endif | ||||||
|  |  	/* Convert units: from wide to byte char.  */ | ||||||
|  |  	shl	$2, %RDX_LP | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S
 | ||||||
|  | index 37d1224b..68bad365 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S
 | ||||||
|  | @@ -1,3 +1,4 @@
 | ||||||
|  |  #define STRCMP	__strncmp_avx2_rtm | ||||||
|  |  #define USE_AS_STRNCMP 1 | ||||||
|  | +#define OVERFLOW_STRCMP	__strcmp_avx2_rtm
 | ||||||
|  |  #include "strcmp-avx2-rtm.S" | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2.S b/sysdeps/x86_64/multiarch/strncmp-avx2.S
 | ||||||
|  | index 1678bcc2..f138e9f1 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/strncmp-avx2.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/strncmp-avx2.S
 | ||||||
|  | @@ -1,3 +1,4 @@
 | ||||||
|  |  #define STRCMP	__strncmp_avx2 | ||||||
|  |  #define USE_AS_STRNCMP 1 | ||||||
|  | +#define OVERFLOW_STRCMP __strcmp_avx2
 | ||||||
|  |  #include "strcmp-avx2.S" | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S
 | ||||||
|  | index 4e88c70c..f467582c 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S
 | ||||||
|  | @@ -1,5 +1,5 @@
 | ||||||
|  |  #define STRCMP __wcsncmp_avx2_rtm | ||||||
|  |  #define USE_AS_STRNCMP 1 | ||||||
|  |  #define USE_AS_WCSCMP 1 | ||||||
|  | -
 | ||||||
|  | +#define OVERFLOW_STRCMP	__wcscmp_avx2_rtm
 | ||||||
|  |  #include "strcmp-avx2-rtm.S" | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/wcsncmp-avx2.S b/sysdeps/x86_64/multiarch/wcsncmp-avx2.S
 | ||||||
|  | index 4fa1de4d..e9ede522 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/wcsncmp-avx2.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/wcsncmp-avx2.S
 | ||||||
|  | @@ -1,5 +1,5 @@
 | ||||||
|  |  #define STRCMP __wcsncmp_avx2 | ||||||
|  |  #define USE_AS_STRNCMP 1 | ||||||
|  |  #define USE_AS_WCSCMP 1 | ||||||
|  | -
 | ||||||
|  | +#define OVERFLOW_STRCMP	__wcscmp_avx2
 | ||||||
|  |  #include "strcmp-avx2.S" | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
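The overflow fallback itself is easiest to see in C. This is a sketch under the patch's assumptions (64-bit size_t), not the real implementation, and wcsncmp_sketch is a hypothetical name: for wcsncmp the length is later scaled to bytes with shl $2, so any length with bits in its top byte (the shrq $56 test shown in the strcmp-avx2.S hunk) cannot bound valid memory and may be treated as unbounded. Routing that case through OVERFLOW_STRCMP keeps the RTM build inside entry points that carry the vzeroupper checks.

    #include <stddef.h>
    #include <wchar.h>

    int
    wcsncmp_sketch (const wchar_t *s1, const wchar_t *s2, size_t n)
    {
      /* Matches "shrq $56, %rcx; jnz OVERFLOW_STRCMP": such an n cannot
         describe a valid region, so scaling it by sizeof (wchar_t) must
         not be allowed to wrap.  The RTM build targets __wcscmp_avx2_rtm
         here instead of plain __wcscmp_avx2.  */
      if (n >> 56)
        return wcscmp (s1, s2);
      return wcsncmp (s1, s2, n);  /* normal bounded path */
    }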
							
								
								
									
SOURCES/glibc-RHEL-15696-35.patch  (51 lines, Normal file)
									
								
							| @ -0,0 +1,51 @@ | |||||||
|  | From 55c7bcc71b84123d5d4bd2814366a6b05fcf8ebd Mon Sep 17 00:00:00 2001 | ||||||
|  | From: "H.J. Lu" <hjl.tools@gmail.com> | ||||||
|  | Date: Sat, 9 May 2020 12:04:23 -0700 | ||||||
|  | Subject: [PATCH] x86-64: Use RDX_LP on __x86_shared_non_temporal_threshold [BZ | ||||||
|  |  #25966] | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | Since __x86_shared_non_temporal_threshold is defined as | ||||||
|  | 
 | ||||||
|  | long int __x86_shared_non_temporal_threshold; | ||||||
|  | 
 | ||||||
|  | and long int is 4 bytes for x32, use RDX_LP to compare against | ||||||
|  | __x86_shared_non_temporal_threshold in assembly code. | ||||||
|  | ---
 | ||||||
|  |  sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S | 6 +++--- | ||||||
|  |  1 file changed, 3 insertions(+), 3 deletions(-) | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
 | ||||||
|  | index 71f5954d..673b73aa 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
 | ||||||
|  | @@ -245,7 +245,7 @@ L(return):
 | ||||||
|  |  #endif | ||||||
|  |   | ||||||
|  |  L(movsb): | ||||||
|  | -	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
 | ||||||
|  | +	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
 | ||||||
|  |  	jae	L(more_8x_vec) | ||||||
|  |  	cmpq	%rsi, %rdi | ||||||
|  |  	jb	1f | ||||||
|  | @@ -397,7 +397,7 @@ L(more_8x_vec):
 | ||||||
|  |  	addq	%r8, %rdx | ||||||
|  |  #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) | ||||||
|  |  	/* Check non-temporal store threshold.  */ | ||||||
|  | -	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
 | ||||||
|  | +	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
 | ||||||
|  |  	ja	L(large_forward) | ||||||
|  |  #endif | ||||||
|  |  L(loop_4x_vec_forward): | ||||||
|  | @@ -448,7 +448,7 @@ L(more_8x_vec_backward):
 | ||||||
|  |  	subq	%r8, %rdx | ||||||
|  |  #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) | ||||||
|  |  	/* Check non-temporal store threshold.  */ | ||||||
|  | -	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
 | ||||||
|  | +	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
 | ||||||
|  |  	ja	L(large_backward) | ||||||
|  |  #endif | ||||||
|  |  L(loop_4x_vec_backward): | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
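Why RDX_LP matters here: on x32 long int is only 4 bytes, so a 64-bit cmpq against __x86_shared_non_temporal_threshold would also read 4 unrelated bytes beyond it. Roughly, and only as a sketch of the convention in glibc's x86-64 sysdep headers (the exact definitions live there):

    #ifdef __LP64__
    # define RDX_LP rdx  /* long int is 64-bit: compare the full register.  */
    #else
    # define RDX_LP edx  /* x32: long int is 32-bit, compare only 4 bytes.  */
    #endif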
							
								
								
									
SOURCES/glibc-RHEL-15696-36.patch  (44 lines, Normal file)
									
								
							| @ -0,0 +1,44 @@ | |||||||
|  | From a35a59036ebae3efcdf5e8167610e0656fca9770 Mon Sep 17 00:00:00 2001 | ||||||
|  | From: "H.J. Lu" <hjl.tools@gmail.com> | ||||||
|  | Date: Thu, 11 Jun 2020 12:41:18 -0700 | ||||||
|  | Subject: [PATCH] x86_64: Use %xmmN with vpxor to clear a vector register | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | Since "vpxor %xmmN, %xmmN, %xmmN" clears the whole vector register, use | ||||||
|  | %xmmN, instead of %ymmN, with vpxor to clear a vector register. | ||||||
|  | ---
 | ||||||
|  |  sysdeps/x86_64/multiarch/strcmp-avx2.S  | 4 ++-- | ||||||
|  |  sysdeps/x86_64/multiarch/strrchr-avx2.S | 2 +- | ||||||
|  |  2 files changed, 3 insertions(+), 3 deletions(-) | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
 | ||||||
|  | index 433ae047..70d8499b 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
 | ||||||
|  | @@ -105,8 +105,8 @@ ENTRY (STRCMP)
 | ||||||
|  |  # endif | ||||||
|  |  	movl	%edi, %eax | ||||||
|  |  	xorl	%edx, %edx | ||||||
|  | -	/* Make %ymm7 all zeros in this function.  */
 | ||||||
|  | -	vpxor	%ymm7, %ymm7, %ymm7
 | ||||||
|  | +	/* Make %xmm7 (%ymm7) all zeros in this function.  */
 | ||||||
|  | +	vpxor	%xmm7, %xmm7, %xmm7
 | ||||||
|  |  	orl	%esi, %eax | ||||||
|  |  	andl	$(PAGE_SIZE - 1), %eax | ||||||
|  |  	cmpl	$(PAGE_SIZE - (VEC_SIZE * 4)), %eax | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
 | ||||||
|  | index 9f22a15e..c949410b 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/strrchr-avx2.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S
 | ||||||
|  | @@ -48,7 +48,7 @@ ENTRY (STRRCHR)
 | ||||||
|  |  	movl	%edi, %ecx | ||||||
|  |  	/* Broadcast CHAR to YMM4.  */ | ||||||
|  |  	VPBROADCAST %xmm4, %ymm4 | ||||||
|  | -	vpxor	%ymm0, %ymm0, %ymm0
 | ||||||
|  | +	vpxor	%xmm0, %xmm0, %xmm0
 | ||||||
|  |   | ||||||
|  |  	/* Check if we may cross page boundary with one vector load.  */ | ||||||
|  |  	andl	$(2 * VEC_SIZE - 1), %ecx | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
							
								
								
									
SOURCES/glibc-RHEL-15696-37.patch  (359 lines, Normal file)
									
								
							| @ -0,0 +1,359 @@ | |||||||
|  | From 1f745ecc2109890886b161d4791e1406fdfc29b8 Mon Sep 17 00:00:00 2001 | ||||||
|  | From: noah <goldstein.w.n@gmail.com> | ||||||
|  | Date: Wed, 3 Feb 2021 00:38:59 -0500 | ||||||
|  | Subject: [PATCH] x86-64: Refactor and improve performance of strchr-avx2.S | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | No bug. Just seemed the performance could be improved a bit. Observed | ||||||
|  | and expected behavior are unchanged. Optimized body of main | ||||||
|  | loop. Updated page cross logic and optimized accordingly. Made a few | ||||||
|  | minor instruction selection modifications. No regressions in test | ||||||
|  | suite. Both test-strchrnul and test-strchr passed. | ||||||
|  | ---
 | ||||||
|  |  sysdeps/x86_64/multiarch/strchr-avx2.S | 225 ++++++++++++------------- | ||||||
|  |  sysdeps/x86_64/multiarch/strchr.c      |   4 +- | ||||||
|  |  2 files changed, 114 insertions(+), 115 deletions(-) | ||||||
|  | 
 | ||||||
|  | Conflicts: | ||||||
|  | 	sysdeps/x86_64/multiarch/strchr.c | ||||||
|  | 	(account for missing upstream macros) | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
 | ||||||
|  | index da7d2620..919d256c 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/strchr-avx2.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
 | ||||||
|  | @@ -27,10 +27,12 @@
 | ||||||
|  |  # ifdef USE_AS_WCSCHR | ||||||
|  |  #  define VPBROADCAST	vpbroadcastd | ||||||
|  |  #  define VPCMPEQ	vpcmpeqd | ||||||
|  | +#  define VPMINU	vpminud
 | ||||||
|  |  #  define CHAR_REG	esi | ||||||
|  |  # else | ||||||
|  |  #  define VPBROADCAST	vpbroadcastb | ||||||
|  |  #  define VPCMPEQ	vpcmpeqb | ||||||
|  | +#  define VPMINU	vpminub
 | ||||||
|  |  #  define CHAR_REG	sil | ||||||
|  |  # endif | ||||||
|  |   | ||||||
|  | @@ -43,71 +45,54 @@
 | ||||||
|  |  # endif | ||||||
|  |   | ||||||
|  |  # define VEC_SIZE 32 | ||||||
|  | +# define PAGE_SIZE 4096
 | ||||||
|  |   | ||||||
|  |  	.section SECTION(.text),"ax",@progbits | ||||||
|  |  ENTRY (STRCHR) | ||||||
|  |  	movl	%edi, %ecx | ||||||
|  | -	/* Broadcast CHAR to YMM0.  */
 | ||||||
|  | +# ifndef USE_AS_STRCHRNUL
 | ||||||
|  | +	xorl	%edx, %edx
 | ||||||
|  | +# endif
 | ||||||
|  | +
 | ||||||
|  | +	/* Broadcast CHAR to YMM0.	*/
 | ||||||
|  |  	vmovd	%esi, %xmm0 | ||||||
|  |  	vpxor	%xmm9, %xmm9, %xmm9 | ||||||
|  |  	VPBROADCAST %xmm0, %ymm0 | ||||||
|  | -	/* Check if we may cross page boundary with one vector load.  */
 | ||||||
|  | -	andl	$(2 * VEC_SIZE - 1), %ecx
 | ||||||
|  | -	cmpl	$VEC_SIZE, %ecx
 | ||||||
|  | -	ja	L(cros_page_boundary)
 | ||||||
|  |   | ||||||
|  | -	/* Check the first VEC_SIZE bytes.  Search for both CHAR and the
 | ||||||
|  | -	   null byte.  */
 | ||||||
|  | -	vmovdqu	(%rdi), %ymm8
 | ||||||
|  | -	VPCMPEQ %ymm8, %ymm0, %ymm1
 | ||||||
|  | -	VPCMPEQ %ymm8, %ymm9, %ymm2
 | ||||||
|  | -	vpor	%ymm1, %ymm2, %ymm1
 | ||||||
|  | -	vpmovmskb %ymm1, %eax
 | ||||||
|  | -	testl	%eax, %eax
 | ||||||
|  | -	jnz	L(first_vec_x0)
 | ||||||
|  | +	/* Check if we cross page boundary with one vector load.  */
 | ||||||
|  | +	andl	$(PAGE_SIZE - 1), %ecx
 | ||||||
|  | +	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
 | ||||||
|  | +	ja  L(cross_page_boundary)
 | ||||||
|  |   | ||||||
|  | -	/* Align data for aligned loads in the loop.  */
 | ||||||
|  | -	addq	$VEC_SIZE, %rdi
 | ||||||
|  | -	andl	$(VEC_SIZE - 1), %ecx
 | ||||||
|  | -	andq	$-VEC_SIZE, %rdi
 | ||||||
|  | -
 | ||||||
|  | -	jmp	L(more_4x_vec)
 | ||||||
|  | -
 | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(cros_page_boundary):
 | ||||||
|  | -	andl	$(VEC_SIZE - 1), %ecx
 | ||||||
|  | -	andq	$-VEC_SIZE, %rdi
 | ||||||
|  | +	/* Check the first VEC_SIZE bytes.	Search for both CHAR and the
 | ||||||
|  | +	   null byte.  */
 | ||||||
|  |  	vmovdqu	(%rdi), %ymm8 | ||||||
|  |  	VPCMPEQ %ymm8, %ymm0, %ymm1 | ||||||
|  |  	VPCMPEQ %ymm8, %ymm9, %ymm2 | ||||||
|  |  	vpor	%ymm1, %ymm2, %ymm1 | ||||||
|  |  	vpmovmskb %ymm1, %eax | ||||||
|  | -	/* Remove the leading bytes.  */
 | ||||||
|  | -	sarl	%cl, %eax
 | ||||||
|  |  	testl	%eax, %eax | ||||||
|  | -	jz	L(aligned_more)
 | ||||||
|  | -	/* Found CHAR or the null byte.  */
 | ||||||
|  | +	jz	L(more_vecs)
 | ||||||
|  |  	tzcntl	%eax, %eax | ||||||
|  | -	addq	%rcx, %rax
 | ||||||
|  | -# ifdef USE_AS_STRCHRNUL
 | ||||||
|  | +	/* Found CHAR or the null byte.	 */
 | ||||||
|  |  	addq	%rdi, %rax | ||||||
|  | -# else
 | ||||||
|  | -	xorl	%edx, %edx
 | ||||||
|  | -	leaq	(%rdi, %rax), %rax
 | ||||||
|  | -	cmp	(%rax), %CHAR_REG
 | ||||||
|  | +# ifndef USE_AS_STRCHRNUL
 | ||||||
|  | +	cmp (%rax), %CHAR_REG
 | ||||||
|  |  	cmovne	%rdx, %rax | ||||||
|  |  # endif | ||||||
|  |  L(return_vzeroupper): | ||||||
|  |  	ZERO_UPPER_VEC_REGISTERS_RETURN | ||||||
|  |   | ||||||
|  |  	.p2align 4 | ||||||
|  | +L(more_vecs):
 | ||||||
|  | +	/* Align data for aligned loads in the loop.  */
 | ||||||
|  | +	andq	$-VEC_SIZE, %rdi
 | ||||||
|  |  L(aligned_more): | ||||||
|  | -	addq	$VEC_SIZE, %rdi
 | ||||||
|  |   | ||||||
|  | -L(more_4x_vec):
 | ||||||
|  | -	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
 | ||||||
|  | -	   since data is only aligned to VEC_SIZE.  */
 | ||||||
|  | -	vmovdqa	(%rdi), %ymm8
 | ||||||
|  | +	/* Check the next 4 * VEC_SIZE.	 Only one VEC_SIZE at a time
 | ||||||
|  | +	   since data is only aligned to VEC_SIZE.	*/
 | ||||||
|  | +	vmovdqa	VEC_SIZE(%rdi), %ymm8
 | ||||||
|  | +	addq	$VEC_SIZE, %rdi
 | ||||||
|  |  	VPCMPEQ %ymm8, %ymm0, %ymm1 | ||||||
|  |  	VPCMPEQ %ymm8, %ymm9, %ymm2 | ||||||
|  |  	vpor	%ymm1, %ymm2, %ymm1 | ||||||
|  | @@ -137,61 +122,24 @@ L(more_4x_vec):
 | ||||||
|  |  	vpor	%ymm1, %ymm2, %ymm1 | ||||||
|  |  	vpmovmskb %ymm1, %eax | ||||||
|  |  	testl	%eax, %eax | ||||||
|  | -	jnz	L(first_vec_x3)
 | ||||||
|  | -
 | ||||||
|  | -	addq	$(VEC_SIZE * 4), %rdi
 | ||||||
|  | -
 | ||||||
|  | -	/* Align data to 4 * VEC_SIZE.  */
 | ||||||
|  | -	movq	%rdi, %rcx
 | ||||||
|  | -	andl	$(4 * VEC_SIZE - 1), %ecx
 | ||||||
|  | -	andq	$-(4 * VEC_SIZE), %rdi
 | ||||||
|  | -
 | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(loop_4x_vec):
 | ||||||
|  | -	/* Compare 4 * VEC at a time forward.  */
 | ||||||
|  | -	vmovdqa	(%rdi), %ymm5
 | ||||||
|  | -	vmovdqa	VEC_SIZE(%rdi), %ymm6
 | ||||||
|  | -	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm7
 | ||||||
|  | -	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
 | ||||||
|  | -
 | ||||||
|  | -	VPCMPEQ %ymm5, %ymm0, %ymm1
 | ||||||
|  | -	VPCMPEQ %ymm6, %ymm0, %ymm2
 | ||||||
|  | -	VPCMPEQ %ymm7, %ymm0, %ymm3
 | ||||||
|  | -	VPCMPEQ %ymm8, %ymm0, %ymm4
 | ||||||
|  | -
 | ||||||
|  | -	VPCMPEQ %ymm5, %ymm9, %ymm5
 | ||||||
|  | -	VPCMPEQ %ymm6, %ymm9, %ymm6
 | ||||||
|  | -	VPCMPEQ %ymm7, %ymm9, %ymm7
 | ||||||
|  | -	VPCMPEQ %ymm8, %ymm9, %ymm8
 | ||||||
|  | -
 | ||||||
|  | -	vpor	%ymm1, %ymm5, %ymm1
 | ||||||
|  | -	vpor	%ymm2, %ymm6, %ymm2
 | ||||||
|  | -	vpor	%ymm3, %ymm7, %ymm3
 | ||||||
|  | -	vpor	%ymm4, %ymm8, %ymm4
 | ||||||
|  | -
 | ||||||
|  | -	vpor	%ymm1, %ymm2, %ymm5
 | ||||||
|  | -	vpor	%ymm3, %ymm4, %ymm6
 | ||||||
|  | -
 | ||||||
|  | -	vpor	%ymm5, %ymm6, %ymm5
 | ||||||
|  | -
 | ||||||
|  | -	vpmovmskb %ymm5, %eax
 | ||||||
|  | -	testl	%eax, %eax
 | ||||||
|  | -	jnz	L(4x_vec_end)
 | ||||||
|  | -
 | ||||||
|  | -	addq	$(VEC_SIZE * 4), %rdi
 | ||||||
|  | +	jz	L(prep_loop_4x)
 | ||||||
|  |   | ||||||
|  | -	jmp	L(loop_4x_vec)
 | ||||||
|  | +	tzcntl	%eax, %eax
 | ||||||
|  | +	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
 | ||||||
|  | +# ifndef USE_AS_STRCHRNUL
 | ||||||
|  | +	cmp (%rax), %CHAR_REG
 | ||||||
|  | +	cmovne	%rdx, %rax
 | ||||||
|  | +# endif
 | ||||||
|  | +	VZEROUPPER
 | ||||||
|  | +	ret
 | ||||||
|  |   | ||||||
|  |  	.p2align 4 | ||||||
|  |  L(first_vec_x0): | ||||||
|  | -	/* Found CHAR or the null byte.  */
 | ||||||
|  |  	tzcntl	%eax, %eax | ||||||
|  | -# ifdef USE_AS_STRCHRNUL
 | ||||||
|  | +	/* Found CHAR or the null byte.	 */
 | ||||||
|  |  	addq	%rdi, %rax | ||||||
|  | -# else
 | ||||||
|  | -	xorl	%edx, %edx
 | ||||||
|  | -	leaq	(%rdi, %rax), %rax
 | ||||||
|  | -	cmp	(%rax), %CHAR_REG
 | ||||||
|  | +# ifndef USE_AS_STRCHRNUL
 | ||||||
|  | +	cmp (%rax), %CHAR_REG
 | ||||||
|  |  	cmovne	%rdx, %rax | ||||||
|  |  # endif | ||||||
|  |  	VZEROUPPER_RETURN | ||||||
|  | @@ -199,13 +147,9 @@ L(first_vec_x0):
 | ||||||
|  |  	.p2align 4 | ||||||
|  |  L(first_vec_x1): | ||||||
|  |  	tzcntl	%eax, %eax | ||||||
|  | -# ifdef USE_AS_STRCHRNUL
 | ||||||
|  | -	addq	$VEC_SIZE, %rax
 | ||||||
|  | -	addq	%rdi, %rax
 | ||||||
|  | -# else
 | ||||||
|  | -	xorl	%edx, %edx
 | ||||||
|  |  	leaq	VEC_SIZE(%rdi, %rax), %rax | ||||||
|  | -	cmp	(%rax), %CHAR_REG
 | ||||||
|  | +# ifndef USE_AS_STRCHRNUL
 | ||||||
|  | +	cmp (%rax), %CHAR_REG
 | ||||||
|  |  	cmovne	%rdx, %rax | ||||||
|  |  # endif | ||||||
|  |  	VZEROUPPER_RETURN | ||||||
|  | @@ -213,42 +157,97 @@ L(first_vec_x1):
 | ||||||
|  |  	.p2align 4 | ||||||
|  |  L(first_vec_x2): | ||||||
|  |  	tzcntl	%eax, %eax | ||||||
|  | -# ifdef USE_AS_STRCHRNUL
 | ||||||
|  | -	addq	$(VEC_SIZE * 2), %rax
 | ||||||
|  | -	addq	%rdi, %rax
 | ||||||
|  | -# else
 | ||||||
|  | -	xorl	%edx, %edx
 | ||||||
|  | +	/* Found CHAR or the null byte.	 */
 | ||||||
|  |  	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax | ||||||
|  | -	cmp	(%rax), %CHAR_REG
 | ||||||
|  | +# ifndef USE_AS_STRCHRNUL
 | ||||||
|  | +	cmp (%rax), %CHAR_REG
 | ||||||
|  |  	cmovne	%rdx, %rax | ||||||
|  |  # endif | ||||||
|  |  	VZEROUPPER_RETURN | ||||||
|  |   | ||||||
|  | +L(prep_loop_4x):
 | ||||||
|  | +	/* Align data to 4 * VEC_SIZE.	*/
 | ||||||
|  | +	andq	$-(VEC_SIZE * 4), %rdi
 | ||||||
|  | +
 | ||||||
|  |  	.p2align 4 | ||||||
|  | -L(4x_vec_end):
 | ||||||
|  | +L(loop_4x_vec):
 | ||||||
|  | +	/* Compare 4 * VEC at a time forward.  */
 | ||||||
|  | +	vmovdqa	(VEC_SIZE * 4)(%rdi), %ymm5
 | ||||||
|  | +	vmovdqa	(VEC_SIZE * 5)(%rdi), %ymm6
 | ||||||
|  | +	vmovdqa	(VEC_SIZE * 6)(%rdi), %ymm7
 | ||||||
|  | +	vmovdqa	(VEC_SIZE * 7)(%rdi), %ymm8
 | ||||||
|  | +
 | ||||||
|  | +	/* Leaves only CHARS matching esi as 0.	 */
 | ||||||
|  | +	vpxor	%ymm5, %ymm0, %ymm1
 | ||||||
|  | +	vpxor	%ymm6, %ymm0, %ymm2
 | ||||||
|  | +	vpxor	%ymm7, %ymm0, %ymm3
 | ||||||
|  | +	vpxor	%ymm8, %ymm0, %ymm4
 | ||||||
|  | +
 | ||||||
|  | +	VPMINU	%ymm1, %ymm5, %ymm1
 | ||||||
|  | +	VPMINU	%ymm2, %ymm6, %ymm2
 | ||||||
|  | +	VPMINU	%ymm3, %ymm7, %ymm3
 | ||||||
|  | +	VPMINU	%ymm4, %ymm8, %ymm4
 | ||||||
|  | +
 | ||||||
|  | +	VPMINU	%ymm1, %ymm2, %ymm5
 | ||||||
|  | +	VPMINU	%ymm3, %ymm4, %ymm6
 | ||||||
|  | +
 | ||||||
|  | +	VPMINU	%ymm5, %ymm6, %ymm5
 | ||||||
|  | +
 | ||||||
|  | +	VPCMPEQ %ymm5, %ymm9, %ymm5
 | ||||||
|  | +	vpmovmskb %ymm5, %eax
 | ||||||
|  | +
 | ||||||
|  | +	addq	$(VEC_SIZE * 4), %rdi
 | ||||||
|  | +	testl	%eax, %eax
 | ||||||
|  | +	jz  L(loop_4x_vec)
 | ||||||
|  | +
 | ||||||
|  | +	VPCMPEQ %ymm1, %ymm9, %ymm1
 | ||||||
|  |  	vpmovmskb %ymm1, %eax | ||||||
|  |  	testl	%eax, %eax | ||||||
|  |  	jnz	L(first_vec_x0) | ||||||
|  | +
 | ||||||
|  | +	VPCMPEQ %ymm2, %ymm9, %ymm2
 | ||||||
|  |  	vpmovmskb %ymm2, %eax | ||||||
|  |  	testl	%eax, %eax | ||||||
|  |  	jnz	L(first_vec_x1) | ||||||
|  | -	vpmovmskb %ymm3, %eax
 | ||||||
|  | -	testl	%eax, %eax
 | ||||||
|  | -	jnz	L(first_vec_x2)
 | ||||||
|  | +
 | ||||||
|  | +	VPCMPEQ %ymm3, %ymm9, %ymm3
 | ||||||
|  | +	VPCMPEQ %ymm4, %ymm9, %ymm4
 | ||||||
|  | +	vpmovmskb %ymm3, %ecx
 | ||||||
|  |  	vpmovmskb %ymm4, %eax | ||||||
|  | +	salq	$32, %rax
 | ||||||
|  | +	orq %rcx, %rax
 | ||||||
|  | +	tzcntq  %rax, %rax
 | ||||||
|  | +	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
 | ||||||
|  | +# ifndef USE_AS_STRCHRNUL
 | ||||||
|  | +	cmp (%rax), %CHAR_REG
 | ||||||
|  | +	cmovne	%rdx, %rax
 | ||||||
|  | +# endif
 | ||||||
|  | +	VZEROUPPER
 | ||||||
|  | +	ret
 | ||||||
|  | +
 | ||||||
|  | +	/* Cold case for crossing page with first load.	 */
 | ||||||
|  | +	.p2align 4
 | ||||||
|  | +L(cross_page_boundary):
 | ||||||
|  | +	andq	$-VEC_SIZE, %rdi
 | ||||||
|  | +	andl	$(VEC_SIZE - 1), %ecx
 | ||||||
|  | +
 | ||||||
|  | +	vmovdqa	(%rdi), %ymm8
 | ||||||
|  | +	VPCMPEQ %ymm8, %ymm0, %ymm1
 | ||||||
|  | +	VPCMPEQ %ymm8, %ymm9, %ymm2
 | ||||||
|  | +	vpor	%ymm1, %ymm2, %ymm1
 | ||||||
|  | +	vpmovmskb %ymm1, %eax
 | ||||||
|  | +	/* Remove the leading bits.	 */
 | ||||||
|  | +	sarxl	%ecx, %eax, %eax
 | ||||||
|  |  	testl	%eax, %eax | ||||||
|  | -L(first_vec_x3):
 | ||||||
|  | +	jz	L(aligned_more)
 | ||||||
|  |  	tzcntl	%eax, %eax | ||||||
|  | -# ifdef USE_AS_STRCHRNUL
 | ||||||
|  | -	addq	$(VEC_SIZE * 3), %rax
 | ||||||
|  | +	addq	%rcx, %rdi
 | ||||||
|  |  	addq	%rdi, %rax | ||||||
|  | -# else
 | ||||||
|  | -	xorl	%edx, %edx
 | ||||||
|  | -	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
 | ||||||
|  | -	cmp	(%rax), %CHAR_REG
 | ||||||
|  | +# ifndef USE_AS_STRCHRNUL
 | ||||||
|  | +	cmp (%rax), %CHAR_REG
 | ||||||
|  |  	cmovne	%rdx, %rax | ||||||
|  |  # endif | ||||||
|  |  	VZEROUPPER_RETURN | ||||||
|  |   | ||||||
|  |  END (STRCHR) | ||||||
|  | -#endif
 | ||||||
|  | +# endif
 | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
 | ||||||
|  | index 7e582f02..5225bd4f 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/strchr.c
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/strchr.c
 | ||||||
|  | @@ -38,11 +38,11 @@ IFUNC_SELECTOR (void)
 | ||||||
|  |    const struct cpu_features* cpu_features = __get_cpu_features (); | ||||||
|  |   | ||||||
|  |    if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) | ||||||
|  | +      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
 | ||||||
|  |        && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) | ||||||
|  |      { | ||||||
|  |        if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) | ||||||
|  | -	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
 | ||||||
|  | -	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
 | ||||||
|  | +	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
 | ||||||
|  |  	return OPTIMIZE (evex); | ||||||
|  |   | ||||||
|  |        if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
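The new loop's xor+min combination replaces two compares plus a vpor per vector: b ^ c is zero exactly when b equals the target, so min(b ^ c, b) is zero exactly when b is the target or NUL, and one VPCMPEQ against zero then covers both terminators. A scalar C sketch of the per-byte identity, not glibc code:

    #include <stdint.h>

    /* Nonzero iff this byte ends the strchr scan: either the target
       character c or the terminating NUL.  VPMINU applies the same
       unsigned min lane-wise across a whole vector.  */
    static inline int
    byte_stops_search (uint8_t b, uint8_t c)
    {
      uint8_t x = b ^ c;            /* 0 iff b == c.  */
      uint8_t m = x < b ? x : b;    /* 0 iff b == c or b == 0.  */
      return m == 0;
    }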
							
								
								
									
SOURCES/glibc-RHEL-15696-38.patch  (67 lines, Normal file)
									
								
							| @ -0,0 +1,67 @@ | |||||||
|  | From 3ec5d83d2a237d39e7fd6ef7a0bc8ac4c171a4a5 Mon Sep 17 00:00:00 2001 | ||||||
|  | From: "H.J. Lu" <hjl.tools@gmail.com> | ||||||
|  | Date: Sat, 25 Jan 2020 14:19:40 -0800 | ||||||
|  | Subject: [PATCH] x86-64: Avoid rep movsb with short distance [BZ #27130] | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | When copying with "rep movsb", if the distance between source and | ||||||
|  | destination is N*4GB + [1..63] with N >= 0, performance may be very | ||||||
|  | slow.  This patch updates memmove-vec-unaligned-erms.S for AVX and | ||||||
|  | AVX512 versions with the distance in RCX: | ||||||
|  | 
 | ||||||
|  | 	cmpl	$63, %ecx | ||||||
|  | 	// Don't use "rep movsb" if ECX <= 63 | ||||||
|  | 	jbe	L(Don't use rep movsb") | ||||||
|  | 	Use "rep movsb" | ||||||
|  | 
 | ||||||
|  | Benchtests data with bench-memcpy, bench-memcpy-large, bench-memcpy-random | ||||||
|  | and bench-memcpy-walk on Skylake, Ice Lake and Tiger Lake show that its | ||||||
|  | performance impact is within noise range as "rep movsb" is only used for | ||||||
|  | data size >= 4KB. | ||||||
|  | ---
 | ||||||
|  |  .../multiarch/memmove-vec-unaligned-erms.S    | 21 +++++++++++++++++++ | ||||||
|  |  1 file changed, 21 insertions(+) | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
 | ||||||
|  | index 673b73aa..c475fed4 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
 | ||||||
|  | @@ -64,6 +64,13 @@
 | ||||||
|  |  # endif | ||||||
|  |  #endif | ||||||
|  |   | ||||||
|  | +/* Avoid short distance rep movsb only with non-SSE vector.  */
 | ||||||
|  | +#ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
 | ||||||
|  | +# define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
 | ||||||
|  | +#else
 | ||||||
|  | +# define AVOID_SHORT_DISTANCE_REP_MOVSB 0
 | ||||||
|  | +#endif
 | ||||||
|  | +
 | ||||||
|  |  #ifndef PREFETCH | ||||||
|  |  # define PREFETCH(addr) prefetcht0 addr | ||||||
|  |  #endif | ||||||
|  | @@ -255,7 +262,21 @@ L(movsb):
 | ||||||
|  |  	cmpq	%r9, %rdi | ||||||
|  |  	/* Avoid slow backward REP MOVSB.  */ | ||||||
|  |  	jb	L(more_8x_vec_backward) | ||||||
|  | +# if AVOID_SHORT_DISTANCE_REP_MOVSB
 | ||||||
|  | +	movq	%rdi, %rcx
 | ||||||
|  | +	subq	%rsi, %rcx
 | ||||||
|  | +	jmp	2f
 | ||||||
|  | +# endif
 | ||||||
|  |  1: | ||||||
|  | +# if AVOID_SHORT_DISTANCE_REP_MOVSB
 | ||||||
|  | +	movq	%rsi, %rcx
 | ||||||
|  | +	subq	%rdi, %rcx
 | ||||||
|  | +2:
 | ||||||
|  | +/* Avoid "rep movsb" if RCX, the distance between source and destination,
 | ||||||
|  | +   is N*4GB + [1..63] with N >= 0.  */
 | ||||||
|  | +	cmpl	$63, %ecx
 | ||||||
|  | +	jbe	L(more_2x_vec)	/* Avoid "rep movsb" if ECX <= 63.  */
 | ||||||
|  | +# endif
 | ||||||
|  |  	mov	%RDX_LP, %RCX_LP | ||||||
|  |  	rep movsb | ||||||
|  |  L(nop): | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
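A sketch of the added guard, not glibc code: only the low 32 bits of the distance reach the cmpl, so a distance of N*4GB + [1..63] with N >= 0 fails the test just like a plain short distance, which is exactly the slow pattern the commit message describes. Shown for one copy direction; the patch computes whichever of dst-src or src-dst suits the chosen direction.

    #include <stdint.h>

    static inline int
    short_distance_rep_movsb (const char *dst, const char *src)
    {
      /* Truncation to 32 bits is deliberate: it reproduces "cmpl $63, %ecx"
         applied to the 64-bit difference held in RCX.  */
      uint32_t dist = (uint32_t) (dst - src);
      return dist <= 63;  /* "jbe": skip rep movsb, use the vector loop.  */
    }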
							
								
								
									
SOURCES/glibc-RHEL-15696-39.patch  (449 lines, Normal file)
									
								
							| @ -0,0 +1,449 @@ | |||||||
|  | From 1a8605b6cd257e8a74e29b5b71c057211f5fb847 Mon Sep 17 00:00:00 2001 | ||||||
|  | From: noah <goldstein.w.n@gmail.com> | ||||||
|  | Date: Sat, 3 Apr 2021 04:12:15 -0400 | ||||||
|  | Subject: [PATCH] x86: Update large memcpy case in memmove-vec-unaligned-erms.S | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | No Bug. This commit updates the large memcpy case (no overlap). The | ||||||
|  | update is to perform memcpy on either 2 or 4 contiguous pages at | ||||||
|  | once. This 1) helps to alleviate the effects of false memory aliasing | ||||||
|  | when destination and source have a close 4k alignment and 2) in most | ||||||
|  | cases and for most DRAM units is a modestly more efficient access | ||||||
|  | pattern. These changes are a clear performance improvement for | ||||||
|  | VEC_SIZE =16/32, though more ambiguous for VEC_SIZE=64. test-memcpy, | ||||||
|  | test-memccpy, test-mempcpy, test-memmove, and tst-memmove-overflow all | ||||||
|  | pass. | ||||||
|  | 
 | ||||||
|  | Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  | ---
 | ||||||
|  |  .../multiarch/memmove-vec-unaligned-erms.S    | 338 ++++++++++++++---- | ||||||
|  |  1 file changed, 265 insertions(+), 73 deletions(-) | ||||||
|  | 
 | ||||||
|  | Conflicts: | ||||||
|  | 	sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S | ||||||
|  | 	(different number of sections) | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
 | ||||||
|  | index c475fed4..3e2dd6bc 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
 | ||||||
|  | @@ -32,7 +32,16 @@
 | ||||||
|  |        overlapping addresses. | ||||||
|  |     6. If size >= __x86_shared_non_temporal_threshold and there is no | ||||||
|  |        overlap between destination and source, use non-temporal store | ||||||
|  | -      instead of aligned store.  */
 | ||||||
|  | +      instead of aligned store copying from either 2 or 4 pages at
 | ||||||
|  | +      once.
 | ||||||
|  | +   8. For point 7) if size < 16 * __x86_shared_non_temporal_threshold
 | ||||||
|  | +      and source and destination do not page alias, copy from 2 pages
 | ||||||
|  | +      at once using non-temporal stores. Page aliasing in this case is
 | ||||||
|  | +      considered true if destination's page alignment - source's page
 | ||||||
|  | +      alignment is less than 8 * VEC_SIZE.
 | ||||||
|  | +   9. If size >= 16 * __x86_shared_non_temporal_threshold or source
 | ||||||
|  | +      and destination do page alias copy from 4 pages at once using
 | ||||||
|  | +      non-temporal stores.  */
 | ||||||
|  |   | ||||||
|  |  #include <sysdep.h> | ||||||
|  |   | ||||||
|  | @@ -64,6 +73,34 @@
 | ||||||
|  |  # endif | ||||||
|  |  #endif | ||||||
|  |   | ||||||
|  | +#ifndef PAGE_SIZE
 | ||||||
|  | +# define PAGE_SIZE 4096
 | ||||||
|  | +#endif
 | ||||||
|  | +
 | ||||||
|  | +#if PAGE_SIZE != 4096
 | ||||||
|  | +# error Unsupported PAGE_SIZE
 | ||||||
|  | +#endif
 | ||||||
|  | +
 | ||||||
|  | +#ifndef LOG_PAGE_SIZE
 | ||||||
|  | +# define LOG_PAGE_SIZE 12
 | ||||||
|  | +#endif
 | ||||||
|  | +
 | ||||||
|  | +#if PAGE_SIZE != (1 << LOG_PAGE_SIZE)
 | ||||||
|  | +# error Invalid LOG_PAGE_SIZE
 | ||||||
|  | +#endif
 | ||||||
|  | +
 | ||||||
|  | +/* Byte per page for large_memcpy inner loop.  */
 | ||||||
|  | +#if VEC_SIZE == 64
 | ||||||
|  | +# define LARGE_LOAD_SIZE (VEC_SIZE * 2)
 | ||||||
|  | +#else
 | ||||||
|  | +# define LARGE_LOAD_SIZE (VEC_SIZE * 4)
 | ||||||
|  | +#endif
 | ||||||
|  | +
 | ||||||
|  | +/* Amount to shift rdx by to compare for memcpy_large_4x.  */
 | ||||||
|  | +#ifndef LOG_4X_MEMCPY_THRESH
 | ||||||
|  | +# define LOG_4X_MEMCPY_THRESH 4
 | ||||||
|  | +#endif
 | ||||||
|  | +
 | ||||||
|  |  /* Avoid short distance rep movsb only with non-SSE vector.  */ | ||||||
|  |  #ifndef AVOID_SHORT_DISTANCE_REP_MOVSB | ||||||
|  |  # define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16) | ||||||
|  | @@ -103,6 +140,28 @@
 | ||||||
|  |  # error Unsupported PREFETCH_SIZE! | ||||||
|  |  #endif | ||||||
|  |   | ||||||
|  | +#if LARGE_LOAD_SIZE == (VEC_SIZE * 2)
 | ||||||
|  | +# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \
 | ||||||
|  | +	VMOVU	(offset)base, vec0; \
 | ||||||
|  | +	VMOVU	((offset) + VEC_SIZE)base, vec1;
 | ||||||
|  | +# define STORE_ONE_SET(base, offset, vec0, vec1, ...) \
 | ||||||
|  | +	VMOVNT  vec0, (offset)base; \
 | ||||||
|  | +	VMOVNT  vec1, ((offset) + VEC_SIZE)base;
 | ||||||
|  | +#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4)
 | ||||||
|  | +# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
 | ||||||
|  | +	VMOVU	(offset)base, vec0; \
 | ||||||
|  | +	VMOVU	((offset) + VEC_SIZE)base, vec1; \
 | ||||||
|  | +	VMOVU	((offset) + VEC_SIZE * 2)base, vec2; \
 | ||||||
|  | +	VMOVU	((offset) + VEC_SIZE * 3)base, vec3;
 | ||||||
|  | +# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
 | ||||||
|  | +	VMOVNT	vec0, (offset)base; \
 | ||||||
|  | +	VMOVNT	vec1, ((offset) + VEC_SIZE)base; \
 | ||||||
|  | +	VMOVNT	vec2, ((offset) + VEC_SIZE * 2)base; \
 | ||||||
|  | +	VMOVNT	vec3, ((offset) + VEC_SIZE * 3)base;
 | ||||||
|  | +#else
 | ||||||
|  | +# error Invalid LARGE_LOAD_SIZE
 | ||||||
|  | +#endif
 | ||||||
|  | +
 | ||||||
|  |  #ifndef SECTION | ||||||
|  |  # error SECTION is not defined! | ||||||
|  |  #endif | ||||||
|  | @@ -390,6 +449,15 @@ L(last_4x_vec):
 | ||||||
|  |  	VZEROUPPER_RETURN | ||||||
|  |   | ||||||
|  |  L(more_8x_vec): | ||||||
|  | +	/* Check if non-temporal move candidate.  */
 | ||||||
|  | +#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
 | ||||||
|  | +	/* Check non-temporal store threshold.  */
 | ||||||
|  | +	cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
 | ||||||
|  | +	ja	L(large_memcpy_2x)
 | ||||||
|  | +#endif
 | ||||||
|  | +	/* Entry if rdx is greater than non-temporal threshold but there
 | ||||||
|  | +       is overlap.  */
 | ||||||
|  | +L(more_8x_vec_check):
 | ||||||
|  |  	cmpq	%rsi, %rdi | ||||||
|  |  	ja	L(more_8x_vec_backward) | ||||||
|  |  	/* Source == destination is less common.  */ | ||||||
|  | @@ -416,24 +484,21 @@ L(more_8x_vec):
 | ||||||
|  |  	subq	%r8, %rdi | ||||||
|  |  	/* Adjust length.  */ | ||||||
|  |  	addq	%r8, %rdx | ||||||
|  | -#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
 | ||||||
|  | -	/* Check non-temporal store threshold.  */
 | ||||||
|  | -	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
 | ||||||
|  | -	ja	L(large_forward)
 | ||||||
|  | -#endif
 | ||||||
|  | +
 | ||||||
|  | +	.p2align 4
 | ||||||
|  |  L(loop_4x_vec_forward): | ||||||
|  |  	/* Copy 4 * VEC a time forward.  */ | ||||||
|  |  	VMOVU	(%rsi), %VEC(0) | ||||||
|  |  	VMOVU	VEC_SIZE(%rsi), %VEC(1) | ||||||
|  |  	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2) | ||||||
|  |  	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3) | ||||||
|  | -	addq	$(VEC_SIZE * 4), %rsi
 | ||||||
|  | -	subq	$(VEC_SIZE * 4), %rdx
 | ||||||
|  | +	subq	$-(VEC_SIZE * 4), %rsi
 | ||||||
|  | +	addq	$-(VEC_SIZE * 4), %rdx
 | ||||||
|  |  	VMOVA	%VEC(0), (%rdi) | ||||||
|  |  	VMOVA	%VEC(1), VEC_SIZE(%rdi) | ||||||
|  |  	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi) | ||||||
|  |  	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi) | ||||||
|  | -	addq	$(VEC_SIZE * 4), %rdi
 | ||||||
|  | +	subq	$-(VEC_SIZE * 4), %rdi
 | ||||||
|  |  	cmpq	$(VEC_SIZE * 4), %rdx | ||||||
|  |  	ja	L(loop_4x_vec_forward) | ||||||
|  |  	/* Store the last 4 * VEC.  */ | ||||||
|  | @@ -467,24 +532,21 @@ L(more_8x_vec_backward):
 | ||||||
|  |  	subq	%r8, %r9 | ||||||
|  |  	/* Adjust length.  */ | ||||||
|  |  	subq	%r8, %rdx | ||||||
|  | -#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
 | ||||||
|  | -	/* Check non-temporal store threshold.  */
 | ||||||
|  | -	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
 | ||||||
|  | -	ja	L(large_backward)
 | ||||||
|  | -#endif
 | ||||||
|  | +
 | ||||||
|  | +	.p2align 4
 | ||||||
|  |  L(loop_4x_vec_backward): | ||||||
|  |  	/* Copy 4 * VEC a time backward.  */ | ||||||
|  |  	VMOVU	(%rcx), %VEC(0) | ||||||
|  |  	VMOVU	-VEC_SIZE(%rcx), %VEC(1) | ||||||
|  |  	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2) | ||||||
|  |  	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3) | ||||||
|  | -	subq	$(VEC_SIZE * 4), %rcx
 | ||||||
|  | -	subq	$(VEC_SIZE * 4), %rdx
 | ||||||
|  | +	addq	$-(VEC_SIZE * 4), %rcx
 | ||||||
|  | +	addq	$-(VEC_SIZE * 4), %rdx
 | ||||||
|  |  	VMOVA	%VEC(0), (%r9) | ||||||
|  |  	VMOVA	%VEC(1), -VEC_SIZE(%r9) | ||||||
|  |  	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9) | ||||||
|  |  	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9) | ||||||
|  | -	subq	$(VEC_SIZE * 4), %r9
 | ||||||
|  | +	addq	$-(VEC_SIZE * 4), %r9
 | ||||||
|  |  	cmpq	$(VEC_SIZE * 4), %rdx | ||||||
|  |  	ja	L(loop_4x_vec_backward) | ||||||
|  |  	/* Store the first 4 * VEC.  */ | ||||||
|  | @@ -497,72 +559,202 @@ L(loop_4x_vec_backward):
 | ||||||
|  |  	VZEROUPPER_RETURN | ||||||
|  |   | ||||||
|  |  #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) | ||||||
|  | -L(large_forward):
 | ||||||
|  | +	.p2align 4
 | ||||||
|  | +L(large_memcpy_2x):
 | ||||||
|  | +	/* Compute absolute value of difference between source and
 | ||||||
|  | +	   destination.  */
 | ||||||
|  | +	movq	%rdi, %r9
 | ||||||
|  | +	subq	%rsi, %r9
 | ||||||
|  | +	movq	%r9, %r8
 | ||||||
|  | +	leaq	-1(%r9), %rcx
 | ||||||
|  | +	sarq	$63, %r8
 | ||||||
|  | +	xorq	%r8, %r9
 | ||||||
|  | +	subq	%r8, %r9
 | ||||||
|  |  	/* Don't use non-temporal store if there is overlap between | ||||||
|  | -	   destination and source since destination may be in cache
 | ||||||
|  | -	   when source is loaded.  */
 | ||||||
|  | -	leaq    (%rdi, %rdx), %r10
 | ||||||
|  | -	cmpq    %r10, %rsi
 | ||||||
|  | -	jb	L(loop_4x_vec_forward)
 | ||||||
|  | -L(loop_large_forward):
 | ||||||
|  | +	   destination and source since destination may be in cache when
 | ||||||
|  | +	   source is loaded.  */
 | ||||||
|  | +	cmpq	%r9, %rdx
 | ||||||
|  | +	ja	L(more_8x_vec_check)
 | ||||||
|  | +
 | ||||||
|  | +	/* Cache align destination. First store the first 64 bytes then
 | ||||||
|  | +	   adjust alignments.  */
 | ||||||
|  | +	VMOVU	(%rsi), %VEC(8)
 | ||||||
|  | +#if VEC_SIZE < 64
 | ||||||
|  | +	VMOVU	VEC_SIZE(%rsi), %VEC(9)
 | ||||||
|  | +#if VEC_SIZE < 32
 | ||||||
|  | +	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(10)
 | ||||||
|  | +	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(11)
 | ||||||
|  | +#endif
 | ||||||
|  | +#endif
 | ||||||
|  | +	VMOVU	%VEC(8), (%rdi)
 | ||||||
|  | +#if VEC_SIZE < 64
 | ||||||
|  | +	VMOVU	%VEC(9), VEC_SIZE(%rdi)
 | ||||||
|  | +#if VEC_SIZE < 32
 | ||||||
|  | +	VMOVU	%VEC(10), (VEC_SIZE * 2)(%rdi)
 | ||||||
|  | +	VMOVU	%VEC(11), (VEC_SIZE * 3)(%rdi)
 | ||||||
|  | +#endif
 | ||||||
|  | +#endif
 | ||||||
|  | +	/* Adjust source, destination, and size.  */
 | ||||||
|  | +	movq	%rdi, %r8
 | ||||||
|  | +	andq	$63, %r8
 | ||||||
|  | +	/* Get the negative of offset for alignment.  */
 | ||||||
|  | +	subq	$64, %r8
 | ||||||
|  | +	/* Adjust source.  */
 | ||||||
|  | +	subq	%r8, %rsi
 | ||||||
|  | +	/* Adjust destination which should be aligned now.  */
 | ||||||
|  | +	subq	%r8, %rdi
 | ||||||
|  | +	/* Adjust length.  */
 | ||||||
|  | +	addq	%r8, %rdx
 | ||||||
|  | +
 | ||||||
|  | +	/* Test if source and destination addresses will alias. If they
 | ||||||
|  | +	   do, the larger pipeline in large_memcpy_4x alleviates the
 | ||||||
|  | +	   performance drop.  */
 | ||||||
|  | +	testl	$(PAGE_SIZE - VEC_SIZE * 8), %ecx
 | ||||||
|  | +	jz	L(large_memcpy_4x)
 | ||||||
|  | +
 | ||||||
|  | +	movq	%rdx, %r10
 | ||||||
|  | +	shrq	$LOG_4X_MEMCPY_THRESH, %r10
 | ||||||
|  | +	cmp	__x86_shared_non_temporal_threshold(%rip), %r10
 | ||||||
|  | +	jae	L(large_memcpy_4x)
 | ||||||
|  | +
 | ||||||
|  | +	/* edx will store remainder size for copying tail.  */
 | ||||||
|  | +	andl	$(PAGE_SIZE * 2 - 1), %edx
 | ||||||
|  | +	/* r10 stores outer loop counter.  */
 | ||||||
|  | +	shrq	$((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10
 | ||||||
|  | +	/* Copy 4x VEC at a time from 2 pages.  */
 | ||||||
|  | +	.p2align 4
 | ||||||
|  | +L(loop_large_memcpy_2x_outer):
 | ||||||
|  | +	/* ecx stores inner loop counter.  */
 | ||||||
|  | +	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
 | ||||||
|  | +L(loop_large_memcpy_2x_inner):
 | ||||||
|  | +	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
 | ||||||
|  | +	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
 | ||||||
|  | +	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
 | ||||||
|  | +	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2)
 | ||||||
|  | +	/* Load vectors from rsi.  */
 | ||||||
|  | +	LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
 | ||||||
|  | +	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
 | ||||||
|  | +	subq	$-LARGE_LOAD_SIZE, %rsi
 | ||||||
|  | +	/* Non-temporal store vectors to rdi.  */
 | ||||||
|  | +	STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
 | ||||||
|  | +	STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
 | ||||||
|  | +	subq	$-LARGE_LOAD_SIZE, %rdi
 | ||||||
|  | +	decl	%ecx
 | ||||||
|  | +	jnz	L(loop_large_memcpy_2x_inner)
 | ||||||
|  | +	addq	$PAGE_SIZE, %rdi
 | ||||||
|  | +	addq	$PAGE_SIZE, %rsi
 | ||||||
|  | +	decq	%r10
 | ||||||
|  | +	jne	L(loop_large_memcpy_2x_outer)
 | ||||||
|  | +	sfence
 | ||||||
|  | +
 | ||||||
|  | +	/* Check if only last 4 loads are needed.  */
 | ||||||
|  | +	cmpl	$(VEC_SIZE * 4), %edx
 | ||||||
|  | +	jbe	L(large_memcpy_2x_end)
 | ||||||
|  | +
 | ||||||
|  | +	/* Handle the last 2 * PAGE_SIZE bytes.  */
 | ||||||
|  | +L(loop_large_memcpy_2x_tail):
 | ||||||
|  |  	/* Copy 4 * VEC a time forward with non-temporal stores.  */ | ||||||
|  | -	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
 | ||||||
|  | -	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
 | ||||||
|  | +	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
 | ||||||
|  | +	PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
 | ||||||
|  |  	VMOVU	(%rsi), %VEC(0) | ||||||
|  |  	VMOVU	VEC_SIZE(%rsi), %VEC(1) | ||||||
|  |  	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2) | ||||||
|  |  	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3) | ||||||
|  | -	addq	$PREFETCHED_LOAD_SIZE, %rsi
 | ||||||
|  | -	subq	$PREFETCHED_LOAD_SIZE, %rdx
 | ||||||
|  | -	VMOVNT	%VEC(0), (%rdi)
 | ||||||
|  | -	VMOVNT	%VEC(1), VEC_SIZE(%rdi)
 | ||||||
|  | -	VMOVNT	%VEC(2), (VEC_SIZE * 2)(%rdi)
 | ||||||
|  | -	VMOVNT	%VEC(3), (VEC_SIZE * 3)(%rdi)
 | ||||||
|  | -	addq	$PREFETCHED_LOAD_SIZE, %rdi
 | ||||||
|  | -	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
 | ||||||
|  | -	ja	L(loop_large_forward)
 | ||||||
|  | -	sfence
 | ||||||
|  | +	subq	$-(VEC_SIZE * 4), %rsi
 | ||||||
|  | +	addl	$-(VEC_SIZE * 4), %edx
 | ||||||
|  | +	VMOVA	%VEC(0), (%rdi)
 | ||||||
|  | +	VMOVA	%VEC(1), VEC_SIZE(%rdi)
 | ||||||
|  | +	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
 | ||||||
|  | +	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
 | ||||||
|  | +	subq	$-(VEC_SIZE * 4), %rdi
 | ||||||
|  | +	cmpl	$(VEC_SIZE * 4), %edx
 | ||||||
|  | +	ja	L(loop_large_memcpy_2x_tail)
 | ||||||
|  | +
 | ||||||
|  | +L(large_memcpy_2x_end):
 | ||||||
|  |  	/* Store the last 4 * VEC.  */ | ||||||
|  | -	VMOVU	%VEC(5), (%rcx)
 | ||||||
|  | -	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
 | ||||||
|  | -	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
 | ||||||
|  | -	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
 | ||||||
|  | -	/* Store the first VEC.  */
 | ||||||
|  | -	VMOVU	%VEC(4), (%r11)
 | ||||||
|  | +	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
 | ||||||
|  | +	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
 | ||||||
|  | +	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
 | ||||||
|  | +	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(3)
 | ||||||
|  | +
 | ||||||
|  | +	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
 | ||||||
|  | +	VMOVU	%VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
 | ||||||
|  | +	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
 | ||||||
|  | +	VMOVU	%VEC(3), -VEC_SIZE(%rdi, %rdx)
 | ||||||
|  |  	VZEROUPPER_RETURN | ||||||
|  |   | ||||||
|  | -L(large_backward):
 | ||||||
|  | -	/* Don't use non-temporal store if there is overlap between
 | ||||||
|  | -	   destination and source since destination may be in cache
 | ||||||
|  | -	   when source is loaded.  */
 | ||||||
|  | -	leaq    (%rcx, %rdx), %r10
 | ||||||
|  | -	cmpq    %r10, %r9
 | ||||||
|  | -	jb	L(loop_4x_vec_backward)
 | ||||||
|  | -L(loop_large_backward):
 | ||||||
|  | -	/* Copy 4 * VEC a time backward with non-temporal stores.  */
 | ||||||
|  | -	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
 | ||||||
|  | -	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
 | ||||||
|  | -	VMOVU	(%rcx), %VEC(0)
 | ||||||
|  | -	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
 | ||||||
|  | -	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
 | ||||||
|  | -	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
 | ||||||
|  | -	subq	$PREFETCHED_LOAD_SIZE, %rcx
 | ||||||
|  | -	subq	$PREFETCHED_LOAD_SIZE, %rdx
 | ||||||
|  | -	VMOVNT	%VEC(0), (%r9)
 | ||||||
|  | -	VMOVNT	%VEC(1), -VEC_SIZE(%r9)
 | ||||||
|  | -	VMOVNT	%VEC(2), -(VEC_SIZE * 2)(%r9)
 | ||||||
|  | -	VMOVNT	%VEC(3), -(VEC_SIZE * 3)(%r9)
 | ||||||
|  | -	subq	$PREFETCHED_LOAD_SIZE, %r9
 | ||||||
|  | -	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
 | ||||||
|  | -	ja	L(loop_large_backward)
 | ||||||
|  | +	.p2align 4
 | ||||||
|  | +L(large_memcpy_4x):
 | ||||||
|  | +	movq	%rdx, %r10
 | ||||||
|  | +	/* edx will store remainder size for copying tail.  */
 | ||||||
|  | +	andl	$(PAGE_SIZE * 4 - 1), %edx
 | ||||||
|  | +	/* r10 stores outer loop counter.  */
 | ||||||
|  | +	shrq	$(LOG_PAGE_SIZE + 2), %r10
 | ||||||
|  | +	/* Copy 4x VEC at a time from 4 pages.  */
 | ||||||
|  | +	.p2align 4
 | ||||||
|  | +L(loop_large_memcpy_4x_outer):
 | ||||||
|  | +	/* ecx stores inner loop counter.  */
 | ||||||
|  | +	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
 | ||||||
|  | +L(loop_large_memcpy_4x_inner):
 | ||||||
|  | +	/* Only one prefetch set per page, as doing 4 pages gives more
 | ||||||
|  | +	   time for the prefetcher to keep up.  */
 | ||||||
|  | +	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
 | ||||||
|  | +	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
 | ||||||
|  | +	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
 | ||||||
|  | +	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE)
 | ||||||
|  | +	/* Load vectors from rsi.  */
 | ||||||
|  | +	LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
 | ||||||
|  | +	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
 | ||||||
|  | +	LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
 | ||||||
|  | +	LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
 | ||||||
|  | +	subq	$-LARGE_LOAD_SIZE, %rsi
 | ||||||
|  | +	/* Non-temporal store vectors to rdi.  */
 | ||||||
|  | +	STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
 | ||||||
|  | +	STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
 | ||||||
|  | +	STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
 | ||||||
|  | +	STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
 | ||||||
|  | +	subq	$-LARGE_LOAD_SIZE, %rdi
 | ||||||
|  | +	decl	%ecx
 | ||||||
|  | +	jnz	L(loop_large_memcpy_4x_inner)
 | ||||||
|  | +	addq	$(PAGE_SIZE * 3), %rdi
 | ||||||
|  | +	addq	$(PAGE_SIZE * 3), %rsi
 | ||||||
|  | +	decq	%r10
 | ||||||
|  | +	jne	L(loop_large_memcpy_4x_outer)
 | ||||||
|  |  	sfence | ||||||
|  | -	/* Store the first 4 * VEC.  */
 | ||||||
|  | -	VMOVU	%VEC(4), (%rdi)
 | ||||||
|  | -	VMOVU	%VEC(5), VEC_SIZE(%rdi)
 | ||||||
|  | -	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
 | ||||||
|  | -	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
 | ||||||
|  | -	/* Store the last VEC.  */
 | ||||||
|  | -	VMOVU	%VEC(8), (%r11)
 | ||||||
|  | +	/* Check if only last 4 loads are needed.  */
 | ||||||
|  | +	cmpl	$(VEC_SIZE * 4), %edx
 | ||||||
|  | +	jbe	L(large_memcpy_4x_end)
 | ||||||
|  | +
 | ||||||
|  | +	/* Handle the last 4 * PAGE_SIZE bytes.  */
 | ||||||
|  | +L(loop_large_memcpy_4x_tail):
 | ||||||
|  | +	/* Copy 4 * VEC a time forward with non-temporal stores.  */
 | ||||||
|  | +	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
 | ||||||
|  | +	PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
 | ||||||
|  | +	VMOVU	(%rsi), %VEC(0)
 | ||||||
|  | +	VMOVU	VEC_SIZE(%rsi), %VEC(1)
 | ||||||
|  | +	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
 | ||||||
|  | +	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
 | ||||||
|  | +	subq	$-(VEC_SIZE * 4), %rsi
 | ||||||
|  | +	addl	$-(VEC_SIZE * 4), %edx
 | ||||||
|  | +	VMOVA	%VEC(0), (%rdi)
 | ||||||
|  | +	VMOVA	%VEC(1), VEC_SIZE(%rdi)
 | ||||||
|  | +	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
 | ||||||
|  | +	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
 | ||||||
|  | +	subq	$-(VEC_SIZE * 4), %rdi
 | ||||||
|  | +	cmpl	$(VEC_SIZE * 4), %edx
 | ||||||
|  | +	ja	L(loop_large_memcpy_4x_tail)
 | ||||||
|  | +
 | ||||||
|  | +L(large_memcpy_4x_end):
 | ||||||
|  | +	/* Store the last 4 * VEC.  */
 | ||||||
|  | +	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
 | ||||||
|  | +	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
 | ||||||
|  | +	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
 | ||||||
|  | +	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(3)
 | ||||||
|  | +
 | ||||||
|  | +	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
 | ||||||
|  | +	VMOVU	%VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
 | ||||||
|  | +	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
 | ||||||
|  | +	VMOVU	%VEC(3), -VEC_SIZE(%rdi, %rdx)
 | ||||||
|  |  	VZEROUPPER_RETURN | ||||||
|  |  #endif | ||||||
|  |  END (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
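For reference, the access pattern of L(loop_large_memcpy_2x_inner) can be
paraphrased in C with SSE2 intrinsics. This is a hedged sketch under my
own names, not the glibc code: it assumes dst is at least 16-byte aligned
(the patch aligns the destination to 64 bytes first) and a length that is
an exact number of page pairs, so tail handling is omitted:

    #include <emmintrin.h>  /* _mm_loadu_si128, _mm_stream_si128, _mm_sfence */
    #include <stddef.h>

    #define PAGE_SIZE 4096

    static void
    copy_2x_pages (char *dst, const char *src, size_t page_pairs)
    {
      while (page_pairs--)
        {
          /* Each iteration touches two source/destination pages that sit
             PAGE_SIZE apart, instead of streaming through one page at a
             time.  */
          for (size_t off = 0; off < PAGE_SIZE; off += 16)
            {
              __m128i a = _mm_loadu_si128 ((const __m128i *) (src + off));
              __m128i b = _mm_loadu_si128 ((const __m128i *) (src + off + PAGE_SIZE));
              _mm_stream_si128 ((__m128i *) (dst + off), a);  /* non-temporal */
              _mm_stream_si128 ((__m128i *) (dst + off + PAGE_SIZE), b);
            }
          src += 2 * PAGE_SIZE;
          dst += 2 * PAGE_SIZE;
        }
      _mm_sfence ();  /* order the streaming stores, as the patch does */
    }

Interleaving the two streams a page apart keeps consecutive non-temporal
stores out of each other's 4k alias sets, which is the false-aliasing
relief the commit message describes.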
							
								
								
									
SOURCES/glibc-RHEL-15696-4.patch (new file, 151 lines)
							| @ -0,0 +1,151 @@ | |||||||
|  | From ecd8b842cf37ea112e59cd9085ff1f1b6e208ae0 Mon Sep 17 00:00:00 2001 | ||||||
|  | From: "H.J. Lu" <hjl.tools@gmail.com> | ||||||
|  | Date: Mon, 21 Jan 2019 11:29:58 -0800 | ||||||
|  | Subject: [PATCH] x86-64 memrchr: Properly handle the length parameter [BZ# | ||||||
|  |  24097] | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | On x32, the size_t parameter may be passed in the lower 32 bits of a | ||||||
|  | 64-bit register whose upper 32 bits are non-zero.  The string/memory | ||||||
|  | functions written in assembly can only use the lower 32 bits of a | ||||||
|  | 64-bit register as length or must clear the upper 32 bits before using | ||||||
|  | the full 64-bit register for length. | ||||||
|  | 
 | ||||||
|  | This patch fixes memrchr for x32.  Tested on x86-64 and x32.  On x86-64, | ||||||
|  | libc.so is the same with and without the fix. | ||||||
|  | 
 | ||||||
|  | 	[BZ# 24097] | ||||||
|  | 	CVE-2019-6488 | ||||||
|  | 	* sysdeps/x86_64/memrchr.S: Use RDX_LP for length. | ||||||
|  | 	* sysdeps/x86_64/multiarch/memrchr-avx2.S: Likewise. | ||||||
|  | 	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memrchr. | ||||||
|  | 	* sysdeps/x86_64/x32/tst-size_t-memrchr.c: New file. | ||||||
|  | ---
 | ||||||
|  |  sysdeps/x86_64/memrchr.S                |  4 +- | ||||||
|  |  sysdeps/x86_64/multiarch/memrchr-avx2.S |  4 +- | ||||||
|  |  sysdeps/x86_64/x32/Makefile             |  3 +- | ||||||
|  |  sysdeps/x86_64/x32/tst-size_t-memrchr.c | 57 +++++++++++++++++++++++++ | ||||||
|  |  4 files changed, 63 insertions(+), 5 deletions(-) | ||||||
|  |  create mode 100644 sysdeps/x86_64/x32/tst-size_t-memrchr.c | ||||||
|  | 
 | ||||||
|  | Conflicts: | ||||||
|  | 	ChangeLog | ||||||
|  | 	(removed) | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S
 | ||||||
|  | index b8e3fa1d..dc82f8f7 100644
 | ||||||
|  | --- a/sysdeps/x86_64/memrchr.S
 | ||||||
|  | +++ b/sysdeps/x86_64/memrchr.S
 | ||||||
|  | @@ -24,13 +24,13 @@
 | ||||||
|  |  ENTRY (__memrchr) | ||||||
|  |  	movd	%esi, %xmm1 | ||||||
|  |   | ||||||
|  | -	sub	$16, %rdx
 | ||||||
|  | +	sub	$16, %RDX_LP
 | ||||||
|  |  	jbe	L(length_less16) | ||||||
|  |   | ||||||
|  |  	punpcklbw	%xmm1, %xmm1 | ||||||
|  |  	punpcklbw	%xmm1, %xmm1 | ||||||
|  |   | ||||||
|  | -	add	%rdx, %rdi
 | ||||||
|  | +	add	%RDX_LP, %RDI_LP
 | ||||||
|  |  	pshufd	$0, %xmm1, %xmm1 | ||||||
|  |   | ||||||
|  |  	movdqu	(%rdi), %xmm0 | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S
 | ||||||
|  | index b41a58bc..ce488dd9 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/memrchr-avx2.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S
 | ||||||
|  | @@ -32,10 +32,10 @@ ENTRY (__memrchr_avx2)
 | ||||||
|  |  	vmovd	%esi, %xmm0 | ||||||
|  |  	vpbroadcastb %xmm0, %ymm0 | ||||||
|  |   | ||||||
|  | -	subq	$VEC_SIZE, %rdx
 | ||||||
|  | +	sub	$VEC_SIZE, %RDX_LP
 | ||||||
|  |  	jbe	L(last_vec_or_less) | ||||||
|  |   | ||||||
|  | -	addq	%rdx, %rdi
 | ||||||
|  | +	add	%RDX_LP, %RDI_LP
 | ||||||
|  |   | ||||||
|  |  	/* Check the last VEC_SIZE bytes.  */ | ||||||
|  |  	vpcmpeqb (%rdi), %ymm0, %ymm1 | ||||||
|  | diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
 | ||||||
|  | index 2fe1e5ac..e99dbd7c 100644
 | ||||||
|  | --- a/sysdeps/x86_64/x32/Makefile
 | ||||||
|  | +++ b/sysdeps/x86_64/x32/Makefile
 | ||||||
|  | @@ -6,7 +6,8 @@ CFLAGS-s_llround.c += -fno-builtin-lround
 | ||||||
|  |  endif | ||||||
|  |   | ||||||
|  |  ifeq ($(subdir),string) | ||||||
|  | -tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy
 | ||||||
|  | +tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \
 | ||||||
|  | +	 tst-size_t-memrchr
 | ||||||
|  |  endif | ||||||
|  |   | ||||||
|  |  ifeq ($(subdir),wcsmbs) | ||||||
|  | diff --git a/sysdeps/x86_64/x32/tst-size_t-memrchr.c b/sysdeps/x86_64/x32/tst-size_t-memrchr.c
 | ||||||
|  | new file mode 100644 | ||||||
|  | index 00000000..c83699c0
 | ||||||
|  | --- /dev/null
 | ||||||
|  | +++ b/sysdeps/x86_64/x32/tst-size_t-memrchr.c
 | ||||||
|  | @@ -0,0 +1,57 @@
 | ||||||
|  | +/* Test memrchr with size_t in the lower 32 bits of 64-bit register.
 | ||||||
|  | +   Copyright (C) 2019 Free Software Foundation, Inc.
 | ||||||
|  | +   This file is part of the GNU C Library.
 | ||||||
|  | +
 | ||||||
|  | +   The GNU C Library is free software; you can redistribute it and/or
 | ||||||
|  | +   modify it under the terms of the GNU Lesser General Public
 | ||||||
|  | +   License as published by the Free Software Foundation; either
 | ||||||
|  | +   version 2.1 of the License, or (at your option) any later version.
 | ||||||
|  | +
 | ||||||
|  | +   The GNU C Library is distributed in the hope that it will be useful,
 | ||||||
|  | +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 | ||||||
|  | +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 | ||||||
|  | +   Lesser General Public License for more details.
 | ||||||
|  | +
 | ||||||
|  | +   You should have received a copy of the GNU Lesser General Public
 | ||||||
|  | +   License along with the GNU C Library; if not, see
 | ||||||
|  | +   <http://www.gnu.org/licenses/>.  */
 | ||||||
|  | +
 | ||||||
|  | +#define TEST_NAME "memrchr"
 | ||||||
|  | +#include "test-size_t.h"
 | ||||||
|  | +
 | ||||||
|  | +IMPL (memchr, 1)
 | ||||||
|  | +
 | ||||||
|  | +typedef void * (*proto_t) (const void *, int, size_t);
 | ||||||
|  | +
 | ||||||
|  | +static void *
 | ||||||
|  | +__attribute__ ((noinline, noclone))
 | ||||||
|  | +do_memrchr (parameter_t a, parameter_t b)
 | ||||||
|  | +{
 | ||||||
|  | +  return CALL (&b, a.p, (uintptr_t) b.p, a.len);
 | ||||||
|  | +}
 | ||||||
|  | +
 | ||||||
|  | +static int
 | ||||||
|  | +test_main (void)
 | ||||||
|  | +{
 | ||||||
|  | +  test_init ();
 | ||||||
|  | +
 | ||||||
|  | +  parameter_t src = { { page_size }, buf2 };
 | ||||||
|  | +  parameter_t c = { { 0 }, (void *) (uintptr_t) 0x12 };
 | ||||||
|  | +
 | ||||||
|  | +  int ret = 0;
 | ||||||
|  | +  FOR_EACH_IMPL (impl, 0)
 | ||||||
|  | +    {
 | ||||||
|  | +      c.fn = impl->fn;
 | ||||||
|  | +      void * res = do_memrchr (src, c);
 | ||||||
|  | +      if (res)
 | ||||||
|  | +	{
 | ||||||
|  | +	  error (0, 0, "Wrong result in function %s: %p != NULL",
 | ||||||
|  | +		 impl->name, res);
 | ||||||
|  | +	  ret = 1;
 | ||||||
|  | +	}
 | ||||||
|  | +    }
 | ||||||
|  | +
 | ||||||
|  | +  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
 | ||||||
|  | +}
 | ||||||
|  | +
 | ||||||
|  | +#include <support/test-driver.c>
 | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
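To make the failure mode concrete, here is an illustrative C model of the
bug class (mine, not the glibc test, which appears in full above): an
implementation that consumes the whole 64-bit register as the length
walks far out of bounds when the upper 32 bits hold junk, while one that
truncates to 32 bits, as RDX_LP arranges on x32, behaves correctly:

    #include <stdint.h>
    #include <stddef.h>

    /* "Buggy" model: treats the raw 64-bit register value as the length. */
    static const void *
    memrchr_raw64 (const void *s, int c, uint64_t raw_len)
    {
      const unsigned char *p = (const unsigned char *) s + raw_len;
      while (raw_len--)
        if (*--p == (unsigned char) c)
          return p;
      return 0;
    }

    int
    main (void)
    {
      char buf[32] = "haystack";
      /* On x32 a caller may leave junk like this in the upper half of
         the length register; only the low 32 bits (32) are the size_t.  */
      uint64_t raw = (0xdeadbeefULL << 32) | sizeof buf;
      /* Correct: use only the low 32 bits, as %edx (RDX_LP) would.  */
      const void *ok = memrchr_raw64 (buf, 'y', (uint32_t) raw);
      /* memrchr_raw64 (buf, 'y', raw) would read far below the buffer.  */
      return ok == buf + 2 ? 0 : 1;
    }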
							
								
								
									
SOURCES/glibc-RHEL-15696-40.patch (new file, 92 lines)
							| @ -0,0 +1,92 @@ | |||||||
|  | From 83c5b368226c34a2f0a5287df40fc290b2b34359 Mon Sep 17 00:00:00 2001 | ||||||
|  | From: "H.J. Lu" <hjl.tools@gmail.com> | ||||||
|  | Date: Mon, 19 Apr 2021 10:45:07 -0700 | ||||||
|  | Subject: [PATCH] x86-64: Require BMI2 for strchr-avx2.S | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | Since strchr-avx2.S updated by | ||||||
|  | 
 | ||||||
|  | commit 1f745ecc2109890886b161d4791e1406fdfc29b8 | ||||||
|  | Author: noah <goldstein.w.n@gmail.com> | ||||||
|  | Date:   Wed Feb 3 00:38:59 2021 -0500 | ||||||
|  | 
 | ||||||
|  |     x86-64: Refactor and improve performance of strchr-avx2.S | ||||||
|  | 
 | ||||||
|  | uses sarx: | ||||||
|  | 
 | ||||||
|  | c4 e2 72 f7 c0       	sarx   %ecx,%eax,%eax | ||||||
|  | 
 | ||||||
|  | for strchr-avx2 family functions, require BMI2 in ifunc-impl-list.c and | ||||||
|  | ifunc-avx2.h. | ||||||
|  | ---
 | ||||||
|  |  sysdeps/x86_64/multiarch/ifunc-avx2.h      |  4 ++-- | ||||||
|  |  sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 +++++++++--- | ||||||
|  |  2 files changed, 11 insertions(+), 5 deletions(-) | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/ifunc-avx2.h b/sysdeps/x86_64/multiarch/ifunc-avx2.h
 | ||||||
|  | index e0f30e61..ef72b73f 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/ifunc-avx2.h
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/ifunc-avx2.h
 | ||||||
|  | @@ -30,11 +30,11 @@ IFUNC_SELECTOR (void)
 | ||||||
|  |    const struct cpu_features* cpu_features = __get_cpu_features (); | ||||||
|  |   | ||||||
|  |    if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) | ||||||
|  | +      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
 | ||||||
|  |        && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) | ||||||
|  |      { | ||||||
|  |        if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) | ||||||
|  | -	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
 | ||||||
|  | -	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
 | ||||||
|  | +	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
 | ||||||
|  |  	return OPTIMIZE (evex); | ||||||
|  |   | ||||||
|  |        if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 | ||||||
|  | index 695cdba6..85b8863a 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 | ||||||
|  | @@ -400,10 +400,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 | ||||||
|  |    /* Support sysdeps/x86_64/multiarch/strchr.c.  */ | ||||||
|  |    IFUNC_IMPL (i, name, strchr, | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, strchr, | ||||||
|  | -			      CPU_FEATURE_USABLE (AVX2),
 | ||||||
|  | +			      (CPU_FEATURE_USABLE (AVX2)
 | ||||||
|  | +			       && CPU_FEATURE_USABLE (BMI2)),
 | ||||||
|  |  			      __strchr_avx2) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, strchr, | ||||||
|  |  			      (CPU_FEATURE_USABLE (AVX2) | ||||||
|  | +			       && CPU_FEATURE_USABLE (BMI2)
 | ||||||
|  |  			       && CPU_FEATURE_USABLE (RTM)), | ||||||
|  |  			      __strchr_avx2_rtm) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, strchr, | ||||||
|  | @@ -417,10 +419,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 | ||||||
|  |    /* Support sysdeps/x86_64/multiarch/strchrnul.c.  */ | ||||||
|  |    IFUNC_IMPL (i, name, strchrnul, | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, strchrnul, | ||||||
|  | -			      CPU_FEATURE_USABLE (AVX2),
 | ||||||
|  | +			      (CPU_FEATURE_USABLE (AVX2)
 | ||||||
|  | +			       && CPU_FEATURE_USABLE (BMI2)),
 | ||||||
|  |  			      __strchrnul_avx2) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, strchrnul, | ||||||
|  |  			      (CPU_FEATURE_USABLE (AVX2) | ||||||
|  | +			       && CPU_FEATURE_USABLE (BMI2)
 | ||||||
|  |  			       && CPU_FEATURE_USABLE (RTM)), | ||||||
|  |  			      __strchrnul_avx2_rtm) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, strchrnul, | ||||||
|  | @@ -574,10 +578,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 | ||||||
|  |    /* Support sysdeps/x86_64/multiarch/wcschr.c.  */ | ||||||
|  |    IFUNC_IMPL (i, name, wcschr, | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, wcschr, | ||||||
|  | -			      CPU_FEATURE_USABLE (AVX2),
 | ||||||
|  | +			      (CPU_FEATURE_USABLE (AVX2)
 | ||||||
|  | +			       && CPU_FEATURE_USABLE (BMI2)),
 | ||||||
|  |  			      __wcschr_avx2) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, wcschr, | ||||||
|  |  			      (CPU_FEATURE_USABLE (AVX2) | ||||||
|  | +			       && CPU_FEATURE_USABLE (BMI2)
 | ||||||
|  |  			       && CPU_FEATURE_USABLE (RTM)), | ||||||
|  |  			      __wcschr_avx2_rtm) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, wcschr, | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
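The selection rule the patch enforces can be mirrored outside glibc with
GCC's CPU feature builtins; a small hedged sketch (glibc itself uses its
internal CPU_FEATURE_USABLE machinery, not these builtins):

    #include <stdio.h>

    int
    main (void)
    {
      __builtin_cpu_init ();
      /* The rewritten strchr-avx2.S executes sarx, a BMI2 instruction,
         so AVX2 alone is no longer a sufficient condition.  */
      if (__builtin_cpu_supports ("avx2") && __builtin_cpu_supports ("bmi2"))
        puts ("__strchr_avx2 is eligible");
      else
        puts ("fall back to a non-AVX2 implementation");
      return 0;
    }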
							
								
								
									
SOURCES/glibc-RHEL-15696-41.patch (new file, 265 lines)
							| @ -0,0 +1,265 @@ | |||||||
|  | From f53790272ce7bdc5ecd14b45f65d0464d2a61a3a Mon Sep 17 00:00:00 2001 | ||||||
|  | From: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  | Date: Mon, 19 Apr 2021 17:48:10 -0400 | ||||||
|  | Subject: [PATCH] x86: Optimize less_vec evex and avx512 | ||||||
|  |  memset-vec-unaligned-erms.S | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | No bug. This commit adds an optimized path for the less_vec memset case | ||||||
|  | that uses the avx512vl/avx512bw mask store, avoiding the excessive | ||||||
|  | branches. test-memset and test-wmemset are passing. | ||||||
|  | 
 | ||||||
|  | Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  | ---
 | ||||||
|  |  sysdeps/x86_64/multiarch/ifunc-impl-list.c    | 40 ++++++++++----- | ||||||
|  |  sysdeps/x86_64/multiarch/ifunc-memset.h       |  6 ++- | ||||||
|  |  .../multiarch/memset-avx512-unaligned-erms.S  |  2 +- | ||||||
|  |  .../multiarch/memset-evex-unaligned-erms.S    |  2 +- | ||||||
|  |  .../multiarch/memset-vec-unaligned-erms.S     | 51 +++++++++++++++---- | ||||||
|  |  5 files changed, 74 insertions(+), 27 deletions(-) | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 | ||||||
|  | index 85b8863a..d59d65f8 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 | ||||||
|  | @@ -204,19 +204,23 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 | ||||||
|  |  			      __memset_chk_avx2_unaligned_erms_rtm) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, __memset_chk, | ||||||
|  |  			      (CPU_FEATURE_USABLE (AVX512VL) | ||||||
|  | -			       && CPU_FEATURE_USABLE (AVX512BW)),
 | ||||||
|  | +			       && CPU_FEATURE_USABLE (AVX512BW)
 | ||||||
|  | +			       && CPU_FEATURE_USABLE (BMI2)),
 | ||||||
|  |  			      __memset_chk_evex_unaligned) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, __memset_chk, | ||||||
|  |  			      (CPU_FEATURE_USABLE (AVX512VL) | ||||||
|  | -			       && CPU_FEATURE_USABLE (AVX512BW)),
 | ||||||
|  | +			       && CPU_FEATURE_USABLE (AVX512BW)
 | ||||||
|  | +			       && CPU_FEATURE_USABLE (BMI2)),
 | ||||||
|  |  			      __memset_chk_evex_unaligned_erms) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, __memset_chk, | ||||||
|  |  			      (CPU_FEATURE_USABLE (AVX512VL) | ||||||
|  | -			       && CPU_FEATURE_USABLE (AVX512BW)),
 | ||||||
|  | +			       && CPU_FEATURE_USABLE (AVX512BW)
 | ||||||
|  | +			       && CPU_FEATURE_USABLE (BMI2)),
 | ||||||
|  |  			      __memset_chk_avx512_unaligned_erms) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, __memset_chk, | ||||||
|  |  			      (CPU_FEATURE_USABLE (AVX512VL) | ||||||
|  | -			       && CPU_FEATURE_USABLE (AVX512BW)),
 | ||||||
|  | +			       && CPU_FEATURE_USABLE (AVX512BW)
 | ||||||
|  | +			       && CPU_FEATURE_USABLE (BMI2)),
 | ||||||
|  |  			      __memset_chk_avx512_unaligned) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, __memset_chk, | ||||||
|  |  			      CPU_FEATURE_USABLE (AVX512F), | ||||||
|  | @@ -247,19 +251,23 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 | ||||||
|  |  			      __memset_avx2_unaligned_erms_rtm) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, memset, | ||||||
|  |  			      (CPU_FEATURE_USABLE (AVX512VL) | ||||||
|  | -			       && CPU_FEATURE_USABLE (AVX512BW)),
 | ||||||
|  | +			       && CPU_FEATURE_USABLE (AVX512BW)
 | ||||||
|  | +			       && CPU_FEATURE_USABLE (BMI2)),
 | ||||||
|  |  			      __memset_evex_unaligned) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, memset, | ||||||
|  |  			      (CPU_FEATURE_USABLE (AVX512VL) | ||||||
|  | -			       && CPU_FEATURE_USABLE (AVX512BW)),
 | ||||||
|  | +			       && CPU_FEATURE_USABLE (AVX512BW)
 | ||||||
|  | +			       && CPU_FEATURE_USABLE (BMI2)),
 | ||||||
|  |  			      __memset_evex_unaligned_erms) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, memset, | ||||||
|  |  			      (CPU_FEATURE_USABLE (AVX512VL) | ||||||
|  | -			       && CPU_FEATURE_USABLE (AVX512BW)),
 | ||||||
|  | +			       && CPU_FEATURE_USABLE (AVX512BW)
 | ||||||
|  | +			       && CPU_FEATURE_USABLE (BMI2)),
 | ||||||
|  |  			      __memset_avx512_unaligned_erms) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, memset, | ||||||
|  |  			      (CPU_FEATURE_USABLE (AVX512VL) | ||||||
|  | -			       && CPU_FEATURE_USABLE (AVX512BW)),
 | ||||||
|  | +			       && CPU_FEATURE_USABLE (AVX512BW)
 | ||||||
|  | +			       && CPU_FEATURE_USABLE (BMI2)),
 | ||||||
|  |  			      __memset_avx512_unaligned) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, memset, | ||||||
|  |  			      CPU_FEATURE_USABLE (AVX512F), | ||||||
|  | @@ -739,10 +747,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 | ||||||
|  |  			       && CPU_FEATURE_USABLE (RTM)), | ||||||
|  |  			      __wmemset_avx2_unaligned_rtm) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, wmemset, | ||||||
|  | -			      CPU_FEATURE_USABLE (AVX512VL),
 | ||||||
|  | +			      (CPU_FEATURE_USABLE (AVX512VL)
 | ||||||
|  | +			       && CPU_FEATURE_USABLE (AVX512BW)
 | ||||||
|  | +			       && CPU_FEATURE_USABLE (BMI2)),
 | ||||||
|  |  			      __wmemset_evex_unaligned) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, wmemset, | ||||||
|  | -			      CPU_FEATURE_USABLE (AVX512VL),
 | ||||||
|  | +			      (CPU_FEATURE_USABLE (AVX512VL)
 | ||||||
|  | +			       && CPU_FEATURE_USABLE (AVX512BW)
 | ||||||
|  | +			       && CPU_FEATURE_USABLE (BMI2)),
 | ||||||
|  |  			      __wmemset_avx512_unaligned)) | ||||||
|  |   | ||||||
|  |  #ifdef SHARED | ||||||
|  | @@ -946,10 +958,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 | ||||||
|  |  			      CPU_FEATURE_USABLE (AVX2), | ||||||
|  |  			      __wmemset_chk_avx2_unaligned) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, __wmemset_chk, | ||||||
|  | -			      CPU_FEATURE_USABLE (AVX512VL),
 | ||||||
|  | +			      (CPU_FEATURE_USABLE (AVX512VL)
 | ||||||
|  | +			       && CPU_FEATURE_USABLE (AVX512BW)
 | ||||||
|  | +			       && CPU_FEATURE_USABLE (BMI2)),
 | ||||||
|  |  			      __wmemset_chk_evex_unaligned) | ||||||
|  |  	      IFUNC_IMPL_ADD (array, i, __wmemset_chk, | ||||||
|  | -			      CPU_FEATURE_USABLE (AVX512F),
 | ||||||
|  | +			      (CPU_FEATURE_USABLE (AVX512VL)
 | ||||||
|  | +			       && CPU_FEATURE_USABLE (AVX512BW)
 | ||||||
|  | +			       && CPU_FEATURE_USABLE (BMI2)),
 | ||||||
|  |  			      __wmemset_chk_avx512_unaligned)) | ||||||
|  |  #endif | ||||||
|  |   | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
 | ||||||
|  | index 19795938..100e3707 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/ifunc-memset.h
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
 | ||||||
|  | @@ -54,7 +54,8 @@ IFUNC_SELECTOR (void)
 | ||||||
|  |        && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)) | ||||||
|  |      { | ||||||
|  |        if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) | ||||||
|  | -	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
 | ||||||
|  | +          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
 | ||||||
|  | +          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
 | ||||||
|  |  	{ | ||||||
|  |  	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) | ||||||
|  |  	    return OPTIMIZE (avx512_unaligned_erms); | ||||||
|  | @@ -68,7 +69,8 @@ IFUNC_SELECTOR (void)
 | ||||||
|  |    if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)) | ||||||
|  |      { | ||||||
|  |        if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) | ||||||
|  | -	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
 | ||||||
|  | +          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
 | ||||||
|  | +          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
 | ||||||
|  |  	{ | ||||||
|  |  	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) | ||||||
|  |  	    return OPTIMIZE (evex_unaligned_erms); | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
 | ||||||
|  | index 22e7b187..8ad842fc 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
 | ||||||
|  | @@ -19,6 +19,6 @@
 | ||||||
|  |  # define SECTION(p)		p##.evex512 | ||||||
|  |  # define MEMSET_SYMBOL(p,s)	p##_avx512_##s | ||||||
|  |  # define WMEMSET_SYMBOL(p,s)	p##_avx512_##s | ||||||
|  | -
 | ||||||
|  | +# define USE_LESS_VEC_MASK_STORE	1
 | ||||||
|  |  # include "memset-vec-unaligned-erms.S" | ||||||
|  |  #endif | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
 | ||||||
|  | index ae0a4d6e..640f0929 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
 | ||||||
|  | @@ -19,6 +19,6 @@
 | ||||||
|  |  # define SECTION(p)		p##.evex | ||||||
|  |  # define MEMSET_SYMBOL(p,s)	p##_evex_##s | ||||||
|  |  # define WMEMSET_SYMBOL(p,s)	p##_evex_##s | ||||||
|  | -
 | ||||||
|  | +# define USE_LESS_VEC_MASK_STORE	1
 | ||||||
|  |  # include "memset-vec-unaligned-erms.S" | ||||||
|  |  #endif | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
 | ||||||
|  | index bae5cba4..f877ac9d 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
 | ||||||
|  | @@ -63,6 +63,8 @@
 | ||||||
|  |  # endif | ||||||
|  |  #endif | ||||||
|  |   | ||||||
|  | +#define PAGE_SIZE 4096
 | ||||||
|  | +
 | ||||||
|  |  #ifndef SECTION | ||||||
|  |  # error SECTION is not defined! | ||||||
|  |  #endif | ||||||
|  | @@ -213,11 +215,38 @@ L(loop):
 | ||||||
|  |  	cmpq	%rcx, %rdx | ||||||
|  |  	jne	L(loop) | ||||||
|  |  	VZEROUPPER_SHORT_RETURN | ||||||
|  | +
 | ||||||
|  | +	.p2align 4
 | ||||||
|  |  L(less_vec): | ||||||
|  |  	/* Less than 1 VEC.  */ | ||||||
|  |  # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 | ||||||
|  |  #  error Unsupported VEC_SIZE! | ||||||
|  |  # endif | ||||||
|  | +# ifdef USE_LESS_VEC_MASK_STORE
 | ||||||
|  | +	/* Clear high bits from edi. Only keeping bits relevant to page
 | ||||||
|  | +	   cross check. Note that we are using rax which is set in
 | ||||||
|  | +	   MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out.
 | ||||||
|  | +	 */
 | ||||||
|  | +	andl	$(PAGE_SIZE - 1), %edi
 | ||||||
|  | +	/* Check if the VEC_SIZE store crosses a page. Mask stores suffer
 | ||||||
|  | +	   serious performance degradation when they have to fault suppress.  */
 | ||||||
|  | +	cmpl	$(PAGE_SIZE - VEC_SIZE), %edi
 | ||||||
|  | +	ja	L(cross_page)
 | ||||||
|  | +# if VEC_SIZE > 32
 | ||||||
|  | +	movq	$-1, %rcx
 | ||||||
|  | +	bzhiq	%rdx, %rcx, %rcx
 | ||||||
|  | +	kmovq	%rcx, %k1
 | ||||||
|  | +# else
 | ||||||
|  | +	movl	$-1, %ecx
 | ||||||
|  | +	bzhil	%edx, %ecx, %ecx
 | ||||||
|  | +	kmovd	%ecx, %k1
 | ||||||
|  | +# endif
 | ||||||
|  | +	vmovdqu8	%VEC(0), (%rax) {%k1}
 | ||||||
|  | +	VZEROUPPER_RETURN
 | ||||||
|  | +
 | ||||||
|  | +	.p2align 4
 | ||||||
|  | +L(cross_page):
 | ||||||
|  | +# endif
 | ||||||
|  |  # if VEC_SIZE > 32 | ||||||
|  |  	cmpb	$32, %dl | ||||||
|  |  	jae	L(between_32_63) | ||||||
|  | @@ -234,36 +263,36 @@ L(less_vec):
 | ||||||
|  |  	cmpb	$1, %dl | ||||||
|  |  	ja	L(between_2_3) | ||||||
|  |  	jb	1f | ||||||
|  | -	movb	%cl, (%rdi)
 | ||||||
|  | +	movb	%cl, (%rax)
 | ||||||
|  |  1: | ||||||
|  |  	VZEROUPPER_RETURN | ||||||
|  |  # if VEC_SIZE > 32 | ||||||
|  |  	/* From 32 to 63.  No branch when size == 32.  */ | ||||||
|  |  L(between_32_63): | ||||||
|  | -	VMOVU	%YMM0, -32(%rdi,%rdx)
 | ||||||
|  | -	VMOVU	%YMM0, (%rdi)
 | ||||||
|  | +	VMOVU	%YMM0, -32(%rax,%rdx)
 | ||||||
|  | +	VMOVU	%YMM0, (%rax)
 | ||||||
|  |  	VZEROUPPER_RETURN | ||||||
|  |  # endif | ||||||
|  |  # if VEC_SIZE > 16 | ||||||
|  |  	/* From 16 to 31.  No branch when size == 16.  */ | ||||||
|  |  L(between_16_31): | ||||||
|  | -	VMOVU	%XMM0, -16(%rdi,%rdx)
 | ||||||
|  | -	VMOVU	%XMM0, (%rdi)
 | ||||||
|  | +	VMOVU	%XMM0, -16(%rax,%rdx)
 | ||||||
|  | +	VMOVU	%XMM0, (%rax)
 | ||||||
|  |  	VZEROUPPER_RETURN | ||||||
|  |  # endif | ||||||
|  |  	/* From 8 to 15.  No branch when size == 8.  */ | ||||||
|  |  L(between_8_15): | ||||||
|  | -	movq	%rcx, -8(%rdi,%rdx)
 | ||||||
|  | -	movq	%rcx, (%rdi)
 | ||||||
|  | +	movq	%rcx, -8(%rax,%rdx)
 | ||||||
|  | +	movq	%rcx, (%rax)
 | ||||||
|  |  	VZEROUPPER_RETURN | ||||||
|  |  L(between_4_7): | ||||||
|  |  	/* From 4 to 7.  No branch when size == 4.  */ | ||||||
|  | -	movl	%ecx, -4(%rdi,%rdx)
 | ||||||
|  | -	movl	%ecx, (%rdi)
 | ||||||
|  | +	movl	%ecx, -4(%rax,%rdx)
 | ||||||
|  | +	movl	%ecx, (%rax)
 | ||||||
|  |  	VZEROUPPER_RETURN | ||||||
|  |  L(between_2_3): | ||||||
|  |  	/* From 2 to 3.  No branch when size == 2.  */ | ||||||
|  | -	movw	%cx, -2(%rdi,%rdx)
 | ||||||
|  | -	movw	%cx, (%rdi)
 | ||||||
|  | +	movw	%cx, -2(%rax,%rdx)
 | ||||||
|  | +	movw	%cx, (%rax)
 | ||||||
|  |  	VZEROUPPER_RETURN | ||||||
|  |  END (MEMSET_SYMBOL (__memset, unaligned_erms)) | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
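The branch-elimination idea is easiest to see with intrinsics. Below is a
minimal sketch of the new less_vec path for the 32-byte (evex) variant,
with my own function name; it assumes n < 32 and that the page-cross
check the patch adds has already passed, and it needs
-mavx512vl -mavx512bw -mbmi2 to compile:

    #include <immintrin.h>
    #include <stddef.h>

    static void
    memset_less_vec (char *dst, int c, size_t n)  /* n < 32 */
    {
      __m256i v = _mm256_set1_epi8 ((char) c);
      /* bzhi builds a mask with the low n bits set (BMI2), replacing
         the ladder of size-range branches.  */
      __mmask32 k = _bzhi_u32 (~0u, (unsigned int) n);
      _mm256_mask_storeu_epi8 (dst, k, v);  /* single masked store */
    }

One masked store replaces the between_32_63/between_16_31/... cascade, at
the price of requiring AVX512VL, AVX512BW, and BMI2 at runtime, which is
exactly why the ifunc conditions gain the extra CPU_FEATURE_USABLE checks.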
							
								
								
									
SOURCES/glibc-RHEL-15696-42.patch (new file, 396 lines)
							| @ -0,0 +1,396 @@ | |||||||
|  | From ccabe7971f508709d034b63b8672f6f751a3d356 Mon Sep 17 00:00:00 2001 | ||||||
|  | From: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  | Date: Fri, 23 Apr 2021 15:56:24 -0400 | ||||||
|  | Subject: [PATCH] x86: Optimize strchr-avx2.S | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | No bug. This commit optimizes strchr-avx2.S. The optimizations are all | ||||||
|  | small things such as saving an ALU in the alignment process, saving a | ||||||
|  | few instructions in the loop return, saving some bytes in the main | ||||||
|  | loop, and increasing the ILP in the return cases. test-strchr, | ||||||
|  | test-strchrnul, test-wcschr, and test-wcschrnul are all passing. | ||||||
|  | 
 | ||||||
|  | Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  | ---
 | ||||||
|  |  sysdeps/x86_64/multiarch/strchr-avx2.S | 290 +++++++++++++++---------- | ||||||
|  |  1 file changed, 170 insertions(+), 120 deletions(-) | ||||||
|  | 
 | ||||||
|  | Conflicts: | ||||||
|  | 	sysdeps/x86_64/multiarch/strchr-avx2.S | ||||||
|  | 	(rearranged to account for branch changes) | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
 | ||||||
|  | index 919d256c..5884726b 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/strchr-avx2.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
 | ||||||
|  | @@ -49,133 +49,144 @@
 | ||||||
|  |   | ||||||
|  |  	.section SECTION(.text),"ax",@progbits | ||||||
|  |  ENTRY (STRCHR) | ||||||
|  | -	movl	%edi, %ecx
 | ||||||
|  | -# ifndef USE_AS_STRCHRNUL
 | ||||||
|  | -	xorl	%edx, %edx
 | ||||||
|  | -# endif
 | ||||||
|  | -
 | ||||||
|  |  	/* Broadcast CHAR to YMM0.	*/ | ||||||
|  |  	vmovd	%esi, %xmm0 | ||||||
|  | +	movl	%edi, %eax
 | ||||||
|  | +	andl	$(PAGE_SIZE - 1), %eax
 | ||||||
|  | +	VPBROADCAST	%xmm0, %ymm0
 | ||||||
|  |  	vpxor	%xmm9, %xmm9, %xmm9 | ||||||
|  | -	VPBROADCAST %xmm0, %ymm0
 | ||||||
|  |   | ||||||
|  |  	/* Check if we cross page boundary with one vector load.  */ | ||||||
|  | -	andl	$(PAGE_SIZE - 1), %ecx
 | ||||||
|  | -	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
 | ||||||
|  | -	ja  L(cross_page_boundary)
 | ||||||
|  | +	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 | ||||||
|  | +	ja	L(cross_page_boundary)
 | ||||||
|  |   | ||||||
|  |  	/* Check the first VEC_SIZE bytes.	Search for both CHAR and the | ||||||
|  |  	   null byte.  */ | ||||||
|  |  	vmovdqu	(%rdi), %ymm8 | ||||||
|  | -	VPCMPEQ %ymm8, %ymm0, %ymm1
 | ||||||
|  | -	VPCMPEQ %ymm8, %ymm9, %ymm2
 | ||||||
|  | +	VPCMPEQ	%ymm8, %ymm0, %ymm1
 | ||||||
|  | +	VPCMPEQ	%ymm8, %ymm9, %ymm2
 | ||||||
|  |  	vpor	%ymm1, %ymm2, %ymm1 | ||||||
|  |  	vpmovmskb %ymm1, %eax | ||||||
|  |  	testl	%eax, %eax | ||||||
|  | -	jz	L(more_vecs)
 | ||||||
|  | +	jz	L(aligned_more)
 | ||||||
|  |  	tzcntl	%eax, %eax | ||||||
|  | +# ifndef USE_AS_STRCHRNUL
 | ||||||
|  |  	/* Found CHAR or the null byte.	 */ | ||||||
|  | +	cmp	(%rdi, %rax), %CHAR_REG
 | ||||||
|  | +	jne	L(zero)
 | ||||||
|  | +# endif
 | ||||||
|  |  	addq	%rdi, %rax | ||||||
|  | +	VZEROUPPER_RETURN
 | ||||||
|  | +
 | ||||||
|  | +	/* .p2align 5 helps keep performance more consistent if ENTRY()
 | ||||||
|  | +	   alignment % 32 was either 16 or 0. It also makes the
 | ||||||
|  | +	   alignment % 32 of the loop_4x_vec fixed, which makes tuning it
 | ||||||
|  | +	   easier.  */
 | ||||||
|  | +	.p2align 5
 | ||||||
|  | +L(first_vec_x4):
 | ||||||
|  | +	tzcntl	%eax, %eax
 | ||||||
|  | +	addq	$(VEC_SIZE * 3 + 1), %rdi
 | ||||||
|  |  # ifndef USE_AS_STRCHRNUL | ||||||
|  | -	cmp (%rax), %CHAR_REG
 | ||||||
|  | -	cmovne	%rdx, %rax
 | ||||||
|  | +	/* Found CHAR or the null byte.	 */
 | ||||||
|  | +	cmp	(%rdi, %rax), %CHAR_REG
 | ||||||
|  | +	jne	L(zero)
 | ||||||
|  |  # endif | ||||||
|  | -L(return_vzeroupper):
 | ||||||
|  | -	ZERO_UPPER_VEC_REGISTERS_RETURN
 | ||||||
|  | -
 | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(more_vecs):
 | ||||||
|  | -	/* Align data for aligned loads in the loop.  */
 | ||||||
|  | -	andq	$-VEC_SIZE, %rdi
 | ||||||
|  | -L(aligned_more):
 | ||||||
|  | -
 | ||||||
|  | -	/* Check the next 4 * VEC_SIZE.	 Only one VEC_SIZE at a time
 | ||||||
|  | -	   since data is only aligned to VEC_SIZE.	*/
 | ||||||
|  | -	vmovdqa	VEC_SIZE(%rdi), %ymm8
 | ||||||
|  | -	addq	$VEC_SIZE, %rdi
 | ||||||
|  | -	VPCMPEQ %ymm8, %ymm0, %ymm1
 | ||||||
|  | -	VPCMPEQ %ymm8, %ymm9, %ymm2
 | ||||||
|  | -	vpor	%ymm1, %ymm2, %ymm1
 | ||||||
|  | -	vpmovmskb %ymm1, %eax
 | ||||||
|  | -	testl	%eax, %eax
 | ||||||
|  | -	jnz	L(first_vec_x0)
 | ||||||
|  | -
 | ||||||
|  | -	vmovdqa	VEC_SIZE(%rdi), %ymm8
 | ||||||
|  | -	VPCMPEQ %ymm8, %ymm0, %ymm1
 | ||||||
|  | -	VPCMPEQ %ymm8, %ymm9, %ymm2
 | ||||||
|  | -	vpor	%ymm1, %ymm2, %ymm1
 | ||||||
|  | -	vpmovmskb %ymm1, %eax
 | ||||||
|  | -	testl	%eax, %eax
 | ||||||
|  | -	jnz	L(first_vec_x1)
 | ||||||
|  | -
 | ||||||
|  | -	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm8
 | ||||||
|  | -	VPCMPEQ %ymm8, %ymm0, %ymm1
 | ||||||
|  | -	VPCMPEQ %ymm8, %ymm9, %ymm2
 | ||||||
|  | -	vpor	%ymm1, %ymm2, %ymm1
 | ||||||
|  | -	vpmovmskb %ymm1, %eax
 | ||||||
|  | -	testl	%eax, %eax
 | ||||||
|  | -	jnz	L(first_vec_x2)
 | ||||||
|  | -
 | ||||||
|  | -	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
 | ||||||
|  | -	VPCMPEQ %ymm8, %ymm0, %ymm1
 | ||||||
|  | -	VPCMPEQ %ymm8, %ymm9, %ymm2
 | ||||||
|  | -	vpor	%ymm1, %ymm2, %ymm1
 | ||||||
|  | -	vpmovmskb %ymm1, %eax
 | ||||||
|  | -	testl	%eax, %eax
 | ||||||
|  | -	jz	L(prep_loop_4x)
 | ||||||
|  | +	addq	%rdi, %rax
 | ||||||
|  | +	VZEROUPPER_RETURN
 | ||||||
|  |   | ||||||
|  | -	tzcntl	%eax, %eax
 | ||||||
|  | -	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
 | ||||||
|  |  # ifndef USE_AS_STRCHRNUL | ||||||
|  | -	cmp (%rax), %CHAR_REG
 | ||||||
|  | -	cmovne	%rdx, %rax
 | ||||||
|  | +L(zero):
 | ||||||
|  | +	xorl	%eax, %eax
 | ||||||
|  | +	VZEROUPPER_RETURN
 | ||||||
|  |  # endif | ||||||
|  | -	VZEROUPPER
 | ||||||
|  | -	ret
 | ||||||
|  | +
 | ||||||
|  |   | ||||||
|  |  	.p2align 4 | ||||||
|  | -L(first_vec_x0):
 | ||||||
|  | +L(first_vec_x1):
 | ||||||
|  |  	tzcntl	%eax, %eax | ||||||
|  | -	/* Found CHAR or the null byte.	 */
 | ||||||
|  | -	addq	%rdi, %rax
 | ||||||
|  | +	incq	%rdi
 | ||||||
|  |  # ifndef USE_AS_STRCHRNUL | ||||||
|  | -	cmp (%rax), %CHAR_REG
 | ||||||
-	cmovne	%rdx, %rax
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero)
 # endif
+	addq	%rdi, %rax
 	VZEROUPPER_RETURN
 
 	.p2align 4
-L(first_vec_x1):
+L(first_vec_x2):
 	tzcntl	%eax, %eax
-	leaq	VEC_SIZE(%rdi, %rax), %rax
+	addq	$(VEC_SIZE + 1), %rdi
 # ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero)
 # endif
+	addq	%rdi, %rax
 	VZEROUPPER_RETURN
 
 	.p2align 4
-L(first_vec_x2):
+L(first_vec_x3):
 	tzcntl	%eax, %eax
-	/* Found CHAR or the null byte.	 */
-	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
+	addq	$(VEC_SIZE * 2 + 1), %rdi
 # ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero)
 # endif
+	addq	%rdi, %rax
 	VZEROUPPER_RETURN
 
-L(prep_loop_4x):
-	/* Align data to 4 * VEC_SIZE.	*/
-	andq	$-(VEC_SIZE * 4), %rdi
+	.p2align 4
+L(aligned_more):
+	/* Align data to VEC_SIZE - 1. This is the same number of
+	   instructions as using andq -VEC_SIZE but saves 4 bytes of code
+	   on x4 check.  */
+	orq	$(VEC_SIZE - 1), %rdi
+L(cross_page_continue):
+	/* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+	   since data is only aligned to VEC_SIZE.  */
+	vmovdqa	1(%rdi), %ymm8
+	VPCMPEQ	%ymm8, %ymm0, %ymm1
+	VPCMPEQ	%ymm8, %ymm9, %ymm2
+	vpor	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x1)
+
+	vmovdqa	(VEC_SIZE + 1)(%rdi), %ymm8
+	VPCMPEQ	%ymm8, %ymm0, %ymm1
+	VPCMPEQ	%ymm8, %ymm9, %ymm2
+	vpor	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x2)
+
+	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm8
+	VPCMPEQ	%ymm8, %ymm0, %ymm1
+	VPCMPEQ	%ymm8, %ymm9, %ymm2
+	vpor	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x3)
 
+	vmovdqa	(VEC_SIZE * 3 + 1)(%rdi), %ymm8
+	VPCMPEQ	%ymm8, %ymm0, %ymm1
+	VPCMPEQ	%ymm8, %ymm9, %ymm2
+	vpor	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x4)
+	/* Align data to VEC_SIZE * 4 - 1.	*/
+	addq	$(VEC_SIZE * 4 + 1), %rdi
+	andq	$-(VEC_SIZE * 4), %rdi
 	.p2align 4
 L(loop_4x_vec):
 	/* Compare 4 * VEC at a time forward.  */
-	vmovdqa	(VEC_SIZE * 4)(%rdi), %ymm5
-	vmovdqa	(VEC_SIZE * 5)(%rdi), %ymm6
-	vmovdqa	(VEC_SIZE * 6)(%rdi), %ymm7
-	vmovdqa	(VEC_SIZE * 7)(%rdi), %ymm8
+	vmovdqa	(%rdi), %ymm5
+	vmovdqa	(VEC_SIZE)(%rdi), %ymm6
+	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm7
+	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
 
 	/* Leaves only CHARS matching esi as 0.	 */
 	vpxor	%ymm5, %ymm0, %ymm1
@@ -191,63 +202,102 @@ L(loop_4x_vec):
 	VPMINU	%ymm1, %ymm2, %ymm5
 	VPMINU	%ymm3, %ymm4, %ymm6
 
-	VPMINU	%ymm5, %ymm6, %ymm5
+	VPMINU	%ymm5, %ymm6, %ymm6
 
-	VPCMPEQ %ymm5, %ymm9, %ymm5
-	vpmovmskb %ymm5, %eax
+	VPCMPEQ	%ymm6, %ymm9, %ymm6
+	vpmovmskb %ymm6, %ecx
+	subq	$-(VEC_SIZE * 4), %rdi
+	testl	%ecx, %ecx
+	jz	L(loop_4x_vec)
 
-	addq	$(VEC_SIZE * 4), %rdi
-	testl	%eax, %eax
-	jz  L(loop_4x_vec)
 
-	VPCMPEQ %ymm1, %ymm9, %ymm1
+	VPCMPEQ	%ymm1, %ymm9, %ymm1
 	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
-	jnz	L(first_vec_x0)
+	jnz	L(last_vec_x0)
+
 
-	VPCMPEQ %ymm2, %ymm9, %ymm2
+	VPCMPEQ	%ymm5, %ymm9, %ymm2
 	vpmovmskb %ymm2, %eax
 	testl	%eax, %eax
-	jnz	L(first_vec_x1)
+	jnz	L(last_vec_x1)
+
+	VPCMPEQ	%ymm3, %ymm9, %ymm3
+	vpmovmskb %ymm3, %eax
+	/* rcx has combined result from all 4 VEC. It will only be used
+	   if the first 3 other VEC all did not contain a match.  */
+	salq	$32, %rcx
+	orq	%rcx, %rax
+	tzcntq	%rax, %rax
+	subq	$(VEC_SIZE * 2), %rdi
+# ifndef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero_end)
+# endif
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
+
+
+	.p2align 4
+L(last_vec_x0):
+	tzcntl	%eax, %eax
+	addq	$-(VEC_SIZE * 4), %rdi
+# ifndef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero_end)
+# endif
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
 
-	VPCMPEQ %ymm3, %ymm9, %ymm3
-	VPCMPEQ %ymm4, %ymm9, %ymm4
-	vpmovmskb %ymm3, %ecx
-	vpmovmskb %ymm4, %eax
-	salq	$32, %rax
-	orq %rcx, %rax
-	tzcntq  %rax, %rax
-	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
 # ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+L(zero_end):
+	xorl	%eax, %eax
+	VZEROUPPER_RETURN
 # endif
-	VZEROUPPER
-	ret
+
+	.p2align 4
+L(last_vec_x1):
+	tzcntl	%eax, %eax
+	subq	$(VEC_SIZE * 3), %rdi
+# ifndef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero_end)
+# endif
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
+
 
 	/* Cold case for crossing page with first load.	 */
 	.p2align 4
 L(cross_page_boundary):
-	andq	$-VEC_SIZE, %rdi
-	andl	$(VEC_SIZE - 1), %ecx
-
-	vmovdqa	(%rdi), %ymm8
-	VPCMPEQ %ymm8, %ymm0, %ymm1
-	VPCMPEQ %ymm8, %ymm9, %ymm2
+	movq	%rdi, %rdx
+	/* Align rdi to VEC_SIZE - 1.  */
+	orq	$(VEC_SIZE - 1), %rdi
+	vmovdqa	-(VEC_SIZE - 1)(%rdi), %ymm8
+	VPCMPEQ	%ymm8, %ymm0, %ymm1
+	VPCMPEQ	%ymm8, %ymm9, %ymm2
 	vpor	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %eax
-	/* Remove the leading bits.	 */
-	sarxl	%ecx, %eax, %eax
+	/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
+	   so no need to manually mod edx.  */
+	sarxl	%edx, %eax, %eax
 	testl	%eax, %eax
-	jz	L(aligned_more)
+	jz	L(cross_page_continue)
 	tzcntl	%eax, %eax
-	addq	%rcx, %rdi
-	addq	%rdi, %rax
 # ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	xorl	%ecx, %ecx
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdx, %rax), %CHAR_REG
+	leaq	(%rdx, %rax), %rax
+	cmovne	%rcx, %rax
+# else
+	addq	%rdx, %rax
 # endif
-	VZEROUPPER_RETURN
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
 
 END (STRCHR)
 # endif
-- 
GitLab

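Both the aligned_more path above (orq $(VEC_SIZE - 1), %rdi) and the cold cross_page_boundary path exist to protect one invariant: a VEC_SIZE-byte unaligned load is safe only if it cannot spill into the next page. A rough C model of the entry-path test, written for illustration (the names are mine, not glibc's):

#include <stdbool.h>
#include <stdint.h>

#define PAGE_SIZE 4096
#define VEC_SIZE 32

/* Mirrors the assembly sequence
       andl  $(PAGE_SIZE - 1), %eax
       cmpl  $(PAGE_SIZE - VEC_SIZE), %eax
       ja    L(cross_page_boundary)
   If the offset within the page is past PAGE_SIZE - VEC_SIZE, a
   VEC_SIZE-byte load starting at S would cross into the next page,
   which may be unmapped, so the cold path must be taken.  */
static bool
load_would_cross_page (const void *s)
{
  return ((uintptr_t) s & (PAGE_SIZE - 1)) > PAGE_SIZE - VEC_SIZE;
}

Reading a full vector is otherwise always permitted, even past the null terminator, because the over-read stays within the same page.
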
SOURCES/glibc-RHEL-15696-43.patch (new file, 532 lines)
@@ -0,0 +1,532 @@
From 7f3e7c262cab4e2401e4331a6ef29c428de02044 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri, 23 Apr 2021 15:56:25 -0400
Subject: [PATCH] x86: Optimize strchr-evex.S
Content-type: text/plain; charset=UTF-8

No bug. This commit optimizes strchr-evex.S. The optimizations are
mostly small things such as saving an ALU in the alignment process
and saving a few instructions in the loop return. The one significant
change is saving 2 instructions in the 4x loop. test-strchr,
test-strchrnul, test-wcschr, and test-wcschrnul are all passing.

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
 sysdeps/x86_64/multiarch/strchr-evex.S | 392 ++++++++++++++-----------
 1 file changed, 218 insertions(+), 174 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S
index ddc86a70..7f9d4ee4 100644
--- a/sysdeps/x86_64/multiarch/strchr-evex.S
+++ b/sysdeps/x86_64/multiarch/strchr-evex.S
@@ -32,13 +32,15 @@
 #  define VPCMP		vpcmpd
 #  define VPMINU	vpminud
 #  define CHAR_REG	esi
-#  define SHIFT_REG	r8d
+#  define SHIFT_REG	ecx
+#  define CHAR_SIZE	4
 # else
 #  define VPBROADCAST	vpbroadcastb
 #  define VPCMP		vpcmpb
 #  define VPMINU	vpminub
 #  define CHAR_REG	sil
-#  define SHIFT_REG	ecx
+#  define SHIFT_REG	edx
+#  define CHAR_SIZE	1
 # endif
 
 # define XMMZERO	xmm16
@@ -56,23 +58,20 @@
 
 # define VEC_SIZE 32
 # define PAGE_SIZE 4096
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
 
 	.section .text.evex,"ax",@progbits
 ENTRY (STRCHR)
-	movl	%edi, %ecx
-# ifndef USE_AS_STRCHRNUL
-	xorl	%edx, %edx
-# endif
-
 	/* Broadcast CHAR to YMM0.	*/
-	VPBROADCAST %esi, %YMM0
-
+	VPBROADCAST	%esi, %YMM0
+	movl	%edi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
 	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
 
-	/* Check if we cross page boundary with one vector load.  */
-	andl	$(PAGE_SIZE - 1), %ecx
-	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
-	ja  L(cross_page_boundary)
+	/* Check if we cross page boundary with one vector load.
+	   Otherwise it is safe to use an unaligned load.  */
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(cross_page_boundary)
 
 	/* Check the first VEC_SIZE bytes. Search for both CHAR and the
 	   null bytes.  */
@@ -83,251 +82,296 @@ ENTRY (STRCHR)
 	VPMINU	%YMM2, %YMM1, %YMM2
 	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
 	VPCMP	$0, %YMMZERO, %YMM2, %k0
-	ktestd	%k0, %k0
-	jz	L(more_vecs)
 	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jz	L(aligned_more)
 	tzcntl	%eax, %eax
-	/* Found CHAR or the null byte.	 */
 # ifdef USE_AS_WCSCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	(%rdi, %rax, 4), %rax
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.
+	 */
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
# else
 	addq	%rdi, %rax
 # endif
 # ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rax), %CHAR_REG
+	jne	L(zero)
 # endif
 	ret
 
-	.p2align 4
-L(more_vecs):
-	/* Align data for aligned loads in the loop.  */
-	andq	$-VEC_SIZE, %rdi
-L(aligned_more):
-
-	/* Check the next 4 * VEC_SIZE.	 Only one VEC_SIZE at a time
-	   since data is only aligned to VEC_SIZE.	*/
-	VMOVA	VEC_SIZE(%rdi), %YMM1
-	addq	$VEC_SIZE, %rdi
-
-	/* Leaves only CHARS matching esi as 0.  */
-	vpxorq	%YMM1, %YMM0, %YMM2
-	VPMINU	%YMM2, %YMM1, %YMM2
-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM2, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
-
-	VMOVA	VEC_SIZE(%rdi), %YMM1
-	/* Leaves only CHARS matching esi as 0.  */
-	vpxorq	%YMM1, %YMM0, %YMM2
-	VPMINU	%YMM2, %YMM1, %YMM2
-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM2, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x1)
-
-	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM1
-	/* Leaves only CHARS matching esi as 0.  */
-	vpxorq	%YMM1, %YMM0, %YMM2
-	VPMINU	%YMM2, %YMM1, %YMM2
-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM2, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x2)
-
-	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM1
-	/* Leaves only CHARS matching esi as 0.  */
-	vpxorq	%YMM1, %YMM0, %YMM2
-	VPMINU	%YMM2, %YMM1, %YMM2
-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM2, %k0
-	ktestd	%k0, %k0
-	jz	L(prep_loop_4x)
-
-	kmovd	%k0, %eax
+	/* .p2align 5 helps keep performance more consistent if ENTRY()
+	   alignment % 32 was either 16 or 0. As well this makes the
+	   alignment % 32 of the loop_4x_vec fixed which makes tuning it
+	   easier.  */
+	.p2align 5
+L(first_vec_x3):
 	tzcntl	%eax, %eax
+# ifndef USE_AS_STRCHRNUL
 	/* Found CHAR or the null byte.	 */
-# ifdef USE_AS_WCSCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	(VEC_SIZE * 3)(%rdi, %rax, 4), %rax
-# else
-	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
+	cmp	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	jne	L(zero)
 # endif
+	/* NB: Multiply sizeof char type (1 or 4) to get the number of
+	   bytes.  */
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
 # ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
-# endif
+L(zero):
+	xorl	%eax, %eax
 	ret
+# endif
 
 	.p2align 4
-L(first_vec_x0):
+L(first_vec_x4):
+# ifndef USE_AS_STRCHRNUL
+	/* Check to see if first match was CHAR (k0) or null (k1).  */
+	kmovd	%k0, %eax
 	tzcntl	%eax, %eax
-	/* Found CHAR or the null byte.	 */
-# ifdef USE_AS_WCSCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	(%rdi, %rax, 4), %rax
+	kmovd	%k1, %ecx
+	/* bzhil will not be 0 if first match was null.  */
+	bzhil	%eax, %ecx, %ecx
+	jne	L(zero)
 # else
-	addq	%rdi, %rax
-# endif
-# ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	/* Combine CHAR and null matches.  */
+	kord	%k0, %k1, %k0
+	kmovd	%k0, %eax
+	tzcntl	%eax, %eax
 # endif
+	/* NB: Multiply sizeof char type (1 or 4) to get the number of
+	   bytes.  */
+	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
 	.p2align 4
 L(first_vec_x1):
 	tzcntl	%eax, %eax
-	/* Found CHAR or the null byte.	 */
-# ifdef USE_AS_WCSCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	VEC_SIZE(%rdi, %rax, 4), %rax
-# else
-	leaq	VEC_SIZE(%rdi, %rax), %rax
-# endif
 # ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	/* Found CHAR or the null byte.	 */
+	cmp	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	jne	L(zero)
+
 # endif
+	/* NB: Multiply sizeof char type (1 or 4) to get the number of
+	   bytes.  */
+	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
 	.p2align 4
 L(first_vec_x2):
+# ifndef USE_AS_STRCHRNUL
+	/* Check to see if first match was CHAR (k0) or null (k1).  */
+	kmovd	%k0, %eax
 	tzcntl	%eax, %eax
-	/* Found CHAR or the null byte.	 */
-# ifdef USE_AS_WCSCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	(VEC_SIZE * 2)(%rdi, %rax, 4), %rax
+	kmovd	%k1, %ecx
+	/* bzhil will not be 0 if first match was null.  */
+	bzhil	%eax, %ecx, %ecx
+	jne	L(zero)
 # else
-	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
-# endif
-# ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	/* Combine CHAR and null matches.  */
+	kord	%k0, %k1, %k0
+	kmovd	%k0, %eax
+	tzcntl	%eax, %eax
 # endif
+	/* NB: Multiply sizeof char type (1 or 4) to get the number of
+	   bytes.  */
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
-L(prep_loop_4x):
-	/* Align data to 4 * VEC_SIZE.	*/
+	.p2align 4
+L(aligned_more):
+	/* Align data to VEC_SIZE.  */
+	andq	$-VEC_SIZE, %rdi
+L(cross_page_continue):
+	/* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time since
+	   data is only aligned to VEC_SIZE. Use two alternating methods
+	   for checking VEC to balance latency and port contention.  */
+
+	/* This method has higher latency but has better port
+	   distribution.  */
+	VMOVA	(VEC_SIZE)(%rdi), %YMM1
+	/* Leaves only CHARS matching esi as 0.  */
+	vpxorq	%YMM1, %YMM0, %YMM2
+	VPMINU	%YMM2, %YMM1, %YMM2
+	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+	VPCMP	$0, %YMMZERO, %YMM2, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x1)
+
+	/* This method has lower latency but has worse port
+	   distribution.  */
+	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM1
+	/* Each bit in K0 represents a CHAR in YMM1.  */
+	VPCMP	$0, %YMM1, %YMM0, %k0
+	/* Each bit in K1 represents a null byte in YMM1.  */
+	VPCMP	$0, %YMM1, %YMMZERO, %k1
+	kortestd	%k0, %k1
+	jnz	L(first_vec_x2)
+
+	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM1
+	/* Leaves only CHARS matching esi as 0.  */
+	vpxorq	%YMM1, %YMM0, %YMM2
+	VPMINU	%YMM2, %YMM1, %YMM2
+	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+	VPCMP	$0, %YMMZERO, %YMM2, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x3)
+
+	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
+	/* Each bit in K0 represents a CHAR in YMM1.  */
+	VPCMP	$0, %YMM1, %YMM0, %k0
+	/* Each bit in K1 represents a null byte in YMM1.  */
+	VPCMP	$0, %YMM1, %YMMZERO, %k1
+	kortestd	%k0, %k1
+	jnz	L(first_vec_x4)
+
+	/* Align data to VEC_SIZE * 4 for the loop.  */
+	addq	$VEC_SIZE, %rdi
 	andq	$-(VEC_SIZE * 4), %rdi
 
 	.p2align 4
 L(loop_4x_vec):
-	/* Compare 4 * VEC at a time forward.  */
+	/* Check 4x VEC at a time. No penalty to imm32 offset with evex
+	   encoding.  */
 	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
 	VMOVA	(VEC_SIZE * 5)(%rdi), %YMM2
 	VMOVA	(VEC_SIZE * 6)(%rdi), %YMM3
 	VMOVA	(VEC_SIZE * 7)(%rdi), %YMM4
 
-	/* Leaves only CHARS matching esi as 0.  */
+	/* For YMM1 and YMM3 use xor to set the CHARs matching esi to
+	   zero.  */
 	vpxorq	%YMM1, %YMM0, %YMM5
-	vpxorq	%YMM2, %YMM0, %YMM6
+	/* For YMM2 and YMM4 cmp not equals to CHAR and store result in
+	   k register. It's possible to save either 1 or 2 instructions
+	   using the cmp not-equals method for either YMM1 or YMM1 and YMM3
+	   respectively but the bottleneck on p5 makes it not worth it.  */
+	VPCMP	$4, %YMM0, %YMM2, %k2
 	vpxorq	%YMM3, %YMM0, %YMM7
-	vpxorq	%YMM4, %YMM0, %YMM8
-
-	VPMINU	%YMM5, %YMM1, %YMM5
-	VPMINU	%YMM6, %YMM2, %YMM6
-	VPMINU	%YMM7, %YMM3, %YMM7
-	VPMINU	%YMM8, %YMM4, %YMM8
-
-	VPMINU	%YMM5, %YMM6, %YMM1
-	VPMINU	%YMM7, %YMM8, %YMM2
-
-	VPMINU	%YMM1, %YMM2, %YMM1
-
-	/* Each bit in K0 represents a CHAR or a null byte.  */
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
-
-	addq	$(VEC_SIZE * 4), %rdi
-
-	ktestd	%k0, %k0
+	VPCMP	$4, %YMM0, %YMM4, %k4
+
+	/* Use min to select all zeros from either xor or end of string.
+	 */
+	VPMINU	%YMM1, %YMM5, %YMM1
+	VPMINU	%YMM3, %YMM7, %YMM3
+
+	/* Use min + zeromask to select for zeros. Since k2 and k4 will
+	   have 0 as positions that matched with CHAR which will set
+	   zero in the corresponding destination bytes in YMM2 / YMM4.
+	 */
+	VPMINU	%YMM1, %YMM2, %YMM2{%k2}{z}
+	VPMINU	%YMM3, %YMM4, %YMM4
+	VPMINU	%YMM2, %YMM4, %YMM4{%k4}{z}
+
+	VPCMP	$0, %YMMZERO, %YMM4, %k1
+	kmovd	%k1, %ecx
+	subq	$-(VEC_SIZE * 4), %rdi
+	testl	%ecx, %ecx
 	jz	L(loop_4x_vec)
 
-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM5, %k0
+	VPCMP	$0, %YMMZERO, %YMM1, %k0
 	kmovd	%k0, %eax
 	testl	%eax, %eax
-	jnz	L(first_vec_x0)
+	jnz	L(last_vec_x1)
 
-	/* Each bit in K1 represents a CHAR or a null byte in YMM2.  */
-	VPCMP	$0, %YMMZERO, %YMM6, %k1
-	kmovd	%k1, %eax
+	VPCMP	$0, %YMMZERO, %YMM2, %k0
+	kmovd	%k0, %eax
 	testl	%eax, %eax
-	jnz	L(first_vec_x1)
-
-	/* Each bit in K2 represents a CHAR or a null byte in YMM3.  */
-	VPCMP	$0, %YMMZERO, %YMM7, %k2
-	/* Each bit in K3 represents a CHAR or a null byte in YMM4.  */
-	VPCMP	$0, %YMMZERO, %YMM8, %k3
+	jnz	L(last_vec_x2)
 
+	VPCMP	$0, %YMMZERO, %YMM3, %k0
+	kmovd	%k0, %eax
+	/* Combine YMM3 matches (eax) with YMM4 matches (ecx).  */
 # ifdef USE_AS_WCSCHR
-	/* NB: Each bit in K2/K3 represents 4-byte element.  */
-	kshiftlw $8, %k3, %k1
+	sall	$8, %ecx
+	orl	%ecx, %eax
+	tzcntl	%eax, %eax
 # else
-	kshiftlq $32, %k3, %k1
+	salq	$32, %rcx
+	orq	%rcx, %rax
+	tzcntq	%rax, %rax
 # endif
+# ifndef USE_AS_STRCHRNUL
+	/* Check if match was CHAR or null.  */
+	cmp	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	jne	L(zero_end)
+# endif
+	/* NB: Multiply sizeof char type (1 or 4) to get the number of
+	   bytes.  */
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
 
-	/* Each bit in K1 represents a NULL or a mismatch.  */
-	korq	%k1, %k2, %k1
-	kmovq	%k1, %rax
+# ifndef USE_AS_STRCHRNUL
+L(zero_end):
+	xorl	%eax, %eax
+	ret
+# endif
 
-	tzcntq  %rax, %rax
-# ifdef USE_AS_WCSCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	(VEC_SIZE * 2)(%rdi, %rax, 4), %rax
-# else
-	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
+	.p2align 4
+L(last_vec_x1):
+	tzcntl	%eax, %eax
+# ifndef USE_AS_STRCHRNUL
+	/* Check if match was null.  */
+	cmp	(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	jne	L(zero_end)
 # endif
+	/* NB: Multiply sizeof char type (1 or 4) to get the number of
+	   bytes.  */
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+	.p2align 4
+L(last_vec_x2):
+	tzcntl	%eax, %eax
 # ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	/* Check if match was null.  */
+	cmp	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	jne	L(zero_end)
 # endif
+	/* NB: Multiply sizeof char type (1 or 4) to get the number of
+	   bytes.  */
+	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
 	/* Cold case for crossing page with first load.	 */
 	.p2align 4
 L(cross_page_boundary):
+	movq	%rdi, %rdx
+	/* Align rdi.  */
 	andq	$-VEC_SIZE, %rdi
-	andl	$(VEC_SIZE - 1), %ecx
-
 	VMOVA	(%rdi), %YMM1
-
 	/* Leaves only CHARS matching esi as 0.  */
 	vpxorq	%YMM1, %YMM0, %YMM2
 	VPMINU	%YMM2, %YMM1, %YMM2
 	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
 	VPCMP	$0, %YMMZERO, %YMM2, %k0
 	kmovd	%k0, %eax
-	testl	%eax, %eax
-
+	/* Remove the leading bits.	 */
 # ifdef USE_AS_WCSCHR
+	movl	%edx, %SHIFT_REG
 	/* NB: Divide shift count by 4 since each bit in K1 represent 4
 	   bytes.  */
-	movl	%ecx, %SHIFT_REG
-	sarl    $2, %SHIFT_REG
+	sarl	$2, %SHIFT_REG
+	andl	$(CHAR_PER_VEC - 1), %SHIFT_REG
 # endif
-
-	/* Remove the leading bits.	 */
 	sarxl	%SHIFT_REG, %eax, %eax
+	/* If eax is zero continue.  */
 	testl	%eax, %eax
-
-	jz	L(aligned_more)
+	jz	L(cross_page_continue)
 	tzcntl	%eax, %eax
-	addq	%rcx, %rdi
+# ifndef USE_AS_STRCHRNUL
+	/* Check to see if match was CHAR or null.  */
+	cmp	(%rdx, %rax, CHAR_SIZE), %CHAR_REG
+	jne	L(zero_end)
+# endif
 # ifdef USE_AS_WCSCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	(%rdi, %rax, 4), %rax
+	/* NB: Multiply wchar_t count by 4 to get the number of
+	   bytes.  */
+	leaq	(%rdx, %rax, CHAR_SIZE), %rax
 # else
-	addq	%rdi, %rax
-# endif
-# ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	addq	%rdx, %rax
 # endif
 	ret
 
-- 
GitLab

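The strchr-evex hunks above repeatedly use the vpxorq + VPMINU idiom instead of two compares: min (x ^ c, x) is zero exactly where x holds the searched CHAR (the xor result is zero) or the terminating null (x itself is zero), so a single compare against zero catches both. A scalar model of the per-byte logic, for illustration only:

#include <assert.h>

/* Returns nonzero iff X is the searched character C or the null byte.
   This is the byte-wise behavior of: vpxorq; vpminub; vpcmpb $0.  */
static int
matches_char_or_null (unsigned char x, unsigned char c)
{
  unsigned char t = x ^ c;          /* 0 where x == c.  */
  unsigned char m = t < x ? t : x;  /* vpminub: 0 where x == c or x == 0.  */
  return m == 0;
}

int
main (void)
{
  assert (matches_char_or_null ('a', 'a'));   /* CHAR match.  */
  assert (matches_char_or_null ('\0', 'a'));  /* End of string.  */
  assert (!matches_char_or_null ('b', 'a'));  /* Neither.  */
  return 0;
}

This is also why the 4x loop can fold several vectors with VPMINU and test the result once: a zero anywhere means a match or the string end was seen.
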
SOURCES/glibc-RHEL-15696-44.patch (new file, 536 lines)
@@ -0,0 +1,536 @@
From 104c7b1967c3e78435c6f7eab5e225a7eddf9c6e Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Tue, 4 May 2021 19:02:40 -0400
Subject: [PATCH] x86: Add EVEX optimized memchr family not safe for RTM
Content-type: text/plain; charset=UTF-8

No bug.

This commit adds a new implementation for EVEX memchr that is not safe
for RTM because it uses vzeroupper. The benefit is that by using
ymm0-ymm15 it can use vpcmpeq and vpternlogd in the 4x loop which is
faster than the RTM safe version which cannot use vpcmpeq because
there is no EVEX encoding for the instruction. All parts of the
implementation aside from the 4x loop are the same for the two
versions and the optimization is only relevant for large sizes.

Tigerlake:
size  , algn  , Pos   , Cur T , New T , Win     , Dif
512   , 6     , 192   , 9.2   , 9.04  , no-RTM  , 0.16
512   , 7     , 224   , 9.19  , 8.98  , no-RTM  , 0.21
2048  , 0     , 256   , 10.74 , 10.54 , no-RTM  , 0.2
2048  , 0     , 512   , 14.81 , 14.87 , RTM     , 0.06
2048  , 0     , 1024  , 22.97 , 22.57 , no-RTM  , 0.4
2048  , 0     , 2048  , 37.49 , 34.51 , no-RTM  , 2.98   <--

Icelake:
size  , algn  , Pos   , Cur T , New T , Win     , Dif
512   , 6     , 192   , 7.6   , 7.3   , no-RTM  , 0.3
512   , 7     , 224   , 7.63  , 7.27  , no-RTM  , 0.36
2048  , 0     , 256   , 8.48  , 8.38  , no-RTM  , 0.1
2048  , 0     , 512   , 11.57 , 11.42 , no-RTM  , 0.15
2048  , 0     , 1024  , 17.92 , 17.38 , no-RTM  , 0.54
2048  , 0     , 2048  , 30.37 , 27.34 , no-RTM  , 3.03   <--

test-memchr, test-wmemchr, and test-rawmemchr are all passing.

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
 sysdeps/x86_64/multiarch/Makefile             |   7 +-
 sysdeps/x86_64/multiarch/ifunc-evex.h         |  55 ++++++
 sysdeps/x86_64/multiarch/ifunc-impl-list.c    |  15 ++
 sysdeps/x86_64/multiarch/memchr-evex-rtm.S    |   8 +
 sysdeps/x86_64/multiarch/memchr-evex.S        | 161 ++++++++++++++----
 sysdeps/x86_64/multiarch/memchr.c             |   2 +-
 sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S |   3 +
 sysdeps/x86_64/multiarch/rawmemchr.c          |   2 +-
 sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S   |   3 +
 sysdeps/x86_64/multiarch/wmemchr.c            |   2 +-
 10 files changed, 217 insertions(+), 41 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/ifunc-evex.h
 create mode 100644 sysdeps/x86_64/multiarch/memchr-evex-rtm.S
 create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
 create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 65fde4eb..26be4095 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -77,7 +77,9 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
 		   strncmp-evex \
 		   strncpy-evex \
 		   strnlen-evex \
-		   strrchr-evex
+		   strrchr-evex \
+		   memchr-evex-rtm \
+		   rawmemchr-evex-rtm
 CFLAGS-varshift.c += -msse4
 CFLAGS-strcspn-c.c += -msse4
 CFLAGS-strpbrk-c.c += -msse4
@@ -110,7 +112,8 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
 		   wcsnlen-evex \
 		   wcsrchr-evex \
 		   wmemchr-evex \
-		   wmemcmp-evex-movbe
+		   wmemcmp-evex-movbe \
+		   wmemchr-evex-rtm
 endif
 
 ifeq ($(subdir),debug)
diff --git a/sysdeps/x86_64/multiarch/ifunc-evex.h b/sysdeps/x86_64/multiarch/ifunc-evex.h
new file mode 100644
index 00000000..fc391edb
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/ifunc-evex.h
@@ -0,0 +1,55 @@
+/* Common definition for ifunc selection optimized with EVEX.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2017-2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <init-arch.h>
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_rtm) attribute_hidden;
+
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+  const struct cpu_features* cpu_features = __get_cpu_features ();
+
+  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
+      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+    {
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+	{
+	  if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+	    return OPTIMIZE (evex_rtm);
+
+	  return OPTIMIZE (evex);
+	}
+
+      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+	return OPTIMIZE (avx2_rtm);
+
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+	return OPTIMIZE (avx2);
+    }
+
+  return OPTIMIZE (sse2);
+}
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index d59d65f8..ac097e8d 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -52,6 +52,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			       && CPU_FEATURE_USABLE (AVX512BW)
 			       && CPU_FEATURE_USABLE (BMI2)),
 			      __memchr_evex)
+	      IFUNC_IMPL_ADD (array, i, memchr,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __memchr_evex_rtm)
 	      IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_sse2))
 
   /* Support sysdeps/x86_64/multiarch/memcmp.c.  */
@@ -288,6 +293,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			       && CPU_FEATURE_USABLE (AVX512BW)
 			       && CPU_FEATURE_USABLE (BMI2)),
 			      __rawmemchr_evex)
+	      IFUNC_IMPL_ADD (array, i, rawmemchr,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __rawmemchr_evex_rtm)
 	      IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_sse2))
 
   /* Support sysdeps/x86_64/multiarch/strlen.c.  */
@@ -711,6 +721,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			       && CPU_FEATURE_USABLE (AVX512BW)
 			       && CPU_FEATURE_USABLE (BMI2)),
 			      __wmemchr_evex)
+	      IFUNC_IMPL_ADD (array, i, wmemchr,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __wmemchr_evex_rtm)
 	      IFUNC_IMPL_ADD (array, i, wmemchr, 1, __wmemchr_sse2))
 
   /* Support sysdeps/x86_64/multiarch/wmemcmp.c.  */
diff --git a/sysdeps/x86_64/multiarch/memchr-evex-rtm.S b/sysdeps/x86_64/multiarch/memchr-evex-rtm.S
new file mode 100644
index 00000000..19871882
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memchr-evex-rtm.S
@@ -0,0 +1,8 @@
+#ifndef MEMCHR
+# define MEMCHR __memchr_evex_rtm
+#endif
+
+#define USE_IN_RTM 1
+#define SECTION(p) p##.evex.rtm
+
+#include "memchr-evex.S"
diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
index f3fdad4f..4d0ed6d1 100644
--- a/sysdeps/x86_64/multiarch/memchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
@@ -38,10 +38,32 @@
 #  define CHAR_SIZE	1
 # endif
 
+	/* In the 4x loop the RTM and non-RTM versions have data pointer
+	   off by VEC_SIZE * 4 with RTM version being VEC_SIZE * 4 greater.
+	   This is represented by BASE_OFFSET. As well, because the RTM
+	   version uses vpcmp which stores a bit per element compared where
+	   the non-RTM version uses vpcmpeq which stores a bit per byte
+	   compared, RET_SCALE of CHAR_SIZE is only relevant for the RTM
+	   version.  */
+# ifdef USE_IN_RTM
+#  define VZEROUPPER
+#  define BASE_OFFSET	(VEC_SIZE * 4)
+#  define RET_SCALE	CHAR_SIZE
+# else
+#  define VZEROUPPER	vzeroupper
+#  define BASE_OFFSET	0
+#  define RET_SCALE	1
+# endif
+
+	/* In the return from 4x loop memchr and rawmemchr versions have
+	   data pointers off by VEC_SIZE * 4 with memchr version being
+	   VEC_SIZE * 4 greater.  */
 # ifdef USE_AS_RAWMEMCHR
+#  define RET_OFFSET	(BASE_OFFSET - (VEC_SIZE * 4))
 #  define RAW_PTR_REG	rcx
 #  define ALGN_PTR_REG	rdi
 # else
+#  define RET_OFFSET	BASE_OFFSET
 #  define RAW_PTR_REG	rdi
 #  define ALGN_PTR_REG	rcx
 # endif
@@ -57,11 +79,15 @@
 # define YMM5		ymm21
 # define YMM6		ymm22
 
+# ifndef SECTION
+#  define SECTION(p)	p##.evex
+# endif
+
 # define VEC_SIZE 32
 # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
 # define PAGE_SIZE 4096
 
-	.section .text.evex,"ax",@progbits
+	.section SECTION(.text),"ax",@progbits
 ENTRY (MEMCHR)
 # ifndef USE_AS_RAWMEMCHR
 	/* Check for zero length.  */
@@ -237,14 +263,15 @@ L(cross_page_continue):
 	/* Check if at last CHAR_PER_VEC * 4 length.  */
 	subq	$(CHAR_PER_VEC * 4), %rdx
 	jbe	L(last_4x_vec_or_less_cmpeq)
-	addq	$VEC_SIZE, %rdi
+	/* +VEC_SIZE if USE_IN_RTM otherwise +VEC_SIZE * 5.  */
+	addq	$(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi
 
 	/* Align data to VEC_SIZE * 4 for the loop and readjust length.
 	 */
 #  ifdef USE_AS_WMEMCHR
 	movl	%edi, %ecx
 	andq	$-(4 * VEC_SIZE), %rdi
-	andl	$(VEC_SIZE * 4 - 1), %ecx
+	subl	%edi, %ecx
 	/* NB: Divide bytes by 4 to get the wchar_t count.  */
 	sarl	$2, %ecx
 	addq	%rcx, %rdx
@@ -254,15 +281,28 @@ L(cross_page_continue):
 	subq	%rdi, %rdx
 #  endif
 # else
-	addq	$VEC_SIZE, %rdi
+	addq	$(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi
 	andq	$-(4 * VEC_SIZE), %rdi
 # endif
-
+# ifdef USE_IN_RTM
 	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
+# else
+	/* Copy YMMMATCH to ymm0 so we can use vpcmpeq which is not
+	   encodable with EVEX registers (ymm16-ymm31).  */
+	vmovdqa64 %YMMMATCH, %ymm0
+# endif
 
 	/* Compare 4 * VEC at a time forward.  */
 	.p2align 4
 L(loop_4x_vec):
+	/* Two versions of the loop. One that does not require
+	   vzeroupper by not using ymm0-ymm15 and another that does require
+	   vzeroupper because it uses ymm0-ymm15. The reason why ymm0-ymm15
+	   is used at all is because there is no EVEX encoding vpcmpeq and
+	   with vpcmpeq this loop can be performed more efficiently. The
+	   non-vzeroupper version is safe for RTM while the vzeroupper
+	   version should be preferred if RTM is not supported.  */
+# ifdef USE_IN_RTM
 	/* It would be possible to save some instructions using 4x VPCMP
 	   but bottleneck on port 5 makes it not worth it.  */
 	VPCMP	$4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
@@ -273,12 +313,55 @@ L(loop_4x_vec):
 	/* Reduce VEC2 / VEC3 with min and VEC1 with zero mask.  */
 	VPMINU	%YMM2, %YMM3, %YMM3{%k1}{z}
 	VPCMP	$0, %YMM3, %YMMZERO, %k2
+# else
+	/* Since vptern can only take 3x vectors, fastest to do 1 vec
+	   separately with EVEX vpcmp.  */
+#  ifdef USE_AS_WMEMCHR
+	/* vptern can only accept masks for epi32/epi64 so can only save
+	   instruction using not equals mask on vptern with wmemchr.  */
+	VPCMP	$4, (%rdi), %YMMMATCH, %k1
+#  else
+	VPCMP	$0, (%rdi), %YMMMATCH, %k1
+#  endif
+	/* Compare 3x with vpcmpeq and or them all together with vptern.
+	 */
+	VPCMPEQ	VEC_SIZE(%rdi), %ymm0, %ymm2
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
+#  ifdef USE_AS_WMEMCHR
+	/* This takes the not of or between ymm2, ymm3, ymm4 as well as
+	   combines result from VEC0 with zero mask.  */
+	vpternlogd $1, %ymm2, %ymm3, %ymm4{%k1}{z}
+	vpmovmskb %ymm4, %ecx
+#  else
+	/* 254 is mask for ORing ymm2, ymm3, ymm4 into ymm4.  */
+	vpternlogd $254, %ymm2, %ymm3, %ymm4
+	vpmovmskb %ymm4, %ecx
+	kmovd	%k1, %eax
+#  endif
+# endif
+
 # ifdef USE_AS_RAWMEMCHR
 	subq	$-(VEC_SIZE * 4), %rdi
+# endif
+# ifdef USE_IN_RTM
 	kortestd %k2, %k3
+# else
+#  ifdef USE_AS_WMEMCHR
+	/* ecx contains the NOT of the matches. All 1s means no matches.
+	   incl will overflow and set zeroflag if that is the case.  */
+	incl	%ecx
+#  else
+	/* If either VEC1 (eax) or VEC2-VEC4 (ecx) are not zero. Adding
+	   to ecx is not an issue because if eax is non-zero it will be
+	   used for returning the match. If it is zero the add does
+	   nothing.  */
+	addq	%rax, %rcx
+#  endif
+# endif
+# ifdef USE_AS_RAWMEMCHR
 	jz	L(loop_4x_vec)
 # else
-	kortestd %k2, %k3
 	jnz	L(loop_4x_vec_end)
 
 	subq	$-(VEC_SIZE * 4), %rdi
@@ -288,10 +371,11 @@ L(loop_4x_vec):
 
 	/* Fall through into less than 4 remaining vectors of length case.
 	 */
-	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+	VPCMP	$0, BASE_OFFSET(%rdi), %YMMMATCH, %k0
+	addq	$(BASE_OFFSET - VEC_SIZE), %rdi
 	kmovd	%k0, %eax
-	addq	$(VEC_SIZE * 3), %rdi
-	.p2align 4
+	VZEROUPPER
+
 L(last_4x_vec_or_less):
 	/* Check if first VEC contained match.  */
 	testl	%eax, %eax
@@ -338,73 +422,78 @@ L(loop_4x_vec_end):
 	/* rawmemchr will fall through into this if match was found in
 	   loop.  */
 
+# if defined USE_IN_RTM || defined USE_AS_WMEMCHR
 	/* k1 has not of matches with VEC1.  */
 	kmovd	%k1, %eax
-# ifdef USE_AS_WMEMCHR
+#  ifdef USE_AS_WMEMCHR
 	subl	$((1 << CHAR_PER_VEC) - 1), %eax
-# else
+#  else
 	incl	%eax
+#  endif
+# else
+	/* eax already has matches for VEC1.  */
+	testl	%eax, %eax
 # endif
 	jnz	L(last_vec_x1_return)
 
+# ifdef USE_IN_RTM
 	VPCMP	$0, %YMM2, %YMMZERO, %k0
 	kmovd	%k0, %eax
+# else
+	vpmovmskb %ymm2, %eax
+# endif
 	testl	%eax, %eax
 	jnz	L(last_vec_x2_return)
 
+# ifdef USE_IN_RTM
 	kmovd	%k2, %eax
 	testl	%eax, %eax
 	jnz	L(last_vec_x3_return)
 
 	kmovd	%k3, %eax
 	tzcntl	%eax, %eax
-# ifdef USE_AS_RAWMEMCHR
-	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+	leaq	(VEC_SIZE * 3 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
 # else
-	leaq	(VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax
+	vpmovmskb %ymm3, %eax
+	/* Combine matches in VEC3 (eax) with matches in VEC4 (ecx).  */
+	salq	$VEC_SIZE, %rcx
+	orq	%rcx, %rax
+	tzcntq	%rax, %rax
+	leaq	(VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax), %rax
+	VZEROUPPER
 # endif
 	ret
 
 	.p2align 4
 L(last_vec_x1_return):
 	tzcntl	%eax, %eax
-# ifdef USE_AS_RAWMEMCHR
-#  ifdef USE_AS_WMEMCHR
+# if defined USE_AS_WMEMCHR || RET_OFFSET != 0
 	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
-	leaq	(%rdi, %rax, CHAR_SIZE), %rax
-#  else
-	addq	%rdi, %rax
-#  endif
+	leaq	RET_OFFSET(%rdi, %rax, CHAR_SIZE), %rax
 # else
-	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
-	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+	addq	%rdi, %rax
 # endif
+	VZEROUPPER
 	ret
 
 	.p2align 4
 L(last_vec_x2_return):
 	tzcntl	%eax, %eax
-# ifdef USE_AS_RAWMEMCHR
-	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
-	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
-# else
-	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
-	leaq	(VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax
-# endif
+	/* NB: Multiply bytes by RET_SCALE to get the wchar_t count
+	   if relevant (RET_SCALE = CHAR_SIZE if USE_AS_WMEMCHR and
+	   USE_IN_RTM are both defined. Otherwise RET_SCALE = 1).  */
+	leaq	(VEC_SIZE + RET_OFFSET)(%rdi, %rax, RET_SCALE), %rax
+	VZEROUPPER
 	ret
 
+# ifdef USE_IN_RTM
 	.p2align 4
 L(last_vec_x3_return):
 	tzcntl	%eax, %eax
-# ifdef USE_AS_RAWMEMCHR
-	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
-	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
-# else
 	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
-	leaq	(VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax
-# endif
+	leaq	(VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
-
+# endif
 
 # ifndef USE_AS_RAWMEMCHR
 L(last_4x_vec_or_less_cmpeq):
diff --git a/sysdeps/x86_64/multiarch/memchr.c b/sysdeps/x86_64/multiarch/memchr.c
index 016f5784..f28aea77 100644
--- a/sysdeps/x86_64/multiarch/memchr.c
+++ b/sysdeps/x86_64/multiarch/memchr.c
@@ -24,7 +24,7 @@
 # undef memchr
 
 # define SYMBOL_NAME memchr
-# include "ifunc-avx2.h"
+# include "ifunc-evex.h"
 
 libc_ifunc_redirected (__redirect_memchr, memchr, IFUNC_SELECTOR ());
 strong_alias (memchr, __memchr)
diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
new file mode 100644
index 00000000..deda1ca3
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
@@ -0,0 +1,3 @@
+#define MEMCHR __rawmemchr_evex_rtm
+#define USE_AS_RAWMEMCHR 1
+#include "memchr-evex-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/rawmemchr.c b/sysdeps/x86_64/multiarch/rawmemchr.c
index 8a0bc313..1f764f35 100644
--- a/sysdeps/x86_64/multiarch/rawmemchr.c
+++ b/sysdeps/x86_64/multiarch/rawmemchr.c
@@ -26,7 +26,7 @@
 # undef __rawmemchr
 
 # define SYMBOL_NAME rawmemchr
-# include "ifunc-avx2.h"
+# include "ifunc-evex.h"
 
 libc_ifunc_redirected (__redirect_rawmemchr, __rawmemchr,
 		       IFUNC_SELECTOR ());
diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S
new file mode 100644
index 00000000..a346cd35
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S
@@ -0,0 +1,3 @@
+#define MEMCHR __wmemchr_evex_rtm
+#define USE_AS_WMEMCHR 1
+#include "memchr-evex-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/wmemchr.c b/sysdeps/x86_64/multiarch/wmemchr.c
index 6d833702..f9c91915 100644
--- a/sysdeps/x86_64/multiarch/wmemchr.c
+++ b/sysdeps/x86_64/multiarch/wmemchr.c
@@ -26,7 +26,7 @@
 # undef __wmemchr
 
 # define SYMBOL_NAME wmemchr
-# include "ifunc-avx2.h"
+# include "ifunc-evex.h"
 
 libc_ifunc_redirected (__redirect_wmemchr, __wmemchr, IFUNC_SELECTOR ());
 weak_alias (__wmemchr, wmemchr)
-- 
GitLab

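The non-RTM loop above folds three vpcmpeq results into one register with vpternlogd $254. The immediate of vpternlogd is a 3-input truth table indexed by the corresponding bits of the three sources: 254 (0b11111110) outputs 0 only when all three inputs are 0, i.e. a bitwise OR, and $1 is the matching NOR used for the wmemchr variant. A small C emulation of that truth-table semantics, written as an illustration rather than taken from glibc:

#include <assert.h>

/* Per-bit model of vpternlogd: for each bit position, form a 3-bit
   index from the bits of A, B and C and look it up in the 8-bit
   immediate IMM.  */
static unsigned int
ternlog (unsigned int a, unsigned int b, unsigned int c, unsigned char imm)
{
  unsigned int r = 0;
  for (int bit = 0; bit < 32; bit++)
    {
      unsigned int idx = (((a >> bit) & 1u) << 2)
			 | (((b >> bit) & 1u) << 1)
			 | ((c >> bit) & 1u);
      r |= (unsigned int) ((imm >> idx) & 1) << bit;
    }
  return r;
}

int
main (void)
{
  /* imm = 254 behaves as A | B | C ...  */
  assert (ternlog (0x0f, 0xf0, 0x3c, 254) == (0x0fu | 0xf0u | 0x3cu));
  /* ... and imm = 1 as the NOR, ~(A | B | C).  */
  assert (ternlog (0x0f, 0xf0, 0x3c, 1) == ~(0x0fu | 0xf0u | 0x3cu));
  return 0;
}

One vpternlogd thus replaces two vpor instructions, which is where the non-RTM loop saves work per iteration.
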
SOURCES/glibc-RHEL-15696-45.patch (new file, 873 lines)
@@ -0,0 +1,873 @@
From 16d12015c57701b08d7bbed6ec536641bcafb428 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon, 17 May 2021 13:56:52 -0400
Subject: [PATCH] x86: Optimize memcmp-avx2-movbe.S
Content-type: text/plain; charset=UTF-8

No bug. This commit optimizes memcmp-avx2-movbe.S. The optimizations
include adding a new vec compare path for small sizes, reorganizing the
entry control flow, and removing some unnecessary ALU instructions from
the main loop. test-memcmp and test-wmemcmp are both passing.

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
 sysdeps/x86_64/multiarch/ifunc-impl-list.c   |   6 +
 sysdeps/x86_64/multiarch/ifunc-memcmp.h      |   1 +
 sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 676 +++++++++++--------
 3 files changed, 402 insertions(+), 281 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index ac097e8d..8be0d78a 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -63,16 +63,19 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   IFUNC_IMPL (i, name, memcmp,
 	      IFUNC_IMPL_ADD (array, i, memcmp,
 			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
 			       && CPU_FEATURE_USABLE (MOVBE)),
 			      __memcmp_avx2_movbe)
 	      IFUNC_IMPL_ADD (array, i, memcmp,
 			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
 			       && CPU_FEATURE_USABLE (MOVBE)
 			       && CPU_FEATURE_USABLE (RTM)),
 			      __memcmp_avx2_movbe_rtm)
 	      IFUNC_IMPL_ADD (array, i, memcmp,
 			      (CPU_FEATURE_USABLE (AVX512VL)
 			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)
 			       && CPU_FEATURE_USABLE (MOVBE)),
 			      __memcmp_evex_movbe)
 	      IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1),
@@ -732,16 +735,19 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   IFUNC_IMPL (i, name, wmemcmp,
 	      IFUNC_IMPL_ADD (array, i, wmemcmp,
 			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
 			       && CPU_FEATURE_USABLE (MOVBE)),
 			      __wmemcmp_avx2_movbe)
 	      IFUNC_IMPL_ADD (array, i, wmemcmp,
 			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
 			       && CPU_FEATURE_USABLE (MOVBE)
 			       && CPU_FEATURE_USABLE (RTM)),
 			      __wmemcmp_avx2_movbe_rtm)
 	      IFUNC_IMPL_ADD (array, i, wmemcmp,
 			      (CPU_FEATURE_USABLE (AVX512VL)
 			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)
 			       && CPU_FEATURE_USABLE (MOVBE)),
 			      __wmemcmp_evex_movbe)
 	      IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1),
diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
index 8043c635..690dffe8 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
@@ -33,6 +33,7 @@ IFUNC_SELECTOR (void)
 
   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
       && CPU_FEATURE_USABLE_P (cpu_features, MOVBE)
+      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
     {
       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
index 9d5c9c72..16fc673e 100644
--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
@@ -19,17 +19,23 @@
 #if IS_IN (libc)
|  |   | ||||||
|  |  /* memcmp/wmemcmp is implemented as: | ||||||
|  | -   1. For size from 2 to 7 bytes, load as big endian with movbe and bswap
 | ||||||
|  | -      to avoid branches.
 | ||||||
|  | -   2. Use overlapping compare to avoid branch.
 | ||||||
|  | -   3. Use vector compare when size >= 4 bytes for memcmp or size >= 8
 | ||||||
|  | -      bytes for wmemcmp.
 | ||||||
|  | -   4. If size is 8 * VEC_SIZE or less, unroll the loop.
 | ||||||
|  | -   5. Compare 4 * VEC_SIZE at a time with the aligned first memory
 | ||||||
|  | +   1. Use ymm vector compares when possible. The only case where
 | ||||||
|  | +      vector compares are not possible is when size < VEC_SIZE
 | ||||||
|  | +      and loading from either s1 or s2 would cause a page cross.
 | ||||||
|  | +   2. For size from 2 to 7 bytes on page cross, load as big endian
 | ||||||
|  | +      with movbe and bswap to avoid branches.
 | ||||||
|  | +   3. Use xmm vector compare when size >= 4 bytes for memcmp or
 | ||||||
|  | +      size >= 8 bytes for wmemcmp.
 | ||||||
|  | +   4. Optimistically compare up to the first 4 * VEC_SIZE one at a
 | ||||||
|  | +      time to check for early mismatches. Only do this if it's
 | ||||||
|  | +      guaranteed the work is not wasted.
 | ||||||
|  | +   5. If size is 8 * VEC_SIZE or less, unroll the loop.
 | ||||||
|  | +   6. Compare 4 * VEC_SIZE at a time with the aligned first memory
 | ||||||
|  |        area. | ||||||
|  | -   6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
 | ||||||
|  | -   7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
 | ||||||
|  | -   8. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
 | ||||||
|  | +   7. Use 2 vector compares when size is 2 * VEC_SIZE or less.
 | ||||||
|  | +   8. Use 4 vector compares when size is 4 * VEC_SIZE or less.
 | ||||||
|  | +   9. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
 | ||||||
|  | +
 | ||||||
|  |   | ||||||
|  |  # include <sysdep.h> | ||||||
|  |   | ||||||
|  | @@ -38,8 +44,10 @@
 | ||||||
|  |  # endif | ||||||
|  |   | ||||||
|  |  # ifdef USE_AS_WMEMCMP | ||||||
|  | +#  define CHAR_SIZE	4
 | ||||||
|  |  #  define VPCMPEQ	vpcmpeqd | ||||||
|  |  # else | ||||||
|  | +#  define CHAR_SIZE	1
 | ||||||
|  |  #  define VPCMPEQ	vpcmpeqb | ||||||
|  |  # endif | ||||||
|  |   | ||||||
|  | @@ -52,7 +60,7 @@
 | ||||||
|  |  # endif | ||||||
|  |   | ||||||
|  |  # define VEC_SIZE 32 | ||||||
|  | -# define VEC_MASK ((1 << VEC_SIZE) - 1)
 | ||||||
|  | +# define PAGE_SIZE	4096
 | ||||||
|  |   | ||||||
|  |  /* Warning! | ||||||
|  |             wmemcmp has to use SIGNED comparison for elements. | ||||||
|  | @@ -71,136 +79,359 @@ ENTRY (MEMCMP)
 | ||||||
|  |  	jb	L(less_vec) | ||||||
|  |   | ||||||
|  |  	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */ | ||||||
|  | -	vmovdqu	(%rsi), %ymm2
 | ||||||
|  | -	VPCMPEQ (%rdi), %ymm2, %ymm2
 | ||||||
|  | -	vpmovmskb %ymm2, %eax
 | ||||||
|  | -	subl    $VEC_MASK, %eax
 | ||||||
|  | -	jnz	L(first_vec)
 | ||||||
|  | +	vmovdqu	(%rsi), %ymm1
 | ||||||
|  | +	VPCMPEQ	(%rdi), %ymm1, %ymm1
 | ||||||
|  | +	vpmovmskb %ymm1, %eax
 | ||||||
|  | +	/* NB: eax must be destination register if going to
 | ||||||
|  | +	   L(return_vec_[0,2]). For L(return_vec_3) the destination register
 | ||||||
|  | +	   must be ecx.  */
 | ||||||
|  | +	incl	%eax
 | ||||||
|  | +	jnz	L(return_vec_0)
 | ||||||
|  |   | ||||||
|  |  	cmpq	$(VEC_SIZE * 2), %rdx | ||||||
|  | -	jbe	L(last_vec)
 | ||||||
|  | -
 | ||||||
|  | -	VPCMPEQ	%ymm0, %ymm0, %ymm0
 | ||||||
|  | -	/* More than 2 * VEC.  */
 | ||||||
|  | -	cmpq	$(VEC_SIZE * 8), %rdx
 | ||||||
|  | -	ja	L(more_8x_vec)
 | ||||||
|  | -	cmpq	$(VEC_SIZE * 4), %rdx
 | ||||||
|  | -	jb	L(last_4x_vec)
 | ||||||
|  | -
 | ||||||
|  | -	/* From 4 * VEC to 8 * VEC, inclusively. */
 | ||||||
|  | -	vmovdqu	(%rsi), %ymm1
 | ||||||
|  | -	VPCMPEQ (%rdi), %ymm1, %ymm1
 | ||||||
|  | +	jbe	L(last_1x_vec)
 | ||||||
|  |   | ||||||
|  | +	/* Check second VEC no matter what.  */
 | ||||||
|  |  	vmovdqu	VEC_SIZE(%rsi), %ymm2 | ||||||
|  | -	VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
 | ||||||
|  | +	VPCMPEQ	VEC_SIZE(%rdi), %ymm2, %ymm2
 | ||||||
|  | +	vpmovmskb %ymm2, %eax
 | ||||||
|  | +	/* If all 4 VEC were equal eax will be all 1s so incl will
 | ||||||
|  | +	   overflow and set zero flag.  */
 | ||||||
|  | +	incl	%eax
 | ||||||
|  | +	jnz	L(return_vec_1)
 | ||||||
|  |   | ||||||
|  | -	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
 | ||||||
|  | -	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
 | ||||||
|  | +	/* Less than 4 * VEC.  */
 | ||||||
|  | +	cmpq	$(VEC_SIZE * 4), %rdx
 | ||||||
|  | +	jbe	L(last_2x_vec)
 | ||||||
|  |   | ||||||
|  | +	/* Check third and fourth VEC no matter what.  */
 | ||||||
|  | +	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
 | ||||||
|  | +	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
 | ||||||
|  | +	vpmovmskb %ymm3, %eax
 | ||||||
|  | +	incl	%eax
 | ||||||
|  | +	jnz	L(return_vec_2)
 | ||||||
|  |  	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm4 | ||||||
|  | -	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
 | ||||||
|  | +	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
 | ||||||
|  | +	vpmovmskb %ymm4, %ecx
 | ||||||
|  | +	incl	%ecx
 | ||||||
|  | +	jnz	L(return_vec_3)
 | ||||||
|  |   | ||||||
|  | -	vpand	%ymm1, %ymm2, %ymm5
 | ||||||
|  | -	vpand	%ymm3, %ymm4, %ymm6
 | ||||||
|  | -	vpand	%ymm5, %ymm6, %ymm5
 | ||||||
|  | +	/* Go to 4x VEC loop.  */
 | ||||||
|  | +	cmpq	$(VEC_SIZE * 8), %rdx
 | ||||||
|  | +	ja	L(more_8x_vec)
 | ||||||
|  |   | ||||||
|  | -	vptest	%ymm0, %ymm5
 | ||||||
|  | -	jnc	L(4x_vec_end)
 | ||||||
|  | +	/* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
 | ||||||
|  | +	   branches.  */
 | ||||||
|  |   | ||||||
|  | +	/* Load first two VEC from s2 before adjusting addresses.  */
 | ||||||
|  | +	vmovdqu	-(VEC_SIZE * 4)(%rsi, %rdx), %ymm1
 | ||||||
|  | +	vmovdqu	-(VEC_SIZE * 3)(%rsi, %rdx), %ymm2
 | ||||||
|  |  	leaq	-(4 * VEC_SIZE)(%rdi, %rdx), %rdi | ||||||
|  |  	leaq	-(4 * VEC_SIZE)(%rsi, %rdx), %rsi | ||||||
|  | -	vmovdqu	(%rsi), %ymm1
 | ||||||
|  | -	VPCMPEQ (%rdi), %ymm1, %ymm1
 | ||||||
|  |   | ||||||
|  | -	vmovdqu	VEC_SIZE(%rsi), %ymm2
 | ||||||
|  | -	VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
 | ||||||
|  | -	vpand	%ymm2, %ymm1, %ymm5
 | ||||||
|  | +	/* Wait to load from s1 until addresses are adjusted due to
 | ||||||
|  | +	   unlamination of microfusion with complex address mode.  */
 | ||||||
|  | +	VPCMPEQ	(%rdi), %ymm1, %ymm1
 | ||||||
|  | +	VPCMPEQ	(VEC_SIZE)(%rdi), %ymm2, %ymm2
 | ||||||
|  |   | ||||||
|  |  	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3 | ||||||
|  | -	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
 | ||||||
|  | -	vpand	%ymm3, %ymm5, %ymm5
 | ||||||
|  | -
 | ||||||
|  | +	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
 | ||||||
|  |  	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm4 | ||||||
|  | -	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
 | ||||||
|  | -	vpand	%ymm4, %ymm5, %ymm5
 | ||||||
|  | +	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
 | ||||||
|  |   | ||||||
|  | -	vptest	%ymm0, %ymm5
 | ||||||
|  | -	jnc	L(4x_vec_end)
 | ||||||
|  | -	xorl	%eax, %eax
 | ||||||
|  | +	/* Reduce VEC0 - VEC4.  */
 | ||||||
|  | +	vpand	%ymm1, %ymm2, %ymm5
 | ||||||
|  | +	vpand	%ymm3, %ymm4, %ymm6
 | ||||||
|  | +	vpand	%ymm5, %ymm6, %ymm7
 | ||||||
|  | +	vpmovmskb %ymm7, %ecx
 | ||||||
|  | +	incl	%ecx
 | ||||||
|  | +	jnz	L(return_vec_0_1_2_3)
 | ||||||
|  | +	/* NB: eax must be zero to reach here.  */
 | ||||||
|  | +	VZEROUPPER_RETURN
 | ||||||
|  | +
 | ||||||
|  | +	.p2align 4
 | ||||||
|  | +L(return_vec_0):
 | ||||||
|  | +	tzcntl	%eax, %eax
 | ||||||
|  | +# ifdef USE_AS_WMEMCMP
 | ||||||
|  | +	movl	(%rdi, %rax), %ecx
 | ||||||
|  | +	xorl	%edx, %edx
 | ||||||
|  | +	cmpl	(%rsi, %rax), %ecx
 | ||||||
|  | +	/* NB: no partial register stall here because xorl zero idiom
 | ||||||
|  | +	   above.  */
 | ||||||
|  | +	setg	%dl
 | ||||||
|  | +	leal	-1(%rdx, %rdx), %eax
 | ||||||
|  | +# else
 | ||||||
|  | +	movzbl	(%rsi, %rax), %ecx
 | ||||||
|  | +	movzbl	(%rdi, %rax), %eax
 | ||||||
|  | +	subl	%ecx, %eax
 | ||||||
|  | +# endif
 | ||||||
|  |  L(return_vzeroupper): | ||||||
|  |  	ZERO_UPPER_VEC_REGISTERS_RETURN | ||||||
|  |   | ||||||
|  |  	.p2align 4 | ||||||
|  | -L(last_2x_vec):
 | ||||||
|  | -	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
 | ||||||
|  | -	vmovdqu	(%rsi), %ymm2
 | ||||||
|  | -	VPCMPEQ (%rdi), %ymm2, %ymm2
 | ||||||
|  | -	vpmovmskb %ymm2, %eax
 | ||||||
|  | -	subl    $VEC_MASK, %eax
 | ||||||
|  | -	jnz	L(first_vec)
 | ||||||
|  | +L(return_vec_1):
 | ||||||
|  | +	tzcntl	%eax, %eax
 | ||||||
|  | +# ifdef USE_AS_WMEMCMP
 | ||||||
|  | +	movl	VEC_SIZE(%rdi, %rax), %ecx
 | ||||||
|  | +	xorl	%edx, %edx
 | ||||||
|  | +	cmpl	VEC_SIZE(%rsi, %rax), %ecx
 | ||||||
|  | +	setg	%dl
 | ||||||
|  | +	leal	-1(%rdx, %rdx), %eax
 | ||||||
|  | +# else
 | ||||||
|  | +	movzbl	VEC_SIZE(%rsi, %rax), %ecx
 | ||||||
|  | +	movzbl	VEC_SIZE(%rdi, %rax), %eax
 | ||||||
|  | +	subl	%ecx, %eax
 | ||||||
|  | +# endif
 | ||||||
|  | +	VZEROUPPER_RETURN
 | ||||||
|  | +
 | ||||||
|  | +	.p2align 4
 | ||||||
|  | +L(return_vec_2):
 | ||||||
|  | +	tzcntl	%eax, %eax
 | ||||||
|  | +# ifdef USE_AS_WMEMCMP
 | ||||||
|  | +	movl	(VEC_SIZE * 2)(%rdi, %rax), %ecx
 | ||||||
|  | +	xorl	%edx, %edx
 | ||||||
|  | +	cmpl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
 | ||||||
|  | +	setg	%dl
 | ||||||
|  | +	leal	-1(%rdx, %rdx), %eax
 | ||||||
|  | +# else
 | ||||||
|  | +	movzbl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
 | ||||||
|  | +	movzbl	(VEC_SIZE * 2)(%rdi, %rax), %eax
 | ||||||
|  | +	subl	%ecx, %eax
 | ||||||
|  | +# endif
 | ||||||
|  | +	VZEROUPPER_RETURN
 | ||||||
|  | +
 | ||||||
|  | +	/* NB: p2align 5 here to ensure 4x loop is 32 byte aligned.  */
 | ||||||
|  | +	.p2align 5
 | ||||||
|  | +L(8x_return_vec_0_1_2_3):
 | ||||||
|  | +	/* Returning from L(more_8x_vec) requires restoring rsi.  */
 | ||||||
|  | +	addq	%rdi, %rsi
 | ||||||
|  | +L(return_vec_0_1_2_3):
 | ||||||
|  | +	vpmovmskb %ymm1, %eax
 | ||||||
|  | +	incl	%eax
 | ||||||
|  | +	jnz	L(return_vec_0)
 | ||||||
|  |   | ||||||
|  | -L(last_vec):
 | ||||||
|  | -	/* Use overlapping loads to avoid branches.  */
 | ||||||
|  | -	leaq	-VEC_SIZE(%rdi, %rdx), %rdi
 | ||||||
|  | -	leaq	-VEC_SIZE(%rsi, %rdx), %rsi
 | ||||||
|  | -	vmovdqu	(%rsi), %ymm2
 | ||||||
|  | -	VPCMPEQ (%rdi), %ymm2, %ymm2
 | ||||||
|  |  	vpmovmskb %ymm2, %eax | ||||||
|  | -	subl    $VEC_MASK, %eax
 | ||||||
|  | -	jnz	L(first_vec)
 | ||||||
|  | +	incl	%eax
 | ||||||
|  | +	jnz	L(return_vec_1)
 | ||||||
|  | +
 | ||||||
|  | +	vpmovmskb %ymm3, %eax
 | ||||||
|  | +	incl	%eax
 | ||||||
|  | +	jnz	L(return_vec_2)
 | ||||||
|  | +L(return_vec_3):
 | ||||||
|  | +	tzcntl	%ecx, %ecx
 | ||||||
|  | +# ifdef USE_AS_WMEMCMP
 | ||||||
|  | +	movl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
 | ||||||
|  | +	xorl	%edx, %edx
 | ||||||
|  | +	cmpl	(VEC_SIZE * 3)(%rsi, %rcx), %eax
 | ||||||
|  | +	setg	%dl
 | ||||||
|  | +	leal	-1(%rdx, %rdx), %eax
 | ||||||
|  | +# else
 | ||||||
|  | +	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
 | ||||||
|  | +	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
 | ||||||
|  | +	subl	%ecx, %eax
 | ||||||
|  | +# endif
 | ||||||
|  | +	VZEROUPPER_RETURN
 | ||||||
|  | +
 | ||||||
|  | +	.p2align 4
 | ||||||
|  | +L(more_8x_vec):
 | ||||||
|  | +	/* Set end of s1 in rdx.  */
 | ||||||
|  | +	leaq	-(VEC_SIZE * 4)(%rdi, %rdx), %rdx
 | ||||||
|  | +	/* rsi stores s2 - s1. This allows loop to only update one
 | ||||||
|  | +	   pointer.  */
 | ||||||
|  | +	subq	%rdi, %rsi
 | ||||||
|  | +	/* Align s1 pointer.  */
 | ||||||
|  | +	andq	$-VEC_SIZE, %rdi
 | ||||||
|  | +	/* Adjust because first 4x vec were checked already.  */
 | ||||||
|  | +	subq	$-(VEC_SIZE * 4), %rdi
 | ||||||
|  | +	.p2align 4
 | ||||||
|  | +L(loop_4x_vec):
 | ||||||
|  | +	/* rsi has s2 - s1 so get correct address by adding s1 (in rdi).
 | ||||||
|  | +	 */
 | ||||||
|  | +	vmovdqu	(%rsi, %rdi), %ymm1
 | ||||||
|  | +	VPCMPEQ	(%rdi), %ymm1, %ymm1
 | ||||||
|  | +
 | ||||||
|  | +	vmovdqu	VEC_SIZE(%rsi, %rdi), %ymm2
 | ||||||
|  | +	VPCMPEQ	VEC_SIZE(%rdi), %ymm2, %ymm2
 | ||||||
|  | +
 | ||||||
|  | +	vmovdqu	(VEC_SIZE * 2)(%rsi, %rdi), %ymm3
 | ||||||
|  | +	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
 | ||||||
|  | +
 | ||||||
|  | +	vmovdqu	(VEC_SIZE * 3)(%rsi, %rdi), %ymm4
 | ||||||
|  | +	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
 | ||||||
|  | +
 | ||||||
|  | +	vpand	%ymm1, %ymm2, %ymm5
 | ||||||
|  | +	vpand	%ymm3, %ymm4, %ymm6
 | ||||||
|  | +	vpand	%ymm5, %ymm6, %ymm7
 | ||||||
|  | +	vpmovmskb %ymm7, %ecx
 | ||||||
|  | +	incl	%ecx
 | ||||||
|  | +	jnz	L(8x_return_vec_0_1_2_3)
 | ||||||
|  | +	subq	$-(VEC_SIZE * 4), %rdi
 | ||||||
|  | +	/* Check if s1 pointer at end.  */
 | ||||||
|  | +	cmpq	%rdx, %rdi
 | ||||||
|  | +	jb	L(loop_4x_vec)
 | ||||||
|  | +
 | ||||||
|  | +	subq	%rdx, %rdi
 | ||||||
|  | +	/* rdi has 4 * VEC_SIZE - remaining length.  */
 | ||||||
|  | +	cmpl	$(VEC_SIZE * 3), %edi
 | ||||||
|  | +	jae	L(8x_last_1x_vec)
 | ||||||
|  | +	/* Load regardless of branch.  */
 | ||||||
|  | +	vmovdqu	(VEC_SIZE * 2)(%rsi, %rdx), %ymm3
 | ||||||
|  | +	cmpl	$(VEC_SIZE * 2), %edi
 | ||||||
|  | +	jae	L(8x_last_2x_vec)
 | ||||||
|  | +
 | ||||||
|  | +	/* Check last 4 VEC.  */
 | ||||||
|  | +	vmovdqu	(%rsi, %rdx), %ymm1
 | ||||||
|  | +	VPCMPEQ	(%rdx), %ymm1, %ymm1
 | ||||||
|  | +
 | ||||||
|  | +	vmovdqu	VEC_SIZE(%rsi, %rdx), %ymm2
 | ||||||
|  | +	VPCMPEQ	VEC_SIZE(%rdx), %ymm2, %ymm2
 | ||||||
|  | +
 | ||||||
|  | +	VPCMPEQ	(VEC_SIZE * 2)(%rdx), %ymm3, %ymm3
 | ||||||
|  | +
 | ||||||
|  | +	vmovdqu	(VEC_SIZE * 3)(%rsi, %rdx), %ymm4
 | ||||||
|  | +	VPCMPEQ	(VEC_SIZE * 3)(%rdx), %ymm4, %ymm4
 | ||||||
|  | +
 | ||||||
|  | +	vpand	%ymm1, %ymm2, %ymm5
 | ||||||
|  | +	vpand	%ymm3, %ymm4, %ymm6
 | ||||||
|  | +	vpand	%ymm5, %ymm6, %ymm7
 | ||||||
|  | +	vpmovmskb %ymm7, %ecx
 | ||||||
|  | +	/* Restore s1 pointer to rdi.  */
 | ||||||
|  | +	movq	%rdx, %rdi
 | ||||||
|  | +	incl	%ecx
 | ||||||
|  | +	jnz	L(8x_return_vec_0_1_2_3)
 | ||||||
|  | +	/* NB: eax must be zero to reach here.  */
 | ||||||
|  | +	VZEROUPPER_RETURN
 | ||||||
|  | +
 | ||||||
|  | +	/* Only entry is from L(more_8x_vec).  */
 | ||||||
|  | +	.p2align 4
 | ||||||
|  | +L(8x_last_2x_vec):
 | ||||||
|  | +	/* Check second to last VEC. rdx stores end pointer of s1 and
 | ||||||
|  | +	   ymm3 has already been loaded with second to last VEC from s2.
 | ||||||
|  | +	 */
 | ||||||
|  | +	VPCMPEQ	(VEC_SIZE * 2)(%rdx), %ymm3, %ymm3
 | ||||||
|  | +	vpmovmskb %ymm3, %eax
 | ||||||
|  | +	incl	%eax
 | ||||||
|  | +	jnz	L(8x_return_vec_2)
 | ||||||
|  | +	/* Check last VEC.  */
 | ||||||
|  | +	.p2align 4
 | ||||||
|  | +L(8x_last_1x_vec):
 | ||||||
|  | +	vmovdqu	(VEC_SIZE * 3)(%rsi, %rdx), %ymm4
 | ||||||
|  | +	VPCMPEQ	(VEC_SIZE * 3)(%rdx), %ymm4, %ymm4
 | ||||||
|  | +	vpmovmskb %ymm4, %eax
 | ||||||
|  | +	incl	%eax
 | ||||||
|  | +	jnz	L(8x_return_vec_3)
 | ||||||
|  |  	VZEROUPPER_RETURN | ||||||
|  |   | ||||||
|  |  	.p2align 4 | ||||||
|  | -L(first_vec):
 | ||||||
|  | -	/* A byte or int32 is different within 16 or 32 bytes.  */
 | ||||||
|  | -	tzcntl	%eax, %ecx
 | ||||||
|  | +L(last_2x_vec):
 | ||||||
|  | +	/* Check second to last VEC.  */
 | ||||||
|  | +	vmovdqu	-(VEC_SIZE * 2)(%rsi, %rdx), %ymm1
 | ||||||
|  | +	VPCMPEQ	-(VEC_SIZE * 2)(%rdi, %rdx), %ymm1, %ymm1
 | ||||||
|  | +	vpmovmskb %ymm1, %eax
 | ||||||
|  | +	incl	%eax
 | ||||||
|  | +	jnz	L(return_vec_1_end)
 | ||||||
|  | +	/* Check last VEC.  */
 | ||||||
|  | +L(last_1x_vec):
 | ||||||
|  | +	vmovdqu	-(VEC_SIZE * 1)(%rsi, %rdx), %ymm1
 | ||||||
|  | +	VPCMPEQ	-(VEC_SIZE * 1)(%rdi, %rdx), %ymm1, %ymm1
 | ||||||
|  | +	vpmovmskb %ymm1, %eax
 | ||||||
|  | +	incl	%eax
 | ||||||
|  | +	jnz	L(return_vec_0_end)
 | ||||||
|  | +	VZEROUPPER_RETURN
 | ||||||
|  | +
 | ||||||
|  | +	.p2align 4
 | ||||||
|  | +L(8x_return_vec_2):
 | ||||||
|  | +	subq	$VEC_SIZE, %rdx
 | ||||||
|  | +L(8x_return_vec_3):
 | ||||||
|  | +	tzcntl	%eax, %eax
 | ||||||
|  | +	addq	%rdx, %rax
 | ||||||
|  |  # ifdef USE_AS_WMEMCMP | ||||||
|  | -	xorl	%eax, %eax
 | ||||||
|  | -	movl	(%rdi, %rcx), %edx
 | ||||||
|  | -	cmpl	(%rsi, %rcx), %edx
 | ||||||
|  | -L(wmemcmp_return):
 | ||||||
|  | -	setl	%al
 | ||||||
|  | -	negl	%eax
 | ||||||
|  | -	orl	$1, %eax
 | ||||||
|  | +	movl	(VEC_SIZE * 3)(%rax), %ecx
 | ||||||
|  | +	xorl	%edx, %edx
 | ||||||
|  | +	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
 | ||||||
|  | +	setg	%dl
 | ||||||
|  | +	leal	-1(%rdx, %rdx), %eax
 | ||||||
|  |  # else | ||||||
|  | -	movzbl	(%rdi, %rcx), %eax
 | ||||||
|  | -	movzbl	(%rsi, %rcx), %edx
 | ||||||
|  | -	sub	%edx, %eax
 | ||||||
|  | +	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
 | ||||||
|  | +	movzbl	(VEC_SIZE * 3)(%rax), %eax
 | ||||||
|  | +	subl	%ecx, %eax
 | ||||||
|  |  # endif | ||||||
|  |  	VZEROUPPER_RETURN | ||||||
|  |   | ||||||
|  | -# ifdef USE_AS_WMEMCMP
 | ||||||
|  |  	.p2align 4 | ||||||
|  | -L(4):
 | ||||||
|  | -	xorl	%eax, %eax
 | ||||||
|  | -	movl	(%rdi), %edx
 | ||||||
|  | -	cmpl	(%rsi), %edx
 | ||||||
|  | -	jne	L(wmemcmp_return)
 | ||||||
|  | -	ret
 | ||||||
|  | +L(return_vec_1_end):
 | ||||||
|  | +	tzcntl	%eax, %eax
 | ||||||
|  | +	addl	%edx, %eax
 | ||||||
|  | +# ifdef USE_AS_WMEMCMP
 | ||||||
|  | +	movl	-(VEC_SIZE * 2)(%rdi, %rax), %ecx
 | ||||||
|  | +	xorl	%edx, %edx
 | ||||||
|  | +	cmpl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
 | ||||||
|  | +	setg	%dl
 | ||||||
|  | +	leal	-1(%rdx, %rdx), %eax
 | ||||||
|  |  # else | ||||||
|  | +	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
 | ||||||
|  | +	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
 | ||||||
|  | +	subl	%ecx, %eax
 | ||||||
|  | +# endif
 | ||||||
|  | +	VZEROUPPER_RETURN
 | ||||||
|  | +
 | ||||||
|  |  	.p2align 4 | ||||||
|  | -L(between_4_7):
 | ||||||
|  | -	/* Load as big endian with overlapping movbe to avoid branches.  */
 | ||||||
|  | -	movbe	(%rdi), %eax
 | ||||||
|  | -	movbe	(%rsi), %ecx
 | ||||||
|  | -	shlq	$32, %rax
 | ||||||
|  | -	shlq	$32, %rcx
 | ||||||
|  | -	movbe	-4(%rdi, %rdx), %edi
 | ||||||
|  | -	movbe	-4(%rsi, %rdx), %esi
 | ||||||
|  | -	orq	%rdi, %rax
 | ||||||
|  | -	orq	%rsi, %rcx
 | ||||||
|  | -	subq	%rcx, %rax
 | ||||||
|  | -	je	L(exit)
 | ||||||
|  | -	sbbl	%eax, %eax
 | ||||||
|  | -	orl	$1, %eax
 | ||||||
|  | -	ret
 | ||||||
|  | +L(return_vec_0_end):
 | ||||||
|  | +	tzcntl	%eax, %eax
 | ||||||
|  | +	addl	%edx, %eax
 | ||||||
|  | +# ifdef USE_AS_WMEMCMP
 | ||||||
|  | +	movl	-VEC_SIZE(%rdi, %rax), %ecx
 | ||||||
|  | +	xorl	%edx, %edx
 | ||||||
|  | +	cmpl	-VEC_SIZE(%rsi, %rax), %ecx
 | ||||||
|  | +	setg	%dl
 | ||||||
|  | +	leal	-1(%rdx, %rdx), %eax
 | ||||||
|  | +# else
 | ||||||
|  | +	movzbl	-VEC_SIZE(%rsi, %rax), %ecx
 | ||||||
|  | +	movzbl	-VEC_SIZE(%rdi, %rax), %eax
 | ||||||
|  | +	subl	%ecx, %eax
 | ||||||
|  | +# endif
 | ||||||
|  | +	VZEROUPPER_RETURN
 | ||||||
|  |   | ||||||
|  |  	.p2align 4 | ||||||
|  | -L(exit):
 | ||||||
|  | -	ret
 | ||||||
|  | +L(less_vec):
 | ||||||
|  | +	/* Check if one or less CHAR. This is necessary for size = 0 but
 | ||||||
|  | +	   is also faster for size = CHAR_SIZE.  */
 | ||||||
|  | +	cmpl	$CHAR_SIZE, %edx
 | ||||||
|  | +	jbe	L(one_or_less)
 | ||||||
|  | +
 | ||||||
|  | +	/* Check if loading one VEC from either s1 or s2 could cause a
 | ||||||
|  | +	   page cross. This can have false positives but is by far the
 | ||||||
|  | +	   fastest method.  */
 | ||||||
|  | +	movl	%edi, %eax
 | ||||||
|  | +	orl	%esi, %eax
 | ||||||
|  | +	andl	$(PAGE_SIZE - 1), %eax
 | ||||||
|  | +	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 | ||||||
|  | +	jg	L(page_cross_less_vec)
 | ||||||
|  | +
 | ||||||
|  | +	/* No page cross possible.  */
 | ||||||
|  | +	vmovdqu	(%rsi), %ymm2
 | ||||||
|  | +	VPCMPEQ	(%rdi), %ymm2, %ymm2
 | ||||||
|  | +	vpmovmskb %ymm2, %eax
 | ||||||
|  | +	incl	%eax
 | ||||||
|  | +	/* Result will be zero if s1 and s2 match. Otherwise first set
 | ||||||
|  | +	   bit will be first mismatch.  */
 | ||||||
|  | +	bzhil	%edx, %eax, %edx
 | ||||||
|  | +	jnz	L(return_vec_0)
 | ||||||
|  | +	xorl	%eax, %eax
 | ||||||
|  | +	VZEROUPPER_RETURN
 | ||||||
|  |   | ||||||
|  |  	.p2align 4 | ||||||
|  | -L(between_2_3):
 | ||||||
|  | +L(page_cross_less_vec):
 | ||||||
|  | +	/* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
 | ||||||
|  | +	   bytes.  */
 | ||||||
|  | +	cmpl	$16, %edx
 | ||||||
|  | +	jae	L(between_16_31)
 | ||||||
|  | +# ifndef USE_AS_WMEMCMP
 | ||||||
|  | +	cmpl	$8, %edx
 | ||||||
|  | +	jae	L(between_8_15)
 | ||||||
|  | +	cmpl	$4, %edx
 | ||||||
|  | +	jae	L(between_4_7)
 | ||||||
|  | +
 | ||||||
|  |  	/* Load as big endian to avoid branches.  */ | ||||||
|  |  	movzwl	(%rdi), %eax | ||||||
|  |  	movzwl	(%rsi), %ecx | ||||||
|  | @@ -208,223 +439,106 @@ L(between_2_3):
 | ||||||
|  |  	shll	$8, %ecx | ||||||
|  |  	bswap	%eax | ||||||
|  |  	bswap	%ecx | ||||||
|  | -	movb	-1(%rdi, %rdx), %al
 | ||||||
|  | -	movb	-1(%rsi, %rdx), %cl
 | ||||||
|  | +	movzbl	-1(%rdi, %rdx), %edi
 | ||||||
|  | +	movzbl	-1(%rsi, %rdx), %esi
 | ||||||
|  | +	orl	%edi, %eax
 | ||||||
|  | +	orl	%esi, %ecx
 | ||||||
|  |  	/* Subtraction is okay because the upper 8 bits are zero.  */ | ||||||
|  |  	subl	%ecx, %eax | ||||||
|  | +	/* No ymm register was touched.  */
 | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  |  	.p2align 4 | ||||||
|  | -L(1):
 | ||||||
|  | -	movzbl	(%rdi), %eax
 | ||||||
|  | +L(one_or_less):
 | ||||||
|  | +	jb	L(zero)
 | ||||||
|  |  	movzbl	(%rsi), %ecx | ||||||
|  | +	movzbl	(%rdi), %eax
 | ||||||
|  |  	subl	%ecx, %eax | ||||||
|  | -	ret
 | ||||||
|  | -# endif
 | ||||||
|  | -
 | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(zero):
 | ||||||
|  | -	xorl	%eax, %eax
 | ||||||
|  | +	/* No ymm register was touched.  */
 | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  |  	.p2align 4 | ||||||
|  | -L(less_vec):
 | ||||||
|  | -# ifdef USE_AS_WMEMCMP
 | ||||||
|  | -	/* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes.  */
 | ||||||
|  | -	cmpb	$4, %dl
 | ||||||
|  | -	je	L(4)
 | ||||||
|  | -	jb	L(zero)
 | ||||||
|  | -# else
 | ||||||
|  | -	cmpb	$1, %dl
 | ||||||
|  | -	je	L(1)
 | ||||||
|  | -	jb	L(zero)
 | ||||||
|  | -	cmpb	$4, %dl
 | ||||||
|  | -	jb	L(between_2_3)
 | ||||||
|  | -	cmpb	$8, %dl
 | ||||||
|  | -	jb	L(between_4_7)
 | ||||||
|  | +L(between_8_15):
 | ||||||
|  |  # endif | ||||||
|  | -	cmpb	$16, %dl
 | ||||||
|  | -	jae	L(between_16_31)
 | ||||||
|  | -	/* It is between 8 and 15 bytes.  */
 | ||||||
|  | +	/* If USE_AS_WMEMCMP fall through into 8-15 byte case.  */
 | ||||||
|  |  	vmovq	(%rdi), %xmm1 | ||||||
|  |  	vmovq	(%rsi), %xmm2 | ||||||
|  | -	VPCMPEQ %xmm1, %xmm2, %xmm2
 | ||||||
|  | +	VPCMPEQ	%xmm1, %xmm2, %xmm2
 | ||||||
|  |  	vpmovmskb %xmm2, %eax | ||||||
|  | -	subl    $0xffff, %eax
 | ||||||
|  | -	jnz	L(first_vec)
 | ||||||
|  | +	subl	$0xffff, %eax
 | ||||||
|  | +	jnz	L(return_vec_0)
 | ||||||
|  |  	/* Use overlapping loads to avoid branches.  */ | ||||||
|  |  	leaq	-8(%rdi, %rdx), %rdi | ||||||
|  |  	leaq	-8(%rsi, %rdx), %rsi | ||||||
|  |  	vmovq	(%rdi), %xmm1 | ||||||
|  |  	vmovq	(%rsi), %xmm2 | ||||||
|  | -	VPCMPEQ %xmm1, %xmm2, %xmm2
 | ||||||
|  | +	VPCMPEQ	%xmm1, %xmm2, %xmm2
 | ||||||
|  |  	vpmovmskb %xmm2, %eax | ||||||
|  | -	subl    $0xffff, %eax
 | ||||||
|  | -	jnz	L(first_vec)
 | ||||||
|  | +	subl	$0xffff, %eax
 | ||||||
|  | +	jnz	L(return_vec_0)
 | ||||||
|  | +	/* No ymm register was touched.  */
 | ||||||
|  | +	ret
 | ||||||
|  | +
 | ||||||
|  | +	.p2align 4
 | ||||||
|  | +L(zero):
 | ||||||
|  | +	xorl	%eax, %eax
 | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  |  	.p2align 4 | ||||||
|  |  L(between_16_31): | ||||||
|  |  	/* From 16 to 31 bytes.  No branch when size == 16.  */ | ||||||
|  |  	vmovdqu	(%rsi), %xmm2 | ||||||
|  | -	VPCMPEQ (%rdi), %xmm2, %xmm2
 | ||||||
|  | +	VPCMPEQ	(%rdi), %xmm2, %xmm2
 | ||||||
|  |  	vpmovmskb %xmm2, %eax | ||||||
|  | -	subl    $0xffff, %eax
 | ||||||
|  | -	jnz	L(first_vec)
 | ||||||
|  | +	subl	$0xffff, %eax
 | ||||||
|  | +	jnz	L(return_vec_0)
 | ||||||
|  |   | ||||||
|  |  	/* Use overlapping loads to avoid branches.  */ | ||||||
|  | +
 | ||||||
|  | +	vmovdqu	-16(%rsi, %rdx), %xmm2
 | ||||||
|  |  	leaq	-16(%rdi, %rdx), %rdi | ||||||
|  |  	leaq	-16(%rsi, %rdx), %rsi | ||||||
|  | -	vmovdqu	(%rsi), %xmm2
 | ||||||
|  | -	VPCMPEQ (%rdi), %xmm2, %xmm2
 | ||||||
|  | +	VPCMPEQ	(%rdi), %xmm2, %xmm2
 | ||||||
|  |  	vpmovmskb %xmm2, %eax | ||||||
|  | -	subl    $0xffff, %eax
 | ||||||
|  | -	jnz	L(first_vec)
 | ||||||
|  | +	subl	$0xffff, %eax
 | ||||||
|  | +	jnz	L(return_vec_0)
 | ||||||
|  | +	/* No ymm register was touched.  */
 | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(more_8x_vec):
 | ||||||
|  | -	/* More than 8 * VEC.  Check the first VEC.  */
 | ||||||
|  | -	vmovdqu	(%rsi), %ymm2
 | ||||||
|  | -	VPCMPEQ (%rdi), %ymm2, %ymm2
 | ||||||
|  | -	vpmovmskb %ymm2, %eax
 | ||||||
|  | -	subl    $VEC_MASK, %eax
 | ||||||
|  | -	jnz	L(first_vec)
 | ||||||
|  | -
 | ||||||
|  | -	/* Align the first memory area for aligned loads in the loop.
 | ||||||
|  | -	   Compute how much the first memory area is misaligned.  */
 | ||||||
|  | -	movq	%rdi, %rcx
 | ||||||
|  | -	andl	$(VEC_SIZE - 1), %ecx
 | ||||||
|  | -	/* Get the negative of offset for alignment.  */
 | ||||||
|  | -	subq	$VEC_SIZE, %rcx
 | ||||||
|  | -	/* Adjust the second memory area.  */
 | ||||||
|  | -	subq	%rcx, %rsi
 | ||||||
|  | -	/* Adjust the first memory area which should be aligned now.  */
 | ||||||
|  | -	subq	%rcx, %rdi
 | ||||||
|  | -	/* Adjust length.  */
 | ||||||
|  | -	addq	%rcx, %rdx
 | ||||||
|  | -
 | ||||||
|  | -L(loop_4x_vec):
 | ||||||
|  | -	/* Compare 4 * VEC at a time forward.  */
 | ||||||
|  | -	vmovdqu	(%rsi), %ymm1
 | ||||||
|  | -	VPCMPEQ (%rdi), %ymm1, %ymm1
 | ||||||
|  | -
 | ||||||
|  | -	vmovdqu	VEC_SIZE(%rsi), %ymm2
 | ||||||
|  | -	VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
 | ||||||
|  | -	vpand	%ymm2, %ymm1, %ymm5
 | ||||||
|  | -
 | ||||||
|  | -	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
 | ||||||
|  | -	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
 | ||||||
|  | -	vpand	%ymm3, %ymm5, %ymm5
 | ||||||
|  | -
 | ||||||
|  | -	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm4
 | ||||||
|  | -	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
 | ||||||
|  | -	vpand	%ymm4, %ymm5, %ymm5
 | ||||||
|  | -
 | ||||||
|  | -	vptest	%ymm0, %ymm5
 | ||||||
|  | -	jnc	L(4x_vec_end)
 | ||||||
|  | -
 | ||||||
|  | -	addq	$(VEC_SIZE * 4), %rdi
 | ||||||
|  | -	addq	$(VEC_SIZE * 4), %rsi
 | ||||||
|  | -
 | ||||||
|  | -	subq	$(VEC_SIZE * 4), %rdx
 | ||||||
|  | -	cmpq	$(VEC_SIZE * 4), %rdx
 | ||||||
|  | -	jae	L(loop_4x_vec)
 | ||||||
|  | -
 | ||||||
|  | -	/* Less than 4 * VEC.  */
 | ||||||
|  | -	cmpq	$VEC_SIZE, %rdx
 | ||||||
|  | -	jbe	L(last_vec)
 | ||||||
|  | -	cmpq	$(VEC_SIZE * 2), %rdx
 | ||||||
|  | -	jbe	L(last_2x_vec)
 | ||||||
|  | -
 | ||||||
|  | -L(last_4x_vec):
 | ||||||
|  | -	/* From 2 * VEC to 4 * VEC. */
 | ||||||
|  | -	vmovdqu	(%rsi), %ymm2
 | ||||||
|  | -	VPCMPEQ (%rdi), %ymm2, %ymm2
 | ||||||
|  | -	vpmovmskb %ymm2, %eax
 | ||||||
|  | -	subl    $VEC_MASK, %eax
 | ||||||
|  | -	jnz	L(first_vec)
 | ||||||
|  | -
 | ||||||
|  | -	addq	$VEC_SIZE, %rdi
 | ||||||
|  | -	addq	$VEC_SIZE, %rsi
 | ||||||
|  | -	vmovdqu	(%rsi), %ymm2
 | ||||||
|  | -	VPCMPEQ (%rdi), %ymm2, %ymm2
 | ||||||
|  | -	vpmovmskb %ymm2, %eax
 | ||||||
|  | -	subl    $VEC_MASK, %eax
 | ||||||
|  | -	jnz	L(first_vec)
 | ||||||
|  | -
 | ||||||
|  | -	/* Use overlapping loads to avoid branches.  */
 | ||||||
|  | -	leaq	-(3 * VEC_SIZE)(%rdi, %rdx), %rdi
 | ||||||
|  | -	leaq	-(3 * VEC_SIZE)(%rsi, %rdx), %rsi
 | ||||||
|  | -	vmovdqu	(%rsi), %ymm2
 | ||||||
|  | -	VPCMPEQ (%rdi), %ymm2, %ymm2
 | ||||||
|  | -	vpmovmskb %ymm2, %eax
 | ||||||
|  | -	subl    $VEC_MASK, %eax
 | ||||||
|  | -	jnz	L(first_vec)
 | ||||||
|  | -
 | ||||||
|  | -	addq	$VEC_SIZE, %rdi
 | ||||||
|  | -	addq	$VEC_SIZE, %rsi
 | ||||||
|  | -	vmovdqu	(%rsi), %ymm2
 | ||||||
|  | -	VPCMPEQ (%rdi), %ymm2, %ymm2
 | ||||||
|  | -	vpmovmskb %ymm2, %eax
 | ||||||
|  | -	subl    $VEC_MASK, %eax
 | ||||||
|  | -	jnz	L(first_vec)
 | ||||||
|  | -	VZEROUPPER_RETURN
 | ||||||
|  | -
 | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(4x_vec_end):
 | ||||||
|  | -	vpmovmskb %ymm1, %eax
 | ||||||
|  | -	subl	$VEC_MASK, %eax
 | ||||||
|  | -	jnz	L(first_vec)
 | ||||||
|  | -	vpmovmskb %ymm2, %eax
 | ||||||
|  | -	subl	$VEC_MASK, %eax
 | ||||||
|  | -	jnz	L(first_vec_x1)
 | ||||||
|  | -	vpmovmskb %ymm3, %eax
 | ||||||
|  | -	subl	$VEC_MASK, %eax
 | ||||||
|  | -	jnz	L(first_vec_x2)
 | ||||||
|  | -	vpmovmskb %ymm4, %eax
 | ||||||
|  | -	subl	$VEC_MASK, %eax
 | ||||||
|  | -	tzcntl	%eax, %ecx
 | ||||||
|  |  # ifdef USE_AS_WMEMCMP | ||||||
|  | -	xorl	%eax, %eax
 | ||||||
|  | -	movl	(VEC_SIZE * 3)(%rdi, %rcx), %edx
 | ||||||
|  | -	cmpl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
 | ||||||
|  | -	jmp	L(wmemcmp_return)
 | ||||||
|  | -# else
 | ||||||
|  | -	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
 | ||||||
|  | -	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
 | ||||||
|  | -	sub	%edx, %eax
 | ||||||
|  | -# endif
 | ||||||
|  | -	VZEROUPPER_RETURN
 | ||||||
|  | -
 | ||||||
|  |  	.p2align 4 | ||||||
|  | -L(first_vec_x1):
 | ||||||
|  | -	tzcntl	%eax, %ecx
 | ||||||
|  | -# ifdef USE_AS_WMEMCMP
 | ||||||
|  | -	xorl	%eax, %eax
 | ||||||
|  | -	movl	VEC_SIZE(%rdi, %rcx), %edx
 | ||||||
|  | -	cmpl	VEC_SIZE(%rsi, %rcx), %edx
 | ||||||
|  | -	jmp	L(wmemcmp_return)
 | ||||||
|  | +L(one_or_less):
 | ||||||
|  | +	jb	L(zero)
 | ||||||
|  | +	movl	(%rdi), %ecx
 | ||||||
|  | +	xorl	%edx, %edx
 | ||||||
|  | +	cmpl	(%rsi), %ecx
 | ||||||
|  | +	je	L(zero)
 | ||||||
|  | +	setg	%dl
 | ||||||
|  | +	leal	-1(%rdx, %rdx), %eax
 | ||||||
|  | +	/* No ymm register was touched.  */
 | ||||||
|  | +	ret
 | ||||||
|  |  # else | ||||||
|  | -	movzbl	VEC_SIZE(%rdi, %rcx), %eax
 | ||||||
|  | -	movzbl	VEC_SIZE(%rsi, %rcx), %edx
 | ||||||
|  | -	sub	%edx, %eax
 | ||||||
|  | -# endif
 | ||||||
|  | -	VZEROUPPER_RETURN
 | ||||||
|  |   | ||||||
|  |  	.p2align 4 | ||||||
|  | -L(first_vec_x2):
 | ||||||
|  | -	tzcntl	%eax, %ecx
 | ||||||
|  | -# ifdef USE_AS_WMEMCMP
 | ||||||
|  | -	xorl	%eax, %eax
 | ||||||
|  | -	movl	(VEC_SIZE * 2)(%rdi, %rcx), %edx
 | ||||||
|  | -	cmpl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
 | ||||||
|  | -	jmp	L(wmemcmp_return)
 | ||||||
|  | -# else
 | ||||||
|  | -	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
 | ||||||
|  | -	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
 | ||||||
|  | -	sub	%edx, %eax
 | ||||||
|  | +L(between_4_7):
 | ||||||
|  | +	/* Load as big endian with overlapping movbe to avoid branches.
 | ||||||
|  | +	 */
 | ||||||
|  | +	movbe	(%rdi), %eax
 | ||||||
|  | +	movbe	(%rsi), %ecx
 | ||||||
|  | +	shlq	$32, %rax
 | ||||||
|  | +	shlq	$32, %rcx
 | ||||||
|  | +	movbe	-4(%rdi, %rdx), %edi
 | ||||||
|  | +	movbe	-4(%rsi, %rdx), %esi
 | ||||||
|  | +	orq	%rdi, %rax
 | ||||||
|  | +	orq	%rsi, %rcx
 | ||||||
|  | +	subq	%rcx, %rax
 | ||||||
|  | +	jz	L(zero_4_7)
 | ||||||
|  | +	sbbl	%eax, %eax
 | ||||||
|  | +	orl	$1, %eax
 | ||||||
|  | +L(zero_4_7):
 | ||||||
|  | +	/* No ymm register was touched.  */
 | ||||||
|  | +	ret
 | ||||||
|  |  # endif | ||||||
|  | -	VZEROUPPER_RETURN
 | ||||||
|  | +
 | ||||||
|  |  END (MEMCMP) | ||||||
|  |  #endif | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
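Patch 45's L(between_4_7) path (which the hunk above moves rather than changes) packs two overlapping big-endian loads into one 64-bit value per input, so sizes 4 to 7 need no length branches at all. The sketch below is an illustrative C reimplementation under the assumption of a little-endian host, not the glibc source: __builtin_bswap32 (a GCC/Clang builtin) stands in for the movbe/bswap instructions, and the final ternary mirrors the sbbl/orl idiom that turns the borrow flag into -1 or 1.

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Big-endian 4-byte load: memcpy keeps it alignment-safe, and the
       byte swap mimics movbe/bswap (assumes a little-endian host).  */
    static uint32_t
    load32_be (const unsigned char *p)
    {
      uint32_t v;
      memcpy (&v, p, 4);
      return __builtin_bswap32 (v);
    }

    /* Branchless compare for sizes 4..7, modeled on L(between_4_7).  */
    static int
    memcmp_4_7 (const unsigned char *s1, const unsigned char *s2, size_t n)
    {
      /* First 4 bytes go in the high half, last 4 (overlapping) in the
         low half: one 64-bit value orders the whole 4..7 byte range.  */
      uint64_t a = ((uint64_t) load32_be (s1) << 32) | load32_be (s1 + n - 4);
      uint64_t b = ((uint64_t) load32_be (s2) << 32) | load32_be (s2 + n - 4);
      if (a == b)
        return 0;
      return a > b ? 1 : -1;	/* the sbbl %eax,%eax; orl $1,%eax idiom */
    }

The overlap is the key design choice: for n in 4..7 the two 4-byte loads together cover every byte of the range, with the leading bytes taking priority in the high half, so a single 64-bit comparison yields the same ordering as a byte-by-byte memcmp.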
							
								
								
									
851	SOURCES/glibc-RHEL-15696-46.patch	Normal file
							| @ -0,0 +1,851 @@ | |||||||
|  | From 4ad473e97acdc5f6d811755b67c09f2128a644ce Mon Sep 17 00:00:00 2001 | ||||||
|  | From: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  | Date: Mon, 17 May 2021 13:57:24 -0400 | ||||||
|  | Subject: [PATCH] x86: Optimize memcmp-evex-movbe.S | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | No bug. This commit optimizes memcmp-evex.S. The optimizations include | ||||||
|  | adding a new vec compare path for small sizes, reorganizing the entry | ||||||
|  | control flow, removing some unnecessary ALU instructions from the main | ||||||
|  | loop, and most importantly replacing the heavy use of vpcmp + kand | ||||||
|  | logic with vpxor + vptern. test-memcmp and test-wmemcmp are both | ||||||
|  | passing. | ||||||
|  | 
 | ||||||
|  | Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  | Reviewed-by: H.J. Lu <hjl.tools@gmail.com> | ||||||
|  | ---
 | ||||||
|  |  sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 710 +++++++++++-------- | ||||||
|  |  1 file changed, 408 insertions(+), 302 deletions(-) | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
 | ||||||
|  | index 9c093972..654dc7ac 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
 | ||||||
|  | @@ -19,17 +19,22 @@
 | ||||||
|  |  #if IS_IN (libc) | ||||||
|  |   | ||||||
|  |  /* memcmp/wmemcmp is implemented as: | ||||||
|  | -   1. For size from 2 to 7 bytes, load as big endian with movbe and bswap
 | ||||||
|  | -      to avoid branches.
 | ||||||
|  | -   2. Use overlapping compare to avoid branch.
 | ||||||
|  | -   3. Use vector compare when size >= 4 bytes for memcmp or size >= 8
 | ||||||
|  | -      bytes for wmemcmp.
 | ||||||
|  | -   4. If size is 8 * VEC_SIZE or less, unroll the loop.
 | ||||||
|  | -   5. Compare 4 * VEC_SIZE at a time with the aligned first memory
 | ||||||
|  | +   1. Use ymm vector compares when possible. The only case where
 | ||||||
|  | +      vector compares are not possible is when size < CHAR_PER_VEC
 | ||||||
|  | +      and loading from either s1 or s2 would cause a page cross.
 | ||||||
|  | +   2. For size from 2 to 7 bytes on page cross, load as big endian
 | ||||||
|  | +      with movbe and bswap to avoid branches.
 | ||||||
|  | +   3. Use xmm vector compare when size >= 4 bytes for memcmp or
 | ||||||
|  | +      size >= 8 bytes for wmemcmp.
 | ||||||
|  | +   4. Optimistically compare up to the first 4 * CHAR_PER_VEC one at
 | ||||||
|  | +      a time to check for early mismatches. Only do this if it's
 | ||||||
|  | +      guaranteed the work is not wasted.
 | ||||||
|  | +   5. If size is 8 * VEC_SIZE or less, unroll the loop.
 | ||||||
|  | +   6. Compare 4 * VEC_SIZE at a time with the aligned first memory
 | ||||||
|  |        area. | ||||||
|  | -   6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
 | ||||||
|  | -   7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
 | ||||||
|  | -   8. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
 | ||||||
|  | +   7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less.
 | ||||||
|  | +   8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less.
 | ||||||
|  | +   9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less.  */
 | ||||||
|  |   | ||||||
|  |  # include <sysdep.h> | ||||||
|  |   | ||||||
|  | @@ -40,11 +45,21 @@
 | ||||||
|  |  # define VMOVU		vmovdqu64 | ||||||
|  |   | ||||||
|  |  # ifdef USE_AS_WMEMCMP | ||||||
|  | -#  define VPCMPEQ	vpcmpeqd
 | ||||||
|  | +#  define CHAR_SIZE	4
 | ||||||
|  | +#  define VPCMP	vpcmpd
 | ||||||
|  |  # else | ||||||
|  | -#  define VPCMPEQ	vpcmpeqb
 | ||||||
|  | +#  define CHAR_SIZE	1
 | ||||||
|  | +#  define VPCMP	vpcmpub
 | ||||||
|  |  # endif | ||||||
|  |   | ||||||
|  | +# define VEC_SIZE	32
 | ||||||
|  | +# define PAGE_SIZE	4096
 | ||||||
|  | +# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
 | ||||||
|  | +
 | ||||||
|  | +# define XMM0		xmm16
 | ||||||
|  | +# define XMM1		xmm17
 | ||||||
|  | +# define XMM2		xmm18
 | ||||||
|  | +# define YMM0		ymm16
 | ||||||
|  |  # define XMM1		xmm17 | ||||||
|  |  # define XMM2		xmm18 | ||||||
|  |  # define YMM1		ymm17 | ||||||
|  | @@ -54,15 +69,6 @@
 | ||||||
|  |  # define YMM5		ymm21 | ||||||
|  |  # define YMM6		ymm22 | ||||||
|  |   | ||||||
|  | -# define VEC_SIZE 32
 | ||||||
|  | -# ifdef USE_AS_WMEMCMP
 | ||||||
|  | -#  define VEC_MASK 0xff
 | ||||||
|  | -#  define XMM_MASK 0xf
 | ||||||
|  | -# else
 | ||||||
|  | -#  define VEC_MASK 0xffffffff
 | ||||||
|  | -#  define XMM_MASK 0xffff
 | ||||||
|  | -# endif
 | ||||||
|  | -
 | ||||||
|  |  /* Warning! | ||||||
|  |             wmemcmp has to use SIGNED comparison for elements. | ||||||
|  |             memcmp has to use UNSIGNED comparison for elemnts. | ||||||
|  | @@ -70,145 +76,370 @@
 | ||||||
|  |   | ||||||
|  |  	.section .text.evex,"ax",@progbits | ||||||
|  |  ENTRY (MEMCMP) | ||||||
|  | -# ifdef USE_AS_WMEMCMP
 | ||||||
|  | -	shl	$2, %RDX_LP
 | ||||||
|  | -# elif defined __ILP32__
 | ||||||
|  | +# ifdef __ILP32__
 | ||||||
|  |  	/* Clear the upper 32 bits.  */ | ||||||
|  |  	movl	%edx, %edx | ||||||
|  |  # endif | ||||||
|  | -	cmp	$VEC_SIZE, %RDX_LP
 | ||||||
|  | +	cmp	$CHAR_PER_VEC, %RDX_LP
 | ||||||
|  |  	jb	L(less_vec) | ||||||
|  |   | ||||||
|  |  	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */ | ||||||
|  | -	VMOVU	(%rsi), %YMM2
 | ||||||
|  | -	VPCMPEQ (%rdi), %YMM2, %k1
 | ||||||
|  | +	VMOVU	(%rsi), %YMM1
 | ||||||
|  | +	/* Use compare not equals to directly check for mismatch.  */
 | ||||||
|  | +	VPCMP	$4, (%rdi), %YMM1, %k1
 | ||||||
|  |  	kmovd	%k1, %eax | ||||||
|  | -	subl    $VEC_MASK, %eax
 | ||||||
|  | -	jnz	L(first_vec)
 | ||||||
|  | -
 | ||||||
|  | -	cmpq	$(VEC_SIZE * 2), %rdx
 | ||||||
|  | -	jbe	L(last_vec)
 | ||||||
|  | -
 | ||||||
|  | -	/* More than 2 * VEC.  */
 | ||||||
|  | -	cmpq	$(VEC_SIZE * 8), %rdx
 | ||||||
|  | -	ja	L(more_8x_vec)
 | ||||||
|  | -	cmpq	$(VEC_SIZE * 4), %rdx
 | ||||||
|  | -	jb	L(last_4x_vec)
 | ||||||
|  | +	/* NB: eax must be destination register if going to
 | ||||||
|  | +	   L(return_vec_[0,2]). For L(return_vec_3) the destination register
 | ||||||
|  | +	   must be ecx.  */
 | ||||||
|  | +	testl	%eax, %eax
 | ||||||
|  | +	jnz	L(return_vec_0)
 | ||||||
|  |   | ||||||
|  | -	/* From 4 * VEC to 8 * VEC, inclusively. */
 | ||||||
|  | -	VMOVU	(%rsi), %YMM1
 | ||||||
|  | -	VPCMPEQ (%rdi), %YMM1, %k1
 | ||||||
|  | +	cmpq	$(CHAR_PER_VEC * 2), %rdx
 | ||||||
|  | +	jbe	L(last_1x_vec)
 | ||||||
|  |   | ||||||
|  | +	/* Check second VEC no matter what.  */
 | ||||||
|  |  	VMOVU	VEC_SIZE(%rsi), %YMM2 | ||||||
|  | -	VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
 | ||||||
|  | +	VPCMP	$4, VEC_SIZE(%rdi), %YMM2, %k1
 | ||||||
|  | +	kmovd	%k1, %eax
 | ||||||
|  | +	testl	%eax, %eax
 | ||||||
|  | +	jnz	L(return_vec_1)
 | ||||||
|  | +
 | ||||||
|  | +	/* Less than 4 * VEC.  */
 | ||||||
|  | +	cmpq	$(CHAR_PER_VEC * 4), %rdx
 | ||||||
|  | +	jbe	L(last_2x_vec)
 | ||||||
|  |   | ||||||
|  | +	/* Check third and fourth VEC no matter what.  */
 | ||||||
|  |  	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3 | ||||||
|  | -	VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
 | ||||||
|  | +	VPCMP	$4, (VEC_SIZE * 2)(%rdi), %YMM3, %k1
 | ||||||
|  | +	kmovd	%k1, %eax
 | ||||||
|  | +	testl	%eax, %eax
 | ||||||
|  | +	jnz	L(return_vec_2)
 | ||||||
|  |   | ||||||
|  |  	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4 | ||||||
|  | -	VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
 | ||||||
|  | +	VPCMP	$4, (VEC_SIZE * 3)(%rdi), %YMM4, %k1
 | ||||||
|  | +	kmovd	%k1, %ecx
 | ||||||
|  | +	testl	%ecx, %ecx
 | ||||||
|  | +	jnz	L(return_vec_3)
 | ||||||
|  |   | ||||||
|  | -	kandd	%k1, %k2, %k5
 | ||||||
|  | -	kandd	%k3, %k4, %k6
 | ||||||
|  | -	kandd	%k5, %k6, %k6
 | ||||||
|  | +	/* Zero YMM0. 4x VEC reduction is done with vpxor + vtern so a
 | ||||||
|  | +	   compare with zero is needed to get a mask.  */
 | ||||||
|  | +	vpxorq	%XMM0, %XMM0, %XMM0
 | ||||||
|  |   | ||||||
|  | -	kmovd	%k6, %eax
 | ||||||
|  | -	cmpl	$VEC_MASK, %eax
 | ||||||
|  | -	jne	L(4x_vec_end)
 | ||||||
|  | +	/* Go to 4x VEC loop.  */
 | ||||||
|  | +	cmpq	$(CHAR_PER_VEC * 8), %rdx
 | ||||||
|  | +	ja	L(more_8x_vec)
 | ||||||
|  |   | ||||||
|  | -	leaq	-(4 * VEC_SIZE)(%rdi, %rdx), %rdi
 | ||||||
|  | -	leaq	-(4 * VEC_SIZE)(%rsi, %rdx), %rsi
 | ||||||
|  | -	VMOVU	(%rsi), %YMM1
 | ||||||
|  | -	VPCMPEQ (%rdi), %YMM1, %k1
 | ||||||
|  | +	/* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
 | ||||||
|  | +	   branches.  */
 | ||||||
|  |   | ||||||
|  | -	VMOVU	VEC_SIZE(%rsi), %YMM2
 | ||||||
|  | -	VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
 | ||||||
|  | -	kandd	%k1, %k2, %k5
 | ||||||
|  | +	/* Load first two VEC from s2 before adjusting addresses.  */
 | ||||||
|  | +	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx, CHAR_SIZE), %YMM1
 | ||||||
|  | +	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %YMM2
 | ||||||
|  | +	leaq	-(4 * VEC_SIZE)(%rdi, %rdx, CHAR_SIZE), %rdi
 | ||||||
|  | +	leaq	-(4 * VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
 | ||||||
|  | +
 | ||||||
|  | +	/* Wait to load from s1 until addresses are adjusted due to
 | ||||||
|  | +	   unlamination of microfusion with complex address mode.  */
 | ||||||
|  | +
 | ||||||
|  | +	/* vpxor will be all 0s if s1 and s2 are equal. Otherwise it
 | ||||||
|  | +	   will have some 1s.  */
 | ||||||
|  | +	vpxorq	(%rdi), %YMM1, %YMM1
 | ||||||
|  | +	vpxorq	(VEC_SIZE)(%rdi), %YMM2, %YMM2
 | ||||||
|  |   | ||||||
|  |  	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3 | ||||||
|  | -	VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
 | ||||||
|  | -	kandd	%k3, %k5, %k5
 | ||||||
|  | +	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
 | ||||||
|  | +	/* Or together YMM1, YMM2, and YMM3 into YMM3.  */
 | ||||||
|  | +	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
 | ||||||
|  |   | ||||||
|  |  	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4 | ||||||
|  | -	VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
 | ||||||
|  | -	kandd	%k4, %k5, %k5
 | ||||||
|  | +	/* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
 | ||||||
|  | +	   oring with YMM3. Result is stored in YMM4.  */
 | ||||||
|  | +	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
 | ||||||
|  | +	/* Compare YMM4 with 0. If any 1s s1 and s2 don't match.  */
 | ||||||
|  | +	VPCMP	$4, %YMM4, %YMM0, %k1
 | ||||||
|  | +	kmovd	%k1, %ecx
 | ||||||
|  | +	testl	%ecx, %ecx
 | ||||||
|  | +	jnz	L(return_vec_0_1_2_3)
 | ||||||
|  | +	/* NB: eax must be zero to reach here.  */
 | ||||||
|  | +	ret
 | ||||||
|  |   | ||||||
|  | -	kmovd	%k5, %eax
 | ||||||
|  | -	cmpl	$VEC_MASK, %eax
 | ||||||
|  | -	jne	L(4x_vec_end)
 | ||||||
|  | -	xorl	%eax, %eax
 | ||||||
|  | +	/* NB: aligning 32 here allows for the rest of the jump targets
 | ||||||
|  | +	   to be tuned for 32 byte alignment. Most importantly this ensures
 | ||||||
|  | +	   the L(more_8x_vec) loop is 32 byte aligned.  */
 | ||||||
|  | +	.p2align 5
 | ||||||
|  | +L(less_vec):
 | ||||||
|  | +	/* Check if one or less CHAR. This is necessary for size = 0 but
 | ||||||
|  | +	   is also faster for size = CHAR_SIZE.  */
 | ||||||
|  | +	cmpl	$1, %edx
 | ||||||
|  | +	jbe	L(one_or_less)
 | ||||||
|  | +
 | ||||||
|  | +	/* Check if loading one VEC from either s1 or s2 could cause a
 | ||||||
|  | +	   page cross. This can have false positives but is by far the
 | ||||||
|  | +	   fastest method.  */
 | ||||||
|  | +	movl	%edi, %eax
 | ||||||
|  | +	orl	%esi, %eax
 | ||||||
|  | +	andl	$(PAGE_SIZE - 1), %eax
 | ||||||
|  | +	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 | ||||||
|  | +	jg	L(page_cross_less_vec)
 | ||||||
|  | +
 | ||||||
|  | +	/* No page cross possible.  */
 | ||||||
|  | +	VMOVU	(%rsi), %YMM2
 | ||||||
|  | +	VPCMP	$4, (%rdi), %YMM2, %k1
 | ||||||
|  | +	kmovd	%k1, %eax
 | ||||||
|  | +	/* Create mask in eax for potentially in bound matches.  */
 | ||||||
|  | +	bzhil	%edx, %eax, %eax
 | ||||||
|  | +	jnz	L(return_vec_0)
 | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  |  	.p2align 4 | ||||||
|  | -L(last_2x_vec):
 | ||||||
|  | -	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
 | ||||||
|  | -	VMOVU	(%rsi), %YMM2
 | ||||||
|  | -	VPCMPEQ (%rdi), %YMM2, %k2
 | ||||||
|  | -	kmovd	%k2, %eax
 | ||||||
|  | -	subl    $VEC_MASK, %eax
 | ||||||
|  | -	jnz	L(first_vec)
 | ||||||
|  | +L(return_vec_0):
 | ||||||
|  | +	tzcntl	%eax, %eax
 | ||||||
|  | +# ifdef USE_AS_WMEMCMP
 | ||||||
|  | +	movl	(%rdi, %rax, CHAR_SIZE), %ecx
 | ||||||
|  | +	xorl	%edx, %edx
 | ||||||
|  | +	cmpl	(%rsi, %rax, CHAR_SIZE), %ecx
 | ||||||
|  | +	/* NB: no partial register stall here because xorl zero idiom
 | ||||||
|  | +	   above.  */
 | ||||||
|  | +	setg	%dl
 | ||||||
|  | +	leal	-1(%rdx, %rdx), %eax
 | ||||||
|  | +# else
 | ||||||
|  | +	movzbl	(%rsi, %rax), %ecx
 | ||||||
|  | +	movzbl	(%rdi, %rax), %eax
 | ||||||
|  | +	subl	%ecx, %eax
 | ||||||
|  | +# endif
 | ||||||
|  | +	ret
 | ||||||
|  |   | ||||||
|  | -L(last_vec):
 | ||||||
|  | -	/* Use overlapping loads to avoid branches.  */
 | ||||||
|  | -	leaq	-VEC_SIZE(%rdi, %rdx), %rdi
 | ||||||
|  | -	leaq	-VEC_SIZE(%rsi, %rdx), %rsi
 | ||||||
|  | -	VMOVU	(%rsi), %YMM2
 | ||||||
|  | -	VPCMPEQ (%rdi), %YMM2, %k2
 | ||||||
|  | -	kmovd	%k2, %eax
 | ||||||
|  | -	subl    $VEC_MASK, %eax
 | ||||||
|  | -	jnz	L(first_vec)
 | ||||||
|  | +	/* NB: No p2align necessary. Alignment  % 16 is naturally 1
 | ||||||
|  | +	   which is good enough for a target not in a loop.  */
 | ||||||
|  | +L(return_vec_1):
 | ||||||
|  | +	tzcntl	%eax, %eax
 | ||||||
|  | +# ifdef USE_AS_WMEMCMP
 | ||||||
|  | +	movl	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
 | ||||||
|  | +	xorl	%edx, %edx
 | ||||||
|  | +	cmpl	VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
 | ||||||
|  | +	setg	%dl
 | ||||||
|  | +	leal	-1(%rdx, %rdx), %eax
 | ||||||
|  | +# else
 | ||||||
|  | +	movzbl	VEC_SIZE(%rsi, %rax), %ecx
 | ||||||
|  | +	movzbl	VEC_SIZE(%rdi, %rax), %eax
 | ||||||
|  | +	subl	%ecx, %eax
 | ||||||
|  | +# endif
 | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(first_vec):
 | ||||||
|  | -	/* A byte or int32 is different within 16 or 32 bytes.  */
 | ||||||
|  | -	tzcntl	%eax, %ecx
 | ||||||
|  | +	/* NB: No p2align necessary. Alignment  % 16 is naturally 2
 | ||||||
|  | +	   which is good enough for a target not in a loop.  */
 | ||||||
|  | +L(return_vec_2):
 | ||||||
|  | +	tzcntl	%eax, %eax
 | ||||||
|  |  # ifdef USE_AS_WMEMCMP | ||||||
|  | -	xorl	%eax, %eax
 | ||||||
|  | -	movl	(%rdi, %rcx, 4), %edx
 | ||||||
|  | -	cmpl	(%rsi, %rcx, 4), %edx
 | ||||||
|  | -L(wmemcmp_return):
 | ||||||
|  | -	setl	%al
 | ||||||
|  | -	negl	%eax
 | ||||||
|  | -	orl	$1, %eax
 | ||||||
|  | +	movl	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
 | ||||||
|  | +	xorl	%edx, %edx
 | ||||||
|  | +	cmpl	(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
 | ||||||
|  | +	setg	%dl
 | ||||||
|  | +	leal	-1(%rdx, %rdx), %eax
 | ||||||
|  |  # else | ||||||
|  | -	movzbl	(%rdi, %rcx), %eax
 | ||||||
|  | -	movzbl	(%rsi, %rcx), %edx
 | ||||||
|  | -	sub	%edx, %eax
 | ||||||
|  | +	movzbl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
 | ||||||
|  | +	movzbl	(VEC_SIZE * 2)(%rdi, %rax), %eax
 | ||||||
|  | +	subl	%ecx, %eax
 | ||||||
|  |  # endif | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  | +	.p2align 4
 | ||||||
|  | +L(8x_return_vec_0_1_2_3):
 | ||||||
|  | +	/* Returning from L(more_8x_vec) requires restoring rsi.  */
 | ||||||
|  | +	addq	%rdi, %rsi
 | ||||||
|  | +L(return_vec_0_1_2_3):
 | ||||||
|  | +	VPCMP	$4, %YMM1, %YMM0, %k0
 | ||||||
|  | +	kmovd	%k0, %eax
 | ||||||
|  | +	testl	%eax, %eax
 | ||||||
|  | +	jnz	L(return_vec_0)
 | ||||||
|  | +
 | ||||||
|  | +	VPCMP	$4, %YMM2, %YMM0, %k0
 | ||||||
|  | +	kmovd	%k0, %eax
 | ||||||
|  | +	testl	%eax, %eax
 | ||||||
|  | +	jnz	L(return_vec_1)
 | ||||||
|  | +
 | ||||||
|  | +	VPCMP	$4, %YMM3, %YMM0, %k0
 | ||||||
|  | +	kmovd	%k0, %eax
 | ||||||
|  | +	testl	%eax, %eax
 | ||||||
|  | +	jnz	L(return_vec_2)
 | ||||||
|  | +L(return_vec_3):
 | ||||||
|  | +	tzcntl	%ecx, %ecx
 | ||||||
|  |  # ifdef USE_AS_WMEMCMP | ||||||
|  | +	movl	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
 | ||||||
|  | +	xorl	%edx, %edx
 | ||||||
|  | +	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
 | ||||||
|  | +	setg	%dl
 | ||||||
|  | +	leal	-1(%rdx, %rdx), %eax
 | ||||||
|  | +# else
 | ||||||
|  | +	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
 | ||||||
|  | +	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
 | ||||||
|  | +	subl	%ecx, %eax
 | ||||||
|  | +# endif
 | ||||||
|  | +	ret
 | ||||||
|  | +
 | ||||||
|  |  	.p2align 4 | ||||||
|  | -L(4):
 | ||||||
|  | -	xorl	%eax, %eax
 | ||||||
|  | -	movl	(%rdi), %edx
 | ||||||
|  | -	cmpl	(%rsi), %edx
 | ||||||
|  | -	jne	L(wmemcmp_return)
 | ||||||
|  | +L(more_8x_vec):
 | ||||||
|  | +	/* Set end of s1 in rdx.  */
 | ||||||
|  | +	leaq	-(VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rdx
 | ||||||
|  | +	/* rsi stores s2 - s1. This allows loop to only update one
 | ||||||
|  | +	   pointer.  */
 | ||||||
|  | +	subq	%rdi, %rsi
 | ||||||
|  | +	/* Align s1 pointer.  */
 | ||||||
|  | +	andq	$-VEC_SIZE, %rdi
 | ||||||
|  | +	/* Adjust because first 4x vec were checked already.  */
 | ||||||
|  | +	subq	$-(VEC_SIZE * 4), %rdi
 | ||||||
|  | +	.p2align 4
 | ||||||
|  | +L(loop_4x_vec):
 | ||||||
|  | +	VMOVU	(%rsi, %rdi), %YMM1
 | ||||||
|  | +	vpxorq	(%rdi), %YMM1, %YMM1
 | ||||||
|  | +
 | ||||||
|  | +	VMOVU	VEC_SIZE(%rsi, %rdi), %YMM2
 | ||||||
|  | +	vpxorq	VEC_SIZE(%rdi), %YMM2, %YMM2
 | ||||||
|  | +
 | ||||||
|  | +	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %YMM3
 | ||||||
|  | +	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
 | ||||||
|  | +	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
 | ||||||
|  | +
 | ||||||
|  | +	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %YMM4
 | ||||||
|  | +	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
 | ||||||
|  | +	VPCMP	$4, %YMM4, %YMM0, %k1
 | ||||||
|  | +	kmovd	%k1, %ecx
 | ||||||
|  | +	testl	%ecx, %ecx
 | ||||||
|  | +	jnz	L(8x_return_vec_0_1_2_3)
 | ||||||
|  | +	subq	$-(VEC_SIZE * 4), %rdi
 | ||||||
|  | +	cmpq	%rdx, %rdi
 | ||||||
|  | +	jb	L(loop_4x_vec)
 | ||||||
|  | +
 | ||||||
|  | +	subq	%rdx, %rdi
 | ||||||
|  | +	/* rdi has 4 * VEC_SIZE - remaining length.  */
 | ||||||
|  | +	cmpl	$(VEC_SIZE * 3), %edi
 | ||||||
|  | +	jae	L(8x_last_1x_vec)
 | ||||||
|  | +	/* Load regardless of branch.  */
 | ||||||
|  | +	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx), %YMM3
 | ||||||
|  | +	cmpl	$(VEC_SIZE * 2), %edi
 | ||||||
|  | +	jae	L(8x_last_2x_vec)
 | ||||||
|  | +
 | ||||||
|  | +	VMOVU	(%rsi, %rdx), %YMM1
 | ||||||
|  | +	vpxorq	(%rdx), %YMM1, %YMM1
 | ||||||
|  | +
 | ||||||
|  | +	VMOVU	VEC_SIZE(%rsi, %rdx), %YMM2
 | ||||||
|  | +	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM2
 | ||||||
|  | +
 | ||||||
|  | +	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
 | ||||||
|  | +	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
 | ||||||
|  | +
 | ||||||
|  | +	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM4
 | ||||||
|  | +	vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM3, %YMM4
 | ||||||
|  | +	VPCMP	$4, %YMM4, %YMM0, %k1
 | ||||||
|  | +	kmovd	%k1, %ecx
 | ||||||
|  | +	/* Restore s1 pointer to rdi.  */
 | ||||||
|  | +	movq	%rdx, %rdi
 | ||||||
|  | +	testl	%ecx, %ecx
 | ||||||
|  | +	jnz	L(8x_return_vec_0_1_2_3)
 | ||||||
|  | +	/* NB: eax must be zero to reach here.  */
 | ||||||
|  | +	ret
 | ||||||
|  | +
 | ||||||
|  | +	/* Only entry is from L(more_8x_vec).  */
 | ||||||
|  | +	.p2align 4
 | ||||||
|  | +L(8x_last_2x_vec):
 | ||||||
|  | +	VPCMP	$4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1
 | ||||||
|  | +	kmovd	%k1, %eax
 | ||||||
|  | +	testl	%eax, %eax
 | ||||||
|  | +	jnz	L(8x_return_vec_2)
 | ||||||
|  | +	/* Naturally aligned to 16 bytes.  */
 | ||||||
|  | +L(8x_last_1x_vec):
 | ||||||
|  | +	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM1
 | ||||||
|  | +	VPCMP	$4, (VEC_SIZE * 3)(%rdx), %YMM1, %k1
 | ||||||
|  | +	kmovd	%k1, %eax
 | ||||||
|  | +	testl	%eax, %eax
 | ||||||
|  | +	jnz	L(8x_return_vec_3)
 | ||||||
|  | +	ret
 | ||||||
|  | +
 | ||||||
|  | +	.p2align 4
 | ||||||
|  | +L(last_2x_vec):
 | ||||||
|  | +	/* Check second to last VEC.  */
 | ||||||
|  | +	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1
 | ||||||
|  | +	VPCMP	$4, -(VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
 | ||||||
|  | +	kmovd	%k1, %eax
 | ||||||
|  | +	testl	%eax, %eax
 | ||||||
|  | +	jnz	L(return_vec_1_end)
 | ||||||
|  | +
 | ||||||
|  | +	/* Check last VEC.  */
 | ||||||
|  | +	.p2align 4
 | ||||||
|  | +L(last_1x_vec):
 | ||||||
|  | +	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE), %YMM1
 | ||||||
|  | +	VPCMP	$4, -(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
 | ||||||
|  | +	kmovd	%k1, %eax
 | ||||||
|  | +	testl	%eax, %eax
 | ||||||
|  | +	jnz	L(return_vec_0_end)
 | ||||||
|  |  	ret | ||||||
|  | +
 | ||||||
|  | +	.p2align 4
 | ||||||
|  | +L(8x_return_vec_2):
 | ||||||
|  | +	subq	$VEC_SIZE, %rdx
 | ||||||
|  | +L(8x_return_vec_3):
 | ||||||
|  | +	tzcntl	%eax, %eax
 | ||||||
|  | +# ifdef USE_AS_WMEMCMP
 | ||||||
|  | +	leaq	(%rdx, %rax, CHAR_SIZE), %rax
 | ||||||
|  | +	movl	(VEC_SIZE * 3)(%rax), %ecx
 | ||||||
|  | +	xorl	%edx, %edx
 | ||||||
|  | +	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
 | ||||||
|  | +	setg	%dl
 | ||||||
|  | +	leal	-1(%rdx, %rdx), %eax
 | ||||||
|  |  # else | ||||||
|  | +	addq	%rdx, %rax
 | ||||||
|  | +	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
 | ||||||
|  | +	movzbl	(VEC_SIZE * 3)(%rax), %eax
 | ||||||
|  | +	subl	%ecx, %eax
 | ||||||
|  | +# endif
 | ||||||
|  | +	ret
 | ||||||
|  | +
 | ||||||
|  |  	.p2align 4 | ||||||
|  | -L(between_4_7):
 | ||||||
|  | -	/* Load as big endian with overlapping movbe to avoid branches.  */
 | ||||||
|  | -	movbe	(%rdi), %eax
 | ||||||
|  | -	movbe	(%rsi), %ecx
 | ||||||
|  | -	shlq	$32, %rax
 | ||||||
|  | -	shlq	$32, %rcx
 | ||||||
|  | -	movbe	-4(%rdi, %rdx), %edi
 | ||||||
|  | -	movbe	-4(%rsi, %rdx), %esi
 | ||||||
|  | -	orq	%rdi, %rax
 | ||||||
|  | -	orq	%rsi, %rcx
 | ||||||
|  | -	subq	%rcx, %rax
 | ||||||
|  | -	je	L(exit)
 | ||||||
|  | -	sbbl	%eax, %eax
 | ||||||
|  | -	orl	$1, %eax
 | ||||||
|  | +L(return_vec_0_end):
 | ||||||
|  | +	tzcntl	%eax, %eax
 | ||||||
|  | +	addl	%edx, %eax
 | ||||||
|  | +# ifdef USE_AS_WMEMCMP
 | ||||||
|  | +	movl	-VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
 | ||||||
|  | +	xorl	%edx, %edx
 | ||||||
|  | +	cmpl	-VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
 | ||||||
|  | +	setg	%dl
 | ||||||
|  | +	leal	-1(%rdx, %rdx), %eax
 | ||||||
|  | +# else
 | ||||||
|  | +	movzbl	-VEC_SIZE(%rsi, %rax), %ecx
 | ||||||
|  | +	movzbl	-VEC_SIZE(%rdi, %rax), %eax
 | ||||||
|  | +	subl	%ecx, %eax
 | ||||||
|  | +# endif
 | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  |  	.p2align 4 | ||||||
|  | -L(exit):
 | ||||||
|  | +L(return_vec_1_end):
 | ||||||
|  | +	tzcntl	%eax, %eax
 | ||||||
|  | +	addl	%edx, %eax
 | ||||||
|  | +# ifdef USE_AS_WMEMCMP
 | ||||||
|  | +	movl	-(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
 | ||||||
|  | +	xorl	%edx, %edx
 | ||||||
|  | +	cmpl	-(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
 | ||||||
|  | +	setg	%dl
 | ||||||
|  | +	leal	-1(%rdx, %rdx), %eax
 | ||||||
|  | +# else
 | ||||||
|  | +	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
 | ||||||
|  | +	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
 | ||||||
|  | +	subl	%ecx, %eax
 | ||||||
|  | +# endif
 | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  | +
 | ||||||
|  |  	.p2align 4 | ||||||
|  | +L(page_cross_less_vec):
 | ||||||
|  | +	/* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
 | ||||||
|  | +	   bytes.  */
 | ||||||
|  | +	cmpl	$(16 / CHAR_SIZE), %edx
 | ||||||
|  | +	jae	L(between_16_31)
 | ||||||
|  | +# ifndef USE_AS_WMEMCMP
 | ||||||
|  | +	cmpl	$8, %edx
 | ||||||
|  | +	jae	L(between_8_15)
 | ||||||
|  | +	cmpl	$4, %edx
 | ||||||
|  | +	jae	L(between_4_7)
 | ||||||
|  |  L(between_2_3): | ||||||
|  |  	/* Load as big endian to avoid branches.  */ | ||||||
|  |  	movzwl	(%rdi), %eax | ||||||
|  | @@ -217,224 +448,99 @@ L(between_2_3):
 | ||||||
|  |  	shll	$8, %ecx | ||||||
|  |  	bswap	%eax | ||||||
|  |  	bswap	%ecx | ||||||
|  | -	movb	-1(%rdi, %rdx), %al
 | ||||||
|  | -	movb	-1(%rsi, %rdx), %cl
 | ||||||
|  | +	movzbl	-1(%rdi, %rdx), %edi
 | ||||||
|  | +	movzbl	-1(%rsi, %rdx), %esi
 | ||||||
|  | +	orl	%edi, %eax
 | ||||||
|  | +	orl	%esi, %ecx
 | ||||||
|  |  	/* Subtraction is okay because the upper 8 bits are zero.  */ | ||||||
|  |  	subl	%ecx, %eax | ||||||
|  |  	ret | ||||||
|  | -
 | ||||||
|  |  	.p2align 4 | ||||||
|  | -L(1):
 | ||||||
|  | -	movzbl	(%rdi), %eax
 | ||||||
|  | +L(one_or_less):
 | ||||||
|  | +	jb	L(zero)
 | ||||||
|  |  	movzbl	(%rsi), %ecx | ||||||
|  | +	movzbl	(%rdi), %eax
 | ||||||
|  |  	subl	%ecx, %eax | ||||||
|  |  	ret | ||||||
|  | -# endif
 | ||||||
|  | -
 | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(zero):
 | ||||||
|  | -	xorl	%eax, %eax
 | ||||||
|  | -	ret
 | ||||||
|  |   | ||||||
|  |  	.p2align 4 | ||||||
|  | -L(less_vec):
 | ||||||
|  | -# ifdef USE_AS_WMEMCMP
 | ||||||
|  | -	/* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes.  */
 | ||||||
|  | -	cmpb	$4, %dl
 | ||||||
|  | -	je	L(4)
 | ||||||
|  | -	jb	L(zero)
 | ||||||
|  | -# else
 | ||||||
|  | -	cmpb	$1, %dl
 | ||||||
|  | -	je	L(1)
 | ||||||
|  | -	jb	L(zero)
 | ||||||
|  | -	cmpb	$4, %dl
 | ||||||
|  | -	jb	L(between_2_3)
 | ||||||
|  | -	cmpb	$8, %dl
 | ||||||
|  | -	jb	L(between_4_7)
 | ||||||
|  | +L(between_8_15):
 | ||||||
|  |  # endif | ||||||
|  | -	cmpb	$16, %dl
 | ||||||
|  | -	jae	L(between_16_31)
 | ||||||
|  | -	/* It is between 8 and 15 bytes.  */
 | ||||||
|  | +	/* If USE_AS_WMEMCMP fall through into 8-15 byte case.  */
 | ||||||
|  |  	vmovq	(%rdi), %XMM1 | ||||||
|  |  	vmovq	(%rsi), %XMM2 | ||||||
|  | -	VPCMPEQ %XMM1, %XMM2, %k2
 | ||||||
|  | -	kmovw	%k2, %eax
 | ||||||
|  | -	subl    $XMM_MASK, %eax
 | ||||||
|  | -	jnz	L(first_vec)
 | ||||||
|  | +	VPCMP	$4, %XMM1, %XMM2, %k1
 | ||||||
|  | +	kmovd	%k1, %eax
 | ||||||
|  | +	testl	%eax, %eax
 | ||||||
|  | +	jnz	L(return_vec_0)
 | ||||||
|  |  	/* Use overlapping loads to avoid branches.  */ | ||||||
|  | -	leaq	-8(%rdi, %rdx), %rdi
 | ||||||
|  | -	leaq	-8(%rsi, %rdx), %rsi
 | ||||||
|  | +	leaq	-8(%rdi, %rdx, CHAR_SIZE), %rdi
 | ||||||
|  | +	leaq	-8(%rsi, %rdx, CHAR_SIZE), %rsi
 | ||||||
|  |  	vmovq	(%rdi), %XMM1 | ||||||
|  |  	vmovq	(%rsi), %XMM2 | ||||||
|  | -	VPCMPEQ %XMM1, %XMM2, %k2
 | ||||||
|  | -	kmovw	%k2, %eax
 | ||||||
|  | -	subl    $XMM_MASK, %eax
 | ||||||
|  | -	jnz	L(first_vec)
 | ||||||
|  | +	VPCMP	$4, %XMM1, %XMM2, %k1
 | ||||||
|  | +	kmovd	%k1, %eax
 | ||||||
|  | +	testl	%eax, %eax
 | ||||||
|  | +	jnz	L(return_vec_0)
 | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  |  	.p2align 4 | ||||||
|  | -L(between_16_31):
 | ||||||
|  | -	/* From 16 to 31 bytes.  No branch when size == 16.  */
 | ||||||
|  | -	VMOVU	(%rsi), %XMM2
 | ||||||
|  | -	VPCMPEQ (%rdi), %XMM2, %k2
 | ||||||
|  | -	kmovw	%k2, %eax
 | ||||||
|  | -	subl    $XMM_MASK, %eax
 | ||||||
|  | -	jnz	L(first_vec)
 | ||||||
|  | -
 | ||||||
|  | -	/* Use overlapping loads to avoid branches.  */
 | ||||||
|  | -	leaq	-16(%rdi, %rdx), %rdi
 | ||||||
|  | -	leaq	-16(%rsi, %rdx), %rsi
 | ||||||
|  | -	VMOVU	(%rsi), %XMM2
 | ||||||
|  | -	VPCMPEQ (%rdi), %XMM2, %k2
 | ||||||
|  | -	kmovw	%k2, %eax
 | ||||||
|  | -	subl    $XMM_MASK, %eax
 | ||||||
|  | -	jnz	L(first_vec)
 | ||||||
|  | +L(zero):
 | ||||||
|  | +	xorl	%eax, %eax
 | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  |  	.p2align 4 | ||||||
|  | -L(more_8x_vec):
 | ||||||
|  | -	/* More than 8 * VEC.  Check the first VEC.  */
 | ||||||
|  | -	VMOVU	(%rsi), %YMM2
 | ||||||
|  | -	VPCMPEQ (%rdi), %YMM2, %k2
 | ||||||
|  | -	kmovd	%k2, %eax
 | ||||||
|  | -	subl    $VEC_MASK, %eax
 | ||||||
|  | -	jnz	L(first_vec)
 | ||||||
|  | -
 | ||||||
|  | -	/* Align the first memory area for aligned loads in the loop.
 | ||||||
|  | -	   Compute how much the first memory area is misaligned.  */
 | ||||||
|  | -	movq	%rdi, %rcx
 | ||||||
|  | -	andl	$(VEC_SIZE - 1), %ecx
 | ||||||
|  | -	/* Get the negative of offset for alignment.  */
 | ||||||
|  | -	subq	$VEC_SIZE, %rcx
 | ||||||
|  | -	/* Adjust the second memory area.  */
 | ||||||
|  | -	subq	%rcx, %rsi
 | ||||||
|  | -	/* Adjust the first memory area which should be aligned now.  */
 | ||||||
|  | -	subq	%rcx, %rdi
 | ||||||
|  | -	/* Adjust length.  */
 | ||||||
|  | -	addq	%rcx, %rdx
 | ||||||
|  | -
 | ||||||
|  | -L(loop_4x_vec):
 | ||||||
|  | -	/* Compare 4 * VEC at a time forward.  */
 | ||||||
|  | -	VMOVU	(%rsi), %YMM1
 | ||||||
|  | -	VPCMPEQ (%rdi), %YMM1, %k1
 | ||||||
|  | -
 | ||||||
|  | -	VMOVU	VEC_SIZE(%rsi), %YMM2
 | ||||||
|  | -	VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
 | ||||||
|  | -	kandd	%k2, %k1, %k5
 | ||||||
|  | -
 | ||||||
|  | -	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
 | ||||||
|  | -	VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
 | ||||||
|  | -	kandd	%k3, %k5, %k5
 | ||||||
|  | -
 | ||||||
|  | -	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
 | ||||||
|  | -	VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
 | ||||||
|  | -	kandd	%k4, %k5, %k5
 | ||||||
|  | -
 | ||||||
|  | -	kmovd	%k5, %eax
 | ||||||
|  | -	cmpl	$VEC_MASK, %eax
 | ||||||
|  | -	jne	L(4x_vec_end)
 | ||||||
|  | -
 | ||||||
|  | -	addq	$(VEC_SIZE * 4), %rdi
 | ||||||
|  | -	addq	$(VEC_SIZE * 4), %rsi
 | ||||||
|  | -
 | ||||||
|  | -	subq	$(VEC_SIZE * 4), %rdx
 | ||||||
|  | -	cmpq	$(VEC_SIZE * 4), %rdx
 | ||||||
|  | -	jae	L(loop_4x_vec)
 | ||||||
|  | -
 | ||||||
|  | -	/* Less than 4 * VEC.  */
 | ||||||
|  | -	cmpq	$VEC_SIZE, %rdx
 | ||||||
|  | -	jbe	L(last_vec)
 | ||||||
|  | -	cmpq	$(VEC_SIZE * 2), %rdx
 | ||||||
|  | -	jbe	L(last_2x_vec)
 | ||||||
|  | -
 | ||||||
|  | -L(last_4x_vec):
 | ||||||
|  | -	/* From 2 * VEC to 4 * VEC. */
 | ||||||
|  | -	VMOVU	(%rsi), %YMM2
 | ||||||
|  | -	VPCMPEQ (%rdi), %YMM2, %k2
 | ||||||
|  | -	kmovd	%k2, %eax
 | ||||||
|  | -	subl    $VEC_MASK, %eax
 | ||||||
|  | -	jnz	L(first_vec)
 | ||||||
|  | -
 | ||||||
|  | -	addq	$VEC_SIZE, %rdi
 | ||||||
|  | -	addq	$VEC_SIZE, %rsi
 | ||||||
|  | -	VMOVU	(%rsi), %YMM2
 | ||||||
|  | -	VPCMPEQ (%rdi), %YMM2, %k2
 | ||||||
|  | -	kmovd	%k2, %eax
 | ||||||
|  | -	subl    $VEC_MASK, %eax
 | ||||||
|  | -	jnz	L(first_vec)
 | ||||||
|  | +L(between_16_31):
 | ||||||
|  | +	/* From 16 to 31 bytes.  No branch when size == 16.  */
 | ||||||
|  | +	VMOVU	(%rsi), %XMM2
 | ||||||
|  | +	VPCMP	$4, (%rdi), %XMM2, %k1
 | ||||||
|  | +	kmovd	%k1, %eax
 | ||||||
|  | +	testl	%eax, %eax
 | ||||||
|  | +	jnz	L(return_vec_0)
 | ||||||
|  |   | ||||||
|  |  	/* Use overlapping loads to avoid branches.  */ | ||||||
|  | -	leaq	-(3 * VEC_SIZE)(%rdi, %rdx), %rdi
 | ||||||
|  | -	leaq	-(3 * VEC_SIZE)(%rsi, %rdx), %rsi
 | ||||||
|  | -	VMOVU	(%rsi), %YMM2
 | ||||||
|  | -	VPCMPEQ (%rdi), %YMM2, %k2
 | ||||||
|  | -	kmovd	%k2, %eax
 | ||||||
|  | -	subl    $VEC_MASK, %eax
 | ||||||
|  | -	jnz	L(first_vec)
 | ||||||
|  |   | ||||||
|  | -	addq	$VEC_SIZE, %rdi
 | ||||||
|  | -	addq	$VEC_SIZE, %rsi
 | ||||||
|  | -	VMOVU	(%rsi), %YMM2
 | ||||||
|  | -	VPCMPEQ (%rdi), %YMM2, %k2
 | ||||||
|  | -	kmovd	%k2, %eax
 | ||||||
|  | -	subl    $VEC_MASK, %eax
 | ||||||
|  | -	jnz	L(first_vec)
 | ||||||
|  | -	ret
 | ||||||
|  | -
 | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(4x_vec_end):
 | ||||||
|  | +	VMOVU	-16(%rsi, %rdx, CHAR_SIZE), %XMM2
 | ||||||
|  | +	leaq	-16(%rdi, %rdx, CHAR_SIZE), %rdi
 | ||||||
|  | +	leaq	-16(%rsi, %rdx, CHAR_SIZE), %rsi
 | ||||||
|  | +	VPCMP	$4, (%rdi), %XMM2, %k1
 | ||||||
|  |  	kmovd	%k1, %eax | ||||||
|  | -	subl	$VEC_MASK, %eax
 | ||||||
|  | -	jnz	L(first_vec)
 | ||||||
|  | -	kmovd	%k2, %eax
 | ||||||
|  | -	subl	$VEC_MASK, %eax
 | ||||||
|  | -	jnz	L(first_vec_x1)
 | ||||||
|  | -	kmovd	%k3, %eax
 | ||||||
|  | -	subl	$VEC_MASK, %eax
 | ||||||
|  | -	jnz	L(first_vec_x2)
 | ||||||
|  | -	kmovd	%k4, %eax
 | ||||||
|  | -	subl	$VEC_MASK, %eax
 | ||||||
|  | -	tzcntl	%eax, %ecx
 | ||||||
|  | -# ifdef USE_AS_WMEMCMP
 | ||||||
|  | -	xorl	%eax, %eax
 | ||||||
|  | -	movl	(VEC_SIZE * 3)(%rdi, %rcx, 4), %edx
 | ||||||
|  | -	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, 4), %edx
 | ||||||
|  | -	jmp	L(wmemcmp_return)
 | ||||||
|  | -# else
 | ||||||
|  | -	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
 | ||||||
|  | -	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
 | ||||||
|  | -	sub	%edx, %eax
 | ||||||
|  | -# endif
 | ||||||
|  | +	testl	%eax, %eax
 | ||||||
|  | +	jnz	L(return_vec_0)
 | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(first_vec_x1):
 | ||||||
|  | -	tzcntl	%eax, %ecx
 | ||||||
|  |  # ifdef USE_AS_WMEMCMP | ||||||
|  | -	xorl	%eax, %eax
 | ||||||
|  | -	movl	VEC_SIZE(%rdi, %rcx, 4), %edx
 | ||||||
|  | -	cmpl	VEC_SIZE(%rsi, %rcx, 4), %edx
 | ||||||
|  | -	jmp	L(wmemcmp_return)
 | ||||||
|  | -# else
 | ||||||
|  | -	movzbl	VEC_SIZE(%rdi, %rcx), %eax
 | ||||||
|  | -	movzbl	VEC_SIZE(%rsi, %rcx), %edx
 | ||||||
|  | -	sub	%edx, %eax
 | ||||||
|  | -# endif
 | ||||||
|  | +	.p2align 4
 | ||||||
|  | +L(one_or_less):
 | ||||||
|  | +	jb	L(zero)
 | ||||||
|  | +	movl	(%rdi), %ecx
 | ||||||
|  | +	xorl	%edx, %edx
 | ||||||
|  | +	cmpl	(%rsi), %ecx
 | ||||||
|  | +	je	L(zero)
 | ||||||
|  | +	setg	%dl
 | ||||||
|  | +	leal	-1(%rdx, %rdx), %eax
 | ||||||
|  |  	ret | ||||||
|  | +# else
 | ||||||
|  |   | ||||||
|  |  	.p2align 4 | ||||||
|  | -L(first_vec_x2):
 | ||||||
|  | -	tzcntl	%eax, %ecx
 | ||||||
|  | -# ifdef USE_AS_WMEMCMP
 | ||||||
|  | -	xorl	%eax, %eax
 | ||||||
|  | -	movl	(VEC_SIZE * 2)(%rdi, %rcx, 4), %edx
 | ||||||
|  | -	cmpl	(VEC_SIZE * 2)(%rsi, %rcx, 4), %edx
 | ||||||
|  | -	jmp	L(wmemcmp_return)
 | ||||||
|  | -# else
 | ||||||
|  | -	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
 | ||||||
|  | -	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
 | ||||||
|  | -	sub	%edx, %eax
 | ||||||
|  | -# endif
 | ||||||
|  | +L(between_4_7):
 | ||||||
|  | +	/* Load as big endian with overlapping movbe to avoid branches.
 | ||||||
|  | +	 */
 | ||||||
|  | +	movbe	(%rdi), %eax
 | ||||||
|  | +	movbe	(%rsi), %ecx
 | ||||||
|  | +	shlq	$32, %rax
 | ||||||
|  | +	shlq	$32, %rcx
 | ||||||
|  | +	movbe	-4(%rdi, %rdx), %edi
 | ||||||
|  | +	movbe	-4(%rsi, %rdx), %esi
 | ||||||
|  | +	orq	%rdi, %rax
 | ||||||
|  | +	orq	%rsi, %rcx
 | ||||||
|  | +	subq	%rcx, %rax
 | ||||||
|  | +	jz	L(zero_4_7)
 | ||||||
|  | +	sbbl	%eax, %eax
 | ||||||
|  | +	orl	$1, %eax
 | ||||||
|  | +L(zero_4_7):
 | ||||||
|  |  	ret | ||||||
|  | +# endif
 | ||||||
|  | +
 | ||||||
|  |  END (MEMCMP) | ||||||
|  |  #endif | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
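
For reference, a minimal C sketch (not part of the patch; helper names are
hypothetical) of two branch-avoidance tricks visible in the memcmp/wmemcmp
diff above: vpternlogd $0xfe ORs three XORed vector pairs into one mismatch
accumulator, and L(between_4_7) compares 4 to 7 bytes with two overlapping
big-endian loads so no length branch is needed:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* vpternlogd $0xfe, a, b, c: truth table 0xfe is 0 only when all three
   inputs are 0, i.e. a three-input OR, used in the loop to accumulate
   mismatch bits from several XORed vector pairs at once.  */
static uint64_t
or3 (uint64_t a, uint64_t b, uint64_t c)
{
  return a | b | c;
}

static uint32_t
load32_be (const unsigned char *p)
{
  uint32_t v;
  memcpy (&v, p, 4);
  return __builtin_bswap32 (v);   /* one movbe instruction in the asm */
}

/* L(between_4_7), assuming 4 <= n <= 7: the first and last four bytes
   overlap and cover the whole range, and big-endian packing makes the
   unsigned 64-bit comparison agree with memcmp byte order.  */
static int
memcmp_4_7 (const unsigned char *s1, const unsigned char *s2, size_t n)
{
  uint64_t a = ((uint64_t) load32_be (s1) << 32) | load32_be (s1 + n - 4);
  uint64_t b = ((uint64_t) load32_be (s2) << 32) | load32_be (s2 + n - 4);
  if (a == b)
    return 0;                     /* jz L(zero_4_7) */
  return a < b ? -1 : 1;          /* sbbl %eax, %eax; orl $1, %eax */
}
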
							
								
								
									
SOURCES/glibc-RHEL-15696-47.patch  (new file, 104 lines)
									
								
							| @ -0,0 +1,104 @@ | |||||||
|  | From 6abf27980a947f9b6e514d6b33b83059d39566ae Mon Sep 17 00:00:00 2001 | ||||||
|  | From: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  | Date: Thu, 20 May 2021 13:13:51 -0400 | ||||||
|  | Subject: [PATCH] x86: Improve memset-vec-unaligned-erms.S | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | No bug. This commit makes a few small improvements to | ||||||
|  | memset-vec-unaligned-erms.S: 1) align to 64 bytes instead of 128 | ||||||
|  | (either alignment performs equally well in the loop, and 128 just | ||||||
|  | increases the odds of an extra iteration, which can be significant | ||||||
|  | overhead for small values); 2) align some branch targets and the | ||||||
|  | loop; 3) remove an ALU operation from the alignment process; | ||||||
|  | 4) reorder the last 4x VEC so that they are stored after the loop; | ||||||
|  | 5) move the length <= 8x VEC check to before the alignment | ||||||
|  | process. test-memset and test-wmemset are both passing. | ||||||
|  | 
 | ||||||
|  | Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  | Reviewed-by: H.J. Lu <hjl.tools@gmail.com> | ||||||
|  | ---
 | ||||||
|  |  .../multiarch/memset-vec-unaligned-erms.S     | 50 +++++++++++-------- | ||||||
|  |  1 file changed, 28 insertions(+), 22 deletions(-) | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
 | ||||||
|  | index f877ac9d..909c33f6 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
 | ||||||
|  | @@ -173,17 +173,22 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
 | ||||||
|  |  	VMOVU	%VEC(0), (%rdi) | ||||||
|  |  	VZEROUPPER_RETURN | ||||||
|  |   | ||||||
|  | +	.p2align 4
 | ||||||
|  |  L(stosb_more_2x_vec): | ||||||
|  |  	cmp	__x86_rep_stosb_threshold(%rip), %RDX_LP | ||||||
|  |  	ja	L(stosb) | ||||||
|  | +#else
 | ||||||
|  | +	.p2align 4
 | ||||||
|  |  #endif | ||||||
|  |  L(more_2x_vec): | ||||||
|  | -	cmpq  $(VEC_SIZE * 4), %rdx
 | ||||||
|  | -	ja	L(loop_start)
 | ||||||
|  | +	/* Stores to first 2x VEC before cmp as any path forward will
 | ||||||
|  | +	   require it.  */
 | ||||||
|  |  	VMOVU	%VEC(0), (%rdi) | ||||||
|  |  	VMOVU	%VEC(0), VEC_SIZE(%rdi) | ||||||
|  | -	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
 | ||||||
|  | +	cmpq	$(VEC_SIZE * 4), %rdx
 | ||||||
|  | +	ja	L(loop_start)
 | ||||||
|  |  	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx) | ||||||
|  | +	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
 | ||||||
|  |  L(return): | ||||||
|  |  #if VEC_SIZE > 16 | ||||||
|  |  	ZERO_UPPER_VEC_REGISTERS_RETURN | ||||||
|  | @@ -192,28 +197,29 @@ L(return):
 | ||||||
|  |  #endif | ||||||
|  |   | ||||||
|  |  L(loop_start): | ||||||
|  | -	leaq	(VEC_SIZE * 4)(%rdi), %rcx
 | ||||||
|  | -	VMOVU	%VEC(0), (%rdi)
 | ||||||
|  | -	andq	$-(VEC_SIZE * 4), %rcx
 | ||||||
|  | -	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
 | ||||||
|  | -	VMOVU	%VEC(0), VEC_SIZE(%rdi)
 | ||||||
|  | -	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
 | ||||||
|  |  	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rdi) | ||||||
|  | -	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx)
 | ||||||
|  |  	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rdi) | ||||||
|  | -	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx)
 | ||||||
|  | -	addq	%rdi, %rdx
 | ||||||
|  | -	andq	$-(VEC_SIZE * 4), %rdx
 | ||||||
|  | -	cmpq	%rdx, %rcx
 | ||||||
|  | -	je	L(return)
 | ||||||
|  | +	cmpq	$(VEC_SIZE * 8), %rdx
 | ||||||
|  | +	jbe	L(loop_end)
 | ||||||
|  | +	andq	$-(VEC_SIZE * 2), %rdi
 | ||||||
|  | +	subq	$-(VEC_SIZE * 4), %rdi
 | ||||||
|  | +	leaq	-(VEC_SIZE * 4)(%rax, %rdx), %rcx
 | ||||||
|  | +	.p2align 4
 | ||||||
|  |  L(loop): | ||||||
|  | -	VMOVA	%VEC(0), (%rcx)
 | ||||||
|  | -	VMOVA	%VEC(0), VEC_SIZE(%rcx)
 | ||||||
|  | -	VMOVA	%VEC(0), (VEC_SIZE * 2)(%rcx)
 | ||||||
|  | -	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rcx)
 | ||||||
|  | -	addq	$(VEC_SIZE * 4), %rcx
 | ||||||
|  | -	cmpq	%rcx, %rdx
 | ||||||
|  | -	jne	L(loop)
 | ||||||
|  | +	VMOVA	%VEC(0), (%rdi)
 | ||||||
|  | +	VMOVA	%VEC(0), VEC_SIZE(%rdi)
 | ||||||
|  | +	VMOVA	%VEC(0), (VEC_SIZE * 2)(%rdi)
 | ||||||
|  | +	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rdi)
 | ||||||
|  | +	subq	$-(VEC_SIZE * 4), %rdi
 | ||||||
|  | +	cmpq	%rcx, %rdi
 | ||||||
|  | +	jb	L(loop)
 | ||||||
|  | +L(loop_end):
 | ||||||
|  | +	/* NB: rax is set as ptr in MEMSET_VDUP_TO_VEC0_AND_SET_RETURN.
 | ||||||
|  | +	       rdx as length is also unchanged.  */
 | ||||||
|  | +	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rax, %rdx)
 | ||||||
|  | +	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rax, %rdx)
 | ||||||
|  | +	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rax, %rdx)
 | ||||||
|  | +	VMOVU	%VEC(0), -VEC_SIZE(%rax, %rdx)
 | ||||||
|  |  	VZEROUPPER_SHORT_RETURN | ||||||
|  |   | ||||||
|  |  	.p2align 4 | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
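
As a hedged C sketch (not glibc code; memset_flow is a hypothetical name,
and byte memsets stand in for the VMOVU/VMOVA vector stores), the store
ordering this patch adopts for lengths above 2x VEC looks like this:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

enum { VEC = 32 };   /* stand-in for VEC_SIZE (AVX2 width shown) */

static void
memset_flow (unsigned char *dst, int c, size_t n)   /* assumes n > 2 * VEC */
{
  memset (dst, c, 2 * VEC);                   /* first 2x VEC: every path needs it */
  if (n <= 4 * VEC)
    {
      memset (dst + n - 2 * VEC, c, 2 * VEC); /* overlapping tail stores */
      return;
    }
  memset (dst + 2 * VEC, c, 2 * VEC);         /* VEC 2 and 3 (L(loop_start)) */
  if (n > 8 * VEC)
    {
      /* Align to 2 * VEC and step past the 4x VEC already written,
         mirroring the andq/subq pair the patch adds.  */
      uintptr_t base = (uintptr_t) dst & ~(uintptr_t) (2 * VEC - 1);
      unsigned char *p = (unsigned char *) (base + 4 * VEC);
      unsigned char *end = dst + n - 4 * VEC;
      for (; p < end; p += 4 * VEC)
        memset (p, c, 4 * VEC);               /* aligned 4x VEC loop body */
    }
  /* Last 4x VEC, stored after the loop and anchored at the end, so the
     tail needs no extra length math (L(loop_end)).  */
  memset (dst + n - 4 * VEC, c, 4 * VEC);
}
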
							
								
								
									
SOURCES/glibc-RHEL-15696-48.patch  (new file, 84 lines)
									
								
							| @ -0,0 +1,84 @@ | |||||||
|  | From 1b992204f68af851e905c16016756fd4421e1934 Mon Sep 17 00:00:00 2001 | ||||||
|  | From: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  | Date: Sun, 23 May 2021 19:43:24 -0400 | ||||||
|  | Subject: [PATCH] x86: Improve memmove-vec-unaligned-erms.S | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | This patch changes the condition for the 4x VEC copy so that a length | ||||||
|  | exactly equal to 4 * VEC_SIZE uses the 4x VEC case instead of the | ||||||
|  | 8x VEC case. | ||||||
|  | 
 | ||||||
|  | Results For Skylake memcpy-avx2-erms | ||||||
|  | size, al1 , al2 , Cur T   , New T   , Win , New / Cur | ||||||
|  | 128 , 0   , 0   , 9.137   , 6.873   , New , 75.22 | ||||||
|  | 128 , 7   , 0   , 12.933  , 7.732   , New , 59.79 | ||||||
|  | 128 , 0   , 7   , 11.852  , 6.76    , New , 57.04 | ||||||
|  | 128 , 7   , 7   , 12.587  , 6.808   , New , 54.09 | ||||||
|  | 
 | ||||||
|  | Results For Icelake memcpy-evex-erms | ||||||
|  | size, al1 , al2 , Cur T   , New T   , Win , New / Cur | ||||||
|  | 128 , 0   , 0   , 9.963   , 5.416   , New , 54.36 | ||||||
|  | 128 , 7   , 0   , 16.467  , 8.061   , New , 48.95 | ||||||
|  | 128 , 0   , 7   , 14.388  , 7.644   , New , 53.13 | ||||||
|  | 128 , 7   , 7   , 14.546  , 7.642   , New , 52.54 | ||||||
|  | 
 | ||||||
|  | Results For Tigerlake memcpy-evex-erms | ||||||
|  | size, al1 , al2 , Cur T   , New T   , Win , New / Cur | ||||||
|  | 128 , 0   , 0   , 8.979   , 4.95    , New , 55.13 | ||||||
|  | 128 , 7   , 0   , 14.245  , 7.122   , New , 50.0 | ||||||
|  | 128 , 0   , 7   , 12.668  , 6.675   , New , 52.69 | ||||||
|  | 128 , 7   , 7   , 13.042  , 6.802   , New , 52.15 | ||||||
|  | 
 | ||||||
|  | Results For Skylake memmove-avx2-erms | ||||||
|  | size, al1 , al2 , Cur T   , New T   , Win , New / Cur | ||||||
|  | 128 , 0   , 32  , 6.181   , 5.691   , New , 92.07 | ||||||
|  | 128 , 32  , 0   , 6.165   , 5.752   , New , 93.3 | ||||||
|  | 128 , 0   , 7   , 13.923  , 9.37    , New , 67.3 | ||||||
|  | 128 , 7   , 0   , 12.049  , 10.182  , New , 84.5 | ||||||
|  | 
 | ||||||
|  | Results For Icelake memmove-evex-erms | ||||||
|  | size, al1 , al2 , Cur T   , New T   , Win , New / Cur | ||||||
|  | 128 , 0   , 32  , 5.479   , 4.889   , New , 89.23 | ||||||
|  | 128 , 32  , 0   , 5.127   , 4.911   , New , 95.79 | ||||||
|  | 128 , 0   , 7   , 18.885  , 13.547  , New , 71.73 | ||||||
|  | 128 , 7   , 0   , 15.565  , 14.436  , New , 92.75 | ||||||
|  | 
 | ||||||
|  | Results For Tigerlake memmove-evex-erms | ||||||
|  | size, al1 , al2 , Cur T   , New T   , Win , New / Cur | ||||||
|  | 128 , 0   , 32  , 5.275   , 4.815   , New , 91.28 | ||||||
|  | 128 , 32  , 0   , 5.376   , 4.565   , New , 84.91 | ||||||
|  | 128 , 0   , 7   , 19.426  , 14.273  , New , 73.47 | ||||||
|  | 128 , 7   , 0   , 15.924  , 14.951  , New , 93.89 | ||||||
|  | 
 | ||||||
|  | Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  | ---
 | ||||||
|  |  sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S | 6 +++--- | ||||||
|  |  1 file changed, 3 insertions(+), 3 deletions(-) | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
 | ||||||
|  | index 3e2dd6bc..572cef04 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
 | ||||||
|  | @@ -417,8 +417,8 @@ L(more_2x_vec):
 | ||||||
|  |  	cmpq	$(VEC_SIZE * 8), %rdx | ||||||
|  |  	ja	L(more_8x_vec) | ||||||
|  |  	cmpq	$(VEC_SIZE * 4), %rdx | ||||||
|  | -	jb	L(last_4x_vec)
 | ||||||
|  | -	/* Copy from 4 * VEC to 8 * VEC, inclusively. */
 | ||||||
|  | +	jbe	L(last_4x_vec)
 | ||||||
|  | +	/* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */
 | ||||||
|  |  	VMOVU	(%rsi), %VEC(0) | ||||||
|  |  	VMOVU	VEC_SIZE(%rsi), %VEC(1) | ||||||
|  |  	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2) | ||||||
|  | @@ -437,7 +437,7 @@ L(more_2x_vec):
 | ||||||
|  |  	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx) | ||||||
|  |  	VZEROUPPER_RETURN | ||||||
|  |  L(last_4x_vec): | ||||||
|  | -	/* Copy from 2 * VEC to 4 * VEC. */
 | ||||||
|  | +	/* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */
 | ||||||
|  |  	VMOVU	(%rsi), %VEC(0) | ||||||
|  |  	VMOVU	VEC_SIZE(%rsi), %VEC(1) | ||||||
|  |  	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2) | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
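
In C terms, the jb-to-jbe change moves a length of exactly 4 * VEC_SIZE
onto the cheaper 4x VEC path; a minimal sketch of the new dispatch
(pick_path is a hypothetical name), assuming n > 2 * VEC as in
L(more_2x_vec):

#include <stddef.h>

enum { VEC = 32 };   /* stand-in for VEC_SIZE */

static const char *
pick_path (size_t n)
{
  if (n > 8 * VEC)
    return "more_8x_vec";
  if (n > 4 * VEC)        /* jbe: n == 4 * VEC no longer lands here */
    return "copy 4x VEC + 1 .. 8x VEC";
  return "last_4x_vec";   /* now covers 2 * VEC < n <= 4 * VEC */
}
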
							
								
								
									
SOURCES/glibc-RHEL-15696-49.patch  (new file, 55 lines)
									
								
							| @ -0,0 +1,55 @@ | |||||||
|  | From 08cbcd4dbc686bb38ec3093aff2f919fbff5ec17 Mon Sep 17 00:00:00 2001 | ||||||
|  | From: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  | Date: Wed, 23 Jun 2021 19:19:34 -0400 | ||||||
|  | Subject: [PATCH] x86: Remove unnecessary overflow check from wcsnlen-sse4_1.S | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | No bug. wcsnlen checks whether it is near the end of maxlen with | ||||||
|  | the following macro: | ||||||
|  | 
 | ||||||
|  | 	mov	%r11, %rsi;	\ | ||||||
|  | 	subq	%rax, %rsi;	\ | ||||||
|  | 	andq	$-64, %rax;	\ | ||||||
|  | 	testq	$-64, %rsi;	\ | ||||||
|  | 	je	L(strnlen_ret) | ||||||
|  | 
 | ||||||
|  | This works independently of s + maxlen overflowing, so the second | ||||||
|  | overflow check is unnecessary for correctness and just extra | ||||||
|  | overhead in the common no-overflow case. | ||||||
|  | 
 | ||||||
|  | test-strlen.c, test-wcslen.c, test-strnlen.c and test-wcsnlen.c are | ||||||
|  | all passing | ||||||
|  | 
 | ||||||
|  | Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  | Reviewed-by: H.J. Lu <hjl.tools@gmail.com> | ||||||
|  | ---
 | ||||||
|  |  sysdeps/x86_64/multiarch/strlen-vec.S | 7 ------- | ||||||
|  |  1 file changed, 7 deletions(-) | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
 | ||||||
|  | index 439e486a..b7657282 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/strlen-vec.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/strlen-vec.S
 | ||||||
|  | @@ -71,19 +71,12 @@ L(n_nonzero):
 | ||||||
|  |     suffice.  */ | ||||||
|  |  	mov	%RSI_LP, %R10_LP | ||||||
|  |  	sar	$62, %R10_LP | ||||||
|  | -	test	%R10_LP, %R10_LP
 | ||||||
|  |  	jnz	__wcslen_sse4_1 | ||||||
|  |  	sal	$2, %RSI_LP | ||||||
|  |  # endif | ||||||
|  |   | ||||||
|  | -
 | ||||||
|  |  /* Initialize long lived registers.  */ | ||||||
|  | -
 | ||||||
|  |  	add	%RDI_LP, %RSI_LP | ||||||
|  | -# ifdef AS_WCSLEN
 | ||||||
|  | -/* Check for overflow again from s + maxlen * sizeof(wchar_t).  */
 | ||||||
|  | -	jbe	__wcslen_sse4_1
 | ||||||
|  | -# endif
 | ||||||
|  |  	mov	%RSI_LP, %R10_LP | ||||||
|  |  	and	$-64, %R10_LP | ||||||
|  |  	mov	%RSI_LP, %R11_LP | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
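
Why the removed check was redundant, as a small illustrative C sketch
(near_end is a hypothetical name): the end pointer s + maxlen is computed
modulo 2^64, and the subtraction in the quoted macro cancels any
wraparound, so the remaining-length test is correct either way:

#include <stdint.h>

/* end = s + maxlen reduced mod 2^64; it may have wrapped past zero.  */
static int
near_end (uintptr_t end, uintptr_t current)
{
  uintptr_t remaining = end - current;         /* wraparound cancels */
  return (remaining & ~(uintptr_t) 63) == 0;   /* testq $-64, %rsi; je */
}
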
							
								
								
									
SOURCES/glibc-RHEL-15696-5.patch  (new file, 290 lines)
									
								
							| @ -0,0 +1,290 @@ | |||||||
|  | From 82d0b4a4d76db554eb6757acb790fcea30b19965 Mon Sep 17 00:00:00 2001 | ||||||
|  | From: "H.J. Lu" <hjl.tools@gmail.com> | ||||||
|  | Date: Mon, 21 Jan 2019 11:32:24 -0800 | ||||||
|  | Subject: [PATCH] x86-64 memset/wmemset: Properly handle the length parameter | ||||||
|  |  [BZ# 24097] | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | On x32, the size_t parameter may be passed in the lower 32 bits of a | ||||||
|  | 64-bit register with non-zero upper 32 bits.  The string/memory | ||||||
|  | functions written in assembly can only use the lower 32 bits of a | ||||||
|  | 64-bit register as length or must clear the upper 32 bits before using | ||||||
|  | the full 64-bit register for length. | ||||||
|  | 
 | ||||||
|  | This patch fixes memset/wmemset for x32.  Tested on x86-64 and x32.  On | ||||||
|  | x86-64, libc.so is the same with and without the fix. | ||||||
|  | 
 | ||||||
|  | 	[BZ# 24097] | ||||||
|  | 	CVE-2019-6488 | ||||||
|  | 	* sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S: Use | ||||||
|  | 	RDX_LP for length.  Clear the upper 32 bits of RDX register. | ||||||
|  | 	* sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S: Likewise. | ||||||
|  | 	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-wmemset. | ||||||
|  | 	* sysdeps/x86_64/x32/tst-size_t-memset.c: New file. | ||||||
|  | 	* sysdeps/x86_64/x32/tst-size_t-wmemset.c: Likewise. | ||||||
|  | ---
 | ||||||
|  |  .../multiarch/memset-avx512-no-vzeroupper.S   |  6 +- | ||||||
|  |  .../multiarch/memset-vec-unaligned-erms.S     | 34 +++++---- | ||||||
|  |  sysdeps/x86_64/x32/Makefile                   |  4 +- | ||||||
|  |  sysdeps/x86_64/x32/tst-size_t-memset.c        | 73 +++++++++++++++++++ | ||||||
|  |  sysdeps/x86_64/x32/tst-size_t-wmemset.c       | 20 +++++ | ||||||
|  |  5 files changed, 121 insertions(+), 16 deletions(-) | ||||||
|  |  create mode 100644 sysdeps/x86_64/x32/tst-size_t-memset.c | ||||||
|  |  create mode 100644 sysdeps/x86_64/x32/tst-size_t-wmemset.c | ||||||
|  | 
 | ||||||
|  | Conflicts: | ||||||
|  | 	ChangeLog | ||||||
|  | 	(removed) | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S
 | ||||||
|  | index 689cc119..99e25519 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S
 | ||||||
|  | @@ -29,12 +29,16 @@
 | ||||||
|  |  	.section .text.avx512,"ax",@progbits | ||||||
|  |  #if defined PIC | ||||||
|  |  ENTRY (MEMSET_CHK) | ||||||
|  | -	cmpq	%rdx, %rcx
 | ||||||
|  | +	cmp	%RDX_LP, %RCX_LP
 | ||||||
|  |  	jb	HIDDEN_JUMPTARGET (__chk_fail) | ||||||
|  |  END (MEMSET_CHK) | ||||||
|  |  #endif | ||||||
|  |   | ||||||
|  |  ENTRY (MEMSET) | ||||||
|  | +# ifdef __ILP32__
 | ||||||
|  | +	/* Clear the upper 32 bits.  */
 | ||||||
|  | +	mov	%edx, %edx
 | ||||||
|  | +# endif
 | ||||||
|  |  	vpxor	%xmm0, %xmm0, %xmm0 | ||||||
|  |  	vmovd	%esi, %xmm1 | ||||||
|  |  	lea	(%rdi, %rdx), %rsi | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
 | ||||||
|  | index 270a1d49..9a0fd818 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
 | ||||||
|  | @@ -65,8 +65,8 @@
 | ||||||
|  |  	.section SECTION(.text),"ax",@progbits | ||||||
|  |  #if VEC_SIZE == 16 && IS_IN (libc) | ||||||
|  |  ENTRY (__bzero) | ||||||
|  | -	movq	%rdi, %rax /* Set return value.  */
 | ||||||
|  | -	movq	%rsi, %rdx /* Set n.  */
 | ||||||
|  | +	mov	%RDI_LP, %RAX_LP /* Set return value.  */
 | ||||||
|  | +	mov	%RSI_LP, %RDX_LP /* Set n.  */
 | ||||||
|  |  	pxor	%xmm0, %xmm0 | ||||||
|  |  	jmp	L(entry_from_bzero) | ||||||
|  |  END (__bzero) | ||||||
|  | @@ -76,13 +76,13 @@ weak_alias (__bzero, bzero)
 | ||||||
|  |  #if IS_IN (libc) | ||||||
|  |  # if defined SHARED | ||||||
|  |  ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)) | ||||||
|  | -	cmpq	%rdx, %rcx
 | ||||||
|  | +	cmp	%RDX_LP, %RCX_LP
 | ||||||
|  |  	jb	HIDDEN_JUMPTARGET (__chk_fail) | ||||||
|  |  END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)) | ||||||
|  |  # endif | ||||||
|  |   | ||||||
|  |  ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned)) | ||||||
|  | -	shlq	$2, %rdx
 | ||||||
|  | +	shl	$2, %RDX_LP
 | ||||||
|  |  	WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) | ||||||
|  |  	jmp	L(entry_from_bzero) | ||||||
|  |  END (WMEMSET_SYMBOL (__wmemset, unaligned)) | ||||||
|  | @@ -90,13 +90,17 @@ END (WMEMSET_SYMBOL (__wmemset, unaligned))
 | ||||||
|  |   | ||||||
|  |  #if defined SHARED && IS_IN (libc) | ||||||
|  |  ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned)) | ||||||
|  | -	cmpq	%rdx, %rcx
 | ||||||
|  | +	cmp	%RDX_LP, %RCX_LP
 | ||||||
|  |  	jb	HIDDEN_JUMPTARGET (__chk_fail) | ||||||
|  |  END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned)) | ||||||
|  |  #endif | ||||||
|  |   | ||||||
|  |  ENTRY (MEMSET_SYMBOL (__memset, unaligned)) | ||||||
|  |  	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) | ||||||
|  | +# ifdef __ILP32__
 | ||||||
|  | +	/* Clear the upper 32 bits.  */
 | ||||||
|  | +	mov	%edx, %edx
 | ||||||
|  | +# endif
 | ||||||
|  |  L(entry_from_bzero): | ||||||
|  |  	cmpq	$VEC_SIZE, %rdx | ||||||
|  |  	jb	L(less_vec) | ||||||
|  | @@ -112,14 +116,14 @@ END (MEMSET_SYMBOL (__memset, unaligned))
 | ||||||
|  |   | ||||||
|  |  # if VEC_SIZE == 16 | ||||||
|  |  ENTRY (__memset_chk_erms) | ||||||
|  | -	cmpq	%rdx, %rcx
 | ||||||
|  | +	cmp	%RDX_LP, %RCX_LP
 | ||||||
|  |  	jb	HIDDEN_JUMPTARGET (__chk_fail) | ||||||
|  |  END (__memset_chk_erms) | ||||||
|  |   | ||||||
|  |  /* Only used to measure performance of REP STOSB.  */ | ||||||
|  |  ENTRY (__memset_erms) | ||||||
|  |  	/* Skip zero length.  */ | ||||||
|  | -	testq	%rdx, %rdx
 | ||||||
|  | +	test	%RDX_LP, %RDX_LP
 | ||||||
|  |  	jnz	 L(stosb) | ||||||
|  |  	movq	%rdi, %rax | ||||||
|  |  	ret | ||||||
|  | @@ -131,11 +135,11 @@ ENTRY (MEMSET_SYMBOL (__memset, erms))
 | ||||||
|  |  L(stosb): | ||||||
|  |  	/* Issue vzeroupper before rep stosb.  */ | ||||||
|  |  	VZEROUPPER | ||||||
|  | -	movq	%rdx, %rcx
 | ||||||
|  | +	mov	%RDX_LP, %RCX_LP
 | ||||||
|  |  	movzbl	%sil, %eax | ||||||
|  | -	movq	%rdi, %rdx
 | ||||||
|  | +	mov	%RDI_LP, %RDX_LP
 | ||||||
|  |  	rep stosb | ||||||
|  | -	movq	%rdx, %rax
 | ||||||
|  | +	mov	%RDX_LP, %RAX_LP
 | ||||||
|  |  	ret | ||||||
|  |  # if VEC_SIZE == 16 | ||||||
|  |  END (__memset_erms) | ||||||
|  | @@ -145,16 +149,20 @@ END (MEMSET_SYMBOL (__memset, erms))
 | ||||||
|  |   | ||||||
|  |  # if defined SHARED && IS_IN (libc) | ||||||
|  |  ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) | ||||||
|  | -	cmpq	%rdx, %rcx
 | ||||||
|  | +	cmp	%RDX_LP, %RCX_LP
 | ||||||
|  |  	jb	HIDDEN_JUMPTARGET (__chk_fail) | ||||||
|  |  END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) | ||||||
|  |  # endif | ||||||
|  |   | ||||||
|  |  ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms)) | ||||||
|  |  	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) | ||||||
|  | -	cmpq	$VEC_SIZE, %rdx
 | ||||||
|  | +# ifdef __ILP32__
 | ||||||
|  | +	/* Clear the upper 32 bits.  */
 | ||||||
|  | +	mov	%edx, %edx
 | ||||||
|  | +# endif
 | ||||||
|  | +	cmp	$VEC_SIZE, %RDX_LP
 | ||||||
|  |  	jb	L(less_vec) | ||||||
|  | -	cmpq	$(VEC_SIZE * 2), %rdx
 | ||||||
|  | +	cmp	$(VEC_SIZE * 2), %RDX_LP
 | ||||||
|  |  	ja	L(stosb_more_2x_vec) | ||||||
|  |  	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */ | ||||||
|  |  	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx) | ||||||
|  | diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
 | ||||||
|  | index e99dbd7c..98bd9ae9 100644
 | ||||||
|  | --- a/sysdeps/x86_64/x32/Makefile
 | ||||||
|  | +++ b/sysdeps/x86_64/x32/Makefile
 | ||||||
|  | @@ -7,9 +7,9 @@ endif
 | ||||||
|  |   | ||||||
|  |  ifeq ($(subdir),string) | ||||||
|  |  tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \ | ||||||
|  | -	 tst-size_t-memrchr
 | ||||||
|  | +	 tst-size_t-memrchr tst-size_t-memset
 | ||||||
|  |  endif | ||||||
|  |   | ||||||
|  |  ifeq ($(subdir),wcsmbs) | ||||||
|  | -tests += tst-size_t-wmemchr tst-size_t-wmemcmp
 | ||||||
|  | +tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset
 | ||||||
|  |  endif | ||||||
|  | diff --git a/sysdeps/x86_64/x32/tst-size_t-memset.c b/sysdeps/x86_64/x32/tst-size_t-memset.c
 | ||||||
|  | new file mode 100644 | ||||||
|  | index 00000000..2c367af6
 | ||||||
|  | --- /dev/null
 | ||||||
|  | +++ b/sysdeps/x86_64/x32/tst-size_t-memset.c
 | ||||||
|  | @@ -0,0 +1,73 @@
 | ||||||
|  | +/* Test memset with size_t in the lower 32 bits of 64-bit register.
 | ||||||
|  | +   Copyright (C) 2019 Free Software Foundation, Inc.
 | ||||||
|  | +   This file is part of the GNU C Library.
 | ||||||
|  | +
 | ||||||
|  | +   The GNU C Library is free software; you can redistribute it and/or
 | ||||||
|  | +   modify it under the terms of the GNU Lesser General Public
 | ||||||
|  | +   License as published by the Free Software Foundation; either
 | ||||||
|  | +   version 2.1 of the License, or (at your option) any later version.
 | ||||||
|  | +
 | ||||||
|  | +   The GNU C Library is distributed in the hope that it will be useful,
 | ||||||
|  | +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 | ||||||
|  | +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 | ||||||
|  | +   Lesser General Public License for more details.
 | ||||||
|  | +
 | ||||||
|  | +   You should have received a copy of the GNU Lesser General Public
 | ||||||
|  | +   License along with the GNU C Library; if not, see
 | ||||||
|  | +   <http://www.gnu.org/licenses/>.  */
 | ||||||
|  | +
 | ||||||
|  | +#ifdef WIDE
 | ||||||
|  | +# define TEST_NAME "wmemset"
 | ||||||
|  | +#else
 | ||||||
|  | +# define TEST_NAME "memset"
 | ||||||
|  | +#endif /* WIDE */
 | ||||||
|  | +
 | ||||||
|  | +#include "test-size_t.h"
 | ||||||
|  | +
 | ||||||
|  | +#ifdef WIDE
 | ||||||
|  | +# include <wchar.h>
 | ||||||
|  | +# define MEMSET wmemset
 | ||||||
|  | +# define CHAR wchar_t
 | ||||||
|  | +#else
 | ||||||
|  | +# define MEMSET memset
 | ||||||
|  | +# define CHAR char
 | ||||||
|  | +#endif /* WIDE */
 | ||||||
|  | +
 | ||||||
|  | +IMPL (MEMSET, 1)
 | ||||||
|  | +
 | ||||||
|  | +typedef CHAR *(*proto_t) (CHAR *, int, size_t);
 | ||||||
|  | +
 | ||||||
|  | +static void *
 | ||||||
|  | +__attribute__ ((noinline, noclone))
 | ||||||
|  | +do_memset (parameter_t a, parameter_t b)
 | ||||||
|  | +{
 | ||||||
|  | +  return CALL (&b, a.p, (uintptr_t) b.p, a.len);
 | ||||||
|  | +}
 | ||||||
|  | +
 | ||||||
|  | +static int
 | ||||||
|  | +test_main (void)
 | ||||||
|  | +{
 | ||||||
|  | +  test_init ();
 | ||||||
|  | +
 | ||||||
|  | +  CHAR ch = 0x23;
 | ||||||
|  | +  parameter_t src = { { page_size / sizeof (CHAR) }, buf2 };
 | ||||||
|  | +  parameter_t c = { { 0 }, (void *) (uintptr_t) ch };
 | ||||||
|  | +
 | ||||||
|  | +  int ret = 0;
 | ||||||
|  | +  FOR_EACH_IMPL (impl, 0)
 | ||||||
|  | +    {
 | ||||||
|  | +      c.fn = impl->fn;
 | ||||||
|  | +      CHAR *p = (CHAR *) do_memset (src, c);
 | ||||||
|  | +      size_t i;
 | ||||||
|  | +      for (i = 0; i < src.len; i++)
 | ||||||
|  | +	if (p[i] != ch)
 | ||||||
|  | +	  {
 | ||||||
|  | +	    error (0, 0, "Wrong result in function %s", impl->name);
 | ||||||
|  | +	    ret = 1;
 | ||||||
|  | +	  }
 | ||||||
|  | +    }
 | ||||||
|  | +
 | ||||||
|  | +  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
 | ||||||
|  | +}
 | ||||||
|  | +
 | ||||||
|  | +#include <support/test-driver.c>
 | ||||||
|  | diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemset.c b/sysdeps/x86_64/x32/tst-size_t-wmemset.c
 | ||||||
|  | new file mode 100644 | ||||||
|  | index 00000000..955eb488
 | ||||||
|  | --- /dev/null
 | ||||||
|  | +++ b/sysdeps/x86_64/x32/tst-size_t-wmemset.c
 | ||||||
|  | @@ -0,0 +1,20 @@
 | ||||||
|  | +/* Test wmemset with size_t in the lower 32 bits of 64-bit register.
 | ||||||
|  | +   Copyright (C) 2019 Free Software Foundation, Inc.
 | ||||||
|  | +   This file is part of the GNU C Library.
 | ||||||
|  | +
 | ||||||
|  | +   The GNU C Library is free software; you can redistribute it and/or
 | ||||||
|  | +   modify it under the terms of the GNU Lesser General Public
 | ||||||
|  | +   License as published by the Free Software Foundation; either
 | ||||||
|  | +   version 2.1 of the License, or (at your option) any later version.
 | ||||||
|  | +
 | ||||||
|  | +   The GNU C Library is distributed in the hope that it will be useful,
 | ||||||
|  | +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 | ||||||
|  | +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 | ||||||
|  | +   Lesser General Public License for more details.
 | ||||||
|  | +
 | ||||||
|  | +   You should have received a copy of the GNU Lesser General Public
 | ||||||
|  | +   License along with the GNU C Library; if not, see
 | ||||||
|  | +   <http://www.gnu.org/licenses/>.  */
 | ||||||
|  | +
 | ||||||
|  | +#define WIDE 1
 | ||||||
|  | +#include "tst-size_t-memset.c"
 | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
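
The fix pattern is the same throughout the patch: either use only 32-bit
(%edx-style) arithmetic on the length, or zero-extend first.  A small C
illustration (sanitize_x32_len is a hypothetical name) of what the added
`mov %edx, %edx` achieves:

#include <stdint.h>

static uint64_t
sanitize_x32_len (uint64_t rdx)
{
  /* A 32-bit register write zero-extends into the full 64-bit
     register, discarding caller-side junk in the upper half.  */
  return (uint32_t) rdx;
}
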
							
								
								
									
SOURCES/glibc-RHEL-15696-50.patch  (new file, 43 lines)
									
								
							| @ -0,0 +1,43 @@ | |||||||
|  | From 447954a206837b5f153869cfeeeab44631c3fac9 Mon Sep 17 00:00:00 2001 | ||||||
|  | Author: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com>  2021-05-23 21:43:10 | ||||||
|  | Committer: H.J. Lu <hjl.tools@gmail.com>  2021-06-27 10:56:57 | ||||||
|  | Parent: 2c16cb88a6e5ace0fb7cedca86860ea7bde522a7 (Linux: Move timer helper routines from librt to libc) | ||||||
|  | Child:  1683249d17e14827b6579529742eb895027dfa84 (x86_64: roundeven with sse4.1 support) | ||||||
|  | Branches: master, remotes/origin/master and many more (41) | ||||||
|  | Follows: glibc-2.33.9000 | ||||||
|  | Precedes: glibc-2.34 | ||||||
|  | 
 | ||||||
|  |     math: redirect roundeven function | ||||||
|  |      | ||||||
|  |     This patch redirects the roundeven function in preparation for further changes. | ||||||
|  |      | ||||||
|  |     Signed-off-by: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com> | ||||||
|  |     Reviewed-by: H.J. Lu <hjl.tools@gmail.com> | ||||||
|  | 
 | ||||||
|  | Conflicts: | ||||||
|  | 	* | ||||||
|  | 	(rewritten for older branch) | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c b/sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c
 | ||||||
|  | index 7bbbb2dc..8728d0f2 100644
 | ||||||
|  | --- a/sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c
 | ||||||
|  | +++ b/sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c
 | ||||||
|  | @@ -17,6 +17,7 @@
 | ||||||
|  |     License along with the GNU C Library; if not, see | ||||||
|  |     <http://www.gnu.org/licenses/>.  */ | ||||||
|  |   | ||||||
|  | +#define NO_MATH_REDIRECT
 | ||||||
|  |  #include <math.h> | ||||||
|  |  #include <math_private.h> | ||||||
|  |  #include <libm-alias-double.h> | ||||||
|  | @@ -67,5 +68,6 @@ __roundeven (double x)
 | ||||||
|  |    INSERT_WORDS64 (x, ix); | ||||||
|  |    return x; | ||||||
|  |  } | ||||||
|  | -hidden_def (__roundeven)
 | ||||||
|  | +#ifndef __roundeven
 | ||||||
|  |  libm_alias_double (__roundeven, roundeven) | ||||||
|  | +#endif
 | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
							
								
								
									
SOURCES/glibc-RHEL-15696-51.patch  (new file, 118 lines)
									
								
							| @ -0,0 +1,118 @@ | |||||||
|  | From 447954a206837b5f153869cfeeeab44631c3fac9 Mon Sep 17 00:00:00 2001 | ||||||
|  | From: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com> | ||||||
|  | Date: Mon, 24 May 2021 09:43:10 +0800 | ||||||
|  | Subject: [PATCH] math: redirect roundeven function | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | This patch redirects the roundeven function in preparation for further changes. | ||||||
|  | 
 | ||||||
|  | Signed-off-by: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com> | ||||||
|  | Reviewed-by: H.J. Lu <hjl.tools@gmail.com> | ||||||
|  | ---
 | ||||||
|  |  include/math.h                             | 3 ++- | ||||||
|  |  sysdeps/ieee754/dbl-64/s_roundeven.c       | 4 +++- | ||||||
|  |  sysdeps/ieee754/float128/s_roundevenf128.c | 1 + | ||||||
|  |  sysdeps/ieee754/flt-32/s_roundevenf.c      | 3 +++ | ||||||
|  |  sysdeps/ieee754/ldbl-128/s_roundevenl.c    | 1 + | ||||||
|  |  sysdeps/ieee754/ldbl-96/s_roundevenl.c     | 1 + | ||||||
|  |  6 files changed, 11 insertions(+), 2 deletions(-) | ||||||
|  | 
 | ||||||
|  | Conflicts: | ||||||
|  | 	include/math.h | ||||||
|  | 	(missing MATH_REDIRECT macros) | ||||||
|  | 
 | ||||||
|  | diff --git a/include/math.h b/include/math.h
 | ||||||
|  | index e21d34b8..1f9f9a54 100644
 | ||||||
|  | --- a/include/math.h
 | ||||||
|  | +++ b/include/math.h
 | ||||||
|  | @@ -38,7 +38,6 @@ libm_hidden_proto (__issignaling)
 | ||||||
|  |  libm_hidden_proto (__issignalingf) | ||||||
|  |  libm_hidden_proto (__exp) | ||||||
|  |  libm_hidden_proto (__expf) | ||||||
|  | -libm_hidden_proto (__roundeven)
 | ||||||
|  |   | ||||||
|  |  # ifndef __NO_LONG_DOUBLE_MATH | ||||||
|  |  libm_hidden_proto (__fpclassifyl) | ||||||
|  | @@ -56,6 +55,8 @@ libm_hidden_proto (__expm1f128)
 | ||||||
|  |   | ||||||
|  |  # if !(defined __FINITE_MATH_ONLY__ && __FINITE_MATH_ONLY__ > 0) | ||||||
|  |  #  ifndef NO_MATH_REDIRECT | ||||||
|  | +float (roundevenf) (float) asm ("__roundevenf");
 | ||||||
|  | +double (roundeven) (double) asm ("__roundeven");
 | ||||||
|  |  /* Declare sqrt for use within GLIBC.  Compilers typically inline sqrt as a | ||||||
|  |     single instruction.  Use an asm to avoid use of PLTs if it doesn't.  */ | ||||||
|  |  float (sqrtf) (float) asm ("__ieee754_sqrtf"); | ||||||
|  | diff --git a/sysdeps/ieee754/dbl-64/s_roundeven.c b/sysdeps/ieee754/dbl-64/s_roundeven.c
 | ||||||
|  | index 1438e81d..61962184 100644
 | ||||||
|  | --- a/sysdeps/ieee754/dbl-64/s_roundeven.c
 | ||||||
|  | +++ b/sysdeps/ieee754/dbl-64/s_roundeven.c
 | ||||||
|  | @@ -17,6 +17,7 @@
 | ||||||
|  |     License along with the GNU C Library; if not, see | ||||||
|  |     <http://www.gnu.org/licenses/>.  */ | ||||||
|  |   | ||||||
|  | +#define NO_MATH_REDIRECT
 | ||||||
|  |  #include <math.h> | ||||||
|  |  #include <math_private.h> | ||||||
|  |  #include <libm-alias-double.h> | ||||||
|  | @@ -101,5 +102,6 @@ __roundeven (double x)
 | ||||||
|  |    INSERT_WORDS (x, hx, lx); | ||||||
|  |    return x; | ||||||
|  |  } | ||||||
|  | -hidden_def (__roundeven)
 | ||||||
|  | +#ifndef __roundeven
 | ||||||
|  |  libm_alias_double (__roundeven, roundeven) | ||||||
|  | +#endif
 | ||||||
|  | diff --git a/sysdeps/ieee754/float128/s_roundevenf128.c b/sysdeps/ieee754/float128/s_roundevenf128.c
 | ||||||
|  | index 5a9b3f39..e0faf727 100644
 | ||||||
|  | --- a/sysdeps/ieee754/float128/s_roundevenf128.c
 | ||||||
|  | +++ b/sysdeps/ieee754/float128/s_roundevenf128.c
 | ||||||
|  | @@ -1,2 +1,3 @@
 | ||||||
|  | +#define NO_MATH_REDIRECT
 | ||||||
|  |  #include <float128_private.h> | ||||||
|  |  #include "../ldbl-128/s_roundevenl.c" | ||||||
|  | diff --git a/sysdeps/ieee754/flt-32/s_roundevenf.c b/sysdeps/ieee754/flt-32/s_roundevenf.c
 | ||||||
|  | index 90f991d5..a661875e 100644
 | ||||||
|  | --- a/sysdeps/ieee754/flt-32/s_roundevenf.c
 | ||||||
|  | +++ b/sysdeps/ieee754/flt-32/s_roundevenf.c
 | ||||||
|  | @@ -17,6 +17,7 @@
 | ||||||
|  |     License along with the GNU C Library; if not, see | ||||||
|  |     <http://www.gnu.org/licenses/>.  */ | ||||||
|  |   | ||||||
|  | +#define NO_MATH_REDIRECT
 | ||||||
|  |  #include <math.h> | ||||||
|  |  #include <math_private.h> | ||||||
|  |  #include <libm-alias-float.h> | ||||||
|  | @@ -67,4 +68,6 @@ __roundevenf (float x)
 | ||||||
|  |    SET_FLOAT_WORD (x, ix); | ||||||
|  |    return x; | ||||||
|  |  } | ||||||
|  | +#ifndef __roundevenf
 | ||||||
|  |  libm_alias_float (__roundeven, roundeven) | ||||||
|  | +#endif
 | ||||||
|  | diff --git a/sysdeps/ieee754/ldbl-128/s_roundevenl.c b/sysdeps/ieee754/ldbl-128/s_roundevenl.c
 | ||||||
|  | index 5fc59af4..b9375b6c 100644
 | ||||||
|  | --- a/sysdeps/ieee754/ldbl-128/s_roundevenl.c
 | ||||||
|  | +++ b/sysdeps/ieee754/ldbl-128/s_roundevenl.c
 | ||||||
|  | @@ -17,6 +17,7 @@
 | ||||||
|  |     License along with the GNU C Library; if not, see | ||||||
|  |     <http://www.gnu.org/licenses/>.  */ | ||||||
|  |   | ||||||
|  | +#define NO_MATH_REDIRECT
 | ||||||
|  |  #include <math.h> | ||||||
|  |  #include <math_private.h> | ||||||
|  |  #include <libm-alias-ldouble.h> | ||||||
|  | diff --git a/sysdeps/ieee754/ldbl-96/s_roundevenl.c b/sysdeps/ieee754/ldbl-96/s_roundevenl.c
 | ||||||
|  | index be2e4fa4..65031ab7 100644
 | ||||||
|  | --- a/sysdeps/ieee754/ldbl-96/s_roundevenl.c
 | ||||||
|  | +++ b/sysdeps/ieee754/ldbl-96/s_roundevenl.c
 | ||||||
|  | @@ -17,6 +17,7 @@
 | ||||||
|  |     License along with the GNU C Library; if not, see | ||||||
|  |     <http://www.gnu.org/licenses/>.  */ | ||||||
|  |   | ||||||
|  | +#define NO_MATH_REDIRECT
 | ||||||
|  |  #include <math.h> | ||||||
|  |  #include <math_private.h> | ||||||
|  |  #include <libm-alias-ldouble.h> | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
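
For context on the function these roundeven patches prepare and then
accelerate: roundeven rounds to the nearest integer with ties to even,
which is exactly what roundsd/roundss with immediate 8 (round to nearest
even, inexact exception suppressed) computes in the sse4.1 patch below.
A small usage sketch (assumes glibc 2.25 or later; build with -lm):

#define _GNU_SOURCE
#include <math.h>
#include <stdio.h>

int
main (void)
{
  /* Ties go to the even neighbour, unlike round(), which rounds
     halfway cases away from zero.  */
  printf ("%.1f %.1f %.1f\n",
          roundeven (0.5), roundeven (1.5), roundeven (2.5));
  /* Prints: 0.0 2.0 2.0 (round() would print 1.0 2.0 3.0).  */
  return 0;
}
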
							
								
								
									
SOURCES/glibc-RHEL-15696-52.patch  (new file, 242 lines)
									
								
							| @ -0,0 +1,242 @@ | |||||||
|  | From 1683249d17e14827b6579529742eb895027dfa84 Mon Sep 17 00:00:00 2001 | ||||||
|  | From: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com> | ||||||
|  | Date: Mon, 24 May 2021 09:43:11 +0800 | ||||||
|  | Subject: [PATCH] x86_64: roundeven with sse4.1 support | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | This patch adds support for the sse4.1 hardware floating-point | ||||||
|  | roundeven. | ||||||
|  | 
 | ||||||
|  | Here is some benchmark results on my systems: | ||||||
|  | 
 | ||||||
|  | =AMD Ryzen 9 3900X 12-Core Processor=
 | ||||||
|  | 
 | ||||||
|  | * benchmark result before this commit | ||||||
|  | |            |    roundeven |   roundevenf | | ||||||
|  | |------------|--------------|--------------| | ||||||
|  | | duration   |  3.75587e+09 |  3.75114e+09 | | ||||||
|  | | iterations |  3.93053e+08 |  4.35402e+08 | | ||||||
|  | | max        | 52.592       | 58.71        | | ||||||
|  | | min        |  7.98        |  7.22        | | ||||||
|  | | mean       |  9.55563     |  8.61535     | | ||||||
|  | 
 | ||||||
|  | * benchmark result after this commit | ||||||
|  | |            |     roundeven |   roundevenf | | ||||||
|  | |------------|---------------|--------------| | ||||||
|  | | duration   |   3.73815e+09 |  3.73738e+09 | | ||||||
|  | | iterations |   5.82692e+08 |  5.91498e+08 | | ||||||
|  | | max        |  56.468       | 51.642       | | ||||||
|  | | min        |   6.27        |  6.156       | | ||||||
|  | | mean       |   6.41532     |  6.3185      | | ||||||
|  | 
 | ||||||
|  | =Intel(R) Pentium(R) CPU D1508 @ 2.20GHz=
 | ||||||
|  | 
 | ||||||
|  | * benchmark result before this commit | ||||||
|  | |            |    roundeven |   roundevenf | | ||||||
|  | |------------|--------------|--------------| | ||||||
|  | | duration   |  2.18208e+09 |  2.18258e+09 | | ||||||
|  | | iterations |  2.39932e+08 |  2.46924e+08 | | ||||||
|  | | max        | 96.378       | 98.035       | | ||||||
|  | | min        |  6.776       |  5.94        | | ||||||
|  | | mean       |  9.09456     |  8.83907     | | ||||||
|  | 
 | ||||||
|  | * benchmark result after this commit | ||||||
|  | |            |    roundeven |   roundevenf | | ||||||
|  | |------------|--------------|--------------| | ||||||
|  | | duration   |  2.17415e+09 |  2.17005e+09 | | ||||||
|  | | iterations |  3.56193e+08 |  4.09824e+08 | | ||||||
|  | | max        | 51.693       | 97.192       | | ||||||
|  | | min        |  5.926       |  5.093       | | ||||||
|  | | mean       |  6.10385     |  5.29507     | | ||||||
|  | 
 | ||||||
|  | Signed-off-by: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com> | ||||||
|  | Reviewed-by: H.J. Lu <hjl.tools@gmail.com> | ||||||
|  | ---
 | ||||||
|  |  sysdeps/x86_64/fpu/multiarch/Makefile         |  5 +-- | ||||||
|  |  sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c  |  2 ++ | ||||||
|  |  .../x86_64/fpu/multiarch/s_roundeven-sse4_1.S | 24 ++++++++++++++ | ||||||
|  |  sysdeps/x86_64/fpu/multiarch/s_roundeven.c    | 31 +++++++++++++++++++ | ||||||
|  |  sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c |  3 ++ | ||||||
|  |  .../fpu/multiarch/s_roundevenf-sse4_1.S       | 24 ++++++++++++++ | ||||||
|  |  sysdeps/x86_64/fpu/multiarch/s_roundevenf.c   | 31 +++++++++++++++++++ | ||||||
|  |  7 files changed, 118 insertions(+), 2 deletions(-) | ||||||
|  |  create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c | ||||||
|  |  create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundeven-sse4_1.S | ||||||
|  |  create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundeven.c | ||||||
|  |  create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c | ||||||
|  |  create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundevenf-sse4_1.S | ||||||
|  |  create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundevenf.c | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
 | ||||||
|  | index 9f387248..6ddd1c01 100644
 | ||||||
|  | --- a/sysdeps/x86_64/fpu/multiarch/Makefile
 | ||||||
|  | +++ b/sysdeps/x86_64/fpu/multiarch/Makefile
 | ||||||
|  | @@ -1,11 +1,12 @@
 | ||||||
|  |  ifeq ($(subdir),math) | ||||||
|  |  libm-sysdep_routines += s_floor-c s_ceil-c s_floorf-c s_ceilf-c \ | ||||||
|  |  			s_rint-c s_rintf-c s_nearbyint-c s_nearbyintf-c \ | ||||||
|  | -			s_trunc-c s_truncf-c
 | ||||||
|  | +			s_roundeven-c s_roundevenf-c s_trunc-c s_truncf-c
 | ||||||
|  |   | ||||||
|  |  libm-sysdep_routines += s_ceil-sse4_1 s_ceilf-sse4_1 s_floor-sse4_1 \ | ||||||
|  |  			s_floorf-sse4_1 s_nearbyint-sse4_1 \ | ||||||
|  | -			s_nearbyintf-sse4_1 s_rint-sse4_1 s_rintf-sse4_1 \
 | ||||||
|  | +			s_nearbyintf-sse4_1 s_roundeven-sse4_1 \
 | ||||||
|  | +			s_roundevenf-sse4_1 s_rint-sse4_1 s_rintf-sse4_1 \
 | ||||||
|  |  			s_trunc-sse4_1 s_truncf-sse4_1 | ||||||
|  |   | ||||||
|  |  libm-sysdep_routines += e_exp-fma e_log-fma e_pow-fma s_atan-fma \ | ||||||
|  | diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c b/sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c
 | ||||||
|  | new file mode 100644 | ||||||
|  | index 00000000..c7be43cb
 | ||||||
|  | --- /dev/null
 | ||||||
|  | +++ b/sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c
 | ||||||
|  | @@ -0,0 +1,2 @@
 | ||||||
|  | +#define __roundeven __roundeven_c
 | ||||||
|  | +#include <sysdeps/ieee754/dbl-64/s_roundeven.c>
 | ||||||
|  | diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundeven-sse4_1.S b/sysdeps/x86_64/fpu/multiarch/s_roundeven-sse4_1.S
 | ||||||
|  | new file mode 100644 | ||||||
|  | index 00000000..6ae8f6b1
 | ||||||
|  | --- /dev/null
 | ||||||
|  | +++ b/sysdeps/x86_64/fpu/multiarch/s_roundeven-sse4_1.S
 | ||||||
|  | @@ -0,0 +1,24 @@
 | ||||||
|  | +/* Copyright (C) 2021 Free Software Foundation, Inc.
 | ||||||
|  | +   This file is part of the GNU C Library.
 | ||||||
|  | +
 | ||||||
|  | +   The GNU C Library is free software; you can redistribute it and/or
 | ||||||
|  | +   modify it under the terms of the GNU Lesser General Public
 | ||||||
|  | +   License as published by the Free Software Foundation; either
 | ||||||
|  | +   version 2.1 of the License, or (at your option) any later version.
 | ||||||
|  | +
 | ||||||
|  | +   The GNU C Library is distributed in the hope that it will be useful,
 | ||||||
|  | +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 | ||||||
|  | +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 | ||||||
|  | +   Lesser General Public License for more details.
 | ||||||
|  | +
 | ||||||
|  | +   You should have received a copy of the GNU Lesser General Public
 | ||||||
|  | +   License along with the GNU C Library; if not, see
 | ||||||
|  | +   <https://www.gnu.org/licenses/>.  */
 | ||||||
|  | +
 | ||||||
|  | +#include <sysdep.h>
 | ||||||
|  | +
 | ||||||
|  | +	.section .text.sse4.1,"ax",@progbits
 | ||||||
|  | +ENTRY(__roundeven_sse41)
 | ||||||
|  | +	roundsd	$8, %xmm0, %xmm0
 | ||||||
|  | +	ret
 | ||||||
|  | +END(__roundeven_sse41)
 | ||||||
|  | diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundeven.c b/sysdeps/x86_64/fpu/multiarch/s_roundeven.c
 | ||||||
|  | new file mode 100644 | ||||||
|  | index 00000000..d92eda65
 | ||||||
|  | --- /dev/null
 | ||||||
|  | +++ b/sysdeps/x86_64/fpu/multiarch/s_roundeven.c
 | ||||||
|  | @@ -0,0 +1,31 @@
 | ||||||
|  | +/* Multiple versions of __roundeven.
 | ||||||
|  | +   Copyright (C) 2021 Free Software Foundation, Inc.
 | ||||||
|  | +   This file is part of the GNU C Library.
 | ||||||
|  | +
 | ||||||
|  | +   The GNU C Library is free software; you can redistribute it and/or
 | ||||||
|  | +   modify it under the terms of the GNU Lesser General Public
 | ||||||
|  | +   License as published by the Free Software Foundation; either
 | ||||||
|  | +   version 2.1 of the License, or (at your option) any later version.
 | ||||||
|  | +
 | ||||||
|  | +   The GNU C Library is distributed in the hope that it will be useful,
 | ||||||
|  | +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 | ||||||
|  | +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 | ||||||
|  | +   Lesser General Public License for more details.
 | ||||||
|  | +
 | ||||||
|  | +   You should have received a copy of the GNU Lesser General Public
 | ||||||
|  | +   License along with the GNU C Library; if not, see
 | ||||||
|  | +   <https://www.gnu.org/licenses/>.  */
 | ||||||
|  | +
 | ||||||
|  | +#include <libm-alias-double.h>
 | ||||||
|  | +
 | ||||||
|  | +#define roundeven __redirect_roundeven
 | ||||||
|  | +#define __roundeven __redirect___roundeven
 | ||||||
|  | +#include <math.h>
 | ||||||
|  | +#undef roundeven
 | ||||||
|  | +#undef __roundeven
 | ||||||
|  | +
 | ||||||
|  | +#define SYMBOL_NAME roundeven
 | ||||||
|  | +#include "ifunc-sse4_1.h"
 | ||||||
|  | +
 | ||||||
|  | +libc_ifunc_redirected (__redirect_roundeven, __roundeven, IFUNC_SELECTOR ());
 | ||||||
|  | +libm_alias_double (__roundeven, roundeven)
 | ||||||
|  | diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c b/sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c
 | ||||||
|  | new file mode 100644 | ||||||
|  | index 00000000..72a6e7d1
 | ||||||
|  | --- /dev/null
 | ||||||
|  | +++ b/sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c
 | ||||||
|  | @@ -0,0 +1,3 @@
 | ||||||
|  | +#undef __roundevenf
 | ||||||
|  | +#define __roundevenf __roundevenf_c
 | ||||||
|  | +#include <sysdeps/ieee754/flt-32/s_roundevenf.c>
 | ||||||
|  | diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundevenf-sse4_1.S b/sysdeps/x86_64/fpu/multiarch/s_roundevenf-sse4_1.S
 | ||||||
|  | new file mode 100644 | ||||||
|  | index 00000000..a76e1080
 | ||||||
|  | --- /dev/null
 | ||||||
|  | +++ b/sysdeps/x86_64/fpu/multiarch/s_roundevenf-sse4_1.S
 | ||||||
|  | @@ -0,0 +1,24 @@
 | ||||||
|  | +/* Copyright (C) 2021 Free Software Foundation, Inc.
 | ||||||
|  | +   This file is part of the GNU C Library.
 | ||||||
|  | +
 | ||||||
|  | +   The GNU C Library is free software; you can redistribute it and/or
 | ||||||
|  | +   modify it under the terms of the GNU Lesser General Public
 | ||||||
|  | +   License as published by the Free Software Foundation; either
 | ||||||
|  | +   version 2.1 of the License, or (at your option) any later version.
 | ||||||
|  | +
 | ||||||
|  | +   The GNU C Library is distributed in the hope that it will be useful,
 | ||||||
|  | +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 | ||||||
|  | +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 | ||||||
|  | +   Lesser General Public License for more details.
 | ||||||
|  | +
 | ||||||
|  | +   You should have received a copy of the GNU Lesser General Public
 | ||||||
|  | +   License along with the GNU C Library; if not, see
 | ||||||
|  | +   <https://www.gnu.org/licenses/>.  */
 | ||||||
|  | +
 | ||||||
|  | +#include <sysdep.h>
 | ||||||
|  | +
 | ||||||
|  | +	.section .text.sse4.1,"ax",@progbits
 | ||||||
|  | +ENTRY(__roundevenf_sse41)
 | ||||||
|  | +	roundss	$8, %xmm0, %xmm0
 | ||||||
|  | +	ret
 | ||||||
|  | +END(__roundevenf_sse41)
 | ||||||
|  | diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundevenf.c b/sysdeps/x86_64/fpu/multiarch/s_roundevenf.c
 | ||||||
|  | new file mode 100644 | ||||||
|  | index 00000000..2ee196e6
 | ||||||
|  | --- /dev/null
 | ||||||
|  | +++ b/sysdeps/x86_64/fpu/multiarch/s_roundevenf.c
 | ||||||
|  | @@ -0,0 +1,31 @@
 | ||||||
|  | +/* Multiple versions of __roundevenf.
 | ||||||
|  | +   Copyright (C) 2021 Free Software Foundation, Inc.
 | ||||||
|  | +   This file is part of the GNU C Library.
 | ||||||
|  | +
 | ||||||
|  | +   The GNU C Library is free software; you can redistribute it and/or
 | ||||||
|  | +   modify it under the terms of the GNU Lesser General Public
 | ||||||
|  | +   License as published by the Free Software Foundation; either
 | ||||||
|  | +   version 2.1 of the License, or (at your option) any later version.
 | ||||||
|  | +
 | ||||||
|  | +   The GNU C Library is distributed in the hope that it will be useful,
 | ||||||
|  | +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 | ||||||
|  | +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 | ||||||
|  | +   Lesser General Public License for more details.
 | ||||||
|  | +
 | ||||||
|  | +   You should have received a copy of the GNU Lesser General Public
 | ||||||
|  | +   License along with the GNU C Library; if not, see
 | ||||||
|  | +   <https://www.gnu.org/licenses/>.  */
 | ||||||
|  | +
 | ||||||
|  | +#include <libm-alias-float.h>
 | ||||||
|  | +
 | ||||||
|  | +#define roundevenf __redirect_roundevenf
 | ||||||
|  | +#define __roundevenf __redirect___roundevenf
 | ||||||
|  | +#include <math.h>
 | ||||||
|  | +#undef roundevenf
 | ||||||
|  | +#undef __roundevenf
 | ||||||
|  | +
 | ||||||
|  | +#define SYMBOL_NAME roundevenf
 | ||||||
|  | +#include "ifunc-sse4_1.h"
 | ||||||
|  | +
 | ||||||
|  | +libc_ifunc_redirected (__redirect_roundevenf, __roundevenf, IFUNC_SELECTOR ());
 | ||||||
|  | +libm_alias_float (__roundeven, roundeven)
 | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
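For context on the patch above: roundeven rounds to the nearest integer with ties going to the even value, and the function itself has been in glibc since 2.25; this backport only adds SSE4.1 ifunc variants, since roundsd/roundss with immediate 8 (rounding-control bits 00 select nearest-even, bit 3 suppresses the inexact exception) compute it in a single instruction. A minimal sketch of the tie-breaking semantics, assuming _GNU_SOURCE so that <math.h> declares roundeven:

#define _GNU_SOURCE
#include <math.h>
#include <stdio.h>

int
main (void)
{
  /* Halfway cases go to the nearest even integer; round () instead
     rounds them away from zero.  */
  printf ("roundeven (0.5) = %.1f, round (0.5) = %.1f\n",
	  roundeven (0.5), round (0.5));	/* 0.0 vs 1.0 */
  printf ("roundeven (2.5) = %.1f, round (2.5) = %.1f\n",
	  roundeven (2.5), round (2.5));	/* 2.0 vs 3.0 */
  return 0;
}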
							
								
								
									
41	SOURCES/glibc-RHEL-15696-53.patch	Normal file
							| @ -0,0 +1,41 @@ | |||||||
|  | From 7e08db3359c86c94918feb33a1182cd0ff3bb10b Mon Sep 17 00:00:00 2001 | ||||||
|  | From: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  | Date: Sun, 9 Jan 2022 16:02:28 -0600 | ||||||
|  | Subject: [PATCH] x86: Fix __wcsncmp_evex in strcmp-evex.S [BZ# 28755] | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | Fixes [BZ# 28755] for wcsncmp by redirecting lengths >= 2^56 to | ||||||
|  | __wcscmp_evex. On x86_64 this covers the entire address range, so any | ||||||
|  | larger length could not possibly be used to bound `s1` or `s2`. | ||||||
|  | 
 | ||||||
|  | test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass. | ||||||
|  | 
 | ||||||
|  | Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  | ---
 | ||||||
|  |  sysdeps/x86_64/multiarch/strcmp-evex.S | 10 ++++++++++ | ||||||
|  |  1 file changed, 10 insertions(+) | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
 | ||||||
|  | index 459eeed0..d5aa6daa 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/strcmp-evex.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
 | ||||||
|  | @@ -97,6 +97,16 @@ ENTRY (STRCMP)
 | ||||||
|  |  	je	L(char0) | ||||||
|  |  	jb	L(zero) | ||||||
|  |  #  ifdef USE_AS_WCSCMP | ||||||
|  | +#  ifndef __ILP32__
 | ||||||
|  | +	movq	%rdx, %rcx
 | ||||||
|  | +	/* Check if length could overflow when multiplied by
 | ||||||
|  | +	   sizeof(wchar_t). Checking top 8 bits will cover all potential
 | ||||||
|  | +	   overflow cases as well as redirect cases where it is impossible for the
 | ||||||
|  | +	   length to bound a valid memory region. In these cases just use
 | ||||||
|  | +	   'wcscmp'.  */
 | ||||||
|  | +	shrq	$56, %rcx
 | ||||||
|  | +	jnz	__wcscmp_evex
 | ||||||
|  | +#  endif
 | ||||||
|  |  	/* Convert units: from wide to byte char.  */ | ||||||
|  |  	shl	$2, %RDX_LP | ||||||
|  |  #  endif | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
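The guard in the patch above is easier to read in C. On x86_64, wchar_t is 4 bytes, so a length with any of its top 8 bits set would overflow, or at least exceed the usable address space, once multiplied by sizeof (wchar_t); such a length can never bound a real buffer, so an unbounded compare is equivalent. A sketch of the logic, with a hypothetical wrapper name used purely for illustration:

#include <stddef.h>
#include <wchar.h>

/* Hypothetical wrapper mirroring the 'shrq $56; jnz __wcscmp_evex'
   guard added above.  */
int
wcsncmp_guarded (const wchar_t *s1, const wchar_t *s2, size_t n)
{
  /* If any of the top 8 bits of n are set, n * sizeof (wchar_t)
     cannot describe a valid memory region on x86_64, so the bound
     can never take effect before a terminating null wide char.  */
  if (n >> 56)
    return wcscmp (s1, s2);
  return wcsncmp (s1, s2, n);
}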
							
								
								
									
268	SOURCES/glibc-RHEL-15696-54.patch	Normal file
							| @ -0,0 +1,268 @@ | |||||||
|  | From 78c9ec9000f873abe7a15a91b87080a2e4308260 Mon Sep 17 00:00:00 2001 | ||||||
|  | From: "H.J. Lu" <hjl.tools@gmail.com> | ||||||
|  | Date: Fri, 20 Aug 2021 06:42:24 -0700 | ||||||
|  | Subject: [PATCH] x86-64: Optimize load of all bits set into ZMM register [BZ | ||||||
|  |  #28252] | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | Optimize loads of all bits set into ZMM register in AVX512 SVML codes | ||||||
|  | by replacing | ||||||
|  | 
 | ||||||
|  | 	vpbroadcastq .L_2il0floatpacket.16(%rip), %zmmX | ||||||
|  | 
 | ||||||
|  | and | ||||||
|  | 
 | ||||||
|  | 	vmovups   .L_2il0floatpacket.13(%rip), %zmmX | ||||||
|  | 
 | ||||||
|  | with | ||||||
|  | 	vpternlogd $0xff, %zmmX, %zmmX, %zmmX | ||||||
|  | 
 | ||||||
|  | This fixes BZ #28252. | ||||||
|  | ---
 | ||||||
|  |  .../x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S   |  7 +------ | ||||||
|  |  .../x86_64/fpu/multiarch/svml_d_log8_core_avx512.S   |  7 +------ | ||||||
|  |  .../x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S   |  7 +------ | ||||||
|  |  .../fpu/multiarch/svml_d_sincos8_core_avx512.S       |  7 +------ | ||||||
|  |  .../x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S |  7 +------ | ||||||
|  |  .../x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S |  7 +------ | ||||||
|  |  .../x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S |  7 +------ | ||||||
|  |  .../x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S | 12 ++---------- | ||||||
|  |  .../fpu/multiarch/svml_s_sincosf16_core_avx512.S     |  7 +------ | ||||||
|  |  .../x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S |  7 +------ | ||||||
|  |  10 files changed, 11 insertions(+), 64 deletions(-) | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
 | ||||||
|  | index 24e3b363..07dfed85 100644
 | ||||||
|  | --- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
 | ||||||
|  | +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
 | ||||||
|  | @@ -265,7 +265,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_cos
 | ||||||
|  |          vmovaps   %zmm0, %zmm8 | ||||||
|  |   | ||||||
|  |  /* Check for large arguments path */ | ||||||
|  | -        vpbroadcastq .L_2il0floatpacket.16(%rip), %zmm2
 | ||||||
|  | +        vpternlogd $0xff, %zmm2, %zmm2, %zmm2
 | ||||||
|  |   | ||||||
|  |  /* | ||||||
|  |    ARGUMENT RANGE REDUCTION: | ||||||
|  | @@ -456,8 +456,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_cos
 | ||||||
|  |          jmp       .LBL_2_7 | ||||||
|  |  #endif | ||||||
|  |  END (_ZGVeN8v_cos_skx) | ||||||
|  | -
 | ||||||
|  | -	.section .rodata, "a"
 | ||||||
|  | -.L_2il0floatpacket.16:
 | ||||||
|  | -	.long	0xffffffff,0xffffffff
 | ||||||
|  | -	.type	.L_2il0floatpacket.16,@object
 | ||||||
|  | diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
 | ||||||
|  | index ae8af8d8..ddb60e5b 100644
 | ||||||
|  | --- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
 | ||||||
|  | +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
 | ||||||
|  | @@ -274,7 +274,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_log
 | ||||||
|  |   | ||||||
|  |  /* preserve mantissa, set input exponent to 2^(-10) */ | ||||||
|  |          vpternlogq $248, _ExpMask(%rax), %zmm3, %zmm2 | ||||||
|  | -        vpbroadcastq .L_2il0floatpacket.12(%rip), %zmm1
 | ||||||
|  | +        vpternlogd $0xff, %zmm1, %zmm1, %zmm1
 | ||||||
|  |          vpsrlq    $32, %zmm4, %zmm6 | ||||||
|  |   | ||||||
|  |  /* reciprocal approximation good to at least 11 bits */ | ||||||
|  | @@ -461,8 +461,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_log
 | ||||||
|  |          jmp       .LBL_2_7 | ||||||
|  |  #endif | ||||||
|  |  END (_ZGVeN8v_log_skx) | ||||||
|  | -
 | ||||||
|  | -	.section .rodata, "a"
 | ||||||
|  | -.L_2il0floatpacket.12:
 | ||||||
|  | -	.long	0xffffffff,0xffffffff
 | ||||||
|  | -	.type	.L_2il0floatpacket.12,@object
 | ||||||
|  | diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
 | ||||||
|  | index 2d4b14fd..529c454a 100644
 | ||||||
|  | --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
 | ||||||
|  | +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
 | ||||||
|  | @@ -261,7 +261,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_sin
 | ||||||
|  |          andq      $-64, %rsp | ||||||
|  |          subq      $1280, %rsp | ||||||
|  |          movq      __svml_d_trig_data@GOTPCREL(%rip), %rax | ||||||
|  | -        vpbroadcastq .L_2il0floatpacket.14(%rip), %zmm14
 | ||||||
|  | +        vpternlogd $0xff, %zmm1, %zmm1, %zmm14
 | ||||||
|  |          vmovups __dAbsMask(%rax), %zmm7 | ||||||
|  |          vmovups __dInvPI(%rax), %zmm2 | ||||||
|  |          vmovups __dRShifter(%rax), %zmm1 | ||||||
|  | @@ -458,8 +458,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_sin
 | ||||||
|  |          jmp       .LBL_2_7 | ||||||
|  |  #endif | ||||||
|  |  END (_ZGVeN8v_sin_skx) | ||||||
|  | -
 | ||||||
|  | -	.section .rodata, "a"
 | ||||||
|  | -.L_2il0floatpacket.14:
 | ||||||
|  | -	.long	0xffffffff,0xffffffff
 | ||||||
|  | -	.type	.L_2il0floatpacket.14,@object
 | ||||||
|  | diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
 | ||||||
|  | index 2df626c0..e501a53a 100644
 | ||||||
|  | --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
 | ||||||
|  | +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
 | ||||||
|  | @@ -430,7 +430,7 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN4vl8l8_sincos
 | ||||||
|  |   | ||||||
|  |  /* SinPoly = SinR*SinPoly */ | ||||||
|  |          vfmadd213pd %zmm5, %zmm5, %zmm4 | ||||||
|  | -        vpbroadcastq .L_2il0floatpacket.15(%rip), %zmm3
 | ||||||
|  | +        vpternlogd $0xff, %zmm3, %zmm3, %zmm3
 | ||||||
|  |   | ||||||
|  |  /* Update Cos result's sign */ | ||||||
|  |          vxorpd    %zmm2, %zmm1, %zmm1 | ||||||
|  | @@ -741,8 +741,3 @@ END (_ZGVeN8vvv_sincos_knl)
 | ||||||
|  |  ENTRY (_ZGVeN8vvv_sincos_skx) | ||||||
|  |  WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_skx | ||||||
|  |  END (_ZGVeN8vvv_sincos_skx) | ||||||
|  | -
 | ||||||
|  | -	.section .rodata, "a"
 | ||||||
|  | -.L_2il0floatpacket.15:
 | ||||||
|  | -	.long	0xffffffff,0xffffffff
 | ||||||
|  | -	.type	.L_2il0floatpacket.15,@object
 | ||||||
|  | diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
 | ||||||
|  | index 6ea1137b..377af394 100644
 | ||||||
|  | --- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
 | ||||||
|  | +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
 | ||||||
|  | @@ -278,7 +278,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf
 | ||||||
|  |    X = X - Y*PI1 - Y*PI2 - Y*PI3 | ||||||
|  |   */ | ||||||
|  |          vmovaps   %zmm0, %zmm6 | ||||||
|  | -        vmovups   .L_2il0floatpacket.13(%rip), %zmm12
 | ||||||
|  | +        vpternlogd $0xff, %zmm12, %zmm12, %zmm12
 | ||||||
|  |          vmovups __sRShifter(%rax), %zmm3 | ||||||
|  |          vmovups __sPI1_FMA(%rax), %zmm5 | ||||||
|  |          vmovups __sA9_FMA(%rax), %zmm9 | ||||||
|  | @@ -453,8 +453,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf
 | ||||||
|  |          jmp       .LBL_2_7 | ||||||
|  |  #endif | ||||||
|  |  END (_ZGVeN16v_cosf_skx) | ||||||
|  | -
 | ||||||
|  | -	.section .rodata, "a"
 | ||||||
|  | -.L_2il0floatpacket.13:
 | ||||||
|  | -	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
 | ||||||
|  | -	.type	.L_2il0floatpacket.13,@object
 | ||||||
|  | diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
 | ||||||
|  | index 89ba0df2..46f33d46 100644
 | ||||||
|  | --- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
 | ||||||
|  | +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
 | ||||||
|  | @@ -264,7 +264,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf
 | ||||||
|  |          vmovaps   %zmm0, %zmm7 | ||||||
|  |   | ||||||
|  |  /* compare against threshold */ | ||||||
|  | -        vmovups   .L_2il0floatpacket.13(%rip), %zmm3
 | ||||||
|  | +        vpternlogd $0xff, %zmm3, %zmm3, %zmm3
 | ||||||
|  |          vmovups __sInvLn2(%rax), %zmm4 | ||||||
|  |          vmovups __sShifter(%rax), %zmm1 | ||||||
|  |          vmovups __sLn2hi(%rax), %zmm6 | ||||||
|  | @@ -440,8 +440,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf
 | ||||||
|  |   | ||||||
|  |  #endif | ||||||
|  |  END (_ZGVeN16v_expf_skx) | ||||||
|  | -
 | ||||||
|  | -	.section .rodata, "a"
 | ||||||
|  | -.L_2il0floatpacket.13:
 | ||||||
|  | -	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
 | ||||||
|  | -	.type	.L_2il0floatpacket.13,@object
 | ||||||
|  | diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
 | ||||||
|  | index 4cf0a96f..9e254956 100644
 | ||||||
|  | --- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
 | ||||||
|  | +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
 | ||||||
|  | @@ -235,7 +235,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_logf
 | ||||||
|  |          andq      $-64, %rsp | ||||||
|  |          subq      $1280, %rsp | ||||||
|  |          movq      __svml_slog_data@GOTPCREL(%rip), %rax | ||||||
|  | -        vmovups   .L_2il0floatpacket.7(%rip), %zmm6
 | ||||||
|  | +        vpternlogd $0xff, %zmm6, %zmm6, %zmm6
 | ||||||
|  |          vmovups _iBrkValue(%rax), %zmm4 | ||||||
|  |          vmovups _sPoly_7(%rax), %zmm8 | ||||||
|  |   | ||||||
|  | @@ -409,8 +409,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_logf
 | ||||||
|  |   | ||||||
|  |  #endif | ||||||
|  |  END (_ZGVeN16v_logf_skx) | ||||||
|  | -
 | ||||||
|  | -	.section .rodata, "a"
 | ||||||
|  | -.L_2il0floatpacket.7:
 | ||||||
|  | -	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
 | ||||||
|  | -	.type	.L_2il0floatpacket.7,@object
 | ||||||
|  | diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
 | ||||||
|  | index bdcd50af..e8331ba1 100644
 | ||||||
|  | --- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
 | ||||||
|  | +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
 | ||||||
|  | @@ -385,7 +385,7 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf
 | ||||||
|  |          vpsrlq    $32, %zmm3, %zmm2 | ||||||
|  |          vpmovqd   %zmm2, %ymm11 | ||||||
|  |          vcvtps2pd %ymm14, %zmm13 | ||||||
|  | -        vmovups   .L_2il0floatpacket.23(%rip), %zmm14
 | ||||||
|  | +        vpternlogd $0xff, %zmm14, %zmm14, %zmm14
 | ||||||
|  |          vmovaps   %zmm14, %zmm26 | ||||||
|  |          vpandd _ABSMASK(%rax), %zmm1, %zmm8 | ||||||
|  |          vpcmpd    $1, _INF(%rax), %zmm8, %k2 | ||||||
|  | @@ -427,7 +427,7 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf
 | ||||||
|  |          vpmovqd   %zmm11, %ymm5 | ||||||
|  |          vpxord    %zmm10, %zmm10, %zmm10 | ||||||
|  |          vgatherdpd _Log2Rcp_lookup(%rax,%ymm4), %zmm10{%k3} | ||||||
|  | -        vpbroadcastq .L_2il0floatpacket.24(%rip), %zmm4
 | ||||||
|  | +        vpternlogd $0xff, %zmm4, %zmm4, %zmm4
 | ||||||
|  |          vpxord    %zmm11, %zmm11, %zmm11 | ||||||
|  |          vcvtdq2pd %ymm7, %zmm7 | ||||||
|  |          vgatherdpd _Log2Rcp_lookup(%rax,%ymm5), %zmm11{%k1} | ||||||
|  | @@ -643,11 +643,3 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf
 | ||||||
|  |          jmp       .LBL_2_7 | ||||||
|  |  #endif | ||||||
|  |  END (_ZGVeN16vv_powf_skx) | ||||||
|  | -
 | ||||||
|  | -	.section .rodata, "a"
 | ||||||
|  | -.L_2il0floatpacket.23:
 | ||||||
|  | -	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
 | ||||||
|  | -	.type	.L_2il0floatpacket.23,@object
 | ||||||
|  | -.L_2il0floatpacket.24:
 | ||||||
|  | -	.long	0xffffffff,0xffffffff
 | ||||||
|  | -	.type	.L_2il0floatpacket.24,@object
 | ||||||
|  | diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
 | ||||||
|  | index 5fa4bc41..1f46f334 100644
 | ||||||
|  | --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
 | ||||||
|  | +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
 | ||||||
|  | @@ -317,7 +317,7 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN8vvv_sincosf
 | ||||||
|  |   | ||||||
|  |  /* Result sign calculations */ | ||||||
|  |          vpternlogd $150, %zmm0, %zmm14, %zmm1 | ||||||
|  | -        vmovups   .L_2il0floatpacket.13(%rip), %zmm14
 | ||||||
|  | +        vpternlogd $0xff, %zmm14, %zmm14, %zmm14
 | ||||||
|  |   | ||||||
|  |  /* Add correction term 0.5 for cos() part */ | ||||||
|  |          vaddps    %zmm8, %zmm5, %zmm15 | ||||||
|  | @@ -748,8 +748,3 @@ END (_ZGVeN16vvv_sincosf_knl)
 | ||||||
|  |  ENTRY (_ZGVeN16vvv_sincosf_skx) | ||||||
|  |  WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_skx | ||||||
|  |  END (_ZGVeN16vvv_sincosf_skx) | ||||||
|  | -
 | ||||||
|  | -	.section .rodata, "a"
 | ||||||
|  | -.L_2il0floatpacket.13:
 | ||||||
|  | -	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
 | ||||||
|  | -	.type	.L_2il0floatpacket.13,@object
 | ||||||
|  | diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
 | ||||||
|  | index 141f747e..1fc9308a 100644
 | ||||||
|  | --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
 | ||||||
|  | +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
 | ||||||
|  | @@ -280,7 +280,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf
 | ||||||
|  |          movq      __svml_s_trig_data@GOTPCREL(%rip), %rax | ||||||
|  |   | ||||||
|  |  /* Check for large and special values */ | ||||||
|  | -        vmovups   .L_2il0floatpacket.11(%rip), %zmm14
 | ||||||
|  | +        vpternlogd $0xff, %zmm14, %zmm14, %zmm14
 | ||||||
|  |          vmovups __sAbsMask(%rax), %zmm5 | ||||||
|  |          vmovups __sInvPI(%rax), %zmm1 | ||||||
|  |          vmovups __sRShifter(%rax), %zmm2 | ||||||
|  | @@ -472,8 +472,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf
 | ||||||
|  |          jmp       .LBL_2_7 | ||||||
|  |  #endif | ||||||
|  |  END (_ZGVeN16v_sinf_skx) | ||||||
|  | -
 | ||||||
|  | -	.section .rodata, "a"
 | ||||||
|  | -.L_2il0floatpacket.11:
 | ||||||
|  | -	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
 | ||||||
|  | -	.type	.L_2il0floatpacket.11,@object
 | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
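The substitution above works because vpternlogd's immediate byte is a three-input truth table: 0xff maps every input combination to 1, so the destination becomes all ones regardless of the sources, with no load port or .rodata traffic. The same idiom written with AVX-512 intrinsics, as a sketch assuming GCC or Clang and an AVX-512F target (compile with -mavx512f):

#include <immintrin.h>

/* Materialize all ones in a ZMM register without touching memory.
   Truth table 0xff yields 1 for every (a, b, c) bit triple.  */
static __m512i
zmm_all_ones (void)
{
  __m512i x = _mm512_undefined_epi32 ();
  return _mm512_ternarylogic_epi32 (x, x, x, 0xff);
}

This is also why the sin8 hunk can name %zmm1 as both sources for %zmm14: with table 0xff the source operands are irrelevant.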
							
								
								
									
48	SOURCES/glibc-RHEL-15696-55.patch	Normal file
							| @ -0,0 +1,48 @@ | |||||||
|  | From fc5bd179ef3a953dff8d1655bd530d0e230ffe71 Mon Sep 17 00:00:00 2001 | ||||||
|  | From: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  | Date: Tue, 21 Sep 2021 18:31:49 -0500 | ||||||
|  | Subject: [PATCH] x86: Modify ENTRY in sysdep.h so that p2align can be | ||||||
|  |  specified | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | No bug. | ||||||
|  | 
 | ||||||
|  | This change adds a new macro ENTRY_P2ALIGN which takes a second | ||||||
|  | argument, log2 of the desired function alignment. | ||||||
|  | 
 | ||||||
|  | The old ENTRY(name) macro is just ENTRY_P2ALIGN(name, 4) so this | ||||||
|  | doesn't affect any existing functionality. | ||||||
|  | 
 | ||||||
|  | Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  | ---
 | ||||||
|  |  sysdeps/x86/sysdep.h | 7 +++++-- | ||||||
|  |  1 file changed, 5 insertions(+), 2 deletions(-) | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
 | ||||||
|  | index 01bac0f6..a70bb3a2 100644
 | ||||||
|  | --- a/sysdeps/x86/sysdep.h
 | ||||||
|  | +++ b/sysdeps/x86/sysdep.h
 | ||||||
|  | @@ -78,15 +78,18 @@ enum cf_protection_level
 | ||||||
|  |  #define ASM_SIZE_DIRECTIVE(name) .size name,.-name; | ||||||
|  |   | ||||||
|  |  /* Define an entry point visible from C.  */ | ||||||
|  | -#define	ENTRY(name)							      \
 | ||||||
|  | +#define	ENTRY_P2ALIGN(name, alignment)					      \
 | ||||||
|  |    .globl C_SYMBOL_NAME(name);						      \ | ||||||
|  |    .type C_SYMBOL_NAME(name),@function;					      \ | ||||||
|  | -  .align ALIGNARG(4);							      \
 | ||||||
|  | +  .align ALIGNARG(alignment);						      \
 | ||||||
|  |    C_LABEL(name)								      \ | ||||||
|  |    cfi_startproc;							      \ | ||||||
|  |    _CET_ENDBR;								      \ | ||||||
|  |    CALL_MCOUNT | ||||||
|  |   | ||||||
|  | +/* Common entry point, 16-byte aligned.  */
 | ||||||
|  | +#define ENTRY(name) ENTRY_P2ALIGN (name, 4)
 | ||||||
|  | +
 | ||||||
|  |  #undef	END | ||||||
|  |  #define END(name)							      \ | ||||||
|  |    cfi_endproc;								      \ | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
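As with the gas .p2align directive, the macro's second argument is log2 of the byte alignment: ENTRY_P2ALIGN (name, 4) keeps the historical 16-byte entry alignment, while 6 aligns an entry to a full 64-byte cache line. A trivial sketch of the arithmetic:

#include <stdio.h>

int
main (void)
{
  /* p2align value -> byte alignment; 4 is the old ENTRY default,
     6 is one cache line on current x86_64 parts.  */
  for (int p2 = 4; p2 <= 6; p2++)
    printf ("p2align %d -> %2d-byte alignment\n", p2, 1 << p2);
  return 0;
}

The memcmp-evex patch that follows is the first user, cache-line aligning its entry with ENTRY_P2ALIGN (MEMCMP, 6).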
							
								
								
									
658	SOURCES/glibc-RHEL-15696-56.patch	Normal file
							| @ -0,0 +1,658 @@ | |||||||
|  | From 1bd8b8d58fc9967cc073d2c13bfb6befefca2faa Mon Sep 17 00:00:00 2001 | ||||||
|  | From: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  | Date: Tue, 21 Sep 2021 18:45:03 -0500 | ||||||
|  | Subject: [PATCH] x86: Optimize memcmp-evex-movbe.S for frontend behavior and | ||||||
|  |  size | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | No bug. | ||||||
|  | 
 | ||||||
|  | The frontend optimizations are to: | ||||||
|  | 1. Reorganize logically connected basic blocks so they are either in | ||||||
|  |    the same cache line or adjacent cache lines. | ||||||
|  | 2. Avoid cases when basic blocks unnecessarily cross cache lines. | ||||||
|  | 3. Try to 32-byte align any basic blocks possible without sacrificing | ||||||
|  |    code size. Smaller / less hot basic blocks are used for this. | ||||||
|  | 
 | ||||||
|  | Overall code size shrank by 168 bytes. This should make up for any | ||||||
|  | extra cost of aligning to 64 bytes. | ||||||
|  | 
 | ||||||
|  | In general, performance previously deviated a great deal depending on | ||||||
|  | whether entry alignment % 64 was 0, 16, 32, or 48. These changes | ||||||
|  | essentially make the current implementation at least equal to the | ||||||
|  | best alignment of the original for any arguments. | ||||||
|  | 
 | ||||||
|  | The only additional optimization is in the page cross case. The branch | ||||||
|  | on the equals case was removed from the size == [4, 7] case. As well, | ||||||
|  | the [4, 7] and [2, 3] cases were swapped, as [4, 7] is likely a hotter | ||||||
|  | argument size. | ||||||
|  | 
 | ||||||
|  | test-memcmp and test-wmemcmp are both passing. | ||||||
|  | ---
 | ||||||
|  |  sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 434 +++++++++++-------- | ||||||
|  |  1 file changed, 242 insertions(+), 192 deletions(-) | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
 | ||||||
|  | index 654dc7ac..2761b54f 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
 | ||||||
|  | @@ -34,7 +34,24 @@
 | ||||||
|  |        area. | ||||||
|  |     7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less. | ||||||
|  |     8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less. | ||||||
|  | -   9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less.  */
 | ||||||
|  | +   9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less.
 | ||||||
|  | +
 | ||||||
|  | +When possible the implementation tries to optimize for frontend in the
 | ||||||
|  | +following ways:
 | ||||||
|  | +Throughput:
 | ||||||
|  | +    1. All code sections that fit are able to run optimally out of the
 | ||||||
|  | +       LSD.
 | ||||||
|  | +    2. All code sections that fit are able to run optimally out of the
 | ||||||
|  | +       DSB.
 | ||||||
|  | +    3. Basic blocks are contained in minimum number of fetch blocks
 | ||||||
|  | +       necessary.
 | ||||||
|  | +
 | ||||||
|  | +Latency:
 | ||||||
|  | +    1. Logically connected basic blocks are put in the same
 | ||||||
|  | +       cache-line.
 | ||||||
|  | +    2. Logically connected basic blocks that do not fit in the same
 | ||||||
|  | +       cache-line are put in adjacent lines. This can get beneficial
 | ||||||
|  | +       L2 spatial prefetching and L1 next-line prefetching.  */
 | ||||||
|  |   | ||||||
|  |  # include <sysdep.h> | ||||||
|  |   | ||||||
|  | @@ -47,9 +64,11 @@
 | ||||||
|  |  # ifdef USE_AS_WMEMCMP | ||||||
|  |  #  define CHAR_SIZE	4 | ||||||
|  |  #  define VPCMP	vpcmpd | ||||||
|  | +#  define VPTEST	vptestmd
 | ||||||
|  |  # else | ||||||
|  |  #  define CHAR_SIZE	1 | ||||||
|  |  #  define VPCMP	vpcmpub | ||||||
|  | +#  define VPTEST	vptestmb
 | ||||||
|  |  # endif | ||||||
|  |   | ||||||
|  |  # define VEC_SIZE	32 | ||||||
|  | @@ -75,7 +94,9 @@
 | ||||||
|  |  */ | ||||||
|  |   | ||||||
|  |  	.section .text.evex,"ax",@progbits | ||||||
|  | -ENTRY (MEMCMP)
 | ||||||
|  | +/* Cache align memcmp entry. This allows for much more thorough
 | ||||||
|  | +   frontend optimization.  */
 | ||||||
|  | +ENTRY_P2ALIGN (MEMCMP, 6)
 | ||||||
|  |  # ifdef __ILP32__ | ||||||
|  |  	/* Clear the upper 32 bits.  */ | ||||||
|  |  	movl	%edx, %edx | ||||||
|  | @@ -89,7 +110,7 @@ ENTRY (MEMCMP)
 | ||||||
|  |  	VPCMP	$4, (%rdi), %YMM1, %k1 | ||||||
|  |  	kmovd	%k1, %eax | ||||||
|  |  	/* NB: eax must be destination register if going to | ||||||
|  | -	   L(return_vec_[0,2]). For L(return_vec_3 destination register
 | ||||||
|  | +	   L(return_vec_[0,2]). For L(return_vec_3) destination register
 | ||||||
|  |  	   must be ecx.  */ | ||||||
|  |  	testl	%eax, %eax | ||||||
|  |  	jnz	L(return_vec_0) | ||||||
|  | @@ -121,10 +142,6 @@ ENTRY (MEMCMP)
 | ||||||
|  |  	testl	%ecx, %ecx | ||||||
|  |  	jnz	L(return_vec_3) | ||||||
|  |   | ||||||
|  | -	/* Zero YMM0. 4x VEC reduction is done with vpxor + vtern so
 | ||||||
|  | -	   compare with zero to get a mask is needed.  */
 | ||||||
|  | -	vpxorq	%XMM0, %XMM0, %XMM0
 | ||||||
|  | -
 | ||||||
|  |  	/* Go to 4x VEC loop.  */ | ||||||
|  |  	cmpq	$(CHAR_PER_VEC * 8), %rdx | ||||||
|  |  	ja	L(more_8x_vec) | ||||||
|  | @@ -148,47 +165,61 @@ ENTRY (MEMCMP)
 | ||||||
|  |   | ||||||
|  |  	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3 | ||||||
|  |  	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3 | ||||||
|  | -	/* Or together YMM1, YMM2, and YMM3 into YMM3.  */
 | ||||||
|  | -	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
 | ||||||
|  |   | ||||||
|  |  	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4 | ||||||
|  |  	/* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while | ||||||
|  | -	   oring with YMM3. Result is stored in YMM4.  */
 | ||||||
|  | -	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
 | ||||||
|  | -	/* Compare YMM4 with 0. If any 1s s1 and s2 don't match.  */
 | ||||||
|  | -	VPCMP	$4, %YMM4, %YMM0, %k1
 | ||||||
|  | +	   oring with YMM1. Result is stored in YMM4.  */
 | ||||||
|  | +	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
 | ||||||
|  | +
 | ||||||
|  | +	/* Or together YMM2, YMM3, and YMM4 into YMM4.  */
 | ||||||
|  | +	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
 | ||||||
|  | +
 | ||||||
|  | +	/* Test YMM4 against itself. Store any CHAR mismatches in k1.
 | ||||||
|  | +	 */
 | ||||||
|  | +	VPTEST	%YMM4, %YMM4, %k1
 | ||||||
|  | +	/* k1 must go to ecx for L(return_vec_0_1_2_3).  */
 | ||||||
|  |  	kmovd	%k1, %ecx | ||||||
|  |  	testl	%ecx, %ecx | ||||||
|  |  	jnz	L(return_vec_0_1_2_3) | ||||||
|  |  	/* NB: eax must be zero to reach here.  */ | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  | -	/* NB: aligning 32 here allows for the rest of the jump targets
 | ||||||
|  | -	   to be tuned for 32 byte alignment. Most important this ensures
 | ||||||
|  | -	   the L(more_8x_vec) loop is 32 byte aligned.  */
 | ||||||
|  | -	.p2align 5
 | ||||||
|  | -L(less_vec):
 | ||||||
|  | -	/* Check if one or less CHAR. This is necessary for size = 0 but
 | ||||||
|  | -	   is also faster for size = CHAR_SIZE.  */
 | ||||||
|  | -	cmpl	$1, %edx
 | ||||||
|  | -	jbe	L(one_or_less)
 | ||||||
|  | +	.p2align 4
 | ||||||
|  | +L(8x_end_return_vec_0_1_2_3):
 | ||||||
|  | +	movq	%rdx, %rdi
 | ||||||
|  | +L(8x_return_vec_0_1_2_3):
 | ||||||
|  | +	addq	%rdi, %rsi
 | ||||||
|  | +L(return_vec_0_1_2_3):
 | ||||||
|  | +	VPTEST	%YMM1, %YMM1, %k0
 | ||||||
|  | +	kmovd	%k0, %eax
 | ||||||
|  | +	testl	%eax, %eax
 | ||||||
|  | +	jnz	L(return_vec_0)
 | ||||||
|  |   | ||||||
|  | -	/* Check if loading one VEC from either s1 or s2 could cause a
 | ||||||
|  | -	   page cross. This can have false positives but is by far the
 | ||||||
|  | -	   fastest method.  */
 | ||||||
|  | -	movl	%edi, %eax
 | ||||||
|  | -	orl	%esi, %eax
 | ||||||
|  | -	andl	$(PAGE_SIZE - 1), %eax
 | ||||||
|  | -	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 | ||||||
|  | -	jg	L(page_cross_less_vec)
 | ||||||
|  | +	VPTEST	%YMM2, %YMM2, %k0
 | ||||||
|  | +	kmovd	%k0, %eax
 | ||||||
|  | +	testl	%eax, %eax
 | ||||||
|  | +	jnz	L(return_vec_1)
 | ||||||
|  |   | ||||||
|  | -	/* No page cross possible.  */
 | ||||||
|  | -	VMOVU	(%rsi), %YMM2
 | ||||||
|  | -	VPCMP	$4, (%rdi), %YMM2, %k1
 | ||||||
|  | -	kmovd	%k1, %eax
 | ||||||
|  | -	/* Create mask in ecx for potentially in bound matches.  */
 | ||||||
|  | -	bzhil	%edx, %eax, %eax
 | ||||||
|  | -	jnz	L(return_vec_0)
 | ||||||
|  | +	VPTEST	%YMM3, %YMM3, %k0
 | ||||||
|  | +	kmovd	%k0, %eax
 | ||||||
|  | +	testl	%eax, %eax
 | ||||||
|  | +	jnz	L(return_vec_2)
 | ||||||
|  | +L(return_vec_3):
 | ||||||
|  | +	/* bsf saves 1 byte from tzcnt. This keep L(return_vec_3) in one
 | ||||||
|  | +	   fetch block and the entire L(*return_vec_0_1_2_3) in 1 cache
 | ||||||
|  | +	   line.  */
 | ||||||
|  | +	bsfl	%ecx, %ecx
 | ||||||
|  | +# ifdef USE_AS_WMEMCMP
 | ||||||
|  | +	movl	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
 | ||||||
|  | +	xorl	%edx, %edx
 | ||||||
|  | +	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
 | ||||||
|  | +	setg	%dl
 | ||||||
|  | +	leal	-1(%rdx, %rdx), %eax
 | ||||||
|  | +# else
 | ||||||
|  | +	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
 | ||||||
|  | +	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
 | ||||||
|  | +	subl	%ecx, %eax
 | ||||||
|  | +# endif
 | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  |  	.p2align 4 | ||||||
|  | @@ -209,10 +240,11 @@ L(return_vec_0):
 | ||||||
|  |  # endif | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  | -	/* NB: No p2align necessary. Alignment  % 16 is naturally 1
 | ||||||
|  | -	   which is good enough for a target not in a loop.  */
 | ||||||
|  | +	.p2align 4
 | ||||||
|  |  L(return_vec_1): | ||||||
|  | -	tzcntl	%eax, %eax
 | ||||||
|  | +	/* bsf saves 1 byte over tzcnt and keeps L(return_vec_1) in one
 | ||||||
|  | +	   fetch block.  */
 | ||||||
|  | +	bsfl	%eax, %eax
 | ||||||
|  |  # ifdef USE_AS_WMEMCMP | ||||||
|  |  	movl	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx | ||||||
|  |  	xorl	%edx, %edx | ||||||
|  | @@ -226,10 +258,11 @@ L(return_vec_1):
 | ||||||
|  |  # endif | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  | -	/* NB: No p2align necessary. Alignment  % 16 is naturally 2
 | ||||||
|  | -	   which is good enough for a target not in a loop.  */
 | ||||||
|  | +	.p2align 4,, 10
 | ||||||
|  |  L(return_vec_2): | ||||||
|  | -	tzcntl	%eax, %eax
 | ||||||
|  | +	/* bsf saves 1 byte over tzcnt and keeps L(return_vec_2) in one
 | ||||||
|  | +	   fetch block.  */
 | ||||||
|  | +	bsfl	%eax, %eax
 | ||||||
|  |  # ifdef USE_AS_WMEMCMP | ||||||
|  |  	movl	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx | ||||||
|  |  	xorl	%edx, %edx | ||||||
|  | @@ -243,40 +276,6 @@ L(return_vec_2):
 | ||||||
|  |  # endif | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(8x_return_vec_0_1_2_3):
 | ||||||
|  | -	/* Returning from L(more_8x_vec) requires restoring rsi.  */
 | ||||||
|  | -	addq	%rdi, %rsi
 | ||||||
|  | -L(return_vec_0_1_2_3):
 | ||||||
|  | -	VPCMP	$4, %YMM1, %YMM0, %k0
 | ||||||
|  | -	kmovd	%k0, %eax
 | ||||||
|  | -	testl	%eax, %eax
 | ||||||
|  | -	jnz	L(return_vec_0)
 | ||||||
|  | -
 | ||||||
|  | -	VPCMP	$4, %YMM2, %YMM0, %k0
 | ||||||
|  | -	kmovd	%k0, %eax
 | ||||||
|  | -	testl	%eax, %eax
 | ||||||
|  | -	jnz	L(return_vec_1)
 | ||||||
|  | -
 | ||||||
|  | -	VPCMP	$4, %YMM3, %YMM0, %k0
 | ||||||
|  | -	kmovd	%k0, %eax
 | ||||||
|  | -	testl	%eax, %eax
 | ||||||
|  | -	jnz	L(return_vec_2)
 | ||||||
|  | -L(return_vec_3):
 | ||||||
|  | -	tzcntl	%ecx, %ecx
 | ||||||
|  | -# ifdef USE_AS_WMEMCMP
 | ||||||
|  | -	movl	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
 | ||||||
|  | -	xorl	%edx, %edx
 | ||||||
|  | -	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
 | ||||||
|  | -	setg	%dl
 | ||||||
|  | -	leal	-1(%rdx, %rdx), %eax
 | ||||||
|  | -# else
 | ||||||
|  | -	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
 | ||||||
|  | -	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
 | ||||||
|  | -	subl	%ecx, %eax
 | ||||||
|  | -# endif
 | ||||||
|  | -	ret
 | ||||||
|  | -
 | ||||||
|  |  	.p2align 4 | ||||||
|  |  L(more_8x_vec): | ||||||
|  |  	/* Set end of s1 in rdx.  */ | ||||||
|  | @@ -288,21 +287,19 @@ L(more_8x_vec):
 | ||||||
|  |  	andq	$-VEC_SIZE, %rdi | ||||||
|  |  	/* Adjust because first 4x vec where check already.  */ | ||||||
|  |  	subq	$-(VEC_SIZE * 4), %rdi | ||||||
|  | +
 | ||||||
|  |  	.p2align 4 | ||||||
|  |  L(loop_4x_vec): | ||||||
|  |  	VMOVU	(%rsi, %rdi), %YMM1 | ||||||
|  |  	vpxorq	(%rdi), %YMM1, %YMM1 | ||||||
|  | -
 | ||||||
|  |  	VMOVU	VEC_SIZE(%rsi, %rdi), %YMM2 | ||||||
|  |  	vpxorq	VEC_SIZE(%rdi), %YMM2, %YMM2 | ||||||
|  | -
 | ||||||
|  |  	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %YMM3 | ||||||
|  |  	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3 | ||||||
|  | -	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
 | ||||||
|  | -
 | ||||||
|  |  	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %YMM4 | ||||||
|  | -	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
 | ||||||
|  | -	VPCMP	$4, %YMM4, %YMM0, %k1
 | ||||||
|  | +	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
 | ||||||
|  | +	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
 | ||||||
|  | +	VPTEST	%YMM4, %YMM4, %k1
 | ||||||
|  |  	kmovd	%k1, %ecx | ||||||
|  |  	testl	%ecx, %ecx | ||||||
|  |  	jnz	L(8x_return_vec_0_1_2_3) | ||||||
|  | @@ -319,28 +316,25 @@ L(loop_4x_vec):
 | ||||||
|  |  	cmpl	$(VEC_SIZE * 2), %edi | ||||||
|  |  	jae	L(8x_last_2x_vec) | ||||||
|  |   | ||||||
|  | +	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
 | ||||||
|  | +
 | ||||||
|  |  	VMOVU	(%rsi, %rdx), %YMM1 | ||||||
|  |  	vpxorq	(%rdx), %YMM1, %YMM1 | ||||||
|  |   | ||||||
|  |  	VMOVU	VEC_SIZE(%rsi, %rdx), %YMM2 | ||||||
|  |  	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM2 | ||||||
|  | -
 | ||||||
|  | -	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
 | ||||||
|  | -	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
 | ||||||
|  | -
 | ||||||
|  |  	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM4 | ||||||
|  | -	vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM3, %YMM4
 | ||||||
|  | -	VPCMP	$4, %YMM4, %YMM0, %k1
 | ||||||
|  | +	vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
 | ||||||
|  | +	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
 | ||||||
|  | +	VPTEST	%YMM4, %YMM4, %k1
 | ||||||
|  |  	kmovd	%k1, %ecx | ||||||
|  | -	/* Restore s1 pointer to rdi.  */
 | ||||||
|  | -	movq	%rdx, %rdi
 | ||||||
|  |  	testl	%ecx, %ecx | ||||||
|  | -	jnz	L(8x_return_vec_0_1_2_3)
 | ||||||
|  | +	jnz	L(8x_end_return_vec_0_1_2_3)
 | ||||||
|  |  	/* NB: eax must be zero to reach here.  */ | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  |  	/* Only entry is from L(more_8x_vec).  */ | ||||||
|  | -	.p2align 4
 | ||||||
|  | +	.p2align 4,, 10
 | ||||||
|  |  L(8x_last_2x_vec): | ||||||
|  |  	VPCMP	$4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1 | ||||||
|  |  	kmovd	%k1, %eax | ||||||
|  | @@ -355,7 +349,31 @@ L(8x_last_1x_vec):
 | ||||||
|  |  	jnz	L(8x_return_vec_3) | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  | -	.p2align 4
 | ||||||
|  | +	/* Not ideally aligned (at offset +9 bytes in fetch block) but
 | ||||||
|  | +	   not aligning keeps it in the same cache line as
 | ||||||
|  | +	   L(8x_last_1x/2x_vec) so likely worth it. As well, saves code
 | ||||||
|  | +	   size.  */
 | ||||||
|  | +	.p2align 4,, 4
 | ||||||
|  | +L(8x_return_vec_2):
 | ||||||
|  | +	subq	$VEC_SIZE, %rdx
 | ||||||
|  | +L(8x_return_vec_3):
 | ||||||
|  | +	bsfl	%eax, %eax
 | ||||||
|  | +# ifdef USE_AS_WMEMCMP
 | ||||||
|  | +	leaq	(%rdx, %rax, CHAR_SIZE), %rax
 | ||||||
|  | +	movl	(VEC_SIZE * 3)(%rax), %ecx
 | ||||||
|  | +	xorl	%edx, %edx
 | ||||||
|  | +	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
 | ||||||
|  | +	setg	%dl
 | ||||||
|  | +	leal	-1(%rdx, %rdx), %eax
 | ||||||
|  | +# else
 | ||||||
|  | +	addq	%rdx, %rax
 | ||||||
|  | +	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
 | ||||||
|  | +	movzbl	(VEC_SIZE * 3)(%rax), %eax
 | ||||||
|  | +	subl	%ecx, %eax
 | ||||||
|  | +# endif
 | ||||||
|  | +	ret
 | ||||||
|  | +
 | ||||||
|  | +	.p2align 4,, 10
 | ||||||
|  |  L(last_2x_vec): | ||||||
|  |  	/* Check second to last VEC.  */ | ||||||
|  |  	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1 | ||||||
|  | @@ -374,26 +392,49 @@ L(last_1x_vec):
 | ||||||
|  |  	jnz	L(return_vec_0_end) | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(8x_return_vec_2):
 | ||||||
|  | -	subq	$VEC_SIZE, %rdx
 | ||||||
|  | -L(8x_return_vec_3):
 | ||||||
|  | -	tzcntl	%eax, %eax
 | ||||||
|  | +	.p2align 4,, 10
 | ||||||
|  | +L(return_vec_1_end):
 | ||||||
|  | +	/* Use bsf to save code size. This is necessary to have
 | ||||||
|  | +	   L(one_or_less) fit in aligning bytes between.  */
 | ||||||
|  | +	bsfl	%eax, %eax
 | ||||||
|  | +	addl	%edx, %eax
 | ||||||
|  |  # ifdef USE_AS_WMEMCMP | ||||||
|  | -	leaq	(%rdx, %rax, CHAR_SIZE), %rax
 | ||||||
|  | -	movl	(VEC_SIZE * 3)(%rax), %ecx
 | ||||||
|  | +	movl	-(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
 | ||||||
|  |  	xorl	%edx, %edx | ||||||
|  | -	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
 | ||||||
|  | +	cmpl	-(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
 | ||||||
|  |  	setg	%dl | ||||||
|  |  	leal	-1(%rdx, %rdx), %eax | ||||||
|  |  # else | ||||||
|  | -	addq	%rdx, %rax
 | ||||||
|  | -	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
 | ||||||
|  | -	movzbl	(VEC_SIZE * 3)(%rax), %eax
 | ||||||
|  | +	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
 | ||||||
|  | +	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
 | ||||||
|  |  	subl	%ecx, %eax | ||||||
|  |  # endif | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  | +	/* NB: L(one_or_less) fits in alignment padding between
 | ||||||
|  | +	   L(return_vec_1_end) and L(return_vec_0_end).  */
 | ||||||
|  | +# ifdef USE_AS_WMEMCMP
 | ||||||
|  | +L(one_or_less):
 | ||||||
|  | +	jb	L(zero)
 | ||||||
|  | +	movl	(%rdi), %ecx
 | ||||||
|  | +	xorl	%edx, %edx
 | ||||||
|  | +	cmpl	(%rsi), %ecx
 | ||||||
|  | +	je	L(zero)
 | ||||||
|  | +	setg	%dl
 | ||||||
|  | +	leal	-1(%rdx, %rdx), %eax
 | ||||||
|  | +	ret
 | ||||||
|  | +# else
 | ||||||
|  | +L(one_or_less):
 | ||||||
|  | +	jb	L(zero)
 | ||||||
|  | +	movzbl	(%rsi), %ecx
 | ||||||
|  | +	movzbl	(%rdi), %eax
 | ||||||
|  | +	subl	%ecx, %eax
 | ||||||
|  | +	ret
 | ||||||
|  | +# endif
 | ||||||
|  | +L(zero):
 | ||||||
|  | +	xorl	%eax, %eax
 | ||||||
|  | +	ret
 | ||||||
|  | +
 | ||||||
|  |  	.p2align 4 | ||||||
|  |  L(return_vec_0_end): | ||||||
|  |  	tzcntl	%eax, %eax | ||||||
|  | @@ -412,23 +453,56 @@ L(return_vec_0_end):
 | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  |  	.p2align 4 | ||||||
|  | -L(return_vec_1_end):
 | ||||||
|  | +L(less_vec):
 | ||||||
|  | +	/* Check if one or less CHAR. This is necessary for size == 0
 | ||||||
|  | +	   but is also faster for size == CHAR_SIZE.  */
 | ||||||
|  | +	cmpl	$1, %edx
 | ||||||
|  | +	jbe	L(one_or_less)
 | ||||||
|  | +
 | ||||||
|  | +	/* Check if loading one VEC from either s1 or s2 could cause a
 | ||||||
|  | +	   page cross. This can have false positives but is by far the
 | ||||||
|  | +	   fastest method.  */
 | ||||||
|  | +	movl	%edi, %eax
 | ||||||
|  | +	orl	%esi, %eax
 | ||||||
|  | +	andl	$(PAGE_SIZE - 1), %eax
 | ||||||
|  | +	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 | ||||||
|  | +	jg	L(page_cross_less_vec)
 | ||||||
|  | +
 | ||||||
|  | +	/* No page cross possible.  */
 | ||||||
|  | +	VMOVU	(%rsi), %YMM2
 | ||||||
|  | +	VPCMP	$4, (%rdi), %YMM2, %k1
 | ||||||
|  | +	kmovd	%k1, %eax
 | ||||||
|  | +	/* Check if any matches were in bounds. Intentionally not
 | ||||||
|  | +	   storing result in eax to limit dependency chain if it goes to
 | ||||||
|  | +	   L(return_vec_0_lv).  */
 | ||||||
|  | +	bzhil	%edx, %eax, %edx
 | ||||||
|  | +	jnz	L(return_vec_0_lv)
 | ||||||
|  | +	xorl	%eax, %eax
 | ||||||
|  | +	ret
 | ||||||
|  | +
 | ||||||
|  | +	/* Essentially duplicate of L(return_vec_0). Ends up not costing
 | ||||||
|  | +	   any code as it shrinks L(less_vec) by allowing 2-byte encoding
 | ||||||
|  | +	   of the jump and ends up fitting in aligning bytes. It also fits
 | ||||||
|  | +	   on the same cache line as L(less_vec), saving a line from having
 | ||||||
|  | +	   to be fetched on cold calls to memcmp.  */
 | ||||||
|  | +	.p2align 4,, 4
 | ||||||
|  | +L(return_vec_0_lv):
 | ||||||
|  |  	tzcntl	%eax, %eax | ||||||
|  | -	addl	%edx, %eax
 | ||||||
|  |  # ifdef USE_AS_WMEMCMP | ||||||
|  | -	movl	-(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
 | ||||||
|  | +	movl	(%rdi, %rax, CHAR_SIZE), %ecx
 | ||||||
|  |  	xorl	%edx, %edx | ||||||
|  | -	cmpl	-(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
 | ||||||
|  | +	cmpl	(%rsi, %rax, CHAR_SIZE), %ecx
 | ||||||
|  | +	/* NB: no partial register stall here because xorl zero idiom
 | ||||||
|  | +	   above.  */
 | ||||||
|  |  	setg	%dl | ||||||
|  |  	leal	-1(%rdx, %rdx), %eax | ||||||
|  |  # else | ||||||
|  | -	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
 | ||||||
|  | -	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
 | ||||||
|  | +	movzbl	(%rsi, %rax), %ecx
 | ||||||
|  | +	movzbl	(%rdi, %rax), %eax
 | ||||||
|  |  	subl	%ecx, %eax | ||||||
|  |  # endif | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  | -
 | ||||||
|  |  	.p2align 4 | ||||||
|  |  L(page_cross_less_vec): | ||||||
|  |  	/* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28 | ||||||
|  | @@ -439,108 +513,84 @@ L(page_cross_less_vec):
 | ||||||
|  |  	cmpl	$8, %edx | ||||||
|  |  	jae	L(between_8_15) | ||||||
|  |  	cmpl	$4, %edx | ||||||
|  | -	jae	L(between_4_7)
 | ||||||
|  | -L(between_2_3):
 | ||||||
|  | -	/* Load as big endian to avoid branches.  */
 | ||||||
|  | -	movzwl	(%rdi), %eax
 | ||||||
|  | -	movzwl	(%rsi), %ecx
 | ||||||
|  | -	shll	$8, %eax
 | ||||||
|  | -	shll	$8, %ecx
 | ||||||
|  | -	bswap	%eax
 | ||||||
|  | -	bswap	%ecx
 | ||||||
|  | -	movzbl	-1(%rdi, %rdx), %edi
 | ||||||
|  | -	movzbl	-1(%rsi, %rdx), %esi
 | ||||||
|  | -	orl	%edi, %eax
 | ||||||
|  | -	orl	%esi, %ecx
 | ||||||
|  | -	/* Subtraction is okay because the upper 8 bits are zero.  */
 | ||||||
|  | -	subl	%ecx, %eax
 | ||||||
|  | -	ret
 | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(one_or_less):
 | ||||||
|  | -	jb	L(zero)
 | ||||||
|  | -	movzbl	(%rsi), %ecx
 | ||||||
|  | -	movzbl	(%rdi), %eax
 | ||||||
|  | -	subl	%ecx, %eax
 | ||||||
|  | +	jb	L(between_2_3)
 | ||||||
|  | +
 | ||||||
|  | +	/* Load as big endian with overlapping movbe to avoid branches.
 | ||||||
|  | +	 */
 | ||||||
|  | +	movbe	(%rdi), %eax
 | ||||||
|  | +	movbe	(%rsi), %ecx
 | ||||||
|  | +	shlq	$32, %rax
 | ||||||
|  | +	shlq	$32, %rcx
 | ||||||
|  | +	movbe	-4(%rdi, %rdx), %edi
 | ||||||
|  | +	movbe	-4(%rsi, %rdx), %esi
 | ||||||
|  | +	orq	%rdi, %rax
 | ||||||
|  | +	orq	%rsi, %rcx
 | ||||||
|  | +	subq	%rcx, %rax
 | ||||||
|  | +	/* edx is guaranteed to be positive int32 in range [4, 7].  */
 | ||||||
|  | +	cmovne	%edx, %eax
 | ||||||
|  | +	/* ecx is -1 if rcx > rax. Otherwise 0.  */
 | ||||||
|  | +	sbbl	%ecx, %ecx
 | ||||||
|  | +	/* If rax > rcx, then ecx is 0 and eax is positive. If rax ==
 | ||||||
|  | +	   rcx then eax and ecx are zero. If rax < rcx then ecx is -1 so
 | ||||||
|  | +	   eax doesn't matter.  */
 | ||||||
|  | +	orl	%ecx, %eax
 | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  | -	.p2align 4
 | ||||||
|  | +	.p2align 4,, 8
 | ||||||
|  |  L(between_8_15): | ||||||
|  |  # endif | ||||||
|  |  	/* If USE_AS_WMEMCMP fall through into 8-15 byte case.  */ | ||||||
|  | -	vmovq	(%rdi), %XMM1
 | ||||||
|  | -	vmovq	(%rsi), %XMM2
 | ||||||
|  | -	VPCMP	$4, %XMM1, %XMM2, %k1
 | ||||||
|  | +	vmovq	(%rdi), %xmm1
 | ||||||
|  | +	vmovq	(%rsi), %xmm2
 | ||||||
|  | +	VPCMP	$4, %xmm1, %xmm2, %k1
 | ||||||
|  |  	kmovd	%k1, %eax | ||||||
|  |  	testl	%eax, %eax | ||||||
|  | -	jnz	L(return_vec_0)
 | ||||||
|  | +	jnz	L(return_vec_0_lv)
 | ||||||
|  |  	/* Use overlapping loads to avoid branches.  */ | ||||||
|  | -	leaq	-8(%rdi, %rdx, CHAR_SIZE), %rdi
 | ||||||
|  | -	leaq	-8(%rsi, %rdx, CHAR_SIZE), %rsi
 | ||||||
|  | -	vmovq	(%rdi), %XMM1
 | ||||||
|  | -	vmovq	(%rsi), %XMM2
 | ||||||
|  | -	VPCMP	$4, %XMM1, %XMM2, %k1
 | ||||||
|  | +	vmovq	-8(%rdi, %rdx, CHAR_SIZE), %xmm1
 | ||||||
|  | +	vmovq	-8(%rsi, %rdx, CHAR_SIZE), %xmm2
 | ||||||
|  | +	VPCMP	$4, %xmm1, %xmm2, %k1
 | ||||||
|  | +	addl	$(CHAR_PER_VEC - (8 / CHAR_SIZE)), %edx
 | ||||||
|  |  	kmovd	%k1, %eax | ||||||
|  |  	testl	%eax, %eax | ||||||
|  | -	jnz	L(return_vec_0)
 | ||||||
|  | -	ret
 | ||||||
|  | -
 | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(zero):
 | ||||||
|  | -	xorl	%eax, %eax
 | ||||||
|  | +	jnz	L(return_vec_0_end)
 | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  | -	.p2align 4
 | ||||||
|  | +	.p2align 4,, 8
 | ||||||
|  |  L(between_16_31): | ||||||
|  |  	/* From 16 to 31 bytes.  No branch when size == 16.  */ | ||||||
|  | -	VMOVU	(%rsi), %XMM2
 | ||||||
|  | -	VPCMP	$4, (%rdi), %XMM2, %k1
 | ||||||
|  | +
 | ||||||
|  | +	/* Use movups to save code size.  */
 | ||||||
|  | +	movups	(%rsi), %xmm2
 | ||||||
|  | +	VPCMP	$4, (%rdi), %xmm2, %k1
 | ||||||
|  |  	kmovd	%k1, %eax | ||||||
|  |  	testl	%eax, %eax | ||||||
|  | -	jnz	L(return_vec_0)
 | ||||||
|  | -
 | ||||||
|  | +	jnz	L(return_vec_0_lv)
 | ||||||
|  |  	/* Use overlapping loads to avoid branches.  */ | ||||||
|  | -
 | ||||||
|  | -	VMOVU	-16(%rsi, %rdx, CHAR_SIZE), %XMM2
 | ||||||
|  | -	leaq	-16(%rdi, %rdx, CHAR_SIZE), %rdi
 | ||||||
|  | -	leaq	-16(%rsi, %rdx, CHAR_SIZE), %rsi
 | ||||||
|  | -	VPCMP	$4, (%rdi), %XMM2, %k1
 | ||||||
|  | +	movups	-16(%rsi, %rdx, CHAR_SIZE), %xmm2
 | ||||||
|  | +	VPCMP	$4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1
 | ||||||
|  | +	addl	$(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx
 | ||||||
|  |  	kmovd	%k1, %eax | ||||||
|  |  	testl	%eax, %eax | ||||||
|  | -	jnz	L(return_vec_0)
 | ||||||
|  | -	ret
 | ||||||
|  | -
 | ||||||
|  | -# ifdef USE_AS_WMEMCMP
 | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(one_or_less):
 | ||||||
|  | -	jb	L(zero)
 | ||||||
|  | -	movl	(%rdi), %ecx
 | ||||||
|  | -	xorl	%edx, %edx
 | ||||||
|  | -	cmpl	(%rsi), %ecx
 | ||||||
|  | -	je	L(zero)
 | ||||||
|  | -	setg	%dl
 | ||||||
|  | -	leal	-1(%rdx, %rdx), %eax
 | ||||||
|  | +	jnz	L(return_vec_0_end)
 | ||||||
|  |  	ret | ||||||
|  | -# else
 | ||||||
|  |   | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(between_4_7):
 | ||||||
|  | -	/* Load as big endian with overlapping movbe to avoid branches.
 | ||||||
|  | -	 */
 | ||||||
|  | -	movbe	(%rdi), %eax
 | ||||||
|  | -	movbe	(%rsi), %ecx
 | ||||||
|  | -	shlq	$32, %rax
 | ||||||
|  | -	shlq	$32, %rcx
 | ||||||
|  | -	movbe	-4(%rdi, %rdx), %edi
 | ||||||
|  | -	movbe	-4(%rsi, %rdx), %esi
 | ||||||
|  | -	orq	%rdi, %rax
 | ||||||
|  | -	orq	%rsi, %rcx
 | ||||||
|  | -	subq	%rcx, %rax
 | ||||||
|  | -	jz	L(zero_4_7)
 | ||||||
|  | -	sbbl	%eax, %eax
 | ||||||
|  | -	orl	$1, %eax
 | ||||||
|  | -L(zero_4_7):
 | ||||||
|  | +# ifndef USE_AS_WMEMCMP
 | ||||||
|  | +L(between_2_3):
 | ||||||
|  | +	/* Load as big endian to avoid branches.  */
 | ||||||
|  | +	movzwl	(%rdi), %eax
 | ||||||
|  | +	movzwl	(%rsi), %ecx
 | ||||||
|  | +	shll	$8, %eax
 | ||||||
|  | +	shll	$8, %ecx
 | ||||||
|  | +	bswap	%eax
 | ||||||
|  | +	bswap	%ecx
 | ||||||
|  | +	movzbl	-1(%rdi, %rdx), %edi
 | ||||||
|  | +	movzbl	-1(%rsi, %rdx), %esi
 | ||||||
|  | +	orl	%edi, %eax
 | ||||||
|  | +	orl	%esi, %ecx
 | ||||||
|  | +	/* Subtraction is okay because the upper 8 bits are zero.  */
 | ||||||
|  | +	subl	%ecx, %eax
 | ||||||
|  |  	ret | ||||||
|  |  # endif | ||||||
|  | -
 | ||||||
|  |  END (MEMCMP) | ||||||
|  |  #endif | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
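The densest trick in the patch above is the reworked [4, 7]-byte path: two overlapping big-endian loads per buffer fold the whole range into one 64-bit compare, and sbb/cmovne turn the difference into memcmp's sign without a branch. A C model of the same idea, as a sketch with __builtin_bswap32 (GCC/Clang) standing in for the movbe loads:

#include <stdint.h>
#include <string.h>

/* Big-endian 64-bit key for a buffer of 4 to 7 bytes: the high word
   is the first 4 bytes, the low word the last 4 (overlapping when
   n < 8), so comparing keys matches bytewise lexicographic order.  */
static uint64_t
load_be_overlap (const unsigned char *p, size_t n)
{
  uint32_t head, tail;
  memcpy (&head, p, 4);
  memcpy (&tail, p + n - 4, 4);
  return ((uint64_t) __builtin_bswap32 (head) << 32)
	 | __builtin_bswap32 (tail);
}

/* Branchless memcmp for sizes 4 through 7.  */
int
memcmp_4_7 (const void *s1, const void *s2, size_t n)
{
  uint64_t a = load_be_overlap (s1, n);
  uint64_t b = load_be_overlap (s2, n);
  return (a > b) - (a < b);
}

The assembly returns edx (a positive value in [4, 7]) or -1 rather than the exact difference, which the memcmp contract permits: only the sign is significant.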
							
								
								
									
510	SOURCES/glibc-RHEL-15696-57.patch	Normal file
							| @ -0,0 +1,510 @@ | |||||||
|  | From e59ced238482fd71f3e493717f14f6507346741e Mon Sep 17 00:00:00 2001 | ||||||
|  | From: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  | Date: Mon, 20 Sep 2021 16:20:15 -0500 | ||||||
|  | Subject: [PATCH] x86: Optimize memset-vec-unaligned-erms.S | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | No bug. | ||||||
|  | 
 | ||||||
|  | Optimizations are: | ||||||
|  | 
 | ||||||
|  | 1. Change control flow for L(more_2x_vec) to fall through to the loop and | ||||||
|  |    jump for L(less_4x_vec) and L(less_8x_vec). This uses less code | ||||||
|  |    size and saves jumps for length > 4x VEC_SIZE. | ||||||
|  | 
 | ||||||
|  | 2. For EVEX/AVX512 move L(less_vec) closer to entry. | ||||||
|  | 
 | ||||||
|  | 3. Avoid complex address mode for length > 2x VEC_SIZE | ||||||
|  | 
 | ||||||
|  | 4. Slightly better aligning code for the loop from the perspective of | ||||||
|  |    code size and uops. | ||||||
|  | 
 | ||||||
|  | 5. Align targets so they make full use of their fetch block and, if | ||||||
|  |    possible, their cache line. | ||||||
|  | 
 | ||||||
|  | 6. Try to reduce the total number of icache lines that will need to | ||||||
|  |    be pulled in for a given length. | ||||||
|  | 
 | ||||||
|  | 7. Include "local" version of stosb target. For AVX2/EVEX/AVX512 | ||||||
|  |    jumping to the stosb target in the sse2 code section will almost | ||||||
|  |    certainly be to a new page. The new version does increase code size | ||||||
|  |    marginally by duplicating the target but should get better iTLB | ||||||
|  |    behavior as a result. | ||||||
|  | 
 | ||||||
|  | test-memset, test-wmemset, and test-bzero are all passing. | ||||||
|  | 
 | ||||||
|  | Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  | Reviewed-by: H.J. Lu <hjl.tools@gmail.com> | ||||||
|  | ---
 | ||||||
|  |  sysdeps/x86_64/memset.S                       |  10 +- | ||||||
|  |  .../multiarch/memset-avx2-unaligned-erms.S    |  10 +- | ||||||
|  |  .../multiarch/memset-avx512-unaligned-erms.S  |  11 +- | ||||||
|  |  .../multiarch/memset-evex-unaligned-erms.S    |  11 +- | ||||||
|  |  .../multiarch/memset-vec-unaligned-erms.S     | 285 ++++++++++++------ | ||||||
|  |  5 files changed, 232 insertions(+), 95 deletions(-) | ||||||
|  | 
 | ||||||
|  | Conflicts: | ||||||
|  | 	sysdeps/x86_64/memset.S | ||||||
|  | 	(GNU URL) | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
 | ||||||
|  | index b3426795..8672b030 100644
 | ||||||
|  | --- a/sysdeps/x86_64/memset.S
 | ||||||
|  | +++ b/sysdeps/x86_64/memset.S
 | ||||||
|  | @@ -18,13 +18,15 @@
 | ||||||
|  |     <http://www.gnu.org/licenses/>.  */ | ||||||
|  |   | ||||||
|  |  #include <sysdep.h> | ||||||
|  | +#define USE_WITH_SSE2	1
 | ||||||
|  |   | ||||||
|  |  #define VEC_SIZE	16 | ||||||
|  | +#define MOV_SIZE	3
 | ||||||
|  | +#define RET_SIZE	1
 | ||||||
|  | +
 | ||||||
|  |  #define VEC(i)		xmm##i | ||||||
|  | -/* Don't use movups and movaps since it will get larger nop paddings for
 | ||||||
|  | -   alignment.  */
 | ||||||
|  | -#define VMOVU		movdqu
 | ||||||
|  | -#define VMOVA		movdqa
 | ||||||
|  | +#define VMOVU     movups
 | ||||||
|  | +#define VMOVA     movaps
 | ||||||
|  |   | ||||||
|  |  #define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ | ||||||
|  |    movd d, %xmm0; \ | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
 | ||||||
|  | index ae0860f3..1af668af 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
 | ||||||
|  | @@ -1,8 +1,14 @@
 | ||||||
|  |  #if IS_IN (libc) | ||||||
|  | +# define USE_WITH_AVX2	1
 | ||||||
|  | +
 | ||||||
|  |  # define VEC_SIZE	32 | ||||||
|  | +# define MOV_SIZE	4
 | ||||||
|  | +# define RET_SIZE	4
 | ||||||
|  | +
 | ||||||
|  |  # define VEC(i)		ymm##i | ||||||
|  | -# define VMOVU		vmovdqu
 | ||||||
|  | -# define VMOVA		vmovdqa
 | ||||||
|  | +
 | ||||||
|  | +# define VMOVU     vmovdqu
 | ||||||
|  | +# define VMOVA     vmovdqa
 | ||||||
|  |   | ||||||
|  |  # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ | ||||||
|  |    vmovd d, %xmm0; \ | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
 | ||||||
|  | index 8ad842fc..f14d6f84 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
 | ||||||
|  | @@ -1,11 +1,18 @@
 | ||||||
|  |  #if IS_IN (libc) | ||||||
|  | +# define USE_WITH_AVX512	1
 | ||||||
|  | +
 | ||||||
|  |  # define VEC_SIZE	64 | ||||||
|  | +# define MOV_SIZE	6
 | ||||||
|  | +# define RET_SIZE	1
 | ||||||
|  | +
 | ||||||
|  |  # define XMM0		xmm16 | ||||||
|  |  # define YMM0		ymm16 | ||||||
|  |  # define VEC0		zmm16 | ||||||
|  |  # define VEC(i)		VEC##i | ||||||
|  | -# define VMOVU		vmovdqu64
 | ||||||
|  | -# define VMOVA		vmovdqa64
 | ||||||
|  | +
 | ||||||
|  | +# define VMOVU     vmovdqu64
 | ||||||
|  | +# define VMOVA     vmovdqa64
 | ||||||
|  | +
 | ||||||
|  |  # define VZEROUPPER | ||||||
|  |   | ||||||
|  |  # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
 | ||||||
|  | index 640f0929..64b09e77 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
 | ||||||
|  | @@ -1,11 +1,18 @@
 | ||||||
|  |  #if IS_IN (libc) | ||||||
|  | +# define USE_WITH_EVEX	1
 | ||||||
|  | +
 | ||||||
|  |  # define VEC_SIZE	32 | ||||||
|  | +# define MOV_SIZE	6
 | ||||||
|  | +# define RET_SIZE	1
 | ||||||
|  | +
 | ||||||
|  |  # define XMM0		xmm16 | ||||||
|  |  # define YMM0		ymm16 | ||||||
|  |  # define VEC0		ymm16 | ||||||
|  |  # define VEC(i)		VEC##i | ||||||
|  | -# define VMOVU		vmovdqu64
 | ||||||
|  | -# define VMOVA		vmovdqa64
 | ||||||
|  | +
 | ||||||
|  | +# define VMOVU     vmovdqu64
 | ||||||
|  | +# define VMOVA     vmovdqa64
 | ||||||
|  | +
 | ||||||
|  |  # define VZEROUPPER | ||||||
|  |   | ||||||
|  |  # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
 | ||||||
|  | index 909c33f6..f08b7323 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
 | ||||||
|  | @@ -63,8 +63,27 @@
 | ||||||
|  |  # endif | ||||||
|  |  #endif | ||||||
|  |   | ||||||
|  | +#if VEC_SIZE == 64
 | ||||||
|  | +# define LOOP_4X_OFFSET	(VEC_SIZE * 4)
 | ||||||
|  | +#else
 | ||||||
|  | +# define LOOP_4X_OFFSET	(0)
 | ||||||
|  | +#endif
 | ||||||
|  | +
 | ||||||
|  | +#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
 | ||||||
|  | +# define END_REG	rcx
 | ||||||
|  | +# define LOOP_REG	rdi
 | ||||||
|  | +#else
 | ||||||
|  | +# define END_REG	rdi
 | ||||||
|  | +# define LOOP_REG	rdx
 | ||||||
|  | +#endif
 | ||||||
|  | +
 | ||||||
|  |  #define PAGE_SIZE 4096 | ||||||
|  |   | ||||||
|  | +/* Macro to calculate the size of a small memset block for alignment
 | ||||||
|  | +   purposes.  */
 | ||||||
|  | +#define SMALL_MEMSET_ALIGN(mov_sz,	ret_sz)	(2 * (mov_sz) + (ret_sz) + 1)
 | ||||||
|  | +
 | ||||||
|  | +
 | ||||||
|  |  #ifndef SECTION | ||||||
|  |  # error SECTION is not defined! | ||||||
|  |  #endif | ||||||
|  | @@ -74,6 +93,7 @@
 | ||||||
|  |  ENTRY (__bzero) | ||||||
|  |  	mov	%RDI_LP, %RAX_LP /* Set return value.  */ | ||||||
|  |  	mov	%RSI_LP, %RDX_LP /* Set n.  */ | ||||||
|  | +	xorl	%esi, %esi
 | ||||||
|  |  	pxor	%XMM0, %XMM0 | ||||||
|  |  	jmp	L(entry_from_bzero) | ||||||
|  |  END (__bzero) | ||||||
|  | @@ -158,7 +178,7 @@ ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
 | ||||||
|  |  END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) | ||||||
|  |  # endif | ||||||
|  |   | ||||||
|  | -ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
 | ||||||
|  | +ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
 | ||||||
|  |  	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) | ||||||
|  |  # ifdef __ILP32__ | ||||||
|  |  	/* Clear the upper 32 bits.  */ | ||||||
|  | @@ -168,75 +188,43 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
 | ||||||
|  |  	jb	L(less_vec) | ||||||
|  |  	cmp	$(VEC_SIZE * 2), %RDX_LP | ||||||
|  |  	ja	L(stosb_more_2x_vec) | ||||||
|  | -	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
 | ||||||
|  | -	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
 | ||||||
|  | -	VMOVU	%VEC(0), (%rdi)
 | ||||||
|  | +	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.
 | ||||||
|  | +	 */
 | ||||||
|  | +	VMOVU	%VEC(0), (%rax)
 | ||||||
|  | +	VMOVU	%VEC(0), -VEC_SIZE(%rax, %rdx)
 | ||||||
|  |  	VZEROUPPER_RETURN | ||||||
|  | -
 | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(stosb_more_2x_vec):
 | ||||||
|  | -	cmp	__x86_rep_stosb_threshold(%rip), %RDX_LP
 | ||||||
|  | -	ja	L(stosb)
 | ||||||
|  | -#else
 | ||||||
|  | -	.p2align 4
 | ||||||
|  |  #endif | ||||||
|  | -L(more_2x_vec):
 | ||||||
|  | -	/* Stores to first 2x VEC before cmp as any path forward will
 | ||||||
|  | -	   require it.  */
 | ||||||
|  | -	VMOVU	%VEC(0), (%rdi)
 | ||||||
|  | -	VMOVU	%VEC(0), VEC_SIZE(%rdi)
 | ||||||
|  | -	cmpq	$(VEC_SIZE * 4), %rdx
 | ||||||
|  | -	ja	L(loop_start)
 | ||||||
|  | -	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
 | ||||||
|  | -	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
 | ||||||
|  | -L(return):
 | ||||||
|  | -#if VEC_SIZE > 16
 | ||||||
|  | -	ZERO_UPPER_VEC_REGISTERS_RETURN
 | ||||||
|  | +
 | ||||||
|  | +	.p2align 4,, 10
 | ||||||
|  | +L(last_2x_vec):
 | ||||||
|  | +#ifdef USE_LESS_VEC_MASK_STORE
 | ||||||
|  | +	VMOVU	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx)
 | ||||||
|  | +	VMOVU	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx)
 | ||||||
|  |  #else | ||||||
|  | -	ret
 | ||||||
|  | +	VMOVU	%VEC(0), (VEC_SIZE * -2)(%rdi)
 | ||||||
|  | +	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi)
 | ||||||
|  |  #endif | ||||||
|  | +	VZEROUPPER_RETURN
 | ||||||
|  |   | ||||||
|  | -L(loop_start):
 | ||||||
|  | -	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rdi)
 | ||||||
|  | -	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rdi)
 | ||||||
|  | -	cmpq	$(VEC_SIZE * 8), %rdx
 | ||||||
|  | -	jbe	L(loop_end)
 | ||||||
|  | -	andq	$-(VEC_SIZE * 2), %rdi
 | ||||||
|  | -	subq	$-(VEC_SIZE * 4), %rdi
 | ||||||
|  | -	leaq	-(VEC_SIZE * 4)(%rax, %rdx), %rcx
 | ||||||
|  | -	.p2align 4
 | ||||||
|  | -L(loop):
 | ||||||
|  | -	VMOVA	%VEC(0), (%rdi)
 | ||||||
|  | -	VMOVA	%VEC(0), VEC_SIZE(%rdi)
 | ||||||
|  | -	VMOVA	%VEC(0), (VEC_SIZE * 2)(%rdi)
 | ||||||
|  | -	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rdi)
 | ||||||
|  | -	subq	$-(VEC_SIZE * 4), %rdi
 | ||||||
|  | -	cmpq	%rcx, %rdi
 | ||||||
|  | -	jb	L(loop)
 | ||||||
|  | -L(loop_end):
 | ||||||
|  | -	/* NB: rax is set as ptr in MEMSET_VDUP_TO_VEC0_AND_SET_RETURN.
 | ||||||
|  | -	       rdx as length is also unchanged.  */
 | ||||||
|  | -	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rax, %rdx)
 | ||||||
|  | -	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rax, %rdx)
 | ||||||
|  | -	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rax, %rdx)
 | ||||||
|  | -	VMOVU	%VEC(0), -VEC_SIZE(%rax, %rdx)
 | ||||||
|  | -	VZEROUPPER_SHORT_RETURN
 | ||||||
|  | -
 | ||||||
|  | -	.p2align 4
 | ||||||
|  | +	/* If we have AVX512 mask instructions, put L(less_vec) close to
 | ||||||
|  | +	   entry as it doesn't take much space and is likely a hot target.
 | ||||||
|  | +	 */
 | ||||||
|  | +#ifdef USE_LESS_VEC_MASK_STORE
 | ||||||
|  | +	.p2align 4,, 10
 | ||||||
|  |  L(less_vec): | ||||||
|  |  	/* Less than 1 VEC.  */ | ||||||
|  |  # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 | ||||||
|  |  #  error Unsupported VEC_SIZE! | ||||||
|  |  # endif | ||||||
|  | -# ifdef USE_LESS_VEC_MASK_STORE
 | ||||||
|  |  	/* Clear high bits from edi. Only keeping bits relevant to page | ||||||
|  |  	   cross check. Note that we are using rax which is set in | ||||||
|  | -	   MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out.
 | ||||||
|  | -	 */
 | ||||||
|  | +	   MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out.  */
 | ||||||
|  |  	andl	$(PAGE_SIZE - 1), %edi | ||||||
|  | -	/* Check if VEC_SIZE store cross page. Mask stores suffer serious
 | ||||||
|  | -	   performance degradation when it has to fault supress.  */
 | ||||||
|  | +	/* Check if the VEC_SIZE store crosses a page. Mask stores suffer
 | ||||||
|  | +	   serious performance degradation when they have to fault suppress.
 | ||||||
|  | +	 */
 | ||||||
|  |  	cmpl	$(PAGE_SIZE - VEC_SIZE), %edi | ||||||
|  | +	/* This is generally considered a cold target.  */
 | ||||||
|  |  	ja	L(cross_page) | ||||||
|  |  # if VEC_SIZE > 32 | ||||||
|  |  	movq	$-1, %rcx | ||||||
|  | @@ -247,58 +235,185 @@ L(less_vec):
 | ||||||
|  |  	bzhil	%edx, %ecx, %ecx | ||||||
|  |  	kmovd	%ecx, %k1 | ||||||
|  |  # endif | ||||||
|  | -	vmovdqu8	%VEC(0), (%rax) {%k1}
 | ||||||
|  | +	vmovdqu8 %VEC(0), (%rax){%k1}
 | ||||||
|  |  	VZEROUPPER_RETURN | ||||||
|  |   | ||||||
|  | +# if defined USE_MULTIARCH && IS_IN (libc)
 | ||||||
|  | +	/* Include L(stosb_local) here if including L(less_vec) between
 | ||||||
|  | +	   L(stosb_more_2x_vec) and ENTRY. This is to cache align the
 | ||||||
|  | +	   L(stosb_more_2x_vec) target.  */
 | ||||||
|  | +	.p2align 4,, 10
 | ||||||
|  | +L(stosb_local):
 | ||||||
|  | +	movzbl	%sil, %eax
 | ||||||
|  | +	mov	%RDX_LP, %RCX_LP
 | ||||||
|  | +	mov	%RDI_LP, %RDX_LP
 | ||||||
|  | +	rep	stosb
 | ||||||
|  | +	mov	%RDX_LP, %RAX_LP
 | ||||||
|  | +	VZEROUPPER_RETURN
 | ||||||
|  | +# endif
 | ||||||
|  | +#endif
 | ||||||
|  | +
 | ||||||
|  | +#if defined USE_MULTIARCH && IS_IN (libc)
 | ||||||
|  |  	.p2align 4 | ||||||
|  | -L(cross_page):
 | ||||||
|  | +L(stosb_more_2x_vec):
 | ||||||
|  | +	cmp	__x86_rep_stosb_threshold(%rip), %RDX_LP
 | ||||||
|  | +	ja	L(stosb_local)
 | ||||||
|  | +#endif
 | ||||||
|  | +	/* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x]
 | ||||||
|  | +	   and (4x, 8x] jump to target.  */
 | ||||||
|  | +L(more_2x_vec):
 | ||||||
|  | +
 | ||||||
|  | +	/* Two different methods of setting up pointers / compare. The
 | ||||||
|  | +	   two methods are based on the fact that EVEX/AVX512 mov
 | ||||||
|  | +	   instructions take more bytes than AVX2/SSE2 mov instructions,
 | ||||||
|  | +	   and that EVEX/AVX512 machines also have fast LEA_BID. Both
 | ||||||
|  | +	   setups use END_REG to avoid a complex address mode. For
 | ||||||
|  | +	   EVEX/AVX512 this saves code size and keeps a few targets in one
 | ||||||
|  | +	   fetch block. For AVX2/SSE2 this helps prevent AGU bottlenecks.  */
 | ||||||
|  | +#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
 | ||||||
|  | +	/* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 +
 | ||||||
|  | +	   LOOP_4X_OFFSET) with LEA_BID.  */
 | ||||||
|  | +
 | ||||||
|  | +	/* END_REG is rcx for EVEX/AVX512.  */
 | ||||||
|  | +	leaq	-(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
 | ||||||
|  | +#endif
 | ||||||
|  | +
 | ||||||
|  | +	/* Stores to first 2x VEC before cmp as any path forward will
 | ||||||
|  | +	   require it.  */
 | ||||||
|  | +	VMOVU	%VEC(0), (%rax)
 | ||||||
|  | +	VMOVU	%VEC(0), VEC_SIZE(%rax)
 | ||||||
|  | +
 | ||||||
|  | +
 | ||||||
|  | +#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
 | ||||||
|  | +	/* If AVX2/SSE2 compute END_REG (rdi) with ALU.  */
 | ||||||
|  | +	addq	%rdx, %END_REG
 | ||||||
|  | +#endif
 | ||||||
|  | +
 | ||||||
|  | +	cmpq	$(VEC_SIZE * 4), %rdx
 | ||||||
|  | +	jbe	L(last_2x_vec)
 | ||||||
|  | +
 | ||||||
|  | +	/* Store next 2x vec regardless.  */
 | ||||||
|  | +	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rax)
 | ||||||
|  | +	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rax)
 | ||||||
|  | +
 | ||||||
|  | +
 | ||||||
|  | +#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
 | ||||||
|  | +	/* If LOOP_4X_OFFSET is nonzero don't readjust LOOP_REG (rdi); just
 | ||||||
|  | +	   add the extra offset to addresses in the loop. Used for AVX512 to
 | ||||||
|  | +	   save space, as there is no way to encode (VEC_SIZE * 4) in imm8.  */
 | ||||||
|  | +# if LOOP_4X_OFFSET == 0
 | ||||||
|  | +	subq	$-(VEC_SIZE * 4), %LOOP_REG
 | ||||||
|  |  # endif | ||||||
|  | -# if VEC_SIZE > 32
 | ||||||
|  | -	cmpb	$32, %dl
 | ||||||
|  | -	jae	L(between_32_63)
 | ||||||
|  | +	/* Avoid imm32 compare here to save code size.  */
 | ||||||
|  | +	cmpq	%rdi, %rcx
 | ||||||
|  | +#else
 | ||||||
|  | +	addq	$-(VEC_SIZE * 4), %END_REG
 | ||||||
|  | +	cmpq	$(VEC_SIZE * 8), %rdx
 | ||||||
|  | +#endif
 | ||||||
|  | +	jbe	L(last_4x_vec)
 | ||||||
|  | +#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
 | ||||||
|  | +	/* Set LOOP_REG (rdx).  */
 | ||||||
|  | +	leaq	(VEC_SIZE * 4)(%rax), %LOOP_REG
 | ||||||
|  | +#endif
 | ||||||
|  | +	/* Align dst for loop.  */
 | ||||||
|  | +	andq	$(VEC_SIZE * -2), %LOOP_REG
 | ||||||
|  | +	.p2align 4
 | ||||||
|  | +L(loop):
 | ||||||
|  | +	VMOVA	%VEC(0), LOOP_4X_OFFSET(%LOOP_REG)
 | ||||||
|  | +	VMOVA	%VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%LOOP_REG)
 | ||||||
|  | +	VMOVA	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%LOOP_REG)
 | ||||||
|  | +	VMOVA	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%LOOP_REG)
 | ||||||
|  | +	subq	$-(VEC_SIZE * 4), %LOOP_REG
 | ||||||
|  | +	cmpq	%END_REG, %LOOP_REG
 | ||||||
|  | +	jb	L(loop)
 | ||||||
|  | +	.p2align 4,, MOV_SIZE
 | ||||||
|  | +L(last_4x_vec):
 | ||||||
|  | +	VMOVU	%VEC(0), LOOP_4X_OFFSET(%END_REG)
 | ||||||
|  | +	VMOVU	%VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%END_REG)
 | ||||||
|  | +	VMOVU	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%END_REG)
 | ||||||
|  | +	VMOVU	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%END_REG)
 | ||||||
|  | +L(return):
 | ||||||
|  | +#if VEC_SIZE > 16
 | ||||||
|  | +	ZERO_UPPER_VEC_REGISTERS_RETURN
 | ||||||
|  | +#else
 | ||||||
|  | +	ret
 | ||||||
|  | +#endif
 | ||||||
|  | +
 | ||||||
|  | +	.p2align 4,, 10
 | ||||||
|  | +#ifndef USE_LESS_VEC_MASK_STORE
 | ||||||
|  | +# if defined USE_MULTIARCH && IS_IN (libc)
 | ||||||
|  | +	/* If USE_LESS_VEC_MASK_STORE is not defined put L(stosb_local) here.
 | ||||||
|  | +	   It will be in range for the 2-byte jump encoding.  */
 | ||||||
|  | +L(stosb_local):
 | ||||||
|  | +	movzbl	%sil, %eax
 | ||||||
|  | +	mov	%RDX_LP, %RCX_LP
 | ||||||
|  | +	mov	%RDI_LP, %RDX_LP
 | ||||||
|  | +	rep	stosb
 | ||||||
|  | +	mov	%RDX_LP, %RAX_LP
 | ||||||
|  | +	VZEROUPPER_RETURN
 | ||||||
|  |  # endif | ||||||
|  | -# if VEC_SIZE > 16
 | ||||||
|  | -	cmpb	$16, %dl
 | ||||||
|  | +	/* Define L(less_vec) only if not otherwise defined.  */
 | ||||||
|  | +	.p2align 4
 | ||||||
|  | +L(less_vec):
 | ||||||
|  | +#endif
 | ||||||
|  | +L(cross_page):
 | ||||||
|  | +#if VEC_SIZE > 32
 | ||||||
|  | +	cmpl	$32, %edx
 | ||||||
|  | +	jae	L(between_32_63)
 | ||||||
|  | +#endif
 | ||||||
|  | +#if VEC_SIZE > 16
 | ||||||
|  | +	cmpl	$16, %edx
 | ||||||
|  |  	jae	L(between_16_31) | ||||||
|  | -# endif
 | ||||||
|  | -	MOVQ	%XMM0, %rcx
 | ||||||
|  | -	cmpb	$8, %dl
 | ||||||
|  | +#endif
 | ||||||
|  | +	MOVQ	%XMM0, %rdi
 | ||||||
|  | +	cmpl	$8, %edx
 | ||||||
|  |  	jae	L(between_8_15) | ||||||
|  | -	cmpb	$4, %dl
 | ||||||
|  | +	cmpl	$4, %edx
 | ||||||
|  |  	jae	L(between_4_7) | ||||||
|  | -	cmpb	$1, %dl
 | ||||||
|  | +	cmpl	$1, %edx
 | ||||||
|  |  	ja	L(between_2_3) | ||||||
|  | -	jb	1f
 | ||||||
|  | -	movb	%cl, (%rax)
 | ||||||
|  | -1:
 | ||||||
|  | +	jb	L(return)
 | ||||||
|  | +	movb	%sil, (%rax)
 | ||||||
|  |  	VZEROUPPER_RETURN | ||||||
|  | -# if VEC_SIZE > 32
 | ||||||
|  | +
 | ||||||
|  | +	/* Align small targets only if not doing so would cause them to
 | ||||||
|  | +	   cross a fetch line.  */
 | ||||||
|  | +#if VEC_SIZE > 32
 | ||||||
|  | +	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
 | ||||||
|  |  	/* From 32 to 63.  No branch when size == 32.  */ | ||||||
|  |  L(between_32_63): | ||||||
|  | -	VMOVU	%YMM0, -32(%rax,%rdx)
 | ||||||
|  |  	VMOVU	%YMM0, (%rax) | ||||||
|  | +	VMOVU	%YMM0, -32(%rax, %rdx)
 | ||||||
|  |  	VZEROUPPER_RETURN | ||||||
|  | -# endif
 | ||||||
|  | -# if VEC_SIZE > 16
 | ||||||
|  | -	/* From 16 to 31.  No branch when size == 16.  */
 | ||||||
|  | +#endif
 | ||||||
|  | +
 | ||||||
|  | +#if VEC_SIZE >= 32
 | ||||||
|  | +	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
 | ||||||
|  |  L(between_16_31): | ||||||
|  | -	VMOVU	%XMM0, -16(%rax,%rdx)
 | ||||||
|  | +	/* From 16 to 31.  No branch when size == 16.  */
 | ||||||
|  |  	VMOVU	%XMM0, (%rax) | ||||||
|  | +	VMOVU	%XMM0, -16(%rax, %rdx)
 | ||||||
|  |  	VZEROUPPER_RETURN | ||||||
|  | -# endif
 | ||||||
|  | -	/* From 8 to 15.  No branch when size == 8.  */
 | ||||||
|  | +#endif
 | ||||||
|  | +
 | ||||||
|  | +	.p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
 | ||||||
|  |  L(between_8_15): | ||||||
|  | -	movq	%rcx, -8(%rax,%rdx)
 | ||||||
|  | -	movq	%rcx, (%rax)
 | ||||||
|  | +	/* From 8 to 15.  No branch when size == 8.  */
 | ||||||
|  | +	movq	%rdi, (%rax)
 | ||||||
|  | +	movq	%rdi, -8(%rax, %rdx)
 | ||||||
|  |  	VZEROUPPER_RETURN | ||||||
|  | +
 | ||||||
|  | +	.p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE)
 | ||||||
|  |  L(between_4_7): | ||||||
|  |  	/* From 4 to 7.  No branch when size == 4.  */ | ||||||
|  | -	movl	%ecx, -4(%rax,%rdx)
 | ||||||
|  | -	movl	%ecx, (%rax)
 | ||||||
|  | +	movl	%edi, (%rax)
 | ||||||
|  | +	movl	%edi, -4(%rax, %rdx)
 | ||||||
|  |  	VZEROUPPER_RETURN | ||||||
|  | +
 | ||||||
|  | +	.p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
 | ||||||
|  |  L(between_2_3): | ||||||
|  |  	/* From 2 to 3.  No branch when size == 2.  */ | ||||||
|  | -	movw	%cx, -2(%rax,%rdx)
 | ||||||
|  | -	movw	%cx, (%rax)
 | ||||||
|  | +	movw	%di, (%rax)
 | ||||||
|  | +	movb	%dil, -1(%rax, %rdx)
 | ||||||
|  |  	VZEROUPPER_RETURN | ||||||
|  |  END (MEMSET_SYMBOL (__memset, unaligned_erms)) | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
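
The small- and medium-size paths in this memset rewrite lean on the
overlapping-store trick: store the first 2x VEC and the last 2x VEC, with the
tail store ending exactly at dst + n, so every length in the bucket is covered
with no branch on the exact size. A rough C analogue at 8-byte granularity
(illustrative names, not glibc code):

#include <stddef.h>
#include <string.h>

/* Set n bytes, 8 <= n <= 16, with two possibly-overlapping stores:
   one at dst and one ending exactly at dst + n.  */
static void
set_8_to_16 (unsigned char *dst, int c, size_t n)
{
  unsigned long long v = 0x0101010101010101ULL * (unsigned char) c;
  memcpy (dst, &v, 8);           /* head store                 */
  memcpy (dst + n - 8, &v, 8);   /* tail store, may overlap    */
}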
|  | 
 | ||||||
45	SOURCES/glibc-RHEL-15696-58.patch	Normal file
							| @ -0,0 +1,45 @@ | |||||||
|  | From bad852b61b79503fcb3c5fc379c70f768df3e1fb Mon Sep 17 00:00:00 2001 | ||||||
|  | From: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  | Date: Sat, 23 Oct 2021 01:26:47 -0400 | ||||||
|  | Subject: [PATCH] x86: Replace sse2 instructions with avx in | ||||||
|  |  memcmp-evex-movbe.S | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | This commit replaces two usages of SSE2 'movups' with AVX 'vmovdqu'. | ||||||
|  | 
 | ||||||
|  | It could potentially be dangerous to use SSE2 if this function is ever | ||||||
|  | called without using 'vzeroupper' beforehand. While compilers appear | ||||||
|  | to use 'vzeroupper' before function calls if AVX2 has been used, using | ||||||
|  | SSE2 here is more brittle. Since it is not absolutely necessary, it | ||||||
|  | should be avoided. | ||||||
|  | 
 | ||||||
|  | It costs 2 extra bytes, but the extra bytes should only eat into | ||||||
|  | alignment padding. | ||||||
|  | Reviewed-by: H.J. Lu <hjl.tools@gmail.com> | ||||||
|  | ---
 | ||||||
|  |  sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 4 ++-- | ||||||
|  |  1 file changed, 2 insertions(+), 2 deletions(-) | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
 | ||||||
|  | index 2761b54f..640f6757 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
 | ||||||
|  | @@ -561,13 +561,13 @@ L(between_16_31):
 | ||||||
|  |  	/* From 16 to 31 bytes.  No branch when size == 16.  */ | ||||||
|  |   | ||||||
|  |  	/* Use movups to save code size.  */ | ||||||
|  | -	movups	(%rsi), %xmm2
 | ||||||
|  | +	vmovdqu	(%rsi), %xmm2
 | ||||||
|  |  	VPCMP	$4, (%rdi), %xmm2, %k1 | ||||||
|  |  	kmovd	%k1, %eax | ||||||
|  |  	testl	%eax, %eax | ||||||
|  |  	jnz	L(return_vec_0_lv) | ||||||
|  |  	/* Use overlapping loads to avoid branches.  */ | ||||||
|  | -	movups	-16(%rsi, %rdx, CHAR_SIZE), %xmm2
 | ||||||
|  | +	vmovdqu	-16(%rsi, %rdx, CHAR_SIZE), %xmm2
 | ||||||
|  |  	VPCMP	$4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1 | ||||||
|  |  	addl	$(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx | ||||||
|  |  	kmovd	%k1, %eax | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
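
The concern above is the classic AVX-SSE transition: if a caller left the
upper halves of the YMM registers dirty and did not execute vzeroupper, a
legacy-SSE encoded instruction such as movups can pay a state-transition
penalty (or pick up false dependencies), while the VEX-encoded vmovdqu cannot.
A small intrinsics sketch of the guard a caller would otherwise need
(illustrative only, not glibc code; compile with -mavx):

#include <immintrin.h>

/* Clear the upper YMM state so that legacy-SSE encoded code running
   afterwards cannot hit an AVX-SSE transition penalty.  */
static void
prepare_for_legacy_sse (void)
{
  _mm256_zeroupper ();
}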
|  | 
 | ||||||
695	SOURCES/glibc-RHEL-15696-59.patch	Normal file
							| @ -0,0 +1,695 @@ | |||||||
|  | From c46e9afb2df5fc9e39ff4d13777e4b4c26e04e55 Mon Sep 17 00:00:00 2001 | ||||||
|  | From: "H.J. Lu" <hjl.tools@gmail.com> | ||||||
|  | Date: Fri, 29 Oct 2021 12:40:20 -0700 | ||||||
|  | Subject: [PATCH] x86-64: Improve EVEX strcmp with masked load | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | In strcmp-evex.S, to compare 2 32-byte strings, replace | ||||||
|  | 
 | ||||||
|  |         VMOVU   (%rdi, %rdx), %YMM0 | ||||||
|  |         VMOVU   (%rsi, %rdx), %YMM1 | ||||||
|  |         /* Each bit in K0 represents a mismatch in YMM0 and YMM1.  */ | ||||||
|  |         VPCMP   $4, %YMM0, %YMM1, %k0 | ||||||
|  |         VPCMP   $0, %YMMZERO, %YMM0, %k1 | ||||||
|  |         VPCMP   $0, %YMMZERO, %YMM1, %k2 | ||||||
|  |         /* Each bit in K1 represents a NULL in YMM0 or YMM1.  */ | ||||||
|  |         kord    %k1, %k2, %k1 | ||||||
|  |         /* Each bit in K1 represents a NULL or a mismatch.  */ | ||||||
|  |         kord    %k0, %k1, %k1 | ||||||
|  |         kmovd   %k1, %ecx | ||||||
|  |         testl   %ecx, %ecx | ||||||
|  |         jne     L(last_vector) | ||||||
|  | 
 | ||||||
|  | with | ||||||
|  | 
 | ||||||
|  |         VMOVU   (%rdi, %rdx), %YMM0 | ||||||
|  |         VPTESTM %YMM0, %YMM0, %k2 | ||||||
|  |         /* Each bit cleared in K1 represents a mismatch or a null CHAR | ||||||
|  |            in YMM0 and 32 bytes at (%rsi, %rdx).  */ | ||||||
|  |         VPCMP   $0, (%rsi, %rdx), %YMM0, %k1{%k2} | ||||||
|  |         kmovd   %k1, %ecx | ||||||
|  |         incl    %ecx | ||||||
|  |         jne     L(last_vector) | ||||||
|  | 
 | ||||||
|  | It makes EVEX strcmp faster than AVX2 strcmp by up to 40% on Tiger Lake | ||||||
|  | and Ice Lake. | ||||||
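
The rewrite works because VPCMP with a zeroing mask {%k2} folds the null check
into the compare: bit i of K1 is set only when element i of YMM0 is non-null
(VPTESTM's k2) and equal to the corresponding element of the second string, so
an all-ones mask means "no mismatch and no null", and `incl %ecx` maps exactly
that case to zero. A scalar C sketch of the mask logic (illustrative only,
assuming 32 byte elements; not glibc code):

#include <stdint.h>

/* k2[i] = (s1[i] != 0)               -- VPTESTM
   k1[i] = k2[i] && (s1[i] == s2[i])  -- VPCMP $0, ... {%k2}
   k1 == 0xffffffff  <=>  32 equal bytes with no embedded null.  */
static uint32_t
masked_cmp_mask (const uint8_t s1[32], const uint8_t s2[32])
{
  uint32_t k1 = 0;
  for (int i = 0; i < 32; i++)
    if (s1[i] != 0 && s1[i] == s2[i])
      k1 |= (uint32_t) 1 << i;
  return k1;  /* k1 + 1 == 0 means "keep looping".  */
}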
|  | 
 | ||||||
|  | Co-Authored-By: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  | ---
 | ||||||
|  |  sysdeps/x86_64/multiarch/strcmp-evex.S | 461 +++++++++++++------------ | ||||||
|  |  1 file changed, 243 insertions(+), 218 deletions(-) | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
 | ||||||
|  | index d5aa6daa..82f12ac8 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/strcmp-evex.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
 | ||||||
|  | @@ -41,6 +41,8 @@
 | ||||||
|  |  # ifdef USE_AS_WCSCMP | ||||||
|  |  /* Compare packed dwords.  */ | ||||||
|  |  #  define VPCMP		vpcmpd | ||||||
|  | +#  define VPMINU	vpminud
 | ||||||
|  | +#  define VPTESTM	vptestmd
 | ||||||
|  |  #  define SHIFT_REG32	r8d | ||||||
|  |  #  define SHIFT_REG64	r8 | ||||||
|  |  /* 1 dword char == 4 bytes.  */ | ||||||
|  | @@ -48,6 +50,8 @@
 | ||||||
|  |  # else | ||||||
|  |  /* Compare packed bytes.  */ | ||||||
|  |  #  define VPCMP		vpcmpb | ||||||
|  | +#  define VPMINU	vpminub
 | ||||||
|  | +#  define VPTESTM	vptestmb
 | ||||||
|  |  #  define SHIFT_REG32	ecx | ||||||
|  |  #  define SHIFT_REG64	rcx | ||||||
|  |  /* 1 byte char == 1 byte.  */ | ||||||
|  | @@ -67,6 +71,9 @@
 | ||||||
|  |  # define YMM5		ymm22 | ||||||
|  |  # define YMM6		ymm23 | ||||||
|  |  # define YMM7		ymm24 | ||||||
|  | +# define YMM8		ymm25
 | ||||||
|  | +# define YMM9		ymm26
 | ||||||
|  | +# define YMM10		ymm27
 | ||||||
|  |   | ||||||
|  |  /* Warning! | ||||||
|  |             wcscmp/wcsncmp have to use SIGNED comparison for elements. | ||||||
|  | @@ -76,7 +83,7 @@
 | ||||||
|  |  /* The main idea of the string comparison (byte or dword) using 256-bit | ||||||
|  |     EVEX instructions consists of comparing (VPCMP) two ymm vectors. The | ||||||
|  |     latter can be on either packed bytes or dwords depending on | ||||||
|  | -   USE_AS_WCSCMP. In order to check the null char, algorithm keeps the
 | ||||||
|  | +   USE_AS_WCSCMP. In order to check the null CHAR, algorithm keeps the
 | ||||||
|  |     matched bytes/dwords, requiring 5 EVEX instructions (3 VPCMP and 2 | ||||||
|  |     KORD). In general, the costs of comparing VEC_SIZE bytes (32-bytes) | ||||||
|  |     are 3 VPCMP and 2 KORD instructions, together with VMOVU and ktestd | ||||||
|  | @@ -123,27 +130,21 @@ ENTRY (STRCMP)
 | ||||||
|  |  	jg	L(cross_page) | ||||||
|  |  	/* Start comparing 4 vectors.  */ | ||||||
|  |  	VMOVU	(%rdi), %YMM0 | ||||||
|  | -	VMOVU	(%rsi), %YMM1
 | ||||||
|  |   | ||||||
|  | -	/* Each bit in K0 represents a mismatch in YMM0 and YMM1.  */
 | ||||||
|  | -	VPCMP	$4, %YMM0, %YMM1, %k0
 | ||||||
|  | +	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
 | ||||||
|  | +	VPTESTM	%YMM0, %YMM0, %k2
 | ||||||
|  |   | ||||||
|  | -	/* Check for NULL in YMM0.  */
 | ||||||
|  | -	VPCMP	$0, %YMMZERO, %YMM0, %k1
 | ||||||
|  | -	/* Check for NULL in YMM1.  */
 | ||||||
|  | -	VPCMP	$0, %YMMZERO, %YMM1, %k2
 | ||||||
|  | -	/* Each bit in K1 represents a NULL in YMM0 or YMM1.  */
 | ||||||
|  | -	kord	%k1, %k2, %k1
 | ||||||
|  | +	/* Each bit cleared in K1 represents a mismatch or a null CHAR
 | ||||||
|  | +	   in YMM0 and 32 bytes at (%rsi).  */
 | ||||||
|  | +	VPCMP	$0, (%rsi), %YMM0, %k1{%k2}
 | ||||||
|  |   | ||||||
|  | -	/* Each bit in K1 represents:
 | ||||||
|  | -	   1. A mismatch in YMM0 and YMM1.  Or
 | ||||||
|  | -	   2. A NULL in YMM0 or YMM1.
 | ||||||
|  | -	 */
 | ||||||
|  | -	kord	%k0, %k1, %k1
 | ||||||
|  | -
 | ||||||
|  | -	ktestd	%k1, %k1
 | ||||||
|  | -	je	L(next_3_vectors)
 | ||||||
|  |  	kmovd	%k1, %ecx | ||||||
|  | +# ifdef USE_AS_WCSCMP
 | ||||||
|  | +	subl	$0xff, %ecx
 | ||||||
|  | +# else
 | ||||||
|  | +	incl	%ecx
 | ||||||
|  | +# endif
 | ||||||
|  | +	je	L(next_3_vectors)
 | ||||||
|  |  	tzcntl	%ecx, %edx | ||||||
|  |  # ifdef USE_AS_WCSCMP | ||||||
|  |  	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */ | ||||||
|  | @@ -172,9 +173,7 @@ L(return):
 | ||||||
|  |  # endif | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  | -	.p2align 4
 | ||||||
|  |  L(return_vec_size): | ||||||
|  | -	kmovd	%k1, %ecx
 | ||||||
|  |  	tzcntl	%ecx, %edx | ||||||
|  |  # ifdef USE_AS_WCSCMP | ||||||
|  |  	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */ | ||||||
|  | @@ -210,9 +209,7 @@ L(return_vec_size):
 | ||||||
|  |  # endif | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  | -	.p2align 4
 | ||||||
|  |  L(return_2_vec_size): | ||||||
|  | -	kmovd	%k1, %ecx
 | ||||||
|  |  	tzcntl	%ecx, %edx | ||||||
|  |  # ifdef USE_AS_WCSCMP | ||||||
|  |  	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */ | ||||||
|  | @@ -248,9 +245,7 @@ L(return_2_vec_size):
 | ||||||
|  |  # endif | ||||||
|  |  	ret | ||||||
|  |   | ||||||
|  | -	.p2align 4
 | ||||||
|  |  L(return_3_vec_size): | ||||||
|  | -	kmovd	%k1, %ecx
 | ||||||
|  |  	tzcntl	%ecx, %edx | ||||||
|  |  # ifdef USE_AS_WCSCMP | ||||||
|  |  	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */ | ||||||
|  | @@ -289,43 +284,45 @@ L(return_3_vec_size):
 | ||||||
|  |  	.p2align 4 | ||||||
|  |  L(next_3_vectors): | ||||||
|  |  	VMOVU	VEC_SIZE(%rdi), %YMM0 | ||||||
|  | -	VMOVU	VEC_SIZE(%rsi), %YMM1
 | ||||||
|  | -	/* Each bit in K0 represents a mismatch in YMM0 and YMM1.  */
 | ||||||
|  | -	VPCMP	$4, %YMM0, %YMM1, %k0
 | ||||||
|  | -	VPCMP	$0, %YMMZERO, %YMM0, %k1
 | ||||||
|  | -	VPCMP	$0, %YMMZERO, %YMM1, %k2
 | ||||||
|  | -	/* Each bit in K1 represents a NULL in YMM0 or YMM1.  */
 | ||||||
|  | -	kord	%k1, %k2, %k1
 | ||||||
|  | -	/* Each bit in K1 represents a NULL or a mismatch.  */
 | ||||||
|  | -	kord	%k0, %k1, %k1
 | ||||||
|  | -	ktestd	%k1, %k1
 | ||||||
|  | +	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
 | ||||||
|  | +	VPTESTM	%YMM0, %YMM0, %k2
 | ||||||
|  | +	/* Each bit cleared in K1 represents a mismatch or a null CHAR
 | ||||||
|  | +	   in YMM0 and 32 bytes at VEC_SIZE(%rsi).  */
 | ||||||
|  | +	VPCMP	$0, VEC_SIZE(%rsi), %YMM0, %k1{%k2}
 | ||||||
|  | +	kmovd	%k1, %ecx
 | ||||||
|  | +# ifdef USE_AS_WCSCMP
 | ||||||
|  | +	subl	$0xff, %ecx
 | ||||||
|  | +# else
 | ||||||
|  | +	incl	%ecx
 | ||||||
|  | +# endif
 | ||||||
|  |  	jne	L(return_vec_size) | ||||||
|  |   | ||||||
|  | -	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM2
 | ||||||
|  | -	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM3
 | ||||||
|  | -	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM4
 | ||||||
|  | -	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM5
 | ||||||
|  | -
 | ||||||
|  | -	/* Each bit in K0 represents a mismatch in YMM2 and YMM4.  */
 | ||||||
|  | -	VPCMP	$4, %YMM2, %YMM4, %k0
 | ||||||
|  | -	VPCMP	$0, %YMMZERO, %YMM2, %k1
 | ||||||
|  | -	VPCMP	$0, %YMMZERO, %YMM4, %k2
 | ||||||
|  | -	/* Each bit in K1 represents a NULL in YMM2 or YMM4.  */
 | ||||||
|  | -	kord	%k1, %k2, %k1
 | ||||||
|  | -	/* Each bit in K1 represents a NULL or a mismatch.  */
 | ||||||
|  | -	kord	%k0, %k1, %k1
 | ||||||
|  | -	ktestd	%k1, %k1
 | ||||||
|  | +	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM0
 | ||||||
|  | +	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
 | ||||||
|  | +	VPTESTM	%YMM0, %YMM0, %k2
 | ||||||
|  | +	/* Each bit cleared in K1 represents a mismatch or a null CHAR
 | ||||||
|  | +	   in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi).  */
 | ||||||
|  | +	VPCMP	$0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2}
 | ||||||
|  | +	kmovd	%k1, %ecx
 | ||||||
|  | +# ifdef USE_AS_WCSCMP
 | ||||||
|  | +	subl	$0xff, %ecx
 | ||||||
|  | +# else
 | ||||||
|  | +	incl	%ecx
 | ||||||
|  | +# endif
 | ||||||
|  |  	jne	L(return_2_vec_size) | ||||||
|  |   | ||||||
|  | -	/* Each bit in K0 represents a mismatch in YMM3 and YMM5.  */
 | ||||||
|  | -	VPCMP	$4, %YMM3, %YMM5, %k0
 | ||||||
|  | -	VPCMP	$0, %YMMZERO, %YMM3, %k1
 | ||||||
|  | -	VPCMP	$0, %YMMZERO, %YMM5, %k2
 | ||||||
|  | -	/* Each bit in K1 represents a NULL in YMM3 or YMM5.  */
 | ||||||
|  | -	kord	%k1, %k2, %k1
 | ||||||
|  | -	/* Each bit in K1 represents a NULL or a mismatch.  */
 | ||||||
|  | -	kord	%k0, %k1, %k1
 | ||||||
|  | -	ktestd	%k1, %k1
 | ||||||
|  | +	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM0
 | ||||||
|  | +	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
 | ||||||
|  | +	VPTESTM	%YMM0, %YMM0, %k2
 | ||||||
|  | +	/* Each bit cleared in K1 represents a mismatch or a null CHAR
 | ||||||
|  | +	   in YMM0 and 32 bytes at (VEC_SIZE * 3)(%rsi).  */
 | ||||||
|  | +	VPCMP	$0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2}
 | ||||||
|  | +	kmovd	%k1, %ecx
 | ||||||
|  | +# ifdef USE_AS_WCSCMP
 | ||||||
|  | +	subl	$0xff, %ecx
 | ||||||
|  | +# else
 | ||||||
|  | +	incl	%ecx
 | ||||||
|  | +# endif
 | ||||||
|  |  	jne	L(return_3_vec_size) | ||||||
|  |  L(main_loop_header): | ||||||
|  |  	leaq	(VEC_SIZE * 4)(%rdi), %rdx | ||||||
|  | @@ -375,56 +372,51 @@ L(back_to_loop):
 | ||||||
|  |  	VMOVA	VEC_SIZE(%rax), %YMM2 | ||||||
|  |  	VMOVA	(VEC_SIZE * 2)(%rax), %YMM4 | ||||||
|  |  	VMOVA	(VEC_SIZE * 3)(%rax), %YMM6 | ||||||
|  | -	VMOVU	(%rdx), %YMM1
 | ||||||
|  | -	VMOVU	VEC_SIZE(%rdx), %YMM3
 | ||||||
|  | -	VMOVU	(VEC_SIZE * 2)(%rdx), %YMM5
 | ||||||
|  | -	VMOVU	(VEC_SIZE * 3)(%rdx), %YMM7
 | ||||||
|  | -
 | ||||||
|  | -	VPCMP	$4, %YMM0, %YMM1, %k0
 | ||||||
|  | -	VPCMP	$0, %YMMZERO, %YMM0, %k1
 | ||||||
|  | -	VPCMP	$0, %YMMZERO, %YMM1, %k2
 | ||||||
|  | -	kord	%k1, %k2, %k1
 | ||||||
|  | -	/* Each bit in K4 represents a NULL or a mismatch in YMM0 and
 | ||||||
|  | -	   YMM1.  */
 | ||||||
|  | -	kord	%k0, %k1, %k4
 | ||||||
|  | -
 | ||||||
|  | -	VPCMP	$4, %YMM2, %YMM3, %k0
 | ||||||
|  | -	VPCMP	$0, %YMMZERO, %YMM2, %k1
 | ||||||
|  | -	VPCMP	$0, %YMMZERO, %YMM3, %k2
 | ||||||
|  | -	kord	%k1, %k2, %k1
 | ||||||
|  | -	/* Each bit in K5 represents a NULL or a mismatch in YMM2 and
 | ||||||
|  | -	   YMM3.  */
 | ||||||
|  | -	kord	%k0, %k1, %k5
 | ||||||
|  | -
 | ||||||
|  | -	VPCMP	$4, %YMM4, %YMM5, %k0
 | ||||||
|  | -	VPCMP	$0, %YMMZERO, %YMM4, %k1
 | ||||||
|  | -	VPCMP	$0, %YMMZERO, %YMM5, %k2
 | ||||||
|  | -	kord	%k1, %k2, %k1
 | ||||||
|  | -	/* Each bit in K6 represents a NULL or a mismatch in YMM4 and
 | ||||||
|  | -	   YMM5.  */
 | ||||||
|  | -	kord	%k0, %k1, %k6
 | ||||||
|  | -
 | ||||||
|  | -	VPCMP	$4, %YMM6, %YMM7, %k0
 | ||||||
|  | -	VPCMP	$0, %YMMZERO, %YMM6, %k1
 | ||||||
|  | -	VPCMP	$0, %YMMZERO, %YMM7, %k2
 | ||||||
|  | -	kord	%k1, %k2, %k1
 | ||||||
|  | -	/* Each bit in K7 represents a NULL or a mismatch in YMM6 and
 | ||||||
|  | -	   YMM7.  */
 | ||||||
|  | -	kord	%k0, %k1, %k7
 | ||||||
|  | -
 | ||||||
|  | -	kord	%k4, %k5, %k0
 | ||||||
|  | -	kord	%k6, %k7, %k1
 | ||||||
|  | -
 | ||||||
|  | -	/* Test each mask (32 bits) individually because for VEC_SIZE
 | ||||||
|  | -	   == 32 is not possible to OR the four masks and keep all bits
 | ||||||
|  | -	   in a 64-bit integer register, differing from SSE2 strcmp
 | ||||||
|  | -	   where ORing is possible.  */
 | ||||||
|  | -	kortestd %k0, %k1
 | ||||||
|  | -	je	L(loop)
 | ||||||
|  | -	ktestd	%k4, %k4
 | ||||||
|  | +
 | ||||||
|  | +	VPMINU	%YMM0, %YMM2, %YMM8
 | ||||||
|  | +	VPMINU	%YMM4, %YMM6, %YMM9
 | ||||||
|  | +
 | ||||||
|  | +	/* A zero CHAR in YMM8 means that there is a null CHAR.  */
 | ||||||
|  | +	VPMINU	%YMM8, %YMM9, %YMM8
 | ||||||
|  | +
 | ||||||
|  | +	/* Each bit set in K1 represents a non-null CHAR in YMM8.  */
 | ||||||
|  | +	VPTESTM	%YMM8, %YMM8, %k1
 | ||||||
|  | +
 | ||||||
|  | +	/* (YMM ^ YMM): A non-zero CHAR represents a mismatch.  */
 | ||||||
|  | +	vpxorq	(%rdx), %YMM0, %YMM1
 | ||||||
|  | +	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM3
 | ||||||
|  | +	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM4, %YMM5
 | ||||||
|  | +	vpxorq	(VEC_SIZE * 3)(%rdx), %YMM6, %YMM7
 | ||||||
|  | +
 | ||||||
|  | +	vporq	%YMM1, %YMM3, %YMM9
 | ||||||
|  | +	vporq	%YMM5, %YMM7, %YMM10
 | ||||||
|  | +
 | ||||||
|  | +	/* A non-zero CHAR in YMM9 represents a mismatch.  */
 | ||||||
|  | +	vporq	%YMM9, %YMM10, %YMM9
 | ||||||
|  | +
 | ||||||
|  | +	/* Each bit cleared in K0 represents a mismatch or a null CHAR.  */
 | ||||||
|  | +	VPCMP	$0, %YMMZERO, %YMM9, %k0{%k1}
 | ||||||
|  | +	kmovd   %k0, %ecx
 | ||||||
|  | +# ifdef USE_AS_WCSCMP
 | ||||||
|  | +	subl	$0xff, %ecx
 | ||||||
|  | +# else
 | ||||||
|  | +	incl	%ecx
 | ||||||
|  | +# endif
 | ||||||
|  | +	je	 L(loop)
 | ||||||
|  | +
 | ||||||
|  | +	/* Each bit set in K1 represents a non-null CHAR in YMM0.  */
 | ||||||
|  | +	VPTESTM	%YMM0, %YMM0, %k1
 | ||||||
|  | +	/* Each bit cleared in K0 represents a mismatch or a null CHAR
 | ||||||
|  | +	   in YMM0 and (%rdx).  */
 | ||||||
|  | +	VPCMP	$0, %YMMZERO, %YMM1, %k0{%k1}
 | ||||||
|  | +	kmovd	%k0, %ecx
 | ||||||
|  | +# ifdef USE_AS_WCSCMP
 | ||||||
|  | +	subl	$0xff, %ecx
 | ||||||
|  | +# else
 | ||||||
|  | +	incl	%ecx
 | ||||||
|  | +# endif
 | ||||||
|  |  	je	L(test_vec) | ||||||
|  | -	kmovd	%k4, %edi
 | ||||||
|  | -	tzcntl	%edi, %ecx
 | ||||||
|  | +	tzcntl	%ecx, %ecx
 | ||||||
|  |  # ifdef USE_AS_WCSCMP | ||||||
|  |  	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */ | ||||||
|  |  	sall	$2, %ecx | ||||||
|  | @@ -466,9 +458,18 @@ L(test_vec):
 | ||||||
|  |  	cmpq	$VEC_SIZE, %r11 | ||||||
|  |  	jbe	L(zero) | ||||||
|  |  # endif | ||||||
|  | -	ktestd	%k5, %k5
 | ||||||
|  | +	/* Each bit set in K1 represents a non-null CHAR in YMM2.  */
 | ||||||
|  | +	VPTESTM	%YMM2, %YMM2, %k1
 | ||||||
|  | +	/* Each bit cleared in K0 represents a mismatch or a null CHAR
 | ||||||
|  | +	   in YMM2 and VEC_SIZE(%rdx).  */
 | ||||||
|  | +	VPCMP	$0, %YMMZERO, %YMM3, %k0{%k1}
 | ||||||
|  | +	kmovd	%k0, %ecx
 | ||||||
|  | +# ifdef USE_AS_WCSCMP
 | ||||||
|  | +	subl	$0xff, %ecx
 | ||||||
|  | +# else
 | ||||||
|  | +	incl	%ecx
 | ||||||
|  | +# endif
 | ||||||
|  |  	je	L(test_2_vec) | ||||||
|  | -	kmovd	%k5, %ecx
 | ||||||
|  |  	tzcntl	%ecx, %edi | ||||||
|  |  # ifdef USE_AS_WCSCMP | ||||||
|  |  	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */ | ||||||
|  | @@ -512,9 +513,18 @@ L(test_2_vec):
 | ||||||
|  |  	cmpq	$(VEC_SIZE * 2), %r11 | ||||||
|  |  	jbe	L(zero) | ||||||
|  |  # endif | ||||||
|  | -	ktestd	%k6, %k6
 | ||||||
|  | +	/* Each bit set in K1 represents a non-null CHAR in YMM4.  */
 | ||||||
|  | +	VPTESTM	%YMM4, %YMM4, %k1
 | ||||||
|  | +	/* Each bit cleared in K0 represents a mismatch or a null CHAR
 | ||||||
|  | +	   in YMM4 and (VEC_SIZE * 2)(%rdx).  */
 | ||||||
|  | +	VPCMP	$0, %YMMZERO, %YMM5, %k0{%k1}
 | ||||||
|  | +	kmovd	%k0, %ecx
 | ||||||
|  | +# ifdef USE_AS_WCSCMP
 | ||||||
|  | +	subl	$0xff, %ecx
 | ||||||
|  | +# else
 | ||||||
|  | +	incl	%ecx
 | ||||||
|  | +# endif
 | ||||||
|  |  	je	L(test_3_vec) | ||||||
|  | -	kmovd	%k6, %ecx
 | ||||||
|  |  	tzcntl	%ecx, %edi | ||||||
|  |  # ifdef USE_AS_WCSCMP | ||||||
|  |  	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */ | ||||||
|  | @@ -558,8 +568,18 @@ L(test_3_vec):
 | ||||||
|  |  	cmpq	$(VEC_SIZE * 3), %r11 | ||||||
|  |  	jbe	L(zero) | ||||||
|  |  # endif | ||||||
|  | -	kmovd	%k7, %esi
 | ||||||
|  | -	tzcntl	%esi, %ecx
 | ||||||
|  | +	/* Each bit set in K1 represents a non-null CHAR in YMM6.  */
 | ||||||
|  | +	VPTESTM	%YMM6, %YMM6, %k1
 | ||||||
|  | +	/* Each bit cleared in K0 represents a mismatch or a null CHAR
 | ||||||
|  | +	   in YMM6 and (VEC_SIZE * 3)(%rdx).  */
 | ||||||
|  | +	VPCMP	$0, %YMMZERO, %YMM7, %k0{%k1}
 | ||||||
|  | +	kmovd	%k0, %ecx
 | ||||||
|  | +# ifdef USE_AS_WCSCMP
 | ||||||
|  | +	subl	$0xff, %ecx
 | ||||||
|  | +# else
 | ||||||
|  | +	incl	%ecx
 | ||||||
|  | +# endif
 | ||||||
|  | +	tzcntl	%ecx, %ecx
 | ||||||
|  |  # ifdef USE_AS_WCSCMP | ||||||
|  |  	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */ | ||||||
|  |  	sall	$2, %ecx | ||||||
|  | @@ -615,39 +635,51 @@ L(loop_cross_page):
 | ||||||
|  |   | ||||||
|  |  	VMOVU	(%rax, %r10), %YMM2 | ||||||
|  |  	VMOVU	VEC_SIZE(%rax, %r10), %YMM3 | ||||||
|  | -	VMOVU	(%rdx, %r10), %YMM4
 | ||||||
|  | -	VMOVU	VEC_SIZE(%rdx, %r10), %YMM5
 | ||||||
|  | -
 | ||||||
|  | -	VPCMP	$4, %YMM4, %YMM2, %k0
 | ||||||
|  | -	VPCMP	$0, %YMMZERO, %YMM2, %k1
 | ||||||
|  | -	VPCMP	$0, %YMMZERO, %YMM4, %k2
 | ||||||
|  | -	kord	%k1, %k2, %k1
 | ||||||
|  | -	/* Each bit in K1 represents a NULL or a mismatch in YMM2 and
 | ||||||
|  | -	   YMM4.  */
 | ||||||
|  | -	kord	%k0, %k1, %k1
 | ||||||
|  | -
 | ||||||
|  | -	VPCMP	$4, %YMM5, %YMM3, %k3
 | ||||||
|  | -	VPCMP	$0, %YMMZERO, %YMM3, %k4
 | ||||||
|  | -	VPCMP	$0, %YMMZERO, %YMM5, %k5
 | ||||||
|  | -	kord	%k4, %k5, %k4
 | ||||||
|  | -	/* Each bit in K3 represents a NULL or a mismatch in YMM3 and
 | ||||||
|  | -	   YMM5.  */
 | ||||||
|  | -	kord	%k3, %k4, %k3
 | ||||||
|  | +
 | ||||||
|  | +	/* Each bit set in K2 represents a non-null CHAR in YMM2.  */
 | ||||||
|  | +	VPTESTM	%YMM2, %YMM2, %k2
 | ||||||
|  | +	/* Each bit cleared in K1 represents a mismatch or a null CHAR
 | ||||||
|  | +	   in YMM2 and 32 bytes at (%rdx, %r10).  */
 | ||||||
|  | +	VPCMP	$0, (%rdx, %r10), %YMM2, %k1{%k2}
 | ||||||
|  | +	kmovd	%k1, %r9d
 | ||||||
|  | +	/* Don't use subl since it is the lower 16/32 bits of RDI
 | ||||||
|  | +	   below.  */
 | ||||||
|  | +	notl	%r9d
 | ||||||
|  | +# ifdef USE_AS_WCSCMP
 | ||||||
|  | +	/* Only last 8 bits are valid.  */
 | ||||||
|  | +	andl	$0xff, %r9d
 | ||||||
|  | +# endif
 | ||||||
|  | +
 | ||||||
|  | +	/* Each bit set in K4 represents a non-null CHAR in YMM3.  */
 | ||||||
|  | +	VPTESTM	%YMM3, %YMM3, %k4
 | ||||||
|  | +	/* Each bit cleared in K3 represents a mismatch or a null CHAR
 | ||||||
|  | +	   in YMM3 and 32 bytes at VEC_SIZE(%rdx, %r10).  */
 | ||||||
|  | +	VPCMP	$0, VEC_SIZE(%rdx, %r10), %YMM3, %k3{%k4}
 | ||||||
|  | +	kmovd	%k3, %edi
 | ||||||
|  | +# ifdef USE_AS_WCSCMP
 | ||||||
|  | +	/* Don't use subl since it is the upper 8 bits of EDI below.  */
 | ||||||
|  | +	notl	%edi
 | ||||||
|  | +	andl	$0xff, %edi
 | ||||||
|  | +# else
 | ||||||
|  | +	incl	%edi
 | ||||||
|  | +# endif
 | ||||||
|  |   | ||||||
|  |  # ifdef USE_AS_WCSCMP | ||||||
|  | -	/* NB: Each bit in K1/K3 represents 4-byte element.  */
 | ||||||
|  | -	kshiftlw $8, %k3, %k2
 | ||||||
|  | +	/* NB: Each bit in EDI/R9D represents 4-byte element.  */
 | ||||||
|  | +	sall	$8, %edi
 | ||||||
|  |  	/* NB: Divide shift count by 4 since each bit in K1 represent 4 | ||||||
|  |  	   bytes.  */ | ||||||
|  |  	movl	%ecx, %SHIFT_REG32 | ||||||
|  |  	sarl	$2, %SHIFT_REG32 | ||||||
|  | +
 | ||||||
|  | +	/* Each bit in EDI represents a null CHAR or a mismatch.  */
 | ||||||
|  | +	orl	%r9d, %edi
 | ||||||
|  |  # else | ||||||
|  | -	kshiftlq $32, %k3, %k2
 | ||||||
|  | -# endif
 | ||||||
|  | +	salq	$32, %rdi
 | ||||||
|  |   | ||||||
|  | -	/* Each bit in K1 represents a NULL or a mismatch.  */
 | ||||||
|  | -	korq	%k1, %k2, %k1
 | ||||||
|  | -	kmovq	%k1, %rdi
 | ||||||
|  | +	/* Each bit in RDI represents a null CHAR or a mismatch.  */
 | ||||||
|  | +	orq	%r9, %rdi
 | ||||||
|  | +# endif
 | ||||||
|  |   | ||||||
|  |  	/* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes.  */ | ||||||
|  |  	shrxq	%SHIFT_REG64, %rdi, %rdi | ||||||
|  | @@ -692,35 +724,45 @@ L(loop_cross_page_2_vec):
 | ||||||
|  |  	/* The first VEC_SIZE * 2 bytes match or are ignored.  */ | ||||||
|  |  	VMOVU	(VEC_SIZE * 2)(%rax, %r10), %YMM0 | ||||||
|  |  	VMOVU	(VEC_SIZE * 3)(%rax, %r10), %YMM1 | ||||||
|  | -	VMOVU	(VEC_SIZE * 2)(%rdx, %r10), %YMM2
 | ||||||
|  | -	VMOVU	(VEC_SIZE * 3)(%rdx, %r10), %YMM3
 | ||||||
|  | -
 | ||||||
|  | -	VPCMP	$4, %YMM0, %YMM2, %k0
 | ||||||
|  | -	VPCMP	$0, %YMMZERO, %YMM0, %k1
 | ||||||
|  | -	VPCMP	$0, %YMMZERO, %YMM2, %k2
 | ||||||
|  | -	kord	%k1, %k2, %k1
 | ||||||
|  | -	/* Each bit in K1 represents a NULL or a mismatch in YMM0 and
 | ||||||
|  | -	   YMM2.  */
 | ||||||
|  | -	kord	%k0, %k1, %k1
 | ||||||
|  | -
 | ||||||
|  | -	VPCMP	$4, %YMM1, %YMM3, %k3
 | ||||||
|  | -	VPCMP	$0, %YMMZERO, %YMM1, %k4
 | ||||||
|  | -	VPCMP	$0, %YMMZERO, %YMM3, %k5
 | ||||||
|  | -	kord	%k4, %k5, %k4
 | ||||||
|  | -	/* Each bit in K3 represents a NULL or a mismatch in YMM1 and
 | ||||||
|  | -	   YMM3.  */
 | ||||||
|  | -	kord	%k3, %k4, %k3
 | ||||||
|  |   | ||||||
|  | +	VPTESTM	%YMM0, %YMM0, %k2
 | ||||||
|  | +	/* Each bit cleared in K1 represents a mismatch or a null CHAR
 | ||||||
|  | +	   in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rdx, %r10).  */
 | ||||||
|  | +	VPCMP	$0, (VEC_SIZE * 2)(%rdx, %r10), %YMM0, %k1{%k2}
 | ||||||
|  | +	kmovd	%k1, %r9d
 | ||||||
|  | +	/* Don't use subl since it is the lower 16/32 bits of RDI
 | ||||||
|  | +	   below.  */
 | ||||||
|  | +	notl	%r9d
 | ||||||
|  |  # ifdef USE_AS_WCSCMP | ||||||
|  | -	/* NB: Each bit in K1/K3 represents 4-byte element.  */
 | ||||||
|  | -	kshiftlw $8, %k3, %k2
 | ||||||
|  | +	/* Only last 8 bits are valid.  */
 | ||||||
|  | +	andl	$0xff, %r9d
 | ||||||
|  | +# endif
 | ||||||
|  | +
 | ||||||
|  | +	VPTESTM	%YMM1, %YMM1, %k4
 | ||||||
|  | +	/* Each bit cleared in K3 represents a mismatch or a null CHAR
 | ||||||
|  | +	   in YMM1 and 32 bytes at (VEC_SIZE * 3)(%rdx, %r10).  */
 | ||||||
|  | +	VPCMP	$0, (VEC_SIZE * 3)(%rdx, %r10), %YMM1, %k3{%k4}
 | ||||||
|  | +	kmovd	%k3, %edi
 | ||||||
|  | +# ifdef USE_AS_WCSCMP
 | ||||||
|  | +	/* Don't use subl since it is the upper 8 bits of EDI below.  */
 | ||||||
|  | +	notl	%edi
 | ||||||
|  | +	andl	$0xff, %edi
 | ||||||
|  |  # else | ||||||
|  | -	kshiftlq $32, %k3, %k2
 | ||||||
|  | +	incl	%edi
 | ||||||
|  |  # endif | ||||||
|  |   | ||||||
|  | -	/* Each bit in K1 represents a NULL or a mismatch.  */
 | ||||||
|  | -	korq	%k1, %k2, %k1
 | ||||||
|  | -	kmovq	%k1, %rdi
 | ||||||
|  | +# ifdef USE_AS_WCSCMP
 | ||||||
|  | +	/* NB: Each bit in EDI/R9D represents 4-byte element.  */
 | ||||||
|  | +	sall	$8, %edi
 | ||||||
|  | +
 | ||||||
|  | +	/* Each bit in EDI represents a null CHAR or a mismatch.  */
 | ||||||
|  | +	orl	%r9d, %edi
 | ||||||
|  | +# else
 | ||||||
|  | +	salq	$32, %rdi
 | ||||||
|  | +
 | ||||||
|  | +	/* Each bit in RDI represents a null CHAR or a mismatch.  */
 | ||||||
|  | +	orq	%r9, %rdi
 | ||||||
|  | +# endif
 | ||||||
|  |   | ||||||
|  |  	xorl	%r8d, %r8d | ||||||
|  |  	/* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes.  */ | ||||||
|  | @@ -729,12 +771,15 @@ L(loop_cross_page_2_vec):
 | ||||||
|  |  	/* R8 has number of bytes skipped.  */ | ||||||
|  |  	movl	%ecx, %r8d | ||||||
|  |  # ifdef USE_AS_WCSCMP | ||||||
|  | -	/* NB: Divide shift count by 4 since each bit in K1 represent 4
 | ||||||
|  | +	/* NB: Divide shift count by 4 since each bit in RDI represents 4
 | ||||||
|  |  	   bytes.  */ | ||||||
|  |  	sarl	$2, %ecx | ||||||
|  | -# endif
 | ||||||
|  | +	/* Skip ECX bytes.  */
 | ||||||
|  | +	shrl	%cl, %edi
 | ||||||
|  | +# else
 | ||||||
|  |  	/* Skip ECX bytes.  */ | ||||||
|  |  	shrq	%cl, %rdi | ||||||
|  | +# endif
 | ||||||
|  |  1: | ||||||
|  |  	/* Before jumping back to the loop, set ESI to the number of | ||||||
|  |  	   VEC_SIZE * 4 blocks before page crossing.  */ | ||||||
|  | @@ -818,7 +863,7 @@ L(cross_page_loop):
 | ||||||
|  |  	movzbl	(%rdi, %rdx), %eax | ||||||
|  |  	movzbl	(%rsi, %rdx), %ecx | ||||||
|  |  # endif | ||||||
|  | -	/* Check null char.  */
 | ||||||
|  | +	/* Check null CHAR.  */
 | ||||||
|  |  	testl	%eax, %eax | ||||||
|  |  	jne	L(cross_page_loop) | ||||||
|  |  	/* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED | ||||||
|  | @@ -901,18 +946,17 @@ L(cross_page):
 | ||||||
|  |  	jg	L(cross_page_1_vector) | ||||||
|  |  L(loop_1_vector): | ||||||
|  |  	VMOVU	(%rdi, %rdx), %YMM0 | ||||||
|  | -	VMOVU	(%rsi, %rdx), %YMM1
 | ||||||
|  | -
 | ||||||
|  | -	/* Each bit in K0 represents a mismatch in YMM0 and YMM1.  */
 | ||||||
|  | -	VPCMP	$4, %YMM0, %YMM1, %k0
 | ||||||
|  | -	VPCMP	$0, %YMMZERO, %YMM0, %k1
 | ||||||
|  | -	VPCMP	$0, %YMMZERO, %YMM1, %k2
 | ||||||
|  | -	/* Each bit in K1 represents a NULL in YMM0 or YMM1.  */
 | ||||||
|  | -	kord	%k1, %k2, %k1
 | ||||||
|  | -	/* Each bit in K1 represents a NULL or a mismatch.  */
 | ||||||
|  | -	kord	%k0, %k1, %k1
 | ||||||
|  | +
 | ||||||
|  | +	VPTESTM	%YMM0, %YMM0, %k2
 | ||||||
|  | +	/* Each bit cleared in K1 represents a mismatch or a null CHAR
 | ||||||
|  | +	   in YMM0 and 32 bytes at (%rsi, %rdx).  */
 | ||||||
|  | +	VPCMP	$0, (%rsi, %rdx), %YMM0, %k1{%k2}
 | ||||||
|  |  	kmovd	%k1, %ecx | ||||||
|  | -	testl	%ecx, %ecx
 | ||||||
|  | +# ifdef USE_AS_WCSCMP
 | ||||||
|  | +	subl	$0xff, %ecx
 | ||||||
|  | +# else
 | ||||||
|  | +	incl	%ecx
 | ||||||
|  | +# endif
 | ||||||
|  |  	jne	L(last_vector) | ||||||
|  |   | ||||||
|  |  	addl	$VEC_SIZE, %edx | ||||||
|  | @@ -931,18 +975,17 @@ L(cross_page_1_vector):
 | ||||||
|  |  	cmpl	$(PAGE_SIZE - 16), %eax | ||||||
|  |  	jg	L(cross_page_1_xmm) | ||||||
|  |  	VMOVU	(%rdi, %rdx), %XMM0 | ||||||
|  | -	VMOVU	(%rsi, %rdx), %XMM1
 | ||||||
|  | -
 | ||||||
|  | -	/* Each bit in K0 represents a mismatch in XMM0 and XMM1.  */
 | ||||||
|  | -	VPCMP	$4, %XMM0, %XMM1, %k0
 | ||||||
|  | -	VPCMP	$0, %XMMZERO, %XMM0, %k1
 | ||||||
|  | -	VPCMP	$0, %XMMZERO, %XMM1, %k2
 | ||||||
|  | -	/* Each bit in K1 represents a NULL in XMM0 or XMM1.  */
 | ||||||
|  | -	korw	%k1, %k2, %k1
 | ||||||
|  | -	/* Each bit in K1 represents a NULL or a mismatch.  */
 | ||||||
|  | -	korw	%k0, %k1, %k1
 | ||||||
|  | -	kmovw	%k1, %ecx
 | ||||||
|  | -	testl	%ecx, %ecx
 | ||||||
|  | +
 | ||||||
|  | +	VPTESTM	%YMM0, %YMM0, %k2
 | ||||||
|  | +	/* Each bit cleared in K1 represents a mismatch or a null CHAR
 | ||||||
|  | +	   in XMM0 and 16 bytes at (%rsi, %rdx).  */
 | ||||||
|  | +	VPCMP	$0, (%rsi, %rdx), %XMM0, %k1{%k2}
 | ||||||
|  | +	kmovd	%k1, %ecx
 | ||||||
|  | +# ifdef USE_AS_WCSCMP
 | ||||||
|  | +	subl	$0xf, %ecx
 | ||||||
|  | +# else
 | ||||||
|  | +	subl	$0xffff, %ecx
 | ||||||
|  | +# endif
 | ||||||
|  |  	jne	L(last_vector) | ||||||
|  |   | ||||||
|  |  	addl	$16, %edx | ||||||
|  | @@ -965,25 +1008,16 @@ L(cross_page_1_xmm):
 | ||||||
|  |  	vmovq	(%rdi, %rdx), %XMM0 | ||||||
|  |  	vmovq	(%rsi, %rdx), %XMM1 | ||||||
|  |   | ||||||
|  | -	/* Each bit in K0 represents a mismatch in XMM0 and XMM1.  */
 | ||||||
|  | -	VPCMP	$4, %XMM0, %XMM1, %k0
 | ||||||
|  | -	VPCMP	$0, %XMMZERO, %XMM0, %k1
 | ||||||
|  | -	VPCMP	$0, %XMMZERO, %XMM1, %k2
 | ||||||
|  | -	/* Each bit in K1 represents a NULL in XMM0 or XMM1.  */
 | ||||||
|  | -	kord	%k1, %k2, %k1
 | ||||||
|  | -	/* Each bit in K1 represents a NULL or a mismatch.  */
 | ||||||
|  | -	kord	%k0, %k1, %k1
 | ||||||
|  | -	kmovd	%k1, %ecx
 | ||||||
|  | -
 | ||||||
|  | +	VPTESTM	%YMM0, %YMM0, %k2
 | ||||||
|  | +	/* Each bit cleared in K1 represents a mismatch or a null CHAR
 | ||||||
|  | +	   in XMM0 and XMM1.  */
 | ||||||
|  | +	VPCMP	$0, %XMM1, %XMM0, %k1{%k2}
 | ||||||
|  | +	kmovb	%k1, %ecx
 | ||||||
|  |  # ifdef USE_AS_WCSCMP | ||||||
|  | -	/* Only last 2 bits are valid.  */
 | ||||||
|  | -	andl	$0x3, %ecx
 | ||||||
|  | +	subl	$0x3, %ecx
 | ||||||
|  |  # else | ||||||
|  | -	/* Only last 8 bits are valid.  */
 | ||||||
|  | -	andl	$0xff, %ecx
 | ||||||
|  | +	subl	$0xff, %ecx
 | ||||||
|  |  # endif | ||||||
|  | -
 | ||||||
|  | -	testl	%ecx, %ecx
 | ||||||
|  |  	jne	L(last_vector) | ||||||
|  |   | ||||||
|  |  	addl	$8, %edx | ||||||
|  | @@ -1002,25 +1036,16 @@ L(cross_page_8bytes):
 | ||||||
|  |  	vmovd	(%rdi, %rdx), %XMM0 | ||||||
|  |  	vmovd	(%rsi, %rdx), %XMM1 | ||||||
|  |   | ||||||
|  | -	/* Each bit in K0 represents a mismatch in XMM0 and XMM1.  */
 | ||||||
|  | -	VPCMP	$4, %XMM0, %XMM1, %k0
 | ||||||
|  | -	VPCMP	$0, %XMMZERO, %XMM0, %k1
 | ||||||
|  | -	VPCMP	$0, %XMMZERO, %XMM1, %k2
 | ||||||
|  | -	/* Each bit in K1 represents a NULL in XMM0 or XMM1.  */
 | ||||||
|  | -	kord	%k1, %k2, %k1
 | ||||||
|  | -	/* Each bit in K1 represents a NULL or a mismatch.  */
 | ||||||
|  | -	kord	%k0, %k1, %k1
 | ||||||
|  | +	VPTESTM	%YMM0, %YMM0, %k2
 | ||||||
|  | +	/* Each bit cleared in K1 represents a mismatch or a null CHAR
 | ||||||
|  | +	   in XMM0 and XMM1.  */
 | ||||||
|  | +	VPCMP	$0, %XMM1, %XMM0, %k1{%k2}
 | ||||||
|  |  	kmovd	%k1, %ecx | ||||||
|  | -
 | ||||||
|  |  # ifdef USE_AS_WCSCMP | ||||||
|  | -	/* Only the last bit is valid.  */
 | ||||||
|  | -	andl	$0x1, %ecx
 | ||||||
|  | +	subl	$0x1, %ecx
 | ||||||
|  |  # else | ||||||
|  | -	/* Only last 4 bits are valid.  */
 | ||||||
|  | -	andl	$0xf, %ecx
 | ||||||
|  | +	subl	$0xf, %ecx
 | ||||||
|  |  # endif | ||||||
|  | -
 | ||||||
|  | -	testl	%ecx, %ecx
 | ||||||
|  |  	jne	L(last_vector) | ||||||
|  |   | ||||||
|  |  	addl	$4, %edx | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
300	SOURCES/glibc-RHEL-15696-6.patch	Normal file
							| @ -0,0 +1,300 @@ | |||||||
|  | From ee915088a0231cd421054dbd8abab7aadf331153 Mon Sep 17 00:00:00 2001 | ||||||
|  | From: "H.J. Lu" <hjl.tools@gmail.com> | ||||||
|  | Date: Mon, 21 Jan 2019 11:33:52 -0800 | ||||||
|  | Subject: [PATCH] x86-64 strncmp family: Properly handle the length parameter | ||||||
|  |  [BZ# 24097] | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | On x32, the size_t parameter may be passed in the lower 32 bits of a | ||||||
|  | 64-bit register with the non-zero upper 32 bits.  The string/memory | ||||||
|  | functions written in assembly can only use the lower 32 bits of a | ||||||
|  | 64-bit register as length or must clear the upper 32 bits before using | ||||||
|  | the full 64-bit register for length. | ||||||
|  | 
 | ||||||
|  | This patch fixes the strncmp family for x32.  Tested on x86-64 and x32. | ||||||
|  | On x86-64, libc.so is the same with and without the fix. | ||||||
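
Stated in scalar terms: with the RDX_LP macros the length is read through
%edx on x32 (writes to a 32-bit register also clear the upper half) and
through %rdx on x86-64, so garbage in the upper 32 bits can never become part
of the count. A small C sketch of the rule (illustrative only, not glibc
code):

#include <stdint.h>

/* What the callee may use as the length when the register arrives as,
   say, 0xdeadbeef00000008 from an x32 caller.  */
static uint64_t
effective_length (uint64_t raw_reg, int is_x32)
{
  return is_x32 ? (uint32_t) raw_reg  /* %edx: low 32 bits only */
                : raw_reg;            /* %rdx: full register    */
}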
|  | 
 | ||||||
|  | 	[BZ# 24097] | ||||||
|  | 	CVE-2019-6488 | ||||||
|  | 	* sysdeps/x86_64/multiarch/strcmp-avx2.S: Use RDX_LP for length. | ||||||
|  | 	* sysdeps/x86_64/multiarch/strcmp-sse42.S: Likewise. | ||||||
|  | 	* sysdeps/x86_64/strcmp.S: Likewise. | ||||||
|  | 	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-strncasecmp, | ||||||
|  | 	tst-size_t-strncmp and tst-size_t-wcsncmp. | ||||||
|  | 	* sysdeps/x86_64/x32/tst-size_t-strncasecmp.c: New file. | ||||||
|  | 	* sysdeps/x86_64/x32/tst-size_t-strncmp.c: Likewise. | ||||||
|  | 	* sysdeps/x86_64/x32/tst-size_t-wcsncmp.c: Likewise. | ||||||
|  | ---
 | ||||||
|  |  sysdeps/x86_64/multiarch/strcmp-avx2.S      |  6 +- | ||||||
|  |  sysdeps/x86_64/multiarch/strcmp-sse42.S     |  6 +- | ||||||
|  |  sysdeps/x86_64/strcmp.S                     |  6 +- | ||||||
|  |  sysdeps/x86_64/x32/Makefile                 |  6 +- | ||||||
|  |  sysdeps/x86_64/x32/tst-size_t-strncasecmp.c | 59 ++++++++++++++++ | ||||||
|  |  sysdeps/x86_64/x32/tst-size_t-strncmp.c     | 78 +++++++++++++++++++++ | ||||||
|  |  sysdeps/x86_64/x32/tst-size_t-wcsncmp.c     | 20 ++++++ | ||||||
|  |  7 files changed, 170 insertions(+), 11 deletions(-) | ||||||
|  |  create mode 100644 sysdeps/x86_64/x32/tst-size_t-strncasecmp.c | ||||||
|  |  create mode 100644 sysdeps/x86_64/x32/tst-size_t-strncmp.c | ||||||
|  |  create mode 100644 sysdeps/x86_64/x32/tst-size_t-wcsncmp.c | ||||||
|  | 
 | ||||||
|  | Conflicts: | ||||||
|  | 	ChangeLog | ||||||
|  | 	(removed) | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
 | ||||||
|  | index 327e3d87..156c1949 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
 | ||||||
|  | @@ -79,15 +79,15 @@
 | ||||||
|  |  ENTRY (STRCMP) | ||||||
|  |  # ifdef USE_AS_STRNCMP | ||||||
|  |  	/* Check for simple cases (0 or 1) in offset.  */ | ||||||
|  | -	cmp	$1, %rdx
 | ||||||
|  | +	cmp	$1, %RDX_LP
 | ||||||
|  |  	je	L(char0) | ||||||
|  |  	jb	L(zero) | ||||||
|  |  #  ifdef USE_AS_WCSCMP | ||||||
|  |  	/* Convert units: from wide to byte char.  */ | ||||||
|  | -	shl	$2, %rdx
 | ||||||
|  | +	shl	$2, %RDX_LP
 | ||||||
|  |  #  endif | ||||||
|  |  	/* Register %r11 tracks the maximum offset.  */ | ||||||
|  | -	movq	%rdx, %r11
 | ||||||
|  | +	mov	%RDX_LP, %R11_LP
 | ||||||
|  |  # endif | ||||||
|  |  	movl	%edi, %eax | ||||||
|  |  	xorl	%edx, %edx | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
 | ||||||
|  | index d3c07bd2..a1ebea46 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S
 | ||||||
|  | @@ -156,11 +156,11 @@ STRCMP_SSE42:
 | ||||||
|  |  #endif | ||||||
|  |   | ||||||
|  |  #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | ||||||
|  | -	test	%rdx, %rdx
 | ||||||
|  | +	test	%RDX_LP, %RDX_LP
 | ||||||
|  |  	je	LABEL(strcmp_exitz) | ||||||
|  | -	cmp	$1, %rdx
 | ||||||
|  | +	cmp	$1, %RDX_LP
 | ||||||
|  |  	je	LABEL(Byte0) | ||||||
|  | -	mov	%rdx, %r11
 | ||||||
|  | +	mov	%RDX_LP, %R11_LP
 | ||||||
|  |  #endif | ||||||
|  |  	mov	%esi, %ecx | ||||||
|  |  	mov	%edi, %eax | ||||||
|  | diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
 | ||||||
|  | index e16945b9..f47c8ad4 100644
 | ||||||
|  | --- a/sysdeps/x86_64/strcmp.S
 | ||||||
|  | +++ b/sysdeps/x86_64/strcmp.S
 | ||||||
|  | @@ -135,11 +135,11 @@ ENTRY (STRCMP)
 | ||||||
|  |   * This implementation uses SSE to compare up to 16 bytes at a time. | ||||||
|  |   */ | ||||||
|  |  #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L | ||||||
|  | -	test	%rdx, %rdx
 | ||||||
|  | +	test	%RDX_LP, %RDX_LP
 | ||||||
|  |  	je	LABEL(strcmp_exitz) | ||||||
|  | -	cmp	$1, %rdx
 | ||||||
|  | +	cmp	$1, %RDX_LP
 | ||||||
|  |  	je	LABEL(Byte0) | ||||||
|  | -	mov	%rdx, %r11
 | ||||||
|  | +	mov	%RDX_LP, %R11_LP
 | ||||||
|  |  #endif | ||||||
|  |  	mov	%esi, %ecx | ||||||
|  |  	mov	%edi, %eax | ||||||
|  | diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
 | ||||||
|  | index 98bd9ae9..db302839 100644
 | ||||||
|  | --- a/sysdeps/x86_64/x32/Makefile
 | ||||||
|  | +++ b/sysdeps/x86_64/x32/Makefile
 | ||||||
|  | @@ -7,9 +7,11 @@ endif
 | ||||||
|  |   | ||||||
|  |  ifeq ($(subdir),string) | ||||||
|  |  tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \ | ||||||
|  | -	 tst-size_t-memrchr tst-size_t-memset
 | ||||||
|  | +	 tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \
 | ||||||
|  | +	 tst-size_t-strncmp
 | ||||||
|  |  endif | ||||||
|  |   | ||||||
|  |  ifeq ($(subdir),wcsmbs) | ||||||
|  | -tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset
 | ||||||
|  | +tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset \
 | ||||||
|  | +	 tst-size_t-wcsncmp
 | ||||||
|  |  endif | ||||||
|  | diff --git a/sysdeps/x86_64/x32/tst-size_t-strncasecmp.c b/sysdeps/x86_64/x32/tst-size_t-strncasecmp.c
 | ||||||
|  | new file mode 100644 | ||||||
|  | index 00000000..86233593
 | ||||||
|  | --- /dev/null
 | ||||||
|  | +++ b/sysdeps/x86_64/x32/tst-size_t-strncasecmp.c
 | ||||||
|  | @@ -0,0 +1,59 @@
 | ||||||
|  | +/* Test strncasecmp with size_t in the lower 32 bits of 64-bit register.
 | ||||||
|  | +   Copyright (C) 2019 Free Software Foundation, Inc.
 | ||||||
|  | +   This file is part of the GNU C Library.
 | ||||||
|  | +
 | ||||||
|  | +   The GNU C Library is free software; you can redistribute it and/or
 | ||||||
|  | +   modify it under the terms of the GNU Lesser General Public
 | ||||||
|  | +   License as published by the Free Software Foundation; either
 | ||||||
|  | +   version 2.1 of the License, or (at your option) any later version.
 | ||||||
|  | +
 | ||||||
|  | +   The GNU C Library is distributed in the hope that it will be useful,
 | ||||||
|  | +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 | ||||||
|  | +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 | ||||||
|  | +   Lesser General Public License for more details.
 | ||||||
|  | +
 | ||||||
|  | +   You should have received a copy of the GNU Lesser General Public
 | ||||||
|  | +   License along with the GNU C Library; if not, see
 | ||||||
|  | +   <http://www.gnu.org/licenses/>.  */
 | ||||||
|  | +
 | ||||||
|  | +#define TEST_NAME "strncasecmp"
 | ||||||
|  | +#include "test-size_t.h"
 | ||||||
|  | +
 | ||||||
|  | +IMPL (strncasecmp, 1)
 | ||||||
|  | +
 | ||||||
|  | +typedef int (*proto_t) (const char *, const char *, size_t);
 | ||||||
|  | +
 | ||||||
|  | +static int
 | ||||||
|  | +__attribute__ ((noinline, noclone))
 | ||||||
|  | +do_strncasecmp (parameter_t a, parameter_t b)
 | ||||||
|  | +{
 | ||||||
|  | +  return CALL (&b, a.p, b.p, a.len);
 | ||||||
|  | +}
 | ||||||
|  | +
 | ||||||
|  | +static int
 | ||||||
|  | +test_main (void)
 | ||||||
|  | +{
 | ||||||
|  | +  test_init ();
 | ||||||
|  | +
 | ||||||
|  | +  parameter_t dest = { { page_size }, buf1 };
 | ||||||
|  | +  parameter_t src = { { 0 }, buf2 };
 | ||||||
|  | +
 | ||||||
|  | +  strncpy ((char *) buf1, (const char *) buf2, page_size);
 | ||||||
|  | +
 | ||||||
|  | +  int ret = 0;
 | ||||||
|  | +  FOR_EACH_IMPL (impl, 0)
 | ||||||
|  | +    {
 | ||||||
|  | +      src.fn = impl->fn;
 | ||||||
|  | +      int res = do_strncasecmp (dest, src);
 | ||||||
|  | +      if (res)
 | ||||||
|  | +	{
 | ||||||
|  | +	  error (0, 0, "Wrong result in function %s: %i != 0",
 | ||||||
|  | +		 impl->name, res);
 | ||||||
|  | +	  ret = 1;
 | ||||||
|  | +	}
 | ||||||
|  | +    }
 | ||||||
|  | +
 | ||||||
|  | +  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
 | ||||||
|  | +}
 | ||||||
|  | +
 | ||||||
|  | +#include <support/test-driver.c>
 | ||||||
|  | diff --git a/sysdeps/x86_64/x32/tst-size_t-strncmp.c b/sysdeps/x86_64/x32/tst-size_t-strncmp.c
 | ||||||
|  | new file mode 100644 | ||||||
|  | index 00000000..54e6bd83
 | ||||||
|  | --- /dev/null
 | ||||||
|  | +++ b/sysdeps/x86_64/x32/tst-size_t-strncmp.c
 | ||||||
|  | @@ -0,0 +1,78 @@
 | ||||||
|  | +/* Test strncmp with size_t in the lower 32 bits of 64-bit register.
 | ||||||
|  | +   Copyright (C) 2019 Free Software Foundation, Inc.
 | ||||||
|  | +   This file is part of the GNU C Library.
 | ||||||
|  | +
 | ||||||
|  | +   The GNU C Library is free software; you can redistribute it and/or
 | ||||||
|  | +   modify it under the terms of the GNU Lesser General Public
 | ||||||
|  | +   License as published by the Free Software Foundation; either
 | ||||||
|  | +   version 2.1 of the License, or (at your option) any later version.
 | ||||||
|  | +
 | ||||||
|  | +   The GNU C Library is distributed in the hope that it will be useful,
 | ||||||
|  | +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 | ||||||
|  | +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 | ||||||
|  | +   Lesser General Public License for more details.
 | ||||||
|  | +
 | ||||||
|  | +   You should have received a copy of the GNU Lesser General Public
 | ||||||
|  | +   License along with the GNU C Library; if not, see
 | ||||||
|  | +   <http://www.gnu.org/licenses/>.  */
 | ||||||
|  | +
 | ||||||
|  | +#ifdef WIDE
 | ||||||
|  | +# define TEST_NAME "wcsncmp"
 | ||||||
|  | +#else
 | ||||||
|  | +# define TEST_NAME "strncmp"
 | ||||||
|  | +#endif
 | ||||||
|  | +
 | ||||||
|  | +#include "test-size_t.h"
 | ||||||
|  | +
 | ||||||
|  | +#ifdef WIDE
 | ||||||
|  | +# include <wchar.h>
 | ||||||
|  | +
 | ||||||
|  | +# define STRNCMP wcsncmp
 | ||||||
|  | +# define STRNCPY wcsncpy
 | ||||||
|  | +# define CHAR wchar_t
 | ||||||
|  | +#else
 | ||||||
|  | +# define STRNCMP strncmp
 | ||||||
|  | +# define STRNCPY strncpy
 | ||||||
|  | +# define CHAR char
 | ||||||
|  | +#endif
 | ||||||
|  | +
 | ||||||
|  | +IMPL (STRNCMP, 1)
 | ||||||
|  | +
 | ||||||
|  | +typedef int (*proto_t) (const CHAR *, const CHAR *, size_t);
 | ||||||
|  | +
 | ||||||
|  | +
 | ||||||
|  | +static int
 | ||||||
|  | +__attribute__ ((noinline, noclone))
 | ||||||
|  | +do_strncmp (parameter_t a, parameter_t b)
 | ||||||
|  | +{
 | ||||||
|  | +  return CALL (&b, a.p, b.p, a.len);
 | ||||||
|  | +}
 | ||||||
|  | +
 | ||||||
|  | +static int
 | ||||||
|  | +test_main (void)
 | ||||||
|  | +{
 | ||||||
|  | +  test_init ();
 | ||||||
|  | +
 | ||||||
|  | +  size_t size = page_size / sizeof (CHAR);
 | ||||||
|  | +  parameter_t dest = { { size }, buf1 };
 | ||||||
|  | +  parameter_t src = { { 0 }, buf2 };
 | ||||||
|  | +
 | ||||||
|  | +  STRNCPY ((CHAR *) buf1, (const CHAR *) buf2, size);
 | ||||||
|  | +
 | ||||||
|  | +  int ret = 0;
 | ||||||
|  | +  FOR_EACH_IMPL (impl, 0)
 | ||||||
|  | +    {
 | ||||||
|  | +      src.fn = impl->fn;
 | ||||||
|  | +      int res = do_strncmp (dest, src);
 | ||||||
|  | +      if (res)
 | ||||||
|  | +	{
 | ||||||
|  | +	  error (0, 0, "Wrong result in function %s: %i != 0",
 | ||||||
|  | +		 impl->name, res);
 | ||||||
|  | +	  ret = 1;
 | ||||||
|  | +	}
 | ||||||
|  | +    }
 | ||||||
|  | +
 | ||||||
|  | +  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
 | ||||||
|  | +}
 | ||||||
|  | +
 | ||||||
|  | +#include <support/test-driver.c>
 | ||||||
|  | diff --git a/sysdeps/x86_64/x32/tst-size_t-wcsncmp.c b/sysdeps/x86_64/x32/tst-size_t-wcsncmp.c
 | ||||||
|  | new file mode 100644 | ||||||
|  | index 00000000..4829647c
 | ||||||
|  | --- /dev/null
 | ||||||
|  | +++ b/sysdeps/x86_64/x32/tst-size_t-wcsncmp.c
 | ||||||
|  | @@ -0,0 +1,20 @@
 | ||||||
|  | +/* Test wcsncmp with size_t in the lower 32 bits of 64-bit register.
 | ||||||
|  | +   Copyright (C) 2019 Free Software Foundation, Inc.
 | ||||||
|  | +   This file is part of the GNU C Library.
 | ||||||
|  | +
 | ||||||
|  | +   The GNU C Library is free software; you can redistribute it and/or
 | ||||||
|  | +   modify it under the terms of the GNU Lesser General Public
 | ||||||
|  | +   License as published by the Free Software Foundation; either
 | ||||||
|  | +   version 2.1 of the License, or (at your option) any later version.
 | ||||||
|  | +
 | ||||||
|  | +   The GNU C Library is distributed in the hope that it will be useful,
 | ||||||
|  | +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 | ||||||
|  | +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 | ||||||
|  | +   Lesser General Public License for more details.
 | ||||||
|  | +
 | ||||||
|  | +   You should have received a copy of the GNU Lesser General Public
 | ||||||
|  | +   License along with the GNU C Library; if not, see
 | ||||||
|  | +   <http://www.gnu.org/licenses/>.  */
 | ||||||
|  | +
 | ||||||
|  | +#define WIDE 1
 | ||||||
|  | +#include "tst-size_t-strncmp.c"
 | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
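The class of bug fixed above can be modelled in a few lines of C.  This is a
minimal sketch, assuming a 64-bit register image whose upper half holds
leftover bits; the variable names are illustrative, not glibc's.  The RDX_LP
rewrite in the patch has the same effect as the (uint32_t) truncation below.

/* Sketch of the x32 length hazard: only the low 32 bits of the register
   carrying the size_t argument are meaningful.  */
#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  /* The 64-bit register carrying the size_t argument: real length 3
     in the low half, leftover garbage in the high half.  */
  uint64_t reg = (0xdeadbeefULL << 32) | 3;

  uint64_t buggy_len = reg;		/* uses the full 64-bit register */
  uint64_t fixed_len = (uint32_t) reg;	/* RDX_LP: low 32 bits only */

  printf ("buggy length: %llu\n", (unsigned long long) buggy_len);
  printf ("fixed length: %llu\n", (unsigned long long) fixed_len);
  return 0;
}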
54	SOURCES/glibc-RHEL-15696-60.patch	Normal file
							| @ -0,0 +1,54 @@ | |||||||
|  | From 6720d36b6623c5e48c070d86acf61198b33e144e Mon Sep 17 00:00:00 2001 | ||||||
|  | From: Fangrui Song <maskray@google.com> | ||||||
|  | Date: Tue, 2 Nov 2021 20:59:52 -0700 | ||||||
|  | Subject: [PATCH] x86-64: Replace movzx with movzbl | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | Clang cannot assemble movzx in the AT&T dialect mode. | ||||||
|  | 
 | ||||||
|  | ../sysdeps/x86_64/strcmp.S:2232:16: error: invalid operand for instruction | ||||||
|  |  movzx (%rsi), %ecx | ||||||
|  |                ^~~~ | ||||||
|  | 
 | ||||||
|  | Change movzx to movzbl, which follows the AT&T dialect and is used | ||||||
|  | elsewhere in the file. | ||||||
|  | 
 | ||||||
|  | Reviewed-by: H.J. Lu <hjl.tools@gmail.com> | ||||||
|  | ---
 | ||||||
|  |  sysdeps/x86_64/multiarch/strcmp-sse42.S | 4 ++-- | ||||||
|  |  sysdeps/x86_64/strcmp.S                 | 4 ++-- | ||||||
|  |  2 files changed, 4 insertions(+), 4 deletions(-) | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
 | ||||||
|  | index a1ebea46..d8fdeb3a 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S
 | ||||||
|  | @@ -1771,8 +1771,8 @@ LABEL(strcmp_exitz):
 | ||||||
|  |  	.p2align 4 | ||||||
|  |  	// XXX Same as code above | ||||||
|  |  LABEL(Byte0): | ||||||
|  | -	movzx	(%rsi), %ecx
 | ||||||
|  | -	movzx	(%rdi), %eax
 | ||||||
|  | +	movzbl	(%rsi), %ecx
 | ||||||
|  | +	movzbl	(%rdi), %eax
 | ||||||
|  |   | ||||||
|  |  #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L | ||||||
|  |  	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx | ||||||
|  | diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
 | ||||||
|  | index f47c8ad4..aa6df898 100644
 | ||||||
|  | --- a/sysdeps/x86_64/strcmp.S
 | ||||||
|  | +++ b/sysdeps/x86_64/strcmp.S
 | ||||||
|  | @@ -2232,8 +2232,8 @@ LABEL(strcmp_exitz):
 | ||||||
|  |   | ||||||
|  |  	.p2align 4 | ||||||
|  |  LABEL(Byte0): | ||||||
|  | -	movzx	(%rsi), %ecx
 | ||||||
|  | -	movzx	(%rdi), %eax
 | ||||||
|  | +	movzbl	(%rsi), %ecx
 | ||||||
|  | +	movzbl	(%rdi), %eax
 | ||||||
|  |   | ||||||
|  |  #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L | ||||||
|  |  	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
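For reference, a tiny standalone demo (a sketch, not glibc code) of the
movzbl spelling the patch switches to; it builds with both GCC and Clang on
x86-64, which is the point of the change.

#include <stdio.h>

int
main (void)
{
  unsigned char byte = 0xab;
  unsigned int out;

  /* Load one byte and zero-extend it to 32 bits.  The explicit movzbl
     mnemonic is unambiguous in AT&T syntax, so both GCC and Clang
     assemble it.  */
  __asm__ ("movzbl %1, %0" : "=r" (out) : "m" (byte));

  printf ("0x%x\n", out);	/* prints 0xab */
  return 0;
}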
56	SOURCES/glibc-RHEL-15696-61.patch	Normal file
							| @ -0,0 +1,56 @@ | |||||||
|  | From cf2c57526ba4b57e6863ad4db8a868e2678adce8 Mon Sep 17 00:00:00 2001 | ||||||
|  | From: "H.J. Lu" <hjl.tools@gmail.com> | ||||||
|  | Date: Fri, 30 Apr 2021 05:58:59 -0700 | ||||||
|  | Subject: [PATCH] x86: Set rep_movsb_threshold to 2112 on processors with FSRM | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | The glibc memcpy benchmark on Intel Core i7-1065G7 (Ice Lake) showed | ||||||
|  | that REP MOVSB became faster after 2112 bytes: | ||||||
|  | 
 | ||||||
|  |                                       Vector Move       REP MOVSB | ||||||
|  | length=2112, align1=0, align2=0:        24.20             24.40 | ||||||
|  | length=2112, align1=1, align2=0:        26.07             23.13 | ||||||
|  | length=2112, align1=0, align2=1:        27.18             28.13 | ||||||
|  | length=2112, align1=1, align2=1:        26.23             25.16 | ||||||
|  | length=2176, align1=0, align2=0:        23.18             22.52 | ||||||
|  | length=2176, align1=2, align2=0:        25.45             22.52 | ||||||
|  | length=2176, align1=0, align2=2:        27.14             27.82 | ||||||
|  | length=2176, align1=2, align2=2:        22.73             25.56 | ||||||
|  | length=2240, align1=0, align2=0:        24.62             24.25 | ||||||
|  | length=2240, align1=3, align2=0:        29.77             27.15 | ||||||
|  | length=2240, align1=0, align2=3:        35.55             29.93 | ||||||
|  | length=2240, align1=3, align2=3:        34.49             25.15 | ||||||
|  | length=2304, align1=0, align2=0:        34.75             26.64 | ||||||
|  | length=2304, align1=4, align2=0:        32.09             22.63 | ||||||
|  | length=2304, align1=0, align2=4:        28.43             31.24 | ||||||
|  | 
 | ||||||
|  | Use REP MOVSB for data size > 2112 bytes in memcpy on processors with | ||||||
|  | fast short REP MOVSB (FSRM). | ||||||
|  | 
 | ||||||
|  | 	* sysdeps/x86/dl-cacheinfo.h (dl_init_cacheinfo): Set | ||||||
|  | 	rep_movsb_threshold to 2112 on processors with fast short REP | ||||||
|  | 	MOVSB (FSRM). | ||||||
|  | ---
 | ||||||
|  |  sysdeps/x86/cacheinfo.h | 6 ++++++ | ||||||
|  |  1 file changed, 6 insertions(+) | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
 | ||||||
|  | index f72f634a..cc3941d3 100644
 | ||||||
|  | --- a/sysdeps/x86/cacheinfo.h
 | ||||||
|  | +++ b/sysdeps/x86/cacheinfo.h
 | ||||||
|  | @@ -430,6 +430,12 @@ init_cacheinfo (void)
 | ||||||
|  |        rep_movsb_threshold = 2048 * (16 / 16); | ||||||
|  |        minimum_rep_movsb_threshold = 16 * 8; | ||||||
|  |      } | ||||||
|  | +
 | ||||||
|  | +  /* NB: The default REP MOVSB threshold is 2112 on processors with fast
 | ||||||
|  | +     short REP MOVSB (FSRM).  */
 | ||||||
|  | +  if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
 | ||||||
|  | +    rep_movsb_threshold = 2112;
 | ||||||
|  | +
 | ||||||
|  |    if (cpu_features->rep_movsb_threshold > minimum_rep_movsb_threshold) | ||||||
|  |      __x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold; | ||||||
|  |    else | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
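Paraphrased in C, the selection logic after this patch looks roughly like the
sketch below.  The helper name and flag parameters are illustrative stand-ins
for the CPU_FEATURE checks in init_cacheinfo.

#include <stdio.h>

/* Rough C paraphrase of the REP MOVSB threshold selection after this
   patch; not glibc's actual code.  */
static unsigned int
pick_rep_movsb_threshold (int has_avx512, int has_avx, int has_fsrm)
{
  unsigned int threshold;
  if (has_avx512)
    threshold = 2048 * (64 / 16);	/* 64-byte vectors */
  else if (has_avx)
    threshold = 2048 * (32 / 16);	/* 32-byte vectors */
  else
    threshold = 2048 * (16 / 16);	/* 16-byte vectors */
  /* FSRM parts handle short REP MOVSB well, so a flat 2112 wins.  */
  if (has_fsrm)
    threshold = 2112;
  return threshold;
}

int
main (void)
{
  printf ("%u\n", pick_rep_movsb_threshold (0, 1, 1));	/* prints 2112 */
  return 0;
}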
136	SOURCES/glibc-RHEL-15696-62.patch	Normal file
							| @ -0,0 +1,136 @@ | |||||||
|  | From 475b63702ef38b69558fc3d31a0b66776a70f1d3 Mon Sep 17 00:00:00 2001 | ||||||
|  | From: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  | Date: Mon, 1 Nov 2021 00:49:52 -0500 | ||||||
|  | Subject: [PATCH] x86: Double size of ERMS rep_movsb_threshold in | ||||||
|  |  dl-cacheinfo.h | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | No bug. | ||||||
|  | 
 | ||||||
|  | This patch doubles the rep_movsb_threshold when using ERMS.  Based on | ||||||
|  | benchmarks, the vector copy loop, especially now that it handles 4k | ||||||
|  | aliasing, is better for these medium-sized ranges. | ||||||
|  | 
 | ||||||
|  | On Skylake with ERMS: | ||||||
|  | 
 | ||||||
|  | Size,   Align1, Align2, dst>src,(rep movsb) / (vec copy) | ||||||
|  | 4096,   0,      0,      0,      0.975 | ||||||
|  | 4096,   0,      0,      1,      0.953 | ||||||
|  | 4096,   12,     0,      0,      0.969 | ||||||
|  | 4096,   12,     0,      1,      0.872 | ||||||
|  | 4096,   44,     0,      0,      0.979 | ||||||
|  | 4096,   44,     0,      1,      0.83 | ||||||
|  | 4096,   0,      12,     0,      1.006 | ||||||
|  | 4096,   0,      12,     1,      0.989 | ||||||
|  | 4096,   0,      44,     0,      0.739 | ||||||
|  | 4096,   0,      44,     1,      0.942 | ||||||
|  | 4096,   12,     12,     0,      1.009 | ||||||
|  | 4096,   12,     12,     1,      0.973 | ||||||
|  | 4096,   44,     44,     0,      0.791 | ||||||
|  | 4096,   44,     44,     1,      0.961 | ||||||
|  | 4096,   2048,   0,      0,      0.978 | ||||||
|  | 4096,   2048,   0,      1,      0.951 | ||||||
|  | 4096,   2060,   0,      0,      0.986 | ||||||
|  | 4096,   2060,   0,      1,      0.963 | ||||||
|  | 4096,   2048,   12,     0,      0.971 | ||||||
|  | 4096,   2048,   12,     1,      0.941 | ||||||
|  | 4096,   2060,   12,     0,      0.977 | ||||||
|  | 4096,   2060,   12,     1,      0.949 | ||||||
|  | 8192,   0,      0,      0,      0.85 | ||||||
|  | 8192,   0,      0,      1,      0.845 | ||||||
|  | 8192,   13,     0,      0,      0.937 | ||||||
|  | 8192,   13,     0,      1,      0.939 | ||||||
|  | 8192,   45,     0,      0,      0.932 | ||||||
|  | 8192,   45,     0,      1,      0.927 | ||||||
|  | 8192,   0,      13,     0,      0.621 | ||||||
|  | 8192,   0,      13,     1,      0.62 | ||||||
|  | 8192,   0,      45,     0,      0.53 | ||||||
|  | 8192,   0,      45,     1,      0.516 | ||||||
|  | 8192,   13,     13,     0,      0.664 | ||||||
|  | 8192,   13,     13,     1,      0.659 | ||||||
|  | 8192,   45,     45,     0,      0.593 | ||||||
|  | 8192,   45,     45,     1,      0.575 | ||||||
|  | 8192,   2048,   0,      0,      0.854 | ||||||
|  | 8192,   2048,   0,      1,      0.834 | ||||||
|  | 8192,   2061,   0,      0,      0.863 | ||||||
|  | 8192,   2061,   0,      1,      0.857 | ||||||
|  | 8192,   2048,   13,     0,      0.63 | ||||||
|  | 8192,   2048,   13,     1,      0.629 | ||||||
|  | 8192,   2061,   13,     0,      0.627 | ||||||
|  | 8192,   2061,   13,     1,      0.62 | ||||||
|  | 
 | ||||||
|  | Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  | Reviewed-by: H.J. Lu <hjl.tools@gmail.com> | ||||||
|  | ---
 | ||||||
|  |  sysdeps/x86/cacheinfo.h      |  8 +++++--- | ||||||
|  |  sysdeps/x86/dl-tunables.list | 26 +++++++++++++++----------- | ||||||
|  |  2 files changed, 20 insertions(+), 14 deletions(-) | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
 | ||||||
|  | index cc3941d3..ac025e08 100644
 | ||||||
|  | --- a/sysdeps/x86/cacheinfo.h
 | ||||||
|  | +++ b/sysdeps/x86/cacheinfo.h
 | ||||||
|  | @@ -411,18 +411,20 @@ init_cacheinfo (void)
 | ||||||
|  |   | ||||||
|  |    /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8.  */ | ||||||
|  |    unsigned int minimum_rep_movsb_threshold; | ||||||
|  | -  /* NB: The default REP MOVSB threshold is 2048 * (VEC_SIZE / 16).  */
 | ||||||
|  | +  /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16) for
 | ||||||
|  | +     VEC_SIZE == 64 or 32.  For VEC_SIZE == 16, the default REP MOVSB
 | ||||||
|  | +     threshold is 2048 * (VEC_SIZE / 16).  */
 | ||||||
|  |    unsigned int rep_movsb_threshold; | ||||||
|  |    if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F) | ||||||
|  |        && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512)) | ||||||
|  |      { | ||||||
|  | -      rep_movsb_threshold = 2048 * (64 / 16);
 | ||||||
|  | +      rep_movsb_threshold = 4096 * (64 / 16);
 | ||||||
|  |        minimum_rep_movsb_threshold = 64 * 8; | ||||||
|  |      } | ||||||
|  |    else if (CPU_FEATURE_PREFERRED_P (cpu_features, | ||||||
|  |  				    AVX_Fast_Unaligned_Load)) | ||||||
|  |      { | ||||||
|  | -      rep_movsb_threshold = 2048 * (32 / 16);
 | ||||||
|  | +      rep_movsb_threshold = 4096 * (32 / 16);
 | ||||||
|  |        minimum_rep_movsb_threshold = 32 * 8; | ||||||
|  |      } | ||||||
|  |    else | ||||||
|  | diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list
 | ||||||
|  | index 89bf2966..56c6834a 100644
 | ||||||
|  | --- a/sysdeps/x86/dl-tunables.list
 | ||||||
|  | +++ b/sysdeps/x86/dl-tunables.list
 | ||||||
|  | @@ -32,17 +32,21 @@ glibc {
 | ||||||
|  |      } | ||||||
|  |      x86_rep_movsb_threshold { | ||||||
|  |        type: SIZE_T | ||||||
|  | -      # Since there is overhead to set up REP MOVSB operation, REP MOVSB
 | ||||||
|  | -      # isn't faster on short data.  The memcpy micro benchmark in glibc
 | ||||||
|  | -      # shows that 2KB is the approximate value above which REP MOVSB
 | ||||||
|  | -      # becomes faster than SSE2 optimization on processors with Enhanced
 | ||||||
|  | -      # REP MOVSB.  Since larger register size can move more data with a
 | ||||||
|  | -      # single load and store, the threshold is higher with larger register
 | ||||||
|  | -      # size.  Note: Since the REP MOVSB threshold must be greater than 8
 | ||||||
|  | -      # times of vector size and the default value is 2048 * (vector size
 | ||||||
|  | -      # / 16), the default value and the minimum value must be updated at
 | ||||||
|  | -      # run-time.  NB: Don't set the default value since we can't tell if
 | ||||||
|  | -      # the tunable value is set by user or not [BZ #27069].
 | ||||||
|  | +      # Since there is overhead to set up REP MOVSB operation, REP
 | ||||||
|  | +      # MOVSB isn't faster on short data.  The memcpy micro benchmark
 | ||||||
|  | +      # in glibc shows that 2KB is the approximate value above which
 | ||||||
|  | +      # REP MOVSB becomes faster than SSE2 optimization on processors
 | ||||||
|  | +      # with Enhanced REP MOVSB.  Since larger register size can move
 | ||||||
|  | +      # more data with a single load and store, the threshold is
 | ||||||
|  | +      # higher with larger register size.  Micro benchmarks show AVX
 | ||||||
|  | +      # REP MOVSB becomes faster approximately at 8KB.  The AVX512
 | ||||||
|  | +      # threshold is extrapolated to 16KB.  For machines with FSRM the
 | ||||||
|  | +      # threshold is universally set at 2112 bytes.  Note: Since the
 | ||||||
|  | +      # REP MOVSB threshold must be greater than 8 times of vector
 | ||||||
|  | +      # size and the default value is 4096 * (vector size / 16), the
 | ||||||
|  | +      # default value and the minimum value must be updated at
 | ||||||
|  | +      # run-time.  NB: Don't set the default value since we can't tell
 | ||||||
|  | +      # if the tunable value is set by user or not [BZ #27069].
 | ||||||
|  |        minval: 1 | ||||||
|  |      } | ||||||
|  |      x86_rep_stosb_threshold { | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
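A hedged sketch of how a memmove-style dispatcher consults the doubled
threshold; the names below are illustrative, not glibc's actual dispatch
code.

#include <stdio.h>
#include <stddef.h>

/* Doubled AVX default after this patch: 4096 * (32 / 16) = 8192.  */
static size_t rep_movsb_threshold = 4096 * (32 / 16);

static int
use_rep_movsb (size_t n)
{
  /* Copies at or below the threshold stay on the vector loop.  */
  return n > rep_movsb_threshold;
}

int
main (void)
{
  printf ("4096  -> %s\n", use_rep_movsb (4096)  ? "rep movsb" : "vector loop");
  printf ("16384 -> %s\n", use_rep_movsb (16384) ? "rep movsb" : "vector loop");
  return 0;
}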
2428	SOURCES/glibc-RHEL-15696-63.patch	Normal file
	File diff suppressed because it is too large
39	SOURCES/glibc-RHEL-15696-64.patch	Normal file
							| @ -0,0 +1,39 @@ | |||||||
|  | From 0b82747dc48d5bf0871bdc6da8cb6eec1256355f Mon Sep 17 00:00:00 2001 | ||||||
|  | From: "H.J. Lu" <hjl.tools@gmail.com> | ||||||
|  | Date: Thu, 11 Nov 2021 06:31:51 -0800 | ||||||
|  | Subject: [PATCH] Avoid extra load with CAS in __pthread_mutex_lock_full [BZ | ||||||
|  |  #28537] | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | Replace boolean CAS with value CAS to avoid the extra load. | ||||||
|  | 
 | ||||||
|  | Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com> | ||||||
|  | ---
 | ||||||
|  |  nptl/pthread_mutex_lock.c | 10 +++++----- | ||||||
|  |  1 file changed, 5 insertions(+), 5 deletions(-) | ||||||
|  | 
 | ||||||
|  | diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
 | ||||||
|  | index 29cc143e..60ada70d 100644
 | ||||||
|  | --- a/nptl/pthread_mutex_lock.c
 | ||||||
|  | +++ b/nptl/pthread_mutex_lock.c
 | ||||||
|  | @@ -292,12 +292,12 @@ __pthread_mutex_lock_full (pthread_mutex_t *mutex)
 | ||||||
|  |  	     meantime.  */ | ||||||
|  |  	  if ((oldval & FUTEX_WAITERS) == 0) | ||||||
|  |  	    { | ||||||
|  | -	      if (atomic_compare_and_exchange_bool_acq (&mutex->__data.__lock,
 | ||||||
|  | -							oldval | FUTEX_WAITERS,
 | ||||||
|  | -							oldval)
 | ||||||
|  | -		  != 0)
 | ||||||
|  | +	      int val;
 | ||||||
|  | +	      if ((val = atomic_compare_and_exchange_val_acq
 | ||||||
|  | +		   (&mutex->__data.__lock, oldval | FUTEX_WAITERS,
 | ||||||
|  | +		    oldval)) != oldval)
 | ||||||
|  |  		{ | ||||||
|  | -		  oldval = mutex->__data.__lock;
 | ||||||
|  | +		  oldval = val;
 | ||||||
|  |  		  continue; | ||||||
|  |  		} | ||||||
|  |  	      oldval |= FUTEX_WAITERS; | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
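The before/after is easiest to see outside the futex machinery.  Below is a
hedged sketch using GCC's __sync builtins as stand-ins for glibc's
atomic_compare_and_exchange_bool_acq / _val_acq; FUTEX_WAITERS is the
kernel's waiter bit, the surrounding names are illustrative.

/* Boolean CAS vs. value CAS when setting the waiters bit.  */
#define FUTEX_WAITERS 0x80000000

static int lock_word;

/* Before: a failed boolean CAS reports only failure, so the retry
   path must re-load the lock word.  */
static int
set_waiters_bool (int oldval)
{
  while (!__sync_bool_compare_and_swap (&lock_word, oldval,
					oldval | FUTEX_WAITERS))
    oldval = lock_word;		/* the extra load the patch removes */
  return oldval | FUTEX_WAITERS;
}

/* After: the value CAS already returns what it observed.  */
static int
set_waiters_val (int oldval)
{
  int val;
  while ((val = __sync_val_compare_and_swap (&lock_word, oldval,
					     oldval | FUTEX_WAITERS))
	 != oldval)
    oldval = val;		/* reuse the CAS result, no re-load */
  return oldval | FUTEX_WAITERS;
}

int
main (void)
{
  set_waiters_val (lock_word);
  set_waiters_bool (lock_word);
  return 0;
}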
39	SOURCES/glibc-RHEL-15696-65.patch	Normal file
							| @ -0,0 +1,39 @@ | |||||||
|  | From 49302b8fdf9103b6fc0a398678668a22fa19574c Mon Sep 17 00:00:00 2001 | ||||||
|  | From: "H.J. Lu" <hjl.tools@gmail.com> | ||||||
|  | Date: Thu, 11 Nov 2021 06:54:01 -0800 | ||||||
|  | Subject: [PATCH] Avoid extra load with CAS in __pthread_mutex_clocklock_common | ||||||
|  |  [BZ #28537] | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | Replace boolean CAS with value CAS to avoid the extra load. | ||||||
|  | 
 | ||||||
|  | Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com> | ||||||
|  | ---
 | ||||||
|  |  nptl/pthread_mutex_timedlock.c | 10 +++++----- | ||||||
|  |  1 file changed, 5 insertions(+), 5 deletions(-) | ||||||
|  | 
 | ||||||
|  | diff --git a/nptl/pthread_mutex_timedlock.c b/nptl/pthread_mutex_timedlock.c
 | ||||||
|  | index 888c12fe..c4627ef6 100644
 | ||||||
|  | --- a/nptl/pthread_mutex_timedlock.c
 | ||||||
|  | +++ b/nptl/pthread_mutex_timedlock.c
 | ||||||
|  | @@ -269,12 +269,12 @@ __pthread_mutex_timedlock (pthread_mutex_t *mutex,
 | ||||||
|  |  	     meantime.  */ | ||||||
|  |  	  if ((oldval & FUTEX_WAITERS) == 0) | ||||||
|  |  	    { | ||||||
|  | -	      if (atomic_compare_and_exchange_bool_acq (&mutex->__data.__lock,
 | ||||||
|  | -							oldval | FUTEX_WAITERS,
 | ||||||
|  | -							oldval)
 | ||||||
|  | -		  != 0)
 | ||||||
|  | +	      int val;
 | ||||||
|  | +	      if ((val = atomic_compare_and_exchange_val_acq
 | ||||||
|  | +		   (&mutex->__data.__lock, oldval | FUTEX_WAITERS,
 | ||||||
|  | +		    oldval)) != oldval)
 | ||||||
|  |  		{ | ||||||
|  | -		  oldval = mutex->__data.__lock;
 | ||||||
|  | +		  oldval = val;
 | ||||||
|  |  		  continue; | ||||||
|  |  		} | ||||||
|  |  	      oldval |= FUTEX_WAITERS; | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
51	SOURCES/glibc-RHEL-15696-66.patch	Normal file
							| @ -0,0 +1,51 @@ | |||||||
|  | From d672a98a1af106bd68deb15576710cd61363f7a6 Mon Sep 17 00:00:00 2001 | ||||||
|  | From: "H.J. Lu" <hjl.tools@gmail.com> | ||||||
|  | Date: Tue, 2 Nov 2021 18:33:07 -0700 | ||||||
|  | Subject: [PATCH] Add LLL_MUTEX_READ_LOCK [BZ #28537] | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | CAS instruction is expensive.  From the x86 CPU's point of view, getting | ||||||
|  | a cache line for writing is more expensive than reading.  See Appendix | ||||||
|  | A.2 Spinlock in: | ||||||
|  | 
 | ||||||
|  | https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/xeon-lock-scaling-analysis-paper.pdf | ||||||
|  | 
 | ||||||
|  | The full compare and swap will grab the cache line exclusive and cause | ||||||
|  | excessive cache line bouncing. | ||||||
|  | 
 | ||||||
|  | Add LLL_MUTEX_READ_LOCK to do an atomic load and skip the CAS in the | ||||||
|  | spinlock loop when the compare is likely to fail, reducing cache line | ||||||
|  | bouncing on contended locks. | ||||||
|  | 
 | ||||||
|  | Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com> | ||||||
|  | ---
 | ||||||
|  |  nptl/pthread_mutex_lock.c | 7 +++++++ | ||||||
|  |  1 file changed, 7 insertions(+) | ||||||
|  | 
 | ||||||
|  | diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
 | ||||||
|  | index 60ada70d..eb4d8baa 100644
 | ||||||
|  | --- a/nptl/pthread_mutex_lock.c
 | ||||||
|  | +++ b/nptl/pthread_mutex_lock.c
 | ||||||
|  | @@ -56,6 +56,11 @@
 | ||||||
|  |  #define FORCE_ELISION(m, s) | ||||||
|  |  #endif | ||||||
|  |   | ||||||
|  | +#ifndef LLL_MUTEX_READ_LOCK
 | ||||||
|  | +# define LLL_MUTEX_READ_LOCK(mutex) \
 | ||||||
|  | +  atomic_load_relaxed (&(mutex)->__data.__lock)
 | ||||||
|  | +#endif
 | ||||||
|  | +
 | ||||||
|  |  static int __pthread_mutex_lock_full (pthread_mutex_t *mutex) | ||||||
|  |       __attribute_noinline__; | ||||||
|  |   | ||||||
|  | @@ -136,6 +141,8 @@ __pthread_mutex_lock (pthread_mutex_t *mutex)
 | ||||||
|  |  		  break; | ||||||
|  |  		} | ||||||
|  |  	      atomic_spin_nop (); | ||||||
|  | +	      if (LLL_MUTEX_READ_LOCK (mutex) != 0)
 | ||||||
|  | +		continue;
 | ||||||
|  |  	    } | ||||||
|  |  	  while (LLL_MUTEX_TRYLOCK (mutex) != 0); | ||||||
|  |   | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
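What the patch adds is the classic test-and-test-and-set pattern.  A
self-contained C11 sketch of the same idea (an illustration, not glibc's
lowlevellock):

#include <stdatomic.h>

static atomic_int lock;

static void
spin_lock (void)
{
  /* Exclusive attempt first; on contention, fall back to a read-only
     spin so the cache line can stay in the Shared state.  */
  while (atomic_exchange_explicit (&lock, 1, memory_order_acquire) != 0)
    while (atomic_load_explicit (&lock, memory_order_relaxed) != 0)
      ;				/* glibc also issues atomic_spin_nop here */
}

static void
spin_unlock (void)
{
  atomic_store_explicit (&lock, 0, memory_order_release);
}

int
main (void)
{
  spin_lock ();
  spin_unlock ();
  return 0;
}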
71	SOURCES/glibc-RHEL-15696-67.patch	Normal file
							| @ -0,0 +1,71 @@ | |||||||
|  | From 120ac6d238825452e8024e2f627da33b2508dfd3 Mon Sep 17 00:00:00 2001 | ||||||
|  | From: "H.J. Lu" <hjl.tools@gmail.com> | ||||||
|  | Date: Fri, 12 Nov 2021 11:47:42 -0800 | ||||||
|  | Subject: [PATCH] Move assignment out of the CAS condition | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | Update | ||||||
|  | 
 | ||||||
|  | commit 49302b8fdf9103b6fc0a398678668a22fa19574c | ||||||
|  | Author: H.J. Lu <hjl.tools@gmail.com> | ||||||
|  | Date:   Thu Nov 11 06:54:01 2021 -0800 | ||||||
|  | 
 | ||||||
|  |     Avoid extra load with CAS in __pthread_mutex_clocklock_common [BZ #28537] | ||||||
|  | 
 | ||||||
|  |     Replace boolean CAS with value CAS to avoid the extra load. | ||||||
|  | 
 | ||||||
|  | and | ||||||
|  | 
 | ||||||
|  | commit 0b82747dc48d5bf0871bdc6da8cb6eec1256355f | ||||||
|  | Author: H.J. Lu <hjl.tools@gmail.com> | ||||||
|  | Date:   Thu Nov 11 06:31:51 2021 -0800 | ||||||
|  | 
 | ||||||
|  |     Avoid extra load with CAS in __pthread_mutex_lock_full [BZ #28537] | ||||||
|  | 
 | ||||||
|  |     Replace boolean CAS with value CAS to avoid the extra load. | ||||||
|  | 
 | ||||||
|  | by moving assignment out of the CAS condition. | ||||||
|  | ---
 | ||||||
|  |  nptl/pthread_mutex_lock.c      | 7 +++---- | ||||||
|  |  nptl/pthread_mutex_timedlock.c | 7 +++---- | ||||||
|  |  2 files changed, 6 insertions(+), 8 deletions(-) | ||||||
|  | 
 | ||||||
|  | diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
 | ||||||
|  | index eb4d8baa..a633d95e 100644
 | ||||||
|  | --- a/nptl/pthread_mutex_lock.c
 | ||||||
|  | +++ b/nptl/pthread_mutex_lock.c
 | ||||||
|  | @@ -299,10 +299,9 @@ __pthread_mutex_lock_full (pthread_mutex_t *mutex)
 | ||||||
|  |  	     meantime.  */ | ||||||
|  |  	  if ((oldval & FUTEX_WAITERS) == 0) | ||||||
|  |  	    { | ||||||
|  | -	      int val;
 | ||||||
|  | -	      if ((val = atomic_compare_and_exchange_val_acq
 | ||||||
|  | -		   (&mutex->__data.__lock, oldval | FUTEX_WAITERS,
 | ||||||
|  | -		    oldval)) != oldval)
 | ||||||
|  | +	      int val = atomic_compare_and_exchange_val_acq
 | ||||||
|  | +		(&mutex->__data.__lock, oldval | FUTEX_WAITERS, oldval);
 | ||||||
|  | +	      if (val != oldval)
 | ||||||
|  |  		{ | ||||||
|  |  		  oldval = val; | ||||||
|  |  		  continue; | ||||||
|  | diff --git a/nptl/pthread_mutex_timedlock.c b/nptl/pthread_mutex_timedlock.c
 | ||||||
|  | index c4627ef6..a76c30b7 100644
 | ||||||
|  | --- a/nptl/pthread_mutex_timedlock.c
 | ||||||
|  | +++ b/nptl/pthread_mutex_timedlock.c
 | ||||||
|  | @@ -269,10 +269,9 @@ __pthread_mutex_timedlock (pthread_mutex_t *mutex,
 | ||||||
|  |  	     meantime.  */ | ||||||
|  |  	  if ((oldval & FUTEX_WAITERS) == 0) | ||||||
|  |  	    { | ||||||
|  | -	      int val;
 | ||||||
|  | -	      if ((val = atomic_compare_and_exchange_val_acq
 | ||||||
|  | -		   (&mutex->__data.__lock, oldval | FUTEX_WAITERS,
 | ||||||
|  | -		    oldval)) != oldval)
 | ||||||
|  | +	      int val = atomic_compare_and_exchange_val_acq
 | ||||||
|  | +		(&mutex->__data.__lock, oldval | FUTEX_WAITERS, oldval);
 | ||||||
|  | +	      if (val != oldval)
 | ||||||
|  |  		{ | ||||||
|  |  		  oldval = val; | ||||||
|  |  		  continue; | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
60	SOURCES/glibc-RHEL-15696-68.patch	Normal file
							| @ -0,0 +1,60 @@ | |||||||
|  | From 4df1fa6ddc8925a75f3da644d5da3bb16eb33f02 Mon Sep 17 00:00:00 2001 | ||||||
|  | From: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  | Date: Fri, 3 Dec 2021 15:29:25 -0800 | ||||||
|  | Subject: [PATCH] x86-64: Use notl in EVEX strcmp [BZ #28646] | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | Must use notl %edi here, as the lower bits are for CHAR comparisons | ||||||
|  | that are potentially out of range and thus can be 0 without indicating | ||||||
|  | a mismatch. | ||||||
|  | This fixes BZ #28646. | ||||||
|  | 
 | ||||||
|  | Co-Authored-By: H.J. Lu <hjl.tools@gmail.com> | ||||||
|  | ---
 | ||||||
|  |  sysdeps/x86_64/multiarch/strcmp-evex.S | 14 ++++++++------ | ||||||
|  |  1 file changed, 8 insertions(+), 6 deletions(-) | ||||||
|  | 
 | ||||||
|  | Conflicts: | ||||||
|  | 	string/test-strcmp.c | ||||||
|  | 	(new check omitted) | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
 | ||||||
|  | index 82f12ac8..6f5c4bf9 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/strcmp-evex.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
 | ||||||
|  | @@ -656,12 +656,13 @@ L(loop_cross_page):
 | ||||||
|  |  	   in YMM3 and 32 bytes at VEC_SIZE(%rdx, %r10).  */ | ||||||
|  |  	VPCMP	$0, VEC_SIZE(%rdx, %r10), %YMM3, %k3{%k4} | ||||||
|  |  	kmovd	%k3, %edi | ||||||
|  | +    /* Must use notl %edi here as lower bits are for CHAR
 | ||||||
|  | +	   comparisons potentially out of range thus can be 0 without
 | ||||||
|  | +	   indicating mismatch.  */
 | ||||||
|  | +	notl	%edi
 | ||||||
|  |  # ifdef USE_AS_WCSCMP | ||||||
|  |  	/* Don't use subl since it is the upper 8 bits of EDI below.  */ | ||||||
|  | -	notl	%edi
 | ||||||
|  |  	andl	$0xff, %edi | ||||||
|  | -# else
 | ||||||
|  | -	incl	%edi
 | ||||||
|  |  # endif | ||||||
|  |   | ||||||
|  |  # ifdef USE_AS_WCSCMP | ||||||
|  | @@ -743,12 +744,13 @@ L(loop_cross_page_2_vec):
 | ||||||
|  |  	   in YMM1 and 32 bytes at (VEC_SIZE * 3)(%rdx, %r10).  */ | ||||||
|  |  	VPCMP	$0, (VEC_SIZE * 3)(%rdx, %r10), %YMM1, %k3{%k4} | ||||||
|  |  	kmovd	%k3, %edi | ||||||
|  | +	/* Must use notl %edi here as lower bits are for CHAR
 | ||||||
|  | +	   comparisons potentially out of range thus can be 0 without
 | ||||||
|  | +	   indicating mismatch.  */
 | ||||||
|  | +	notl	%edi
 | ||||||
|  |  # ifdef USE_AS_WCSCMP | ||||||
|  |  	/* Don't use subl since it is the upper 8 bits of EDI below.  */ | ||||||
|  | -	notl	%edi
 | ||||||
|  |  	andl	$0xff, %edi | ||||||
|  | -# else
 | ||||||
|  | -	incl	%edi
 | ||||||
|  |  # endif | ||||||
|  |   | ||||||
|  |  # ifdef USE_AS_WCSCMP | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
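A scalar model makes the failure mode concrete.  In the sketch below
(illustrative constants, not the real mask values), match plays the role of
the kmovd result: 1 bits for in-range lanes that compared equal, 0 bits
everywhere else, including lanes excluded by the {%k4} write mask.

/* `incl' (x + 1) only maps "all matched" to zero when the register is
   all-ones, so masked-out 0 bits make it report a bogus mismatch;
   `notl' combined with the valid-lane mask does not.  */
#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  uint32_t valid = 0x0000ffff;		/* only the low 16 lanes in range */
  uint32_t match = 0x0000ffff;		/* every in-range lane matched */

  uint32_t by_incl = match + 1;		/* nonzero: false mismatch signal */
  uint32_t by_notl = ~match & valid;	/* zero: correct all-match signal */

  printf ("incl result: 0x%08x\n", by_incl);
  printf ("notl result: 0x%08x\n", by_notl);
  return 0;
}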
35	SOURCES/glibc-RHEL-15696-69.patch	Normal file
							| @ -0,0 +1,35 @@ | |||||||
|  | From ceeffe968c01b1202e482f4855cb6baf5c6cb713 Mon Sep 17 00:00:00 2001 | ||||||
|  | From: "H.J. Lu" <hjl.tools@gmail.com> | ||||||
|  | Date: Mon, 6 Dec 2021 07:14:12 -0800 | ||||||
|  | Subject: [PATCH] x86: Don't set Prefer_No_AVX512 for processors with AVX512 | ||||||
|  |  and AVX-VNNI | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | Don't set Prefer_No_AVX512 on processors with AVX512 and AVX-VNNI since | ||||||
|  | they won't lower CPU frequency when ZMM load and store instructions are | ||||||
|  | used. | ||||||
|  | ---
 | ||||||
|  |  sysdeps/x86/cpu-features.c | 7 +++++-- | ||||||
|  |  1 file changed, 5 insertions(+), 2 deletions(-) | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
 | ||||||
|  | index 956bfb4f..5ff2baa0 100644
 | ||||||
|  | --- a/sysdeps/x86/cpu-features.c
 | ||||||
|  | +++ b/sysdeps/x86/cpu-features.c
 | ||||||
|  | @@ -525,8 +525,11 @@ init_cpu_features (struct cpu_features *cpu_features)
 | ||||||
|  |  	  |= bit_arch_Prefer_No_VZEROUPPER; | ||||||
|  |        else | ||||||
|  |  	{ | ||||||
|  | -	  cpu_features->preferred[index_arch_Prefer_No_AVX512]
 | ||||||
|  | -	    |= bit_arch_Prefer_No_AVX512;
 | ||||||
|  | +	  /* Processors with AVX512 and AVX-VNNI won't lower CPU frequency
 | ||||||
|  | +	     when ZMM load and store instructions are used.  */
 | ||||||
|  | +	  if (!CPU_FEATURES_CPU_P (cpu_features, AVX_VNNI))
 | ||||||
|  | +	    cpu_features->preferred[index_arch_Prefer_No_AVX512]
 | ||||||
|  | +	      |= bit_arch_Prefer_No_AVX512;
 | ||||||
|  |   | ||||||
|  |  	  /* Avoid RTM abort triggered by VZEROUPPER inside a | ||||||
|  |  	     transactionally executing RTM region.  */ | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
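Condensed into C, the decision after this patch reads roughly as below; the
helper is hypothetical and stands in for the preferred[] bit twiddling in
init_cpu_features.

#include <stdio.h>

static int
prefer_no_avx512 (int prefer_no_vzeroupper, int has_avx_vnni)
{
  if (prefer_no_vzeroupper)
    return 0;			/* Prefer_No_VZEROUPPER is set instead */
  /* AVX512 parts that also have AVX-VNNI keep full frequency with
     ZMM loads and stores, so AVX512 stays preferred there.  */
  return !has_avx_vnni;
}

int
main (void)
{
  printf ("AVX512 + AVX-VNNI: Prefer_No_AVX512 = %d\n", prefer_no_avx512 (0, 1));
  printf ("AVX512 only:       Prefer_No_AVX512 = %d\n", prefer_no_avx512 (0, 0));
  return 0;
}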
153	SOURCES/glibc-RHEL-15696-7.patch	Normal file
							| @ -0,0 +1,153 @@ | |||||||
|  | From c7c54f65b080affb87a1513dee449c8ad6143c8b Mon Sep 17 00:00:00 2001 | ||||||
|  | From: "H.J. Lu" <hjl.tools@gmail.com> | ||||||
|  | Date: Mon, 21 Jan 2019 11:35:18 -0800 | ||||||
|  | Subject: [PATCH] x86-64 strncpy: Properly handle the length parameter [BZ# | ||||||
|  |  24097] | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | On x32, the size_t parameter may be passed in the lower 32 bits of a | ||||||
|  | 64-bit register with non-zero upper 32 bits.  The string/memory | ||||||
|  | functions written in assembly can only use the lower 32 bits of a | ||||||
|  | 64-bit register as length or must clear the upper 32 bits before using | ||||||
|  | the full 64-bit register for length. | ||||||
|  | 
 | ||||||
|  | This patch fixes strncpy for x32.  Tested on x86-64 and x32.  On x86-64, | ||||||
|  | libc.so is the same with and without the fix. | ||||||
|  | 
 | ||||||
|  | 	[BZ# 24097] | ||||||
|  | 	CVE-2019-6488 | ||||||
|  | 	* sysdeps/x86_64/multiarch/strcpy-avx2.S: Use RDX_LP for length. | ||||||
|  | 	* sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S: Likewise. | ||||||
|  | 	* sysdeps/x86_64/multiarch/strcpy-ssse3.S: Likewise. | ||||||
|  | 	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-strncpy. | ||||||
|  | 	* sysdeps/x86_64/x32/tst-size_t-strncpy.c: New file. | ||||||
|  | ---
 | ||||||
|  |  .../x86_64/multiarch/strcpy-sse2-unaligned.S  |  4 +- | ||||||
|  |  sysdeps/x86_64/multiarch/strcpy-ssse3.S       |  6 +- | ||||||
|  |  sysdeps/x86_64/x32/Makefile                   |  2 +- | ||||||
|  |  sysdeps/x86_64/x32/tst-size_t-strncpy.c       | 58 +++++++++++++++++++ | ||||||
|  |  4 files changed, 64 insertions(+), 6 deletions(-) | ||||||
|  |  create mode 100644 sysdeps/x86_64/x32/tst-size_t-strncpy.c | ||||||
|  | 
 | ||||||
|  | Conflicts: | ||||||
|  | 	ChangeLog | ||||||
|  | 	(removed) | ||||||
|  | 	sysdeps/x86_64/multiarch/strcpy-avx2.S | ||||||
|  | 	(skipped, only needed for x32 arch) | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
 | ||||||
|  | index 72bf7e85..50aca22d 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
 | ||||||
|  | @@ -40,8 +40,8 @@
 | ||||||
|  |  .text | ||||||
|  |  ENTRY (STRCPY) | ||||||
|  |  #  ifdef USE_AS_STRNCPY | ||||||
|  | -	mov	%rdx, %r8
 | ||||||
|  | -	test	%r8, %r8
 | ||||||
|  | +	mov	%RDX_LP, %R8_LP
 | ||||||
|  | +	test	%R8_LP, %R8_LP
 | ||||||
|  |  	jz	L(ExitZero) | ||||||
|  |  #  endif | ||||||
|  |  	mov	%rsi, %rcx | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
 | ||||||
|  | index 9858d0c4..0a62814a 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
 | ||||||
|  | @@ -31,13 +31,13 @@ ENTRY (STRCPY)
 | ||||||
|  |   | ||||||
|  |  	mov	%rsi, %rcx | ||||||
|  |  #  ifdef USE_AS_STRNCPY | ||||||
|  | -	mov	%rdx, %r8
 | ||||||
|  | +	mov	%RDX_LP, %R8_LP
 | ||||||
|  |  #  endif | ||||||
|  |  	mov	%rdi, %rdx | ||||||
|  |  #  ifdef USE_AS_STRNCPY | ||||||
|  | -	test	%r8, %r8
 | ||||||
|  | +	test	%R8_LP, %R8_LP
 | ||||||
|  |  	jz	L(Exit0) | ||||||
|  | -	cmp	$8, %r8
 | ||||||
|  | +	cmp	$8, %R8_LP
 | ||||||
|  |  	jbe	L(StrncpyExit8Bytes) | ||||||
|  |  # endif | ||||||
|  |  	cmpb	$0, (%rcx) | ||||||
|  | diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
 | ||||||
|  | index db302839..2a9e20a9 100644
 | ||||||
|  | --- a/sysdeps/x86_64/x32/Makefile
 | ||||||
|  | +++ b/sysdeps/x86_64/x32/Makefile
 | ||||||
|  | @@ -8,7 +8,7 @@ endif
 | ||||||
|  |  ifeq ($(subdir),string) | ||||||
|  |  tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \ | ||||||
|  |  	 tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \ | ||||||
|  | -	 tst-size_t-strncmp
 | ||||||
|  | +	 tst-size_t-strncmp tst-size_t-strncpy
 | ||||||
|  |  endif | ||||||
|  |   | ||||||
|  |  ifeq ($(subdir),wcsmbs) | ||||||
|  | diff --git a/sysdeps/x86_64/x32/tst-size_t-strncpy.c b/sysdeps/x86_64/x32/tst-size_t-strncpy.c
 | ||||||
|  | new file mode 100644 | ||||||
|  | index 00000000..4dec71e6
 | ||||||
|  | --- /dev/null
 | ||||||
|  | +++ b/sysdeps/x86_64/x32/tst-size_t-strncpy.c
 | ||||||
|  | @@ -0,0 +1,58 @@
 | ||||||
|  | +/* Test strncpy with size_t in the lower 32 bits of 64-bit register.
 | ||||||
|  | +   Copyright (C) 2019 Free Software Foundation, Inc.
 | ||||||
|  | +   This file is part of the GNU C Library.
 | ||||||
|  | +
 | ||||||
|  | +   The GNU C Library is free software; you can redistribute it and/or
 | ||||||
|  | +   modify it under the terms of the GNU Lesser General Public
 | ||||||
|  | +   License as published by the Free Software Foundation; either
 | ||||||
|  | +   version 2.1 of the License, or (at your option) any later version.
 | ||||||
|  | +
 | ||||||
|  | +   The GNU C Library is distributed in the hope that it will be useful,
 | ||||||
|  | +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 | ||||||
|  | +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 | ||||||
|  | +   Lesser General Public License for more details.
 | ||||||
|  | +
 | ||||||
|  | +   You should have received a copy of the GNU Lesser General Public
 | ||||||
|  | +   License along with the GNU C Library; if not, see
 | ||||||
|  | +   <http://www.gnu.org/licenses/>.  */
 | ||||||
|  | +
 | ||||||
|  | +#define TEST_NAME "strncpy"
 | ||||||
|  | +#include "test-size_t.h"
 | ||||||
|  | +
 | ||||||
|  | +IMPL (strncpy, 1)
 | ||||||
|  | +
 | ||||||
|  | +typedef char *(*proto_t) (char *, const char*, size_t);
 | ||||||
|  | +
 | ||||||
|  | +static void *
 | ||||||
|  | +__attribute__ ((noinline, noclone))
 | ||||||
|  | +do_strncpy (parameter_t a, parameter_t b)
 | ||||||
|  | +{
 | ||||||
|  | +  return CALL (&b, a.p, b.p, a.len);
 | ||||||
|  | +}
 | ||||||
|  | +
 | ||||||
|  | +static int
 | ||||||
|  | +test_main (void)
 | ||||||
|  | +{
 | ||||||
|  | +  test_init ();
 | ||||||
|  | +
 | ||||||
|  | +  parameter_t dest = { { page_size }, buf1 };
 | ||||||
|  | +  parameter_t src = { { 0 }, buf2 };
 | ||||||
|  | +
 | ||||||
|  | +  int ret = 0;
 | ||||||
|  | +  FOR_EACH_IMPL (impl, 0)
 | ||||||
|  | +    {
 | ||||||
|  | +      src.fn = impl->fn;
 | ||||||
|  | +      do_strncpy (dest, src);
 | ||||||
|  | +      int res = strncmp (dest.p, src.p, dest.len);
 | ||||||
|  | +      if (res)
 | ||||||
|  | +	{
 | ||||||
|  | +	  error (0, 0, "Wrong result in function %s: %i != 0",
 | ||||||
|  | +		 impl->name, res);
 | ||||||
|  | +	  ret = 1;
 | ||||||
|  | +	}
 | ||||||
|  | +    }
 | ||||||
|  | +
 | ||||||
|  | +  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
 | ||||||
|  | +}
 | ||||||
|  | +
 | ||||||
|  | +#include <support/test-driver.c>
 | ||||||
|  | -- 
 | ||||||
|  | GitLab | ||||||
|  | 
 | ||||||
389	SOURCES/glibc-RHEL-15696-70.patch	Normal file
							| @ -0,0 +1,389 @@ | |||||||
|  | From abddd61de090ae84e380aff68a98bd94ef704667 Mon Sep 17 00:00:00 2001 | ||||||
|  | From: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  | Date: Fri, 24 Dec 2021 18:54:41 -0600 | ||||||
|  | Subject: [PATCH] x86: Optimize L(less_vec) case in memcmp-evex-movbe.S | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | No bug. | ||||||
|  | Optimizations are twofold. | ||||||
|  | 
 | ||||||
|  | 1) Replace page cross and 0/1 checks with masked load instructions in | ||||||
|  |    L(less_vec). In applications this reduces branch-misses in the | ||||||
|  |    hot [0, 32] case. | ||||||
|  | 2) Change control flow so that the L(less_vec) case gets the fall through. | ||||||
|  | 
 | ||||||
|  | Change 2) helps copies in the [0, 32] size range but comes at the cost | ||||||
|  | of copies in the [33, 64] size range.  From profiles of GCC and | ||||||
|  | Python3, 94%+ and 99%+ of calls are in the [0, 32] range so this | ||||||
|  | appears to be the right tradeoff. | ||||||
|  | 
 | ||||||
|  | Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  | Reviewed-by: H.J. Lu <hjl.tools@gmail.com> | ||||||
|  | ---
 | ||||||
|  |  sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 249 +++++-------------- | ||||||
|  |  1 file changed, 56 insertions(+), 193 deletions(-) | ||||||
|  | 
 | ||||||
|  | diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
 | ||||||
|  | index 640f6757..d2899e7c 100644
 | ||||||
|  | --- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
 | ||||||
|  | +++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
 | ||||||
|  | @@ -62,15 +62,18 @@ Latency:
 | ||||||
|  |  # define VMOVU		vmovdqu64 | ||||||
|  |   | ||||||
|  |  # ifdef USE_AS_WMEMCMP | ||||||
|  | +#  define VMOVU_MASK	vmovdqu32
 | ||||||
|  |  #  define CHAR_SIZE	4 | ||||||
|  |  #  define VPCMP	vpcmpd | ||||||
|  |  #  define VPTEST	vptestmd | ||||||
|  |  # else | ||||||
|  | +#  define VMOVU_MASK	vmovdqu8
 | ||||||
|  |  #  define CHAR_SIZE	1 | ||||||
|  |  #  define VPCMP	vpcmpub | ||||||
|  |  #  define VPTEST	vptestmb | ||||||
|  |  # endif | ||||||
|  |   | ||||||
|  | +
 | ||||||
|  |  # define VEC_SIZE	32 | ||||||
|  |  # define PAGE_SIZE	4096 | ||||||
|  |  # define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE) | ||||||
|  | @@ -102,12 +105,48 @@ ENTRY_P2ALIGN (MEMCMP, 6)
 | ||||||
|  |  	movl	%edx, %edx | ||||||
|  |  # endif | ||||||
|  |  	cmp	$CHAR_PER_VEC, %RDX_LP | ||||||
|  | -	jb	L(less_vec)
 | ||||||
|  | +	/* Fall through for [0, VEC_SIZE] as its the hottest.  */
 | ||||||
|  | +	ja	L(more_1x_vec)
 | ||||||
|  | +
 | ||||||
+	/* Create mask for CHAR's we want to compare. This allows us to
+	   avoid having to include page cross logic.  */
+	movl	$-1, %ecx
+	bzhil	%edx, %ecx, %ecx
+	kmovd	%ecx, %k2
+
+	/* Safe to load full ymm with mask.  */
+	VMOVU_MASK (%rsi), %YMM2{%k2}
+	VPCMP	$4,(%rdi), %YMM2, %k1{%k2}
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_0)
+	ret
 
+	.p2align 4
+L(return_vec_0):
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCMP
+	movl	(%rdi, %rax, CHAR_SIZE), %ecx
+	xorl	%edx, %edx
+	cmpl	(%rsi, %rax, CHAR_SIZE), %ecx
+	/* NB: no partial register stall here because xorl zero idiom
+	   above.  */
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	(%rsi, %rax), %ecx
+	movzbl	(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
+	ret
+
+
+	.p2align 4
+L(more_1x_vec):
 	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
 	VMOVU	(%rsi), %YMM1
 	/* Use compare not equals to directly check for mismatch.  */
-	VPCMP	$4, (%rdi), %YMM1, %k1
+	VPCMP	$4,(%rdi), %YMM1, %k1
 	kmovd	%k1, %eax
 	/* NB: eax must be destination register if going to
 	   L(return_vec_[0,2]). For L(return_vec_3) destination register
@@ -131,13 +170,13 @@ ENTRY_P2ALIGN (MEMCMP, 6)
 
 	/* Check third and fourth VEC no matter what.  */
 	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
-	VPCMP	$4, (VEC_SIZE * 2)(%rdi), %YMM3, %k1
+	VPCMP	$4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1
 	kmovd	%k1, %eax
 	testl	%eax, %eax
 	jnz	L(return_vec_2)
 
 	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
-	VPCMP	$4, (VEC_SIZE * 3)(%rdi), %YMM4, %k1
+	VPCMP	$4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1
 	kmovd	%k1, %ecx
 	testl	%ecx, %ecx
 	jnz	L(return_vec_3)
@@ -169,7 +208,7 @@ ENTRY_P2ALIGN (MEMCMP, 6)
 	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
 	/* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
 	   oring with YMM1. Result is stored in YMM4.  */
-	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
+	vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
 
 	/* Or together YMM2, YMM3, and YMM4 into YMM4.  */
 	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
@@ -184,7 +223,8 @@ ENTRY_P2ALIGN (MEMCMP, 6)
 	/* NB: eax must be zero to reach here.  */
 	ret
 
-	.p2align 4
+
+	.p2align 4,, 8
 L(8x_end_return_vec_0_1_2_3):
 	movq	%rdx, %rdi
 L(8x_return_vec_0_1_2_3):
@@ -222,23 +262,6 @@ L(return_vec_3):
 # endif
 	ret
 
-	.p2align 4
-L(return_vec_0):
-	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCMP
-	movl	(%rdi, %rax, CHAR_SIZE), %ecx
-	xorl	%edx, %edx
-	cmpl	(%rsi, %rax, CHAR_SIZE), %ecx
-	/* NB: no partial register stall here because xorl zero idiom
-	   above.  */
-	setg	%dl
-	leal	-1(%rdx, %rdx), %eax
-# else
-	movzbl	(%rsi, %rax), %ecx
-	movzbl	(%rdi, %rax), %eax
-	subl	%ecx, %eax
-# endif
-	ret
 
 	.p2align 4
 L(return_vec_1):
@@ -297,7 +320,7 @@ L(loop_4x_vec):
 	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %YMM3
 	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
 	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %YMM4
-	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
+	vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
 	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
 	VPTEST	%YMM4, %YMM4, %k1
 	kmovd	%k1, %ecx
@@ -324,7 +347,7 @@ L(loop_4x_vec):
 	VMOVU	VEC_SIZE(%rsi, %rdx), %YMM2
 	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM2
 	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM4
-	vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
+	vpternlogd $0xde,(VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
 	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
 	VPTEST	%YMM4, %YMM4, %k1
 	kmovd	%k1, %ecx
@@ -336,14 +359,14 @@ L(loop_4x_vec):
 	/* Only entry is from L(more_8x_vec).  */
 	.p2align 4,, 10
 L(8x_last_2x_vec):
-	VPCMP	$4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1
+	VPCMP	$4,(VEC_SIZE * 2)(%rdx), %YMM3, %k1
 	kmovd	%k1, %eax
 	testl	%eax, %eax
 	jnz	L(8x_return_vec_2)
 	/* Naturally aligned to 16 bytes.  */
 L(8x_last_1x_vec):
 	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM1
-	VPCMP	$4, (VEC_SIZE * 3)(%rdx), %YMM1, %k1
+	VPCMP	$4,(VEC_SIZE * 3)(%rdx), %YMM1, %k1
 	kmovd	%k1, %eax
 	testl	%eax, %eax
 	jnz	L(8x_return_vec_3)
@@ -392,7 +415,9 @@ L(last_1x_vec):
 	jnz	L(return_vec_0_end)
 	ret
 
-	.p2align 4,, 10
+
+	/* Don't align. Takes 2-fetch blocks either way and aligning
+	   will cause code to spill into another cacheline.  */
 L(return_vec_1_end):
 	/* Use bsf to save code size. This is necessary to have
 	   L(one_or_less) fit in aligning bytes between.  */
@@ -411,31 +436,8 @@ L(return_vec_1_end):
 # endif
 	ret
 
-	/* NB: L(one_or_less) fits in alignment padding between
-	   L(return_vec_1_end) and L(return_vec_0_end).  */
-# ifdef USE_AS_WMEMCMP
-L(one_or_less):
-	jb	L(zero)
-	movl	(%rdi), %ecx
-	xorl	%edx, %edx
-	cmpl	(%rsi), %ecx
-	je	L(zero)
-	setg	%dl
-	leal	-1(%rdx, %rdx), %eax
-	ret
-# else
-L(one_or_less):
-	jb	L(zero)
-	movzbl	(%rsi), %ecx
-	movzbl	(%rdi), %eax
-	subl	%ecx, %eax
-	ret
-# endif
-L(zero):
-	xorl	%eax, %eax
-	ret
-
-	.p2align 4
+	/* Don't align. Takes 2-fetch blocks either way and aligning
+	   will cause code to spill into another cacheline.  */
 L(return_vec_0_end):
 	tzcntl	%eax, %eax
 	addl	%edx, %eax
@@ -451,146 +453,7 @@ L(return_vec_0_end):
 	subl	%ecx, %eax
 # endif
 	ret
+	/* 1-byte until next cache line.  */
 
-	.p2align 4
-L(less_vec):
-	/* Check if one or less CHAR. This is necessary for size == 0
-	   but is also faster for size == CHAR_SIZE.  */
-	cmpl	$1, %edx
-	jbe	L(one_or_less)
-
-	/* Check if loading one VEC from either s1 or s2 could cause a
-	   page cross. This can have false positives but is by far the
-	   fastest method.  */
-	movl	%edi, %eax
-	orl	%esi, %eax
-	andl	$(PAGE_SIZE - 1), %eax
-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
-	jg	L(page_cross_less_vec)
-
-	/* No page cross possible.  */
-	VMOVU	(%rsi), %YMM2
-	VPCMP	$4, (%rdi), %YMM2, %k1
-	kmovd	%k1, %eax
-	/* Check if any matches where in bounds. Intentionally not
-	   storing result in eax to limit dependency chain if it goes to
-	   L(return_vec_0_lv).  */
-	bzhil	%edx, %eax, %edx
-	jnz	L(return_vec_0_lv)
-	xorl	%eax, %eax
-	ret
-
-	/* Essentially duplicate of L(return_vec_0). Ends up not costing
-	   any code as shrinks L(less_vec) by allowing 2-byte encoding of
-	   the jump and ends up fitting in aligning bytes. As well fits on
-	   same cache line as L(less_vec) so also saves a line from having
-	   to be fetched on cold calls to memcmp.  */
-	.p2align 4,, 4
-L(return_vec_0_lv):
-	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCMP
-	movl	(%rdi, %rax, CHAR_SIZE), %ecx
-	xorl	%edx, %edx
-	cmpl	(%rsi, %rax, CHAR_SIZE), %ecx
-	/* NB: no partial register stall here because xorl zero idiom
-	   above.  */
-	setg	%dl
-	leal	-1(%rdx, %rdx), %eax
-# else
-	movzbl	(%rsi, %rax), %ecx
-	movzbl	(%rdi, %rax), %eax
-	subl	%ecx, %eax
-# endif
-	ret
-
-	.p2align 4
-L(page_cross_less_vec):
-	/* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
-	   bytes.  */
-	cmpl	$(16 / CHAR_SIZE), %edx
-	jae	L(between_16_31)
-# ifndef USE_AS_WMEMCMP
-	cmpl	$8, %edx
-	jae	L(between_8_15)
-	cmpl	$4, %edx
-	jb	L(between_2_3)
-
-	/* Load as big endian with overlapping movbe to avoid branches.
-	 */
-	movbe	(%rdi), %eax
-	movbe	(%rsi), %ecx
-	shlq	$32, %rax
-	shlq	$32, %rcx
-	movbe	-4(%rdi, %rdx), %edi
-	movbe	-4(%rsi, %rdx), %esi
-	orq	%rdi, %rax
-	orq	%rsi, %rcx
-	subq	%rcx, %rax
-	/* edx is guranteed to be positive int32 in range [4, 7].  */
-	cmovne	%edx, %eax
-	/* ecx is -1 if rcx > rax. Otherwise 0.  */
-	sbbl	%ecx, %ecx
-	/* If rcx > rax, then ecx is 0 and eax is positive. If rcx ==
-	   rax then eax and ecx are zero. If rax < rax then ecx is -1 so
-	   eax doesn't matter.  */
-	orl	%ecx, %eax
-	ret
-
-	.p2align 4,, 8
-L(between_8_15):
-# endif
-	/* If USE_AS_WMEMCMP fall through into 8-15 byte case.  */
-	vmovq	(%rdi), %xmm1
-	vmovq	(%rsi), %xmm2
-	VPCMP	$4, %xmm1, %xmm2, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jnz	L(return_vec_0_lv)
-	/* Use overlapping loads to avoid branches.  */
-	vmovq	-8(%rdi, %rdx, CHAR_SIZE), %xmm1
-	vmovq	-8(%rsi, %rdx, CHAR_SIZE), %xmm2
-	VPCMP	$4, %xmm1, %xmm2, %k1
-	addl	$(CHAR_PER_VEC - (8 / CHAR_SIZE)), %edx
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jnz	L(return_vec_0_end)
-	ret
-
-	.p2align 4,, 8
-L(between_16_31):
-	/* From 16 to 31 bytes.  No branch when size == 16.  */
-
-	/* Use movups to save code size.  */
-	vmovdqu	(%rsi), %xmm2
-	VPCMP	$4, (%rdi), %xmm2, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jnz	L(return_vec_0_lv)
-	/* Use overlapping loads to avoid branches.  */
-	vmovdqu	-16(%rsi, %rdx, CHAR_SIZE), %xmm2
-	VPCMP	$4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1
-	addl	$(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jnz	L(return_vec_0_end)
-	ret
-
-# ifndef USE_AS_WMEMCMP
-L(between_2_3):
-	/* Load as big endian to avoid branches.  */
-	movzwl	(%rdi), %eax
-	movzwl	(%rsi), %ecx
-	shll	$8, %eax
-	shll	$8, %ecx
-	bswap	%eax
-	bswap	%ecx
-	movzbl	-1(%rdi, %rdx), %edi
-	movzbl	-1(%rsi, %rdx), %esi
-	orl	%edi, %eax
-	orl	%esi, %ecx
-	/* Subtraction is okay because the upper 8 bits are zero.  */
-	subl	%ecx, %eax
-	ret
-# endif
 END (MEMCMP)
 #endif
-- 
GitLab

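Note: two tricks in the memcmp-evex hunk above are worth unpacking. The new entry path builds a mask with only the low n bits set, so one masked 32-byte load and compare covers every size up to a full vector; masked-off bytes are architecturally never read, which is why no page-cross check is needed. A C-intrinsics sketch of the idea (a sketch under stated assumptions, not glibc code: byte-sized memcmp, n <= 32, AVX512BW+VL and BMI2 available; all names are mine):

#include <immintrin.h>
#include <stddef.h>

static int
memcmp_le32_sketch (const unsigned char *s1, const unsigned char *s2,
		    size_t n)
{
  /* Mask with the low n bits set: movl $-1, %ecx; bzhil.  */
  __mmask32 m = _bzhi_u32 (0xffffffffU, (unsigned int) n);
  /* Masked loads never touch bytes at or past n: VMOVU_MASK {%k2}.  */
  __m256i a = _mm256_maskz_loadu_epi8 (m, s1);
  __m256i b = _mm256_maskz_loadu_epi8 (m, s2);
  /* Masked compare-not-equal: VPCMP $4.  */
  __mmask32 neq = _mm256_mask_cmpneq_epi8_mask (m, a, b);
  if (neq == 0)
    return 0;				/* testl; ret */
  unsigned int i = _tzcnt_u32 (neq);	/* tzcntl */
  return s1[i] - s2[i];			/* movzbl; movzbl; subl */
}

The vpternlogd immediates are simply truth tables: bit (a<<2)|(b<<1)|c of the immediate holds the result for input bits a (destination), b (second source), c (third source). $0xde encodes (a ^ c) | b, fusing the XOR against memory with the OR into the accumulator, and $0xfe encodes the three-way OR a | b | c. A small program that derives both constants:

#include <stdio.h>

int
main (void)
{
  unsigned int xor_or = 0, or3 = 0;
  for (int a = 0; a < 2; a++)
    for (int b = 0; b < 2; b++)
      for (int c = 0; c < 2; c++)
	{
	  unsigned int idx = (a << 2) | (b << 1) | c;
	  if (((a ^ c) | b) != 0)
	    xor_or |= 1U << idx;	/* truth table of (a ^ c) | b */
	  if ((a | b | c) != 0)
	    or3 |= 1U << idx;		/* truth table of a | b | c */
	}
  printf ("%#x %#x\n", xor_or, or3);	/* prints 0xde 0xfe */
  return 0;
}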
							
								
								
									
SOURCES/glibc-RHEL-15696-71.patch (new file, 43 lines)
@@ -0,0 +1,43 @@
From 6b8dbbd03ac88f169b65b5c7d7278576a11d2e44 Mon Sep 17 00:00:00 2001
From: Jangwoong Kim <6812skiii@gmail.com>
Date: Tue, 14 Dec 2021 21:30:51 +0900
Subject: [PATCH] nptl: Effectively skip CAS in spinlock loop
Content-type: text/plain; charset=UTF-8

The commit:
"Add LLL_MUTEX_READ_LOCK [BZ #28537]"
SHA1: d672a98a1af106bd68deb15576710cd61363f7a6

introduced LLL_MUTEX_READ_LOCK, to skip CAS in spinlock loop
if atomic load fails. But, "continue" inside of do-while loop
does not skip the evaluation of escape expression, thus CAS
is not skipped.

Replace do-while with while and skip LLL_MUTEX_TRYLOCK if
LLL_MUTEX_READ_LOCK fails.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
 nptl/pthread_mutex_lock.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
index a633d95e..d96a9933 100644
--- a/nptl/pthread_mutex_lock.c
+++ b/nptl/pthread_mutex_lock.c
@@ -141,10 +141,9 @@ __pthread_mutex_lock (pthread_mutex_t *mutex)
 		  break;
 		}
 	      atomic_spin_nop ();
-	      if (LLL_MUTEX_READ_LOCK (mutex) != 0)
-		continue;
 	    }
-	  while (LLL_MUTEX_TRYLOCK (mutex) != 0);
+	  while (LLL_MUTEX_READ_LOCK (mutex) != 0
+		 || LLL_MUTEX_TRYLOCK (mutex) != 0);
 
 	  mutex->__data.__spins += (cnt - mutex->__data.__spins) / 8;
 	}
-- 
GitLab

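Note: the C pitfall this patch fixes is easy to miss, so a standalone illustration helps. In a do-while loop, continue jumps to the controlling expression, not past it, so the CAS embedded in the while condition still executed on every spin. A minimal sketch (my own code, not glibc's):

#include <stdio.h>

/* Stand-in for the CAS inside LLL_MUTEX_TRYLOCK.  */
static int
expensive_cas (void)
{
  puts ("CAS executed");
  return 1;	/* pretend the lock stays taken */
}

int
main (void)
{
  int spins = 3;
  do
    {
      if (--spins == 0)
	break;
      continue;	/* jumps TO the while condition, not past it */
    }
  while (expensive_cas () != 0);
  /* Prints "CAS executed" twice: the continue avoided nothing.  The
     patch instead writes
       while (cheap_read () != 0 || expensive_cas () != 0);
     so the short-circuit || really does skip the CAS while the cheap
     atomic load still sees the lock as taken.  */
  return 0;
}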
							
								
								
									
SOURCES/glibc-RHEL-15696-72.patch (new file, 146 lines)
@@ -0,0 +1,146 @@
From 7835d611af0854e69a0c71e3806f8fe379282d6f Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri, 18 Feb 2022 14:19:15 -0600
Subject: [PATCH] x86: Test wcscmp RTM in the wcsncmp overflow case [BZ #28896]
Content-type: text/plain; charset=UTF-8

In the overflow fallback strncmp-avx2-rtm and wcsncmp-avx2-rtm would
call strcmp-avx2 and wcscmp-avx2 respectively. This would have
not checks around vzeroupper and would trigger spurious
aborts. This commit fixes that.

test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass on
AVX2 machines with and without RTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
 sysdeps/x86/Makefile          |  5 ++++-
 sysdeps/x86/tst-strncmp-rtm.c | 32 +++++++++++++++++++++++---------
 sysdeps/x86/tst-wcsncmp-rtm.c | 21 +++++++++++++++++++++
 3 files changed, 48 insertions(+), 10 deletions(-)
 create mode 100644 sysdeps/x86/tst-wcsncmp-rtm.c

diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
index 2d814915..c2111f49 100644
--- a/sysdeps/x86/Makefile
+++ b/sysdeps/x86/Makefile
@@ -28,7 +28,9 @@ tests += \
   tst-strcpy-rtm \
   tst-strlen-rtm \
   tst-strncmp-rtm \
-  tst-strrchr-rtm
+  tst-strrchr-rtm \
+  tst-wcsncmp-rtm \
+# tests
 
 CFLAGS-tst-memchr-rtm.c += -mrtm
 CFLAGS-tst-memcmp-rtm.c += -mrtm
@@ -40,6 +42,7 @@ CFLAGS-tst-strcpy-rtm.c += -mrtm
 CFLAGS-tst-strlen-rtm.c += -mrtm
 CFLAGS-tst-strncmp-rtm.c += -mrtm -Wno-error
 CFLAGS-tst-strrchr-rtm.c += -mrtm
+CFLAGS-tst-wcsncmp-rtm.c += -mrtm -Wno-error
 endif
 
 ifneq ($(enable-cet),no)
diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
index 4d0004b5..4e9f094f 100644
--- a/sysdeps/x86/tst-strncmp-rtm.c
+++ b/sysdeps/x86/tst-strncmp-rtm.c
@@ -19,18 +19,32 @@
 #include <stdint.h>
 #include <tst-string-rtm.h>
 
+#ifdef WIDE
+# define CHAR wchar_t
+# define MEMSET wmemset
+# define STRNCMP wcsncmp
+# define TEST_NAME wcsncmp
+#else /* !WIDE */
+# define CHAR char
+# define MEMSET memset
+# define STRNCMP strncmp
+# define TEST_NAME strncmp
+#endif /* !WIDE */
+
+
+
 #define LOOP 3000
 #define STRING_SIZE 1024
-char string1[STRING_SIZE];
-char string2[STRING_SIZE];
+CHAR string1[STRING_SIZE];
+CHAR string2[STRING_SIZE];
 
 __attribute__ ((noinline, noclone))
 static int
 prepare (void)
 {
-  memset (string1, 'a', STRING_SIZE - 1);
-  memset (string2, 'a', STRING_SIZE - 1);
-  if (strncmp (string1, string2, STRING_SIZE) == 0)
+  MEMSET (string1, 'a', STRING_SIZE - 1);
+  MEMSET (string2, 'a', STRING_SIZE - 1);
+  if (STRNCMP (string1, string2, STRING_SIZE) == 0)
     return EXIT_SUCCESS;
   else
     return EXIT_FAILURE;
@@ -40,7 +54,7 @@ __attribute__ ((noinline, noclone))
 static int
 function (void)
 {
-  if (strncmp (string1, string2, STRING_SIZE) == 0)
+  if (STRNCMP (string1, string2, STRING_SIZE) == 0)
     return 0;
   else
     return 1;
@@ -50,7 +64,7 @@ __attribute__ ((noinline, noclone))
 static int
 function_overflow (void)
 {
-  if (strncmp (string1, string2, SIZE_MAX) == 0)
+  if (STRNCMP (string1, string2, SIZE_MAX) == 0)
     return 0;
   else
     return 1;
@@ -59,9 +73,9 @@ function_overflow (void)
 static int
 do_test (void)
 {
-  int status = do_test_1 ("strncmp", LOOP, prepare, function);
+  int status = do_test_1 (TEST_NAME, LOOP, prepare, function);
   if (status != EXIT_SUCCESS)
     return status;
-  status = do_test_1 ("strncmp", LOOP, prepare, function_overflow);
+  status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow);
   return status;
 }
diff --git a/sysdeps/x86/tst-wcsncmp-rtm.c b/sysdeps/x86/tst-wcsncmp-rtm.c
new file mode 100644
index 00000000..bad3b863
--- /dev/null
+++ b/sysdeps/x86/tst-wcsncmp-rtm.c
@@ -0,0 +1,21 @@
+/* Test case for wcsncmp inside a transactionally executing RTM region.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define WIDE 1
+#include <wchar.h>
+#include "tst-strncmp-rtm.c"
-- 
GitLab

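Note: the new wcsncmp test reuses the strncmp test body through glibc's include-twice pattern: the shared source selects narrow or wide names from the WIDE macro, and the wide test is just a wrapper that defines WIDE and includes the narrow source. A self-contained miniature of the same pattern (hypothetical file, not the real harness; builds with and without -DWIDE):

#include <stddef.h>
#include <string.h>
#include <wchar.h>

#ifdef WIDE
# define CHAR wchar_t
# define STRNCMP wcsncmp
#else
# define CHAR char
# define STRNCMP strncmp
#endif

static int
equal_prefix (const CHAR *a, const CHAR *b, size_t n)
{
  return STRNCMP (a, b, n) == 0;
}

int
main (void)
{
  static const CHAR x[] = { 'a', 'b', 0 };
  static const CHAR y[] = { 'a', 'c', 0 };
  /* A length-1 prefix matches; a length-2 prefix does not.  */
  return (equal_prefix (x, y, 1) && !equal_prefix (x, y, 2)) ? 0 : 1;
}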
							
								
								
									
SOURCES/glibc-RHEL-15696-73.patch (new file, 37 lines)
@@ -0,0 +1,37 @@
From b98d0bbf747f39770e0caba7e984ce9f8f900330 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri, 18 Feb 2022 17:00:25 -0600
Subject: [PATCH] x86: Fix TEST_NAME to make it a string in tst-strncmp-rtm.c
Content-type: text/plain; charset=UTF-8

Previously TEST_NAME was passing a function pointer. This didn't fail
because of the -Wno-error flag (to allow for overflow sizes passed
to strncmp/wcsncmp)

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
 sysdeps/x86/tst-strncmp-rtm.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
index 4e9f094f..aef9866c 100644
--- a/sysdeps/x86/tst-strncmp-rtm.c
+++ b/sysdeps/x86/tst-strncmp-rtm.c
@@ -23,12 +23,12 @@
 # define CHAR wchar_t
 # define MEMSET wmemset
 # define STRNCMP wcsncmp
-# define TEST_NAME wcsncmp
+# define TEST_NAME "wcsncmp"
 #else /* !WIDE */
 # define CHAR char
 # define MEMSET memset
 # define STRNCMP strncmp
-# define TEST_NAME strncmp
+# define TEST_NAME "strncmp"
 #endif /* !WIDE */
 
 
-- 
GitLab

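Note: a minimal reproduction of this bug class (hypothetical names, not the real harness). do_test_1's first argument is the test's name, a const char *, but the macro expanded to the bare identifier, so a function pointer was passed instead. That is an incompatible pointer conversion gcc reports under -Wincompatible-pointer-types; the -Wno-error added for the SIZE_MAX overflow tests kept it from failing the build.

#include <stdio.h>

static int
run_named_test (const char *name, int (*fn) (void))
{
  printf ("running %s\n", name);
  return fn ();
}

static int
dummy_test (void)
{
  return 0;
}

int
main (void)
{
  run_named_test ("strncmp", dummy_test);	/* correct: a string */
  /* The bug amounted to
       run_named_test (strncmp, dummy_test);
     which passes a function pointer where a const char * is expected
     and prints garbage at run time, yet still compiles when the
     warning is not promoted to an error.  */
  return 0;
}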
							
								
								
									
SOURCES/glibc-RHEL-15696-74.patch (new file, 1798 lines): file diff suppressed because it is too large.
SOURCES/glibc-RHEL-15696-75.patch (new file, 1992 lines): file diff suppressed because it is too large.

SOURCES/glibc-RHEL-15696-76.patch (new file, 33 lines)
@@ -0,0 +1,33 @@
From c15efd011cea3d8f0494269eb539583215a1feed Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Fri, 4 Feb 2022 11:09:10 -0800
Subject: [PATCH] x86-64: Fix strcmp-avx2.S
Content-type: text/plain; charset=UTF-8

Change "movl %edx, %rdx" to "movl %edx, %edx" in:

commit b77b06e0e296f1a2276c27a67e1d44f2cfa38d45
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date:   Mon Jan 10 15:35:38 2022 -0600

    x86: Optimize strcmp-avx2.S
---
 sysdeps/x86_64/multiarch/strcmp-avx2.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
index 554ffe4c..04675aa4 100644
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
@@ -106,7 +106,7 @@ ENTRY(STRCMP)
 # ifdef USE_AS_STRNCMP
 #  ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
-	movl	%edx, %rdx
+	movl	%edx, %edx
 #  endif
 	cmp	$1, %RDX_LP
 	/* Signed comparison intentional. We use this branch to also
-- 
GitLab

							
								
								
									
SOURCES/glibc-RHEL-15696-77.patch (new file, 33 lines)
@@ -0,0 +1,33 @@
From 0e0199a9e02ebe42e2b36958964d63f03573c382 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Fri, 4 Feb 2022 11:11:08 -0800
Subject: [PATCH] x86-64: Fix strcmp-evex.S
Content-type: text/plain; charset=UTF-8

Change "movl %edx, %rdx" to "movl %edx, %edx" in:

commit 8418eb3ff4b781d31c4ed5dc6c0bd7356bc45db9
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date:   Mon Jan 10 15:35:39 2022 -0600

    x86: Optimize strcmp-evex.S
---
 sysdeps/x86_64/multiarch/strcmp-evex.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
index 99d8409a..ed56af8e 100644
--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
@@ -116,7 +116,7 @@ ENTRY(STRCMP)
 # ifdef USE_AS_STRNCMP
 #  ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
-	movl	%edx, %rdx
+	movl	%edx, %edx
 #  endif
 	cmp	$1, %RDX_LP
 	/* Signed comparison intentional. We use this branch to also
-- 
GitLab

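Note: this fix and the strcmp-avx2.S fix above it rest on the same x86-64 rule: writing a 32-bit register implicitly zeroes bits 63:32 of the containing 64-bit register, so movl %edx, %edx is a deliberate, compact truncation of %rdx, whereas movl %edx, %rdx mixes a 32-bit mnemonic with a 64-bit destination and is rejected by the assembler (the bad line could lurk because it sits in the x32-only __ILP32__ path). A C sketch of the equivalent truncation:

#include <inttypes.h>
#include <stdio.h>

int
main (void)
{
  uint64_t rdx = UINT64_C (0xdeadbeef00000007);
  rdx = (uint32_t) rdx;			/* what movl %edx, %edx does */
  printf ("%#" PRIx64 "\n", rdx);	/* prints 0x7 */
  return 0;
}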
Some files were not shown because too many files have changed in this diff.