Compare commits
No commits in common. "c8" and "c9s" have entirely different histories.
7
.gitignore
vendored
7
.gitignore
vendored
@ -1 +1,6 @@
|
|||||||
SOURCES/glibc-2.28.tar.xz
|
# Release tarballs.
|
||||||
|
/glibc-*.tar.[gx]z
|
||||||
|
# Generated (source) RPMs.
|
||||||
|
/*.rpm
|
||||||
|
# Expanded source trees.
|
||||||
|
/glibc-*/
|
||||||
|
@ -1 +0,0 @@
|
|||||||
ccb5dc9e51a9884df8488f86982439d47b283b2a SOURCES/glibc-2.28.tar.xz
|
|
File diff suppressed because it is too large
Load Diff
2
README.scripts
Normal file
2
README.scripts
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
All of the useful glibc maintainer scripts are located at:
|
||||||
|
https://pagure.io/glibc-maintainer-scripts
|
@ -1,496 +0,0 @@
|
|||||||
# This file names the currently supported and somewhat tested locales.
|
|
||||||
# If you have any additions please file a glibc bug report.
|
|
||||||
SUPPORTED-LOCALES=\
|
|
||||||
C.UTF-8/UTF-8 \
|
|
||||||
aa_DJ.UTF-8/UTF-8 \
|
|
||||||
aa_DJ/ISO-8859-1 \
|
|
||||||
aa_ER/UTF-8 \
|
|
||||||
aa_ER@saaho/UTF-8 \
|
|
||||||
aa_ET/UTF-8 \
|
|
||||||
af_ZA.UTF-8/UTF-8 \
|
|
||||||
af_ZA/ISO-8859-1 \
|
|
||||||
agr_PE/UTF-8 \
|
|
||||||
ak_GH/UTF-8 \
|
|
||||||
am_ET/UTF-8 \
|
|
||||||
an_ES.UTF-8/UTF-8 \
|
|
||||||
an_ES/ISO-8859-15 \
|
|
||||||
anp_IN/UTF-8 \
|
|
||||||
ar_AE.UTF-8/UTF-8 \
|
|
||||||
ar_AE/ISO-8859-6 \
|
|
||||||
ar_BH.UTF-8/UTF-8 \
|
|
||||||
ar_BH/ISO-8859-6 \
|
|
||||||
ar_DZ.UTF-8/UTF-8 \
|
|
||||||
ar_DZ/ISO-8859-6 \
|
|
||||||
ar_EG.UTF-8/UTF-8 \
|
|
||||||
ar_EG/ISO-8859-6 \
|
|
||||||
ar_IN/UTF-8 \
|
|
||||||
ar_IQ.UTF-8/UTF-8 \
|
|
||||||
ar_IQ/ISO-8859-6 \
|
|
||||||
ar_JO.UTF-8/UTF-8 \
|
|
||||||
ar_JO/ISO-8859-6 \
|
|
||||||
ar_KW.UTF-8/UTF-8 \
|
|
||||||
ar_KW/ISO-8859-6 \
|
|
||||||
ar_LB.UTF-8/UTF-8 \
|
|
||||||
ar_LB/ISO-8859-6 \
|
|
||||||
ar_LY.UTF-8/UTF-8 \
|
|
||||||
ar_LY/ISO-8859-6 \
|
|
||||||
ar_MA.UTF-8/UTF-8 \
|
|
||||||
ar_MA/ISO-8859-6 \
|
|
||||||
ar_OM.UTF-8/UTF-8 \
|
|
||||||
ar_OM/ISO-8859-6 \
|
|
||||||
ar_QA.UTF-8/UTF-8 \
|
|
||||||
ar_QA/ISO-8859-6 \
|
|
||||||
ar_SA.UTF-8/UTF-8 \
|
|
||||||
ar_SA/ISO-8859-6 \
|
|
||||||
ar_SD.UTF-8/UTF-8 \
|
|
||||||
ar_SD/ISO-8859-6 \
|
|
||||||
ar_SS/UTF-8 \
|
|
||||||
ar_SY.UTF-8/UTF-8 \
|
|
||||||
ar_SY/ISO-8859-6 \
|
|
||||||
ar_TN.UTF-8/UTF-8 \
|
|
||||||
ar_TN/ISO-8859-6 \
|
|
||||||
ar_YE.UTF-8/UTF-8 \
|
|
||||||
ar_YE/ISO-8859-6 \
|
|
||||||
ayc_PE/UTF-8 \
|
|
||||||
az_AZ/UTF-8 \
|
|
||||||
az_IR/UTF-8 \
|
|
||||||
as_IN/UTF-8 \
|
|
||||||
ast_ES.UTF-8/UTF-8 \
|
|
||||||
ast_ES/ISO-8859-15 \
|
|
||||||
be_BY.UTF-8/UTF-8 \
|
|
||||||
be_BY/CP1251 \
|
|
||||||
be_BY@latin/UTF-8 \
|
|
||||||
bem_ZM/UTF-8 \
|
|
||||||
ber_DZ/UTF-8 \
|
|
||||||
ber_MA/UTF-8 \
|
|
||||||
bg_BG.UTF-8/UTF-8 \
|
|
||||||
bg_BG/CP1251 \
|
|
||||||
bhb_IN.UTF-8/UTF-8 \
|
|
||||||
bho_IN/UTF-8 \
|
|
||||||
bho_NP/UTF-8 \
|
|
||||||
bi_VU/UTF-8 \
|
|
||||||
bn_BD/UTF-8 \
|
|
||||||
bn_IN/UTF-8 \
|
|
||||||
bo_CN/UTF-8 \
|
|
||||||
bo_IN/UTF-8 \
|
|
||||||
br_FR.UTF-8/UTF-8 \
|
|
||||||
br_FR/ISO-8859-1 \
|
|
||||||
br_FR@euro/ISO-8859-15 \
|
|
||||||
brx_IN/UTF-8 \
|
|
||||||
bs_BA.UTF-8/UTF-8 \
|
|
||||||
bs_BA/ISO-8859-2 \
|
|
||||||
byn_ER/UTF-8 \
|
|
||||||
ca_AD.UTF-8/UTF-8 \
|
|
||||||
ca_AD/ISO-8859-15 \
|
|
||||||
ca_ES.UTF-8/UTF-8 \
|
|
||||||
ca_ES/ISO-8859-1 \
|
|
||||||
ca_ES@euro/ISO-8859-15 \
|
|
||||||
ca_ES@valencia/UTF-8 \
|
|
||||||
ca_FR.UTF-8/UTF-8 \
|
|
||||||
ca_FR/ISO-8859-15 \
|
|
||||||
ca_IT.UTF-8/UTF-8 \
|
|
||||||
ca_IT/ISO-8859-15 \
|
|
||||||
ce_RU/UTF-8 \
|
|
||||||
chr_US/UTF-8 \
|
|
||||||
cmn_TW/UTF-8 \
|
|
||||||
crh_UA/UTF-8 \
|
|
||||||
cs_CZ.UTF-8/UTF-8 \
|
|
||||||
cs_CZ/ISO-8859-2 \
|
|
||||||
csb_PL/UTF-8 \
|
|
||||||
cv_RU/UTF-8 \
|
|
||||||
cy_GB.UTF-8/UTF-8 \
|
|
||||||
cy_GB/ISO-8859-14 \
|
|
||||||
da_DK.UTF-8/UTF-8 \
|
|
||||||
da_DK/ISO-8859-1 \
|
|
||||||
da_DK.ISO-8859-15/ISO-8859-15 \
|
|
||||||
de_AT.UTF-8/UTF-8 \
|
|
||||||
de_AT/ISO-8859-1 \
|
|
||||||
de_AT@euro/ISO-8859-15 \
|
|
||||||
de_BE.UTF-8/UTF-8 \
|
|
||||||
de_BE/ISO-8859-1 \
|
|
||||||
de_BE@euro/ISO-8859-15 \
|
|
||||||
de_CH.UTF-8/UTF-8 \
|
|
||||||
de_CH/ISO-8859-1 \
|
|
||||||
de_DE.UTF-8/UTF-8 \
|
|
||||||
de_DE/ISO-8859-1 \
|
|
||||||
de_DE@euro/ISO-8859-15 \
|
|
||||||
de_IT.UTF-8/UTF-8 \
|
|
||||||
de_IT/ISO-8859-1 \
|
|
||||||
de_LI.UTF-8/UTF-8 \
|
|
||||||
de_LU.UTF-8/UTF-8 \
|
|
||||||
de_LU/ISO-8859-1 \
|
|
||||||
de_LU@euro/ISO-8859-15 \
|
|
||||||
doi_IN/UTF-8 \
|
|
||||||
dsb_DE/UTF-8 \
|
|
||||||
dv_MV/UTF-8 \
|
|
||||||
dz_BT/UTF-8 \
|
|
||||||
el_GR.UTF-8/UTF-8 \
|
|
||||||
el_GR/ISO-8859-7 \
|
|
||||||
el_GR@euro/ISO-8859-7 \
|
|
||||||
el_CY.UTF-8/UTF-8 \
|
|
||||||
el_CY/ISO-8859-7 \
|
|
||||||
en_AG/UTF-8 \
|
|
||||||
en_AU.UTF-8/UTF-8 \
|
|
||||||
en_AU/ISO-8859-1 \
|
|
||||||
en_BW.UTF-8/UTF-8 \
|
|
||||||
en_BW/ISO-8859-1 \
|
|
||||||
en_CA.UTF-8/UTF-8 \
|
|
||||||
en_CA/ISO-8859-1 \
|
|
||||||
en_DK.UTF-8/UTF-8 \
|
|
||||||
en_DK/ISO-8859-1 \
|
|
||||||
en_GB.UTF-8/UTF-8 \
|
|
||||||
en_GB/ISO-8859-1 \
|
|
||||||
en_GB.ISO-8859-15/ISO-8859-15 \
|
|
||||||
en_HK.UTF-8/UTF-8 \
|
|
||||||
en_HK/ISO-8859-1 \
|
|
||||||
en_IE.UTF-8/UTF-8 \
|
|
||||||
en_IE/ISO-8859-1 \
|
|
||||||
en_IE@euro/ISO-8859-15 \
|
|
||||||
en_IL/UTF-8 \
|
|
||||||
en_IN/UTF-8 \
|
|
||||||
en_NG/UTF-8 \
|
|
||||||
en_NZ.UTF-8/UTF-8 \
|
|
||||||
en_NZ/ISO-8859-1 \
|
|
||||||
en_PH.UTF-8/UTF-8 \
|
|
||||||
en_PH/ISO-8859-1 \
|
|
||||||
en_SC.UTF-8/UTF-8 \
|
|
||||||
en_SG.UTF-8/UTF-8 \
|
|
||||||
en_SG/ISO-8859-1 \
|
|
||||||
en_US.UTF-8/UTF-8 \
|
|
||||||
en_US/ISO-8859-1 \
|
|
||||||
en_US.ISO-8859-15/ISO-8859-15 \
|
|
||||||
en_US@ampm/UTF-8 \
|
|
||||||
en_US.UTF-8@ampm/UTF-8 \
|
|
||||||
en_ZA.UTF-8/UTF-8 \
|
|
||||||
en_ZA/ISO-8859-1 \
|
|
||||||
en_ZM/UTF-8 \
|
|
||||||
en_ZW.UTF-8/UTF-8 \
|
|
||||||
en_ZW/ISO-8859-1 \
|
|
||||||
eo/UTF-8 \
|
|
||||||
es_AR.UTF-8/UTF-8 \
|
|
||||||
es_AR/ISO-8859-1 \
|
|
||||||
es_BO.UTF-8/UTF-8 \
|
|
||||||
es_BO/ISO-8859-1 \
|
|
||||||
es_CL.UTF-8/UTF-8 \
|
|
||||||
es_CL/ISO-8859-1 \
|
|
||||||
es_CO.UTF-8/UTF-8 \
|
|
||||||
es_CO/ISO-8859-1 \
|
|
||||||
es_CR.UTF-8/UTF-8 \
|
|
||||||
es_CR/ISO-8859-1 \
|
|
||||||
es_CU/UTF-8 \
|
|
||||||
es_DO.UTF-8/UTF-8 \
|
|
||||||
es_DO/ISO-8859-1 \
|
|
||||||
es_EC.UTF-8/UTF-8 \
|
|
||||||
es_EC/ISO-8859-1 \
|
|
||||||
es_ES.UTF-8/UTF-8 \
|
|
||||||
es_ES/ISO-8859-1 \
|
|
||||||
es_ES@euro/ISO-8859-15 \
|
|
||||||
es_GT.UTF-8/UTF-8 \
|
|
||||||
es_GT/ISO-8859-1 \
|
|
||||||
es_HN.UTF-8/UTF-8 \
|
|
||||||
es_HN/ISO-8859-1 \
|
|
||||||
es_MX.UTF-8/UTF-8 \
|
|
||||||
es_MX/ISO-8859-1 \
|
|
||||||
es_NI.UTF-8/UTF-8 \
|
|
||||||
es_NI/ISO-8859-1 \
|
|
||||||
es_PA.UTF-8/UTF-8 \
|
|
||||||
es_PA/ISO-8859-1 \
|
|
||||||
es_PE.UTF-8/UTF-8 \
|
|
||||||
es_PE/ISO-8859-1 \
|
|
||||||
es_PR.UTF-8/UTF-8 \
|
|
||||||
es_PR/ISO-8859-1 \
|
|
||||||
es_PY.UTF-8/UTF-8 \
|
|
||||||
es_PY/ISO-8859-1 \
|
|
||||||
es_SV.UTF-8/UTF-8 \
|
|
||||||
es_SV/ISO-8859-1 \
|
|
||||||
es_US.UTF-8/UTF-8 \
|
|
||||||
es_US/ISO-8859-1 \
|
|
||||||
es_UY.UTF-8/UTF-8 \
|
|
||||||
es_UY/ISO-8859-1 \
|
|
||||||
es_VE.UTF-8/UTF-8 \
|
|
||||||
es_VE/ISO-8859-1 \
|
|
||||||
et_EE.UTF-8/UTF-8 \
|
|
||||||
et_EE/ISO-8859-1 \
|
|
||||||
et_EE.ISO-8859-15/ISO-8859-15 \
|
|
||||||
eu_ES.UTF-8/UTF-8 \
|
|
||||||
eu_ES/ISO-8859-1 \
|
|
||||||
eu_ES@euro/ISO-8859-15 \
|
|
||||||
fa_IR/UTF-8 \
|
|
||||||
ff_SN/UTF-8 \
|
|
||||||
fi_FI.UTF-8/UTF-8 \
|
|
||||||
fi_FI/ISO-8859-1 \
|
|
||||||
fi_FI@euro/ISO-8859-15 \
|
|
||||||
fil_PH/UTF-8 \
|
|
||||||
fo_FO.UTF-8/UTF-8 \
|
|
||||||
fo_FO/ISO-8859-1 \
|
|
||||||
fr_BE.UTF-8/UTF-8 \
|
|
||||||
fr_BE/ISO-8859-1 \
|
|
||||||
fr_BE@euro/ISO-8859-15 \
|
|
||||||
fr_CA.UTF-8/UTF-8 \
|
|
||||||
fr_CA/ISO-8859-1 \
|
|
||||||
fr_CH.UTF-8/UTF-8 \
|
|
||||||
fr_CH/ISO-8859-1 \
|
|
||||||
fr_FR.UTF-8/UTF-8 \
|
|
||||||
fr_FR/ISO-8859-1 \
|
|
||||||
fr_FR@euro/ISO-8859-15 \
|
|
||||||
fr_LU.UTF-8/UTF-8 \
|
|
||||||
fr_LU/ISO-8859-1 \
|
|
||||||
fr_LU@euro/ISO-8859-15 \
|
|
||||||
fur_IT/UTF-8 \
|
|
||||||
fy_NL/UTF-8 \
|
|
||||||
fy_DE/UTF-8 \
|
|
||||||
ga_IE.UTF-8/UTF-8 \
|
|
||||||
ga_IE/ISO-8859-1 \
|
|
||||||
ga_IE@euro/ISO-8859-15 \
|
|
||||||
gd_GB.UTF-8/UTF-8 \
|
|
||||||
gd_GB/ISO-8859-15 \
|
|
||||||
gez_ER/UTF-8 \
|
|
||||||
gez_ER@abegede/UTF-8 \
|
|
||||||
gez_ET/UTF-8 \
|
|
||||||
gez_ET@abegede/UTF-8 \
|
|
||||||
gl_ES.UTF-8/UTF-8 \
|
|
||||||
gl_ES/ISO-8859-1 \
|
|
||||||
gl_ES@euro/ISO-8859-15 \
|
|
||||||
gu_IN/UTF-8 \
|
|
||||||
gv_GB.UTF-8/UTF-8 \
|
|
||||||
gv_GB/ISO-8859-1 \
|
|
||||||
ha_NG/UTF-8 \
|
|
||||||
hak_TW/UTF-8 \
|
|
||||||
he_IL.UTF-8/UTF-8 \
|
|
||||||
he_IL/ISO-8859-8 \
|
|
||||||
hi_IN/UTF-8 \
|
|
||||||
hif_FJ/UTF-8 \
|
|
||||||
hne_IN/UTF-8 \
|
|
||||||
hr_HR.UTF-8/UTF-8 \
|
|
||||||
hr_HR/ISO-8859-2 \
|
|
||||||
hsb_DE/ISO-8859-2 \
|
|
||||||
hsb_DE.UTF-8/UTF-8 \
|
|
||||||
ht_HT/UTF-8 \
|
|
||||||
hu_HU.UTF-8/UTF-8 \
|
|
||||||
hu_HU/ISO-8859-2 \
|
|
||||||
hy_AM/UTF-8 \
|
|
||||||
hy_AM.ARMSCII-8/ARMSCII-8 \
|
|
||||||
ia_FR/UTF-8 \
|
|
||||||
id_ID.UTF-8/UTF-8 \
|
|
||||||
id_ID/ISO-8859-1 \
|
|
||||||
ig_NG/UTF-8 \
|
|
||||||
ik_CA/UTF-8 \
|
|
||||||
is_IS.UTF-8/UTF-8 \
|
|
||||||
is_IS/ISO-8859-1 \
|
|
||||||
it_CH.UTF-8/UTF-8 \
|
|
||||||
it_CH/ISO-8859-1 \
|
|
||||||
it_IT.UTF-8/UTF-8 \
|
|
||||||
it_IT/ISO-8859-1 \
|
|
||||||
it_IT@euro/ISO-8859-15 \
|
|
||||||
iu_CA/UTF-8 \
|
|
||||||
ja_JP.EUC-JP/EUC-JP \
|
|
||||||
ja_JP.UTF-8/UTF-8 \
|
|
||||||
ka_GE.UTF-8/UTF-8 \
|
|
||||||
ka_GE/GEORGIAN-PS \
|
|
||||||
kab_DZ/UTF-8 \
|
|
||||||
kk_KZ.UTF-8/UTF-8 \
|
|
||||||
kk_KZ/PT154 \
|
|
||||||
kl_GL.UTF-8/UTF-8 \
|
|
||||||
kl_GL/ISO-8859-1 \
|
|
||||||
km_KH/UTF-8 \
|
|
||||||
kn_IN/UTF-8 \
|
|
||||||
ko_KR.EUC-KR/EUC-KR \
|
|
||||||
ko_KR.UTF-8/UTF-8 \
|
|
||||||
kok_IN/UTF-8 \
|
|
||||||
ks_IN/UTF-8 \
|
|
||||||
ks_IN@devanagari/UTF-8 \
|
|
||||||
ku_TR.UTF-8/UTF-8 \
|
|
||||||
ku_TR/ISO-8859-9 \
|
|
||||||
kw_GB.UTF-8/UTF-8 \
|
|
||||||
kw_GB/ISO-8859-1 \
|
|
||||||
ky_KG/UTF-8 \
|
|
||||||
lb_LU/UTF-8 \
|
|
||||||
lg_UG.UTF-8/UTF-8 \
|
|
||||||
lg_UG/ISO-8859-10 \
|
|
||||||
li_BE/UTF-8 \
|
|
||||||
li_NL/UTF-8 \
|
|
||||||
lij_IT/UTF-8 \
|
|
||||||
ln_CD/UTF-8 \
|
|
||||||
lo_LA/UTF-8 \
|
|
||||||
lt_LT.UTF-8/UTF-8 \
|
|
||||||
lt_LT/ISO-8859-13 \
|
|
||||||
lv_LV.UTF-8/UTF-8 \
|
|
||||||
lv_LV/ISO-8859-13 \
|
|
||||||
lzh_TW/UTF-8 \
|
|
||||||
mag_IN/UTF-8 \
|
|
||||||
mai_IN/UTF-8 \
|
|
||||||
mai_NP/UTF-8 \
|
|
||||||
mfe_MU/UTF-8 \
|
|
||||||
mg_MG.UTF-8/UTF-8 \
|
|
||||||
mg_MG/ISO-8859-15 \
|
|
||||||
mhr_RU/UTF-8 \
|
|
||||||
mi_NZ.UTF-8/UTF-8 \
|
|
||||||
mi_NZ/ISO-8859-13 \
|
|
||||||
miq_NI/UTF-8 \
|
|
||||||
mjw_IN/UTF-8 \
|
|
||||||
mk_MK.UTF-8/UTF-8 \
|
|
||||||
mk_MK/ISO-8859-5 \
|
|
||||||
ml_IN/UTF-8 \
|
|
||||||
mn_MN/UTF-8 \
|
|
||||||
mni_IN/UTF-8 \
|
|
||||||
mr_IN/UTF-8 \
|
|
||||||
ms_MY.UTF-8/UTF-8 \
|
|
||||||
ms_MY/ISO-8859-1 \
|
|
||||||
mt_MT.UTF-8/UTF-8 \
|
|
||||||
mt_MT/ISO-8859-3 \
|
|
||||||
my_MM/UTF-8 \
|
|
||||||
nan_TW/UTF-8 \
|
|
||||||
nan_TW@latin/UTF-8 \
|
|
||||||
nb_NO.UTF-8/UTF-8 \
|
|
||||||
nb_NO/ISO-8859-1 \
|
|
||||||
nds_DE/UTF-8 \
|
|
||||||
nds_NL/UTF-8 \
|
|
||||||
ne_NP/UTF-8 \
|
|
||||||
nhn_MX/UTF-8 \
|
|
||||||
niu_NU/UTF-8 \
|
|
||||||
niu_NZ/UTF-8 \
|
|
||||||
nl_AW/UTF-8 \
|
|
||||||
nl_BE.UTF-8/UTF-8 \
|
|
||||||
nl_BE/ISO-8859-1 \
|
|
||||||
nl_BE@euro/ISO-8859-15 \
|
|
||||||
nl_NL.UTF-8/UTF-8 \
|
|
||||||
nl_NL/ISO-8859-1 \
|
|
||||||
nl_NL@euro/ISO-8859-15 \
|
|
||||||
nn_NO.UTF-8/UTF-8 \
|
|
||||||
nn_NO/ISO-8859-1 \
|
|
||||||
nr_ZA/UTF-8 \
|
|
||||||
nso_ZA/UTF-8 \
|
|
||||||
oc_FR.UTF-8/UTF-8 \
|
|
||||||
oc_FR/ISO-8859-1 \
|
|
||||||
om_ET/UTF-8 \
|
|
||||||
om_KE.UTF-8/UTF-8 \
|
|
||||||
om_KE/ISO-8859-1 \
|
|
||||||
or_IN/UTF-8 \
|
|
||||||
os_RU/UTF-8 \
|
|
||||||
pa_IN/UTF-8 \
|
|
||||||
pa_PK/UTF-8 \
|
|
||||||
pap_AW/UTF-8 \
|
|
||||||
pap_CW/UTF-8 \
|
|
||||||
pl_PL.UTF-8/UTF-8 \
|
|
||||||
pl_PL/ISO-8859-2 \
|
|
||||||
ps_AF/UTF-8 \
|
|
||||||
pt_BR.UTF-8/UTF-8 \
|
|
||||||
pt_BR/ISO-8859-1 \
|
|
||||||
pt_PT.UTF-8/UTF-8 \
|
|
||||||
pt_PT/ISO-8859-1 \
|
|
||||||
pt_PT@euro/ISO-8859-15 \
|
|
||||||
quz_PE/UTF-8 \
|
|
||||||
raj_IN/UTF-8 \
|
|
||||||
ro_RO.UTF-8/UTF-8 \
|
|
||||||
ro_RO/ISO-8859-2 \
|
|
||||||
ru_RU.KOI8-R/KOI8-R \
|
|
||||||
ru_RU.UTF-8/UTF-8 \
|
|
||||||
ru_RU/ISO-8859-5 \
|
|
||||||
ru_UA.UTF-8/UTF-8 \
|
|
||||||
ru_UA/KOI8-U \
|
|
||||||
rw_RW/UTF-8 \
|
|
||||||
sa_IN/UTF-8 \
|
|
||||||
sah_RU/UTF-8 \
|
|
||||||
sat_IN/UTF-8 \
|
|
||||||
sc_IT/UTF-8 \
|
|
||||||
sd_IN/UTF-8 \
|
|
||||||
sd_IN@devanagari/UTF-8 \
|
|
||||||
se_NO/UTF-8 \
|
|
||||||
sgs_LT/UTF-8 \
|
|
||||||
shn_MM/UTF-8 \
|
|
||||||
shs_CA/UTF-8 \
|
|
||||||
si_LK/UTF-8 \
|
|
||||||
sid_ET/UTF-8 \
|
|
||||||
sk_SK.UTF-8/UTF-8 \
|
|
||||||
sk_SK/ISO-8859-2 \
|
|
||||||
sl_SI.UTF-8/UTF-8 \
|
|
||||||
sl_SI/ISO-8859-2 \
|
|
||||||
sm_WS/UTF-8 \
|
|
||||||
so_DJ.UTF-8/UTF-8 \
|
|
||||||
so_DJ/ISO-8859-1 \
|
|
||||||
so_ET/UTF-8 \
|
|
||||||
so_KE.UTF-8/UTF-8 \
|
|
||||||
so_KE/ISO-8859-1 \
|
|
||||||
so_SO.UTF-8/UTF-8 \
|
|
||||||
so_SO/ISO-8859-1 \
|
|
||||||
sq_AL.UTF-8/UTF-8 \
|
|
||||||
sq_AL/ISO-8859-1 \
|
|
||||||
sq_MK/UTF-8 \
|
|
||||||
sr_ME/UTF-8 \
|
|
||||||
sr_RS/UTF-8 \
|
|
||||||
sr_RS@latin/UTF-8 \
|
|
||||||
ss_ZA/UTF-8 \
|
|
||||||
st_ZA.UTF-8/UTF-8 \
|
|
||||||
st_ZA/ISO-8859-1 \
|
|
||||||
sv_FI.UTF-8/UTF-8 \
|
|
||||||
sv_FI/ISO-8859-1 \
|
|
||||||
sv_FI@euro/ISO-8859-15 \
|
|
||||||
sv_SE.UTF-8/UTF-8 \
|
|
||||||
sv_SE/ISO-8859-1 \
|
|
||||||
sv_SE.ISO-8859-15/ISO-8859-15 \
|
|
||||||
sw_KE/UTF-8 \
|
|
||||||
sw_TZ/UTF-8 \
|
|
||||||
szl_PL/UTF-8 \
|
|
||||||
ta_IN/UTF-8 \
|
|
||||||
ta_LK/UTF-8 \
|
|
||||||
tcy_IN.UTF-8/UTF-8 \
|
|
||||||
te_IN/UTF-8 \
|
|
||||||
tg_TJ.UTF-8/UTF-8 \
|
|
||||||
tg_TJ/KOI8-T \
|
|
||||||
th_TH.UTF-8/UTF-8 \
|
|
||||||
th_TH/TIS-620 \
|
|
||||||
the_NP/UTF-8 \
|
|
||||||
ti_ER/UTF-8 \
|
|
||||||
ti_ET/UTF-8 \
|
|
||||||
tig_ER/UTF-8 \
|
|
||||||
tk_TM/UTF-8 \
|
|
||||||
tl_PH.UTF-8/UTF-8 \
|
|
||||||
tl_PH/ISO-8859-1 \
|
|
||||||
tn_ZA/UTF-8 \
|
|
||||||
to_TO/UTF-8 \
|
|
||||||
tpi_PG/UTF-8 \
|
|
||||||
tr_CY.UTF-8/UTF-8 \
|
|
||||||
tr_CY/ISO-8859-9 \
|
|
||||||
tr_TR.UTF-8/UTF-8 \
|
|
||||||
tr_TR/ISO-8859-9 \
|
|
||||||
ts_ZA/UTF-8 \
|
|
||||||
tt_RU/UTF-8 \
|
|
||||||
tt_RU@iqtelif/UTF-8 \
|
|
||||||
ug_CN/UTF-8 \
|
|
||||||
uk_UA.UTF-8/UTF-8 \
|
|
||||||
uk_UA/KOI8-U \
|
|
||||||
unm_US/UTF-8 \
|
|
||||||
ur_IN/UTF-8 \
|
|
||||||
ur_PK/UTF-8 \
|
|
||||||
uz_UZ.UTF-8/UTF-8 \
|
|
||||||
uz_UZ/ISO-8859-1 \
|
|
||||||
uz_UZ@cyrillic/UTF-8 \
|
|
||||||
ve_ZA/UTF-8 \
|
|
||||||
vi_VN/UTF-8 \
|
|
||||||
wa_BE/ISO-8859-1 \
|
|
||||||
wa_BE@euro/ISO-8859-15 \
|
|
||||||
wa_BE.UTF-8/UTF-8 \
|
|
||||||
wae_CH/UTF-8 \
|
|
||||||
wal_ET/UTF-8 \
|
|
||||||
wo_SN/UTF-8 \
|
|
||||||
xh_ZA.UTF-8/UTF-8 \
|
|
||||||
xh_ZA/ISO-8859-1 \
|
|
||||||
yi_US.UTF-8/UTF-8 \
|
|
||||||
yi_US/CP1255 \
|
|
||||||
yo_NG/UTF-8 \
|
|
||||||
yue_HK/UTF-8 \
|
|
||||||
yuw_PG/UTF-8 \
|
|
||||||
zh_CN.GB18030/GB18030 \
|
|
||||||
zh_CN.GBK/GBK \
|
|
||||||
zh_CN.UTF-8/UTF-8 \
|
|
||||||
zh_CN/GB2312 \
|
|
||||||
zh_HK.UTF-8/UTF-8 \
|
|
||||||
zh_HK/BIG5-HKSCS \
|
|
||||||
zh_SG.UTF-8/UTF-8 \
|
|
||||||
zh_SG.GBK/GBK \
|
|
||||||
zh_SG/GB2312 \
|
|
||||||
zh_TW.EUC-TW/EUC-TW \
|
|
||||||
zh_TW.UTF-8/UTF-8 \
|
|
||||||
zh_TW/BIG5 \
|
|
||||||
zu_ZA.UTF-8/UTF-8 \
|
|
||||||
zu_ZA/ISO-8859-1 \
|
|
@ -1,862 +0,0 @@
|
|||||||
#define _GNU_SOURCE
|
|
||||||
#include <assert.h>
|
|
||||||
#include <dirent.h>
|
|
||||||
#include <errno.h>
|
|
||||||
#include <fcntl.h>
|
|
||||||
#include <locale.h>
|
|
||||||
#include <stdarg.h>
|
|
||||||
#include <stdbool.h>
|
|
||||||
#include <stdio.h>
|
|
||||||
#include <stdlib.h>
|
|
||||||
#include <getopt.h>
|
|
||||||
#include <string.h>
|
|
||||||
#include <sys/mman.h>
|
|
||||||
#include <sys/stat.h>
|
|
||||||
#include <unistd.h>
|
|
||||||
#include "../locale/hashval.h"
|
|
||||||
#define __LC_LAST 13
|
|
||||||
#include "../locale/locarchive.h"
|
|
||||||
#include "../crypt/md5.h"
|
|
||||||
|
|
||||||
const char *alias_file = DATADIR "/locale/locale.alias";
|
|
||||||
const char *locar_file = PREFIX "/lib/locale/locale-archive";
|
|
||||||
const char *tmpl_file = PREFIX "/lib/locale/locale-archive.tmpl";
|
|
||||||
const char *loc_path = PREFIX "/lib/locale/";
|
|
||||||
/* Flags set by `--verbose` option. */
|
|
||||||
int be_quiet = 1;
|
|
||||||
int verbose = 0;
|
|
||||||
int max_locarchive_open_retry = 10;
|
|
||||||
const char *output_prefix;
|
|
||||||
|
|
||||||
/* Endianness should have been taken care of by localedef. We don't need to do
|
|
||||||
additional swapping. We need this variable exported however, since
|
|
||||||
locarchive.c uses it to determine if it needs to swap endianness of a value
|
|
||||||
before writing to or reading from the archive. */
|
|
||||||
bool swap_endianness_p = false;
|
|
||||||
|
|
||||||
/* Table of category names indexed by category value.  Each
   DEFINE_CATEGORY entry pulled in from categories.def expands to one
   designated initializer; the last two macro arguments are unused.  */
static const char *locnames[] =
  {
#define DEFINE_CATEGORY(cat, cat_name, unused_items, unused_a) \
    [cat] = cat_name,
#include "../locale/categories.def"
#undef DEFINE_CATEGORY
  };
|
|
||||||
|
|
||||||
/* Trial-division primality check.

   Precondition: CANDIDATE is odd and not less than 10; the result is
   not meaningful for other inputs.  Returns nonzero if no odd divisor
   whose square is below CANDIDATE (nor the first divisor whose square
   reaches it) divides CANDIDATE, zero otherwise.  */
static int
is_prime (unsigned long candidate)
{
  unsigned long int divisor = 3;
  unsigned long int square = 9;	/* Always divisor * divisor.  */

  for (;;)
    {
      if (square >= candidate)
	break;
      if (candidate % divisor == 0)
	return 0;

      /* Step to the next odd divisor, updating the square
	 incrementally: (d + 2)^2 = d^2 + 4 * (d + 1).  */
      square += 4 * (divisor + 1);
      divisor += 2;
    }

  return candidate % divisor != 0;
}
|
|
||||||
|
|
||||||
/* Return the first value accepted by is_prime, starting at SEED forced
   odd and stepping over odd numbers only.  */
unsigned long
next_prime (unsigned long seed)
{
  unsigned long candidate;

  /* OR-ing in the low bit makes the starting point odd.  */
  for (candidate = seed | 1; !is_prime (candidate); candidate += 2)
    continue;

  return candidate;
}
|
|
||||||
|
|
||||||
/* Print a diagnostic to stderr in the form
   "<program>: <formatted message>[: <strerror (ERRNUM)>]\n".

   stdout is flushed first.  When STATUS is nonzero the process exits
   afterwards; the exit code is STATUS, except that an ERRNUM of EROFS
   turns it into 0.  When STATUS is zero the function returns.  */
void
error (int status, int errnum, const char *message, ...)
{
  va_list ap;

  fflush (stdout);
  fprintf (stderr, "%s: ", program_invocation_name);

  va_start (ap, message);
  vfprintf (stderr, message, ap);
  va_end (ap);

  if (errnum != 0)
    fprintf (stderr, ": %s", strerror (errnum));
  putc ('\n', stderr);
  fflush (stderr);

  if (status == 0)
    return;

  /* A read-only file system is deliberately reported as success.  */
  exit (errnum == EROFS ? 0 : status);
}
|
|
||||||
|
|
||||||
/* Allocate SIZE bytes with malloc.

   On allocation failure, report the error through error () with
   EXIT_FAILURE and the current errno.  Returns the pointer obtained
   from malloc.  */
void *
xmalloc (size_t size)
{
  void *p = malloc (size);
  if (p == NULL)
    /* SIZE is a size_t (unsigned), so it must be printed with %zu;
       %zd is the conversion for the corresponding signed type.  */
    error (EXIT_FAILURE, errno, "could not allocate %zu bytes of memory", size);
  return p;
}
|
|
||||||
|
|
||||||
static void
|
|
||||||
open_tmpl_archive (struct locarhandle *ah)
|
|
||||||
{
|
|
||||||
struct stat64 st;
|
|
||||||
int fd;
|
|
||||||
struct locarhead head;
|
|
||||||
const char *archivefname = ah->fname == NULL ? tmpl_file : ah->fname;
|
|
||||||
|
|
||||||
/* Open the archive. We must have exclusive write access. */
|
|
||||||
fd = open64 (archivefname, O_RDONLY);
|
|
||||||
if (fd == -1)
|
|
||||||
error (EXIT_FAILURE, errno, "cannot open locale archive template file \"%s\"",
|
|
||||||
archivefname);
|
|
||||||
|
|
||||||
if (fstat64 (fd, &st) < 0)
|
|
||||||
error (EXIT_FAILURE, errno, "cannot stat locale archive template file \"%s\"",
|
|
||||||
archivefname);
|
|
||||||
|
|
||||||
/* Read the header. */
|
|
||||||
if (TEMP_FAILURE_RETRY (read (fd, &head, sizeof (head))) != sizeof (head))
|
|
||||||
error (EXIT_FAILURE, errno, "cannot read archive header");
|
|
||||||
|
|
||||||
ah->fd = fd;
|
|
||||||
ah->mmaped = (head.sumhash_offset
|
|
||||||
+ head.sumhash_size * sizeof (struct sumhashent));
|
|
||||||
if (ah->mmaped > (unsigned long) st.st_size)
|
|
||||||
error (EXIT_FAILURE, 0, "locale archive template file truncated");
|
|
||||||
ah->mmaped = st.st_size;
|
|
||||||
ah->reserved = st.st_size;
|
|
||||||
|
|
||||||
/* Now we know how large the administrative information part is.
|
|
||||||
Map all of it. */
|
|
||||||
ah->addr = mmap64 (NULL, ah->mmaped, PROT_READ, MAP_SHARED, fd, 0);
|
|
||||||
if (ah->addr == MAP_FAILED)
|
|
||||||
error (EXIT_FAILURE, errno, "cannot map archive header");
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Open the locale archive. */
|
|
||||||
extern void open_archive (struct locarhandle *ah, bool readonly);
|
|
||||||
|
|
||||||
/* Close the locale archive. */
|
|
||||||
extern void close_archive (struct locarhandle *ah);
|
|
||||||
|
|
||||||
/* Add given locale data to the archive. */
|
|
||||||
extern int add_locale_to_archive (struct locarhandle *ah, const char *name,
|
|
||||||
locale_data_t data, bool replace);
|
|
||||||
|
|
||||||
extern void add_alias (struct locarhandle *ah, const char *alias,
|
|
||||||
bool replace, const char *oldname,
|
|
||||||
uint32_t *locrec_offset_p);
|
|
||||||
|
|
||||||
extern struct namehashent *
|
|
||||||
insert_name (struct locarhandle *ah,
|
|
||||||
const char *name, size_t name_len, bool replace);
|
|
||||||
|
|
||||||
/* One locale name taken from the template's name hash table, paired
   with the locale record it refers to.  */
struct nameent
{
  char *name;			/* Locale name string.  */
  struct locrecent *locrec;	/* Record this name points at.  */
};
|
|
||||||
|
|
||||||
/* One entry taken from the template's checksum hash table: the
   checksum bytes and the file offset they belong to.  */
struct dataent
{
  const unsigned char *sum;	/* Checksum bytes.  */
  uint32_t file_offset;		/* Offset of the data in the archive.  */
};
|
|
||||||
|
|
||||||
static int
|
|
||||||
nameentcmp (const void *a, const void *b)
|
|
||||||
{
|
|
||||||
struct locrecent *la = ((const struct nameent *) a)->locrec;
|
|
||||||
struct locrecent *lb = ((const struct nameent *) b)->locrec;
|
|
||||||
uint32_t start_a = -1, end_a = 0;
|
|
||||||
uint32_t start_b = -1, end_b = 0;
|
|
||||||
int cnt;
|
|
||||||
|
|
||||||
for (cnt = 0; cnt < __LC_LAST; ++cnt)
|
|
||||||
if (cnt != LC_ALL)
|
|
||||||
{
|
|
||||||
if (la->record[cnt].offset < start_a)
|
|
||||||
start_a = la->record[cnt].offset;
|
|
||||||
if (la->record[cnt].offset + la->record[cnt].len > end_a)
|
|
||||||
end_a = la->record[cnt].offset + la->record[cnt].len;
|
|
||||||
}
|
|
||||||
assert (start_a != (uint32_t)-1);
|
|
||||||
assert (end_a != 0);
|
|
||||||
|
|
||||||
for (cnt = 0; cnt < __LC_LAST; ++cnt)
|
|
||||||
if (cnt != LC_ALL)
|
|
||||||
{
|
|
||||||
if (lb->record[cnt].offset < start_b)
|
|
||||||
start_b = lb->record[cnt].offset;
|
|
||||||
if (lb->record[cnt].offset + lb->record[cnt].len > end_b)
|
|
||||||
end_b = lb->record[cnt].offset + lb->record[cnt].len;
|
|
||||||
}
|
|
||||||
assert (start_b != (uint32_t)-1);
|
|
||||||
assert (end_b != 0);
|
|
||||||
|
|
||||||
if (start_a != start_b)
|
|
||||||
return (int)start_a - (int)start_b;
|
|
||||||
return (int)end_a - (int)end_b;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int
|
|
||||||
dataentcmp (const void *a, const void *b)
|
|
||||||
{
|
|
||||||
if (((const struct dataent *) a)->file_offset
|
|
||||||
< ((const struct dataent *) b)->file_offset)
|
|
||||||
return -1;
|
|
||||||
|
|
||||||
if (((const struct dataent *) a)->file_offset
|
|
||||||
> ((const struct dataent *) b)->file_offset)
|
|
||||||
return 1;
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int
|
|
||||||
sumsearchfn (const void *key, const void *ent)
|
|
||||||
{
|
|
||||||
uint32_t keyn = *(uint32_t *)key;
|
|
||||||
uint32_t entn = ((struct dataent *)ent)->file_offset;
|
|
||||||
|
|
||||||
if (keyn < entn)
|
|
||||||
return -1;
|
|
||||||
if (keyn > entn)
|
|
||||||
return 1;
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void
|
|
||||||
compute_data (struct locarhandle *ah, struct nameent *name, size_t sumused,
|
|
||||||
struct dataent *files, locale_data_t data)
|
|
||||||
{
|
|
||||||
int cnt;
|
|
||||||
struct locrecent *locrec = name->locrec;
|
|
||||||
struct dataent *file;
|
|
||||||
data[LC_ALL].addr = ((char *) ah->addr) + locrec->record[LC_ALL].offset;
|
|
||||||
data[LC_ALL].size = locrec->record[LC_ALL].len;
|
|
||||||
for (cnt = 0; cnt < __LC_LAST; ++cnt)
|
|
||||||
if (cnt != LC_ALL)
|
|
||||||
{
|
|
||||||
data[cnt].addr = ((char *) ah->addr) + locrec->record[cnt].offset;
|
|
||||||
data[cnt].size = locrec->record[cnt].len;
|
|
||||||
if (data[cnt].addr >= data[LC_ALL].addr
|
|
||||||
&& data[cnt].addr + data[cnt].size
|
|
||||||
<= data[LC_ALL].addr + data[LC_ALL].size)
|
|
||||||
__md5_buffer (data[cnt].addr, data[cnt].size, data[cnt].sum);
|
|
||||||
else
|
|
||||||
{
|
|
||||||
file = bsearch (&locrec->record[cnt].offset, files, sumused,
|
|
||||||
sizeof (*files), sumsearchfn);
|
|
||||||
if (file == NULL)
|
|
||||||
error (EXIT_FAILURE, 0, "inconsistent template file");
|
|
||||||
memcpy (data[cnt].sum, file->sum, sizeof (data[cnt].sum));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static int
|
|
||||||
fill_archive (struct locarhandle *tmpl_ah,
|
|
||||||
const char *fname,
|
|
||||||
size_t install_langs_count, char *install_langs_list[],
|
|
||||||
size_t nlist, char *list[],
|
|
||||||
const char *primary)
|
|
||||||
{
|
|
||||||
struct locarhandle ah;
|
|
||||||
struct locarhead *head;
|
|
||||||
int result = 0;
|
|
||||||
struct nameent *names;
|
|
||||||
struct namehashent *namehashtab;
|
|
||||||
size_t cnt, used;
|
|
||||||
struct dataent *files;
|
|
||||||
struct sumhashent *sumhashtab;
|
|
||||||
size_t sumused;
|
|
||||||
struct locrecent *primary_locrec = NULL;
|
|
||||||
struct nameent *primary_nameent = NULL;
|
|
||||||
|
|
||||||
head = tmpl_ah->addr;
|
|
||||||
names = (struct nameent *) malloc (head->namehash_used
|
|
||||||
* sizeof (struct nameent));
|
|
||||||
files = (struct dataent *) malloc (head->sumhash_used
|
|
||||||
* sizeof (struct dataent));
|
|
||||||
if (names == NULL || files == NULL)
|
|
||||||
error (EXIT_FAILURE, errno, "could not allocate tables");
|
|
||||||
|
|
||||||
namehashtab = (struct namehashent *) ((char *) tmpl_ah->addr
|
|
||||||
+ head->namehash_offset);
|
|
||||||
sumhashtab = (struct sumhashent *) ((char *) tmpl_ah->addr
|
|
||||||
+ head->sumhash_offset);
|
|
||||||
|
|
||||||
for (cnt = used = 0; cnt < head->namehash_size; ++cnt)
|
|
||||||
if (namehashtab[cnt].locrec_offset != 0)
|
|
||||||
{
|
|
||||||
char * name;
|
|
||||||
int i;
|
|
||||||
assert (used < head->namehash_used);
|
|
||||||
name = tmpl_ah->addr + namehashtab[cnt].name_offset;
|
|
||||||
if (install_langs_count == 0)
|
|
||||||
{
|
|
||||||
/* Always install the entry. */
|
|
||||||
names[used].name = name;
|
|
||||||
names[used++].locrec
|
|
||||||
= (struct locrecent *) ((char *) tmpl_ah->addr +
|
|
||||||
namehashtab[cnt].locrec_offset);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
/* Only install the entry if the user asked for it via
|
|
||||||
--install-langs. */
|
|
||||||
for (i = 0; i < install_langs_count; i++)
|
|
||||||
{
|
|
||||||
/* Add one for "_" and one for the null terminator. */
|
|
||||||
size_t len = strlen (install_langs_list[i]) + 2;
|
|
||||||
char *install_lang = (char *)xmalloc (len);
|
|
||||||
strcpy (install_lang, install_langs_list[i]);
|
|
||||||
if (strchr (install_lang, '_') == NULL)
|
|
||||||
strcat (install_lang, "_");
|
|
||||||
if (strncmp (name, install_lang, strlen (install_lang)) == 0)
|
|
||||||
{
|
|
||||||
names[used].name = name;
|
|
||||||
names[used++].locrec
|
|
||||||
= (struct locrecent *) ((char *)tmpl_ah->addr
|
|
||||||
+ namehashtab[cnt].locrec_offset);
|
|
||||||
}
|
|
||||||
free (install_lang);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Sort the names. */
|
|
||||||
qsort (names, used, sizeof (struct nameent), nameentcmp);
|
|
||||||
|
|
||||||
for (cnt = sumused = 0; cnt < head->sumhash_size; ++cnt)
|
|
||||||
if (sumhashtab[cnt].file_offset != 0)
|
|
||||||
{
|
|
||||||
assert (sumused < head->sumhash_used);
|
|
||||||
files[sumused].sum = (const unsigned char *) sumhashtab[cnt].sum;
|
|
||||||
files[sumused++].file_offset = sumhashtab[cnt].file_offset;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Sort by file locations. */
|
|
||||||
qsort (files, sumused, sizeof (struct dataent), dataentcmp);
|
|
||||||
|
|
||||||
/* Open the archive. This call never returns if we cannot
|
|
||||||
successfully open the archive. */
|
|
||||||
ah.fname = NULL;
|
|
||||||
if (fname != NULL)
|
|
||||||
ah.fname = fname;
|
|
||||||
open_archive (&ah, false);
|
|
||||||
|
|
||||||
if (primary != NULL)
|
|
||||||
{
|
|
||||||
for (cnt = 0; cnt < used; ++cnt)
|
|
||||||
if (strcmp (names[cnt].name, primary) == 0)
|
|
||||||
break;
|
|
||||||
if (cnt < used)
|
|
||||||
{
|
|
||||||
locale_data_t data;
|
|
||||||
|
|
||||||
compute_data (tmpl_ah, &names[cnt], sumused, files, data);
|
|
||||||
result |= add_locale_to_archive (&ah, primary, data, 0);
|
|
||||||
primary_locrec = names[cnt].locrec;
|
|
||||||
primary_nameent = &names[cnt];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (cnt = 0; cnt < used; ++cnt)
|
|
||||||
if (&names[cnt] == primary_nameent)
|
|
||||||
continue;
|
|
||||||
else if ((cnt > 0 && names[cnt - 1].locrec == names[cnt].locrec)
|
|
||||||
|| names[cnt].locrec == primary_locrec)
|
|
||||||
{
|
|
||||||
const char *oldname;
|
|
||||||
struct namehashent *namehashent;
|
|
||||||
uint32_t locrec_offset;
|
|
||||||
|
|
||||||
if (names[cnt].locrec == primary_locrec)
|
|
||||||
oldname = primary;
|
|
||||||
else
|
|
||||||
oldname = names[cnt - 1].name;
|
|
||||||
namehashent = insert_name (&ah, oldname, strlen (oldname), true);
|
|
||||||
assert (namehashent->name_offset != 0);
|
|
||||||
assert (namehashent->locrec_offset != 0);
|
|
||||||
locrec_offset = namehashent->locrec_offset;
|
|
||||||
add_alias (&ah, names[cnt].name, 0, oldname, &locrec_offset);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
locale_data_t data;
|
|
||||||
|
|
||||||
compute_data (tmpl_ah, &names[cnt], sumused, files, data);
|
|
||||||
result |= add_locale_to_archive (&ah, names[cnt].name, data, 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
while (nlist-- > 0)
|
|
||||||
{
|
|
||||||
const char *fname = *list++;
|
|
||||||
size_t fnamelen = strlen (fname);
|
|
||||||
struct stat64 st;
|
|
||||||
DIR *dirp;
|
|
||||||
struct dirent64 *d;
|
|
||||||
int seen;
|
|
||||||
locale_data_t data;
|
|
||||||
int cnt;
|
|
||||||
|
|
||||||
/* First see whether this really is a directory and whether it
|
|
||||||
contains all the required locale category files. */
|
|
||||||
if (stat64 (fname, &st) < 0)
|
|
||||||
{
|
|
||||||
error (0, 0, "stat of \"%s\" failed: %s: ignored", fname,
|
|
||||||
strerror (errno));
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (!S_ISDIR (st.st_mode))
|
|
||||||
{
|
|
||||||
error (0, 0, "\"%s\" is no directory; ignored", fname);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
dirp = opendir (fname);
|
|
||||||
if (dirp == NULL)
|
|
||||||
{
|
|
||||||
error (0, 0, "cannot open directory \"%s\": %s: ignored",
|
|
||||||
fname, strerror (errno));
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
seen = 0;
|
|
||||||
while ((d = readdir64 (dirp)) != NULL)
|
|
||||||
{
|
|
||||||
for (cnt = 0; cnt < __LC_LAST; ++cnt)
|
|
||||||
if (cnt != LC_ALL)
|
|
||||||
if (strcmp (d->d_name, locnames[cnt]) == 0)
|
|
||||||
{
|
|
||||||
unsigned char d_type;
|
|
||||||
|
|
||||||
/* We have an object of the required name. If it's
|
|
||||||
a directory we have to look at a file with the
|
|
||||||
prefix "SYS_". Otherwise we have found what we
|
|
||||||
are looking for. */
|
|
||||||
#ifdef _DIRENT_HAVE_D_TYPE
|
|
||||||
d_type = d->d_type;
|
|
||||||
|
|
||||||
if (d_type != DT_REG)
|
|
||||||
#endif
|
|
||||||
{
|
|
||||||
char fullname[fnamelen + 2 * strlen (d->d_name) + 7];
|
|
||||||
|
|
||||||
#ifdef _DIRENT_HAVE_D_TYPE
|
|
||||||
if (d_type == DT_UNKNOWN || d_type == DT_LNK)
|
|
||||||
#endif
|
|
||||||
{
|
|
||||||
strcpy (stpcpy (stpcpy (fullname, fname), "/"),
|
|
||||||
d->d_name);
|
|
||||||
|
|
||||||
if (stat64 (fullname, &st) == -1)
|
|
||||||
/* We cannot stat the file, ignore it. */
|
|
||||||
break;
|
|
||||||
|
|
||||||
d_type = IFTODT (st.st_mode);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (d_type == DT_DIR)
|
|
||||||
{
|
|
||||||
/* We have to do more tests. The file is a
|
|
||||||
directory and it therefore must contain a
|
|
||||||
regular file with the same name except a
|
|
||||||
"SYS_" prefix. */
|
|
||||||
char *t = stpcpy (stpcpy (fullname, fname), "/");
|
|
||||||
strcpy (stpcpy (stpcpy (t, d->d_name), "/SYS_"),
|
|
||||||
d->d_name);
|
|
||||||
|
|
||||||
if (stat64 (fullname, &st) == -1)
|
|
||||||
/* There is no SYS_* file or we cannot
|
|
||||||
access it. */
|
|
||||||
break;
|
|
||||||
|
|
||||||
d_type = IFTODT (st.st_mode);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* If we found a regular file (eventually after
|
|
||||||
following a symlink) we are successful. */
|
|
||||||
if (d_type == DT_REG)
|
|
||||||
++seen;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
closedir (dirp);
|
|
||||||
|
|
||||||
if (seen != __LC_LAST - 1)
|
|
||||||
{
|
|
||||||
/* We don't have all locale category files. Ignore the name. */
|
|
||||||
error (0, 0, "incomplete set of locale files in \"%s\"",
|
|
||||||
fname);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Add the files to the archive. To do this we first compute
|
|
||||||
sizes and the MD5 sums of all the files. */
|
|
||||||
for (cnt = 0; cnt < __LC_LAST; ++cnt)
|
|
||||||
if (cnt != LC_ALL)
|
|
||||||
{
|
|
||||||
char fullname[fnamelen + 2 * strlen (locnames[cnt]) + 7];
|
|
||||||
int fd;
|
|
||||||
|
|
||||||
strcpy (stpcpy (stpcpy (fullname, fname), "/"), locnames[cnt]);
|
|
||||||
fd = open64 (fullname, O_RDONLY);
|
|
||||||
if (fd == -1 || fstat64 (fd, &st) == -1)
|
|
||||||
{
|
|
||||||
/* Cannot read the file. */
|
|
||||||
if (fd != -1)
|
|
||||||
close (fd);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (S_ISDIR (st.st_mode))
|
|
||||||
{
|
|
||||||
char *t;
|
|
||||||
close (fd);
|
|
||||||
t = stpcpy (stpcpy (fullname, fname), "/");
|
|
||||||
strcpy (stpcpy (stpcpy (t, locnames[cnt]), "/SYS_"),
|
|
||||||
locnames[cnt]);
|
|
||||||
|
|
||||||
fd = open64 (fullname, O_RDONLY);
|
|
||||||
if (fd == -1 || fstat64 (fd, &st) == -1
|
|
||||||
|| !S_ISREG (st.st_mode))
|
|
||||||
{
|
|
||||||
if (fd != -1)
|
|
||||||
close (fd);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Map the file. */
|
|
||||||
data[cnt].addr = mmap64 (NULL, st.st_size, PROT_READ, MAP_SHARED,
|
|
||||||
fd, 0);
|
|
||||||
if (data[cnt].addr == MAP_FAILED)
|
|
||||||
{
|
|
||||||
/* Cannot map it. */
|
|
||||||
close (fd);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
data[cnt].size = st.st_size;
|
|
||||||
__md5_buffer (data[cnt].addr, st.st_size, data[cnt].sum);
|
|
||||||
|
|
||||||
/* We don't need the file descriptor anymore. */
|
|
||||||
close (fd);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (cnt != __LC_LAST)
|
|
||||||
{
|
|
||||||
while (cnt-- > 0)
|
|
||||||
if (cnt != LC_ALL)
|
|
||||||
munmap (data[cnt].addr, data[cnt].size);
|
|
||||||
|
|
||||||
error (0, 0, "cannot read all files in \"%s\": ignored", fname);
|
|
||||||
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
result |= add_locale_to_archive (&ah, basename (fname), data, 0);
|
|
||||||
|
|
||||||
for (cnt = 0; cnt < __LC_LAST; ++cnt)
|
|
||||||
if (cnt != LC_ALL)
|
|
||||||
munmap (data[cnt].addr, data[cnt].size);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* We are done. */
|
|
||||||
close_archive (&ah);
|
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Print the command-line help for build-locale-archive to standard
   output.  The text describes the positional TEMPLATE-FILE and
   ARCHIVE-FILE arguments and the -h, -v and -l/--install-langs options
   that main parses with getopt_long.  This function does not exit;
   callers decide the exit status.  */
void usage (void)
{
  /* A single string literal with backslash-newline continuations: any
     whitespace at the start of a continued line would become part of
     the printed text, so the lines deliberately start at column 0.  */
  printf ("\
Usage: build-locale-archive [OPTION]... [TEMPLATE-FILE] [ARCHIVE-FILE]\n\
Builds a locale archive from a template file.\n\
Options:\n\
-h, --help Print this usage message.\n\
-v, --verbose Verbose execution.\n\
-l, --install-langs=LIST Only include locales given in LIST into the\n\
locale archive. LIST is a colon separated list\n\
of locale prefixes, for example \"de:en:ja\".\n\
The special argument \"all\" means to install\n\
all languages and it must be present by itself.\n\
If \"all\" is present with any other language it\n\
will be treated as the name of a locale.\n\
If the --install-langs option is missing, all\n\
locales are installed. The colon separated list\n\
can contain any strings matching the beginning of\n\
locale names.\n\
If a string does not contain a \"_\", it is added.\n\
Examples:\n\
--install-langs=\"en\"\n\
installs en_US, en_US.iso88591,\n\
en_US.iso885915, en_US.utf8,\n\
en_GB ...\n\
--install-langs=\"en_US.utf8\"\n\
installs only en_US.utf8.\n\
--install-langs=\"ko\"\n\
installs ko_KR, ko_KR.euckr,\n\
ko_KR.utf8 but *not* kok_IN\n\
because \"ko\" does not contain\n\
\"_\" and it is silently added\n\
--install-langs=\"ko:kok\"\n\
installs ko_KR, ko_KR.euckr,\n\
ko_KR.utf8, kok_IN, and\n\
kok_IN.utf8.\n\
--install-langs=\"POSIX\" will\n\
install *no* locales at all\n\
because POSIX matches none of\n\
the locales. Actually, any string\n\
matching nothing will do that.\n\
POSIX and C will always be\n\
available because they are\n\
builtin.\n\
Aliases are installed as well,\n\
i.e. --install-langs=\"de\"\n\
will install not only every locale starting with\n\
\"de\" but also the aliases \"deutsch\"\n\
and \"german\" although the latter does not\n\
start with \"de\".\n\
\n\
If the arguments TEMPLATE-FILE and ARCHIVE-FILE are not given the locations\n\
where the glibc used expects these files are used by default.\n\
");
}
|
|
||||||
|
|
||||||
int main (int argc, char *argv[])
|
|
||||||
{
|
|
||||||
char path[4096];
|
|
||||||
DIR *dirp;
|
|
||||||
struct dirent64 *d;
|
|
||||||
struct stat64 st;
|
|
||||||
char *list[16384], *primary;
|
|
||||||
char *lang;
|
|
||||||
int install_langs_count = 0;
|
|
||||||
int i;
|
|
||||||
char *install_langs_arg, *ila_start;
|
|
||||||
char **install_langs_list = NULL;
|
|
||||||
unsigned int cnt = 0;
|
|
||||||
struct locarhandle tmpl_ah;
|
|
||||||
char *new_locar_fname = NULL;
|
|
||||||
size_t loc_path_len = strlen (loc_path);
|
|
||||||
|
|
||||||
while (1)
|
|
||||||
{
|
|
||||||
int c;
|
|
||||||
|
|
||||||
static struct option long_options[] =
|
|
||||||
{
|
|
||||||
{"help", no_argument, 0, 'h'},
|
|
||||||
{"verbose", no_argument, 0, 'v'},
|
|
||||||
{"install-langs", required_argument, 0, 'l'},
|
|
||||||
{0, 0, 0, 0}
|
|
||||||
};
|
|
||||||
/* getopt_long stores the option index here. */
|
|
||||||
int option_index = 0;
|
|
||||||
|
|
||||||
c = getopt_long (argc, argv, "vhl:",
|
|
||||||
long_options, &option_index);
|
|
||||||
|
|
||||||
/* Detect the end of the options. */
|
|
||||||
if (c == -1)
|
|
||||||
break;
|
|
||||||
|
|
||||||
switch (c)
|
|
||||||
{
|
|
||||||
case 0:
|
|
||||||
printf ("unknown option %s", long_options[option_index].name);
|
|
||||||
if (optarg)
|
|
||||||
printf (" with arg %s", optarg);
|
|
||||||
printf ("\n");
|
|
||||||
usage ();
|
|
||||||
exit (1);
|
|
||||||
|
|
||||||
case 'v':
|
|
||||||
verbose = 1;
|
|
||||||
be_quiet = 0;
|
|
||||||
break;
|
|
||||||
|
|
||||||
case 'h':
|
|
||||||
usage ();
|
|
||||||
exit (0);
|
|
||||||
|
|
||||||
case 'l':
|
|
||||||
install_langs_arg = ila_start = strdup (optarg);
|
|
||||||
/* If the argument to --install-lang is "all", do
|
|
||||||
not limit the list of languages to install and install
|
|
||||||
them all. We do not support installing a single locale
|
|
||||||
called "all". */
|
|
||||||
#define MAGIC_INSTALL_ALL "all"
|
|
||||||
if (install_langs_arg != NULL
|
|
||||||
&& install_langs_arg[0] != '\0'
|
|
||||||
&& !(strncmp(install_langs_arg, MAGIC_INSTALL_ALL,
|
|
||||||
strlen(MAGIC_INSTALL_ALL)) == 0
|
|
||||||
&& strlen (install_langs_arg) == 3))
|
|
||||||
{
|
|
||||||
/* Count the number of languages we will install. */
|
|
||||||
while (true)
|
|
||||||
{
|
|
||||||
lang = strtok(install_langs_arg, ":;,");
|
|
||||||
if (lang == NULL)
|
|
||||||
break;
|
|
||||||
install_langs_count++;
|
|
||||||
install_langs_arg = NULL;
|
|
||||||
}
|
|
||||||
free (ila_start);
|
|
||||||
|
|
||||||
/* Reject an entire string made up of delimiters. */
|
|
||||||
if (install_langs_count == 0)
|
|
||||||
break;
|
|
||||||
|
|
||||||
/* Copy the list. */
|
|
||||||
install_langs_list = (char **)xmalloc (sizeof(char *) * install_langs_count);
|
|
||||||
install_langs_arg = ila_start = strdup (optarg);
|
|
||||||
install_langs_count = 0;
|
|
||||||
while (true)
|
|
||||||
{
|
|
||||||
lang = strtok(install_langs_arg, ":;,");
|
|
||||||
if (lang == NULL)
|
|
||||||
break;
|
|
||||||
install_langs_list[install_langs_count] = lang;
|
|
||||||
install_langs_count++;
|
|
||||||
install_langs_arg = NULL;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
|
|
||||||
case '?':
|
|
||||||
/* getopt_long already printed an error message. */
|
|
||||||
usage ();
|
|
||||||
exit (0);
|
|
||||||
|
|
||||||
default:
|
|
||||||
abort ();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
tmpl_ah.fname = NULL;
|
|
||||||
if (optind < argc)
|
|
||||||
tmpl_ah.fname = argv[optind];
|
|
||||||
if (optind + 1 < argc)
|
|
||||||
new_locar_fname = argv[optind + 1];
|
|
||||||
if (verbose)
|
|
||||||
{
|
|
||||||
if (tmpl_ah.fname)
|
|
||||||
printf("input archive file specified on command line: %s\n",
|
|
||||||
tmpl_ah.fname);
|
|
||||||
else
|
|
||||||
printf("using default input archive file.\n");
|
|
||||||
if (new_locar_fname)
|
|
||||||
printf("output archive file specified on command line: %s\n",
|
|
||||||
new_locar_fname);
|
|
||||||
else
|
|
||||||
printf("using default output archive file.\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
dirp = opendir (loc_path);
|
|
||||||
if (dirp == NULL)
|
|
||||||
error (EXIT_FAILURE, errno, "cannot open directory \"%s\"", loc_path);
|
|
||||||
|
|
||||||
open_tmpl_archive (&tmpl_ah);
|
|
||||||
|
|
||||||
if (new_locar_fname)
|
|
||||||
unlink (new_locar_fname);
|
|
||||||
else
|
|
||||||
unlink (locar_file);
|
|
||||||
primary = getenv ("LC_ALL");
|
|
||||||
if (primary == NULL)
|
|
||||||
primary = getenv ("LANG");
|
|
||||||
if (primary != NULL)
|
|
||||||
{
|
|
||||||
if (strncmp (primary, "ja", 2) != 0
|
|
||||||
&& strncmp (primary, "ko", 2) != 0
|
|
||||||
&& strncmp (primary, "zh", 2) != 0)
|
|
||||||
{
|
|
||||||
char *ptr = malloc (strlen (primary) + strlen (".utf8") + 1), *p, *q;
|
|
||||||
/* This leads to invalid locales sometimes:
|
|
||||||
de_DE.iso885915@euro -> de_DE.utf8@euro */
|
|
||||||
if (ptr != NULL)
|
|
||||||
{
|
|
||||||
p = ptr;
|
|
||||||
q = primary;
|
|
||||||
while (*q && *q != '.' && *q != '@')
|
|
||||||
*p++ = *q++;
|
|
||||||
if (*q == '.')
|
|
||||||
while (*q && *q != '@')
|
|
||||||
q++;
|
|
||||||
p = stpcpy (p, ".utf8");
|
|
||||||
strcpy (p, q);
|
|
||||||
primary = ptr;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
primary = NULL;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
memcpy (path, loc_path, loc_path_len);
|
|
||||||
|
|
||||||
while ((d = readdir64 (dirp)) != NULL)
|
|
||||||
{
|
|
||||||
if (strcmp (d->d_name, ".") == 0 || strcmp (d->d_name, "..") == 0)
|
|
||||||
continue;
|
|
||||||
if (strchr (d->d_name, '_') == NULL)
|
|
||||||
continue;
|
|
||||||
|
|
||||||
size_t d_name_len = strlen (d->d_name);
|
|
||||||
if (loc_path_len + d_name_len + 1 > sizeof (path))
|
|
||||||
{
|
|
||||||
error (0, 0, "too long filename \"%s\"", d->d_name);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
memcpy (path + loc_path_len, d->d_name, d_name_len + 1);
|
|
||||||
if (stat64 (path, &st) < 0)
|
|
||||||
{
|
|
||||||
error (0, errno, "cannot stat \"%s\"", path);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (! S_ISDIR (st.st_mode))
|
|
||||||
continue;
|
|
||||||
if (cnt == 16384)
|
|
||||||
{
|
|
||||||
error (0, 0, "too many directories in \"%s\"", loc_path);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
list[cnt] = strdup (path);
|
|
||||||
if (list[cnt] == NULL)
|
|
||||||
{
|
|
||||||
error (0, errno, "cannot add file to list \"%s\"", path);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (primary != NULL && cnt > 0 && strcmp (primary, d->d_name) == 0)
|
|
||||||
{
|
|
||||||
char *p = list[0];
|
|
||||||
list[0] = list[cnt];
|
|
||||||
list[cnt] = p;
|
|
||||||
}
|
|
||||||
cnt++;
|
|
||||||
}
|
|
||||||
closedir (dirp);
|
|
||||||
/* Store the archive to the file specified as the second argument on the
|
|
||||||
command line or the default locale archive. */
|
|
||||||
fill_archive (&tmpl_ah, new_locar_fname,
|
|
||||||
install_langs_count, install_langs_list,
|
|
||||||
cnt, list, primary);
|
|
||||||
close_archive (&tmpl_ah);
|
|
||||||
truncate (tmpl_file, 0);
|
|
||||||
if (install_langs_count > 0)
|
|
||||||
{
|
|
||||||
free (ila_start);
|
|
||||||
free (install_langs_list);
|
|
||||||
}
|
|
||||||
char *tz_argv[] = { "/usr/sbin/tzdata-update", NULL };
|
|
||||||
execve (tz_argv[0], (char *const *)tz_argv, (char *const *)&tz_argv[1]);
|
|
||||||
exit (0);
|
|
||||||
}
|
|
@ -1,259 +0,0 @@
|
|||||||
From 97700a34f36721b11a754cf37a1cc40695ece1fd Mon Sep 17 00:00:00 2001
|
|
||||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
|
||||||
Date: Mon, 21 Jan 2019 11:23:59 -0800
|
|
||||||
Subject: [PATCH] x86-64 memchr/wmemchr: Properly handle the length parameter
|
|
||||||
[BZ# 24097]
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
On x32, the size_t parameter may be passed in the lower 32 bits of a
|
|
||||||
64-bit register with the non-zero upper 32 bits. The string/memory
|
|
||||||
functions written in assembly can only use the lower 32 bits of a
|
|
||||||
64-bit register as length or must clear the upper 32 bits before using
|
|
||||||
the full 64-bit register for length.
|
|
||||||
|
|
||||||
This patch fixes memchr/wmemchr for x32. Tested on x86-64 and x32. On
|
|
||||||
x86-64, libc.so is the same with and without the fix.
|
|
||||||
|
|
||||||
[BZ# 24097]
|
|
||||||
CVE-2019-6488
|
|
||||||
* sysdeps/x86_64/memchr.S: Use RDX_LP for length. Clear the
|
|
||||||
upper 32 bits of RDX register.
|
|
||||||
* sysdeps/x86_64/multiarch/memchr-avx2.S: Likewise.
|
|
||||||
* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memchr and
|
|
||||||
tst-size_t-wmemchr.
|
|
||||||
* sysdeps/x86_64/x32/test-size_t.h: New file.
|
|
||||||
* sysdeps/x86_64/x32/tst-size_t-memchr.c: Likewise.
|
|
||||||
* sysdeps/x86_64/x32/tst-size_t-wmemchr.c: Likewise.
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/memchr.S | 10 ++--
|
|
||||||
sysdeps/x86_64/multiarch/memchr-avx2.S | 8 ++-
|
|
||||||
sysdeps/x86_64/x32/Makefile | 8 +++
|
|
||||||
sysdeps/x86_64/x32/test-size_t.h | 35 ++++++++++++
|
|
||||||
sysdeps/x86_64/x32/tst-size_t-memchr.c | 72 +++++++++++++++++++++++++
|
|
||||||
sysdeps/x86_64/x32/tst-size_t-wmemchr.c | 20 +++++++
|
|
||||||
6 files changed, 148 insertions(+), 5 deletions(-)
|
|
||||||
create mode 100644 sysdeps/x86_64/x32/test-size_t.h
|
|
||||||
create mode 100644 sysdeps/x86_64/x32/tst-size_t-memchr.c
|
|
||||||
create mode 100644 sysdeps/x86_64/x32/tst-size_t-wmemchr.c
|
|
||||||
|
|
||||||
Conflicts:
|
|
||||||
ChangeLog
|
|
||||||
(removed)
|
|
||||||
NEWS
|
|
||||||
(removed)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
|
|
||||||
index feef5d4f..cb320257 100644
|
|
||||||
--- a/sysdeps/x86_64/memchr.S
|
|
||||||
+++ b/sysdeps/x86_64/memchr.S
|
|
||||||
@@ -34,12 +34,16 @@ ENTRY(MEMCHR)
|
|
||||||
mov %edi, %ecx
|
|
||||||
|
|
||||||
#ifdef USE_AS_WMEMCHR
|
|
||||||
- test %rdx, %rdx
|
|
||||||
+ test %RDX_LP, %RDX_LP
|
|
||||||
jz L(return_null)
|
|
||||||
- shl $2, %rdx
|
|
||||||
+ shl $2, %RDX_LP
|
|
||||||
#else
|
|
||||||
+# ifdef __ILP32__
|
|
||||||
+ /* Clear the upper 32 bits. */
|
|
||||||
+ movl %edx, %edx
|
|
||||||
+# endif
|
|
||||||
punpcklbw %xmm1, %xmm1
|
|
||||||
- test %rdx, %rdx
|
|
||||||
+ test %RDX_LP, %RDX_LP
|
|
||||||
jz L(return_null)
|
|
||||||
punpcklbw %xmm1, %xmm1
|
|
||||||
#endif
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
|
|
||||||
index 5f5e7725..c81da19b 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
|
|
||||||
@@ -40,16 +40,20 @@
|
|
||||||
ENTRY (MEMCHR)
|
|
||||||
# ifndef USE_AS_RAWMEMCHR
|
|
||||||
/* Check for zero length. */
|
|
||||||
- testq %rdx, %rdx
|
|
||||||
+ test %RDX_LP, %RDX_LP
|
|
||||||
jz L(null)
|
|
||||||
# endif
|
|
||||||
movl %edi, %ecx
|
|
||||||
/* Broadcast CHAR to YMM0. */
|
|
||||||
vmovd %esi, %xmm0
|
|
||||||
# ifdef USE_AS_WMEMCHR
|
|
||||||
- shl $2, %rdx
|
|
||||||
+ shl $2, %RDX_LP
|
|
||||||
vpbroadcastd %xmm0, %ymm0
|
|
||||||
# else
|
|
||||||
+# ifdef __ILP32__
|
|
||||||
+ /* Clear the upper 32 bits. */
|
|
||||||
+ movl %edx, %edx
|
|
||||||
+# endif
|
|
||||||
vpbroadcastb %xmm0, %ymm0
|
|
||||||
# endif
|
|
||||||
/* Check if we may cross page boundary with one vector load. */
|
|
||||||
diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
|
|
||||||
index f2ebc24f..7d528889 100644
|
|
||||||
--- a/sysdeps/x86_64/x32/Makefile
|
|
||||||
+++ b/sysdeps/x86_64/x32/Makefile
|
|
||||||
@@ -4,3 +4,11 @@ ifeq ($(subdir),math)
|
|
||||||
# 64-bit llround. Add -fno-builtin-lround to silence the compiler.
|
|
||||||
CFLAGS-s_llround.c += -fno-builtin-lround
|
|
||||||
endif
|
|
||||||
+
|
|
||||||
+ifeq ($(subdir),string)
|
|
||||||
+tests += tst-size_t-memchr
|
|
||||||
+endif
|
|
||||||
+
|
|
||||||
+ifeq ($(subdir),wcsmbs)
|
|
||||||
+tests += tst-size_t-wmemchr
|
|
||||||
+endif
|
|
||||||
diff --git a/sysdeps/x86_64/x32/test-size_t.h b/sysdeps/x86_64/x32/test-size_t.h
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..78a94086
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/x86_64/x32/test-size_t.h
|
|
||||||
@@ -0,0 +1,35 @@
|
|
||||||
+/* Test string/memory functions with size_t in the lower 32 bits of
|
|
||||||
+ 64-bit register.
|
|
||||||
+ Copyright (C) 2019 Free Software Foundation, Inc.
|
|
||||||
+ This file is part of the GNU C Library.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is free software; you can redistribute it and/or
|
|
||||||
+ modify it under the terms of the GNU Lesser General Public
|
|
||||||
+ License as published by the Free Software Foundation; either
|
|
||||||
+ version 2.1 of the License, or (at your option) any later version.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
||||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
+ Lesser General Public License for more details.
|
|
||||||
+
|
|
||||||
+ You should have received a copy of the GNU Lesser General Public
|
|
||||||
+ License along with the GNU C Library; if not, see
|
|
||||||
+ <http://www.gnu.org/licenses/>. */
|
|
||||||
+
|
|
||||||
+#define TEST_MAIN
|
|
||||||
+#include <string/test-string.h>
|
|
||||||
+
|
|
||||||
+/* On x32, parameter_t may be passed in a 64-bit register with the LEN
|
|
||||||
+ field in the lower 32 bits. When the LEN field of 64-bit register
|
|
||||||
+ is passed to string/memory function as the size_t parameter, only
|
|
||||||
+ the lower 32 bits can be used. */
|
|
||||||
+typedef struct
|
|
||||||
+{
|
|
||||||
+ union
|
|
||||||
+ {
|
|
||||||
+ size_t len;
|
|
||||||
+ void (*fn) (void);
|
|
||||||
+ };
|
|
||||||
+ void *p;
|
|
||||||
+} parameter_t;
|
|
||||||
diff --git a/sysdeps/x86_64/x32/tst-size_t-memchr.c b/sysdeps/x86_64/x32/tst-size_t-memchr.c
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..29a3daf1
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/x86_64/x32/tst-size_t-memchr.c
|
|
||||||
@@ -0,0 +1,72 @@
|
|
||||||
+/* Test memchr with size_t in the lower 32 bits of 64-bit register.
|
|
||||||
+ Copyright (C) 2019 Free Software Foundation, Inc.
|
|
||||||
+ This file is part of the GNU C Library.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is free software; you can redistribute it and/or
|
|
||||||
+ modify it under the terms of the GNU Lesser General Public
|
|
||||||
+ License as published by the Free Software Foundation; either
|
|
||||||
+ version 2.1 of the License, or (at your option) any later version.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
||||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
+ Lesser General Public License for more details.
|
|
||||||
+
|
|
||||||
+ You should have received a copy of the GNU Lesser General Public
|
|
||||||
+ License along with the GNU C Library; if not, see
|
|
||||||
+ <http://www.gnu.org/licenses/>. */
|
|
||||||
+
|
|
||||||
+#ifndef WIDE
|
|
||||||
+# define TEST_NAME "memchr"
|
|
||||||
+#else
|
|
||||||
+# define TEST_NAME "wmemchr"
|
|
||||||
+#endif /* WIDE */
|
|
||||||
+#include "test-size_t.h"
|
|
||||||
+
|
|
||||||
+#ifndef WIDE
|
|
||||||
+# define MEMCHR memchr
|
|
||||||
+# define CHAR char
|
|
||||||
+# define UCHAR unsigned char
|
|
||||||
+#else
|
|
||||||
+# include <wchar.h>
|
|
||||||
+# define MEMCHR wmemchr
|
|
||||||
+# define CHAR wchar_t
|
|
||||||
+# define UCHAR wchar_t
|
|
||||||
+#endif /* WIDE */
|
|
||||||
+
|
|
||||||
+IMPL (MEMCHR, 1)
|
|
||||||
+
|
|
||||||
+typedef CHAR * (*proto_t) (const CHAR*, int, size_t);
|
|
||||||
+
|
|
||||||
+static CHAR *
|
|
||||||
+__attribute__ ((noinline, noclone))
|
|
||||||
+do_memchr (parameter_t a, parameter_t b)
|
|
||||||
+{
|
|
||||||
+ return CALL (&b, a.p, (uintptr_t) b.p, a.len);
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+static int
|
|
||||||
+test_main (void)
|
|
||||||
+{
|
|
||||||
+ test_init ();
|
|
||||||
+
|
|
||||||
+ parameter_t src = { { page_size / sizeof (CHAR) }, buf2 };
|
|
||||||
+ parameter_t c = { { 0 }, (void *) (uintptr_t) 0x12 };
|
|
||||||
+
|
|
||||||
+ int ret = 0;
|
|
||||||
+ FOR_EACH_IMPL (impl, 0)
|
|
||||||
+ {
|
|
||||||
+ c.fn = impl->fn;
|
|
||||||
+ CHAR *res = do_memchr (src, c);
|
|
||||||
+ if (res)
|
|
||||||
+ {
|
|
||||||
+ error (0, 0, "Wrong result in function %s: %p != NULL",
|
|
||||||
+ impl->name, res);
|
|
||||||
+ ret = 1;
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ return ret ? EXIT_FAILURE : EXIT_SUCCESS;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+#include <support/test-driver.c>
|
|
||||||
diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemchr.c b/sysdeps/x86_64/x32/tst-size_t-wmemchr.c
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..877801d6
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/x86_64/x32/tst-size_t-wmemchr.c
|
|
||||||
@@ -0,0 +1,20 @@
|
|
||||||
+/* Test wmemchr with size_t in the lower 32 bits of 64-bit register.
|
|
||||||
+ Copyright (C) 2019 Free Software Foundation, Inc.
|
|
||||||
+ This file is part of the GNU C Library.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is free software; you can redistribute it and/or
|
|
||||||
+ modify it under the terms of the GNU Lesser General Public
|
|
||||||
+ License as published by the Free Software Foundation; either
|
|
||||||
+ version 2.1 of the License, or (at your option) any later version.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
||||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
+ Lesser General Public License for more details.
|
|
||||||
+
|
|
||||||
+ You should have received a copy of the GNU Lesser General Public
|
|
||||||
+ License along with the GNU C Library; if not, see
|
|
||||||
+ <http://www.gnu.org/licenses/>. */
|
|
||||||
+
|
|
||||||
+#define WIDE 1
|
|
||||||
+#include "tst-size_t-memchr.c"
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,41 +0,0 @@
|
|||||||
From ddf0992cf57a93200e0c782e2a94d0733a5a0b87 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
Date: Sun, 9 Jan 2022 16:02:21 -0600
|
|
||||||
Subject: [PATCH] x86: Fix __wcsncmp_avx2 in strcmp-avx2.S [BZ# 28755]
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
Fixes [BZ# 28755] for wcsncmp by redirecting length >= 2^56 to
|
|
||||||
__wcscmp_avx2. For x86_64 this covers the entire address range so any
|
|
||||||
length larger could not possibly be used to bound `s1` or `s2`.
|
|
||||||
|
|
||||||
test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass.
|
|
||||||
|
|
||||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/multiarch/strcmp-avx2.S | 10 ++++++++++
|
|
||||||
1 file changed, 10 insertions(+)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
|
|
||||||
index 156c1949..8fb8eedc 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
|
|
||||||
@@ -83,6 +83,16 @@ ENTRY (STRCMP)
|
|
||||||
je L(char0)
|
|
||||||
jb L(zero)
|
|
||||||
# ifdef USE_AS_WCSCMP
|
|
||||||
+# ifndef __ILP32__
|
|
||||||
+ movq %rdx, %rcx
|
|
||||||
+ /* Check if length could overflow when multiplied by
|
|
||||||
+ sizeof(wchar_t). Checking top 8 bits will cover all potential
|
|
||||||
+ overflow cases as well as redirect cases where its impossible to
|
|
||||||
+ length to bound a valid memory region. In these cases just use
|
|
||||||
+ 'wcscmp'. */
|
|
||||||
+ shrq $56, %rcx
|
|
||||||
+ jnz __wcscmp_avx2
|
|
||||||
+# endif
|
|
||||||
/* Convert units: from wide to byte char. */
|
|
||||||
shl $2, %RDX_LP
|
|
||||||
# endif
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,263 +0,0 @@
|
|||||||
From 23102686ec67b856a2d4fd25ddaa1c0b8d175c4f Mon Sep 17 00:00:00 2001
|
|
||||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
Date: Fri, 15 Apr 2022 12:28:01 -0500
|
|
||||||
Subject: [PATCH] x86: Cleanup page cross code in memcmp-avx2-movbe.S
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
Old code was both inefficient and wasted code size. New code is smaller (-62
|
|
||||||
bytes) and has comparable or better performance in the page cross case.
|
|
||||||
|
|
||||||
geometric_mean(N=20) of page cross cases New / Original: 0.960
|
|
||||||
|
|
||||||
size, align0, align1, ret, New Time/Old Time
|
|
||||||
1, 4095, 0, 0, 1.001
|
|
||||||
1, 4095, 0, 1, 0.999
|
|
||||||
1, 4095, 0, -1, 1.0
|
|
||||||
2, 4094, 0, 0, 1.0
|
|
||||||
2, 4094, 0, 1, 1.0
|
|
||||||
2, 4094, 0, -1, 1.0
|
|
||||||
3, 4093, 0, 0, 1.0
|
|
||||||
3, 4093, 0, 1, 1.0
|
|
||||||
3, 4093, 0, -1, 1.0
|
|
||||||
4, 4092, 0, 0, 0.987
|
|
||||||
4, 4092, 0, 1, 1.0
|
|
||||||
4, 4092, 0, -1, 1.0
|
|
||||||
5, 4091, 0, 0, 0.984
|
|
||||||
5, 4091, 0, 1, 1.002
|
|
||||||
5, 4091, 0, -1, 1.005
|
|
||||||
6, 4090, 0, 0, 0.993
|
|
||||||
6, 4090, 0, 1, 1.001
|
|
||||||
6, 4090, 0, -1, 1.003
|
|
||||||
7, 4089, 0, 0, 0.991
|
|
||||||
7, 4089, 0, 1, 1.0
|
|
||||||
7, 4089, 0, -1, 1.001
|
|
||||||
8, 4088, 0, 0, 0.875
|
|
||||||
8, 4088, 0, 1, 0.881
|
|
||||||
8, 4088, 0, -1, 0.888
|
|
||||||
9, 4087, 0, 0, 0.872
|
|
||||||
9, 4087, 0, 1, 0.879
|
|
||||||
9, 4087, 0, -1, 0.883
|
|
||||||
10, 4086, 0, 0, 0.878
|
|
||||||
10, 4086, 0, 1, 0.886
|
|
||||||
10, 4086, 0, -1, 0.873
|
|
||||||
11, 4085, 0, 0, 0.878
|
|
||||||
11, 4085, 0, 1, 0.881
|
|
||||||
11, 4085, 0, -1, 0.879
|
|
||||||
12, 4084, 0, 0, 0.873
|
|
||||||
12, 4084, 0, 1, 0.889
|
|
||||||
12, 4084, 0, -1, 0.875
|
|
||||||
13, 4083, 0, 0, 0.873
|
|
||||||
13, 4083, 0, 1, 0.863
|
|
||||||
13, 4083, 0, -1, 0.863
|
|
||||||
14, 4082, 0, 0, 0.838
|
|
||||||
14, 4082, 0, 1, 0.869
|
|
||||||
14, 4082, 0, -1, 0.877
|
|
||||||
15, 4081, 0, 0, 0.841
|
|
||||||
15, 4081, 0, 1, 0.869
|
|
||||||
15, 4081, 0, -1, 0.876
|
|
||||||
16, 4080, 0, 0, 0.988
|
|
||||||
16, 4080, 0, 1, 0.99
|
|
||||||
16, 4080, 0, -1, 0.989
|
|
||||||
17, 4079, 0, 0, 0.978
|
|
||||||
17, 4079, 0, 1, 0.981
|
|
||||||
17, 4079, 0, -1, 0.98
|
|
||||||
18, 4078, 0, 0, 0.981
|
|
||||||
18, 4078, 0, 1, 0.98
|
|
||||||
18, 4078, 0, -1, 0.985
|
|
||||||
19, 4077, 0, 0, 0.977
|
|
||||||
19, 4077, 0, 1, 0.979
|
|
||||||
19, 4077, 0, -1, 0.986
|
|
||||||
20, 4076, 0, 0, 0.977
|
|
||||||
20, 4076, 0, 1, 0.986
|
|
||||||
20, 4076, 0, -1, 0.984
|
|
||||||
21, 4075, 0, 0, 0.977
|
|
||||||
21, 4075, 0, 1, 0.983
|
|
||||||
21, 4075, 0, -1, 0.988
|
|
||||||
22, 4074, 0, 0, 0.983
|
|
||||||
22, 4074, 0, 1, 0.994
|
|
||||||
22, 4074, 0, -1, 0.993
|
|
||||||
23, 4073, 0, 0, 0.98
|
|
||||||
23, 4073, 0, 1, 0.992
|
|
||||||
23, 4073, 0, -1, 0.995
|
|
||||||
24, 4072, 0, 0, 0.989
|
|
||||||
24, 4072, 0, 1, 0.989
|
|
||||||
24, 4072, 0, -1, 0.991
|
|
||||||
25, 4071, 0, 0, 0.99
|
|
||||||
25, 4071, 0, 1, 0.999
|
|
||||||
25, 4071, 0, -1, 0.996
|
|
||||||
26, 4070, 0, 0, 0.993
|
|
||||||
26, 4070, 0, 1, 0.995
|
|
||||||
26, 4070, 0, -1, 0.998
|
|
||||||
27, 4069, 0, 0, 0.993
|
|
||||||
27, 4069, 0, 1, 0.999
|
|
||||||
27, 4069, 0, -1, 1.0
|
|
||||||
28, 4068, 0, 0, 0.997
|
|
||||||
28, 4068, 0, 1, 1.0
|
|
||||||
28, 4068, 0, -1, 0.999
|
|
||||||
29, 4067, 0, 0, 0.996
|
|
||||||
29, 4067, 0, 1, 0.999
|
|
||||||
29, 4067, 0, -1, 0.999
|
|
||||||
30, 4066, 0, 0, 0.991
|
|
||||||
30, 4066, 0, 1, 1.001
|
|
||||||
30, 4066, 0, -1, 0.999
|
|
||||||
31, 4065, 0, 0, 0.988
|
|
||||||
31, 4065, 0, 1, 0.998
|
|
||||||
31, 4065, 0, -1, 0.998
|
|
||||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 98 ++++++++++++--------
|
|
||||||
1 file changed, 61 insertions(+), 37 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
|
|
||||||
index 16fc673e..99258cf5 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
|
|
||||||
@@ -429,22 +429,21 @@ L(page_cross_less_vec):
|
|
||||||
# ifndef USE_AS_WMEMCMP
|
|
||||||
cmpl $8, %edx
|
|
||||||
jae L(between_8_15)
|
|
||||||
+ /* Fall through for [4, 7]. */
|
|
||||||
cmpl $4, %edx
|
|
||||||
- jae L(between_4_7)
|
|
||||||
+ jb L(between_2_3)
|
|
||||||
|
|
||||||
- /* Load as big endian to avoid branches. */
|
|
||||||
- movzwl (%rdi), %eax
|
|
||||||
- movzwl (%rsi), %ecx
|
|
||||||
- shll $8, %eax
|
|
||||||
- shll $8, %ecx
|
|
||||||
- bswap %eax
|
|
||||||
- bswap %ecx
|
|
||||||
- movzbl -1(%rdi, %rdx), %edi
|
|
||||||
- movzbl -1(%rsi, %rdx), %esi
|
|
||||||
- orl %edi, %eax
|
|
||||||
- orl %esi, %ecx
|
|
||||||
- /* Subtraction is okay because the upper 8 bits are zero. */
|
|
||||||
- subl %ecx, %eax
|
|
||||||
+ movbe (%rdi), %eax
|
|
||||||
+ movbe (%rsi), %ecx
|
|
||||||
+ shlq $32, %rax
|
|
||||||
+ shlq $32, %rcx
|
|
||||||
+ movbe -4(%rdi, %rdx), %edi
|
|
||||||
+ movbe -4(%rsi, %rdx), %esi
|
|
||||||
+ orq %rdi, %rax
|
|
||||||
+ orq %rsi, %rcx
|
|
||||||
+ subq %rcx, %rax
|
|
||||||
+ /* Fast path for return zero. */
|
|
||||||
+ jnz L(ret_nonzero)
|
|
||||||
/* No ymm register was touched. */
|
|
||||||
ret
|
|
||||||
|
|
||||||
@@ -457,9 +456,33 @@ L(one_or_less):
|
|
||||||
/* No ymm register was touched. */
|
|
||||||
ret
|
|
||||||
|
|
||||||
+ .p2align 4,, 5
|
|
||||||
+L(ret_nonzero):
|
|
||||||
+ sbbl %eax, %eax
|
|
||||||
+ orl $1, %eax
|
|
||||||
+ /* No ymm register was touched. */
|
|
||||||
+ ret
|
|
||||||
+
|
|
||||||
+ .p2align 4,, 2
|
|
||||||
+L(zero):
|
|
||||||
+ xorl %eax, %eax
|
|
||||||
+ /* No ymm register was touched. */
|
|
||||||
+ ret
|
|
||||||
+
|
|
||||||
.p2align 4
|
|
||||||
L(between_8_15):
|
|
||||||
-# endif
|
|
||||||
+ movbe (%rdi), %rax
|
|
||||||
+ movbe (%rsi), %rcx
|
|
||||||
+ subq %rcx, %rax
|
|
||||||
+ jnz L(ret_nonzero)
|
|
||||||
+ movbe -8(%rdi, %rdx), %rax
|
|
||||||
+ movbe -8(%rsi, %rdx), %rcx
|
|
||||||
+ subq %rcx, %rax
|
|
||||||
+ /* Fast path for return zero. */
|
|
||||||
+ jnz L(ret_nonzero)
|
|
||||||
+ /* No ymm register was touched. */
|
|
||||||
+ ret
|
|
||||||
+# else
|
|
||||||
/* If USE_AS_WMEMCMP fall through into 8-15 byte case. */
|
|
||||||
vmovq (%rdi), %xmm1
|
|
||||||
vmovq (%rsi), %xmm2
|
|
||||||
@@ -475,16 +498,13 @@ L(between_8_15):
|
|
||||||
VPCMPEQ %xmm1, %xmm2, %xmm2
|
|
||||||
vpmovmskb %xmm2, %eax
|
|
||||||
subl $0xffff, %eax
|
|
||||||
+ /* Fast path for return zero. */
|
|
||||||
jnz L(return_vec_0)
|
|
||||||
/* No ymm register was touched. */
|
|
||||||
ret
|
|
||||||
+# endif
|
|
||||||
|
|
||||||
- .p2align 4
|
|
||||||
-L(zero):
|
|
||||||
- xorl %eax, %eax
|
|
||||||
- ret
|
|
||||||
-
|
|
||||||
- .p2align 4
|
|
||||||
+ .p2align 4,, 10
|
|
||||||
L(between_16_31):
|
|
||||||
/* From 16 to 31 bytes. No branch when size == 16. */
|
|
||||||
vmovdqu (%rsi), %xmm2
|
|
||||||
@@ -501,11 +521,17 @@ L(between_16_31):
|
|
||||||
VPCMPEQ (%rdi), %xmm2, %xmm2
|
|
||||||
vpmovmskb %xmm2, %eax
|
|
||||||
subl $0xffff, %eax
|
|
||||||
+ /* Fast path for return zero. */
|
|
||||||
jnz L(return_vec_0)
|
|
||||||
/* No ymm register was touched. */
|
|
||||||
ret
|
|
||||||
|
|
||||||
# ifdef USE_AS_WMEMCMP
|
|
||||||
+ .p2align 4,, 2
|
|
||||||
+L(zero):
|
|
||||||
+ xorl %eax, %eax
|
|
||||||
+ ret
|
|
||||||
+
|
|
||||||
.p2align 4
|
|
||||||
L(one_or_less):
|
|
||||||
jb L(zero)
|
|
||||||
@@ -520,22 +546,20 @@ L(one_or_less):
|
|
||||||
# else
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
-L(between_4_7):
|
|
||||||
- /* Load as big endian with overlapping movbe to avoid branches.
|
|
||||||
- */
|
|
||||||
- movbe (%rdi), %eax
|
|
||||||
- movbe (%rsi), %ecx
|
|
||||||
- shlq $32, %rax
|
|
||||||
- shlq $32, %rcx
|
|
||||||
- movbe -4(%rdi, %rdx), %edi
|
|
||||||
- movbe -4(%rsi, %rdx), %esi
|
|
||||||
- orq %rdi, %rax
|
|
||||||
- orq %rsi, %rcx
|
|
||||||
- subq %rcx, %rax
|
|
||||||
- jz L(zero_4_7)
|
|
||||||
- sbbl %eax, %eax
|
|
||||||
- orl $1, %eax
|
|
||||||
-L(zero_4_7):
|
|
||||||
+L(between_2_3):
|
|
||||||
+ /* Load as big endian to avoid branches. */
|
|
||||||
+ movzwl (%rdi), %eax
|
|
||||||
+ movzwl (%rsi), %ecx
|
|
||||||
+ bswap %eax
|
|
||||||
+ bswap %ecx
|
|
||||||
+ shrl %eax
|
|
||||||
+ shrl %ecx
|
|
||||||
+ movzbl -1(%rdi, %rdx), %edi
|
|
||||||
+ movzbl -1(%rsi, %rdx), %esi
|
|
||||||
+ orl %edi, %eax
|
|
||||||
+ orl %esi, %ecx
|
|
||||||
+ /* Subtraction is okay because the upper bit is zero. */
|
|
||||||
+ subl %ecx, %eax
|
|
||||||
/* No ymm register was touched. */
|
|
||||||
ret
|
|
||||||
# endif
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,226 +0,0 @@
|
|||||||
From 8162147872491bb5b48e91543b19c49a29ae6b6d Mon Sep 17 00:00:00 2001
|
|
||||||
From: Wangyang Guo <wangyang.guo@intel.com>
|
|
||||||
Date: Fri, 6 May 2022 01:50:10 +0000
|
|
||||||
Subject: [PATCH] nptl: Add backoff mechanism to spinlock loop
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
When multiple threads are waiting for the lock at the same time, once the lock owner
|
|
||||||
releases the lock, waiters will see lock available and all try to lock,
|
|
||||||
which may cause an expensive CAS storm.
|
|
||||||
|
|
||||||
Binary exponential backoff with random jitter is introduced. As try-lock
|
|
||||||
attempts increase, it is more likely that a larger number of threads
|
|
||||||
compete for the adaptive mutex lock, so the wait time is increased exponentially.
|
|
||||||
A random jitter is also added to avoid synchronous try-lock from other
|
|
||||||
threads.
|
|
||||||
|
|
||||||
v2: Remove read-check before try-lock for performance.
|
|
||||||
|
|
||||||
v3:
|
|
||||||
1. Restore read-check since it works well on some platforms.
|
|
||||||
2. Make backoff arch dependent, and enable it for x86_64.
|
|
||||||
3. Limit max backoff to reduce latency in large critical section.
|
|
||||||
|
|
||||||
v4: Fix strict-prototypes error in sysdeps/nptl/pthread_mutex_backoff.h
|
|
||||||
|
|
||||||
v5: Commit log updated for regression in large critical section.
|
|
||||||
|
|
||||||
Result of pthread-mutex-locks bench
|
|
||||||
|
|
||||||
Test Platform: Xeon 8280L (2 socket, 112 CPUs in total)
|
|
||||||
First Row: thread number
|
|
||||||
First Col: critical section length
|
|
||||||
Values: backoff vs upstream, time based, low is better
|
|
||||||
|
|
||||||
non-critical-length: 1
|
|
||||||
1 2 4 8 16 32 64 112 140
|
|
||||||
0 0.99 0.58 0.52 0.49 0.43 0.44 0.46 0.52 0.54
|
|
||||||
1 0.98 0.43 0.56 0.50 0.44 0.45 0.50 0.56 0.57
|
|
||||||
2 0.99 0.41 0.57 0.51 0.45 0.47 0.48 0.60 0.61
|
|
||||||
4 0.99 0.45 0.59 0.53 0.48 0.49 0.52 0.64 0.65
|
|
||||||
8 1.00 0.66 0.71 0.63 0.56 0.59 0.66 0.72 0.71
|
|
||||||
16 0.97 0.78 0.91 0.73 0.67 0.70 0.79 0.80 0.80
|
|
||||||
32 0.95 1.17 0.98 0.87 0.82 0.86 0.89 0.90 0.90
|
|
||||||
64 0.96 0.95 1.01 1.01 0.98 1.00 1.03 0.99 0.99
|
|
||||||
128 0.99 1.01 1.01 1.17 1.08 1.12 1.02 0.97 1.02
|
|
||||||
|
|
||||||
non-critical-length: 32
|
|
||||||
1 2 4 8 16 32 64 112 140
|
|
||||||
0 1.03 0.97 0.75 0.65 0.58 0.58 0.56 0.70 0.70
|
|
||||||
1 0.94 0.95 0.76 0.65 0.58 0.58 0.61 0.71 0.72
|
|
||||||
2 0.97 0.96 0.77 0.66 0.58 0.59 0.62 0.74 0.74
|
|
||||||
4 0.99 0.96 0.78 0.66 0.60 0.61 0.66 0.76 0.77
|
|
||||||
8 0.99 0.99 0.84 0.70 0.64 0.66 0.71 0.80 0.80
|
|
||||||
16 0.98 0.97 0.95 0.76 0.70 0.73 0.81 0.85 0.84
|
|
||||||
32 1.04 1.12 1.04 0.89 0.82 0.86 0.93 0.91 0.91
|
|
||||||
64 0.99 1.15 1.07 1.00 0.99 1.01 1.05 0.99 0.99
|
|
||||||
128 1.00 1.21 1.20 1.22 1.25 1.31 1.12 1.10 0.99
|
|
||||||
|
|
||||||
non-critical-length: 128
|
|
||||||
1 2 4 8 16 32 64 112 140
|
|
||||||
0 1.02 1.00 0.99 0.67 0.61 0.61 0.61 0.74 0.73
|
|
||||||
1 0.95 0.99 1.00 0.68 0.61 0.60 0.60 0.74 0.74
|
|
||||||
2 1.00 1.04 1.00 0.68 0.59 0.61 0.65 0.76 0.76
|
|
||||||
4 1.00 0.96 0.98 0.70 0.63 0.63 0.67 0.78 0.77
|
|
||||||
8 1.01 1.02 0.89 0.73 0.65 0.67 0.71 0.81 0.80
|
|
||||||
16 0.99 0.96 0.96 0.79 0.71 0.73 0.80 0.84 0.84
|
|
||||||
32 0.99 0.95 1.05 0.89 0.84 0.85 0.94 0.92 0.91
|
|
||||||
64 1.00 0.99 1.16 1.04 1.00 1.02 1.06 0.99 0.99
|
|
||||||
128 1.00 1.06 0.98 1.14 1.39 1.26 1.08 1.02 0.98
|
|
||||||
|
|
||||||
There is a regression for large critical sections. But adaptive mutex is
|
|
||||||
aimed for "quick" locks. Small critical section is more common when
|
|
||||||
users choose to use adaptive pthread_mutex.
|
|
||||||
|
|
||||||
Signed-off-by: Wangyang Guo <wangyang.guo@intel.com>
|
|
||||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
|
||||||
|
|
||||||
Conflicts:
|
|
||||||
pthreadP.h
|
|
||||||
(had been moved)
|
|
||||||
nptl/pthread_mutex_lock.c
|
|
||||||
(max_adaptive_count renamed)
|
|
||||||
|
|
||||||
---
|
|
||||||
nptl/pthreadP.h | 1 +
|
|
||||||
nptl/pthread_mutex_lock.c | 16 +++++++--
|
|
||||||
sysdeps/nptl/pthread_mutex_backoff.h | 35 ++++++++++++++++++
|
|
||||||
sysdeps/x86_64/nptl/pthread_mutex_backoff.h | 39 +++++++++++++++++++++
|
|
||||||
4 files changed, 89 insertions(+), 2 deletions(-)
|
|
||||||
create mode 100644 sysdeps/nptl/pthread_mutex_backoff.h
|
|
||||||
create mode 100644 sysdeps/x86_64/nptl/pthread_mutex_backoff.h
|
|
||||||
|
|
||||||
diff --git a/nptl/pthreadP.h b/nptl/pthreadP.h
|
|
||||||
index 7ddc166c..1550e3b6 100644
|
|
||||||
--- a/nptl/pthreadP.h
|
|
||||||
+++ b/nptl/pthreadP.h
|
|
||||||
@@ -33,6 +33,7 @@
|
|
||||||
#include <kernel-features.h>
|
|
||||||
#include <errno.h>
|
|
||||||
#include <internal-signals.h>
|
|
||||||
+#include <pthread_mutex_backoff.h>
|
|
||||||
|
|
||||||
|
|
||||||
/* Atomic operations on TLS memory. */
|
|
||||||
diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
|
|
||||||
index d96a9933..c7770fc9 100644
|
|
||||||
--- a/nptl/pthread_mutex_lock.c
|
|
||||||
+++ b/nptl/pthread_mutex_lock.c
|
|
||||||
@@ -133,14 +133,26 @@ __pthread_mutex_lock (pthread_mutex_t *mutex)
|
|
||||||
int cnt = 0;
|
|
||||||
int max_cnt = MIN (MAX_ADAPTIVE_COUNT,
|
|
||||||
mutex->__data.__spins * 2 + 10);
|
|
||||||
+ int spin_count, exp_backoff = 1;
|
|
||||||
+ unsigned int jitter = get_jitter ();
|
|
||||||
do
|
|
||||||
{
|
|
||||||
- if (cnt++ >= max_cnt)
|
|
||||||
+ /* In each loop, spin count is exponential backoff plus
|
|
||||||
+ random jitter, random range is [0, exp_backoff-1]. */
|
|
||||||
+ spin_count = exp_backoff + (jitter & (exp_backoff - 1));
|
|
||||||
+ cnt += spin_count;
|
|
||||||
+ if (cnt >= max_cnt)
|
|
||||||
{
|
|
||||||
+ /* If cnt exceeds max spin count, just go to wait
|
|
||||||
+ queue. */
|
|
||||||
LLL_MUTEX_LOCK (mutex);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
- atomic_spin_nop ();
|
|
||||||
+ do
|
|
||||||
+ atomic_spin_nop ();
|
|
||||||
+ while (--spin_count > 0);
|
|
||||||
+ /* Prepare for next loop. */
|
|
||||||
+ exp_backoff = get_next_backoff (exp_backoff);
|
|
||||||
}
|
|
||||||
while (LLL_MUTEX_READ_LOCK (mutex) != 0
|
|
||||||
|| LLL_MUTEX_TRYLOCK (mutex) != 0);
|
|
||||||
diff --git a/sysdeps/nptl/pthread_mutex_backoff.h b/sysdeps/nptl/pthread_mutex_backoff.h
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..5b26c22a
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/nptl/pthread_mutex_backoff.h
|
|
||||||
@@ -0,0 +1,35 @@
|
|
||||||
+/* Pthread mutex backoff configuration.
|
|
||||||
+ Copyright (C) 2022 Free Software Foundation, Inc.
|
|
||||||
+ This file is part of the GNU C Library.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is free software; you can redistribute it and/or
|
|
||||||
+ modify it under the terms of the GNU Lesser General Public
|
|
||||||
+ License as published by the Free Software Foundation; either
|
|
||||||
+ version 2.1 of the License, or (at your option) any later version.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
||||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
+ Lesser General Public License for more details.
|
|
||||||
+
|
|
||||||
+ You should have received a copy of the GNU Lesser General Public
|
|
||||||
+ License along with the GNU C Library; if not, see
|
|
||||||
+ <https://www.gnu.org/licenses/>. */
|
|
||||||
+#ifndef _PTHREAD_MUTEX_BACKOFF_H
|
|
||||||
+#define _PTHREAD_MUTEX_BACKOFF_H 1
|
|
||||||
+
|
|
||||||
+static inline unsigned int
|
|
||||||
+get_jitter (void)
|
|
||||||
+{
|
|
||||||
+ /* Arch dependent random jitter, return 0 disables random. */
|
|
||||||
+ return 0;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+static inline int
|
|
||||||
+get_next_backoff (int backoff)
|
|
||||||
+{
|
|
||||||
+ /* Next backoff, return 1 disables mutex backoff. */
|
|
||||||
+ return 1;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+#endif
|
|
||||||
diff --git a/sysdeps/x86_64/nptl/pthread_mutex_backoff.h b/sysdeps/x86_64/nptl/pthread_mutex_backoff.h
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..ec74c3d9
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/x86_64/nptl/pthread_mutex_backoff.h
|
|
||||||
@@ -0,0 +1,39 @@
|
|
||||||
+/* Pthread mutex backoff configuration.
|
|
||||||
+ Copyright (C) 2022 Free Software Foundation, Inc.
|
|
||||||
+ This file is part of the GNU C Library.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is free software; you can redistribute it and/or
|
|
||||||
+ modify it under the terms of the GNU Lesser General Public
|
|
||||||
+ License as published by the Free Software Foundation; either
|
|
||||||
+ version 2.1 of the License, or (at your option) any later version.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
||||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
+ Lesser General Public License for more details.
|
|
||||||
+
|
|
||||||
+ You should have received a copy of the GNU Lesser General Public
|
|
||||||
+ License along with the GNU C Library; if not, see
|
|
||||||
+ <https://www.gnu.org/licenses/>. */
|
|
||||||
+#ifndef _PTHREAD_MUTEX_BACKOFF_H
|
|
||||||
+#define _PTHREAD_MUTEX_BACKOFF_H 1
|
|
||||||
+
|
|
||||||
+#include <fast-jitter.h>
|
|
||||||
+
|
|
||||||
+static inline unsigned int
|
|
||||||
+get_jitter (void)
|
|
||||||
+{
|
|
||||||
+ return get_fast_jitter ();
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+#define MAX_BACKOFF 16
|
|
||||||
+
|
|
||||||
+static inline int
|
|
||||||
+get_next_backoff (int backoff)
|
|
||||||
+{
|
|
||||||
+ /* Binary exponential backoff. Limiting max backoff
|
|
||||||
+ can reduce latency in large critical section. */
|
|
||||||
+ return (backoff < MAX_BACKOFF) ? backoff << 1 : backoff;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+#endif
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,55 +0,0 @@
|
|||||||
From c6272098323153db373f2986c67786ea8c85f1cf Mon Sep 17 00:00:00 2001
|
|
||||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
Date: Tue, 15 Feb 2022 08:18:15 -0600
|
|
||||||
Subject: [PATCH] x86: Fallback {str|wcs}cmp RTM in the ncmp overflow case [BZ
|
|
||||||
#28896]
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
In the overflow fallback strncmp-avx2-rtm and wcsncmp-avx2-rtm would
|
|
||||||
call strcmp-avx2 and wcscmp-avx2 respectively. This would have
|
|
||||||
no checks around vzeroupper and would trigger spurious
|
|
||||||
aborts. This commit fixes that.
|
|
||||||
|
|
||||||
test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass on
|
|
||||||
AVX2 machines with and without RTM.
|
|
||||||
|
|
||||||
Co-authored-by: H.J. Lu <hjl.tools@gmail.com>
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/multiarch/strcmp-avx2.S | 8 ++------
|
|
||||||
1 file changed, 2 insertions(+), 6 deletions(-)
|
|
||||||
|
|
||||||
Conflicts:
|
|
||||||
sysdeps/x86_64/multiarch/strcmp-avx2.S
|
|
||||||
(split into two patches due to upstream bug differences)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
|
|
||||||
index 28cc98b6..e267c6cb 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
|
|
||||||
@@ -345,10 +345,10 @@ L(one_or_less):
|
|
||||||
movq %LOCALE_REG, %rdx
|
|
||||||
# endif
|
|
||||||
jb L(ret_zero)
|
|
||||||
-# ifdef USE_AS_WCSCMP
|
|
||||||
/* 'nbe' covers the case where length is negative (large
|
|
||||||
unsigned). */
|
|
||||||
- jnbe __wcscmp_avx2
|
|
||||||
+ jnbe OVERFLOW_STRCMP
|
|
||||||
+# ifdef USE_AS_WCSCMP
|
|
||||||
movl (%rdi), %edx
|
|
||||||
xorl %eax, %eax
|
|
||||||
cmpl (%rsi), %edx
|
|
||||||
@@ -357,10 +357,6 @@ L(one_or_less):
|
|
||||||
negl %eax
|
|
||||||
orl $1, %eax
|
|
||||||
# else
|
|
||||||
- /* 'nbe' covers the case where length is negative (large
|
|
||||||
- unsigned). */
|
|
||||||
-
|
|
||||||
- jnbe __strcmp_avx2
|
|
||||||
movzbl (%rdi), %eax
|
|
||||||
movzbl (%rsi), %ecx
|
|
||||||
TOLOWER_gpr (%rax, %eax)
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,60 +0,0 @@
|
|||||||
From 259a17cc98058d2576511201f85d28cb5d9de2a2 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Stefan Liebler <stli@linux.ibm.com>
|
|
||||||
Date: Mon, 28 Jun 2021 13:01:07 +0200
|
|
||||||
Subject: s390x: Update math: redirect roundeven function
|
|
||||||
|
|
||||||
After recent commit
|
|
||||||
447954a206837b5f153869cfeeeab44631c3fac9
|
|
||||||
"math: redirect roundeven function", building on
|
|
||||||
s390x fails with:
|
|
||||||
Error: symbol `__roundevenl' is already defined
|
|
||||||
|
|
||||||
Similar to aarch64/riscv fix, this patch redirects target
|
|
||||||
specific functions for s390x:
|
|
||||||
commit 3213ed770cbc5821920d16caa93c85e92dd7b9f6
|
|
||||||
"Update math: redirect roundeven function"
|
|
||||||
|
|
||||||
diff --git a/sysdeps/s390/fpu/s_roundeven.c b/sysdeps/s390/fpu/s_roundeven.c
|
|
||||||
index 40b07e054b..0773adfed0 100644
|
|
||||||
--- a/sysdeps/s390/fpu/s_roundeven.c
|
|
||||||
+++ b/sysdeps/s390/fpu/s_roundeven.c
|
|
||||||
@@ -18,6 +18,7 @@
|
|
||||||
<https://www.gnu.org/licenses/>. */
|
|
||||||
|
|
||||||
#ifdef HAVE_S390_MIN_Z196_ZARCH_ASM_SUPPORT
|
|
||||||
+# define NO_MATH_REDIRECT
|
|
||||||
# include <math.h>
|
|
||||||
# include <libm-alias-double.h>
|
|
||||||
|
|
||||||
@@ -31,7 +32,6 @@ __roundeven (double x)
|
|
||||||
__asm__ ("fidbra %0,4,%1,4" : "=f" (y) : "f" (x));
|
|
||||||
return y;
|
|
||||||
}
|
|
||||||
-hidden_def (__roundeven)
|
|
||||||
libm_alias_double (__roundeven, roundeven)
|
|
||||||
|
|
||||||
#else
|
|
||||||
diff --git a/sysdeps/s390/fpu/s_roundevenf.c b/sysdeps/s390/fpu/s_roundevenf.c
|
|
||||||
index d2fbf3d2b6..289785bc4a 100644
|
|
||||||
--- a/sysdeps/s390/fpu/s_roundevenf.c
|
|
||||||
+++ b/sysdeps/s390/fpu/s_roundevenf.c
|
|
||||||
@@ -18,6 +18,7 @@
|
|
||||||
<https://www.gnu.org/licenses/>. */
|
|
||||||
|
|
||||||
#ifdef HAVE_S390_MIN_Z196_ZARCH_ASM_SUPPORT
|
|
||||||
+# define NO_MATH_REDIRECT
|
|
||||||
# include <math.h>
|
|
||||||
# include <libm-alias-float.h>
|
|
||||||
|
|
||||||
diff --git a/sysdeps/s390/fpu/s_roundevenl.c b/sysdeps/s390/fpu/s_roundevenl.c
|
|
||||||
index 29ab7a8616..94b6459ab4 100644
|
|
||||||
--- a/sysdeps/s390/fpu/s_roundevenl.c
|
|
||||||
+++ b/sysdeps/s390/fpu/s_roundevenl.c
|
|
||||||
@@ -18,6 +18,7 @@
|
|
||||||
<https://www.gnu.org/licenses/>. */
|
|
||||||
|
|
||||||
#ifdef HAVE_S390_MIN_Z196_ZARCH_ASM_SUPPORT
|
|
||||||
+# define NO_MATH_REDIRECT
|
|
||||||
# include <math.h>
|
|
||||||
# include <math_private.h>
|
|
||||||
# include <libm-alias-ldouble.h>
|
|
@ -1,74 +0,0 @@
|
|||||||
From 1da50d4bda07f04135dca39f40e79fc9eabed1f8 Mon Sep 17 00:00:00 2001
|
|
||||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
|
||||||
Date: Fri, 26 Feb 2021 05:36:59 -0800
|
|
||||||
Subject: [PATCH] x86: Set Prefer_No_VZEROUPPER and add Prefer_AVX2_STRCMP
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
1. Set Prefer_No_VZEROUPPER if RTM is usable to avoid RTM abort triggered
|
|
||||||
by VZEROUPPER inside a transactionally executing RTM region.
|
|
||||||
2. Since to compare 2 32-byte strings, 256-bit EVEX strcmp requires 2
|
|
||||||
loads, 3 VPCMPs and 2 KORDs while AVX2 strcmp requires 1 load, 2 VPCMPEQs,
|
|
||||||
1 VPMINU and 1 VPMOVMSKB, AVX2 strcmp is faster than EVEX strcmp. Add
|
|
||||||
Prefer_AVX2_STRCMP to prefer AVX2 strcmp family functions.
|
|
||||||
---
|
|
||||||
sysdeps/x86/cpu-features.c | 20 +++++++++++++++++--
|
|
||||||
sysdeps/x86/cpu-tunables.c | 2 ++
|
|
||||||
...cpu-features-preferred_feature_index_1.def | 1 +
|
|
||||||
3 files changed, 21 insertions(+), 2 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
|
|
||||||
index 91042505..3610ee5c 100644
|
|
||||||
--- a/sysdeps/x86/cpu-features.c
|
|
||||||
+++ b/sysdeps/x86/cpu-features.c
|
|
||||||
@@ -524,8 +524,24 @@ init_cpu_features (struct cpu_features *cpu_features)
|
|
||||||
cpu_features->preferred[index_arch_Prefer_No_VZEROUPPER]
|
|
||||||
|= bit_arch_Prefer_No_VZEROUPPER;
|
|
||||||
else
|
|
||||||
- cpu_features->preferred[index_arch_Prefer_No_AVX512]
|
|
||||||
- |= bit_arch_Prefer_No_AVX512;
|
|
||||||
+ {
|
|
||||||
+ cpu_features->preferred[index_arch_Prefer_No_AVX512]
|
|
||||||
+ |= bit_arch_Prefer_No_AVX512;
|
|
||||||
+
|
|
||||||
+ /* Avoid RTM abort triggered by VZEROUPPER inside a
|
|
||||||
+ transactionally executing RTM region. */
|
|
||||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
|
|
||||||
+ cpu_features->preferred[index_arch_Prefer_No_VZEROUPPER]
|
|
||||||
+ |= bit_arch_Prefer_No_VZEROUPPER;
|
|
||||||
+
|
|
||||||
+ /* Since to compare 2 32-byte strings, 256-bit EVEX strcmp
|
|
||||||
+ requires 2 loads, 3 VPCMPs and 2 KORDs while AVX2 strcmp
|
|
||||||
+ requires 1 load, 2 VPCMPEQs, 1 VPMINU and 1 VPMOVMSKB,
|
|
||||||
+ AVX2 strcmp is faster than EVEX strcmp. */
|
|
||||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
|
|
||||||
+ cpu_features->preferred[index_arch_Prefer_AVX2_STRCMP]
|
|
||||||
+ |= bit_arch_Prefer_AVX2_STRCMP;
|
|
||||||
+ }
|
|
||||||
}
|
|
||||||
/* This spells out "AuthenticAMD". */
|
|
||||||
else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
|
|
||||||
diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
|
|
||||||
index 3173b2b9..73adbaba 100644
|
|
||||||
--- a/sysdeps/x86/cpu-tunables.c
|
|
||||||
+++ b/sysdeps/x86/cpu-tunables.c
|
|
||||||
@@ -239,6 +239,8 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
|
|
||||||
CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
|
|
||||||
Fast_Copy_Backward,
|
|
||||||
disable, 18);
|
|
||||||
+ CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH
|
|
||||||
+ (n, cpu_features, Prefer_AVX2_STRCMP, AVX2, disable, 18);
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
case 19:
|
|
||||||
diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
|
|
||||||
index 17a5cc42..4ca70b40 100644
|
|
||||||
--- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
|
|
||||||
+++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
|
|
||||||
@@ -32,3 +32,4 @@ BIT (Prefer_ERMS)
|
|
||||||
BIT (Prefer_FSRM)
|
|
||||||
BIT (Prefer_No_AVX512)
|
|
||||||
BIT (MathVec_Prefer_No_AVX512)
|
|
||||||
+BIT (Prefer_AVX2_STRCMP)
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,26 +0,0 @@
|
|||||||
From 3213ed770cbc5821920d16caa93c85e92dd7b9f6 Mon Sep 17 00:00:00 2001
|
|
||||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
|
||||||
Date: Wed, 23 Jun 2021 13:29:41 -0700
|
|
||||||
Subject: Update math: redirect roundeven function
|
|
||||||
|
|
||||||
Redirect target specific roundeven functions for aarch64, ldbl-128ibm
|
|
||||||
and riscv.
|
|
||||||
|
|
||||||
Conflicts:
|
|
||||||
sysdeps/aarch64/*
|
|
||||||
(not needed)
|
|
||||||
sysdeps/riscv/*
|
|
||||||
(not supported)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c b/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c
|
|
||||||
index 6701970f4a..90eecf496b 100644
|
|
||||||
--- a/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c
|
|
||||||
+++ b/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c
|
|
||||||
@@ -17,6 +17,7 @@
|
|
||||||
License along with the GNU C Library; if not, see
|
|
||||||
<http://www.gnu.org/licenses/>. */
|
|
||||||
|
|
||||||
+#define NO_MATH_REDIRECT
|
|
||||||
#include <math.h>
|
|
||||||
#include <math_private.h>
|
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1,242 +0,0 @@
|
|||||||
From 63ad43566f7a25d140dc723598aeb441ad657eed Mon Sep 17 00:00:00 2001
|
|
||||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
|
||||||
Date: Fri, 5 Mar 2021 06:46:08 -0800
|
|
||||||
Subject: [PATCH] x86-64: Add memmove family functions with 256-bit EVEX
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
Update ifunc-memmove.h to select the function optimized with 256-bit EVEX
|
|
||||||
instructions using YMM16-YMM31 registers to avoid RTM abort with usable
|
|
||||||
AVX512VL since VZEROUPPER isn't needed at function exit.
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/multiarch/Makefile | 1 +
|
|
||||||
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 36 +++++++++++++++++++
|
|
||||||
sysdeps/x86_64/multiarch/ifunc-memmove.h | 21 +++++++++--
|
|
||||||
.../multiarch/memmove-evex-unaligned-erms.S | 33 +++++++++++++++++
|
|
||||||
.../multiarch/memmove-vec-unaligned-erms.S | 24 ++++++++-----
|
|
||||||
5 files changed, 104 insertions(+), 11 deletions(-)
|
|
||||||
create mode 100644 sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
|
|
||||||
index 46783cd1..4563fc56 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/Makefile
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/Makefile
|
|
||||||
@@ -41,6 +41,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
|
|
||||||
memset-avx2-unaligned-erms \
|
|
||||||
memset-avx512-unaligned-erms \
|
|
||||||
memchr-evex \
|
|
||||||
+ memmove-evex-unaligned-erms \
|
|
||||||
memrchr-evex \
|
|
||||||
rawmemchr-evex \
|
|
||||||
stpcpy-evex \
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
||||||
index 082e4da3..6bd3abfc 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
||||||
@@ -80,6 +80,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
||||||
IFUNC_IMPL_ADD (array, i, __memmove_chk,
|
|
||||||
CPU_FEATURE_USABLE (AVX),
|
|
||||||
__memmove_chk_avx_unaligned_erms)
|
|
||||||
+ IFUNC_IMPL_ADD (array, i, __memmove_chk,
|
|
||||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
|
||||||
+ __memmove_chk_evex_unaligned)
|
|
||||||
+ IFUNC_IMPL_ADD (array, i, __memmove_chk,
|
|
||||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
|
||||||
+ __memmove_chk_evex_unaligned_erms)
|
|
||||||
IFUNC_IMPL_ADD (array, i, __memmove_chk,
|
|
||||||
CPU_FEATURE_USABLE (SSSE3),
|
|
||||||
__memmove_chk_ssse3_back)
|
|
||||||
@@ -102,6 +108,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
||||||
IFUNC_IMPL_ADD (array, i, memmove,
|
|
||||||
CPU_FEATURE_USABLE (AVX),
|
|
||||||
__memmove_avx_unaligned_erms)
|
|
||||||
+ IFUNC_IMPL_ADD (array, i, memmove,
|
|
||||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
|
||||||
+ __memmove_evex_unaligned)
|
|
||||||
+ IFUNC_IMPL_ADD (array, i, memmove,
|
|
||||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
|
||||||
+ __memmove_evex_unaligned_erms)
|
|
||||||
IFUNC_IMPL_ADD (array, i, memmove,
|
|
||||||
CPU_FEATURE_USABLE (AVX512F),
|
|
||||||
__memmove_avx512_no_vzeroupper)
|
|
||||||
@@ -565,6 +577,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
||||||
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
|
|
||||||
CPU_FEATURE_USABLE (AVX),
|
|
||||||
__memcpy_chk_avx_unaligned_erms)
|
|
||||||
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk,
|
|
||||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
|
||||||
+ __memcpy_chk_evex_unaligned)
|
|
||||||
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk,
|
|
||||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
|
||||||
+ __memcpy_chk_evex_unaligned_erms)
|
|
||||||
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
|
|
||||||
CPU_FEATURE_USABLE (SSSE3),
|
|
||||||
__memcpy_chk_ssse3_back)
|
|
||||||
@@ -587,6 +605,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
||||||
IFUNC_IMPL_ADD (array, i, memcpy,
|
|
||||||
CPU_FEATURE_USABLE (AVX),
|
|
||||||
__memcpy_avx_unaligned_erms)
|
|
||||||
+ IFUNC_IMPL_ADD (array, i, memcpy,
|
|
||||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
|
||||||
+ __memcpy_evex_unaligned)
|
|
||||||
+ IFUNC_IMPL_ADD (array, i, memcpy,
|
|
||||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
|
||||||
+ __memcpy_evex_unaligned_erms)
|
|
||||||
IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
|
|
||||||
__memcpy_ssse3_back)
|
|
||||||
IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
|
|
||||||
@@ -623,6 +647,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
||||||
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
|
|
||||||
CPU_FEATURE_USABLE (AVX),
|
|
||||||
__mempcpy_chk_avx_unaligned_erms)
|
|
||||||
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
|
|
||||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
|
||||||
+ __mempcpy_chk_evex_unaligned)
|
|
||||||
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
|
|
||||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
|
||||||
+ __mempcpy_chk_evex_unaligned_erms)
|
|
||||||
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
|
|
||||||
CPU_FEATURE_USABLE (SSSE3),
|
|
||||||
__mempcpy_chk_ssse3_back)
|
|
||||||
@@ -654,6 +684,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
||||||
IFUNC_IMPL_ADD (array, i, mempcpy,
|
|
||||||
CPU_FEATURE_USABLE (AVX),
|
|
||||||
__mempcpy_avx_unaligned_erms)
|
|
||||||
+ IFUNC_IMPL_ADD (array, i, mempcpy,
|
|
||||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
|
||||||
+ __mempcpy_evex_unaligned)
|
|
||||||
+ IFUNC_IMPL_ADD (array, i, mempcpy,
|
|
||||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
|
||||||
+ __mempcpy_evex_unaligned_erms)
|
|
||||||
IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
|
|
||||||
__mempcpy_ssse3_back)
|
|
||||||
IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
|
|
||||||
index 5e5f0299..6f8bce5f 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
|
|
||||||
@@ -29,6 +29,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden;
|
|
||||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden;
|
|
||||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms)
|
|
||||||
attribute_hidden;
|
|
||||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
|
|
||||||
+ attribute_hidden;
|
|
||||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
|
|
||||||
+ attribute_hidden;
|
|
||||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned)
|
|
||||||
attribute_hidden;
|
|
||||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms)
|
|
||||||
@@ -59,10 +63,21 @@ IFUNC_SELECTOR (void)
|
|
||||||
|
|
||||||
if (CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
|
|
||||||
{
|
|
||||||
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
|
|
||||||
- return OPTIMIZE (avx_unaligned_erms);
|
|
||||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
|
|
||||||
+ {
|
|
||||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
|
|
||||||
+ return OPTIMIZE (evex_unaligned_erms);
|
|
||||||
+
|
|
||||||
+ return OPTIMIZE (evex_unaligned);
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
|
|
||||||
+ {
|
|
||||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
|
|
||||||
+ return OPTIMIZE (avx_unaligned_erms);
|
|
||||||
|
|
||||||
- return OPTIMIZE (avx_unaligned);
|
|
||||||
+ return OPTIMIZE (avx_unaligned);
|
|
||||||
+ }
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!CPU_FEATURE_USABLE_P (cpu_features, SSSE3)
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..0cbce8f9
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
|
|
||||||
@@ -0,0 +1,33 @@
|
|
||||||
+#if IS_IN (libc)
|
|
||||||
+# define VEC_SIZE 32
|
|
||||||
+# define XMM0 xmm16
|
|
||||||
+# define XMM1 xmm17
|
|
||||||
+# define YMM0 ymm16
|
|
||||||
+# define YMM1 ymm17
|
|
||||||
+# define VEC0 ymm16
|
|
||||||
+# define VEC1 ymm17
|
|
||||||
+# define VEC2 ymm18
|
|
||||||
+# define VEC3 ymm19
|
|
||||||
+# define VEC4 ymm20
|
|
||||||
+# define VEC5 ymm21
|
|
||||||
+# define VEC6 ymm22
|
|
||||||
+# define VEC7 ymm23
|
|
||||||
+# define VEC8 ymm24
|
|
||||||
+# define VEC9 ymm25
|
|
||||||
+# define VEC10 ymm26
|
|
||||||
+# define VEC11 ymm27
|
|
||||||
+# define VEC12 ymm28
|
|
||||||
+# define VEC13 ymm29
|
|
||||||
+# define VEC14 ymm30
|
|
||||||
+# define VEC15 ymm31
|
|
||||||
+# define VEC(i) VEC##i
|
|
||||||
+# define VMOVNT vmovntdq
|
|
||||||
+# define VMOVU vmovdqu64
|
|
||||||
+# define VMOVA vmovdqa64
|
|
||||||
+# define VZEROUPPER
|
|
||||||
+
|
|
||||||
+# define SECTION(p) p##.evex
|
|
||||||
+# define MEMMOVE_SYMBOL(p,s) p##_evex_##s
|
|
||||||
+
|
|
||||||
+# include "memmove-vec-unaligned-erms.S"
|
|
||||||
+#endif
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
|
||||||
index 274aa1c7..08e21692 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
|
||||||
@@ -48,6 +48,14 @@
|
|
||||||
# define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
+#ifndef XMM0
|
|
||||||
+# define XMM0 xmm0
|
|
||||||
+#endif
|
|
||||||
+
|
|
||||||
+#ifndef YMM0
|
|
||||||
+# define YMM0 ymm0
|
|
||||||
+#endif
|
|
||||||
+
|
|
||||||
#ifndef VZEROUPPER
|
|
||||||
# if VEC_SIZE > 16
|
|
||||||
# define VZEROUPPER vzeroupper
|
|
||||||
@@ -277,20 +285,20 @@ L(less_vec):
|
|
||||||
#if VEC_SIZE > 32
|
|
||||||
L(between_32_63):
|
|
||||||
/* From 32 to 63. No branch when size == 32. */
|
|
||||||
- vmovdqu (%rsi), %ymm0
|
|
||||||
- vmovdqu -32(%rsi,%rdx), %ymm1
|
|
||||||
- vmovdqu %ymm0, (%rdi)
|
|
||||||
- vmovdqu %ymm1, -32(%rdi,%rdx)
|
|
||||||
+ VMOVU (%rsi), %YMM0
|
|
||||||
+ VMOVU -32(%rsi,%rdx), %YMM1
|
|
||||||
+ VMOVU %YMM0, (%rdi)
|
|
||||||
+ VMOVU %YMM1, -32(%rdi,%rdx)
|
|
||||||
VZEROUPPER
|
|
||||||
ret
|
|
||||||
#endif
|
|
||||||
#if VEC_SIZE > 16
|
|
||||||
/* From 16 to 31. No branch when size == 16. */
|
|
||||||
L(between_16_31):
|
|
||||||
- vmovdqu (%rsi), %xmm0
|
|
||||||
- vmovdqu -16(%rsi,%rdx), %xmm1
|
|
||||||
- vmovdqu %xmm0, (%rdi)
|
|
||||||
- vmovdqu %xmm1, -16(%rdi,%rdx)
|
|
||||||
+ VMOVU (%rsi), %XMM0
|
|
||||||
+ VMOVU -16(%rsi,%rdx), %XMM1
|
|
||||||
+ VMOVU %XMM0, (%rdi)
|
|
||||||
+ VMOVU %XMM1, -16(%rdi,%rdx)
|
|
||||||
ret
|
|
||||||
#endif
|
|
||||||
L(between_8_15):
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,254 +0,0 @@
|
|||||||
From 1b968b6b9b3aac702ac2f133e0dd16cfdbb415ee Mon Sep 17 00:00:00 2001
|
|
||||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
|
||||||
Date: Fri, 5 Mar 2021 07:15:03 -0800
|
|
||||||
Subject: [PATCH] x86-64: Add memset family functions with 256-bit EVEX
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
Update ifunc-memset.h/ifunc-wmemset.h to select the function optimized
|
|
||||||
with 256-bit EVEX instructions using YMM16-YMM31 registers to avoid RTM
|
|
||||||
abort with usable AVX512VL and AVX512BW since VZEROUPPER isn't needed at
|
|
||||||
function exit.
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/multiarch/Makefile | 1 +
|
|
||||||
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 22 +++++++++++++++++
|
|
||||||
sysdeps/x86_64/multiarch/ifunc-memset.h | 24 +++++++++++++++----
|
|
||||||
sysdeps/x86_64/multiarch/ifunc-wmemset.h | 13 ++++++----
|
|
||||||
.../multiarch/memset-evex-unaligned-erms.S | 24 +++++++++++++++++++
|
|
||||||
.../multiarch/memset-vec-unaligned-erms.S | 20 +++++++++++-----
|
|
||||||
6 files changed, 90 insertions(+), 14 deletions(-)
|
|
||||||
create mode 100644 sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
|
|
||||||
index 4563fc56..1cc0a10e 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/Makefile
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/Makefile
|
|
||||||
@@ -43,6 +43,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
|
|
||||||
memchr-evex \
|
|
||||||
memmove-evex-unaligned-erms \
|
|
||||||
memrchr-evex \
|
|
||||||
+ memset-evex-unaligned-erms \
|
|
||||||
rawmemchr-evex \
|
|
||||||
stpcpy-evex \
|
|
||||||
stpncpy-evex \
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
||||||
index 6bd3abfc..7cf83485 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
||||||
@@ -160,6 +160,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
||||||
IFUNC_IMPL_ADD (array, i, __memset_chk,
|
|
||||||
CPU_FEATURE_USABLE (AVX2),
|
|
||||||
__memset_chk_avx2_unaligned_erms)
|
|
||||||
+ IFUNC_IMPL_ADD (array, i, __memset_chk,
|
|
||||||
+ (CPU_FEATURE_USABLE (AVX512VL)
|
|
||||||
+ && CPU_FEATURE_USABLE (AVX512BW)),
|
|
||||||
+ __memset_chk_evex_unaligned)
|
|
||||||
+ IFUNC_IMPL_ADD (array, i, __memset_chk,
|
|
||||||
+ (CPU_FEATURE_USABLE (AVX512VL)
|
|
||||||
+ && CPU_FEATURE_USABLE (AVX512BW)),
|
|
||||||
+ __memset_chk_evex_unaligned_erms)
|
|
||||||
IFUNC_IMPL_ADD (array, i, __memset_chk,
|
|
||||||
CPU_FEATURE_USABLE (AVX512F),
|
|
||||||
__memset_chk_avx512_unaligned_erms)
|
|
||||||
@@ -185,6 +193,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
||||||
IFUNC_IMPL_ADD (array, i, memset,
|
|
||||||
CPU_FEATURE_USABLE (AVX2),
|
|
||||||
__memset_avx2_unaligned_erms)
|
|
||||||
+ IFUNC_IMPL_ADD (array, i, memset,
|
|
||||||
+ (CPU_FEATURE_USABLE (AVX512VL)
|
|
||||||
+ && CPU_FEATURE_USABLE (AVX512BW)),
|
|
||||||
+ __memset_evex_unaligned)
|
|
||||||
+ IFUNC_IMPL_ADD (array, i, memset,
|
|
||||||
+ (CPU_FEATURE_USABLE (AVX512VL)
|
|
||||||
+ && CPU_FEATURE_USABLE (AVX512BW)),
|
|
||||||
+ __memset_evex_unaligned_erms)
|
|
||||||
IFUNC_IMPL_ADD (array, i, memset,
|
|
||||||
CPU_FEATURE_USABLE (AVX512F),
|
|
||||||
__memset_avx512_unaligned_erms)
|
|
||||||
@@ -555,6 +571,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
||||||
IFUNC_IMPL_ADD (array, i, wmemset,
|
|
||||||
CPU_FEATURE_USABLE (AVX2),
|
|
||||||
__wmemset_avx2_unaligned)
|
|
||||||
+ IFUNC_IMPL_ADD (array, i, wmemset,
|
|
||||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
|
||||||
+ __wmemset_evex_unaligned)
|
|
||||||
IFUNC_IMPL_ADD (array, i, wmemset,
|
|
||||||
CPU_FEATURE_USABLE (AVX512F),
|
|
||||||
__wmemset_avx512_unaligned))
|
|
||||||
@@ -723,6 +742,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
||||||
IFUNC_IMPL_ADD (array, i, __wmemset_chk,
|
|
||||||
CPU_FEATURE_USABLE (AVX2),
|
|
||||||
__wmemset_chk_avx2_unaligned)
|
|
||||||
+ IFUNC_IMPL_ADD (array, i, __wmemset_chk,
|
|
||||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
|
||||||
+ __wmemset_chk_evex_unaligned)
|
|
||||||
IFUNC_IMPL_ADD (array, i, __wmemset_chk,
|
|
||||||
CPU_FEATURE_USABLE (AVX512F),
|
|
||||||
__wmemset_chk_avx512_unaligned))
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
|
|
||||||
index 708bd72e..6f31f4dc 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/ifunc-memset.h
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
|
|
||||||
@@ -27,6 +27,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms)
|
|
||||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
|
|
||||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms)
|
|
||||||
attribute_hidden;
|
|
||||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
|
|
||||||
+ attribute_hidden;
|
|
||||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
|
|
||||||
+ attribute_hidden;
|
|
||||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned)
|
|
||||||
attribute_hidden;
|
|
||||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms)
|
|
||||||
@@ -56,10 +60,22 @@ IFUNC_SELECTOR (void)
|
|
||||||
|
|
||||||
if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
|
|
||||||
{
|
|
||||||
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
|
|
||||||
- return OPTIMIZE (avx2_unaligned_erms);
|
|
||||||
- else
|
|
||||||
- return OPTIMIZE (avx2_unaligned);
|
|
||||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
|
|
||||||
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
|
|
||||||
+ {
|
|
||||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
|
|
||||||
+ return OPTIMIZE (evex_unaligned_erms);
|
|
||||||
+
|
|
||||||
+ return OPTIMIZE (evex_unaligned);
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
|
|
||||||
+ {
|
|
||||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
|
|
||||||
+ return OPTIMIZE (avx2_unaligned_erms);
|
|
||||||
+
|
|
||||||
+ return OPTIMIZE (avx2_unaligned);
|
|
||||||
+ }
|
|
||||||
}
|
|
||||||
|
|
||||||
if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
|
|
||||||
index eb242210..9290c4bf 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
|
|
||||||
@@ -20,6 +20,7 @@
|
|
||||||
|
|
||||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
|
|
||||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
|
|
||||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) attribute_hidden;
|
|
||||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden;
|
|
||||||
|
|
||||||
static inline void *
|
|
||||||
@@ -27,14 +28,18 @@ IFUNC_SELECTOR (void)
|
|
||||||
{
|
|
||||||
const struct cpu_features* cpu_features = __get_cpu_features ();
|
|
||||||
|
|
||||||
- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
|
|
||||||
- && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
|
|
||||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
|
|
||||||
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
|
|
||||||
{
|
|
||||||
if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
|
|
||||||
- && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
|
|
||||||
+ && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
|
|
||||||
+ && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
|
|
||||||
return OPTIMIZE (avx512_unaligned);
|
|
||||||
- else
|
|
||||||
+
|
|
||||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
|
|
||||||
+ return OPTIMIZE (evex_unaligned);
|
|
||||||
+
|
|
||||||
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
|
|
||||||
return OPTIMIZE (avx2_unaligned);
|
|
||||||
}
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..ae0a4d6e
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
|
|
||||||
@@ -0,0 +1,24 @@
|
|
||||||
+#if IS_IN (libc)
|
|
||||||
+# define VEC_SIZE 32
|
|
||||||
+# define XMM0 xmm16
|
|
||||||
+# define YMM0 ymm16
|
|
||||||
+# define VEC0 ymm16
|
|
||||||
+# define VEC(i) VEC##i
|
|
||||||
+# define VMOVU vmovdqu64
|
|
||||||
+# define VMOVA vmovdqa64
|
|
||||||
+# define VZEROUPPER
|
|
||||||
+
|
|
||||||
+# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
|
||||||
+ movq r, %rax; \
|
|
||||||
+ vpbroadcastb d, %VEC0
|
|
||||||
+
|
|
||||||
+# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
|
||||||
+ movq r, %rax; \
|
|
||||||
+ vpbroadcastd d, %VEC0
|
|
||||||
+
|
|
||||||
+# define SECTION(p) p##.evex
|
|
||||||
+# define MEMSET_SYMBOL(p,s) p##_evex_##s
|
|
||||||
+# define WMEMSET_SYMBOL(p,s) p##_evex_##s
|
|
||||||
+
|
|
||||||
+# include "memset-vec-unaligned-erms.S"
|
|
||||||
+#endif
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
|
||||||
index 9a0fd818..71e91a8f 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
|
||||||
@@ -34,6 +34,14 @@
|
|
||||||
# define WMEMSET_CHK_SYMBOL(p,s) WMEMSET_SYMBOL(p, s)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
+#ifndef XMM0
|
|
||||||
+# define XMM0 xmm0
|
|
||||||
+#endif
|
|
||||||
+
|
|
||||||
+#ifndef YMM0
|
|
||||||
+# define YMM0 ymm0
|
|
||||||
+#endif
|
|
||||||
+
|
|
||||||
#ifndef VZEROUPPER
|
|
||||||
# if VEC_SIZE > 16
|
|
||||||
# define VZEROUPPER vzeroupper
|
|
||||||
@@ -67,7 +75,7 @@
|
|
||||||
ENTRY (__bzero)
|
|
||||||
mov %RDI_LP, %RAX_LP /* Set return value. */
|
|
||||||
mov %RSI_LP, %RDX_LP /* Set n. */
|
|
||||||
- pxor %xmm0, %xmm0
|
|
||||||
+ pxor %XMM0, %XMM0
|
|
||||||
jmp L(entry_from_bzero)
|
|
||||||
END (__bzero)
|
|
||||||
weak_alias (__bzero, bzero)
|
|
||||||
@@ -223,7 +231,7 @@ L(less_vec):
|
|
||||||
cmpb $16, %dl
|
|
||||||
jae L(between_16_31)
|
|
||||||
# endif
|
|
||||||
- MOVQ %xmm0, %rcx
|
|
||||||
+ MOVQ %XMM0, %rcx
|
|
||||||
cmpb $8, %dl
|
|
||||||
jae L(between_8_15)
|
|
||||||
cmpb $4, %dl
|
|
||||||
@@ -238,16 +246,16 @@ L(less_vec):
|
|
||||||
# if VEC_SIZE > 32
|
|
||||||
/* From 32 to 63. No branch when size == 32. */
|
|
||||||
L(between_32_63):
|
|
||||||
- vmovdqu %ymm0, -32(%rdi,%rdx)
|
|
||||||
- vmovdqu %ymm0, (%rdi)
|
|
||||||
+ VMOVU %YMM0, -32(%rdi,%rdx)
|
|
||||||
+ VMOVU %YMM0, (%rdi)
|
|
||||||
VZEROUPPER
|
|
||||||
ret
|
|
||||||
# endif
|
|
||||||
# if VEC_SIZE > 16
|
|
||||||
/* From 16 to 31. No branch when size == 16. */
|
|
||||||
L(between_16_31):
|
|
||||||
- vmovdqu %xmm0, -16(%rdi,%rdx)
|
|
||||||
- vmovdqu %xmm0, (%rdi)
|
|
||||||
+ VMOVU %XMM0, -16(%rdi,%rdx)
|
|
||||||
+ VMOVU %XMM0, (%rdi)
|
|
||||||
VZEROUPPER
|
|
||||||
ret
|
|
||||||
# endif
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,561 +0,0 @@
|
|||||||
From 91264fe3577fe887b4860923fa6142b5274c8965 Mon Sep 17 00:00:00 2001
|
|
||||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
|
||||||
Date: Fri, 5 Mar 2021 07:20:28 -0800
|
|
||||||
Subject: [PATCH] x86-64: Add memcmp family functions with 256-bit EVEX
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
Update ifunc-memcmp.h to select the function optimized with 256-bit EVEX
|
|
||||||
instructions using YMM16-YMM31 registers to avoid RTM abort with usable
|
|
||||||
AVX512VL, AVX512BW and MOVBE since VZEROUPPER isn't needed at function
|
|
||||||
exit.
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/multiarch/Makefile | 4 +-
|
|
||||||
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 10 +
|
|
||||||
sysdeps/x86_64/multiarch/ifunc-memcmp.h | 13 +-
|
|
||||||
sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 440 ++++++++++++++++++
|
|
||||||
sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S | 4 +
|
|
||||||
5 files changed, 467 insertions(+), 4 deletions(-)
|
|
||||||
create mode 100644 sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
|
|
||||||
create mode 100644 sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
|
|
||||||
index 1cc0a10e..9d79b138 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/Makefile
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/Makefile
|
|
||||||
@@ -41,6 +41,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
|
|
||||||
memset-avx2-unaligned-erms \
|
|
||||||
memset-avx512-unaligned-erms \
|
|
||||||
memchr-evex \
|
|
||||||
+ memcmp-evex-movbe \
|
|
||||||
memmove-evex-unaligned-erms \
|
|
||||||
memrchr-evex \
|
|
||||||
memset-evex-unaligned-erms \
|
|
||||||
@@ -81,7 +82,8 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
|
|
||||||
wcsncmp-evex \
|
|
||||||
wcsnlen-evex \
|
|
||||||
wcsrchr-evex \
|
|
||||||
- wmemchr-evex
|
|
||||||
+ wmemchr-evex \
|
|
||||||
+ wmemcmp-evex-movbe
|
|
||||||
endif
|
|
||||||
|
|
||||||
ifeq ($(subdir),debug)
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
||||||
index 7cf83485..c8da910e 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
||||||
@@ -56,6 +56,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
||||||
(CPU_FEATURE_USABLE (AVX2)
|
|
||||||
&& CPU_FEATURE_USABLE (MOVBE)),
|
|
||||||
__memcmp_avx2_movbe)
|
|
||||||
+ IFUNC_IMPL_ADD (array, i, memcmp,
|
|
||||||
+ (CPU_FEATURE_USABLE (AVX512VL)
|
|
||||||
+ && CPU_FEATURE_USABLE (AVX512BW)
|
|
||||||
+ && CPU_FEATURE_USABLE (MOVBE)),
|
|
||||||
+ __memcmp_evex_movbe)
|
|
||||||
IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1),
|
|
||||||
__memcmp_sse4_1)
|
|
||||||
IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3),
|
|
||||||
@@ -558,6 +563,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
||||||
(CPU_FEATURE_USABLE (AVX2)
|
|
||||||
&& CPU_FEATURE_USABLE (MOVBE)),
|
|
||||||
__wmemcmp_avx2_movbe)
|
|
||||||
+ IFUNC_IMPL_ADD (array, i, wmemcmp,
|
|
||||||
+ (CPU_FEATURE_USABLE (AVX512VL)
|
|
||||||
+ && CPU_FEATURE_USABLE (AVX512BW)
|
|
||||||
+ && CPU_FEATURE_USABLE (MOVBE)),
|
|
||||||
+ __wmemcmp_evex_movbe)
|
|
||||||
IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1),
|
|
||||||
__wmemcmp_sse4_1)
|
|
||||||
IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3),
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
|
|
||||||
index 6c1f3153..3ca1f0a6 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
|
|
||||||
@@ -23,17 +23,24 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
|
|
||||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
|
|
||||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
|
|
||||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden;
|
|
||||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden;
|
|
||||||
|
|
||||||
static inline void *
|
|
||||||
IFUNC_SELECTOR (void)
|
|
||||||
{
|
|
||||||
const struct cpu_features* cpu_features = __get_cpu_features ();
|
|
||||||
|
|
||||||
- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
|
|
||||||
- && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
|
|
||||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
|
|
||||||
&& CPU_FEATURE_USABLE_P (cpu_features, MOVBE)
|
|
||||||
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
|
|
||||||
- return OPTIMIZE (avx2_movbe);
|
|
||||||
+ {
|
|
||||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
|
|
||||||
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
|
|
||||||
+ return OPTIMIZE (evex_movbe);
|
|
||||||
+
|
|
||||||
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
|
|
||||||
+ return OPTIMIZE (avx2_movbe);
|
|
||||||
+ }
|
|
||||||
|
|
||||||
if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
|
|
||||||
return OPTIMIZE (sse4_1);
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..9c093972
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
|
|
||||||
@@ -0,0 +1,440 @@
|
|
||||||
+/* memcmp/wmemcmp optimized with 256-bit EVEX instructions.
|
|
||||||
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
|
||||||
+ This file is part of the GNU C Library.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is free software; you can redistribute it and/or
|
|
||||||
+ modify it under the terms of the GNU Lesser General Public
|
|
||||||
+ License as published by the Free Software Foundation; either
|
|
||||||
+ version 2.1 of the License, or (at your option) any later version.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
||||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
+ Lesser General Public License for more details.
|
|
||||||
+
|
|
||||||
+ You should have received a copy of the GNU Lesser General Public
|
|
||||||
+ License along with the GNU C Library; if not, see
|
|
||||||
+ <https://www.gnu.org/licenses/>. */
|
|
||||||
+
|
|
||||||
+#if IS_IN (libc)
|
|
||||||
+
|
|
||||||
+/* memcmp/wmemcmp is implemented as:
|
|
||||||
+ 1. For size from 2 to 7 bytes, load as big endian with movbe and bswap
|
|
||||||
+ to avoid branches.
|
|
||||||
+ 2. Use overlapping compare to avoid branch.
|
|
||||||
+ 3. Use vector compare when size >= 4 bytes for memcmp or size >= 8
|
|
||||||
+ bytes for wmemcmp.
|
|
||||||
+ 4. If size is 8 * VEC_SIZE or less, unroll the loop.
|
|
||||||
+ 5. Compare 4 * VEC_SIZE at a time with the aligned first memory
|
|
||||||
+ area.
|
|
||||||
+ 6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
|
|
||||||
+ 7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
|
|
||||||
+ 8. Use 8 vector compares when size is 8 * VEC_SIZE or less. */
|
|
||||||
+
|
|
||||||
+# include <sysdep.h>
|
|
||||||
+
|
|
||||||
+# ifndef MEMCMP
|
|
||||||
+# define MEMCMP __memcmp_evex_movbe
|
|
||||||
+# endif
|
|
||||||
+
|
|
||||||
+# define VMOVU vmovdqu64
|
|
||||||
+
|
|
||||||
+# ifdef USE_AS_WMEMCMP
|
|
||||||
+# define VPCMPEQ vpcmpeqd
|
|
||||||
+# else
|
|
||||||
+# define VPCMPEQ vpcmpeqb
|
|
||||||
+# endif
|
|
||||||
+
|
|
||||||
+# define XMM1 xmm17
|
|
||||||
+# define XMM2 xmm18
|
|
||||||
+# define YMM1 ymm17
|
|
||||||
+# define YMM2 ymm18
|
|
||||||
+# define YMM3 ymm19
|
|
||||||
+# define YMM4 ymm20
|
|
||||||
+# define YMM5 ymm21
|
|
||||||
+# define YMM6 ymm22
|
|
||||||
+
|
|
||||||
+# define VEC_SIZE 32
|
|
||||||
+# ifdef USE_AS_WMEMCMP
|
|
||||||
+# define VEC_MASK 0xff
|
|
||||||
+# define XMM_MASK 0xf
|
|
||||||
+# else
|
|
||||||
+# define VEC_MASK 0xffffffff
|
|
||||||
+# define XMM_MASK 0xffff
|
|
||||||
+# endif
|
|
||||||
+
|
|
||||||
+/* Warning!
|
|
||||||
+ wmemcmp has to use SIGNED comparison for elements.
|
|
||||||
+ memcmp has to use UNSIGNED comparison for elements.
|
|
||||||
+*/
|
|
||||||
+
|
|
||||||
+ .section .text.evex,"ax",@progbits
|
|
||||||
+ENTRY (MEMCMP)
|
|
||||||
+# ifdef USE_AS_WMEMCMP
|
|
||||||
+ shl $2, %RDX_LP
|
|
||||||
+# elif defined __ILP32__
|
|
||||||
+ /* Clear the upper 32 bits. */
|
|
||||||
+ movl %edx, %edx
|
|
||||||
+# endif
|
|
||||||
+ cmp $VEC_SIZE, %RDX_LP
|
|
||||||
+ jb L(less_vec)
|
|
||||||
+
|
|
||||||
+ /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
|
|
||||||
+ VMOVU (%rsi), %YMM2
|
|
||||||
+ VPCMPEQ (%rdi), %YMM2, %k1
|
|
||||||
+ kmovd %k1, %eax
|
|
||||||
+ subl $VEC_MASK, %eax
|
|
||||||
+ jnz L(first_vec)
|
|
||||||
+
|
|
||||||
+ cmpq $(VEC_SIZE * 2), %rdx
|
|
||||||
+ jbe L(last_vec)
|
|
||||||
+
|
|
||||||
+ /* More than 2 * VEC. */
|
|
||||||
+ cmpq $(VEC_SIZE * 8), %rdx
|
|
||||||
+ ja L(more_8x_vec)
|
|
||||||
+ cmpq $(VEC_SIZE * 4), %rdx
|
|
||||||
+ jb L(last_4x_vec)
|
|
||||||
+
|
|
||||||
+ /* From 4 * VEC to 8 * VEC, inclusively. */
|
|
||||||
+ VMOVU (%rsi), %YMM1
|
|
||||||
+ VPCMPEQ (%rdi), %YMM1, %k1
|
|
||||||
+
|
|
||||||
+ VMOVU VEC_SIZE(%rsi), %YMM2
|
|
||||||
+ VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
|
|
||||||
+
|
|
||||||
+ VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
|
|
||||||
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
|
|
||||||
+
|
|
||||||
+ VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
|
|
||||||
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
|
|
||||||
+
|
|
||||||
+ kandd %k1, %k2, %k5
|
|
||||||
+ kandd %k3, %k4, %k6
|
|
||||||
+ kandd %k5, %k6, %k6
|
|
||||||
+
|
|
||||||
+ kmovd %k6, %eax
|
|
||||||
+ cmpl $VEC_MASK, %eax
|
|
||||||
+ jne L(4x_vec_end)
|
|
||||||
+
|
|
||||||
+ leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi
|
|
||||||
+ leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi
|
|
||||||
+ VMOVU (%rsi), %YMM1
|
|
||||||
+ VPCMPEQ (%rdi), %YMM1, %k1
|
|
||||||
+
|
|
||||||
+ VMOVU VEC_SIZE(%rsi), %YMM2
|
|
||||||
+ VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
|
|
||||||
+ kandd %k1, %k2, %k5
|
|
||||||
+
|
|
||||||
+ VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
|
|
||||||
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
|
|
||||||
+ kandd %k3, %k5, %k5
|
|
||||||
+
|
|
||||||
+ VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
|
|
||||||
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
|
|
||||||
+ kandd %k4, %k5, %k5
|
|
||||||
+
|
|
||||||
+ kmovd %k5, %eax
|
|
||||||
+ cmpl $VEC_MASK, %eax
|
|
||||||
+ jne L(4x_vec_end)
|
|
||||||
+ xorl %eax, %eax
|
|
||||||
+ ret
|
|
||||||
+
|
|
||||||
+ .p2align 4
|
|
||||||
+L(last_2x_vec):
|
|
||||||
+ /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
|
|
||||||
+ VMOVU (%rsi), %YMM2
|
|
||||||
+ VPCMPEQ (%rdi), %YMM2, %k2
|
|
||||||
+ kmovd %k2, %eax
|
|
||||||
+ subl $VEC_MASK, %eax
|
|
||||||
+ jnz L(first_vec)
|
|
||||||
+
|
|
||||||
+L(last_vec):
|
|
||||||
+ /* Use overlapping loads to avoid branches. */
|
|
||||||
+ leaq -VEC_SIZE(%rdi, %rdx), %rdi
|
|
||||||
+ leaq -VEC_SIZE(%rsi, %rdx), %rsi
|
|
||||||
+ VMOVU (%rsi), %YMM2
|
|
||||||
+ VPCMPEQ (%rdi), %YMM2, %k2
|
|
||||||
+ kmovd %k2, %eax
|
|
||||||
+ subl $VEC_MASK, %eax
|
|
||||||
+ jnz L(first_vec)
|
|
||||||
+ ret
|
|
||||||
+
|
|
||||||
+ .p2align 4
|
|
||||||
+L(first_vec):
|
|
||||||
+ /* A byte or int32 is different within 16 or 32 bytes. */
|
|
||||||
+ tzcntl %eax, %ecx
|
|
||||||
+# ifdef USE_AS_WMEMCMP
|
|
||||||
+ xorl %eax, %eax
|
|
||||||
+ movl (%rdi, %rcx, 4), %edx
|
|
||||||
+ cmpl (%rsi, %rcx, 4), %edx
|
|
||||||
+L(wmemcmp_return):
|
|
||||||
+ setl %al
|
|
||||||
+ negl %eax
|
|
||||||
+ orl $1, %eax
|
|
||||||
+# else
|
|
||||||
+ movzbl (%rdi, %rcx), %eax
|
|
||||||
+ movzbl (%rsi, %rcx), %edx
|
|
||||||
+ sub %edx, %eax
|
|
||||||
+# endif
|
|
||||||
+ ret
|
|
||||||
+
|
|
||||||
+# ifdef USE_AS_WMEMCMP
|
|
||||||
+ .p2align 4
|
|
||||||
+L(4):
|
|
||||||
+ xorl %eax, %eax
|
|
||||||
+ movl (%rdi), %edx
|
|
||||||
+ cmpl (%rsi), %edx
|
|
||||||
+ jne L(wmemcmp_return)
|
|
||||||
+ ret
|
|
||||||
+# else
|
|
||||||
+ .p2align 4
|
|
||||||
+L(between_4_7):
|
|
||||||
+ /* Load as big endian with overlapping movbe to avoid branches. */
|
|
||||||
+ movbe (%rdi), %eax
|
|
||||||
+ movbe (%rsi), %ecx
|
|
||||||
+ shlq $32, %rax
|
|
||||||
+ shlq $32, %rcx
|
|
||||||
+ movbe -4(%rdi, %rdx), %edi
|
|
||||||
+ movbe -4(%rsi, %rdx), %esi
|
|
||||||
+ orq %rdi, %rax
|
|
||||||
+ orq %rsi, %rcx
|
|
||||||
+ subq %rcx, %rax
|
|
||||||
+ je L(exit)
|
|
||||||
+ sbbl %eax, %eax
|
|
||||||
+ orl $1, %eax
|
|
||||||
+ ret
|
|
||||||
+
|
|
||||||
+ .p2align 4
|
|
||||||
+L(exit):
|
|
||||||
+ ret
|
|
||||||
+
|
|
||||||
+ .p2align 4
|
|
||||||
+L(between_2_3):
|
|
||||||
+ /* Load as big endian to avoid branches. */
|
|
||||||
+ movzwl (%rdi), %eax
|
|
||||||
+ movzwl (%rsi), %ecx
|
|
||||||
+ shll $8, %eax
|
|
||||||
+ shll $8, %ecx
|
|
||||||
+ bswap %eax
|
|
||||||
+ bswap %ecx
|
|
||||||
+ movb -1(%rdi, %rdx), %al
|
|
||||||
+ movb -1(%rsi, %rdx), %cl
|
|
||||||
+ /* Subtraction is okay because the upper 8 bits are zero. */
|
|
||||||
+ subl %ecx, %eax
|
|
||||||
+ ret
|
|
||||||
+
|
|
||||||
+ .p2align 4
|
|
||||||
+L(1):
|
|
||||||
+ movzbl (%rdi), %eax
|
|
||||||
+ movzbl (%rsi), %ecx
|
|
||||||
+ subl %ecx, %eax
|
|
||||||
+ ret
|
|
||||||
+# endif
|
|
||||||
+
|
|
||||||
+ .p2align 4
|
|
||||||
+L(zero):
|
|
||||||
+ xorl %eax, %eax
|
|
||||||
+ ret
|
|
||||||
+
|
|
||||||
+ .p2align 4
|
|
||||||
+L(less_vec):
|
|
||||||
+# ifdef USE_AS_WMEMCMP
|
|
||||||
+ /* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes. */
|
|
||||||
+ cmpb $4, %dl
|
|
||||||
+ je L(4)
|
|
||||||
+ jb L(zero)
|
|
||||||
+# else
|
|
||||||
+ cmpb $1, %dl
|
|
||||||
+ je L(1)
|
|
||||||
+ jb L(zero)
|
|
||||||
+ cmpb $4, %dl
|
|
||||||
+ jb L(between_2_3)
|
|
||||||
+ cmpb $8, %dl
|
|
||||||
+ jb L(between_4_7)
|
|
||||||
+# endif
|
|
||||||
+ cmpb $16, %dl
|
|
||||||
+ jae L(between_16_31)
|
|
||||||
+ /* It is between 8 and 15 bytes. */
|
|
||||||
+ vmovq (%rdi), %XMM1
|
|
||||||
+ vmovq (%rsi), %XMM2
|
|
||||||
+ VPCMPEQ %XMM1, %XMM2, %k2
|
|
||||||
+ kmovw %k2, %eax
|
|
||||||
+ subl $XMM_MASK, %eax
|
|
||||||
+ jnz L(first_vec)
|
|
||||||
+ /* Use overlapping loads to avoid branches. */
|
|
||||||
+ leaq -8(%rdi, %rdx), %rdi
|
|
||||||
+ leaq -8(%rsi, %rdx), %rsi
|
|
||||||
+ vmovq (%rdi), %XMM1
|
|
||||||
+ vmovq (%rsi), %XMM2
|
|
||||||
+ VPCMPEQ %XMM1, %XMM2, %k2
|
|
||||||
+ kmovw %k2, %eax
|
|
||||||
+ subl $XMM_MASK, %eax
|
|
||||||
+ jnz L(first_vec)
|
|
||||||
+ ret
|
|
||||||
+
|
|
||||||
+ .p2align 4
|
|
||||||
+L(between_16_31):
|
|
||||||
+ /* From 16 to 31 bytes. No branch when size == 16. */
|
|
||||||
+ VMOVU (%rsi), %XMM2
|
|
||||||
+ VPCMPEQ (%rdi), %XMM2, %k2
|
|
||||||
+ kmovw %k2, %eax
|
|
||||||
+ subl $XMM_MASK, %eax
|
|
||||||
+ jnz L(first_vec)
|
|
||||||
+
|
|
||||||
+ /* Use overlapping loads to avoid branches. */
|
|
||||||
+ leaq -16(%rdi, %rdx), %rdi
|
|
||||||
+ leaq -16(%rsi, %rdx), %rsi
|
|
||||||
+ VMOVU (%rsi), %XMM2
|
|
||||||
+ VPCMPEQ (%rdi), %XMM2, %k2
|
|
||||||
+ kmovw %k2, %eax
|
|
||||||
+ subl $XMM_MASK, %eax
|
|
||||||
+ jnz L(first_vec)
|
|
||||||
+ ret
|
|
||||||
+
|
|
||||||
+ .p2align 4
|
|
||||||
+L(more_8x_vec):
|
|
||||||
+ /* More than 8 * VEC. Check the first VEC. */
|
|
||||||
+ VMOVU (%rsi), %YMM2
|
|
||||||
+ VPCMPEQ (%rdi), %YMM2, %k2
|
|
||||||
+ kmovd %k2, %eax
|
|
||||||
+ subl $VEC_MASK, %eax
|
|
||||||
+ jnz L(first_vec)
|
|
||||||
+
|
|
||||||
+ /* Align the first memory area for aligned loads in the loop.
|
|
||||||
+ Compute how much the first memory area is misaligned. */
|
|
||||||
+ movq %rdi, %rcx
|
|
||||||
+ andl $(VEC_SIZE - 1), %ecx
|
|
||||||
+ /* Get the negative of offset for alignment. */
|
|
||||||
+ subq $VEC_SIZE, %rcx
|
|
||||||
+ /* Adjust the second memory area. */
|
|
||||||
+ subq %rcx, %rsi
|
|
||||||
+ /* Adjust the first memory area which should be aligned now. */
|
|
||||||
+ subq %rcx, %rdi
|
|
||||||
+ /* Adjust length. */
|
|
||||||
+ addq %rcx, %rdx
|
|
||||||
+
|
|
||||||
+L(loop_4x_vec):
|
|
||||||
+ /* Compare 4 * VEC at a time forward. */
|
|
||||||
+ VMOVU (%rsi), %YMM1
|
|
||||||
+ VPCMPEQ (%rdi), %YMM1, %k1
|
|
||||||
+
|
|
||||||
+ VMOVU VEC_SIZE(%rsi), %YMM2
|
|
||||||
+ VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
|
|
||||||
+ kandd %k2, %k1, %k5
|
|
||||||
+
|
|
||||||
+ VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
|
|
||||||
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
|
|
||||||
+ kandd %k3, %k5, %k5
|
|
||||||
+
|
|
||||||
+ VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
|
|
||||||
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
|
|
||||||
+ kandd %k4, %k5, %k5
|
|
||||||
+
|
|
||||||
+ kmovd %k5, %eax
|
|
||||||
+ cmpl $VEC_MASK, %eax
|
|
||||||
+ jne L(4x_vec_end)
|
|
||||||
+
|
|
||||||
+ addq $(VEC_SIZE * 4), %rdi
|
|
||||||
+ addq $(VEC_SIZE * 4), %rsi
|
|
||||||
+
|
|
||||||
+ subq $(VEC_SIZE * 4), %rdx
|
|
||||||
+ cmpq $(VEC_SIZE * 4), %rdx
|
|
||||||
+ jae L(loop_4x_vec)
|
|
||||||
+
|
|
||||||
+ /* Less than 4 * VEC. */
|
|
||||||
+ cmpq $VEC_SIZE, %rdx
|
|
||||||
+ jbe L(last_vec)
|
|
||||||
+ cmpq $(VEC_SIZE * 2), %rdx
|
|
||||||
+ jbe L(last_2x_vec)
|
|
||||||
+
|
|
||||||
+L(last_4x_vec):
|
|
||||||
+ /* From 2 * VEC to 4 * VEC. */
|
|
||||||
+ VMOVU (%rsi), %YMM2
|
|
||||||
+ VPCMPEQ (%rdi), %YMM2, %k2
|
|
||||||
+ kmovd %k2, %eax
|
|
||||||
+ subl $VEC_MASK, %eax
|
|
||||||
+ jnz L(first_vec)
|
|
||||||
+
|
|
||||||
+ addq $VEC_SIZE, %rdi
|
|
||||||
+ addq $VEC_SIZE, %rsi
|
|
||||||
+ VMOVU (%rsi), %YMM2
|
|
||||||
+ VPCMPEQ (%rdi), %YMM2, %k2
|
|
||||||
+ kmovd %k2, %eax
|
|
||||||
+ subl $VEC_MASK, %eax
|
|
||||||
+ jnz L(first_vec)
|
|
||||||
+
|
|
||||||
+ /* Use overlapping loads to avoid branches. */
|
|
||||||
+ leaq -(3 * VEC_SIZE)(%rdi, %rdx), %rdi
|
|
||||||
+ leaq -(3 * VEC_SIZE)(%rsi, %rdx), %rsi
|
|
||||||
+ VMOVU (%rsi), %YMM2
|
|
||||||
+ VPCMPEQ (%rdi), %YMM2, %k2
|
|
||||||
+ kmovd %k2, %eax
|
|
||||||
+ subl $VEC_MASK, %eax
|
|
||||||
+ jnz L(first_vec)
|
|
||||||
+
|
|
||||||
+ addq $VEC_SIZE, %rdi
|
|
||||||
+ addq $VEC_SIZE, %rsi
|
|
||||||
+ VMOVU (%rsi), %YMM2
|
|
||||||
+ VPCMPEQ (%rdi), %YMM2, %k2
|
|
||||||
+ kmovd %k2, %eax
|
|
||||||
+ subl $VEC_MASK, %eax
|
|
||||||
+ jnz L(first_vec)
|
|
||||||
+ ret
|
|
||||||
+
|
|
||||||
+ .p2align 4
|
|
||||||
+L(4x_vec_end):
|
|
||||||
+ kmovd %k1, %eax
|
|
||||||
+ subl $VEC_MASK, %eax
|
|
||||||
+ jnz L(first_vec)
|
|
||||||
+ kmovd %k2, %eax
|
|
||||||
+ subl $VEC_MASK, %eax
|
|
||||||
+ jnz L(first_vec_x1)
|
|
||||||
+ kmovd %k3, %eax
|
|
||||||
+ subl $VEC_MASK, %eax
|
|
||||||
+ jnz L(first_vec_x2)
|
|
||||||
+ kmovd %k4, %eax
|
|
||||||
+ subl $VEC_MASK, %eax
|
|
||||||
+ tzcntl %eax, %ecx
|
|
||||||
+# ifdef USE_AS_WMEMCMP
|
|
||||||
+ xorl %eax, %eax
|
|
||||||
+ movl (VEC_SIZE * 3)(%rdi, %rcx, 4), %edx
|
|
||||||
+ cmpl (VEC_SIZE * 3)(%rsi, %rcx, 4), %edx
|
|
||||||
+ jmp L(wmemcmp_return)
|
|
||||||
+# else
|
|
||||||
+ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
|
|
||||||
+ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx
|
|
||||||
+ sub %edx, %eax
|
|
||||||
+# endif
|
|
||||||
+ ret
|
|
||||||
+
|
|
||||||
+ .p2align 4
|
|
||||||
+L(first_vec_x1):
|
|
||||||
+ tzcntl %eax, %ecx
|
|
||||||
+# ifdef USE_AS_WMEMCMP
|
|
||||||
+ xorl %eax, %eax
|
|
||||||
+ movl VEC_SIZE(%rdi, %rcx, 4), %edx
|
|
||||||
+ cmpl VEC_SIZE(%rsi, %rcx, 4), %edx
|
|
||||||
+ jmp L(wmemcmp_return)
|
|
||||||
+# else
|
|
||||||
+ movzbl VEC_SIZE(%rdi, %rcx), %eax
|
|
||||||
+ movzbl VEC_SIZE(%rsi, %rcx), %edx
|
|
||||||
+ sub %edx, %eax
|
|
||||||
+# endif
|
|
||||||
+ ret
|
|
||||||
+
|
|
||||||
+ .p2align 4
|
|
||||||
+L(first_vec_x2):
|
|
||||||
+ tzcntl %eax, %ecx
|
|
||||||
+# ifdef USE_AS_WMEMCMP
|
|
||||||
+ xorl %eax, %eax
|
|
||||||
+ movl (VEC_SIZE * 2)(%rdi, %rcx, 4), %edx
|
|
||||||
+ cmpl (VEC_SIZE * 2)(%rsi, %rcx, 4), %edx
|
|
||||||
+ jmp L(wmemcmp_return)
|
|
||||||
+# else
|
|
||||||
+ movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
|
|
||||||
+ movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx
|
|
||||||
+ sub %edx, %eax
|
|
||||||
+# endif
|
|
||||||
+ ret
|
|
||||||
+END (MEMCMP)
|
|
||||||
+#endif
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..4726d74a
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S
|
|
||||||
@@ -0,0 +1,4 @@
|
|
||||||
+#define MEMCMP __wmemcmp_evex_movbe
|
|
||||||
+#define USE_AS_WMEMCMP 1
|
|
||||||
+
|
|
||||||
+#include "memcmp-evex-movbe.S"
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
@ -1,735 +0,0 @@
|
|||||||
From 4bd660be40967cd69072f69ebc2ad32bfcc1f206 Mon Sep 17 00:00:00 2001
|
|
||||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
|
||||||
Date: Tue, 23 Feb 2021 06:33:10 -0800
|
|
||||||
Subject: [PATCH] x86: Add string/memory function tests in RTM region
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
At function exit, AVX optimized string/memory functions have VZEROUPPER
|
|
||||||
which triggers RTM abort. When such functions are called inside a
|
|
||||||
transactionally executing RTM region, RTM abort causes severe performance
|
|
||||||
degradation. Add tests to verify that string/memory functions won't
|
|
||||||
cause RTM abort in RTM region.
|
|
||||||
---
|
|
||||||
sysdeps/x86/Makefile | 23 +++++++++++
|
|
||||||
sysdeps/x86/tst-memchr-rtm.c | 54 ++++++++++++++++++++++++++
|
|
||||||
sysdeps/x86/tst-memcmp-rtm.c | 52 +++++++++++++++++++++++++
|
|
||||||
sysdeps/x86/tst-memmove-rtm.c | 53 ++++++++++++++++++++++++++
|
|
||||||
sysdeps/x86/tst-memrchr-rtm.c | 54 ++++++++++++++++++++++++++
|
|
||||||
sysdeps/x86/tst-memset-rtm.c | 45 ++++++++++++++++++++++
|
|
||||||
sysdeps/x86/tst-strchr-rtm.c | 54 ++++++++++++++++++++++++++
|
|
||||||
sysdeps/x86/tst-strcpy-rtm.c | 53 ++++++++++++++++++++++++++
|
|
||||||
sysdeps/x86/tst-string-rtm.h | 72 +++++++++++++++++++++++++++++++++++
|
|
||||||
sysdeps/x86/tst-strlen-rtm.c | 53 ++++++++++++++++++++++++++
|
|
||||||
sysdeps/x86/tst-strncmp-rtm.c | 52 +++++++++++++++++++++++++
|
|
||||||
sysdeps/x86/tst-strrchr-rtm.c | 53 ++++++++++++++++++++++++++
|
|
||||||
12 files changed, 618 insertions(+)
|
|
||||||
create mode 100644 sysdeps/x86/tst-memchr-rtm.c
|
|
||||||
create mode 100644 sysdeps/x86/tst-memcmp-rtm.c
|
|
||||||
create mode 100644 sysdeps/x86/tst-memmove-rtm.c
|
|
||||||
create mode 100644 sysdeps/x86/tst-memrchr-rtm.c
|
|
||||||
create mode 100644 sysdeps/x86/tst-memset-rtm.c
|
|
||||||
create mode 100644 sysdeps/x86/tst-strchr-rtm.c
|
|
||||||
create mode 100644 sysdeps/x86/tst-strcpy-rtm.c
|
|
||||||
create mode 100644 sysdeps/x86/tst-string-rtm.h
|
|
||||||
create mode 100644 sysdeps/x86/tst-strlen-rtm.c
|
|
||||||
create mode 100644 sysdeps/x86/tst-strncmp-rtm.c
|
|
||||||
create mode 100644 sysdeps/x86/tst-strrchr-rtm.c
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
|
|
||||||
index 59e928e9..5be71ada 100644
|
|
||||||
--- a/sysdeps/x86/Makefile
|
|
||||||
+++ b/sysdeps/x86/Makefile
|
|
||||||
@@ -17,6 +17,29 @@ endif
|
|
||||||
|
|
||||||
ifeq ($(subdir),string)
|
|
||||||
sysdep_routines += cacheinfo
|
|
||||||
+
|
|
||||||
+tests += \
|
|
||||||
+ tst-memchr-rtm \
|
|
||||||
+ tst-memcmp-rtm \
|
|
||||||
+ tst-memmove-rtm \
|
|
||||||
+ tst-memrchr-rtm \
|
|
||||||
+ tst-memset-rtm \
|
|
||||||
+ tst-strchr-rtm \
|
|
||||||
+ tst-strcpy-rtm \
|
|
||||||
+ tst-strlen-rtm \
|
|
||||||
+ tst-strncmp-rtm \
|
|
||||||
+ tst-strrchr-rtm
|
|
||||||
+
|
|
||||||
+CFLAGS-tst-memchr-rtm.c += -mrtm
|
|
||||||
+CFLAGS-tst-memcmp-rtm.c += -mrtm
|
|
||||||
+CFLAGS-tst-memmove-rtm.c += -mrtm
|
|
||||||
+CFLAGS-tst-memrchr-rtm.c += -mrtm
|
|
||||||
+CFLAGS-tst-memset-rtm.c += -mrtm
|
|
||||||
+CFLAGS-tst-strchr-rtm.c += -mrtm
|
|
||||||
+CFLAGS-tst-strcpy-rtm.c += -mrtm
|
|
||||||
+CFLAGS-tst-strlen-rtm.c += -mrtm
|
|
||||||
+CFLAGS-tst-strncmp-rtm.c += -mrtm
|
|
||||||
+CFLAGS-tst-strrchr-rtm.c += -mrtm
|
|
||||||
endif
|
|
||||||
|
|
||||||
ifneq ($(enable-cet),no)
|
|
||||||
diff --git a/sysdeps/x86/tst-memchr-rtm.c b/sysdeps/x86/tst-memchr-rtm.c
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..e4749401
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/x86/tst-memchr-rtm.c
|
|
||||||
@@ -0,0 +1,54 @@
|
|
||||||
+/* Test case for memchr inside a transactionally executing RTM region.
|
|
||||||
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
|
||||||
+ This file is part of the GNU C Library.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is free software; you can redistribute it and/or
|
|
||||||
+ modify it under the terms of the GNU Lesser General Public
|
|
||||||
+ License as published by the Free Software Foundation; either
|
|
||||||
+ version 2.1 of the License, or (at your option) any later version.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
||||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
+ Lesser General Public License for more details.
|
|
||||||
+
|
|
||||||
+ You should have received a copy of the GNU Lesser General Public
|
|
||||||
+ License along with the GNU C Library; if not, see
|
|
||||||
+ <https://www.gnu.org/licenses/>. */
|
|
||||||
+
|
|
||||||
+#include <tst-string-rtm.h>
|
|
||||||
+
|
|
||||||
+#define LOOP 3000
|
|
||||||
+#define STRING_SIZE 1024
|
|
||||||
+char string1[STRING_SIZE];
|
|
||||||
+
|
|
||||||
+__attribute__ ((noinline, noclone))
|
|
||||||
+static int
|
|
||||||
+prepare (void)
|
|
||||||
+{
|
|
||||||
+ memset (string1, 'a', STRING_SIZE);
|
|
||||||
+ string1[100] = 'c';
|
|
||||||
+ string1[STRING_SIZE - 100] = 'c';
|
|
||||||
+ char *p = memchr (string1, 'c', STRING_SIZE);
|
|
||||||
+ if (p == &string1[100])
|
|
||||||
+ return EXIT_SUCCESS;
|
|
||||||
+ else
|
|
||||||
+ return EXIT_FAILURE;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+__attribute__ ((noinline, noclone))
|
|
||||||
+static int
|
|
||||||
+function (void)
|
|
||||||
+{
|
|
||||||
+ char *p = memchr (string1, 'c', STRING_SIZE);
|
|
||||||
+ if (p == &string1[100])
|
|
||||||
+ return 0;
|
|
||||||
+ else
|
|
||||||
+ return 1;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+static int
|
|
||||||
+do_test (void)
|
|
||||||
+{
|
|
||||||
+ return do_test_1 ("memchr", LOOP, prepare, function);
|
|
||||||
+}
|
|
||||||
diff --git a/sysdeps/x86/tst-memcmp-rtm.c b/sysdeps/x86/tst-memcmp-rtm.c
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..e4c8a623
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/x86/tst-memcmp-rtm.c
|
|
||||||
@@ -0,0 +1,52 @@
|
|
||||||
+/* Test case for memcmp inside a transactionally executing RTM region.
|
|
||||||
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
|
||||||
+ This file is part of the GNU C Library.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is free software; you can redistribute it and/or
|
|
||||||
+ modify it under the terms of the GNU Lesser General Public
|
|
||||||
+ License as published by the Free Software Foundation; either
|
|
||||||
+ version 2.1 of the License, or (at your option) any later version.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
||||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
+ Lesser General Public License for more details.
|
|
||||||
+
|
|
||||||
+ You should have received a copy of the GNU Lesser General Public
|
|
||||||
+ License along with the GNU C Library; if not, see
|
|
||||||
+ <https://www.gnu.org/licenses/>. */
|
|
||||||
+
|
|
||||||
+#include <tst-string-rtm.h>
|
|
||||||
+
|
|
||||||
+#define LOOP 3000
|
|
||||||
+#define STRING_SIZE 1024
|
|
||||||
+char string1[STRING_SIZE];
|
|
||||||
+char string2[STRING_SIZE];
|
|
||||||
+
|
|
||||||
+__attribute__ ((noinline, noclone))
|
|
||||||
+static int
|
|
||||||
+prepare (void)
|
|
||||||
+{
|
|
||||||
+ memset (string1, 'a', STRING_SIZE);
|
|
||||||
+ memset (string2, 'a', STRING_SIZE);
|
|
||||||
+ if (memcmp (string1, string2, STRING_SIZE) == 0)
|
|
||||||
+ return EXIT_SUCCESS;
|
|
||||||
+ else
|
|
||||||
+ return EXIT_FAILURE;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+__attribute__ ((noinline, noclone))
|
|
||||||
+static int
|
|
||||||
+function (void)
|
|
||||||
+{
|
|
||||||
+ if (memcmp (string1, string2, STRING_SIZE) == 0)
|
|
||||||
+ return 0;
|
|
||||||
+ else
|
|
||||||
+ return 1;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+static int
|
|
||||||
+do_test (void)
|
|
||||||
+{
|
|
||||||
+ return do_test_1 ("memcmp", LOOP, prepare, function);
|
|
||||||
+}
|
|
||||||
diff --git a/sysdeps/x86/tst-memmove-rtm.c b/sysdeps/x86/tst-memmove-rtm.c
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..4bf97ef1
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/x86/tst-memmove-rtm.c
|
|
||||||
@@ -0,0 +1,53 @@
|
|
||||||
+/* Test case for memmove inside a transactionally executing RTM region.
|
|
||||||
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
|
||||||
+ This file is part of the GNU C Library.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is free software; you can redistribute it and/or
|
|
||||||
+ modify it under the terms of the GNU Lesser General Public
|
|
||||||
+ License as published by the Free Software Foundation; either
|
|
||||||
+ version 2.1 of the License, or (at your option) any later version.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
||||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
+ Lesser General Public License for more details.
|
|
||||||
+
|
|
||||||
+ You should have received a copy of the GNU Lesser General Public
|
|
||||||
+ License along with the GNU C Library; if not, see
|
|
||||||
+ <https://www.gnu.org/licenses/>. */
|
|
||||||
+
|
|
||||||
+#include <tst-string-rtm.h>
|
|
||||||
+
|
|
||||||
+#define LOOP 3000
|
|
||||||
+#define STRING_SIZE 1024
|
|
||||||
+char string1[STRING_SIZE];
|
|
||||||
+char string2[STRING_SIZE];
|
|
||||||
+
|
|
||||||
+__attribute__ ((noinline, noclone))
|
|
||||||
+static int
|
|
||||||
+prepare (void)
|
|
||||||
+{
|
|
||||||
+ memset (string1, 'a', STRING_SIZE);
|
|
||||||
+ if (memmove (string2, string1, STRING_SIZE) == string2
|
|
||||||
+ && memcmp (string2, string1, STRING_SIZE) == 0)
|
|
||||||
+ return EXIT_SUCCESS;
|
|
||||||
+ else
|
|
||||||
+ return EXIT_FAILURE;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+__attribute__ ((noinline, noclone))
|
|
||||||
+static int
|
|
||||||
+function (void)
|
|
||||||
+{
|
|
||||||
+ if (memmove (string2, string1, STRING_SIZE) == string2
|
|
||||||
+ && memcmp (string2, string1, STRING_SIZE) == 0)
|
|
||||||
+ return 0;
|
|
||||||
+ else
|
|
||||||
+ return 1;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+static int
|
|
||||||
+do_test (void)
|
|
||||||
+{
|
|
||||||
+ return do_test_1 ("memmove", LOOP, prepare, function);
|
|
||||||
+}
|
|
||||||
diff --git a/sysdeps/x86/tst-memrchr-rtm.c b/sysdeps/x86/tst-memrchr-rtm.c
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..a57a5a8e
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/x86/tst-memrchr-rtm.c
|
|
||||||
@@ -0,0 +1,54 @@
|
|
||||||
+/* Test case for memrchr inside a transactionally executing RTM region.
|
|
||||||
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
|
||||||
+ This file is part of the GNU C Library.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is free software; you can redistribute it and/or
|
|
||||||
+ modify it under the terms of the GNU Lesser General Public
|
|
||||||
+ License as published by the Free Software Foundation; either
|
|
||||||
+ version 2.1 of the License, or (at your option) any later version.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
||||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
+ Lesser General Public License for more details.
|
|
||||||
+
|
|
||||||
+ You should have received a copy of the GNU Lesser General Public
|
|
||||||
+ License along with the GNU C Library; if not, see
|
|
||||||
+ <https://www.gnu.org/licenses/>. */
|
|
||||||
+
|
|
||||||
+#include <tst-string-rtm.h>
|
|
||||||
+
|
|
||||||
+#define LOOP 3000
|
|
||||||
+#define STRING_SIZE 1024
|
|
||||||
+char string1[STRING_SIZE];
|
|
||||||
+
|
|
||||||
+__attribute__ ((noinline, noclone))
|
|
||||||
+static int
|
|
||||||
+prepare (void)
|
|
||||||
+{
|
|
||||||
+ memset (string1, 'a', STRING_SIZE);
|
|
||||||
+ string1[100] = 'c';
|
|
||||||
+ string1[STRING_SIZE - 100] = 'c';
|
|
||||||
+ char *p = memrchr (string1, 'c', STRING_SIZE);
|
|
||||||
+ if (p == &string1[STRING_SIZE - 100])
|
|
||||||
+ return EXIT_SUCCESS;
|
|
||||||
+ else
|
|
||||||
+ return EXIT_FAILURE;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+__attribute__ ((noinline, noclone))
|
|
||||||
+static int
|
|
||||||
+function (void)
|
|
||||||
+{
|
|
||||||
+ char *p = memrchr (string1, 'c', STRING_SIZE);
|
|
||||||
+ if (p == &string1[STRING_SIZE - 100])
|
|
||||||
+ return 0;
|
|
||||||
+ else
|
|
||||||
+ return 1;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+static int
|
|
||||||
+do_test (void)
|
|
||||||
+{
|
|
||||||
+ return do_test_1 ("memrchr", LOOP, prepare, function);
|
|
||||||
+}
|
|
||||||
diff --git a/sysdeps/x86/tst-memset-rtm.c b/sysdeps/x86/tst-memset-rtm.c
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..bf343a4d
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/x86/tst-memset-rtm.c
|
|
||||||
@@ -0,0 +1,45 @@
|
|
||||||
+/* Test case for memset inside a transactionally executing RTM region.
|
|
||||||
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
|
||||||
+ This file is part of the GNU C Library.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is free software; you can redistribute it and/or
|
|
||||||
+ modify it under the terms of the GNU Lesser General Public
|
|
||||||
+ License as published by the Free Software Foundation; either
|
|
||||||
+ version 2.1 of the License, or (at your option) any later version.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
||||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
+ Lesser General Public License for more details.
|
|
||||||
+
|
|
||||||
+ You should have received a copy of the GNU Lesser General Public
|
|
||||||
+ License along with the GNU C Library; if not, see
|
|
||||||
+ <https://www.gnu.org/licenses/>. */
|
|
||||||
+
|
|
||||||
+#include <tst-string-rtm.h>
|
|
||||||
+
|
|
||||||
+#define LOOP 3000
|
|
||||||
+#define STRING_SIZE 1024
|
|
||||||
+char string1[STRING_SIZE];
|
|
||||||
+
|
|
||||||
+__attribute__ ((noinline, noclone))
|
|
||||||
+static int
|
|
||||||
+prepare (void)
|
|
||||||
+{
|
|
||||||
+ memset (string1, 'a', STRING_SIZE);
|
|
||||||
+ return EXIT_SUCCESS;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+__attribute__ ((noinline, noclone))
|
|
||||||
+static int
|
|
||||||
+function (void)
|
|
||||||
+{
|
|
||||||
+ memset (string1, 'a', STRING_SIZE);
|
|
||||||
+ return 0;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+static int
|
|
||||||
+do_test (void)
|
|
||||||
+{
|
|
||||||
+ return do_test_1 ("memset", LOOP, prepare, function);
|
|
||||||
+}
|
|
||||||
diff --git a/sysdeps/x86/tst-strchr-rtm.c b/sysdeps/x86/tst-strchr-rtm.c
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..a82e29c0
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/x86/tst-strchr-rtm.c
|
|
||||||
@@ -0,0 +1,54 @@
|
|
||||||
+/* Test case for strchr inside a transactionally executing RTM region.
|
|
||||||
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
|
||||||
+ This file is part of the GNU C Library.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is free software; you can redistribute it and/or
|
|
||||||
+ modify it under the terms of the GNU Lesser General Public
|
|
||||||
+ License as published by the Free Software Foundation; either
|
|
||||||
+ version 2.1 of the License, or (at your option) any later version.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
||||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
+ Lesser General Public License for more details.
|
|
||||||
+
|
|
||||||
+ You should have received a copy of the GNU Lesser General Public
|
|
||||||
+ License along with the GNU C Library; if not, see
|
|
||||||
+ <https://www.gnu.org/licenses/>. */
|
|
||||||
+
|
|
||||||
+#include <tst-string-rtm.h>
|
|
||||||
+
|
|
||||||
+#define LOOP 3000
|
|
||||||
+#define STRING_SIZE 1024
|
|
||||||
+char string1[STRING_SIZE];
|
|
||||||
+
|
|
||||||
+__attribute__ ((noinline, noclone))
|
|
||||||
+static int
|
|
||||||
+prepare (void)
|
|
||||||
+{
|
|
||||||
+ memset (string1, 'a', STRING_SIZE - 1);
|
|
||||||
+ string1[100] = 'c';
|
|
||||||
+ string1[STRING_SIZE - 100] = 'c';
|
|
||||||
+ char *p = strchr (string1, 'c');
|
|
||||||
+ if (p == &string1[100])
|
|
||||||
+ return EXIT_SUCCESS;
|
|
||||||
+ else
|
|
||||||
+ return EXIT_FAILURE;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+__attribute__ ((noinline, noclone))
|
|
||||||
+static int
|
|
||||||
+function (void)
|
|
||||||
+{
|
|
||||||
+ char *p = strchr (string1, 'c');
|
|
||||||
+ if (p == &string1[100])
|
|
||||||
+ return 0;
|
|
||||||
+ else
|
|
||||||
+ return 1;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+static int
|
|
||||||
+do_test (void)
|
|
||||||
+{
|
|
||||||
+ return do_test_1 ("strchr", LOOP, prepare, function);
|
|
||||||
+}
|
|
||||||
diff --git a/sysdeps/x86/tst-strcpy-rtm.c b/sysdeps/x86/tst-strcpy-rtm.c
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..2b2a583f
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/x86/tst-strcpy-rtm.c
|
|
||||||
@@ -0,0 +1,53 @@
|
|
||||||
+/* Test case for strcpy inside a transactionally executing RTM region.
|
|
||||||
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
|
||||||
+ This file is part of the GNU C Library.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is free software; you can redistribute it and/or
|
|
||||||
+ modify it under the terms of the GNU Lesser General Public
|
|
||||||
+ License as published by the Free Software Foundation; either
|
|
||||||
+ version 2.1 of the License, or (at your option) any later version.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
||||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
+ Lesser General Public License for more details.
|
|
||||||
+
|
|
||||||
+ You should have received a copy of the GNU Lesser General Public
|
|
||||||
+ License along with the GNU C Library; if not, see
|
|
||||||
+ <https://www.gnu.org/licenses/>. */
|
|
||||||
+
|
|
||||||
+#include <tst-string-rtm.h>
|
|
||||||
+
|
|
||||||
+#define LOOP 3000
|
|
||||||
+#define STRING_SIZE 1024
|
|
||||||
+char string1[STRING_SIZE];
|
|
||||||
+char string2[STRING_SIZE];
|
|
||||||
+
|
|
||||||
+__attribute__ ((noinline, noclone))
|
|
||||||
+static int
|
|
||||||
+prepare (void)
|
|
||||||
+{
|
|
||||||
+ memset (string1, 'a', STRING_SIZE - 1);
|
|
||||||
+ if (strcpy (string2, string1) == string2
|
|
||||||
+ && strcmp (string2, string1) == 0)
|
|
||||||
+ return EXIT_SUCCESS;
|
|
||||||
+ else
|
|
||||||
+ return EXIT_FAILURE;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+__attribute__ ((noinline, noclone))
|
|
||||||
+static int
|
|
||||||
+function (void)
|
|
||||||
+{
|
|
||||||
+ if (strcpy (string2, string1) == string2
|
|
||||||
+ && strcmp (string2, string1) == 0)
|
|
||||||
+ return 0;
|
|
||||||
+ else
|
|
||||||
+ return 1;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+static int
|
|
||||||
+do_test (void)
|
|
||||||
+{
|
|
||||||
+ return do_test_1 ("strcpy", LOOP, prepare, function);
|
|
||||||
+}
|
|
||||||
diff --git a/sysdeps/x86/tst-string-rtm.h b/sysdeps/x86/tst-string-rtm.h
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..d2470afa
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/x86/tst-string-rtm.h
|
|
||||||
@@ -0,0 +1,72 @@
|
|
||||||
+/* Test string function in a transactionally executing RTM region.
|
|
||||||
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
|
||||||
+ This file is part of the GNU C Library.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is free software; you can redistribute it and/or
|
|
||||||
+ modify it under the terms of the GNU Lesser General Public
|
|
||||||
+ License as published by the Free Software Foundation; either
|
|
||||||
+ version 2.1 of the License, or (at your option) any later version.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
||||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
+ Lesser General Public License for more details.
|
|
||||||
+
|
|
||||||
+ You should have received a copy of the GNU Lesser General Public
|
|
||||||
+ License along with the GNU C Library; if not, see
|
|
||||||
+ <https://www.gnu.org/licenses/>. */
|
|
||||||
+
|
|
||||||
+#include <string.h>
|
|
||||||
+#include <x86intrin.h>
|
|
||||||
+#include <sys/platform/x86.h>
|
|
||||||
+#include <support/check.h>
|
|
||||||
+#include <support/test-driver.h>
|
|
||||||
+
|
|
||||||
+static int
|
|
||||||
+do_test_1 (const char *name, unsigned int loop, int (*prepare) (void),
|
|
||||||
+ int (*function) (void))
|
|
||||||
+{
|
|
||||||
+ if (!CPU_FEATURE_USABLE (RTM))
|
|
||||||
+ return EXIT_UNSUPPORTED;
|
|
||||||
+
|
|
||||||
+ int status = prepare ();
|
|
||||||
+ if (status != EXIT_SUCCESS)
|
|
||||||
+ return status;
|
|
||||||
+
|
|
||||||
+ unsigned int i;
|
|
||||||
+ unsigned int naborts = 0;
|
|
||||||
+ unsigned int failed = 0;
|
|
||||||
+ for (i = 0; i < loop; i++)
|
|
||||||
+ {
|
|
||||||
+ failed |= function ();
|
|
||||||
+ if (_xbegin() == _XBEGIN_STARTED)
|
|
||||||
+ {
|
|
||||||
+ failed |= function ();
|
|
||||||
+ _xend();
|
|
||||||
+ }
|
|
||||||
+ else
|
|
||||||
+ {
|
|
||||||
+ failed |= function ();
|
|
||||||
+ ++naborts;
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ if (failed)
|
|
||||||
+ FAIL_EXIT1 ("%s() failed", name);
|
|
||||||
+
|
|
||||||
+ if (naborts)
|
|
||||||
+ {
|
|
||||||
+ /* NB: Low single digit (<= 5%) noise-level aborts are normal for
|
|
||||||
+ TSX. */
|
|
||||||
+ double rate = 100 * ((double) naborts) / ((double) loop);
|
|
||||||
+ if (rate > 5)
|
|
||||||
+ FAIL_EXIT1 ("TSX abort rate: %.2f%% (%d out of %d)",
|
|
||||||
+ rate, naborts, loop);
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ return EXIT_SUCCESS;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+static int do_test (void);
|
|
||||||
+
|
|
||||||
+#include <support/test-driver.c>
|
|
||||||
diff --git a/sysdeps/x86/tst-strlen-rtm.c b/sysdeps/x86/tst-strlen-rtm.c
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..0dcf14db
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/x86/tst-strlen-rtm.c
|
|
||||||
@@ -0,0 +1,53 @@
|
|
||||||
+/* Test case for strlen inside a transactionally executing RTM region.
|
|
||||||
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
|
||||||
+ This file is part of the GNU C Library.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is free software; you can redistribute it and/or
|
|
||||||
+ modify it under the terms of the GNU Lesser General Public
|
|
||||||
+ License as published by the Free Software Foundation; either
|
|
||||||
+ version 2.1 of the License, or (at your option) any later version.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
||||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
+ Lesser General Public License for more details.
|
|
||||||
+
|
|
||||||
+ You should have received a copy of the GNU Lesser General Public
|
|
||||||
+ License along with the GNU C Library; if not, see
|
|
||||||
+ <https://www.gnu.org/licenses/>. */
|
|
||||||
+
|
|
||||||
+#include <tst-string-rtm.h>
|
|
||||||
+
|
|
||||||
+#define LOOP 3000
|
|
||||||
+#define STRING_SIZE 1024
|
|
||||||
+char string1[STRING_SIZE];
|
|
||||||
+
|
|
||||||
+__attribute__ ((noinline, noclone))
|
|
||||||
+static int
|
|
||||||
+prepare (void)
|
|
||||||
+{
|
|
||||||
+ memset (string1, 'a', STRING_SIZE - 1);
|
|
||||||
+ string1[STRING_SIZE - 100] = '\0';
|
|
||||||
+ size_t len = strlen (string1);
|
|
||||||
+ if (len == STRING_SIZE - 100)
|
|
||||||
+ return EXIT_SUCCESS;
|
|
||||||
+ else
|
|
||||||
+ return EXIT_FAILURE;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+__attribute__ ((noinline, noclone))
|
|
||||||
+static int
|
|
||||||
+function (void)
|
|
||||||
+{
|
|
||||||
+ size_t len = strlen (string1);
|
|
||||||
+ if (len == STRING_SIZE - 100)
|
|
||||||
+ return 0;
|
|
||||||
+ else
|
|
||||||
+ return 1;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+static int
|
|
||||||
+do_test (void)
|
|
||||||
+{
|
|
||||||
+ return do_test_1 ("strlen", LOOP, prepare, function);
|
|
||||||
+}
|
|
||||||
diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..236ad951
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/x86/tst-strncmp-rtm.c
|
|
||||||
@@ -0,0 +1,52 @@
|
|
||||||
+/* Test case for strncmp inside a transactionally executing RTM region.
|
|
||||||
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
|
||||||
+ This file is part of the GNU C Library.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is free software; you can redistribute it and/or
|
|
||||||
+ modify it under the terms of the GNU Lesser General Public
|
|
||||||
+ License as published by the Free Software Foundation; either
|
|
||||||
+ version 2.1 of the License, or (at your option) any later version.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
||||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
+ Lesser General Public License for more details.
|
|
||||||
+
|
|
||||||
+ You should have received a copy of the GNU Lesser General Public
|
|
||||||
+ License along with the GNU C Library; if not, see
|
|
||||||
+ <https://www.gnu.org/licenses/>. */
|
|
||||||
+
|
|
||||||
+#include <tst-string-rtm.h>
|
|
||||||
+
|
|
||||||
+#define LOOP 3000
|
|
||||||
+#define STRING_SIZE 1024
|
|
||||||
+char string1[STRING_SIZE];
|
|
||||||
+char string2[STRING_SIZE];
|
|
||||||
+
|
|
||||||
+__attribute__ ((noinline, noclone))
|
|
||||||
+static int
|
|
||||||
+prepare (void)
|
|
||||||
+{
|
|
||||||
+ memset (string1, 'a', STRING_SIZE - 1);
|
|
||||||
+ memset (string2, 'a', STRING_SIZE - 1);
|
|
||||||
+ if (strncmp (string1, string2, STRING_SIZE) == 0)
|
|
||||||
+ return EXIT_SUCCESS;
|
|
||||||
+ else
|
|
||||||
+ return EXIT_FAILURE;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+__attribute__ ((noinline, noclone))
|
|
||||||
+static int
|
|
||||||
+function (void)
|
|
||||||
+{
|
|
||||||
+ if (strncmp (string1, string2, STRING_SIZE) == 0)
|
|
||||||
+ return 0;
|
|
||||||
+ else
|
|
||||||
+ return 1;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+static int
|
|
||||||
+do_test (void)
|
|
||||||
+{
|
|
||||||
+ return do_test_1 ("strncmp", LOOP, prepare, function);
|
|
||||||
+}
|
|
||||||
diff --git a/sysdeps/x86/tst-strrchr-rtm.c b/sysdeps/x86/tst-strrchr-rtm.c
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..e32bfaf5
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/x86/tst-strrchr-rtm.c
|
|
||||||
@@ -0,0 +1,53 @@
|
|
||||||
+/* Test case for strrchr inside a transactionally executing RTM region.
|
|
||||||
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
|
||||||
+ This file is part of the GNU C Library.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is free software; you can redistribute it and/or
|
|
||||||
+ modify it under the terms of the GNU Lesser General Public
|
|
||||||
+ License as published by the Free Software Foundation; either
|
|
||||||
+ version 2.1 of the License, or (at your option) any later version.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
||||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
+ Lesser General Public License for more details.
|
|
||||||
+
|
|
||||||
+ You should have received a copy of the GNU Lesser General Public
|
|
||||||
+ License along with the GNU C Library; if not, see
|
|
||||||
+ <https://www.gnu.org/licenses/>. */
|
|
||||||
+
|
|
||||||
+#include <tst-string-rtm.h>
|
|
||||||
+
|
|
||||||
+#define LOOP 3000
|
|
||||||
+#define STRING_SIZE 1024
|
|
||||||
+char string1[STRING_SIZE];
|
|
||||||
+
|
|
||||||
+__attribute__ ((noinline, noclone))
|
|
||||||
+static int
|
|
||||||
+prepare (void)
|
|
||||||
+{
|
|
||||||
+ memset (string1, 'a', STRING_SIZE - 1);
|
|
||||||
+ string1[STRING_SIZE - 100] = 'c';
|
|
||||||
+ char *p = strrchr (string1, 'c');
|
|
||||||
+ if (p == &string1[STRING_SIZE - 100])
|
|
||||||
+ return EXIT_SUCCESS;
|
|
||||||
+ else
|
|
||||||
+ return EXIT_FAILURE;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+__attribute__ ((noinline, noclone))
|
|
||||||
+static int
|
|
||||||
+function (void)
|
|
||||||
+{
|
|
||||||
+ char *p = strrchr (string1, 'c');
|
|
||||||
+ if (p == &string1[STRING_SIZE - 100])
|
|
||||||
+ return 0;
|
|
||||||
+ else
|
|
||||||
+ return 1;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+static int
|
|
||||||
+do_test (void)
|
|
||||||
+{
|
|
||||||
+ return do_test_1 ("strrchr", LOOP, prepare, function);
|
|
||||||
+}
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,148 +0,0 @@
|
|||||||
From 4e2d8f352774b56078c34648b14a2412c38384f4 Mon Sep 17 00:00:00 2001
|
|
||||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
|
||||||
Date: Sun, 7 Mar 2021 09:44:18 -0800
|
|
||||||
Subject: [PATCH] x86-64: Use ZMM16-ZMM31 in AVX512 memset family functions
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
Update ifunc-memset.h/ifunc-wmemset.h to select the function optimized
|
|
||||||
with AVX512 instructions using ZMM16-ZMM31 registers to avoid RTM abort
|
|
||||||
with usable AVX512VL and AVX512BW since VZEROUPPER isn't needed at
|
|
||||||
function exit.
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 14 +++++++++-----
|
|
||||||
sysdeps/x86_64/multiarch/ifunc-memset.h | 13 ++++++++-----
|
|
||||||
sysdeps/x86_64/multiarch/ifunc-wmemset.h | 12 ++++++------
|
|
||||||
.../multiarch/memset-avx512-unaligned-erms.S | 16 ++++++++--------
|
|
||||||
4 files changed, 31 insertions(+), 24 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
||||||
index c1efeec0..d969a156 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
||||||
@@ -211,10 +211,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
||||||
&& CPU_FEATURE_USABLE (AVX512BW)),
|
|
||||||
__memset_chk_evex_unaligned_erms)
|
|
||||||
IFUNC_IMPL_ADD (array, i, __memset_chk,
|
|
||||||
- CPU_FEATURE_USABLE (AVX512F),
|
|
||||||
+ (CPU_FEATURE_USABLE (AVX512VL)
|
|
||||||
+ && CPU_FEATURE_USABLE (AVX512BW)),
|
|
||||||
__memset_chk_avx512_unaligned_erms)
|
|
||||||
IFUNC_IMPL_ADD (array, i, __memset_chk,
|
|
||||||
- CPU_FEATURE_USABLE (AVX512F),
|
|
||||||
+ (CPU_FEATURE_USABLE (AVX512VL)
|
|
||||||
+ && CPU_FEATURE_USABLE (AVX512BW)),
|
|
||||||
__memset_chk_avx512_unaligned)
|
|
||||||
IFUNC_IMPL_ADD (array, i, __memset_chk,
|
|
||||||
CPU_FEATURE_USABLE (AVX512F),
|
|
||||||
@@ -252,10 +254,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
||||||
&& CPU_FEATURE_USABLE (AVX512BW)),
|
|
||||||
__memset_evex_unaligned_erms)
|
|
||||||
IFUNC_IMPL_ADD (array, i, memset,
|
|
||||||
- CPU_FEATURE_USABLE (AVX512F),
|
|
||||||
+ (CPU_FEATURE_USABLE (AVX512VL)
|
|
||||||
+ && CPU_FEATURE_USABLE (AVX512BW)),
|
|
||||||
__memset_avx512_unaligned_erms)
|
|
||||||
IFUNC_IMPL_ADD (array, i, memset,
|
|
||||||
- CPU_FEATURE_USABLE (AVX512F),
|
|
||||||
+ (CPU_FEATURE_USABLE (AVX512VL)
|
|
||||||
+ && CPU_FEATURE_USABLE (AVX512BW)),
|
|
||||||
__memset_avx512_unaligned)
|
|
||||||
IFUNC_IMPL_ADD (array, i, memset,
|
|
||||||
CPU_FEATURE_USABLE (AVX512F),
|
|
||||||
@@ -719,7 +723,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
||||||
CPU_FEATURE_USABLE (AVX512VL),
|
|
||||||
__wmemset_evex_unaligned)
|
|
||||||
IFUNC_IMPL_ADD (array, i, wmemset,
|
|
||||||
- CPU_FEATURE_USABLE (AVX512F),
|
|
||||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
|
||||||
__wmemset_avx512_unaligned))
|
|
||||||
|
|
||||||
#ifdef SHARED
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
|
|
||||||
index 6f3375cc..19795938 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/ifunc-memset.h
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
|
|
||||||
@@ -53,13 +53,16 @@ IFUNC_SELECTOR (void)
|
|
||||||
if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
|
|
||||||
&& !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
|
|
||||||
{
|
|
||||||
- if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
|
|
||||||
- return OPTIMIZE (avx512_no_vzeroupper);
|
|
||||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
|
|
||||||
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
|
|
||||||
+ {
|
|
||||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
|
|
||||||
+ return OPTIMIZE (avx512_unaligned_erms);
|
|
||||||
|
|
||||||
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
|
|
||||||
- return OPTIMIZE (avx512_unaligned_erms);
|
|
||||||
+ return OPTIMIZE (avx512_unaligned);
|
|
||||||
+ }
|
|
||||||
|
|
||||||
- return OPTIMIZE (avx512_unaligned);
|
|
||||||
+ return OPTIMIZE (avx512_no_vzeroupper);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
|
|
||||||
index bdc94c6c..98c5d406 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
|
|
||||||
@@ -33,13 +33,13 @@ IFUNC_SELECTOR (void)
|
|
||||||
if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
|
|
||||||
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
|
|
||||||
{
|
|
||||||
- if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
|
|
||||||
- && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
|
|
||||||
- && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
|
|
||||||
- return OPTIMIZE (avx512_unaligned);
|
|
||||||
-
|
|
||||||
if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
|
|
||||||
- return OPTIMIZE (evex_unaligned);
|
|
||||||
+ {
|
|
||||||
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
|
|
||||||
+ return OPTIMIZE (avx512_unaligned);
|
|
||||||
+
|
|
||||||
+ return OPTIMIZE (evex_unaligned);
|
|
||||||
+ }
|
|
||||||
|
|
||||||
if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
|
|
||||||
return OPTIMIZE (avx2_unaligned_rtm);
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
|
|
||||||
index 0783979c..22e7b187 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
|
|
||||||
@@ -1,22 +1,22 @@
|
|
||||||
#if IS_IN (libc)
|
|
||||||
# define VEC_SIZE 64
|
|
||||||
-# define VEC(i) zmm##i
|
|
||||||
+# define XMM0 xmm16
|
|
||||||
+# define YMM0 ymm16
|
|
||||||
+# define VEC0 zmm16
|
|
||||||
+# define VEC(i) VEC##i
|
|
||||||
# define VMOVU vmovdqu64
|
|
||||||
# define VMOVA vmovdqa64
|
|
||||||
+# define VZEROUPPER
|
|
||||||
|
|
||||||
# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
|
||||||
- vmovd d, %xmm0; \
|
|
||||||
movq r, %rax; \
|
|
||||||
- vpbroadcastb %xmm0, %xmm0; \
|
|
||||||
- vpbroadcastq %xmm0, %zmm0
|
|
||||||
+ vpbroadcastb d, %VEC0
|
|
||||||
|
|
||||||
# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
|
||||||
- vmovd d, %xmm0; \
|
|
||||||
movq r, %rax; \
|
|
||||||
- vpbroadcastd %xmm0, %xmm0; \
|
|
||||||
- vpbroadcastq %xmm0, %zmm0
|
|
||||||
+ vpbroadcastd d, %VEC0
|
|
||||||
|
|
||||||
-# define SECTION(p) p##.avx512
|
|
||||||
+# define SECTION(p) p##.evex512
|
|
||||||
# define MEMSET_SYMBOL(p,s) p##_avx512_##s
|
|
||||||
# define WMEMSET_SYMBOL(p,s) p##_avx512_##s
|
|
||||||
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,230 +0,0 @@
|
|||||||
From b304fc201d2f6baf52ea790df8643e99772243cd Mon Sep 17 00:00:00 2001
|
|
||||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
|
||||||
Date: Mon, 21 Jan 2019 11:25:56 -0800
|
|
||||||
Subject: [PATCH] x86-64 memcmp/wmemcmp: Properly handle the length parameter
|
|
||||||
[BZ# 24097]
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
On x32, the size_t parameter may be passed in the lower 32 bits of a
|
|
||||||
64-bit register with the non-zero upper 32 bits. The string/memory
|
|
||||||
functions written in assembly can only use the lower 32 bits of a
|
|
||||||
64-bit register as length or must clear the upper 32 bits before using
|
|
||||||
the full 64-bit register for length.
|
|
||||||
|
|
||||||
This patch fixes memcmp/wmemcmp for x32. Tested on x86-64 and x32. On
|
|
||||||
x86-64, libc.so is the same with and without the fix.
|
|
||||||
|
|
||||||
[BZ# 24097]
|
|
||||||
CVE-2019-6488
|
|
||||||
* sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S: Use RDX_LP for
|
|
||||||
length. Clear the upper 32 bits of RDX register.
|
|
||||||
* sysdeps/x86_64/multiarch/memcmp-sse4.S: Likewise.
|
|
||||||
* sysdeps/x86_64/multiarch/memcmp-ssse3.S: Likewise.
|
|
||||||
* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcmp and
|
|
||||||
tst-size_t-wmemcmp.
|
|
||||||
* sysdeps/x86_64/x32/tst-size_t-memcmp.c: New file.
|
|
||||||
* sysdeps/x86_64/x32/tst-size_t-wmemcmp.c: Likewise.
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 7 +-
|
|
||||||
sysdeps/x86_64/multiarch/memcmp-sse4.S | 9 ++-
|
|
||||||
sysdeps/x86_64/multiarch/memcmp-ssse3.S | 7 +-
|
|
||||||
sysdeps/x86_64/x32/Makefile | 4 +-
|
|
||||||
sysdeps/x86_64/x32/tst-size_t-memcmp.c | 76 ++++++++++++++++++++
|
|
||||||
sysdeps/x86_64/x32/tst-size_t-wmemcmp.c | 20 ++++++
|
|
||||||
6 files changed, 114 insertions(+), 9 deletions(-)
|
|
||||||
create mode 100644 sysdeps/x86_64/x32/tst-size_t-memcmp.c
|
|
||||||
create mode 100644 sysdeps/x86_64/x32/tst-size_t-wmemcmp.c
|
|
||||||
|
|
||||||
Conflicts:
|
|
||||||
ChangeLog
|
|
||||||
(removed)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
|
|
||||||
index 30f764c3..e3a35b89 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
|
|
||||||
@@ -58,9 +58,12 @@
|
|
||||||
.section .text.avx,"ax",@progbits
|
|
||||||
ENTRY (MEMCMP)
|
|
||||||
# ifdef USE_AS_WMEMCMP
|
|
||||||
- shl $2, %rdx
|
|
||||||
+ shl $2, %RDX_LP
|
|
||||||
+# elif defined __ILP32__
|
|
||||||
+ /* Clear the upper 32 bits. */
|
|
||||||
+ movl %edx, %edx
|
|
||||||
# endif
|
|
||||||
- cmpq $VEC_SIZE, %rdx
|
|
||||||
+ cmp $VEC_SIZE, %RDX_LP
|
|
||||||
jb L(less_vec)
|
|
||||||
|
|
||||||
/* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
|
|
||||||
index 8e164f2c..302900f5 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S
|
|
||||||
@@ -42,13 +42,16 @@
|
|
||||||
.section .text.sse4.1,"ax",@progbits
|
|
||||||
ENTRY (MEMCMP)
|
|
||||||
# ifdef USE_AS_WMEMCMP
|
|
||||||
- shl $2, %rdx
|
|
||||||
+ shl $2, %RDX_LP
|
|
||||||
+# elif defined __ILP32__
|
|
||||||
+ /* Clear the upper 32 bits. */
|
|
||||||
+ mov %edx, %edx
|
|
||||||
# endif
|
|
||||||
pxor %xmm0, %xmm0
|
|
||||||
- cmp $79, %rdx
|
|
||||||
+ cmp $79, %RDX_LP
|
|
||||||
ja L(79bytesormore)
|
|
||||||
# ifndef USE_AS_WMEMCMP
|
|
||||||
- cmp $1, %rdx
|
|
||||||
+ cmp $1, %RDX_LP
|
|
||||||
je L(firstbyte)
|
|
||||||
# endif
|
|
||||||
add %rdx, %rsi
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
|
|
||||||
index 6f76c641..69d030fc 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/memcmp-ssse3.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
|
|
||||||
@@ -33,9 +33,12 @@
|
|
||||||
atom_text_section
|
|
||||||
ENTRY (MEMCMP)
|
|
||||||
# ifdef USE_AS_WMEMCMP
|
|
||||||
- shl $2, %rdx
|
|
||||||
- test %rdx, %rdx
|
|
||||||
+ shl $2, %RDX_LP
|
|
||||||
+ test %RDX_LP, %RDX_LP
|
|
||||||
jz L(equal)
|
|
||||||
+# elif defined __ILP32__
|
|
||||||
+ /* Clear the upper 32 bits. */
|
|
||||||
+ mov %edx, %edx
|
|
||||||
# endif
|
|
||||||
mov %rdx, %rcx
|
|
||||||
mov %rdi, %rdx
|
|
||||||
diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
|
|
||||||
index 7d528889..ddec7f04 100644
|
|
||||||
--- a/sysdeps/x86_64/x32/Makefile
|
|
||||||
+++ b/sysdeps/x86_64/x32/Makefile
|
|
||||||
@@ -6,9 +6,9 @@ CFLAGS-s_llround.c += -fno-builtin-lround
|
|
||||||
endif
|
|
||||||
|
|
||||||
ifeq ($(subdir),string)
|
|
||||||
-tests += tst-size_t-memchr
|
|
||||||
+tests += tst-size_t-memchr tst-size_t-memcmp
|
|
||||||
endif
|
|
||||||
|
|
||||||
ifeq ($(subdir),wcsmbs)
|
|
||||||
-tests += tst-size_t-wmemchr
|
|
||||||
+tests += tst-size_t-wmemchr tst-size_t-wmemcmp
|
|
||||||
endif
|
|
||||||
diff --git a/sysdeps/x86_64/x32/tst-size_t-memcmp.c b/sysdeps/x86_64/x32/tst-size_t-memcmp.c
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..9bd6fdb4
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/x86_64/x32/tst-size_t-memcmp.c
|
|
||||||
@@ -0,0 +1,76 @@
|
|
||||||
+/* Test memcmp with size_t in the lower 32 bits of 64-bit register.
|
|
||||||
+ Copyright (C) 2019 Free Software Foundation, Inc.
|
|
||||||
+ This file is part of the GNU C Library.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is free software; you can redistribute it and/or
|
|
||||||
+ modify it under the terms of the GNU Lesser General Public
|
|
||||||
+ License as published by the Free Software Foundation; either
|
|
||||||
+ version 2.1 of the License, or (at your option) any later version.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
||||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
+ Lesser General Public License for more details.
|
|
||||||
+
|
|
||||||
+ You should have received a copy of the GNU Lesser General Public
|
|
||||||
+ License along with the GNU C Library; if not, see
|
|
||||||
+ <http://www.gnu.org/licenses/>. */
|
|
||||||
+
|
|
||||||
+#define TEST_MAIN
|
|
||||||
+#ifdef WIDE
|
|
||||||
+# define TEST_NAME "wmemcmp"
|
|
||||||
+#else
|
|
||||||
+# define TEST_NAME "memcmp"
|
|
||||||
+#endif
|
|
||||||
+
|
|
||||||
+#include "test-size_t.h"
|
|
||||||
+
|
|
||||||
+#ifdef WIDE
|
|
||||||
+# include <inttypes.h>
|
|
||||||
+# include <wchar.h>
|
|
||||||
+
|
|
||||||
+# define MEMCMP wmemcmp
|
|
||||||
+# define CHAR wchar_t
|
|
||||||
+#else
|
|
||||||
+# define MEMCMP memcmp
|
|
||||||
+# define CHAR char
|
|
||||||
+#endif
|
|
||||||
+
|
|
||||||
+IMPL (MEMCMP, 1)
|
|
||||||
+
|
|
||||||
+typedef int (*proto_t) (const CHAR *, const CHAR *, size_t);
|
|
||||||
+
|
|
||||||
+static int
|
|
||||||
+__attribute__ ((noinline, noclone))
|
|
||||||
+do_memcmp (parameter_t a, parameter_t b)
|
|
||||||
+{
|
|
||||||
+ return CALL (&b, a.p, b.p, a.len);
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+static int
|
|
||||||
+test_main (void)
|
|
||||||
+{
|
|
||||||
+ test_init ();
|
|
||||||
+
|
|
||||||
+ parameter_t dest = { { page_size / sizeof (CHAR) }, buf1 };
|
|
||||||
+ parameter_t src = { { 0 }, buf2 };
|
|
||||||
+
|
|
||||||
+ memcpy (buf1, buf2, page_size);
|
|
||||||
+
|
|
||||||
+ int ret = 0;
|
|
||||||
+ FOR_EACH_IMPL (impl, 0)
|
|
||||||
+ {
|
|
||||||
+ src.fn = impl->fn;
|
|
||||||
+ int res = do_memcmp (dest, src);
|
|
||||||
+ if (res)
|
|
||||||
+ {
|
|
||||||
+ error (0, 0, "Wrong result in function %s: %i != 0",
|
|
||||||
+ impl->name, res);
|
|
||||||
+ ret = 1;
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ return ret ? EXIT_FAILURE : EXIT_SUCCESS;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+#include <support/test-driver.c>
|
|
||||||
diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c b/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..e8b5ffd0
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c
|
|
||||||
@@ -0,0 +1,20 @@
|
|
||||||
+/* Test wmemcmp with size_t in the lower 32 bits of 64-bit register.
|
|
||||||
+ Copyright (C) 2019 Free Software Foundation, Inc.
|
|
||||||
+ This file is part of the GNU C Library.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is free software; you can redistribute it and/or
|
|
||||||
+ modify it under the terms of the GNU Lesser General Public
|
|
||||||
+ License as published by the Free Software Foundation; either
|
|
||||||
+ version 2.1 of the License, or (at your option) any later version.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
||||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
+ Lesser General Public License for more details.
|
|
||||||
+
|
|
||||||
+ You should have received a copy of the GNU Lesser General Public
|
|
||||||
+ License along with the GNU C Library; if not, see
|
|
||||||
+ <http://www.gnu.org/licenses/>. */
|
|
||||||
+
|
|
||||||
+#define WIDE 1
|
|
||||||
+#include "tst-size_t-memcmp.c"
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,164 +0,0 @@
|
|||||||
From e4fda4631017e49d4ee5a2755db34289b6860fa4 Mon Sep 17 00:00:00 2001
|
|
||||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
|
||||||
Date: Sun, 7 Mar 2021 09:45:23 -0800
|
|
||||||
Subject: [PATCH] x86-64: Use ZMM16-ZMM31 in AVX512 memmove family functions
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
Update ifunc-memmove.h to select the function optimized with AVX512
|
|
||||||
instructions using ZMM16-ZMM31 registers to avoid RTM abort with usable
|
|
||||||
AVX512VL since VZEROUPPER isn't needed at function exit.
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 24 +++++++++---------
|
|
||||||
sysdeps/x86_64/multiarch/ifunc-memmove.h | 12 +++++----
|
|
||||||
.../multiarch/memmove-avx512-unaligned-erms.S | 25 +++++++++++++++++--
|
|
||||||
3 files changed, 42 insertions(+), 19 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
||||||
index d969a156..fec384f6 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
||||||
@@ -83,10 +83,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
||||||
CPU_FEATURE_USABLE (AVX512F),
|
|
||||||
__memmove_chk_avx512_no_vzeroupper)
|
|
||||||
IFUNC_IMPL_ADD (array, i, __memmove_chk,
|
|
||||||
- CPU_FEATURE_USABLE (AVX512F),
|
|
||||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
|
||||||
__memmove_chk_avx512_unaligned)
|
|
||||||
IFUNC_IMPL_ADD (array, i, __memmove_chk,
|
|
||||||
- CPU_FEATURE_USABLE (AVX512F),
|
|
||||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
|
||||||
__memmove_chk_avx512_unaligned_erms)
|
|
||||||
IFUNC_IMPL_ADD (array, i, __memmove_chk,
|
|
||||||
CPU_FEATURE_USABLE (AVX),
|
|
||||||
@@ -148,10 +148,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
||||||
CPU_FEATURE_USABLE (AVX512F),
|
|
||||||
__memmove_avx512_no_vzeroupper)
|
|
||||||
IFUNC_IMPL_ADD (array, i, memmove,
|
|
||||||
- CPU_FEATURE_USABLE (AVX512F),
|
|
||||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
|
||||||
__memmove_avx512_unaligned)
|
|
||||||
IFUNC_IMPL_ADD (array, i, memmove,
|
|
||||||
- CPU_FEATURE_USABLE (AVX512F),
|
|
||||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
|
||||||
__memmove_avx512_unaligned_erms)
|
|
||||||
IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3),
|
|
||||||
__memmove_ssse3_back)
|
|
||||||
@@ -733,10 +733,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
||||||
CPU_FEATURE_USABLE (AVX512F),
|
|
||||||
__memcpy_chk_avx512_no_vzeroupper)
|
|
||||||
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
|
|
||||||
- CPU_FEATURE_USABLE (AVX512F),
|
|
||||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
|
||||||
__memcpy_chk_avx512_unaligned)
|
|
||||||
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
|
|
||||||
- CPU_FEATURE_USABLE (AVX512F),
|
|
||||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
|
||||||
__memcpy_chk_avx512_unaligned_erms)
|
|
||||||
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
|
|
||||||
CPU_FEATURE_USABLE (AVX),
|
|
||||||
@@ -802,10 +802,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
||||||
CPU_FEATURE_USABLE (AVX512F),
|
|
||||||
__memcpy_avx512_no_vzeroupper)
|
|
||||||
IFUNC_IMPL_ADD (array, i, memcpy,
|
|
||||||
- CPU_FEATURE_USABLE (AVX512F),
|
|
||||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
|
||||||
__memcpy_avx512_unaligned)
|
|
||||||
IFUNC_IMPL_ADD (array, i, memcpy,
|
|
||||||
- CPU_FEATURE_USABLE (AVX512F),
|
|
||||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
|
||||||
__memcpy_avx512_unaligned_erms)
|
|
||||||
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
|
|
||||||
IFUNC_IMPL_ADD (array, i, memcpy, 1,
|
|
||||||
@@ -819,10 +819,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
||||||
CPU_FEATURE_USABLE (AVX512F),
|
|
||||||
__mempcpy_chk_avx512_no_vzeroupper)
|
|
||||||
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
|
|
||||||
- CPU_FEATURE_USABLE (AVX512F),
|
|
||||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
|
||||||
__mempcpy_chk_avx512_unaligned)
|
|
||||||
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
|
|
||||||
- CPU_FEATURE_USABLE (AVX512F),
|
|
||||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
|
||||||
__mempcpy_chk_avx512_unaligned_erms)
|
|
||||||
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
|
|
||||||
CPU_FEATURE_USABLE (AVX),
|
|
||||||
@@ -864,10 +864,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
||||||
CPU_FEATURE_USABLE (AVX512F),
|
|
||||||
__mempcpy_avx512_no_vzeroupper)
|
|
||||||
IFUNC_IMPL_ADD (array, i, mempcpy,
|
|
||||||
- CPU_FEATURE_USABLE (AVX512F),
|
|
||||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
|
||||||
__mempcpy_avx512_unaligned)
|
|
||||||
IFUNC_IMPL_ADD (array, i, mempcpy,
|
|
||||||
- CPU_FEATURE_USABLE (AVX512F),
|
|
||||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
|
||||||
__mempcpy_avx512_unaligned_erms)
|
|
||||||
IFUNC_IMPL_ADD (array, i, mempcpy,
|
|
||||||
CPU_FEATURE_USABLE (AVX),
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
|
|
||||||
index fa09b9fb..014e95c7 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
|
|
||||||
@@ -56,13 +56,15 @@ IFUNC_SELECTOR (void)
|
|
||||||
if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
|
|
||||||
&& !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
|
|
||||||
{
|
|
||||||
- if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
|
|
||||||
- return OPTIMIZE (avx512_no_vzeroupper);
|
|
||||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
|
|
||||||
+ {
|
|
||||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
|
|
||||||
+ return OPTIMIZE (avx512_unaligned_erms);
|
|
||||||
|
|
||||||
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
|
|
||||||
- return OPTIMIZE (avx512_unaligned_erms);
|
|
||||||
+ return OPTIMIZE (avx512_unaligned);
|
|
||||||
+ }
|
|
||||||
|
|
||||||
- return OPTIMIZE (avx512_unaligned);
|
|
||||||
+ return OPTIMIZE (avx512_no_vzeroupper);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
|
|
||||||
index aac1515c..848848ab 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
|
|
||||||
@@ -1,11 +1,32 @@
|
|
||||||
#if IS_IN (libc)
|
|
||||||
# define VEC_SIZE 64
|
|
||||||
-# define VEC(i) zmm##i
|
|
||||||
+# define XMM0 xmm16
|
|
||||||
+# define XMM1 xmm17
|
|
||||||
+# define YMM0 ymm16
|
|
||||||
+# define YMM1 ymm17
|
|
||||||
+# define VEC0 zmm16
|
|
||||||
+# define VEC1 zmm17
|
|
||||||
+# define VEC2 zmm18
|
|
||||||
+# define VEC3 zmm19
|
|
||||||
+# define VEC4 zmm20
|
|
||||||
+# define VEC5 zmm21
|
|
||||||
+# define VEC6 zmm22
|
|
||||||
+# define VEC7 zmm23
|
|
||||||
+# define VEC8 zmm24
|
|
||||||
+# define VEC9 zmm25
|
|
||||||
+# define VEC10 zmm26
|
|
||||||
+# define VEC11 zmm27
|
|
||||||
+# define VEC12 zmm28
|
|
||||||
+# define VEC13 zmm29
|
|
||||||
+# define VEC14 zmm30
|
|
||||||
+# define VEC15 zmm31
|
|
||||||
+# define VEC(i) VEC##i
|
|
||||||
# define VMOVNT vmovntdq
|
|
||||||
# define VMOVU vmovdqu64
|
|
||||||
# define VMOVA vmovdqa64
|
|
||||||
+# define VZEROUPPER
|
|
||||||
|
|
||||||
-# define SECTION(p) p##.avx512
|
|
||||||
+# define SECTION(p) p##.evex512
|
|
||||||
# define MEMMOVE_SYMBOL(p,s) p##_avx512_##s
|
|
||||||
|
|
||||||
# include "memmove-vec-unaligned-erms.S"
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,71 +0,0 @@
|
|||||||
From 595c22ecd8e87a27fd19270ed30fdbae9ad25426 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Sunil K Pandey <skpgkp2@gmail.com>
|
|
||||||
Date: Thu, 1 Apr 2021 15:47:04 -0700
|
|
||||||
Subject: [PATCH] x86-64: Fix ifdef indentation in strlen-evex.S
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
Fix some indentations of ifdef in file strlen-evex.S which are off by 1
|
|
||||||
and confusing to read.
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/multiarch/strlen-evex.S | 16 ++++++++--------
|
|
||||||
1 file changed, 8 insertions(+), 8 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S
|
|
||||||
index cd022509..05838190 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/strlen-evex.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/strlen-evex.S
|
|
||||||
@@ -276,10 +276,10 @@ L(last_2x_vec):
|
|
||||||
.p2align 4
|
|
||||||
L(first_vec_x0_check):
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
-# ifdef USE_AS_WCSLEN
|
|
||||||
+# ifdef USE_AS_WCSLEN
|
|
||||||
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
||||||
sall $2, %eax
|
|
||||||
-# endif
|
|
||||||
+# endif
|
|
||||||
/* Check the end of data. */
|
|
||||||
cmpq %rax, %rsi
|
|
||||||
jbe L(max)
|
|
||||||
@@ -293,10 +293,10 @@ L(first_vec_x0_check):
|
|
||||||
.p2align 4
|
|
||||||
L(first_vec_x1_check):
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
-# ifdef USE_AS_WCSLEN
|
|
||||||
+# ifdef USE_AS_WCSLEN
|
|
||||||
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
||||||
sall $2, %eax
|
|
||||||
-# endif
|
|
||||||
+# endif
|
|
||||||
/* Check the end of data. */
|
|
||||||
cmpq %rax, %rsi
|
|
||||||
jbe L(max)
|
|
||||||
@@ -311,10 +311,10 @@ L(first_vec_x1_check):
|
|
||||||
.p2align 4
|
|
||||||
L(first_vec_x2_check):
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
-# ifdef USE_AS_WCSLEN
|
|
||||||
+# ifdef USE_AS_WCSLEN
|
|
||||||
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
||||||
sall $2, %eax
|
|
||||||
-# endif
|
|
||||||
+# endif
|
|
||||||
/* Check the end of data. */
|
|
||||||
cmpq %rax, %rsi
|
|
||||||
jbe L(max)
|
|
||||||
@@ -329,10 +329,10 @@ L(first_vec_x2_check):
|
|
||||||
.p2align 4
|
|
||||||
L(first_vec_x3_check):
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
-# ifdef USE_AS_WCSLEN
|
|
||||||
+# ifdef USE_AS_WCSLEN
|
|
||||||
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
||||||
sall $2, %eax
|
|
||||||
-# endif
|
|
||||||
+# endif
|
|
||||||
/* Check the end of data. */
|
|
||||||
cmpq %rax, %rsi
|
|
||||||
jbe L(max)
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,51 +0,0 @@
|
|||||||
From 55bf411b451c13f0fb7ff3d3bf9a820020b45df1 Mon Sep 17 00:00:00 2001
|
|
||||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
|
||||||
Date: Mon, 19 Apr 2021 07:07:21 -0700
|
|
||||||
Subject: [PATCH] x86-64: Require BMI2 for __strlen_evex and __strnlen_evex
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
Since __strlen_evex and __strnlen_evex added by
|
|
||||||
|
|
||||||
commit 1fd8c163a83d96ace1ff78fa6bac7aee084f6f77
|
|
||||||
Author: H.J. Lu <hjl.tools@gmail.com>
|
|
||||||
Date: Fri Mar 5 06:24:52 2021 -0800
|
|
||||||
|
|
||||||
x86-64: Add ifunc-avx2.h functions with 256-bit EVEX
|
|
||||||
|
|
||||||
use sarx:
|
|
||||||
|
|
||||||
c4 e2 6a f7 c0 sarx %edx,%eax,%eax
|
|
||||||
|
|
||||||
require BMI2 for __strlen_evex and __strnlen_evex in ifunc-impl-list.c.
|
|
||||||
ifunc-avx2.h already requires BMI2 for EVEX implementation.
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 6 ++++--
|
|
||||||
1 file changed, 4 insertions(+), 2 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
||||||
index fec384f6..cbfc1a5d 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
||||||
@@ -293,7 +293,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
||||||
__strlen_avx2_rtm)
|
|
||||||
IFUNC_IMPL_ADD (array, i, strlen,
|
|
||||||
(CPU_FEATURE_USABLE (AVX512VL)
|
|
||||||
- && CPU_FEATURE_USABLE (AVX512BW)),
|
|
||||||
+ && CPU_FEATURE_USABLE (AVX512BW)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
|
||||||
__strlen_evex)
|
|
||||||
IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
|
|
||||||
|
|
||||||
@@ -308,7 +309,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
||||||
__strnlen_avx2_rtm)
|
|
||||||
IFUNC_IMPL_ADD (array, i, strnlen,
|
|
||||||
(CPU_FEATURE_USABLE (AVX512VL)
|
|
||||||
- && CPU_FEATURE_USABLE (AVX512BW)),
|
|
||||||
+ && CPU_FEATURE_USABLE (AVX512BW)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
|
||||||
__strnlen_evex)
|
|
||||||
IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
|
|
||||||
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,584 +0,0 @@
|
|||||||
From acfd088a1963ba51cd83c78f95c0ab25ead79e04 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
Date: Mon, 3 May 2021 03:01:58 -0400
|
|
||||||
Subject: [PATCH] x86: Optimize memchr-avx2.S
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
No bug. This commit optimizes memchr-avx2.S. The optimizations include
|
|
||||||
replacing some branches with cmovcc, avoiding some branches entirely
|
|
||||||
in the less_4x_vec case, making the page cross logic less strict,
|
|
||||||
and saving a few instructions in the loop return path. test-memchr,
|
|
||||||
test-rawmemchr, and test-wmemchr are all passing.
|
|
||||||
|
|
||||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/multiarch/memchr-avx2.S | 425 ++++++++++++++-----------
|
|
||||||
1 file changed, 247 insertions(+), 178 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
|
|
||||||
index cf893e77..b377f22e 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
|
|
||||||
@@ -26,8 +26,22 @@
|
|
||||||
|
|
||||||
# ifdef USE_AS_WMEMCHR
|
|
||||||
# define VPCMPEQ vpcmpeqd
|
|
||||||
+# define VPBROADCAST vpbroadcastd
|
|
||||||
+# define CHAR_SIZE 4
|
|
||||||
# else
|
|
||||||
# define VPCMPEQ vpcmpeqb
|
|
||||||
+# define VPBROADCAST vpbroadcastb
|
|
||||||
+# define CHAR_SIZE 1
|
|
||||||
+# endif
|
|
||||||
+
|
|
||||||
+# ifdef USE_AS_RAWMEMCHR
|
|
||||||
+# define ERAW_PTR_REG ecx
|
|
||||||
+# define RRAW_PTR_REG rcx
|
|
||||||
+# define ALGN_PTR_REG rdi
|
|
||||||
+# else
|
|
||||||
+# define ERAW_PTR_REG edi
|
|
||||||
+# define RRAW_PTR_REG rdi
|
|
||||||
+# define ALGN_PTR_REG rcx
|
|
||||||
# endif
|
|
||||||
|
|
||||||
# ifndef VZEROUPPER
|
|
||||||
@@ -39,6 +53,7 @@
|
|
||||||
# endif
|
|
||||||
|
|
||||||
# define VEC_SIZE 32
|
|
||||||
+# define PAGE_SIZE 4096
|
|
||||||
|
|
||||||
.section SECTION(.text),"ax",@progbits
|
|
||||||
ENTRY (MEMCHR)
|
|
||||||
@@ -47,295 +62,349 @@ ENTRY (MEMCHR)
|
|
||||||
test %RDX_LP, %RDX_LP
|
|
||||||
jz L(null)
|
|
||||||
# endif
|
|
||||||
- movl %edi, %ecx
|
|
||||||
- /* Broadcast CHAR to YMM0. */
|
|
||||||
- vmovd %esi, %xmm0
|
|
||||||
# ifdef USE_AS_WMEMCHR
|
|
||||||
shl $2, %RDX_LP
|
|
||||||
- vpbroadcastd %xmm0, %ymm0
|
|
||||||
# else
|
|
||||||
# ifdef __ILP32__
|
|
||||||
/* Clear the upper 32 bits. */
|
|
||||||
movl %edx, %edx
|
|
||||||
# endif
|
|
||||||
- vpbroadcastb %xmm0, %ymm0
|
|
||||||
# endif
|
|
||||||
+ /* Broadcast CHAR to YMMMATCH. */
|
|
||||||
+ vmovd %esi, %xmm0
|
|
||||||
+ VPBROADCAST %xmm0, %ymm0
|
|
||||||
/* Check if we may cross page boundary with one vector load. */
|
|
||||||
- andl $(2 * VEC_SIZE - 1), %ecx
|
|
||||||
- cmpl $VEC_SIZE, %ecx
|
|
||||||
- ja L(cros_page_boundary)
|
|
||||||
+ movl %edi, %eax
|
|
||||||
+ andl $(PAGE_SIZE - 1), %eax
|
|
||||||
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
|
|
||||||
+ ja L(cross_page_boundary)
|
|
||||||
|
|
||||||
/* Check the first VEC_SIZE bytes. */
|
|
||||||
- VPCMPEQ (%rdi), %ymm0, %ymm1
|
|
||||||
+ VPCMPEQ (%rdi), %ymm0, %ymm1
|
|
||||||
vpmovmskb %ymm1, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
-
|
|
||||||
# ifndef USE_AS_RAWMEMCHR
|
|
||||||
- jnz L(first_vec_x0_check)
|
|
||||||
- /* Adjust length and check the end of data. */
|
|
||||||
- subq $VEC_SIZE, %rdx
|
|
||||||
- jbe L(zero)
|
|
||||||
-# else
|
|
||||||
- jnz L(first_vec_x0)
|
|
||||||
+ /* If length < CHAR_PER_VEC handle special. */
|
|
||||||
+ cmpq $VEC_SIZE, %rdx
|
|
||||||
+ jbe L(first_vec_x0)
|
|
||||||
# endif
|
|
||||||
-
|
|
||||||
- /* Align data for aligned loads in the loop. */
|
|
||||||
- addq $VEC_SIZE, %rdi
|
|
||||||
- andl $(VEC_SIZE - 1), %ecx
|
|
||||||
- andq $-VEC_SIZE, %rdi
|
|
||||||
+ testl %eax, %eax
|
|
||||||
+ jz L(aligned_more)
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+ addq %rdi, %rax
|
|
||||||
+ VZEROUPPER_RETURN
|
|
||||||
|
|
||||||
# ifndef USE_AS_RAWMEMCHR
|
|
||||||
- /* Adjust length. */
|
|
||||||
- addq %rcx, %rdx
|
|
||||||
+ .p2align 5
|
|
||||||
+L(first_vec_x0):
|
|
||||||
+ /* Check if first match was before length. */
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+ xorl %ecx, %ecx
|
|
||||||
+ cmpl %eax, %edx
|
|
||||||
+ leaq (%rdi, %rax), %rax
|
|
||||||
+ cmovle %rcx, %rax
|
|
||||||
+ VZEROUPPER_RETURN
|
|
||||||
|
|
||||||
- subq $(VEC_SIZE * 4), %rdx
|
|
||||||
- jbe L(last_4x_vec_or_less)
|
|
||||||
+L(null):
|
|
||||||
+ xorl %eax, %eax
|
|
||||||
+ ret
|
|
||||||
# endif
|
|
||||||
- jmp L(more_4x_vec)
|
|
||||||
-
|
|
||||||
.p2align 4
|
|
||||||
-L(cros_page_boundary):
|
|
||||||
- andl $(VEC_SIZE - 1), %ecx
|
|
||||||
- andq $-VEC_SIZE, %rdi
|
|
||||||
- VPCMPEQ (%rdi), %ymm0, %ymm1
|
|
||||||
+L(cross_page_boundary):
|
|
||||||
+ /* Save pointer before aligning as its original value is necessary
|
|
||||||
+ for computer return address if byte is found or adjusting length
|
|
||||||
+ if it is not and this is memchr. */
|
|
||||||
+ movq %rdi, %rcx
|
|
||||||
+ /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and
|
|
||||||
+ rdi for rawmemchr. */
|
|
||||||
+ orq $(VEC_SIZE - 1), %ALGN_PTR_REG
|
|
||||||
+ VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
|
|
||||||
vpmovmskb %ymm1, %eax
|
|
||||||
+# ifndef USE_AS_RAWMEMCHR
|
|
||||||
+ /* Calculate length until end of page (length checked for a
|
|
||||||
+ match). */
|
|
||||||
+ leaq 1(%ALGN_PTR_REG), %rsi
|
|
||||||
+ subq %RRAW_PTR_REG, %rsi
|
|
||||||
+# endif
|
|
||||||
/* Remove the leading bytes. */
|
|
||||||
- sarl %cl, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
- jz L(aligned_more)
|
|
||||||
- tzcntl %eax, %eax
|
|
||||||
+ sarxl %ERAW_PTR_REG, %eax, %eax
|
|
||||||
# ifndef USE_AS_RAWMEMCHR
|
|
||||||
/* Check the end of data. */
|
|
||||||
- cmpq %rax, %rdx
|
|
||||||
- jbe L(zero)
|
|
||||||
+ cmpq %rsi, %rdx
|
|
||||||
+ jbe L(first_vec_x0)
|
|
||||||
# endif
|
|
||||||
- addq %rdi, %rax
|
|
||||||
- addq %rcx, %rax
|
|
||||||
+ testl %eax, %eax
|
|
||||||
+ jz L(cross_page_continue)
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+ addq %RRAW_PTR_REG, %rax
|
|
||||||
L(return_vzeroupper):
|
|
||||||
ZERO_UPPER_VEC_REGISTERS_RETURN
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
-L(aligned_more):
|
|
||||||
-# ifndef USE_AS_RAWMEMCHR
|
|
||||||
- /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
|
|
||||||
- instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
|
|
||||||
- overflow. */
|
|
||||||
- negq %rcx
|
|
||||||
- addq $VEC_SIZE, %rcx
|
|
||||||
+L(first_vec_x1):
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+ incq %rdi
|
|
||||||
+ addq %rdi, %rax
|
|
||||||
+ VZEROUPPER_RETURN
|
|
||||||
|
|
||||||
- /* Check the end of data. */
|
|
||||||
- subq %rcx, %rdx
|
|
||||||
- jbe L(zero)
|
|
||||||
-# endif
|
|
||||||
+ .p2align 4
|
|
||||||
+L(first_vec_x2):
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+ addq $(VEC_SIZE + 1), %rdi
|
|
||||||
+ addq %rdi, %rax
|
|
||||||
+ VZEROUPPER_RETURN
|
|
||||||
+
|
|
||||||
+ .p2align 4
|
|
||||||
+L(first_vec_x3):
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+ addq $(VEC_SIZE * 2 + 1), %rdi
|
|
||||||
+ addq %rdi, %rax
|
|
||||||
+ VZEROUPPER_RETURN
|
|
||||||
|
|
||||||
- addq $VEC_SIZE, %rdi
|
|
||||||
|
|
||||||
-# ifndef USE_AS_RAWMEMCHR
|
|
||||||
- subq $(VEC_SIZE * 4), %rdx
|
|
||||||
- jbe L(last_4x_vec_or_less)
|
|
||||||
-# endif
|
|
||||||
+ .p2align 4
|
|
||||||
+L(first_vec_x4):
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+ addq $(VEC_SIZE * 3 + 1), %rdi
|
|
||||||
+ addq %rdi, %rax
|
|
||||||
+ VZEROUPPER_RETURN
|
|
||||||
|
|
||||||
-L(more_4x_vec):
|
|
||||||
+ .p2align 4
|
|
||||||
+L(aligned_more):
|
|
||||||
/* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
|
|
||||||
since data is only aligned to VEC_SIZE. */
|
|
||||||
- VPCMPEQ (%rdi), %ymm0, %ymm1
|
|
||||||
- vpmovmskb %ymm1, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x0)
|
|
||||||
|
|
||||||
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
|
|
||||||
+# ifndef USE_AS_RAWMEMCHR
|
|
||||||
+L(cross_page_continue):
|
|
||||||
+ /* Align data to VEC_SIZE - 1. */
|
|
||||||
+ xorl %ecx, %ecx
|
|
||||||
+ subl %edi, %ecx
|
|
||||||
+ orq $(VEC_SIZE - 1), %rdi
|
|
||||||
+ /* esi is for adjusting length to see if near the end. */
|
|
||||||
+ leal (VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
|
|
||||||
+# else
|
|
||||||
+ orq $(VEC_SIZE - 1), %rdi
|
|
||||||
+L(cross_page_continue):
|
|
||||||
+# endif
|
|
||||||
+ /* Load first VEC regardless. */
|
|
||||||
+ VPCMPEQ 1(%rdi), %ymm0, %ymm1
|
|
||||||
vpmovmskb %ymm1, %eax
|
|
||||||
+# ifndef USE_AS_RAWMEMCHR
|
|
||||||
+ /* Adjust length. If near end handle specially. */
|
|
||||||
+ subq %rsi, %rdx
|
|
||||||
+ jbe L(last_4x_vec_or_less)
|
|
||||||
+# endif
|
|
||||||
testl %eax, %eax
|
|
||||||
jnz L(first_vec_x1)
|
|
||||||
|
|
||||||
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
|
|
||||||
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
|
|
||||||
vpmovmskb %ymm1, %eax
|
|
||||||
testl %eax, %eax
|
|
||||||
jnz L(first_vec_x2)
|
|
||||||
|
|
||||||
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
|
|
||||||
+ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
|
|
||||||
vpmovmskb %ymm1, %eax
|
|
||||||
testl %eax, %eax
|
|
||||||
jnz L(first_vec_x3)
|
|
||||||
|
|
||||||
- addq $(VEC_SIZE * 4), %rdi
|
|
||||||
+ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
|
|
||||||
+ vpmovmskb %ymm1, %eax
|
|
||||||
+ testl %eax, %eax
|
|
||||||
+ jnz L(first_vec_x4)
|
|
||||||
|
|
||||||
# ifndef USE_AS_RAWMEMCHR
|
|
||||||
+ /* Check if at last VEC_SIZE * 4 length. */
|
|
||||||
subq $(VEC_SIZE * 4), %rdx
|
|
||||||
- jbe L(last_4x_vec_or_less)
|
|
||||||
-# endif
|
|
||||||
-
|
|
||||||
- /* Align data to 4 * VEC_SIZE. */
|
|
||||||
- movq %rdi, %rcx
|
|
||||||
- andl $(4 * VEC_SIZE - 1), %ecx
|
|
||||||
- andq $-(4 * VEC_SIZE), %rdi
|
|
||||||
-
|
|
||||||
-# ifndef USE_AS_RAWMEMCHR
|
|
||||||
- /* Adjust length. */
|
|
||||||
+ jbe L(last_4x_vec_or_less_cmpeq)
|
|
||||||
+ /* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
|
|
||||||
+ length. */
|
|
||||||
+ incq %rdi
|
|
||||||
+ movl %edi, %ecx
|
|
||||||
+ orq $(VEC_SIZE * 4 - 1), %rdi
|
|
||||||
+ andl $(VEC_SIZE * 4 - 1), %ecx
|
|
||||||
addq %rcx, %rdx
|
|
||||||
+# else
|
|
||||||
+ /* Align data to VEC_SIZE * 4 - 1 for loop. */
|
|
||||||
+ incq %rdi
|
|
||||||
+ orq $(VEC_SIZE * 4 - 1), %rdi
|
|
||||||
# endif
|
|
||||||
|
|
||||||
+ /* Compare 4 * VEC at a time forward. */
|
|
||||||
.p2align 4
|
|
||||||
L(loop_4x_vec):
|
|
||||||
- /* Compare 4 * VEC at a time forward. */
|
|
||||||
- VPCMPEQ (%rdi), %ymm0, %ymm1
|
|
||||||
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
|
|
||||||
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
|
|
||||||
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
|
|
||||||
-
|
|
||||||
+ VPCMPEQ 1(%rdi), %ymm0, %ymm1
|
|
||||||
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm2
|
|
||||||
+ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm3
|
|
||||||
+ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm4
|
|
||||||
vpor %ymm1, %ymm2, %ymm5
|
|
||||||
vpor %ymm3, %ymm4, %ymm6
|
|
||||||
vpor %ymm5, %ymm6, %ymm5
|
|
||||||
|
|
||||||
- vpmovmskb %ymm5, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
- jnz L(4x_vec_end)
|
|
||||||
-
|
|
||||||
- addq $(VEC_SIZE * 4), %rdi
|
|
||||||
-
|
|
||||||
+ vpmovmskb %ymm5, %ecx
|
|
||||||
# ifdef USE_AS_RAWMEMCHR
|
|
||||||
- jmp L(loop_4x_vec)
|
|
||||||
+ subq $-(VEC_SIZE * 4), %rdi
|
|
||||||
+ testl %ecx, %ecx
|
|
||||||
+ jz L(loop_4x_vec)
|
|
||||||
# else
|
|
||||||
- subq $(VEC_SIZE * 4), %rdx
|
|
||||||
- ja L(loop_4x_vec)
|
|
||||||
+ testl %ecx, %ecx
|
|
||||||
+ jnz L(loop_4x_vec_end)
|
|
||||||
|
|
||||||
-L(last_4x_vec_or_less):
|
|
||||||
- /* Less than 4 * VEC and aligned to VEC_SIZE. */
|
|
||||||
- addl $(VEC_SIZE * 2), %edx
|
|
||||||
- jle L(last_2x_vec)
|
|
||||||
+ subq $-(VEC_SIZE * 4), %rdi
|
|
||||||
|
|
||||||
- VPCMPEQ (%rdi), %ymm0, %ymm1
|
|
||||||
- vpmovmskb %ymm1, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x0)
|
|
||||||
+ subq $(VEC_SIZE * 4), %rdx
|
|
||||||
+ ja L(loop_4x_vec)
|
|
||||||
|
|
||||||
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
|
|
||||||
+ /* Fall through into less than 4 remaining vectors of length case.
|
|
||||||
+ */
|
|
||||||
+ VPCMPEQ (VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
|
|
||||||
vpmovmskb %ymm1, %eax
|
|
||||||
+ .p2align 4
|
|
||||||
+L(last_4x_vec_or_less):
|
|
||||||
+ /* Check if first VEC contained match. */
|
|
||||||
testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x1)
|
|
||||||
+ jnz L(first_vec_x1_check)
|
|
||||||
|
|
||||||
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
|
|
||||||
- vpmovmskb %ymm1, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
+ /* If remaining length > VEC_SIZE * 2. */
|
|
||||||
+ addl $(VEC_SIZE * 2), %edx
|
|
||||||
+ jg L(last_4x_vec)
|
|
||||||
|
|
||||||
- jnz L(first_vec_x2_check)
|
|
||||||
- subl $VEC_SIZE, %edx
|
|
||||||
- jle L(zero)
|
|
||||||
+L(last_2x_vec):
|
|
||||||
+ /* If remaining length < VEC_SIZE. */
|
|
||||||
+ addl $VEC_SIZE, %edx
|
|
||||||
+ jle L(zero_end)
|
|
||||||
|
|
||||||
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
|
|
||||||
+ /* Check VEC2 and compare any match with remaining length. */
|
|
||||||
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
|
|
||||||
vpmovmskb %ymm1, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
-
|
|
||||||
- jnz L(first_vec_x3_check)
|
|
||||||
- xorl %eax, %eax
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+ cmpl %eax, %edx
|
|
||||||
+ jbe L(set_zero_end)
|
|
||||||
+ addq $(VEC_SIZE + 1), %rdi
|
|
||||||
+ addq %rdi, %rax
|
|
||||||
+L(zero_end):
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
-L(last_2x_vec):
|
|
||||||
- addl $(VEC_SIZE * 2), %edx
|
|
||||||
- VPCMPEQ (%rdi), %ymm0, %ymm1
|
|
||||||
+L(loop_4x_vec_end):
|
|
||||||
+# endif
|
|
||||||
+ /* rawmemchr will fall through into this if match was found in
|
|
||||||
+ loop. */
|
|
||||||
+
|
|
||||||
vpmovmskb %ymm1, %eax
|
|
||||||
testl %eax, %eax
|
|
||||||
+ jnz L(last_vec_x1_return)
|
|
||||||
|
|
||||||
- jnz L(first_vec_x0_check)
|
|
||||||
- subl $VEC_SIZE, %edx
|
|
||||||
- jle L(zero)
|
|
||||||
-
|
|
||||||
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
|
|
||||||
- vpmovmskb %ymm1, %eax
|
|
||||||
+ vpmovmskb %ymm2, %eax
|
|
||||||
testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x1_check)
|
|
||||||
- xorl %eax, %eax
|
|
||||||
- VZEROUPPER_RETURN
|
|
||||||
+ jnz L(last_vec_x2_return)
|
|
||||||
|
|
||||||
- .p2align 4
|
|
||||||
-L(first_vec_x0_check):
|
|
||||||
- tzcntl %eax, %eax
|
|
||||||
- /* Check the end of data. */
|
|
||||||
- cmpq %rax, %rdx
|
|
||||||
- jbe L(zero)
|
|
||||||
+ vpmovmskb %ymm3, %eax
|
|
||||||
+ /* Combine VEC3 matches (eax) with VEC4 matches (ecx). */
|
|
||||||
+ salq $32, %rcx
|
|
||||||
+ orq %rcx, %rax
|
|
||||||
+ tzcntq %rax, %rax
|
|
||||||
+# ifdef USE_AS_RAWMEMCHR
|
|
||||||
+ subq $(VEC_SIZE * 2 - 1), %rdi
|
|
||||||
+# else
|
|
||||||
+ subq $-(VEC_SIZE * 2 + 1), %rdi
|
|
||||||
+# endif
|
|
||||||
addq %rdi, %rax
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
+# ifndef USE_AS_RAWMEMCHR
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
L(first_vec_x1_check):
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
- /* Check the end of data. */
|
|
||||||
- cmpq %rax, %rdx
|
|
||||||
- jbe L(zero)
|
|
||||||
- addq $VEC_SIZE, %rax
|
|
||||||
+ /* Adjust length. */
|
|
||||||
+ subl $-(VEC_SIZE * 4), %edx
|
|
||||||
+ /* Check if match within remaining length. */
|
|
||||||
+ cmpl %eax, %edx
|
|
||||||
+ jbe L(set_zero_end)
|
|
||||||
+ incq %rdi
|
|
||||||
addq %rdi, %rax
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
+ .p2align 4
|
|
||||||
+L(set_zero_end):
|
|
||||||
+ xorl %eax, %eax
|
|
||||||
+ VZEROUPPER_RETURN
|
|
||||||
+# endif
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
-L(first_vec_x2_check):
|
|
||||||
+L(last_vec_x1_return):
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
- /* Check the end of data. */
|
|
||||||
- cmpq %rax, %rdx
|
|
||||||
- jbe L(zero)
|
|
||||||
- addq $(VEC_SIZE * 2), %rax
|
|
||||||
+# ifdef USE_AS_RAWMEMCHR
|
|
||||||
+ subq $(VEC_SIZE * 4 - 1), %rdi
|
|
||||||
+# else
|
|
||||||
+ incq %rdi
|
|
||||||
+# endif
|
|
||||||
addq %rdi, %rax
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
-L(first_vec_x3_check):
|
|
||||||
+L(last_vec_x2_return):
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
- /* Check the end of data. */
|
|
||||||
- cmpq %rax, %rdx
|
|
||||||
- jbe L(zero)
|
|
||||||
- addq $(VEC_SIZE * 3), %rax
|
|
||||||
+# ifdef USE_AS_RAWMEMCHR
|
|
||||||
+ subq $(VEC_SIZE * 3 - 1), %rdi
|
|
||||||
+# else
|
|
||||||
+ subq $-(VEC_SIZE + 1), %rdi
|
|
||||||
+# endif
|
|
||||||
addq %rdi, %rax
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
|
|
||||||
+# ifndef USE_AS_RAWMEMCHR
|
|
||||||
.p2align 4
|
|
||||||
-L(zero):
|
|
||||||
- xorl %eax, %eax
|
|
||||||
- jmp L(return_vzeroupper)
|
|
||||||
+L(last_4x_vec_or_less_cmpeq):
|
|
||||||
+ VPCMPEQ (VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
|
|
||||||
+ vpmovmskb %ymm1, %eax
|
|
||||||
+ subq $-(VEC_SIZE * 4), %rdi
|
|
||||||
+ /* Check first VEC regardless. */
|
|
||||||
+ testl %eax, %eax
|
|
||||||
+ jnz L(first_vec_x1_check)
|
|
||||||
|
|
||||||
+ /* If remaining length <= CHAR_PER_VEC * 2. */
|
|
||||||
+ addl $(VEC_SIZE * 2), %edx
|
|
||||||
+ jle L(last_2x_vec)
|
|
||||||
.p2align 4
|
|
||||||
-L(null):
|
|
||||||
- xorl %eax, %eax
|
|
||||||
- ret
|
|
||||||
-# endif
|
|
||||||
+L(last_4x_vec):
|
|
||||||
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
|
|
||||||
+ vpmovmskb %ymm1, %eax
|
|
||||||
+ testl %eax, %eax
|
|
||||||
+ jnz L(last_vec_x2_return)
|
|
||||||
|
|
||||||
- .p2align 4
|
|
||||||
-L(first_vec_x0):
|
|
||||||
- tzcntl %eax, %eax
|
|
||||||
- addq %rdi, %rax
|
|
||||||
- VZEROUPPER_RETURN
|
|
||||||
+ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
|
|
||||||
+ vpmovmskb %ymm1, %eax
|
|
||||||
|
|
||||||
- .p2align 4
|
|
||||||
-L(first_vec_x1):
|
|
||||||
- tzcntl %eax, %eax
|
|
||||||
- addq $VEC_SIZE, %rax
|
|
||||||
- addq %rdi, %rax
|
|
||||||
- VZEROUPPER_RETURN
|
|
||||||
+ /* Create mask for possible matches within remaining length. */
|
|
||||||
+ movq $-1, %rcx
|
|
||||||
+ bzhiq %rdx, %rcx, %rcx
|
|
||||||
|
|
||||||
- .p2align 4
|
|
||||||
-L(first_vec_x2):
|
|
||||||
+ /* Test matches in data against length match. */
|
|
||||||
+ andl %ecx, %eax
|
|
||||||
+ jnz L(last_vec_x3)
|
|
||||||
+
|
|
||||||
+ /* if remaining length <= VEC_SIZE * 3 (Note this is after
|
|
||||||
+ remaining length was found to be > VEC_SIZE * 2. */
|
|
||||||
+ subl $VEC_SIZE, %edx
|
|
||||||
+ jbe L(zero_end2)
|
|
||||||
+
|
|
||||||
+ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
|
|
||||||
+ vpmovmskb %ymm1, %eax
|
|
||||||
+ /* Shift remaining length mask for last VEC. */
|
|
||||||
+ shrq $32, %rcx
|
|
||||||
+ andl %ecx, %eax
|
|
||||||
+ jz L(zero_end2)
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
- addq $(VEC_SIZE * 2), %rax
|
|
||||||
+ addq $(VEC_SIZE * 3 + 1), %rdi
|
|
||||||
addq %rdi, %rax
|
|
||||||
+L(zero_end2):
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
-L(4x_vec_end):
|
|
||||||
- vpmovmskb %ymm1, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x0)
|
|
||||||
- vpmovmskb %ymm2, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x1)
|
|
||||||
- vpmovmskb %ymm3, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x2)
|
|
||||||
- vpmovmskb %ymm4, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
-L(first_vec_x3):
|
|
||||||
+L(last_vec_x3):
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
- addq $(VEC_SIZE * 3), %rax
|
|
||||||
+ subq $-(VEC_SIZE * 2 + 1), %rdi
|
|
||||||
addq %rdi, %rax
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
+# endif
|
|
||||||
|
|
||||||
END (MEMCHR)
|
|
||||||
#endif
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,388 +0,0 @@
|
|||||||
From 645a158978f9520e74074e8c14047503be4db0f0 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
Date: Wed, 9 Jun 2021 16:25:32 -0400
|
|
||||||
Subject: [PATCH] x86: Fix overflow bug with wmemchr-sse2 and wmemchr-avx2 [BZ
|
|
||||||
#27974]
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
This commit fixes the bug mentioned in the previous commit.
|
|
||||||
|
|
||||||
The previous implementations of wmemchr in these files relied
|
|
||||||
on n * sizeof(wchar_t) not overflowing, which was not guaranteed by the standard.
|
|
||||||
|
|
||||||
The new overflow tests added in the previous commit now
|
|
||||||
pass (As well as all the other tests).
|
|
||||||
|
|
||||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/memchr.S | 77 +++++++++++++++++++-------
|
|
||||||
sysdeps/x86_64/multiarch/memchr-avx2.S | 58 +++++++++++++------
|
|
||||||
2 files changed, 98 insertions(+), 37 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
|
|
||||||
index cb320257..24f9a0c5 100644
|
|
||||||
--- a/sysdeps/x86_64/memchr.S
|
|
||||||
+++ b/sysdeps/x86_64/memchr.S
|
|
||||||
@@ -21,9 +21,11 @@
|
|
||||||
#ifdef USE_AS_WMEMCHR
|
|
||||||
# define MEMCHR wmemchr
|
|
||||||
# define PCMPEQ pcmpeqd
|
|
||||||
+# define CHAR_PER_VEC 4
|
|
||||||
#else
|
|
||||||
# define MEMCHR memchr
|
|
||||||
# define PCMPEQ pcmpeqb
|
|
||||||
+# define CHAR_PER_VEC 16
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/* fast SSE2 version with using pmaxub and 64 byte loop */
|
|
||||||
@@ -33,15 +35,14 @@ ENTRY(MEMCHR)
|
|
||||||
movd %esi, %xmm1
|
|
||||||
mov %edi, %ecx
|
|
||||||
|
|
||||||
+#ifdef __ILP32__
|
|
||||||
+ /* Clear the upper 32 bits. */
|
|
||||||
+ movl %edx, %edx
|
|
||||||
+#endif
|
|
||||||
#ifdef USE_AS_WMEMCHR
|
|
||||||
test %RDX_LP, %RDX_LP
|
|
||||||
jz L(return_null)
|
|
||||||
- shl $2, %RDX_LP
|
|
||||||
#else
|
|
||||||
-# ifdef __ILP32__
|
|
||||||
- /* Clear the upper 32 bits. */
|
|
||||||
- movl %edx, %edx
|
|
||||||
-# endif
|
|
||||||
punpcklbw %xmm1, %xmm1
|
|
||||||
test %RDX_LP, %RDX_LP
|
|
||||||
jz L(return_null)
|
|
||||||
@@ -60,13 +61,16 @@ ENTRY(MEMCHR)
|
|
||||||
test %eax, %eax
|
|
||||||
|
|
||||||
jnz L(matches_1)
|
|
||||||
- sub $16, %rdx
|
|
||||||
+ sub $CHAR_PER_VEC, %rdx
|
|
||||||
jbe L(return_null)
|
|
||||||
add $16, %rdi
|
|
||||||
and $15, %ecx
|
|
||||||
and $-16, %rdi
|
|
||||||
+#ifdef USE_AS_WMEMCHR
|
|
||||||
+ shr $2, %ecx
|
|
||||||
+#endif
|
|
||||||
add %rcx, %rdx
|
|
||||||
- sub $64, %rdx
|
|
||||||
+ sub $(CHAR_PER_VEC * 4), %rdx
|
|
||||||
jbe L(exit_loop)
|
|
||||||
jmp L(loop_prolog)
|
|
||||||
|
|
||||||
@@ -77,16 +81,21 @@ L(crosscache):
|
|
||||||
movdqa (%rdi), %xmm0
|
|
||||||
|
|
||||||
PCMPEQ %xmm1, %xmm0
|
|
||||||
-/* Check if there is a match. */
|
|
||||||
+ /* Check if there is a match. */
|
|
||||||
pmovmskb %xmm0, %eax
|
|
||||||
-/* Remove the leading bytes. */
|
|
||||||
+ /* Remove the leading bytes. */
|
|
||||||
sar %cl, %eax
|
|
||||||
test %eax, %eax
|
|
||||||
je L(unaligned_no_match)
|
|
||||||
-/* Check which byte is a match. */
|
|
||||||
+ /* Check which byte is a match. */
|
|
||||||
bsf %eax, %eax
|
|
||||||
-
|
|
||||||
+#ifdef USE_AS_WMEMCHR
|
|
||||||
+ mov %eax, %esi
|
|
||||||
+ shr $2, %esi
|
|
||||||
+ sub %rsi, %rdx
|
|
||||||
+#else
|
|
||||||
sub %rax, %rdx
|
|
||||||
+#endif
|
|
||||||
jbe L(return_null)
|
|
||||||
add %rdi, %rax
|
|
||||||
add %rcx, %rax
|
|
||||||
@@ -94,15 +103,18 @@ L(crosscache):
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
L(unaligned_no_match):
|
|
||||||
- /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using
|
|
||||||
+ /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using
|
|
||||||
"rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void
|
|
||||||
possible addition overflow. */
|
|
||||||
neg %rcx
|
|
||||||
add $16, %rcx
|
|
||||||
+#ifdef USE_AS_WMEMCHR
|
|
||||||
+ shr $2, %ecx
|
|
||||||
+#endif
|
|
||||||
sub %rcx, %rdx
|
|
||||||
jbe L(return_null)
|
|
||||||
add $16, %rdi
|
|
||||||
- sub $64, %rdx
|
|
||||||
+ sub $(CHAR_PER_VEC * 4), %rdx
|
|
||||||
jbe L(exit_loop)
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
@@ -135,7 +147,7 @@ L(loop_prolog):
|
|
||||||
test $0x3f, %rdi
|
|
||||||
jz L(align64_loop)
|
|
||||||
|
|
||||||
- sub $64, %rdx
|
|
||||||
+ sub $(CHAR_PER_VEC * 4), %rdx
|
|
||||||
jbe L(exit_loop)
|
|
||||||
|
|
||||||
movdqa (%rdi), %xmm0
|
|
||||||
@@ -167,11 +179,14 @@ L(loop_prolog):
|
|
||||||
mov %rdi, %rcx
|
|
||||||
and $-64, %rdi
|
|
||||||
and $63, %ecx
|
|
||||||
+#ifdef USE_AS_WMEMCHR
|
|
||||||
+ shr $2, %ecx
|
|
||||||
+#endif
|
|
||||||
add %rcx, %rdx
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
L(align64_loop):
|
|
||||||
- sub $64, %rdx
|
|
||||||
+ sub $(CHAR_PER_VEC * 4), %rdx
|
|
||||||
jbe L(exit_loop)
|
|
||||||
movdqa (%rdi), %xmm0
|
|
||||||
movdqa 16(%rdi), %xmm2
|
|
||||||
@@ -218,7 +233,7 @@ L(align64_loop):
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
L(exit_loop):
|
|
||||||
- add $32, %edx
|
|
||||||
+ add $(CHAR_PER_VEC * 2), %edx
|
|
||||||
jle L(exit_loop_32)
|
|
||||||
|
|
||||||
movdqa (%rdi), %xmm0
|
|
||||||
@@ -238,7 +253,7 @@ L(exit_loop):
|
|
||||||
pmovmskb %xmm3, %eax
|
|
||||||
test %eax, %eax
|
|
||||||
jnz L(matches32_1)
|
|
||||||
- sub $16, %edx
|
|
||||||
+ sub $CHAR_PER_VEC, %edx
|
|
||||||
jle L(return_null)
|
|
||||||
|
|
||||||
PCMPEQ 48(%rdi), %xmm1
|
|
||||||
@@ -250,13 +265,13 @@ L(exit_loop):
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
L(exit_loop_32):
|
|
||||||
- add $32, %edx
|
|
||||||
+ add $(CHAR_PER_VEC * 2), %edx
|
|
||||||
movdqa (%rdi), %xmm0
|
|
||||||
PCMPEQ %xmm1, %xmm0
|
|
||||||
pmovmskb %xmm0, %eax
|
|
||||||
test %eax, %eax
|
|
||||||
jnz L(matches_1)
|
|
||||||
- sub $16, %edx
|
|
||||||
+ sub $CHAR_PER_VEC, %edx
|
|
||||||
jbe L(return_null)
|
|
||||||
|
|
||||||
PCMPEQ 16(%rdi), %xmm1
|
|
||||||
@@ -293,7 +308,13 @@ L(matches32):
|
|
||||||
.p2align 4
|
|
||||||
L(matches_1):
|
|
||||||
bsf %eax, %eax
|
|
||||||
+#ifdef USE_AS_WMEMCHR
|
|
||||||
+ mov %eax, %esi
|
|
||||||
+ shr $2, %esi
|
|
||||||
+ sub %rsi, %rdx
|
|
||||||
+#else
|
|
||||||
sub %rax, %rdx
|
|
||||||
+#endif
|
|
||||||
jbe L(return_null)
|
|
||||||
add %rdi, %rax
|
|
||||||
ret
|
|
||||||
@@ -301,7 +322,13 @@ L(matches_1):
|
|
||||||
.p2align 4
|
|
||||||
L(matches16_1):
|
|
||||||
bsf %eax, %eax
|
|
||||||
+#ifdef USE_AS_WMEMCHR
|
|
||||||
+ mov %eax, %esi
|
|
||||||
+ shr $2, %esi
|
|
||||||
+ sub %rsi, %rdx
|
|
||||||
+#else
|
|
||||||
sub %rax, %rdx
|
|
||||||
+#endif
|
|
||||||
jbe L(return_null)
|
|
||||||
lea 16(%rdi, %rax), %rax
|
|
||||||
ret
|
|
||||||
@@ -309,7 +336,13 @@ L(matches16_1):
|
|
||||||
.p2align 4
|
|
||||||
L(matches32_1):
|
|
||||||
bsf %eax, %eax
|
|
||||||
+#ifdef USE_AS_WMEMCHR
|
|
||||||
+ mov %eax, %esi
|
|
||||||
+ shr $2, %esi
|
|
||||||
+ sub %rsi, %rdx
|
|
||||||
+#else
|
|
||||||
sub %rax, %rdx
|
|
||||||
+#endif
|
|
||||||
jbe L(return_null)
|
|
||||||
lea 32(%rdi, %rax), %rax
|
|
||||||
ret
|
|
||||||
@@ -317,7 +350,13 @@ L(matches32_1):
|
|
||||||
.p2align 4
|
|
||||||
L(matches48_1):
|
|
||||||
bsf %eax, %eax
|
|
||||||
+#ifdef USE_AS_WMEMCHR
|
|
||||||
+ mov %eax, %esi
|
|
||||||
+ shr $2, %esi
|
|
||||||
+ sub %rsi, %rdx
|
|
||||||
+#else
|
|
||||||
sub %rax, %rdx
|
|
||||||
+#endif
|
|
||||||
jbe L(return_null)
|
|
||||||
lea 48(%rdi, %rax), %rax
|
|
||||||
ret
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
|
|
||||||
index b377f22e..16027abb 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
|
|
||||||
@@ -54,21 +54,19 @@
|
|
||||||
|
|
||||||
# define VEC_SIZE 32
|
|
||||||
# define PAGE_SIZE 4096
|
|
||||||
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
|
|
||||||
|
|
||||||
.section SECTION(.text),"ax",@progbits
|
|
||||||
ENTRY (MEMCHR)
|
|
||||||
# ifndef USE_AS_RAWMEMCHR
|
|
||||||
/* Check for zero length. */
|
|
||||||
- test %RDX_LP, %RDX_LP
|
|
||||||
- jz L(null)
|
|
||||||
-# endif
|
|
||||||
-# ifdef USE_AS_WMEMCHR
|
|
||||||
- shl $2, %RDX_LP
|
|
||||||
-# else
|
|
||||||
# ifdef __ILP32__
|
|
||||||
- /* Clear the upper 32 bits. */
|
|
||||||
- movl %edx, %edx
|
|
||||||
+ /* Clear upper bits. */
|
|
||||||
+ and %RDX_LP, %RDX_LP
|
|
||||||
+# else
|
|
||||||
+ test %RDX_LP, %RDX_LP
|
|
||||||
# endif
|
|
||||||
+ jz L(null)
|
|
||||||
# endif
|
|
||||||
/* Broadcast CHAR to YMMMATCH. */
|
|
||||||
vmovd %esi, %xmm0
|
|
||||||
@@ -84,7 +82,7 @@ ENTRY (MEMCHR)
|
|
||||||
vpmovmskb %ymm1, %eax
|
|
||||||
# ifndef USE_AS_RAWMEMCHR
|
|
||||||
/* If length < CHAR_PER_VEC handle special. */
|
|
||||||
- cmpq $VEC_SIZE, %rdx
|
|
||||||
+ cmpq $CHAR_PER_VEC, %rdx
|
|
||||||
jbe L(first_vec_x0)
|
|
||||||
# endif
|
|
||||||
testl %eax, %eax
|
|
||||||
@@ -98,6 +96,10 @@ ENTRY (MEMCHR)
|
|
||||||
L(first_vec_x0):
|
|
||||||
/* Check if first match was before length. */
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
+# ifdef USE_AS_WMEMCHR
|
|
||||||
+ /* NB: Multiply length by 4 to get byte count. */
|
|
||||||
+ sall $2, %edx
|
|
||||||
+# endif
|
|
||||||
xorl %ecx, %ecx
|
|
||||||
cmpl %eax, %edx
|
|
||||||
leaq (%rdi, %rax), %rax
|
|
||||||
@@ -110,12 +112,12 @@ L(null):
|
|
||||||
# endif
|
|
||||||
.p2align 4
|
|
||||||
L(cross_page_boundary):
|
|
||||||
- /* Save pointer before aligning as its original value is necessary
|
|
||||||
- for computer return address if byte is found or adjusting length
|
|
||||||
- if it is not and this is memchr. */
|
|
||||||
+ /* Save pointer before aligning as its original value is
|
|
||||||
+ necessary for computer return address if byte is found or
|
|
||||||
+ adjusting length if it is not and this is memchr. */
|
|
||||||
movq %rdi, %rcx
|
|
||||||
- /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and
|
|
||||||
- rdi for rawmemchr. */
|
|
||||||
+ /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr
|
|
||||||
+ and rdi for rawmemchr. */
|
|
||||||
orq $(VEC_SIZE - 1), %ALGN_PTR_REG
|
|
||||||
VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
|
|
||||||
vpmovmskb %ymm1, %eax
|
|
||||||
@@ -124,6 +126,10 @@ L(cross_page_boundary):
|
|
||||||
match). */
|
|
||||||
leaq 1(%ALGN_PTR_REG), %rsi
|
|
||||||
subq %RRAW_PTR_REG, %rsi
|
|
||||||
+# ifdef USE_AS_WMEMCHR
|
|
||||||
+ /* NB: Divide bytes by 4 to get wchar_t count. */
|
|
||||||
+ shrl $2, %esi
|
|
||||||
+# endif
|
|
||||||
# endif
|
|
||||||
/* Remove the leading bytes. */
|
|
||||||
sarxl %ERAW_PTR_REG, %eax, %eax
|
|
||||||
@@ -181,6 +187,10 @@ L(cross_page_continue):
|
|
||||||
orq $(VEC_SIZE - 1), %rdi
|
|
||||||
/* esi is for adjusting length to see if near the end. */
|
|
||||||
leal (VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
|
|
||||||
+# ifdef USE_AS_WMEMCHR
|
|
||||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
|
||||||
+ sarl $2, %esi
|
|
||||||
+# endif
|
|
||||||
# else
|
|
||||||
orq $(VEC_SIZE - 1), %rdi
|
|
||||||
L(cross_page_continue):
|
|
||||||
@@ -213,7 +223,7 @@ L(cross_page_continue):
|
|
||||||
|
|
||||||
# ifndef USE_AS_RAWMEMCHR
|
|
||||||
/* Check if at last VEC_SIZE * 4 length. */
|
|
||||||
- subq $(VEC_SIZE * 4), %rdx
|
|
||||||
+ subq $(CHAR_PER_VEC * 4), %rdx
|
|
||||||
jbe L(last_4x_vec_or_less_cmpeq)
|
|
||||||
/* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
|
|
||||||
length. */
|
|
||||||
@@ -221,6 +231,10 @@ L(cross_page_continue):
|
|
||||||
movl %edi, %ecx
|
|
||||||
orq $(VEC_SIZE * 4 - 1), %rdi
|
|
||||||
andl $(VEC_SIZE * 4 - 1), %ecx
|
|
||||||
+# ifdef USE_AS_WMEMCHR
|
|
||||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
|
||||||
+ sarl $2, %ecx
|
|
||||||
+# endif
|
|
||||||
addq %rcx, %rdx
|
|
||||||
# else
|
|
||||||
/* Align data to VEC_SIZE * 4 - 1 for loop. */
|
|
||||||
@@ -250,15 +264,19 @@ L(loop_4x_vec):
|
|
||||||
|
|
||||||
subq $-(VEC_SIZE * 4), %rdi
|
|
||||||
|
|
||||||
- subq $(VEC_SIZE * 4), %rdx
|
|
||||||
+ subq $(CHAR_PER_VEC * 4), %rdx
|
|
||||||
ja L(loop_4x_vec)
|
|
||||||
|
|
||||||
- /* Fall through into less than 4 remaining vectors of length case.
|
|
||||||
- */
|
|
||||||
+ /* Fall through into less than 4 remaining vectors of length
|
|
||||||
+ case. */
|
|
||||||
VPCMPEQ (VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
|
|
||||||
vpmovmskb %ymm1, %eax
|
|
||||||
.p2align 4
|
|
||||||
L(last_4x_vec_or_less):
|
|
||||||
+# ifdef USE_AS_WMEMCHR
|
|
||||||
+ /* NB: Multiply length by 4 to get byte count. */
|
|
||||||
+ sall $2, %edx
|
|
||||||
+# endif
|
|
||||||
/* Check if first VEC contained match. */
|
|
||||||
testl %eax, %eax
|
|
||||||
jnz L(first_vec_x1_check)
|
|
||||||
@@ -355,6 +373,10 @@ L(last_vec_x2_return):
|
|
||||||
L(last_4x_vec_or_less_cmpeq):
|
|
||||||
VPCMPEQ (VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
|
|
||||||
vpmovmskb %ymm1, %eax
|
|
||||||
+# ifdef USE_AS_WMEMCHR
|
|
||||||
+ /* NB: Multiply length by 4 to get byte count. */
|
|
||||||
+ sall $2, %edx
|
|
||||||
+# endif
|
|
||||||
subq $-(VEC_SIZE * 4), %rdi
|
|
||||||
/* Check first VEC regardless. */
|
|
||||||
testl %eax, %eax
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,767 +0,0 @@
|
|||||||
From aaa23c35071537e2dcf5807e956802ed215210aa Mon Sep 17 00:00:00 2001
|
|
||||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
Date: Mon, 19 Apr 2021 19:36:07 -0400
|
|
||||||
Subject: [PATCH] x86: Optimize strlen-avx2.S
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
No bug. This commit optimizes strlen-avx2.S. The optimizations are
|
|
||||||
mostly small things but they add up to roughly 10-30% performance
|
|
||||||
improvement for strlen. The results for strnlen are a bit more
|
|
||||||
ambiguous. test-strlen, test-strnlen, test-wcslen, and test-wcsnlen
|
|
||||||
are all passing.
|
|
||||||
|
|
||||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 +-
|
|
||||||
sysdeps/x86_64/multiarch/strlen-avx2.S | 532 +++++++++++++--------
|
|
||||||
2 files changed, 334 insertions(+), 214 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
||||||
index cbfc1a5d..f1a6460a 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
||||||
@@ -285,10 +285,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
||||||
/* Support sysdeps/x86_64/multiarch/strlen.c. */
|
|
||||||
IFUNC_IMPL (i, name, strlen,
|
|
||||||
IFUNC_IMPL_ADD (array, i, strlen,
|
|
||||||
- CPU_FEATURE_USABLE (AVX2),
|
|
||||||
+ (CPU_FEATURE_USABLE (AVX2)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
|
||||||
__strlen_avx2)
|
|
||||||
IFUNC_IMPL_ADD (array, i, strlen,
|
|
||||||
(CPU_FEATURE_USABLE (AVX2)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)
|
|
||||||
&& CPU_FEATURE_USABLE (RTM)),
|
|
||||||
__strlen_avx2_rtm)
|
|
||||||
IFUNC_IMPL_ADD (array, i, strlen,
|
|
||||||
@@ -301,10 +303,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
||||||
/* Support sysdeps/x86_64/multiarch/strnlen.c. */
|
|
||||||
IFUNC_IMPL (i, name, strnlen,
|
|
||||||
IFUNC_IMPL_ADD (array, i, strnlen,
|
|
||||||
- CPU_FEATURE_USABLE (AVX2),
|
|
||||||
+ (CPU_FEATURE_USABLE (AVX2)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
|
||||||
__strnlen_avx2)
|
|
||||||
IFUNC_IMPL_ADD (array, i, strnlen,
|
|
||||||
(CPU_FEATURE_USABLE (AVX2)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)
|
|
||||||
&& CPU_FEATURE_USABLE (RTM)),
|
|
||||||
__strnlen_avx2_rtm)
|
|
||||||
IFUNC_IMPL_ADD (array, i, strnlen,
|
|
||||||
@@ -640,10 +644,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
||||||
/* Support sysdeps/x86_64/multiarch/wcslen.c. */
|
|
||||||
IFUNC_IMPL (i, name, wcslen,
|
|
||||||
IFUNC_IMPL_ADD (array, i, wcslen,
|
|
||||||
- CPU_FEATURE_USABLE (AVX2),
|
|
||||||
+ (CPU_FEATURE_USABLE (AVX2)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
|
||||||
__wcslen_avx2)
|
|
||||||
IFUNC_IMPL_ADD (array, i, wcslen,
|
|
||||||
(CPU_FEATURE_USABLE (AVX2)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)
|
|
||||||
&& CPU_FEATURE_USABLE (RTM)),
|
|
||||||
__wcslen_avx2_rtm)
|
|
||||||
IFUNC_IMPL_ADD (array, i, wcslen,
|
|
||||||
@@ -656,10 +662,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
||||||
/* Support sysdeps/x86_64/multiarch/wcsnlen.c. */
|
|
||||||
IFUNC_IMPL (i, name, wcsnlen,
|
|
||||||
IFUNC_IMPL_ADD (array, i, wcsnlen,
|
|
||||||
- CPU_FEATURE_USABLE (AVX2),
|
|
||||||
+ (CPU_FEATURE_USABLE (AVX2)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
|
||||||
__wcsnlen_avx2)
|
|
||||||
IFUNC_IMPL_ADD (array, i, wcsnlen,
|
|
||||||
(CPU_FEATURE_USABLE (AVX2)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)
|
|
||||||
&& CPU_FEATURE_USABLE (RTM)),
|
|
||||||
__wcsnlen_avx2_rtm)
|
|
||||||
IFUNC_IMPL_ADD (array, i, wcsnlen,
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
|
|
||||||
index 82826e10..be8a5db5 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/strlen-avx2.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
|
|
||||||
@@ -27,9 +27,11 @@
|
|
||||||
# ifdef USE_AS_WCSLEN
|
|
||||||
# define VPCMPEQ vpcmpeqd
|
|
||||||
# define VPMINU vpminud
|
|
||||||
+# define CHAR_SIZE 4
|
|
||||||
# else
|
|
||||||
# define VPCMPEQ vpcmpeqb
|
|
||||||
# define VPMINU vpminub
|
|
||||||
+# define CHAR_SIZE 1
|
|
||||||
# endif
|
|
||||||
|
|
||||||
# ifndef VZEROUPPER
|
|
||||||
@@ -41,349 +43,459 @@
|
|
||||||
# endif
|
|
||||||
|
|
||||||
# define VEC_SIZE 32
|
|
||||||
+# define PAGE_SIZE 4096
|
|
||||||
|
|
||||||
.section SECTION(.text),"ax",@progbits
|
|
||||||
ENTRY (STRLEN)
|
|
||||||
# ifdef USE_AS_STRNLEN
|
|
||||||
- /* Check for zero length. */
|
|
||||||
+ /* Check zero length. */
|
|
||||||
test %RSI_LP, %RSI_LP
|
|
||||||
jz L(zero)
|
|
||||||
+ /* Store max len in R8_LP before adjusting if using WCSLEN. */
|
|
||||||
+ mov %RSI_LP, %R8_LP
|
|
||||||
# ifdef USE_AS_WCSLEN
|
|
||||||
shl $2, %RSI_LP
|
|
||||||
# elif defined __ILP32__
|
|
||||||
/* Clear the upper 32 bits. */
|
|
||||||
movl %esi, %esi
|
|
||||||
# endif
|
|
||||||
- mov %RSI_LP, %R8_LP
|
|
||||||
# endif
|
|
||||||
- movl %edi, %ecx
|
|
||||||
+ movl %edi, %eax
|
|
||||||
movq %rdi, %rdx
|
|
||||||
vpxor %xmm0, %xmm0, %xmm0
|
|
||||||
-
|
|
||||||
+ /* Clear high bits from edi. Only keeping bits relevant to page
|
|
||||||
+ cross check. */
|
|
||||||
+ andl $(PAGE_SIZE - 1), %eax
|
|
||||||
/* Check if we may cross page boundary with one vector load. */
|
|
||||||
- andl $(2 * VEC_SIZE - 1), %ecx
|
|
||||||
- cmpl $VEC_SIZE, %ecx
|
|
||||||
- ja L(cros_page_boundary)
|
|
||||||
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
|
|
||||||
+ ja L(cross_page_boundary)
|
|
||||||
|
|
||||||
/* Check the first VEC_SIZE bytes. */
|
|
||||||
- VPCMPEQ (%rdi), %ymm0, %ymm1
|
|
||||||
- vpmovmskb %ymm1, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
-
|
|
||||||
+ VPCMPEQ (%rdi), %ymm0, %ymm1
|
|
||||||
+ vpmovmskb %ymm1, %eax
|
|
||||||
# ifdef USE_AS_STRNLEN
|
|
||||||
- jnz L(first_vec_x0_check)
|
|
||||||
- /* Adjust length and check the end of data. */
|
|
||||||
- subq $VEC_SIZE, %rsi
|
|
||||||
- jbe L(max)
|
|
||||||
-# else
|
|
||||||
- jnz L(first_vec_x0)
|
|
||||||
+ /* If length < VEC_SIZE handle special. */
|
|
||||||
+ cmpq $VEC_SIZE, %rsi
|
|
||||||
+ jbe L(first_vec_x0)
|
|
||||||
# endif
|
|
||||||
-
|
|
||||||
- /* Align data for aligned loads in the loop. */
|
|
||||||
- addq $VEC_SIZE, %rdi
|
|
||||||
- andl $(VEC_SIZE - 1), %ecx
|
|
||||||
- andq $-VEC_SIZE, %rdi
|
|
||||||
+ /* If empty continue to aligned_more. Otherwise return bit
|
|
||||||
+ position of first match. */
|
|
||||||
+ testl %eax, %eax
|
|
||||||
+ jz L(aligned_more)
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+# ifdef USE_AS_WCSLEN
|
|
||||||
+ shrl $2, %eax
|
|
||||||
+# endif
|
|
||||||
+ VZEROUPPER_RETURN
|
|
||||||
|
|
||||||
# ifdef USE_AS_STRNLEN
|
|
||||||
- /* Adjust length. */
|
|
||||||
- addq %rcx, %rsi
|
|
||||||
+L(zero):
|
|
||||||
+ xorl %eax, %eax
|
|
||||||
+ ret
|
|
||||||
|
|
||||||
- subq $(VEC_SIZE * 4), %rsi
|
|
||||||
- jbe L(last_4x_vec_or_less)
|
|
||||||
+ .p2align 4
|
|
||||||
+L(first_vec_x0):
|
|
||||||
+ /* Set bit for max len so that tzcnt will return min of max len
|
|
||||||
+ and position of first match. */
|
|
||||||
+ btsq %rsi, %rax
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+# ifdef USE_AS_WCSLEN
|
|
||||||
+ shrl $2, %eax
|
|
||||||
+# endif
|
|
||||||
+ VZEROUPPER_RETURN
|
|
||||||
# endif
|
|
||||||
- jmp L(more_4x_vec)
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
-L(cros_page_boundary):
|
|
||||||
- andl $(VEC_SIZE - 1), %ecx
|
|
||||||
- andq $-VEC_SIZE, %rdi
|
|
||||||
- VPCMPEQ (%rdi), %ymm0, %ymm1
|
|
||||||
- vpmovmskb %ymm1, %eax
|
|
||||||
- /* Remove the leading bytes. */
|
|
||||||
- sarl %cl, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
- jz L(aligned_more)
|
|
||||||
+L(first_vec_x1):
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
+ /* Safe to use 32 bit instructions as these are only called for
|
|
||||||
+ size = [1, 159]. */
|
|
||||||
# ifdef USE_AS_STRNLEN
|
|
||||||
- /* Check the end of data. */
|
|
||||||
- cmpq %rax, %rsi
|
|
||||||
- jbe L(max)
|
|
||||||
+ /* Use ecx which was computed earlier to compute correct value.
|
|
||||||
+ */
|
|
||||||
+ subl $(VEC_SIZE * 4 + 1), %ecx
|
|
||||||
+ addl %ecx, %eax
|
|
||||||
+# else
|
|
||||||
+ subl %edx, %edi
|
|
||||||
+ incl %edi
|
|
||||||
+ addl %edi, %eax
|
|
||||||
# endif
|
|
||||||
- addq %rdi, %rax
|
|
||||||
- addq %rcx, %rax
|
|
||||||
- subq %rdx, %rax
|
|
||||||
# ifdef USE_AS_WCSLEN
|
|
||||||
- shrq $2, %rax
|
|
||||||
+ shrl $2, %eax
|
|
||||||
# endif
|
|
||||||
-L(return_vzeroupper):
|
|
||||||
- ZERO_UPPER_VEC_REGISTERS_RETURN
|
|
||||||
+ VZEROUPPER_RETURN
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
-L(aligned_more):
|
|
||||||
+L(first_vec_x2):
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+ /* Safe to use 32 bit instructions as these are only called for
|
|
||||||
+ size = [1, 159]. */
|
|
||||||
# ifdef USE_AS_STRNLEN
|
|
||||||
- /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE"
|
|
||||||
- with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE"
|
|
||||||
- to void possible addition overflow. */
|
|
||||||
- negq %rcx
|
|
||||||
- addq $VEC_SIZE, %rcx
|
|
||||||
-
|
|
||||||
- /* Check the end of data. */
|
|
||||||
- subq %rcx, %rsi
|
|
||||||
- jbe L(max)
|
|
||||||
+ /* Use ecx which was computed earlier to compute correct value.
|
|
||||||
+ */
|
|
||||||
+ subl $(VEC_SIZE * 3 + 1), %ecx
|
|
||||||
+ addl %ecx, %eax
|
|
||||||
+# else
|
|
||||||
+ subl %edx, %edi
|
|
||||||
+ addl $(VEC_SIZE + 1), %edi
|
|
||||||
+ addl %edi, %eax
|
|
||||||
# endif
|
|
||||||
+# ifdef USE_AS_WCSLEN
|
|
||||||
+ shrl $2, %eax
|
|
||||||
+# endif
|
|
||||||
+ VZEROUPPER_RETURN
|
|
||||||
|
|
||||||
- addq $VEC_SIZE, %rdi
|
|
||||||
+ .p2align 4
|
|
||||||
+L(first_vec_x3):
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+ /* Safe to use 32 bit instructions as these are only called for
|
|
||||||
+ size = [1, 159]. */
|
|
||||||
+# ifdef USE_AS_STRNLEN
|
|
||||||
+ /* Use ecx which was computed earlier to compute correct value.
|
|
||||||
+ */
|
|
||||||
+ subl $(VEC_SIZE * 2 + 1), %ecx
|
|
||||||
+ addl %ecx, %eax
|
|
||||||
+# else
|
|
||||||
+ subl %edx, %edi
|
|
||||||
+ addl $(VEC_SIZE * 2 + 1), %edi
|
|
||||||
+ addl %edi, %eax
|
|
||||||
+# endif
|
|
||||||
+# ifdef USE_AS_WCSLEN
|
|
||||||
+ shrl $2, %eax
|
|
||||||
+# endif
|
|
||||||
+ VZEROUPPER_RETURN
|
|
||||||
|
|
||||||
+ .p2align 4
|
|
||||||
+L(first_vec_x4):
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+ /* Safe to use 32 bit instructions as these are only called for
|
|
||||||
+ size = [1, 159]. */
|
|
||||||
# ifdef USE_AS_STRNLEN
|
|
||||||
- subq $(VEC_SIZE * 4), %rsi
|
|
||||||
- jbe L(last_4x_vec_or_less)
|
|
||||||
+ /* Use ecx which was computed earlier to compute correct value.
|
|
||||||
+ */
|
|
||||||
+ subl $(VEC_SIZE + 1), %ecx
|
|
||||||
+ addl %ecx, %eax
|
|
||||||
+# else
|
|
||||||
+ subl %edx, %edi
|
|
||||||
+ addl $(VEC_SIZE * 3 + 1), %edi
|
|
||||||
+ addl %edi, %eax
|
|
||||||
# endif
|
|
||||||
+# ifdef USE_AS_WCSLEN
|
|
||||||
+ shrl $2, %eax
|
|
||||||
+# endif
|
|
||||||
+ VZEROUPPER_RETURN
|
|
||||||
|
|
||||||
-L(more_4x_vec):
|
|
||||||
+ .p2align 5
|
|
||||||
+L(aligned_more):
|
|
||||||
+ /* Align data to VEC_SIZE - 1. This is the same number of
|
|
||||||
+ instructions as using andq with -VEC_SIZE but saves 4 bytes of
|
|
||||||
+ code on the x4 check. */
|
|
||||||
+ orq $(VEC_SIZE - 1), %rdi
|
|
||||||
+L(cross_page_continue):
|
|
||||||
/* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
|
|
||||||
since data is only aligned to VEC_SIZE. */
|
|
||||||
- VPCMPEQ (%rdi), %ymm0, %ymm1
|
|
||||||
- vpmovmskb %ymm1, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x0)
|
|
||||||
-
|
|
||||||
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
|
|
||||||
- vpmovmskb %ymm1, %eax
|
|
||||||
+# ifdef USE_AS_STRNLEN
|
|
||||||
+ /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
|
|
||||||
+ it simplifies the logic in last_4x_vec_or_less. */
|
|
||||||
+ leaq (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
|
|
||||||
+ subq %rdx, %rcx
|
|
||||||
+# endif
|
|
||||||
+ /* Load first VEC regardless. */
|
|
||||||
+ VPCMPEQ 1(%rdi), %ymm0, %ymm1
|
|
||||||
+# ifdef USE_AS_STRNLEN
|
|
||||||
+ /* Adjust length. If near end handle specially. */
|
|
||||||
+ subq %rcx, %rsi
|
|
||||||
+ jb L(last_4x_vec_or_less)
|
|
||||||
+# endif
|
|
||||||
+ vpmovmskb %ymm1, %eax
|
|
||||||
testl %eax, %eax
|
|
||||||
jnz L(first_vec_x1)
|
|
||||||
|
|
||||||
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
|
|
||||||
- vpmovmskb %ymm1, %eax
|
|
||||||
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
|
|
||||||
+ vpmovmskb %ymm1, %eax
|
|
||||||
testl %eax, %eax
|
|
||||||
jnz L(first_vec_x2)
|
|
||||||
|
|
||||||
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
|
|
||||||
- vpmovmskb %ymm1, %eax
|
|
||||||
+ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
|
|
||||||
+ vpmovmskb %ymm1, %eax
|
|
||||||
testl %eax, %eax
|
|
||||||
jnz L(first_vec_x3)
|
|
||||||
|
|
||||||
- addq $(VEC_SIZE * 4), %rdi
|
|
||||||
-
|
|
||||||
-# ifdef USE_AS_STRNLEN
|
|
||||||
- subq $(VEC_SIZE * 4), %rsi
|
|
||||||
- jbe L(last_4x_vec_or_less)
|
|
||||||
-# endif
|
|
||||||
-
|
|
||||||
- /* Align data to 4 * VEC_SIZE. */
|
|
||||||
- movq %rdi, %rcx
|
|
||||||
- andl $(4 * VEC_SIZE - 1), %ecx
|
|
||||||
- andq $-(4 * VEC_SIZE), %rdi
|
|
||||||
+ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
|
|
||||||
+ vpmovmskb %ymm1, %eax
|
|
||||||
+ testl %eax, %eax
|
|
||||||
+ jnz L(first_vec_x4)
|
|
||||||
|
|
||||||
+ /* Align data to VEC_SIZE * 4 - 1. */
|
|
||||||
# ifdef USE_AS_STRNLEN
|
|
||||||
- /* Adjust length. */
|
|
||||||
+ /* Before adjusting length check if at last VEC_SIZE * 4. */
|
|
||||||
+ cmpq $(VEC_SIZE * 4 - 1), %rsi
|
|
||||||
+ jbe L(last_4x_vec_or_less_load)
|
|
||||||
+ incq %rdi
|
|
||||||
+ movl %edi, %ecx
|
|
||||||
+ orq $(VEC_SIZE * 4 - 1), %rdi
|
|
||||||
+ andl $(VEC_SIZE * 4 - 1), %ecx
|
|
||||||
+ /* Readjust length. */
|
|
||||||
addq %rcx, %rsi
|
|
||||||
+# else
|
|
||||||
+ incq %rdi
|
|
||||||
+ orq $(VEC_SIZE * 4 - 1), %rdi
|
|
||||||
# endif
|
|
||||||
-
|
|
||||||
+ /* Compare 4 * VEC at a time forward. */
|
|
||||||
.p2align 4
|
|
||||||
L(loop_4x_vec):
|
|
||||||
- /* Compare 4 * VEC at a time forward. */
|
|
||||||
- vmovdqa (%rdi), %ymm1
|
|
||||||
- vmovdqa VEC_SIZE(%rdi), %ymm2
|
|
||||||
- vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3
|
|
||||||
- vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4
|
|
||||||
- VPMINU %ymm1, %ymm2, %ymm5
|
|
||||||
- VPMINU %ymm3, %ymm4, %ymm6
|
|
||||||
- VPMINU %ymm5, %ymm6, %ymm5
|
|
||||||
-
|
|
||||||
- VPCMPEQ %ymm5, %ymm0, %ymm5
|
|
||||||
- vpmovmskb %ymm5, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
- jnz L(4x_vec_end)
|
|
||||||
-
|
|
||||||
- addq $(VEC_SIZE * 4), %rdi
|
|
||||||
-
|
|
||||||
-# ifndef USE_AS_STRNLEN
|
|
||||||
- jmp L(loop_4x_vec)
|
|
||||||
-# else
|
|
||||||
+# ifdef USE_AS_STRNLEN
|
|
||||||
+ /* Break if at end of length. */
|
|
||||||
subq $(VEC_SIZE * 4), %rsi
|
|
||||||
- ja L(loop_4x_vec)
|
|
||||||
-
|
|
||||||
-L(last_4x_vec_or_less):
|
|
||||||
- /* Less than 4 * VEC and aligned to VEC_SIZE. */
|
|
||||||
- addl $(VEC_SIZE * 2), %esi
|
|
||||||
- jle L(last_2x_vec)
|
|
||||||
+ jb L(last_4x_vec_or_less_cmpeq)
|
|
||||||
+# endif
|
|
||||||
+ /* Save some code size by microfusing VPMINU with the load. Since
|
|
||||||
+ the matches in ymm2/ymm4 can only be returned if there were no
|
|
||||||
+ matches in ymm1/ymm3 respectively there is no issue with overlap.
|
|
||||||
+ */
|
|
||||||
+ vmovdqa 1(%rdi), %ymm1
|
|
||||||
+ VPMINU (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
|
|
||||||
+ vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3
|
|
||||||
+ VPMINU (VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4
|
|
||||||
+
|
|
||||||
+ VPMINU %ymm2, %ymm4, %ymm5
|
|
||||||
+ VPCMPEQ %ymm5, %ymm0, %ymm5
|
|
||||||
+ vpmovmskb %ymm5, %ecx
|
|
||||||
|
|
||||||
- VPCMPEQ (%rdi), %ymm0, %ymm1
|
|
||||||
- vpmovmskb %ymm1, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x0)
|
|
||||||
+ subq $-(VEC_SIZE * 4), %rdi
|
|
||||||
+ testl %ecx, %ecx
|
|
||||||
+ jz L(loop_4x_vec)
|
|
||||||
|
|
||||||
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
|
|
||||||
- vpmovmskb %ymm1, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x1)
|
|
||||||
|
|
||||||
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
|
|
||||||
- vpmovmskb %ymm1, %eax
|
|
||||||
+ VPCMPEQ %ymm1, %ymm0, %ymm1
|
|
||||||
+ vpmovmskb %ymm1, %eax
|
|
||||||
+ subq %rdx, %rdi
|
|
||||||
testl %eax, %eax
|
|
||||||
+ jnz L(last_vec_return_x0)
|
|
||||||
|
|
||||||
- jnz L(first_vec_x2_check)
|
|
||||||
- subl $VEC_SIZE, %esi
|
|
||||||
- jle L(max)
|
|
||||||
-
|
|
||||||
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
|
|
||||||
- vpmovmskb %ymm1, %eax
|
|
||||||
+ VPCMPEQ %ymm2, %ymm0, %ymm2
|
|
||||||
+ vpmovmskb %ymm2, %eax
|
|
||||||
testl %eax, %eax
|
|
||||||
-
|
|
||||||
- jnz L(first_vec_x3_check)
|
|
||||||
- movq %r8, %rax
|
|
||||||
-# ifdef USE_AS_WCSLEN
|
|
||||||
+ jnz L(last_vec_return_x1)
|
|
||||||
+
|
|
||||||
+ /* Combine last 2 VEC. */
|
|
||||||
+ VPCMPEQ %ymm3, %ymm0, %ymm3
|
|
||||||
+ vpmovmskb %ymm3, %eax
|
|
||||||
+ /* rcx has combined result from all 4 VEC. It will only be used if
|
|
||||||
+ the first 3 other VEC all did not contain a match. */
|
|
||||||
+ salq $32, %rcx
|
|
||||||
+ orq %rcx, %rax
|
|
||||||
+ tzcntq %rax, %rax
|
|
||||||
+ subq $(VEC_SIZE * 2 - 1), %rdi
|
|
||||||
+ addq %rdi, %rax
|
|
||||||
+# ifdef USE_AS_WCSLEN
|
|
||||||
shrq $2, %rax
|
|
||||||
-# endif
|
|
||||||
+# endif
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
|
|
||||||
+
|
|
||||||
+# ifdef USE_AS_STRNLEN
|
|
||||||
.p2align 4
|
|
||||||
-L(last_2x_vec):
|
|
||||||
- addl $(VEC_SIZE * 2), %esi
|
|
||||||
- VPCMPEQ (%rdi), %ymm0, %ymm1
|
|
||||||
- vpmovmskb %ymm1, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
+L(last_4x_vec_or_less_load):
|
|
||||||
+ /* Depending on entry adjust rdi / prepare first VEC in ymm1. */
|
|
||||||
+ subq $-(VEC_SIZE * 4), %rdi
|
|
||||||
+L(last_4x_vec_or_less_cmpeq):
|
|
||||||
+ VPCMPEQ 1(%rdi), %ymm0, %ymm1
|
|
||||||
+L(last_4x_vec_or_less):
|
|
||||||
|
|
||||||
- jnz L(first_vec_x0_check)
|
|
||||||
- subl $VEC_SIZE, %esi
|
|
||||||
- jle L(max)
|
|
||||||
+ vpmovmskb %ymm1, %eax
|
|
||||||
+ /* If remaining length > VEC_SIZE * 2. This works if esi is off by
|
|
||||||
+ VEC_SIZE * 4. */
|
|
||||||
+ testl $(VEC_SIZE * 2), %esi
|
|
||||||
+ jnz L(last_4x_vec)
|
|
||||||
|
|
||||||
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
|
|
||||||
- vpmovmskb %ymm1, %eax
|
|
||||||
+ /* length may have been negative or positive by an offset of
|
|
||||||
+ VEC_SIZE * 4 depending on where this was called from. This fixes
|
|
||||||
+ that. */
|
|
||||||
+ andl $(VEC_SIZE * 4 - 1), %esi
|
|
||||||
testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x1_check)
|
|
||||||
- movq %r8, %rax
|
|
||||||
-# ifdef USE_AS_WCSLEN
|
|
||||||
- shrq $2, %rax
|
|
||||||
-# endif
|
|
||||||
- VZEROUPPER_RETURN
|
|
||||||
+ jnz L(last_vec_x1_check)
|
|
||||||
|
|
||||||
- .p2align 4
|
|
||||||
-L(first_vec_x0_check):
|
|
||||||
+ subl $VEC_SIZE, %esi
|
|
||||||
+ jb L(max)
|
|
||||||
+
|
|
||||||
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
|
|
||||||
+ vpmovmskb %ymm1, %eax
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
/* Check the end of data. */
|
|
||||||
- cmpq %rax, %rsi
|
|
||||||
- jbe L(max)
|
|
||||||
+ cmpl %eax, %esi
|
|
||||||
+ jb L(max)
|
|
||||||
+ subq %rdx, %rdi
|
|
||||||
+ addl $(VEC_SIZE + 1), %eax
|
|
||||||
addq %rdi, %rax
|
|
||||||
- subq %rdx, %rax
|
|
||||||
# ifdef USE_AS_WCSLEN
|
|
||||||
shrq $2, %rax
|
|
||||||
# endif
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
+# endif
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
-L(first_vec_x1_check):
|
|
||||||
+L(last_vec_return_x0):
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
- /* Check the end of data. */
|
|
||||||
- cmpq %rax, %rsi
|
|
||||||
- jbe L(max)
|
|
||||||
- addq $VEC_SIZE, %rax
|
|
||||||
+ subq $(VEC_SIZE * 4 - 1), %rdi
|
|
||||||
addq %rdi, %rax
|
|
||||||
- subq %rdx, %rax
|
|
||||||
-# ifdef USE_AS_WCSLEN
|
|
||||||
+# ifdef USE_AS_WCSLEN
|
|
||||||
shrq $2, %rax
|
|
||||||
-# endif
|
|
||||||
+# endif
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
-L(first_vec_x2_check):
|
|
||||||
+L(last_vec_return_x1):
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
- /* Check the end of data. */
|
|
||||||
- cmpq %rax, %rsi
|
|
||||||
- jbe L(max)
|
|
||||||
- addq $(VEC_SIZE * 2), %rax
|
|
||||||
+ subq $(VEC_SIZE * 3 - 1), %rdi
|
|
||||||
addq %rdi, %rax
|
|
||||||
- subq %rdx, %rax
|
|
||||||
-# ifdef USE_AS_WCSLEN
|
|
||||||
+# ifdef USE_AS_WCSLEN
|
|
||||||
shrq $2, %rax
|
|
||||||
-# endif
|
|
||||||
+# endif
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
|
|
||||||
+# ifdef USE_AS_STRNLEN
|
|
||||||
.p2align 4
|
|
||||||
-L(first_vec_x3_check):
|
|
||||||
+L(last_vec_x1_check):
|
|
||||||
+
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
/* Check the end of data. */
|
|
||||||
- cmpq %rax, %rsi
|
|
||||||
- jbe L(max)
|
|
||||||
- addq $(VEC_SIZE * 3), %rax
|
|
||||||
+ cmpl %eax, %esi
|
|
||||||
+ jb L(max)
|
|
||||||
+ subq %rdx, %rdi
|
|
||||||
+ incl %eax
|
|
||||||
addq %rdi, %rax
|
|
||||||
- subq %rdx, %rax
|
|
||||||
# ifdef USE_AS_WCSLEN
|
|
||||||
shrq $2, %rax
|
|
||||||
# endif
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
|
|
||||||
- .p2align 4
|
|
||||||
L(max):
|
|
||||||
movq %r8, %rax
|
|
||||||
+ VZEROUPPER_RETURN
|
|
||||||
+
|
|
||||||
+ .p2align 4
|
|
||||||
+L(last_4x_vec):
|
|
||||||
+ /* Test first 2x VEC normally. */
|
|
||||||
+ testl %eax, %eax
|
|
||||||
+ jnz L(last_vec_x1)
|
|
||||||
+
|
|
||||||
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
|
|
||||||
+ vpmovmskb %ymm1, %eax
|
|
||||||
+ testl %eax, %eax
|
|
||||||
+ jnz L(last_vec_x2)
|
|
||||||
+
|
|
||||||
+ /* Normalize length. */
|
|
||||||
+ andl $(VEC_SIZE * 4 - 1), %esi
|
|
||||||
+ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
|
|
||||||
+ vpmovmskb %ymm1, %eax
|
|
||||||
+ testl %eax, %eax
|
|
||||||
+ jnz L(last_vec_x3)
|
|
||||||
+
|
|
||||||
+ subl $(VEC_SIZE * 3), %esi
|
|
||||||
+ jb L(max)
|
|
||||||
+
|
|
||||||
+ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
|
|
||||||
+ vpmovmskb %ymm1, %eax
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+ /* Check the end of data. */
|
|
||||||
+ cmpl %eax, %esi
|
|
||||||
+ jb L(max)
|
|
||||||
+ subq %rdx, %rdi
|
|
||||||
+ addl $(VEC_SIZE * 3 + 1), %eax
|
|
||||||
+ addq %rdi, %rax
|
|
||||||
# ifdef USE_AS_WCSLEN
|
|
||||||
shrq $2, %rax
|
|
||||||
# endif
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
|
|
||||||
- .p2align 4
|
|
||||||
-L(zero):
|
|
||||||
- xorl %eax, %eax
|
|
||||||
- ret
|
|
||||||
-# endif
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
-L(first_vec_x0):
|
|
||||||
+L(last_vec_x1):
|
|
||||||
+ /* essentially duplicates of first_vec_x1 but use 64 bit
|
|
||||||
+ instructions. */
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
+ subq %rdx, %rdi
|
|
||||||
+ incl %eax
|
|
||||||
addq %rdi, %rax
|
|
||||||
- subq %rdx, %rax
|
|
||||||
-# ifdef USE_AS_WCSLEN
|
|
||||||
+# ifdef USE_AS_WCSLEN
|
|
||||||
shrq $2, %rax
|
|
||||||
-# endif
|
|
||||||
+# endif
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
-L(first_vec_x1):
|
|
||||||
+L(last_vec_x2):
|
|
||||||
+ /* essentially duplicates of first_vec_x2 but use 64 bit
|
|
||||||
+ instructions. */
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
- addq $VEC_SIZE, %rax
|
|
||||||
+ subq %rdx, %rdi
|
|
||||||
+ addl $(VEC_SIZE + 1), %eax
|
|
||||||
addq %rdi, %rax
|
|
||||||
- subq %rdx, %rax
|
|
||||||
-# ifdef USE_AS_WCSLEN
|
|
||||||
+# ifdef USE_AS_WCSLEN
|
|
||||||
shrq $2, %rax
|
|
||||||
-# endif
|
|
||||||
+# endif
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
-L(first_vec_x2):
|
|
||||||
+L(last_vec_x3):
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
- addq $(VEC_SIZE * 2), %rax
|
|
||||||
+ subl $(VEC_SIZE * 2), %esi
|
|
||||||
+ /* Check the end of data. */
|
|
||||||
+ cmpl %eax, %esi
|
|
||||||
+ jb L(max_end)
|
|
||||||
+ subq %rdx, %rdi
|
|
||||||
+ addl $(VEC_SIZE * 2 + 1), %eax
|
|
||||||
addq %rdi, %rax
|
|
||||||
- subq %rdx, %rax
|
|
||||||
-# ifdef USE_AS_WCSLEN
|
|
||||||
+# ifdef USE_AS_WCSLEN
|
|
||||||
shrq $2, %rax
|
|
||||||
-# endif
|
|
||||||
+# endif
|
|
||||||
+ VZEROUPPER_RETURN
|
|
||||||
+L(max_end):
|
|
||||||
+ movq %r8, %rax
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
+# endif
|
|
||||||
|
|
||||||
+ /* Cold case for crossing page with first load. */
|
|
||||||
.p2align 4
|
|
||||||
-L(4x_vec_end):
|
|
||||||
- VPCMPEQ %ymm1, %ymm0, %ymm1
|
|
||||||
- vpmovmskb %ymm1, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x0)
|
|
||||||
- VPCMPEQ %ymm2, %ymm0, %ymm2
|
|
||||||
- vpmovmskb %ymm2, %eax
|
|
||||||
+L(cross_page_boundary):
|
|
||||||
+ /* Align data to VEC_SIZE - 1. */
|
|
||||||
+ orq $(VEC_SIZE - 1), %rdi
|
|
||||||
+ VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
|
|
||||||
+ vpmovmskb %ymm1, %eax
|
|
||||||
+ /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
|
|
||||||
+ so no need to manually mod rdx. */
|
|
||||||
+ sarxl %edx, %eax, %eax
|
|
||||||
+# ifdef USE_AS_STRNLEN
|
|
||||||
testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x1)
|
|
||||||
- VPCMPEQ %ymm3, %ymm0, %ymm3
|
|
||||||
- vpmovmskb %ymm3, %eax
|
|
||||||
+ jnz L(cross_page_less_vec)
|
|
||||||
+ leaq 1(%rdi), %rcx
|
|
||||||
+ subq %rdx, %rcx
|
|
||||||
+ /* Check length. */
|
|
||||||
+ cmpq %rsi, %rcx
|
|
||||||
+ jb L(cross_page_continue)
|
|
||||||
+ movq %r8, %rax
|
|
||||||
+# else
|
|
||||||
testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x2)
|
|
||||||
- VPCMPEQ %ymm4, %ymm0, %ymm4
|
|
||||||
- vpmovmskb %ymm4, %eax
|
|
||||||
-L(first_vec_x3):
|
|
||||||
+ jz L(cross_page_continue)
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
- addq $(VEC_SIZE * 3), %rax
|
|
||||||
- addq %rdi, %rax
|
|
||||||
- subq %rdx, %rax
|
|
||||||
-# ifdef USE_AS_WCSLEN
|
|
||||||
- shrq $2, %rax
|
|
||||||
+# ifdef USE_AS_WCSLEN
|
|
||||||
+ shrl $2, %eax
|
|
||||||
+# endif
|
|
||||||
# endif
|
|
||||||
+L(return_vzeroupper):
|
|
||||||
+ ZERO_UPPER_VEC_REGISTERS_RETURN
|
|
||||||
+
|
|
||||||
+# ifdef USE_AS_STRNLEN
|
|
||||||
+ .p2align 4
|
|
||||||
+L(cross_page_less_vec):
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+ cmpq %rax, %rsi
|
|
||||||
+ cmovb %esi, %eax
|
|
||||||
+# ifdef USE_AS_WCSLEN
|
|
||||||
+ shrl $2, %eax
|
|
||||||
+# endif
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
+# endif
|
|
||||||
|
|
||||||
END (STRLEN)
|
|
||||||
#endif
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,701 +0,0 @@
|
|||||||
From 2a76821c3081d2c0231ecd2618f52662cb48fccd Mon Sep 17 00:00:00 2001
|
|
||||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
Date: Mon, 3 May 2021 03:03:19 -0400
|
|
||||||
Subject: [PATCH] x86: Optimize memchr-evex.S
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
No bug. This commit optimizes memchr-evex.S. The optimizations include
|
|
||||||
replacing some branches with cmovcc, avoiding some branches entirely
|
|
||||||
in the less_4x_vec case, making the page cross logic less strict,
|
|
||||||
saving some ALU in the alignment process, and most importantly
|
|
||||||
increasing ILP in the 4x loop. test-memchr, test-rawmemchr, and
|
|
||||||
test-wmemchr are all passing.
|
|
||||||
|
|
||||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/multiarch/memchr-evex.S | 547 +++++++++++++++----------
|
|
||||||
1 file changed, 322 insertions(+), 225 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
|
|
||||||
index 6dd5d67b..81d5cd64 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/memchr-evex.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
|
|
||||||
@@ -26,14 +26,28 @@
|
|
||||||
|
|
||||||
# ifdef USE_AS_WMEMCHR
|
|
||||||
# define VPBROADCAST vpbroadcastd
|
|
||||||
-# define VPCMP vpcmpd
|
|
||||||
-# define SHIFT_REG r8d
|
|
||||||
+# define VPMINU vpminud
|
|
||||||
+# define VPCMP vpcmpd
|
|
||||||
+# define VPCMPEQ vpcmpeqd
|
|
||||||
+# define CHAR_SIZE 4
|
|
||||||
# else
|
|
||||||
# define VPBROADCAST vpbroadcastb
|
|
||||||
-# define VPCMP vpcmpb
|
|
||||||
-# define SHIFT_REG ecx
|
|
||||||
+# define VPMINU vpminub
|
|
||||||
+# define VPCMP vpcmpb
|
|
||||||
+# define VPCMPEQ vpcmpeqb
|
|
||||||
+# define CHAR_SIZE 1
|
|
||||||
# endif
|
|
||||||
|
|
||||||
+# ifdef USE_AS_RAWMEMCHR
|
|
||||||
+# define RAW_PTR_REG rcx
|
|
||||||
+# define ALGN_PTR_REG rdi
|
|
||||||
+# else
|
|
||||||
+# define RAW_PTR_REG rdi
|
|
||||||
+# define ALGN_PTR_REG rcx
|
|
||||||
+# endif
|
|
||||||
+
|
|
||||||
+# define XMMZERO xmm23
|
|
||||||
+# define YMMZERO ymm23
|
|
||||||
# define XMMMATCH xmm16
|
|
||||||
# define YMMMATCH ymm16
|
|
||||||
# define YMM1 ymm17
|
|
||||||
@@ -44,6 +58,8 @@
|
|
||||||
# define YMM6 ymm22
|
|
||||||
|
|
||||||
# define VEC_SIZE 32
|
|
||||||
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
|
|
||||||
+# define PAGE_SIZE 4096
|
|
||||||
|
|
||||||
.section .text.evex,"ax",@progbits
|
|
||||||
ENTRY (MEMCHR)
|
|
||||||
@@ -51,11 +67,7 @@ ENTRY (MEMCHR)
|
|
||||||
/* Check for zero length. */
|
|
||||||
test %RDX_LP, %RDX_LP
|
|
||||||
jz L(zero)
|
|
||||||
-# endif
|
|
||||||
- movl %edi, %ecx
|
|
||||||
-# ifdef USE_AS_WMEMCHR
|
|
||||||
- shl $2, %RDX_LP
|
|
||||||
-# else
|
|
||||||
+
|
|
||||||
# ifdef __ILP32__
|
|
||||||
/* Clear the upper 32 bits. */
|
|
||||||
movl %edx, %edx
|
|
||||||
@@ -64,318 +76,403 @@ ENTRY (MEMCHR)
|
|
||||||
/* Broadcast CHAR to YMMMATCH. */
|
|
||||||
VPBROADCAST %esi, %YMMMATCH
|
|
||||||
/* Check if we may cross page boundary with one vector load. */
|
|
||||||
- andl $(2 * VEC_SIZE - 1), %ecx
|
|
||||||
- cmpl $VEC_SIZE, %ecx
|
|
||||||
- ja L(cros_page_boundary)
|
|
||||||
+ movl %edi, %eax
|
|
||||||
+ andl $(PAGE_SIZE - 1), %eax
|
|
||||||
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
|
|
||||||
+ ja L(cross_page_boundary)
|
|
||||||
|
|
||||||
/* Check the first VEC_SIZE bytes. */
|
|
||||||
- VPCMP $0, (%rdi), %YMMMATCH, %k1
|
|
||||||
- kmovd %k1, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
-
|
|
||||||
+ VPCMP $0, (%rdi), %YMMMATCH, %k0
|
|
||||||
+ kmovd %k0, %eax
|
|
||||||
# ifndef USE_AS_RAWMEMCHR
|
|
||||||
- jnz L(first_vec_x0_check)
|
|
||||||
- /* Adjust length and check the end of data. */
|
|
||||||
- subq $VEC_SIZE, %rdx
|
|
||||||
- jbe L(zero)
|
|
||||||
+ /* If length < CHAR_PER_VEC handle special. */
|
|
||||||
+ cmpq $CHAR_PER_VEC, %rdx
|
|
||||||
+ jbe L(first_vec_x0)
|
|
||||||
+# endif
|
|
||||||
+ testl %eax, %eax
|
|
||||||
+ jz L(aligned_more)
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+# ifdef USE_AS_WMEMCHR
|
|
||||||
+	/* NB: Multiply wchar_t count by CHAR_SIZE to get the byte offset.  */
|
|
||||||
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
# else
|
|
||||||
- jnz L(first_vec_x0)
|
|
||||||
+ addq %rdi, %rax
|
|
||||||
# endif
|
|
||||||
-
|
|
||||||
- /* Align data for aligned loads in the loop. */
|
|
||||||
- addq $VEC_SIZE, %rdi
|
|
||||||
- andl $(VEC_SIZE - 1), %ecx
|
|
||||||
- andq $-VEC_SIZE, %rdi
|
|
||||||
+ ret
|
|
||||||
|
|
||||||
# ifndef USE_AS_RAWMEMCHR
|
|
||||||
- /* Adjust length. */
|
|
||||||
- addq %rcx, %rdx
|
|
||||||
-
|
|
||||||
- subq $(VEC_SIZE * 4), %rdx
|
|
||||||
- jbe L(last_4x_vec_or_less)
|
|
||||||
-# endif
|
|
||||||
- jmp L(more_4x_vec)
|
|
||||||
+L(zero):
|
|
||||||
+ xorl %eax, %eax
|
|
||||||
+ ret
|
|
||||||
|
|
||||||
+ .p2align 5
|
|
||||||
+L(first_vec_x0):
|
|
||||||
+ /* Check if first match was before length. */
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+ xorl %ecx, %ecx
|
|
||||||
+ cmpl %eax, %edx
|
|
||||||
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
+ cmovle %rcx, %rax
|
|
||||||
+ ret
|
|
||||||
+# else
|
|
||||||
+ /* NB: first_vec_x0 is 17 bytes which will leave
|
|
||||||
+ cross_page_boundary (which is relatively cold) close enough
|
|
||||||
+ to ideal alignment. So only realign L(cross_page_boundary) if
|
|
||||||
+ rawmemchr. */
|
|
||||||
.p2align 4
|
|
||||||
-L(cros_page_boundary):
|
|
||||||
- andl $(VEC_SIZE - 1), %ecx
|
|
||||||
+# endif
|
|
||||||
+L(cross_page_boundary):
|
|
||||||
+ /* Save pointer before aligning as its original value is
|
|
||||||
+	   necessary for computing the return address if byte is found or
|
|
||||||
+ adjusting length if it is not and this is memchr. */
|
|
||||||
+ movq %rdi, %rcx
|
|
||||||
+ /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi
|
|
||||||
+ for rawmemchr. */
|
|
||||||
+ andq $-VEC_SIZE, %ALGN_PTR_REG
|
|
||||||
+ VPCMP $0, (%ALGN_PTR_REG), %YMMMATCH, %k0
|
|
||||||
+ kmovd %k0, %r8d
|
|
||||||
# ifdef USE_AS_WMEMCHR
|
|
||||||
- /* NB: Divide shift count by 4 since each bit in K1 represent 4
|
|
||||||
+	/* NB: Divide shift count by 4 since each bit in K0 represents 4
|
|
||||||
bytes. */
|
|
||||||
- movl %ecx, %SHIFT_REG
|
|
||||||
- sarl $2, %SHIFT_REG
|
|
||||||
+ sarl $2, %eax
|
|
||||||
+# endif
|
|
||||||
+# ifndef USE_AS_RAWMEMCHR
|
|
||||||
+ movl $(PAGE_SIZE / CHAR_SIZE), %esi
|
|
||||||
+ subl %eax, %esi
|
|
||||||
# endif
|
|
||||||
- andq $-VEC_SIZE, %rdi
|
|
||||||
- VPCMP $0, (%rdi), %YMMMATCH, %k1
|
|
||||||
- kmovd %k1, %eax
|
|
||||||
- /* Remove the leading bytes. */
|
|
||||||
- sarxl %SHIFT_REG, %eax, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
- jz L(aligned_more)
|
|
||||||
- tzcntl %eax, %eax
|
|
||||||
# ifdef USE_AS_WMEMCHR
|
|
||||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
||||||
- sall $2, %eax
|
|
||||||
+ andl $(CHAR_PER_VEC - 1), %eax
|
|
||||||
# endif
|
|
||||||
+ /* Remove the leading bytes. */
|
|
||||||
+ sarxl %eax, %r8d, %eax
|
|
||||||
# ifndef USE_AS_RAWMEMCHR
|
|
||||||
/* Check the end of data. */
|
|
||||||
- cmpq %rax, %rdx
|
|
||||||
- jbe L(zero)
|
|
||||||
+ cmpq %rsi, %rdx
|
|
||||||
+ jbe L(first_vec_x0)
|
|
||||||
+# endif
|
|
||||||
+ testl %eax, %eax
|
|
||||||
+ jz L(cross_page_continue)
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+# ifdef USE_AS_WMEMCHR
|
|
||||||
+	/* NB: Multiply wchar_t count by CHAR_SIZE to get the byte offset.  */
|
|
||||||
+ leaq (%RAW_PTR_REG, %rax, CHAR_SIZE), %rax
|
|
||||||
+# else
|
|
||||||
+ addq %RAW_PTR_REG, %rax
|
|
||||||
# endif
|
|
||||||
- addq %rdi, %rax
|
|
||||||
- addq %rcx, %rax
|
|
||||||
ret
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
-L(aligned_more):
|
|
||||||
-# ifndef USE_AS_RAWMEMCHR
|
|
||||||
- /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
|
|
||||||
- instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
|
|
||||||
- overflow. */
|
|
||||||
- negq %rcx
|
|
||||||
- addq $VEC_SIZE, %rcx
|
|
||||||
+L(first_vec_x1):
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+ leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
+ ret
|
|
||||||
|
|
||||||
- /* Check the end of data. */
|
|
||||||
- subq %rcx, %rdx
|
|
||||||
- jbe L(zero)
|
|
||||||
-# endif
|
|
||||||
+ .p2align 4
|
|
||||||
+L(first_vec_x2):
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
+ ret
|
|
||||||
|
|
||||||
- addq $VEC_SIZE, %rdi
|
|
||||||
+ .p2align 4
|
|
||||||
+L(first_vec_x3):
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
+ ret
|
|
||||||
|
|
||||||
-# ifndef USE_AS_RAWMEMCHR
|
|
||||||
- subq $(VEC_SIZE * 4), %rdx
|
|
||||||
- jbe L(last_4x_vec_or_less)
|
|
||||||
-# endif
|
|
||||||
+ .p2align 4
|
|
||||||
+L(first_vec_x4):
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+ leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
+ ret
|
|
||||||
|
|
||||||
-L(more_4x_vec):
|
|
||||||
+ .p2align 5
|
|
||||||
+L(aligned_more):
|
|
||||||
/* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
|
|
||||||
since data is only aligned to VEC_SIZE. */
|
|
||||||
- VPCMP $0, (%rdi), %YMMMATCH, %k1
|
|
||||||
- kmovd %k1, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x0)
|
|
||||||
|
|
||||||
- VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
|
|
||||||
- kmovd %k1, %eax
|
|
||||||
+# ifndef USE_AS_RAWMEMCHR
|
|
||||||
+ /* Align data to VEC_SIZE. */
|
|
||||||
+L(cross_page_continue):
|
|
||||||
+ xorl %ecx, %ecx
|
|
||||||
+ subl %edi, %ecx
|
|
||||||
+ andq $-VEC_SIZE, %rdi
|
|
||||||
+ /* esi is for adjusting length to see if near the end. */
|
|
||||||
+ leal (VEC_SIZE * 5)(%rdi, %rcx), %esi
|
|
||||||
+# ifdef USE_AS_WMEMCHR
|
|
||||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
|
||||||
+ sarl $2, %esi
|
|
||||||
+# endif
|
|
||||||
+# else
|
|
||||||
+ andq $-VEC_SIZE, %rdi
|
|
||||||
+L(cross_page_continue):
|
|
||||||
+# endif
|
|
||||||
+ /* Load first VEC regardless. */
|
|
||||||
+ VPCMP $0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
|
|
||||||
+ kmovd %k0, %eax
|
|
||||||
+# ifndef USE_AS_RAWMEMCHR
|
|
||||||
+ /* Adjust length. If near end handle specially. */
|
|
||||||
+ subq %rsi, %rdx
|
|
||||||
+ jbe L(last_4x_vec_or_less)
|
|
||||||
+# endif
|
|
||||||
testl %eax, %eax
|
|
||||||
jnz L(first_vec_x1)
|
|
||||||
|
|
||||||
- VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
|
|
||||||
- kmovd %k1, %eax
|
|
||||||
+ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
|
|
||||||
+ kmovd %k0, %eax
|
|
||||||
testl %eax, %eax
|
|
||||||
jnz L(first_vec_x2)
|
|
||||||
|
|
||||||
- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
|
|
||||||
- kmovd %k1, %eax
|
|
||||||
+ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
|
|
||||||
+ kmovd %k0, %eax
|
|
||||||
testl %eax, %eax
|
|
||||||
jnz L(first_vec_x3)
|
|
||||||
|
|
||||||
- addq $(VEC_SIZE * 4), %rdi
|
|
||||||
+ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
|
|
||||||
+ kmovd %k0, %eax
|
|
||||||
+ testl %eax, %eax
|
|
||||||
+ jnz L(first_vec_x4)
|
|
||||||
+
|
|
||||||
|
|
||||||
# ifndef USE_AS_RAWMEMCHR
|
|
||||||
- subq $(VEC_SIZE * 4), %rdx
|
|
||||||
- jbe L(last_4x_vec_or_less)
|
|
||||||
-# endif
|
|
||||||
+ /* Check if at last CHAR_PER_VEC * 4 length. */
|
|
||||||
+ subq $(CHAR_PER_VEC * 4), %rdx
|
|
||||||
+ jbe L(last_4x_vec_or_less_cmpeq)
|
|
||||||
+ addq $VEC_SIZE, %rdi
|
|
||||||
|
|
||||||
- /* Align data to 4 * VEC_SIZE. */
|
|
||||||
- movq %rdi, %rcx
|
|
||||||
- andl $(4 * VEC_SIZE - 1), %ecx
|
|
||||||
+ /* Align data to VEC_SIZE * 4 for the loop and readjust length.
|
|
||||||
+ */
|
|
||||||
+# ifdef USE_AS_WMEMCHR
|
|
||||||
+ movl %edi, %ecx
|
|
||||||
andq $-(4 * VEC_SIZE), %rdi
|
|
||||||
-
|
|
||||||
-# ifndef USE_AS_RAWMEMCHR
|
|
||||||
- /* Adjust length. */
|
|
||||||
+ andl $(VEC_SIZE * 4 - 1), %ecx
|
|
||||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
|
||||||
+ sarl $2, %ecx
|
|
||||||
addq %rcx, %rdx
|
|
||||||
+# else
|
|
||||||
+ addq %rdi, %rdx
|
|
||||||
+ andq $-(4 * VEC_SIZE), %rdi
|
|
||||||
+ subq %rdi, %rdx
|
|
||||||
+# endif
|
|
||||||
+# else
|
|
||||||
+ addq $VEC_SIZE, %rdi
|
|
||||||
+ andq $-(4 * VEC_SIZE), %rdi
|
|
||||||
# endif
|
|
||||||
|
|
||||||
+ vpxorq %XMMZERO, %XMMZERO, %XMMZERO
|
|
||||||
+
|
|
||||||
+ /* Compare 4 * VEC at a time forward. */
|
|
||||||
.p2align 4
|
|
||||||
L(loop_4x_vec):
|
|
||||||
- /* Compare 4 * VEC at a time forward. */
|
|
||||||
- VPCMP $0, (%rdi), %YMMMATCH, %k1
|
|
||||||
- VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k2
|
|
||||||
- kord %k1, %k2, %k5
|
|
||||||
- VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
|
|
||||||
- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
|
|
||||||
-
|
|
||||||
- kord %k3, %k4, %k6
|
|
||||||
- kortestd %k5, %k6
|
|
||||||
- jnz L(4x_vec_end)
|
|
||||||
-
|
|
||||||
- addq $(VEC_SIZE * 4), %rdi
|
|
||||||
-
|
|
||||||
+ /* It would be possible to save some instructions using 4x VPCMP
|
|
||||||
+	   but bottleneck on port 5 makes it not worth it.  */
|
|
||||||
+ VPCMP $4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
|
|
||||||
+	/* xor will set bytes matching esi to zero.  */
|
|
||||||
+ vpxorq (VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
|
|
||||||
+ vpxorq (VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
|
|
||||||
+ VPCMP $0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
|
|
||||||
+ /* Reduce VEC2 / VEC3 with min and VEC1 with zero mask. */
|
|
||||||
+ VPMINU %YMM2, %YMM3, %YMM3 {%k1} {z}
|
|
||||||
+ VPCMP $0, %YMM3, %YMMZERO, %k2
|
|
||||||
# ifdef USE_AS_RAWMEMCHR
|
|
||||||
- jmp L(loop_4x_vec)
|
|
||||||
+ subq $-(VEC_SIZE * 4), %rdi
|
|
||||||
+ kortestd %k2, %k3
|
|
||||||
+ jz L(loop_4x_vec)
|
|
||||||
# else
|
|
||||||
- subq $(VEC_SIZE * 4), %rdx
|
|
||||||
+ kortestd %k2, %k3
|
|
||||||
+ jnz L(loop_4x_vec_end)
|
|
||||||
+
|
|
||||||
+ subq $-(VEC_SIZE * 4), %rdi
|
|
||||||
+
|
|
||||||
+ subq $(CHAR_PER_VEC * 4), %rdx
|
|
||||||
ja L(loop_4x_vec)
|
|
||||||
|
|
||||||
+ /* Fall through into less than 4 remaining vectors of length case.
|
|
||||||
+ */
|
|
||||||
+ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
|
|
||||||
+ kmovd %k0, %eax
|
|
||||||
+ addq $(VEC_SIZE * 3), %rdi
|
|
||||||
+ .p2align 4
|
|
||||||
L(last_4x_vec_or_less):
|
|
||||||
- /* Less than 4 * VEC and aligned to VEC_SIZE. */
|
|
||||||
- addl $(VEC_SIZE * 2), %edx
|
|
||||||
- jle L(last_2x_vec)
|
|
||||||
-
|
|
||||||
- VPCMP $0, (%rdi), %YMMMATCH, %k1
|
|
||||||
- kmovd %k1, %eax
|
|
||||||
+ /* Check if first VEC contained match. */
|
|
||||||
testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x0)
|
|
||||||
+ jnz L(first_vec_x1_check)
|
|
||||||
|
|
||||||
- VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
|
|
||||||
- kmovd %k1, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x1)
|
|
||||||
+ /* If remaining length > CHAR_PER_VEC * 2. */
|
|
||||||
+ addl $(CHAR_PER_VEC * 2), %edx
|
|
||||||
+ jg L(last_4x_vec)
|
|
||||||
|
|
||||||
- VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
|
|
||||||
- kmovd %k1, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
+L(last_2x_vec):
|
|
||||||
+ /* If remaining length < CHAR_PER_VEC. */
|
|
||||||
+ addl $CHAR_PER_VEC, %edx
|
|
||||||
+ jle L(zero_end)
|
|
||||||
|
|
||||||
- jnz L(first_vec_x2_check)
|
|
||||||
- subl $VEC_SIZE, %edx
|
|
||||||
- jle L(zero)
|
|
||||||
+ /* Check VEC2 and compare any match with remaining length. */
|
|
||||||
+ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
|
|
||||||
+ kmovd %k0, %eax
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+ cmpl %eax, %edx
|
|
||||||
+ jbe L(set_zero_end)
|
|
||||||
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
+L(zero_end):
|
|
||||||
+ ret
|
|
||||||
|
|
||||||
- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
|
|
||||||
- kmovd %k1, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
|
|
||||||
- jnz L(first_vec_x3_check)
|
|
||||||
+ .p2align 4
|
|
||||||
+L(first_vec_x1_check):
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+ /* Adjust length. */
|
|
||||||
+ subl $-(CHAR_PER_VEC * 4), %edx
|
|
||||||
+ /* Check if match within remaining length. */
|
|
||||||
+ cmpl %eax, %edx
|
|
||||||
+ jbe L(set_zero_end)
|
|
||||||
+	/* NB: Multiply CHAR count by CHAR_SIZE to get the byte offset.  */
|
|
||||||
+ leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
+ ret
|
|
||||||
+L(set_zero_end):
|
|
||||||
xorl %eax, %eax
|
|
||||||
ret
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
-L(last_2x_vec):
|
|
||||||
- addl $(VEC_SIZE * 2), %edx
|
|
||||||
- VPCMP $0, (%rdi), %YMMMATCH, %k1
|
|
||||||
+L(loop_4x_vec_end):
|
|
||||||
+# endif
|
|
||||||
+ /* rawmemchr will fall through into this if match was found in
|
|
||||||
+ loop. */
|
|
||||||
+
|
|
||||||
+	/* k1 has the inverse (NOT) of the matches with VEC1.  */
|
|
||||||
kmovd %k1, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
+# ifdef USE_AS_WMEMCHR
|
|
||||||
+ subl $((1 << CHAR_PER_VEC) - 1), %eax
|
|
||||||
+# else
|
|
||||||
+ incl %eax
|
|
||||||
+# endif
|
|
||||||
+ jnz L(last_vec_x1_return)
|
|
||||||
|
|
||||||
- jnz L(first_vec_x0_check)
|
|
||||||
- subl $VEC_SIZE, %edx
|
|
||||||
- jle L(zero)
|
|
||||||
+ VPCMP $0, %YMM2, %YMMZERO, %k0
|
|
||||||
+ kmovd %k0, %eax
|
|
||||||
+ testl %eax, %eax
|
|
||||||
+ jnz L(last_vec_x2_return)
|
|
||||||
|
|
||||||
- VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
|
|
||||||
- kmovd %k1, %eax
|
|
||||||
+ kmovd %k2, %eax
|
|
||||||
testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x1_check)
|
|
||||||
- xorl %eax, %eax
|
|
||||||
- ret
|
|
||||||
+ jnz L(last_vec_x3_return)
|
|
||||||
|
|
||||||
- .p2align 4
|
|
||||||
-L(first_vec_x0_check):
|
|
||||||
+ kmovd %k3, %eax
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
-# ifdef USE_AS_WMEMCHR
|
|
||||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
||||||
- sall $2, %eax
|
|
||||||
+# ifdef USE_AS_RAWMEMCHR
|
|
||||||
+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
+# else
|
|
||||||
+ leaq (VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
# endif
|
|
||||||
- /* Check the end of data. */
|
|
||||||
- cmpq %rax, %rdx
|
|
||||||
- jbe L(zero)
|
|
||||||
- addq %rdi, %rax
|
|
||||||
ret
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
-L(first_vec_x1_check):
|
|
||||||
+L(last_vec_x1_return):
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
-# ifdef USE_AS_WMEMCHR
|
|
||||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
||||||
- sall $2, %eax
|
|
||||||
-# endif
|
|
||||||
- /* Check the end of data. */
|
|
||||||
- cmpq %rax, %rdx
|
|
||||||
- jbe L(zero)
|
|
||||||
- addq $VEC_SIZE, %rax
|
|
||||||
+# ifdef USE_AS_RAWMEMCHR
|
|
||||||
+# ifdef USE_AS_WMEMCHR
|
|
||||||
+	/* NB: Multiply wchar_t count by CHAR_SIZE to get the byte offset.  */
|
|
||||||
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
+# else
|
|
||||||
addq %rdi, %rax
|
|
||||||
- ret
|
|
||||||
-
|
|
||||||
- .p2align 4
|
|
||||||
-L(first_vec_x2_check):
|
|
||||||
- tzcntl %eax, %eax
|
|
||||||
-# ifdef USE_AS_WMEMCHR
|
|
||||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
||||||
- sall $2, %eax
|
|
||||||
+# endif
|
|
||||||
+# else
|
|
||||||
+	/* NB: Multiply CHAR count by CHAR_SIZE to get the byte offset.  */
|
|
||||||
+ leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
# endif
|
|
||||||
- /* Check the end of data. */
|
|
||||||
- cmpq %rax, %rdx
|
|
||||||
- jbe L(zero)
|
|
||||||
- addq $(VEC_SIZE * 2), %rax
|
|
||||||
- addq %rdi, %rax
|
|
||||||
ret
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
-L(first_vec_x3_check):
|
|
||||||
+L(last_vec_x2_return):
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
-# ifdef USE_AS_WMEMCHR
|
|
||||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
||||||
- sall $2, %eax
|
|
||||||
+# ifdef USE_AS_RAWMEMCHR
|
|
||||||
+	/* NB: Multiply CHAR count by CHAR_SIZE to get the byte offset.  */
|
|
||||||
+ leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
+# else
|
|
||||||
+	/* NB: Multiply CHAR count by CHAR_SIZE to get the byte offset.  */
|
|
||||||
+ leaq (VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
# endif
|
|
||||||
- /* Check the end of data. */
|
|
||||||
- cmpq %rax, %rdx
|
|
||||||
- jbe L(zero)
|
|
||||||
- addq $(VEC_SIZE * 3), %rax
|
|
||||||
- addq %rdi, %rax
|
|
||||||
ret
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
-L(zero):
|
|
||||||
- xorl %eax, %eax
|
|
||||||
- ret
|
|
||||||
-# endif
|
|
||||||
-
|
|
||||||
- .p2align 4
|
|
||||||
-L(first_vec_x0):
|
|
||||||
+L(last_vec_x3_return):
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
-# ifdef USE_AS_WMEMCHR
|
|
||||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
||||||
- leaq (%rdi, %rax, 4), %rax
|
|
||||||
+# ifdef USE_AS_RAWMEMCHR
|
|
||||||
+	/* NB: Multiply CHAR count by CHAR_SIZE to get the byte offset.  */
|
|
||||||
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
# else
|
|
||||||
- addq %rdi, %rax
|
|
||||||
+	/* NB: Multiply CHAR count by CHAR_SIZE to get the byte offset.  */
|
|
||||||
+ leaq (VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
# endif
|
|
||||||
ret
|
|
||||||
|
|
||||||
+
|
|
||||||
+# ifndef USE_AS_RAWMEMCHR
|
|
||||||
+L(last_4x_vec_or_less_cmpeq):
|
|
||||||
+ VPCMP $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
|
|
||||||
+ kmovd %k0, %eax
|
|
||||||
+ subq $-(VEC_SIZE * 4), %rdi
|
|
||||||
+ /* Check first VEC regardless. */
|
|
||||||
+ testl %eax, %eax
|
|
||||||
+ jnz L(first_vec_x1_check)
|
|
||||||
+
|
|
||||||
+ /* If remaining length <= CHAR_PER_VEC * 2. */
|
|
||||||
+ addl $(CHAR_PER_VEC * 2), %edx
|
|
||||||
+ jle L(last_2x_vec)
|
|
||||||
+
|
|
||||||
.p2align 4
|
|
||||||
-L(first_vec_x1):
|
|
||||||
+L(last_4x_vec):
|
|
||||||
+ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
|
|
||||||
+ kmovd %k0, %eax
|
|
||||||
+ testl %eax, %eax
|
|
||||||
+ jnz L(last_vec_x2)
|
|
||||||
+
|
|
||||||
+
|
|
||||||
+ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
|
|
||||||
+ kmovd %k0, %eax
|
|
||||||
+ /* Create mask for possible matches within remaining length. */
|
|
||||||
+# ifdef USE_AS_WMEMCHR
|
|
||||||
+ movl $((1 << (CHAR_PER_VEC * 2)) - 1), %ecx
|
|
||||||
+ bzhil %edx, %ecx, %ecx
|
|
||||||
+# else
|
|
||||||
+ movq $-1, %rcx
|
|
||||||
+ bzhiq %rdx, %rcx, %rcx
|
|
||||||
+# endif
|
|
||||||
+ /* Test matches in data against length match. */
|
|
||||||
+ andl %ecx, %eax
|
|
||||||
+ jnz L(last_vec_x3)
|
|
||||||
+
|
|
||||||
+ /* if remaining length <= CHAR_PER_VEC * 3 (Note this is after
|
|
||||||
+	   remaining length was found to be > CHAR_PER_VEC * 2).  */
|
|
||||||
+ subl $CHAR_PER_VEC, %edx
|
|
||||||
+ jbe L(zero_end2)
|
|
||||||
+
|
|
||||||
+
|
|
||||||
+ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
|
|
||||||
+ kmovd %k0, %eax
|
|
||||||
+ /* Shift remaining length mask for last VEC. */
|
|
||||||
+# ifdef USE_AS_WMEMCHR
|
|
||||||
+ shrl $CHAR_PER_VEC, %ecx
|
|
||||||
+# else
|
|
||||||
+ shrq $CHAR_PER_VEC, %rcx
|
|
||||||
+# endif
|
|
||||||
+ andl %ecx, %eax
|
|
||||||
+ jz L(zero_end2)
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
-# ifdef USE_AS_WMEMCHR
|
|
||||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
||||||
- leaq VEC_SIZE(%rdi, %rax, 4), %rax
|
|
||||||
-# else
|
|
||||||
- addq $VEC_SIZE, %rax
|
|
||||||
- addq %rdi, %rax
|
|
||||||
-# endif
|
|
||||||
+ leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
+L(zero_end2):
|
|
||||||
ret
|
|
||||||
|
|
||||||
- .p2align 4
|
|
||||||
-L(first_vec_x2):
|
|
||||||
+L(last_vec_x2):
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
-# ifdef USE_AS_WMEMCHR
|
|
||||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
||||||
- leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax
|
|
||||||
-# else
|
|
||||||
- addq $(VEC_SIZE * 2), %rax
|
|
||||||
- addq %rdi, %rax
|
|
||||||
-# endif
|
|
||||||
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
ret
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
-L(4x_vec_end):
|
|
||||||
- kmovd %k1, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x0)
|
|
||||||
- kmovd %k2, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x1)
|
|
||||||
- kmovd %k3, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x2)
|
|
||||||
- kmovd %k4, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
-L(first_vec_x3):
|
|
||||||
+L(last_vec_x3):
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
-# ifdef USE_AS_WMEMCHR
|
|
||||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
||||||
- leaq (VEC_SIZE * 3)(%rdi, %rax, 4), %rax
|
|
||||||
-# else
|
|
||||||
- addq $(VEC_SIZE * 3), %rax
|
|
||||||
- addq %rdi, %rax
|
|
||||||
-# endif
|
|
||||||
+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
ret
|
|
||||||
+# endif
|
|
||||||
|
|
||||||
END (MEMCHR)
|
|
||||||
#endif
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,30 +0,0 @@
|
|||||||
From 6ea916adfa0ab9af6e7dc6adcf6f977dfe017835 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Alice Xu <alice.d.xu@gmail.com>
|
|
||||||
Date: Fri, 7 May 2021 19:03:21 -0700
|
|
||||||
Subject: [PATCH] x86-64: Fix an unknown vector operation in memchr-evex.S
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
An unknown vector operation occurred in commit 2a76821c308. Fixed it
|
|
||||||
by using "ymm{k1}{z}" but not "ymm {k1} {z}".
|
|
||||||
|
|
||||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/multiarch/memchr-evex.S | 2 +-
|
|
||||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
|
|
||||||
index 81d5cd64..f3fdad4f 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/memchr-evex.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
|
|
||||||
@@ -271,7 +271,7 @@ L(loop_4x_vec):
|
|
||||||
vpxorq (VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
|
|
||||||
VPCMP $0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
|
|
||||||
/* Reduce VEC2 / VEC3 with min and VEC1 with zero mask. */
|
|
||||||
- VPMINU %YMM2, %YMM3, %YMM3 {%k1} {z}
|
|
||||||
+ VPMINU %YMM2, %YMM3, %YMM3{%k1}{z}
|
|
||||||
VPCMP $0, %YMM3, %YMMZERO, %k2
|
|
||||||
# ifdef USE_AS_RAWMEMCHR
|
|
||||||
subq $-(VEC_SIZE * 4), %rdi
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,566 +0,0 @@
|
|||||||
From a0db678071c60b6c47c468d231dd0b3694ba7a98 Mon Sep 17 00:00:00 2001
|
|
||||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
|
||||||
Date: Tue, 22 Jun 2021 20:42:10 -0700
|
|
||||||
Subject: [PATCH] x86-64: Move strlen.S to multiarch/strlen-vec.S
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
Since strlen.S contains SSE2 version of strlen/strnlen and SSE4.1
|
|
||||||
version of wcslen/wcsnlen, move strlen.S to multiarch/strlen-vec.S
|
|
||||||
and include multiarch/strlen-vec.S from SSE2 and SSE4.1 variants.
|
|
||||||
This also removes the unused symbols, __GI___strlen_sse2 and
|
|
||||||
__GI___wcsnlen_sse4_1.
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/multiarch/strlen-sse2.S | 2 +-
|
|
||||||
sysdeps/x86_64/multiarch/strlen-vec.S | 257 ++++++++++++++++++++++
|
|
||||||
sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S | 2 +-
|
|
||||||
sysdeps/x86_64/strlen.S | 243 +-------------------
|
|
||||||
4 files changed, 262 insertions(+), 242 deletions(-)
|
|
||||||
create mode 100644 sysdeps/x86_64/multiarch/strlen-vec.S
|
|
||||||
|
|
||||||
Conflicts:
|
|
||||||
sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
|
|
||||||
(Copyright dates, URL)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/strlen-sse2.S b/sysdeps/x86_64/multiarch/strlen-sse2.S
|
|
||||||
index 7bc57b8d..449c8a7f 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/strlen-sse2.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/strlen-sse2.S
|
|
||||||
@@ -20,4 +20,4 @@
|
|
||||||
# define strlen __strlen_sse2
|
|
||||||
#endif
|
|
||||||
|
|
||||||
-#include "../strlen.S"
|
|
||||||
+#include "strlen-vec.S"
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..8f660bb9
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/strlen-vec.S
|
|
||||||
@@ -0,0 +1,257 @@
|
|
||||||
+/* SSE2 version of strlen and SSE4.1 version of wcslen.
|
|
||||||
+ Copyright (C) 2012-2021 Free Software Foundation, Inc.
|
|
||||||
+ This file is part of the GNU C Library.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is free software; you can redistribute it and/or
|
|
||||||
+ modify it under the terms of the GNU Lesser General Public
|
|
||||||
+ License as published by the Free Software Foundation; either
|
|
||||||
+ version 2.1 of the License, or (at your option) any later version.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
||||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
+ Lesser General Public License for more details.
|
|
||||||
+
|
|
||||||
+ You should have received a copy of the GNU Lesser General Public
|
|
||||||
+ License along with the GNU C Library; if not, see
|
|
||||||
+ <https://www.gnu.org/licenses/>. */
|
|
||||||
+
|
|
||||||
+#include <sysdep.h>
|
|
||||||
+
|
|
||||||
+#ifdef AS_WCSLEN
|
|
||||||
+# define PMINU pminud
|
|
||||||
+# define PCMPEQ pcmpeqd
|
|
||||||
+# define SHIFT_RETURN shrq $2, %rax
|
|
||||||
+#else
|
|
||||||
+# define PMINU pminub
|
|
||||||
+# define PCMPEQ pcmpeqb
|
|
||||||
+# define SHIFT_RETURN
|
|
||||||
+#endif
|
|
||||||
+
|
|
||||||
+/* Long lived register in strlen(s), strnlen(s, n) are:
|
|
||||||
+
|
|
||||||
+ %xmm3 - zero
|
|
||||||
+ %rdi - s
|
|
||||||
+ %r10 (s+n) & (~(64-1))
|
|
||||||
+ %r11 s+n
|
|
||||||
+*/
|
|
||||||
+
|
|
||||||
+
|
|
||||||
+.text
|
|
||||||
+ENTRY(strlen)
|
|
||||||
+
|
|
||||||
+/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */
|
|
||||||
+#define FIND_ZERO \
|
|
||||||
+ PCMPEQ (%rax), %xmm0; \
|
|
||||||
+ PCMPEQ 16(%rax), %xmm1; \
|
|
||||||
+ PCMPEQ 32(%rax), %xmm2; \
|
|
||||||
+ PCMPEQ 48(%rax), %xmm3; \
|
|
||||||
+ pmovmskb %xmm0, %esi; \
|
|
||||||
+ pmovmskb %xmm1, %edx; \
|
|
||||||
+ pmovmskb %xmm2, %r8d; \
|
|
||||||
+ pmovmskb %xmm3, %ecx; \
|
|
||||||
+ salq $16, %rdx; \
|
|
||||||
+ salq $16, %rcx; \
|
|
||||||
+ orq %rsi, %rdx; \
|
|
||||||
+ orq %r8, %rcx; \
|
|
||||||
+ salq $32, %rcx; \
|
|
||||||
+ orq %rcx, %rdx;
|
|
||||||
+
|
|
||||||
+#ifdef AS_STRNLEN
|
|
||||||
+/* Do not read anything when n==0. */
|
|
||||||
+ test %RSI_LP, %RSI_LP
|
|
||||||
+ jne L(n_nonzero)
|
|
||||||
+ xor %rax, %rax
|
|
||||||
+ ret
|
|
||||||
+L(n_nonzero):
|
|
||||||
+# ifdef AS_WCSLEN
|
|
||||||
+ shl $2, %RSI_LP
|
|
||||||
+# endif
|
|
||||||
+
|
|
||||||
+/* Initialize long lived registers. */
|
|
||||||
+
|
|
||||||
+ add %RDI_LP, %RSI_LP
|
|
||||||
+ mov %RSI_LP, %R10_LP
|
|
||||||
+ and $-64, %R10_LP
|
|
||||||
+ mov %RSI_LP, %R11_LP
|
|
||||||
+#endif
|
|
||||||
+
|
|
||||||
+ pxor %xmm0, %xmm0
|
|
||||||
+ pxor %xmm1, %xmm1
|
|
||||||
+ pxor %xmm2, %xmm2
|
|
||||||
+ pxor %xmm3, %xmm3
|
|
||||||
+ movq %rdi, %rax
|
|
||||||
+ movq %rdi, %rcx
|
|
||||||
+ andq $4095, %rcx
|
|
||||||
+/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */
|
|
||||||
+ cmpq $4047, %rcx
|
|
||||||
+/* We cannot unify this branching as it would be ~6 cycles slower. */
|
|
||||||
+ ja L(cross_page)
|
|
||||||
+
|
|
||||||
+#ifdef AS_STRNLEN
|
|
||||||
+/* Test if end is among first 64 bytes. */
|
|
||||||
+# define STRNLEN_PROLOG \
|
|
||||||
+ mov %r11, %rsi; \
|
|
||||||
+ subq %rax, %rsi; \
|
|
||||||
+ andq $-64, %rax; \
|
|
||||||
+ testq $-64, %rsi; \
|
|
||||||
+ je L(strnlen_ret)
|
|
||||||
+#else
|
|
||||||
+# define STRNLEN_PROLOG andq $-64, %rax;
|
|
||||||
+#endif
|
|
||||||
+
|
|
||||||
+/* Ignore bits in mask that come before start of string. */
|
|
||||||
+#define PROLOG(lab) \
|
|
||||||
+ movq %rdi, %rcx; \
|
|
||||||
+ xorq %rax, %rcx; \
|
|
||||||
+ STRNLEN_PROLOG; \
|
|
||||||
+ sarq %cl, %rdx; \
|
|
||||||
+ test %rdx, %rdx; \
|
|
||||||
+ je L(lab); \
|
|
||||||
+ bsfq %rdx, %rax; \
|
|
||||||
+ SHIFT_RETURN; \
|
|
||||||
+ ret
|
|
||||||
+
|
|
||||||
+#ifdef AS_STRNLEN
|
|
||||||
+ andq $-16, %rax
|
|
||||||
+ FIND_ZERO
|
|
||||||
+#else
|
|
||||||
+ /* Test first 16 bytes unaligned. */
|
|
||||||
+ movdqu (%rax), %xmm4
|
|
||||||
+ PCMPEQ %xmm0, %xmm4
|
|
||||||
+ pmovmskb %xmm4, %edx
|
|
||||||
+ test %edx, %edx
|
|
||||||
+ je L(next48_bytes)
|
|
||||||
+ bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */
|
|
||||||
+ SHIFT_RETURN
|
|
||||||
+ ret
|
|
||||||
+
|
|
||||||
+L(next48_bytes):
|
|
||||||
+/* Same as FIND_ZERO except we do not check first 16 bytes. */
|
|
||||||
+ andq $-16, %rax
|
|
||||||
+ PCMPEQ 16(%rax), %xmm1
|
|
||||||
+ PCMPEQ 32(%rax), %xmm2
|
|
||||||
+ PCMPEQ 48(%rax), %xmm3
|
|
||||||
+ pmovmskb %xmm1, %edx
|
|
||||||
+ pmovmskb %xmm2, %r8d
|
|
||||||
+ pmovmskb %xmm3, %ecx
|
|
||||||
+ salq $16, %rdx
|
|
||||||
+ salq $16, %rcx
|
|
||||||
+ orq %r8, %rcx
|
|
||||||
+ salq $32, %rcx
|
|
||||||
+ orq %rcx, %rdx
|
|
||||||
+#endif
|
|
||||||
+
|
|
||||||
+ /* When no zero byte is found xmm1-3 are zero so we do not have to
|
|
||||||
+ zero them. */
|
|
||||||
+ PROLOG(loop)
|
|
||||||
+
|
|
||||||
+ .p2align 4
|
|
||||||
+L(cross_page):
|
|
||||||
+ andq $-64, %rax
|
|
||||||
+ FIND_ZERO
|
|
||||||
+ PROLOG(loop_init)
|
|
||||||
+
|
|
||||||
+#ifdef AS_STRNLEN
|
|
||||||
+/* We must do this check to correctly handle strnlen (s, -1). */
|
|
||||||
+L(strnlen_ret):
|
|
||||||
+ bts %rsi, %rdx
|
|
||||||
+ sarq %cl, %rdx
|
|
||||||
+ test %rdx, %rdx
|
|
||||||
+ je L(loop_init)
|
|
||||||
+ bsfq %rdx, %rax
|
|
||||||
+ SHIFT_RETURN
|
|
||||||
+ ret
|
|
||||||
+#endif
|
|
||||||
+ .p2align 4
|
|
||||||
+L(loop_init):
|
|
||||||
+ pxor %xmm1, %xmm1
|
|
||||||
+ pxor %xmm2, %xmm2
|
|
||||||
+ pxor %xmm3, %xmm3
|
|
||||||
+#ifdef AS_STRNLEN
|
|
||||||
+ .p2align 4
|
|
||||||
+L(loop):
|
|
||||||
+
|
|
||||||
+ addq $64, %rax
|
|
||||||
+ cmpq %rax, %r10
|
|
||||||
+ je L(exit_end)
|
|
||||||
+
|
|
||||||
+ movdqa (%rax), %xmm0
|
|
||||||
+ PMINU 16(%rax), %xmm0
|
|
||||||
+ PMINU 32(%rax), %xmm0
|
|
||||||
+ PMINU 48(%rax), %xmm0
|
|
||||||
+ PCMPEQ %xmm3, %xmm0
|
|
||||||
+ pmovmskb %xmm0, %edx
|
|
||||||
+ testl %edx, %edx
|
|
||||||
+ jne L(exit)
|
|
||||||
+ jmp L(loop)
|
|
||||||
+
|
|
||||||
+ .p2align 4
|
|
||||||
+L(exit_end):
|
|
||||||
+ cmp %rax, %r11
|
|
||||||
+ je L(first) /* Do not read when end is at page boundary. */
|
|
||||||
+ pxor %xmm0, %xmm0
|
|
||||||
+ FIND_ZERO
|
|
||||||
+
|
|
||||||
+L(first):
|
|
||||||
+ bts %r11, %rdx
|
|
||||||
+ bsfq %rdx, %rdx
|
|
||||||
+ addq %rdx, %rax
|
|
||||||
+ subq %rdi, %rax
|
|
||||||
+ SHIFT_RETURN
|
|
||||||
+ ret
|
|
||||||
+
|
|
||||||
+ .p2align 4
|
|
||||||
+L(exit):
|
|
||||||
+ pxor %xmm0, %xmm0
|
|
||||||
+ FIND_ZERO
|
|
||||||
+
|
|
||||||
+ bsfq %rdx, %rdx
|
|
||||||
+ addq %rdx, %rax
|
|
||||||
+ subq %rdi, %rax
|
|
||||||
+ SHIFT_RETURN
|
|
||||||
+ ret
|
|
||||||
+
|
|
||||||
+#else
|
|
||||||
+
|
|
||||||
+ /* Main loop. Unrolled twice to improve L2 cache performance on core2. */
|
|
||||||
+ .p2align 4
|
|
||||||
+L(loop):
|
|
||||||
+
|
|
||||||
+ movdqa 64(%rax), %xmm0
|
|
||||||
+ PMINU 80(%rax), %xmm0
|
|
||||||
+ PMINU 96(%rax), %xmm0
|
|
||||||
+ PMINU 112(%rax), %xmm0
|
|
||||||
+ PCMPEQ %xmm3, %xmm0
|
|
||||||
+ pmovmskb %xmm0, %edx
|
|
||||||
+ testl %edx, %edx
|
|
||||||
+ jne L(exit64)
|
|
||||||
+
|
|
||||||
+ subq $-128, %rax
|
|
||||||
+
|
|
||||||
+ movdqa (%rax), %xmm0
|
|
||||||
+ PMINU 16(%rax), %xmm0
|
|
||||||
+ PMINU 32(%rax), %xmm0
|
|
||||||
+ PMINU 48(%rax), %xmm0
|
|
||||||
+ PCMPEQ %xmm3, %xmm0
|
|
||||||
+ pmovmskb %xmm0, %edx
|
|
||||||
+ testl %edx, %edx
|
|
||||||
+ jne L(exit0)
|
|
||||||
+ jmp L(loop)
|
|
||||||
+
|
|
||||||
+ .p2align 4
|
|
||||||
+L(exit64):
|
|
||||||
+ addq $64, %rax
|
|
||||||
+L(exit0):
|
|
||||||
+ pxor %xmm0, %xmm0
|
|
||||||
+ FIND_ZERO
|
|
||||||
+
|
|
||||||
+ bsfq %rdx, %rdx
|
|
||||||
+ addq %rdx, %rax
|
|
||||||
+ subq %rdi, %rax
|
|
||||||
+ SHIFT_RETURN
|
|
||||||
+ ret
|
|
||||||
+
|
|
||||||
+#endif
|
|
||||||
+
|
|
||||||
+END(strlen)
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
|
|
||||||
index a8cab0cb..5fa51fe0 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
|
|
||||||
@@ -2,4 +2,4 @@
|
|
||||||
#define AS_STRNLEN
|
|
||||||
#define strlen __wcsnlen_sse4_1
|
|
||||||
|
|
||||||
-#include "../strlen.S"
|
|
||||||
+#include "strlen-vec.S"
|
|
||||||
diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
|
|
||||||
index f845f3d4..ad047d84 100644
|
|
||||||
--- a/sysdeps/x86_64/strlen.S
|
|
||||||
+++ b/sysdeps/x86_64/strlen.S
|
|
||||||
@@ -1,5 +1,5 @@
|
|
||||||
-/* SSE2 version of strlen/wcslen.
|
|
||||||
- Copyright (C) 2012-2018 Free Software Foundation, Inc.
|
|
||||||
+/* SSE2 version of strlen.
|
|
||||||
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
|
||||||
This file is part of the GNU C Library.
|
|
||||||
|
|
||||||
The GNU C Library is free software; you can redistribute it and/or
|
|
||||||
@@ -16,243 +16,6 @@
|
|
||||||
License along with the GNU C Library; if not, see
|
|
||||||
<http://www.gnu.org/licenses/>. */
|
|
||||||
|
|
||||||
-#include <sysdep.h>
|
|
||||||
+#include "multiarch/strlen-vec.S"
|
|
||||||
|
|
||||||
-#ifdef AS_WCSLEN
|
|
||||||
-# define PMINU pminud
|
|
||||||
-# define PCMPEQ pcmpeqd
|
|
||||||
-# define SHIFT_RETURN shrq $2, %rax
|
|
||||||
-#else
|
|
||||||
-# define PMINU pminub
|
|
||||||
-# define PCMPEQ pcmpeqb
|
|
||||||
-# define SHIFT_RETURN
|
|
||||||
-#endif
|
|
||||||
-
|
|
||||||
-/* Long lived register in strlen(s), strnlen(s, n) are:
|
|
||||||
-
|
|
||||||
- %xmm3 - zero
|
|
||||||
- %rdi - s
|
|
||||||
- %r10 (s+n) & (~(64-1))
|
|
||||||
- %r11 s+n
|
|
||||||
-*/
|
|
||||||
-
|
|
||||||
-
|
|
||||||
-.text
|
|
||||||
-ENTRY(strlen)
|
|
||||||
-
|
|
||||||
-/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */
|
|
||||||
-#define FIND_ZERO \
|
|
||||||
- PCMPEQ (%rax), %xmm0; \
|
|
||||||
- PCMPEQ 16(%rax), %xmm1; \
|
|
||||||
- PCMPEQ 32(%rax), %xmm2; \
|
|
||||||
- PCMPEQ 48(%rax), %xmm3; \
|
|
||||||
- pmovmskb %xmm0, %esi; \
|
|
||||||
- pmovmskb %xmm1, %edx; \
|
|
||||||
- pmovmskb %xmm2, %r8d; \
|
|
||||||
- pmovmskb %xmm3, %ecx; \
|
|
||||||
- salq $16, %rdx; \
|
|
||||||
- salq $16, %rcx; \
|
|
||||||
- orq %rsi, %rdx; \
|
|
||||||
- orq %r8, %rcx; \
|
|
||||||
- salq $32, %rcx; \
|
|
||||||
- orq %rcx, %rdx;
|
|
||||||
-
|
|
||||||
-#ifdef AS_STRNLEN
|
|
||||||
-/* Do not read anything when n==0. */
|
|
||||||
- test %RSI_LP, %RSI_LP
|
|
||||||
- jne L(n_nonzero)
|
|
||||||
- xor %rax, %rax
|
|
||||||
- ret
|
|
||||||
-L(n_nonzero):
|
|
||||||
-# ifdef AS_WCSLEN
|
|
||||||
- shl $2, %RSI_LP
|
|
||||||
-# endif
|
|
||||||
-
|
|
||||||
-/* Initialize long lived registers. */
|
|
||||||
-
|
|
||||||
- add %RDI_LP, %RSI_LP
|
|
||||||
- mov %RSI_LP, %R10_LP
|
|
||||||
- and $-64, %R10_LP
|
|
||||||
- mov %RSI_LP, %R11_LP
|
|
||||||
-#endif
|
|
||||||
-
|
|
||||||
- pxor %xmm0, %xmm0
|
|
||||||
- pxor %xmm1, %xmm1
|
|
||||||
- pxor %xmm2, %xmm2
|
|
||||||
- pxor %xmm3, %xmm3
|
|
||||||
- movq %rdi, %rax
|
|
||||||
- movq %rdi, %rcx
|
|
||||||
- andq $4095, %rcx
|
|
||||||
-/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */
|
|
||||||
- cmpq $4047, %rcx
|
|
||||||
-/* We cannot unify this branching as it would be ~6 cycles slower. */
|
|
||||||
- ja L(cross_page)
|
|
||||||
-
|
|
||||||
-#ifdef AS_STRNLEN
|
|
||||||
-/* Test if end is among first 64 bytes. */
|
|
||||||
-# define STRNLEN_PROLOG \
|
|
||||||
- mov %r11, %rsi; \
|
|
||||||
- subq %rax, %rsi; \
|
|
||||||
- andq $-64, %rax; \
|
|
||||||
- testq $-64, %rsi; \
|
|
||||||
- je L(strnlen_ret)
|
|
||||||
-#else
|
|
||||||
-# define STRNLEN_PROLOG andq $-64, %rax;
|
|
||||||
-#endif
|
|
||||||
-
|
|
||||||
-/* Ignore bits in mask that come before start of string. */
|
|
||||||
-#define PROLOG(lab) \
|
|
||||||
- movq %rdi, %rcx; \
|
|
||||||
- xorq %rax, %rcx; \
|
|
||||||
- STRNLEN_PROLOG; \
|
|
||||||
- sarq %cl, %rdx; \
|
|
||||||
- test %rdx, %rdx; \
|
|
||||||
- je L(lab); \
|
|
||||||
- bsfq %rdx, %rax; \
|
|
||||||
- SHIFT_RETURN; \
|
|
||||||
- ret
|
|
||||||
-
|
|
||||||
-#ifdef AS_STRNLEN
|
|
||||||
- andq $-16, %rax
|
|
||||||
- FIND_ZERO
|
|
||||||
-#else
|
|
||||||
- /* Test first 16 bytes unaligned. */
|
|
||||||
- movdqu (%rax), %xmm4
|
|
||||||
- PCMPEQ %xmm0, %xmm4
|
|
||||||
- pmovmskb %xmm4, %edx
|
|
||||||
- test %edx, %edx
|
|
||||||
- je L(next48_bytes)
|
|
||||||
- bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */
|
|
||||||
- SHIFT_RETURN
|
|
||||||
- ret
|
|
||||||
-
|
|
||||||
-L(next48_bytes):
|
|
||||||
-/* Same as FIND_ZERO except we do not check first 16 bytes. */
|
|
||||||
- andq $-16, %rax
|
|
||||||
- PCMPEQ 16(%rax), %xmm1
|
|
||||||
- PCMPEQ 32(%rax), %xmm2
|
|
||||||
- PCMPEQ 48(%rax), %xmm3
|
|
||||||
- pmovmskb %xmm1, %edx
|
|
||||||
- pmovmskb %xmm2, %r8d
|
|
||||||
- pmovmskb %xmm3, %ecx
|
|
||||||
- salq $16, %rdx
|
|
||||||
- salq $16, %rcx
|
|
||||||
- orq %r8, %rcx
|
|
||||||
- salq $32, %rcx
|
|
||||||
- orq %rcx, %rdx
|
|
||||||
-#endif
|
|
||||||
-
|
|
||||||
- /* When no zero byte is found xmm1-3 are zero so we do not have to
|
|
||||||
- zero them. */
|
|
||||||
- PROLOG(loop)
|
|
||||||
-
|
|
||||||
- .p2align 4
|
|
||||||
-L(cross_page):
|
|
||||||
- andq $-64, %rax
|
|
||||||
- FIND_ZERO
|
|
||||||
- PROLOG(loop_init)
|
|
||||||
-
|
|
||||||
-#ifdef AS_STRNLEN
|
|
||||||
-/* We must do this check to correctly handle strnlen (s, -1). */
|
|
||||||
-L(strnlen_ret):
|
|
||||||
- bts %rsi, %rdx
|
|
||||||
- sarq %cl, %rdx
|
|
||||||
- test %rdx, %rdx
|
|
||||||
- je L(loop_init)
|
|
||||||
- bsfq %rdx, %rax
|
|
||||||
- SHIFT_RETURN
|
|
||||||
- ret
|
|
||||||
-#endif
|
|
||||||
- .p2align 4
|
|
||||||
-L(loop_init):
|
|
||||||
- pxor %xmm1, %xmm1
|
|
||||||
- pxor %xmm2, %xmm2
|
|
||||||
- pxor %xmm3, %xmm3
|
|
||||||
-#ifdef AS_STRNLEN
|
|
||||||
- .p2align 4
|
|
||||||
-L(loop):
|
|
||||||
-
|
|
||||||
- addq $64, %rax
|
|
||||||
- cmpq %rax, %r10
|
|
||||||
- je L(exit_end)
|
|
||||||
-
|
|
||||||
- movdqa (%rax), %xmm0
|
|
||||||
- PMINU 16(%rax), %xmm0
|
|
||||||
- PMINU 32(%rax), %xmm0
|
|
||||||
- PMINU 48(%rax), %xmm0
|
|
||||||
- PCMPEQ %xmm3, %xmm0
|
|
||||||
- pmovmskb %xmm0, %edx
|
|
||||||
- testl %edx, %edx
|
|
||||||
- jne L(exit)
|
|
||||||
- jmp L(loop)
|
|
||||||
-
|
|
||||||
- .p2align 4
|
|
||||||
-L(exit_end):
|
|
||||||
- cmp %rax, %r11
|
|
||||||
- je L(first) /* Do not read when end is at page boundary. */
|
|
||||||
- pxor %xmm0, %xmm0
|
|
||||||
- FIND_ZERO
|
|
||||||
-
|
|
||||||
-L(first):
|
|
||||||
- bts %r11, %rdx
|
|
||||||
- bsfq %rdx, %rdx
|
|
||||||
- addq %rdx, %rax
|
|
||||||
- subq %rdi, %rax
|
|
||||||
- SHIFT_RETURN
|
|
||||||
- ret
|
|
||||||
-
|
|
||||||
- .p2align 4
|
|
||||||
-L(exit):
|
|
||||||
- pxor %xmm0, %xmm0
|
|
||||||
- FIND_ZERO
|
|
||||||
-
|
|
||||||
- bsfq %rdx, %rdx
|
|
||||||
- addq %rdx, %rax
|
|
||||||
- subq %rdi, %rax
|
|
||||||
- SHIFT_RETURN
|
|
||||||
- ret
|
|
||||||
-
|
|
||||||
-#else
|
|
||||||
-
|
|
||||||
- /* Main loop. Unrolled twice to improve L2 cache performance on core2. */
|
|
||||||
- .p2align 4
|
|
||||||
-L(loop):
|
|
||||||
-
|
|
||||||
- movdqa 64(%rax), %xmm0
|
|
||||||
- PMINU 80(%rax), %xmm0
|
|
||||||
- PMINU 96(%rax), %xmm0
|
|
||||||
- PMINU 112(%rax), %xmm0
|
|
||||||
- PCMPEQ %xmm3, %xmm0
|
|
||||||
- pmovmskb %xmm0, %edx
|
|
||||||
- testl %edx, %edx
|
|
||||||
- jne L(exit64)
|
|
||||||
-
|
|
||||||
- subq $-128, %rax
|
|
||||||
-
|
|
||||||
- movdqa (%rax), %xmm0
|
|
||||||
- PMINU 16(%rax), %xmm0
|
|
||||||
- PMINU 32(%rax), %xmm0
|
|
||||||
- PMINU 48(%rax), %xmm0
|
|
||||||
- PCMPEQ %xmm3, %xmm0
|
|
||||||
- pmovmskb %xmm0, %edx
|
|
||||||
- testl %edx, %edx
|
|
||||||
- jne L(exit0)
|
|
||||||
- jmp L(loop)
|
|
||||||
-
|
|
||||||
- .p2align 4
|
|
||||||
-L(exit64):
|
|
||||||
- addq $64, %rax
|
|
||||||
-L(exit0):
|
|
||||||
- pxor %xmm0, %xmm0
|
|
||||||
- FIND_ZERO
|
|
||||||
-
|
|
||||||
- bsfq %rdx, %rdx
|
|
||||||
- addq %rdx, %rax
|
|
||||||
- subq %rdi, %rax
|
|
||||||
- SHIFT_RETURN
|
|
||||||
- ret
|
|
||||||
-
|
|
||||||
-#endif
|
|
||||||
-
|
|
||||||
-END(strlen)
|
|
||||||
libc_hidden_builtin_def (strlen)
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,181 +0,0 @@
|
|||||||
From 6f573a27b6c8b4236445810a44660612323f5a73 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
Date: Wed, 23 Jun 2021 01:19:34 -0400
|
|
||||||
Subject: [PATCH] x86-64: Add wcslen optimize for sse4.1
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
No bug. This commit adds the ifunc / build infrastructure
|
|
||||||
necessary for wcslen to prefer the sse4.1 implementation
|
|
||||||
in strlen-vec.S. test-wcslen.c is passing.
|
|
||||||
|
|
||||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/multiarch/Makefile | 4 +-
|
|
||||||
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 3 ++
|
|
||||||
sysdeps/x86_64/multiarch/ifunc-wcslen.h | 52 ++++++++++++++++++++++
|
|
||||||
sysdeps/x86_64/multiarch/wcslen-sse4_1.S | 4 ++
|
|
||||||
sysdeps/x86_64/multiarch/wcslen.c | 2 +-
|
|
||||||
sysdeps/x86_64/multiarch/wcsnlen.c | 34 +-------------
|
|
||||||
6 files changed, 63 insertions(+), 36 deletions(-)
|
|
||||||
create mode 100644 sysdeps/x86_64/multiarch/ifunc-wcslen.h
|
|
||||||
create mode 100644 sysdeps/x86_64/multiarch/wcslen-sse4_1.S
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
|
|
||||||
index 491c7698..65fde4eb 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/Makefile
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/Makefile
|
|
||||||
@@ -93,8 +93,8 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
|
|
||||||
wcscpy-ssse3 wcscpy-c \
|
|
||||||
wcschr-sse2 wcschr-avx2 \
|
|
||||||
wcsrchr-sse2 wcsrchr-avx2 \
|
|
||||||
- wcsnlen-sse4_1 wcsnlen-c \
|
|
||||||
- wcslen-sse2 wcslen-avx2 wcsnlen-avx2 \
|
|
||||||
+ wcslen-sse2 wcslen-sse4_1 wcslen-avx2 \
|
|
||||||
+ wcsnlen-c wcsnlen-sse4_1 wcsnlen-avx2 \
|
|
||||||
wcschr-avx2-rtm \
|
|
||||||
wcscmp-avx2-rtm \
|
|
||||||
wcslen-avx2-rtm \
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
||||||
index f1a6460a..580913ca 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
||||||
@@ -657,6 +657,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
||||||
&& CPU_FEATURE_USABLE (AVX512BW)
|
|
||||||
&& CPU_FEATURE_USABLE (BMI2)),
|
|
||||||
__wcslen_evex)
|
|
||||||
+ IFUNC_IMPL_ADD (array, i, wcsnlen,
|
|
||||||
+ CPU_FEATURE_USABLE (SSE4_1),
|
|
||||||
+ __wcsnlen_sse4_1)
|
|
||||||
IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_sse2))
|
|
||||||
|
|
||||||
/* Support sysdeps/x86_64/multiarch/wcsnlen.c. */
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-wcslen.h b/sysdeps/x86_64/multiarch/ifunc-wcslen.h
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..39e33473
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/ifunc-wcslen.h
|
|
||||||
@@ -0,0 +1,52 @@
|
|
||||||
+/* Common definition for ifunc selections for wcslen and wcsnlen
|
|
||||||
+ All versions must be listed in ifunc-impl-list.c.
|
|
||||||
+ Copyright (C) 2017-2021 Free Software Foundation, Inc.
|
|
||||||
+ This file is part of the GNU C Library.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is free software; you can redistribute it and/or
|
|
||||||
+ modify it under the terms of the GNU Lesser General Public
|
|
||||||
+ License as published by the Free Software Foundation; either
|
|
||||||
+ version 2.1 of the License, or (at your option) any later version.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
||||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
+ Lesser General Public License for more details.
|
|
||||||
+
|
|
||||||
+ You should have received a copy of the GNU Lesser General Public
|
|
||||||
+ License along with the GNU C Library; if not, see
|
|
||||||
+ <https://www.gnu.org/licenses/>. */
|
|
||||||
+
|
|
||||||
+#include <init-arch.h>
|
|
||||||
+
|
|
||||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
|
|
||||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
|
|
||||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
|
|
||||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
|
|
||||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
|
|
||||||
+
|
|
||||||
+static inline void *
|
|
||||||
+IFUNC_SELECTOR (void)
|
|
||||||
+{
|
|
||||||
+ const struct cpu_features* cpu_features = __get_cpu_features ();
|
|
||||||
+
|
|
||||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
|
|
||||||
+ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
|
|
||||||
+ && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
|
|
||||||
+ {
|
|
||||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
|
|
||||||
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
|
|
||||||
+ return OPTIMIZE (evex);
|
|
||||||
+
|
|
||||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
|
|
||||||
+ return OPTIMIZE (avx2_rtm);
|
|
||||||
+
|
|
||||||
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
|
|
||||||
+ return OPTIMIZE (avx2);
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
|
|
||||||
+ return OPTIMIZE (sse4_1);
|
|
||||||
+
|
|
||||||
+ return OPTIMIZE (sse2);
|
|
||||||
+}
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/wcslen-sse4_1.S b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..7e62621a
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
|
|
||||||
@@ -0,0 +1,4 @@
|
|
||||||
+#define AS_WCSLEN
|
|
||||||
+#define strlen __wcslen_sse4_1
|
|
||||||
+
|
|
||||||
+#include "strlen-vec.S"
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/wcslen.c b/sysdeps/x86_64/multiarch/wcslen.c
|
|
||||||
index 6d06e47c..3b04b75b 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/wcslen.c
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/wcslen.c
|
|
||||||
@@ -24,7 +24,7 @@
|
|
||||||
# undef __wcslen
|
|
||||||
|
|
||||||
# define SYMBOL_NAME wcslen
|
|
||||||
-# include "ifunc-avx2.h"
|
|
||||||
+# include "ifunc-wcslen.h"
|
|
||||||
|
|
||||||
libc_ifunc_redirected (__redirect_wcslen, __wcslen, IFUNC_SELECTOR ());
|
|
||||||
weak_alias (__wcslen, wcslen);
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/wcsnlen.c b/sysdeps/x86_64/multiarch/wcsnlen.c
|
|
||||||
index 20b731ae..06736410 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/wcsnlen.c
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/wcsnlen.c
|
|
||||||
@@ -24,39 +24,7 @@
|
|
||||||
# undef __wcsnlen
|
|
||||||
|
|
||||||
# define SYMBOL_NAME wcsnlen
|
|
||||||
-# include <init-arch.h>
|
|
||||||
-
|
|
||||||
-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
|
|
||||||
-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
|
|
||||||
-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
|
|
||||||
-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
|
|
||||||
-extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
|
|
||||||
-
|
|
||||||
-static inline void *
|
|
||||||
-IFUNC_SELECTOR (void)
|
|
||||||
-{
|
|
||||||
- const struct cpu_features* cpu_features = __get_cpu_features ();
|
|
||||||
-
|
|
||||||
- if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
|
|
||||||
- && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
|
|
||||||
- {
|
|
||||||
- if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
|
|
||||||
- && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
|
|
||||||
- && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
|
|
||||||
- return OPTIMIZE (evex);
|
|
||||||
-
|
|
||||||
- if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
|
|
||||||
- return OPTIMIZE (avx2_rtm);
|
|
||||||
-
|
|
||||||
- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
|
|
||||||
- return OPTIMIZE (avx2);
|
|
||||||
- }
|
|
||||||
-
|
|
||||||
- if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
|
|
||||||
- return OPTIMIZE (sse4_1);
|
|
||||||
-
|
|
||||||
- return OPTIMIZE (sse2);
|
|
||||||
-}
|
|
||||||
+# include "ifunc-wcslen.h"
|
|
||||||
|
|
||||||
libc_ifunc_redirected (__redirect_wcsnlen, __wcsnlen, IFUNC_SELECTOR ());
|
|
||||||
weak_alias (__wcsnlen, wcsnlen);
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,396 +0,0 @@
|
|||||||
From 231c56760c1e2ded21ad96bbb860b1f08c556c7a Mon Sep 17 00:00:00 2001
|
|
||||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
|
||||||
Date: Mon, 21 Jan 2019 11:27:25 -0800
|
|
||||||
Subject: [PATCH] x86-64 memcpy: Properly handle the length parameter [BZ#
|
|
||||||
24097]
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
On x32, the size_t parameter may be passed in the lower 32 bits of a
|
|
||||||
64-bit register with the non-zero upper 32 bits. The string/memory
|
|
||||||
functions written in assembly can only use the lower 32 bits of a
|
|
||||||
64-bit register as length or must clear the upper 32 bits before using
|
|
||||||
the full 64-bit register for length.
|
|
||||||
|
|
||||||
This patch fixes memcpy for x32. Tested on x86-64 and x32. On x86-64,
|
|
||||||
libc.so is the same with and without the fix.
|
|
||||||
|
|
||||||
[BZ# 24097]
|
|
||||||
CVE-2019-6488
|
|
||||||
* sysdeps/x86_64/multiarch/memcpy-ssse3-back.S: Use RDX_LP for
|
|
||||||
length. Clear the upper 32 bits of RDX register.
|
|
||||||
* sysdeps/x86_64/multiarch/memcpy-ssse3.S: Likewise.
|
|
||||||
* sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S:
|
|
||||||
Likewise.
|
|
||||||
* sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S:
|
|
||||||
Likewise.
|
|
||||||
* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcpy.
|
|
||||||
tst-size_t-wmemchr.
|
|
||||||
* sysdeps/x86_64/x32/tst-size_t-memcpy.c: New file.
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/multiarch/memcpy-ssse3-back.S | 17 ++++--
|
|
||||||
sysdeps/x86_64/multiarch/memcpy-ssse3.S | 17 ++++--
|
|
||||||
.../multiarch/memmove-avx512-no-vzeroupper.S | 16 +++--
|
|
||||||
.../multiarch/memmove-vec-unaligned-erms.S | 54 +++++++++--------
|
|
||||||
sysdeps/x86_64/x32/Makefile | 2 +-
|
|
||||||
sysdeps/x86_64/x32/tst-size_t-memcpy.c | 58 +++++++++++++++++++
|
|
||||||
6 files changed, 122 insertions(+), 42 deletions(-)
|
|
||||||
create mode 100644 sysdeps/x86_64/x32/tst-size_t-memcpy.c
|
|
||||||
|
|
||||||
Conflicts:
|
|
||||||
ChangeLog
|
|
||||||
(removed)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
|
|
||||||
index 3cd11233..568eebd3 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
|
|
||||||
@@ -45,28 +45,33 @@
|
|
||||||
.section .text.ssse3,"ax",@progbits
|
|
||||||
#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
|
|
||||||
ENTRY (MEMPCPY_CHK)
|
|
||||||
- cmpq %rdx, %rcx
|
|
||||||
+ cmp %RDX_LP, %RCX_LP
|
|
||||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
|
||||||
END (MEMPCPY_CHK)
|
|
||||||
|
|
||||||
ENTRY (MEMPCPY)
|
|
||||||
- movq %rdi, %rax
|
|
||||||
- addq %rdx, %rax
|
|
||||||
+ mov %RDI_LP, %RAX_LP
|
|
||||||
+ add %RDX_LP, %RAX_LP
|
|
||||||
jmp L(start)
|
|
||||||
END (MEMPCPY)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if !defined USE_AS_BCOPY
|
|
||||||
ENTRY (MEMCPY_CHK)
|
|
||||||
- cmpq %rdx, %rcx
|
|
||||||
+ cmp %RDX_LP, %RCX_LP
|
|
||||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
|
||||||
END (MEMCPY_CHK)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
ENTRY (MEMCPY)
|
|
||||||
- mov %rdi, %rax
|
|
||||||
+ mov %RDI_LP, %RAX_LP
|
|
||||||
#ifdef USE_AS_MEMPCPY
|
|
||||||
- add %rdx, %rax
|
|
||||||
+ add %RDX_LP, %RAX_LP
|
|
||||||
+#endif
|
|
||||||
+
|
|
||||||
+#ifdef __ILP32__
|
|
||||||
+ /* Clear the upper 32 bits. */
|
|
||||||
+ mov %edx, %edx
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef USE_AS_MEMMOVE
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
|
|
||||||
index 0240bfa3..0bd5ee99 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
|
|
||||||
@@ -45,28 +45,33 @@
|
|
||||||
.section .text.ssse3,"ax",@progbits
|
|
||||||
#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
|
|
||||||
ENTRY (MEMPCPY_CHK)
|
|
||||||
- cmpq %rdx, %rcx
|
|
||||||
+ cmp %RDX_LP, %RCX_LP
|
|
||||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
|
||||||
END (MEMPCPY_CHK)
|
|
||||||
|
|
||||||
ENTRY (MEMPCPY)
|
|
||||||
- movq %rdi, %rax
|
|
||||||
- addq %rdx, %rax
|
|
||||||
+ mov %RDI_LP, %RAX_LP
|
|
||||||
+ add %RDX_LP, %RAX_LP
|
|
||||||
jmp L(start)
|
|
||||||
END (MEMPCPY)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if !defined USE_AS_BCOPY
|
|
||||||
ENTRY (MEMCPY_CHK)
|
|
||||||
- cmpq %rdx, %rcx
|
|
||||||
+ cmp %RDX_LP, %RCX_LP
|
|
||||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
|
||||||
END (MEMCPY_CHK)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
ENTRY (MEMCPY)
|
|
||||||
- mov %rdi, %rax
|
|
||||||
+ mov %RDI_LP, %RAX_LP
|
|
||||||
#ifdef USE_AS_MEMPCPY
|
|
||||||
- add %rdx, %rax
|
|
||||||
+ add %RDX_LP, %RAX_LP
|
|
||||||
+#endif
|
|
||||||
+
|
|
||||||
+#ifdef __ILP32__
|
|
||||||
+ /* Clear the upper 32 bits. */
|
|
||||||
+ mov %edx, %edx
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef USE_AS_MEMMOVE
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
|
|
||||||
index effc3ac2..6ca2bbc9 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
|
|
||||||
@@ -24,27 +24,31 @@
|
|
||||||
|
|
||||||
.section .text.avx512,"ax",@progbits
|
|
||||||
ENTRY (__mempcpy_chk_avx512_no_vzeroupper)
|
|
||||||
- cmpq %rdx, %rcx
|
|
||||||
+ cmp %RDX_LP, %RCX_LP
|
|
||||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
|
||||||
END (__mempcpy_chk_avx512_no_vzeroupper)
|
|
||||||
|
|
||||||
ENTRY (__mempcpy_avx512_no_vzeroupper)
|
|
||||||
- movq %rdi, %rax
|
|
||||||
- addq %rdx, %rax
|
|
||||||
+ mov %RDI_LP, %RAX_LP
|
|
||||||
+ add %RDX_LP, %RAX_LP
|
|
||||||
jmp L(start)
|
|
||||||
END (__mempcpy_avx512_no_vzeroupper)
|
|
||||||
|
|
||||||
ENTRY (__memmove_chk_avx512_no_vzeroupper)
|
|
||||||
- cmpq %rdx, %rcx
|
|
||||||
+ cmp %RDX_LP, %RCX_LP
|
|
||||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
|
||||||
END (__memmove_chk_avx512_no_vzeroupper)
|
|
||||||
|
|
||||||
ENTRY (__memmove_avx512_no_vzeroupper)
|
|
||||||
- mov %rdi, %rax
|
|
||||||
+ mov %RDI_LP, %RAX_LP
|
|
||||||
# ifdef USE_AS_MEMPCPY
|
|
||||||
- add %rdx, %rax
|
|
||||||
+ add %RDX_LP, %RAX_LP
|
|
||||||
# endif
|
|
||||||
L(start):
|
|
||||||
+# ifdef __ILP32__
|
|
||||||
+ /* Clear the upper 32 bits. */
|
|
||||||
+ mov %edx, %edx
|
|
||||||
+# endif
|
|
||||||
lea (%rsi, %rdx), %rcx
|
|
||||||
lea (%rdi, %rdx), %r9
|
|
||||||
cmp $512, %rdx
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
|
||||||
index c952576c..274aa1c7 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
|
||||||
@@ -95,20 +95,20 @@
|
|
||||||
.section SECTION(.text),"ax",@progbits
|
|
||||||
#if defined SHARED && IS_IN (libc)
|
|
||||||
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
|
|
||||||
- cmpq %rdx, %rcx
|
|
||||||
+ cmp %RDX_LP, %RCX_LP
|
|
||||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
|
||||||
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
|
|
||||||
#endif
|
|
||||||
|
|
||||||
ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
|
|
||||||
- movq %rdi, %rax
|
|
||||||
- addq %rdx, %rax
|
|
||||||
+ mov %RDI_LP, %RAX_LP
|
|
||||||
+ add %RDX_LP, %RAX_LP
|
|
||||||
jmp L(start)
|
|
||||||
END (MEMPCPY_SYMBOL (__mempcpy, unaligned))
|
|
||||||
|
|
||||||
#if defined SHARED && IS_IN (libc)
|
|
||||||
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
|
|
||||||
- cmpq %rdx, %rcx
|
|
||||||
+ cmp %RDX_LP, %RCX_LP
|
|
||||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
|
||||||
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
|
|
||||||
#endif
|
|
||||||
@@ -116,9 +116,13 @@ END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
|
|
||||||
ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
|
|
||||||
movq %rdi, %rax
|
|
||||||
L(start):
|
|
||||||
- cmpq $VEC_SIZE, %rdx
|
|
||||||
+# ifdef __ILP32__
|
|
||||||
+ /* Clear the upper 32 bits. */
|
|
||||||
+ movl %edx, %edx
|
|
||||||
+# endif
|
|
||||||
+ cmp $VEC_SIZE, %RDX_LP
|
|
||||||
jb L(less_vec)
|
|
||||||
- cmpq $(VEC_SIZE * 2), %rdx
|
|
||||||
+ cmp $(VEC_SIZE * 2), %RDX_LP
|
|
||||||
ja L(more_2x_vec)
|
|
||||||
#if !defined USE_MULTIARCH || !IS_IN (libc)
|
|
||||||
L(last_2x_vec):
|
|
||||||
@@ -138,38 +142,38 @@ END (MEMMOVE_SYMBOL (__memmove, unaligned))
|
|
||||||
|
|
||||||
# if VEC_SIZE == 16
|
|
||||||
ENTRY (__mempcpy_chk_erms)
|
|
||||||
- cmpq %rdx, %rcx
|
|
||||||
+ cmp %RDX_LP, %RCX_LP
|
|
||||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
|
||||||
END (__mempcpy_chk_erms)
|
|
||||||
|
|
||||||
/* Only used to measure performance of REP MOVSB. */
|
|
||||||
ENTRY (__mempcpy_erms)
|
|
||||||
- movq %rdi, %rax
|
|
||||||
+ mov %RDI_LP, %RAX_LP
|
|
||||||
/* Skip zero length. */
|
|
||||||
- testq %rdx, %rdx
|
|
||||||
+ test %RDX_LP, %RDX_LP
|
|
||||||
jz 2f
|
|
||||||
- addq %rdx, %rax
|
|
||||||
+ add %RDX_LP, %RAX_LP
|
|
||||||
jmp L(start_movsb)
|
|
||||||
END (__mempcpy_erms)
|
|
||||||
|
|
||||||
ENTRY (__memmove_chk_erms)
|
|
||||||
- cmpq %rdx, %rcx
|
|
||||||
+ cmp %RDX_LP, %RCX_LP
|
|
||||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
|
||||||
END (__memmove_chk_erms)
|
|
||||||
|
|
||||||
ENTRY (__memmove_erms)
|
|
||||||
movq %rdi, %rax
|
|
||||||
/* Skip zero length. */
|
|
||||||
- testq %rdx, %rdx
|
|
||||||
+ test %RDX_LP, %RDX_LP
|
|
||||||
jz 2f
|
|
||||||
L(start_movsb):
|
|
||||||
- movq %rdx, %rcx
|
|
||||||
- cmpq %rsi, %rdi
|
|
||||||
+ mov %RDX_LP, %RCX_LP
|
|
||||||
+ cmp %RSI_LP, %RDI_LP
|
|
||||||
jb 1f
|
|
||||||
/* Source == destination is less common. */
|
|
||||||
je 2f
|
|
||||||
- leaq (%rsi,%rcx), %rdx
|
|
||||||
- cmpq %rdx, %rdi
|
|
||||||
+ lea (%rsi,%rcx), %RDX_LP
|
|
||||||
+ cmp %RDX_LP, %RDI_LP
|
|
||||||
jb L(movsb_backward)
|
|
||||||
1:
|
|
||||||
rep movsb
|
|
||||||
@@ -189,20 +193,20 @@ strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
|
|
||||||
|
|
||||||
# ifdef SHARED
|
|
||||||
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
|
|
||||||
- cmpq %rdx, %rcx
|
|
||||||
+ cmp %RDX_LP, %RCX_LP
|
|
||||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
|
||||||
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
|
|
||||||
# endif
|
|
||||||
|
|
||||||
ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
|
|
||||||
- movq %rdi, %rax
|
|
||||||
- addq %rdx, %rax
|
|
||||||
+ mov %RDI_LP, %RAX_LP
|
|
||||||
+ add %RDX_LP, %RAX_LP
|
|
||||||
jmp L(start_erms)
|
|
||||||
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
|
|
||||||
|
|
||||||
# ifdef SHARED
|
|
||||||
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
|
|
||||||
- cmpq %rdx, %rcx
|
|
||||||
+ cmp %RDX_LP, %RCX_LP
|
|
||||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
|
||||||
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
|
|
||||||
# endif
|
|
||||||
@@ -210,9 +214,13 @@ END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
|
|
||||||
ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
|
|
||||||
movq %rdi, %rax
|
|
||||||
L(start_erms):
|
|
||||||
- cmpq $VEC_SIZE, %rdx
|
|
||||||
+# ifdef __ILP32__
|
|
||||||
+ /* Clear the upper 32 bits. */
|
|
||||||
+ movl %edx, %edx
|
|
||||||
+# endif
|
|
||||||
+ cmp $VEC_SIZE, %RDX_LP
|
|
||||||
jb L(less_vec)
|
|
||||||
- cmpq $(VEC_SIZE * 2), %rdx
|
|
||||||
+ cmp $(VEC_SIZE * 2), %RDX_LP
|
|
||||||
ja L(movsb_more_2x_vec)
|
|
||||||
L(last_2x_vec):
|
|
||||||
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
|
|
||||||
@@ -236,7 +244,7 @@ L(movsb):
|
|
||||||
/* Avoid slow backward REP MOVSB. */
|
|
||||||
jb L(more_8x_vec_backward)
|
|
||||||
1:
|
|
||||||
- movq %rdx, %rcx
|
|
||||||
+ mov %RDX_LP, %RCX_LP
|
|
||||||
rep movsb
|
|
||||||
L(nop):
|
|
||||||
ret
|
|
||||||
diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
|
|
||||||
index ddec7f04..2fe1e5ac 100644
|
|
||||||
--- a/sysdeps/x86_64/x32/Makefile
|
|
||||||
+++ b/sysdeps/x86_64/x32/Makefile
|
|
||||||
@@ -6,7 +6,7 @@ CFLAGS-s_llround.c += -fno-builtin-lround
|
|
||||||
endif
|
|
||||||
|
|
||||||
ifeq ($(subdir),string)
|
|
||||||
-tests += tst-size_t-memchr tst-size_t-memcmp
|
|
||||||
+tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy
|
|
||||||
endif
|
|
||||||
|
|
||||||
ifeq ($(subdir),wcsmbs)
|
|
||||||
diff --git a/sysdeps/x86_64/x32/tst-size_t-memcpy.c b/sysdeps/x86_64/x32/tst-size_t-memcpy.c
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..66b71e17
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/x86_64/x32/tst-size_t-memcpy.c
|
|
||||||
@@ -0,0 +1,58 @@
|
|
||||||
+/* Test memcpy with size_t in the lower 32 bits of 64-bit register.
|
|
||||||
+ Copyright (C) 2019 Free Software Foundation, Inc.
|
|
||||||
+ This file is part of the GNU C Library.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is free software; you can redistribute it and/or
|
|
||||||
+ modify it under the terms of the GNU Lesser General Public
|
|
||||||
+ License as published by the Free Software Foundation; either
|
|
||||||
+ version 2.1 of the License, or (at your option) any later version.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
||||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
+ Lesser General Public License for more details.
|
|
||||||
+
|
|
||||||
+ You should have received a copy of the GNU Lesser General Public
|
|
||||||
+ License along with the GNU C Library; if not, see
|
|
||||||
+ <http://www.gnu.org/licenses/>. */
|
|
||||||
+
|
|
||||||
+#define TEST_NAME "memcpy"
|
|
||||||
+#include "test-size_t.h"
|
|
||||||
+
|
|
||||||
+IMPL (memcpy, 1)
|
|
||||||
+
|
|
||||||
+typedef void *(*proto_t) (void *, const void *, size_t);
|
|
||||||
+
|
|
||||||
+static void *
|
|
||||||
+__attribute__ ((noinline, noclone))
|
|
||||||
+do_memcpy (parameter_t a, parameter_t b)
|
|
||||||
+{
|
|
||||||
+ return CALL (&b, a.p, b.p, a.len);
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+static int
|
|
||||||
+test_main (void)
|
|
||||||
+{
|
|
||||||
+ test_init ();
|
|
||||||
+
|
|
||||||
+ parameter_t dest = { { page_size }, buf1 };
|
|
||||||
+ parameter_t src = { { 0 }, buf2 };
|
|
||||||
+
|
|
||||||
+ int ret = 0;
|
|
||||||
+ FOR_EACH_IMPL (impl, 0)
|
|
||||||
+ {
|
|
||||||
+ src.fn = impl->fn;
|
|
||||||
+ do_memcpy (dest, src);
|
|
||||||
+ int res = memcmp (dest.p, src.p, dest.len);
|
|
||||||
+ if (res)
|
|
||||||
+ {
|
|
||||||
+ error (0, 0, "Wrong result in function %s: %i != 0",
|
|
||||||
+ impl->name, res);
|
|
||||||
+ ret = 1;
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ return ret ? EXIT_FAILURE : EXIT_SUCCESS;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+#include <support/test-driver.c>
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,497 +0,0 @@
|
|||||||
From a775a7a3eb1e85b54af0b4ee5ff4dcf66772a1fb Mon Sep 17 00:00:00 2001
|
|
||||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
Date: Wed, 23 Jun 2021 01:56:29 -0400
|
|
||||||
Subject: [PATCH] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 [BZ
|
|
||||||
#27974]
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
This commit fixes the bug mentioned in the previous commit.
|
|
||||||
|
|
||||||
The previous implementations of wmemchr in these files relied
|
|
||||||
on maxlen * sizeof(wchar_t) which was not guranteed by the standard.
|
|
||||||
|
|
||||||
The new overflow tests added in the previous commit now
|
|
||||||
pass (As well as all the other tests).
|
|
||||||
|
|
||||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/multiarch/strlen-avx2.S | 130 ++++++++++++++++++-------
|
|
||||||
sysdeps/x86_64/multiarch/strlen-vec.S | 15 ++-
|
|
||||||
2 files changed, 107 insertions(+), 38 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
|
|
||||||
index be8a5db5..37688966 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/strlen-avx2.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
|
|
||||||
@@ -44,21 +44,21 @@
|
|
||||||
|
|
||||||
# define VEC_SIZE 32
|
|
||||||
# define PAGE_SIZE 4096
|
|
||||||
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
|
|
||||||
|
|
||||||
.section SECTION(.text),"ax",@progbits
|
|
||||||
ENTRY (STRLEN)
|
|
||||||
# ifdef USE_AS_STRNLEN
|
|
||||||
/* Check zero length. */
|
|
||||||
+# ifdef __ILP32__
|
|
||||||
+ /* Clear upper bits. */
|
|
||||||
+ and %RSI_LP, %RSI_LP
|
|
||||||
+# else
|
|
||||||
test %RSI_LP, %RSI_LP
|
|
||||||
+# endif
|
|
||||||
jz L(zero)
|
|
||||||
/* Store max len in R8_LP before adjusting if using WCSLEN. */
|
|
||||||
mov %RSI_LP, %R8_LP
|
|
||||||
-# ifdef USE_AS_WCSLEN
|
|
||||||
- shl $2, %RSI_LP
|
|
||||||
-# elif defined __ILP32__
|
|
||||||
- /* Clear the upper 32 bits. */
|
|
||||||
- movl %esi, %esi
|
|
||||||
-# endif
|
|
||||||
# endif
|
|
||||||
movl %edi, %eax
|
|
||||||
movq %rdi, %rdx
|
|
||||||
@@ -72,10 +72,10 @@ ENTRY (STRLEN)
|
|
||||||
|
|
||||||
/* Check the first VEC_SIZE bytes. */
|
|
||||||
VPCMPEQ (%rdi), %ymm0, %ymm1
|
|
||||||
- vpmovmskb %ymm1, %eax
|
|
||||||
+ vpmovmskb %ymm1, %eax
|
|
||||||
# ifdef USE_AS_STRNLEN
|
|
||||||
/* If length < VEC_SIZE handle special. */
|
|
||||||
- cmpq $VEC_SIZE, %rsi
|
|
||||||
+ cmpq $CHAR_PER_VEC, %rsi
|
|
||||||
jbe L(first_vec_x0)
|
|
||||||
# endif
|
|
||||||
/* If empty continue to aligned_more. Otherwise return bit
|
|
||||||
@@ -84,6 +84,7 @@ ENTRY (STRLEN)
|
|
||||||
jz L(aligned_more)
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
# ifdef USE_AS_WCSLEN
|
|
||||||
+ /* NB: Divide bytes by 4 to get wchar_t count. */
|
|
||||||
shrl $2, %eax
|
|
||||||
# endif
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
@@ -97,9 +98,14 @@ L(zero):
|
|
||||||
L(first_vec_x0):
|
|
||||||
/* Set bit for max len so that tzcnt will return min of max len
|
|
||||||
and position of first match. */
|
|
||||||
+# ifdef USE_AS_WCSLEN
|
|
||||||
+ /* NB: Multiply length by 4 to get byte count. */
|
|
||||||
+ sall $2, %esi
|
|
||||||
+# endif
|
|
||||||
btsq %rsi, %rax
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
# ifdef USE_AS_WCSLEN
|
|
||||||
+ /* NB: Divide bytes by 4 to get wchar_t count. */
|
|
||||||
shrl $2, %eax
|
|
||||||
# endif
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
@@ -113,14 +119,19 @@ L(first_vec_x1):
|
|
||||||
# ifdef USE_AS_STRNLEN
|
|
||||||
/* Use ecx which was computed earlier to compute correct value.
|
|
||||||
*/
|
|
||||||
+# ifdef USE_AS_WCSLEN
|
|
||||||
+ leal -(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax
|
|
||||||
+# else
|
|
||||||
subl $(VEC_SIZE * 4 + 1), %ecx
|
|
||||||
addl %ecx, %eax
|
|
||||||
+# endif
|
|
||||||
# else
|
|
||||||
subl %edx, %edi
|
|
||||||
incl %edi
|
|
||||||
addl %edi, %eax
|
|
||||||
# endif
|
|
||||||
# ifdef USE_AS_WCSLEN
|
|
||||||
+ /* NB: Divide bytes by 4 to get wchar_t count. */
|
|
||||||
shrl $2, %eax
|
|
||||||
# endif
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
@@ -133,14 +144,19 @@ L(first_vec_x2):
|
|
||||||
# ifdef USE_AS_STRNLEN
|
|
||||||
/* Use ecx which was computed earlier to compute correct value.
|
|
||||||
*/
|
|
||||||
+# ifdef USE_AS_WCSLEN
|
|
||||||
+ leal -(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax
|
|
||||||
+# else
|
|
||||||
subl $(VEC_SIZE * 3 + 1), %ecx
|
|
||||||
addl %ecx, %eax
|
|
||||||
+# endif
|
|
||||||
# else
|
|
||||||
subl %edx, %edi
|
|
||||||
addl $(VEC_SIZE + 1), %edi
|
|
||||||
addl %edi, %eax
|
|
||||||
# endif
|
|
||||||
# ifdef USE_AS_WCSLEN
|
|
||||||
+ /* NB: Divide bytes by 4 to get wchar_t count. */
|
|
||||||
shrl $2, %eax
|
|
||||||
# endif
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
@@ -153,14 +169,19 @@ L(first_vec_x3):
|
|
||||||
# ifdef USE_AS_STRNLEN
|
|
||||||
/* Use ecx which was computed earlier to compute correct value.
|
|
||||||
*/
|
|
||||||
+# ifdef USE_AS_WCSLEN
|
|
||||||
+ leal -(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax
|
|
||||||
+# else
|
|
||||||
subl $(VEC_SIZE * 2 + 1), %ecx
|
|
||||||
addl %ecx, %eax
|
|
||||||
+# endif
|
|
||||||
# else
|
|
||||||
subl %edx, %edi
|
|
||||||
addl $(VEC_SIZE * 2 + 1), %edi
|
|
||||||
addl %edi, %eax
|
|
||||||
# endif
|
|
||||||
# ifdef USE_AS_WCSLEN
|
|
||||||
+ /* NB: Divide bytes by 4 to get wchar_t count. */
|
|
||||||
shrl $2, %eax
|
|
||||||
# endif
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
@@ -173,14 +194,19 @@ L(first_vec_x4):
|
|
||||||
# ifdef USE_AS_STRNLEN
|
|
||||||
/* Use ecx which was computed earlier to compute correct value.
|
|
||||||
*/
|
|
||||||
+# ifdef USE_AS_WCSLEN
|
|
||||||
+ leal -(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax
|
|
||||||
+# else
|
|
||||||
subl $(VEC_SIZE + 1), %ecx
|
|
||||||
addl %ecx, %eax
|
|
||||||
+# endif
|
|
||||||
# else
|
|
||||||
subl %edx, %edi
|
|
||||||
addl $(VEC_SIZE * 3 + 1), %edi
|
|
||||||
addl %edi, %eax
|
|
||||||
# endif
|
|
||||||
# ifdef USE_AS_WCSLEN
|
|
||||||
+ /* NB: Divide bytes by 4 to get wchar_t count. */
|
|
||||||
shrl $2, %eax
|
|
||||||
# endif
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
@@ -195,10 +221,14 @@ L(cross_page_continue):
|
|
||||||
/* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
|
|
||||||
since data is only aligned to VEC_SIZE. */
|
|
||||||
# ifdef USE_AS_STRNLEN
|
|
||||||
- /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
|
|
||||||
- it simplies the logic in last_4x_vec_or_less. */
|
|
||||||
+ /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
|
|
||||||
+ because it simplies the logic in last_4x_vec_or_less. */
|
|
||||||
leaq (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
|
|
||||||
subq %rdx, %rcx
|
|
||||||
+# ifdef USE_AS_WCSLEN
|
|
||||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
|
||||||
+ sarl $2, %ecx
|
|
||||||
+# endif
|
|
||||||
# endif
|
|
||||||
/* Load first VEC regardless. */
|
|
||||||
VPCMPEQ 1(%rdi), %ymm0, %ymm1
|
|
||||||
@@ -207,34 +237,38 @@ L(cross_page_continue):
|
|
||||||
subq %rcx, %rsi
|
|
||||||
jb L(last_4x_vec_or_less)
|
|
||||||
# endif
|
|
||||||
- vpmovmskb %ymm1, %eax
|
|
||||||
+ vpmovmskb %ymm1, %eax
|
|
||||||
testl %eax, %eax
|
|
||||||
jnz L(first_vec_x1)
|
|
||||||
|
|
||||||
VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
|
|
||||||
- vpmovmskb %ymm1, %eax
|
|
||||||
+ vpmovmskb %ymm1, %eax
|
|
||||||
testl %eax, %eax
|
|
||||||
jnz L(first_vec_x2)
|
|
||||||
|
|
||||||
VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
|
|
||||||
- vpmovmskb %ymm1, %eax
|
|
||||||
+ vpmovmskb %ymm1, %eax
|
|
||||||
testl %eax, %eax
|
|
||||||
jnz L(first_vec_x3)
|
|
||||||
|
|
||||||
VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
|
|
||||||
- vpmovmskb %ymm1, %eax
|
|
||||||
+ vpmovmskb %ymm1, %eax
|
|
||||||
testl %eax, %eax
|
|
||||||
jnz L(first_vec_x4)
|
|
||||||
|
|
||||||
/* Align data to VEC_SIZE * 4 - 1. */
|
|
||||||
# ifdef USE_AS_STRNLEN
|
|
||||||
/* Before adjusting length check if at last VEC_SIZE * 4. */
|
|
||||||
- cmpq $(VEC_SIZE * 4 - 1), %rsi
|
|
||||||
+ cmpq $(CHAR_PER_VEC * 4 - 1), %rsi
|
|
||||||
jbe L(last_4x_vec_or_less_load)
|
|
||||||
incq %rdi
|
|
||||||
movl %edi, %ecx
|
|
||||||
orq $(VEC_SIZE * 4 - 1), %rdi
|
|
||||||
andl $(VEC_SIZE * 4 - 1), %ecx
|
|
||||||
+# ifdef USE_AS_WCSLEN
|
|
||||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
|
||||||
+ sarl $2, %ecx
|
|
||||||
+# endif
|
|
||||||
/* Readjust length. */
|
|
||||||
addq %rcx, %rsi
|
|
||||||
# else
|
|
||||||
@@ -246,13 +280,13 @@ L(cross_page_continue):
|
|
||||||
L(loop_4x_vec):
|
|
||||||
# ifdef USE_AS_STRNLEN
|
|
||||||
/* Break if at end of length. */
|
|
||||||
- subq $(VEC_SIZE * 4), %rsi
|
|
||||||
+ subq $(CHAR_PER_VEC * 4), %rsi
|
|
||||||
jb L(last_4x_vec_or_less_cmpeq)
|
|
||||||
# endif
|
|
||||||
- /* Save some code size by microfusing VPMINU with the load. Since
|
|
||||||
- the matches in ymm2/ymm4 can only be returned if there where no
|
|
||||||
- matches in ymm1/ymm3 respectively there is no issue with overlap.
|
|
||||||
- */
|
|
||||||
+ /* Save some code size by microfusing VPMINU with the load.
|
|
||||||
+ Since the matches in ymm2/ymm4 can only be returned if there
|
|
||||||
+ where no matches in ymm1/ymm3 respectively there is no issue
|
|
||||||
+ with overlap. */
|
|
||||||
vmovdqa 1(%rdi), %ymm1
|
|
||||||
VPMINU (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
|
|
||||||
vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3
|
|
||||||
@@ -260,7 +294,7 @@ L(loop_4x_vec):
|
|
||||||
|
|
||||||
VPMINU %ymm2, %ymm4, %ymm5
|
|
||||||
VPCMPEQ %ymm5, %ymm0, %ymm5
|
|
||||||
- vpmovmskb %ymm5, %ecx
|
|
||||||
+ vpmovmskb %ymm5, %ecx
|
|
||||||
|
|
||||||
subq $-(VEC_SIZE * 4), %rdi
|
|
||||||
testl %ecx, %ecx
|
|
||||||
@@ -268,27 +302,28 @@ L(loop_4x_vec):
|
|
||||||
|
|
||||||
|
|
||||||
VPCMPEQ %ymm1, %ymm0, %ymm1
|
|
||||||
- vpmovmskb %ymm1, %eax
|
|
||||||
+ vpmovmskb %ymm1, %eax
|
|
||||||
subq %rdx, %rdi
|
|
||||||
testl %eax, %eax
|
|
||||||
jnz L(last_vec_return_x0)
|
|
||||||
|
|
||||||
VPCMPEQ %ymm2, %ymm0, %ymm2
|
|
||||||
- vpmovmskb %ymm2, %eax
|
|
||||||
+ vpmovmskb %ymm2, %eax
|
|
||||||
testl %eax, %eax
|
|
||||||
jnz L(last_vec_return_x1)
|
|
||||||
|
|
||||||
/* Combine last 2 VEC. */
|
|
||||||
VPCMPEQ %ymm3, %ymm0, %ymm3
|
|
||||||
- vpmovmskb %ymm3, %eax
|
|
||||||
- /* rcx has combined result from all 4 VEC. It will only be used if
|
|
||||||
- the first 3 other VEC all did not contain a match. */
|
|
||||||
+ vpmovmskb %ymm3, %eax
|
|
||||||
+ /* rcx has combined result from all 4 VEC. It will only be used
|
|
||||||
+ if the first 3 other VEC all did not contain a match. */
|
|
||||||
salq $32, %rcx
|
|
||||||
orq %rcx, %rax
|
|
||||||
tzcntq %rax, %rax
|
|
||||||
subq $(VEC_SIZE * 2 - 1), %rdi
|
|
||||||
addq %rdi, %rax
|
|
||||||
# ifdef USE_AS_WCSLEN
|
|
||||||
+ /* NB: Divide bytes by 4 to get wchar_t count. */
|
|
||||||
shrq $2, %rax
|
|
||||||
# endif
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
@@ -297,15 +332,19 @@ L(loop_4x_vec):
|
|
||||||
# ifdef USE_AS_STRNLEN
|
|
||||||
.p2align 4
|
|
||||||
L(last_4x_vec_or_less_load):
|
|
||||||
- /* Depending on entry adjust rdi / prepare first VEC in ymm1. */
|
|
||||||
+ /* Depending on entry adjust rdi / prepare first VEC in ymm1.
|
|
||||||
+ */
|
|
||||||
subq $-(VEC_SIZE * 4), %rdi
|
|
||||||
L(last_4x_vec_or_less_cmpeq):
|
|
||||||
VPCMPEQ 1(%rdi), %ymm0, %ymm1
|
|
||||||
L(last_4x_vec_or_less):
|
|
||||||
-
|
|
||||||
- vpmovmskb %ymm1, %eax
|
|
||||||
- /* If remaining length > VEC_SIZE * 2. This works if esi is off by
|
|
||||||
- VEC_SIZE * 4. */
|
|
||||||
+# ifdef USE_AS_WCSLEN
|
|
||||||
+ /* NB: Multiply length by 4 to get byte count. */
|
|
||||||
+ sall $2, %esi
|
|
||||||
+# endif
|
|
||||||
+ vpmovmskb %ymm1, %eax
|
|
||||||
+ /* If remaining length > VEC_SIZE * 2. This works if esi is off
|
|
||||||
+ by VEC_SIZE * 4. */
|
|
||||||
testl $(VEC_SIZE * 2), %esi
|
|
||||||
jnz L(last_4x_vec)
|
|
||||||
|
|
||||||
@@ -320,7 +359,7 @@ L(last_4x_vec_or_less):
|
|
||||||
jb L(max)
|
|
||||||
|
|
||||||
VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
|
|
||||||
- vpmovmskb %ymm1, %eax
|
|
||||||
+ vpmovmskb %ymm1, %eax
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
/* Check the end of data. */
|
|
||||||
cmpl %eax, %esi
|
|
||||||
@@ -329,6 +368,7 @@ L(last_4x_vec_or_less):
|
|
||||||
addl $(VEC_SIZE + 1), %eax
|
|
||||||
addq %rdi, %rax
|
|
||||||
# ifdef USE_AS_WCSLEN
|
|
||||||
+ /* NB: Divide bytes by 4 to get wchar_t count. */
|
|
||||||
shrq $2, %rax
|
|
||||||
# endif
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
@@ -340,6 +380,7 @@ L(last_vec_return_x0):
|
|
||||||
subq $(VEC_SIZE * 4 - 1), %rdi
|
|
||||||
addq %rdi, %rax
|
|
||||||
# ifdef USE_AS_WCSLEN
|
|
||||||
+ /* NB: Divide bytes by 4 to get wchar_t count. */
|
|
||||||
shrq $2, %rax
|
|
||||||
# endif
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
@@ -350,6 +391,7 @@ L(last_vec_return_x1):
|
|
||||||
subq $(VEC_SIZE * 3 - 1), %rdi
|
|
||||||
addq %rdi, %rax
|
|
||||||
# ifdef USE_AS_WCSLEN
|
|
||||||
+ /* NB: Divide bytes by 4 to get wchar_t count. */
|
|
||||||
shrq $2, %rax
|
|
||||||
# endif
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
@@ -366,6 +408,7 @@ L(last_vec_x1_check):
|
|
||||||
incl %eax
|
|
||||||
addq %rdi, %rax
|
|
||||||
# ifdef USE_AS_WCSLEN
|
|
||||||
+ /* NB: Divide bytes by 4 to get wchar_t count. */
|
|
||||||
shrq $2, %rax
|
|
||||||
# endif
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
@@ -381,14 +424,14 @@ L(last_4x_vec):
|
|
||||||
jnz L(last_vec_x1)
|
|
||||||
|
|
||||||
VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
|
|
||||||
- vpmovmskb %ymm1, %eax
|
|
||||||
+ vpmovmskb %ymm1, %eax
|
|
||||||
testl %eax, %eax
|
|
||||||
jnz L(last_vec_x2)
|
|
||||||
|
|
||||||
/* Normalize length. */
|
|
||||||
andl $(VEC_SIZE * 4 - 1), %esi
|
|
||||||
VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
|
|
||||||
- vpmovmskb %ymm1, %eax
|
|
||||||
+ vpmovmskb %ymm1, %eax
|
|
||||||
testl %eax, %eax
|
|
||||||
jnz L(last_vec_x3)
|
|
||||||
|
|
||||||
@@ -396,7 +439,7 @@ L(last_4x_vec):
|
|
||||||
jb L(max)
|
|
||||||
|
|
||||||
VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
|
|
||||||
- vpmovmskb %ymm1, %eax
|
|
||||||
+ vpmovmskb %ymm1, %eax
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
/* Check the end of data. */
|
|
||||||
cmpl %eax, %esi
|
|
||||||
@@ -405,6 +448,7 @@ L(last_4x_vec):
|
|
||||||
addl $(VEC_SIZE * 3 + 1), %eax
|
|
||||||
addq %rdi, %rax
|
|
||||||
# ifdef USE_AS_WCSLEN
|
|
||||||
+ /* NB: Divide bytes by 4 to get wchar_t count. */
|
|
||||||
shrq $2, %rax
|
|
||||||
# endif
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
@@ -419,6 +463,7 @@ L(last_vec_x1):
|
|
||||||
incl %eax
|
|
||||||
addq %rdi, %rax
|
|
||||||
# ifdef USE_AS_WCSLEN
|
|
||||||
+ /* NB: Divide bytes by 4 to get wchar_t count. */
|
|
||||||
shrq $2, %rax
|
|
||||||
# endif
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
@@ -432,6 +477,7 @@ L(last_vec_x2):
|
|
||||||
addl $(VEC_SIZE + 1), %eax
|
|
||||||
addq %rdi, %rax
|
|
||||||
# ifdef USE_AS_WCSLEN
|
|
||||||
+ /* NB: Divide bytes by 4 to get wchar_t count. */
|
|
||||||
shrq $2, %rax
|
|
||||||
# endif
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
@@ -447,6 +493,7 @@ L(last_vec_x3):
|
|
||||||
addl $(VEC_SIZE * 2 + 1), %eax
|
|
||||||
addq %rdi, %rax
|
|
||||||
# ifdef USE_AS_WCSLEN
|
|
||||||
+ /* NB: Divide bytes by 4 to get wchar_t count. */
|
|
||||||
shrq $2, %rax
|
|
||||||
# endif
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
@@ -455,13 +502,13 @@ L(max_end):
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
# endif
|
|
||||||
|
|
||||||
- /* Cold case for crossing page with first load. */
|
|
||||||
+ /* Cold case for crossing page with first load. */
|
|
||||||
.p2align 4
|
|
||||||
L(cross_page_boundary):
|
|
||||||
/* Align data to VEC_SIZE - 1. */
|
|
||||||
orq $(VEC_SIZE - 1), %rdi
|
|
||||||
VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
|
|
||||||
- vpmovmskb %ymm1, %eax
|
|
||||||
+ vpmovmskb %ymm1, %eax
|
|
||||||
/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
|
|
||||||
so no need to manually mod rdx. */
|
|
||||||
sarxl %edx, %eax, %eax
|
|
||||||
@@ -470,6 +517,10 @@ L(cross_page_boundary):
|
|
||||||
jnz L(cross_page_less_vec)
|
|
||||||
leaq 1(%rdi), %rcx
|
|
||||||
subq %rdx, %rcx
|
|
||||||
+# ifdef USE_AS_WCSLEN
|
|
||||||
+ /* NB: Divide bytes by 4 to get wchar_t count. */
|
|
||||||
+ shrl $2, %ecx
|
|
||||||
+# endif
|
|
||||||
/* Check length. */
|
|
||||||
cmpq %rsi, %rcx
|
|
||||||
jb L(cross_page_continue)
|
|
||||||
@@ -479,6 +530,7 @@ L(cross_page_boundary):
|
|
||||||
jz L(cross_page_continue)
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
# ifdef USE_AS_WCSLEN
|
|
||||||
+ /* NB: Divide length by 4 to get wchar_t count. */
|
|
||||||
shrl $2, %eax
|
|
||||||
# endif
|
|
||||||
# endif
|
|
||||||
@@ -489,6 +541,10 @@ L(return_vzeroupper):
|
|
||||||
.p2align 4
|
|
||||||
L(cross_page_less_vec):
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
+# ifdef USE_AS_WCSLEN
|
|
||||||
+ /* NB: Multiply length by 4 to get byte count. */
|
|
||||||
+ sall $2, %esi
|
|
||||||
+# endif
|
|
||||||
cmpq %rax, %rsi
|
|
||||||
cmovb %esi, %eax
|
|
||||||
# ifdef USE_AS_WCSLEN
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
|
|
||||||
index 8f660bb9..439e486a 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/strlen-vec.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/strlen-vec.S
|
|
||||||
@@ -65,12 +65,25 @@ ENTRY(strlen)
|
|
||||||
ret
|
|
||||||
L(n_nonzero):
|
|
||||||
# ifdef AS_WCSLEN
|
|
||||||
- shl $2, %RSI_LP
|
|
||||||
+/* Check for overflow from maxlen * sizeof(wchar_t). If it would
|
|
||||||
+ overflow the only way this program doesn't have undefined behavior
|
|
||||||
+ is if there is a null terminator in valid memory so wcslen will
|
|
||||||
+ suffice. */
|
|
||||||
+ mov %RSI_LP, %R10_LP
|
|
||||||
+ sar $62, %R10_LP
|
|
||||||
+ test %R10_LP, %R10_LP
|
|
||||||
+ jnz __wcslen_sse4_1
|
|
||||||
+ sal $2, %RSI_LP
|
|
||||||
# endif
|
|
||||||
|
|
||||||
+
|
|
||||||
/* Initialize long lived registers. */
|
|
||||||
|
|
||||||
add %RDI_LP, %RSI_LP
|
|
||||||
+# ifdef AS_WCSLEN
|
|
||||||
+/* Check for overflow again from s + maxlen * sizeof(wchar_t). */
|
|
||||||
+ jbe __wcslen_sse4_1
|
|
||||||
+# endif
|
|
||||||
mov %RSI_LP, %R10_LP
|
|
||||||
and $-64, %R10_LP
|
|
||||||
mov %RSI_LP, %R11_LP
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,745 +0,0 @@
|
|||||||
From 4ba65586847751372520a36757c17f114588794e Mon Sep 17 00:00:00 2001
|
|
||||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
Date: Mon, 19 Apr 2021 19:36:06 -0400
|
|
||||||
Subject: [PATCH] x86: Optimize strlen-evex.S
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
No bug. This commit optimizes strlen-evex.S. The
|
|
||||||
optimizations are mostly small things but they add up to roughly
|
|
||||||
10-30% performance improvement for strlen. The results for strnlen are
|
|
||||||
bit more ambiguous. test-strlen, test-strnlen, test-wcslen, and
|
|
||||||
test-wcsnlen are all passing.
|
|
||||||
|
|
||||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/multiarch/strlen-evex.S | 581 ++++++++++++++-----------
|
|
||||||
1 file changed, 317 insertions(+), 264 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S
|
|
||||||
index 05838190..4bf6874b 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/strlen-evex.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/strlen-evex.S
|
|
||||||
@@ -29,11 +29,13 @@
|
|
||||||
# ifdef USE_AS_WCSLEN
|
|
||||||
# define VPCMP vpcmpd
|
|
||||||
# define VPMINU vpminud
|
|
||||||
-# define SHIFT_REG r9d
|
|
||||||
+# define SHIFT_REG ecx
|
|
||||||
+# define CHAR_SIZE 4
|
|
||||||
# else
|
|
||||||
# define VPCMP vpcmpb
|
|
||||||
# define VPMINU vpminub
|
|
||||||
-# define SHIFT_REG ecx
|
|
||||||
+# define SHIFT_REG edx
|
|
||||||
+# define CHAR_SIZE 1
|
|
||||||
# endif
|
|
||||||
|
|
||||||
# define XMMZERO xmm16
|
|
||||||
@@ -46,132 +48,165 @@
|
|
||||||
# define YMM6 ymm22
|
|
||||||
|
|
||||||
# define VEC_SIZE 32
|
|
||||||
+# define PAGE_SIZE 4096
|
|
||||||
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
|
|
||||||
|
|
||||||
.section .text.evex,"ax",@progbits
|
|
||||||
ENTRY (STRLEN)
|
|
||||||
# ifdef USE_AS_STRNLEN
|
|
||||||
- /* Check for zero length. */
|
|
||||||
+ /* Check zero length. */
|
|
||||||
test %RSI_LP, %RSI_LP
|
|
||||||
jz L(zero)
|
|
||||||
-# ifdef USE_AS_WCSLEN
|
|
||||||
- shl $2, %RSI_LP
|
|
||||||
-# elif defined __ILP32__
|
|
||||||
+# ifdef __ILP32__
|
|
||||||
/* Clear the upper 32 bits. */
|
|
||||||
movl %esi, %esi
|
|
||||||
# endif
|
|
||||||
mov %RSI_LP, %R8_LP
|
|
||||||
# endif
|
|
||||||
- movl %edi, %ecx
|
|
||||||
- movq %rdi, %rdx
|
|
||||||
+ movl %edi, %eax
|
|
||||||
vpxorq %XMMZERO, %XMMZERO, %XMMZERO
|
|
||||||
-
|
|
||||||
+ /* Clear high bits from edi. Only keeping bits relevant to page
|
|
||||||
+ cross check. */
|
|
||||||
+ andl $(PAGE_SIZE - 1), %eax
|
|
||||||
/* Check if we may cross page boundary with one vector load. */
|
|
||||||
- andl $(2 * VEC_SIZE - 1), %ecx
|
|
||||||
- cmpl $VEC_SIZE, %ecx
|
|
||||||
- ja L(cros_page_boundary)
|
|
||||||
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
|
|
||||||
+ ja L(cross_page_boundary)
|
|
||||||
|
|
||||||
/* Check the first VEC_SIZE bytes. Each bit in K0 represents a
|
|
||||||
null byte. */
|
|
||||||
VPCMP $0, (%rdi), %YMMZERO, %k0
|
|
||||||
kmovd %k0, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
-
|
|
||||||
# ifdef USE_AS_STRNLEN
|
|
||||||
- jnz L(first_vec_x0_check)
|
|
||||||
- /* Adjust length and check the end of data. */
|
|
||||||
- subq $VEC_SIZE, %rsi
|
|
||||||
- jbe L(max)
|
|
||||||
-# else
|
|
||||||
- jnz L(first_vec_x0)
|
|
||||||
+ /* If length < CHAR_PER_VEC handle special. */
|
|
||||||
+ cmpq $CHAR_PER_VEC, %rsi
|
|
||||||
+ jbe L(first_vec_x0)
|
|
||||||
# endif
|
|
||||||
-
|
|
||||||
- /* Align data for aligned loads in the loop. */
|
|
||||||
- addq $VEC_SIZE, %rdi
|
|
||||||
- andl $(VEC_SIZE - 1), %ecx
|
|
||||||
- andq $-VEC_SIZE, %rdi
|
|
||||||
-
|
|
||||||
+ testl %eax, %eax
|
|
||||||
+ jz L(aligned_more)
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+ ret
|
|
||||||
# ifdef USE_AS_STRNLEN
|
|
||||||
- /* Adjust length. */
|
|
||||||
- addq %rcx, %rsi
|
|
||||||
+L(zero):
|
|
||||||
+ xorl %eax, %eax
|
|
||||||
+ ret
|
|
||||||
|
|
||||||
- subq $(VEC_SIZE * 4), %rsi
|
|
||||||
- jbe L(last_4x_vec_or_less)
|
|
||||||
+ .p2align 4
|
|
||||||
+L(first_vec_x0):
|
|
||||||
+ /* Set bit for max len so that tzcnt will return min of max len
|
|
||||||
+ and position of first match. */
|
|
||||||
+ btsq %rsi, %rax
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+ ret
|
|
||||||
# endif
|
|
||||||
- jmp L(more_4x_vec)
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
-L(cros_page_boundary):
|
|
||||||
- andl $(VEC_SIZE - 1), %ecx
|
|
||||||
- andq $-VEC_SIZE, %rdi
|
|
||||||
-
|
|
||||||
-# ifdef USE_AS_WCSLEN
|
|
||||||
- /* NB: Divide shift count by 4 since each bit in K0 represent 4
|
|
||||||
- bytes. */
|
|
||||||
- movl %ecx, %SHIFT_REG
|
|
||||||
- sarl $2, %SHIFT_REG
|
|
||||||
+L(first_vec_x1):
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+ /* Safe to use 32 bit instructions as these are only called for
|
|
||||||
+ size = [1, 159]. */
|
|
||||||
+# ifdef USE_AS_STRNLEN
|
|
||||||
+ /* Use ecx which was computed earlier to compute correct value.
|
|
||||||
+ */
|
|
||||||
+ leal -(CHAR_PER_VEC * 4 + 1)(%rcx, %rax), %eax
|
|
||||||
+# else
|
|
||||||
+ subl %edx, %edi
|
|
||||||
+# ifdef USE_AS_WCSLEN
|
|
||||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
|
||||||
+ sarl $2, %edi
|
|
||||||
+# endif
|
|
||||||
+ leal CHAR_PER_VEC(%rdi, %rax), %eax
|
|
||||||
# endif
|
|
||||||
- VPCMP $0, (%rdi), %YMMZERO, %k0
|
|
||||||
- kmovd %k0, %eax
|
|
||||||
+ ret
|
|
||||||
|
|
||||||
- /* Remove the leading bytes. */
|
|
||||||
- sarxl %SHIFT_REG, %eax, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
- jz L(aligned_more)
|
|
||||||
+ .p2align 4
|
|
||||||
+L(first_vec_x2):
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
-# ifdef USE_AS_WCSLEN
|
|
||||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
||||||
- sall $2, %eax
|
|
||||||
-# endif
|
|
||||||
+ /* Safe to use 32 bit instructions as these are only called for
|
|
||||||
+ size = [1, 159]. */
|
|
||||||
# ifdef USE_AS_STRNLEN
|
|
||||||
- /* Check the end of data. */
|
|
||||||
- cmpq %rax, %rsi
|
|
||||||
- jbe L(max)
|
|
||||||
-# endif
|
|
||||||
- addq %rdi, %rax
|
|
||||||
- addq %rcx, %rax
|
|
||||||
- subq %rdx, %rax
|
|
||||||
-# ifdef USE_AS_WCSLEN
|
|
||||||
- shrq $2, %rax
|
|
||||||
+ /* Use ecx which was computed earlier to compute correct value.
|
|
||||||
+ */
|
|
||||||
+ leal -(CHAR_PER_VEC * 3 + 1)(%rcx, %rax), %eax
|
|
||||||
+# else
|
|
||||||
+ subl %edx, %edi
|
|
||||||
+# ifdef USE_AS_WCSLEN
|
|
||||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
|
||||||
+ sarl $2, %edi
|
|
||||||
+# endif
|
|
||||||
+ leal (CHAR_PER_VEC * 2)(%rdi, %rax), %eax
|
|
||||||
# endif
|
|
||||||
ret
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
-L(aligned_more):
|
|
||||||
+L(first_vec_x3):
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+ /* Safe to use 32 bit instructions as these are only called for
|
|
||||||
+ size = [1, 159]. */
|
|
||||||
# ifdef USE_AS_STRNLEN
|
|
||||||
- /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE"
|
|
||||||
- with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE"
|
|
||||||
- to void possible addition overflow. */
|
|
||||||
- negq %rcx
|
|
||||||
- addq $VEC_SIZE, %rcx
|
|
||||||
-
|
|
||||||
- /* Check the end of data. */
|
|
||||||
- subq %rcx, %rsi
|
|
||||||
- jbe L(max)
|
|
||||||
+ /* Use ecx which was computed earlier to compute correct value.
|
|
||||||
+ */
|
|
||||||
+ leal -(CHAR_PER_VEC * 2 + 1)(%rcx, %rax), %eax
|
|
||||||
+# else
|
|
||||||
+ subl %edx, %edi
|
|
||||||
+# ifdef USE_AS_WCSLEN
|
|
||||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
|
||||||
+ sarl $2, %edi
|
|
||||||
+# endif
|
|
||||||
+ leal (CHAR_PER_VEC * 3)(%rdi, %rax), %eax
|
|
||||||
# endif
|
|
||||||
+ ret
|
|
||||||
|
|
||||||
- addq $VEC_SIZE, %rdi
|
|
||||||
-
|
|
||||||
+ .p2align 4
|
|
||||||
+L(first_vec_x4):
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+ /* Safe to use 32 bit instructions as these are only called for
|
|
||||||
+ size = [1, 159]. */
|
|
||||||
# ifdef USE_AS_STRNLEN
|
|
||||||
- subq $(VEC_SIZE * 4), %rsi
|
|
||||||
- jbe L(last_4x_vec_or_less)
|
|
||||||
+ /* Use ecx which was computed earlier to compute correct value.
|
|
||||||
+ */
|
|
||||||
+ leal -(CHAR_PER_VEC + 1)(%rcx, %rax), %eax
|
|
||||||
+# else
|
|
||||||
+ subl %edx, %edi
|
|
||||||
+# ifdef USE_AS_WCSLEN
|
|
||||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
|
||||||
+ sarl $2, %edi
|
|
||||||
+# endif
|
|
||||||
+ leal (CHAR_PER_VEC * 4)(%rdi, %rax), %eax
|
|
||||||
# endif
|
|
||||||
+ ret
|
|
||||||
|
|
||||||
-L(more_4x_vec):
|
|
||||||
+ .p2align 5
|
|
||||||
+L(aligned_more):
|
|
||||||
+ movq %rdi, %rdx
|
|
||||||
+ /* Align data to VEC_SIZE. */
|
|
||||||
+ andq $-(VEC_SIZE), %rdi
|
|
||||||
+L(cross_page_continue):
|
|
||||||
/* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
|
|
||||||
since data is only aligned to VEC_SIZE. */
|
|
||||||
- VPCMP $0, (%rdi), %YMMZERO, %k0
|
|
||||||
- kmovd %k0, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x0)
|
|
||||||
-
|
|
||||||
+# ifdef USE_AS_STRNLEN
|
|
||||||
+ /* + CHAR_SIZE because it simplies the logic in
|
|
||||||
+ last_4x_vec_or_less. */
|
|
||||||
+ leaq (VEC_SIZE * 5 + CHAR_SIZE)(%rdi), %rcx
|
|
||||||
+ subq %rdx, %rcx
|
|
||||||
+# ifdef USE_AS_WCSLEN
|
|
||||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
|
||||||
+ sarl $2, %ecx
|
|
||||||
+# endif
|
|
||||||
+# endif
|
|
||||||
+ /* Load first VEC regardless. */
|
|
||||||
VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0
|
|
||||||
+# ifdef USE_AS_STRNLEN
|
|
||||||
+ /* Adjust length. If near end handle specially. */
|
|
||||||
+ subq %rcx, %rsi
|
|
||||||
+ jb L(last_4x_vec_or_less)
|
|
||||||
+# endif
|
|
||||||
kmovd %k0, %eax
|
|
||||||
testl %eax, %eax
|
|
||||||
jnz L(first_vec_x1)
|
|
||||||
|
|
||||||
VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
|
|
||||||
kmovd %k0, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
+ test %eax, %eax
|
|
||||||
jnz L(first_vec_x2)
|
|
||||||
|
|
||||||
VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
|
|
||||||
@@ -179,258 +214,276 @@ L(more_4x_vec):
|
|
||||||
testl %eax, %eax
|
|
||||||
jnz L(first_vec_x3)
|
|
||||||
|
|
||||||
- addq $(VEC_SIZE * 4), %rdi
|
|
||||||
-
|
|
||||||
-# ifdef USE_AS_STRNLEN
|
|
||||||
- subq $(VEC_SIZE * 4), %rsi
|
|
||||||
- jbe L(last_4x_vec_or_less)
|
|
||||||
-# endif
|
|
||||||
-
|
|
||||||
- /* Align data to 4 * VEC_SIZE. */
|
|
||||||
- movq %rdi, %rcx
|
|
||||||
- andl $(4 * VEC_SIZE - 1), %ecx
|
|
||||||
- andq $-(4 * VEC_SIZE), %rdi
|
|
||||||
+ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
|
|
||||||
+ kmovd %k0, %eax
|
|
||||||
+ testl %eax, %eax
|
|
||||||
+ jnz L(first_vec_x4)
|
|
||||||
|
|
||||||
+ addq $VEC_SIZE, %rdi
|
|
||||||
# ifdef USE_AS_STRNLEN
|
|
||||||
- /* Adjust length. */
|
|
||||||
+ /* Check if at last VEC_SIZE * 4 length. */
|
|
||||||
+ cmpq $(CHAR_PER_VEC * 4 - 1), %rsi
|
|
||||||
+ jbe L(last_4x_vec_or_less_load)
|
|
||||||
+ movl %edi, %ecx
|
|
||||||
+ andl $(VEC_SIZE * 4 - 1), %ecx
|
|
||||||
+# ifdef USE_AS_WCSLEN
|
|
||||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
|
||||||
+ sarl $2, %ecx
|
|
||||||
+# endif
|
|
||||||
+ /* Readjust length. */
|
|
||||||
addq %rcx, %rsi
|
|
||||||
# endif
|
|
||||||
+ /* Align data to VEC_SIZE * 4. */
|
|
||||||
+ andq $-(VEC_SIZE * 4), %rdi
|
|
||||||
|
|
||||||
+ /* Compare 4 * VEC at a time forward. */
|
|
||||||
.p2align 4
|
|
||||||
L(loop_4x_vec):
|
|
||||||
- /* Compare 4 * VEC at a time forward. */
|
|
||||||
- VMOVA (%rdi), %YMM1
|
|
||||||
- VMOVA VEC_SIZE(%rdi), %YMM2
|
|
||||||
- VMOVA (VEC_SIZE * 2)(%rdi), %YMM3
|
|
||||||
- VMOVA (VEC_SIZE * 3)(%rdi), %YMM4
|
|
||||||
-
|
|
||||||
- VPMINU %YMM1, %YMM2, %YMM5
|
|
||||||
- VPMINU %YMM3, %YMM4, %YMM6
|
|
||||||
+ /* Load first VEC regardless. */
|
|
||||||
+ VMOVA (VEC_SIZE * 4)(%rdi), %YMM1
|
|
||||||
+# ifdef USE_AS_STRNLEN
|
|
||||||
+ /* Break if at end of length. */
|
|
||||||
+ subq $(CHAR_PER_VEC * 4), %rsi
|
|
||||||
+ jb L(last_4x_vec_or_less_cmpeq)
|
|
||||||
+# endif
|
|
||||||
+ /* Save some code size by microfusing VPMINU with the load. Since
|
|
||||||
+ the matches in ymm2/ymm4 can only be returned if there where no
|
|
||||||
+ matches in ymm1/ymm3 respectively there is no issue with overlap.
|
|
||||||
+ */
|
|
||||||
+ VPMINU (VEC_SIZE * 5)(%rdi), %YMM1, %YMM2
|
|
||||||
+ VMOVA (VEC_SIZE * 6)(%rdi), %YMM3
|
|
||||||
+ VPMINU (VEC_SIZE * 7)(%rdi), %YMM3, %YMM4
|
|
||||||
+
|
|
||||||
+ VPCMP $0, %YMM2, %YMMZERO, %k0
|
|
||||||
+ VPCMP $0, %YMM4, %YMMZERO, %k1
|
|
||||||
+ subq $-(VEC_SIZE * 4), %rdi
|
|
||||||
+ kortestd %k0, %k1
|
|
||||||
+ jz L(loop_4x_vec)
|
|
||||||
+
|
|
||||||
+ /* Check if end was in first half. */
|
|
||||||
+ kmovd %k0, %eax
|
|
||||||
+ subq %rdx, %rdi
|
|
||||||
+# ifdef USE_AS_WCSLEN
|
|
||||||
+ shrq $2, %rdi
|
|
||||||
+# endif
|
|
||||||
+ testl %eax, %eax
|
|
||||||
+ jz L(second_vec_return)
|
|
||||||
|
|
||||||
- VPMINU %YMM5, %YMM6, %YMM5
|
|
||||||
- VPCMP $0, %YMM5, %YMMZERO, %k0
|
|
||||||
- ktestd %k0, %k0
|
|
||||||
- jnz L(4x_vec_end)
|
|
||||||
+ VPCMP $0, %YMM1, %YMMZERO, %k2
|
|
||||||
+ kmovd %k2, %edx
|
|
||||||
+ /* Combine VEC1 matches (edx) with VEC2 matches (eax). */
|
|
||||||
+# ifdef USE_AS_WCSLEN
|
|
||||||
+ sall $CHAR_PER_VEC, %eax
|
|
||||||
+ orl %edx, %eax
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+# else
|
|
||||||
+ salq $CHAR_PER_VEC, %rax
|
|
||||||
+ orq %rdx, %rax
|
|
||||||
+ tzcntq %rax, %rax
|
|
||||||
+# endif
|
|
||||||
+ addq %rdi, %rax
|
|
||||||
+ ret
|
|
||||||
|
|
||||||
- addq $(VEC_SIZE * 4), %rdi
|
|
||||||
|
|
||||||
-# ifndef USE_AS_STRNLEN
|
|
||||||
- jmp L(loop_4x_vec)
|
|
||||||
-# else
|
|
||||||
- subq $(VEC_SIZE * 4), %rsi
|
|
||||||
- ja L(loop_4x_vec)
|
|
||||||
+# ifdef USE_AS_STRNLEN
|
|
||||||
|
|
||||||
+L(last_4x_vec_or_less_load):
|
|
||||||
+ /* Depending on entry adjust rdi / prepare first VEC in YMM1. */
|
|
||||||
+ VMOVA (VEC_SIZE * 4)(%rdi), %YMM1
|
|
||||||
+L(last_4x_vec_or_less_cmpeq):
|
|
||||||
+ VPCMP $0, %YMM1, %YMMZERO, %k0
|
|
||||||
+ addq $(VEC_SIZE * 3), %rdi
|
|
||||||
L(last_4x_vec_or_less):
|
|
||||||
- /* Less than 4 * VEC and aligned to VEC_SIZE. */
|
|
||||||
- addl $(VEC_SIZE * 2), %esi
|
|
||||||
- jle L(last_2x_vec)
|
|
||||||
-
|
|
||||||
- VPCMP $0, (%rdi), %YMMZERO, %k0
|
|
||||||
kmovd %k0, %eax
|
|
||||||
+ /* If remaining length > VEC_SIZE * 2. This works if esi is off by
|
|
||||||
+ VEC_SIZE * 4. */
|
|
||||||
+ testl $(CHAR_PER_VEC * 2), %esi
|
|
||||||
+ jnz L(last_4x_vec)
|
|
||||||
+
|
|
||||||
+ /* length may have been negative or positive by an offset of
|
|
||||||
+ CHAR_PER_VEC * 4 depending on where this was called from. This
|
|
||||||
+ fixes that. */
|
|
||||||
+ andl $(CHAR_PER_VEC * 4 - 1), %esi
|
|
||||||
testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x0)
|
|
||||||
+ jnz L(last_vec_x1_check)
|
|
||||||
|
|
||||||
- VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0
|
|
||||||
- kmovd %k0, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x1)
|
|
||||||
+ /* Check the end of data. */
|
|
||||||
+ subl $CHAR_PER_VEC, %esi
|
|
||||||
+ jb L(max)
|
|
||||||
|
|
||||||
VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
|
|
||||||
kmovd %k0, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x2_check)
|
|
||||||
- subl $VEC_SIZE, %esi
|
|
||||||
- jle L(max)
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+ /* Check the end of data. */
|
|
||||||
+ cmpl %eax, %esi
|
|
||||||
+ jb L(max)
|
|
||||||
|
|
||||||
- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
|
|
||||||
- kmovd %k0, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x3_check)
|
|
||||||
+ subq %rdx, %rdi
|
|
||||||
+# ifdef USE_AS_WCSLEN
|
|
||||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
|
||||||
+ sarq $2, %rdi
|
|
||||||
+# endif
|
|
||||||
+ leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax
|
|
||||||
+ ret
|
|
||||||
+L(max):
|
|
||||||
movq %r8, %rax
|
|
||||||
+ ret
|
|
||||||
+# endif
|
|
||||||
+
|
|
||||||
+ /* Placed here in strnlen so that the jcc L(last_4x_vec_or_less)
|
|
||||||
+ in the 4x VEC loop can use 2 byte encoding. */
|
|
||||||
+ .p2align 4
|
|
||||||
+L(second_vec_return):
|
|
||||||
+ VPCMP $0, %YMM3, %YMMZERO, %k0
|
|
||||||
+ /* Combine YMM3 matches (k0) with YMM4 matches (k1). */
|
|
||||||
+# ifdef USE_AS_WCSLEN
|
|
||||||
+ kunpckbw %k0, %k1, %k0
|
|
||||||
+ kmovd %k0, %eax
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+# else
|
|
||||||
+ kunpckdq %k0, %k1, %k0
|
|
||||||
+ kmovq %k0, %rax
|
|
||||||
+ tzcntq %rax, %rax
|
|
||||||
+# endif
|
|
||||||
+ leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax
|
|
||||||
+ ret
|
|
||||||
+
|
|
||||||
+
|
|
||||||
+# ifdef USE_AS_STRNLEN
|
|
||||||
+L(last_vec_x1_check):
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+ /* Check the end of data. */
|
|
||||||
+ cmpl %eax, %esi
|
|
||||||
+ jb L(max)
|
|
||||||
+ subq %rdx, %rdi
|
|
||||||
# ifdef USE_AS_WCSLEN
|
|
||||||
- shrq $2, %rax
|
|
||||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
|
||||||
+ sarq $2, %rdi
|
|
||||||
# endif
|
|
||||||
+ leaq (CHAR_PER_VEC)(%rdi, %rax), %rax
|
|
||||||
ret
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
-L(last_2x_vec):
|
|
||||||
- addl $(VEC_SIZE * 2), %esi
|
|
||||||
+L(last_4x_vec):
|
|
||||||
+ /* Test first 2x VEC normally. */
|
|
||||||
+ testl %eax, %eax
|
|
||||||
+ jnz L(last_vec_x1)
|
|
||||||
|
|
||||||
- VPCMP $0, (%rdi), %YMMZERO, %k0
|
|
||||||
+ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
|
|
||||||
kmovd %k0, %eax
|
|
||||||
testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x0_check)
|
|
||||||
- subl $VEC_SIZE, %esi
|
|
||||||
- jle L(max)
|
|
||||||
+ jnz L(last_vec_x2)
|
|
||||||
|
|
||||||
- VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0
|
|
||||||
+ /* Normalize length. */
|
|
||||||
+ andl $(CHAR_PER_VEC * 4 - 1), %esi
|
|
||||||
+ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
|
|
||||||
kmovd %k0, %eax
|
|
||||||
testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x1_check)
|
|
||||||
- movq %r8, %rax
|
|
||||||
-# ifdef USE_AS_WCSLEN
|
|
||||||
- shrq $2, %rax
|
|
||||||
-# endif
|
|
||||||
- ret
|
|
||||||
+ jnz L(last_vec_x3)
|
|
||||||
|
|
||||||
- .p2align 4
|
|
||||||
-L(first_vec_x0_check):
|
|
||||||
+ /* Check the end of data. */
|
|
||||||
+ subl $(CHAR_PER_VEC * 3), %esi
|
|
||||||
+ jb L(max)
|
|
||||||
+
|
|
||||||
+ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
|
|
||||||
+ kmovd %k0, %eax
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
-# ifdef USE_AS_WCSLEN
|
|
||||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
||||||
- sall $2, %eax
|
|
||||||
-# endif
|
|
||||||
/* Check the end of data. */
|
|
||||||
- cmpq %rax, %rsi
|
|
||||||
- jbe L(max)
|
|
||||||
- addq %rdi, %rax
|
|
||||||
- subq %rdx, %rax
|
|
||||||
+ cmpl %eax, %esi
|
|
||||||
+ jb L(max_end)
|
|
||||||
+
|
|
||||||
+ subq %rdx, %rdi
|
|
||||||
# ifdef USE_AS_WCSLEN
|
|
||||||
- shrq $2, %rax
|
|
||||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
|
||||||
+ sarq $2, %rdi
|
|
||||||
# endif
|
|
||||||
+ leaq (CHAR_PER_VEC * 4)(%rdi, %rax), %rax
|
|
||||||
ret
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
-L(first_vec_x1_check):
|
|
||||||
+L(last_vec_x1):
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
+ subq %rdx, %rdi
|
|
||||||
# ifdef USE_AS_WCSLEN
|
|
||||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
||||||
- sall $2, %eax
|
|
||||||
-# endif
|
|
||||||
- /* Check the end of data. */
|
|
||||||
- cmpq %rax, %rsi
|
|
||||||
- jbe L(max)
|
|
||||||
- addq $VEC_SIZE, %rax
|
|
||||||
- addq %rdi, %rax
|
|
||||||
- subq %rdx, %rax
|
|
||||||
-# ifdef USE_AS_WCSLEN
|
|
||||||
- shrq $2, %rax
|
|
||||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
|
||||||
+ sarq $2, %rdi
|
|
||||||
# endif
|
|
||||||
+ leaq (CHAR_PER_VEC)(%rdi, %rax), %rax
|
|
||||||
ret
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
-L(first_vec_x2_check):
|
|
||||||
+L(last_vec_x2):
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
+ subq %rdx, %rdi
|
|
||||||
# ifdef USE_AS_WCSLEN
|
|
||||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
||||||
- sall $2, %eax
|
|
||||||
-# endif
|
|
||||||
- /* Check the end of data. */
|
|
||||||
- cmpq %rax, %rsi
|
|
||||||
- jbe L(max)
|
|
||||||
- addq $(VEC_SIZE * 2), %rax
|
|
||||||
- addq %rdi, %rax
|
|
||||||
- subq %rdx, %rax
|
|
||||||
-# ifdef USE_AS_WCSLEN
|
|
||||||
- shrq $2, %rax
|
|
||||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
|
||||||
+ sarq $2, %rdi
|
|
||||||
# endif
|
|
||||||
+ leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax
|
|
||||||
ret
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
-L(first_vec_x3_check):
|
|
||||||
+L(last_vec_x3):
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
-# ifdef USE_AS_WCSLEN
|
|
||||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
||||||
- sall $2, %eax
|
|
||||||
-# endif
|
|
||||||
+ subl $(CHAR_PER_VEC * 2), %esi
|
|
||||||
/* Check the end of data. */
|
|
||||||
- cmpq %rax, %rsi
|
|
||||||
- jbe L(max)
|
|
||||||
- addq $(VEC_SIZE * 3), %rax
|
|
||||||
- addq %rdi, %rax
|
|
||||||
- subq %rdx, %rax
|
|
||||||
+ cmpl %eax, %esi
|
|
||||||
+ jb L(max_end)
|
|
||||||
+ subq %rdx, %rdi
|
|
||||||
# ifdef USE_AS_WCSLEN
|
|
||||||
- shrq $2, %rax
|
|
||||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
|
||||||
+ sarq $2, %rdi
|
|
||||||
# endif
|
|
||||||
+ leaq (CHAR_PER_VEC * 3)(%rdi, %rax), %rax
|
|
||||||
ret
|
|
||||||
-
|
|
||||||
- .p2align 4
|
|
||||||
-L(max):
|
|
||||||
+L(max_end):
|
|
||||||
movq %r8, %rax
|
|
||||||
-# ifdef USE_AS_WCSLEN
|
|
||||||
- shrq $2, %rax
|
|
||||||
-# endif
|
|
||||||
- ret
|
|
||||||
-
|
|
||||||
- .p2align 4
|
|
||||||
-L(zero):
|
|
||||||
- xorl %eax, %eax
|
|
||||||
ret
|
|
||||||
# endif
|
|
||||||
|
|
||||||
+ /* Cold case for crossing page with first load. */
|
|
||||||
.p2align 4
|
|
||||||
-L(first_vec_x0):
|
|
||||||
- tzcntl %eax, %eax
|
|
||||||
-# ifdef USE_AS_WCSLEN
|
|
||||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
||||||
- sall $2, %eax
|
|
||||||
-# endif
|
|
||||||
- addq %rdi, %rax
|
|
||||||
- subq %rdx, %rax
|
|
||||||
+L(cross_page_boundary):
|
|
||||||
+ movq %rdi, %rdx
|
|
||||||
+ /* Align data to VEC_SIZE. */
|
|
||||||
+ andq $-VEC_SIZE, %rdi
|
|
||||||
+ VPCMP $0, (%rdi), %YMMZERO, %k0
|
|
||||||
+ kmovd %k0, %eax
|
|
||||||
+ /* Remove the leading bytes. */
|
|
||||||
# ifdef USE_AS_WCSLEN
|
|
||||||
- shrq $2, %rax
|
|
||||||
+ /* NB: Divide shift count by 4 since each bit in K0 represent 4
|
|
||||||
+ bytes. */
|
|
||||||
+ movl %edx, %ecx
|
|
||||||
+ shrl $2, %ecx
|
|
||||||
+ andl $(CHAR_PER_VEC - 1), %ecx
|
|
||||||
# endif
|
|
||||||
- ret
|
|
||||||
-
|
|
||||||
- .p2align 4
|
|
||||||
-L(first_vec_x1):
|
|
||||||
+ /* SHIFT_REG is ecx for USE_AS_WCSLEN and edx otherwise. */
|
|
||||||
+ sarxl %SHIFT_REG, %eax, %eax
|
|
||||||
+ testl %eax, %eax
|
|
||||||
+# ifndef USE_AS_STRNLEN
|
|
||||||
+ jz L(cross_page_continue)
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
-# ifdef USE_AS_WCSLEN
|
|
||||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
||||||
- sall $2, %eax
|
|
||||||
-# endif
|
|
||||||
- addq $VEC_SIZE, %rax
|
|
||||||
- addq %rdi, %rax
|
|
||||||
- subq %rdx, %rax
|
|
||||||
-# ifdef USE_AS_WCSLEN
|
|
||||||
- shrq $2, %rax
|
|
||||||
-# endif
|
|
||||||
ret
|
|
||||||
-
|
|
||||||
- .p2align 4
|
|
||||||
-L(first_vec_x2):
|
|
||||||
- tzcntl %eax, %eax
|
|
||||||
-# ifdef USE_AS_WCSLEN
|
|
||||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
||||||
- sall $2, %eax
|
|
||||||
-# endif
|
|
||||||
- addq $(VEC_SIZE * 2), %rax
|
|
||||||
- addq %rdi, %rax
|
|
||||||
- subq %rdx, %rax
|
|
||||||
-# ifdef USE_AS_WCSLEN
|
|
||||||
- shrq $2, %rax
|
|
||||||
-# endif
|
|
||||||
+# else
|
|
||||||
+ jnz L(cross_page_less_vec)
|
|
||||||
+# ifndef USE_AS_WCSLEN
|
|
||||||
+ movl %edx, %ecx
|
|
||||||
+ andl $(CHAR_PER_VEC - 1), %ecx
|
|
||||||
+# endif
|
|
||||||
+ movl $CHAR_PER_VEC, %eax
|
|
||||||
+ subl %ecx, %eax
|
|
||||||
+ /* Check the end of data. */
|
|
||||||
+ cmpq %rax, %rsi
|
|
||||||
+ ja L(cross_page_continue)
|
|
||||||
+ movl %esi, %eax
|
|
||||||
ret
|
|
||||||
-
|
|
||||||
- .p2align 4
|
|
||||||
-L(4x_vec_end):
|
|
||||||
- VPCMP $0, %YMM1, %YMMZERO, %k0
|
|
||||||
- kmovd %k0, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x0)
|
|
||||||
- VPCMP $0, %YMM2, %YMMZERO, %k1
|
|
||||||
- kmovd %k1, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x1)
|
|
||||||
- VPCMP $0, %YMM3, %YMMZERO, %k2
|
|
||||||
- kmovd %k2, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x2)
|
|
||||||
- VPCMP $0, %YMM4, %YMMZERO, %k3
|
|
||||||
- kmovd %k3, %eax
|
|
||||||
-L(first_vec_x3):
|
|
||||||
+L(cross_page_less_vec):
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
-# ifdef USE_AS_WCSLEN
|
|
||||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
||||||
- sall $2, %eax
|
|
||||||
-# endif
|
|
||||||
- addq $(VEC_SIZE * 3), %rax
|
|
||||||
- addq %rdi, %rax
|
|
||||||
- subq %rdx, %rax
|
|
||||||
-# ifdef USE_AS_WCSLEN
|
|
||||||
- shrq $2, %rax
|
|
||||||
-# endif
|
|
||||||
+ /* Select min of length and position of first null. */
|
|
||||||
+ cmpq %rax, %rsi
|
|
||||||
+ cmovb %esi, %eax
|
|
||||||
ret
|
|
||||||
+# endif
|
|
||||||
|
|
||||||
END (STRLEN)
|
|
||||||
#endif
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,158 +0,0 @@
|
|||||||
From ea8e465a6b8d0f26c72bcbe453a854de3abf68ec Mon Sep 17 00:00:00 2001
|
|
||||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
|
||||||
Date: Wed, 30 Jun 2021 10:47:06 -0700
|
|
||||||
Subject: [PATCH] x86: Check RTM_ALWAYS_ABORT for RTM [BZ #28033]
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
From
|
|
||||||
|
|
||||||
https://www.intel.com/content/www/us/en/support/articles/000059422/processors.html
|
|
||||||
|
|
||||||
* Intel TSX will be disabled by default.
|
|
||||||
* The processor will force abort all Restricted Transactional Memory (RTM)
|
|
||||||
transactions by default.
|
|
||||||
* A new CPUID bit CPUID.07H.0H.EDX[11](RTM_ALWAYS_ABORT) will be enumerated,
|
|
||||||
which is set to indicate to updated software that the loaded microcode is
|
|
||||||
forcing RTM abort.
|
|
||||||
* On processors that enumerate support for RTM, the CPUID enumeration bits
|
|
||||||
for Intel TSX (CPUID.07H.0H.EBX[11] and CPUID.07H.0H.EBX[4]) continue to
|
|
||||||
be set by default after microcode update.
|
|
||||||
* Workloads that were benefited from Intel TSX might experience a change
|
|
||||||
in performance.
|
|
||||||
* System software may use a new bit in Model-Specific Register (MSR) 0x10F
|
|
||||||
TSX_FORCE_ABORT[TSX_CPUID_CLEAR] functionality to clear the Hardware Lock
|
|
||||||
Elision (HLE) and RTM bits to indicate to software that Intel TSX is
|
|
||||||
disabled.
|
|
||||||
|
|
||||||
1. Add RTM_ALWAYS_ABORT to CPUID features.
|
|
||||||
2. Set RTM usable only if RTM_ALWAYS_ABORT isn't set. This skips the
|
|
||||||
string/tst-memchr-rtm etc. testcases on the affected processors, which
|
|
||||||
always fail after a microcode update.
|
|
||||||
3. Check RTM feature, instead of usability, against /proc/cpuinfo.
|
|
||||||
|
|
||||||
This fixes BZ #28033.
|
|
||||||
---
|
|
||||||
manual/platform.texi | 3 +++
|
|
||||||
sysdeps/x86/cpu-features.c | 5 ++++-
|
|
||||||
sysdeps/x86/sys/platform/x86.h | 6 +++---
|
|
||||||
sysdeps/x86/tst-cpu-features-supports.c | 2 +-
|
|
||||||
sysdeps/x86/tst-get-cpu-features.c | 2 ++
|
|
||||||
5 files changed, 13 insertions(+), 5 deletions(-)
|
|
||||||
|
|
||||||
Conflicts:
|
|
||||||
sysdeps/x86/bits/platform/x86.h
|
|
||||||
(doesn't exist)
|
|
||||||
sysdeps/x86/bits/platform/x86.h
|
|
||||||
(account for lack of upstream renames)
|
|
||||||
|
|
||||||
diff --git a/manual/platform.texi b/manual/platform.texi
|
|
||||||
index 8fec2933..b7e8aef7 100644
|
|
||||||
--- a/manual/platform.texi
|
|
||||||
+++ b/manual/platform.texi
|
|
||||||
@@ -510,6 +510,9 @@ capability.
|
|
||||||
@item
|
|
||||||
@code{RTM} -- RTM instruction extensions.
|
|
||||||
|
|
||||||
+@item
|
|
||||||
+@code{RTM_ALWAYS_ABORT} -- Transactions always abort, making RTM unusable.
|
|
||||||
+
|
|
||||||
@item
|
|
||||||
@code{SDBG} -- IA32_DEBUG_INTERFACE MSR for silicon debug.
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
|
|
||||||
index 3610ee5c..4889f062 100644
|
|
||||||
--- a/sysdeps/x86/cpu-features.c
|
|
||||||
+++ b/sysdeps/x86/cpu-features.c
|
|
||||||
@@ -74,7 +74,6 @@ update_usable (struct cpu_features *cpu_features)
|
|
||||||
CPU_FEATURE_SET_USABLE (cpu_features, HLE);
|
|
||||||
CPU_FEATURE_SET_USABLE (cpu_features, BMI2);
|
|
||||||
CPU_FEATURE_SET_USABLE (cpu_features, ERMS);
|
|
||||||
- CPU_FEATURE_SET_USABLE (cpu_features, RTM);
|
|
||||||
CPU_FEATURE_SET_USABLE (cpu_features, RDSEED);
|
|
||||||
CPU_FEATURE_SET_USABLE (cpu_features, ADX);
|
|
||||||
CPU_FEATURE_SET_USABLE (cpu_features, CLFLUSHOPT);
|
|
||||||
@@ -90,6 +89,7 @@ update_usable (struct cpu_features *cpu_features)
|
|
||||||
CPU_FEATURE_SET_USABLE (cpu_features, MOVDIRI);
|
|
||||||
CPU_FEATURE_SET_USABLE (cpu_features, MOVDIR64B);
|
|
||||||
CPU_FEATURE_SET_USABLE (cpu_features, FSRM);
|
|
||||||
+ CPU_FEATURE_SET_USABLE (cpu_features, RTM_ALWAYS_ABORT);
|
|
||||||
CPU_FEATURE_SET_USABLE (cpu_features, SERIALIZE);
|
|
||||||
CPU_FEATURE_SET_USABLE (cpu_features, TSXLDTRK);
|
|
||||||
CPU_FEATURE_SET_USABLE (cpu_features, LAHF64_SAHF64);
|
|
||||||
@@ -779,6 +779,9 @@ no_cpuid:
|
|
||||||
GLRO(dl_platform) = "i586";
|
|
||||||
#endif
|
|
||||||
|
|
||||||
+ if (!CPU_FEATURES_CPU_P (cpu_features, RTM_ALWAYS_ABORT))
|
|
||||||
+ CPU_FEATURE_SET_USABLE (cpu_features, RTM);
|
|
||||||
+
|
|
||||||
#if CET_ENABLED
|
|
||||||
# if HAVE_TUNABLES
|
|
||||||
TUNABLE_GET (x86_ibt, tunable_val_t *,
|
|
||||||
diff --git a/sysdeps/x86/sys/platform/x86.h b/sysdeps/x86/sys/platform/x86.h
|
|
||||||
index e5cc7c68..7a434926 100644
|
|
||||||
--- a/sysdeps/x86/sys/platform/x86.h
|
|
||||||
+++ b/sysdeps/x86/sys/platform/x86.h
|
|
||||||
@@ -247,7 +247,7 @@ extern const struct cpu_features *__x86_get_cpu_features (unsigned int)
|
|
||||||
#define bit_cpu_AVX512_VP2INTERSECT (1u << 8)
|
|
||||||
#define bit_cpu_INDEX_7_EDX_9 (1u << 9)
|
|
||||||
#define bit_cpu_MD_CLEAR (1u << 10)
|
|
||||||
-#define bit_cpu_INDEX_7_EDX_11 (1u << 11)
|
|
||||||
+#define bit_cpu_RTM_ALWAYS_ABORT (1u << 11)
|
|
||||||
#define bit_cpu_INDEX_7_EDX_12 (1u << 12)
|
|
||||||
#define bit_cpu_INDEX_7_EDX_13 (1u << 13)
|
|
||||||
#define bit_cpu_SERIALIZE (1u << 14)
|
|
||||||
@@ -471,7 +471,7 @@ extern const struct cpu_features *__x86_get_cpu_features (unsigned int)
|
|
||||||
#define index_cpu_AVX512_VP2INTERSECT COMMON_CPUID_INDEX_7
|
|
||||||
#define index_cpu_INDEX_7_EDX_9 COMMON_CPUID_INDEX_7
|
|
||||||
#define index_cpu_MD_CLEAR COMMON_CPUID_INDEX_7
|
|
||||||
-#define index_cpu_INDEX_7_EDX_11 COMMON_CPUID_INDEX_7
|
|
||||||
+#define index_cpu_RTM_ALWAYS_ABORT COMMON_CPUID_INDEX_7
|
|
||||||
#define index_cpu_INDEX_7_EDX_12 COMMON_CPUID_INDEX_7
|
|
||||||
#define index_cpu_INDEX_7_EDX_13 COMMON_CPUID_INDEX_7
|
|
||||||
#define index_cpu_SERIALIZE COMMON_CPUID_INDEX_7
|
|
||||||
@@ -695,7 +695,7 @@ extern const struct cpu_features *__x86_get_cpu_features (unsigned int)
|
|
||||||
#define reg_AVX512_VP2INTERSECT edx
|
|
||||||
#define reg_INDEX_7_EDX_9 edx
|
|
||||||
#define reg_MD_CLEAR edx
|
|
||||||
-#define reg_INDEX_7_EDX_11 edx
|
|
||||||
+#define reg_RTM_ALWAYS_ABORT edx
|
|
||||||
#define reg_INDEX_7_EDX_12 edx
|
|
||||||
#define reg_INDEX_7_EDX_13 edx
|
|
||||||
#define reg_SERIALIZE edx
|
|
||||||
diff --git a/sysdeps/x86/tst-cpu-features-supports.c b/sysdeps/x86/tst-cpu-features-supports.c
|
|
||||||
index 287cf01f..8100a319 100644
|
|
||||||
--- a/sysdeps/x86/tst-cpu-features-supports.c
|
|
||||||
+++ b/sysdeps/x86/tst-cpu-features-supports.c
|
|
||||||
@@ -152,7 +152,7 @@ do_test (int argc, char **argv)
|
|
||||||
fails += CHECK_SUPPORTS (rdpid, RDPID);
|
|
||||||
fails += CHECK_SUPPORTS (rdrnd, RDRAND);
|
|
||||||
fails += CHECK_SUPPORTS (rdseed, RDSEED);
|
|
||||||
- fails += CHECK_SUPPORTS (rtm, RTM);
|
|
||||||
+ fails += CHECK_CPU_SUPPORTS (rtm, RTM);
|
|
||||||
fails += CHECK_SUPPORTS (serialize, SERIALIZE);
|
|
||||||
fails += CHECK_SUPPORTS (sha, SHA);
|
|
||||||
fails += CHECK_CPU_SUPPORTS (shstk, SHSTK);
|
|
||||||
diff --git a/sysdeps/x86/tst-get-cpu-features.c b/sysdeps/x86/tst-get-cpu-features.c
|
|
||||||
index 2763deb6..0717e5d8 100644
|
|
||||||
--- a/sysdeps/x86/tst-get-cpu-features.c
|
|
||||||
+++ b/sysdeps/x86/tst-get-cpu-features.c
|
|
||||||
@@ -183,6 +183,7 @@ do_test (void)
|
|
||||||
CHECK_CPU_FEATURE (UINTR);
|
|
||||||
CHECK_CPU_FEATURE (AVX512_VP2INTERSECT);
|
|
||||||
CHECK_CPU_FEATURE (MD_CLEAR);
|
|
||||||
+ CHECK_CPU_FEATURE (RTM_ALWAYS_ABORT);
|
|
||||||
CHECK_CPU_FEATURE (SERIALIZE);
|
|
||||||
CHECK_CPU_FEATURE (HYBRID);
|
|
||||||
CHECK_CPU_FEATURE (TSXLDTRK);
|
|
||||||
@@ -344,6 +345,7 @@ do_test (void)
|
|
||||||
CHECK_CPU_FEATURE_USABLE (FSRM);
|
|
||||||
CHECK_CPU_FEATURE_USABLE (AVX512_VP2INTERSECT);
|
|
||||||
CHECK_CPU_FEATURE_USABLE (MD_CLEAR);
|
|
||||||
+ CHECK_CPU_FEATURE_USABLE (RTM_ALWAYS_ABORT);
|
|
||||||
CHECK_CPU_FEATURE_USABLE (SERIALIZE);
|
|
||||||
CHECK_CPU_FEATURE_USABLE (HYBRID);
|
|
||||||
CHECK_CPU_FEATURE_USABLE (TSXLDTRK);
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,51 +0,0 @@
|
|||||||
From 0679442defedf7e52a94264975880ab8674736b2 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
Date: Thu, 8 Jul 2021 16:13:19 -0400
|
|
||||||
Subject: [PATCH] x86: Remove wcsnlen-sse4_1 from wcslen ifunc-impl-list [BZ
|
|
||||||
#28064]
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
The following commit
|
|
||||||
|
|
||||||
commit 6f573a27b6c8b4236445810a44660612323f5a73
|
|
||||||
Author: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
Date: Wed Jun 23 01:19:34 2021 -0400
|
|
||||||
|
|
||||||
x86-64: Add wcslen optimize for sse4.1
|
|
||||||
|
|
||||||
Added wcsnlen-sse4.1 to the wcslen ifunc implementation list and did
|
|
||||||
not add wcslen-sse4.1 to wcslen ifunc implementation list. This commit
|
|
||||||
fixes that by removing wcsnlen-sse4.1 from the wcslen ifunc
|
|
||||||
implementation list and adding wcslen-sse4.1 to the ifunc
|
|
||||||
implementation list.
|
|
||||||
|
|
||||||
Testing:
|
|
||||||
test-wcslen.c, test-rsi-wcslen.c, and test-rsi-strlen.c are passing as
|
|
||||||
well as all other tests in wcsmbs and string.
|
|
||||||
|
|
||||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
|
||||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 ++--
|
|
||||||
1 file changed, 2 insertions(+), 2 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
||||||
index 580913ca..695cdba6 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
||||||
@@ -657,9 +657,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
||||||
&& CPU_FEATURE_USABLE (AVX512BW)
|
|
||||||
&& CPU_FEATURE_USABLE (BMI2)),
|
|
||||||
__wcslen_evex)
|
|
||||||
- IFUNC_IMPL_ADD (array, i, wcsnlen,
|
|
||||||
+ IFUNC_IMPL_ADD (array, i, wcslen,
|
|
||||||
CPU_FEATURE_USABLE (SSE4_1),
|
|
||||||
- __wcsnlen_sse4_1)
|
|
||||||
+ __wcslen_sse4_1)
|
|
||||||
IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_sse2))
|
|
||||||
|
|
||||||
/* Support sysdeps/x86_64/multiarch/wcsnlen.c. */
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,51 +0,0 @@
|
|||||||
From 55c7bcc71b84123d5d4bd2814366a6b05fcf8ebd Mon Sep 17 00:00:00 2001
|
|
||||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
|
||||||
Date: Sat, 9 May 2020 12:04:23 -0700
|
|
||||||
Subject: [PATCH] x86-64: Use RDX_LP on __x86_shared_non_temporal_threshold [BZ
|
|
||||||
#25966]
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
Since __x86_shared_non_temporal_threshold is defined as
|
|
||||||
|
|
||||||
long int __x86_shared_non_temporal_threshold;
|
|
||||||
|
|
||||||
and long int is 4 bytes for x32, use RDX_LP to compare against
|
|
||||||
__x86_shared_non_temporal_threshold in assembly code.
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S | 6 +++---
|
|
||||||
1 file changed, 3 insertions(+), 3 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
|
||||||
index 71f5954d..673b73aa 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
|
||||||
@@ -245,7 +245,7 @@ L(return):
|
|
||||||
#endif
|
|
||||||
|
|
||||||
L(movsb):
|
|
||||||
- cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
|
|
||||||
+ cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
|
|
||||||
jae L(more_8x_vec)
|
|
||||||
cmpq %rsi, %rdi
|
|
||||||
jb 1f
|
|
||||||
@@ -397,7 +397,7 @@ L(more_8x_vec):
|
|
||||||
addq %r8, %rdx
|
|
||||||
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
|
|
||||||
/* Check non-temporal store threshold. */
|
|
||||||
- cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
|
|
||||||
+ cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
|
|
||||||
ja L(large_forward)
|
|
||||||
#endif
|
|
||||||
L(loop_4x_vec_forward):
|
|
||||||
@@ -448,7 +448,7 @@ L(more_8x_vec_backward):
|
|
||||||
subq %r8, %rdx
|
|
||||||
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
|
|
||||||
/* Check non-temporal store threshold. */
|
|
||||||
- cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
|
|
||||||
+ cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
|
|
||||||
ja L(large_backward)
|
|
||||||
#endif
|
|
||||||
L(loop_4x_vec_backward):
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,44 +0,0 @@
|
|||||||
From a35a59036ebae3efcdf5e8167610e0656fca9770 Mon Sep 17 00:00:00 2001
|
|
||||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
|
||||||
Date: Thu, 11 Jun 2020 12:41:18 -0700
|
|
||||||
Subject: [PATCH] x86_64: Use %xmmN with vpxor to clear a vector register
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
Since "vpxor %xmmN, %xmmN, %xmmN" clears the whole vector register, use
|
|
||||||
%xmmN, instead of %ymmN, with vpxor to clear a vector register.
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/multiarch/strcmp-avx2.S | 4 ++--
|
|
||||||
sysdeps/x86_64/multiarch/strrchr-avx2.S | 2 +-
|
|
||||||
2 files changed, 3 insertions(+), 3 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
|
|
||||||
index 433ae047..70d8499b 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
|
|
||||||
@@ -105,8 +105,8 @@ ENTRY (STRCMP)
|
|
||||||
# endif
|
|
||||||
movl %edi, %eax
|
|
||||||
xorl %edx, %edx
|
|
||||||
- /* Make %ymm7 all zeros in this function. */
|
|
||||||
- vpxor %ymm7, %ymm7, %ymm7
|
|
||||||
+ /* Make %xmm7 (%ymm7) all zeros in this function. */
|
|
||||||
+ vpxor %xmm7, %xmm7, %xmm7
|
|
||||||
orl %esi, %eax
|
|
||||||
andl $(PAGE_SIZE - 1), %eax
|
|
||||||
cmpl $(PAGE_SIZE - (VEC_SIZE * 4)), %eax
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
|
|
||||||
index 9f22a15e..c949410b 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/strrchr-avx2.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S
|
|
||||||
@@ -48,7 +48,7 @@ ENTRY (STRRCHR)
|
|
||||||
movl %edi, %ecx
|
|
||||||
/* Broadcast CHAR to YMM4. */
|
|
||||||
VPBROADCAST %xmm4, %ymm4
|
|
||||||
- vpxor %ymm0, %ymm0, %ymm0
|
|
||||||
+ vpxor %xmm0, %xmm0, %xmm0
|
|
||||||
|
|
||||||
/* Check if we may cross page boundary with one vector load. */
|
|
||||||
andl $(2 * VEC_SIZE - 1), %ecx
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,359 +0,0 @@
|
|||||||
From 1f745ecc2109890886b161d4791e1406fdfc29b8 Mon Sep 17 00:00:00 2001
|
|
||||||
From: noah <goldstein.w.n@gmail.com>
|
|
||||||
Date: Wed, 3 Feb 2021 00:38:59 -0500
|
|
||||||
Subject: [PATCH] x86-64: Refactor and improve performance of strchr-avx2.S
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
No bug. Just seemed the performance could be improved a bit. Observed
|
|
||||||
and expected behavior are unchanged. Optimized body of main
|
|
||||||
loop. Updated page cross logic and optimized accordingly. Made a few
|
|
||||||
minor instruction selection modifications. No regressions in test
|
|
||||||
suite. Both test-strchrnul and test-strchr passed.
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/multiarch/strchr-avx2.S | 225 ++++++++++++-------------
|
|
||||||
sysdeps/x86_64/multiarch/strchr.c | 4 +-
|
|
||||||
2 files changed, 114 insertions(+), 115 deletions(-)
|
|
||||||
|
|
||||||
Conflicts:
|
|
||||||
sysdeps/x86_64/multiarch/strchr.c
|
|
||||||
(account for missing upstream macros)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
|
|
||||||
index da7d2620..919d256c 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/strchr-avx2.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
|
|
||||||
@@ -27,10 +27,12 @@
|
|
||||||
# ifdef USE_AS_WCSCHR
|
|
||||||
# define VPBROADCAST vpbroadcastd
|
|
||||||
# define VPCMPEQ vpcmpeqd
|
|
||||||
+# define VPMINU vpminud
|
|
||||||
# define CHAR_REG esi
|
|
||||||
# else
|
|
||||||
# define VPBROADCAST vpbroadcastb
|
|
||||||
# define VPCMPEQ vpcmpeqb
|
|
||||||
+# define VPMINU vpminub
|
|
||||||
# define CHAR_REG sil
|
|
||||||
# endif
|
|
||||||
|
|
||||||
@@ -43,71 +45,54 @@
|
|
||||||
# endif
|
|
||||||
|
|
||||||
# define VEC_SIZE 32
|
|
||||||
+# define PAGE_SIZE 4096
|
|
||||||
|
|
||||||
.section SECTION(.text),"ax",@progbits
|
|
||||||
ENTRY (STRCHR)
|
|
||||||
movl %edi, %ecx
|
|
||||||
- /* Broadcast CHAR to YMM0. */
|
|
||||||
+# ifndef USE_AS_STRCHRNUL
|
|
||||||
+ xorl %edx, %edx
|
|
||||||
+# endif
|
|
||||||
+
|
|
||||||
+ /* Broadcast CHAR to YMM0. */
|
|
||||||
vmovd %esi, %xmm0
|
|
||||||
vpxor %xmm9, %xmm9, %xmm9
|
|
||||||
VPBROADCAST %xmm0, %ymm0
|
|
||||||
- /* Check if we may cross page boundary with one vector load. */
|
|
||||||
- andl $(2 * VEC_SIZE - 1), %ecx
|
|
||||||
- cmpl $VEC_SIZE, %ecx
|
|
||||||
- ja L(cros_page_boundary)
|
|
||||||
|
|
||||||
- /* Check the first VEC_SIZE bytes. Search for both CHAR and the
|
|
||||||
- null byte. */
|
|
||||||
- vmovdqu (%rdi), %ymm8
|
|
||||||
- VPCMPEQ %ymm8, %ymm0, %ymm1
|
|
||||||
- VPCMPEQ %ymm8, %ymm9, %ymm2
|
|
||||||
- vpor %ymm1, %ymm2, %ymm1
|
|
||||||
- vpmovmskb %ymm1, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x0)
|
|
||||||
+ /* Check if we cross page boundary with one vector load. */
|
|
||||||
+ andl $(PAGE_SIZE - 1), %ecx
|
|
||||||
+ cmpl $(PAGE_SIZE - VEC_SIZE), %ecx
|
|
||||||
+ ja L(cross_page_boundary)
|
|
||||||
|
|
||||||
- /* Align data for aligned loads in the loop. */
|
|
||||||
- addq $VEC_SIZE, %rdi
|
|
||||||
- andl $(VEC_SIZE - 1), %ecx
|
|
||||||
- andq $-VEC_SIZE, %rdi
|
|
||||||
-
|
|
||||||
- jmp L(more_4x_vec)
|
|
||||||
-
|
|
||||||
- .p2align 4
|
|
||||||
-L(cros_page_boundary):
|
|
||||||
- andl $(VEC_SIZE - 1), %ecx
|
|
||||||
- andq $-VEC_SIZE, %rdi
|
|
||||||
+ /* Check the first VEC_SIZE bytes. Search for both CHAR and the
|
|
||||||
+ null byte. */
|
|
||||||
vmovdqu (%rdi), %ymm8
|
|
||||||
VPCMPEQ %ymm8, %ymm0, %ymm1
|
|
||||||
VPCMPEQ %ymm8, %ymm9, %ymm2
|
|
||||||
vpor %ymm1, %ymm2, %ymm1
|
|
||||||
vpmovmskb %ymm1, %eax
|
|
||||||
- /* Remove the leading bytes. */
|
|
||||||
- sarl %cl, %eax
|
|
||||||
testl %eax, %eax
|
|
||||||
- jz L(aligned_more)
|
|
||||||
- /* Found CHAR or the null byte. */
|
|
||||||
+ jz L(more_vecs)
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
- addq %rcx, %rax
|
|
||||||
-# ifdef USE_AS_STRCHRNUL
|
|
||||||
+ /* Found CHAR or the null byte. */
|
|
||||||
addq %rdi, %rax
|
|
||||||
-# else
|
|
||||||
- xorl %edx, %edx
|
|
||||||
- leaq (%rdi, %rax), %rax
|
|
||||||
- cmp (%rax), %CHAR_REG
|
|
||||||
+# ifndef USE_AS_STRCHRNUL
|
|
||||||
+ cmp (%rax), %CHAR_REG
|
|
||||||
cmovne %rdx, %rax
|
|
||||||
# endif
|
|
||||||
L(return_vzeroupper):
|
|
||||||
ZERO_UPPER_VEC_REGISTERS_RETURN
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
+L(more_vecs):
|
|
||||||
+ /* Align data for aligned loads in the loop. */
|
|
||||||
+ andq $-VEC_SIZE, %rdi
|
|
||||||
L(aligned_more):
|
|
||||||
- addq $VEC_SIZE, %rdi
|
|
||||||
|
|
||||||
-L(more_4x_vec):
|
|
||||||
- /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
|
|
||||||
- since data is only aligned to VEC_SIZE. */
|
|
||||||
- vmovdqa (%rdi), %ymm8
|
|
||||||
+ /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time
|
|
||||||
+ since data is only aligned to VEC_SIZE. */
|
|
||||||
+ vmovdqa VEC_SIZE(%rdi), %ymm8
|
|
||||||
+ addq $VEC_SIZE, %rdi
|
|
||||||
VPCMPEQ %ymm8, %ymm0, %ymm1
|
|
||||||
VPCMPEQ %ymm8, %ymm9, %ymm2
|
|
||||||
vpor %ymm1, %ymm2, %ymm1
|
|
||||||
@@ -137,61 +122,24 @@ L(more_4x_vec):
|
|
||||||
vpor %ymm1, %ymm2, %ymm1
|
|
||||||
vpmovmskb %ymm1, %eax
|
|
||||||
testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x3)
|
|
||||||
-
|
|
||||||
- addq $(VEC_SIZE * 4), %rdi
|
|
||||||
-
|
|
||||||
- /* Align data to 4 * VEC_SIZE. */
|
|
||||||
- movq %rdi, %rcx
|
|
||||||
- andl $(4 * VEC_SIZE - 1), %ecx
|
|
||||||
- andq $-(4 * VEC_SIZE), %rdi
|
|
||||||
-
|
|
||||||
- .p2align 4
|
|
||||||
-L(loop_4x_vec):
|
|
||||||
- /* Compare 4 * VEC at a time forward. */
|
|
||||||
- vmovdqa (%rdi), %ymm5
|
|
||||||
- vmovdqa VEC_SIZE(%rdi), %ymm6
|
|
||||||
- vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
|
|
||||||
- vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
|
|
||||||
-
|
|
||||||
- VPCMPEQ %ymm5, %ymm0, %ymm1
|
|
||||||
- VPCMPEQ %ymm6, %ymm0, %ymm2
|
|
||||||
- VPCMPEQ %ymm7, %ymm0, %ymm3
|
|
||||||
- VPCMPEQ %ymm8, %ymm0, %ymm4
|
|
||||||
-
|
|
||||||
- VPCMPEQ %ymm5, %ymm9, %ymm5
|
|
||||||
- VPCMPEQ %ymm6, %ymm9, %ymm6
|
|
||||||
- VPCMPEQ %ymm7, %ymm9, %ymm7
|
|
||||||
- VPCMPEQ %ymm8, %ymm9, %ymm8
|
|
||||||
-
|
|
||||||
- vpor %ymm1, %ymm5, %ymm1
|
|
||||||
- vpor %ymm2, %ymm6, %ymm2
|
|
||||||
- vpor %ymm3, %ymm7, %ymm3
|
|
||||||
- vpor %ymm4, %ymm8, %ymm4
|
|
||||||
-
|
|
||||||
- vpor %ymm1, %ymm2, %ymm5
|
|
||||||
- vpor %ymm3, %ymm4, %ymm6
|
|
||||||
-
|
|
||||||
- vpor %ymm5, %ymm6, %ymm5
|
|
||||||
-
|
|
||||||
- vpmovmskb %ymm5, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
- jnz L(4x_vec_end)
|
|
||||||
-
|
|
||||||
- addq $(VEC_SIZE * 4), %rdi
|
|
||||||
+ jz L(prep_loop_4x)
|
|
||||||
|
|
||||||
- jmp L(loop_4x_vec)
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+ leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
|
|
||||||
+# ifndef USE_AS_STRCHRNUL
|
|
||||||
+ cmp (%rax), %CHAR_REG
|
|
||||||
+ cmovne %rdx, %rax
|
|
||||||
+# endif
|
|
||||||
+ VZEROUPPER
|
|
||||||
+ ret
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
L(first_vec_x0):
|
|
||||||
- /* Found CHAR or the null byte. */
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
-# ifdef USE_AS_STRCHRNUL
|
|
||||||
+ /* Found CHAR or the null byte. */
|
|
||||||
addq %rdi, %rax
|
|
||||||
-# else
|
|
||||||
- xorl %edx, %edx
|
|
||||||
- leaq (%rdi, %rax), %rax
|
|
||||||
- cmp (%rax), %CHAR_REG
|
|
||||||
+# ifndef USE_AS_STRCHRNUL
|
|
||||||
+ cmp (%rax), %CHAR_REG
|
|
||||||
cmovne %rdx, %rax
|
|
||||||
# endif
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
@@ -199,13 +147,9 @@ L(first_vec_x0):
|
|
||||||
.p2align 4
|
|
||||||
L(first_vec_x1):
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
-# ifdef USE_AS_STRCHRNUL
|
|
||||||
- addq $VEC_SIZE, %rax
|
|
||||||
- addq %rdi, %rax
|
|
||||||
-# else
|
|
||||||
- xorl %edx, %edx
|
|
||||||
leaq VEC_SIZE(%rdi, %rax), %rax
|
|
||||||
- cmp (%rax), %CHAR_REG
|
|
||||||
+# ifndef USE_AS_STRCHRNUL
|
|
||||||
+ cmp (%rax), %CHAR_REG
|
|
||||||
cmovne %rdx, %rax
|
|
||||||
# endif
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
@@ -213,42 +157,97 @@ L(first_vec_x1):
|
|
||||||
.p2align 4
|
|
||||||
L(first_vec_x2):
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
-# ifdef USE_AS_STRCHRNUL
|
|
||||||
- addq $(VEC_SIZE * 2), %rax
|
|
||||||
- addq %rdi, %rax
|
|
||||||
-# else
|
|
||||||
- xorl %edx, %edx
|
|
||||||
+ /* Found CHAR or the null byte. */
|
|
||||||
leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
|
|
||||||
- cmp (%rax), %CHAR_REG
|
|
||||||
+# ifndef USE_AS_STRCHRNUL
|
|
||||||
+ cmp (%rax), %CHAR_REG
|
|
||||||
cmovne %rdx, %rax
|
|
||||||
# endif
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
|
|
||||||
+L(prep_loop_4x):
|
|
||||||
+ /* Align data to 4 * VEC_SIZE. */
|
|
||||||
+ andq $-(VEC_SIZE * 4), %rdi
|
|
||||||
+
|
|
||||||
.p2align 4
|
|
||||||
-L(4x_vec_end):
|
|
||||||
+L(loop_4x_vec):
|
|
||||||
+ /* Compare 4 * VEC at a time forward. */
|
|
||||||
+ vmovdqa (VEC_SIZE * 4)(%rdi), %ymm5
|
|
||||||
+ vmovdqa (VEC_SIZE * 5)(%rdi), %ymm6
|
|
||||||
+ vmovdqa (VEC_SIZE * 6)(%rdi), %ymm7
|
|
||||||
+ vmovdqa (VEC_SIZE * 7)(%rdi), %ymm8
|
|
||||||
+
|
|
||||||
+ /* Leaves only CHARS matching esi as 0. */
|
|
||||||
+ vpxor %ymm5, %ymm0, %ymm1
|
|
||||||
+ vpxor %ymm6, %ymm0, %ymm2
|
|
||||||
+ vpxor %ymm7, %ymm0, %ymm3
|
|
||||||
+ vpxor %ymm8, %ymm0, %ymm4
|
|
||||||
+
|
|
||||||
+ VPMINU %ymm1, %ymm5, %ymm1
|
|
||||||
+ VPMINU %ymm2, %ymm6, %ymm2
|
|
||||||
+ VPMINU %ymm3, %ymm7, %ymm3
|
|
||||||
+ VPMINU %ymm4, %ymm8, %ymm4
|
|
||||||
+
|
|
||||||
+ VPMINU %ymm1, %ymm2, %ymm5
|
|
||||||
+ VPMINU %ymm3, %ymm4, %ymm6
|
|
||||||
+
|
|
||||||
+ VPMINU %ymm5, %ymm6, %ymm5
|
|
||||||
+
|
|
||||||
+ VPCMPEQ %ymm5, %ymm9, %ymm5
|
|
||||||
+ vpmovmskb %ymm5, %eax
|
|
||||||
+
|
|
||||||
+ addq $(VEC_SIZE * 4), %rdi
|
|
||||||
+ testl %eax, %eax
|
|
||||||
+ jz L(loop_4x_vec)
|
|
||||||
+
|
|
||||||
+ VPCMPEQ %ymm1, %ymm9, %ymm1
|
|
||||||
vpmovmskb %ymm1, %eax
|
|
||||||
testl %eax, %eax
|
|
||||||
jnz L(first_vec_x0)
|
|
||||||
+
|
|
||||||
+ VPCMPEQ %ymm2, %ymm9, %ymm2
|
|
||||||
vpmovmskb %ymm2, %eax
|
|
||||||
testl %eax, %eax
|
|
||||||
jnz L(first_vec_x1)
|
|
||||||
- vpmovmskb %ymm3, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x2)
|
|
||||||
+
|
|
||||||
+ VPCMPEQ %ymm3, %ymm9, %ymm3
|
|
||||||
+ VPCMPEQ %ymm4, %ymm9, %ymm4
|
|
||||||
+ vpmovmskb %ymm3, %ecx
|
|
||||||
vpmovmskb %ymm4, %eax
|
|
||||||
+ salq $32, %rax
|
|
||||||
+ orq %rcx, %rax
|
|
||||||
+ tzcntq %rax, %rax
|
|
||||||
+ leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
|
|
||||||
+# ifndef USE_AS_STRCHRNUL
|
|
||||||
+ cmp (%rax), %CHAR_REG
|
|
||||||
+ cmovne %rdx, %rax
|
|
||||||
+# endif
|
|
||||||
+ VZEROUPPER
|
|
||||||
+ ret
|
|
||||||
+
|
|
||||||
+ /* Cold case for crossing page with first load. */
|
|
||||||
+ .p2align 4
|
|
||||||
+L(cross_page_boundary):
|
|
||||||
+ andq $-VEC_SIZE, %rdi
|
|
||||||
+ andl $(VEC_SIZE - 1), %ecx
|
|
||||||
+
|
|
||||||
+ vmovdqa (%rdi), %ymm8
|
|
||||||
+ VPCMPEQ %ymm8, %ymm0, %ymm1
|
|
||||||
+ VPCMPEQ %ymm8, %ymm9, %ymm2
|
|
||||||
+ vpor %ymm1, %ymm2, %ymm1
|
|
||||||
+ vpmovmskb %ymm1, %eax
|
|
||||||
+ /* Remove the leading bits. */
|
|
||||||
+ sarxl %ecx, %eax, %eax
|
|
||||||
testl %eax, %eax
|
|
||||||
-L(first_vec_x3):
|
|
||||||
+ jz L(aligned_more)
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
-# ifdef USE_AS_STRCHRNUL
|
|
||||||
- addq $(VEC_SIZE * 3), %rax
|
|
||||||
+ addq %rcx, %rdi
|
|
||||||
addq %rdi, %rax
|
|
||||||
-# else
|
|
||||||
- xorl %edx, %edx
|
|
||||||
- leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
|
|
||||||
- cmp (%rax), %CHAR_REG
|
|
||||||
+# ifndef USE_AS_STRCHRNUL
|
|
||||||
+ cmp (%rax), %CHAR_REG
|
|
||||||
cmovne %rdx, %rax
|
|
||||||
# endif
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
|
|
||||||
END (STRCHR)
|
|
||||||
-#endif
|
|
||||||
+# endif
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
|
|
||||||
index 7e582f02..5225bd4f 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/strchr.c
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/strchr.c
|
|
||||||
@@ -38,11 +38,11 @@ IFUNC_SELECTOR (void)
|
|
||||||
const struct cpu_features* cpu_features = __get_cpu_features ();
|
|
||||||
|
|
||||||
if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
|
|
||||||
+ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
|
|
||||||
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
|
|
||||||
{
|
|
||||||
if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
|
|
||||||
- && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
|
|
||||||
- && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
|
|
||||||
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
|
|
||||||
return OPTIMIZE (evex);
|
|
||||||
|
|
||||||
if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,67 +0,0 @@
|
|||||||
From 3ec5d83d2a237d39e7fd6ef7a0bc8ac4c171a4a5 Mon Sep 17 00:00:00 2001
|
|
||||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
|
||||||
Date: Sat, 25 Jan 2020 14:19:40 -0800
|
|
||||||
Subject: [PATCH] x86-64: Avoid rep movsb with short distance [BZ #27130]
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
When copying with "rep movsb", if the distance between source and
|
|
||||||
destination is N*4GB + [1..63] with N >= 0, performance may be very
|
|
||||||
slow. This patch updates memmove-vec-unaligned-erms.S for AVX and
|
|
||||||
AVX512 versions with the distance in RCX:
|
|
||||||
|
|
||||||
cmpl $63, %ecx
|
|
||||||
// Don't use "rep movsb" if ECX <= 63
|
|
||||||
jbe L(Don't use "rep movsb")
|
|
||||||
Use "rep movsb"
|
|
||||||
|
|
||||||
Benchtests data with bench-memcpy, bench-memcpy-large, bench-memcpy-random
|
|
||||||
and bench-memcpy-walk on Skylake, Ice Lake and Tiger Lake show that its
|
|
||||||
performance impact is within noise range as "rep movsb" is only used for
|
|
||||||
data size >= 4KB.
|
|
||||||
---
|
|
||||||
.../multiarch/memmove-vec-unaligned-erms.S | 21 +++++++++++++++++++
|
|
||||||
1 file changed, 21 insertions(+)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
|
||||||
index 673b73aa..c475fed4 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
|
||||||
@@ -64,6 +64,13 @@
|
|
||||||
# endif
|
|
||||||
#endif
|
|
||||||
|
|
||||||
+/* Avoid short distance rep movsb only with non-SSE vector. */
|
|
||||||
+#ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
|
|
||||||
+# define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
|
|
||||||
+#else
|
|
||||||
+# define AVOID_SHORT_DISTANCE_REP_MOVSB 0
|
|
||||||
+#endif
|
|
||||||
+
|
|
||||||
#ifndef PREFETCH
|
|
||||||
# define PREFETCH(addr) prefetcht0 addr
|
|
||||||
#endif
|
|
||||||
@@ -255,7 +262,21 @@ L(movsb):
|
|
||||||
cmpq %r9, %rdi
|
|
||||||
/* Avoid slow backward REP MOVSB. */
|
|
||||||
jb L(more_8x_vec_backward)
|
|
||||||
+# if AVOID_SHORT_DISTANCE_REP_MOVSB
|
|
||||||
+ movq %rdi, %rcx
|
|
||||||
+ subq %rsi, %rcx
|
|
||||||
+ jmp 2f
|
|
||||||
+# endif
|
|
||||||
1:
|
|
||||||
+# if AVOID_SHORT_DISTANCE_REP_MOVSB
|
|
||||||
+ movq %rsi, %rcx
|
|
||||||
+ subq %rdi, %rcx
|
|
||||||
+2:
|
|
||||||
+/* Avoid "rep movsb" if RCX, the distance between source and destination,
|
|
||||||
+ is N*4GB + [1..63] with N >= 0. */
|
|
||||||
+ cmpl $63, %ecx
|
|
||||||
+ jbe L(more_2x_vec) /* Avoid "rep movsb" if ECX <= 63. */
|
|
||||||
+# endif
|
|
||||||
mov %RDX_LP, %RCX_LP
|
|
||||||
rep movsb
|
|
||||||
L(nop):
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,449 +0,0 @@
|
|||||||
From 1a8605b6cd257e8a74e29b5b71c057211f5fb847 Mon Sep 17 00:00:00 2001
|
|
||||||
From: noah <goldstein.w.n@gmail.com>
|
|
||||||
Date: Sat, 3 Apr 2021 04:12:15 -0400
|
|
||||||
Subject: [PATCH] x86: Update large memcpy case in memmove-vec-unaligned-erms.S
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
No Bug. This commit updates the large memcpy case (no overlap). The
|
|
||||||
update is to perform memcpy on either 2 or 4 contiguous pages at
|
|
||||||
once. This 1) helps to alleviate the effects of false memory aliasing
|
|
||||||
when destination and source have a close 4k alignment and 2) In most
|
|
||||||
cases and for most DRAM units is a modestly more efficient access
|
|
||||||
pattern. These changes are a clear performance improvement for
|
|
||||||
VEC_SIZE =16/32, though more ambiguous for VEC_SIZE=64. test-memcpy,
|
|
||||||
test-memccpy, test-mempcpy, test-memmove, and tst-memmove-overflow all
|
|
||||||
pass.
|
|
||||||
|
|
||||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
---
|
|
||||||
.../multiarch/memmove-vec-unaligned-erms.S | 338 ++++++++++++++----
|
|
||||||
1 file changed, 265 insertions(+), 73 deletions(-)
|
|
||||||
|
|
||||||
Conflicts:
|
|
||||||
sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
|
||||||
(different number of sections)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
|
||||||
index c475fed4..3e2dd6bc 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
|
||||||
@@ -32,7 +32,16 @@
|
|
||||||
overlapping addresses.
|
|
||||||
6. If size >= __x86_shared_non_temporal_threshold and there is no
|
|
||||||
overlap between destination and source, use non-temporal store
|
|
||||||
- instead of aligned store. */
|
|
||||||
+ instead of aligned store copying from either 2 or 4 pages at
|
|
||||||
+ once.
|
|
||||||
+ 8. For point 7) if size < 16 * __x86_shared_non_temporal_threshold
|
|
||||||
+ and source and destination do not page alias, copy from 2 pages
|
|
||||||
+ at once using non-temporal stores. Page aliasing in this case is
|
|
||||||
+ considered true if destination's page alignment - sources' page
|
|
||||||
+ alignment is less than 8 * VEC_SIZE.
|
|
||||||
+ 9. If size >= 16 * __x86_shared_non_temporal_threshold or source
|
|
||||||
+ and destination do page alias copy from 4 pages at once using
|
|
||||||
+ non-temporal stores. */
|
|
||||||
|
|
||||||
#include <sysdep.h>
|
|
||||||
|
|
||||||
@@ -64,6 +73,34 @@
|
|
||||||
# endif
|
|
||||||
#endif
|
|
||||||
|
|
||||||
+#ifndef PAGE_SIZE
|
|
||||||
+# define PAGE_SIZE 4096
|
|
||||||
+#endif
|
|
||||||
+
|
|
||||||
+#if PAGE_SIZE != 4096
|
|
||||||
+# error Unsupported PAGE_SIZE
|
|
||||||
+#endif
|
|
||||||
+
|
|
||||||
+#ifndef LOG_PAGE_SIZE
|
|
||||||
+# define LOG_PAGE_SIZE 12
|
|
||||||
+#endif
|
|
||||||
+
|
|
||||||
+#if PAGE_SIZE != (1 << LOG_PAGE_SIZE)
|
|
||||||
+# error Invalid LOG_PAGE_SIZE
|
|
||||||
+#endif
|
|
||||||
+
|
|
||||||
+/* Byte per page for large_memcpy inner loop. */
|
|
||||||
+#if VEC_SIZE == 64
|
|
||||||
+# define LARGE_LOAD_SIZE (VEC_SIZE * 2)
|
|
||||||
+#else
|
|
||||||
+# define LARGE_LOAD_SIZE (VEC_SIZE * 4)
|
|
||||||
+#endif
|
|
||||||
+
|
|
||||||
+/* Amount to shift rdx by to compare for memcpy_large_4x. */
|
|
||||||
+#ifndef LOG_4X_MEMCPY_THRESH
|
|
||||||
+# define LOG_4X_MEMCPY_THRESH 4
|
|
||||||
+#endif
|
|
||||||
+
|
|
||||||
/* Avoid short distance rep movsb only with non-SSE vector. */
|
|
||||||
#ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
|
|
||||||
# define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
|
|
||||||
@@ -103,6 +140,28 @@
|
|
||||||
# error Unsupported PREFETCH_SIZE!
|
|
||||||
#endif
|
|
||||||
|
|
||||||
+#if LARGE_LOAD_SIZE == (VEC_SIZE * 2)
|
|
||||||
+# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \
|
|
||||||
+ VMOVU (offset)base, vec0; \
|
|
||||||
+ VMOVU ((offset) + VEC_SIZE)base, vec1;
|
|
||||||
+# define STORE_ONE_SET(base, offset, vec0, vec1, ...) \
|
|
||||||
+ VMOVNT vec0, (offset)base; \
|
|
||||||
+ VMOVNT vec1, ((offset) + VEC_SIZE)base;
|
|
||||||
+#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4)
|
|
||||||
+# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
|
|
||||||
+ VMOVU (offset)base, vec0; \
|
|
||||||
+ VMOVU ((offset) + VEC_SIZE)base, vec1; \
|
|
||||||
+ VMOVU ((offset) + VEC_SIZE * 2)base, vec2; \
|
|
||||||
+ VMOVU ((offset) + VEC_SIZE * 3)base, vec3;
|
|
||||||
+# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
|
|
||||||
+ VMOVNT vec0, (offset)base; \
|
|
||||||
+ VMOVNT vec1, ((offset) + VEC_SIZE)base; \
|
|
||||||
+ VMOVNT vec2, ((offset) + VEC_SIZE * 2)base; \
|
|
||||||
+ VMOVNT vec3, ((offset) + VEC_SIZE * 3)base;
|
|
||||||
+#else
|
|
||||||
+# error Invalid LARGE_LOAD_SIZE
|
|
||||||
+#endif
|
|
||||||
+
|
|
||||||
#ifndef SECTION
|
|
||||||
# error SECTION is not defined!
|
|
||||||
#endif
|
|
||||||
@@ -390,6 +449,15 @@ L(last_4x_vec):
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
|
|
||||||
L(more_8x_vec):
|
|
||||||
+ /* Check if non-temporal move candidate. */
|
|
||||||
+#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
|
|
||||||
+ /* Check non-temporal store threshold. */
|
|
||||||
+ cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
|
|
||||||
+ ja L(large_memcpy_2x)
|
|
||||||
+#endif
|
|
||||||
+ /* Entry if rdx is greater than non-temporal threshold but there
|
|
||||||
+ is overlap. */
|
|
||||||
+L(more_8x_vec_check):
|
|
||||||
cmpq %rsi, %rdi
|
|
||||||
ja L(more_8x_vec_backward)
|
|
||||||
/* Source == destination is less common. */
|
|
||||||
@@ -416,24 +484,21 @@ L(more_8x_vec):
|
|
||||||
subq %r8, %rdi
|
|
||||||
/* Adjust length. */
|
|
||||||
addq %r8, %rdx
|
|
||||||
-#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
|
|
||||||
- /* Check non-temporal store threshold. */
|
|
||||||
- cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
|
|
||||||
- ja L(large_forward)
|
|
||||||
-#endif
|
|
||||||
+
|
|
||||||
+ .p2align 4
|
|
||||||
L(loop_4x_vec_forward):
|
|
||||||
/* Copy 4 * VEC a time forward. */
|
|
||||||
VMOVU (%rsi), %VEC(0)
|
|
||||||
VMOVU VEC_SIZE(%rsi), %VEC(1)
|
|
||||||
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
|
|
||||||
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
|
|
||||||
- addq $(VEC_SIZE * 4), %rsi
|
|
||||||
- subq $(VEC_SIZE * 4), %rdx
|
|
||||||
+ subq $-(VEC_SIZE * 4), %rsi
|
|
||||||
+ addq $-(VEC_SIZE * 4), %rdx
|
|
||||||
VMOVA %VEC(0), (%rdi)
|
|
||||||
VMOVA %VEC(1), VEC_SIZE(%rdi)
|
|
||||||
VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)
|
|
||||||
VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi)
|
|
||||||
- addq $(VEC_SIZE * 4), %rdi
|
|
||||||
+ subq $-(VEC_SIZE * 4), %rdi
|
|
||||||
cmpq $(VEC_SIZE * 4), %rdx
|
|
||||||
ja L(loop_4x_vec_forward)
|
|
||||||
/* Store the last 4 * VEC. */
|
|
||||||
@@ -467,24 +532,21 @@ L(more_8x_vec_backward):
|
|
||||||
subq %r8, %r9
|
|
||||||
/* Adjust length. */
|
|
||||||
subq %r8, %rdx
|
|
||||||
-#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
|
|
||||||
- /* Check non-temporal store threshold. */
|
|
||||||
- cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
|
|
||||||
- ja L(large_backward)
|
|
||||||
-#endif
|
|
||||||
+
|
|
||||||
+ .p2align 4
|
|
||||||
L(loop_4x_vec_backward):
|
|
||||||
/* Copy 4 * VEC a time backward. */
|
|
||||||
VMOVU (%rcx), %VEC(0)
|
|
||||||
VMOVU -VEC_SIZE(%rcx), %VEC(1)
|
|
||||||
VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
|
|
||||||
VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
|
|
||||||
- subq $(VEC_SIZE * 4), %rcx
|
|
||||||
- subq $(VEC_SIZE * 4), %rdx
|
|
||||||
+ addq $-(VEC_SIZE * 4), %rcx
|
|
||||||
+ addq $-(VEC_SIZE * 4), %rdx
|
|
||||||
VMOVA %VEC(0), (%r9)
|
|
||||||
VMOVA %VEC(1), -VEC_SIZE(%r9)
|
|
||||||
VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9)
|
|
||||||
VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9)
|
|
||||||
- subq $(VEC_SIZE * 4), %r9
|
|
||||||
+ addq $-(VEC_SIZE * 4), %r9
|
|
||||||
cmpq $(VEC_SIZE * 4), %rdx
|
|
||||||
ja L(loop_4x_vec_backward)
|
|
||||||
/* Store the first 4 * VEC. */
|
|
||||||
@@ -497,72 +559,202 @@ L(loop_4x_vec_backward):
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
|
|
||||||
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
|
|
||||||
-L(large_forward):
|
|
||||||
+ .p2align 4
|
|
||||||
+L(large_memcpy_2x):
|
|
||||||
+ /* Compute absolute value of difference between source and
|
|
||||||
+ destination. */
|
|
||||||
+ movq %rdi, %r9
|
|
||||||
+ subq %rsi, %r9
|
|
||||||
+ movq %r9, %r8
|
|
||||||
+ leaq -1(%r9), %rcx
|
|
||||||
+ sarq $63, %r8
|
|
||||||
+ xorq %r8, %r9
|
|
||||||
+ subq %r8, %r9
|
|
||||||
/* Don't use non-temporal store if there is overlap between
|
|
||||||
- destination and source since destination may be in cache
|
|
||||||
- when source is loaded. */
|
|
||||||
- leaq (%rdi, %rdx), %r10
|
|
||||||
- cmpq %r10, %rsi
|
|
||||||
- jb L(loop_4x_vec_forward)
|
|
||||||
-L(loop_large_forward):
|
|
||||||
+ destination and source since destination may be in cache when
|
|
||||||
+ source is loaded. */
|
|
||||||
+ cmpq %r9, %rdx
|
|
||||||
+ ja L(more_8x_vec_check)
|
|
||||||
+
|
|
||||||
+ /* Cache align destination. First store the first 64 bytes then
|
|
||||||
+ adjust alignments. */
|
|
||||||
+ VMOVU (%rsi), %VEC(8)
|
|
||||||
+#if VEC_SIZE < 64
|
|
||||||
+ VMOVU VEC_SIZE(%rsi), %VEC(9)
|
|
||||||
+#if VEC_SIZE < 32
|
|
||||||
+ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(10)
|
|
||||||
+ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(11)
|
|
||||||
+#endif
|
|
||||||
+#endif
|
|
||||||
+ VMOVU %VEC(8), (%rdi)
|
|
||||||
+#if VEC_SIZE < 64
|
|
||||||
+ VMOVU %VEC(9), VEC_SIZE(%rdi)
|
|
||||||
+#if VEC_SIZE < 32
|
|
||||||
+ VMOVU %VEC(10), (VEC_SIZE * 2)(%rdi)
|
|
||||||
+ VMOVU %VEC(11), (VEC_SIZE * 3)(%rdi)
|
|
||||||
+#endif
|
|
||||||
+#endif
|
|
||||||
+ /* Adjust source, destination, and size. */
|
|
||||||
+ movq %rdi, %r8
|
|
||||||
+ andq $63, %r8
|
|
||||||
+ /* Get the negative of offset for alignment. */
|
|
||||||
+ subq $64, %r8
|
|
||||||
+ /* Adjust source. */
|
|
||||||
+ subq %r8, %rsi
|
|
||||||
+ /* Adjust destination which should be aligned now. */
|
|
||||||
+ subq %r8, %rdi
|
|
||||||
+ /* Adjust length. */
|
|
||||||
+ addq %r8, %rdx
|
|
||||||
+
|
|
||||||
+ /* Test if source and destination addresses will alias. If they do
|
|
||||||
+ the larger pipeline in large_memcpy_4x alleviates the
|
|
||||||
+ performance drop. */
|
|
||||||
+ testl $(PAGE_SIZE - VEC_SIZE * 8), %ecx
|
|
||||||
+ jz L(large_memcpy_4x)
|
|
||||||
+
|
|
||||||
+ movq %rdx, %r10
|
|
||||||
+ shrq $LOG_4X_MEMCPY_THRESH, %r10
|
|
||||||
+ cmp __x86_shared_non_temporal_threshold(%rip), %r10
|
|
||||||
+ jae L(large_memcpy_4x)
|
|
||||||
+
|
|
||||||
+ /* edx will store remainder size for copying tail. */
|
|
||||||
+ andl $(PAGE_SIZE * 2 - 1), %edx
|
|
||||||
+ /* r10 stores outer loop counter. */
|
|
||||||
+ shrq $((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10
|
|
||||||
+ /* Copy 4x VEC at a time from 2 pages. */
|
|
||||||
+ .p2align 4
|
|
||||||
+L(loop_large_memcpy_2x_outer):
|
|
||||||
+ /* ecx stores inner loop counter. */
|
|
||||||
+ movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
|
|
||||||
+L(loop_large_memcpy_2x_inner):
|
|
||||||
+ PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
|
|
||||||
+ PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
|
|
||||||
+ PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
|
|
||||||
+ PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2)
|
|
||||||
+ /* Load vectors from rsi. */
|
|
||||||
+ LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
|
|
||||||
+ LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
|
|
||||||
+ subq $-LARGE_LOAD_SIZE, %rsi
|
|
||||||
+ /* Non-temporal store vectors to rdi. */
|
|
||||||
+ STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
|
|
||||||
+ STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
|
|
||||||
+ subq $-LARGE_LOAD_SIZE, %rdi
|
|
||||||
+ decl %ecx
|
|
||||||
+ jnz L(loop_large_memcpy_2x_inner)
|
|
||||||
+ addq $PAGE_SIZE, %rdi
|
|
||||||
+ addq $PAGE_SIZE, %rsi
|
|
||||||
+ decq %r10
|
|
||||||
+ jne L(loop_large_memcpy_2x_outer)
|
|
||||||
+ sfence
|
|
||||||
+
|
|
||||||
+ /* Check if only last 4 loads are needed. */
|
|
||||||
+ cmpl $(VEC_SIZE * 4), %edx
|
|
||||||
+ jbe L(large_memcpy_2x_end)
|
|
||||||
+
|
|
||||||
+ /* Handle the last 2 * PAGE_SIZE bytes. */
|
|
||||||
+L(loop_large_memcpy_2x_tail):
|
|
||||||
/* Copy 4 * VEC a time forward with non-temporal stores. */
|
|
||||||
- PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
|
|
||||||
- PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
|
|
||||||
+ PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
|
|
||||||
+ PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
|
|
||||||
VMOVU (%rsi), %VEC(0)
|
|
||||||
VMOVU VEC_SIZE(%rsi), %VEC(1)
|
|
||||||
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
|
|
||||||
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
|
|
||||||
- addq $PREFETCHED_LOAD_SIZE, %rsi
|
|
||||||
- subq $PREFETCHED_LOAD_SIZE, %rdx
|
|
||||||
- VMOVNT %VEC(0), (%rdi)
|
|
||||||
- VMOVNT %VEC(1), VEC_SIZE(%rdi)
|
|
||||||
- VMOVNT %VEC(2), (VEC_SIZE * 2)(%rdi)
|
|
||||||
- VMOVNT %VEC(3), (VEC_SIZE * 3)(%rdi)
|
|
||||||
- addq $PREFETCHED_LOAD_SIZE, %rdi
|
|
||||||
- cmpq $PREFETCHED_LOAD_SIZE, %rdx
|
|
||||||
- ja L(loop_large_forward)
|
|
||||||
- sfence
|
|
||||||
+ subq $-(VEC_SIZE * 4), %rsi
|
|
||||||
+ addl $-(VEC_SIZE * 4), %edx
|
|
||||||
+ VMOVA %VEC(0), (%rdi)
|
|
||||||
+ VMOVA %VEC(1), VEC_SIZE(%rdi)
|
|
||||||
+ VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)
|
|
||||||
+ VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi)
|
|
||||||
+ subq $-(VEC_SIZE * 4), %rdi
|
|
||||||
+ cmpl $(VEC_SIZE * 4), %edx
|
|
||||||
+ ja L(loop_large_memcpy_2x_tail)
|
|
||||||
+
|
|
||||||
+L(large_memcpy_2x_end):
|
|
||||||
/* Store the last 4 * VEC. */
|
|
||||||
- VMOVU %VEC(5), (%rcx)
|
|
||||||
- VMOVU %VEC(6), -VEC_SIZE(%rcx)
|
|
||||||
- VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx)
|
|
||||||
- VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)
|
|
||||||
- /* Store the first VEC. */
|
|
||||||
- VMOVU %VEC(4), (%r11)
|
|
||||||
+ VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
|
|
||||||
+ VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
|
|
||||||
+ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
|
|
||||||
+ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3)
|
|
||||||
+
|
|
||||||
+ VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
|
|
||||||
+ VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
|
|
||||||
+ VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
|
|
||||||
+ VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx)
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
|
|
||||||
-L(large_backward):
|
|
||||||
- /* Don't use non-temporal store if there is overlap between
|
|
||||||
- destination and source since destination may be in cache
|
|
||||||
- when source is loaded. */
|
|
||||||
- leaq (%rcx, %rdx), %r10
|
|
||||||
- cmpq %r10, %r9
|
|
||||||
- jb L(loop_4x_vec_backward)
|
|
||||||
-L(loop_large_backward):
|
|
||||||
- /* Copy 4 * VEC a time backward with non-temporal stores. */
|
|
||||||
- PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
|
|
||||||
- PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
|
|
||||||
- VMOVU (%rcx), %VEC(0)
|
|
||||||
- VMOVU -VEC_SIZE(%rcx), %VEC(1)
|
|
||||||
- VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
|
|
||||||
- VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
|
|
||||||
- subq $PREFETCHED_LOAD_SIZE, %rcx
|
|
||||||
- subq $PREFETCHED_LOAD_SIZE, %rdx
|
|
||||||
- VMOVNT %VEC(0), (%r9)
|
|
||||||
- VMOVNT %VEC(1), -VEC_SIZE(%r9)
|
|
||||||
- VMOVNT %VEC(2), -(VEC_SIZE * 2)(%r9)
|
|
||||||
- VMOVNT %VEC(3), -(VEC_SIZE * 3)(%r9)
|
|
||||||
- subq $PREFETCHED_LOAD_SIZE, %r9
|
|
||||||
- cmpq $PREFETCHED_LOAD_SIZE, %rdx
|
|
||||||
- ja L(loop_large_backward)
|
|
||||||
+ .p2align 4
|
|
||||||
+L(large_memcpy_4x):
|
|
||||||
+ movq %rdx, %r10
|
|
||||||
+ /* edx will store remainder size for copying tail. */
|
|
||||||
+ andl $(PAGE_SIZE * 4 - 1), %edx
|
|
||||||
+ /* r10 stores outer loop counter. */
|
|
||||||
+ shrq $(LOG_PAGE_SIZE + 2), %r10
|
|
||||||
+ /* Copy 4x VEC at a time from 4 pages. */
|
|
||||||
+ .p2align 4
|
|
||||||
+L(loop_large_memcpy_4x_outer):
|
|
||||||
+ /* ecx stores inner loop counter. */
|
|
||||||
+ movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
|
|
||||||
+L(loop_large_memcpy_4x_inner):
|
|
||||||
+ /* Only one prefetch set per page as doing 4 pages give more time
|
|
||||||
+ for prefetcher to keep up. */
|
|
||||||
+ PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
|
|
||||||
+ PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
|
|
||||||
+ PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
|
|
||||||
+ PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE)
|
|
||||||
+ /* Load vectors from rsi. */
|
|
||||||
+ LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
|
|
||||||
+ LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
|
|
||||||
+ LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
|
|
||||||
+ LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
|
|
||||||
+ subq $-LARGE_LOAD_SIZE, %rsi
|
|
||||||
+ /* Non-temporal store vectors to rdi. */
|
|
||||||
+ STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
|
|
||||||
+ STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
|
|
||||||
+ STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
|
|
||||||
+ STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
|
|
||||||
+ subq $-LARGE_LOAD_SIZE, %rdi
|
|
||||||
+ decl %ecx
|
|
||||||
+ jnz L(loop_large_memcpy_4x_inner)
|
|
||||||
+ addq $(PAGE_SIZE * 3), %rdi
|
|
||||||
+ addq $(PAGE_SIZE * 3), %rsi
|
|
||||||
+ decq %r10
|
|
||||||
+ jne L(loop_large_memcpy_4x_outer)
|
|
||||||
sfence
|
|
||||||
- /* Store the first 4 * VEC. */
|
|
||||||
- VMOVU %VEC(4), (%rdi)
|
|
||||||
- VMOVU %VEC(5), VEC_SIZE(%rdi)
|
|
||||||
- VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi)
|
|
||||||
- VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
|
|
||||||
- /* Store the last VEC. */
|
|
||||||
- VMOVU %VEC(8), (%r11)
|
|
||||||
+ /* Check if only last 4 loads are needed. */
|
|
||||||
+ cmpl $(VEC_SIZE * 4), %edx
|
|
||||||
+ jbe L(large_memcpy_4x_end)
|
|
||||||
+
|
|
||||||
+ /* Handle the last 4 * PAGE_SIZE bytes. */
|
|
||||||
+L(loop_large_memcpy_4x_tail):
|
|
||||||
+ /* Copy 4 * VEC a time forward with non-temporal stores. */
|
|
||||||
+ PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
|
|
||||||
+ PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
|
|
||||||
+ VMOVU (%rsi), %VEC(0)
|
|
||||||
+ VMOVU VEC_SIZE(%rsi), %VEC(1)
|
|
||||||
+ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
|
|
||||||
+ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
|
|
||||||
+ subq $-(VEC_SIZE * 4), %rsi
|
|
||||||
+ addl $-(VEC_SIZE * 4), %edx
|
|
||||||
+ VMOVA %VEC(0), (%rdi)
|
|
||||||
+ VMOVA %VEC(1), VEC_SIZE(%rdi)
|
|
||||||
+ VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)
|
|
||||||
+ VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi)
|
|
||||||
+ subq $-(VEC_SIZE * 4), %rdi
|
|
||||||
+ cmpl $(VEC_SIZE * 4), %edx
|
|
||||||
+ ja L(loop_large_memcpy_4x_tail)
|
|
||||||
+
|
|
||||||
+L(large_memcpy_4x_end):
|
|
||||||
+ /* Store the last 4 * VEC. */
|
|
||||||
+ VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
|
|
||||||
+ VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
|
|
||||||
+ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
|
|
||||||
+ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3)
|
|
||||||
+
|
|
||||||
+ VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
|
|
||||||
+ VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
|
|
||||||
+ VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
|
|
||||||
+ VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx)
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
#endif
|
|
||||||
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,151 +0,0 @@
|
|||||||
From ecd8b842cf37ea112e59cd9085ff1f1b6e208ae0 Mon Sep 17 00:00:00 2001
|
|
||||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
|
||||||
Date: Mon, 21 Jan 2019 11:29:58 -0800
|
|
||||||
Subject: [PATCH] x86-64 memrchr: Properly handle the length parameter [BZ#
|
|
||||||
24097]
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
On x32, the size_t parameter may be passed in the lower 32 bits of a
|
|
||||||
64-bit register with the non-zero upper 32 bits. The string/memory
|
|
||||||
functions written in assembly can only use the lower 32 bits of a
|
|
||||||
64-bit register as length or must clear the upper 32 bits before using
|
|
||||||
the full 64-bit register for length.
|
|
||||||
|
|
||||||
This patch fixes memrchr for x32. Tested on x86-64 and x32. On x86-64,
|
|
||||||
libc.so is the same with and without the fix.
|
|
||||||
|
|
||||||
[BZ# 24097]
|
|
||||||
CVE-2019-6488
|
|
||||||
* sysdeps/x86_64/memrchr.S: Use RDX_LP for length.
|
|
||||||
* sysdeps/x86_64/multiarch/memrchr-avx2.S: Likewise.
|
|
||||||
* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memrchr.
|
|
||||||
* sysdeps/x86_64/x32/tst-size_t-memrchr.c: New file.
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/memrchr.S | 4 +-
|
|
||||||
sysdeps/x86_64/multiarch/memrchr-avx2.S | 4 +-
|
|
||||||
sysdeps/x86_64/x32/Makefile | 3 +-
|
|
||||||
sysdeps/x86_64/x32/tst-size_t-memrchr.c | 57 +++++++++++++++++++++++++
|
|
||||||
4 files changed, 63 insertions(+), 5 deletions(-)
|
|
||||||
create mode 100644 sysdeps/x86_64/x32/tst-size_t-memrchr.c
|
|
||||||
|
|
||||||
Conflicts:
|
|
||||||
ChangeLog
|
|
||||||
(removed)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S
|
|
||||||
index b8e3fa1d..dc82f8f7 100644
|
|
||||||
--- a/sysdeps/x86_64/memrchr.S
|
|
||||||
+++ b/sysdeps/x86_64/memrchr.S
|
|
||||||
@@ -24,13 +24,13 @@
|
|
||||||
ENTRY (__memrchr)
|
|
||||||
movd %esi, %xmm1
|
|
||||||
|
|
||||||
- sub $16, %rdx
|
|
||||||
+ sub $16, %RDX_LP
|
|
||||||
jbe L(length_less16)
|
|
||||||
|
|
||||||
punpcklbw %xmm1, %xmm1
|
|
||||||
punpcklbw %xmm1, %xmm1
|
|
||||||
|
|
||||||
- add %rdx, %rdi
|
|
||||||
+ add %RDX_LP, %RDI_LP
|
|
||||||
pshufd $0, %xmm1, %xmm1
|
|
||||||
|
|
||||||
movdqu (%rdi), %xmm0
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S
|
|
||||||
index b41a58bc..ce488dd9 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/memrchr-avx2.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S
|
|
||||||
@@ -32,10 +32,10 @@ ENTRY (__memrchr_avx2)
|
|
||||||
vmovd %esi, %xmm0
|
|
||||||
vpbroadcastb %xmm0, %ymm0
|
|
||||||
|
|
||||||
- subq $VEC_SIZE, %rdx
|
|
||||||
+ sub $VEC_SIZE, %RDX_LP
|
|
||||||
jbe L(last_vec_or_less)
|
|
||||||
|
|
||||||
- addq %rdx, %rdi
|
|
||||||
+ add %RDX_LP, %RDI_LP
|
|
||||||
|
|
||||||
/* Check the last VEC_SIZE bytes. */
|
|
||||||
vpcmpeqb (%rdi), %ymm0, %ymm1
|
|
||||||
diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
|
|
||||||
index 2fe1e5ac..e99dbd7c 100644
|
|
||||||
--- a/sysdeps/x86_64/x32/Makefile
|
|
||||||
+++ b/sysdeps/x86_64/x32/Makefile
|
|
||||||
@@ -6,7 +6,8 @@ CFLAGS-s_llround.c += -fno-builtin-lround
|
|
||||||
endif
|
|
||||||
|
|
||||||
ifeq ($(subdir),string)
|
|
||||||
-tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy
|
|
||||||
+tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \
|
|
||||||
+ tst-size_t-memrchr
|
|
||||||
endif
|
|
||||||
|
|
||||||
ifeq ($(subdir),wcsmbs)
|
|
||||||
diff --git a/sysdeps/x86_64/x32/tst-size_t-memrchr.c b/sysdeps/x86_64/x32/tst-size_t-memrchr.c
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..c83699c0
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/x86_64/x32/tst-size_t-memrchr.c
|
|
||||||
@@ -0,0 +1,57 @@
|
|
||||||
+/* Test memrchr with size_t in the lower 32 bits of 64-bit register.
|
|
||||||
+ Copyright (C) 2019 Free Software Foundation, Inc.
|
|
||||||
+ This file is part of the GNU C Library.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is free software; you can redistribute it and/or
|
|
||||||
+ modify it under the terms of the GNU Lesser General Public
|
|
||||||
+ License as published by the Free Software Foundation; either
|
|
||||||
+ version 2.1 of the License, or (at your option) any later version.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
||||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
+ Lesser General Public License for more details.
|
|
||||||
+
|
|
||||||
+ You should have received a copy of the GNU Lesser General Public
|
|
||||||
+ License along with the GNU C Library; if not, see
|
|
||||||
+ <http://www.gnu.org/licenses/>. */
|
|
||||||
+
|
|
||||||
+#define TEST_NAME "memrchr"
|
|
||||||
+#include "test-size_t.h"
|
|
||||||
+
|
|
||||||
+IMPL (memchr, 1)
|
|
||||||
+
|
|
||||||
+typedef void * (*proto_t) (const void *, int, size_t);
|
|
||||||
+
|
|
||||||
+static void *
|
|
||||||
+__attribute__ ((noinline, noclone))
|
|
||||||
+do_memrchr (parameter_t a, parameter_t b)
|
|
||||||
+{
|
|
||||||
+ return CALL (&b, a.p, (uintptr_t) b.p, a.len);
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+static int
|
|
||||||
+test_main (void)
|
|
||||||
+{
|
|
||||||
+ test_init ();
|
|
||||||
+
|
|
||||||
+ parameter_t src = { { page_size }, buf2 };
|
|
||||||
+ parameter_t c = { { 0 }, (void *) (uintptr_t) 0x12 };
|
|
||||||
+
|
|
||||||
+ int ret = 0;
|
|
||||||
+ FOR_EACH_IMPL (impl, 0)
|
|
||||||
+ {
|
|
||||||
+ c.fn = impl->fn;
|
|
||||||
+ void * res = do_memrchr (src, c);
|
|
||||||
+ if (res)
|
|
||||||
+ {
|
|
||||||
+ error (0, 0, "Wrong result in function %s: %p != NULL",
|
|
||||||
+ impl->name, res);
|
|
||||||
+ ret = 1;
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ return ret ? EXIT_FAILURE : EXIT_SUCCESS;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+#include <support/test-driver.c>
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,92 +0,0 @@
|
|||||||
From 83c5b368226c34a2f0a5287df40fc290b2b34359 Mon Sep 17 00:00:00 2001
|
|
||||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
|
||||||
Date: Mon, 19 Apr 2021 10:45:07 -0700
|
|
||||||
Subject: [PATCH] x86-64: Require BMI2 for strchr-avx2.S
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
Since strchr-avx2.S updated by
|
|
||||||
|
|
||||||
commit 1f745ecc2109890886b161d4791e1406fdfc29b8
|
|
||||||
Author: noah <goldstein.w.n@gmail.com>
|
|
||||||
Date: Wed Feb 3 00:38:59 2021 -0500
|
|
||||||
|
|
||||||
x86-64: Refactor and improve performance of strchr-avx2.S
|
|
||||||
|
|
||||||
uses sarx:
|
|
||||||
|
|
||||||
c4 e2 72 f7 c0 sarx %ecx,%eax,%eax
|
|
||||||
|
|
||||||
for strchr-avx2 family functions, require BMI2 in ifunc-impl-list.c and
|
|
||||||
ifunc-avx2.h.
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/multiarch/ifunc-avx2.h | 4 ++--
|
|
||||||
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 +++++++++---
|
|
||||||
2 files changed, 11 insertions(+), 5 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-avx2.h b/sysdeps/x86_64/multiarch/ifunc-avx2.h
|
|
||||||
index e0f30e61..ef72b73f 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/ifunc-avx2.h
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/ifunc-avx2.h
|
|
||||||
@@ -30,11 +30,11 @@ IFUNC_SELECTOR (void)
|
|
||||||
const struct cpu_features* cpu_features = __get_cpu_features ();
|
|
||||||
|
|
||||||
if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
|
|
||||||
+ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
|
|
||||||
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
|
|
||||||
{
|
|
||||||
if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
|
|
||||||
- && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
|
|
||||||
- && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
|
|
||||||
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
|
|
||||||
return OPTIMIZE (evex);
|
|
||||||
|
|
||||||
if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
||||||
index 695cdba6..85b8863a 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
||||||
@@ -400,10 +400,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
||||||
/* Support sysdeps/x86_64/multiarch/strchr.c. */
|
|
||||||
IFUNC_IMPL (i, name, strchr,
|
|
||||||
IFUNC_IMPL_ADD (array, i, strchr,
|
|
||||||
- CPU_FEATURE_USABLE (AVX2),
|
|
||||||
+ (CPU_FEATURE_USABLE (AVX2)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
|
||||||
__strchr_avx2)
|
|
||||||
IFUNC_IMPL_ADD (array, i, strchr,
|
|
||||||
(CPU_FEATURE_USABLE (AVX2)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)
|
|
||||||
&& CPU_FEATURE_USABLE (RTM)),
|
|
||||||
__strchr_avx2_rtm)
|
|
||||||
IFUNC_IMPL_ADD (array, i, strchr,
|
|
||||||
@@ -417,10 +419,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
||||||
/* Support sysdeps/x86_64/multiarch/strchrnul.c. */
|
|
||||||
IFUNC_IMPL (i, name, strchrnul,
|
|
||||||
IFUNC_IMPL_ADD (array, i, strchrnul,
|
|
||||||
- CPU_FEATURE_USABLE (AVX2),
|
|
||||||
+ (CPU_FEATURE_USABLE (AVX2)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
|
||||||
__strchrnul_avx2)
|
|
||||||
IFUNC_IMPL_ADD (array, i, strchrnul,
|
|
||||||
(CPU_FEATURE_USABLE (AVX2)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)
|
|
||||||
&& CPU_FEATURE_USABLE (RTM)),
|
|
||||||
__strchrnul_avx2_rtm)
|
|
||||||
IFUNC_IMPL_ADD (array, i, strchrnul,
|
|
||||||
@@ -574,10 +578,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
||||||
/* Support sysdeps/x86_64/multiarch/wcschr.c. */
|
|
||||||
IFUNC_IMPL (i, name, wcschr,
|
|
||||||
IFUNC_IMPL_ADD (array, i, wcschr,
|
|
||||||
- CPU_FEATURE_USABLE (AVX2),
|
|
||||||
+ (CPU_FEATURE_USABLE (AVX2)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
|
||||||
__wcschr_avx2)
|
|
||||||
IFUNC_IMPL_ADD (array, i, wcschr,
|
|
||||||
(CPU_FEATURE_USABLE (AVX2)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)
|
|
||||||
&& CPU_FEATURE_USABLE (RTM)),
|
|
||||||
__wcschr_avx2_rtm)
|
|
||||||
IFUNC_IMPL_ADD (array, i, wcschr,
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,265 +0,0 @@
|
|||||||
From f53790272ce7bdc5ecd14b45f65d0464d2a61a3a Mon Sep 17 00:00:00 2001
|
|
||||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
Date: Mon, 19 Apr 2021 17:48:10 -0400
|
|
||||||
Subject: [PATCH] x86: Optimize less_vec evex and avx512
|
|
||||||
memset-vec-unaligned-erms.S
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
No bug. This commit adds an optimized path for the less_vec memset case that
|
|
||||||
uses the avx512vl/avx512bw mask store avoiding the excessive
|
|
||||||
branches. test-memset and test-wmemset are passing.
|
|
||||||
|
|
||||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 40 ++++++++++-----
|
|
||||||
sysdeps/x86_64/multiarch/ifunc-memset.h | 6 ++-
|
|
||||||
.../multiarch/memset-avx512-unaligned-erms.S | 2 +-
|
|
||||||
.../multiarch/memset-evex-unaligned-erms.S | 2 +-
|
|
||||||
.../multiarch/memset-vec-unaligned-erms.S | 51 +++++++++++++++----
|
|
||||||
5 files changed, 74 insertions(+), 27 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
||||||
index 85b8863a..d59d65f8 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
||||||
@@ -204,19 +204,23 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
||||||
__memset_chk_avx2_unaligned_erms_rtm)
|
|
||||||
IFUNC_IMPL_ADD (array, i, __memset_chk,
|
|
||||||
(CPU_FEATURE_USABLE (AVX512VL)
|
|
||||||
- && CPU_FEATURE_USABLE (AVX512BW)),
|
|
||||||
+ && CPU_FEATURE_USABLE (AVX512BW)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
|
||||||
__memset_chk_evex_unaligned)
|
|
||||||
IFUNC_IMPL_ADD (array, i, __memset_chk,
|
|
||||||
(CPU_FEATURE_USABLE (AVX512VL)
|
|
||||||
- && CPU_FEATURE_USABLE (AVX512BW)),
|
|
||||||
+ && CPU_FEATURE_USABLE (AVX512BW)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
|
||||||
__memset_chk_evex_unaligned_erms)
|
|
||||||
IFUNC_IMPL_ADD (array, i, __memset_chk,
|
|
||||||
(CPU_FEATURE_USABLE (AVX512VL)
|
|
||||||
- && CPU_FEATURE_USABLE (AVX512BW)),
|
|
||||||
+ && CPU_FEATURE_USABLE (AVX512BW)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
|
||||||
__memset_chk_avx512_unaligned_erms)
|
|
||||||
IFUNC_IMPL_ADD (array, i, __memset_chk,
|
|
||||||
(CPU_FEATURE_USABLE (AVX512VL)
|
|
||||||
- && CPU_FEATURE_USABLE (AVX512BW)),
|
|
||||||
+ && CPU_FEATURE_USABLE (AVX512BW)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
|
||||||
__memset_chk_avx512_unaligned)
|
|
||||||
IFUNC_IMPL_ADD (array, i, __memset_chk,
|
|
||||||
CPU_FEATURE_USABLE (AVX512F),
|
|
||||||
@@ -247,19 +251,23 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
||||||
__memset_avx2_unaligned_erms_rtm)
|
|
||||||
IFUNC_IMPL_ADD (array, i, memset,
|
|
||||||
(CPU_FEATURE_USABLE (AVX512VL)
|
|
||||||
- && CPU_FEATURE_USABLE (AVX512BW)),
|
|
||||||
+ && CPU_FEATURE_USABLE (AVX512BW)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
|
||||||
__memset_evex_unaligned)
|
|
||||||
IFUNC_IMPL_ADD (array, i, memset,
|
|
||||||
(CPU_FEATURE_USABLE (AVX512VL)
|
|
||||||
- && CPU_FEATURE_USABLE (AVX512BW)),
|
|
||||||
+ && CPU_FEATURE_USABLE (AVX512BW)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
|
||||||
__memset_evex_unaligned_erms)
|
|
||||||
IFUNC_IMPL_ADD (array, i, memset,
|
|
||||||
(CPU_FEATURE_USABLE (AVX512VL)
|
|
||||||
- && CPU_FEATURE_USABLE (AVX512BW)),
|
|
||||||
+ && CPU_FEATURE_USABLE (AVX512BW)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
|
||||||
__memset_avx512_unaligned_erms)
|
|
||||||
IFUNC_IMPL_ADD (array, i, memset,
|
|
||||||
(CPU_FEATURE_USABLE (AVX512VL)
|
|
||||||
- && CPU_FEATURE_USABLE (AVX512BW)),
|
|
||||||
+ && CPU_FEATURE_USABLE (AVX512BW)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
|
||||||
__memset_avx512_unaligned)
|
|
||||||
IFUNC_IMPL_ADD (array, i, memset,
|
|
||||||
CPU_FEATURE_USABLE (AVX512F),
|
|
||||||
@@ -739,10 +747,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
||||||
&& CPU_FEATURE_USABLE (RTM)),
|
|
||||||
__wmemset_avx2_unaligned_rtm)
|
|
||||||
IFUNC_IMPL_ADD (array, i, wmemset,
|
|
||||||
- CPU_FEATURE_USABLE (AVX512VL),
|
|
||||||
+ (CPU_FEATURE_USABLE (AVX512VL)
|
|
||||||
+ && CPU_FEATURE_USABLE (AVX512BW)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
|
||||||
__wmemset_evex_unaligned)
|
|
||||||
IFUNC_IMPL_ADD (array, i, wmemset,
|
|
||||||
- CPU_FEATURE_USABLE (AVX512VL),
|
|
||||||
+ (CPU_FEATURE_USABLE (AVX512VL)
|
|
||||||
+ && CPU_FEATURE_USABLE (AVX512BW)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
|
||||||
__wmemset_avx512_unaligned))
|
|
||||||
|
|
||||||
#ifdef SHARED
|
|
||||||
@@ -946,10 +958,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
||||||
CPU_FEATURE_USABLE (AVX2),
|
|
||||||
__wmemset_chk_avx2_unaligned)
|
|
||||||
IFUNC_IMPL_ADD (array, i, __wmemset_chk,
|
|
||||||
- CPU_FEATURE_USABLE (AVX512VL),
|
|
||||||
+ (CPU_FEATURE_USABLE (AVX512VL)
|
|
||||||
+ && CPU_FEATURE_USABLE (AVX512BW)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
|
||||||
__wmemset_chk_evex_unaligned)
|
|
||||||
IFUNC_IMPL_ADD (array, i, __wmemset_chk,
|
|
||||||
- CPU_FEATURE_USABLE (AVX512F),
|
|
||||||
+ (CPU_FEATURE_USABLE (AVX512VL)
|
|
||||||
+ && CPU_FEATURE_USABLE (AVX512BW)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
|
||||||
__wmemset_chk_avx512_unaligned))
|
|
||||||
#endif
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
|
|
||||||
index 19795938..100e3707 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/ifunc-memset.h
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
|
|
||||||
@@ -54,7 +54,8 @@ IFUNC_SELECTOR (void)
|
|
||||||
&& !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
|
|
||||||
{
|
|
||||||
if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
|
|
||||||
- && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
|
|
||||||
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
|
|
||||||
+ && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
|
|
||||||
{
|
|
||||||
if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
|
|
||||||
return OPTIMIZE (avx512_unaligned_erms);
|
|
||||||
@@ -68,7 +69,8 @@ IFUNC_SELECTOR (void)
|
|
||||||
if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
|
|
||||||
{
|
|
||||||
if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
|
|
||||||
- && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
|
|
||||||
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
|
|
||||||
+ && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
|
|
||||||
{
|
|
||||||
if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
|
|
||||||
return OPTIMIZE (evex_unaligned_erms);
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
|
|
||||||
index 22e7b187..8ad842fc 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
|
|
||||||
@@ -19,6 +19,6 @@
|
|
||||||
# define SECTION(p) p##.evex512
|
|
||||||
# define MEMSET_SYMBOL(p,s) p##_avx512_##s
|
|
||||||
# define WMEMSET_SYMBOL(p,s) p##_avx512_##s
|
|
||||||
-
|
|
||||||
+# define USE_LESS_VEC_MASK_STORE 1
|
|
||||||
# include "memset-vec-unaligned-erms.S"
|
|
||||||
#endif
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
|
|
||||||
index ae0a4d6e..640f0929 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
|
|
||||||
@@ -19,6 +19,6 @@
|
|
||||||
# define SECTION(p) p##.evex
|
|
||||||
# define MEMSET_SYMBOL(p,s) p##_evex_##s
|
|
||||||
# define WMEMSET_SYMBOL(p,s) p##_evex_##s
|
|
||||||
-
|
|
||||||
+# define USE_LESS_VEC_MASK_STORE 1
|
|
||||||
# include "memset-vec-unaligned-erms.S"
|
|
||||||
#endif
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
|
||||||
index bae5cba4..f877ac9d 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
|
||||||
@@ -63,6 +63,8 @@
|
|
||||||
# endif
|
|
||||||
#endif
|
|
||||||
|
|
||||||
+#define PAGE_SIZE 4096
|
|
||||||
+
|
|
||||||
#ifndef SECTION
|
|
||||||
# error SECTION is not defined!
|
|
||||||
#endif
|
|
||||||
@@ -213,11 +215,38 @@ L(loop):
|
|
||||||
cmpq %rcx, %rdx
|
|
||||||
jne L(loop)
|
|
||||||
VZEROUPPER_SHORT_RETURN
|
|
||||||
+
|
|
||||||
+ .p2align 4
|
|
||||||
L(less_vec):
|
|
||||||
/* Less than 1 VEC. */
|
|
||||||
# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
|
|
||||||
# error Unsupported VEC_SIZE!
|
|
||||||
# endif
|
|
||||||
+# ifdef USE_LESS_VEC_MASK_STORE
|
|
||||||
+ /* Clear high bits from edi. Only keeping bits relevant to page
|
|
||||||
+ cross check. Note that we are using rax which is set in
|
|
||||||
+ MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out.
|
|
||||||
+ */
|
|
||||||
+ andl $(PAGE_SIZE - 1), %edi
|
|
||||||
+ /* Check if VEC_SIZE store cross page. Mask stores suffer serious
|
|
||||||
+ performance degradation when it has to fault supress. */
|
|
||||||
+ cmpl $(PAGE_SIZE - VEC_SIZE), %edi
|
|
||||||
+ ja L(cross_page)
|
|
||||||
+# if VEC_SIZE > 32
|
|
||||||
+ movq $-1, %rcx
|
|
||||||
+ bzhiq %rdx, %rcx, %rcx
|
|
||||||
+ kmovq %rcx, %k1
|
|
||||||
+# else
|
|
||||||
+ movl $-1, %ecx
|
|
||||||
+ bzhil %edx, %ecx, %ecx
|
|
||||||
+ kmovd %ecx, %k1
|
|
||||||
+# endif
|
|
||||||
+ vmovdqu8 %VEC(0), (%rax) {%k1}
|
|
||||||
+ VZEROUPPER_RETURN
|
|
||||||
+
|
|
||||||
+ .p2align 4
|
|
||||||
+L(cross_page):
|
|
||||||
+# endif
|
|
||||||
# if VEC_SIZE > 32
|
|
||||||
cmpb $32, %dl
|
|
||||||
jae L(between_32_63)
|
|
||||||
@@ -234,36 +263,36 @@ L(less_vec):
|
|
||||||
cmpb $1, %dl
|
|
||||||
ja L(between_2_3)
|
|
||||||
jb 1f
|
|
||||||
- movb %cl, (%rdi)
|
|
||||||
+ movb %cl, (%rax)
|
|
||||||
1:
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
# if VEC_SIZE > 32
|
|
||||||
/* From 32 to 63. No branch when size == 32. */
|
|
||||||
L(between_32_63):
|
|
||||||
- VMOVU %YMM0, -32(%rdi,%rdx)
|
|
||||||
- VMOVU %YMM0, (%rdi)
|
|
||||||
+ VMOVU %YMM0, -32(%rax,%rdx)
|
|
||||||
+ VMOVU %YMM0, (%rax)
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
# endif
|
|
||||||
# if VEC_SIZE > 16
|
|
||||||
/* From 16 to 31. No branch when size == 16. */
|
|
||||||
L(between_16_31):
|
|
||||||
- VMOVU %XMM0, -16(%rdi,%rdx)
|
|
||||||
- VMOVU %XMM0, (%rdi)
|
|
||||||
+ VMOVU %XMM0, -16(%rax,%rdx)
|
|
||||||
+ VMOVU %XMM0, (%rax)
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
# endif
|
|
||||||
/* From 8 to 15. No branch when size == 8. */
|
|
||||||
L(between_8_15):
|
|
||||||
- movq %rcx, -8(%rdi,%rdx)
|
|
||||||
- movq %rcx, (%rdi)
|
|
||||||
+ movq %rcx, -8(%rax,%rdx)
|
|
||||||
+ movq %rcx, (%rax)
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
L(between_4_7):
|
|
||||||
/* From 4 to 7. No branch when size == 4. */
|
|
||||||
- movl %ecx, -4(%rdi,%rdx)
|
|
||||||
- movl %ecx, (%rdi)
|
|
||||||
+ movl %ecx, -4(%rax,%rdx)
|
|
||||||
+ movl %ecx, (%rax)
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
L(between_2_3):
|
|
||||||
/* From 2 to 3. No branch when size == 2. */
|
|
||||||
- movw %cx, -2(%rdi,%rdx)
|
|
||||||
- movw %cx, (%rdi)
|
|
||||||
+ movw %cx, -2(%rax,%rdx)
|
|
||||||
+ movw %cx, (%rax)
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
END (MEMSET_SYMBOL (__memset, unaligned_erms))
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,396 +0,0 @@
|
|||||||
From ccabe7971f508709d034b63b8672f6f751a3d356 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
Date: Fri, 23 Apr 2021 15:56:24 -0400
|
|
||||||
Subject: [PATCH] x86: Optimize strchr-avx2.S
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
No bug. This commit optimizes strchr-avx2.S. The optimizations are all
|
|
||||||
small things such as save an ALU in the alignment process, saving a
|
|
||||||
few instructions in the loop return, saving some bytes in the main
|
|
||||||
loop, and increasing the ILP in the return cases. test-strchr,
|
|
||||||
test-strchrnul, test-wcschr, and test-wcschrnul are all passing.
|
|
||||||
|
|
||||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/multiarch/strchr-avx2.S | 290 +++++++++++++++----------
|
|
||||||
1 file changed, 170 insertions(+), 120 deletions(-)
|
|
||||||
|
|
||||||
Conflicts:
|
|
||||||
sysdeps/x86_64/multiarch/strchr-avx2.S
|
|
||||||
(rearranged to account for branch changes)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
|
|
||||||
index 919d256c..5884726b 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/strchr-avx2.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
|
|
||||||
@@ -49,133 +49,144 @@
|
|
||||||
|
|
||||||
.section SECTION(.text),"ax",@progbits
|
|
||||||
ENTRY (STRCHR)
|
|
||||||
- movl %edi, %ecx
|
|
||||||
-# ifndef USE_AS_STRCHRNUL
|
|
||||||
- xorl %edx, %edx
|
|
||||||
-# endif
|
|
||||||
-
|
|
||||||
/* Broadcast CHAR to YMM0. */
|
|
||||||
vmovd %esi, %xmm0
|
|
||||||
+ movl %edi, %eax
|
|
||||||
+ andl $(PAGE_SIZE - 1), %eax
|
|
||||||
+ VPBROADCAST %xmm0, %ymm0
|
|
||||||
vpxor %xmm9, %xmm9, %xmm9
|
|
||||||
- VPBROADCAST %xmm0, %ymm0
|
|
||||||
|
|
||||||
/* Check if we cross page boundary with one vector load. */
|
|
||||||
- andl $(PAGE_SIZE - 1), %ecx
|
|
||||||
- cmpl $(PAGE_SIZE - VEC_SIZE), %ecx
|
|
||||||
- ja L(cross_page_boundary)
|
|
||||||
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
|
|
||||||
+ ja L(cross_page_boundary)
|
|
||||||
|
|
||||||
/* Check the first VEC_SIZE bytes. Search for both CHAR and the
|
|
||||||
null byte. */
|
|
||||||
vmovdqu (%rdi), %ymm8
|
|
||||||
- VPCMPEQ %ymm8, %ymm0, %ymm1
|
|
||||||
- VPCMPEQ %ymm8, %ymm9, %ymm2
|
|
||||||
+ VPCMPEQ %ymm8, %ymm0, %ymm1
|
|
||||||
+ VPCMPEQ %ymm8, %ymm9, %ymm2
|
|
||||||
vpor %ymm1, %ymm2, %ymm1
|
|
||||||
vpmovmskb %ymm1, %eax
|
|
||||||
testl %eax, %eax
|
|
||||||
- jz L(more_vecs)
|
|
||||||
+ jz L(aligned_more)
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
+# ifndef USE_AS_STRCHRNUL
|
|
||||||
/* Found CHAR or the null byte. */
|
|
||||||
+ cmp (%rdi, %rax), %CHAR_REG
|
|
||||||
+ jne L(zero)
|
|
||||||
+# endif
|
|
||||||
addq %rdi, %rax
|
|
||||||
+ VZEROUPPER_RETURN
|
|
||||||
+
|
|
||||||
+ /* .p2align 5 helps keep performance more consistent if ENTRY()
|
|
||||||
+ alignment % 32 was either 16 or 0. As well this makes the
|
|
||||||
+ alignment % 32 of the loop_4x_vec fixed which makes tuning it
|
|
||||||
+ easier. */
|
|
||||||
+ .p2align 5
|
|
||||||
+L(first_vec_x4):
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+ addq $(VEC_SIZE * 3 + 1), %rdi
|
|
||||||
# ifndef USE_AS_STRCHRNUL
|
|
||||||
- cmp (%rax), %CHAR_REG
|
|
||||||
- cmovne %rdx, %rax
|
|
||||||
+ /* Found CHAR or the null byte. */
|
|
||||||
+ cmp (%rdi, %rax), %CHAR_REG
|
|
||||||
+ jne L(zero)
|
|
||||||
# endif
|
|
||||||
-L(return_vzeroupper):
|
|
||||||
- ZERO_UPPER_VEC_REGISTERS_RETURN
|
|
||||||
-
|
|
||||||
- .p2align 4
|
|
||||||
-L(more_vecs):
|
|
||||||
- /* Align data for aligned loads in the loop. */
|
|
||||||
- andq $-VEC_SIZE, %rdi
|
|
||||||
-L(aligned_more):
|
|
||||||
-
|
|
||||||
- /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time
|
|
||||||
- since data is only aligned to VEC_SIZE. */
|
|
||||||
- vmovdqa VEC_SIZE(%rdi), %ymm8
|
|
||||||
- addq $VEC_SIZE, %rdi
|
|
||||||
- VPCMPEQ %ymm8, %ymm0, %ymm1
|
|
||||||
- VPCMPEQ %ymm8, %ymm9, %ymm2
|
|
||||||
- vpor %ymm1, %ymm2, %ymm1
|
|
||||||
- vpmovmskb %ymm1, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x0)
|
|
||||||
-
|
|
||||||
- vmovdqa VEC_SIZE(%rdi), %ymm8
|
|
||||||
- VPCMPEQ %ymm8, %ymm0, %ymm1
|
|
||||||
- VPCMPEQ %ymm8, %ymm9, %ymm2
|
|
||||||
- vpor %ymm1, %ymm2, %ymm1
|
|
||||||
- vpmovmskb %ymm1, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x1)
|
|
||||||
-
|
|
||||||
- vmovdqa (VEC_SIZE * 2)(%rdi), %ymm8
|
|
||||||
- VPCMPEQ %ymm8, %ymm0, %ymm1
|
|
||||||
- VPCMPEQ %ymm8, %ymm9, %ymm2
|
|
||||||
- vpor %ymm1, %ymm2, %ymm1
|
|
||||||
- vpmovmskb %ymm1, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x2)
|
|
||||||
-
|
|
||||||
- vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
|
|
||||||
- VPCMPEQ %ymm8, %ymm0, %ymm1
|
|
||||||
- VPCMPEQ %ymm8, %ymm9, %ymm2
|
|
||||||
- vpor %ymm1, %ymm2, %ymm1
|
|
||||||
- vpmovmskb %ymm1, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
- jz L(prep_loop_4x)
|
|
||||||
+ addq %rdi, %rax
|
|
||||||
+ VZEROUPPER_RETURN
|
|
||||||
|
|
||||||
- tzcntl %eax, %eax
|
|
||||||
- leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
|
|
||||||
# ifndef USE_AS_STRCHRNUL
|
|
||||||
- cmp (%rax), %CHAR_REG
|
|
||||||
- cmovne %rdx, %rax
|
|
||||||
+L(zero):
|
|
||||||
+ xorl %eax, %eax
|
|
||||||
+ VZEROUPPER_RETURN
|
|
||||||
# endif
|
|
||||||
- VZEROUPPER
|
|
||||||
- ret
|
|
||||||
+
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
-L(first_vec_x0):
|
|
||||||
+L(first_vec_x1):
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
- /* Found CHAR or the null byte. */
|
|
||||||
- addq %rdi, %rax
|
|
||||||
+ incq %rdi
|
|
||||||
# ifndef USE_AS_STRCHRNUL
|
|
||||||
- cmp (%rax), %CHAR_REG
|
|
||||||
- cmovne %rdx, %rax
|
|
||||||
+ /* Found CHAR or the null byte. */
|
|
||||||
+ cmp (%rdi, %rax), %CHAR_REG
|
|
||||||
+ jne L(zero)
|
|
||||||
# endif
|
|
||||||
+ addq %rdi, %rax
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
-L(first_vec_x1):
|
|
||||||
+L(first_vec_x2):
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
- leaq VEC_SIZE(%rdi, %rax), %rax
|
|
||||||
+ addq $(VEC_SIZE + 1), %rdi
|
|
||||||
# ifndef USE_AS_STRCHRNUL
|
|
||||||
- cmp (%rax), %CHAR_REG
|
|
||||||
- cmovne %rdx, %rax
|
|
||||||
+ /* Found CHAR or the null byte. */
|
|
||||||
+ cmp (%rdi, %rax), %CHAR_REG
|
|
||||||
+ jne L(zero)
|
|
||||||
# endif
|
|
||||||
+ addq %rdi, %rax
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
-L(first_vec_x2):
|
|
||||||
+L(first_vec_x3):
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
- /* Found CHAR or the null byte. */
|
|
||||||
- leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
|
|
||||||
+ addq $(VEC_SIZE * 2 + 1), %rdi
|
|
||||||
# ifndef USE_AS_STRCHRNUL
|
|
||||||
- cmp (%rax), %CHAR_REG
|
|
||||||
- cmovne %rdx, %rax
|
|
||||||
+ /* Found CHAR or the null byte. */
|
|
||||||
+ cmp (%rdi, %rax), %CHAR_REG
|
|
||||||
+ jne L(zero)
|
|
||||||
# endif
|
|
||||||
+ addq %rdi, %rax
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
|
|
||||||
-L(prep_loop_4x):
|
|
||||||
- /* Align data to 4 * VEC_SIZE. */
|
|
||||||
- andq $-(VEC_SIZE * 4), %rdi
|
|
||||||
+ .p2align 4
|
|
||||||
+L(aligned_more):
|
|
||||||
+ /* Align data to VEC_SIZE - 1. This is the same number of
|
|
||||||
+ instructions as using andq -VEC_SIZE but saves 4 bytes of code
|
|
||||||
+ on x4 check. */
|
|
||||||
+ orq $(VEC_SIZE - 1), %rdi
|
|
||||||
+L(cross_page_continue):
|
|
||||||
+ /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time
|
|
||||||
+ since data is only aligned to VEC_SIZE. */
|
|
||||||
+ vmovdqa 1(%rdi), %ymm8
|
|
||||||
+ VPCMPEQ %ymm8, %ymm0, %ymm1
|
|
||||||
+ VPCMPEQ %ymm8, %ymm9, %ymm2
|
|
||||||
+ vpor %ymm1, %ymm2, %ymm1
|
|
||||||
+ vpmovmskb %ymm1, %eax
|
|
||||||
+ testl %eax, %eax
|
|
||||||
+ jnz L(first_vec_x1)
|
|
||||||
+
|
|
||||||
+ vmovdqa (VEC_SIZE + 1)(%rdi), %ymm8
|
|
||||||
+ VPCMPEQ %ymm8, %ymm0, %ymm1
|
|
||||||
+ VPCMPEQ %ymm8, %ymm9, %ymm2
|
|
||||||
+ vpor %ymm1, %ymm2, %ymm1
|
|
||||||
+ vpmovmskb %ymm1, %eax
|
|
||||||
+ testl %eax, %eax
|
|
||||||
+ jnz L(first_vec_x2)
|
|
||||||
+
|
|
||||||
+ vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm8
|
|
||||||
+ VPCMPEQ %ymm8, %ymm0, %ymm1
|
|
||||||
+ VPCMPEQ %ymm8, %ymm9, %ymm2
|
|
||||||
+ vpor %ymm1, %ymm2, %ymm1
|
|
||||||
+ vpmovmskb %ymm1, %eax
|
|
||||||
+ testl %eax, %eax
|
|
||||||
+ jnz L(first_vec_x3)
|
|
||||||
|
|
||||||
+ vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm8
|
|
||||||
+ VPCMPEQ %ymm8, %ymm0, %ymm1
|
|
||||||
+ VPCMPEQ %ymm8, %ymm9, %ymm2
|
|
||||||
+ vpor %ymm1, %ymm2, %ymm1
|
|
||||||
+ vpmovmskb %ymm1, %eax
|
|
||||||
+ testl %eax, %eax
|
|
||||||
+ jnz L(first_vec_x4)
|
|
||||||
+ /* Align data to VEC_SIZE * 4 - 1. */
|
|
||||||
+ addq $(VEC_SIZE * 4 + 1), %rdi
|
|
||||||
+ andq $-(VEC_SIZE * 4), %rdi
|
|
||||||
.p2align 4
|
|
||||||
L(loop_4x_vec):
|
|
||||||
/* Compare 4 * VEC at a time forward. */
|
|
||||||
- vmovdqa (VEC_SIZE * 4)(%rdi), %ymm5
|
|
||||||
- vmovdqa (VEC_SIZE * 5)(%rdi), %ymm6
|
|
||||||
- vmovdqa (VEC_SIZE * 6)(%rdi), %ymm7
|
|
||||||
- vmovdqa (VEC_SIZE * 7)(%rdi), %ymm8
|
|
||||||
+ vmovdqa (%rdi), %ymm5
|
|
||||||
+ vmovdqa (VEC_SIZE)(%rdi), %ymm6
|
|
||||||
+ vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
|
|
||||||
+ vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
|
|
||||||
|
|
||||||
/* Leaves only CHARS matching esi as 0. */
|
|
||||||
vpxor %ymm5, %ymm0, %ymm1
|
|
||||||
@@ -191,63 +202,102 @@ L(loop_4x_vec):
|
|
||||||
VPMINU %ymm1, %ymm2, %ymm5
|
|
||||||
VPMINU %ymm3, %ymm4, %ymm6
|
|
||||||
|
|
||||||
- VPMINU %ymm5, %ymm6, %ymm5
|
|
||||||
+ VPMINU %ymm5, %ymm6, %ymm6
|
|
||||||
|
|
||||||
- VPCMPEQ %ymm5, %ymm9, %ymm5
|
|
||||||
- vpmovmskb %ymm5, %eax
|
|
||||||
+ VPCMPEQ %ymm6, %ymm9, %ymm6
|
|
||||||
+ vpmovmskb %ymm6, %ecx
|
|
||||||
+ subq $-(VEC_SIZE * 4), %rdi
|
|
||||||
+ testl %ecx, %ecx
|
|
||||||
+ jz L(loop_4x_vec)
|
|
||||||
|
|
||||||
- addq $(VEC_SIZE * 4), %rdi
|
|
||||||
- testl %eax, %eax
|
|
||||||
- jz L(loop_4x_vec)
|
|
||||||
|
|
||||||
- VPCMPEQ %ymm1, %ymm9, %ymm1
|
|
||||||
+ VPCMPEQ %ymm1, %ymm9, %ymm1
|
|
||||||
vpmovmskb %ymm1, %eax
|
|
||||||
testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x0)
|
|
||||||
+ jnz L(last_vec_x0)
|
|
||||||
+
|
|
||||||
|
|
||||||
- VPCMPEQ %ymm2, %ymm9, %ymm2
|
|
||||||
+ VPCMPEQ %ymm5, %ymm9, %ymm2
|
|
||||||
vpmovmskb %ymm2, %eax
|
|
||||||
testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x1)
|
|
||||||
+ jnz L(last_vec_x1)
|
|
||||||
+
|
|
||||||
+ VPCMPEQ %ymm3, %ymm9, %ymm3
|
|
||||||
+ vpmovmskb %ymm3, %eax
|
|
||||||
+ /* rcx has combined result from all 4 VEC. It will only be used
|
|
||||||
+ if none of the first 3 VEC contained a match. */
|
|
||||||
+ salq $32, %rcx
|
|
||||||
+ orq %rcx, %rax
|
|
||||||
+ tzcntq %rax, %rax
|
|
||||||
+ subq $(VEC_SIZE * 2), %rdi
|
|
||||||
+# ifndef USE_AS_STRCHRNUL
|
|
||||||
+ /* Found CHAR or the null byte. */
|
|
||||||
+ cmp (%rdi, %rax), %CHAR_REG
|
|
||||||
+ jne L(zero_end)
|
|
||||||
+# endif
|
|
||||||
+ addq %rdi, %rax
|
|
||||||
+ VZEROUPPER_RETURN
|
|
||||||
+
|
|
||||||
+
|
|
||||||
+ .p2align 4
|
|
||||||
+L(last_vec_x0):
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+ addq $-(VEC_SIZE * 4), %rdi
|
|
||||||
+# ifndef USE_AS_STRCHRNUL
|
|
||||||
+ /* Found CHAR or the null byte. */
|
|
||||||
+ cmp (%rdi, %rax), %CHAR_REG
|
|
||||||
+ jne L(zero_end)
|
|
||||||
+# endif
|
|
||||||
+ addq %rdi, %rax
|
|
||||||
+ VZEROUPPER_RETURN
|
|
||||||
|
|
||||||
- VPCMPEQ %ymm3, %ymm9, %ymm3
|
|
||||||
- VPCMPEQ %ymm4, %ymm9, %ymm4
|
|
||||||
- vpmovmskb %ymm3, %ecx
|
|
||||||
- vpmovmskb %ymm4, %eax
|
|
||||||
- salq $32, %rax
|
|
||||||
- orq %rcx, %rax
|
|
||||||
- tzcntq %rax, %rax
|
|
||||||
- leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
|
|
||||||
# ifndef USE_AS_STRCHRNUL
|
|
||||||
- cmp (%rax), %CHAR_REG
|
|
||||||
- cmovne %rdx, %rax
|
|
||||||
+L(zero_end):
|
|
||||||
+ xorl %eax, %eax
|
|
||||||
+ VZEROUPPER_RETURN
|
|
||||||
# endif
|
|
||||||
- VZEROUPPER
|
|
||||||
- ret
|
|
||||||
+
|
|
||||||
+ .p2align 4
|
|
||||||
+L(last_vec_x1):
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+ subq $(VEC_SIZE * 3), %rdi
|
|
||||||
+# ifndef USE_AS_STRCHRNUL
|
|
||||||
+ /* Found CHAR or the null byte. */
|
|
||||||
+ cmp (%rdi, %rax), %CHAR_REG
|
|
||||||
+ jne L(zero_end)
|
|
||||||
+# endif
|
|
||||||
+ addq %rdi, %rax
|
|
||||||
+ VZEROUPPER_RETURN
|
|
||||||
+
|
|
||||||
|
|
||||||
/* Cold case for crossing page with first load. */
|
|
||||||
.p2align 4
|
|
||||||
L(cross_page_boundary):
|
|
||||||
- andq $-VEC_SIZE, %rdi
|
|
||||||
- andl $(VEC_SIZE - 1), %ecx
|
|
||||||
-
|
|
||||||
- vmovdqa (%rdi), %ymm8
|
|
||||||
- VPCMPEQ %ymm8, %ymm0, %ymm1
|
|
||||||
- VPCMPEQ %ymm8, %ymm9, %ymm2
|
|
||||||
+ movq %rdi, %rdx
|
|
||||||
+ /* Align rdi to VEC_SIZE - 1. */
|
|
||||||
+ orq $(VEC_SIZE - 1), %rdi
|
|
||||||
+ vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm8
|
|
||||||
+ VPCMPEQ %ymm8, %ymm0, %ymm1
|
|
||||||
+ VPCMPEQ %ymm8, %ymm9, %ymm2
|
|
||||||
vpor %ymm1, %ymm2, %ymm1
|
|
||||||
vpmovmskb %ymm1, %eax
|
|
||||||
- /* Remove the leading bits. */
|
|
||||||
- sarxl %ecx, %eax, %eax
|
|
||||||
+ /* Remove the leading bytes. sarxl only uses bits [4:0] of COUNT
|
|
||||||
+ so no need to manually mod edx. */
|
|
||||||
+ sarxl %edx, %eax, %eax
|
|
||||||
testl %eax, %eax
|
|
||||||
- jz L(aligned_more)
|
|
||||||
+ jz L(cross_page_continue)
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
- addq %rcx, %rdi
|
|
||||||
- addq %rdi, %rax
|
|
||||||
# ifndef USE_AS_STRCHRNUL
|
|
||||||
- cmp (%rax), %CHAR_REG
|
|
||||||
- cmovne %rdx, %rax
|
|
||||||
+ xorl %ecx, %ecx
|
|
||||||
+ /* Found CHAR or the null byte. */
|
|
||||||
+ cmp (%rdx, %rax), %CHAR_REG
|
|
||||||
+ leaq (%rdx, %rax), %rax
|
|
||||||
+ cmovne %rcx, %rax
|
|
||||||
+# else
|
|
||||||
+ addq %rdx, %rax
|
|
||||||
# endif
|
|
||||||
- VZEROUPPER_RETURN
|
|
||||||
+L(return_vzeroupper):
|
|
||||||
+ ZERO_UPPER_VEC_REGISTERS_RETURN
|
|
||||||
|
|
||||||
END (STRCHR)
|
|
||||||
# endif
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,532 +0,0 @@
|
|||||||
From 7f3e7c262cab4e2401e4331a6ef29c428de02044 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
Date: Fri, 23 Apr 2021 15:56:25 -0400
|
|
||||||
Subject: [PATCH] x86: Optimize strchr-evex.S
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
No bug. This commit optimizes strchr-evex.S. The optimizations are
|
|
||||||
mostly small things such as saving an ALU in the alignment process and
|
|
||||||
saving a few instructions in the loop return. The one significant
|
|
||||||
change is saving 2 instructions in the 4x loop. test-strchr,
|
|
||||||
test-strchrnul, test-wcschr, and test-wcschrnul are all passing.
|
|
||||||
|
|
||||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/multiarch/strchr-evex.S | 392 ++++++++++++++-----------
|
|
||||||
1 file changed, 218 insertions(+), 174 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S
|
|
||||||
index ddc86a70..7f9d4ee4 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/strchr-evex.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/strchr-evex.S
|
|
||||||
@@ -32,13 +32,15 @@
|
|
||||||
# define VPCMP vpcmpd
|
|
||||||
# define VPMINU vpminud
|
|
||||||
# define CHAR_REG esi
|
|
||||||
-# define SHIFT_REG r8d
|
|
||||||
+# define SHIFT_REG ecx
|
|
||||||
+# define CHAR_SIZE 4
|
|
||||||
# else
|
|
||||||
# define VPBROADCAST vpbroadcastb
|
|
||||||
# define VPCMP vpcmpb
|
|
||||||
# define VPMINU vpminub
|
|
||||||
# define CHAR_REG sil
|
|
||||||
-# define SHIFT_REG ecx
|
|
||||||
+# define SHIFT_REG edx
|
|
||||||
+# define CHAR_SIZE 1
|
|
||||||
# endif
|
|
||||||
|
|
||||||
# define XMMZERO xmm16
|
|
||||||
@@ -56,23 +58,20 @@
|
|
||||||
|
|
||||||
# define VEC_SIZE 32
|
|
||||||
# define PAGE_SIZE 4096
|
|
||||||
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
|
|
||||||
|
|
||||||
.section .text.evex,"ax",@progbits
|
|
||||||
ENTRY (STRCHR)
|
|
||||||
- movl %edi, %ecx
|
|
||||||
-# ifndef USE_AS_STRCHRNUL
|
|
||||||
- xorl %edx, %edx
|
|
||||||
-# endif
|
|
||||||
-
|
|
||||||
/* Broadcast CHAR to YMM0. */
|
|
||||||
- VPBROADCAST %esi, %YMM0
|
|
||||||
-
|
|
||||||
+ VPBROADCAST %esi, %YMM0
|
|
||||||
+ movl %edi, %eax
|
|
||||||
+ andl $(PAGE_SIZE - 1), %eax
|
|
||||||
vpxorq %XMMZERO, %XMMZERO, %XMMZERO
|
|
||||||
|
|
||||||
- /* Check if we cross page boundary with one vector load. */
|
|
||||||
- andl $(PAGE_SIZE - 1), %ecx
|
|
||||||
- cmpl $(PAGE_SIZE - VEC_SIZE), %ecx
|
|
||||||
- ja L(cross_page_boundary)
|
|
||||||
+ /* Check if we cross page boundary with one vector load.
|
|
||||||
+ Otherwise it is safe to use an unaligned load. */
|
|
||||||
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
|
|
||||||
+ ja L(cross_page_boundary)
|
|
||||||
|
|
||||||
/* Check the first VEC_SIZE bytes. Search for both CHAR and the
|
|
||||||
null bytes. */
|
|
||||||
@@ -83,251 +82,296 @@ ENTRY (STRCHR)
|
|
||||||
VPMINU %YMM2, %YMM1, %YMM2
|
|
||||||
/* Each bit in K0 represents a CHAR or a null byte in YMM1. */
|
|
||||||
VPCMP $0, %YMMZERO, %YMM2, %k0
|
|
||||||
- ktestd %k0, %k0
|
|
||||||
- jz L(more_vecs)
|
|
||||||
kmovd %k0, %eax
|
|
||||||
+ testl %eax, %eax
|
|
||||||
+ jz L(aligned_more)
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
- /* Found CHAR or the null byte. */
|
|
||||||
# ifdef USE_AS_WCSCHR
|
|
||||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
||||||
- leaq (%rdi, %rax, 4), %rax
|
|
||||||
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes.
|
|
||||||
+ */
|
|
||||||
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
# else
|
|
||||||
addq %rdi, %rax
|
|
||||||
# endif
|
|
||||||
# ifndef USE_AS_STRCHRNUL
|
|
||||||
- cmp (%rax), %CHAR_REG
|
|
||||||
- cmovne %rdx, %rax
|
|
||||||
+ /* Found CHAR or the null byte. */
|
|
||||||
+ cmp (%rax), %CHAR_REG
|
|
||||||
+ jne L(zero)
|
|
||||||
# endif
|
|
||||||
ret
|
|
||||||
|
|
||||||
- .p2align 4
|
|
||||||
-L(more_vecs):
|
|
||||||
- /* Align data for aligned loads in the loop. */
|
|
||||||
- andq $-VEC_SIZE, %rdi
|
|
||||||
-L(aligned_more):
|
|
||||||
-
|
|
||||||
- /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time
|
|
||||||
- since data is only aligned to VEC_SIZE. */
|
|
||||||
- VMOVA VEC_SIZE(%rdi), %YMM1
|
|
||||||
- addq $VEC_SIZE, %rdi
|
|
||||||
-
|
|
||||||
- /* Leaves only CHARS matching esi as 0. */
|
|
||||||
- vpxorq %YMM1, %YMM0, %YMM2
|
|
||||||
- VPMINU %YMM2, %YMM1, %YMM2
|
|
||||||
- /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
|
|
||||||
- VPCMP $0, %YMMZERO, %YMM2, %k0
|
|
||||||
- kmovd %k0, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x0)
|
|
||||||
-
|
|
||||||
- VMOVA VEC_SIZE(%rdi), %YMM1
|
|
||||||
- /* Leaves only CHARS matching esi as 0. */
|
|
||||||
- vpxorq %YMM1, %YMM0, %YMM2
|
|
||||||
- VPMINU %YMM2, %YMM1, %YMM2
|
|
||||||
- /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
|
|
||||||
- VPCMP $0, %YMMZERO, %YMM2, %k0
|
|
||||||
- kmovd %k0, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x1)
|
|
||||||
-
|
|
||||||
- VMOVA (VEC_SIZE * 2)(%rdi), %YMM1
|
|
||||||
- /* Leaves only CHARS matching esi as 0. */
|
|
||||||
- vpxorq %YMM1, %YMM0, %YMM2
|
|
||||||
- VPMINU %YMM2, %YMM1, %YMM2
|
|
||||||
- /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
|
|
||||||
- VPCMP $0, %YMMZERO, %YMM2, %k0
|
|
||||||
- kmovd %k0, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x2)
|
|
||||||
-
|
|
||||||
- VMOVA (VEC_SIZE * 3)(%rdi), %YMM1
|
|
||||||
- /* Leaves only CHARS matching esi as 0. */
|
|
||||||
- vpxorq %YMM1, %YMM0, %YMM2
|
|
||||||
- VPMINU %YMM2, %YMM1, %YMM2
|
|
||||||
- /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
|
|
||||||
- VPCMP $0, %YMMZERO, %YMM2, %k0
|
|
||||||
- ktestd %k0, %k0
|
|
||||||
- jz L(prep_loop_4x)
|
|
||||||
-
|
|
||||||
- kmovd %k0, %eax
|
|
||||||
+ /* .p2align 5 helps keep performance more consistent if ENTRY()
|
|
||||||
+ alignment % 32 was either 16 or 0. As well this makes the
|
|
||||||
+ alignment % 32 of the loop_4x_vec fixed which makes tuning it
|
|
||||||
+ easier. */
|
|
||||||
+ .p2align 5
|
|
||||||
+L(first_vec_x3):
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
+# ifndef USE_AS_STRCHRNUL
|
|
||||||
/* Found CHAR or the null byte. */
|
|
||||||
-# ifdef USE_AS_WCSCHR
|
|
||||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
||||||
- leaq (VEC_SIZE * 3)(%rdi, %rax, 4), %rax
|
|
||||||
-# else
|
|
||||||
- leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
|
|
||||||
+ cmp (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
|
|
||||||
+ jne L(zero)
|
|
||||||
# endif
|
|
||||||
+ /* NB: Multiply sizeof char type (1 or 4) to get the number of
|
|
||||||
+ bytes. */
|
|
||||||
+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
+ ret
|
|
||||||
+
|
|
||||||
# ifndef USE_AS_STRCHRNUL
|
|
||||||
- cmp (%rax), %CHAR_REG
|
|
||||||
- cmovne %rdx, %rax
|
|
||||||
-# endif
|
|
||||||
+L(zero):
|
|
||||||
+ xorl %eax, %eax
|
|
||||||
ret
|
|
||||||
+# endif
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
-L(first_vec_x0):
|
|
||||||
+L(first_vec_x4):
|
|
||||||
+# ifndef USE_AS_STRCHRNUL
|
|
||||||
+ /* Check to see if first match was CHAR (k0) or null (k1). */
|
|
||||||
+ kmovd %k0, %eax
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
- /* Found CHAR or the null byte. */
|
|
||||||
-# ifdef USE_AS_WCSCHR
|
|
||||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
||||||
- leaq (%rdi, %rax, 4), %rax
|
|
||||||
+ kmovd %k1, %ecx
|
|
||||||
+ /* bzhil will not be 0 if first match was null. */
|
|
||||||
+ bzhil %eax, %ecx, %ecx
|
|
||||||
+ jne L(zero)
|
|
||||||
# else
|
|
||||||
- addq %rdi, %rax
|
|
||||||
-# endif
|
|
||||||
-# ifndef USE_AS_STRCHRNUL
|
|
||||||
- cmp (%rax), %CHAR_REG
|
|
||||||
- cmovne %rdx, %rax
|
|
||||||
+ /* Combine CHAR and null matches. */
|
|
||||||
+ kord %k0, %k1, %k0
|
|
||||||
+ kmovd %k0, %eax
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
# endif
|
|
||||||
+ /* NB: Multiply sizeof char type (1 or 4) to get the number of
|
|
||||||
+ bytes. */
|
|
||||||
+ leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
ret
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
L(first_vec_x1):
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
- /* Found CHAR or the null byte. */
|
|
||||||
-# ifdef USE_AS_WCSCHR
|
|
||||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
||||||
- leaq VEC_SIZE(%rdi, %rax, 4), %rax
|
|
||||||
-# else
|
|
||||||
- leaq VEC_SIZE(%rdi, %rax), %rax
|
|
||||||
-# endif
|
|
||||||
# ifndef USE_AS_STRCHRNUL
|
|
||||||
- cmp (%rax), %CHAR_REG
|
|
||||||
- cmovne %rdx, %rax
|
|
||||||
+ /* Found CHAR or the null byte. */
|
|
||||||
+ cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
|
|
||||||
+ jne L(zero)
|
|
||||||
+
|
|
||||||
# endif
|
|
||||||
+ /* NB: Multiply sizeof char type (1 or 4) to get the number of
|
|
||||||
+ bytes. */
|
|
||||||
+ leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
ret
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
L(first_vec_x2):
|
|
||||||
+# ifndef USE_AS_STRCHRNUL
|
|
||||||
+ /* Check to see if first match was CHAR (k0) or null (k1). */
|
|
||||||
+ kmovd %k0, %eax
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
- /* Found CHAR or the null byte. */
|
|
||||||
-# ifdef USE_AS_WCSCHR
|
|
||||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
||||||
- leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax
|
|
||||||
+ kmovd %k1, %ecx
|
|
||||||
+ /* bzhil will not be 0 if first match was null. */
|
|
||||||
+ bzhil %eax, %ecx, %ecx
|
|
||||||
+ jne L(zero)
|
|
||||||
# else
|
|
||||||
- leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
|
|
||||||
-# endif
|
|
||||||
-# ifndef USE_AS_STRCHRNUL
|
|
||||||
- cmp (%rax), %CHAR_REG
|
|
||||||
- cmovne %rdx, %rax
|
|
||||||
+ /* Combine CHAR and null matches. */
|
|
||||||
+ kord %k0, %k1, %k0
|
|
||||||
+ kmovd %k0, %eax
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
# endif
|
|
||||||
+ /* NB: Multiply sizeof char type (1 or 4) to get the number of
|
|
||||||
+ bytes. */
|
|
||||||
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
ret
|
|
||||||
|
|
||||||
-L(prep_loop_4x):
|
|
||||||
- /* Align data to 4 * VEC_SIZE. */
|
|
||||||
+ .p2align 4
|
|
||||||
+L(aligned_more):
|
|
||||||
+ /* Align data to VEC_SIZE. */
|
|
||||||
+ andq $-VEC_SIZE, %rdi
|
|
||||||
+L(cross_page_continue):
|
|
||||||
+ /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time since
|
|
||||||
+ data is only aligned to VEC_SIZE. Use two alternating methods
|
|
||||||
+ for checking VEC to balance latency and port contention. */
|
|
||||||
+
|
|
||||||
+ /* This method has higher latency but has better port
|
|
||||||
+ distribution. */
|
|
||||||
+ VMOVA (VEC_SIZE)(%rdi), %YMM1
|
|
||||||
+ /* Leaves only CHARS matching esi as 0. */
|
|
||||||
+ vpxorq %YMM1, %YMM0, %YMM2
|
|
||||||
+ VPMINU %YMM2, %YMM1, %YMM2
|
|
||||||
+ /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
|
|
||||||
+ VPCMP $0, %YMMZERO, %YMM2, %k0
|
|
||||||
+ kmovd %k0, %eax
|
|
||||||
+ testl %eax, %eax
|
|
||||||
+ jnz L(first_vec_x1)
|
|
||||||
+
|
|
||||||
+ /* This method has lower latency but has worse port
|
|
||||||
+ distribution. */
|
|
||||||
+ VMOVA (VEC_SIZE * 2)(%rdi), %YMM1
|
|
||||||
+ /* Each bit in K0 represents a CHAR in YMM1. */
|
|
||||||
+ VPCMP $0, %YMM1, %YMM0, %k0
|
|
||||||
+ /* Each bit in K1 represents a null byte in YMM1. */
|
|
||||||
+ VPCMP $0, %YMM1, %YMMZERO, %k1
|
|
||||||
+ kortestd %k0, %k1
|
|
||||||
+ jnz L(first_vec_x2)
|
|
||||||
+
|
|
||||||
+ VMOVA (VEC_SIZE * 3)(%rdi), %YMM1
|
|
||||||
+ /* Leaves only CHARS matching esi as 0. */
|
|
||||||
+ vpxorq %YMM1, %YMM0, %YMM2
|
|
||||||
+ VPMINU %YMM2, %YMM1, %YMM2
|
|
||||||
+ /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
|
|
||||||
+ VPCMP $0, %YMMZERO, %YMM2, %k0
|
|
||||||
+ kmovd %k0, %eax
|
|
||||||
+ testl %eax, %eax
|
|
||||||
+ jnz L(first_vec_x3)
|
|
||||||
+
|
|
||||||
+ VMOVA (VEC_SIZE * 4)(%rdi), %YMM1
|
|
||||||
+ /* Each bit in K0 represents a CHAR in YMM1. */
|
|
||||||
+ VPCMP $0, %YMM1, %YMM0, %k0
|
|
||||||
+ /* Each bit in K1 represents a null byte in YMM1. */
|
|
||||||
+ VPCMP $0, %YMM1, %YMMZERO, %k1
|
|
||||||
+ kortestd %k0, %k1
|
|
||||||
+ jnz L(first_vec_x4)
|
|
||||||
+
|
|
||||||
+ /* Align data to VEC_SIZE * 4 for the loop. */
|
|
||||||
+ addq $VEC_SIZE, %rdi
|
|
||||||
andq $-(VEC_SIZE * 4), %rdi
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
L(loop_4x_vec):
|
|
||||||
- /* Compare 4 * VEC at a time forward. */
|
|
||||||
+ /* Check 4x VEC at a time. No penalty to imm32 offset with evex
|
|
||||||
+ encoding. */
|
|
||||||
VMOVA (VEC_SIZE * 4)(%rdi), %YMM1
|
|
||||||
VMOVA (VEC_SIZE * 5)(%rdi), %YMM2
|
|
||||||
VMOVA (VEC_SIZE * 6)(%rdi), %YMM3
|
|
||||||
VMOVA (VEC_SIZE * 7)(%rdi), %YMM4
|
|
||||||
|
|
||||||
- /* Leaves only CHARS matching esi as 0. */
|
|
||||||
+ /* For YMM1 and YMM3 use xor to set the CHARs matching esi to
|
|
||||||
+ zero. */
|
|
||||||
vpxorq %YMM1, %YMM0, %YMM5
|
|
||||||
- vpxorq %YMM2, %YMM0, %YMM6
|
|
||||||
+ /* For YMM2 and YMM4 cmp not equals to CHAR and store result in
|
|
||||||
+ k register. It's possible to save either 1 or 2 instructions
|
|
||||||
+ using the cmp not-equals method for either YMM1 or YMM1 and YMM3
|
|
||||||
+ respectively but bottleneck on p5 makes it not worth it. */
|
|
||||||
+ VPCMP $4, %YMM0, %YMM2, %k2
|
|
||||||
vpxorq %YMM3, %YMM0, %YMM7
|
|
||||||
- vpxorq %YMM4, %YMM0, %YMM8
|
|
||||||
-
|
|
||||||
- VPMINU %YMM5, %YMM1, %YMM5
|
|
||||||
- VPMINU %YMM6, %YMM2, %YMM6
|
|
||||||
- VPMINU %YMM7, %YMM3, %YMM7
|
|
||||||
- VPMINU %YMM8, %YMM4, %YMM8
|
|
||||||
-
|
|
||||||
- VPMINU %YMM5, %YMM6, %YMM1
|
|
||||||
- VPMINU %YMM7, %YMM8, %YMM2
|
|
||||||
-
|
|
||||||
- VPMINU %YMM1, %YMM2, %YMM1
|
|
||||||
-
|
|
||||||
- /* Each bit in K0 represents a CHAR or a null byte. */
|
|
||||||
- VPCMP $0, %YMMZERO, %YMM1, %k0
|
|
||||||
-
|
|
||||||
- addq $(VEC_SIZE * 4), %rdi
|
|
||||||
-
|
|
||||||
- ktestd %k0, %k0
|
|
||||||
+ VPCMP $4, %YMM0, %YMM4, %k4
|
|
||||||
+
|
|
||||||
+ /* Use min to select all zeros (from either xor or end of string).
|
|
||||||
+ */
|
|
||||||
+ VPMINU %YMM1, %YMM5, %YMM1
|
|
||||||
+ VPMINU %YMM3, %YMM7, %YMM3
|
|
||||||
+
|
|
||||||
+ /* Use min + zeromask to select for zeros. Since k2 and k4 will
|
|
||||||
+ have 0 at positions that matched with CHAR, which will set
|
|
||||||
+ zero in the corresponding destination bytes in YMM2 / YMM4.
|
|
||||||
+ */
|
|
||||||
+ VPMINU %YMM1, %YMM2, %YMM2{%k2}{z}
|
|
||||||
+ VPMINU %YMM3, %YMM4, %YMM4
|
|
||||||
+ VPMINU %YMM2, %YMM4, %YMM4{%k4}{z}
|
|
||||||
+
|
|
||||||
+ VPCMP $0, %YMMZERO, %YMM4, %k1
|
|
||||||
+ kmovd %k1, %ecx
|
|
||||||
+ subq $-(VEC_SIZE * 4), %rdi
|
|
||||||
+ testl %ecx, %ecx
|
|
||||||
jz L(loop_4x_vec)
|
|
||||||
|
|
||||||
- /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
|
|
||||||
- VPCMP $0, %YMMZERO, %YMM5, %k0
|
|
||||||
+ VPCMP $0, %YMMZERO, %YMM1, %k0
|
|
||||||
kmovd %k0, %eax
|
|
||||||
testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x0)
|
|
||||||
+ jnz L(last_vec_x1)
|
|
||||||
|
|
||||||
- /* Each bit in K1 represents a CHAR or a null byte in YMM2. */
|
|
||||||
- VPCMP $0, %YMMZERO, %YMM6, %k1
|
|
||||||
- kmovd %k1, %eax
|
|
||||||
+ VPCMP $0, %YMMZERO, %YMM2, %k0
|
|
||||||
+ kmovd %k0, %eax
|
|
||||||
testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x1)
|
|
||||||
-
|
|
||||||
- /* Each bit in K2 represents a CHAR or a null byte in YMM3. */
|
|
||||||
- VPCMP $0, %YMMZERO, %YMM7, %k2
|
|
||||||
- /* Each bit in K3 represents a CHAR or a null byte in YMM4. */
|
|
||||||
- VPCMP $0, %YMMZERO, %YMM8, %k3
|
|
||||||
+ jnz L(last_vec_x2)
|
|
||||||
|
|
||||||
+ VPCMP $0, %YMMZERO, %YMM3, %k0
|
|
||||||
+ kmovd %k0, %eax
|
|
||||||
+ /* Combine YMM3 matches (eax) with YMM4 matches (ecx). */
|
|
||||||
# ifdef USE_AS_WCSCHR
|
|
||||||
- /* NB: Each bit in K2/K3 represents 4-byte element. */
|
|
||||||
- kshiftlw $8, %k3, %k1
|
|
||||||
+ sall $8, %ecx
|
|
||||||
+ orl %ecx, %eax
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
# else
|
|
||||||
- kshiftlq $32, %k3, %k1
|
|
||||||
+ salq $32, %rcx
|
|
||||||
+ orq %rcx, %rax
|
|
||||||
+ tzcntq %rax, %rax
|
|
||||||
# endif
|
|
||||||
+# ifndef USE_AS_STRCHRNUL
|
|
||||||
+ /* Check if match was CHAR or null. */
|
|
||||||
+ cmp (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
|
|
||||||
+ jne L(zero_end)
|
|
||||||
+# endif
|
|
||||||
+ /* NB: Multiply sizeof char type (1 or 4) to get the number of
|
|
||||||
+ bytes. */
|
|
||||||
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
+ ret
|
|
||||||
|
|
||||||
- /* Each bit in K1 represents a NULL or a mismatch. */
|
|
||||||
- korq %k1, %k2, %k1
|
|
||||||
- kmovq %k1, %rax
|
|
||||||
+# ifndef USE_AS_STRCHRNUL
|
|
||||||
+L(zero_end):
|
|
||||||
+ xorl %eax, %eax
|
|
||||||
+ ret
|
|
||||||
+# endif
|
|
||||||
|
|
||||||
- tzcntq %rax, %rax
|
|
||||||
-# ifdef USE_AS_WCSCHR
|
|
||||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
||||||
- leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax
|
|
||||||
-# else
|
|
||||||
- leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
|
|
||||||
+ .p2align 4
|
|
||||||
+L(last_vec_x1):
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+# ifndef USE_AS_STRCHRNUL
|
|
||||||
+ /* Check if match was null. */
|
|
||||||
+ cmp (%rdi, %rax, CHAR_SIZE), %CHAR_REG
|
|
||||||
+ jne L(zero_end)
|
|
||||||
# endif
|
|
||||||
+ /* NB: Multiply sizeof char type (1 or 4) to get the number of
|
|
||||||
+ bytes. */
|
|
||||||
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
+ ret
|
|
||||||
+
|
|
||||||
+ .p2align 4
|
|
||||||
+L(last_vec_x2):
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
# ifndef USE_AS_STRCHRNUL
|
|
||||||
- cmp (%rax), %CHAR_REG
|
|
||||||
- cmovne %rdx, %rax
|
|
||||||
+ /* Check if match was null. */
|
|
||||||
+ cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
|
|
||||||
+ jne L(zero_end)
|
|
||||||
# endif
|
|
||||||
+ /* NB: Multiply sizeof char type (1 or 4) to get the number of
|
|
||||||
+ bytes. */
|
|
||||||
+ leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
ret
|
|
||||||
|
|
||||||
/* Cold case for crossing page with first load. */
|
|
||||||
.p2align 4
|
|
||||||
L(cross_page_boundary):
|
|
||||||
+ movq %rdi, %rdx
|
|
||||||
+ /* Align rdi. */
|
|
||||||
andq $-VEC_SIZE, %rdi
|
|
||||||
- andl $(VEC_SIZE - 1), %ecx
|
|
||||||
-
|
|
||||||
VMOVA (%rdi), %YMM1
|
|
||||||
-
|
|
||||||
/* Leaves only CHARS matching esi as 0. */
|
|
||||||
vpxorq %YMM1, %YMM0, %YMM2
|
|
||||||
VPMINU %YMM2, %YMM1, %YMM2
|
|
||||||
/* Each bit in K0 represents a CHAR or a null byte in YMM1. */
|
|
||||||
VPCMP $0, %YMMZERO, %YMM2, %k0
|
|
||||||
kmovd %k0, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
-
|
|
||||||
+ /* Remove the leading bits. */
|
|
||||||
# ifdef USE_AS_WCSCHR
|
|
||||||
+ movl %edx, %SHIFT_REG
|
|
||||||
/* NB: Divide shift count by 4 since each bit in K1 represent 4
|
|
||||||
bytes. */
|
|
||||||
- movl %ecx, %SHIFT_REG
|
|
||||||
- sarl $2, %SHIFT_REG
|
|
||||||
+ sarl $2, %SHIFT_REG
|
|
||||||
+ andl $(CHAR_PER_VEC - 1), %SHIFT_REG
|
|
||||||
# endif
|
|
||||||
-
|
|
||||||
- /* Remove the leading bits. */
|
|
||||||
sarxl %SHIFT_REG, %eax, %eax
|
|
||||||
+ /* If eax is zero continue. */
|
|
||||||
testl %eax, %eax
|
|
||||||
-
|
|
||||||
- jz L(aligned_more)
|
|
||||||
+ jz L(cross_page_continue)
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
- addq %rcx, %rdi
|
|
||||||
+# ifndef USE_AS_STRCHRNUL
|
|
||||||
+ /* Check to see if match was CHAR or null. */
|
|
||||||
+ cmp (%rdx, %rax, CHAR_SIZE), %CHAR_REG
|
|
||||||
+ jne L(zero_end)
|
|
||||||
+# endif
|
|
||||||
# ifdef USE_AS_WCSCHR
|
|
||||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
||||||
- leaq (%rdi, %rax, 4), %rax
|
|
||||||
+ /* NB: Multiply wchar_t count by 4 to get the number of
|
|
||||||
+ bytes. */
|
|
||||||
+ leaq (%rdx, %rax, CHAR_SIZE), %rax
|
|
||||||
# else
|
|
||||||
- addq %rdi, %rax
|
|
||||||
-# endif
|
|
||||||
-# ifndef USE_AS_STRCHRNUL
|
|
||||||
- cmp (%rax), %CHAR_REG
|
|
||||||
- cmovne %rdx, %rax
|
|
||||||
+ addq %rdx, %rax
|
|
||||||
# endif
|
|
||||||
ret
|
|
||||||
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,536 +0,0 @@
|
|||||||
From 104c7b1967c3e78435c6f7eab5e225a7eddf9c6e Mon Sep 17 00:00:00 2001
|
|
||||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
Date: Tue, 4 May 2021 19:02:40 -0400
|
|
||||||
Subject: [PATCH] x86: Add EVEX optimized memchr family not safe for RTM
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
No bug.
|
|
||||||
|
|
||||||
This commit adds a new implementation for EVEX memchr that is not safe
|
|
||||||
for RTM because it uses vzeroupper. The benefit is that by using
|
|
||||||
ymm0-ymm15 it can use vpcmpeq and vpternlogd in the 4x loop which is
|
|
||||||
faster than the RTM safe version which cannot use vpcmpeq because
|
|
||||||
there is no EVEX encoding for the instruction. All parts of the
|
|
||||||
implementation aside from the 4x loop are the same for the two
|
|
||||||
versions and the optimization is only relevant for large sizes.
|
|
||||||
|
|
||||||
Tigerlake:
|
|
||||||
size , algn , Pos , Cur T , New T , Win , Dif
|
|
||||||
512 , 6 , 192 , 9.2 , 9.04 , no-RTM , 0.16
|
|
||||||
512 , 7 , 224 , 9.19 , 8.98 , no-RTM , 0.21
|
|
||||||
2048 , 0 , 256 , 10.74 , 10.54 , no-RTM , 0.2
|
|
||||||
2048 , 0 , 512 , 14.81 , 14.87 , RTM , 0.06
|
|
||||||
2048 , 0 , 1024 , 22.97 , 22.57 , no-RTM , 0.4
|
|
||||||
2048 , 0 , 2048 , 37.49 , 34.51 , no-RTM , 2.98 <--
|
|
||||||
|
|
||||||
Icelake:
|
|
||||||
size , algn , Pos , Cur T , New T , Win , Dif
|
|
||||||
512 , 6 , 192 , 7.6 , 7.3 , no-RTM , 0.3
|
|
||||||
512 , 7 , 224 , 7.63 , 7.27 , no-RTM , 0.36
|
|
||||||
2048 , 0 , 256 , 8.48 , 8.38 , no-RTM , 0.1
|
|
||||||
2048 , 0 , 512 , 11.57 , 11.42 , no-RTM , 0.15
|
|
||||||
2048 , 0 , 1024 , 17.92 , 17.38 , no-RTM , 0.54
|
|
||||||
2048 , 0 , 2048 , 30.37 , 27.34 , no-RTM , 3.03 <--
|
|
||||||
|
|
||||||
test-memchr, test-wmemchr, and test-rawmemchr are all passing.
|
|
||||||
|
|
||||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/multiarch/Makefile | 7 +-
|
|
||||||
sysdeps/x86_64/multiarch/ifunc-evex.h | 55 ++++++
|
|
||||||
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 ++
|
|
||||||
sysdeps/x86_64/multiarch/memchr-evex-rtm.S | 8 +
|
|
||||||
sysdeps/x86_64/multiarch/memchr-evex.S | 161 ++++++++++++++----
|
|
||||||
sysdeps/x86_64/multiarch/memchr.c | 2 +-
|
|
||||||
sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S | 3 +
|
|
||||||
sysdeps/x86_64/multiarch/rawmemchr.c | 2 +-
|
|
||||||
sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S | 3 +
|
|
||||||
sysdeps/x86_64/multiarch/wmemchr.c | 2 +-
|
|
||||||
10 files changed, 217 insertions(+), 41 deletions(-)
|
|
||||||
create mode 100644 sysdeps/x86_64/multiarch/ifunc-evex.h
|
|
||||||
create mode 100644 sysdeps/x86_64/multiarch/memchr-evex-rtm.S
|
|
||||||
create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
|
|
||||||
create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
|
|
||||||
index 65fde4eb..26be4095 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/Makefile
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/Makefile
|
|
||||||
@@ -77,7 +77,9 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
|
|
||||||
strncmp-evex \
|
|
||||||
strncpy-evex \
|
|
||||||
strnlen-evex \
|
|
||||||
- strrchr-evex
|
|
||||||
+ strrchr-evex \
|
|
||||||
+ memchr-evex-rtm \
|
|
||||||
+ rawmemchr-evex-rtm
|
|
||||||
CFLAGS-varshift.c += -msse4
|
|
||||||
CFLAGS-strcspn-c.c += -msse4
|
|
||||||
CFLAGS-strpbrk-c.c += -msse4
|
|
||||||
@@ -110,7 +112,8 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
|
|
||||||
wcsnlen-evex \
|
|
||||||
wcsrchr-evex \
|
|
||||||
wmemchr-evex \
|
|
||||||
- wmemcmp-evex-movbe
|
|
||||||
+ wmemcmp-evex-movbe \
|
|
||||||
+ wmemchr-evex-rtm
|
|
||||||
endif
|
|
||||||
|
|
||||||
ifeq ($(subdir),debug)
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-evex.h b/sysdeps/x86_64/multiarch/ifunc-evex.h
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..fc391edb
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/ifunc-evex.h
|
|
||||||
@@ -0,0 +1,55 @@
|
|
||||||
+/* Common definition for ifunc selection optimized with EVEX.
|
|
||||||
+ All versions must be listed in ifunc-impl-list.c.
|
|
||||||
+ Copyright (C) 2017-2021 Free Software Foundation, Inc.
|
|
||||||
+ This file is part of the GNU C Library.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is free software; you can redistribute it and/or
|
|
||||||
+ modify it under the terms of the GNU Lesser General Public
|
|
||||||
+ License as published by the Free Software Foundation; either
|
|
||||||
+ version 2.1 of the License, or (at your option) any later version.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
||||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
+ Lesser General Public License for more details.
|
|
||||||
+
|
|
||||||
+ You should have received a copy of the GNU Lesser General Public
|
|
||||||
+ License along with the GNU C Library; if not, see
|
|
||||||
+ <https://www.gnu.org/licenses/>. */
|
|
||||||
+
|
|
||||||
+#include <init-arch.h>
|
|
||||||
+
|
|
||||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
|
|
||||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
|
|
||||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
|
|
||||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
|
|
||||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_rtm) attribute_hidden;
|
|
||||||
+
|
|
||||||
+
|
|
||||||
+static inline void *
|
|
||||||
+IFUNC_SELECTOR (void)
|
|
||||||
+{
|
|
||||||
+ const struct cpu_features* cpu_features = __get_cpu_features ();
|
|
||||||
+
|
|
||||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
|
|
||||||
+ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
|
|
||||||
+ && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
|
|
||||||
+ {
|
|
||||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
|
|
||||||
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
|
|
||||||
+ {
|
|
||||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
|
|
||||||
+ return OPTIMIZE (evex_rtm);
|
|
||||||
+
|
|
||||||
+ return OPTIMIZE (evex);
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
|
|
||||||
+ return OPTIMIZE (avx2_rtm);
|
|
||||||
+
|
|
||||||
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
|
|
||||||
+ return OPTIMIZE (avx2);
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ return OPTIMIZE (sse2);
|
|
||||||
+}
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
||||||
index d59d65f8..ac097e8d 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
||||||
@@ -52,6 +52,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
||||||
&& CPU_FEATURE_USABLE (AVX512BW)
|
|
||||||
&& CPU_FEATURE_USABLE (BMI2)),
|
|
||||||
__memchr_evex)
|
|
||||||
+ IFUNC_IMPL_ADD (array, i, memchr,
|
|
||||||
+ (CPU_FEATURE_USABLE (AVX512VL)
|
|
||||||
+ && CPU_FEATURE_USABLE (AVX512BW)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
|
||||||
+ __memchr_evex_rtm)
|
|
||||||
IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_sse2))
|
|
||||||
|
|
||||||
/* Support sysdeps/x86_64/multiarch/memcmp.c. */
|
|
||||||
@@ -288,6 +293,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
||||||
&& CPU_FEATURE_USABLE (AVX512BW)
|
|
||||||
&& CPU_FEATURE_USABLE (BMI2)),
|
|
||||||
__rawmemchr_evex)
|
|
||||||
+ IFUNC_IMPL_ADD (array, i, rawmemchr,
|
|
||||||
+ (CPU_FEATURE_USABLE (AVX512VL)
|
|
||||||
+ && CPU_FEATURE_USABLE (AVX512BW)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
|
||||||
+ __rawmemchr_evex_rtm)
|
|
||||||
IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_sse2))
|
|
||||||
|
|
||||||
/* Support sysdeps/x86_64/multiarch/strlen.c. */
|
|
||||||
@@ -711,6 +721,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
||||||
&& CPU_FEATURE_USABLE (AVX512BW)
|
|
||||||
&& CPU_FEATURE_USABLE (BMI2)),
|
|
||||||
__wmemchr_evex)
|
|
||||||
+ IFUNC_IMPL_ADD (array, i, wmemchr,
|
|
||||||
+ (CPU_FEATURE_USABLE (AVX512VL)
|
|
||||||
+ && CPU_FEATURE_USABLE (AVX512BW)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
|
||||||
+ __wmemchr_evex_rtm)
|
|
||||||
IFUNC_IMPL_ADD (array, i, wmemchr, 1, __wmemchr_sse2))
|
|
||||||
|
|
||||||
/* Support sysdeps/x86_64/multiarch/wmemcmp.c. */
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/memchr-evex-rtm.S b/sysdeps/x86_64/multiarch/memchr-evex-rtm.S
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..19871882
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/memchr-evex-rtm.S
|
|
||||||
@@ -0,0 +1,8 @@
|
|
||||||
+#ifndef MEMCHR
|
|
||||||
+# define MEMCHR __memchr_evex_rtm
|
|
||||||
+#endif
|
|
||||||
+
|
|
||||||
+#define USE_IN_RTM 1
|
|
||||||
+#define SECTION(p) p##.evex.rtm
|
|
||||||
+
|
|
||||||
+#include "memchr-evex.S"
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
|
|
||||||
index f3fdad4f..4d0ed6d1 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/memchr-evex.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
|
|
||||||
@@ -38,10 +38,32 @@
|
|
||||||
# define CHAR_SIZE 1
|
|
||||||
# endif
|
|
||||||
|
|
||||||
+ /* In the 4x loop the RTM and non-RTM versions have data pointer
|
|
||||||
+ off by VEC_SIZE * 4 with RTM version being VEC_SIZE * 4 greater.
|
|
||||||
+ This is represented by BASE_OFFSET. As well because the RTM
|
|
||||||
+ version uses vpcmp which stores a bit per element compared where
|
|
||||||
+ the non-RTM version uses vpcmpeq which stores a bit per byte
|
|
||||||
+ compared RET_SCALE of CHAR_SIZE is only relevant for the RTM
|
|
||||||
+ version. */
|
|
||||||
+# ifdef USE_IN_RTM
|
|
||||||
+# define VZEROUPPER
|
|
||||||
+# define BASE_OFFSET (VEC_SIZE * 4)
|
|
||||||
+# define RET_SCALE CHAR_SIZE
|
|
||||||
+# else
|
|
||||||
+# define VZEROUPPER vzeroupper
|
|
||||||
+# define BASE_OFFSET 0
|
|
||||||
+# define RET_SCALE 1
|
|
||||||
+# endif
|
|
||||||
+
|
|
||||||
+ /* In the return from 4x loop memchr and rawmemchr versions have
|
|
||||||
+ data pointers off by VEC_SIZE * 4 with memchr version being
|
|
||||||
+ VEC_SIZE * 4 greater. */
|
|
||||||
# ifdef USE_AS_RAWMEMCHR
|
|
||||||
+# define RET_OFFSET (BASE_OFFSET - (VEC_SIZE * 4))
|
|
||||||
# define RAW_PTR_REG rcx
|
|
||||||
# define ALGN_PTR_REG rdi
|
|
||||||
# else
|
|
||||||
+# define RET_OFFSET BASE_OFFSET
|
|
||||||
# define RAW_PTR_REG rdi
|
|
||||||
# define ALGN_PTR_REG rcx
|
|
||||||
# endif
|
|
||||||
@@ -57,11 +79,15 @@
|
|
||||||
# define YMM5 ymm21
|
|
||||||
# define YMM6 ymm22
|
|
||||||
|
|
||||||
+# ifndef SECTION
|
|
||||||
+# define SECTION(p) p##.evex
|
|
||||||
+# endif
|
|
||||||
+
|
|
||||||
# define VEC_SIZE 32
|
|
||||||
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
|
|
||||||
# define PAGE_SIZE 4096
|
|
||||||
|
|
||||||
- .section .text.evex,"ax",@progbits
|
|
||||||
+ .section SECTION(.text),"ax",@progbits
|
|
||||||
ENTRY (MEMCHR)
|
|
||||||
# ifndef USE_AS_RAWMEMCHR
|
|
||||||
/* Check for zero length. */
|
|
||||||
@@ -237,14 +263,15 @@ L(cross_page_continue):
|
|
||||||
/* Check if at last CHAR_PER_VEC * 4 length. */
|
|
||||||
subq $(CHAR_PER_VEC * 4), %rdx
|
|
||||||
jbe L(last_4x_vec_or_less_cmpeq)
|
|
||||||
- addq $VEC_SIZE, %rdi
|
|
||||||
+ /* +VEC_SIZE if USE_IN_RTM otherwise +VEC_SIZE * 5. */
|
|
||||||
+ addq $(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi
|
|
||||||
|
|
||||||
/* Align data to VEC_SIZE * 4 for the loop and readjust length.
|
|
||||||
*/
|
|
||||||
# ifdef USE_AS_WMEMCHR
|
|
||||||
movl %edi, %ecx
|
|
||||||
andq $-(4 * VEC_SIZE), %rdi
|
|
||||||
- andl $(VEC_SIZE * 4 - 1), %ecx
|
|
||||||
+ subl %edi, %ecx
|
|
||||||
/* NB: Divide bytes by 4 to get the wchar_t count. */
|
|
||||||
sarl $2, %ecx
|
|
||||||
addq %rcx, %rdx
|
|
||||||
@@ -254,15 +281,28 @@ L(cross_page_continue):
|
|
||||||
subq %rdi, %rdx
|
|
||||||
# endif
|
|
||||||
# else
|
|
||||||
- addq $VEC_SIZE, %rdi
|
|
||||||
+ addq $(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi
|
|
||||||
andq $-(4 * VEC_SIZE), %rdi
|
|
||||||
# endif
|
|
||||||
-
|
|
||||||
+# ifdef USE_IN_RTM
|
|
||||||
vpxorq %XMMZERO, %XMMZERO, %XMMZERO
|
|
||||||
+# else
|
|
||||||
+ /* copy ymmmatch to ymm0 so we can use vpcmpeq which is not
|
|
||||||
+ encodable with EVEX registers (ymm16-ymm31). */
|
|
||||||
+ vmovdqa64 %YMMMATCH, %ymm0
|
|
||||||
+# endif
|
|
||||||
|
|
||||||
/* Compare 4 * VEC at a time forward. */
|
|
||||||
.p2align 4
|
|
||||||
L(loop_4x_vec):
|
|
||||||
+ /* Two versions of the loop. One that does not require
|
|
||||||
+ vzeroupper by not using ymm0-ymm15 and another does that require
|
|
||||||
+ vzeroupper because it uses ymm0-ymm15. The reason why ymm0-ymm15
|
|
||||||
+ is used at all is because there is no EVEX encoding vpcmpeq and
|
|
||||||
+ with vpcmpeq this loop can be performed more efficiently. The
|
|
||||||
+ non-vzeroupper version is safe for RTM while the vzeroupper
|
|
||||||
+ version should be prefered if RTM are not supported. */
|
|
||||||
+# ifdef USE_IN_RTM
|
|
||||||
/* It would be possible to save some instructions using 4x VPCMP
|
|
||||||
but bottleneck on port 5 makes it not woth it. */
|
|
||||||
VPCMP $4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
|
|
||||||
@@ -273,12 +313,55 @@ L(loop_4x_vec):
|
|
||||||
/* Reduce VEC2 / VEC3 with min and VEC1 with zero mask. */
|
|
||||||
VPMINU %YMM2, %YMM3, %YMM3{%k1}{z}
|
|
||||||
VPCMP $0, %YMM3, %YMMZERO, %k2
|
|
||||||
+# else
|
|
||||||
+ /* Since vptern can only take 3x vectors fastest to do 1 vec
|
|
||||||
+ seperately with EVEX vpcmp. */
|
|
||||||
+# ifdef USE_AS_WMEMCHR
|
|
||||||
+ /* vptern can only accept masks for epi32/epi64 so can only save
|
|
||||||
+ instruction using not equals mask on vptern with wmemchr. */
|
|
||||||
+ VPCMP $4, (%rdi), %YMMMATCH, %k1
|
|
||||||
+# else
|
|
||||||
+ VPCMP $0, (%rdi), %YMMMATCH, %k1
|
|
||||||
+# endif
|
|
||||||
+ /* Compare 3x with vpcmpeq and or them all together with vptern.
|
|
||||||
+ */
|
|
||||||
+ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
|
|
||||||
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
|
|
||||||
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
|
|
||||||
+# ifdef USE_AS_WMEMCHR
|
|
||||||
+ /* This takes the not of or between ymm2, ymm3, ymm4 as well as
|
|
||||||
+ combines result from VEC0 with zero mask. */
|
|
||||||
+ vpternlogd $1, %ymm2, %ymm3, %ymm4{%k1}{z}
|
|
||||||
+ vpmovmskb %ymm4, %ecx
|
|
||||||
+# else
|
|
||||||
+ /* 254 is mask for oring ymm2, ymm3, ymm4 into ymm4. */
|
|
||||||
+ vpternlogd $254, %ymm2, %ymm3, %ymm4
|
|
||||||
+ vpmovmskb %ymm4, %ecx
|
|
||||||
+ kmovd %k1, %eax
|
|
||||||
+# endif
|
|
||||||
+# endif
|
|
||||||
+
|
|
||||||
# ifdef USE_AS_RAWMEMCHR
|
|
||||||
subq $-(VEC_SIZE * 4), %rdi
|
|
||||||
+# endif
|
|
||||||
+# ifdef USE_IN_RTM
|
|
||||||
kortestd %k2, %k3
|
|
||||||
+# else
|
|
||||||
+# ifdef USE_AS_WMEMCHR
|
|
||||||
+ /* ecx contains not of matches. All 1s means no matches. incl will
|
|
||||||
+ overflow and set zeroflag if that is the case. */
|
|
||||||
+ incl %ecx
|
|
||||||
+# else
|
|
||||||
+ /* If either VEC1 (eax) or VEC2-VEC4 (ecx) are not zero. Adding
|
|
||||||
+ to ecx is not an issue because if eax is non-zero it will be
|
|
||||||
+ used for returning the match. If it is zero the add does
|
|
||||||
+ nothing. */
|
|
||||||
+ addq %rax, %rcx
|
|
||||||
+# endif
|
|
||||||
+# endif
|
|
||||||
+# ifdef USE_AS_RAWMEMCHR
|
|
||||||
jz L(loop_4x_vec)
|
|
||||||
# else
|
|
||||||
- kortestd %k2, %k3
|
|
||||||
jnz L(loop_4x_vec_end)
|
|
||||||
|
|
||||||
subq $-(VEC_SIZE * 4), %rdi
|
|
||||||
@@ -288,10 +371,11 @@ L(loop_4x_vec):
|
|
||||||
|
|
||||||
/* Fall through into less than 4 remaining vectors of length case.
|
|
||||||
*/
|
|
||||||
- VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
|
|
||||||
+ VPCMP $0, BASE_OFFSET(%rdi), %YMMMATCH, %k0
|
|
||||||
+ addq $(BASE_OFFSET - VEC_SIZE), %rdi
|
|
||||||
kmovd %k0, %eax
|
|
||||||
- addq $(VEC_SIZE * 3), %rdi
|
|
||||||
- .p2align 4
|
|
||||||
+ VZEROUPPER
|
|
||||||
+
|
|
||||||
L(last_4x_vec_or_less):
|
|
||||||
/* Check if first VEC contained match. */
|
|
||||||
testl %eax, %eax
|
|
||||||
@@ -338,73 +422,78 @@ L(loop_4x_vec_end):
|
|
||||||
/* rawmemchr will fall through into this if match was found in
|
|
||||||
loop. */
|
|
||||||
|
|
||||||
+# if defined USE_IN_RTM || defined USE_AS_WMEMCHR
|
|
||||||
/* k1 has not of matches with VEC1. */
|
|
||||||
kmovd %k1, %eax
|
|
||||||
-# ifdef USE_AS_WMEMCHR
|
|
||||||
+# ifdef USE_AS_WMEMCHR
|
|
||||||
subl $((1 << CHAR_PER_VEC) - 1), %eax
|
|
||||||
-# else
|
|
||||||
+# else
|
|
||||||
incl %eax
|
|
||||||
+# endif
|
|
||||||
+# else
|
|
||||||
+ /* eax already has matches for VEC1. */
|
|
||||||
+ testl %eax, %eax
|
|
||||||
# endif
|
|
||||||
jnz L(last_vec_x1_return)
|
|
||||||
|
|
||||||
+# ifdef USE_IN_RTM
|
|
||||||
VPCMP $0, %YMM2, %YMMZERO, %k0
|
|
||||||
kmovd %k0, %eax
|
|
||||||
+# else
|
|
||||||
+ vpmovmskb %ymm2, %eax
|
|
||||||
+# endif
|
|
||||||
testl %eax, %eax
|
|
||||||
jnz L(last_vec_x2_return)
|
|
||||||
|
|
||||||
+# ifdef USE_IN_RTM
|
|
||||||
kmovd %k2, %eax
|
|
||||||
testl %eax, %eax
|
|
||||||
jnz L(last_vec_x3_return)
|
|
||||||
|
|
||||||
kmovd %k3, %eax
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
-# ifdef USE_AS_RAWMEMCHR
|
|
||||||
- leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
+ leaq (VEC_SIZE * 3 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
# else
|
|
||||||
- leaq (VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
+ vpmovmskb %ymm3, %eax
|
|
||||||
+ /* Combine matches in VEC3 (eax) with matches in VEC4 (ecx). */
|
|
||||||
+ salq $VEC_SIZE, %rcx
|
|
||||||
+ orq %rcx, %rax
|
|
||||||
+ tzcntq %rax, %rax
|
|
||||||
+ leaq (VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax), %rax
|
|
||||||
+ VZEROUPPER
|
|
||||||
# endif
|
|
||||||
ret
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
L(last_vec_x1_return):
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
-# ifdef USE_AS_RAWMEMCHR
|
|
||||||
-# ifdef USE_AS_WMEMCHR
|
|
||||||
+# if defined USE_AS_WMEMCHR || RET_OFFSET != 0
|
|
||||||
/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
|
|
||||||
- leaq (%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
-# else
|
|
||||||
- addq %rdi, %rax
|
|
||||||
-# endif
|
|
||||||
+ leaq RET_OFFSET(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
# else
|
|
||||||
- /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
|
|
||||||
- leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
+ addq %rdi, %rax
|
|
||||||
# endif
|
|
||||||
+ VZEROUPPER
|
|
||||||
ret
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
L(last_vec_x2_return):
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
-# ifdef USE_AS_RAWMEMCHR
|
|
||||||
- /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
|
|
||||||
- leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
-# else
|
|
||||||
- /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
|
|
||||||
- leaq (VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
-# endif
|
|
||||||
+ /* NB: Multiply bytes by RET_SCALE to get the wchar_t count
|
|
||||||
+ if relevant (RET_SCALE = CHAR_SIZE if USE_AS_WMEMCHAR and
|
|
||||||
+ USE_IN_RTM are both defined. Otherwise RET_SCALE = 1. */
|
|
||||||
+ leaq (VEC_SIZE + RET_OFFSET)(%rdi, %rax, RET_SCALE), %rax
|
|
||||||
+ VZEROUPPER
|
|
||||||
ret
|
|
||||||
|
|
||||||
+# ifdef USE_IN_RTM
|
|
||||||
.p2align 4
|
|
||||||
L(last_vec_x3_return):
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
-# ifdef USE_AS_RAWMEMCHR
|
|
||||||
- /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
|
|
||||||
- leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
-# else
|
|
||||||
/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
|
|
||||||
- leaq (VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
-# endif
|
|
||||||
+ leaq (VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
ret
|
|
||||||
-
|
|
||||||
+# endif
|
|
||||||
|
|
||||||
# ifndef USE_AS_RAWMEMCHR
|
|
||||||
L(last_4x_vec_or_less_cmpeq):
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/memchr.c b/sysdeps/x86_64/multiarch/memchr.c
|
|
||||||
index 016f5784..f28aea77 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/memchr.c
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/memchr.c
|
|
||||||
@@ -24,7 +24,7 @@
|
|
||||||
# undef memchr
|
|
||||||
|
|
||||||
# define SYMBOL_NAME memchr
|
|
||||||
-# include "ifunc-avx2.h"
|
|
||||||
+# include "ifunc-evex.h"
|
|
||||||
|
|
||||||
libc_ifunc_redirected (__redirect_memchr, memchr, IFUNC_SELECTOR ());
|
|
||||||
strong_alias (memchr, __memchr)
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..deda1ca3
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
|
|
||||||
@@ -0,0 +1,3 @@
|
|
||||||
+#define MEMCHR __rawmemchr_evex_rtm
|
|
||||||
+#define USE_AS_RAWMEMCHR 1
|
|
||||||
+#include "memchr-evex-rtm.S"
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/rawmemchr.c b/sysdeps/x86_64/multiarch/rawmemchr.c
|
|
||||||
index 8a0bc313..1f764f35 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/rawmemchr.c
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/rawmemchr.c
|
|
||||||
@@ -26,7 +26,7 @@
|
|
||||||
# undef __rawmemchr
|
|
||||||
|
|
||||||
# define SYMBOL_NAME rawmemchr
|
|
||||||
-# include "ifunc-avx2.h"
|
|
||||||
+# include "ifunc-evex.h"
|
|
||||||
|
|
||||||
libc_ifunc_redirected (__redirect_rawmemchr, __rawmemchr,
|
|
||||||
IFUNC_SELECTOR ());
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..a346cd35
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S
|
|
||||||
@@ -0,0 +1,3 @@
|
|
||||||
+#define MEMCHR __wmemchr_evex_rtm
|
|
||||||
+#define USE_AS_WMEMCHR 1
|
|
||||||
+#include "memchr-evex-rtm.S"
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/wmemchr.c b/sysdeps/x86_64/multiarch/wmemchr.c
|
|
||||||
index 6d833702..f9c91915 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/wmemchr.c
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/wmemchr.c
|
|
||||||
@@ -26,7 +26,7 @@
|
|
||||||
# undef __wmemchr
|
|
||||||
|
|
||||||
# define SYMBOL_NAME wmemchr
|
|
||||||
-# include "ifunc-avx2.h"
|
|
||||||
+# include "ifunc-evex.h"
|
|
||||||
|
|
||||||
libc_ifunc_redirected (__redirect_wmemchr, __wmemchr, IFUNC_SELECTOR ());
|
|
||||||
weak_alias (__wmemchr, wmemchr)
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,873 +0,0 @@
|
|||||||
From 16d12015c57701b08d7bbed6ec536641bcafb428 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
Date: Mon, 17 May 2021 13:56:52 -0400
|
|
||||||
Subject: [PATCH] x86: Optimize memcmp-avx2-movbe.S
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
No bug. This commit optimizes memcmp-avx2.S. The optimizations include
|
|
||||||
adding a new vec compare path for small sizes, reorganizing the entry
|
|
||||||
control flow, and removing some unnecissary ALU instructions from the
|
|
||||||
main loop. test-memcmp and test-wmemcmp are both passing.
|
|
||||||
|
|
||||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 6 +
|
|
||||||
sysdeps/x86_64/multiarch/ifunc-memcmp.h | 1 +
|
|
||||||
sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 676 +++++++++++--------
|
|
||||||
3 files changed, 402 insertions(+), 281 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
||||||
index ac097e8d..8be0d78a 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
||||||
@@ -63,16 +63,19 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
||||||
IFUNC_IMPL (i, name, memcmp,
|
|
||||||
IFUNC_IMPL_ADD (array, i, memcmp,
|
|
||||||
(CPU_FEATURE_USABLE (AVX2)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)
|
|
||||||
&& CPU_FEATURE_USABLE (MOVBE)),
|
|
||||||
__memcmp_avx2_movbe)
|
|
||||||
IFUNC_IMPL_ADD (array, i, memcmp,
|
|
||||||
(CPU_FEATURE_USABLE (AVX2)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)
|
|
||||||
&& CPU_FEATURE_USABLE (MOVBE)
|
|
||||||
&& CPU_FEATURE_USABLE (RTM)),
|
|
||||||
__memcmp_avx2_movbe_rtm)
|
|
||||||
IFUNC_IMPL_ADD (array, i, memcmp,
|
|
||||||
(CPU_FEATURE_USABLE (AVX512VL)
|
|
||||||
&& CPU_FEATURE_USABLE (AVX512BW)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)
|
|
||||||
&& CPU_FEATURE_USABLE (MOVBE)),
|
|
||||||
__memcmp_evex_movbe)
|
|
||||||
IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1),
|
|
||||||
@@ -732,16 +735,19 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
||||||
IFUNC_IMPL (i, name, wmemcmp,
|
|
||||||
IFUNC_IMPL_ADD (array, i, wmemcmp,
|
|
||||||
(CPU_FEATURE_USABLE (AVX2)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)
|
|
||||||
&& CPU_FEATURE_USABLE (MOVBE)),
|
|
||||||
__wmemcmp_avx2_movbe)
|
|
||||||
IFUNC_IMPL_ADD (array, i, wmemcmp,
|
|
||||||
(CPU_FEATURE_USABLE (AVX2)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)
|
|
||||||
&& CPU_FEATURE_USABLE (MOVBE)
|
|
||||||
&& CPU_FEATURE_USABLE (RTM)),
|
|
||||||
__wmemcmp_avx2_movbe_rtm)
|
|
||||||
IFUNC_IMPL_ADD (array, i, wmemcmp,
|
|
||||||
(CPU_FEATURE_USABLE (AVX512VL)
|
|
||||||
&& CPU_FEATURE_USABLE (AVX512BW)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)
|
|
||||||
&& CPU_FEATURE_USABLE (MOVBE)),
|
|
||||||
__wmemcmp_evex_movbe)
|
|
||||||
IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1),
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
|
|
||||||
index 8043c635..690dffe8 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
|
|
||||||
@@ -33,6 +33,7 @@ IFUNC_SELECTOR (void)
|
|
||||||
|
|
||||||
if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
|
|
||||||
&& CPU_FEATURE_USABLE_P (cpu_features, MOVBE)
|
|
||||||
+ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
|
|
||||||
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
|
|
||||||
{
|
|
||||||
if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
|
|
||||||
index 9d5c9c72..16fc673e 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
|
|
||||||
@@ -19,17 +19,23 @@
|
|
||||||
#if IS_IN (libc)
|
|
||||||
|
|
||||||
/* memcmp/wmemcmp is implemented as:
|
|
||||||
- 1. For size from 2 to 7 bytes, load as big endian with movbe and bswap
|
|
||||||
- to avoid branches.
|
|
||||||
- 2. Use overlapping compare to avoid branch.
|
|
||||||
- 3. Use vector compare when size >= 4 bytes for memcmp or size >= 8
|
|
||||||
- bytes for wmemcmp.
|
|
||||||
- 4. If size is 8 * VEC_SIZE or less, unroll the loop.
|
|
||||||
- 5. Compare 4 * VEC_SIZE at a time with the aligned first memory
|
|
||||||
+ 1. Use ymm vector compares when possible. The only case where
|
|
||||||
+ vector compares is not possible for when size < VEC_SIZE
|
|
||||||
+ and loading from either s1 or s2 would cause a page cross.
|
|
||||||
+ 2. For size from 2 to 7 bytes on page cross, load as big endian
|
|
||||||
+ with movbe and bswap to avoid branches.
|
|
||||||
+ 3. Use xmm vector compare when size >= 4 bytes for memcmp or
|
|
||||||
+ size >= 8 bytes for wmemcmp.
|
|
||||||
+ 4. Optimistically compare up to first 4 * VEC_SIZE one at a
|
|
||||||
+ to check for early mismatches. Only do this if its guranteed the
|
|
||||||
+ work is not wasted.
|
|
||||||
+ 5. If size is 8 * VEC_SIZE or less, unroll the loop.
|
|
||||||
+ 6. Compare 4 * VEC_SIZE at a time with the aligned first memory
|
|
||||||
area.
|
|
||||||
- 6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
|
|
||||||
- 7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
|
|
||||||
- 8. Use 8 vector compares when size is 8 * VEC_SIZE or less. */
|
|
||||||
+ 7. Use 2 vector compares when size is 2 * VEC_SIZE or less.
|
|
||||||
+ 8. Use 4 vector compares when size is 4 * VEC_SIZE or less.
|
|
||||||
+ 9. Use 8 vector compares when size is 8 * VEC_SIZE or less. */
|
|
||||||
+
|
|
||||||
|
|
||||||
# include <sysdep.h>
|
|
||||||
|
|
||||||
@@ -38,8 +44,10 @@
|
|
||||||
# endif
|
|
||||||
|
|
||||||
# ifdef USE_AS_WMEMCMP
|
|
||||||
+# define CHAR_SIZE 4
|
|
||||||
# define VPCMPEQ vpcmpeqd
|
|
||||||
# else
|
|
||||||
+# define CHAR_SIZE 1
|
|
||||||
# define VPCMPEQ vpcmpeqb
|
|
||||||
# endif
|
|
||||||
|
|
||||||
@@ -52,7 +60,7 @@
|
|
||||||
# endif
|
|
||||||
|
|
||||||
# define VEC_SIZE 32
|
|
||||||
-# define VEC_MASK ((1 << VEC_SIZE) - 1)
|
|
||||||
+# define PAGE_SIZE 4096
|
|
||||||
|
|
||||||
/* Warning!
|
|
||||||
wmemcmp has to use SIGNED comparison for elements.
|
|
||||||
@@ -71,136 +79,359 @@ ENTRY (MEMCMP)
|
|
||||||
jb L(less_vec)
|
|
||||||
|
|
||||||
/* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
|
|
||||||
- vmovdqu (%rsi), %ymm2
|
|
||||||
- VPCMPEQ (%rdi), %ymm2, %ymm2
|
|
||||||
- vpmovmskb %ymm2, %eax
|
|
||||||
- subl $VEC_MASK, %eax
|
|
||||||
- jnz L(first_vec)
|
|
||||||
+ vmovdqu (%rsi), %ymm1
|
|
||||||
+ VPCMPEQ (%rdi), %ymm1, %ymm1
|
|
||||||
+ vpmovmskb %ymm1, %eax
|
|
||||||
+ /* NB: eax must be destination register if going to
|
|
||||||
+ L(return_vec_[0,2]). For L(return_vec_3 destination register
|
|
||||||
+ must be ecx. */
|
|
||||||
+ incl %eax
|
|
||||||
+ jnz L(return_vec_0)
|
|
||||||
|
|
||||||
cmpq $(VEC_SIZE * 2), %rdx
|
|
||||||
- jbe L(last_vec)
|
|
||||||
-
|
|
||||||
- VPCMPEQ %ymm0, %ymm0, %ymm0
|
|
||||||
- /* More than 2 * VEC. */
|
|
||||||
- cmpq $(VEC_SIZE * 8), %rdx
|
|
||||||
- ja L(more_8x_vec)
|
|
||||||
- cmpq $(VEC_SIZE * 4), %rdx
|
|
||||||
- jb L(last_4x_vec)
|
|
||||||
-
|
|
||||||
- /* From 4 * VEC to 8 * VEC, inclusively. */
|
|
||||||
- vmovdqu (%rsi), %ymm1
|
|
||||||
- VPCMPEQ (%rdi), %ymm1, %ymm1
|
|
||||||
+ jbe L(last_1x_vec)
|
|
||||||
|
|
||||||
+ /* Check second VEC no matter what. */
|
|
||||||
vmovdqu VEC_SIZE(%rsi), %ymm2
|
|
||||||
- VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
|
|
||||||
+ VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
|
|
||||||
+ vpmovmskb %ymm2, %eax
|
|
||||||
+ /* If all 4 VEC where equal eax will be all 1s so incl will
|
|
||||||
+ overflow and set zero flag. */
|
|
||||||
+ incl %eax
|
|
||||||
+ jnz L(return_vec_1)
|
|
||||||
|
|
||||||
- vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
|
|
||||||
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
|
|
||||||
+ /* Less than 4 * VEC. */
|
|
||||||
+ cmpq $(VEC_SIZE * 4), %rdx
|
|
||||||
+ jbe L(last_2x_vec)
|
|
||||||
|
|
||||||
+ /* Check third and fourth VEC no matter what. */
|
|
||||||
+ vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
|
|
||||||
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
|
|
||||||
+ vpmovmskb %ymm3, %eax
|
|
||||||
+ incl %eax
|
|
||||||
+ jnz L(return_vec_2)
|
|
||||||
vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
|
|
||||||
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
|
|
||||||
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
|
|
||||||
+ vpmovmskb %ymm4, %ecx
|
|
||||||
+ incl %ecx
|
|
||||||
+ jnz L(return_vec_3)
|
|
||||||
|
|
||||||
- vpand %ymm1, %ymm2, %ymm5
|
|
||||||
- vpand %ymm3, %ymm4, %ymm6
|
|
||||||
- vpand %ymm5, %ymm6, %ymm5
|
|
||||||
+ /* Go to 4x VEC loop. */
|
|
||||||
+ cmpq $(VEC_SIZE * 8), %rdx
|
|
||||||
+ ja L(more_8x_vec)
|
|
||||||
|
|
||||||
- vptest %ymm0, %ymm5
|
|
||||||
- jnc L(4x_vec_end)
|
|
||||||
+ /* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
|
|
||||||
+ branches. */
|
|
||||||
|
|
||||||
+ /* Load first two VEC from s2 before adjusting addresses. */
|
|
||||||
+ vmovdqu -(VEC_SIZE * 4)(%rsi, %rdx), %ymm1
|
|
||||||
+ vmovdqu -(VEC_SIZE * 3)(%rsi, %rdx), %ymm2
|
|
||||||
leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi
|
|
||||||
leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi
|
|
||||||
- vmovdqu (%rsi), %ymm1
|
|
||||||
- VPCMPEQ (%rdi), %ymm1, %ymm1
|
|
||||||
|
|
||||||
- vmovdqu VEC_SIZE(%rsi), %ymm2
|
|
||||||
- VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
|
|
||||||
- vpand %ymm2, %ymm1, %ymm5
|
|
||||||
+ /* Wait to load from s1 until addressed adjust due to
|
|
||||||
+ unlamination of microfusion with complex address mode. */
|
|
||||||
+ VPCMPEQ (%rdi), %ymm1, %ymm1
|
|
||||||
+ VPCMPEQ (VEC_SIZE)(%rdi), %ymm2, %ymm2
|
|
||||||
|
|
||||||
vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
|
|
||||||
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
|
|
||||||
- vpand %ymm3, %ymm5, %ymm5
|
|
||||||
-
|
|
||||||
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
|
|
||||||
vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
|
|
||||||
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
|
|
||||||
- vpand %ymm4, %ymm5, %ymm5
|
|
||||||
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
|
|
||||||
|
|
||||||
- vptest %ymm0, %ymm5
|
|
||||||
- jnc L(4x_vec_end)
|
|
||||||
- xorl %eax, %eax
|
|
||||||
+ /* Reduce VEC0 - VEC4. */
|
|
||||||
+ vpand %ymm1, %ymm2, %ymm5
|
|
||||||
+ vpand %ymm3, %ymm4, %ymm6
|
|
||||||
+ vpand %ymm5, %ymm6, %ymm7
|
|
||||||
+ vpmovmskb %ymm7, %ecx
|
|
||||||
+ incl %ecx
|
|
||||||
+ jnz L(return_vec_0_1_2_3)
|
|
||||||
+ /* NB: eax must be zero to reach here. */
|
|
||||||
+ VZEROUPPER_RETURN
|
|
||||||
+
|
|
||||||
+ .p2align 4
|
|
||||||
+L(return_vec_0):
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+# ifdef USE_AS_WMEMCMP
|
|
||||||
+ movl (%rdi, %rax), %ecx
|
|
||||||
+ xorl %edx, %edx
|
|
||||||
+ cmpl (%rsi, %rax), %ecx
|
|
||||||
+ /* NB: no partial register stall here because xorl zero idiom
|
|
||||||
+ above. */
|
|
||||||
+ setg %dl
|
|
||||||
+ leal -1(%rdx, %rdx), %eax
|
|
||||||
+# else
|
|
||||||
+ movzbl (%rsi, %rax), %ecx
|
|
||||||
+ movzbl (%rdi, %rax), %eax
|
|
||||||
+ subl %ecx, %eax
|
|
||||||
+# endif
|
|
||||||
L(return_vzeroupper):
|
|
||||||
ZERO_UPPER_VEC_REGISTERS_RETURN
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
-L(last_2x_vec):
|
|
||||||
- /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
|
|
||||||
- vmovdqu (%rsi), %ymm2
|
|
||||||
- VPCMPEQ (%rdi), %ymm2, %ymm2
|
|
||||||
- vpmovmskb %ymm2, %eax
|
|
||||||
- subl $VEC_MASK, %eax
|
|
||||||
- jnz L(first_vec)
|
|
||||||
+L(return_vec_1):
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+# ifdef USE_AS_WMEMCMP
|
|
||||||
+ movl VEC_SIZE(%rdi, %rax), %ecx
|
|
||||||
+ xorl %edx, %edx
|
|
||||||
+ cmpl VEC_SIZE(%rsi, %rax), %ecx
|
|
||||||
+ setg %dl
|
|
||||||
+ leal -1(%rdx, %rdx), %eax
|
|
||||||
+# else
|
|
||||||
+ movzbl VEC_SIZE(%rsi, %rax), %ecx
|
|
||||||
+ movzbl VEC_SIZE(%rdi, %rax), %eax
|
|
||||||
+ subl %ecx, %eax
|
|
||||||
+# endif
|
|
||||||
+ VZEROUPPER_RETURN
|
|
||||||
+
|
|
||||||
+ .p2align 4
|
|
||||||
+L(return_vec_2):
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+# ifdef USE_AS_WMEMCMP
|
|
||||||
+ movl (VEC_SIZE * 2)(%rdi, %rax), %ecx
|
|
||||||
+ xorl %edx, %edx
|
|
||||||
+ cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx
|
|
||||||
+ setg %dl
|
|
||||||
+ leal -1(%rdx, %rdx), %eax
|
|
||||||
+# else
|
|
||||||
+ movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx
|
|
||||||
+ movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax
|
|
||||||
+ subl %ecx, %eax
|
|
||||||
+# endif
|
|
||||||
+ VZEROUPPER_RETURN
|
|
||||||
+
|
|
||||||
+ /* NB: p2align 5 here to ensure 4x loop is 32 byte aligned. */
|
|
||||||
+ .p2align 5
|
|
||||||
+L(8x_return_vec_0_1_2_3):
|
|
||||||
+ /* Returning from L(more_8x_vec) requires restoring rsi. */
|
|
||||||
+ addq %rdi, %rsi
|
|
||||||
+L(return_vec_0_1_2_3):
|
|
||||||
+ vpmovmskb %ymm1, %eax
|
|
||||||
+ incl %eax
|
|
||||||
+ jnz L(return_vec_0)
|
|
||||||
|
|
||||||
-L(last_vec):
|
|
||||||
- /* Use overlapping loads to avoid branches. */
|
|
||||||
- leaq -VEC_SIZE(%rdi, %rdx), %rdi
|
|
||||||
- leaq -VEC_SIZE(%rsi, %rdx), %rsi
|
|
||||||
- vmovdqu (%rsi), %ymm2
|
|
||||||
- VPCMPEQ (%rdi), %ymm2, %ymm2
|
|
||||||
vpmovmskb %ymm2, %eax
|
|
||||||
- subl $VEC_MASK, %eax
|
|
||||||
- jnz L(first_vec)
|
|
||||||
+ incl %eax
|
|
||||||
+ jnz L(return_vec_1)
|
|
||||||
+
|
|
||||||
+ vpmovmskb %ymm3, %eax
|
|
||||||
+ incl %eax
|
|
||||||
+ jnz L(return_vec_2)
|
|
||||||
+L(return_vec_3):
|
|
||||||
+ tzcntl %ecx, %ecx
|
|
||||||
+# ifdef USE_AS_WMEMCMP
|
|
||||||
+ movl (VEC_SIZE * 3)(%rdi, %rcx), %eax
|
|
||||||
+ xorl %edx, %edx
|
|
||||||
+ cmpl (VEC_SIZE * 3)(%rsi, %rcx), %eax
|
|
||||||
+ setg %dl
|
|
||||||
+ leal -1(%rdx, %rdx), %eax
|
|
||||||
+# else
|
|
||||||
+ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
|
|
||||||
+ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
|
|
||||||
+ subl %ecx, %eax
|
|
||||||
+# endif
|
|
||||||
+ VZEROUPPER_RETURN
|
|
||||||
+
|
|
||||||
+ .p2align 4
|
|
||||||
+L(more_8x_vec):
|
|
||||||
+ /* Set end of s1 in rdx. */
|
|
||||||
+ leaq -(VEC_SIZE * 4)(%rdi, %rdx), %rdx
|
|
||||||
+ /* rsi stores s2 - s1. This allows loop to only update one
|
|
||||||
+ pointer. */
|
|
||||||
+ subq %rdi, %rsi
|
|
||||||
+ /* Align s1 pointer. */
|
|
||||||
+ andq $-VEC_SIZE, %rdi
|
|
||||||
+ /* Adjust because first 4x vec where check already. */
|
|
||||||
+ subq $-(VEC_SIZE * 4), %rdi
|
|
||||||
+ .p2align 4
|
|
||||||
+L(loop_4x_vec):
|
|
||||||
+ /* rsi has s2 - s1 so get correct address by adding s1 (in rdi).
|
|
||||||
+ */
|
|
||||||
+ vmovdqu (%rsi, %rdi), %ymm1
|
|
||||||
+ VPCMPEQ (%rdi), %ymm1, %ymm1
|
|
||||||
+
|
|
||||||
+ vmovdqu VEC_SIZE(%rsi, %rdi), %ymm2
|
|
||||||
+ VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
|
|
||||||
+
|
|
||||||
+ vmovdqu (VEC_SIZE * 2)(%rsi, %rdi), %ymm3
|
|
||||||
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
|
|
||||||
+
|
|
||||||
+ vmovdqu (VEC_SIZE * 3)(%rsi, %rdi), %ymm4
|
|
||||||
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
|
|
||||||
+
|
|
||||||
+ vpand %ymm1, %ymm2, %ymm5
|
|
||||||
+ vpand %ymm3, %ymm4, %ymm6
|
|
||||||
+ vpand %ymm5, %ymm6, %ymm7
|
|
||||||
+ vpmovmskb %ymm7, %ecx
|
|
||||||
+ incl %ecx
|
|
||||||
+ jnz L(8x_return_vec_0_1_2_3)
|
|
||||||
+ subq $-(VEC_SIZE * 4), %rdi
|
|
||||||
+ /* Check if s1 pointer at end. */
|
|
||||||
+ cmpq %rdx, %rdi
|
|
||||||
+ jb L(loop_4x_vec)
|
|
||||||
+
|
|
||||||
+ subq %rdx, %rdi
|
|
||||||
+ /* rdi has 4 * VEC_SIZE - remaining length. */
|
|
||||||
+ cmpl $(VEC_SIZE * 3), %edi
|
|
||||||
+ jae L(8x_last_1x_vec)
|
|
||||||
+ /* Load regardless of branch. */
|
|
||||||
+ vmovdqu (VEC_SIZE * 2)(%rsi, %rdx), %ymm3
|
|
||||||
+ cmpl $(VEC_SIZE * 2), %edi
|
|
||||||
+ jae L(8x_last_2x_vec)
|
|
||||||
+
|
|
||||||
+ /* Check last 4 VEC. */
|
|
||||||
+ vmovdqu (%rsi, %rdx), %ymm1
|
|
||||||
+ VPCMPEQ (%rdx), %ymm1, %ymm1
|
|
||||||
+
|
|
||||||
+ vmovdqu VEC_SIZE(%rsi, %rdx), %ymm2
|
|
||||||
+ VPCMPEQ VEC_SIZE(%rdx), %ymm2, %ymm2
|
|
||||||
+
|
|
||||||
+ VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm3, %ymm3
|
|
||||||
+
|
|
||||||
+ vmovdqu (VEC_SIZE * 3)(%rsi, %rdx), %ymm4
|
|
||||||
+ VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm4, %ymm4
|
|
||||||
+
|
|
||||||
+ vpand %ymm1, %ymm2, %ymm5
|
|
||||||
+ vpand %ymm3, %ymm4, %ymm6
|
|
||||||
+ vpand %ymm5, %ymm6, %ymm7
|
|
||||||
+ vpmovmskb %ymm7, %ecx
|
|
||||||
+ /* Restore s1 pointer to rdi. */
|
|
||||||
+ movq %rdx, %rdi
|
|
||||||
+ incl %ecx
|
|
||||||
+ jnz L(8x_return_vec_0_1_2_3)
|
|
||||||
+ /* NB: eax must be zero to reach here. */
|
|
||||||
+ VZEROUPPER_RETURN
|
|
||||||
+
|
|
||||||
+ /* Only entry is from L(more_8x_vec). */
|
|
||||||
+ .p2align 4
|
|
||||||
+L(8x_last_2x_vec):
|
|
||||||
+ /* Check second to last VEC. rdx store end pointer of s1 and
|
|
||||||
+ ymm3 has already been loaded with second to last VEC from s2.
|
|
||||||
+ */
|
|
||||||
+ VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm3, %ymm3
|
|
||||||
+ vpmovmskb %ymm3, %eax
|
|
||||||
+ incl %eax
|
|
||||||
+ jnz L(8x_return_vec_2)
|
|
||||||
+ /* Check last VEC. */
|
|
||||||
+ .p2align 4
|
|
||||||
+L(8x_last_1x_vec):
|
|
||||||
+ vmovdqu (VEC_SIZE * 3)(%rsi, %rdx), %ymm4
|
|
||||||
+ VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm4, %ymm4
|
|
||||||
+ vpmovmskb %ymm4, %eax
|
|
||||||
+ incl %eax
|
|
||||||
+ jnz L(8x_return_vec_3)
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
-L(first_vec):
|
|
||||||
- /* A byte or int32 is different within 16 or 32 bytes. */
|
|
||||||
- tzcntl %eax, %ecx
|
|
||||||
+L(last_2x_vec):
|
|
||||||
+ /* Check second to last VEC. */
|
|
||||||
+ vmovdqu -(VEC_SIZE * 2)(%rsi, %rdx), %ymm1
|
|
||||||
+ VPCMPEQ -(VEC_SIZE * 2)(%rdi, %rdx), %ymm1, %ymm1
|
|
||||||
+ vpmovmskb %ymm1, %eax
|
|
||||||
+ incl %eax
|
|
||||||
+ jnz L(return_vec_1_end)
|
|
||||||
+ /* Check last VEC. */
|
|
||||||
+L(last_1x_vec):
|
|
||||||
+ vmovdqu -(VEC_SIZE * 1)(%rsi, %rdx), %ymm1
|
|
||||||
+ VPCMPEQ -(VEC_SIZE * 1)(%rdi, %rdx), %ymm1, %ymm1
|
|
||||||
+ vpmovmskb %ymm1, %eax
|
|
||||||
+ incl %eax
|
|
||||||
+ jnz L(return_vec_0_end)
|
|
||||||
+ VZEROUPPER_RETURN
|
|
||||||
+
|
|
||||||
+ .p2align 4
|
|
||||||
+L(8x_return_vec_2):
|
|
||||||
+ subq $VEC_SIZE, %rdx
|
|
||||||
+L(8x_return_vec_3):
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+ addq %rdx, %rax
|
|
||||||
# ifdef USE_AS_WMEMCMP
|
|
||||||
- xorl %eax, %eax
|
|
||||||
- movl (%rdi, %rcx), %edx
|
|
||||||
- cmpl (%rsi, %rcx), %edx
|
|
||||||
-L(wmemcmp_return):
|
|
||||||
- setl %al
|
|
||||||
- negl %eax
|
|
||||||
- orl $1, %eax
|
|
||||||
+ movl (VEC_SIZE * 3)(%rax), %ecx
|
|
||||||
+ xorl %edx, %edx
|
|
||||||
+ cmpl (VEC_SIZE * 3)(%rsi, %rax), %ecx
|
|
||||||
+ setg %dl
|
|
||||||
+ leal -1(%rdx, %rdx), %eax
|
|
||||||
# else
|
|
||||||
- movzbl (%rdi, %rcx), %eax
|
|
||||||
- movzbl (%rsi, %rcx), %edx
|
|
||||||
- sub %edx, %eax
|
|
||||||
+ movzbl (VEC_SIZE * 3)(%rsi, %rax), %ecx
|
|
||||||
+ movzbl (VEC_SIZE * 3)(%rax), %eax
|
|
||||||
+ subl %ecx, %eax
|
|
||||||
# endif
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
|
|
||||||
-# ifdef USE_AS_WMEMCMP
|
|
||||||
.p2align 4
|
|
||||||
-L(4):
|
|
||||||
- xorl %eax, %eax
|
|
||||||
- movl (%rdi), %edx
|
|
||||||
- cmpl (%rsi), %edx
|
|
||||||
- jne L(wmemcmp_return)
|
|
||||||
- ret
|
|
||||||
+L(return_vec_1_end):
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+ addl %edx, %eax
|
|
||||||
+# ifdef USE_AS_WMEMCMP
|
|
||||||
+ movl -(VEC_SIZE * 2)(%rdi, %rax), %ecx
|
|
||||||
+ xorl %edx, %edx
|
|
||||||
+ cmpl -(VEC_SIZE * 2)(%rsi, %rax), %ecx
|
|
||||||
+ setg %dl
|
|
||||||
+ leal -1(%rdx, %rdx), %eax
|
|
||||||
# else
|
|
||||||
+ movzbl -(VEC_SIZE * 2)(%rsi, %rax), %ecx
|
|
||||||
+ movzbl -(VEC_SIZE * 2)(%rdi, %rax), %eax
|
|
||||||
+ subl %ecx, %eax
|
|
||||||
+# endif
|
|
||||||
+ VZEROUPPER_RETURN
|
|
||||||
+
|
|
||||||
.p2align 4
|
|
||||||
-L(between_4_7):
|
|
||||||
- /* Load as big endian with overlapping movbe to avoid branches. */
|
|
||||||
- movbe (%rdi), %eax
|
|
||||||
- movbe (%rsi), %ecx
|
|
||||||
- shlq $32, %rax
|
|
||||||
- shlq $32, %rcx
|
|
||||||
- movbe -4(%rdi, %rdx), %edi
|
|
||||||
- movbe -4(%rsi, %rdx), %esi
|
|
||||||
- orq %rdi, %rax
|
|
||||||
- orq %rsi, %rcx
|
|
||||||
- subq %rcx, %rax
|
|
||||||
- je L(exit)
|
|
||||||
- sbbl %eax, %eax
|
|
||||||
- orl $1, %eax
|
|
||||||
- ret
|
|
||||||
+L(return_vec_0_end):
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+ addl %edx, %eax
|
|
||||||
+# ifdef USE_AS_WMEMCMP
|
|
||||||
+ movl -VEC_SIZE(%rdi, %rax), %ecx
|
|
||||||
+ xorl %edx, %edx
|
|
||||||
+ cmpl -VEC_SIZE(%rsi, %rax), %ecx
|
|
||||||
+ setg %dl
|
|
||||||
+ leal -1(%rdx, %rdx), %eax
|
|
||||||
+# else
|
|
||||||
+ movzbl -VEC_SIZE(%rsi, %rax), %ecx
|
|
||||||
+ movzbl -VEC_SIZE(%rdi, %rax), %eax
|
|
||||||
+ subl %ecx, %eax
|
|
||||||
+# endif
|
|
||||||
+ VZEROUPPER_RETURN
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
-L(exit):
|
|
||||||
- ret
|
|
||||||
+L(less_vec):
|
|
||||||
+ /* Check if one or less CHAR. This is necessary for size = 0 but
|
|
||||||
+ is also faster for size = CHAR_SIZE. */
|
|
||||||
+ cmpl $CHAR_SIZE, %edx
|
|
||||||
+ jbe L(one_or_less)
|
|
||||||
+
|
|
||||||
+ /* Check if loading one VEC from either s1 or s2 could cause a
|
|
||||||
+ page cross. This can have false positives but is by far the
|
|
||||||
+ fastest method. */
|
|
||||||
+ movl %edi, %eax
|
|
||||||
+ orl %esi, %eax
|
|
||||||
+ andl $(PAGE_SIZE - 1), %eax
|
|
||||||
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
|
|
||||||
+ jg L(page_cross_less_vec)
|
|
||||||
+
|
|
||||||
+ /* No page cross possible. */
|
|
||||||
+ vmovdqu (%rsi), %ymm2
|
|
||||||
+ VPCMPEQ (%rdi), %ymm2, %ymm2
|
|
||||||
+ vpmovmskb %ymm2, %eax
|
|
||||||
+ incl %eax
|
|
||||||
+ /* Result will be zero if s1 and s2 match. Otherwise first set
|
|
||||||
+ bit will be first mismatch. */
|
|
||||||
+ bzhil %edx, %eax, %edx
|
|
||||||
+ jnz L(return_vec_0)
|
|
||||||
+ xorl %eax, %eax
|
|
||||||
+ VZEROUPPER_RETURN
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
-L(between_2_3):
|
|
||||||
+L(page_cross_less_vec):
|
|
||||||
+ /* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
|
|
||||||
+ bytes. */
|
|
||||||
+ cmpl $16, %edx
|
|
||||||
+ jae L(between_16_31)
|
|
||||||
+# ifndef USE_AS_WMEMCMP
|
|
||||||
+ cmpl $8, %edx
|
|
||||||
+ jae L(between_8_15)
|
|
||||||
+ cmpl $4, %edx
|
|
||||||
+ jae L(between_4_7)
|
|
||||||
+
|
|
||||||
/* Load as big endian to avoid branches. */
|
|
||||||
movzwl (%rdi), %eax
|
|
||||||
movzwl (%rsi), %ecx
|
|
||||||
@@ -208,223 +439,106 @@ L(between_2_3):
|
|
||||||
shll $8, %ecx
|
|
||||||
bswap %eax
|
|
||||||
bswap %ecx
|
|
||||||
- movb -1(%rdi, %rdx), %al
|
|
||||||
- movb -1(%rsi, %rdx), %cl
|
|
||||||
+ movzbl -1(%rdi, %rdx), %edi
|
|
||||||
+ movzbl -1(%rsi, %rdx), %esi
|
|
||||||
+ orl %edi, %eax
|
|
||||||
+ orl %esi, %ecx
|
|
||||||
/* Subtraction is okay because the upper 8 bits are zero. */
|
|
||||||
subl %ecx, %eax
|
|
||||||
+ /* No ymm register was touched. */
|
|
||||||
ret
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
-L(1):
|
|
||||||
- movzbl (%rdi), %eax
|
|
||||||
+L(one_or_less):
|
|
||||||
+ jb L(zero)
|
|
||||||
movzbl (%rsi), %ecx
|
|
||||||
+ movzbl (%rdi), %eax
|
|
||||||
subl %ecx, %eax
|
|
||||||
- ret
|
|
||||||
-# endif
|
|
||||||
-
|
|
||||||
- .p2align 4
|
|
||||||
-L(zero):
|
|
||||||
- xorl %eax, %eax
|
|
||||||
+ /* No ymm register was touched. */
|
|
||||||
ret
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
-L(less_vec):
|
|
||||||
-# ifdef USE_AS_WMEMCMP
|
|
||||||
- /* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes. */
|
|
||||||
- cmpb $4, %dl
|
|
||||||
- je L(4)
|
|
||||||
- jb L(zero)
|
|
||||||
-# else
|
|
||||||
- cmpb $1, %dl
|
|
||||||
- je L(1)
|
|
||||||
- jb L(zero)
|
|
||||||
- cmpb $4, %dl
|
|
||||||
- jb L(between_2_3)
|
|
||||||
- cmpb $8, %dl
|
|
||||||
- jb L(between_4_7)
|
|
||||||
+L(between_8_15):
|
|
||||||
# endif
|
|
||||||
- cmpb $16, %dl
|
|
||||||
- jae L(between_16_31)
|
|
||||||
- /* It is between 8 and 15 bytes. */
|
|
||||||
+ /* If USE_AS_WMEMCMP fall through into 8-15 byte case. */
|
|
||||||
vmovq (%rdi), %xmm1
|
|
||||||
vmovq (%rsi), %xmm2
|
|
||||||
- VPCMPEQ %xmm1, %xmm2, %xmm2
|
|
||||||
+ VPCMPEQ %xmm1, %xmm2, %xmm2
|
|
||||||
vpmovmskb %xmm2, %eax
|
|
||||||
- subl $0xffff, %eax
|
|
||||||
- jnz L(first_vec)
|
|
||||||
+ subl $0xffff, %eax
|
|
||||||
+ jnz L(return_vec_0)
|
|
||||||
/* Use overlapping loads to avoid branches. */
|
|
||||||
leaq -8(%rdi, %rdx), %rdi
|
|
||||||
leaq -8(%rsi, %rdx), %rsi
|
|
||||||
vmovq (%rdi), %xmm1
|
|
||||||
vmovq (%rsi), %xmm2
|
|
||||||
- VPCMPEQ %xmm1, %xmm2, %xmm2
|
|
||||||
+ VPCMPEQ %xmm1, %xmm2, %xmm2
|
|
||||||
vpmovmskb %xmm2, %eax
|
|
||||||
- subl $0xffff, %eax
|
|
||||||
- jnz L(first_vec)
|
|
||||||
+ subl $0xffff, %eax
|
|
||||||
+ jnz L(return_vec_0)
|
|
||||||
+ /* No ymm register was touched. */
|
|
||||||
+ ret
|
|
||||||
+
|
|
||||||
+ .p2align 4
|
|
||||||
+L(zero):
|
|
||||||
+ xorl %eax, %eax
|
|
||||||
ret
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
L(between_16_31):
|
|
||||||
/* From 16 to 31 bytes. No branch when size == 16. */
|
|
||||||
vmovdqu (%rsi), %xmm2
|
|
||||||
- VPCMPEQ (%rdi), %xmm2, %xmm2
|
|
||||||
+ VPCMPEQ (%rdi), %xmm2, %xmm2
|
|
||||||
vpmovmskb %xmm2, %eax
|
|
||||||
- subl $0xffff, %eax
|
|
||||||
- jnz L(first_vec)
|
|
||||||
+ subl $0xffff, %eax
|
|
||||||
+ jnz L(return_vec_0)
|
|
||||||
|
|
||||||
/* Use overlapping loads to avoid branches. */
|
|
||||||
+
|
|
||||||
+ vmovdqu -16(%rsi, %rdx), %xmm2
|
|
||||||
leaq -16(%rdi, %rdx), %rdi
|
|
||||||
leaq -16(%rsi, %rdx), %rsi
|
|
||||||
- vmovdqu (%rsi), %xmm2
|
|
||||||
- VPCMPEQ (%rdi), %xmm2, %xmm2
|
|
||||||
+ VPCMPEQ (%rdi), %xmm2, %xmm2
|
|
||||||
vpmovmskb %xmm2, %eax
|
|
||||||
- subl $0xffff, %eax
|
|
||||||
- jnz L(first_vec)
|
|
||||||
+ subl $0xffff, %eax
|
|
||||||
+ jnz L(return_vec_0)
|
|
||||||
+ /* No ymm register was touched. */
|
|
||||||
ret
|
|
||||||
|
|
||||||
- .p2align 4
|
|
||||||
-L(more_8x_vec):
|
|
||||||
- /* More than 8 * VEC. Check the first VEC. */
|
|
||||||
- vmovdqu (%rsi), %ymm2
|
|
||||||
- VPCMPEQ (%rdi), %ymm2, %ymm2
|
|
||||||
- vpmovmskb %ymm2, %eax
|
|
||||||
- subl $VEC_MASK, %eax
|
|
||||||
- jnz L(first_vec)
|
|
||||||
-
|
|
||||||
- /* Align the first memory area for aligned loads in the loop.
|
|
||||||
- Compute how much the first memory area is misaligned. */
|
|
||||||
- movq %rdi, %rcx
|
|
||||||
- andl $(VEC_SIZE - 1), %ecx
|
|
||||||
- /* Get the negative of offset for alignment. */
|
|
||||||
- subq $VEC_SIZE, %rcx
|
|
||||||
- /* Adjust the second memory area. */
|
|
||||||
- subq %rcx, %rsi
|
|
||||||
- /* Adjust the first memory area which should be aligned now. */
|
|
||||||
- subq %rcx, %rdi
|
|
||||||
- /* Adjust length. */
|
|
||||||
- addq %rcx, %rdx
|
|
||||||
-
|
|
||||||
-L(loop_4x_vec):
|
|
||||||
- /* Compare 4 * VEC at a time forward. */
|
|
||||||
- vmovdqu (%rsi), %ymm1
|
|
||||||
- VPCMPEQ (%rdi), %ymm1, %ymm1
|
|
||||||
-
|
|
||||||
- vmovdqu VEC_SIZE(%rsi), %ymm2
|
|
||||||
- VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
|
|
||||||
- vpand %ymm2, %ymm1, %ymm5
|
|
||||||
-
|
|
||||||
- vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
|
|
||||||
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
|
|
||||||
- vpand %ymm3, %ymm5, %ymm5
|
|
||||||
-
|
|
||||||
- vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
|
|
||||||
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
|
|
||||||
- vpand %ymm4, %ymm5, %ymm5
|
|
||||||
-
|
|
||||||
- vptest %ymm0, %ymm5
|
|
||||||
- jnc L(4x_vec_end)
|
|
||||||
-
|
|
||||||
- addq $(VEC_SIZE * 4), %rdi
|
|
||||||
- addq $(VEC_SIZE * 4), %rsi
|
|
||||||
-
|
|
||||||
- subq $(VEC_SIZE * 4), %rdx
|
|
||||||
- cmpq $(VEC_SIZE * 4), %rdx
|
|
||||||
- jae L(loop_4x_vec)
|
|
||||||
-
|
|
||||||
- /* Less than 4 * VEC. */
|
|
||||||
- cmpq $VEC_SIZE, %rdx
|
|
||||||
- jbe L(last_vec)
|
|
||||||
- cmpq $(VEC_SIZE * 2), %rdx
|
|
||||||
- jbe L(last_2x_vec)
|
|
||||||
-
|
|
||||||
-L(last_4x_vec):
|
|
||||||
- /* From 2 * VEC to 4 * VEC. */
|
|
||||||
- vmovdqu (%rsi), %ymm2
|
|
||||||
- VPCMPEQ (%rdi), %ymm2, %ymm2
|
|
||||||
- vpmovmskb %ymm2, %eax
|
|
||||||
- subl $VEC_MASK, %eax
|
|
||||||
- jnz L(first_vec)
|
|
||||||
-
|
|
||||||
- addq $VEC_SIZE, %rdi
|
|
||||||
- addq $VEC_SIZE, %rsi
|
|
||||||
- vmovdqu (%rsi), %ymm2
|
|
||||||
- VPCMPEQ (%rdi), %ymm2, %ymm2
|
|
||||||
- vpmovmskb %ymm2, %eax
|
|
||||||
- subl $VEC_MASK, %eax
|
|
||||||
- jnz L(first_vec)
|
|
||||||
-
|
|
||||||
- /* Use overlapping loads to avoid branches. */
|
|
||||||
- leaq -(3 * VEC_SIZE)(%rdi, %rdx), %rdi
|
|
||||||
- leaq -(3 * VEC_SIZE)(%rsi, %rdx), %rsi
|
|
||||||
- vmovdqu (%rsi), %ymm2
|
|
||||||
- VPCMPEQ (%rdi), %ymm2, %ymm2
|
|
||||||
- vpmovmskb %ymm2, %eax
|
|
||||||
- subl $VEC_MASK, %eax
|
|
||||||
- jnz L(first_vec)
|
|
||||||
-
|
|
||||||
- addq $VEC_SIZE, %rdi
|
|
||||||
- addq $VEC_SIZE, %rsi
|
|
||||||
- vmovdqu (%rsi), %ymm2
|
|
||||||
- VPCMPEQ (%rdi), %ymm2, %ymm2
|
|
||||||
- vpmovmskb %ymm2, %eax
|
|
||||||
- subl $VEC_MASK, %eax
|
|
||||||
- jnz L(first_vec)
|
|
||||||
- VZEROUPPER_RETURN
|
|
||||||
-
|
|
||||||
- .p2align 4
|
|
||||||
-L(4x_vec_end):
|
|
||||||
- vpmovmskb %ymm1, %eax
|
|
||||||
- subl $VEC_MASK, %eax
|
|
||||||
- jnz L(first_vec)
|
|
||||||
- vpmovmskb %ymm2, %eax
|
|
||||||
- subl $VEC_MASK, %eax
|
|
||||||
- jnz L(first_vec_x1)
|
|
||||||
- vpmovmskb %ymm3, %eax
|
|
||||||
- subl $VEC_MASK, %eax
|
|
||||||
- jnz L(first_vec_x2)
|
|
||||||
- vpmovmskb %ymm4, %eax
|
|
||||||
- subl $VEC_MASK, %eax
|
|
||||||
- tzcntl %eax, %ecx
|
|
||||||
# ifdef USE_AS_WMEMCMP
|
|
||||||
- xorl %eax, %eax
|
|
||||||
- movl (VEC_SIZE * 3)(%rdi, %rcx), %edx
|
|
||||||
- cmpl (VEC_SIZE * 3)(%rsi, %rcx), %edx
|
|
||||||
- jmp L(wmemcmp_return)
|
|
||||||
-# else
|
|
||||||
- movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
|
|
||||||
- movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx
|
|
||||||
- sub %edx, %eax
|
|
||||||
-# endif
|
|
||||||
- VZEROUPPER_RETURN
|
|
||||||
-
|
|
||||||
.p2align 4
|
|
||||||
-L(first_vec_x1):
|
|
||||||
- tzcntl %eax, %ecx
|
|
||||||
-# ifdef USE_AS_WMEMCMP
|
|
||||||
- xorl %eax, %eax
|
|
||||||
- movl VEC_SIZE(%rdi, %rcx), %edx
|
|
||||||
- cmpl VEC_SIZE(%rsi, %rcx), %edx
|
|
||||||
- jmp L(wmemcmp_return)
|
|
||||||
+L(one_or_less):
|
|
||||||
+ jb L(zero)
|
|
||||||
+ movl (%rdi), %ecx
|
|
||||||
+ xorl %edx, %edx
|
|
||||||
+ cmpl (%rsi), %ecx
|
|
||||||
+ je L(zero)
|
|
||||||
+ setg %dl
|
|
||||||
+ leal -1(%rdx, %rdx), %eax
|
|
||||||
+ /* No ymm register was touched. */
|
|
||||||
+ ret
|
|
||||||
# else
|
|
||||||
- movzbl VEC_SIZE(%rdi, %rcx), %eax
|
|
||||||
- movzbl VEC_SIZE(%rsi, %rcx), %edx
|
|
||||||
- sub %edx, %eax
|
|
||||||
-# endif
|
|
||||||
- VZEROUPPER_RETURN
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
-L(first_vec_x2):
|
|
||||||
- tzcntl %eax, %ecx
|
|
||||||
-# ifdef USE_AS_WMEMCMP
|
|
||||||
- xorl %eax, %eax
|
|
||||||
- movl (VEC_SIZE * 2)(%rdi, %rcx), %edx
|
|
||||||
- cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx
|
|
||||||
- jmp L(wmemcmp_return)
|
|
||||||
-# else
|
|
||||||
- movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
|
|
||||||
- movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx
|
|
||||||
- sub %edx, %eax
|
|
||||||
+L(between_4_7):
|
|
||||||
+ /* Load as big endian with overlapping movbe to avoid branches.
|
|
||||||
+ */
|
|
||||||
+ movbe (%rdi), %eax
|
|
||||||
+ movbe (%rsi), %ecx
|
|
||||||
+ shlq $32, %rax
|
|
||||||
+ shlq $32, %rcx
|
|
||||||
+ movbe -4(%rdi, %rdx), %edi
|
|
||||||
+ movbe -4(%rsi, %rdx), %esi
|
|
||||||
+ orq %rdi, %rax
|
|
||||||
+ orq %rsi, %rcx
|
|
||||||
+ subq %rcx, %rax
|
|
||||||
+ jz L(zero_4_7)
|
|
||||||
+ sbbl %eax, %eax
|
|
||||||
+ orl $1, %eax
|
|
||||||
+L(zero_4_7):
|
|
||||||
+ /* No ymm register was touched. */
|
|
||||||
+ ret
|
|
||||||
# endif
|
|
||||||
- VZEROUPPER_RETURN
|
|
||||||
+
|
|
||||||
END (MEMCMP)
|
|
||||||
#endif
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,851 +0,0 @@
|
|||||||
From 4ad473e97acdc5f6d811755b67c09f2128a644ce Mon Sep 17 00:00:00 2001
|
|
||||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
Date: Mon, 17 May 2021 13:57:24 -0400
|
|
||||||
Subject: [PATCH] x86: Optimize memcmp-evex-movbe.S
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
No bug. This commit optimizes memcmp-evex.S. The optimizations include
|
|
||||||
adding a new vec compare path for small sizes, reorganizing the entry
|
|
||||||
control flow, removing some unnecissary ALU instructions from the main
|
|
||||||
loop, and most importantly replacing the heavy use of vpcmp + kand
|
|
||||||
logic with vpxor + vptern. test-memcmp and test-wmemcmp are both
|
|
||||||
passing.
|
|
||||||
|
|
||||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 710 +++++++++++--------
|
|
||||||
1 file changed, 408 insertions(+), 302 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
|
|
||||||
index 9c093972..654dc7ac 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
|
|
||||||
@@ -19,17 +19,22 @@
|
|
||||||
#if IS_IN (libc)
|
|
||||||
|
|
||||||
/* memcmp/wmemcmp is implemented as:
|
|
||||||
- 1. For size from 2 to 7 bytes, load as big endian with movbe and bswap
|
|
||||||
- to avoid branches.
|
|
||||||
- 2. Use overlapping compare to avoid branch.
|
|
||||||
- 3. Use vector compare when size >= 4 bytes for memcmp or size >= 8
|
|
||||||
- bytes for wmemcmp.
|
|
||||||
- 4. If size is 8 * VEC_SIZE or less, unroll the loop.
|
|
||||||
- 5. Compare 4 * VEC_SIZE at a time with the aligned first memory
|
|
||||||
+ 1. Use ymm vector compares when possible. The only case where
|
|
||||||
+ vector compares is not possible for when size < CHAR_PER_VEC
|
|
||||||
+ and loading from either s1 or s2 would cause a page cross.
|
|
||||||
+ 2. For size from 2 to 7 bytes on page cross, load as big endian
|
|
||||||
+ with movbe and bswap to avoid branches.
|
|
||||||
+ 3. Use xmm vector compare when size >= 4 bytes for memcmp or
|
|
||||||
+ size >= 8 bytes for wmemcmp.
|
|
||||||
+ 4. Optimistically compare up to first 4 * CHAR_PER_VEC one at a
|
|
||||||
+ to check for early mismatches. Only do this if its guranteed the
|
|
||||||
+ work is not wasted.
|
|
||||||
+ 5. If size is 8 * VEC_SIZE or less, unroll the loop.
|
|
||||||
+ 6. Compare 4 * VEC_SIZE at a time with the aligned first memory
|
|
||||||
area.
|
|
||||||
- 6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
|
|
||||||
- 7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
|
|
||||||
- 8. Use 8 vector compares when size is 8 * VEC_SIZE or less. */
|
|
||||||
+ 7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less.
|
|
||||||
+ 8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less.
|
|
||||||
+ 9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less. */
|
|
||||||
|
|
||||||
# include <sysdep.h>
|
|
||||||
|
|
||||||
@@ -40,11 +45,21 @@
|
|
||||||
# define VMOVU vmovdqu64
|
|
||||||
|
|
||||||
# ifdef USE_AS_WMEMCMP
|
|
||||||
-# define VPCMPEQ vpcmpeqd
|
|
||||||
+# define CHAR_SIZE 4
|
|
||||||
+# define VPCMP vpcmpd
|
|
||||||
# else
|
|
||||||
-# define VPCMPEQ vpcmpeqb
|
|
||||||
+# define CHAR_SIZE 1
|
|
||||||
+# define VPCMP vpcmpub
|
|
||||||
# endif
|
|
||||||
|
|
||||||
+# define VEC_SIZE 32
|
|
||||||
+# define PAGE_SIZE 4096
|
|
||||||
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
|
|
||||||
+
|
|
||||||
+# define XMM0 xmm16
|
|
||||||
+# define XMM1 xmm17
|
|
||||||
+# define XMM2 xmm18
|
|
||||||
+# define YMM0 ymm16
|
|
||||||
# define XMM1 xmm17
|
|
||||||
# define XMM2 xmm18
|
|
||||||
# define YMM1 ymm17
|
|
||||||
@@ -54,15 +69,6 @@
|
|
||||||
# define YMM5 ymm21
|
|
||||||
# define YMM6 ymm22
|
|
||||||
|
|
||||||
-# define VEC_SIZE 32
|
|
||||||
-# ifdef USE_AS_WMEMCMP
|
|
||||||
-# define VEC_MASK 0xff
|
|
||||||
-# define XMM_MASK 0xf
|
|
||||||
-# else
|
|
||||||
-# define VEC_MASK 0xffffffff
|
|
||||||
-# define XMM_MASK 0xffff
|
|
||||||
-# endif
|
|
||||||
-
|
|
||||||
/* Warning!
|
|
||||||
wmemcmp has to use SIGNED comparison for elements.
|
|
||||||
memcmp has to use UNSIGNED comparison for elemnts.
|
|
||||||
@@ -70,145 +76,370 @@
|
|
||||||
|
|
||||||
.section .text.evex,"ax",@progbits
|
|
||||||
ENTRY (MEMCMP)
|
|
||||||
-# ifdef USE_AS_WMEMCMP
|
|
||||||
- shl $2, %RDX_LP
|
|
||||||
-# elif defined __ILP32__
|
|
||||||
+# ifdef __ILP32__
|
|
||||||
/* Clear the upper 32 bits. */
|
|
||||||
movl %edx, %edx
|
|
||||||
# endif
|
|
||||||
- cmp $VEC_SIZE, %RDX_LP
|
|
||||||
+ cmp $CHAR_PER_VEC, %RDX_LP
|
|
||||||
jb L(less_vec)
|
|
||||||
|
|
||||||
/* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
|
|
||||||
- VMOVU (%rsi), %YMM2
|
|
||||||
- VPCMPEQ (%rdi), %YMM2, %k1
|
|
||||||
+ VMOVU (%rsi), %YMM1
|
|
||||||
+ /* Use compare not equals to directly check for mismatch. */
|
|
||||||
+ VPCMP $4, (%rdi), %YMM1, %k1
|
|
||||||
kmovd %k1, %eax
|
|
||||||
- subl $VEC_MASK, %eax
|
|
||||||
- jnz L(first_vec)
|
|
||||||
-
|
|
||||||
- cmpq $(VEC_SIZE * 2), %rdx
|
|
||||||
- jbe L(last_vec)
|
|
||||||
-
|
|
||||||
- /* More than 2 * VEC. */
|
|
||||||
- cmpq $(VEC_SIZE * 8), %rdx
|
|
||||||
- ja L(more_8x_vec)
|
|
||||||
- cmpq $(VEC_SIZE * 4), %rdx
|
|
||||||
- jb L(last_4x_vec)
|
|
||||||
+ /* NB: eax must be destination register if going to
|
|
||||||
+ L(return_vec_[0,2]). For L(return_vec_3 destination register
|
|
||||||
+ must be ecx. */
|
|
||||||
+ testl %eax, %eax
|
|
||||||
+ jnz L(return_vec_0)
|
|
||||||
|
|
||||||
- /* From 4 * VEC to 8 * VEC, inclusively. */
|
|
||||||
- VMOVU (%rsi), %YMM1
|
|
||||||
- VPCMPEQ (%rdi), %YMM1, %k1
|
|
||||||
+ cmpq $(CHAR_PER_VEC * 2), %rdx
|
|
||||||
+ jbe L(last_1x_vec)
|
|
||||||
|
|
||||||
+ /* Check second VEC no matter what. */
|
|
||||||
VMOVU VEC_SIZE(%rsi), %YMM2
|
|
||||||
- VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
|
|
||||||
+ VPCMP $4, VEC_SIZE(%rdi), %YMM2, %k1
|
|
||||||
+ kmovd %k1, %eax
|
|
||||||
+ testl %eax, %eax
|
|
||||||
+ jnz L(return_vec_1)
|
|
||||||
+
|
|
||||||
+ /* Less than 4 * VEC. */
|
|
||||||
+ cmpq $(CHAR_PER_VEC * 4), %rdx
|
|
||||||
+ jbe L(last_2x_vec)
|
|
||||||
|
|
||||||
+ /* Check third and fourth VEC no matter what. */
|
|
||||||
VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
|
|
||||||
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
|
|
||||||
+ VPCMP $4, (VEC_SIZE * 2)(%rdi), %YMM3, %k1
|
|
||||||
+ kmovd %k1, %eax
|
|
||||||
+ testl %eax, %eax
|
|
||||||
+ jnz L(return_vec_2)
|
|
||||||
|
|
||||||
VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
|
|
||||||
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
|
|
||||||
+ VPCMP $4, (VEC_SIZE * 3)(%rdi), %YMM4, %k1
|
|
||||||
+ kmovd %k1, %ecx
|
|
||||||
+ testl %ecx, %ecx
|
|
||||||
+ jnz L(return_vec_3)
|
|
||||||
|
|
||||||
- kandd %k1, %k2, %k5
|
|
||||||
- kandd %k3, %k4, %k6
|
|
||||||
- kandd %k5, %k6, %k6
|
|
||||||
+ /* Zero YMM0. 4x VEC reduction is done with vpxor + vtern so
|
|
||||||
+ compare with zero to get a mask is needed. */
|
|
||||||
+ vpxorq %XMM0, %XMM0, %XMM0
|
|
||||||
|
|
||||||
- kmovd %k6, %eax
|
|
||||||
- cmpl $VEC_MASK, %eax
|
|
||||||
- jne L(4x_vec_end)
|
|
||||||
+ /* Go to 4x VEC loop. */
|
|
||||||
+ cmpq $(CHAR_PER_VEC * 8), %rdx
|
|
||||||
+ ja L(more_8x_vec)
|
|
||||||
|
|
||||||
- leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi
|
|
||||||
- leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi
|
|
||||||
- VMOVU (%rsi), %YMM1
|
|
||||||
- VPCMPEQ (%rdi), %YMM1, %k1
|
|
||||||
+ /* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
|
|
||||||
+ branches. */
|
|
||||||
|
|
||||||
- VMOVU VEC_SIZE(%rsi), %YMM2
|
|
||||||
- VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
|
|
||||||
- kandd %k1, %k2, %k5
|
|
||||||
+ /* Load first two VEC from s2 before adjusting addresses. */
|
|
||||||
+ VMOVU -(VEC_SIZE * 4)(%rsi, %rdx, CHAR_SIZE), %YMM1
|
|
||||||
+ VMOVU -(VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %YMM2
|
|
||||||
+ leaq -(4 * VEC_SIZE)(%rdi, %rdx, CHAR_SIZE), %rdi
|
|
||||||
+ leaq -(4 * VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
|
|
||||||
+
|
|
||||||
+ /* Wait to load from s1 until addressed adjust due to
|
|
||||||
+ unlamination of microfusion with complex address mode. */
|
|
||||||
+
|
|
||||||
+ /* vpxor will be all 0s if s1 and s2 are equal. Otherwise it
|
|
||||||
+ will have some 1s. */
|
|
||||||
+ vpxorq (%rdi), %YMM1, %YMM1
|
|
||||||
+ vpxorq (VEC_SIZE)(%rdi), %YMM2, %YMM2
|
|
||||||
|
|
||||||
VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
|
|
||||||
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
|
|
||||||
- kandd %k3, %k5, %k5
|
|
||||||
+ vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
|
|
||||||
+ /* Or together YMM1, YMM2, and YMM3 into YMM3. */
|
|
||||||
+ vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
|
|
||||||
|
|
||||||
VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
|
|
||||||
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
|
|
||||||
- kandd %k4, %k5, %k5
|
|
||||||
+ /* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
|
|
||||||
+ oring with YMM3. Result is stored in YMM4. */
|
|
||||||
+ vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
|
|
||||||
+ /* Compare YMM4 with 0. If any 1s s1 and s2 don't match. */
|
|
||||||
+ VPCMP $4, %YMM4, %YMM0, %k1
|
|
||||||
+ kmovd %k1, %ecx
|
|
||||||
+ testl %ecx, %ecx
|
|
||||||
+ jnz L(return_vec_0_1_2_3)
|
|
||||||
+ /* NB: eax must be zero to reach here. */
|
|
||||||
+ ret
|
|
||||||
|
|
||||||
- kmovd %k5, %eax
|
|
||||||
- cmpl $VEC_MASK, %eax
|
|
||||||
- jne L(4x_vec_end)
|
|
||||||
- xorl %eax, %eax
|
|
||||||
+ /* NB: aligning 32 here allows for the rest of the jump targets
|
|
||||||
+ to be tuned for 32 byte alignment. Most important this ensures
|
|
||||||
+ the L(more_8x_vec) loop is 32 byte aligned. */
|
|
||||||
+ .p2align 5
|
|
||||||
+L(less_vec):
|
|
||||||
+ /* Check if one or less CHAR. This is necessary for size = 0 but
|
|
||||||
+ is also faster for size = CHAR_SIZE. */
|
|
||||||
+ cmpl $1, %edx
|
|
||||||
+ jbe L(one_or_less)
|
|
||||||
+
|
|
||||||
+ /* Check if loading one VEC from either s1 or s2 could cause a
|
|
||||||
+ page cross. This can have false positives but is by far the
|
|
||||||
+ fastest method. */
|
|
||||||
+ movl %edi, %eax
|
|
||||||
+ orl %esi, %eax
|
|
||||||
+ andl $(PAGE_SIZE - 1), %eax
|
|
||||||
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
|
|
||||||
+ jg L(page_cross_less_vec)
|
|
||||||
+
|
|
||||||
+ /* No page cross possible. */
|
|
||||||
+ VMOVU (%rsi), %YMM2
|
|
||||||
+ VPCMP $4, (%rdi), %YMM2, %k1
|
|
||||||
+ kmovd %k1, %eax
|
|
||||||
+ /* Create mask in ecx for potentially in bound matches. */
|
|
||||||
+ bzhil %edx, %eax, %eax
|
|
||||||
+ jnz L(return_vec_0)
|
|
||||||
ret
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
-L(last_2x_vec):
|
|
||||||
- /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
|
|
||||||
- VMOVU (%rsi), %YMM2
|
|
||||||
- VPCMPEQ (%rdi), %YMM2, %k2
|
|
||||||
- kmovd %k2, %eax
|
|
||||||
- subl $VEC_MASK, %eax
|
|
||||||
- jnz L(first_vec)
|
|
||||||
+L(return_vec_0):
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+# ifdef USE_AS_WMEMCMP
|
|
||||||
+ movl (%rdi, %rax, CHAR_SIZE), %ecx
|
|
||||||
+ xorl %edx, %edx
|
|
||||||
+ cmpl (%rsi, %rax, CHAR_SIZE), %ecx
|
|
||||||
+ /* NB: no partial register stall here because xorl zero idiom
|
|
||||||
+ above. */
|
|
||||||
+ setg %dl
|
|
||||||
+ leal -1(%rdx, %rdx), %eax
|
|
||||||
+# else
|
|
||||||
+ movzbl (%rsi, %rax), %ecx
|
|
||||||
+ movzbl (%rdi, %rax), %eax
|
|
||||||
+ subl %ecx, %eax
|
|
||||||
+# endif
|
|
||||||
+ ret
|
|
||||||
|
|
||||||
-L(last_vec):
|
|
||||||
- /* Use overlapping loads to avoid branches. */
|
|
||||||
- leaq -VEC_SIZE(%rdi, %rdx), %rdi
|
|
||||||
- leaq -VEC_SIZE(%rsi, %rdx), %rsi
|
|
||||||
- VMOVU (%rsi), %YMM2
|
|
||||||
- VPCMPEQ (%rdi), %YMM2, %k2
|
|
||||||
- kmovd %k2, %eax
|
|
||||||
- subl $VEC_MASK, %eax
|
|
||||||
- jnz L(first_vec)
|
|
||||||
+ /* NB: No p2align necessary. Alignment % 16 is naturally 1
|
|
||||||
+ which is good enough for a target not in a loop. */
|
|
||||||
+L(return_vec_1):
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+# ifdef USE_AS_WMEMCMP
|
|
||||||
+ movl VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
|
|
||||||
+ xorl %edx, %edx
|
|
||||||
+ cmpl VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
|
|
||||||
+ setg %dl
|
|
||||||
+ leal -1(%rdx, %rdx), %eax
|
|
||||||
+# else
|
|
||||||
+ movzbl VEC_SIZE(%rsi, %rax), %ecx
|
|
||||||
+ movzbl VEC_SIZE(%rdi, %rax), %eax
|
|
||||||
+ subl %ecx, %eax
|
|
||||||
+# endif
|
|
||||||
ret
|
|
||||||
|
|
||||||
- .p2align 4
|
|
||||||
-L(first_vec):
|
|
||||||
- /* A byte or int32 is different within 16 or 32 bytes. */
|
|
||||||
- tzcntl %eax, %ecx
|
|
||||||
+ /* NB: No p2align necessary. Alignment % 16 is naturally 2
|
|
||||||
+ which is good enough for a target not in a loop. */
|
|
||||||
+L(return_vec_2):
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
# ifdef USE_AS_WMEMCMP
|
|
||||||
- xorl %eax, %eax
|
|
||||||
- movl (%rdi, %rcx, 4), %edx
|
|
||||||
- cmpl (%rsi, %rcx, 4), %edx
|
|
||||||
-L(wmemcmp_return):
|
|
||||||
- setl %al
|
|
||||||
- negl %eax
|
|
||||||
- orl $1, %eax
|
|
||||||
+ movl (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
|
|
||||||
+ xorl %edx, %edx
|
|
||||||
+ cmpl (VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
|
|
||||||
+ setg %dl
|
|
||||||
+ leal -1(%rdx, %rdx), %eax
|
|
||||||
# else
|
|
||||||
- movzbl (%rdi, %rcx), %eax
|
|
||||||
- movzbl (%rsi, %rcx), %edx
|
|
||||||
- sub %edx, %eax
|
|
||||||
+ movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx
|
|
||||||
+ movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax
|
|
||||||
+ subl %ecx, %eax
|
|
||||||
# endif
|
|
||||||
ret
|
|
||||||
|
|
||||||
+ .p2align 4
|
|
||||||
+L(8x_return_vec_0_1_2_3):
|
|
||||||
+ /* Returning from L(more_8x_vec) requires restoring rsi. */
|
|
||||||
+ addq %rdi, %rsi
|
|
||||||
+L(return_vec_0_1_2_3):
|
|
||||||
+ VPCMP $4, %YMM1, %YMM0, %k0
|
|
||||||
+ kmovd %k0, %eax
|
|
||||||
+ testl %eax, %eax
|
|
||||||
+ jnz L(return_vec_0)
|
|
||||||
+
|
|
||||||
+ VPCMP $4, %YMM2, %YMM0, %k0
|
|
||||||
+ kmovd %k0, %eax
|
|
||||||
+ testl %eax, %eax
|
|
||||||
+ jnz L(return_vec_1)
|
|
||||||
+
|
|
||||||
+ VPCMP $4, %YMM3, %YMM0, %k0
|
|
||||||
+ kmovd %k0, %eax
|
|
||||||
+ testl %eax, %eax
|
|
||||||
+ jnz L(return_vec_2)
|
|
||||||
+L(return_vec_3):
|
|
||||||
+ tzcntl %ecx, %ecx
|
|
||||||
# ifdef USE_AS_WMEMCMP
|
|
||||||
+ movl (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
|
|
||||||
+ xorl %edx, %edx
|
|
||||||
+ cmpl (VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
|
|
||||||
+ setg %dl
|
|
||||||
+ leal -1(%rdx, %rdx), %eax
|
|
||||||
+# else
|
|
||||||
+ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
|
|
||||||
+ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
|
|
||||||
+ subl %ecx, %eax
|
|
||||||
+# endif
|
|
||||||
+ ret
|
|
||||||
+
|
|
||||||
.p2align 4
|
|
||||||
-L(4):
|
|
||||||
- xorl %eax, %eax
|
|
||||||
- movl (%rdi), %edx
|
|
||||||
- cmpl (%rsi), %edx
|
|
||||||
- jne L(wmemcmp_return)
|
|
||||||
+L(more_8x_vec):
|
|
||||||
+ /* Set end of s1 in rdx. */
|
|
||||||
+ leaq -(VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rdx
|
|
||||||
+ /* rsi stores s2 - s1. This allows loop to only update one
|
|
||||||
+ pointer. */
|
|
||||||
+ subq %rdi, %rsi
|
|
||||||
+ /* Align s1 pointer. */
|
|
||||||
+ andq $-VEC_SIZE, %rdi
|
|
||||||
+ /* Adjust because first 4x vec where check already. */
|
|
||||||
+ subq $-(VEC_SIZE * 4), %rdi
|
|
||||||
+ .p2align 4
|
|
||||||
+L(loop_4x_vec):
|
|
||||||
+ VMOVU (%rsi, %rdi), %YMM1
|
|
||||||
+ vpxorq (%rdi), %YMM1, %YMM1
|
|
||||||
+
|
|
||||||
+ VMOVU VEC_SIZE(%rsi, %rdi), %YMM2
|
|
||||||
+ vpxorq VEC_SIZE(%rdi), %YMM2, %YMM2
|
|
||||||
+
|
|
||||||
+ VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %YMM3
|
|
||||||
+ vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
|
|
||||||
+ vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
|
|
||||||
+
|
|
||||||
+ VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %YMM4
|
|
||||||
+ vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
|
|
||||||
+ VPCMP $4, %YMM4, %YMM0, %k1
|
|
||||||
+ kmovd %k1, %ecx
|
|
||||||
+ testl %ecx, %ecx
|
|
||||||
+ jnz L(8x_return_vec_0_1_2_3)
|
|
||||||
+ subq $-(VEC_SIZE * 4), %rdi
|
|
||||||
+ cmpq %rdx, %rdi
|
|
||||||
+ jb L(loop_4x_vec)
|
|
||||||
+
|
|
||||||
+ subq %rdx, %rdi
|
|
||||||
+ /* rdi has 4 * VEC_SIZE - remaining length. */
|
|
||||||
+ cmpl $(VEC_SIZE * 3), %edi
|
|
||||||
+ jae L(8x_last_1x_vec)
|
|
||||||
+ /* Load regardless of branch. */
|
|
||||||
+ VMOVU (VEC_SIZE * 2)(%rsi, %rdx), %YMM3
|
|
||||||
+ cmpl $(VEC_SIZE * 2), %edi
|
|
||||||
+ jae L(8x_last_2x_vec)
|
|
||||||
+
|
|
||||||
+ VMOVU (%rsi, %rdx), %YMM1
|
|
||||||
+ vpxorq (%rdx), %YMM1, %YMM1
|
|
||||||
+
|
|
||||||
+ VMOVU VEC_SIZE(%rsi, %rdx), %YMM2
|
|
||||||
+ vpxorq VEC_SIZE(%rdx), %YMM2, %YMM2
|
|
||||||
+
|
|
||||||
+ vpxorq (VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
|
|
||||||
+ vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
|
|
||||||
+
|
|
||||||
+ VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM4
|
|
||||||
+ vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM3, %YMM4
|
|
||||||
+ VPCMP $4, %YMM4, %YMM0, %k1
|
|
||||||
+ kmovd %k1, %ecx
|
|
||||||
+ /* Restore s1 pointer to rdi. */
|
|
||||||
+ movq %rdx, %rdi
|
|
||||||
+ testl %ecx, %ecx
|
|
||||||
+ jnz L(8x_return_vec_0_1_2_3)
|
|
||||||
+ /* NB: eax must be zero to reach here. */
|
|
||||||
+ ret
|
|
||||||
+
|
|
||||||
+ /* Only entry is from L(more_8x_vec). */
|
|
||||||
+ .p2align 4
|
|
||||||
+L(8x_last_2x_vec):
|
|
||||||
+ VPCMP $4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1
|
|
||||||
+ kmovd %k1, %eax
|
|
||||||
+ testl %eax, %eax
|
|
||||||
+ jnz L(8x_return_vec_2)
|
|
||||||
+ /* Naturally aligned to 16 bytes. */
|
|
||||||
+L(8x_last_1x_vec):
|
|
||||||
+ VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM1
|
|
||||||
+ VPCMP $4, (VEC_SIZE * 3)(%rdx), %YMM1, %k1
|
|
||||||
+ kmovd %k1, %eax
|
|
||||||
+ testl %eax, %eax
|
|
||||||
+ jnz L(8x_return_vec_3)
|
|
||||||
+ ret
|
|
||||||
+
|
|
||||||
+ .p2align 4
|
|
||||||
+L(last_2x_vec):
|
|
||||||
+ /* Check second to last VEC. */
|
|
||||||
+ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1
|
|
||||||
+ VPCMP $4, -(VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
|
|
||||||
+ kmovd %k1, %eax
|
|
||||||
+ testl %eax, %eax
|
|
||||||
+ jnz L(return_vec_1_end)
|
|
||||||
+
|
|
||||||
+ /* Check last VEC. */
|
|
||||||
+ .p2align 4
|
|
||||||
+L(last_1x_vec):
|
|
||||||
+ VMOVU -(VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE), %YMM1
|
|
||||||
+ VPCMP $4, -(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
|
|
||||||
+ kmovd %k1, %eax
|
|
||||||
+ testl %eax, %eax
|
|
||||||
+ jnz L(return_vec_0_end)
|
|
||||||
ret
|
|
||||||
+
|
|
||||||
+ .p2align 4
|
|
||||||
+L(8x_return_vec_2):
|
|
||||||
+ subq $VEC_SIZE, %rdx
|
|
||||||
+L(8x_return_vec_3):
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+# ifdef USE_AS_WMEMCMP
|
|
||||||
+ leaq (%rdx, %rax, CHAR_SIZE), %rax
|
|
||||||
+ movl (VEC_SIZE * 3)(%rax), %ecx
|
|
||||||
+ xorl %edx, %edx
|
|
||||||
+ cmpl (VEC_SIZE * 3)(%rsi, %rax), %ecx
|
|
||||||
+ setg %dl
|
|
||||||
+ leal -1(%rdx, %rdx), %eax
|
|
||||||
# else
|
|
||||||
+ addq %rdx, %rax
|
|
||||||
+ movzbl (VEC_SIZE * 3)(%rsi, %rax), %ecx
|
|
||||||
+ movzbl (VEC_SIZE * 3)(%rax), %eax
|
|
||||||
+ subl %ecx, %eax
|
|
||||||
+# endif
|
|
||||||
+ ret
|
|
||||||
+
|
|
||||||
.p2align 4
|
|
||||||
-L(between_4_7):
|
|
||||||
- /* Load as big endian with overlapping movbe to avoid branches. */
|
|
||||||
- movbe (%rdi), %eax
|
|
||||||
- movbe (%rsi), %ecx
|
|
||||||
- shlq $32, %rax
|
|
||||||
- shlq $32, %rcx
|
|
||||||
- movbe -4(%rdi, %rdx), %edi
|
|
||||||
- movbe -4(%rsi, %rdx), %esi
|
|
||||||
- orq %rdi, %rax
|
|
||||||
- orq %rsi, %rcx
|
|
||||||
- subq %rcx, %rax
|
|
||||||
- je L(exit)
|
|
||||||
- sbbl %eax, %eax
|
|
||||||
- orl $1, %eax
|
|
||||||
+L(return_vec_0_end):
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+ addl %edx, %eax
|
|
||||||
+# ifdef USE_AS_WMEMCMP
|
|
||||||
+ movl -VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
|
|
||||||
+ xorl %edx, %edx
|
|
||||||
+ cmpl -VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
|
|
||||||
+ setg %dl
|
|
||||||
+ leal -1(%rdx, %rdx), %eax
|
|
||||||
+# else
|
|
||||||
+ movzbl -VEC_SIZE(%rsi, %rax), %ecx
|
|
||||||
+ movzbl -VEC_SIZE(%rdi, %rax), %eax
|
|
||||||
+ subl %ecx, %eax
|
|
||||||
+# endif
|
|
||||||
ret
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
-L(exit):
|
|
||||||
+L(return_vec_1_end):
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+ addl %edx, %eax
|
|
||||||
+# ifdef USE_AS_WMEMCMP
|
|
||||||
+ movl -(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
|
|
||||||
+ xorl %edx, %edx
|
|
||||||
+ cmpl -(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
|
|
||||||
+ setg %dl
|
|
||||||
+ leal -1(%rdx, %rdx), %eax
|
|
||||||
+# else
|
|
||||||
+ movzbl -(VEC_SIZE * 2)(%rsi, %rax), %ecx
|
|
||||||
+ movzbl -(VEC_SIZE * 2)(%rdi, %rax), %eax
|
|
||||||
+ subl %ecx, %eax
|
|
||||||
+# endif
|
|
||||||
ret
|
|
||||||
|
|
||||||
+
|
|
||||||
.p2align 4
|
|
||||||
+L(page_cross_less_vec):
|
|
||||||
+ /* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
|
|
||||||
+ bytes. */
|
|
||||||
+ cmpl $(16 / CHAR_SIZE), %edx
|
|
||||||
+ jae L(between_16_31)
|
|
||||||
+# ifndef USE_AS_WMEMCMP
|
|
||||||
+ cmpl $8, %edx
|
|
||||||
+ jae L(between_8_15)
|
|
||||||
+ cmpl $4, %edx
|
|
||||||
+ jae L(between_4_7)
|
|
||||||
L(between_2_3):
|
|
||||||
/* Load as big endian to avoid branches. */
|
|
||||||
movzwl (%rdi), %eax
|
|
||||||
@@ -217,224 +448,99 @@ L(between_2_3):
|
|
||||||
shll $8, %ecx
|
|
||||||
bswap %eax
|
|
||||||
bswap %ecx
|
|
||||||
- movb -1(%rdi, %rdx), %al
|
|
||||||
- movb -1(%rsi, %rdx), %cl
|
|
||||||
+ movzbl -1(%rdi, %rdx), %edi
|
|
||||||
+ movzbl -1(%rsi, %rdx), %esi
|
|
||||||
+ orl %edi, %eax
|
|
||||||
+ orl %esi, %ecx
|
|
||||||
/* Subtraction is okay because the upper 8 bits are zero. */
|
|
||||||
subl %ecx, %eax
|
|
||||||
ret
|
|
||||||
-
|
|
||||||
.p2align 4
|
|
||||||
-L(1):
|
|
||||||
- movzbl (%rdi), %eax
|
|
||||||
+L(one_or_less):
|
|
||||||
+ jb L(zero)
|
|
||||||
movzbl (%rsi), %ecx
|
|
||||||
+ movzbl (%rdi), %eax
|
|
||||||
subl %ecx, %eax
|
|
||||||
ret
|
|
||||||
-# endif
|
|
||||||
-
|
|
||||||
- .p2align 4
|
|
||||||
-L(zero):
|
|
||||||
- xorl %eax, %eax
|
|
||||||
- ret
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
-L(less_vec):
|
|
||||||
-# ifdef USE_AS_WMEMCMP
|
|
||||||
- /* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes. */
|
|
||||||
- cmpb $4, %dl
|
|
||||||
- je L(4)
|
|
||||||
- jb L(zero)
|
|
||||||
-# else
|
|
||||||
- cmpb $1, %dl
|
|
||||||
- je L(1)
|
|
||||||
- jb L(zero)
|
|
||||||
- cmpb $4, %dl
|
|
||||||
- jb L(between_2_3)
|
|
||||||
- cmpb $8, %dl
|
|
||||||
- jb L(between_4_7)
|
|
||||||
+L(between_8_15):
|
|
||||||
# endif
|
|
||||||
- cmpb $16, %dl
|
|
||||||
- jae L(between_16_31)
|
|
||||||
- /* It is between 8 and 15 bytes. */
|
|
||||||
+ /* If USE_AS_WMEMCMP fall through into 8-15 byte case. */
|
|
||||||
vmovq (%rdi), %XMM1
|
|
||||||
vmovq (%rsi), %XMM2
|
|
||||||
- VPCMPEQ %XMM1, %XMM2, %k2
|
|
||||||
- kmovw %k2, %eax
|
|
||||||
- subl $XMM_MASK, %eax
|
|
||||||
- jnz L(first_vec)
|
|
||||||
+ VPCMP $4, %XMM1, %XMM2, %k1
|
|
||||||
+ kmovd %k1, %eax
|
|
||||||
+ testl %eax, %eax
|
|
||||||
+ jnz L(return_vec_0)
|
|
||||||
/* Use overlapping loads to avoid branches. */
|
|
||||||
- leaq -8(%rdi, %rdx), %rdi
|
|
||||||
- leaq -8(%rsi, %rdx), %rsi
|
|
||||||
+ leaq -8(%rdi, %rdx, CHAR_SIZE), %rdi
|
|
||||||
+ leaq -8(%rsi, %rdx, CHAR_SIZE), %rsi
|
|
||||||
vmovq (%rdi), %XMM1
|
|
||||||
vmovq (%rsi), %XMM2
|
|
||||||
- VPCMPEQ %XMM1, %XMM2, %k2
|
|
||||||
- kmovw %k2, %eax
|
|
||||||
- subl $XMM_MASK, %eax
|
|
||||||
- jnz L(first_vec)
|
|
||||||
+ VPCMP $4, %XMM1, %XMM2, %k1
|
|
||||||
+ kmovd %k1, %eax
|
|
||||||
+ testl %eax, %eax
|
|
||||||
+ jnz L(return_vec_0)
|
|
||||||
ret
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
-L(between_16_31):
|
|
||||||
- /* From 16 to 31 bytes. No branch when size == 16. */
|
|
||||||
- VMOVU (%rsi), %XMM2
|
|
||||||
- VPCMPEQ (%rdi), %XMM2, %k2
|
|
||||||
- kmovw %k2, %eax
|
|
||||||
- subl $XMM_MASK, %eax
|
|
||||||
- jnz L(first_vec)
|
|
||||||
-
|
|
||||||
- /* Use overlapping loads to avoid branches. */
|
|
||||||
- leaq -16(%rdi, %rdx), %rdi
|
|
||||||
- leaq -16(%rsi, %rdx), %rsi
|
|
||||||
- VMOVU (%rsi), %XMM2
|
|
||||||
- VPCMPEQ (%rdi), %XMM2, %k2
|
|
||||||
- kmovw %k2, %eax
|
|
||||||
- subl $XMM_MASK, %eax
|
|
||||||
- jnz L(first_vec)
|
|
||||||
+L(zero):
|
|
||||||
+ xorl %eax, %eax
|
|
||||||
ret
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
-L(more_8x_vec):
|
|
||||||
- /* More than 8 * VEC. Check the first VEC. */
|
|
||||||
- VMOVU (%rsi), %YMM2
|
|
||||||
- VPCMPEQ (%rdi), %YMM2, %k2
|
|
||||||
- kmovd %k2, %eax
|
|
||||||
- subl $VEC_MASK, %eax
|
|
||||||
- jnz L(first_vec)
|
|
||||||
-
|
|
||||||
- /* Align the first memory area for aligned loads in the loop.
|
|
||||||
- Compute how much the first memory area is misaligned. */
|
|
||||||
- movq %rdi, %rcx
|
|
||||||
- andl $(VEC_SIZE - 1), %ecx
|
|
||||||
- /* Get the negative of offset for alignment. */
|
|
||||||
- subq $VEC_SIZE, %rcx
|
|
||||||
- /* Adjust the second memory area. */
|
|
||||||
- subq %rcx, %rsi
|
|
||||||
- /* Adjust the first memory area which should be aligned now. */
|
|
||||||
- subq %rcx, %rdi
|
|
||||||
- /* Adjust length. */
|
|
||||||
- addq %rcx, %rdx
|
|
||||||
-
|
|
||||||
-L(loop_4x_vec):
|
|
||||||
- /* Compare 4 * VEC at a time forward. */
|
|
||||||
- VMOVU (%rsi), %YMM1
|
|
||||||
- VPCMPEQ (%rdi), %YMM1, %k1
|
|
||||||
-
|
|
||||||
- VMOVU VEC_SIZE(%rsi), %YMM2
|
|
||||||
- VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
|
|
||||||
- kandd %k2, %k1, %k5
|
|
||||||
-
|
|
||||||
- VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
|
|
||||||
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
|
|
||||||
- kandd %k3, %k5, %k5
|
|
||||||
-
|
|
||||||
- VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
|
|
||||||
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
|
|
||||||
- kandd %k4, %k5, %k5
|
|
||||||
-
|
|
||||||
- kmovd %k5, %eax
|
|
||||||
- cmpl $VEC_MASK, %eax
|
|
||||||
- jne L(4x_vec_end)
|
|
||||||
-
|
|
||||||
- addq $(VEC_SIZE * 4), %rdi
|
|
||||||
- addq $(VEC_SIZE * 4), %rsi
|
|
||||||
-
|
|
||||||
- subq $(VEC_SIZE * 4), %rdx
|
|
||||||
- cmpq $(VEC_SIZE * 4), %rdx
|
|
||||||
- jae L(loop_4x_vec)
|
|
||||||
-
|
|
||||||
- /* Less than 4 * VEC. */
|
|
||||||
- cmpq $VEC_SIZE, %rdx
|
|
||||||
- jbe L(last_vec)
|
|
||||||
- cmpq $(VEC_SIZE * 2), %rdx
|
|
||||||
- jbe L(last_2x_vec)
|
|
||||||
-
|
|
||||||
-L(last_4x_vec):
|
|
||||||
- /* From 2 * VEC to 4 * VEC. */
|
|
||||||
- VMOVU (%rsi), %YMM2
|
|
||||||
- VPCMPEQ (%rdi), %YMM2, %k2
|
|
||||||
- kmovd %k2, %eax
|
|
||||||
- subl $VEC_MASK, %eax
|
|
||||||
- jnz L(first_vec)
|
|
||||||
-
|
|
||||||
- addq $VEC_SIZE, %rdi
|
|
||||||
- addq $VEC_SIZE, %rsi
|
|
||||||
- VMOVU (%rsi), %YMM2
|
|
||||||
- VPCMPEQ (%rdi), %YMM2, %k2
|
|
||||||
- kmovd %k2, %eax
|
|
||||||
- subl $VEC_MASK, %eax
|
|
||||||
- jnz L(first_vec)
|
|
||||||
+L(between_16_31):
|
|
||||||
+ /* From 16 to 31 bytes. No branch when size == 16. */
|
|
||||||
+ VMOVU (%rsi), %XMM2
|
|
||||||
+ VPCMP $4, (%rdi), %XMM2, %k1
|
|
||||||
+ kmovd %k1, %eax
|
|
||||||
+ testl %eax, %eax
|
|
||||||
+ jnz L(return_vec_0)
|
|
||||||
|
|
||||||
/* Use overlapping loads to avoid branches. */
|
|
||||||
- leaq -(3 * VEC_SIZE)(%rdi, %rdx), %rdi
|
|
||||||
- leaq -(3 * VEC_SIZE)(%rsi, %rdx), %rsi
|
|
||||||
- VMOVU (%rsi), %YMM2
|
|
||||||
- VPCMPEQ (%rdi), %YMM2, %k2
|
|
||||||
- kmovd %k2, %eax
|
|
||||||
- subl $VEC_MASK, %eax
|
|
||||||
- jnz L(first_vec)
|
|
||||||
|
|
||||||
- addq $VEC_SIZE, %rdi
|
|
||||||
- addq $VEC_SIZE, %rsi
|
|
||||||
- VMOVU (%rsi), %YMM2
|
|
||||||
- VPCMPEQ (%rdi), %YMM2, %k2
|
|
||||||
- kmovd %k2, %eax
|
|
||||||
- subl $VEC_MASK, %eax
|
|
||||||
- jnz L(first_vec)
|
|
||||||
- ret
|
|
||||||
-
|
|
||||||
- .p2align 4
|
|
||||||
-L(4x_vec_end):
|
|
||||||
+ VMOVU -16(%rsi, %rdx, CHAR_SIZE), %XMM2
|
|
||||||
+ leaq -16(%rdi, %rdx, CHAR_SIZE), %rdi
|
|
||||||
+ leaq -16(%rsi, %rdx, CHAR_SIZE), %rsi
|
|
||||||
+ VPCMP $4, (%rdi), %XMM2, %k1
|
|
||||||
kmovd %k1, %eax
|
|
||||||
- subl $VEC_MASK, %eax
|
|
||||||
- jnz L(first_vec)
|
|
||||||
- kmovd %k2, %eax
|
|
||||||
- subl $VEC_MASK, %eax
|
|
||||||
- jnz L(first_vec_x1)
|
|
||||||
- kmovd %k3, %eax
|
|
||||||
- subl $VEC_MASK, %eax
|
|
||||||
- jnz L(first_vec_x2)
|
|
||||||
- kmovd %k4, %eax
|
|
||||||
- subl $VEC_MASK, %eax
|
|
||||||
- tzcntl %eax, %ecx
|
|
||||||
-# ifdef USE_AS_WMEMCMP
|
|
||||||
- xorl %eax, %eax
|
|
||||||
- movl (VEC_SIZE * 3)(%rdi, %rcx, 4), %edx
|
|
||||||
- cmpl (VEC_SIZE * 3)(%rsi, %rcx, 4), %edx
|
|
||||||
- jmp L(wmemcmp_return)
|
|
||||||
-# else
|
|
||||||
- movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
|
|
||||||
- movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx
|
|
||||||
- sub %edx, %eax
|
|
||||||
-# endif
|
|
||||||
+ testl %eax, %eax
|
|
||||||
+ jnz L(return_vec_0)
|
|
||||||
ret
|
|
||||||
|
|
||||||
- .p2align 4
|
|
||||||
-L(first_vec_x1):
|
|
||||||
- tzcntl %eax, %ecx
|
|
||||||
# ifdef USE_AS_WMEMCMP
|
|
||||||
- xorl %eax, %eax
|
|
||||||
- movl VEC_SIZE(%rdi, %rcx, 4), %edx
|
|
||||||
- cmpl VEC_SIZE(%rsi, %rcx, 4), %edx
|
|
||||||
- jmp L(wmemcmp_return)
|
|
||||||
-# else
|
|
||||||
- movzbl VEC_SIZE(%rdi, %rcx), %eax
|
|
||||||
- movzbl VEC_SIZE(%rsi, %rcx), %edx
|
|
||||||
- sub %edx, %eax
|
|
||||||
-# endif
|
|
||||||
+ .p2align 4
|
|
||||||
+L(one_or_less):
|
|
||||||
+ jb L(zero)
|
|
||||||
+ movl (%rdi), %ecx
|
|
||||||
+ xorl %edx, %edx
|
|
||||||
+ cmpl (%rsi), %ecx
|
|
||||||
+ je L(zero)
|
|
||||||
+ setg %dl
|
|
||||||
+ leal -1(%rdx, %rdx), %eax
|
|
||||||
ret
|
|
||||||
+# else
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
-L(first_vec_x2):
|
|
||||||
- tzcntl %eax, %ecx
|
|
||||||
-# ifdef USE_AS_WMEMCMP
|
|
||||||
- xorl %eax, %eax
|
|
||||||
- movl (VEC_SIZE * 2)(%rdi, %rcx, 4), %edx
|
|
||||||
- cmpl (VEC_SIZE * 2)(%rsi, %rcx, 4), %edx
|
|
||||||
- jmp L(wmemcmp_return)
|
|
||||||
-# else
|
|
||||||
- movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
|
|
||||||
- movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx
|
|
||||||
- sub %edx, %eax
|
|
||||||
-# endif
|
|
||||||
+L(between_4_7):
|
|
||||||
+ /* Load as big endian with overlapping movbe to avoid branches.
|
|
||||||
+ */
|
|
||||||
+ movbe (%rdi), %eax
|
|
||||||
+ movbe (%rsi), %ecx
|
|
||||||
+ shlq $32, %rax
|
|
||||||
+ shlq $32, %rcx
|
|
||||||
+ movbe -4(%rdi, %rdx), %edi
|
|
||||||
+ movbe -4(%rsi, %rdx), %esi
|
|
||||||
+ orq %rdi, %rax
|
|
||||||
+ orq %rsi, %rcx
|
|
||||||
+ subq %rcx, %rax
|
|
||||||
+ jz L(zero_4_7)
|
|
||||||
+ sbbl %eax, %eax
|
|
||||||
+ orl $1, %eax
|
|
||||||
+L(zero_4_7):
|
|
||||||
ret
|
|
||||||
+# endif
|
|
||||||
+
|
|
||||||
END (MEMCMP)
|
|
||||||
#endif
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,104 +0,0 @@
|
|||||||
From 6abf27980a947f9b6e514d6b33b83059d39566ae Mon Sep 17 00:00:00 2001
|
|
||||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
Date: Thu, 20 May 2021 13:13:51 -0400
|
|
||||||
Subject: [PATCH] x86: Improve memset-vec-unaligned-erms.S
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
No bug. This commit makes a few small improvements to
|
|
||||||
memset-vec-unaligned-erms.S. The changes are 1) only aligning to 64
|
|
||||||
instead of 128. Either alignment will perform equally well in a loop
|
|
||||||
and 128 just increases the odds of having to do an extra iteration
|
|
||||||
which can be significant overhead for small values. 2) Align some
|
|
||||||
targets and the loop. 3) Remove an ALU from the alignment process. 4)
|
|
||||||
Reorder the last 4x VEC so that they are stored after the loop. 5)
|
|
||||||
Move the condition for leq 8x VEC to before the alignment
|
|
||||||
process. test-memset and test-wmemset are both passing.
|
|
||||||
|
|
||||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
|
||||||
---
|
|
||||||
.../multiarch/memset-vec-unaligned-erms.S | 50 +++++++++++--------
|
|
||||||
1 file changed, 28 insertions(+), 22 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
|
||||||
index f877ac9d..909c33f6 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
|
||||||
@@ -173,17 +173,22 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
|
|
||||||
VMOVU %VEC(0), (%rdi)
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
|
|
||||||
+ .p2align 4
|
|
||||||
L(stosb_more_2x_vec):
|
|
||||||
cmp __x86_rep_stosb_threshold(%rip), %RDX_LP
|
|
||||||
ja L(stosb)
|
|
||||||
+#else
|
|
||||||
+ .p2align 4
|
|
||||||
#endif
|
|
||||||
L(more_2x_vec):
|
|
||||||
- cmpq $(VEC_SIZE * 4), %rdx
|
|
||||||
- ja L(loop_start)
|
|
||||||
+ /* Stores to first 2x VEC before cmp as any path forward will
|
|
||||||
+ require it. */
|
|
||||||
VMOVU %VEC(0), (%rdi)
|
|
||||||
VMOVU %VEC(0), VEC_SIZE(%rdi)
|
|
||||||
- VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
|
|
||||||
+ cmpq $(VEC_SIZE * 4), %rdx
|
|
||||||
+ ja L(loop_start)
|
|
||||||
VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
|
|
||||||
+ VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
|
|
||||||
L(return):
|
|
||||||
#if VEC_SIZE > 16
|
|
||||||
ZERO_UPPER_VEC_REGISTERS_RETURN
|
|
||||||
@@ -192,28 +197,29 @@ L(return):
|
|
||||||
#endif
|
|
||||||
|
|
||||||
L(loop_start):
|
|
||||||
- leaq (VEC_SIZE * 4)(%rdi), %rcx
|
|
||||||
- VMOVU %VEC(0), (%rdi)
|
|
||||||
- andq $-(VEC_SIZE * 4), %rcx
|
|
||||||
- VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
|
|
||||||
- VMOVU %VEC(0), VEC_SIZE(%rdi)
|
|
||||||
- VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
|
|
||||||
VMOVU %VEC(0), (VEC_SIZE * 2)(%rdi)
|
|
||||||
- VMOVU %VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx)
|
|
||||||
VMOVU %VEC(0), (VEC_SIZE * 3)(%rdi)
|
|
||||||
- VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx)
|
|
||||||
- addq %rdi, %rdx
|
|
||||||
- andq $-(VEC_SIZE * 4), %rdx
|
|
||||||
- cmpq %rdx, %rcx
|
|
||||||
- je L(return)
|
|
||||||
+ cmpq $(VEC_SIZE * 8), %rdx
|
|
||||||
+ jbe L(loop_end)
|
|
||||||
+ andq $-(VEC_SIZE * 2), %rdi
|
|
||||||
+ subq $-(VEC_SIZE * 4), %rdi
|
|
||||||
+ leaq -(VEC_SIZE * 4)(%rax, %rdx), %rcx
|
|
||||||
+ .p2align 4
|
|
||||||
L(loop):
|
|
||||||
- VMOVA %VEC(0), (%rcx)
|
|
||||||
- VMOVA %VEC(0), VEC_SIZE(%rcx)
|
|
||||||
- VMOVA %VEC(0), (VEC_SIZE * 2)(%rcx)
|
|
||||||
- VMOVA %VEC(0), (VEC_SIZE * 3)(%rcx)
|
|
||||||
- addq $(VEC_SIZE * 4), %rcx
|
|
||||||
- cmpq %rcx, %rdx
|
|
||||||
- jne L(loop)
|
|
||||||
+ VMOVA %VEC(0), (%rdi)
|
|
||||||
+ VMOVA %VEC(0), VEC_SIZE(%rdi)
|
|
||||||
+ VMOVA %VEC(0), (VEC_SIZE * 2)(%rdi)
|
|
||||||
+ VMOVA %VEC(0), (VEC_SIZE * 3)(%rdi)
|
|
||||||
+ subq $-(VEC_SIZE * 4), %rdi
|
|
||||||
+ cmpq %rcx, %rdi
|
|
||||||
+ jb L(loop)
|
|
||||||
+L(loop_end):
|
|
||||||
+ /* NB: rax is set as ptr in MEMSET_VDUP_TO_VEC0_AND_SET_RETURN.
|
|
||||||
+ rdx as length is also unchanged. */
|
|
||||||
+ VMOVU %VEC(0), -(VEC_SIZE * 4)(%rax, %rdx)
|
|
||||||
+ VMOVU %VEC(0), -(VEC_SIZE * 3)(%rax, %rdx)
|
|
||||||
+ VMOVU %VEC(0), -(VEC_SIZE * 2)(%rax, %rdx)
|
|
||||||
+ VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx)
|
|
||||||
VZEROUPPER_SHORT_RETURN
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,84 +0,0 @@
|
|||||||
From 1b992204f68af851e905c16016756fd4421e1934 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
Date: Sun, 23 May 2021 19:43:24 -0400
|
|
||||||
Subject: [PATCH] x86: Improve memmove-vec-unaligned-erms.S
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
This patch changes the condition for copy 4x VEC so that if length is
|
|
||||||
exactly equal to 4 * VEC_SIZE it will use the 4x VEC case instead of
|
|
||||||
8x VEC case.
|
|
||||||
|
|
||||||
Results For Skylake memcpy-avx2-erms
|
|
||||||
size, al1 , al2 , Cur T , New T , Win , New / Cur
|
|
||||||
128 , 0 , 0 , 9.137 , 6.873 , New , 75.22
|
|
||||||
128 , 7 , 0 , 12.933 , 7.732 , New , 59.79
|
|
||||||
128 , 0 , 7 , 11.852 , 6.76 , New , 57.04
|
|
||||||
128 , 7 , 7 , 12.587 , 6.808 , New , 54.09
|
|
||||||
|
|
||||||
Results For Icelake memcpy-evex-erms
|
|
||||||
size, al1 , al2 , Cur T , New T , Win , New / Cur
|
|
||||||
128 , 0 , 0 , 9.963 , 5.416 , New , 54.36
|
|
||||||
128 , 7 , 0 , 16.467 , 8.061 , New , 48.95
|
|
||||||
128 , 0 , 7 , 14.388 , 7.644 , New , 53.13
|
|
||||||
128 , 7 , 7 , 14.546 , 7.642 , New , 52.54
|
|
||||||
|
|
||||||
Results For Tigerlake memcpy-evex-erms
|
|
||||||
size, al1 , al2 , Cur T , New T , Win , New / Cur
|
|
||||||
128 , 0 , 0 , 8.979 , 4.95 , New , 55.13
|
|
||||||
128 , 7 , 0 , 14.245 , 7.122 , New , 50.0
|
|
||||||
128 , 0 , 7 , 12.668 , 6.675 , New , 52.69
|
|
||||||
128 , 7 , 7 , 13.042 , 6.802 , New , 52.15
|
|
||||||
|
|
||||||
Results For Skylake memmove-avx2-erms
|
|
||||||
size, al1 , al2 , Cur T , New T , Win , New / Cur
|
|
||||||
128 , 0 , 32 , 6.181 , 5.691 , New , 92.07
|
|
||||||
128 , 32 , 0 , 6.165 , 5.752 , New , 93.3
|
|
||||||
128 , 0 , 7 , 13.923 , 9.37 , New , 67.3
|
|
||||||
128 , 7 , 0 , 12.049 , 10.182 , New , 84.5
|
|
||||||
|
|
||||||
Results For Icelake memmove-evex-erms
|
|
||||||
size, al1 , al2 , Cur T , New T , Win , New / Cur
|
|
||||||
128 , 0 , 32 , 5.479 , 4.889 , New , 89.23
|
|
||||||
128 , 32 , 0 , 5.127 , 4.911 , New , 95.79
|
|
||||||
128 , 0 , 7 , 18.885 , 13.547 , New , 71.73
|
|
||||||
128 , 7 , 0 , 15.565 , 14.436 , New , 92.75
|
|
||||||
|
|
||||||
Results For Tigerlake memmove-evex-erms
|
|
||||||
size, al1 , al2 , Cur T , New T , Win , New / Cur
|
|
||||||
128 , 0 , 32 , 5.275 , 4.815 , New , 91.28
|
|
||||||
128 , 32 , 0 , 5.376 , 4.565 , New , 84.91
|
|
||||||
128 , 0 , 7 , 19.426 , 14.273 , New , 73.47
|
|
||||||
128 , 7 , 0 , 15.924 , 14.951 , New , 93.89
|
|
||||||
|
|
||||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S | 6 +++---
|
|
||||||
1 file changed, 3 insertions(+), 3 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
|
||||||
index 3e2dd6bc..572cef04 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
|
||||||
@@ -417,8 +417,8 @@ L(more_2x_vec):
|
|
||||||
cmpq $(VEC_SIZE * 8), %rdx
|
|
||||||
ja L(more_8x_vec)
|
|
||||||
cmpq $(VEC_SIZE * 4), %rdx
|
|
||||||
- jb L(last_4x_vec)
|
|
||||||
- /* Copy from 4 * VEC to 8 * VEC, inclusively. */
|
|
||||||
+ jbe L(last_4x_vec)
|
|
||||||
+ /* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */
|
|
||||||
VMOVU (%rsi), %VEC(0)
|
|
||||||
VMOVU VEC_SIZE(%rsi), %VEC(1)
|
|
||||||
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
|
|
||||||
@@ -437,7 +437,7 @@ L(more_2x_vec):
|
|
||||||
VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
L(last_4x_vec):
|
|
||||||
- /* Copy from 2 * VEC to 4 * VEC. */
|
|
||||||
+ /* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */
|
|
||||||
VMOVU (%rsi), %VEC(0)
|
|
||||||
VMOVU VEC_SIZE(%rsi), %VEC(1)
|
|
||||||
VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2)
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,55 +0,0 @@
|
|||||||
From 08cbcd4dbc686bb38ec3093aff2f919fbff5ec17 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
Date: Wed, 23 Jun 2021 19:19:34 -0400
|
|
||||||
Subject: [PATCH] x86: Remove unnecessary overflow check from wcsnlen-sse4_1.S
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
No bug. The way wcsnlen will check if near the end of maxlen
|
|
||||||
is the following macro:
|
|
||||||
|
|
||||||
mov %r11, %rsi; \
|
|
||||||
subq %rax, %rsi; \
|
|
||||||
andq $-64, %rax; \
|
|
||||||
testq $-64, %rsi; \
|
|
||||||
je L(strnlen_ret)
|
|
||||||
|
|
||||||
Which works independently of s + maxlen overflowing. So the
|
|
||||||
second overflow check is unnecessary for correctness and
|
|
||||||
just extra overhead in the common no overflow case.
|
|
||||||
|
|
||||||
test-strlen.c, test-wcslen.c, test-strnlen.c and test-wcsnlen.c are
|
|
||||||
all passing
|
|
||||||
|
|
||||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/multiarch/strlen-vec.S | 7 -------
|
|
||||||
1 file changed, 7 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
|
|
||||||
index 439e486a..b7657282 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/strlen-vec.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/strlen-vec.S
|
|
||||||
@@ -71,19 +71,12 @@ L(n_nonzero):
|
|
||||||
suffice. */
|
|
||||||
mov %RSI_LP, %R10_LP
|
|
||||||
sar $62, %R10_LP
|
|
||||||
- test %R10_LP, %R10_LP
|
|
||||||
jnz __wcslen_sse4_1
|
|
||||||
sal $2, %RSI_LP
|
|
||||||
# endif
|
|
||||||
|
|
||||||
-
|
|
||||||
/* Initialize long lived registers. */
|
|
||||||
-
|
|
||||||
add %RDI_LP, %RSI_LP
|
|
||||||
-# ifdef AS_WCSLEN
|
|
||||||
-/* Check for overflow again from s + maxlen * sizeof(wchar_t). */
|
|
||||||
- jbe __wcslen_sse4_1
|
|
||||||
-# endif
|
|
||||||
mov %RSI_LP, %R10_LP
|
|
||||||
and $-64, %R10_LP
|
|
||||||
mov %RSI_LP, %R11_LP
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,290 +0,0 @@
|
|||||||
From 82d0b4a4d76db554eb6757acb790fcea30b19965 Mon Sep 17 00:00:00 2001
|
|
||||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
|
||||||
Date: Mon, 21 Jan 2019 11:32:24 -0800
|
|
||||||
Subject: [PATCH] x86-64 memset/wmemset: Properly handle the length parameter
|
|
||||||
[BZ# 24097]
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
On x32, the size_t parameter may be passed in the lower 32 bits of a
|
|
||||||
64-bit register with the non-zero upper 32 bits. The string/memory
|
|
||||||
functions written in assembly can only use the lower 32 bits of a
|
|
||||||
64-bit register as length or must clear the upper 32 bits before using
|
|
||||||
the full 64-bit register for length.
|
|
||||||
|
|
||||||
This patch fixes memset/wmemset for x32. Tested on x86-64 and x32. On
|
|
||||||
x86-64, libc.so is the same with and without the fix.
|
|
||||||
|
|
||||||
[BZ# 24097]
|
|
||||||
CVE-2019-6488
|
|
||||||
* sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S: Use
|
|
||||||
RDX_LP for length. Clear the upper 32 bits of RDX register.
|
|
||||||
* sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S: Likewise.
|
|
||||||
* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-wmemset.
|
|
||||||
* sysdeps/x86_64/x32/tst-size_t-memset.c: New file.
|
|
||||||
* sysdeps/x86_64/x32/tst-size_t-wmemset.c: Likewise.
|
|
||||||
---
|
|
||||||
.../multiarch/memset-avx512-no-vzeroupper.S | 6 +-
|
|
||||||
.../multiarch/memset-vec-unaligned-erms.S | 34 +++++----
|
|
||||||
sysdeps/x86_64/x32/Makefile | 4 +-
|
|
||||||
sysdeps/x86_64/x32/tst-size_t-memset.c | 73 +++++++++++++++++++
|
|
||||||
sysdeps/x86_64/x32/tst-size_t-wmemset.c | 20 +++++
|
|
||||||
5 files changed, 121 insertions(+), 16 deletions(-)
|
|
||||||
create mode 100644 sysdeps/x86_64/x32/tst-size_t-memset.c
|
|
||||||
create mode 100644 sysdeps/x86_64/x32/tst-size_t-wmemset.c
|
|
||||||
|
|
||||||
Conflicts:
|
|
||||||
ChangeLog
|
|
||||||
(removed)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S
|
|
||||||
index 689cc119..99e25519 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S
|
|
||||||
@@ -29,12 +29,16 @@
|
|
||||||
.section .text.avx512,"ax",@progbits
|
|
||||||
#if defined PIC
|
|
||||||
ENTRY (MEMSET_CHK)
|
|
||||||
- cmpq %rdx, %rcx
|
|
||||||
+ cmp %RDX_LP, %RCX_LP
|
|
||||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
|
||||||
END (MEMSET_CHK)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
ENTRY (MEMSET)
|
|
||||||
+# ifdef __ILP32__
|
|
||||||
+ /* Clear the upper 32 bits. */
|
|
||||||
+ mov %edx, %edx
|
|
||||||
+# endif
|
|
||||||
vpxor %xmm0, %xmm0, %xmm0
|
|
||||||
vmovd %esi, %xmm1
|
|
||||||
lea (%rdi, %rdx), %rsi
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
|
||||||
index 270a1d49..9a0fd818 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
|
||||||
@@ -65,8 +65,8 @@
|
|
||||||
.section SECTION(.text),"ax",@progbits
|
|
||||||
#if VEC_SIZE == 16 && IS_IN (libc)
|
|
||||||
ENTRY (__bzero)
|
|
||||||
- movq %rdi, %rax /* Set return value. */
|
|
||||||
- movq %rsi, %rdx /* Set n. */
|
|
||||||
+ mov %RDI_LP, %RAX_LP /* Set return value. */
|
|
||||||
+ mov %RSI_LP, %RDX_LP /* Set n. */
|
|
||||||
pxor %xmm0, %xmm0
|
|
||||||
jmp L(entry_from_bzero)
|
|
||||||
END (__bzero)
|
|
||||||
@@ -76,13 +76,13 @@ weak_alias (__bzero, bzero)
|
|
||||||
#if IS_IN (libc)
|
|
||||||
# if defined SHARED
|
|
||||||
ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
|
|
||||||
- cmpq %rdx, %rcx
|
|
||||||
+ cmp %RDX_LP, %RCX_LP
|
|
||||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
|
||||||
END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
|
|
||||||
# endif
|
|
||||||
|
|
||||||
ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
|
|
||||||
- shlq $2, %rdx
|
|
||||||
+ shl $2, %RDX_LP
|
|
||||||
WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
|
|
||||||
jmp L(entry_from_bzero)
|
|
||||||
END (WMEMSET_SYMBOL (__wmemset, unaligned))
|
|
||||||
@@ -90,13 +90,17 @@ END (WMEMSET_SYMBOL (__wmemset, unaligned))
|
|
||||||
|
|
||||||
#if defined SHARED && IS_IN (libc)
|
|
||||||
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
|
|
||||||
- cmpq %rdx, %rcx
|
|
||||||
+ cmp %RDX_LP, %RCX_LP
|
|
||||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
|
||||||
END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
|
|
||||||
#endif
|
|
||||||
|
|
||||||
ENTRY (MEMSET_SYMBOL (__memset, unaligned))
|
|
||||||
MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
|
|
||||||
+# ifdef __ILP32__
|
|
||||||
+ /* Clear the upper 32 bits. */
|
|
||||||
+ mov %edx, %edx
|
|
||||||
+# endif
|
|
||||||
L(entry_from_bzero):
|
|
||||||
cmpq $VEC_SIZE, %rdx
|
|
||||||
jb L(less_vec)
|
|
||||||
@@ -112,14 +116,14 @@ END (MEMSET_SYMBOL (__memset, unaligned))
|
|
||||||
|
|
||||||
# if VEC_SIZE == 16
|
|
||||||
ENTRY (__memset_chk_erms)
|
|
||||||
- cmpq %rdx, %rcx
|
|
||||||
+ cmp %RDX_LP, %RCX_LP
|
|
||||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
|
||||||
END (__memset_chk_erms)
|
|
||||||
|
|
||||||
/* Only used to measure performance of REP STOSB. */
|
|
||||||
ENTRY (__memset_erms)
|
|
||||||
/* Skip zero length. */
|
|
||||||
- testq %rdx, %rdx
|
|
||||||
+ test %RDX_LP, %RDX_LP
|
|
||||||
jnz L(stosb)
|
|
||||||
movq %rdi, %rax
|
|
||||||
ret
|
|
||||||
@@ -131,11 +135,11 @@ ENTRY (MEMSET_SYMBOL (__memset, erms))
|
|
||||||
L(stosb):
|
|
||||||
/* Issue vzeroupper before rep stosb. */
|
|
||||||
VZEROUPPER
|
|
||||||
- movq %rdx, %rcx
|
|
||||||
+ mov %RDX_LP, %RCX_LP
|
|
||||||
movzbl %sil, %eax
|
|
||||||
- movq %rdi, %rdx
|
|
||||||
+ mov %RDI_LP, %RDX_LP
|
|
||||||
rep stosb
|
|
||||||
- movq %rdx, %rax
|
|
||||||
+ mov %RDX_LP, %RAX_LP
|
|
||||||
ret
|
|
||||||
# if VEC_SIZE == 16
|
|
||||||
END (__memset_erms)
|
|
||||||
@@ -145,16 +149,20 @@ END (MEMSET_SYMBOL (__memset, erms))
|
|
||||||
|
|
||||||
# if defined SHARED && IS_IN (libc)
|
|
||||||
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
|
|
||||||
- cmpq %rdx, %rcx
|
|
||||||
+ cmp %RDX_LP, %RCX_LP
|
|
||||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
|
||||||
END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
|
|
||||||
# endif
|
|
||||||
|
|
||||||
ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
|
|
||||||
MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
|
|
||||||
- cmpq $VEC_SIZE, %rdx
|
|
||||||
+# ifdef __ILP32__
|
|
||||||
+ /* Clear the upper 32 bits. */
|
|
||||||
+ mov %edx, %edx
|
|
||||||
+# endif
|
|
||||||
+ cmp $VEC_SIZE, %RDX_LP
|
|
||||||
jb L(less_vec)
|
|
||||||
- cmpq $(VEC_SIZE * 2), %rdx
|
|
||||||
+ cmp $(VEC_SIZE * 2), %RDX_LP
|
|
||||||
ja L(stosb_more_2x_vec)
|
|
||||||
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
|
|
||||||
VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
|
|
||||||
diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
|
|
||||||
index e99dbd7c..98bd9ae9 100644
|
|
||||||
--- a/sysdeps/x86_64/x32/Makefile
|
|
||||||
+++ b/sysdeps/x86_64/x32/Makefile
|
|
||||||
@@ -7,9 +7,9 @@ endif
|
|
||||||
|
|
||||||
ifeq ($(subdir),string)
|
|
||||||
tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \
|
|
||||||
- tst-size_t-memrchr
|
|
||||||
+ tst-size_t-memrchr tst-size_t-memset
|
|
||||||
endif
|
|
||||||
|
|
||||||
ifeq ($(subdir),wcsmbs)
|
|
||||||
-tests += tst-size_t-wmemchr tst-size_t-wmemcmp
|
|
||||||
+tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset
|
|
||||||
endif
|
|
||||||
diff --git a/sysdeps/x86_64/x32/tst-size_t-memset.c b/sysdeps/x86_64/x32/tst-size_t-memset.c
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..2c367af6
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/x86_64/x32/tst-size_t-memset.c
|
|
||||||
@@ -0,0 +1,73 @@
|
|
||||||
+/* Test memset with size_t in the lower 32 bits of 64-bit register.
|
|
||||||
+ Copyright (C) 2019 Free Software Foundation, Inc.
|
|
||||||
+ This file is part of the GNU C Library.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is free software; you can redistribute it and/or
|
|
||||||
+ modify it under the terms of the GNU Lesser General Public
|
|
||||||
+ License as published by the Free Software Foundation; either
|
|
||||||
+ version 2.1 of the License, or (at your option) any later version.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
||||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
+ Lesser General Public License for more details.
|
|
||||||
+
|
|
||||||
+ You should have received a copy of the GNU Lesser General Public
|
|
||||||
+ License along with the GNU C Library; if not, see
|
|
||||||
+ <http://www.gnu.org/licenses/>. */
|
|
||||||
+
|
|
||||||
+#ifdef WIDE
|
|
||||||
+# define TEST_NAME "wmemset"
|
|
||||||
+#else
|
|
||||||
+# define TEST_NAME "memset"
|
|
||||||
+#endif /* WIDE */
|
|
||||||
+
|
|
||||||
+#include "test-size_t.h"
|
|
||||||
+
|
|
||||||
+#ifdef WIDE
|
|
||||||
+# include <wchar.h>
|
|
||||||
+# define MEMSET wmemset
|
|
||||||
+# define CHAR wchar_t
|
|
||||||
+#else
|
|
||||||
+# define MEMSET memset
|
|
||||||
+# define CHAR char
|
|
||||||
+#endif /* WIDE */
|
|
||||||
+
|
|
||||||
+IMPL (MEMSET, 1)
|
|
||||||
+
|
|
||||||
+typedef CHAR *(*proto_t) (CHAR *, int, size_t);
|
|
||||||
+
|
|
||||||
+static void *
|
|
||||||
+__attribute__ ((noinline, noclone))
|
|
||||||
+do_memset (parameter_t a, parameter_t b)
|
|
||||||
+{
|
|
||||||
+ return CALL (&b, a.p, (uintptr_t) b.p, a.len);
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+static int
|
|
||||||
+test_main (void)
|
|
||||||
+{
|
|
||||||
+ test_init ();
|
|
||||||
+
|
|
||||||
+ CHAR ch = 0x23;
|
|
||||||
+ parameter_t src = { { page_size / sizeof (CHAR) }, buf2 };
|
|
||||||
+ parameter_t c = { { 0 }, (void *) (uintptr_t) ch };
|
|
||||||
+
|
|
||||||
+ int ret = 0;
|
|
||||||
+ FOR_EACH_IMPL (impl, 0)
|
|
||||||
+ {
|
|
||||||
+ c.fn = impl->fn;
|
|
||||||
+ CHAR *p = (CHAR *) do_memset (src, c);
|
|
||||||
+ size_t i;
|
|
||||||
+ for (i = 0; i < src.len; i++)
|
|
||||||
+ if (p[i] != ch)
|
|
||||||
+ {
|
|
||||||
+ error (0, 0, "Wrong result in function %s", impl->name);
|
|
||||||
+ ret = 1;
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ return ret ? EXIT_FAILURE : EXIT_SUCCESS;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+#include <support/test-driver.c>
|
|
||||||
diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemset.c b/sysdeps/x86_64/x32/tst-size_t-wmemset.c
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..955eb488
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/x86_64/x32/tst-size_t-wmemset.c
|
|
||||||
@@ -0,0 +1,20 @@
|
|
||||||
+/* Test wmemset with size_t in the lower 32 bits of 64-bit register.
|
|
||||||
+ Copyright (C) 2019 Free Software Foundation, Inc.
|
|
||||||
+ This file is part of the GNU C Library.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is free software; you can redistribute it and/or
|
|
||||||
+ modify it under the terms of the GNU Lesser General Public
|
|
||||||
+ License as published by the Free Software Foundation; either
|
|
||||||
+ version 2.1 of the License, or (at your option) any later version.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
||||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
+ Lesser General Public License for more details.
|
|
||||||
+
|
|
||||||
+ You should have received a copy of the GNU Lesser General Public
|
|
||||||
+ License along with the GNU C Library; if not, see
|
|
||||||
+ <http://www.gnu.org/licenses/>. */
|
|
||||||
+
|
|
||||||
+#define WIDE 1
|
|
||||||
+#include "tst-size_t-memset.c"
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,43 +0,0 @@
|
|||||||
From 447954a206837b5f153869cfeeeab44631c3fac9 Mon Sep 17 00:00:00 2001
|
|
||||||
Author: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com> 2021-05-23 21:43:10
|
|
||||||
Committer: H.J. Lu <hjl.tools@gmail.com> 2021-06-27 10:56:57
|
|
||||||
Parent: 2c16cb88a6e5ace0fb7cedca86860ea7bde522a7 (Linux: Move timer helper routines from librt to libc)
|
|
||||||
Child: 1683249d17e14827b6579529742eb895027dfa84 (x86_64: roundeven with sse4.1 support)
|
|
||||||
Branches: master, remotes/origin/master and many more (41)
|
|
||||||
Follows: glibc-2.33.9000
|
|
||||||
Precedes: glibc-2.34
|
|
||||||
|
|
||||||
math: redirect roundeven function
|
|
||||||
|
|
||||||
This patch redirects the roundeven function for further changes.
|
|
||||||
|
|
||||||
Signed-off-by: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com>
|
|
||||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
|
||||||
|
|
||||||
Conflicts:
|
|
||||||
*
|
|
||||||
(rewritten for older branch)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c b/sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c
|
|
||||||
index 7bbbb2dc..8728d0f2 100644
|
|
||||||
--- a/sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c
|
|
||||||
+++ b/sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c
|
|
||||||
@@ -17,6 +17,7 @@
|
|
||||||
License along with the GNU C Library; if not, see
|
|
||||||
<http://www.gnu.org/licenses/>. */
|
|
||||||
|
|
||||||
+#define NO_MATH_REDIRECT
|
|
||||||
#include <math.h>
|
|
||||||
#include <math_private.h>
|
|
||||||
#include <libm-alias-double.h>
|
|
||||||
@@ -67,5 +68,6 @@ __roundeven (double x)
|
|
||||||
INSERT_WORDS64 (x, ix);
|
|
||||||
return x;
|
|
||||||
}
|
|
||||||
-hidden_def (__roundeven)
|
|
||||||
+#ifndef __roundeven
|
|
||||||
libm_alias_double (__roundeven, roundeven)
|
|
||||||
+#endif
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,118 +0,0 @@
|
|||||||
From 447954a206837b5f153869cfeeeab44631c3fac9 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com>
|
|
||||||
Date: Mon, 24 May 2021 09:43:10 +0800
|
|
||||||
Subject: [PATCH] math: redirect roundeven function
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
This patch redirects the roundeven function for further changes.
|
|
||||||
|
|
||||||
Signed-off-by: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com>
|
|
||||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
|
||||||
---
|
|
||||||
include/math.h | 3 ++-
|
|
||||||
sysdeps/ieee754/dbl-64/s_roundeven.c | 4 +++-
|
|
||||||
sysdeps/ieee754/float128/s_roundevenf128.c | 1 +
|
|
||||||
sysdeps/ieee754/flt-32/s_roundevenf.c | 3 +++
|
|
||||||
sysdeps/ieee754/ldbl-128/s_roundevenl.c | 1 +
|
|
||||||
sysdeps/ieee754/ldbl-96/s_roundevenl.c | 1 +
|
|
||||||
6 files changed, 11 insertions(+), 2 deletions(-)
|
|
||||||
|
|
||||||
Conflicts:
|
|
||||||
include/math.h
|
|
||||||
(missing MATH_REDIRECT macros)
|
|
||||||
|
|
||||||
diff --git a/include/math.h b/include/math.h
|
|
||||||
index e21d34b8..1f9f9a54 100644
|
|
||||||
--- a/include/math.h
|
|
||||||
+++ b/include/math.h
|
|
||||||
@@ -38,7 +38,6 @@ libm_hidden_proto (__issignaling)
|
|
||||||
libm_hidden_proto (__issignalingf)
|
|
||||||
libm_hidden_proto (__exp)
|
|
||||||
libm_hidden_proto (__expf)
|
|
||||||
-libm_hidden_proto (__roundeven)
|
|
||||||
|
|
||||||
# ifndef __NO_LONG_DOUBLE_MATH
|
|
||||||
libm_hidden_proto (__fpclassifyl)
|
|
||||||
@@ -56,6 +55,8 @@ libm_hidden_proto (__expm1f128)
|
|
||||||
|
|
||||||
# if !(defined __FINITE_MATH_ONLY__ && __FINITE_MATH_ONLY__ > 0)
|
|
||||||
# ifndef NO_MATH_REDIRECT
|
|
||||||
+float (roundevenf) (float) asm ("__roundevenf");
|
|
||||||
+double (roundeven) (double) asm ("__roundeven");
|
|
||||||
/* Declare sqrt for use within GLIBC. Compilers typically inline sqrt as a
|
|
||||||
single instruction. Use an asm to avoid use of PLTs if it doesn't. */
|
|
||||||
float (sqrtf) (float) asm ("__ieee754_sqrtf");
|
|
||||||
diff --git a/sysdeps/ieee754/dbl-64/s_roundeven.c b/sysdeps/ieee754/dbl-64/s_roundeven.c
|
|
||||||
index 1438e81d..61962184 100644
|
|
||||||
--- a/sysdeps/ieee754/dbl-64/s_roundeven.c
|
|
||||||
+++ b/sysdeps/ieee754/dbl-64/s_roundeven.c
|
|
||||||
@@ -17,6 +17,7 @@
|
|
||||||
License along with the GNU C Library; if not, see
|
|
||||||
<http://www.gnu.org/licenses/>. */
|
|
||||||
|
|
||||||
+#define NO_MATH_REDIRECT
|
|
||||||
#include <math.h>
|
|
||||||
#include <math_private.h>
|
|
||||||
#include <libm-alias-double.h>
|
|
||||||
@@ -101,5 +102,6 @@ __roundeven (double x)
|
|
||||||
INSERT_WORDS (x, hx, lx);
|
|
||||||
return x;
|
|
||||||
}
|
|
||||||
-hidden_def (__roundeven)
|
|
||||||
+#ifndef __roundeven
|
|
||||||
libm_alias_double (__roundeven, roundeven)
|
|
||||||
+#endif
|
|
||||||
diff --git a/sysdeps/ieee754/float128/s_roundevenf128.c b/sysdeps/ieee754/float128/s_roundevenf128.c
|
|
||||||
index 5a9b3f39..e0faf727 100644
|
|
||||||
--- a/sysdeps/ieee754/float128/s_roundevenf128.c
|
|
||||||
+++ b/sysdeps/ieee754/float128/s_roundevenf128.c
|
|
||||||
@@ -1,2 +1,3 @@
|
|
||||||
+#define NO_MATH_REDIRECT
|
|
||||||
#include <float128_private.h>
|
|
||||||
#include "../ldbl-128/s_roundevenl.c"
|
|
||||||
diff --git a/sysdeps/ieee754/flt-32/s_roundevenf.c b/sysdeps/ieee754/flt-32/s_roundevenf.c
|
|
||||||
index 90f991d5..a661875e 100644
|
|
||||||
--- a/sysdeps/ieee754/flt-32/s_roundevenf.c
|
|
||||||
+++ b/sysdeps/ieee754/flt-32/s_roundevenf.c
|
|
||||||
@@ -17,6 +17,7 @@
|
|
||||||
License along with the GNU C Library; if not, see
|
|
||||||
<http://www.gnu.org/licenses/>. */
|
|
||||||
|
|
||||||
+#define NO_MATH_REDIRECT
|
|
||||||
#include <math.h>
|
|
||||||
#include <math_private.h>
|
|
||||||
#include <libm-alias-float.h>
|
|
||||||
@@ -67,4 +68,6 @@ __roundevenf (float x)
|
|
||||||
SET_FLOAT_WORD (x, ix);
|
|
||||||
return x;
|
|
||||||
}
|
|
||||||
+#ifndef __roundevenf
|
|
||||||
libm_alias_float (__roundeven, roundeven)
|
|
||||||
+#endif
|
|
||||||
diff --git a/sysdeps/ieee754/ldbl-128/s_roundevenl.c b/sysdeps/ieee754/ldbl-128/s_roundevenl.c
|
|
||||||
index 5fc59af4..b9375b6c 100644
|
|
||||||
--- a/sysdeps/ieee754/ldbl-128/s_roundevenl.c
|
|
||||||
+++ b/sysdeps/ieee754/ldbl-128/s_roundevenl.c
|
|
||||||
@@ -17,6 +17,7 @@
|
|
||||||
License along with the GNU C Library; if not, see
|
|
||||||
<http://www.gnu.org/licenses/>. */
|
|
||||||
|
|
||||||
+#define NO_MATH_REDIRECT
|
|
||||||
#include <math.h>
|
|
||||||
#include <math_private.h>
|
|
||||||
#include <libm-alias-ldouble.h>
|
|
||||||
diff --git a/sysdeps/ieee754/ldbl-96/s_roundevenl.c b/sysdeps/ieee754/ldbl-96/s_roundevenl.c
|
|
||||||
index be2e4fa4..65031ab7 100644
|
|
||||||
--- a/sysdeps/ieee754/ldbl-96/s_roundevenl.c
|
|
||||||
+++ b/sysdeps/ieee754/ldbl-96/s_roundevenl.c
|
|
||||||
@@ -17,6 +17,7 @@
|
|
||||||
License along with the GNU C Library; if not, see
|
|
||||||
<http://www.gnu.org/licenses/>. */
|
|
||||||
|
|
||||||
+#define NO_MATH_REDIRECT
|
|
||||||
#include <math.h>
|
|
||||||
#include <math_private.h>
|
|
||||||
#include <libm-alias-ldouble.h>
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,242 +0,0 @@
|
|||||||
From 1683249d17e14827b6579529742eb895027dfa84 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com>
|
|
||||||
Date: Mon, 24 May 2021 09:43:11 +0800
|
|
||||||
Subject: [PATCH] x86_64: roundeven with sse4.1 support
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
This patch adds support for the sse4.1 hardware floating point
|
|
||||||
roundeven.
|
|
||||||
|
|
||||||
Here are some benchmark results on my systems:
|
|
||||||
|
|
||||||
=AMD Ryzen 9 3900X 12-Core Processor=
|
|
||||||
|
|
||||||
* benchmark result before this commit
|
|
||||||
| | roundeven | roundevenf |
|
|
||||||
|------------|--------------|--------------|
|
|
||||||
| duration | 3.75587e+09 | 3.75114e+09 |
|
|
||||||
| iterations | 3.93053e+08 | 4.35402e+08 |
|
|
||||||
| max | 52.592 | 58.71 |
|
|
||||||
| min | 7.98 | 7.22 |
|
|
||||||
| mean | 9.55563 | 8.61535 |
|
|
||||||
|
|
||||||
* benchmark result after this commit
|
|
||||||
| | roundeven | roundevenf |
|
|
||||||
|------------|---------------|--------------|
|
|
||||||
| duration | 3.73815e+09 | 3.73738e+09 |
|
|
||||||
| iterations | 5.82692e+08 | 5.91498e+08 |
|
|
||||||
| max | 56.468 | 51.642 |
|
|
||||||
| min | 6.27 | 6.156 |
|
|
||||||
| mean | 6.41532 | 6.3185 |
|
|
||||||
|
|
||||||
=Intel(R) Pentium(R) CPU D1508 @ 2.20GHz=
|
|
||||||
|
|
||||||
* benchmark result before this commit
|
|
||||||
| | roundeven | roundevenf |
|
|
||||||
|------------|--------------|--------------|
|
|
||||||
| duration | 2.18208e+09 | 2.18258e+09 |
|
|
||||||
| iterations | 2.39932e+08 | 2.46924e+08 |
|
|
||||||
| max | 96.378 | 98.035 |
|
|
||||||
| min | 6.776 | 5.94 |
|
|
||||||
| mean | 9.09456 | 8.83907 |
|
|
||||||
|
|
||||||
* benchmark result after this commit
|
|
||||||
| | roundeven | roundevenf |
|
|
||||||
|------------|--------------|--------------|
|
|
||||||
| duration | 2.17415e+09 | 2.17005e+09 |
|
|
||||||
| iterations | 3.56193e+08 | 4.09824e+08 |
|
|
||||||
| max | 51.693 | 97.192 |
|
|
||||||
| min | 5.926 | 5.093 |
|
|
||||||
| mean | 6.10385 | 5.29507 |
|
|
||||||
|
|
||||||
Signed-off-by: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com>
|
|
||||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/fpu/multiarch/Makefile | 5 +--
|
|
||||||
sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c | 2 ++
|
|
||||||
.../x86_64/fpu/multiarch/s_roundeven-sse4_1.S | 24 ++++++++++++++
|
|
||||||
sysdeps/x86_64/fpu/multiarch/s_roundeven.c | 31 +++++++++++++++++++
|
|
||||||
sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c | 3 ++
|
|
||||||
.../fpu/multiarch/s_roundevenf-sse4_1.S | 24 ++++++++++++++
|
|
||||||
sysdeps/x86_64/fpu/multiarch/s_roundevenf.c | 31 +++++++++++++++++++
|
|
||||||
7 files changed, 118 insertions(+), 2 deletions(-)
|
|
||||||
create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c
|
|
||||||
create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundeven-sse4_1.S
|
|
||||||
create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundeven.c
|
|
||||||
create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c
|
|
||||||
create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundevenf-sse4_1.S
|
|
||||||
create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundevenf.c
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
|
|
||||||
index 9f387248..6ddd1c01 100644
|
|
||||||
--- a/sysdeps/x86_64/fpu/multiarch/Makefile
|
|
||||||
+++ b/sysdeps/x86_64/fpu/multiarch/Makefile
|
|
||||||
@@ -1,11 +1,12 @@
|
|
||||||
ifeq ($(subdir),math)
|
|
||||||
libm-sysdep_routines += s_floor-c s_ceil-c s_floorf-c s_ceilf-c \
|
|
||||||
s_rint-c s_rintf-c s_nearbyint-c s_nearbyintf-c \
|
|
||||||
- s_trunc-c s_truncf-c
|
|
||||||
+ s_roundeven-c s_roundevenf-c s_trunc-c s_truncf-c
|
|
||||||
|
|
||||||
libm-sysdep_routines += s_ceil-sse4_1 s_ceilf-sse4_1 s_floor-sse4_1 \
|
|
||||||
s_floorf-sse4_1 s_nearbyint-sse4_1 \
|
|
||||||
- s_nearbyintf-sse4_1 s_rint-sse4_1 s_rintf-sse4_1 \
|
|
||||||
+ s_nearbyintf-sse4_1 s_roundeven-sse4_1 \
|
|
||||||
+ s_roundevenf-sse4_1 s_rint-sse4_1 s_rintf-sse4_1 \
|
|
||||||
s_trunc-sse4_1 s_truncf-sse4_1
|
|
||||||
|
|
||||||
libm-sysdep_routines += e_exp-fma e_log-fma e_pow-fma s_atan-fma \
|
|
||||||
diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c b/sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..c7be43cb
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c
|
|
||||||
@@ -0,0 +1,2 @@
|
|
||||||
+#define __roundeven __roundeven_c
|
|
||||||
+#include <sysdeps/ieee754/dbl-64/s_roundeven.c>
|
|
||||||
diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundeven-sse4_1.S b/sysdeps/x86_64/fpu/multiarch/s_roundeven-sse4_1.S
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..6ae8f6b1
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/x86_64/fpu/multiarch/s_roundeven-sse4_1.S
|
|
||||||
@@ -0,0 +1,24 @@
|
|
||||||
+/* Copyright (C) 2021 Free Software Foundation, Inc.
|
|
||||||
+ This file is part of the GNU C Library.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is free software; you can redistribute it and/or
|
|
||||||
+ modify it under the terms of the GNU Lesser General Public
|
|
||||||
+ License as published by the Free Software Foundation; either
|
|
||||||
+ version 2.1 of the License, or (at your option) any later version.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
||||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
+ Lesser General Public License for more details.
|
|
||||||
+
|
|
||||||
+ You should have received a copy of the GNU Lesser General Public
|
|
||||||
+ License along with the GNU C Library; if not, see
|
|
||||||
+ <https://www.gnu.org/licenses/>. */
|
|
||||||
+
|
|
||||||
+#include <sysdep.h>
|
|
||||||
+
|
|
||||||
+ .section .text.sse4.1,"ax",@progbits
|
|
||||||
+ENTRY(__roundeven_sse41)
|
|
||||||
+ roundsd $8, %xmm0, %xmm0
|
|
||||||
+ ret
|
|
||||||
+END(__roundeven_sse41)
|
|
||||||
diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundeven.c b/sysdeps/x86_64/fpu/multiarch/s_roundeven.c
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..d92eda65
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/x86_64/fpu/multiarch/s_roundeven.c
|
|
||||||
@@ -0,0 +1,31 @@
|
|
||||||
+/* Multiple versions of __roundeven.
|
|
||||||
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
|
||||||
+ This file is part of the GNU C Library.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is free software; you can redistribute it and/or
|
|
||||||
+ modify it under the terms of the GNU Lesser General Public
|
|
||||||
+ License as published by the Free Software Foundation; either
|
|
||||||
+ version 2.1 of the License, or (at your option) any later version.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
||||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
+ Lesser General Public License for more details.
|
|
||||||
+
|
|
||||||
+ You should have received a copy of the GNU Lesser General Public
|
|
||||||
+ License along with the GNU C Library; if not, see
|
|
||||||
+ <https://www.gnu.org/licenses/>. */
|
|
||||||
+
|
|
||||||
+#include <libm-alias-double.h>
|
|
||||||
+
|
|
||||||
+#define roundeven __redirect_roundeven
|
|
||||||
+#define __roundeven __redirect___roundeven
|
|
||||||
+#include <math.h>
|
|
||||||
+#undef roundeven
|
|
||||||
+#undef __roundeven
|
|
||||||
+
|
|
||||||
+#define SYMBOL_NAME roundeven
|
|
||||||
+#include "ifunc-sse4_1.h"
|
|
||||||
+
|
|
||||||
+libc_ifunc_redirected (__redirect_roundeven, __roundeven, IFUNC_SELECTOR ());
|
|
||||||
+libm_alias_double (__roundeven, roundeven)
|
|
||||||
diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c b/sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..72a6e7d1
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c
|
|
||||||
@@ -0,0 +1,3 @@
|
|
||||||
+#undef __roundevenf
|
|
||||||
+#define __roundevenf __roundevenf_c
|
|
||||||
+#include <sysdeps/ieee754/flt-32/s_roundevenf.c>
|
|
||||||
diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundevenf-sse4_1.S b/sysdeps/x86_64/fpu/multiarch/s_roundevenf-sse4_1.S
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..a76e1080
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/x86_64/fpu/multiarch/s_roundevenf-sse4_1.S
|
|
||||||
@@ -0,0 +1,24 @@
|
|
||||||
+/* Copyright (C) 2021 Free Software Foundation, Inc.
|
|
||||||
+ This file is part of the GNU C Library.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is free software; you can redistribute it and/or
|
|
||||||
+ modify it under the terms of the GNU Lesser General Public
|
|
||||||
+ License as published by the Free Software Foundation; either
|
|
||||||
+ version 2.1 of the License, or (at your option) any later version.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
||||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
+ Lesser General Public License for more details.
|
|
||||||
+
|
|
||||||
+ You should have received a copy of the GNU Lesser General Public
|
|
||||||
+ License along with the GNU C Library; if not, see
|
|
||||||
+ <https://www.gnu.org/licenses/>. */
|
|
||||||
+
|
|
||||||
+#include <sysdep.h>
|
|
||||||
+
|
|
||||||
+ .section .text.sse4.1,"ax",@progbits
|
|
||||||
+ENTRY(__roundevenf_sse41)
|
|
||||||
+ roundss $8, %xmm0, %xmm0
|
|
||||||
+ ret
|
|
||||||
+END(__roundevenf_sse41)
|
|
||||||
diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundevenf.c b/sysdeps/x86_64/fpu/multiarch/s_roundevenf.c
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..2ee196e6
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/x86_64/fpu/multiarch/s_roundevenf.c
|
|
||||||
@@ -0,0 +1,31 @@
|
|
||||||
+/* Multiple versions of __roundevenf.
|
|
||||||
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
|
||||||
+ This file is part of the GNU C Library.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is free software; you can redistribute it and/or
|
|
||||||
+ modify it under the terms of the GNU Lesser General Public
|
|
||||||
+ License as published by the Free Software Foundation; either
|
|
||||||
+ version 2.1 of the License, or (at your option) any later version.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
||||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
+ Lesser General Public License for more details.
|
|
||||||
+
|
|
||||||
+ You should have received a copy of the GNU Lesser General Public
|
|
||||||
+ License along with the GNU C Library; if not, see
|
|
||||||
+ <https://www.gnu.org/licenses/>. */
|
|
||||||
+
|
|
||||||
+#include <libm-alias-float.h>
|
|
||||||
+
|
|
||||||
+#define roundevenf __redirect_roundevenf
|
|
||||||
+#define __roundevenf __redirect___roundevenf
|
|
||||||
+#include <math.h>
|
|
||||||
+#undef roundevenf
|
|
||||||
+#undef __roundevenf
|
|
||||||
+
|
|
||||||
+#define SYMBOL_NAME roundevenf
|
|
||||||
+#include "ifunc-sse4_1.h"
|
|
||||||
+
|
|
||||||
+libc_ifunc_redirected (__redirect_roundevenf, __roundevenf, IFUNC_SELECTOR ());
|
|
||||||
+libm_alias_float (__roundeven, roundeven)
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,41 +0,0 @@
|
|||||||
From 7e08db3359c86c94918feb33a1182cd0ff3bb10b Mon Sep 17 00:00:00 2001
|
|
||||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
Date: Sun, 9 Jan 2022 16:02:28 -0600
|
|
||||||
Subject: [PATCH] x86: Fix __wcsncmp_evex in strcmp-evex.S [BZ# 28755]
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
Fixes [BZ# 28755] for wcsncmp by redirecting length >= 2^56 to
|
|
||||||
__wcscmp_evex. For x86_64 this covers the entire address range so any
|
|
||||||
length larger could not possibly be used to bound `s1` or `s2`.
|
|
||||||
|
|
||||||
test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass.
|
|
||||||
|
|
||||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/multiarch/strcmp-evex.S | 10 ++++++++++
|
|
||||||
1 file changed, 10 insertions(+)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
|
|
||||||
index 459eeed0..d5aa6daa 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
|
|
||||||
@@ -97,6 +97,16 @@ ENTRY (STRCMP)
|
|
||||||
je L(char0)
|
|
||||||
jb L(zero)
|
|
||||||
# ifdef USE_AS_WCSCMP
|
|
||||||
+# ifndef __ILP32__
|
|
||||||
+ movq %rdx, %rcx
|
|
||||||
+ /* Check if length could overflow when multiplied by
|
|
||||||
+ sizeof(wchar_t). Checking top 8 bits will cover all potential
|
|
||||||
+ overflow cases as well as redirect cases where its impossible to
|
|
||||||
+ length to bound a valid memory region. In these cases just use
|
|
||||||
+ 'wcscmp'. */
|
|
||||||
+ shrq $56, %rcx
|
|
||||||
+ jnz __wcscmp_evex
|
|
||||||
+# endif
|
|
||||||
/* Convert units: from wide to byte char. */
|
|
||||||
shl $2, %RDX_LP
|
|
||||||
# endif
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,45 +0,0 @@
|
|||||||
From bad852b61b79503fcb3c5fc379c70f768df3e1fb Mon Sep 17 00:00:00 2001
|
|
||||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
Date: Sat, 23 Oct 2021 01:26:47 -0400
|
|
||||||
Subject: [PATCH] x86: Replace sse2 instructions with avx in
|
|
||||||
memcmp-evex-movbe.S
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
This commit replaces two usages of SSE2 'movups' with AVX 'vmovdqu'.
|
|
||||||
|
|
||||||
It could potentially be dangerous to use SSE2 if this function is ever
|
|
||||||
called without using 'vzeroupper' beforehand. While compilers appear
|
|
||||||
to use 'vzeroupper' before function calls if AVX2 has been used, using
|
|
||||||
SSE2 here is more brittle. Since it is not absolutely necessary it
|
|
||||||
should be avoided.
|
|
||||||
|
|
||||||
It costs 2-extra bytes but the extra bytes should only eat into
|
|
||||||
alignment padding.
|
|
||||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 4 ++--
|
|
||||||
1 file changed, 2 insertions(+), 2 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
|
|
||||||
index 2761b54f..640f6757 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
|
|
||||||
@@ -561,13 +561,13 @@ L(between_16_31):
|
|
||||||
/* From 16 to 31 bytes. No branch when size == 16. */
|
|
||||||
|
|
||||||
/* Use movups to save code size. */
|
|
||||||
- movups (%rsi), %xmm2
|
|
||||||
+ vmovdqu (%rsi), %xmm2
|
|
||||||
VPCMP $4, (%rdi), %xmm2, %k1
|
|
||||||
kmovd %k1, %eax
|
|
||||||
testl %eax, %eax
|
|
||||||
jnz L(return_vec_0_lv)
|
|
||||||
/* Use overlapping loads to avoid branches. */
|
|
||||||
- movups -16(%rsi, %rdx, CHAR_SIZE), %xmm2
|
|
||||||
+ vmovdqu -16(%rsi, %rdx, CHAR_SIZE), %xmm2
|
|
||||||
VPCMP $4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1
|
|
||||||
addl $(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx
|
|
||||||
kmovd %k1, %eax
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,300 +0,0 @@
|
|||||||
From ee915088a0231cd421054dbd8abab7aadf331153 Mon Sep 17 00:00:00 2001
|
|
||||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
|
||||||
Date: Mon, 21 Jan 2019 11:33:52 -0800
|
|
||||||
Subject: [PATCH] x86-64 strncmp family: Properly handle the length parameter
|
|
||||||
[BZ# 24097]
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
On x32, the size_t parameter may be passed in the lower 32 bits of a
|
|
||||||
64-bit register with the non-zero upper 32 bits. The string/memory
|
|
||||||
functions written in assembly can only use the lower 32 bits of a
|
|
||||||
64-bit register as length or must clear the upper 32 bits before using
|
|
||||||
the full 64-bit register for length.
|
|
||||||
|
|
||||||
This patch fixes the strncmp family for x32. Tested on x86-64 and x32.
|
|
||||||
On x86-64, libc.so is the same with and without the fix.
|
|
||||||
|
|
||||||
[BZ# 24097]
|
|
||||||
CVE-2019-6488
|
|
||||||
* sysdeps/x86_64/multiarch/strcmp-avx2.S: Use RDX_LP for length.
|
|
||||||
* sysdeps/x86_64/multiarch/strcmp-sse42.S: Likewise.
|
|
||||||
* sysdeps/x86_64/strcmp.S: Likewise.
|
|
||||||
* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-strncasecmp,
|
|
||||||
tst-size_t-strncmp and tst-size_t-wcsncmp.
|
|
||||||
* sysdeps/x86_64/x32/tst-size_t-strncasecmp.c: New file.
|
|
||||||
* sysdeps/x86_64/x32/tst-size_t-strncmp.c: Likewise.
|
|
||||||
* sysdeps/x86_64/x32/tst-size_t-wcsncmp.c: Likewise.
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/multiarch/strcmp-avx2.S | 6 +-
|
|
||||||
sysdeps/x86_64/multiarch/strcmp-sse42.S | 6 +-
|
|
||||||
sysdeps/x86_64/strcmp.S | 6 +-
|
|
||||||
sysdeps/x86_64/x32/Makefile | 6 +-
|
|
||||||
sysdeps/x86_64/x32/tst-size_t-strncasecmp.c | 59 ++++++++++++++++
|
|
||||||
sysdeps/x86_64/x32/tst-size_t-strncmp.c | 78 +++++++++++++++++++++
|
|
||||||
sysdeps/x86_64/x32/tst-size_t-wcsncmp.c | 20 ++++++
|
|
||||||
7 files changed, 170 insertions(+), 11 deletions(-)
|
|
||||||
create mode 100644 sysdeps/x86_64/x32/tst-size_t-strncasecmp.c
|
|
||||||
create mode 100644 sysdeps/x86_64/x32/tst-size_t-strncmp.c
|
|
||||||
create mode 100644 sysdeps/x86_64/x32/tst-size_t-wcsncmp.c
|
|
||||||
|
|
||||||
Conflicts:
|
|
||||||
ChangeLog
|
|
||||||
(removed)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
|
|
||||||
index 327e3d87..156c1949 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
|
|
||||||
@@ -79,15 +79,15 @@
|
|
||||||
ENTRY (STRCMP)
|
|
||||||
# ifdef USE_AS_STRNCMP
|
|
||||||
/* Check for simple cases (0 or 1) in offset. */
|
|
||||||
- cmp $1, %rdx
|
|
||||||
+ cmp $1, %RDX_LP
|
|
||||||
je L(char0)
|
|
||||||
jb L(zero)
|
|
||||||
# ifdef USE_AS_WCSCMP
|
|
||||||
/* Convert units: from wide to byte char. */
|
|
||||||
- shl $2, %rdx
|
|
||||||
+ shl $2, %RDX_LP
|
|
||||||
# endif
|
|
||||||
/* Register %r11 tracks the maximum offset. */
|
|
||||||
- movq %rdx, %r11
|
|
||||||
+ mov %RDX_LP, %R11_LP
|
|
||||||
# endif
|
|
||||||
movl %edi, %eax
|
|
||||||
xorl %edx, %edx
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
|
|
||||||
index d3c07bd2..a1ebea46 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S
|
|
||||||
@@ -156,11 +156,11 @@ STRCMP_SSE42:
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
|
|
||||||
- test %rdx, %rdx
|
|
||||||
+ test %RDX_LP, %RDX_LP
|
|
||||||
je LABEL(strcmp_exitz)
|
|
||||||
- cmp $1, %rdx
|
|
||||||
+ cmp $1, %RDX_LP
|
|
||||||
je LABEL(Byte0)
|
|
||||||
- mov %rdx, %r11
|
|
||||||
+ mov %RDX_LP, %R11_LP
|
|
||||||
#endif
|
|
||||||
mov %esi, %ecx
|
|
||||||
mov %edi, %eax
|
|
||||||
diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
|
|
||||||
index e16945b9..f47c8ad4 100644
|
|
||||||
--- a/sysdeps/x86_64/strcmp.S
|
|
||||||
+++ b/sysdeps/x86_64/strcmp.S
|
|
||||||
@@ -135,11 +135,11 @@ ENTRY (STRCMP)
|
|
||||||
* This implementation uses SSE to compare up to 16 bytes at a time.
|
|
||||||
*/
|
|
||||||
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
|
|
||||||
- test %rdx, %rdx
|
|
||||||
+ test %RDX_LP, %RDX_LP
|
|
||||||
je LABEL(strcmp_exitz)
|
|
||||||
- cmp $1, %rdx
|
|
||||||
+ cmp $1, %RDX_LP
|
|
||||||
je LABEL(Byte0)
|
|
||||||
- mov %rdx, %r11
|
|
||||||
+ mov %RDX_LP, %R11_LP
|
|
||||||
#endif
|
|
||||||
mov %esi, %ecx
|
|
||||||
mov %edi, %eax
|
|
||||||
diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
|
|
||||||
index 98bd9ae9..db302839 100644
|
|
||||||
--- a/sysdeps/x86_64/x32/Makefile
|
|
||||||
+++ b/sysdeps/x86_64/x32/Makefile
|
|
||||||
@@ -7,9 +7,11 @@ endif
|
|
||||||
|
|
||||||
ifeq ($(subdir),string)
|
|
||||||
tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \
|
|
||||||
- tst-size_t-memrchr tst-size_t-memset
|
|
||||||
+ tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \
|
|
||||||
+ tst-size_t-strncmp
|
|
||||||
endif
|
|
||||||
|
|
||||||
ifeq ($(subdir),wcsmbs)
|
|
||||||
-tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset
|
|
||||||
+tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset \
|
|
||||||
+ tst-size_t-wcsncmp
|
|
||||||
endif
|
|
||||||
diff --git a/sysdeps/x86_64/x32/tst-size_t-strncasecmp.c b/sysdeps/x86_64/x32/tst-size_t-strncasecmp.c
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..86233593
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/x86_64/x32/tst-size_t-strncasecmp.c
|
|
||||||
@@ -0,0 +1,59 @@
|
|
||||||
+/* Test strncasecmp with size_t in the lower 32 bits of 64-bit register.
|
|
||||||
+ Copyright (C) 2019 Free Software Foundation, Inc.
|
|
||||||
+ This file is part of the GNU C Library.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is free software; you can redistribute it and/or
|
|
||||||
+ modify it under the terms of the GNU Lesser General Public
|
|
||||||
+ License as published by the Free Software Foundation; either
|
|
||||||
+ version 2.1 of the License, or (at your option) any later version.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
||||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
+ Lesser General Public License for more details.
|
|
||||||
+
|
|
||||||
+ You should have received a copy of the GNU Lesser General Public
|
|
||||||
+ License along with the GNU C Library; if not, see
|
|
||||||
+ <http://www.gnu.org/licenses/>. */
|
|
||||||
+
|
|
||||||
+#define TEST_NAME "strncasecmp"
|
|
||||||
+#include "test-size_t.h"
|
|
||||||
+
|
|
||||||
+IMPL (strncasecmp, 1)
|
|
||||||
+
|
|
||||||
+typedef int (*proto_t) (const char *, const char *, size_t);
|
|
||||||
+
|
|
||||||
+static int
|
|
||||||
+__attribute__ ((noinline, noclone))
|
|
||||||
+do_strncasecmp (parameter_t a, parameter_t b)
|
|
||||||
+{
|
|
||||||
+ return CALL (&b, a.p, b.p, a.len);
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+static int
|
|
||||||
+test_main (void)
|
|
||||||
+{
|
|
||||||
+ test_init ();
|
|
||||||
+
|
|
||||||
+ parameter_t dest = { { page_size }, buf1 };
|
|
||||||
+ parameter_t src = { { 0 }, buf2 };
|
|
||||||
+
|
|
||||||
+ strncpy ((char *) buf1, (const char *) buf2, page_size);
|
|
||||||
+
|
|
||||||
+ int ret = 0;
|
|
||||||
+ FOR_EACH_IMPL (impl, 0)
|
|
||||||
+ {
|
|
||||||
+ src.fn = impl->fn;
|
|
||||||
+ int res = do_strncasecmp (dest, src);
|
|
||||||
+ if (res)
|
|
||||||
+ {
|
|
||||||
+ error (0, 0, "Wrong result in function %s: %i != 0",
|
|
||||||
+ impl->name, res);
|
|
||||||
+ ret = 1;
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ return ret ? EXIT_FAILURE : EXIT_SUCCESS;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+#include <support/test-driver.c>
|
|
||||||
diff --git a/sysdeps/x86_64/x32/tst-size_t-strncmp.c b/sysdeps/x86_64/x32/tst-size_t-strncmp.c
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..54e6bd83
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/x86_64/x32/tst-size_t-strncmp.c
|
|
||||||
@@ -0,0 +1,78 @@
|
|
||||||
+/* Test strncmp with size_t in the lower 32 bits of 64-bit register.
|
|
||||||
+ Copyright (C) 2019 Free Software Foundation, Inc.
|
|
||||||
+ This file is part of the GNU C Library.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is free software; you can redistribute it and/or
|
|
||||||
+ modify it under the terms of the GNU Lesser General Public
|
|
||||||
+ License as published by the Free Software Foundation; either
|
|
||||||
+ version 2.1 of the License, or (at your option) any later version.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
||||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
+ Lesser General Public License for more details.
|
|
||||||
+
|
|
||||||
+ You should have received a copy of the GNU Lesser General Public
|
|
||||||
+ License along with the GNU C Library; if not, see
|
|
||||||
+ <http://www.gnu.org/licenses/>. */
|
|
||||||
+
|
|
||||||
+#ifdef WIDE
|
|
||||||
+# define TEST_NAME "wcsncmp"
|
|
||||||
+#else
|
|
||||||
+# define TEST_NAME "strncmp"
|
|
||||||
+#endif
|
|
||||||
+
|
|
||||||
+#include "test-size_t.h"
|
|
||||||
+
|
|
||||||
+#ifdef WIDE
|
|
||||||
+# include <wchar.h>
|
|
||||||
+
|
|
||||||
+# define STRNCMP wcsncmp
|
|
||||||
+# define STRNCPY wcsncpy
|
|
||||||
+# define CHAR wchar_t
|
|
||||||
+#else
|
|
||||||
+# define STRNCMP strncmp
|
|
||||||
+# define STRNCPY strncpy
|
|
||||||
+# define CHAR char
|
|
||||||
+#endif
|
|
||||||
+
|
|
||||||
+IMPL (STRNCMP, 1)
|
|
||||||
+
|
|
||||||
+typedef int (*proto_t) (const CHAR *, const CHAR *, size_t);
|
|
||||||
+
|
|
||||||
+
|
|
||||||
+static int
|
|
||||||
+__attribute__ ((noinline, noclone))
|
|
||||||
+do_strncmp (parameter_t a, parameter_t b)
|
|
||||||
+{
|
|
||||||
+ return CALL (&b, a.p, b.p, a.len);
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+static int
|
|
||||||
+test_main (void)
|
|
||||||
+{
|
|
||||||
+ test_init ();
|
|
||||||
+
|
|
||||||
+ size_t size = page_size / sizeof (CHAR);
|
|
||||||
+ parameter_t dest = { { size }, buf1 };
|
|
||||||
+ parameter_t src = { { 0 }, buf2 };
|
|
||||||
+
|
|
||||||
+ STRNCPY ((CHAR *) buf1, (const CHAR *) buf2, size);
|
|
||||||
+
|
|
||||||
+ int ret = 0;
|
|
||||||
+ FOR_EACH_IMPL (impl, 0)
|
|
||||||
+ {
|
|
||||||
+ src.fn = impl->fn;
|
|
||||||
+ int res = do_strncmp (dest, src);
|
|
||||||
+ if (res)
|
|
||||||
+ {
|
|
||||||
+ error (0, 0, "Wrong result in function %s: %i != 0",
|
|
||||||
+ impl->name, res);
|
|
||||||
+ ret = 1;
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ return ret ? EXIT_FAILURE : EXIT_SUCCESS;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+#include <support/test-driver.c>
|
|
||||||
diff --git a/sysdeps/x86_64/x32/tst-size_t-wcsncmp.c b/sysdeps/x86_64/x32/tst-size_t-wcsncmp.c
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..4829647c
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/x86_64/x32/tst-size_t-wcsncmp.c
|
|
||||||
@@ -0,0 +1,20 @@
|
|
||||||
+/* Test wcsncmp with size_t in the lower 32 bits of 64-bit register.
|
|
||||||
+ Copyright (C) 2019 Free Software Foundation, Inc.
|
|
||||||
+ This file is part of the GNU C Library.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is free software; you can redistribute it and/or
|
|
||||||
+ modify it under the terms of the GNU Lesser General Public
|
|
||||||
+ License as published by the Free Software Foundation; either
|
|
||||||
+ version 2.1 of the License, or (at your option) any later version.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
||||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
+ Lesser General Public License for more details.
|
|
||||||
+
|
|
||||||
+ You should have received a copy of the GNU Lesser General Public
|
|
||||||
+ License along with the GNU C Library; if not, see
|
|
||||||
+ <http://www.gnu.org/licenses/>. */
|
|
||||||
+
|
|
||||||
+#define WIDE 1
|
|
||||||
+#include "tst-size_t-strncmp.c"
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,56 +0,0 @@
|
|||||||
From cf2c57526ba4b57e6863ad4db8a868e2678adce8 Mon Sep 17 00:00:00 2001
|
|
||||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
|
||||||
Date: Fri, 30 Apr 2021 05:58:59 -0700
|
|
||||||
Subject: [PATCH] x86: Set rep_movsb_threshold to 2112 on processors with FSRM
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
The glibc memcpy benchmark on Intel Core i7-1065G7 (Ice Lake) showed
|
|
||||||
that REP MOVSB became faster after 2112 bytes:
|
|
||||||
|
|
||||||
Vector Move REP MOVSB
|
|
||||||
length=2112, align1=0, align2=0: 24.20 24.40
|
|
||||||
length=2112, align1=1, align2=0: 26.07 23.13
|
|
||||||
length=2112, align1=0, align2=1: 27.18 28.13
|
|
||||||
length=2112, align1=1, align2=1: 26.23 25.16
|
|
||||||
length=2176, align1=0, align2=0: 23.18 22.52
|
|
||||||
length=2176, align1=2, align2=0: 25.45 22.52
|
|
||||||
length=2176, align1=0, align2=2: 27.14 27.82
|
|
||||||
length=2176, align1=2, align2=2: 22.73 25.56
|
|
||||||
length=2240, align1=0, align2=0: 24.62 24.25
|
|
||||||
length=2240, align1=3, align2=0: 29.77 27.15
|
|
||||||
length=2240, align1=0, align2=3: 35.55 29.93
|
|
||||||
length=2240, align1=3, align2=3: 34.49 25.15
|
|
||||||
length=2304, align1=0, align2=0: 34.75 26.64
|
|
||||||
length=2304, align1=4, align2=0: 32.09 22.63
|
|
||||||
length=2304, align1=0, align2=4: 28.43 31.24
|
|
||||||
|
|
||||||
Use REP MOVSB for data size > 2112 bytes in memcpy on processors with
|
|
||||||
fast short REP MOVSB (FSRM).
|
|
||||||
|
|
||||||
* sysdeps/x86/dl-cacheinfo.h (dl_init_cacheinfo): Set
|
|
||||||
rep_movsb_threshold to 2112 on processors with fast short REP
|
|
||||||
MOVSB (FSRM).
|
|
||||||
---
|
|
||||||
sysdeps/x86/cacheinfo.h | 6 ++++++
|
|
||||||
1 file changed, 6 insertions(+)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
|
|
||||||
index f72f634a..cc3941d3 100644
|
|
||||||
--- a/sysdeps/x86/cacheinfo.h
|
|
||||||
+++ b/sysdeps/x86/cacheinfo.h
|
|
||||||
@@ -430,6 +430,12 @@ init_cacheinfo (void)
|
|
||||||
rep_movsb_threshold = 2048 * (16 / 16);
|
|
||||||
minimum_rep_movsb_threshold = 16 * 8;
|
|
||||||
}
|
|
||||||
+
|
|
||||||
+ /* NB: The default REP MOVSB threshold is 2112 on processors with fast
|
|
||||||
+ short REP MOVSB (FSRM). */
|
|
||||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
|
|
||||||
+ rep_movsb_threshold = 2112;
|
|
||||||
+
|
|
||||||
if (cpu_features->rep_movsb_threshold > minimum_rep_movsb_threshold)
|
|
||||||
__x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold;
|
|
||||||
else
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,51 +0,0 @@
|
|||||||
From d672a98a1af106bd68deb15576710cd61363f7a6 Mon Sep 17 00:00:00 2001
|
|
||||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
|
||||||
Date: Tue, 2 Nov 2021 18:33:07 -0700
|
|
||||||
Subject: [PATCH] Add LLL_MUTEX_READ_LOCK [BZ #28537]
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
CAS instruction is expensive. From the x86 CPU's point of view, getting
|
|
||||||
a cache line for writing is more expensive than reading. See Appendix
|
|
||||||
A.2 Spinlock in:
|
|
||||||
|
|
||||||
https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/xeon-lock-scaling-analysis-paper.pdf
|
|
||||||
|
|
||||||
The full compare and swap will grab the cache line exclusive and cause
|
|
||||||
excessive cache line bouncing.
|
|
||||||
|
|
||||||
Add LLL_MUTEX_READ_LOCK to do an atomic load and skip CAS in spinlock
|
|
||||||
loop if compare may fail to reduce cache line bouncing on contended locks.
|
|
||||||
|
|
||||||
Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
|
|
||||||
---
|
|
||||||
nptl/pthread_mutex_lock.c | 7 +++++++
|
|
||||||
1 file changed, 7 insertions(+)
|
|
||||||
|
|
||||||
diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
|
|
||||||
index 60ada70d..eb4d8baa 100644
|
|
||||||
--- a/nptl/pthread_mutex_lock.c
|
|
||||||
+++ b/nptl/pthread_mutex_lock.c
|
|
||||||
@@ -56,6 +56,11 @@
|
|
||||||
#define FORCE_ELISION(m, s)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
+#ifndef LLL_MUTEX_READ_LOCK
|
|
||||||
+# define LLL_MUTEX_READ_LOCK(mutex) \
|
|
||||||
+ atomic_load_relaxed (&(mutex)->__data.__lock)
|
|
||||||
+#endif
|
|
||||||
+
|
|
||||||
static int __pthread_mutex_lock_full (pthread_mutex_t *mutex)
|
|
||||||
__attribute_noinline__;
|
|
||||||
|
|
||||||
@@ -136,6 +141,8 @@ __pthread_mutex_lock (pthread_mutex_t *mutex)
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
atomic_spin_nop ();
|
|
||||||
+ if (LLL_MUTEX_READ_LOCK (mutex) != 0)
|
|
||||||
+ continue;
|
|
||||||
}
|
|
||||||
while (LLL_MUTEX_TRYLOCK (mutex) != 0);
|
|
||||||
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,71 +0,0 @@
|
|||||||
From 120ac6d238825452e8024e2f627da33b2508dfd3 Mon Sep 17 00:00:00 2001
|
|
||||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
|
||||||
Date: Fri, 12 Nov 2021 11:47:42 -0800
|
|
||||||
Subject: [PATCH] Move assignment out of the CAS condition
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
Update
|
|
||||||
|
|
||||||
commit 49302b8fdf9103b6fc0a398678668a22fa19574c
|
|
||||||
Author: H.J. Lu <hjl.tools@gmail.com>
|
|
||||||
Date: Thu Nov 11 06:54:01 2021 -0800
|
|
||||||
|
|
||||||
Avoid extra load with CAS in __pthread_mutex_clocklock_common [BZ #28537]
|
|
||||||
|
|
||||||
Replace boolean CAS with value CAS to avoid the extra load.
|
|
||||||
|
|
||||||
and
|
|
||||||
|
|
||||||
commit 0b82747dc48d5bf0871bdc6da8cb6eec1256355f
|
|
||||||
Author: H.J. Lu <hjl.tools@gmail.com>
|
|
||||||
Date: Thu Nov 11 06:31:51 2021 -0800
|
|
||||||
|
|
||||||
Avoid extra load with CAS in __pthread_mutex_lock_full [BZ #28537]
|
|
||||||
|
|
||||||
Replace boolean CAS with value CAS to avoid the extra load.
|
|
||||||
|
|
||||||
by moving assignment out of the CAS condition.
|
|
||||||
---
|
|
||||||
nptl/pthread_mutex_lock.c | 7 +++----
|
|
||||||
nptl/pthread_mutex_timedlock.c | 7 +++----
|
|
||||||
2 files changed, 6 insertions(+), 8 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
|
|
||||||
index eb4d8baa..a633d95e 100644
|
|
||||||
--- a/nptl/pthread_mutex_lock.c
|
|
||||||
+++ b/nptl/pthread_mutex_lock.c
|
|
||||||
@@ -299,10 +299,9 @@ __pthread_mutex_lock_full (pthread_mutex_t *mutex)
|
|
||||||
meantime. */
|
|
||||||
if ((oldval & FUTEX_WAITERS) == 0)
|
|
||||||
{
|
|
||||||
- int val;
|
|
||||||
- if ((val = atomic_compare_and_exchange_val_acq
|
|
||||||
- (&mutex->__data.__lock, oldval | FUTEX_WAITERS,
|
|
||||||
- oldval)) != oldval)
|
|
||||||
+ int val = atomic_compare_and_exchange_val_acq
|
|
||||||
+ (&mutex->__data.__lock, oldval | FUTEX_WAITERS, oldval);
|
|
||||||
+ if (val != oldval)
|
|
||||||
{
|
|
||||||
oldval = val;
|
|
||||||
continue;
|
|
||||||
diff --git a/nptl/pthread_mutex_timedlock.c b/nptl/pthread_mutex_timedlock.c
|
|
||||||
index c4627ef6..a76c30b7 100644
|
|
||||||
--- a/nptl/pthread_mutex_timedlock.c
|
|
||||||
+++ b/nptl/pthread_mutex_timedlock.c
|
|
||||||
@@ -269,10 +269,9 @@ __pthread_mutex_timedlock (pthread_mutex_t *mutex,
|
|
||||||
meantime. */
|
|
||||||
if ((oldval & FUTEX_WAITERS) == 0)
|
|
||||||
{
|
|
||||||
- int val;
|
|
||||||
- if ((val = atomic_compare_and_exchange_val_acq
|
|
||||||
- (&mutex->__data.__lock, oldval | FUTEX_WAITERS,
|
|
||||||
- oldval)) != oldval)
|
|
||||||
+ int val = atomic_compare_and_exchange_val_acq
|
|
||||||
+ (&mutex->__data.__lock, oldval | FUTEX_WAITERS, oldval);
|
|
||||||
+ if (val != oldval)
|
|
||||||
{
|
|
||||||
oldval = val;
|
|
||||||
continue;
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,60 +0,0 @@
|
|||||||
From 4df1fa6ddc8925a75f3da644d5da3bb16eb33f02 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
Date: Fri, 3 Dec 2021 15:29:25 -0800
|
|
||||||
Subject: [PATCH] x86-64: Use notl in EVEX strcmp [BZ #28646]
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
Must use notl %edi here as lower bits are for CHAR comparisons
|
|
||||||
potentially out of range thus can be 0 without indicating mismatch.
|
|
||||||
This fixes BZ #28646.
|
|
||||||
|
|
||||||
Co-Authored-By: H.J. Lu <hjl.tools@gmail.com>
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/multiarch/strcmp-evex.S | 14 ++++++++------
|
|
||||||
1 file changed, 8 insertions(+), 6 deletions(-)
|
|
||||||
|
|
||||||
Conflicts:
|
|
||||||
string/test-strcmp.c
|
|
||||||
(new check omitted)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
|
|
||||||
index 82f12ac8..6f5c4bf9 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
|
|
||||||
@@ -656,12 +656,13 @@ L(loop_cross_page):
|
|
||||||
in YMM3 and 32 bytes at VEC_SIZE(%rdx, %r10). */
|
|
||||||
VPCMP $0, VEC_SIZE(%rdx, %r10), %YMM3, %k3{%k4}
|
|
||||||
kmovd %k3, %edi
|
|
||||||
+ /* Must use notl %edi here as lower bits are for CHAR
|
|
||||||
+ comparisons potentially out of range thus can be 0 without
|
|
||||||
+ indicating mismatch. */
|
|
||||||
+ notl %edi
|
|
||||||
# ifdef USE_AS_WCSCMP
|
|
||||||
/* Don't use subl since it is the upper 8 bits of EDI below. */
|
|
||||||
- notl %edi
|
|
||||||
andl $0xff, %edi
|
|
||||||
-# else
|
|
||||||
- incl %edi
|
|
||||||
# endif
|
|
||||||
|
|
||||||
# ifdef USE_AS_WCSCMP
|
|
||||||
@@ -743,12 +744,13 @@ L(loop_cross_page_2_vec):
|
|
||||||
in YMM1 and 32 bytes at (VEC_SIZE * 3)(%rdx, %r10). */
|
|
||||||
VPCMP $0, (VEC_SIZE * 3)(%rdx, %r10), %YMM1, %k3{%k4}
|
|
||||||
kmovd %k3, %edi
|
|
||||||
+ /* Must use notl %edi here as lower bits are for CHAR
|
|
||||||
+ comparisons potentially out of range thus can be 0 without
|
|
||||||
+ indicating mismatch. */
|
|
||||||
+ notl %edi
|
|
||||||
# ifdef USE_AS_WCSCMP
|
|
||||||
/* Don't use subl since it is the upper 8 bits of EDI below. */
|
|
||||||
- notl %edi
|
|
||||||
andl $0xff, %edi
|
|
||||||
-# else
|
|
||||||
- incl %edi
|
|
||||||
# endif
|
|
||||||
|
|
||||||
# ifdef USE_AS_WCSCMP
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,153 +0,0 @@
|
|||||||
From c7c54f65b080affb87a1513dee449c8ad6143c8b Mon Sep 17 00:00:00 2001
|
|
||||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
|
||||||
Date: Mon, 21 Jan 2019 11:35:18 -0800
|
|
||||||
Subject: [PATCH] x86-64 strncpy: Properly handle the length parameter [BZ#
|
|
||||||
24097]
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
On x32, the size_t parameter may be passed in the lower 32 bits of a
|
|
||||||
64-bit register with the non-zero upper 32 bits. The string/memory
|
|
||||||
functions written in assembly can only use the lower 32 bits of a
|
|
||||||
64-bit register as length or must clear the upper 32 bits before using
|
|
||||||
the full 64-bit register for length.
|
|
||||||
|
|
||||||
This patch fixes strncpy for x32. Tested on x86-64 and x32. On x86-64,
|
|
||||||
libc.so is the same with and without the fix.
|
|
||||||
|
|
||||||
[BZ# 24097]
|
|
||||||
CVE-2019-6488
|
|
||||||
* sysdeps/x86_64/multiarch/strcpy-avx2.S: Use RDX_LP for length.
|
|
||||||
* sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S: Likewise.
|
|
||||||
* sysdeps/x86_64/multiarch/strcpy-ssse3.S: Likewise.
|
|
||||||
* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-strncpy.
|
|
||||||
* sysdeps/x86_64/x32/tst-size_t-strncpy.c: New file.
|
|
||||||
---
|
|
||||||
.../x86_64/multiarch/strcpy-sse2-unaligned.S | 4 +-
|
|
||||||
sysdeps/x86_64/multiarch/strcpy-ssse3.S | 6 +-
|
|
||||||
sysdeps/x86_64/x32/Makefile | 2 +-
|
|
||||||
sysdeps/x86_64/x32/tst-size_t-strncpy.c | 58 +++++++++++++++++++
|
|
||||||
4 files changed, 64 insertions(+), 6 deletions(-)
|
|
||||||
create mode 100644 sysdeps/x86_64/x32/tst-size_t-strncpy.c
|
|
||||||
|
|
||||||
Conflicts:
|
|
||||||
ChangeLog
|
|
||||||
(removed)
|
|
||||||
sysdeps/x86_64/multiarch/strcpy-avx2.S
|
|
||||||
(skipped, only needed for x32 arch)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
|
|
||||||
index 72bf7e85..50aca22d 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
|
|
||||||
@@ -40,8 +40,8 @@
|
|
||||||
.text
|
|
||||||
ENTRY (STRCPY)
|
|
||||||
# ifdef USE_AS_STRNCPY
|
|
||||||
- mov %rdx, %r8
|
|
||||||
- test %r8, %r8
|
|
||||||
+ mov %RDX_LP, %R8_LP
|
|
||||||
+ test %R8_LP, %R8_LP
|
|
||||||
jz L(ExitZero)
|
|
||||||
# endif
|
|
||||||
mov %rsi, %rcx
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
|
|
||||||
index 9858d0c4..0a62814a 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
|
|
||||||
@@ -31,13 +31,13 @@ ENTRY (STRCPY)
|
|
||||||
|
|
||||||
mov %rsi, %rcx
|
|
||||||
# ifdef USE_AS_STRNCPY
|
|
||||||
- mov %rdx, %r8
|
|
||||||
+ mov %RDX_LP, %R8_LP
|
|
||||||
# endif
|
|
||||||
mov %rdi, %rdx
|
|
||||||
# ifdef USE_AS_STRNCPY
|
|
||||||
- test %r8, %r8
|
|
||||||
+ test %R8_LP, %R8_LP
|
|
||||||
jz L(Exit0)
|
|
||||||
- cmp $8, %r8
|
|
||||||
+ cmp $8, %R8_LP
|
|
||||||
jbe L(StrncpyExit8Bytes)
|
|
||||||
# endif
|
|
||||||
cmpb $0, (%rcx)
|
|
||||||
diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
|
|
||||||
index db302839..2a9e20a9 100644
|
|
||||||
--- a/sysdeps/x86_64/x32/Makefile
|
|
||||||
+++ b/sysdeps/x86_64/x32/Makefile
|
|
||||||
@@ -8,7 +8,7 @@ endif
|
|
||||||
ifeq ($(subdir),string)
|
|
||||||
tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \
|
|
||||||
tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \
|
|
||||||
- tst-size_t-strncmp
|
|
||||||
+ tst-size_t-strncmp tst-size_t-strncpy
|
|
||||||
endif
|
|
||||||
|
|
||||||
ifeq ($(subdir),wcsmbs)
|
|
||||||
diff --git a/sysdeps/x86_64/x32/tst-size_t-strncpy.c b/sysdeps/x86_64/x32/tst-size_t-strncpy.c
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..4dec71e6
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/x86_64/x32/tst-size_t-strncpy.c
|
|
||||||
@@ -0,0 +1,58 @@
|
|
||||||
+/* Test strncpy with size_t in the lower 32 bits of 64-bit register.
|
|
||||||
+ Copyright (C) 2019 Free Software Foundation, Inc.
|
|
||||||
+ This file is part of the GNU C Library.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is free software; you can redistribute it and/or
|
|
||||||
+ modify it under the terms of the GNU Lesser General Public
|
|
||||||
+ License as published by the Free Software Foundation; either
|
|
||||||
+ version 2.1 of the License, or (at your option) any later version.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
||||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
+ Lesser General Public License for more details.
|
|
||||||
+
|
|
||||||
+ You should have received a copy of the GNU Lesser General Public
|
|
||||||
+ License along with the GNU C Library; if not, see
|
|
||||||
+ <http://www.gnu.org/licenses/>. */
|
|
||||||
+
|
|
||||||
+#define TEST_NAME "strncpy"
|
|
||||||
+#include "test-size_t.h"
|
|
||||||
+
|
|
||||||
+IMPL (strncpy, 1)
|
|
||||||
+
|
|
||||||
+typedef char *(*proto_t) (char *, const char*, size_t);
|
|
||||||
+
|
|
||||||
+static void *
|
|
||||||
+__attribute__ ((noinline, noclone))
|
|
||||||
+do_strncpy (parameter_t a, parameter_t b)
|
|
||||||
+{
|
|
||||||
+ return CALL (&b, a.p, b.p, a.len);
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+static int
|
|
||||||
+test_main (void)
|
|
||||||
+{
|
|
||||||
+ test_init ();
|
|
||||||
+
|
|
||||||
+ parameter_t dest = { { page_size }, buf1 };
|
|
||||||
+ parameter_t src = { { 0 }, buf2 };
|
|
||||||
+
|
|
||||||
+ int ret = 0;
|
|
||||||
+ FOR_EACH_IMPL (impl, 0)
|
|
||||||
+ {
|
|
||||||
+ src.fn = impl->fn;
|
|
||||||
+ do_strncpy (dest, src);
|
|
||||||
+ int res = strncmp (dest.p, src.p, dest.len);
|
|
||||||
+ if (res)
|
|
||||||
+ {
|
|
||||||
+ error (0, 0, "Wrong result in function %s: %i != 0",
|
|
||||||
+ impl->name, res);
|
|
||||||
+ ret = 1;
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ return ret ? EXIT_FAILURE : EXIT_SUCCESS;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+#include <support/test-driver.c>
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,43 +0,0 @@
|
|||||||
From 6b8dbbd03ac88f169b65b5c7d7278576a11d2e44 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Jangwoong Kim <6812skiii@gmail.com>
|
|
||||||
Date: Tue, 14 Dec 2021 21:30:51 +0900
|
|
||||||
Subject: [PATCH] nptl: Effectively skip CAS in spinlock loop
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
The commit:
|
|
||||||
"Add LLL_MUTEX_READ_LOCK [BZ #28537]"
|
|
||||||
SHA1: d672a98a1af106bd68deb15576710cd61363f7a6
|
|
||||||
|
|
||||||
introduced LLL_MUTEX_READ_LOCK, to skip CAS in spinlock loop
|
|
||||||
if atomic load fails. But, "continue" inside of do-while loop
|
|
||||||
does not skip the evaluation of escape expression, thus CAS
|
|
||||||
is not skipped.
|
|
||||||
|
|
||||||
Replace do-while with while and skip LLL_MUTEX_TRYLOCK if
|
|
||||||
LLL_MUTEX_READ_LOCK fails.
|
|
||||||
|
|
||||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
|
||||||
---
|
|
||||||
nptl/pthread_mutex_lock.c | 5 ++---
|
|
||||||
1 file changed, 2 insertions(+), 3 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
|
|
||||||
index a633d95e..d96a9933 100644
|
|
||||||
--- a/nptl/pthread_mutex_lock.c
|
|
||||||
+++ b/nptl/pthread_mutex_lock.c
|
|
||||||
@@ -141,10 +141,9 @@ __pthread_mutex_lock (pthread_mutex_t *mutex)
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
atomic_spin_nop ();
|
|
||||||
- if (LLL_MUTEX_READ_LOCK (mutex) != 0)
|
|
||||||
- continue;
|
|
||||||
}
|
|
||||||
- while (LLL_MUTEX_TRYLOCK (mutex) != 0);
|
|
||||||
+ while (LLL_MUTEX_READ_LOCK (mutex) != 0
|
|
||||||
+ || LLL_MUTEX_TRYLOCK (mutex) != 0);
|
|
||||||
|
|
||||||
mutex->__data.__spins += (cnt - mutex->__data.__spins) / 8;
|
|
||||||
}
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,37 +0,0 @@
|
|||||||
From b98d0bbf747f39770e0caba7e984ce9f8f900330 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
Date: Fri, 18 Feb 2022 17:00:25 -0600
|
|
||||||
Subject: [PATCH] x86: Fix TEST_NAME to make it a string in tst-strncmp-rtm.c
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
Previously TEST_NAME was passing a function pointer. This didn't fail
|
|
||||||
because of the -Wno-error flag (to allow for overflow sizes passed
|
|
||||||
to strncmp/wcsncmp)
|
|
||||||
|
|
||||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
|
||||||
---
|
|
||||||
sysdeps/x86/tst-strncmp-rtm.c | 4 ++--
|
|
||||||
1 file changed, 2 insertions(+), 2 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
|
|
||||||
index 4e9f094f..aef9866c 100644
|
|
||||||
--- a/sysdeps/x86/tst-strncmp-rtm.c
|
|
||||||
+++ b/sysdeps/x86/tst-strncmp-rtm.c
|
|
||||||
@@ -23,12 +23,12 @@
|
|
||||||
# define CHAR wchar_t
|
|
||||||
# define MEMSET wmemset
|
|
||||||
# define STRNCMP wcsncmp
|
|
||||||
-# define TEST_NAME wcsncmp
|
|
||||||
+# define TEST_NAME "wcsncmp"
|
|
||||||
#else /* !WIDE */
|
|
||||||
# define CHAR char
|
|
||||||
# define MEMSET memset
|
|
||||||
# define STRNCMP strncmp
|
|
||||||
-# define TEST_NAME strncmp
|
|
||||||
+# define TEST_NAME "strncmp"
|
|
||||||
#endif /* !WIDE */
|
|
||||||
|
|
||||||
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,33 +0,0 @@
|
|||||||
From c15efd011cea3d8f0494269eb539583215a1feed Mon Sep 17 00:00:00 2001
|
|
||||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
|
||||||
Date: Fri, 4 Feb 2022 11:09:10 -0800
|
|
||||||
Subject: [PATCH] x86-64: Fix strcmp-avx2.S
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
Change "movl %edx, %rdx" to "movl %edx, %edx" in:
|
|
||||||
|
|
||||||
commit b77b06e0e296f1a2276c27a67e1d44f2cfa38d45
|
|
||||||
Author: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
Date: Mon Jan 10 15:35:38 2022 -0600
|
|
||||||
|
|
||||||
x86: Optimize strcmp-avx2.S
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/multiarch/strcmp-avx2.S | 2 +-
|
|
||||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
|
|
||||||
index 554ffe4c..04675aa4 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
|
|
||||||
@@ -106,7 +106,7 @@ ENTRY(STRCMP)
|
|
||||||
# ifdef USE_AS_STRNCMP
|
|
||||||
# ifdef __ILP32__
|
|
||||||
/* Clear the upper 32 bits. */
|
|
||||||
- movl %edx, %rdx
|
|
||||||
+ movl %edx, %edx
|
|
||||||
# endif
|
|
||||||
cmp $1, %RDX_LP
|
|
||||||
/* Signed comparison intentional. We use this branch to also
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,33 +0,0 @@
|
|||||||
From 0e0199a9e02ebe42e2b36958964d63f03573c382 Mon Sep 17 00:00:00 2001
|
|
||||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
|
||||||
Date: Fri, 4 Feb 2022 11:11:08 -0800
|
|
||||||
Subject: [PATCH] x86-64: Fix strcmp-evex.S
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
Change "movl %edx, %rdx" to "movl %edx, %edx" in:
|
|
||||||
|
|
||||||
commit 8418eb3ff4b781d31c4ed5dc6c0bd7356bc45db9
|
|
||||||
Author: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
Date: Mon Jan 10 15:35:39 2022 -0600
|
|
||||||
|
|
||||||
x86: Optimize strcmp-evex.S
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/multiarch/strcmp-evex.S | 2 +-
|
|
||||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
|
|
||||||
index 99d8409a..ed56af8e 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
|
|
||||||
@@ -116,7 +116,7 @@ ENTRY(STRCMP)
|
|
||||||
# ifdef USE_AS_STRNCMP
|
|
||||||
# ifdef __ILP32__
|
|
||||||
/* Clear the upper 32 bits. */
|
|
||||||
- movl %edx, %rdx
|
|
||||||
+ movl %edx, %edx
|
|
||||||
# endif
|
|
||||||
cmp $1, %RDX_LP
|
|
||||||
/* Signed comparison intentional. We use this branch to also
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,40 +0,0 @@
|
|||||||
From 1b0c60f95bbe2eded80b2bb5be75c0e45b11cde1 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
Date: Mon, 7 Feb 2022 00:32:23 -0600
|
|
||||||
Subject: [PATCH] x86: Remove SSSE3 instruction for broadcast in memset.S (SSE2
|
|
||||||
Only)
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
commit b62ace2740a106222e124cc86956448fa07abf4d
|
|
||||||
Author: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
Date: Sun Feb 6 00:54:18 2022 -0600
|
|
||||||
|
|
||||||
x86: Improve vec generation in memset-vec-unaligned-erms.S
|
|
||||||
|
|
||||||
Revert usage of 'pshufb' in broadcast logic as it is an SSSE3
|
|
||||||
instruction and memset.S is restricted to only SSE2 instructions.
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/memset.S | 7 ++++---
|
|
||||||
1 file changed, 4 insertions(+), 3 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
|
|
||||||
index 27debd2b..4cb4aa71 100644
|
|
||||||
--- a/sysdeps/x86_64/memset.S
|
|
||||||
+++ b/sysdeps/x86_64/memset.S
|
|
||||||
@@ -30,9 +30,10 @@
|
|
||||||
|
|
||||||
# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
|
|
||||||
movd d, %xmm0; \
|
|
||||||
- pxor %xmm1, %xmm1; \
|
|
||||||
- pshufb %xmm1, %xmm0; \
|
|
||||||
- movq r, %rax
|
|
||||||
+ movq r, %rax; \
|
|
||||||
+ punpcklbw %xmm0, %xmm0; \
|
|
||||||
+ punpcklwd %xmm0, %xmm0; \
|
|
||||||
+ pshufd $0, %xmm0, %xmm0
|
|
||||||
|
|
||||||
# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
|
|
||||||
movd d, %xmm0; \
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,218 +0,0 @@
|
|||||||
From 5165de69c0908e28a380cbd4bb054e55ea4abc95 Mon Sep 17 00:00:00 2001
|
|
||||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
|
||||||
Date: Mon, 21 Jan 2019 11:36:36 -0800
|
|
||||||
Subject: [PATCH] x86-64 strnlen/wcsnlen: Properly handle the length parameter
|
|
||||||
[BZ# 24097]
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
On x32, the size_t parameter may be passed in the lower 32 bits of a
|
|
||||||
64-bit register with the non-zero upper 32 bits. The string/memory
|
|
||||||
functions written in assembly can only use the lower 32 bits of a
|
|
||||||
64-bit register as length or must clear the upper 32 bits before using
|
|
||||||
the full 64-bit register for length.
|
|
||||||
|
|
||||||
This patch fixes strnlen/wcsnlen for x32. Tested on x86-64 and x32. On
|
|
||||||
x86-64, libc.so is the same with and without the fix.
|
|
||||||
|
|
||||||
[BZ# 24097]
|
|
||||||
CVE-2019-6488
|
|
||||||
* sysdeps/x86_64/multiarch/strlen-avx2.S: Use RSI_LP for length.
|
|
||||||
Clear the upper 32 bits of RSI register.
|
|
||||||
* sysdeps/x86_64/strlen.S: Use RSI_LP for length.
|
|
||||||
* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-strnlen
|
|
||||||
and tst-size_t-wcsnlen.
|
|
||||||
* sysdeps/x86_64/x32/tst-size_t-strnlen.c: New file.
|
|
||||||
* sysdeps/x86_64/x32/tst-size_t-wcsnlen.c: Likewise.
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/multiarch/strlen-avx2.S | 9 ++--
|
|
||||||
sysdeps/x86_64/strlen.S | 12 ++---
|
|
||||||
sysdeps/x86_64/x32/Makefile | 4 +-
|
|
||||||
sysdeps/x86_64/x32/tst-size_t-strnlen.c | 72 +++++++++++++++++++++++++
|
|
||||||
sysdeps/x86_64/x32/tst-size_t-wcsnlen.c | 20 +++++++
|
|
||||||
5 files changed, 106 insertions(+), 11 deletions(-)
|
|
||||||
create mode 100644 sysdeps/x86_64/x32/tst-size_t-strnlen.c
|
|
||||||
create mode 100644 sysdeps/x86_64/x32/tst-size_t-wcsnlen.c
|
|
||||||
|
|
||||||
Conflicts:
|
|
||||||
ChangeLog
|
|
||||||
(removed)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
|
|
||||||
index fb2418cd..645e0446 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/strlen-avx2.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
|
|
||||||
@@ -42,12 +42,15 @@
|
|
||||||
ENTRY (STRLEN)
|
|
||||||
# ifdef USE_AS_STRNLEN
|
|
||||||
/* Check for zero length. */
|
|
||||||
- testq %rsi, %rsi
|
|
||||||
+ test %RSI_LP, %RSI_LP
|
|
||||||
jz L(zero)
|
|
||||||
# ifdef USE_AS_WCSLEN
|
|
||||||
- shl $2, %rsi
|
|
||||||
+ shl $2, %RSI_LP
|
|
||||||
+# elif defined __ILP32__
|
|
||||||
+ /* Clear the upper 32 bits. */
|
|
||||||
+ movl %esi, %esi
|
|
||||||
# endif
|
|
||||||
- movq %rsi, %r8
|
|
||||||
+ mov %RSI_LP, %R8_LP
|
|
||||||
# endif
|
|
||||||
movl %edi, %ecx
|
|
||||||
movq %rdi, %rdx
|
|
||||||
diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
|
|
||||||
index 01cb5fa8..f845f3d4 100644
|
|
||||||
--- a/sysdeps/x86_64/strlen.S
|
|
||||||
+++ b/sysdeps/x86_64/strlen.S
|
|
||||||
@@ -59,21 +59,21 @@ ENTRY(strlen)
|
|
||||||
|
|
||||||
#ifdef AS_STRNLEN
|
|
||||||
/* Do not read anything when n==0. */
|
|
||||||
- test %rsi, %rsi
|
|
||||||
+ test %RSI_LP, %RSI_LP
|
|
||||||
jne L(n_nonzero)
|
|
||||||
xor %rax, %rax
|
|
||||||
ret
|
|
||||||
L(n_nonzero):
|
|
||||||
# ifdef AS_WCSLEN
|
|
||||||
- shlq $2, %rsi
|
|
||||||
+ shl $2, %RSI_LP
|
|
||||||
# endif
|
|
||||||
|
|
||||||
/* Initialize long lived registers. */
|
|
||||||
|
|
||||||
- add %rdi, %rsi
|
|
||||||
- mov %rsi, %r10
|
|
||||||
- and $-64, %r10
|
|
||||||
- mov %rsi, %r11
|
|
||||||
+ add %RDI_LP, %RSI_LP
|
|
||||||
+ mov %RSI_LP, %R10_LP
|
|
||||||
+ and $-64, %R10_LP
|
|
||||||
+ mov %RSI_LP, %R11_LP
|
|
||||||
#endif
|
|
||||||
|
|
||||||
pxor %xmm0, %xmm0
|
|
||||||
diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
|
|
||||||
index 2a9e20a9..1557724b 100644
|
|
||||||
--- a/sysdeps/x86_64/x32/Makefile
|
|
||||||
+++ b/sysdeps/x86_64/x32/Makefile
|
|
||||||
@@ -8,10 +8,10 @@ endif
|
|
||||||
ifeq ($(subdir),string)
|
|
||||||
tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \
|
|
||||||
tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \
|
|
||||||
- tst-size_t-strncmp tst-size_t-strncpy
|
|
||||||
+ tst-size_t-strncmp tst-size_t-strncpy tst-size_t-strnlen
|
|
||||||
endif
|
|
||||||
|
|
||||||
ifeq ($(subdir),wcsmbs)
|
|
||||||
tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset \
|
|
||||||
- tst-size_t-wcsncmp
|
|
||||||
+ tst-size_t-wcsncmp tst-size_t-wcsnlen
|
|
||||||
endif
|
|
||||||
diff --git a/sysdeps/x86_64/x32/tst-size_t-strnlen.c b/sysdeps/x86_64/x32/tst-size_t-strnlen.c
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..690a4a8a
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/x86_64/x32/tst-size_t-strnlen.c
|
|
||||||
@@ -0,0 +1,72 @@
|
|
||||||
+/* Test strnlen with size_t in the lower 32 bits of 64-bit register.
|
|
||||||
+ Copyright (C) 2019 Free Software Foundation, Inc.
|
|
||||||
+ This file is part of the GNU C Library.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is free software; you can redistribute it and/or
|
|
||||||
+ modify it under the terms of the GNU Lesser General Public
|
|
||||||
+ License as published by the Free Software Foundation; either
|
|
||||||
+ version 2.1 of the License, or (at your option) any later version.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
||||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
+ Lesser General Public License for more details.
|
|
||||||
+
|
|
||||||
+ You should have received a copy of the GNU Lesser General Public
|
|
||||||
+ License along with the GNU C Library; if not, see
|
|
||||||
+ <http://www.gnu.org/licenses/>. */
|
|
||||||
+
|
|
||||||
+#ifdef WIDE
|
|
||||||
+# define TEST_NAME "wcsnlen"
|
|
||||||
+#else
|
|
||||||
+# define TEST_NAME "strnlen"
|
|
||||||
+#endif /* WIDE */
|
|
||||||
+
|
|
||||||
+#include "test-size_t.h"
|
|
||||||
+
|
|
||||||
+#ifdef WIDE
|
|
||||||
+# include <wchar.h>
|
|
||||||
+# define STRNLEN wcsnlen
|
|
||||||
+# define CHAR wchar_t
|
|
||||||
+#else
|
|
||||||
+# define STRNLEN strnlen
|
|
||||||
+# define CHAR char
|
|
||||||
+#endif /* WIDE */
|
|
||||||
+
|
|
||||||
+IMPL (STRNLEN, 1)
|
|
||||||
+
|
|
||||||
+typedef size_t (*proto_t) (const CHAR *, size_t);
|
|
||||||
+
|
|
||||||
+static size_t
|
|
||||||
+__attribute__ ((noinline, noclone))
|
|
||||||
+do_strnlen (parameter_t a, parameter_t b)
|
|
||||||
+{
|
|
||||||
+ return CALL (&a, a.p, b.len);
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+static int
|
|
||||||
+test_main (void)
|
|
||||||
+{
|
|
||||||
+ test_init ();
|
|
||||||
+
|
|
||||||
+ size_t size = page_size / sizeof (CHAR);
|
|
||||||
+ parameter_t src = { { 0 }, buf2 };
|
|
||||||
+ parameter_t c = { { size }, (void *) (uintptr_t) 'a' };
|
|
||||||
+
|
|
||||||
+ int ret = 0;
|
|
||||||
+ FOR_EACH_IMPL (impl, 0)
|
|
||||||
+ {
|
|
||||||
+ src.fn = impl->fn;
|
|
||||||
+ size_t res = do_strnlen (src, c);
|
|
||||||
+ if (res != size)
|
|
||||||
+ {
|
|
||||||
+ error (0, 0, "Wrong result in function %s: 0x%x != 0x%x",
|
|
||||||
+ impl->name, res, size);
|
|
||||||
+ ret = 1;
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ return ret ? EXIT_FAILURE : EXIT_SUCCESS;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+#include <support/test-driver.c>
|
|
||||||
diff --git a/sysdeps/x86_64/x32/tst-size_t-wcsnlen.c b/sysdeps/x86_64/x32/tst-size_t-wcsnlen.c
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..093b4bbe
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/x86_64/x32/tst-size_t-wcsnlen.c
|
|
||||||
@@ -0,0 +1,20 @@
|
|
||||||
+/* Test wcsnlen with size_t in the lower 32 bits of 64-bit register.
|
|
||||||
+ Copyright (C) 2019 Free Software Foundation, Inc.
|
|
||||||
+ This file is part of the GNU C Library.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is free software; you can redistribute it and/or
|
|
||||||
+ modify it under the terms of the GNU Lesser General Public
|
|
||||||
+ License as published by the Free Software Foundation; either
|
|
||||||
+ version 2.1 of the License, or (at your option) any later version.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
||||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
+ Lesser General Public License for more details.
|
|
||||||
+
|
|
||||||
+ You should have received a copy of the GNU Lesser General Public
|
|
||||||
+ License along with the GNU C Library; if not, see
|
|
||||||
+ <http://www.gnu.org/licenses/>. */
|
|
||||||
+
|
|
||||||
+#define WIDE 1
|
|
||||||
+#include "tst-size_t-strnlen.c"
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,33 +0,0 @@
|
|||||||
From 7912236f4a597deb092650ca79f33504ddb4af28 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
Date: Sat, 12 Feb 2022 00:45:00 -0600
|
|
||||||
Subject: [PATCH] x86: Set .text section in memset-vec-unaligned-erms
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
commit 3d9f171bfb5325bd5f427e9fc386453358c6e840
|
|
||||||
Author: H.J. Lu <hjl.tools@gmail.com>
|
|
||||||
Date: Mon Feb 7 05:55:15 2022 -0800
|
|
||||||
|
|
||||||
x86-64: Optimize bzero
|
|
||||||
|
|
||||||
Removed setting the .text section for the code. This commit
|
|
||||||
adds that back.
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S | 1 +
|
|
||||||
1 file changed, 1 insertion(+)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
|
||||||
index 06f5f5d7..4fb475c0 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
|
||||||
@@ -114,6 +114,7 @@
|
|
||||||
# error SECTION is not defined!
|
|
||||||
#endif
|
|
||||||
|
|
||||||
+ .section SECTION(.text), "ax", @progbits
|
|
||||||
#if IS_IN (libc)
|
|
||||||
# if defined SHARED
|
|
||||||
ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,36 +0,0 @@
|
|||||||
From 0fb8800029d230b3711bf722b2a47db92d0e273f Mon Sep 17 00:00:00 2001
|
|
||||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
|
||||||
Date: Thu, 10 Feb 2022 11:52:50 -0800
|
|
||||||
Subject: [PATCH] x86-64: Remove bzero weak alias in SS2 memset
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
commit 3d9f171bfb5325bd5f427e9fc386453358c6e840
|
|
||||||
Author: H.J. Lu <hjl.tools@gmail.com>
|
|
||||||
Date: Mon Feb 7 05:55:15 2022 -0800
|
|
||||||
|
|
||||||
x86-64: Optimize bzero
|
|
||||||
|
|
||||||
added the optimized bzero. Remove bzero weak alias in SSE2 memset to
|
|
||||||
avoid undefined __bzero in memset-sse2-unaligned-erms.
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S | 4 +---
|
|
||||||
1 file changed, 1 insertion(+), 3 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
|
|
||||||
index 8f579ad6..af51362b 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
|
|
||||||
@@ -31,9 +31,7 @@
|
|
||||||
# endif
|
|
||||||
|
|
||||||
# undef weak_alias
|
|
||||||
-# define weak_alias(original, alias) \
|
|
||||||
- .weak bzero; bzero = __bzero
|
|
||||||
-
|
|
||||||
+# define weak_alias(original, alias)
|
|
||||||
# undef strong_alias
|
|
||||||
# define strong_alias(ignored1, ignored2)
|
|
||||||
#endif
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,29 +0,0 @@
|
|||||||
From bf92893a14ebc161b08b28acc24fa06ae6be19cb Mon Sep 17 00:00:00 2001
|
|
||||||
From: Adhemerval Zanella <adhemerval.zanella@linaro.org>
|
|
||||||
Date: Thu, 10 Feb 2022 11:23:24 -0300
|
|
||||||
Subject: [PATCH] x86_64: Remove bcopy optimizations
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
The symbol is not present in the current POSIX specification and the compiler
|
|
||||||
already generates memmove call.
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/multiarch/bcopy.S | 7 -------
|
|
||||||
1 file changed, 7 deletions(-)
|
|
||||||
delete mode 100644 sysdeps/x86_64/multiarch/bcopy.S
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/bcopy.S b/sysdeps/x86_64/multiarch/bcopy.S
|
|
||||||
deleted file mode 100644
|
|
||||||
index 639f02bd..00000000
|
|
||||||
--- a/sysdeps/x86_64/multiarch/bcopy.S
|
|
||||||
+++ /dev/null
|
|
||||||
@@ -1,7 +0,0 @@
|
|
||||||
-#include <sysdep.h>
|
|
||||||
-
|
|
||||||
- .text
|
|
||||||
-ENTRY(bcopy)
|
|
||||||
- xchg %rdi, %rsi
|
|
||||||
- jmp __libc_memmove /* Branch to IFUNC memmove. */
|
|
||||||
-END(bcopy)
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,206 +0,0 @@
|
|||||||
From 3f635fb43389b54f682fc9ed2acc0b2aaf4a923d Mon Sep 17 00:00:00 2001
|
|
||||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
|
||||||
Date: Mon, 4 Feb 2019 06:31:01 -0800
|
|
||||||
Subject: [PATCH] x86-64 memcmp: Use unsigned Jcc instructions on size [BZ
|
|
||||||
#24155]
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
Since the size argument is unsigned, we should use unsigned Jcc
|
|
||||||
instructions, instead of signed, to check size.
|
|
||||||
|
|
||||||
Tested on x86-64 and x32, with and without --disable-multi-arch.
|
|
||||||
|
|
||||||
[BZ #24155]
|
|
||||||
CVE-2019-7309
|
|
||||||
* NEWS: Updated for CVE-2019-7309.
|
|
||||||
* sysdeps/x86_64/memcmp.S: Use RDX_LP for size. Clear the
|
|
||||||
upper 32 bits of RDX register for x32. Use unsigned Jcc
|
|
||||||
instructions, instead of signed.
|
|
||||||
* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcmp-2.
|
|
||||||
* sysdeps/x86_64/x32/tst-size_t-memcmp-2.c: New test.
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/memcmp.S | 20 +++---
|
|
||||||
sysdeps/x86_64/x32/Makefile | 3 +-
|
|
||||||
sysdeps/x86_64/x32/tst-size_t-memcmp-2.c | 79 ++++++++++++++++++++++++
|
|
||||||
3 files changed, 93 insertions(+), 9 deletions(-)
|
|
||||||
create mode 100644 sysdeps/x86_64/x32/tst-size_t-memcmp-2.c
|
|
||||||
|
|
||||||
Conflicts:
|
|
||||||
ChangeLog
|
|
||||||
(removed)
|
|
||||||
NEWS
|
|
||||||
(removed)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/memcmp.S b/sysdeps/x86_64/memcmp.S
|
|
||||||
index bcb4a2e8..45918d37 100644
|
|
||||||
--- a/sysdeps/x86_64/memcmp.S
|
|
||||||
+++ b/sysdeps/x86_64/memcmp.S
|
|
||||||
@@ -21,14 +21,18 @@
|
|
||||||
|
|
||||||
.text
|
|
||||||
ENTRY (memcmp)
|
|
||||||
- test %rdx, %rdx
|
|
||||||
+#ifdef __ILP32__
|
|
||||||
+ /* Clear the upper 32 bits. */
|
|
||||||
+ movl %edx, %edx
|
|
||||||
+#endif
|
|
||||||
+ test %RDX_LP, %RDX_LP
|
|
||||||
jz L(finz)
|
|
||||||
cmpq $1, %rdx
|
|
||||||
- jle L(finr1b)
|
|
||||||
+ jbe L(finr1b)
|
|
||||||
subq %rdi, %rsi
|
|
||||||
movq %rdx, %r10
|
|
||||||
cmpq $32, %r10
|
|
||||||
- jge L(gt32)
|
|
||||||
+ jae L(gt32)
|
|
||||||
/* Handle small chunks and last block of less than 32 bytes. */
|
|
||||||
L(small):
|
|
||||||
testq $1, %r10
|
|
||||||
@@ -156,7 +160,7 @@ L(A32):
|
|
||||||
movq %r11, %r10
|
|
||||||
andq $-32, %r10
|
|
||||||
cmpq %r10, %rdi
|
|
||||||
- jge L(mt16)
|
|
||||||
+ jae L(mt16)
|
|
||||||
/* Pre-unroll to be ready for unrolled 64B loop. */
|
|
||||||
testq $32, %rdi
|
|
||||||
jz L(A64)
|
|
||||||
@@ -178,7 +182,7 @@ L(A64):
|
|
||||||
movq %r11, %r10
|
|
||||||
andq $-64, %r10
|
|
||||||
cmpq %r10, %rdi
|
|
||||||
- jge L(mt32)
|
|
||||||
+ jae L(mt32)
|
|
||||||
|
|
||||||
L(A64main):
|
|
||||||
movdqu (%rdi,%rsi), %xmm0
|
|
||||||
@@ -216,7 +220,7 @@ L(mt32):
|
|
||||||
movq %r11, %r10
|
|
||||||
andq $-32, %r10
|
|
||||||
cmpq %r10, %rdi
|
|
||||||
- jge L(mt16)
|
|
||||||
+ jae L(mt16)
|
|
||||||
|
|
||||||
L(A32main):
|
|
||||||
movdqu (%rdi,%rsi), %xmm0
|
|
||||||
@@ -254,7 +258,7 @@ L(ATR):
|
|
||||||
movq %r11, %r10
|
|
||||||
andq $-32, %r10
|
|
||||||
cmpq %r10, %rdi
|
|
||||||
- jge L(mt16)
|
|
||||||
+ jae L(mt16)
|
|
||||||
testq $16, %rdi
|
|
||||||
jz L(ATR32)
|
|
||||||
|
|
||||||
@@ -325,7 +329,7 @@ L(ATR64main):
|
|
||||||
movq %r11, %r10
|
|
||||||
andq $-32, %r10
|
|
||||||
cmpq %r10, %rdi
|
|
||||||
- jge L(mt16)
|
|
||||||
+ jae L(mt16)
|
|
||||||
|
|
||||||
L(ATR32res):
|
|
||||||
movdqa (%rdi,%rsi), %xmm0
|
|
||||||
diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
|
|
||||||
index 1557724b..87489565 100644
|
|
||||||
--- a/sysdeps/x86_64/x32/Makefile
|
|
||||||
+++ b/sysdeps/x86_64/x32/Makefile
|
|
||||||
@@ -8,7 +8,8 @@ endif
|
|
||||||
ifeq ($(subdir),string)
|
|
||||||
tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \
|
|
||||||
tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \
|
|
||||||
- tst-size_t-strncmp tst-size_t-strncpy tst-size_t-strnlen
|
|
||||||
+ tst-size_t-strncmp tst-size_t-strncpy tst-size_t-strnlen \
|
|
||||||
+ tst-size_t-memcmp-2
|
|
||||||
endif
|
|
||||||
|
|
||||||
ifeq ($(subdir),wcsmbs)
|
|
||||||
diff --git a/sysdeps/x86_64/x32/tst-size_t-memcmp-2.c b/sysdeps/x86_64/x32/tst-size_t-memcmp-2.c
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..d8ae1a08
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/x86_64/x32/tst-size_t-memcmp-2.c
|
|
||||||
@@ -0,0 +1,79 @@
|
|
||||||
+/* Test memcmp with size_t in the lower 32 bits of 64-bit register.
|
|
||||||
+ Copyright (C) 2019 Free Software Foundation, Inc.
|
|
||||||
+ This file is part of the GNU C Library.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is free software; you can redistribute it and/or
|
|
||||||
+ modify it under the terms of the GNU Lesser General Public
|
|
||||||
+ License as published by the Free Software Foundation; either
|
|
||||||
+ version 2.1 of the License, or (at your option) any later version.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
||||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
+ Lesser General Public License for more details.
|
|
||||||
+
|
|
||||||
+ You should have received a copy of the GNU Lesser General Public
|
|
||||||
+ License along with the GNU C Library; if not, see
|
|
||||||
+ <http://www.gnu.org/licenses/>. */
|
|
||||||
+
|
|
||||||
+#define TEST_MAIN
|
|
||||||
+#ifdef WIDE
|
|
||||||
+# define TEST_NAME "wmemcmp"
|
|
||||||
+#else
|
|
||||||
+# define TEST_NAME "memcmp"
|
|
||||||
+#endif
|
|
||||||
+
|
|
||||||
+#include "test-size_t.h"
|
|
||||||
+
|
|
||||||
+#ifdef WIDE
|
|
||||||
+# include <inttypes.h>
|
|
||||||
+# include <wchar.h>
|
|
||||||
+
|
|
||||||
+# define MEMCMP wmemcmp
|
|
||||||
+# define CHAR wchar_t
|
|
||||||
+#else
|
|
||||||
+# define MEMCMP memcmp
|
|
||||||
+# define CHAR char
|
|
||||||
+#endif
|
|
||||||
+
|
|
||||||
+IMPL (MEMCMP, 1)
|
|
||||||
+
|
|
||||||
+typedef int (*proto_t) (const CHAR *, const CHAR *, size_t);
|
|
||||||
+
|
|
||||||
+static int
|
|
||||||
+__attribute__ ((noinline, noclone))
|
|
||||||
+do_memcmp (parameter_t a, parameter_t b)
|
|
||||||
+{
|
|
||||||
+ return CALL (&b, a.p, b.p, a.len);
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+static int
|
|
||||||
+test_main (void)
|
|
||||||
+{
|
|
||||||
+ test_init ();
|
|
||||||
+
|
|
||||||
+ parameter_t dest = { { page_size / sizeof (CHAR) }, buf1 };
|
|
||||||
+ parameter_t src = { { 0 }, buf2 };
|
|
||||||
+
|
|
||||||
+ memcpy (buf1, buf2, page_size);
|
|
||||||
+
|
|
||||||
+ CHAR *p = (CHAR *) buf1;
|
|
||||||
+ p[page_size / sizeof (CHAR) - 1] = (CHAR) 1;
|
|
||||||
+
|
|
||||||
+ int ret = 0;
|
|
||||||
+ FOR_EACH_IMPL (impl, 0)
|
|
||||||
+ {
|
|
||||||
+ src.fn = impl->fn;
|
|
||||||
+ int res = do_memcmp (dest, src);
|
|
||||||
+ if (res >= 0)
|
|
||||||
+ {
|
|
||||||
+ error (0, 0, "Wrong result in function %s: %i >= 0",
|
|
||||||
+ impl->name, res);
|
|
||||||
+ ret = 1;
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ return ret ? EXIT_FAILURE : EXIT_SUCCESS;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+#include <support/test-driver.c>
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,31 +0,0 @@
|
|||||||
Based on the following commit, adjusted for glibc-2.28 in RHEL-8:
|
|
||||||
|
|
||||||
commit 5eabdb6a6ac1599d23dd5966a37417215950245f
|
|
||||||
Author: Andreas Schwab <schwab@suse.de>
|
|
||||||
Date: Wed Dec 6 14:48:22 2023 +0100
|
|
||||||
|
|
||||||
getaddrinfo: translate ENOMEM to EAI_MEMORY (bug 31163)
|
|
||||||
|
|
||||||
When __resolv_context_get returns NULL due to out of memory, translate it
|
|
||||||
to a return value of EAI_MEMORY.
|
|
||||||
|
|
||||||
diff --git a/sysdeps/posix/getaddrinfo.c b/sysdeps/posix/getaddrinfo.c
|
|
||||||
index 46046504a6858f2e..d0708f3e84e20025 100644
|
|
||||||
--- a/sysdeps/posix/getaddrinfo.c
|
|
||||||
+++ b/sysdeps/posix/getaddrinfo.c
|
|
||||||
@@ -777,7 +777,14 @@ gaih_inet (const char *name, const struct gaih_service *service,
|
|
||||||
res_ctx = __resolv_context_get ();
|
|
||||||
res_enable_inet6 = __resolv_context_disable_inet6 (res_ctx);
|
|
||||||
if (res_ctx == NULL)
|
|
||||||
- no_more = 1;
|
|
||||||
+ {
|
|
||||||
+ if (errno == ENOMEM)
|
|
||||||
+ {
|
|
||||||
+ result = -EAI_MEMORY;
|
|
||||||
+ goto free_and_return;
|
|
||||||
+ }
|
|
||||||
+ no_more = 1;
|
|
||||||
+ }
|
|
||||||
|
|
||||||
while (!no_more)
|
|
||||||
{
|
|
@ -1,112 +0,0 @@
|
|||||||
This downstream-only patch compensates for the missing backport of
|
|
||||||
commit 2d651eb9265d1366d7b9e881bfddd46db9c1ecc4 ("x86: Move
|
|
||||||
x86 processor cache info to cpu_features"). Without it,
|
|
||||||
ld.so --list-diagnostics prints values that have not been properly
|
|
||||||
initialized from CPUID data.
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
|
|
||||||
index 10ebadd819d9efff..d8421fab83ab08ac 100644
|
|
||||||
--- a/sysdeps/x86/cacheinfo.h
|
|
||||||
+++ b/sysdeps/x86/cacheinfo.h
|
|
||||||
@@ -19,31 +19,42 @@
|
|
||||||
#include <assert.h>
|
|
||||||
#include <unistd.h>
|
|
||||||
|
|
||||||
+/* When building ld.so, do not export any of the variables. They are
|
|
||||||
+ only used for diagnostics and are not initialized during regular
|
|
||||||
+ operation. */
|
|
||||||
+#if IS_IN (rtld)
|
|
||||||
+# define CACHEINFO_VARIABLE(name, initializer) \
|
|
||||||
+ static long int name = initializer
|
|
||||||
+#else
|
|
||||||
+# define CACHEINFO_VARIABLE(name, initializer) \
|
|
||||||
+ long int name attribute_hidden = initializer
|
|
||||||
+#endif
|
|
||||||
+
|
|
||||||
/* Data cache size for use in memory and string routines, typically
|
|
||||||
L1 size, rounded to multiple of 256 bytes. */
|
|
||||||
-long int __x86_data_cache_size_half attribute_hidden = 32 * 1024 / 2;
|
|
||||||
-long int __x86_data_cache_size attribute_hidden = 32 * 1024;
|
|
||||||
+CACHEINFO_VARIABLE (__x86_data_cache_size_half, 32 * 1024 / 2);
|
|
||||||
+CACHEINFO_VARIABLE (__x86_data_cache_size, 32 * 1024);
|
|
||||||
/* Similar to __x86_data_cache_size_half, but not rounded. */
|
|
||||||
-long int __x86_raw_data_cache_size_half attribute_hidden = 32 * 1024 / 2;
|
|
||||||
+CACHEINFO_VARIABLE (__x86_raw_data_cache_size_half, 32 * 1024 / 2);
|
|
||||||
/* Similar to __x86_data_cache_size, but not rounded. */
|
|
||||||
-long int __x86_raw_data_cache_size attribute_hidden = 32 * 1024;
|
|
||||||
+CACHEINFO_VARIABLE (__x86_raw_data_cache_size, 32 * 1024);
|
|
||||||
/* Shared cache size for use in memory and string routines, typically
|
|
||||||
L2 or L3 size, rounded to multiple of 256 bytes. */
|
|
||||||
-long int __x86_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2;
|
|
||||||
-long int __x86_shared_cache_size attribute_hidden = 1024 * 1024;
|
|
||||||
+CACHEINFO_VARIABLE (__x86_shared_cache_size_half, 1024 * 1024 / 2);
|
|
||||||
+CACHEINFO_VARIABLE (__x86_shared_cache_size, 1024 * 1024);
|
|
||||||
/* Similar to __x86_shared_cache_size_half, but not rounded. */
|
|
||||||
-long int __x86_raw_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2;
|
|
||||||
+CACHEINFO_VARIABLE (__x86_raw_shared_cache_size_half, 1024 * 1024 / 2);
|
|
||||||
/* Similar to __x86_shared_cache_size, but not rounded. */
|
|
||||||
-long int __x86_raw_shared_cache_size attribute_hidden = 1024 * 1024;
|
|
||||||
+CACHEINFO_VARIABLE (__x86_raw_shared_cache_size, 1024 * 1024);
|
|
||||||
|
|
||||||
/* Threshold to use non temporal store. */
|
|
||||||
-long int __x86_shared_non_temporal_threshold attribute_hidden;
|
|
||||||
+CACHEINFO_VARIABLE (__x86_shared_non_temporal_threshold, 0);
|
|
||||||
|
|
||||||
/* Threshold to use Enhanced REP MOVSB. */
|
|
||||||
-long int __x86_rep_movsb_threshold attribute_hidden = 2048;
|
|
||||||
+CACHEINFO_VARIABLE (__x86_rep_movsb_threshold, 2048);
|
|
||||||
|
|
||||||
/* Threshold to use Enhanced REP STOSB. */
|
|
||||||
-long int __x86_rep_stosb_threshold attribute_hidden = 2048;
|
|
||||||
+CACHEINFO_VARIABLE (__x86_rep_stosb_threshold, 2048);
|
|
||||||
|
|
||||||
static void
|
|
||||||
get_common_cache_info (long int *shared_ptr, long int * shared_per_thread_ptr, unsigned int *threads_ptr,
|
|
||||||
diff --git a/sysdeps/x86/dl-diagnostics-cpu.c b/sysdeps/x86/dl-diagnostics-cpu.c
|
|
||||||
index 0ba286a828b69937..9215604ecf22344c 100644
|
|
||||||
--- a/sysdeps/x86/dl-diagnostics-cpu.c
|
|
||||||
+++ b/sysdeps/x86/dl-diagnostics-cpu.c
|
|
||||||
@@ -19,6 +19,13 @@
|
|
||||||
#include <dl-diagnostics.h>
|
|
||||||
#include <ldsodefs.h>
|
|
||||||
|
|
||||||
+#include <assert.h>
|
|
||||||
+#include <unistd.h>
|
|
||||||
+#include <cpu-features.h>
|
|
||||||
+#include <cpuid.h>
|
|
||||||
+#include <dl-cacheinfo.h>
|
|
||||||
+#include <cacheinfo.h>
|
|
||||||
+
|
|
||||||
static void
|
|
||||||
print_cpu_features_value (const char *label, uint64_t value)
|
|
||||||
{
|
|
||||||
@@ -81,19 +88,21 @@ _dl_diagnostics_cpu (void)
|
|
||||||
#include "cpu-features-preferred_feature_index_1.def"
|
|
||||||
#undef BIT
|
|
||||||
|
|
||||||
+ /* The cache information variables are only used for diagnostics and
|
|
||||||
+ are not initialized during startup. The values used at run time
|
|
||||||
+ are only in libc.so.6. */
|
|
||||||
+ init_cacheinfo ();
|
|
||||||
+
|
|
||||||
print_cpu_features_value ("xsave_state_size",
|
|
||||||
cpu_features->xsave_state_size);
|
|
||||||
print_cpu_features_value ("xsave_state_full_size",
|
|
||||||
cpu_features->xsave_state_full_size);
|
|
||||||
- print_cpu_features_value ("data_cache_size", cpu_features->data_cache_size);
|
|
||||||
- print_cpu_features_value ("shared_cache_size",
|
|
||||||
- cpu_features->shared_cache_size);
|
|
||||||
+ print_cpu_features_value ("data_cache_size", __x86_data_cache_size);
|
|
||||||
+ print_cpu_features_value ("shared_cache_size", __x86_shared_cache_size);
|
|
||||||
print_cpu_features_value ("non_temporal_threshold",
|
|
||||||
- cpu_features->non_temporal_threshold);
|
|
||||||
- print_cpu_features_value ("rep_movsb_threshold",
|
|
||||||
- cpu_features->rep_movsb_threshold);
|
|
||||||
- print_cpu_features_value ("rep_stosb_threshold",
|
|
||||||
- cpu_features->rep_stosb_threshold);
|
|
||||||
+ __x86_shared_non_temporal_threshold);
|
|
||||||
+ print_cpu_features_value ("rep_movsb_threshold", __x86_rep_movsb_threshold);
|
|
||||||
+ print_cpu_features_value ("rep_stosb_threshold", __x86_rep_stosb_threshold);
|
|
||||||
_Static_assert (offsetof (struct cpu_features, rep_stosb_threshold)
|
|
||||||
+ sizeof (cpu_features->rep_stosb_threshold)
|
|
||||||
== sizeof (*cpu_features),
|
|
@ -1,22 +0,0 @@
|
|||||||
Work around, in the test case, the fact that RHEL-8 NSS modules
|
|
||||||
infrastructure incorrectly allows merging in the hosts database. This
|
|
||||||
is a RHEL-8 only fix.
|
|
||||||
|
|
||||||
diff --git a/nss/tst-nss-gai-actions.c b/nss/tst-nss-gai-actions.c
|
|
||||||
index efca6cd1837a172a..c35e752896eceb2a 100644
|
|
||||||
--- a/nss/tst-nss-gai-actions.c
|
|
||||||
+++ b/nss/tst-nss-gai-actions.c
|
|
||||||
@@ -87,6 +87,13 @@ do_one_test (int action, int family, bool canon)
|
|
||||||
case ACTION_MERGE:
|
|
||||||
if (ret == 0)
|
|
||||||
{
|
|
||||||
+ if (hints.ai_flags == 0 && hints.ai_family == AF_INET)
|
|
||||||
+ {
|
|
||||||
+ printf ("***** RHEL-8 limitation: "
|
|
||||||
+ "NSS modules infrastructure incorrectly allows MERGE\n");
|
|
||||||
+ return;
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
char *formatted = support_format_addrinfo (ai, ret);
|
|
||||||
|
|
||||||
printf ("merge unexpectedly succeeded:\n %s\n", formatted);
|
|
@ -1,97 +0,0 @@
|
|||||||
commit 2ac579f9c25388a7734948d77b03e4dd10f35334
|
|
||||||
Author: DJ Delorie <dj@redhat.com>
|
|
||||||
Date: Mon Sep 30 16:04:52 2019 -0400
|
|
||||||
|
|
||||||
Add run-one-test convenience target and makefile help text
|
|
||||||
|
|
||||||
Adds "make test" for re-running just one test. Also adds
|
|
||||||
"make help" for help with our Makefile targets, and adds a
|
|
||||||
mini-help when you just run "make".
|
|
||||||
|
|
||||||
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
|
|
||||||
|
|
||||||
diff --git a/Makefile b/Makefile
|
|
||||||
index 6d73241bbc811c13..6518f62ee0676b0d 100644
|
|
||||||
--- a/Makefile
|
|
||||||
+++ b/Makefile
|
|
||||||
@@ -26,8 +26,17 @@ include Makeconfig
|
|
||||||
|
|
||||||
|
|
||||||
# This is the default target; it makes everything except the tests.
|
|
||||||
-.PHONY: all
|
|
||||||
-all: lib others
|
|
||||||
+.PHONY: all help minihelp
|
|
||||||
+all: minihelp lib others
|
|
||||||
+
|
|
||||||
+help:
|
|
||||||
+ @sed '0,/^help-starts-here$$/d' Makefile.help
|
|
||||||
+
|
|
||||||
+minihelp:
|
|
||||||
+ @echo
|
|
||||||
+ @echo type \"make help\" for help with common glibc makefile targets
|
|
||||||
+ @echo
|
|
||||||
+
|
|
||||||
|
|
||||||
ifneq ($(AUTOCONF),no)
|
|
||||||
|
|
||||||
@@ -503,3 +512,12 @@ FORCE:
|
|
||||||
|
|
||||||
iconvdata/% localedata/% po/%: FORCE
|
|
||||||
$(MAKE) $(PARALLELMFLAGS) -C $(@D) $(@F)
|
|
||||||
+
|
|
||||||
+# Convenience target to rerun one test, from the top of the build tree
|
|
||||||
+# Example: make test t=wcsmbs/test-wcsnlen
|
|
||||||
+.PHONY: test
|
|
||||||
+test :
|
|
||||||
+ @-rm -f $(objpfx)$t.out
|
|
||||||
+ $(MAKE) subdir=$(dir $t) -C $(dir $t) ..=../ $(objpfx)$t.out
|
|
||||||
+ @cat $(objpfx)$t.test-result
|
|
||||||
+ @cat $(objpfx)$t.out
|
|
||||||
diff --git a/Makefile.help b/Makefile.help
|
|
||||||
new file mode 100644
|
|
||||||
index 0000000000000000..3b043bce013cc2b4
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/Makefile.help
|
|
||||||
@@ -0,0 +1,42 @@
|
|
||||||
+# Copyright (C) 2019 Free Software Foundation, Inc.
|
|
||||||
+# This file is part of the GNU C Library.
|
|
||||||
+
|
|
||||||
+# The GNU C Library is free software; you can redistribute it and/or
|
|
||||||
+# modify it under the terms of the GNU Lesser General Public
|
|
||||||
+# License as published by the Free Software Foundation; either
|
|
||||||
+# version 2.1 of the License, or (at your option) any later version.
|
|
||||||
+
|
|
||||||
+# The GNU C Library is distributed in the hope that it will be useful,
|
|
||||||
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
+# Lesser General Public License for more details.
|
|
||||||
+
|
|
||||||
+# You should have received a copy of the GNU Lesser General Public
|
|
||||||
+# License along with the GNU C Library; if not, see
|
|
||||||
+# <https://www.gnu.org/licenses/>.
|
|
||||||
+
|
|
||||||
+This is the file that gets printed when the user runs "make help",
|
|
||||||
+starting just after the "help-starts-here" line.
|
|
||||||
+
|
|
||||||
+help-starts-here
|
|
||||||
+
|
|
||||||
+all
|
|
||||||
+ The usual default; builds everything but doesn't run the
|
|
||||||
+ tests.
|
|
||||||
+
|
|
||||||
+check (or tests)
|
|
||||||
+ Runs the standard set of tests.
|
|
||||||
+
|
|
||||||
+test
|
|
||||||
+ Runs one test. Use like this:
|
|
||||||
+ make test t=wcsmbs/test-wcsnlen
|
|
||||||
+ Note that this will rebuild the test if needed, but will not
|
|
||||||
+ rebuild what "make all" would have rebuilt.
|
|
||||||
+
|
|
||||||
+--
|
|
||||||
+Other useful hints:
|
|
||||||
+
|
|
||||||
+builddir$ rm testroot.pristine/install.stamp
|
|
||||||
+ Forces the testroot to be reinstalled the next time you run
|
|
||||||
+ the testsuite (or just rm -rf testroot.pristine)
|
|
||||||
+
|
|
@ -1,34 +0,0 @@
|
|||||||
commit 56e098118a31753a9f755948bb5a47bc7111e214
|
|
||||||
Author: Andreas Schwab <schwab@suse.de>
|
|
||||||
Date: Thu Aug 15 12:14:35 2019 +0200
|
|
||||||
|
|
||||||
Update i386 libm-test-ulps
|
|
||||||
|
|
||||||
Conflicts: ChangeLog removed
|
|
||||||
|
|
||||||
diff --git a/sysdeps/i386/fpu/libm-test-ulps b/sysdeps/i386/fpu/libm-test-ulps
|
|
||||||
index e83bae71b4..2232296fe0 100644
|
|
||||||
--- a/sysdeps/i386/fpu/libm-test-ulps
|
|
||||||
+++ b/sysdeps/i386/fpu/libm-test-ulps
|
|
||||||
@@ -1158,8 +1158,8 @@ float128: 4
|
|
||||||
idouble: 4
|
|
||||||
ifloat: 5
|
|
||||||
ifloat128: 4
|
|
||||||
-ildouble: 7
|
|
||||||
-ldouble: 7
|
|
||||||
+ildouble: 8
|
|
||||||
+ldouble: 8
|
|
||||||
|
|
||||||
Function: Imaginary part of "clog10_upward":
|
|
||||||
double: 2
|
|
||||||
@@ -2222,8 +2222,8 @@ float128: 8
|
|
||||||
idouble: 5
|
|
||||||
ifloat: 5
|
|
||||||
ifloat128: 8
|
|
||||||
-ildouble: 5
|
|
||||||
-ldouble: 5
|
|
||||||
+ildouble: 6
|
|
||||||
+ldouble: 6
|
|
||||||
|
|
||||||
Function: "log":
|
|
||||||
double: 1
|
|
@ -1,26 +0,0 @@
|
|||||||
Author: Patsy Griffin <patsy@redhat.com>
|
|
||||||
|
|
||||||
i386: update ulps
|
|
||||||
|
|
||||||
This change fixes 3 test failures:
|
|
||||||
math/test-ildouble-lgamma
|
|
||||||
math/test-ldouble-finite-lgamma
|
|
||||||
math/test-ldouble-lgamma
|
|
||||||
|
|
||||||
This is a downstream-only patch, as upstream removed the entries for
|
|
||||||
i{float,double,ldouble} by commit: 1c15464ca05f36db5c582856d3770d5e8bde9d61.
|
|
||||||
The ldouble change is already upstream.
|
|
||||||
|
|
||||||
--- a/sysdeps/i386/fpu/libm-test-ulps 2024-08-06 15:51:18.182808710 -0400
|
|
||||||
+++ b/sysdeps/i386/fpu/libm-test-ulps 2024-08-06 18:01:50.579719841 -0400
|
|
||||||
@@ -2030,8 +2030,8 @@ double: 5
|
|
||||||
float: 5
|
|
||||||
idouble: 5
|
|
||||||
ifloat: 5
|
|
||||||
-ildouble: 5
|
|
||||||
-ldouble: 5
|
|
||||||
+ildouble: 6
|
|
||||||
+ldouble: 6
|
|
||||||
|
|
||||||
Function: "hypot":
|
|
||||||
double: 1
|
|
@ -1,374 +0,0 @@
|
|||||||
commit 03e1378f94173fc192a81e421457198f7b8a34a0
|
|
||||||
Author: Alex Butler <Alex.Butler@arm.com>
|
|
||||||
Date: Tue Jun 16 12:44:24 2020 +0000
|
|
||||||
|
|
||||||
aarch64: MTE compatible strncmp
|
|
||||||
|
|
||||||
Add support for MTE to strncmp. Regression tested with xcheck and benchmarked
|
|
||||||
with glibc's benchtests on the Cortex-A53, Cortex-A72, and Neoverse N1.
|
|
||||||
|
|
||||||
The existing implementation assumes that any access to the pages in which the
|
|
||||||
string resides is safe. This assumption is not true when MTE is enabled. This
|
|
||||||
patch updates the algorithm to ensure that accesses remain within the bounds
|
|
||||||
of an MTE tag (16-byte chunks) and improves overall performance.
|
|
||||||
|
|
||||||
Co-authored-by: Branislav Rankov <branislav.rankov@arm.com>
|
|
||||||
Co-authored-by: Wilco Dijkstra <wilco.dijkstra@arm.com>
|
|
||||||
|
|
||||||
diff --git a/sysdeps/aarch64/strncmp.S b/sysdeps/aarch64/strncmp.S
|
|
||||||
index c5141fab8a..ba2563490e 100644
|
|
||||||
--- a/sysdeps/aarch64/strncmp.S
|
|
||||||
+++ b/sysdeps/aarch64/strncmp.S
|
|
||||||
@@ -25,7 +25,6 @@
|
|
||||||
|
|
||||||
#define REP8_01 0x0101010101010101
|
|
||||||
#define REP8_7f 0x7f7f7f7f7f7f7f7f
|
|
||||||
-#define REP8_80 0x8080808080808080
|
|
||||||
|
|
||||||
/* Parameters and result. */
|
|
||||||
#define src1 x0
|
|
||||||
@@ -46,15 +45,31 @@
|
|
||||||
#define tmp3 x10
|
|
||||||
#define zeroones x11
|
|
||||||
#define pos x12
|
|
||||||
-#define limit_wd x13
|
|
||||||
-#define mask x14
|
|
||||||
-#define endloop x15
|
|
||||||
+#define mask x13
|
|
||||||
+#define endloop x14
|
|
||||||
#define count mask
|
|
||||||
+#define offset pos
|
|
||||||
+#define neg_offset x15
|
|
||||||
|
|
||||||
-ENTRY_ALIGN_AND_PAD (strncmp, 6, 7)
|
|
||||||
- DELOUSE (0)
|
|
||||||
- DELOUSE (1)
|
|
||||||
- DELOUSE (2)
|
|
||||||
+/* Define endian dependent shift operations.
|
|
||||||
+ On big-endian early bytes are at MSB and on little-endian LSB.
|
|
||||||
+ LS_FW means shifting towards early bytes.
|
|
||||||
+ LS_BK means shifting towards later bytes.
|
|
||||||
+ */
|
|
||||||
+#ifdef __AARCH64EB__
|
|
||||||
+#define LS_FW lsl
|
|
||||||
+#define LS_BK lsr
|
|
||||||
+#else
|
|
||||||
+#define LS_FW lsr
|
|
||||||
+#define LS_BK lsl
|
|
||||||
+#endif
|
|
||||||
+
|
|
||||||
+ .text
|
|
||||||
+ .p2align 6
|
|
||||||
+ .rep 9
|
|
||||||
+ nop /* Pad so that the loop below fits a cache line. */
|
|
||||||
+ .endr
|
|
||||||
+ENTRY_ALIGN (strncmp, 0)
|
|
||||||
cbz limit, L(ret0)
|
|
||||||
eor tmp1, src1, src2
|
|
||||||
mov zeroones, #REP8_01
|
|
||||||
@@ -62,9 +77,6 @@ ENTRY_ALIGN_AND_PAD (strncmp, 6, 7)
|
|
||||||
and count, src1, #7
|
|
||||||
b.ne L(misaligned8)
|
|
||||||
cbnz count, L(mutual_align)
|
|
||||||
- /* Calculate the number of full and partial words -1. */
|
|
||||||
- sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
|
|
||||||
- lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */
|
|
||||||
|
|
||||||
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
|
|
||||||
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
|
|
||||||
@@ -74,56 +86,52 @@ L(loop_aligned):
|
|
||||||
ldr data1, [src1], #8
|
|
||||||
ldr data2, [src2], #8
|
|
||||||
L(start_realigned):
|
|
||||||
- subs limit_wd, limit_wd, #1
|
|
||||||
+ subs limit, limit, #8
|
|
||||||
sub tmp1, data1, zeroones
|
|
||||||
orr tmp2, data1, #REP8_7f
|
|
||||||
eor diff, data1, data2 /* Non-zero if differences found. */
|
|
||||||
- csinv endloop, diff, xzr, pl /* Last Dword or differences. */
|
|
||||||
+ csinv endloop, diff, xzr, hi /* Last Dword or differences. */
|
|
||||||
bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
|
|
||||||
ccmp endloop, #0, #0, eq
|
|
||||||
b.eq L(loop_aligned)
|
|
||||||
/* End of performance-critical section -- one 64B cache line. */
|
|
||||||
|
|
||||||
- /* Not reached the limit, must have found the end or a diff. */
|
|
||||||
- tbz limit_wd, #63, L(not_limit)
|
|
||||||
-
|
|
||||||
- /* Limit % 8 == 0 => all bytes significant. */
|
|
||||||
- ands limit, limit, #7
|
|
||||||
- b.eq L(not_limit)
|
|
||||||
-
|
|
||||||
- lsl limit, limit, #3 /* Bits -> bytes. */
|
|
||||||
- mov mask, #~0
|
|
||||||
-#ifdef __AARCH64EB__
|
|
||||||
- lsr mask, mask, limit
|
|
||||||
-#else
|
|
||||||
- lsl mask, mask, limit
|
|
||||||
-#endif
|
|
||||||
- bic data1, data1, mask
|
|
||||||
- bic data2, data2, mask
|
|
||||||
-
|
|
||||||
- /* Make sure that the NUL byte is marked in the syndrome. */
|
|
||||||
- orr has_nul, has_nul, mask
|
|
||||||
-
|
|
||||||
-L(not_limit):
|
|
||||||
+L(full_check):
|
|
||||||
+#ifndef __AARCH64EB__
|
|
||||||
orr syndrome, diff, has_nul
|
|
||||||
-
|
|
||||||
-#ifndef __AARCH64EB__
|
|
||||||
+ add limit, limit, 8 /* Rewind limit to before last subs. */
|
|
||||||
+L(syndrome_check):
|
|
||||||
+ /* Limit was reached. Check if the NUL byte or the difference
|
|
||||||
+ is before the limit. */
|
|
||||||
rev syndrome, syndrome
|
|
||||||
rev data1, data1
|
|
||||||
- /* The MS-non-zero bit of the syndrome marks either the first bit
|
|
||||||
- that is different, or the top bit of the first zero byte.
|
|
||||||
- Shifting left now will bring the critical information into the
|
|
||||||
- top bits. */
|
|
||||||
clz pos, syndrome
|
|
||||||
rev data2, data2
|
|
||||||
lsl data1, data1, pos
|
|
||||||
+ cmp limit, pos, lsr #3
|
|
||||||
lsl data2, data2, pos
|
|
||||||
/* But we need to zero-extend (char is unsigned) the value and then
|
|
||||||
perform a signed 32-bit subtraction. */
|
|
||||||
lsr data1, data1, #56
|
|
||||||
sub result, data1, data2, lsr #56
|
|
||||||
- RET
|
|
||||||
+ csel result, result, xzr, hi
|
|
||||||
+ ret
|
|
||||||
#else
|
|
||||||
+ /* Not reached the limit, must have found the end or a diff. */
|
|
||||||
+ tbz limit, #63, L(not_limit)
|
|
||||||
+ add tmp1, limit, 8
|
|
||||||
+ cbz limit, L(not_limit)
|
|
||||||
+
|
|
||||||
+ lsl limit, tmp1, #3 /* Bytes -> bits. */
|
|
||||||
+ mov mask, #~0
|
|
||||||
+ lsr mask, mask, limit
|
|
||||||
+ bic data1, data1, mask
|
|
||||||
+ bic data2, data2, mask
|
|
||||||
+
|
|
||||||
+ /* Make sure that the NUL byte is marked in the syndrome. */
|
|
||||||
+ orr has_nul, has_nul, mask
|
|
||||||
+
|
|
||||||
+L(not_limit):
|
|
||||||
/* For big-endian we cannot use the trick with the syndrome value
|
|
||||||
as carry-propagation can corrupt the upper bits if the trailing
|
|
||||||
bytes in the string contain 0x01. */
|
|
||||||
@@ -134,7 +142,7 @@ L(not_limit):
|
|
||||||
cmp data1, data2
|
|
||||||
cset result, ne
|
|
||||||
cneg result, result, lo
|
|
||||||
- RET
|
|
||||||
+ ret
|
|
||||||
1:
|
|
||||||
/* Re-compute the NUL-byte detection, using a byte-reversed value. */
|
|
||||||
rev tmp3, data1
|
|
||||||
@@ -144,17 +152,18 @@ L(not_limit):
|
|
||||||
rev has_nul, has_nul
|
|
||||||
orr syndrome, diff, has_nul
|
|
||||||
clz pos, syndrome
|
|
||||||
- /* The MS-non-zero bit of the syndrome marks either the first bit
|
|
||||||
- that is different, or the top bit of the first zero byte.
|
|
||||||
+ /* The most-significant-non-zero bit of the syndrome marks either the
|
|
||||||
+ first bit that is different, or the top bit of the first zero byte.
|
|
||||||
Shifting left now will bring the critical information into the
|
|
||||||
top bits. */
|
|
||||||
+L(end_quick):
|
|
||||||
lsl data1, data1, pos
|
|
||||||
lsl data2, data2, pos
|
|
||||||
/* But we need to zero-extend (char is unsigned) the value and then
|
|
||||||
perform a signed 32-bit subtraction. */
|
|
||||||
lsr data1, data1, #56
|
|
||||||
sub result, data1, data2, lsr #56
|
|
||||||
- RET
|
|
||||||
+ ret
|
|
||||||
#endif
|
|
||||||
|
|
||||||
L(mutual_align):
|
|
||||||
@@ -169,22 +178,12 @@ L(mutual_align):
|
|
||||||
neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */
|
|
||||||
ldr data2, [src2], #8
|
|
||||||
mov tmp2, #~0
|
|
||||||
- sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
|
|
||||||
-#ifdef __AARCH64EB__
|
|
||||||
- /* Big-endian. Early bytes are at MSB. */
|
|
||||||
- lsl tmp2, tmp2, tmp3 /* Shift (count & 63). */
|
|
||||||
-#else
|
|
||||||
- /* Little-endian. Early bytes are at LSB. */
|
|
||||||
- lsr tmp2, tmp2, tmp3 /* Shift (count & 63). */
|
|
||||||
-#endif
|
|
||||||
- and tmp3, limit_wd, #7
|
|
||||||
- lsr limit_wd, limit_wd, #3
|
|
||||||
- /* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */
|
|
||||||
- add limit, limit, count
|
|
||||||
- add tmp3, tmp3, count
|
|
||||||
+ LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */
|
|
||||||
+ /* Adjust the limit and ensure it doesn't overflow. */
|
|
||||||
+ adds limit, limit, count
|
|
||||||
+ csinv limit, limit, xzr, lo
|
|
||||||
orr data1, data1, tmp2
|
|
||||||
orr data2, data2, tmp2
|
|
||||||
- add limit_wd, limit_wd, tmp3, lsr #3
|
|
||||||
b L(start_realigned)
|
|
||||||
|
|
||||||
.p2align 6
|
|
||||||
@@ -203,18 +202,15 @@ L(byte_loop):
|
|
||||||
b.eq L(byte_loop)
|
|
||||||
L(done):
|
|
||||||
sub result, data1, data2
|
|
||||||
- RET
|
|
||||||
-
|
|
||||||
+ ret
|
|
||||||
/* Align the SRC1 to a dword by doing a bytewise compare and then do
|
|
||||||
the dword loop. */
|
|
||||||
L(try_misaligned_words):
|
|
||||||
- lsr limit_wd, limit, #3
|
|
||||||
- cbz count, L(do_misaligned)
|
|
||||||
+ cbz count, L(src1_aligned)
|
|
||||||
|
|
||||||
neg count, count
|
|
||||||
and count, count, #7
|
|
||||||
sub limit, limit, count
|
|
||||||
- lsr limit_wd, limit, #3
|
|
||||||
|
|
||||||
L(page_end_loop):
|
|
||||||
ldrb data1w, [src1], #1
|
|
||||||
@@ -225,48 +221,98 @@ L(page_end_loop):
|
|
||||||
subs count, count, #1
|
|
||||||
b.hi L(page_end_loop)
|
|
||||||
|
|
||||||
-L(do_misaligned):
|
|
||||||
- /* Prepare ourselves for the next page crossing. Unlike the aligned
|
|
||||||
- loop, we fetch 1 less dword because we risk crossing bounds on
|
|
||||||
- SRC2. */
|
|
||||||
- mov count, #8
|
|
||||||
- subs limit_wd, limit_wd, #1
|
|
||||||
- b.lo L(done_loop)
|
|
||||||
+ /* The following diagram explains the comparison of misaligned strings.
|
|
||||||
+ The bytes are shown in natural order. For little-endian, it is
|
|
||||||
+ reversed in the registers. The "x" bytes are before the string.
|
|
||||||
+ The "|" separates data that is loaded at one time.
|
|
||||||
+ src1 | a a a a a a a a | b b b c c c c c | . . .
|
|
||||||
+ src2 | x x x x x a a a a a a a a b b b | c c c c c . . .
|
|
||||||
+ After shifting in each step, the data looks like this:
|
|
||||||
+ STEP_A STEP_B STEP_C
|
|
||||||
+ data1 a a a a a a a a b b b c c c c c b b b c c c c c
|
|
||||||
+ data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c
|
|
||||||
+ The bytes with "0" are eliminated from the syndrome via mask.
|
|
||||||
+ Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
|
|
||||||
+ time from SRC2. The comparison happens in 3 steps. After each step
|
|
||||||
+ the loop can exit, or read from SRC1 or SRC2. */
|
|
||||||
+L(src1_aligned):
|
|
||||||
+ /* Calculate offset from 8 byte alignment to string start in bits. No
|
|
||||||
+ need to mask offset since shifts ignore the upper bits. */
|
|
||||||
+ lsl offset, src2, #3
|
|
||||||
+ bic src2, src2, #0xf
|
|
||||||
+ mov mask, -1
|
|
||||||
+ neg neg_offset, offset
|
|
||||||
+ ldr data1, [src1], #8
|
|
||||||
+ ldp tmp1, tmp2, [src2], #16
|
|
||||||
+ LS_BK mask, mask, neg_offset
|
|
||||||
+ and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */
|
|
||||||
+ /* Skip the first compare if data in tmp1 is irrelevant. */
|
|
||||||
+ tbnz offset, 6, L(misaligned_mid_loop)
|
|
||||||
+
|
|
||||||
L(loop_misaligned):
|
|
||||||
- and tmp2, src2, #0xff8
|
|
||||||
- eor tmp2, tmp2, #0xff8
|
|
||||||
- cbz tmp2, L(page_end_loop)
|
|
||||||
+ /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/
|
|
||||||
+ LS_FW data2, tmp1, offset
|
|
||||||
+ LS_BK tmp1, tmp2, neg_offset
|
|
||||||
+ subs limit, limit, #8
|
|
||||||
+ orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/
|
|
||||||
+ sub has_nul, data1, zeroones
|
|
||||||
+ eor diff, data1, data2 /* Non-zero if differences found. */
|
|
||||||
+ orr tmp3, data1, #REP8_7f
|
|
||||||
+ csinv endloop, diff, xzr, hi /* If limit, set to all ones. */
|
|
||||||
+ bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */
|
|
||||||
+ orr tmp3, endloop, has_nul
|
|
||||||
+ cbnz tmp3, L(full_check)
|
|
||||||
|
|
||||||
ldr data1, [src1], #8
|
|
||||||
- ldr data2, [src2], #8
|
|
||||||
- sub tmp1, data1, zeroones
|
|
||||||
- orr tmp2, data1, #REP8_7f
|
|
||||||
- eor diff, data1, data2 /* Non-zero if differences found. */
|
|
||||||
- bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
|
|
||||||
- ccmp diff, #0, #0, eq
|
|
||||||
- b.ne L(not_limit)
|
|
||||||
- subs limit_wd, limit_wd, #1
|
|
||||||
- b.pl L(loop_misaligned)
|
|
||||||
+L(misaligned_mid_loop):
|
|
||||||
+ /* STEP_B: Compare first part of data1 to second part of tmp2. */
|
|
||||||
+ LS_FW data2, tmp2, offset
|
|
||||||
+#ifdef __AARCH64EB__
|
|
||||||
+ /* For big-endian we do a byte reverse to avoid carry-propagation
|
|
||||||
+ problem described above. This way we can reuse the has_nul in the
|
|
||||||
+ next step and also use syndrome value trick at the end. */
|
|
||||||
+ rev tmp3, data1
|
|
||||||
+ #define data1_fixed tmp3
|
|
||||||
+#else
|
|
||||||
+ #define data1_fixed data1
|
|
||||||
+#endif
|
|
||||||
+ sub has_nul, data1_fixed, zeroones
|
|
||||||
+ orr tmp3, data1_fixed, #REP8_7f
|
|
||||||
+ eor diff, data2, data1 /* Non-zero if differences found. */
|
|
||||||
+ bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */
|
|
||||||
+#ifdef __AARCH64EB__
|
|
||||||
+ rev has_nul, has_nul
|
|
||||||
+#endif
|
|
||||||
+ cmp limit, neg_offset, lsr #3
|
|
||||||
+ orr syndrome, diff, has_nul
|
|
||||||
+ bic syndrome, syndrome, mask /* Ignore later bytes. */
|
|
||||||
+ csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
|
|
||||||
+ cbnz tmp3, L(syndrome_check)
|
|
||||||
|
|
||||||
-L(done_loop):
|
|
||||||
- /* We found a difference or a NULL before the limit was reached. */
|
|
||||||
- and limit, limit, #7
|
|
||||||
- cbz limit, L(not_limit)
|
|
||||||
- /* Read the last word. */
|
|
||||||
- sub src1, src1, 8
|
|
||||||
- sub src2, src2, 8
|
|
||||||
- ldr data1, [src1, limit]
|
|
||||||
- ldr data2, [src2, limit]
|
|
||||||
- sub tmp1, data1, zeroones
|
|
||||||
- orr tmp2, data1, #REP8_7f
|
|
||||||
- eor diff, data1, data2 /* Non-zero if differences found. */
|
|
||||||
- bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
|
|
||||||
- ccmp diff, #0, #0, eq
|
|
||||||
- b.ne L(not_limit)
|
|
||||||
+ /* STEP_C: Compare second part of data1 to first part of tmp1. */
|
|
||||||
+ ldp tmp1, tmp2, [src2], #16
|
|
||||||
+ cmp limit, #8
|
|
||||||
+ LS_BK data2, tmp1, neg_offset
|
|
||||||
+ eor diff, data2, data1 /* Non-zero if differences found. */
|
|
||||||
+ orr syndrome, diff, has_nul
|
|
||||||
+ and syndrome, syndrome, mask /* Ignore earlier bytes. */
|
|
||||||
+ csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
|
|
||||||
+ cbnz tmp3, L(syndrome_check)
|
|
||||||
+
|
|
||||||
+ ldr data1, [src1], #8
|
|
||||||
+ sub limit, limit, #8
|
|
||||||
+ b L(loop_misaligned)
|
|
||||||
+
|
|
||||||
+#ifdef __AARCH64EB__
|
|
||||||
+L(syndrome_check):
|
|
||||||
+ clz pos, syndrome
|
|
||||||
+ cmp pos, limit, lsl #3
|
|
||||||
+ b.lo L(end_quick)
|
|
||||||
+#endif
|
|
||||||
|
|
||||||
L(ret0):
|
|
||||||
mov result, #0
|
|
||||||
- RET
|
|
||||||
+ ret
|
|
||||||
|
|
||||||
END (strncmp)
|
|
||||||
libc_hidden_builtin_def (strncmp)
|
|
@ -1,845 +0,0 @@
|
|||||||
From e1d3312015e8f70344620375aedf91afe7e7e7a4 Mon Sep 17 00:00:00 2001
|
|
||||||
From: lijianglin <lijianglin2@huawei.com>
|
|
||||||
Date: Tue, 27 Jun 2023 20:15:49 +0800
|
|
||||||
Subject: add GB18030-2022 charmap and test the entire GB18030 charmap [BZ
|
|
||||||
#30243]
|
|
||||||
|
|
||||||
Support GB18030-2022 by adding and changing some transcoding relationships
|
|
||||||
of GB18030-2022. Details are as follows:
|
|
||||||
add 25 transcoding relationships
|
|
||||||
UE81E 0x82359037
|
|
||||||
UE826 0x82359038
|
|
||||||
UE82B 0x82359039
|
|
||||||
UE82C 0x82359130
|
|
||||||
UE832 0x82359131
|
|
||||||
UE843 0x82359132
|
|
||||||
UE854 0x82359133
|
|
||||||
UE864 0x82359134
|
|
||||||
UE78D 0x84318236
|
|
||||||
UE78F 0x84318237
|
|
||||||
UE78E 0x84318238
|
|
||||||
UE790 0x84318239
|
|
||||||
UE791 0x84318330
|
|
||||||
UE792 0x84318331
|
|
||||||
UE793 0x84318332
|
|
||||||
UE794 0x84318333
|
|
||||||
UE795 0x84318334
|
|
||||||
UE796 0x84318335
|
|
||||||
UE816 0xfe51
|
|
||||||
UE817 0xfe52
|
|
||||||
UE818 0xfe53
|
|
||||||
UE831 0xfe6c
|
|
||||||
UE83B 0xfe76
|
|
||||||
UE855 0xfe91
|
|
||||||
change 6 transcoding relationships
|
|
||||||
U20087 0x95329031
|
|
||||||
U20089 0x95329033
|
|
||||||
U200CC 0x95329730
|
|
||||||
U215D7 0x9536b937
|
|
||||||
U2298F 0x9630ba35
|
|
||||||
U241FE 0x9635b630
|
|
||||||
Test the entire GB18030 charmap, not only the Unicode BMP part.
|
|
||||||
|
|
||||||
Co-authored-by: yangyanchao <yangyanchao6@huawei.com>
|
|
||||||
Co-authored-by: liqingqing <liqingqing3@huawei.com>
|
|
||||||
Co-authored-by: Bruno Haible <bruno@clisp.org>
|
|
||||||
Reviewed-by: Andreas Schwab <schwab@suse.de>
|
|
||||||
Reviewed-by: Mike FABIAN <mfabian@redhat.com>
|
|
||||||
|
|
||||||
diff --git a/iconvdata/gb18030.c b/iconvdata/gb18030.c
|
|
||||||
index 9996a59eaf..be6cfe652c 100644
|
|
||||||
--- a/iconvdata/gb18030.c
|
|
||||||
+++ b/iconvdata/gb18030.c
|
|
||||||
@@ -6009,49 +6009,50 @@ static const uint16_t __twobyte_to_ucs[] =
|
|
||||||
[0x5dc2] = 0xfa0e, [0x5dc3] = 0xfa0f, [0x5dc4] = 0xfa11, [0x5dc5] = 0xfa13,
|
|
||||||
[0x5dc6] = 0xfa14, [0x5dc7] = 0xfa18, [0x5dc8] = 0xfa1f, [0x5dc9] = 0xfa20,
|
|
||||||
[0x5dca] = 0xfa21, [0x5dcb] = 0xfa23, [0x5dcc] = 0xfa24, [0x5dcd] = 0xfa27,
|
|
||||||
- [0x5dce] = 0xfa28, [0x5dcf] = 0xfa29, [0x5dd0] = 0x2e81, [0x5dd4] = 0x2e84,
|
|
||||||
- [0x5dd5] = 0x3473, [0x5dd6] = 0x3447, [0x5dd7] = 0x2e88, [0x5dd8] = 0x2e8b,
|
|
||||||
- [0x5dd9] = 0x9fb4, [0x5dda] = 0x359e, [0x5ddb] = 0x361a, [0x5ddc] = 0x360e,
|
|
||||||
- [0x5ddd] = 0x2e8c, [0x5dde] = 0x2e97, [0x5ddf] = 0x396e, [0x5de0] = 0x3918,
|
|
||||||
- [0x5de1] = 0x9fb5, [0x5de2] = 0x39cf, [0x5de3] = 0x39df, [0x5de4] = 0x3a73,
|
|
||||||
- [0x5de5] = 0x39d0, [0x5de6] = 0x9fb6, [0x5de7] = 0x9fb7, [0x5de8] = 0x3b4e,
|
|
||||||
- [0x5de9] = 0x3c6e, [0x5dea] = 0x3ce0, [0x5deb] = 0x2ea7, [0x5ded] = 0x9fb8,
|
|
||||||
+ [0x5dce] = 0xfa28, [0x5dcf] = 0xfa29, [0x5dd0] = 0x2e81, [0x5dd1] = 0xe816,
|
|
||||||
+ [0x5dd2] = 0xe817, [0x5dd3] = 0xe818, [0x5dd4] = 0x2e84, [0x5dd5] = 0x3473,
|
|
||||||
+ [0x5dd6] = 0x3447, [0x5dd7] = 0x2e88, [0x5dd8] = 0x2e8b, [0x5dd9] = 0x9fb4,
|
|
||||||
+ [0x5dda] = 0x359e, [0x5ddb] = 0x361a, [0x5ddc] = 0x360e, [0x5ddd] = 0x2e8c,
|
|
||||||
+ [0x5dde] = 0x2e97, [0x5ddf] = 0x396e, [0x5de0] = 0x3918, [0x5de1] = 0x9fb5,
|
|
||||||
+ [0x5de2] = 0x39cf, [0x5de3] = 0x39df, [0x5de4] = 0x3a73, [0x5de5] = 0x39d0,
|
|
||||||
+ [0x5de6] = 0x9fb6, [0x5de7] = 0x9fb7, [0x5de8] = 0x3b4e, [0x5de9] = 0x3c6e,
|
|
||||||
+ [0x5dea] = 0x3ce0, [0x5deb] = 0x2ea7, [0x5dec] = 0xe831, [0x5ded] = 0x9fb8,
|
|
||||||
[0x5dee] = 0x2eaa, [0x5def] = 0x4056, [0x5df0] = 0x415f, [0x5df1] = 0x2eae,
|
|
||||||
[0x5df2] = 0x4337, [0x5df3] = 0x2eb3, [0x5df4] = 0x2eb6, [0x5df5] = 0x2eb7,
|
|
||||||
- [0x5df7] = 0x43b1, [0x5df8] = 0x43ac, [0x5df9] = 0x2ebb, [0x5dfa] = 0x43dd,
|
|
||||||
- [0x5dfb] = 0x44d6, [0x5dfc] = 0x4661, [0x5dfd] = 0x464c, [0x5dfe] = 0x9fb9,
|
|
||||||
- [0x5e00] = 0x4723, [0x5e01] = 0x4729, [0x5e02] = 0x477c, [0x5e03] = 0x478d,
|
|
||||||
- [0x5e04] = 0x2eca, [0x5e05] = 0x4947, [0x5e06] = 0x497a, [0x5e07] = 0x497d,
|
|
||||||
- [0x5e08] = 0x4982, [0x5e09] = 0x4983, [0x5e0a] = 0x4985, [0x5e0b] = 0x4986,
|
|
||||||
- [0x5e0c] = 0x499f, [0x5e0d] = 0x499b, [0x5e0e] = 0x49b7, [0x5e0f] = 0x49b6,
|
|
||||||
- [0x5e10] = 0x9fba, [0x5e12] = 0x4ca3, [0x5e13] = 0x4c9f, [0x5e14] = 0x4ca0,
|
|
||||||
- [0x5e15] = 0x4ca1, [0x5e16] = 0x4c77, [0x5e17] = 0x4ca2, [0x5e18] = 0x4d13,
|
|
||||||
- [0x5e19] = 0x4d14, [0x5e1a] = 0x4d15, [0x5e1b] = 0x4d16, [0x5e1c] = 0x4d17,
|
|
||||||
- [0x5e1d] = 0x4d18, [0x5e1e] = 0x4d19, [0x5e1f] = 0x4dae, [0x5e20] = 0x9fbb,
|
|
||||||
- [0x5e21] = 0xe468, [0x5e22] = 0xe469, [0x5e23] = 0xe46a, [0x5e24] = 0xe46b,
|
|
||||||
- [0x5e25] = 0xe46c, [0x5e26] = 0xe46d, [0x5e27] = 0xe46e, [0x5e28] = 0xe46f,
|
|
||||||
- [0x5e29] = 0xe470, [0x5e2a] = 0xe471, [0x5e2b] = 0xe472, [0x5e2c] = 0xe473,
|
|
||||||
- [0x5e2d] = 0xe474, [0x5e2e] = 0xe475, [0x5e2f] = 0xe476, [0x5e30] = 0xe477,
|
|
||||||
- [0x5e31] = 0xe478, [0x5e32] = 0xe479, [0x5e33] = 0xe47a, [0x5e34] = 0xe47b,
|
|
||||||
- [0x5e35] = 0xe47c, [0x5e36] = 0xe47d, [0x5e37] = 0xe47e, [0x5e38] = 0xe47f,
|
|
||||||
- [0x5e39] = 0xe480, [0x5e3a] = 0xe481, [0x5e3b] = 0xe482, [0x5e3c] = 0xe483,
|
|
||||||
- [0x5e3d] = 0xe484, [0x5e3e] = 0xe485, [0x5e3f] = 0xe486, [0x5e40] = 0xe487,
|
|
||||||
- [0x5e41] = 0xe488, [0x5e42] = 0xe489, [0x5e43] = 0xe48a, [0x5e44] = 0xe48b,
|
|
||||||
- [0x5e45] = 0xe48c, [0x5e46] = 0xe48d, [0x5e47] = 0xe48e, [0x5e48] = 0xe48f,
|
|
||||||
- [0x5e49] = 0xe490, [0x5e4a] = 0xe491, [0x5e4b] = 0xe492, [0x5e4c] = 0xe493,
|
|
||||||
- [0x5e4d] = 0xe494, [0x5e4e] = 0xe495, [0x5e4f] = 0xe496, [0x5e50] = 0xe497,
|
|
||||||
- [0x5e51] = 0xe498, [0x5e52] = 0xe499, [0x5e53] = 0xe49a, [0x5e54] = 0xe49b,
|
|
||||||
- [0x5e55] = 0xe49c, [0x5e56] = 0xe49d, [0x5e57] = 0xe49e, [0x5e58] = 0xe49f,
|
|
||||||
- [0x5e59] = 0xe4a0, [0x5e5a] = 0xe4a1, [0x5e5b] = 0xe4a2, [0x5e5c] = 0xe4a3,
|
|
||||||
- [0x5e5d] = 0xe4a4, [0x5e5e] = 0xe4a5, [0x5e5f] = 0xe4a6, [0x5e60] = 0xe4a7,
|
|
||||||
- [0x5e61] = 0xe4a8, [0x5e62] = 0xe4a9, [0x5e63] = 0xe4aa, [0x5e64] = 0xe4ab,
|
|
||||||
- [0x5e65] = 0xe4ac, [0x5e66] = 0xe4ad, [0x5e67] = 0xe4ae, [0x5e68] = 0xe4af,
|
|
||||||
- [0x5e69] = 0xe4b0, [0x5e6a] = 0xe4b1, [0x5e6b] = 0xe4b2, [0x5e6c] = 0xe4b3,
|
|
||||||
- [0x5e6d] = 0xe4b4, [0x5e6e] = 0xe4b5, [0x5e6f] = 0xe4b6, [0x5e70] = 0xe4b7,
|
|
||||||
- [0x5e71] = 0xe4b8, [0x5e72] = 0xe4b9, [0x5e73] = 0xe4ba, [0x5e74] = 0xe4bb,
|
|
||||||
- [0x5e75] = 0xe4bc, [0x5e76] = 0xe4bd, [0x5e77] = 0xe4be, [0x5e78] = 0xe4bf,
|
|
||||||
- [0x5e79] = 0xe4c0, [0x5e7a] = 0xe4c1, [0x5e7b] = 0xe4c2, [0x5e7c] = 0xe4c3,
|
|
||||||
- [0x5e7d] = 0xe4c4, [0x5e7e] = 0xe4c5,
|
|
||||||
+ [0x5df6] = 0xe83b, [0x5df7] = 0x43b1, [0x5df8] = 0x43ac, [0x5df9] = 0x2ebb,
|
|
||||||
+ [0x5dfa] = 0x43dd, [0x5dfb] = 0x44d6, [0x5dfc] = 0x4661, [0x5dfd] = 0x464c,
|
|
||||||
+ [0x5dfe] = 0x9fb9, [0x5e00] = 0x4723, [0x5e01] = 0x4729, [0x5e02] = 0x477c,
|
|
||||||
+ [0x5e03] = 0x478d, [0x5e04] = 0x2eca, [0x5e05] = 0x4947, [0x5e06] = 0x497a,
|
|
||||||
+ [0x5e07] = 0x497d, [0x5e08] = 0x4982, [0x5e09] = 0x4983, [0x5e0a] = 0x4985,
|
|
||||||
+ [0x5e0b] = 0x4986, [0x5e0c] = 0x499f, [0x5e0d] = 0x499b, [0x5e0e] = 0x49b7,
|
|
||||||
+ [0x5e0f] = 0x49b6, [0x5e10] = 0x9fba, [0x5e11] = 0xe855, [0x5e12] = 0x4ca3,
|
|
||||||
+ [0x5e13] = 0x4c9f, [0x5e14] = 0x4ca0, [0x5e15] = 0x4ca1, [0x5e16] = 0x4c77,
|
|
||||||
+ [0x5e17] = 0x4ca2, [0x5e18] = 0x4d13, [0x5e19] = 0x4d14, [0x5e1a] = 0x4d15,
|
|
||||||
+ [0x5e1b] = 0x4d16, [0x5e1c] = 0x4d17, [0x5e1d] = 0x4d18, [0x5e1e] = 0x4d19,
|
|
||||||
+ [0x5e1f] = 0x4dae, [0x5e20] = 0x9fbb, [0x5e21] = 0xe468, [0x5e22] = 0xe469,
|
|
||||||
+ [0x5e23] = 0xe46a, [0x5e24] = 0xe46b, [0x5e25] = 0xe46c, [0x5e26] = 0xe46d,
|
|
||||||
+ [0x5e27] = 0xe46e, [0x5e28] = 0xe46f, [0x5e29] = 0xe470, [0x5e2a] = 0xe471,
|
|
||||||
+ [0x5e2b] = 0xe472, [0x5e2c] = 0xe473, [0x5e2d] = 0xe474, [0x5e2e] = 0xe475,
|
|
||||||
+ [0x5e2f] = 0xe476, [0x5e30] = 0xe477, [0x5e31] = 0xe478, [0x5e32] = 0xe479,
|
|
||||||
+ [0x5e33] = 0xe47a, [0x5e34] = 0xe47b, [0x5e35] = 0xe47c, [0x5e36] = 0xe47d,
|
|
||||||
+ [0x5e37] = 0xe47e, [0x5e38] = 0xe47f, [0x5e39] = 0xe480, [0x5e3a] = 0xe481,
|
|
||||||
+ [0x5e3b] = 0xe482, [0x5e3c] = 0xe483, [0x5e3d] = 0xe484, [0x5e3e] = 0xe485,
|
|
||||||
+ [0x5e3f] = 0xe486, [0x5e40] = 0xe487, [0x5e41] = 0xe488, [0x5e42] = 0xe489,
|
|
||||||
+ [0x5e43] = 0xe48a, [0x5e44] = 0xe48b, [0x5e45] = 0xe48c, [0x5e46] = 0xe48d,
|
|
||||||
+ [0x5e47] = 0xe48e, [0x5e48] = 0xe48f, [0x5e49] = 0xe490, [0x5e4a] = 0xe491,
|
|
||||||
+ [0x5e4b] = 0xe492, [0x5e4c] = 0xe493, [0x5e4d] = 0xe494, [0x5e4e] = 0xe495,
|
|
||||||
+ [0x5e4f] = 0xe496, [0x5e50] = 0xe497, [0x5e51] = 0xe498, [0x5e52] = 0xe499,
|
|
||||||
+ [0x5e53] = 0xe49a, [0x5e54] = 0xe49b, [0x5e55] = 0xe49c, [0x5e56] = 0xe49d,
|
|
||||||
+ [0x5e57] = 0xe49e, [0x5e58] = 0xe49f, [0x5e59] = 0xe4a0, [0x5e5a] = 0xe4a1,
|
|
||||||
+ [0x5e5b] = 0xe4a2, [0x5e5c] = 0xe4a3, [0x5e5d] = 0xe4a4, [0x5e5e] = 0xe4a5,
|
|
||||||
+ [0x5e5f] = 0xe4a6, [0x5e60] = 0xe4a7, [0x5e61] = 0xe4a8, [0x5e62] = 0xe4a9,
|
|
||||||
+ [0x5e63] = 0xe4aa, [0x5e64] = 0xe4ab, [0x5e65] = 0xe4ac, [0x5e66] = 0xe4ad,
|
|
||||||
+ [0x5e67] = 0xe4ae, [0x5e68] = 0xe4af, [0x5e69] = 0xe4b0, [0x5e6a] = 0xe4b1,
|
|
||||||
+ [0x5e6b] = 0xe4b2, [0x5e6c] = 0xe4b3, [0x5e6d] = 0xe4b4, [0x5e6e] = 0xe4b5,
|
|
||||||
+ [0x5e6f] = 0xe4b6, [0x5e70] = 0xe4b7, [0x5e71] = 0xe4b8, [0x5e72] = 0xe4b9,
|
|
||||||
+ [0x5e73] = 0xe4ba, [0x5e74] = 0xe4bb, [0x5e75] = 0xe4bc, [0x5e76] = 0xe4bd,
|
|
||||||
+ [0x5e77] = 0xe4be, [0x5e78] = 0xe4bf, [0x5e79] = 0xe4c0, [0x5e7a] = 0xe4c1,
|
|
||||||
+ [0x5e7b] = 0xe4c2, [0x5e7c] = 0xe4c3, [0x5e7d] = 0xe4c4, [0x5e7e] = 0xe4c5,
|
|
||||||
};
|
|
||||||
|
|
||||||
/* Table for GB18030 -> UCS-4, containing the four-byte characters only,
|
|
||||||
@@ -8680,7 +8681,9 @@ static const uint16_t __fourbyte_to_ucs[0x99e2 - 6637 - 2110 - 14404 - 4295] =
|
|
||||||
[0x2838] = 0x9fa6, [0x2839] = 0x9fa7, [0x283a] = 0x9fa8, [0x283b] = 0x9fa9,
|
|
||||||
[0x283c] = 0x9faa, [0x283d] = 0x9fab, [0x283e] = 0x9fac, [0x283f] = 0x9fad,
|
|
||||||
[0x2840] = 0x9fae, [0x2841] = 0x9faf, [0x2842] = 0x9fb0, [0x2843] = 0x9fb1,
|
|
||||||
- [0x2844] = 0x9fb2, [0x2845] = 0x9fb3, [0x284e] = 0xe76c, [0x284f] = 0xe7c8,
|
|
||||||
+ [0x2844] = 0x9fb2, [0x2845] = 0x9fb3, [0x2846] = 0xe81e, [0x2847] = 0xe826,
|
|
||||||
+ [0x2848] = 0xe82b, [0x2849] = 0xe82c, [0x284a] = 0xe832, [0x284b] = 0xe843,
|
|
||||||
+ [0x284c] = 0xe854, [0x284d] = 0xe864, [0x284e] = 0xe76c, [0x284f] = 0xe7c8,
|
|
||||||
[0x2850] = 0xe7e7, [0x2851] = 0xe7e8, [0x2852] = 0xe7e9, [0x2853] = 0xe7ea,
|
|
||||||
[0x2854] = 0xe7eb, [0x2855] = 0xe7ec, [0x2856] = 0xe7ed, [0x2857] = 0xe7ee,
|
|
||||||
[0x2858] = 0xe7ef, [0x2859] = 0xe7f0, [0x285a] = 0xe7f1, [0x285b] = 0xe7f2,
|
|
||||||
@@ -9008,84 +9011,86 @@ static const uint16_t __fourbyte_to_ucs[0x99e2 - 6637 - 2110 - 14404 - 4295] =
|
|
||||||
[0x2d60] = 0xfe02, [0x2d61] = 0xfe03, [0x2d62] = 0xfe04, [0x2d63] = 0xfe05,
|
|
||||||
[0x2d64] = 0xfe06, [0x2d65] = 0xfe07, [0x2d66] = 0xfe08, [0x2d67] = 0xfe09,
|
|
||||||
[0x2d68] = 0xfe0a, [0x2d69] = 0xfe0b, [0x2d6a] = 0xfe0c, [0x2d6b] = 0xfe0d,
|
|
||||||
- [0x2d6c] = 0xfe0e, [0x2d6d] = 0xfe0f, [0x2d78] = 0xfe1a, [0x2d79] = 0xfe1b,
|
|
||||||
- [0x2d7a] = 0xfe1c, [0x2d7b] = 0xfe1d, [0x2d7c] = 0xfe1e, [0x2d7d] = 0xfe1f,
|
|
||||||
- [0x2d7e] = 0xfe20, [0x2d7f] = 0xfe21, [0x2d80] = 0xfe22, [0x2d81] = 0xfe23,
|
|
||||||
- [0x2d82] = 0xfe24, [0x2d83] = 0xfe25, [0x2d84] = 0xfe26, [0x2d85] = 0xfe27,
|
|
||||||
- [0x2d86] = 0xfe28, [0x2d87] = 0xfe29, [0x2d88] = 0xfe2a, [0x2d89] = 0xfe2b,
|
|
||||||
- [0x2d8a] = 0xfe2c, [0x2d8b] = 0xfe2d, [0x2d8c] = 0xfe2e, [0x2d8d] = 0xfe2f,
|
|
||||||
- [0x2d8e] = 0xfe32, [0x2d8f] = 0xfe45, [0x2d90] = 0xfe46, [0x2d91] = 0xfe47,
|
|
||||||
- [0x2d92] = 0xfe48, [0x2d93] = 0xfe53, [0x2d94] = 0xfe58, [0x2d95] = 0xfe67,
|
|
||||||
- [0x2d96] = 0xfe6c, [0x2d97] = 0xfe6d, [0x2d98] = 0xfe6e, [0x2d99] = 0xfe6f,
|
|
||||||
- [0x2d9a] = 0xfe70, [0x2d9b] = 0xfe71, [0x2d9c] = 0xfe72, [0x2d9d] = 0xfe73,
|
|
||||||
- [0x2d9e] = 0xfe74, [0x2d9f] = 0xfe75, [0x2da0] = 0xfe76, [0x2da1] = 0xfe77,
|
|
||||||
- [0x2da2] = 0xfe78, [0x2da3] = 0xfe79, [0x2da4] = 0xfe7a, [0x2da5] = 0xfe7b,
|
|
||||||
- [0x2da6] = 0xfe7c, [0x2da7] = 0xfe7d, [0x2da8] = 0xfe7e, [0x2da9] = 0xfe7f,
|
|
||||||
- [0x2daa] = 0xfe80, [0x2dab] = 0xfe81, [0x2dac] = 0xfe82, [0x2dad] = 0xfe83,
|
|
||||||
- [0x2dae] = 0xfe84, [0x2daf] = 0xfe85, [0x2db0] = 0xfe86, [0x2db1] = 0xfe87,
|
|
||||||
- [0x2db2] = 0xfe88, [0x2db3] = 0xfe89, [0x2db4] = 0xfe8a, [0x2db5] = 0xfe8b,
|
|
||||||
- [0x2db6] = 0xfe8c, [0x2db7] = 0xfe8d, [0x2db8] = 0xfe8e, [0x2db9] = 0xfe8f,
|
|
||||||
- [0x2dba] = 0xfe90, [0x2dbb] = 0xfe91, [0x2dbc] = 0xfe92, [0x2dbd] = 0xfe93,
|
|
||||||
- [0x2dbe] = 0xfe94, [0x2dbf] = 0xfe95, [0x2dc0] = 0xfe96, [0x2dc1] = 0xfe97,
|
|
||||||
- [0x2dc2] = 0xfe98, [0x2dc3] = 0xfe99, [0x2dc4] = 0xfe9a, [0x2dc5] = 0xfe9b,
|
|
||||||
- [0x2dc6] = 0xfe9c, [0x2dc7] = 0xfe9d, [0x2dc8] = 0xfe9e, [0x2dc9] = 0xfe9f,
|
|
||||||
- [0x2dca] = 0xfea0, [0x2dcb] = 0xfea1, [0x2dcc] = 0xfea2, [0x2dcd] = 0xfea3,
|
|
||||||
- [0x2dce] = 0xfea4, [0x2dcf] = 0xfea5, [0x2dd0] = 0xfea6, [0x2dd1] = 0xfea7,
|
|
||||||
- [0x2dd2] = 0xfea8, [0x2dd3] = 0xfea9, [0x2dd4] = 0xfeaa, [0x2dd5] = 0xfeab,
|
|
||||||
- [0x2dd6] = 0xfeac, [0x2dd7] = 0xfead, [0x2dd8] = 0xfeae, [0x2dd9] = 0xfeaf,
|
|
||||||
- [0x2dda] = 0xfeb0, [0x2ddb] = 0xfeb1, [0x2ddc] = 0xfeb2, [0x2ddd] = 0xfeb3,
|
|
||||||
- [0x2dde] = 0xfeb4, [0x2ddf] = 0xfeb5, [0x2de0] = 0xfeb6, [0x2de1] = 0xfeb7,
|
|
||||||
- [0x2de2] = 0xfeb8, [0x2de3] = 0xfeb9, [0x2de4] = 0xfeba, [0x2de5] = 0xfebb,
|
|
||||||
- [0x2de6] = 0xfebc, [0x2de7] = 0xfebd, [0x2de8] = 0xfebe, [0x2de9] = 0xfebf,
|
|
||||||
- [0x2dea] = 0xfec0, [0x2deb] = 0xfec1, [0x2dec] = 0xfec2, [0x2ded] = 0xfec3,
|
|
||||||
- [0x2dee] = 0xfec4, [0x2def] = 0xfec5, [0x2df0] = 0xfec6, [0x2df1] = 0xfec7,
|
|
||||||
- [0x2df2] = 0xfec8, [0x2df3] = 0xfec9, [0x2df4] = 0xfeca, [0x2df5] = 0xfecb,
|
|
||||||
- [0x2df6] = 0xfecc, [0x2df7] = 0xfecd, [0x2df8] = 0xfece, [0x2df9] = 0xfecf,
|
|
||||||
- [0x2dfa] = 0xfed0, [0x2dfb] = 0xfed1, [0x2dfc] = 0xfed2, [0x2dfd] = 0xfed3,
|
|
||||||
- [0x2dfe] = 0xfed4, [0x2dff] = 0xfed5, [0x2e00] = 0xfed6, [0x2e01] = 0xfed7,
|
|
||||||
- [0x2e02] = 0xfed8, [0x2e03] = 0xfed9, [0x2e04] = 0xfeda, [0x2e05] = 0xfedb,
|
|
||||||
- [0x2e06] = 0xfedc, [0x2e07] = 0xfedd, [0x2e08] = 0xfede, [0x2e09] = 0xfedf,
|
|
||||||
- [0x2e0a] = 0xfee0, [0x2e0b] = 0xfee1, [0x2e0c] = 0xfee2, [0x2e0d] = 0xfee3,
|
|
||||||
- [0x2e0e] = 0xfee4, [0x2e0f] = 0xfee5, [0x2e10] = 0xfee6, [0x2e11] = 0xfee7,
|
|
||||||
- [0x2e12] = 0xfee8, [0x2e13] = 0xfee9, [0x2e14] = 0xfeea, [0x2e15] = 0xfeeb,
|
|
||||||
- [0x2e16] = 0xfeec, [0x2e17] = 0xfeed, [0x2e18] = 0xfeee, [0x2e19] = 0xfeef,
|
|
||||||
- [0x2e1a] = 0xfef0, [0x2e1b] = 0xfef1, [0x2e1c] = 0xfef2, [0x2e1d] = 0xfef3,
|
|
||||||
- [0x2e1e] = 0xfef4, [0x2e1f] = 0xfef5, [0x2e20] = 0xfef6, [0x2e21] = 0xfef7,
|
|
||||||
- [0x2e22] = 0xfef8, [0x2e23] = 0xfef9, [0x2e24] = 0xfefa, [0x2e25] = 0xfefb,
|
|
||||||
- [0x2e26] = 0xfefc, [0x2e27] = 0xfefd, [0x2e28] = 0xfefe, [0x2e29] = 0xfeff,
|
|
||||||
- [0x2e2a] = 0xff00, [0x2e2b] = 0xff5f, [0x2e2c] = 0xff60, [0x2e2d] = 0xff61,
|
|
||||||
- [0x2e2e] = 0xff62, [0x2e2f] = 0xff63, [0x2e30] = 0xff64, [0x2e31] = 0xff65,
|
|
||||||
- [0x2e32] = 0xff66, [0x2e33] = 0xff67, [0x2e34] = 0xff68, [0x2e35] = 0xff69,
|
|
||||||
- [0x2e36] = 0xff6a, [0x2e37] = 0xff6b, [0x2e38] = 0xff6c, [0x2e39] = 0xff6d,
|
|
||||||
- [0x2e3a] = 0xff6e, [0x2e3b] = 0xff6f, [0x2e3c] = 0xff70, [0x2e3d] = 0xff71,
|
|
||||||
- [0x2e3e] = 0xff72, [0x2e3f] = 0xff73, [0x2e40] = 0xff74, [0x2e41] = 0xff75,
|
|
||||||
- [0x2e42] = 0xff76, [0x2e43] = 0xff77, [0x2e44] = 0xff78, [0x2e45] = 0xff79,
|
|
||||||
- [0x2e46] = 0xff7a, [0x2e47] = 0xff7b, [0x2e48] = 0xff7c, [0x2e49] = 0xff7d,
|
|
||||||
- [0x2e4a] = 0xff7e, [0x2e4b] = 0xff7f, [0x2e4c] = 0xff80, [0x2e4d] = 0xff81,
|
|
||||||
- [0x2e4e] = 0xff82, [0x2e4f] = 0xff83, [0x2e50] = 0xff84, [0x2e51] = 0xff85,
|
|
||||||
- [0x2e52] = 0xff86, [0x2e53] = 0xff87, [0x2e54] = 0xff88, [0x2e55] = 0xff89,
|
|
||||||
- [0x2e56] = 0xff8a, [0x2e57] = 0xff8b, [0x2e58] = 0xff8c, [0x2e59] = 0xff8d,
|
|
||||||
- [0x2e5a] = 0xff8e, [0x2e5b] = 0xff8f, [0x2e5c] = 0xff90, [0x2e5d] = 0xff91,
|
|
||||||
- [0x2e5e] = 0xff92, [0x2e5f] = 0xff93, [0x2e60] = 0xff94, [0x2e61] = 0xff95,
|
|
||||||
- [0x2e62] = 0xff96, [0x2e63] = 0xff97, [0x2e64] = 0xff98, [0x2e65] = 0xff99,
|
|
||||||
- [0x2e66] = 0xff9a, [0x2e67] = 0xff9b, [0x2e68] = 0xff9c, [0x2e69] = 0xff9d,
|
|
||||||
- [0x2e6a] = 0xff9e, [0x2e6b] = 0xff9f, [0x2e6c] = 0xffa0, [0x2e6d] = 0xffa1,
|
|
||||||
- [0x2e6e] = 0xffa2, [0x2e6f] = 0xffa3, [0x2e70] = 0xffa4, [0x2e71] = 0xffa5,
|
|
||||||
- [0x2e72] = 0xffa6, [0x2e73] = 0xffa7, [0x2e74] = 0xffa8, [0x2e75] = 0xffa9,
|
|
||||||
- [0x2e76] = 0xffaa, [0x2e77] = 0xffab, [0x2e78] = 0xffac, [0x2e79] = 0xffad,
|
|
||||||
- [0x2e7a] = 0xffae, [0x2e7b] = 0xffaf, [0x2e7c] = 0xffb0, [0x2e7d] = 0xffb1,
|
|
||||||
- [0x2e7e] = 0xffb2, [0x2e7f] = 0xffb3, [0x2e80] = 0xffb4, [0x2e81] = 0xffb5,
|
|
||||||
- [0x2e82] = 0xffb6, [0x2e83] = 0xffb7, [0x2e84] = 0xffb8, [0x2e85] = 0xffb9,
|
|
||||||
- [0x2e86] = 0xffba, [0x2e87] = 0xffbb, [0x2e88] = 0xffbc, [0x2e89] = 0xffbd,
|
|
||||||
- [0x2e8a] = 0xffbe, [0x2e8b] = 0xffbf, [0x2e8c] = 0xffc0, [0x2e8d] = 0xffc1,
|
|
||||||
- [0x2e8e] = 0xffc2, [0x2e8f] = 0xffc3, [0x2e90] = 0xffc4, [0x2e91] = 0xffc5,
|
|
||||||
- [0x2e92] = 0xffc6, [0x2e93] = 0xffc7, [0x2e94] = 0xffc8, [0x2e95] = 0xffc9,
|
|
||||||
- [0x2e96] = 0xffca, [0x2e97] = 0xffcb, [0x2e98] = 0xffcc, [0x2e99] = 0xffcd,
|
|
||||||
- [0x2e9a] = 0xffce, [0x2e9b] = 0xffcf, [0x2e9c] = 0xffd0, [0x2e9d] = 0xffd1,
|
|
||||||
- [0x2e9e] = 0xffd2, [0x2e9f] = 0xffd3, [0x2ea0] = 0xffd4, [0x2ea1] = 0xffd5,
|
|
||||||
- [0x2ea2] = 0xffd6, [0x2ea3] = 0xffd7, [0x2ea4] = 0xffd8, [0x2ea5] = 0xffd9,
|
|
||||||
- [0x2ea6] = 0xffda, [0x2ea7] = 0xffdb, [0x2ea8] = 0xffdc, [0x2ea9] = 0xffdd,
|
|
||||||
- [0x2eaa] = 0xffde, [0x2eab] = 0xffdf,
|
|
||||||
+ [0x2d6c] = 0xfe0e, [0x2d6d] = 0xfe0f, [0x2d6e] = 0xe78d, [0x2d6f] = 0xe78f,
|
|
||||||
+ [0x2d70] = 0xe78e, [0x2d71] = 0xe790, [0x2d72] = 0xe791, [0x2d73] = 0xe792,
|
|
||||||
+ [0x2d74] = 0xe793, [0x2d75] = 0xe794, [0x2d76] = 0xe795, [0x2d77] = 0xe796,
|
|
||||||
+ [0x2d78] = 0xfe1a, [0x2d79] = 0xfe1b, [0x2d7a] = 0xfe1c, [0x2d7b] = 0xfe1d,
|
|
||||||
+ [0x2d7c] = 0xfe1e, [0x2d7d] = 0xfe1f, [0x2d7e] = 0xfe20, [0x2d7f] = 0xfe21,
|
|
||||||
+ [0x2d80] = 0xfe22, [0x2d81] = 0xfe23, [0x2d82] = 0xfe24, [0x2d83] = 0xfe25,
|
|
||||||
+ [0x2d84] = 0xfe26, [0x2d85] = 0xfe27, [0x2d86] = 0xfe28, [0x2d87] = 0xfe29,
|
|
||||||
+ [0x2d88] = 0xfe2a, [0x2d89] = 0xfe2b, [0x2d8a] = 0xfe2c, [0x2d8b] = 0xfe2d,
|
|
||||||
+ [0x2d8c] = 0xfe2e, [0x2d8d] = 0xfe2f, [0x2d8e] = 0xfe32, [0x2d8f] = 0xfe45,
|
|
||||||
+ [0x2d90] = 0xfe46, [0x2d91] = 0xfe47, [0x2d92] = 0xfe48, [0x2d93] = 0xfe53,
|
|
||||||
+ [0x2d94] = 0xfe58, [0x2d95] = 0xfe67, [0x2d96] = 0xfe6c, [0x2d97] = 0xfe6d,
|
|
||||||
+ [0x2d98] = 0xfe6e, [0x2d99] = 0xfe6f, [0x2d9a] = 0xfe70, [0x2d9b] = 0xfe71,
|
|
||||||
+ [0x2d9c] = 0xfe72, [0x2d9d] = 0xfe73, [0x2d9e] = 0xfe74, [0x2d9f] = 0xfe75,
|
|
||||||
+ [0x2da0] = 0xfe76, [0x2da1] = 0xfe77, [0x2da2] = 0xfe78, [0x2da3] = 0xfe79,
|
|
||||||
+ [0x2da4] = 0xfe7a, [0x2da5] = 0xfe7b, [0x2da6] = 0xfe7c, [0x2da7] = 0xfe7d,
|
|
||||||
+ [0x2da8] = 0xfe7e, [0x2da9] = 0xfe7f, [0x2daa] = 0xfe80, [0x2dab] = 0xfe81,
|
|
||||||
+ [0x2dac] = 0xfe82, [0x2dad] = 0xfe83, [0x2dae] = 0xfe84, [0x2daf] = 0xfe85,
|
|
||||||
+ [0x2db0] = 0xfe86, [0x2db1] = 0xfe87, [0x2db2] = 0xfe88, [0x2db3] = 0xfe89,
|
|
||||||
+ [0x2db4] = 0xfe8a, [0x2db5] = 0xfe8b, [0x2db6] = 0xfe8c, [0x2db7] = 0xfe8d,
|
|
||||||
+ [0x2db8] = 0xfe8e, [0x2db9] = 0xfe8f, [0x2dba] = 0xfe90, [0x2dbb] = 0xfe91,
|
|
||||||
+ [0x2dbc] = 0xfe92, [0x2dbd] = 0xfe93, [0x2dbe] = 0xfe94, [0x2dbf] = 0xfe95,
|
|
||||||
+ [0x2dc0] = 0xfe96, [0x2dc1] = 0xfe97, [0x2dc2] = 0xfe98, [0x2dc3] = 0xfe99,
|
|
||||||
+ [0x2dc4] = 0xfe9a, [0x2dc5] = 0xfe9b, [0x2dc6] = 0xfe9c, [0x2dc7] = 0xfe9d,
|
|
||||||
+ [0x2dc8] = 0xfe9e, [0x2dc9] = 0xfe9f, [0x2dca] = 0xfea0, [0x2dcb] = 0xfea1,
|
|
||||||
+ [0x2dcc] = 0xfea2, [0x2dcd] = 0xfea3, [0x2dce] = 0xfea4, [0x2dcf] = 0xfea5,
|
|
||||||
+ [0x2dd0] = 0xfea6, [0x2dd1] = 0xfea7, [0x2dd2] = 0xfea8, [0x2dd3] = 0xfea9,
|
|
||||||
+ [0x2dd4] = 0xfeaa, [0x2dd5] = 0xfeab, [0x2dd6] = 0xfeac, [0x2dd7] = 0xfead,
|
|
||||||
+ [0x2dd8] = 0xfeae, [0x2dd9] = 0xfeaf, [0x2dda] = 0xfeb0, [0x2ddb] = 0xfeb1,
|
|
||||||
+ [0x2ddc] = 0xfeb2, [0x2ddd] = 0xfeb3, [0x2dde] = 0xfeb4, [0x2ddf] = 0xfeb5,
|
|
||||||
+ [0x2de0] = 0xfeb6, [0x2de1] = 0xfeb7, [0x2de2] = 0xfeb8, [0x2de3] = 0xfeb9,
|
|
||||||
+ [0x2de4] = 0xfeba, [0x2de5] = 0xfebb, [0x2de6] = 0xfebc, [0x2de7] = 0xfebd,
|
|
||||||
+ [0x2de8] = 0xfebe, [0x2de9] = 0xfebf, [0x2dea] = 0xfec0, [0x2deb] = 0xfec1,
|
|
||||||
+ [0x2dec] = 0xfec2, [0x2ded] = 0xfec3, [0x2dee] = 0xfec4, [0x2def] = 0xfec5,
|
|
||||||
+ [0x2df0] = 0xfec6, [0x2df1] = 0xfec7, [0x2df2] = 0xfec8, [0x2df3] = 0xfec9,
|
|
||||||
+ [0x2df4] = 0xfeca, [0x2df5] = 0xfecb, [0x2df6] = 0xfecc, [0x2df7] = 0xfecd,
|
|
||||||
+ [0x2df8] = 0xfece, [0x2df9] = 0xfecf, [0x2dfa] = 0xfed0, [0x2dfb] = 0xfed1,
|
|
||||||
+ [0x2dfc] = 0xfed2, [0x2dfd] = 0xfed3, [0x2dfe] = 0xfed4, [0x2dff] = 0xfed5,
|
|
||||||
+ [0x2e00] = 0xfed6, [0x2e01] = 0xfed7, [0x2e02] = 0xfed8, [0x2e03] = 0xfed9,
|
|
||||||
+ [0x2e04] = 0xfeda, [0x2e05] = 0xfedb, [0x2e06] = 0xfedc, [0x2e07] = 0xfedd,
|
|
||||||
+ [0x2e08] = 0xfede, [0x2e09] = 0xfedf, [0x2e0a] = 0xfee0, [0x2e0b] = 0xfee1,
|
|
||||||
+ [0x2e0c] = 0xfee2, [0x2e0d] = 0xfee3, [0x2e0e] = 0xfee4, [0x2e0f] = 0xfee5,
|
|
||||||
+ [0x2e10] = 0xfee6, [0x2e11] = 0xfee7, [0x2e12] = 0xfee8, [0x2e13] = 0xfee9,
|
|
||||||
+ [0x2e14] = 0xfeea, [0x2e15] = 0xfeeb, [0x2e16] = 0xfeec, [0x2e17] = 0xfeed,
|
|
||||||
+ [0x2e18] = 0xfeee, [0x2e19] = 0xfeef, [0x2e1a] = 0xfef0, [0x2e1b] = 0xfef1,
|
|
||||||
+ [0x2e1c] = 0xfef2, [0x2e1d] = 0xfef3, [0x2e1e] = 0xfef4, [0x2e1f] = 0xfef5,
|
|
||||||
+ [0x2e20] = 0xfef6, [0x2e21] = 0xfef7, [0x2e22] = 0xfef8, [0x2e23] = 0xfef9,
|
|
||||||
+ [0x2e24] = 0xfefa, [0x2e25] = 0xfefb, [0x2e26] = 0xfefc, [0x2e27] = 0xfefd,
|
|
||||||
+ [0x2e28] = 0xfefe, [0x2e29] = 0xfeff, [0x2e2a] = 0xff00, [0x2e2b] = 0xff5f,
|
|
||||||
+ [0x2e2c] = 0xff60, [0x2e2d] = 0xff61, [0x2e2e] = 0xff62, [0x2e2f] = 0xff63,
|
|
||||||
+ [0x2e30] = 0xff64, [0x2e31] = 0xff65, [0x2e32] = 0xff66, [0x2e33] = 0xff67,
|
|
||||||
+ [0x2e34] = 0xff68, [0x2e35] = 0xff69, [0x2e36] = 0xff6a, [0x2e37] = 0xff6b,
|
|
||||||
+ [0x2e38] = 0xff6c, [0x2e39] = 0xff6d, [0x2e3a] = 0xff6e, [0x2e3b] = 0xff6f,
|
|
||||||
+ [0x2e3c] = 0xff70, [0x2e3d] = 0xff71, [0x2e3e] = 0xff72, [0x2e3f] = 0xff73,
|
|
||||||
+ [0x2e40] = 0xff74, [0x2e41] = 0xff75, [0x2e42] = 0xff76, [0x2e43] = 0xff77,
|
|
||||||
+ [0x2e44] = 0xff78, [0x2e45] = 0xff79, [0x2e46] = 0xff7a, [0x2e47] = 0xff7b,
|
|
||||||
+ [0x2e48] = 0xff7c, [0x2e49] = 0xff7d, [0x2e4a] = 0xff7e, [0x2e4b] = 0xff7f,
|
|
||||||
+ [0x2e4c] = 0xff80, [0x2e4d] = 0xff81, [0x2e4e] = 0xff82, [0x2e4f] = 0xff83,
|
|
||||||
+ [0x2e50] = 0xff84, [0x2e51] = 0xff85, [0x2e52] = 0xff86, [0x2e53] = 0xff87,
|
|
||||||
+ [0x2e54] = 0xff88, [0x2e55] = 0xff89, [0x2e56] = 0xff8a, [0x2e57] = 0xff8b,
|
|
||||||
+ [0x2e58] = 0xff8c, [0x2e59] = 0xff8d, [0x2e5a] = 0xff8e, [0x2e5b] = 0xff8f,
|
|
||||||
+ [0x2e5c] = 0xff90, [0x2e5d] = 0xff91, [0x2e5e] = 0xff92, [0x2e5f] = 0xff93,
|
|
||||||
+ [0x2e60] = 0xff94, [0x2e61] = 0xff95, [0x2e62] = 0xff96, [0x2e63] = 0xff97,
|
|
||||||
+ [0x2e64] = 0xff98, [0x2e65] = 0xff99, [0x2e66] = 0xff9a, [0x2e67] = 0xff9b,
|
|
||||||
+ [0x2e68] = 0xff9c, [0x2e69] = 0xff9d, [0x2e6a] = 0xff9e, [0x2e6b] = 0xff9f,
|
|
||||||
+ [0x2e6c] = 0xffa0, [0x2e6d] = 0xffa1, [0x2e6e] = 0xffa2, [0x2e6f] = 0xffa3,
|
|
||||||
+ [0x2e70] = 0xffa4, [0x2e71] = 0xffa5, [0x2e72] = 0xffa6, [0x2e73] = 0xffa7,
|
|
||||||
+ [0x2e74] = 0xffa8, [0x2e75] = 0xffa9, [0x2e76] = 0xffaa, [0x2e77] = 0xffab,
|
|
||||||
+ [0x2e78] = 0xffac, [0x2e79] = 0xffad, [0x2e7a] = 0xffae, [0x2e7b] = 0xffaf,
|
|
||||||
+ [0x2e7c] = 0xffb0, [0x2e7d] = 0xffb1, [0x2e7e] = 0xffb2, [0x2e7f] = 0xffb3,
|
|
||||||
+ [0x2e80] = 0xffb4, [0x2e81] = 0xffb5, [0x2e82] = 0xffb6, [0x2e83] = 0xffb7,
|
|
||||||
+ [0x2e84] = 0xffb8, [0x2e85] = 0xffb9, [0x2e86] = 0xffba, [0x2e87] = 0xffbb,
|
|
||||||
+ [0x2e88] = 0xffbc, [0x2e89] = 0xffbd, [0x2e8a] = 0xffbe, [0x2e8b] = 0xffbf,
|
|
||||||
+ [0x2e8c] = 0xffc0, [0x2e8d] = 0xffc1, [0x2e8e] = 0xffc2, [0x2e8f] = 0xffc3,
|
|
||||||
+ [0x2e90] = 0xffc4, [0x2e91] = 0xffc5, [0x2e92] = 0xffc6, [0x2e93] = 0xffc7,
|
|
||||||
+ [0x2e94] = 0xffc8, [0x2e95] = 0xffc9, [0x2e96] = 0xffca, [0x2e97] = 0xffcb,
|
|
||||||
+ [0x2e98] = 0xffcc, [0x2e99] = 0xffcd, [0x2e9a] = 0xffce, [0x2e9b] = 0xffcf,
|
|
||||||
+ [0x2e9c] = 0xffd0, [0x2e9d] = 0xffd1, [0x2e9e] = 0xffd2, [0x2e9f] = 0xffd3,
|
|
||||||
+ [0x2ea0] = 0xffd4, [0x2ea1] = 0xffd5, [0x2ea2] = 0xffd6, [0x2ea3] = 0xffd7,
|
|
||||||
+ [0x2ea4] = 0xffd8, [0x2ea5] = 0xffd9, [0x2ea6] = 0xffda, [0x2ea7] = 0xffdb,
|
|
||||||
+ [0x2ea8] = 0xffdc, [0x2ea9] = 0xffdd, [0x2eaa] = 0xffde, [0x2eab] = 0xffdf,
|
|
||||||
};
|
|
||||||
|
|
||||||
/* Table for UCS-4 -> GB18030, for the range U+0080..U+9FBB.
|
|
||||||
@@ -23437,71 +23442,79 @@ static const unsigned char __ucs_to_gb18030_tab2[][2] =
|
|
||||||
[0x0783] = "\xa5\xfd", [0x0784] = "\xa5\xfe", [0x0785] = "\xa6\xb9",
|
|
||||||
[0x0786] = "\xa6\xba", [0x0787] = "\xa6\xbb", [0x0788] = "\xa6\xbc",
|
|
||||||
[0x0789] = "\xa6\xbd", [0x078a] = "\xa6\xbe", [0x078b] = "\xa6\xbf",
|
|
||||||
- [0x078c] = "\xa6\xc0", [0x0797] = "\xa6\xf6", [0x0798] = "\xa6\xf7",
|
|
||||||
- [0x0799] = "\xa6\xf8", [0x079a] = "\xa6\xf9", [0x079b] = "\xa6\xfa",
|
|
||||||
- [0x079c] = "\xa6\xfb", [0x079d] = "\xa6\xfc", [0x079e] = "\xa6\xfd",
|
|
||||||
- [0x079f] = "\xa6\xfe", [0x07a0] = "\xa7\xc2", [0x07a1] = "\xa7\xc3",
|
|
||||||
- [0x07a2] = "\xa7\xc4", [0x07a3] = "\xa7\xc5", [0x07a4] = "\xa7\xc6",
|
|
||||||
- [0x07a5] = "\xa7\xc7", [0x07a6] = "\xa7\xc8", [0x07a7] = "\xa7\xc9",
|
|
||||||
- [0x07a8] = "\xa7\xca", [0x07a9] = "\xa7\xcb", [0x07aa] = "\xa7\xcc",
|
|
||||||
- [0x07ab] = "\xa7\xcd", [0x07ac] = "\xa7\xce", [0x07ad] = "\xa7\xcf",
|
|
||||||
- [0x07ae] = "\xa7\xd0", [0x07af] = "\xa7\xf2", [0x07b0] = "\xa7\xf3",
|
|
||||||
- [0x07b1] = "\xa7\xf4", [0x07b2] = "\xa7\xf5", [0x07b3] = "\xa7\xf6",
|
|
||||||
- [0x07b4] = "\xa7\xf7", [0x07b5] = "\xa7\xf8", [0x07b6] = "\xa7\xf9",
|
|
||||||
- [0x07b7] = "\xa7\xfa", [0x07b8] = "\xa7\xfb", [0x07b9] = "\xa7\xfc",
|
|
||||||
- [0x07ba] = "\xa7\xfd", [0x07bb] = "\xa7\xfe", [0x07bc] = "\xa8\x96",
|
|
||||||
- [0x07bd] = "\xa8\x97", [0x07be] = "\xa8\x98", [0x07bf] = "\xa8\x99",
|
|
||||||
- [0x07c0] = "\xa8\x9a", [0x07c1] = "\xa8\x9b", [0x07c2] = "\xa8\x9c",
|
|
||||||
- [0x07c3] = "\xa8\x9d", [0x07c4] = "\xa8\x9e", [0x07c5] = "\xa8\x9f",
|
|
||||||
- [0x07c6] = "\xa8\xa0", [0x07c7] = "\x00\x01", [0x07c8] = "\x65\x9e",
|
|
||||||
- [0x07c9] = "\xa8\xc1", [0x07ca] = "\xa8\xc2", [0x07cb] = "\xa8\xc3",
|
|
||||||
- [0x07cc] = "\xa8\xc4", [0x07cd] = "\xa8\xea", [0x07ce] = "\xa8\xeb",
|
|
||||||
- [0x07cf] = "\xa8\xec", [0x07d0] = "\xa8\xed", [0x07d1] = "\xa8\xee",
|
|
||||||
- [0x07d2] = "\xa8\xef", [0x07d3] = "\xa8\xf0", [0x07d4] = "\xa8\xf1",
|
|
||||||
- [0x07d5] = "\xa8\xf2", [0x07d6] = "\xa8\xf3", [0x07d7] = "\xa8\xf4",
|
|
||||||
- [0x07d8] = "\xa8\xf5", [0x07d9] = "\xa8\xf6", [0x07da] = "\xa8\xf7",
|
|
||||||
- [0x07db] = "\xa8\xf8", [0x07dc] = "\xa8\xf9", [0x07dd] = "\xa8\xfa",
|
|
||||||
- [0x07de] = "\xa8\xfb", [0x07df] = "\xa8\xfc", [0x07e0] = "\xa8\xfd",
|
|
||||||
- [0x07e1] = "\xa8\xfe", [0x07e2] = "\xa9\x58", [0x07e3] = "\xa9\x5b",
|
|
||||||
- [0x07e4] = "\xa9\x5d", [0x07e5] = "\xa9\x5e", [0x07e6] = "\xa9\x5f",
|
|
||||||
- [0x07e7] = "\x65\x9f", [0x07e8] = "\x65\xa0", [0x07e9] = "\x65\xa1",
|
|
||||||
- [0x07ea] = "\x65\xa2", [0x07eb] = "\x65\xa3", [0x07ec] = "\x65\xa4",
|
|
||||||
- [0x07ed] = "\x65\xa5", [0x07ee] = "\x65\xa6", [0x07ef] = "\x65\xa7",
|
|
||||||
- [0x07f0] = "\x65\xa8", [0x07f1] = "\x65\xa9", [0x07f2] = "\x65\xaa",
|
|
||||||
- [0x07f3] = "\x65\xab", [0x07f4] = "\xa9\x97", [0x07f5] = "\xa9\x98",
|
|
||||||
- [0x07f6] = "\xa9\x99", [0x07f7] = "\xa9\x9a", [0x07f8] = "\xa9\x9b",
|
|
||||||
- [0x07f9] = "\xa9\x9c", [0x07fa] = "\xa9\x9d", [0x07fb] = "\xa9\x9e",
|
|
||||||
- [0x07fc] = "\xa9\x9f", [0x07fd] = "\xa9\xa0", [0x07fe] = "\xa9\xa1",
|
|
||||||
- [0x07ff] = "\xa9\xa2", [0x0800] = "\xa9\xa3", [0x0801] = "\xa9\xf0",
|
|
||||||
- [0x0802] = "\xa9\xf1", [0x0803] = "\xa9\xf2", [0x0804] = "\xa9\xf3",
|
|
||||||
- [0x0805] = "\xa9\xf4", [0x0806] = "\xa9\xf5", [0x0807] = "\xa9\xf6",
|
|
||||||
- [0x0808] = "\xa9\xf7", [0x0809] = "\xa9\xf8", [0x080a] = "\xa9\xf9",
|
|
||||||
- [0x080b] = "\xa9\xfa", [0x080c] = "\xa9\xfb", [0x080d] = "\xa9\xfc",
|
|
||||||
- [0x080e] = "\xa9\xfd", [0x080f] = "\xa9\xfe", [0x0810] = "\xd7\xfa",
|
|
||||||
- [0x0811] = "\xd7\xfb", [0x0812] = "\xd7\xfc", [0x0813] = "\xd7\xfd",
|
|
||||||
- [0x0814] = "\xd7\xfe", [0x0815] = "\x65\xac", [0x0819] = "\x65\xad",
|
|
||||||
- [0x081a] = "\x65\xae", [0x081b] = "\x65\xaf", [0x081c] = "\x65\xb0",
|
|
||||||
- [0x081d] = "\x65\xb1", [0x081f] = "\x65\xb2", [0x0820] = "\x65\xb3",
|
|
||||||
- [0x0821] = "\x65\xb4", [0x0822] = "\x65\xb5", [0x0823] = "\x65\xb6",
|
|
||||||
- [0x0824] = "\x65\xb7", [0x0825] = "\x65\xb8", [0x0827] = "\x65\xb9",
|
|
||||||
+ [0x078c] = "\xa6\xc0", [0x078d] = "\x7b\x84", [0x078e] = "\x7b\x86",
|
|
||||||
+ [0x078f] = "\x7b\x85", [0x0790] = "\x7b\x87", [0x0791] = "\x7b\x88",
|
|
||||||
+ [0x0792] = "\x7b\x89", [0x0793] = "\x7b\x8a", [0x0794] = "\x7b\x8b",
|
|
||||||
+ [0x0795] = "\x7b\x8c", [0x0796] = "\x7b\x8d", [0x0797] = "\xa6\xf6",
|
|
||||||
+ [0x0798] = "\xa6\xf7", [0x0799] = "\xa6\xf8", [0x079a] = "\xa6\xf9",
|
|
||||||
+ [0x079b] = "\xa6\xfa", [0x079c] = "\xa6\xfb", [0x079d] = "\xa6\xfc",
|
|
||||||
+ [0x079e] = "\xa6\xfd", [0x079f] = "\xa6\xfe", [0x07a0] = "\xa7\xc2",
|
|
||||||
+ [0x07a1] = "\xa7\xc3", [0x07a2] = "\xa7\xc4", [0x07a3] = "\xa7\xc5",
|
|
||||||
+ [0x07a4] = "\xa7\xc6", [0x07a5] = "\xa7\xc7", [0x07a6] = "\xa7\xc8",
|
|
||||||
+ [0x07a7] = "\xa7\xc9", [0x07a8] = "\xa7\xca", [0x07a9] = "\xa7\xcb",
|
|
||||||
+ [0x07aa] = "\xa7\xcc", [0x07ab] = "\xa7\xcd", [0x07ac] = "\xa7\xce",
|
|
||||||
+ [0x07ad] = "\xa7\xcf", [0x07ae] = "\xa7\xd0", [0x07af] = "\xa7\xf2",
|
|
||||||
+ [0x07b0] = "\xa7\xf3", [0x07b1] = "\xa7\xf4", [0x07b2] = "\xa7\xf5",
|
|
||||||
+ [0x07b3] = "\xa7\xf6", [0x07b4] = "\xa7\xf7", [0x07b5] = "\xa7\xf8",
|
|
||||||
+ [0x07b6] = "\xa7\xf9", [0x07b7] = "\xa7\xfa", [0x07b8] = "\xa7\xfb",
|
|
||||||
+ [0x07b9] = "\xa7\xfc", [0x07ba] = "\xa7\xfd", [0x07bb] = "\xa7\xfe",
|
|
||||||
+ [0x07bc] = "\xa8\x96", [0x07bd] = "\xa8\x97", [0x07be] = "\xa8\x98",
|
|
||||||
+ [0x07bf] = "\xa8\x99", [0x07c0] = "\xa8\x9a", [0x07c1] = "\xa8\x9b",
|
|
||||||
+ [0x07c2] = "\xa8\x9c", [0x07c3] = "\xa8\x9d", [0x07c4] = "\xa8\x9e",
|
|
||||||
+ [0x07c5] = "\xa8\x9f", [0x07c6] = "\xa8\xa0", [0x07c7] = "\x00\x01",
|
|
||||||
+ [0x07c8] = "\x65\x9e", [0x07c9] = "\xa8\xc1", [0x07ca] = "\xa8\xc2",
|
|
||||||
+ [0x07cb] = "\xa8\xc3", [0x07cc] = "\xa8\xc4", [0x07cd] = "\xa8\xea",
|
|
||||||
+ [0x07ce] = "\xa8\xeb", [0x07cf] = "\xa8\xec", [0x07d0] = "\xa8\xed",
|
|
||||||
+ [0x07d1] = "\xa8\xee", [0x07d2] = "\xa8\xef", [0x07d3] = "\xa8\xf0",
|
|
||||||
+ [0x07d4] = "\xa8\xf1", [0x07d5] = "\xa8\xf2", [0x07d6] = "\xa8\xf3",
|
|
||||||
+ [0x07d7] = "\xa8\xf4", [0x07d8] = "\xa8\xf5", [0x07d9] = "\xa8\xf6",
|
|
||||||
+ [0x07da] = "\xa8\xf7", [0x07db] = "\xa8\xf8", [0x07dc] = "\xa8\xf9",
|
|
||||||
+ [0x07dd] = "\xa8\xfa", [0x07de] = "\xa8\xfb", [0x07df] = "\xa8\xfc",
|
|
||||||
+ [0x07e0] = "\xa8\xfd", [0x07e1] = "\xa8\xfe", [0x07e2] = "\xa9\x58",
|
|
||||||
+ [0x07e3] = "\xa9\x5b", [0x07e4] = "\xa9\x5d", [0x07e5] = "\xa9\x5e",
|
|
||||||
+ [0x07e6] = "\xa9\x5f", [0x07e7] = "\x65\x9f", [0x07e8] = "\x65\xa0",
|
|
||||||
+ [0x07e9] = "\x65\xa1", [0x07ea] = "\x65\xa2", [0x07eb] = "\x65\xa3",
|
|
||||||
+ [0x07ec] = "\x65\xa4", [0x07ed] = "\x65\xa5", [0x07ee] = "\x65\xa6",
|
|
||||||
+ [0x07ef] = "\x65\xa7", [0x07f0] = "\x65\xa8", [0x07f1] = "\x65\xa9",
|
|
||||||
+ [0x07f2] = "\x65\xaa", [0x07f3] = "\x65\xab", [0x07f4] = "\xa9\x97",
|
|
||||||
+ [0x07f5] = "\xa9\x98", [0x07f6] = "\xa9\x99", [0x07f7] = "\xa9\x9a",
|
|
||||||
+ [0x07f8] = "\xa9\x9b", [0x07f9] = "\xa9\x9c", [0x07fa] = "\xa9\x9d",
|
|
||||||
+ [0x07fb] = "\xa9\x9e", [0x07fc] = "\xa9\x9f", [0x07fd] = "\xa9\xa0",
|
|
||||||
+ [0x07fe] = "\xa9\xa1", [0x07ff] = "\xa9\xa2", [0x0800] = "\xa9\xa3",
|
|
||||||
+ [0x0801] = "\xa9\xf0", [0x0802] = "\xa9\xf1", [0x0803] = "\xa9\xf2",
|
|
||||||
+ [0x0804] = "\xa9\xf3", [0x0805] = "\xa9\xf4", [0x0806] = "\xa9\xf5",
|
|
||||||
+ [0x0807] = "\xa9\xf6", [0x0808] = "\xa9\xf7", [0x0809] = "\xa9\xf8",
|
|
||||||
+ [0x080a] = "\xa9\xf9", [0x080b] = "\xa9\xfa", [0x080c] = "\xa9\xfb",
|
|
||||||
+ [0x080d] = "\xa9\xfc", [0x080e] = "\xa9\xfd", [0x080f] = "\xa9\xfe",
|
|
||||||
+ [0x0810] = "\xd7\xfa", [0x0811] = "\xd7\xfb", [0x0812] = "\xd7\xfc",
|
|
||||||
+ [0x0813] = "\xd7\xfd", [0x0814] = "\xd7\xfe", [0x0815] = "\x65\xac",
|
|
||||||
+ [0x0816] = "\xfe\x51", [0x0817] = "\xfe\x52", [0x0818] = "\xfe\x53",
|
|
||||||
+ [0x0819] = "\x65\xad", [0x081a] = "\x65\xae", [0x081b] = "\x65\xaf",
|
|
||||||
+ [0x081c] = "\x65\xb0", [0x081d] = "\x65\xb1", [0x081e] = "\x2d\x51",
|
|
||||||
+ [0x081f] = "\x65\xb2", [0x0820] = "\x65\xb3", [0x0821] = "\x65\xb4",
|
|
||||||
+ [0x0822] = "\x65\xb5", [0x0823] = "\x65\xb6", [0x0824] = "\x65\xb7",
|
|
||||||
+ [0x0825] = "\x65\xb8", [0x0826] = "\x2d\x52", [0x0827] = "\x65\xb9",
|
|
||||||
[0x0828] = "\x65\xba", [0x0829] = "\x65\xbb", [0x082a] = "\x65\xbc",
|
|
||||||
- [0x082d] = "\x65\xbd", [0x082e] = "\x65\xbe", [0x082f] = "\x65\xbf",
|
|
||||||
- [0x0830] = "\x65\xc0", [0x0833] = "\x65\xc1", [0x0834] = "\x65\xc2",
|
|
||||||
- [0x0835] = "\x65\xc3", [0x0836] = "\x65\xc4", [0x0837] = "\x65\xc5",
|
|
||||||
- [0x0838] = "\x65\xc6", [0x0839] = "\x65\xc7", [0x083a] = "\x65\xc8",
|
|
||||||
- [0x083c] = "\x65\xc9", [0x083d] = "\x65\xca", [0x083e] = "\x65\xcb",
|
|
||||||
- [0x083f] = "\x65\xcc", [0x0840] = "\x65\xcd", [0x0841] = "\x65\xce",
|
|
||||||
- [0x0842] = "\x65\xcf", [0x0844] = "\x65\xd0", [0x0845] = "\x65\xd1",
|
|
||||||
+ [0x082b] = "\x2d\x53", [0x082c] = "\x2d\x54", [0x082d] = "\x65\xbd",
|
|
||||||
+ [0x082e] = "\x65\xbe", [0x082f] = "\x65\xbf", [0x0830] = "\x65\xc0",
|
|
||||||
+ [0x0831] = "\xfe\x6c", [0x0832] = "\x2d\x55", [0x0833] = "\x65\xc1",
|
|
||||||
+ [0x0834] = "\x65\xc2", [0x0835] = "\x65\xc3", [0x0836] = "\x65\xc4",
|
|
||||||
+ [0x0837] = "\x65\xc5", [0x0838] = "\x65\xc6", [0x0839] = "\x65\xc7",
|
|
||||||
+ [0x083a] = "\x65\xc8", [0x083b] = "\xfe\x76", [0x083c] = "\x65\xc9",
|
|
||||||
+ [0x083d] = "\x65\xca", [0x083e] = "\x65\xcb", [0x083f] = "\x65\xcc",
|
|
||||||
+ [0x0840] = "\x65\xcd", [0x0841] = "\x65\xce", [0x0842] = "\x65\xcf",
|
|
||||||
+ [0x0843] = "\x2d\x56", [0x0844] = "\x65\xd0", [0x0845] = "\x65\xd1",
|
|
||||||
[0x0846] = "\x65\xd2", [0x0847] = "\x65\xd3", [0x0848] = "\x65\xd4",
|
|
||||||
[0x0849] = "\x65\xd5", [0x084a] = "\x65\xd6", [0x084b] = "\x65\xd7",
|
|
||||||
[0x084c] = "\x65\xd8", [0x084d] = "\x65\xd9", [0x084e] = "\x65\xda",
|
|
||||||
[0x084f] = "\x65\xdb", [0x0850] = "\x65\xdc", [0x0851] = "\x65\xdd",
|
|
||||||
- [0x0852] = "\x65\xde", [0x0853] = "\x65\xdf", [0x0856] = "\x65\xe0",
|
|
||||||
- [0x0857] = "\x65\xe1", [0x0858] = "\x65\xe2", [0x0859] = "\x65\xe3",
|
|
||||||
- [0x085a] = "\x65\xe4", [0x085b] = "\x65\xe5", [0x085c] = "\x65\xe6",
|
|
||||||
- [0x085d] = "\x65\xe7", [0x085e] = "\x65\xe8", [0x085f] = "\x65\xe9",
|
|
||||||
- [0x0860] = "\x65\xea", [0x0861] = "\x65\xeb", [0x0862] = "\x65\xec",
|
|
||||||
- [0x0863] = "\x65\xed", [0x0865] = "\xfd\x9c", [0x0866] = "\x76\xb5",
|
|
||||||
+ [0x0852] = "\x65\xde", [0x0853] = "\x65\xdf", [0x0854] = "\x2d\x57",
|
|
||||||
+ [0x0855] = "\xfe\x91", [0x0856] = "\x65\xe0", [0x0857] = "\x65\xe1",
|
|
||||||
+ [0x0858] = "\x65\xe2", [0x0859] = "\x65\xe3", [0x085a] = "\x65\xe4",
|
|
||||||
+ [0x085b] = "\x65\xe5", [0x085c] = "\x65\xe6", [0x085d] = "\x65\xe7",
|
|
||||||
+ [0x085e] = "\x65\xe8", [0x085f] = "\x65\xe9", [0x0860] = "\x65\xea",
|
|
||||||
+ [0x0861] = "\x65\xeb", [0x0862] = "\x65\xec", [0x0863] = "\x65\xed",
|
|
||||||
+ [0x0864] = "\x2d\x58", [0x0865] = "\xfd\x9c", [0x0866] = "\x76\xb5",
|
|
||||||
[0x0867] = "\x76\xb6", [0x0868] = "\x76\xb7", [0x0869] = "\x76\xb8",
|
|
||||||
[0x086a] = "\x76\xb9", [0x086b] = "\x76\xba", [0x086c] = "\x76\xbb",
|
|
||||||
[0x086d] = "\x76\xbc", [0x086e] = "\x76\xbd", [0x086f] = "\x76\xbe",
|
|
||||||
@@ -24211,24 +24224,8 @@ static const unsigned char __ucs_to_gb18030_tab2[][2] =
|
|
||||||
|| (ch = __twobyte_to_ucs[idx], \
|
|
||||||
ch == 0 && *inptr != '\0')) \
|
|
||||||
{ \
|
|
||||||
- /* Handle a few special cases. */ \
|
|
||||||
- if (idx == 0x5dd1) \
|
|
||||||
- ch = 0x20087; \
|
|
||||||
- else if (idx == 0x5dd2) \
|
|
||||||
- ch = 0x20089; \
|
|
||||||
- else if (idx == 0x5dd3) \
|
|
||||||
- ch = 0x200cc; \
|
|
||||||
- else if (idx == 0x5dec) \
|
|
||||||
- ch = 0x215D7; \
|
|
||||||
- else if (idx == 0x5df6) \
|
|
||||||
- ch = 0x2298F; \
|
|
||||||
- else if (idx == 0x5e11) \
|
|
||||||
- ch = 0x241FE; \
|
|
||||||
- else \
|
|
||||||
- { \
|
|
||||||
- /* This is an illegal character. */ \
|
|
||||||
- STANDARD_FROM_LOOP_ERR_HANDLER (2); \
|
|
||||||
- } \
|
|
||||||
+ /* This is an illegal character. */ \
|
|
||||||
+ STANDARD_FROM_LOOP_ERR_HANDLER (2); \
|
|
||||||
} \
|
|
||||||
\
|
|
||||||
inptr += 2; \
|
|
||||||
@@ -24320,17 +24317,35 @@ static const unsigned char __ucs_to_gb18030_tab2[][2] =
|
|
||||||
len = 4; \
|
|
||||||
} \
|
|
||||||
else if (ch == 0x20087) \
|
|
||||||
- cp = (const unsigned char *) "\xfe\x51"; \
|
|
||||||
+ { \
|
|
||||||
+ idx = 0x3E2CF; \
|
|
||||||
+ len = 4; \
|
|
||||||
+ } \
|
|
||||||
else if (ch == 0x20089) \
|
|
||||||
- cp = (const unsigned char *) "\xfe\x52"; \
|
|
||||||
+ { \
|
|
||||||
+ idx = 0x3E2D1; \
|
|
||||||
+ len = 4; \
|
|
||||||
+ } \
|
|
||||||
else if (ch == 0x200CC) \
|
|
||||||
- cp = (const unsigned char *) "\xfe\x53"; \
|
|
||||||
+ { \
|
|
||||||
+ idx = 0x3E314; \
|
|
||||||
+ len = 4; \
|
|
||||||
+ } \
|
|
||||||
else if (ch == 0x215d7) \
|
|
||||||
- cp = (const unsigned char *) "\xfe\x6c"; \
|
|
||||||
+ { \
|
|
||||||
+ idx = 0x3F81F; \
|
|
||||||
+ len = 4; \
|
|
||||||
+ } \
|
|
||||||
else if (ch == 0x2298F) \
|
|
||||||
- cp = (const unsigned char *) "\xfe\x76"; \
|
|
||||||
+ { \
|
|
||||||
+ idx = 0x40BD7; \
|
|
||||||
+ len = 4; \
|
|
||||||
+ } \
|
|
||||||
else if (ch == 0x241FE) \
|
|
||||||
- cp = (const unsigned char *) "\xfe\x91"; \
|
|
||||||
+ { \
|
|
||||||
+ idx = 0x42446; \
|
|
||||||
+ len = 4; \
|
|
||||||
+ } \
|
|
||||||
else if (ch >= 0x10000 && ch <= 0x10FFFF) \
|
|
||||||
{ \
|
|
||||||
idx = ch + 0x1E248; \
|
|
||||||
diff --git a/iconvdata/tst-table-from.c b/iconvdata/tst-table-from.c
|
|
||||||
index 09aaaf0942..55a7113d8c 100644
|
|
||||||
--- a/iconvdata/tst-table-from.c
|
|
||||||
+++ b/iconvdata/tst-table-from.c
|
|
||||||
@@ -194,10 +194,9 @@ main (int argc, char *argv[])
|
|
||||||
exit (1);
|
|
||||||
}
|
|
||||||
|
|
||||||
- /* When testing UTF-8 or GB18030, stop at 0x10000, otherwise the output
|
|
||||||
+ /* When testing UTF-8, stop at 0x10000, otherwise the output
|
|
||||||
file gets too big. */
|
|
||||||
- bmp_only = (strcmp (charset, "UTF-8") == 0
|
|
||||||
- || strcmp (charset, "GB18030") == 0);
|
|
||||||
+ bmp_only = (strcmp (charset, "UTF-8") == 0);
|
|
||||||
search_depth = (strcmp (charset, "UTF-8") == 0 ? 3 : 4);
|
|
||||||
|
|
||||||
{
|
|
||||||
diff --git a/iconvdata/tst-table-to.c b/iconvdata/tst-table-to.c
|
|
||||||
index 4dec4acad1..2b75f0c6e8 100644
|
|
||||||
--- a/iconvdata/tst-table-to.c
|
|
||||||
+++ b/iconvdata/tst-table-to.c
|
|
||||||
@@ -32,6 +32,7 @@ main (int argc, char *argv[])
|
|
||||||
const char *charset;
|
|
||||||
iconv_t cd;
|
|
||||||
int bmp_only;
|
|
||||||
+ int no_tags;
|
|
||||||
|
|
||||||
if (argc != 2)
|
|
||||||
{
|
|
||||||
@@ -47,16 +48,19 @@ main (int argc, char *argv[])
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
- /* When testing UTF-8 or GB18030, stop at 0x10000, otherwise the output
|
|
||||||
+ /* When testing UTF-8, stop at 0x10000, otherwise the output
|
|
||||||
file gets too big. */
|
|
||||||
- bmp_only = (strcmp (charset, "UTF-8") == 0
|
|
||||||
+ bmp_only = (strcmp (charset, "UTF-8") == 0);
|
|
||||||
+ /* When testing any encoding other than UTF-8 or GB18030, stop at 0xE0000,
|
|
||||||
+ because the conversion drops Unicode tag characters (range
|
|
||||||
+ U+E0000..U+E007F). */
|
|
||||||
+ no_tags = !(strcmp (charset, "UTF-8") == 0
|
|
||||||
|| strcmp (charset, "GB18030") == 0);
|
|
||||||
|
|
||||||
{
|
|
||||||
unsigned int i;
|
|
||||||
unsigned char buf[10];
|
|
||||||
-
|
|
||||||
- for (i = 0; i < (bmp_only ? 0x10000 : 0x30000); i++)
|
|
||||||
+ for (i = 0; i < (bmp_only ? 0x10000 : no_tags ? 0xE0000 : 0x110000); i++)
|
|
||||||
{
|
|
||||||
unsigned char in[6];
|
|
||||||
unsigned int incount =
|
|
||||||
diff --git a/iconvdata/tst-table.sh b/iconvdata/tst-table.sh
|
|
||||||
index bc6f542b24..7ba15bbf5c 100755
|
|
||||||
--- a/iconvdata/tst-table.sh
|
|
||||||
+++ b/iconvdata/tst-table.sh
|
|
||||||
@@ -37,7 +37,8 @@ set -e
|
|
||||||
< ../localedata/charmaps/${charmap:-$charset} \
|
|
||||||
> ${objpfx}tst-${charset}.charmap.table
|
|
||||||
# When the charset is GB18030, truncate this table because for this encoding,
|
|
||||||
-# the tst-table-from and tst-table-to programs scan the Unicode BMP only.
|
|
||||||
+# the charmap contains ranges (<Unnnn>..<Ummmm> notation), which the
|
|
||||||
+# tst-table-charmap.sh script does not grok.
|
|
||||||
if test ${charset} = GB18030; then
|
|
||||||
grep '0x....$' < ${objpfx}tst-${charset}.charmap.table \
|
|
||||||
> ${objpfx}tst-${charset}.truncated.table
|
|
||||||
@@ -73,25 +74,42 @@ diff ${objpfx}tst-${charset}.charmap.table ${objpfx}tst-${charset}.inverse.table
|
|
||||||
|
|
||||||
# Check 1: charmap and iconv forward should be identical, except for
|
|
||||||
# precomposed characters.
|
|
||||||
-if test -f ${precomposed}; then
|
|
||||||
- cat ${objpfx}tst-${charset}.table ${precomposed} | sort | uniq -u \
|
|
||||||
- > ${objpfx}tst-${charset}.tmp.table
|
|
||||||
- cmp -s ${objpfx}tst-${charset}.charmap.table ${objpfx}tst-${charset}.tmp.table ||
|
|
||||||
+{ if test -f ${precomposed}; then
|
|
||||||
+ cat ${objpfx}tst-${charset}.table ${precomposed} | sort | uniq -u
|
|
||||||
+ else
|
|
||||||
+ cat ${objpfx}tst-${charset}.table
|
|
||||||
+ fi
|
|
||||||
+} | { if test ${charset} = GB18030; then grep '0x....$'; else cat; fi; } \
|
|
||||||
+ > ${objpfx}tst-${charset}.tmp1.table
|
|
||||||
+cmp -s ${objpfx}tst-${charset}.charmap.table ${objpfx}tst-${charset}.tmp1.table ||
|
|
||||||
exit 1
|
|
||||||
-else
|
|
||||||
- cmp -s ${objpfx}tst-${charset}.charmap.table ${objpfx}tst-${charset}.table ||
|
|
||||||
- exit 1
|
|
||||||
-fi
|
|
||||||
|
|
||||||
# Check 2: the difference between the charmap and iconv backward.
|
|
||||||
-if test -f ${irreversible}; then
|
|
||||||
- cat ${objpfx}tst-${charset}.charmap.table ${irreversible} | sort | uniq -u \
|
|
||||||
- > ${objpfx}tst-${charset}.tmp.table
|
|
||||||
- cmp -s ${objpfx}tst-${charset}.tmp.table ${objpfx}tst-${charset}.inverse.table ||
|
|
||||||
- exit 1
|
|
||||||
-else
|
|
||||||
- cmp -s ${objpfx}tst-${charset}.charmap.table ${objpfx}tst-${charset}.inverse.table ||
|
|
||||||
+{ if test -f ${irreversible}; then
|
|
||||||
+ cat ${objpfx}tst-${charset}.charmap.table ${irreversible} | sort | uniq -u
|
|
||||||
+ else
|
|
||||||
+ cat ${objpfx}tst-${charset}.charmap.table
|
|
||||||
+ fi
|
|
||||||
+} | { if test ${charset} = GB18030; then grep '0x....$'; else cat; fi; } \
|
|
||||||
+ > ${objpfx}tst-${charset}.tmp2c.table
|
|
||||||
+cat ${objpfx}tst-${charset}.inverse.table \
|
|
||||||
+ | { if test ${charset} = GB18030; then grep '0x....$'; else cat; fi; } \
|
|
||||||
+ > ${objpfx}tst-${charset}.tmp2i.table
|
|
||||||
+cmp -s ${objpfx}tst-${charset}.tmp2c.table ${objpfx}tst-${charset}.tmp2i.table ||
|
|
||||||
exit 1
|
|
||||||
+
|
|
||||||
+# Check 3: the difference between iconv forward and iconv backward. This is
|
|
||||||
+# necessary only for GB18030, because ${objpfx}tst-${charset}.charmap.table
|
|
||||||
+# is truncated for this encoding (see above).
|
|
||||||
+if test ${charset} = GB18030; then
|
|
||||||
+ { if test -f ${irreversible}; then
|
|
||||||
+ cat ${objpfx}tst-${charset}.table ${irreversible} | sort | uniq -u
|
|
||||||
+ else
|
|
||||||
+ cat ${objpfx}tst-${charset}.table
|
|
||||||
+ fi
|
|
||||||
+ } > ${objpfx}tst-${charset}.tmp3.table
|
|
||||||
+ cmp -s ${objpfx}tst-${charset}.tmp3.table ${objpfx}tst-${charset}.inverse.table ||
|
|
||||||
+ exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
exit 0
|
|
||||||
diff --git a/localedata/charmaps/GB18030 b/localedata/charmaps/GB18030
|
|
||||||
index ad6728c5bd..fc3b1d2d40 100644
|
|
||||||
--- a/localedata/charmaps/GB18030
|
|
||||||
+++ b/localedata/charmaps/GB18030
|
|
||||||
@@ -57234,32 +57234,16 @@ CHARMAP
|
|
||||||
<UE78A> /xa6/xbe <Private Use>
|
|
||||||
<UE78B> /xa6/xbf <Private Use>
|
|
||||||
<UE78C> /xa6/xc0 <Private Use>
|
|
||||||
-% The newest GB 18030-2005 standard still uses some private use area
|
|
||||||
-% code points. Any implementation which has Unicode 4.1 or newer
|
|
||||||
-% support should not use these PUA code points, and instead should
|
|
||||||
-% map these entries to their equivalent non-PUA code points. There
|
|
||||||
-% are 24 idiograms in GB 18030-2005 which have non-PUA equivalents.
|
|
||||||
-% In glibc we only support roundtrip code points, and so must choose
|
|
||||||
-% between supporting the old PUA code points, or using the newer
|
|
||||||
-% non-PUA code points. We choose to use the non-PUA code points to
|
|
||||||
-% be compatible with ICU's similar choice. In choosing the non-PUA
|
|
||||||
-% code points we can no longer convert the old PUA code points back
|
|
||||||
-% to GB-18030-2005 (technically only fixable if we added support
|
|
||||||
-% for non-roundtrip code points e.g. ICU's "fallback mapping").
|
|
||||||
-% The recommendation to use the non-PUA code points, where available,
|
|
||||||
-% is based on "CJKV Information Processing" 2nd Ed. by Dr. Ken Lunde.
|
|
||||||
-%
|
|
||||||
-% These 10 PUA mappings use equivalents from <UFE10> to <UFE19>.
|
|
||||||
-% <UE78D> /xa6/xd9 <Private Use>
|
|
||||||
-% <UE78E> /xa6/xda <Private Use>
|
|
||||||
-% <UE78F> /xa6/xdb <Private Use>
|
|
||||||
-% <UE790> /xa6/xdc <Private Use>
|
|
||||||
-% <UE791> /xa6/xdd <Private Use>
|
|
||||||
-% <UE792> /xa6/xde <Private Use>
|
|
||||||
-% <UE793> /xa6/xdf <Private Use>
|
|
||||||
-% <UE794> /xa6/xec <Private Use>
|
|
||||||
-% <UE795> /xa6/xed <Private Use>
|
|
||||||
-% <UE796> /xa6/xf3 <Private Use>
|
|
||||||
+<UE78D> /x84/x31/x82/x36 <Private Use>
|
|
||||||
+<UE78E> /x84/x31/x82/x38 <Private Use>
|
|
||||||
+<UE78F> /x84/x31/x82/x37 <Private Use>
|
|
||||||
+<UE790> /x84/x31/x82/x39 <Private Use>
|
|
||||||
+<UE791> /x84/x31/x83/x30 <Private Use>
|
|
||||||
+<UE792> /x84/x31/x83/x31 <Private Use>
|
|
||||||
+<UE793> /x84/x31/x83/x32 <Private Use>
|
|
||||||
+<UE794> /x84/x31/x83/x33 <Private Use>
|
|
||||||
+<UE795> /x84/x31/x83/x34 <Private Use>
|
|
||||||
+<UE796> /x84/x31/x83/x35 <Private Use>
|
|
||||||
<UE797> /xa6/xf6 <Private Use>
|
|
||||||
<UE798> /xa6/xf7 <Private Use>
|
|
||||||
<UE799> /xa6/xf8 <Private Use>
|
|
||||||
@@ -57387,17 +57371,15 @@ CHARMAP
|
|
||||||
<UE813> /xd7/xfd <Private Use>
|
|
||||||
<UE814> /xd7/xfe <Private Use>
|
|
||||||
<UE815> /x83/x36/xc9/x34 <Private Use>
|
|
||||||
-% These 3 PUA mappings use equivalents <U20087>, <U20089> and <U200CC>.
|
|
||||||
-% <UE816> /xfe/x51 <Private Use>
|
|
||||||
-% <UE817> /xfe/x52 <Private Use>
|
|
||||||
-% <UE818> /xfe/x53 <Private Use>
|
|
||||||
+<UE816> /xfe/x51 <Private Use>
|
|
||||||
+<UE817> /xfe/x52 <Private Use>
|
|
||||||
+<UE818> /xfe/x53 <Private Use>
|
|
||||||
<UE819> /x83/x36/xc9/x35 <Private Use>
|
|
||||||
<UE81A> /x83/x36/xc9/x36 <Private Use>
|
|
||||||
<UE81B> /x83/x36/xc9/x37 <Private Use>
|
|
||||||
<UE81C> /x83/x36/xc9/x38 <Private Use>
|
|
||||||
<UE81D> /x83/x36/xc9/x39 <Private Use>
|
|
||||||
-% This 1 PUA mapping uses the equivalent <U9FB4>.
|
|
||||||
-% <UE81E> /xfe/x59 <Private Use>
|
|
||||||
+<UE81E> /x82/x35/x90/x37 <Private Use>
|
|
||||||
<UE81F> /x83/x36/xca/x30 <Private Use>
|
|
||||||
<UE820> /x83/x36/xca/x31 <Private Use>
|
|
||||||
<UE821> /x83/x36/xca/x32 <Private Use>
|
|
||||||
@@ -57405,22 +57387,19 @@ CHARMAP
|
|
||||||
<UE823> /x83/x36/xca/x34 <Private Use>
|
|
||||||
<UE824> /x83/x36/xca/x35 <Private Use>
|
|
||||||
<UE825> /x83/x36/xca/x36 <Private Use>
|
|
||||||
-% This 1 PUA mapping uses the equivalent <U9FB5>.
|
|
||||||
-% <UE826> /xfe/x61 <Private Use>
|
|
||||||
+<UE826> /x82/x35/x90/x38 <Private Use>
|
|
||||||
<UE827> /x83/x36/xca/x37 <Private Use>
|
|
||||||
<UE828> /x83/x36/xca/x38 <Private Use>
|
|
||||||
<UE829> /x83/x36/xca/x39 <Private Use>
|
|
||||||
<UE82A> /x83/x36/xcb/x30 <Private Use>
|
|
||||||
-% These 2 PUA mappings use the equivalents <U9FB6> and <U9FB7>.
|
|
||||||
-% <UE82B> /xfe/x66 <Private Use>
|
|
||||||
-% <UE82C> /xfe/x67 <Private Use>
|
|
||||||
+<UE82B> /x82/x35/x90/x39 <Private Use>
|
|
||||||
+<UE82C> /x82/x35/x91/x30 <Private Use>
|
|
||||||
<UE82D> /x83/x36/xcb/x31 <Private Use>
|
|
||||||
<UE82E> /x83/x36/xcb/x32 <Private Use>
|
|
||||||
<UE82F> /x83/x36/xcb/x33 <Private Use>
|
|
||||||
<UE830> /x83/x36/xcb/x34 <Private Use>
|
|
||||||
-% These 2 PUA mappings use the equivalents <U215D7> and <U9FB8>.
|
|
||||||
-% <UE831> /xfe/x6c <Private Use>
|
|
||||||
-% <UE832> /xfe/x6d <Private Use>
|
|
||||||
+<UE831> /xfe/x6c <Private Use>
|
|
||||||
+<UE832> /x82/x35/x91/x31 <Private Use>
|
|
||||||
<UE833> /x83/x36/xcb/x35 <Private Use>
|
|
||||||
<UE834> /x83/x36/xcb/x36 <Private Use>
|
|
||||||
<UE835> /x83/x36/xcb/x37 <Private Use>
|
|
||||||
@@ -57429,8 +57408,7 @@ CHARMAP
|
|
||||||
<UE838> /x83/x36/xcc/x30 <Private Use>
|
|
||||||
<UE839> /x83/x36/xcc/x31 <Private Use>
|
|
||||||
<UE83A> /x83/x36/xcc/x32 <Private Use>
|
|
||||||
-% This 1 PUA mapping uses the equivalent <U2298F>.
|
|
||||||
-% <UE83B> /xfe/x76 <Private Use>
|
|
||||||
+<UE83B> /xfe/x76 <Private Use>
|
|
||||||
<UE83C> /x83/x36/xcc/x33 <Private Use>
|
|
||||||
<UE83D> /x83/x36/xcc/x34 <Private Use>
|
|
||||||
<UE83E> /x83/x36/xcc/x35 <Private Use>
|
|
||||||
@@ -57438,8 +57416,7 @@ CHARMAP
|
|
||||||
<UE840> /x83/x36/xcc/x37 <Private Use>
|
|
||||||
<UE841> /x83/x36/xcc/x38 <Private Use>
|
|
||||||
<UE842> /x83/x36/xcc/x39 <Private Use>
|
|
||||||
-% This 1 PUA mapping uses the equivalent <U9FB9>.
|
|
||||||
-% <UE843> /xfe/x7e <Private Use>
|
|
||||||
+<UE843> /x82/x35/x91/x32 <Private Use>
|
|
||||||
<UE844> /x83/x36/xcd/x30 <Private Use>
|
|
||||||
<UE845> /x83/x36/xcd/x31 <Private Use>
|
|
||||||
<UE846> /x83/x36/xcd/x32 <Private Use>
|
|
||||||
@@ -57456,9 +57433,8 @@ CHARMAP
|
|
||||||
<UE851> /x83/x36/xce/x33 <Private Use>
|
|
||||||
<UE852> /x83/x36/xce/x34 <Private Use>
|
|
||||||
<UE853> /x83/x36/xce/x35 <Private Use>
|
|
||||||
-% These 2 PUA mappings use the equivalents <U9FBA> and <U241FE>.
|
|
||||||
-% <UE854> /xfe/x90 <Private Use>
|
|
||||||
-% <UE855> /xfe/x91 <Private Use>
|
|
||||||
+<UE854> /x82/x35/x91/x33 <Private Use>
|
|
||||||
+<UE855> /xfe/x91 <Private Use>
|
|
||||||
<UE856> /x83/x36/xce/x36 <Private Use>
|
|
||||||
<UE857> /x83/x36/xce/x37 <Private Use>
|
|
||||||
<UE858> /x83/x36/xce/x38 <Private Use>
|
|
||||||
@@ -57473,8 +57449,7 @@ CHARMAP
|
|
||||||
<UE861> /x83/x36/xcf/x37 <Private Use>
|
|
||||||
<UE862> /x83/x36/xcf/x38 <Private Use>
|
|
||||||
<UE863> /x83/x36/xcf/x39 <Private Use>
|
|
||||||
-% This 1 PUA mapping uses the equivalent <U9FBB>.
|
|
||||||
-% <UE864> /xfe/xa0 <Private Use>
|
|
||||||
+<UE864> /x82/x35/x91/x34 <Private Use>
|
|
||||||
<UE865> /x83/x36/xd0/x30 <Private Use>
|
|
||||||
<UE866> /x83/x36/xd0/x31 <Private Use>
|
|
||||||
<UE867> /x83/x36/xd0/x32 <Private Use>
|
|
||||||
@@ -70447,19 +70422,14 @@ CHARMAP
|
|
||||||
<U00020068>..<U00020071> /x95/x32/x8d/x30 <CJK>
|
|
||||||
<U00020072>..<U0002007B> /x95/x32/x8e/x30 <CJK>
|
|
||||||
<U0002007C>..<U00020085> /x95/x32/x8f/x30 <CJK>
|
|
||||||
-<U00020086> /x95/x32/x90/x30 <CJK>
|
|
||||||
-<U00020087> /xfe/x51 <CJK>
|
|
||||||
-<U00020088> /x95/x32/x90/x32 <CJK>
|
|
||||||
-<U00020089> /xfe/x52 <CJK>
|
|
||||||
-<U0002008A>..<U0002008F> /x95/x32/x90/x34 <CJK>
|
|
||||||
+<U00020086>..<U0002008F> /x95/x32/x90/x30 <CJK>
|
|
||||||
<U00020090>..<U00020099> /x95/x32/x91/x30 <CJK>
|
|
||||||
<U0002009A>..<U000200A3> /x95/x32/x92/x30 <CJK>
|
|
||||||
<U000200A4>..<U000200AD> /x95/x32/x93/x30 <CJK>
|
|
||||||
<U000200AE>..<U000200B7> /x95/x32/x94/x30 <CJK>
|
|
||||||
<U000200B8>..<U000200C1> /x95/x32/x95/x30 <CJK>
|
|
||||||
<U000200C2>..<U000200CB> /x95/x32/x96/x30 <CJK>
|
|
||||||
-<U000200CC> /xfe/x53 <CJK>
|
|
||||||
-<U000200CD>..<U000200D5> /x95/x32/x97/x31 <CJK>
|
|
||||||
+<U000200CC>..<U000200D5> /x95/x32/x97/x30 <CJK>
|
|
||||||
<U000200D6>..<U000200DF> /x95/x32/x98/x30 <CJK>
|
|
||||||
<U000200E0>..<U000200E9> /x95/x32/x99/x30 <CJK>
|
|
||||||
<U000200EA>..<U000200F3> /x95/x32/x9a/x30 <CJK>
|
|
||||||
@@ -70998,8 +70968,7 @@ CHARMAP
|
|
||||||
<U000215BC>..<U000215C5> /x95/x36/xb7/x30 <CJK>
|
|
||||||
<U000215C6>..<U000215CF> /x95/x36/xb8/x30 <CJK>
|
|
||||||
<U000215D0>..<U000215D6> /x95/x36/xb9/x30 <CJK>
|
|
||||||
-<U000215D7> /xfe/x6c <CJK>
|
|
||||||
-<U000215D8>..<U000215D9> /x95/x36/xb9/x38 <CJK>
|
|
||||||
+<U000215D7>..<U000215D9> /x95/x36/xb9/x37 <CJK>
|
|
||||||
<U000215DA>..<U000215E3> /x95/x36/xba/x30 <CJK>
|
|
||||||
<U000215E4>..<U000215ED> /x95/x36/xbb/x30 <CJK>
|
|
||||||
<U000215EE>..<U000215F7> /x95/x36/xbc/x30 <CJK>
|
|
||||||
@@ -71505,8 +71474,7 @@ CHARMAP
|
|
||||||
<U00022976>..<U0002297F> /x96/x30/xb8/x30 <CJK>
|
|
||||||
<U00022980>..<U00022989> /x96/x30/xb9/x30 <CJK>
|
|
||||||
<U0002298A>..<U0002298E> /x96/x30/xba/x30 <CJK>
|
|
||||||
-<U0002298F> /xfe/x76 <CJK>
|
|
||||||
-<U00022990>..<U00022993> /x96/x30/xba/x36 <CJK>
|
|
||||||
+<U0002298F>..<U00022993> /x96/x30/xba/x35 <CJK>
|
|
||||||
<U00022994>..<U0002299D> /x96/x30/xbb/x30 <CJK>
|
|
||||||
<U0002299E>..<U000229A7> /x96/x30/xbc/x30 <CJK>
|
|
||||||
<U000229A8>..<U000229B1> /x96/x30/xbd/x30 <CJK>
|
|
||||||
@@ -72132,8 +72100,7 @@ CHARMAP
|
|
||||||
<U000241E0>..<U000241E9> /x96/x35/xb3/x30 <CJK>
|
|
||||||
<U000241EA>..<U000241F3> /x96/x35/xb4/x30 <CJK>
|
|
||||||
<U000241F4>..<U000241FD> /x96/x35/xb5/x30 <CJK>
|
|
||||||
-<U000241FE> /xfe/x91 <CJK>
|
|
||||||
-<U000241FF>..<U00024207> /x96/x35/xb6/x31 <CJK>
|
|
||||||
+<U000241FE>..<U00024207> /x96/x35/xb6/x30 <CJK>
|
|
||||||
<U00024208>..<U00024211> /x96/x35/xb7/x30 <CJK>
|
|
||||||
<U00024212>..<U0002421B> /x96/x35/xb8/x30 <CJK>
|
|
||||||
<U0002421C>..<U00024225> /x96/x35/xb9/x30 <CJK>
|
|
@ -1,25 +0,0 @@
|
|||||||
Author: Florian Weimer <fweimer@redhat.com>
|
|
||||||
Date: Wed Jul 4 16:16:57 2018 +0200
|
|
||||||
|
|
||||||
Makeconfig (ASFLAGS): Always append required assembler flags.
|
|
||||||
|
|
||||||
Submitted upstream here:
|
|
||||||
|
|
||||||
https://sourceware.org/ml/libc-alpha/2018-07/msg00077.html
|
|
||||||
|
|
||||||
Otherwise, we lose essential flags such as -Wa,--noexecstack due to
|
|
||||||
the way += works in make due to the ASFLAGS command line override.
|
|
||||||
|
|
||||||
diff --git a/Makeconfig b/Makeconfig
|
|
||||||
index b0b27f0113ac18b8..92e76d6200bbcd5b 100644
|
|
||||||
--- a/Makeconfig
|
|
||||||
+++ b/Makeconfig
|
|
||||||
@@ -1047,7 +1047,7 @@ endif
|
|
||||||
ifndef ASFLAGS
|
|
||||||
ASFLAGS := $(filter -g% -fdebug-prefix-map=%,$(CFLAGS))
|
|
||||||
endif
|
|
||||||
-ASFLAGS += -Werror=undef $(ASFLAGS-config) $(asflags-cpu)
|
|
||||||
+override ASFLAGS += -Werror=undef $(ASFLAGS-config) $(asflags-cpu)
|
|
||||||
|
|
||||||
ifndef BUILD_CC
|
|
||||||
BUILD_CC = $(CC)
|
|
@ -1,286 +0,0 @@
|
|||||||
Short description: Add C.UTF-8 support.
|
|
||||||
Author(s): Fedora glibc team <glibc@lists.fedoraproject.org>
|
|
||||||
Origin: PATCH
|
|
||||||
Upstream status: not-submitted
|
|
||||||
|
|
||||||
This patch needs to upstream as part of Carlos O'Donell
|
|
||||||
<carlos@redhat.com>'s work on enabling upstream C.UTF-8 support. This
|
|
||||||
work is currently blocked on cleaning up the test results to prove that
|
|
||||||
full code-point sorting is working as intended.
|
|
||||||
|
|
||||||
Note that this patch does not provide full code-point sorting as
|
|
||||||
expected.
|
|
||||||
|
|
||||||
This patch needs to upstream as soon as possible since it would be nice
|
|
||||||
to have this in F29 and fixed.
|
|
||||||
|
|
||||||
From 2eda7b462b415105f5a05c1323372d4e39d46439 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Mike FABIAN <mfabian@redhat.com>
|
|
||||||
Date: Mon, 10 Aug 2015 15:58:12 +0200
|
|
||||||
Subject: [PATCH] Add a C.UTF-8 locale
|
|
||||||
|
|
||||||
---
|
|
||||||
localedata/SUPPORTED | 1 +
|
|
||||||
localedata/locales/C | 238 +++++++++++++++++++++++++++++++++++++++++++++++++++
|
|
||||||
2 files changed, 239 insertions(+)
|
|
||||||
create mode 100644 localedata/locales/C
|
|
||||||
|
|
||||||
diff --git a/localedata/SUPPORTED b/localedata/SUPPORTED
|
|
||||||
index 8ca023e..2a78391 100644
|
|
||||||
--- a/localedata/SUPPORTED
|
|
||||||
+++ b/localedata/SUPPORTED
|
|
||||||
@@ -1,6 +1,7 @@
|
|
||||||
# This file names the currently supported and somewhat tested locales.
|
|
||||||
# If you have any additions please file a glibc bug report.
|
|
||||||
SUPPORTED-LOCALES=\
|
|
||||||
+C.UTF-8/UTF-8 \
|
|
||||||
aa_DJ.UTF-8/UTF-8 \
|
|
||||||
aa_DJ/ISO-8859-1 \
|
|
||||||
aa_ER/UTF-8 \
|
|
||||||
diff --git a/localedata/locales/C b/localedata/locales/C
|
|
||||||
new file mode 100644
|
|
||||||
index 0000000..fdf460e
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/localedata/locales/C
|
|
||||||
@@ -0,0 +1,238 @@
|
|
||||||
+escape_char /
|
|
||||||
+comment_char %
|
|
||||||
+% Locale for C locale in UTF-8
|
|
||||||
+
|
|
||||||
+LC_IDENTIFICATION
|
|
||||||
+title "C locale"
|
|
||||||
+source ""
|
|
||||||
+address ""
|
|
||||||
+contact ""
|
|
||||||
+email "mfabian@redhat.com"
|
|
||||||
+tel ""
|
|
||||||
+fax ""
|
|
||||||
+language "C"
|
|
||||||
+territory ""
|
|
||||||
+revision "1.0"
|
|
||||||
+date "2015-08-10"
|
|
||||||
+%
|
|
||||||
+category "i18n:2012";LC_IDENTIFICATION
|
|
||||||
+category "i18n:2012";LC_CTYPE
|
|
||||||
+category "i18n:2012";LC_COLLATE
|
|
||||||
+category "i18n:2012";LC_TIME
|
|
||||||
+category "i18n:2012";LC_NUMERIC
|
|
||||||
+category "i18n:2012";LC_MONETARY
|
|
||||||
+category "i18n:2012";LC_MESSAGES
|
|
||||||
+category "i18n:2012";LC_PAPER
|
|
||||||
+category "i18n:2012";LC_NAME
|
|
||||||
+category "i18n:2012";LC_ADDRESS
|
|
||||||
+category "i18n:2012";LC_TELEPHONE
|
|
||||||
+category "i18n:2012";LC_MEASUREMENT
|
|
||||||
+END LC_IDENTIFICATION
|
|
||||||
+
|
|
||||||
+LC_CTYPE
|
|
||||||
+copy "i18n"
|
|
||||||
+
|
|
||||||
+translit_start
|
|
||||||
+include "translit_combining";""
|
|
||||||
+translit_end
|
|
||||||
+
|
|
||||||
+END LC_CTYPE
|
|
||||||
+
|
|
||||||
+LC_COLLATE
|
|
||||||
+order_start forward
|
|
||||||
+<U0000>
|
|
||||||
+..
|
|
||||||
+<UFFFF>
|
|
||||||
+<U10000>
|
|
||||||
+..
|
|
||||||
+<U1FFFF>
|
|
||||||
+<U20000>
|
|
||||||
+..
|
|
||||||
+<U2FFFF>
|
|
||||||
+<UE0000>
|
|
||||||
+..
|
|
||||||
+<UEFFFF>
|
|
||||||
+<UF0000>
|
|
||||||
+..
|
|
||||||
+<UFFFFF>
|
|
||||||
+<U100000>
|
|
||||||
+..
|
|
||||||
+<U10FFFF>
|
|
||||||
+UNDEFINED
|
|
||||||
+order_end
|
|
||||||
+END LC_COLLATE
|
|
||||||
+
|
|
||||||
+LC_MONETARY
|
|
||||||
+% This is the 14652 i18n fdcc-set definition for
|
|
||||||
+% the LC_MONETARY category
|
|
||||||
+% (except for the int_curr_symbol and currency_symbol, they are empty in
|
|
||||||
+% the 14652 i18n fdcc-set definition and also empty in
|
|
||||||
+% glibc/locale/C-monetary.c. But localedef complains in that case).
|
|
||||||
+%
|
|
||||||
+% Using "USD" for int_curr_symbol. But maybe "XXX" would be better?
|
|
||||||
+% XXX is "No currency" (https://en.wikipedia.org/wiki/ISO_4217)
|
|
||||||
+int_curr_symbol "<U0055><U0053><U0044><U0020>"
|
|
||||||
+% Using "$" for currency_symbol. But maybe <U00A4> would be better?
|
|
||||||
+% U+00A4 is the "generic currency symbol"
|
|
||||||
+% (https://en.wikipedia.org/wiki/Currency_sign_%28typography%29)
|
|
||||||
+currency_symbol "<U0024>"
|
|
||||||
+mon_decimal_point "<U002E>"
|
|
||||||
+mon_thousands_sep ""
|
|
||||||
+mon_grouping -1
|
|
||||||
+positive_sign ""
|
|
||||||
+negative_sign "<U002D>"
|
|
||||||
+int_frac_digits -1
|
|
||||||
+frac_digits -1
|
|
||||||
+p_cs_precedes -1
|
|
||||||
+int_p_sep_by_space -1
|
|
||||||
+p_sep_by_space -1
|
|
||||||
+n_cs_precedes -1
|
|
||||||
+int_n_sep_by_space -1
|
|
||||||
+n_sep_by_space -1
|
|
||||||
+p_sign_posn -1
|
|
||||||
+n_sign_posn -1
|
|
||||||
+%
|
|
||||||
+END LC_MONETARY
|
|
||||||
+
|
|
||||||
+LC_NUMERIC
|
|
||||||
+% This is the POSIX Locale definition for
|
|
||||||
+% the LC_NUMERIC category.
|
|
||||||
+%
|
|
||||||
+decimal_point "<U002E>"
|
|
||||||
+thousands_sep ""
|
|
||||||
+grouping -1
|
|
||||||
+END LC_NUMERIC
|
|
||||||
+
|
|
||||||
+LC_TIME
|
|
||||||
+% This is the POSIX Locale definition for
|
|
||||||
+% the LC_TIME category.
|
|
||||||
+%
|
|
||||||
+% Abbreviated weekday names (%a)
|
|
||||||
+abday "<U0053><U0075><U006E>";"<U004D><U006F><U006E>";/
|
|
||||||
+ "<U0054><U0075><U0065>";"<U0057><U0065><U0064>";/
|
|
||||||
+ "<U0054><U0068><U0075>";"<U0046><U0072><U0069>";/
|
|
||||||
+ "<U0053><U0061><U0074>"
|
|
||||||
+
|
|
||||||
+% Full weekday names (%A)
|
|
||||||
+day "<U0053><U0075><U006E><U0064><U0061><U0079>";/
|
|
||||||
+ "<U004D><U006F><U006E><U0064><U0061><U0079>";/
|
|
||||||
+ "<U0054><U0075><U0065><U0073><U0064><U0061><U0079>";/
|
|
||||||
+ "<U0057><U0065><U0064><U006E><U0065><U0073><U0064><U0061><U0079>";/
|
|
||||||
+ "<U0054><U0068><U0075><U0072><U0073><U0064><U0061><U0079>";/
|
|
||||||
+ "<U0046><U0072><U0069><U0064><U0061><U0079>";/
|
|
||||||
+ "<U0053><U0061><U0074><U0075><U0072><U0064><U0061><U0079>"
|
|
||||||
+
|
|
||||||
+% Abbreviated month names (%b)
|
|
||||||
+abmon "<U004A><U0061><U006E>";"<U0046><U0065><U0062>";/
|
|
||||||
+ "<U004D><U0061><U0072>";"<U0041><U0070><U0072>";/
|
|
||||||
+ "<U004D><U0061><U0079>";"<U004A><U0075><U006E>";/
|
|
||||||
+ "<U004A><U0075><U006C>";"<U0041><U0075><U0067>";/
|
|
||||||
+ "<U0053><U0065><U0070>";"<U004F><U0063><U0074>";/
|
|
||||||
+ "<U004E><U006F><U0076>";"<U0044><U0065><U0063>"
|
|
||||||
+
|
|
||||||
+% Full month names (%B)
|
|
||||||
+mon "<U004A><U0061><U006E><U0075><U0061><U0072><U0079>";/
|
|
||||||
+ "<U0046><U0065><U0062><U0072><U0075><U0061><U0072><U0079>";/
|
|
||||||
+ "<U004D><U0061><U0072><U0063><U0068>";/
|
|
||||||
+ "<U0041><U0070><U0072><U0069><U006C>";/
|
|
||||||
+ "<U004D><U0061><U0079>";/
|
|
||||||
+ "<U004A><U0075><U006E><U0065>";/
|
|
||||||
+ "<U004A><U0075><U006C><U0079>";/
|
|
||||||
+ "<U0041><U0075><U0067><U0075><U0073><U0074>";/
|
|
||||||
+ "<U0053><U0065><U0070><U0074><U0065><U006D><U0062><U0065><U0072>";/
|
|
||||||
+ "<U004F><U0063><U0074><U006F><U0062><U0065><U0072>";/
|
|
||||||
+ "<U004E><U006F><U0076><U0065><U006D><U0062><U0065><U0072>";/
|
|
||||||
+ "<U0044><U0065><U0063><U0065><U006D><U0062><U0065><U0072>"
|
|
||||||
+
|
|
||||||
+% Week description, consists of three fields:
|
|
||||||
+% 1. Number of days in a week.
|
|
||||||
+% 2. Gregorian date that is a first weekday (19971130 for Sunday, 19971201 for Monday).
|
|
||||||
+% 3. The weekday number to be contained in the first week of the year.
|
|
||||||
+%
|
|
||||||
+% ISO 8601 conforming applications should use the values 7, 19971201 (a
|
|
||||||
+% Monday), and 4 (Thursday), respectively.
|
|
||||||
+week 7;19971201;4
|
|
||||||
+first_weekday 1
|
|
||||||
+first_workday 1
|
|
||||||
+
|
|
||||||
+% Appropriate date and time representation (%c)
|
|
||||||
+% "%a %b %e %H:%M:%S %Y"
|
|
||||||
+d_t_fmt "<U0025><U0061><U0020><U0025><U0062><U0020><U0025><U0065><U0020><U0025><U0048><U003A><U0025><U004D><U003A><U0025><U0053><U0020><U0025><U0059>"
|
|
||||||
+
|
|
||||||
+% Appropriate date representation (%x)
|
|
||||||
+% "%m/%d/%y"
|
|
||||||
+d_fmt "<U0025><U006D><U002F><U0025><U0064><U002F><U0025><U0079>"
|
|
||||||
+
|
|
||||||
+% Appropriate time representation (%X)
|
|
||||||
+% "%H:%M:%S"
|
|
||||||
+t_fmt "<U0025><U0048><U003A><U0025><U004D><U003A><U0025><U0053>"
|
|
||||||
+
|
|
||||||
+% Appropriate AM/PM time representation (%r)
|
|
||||||
+% "%I:%M:%S %p"
|
|
||||||
+t_fmt_ampm "<U0025><U0049><U003A><U0025><U004D><U003A><U0025><U0053><U0020><U0025><U0070>"
|
|
||||||
+
|
|
||||||
+% Equivalent of AM/PM (%p) "AM"/"PM"
|
|
||||||
+%
|
|
||||||
+am_pm "<U0041><U004D>";"<U0050><U004D>"
|
|
||||||
+
|
|
||||||
+% Appropriate date representation (date(1)) "%a %b %e %H:%M:%S %Z %Y"
|
|
||||||
+date_fmt "<U0025><U0061><U0020><U0025><U0062><U0020><U0025><U0065><U0020><U0025><U0048><U003A><U0025><U004D><U003A><U0025><U0053><U0020><U0025><U005A><U0020><U0025><U0059>"
|
|
||||||
+END LC_TIME
|
|
||||||
+
|
|
||||||
+LC_MESSAGES
|
|
||||||
+% This is the POSIX Locale definition for
|
|
||||||
+% the LC_MESSAGES category.
|
|
||||||
+%
|
|
||||||
+yesexpr "<U005E><U005B><U0079><U0059><U005D>"
|
|
||||||
+noexpr "<U005E><U005B><U006E><U004E><U005D>"
|
|
||||||
+yesstr "<U0059><U0065><U0073>"
|
|
||||||
+nostr "<U004E><U006F>"
|
|
||||||
+END LC_MESSAGES
|
|
||||||
+
|
|
||||||
+LC_PAPER
|
|
||||||
+% This is the ISO/IEC 14652 "i18n" definition for
|
|
||||||
+% the LC_PAPER category.
|
|
||||||
+% (A4 paper, this is also used in the built in C/POSIX
|
|
||||||
+% locale in glibc/locale/C-paper.c)
|
|
||||||
+height 297
|
|
||||||
+width 210
|
|
||||||
+END LC_PAPER
|
|
||||||
+
|
|
||||||
+LC_NAME
|
|
||||||
+% This is the ISO/IEC 14652 "i18n" definition for
|
|
||||||
+% the LC_NAME category.
|
|
||||||
+% "%p%t%g%t%m%t%f"
|
|
||||||
+% (also used in the built in C/POSIX locale in glibc/locale/C-name.c)
|
|
||||||
+name_fmt "<U0025><U0070><U0025><U0074><U0025><U0067><U0025><U0074>/
|
|
||||||
+<U0025><U006D><U0025><U0074><U0025><U0066>"
|
|
||||||
+END LC_NAME
|
|
||||||
+
|
|
||||||
+LC_ADDRESS
|
|
||||||
+% This is the ISO/IEC 14652 "i18n" definition for
|
|
||||||
+% the LC_ADDRESS category.
|
|
||||||
+% "%a%N%f%N%d%N%b%N%s %h %e %r%N%C-%z %T%N%c%N"
|
|
||||||
+% (also used in the built in C/POSIX locale in glibc/locale/C-address.c)
|
|
||||||
+postal_fmt "<U0025><U0061><U0025><U004E><U0025><U0066><U0025><U004E>/
|
|
||||||
+<U0025><U0064><U0025><U004E><U0025><U0062><U0025><U004E><U0025><U0073>/
|
|
||||||
+<U0020><U0025><U0068><U0020><U0025><U0065><U0020><U0025><U0072><U0025>/
|
|
||||||
+<U004E><U0025><U0043><U002D><U0025><U007A><U0020><U0025><U0054><U0025>/
|
|
||||||
+<U004E><U0025><U0063><U0025><U004E>"
|
|
||||||
+END LC_ADDRESS
|
|
||||||
+
|
|
||||||
+LC_TELEPHONE
|
|
||||||
+% This is the ISO/IEC 14652 "i18n" definition for
|
|
||||||
+% the LC_TELEPHONE category.
|
|
||||||
+% "+%c %a %l"
|
|
||||||
+tel_int_fmt "<U002B><U0025><U0063><U0020><U0025><U0061><U0020><U0025>/
|
|
||||||
+<U006C>"
|
|
||||||
+% (also used in the built in C/POSIX locale in glibc/locale/C-telephone.c)
|
|
||||||
+END LC_TELEPHONE
|
|
||||||
+
|
|
||||||
+LC_MEASUREMENT
|
|
||||||
+% This is the ISO/IEC 14652 "i18n" definition for
|
|
||||||
+% the LC_MEASUREMENT category.
|
|
||||||
+% (same as in the built in C/POSIX locale in glibc/locale/C-measurement.c)
|
|
||||||
+%metric
|
|
||||||
+measurement 1
|
|
||||||
+END LC_MEASUREMENT
|
|
||||||
+
|
|
||||||
--
|
|
||||||
2.4.3
|
|
||||||
|
|
@ -1,15 +0,0 @@
|
|||||||
Short description: Adjust CS_PATH return value.
|
|
||||||
Author(s): Fedora glibc team <glibc@lists.fedoraproject.org>
|
|
||||||
Origin: PATCH
|
|
||||||
Upstream status: not-needed
|
|
||||||
|
|
||||||
In Fedora we should return only /usr/bin because /bin is just a symlink
|
|
||||||
to /usr/bin after MoveToUsr transition (which glibc has not really
|
|
||||||
completed).
|
|
||||||
|
|
||||||
diff -pruN a/sysdeps/unix/confstr.h b/sysdeps/unix/confstr.h
|
|
||||||
--- a/sysdeps/unix/confstr.h 2012-12-25 08:32:13.000000000 +0530
|
|
||||||
+++ b/sysdeps/unix/confstr.h 2014-09-05 20:02:55.698275219 +0530
|
|
||||||
@@ -1 +1 @@
|
|
||||||
-#define CS_PATH "/bin:/usr/bin"
|
|
||||||
+#define CS_PATH "/usr/bin"
|
|
@ -1,91 +0,0 @@
|
|||||||
Short description: Cleanup use of _dl_starting_up.
|
|
||||||
Author(s): Fedora glibc team <glibc@lists.fedoraproject.org>
|
|
||||||
Origin: PATCH
|
|
||||||
Upstream status: https://sourceware.org/ml/libc-alpha/2014-02/msg00589.html
|
|
||||||
|
|
||||||
Upstream discussions:
|
|
||||||
https://sourceware.org/ml/libc-alpha/2014-02/msg00580.html
|
|
||||||
|
|
||||||
Based on the following commit:
|
|
||||||
~~~
|
|
||||||
From 16552c01a66633c9e412984d9d92616bd4e5303c Mon Sep 17 00:00:00 2001
|
|
||||||
From: Andreas Schwab <schwab@redhat.com>
|
|
||||||
Date: Fri, 11 Jun 2010 11:04:11 +0200
|
|
||||||
Subject: [PATCH] Properly set __libc_multiple_libcs
|
|
||||||
|
|
||||||
* elf/rtld.c (_dl_starting_up): Always define.
|
|
||||||
(dl_main): Always set _dl_starting_up.
|
|
||||||
* elf/dl-support.c (_dl_starting_up): Always define.
|
|
||||||
* elf/dl-init.c (_dl_init): Always clear _dl_starting_up.
|
|
||||||
|
|
||||||
---
|
|
||||||
ChangeLog | 7 +++++++
|
|
||||||
elf/dl-init.c | 4 ----
|
|
||||||
elf/dl-support.c | 2 --
|
|
||||||
elf/rtld.c | 4 ----
|
|
||||||
4 files changed, 7 insertions(+), 10 deletions(-)
|
|
||||||
~~~
|
|
||||||
|
|
||||||
This patch needs to go upstream to get cleaned up, but has always involved
|
|
||||||
analysis of the GNU/Hurd parts of the change and that stalled out, but
|
|
||||||
perhaps with build-many-glibcs we can now test these changes more easily.
|
|
||||||
|
|
||||||
Index: b/elf/dl-init.c
|
|
||||||
===================================================================
|
|
||||||
--- a/elf/dl-init.c
|
|
||||||
+++ b/elf/dl-init.c
|
|
||||||
@@ -119,8 +119,6 @@ _dl_init (struct link_map *main_map, int
|
|
||||||
while (i-- > 0)
|
|
||||||
call_init (main_map->l_initfini[i], argc, argv, env);
|
|
||||||
|
|
||||||
-#ifndef HAVE_INLINED_SYSCALLS
|
|
||||||
/* Finished starting up. */
|
|
||||||
_dl_starting_up = 0;
|
|
||||||
-#endif
|
|
||||||
}
|
|
||||||
Index: b/elf/dl-support.c
|
|
||||||
===================================================================
|
|
||||||
--- a/elf/dl-support.c
|
|
||||||
+++ b/elf/dl-support.c
|
|
||||||
@@ -117,10 +117,8 @@ struct r_scope_elem _dl_initial_searchli
|
|
||||||
.r_nlist = 1,
|
|
||||||
};
|
|
||||||
|
|
||||||
-#ifndef HAVE_INLINED_SYSCALLS
|
|
||||||
/* Nonzero during startup. */
|
|
||||||
int _dl_starting_up = 1;
|
|
||||||
-#endif
|
|
||||||
|
|
||||||
/* Random data provided by the kernel. */
|
|
||||||
void *_dl_random;
|
|
||||||
Index: b/elf/rtld.c
|
|
||||||
===================================================================
|
|
||||||
--- a/elf/rtld.c
|
|
||||||
+++ b/elf/rtld.c
|
|
||||||
@@ -214,7 +214,6 @@ audit_list_iter_next (struct audit_list_
|
|
||||||
return iter->previous->name;
|
|
||||||
}
|
|
||||||
|
|
||||||
-#ifndef HAVE_INLINED_SYSCALLS
|
|
||||||
/* Set nonzero during loading and initialization of executable and
|
|
||||||
libraries, cleared before the executable's entry point runs. This
|
|
||||||
must not be initialized to nonzero, because the unused dynamic
|
|
||||||
@@ -224,7 +223,6 @@ audit_list_iter_next (struct audit_list_
|
|
||||||
never be called. */
|
|
||||||
int _dl_starting_up = 0;
|
|
||||||
rtld_hidden_def (_dl_starting_up)
|
|
||||||
-#endif
|
|
||||||
|
|
||||||
/* This is the structure which defines all variables global to ld.so
|
|
||||||
(except those which cannot be added for some reason). */
|
|
||||||
@@ -898,10 +896,8 @@ dl_main (const ElfW(Phdr) *phdr,
|
|
||||||
/* Process the environment variable which control the behaviour. */
|
|
||||||
process_envvars (&mode);
|
|
||||||
|
|
||||||
-#ifndef HAVE_INLINED_SYSCALLS
|
|
||||||
/* Set up a flag which tells we are just starting. */
|
|
||||||
_dl_starting_up = 1;
|
|
||||||
-#endif
|
|
||||||
|
|
||||||
if (*user_entry == (ElfW(Addr)) ENTRY_POINT)
|
|
||||||
{
|
|
@ -1,21 +0,0 @@
|
|||||||
Short description: Fedora-specific glibc install locale changes.
|
|
||||||
Author(s): Fedora glibc team <glibc@lists.fedoraproject.org>
|
|
||||||
Origin: PATCH
|
|
||||||
Upstream status: not-needed
|
|
||||||
|
|
||||||
The Fedora glibc build and install does not need the normal install
|
|
||||||
behaviour which updates the locale archive. The Fedora install phase
|
|
||||||
in the spec file of the rpm will handle this manually.
|
|
||||||
|
|
||||||
diff --git a/localedata/Makefile b/localedata/Makefile
|
|
||||||
index a5f3c92d58954dfc..56719c7c714aa0f1 100644
|
|
||||||
--- a/localedata/Makefile
|
|
||||||
+++ b/localedata/Makefile
|
|
||||||
@@ -218,6 +218,7 @@ $(INSTALL-SUPPORTED-LOCALES): install-locales-dir
|
|
||||||
echo -n '...'; \
|
|
||||||
input=`echo $$locale | sed 's/\([^.]*\)[^@]*\(.*\)/\1\2/'`; \
|
|
||||||
$(LOCALEDEF) $$flags --alias-file=../intl/locale.alias \
|
|
||||||
+ --no-archive \
|
|
||||||
-i locales/$$input -f charmaps/$$charset \
|
|
||||||
$(addprefix --prefix=,$(install_root)) $$locale \
|
|
||||||
&& echo ' done'; \
|
|
@ -1,46 +0,0 @@
|
|||||||
Short description: Allow access to internal locale archive functions.
|
|
||||||
Author(s): Fedora glibc team <glibc@lists.fedoraproject.org>
|
|
||||||
Origin: PATCH
|
|
||||||
Upstream status: not-needed
|
|
||||||
|
|
||||||
This is a part of commit glibc-2.3.3-1492-ga891c7b,
|
|
||||||
needed for fedora/build-locale-archive.c only.
|
|
||||||
|
|
||||||
2007-04-16 Jakub Jelinek <jakub@redhat.com>
|
|
||||||
|
|
||||||
* locale/programs/locarchive.c (add_alias, insert_name): Remove static.
|
|
||||||
|
|
||||||
diff -Nrup a/locale/programs/locarchive.c b/locale/programs/locarchive.c
|
|
||||||
--- a/locale/programs/locarchive.c 2012-06-05 07:42:49.000000000 -0600
|
|
||||||
+++ b/locale/programs/locarchive.c 2012-06-07 12:15:21.585319540 -0600
|
|
||||||
@@ -252,9 +252,9 @@ oldlocrecentcmp (const void *a, const vo
|
|
||||||
/* forward decls for below */
|
|
||||||
static uint32_t add_locale (struct locarhandle *ah, const char *name,
|
|
||||||
locale_data_t data, bool replace);
|
|
||||||
-static void add_alias (struct locarhandle *ah, const char *alias,
|
|
||||||
- bool replace, const char *oldname,
|
|
||||||
- uint32_t *locrec_offset_p);
|
|
||||||
+void add_alias (struct locarhandle *ah, const char *alias,
|
|
||||||
+ bool replace, const char *oldname,
|
|
||||||
+ uint32_t *locrec_offset_p);
|
|
||||||
|
|
||||||
|
|
||||||
static bool
|
|
||||||
@@ -635,7 +635,7 @@ close_archive (struct locarhandle *ah)
|
|
||||||
#include "../../intl/explodename.c"
|
|
||||||
#include "../../intl/l10nflist.c"
|
|
||||||
|
|
||||||
-static struct namehashent *
|
|
||||||
+struct namehashent *
|
|
||||||
insert_name (struct locarhandle *ah,
|
|
||||||
const char *name, size_t name_len, bool replace)
|
|
||||||
{
|
|
||||||
@@ -693,7 +693,7 @@ insert_name (struct locarhandle *ah,
|
|
||||||
return &namehashtab[idx];
|
|
||||||
}
|
|
||||||
|
|
||||||
-static void
|
|
||||||
+void
|
|
||||||
add_alias (struct locarhandle *ah, const char *alias, bool replace,
|
|
||||||
const char *oldname, uint32_t *locrec_offset_p)
|
|
||||||
{
|
|
@ -1,31 +0,0 @@
|
|||||||
Short description: Fedora-specific enabling batch read in NSS.
|
|
||||||
Author(s): Fedora glibc team <glibc@lists.fedoraproject.org>
|
|
||||||
Origin: PATCH
|
|
||||||
Bug-RHEL: #188246
|
|
||||||
Upstream status: not-submitted
|
|
||||||
|
|
||||||
Enable batch read in NSS. It's not clear if this is always a win or
|
|
||||||
just a win for NIS+; this needs to be analyzed and sent upstream or
|
|
||||||
removed.
|
|
||||||
|
|
||||||
From baba5d9461d4e8a581ac26fe4412ad783ffc73e7 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Jakub Jelinek <jakub@redhat.com>
|
|
||||||
Date: Mon, 1 May 2006 08:02:53 +0000
|
|
||||||
Subject: [PATCH] Enable SETENT_BATCH_READ nis/nss option by default
|
|
||||||
|
|
||||||
* Mon May 1 2006 Jakub Jelinek <jakub@redhat.com> 2.4.90-4
|
|
||||||
- SETENT_BATCH_READ /etc/default/nss option for speeding up
|
|
||||||
some usages of NIS+ (#188246)
|
|
||||||
|
|
||||||
diff --git a/nis/nss b/nis/nss
|
|
||||||
--- a/nis/nss
|
|
||||||
+++ b/nis/nss
|
|
||||||
@@ -25,7 +25,7 @@
|
|
||||||
# memory with every getXXent() call. Otherwise each getXXent() call
|
|
||||||
# might result into a network communication with the server to get
|
|
||||||
# the next entry.
|
|
||||||
-#SETENT_BATCH_READ=TRUE
|
|
||||||
+SETENT_BATCH_READ=TRUE
|
|
||||||
#
|
|
||||||
# ADJUNCT_AS_SHADOW
|
|
||||||
# If set to TRUE, the passwd routines in the NIS NSS module will not
|
|
@ -1,38 +0,0 @@
|
|||||||
Short description: Do not define _XOPEN_STREAMS.
|
|
||||||
Author(s): Fedora glibc team <glibc@lists.fedoraproject.org>
|
|
||||||
Origin: PATCH
|
|
||||||
Bug-Fedora: #436349
|
|
||||||
Upstream status: not-submitted
|
|
||||||
|
|
||||||
This patch should go upstream. Not defining _XOPEN_STREAMS is the
|
|
||||||
same as setting it to -1 for POSIX conformance. The headers setting
|
|
||||||
needs to be reviewed independently.
|
|
||||||
|
|
||||||
This is part of commit glibc-2.3.3-1564-gd0b6ac6
|
|
||||||
|
|
||||||
* Fri Mar 14 2008 Jakub Jelinek <jakub@redhat.com> 2.7.90-11
|
|
||||||
- remove <stropts.h>, define _XOPEN_STREAMS -1 (#436349)
|
|
||||||
|
|
||||||
diff -Nrup a/nptl/sysdeps/unix/sysv/linux/bits/posix_opt.h b/nptl/sysdeps/unix/sysv/linux/bits/posix_opt.h
|
|
||||||
--- a/sysdeps/unix/sysv/linux/bits/posix_opt.h 2012-06-05 07:42:49.000000000 -0600
|
|
||||||
+++ b/sysdeps/unix/sysv/linux/bits/posix_opt.h 2012-06-07 12:15:21.817318674 -0600
|
|
||||||
@@ -188,4 +188,7 @@
|
|
||||||
/* Typed memory objects are not available. */
|
|
||||||
#define _POSIX_TYPED_MEMORY_OBJECTS -1
|
|
||||||
|
|
||||||
+/* Streams are not available. */
|
|
||||||
+#define _XOPEN_STREAMS -1
|
|
||||||
+
|
|
||||||
#endif /* bits/posix_opt.h */
|
|
||||||
diff -Nrup a/streams/Makefile b/streams/Makefile
|
|
||||||
--- a/streams/Makefile 2012-06-05 07:42:49.000000000 -0600
|
|
||||||
+++ b/streams/Makefile 2012-06-07 12:15:21.824318649 -0600
|
|
||||||
@@ -20,7 +20,7 @@
|
|
||||||
|
|
||||||
include ../Makeconfig
|
|
||||||
|
|
||||||
-headers = stropts.h sys/stropts.h bits/stropts.h bits/xtitypes.h
|
|
||||||
+#headers = stropts.h sys/stropts.h bits/stropts.h bits/xtitypes.h
|
|
||||||
routines = isastream getmsg getpmsg putmsg putpmsg fattach fdetach
|
|
||||||
|
|
||||||
include ../Rules
|
|
@ -1,35 +0,0 @@
|
|||||||
Downstream-specific patch to link DSO sorting tests with -ldl
|
|
||||||
if needed. Upstream does not need this because <dlfcn.h> interfaces
|
|
||||||
are part of libc.
|
|
||||||
|
|
||||||
diff --git a/scripts/dso-ordering-test.py b/scripts/dso-ordering-test.py
|
|
||||||
index 43b5ec4d920ad6a3..ae85e0f4a6ae5b3e 100644
|
|
||||||
--- a/scripts/dso-ordering-test.py
|
|
||||||
+++ b/scripts/dso-ordering-test.py
|
|
||||||
@@ -657,6 +657,8 @@ def process_testcase(t):
|
|
||||||
% (test_name + "-" + dep + ".FAKE.so",
|
|
||||||
("$(objpfx)" + test_subdir + "/"
|
|
||||||
+ test_name + "-" + dep + ".so")))
|
|
||||||
+ makefile.write(
|
|
||||||
+ "LDLIBS-%s += -Wl,--as-needed -ldl -Wl,--no-as-needed\n" % dso)
|
|
||||||
rule = ("$(objpfx)" + test_subdir + "/"
|
|
||||||
+ test_name + "-" + dep + ".FAKE.os: "
|
|
||||||
"$(objpfx)" + test_srcdir
|
|
||||||
@@ -685,6 +687,8 @@ def process_testcase(t):
|
|
||||||
+ test_descr.soname_map[o] + ".so")
|
|
||||||
ldflags += (" -Wl,-soname=" + soname)
|
|
||||||
makefile.write("LDFLAGS-%s = %s\n" % (dso, ldflags))
|
|
||||||
+ makefile.write(
|
|
||||||
+ "LDLIBS-%s += -Wl,--as-needed -ldl -Wl,--no-as-needed\n" % dso)
|
|
||||||
if o in test_descr.callrefs:
|
|
||||||
makefile.write("%s-no-z-defs = yes\n" % (dso))
|
|
||||||
|
|
||||||
@@ -702,6 +706,8 @@ def process_testcase(t):
|
|
||||||
+ test_descr.soname_map['#'] + ".so")
|
|
||||||
ldflags += (" -Wl,-soname=" + soname)
|
|
||||||
makefile.write("LDFLAGS-%s = %s\n" % (test_name, ldflags))
|
|
||||||
+ makefile.write(
|
|
||||||
+ "LDLIBS-%s += -Wl,--as-needed -ldl -Wl,--no-as-needed\n" % test_name)
|
|
||||||
rule = ("$(objpfx)" + test_subdir + "/" + test_name + ".o: "
|
|
||||||
"$(objpfx)" + test_srcdir + test_name + ".c\n"
|
|
||||||
"\t$(compile.c) $(OUTPUT_OPTION)\n")
|
|
@ -1,357 +0,0 @@
|
|||||||
commit 3a0588ae48fb35384a6bd33f9b66403badfa1262
|
|
||||||
Author: Adhemerval Zanella <adhemerval.zanella@linaro.org>
|
|
||||||
Date: Tue Feb 8 15:22:49 2022 -0300
|
|
||||||
|
|
||||||
elf: Fix DFS sorting algorithm for LD_TRACE_LOADED_OBJECTS with missing libraries (BZ #28868)
|
|
||||||
|
|
||||||
On _dl_map_object the underlying file is not opened in trace mode
|
|
||||||
(in other cases where the underlying file can't be opened,
|
|
||||||
_dl_map_object quits with an error). If there are any missing libraries
|
|
||||||
being processed, they will not be considered on final nlist size
|
|
||||||
passed on _dl_sort_maps later in the function. And it is then used by
|
|
||||||
_dl_sort_maps_dfs on the stack allocated working maps:
|
|
||||||
|
|
||||||
222 /* Array to hold RPO sorting results, before we copy back to maps[]. */
|
|
||||||
223 struct link_map *rpo[nmaps];
|
|
||||||
224
|
|
||||||
225 /* The 'head' position during each DFS iteration. Note that we start at
|
|
||||||
226 one past the last element due to first-decrement-then-store (see the
|
|
||||||
227 bottom of above dfs_traversal() routine). */
|
|
||||||
228 struct link_map **rpo_head = &rpo[nmaps];
|
|
||||||
|
|
||||||
However while traversing the 'l_initfini' on dfs_traversal it will
|
|
||||||
still consider the l_faked maps and thus update rpo more times than the
|
|
||||||
allocated working 'rpo', overflowing the stack object.
|
|
||||||
|
|
||||||
As suggested in bugzilla, one option would be to avoid sorting the maps
|
|
||||||
for trace mode. However I think ignoring l_faked object does make
|
|
||||||
sense (there is one less constraint to call the sorting function), it
|
|
||||||
allows slightly less stack usage for trace, and it is a slightly simpler
|
|
||||||
solution.
|
|
||||||
|
|
||||||
The test does trigger the stack overflow, however I tried to make
|
|
||||||
it more generic to check different scenarios or missing objects.
|
|
||||||
|
|
||||||
Checked on x86_64-linux-gnu.
|
|
||||||
|
|
||||||
Reviewed-by: Siddhesh Poyarekar <siddhesh@sourceware.org>
|
|
||||||
|
|
||||||
Conflicts:
|
|
||||||
elf/Makefile
|
|
||||||
(differences in backported tests)
|
|
||||||
|
|
||||||
diff --git a/elf/Makefile b/elf/Makefile
|
|
||||||
index 22a8060f7d3bb1a1..634c3113227d64a6 100644
|
|
||||||
--- a/elf/Makefile
|
|
||||||
+++ b/elf/Makefile
|
|
||||||
@@ -584,6 +584,11 @@ modules-names = \
|
|
||||||
libmarkermod5-3 \
|
|
||||||
libmarkermod5-4 \
|
|
||||||
libmarkermod5-5 \
|
|
||||||
+ libtracemod1-1 \
|
|
||||||
+ libtracemod2-1 \
|
|
||||||
+ libtracemod3-1 \
|
|
||||||
+ libtracemod4-1 \
|
|
||||||
+ libtracemod5-1 \
|
|
||||||
ltglobmod1 \
|
|
||||||
ltglobmod2 \
|
|
||||||
neededobj1 \
|
|
||||||
@@ -983,6 +988,11 @@ tests-special += \
|
|
||||||
$(objpfx)tst-initorder2-cmp.out \
|
|
||||||
$(objpfx)tst-unused-dep-cmp.out \
|
|
||||||
$(objpfx)tst-unused-dep.out \
|
|
||||||
+ $(objpfx)tst-trace1.out \
|
|
||||||
+ $(objpfx)tst-trace2.out \
|
|
||||||
+ $(objpfx)tst-trace3.out \
|
|
||||||
+ $(objpfx)tst-trace4.out \
|
|
||||||
+ $(objpfx)tst-trace5.out \
|
|
||||||
# tests-special
|
|
||||||
endif
|
|
||||||
|
|
||||||
@@ -2619,6 +2629,51 @@ $(objpfx)tst-rtld-run-static.out: $(objpfx)/ldconfig
|
|
||||||
|
|
||||||
$(objpfx)tst-dlmopen-gethostbyname: $(libdl)
|
|
||||||
$(objpfx)tst-dlmopen-gethostbyname.out: $(objpfx)tst-dlmopen-gethostbyname-mod.so
|
|
||||||
+
|
|
||||||
+LDFLAGS-libtracemod1-1.so += -Wl,-soname,libtracemod1.so
|
|
||||||
+LDFLAGS-libtracemod2-1.so += -Wl,-soname,libtracemod2.so
|
|
||||||
+LDFLAGS-libtracemod3-1.so += -Wl,-soname,libtracemod3.so
|
|
||||||
+LDFLAGS-libtracemod4-1.so += -Wl,-soname,libtracemod4.so
|
|
||||||
+LDFLAGS-libtracemod5-1.so += -Wl,-soname,libtracemod5.so
|
|
||||||
+
|
|
||||||
+$(objpfx)libtracemod1-1.so: $(objpfx)libtracemod2-1.so \
|
|
||||||
+ $(objpfx)libtracemod3-1.so
|
|
||||||
+$(objpfx)libtracemod2-1.so: $(objpfx)libtracemod4-1.so \
|
|
||||||
+ $(objpfx)libtracemod5-1.so
|
|
||||||
+
|
|
||||||
+define libtracemod-x
|
|
||||||
+$(objpfx)libtracemod$(1)/libtracemod$(1).so: $(objpfx)libtracemod$(1)-1.so
|
|
||||||
+ $$(make-target-directory)
|
|
||||||
+ cp $$< $$@
|
|
||||||
+endef
|
|
||||||
+libtracemod-suffixes = 1 2 3 4 5
|
|
||||||
+$(foreach i,$(libtracemod-suffixes), $(eval $(call libtracemod-x,$(i))))
|
|
||||||
+
|
|
||||||
+define tst-trace-skeleton
|
|
||||||
+$(objpfx)tst-trace$(1).out: $(objpfx)libtracemod1/libtracemod1.so \
|
|
||||||
+ $(objpfx)libtracemod2/libtracemod2.so \
|
|
||||||
+ $(objpfx)libtracemod3/libtracemod3.so \
|
|
||||||
+ $(objpfx)libtracemod4/libtracemod4.so \
|
|
||||||
+ $(objpfx)libtracemod5/libtracemod5.so \
|
|
||||||
+ $(..)scripts/tst-ld-trace.py \
|
|
||||||
+ tst-trace$(1).exp
|
|
||||||
+ ${ $(PYTHON) $(..)scripts/tst-ld-trace.py \
|
|
||||||
+ "$(test-wrapper-env) $(elf-objpfx)$(rtld-installed-name) \
|
|
||||||
+ --library-path $(common-objpfx):$(strip $(2)) \
|
|
||||||
+ $(objpfx)libtracemod1/libtracemod1.so" tst-trace$(1).exp \
|
|
||||||
+ } > $$@; $$(evaluate-test)
|
|
||||||
+endef
|
|
||||||
+
|
|
||||||
+$(eval $(call tst-trace-skeleton,1,))
|
|
||||||
+$(eval $(call tst-trace-skeleton,2,\
|
|
||||||
+ $(objpfx)libtracemod2))
|
|
||||||
+$(eval $(call tst-trace-skeleton,3,\
|
|
||||||
+ $(objpfx)libtracemod2:$(objpfx)libtracemod3))
|
|
||||||
+$(eval $(call tst-trace-skeleton,4,\
|
|
||||||
+ $(objpfx)libtracemod2:$(objpfx)libtracemod3:$(objpfx)libtracemod4))
|
|
||||||
+$(eval $(call tst-trace-skeleton,5,\
|
|
||||||
+ $(objpfx)libtracemod2:$(objpfx)libtracemod3:$(objpfx)libtracemod4:$(objpfx)libtracemod5))
|
|
||||||
+
|
|
||||||
$(objpfx)tst-audit-tlsdesc: $(objpfx)tst-audit-tlsdesc-mod1.so \
|
|
||||||
$(objpfx)tst-audit-tlsdesc-mod2.so \
|
|
||||||
$(shared-thread-library)
|
|
||||||
diff --git a/elf/dl-deps.c b/elf/dl-deps.c
|
|
||||||
index 9365d54c8e03e5f4..9ff589c8562b2dd1 100644
|
|
||||||
--- a/elf/dl-deps.c
|
|
||||||
+++ b/elf/dl-deps.c
|
|
||||||
@@ -489,6 +489,8 @@ _dl_map_object_deps (struct link_map *map,
|
|
||||||
|
|
||||||
for (nlist = 0, runp = known; runp; runp = runp->next)
|
|
||||||
{
|
|
||||||
+ /* _dl_sort_maps ignores l_faked object, so it is safe to not consider
|
|
||||||
+ them for nlist. */
|
|
||||||
if (__builtin_expect (trace_mode, 0) && runp->map->l_faked)
|
|
||||||
/* This can happen when we trace the loading. */
|
|
||||||
--map->l_searchlist.r_nlist;
|
|
||||||
diff --git a/elf/dl-sort-maps.c b/elf/dl-sort-maps.c
|
|
||||||
index 398a08f28c4d9ff1..99354dc08a010dd3 100644
|
|
||||||
--- a/elf/dl-sort-maps.c
|
|
||||||
+++ b/elf/dl-sort-maps.c
|
|
||||||
@@ -140,7 +140,9 @@ static void
|
|
||||||
dfs_traversal (struct link_map ***rpo, struct link_map *map,
|
|
||||||
bool *do_reldeps)
|
|
||||||
{
|
|
||||||
- if (map->l_visited)
|
|
||||||
+ /* _dl_map_object_deps ignores l_faked objects when calculating the
|
|
||||||
+ number of maps before calling _dl_sort_maps, ignore them as well. */
|
|
||||||
+ if (map->l_visited || map->l_faked)
|
|
||||||
return;
|
|
||||||
|
|
||||||
map->l_visited = 1;
|
|
||||||
diff --git a/elf/libtracemod1-1.c b/elf/libtracemod1-1.c
|
|
||||||
new file mode 100644
|
|
||||||
index 0000000000000000..7c89c9a5a40b9668
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/elf/libtracemod1-1.c
|
|
||||||
@@ -0,0 +1 @@
|
|
||||||
+/* Empty */
|
|
||||||
diff --git a/elf/libtracemod2-1.c b/elf/libtracemod2-1.c
|
|
||||||
new file mode 100644
|
|
||||||
index 0000000000000000..7c89c9a5a40b9668
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/elf/libtracemod2-1.c
|
|
||||||
@@ -0,0 +1 @@
|
|
||||||
+/* Empty */
|
|
||||||
diff --git a/elf/libtracemod3-1.c b/elf/libtracemod3-1.c
|
|
||||||
new file mode 100644
|
|
||||||
index 0000000000000000..7c89c9a5a40b9668
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/elf/libtracemod3-1.c
|
|
||||||
@@ -0,0 +1 @@
|
|
||||||
+/* Empty */
|
|
||||||
diff --git a/elf/libtracemod4-1.c b/elf/libtracemod4-1.c
|
|
||||||
new file mode 100644
|
|
||||||
index 0000000000000000..7c89c9a5a40b9668
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/elf/libtracemod4-1.c
|
|
||||||
@@ -0,0 +1 @@
|
|
||||||
+/* Empty */
|
|
||||||
diff --git a/elf/libtracemod5-1.c b/elf/libtracemod5-1.c
|
|
||||||
new file mode 100644
|
|
||||||
index 0000000000000000..7c89c9a5a40b9668
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/elf/libtracemod5-1.c
|
|
||||||
@@ -0,0 +1 @@
|
|
||||||
+/* Empty */
|
|
||||||
diff --git a/elf/tst-trace1.exp b/elf/tst-trace1.exp
|
|
||||||
new file mode 100644
|
|
||||||
index 0000000000000000..4a6f5211a68fe2c8
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/elf/tst-trace1.exp
|
|
||||||
@@ -0,0 +1,4 @@
|
|
||||||
+ld 1
|
|
||||||
+libc 1
|
|
||||||
+libtracemod2.so 0
|
|
||||||
+libtracemod3.so 0
|
|
||||||
diff --git a/elf/tst-trace2.exp b/elf/tst-trace2.exp
|
|
||||||
new file mode 100644
|
|
||||||
index 0000000000000000..e13506e2eb9aeca2
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/elf/tst-trace2.exp
|
|
||||||
@@ -0,0 +1,6 @@
|
|
||||||
+ld 1
|
|
||||||
+libc 1
|
|
||||||
+libtracemod2.so 1
|
|
||||||
+libtracemod3.so 0
|
|
||||||
+libtracemod4.so 0
|
|
||||||
+libtracemod5.so 0
|
|
||||||
diff --git a/elf/tst-trace3.exp b/elf/tst-trace3.exp
|
|
||||||
new file mode 100644
|
|
||||||
index 0000000000000000..e574549d12a53d72
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/elf/tst-trace3.exp
|
|
||||||
@@ -0,0 +1,6 @@
|
|
||||||
+ld 1
|
|
||||||
+libc 1
|
|
||||||
+libtracemod2.so 1
|
|
||||||
+libtracemod3.so 1
|
|
||||||
+libtracemod4.so 0
|
|
||||||
+libtracemod5.so 0
|
|
||||||
diff --git a/elf/tst-trace4.exp b/elf/tst-trace4.exp
|
|
||||||
new file mode 100644
|
|
||||||
index 0000000000000000..31ca97b35bde0009
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/elf/tst-trace4.exp
|
|
||||||
@@ -0,0 +1,6 @@
|
|
||||||
+ld 1
|
|
||||||
+libc 1
|
|
||||||
+libtracemod2.so 1
|
|
||||||
+libtracemod3.so 1
|
|
||||||
+libtracemod4.so 1
|
|
||||||
+libtracemod5.so 0
|
|
||||||
diff --git a/elf/tst-trace5.exp b/elf/tst-trace5.exp
|
|
||||||
new file mode 100644
|
|
||||||
index 0000000000000000..5d7d95372656396f
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/elf/tst-trace5.exp
|
|
||||||
@@ -0,0 +1,6 @@
|
|
||||||
+ld 1
|
|
||||||
+libc 1
|
|
||||||
+libtracemod2.so 1
|
|
||||||
+libtracemod3.so 1
|
|
||||||
+libtracemod4.so 1
|
|
||||||
+libtracemod5.so 1
|
|
||||||
diff --git a/scripts/tst-ld-trace.py b/scripts/tst-ld-trace.py
|
|
||||||
new file mode 100755
|
|
||||||
index 0000000000000000..f5a402800377f44b
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/scripts/tst-ld-trace.py
|
|
||||||
@@ -0,0 +1,108 @@
|
|
||||||
+#!/usr/bin/python3
|
|
||||||
+# Dump the output of LD_TRACE_LOADED_OBJECTS in architecture neutral format.
|
|
||||||
+# Copyright (C) 2022 Free Software Foundation, Inc.
|
|
||||||
+# Copyright The GNU Toolchain Authors.
|
|
||||||
+# This file is part of the GNU C Library.
|
|
||||||
+#
|
|
||||||
+# The GNU C Library is free software; you can redistribute it and/or
|
|
||||||
+# modify it under the terms of the GNU Lesser General Public
|
|
||||||
+# License as published by the Free Software Foundation; either
|
|
||||||
+# version 2.1 of the License, or (at your option) any later version.
|
|
||||||
+#
|
|
||||||
+# The GNU C Library is distributed in the hope that it will be useful,
|
|
||||||
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
+# Lesser General Public License for more details.
|
|
||||||
+#
|
|
||||||
+# You should have received a copy of the GNU Lesser General Public
|
|
||||||
+# License along with the GNU C Library; if not, see
|
|
||||||
+# <https://www.gnu.org/licenses/>.
|
|
||||||
+
|
|
||||||
+import argparse
|
|
||||||
+import os
|
|
||||||
+import subprocess
|
|
||||||
+import sys
|
|
||||||
+
|
|
||||||
+try:
|
|
||||||
+ subprocess.run
|
|
||||||
+except:
|
|
||||||
+ class _CompletedProcess:
|
|
||||||
+ def __init__(self, args, returncode, stdout=None, stderr=None):
|
|
||||||
+ self.args = args
|
|
||||||
+ self.returncode = returncode
|
|
||||||
+ self.stdout = stdout
|
|
||||||
+ self.stderr = stderr
|
|
||||||
+
|
|
||||||
+ def _run(*popenargs, input=None, timeout=None, check=False, **kwargs):
|
|
||||||
+ assert(timeout is None)
|
|
||||||
+ with subprocess.Popen(*popenargs, **kwargs) as process:
|
|
||||||
+ try:
|
|
||||||
+ stdout, stderr = process.communicate(input)
|
|
||||||
+ except:
|
|
||||||
+ process.kill()
|
|
||||||
+ process.wait()
|
|
||||||
+ raise
|
|
||||||
+ returncode = process.poll()
|
|
||||||
+ if check and returncode:
|
|
||||||
+ raise subprocess.CalledProcessError(returncode, popenargs)
|
|
||||||
+ return _CompletedProcess(popenargs, returncode, stdout, stderr)
|
|
||||||
+
|
|
||||||
+ subprocess.run = _run
|
|
||||||
+
|
|
||||||
+def is_vdso(lib):
|
|
||||||
+ return lib.startswith('linux-gate') or lib.startswith('linux-vdso')
|
|
||||||
+
|
|
||||||
+
|
|
||||||
+def parse_trace(cmd, fref):
|
|
||||||
+ new_env = os.environ.copy()
|
|
||||||
+ new_env['LD_TRACE_LOADED_OBJECTS'] = '1'
|
|
||||||
+ trace_out = subprocess.run(cmd, stdout=subprocess.PIPE, check=True,
|
|
||||||
+ universal_newlines=True, env=new_env).stdout
|
|
||||||
+ trace = []
|
|
||||||
+ for line in trace_out.splitlines():
|
|
||||||
+ line = line.strip()
|
|
||||||
+ if is_vdso(line):
|
|
||||||
+ continue
|
|
||||||
+ fields = line.split('=>' if '=>' in line else ' ')
|
|
||||||
+ lib = os.path.basename(fields[0].strip())
|
|
||||||
+ if lib.startswith('ld'):
|
|
||||||
+ lib = 'ld'
|
|
||||||
+ elif lib.startswith('libc'):
|
|
||||||
+ lib = 'libc'
|
|
||||||
+ found = 1 if fields[1].strip() != 'not found' else 0
|
|
||||||
+ trace += ['{} {}'.format(lib, found)]
|
|
||||||
+ trace = sorted(trace)
|
|
||||||
+
|
|
||||||
+ reference = sorted(line.replace('\n','') for line in fref.readlines())
|
|
||||||
+
|
|
||||||
+ ret = 0 if trace == reference else 1
|
|
||||||
+ if ret != 0:
|
|
||||||
+ for i in reference:
|
|
||||||
+ if i not in trace:
|
|
||||||
+ print("Only in {}: {}".format(fref.name, i))
|
|
||||||
+ for i in trace:
|
|
||||||
+ if i not in reference:
|
|
||||||
+ print("Only in trace: {}".format(i))
|
|
||||||
+
|
|
||||||
+ sys.exit(ret)
|
|
||||||
+
|
|
||||||
+
|
|
||||||
+def get_parser():
|
|
||||||
+ parser = argparse.ArgumentParser(description=__doc__)
|
|
||||||
+ parser.add_argument('command',
|
|
||||||
+ help='comand to run')
|
|
||||||
+ parser.add_argument('reference',
|
|
||||||
+ help='reference file to compare')
|
|
||||||
+ return parser
|
|
||||||
+
|
|
||||||
+
|
|
||||||
+def main(argv):
|
|
||||||
+ parser = get_parser()
|
|
||||||
+ opts = parser.parse_args(argv)
|
|
||||||
+ with open(opts.reference, 'r') as fref:
|
|
||||||
+ # Remove the initial 'env' command.
|
|
||||||
+ parse_trace(opts.command.split()[1:], fref)
|
|
||||||
+
|
|
||||||
+
|
|
||||||
+if __name__ == '__main__':
|
|
||||||
+ main(sys.argv[1:])
|
|
@ -1,36 +0,0 @@
|
|||||||
commit a2211c76c3b994099fd58a06d6072d7495d699cd
|
|
||||||
Author: Florian Weimer <fweimer@redhat.com>
|
|
||||||
Date: Fri Mar 18 18:18:35 2022 +0100
|
|
||||||
|
|
||||||
scripts/dso-ordering-test.py: Fix C&P error in * callrefs processing
|
|
||||||
|
|
||||||
The elf/dso-sort-tests-src subdirectory is not changed by this commit,
|
|
||||||
so it seems that the cut-and-paste error was not material.
|
|
||||||
|
|
||||||
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
|
|
||||||
|
|
||||||
diff --git a/scripts/dso-ordering-test.py b/scripts/dso-ordering-test.py
|
|
||||||
index bde0406be9da14fc..ee476c810c76f1b0 100644
|
|
||||||
--- a/scripts/dso-ordering-test.py
|
|
||||||
+++ b/scripts/dso-ordering-test.py
|
|
||||||
@@ -551,17 +551,17 @@ def process_testcase(t):
|
|
||||||
if obj in t.deps:
|
|
||||||
deps = t.deps[obj]
|
|
||||||
if '*' in deps:
|
|
||||||
- t.deps[obj].remove('*')
|
|
||||||
+ deps.remove('*')
|
|
||||||
t.add_deps([obj], non_dep_tgt_objs)
|
|
||||||
if obj in t.callrefs:
|
|
||||||
deps = t.callrefs[obj]
|
|
||||||
if '*' in deps:
|
|
||||||
- t.deps[obj].remove('*')
|
|
||||||
+ deps.remove('*')
|
|
||||||
t.add_callrefs([obj], non_dep_tgt_objs)
|
|
||||||
if "#" in t.deps:
|
|
||||||
deps = t.deps["#"]
|
|
||||||
if '*' in deps:
|
|
||||||
- t.deps["#"].remove('*')
|
|
||||||
+ deps.remove('*')
|
|
||||||
t.add_deps(["#"], non_dep_tgt_objs)
|
|
||||||
|
|
||||||
# If no main program was specified in dependency description, make a
|
|
@ -1,49 +0,0 @@
|
|||||||
Backport of this Fedora Rawhide commit but split out into a distinct
|
|
||||||
patch.
|
|
||||||
|
|
||||||
commit 72195d44855ab96875f117acb75c37f98dcb26a9
|
|
||||||
Author: Carlos O'Donell <carlos@redhat.com>
|
|
||||||
Date: Thu Jun 6 23:58:21 2019 -0400
|
|
||||||
|
|
||||||
locale: Fix C.UTF-8 ranges.
|
|
||||||
|
|
||||||
The ellipsis range support only allows <Uxxxx> or <Uxxxxxxxx> as
|
|
||||||
valid unicode code points, otherwise it treats it as a symbol and
|
|
||||||
since we don't define the symbol the entire range is unused.
|
|
||||||
|
|
||||||
diff --git a/localedata/locales/C b/localedata/locales/C
|
|
||||||
index b2c2d1dc417cde69..30d9563213b8cb0f 100644
|
|
||||||
--- a/localedata/locales/C
|
|
||||||
+++ b/localedata/locales/C
|
|
||||||
@@ -43,21 +43,21 @@ order_start forward
|
|
||||||
<U0000>
|
|
||||||
..
|
|
||||||
<UFFFF>
|
|
||||||
-<U10000>
|
|
||||||
+<U00010000>
|
|
||||||
..
|
|
||||||
-<U1FFFF>
|
|
||||||
-<U20000>
|
|
||||||
+<U0001FFFF>
|
|
||||||
+<U00020000>
|
|
||||||
..
|
|
||||||
-<U2FFFF>
|
|
||||||
-<UE0000>
|
|
||||||
+<U0002FFFF>
|
|
||||||
+<U000E0000>
|
|
||||||
..
|
|
||||||
-<UEFFFF>
|
|
||||||
-<UF0000>
|
|
||||||
+<U000EFFFF>
|
|
||||||
+<U000F0000>
|
|
||||||
..
|
|
||||||
-<UFFFFF>
|
|
||||||
-<U100000>
|
|
||||||
+<U000FFFFF>
|
|
||||||
+<U00100000>
|
|
||||||
..
|
|
||||||
-<U10FFFF>
|
|
||||||
+<U0010FFFF>
|
|
||||||
UNDEFINED
|
|
||||||
order_end
|
|
||||||
END LC_COLLATE
|
|
@ -1,185 +0,0 @@
|
|||||||
commit 96cd0558bcd69481ccc42e1b392f0c0b36fce2b0
|
|
||||||
Author: Florian Weimer <fweimer@redhat.com>
|
|
||||||
Date: Wed Nov 28 19:59:45 2018 +0100
|
|
||||||
|
|
||||||
support: Add signal support to support_capture_subprocess_check
|
|
||||||
|
|
||||||
Signal zero does not terminate a process, so it is safe to use negative
|
|
||||||
values for signal numbers.
|
|
||||||
|
|
||||||
Adjust libio/tst-vtables-common.c to use this new functionality,
|
|
||||||
instead of determining the termination status for a signal indirectly.
|
|
||||||
|
|
||||||
diff --git a/libio/tst-vtables-common.c b/libio/tst-vtables-common.c
|
|
||||||
index 5e3101206919fa1b..85e246cd1131f8e8 100644
|
|
||||||
--- a/libio/tst-vtables-common.c
|
|
||||||
+++ b/libio/tst-vtables-common.c
|
|
||||||
@@ -380,21 +380,6 @@ without_compatibility_fflush (void *closure)
|
|
||||||
_exit (1);
|
|
||||||
}
|
|
||||||
|
|
||||||
-/* Exit status after abnormal termination. */
|
|
||||||
-static int termination_status;
|
|
||||||
-
|
|
||||||
-static void
|
|
||||||
-init_termination_status (void)
|
|
||||||
-{
|
|
||||||
- pid_t pid = xfork ();
|
|
||||||
- if (pid == 0)
|
|
||||||
- abort ();
|
|
||||||
- xwaitpid (pid, &termination_status, 0);
|
|
||||||
-
|
|
||||||
- TEST_VERIFY (WIFSIGNALED (termination_status));
|
|
||||||
- TEST_COMPARE (WTERMSIG (termination_status), SIGABRT);
|
|
||||||
-}
|
|
||||||
-
|
|
||||||
static void
|
|
||||||
check_for_termination (const char *name, void (*callback) (void *))
|
|
||||||
{
|
|
||||||
@@ -404,7 +389,7 @@ check_for_termination (const char *name, void (*callback) (void *))
|
|
||||||
shared->calls = 0;
|
|
||||||
struct support_capture_subprocess proc
|
|
||||||
= support_capture_subprocess (callback, NULL);
|
|
||||||
- support_capture_subprocess_check (&proc, name, termination_status,
|
|
||||||
+ support_capture_subprocess_check (&proc, name, -SIGABRT,
|
|
||||||
sc_allow_stderr);
|
|
||||||
const char *message
|
|
||||||
= "Fatal error: glibc detected an invalid stdio handle\n";
|
|
||||||
@@ -491,7 +476,6 @@ run_tests (bool initially_disabled)
|
|
||||||
|
|
||||||
shared = support_shared_allocate (sizeof (*shared));
|
|
||||||
shared->initially_disabled = initially_disabled;
|
|
||||||
- init_termination_status ();
|
|
||||||
|
|
||||||
if (initially_disabled)
|
|
||||||
{
|
|
||||||
diff --git a/support/capture_subprocess.h b/support/capture_subprocess.h
|
|
||||||
index d5eac84d09ae325f..2d2384e73df0d2d0 100644
|
|
||||||
--- a/support/capture_subprocess.h
|
|
||||||
+++ b/support/capture_subprocess.h
|
|
||||||
@@ -55,13 +55,16 @@ enum support_capture_allow
|
|
||||||
sc_allow_stderr = 0x04,
|
|
||||||
};
|
|
||||||
|
|
||||||
-/* Check that the subprocess exited with STATUS and that only the
|
|
||||||
- allowed outputs happened. ALLOWED is a combination of
|
|
||||||
- support_capture_allow flags. Report errors under the CONTEXT
|
|
||||||
- message. */
|
|
||||||
+/* Check that the subprocess exited and that only the allowed outputs
|
|
||||||
+ happened. If STATUS_OR_SIGNAL is nonnegative, it is the expected
|
|
||||||
+ (decoded) exit status of the process, as returned by WEXITSTATUS.
|
|
||||||
+ If STATUS_OR_SIGNAL is negative, -STATUS_OR_SIGNAL is the expected
|
|
||||||
+ termination signal, as returned by WTERMSIG. ALLOWED is a
|
|
||||||
+ combination of support_capture_allow flags. Report errors under
|
|
||||||
+ the CONTEXT message. */
|
|
||||||
void support_capture_subprocess_check (struct support_capture_subprocess *,
|
|
||||||
- const char *context, int status,
|
|
||||||
- int allowed)
|
|
||||||
+ const char *context,
|
|
||||||
+ int status_or_signal, int allowed)
|
|
||||||
__attribute__ ((nonnull (1, 2)));
|
|
||||||
|
|
||||||
#endif /* SUPPORT_CAPTURE_SUBPROCESS_H */
|
|
||||||
diff --git a/support/support_capture_subprocess_check.c b/support/support_capture_subprocess_check.c
|
|
||||||
index ff5ee89fb02599ae..8b4c352c96227b78 100644
|
|
||||||
--- a/support/support_capture_subprocess_check.c
|
|
||||||
+++ b/support/support_capture_subprocess_check.c
|
|
||||||
@@ -20,6 +20,7 @@
|
|
||||||
#include <stdio.h>
|
|
||||||
#include <support/capture_subprocess.h>
|
|
||||||
#include <support/check.h>
|
|
||||||
+#include <sys/wait.h>
|
|
||||||
|
|
||||||
static void
|
|
||||||
print_context (const char *context, bool *failed)
|
|
||||||
@@ -31,9 +32,22 @@ print_context (const char *context, bool *failed)
|
|
||||||
printf ("error: subprocess failed: %s\n", context);
|
|
||||||
}
|
|
||||||
|
|
||||||
+static void
|
|
||||||
+print_actual_status (struct support_capture_subprocess *proc)
|
|
||||||
+{
|
|
||||||
+ if (WIFEXITED (proc->status))
|
|
||||||
+ printf ("error: actual exit status: %d [0x%x]\n",
|
|
||||||
+ WEXITSTATUS (proc->status), proc->status);
|
|
||||||
+ else if (WIFSIGNALED (proc->status))
|
|
||||||
+ printf ("error: actual termination signal: %d [0x%x]\n",
|
|
||||||
+ WTERMSIG (proc->status), proc->status);
|
|
||||||
+ else
|
|
||||||
+ printf ("error: actual undecoded exit status: [0x%x]\n", proc->status);
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
void
|
|
||||||
support_capture_subprocess_check (struct support_capture_subprocess *proc,
|
|
||||||
- const char *context, int status,
|
|
||||||
+ const char *context, int status_or_signal,
|
|
||||||
int allowed)
|
|
||||||
{
|
|
||||||
TEST_VERIFY ((allowed & sc_allow_none)
|
|
||||||
@@ -44,11 +58,28 @@ support_capture_subprocess_check (struct support_capture_subprocess *proc,
|
|
||||||
|| (allowed & sc_allow_stderr))));
|
|
||||||
|
|
||||||
bool failed = false;
|
|
||||||
- if (proc->status != status)
|
|
||||||
+ if (status_or_signal >= 0)
|
|
||||||
{
|
|
||||||
- print_context (context, &failed);
|
|
||||||
- printf ("error: expected exit status: %d\n", status);
|
|
||||||
- printf ("error: actual exit status: %d\n", proc->status);
|
|
||||||
+ /* Expect regular termination. */
|
|
||||||
+ if (!(WIFEXITED (proc->status)
|
|
||||||
+ && WEXITSTATUS (proc->status) == status_or_signal))
|
|
||||||
+ {
|
|
||||||
+ print_context (context, &failed);
|
|
||||||
+ printf ("error: expected exit status: %d\n", status_or_signal);
|
|
||||||
+ print_actual_status (proc);
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+ else
|
|
||||||
+ {
|
|
||||||
+ /* status_or_signal < 0. Expect termination by signal. */
|
|
||||||
+ if (!(WIFSIGNALED (proc->status)
|
|
||||||
+ && WTERMSIG (proc->status) == -status_or_signal))
|
|
||||||
+ {
|
|
||||||
+ print_context (context, &failed);
|
|
||||||
+ printf ("error: expected termination signal: %d\n",
|
|
||||||
+ -status_or_signal);
|
|
||||||
+ print_actual_status (proc);
|
|
||||||
+ }
|
|
||||||
}
|
|
||||||
if (!(allowed & sc_allow_stdout) && proc->out.length != 0)
|
|
||||||
{
|
|
||||||
diff --git a/support/tst-support_capture_subprocess.c b/support/tst-support_capture_subprocess.c
|
|
||||||
index 63b6699622f97fcc..99570879eedd65b1 100644
|
|
||||||
--- a/support/tst-support_capture_subprocess.c
|
|
||||||
+++ b/support/tst-support_capture_subprocess.c
|
|
||||||
@@ -285,15 +285,29 @@ do_multiple_tests (enum test_type type)
|
|
||||||
|
|
||||||
check_stream ("stdout", &result.out, test.out);
|
|
||||||
check_stream ("stderr", &result.err, test.err);
|
|
||||||
+
|
|
||||||
+ /* Allowed output for support_capture_subprocess_check. */
|
|
||||||
+ int check_allow = 0;
|
|
||||||
+ if (lengths[length_idx_stdout] > 0)
|
|
||||||
+ check_allow |= sc_allow_stdout;
|
|
||||||
+ if (lengths[length_idx_stderr] > 0)
|
|
||||||
+ check_allow |= sc_allow_stderr;
|
|
||||||
+ if (check_allow == 0)
|
|
||||||
+ check_allow = sc_allow_none;
|
|
||||||
+
|
|
||||||
if (test.signal != 0)
|
|
||||||
{
|
|
||||||
TEST_VERIFY (WIFSIGNALED (result.status));
|
|
||||||
TEST_VERIFY (WTERMSIG (result.status) == test.signal);
|
|
||||||
+ support_capture_subprocess_check (&result, "signal",
|
|
||||||
+ -SIGTERM, check_allow);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
TEST_VERIFY (WIFEXITED (result.status));
|
|
||||||
TEST_VERIFY (WEXITSTATUS (result.status) == test.status);
|
|
||||||
+ support_capture_subprocess_check (&result, "exit",
|
|
||||||
+ test.status, check_allow);
|
|
||||||
}
|
|
||||||
support_capture_subprocess_free (&result);
|
|
||||||
free (test.out);
|
|
@ -1,42 +0,0 @@
|
|||||||
commit e37c2cf299b61ce18f62852f6c5624c27829b610
|
|
||||||
Author: Florian Weimer <fweimer@redhat.com>
|
|
||||||
Date: Thu Oct 31 18:48:43 2019 +0100
|
|
||||||
|
|
||||||
Move _dl_open_check to its original place in dl_open_worker
|
|
||||||
|
|
||||||
This reverts the non-test change from commit d0093c5cefb7f7a4143f
|
|
||||||
("Call _dl_open_check after relocation [BZ #24259]"), given that
|
|
||||||
the underlying bug has been fixed properly in commit 61b74477fa7f63
|
|
||||||
("Remove all loaded objects if dlopen fails, ignoring NODELETE
|
|
||||||
[BZ #20839]").
|
|
||||||
|
|
||||||
Tested on x86-64-linux-gnu, with and without --enable-cet.
|
|
||||||
|
|
||||||
Change-Id: I995a6cfb89f25d2b0cf5e606428c2a93eb48fc33
|
|
||||||
|
|
||||||
diff --git a/elf/dl-open.c b/elf/dl-open.c
|
|
||||||
index 25838b073ac1edaf..e13968d4d7c4c83f 100644
|
|
||||||
--- a/elf/dl-open.c
|
|
||||||
+++ b/elf/dl-open.c
|
|
||||||
@@ -619,6 +619,8 @@ dl_open_worker (void *a)
|
|
||||||
_dl_debug_state ();
|
|
||||||
LIBC_PROBE (map_complete, 3, args->nsid, r, new);
|
|
||||||
|
|
||||||
+ _dl_open_check (new);
|
|
||||||
+
|
|
||||||
/* Print scope information. */
|
|
||||||
if (__glibc_unlikely (GLRO(dl_debug_mask) & DL_DEBUG_SCOPES))
|
|
||||||
_dl_show_scope (new, 0);
|
|
||||||
@@ -699,12 +701,6 @@ dl_open_worker (void *a)
|
|
||||||
_dl_relocate_object (l, l->l_scope, reloc_mode, 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
- /* NB: Workaround for [BZ #20839] which doesn't remove the NODELETE
|
|
||||||
- object when _dl_open_check throws an exception. Move it after
|
|
||||||
- relocation to avoid leaving the NODELETE object mapped without
|
|
||||||
- relocation. */
|
|
||||||
- _dl_open_check (new);
|
|
||||||
-
|
|
||||||
/* This only performs the memory allocations. The actual update of
|
|
||||||
the scopes happens below, after failure is impossible. */
|
|
||||||
resize_scopes (new);
|
|
@ -1,27 +0,0 @@
|
|||||||
commit 61a7c9df71ee4e6f94b56c20f0d37c6e17d5f284
|
|
||||||
Author: Florian Weimer <fweimer@redhat.com>
|
|
||||||
Date: Mon Dec 2 14:53:16 2019 +0100
|
|
||||||
|
|
||||||
elf/tst-dlopenfail: Disable --no-as-needed for tst-dlopenfailmod1.so
|
|
||||||
|
|
||||||
Otherwise, the shared object dependency which triggers the load
|
|
||||||
failure is dropped, invalidating the test.
|
|
||||||
|
|
||||||
diff --git a/elf/Makefile b/elf/Makefile
|
|
||||||
index bf7c41f38be42184..467e810e784bb96d 100644
|
|
||||||
--- a/elf/Makefile
|
|
||||||
+++ b/elf/Makefile
|
|
||||||
@@ -1543,8 +1543,11 @@ LDFLAGS-tst-finilazyfailmod.so = \
|
|
||||||
$(objpfx)tst-dlopenfail: $(libdl)
|
|
||||||
$(objpfx)tst-dlopenfail.out: \
|
|
||||||
$(objpfx)tst-dlopenfailmod1.so $(objpfx)tst-dlopenfailmod2.so
|
|
||||||
-# Order matters here. tst-dlopenfaillinkmod.so's soname ensures
|
|
||||||
-# a run-time loader failure.
|
|
||||||
+# Order matters here. tst-dlopenfaillinkmod.so's soname ensures a
|
|
||||||
+# run-time loader failure. --as-needed breaks this test because
|
|
||||||
+# nothing actually references tst-dlopenfailmod2.so (with its soname
|
|
||||||
+# tst-dlopenfail-missingmod.so).
|
|
||||||
+LDFLAGS-tst-dlopenfailmod1.so = -Wl,--no-as-needed
|
|
||||||
$(objpfx)tst-dlopenfailmod1.so: \
|
|
||||||
$(shared-thread-library) $(objpfx)tst-dlopenfaillinkmod.so
|
|
||||||
LDFLAGS-tst-dlopenfaillinkmod.so = -Wl,-soname,tst-dlopenfail-missingmod.so
|
|
File diff suppressed because it is too large
Load Diff
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user