Compare commits
No commits in common. "c8" and "c9s" have entirely different histories.
.gitignore (vendored): 7 changed lines
@@ -1 +1,6 @@
-SOURCES/glibc-2.28.tar.xz
+# Release tarballs.
+/glibc-*.tar.[gx]z
+# Generated (source) RPMs.
+/*.rpm
+# Expanded source trees.
+/glibc-*/
@@ -1 +0,0 @@
-ccb5dc9e51a9884df8488f86982439d47b283b2a SOURCES/glibc-2.28.tar.xz
File diff suppressed because it is too large
README.scripts (new file): 2 added lines
@@ -0,0 +1,2 @@
+All of the useful glibc maintainer scripts are located at:
+https://pagure.io/glibc-maintainer-scripts
@@ -1,496 +0,0 @@
# This file names the currently supported and somewhat tested locales.
# If you have any additions please file a glibc bug report.
SUPPORTED-LOCALES=\
C.UTF-8/UTF-8 \
aa_DJ.UTF-8/UTF-8 \
aa_DJ/ISO-8859-1 \
aa_ER/UTF-8 \
aa_ER@saaho/UTF-8 \
aa_ET/UTF-8 \
af_ZA.UTF-8/UTF-8 \
af_ZA/ISO-8859-1 \
agr_PE/UTF-8 \
ak_GH/UTF-8 \
am_ET/UTF-8 \
an_ES.UTF-8/UTF-8 \
an_ES/ISO-8859-15 \
anp_IN/UTF-8 \
ar_AE.UTF-8/UTF-8 \
ar_AE/ISO-8859-6 \
ar_BH.UTF-8/UTF-8 \
ar_BH/ISO-8859-6 \
ar_DZ.UTF-8/UTF-8 \
ar_DZ/ISO-8859-6 \
ar_EG.UTF-8/UTF-8 \
ar_EG/ISO-8859-6 \
ar_IN/UTF-8 \
ar_IQ.UTF-8/UTF-8 \
ar_IQ/ISO-8859-6 \
ar_JO.UTF-8/UTF-8 \
ar_JO/ISO-8859-6 \
ar_KW.UTF-8/UTF-8 \
ar_KW/ISO-8859-6 \
ar_LB.UTF-8/UTF-8 \
ar_LB/ISO-8859-6 \
ar_LY.UTF-8/UTF-8 \
ar_LY/ISO-8859-6 \
ar_MA.UTF-8/UTF-8 \
ar_MA/ISO-8859-6 \
ar_OM.UTF-8/UTF-8 \
ar_OM/ISO-8859-6 \
ar_QA.UTF-8/UTF-8 \
ar_QA/ISO-8859-6 \
ar_SA.UTF-8/UTF-8 \
ar_SA/ISO-8859-6 \
ar_SD.UTF-8/UTF-8 \
ar_SD/ISO-8859-6 \
ar_SS/UTF-8 \
ar_SY.UTF-8/UTF-8 \
ar_SY/ISO-8859-6 \
ar_TN.UTF-8/UTF-8 \
ar_TN/ISO-8859-6 \
ar_YE.UTF-8/UTF-8 \
ar_YE/ISO-8859-6 \
ayc_PE/UTF-8 \
az_AZ/UTF-8 \
az_IR/UTF-8 \
as_IN/UTF-8 \
ast_ES.UTF-8/UTF-8 \
ast_ES/ISO-8859-15 \
be_BY.UTF-8/UTF-8 \
be_BY/CP1251 \
be_BY@latin/UTF-8 \
bem_ZM/UTF-8 \
ber_DZ/UTF-8 \
ber_MA/UTF-8 \
bg_BG.UTF-8/UTF-8 \
bg_BG/CP1251 \
bhb_IN.UTF-8/UTF-8 \
bho_IN/UTF-8 \
bho_NP/UTF-8 \
bi_VU/UTF-8 \
bn_BD/UTF-8 \
bn_IN/UTF-8 \
bo_CN/UTF-8 \
bo_IN/UTF-8 \
br_FR.UTF-8/UTF-8 \
br_FR/ISO-8859-1 \
br_FR@euro/ISO-8859-15 \
brx_IN/UTF-8 \
bs_BA.UTF-8/UTF-8 \
bs_BA/ISO-8859-2 \
byn_ER/UTF-8 \
ca_AD.UTF-8/UTF-8 \
ca_AD/ISO-8859-15 \
ca_ES.UTF-8/UTF-8 \
ca_ES/ISO-8859-1 \
ca_ES@euro/ISO-8859-15 \
ca_ES@valencia/UTF-8 \
ca_FR.UTF-8/UTF-8 \
ca_FR/ISO-8859-15 \
ca_IT.UTF-8/UTF-8 \
ca_IT/ISO-8859-15 \
ce_RU/UTF-8 \
chr_US/UTF-8 \
cmn_TW/UTF-8 \
crh_UA/UTF-8 \
cs_CZ.UTF-8/UTF-8 \
cs_CZ/ISO-8859-2 \
csb_PL/UTF-8 \
cv_RU/UTF-8 \
cy_GB.UTF-8/UTF-8 \
cy_GB/ISO-8859-14 \
da_DK.UTF-8/UTF-8 \
da_DK/ISO-8859-1 \
da_DK.ISO-8859-15/ISO-8859-15 \
de_AT.UTF-8/UTF-8 \
de_AT/ISO-8859-1 \
de_AT@euro/ISO-8859-15 \
de_BE.UTF-8/UTF-8 \
de_BE/ISO-8859-1 \
de_BE@euro/ISO-8859-15 \
de_CH.UTF-8/UTF-8 \
de_CH/ISO-8859-1 \
de_DE.UTF-8/UTF-8 \
de_DE/ISO-8859-1 \
de_DE@euro/ISO-8859-15 \
de_IT.UTF-8/UTF-8 \
de_IT/ISO-8859-1 \
de_LI.UTF-8/UTF-8 \
de_LU.UTF-8/UTF-8 \
de_LU/ISO-8859-1 \
de_LU@euro/ISO-8859-15 \
doi_IN/UTF-8 \
dsb_DE/UTF-8 \
dv_MV/UTF-8 \
dz_BT/UTF-8 \
el_GR.UTF-8/UTF-8 \
el_GR/ISO-8859-7 \
el_GR@euro/ISO-8859-7 \
el_CY.UTF-8/UTF-8 \
el_CY/ISO-8859-7 \
en_AG/UTF-8 \
en_AU.UTF-8/UTF-8 \
en_AU/ISO-8859-1 \
en_BW.UTF-8/UTF-8 \
en_BW/ISO-8859-1 \
en_CA.UTF-8/UTF-8 \
en_CA/ISO-8859-1 \
en_DK.UTF-8/UTF-8 \
en_DK/ISO-8859-1 \
en_GB.UTF-8/UTF-8 \
en_GB/ISO-8859-1 \
en_GB.ISO-8859-15/ISO-8859-15 \
en_HK.UTF-8/UTF-8 \
en_HK/ISO-8859-1 \
en_IE.UTF-8/UTF-8 \
en_IE/ISO-8859-1 \
en_IE@euro/ISO-8859-15 \
en_IL/UTF-8 \
en_IN/UTF-8 \
en_NG/UTF-8 \
en_NZ.UTF-8/UTF-8 \
en_NZ/ISO-8859-1 \
en_PH.UTF-8/UTF-8 \
en_PH/ISO-8859-1 \
en_SC.UTF-8/UTF-8 \
en_SG.UTF-8/UTF-8 \
en_SG/ISO-8859-1 \
en_US.UTF-8/UTF-8 \
en_US/ISO-8859-1 \
en_US.ISO-8859-15/ISO-8859-15 \
en_US@ampm/UTF-8 \
en_US.UTF-8@ampm/UTF-8 \
en_ZA.UTF-8/UTF-8 \
en_ZA/ISO-8859-1 \
en_ZM/UTF-8 \
en_ZW.UTF-8/UTF-8 \
en_ZW/ISO-8859-1 \
eo/UTF-8 \
es_AR.UTF-8/UTF-8 \
es_AR/ISO-8859-1 \
es_BO.UTF-8/UTF-8 \
es_BO/ISO-8859-1 \
es_CL.UTF-8/UTF-8 \
es_CL/ISO-8859-1 \
es_CO.UTF-8/UTF-8 \
es_CO/ISO-8859-1 \
es_CR.UTF-8/UTF-8 \
es_CR/ISO-8859-1 \
es_CU/UTF-8 \
es_DO.UTF-8/UTF-8 \
es_DO/ISO-8859-1 \
es_EC.UTF-8/UTF-8 \
es_EC/ISO-8859-1 \
es_ES.UTF-8/UTF-8 \
es_ES/ISO-8859-1 \
es_ES@euro/ISO-8859-15 \
es_GT.UTF-8/UTF-8 \
es_GT/ISO-8859-1 \
es_HN.UTF-8/UTF-8 \
es_HN/ISO-8859-1 \
es_MX.UTF-8/UTF-8 \
es_MX/ISO-8859-1 \
es_NI.UTF-8/UTF-8 \
es_NI/ISO-8859-1 \
es_PA.UTF-8/UTF-8 \
es_PA/ISO-8859-1 \
es_PE.UTF-8/UTF-8 \
es_PE/ISO-8859-1 \
es_PR.UTF-8/UTF-8 \
es_PR/ISO-8859-1 \
es_PY.UTF-8/UTF-8 \
es_PY/ISO-8859-1 \
es_SV.UTF-8/UTF-8 \
es_SV/ISO-8859-1 \
es_US.UTF-8/UTF-8 \
es_US/ISO-8859-1 \
es_UY.UTF-8/UTF-8 \
es_UY/ISO-8859-1 \
es_VE.UTF-8/UTF-8 \
es_VE/ISO-8859-1 \
et_EE.UTF-8/UTF-8 \
et_EE/ISO-8859-1 \
et_EE.ISO-8859-15/ISO-8859-15 \
eu_ES.UTF-8/UTF-8 \
eu_ES/ISO-8859-1 \
eu_ES@euro/ISO-8859-15 \
fa_IR/UTF-8 \
ff_SN/UTF-8 \
fi_FI.UTF-8/UTF-8 \
fi_FI/ISO-8859-1 \
fi_FI@euro/ISO-8859-15 \
fil_PH/UTF-8 \
fo_FO.UTF-8/UTF-8 \
fo_FO/ISO-8859-1 \
fr_BE.UTF-8/UTF-8 \
fr_BE/ISO-8859-1 \
fr_BE@euro/ISO-8859-15 \
fr_CA.UTF-8/UTF-8 \
fr_CA/ISO-8859-1 \
fr_CH.UTF-8/UTF-8 \
fr_CH/ISO-8859-1 \
fr_FR.UTF-8/UTF-8 \
fr_FR/ISO-8859-1 \
fr_FR@euro/ISO-8859-15 \
fr_LU.UTF-8/UTF-8 \
fr_LU/ISO-8859-1 \
fr_LU@euro/ISO-8859-15 \
fur_IT/UTF-8 \
fy_NL/UTF-8 \
fy_DE/UTF-8 \
ga_IE.UTF-8/UTF-8 \
ga_IE/ISO-8859-1 \
ga_IE@euro/ISO-8859-15 \
gd_GB.UTF-8/UTF-8 \
gd_GB/ISO-8859-15 \
gez_ER/UTF-8 \
gez_ER@abegede/UTF-8 \
gez_ET/UTF-8 \
gez_ET@abegede/UTF-8 \
gl_ES.UTF-8/UTF-8 \
gl_ES/ISO-8859-1 \
gl_ES@euro/ISO-8859-15 \
gu_IN/UTF-8 \
gv_GB.UTF-8/UTF-8 \
gv_GB/ISO-8859-1 \
ha_NG/UTF-8 \
hak_TW/UTF-8 \
he_IL.UTF-8/UTF-8 \
he_IL/ISO-8859-8 \
hi_IN/UTF-8 \
hif_FJ/UTF-8 \
hne_IN/UTF-8 \
hr_HR.UTF-8/UTF-8 \
hr_HR/ISO-8859-2 \
hsb_DE/ISO-8859-2 \
hsb_DE.UTF-8/UTF-8 \
ht_HT/UTF-8 \
hu_HU.UTF-8/UTF-8 \
hu_HU/ISO-8859-2 \
hy_AM/UTF-8 \
hy_AM.ARMSCII-8/ARMSCII-8 \
ia_FR/UTF-8 \
id_ID.UTF-8/UTF-8 \
id_ID/ISO-8859-1 \
ig_NG/UTF-8 \
ik_CA/UTF-8 \
is_IS.UTF-8/UTF-8 \
is_IS/ISO-8859-1 \
it_CH.UTF-8/UTF-8 \
it_CH/ISO-8859-1 \
it_IT.UTF-8/UTF-8 \
it_IT/ISO-8859-1 \
it_IT@euro/ISO-8859-15 \
iu_CA/UTF-8 \
ja_JP.EUC-JP/EUC-JP \
ja_JP.UTF-8/UTF-8 \
ka_GE.UTF-8/UTF-8 \
ka_GE/GEORGIAN-PS \
kab_DZ/UTF-8 \
kk_KZ.UTF-8/UTF-8 \
kk_KZ/PT154 \
kl_GL.UTF-8/UTF-8 \
kl_GL/ISO-8859-1 \
km_KH/UTF-8 \
kn_IN/UTF-8 \
ko_KR.EUC-KR/EUC-KR \
ko_KR.UTF-8/UTF-8 \
kok_IN/UTF-8 \
ks_IN/UTF-8 \
ks_IN@devanagari/UTF-8 \
ku_TR.UTF-8/UTF-8 \
ku_TR/ISO-8859-9 \
kw_GB.UTF-8/UTF-8 \
kw_GB/ISO-8859-1 \
ky_KG/UTF-8 \
lb_LU/UTF-8 \
lg_UG.UTF-8/UTF-8 \
lg_UG/ISO-8859-10 \
li_BE/UTF-8 \
li_NL/UTF-8 \
lij_IT/UTF-8 \
ln_CD/UTF-8 \
lo_LA/UTF-8 \
lt_LT.UTF-8/UTF-8 \
lt_LT/ISO-8859-13 \
lv_LV.UTF-8/UTF-8 \
lv_LV/ISO-8859-13 \
lzh_TW/UTF-8 \
mag_IN/UTF-8 \
mai_IN/UTF-8 \
mai_NP/UTF-8 \
mfe_MU/UTF-8 \
mg_MG.UTF-8/UTF-8 \
mg_MG/ISO-8859-15 \
mhr_RU/UTF-8 \
mi_NZ.UTF-8/UTF-8 \
mi_NZ/ISO-8859-13 \
miq_NI/UTF-8 \
mjw_IN/UTF-8 \
mk_MK.UTF-8/UTF-8 \
mk_MK/ISO-8859-5 \
ml_IN/UTF-8 \
mn_MN/UTF-8 \
mni_IN/UTF-8 \
mr_IN/UTF-8 \
ms_MY.UTF-8/UTF-8 \
ms_MY/ISO-8859-1 \
mt_MT.UTF-8/UTF-8 \
mt_MT/ISO-8859-3 \
my_MM/UTF-8 \
nan_TW/UTF-8 \
nan_TW@latin/UTF-8 \
nb_NO.UTF-8/UTF-8 \
nb_NO/ISO-8859-1 \
nds_DE/UTF-8 \
nds_NL/UTF-8 \
ne_NP/UTF-8 \
nhn_MX/UTF-8 \
niu_NU/UTF-8 \
niu_NZ/UTF-8 \
nl_AW/UTF-8 \
nl_BE.UTF-8/UTF-8 \
nl_BE/ISO-8859-1 \
nl_BE@euro/ISO-8859-15 \
nl_NL.UTF-8/UTF-8 \
nl_NL/ISO-8859-1 \
nl_NL@euro/ISO-8859-15 \
nn_NO.UTF-8/UTF-8 \
nn_NO/ISO-8859-1 \
nr_ZA/UTF-8 \
nso_ZA/UTF-8 \
oc_FR.UTF-8/UTF-8 \
oc_FR/ISO-8859-1 \
om_ET/UTF-8 \
om_KE.UTF-8/UTF-8 \
om_KE/ISO-8859-1 \
or_IN/UTF-8 \
os_RU/UTF-8 \
pa_IN/UTF-8 \
pa_PK/UTF-8 \
pap_AW/UTF-8 \
pap_CW/UTF-8 \
pl_PL.UTF-8/UTF-8 \
pl_PL/ISO-8859-2 \
ps_AF/UTF-8 \
pt_BR.UTF-8/UTF-8 \
pt_BR/ISO-8859-1 \
pt_PT.UTF-8/UTF-8 \
pt_PT/ISO-8859-1 \
pt_PT@euro/ISO-8859-15 \
quz_PE/UTF-8 \
raj_IN/UTF-8 \
ro_RO.UTF-8/UTF-8 \
ro_RO/ISO-8859-2 \
ru_RU.KOI8-R/KOI8-R \
ru_RU.UTF-8/UTF-8 \
ru_RU/ISO-8859-5 \
ru_UA.UTF-8/UTF-8 \
ru_UA/KOI8-U \
rw_RW/UTF-8 \
sa_IN/UTF-8 \
sah_RU/UTF-8 \
sat_IN/UTF-8 \
sc_IT/UTF-8 \
sd_IN/UTF-8 \
sd_IN@devanagari/UTF-8 \
se_NO/UTF-8 \
sgs_LT/UTF-8 \
shn_MM/UTF-8 \
shs_CA/UTF-8 \
si_LK/UTF-8 \
sid_ET/UTF-8 \
sk_SK.UTF-8/UTF-8 \
sk_SK/ISO-8859-2 \
sl_SI.UTF-8/UTF-8 \
sl_SI/ISO-8859-2 \
sm_WS/UTF-8 \
so_DJ.UTF-8/UTF-8 \
so_DJ/ISO-8859-1 \
so_ET/UTF-8 \
so_KE.UTF-8/UTF-8 \
so_KE/ISO-8859-1 \
so_SO.UTF-8/UTF-8 \
so_SO/ISO-8859-1 \
sq_AL.UTF-8/UTF-8 \
sq_AL/ISO-8859-1 \
sq_MK/UTF-8 \
sr_ME/UTF-8 \
sr_RS/UTF-8 \
sr_RS@latin/UTF-8 \
ss_ZA/UTF-8 \
st_ZA.UTF-8/UTF-8 \
st_ZA/ISO-8859-1 \
sv_FI.UTF-8/UTF-8 \
sv_FI/ISO-8859-1 \
sv_FI@euro/ISO-8859-15 \
sv_SE.UTF-8/UTF-8 \
sv_SE/ISO-8859-1 \
sv_SE.ISO-8859-15/ISO-8859-15 \
sw_KE/UTF-8 \
sw_TZ/UTF-8 \
szl_PL/UTF-8 \
ta_IN/UTF-8 \
ta_LK/UTF-8 \
tcy_IN.UTF-8/UTF-8 \
te_IN/UTF-8 \
tg_TJ.UTF-8/UTF-8 \
tg_TJ/KOI8-T \
th_TH.UTF-8/UTF-8 \
th_TH/TIS-620 \
the_NP/UTF-8 \
ti_ER/UTF-8 \
ti_ET/UTF-8 \
tig_ER/UTF-8 \
tk_TM/UTF-8 \
tl_PH.UTF-8/UTF-8 \
tl_PH/ISO-8859-1 \
tn_ZA/UTF-8 \
to_TO/UTF-8 \
tpi_PG/UTF-8 \
tr_CY.UTF-8/UTF-8 \
tr_CY/ISO-8859-9 \
tr_TR.UTF-8/UTF-8 \
tr_TR/ISO-8859-9 \
ts_ZA/UTF-8 \
tt_RU/UTF-8 \
tt_RU@iqtelif/UTF-8 \
ug_CN/UTF-8 \
uk_UA.UTF-8/UTF-8 \
uk_UA/KOI8-U \
unm_US/UTF-8 \
ur_IN/UTF-8 \
ur_PK/UTF-8 \
uz_UZ.UTF-8/UTF-8 \
uz_UZ/ISO-8859-1 \
uz_UZ@cyrillic/UTF-8 \
ve_ZA/UTF-8 \
vi_VN/UTF-8 \
wa_BE/ISO-8859-1 \
wa_BE@euro/ISO-8859-15 \
wa_BE.UTF-8/UTF-8 \
wae_CH/UTF-8 \
wal_ET/UTF-8 \
wo_SN/UTF-8 \
xh_ZA.UTF-8/UTF-8 \
xh_ZA/ISO-8859-1 \
yi_US.UTF-8/UTF-8 \
yi_US/CP1255 \
yo_NG/UTF-8 \
yue_HK/UTF-8 \
yuw_PG/UTF-8 \
zh_CN.GB18030/GB18030 \
zh_CN.GBK/GBK \
zh_CN.UTF-8/UTF-8 \
zh_CN/GB2312 \
zh_HK.UTF-8/UTF-8 \
zh_HK/BIG5-HKSCS \
zh_SG.UTF-8/UTF-8 \
zh_SG.GBK/GBK \
zh_SG/GB2312 \
zh_TW.EUC-TW/EUC-TW \
zh_TW.UTF-8/UTF-8 \
zh_TW/BIG5 \
zu_ZA.UTF-8/UTF-8 \
zu_ZA/ISO-8859-1 \
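Each entry in the deleted list above has the form name/codeset, e.g. de_DE.UTF-8/UTF-8: the part before the last slash is the locale name, the part after it is the character map handed to localedef (localedef -i de_DE -f UTF-8 de_DE.UTF-8 is the corresponding standard invocation). A minimal C sketch of that split; split_entry is a hypothetical helper for illustration only, not part of the dist-git sources:

```c
#include <stdio.h>
#include <string.h>

/* Split one SUPPORTED entry ("de_DE.UTF-8/UTF-8") into the locale
   name and the codeset.  Hypothetical helper; the real consumers are
   the glibc Makefiles and localedef.  */
static int
split_entry (const char *entry, char *name, size_t name_len,
             char *codeset, size_t codeset_len)
{
  const char *slash = strrchr (entry, '/');
  if (slash == NULL || slash == entry || slash[1] == '\0')
    return -1;                      /* Malformed entry.  */
  size_t n = (size_t) (slash - entry);
  if (n >= name_len || strlen (slash + 1) >= codeset_len)
    return -1;                      /* Would not fit.  */
  memcpy (name, entry, n);
  name[n] = '\0';
  strcpy (codeset, slash + 1);
  return 0;
}

int
main (void)
{
  char name[64], codeset[32];
  if (split_entry ("de_DE.UTF-8/UTF-8", name, sizeof name,
                   codeset, sizeof codeset) == 0)
    printf ("locale %s, codeset %s\n", name, codeset);
  return 0;
}
```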
@@ -1,862 +0,0 @@
#define _GNU_SOURCE
#include <assert.h>
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <locale.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <getopt.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
#include "../locale/hashval.h"
#define __LC_LAST 13
#include "../locale/locarchive.h"
#include "../crypt/md5.h"

const char *alias_file = DATADIR "/locale/locale.alias";
const char *locar_file = PREFIX "/lib/locale/locale-archive";
const char *tmpl_file = PREFIX "/lib/locale/locale-archive.tmpl";
const char *loc_path = PREFIX "/lib/locale/";
/* Flags set by `--verbose` option.  */
int be_quiet = 1;
int verbose = 0;
int max_locarchive_open_retry = 10;
const char *output_prefix;

/* Endianness should have been taken care of by localedef.  We don't need to do
   additional swapping.  We need this variable exported however, since
   locarchive.c uses it to determine if it needs to swap endianness of a value
   before writing to or reading from the archive.  */
bool swap_endianness_p = false;

static const char *locnames[] =
  {
#define DEFINE_CATEGORY(category, category_name, items, a) \
  [category] = category_name,
#include "../locale/categories.def"
#undef DEFINE_CATEGORY
  };

static int
is_prime (unsigned long candidate)
{
  /* No even number and none less than 10 will be passed here.  */
  unsigned long int divn = 3;
  unsigned long int sq = divn * divn;

  while (sq < candidate && candidate % divn != 0)
    {
      ++divn;
      sq += 4 * divn;
      ++divn;
    }

  return candidate % divn != 0;
}

unsigned long
next_prime (unsigned long seed)
{
  /* Make it definitely odd.  */
  seed |= 1;

  while (!is_prime (seed))
    seed += 2;

  return seed;
}

void
error (int status, int errnum, const char *message, ...)
{
  va_list args;

  va_start (args, message);
  fflush (stdout);
  fprintf (stderr, "%s: ", program_invocation_name);
  vfprintf (stderr, message, args);
  va_end (args);
  if (errnum)
    fprintf (stderr, ": %s", strerror (errnum));
  putc ('\n', stderr);
  fflush (stderr);
  if (status)
    exit (errnum == EROFS ? 0 : status);
}

void *
xmalloc (size_t size)
{
  void *p = malloc (size);
  if (p == NULL)
    error (EXIT_FAILURE, errno, "could not allocate %zd bytes of memory", size);
  return p;
}

static void
open_tmpl_archive (struct locarhandle *ah)
{
  struct stat64 st;
  int fd;
  struct locarhead head;
  const char *archivefname = ah->fname == NULL ? tmpl_file : ah->fname;

  /* Open the archive.  We must have exclusive write access.  */
  fd = open64 (archivefname, O_RDONLY);
  if (fd == -1)
    error (EXIT_FAILURE, errno, "cannot open locale archive template file \"%s\"",
           archivefname);

  if (fstat64 (fd, &st) < 0)
    error (EXIT_FAILURE, errno, "cannot stat locale archive template file \"%s\"",
           archivefname);

  /* Read the header.  */
  if (TEMP_FAILURE_RETRY (read (fd, &head, sizeof (head))) != sizeof (head))
    error (EXIT_FAILURE, errno, "cannot read archive header");

  ah->fd = fd;
  ah->mmaped = (head.sumhash_offset
                + head.sumhash_size * sizeof (struct sumhashent));
  if (ah->mmaped > (unsigned long) st.st_size)
    error (EXIT_FAILURE, 0, "locale archive template file truncated");
  ah->mmaped = st.st_size;
  ah->reserved = st.st_size;

  /* Now we know how large the administrative information part is.
     Map all of it.  */
  ah->addr = mmap64 (NULL, ah->mmaped, PROT_READ, MAP_SHARED, fd, 0);
  if (ah->addr == MAP_FAILED)
    error (EXIT_FAILURE, errno, "cannot map archive header");
}

/* Open the locale archive.  */
extern void open_archive (struct locarhandle *ah, bool readonly);

/* Close the locale archive.  */
extern void close_archive (struct locarhandle *ah);

/* Add given locale data to the archive.  */
extern int add_locale_to_archive (struct locarhandle *ah, const char *name,
                                  locale_data_t data, bool replace);

extern void add_alias (struct locarhandle *ah, const char *alias,
                       bool replace, const char *oldname,
                       uint32_t *locrec_offset_p);

extern struct namehashent *
insert_name (struct locarhandle *ah,
             const char *name, size_t name_len, bool replace);

struct nameent
{
  char *name;
  struct locrecent *locrec;
};

struct dataent
{
  const unsigned char *sum;
  uint32_t file_offset;
};

static int
nameentcmp (const void *a, const void *b)
{
  struct locrecent *la = ((const struct nameent *) a)->locrec;
  struct locrecent *lb = ((const struct nameent *) b)->locrec;
  uint32_t start_a = -1, end_a = 0;
  uint32_t start_b = -1, end_b = 0;
  int cnt;

  for (cnt = 0; cnt < __LC_LAST; ++cnt)
    if (cnt != LC_ALL)
      {
        if (la->record[cnt].offset < start_a)
          start_a = la->record[cnt].offset;
        if (la->record[cnt].offset + la->record[cnt].len > end_a)
          end_a = la->record[cnt].offset + la->record[cnt].len;
      }
  assert (start_a != (uint32_t)-1);
  assert (end_a != 0);

  for (cnt = 0; cnt < __LC_LAST; ++cnt)
    if (cnt != LC_ALL)
      {
        if (lb->record[cnt].offset < start_b)
          start_b = lb->record[cnt].offset;
        if (lb->record[cnt].offset + lb->record[cnt].len > end_b)
          end_b = lb->record[cnt].offset + lb->record[cnt].len;
      }
  assert (start_b != (uint32_t)-1);
  assert (end_b != 0);

  if (start_a != start_b)
    return (int)start_a - (int)start_b;
  return (int)end_a - (int)end_b;
}

static int
dataentcmp (const void *a, const void *b)
{
  if (((const struct dataent *) a)->file_offset
      < ((const struct dataent *) b)->file_offset)
    return -1;

  if (((const struct dataent *) a)->file_offset
      > ((const struct dataent *) b)->file_offset)
    return 1;

  return 0;
}

static int
sumsearchfn (const void *key, const void *ent)
{
  uint32_t keyn = *(uint32_t *)key;
  uint32_t entn = ((struct dataent *)ent)->file_offset;

  if (keyn < entn)
    return -1;
  if (keyn > entn)
    return 1;
  return 0;
}

static void
compute_data (struct locarhandle *ah, struct nameent *name, size_t sumused,
              struct dataent *files, locale_data_t data)
{
  int cnt;
  struct locrecent *locrec = name->locrec;
  struct dataent *file;
  data[LC_ALL].addr = ((char *) ah->addr) + locrec->record[LC_ALL].offset;
  data[LC_ALL].size = locrec->record[LC_ALL].len;
  for (cnt = 0; cnt < __LC_LAST; ++cnt)
    if (cnt != LC_ALL)
      {
        data[cnt].addr = ((char *) ah->addr) + locrec->record[cnt].offset;
        data[cnt].size = locrec->record[cnt].len;
        if (data[cnt].addr >= data[LC_ALL].addr
            && data[cnt].addr + data[cnt].size
               <= data[LC_ALL].addr + data[LC_ALL].size)
          __md5_buffer (data[cnt].addr, data[cnt].size, data[cnt].sum);
        else
          {
            file = bsearch (&locrec->record[cnt].offset, files, sumused,
                            sizeof (*files), sumsearchfn);
            if (file == NULL)
              error (EXIT_FAILURE, 0, "inconsistent template file");
            memcpy (data[cnt].sum, file->sum, sizeof (data[cnt].sum));
          }
      }
}

static int
fill_archive (struct locarhandle *tmpl_ah,
              const char *fname,
              size_t install_langs_count, char *install_langs_list[],
              size_t nlist, char *list[],
              const char *primary)
{
  struct locarhandle ah;
  struct locarhead *head;
  int result = 0;
  struct nameent *names;
  struct namehashent *namehashtab;
  size_t cnt, used;
  struct dataent *files;
  struct sumhashent *sumhashtab;
  size_t sumused;
  struct locrecent *primary_locrec = NULL;
  struct nameent *primary_nameent = NULL;

  head = tmpl_ah->addr;
  names = (struct nameent *) malloc (head->namehash_used
                                     * sizeof (struct nameent));
  files = (struct dataent *) malloc (head->sumhash_used
                                     * sizeof (struct dataent));
  if (names == NULL || files == NULL)
    error (EXIT_FAILURE, errno, "could not allocate tables");

  namehashtab = (struct namehashent *) ((char *) tmpl_ah->addr
                                        + head->namehash_offset);
  sumhashtab = (struct sumhashent *) ((char *) tmpl_ah->addr
                                      + head->sumhash_offset);

  for (cnt = used = 0; cnt < head->namehash_size; ++cnt)
    if (namehashtab[cnt].locrec_offset != 0)
      {
        char * name;
        int i;
        assert (used < head->namehash_used);
        name = tmpl_ah->addr + namehashtab[cnt].name_offset;
        if (install_langs_count == 0)
          {
            /* Always install the entry.  */
            names[used].name = name;
            names[used++].locrec
              = (struct locrecent *) ((char *) tmpl_ah->addr +
                                      namehashtab[cnt].locrec_offset);
          }
        else
          {
            /* Only install the entry if the user asked for it via
               --install-langs.  */
            for (i = 0; i < install_langs_count; i++)
              {
                /* Add one for "_" and one for the null terminator.  */
                size_t len = strlen (install_langs_list[i]) + 2;
                char *install_lang = (char *)xmalloc (len);
                strcpy (install_lang, install_langs_list[i]);
                if (strchr (install_lang, '_') == NULL)
                  strcat (install_lang, "_");
                if (strncmp (name, install_lang, strlen (install_lang)) == 0)
                  {
                    names[used].name = name;
                    names[used++].locrec
                      = (struct locrecent *) ((char *)tmpl_ah->addr
                                              + namehashtab[cnt].locrec_offset);
                  }
                free (install_lang);
              }
          }
      }

  /* Sort the names.  */
  qsort (names, used, sizeof (struct nameent), nameentcmp);

  for (cnt = sumused = 0; cnt < head->sumhash_size; ++cnt)
    if (sumhashtab[cnt].file_offset != 0)
      {
        assert (sumused < head->sumhash_used);
        files[sumused].sum = (const unsigned char *) sumhashtab[cnt].sum;
        files[sumused++].file_offset = sumhashtab[cnt].file_offset;
      }

  /* Sort by file locations.  */
  qsort (files, sumused, sizeof (struct dataent), dataentcmp);

  /* Open the archive.  This call never returns if we cannot
     successfully open the archive.  */
  ah.fname = NULL;
  if (fname != NULL)
    ah.fname = fname;
  open_archive (&ah, false);

  if (primary != NULL)
    {
      for (cnt = 0; cnt < used; ++cnt)
        if (strcmp (names[cnt].name, primary) == 0)
          break;
      if (cnt < used)
        {
          locale_data_t data;

          compute_data (tmpl_ah, &names[cnt], sumused, files, data);
          result |= add_locale_to_archive (&ah, primary, data, 0);
          primary_locrec = names[cnt].locrec;
          primary_nameent = &names[cnt];
        }
    }

  for (cnt = 0; cnt < used; ++cnt)
    if (&names[cnt] == primary_nameent)
      continue;
    else if ((cnt > 0 && names[cnt - 1].locrec == names[cnt].locrec)
             || names[cnt].locrec == primary_locrec)
      {
        const char *oldname;
        struct namehashent *namehashent;
        uint32_t locrec_offset;

        if (names[cnt].locrec == primary_locrec)
          oldname = primary;
        else
          oldname = names[cnt - 1].name;
        namehashent = insert_name (&ah, oldname, strlen (oldname), true);
        assert (namehashent->name_offset != 0);
        assert (namehashent->locrec_offset != 0);
        locrec_offset = namehashent->locrec_offset;
        add_alias (&ah, names[cnt].name, 0, oldname, &locrec_offset);
      }
    else
      {
        locale_data_t data;

        compute_data (tmpl_ah, &names[cnt], sumused, files, data);
        result |= add_locale_to_archive (&ah, names[cnt].name, data, 0);
      }

  while (nlist-- > 0)
    {
      const char *fname = *list++;
      size_t fnamelen = strlen (fname);
      struct stat64 st;
      DIR *dirp;
      struct dirent64 *d;
      int seen;
      locale_data_t data;
      int cnt;

      /* First see whether this really is a directory and whether it
         contains all the required locale category files.  */
      if (stat64 (fname, &st) < 0)
        {
          error (0, 0, "stat of \"%s\" failed: %s: ignored", fname,
                 strerror (errno));
          continue;
        }
      if (!S_ISDIR (st.st_mode))
        {
          error (0, 0, "\"%s\" is no directory; ignored", fname);
          continue;
        }

      dirp = opendir (fname);
      if (dirp == NULL)
        {
          error (0, 0, "cannot open directory \"%s\": %s: ignored",
                 fname, strerror (errno));
          continue;
        }

      seen = 0;
      while ((d = readdir64 (dirp)) != NULL)
        {
          for (cnt = 0; cnt < __LC_LAST; ++cnt)
            if (cnt != LC_ALL)
              if (strcmp (d->d_name, locnames[cnt]) == 0)
                {
                  unsigned char d_type;

                  /* We have an object of the required name.  If it's
                     a directory we have to look at a file with the
                     prefix "SYS_".  Otherwise we have found what we
                     are looking for.  */
#ifdef _DIRENT_HAVE_D_TYPE
                  d_type = d->d_type;

                  if (d_type != DT_REG)
#endif
                    {
                      char fullname[fnamelen + 2 * strlen (d->d_name) + 7];

#ifdef _DIRENT_HAVE_D_TYPE
                      if (d_type == DT_UNKNOWN || d_type == DT_LNK)
#endif
                        {
                          strcpy (stpcpy (stpcpy (fullname, fname), "/"),
                                  d->d_name);

                          if (stat64 (fullname, &st) == -1)
                            /* We cannot stat the file, ignore it.  */
                            break;

                          d_type = IFTODT (st.st_mode);
                        }

                      if (d_type == DT_DIR)
                        {
                          /* We have to do more tests.  The file is a
                             directory and it therefore must contain a
                             regular file with the same name except a
                             "SYS_" prefix.  */
                          char *t = stpcpy (stpcpy (fullname, fname), "/");
                          strcpy (stpcpy (stpcpy (t, d->d_name), "/SYS_"),
                                  d->d_name);

                          if (stat64 (fullname, &st) == -1)
                            /* There is no SYS_* file or we cannot
                               access it.  */
                            break;

                          d_type = IFTODT (st.st_mode);
                        }
                    }

                  /* If we found a regular file (eventually after
                     following a symlink) we are successful.  */
                  if (d_type == DT_REG)
                    ++seen;
                  break;
                }
        }

      closedir (dirp);

      if (seen != __LC_LAST - 1)
        {
          /* We don't have all locale category files.  Ignore the name.  */
          error (0, 0, "incomplete set of locale files in \"%s\"",
                 fname);
          continue;
        }

      /* Add the files to the archive.  To do this we first compute
         sizes and the MD5 sums of all the files.  */
      for (cnt = 0; cnt < __LC_LAST; ++cnt)
        if (cnt != LC_ALL)
          {
            char fullname[fnamelen + 2 * strlen (locnames[cnt]) + 7];
            int fd;

            strcpy (stpcpy (stpcpy (fullname, fname), "/"), locnames[cnt]);
            fd = open64 (fullname, O_RDONLY);
            if (fd == -1 || fstat64 (fd, &st) == -1)
              {
                /* Cannot read the file.  */
                if (fd != -1)
                  close (fd);
                break;
              }

            if (S_ISDIR (st.st_mode))
              {
                char *t;
                close (fd);
                t = stpcpy (stpcpy (fullname, fname), "/");
                strcpy (stpcpy (stpcpy (t, locnames[cnt]), "/SYS_"),
                        locnames[cnt]);

                fd = open64 (fullname, O_RDONLY);
                if (fd == -1 || fstat64 (fd, &st) == -1
                    || !S_ISREG (st.st_mode))
                  {
                    if (fd != -1)
                      close (fd);
                    break;
                  }
              }

            /* Map the file.  */
            data[cnt].addr = mmap64 (NULL, st.st_size, PROT_READ, MAP_SHARED,
                                     fd, 0);
            if (data[cnt].addr == MAP_FAILED)
              {
                /* Cannot map it.  */
                close (fd);
                break;
              }

            data[cnt].size = st.st_size;
            __md5_buffer (data[cnt].addr, st.st_size, data[cnt].sum);

            /* We don't need the file descriptor anymore.  */
            close (fd);
          }

      if (cnt != __LC_LAST)
        {
          while (cnt-- > 0)
            if (cnt != LC_ALL)
              munmap (data[cnt].addr, data[cnt].size);

          error (0, 0, "cannot read all files in \"%s\": ignored", fname);

          continue;
        }

      result |= add_locale_to_archive (&ah, basename (fname), data, 0);

      for (cnt = 0; cnt < __LC_LAST; ++cnt)
        if (cnt != LC_ALL)
          munmap (data[cnt].addr, data[cnt].size);
    }

  /* We are done.  */
  close_archive (&ah);

  return result;
}

void usage()
{
  printf ("\
Usage: build-locale-archive [OPTION]... [TEMPLATE-FILE] [ARCHIVE-FILE]\n\
 Builds a locale archive from a template file.\n\
 Options:\n\
  -h, --help                 Print this usage message.\n\
  -v, --verbose              Verbose execution.\n\
  -l, --install-langs=LIST   Only include locales given in LIST into the\n\
                             locale archive.  LIST is a colon separated list\n\
                             of locale prefixes, for example \"de:en:ja\".\n\
                             The special argument \"all\" means to install\n\
                             all languages and it must be present by itself.\n\
                             If \"all\" is present with any other language it\n\
                             will be treated as the name of a locale.\n\
                             If the --install-langs option is missing, all\n\
                             locales are installed.  The colon separated list\n\
                             can contain any strings matching the beginning of\n\
                             locale names.\n\
                             If a string does not contain a \"_\", it is added.\n\
                             Examples:\n\
                               --install-langs=\"en\"\n\
                                 installs en_US, en_US.iso88591,\n\
                                 en_US.iso885915, en_US.utf8,\n\
                                 en_GB ...\n\
                               --install-langs=\"en_US.utf8\"\n\
                                 installs only en_US.utf8.\n\
                               --install-langs=\"ko\"\n\
                                 installs ko_KR, ko_KR.euckr,\n\
                                 ko_KR.utf8 but *not* kok_IN\n\
                                 because \"ko\" does not contain\n\
                                 \"_\" and it is silently added\n\
                               --install-langs=\"ko:kok\"\n\
                                 installs ko_KR, ko_KR.euckr,\n\
                                 ko_KR.utf8, kok_IN, and\n\
                                 kok_IN.utf8.\n\
                               --install-langs=\"POSIX\" will\n\
                                 install *no* locales at all\n\
                                 because POSIX matches none of\n\
                                 the locales.  Actually, any string\n\
                                 matching nothing will do that.\n\
                                 POSIX and C will always be\n\
                                 available because they are\n\
                                 builtin.\n\
                             Aliases are installed as well,\n\
                             i.e. --install-langs=\"de\"\n\
                             will install not only every locale starting with\n\
                             \"de\" but also the aliases \"deutsch\"\n\
                             and \"german\" although the latter does not\n\
                             start with \"de\".\n\
\n\
  If the arguments TEMPLATE-FILE and ARCHIVE-FILE are not given the locations\n\
  where the glibc in use expects these files are used by default.\n\
");
}

int main (int argc, char *argv[])
{
  char path[4096];
  DIR *dirp;
  struct dirent64 *d;
  struct stat64 st;
  char *list[16384], *primary;
  char *lang;
  int install_langs_count = 0;
  int i;
  char *install_langs_arg, *ila_start;
  char **install_langs_list = NULL;
  unsigned int cnt = 0;
  struct locarhandle tmpl_ah;
  char *new_locar_fname = NULL;
  size_t loc_path_len = strlen (loc_path);

  while (1)
    {
      int c;

      static struct option long_options[] =
        {
          {"help",          no_argument,       0, 'h'},
          {"verbose",       no_argument,       0, 'v'},
          {"install-langs", required_argument, 0, 'l'},
          {0, 0, 0, 0}
        };
      /* getopt_long stores the option index here.  */
      int option_index = 0;

      c = getopt_long (argc, argv, "vhl:",
                       long_options, &option_index);

      /* Detect the end of the options.  */
      if (c == -1)
        break;

      switch (c)
        {
        case 0:
          printf ("unknown option %s", long_options[option_index].name);
          if (optarg)
            printf (" with arg %s", optarg);
          printf ("\n");
          usage ();
          exit (1);

        case 'v':
          verbose = 1;
          be_quiet = 0;
          break;

        case 'h':
          usage ();
          exit (0);

        case 'l':
          install_langs_arg = ila_start = strdup (optarg);
          /* If the argument to --install-langs is "all", do
             not limit the list of languages to install and install
             them all.  We do not support installing a single locale
             called "all".  */
#define MAGIC_INSTALL_ALL "all"
          if (install_langs_arg != NULL
              && install_langs_arg[0] != '\0'
              && !(strncmp(install_langs_arg, MAGIC_INSTALL_ALL,
                           strlen(MAGIC_INSTALL_ALL)) == 0
                   && strlen (install_langs_arg) == 3))
            {
              /* Count the number of languages we will install.  */
              while (true)
                {
                  lang = strtok(install_langs_arg, ":;,");
                  if (lang == NULL)
                    break;
                  install_langs_count++;
                  install_langs_arg = NULL;
                }
              free (ila_start);

              /* Reject an entire string made up of delimiters.  */
              if (install_langs_count == 0)
                break;

              /* Copy the list.  */
              install_langs_list = (char **)xmalloc (sizeof(char *) * install_langs_count);
              install_langs_arg = ila_start = strdup (optarg);
              install_langs_count = 0;
              while (true)
                {
                  lang = strtok(install_langs_arg, ":;,");
                  if (lang == NULL)
                    break;
                  install_langs_list[install_langs_count] = lang;
                  install_langs_count++;
                  install_langs_arg = NULL;
                }
            }
          break;

        case '?':
          /* getopt_long already printed an error message.  */
          usage ();
          exit (0);

        default:
          abort ();
        }
    }
  tmpl_ah.fname = NULL;
  if (optind < argc)
    tmpl_ah.fname = argv[optind];
  if (optind + 1 < argc)
    new_locar_fname = argv[optind + 1];
  if (verbose)
    {
      if (tmpl_ah.fname)
        printf("input archive file specified on command line: %s\n",
               tmpl_ah.fname);
      else
        printf("using default input archive file.\n");
      if (new_locar_fname)
        printf("output archive file specified on command line: %s\n",
               new_locar_fname);
      else
        printf("using default output archive file.\n");
    }

  dirp = opendir (loc_path);
  if (dirp == NULL)
    error (EXIT_FAILURE, errno, "cannot open directory \"%s\"", loc_path);

  open_tmpl_archive (&tmpl_ah);

  if (new_locar_fname)
    unlink (new_locar_fname);
  else
    unlink (locar_file);
  primary = getenv ("LC_ALL");
  if (primary == NULL)
    primary = getenv ("LANG");
  if (primary != NULL)
    {
      if (strncmp (primary, "ja", 2) != 0
          && strncmp (primary, "ko", 2) != 0
          && strncmp (primary, "zh", 2) != 0)
        {
          char *ptr = malloc (strlen (primary) + strlen (".utf8") + 1), *p, *q;
          /* This leads to invalid locales sometimes:
             de_DE.iso885915@euro -> de_DE.utf8@euro */
          if (ptr != NULL)
            {
              p = ptr;
              q = primary;
              while (*q && *q != '.' && *q != '@')
                *p++ = *q++;
              if (*q == '.')
                while (*q && *q != '@')
                  q++;
              p = stpcpy (p, ".utf8");
              strcpy (p, q);
              primary = ptr;
            }
          else
            primary = NULL;
        }
    }

  memcpy (path, loc_path, loc_path_len);

  while ((d = readdir64 (dirp)) != NULL)
    {
      if (strcmp (d->d_name, ".") == 0 || strcmp (d->d_name, "..") == 0)
        continue;
      if (strchr (d->d_name, '_') == NULL)
        continue;

      size_t d_name_len = strlen (d->d_name);
      if (loc_path_len + d_name_len + 1 > sizeof (path))
        {
          error (0, 0, "too long filename \"%s\"", d->d_name);
          continue;
        }

      memcpy (path + loc_path_len, d->d_name, d_name_len + 1);
      if (stat64 (path, &st) < 0)
        {
          error (0, errno, "cannot stat \"%s\"", path);
          continue;
        }
      if (! S_ISDIR (st.st_mode))
        continue;
      if (cnt == 16384)
        {
          error (0, 0, "too many directories in \"%s\"", loc_path);
          break;
        }
      list[cnt] = strdup (path);
      if (list[cnt] == NULL)
        {
          error (0, errno, "cannot add file to list \"%s\"", path);
          continue;
        }
      if (primary != NULL && cnt > 0 && strcmp (primary, d->d_name) == 0)
        {
          char *p = list[0];
          list[0] = list[cnt];
          list[cnt] = p;
        }
      cnt++;
    }
  closedir (dirp);
  /* Store the archive to the file specified as the second argument on the
     command line or the default locale archive.  */
  fill_archive (&tmpl_ah, new_locar_fname,
                install_langs_count, install_langs_list,
                cnt, list, primary);
  close_archive (&tmpl_ah);
  truncate (tmpl_file, 0);
  if (install_langs_count > 0)
    {
      free (ila_start);
      free (install_langs_list);
    }
  char *tz_argv[] = { "/usr/sbin/tzdata-update", NULL };
  execve (tz_argv[0], (char *const *)tz_argv, (char *const *)&tz_argv[1]);
  exit (0);
}
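The --install-langs matching rule buried in fill_archive above is easy to miss: an item without an underscore gets one appended before the prefix comparison, which is why "ko" selects ko_KR but not kok_IN. A self-contained sketch of just that rule; lang_matches is a hypothetical reduction of the source logic, not an API of the deleted program:

```c
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Reduced version of the matching done in fill_archive: an
   --install-langs item without '_' gets one appended, then the item
   must be a prefix of the locale name.  Illustration only.  */
static bool
lang_matches (const char *item, const char *locale)
{
  size_t len = strlen (item) + 2;      /* Room for '_' and NUL.  */
  char *pattern = malloc (len);
  if (pattern == NULL)
    return false;
  strcpy (pattern, item);
  if (strchr (pattern, '_') == NULL)
    strcat (pattern, "_");
  bool hit = strncmp (locale, pattern, strlen (pattern)) == 0;
  free (pattern);
  return hit;
}

int
main (void)
{
  /* Prints "1 0": "ko" becomes "ko_", so ko_KR matches, kok_IN does not.  */
  printf ("%d %d\n", lang_matches ("ko", "ko_KR.utf8"),
          lang_matches ("ko", "kok_IN"));
  return 0;
}
```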
@@ -1,259 +0,0 @@
From 97700a34f36721b11a754cf37a1cc40695ece1fd Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Mon, 21 Jan 2019 11:23:59 -0800
Subject: [PATCH] x86-64 memchr/wmemchr: Properly handle the length parameter
 [BZ# 24097]
Content-type: text/plain; charset=UTF-8

On x32, the size_t parameter may be passed in the lower 32 bits of a
64-bit register with the non-zero upper 32 bits. The string/memory
functions written in assembly can only use the lower 32 bits of a
64-bit register as length or must clear the upper 32 bits before using
the full 64-bit register for length.

This patch fixes memchr/wmemchr for x32. Tested on x86-64 and x32. On
x86-64, libc.so is the same with and without the fix.

	[BZ# 24097]
	CVE-2019-6488
	* sysdeps/x86_64/memchr.S: Use RDX_LP for length.  Clear the
	upper 32 bits of RDX register.
	* sysdeps/x86_64/multiarch/memchr-avx2.S: Likewise.
	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memchr and
	tst-size_t-wmemchr.
	* sysdeps/x86_64/x32/test-size_t.h: New file.
	* sysdeps/x86_64/x32/tst-size_t-memchr.c: Likewise.
	* sysdeps/x86_64/x32/tst-size_t-wmemchr.c: Likewise.
---
 sysdeps/x86_64/memchr.S                 | 10 ++--
 sysdeps/x86_64/multiarch/memchr-avx2.S  |  8 ++-
 sysdeps/x86_64/x32/Makefile             |  8 +++
 sysdeps/x86_64/x32/test-size_t.h        | 35 ++++++++++++
 sysdeps/x86_64/x32/tst-size_t-memchr.c  | 72 +++++++++++++++++++++++++
 sysdeps/x86_64/x32/tst-size_t-wmemchr.c | 20 +++++++
 6 files changed, 148 insertions(+), 5 deletions(-)
 create mode 100644 sysdeps/x86_64/x32/test-size_t.h
 create mode 100644 sysdeps/x86_64/x32/tst-size_t-memchr.c
 create mode 100644 sysdeps/x86_64/x32/tst-size_t-wmemchr.c

Conflicts:
	ChangeLog
	  (removed)
	NEWS
	  (removed)

diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
index feef5d4f..cb320257 100644
--- a/sysdeps/x86_64/memchr.S
+++ b/sysdeps/x86_64/memchr.S
@@ -34,12 +34,16 @@ ENTRY(MEMCHR)
 	mov	%edi, %ecx

 #ifdef USE_AS_WMEMCHR
-	test	%rdx, %rdx
+	test	%RDX_LP, %RDX_LP
 	jz	L(return_null)
-	shl	$2, %rdx
+	shl	$2, %RDX_LP
 #else
+# ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%edx, %edx
+# endif
 	punpcklbw %xmm1, %xmm1
-	test	%rdx, %rdx
+	test	%RDX_LP, %RDX_LP
 	jz	L(return_null)
 	punpcklbw %xmm1, %xmm1
 #endif
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
index 5f5e7725..c81da19b 100644
--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
@@ -40,16 +40,20 @@
 ENTRY (MEMCHR)
 # ifndef USE_AS_RAWMEMCHR
 	/* Check for zero length.  */
-	testq	%rdx, %rdx
+	test	%RDX_LP, %RDX_LP
 	jz	L(null)
 # endif
 	movl	%edi, %ecx
 	/* Broadcast CHAR to YMM0.  */
 	vmovd	%esi, %xmm0
 # ifdef USE_AS_WMEMCHR
-	shl	$2, %rdx
+	shl	$2, %RDX_LP
 	vpbroadcastd %xmm0, %ymm0
 # else
+# ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%edx, %edx
+# endif
 	vpbroadcastb %xmm0, %ymm0
 # endif
 	/* Check if we may cross page boundary with one vector load.  */
diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
index f2ebc24f..7d528889 100644
--- a/sysdeps/x86_64/x32/Makefile
+++ b/sysdeps/x86_64/x32/Makefile
@@ -4,3 +4,11 @@ ifeq ($(subdir),math)
 # 64-bit llround.  Add -fno-builtin-lround to silence the compiler.
 CFLAGS-s_llround.c += -fno-builtin-lround
 endif
+
+ifeq ($(subdir),string)
+tests += tst-size_t-memchr
+endif
+
+ifeq ($(subdir),wcsmbs)
+tests += tst-size_t-wmemchr
+endif
diff --git a/sysdeps/x86_64/x32/test-size_t.h b/sysdeps/x86_64/x32/test-size_t.h
new file mode 100644
index 00000000..78a94086
--- /dev/null
+++ b/sysdeps/x86_64/x32/test-size_t.h
@@ -0,0 +1,35 @@
+/* Test string/memory functions with size_t in the lower 32 bits of
+   64-bit register.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define TEST_MAIN
+#include <string/test-string.h>
+
+/* On x32, parameter_t may be passed in a 64-bit register with the LEN
+   field in the lower 32 bits.  When the LEN field of 64-bit register
+   is passed to string/memory function as the size_t parameter, only
+   the lower 32 bits can be used.  */
+typedef struct
+{
+  union
+    {
+      size_t len;
+      void (*fn) (void);
+    };
+  void *p;
+} parameter_t;
diff --git a/sysdeps/x86_64/x32/tst-size_t-memchr.c b/sysdeps/x86_64/x32/tst-size_t-memchr.c
new file mode 100644
index 00000000..29a3daf1
--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-memchr.c
@@ -0,0 +1,72 @@
+/* Test memchr with size_t in the lower 32 bits of 64-bit register.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef WIDE
+# define TEST_NAME "memchr"
+#else
+# define TEST_NAME "wmemchr"
+#endif /* WIDE */
+#include "test-size_t.h"
+
+#ifndef WIDE
+# define MEMCHR memchr
+# define CHAR char
+# define UCHAR unsigned char
+#else
+# include <wchar.h>
+# define MEMCHR wmemchr
+# define CHAR wchar_t
+# define UCHAR wchar_t
+#endif /* WIDE */
+
+IMPL (MEMCHR, 1)
+
+typedef CHAR * (*proto_t) (const CHAR*, int, size_t);
+
+static CHAR *
+__attribute__ ((noinline, noclone))
+do_memchr (parameter_t a, parameter_t b)
+{
+  return CALL (&b, a.p, (uintptr_t) b.p, a.len);
+}
+
+static int
+test_main (void)
+{
+  test_init ();
+
+  parameter_t src = { { page_size / sizeof (CHAR) }, buf2 };
+  parameter_t c = { { 0 }, (void *) (uintptr_t) 0x12 };
+
+  int ret = 0;
+  FOR_EACH_IMPL (impl, 0)
+    {
+      c.fn = impl->fn;
+      CHAR *res = do_memchr (src, c);
+      if (res)
+	{
+	  error (0, 0, "Wrong result in function %s: %p != NULL",
+		 impl->name, res);
+	  ret = 1;
+	}
+    }
+
+  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#include <support/test-driver.c>
diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemchr.c b/sysdeps/x86_64/x32/tst-size_t-wmemchr.c
new file mode 100644
index 00000000..877801d6
--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-wmemchr.c
@@ -0,0 +1,20 @@
+/* Test wmemchr with size_t in the lower 32 bits of 64-bit register.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define WIDE 1
+#include "tst-size_t-memchr.c"
--
GitLab
|
|||||||
From ddf0992cf57a93200e0c782e2a94d0733a5a0b87 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
Date: Sun, 9 Jan 2022 16:02:21 -0600
|
|
||||||
Subject: [PATCH] x86: Fix __wcsncmp_avx2 in strcmp-avx2.S [BZ# 28755]
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
Fixes [BZ# 28755] for wcsncmp by redirecting length >= 2^56 to
|
|
||||||
__wcscmp_avx2. For x86_64 this covers the entire address range so any
|
|
||||||
length larger could not possibly be used to bound `s1` or `s2`.
|
|
||||||
|
|
||||||
test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass.
|
|
||||||
|
|
||||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/multiarch/strcmp-avx2.S | 10 ++++++++++
|
|
||||||
1 file changed, 10 insertions(+)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
|
|
||||||
index 156c1949..8fb8eedc 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
|
|
||||||
@@ -83,6 +83,16 @@ ENTRY (STRCMP)
|
|
||||||
je L(char0)
|
|
||||||
jb L(zero)
|
|
||||||
# ifdef USE_AS_WCSCMP
|
|
||||||
+# ifndef __ILP32__
|
|
||||||
+ movq %rdx, %rcx
|
|
||||||
+ /* Check if length could overflow when multiplied by
|
|
||||||
+ sizeof(wchar_t). Checking top 8 bits will cover all potential
|
|
||||||
+ overflow cases as well as redirect cases where its impossible to
|
|
||||||
+ length to bound a valid memory region. In these cases just use
|
|
||||||
+ 'wcscmp'. */
|
|
||||||
+ shrq $56, %rcx
|
|
||||||
+ jnz __wcscmp_avx2
|
|
||||||
+# endif
|
|
||||||
/* Convert units: from wide to byte char. */
|
|
||||||
shl $2, %RDX_LP
|
|
||||||
# endif
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,263 +0,0 @@
From 23102686ec67b856a2d4fd25ddaa1c0b8d175c4f Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri, 15 Apr 2022 12:28:01 -0500
Subject: [PATCH] x86: Cleanup page cross code in memcmp-avx2-movbe.S
Content-type: text/plain; charset=UTF-8

The old code was both inefficient and wasted code size. The new code
is 62 bytes smaller and has comparable or better performance in the
page cross case.

geometric_mean(N=20) of page cross cases New / Original: 0.960

size, align0, align1, ret, New Time/Old Time
1, 4095, 0, 0, 1.001
1, 4095, 0, 1, 0.999
1, 4095, 0, -1, 1.0
2, 4094, 0, 0, 1.0
2, 4094, 0, 1, 1.0
2, 4094, 0, -1, 1.0
3, 4093, 0, 0, 1.0
3, 4093, 0, 1, 1.0
3, 4093, 0, -1, 1.0
4, 4092, 0, 0, 0.987
4, 4092, 0, 1, 1.0
4, 4092, 0, -1, 1.0
5, 4091, 0, 0, 0.984
5, 4091, 0, 1, 1.002
5, 4091, 0, -1, 1.005
6, 4090, 0, 0, 0.993
6, 4090, 0, 1, 1.001
6, 4090, 0, -1, 1.003
7, 4089, 0, 0, 0.991
7, 4089, 0, 1, 1.0
7, 4089, 0, -1, 1.001
8, 4088, 0, 0, 0.875
8, 4088, 0, 1, 0.881
8, 4088, 0, -1, 0.888
9, 4087, 0, 0, 0.872
9, 4087, 0, 1, 0.879
9, 4087, 0, -1, 0.883
10, 4086, 0, 0, 0.878
10, 4086, 0, 1, 0.886
10, 4086, 0, -1, 0.873
11, 4085, 0, 0, 0.878
11, 4085, 0, 1, 0.881
11, 4085, 0, -1, 0.879
12, 4084, 0, 0, 0.873
12, 4084, 0, 1, 0.889
12, 4084, 0, -1, 0.875
13, 4083, 0, 0, 0.873
13, 4083, 0, 1, 0.863
13, 4083, 0, -1, 0.863
14, 4082, 0, 0, 0.838
14, 4082, 0, 1, 0.869
14, 4082, 0, -1, 0.877
15, 4081, 0, 0, 0.841
15, 4081, 0, 1, 0.869
15, 4081, 0, -1, 0.876
16, 4080, 0, 0, 0.988
16, 4080, 0, 1, 0.99
16, 4080, 0, -1, 0.989
17, 4079, 0, 0, 0.978
17, 4079, 0, 1, 0.981
17, 4079, 0, -1, 0.98
18, 4078, 0, 0, 0.981
18, 4078, 0, 1, 0.98
18, 4078, 0, -1, 0.985
19, 4077, 0, 0, 0.977
19, 4077, 0, 1, 0.979
19, 4077, 0, -1, 0.986
20, 4076, 0, 0, 0.977
20, 4076, 0, 1, 0.986
20, 4076, 0, -1, 0.984
21, 4075, 0, 0, 0.977
21, 4075, 0, 1, 0.983
21, 4075, 0, -1, 0.988
22, 4074, 0, 0, 0.983
22, 4074, 0, 1, 0.994
22, 4074, 0, -1, 0.993
23, 4073, 0, 0, 0.98
23, 4073, 0, 1, 0.992
23, 4073, 0, -1, 0.995
24, 4072, 0, 0, 0.989
24, 4072, 0, 1, 0.989
24, 4072, 0, -1, 0.991
25, 4071, 0, 0, 0.99
25, 4071, 0, 1, 0.999
25, 4071, 0, -1, 0.996
26, 4070, 0, 0, 0.993
26, 4070, 0, 1, 0.995
26, 4070, 0, -1, 0.998
27, 4069, 0, 0, 0.993
27, 4069, 0, 1, 0.999
27, 4069, 0, -1, 1.0
28, 4068, 0, 0, 0.997
28, 4068, 0, 1, 1.0
28, 4068, 0, -1, 0.999
29, 4067, 0, 0, 0.996
29, 4067, 0, 1, 0.999
29, 4067, 0, -1, 0.999
30, 4066, 0, 0, 0.991
30, 4066, 0, 1, 1.001
30, 4066, 0, -1, 0.999
31, 4065, 0, 0, 0.988
31, 4065, 0, 1, 0.998
31, 4065, 0, -1, 0.998
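As an aside (not from the commit), the branchless 4-to-7 byte technique
the new code switches to can be sketched in C: two overlapping 4-byte
loads per string are byte-swapped to big endian, packed into one 64-bit
value, and a single subtraction then orders the strings.
__builtin_bswap32 stands in for the movbe loads:

#include <stdint.h>
#include <string.h>

/* memcmp for n in [4, 7]: the first and last 4 bytes overlap and
   together cover all n bytes.  */
static int
memcmp_4_7 (const unsigned char *s1, const unsigned char *s2, size_t n)
{
  uint32_t a, b, c, d;
  memcpy (&a, s1, 4);          /* first 4 bytes */
  memcpy (&b, s1 + n - 4, 4);  /* last 4 bytes, overlapping */
  memcpy (&c, s2, 4);
  memcpy (&d, s2 + n - 4, 4);
  uint64_t x = ((uint64_t) __builtin_bswap32 (a) << 32) | __builtin_bswap32 (b);
  uint64_t y = ((uint64_t) __builtin_bswap32 (c) << 32) | __builtin_bswap32 (d);
  return (x > y) - (x < y);    /* branchless ordering */
}

The real code folds that final sign computation into the sbbl/orl pair
at L(ret_nonzero) in the hunk below.
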
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
 sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 98 ++++++++++++--------
 1 file changed, 61 insertions(+), 37 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
index 16fc673e..99258cf5 100644
--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
@@ -429,22 +429,21 @@ L(page_cross_less_vec):
 # ifndef USE_AS_WMEMCMP
 	cmpl	$8, %edx
 	jae	L(between_8_15)
+	/* Fall through for [4, 7].  */
 	cmpl	$4, %edx
-	jae	L(between_4_7)
+	jb	L(between_2_3)

-	/* Load as big endian to avoid branches.  */
-	movzwl	(%rdi), %eax
-	movzwl	(%rsi), %ecx
-	shll	$8, %eax
-	shll	$8, %ecx
-	bswap	%eax
-	bswap	%ecx
-	movzbl	-1(%rdi, %rdx), %edi
-	movzbl	-1(%rsi, %rdx), %esi
-	orl	%edi, %eax
-	orl	%esi, %ecx
-	/* Subtraction is okay because the upper 8 bits are zero.  */
-	subl	%ecx, %eax
+	movbe	(%rdi), %eax
+	movbe	(%rsi), %ecx
+	shlq	$32, %rax
+	shlq	$32, %rcx
+	movbe	-4(%rdi, %rdx), %edi
+	movbe	-4(%rsi, %rdx), %esi
+	orq	%rdi, %rax
+	orq	%rsi, %rcx
+	subq	%rcx, %rax
+	/* Fast path for return zero.  */
+	jnz	L(ret_nonzero)
 	/* No ymm register was touched.  */
 	ret

@@ -457,9 +456,33 @@ L(one_or_less):
 	/* No ymm register was touched.  */
 	ret

+	.p2align 4,, 5
+L(ret_nonzero):
+	sbbl	%eax, %eax
+	orl	$1, %eax
+	/* No ymm register was touched.  */
+	ret
+
+	.p2align 4,, 2
+L(zero):
+	xorl	%eax, %eax
+	/* No ymm register was touched.  */
+	ret
+
 	.p2align 4
 L(between_8_15):
-# endif
+	movbe	(%rdi), %rax
+	movbe	(%rsi), %rcx
+	subq	%rcx, %rax
+	jnz	L(ret_nonzero)
+	movbe	-8(%rdi, %rdx), %rax
+	movbe	-8(%rsi, %rdx), %rcx
+	subq	%rcx, %rax
+	/* Fast path for return zero.  */
+	jnz	L(ret_nonzero)
+	/* No ymm register was touched.  */
+	ret
+# else
 	/* If USE_AS_WMEMCMP fall through into 8-15 byte case.  */
 	vmovq	(%rdi), %xmm1
 	vmovq	(%rsi), %xmm2
@@ -475,16 +498,13 @@ L(between_8_15):
 	VPCMPEQ	%xmm1, %xmm2, %xmm2
 	vpmovmskb %xmm2, %eax
 	subl	$0xffff, %eax
+	/* Fast path for return zero.  */
 	jnz	L(return_vec_0)
 	/* No ymm register was touched.  */
 	ret
+# endif

-	.p2align 4
-L(zero):
-	xorl	%eax, %eax
-	ret
-
-	.p2align 4
+	.p2align 4,, 10
 L(between_16_31):
 	/* From 16 to 31 bytes.  No branch when size == 16.  */
 	vmovdqu	(%rsi), %xmm2
@@ -501,11 +521,17 @@ L(between_16_31):
 	VPCMPEQ	(%rdi), %xmm2, %xmm2
 	vpmovmskb %xmm2, %eax
 	subl	$0xffff, %eax
+	/* Fast path for return zero.  */
 	jnz	L(return_vec_0)
 	/* No ymm register was touched.  */
 	ret

 # ifdef USE_AS_WMEMCMP
+	.p2align 4,, 2
+L(zero):
+	xorl	%eax, %eax
+	ret
+
 	.p2align 4
 L(one_or_less):
 	jb	L(zero)
@@ -520,22 +546,20 @@ L(one_or_less):
 # else

 	.p2align 4
-L(between_4_7):
-	/* Load as big endian with overlapping movbe to avoid branches.
-	 */
-	movbe	(%rdi), %eax
-	movbe	(%rsi), %ecx
-	shlq	$32, %rax
-	shlq	$32, %rcx
-	movbe	-4(%rdi, %rdx), %edi
-	movbe	-4(%rsi, %rdx), %esi
-	orq	%rdi, %rax
-	orq	%rsi, %rcx
-	subq	%rcx, %rax
-	jz	L(zero_4_7)
-	sbbl	%eax, %eax
-	orl	$1, %eax
-L(zero_4_7):
+L(between_2_3):
+	/* Load as big endian to avoid branches.  */
+	movzwl	(%rdi), %eax
+	movzwl	(%rsi), %ecx
+	bswap	%eax
+	bswap	%ecx
+	shrl	%eax
+	shrl	%ecx
+	movzbl	-1(%rdi, %rdx), %edi
+	movzbl	-1(%rsi, %rdx), %esi
+	orl	%edi, %eax
+	orl	%esi, %ecx
+	/* Subtraction is okay because the upper bit is zero.  */
+	subl	%ecx, %eax
 	/* No ymm register was touched.  */
 	ret
 # endif
--
GitLab
@ -1,226 +0,0 @@
From 8162147872491bb5b48e91543b19c49a29ae6b6d Mon Sep 17 00:00:00 2001
From: Wangyang Guo <wangyang.guo@intel.com>
Date: Fri, 6 May 2022 01:50:10 +0000
Subject: [PATCH] nptl: Add backoff mechanism to spinlock loop
Content-type: text/plain; charset=UTF-8

When multiple threads are waiting for a lock at the same time, then once
the lock owner releases it, all the waiters see the lock become available
and try to take it, which may cause an expensive CAS storm.

Binary exponential backoff with random jitter is introduced. As the
number of try-lock attempts increases, it is more likely that many
threads are competing for the adaptive mutex lock, so the wait time
increases exponentially. A random jitter is also added to avoid
synchronized try-lock attempts from other threads.
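
For readers, a self-contained C11 sketch of the backoff idea (an atomic
flag stands in for the real mutex internals; the cap of 16 and the
jitter formula mirror the patch, everything else is illustrative):

#include <stdatomic.h>

static atomic_flag lock = ATOMIC_FLAG_INIT;

static void
lock_with_backoff (unsigned int jitter)  /* caller-supplied randomness */
{
  int backoff = 1;
  while (atomic_flag_test_and_set_explicit (&lock, memory_order_acquire))
    {
      /* Spin count = exponential backoff plus jitter in [0, backoff-1];
         backoff is always a power of two, so the mask works.  */
      int spins = backoff + (int) (jitter & (unsigned int) (backoff - 1));
      while (spins-- > 0)
        __builtin_ia32_pause ();  /* x86 'pause'; arch specific */
      if (backoff < 16)           /* cap, as MAX_BACKOFF does below */
        backoff <<= 1;
    }
}

Spinning longer between attempts spreads competing threads' retries
apart, which is what breaks up the CAS storm.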

v2: Remove read-check before try-lock for performance.

v3:
1. Restore the read-check since it works well on some platforms.
2. Make backoff arch dependent, and enable it for x86_64.
3. Limit max backoff to reduce latency in large critical sections.

v4: Fix strict-prototypes error in sysdeps/nptl/pthread_mutex_backoff.h

v5: Commit log updated for the regression in large critical sections.

Result of pthread-mutex-locks bench

Test Platform: Xeon 8280L (2 socket, 112 CPUs in total)
First Row: thread number
First Col: critical section length
Values: backoff vs upstream, time based, low is better

non-critical-length: 1
	1	2	4	8	16	32	64	112	140
0	0.99	0.58	0.52	0.49	0.43	0.44	0.46	0.52	0.54
1	0.98	0.43	0.56	0.50	0.44	0.45	0.50	0.56	0.57
2	0.99	0.41	0.57	0.51	0.45	0.47	0.48	0.60	0.61
4	0.99	0.45	0.59	0.53	0.48	0.49	0.52	0.64	0.65
8	1.00	0.66	0.71	0.63	0.56	0.59	0.66	0.72	0.71
16	0.97	0.78	0.91	0.73	0.67	0.70	0.79	0.80	0.80
32	0.95	1.17	0.98	0.87	0.82	0.86	0.89	0.90	0.90
64	0.96	0.95	1.01	1.01	0.98	1.00	1.03	0.99	0.99
128	0.99	1.01	1.01	1.17	1.08	1.12	1.02	0.97	1.02

non-critical-length: 32
	1	2	4	8	16	32	64	112	140
0	1.03	0.97	0.75	0.65	0.58	0.58	0.56	0.70	0.70
1	0.94	0.95	0.76	0.65	0.58	0.58	0.61	0.71	0.72
2	0.97	0.96	0.77	0.66	0.58	0.59	0.62	0.74	0.74
4	0.99	0.96	0.78	0.66	0.60	0.61	0.66	0.76	0.77
8	0.99	0.99	0.84	0.70	0.64	0.66	0.71	0.80	0.80
16	0.98	0.97	0.95	0.76	0.70	0.73	0.81	0.85	0.84
32	1.04	1.12	1.04	0.89	0.82	0.86	0.93	0.91	0.91
64	0.99	1.15	1.07	1.00	0.99	1.01	1.05	0.99	0.99
128	1.00	1.21	1.20	1.22	1.25	1.31	1.12	1.10	0.99

non-critical-length: 128
	1	2	4	8	16	32	64	112	140
0	1.02	1.00	0.99	0.67	0.61	0.61	0.61	0.74	0.73
1	0.95	0.99	1.00	0.68	0.61	0.60	0.60	0.74	0.74
2	1.00	1.04	1.00	0.68	0.59	0.61	0.65	0.76	0.76
4	1.00	0.96	0.98	0.70	0.63	0.63	0.67	0.78	0.77
8	1.01	1.02	0.89	0.73	0.65	0.67	0.71	0.81	0.80
16	0.99	0.96	0.96	0.79	0.71	0.73	0.80	0.84	0.84
32	0.99	0.95	1.05	0.89	0.84	0.85	0.94	0.92	0.91
64	1.00	0.99	1.16	1.04	1.00	1.02	1.06	0.99	0.99
128	1.00	1.06	0.98	1.14	1.39	1.26	1.08	1.02	0.98

There is a regression for large critical sections, but the adaptive
mutex is aimed at "quick" locks; small critical sections are more
common when users choose the adaptive pthread_mutex.

Signed-off-by: Wangyang Guo <wangyang.guo@intel.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Conflicts:
	pthreadP.h
	  (had been moved)
	nptl/pthread_mutex_lock.c
	  (max_adaptive_count renamed)

---
 nptl/pthreadP.h                             |  1 +
 nptl/pthread_mutex_lock.c                   | 16 +++++++--
 sysdeps/nptl/pthread_mutex_backoff.h        | 35 ++++++++++++++++++
 sysdeps/x86_64/nptl/pthread_mutex_backoff.h | 39 +++++++++++++++++++++
 4 files changed, 89 insertions(+), 2 deletions(-)
 create mode 100644 sysdeps/nptl/pthread_mutex_backoff.h
 create mode 100644 sysdeps/x86_64/nptl/pthread_mutex_backoff.h

diff --git a/nptl/pthreadP.h b/nptl/pthreadP.h
index 7ddc166c..1550e3b6 100644
--- a/nptl/pthreadP.h
+++ b/nptl/pthreadP.h
@@ -33,6 +33,7 @@
 #include <kernel-features.h>
 #include <errno.h>
 #include <internal-signals.h>
+#include <pthread_mutex_backoff.h>


 /* Atomic operations on TLS memory.  */
diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
index d96a9933..c7770fc9 100644
--- a/nptl/pthread_mutex_lock.c
+++ b/nptl/pthread_mutex_lock.c
@@ -133,14 +133,26 @@ __pthread_mutex_lock (pthread_mutex_t *mutex)
 	  int cnt = 0;
 	  int max_cnt = MIN (MAX_ADAPTIVE_COUNT,
 			     mutex->__data.__spins * 2 + 10);
+	  int spin_count, exp_backoff = 1;
+	  unsigned int jitter = get_jitter ();
 	  do
 	    {
-	      if (cnt++ >= max_cnt)
+	      /* In each loop, spin count is exponential backoff plus
+		 random jitter, random range is [0, exp_backoff-1].  */
+	      spin_count = exp_backoff + (jitter & (exp_backoff - 1));
+	      cnt += spin_count;
+	      if (cnt >= max_cnt)
 		{
+		  /* If cnt exceeds max spin count, just go to wait
+		     queue.  */
 		  LLL_MUTEX_LOCK (mutex);
 		  break;
 		}
-	      atomic_spin_nop ();
+	      do
+		atomic_spin_nop ();
+	      while (--spin_count > 0);
+	      /* Prepare for next loop.  */
+	      exp_backoff = get_next_backoff (exp_backoff);
 	    }
 	  while (LLL_MUTEX_READ_LOCK (mutex) != 0
 		 || LLL_MUTEX_TRYLOCK (mutex) != 0);
diff --git a/sysdeps/nptl/pthread_mutex_backoff.h b/sysdeps/nptl/pthread_mutex_backoff.h
new file mode 100644
index 00000000..5b26c22a
--- /dev/null
+++ b/sysdeps/nptl/pthread_mutex_backoff.h
@@ -0,0 +1,35 @@
+/* Pthread mutex backoff configuration.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+#ifndef _PTHREAD_MUTEX_BACKOFF_H
+#define _PTHREAD_MUTEX_BACKOFF_H 1
+
+static inline unsigned int
+get_jitter (void)
+{
+  /* Arch dependent random jitter, return 0 disables random.  */
+  return 0;
+}
+
+static inline int
+get_next_backoff (int backoff)
+{
+  /* Next backoff, return 1 disables mutex backoff.  */
+  return 1;
+}
+
+#endif
diff --git a/sysdeps/x86_64/nptl/pthread_mutex_backoff.h b/sysdeps/x86_64/nptl/pthread_mutex_backoff.h
new file mode 100644
index 00000000..ec74c3d9
--- /dev/null
+++ b/sysdeps/x86_64/nptl/pthread_mutex_backoff.h
@@ -0,0 +1,39 @@
+/* Pthread mutex backoff configuration.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+#ifndef _PTHREAD_MUTEX_BACKOFF_H
+#define _PTHREAD_MUTEX_BACKOFF_H 1
+
+#include <fast-jitter.h>
+
+static inline unsigned int
+get_jitter (void)
+{
+  return get_fast_jitter ();
+}
+
+#define MAX_BACKOFF 16
+
+static inline int
+get_next_backoff (int backoff)
+{
+  /* Binary expontial backoff. Limiting max backoff
+     can reduce latency in large critical section.  */
+  return (backoff < MAX_BACKOFF) ? backoff << 1 : backoff;
+}
+
+#endif
--
GitLab
@ -1,55 +0,0 @@
From c6272098323153db373f2986c67786ea8c85f1cf Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Tue, 15 Feb 2022 08:18:15 -0600
Subject: [PATCH] x86: Fallback {str|wcs}cmp RTM in the ncmp overflow case [BZ
 #28896]
Content-type: text/plain; charset=UTF-8

In the overflow fallback, strncmp-avx2-rtm and wcsncmp-avx2-rtm would
call strcmp-avx2 and wcscmp-avx2 respectively. Those functions have no
checks around vzeroupper and would trigger spurious aborts. This commit
fixes that.
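
Background, not part of the patch: a vzeroupper executed while an RTM
transaction is in flight can abort it on some CPUs, which is why the
-rtm string variants must avoid it. A minimal sketch of transactional
code that would be affected (requires RTM hardware and -mrtm):

#include <immintrin.h>

int
run_transaction (void)
{
  if (_xbegin () == _XBEGIN_STARTED)
    {
      /* Transactional region: any vzeroupper executed here, e.g. inside
         a string function the region calls, may abort the transaction.  */
      _xend ();
      return 1;
    }
  return 0;  /* aborted or never started: caller takes a fallback path */
}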

test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass on
AVX2 machines with and without RTM.

Co-authored-by: H.J. Lu <hjl.tools@gmail.com>
---
 sysdeps/x86_64/multiarch/strcmp-avx2.S | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

Conflicts:
	sysdeps/x86_64/multiarch/strcmp-avx2.S
	  (split into two patches due to upstream bug differences)

diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
index 28cc98b6..e267c6cb 100644
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
@@ -345,10 +345,10 @@ L(one_or_less):
 	movq	%LOCALE_REG, %rdx
 # endif
 	jb	L(ret_zero)
-# ifdef USE_AS_WCSCMP
 	/* 'nbe' covers the case where length is negative (large
 	   unsigned).  */
-	jnbe	__wcscmp_avx2
+	jnbe	OVERFLOW_STRCMP
+# ifdef USE_AS_WCSCMP
 	movl	(%rdi), %edx
 	xorl	%eax, %eax
 	cmpl	(%rsi), %edx
@@ -357,10 +357,6 @@ L(one_or_less):
 	negl	%eax
 	orl	$1, %eax
 # else
-	/* 'nbe' covers the case where length is negative (large
-	   unsigned).  */
-
-	jnbe	__strcmp_avx2
 	movzbl	(%rdi), %eax
 	movzbl	(%rsi), %ecx
 	TOLOWER_gpr (%rax, %eax)
--
GitLab
@ -1,60 +0,0 @@
From 259a17cc98058d2576511201f85d28cb5d9de2a2 Mon Sep 17 00:00:00 2001
From: Stefan Liebler <stli@linux.ibm.com>
Date: Mon, 28 Jun 2021 13:01:07 +0200
Subject: s390x: Update math: redirect roundeven function

After recent commit
447954a206837b5f153869cfeeeab44631c3fac9
"math: redirect roundeven function", building on
s390x fails with:
Error: symbol `__roundevenl' is already defined

Similar to the aarch64/riscv fix, this patch redirects target
specific functions for s390x:
commit 3213ed770cbc5821920d16caa93c85e92dd7b9f6
"Update math: redirect roundeven function"

diff --git a/sysdeps/s390/fpu/s_roundeven.c b/sysdeps/s390/fpu/s_roundeven.c
index 40b07e054b..0773adfed0 100644
--- a/sysdeps/s390/fpu/s_roundeven.c
+++ b/sysdeps/s390/fpu/s_roundeven.c
@@ -18,6 +18,7 @@
    <https://www.gnu.org/licenses/>.  */

 #ifdef HAVE_S390_MIN_Z196_ZARCH_ASM_SUPPORT
+# define NO_MATH_REDIRECT
 # include <math.h>
 # include <libm-alias-double.h>

@@ -31,7 +32,6 @@ __roundeven (double x)
   __asm__ ("fidbra %0,4,%1,4" : "=f" (y) : "f" (x));
   return y;
 }
-hidden_def (__roundeven)
 libm_alias_double (__roundeven, roundeven)

 #else
diff --git a/sysdeps/s390/fpu/s_roundevenf.c b/sysdeps/s390/fpu/s_roundevenf.c
index d2fbf3d2b6..289785bc4a 100644
--- a/sysdeps/s390/fpu/s_roundevenf.c
+++ b/sysdeps/s390/fpu/s_roundevenf.c
@@ -18,6 +18,7 @@
    <https://www.gnu.org/licenses/>.  */

 #ifdef HAVE_S390_MIN_Z196_ZARCH_ASM_SUPPORT
+# define NO_MATH_REDIRECT
 # include <math.h>
 # include <libm-alias-float.h>

diff --git a/sysdeps/s390/fpu/s_roundevenl.c b/sysdeps/s390/fpu/s_roundevenl.c
index 29ab7a8616..94b6459ab4 100644
--- a/sysdeps/s390/fpu/s_roundevenl.c
+++ b/sysdeps/s390/fpu/s_roundevenl.c
@@ -18,6 +18,7 @@
    <https://www.gnu.org/licenses/>.  */

 #ifdef HAVE_S390_MIN_Z196_ZARCH_ASM_SUPPORT
+# define NO_MATH_REDIRECT
 # include <math.h>
 # include <math_private.h>
 # include <libm-alias-ldouble.h>
@ -1,74 +0,0 @@
From 1da50d4bda07f04135dca39f40e79fc9eabed1f8 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Fri, 26 Feb 2021 05:36:59 -0800
Subject: [PATCH] x86: Set Prefer_No_VZEROUPPER and add Prefer_AVX2_STRCMP
Content-type: text/plain; charset=UTF-8

1. Set Prefer_No_VZEROUPPER if RTM is usable to avoid RTM abort triggered
by VZEROUPPER inside a transactionally executing RTM region.
2. Since to compare 2 32-byte strings, 256-bit EVEX strcmp requires 2
loads, 3 VPCMPs and 2 KORDs while AVX2 strcmp requires 1 load, 2 VPCMPEQs,
1 VPMINU and 1 VPMOVMSKB, AVX2 strcmp is faster than EVEX strcmp. Add
Prefer_AVX2_STRCMP to prefer the AVX2 strcmp family functions.
---
 sysdeps/x86/cpu-features.c                    | 20 +++++++++++++++++--
 sysdeps/x86/cpu-tunables.c                    |  2 ++
 ...cpu-features-preferred_feature_index_1.def |  1 +
 3 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index 91042505..3610ee5c 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -524,8 +524,24 @@ init_cpu_features (struct cpu_features *cpu_features)
 	cpu_features->preferred[index_arch_Prefer_No_VZEROUPPER]
 	  |= bit_arch_Prefer_No_VZEROUPPER;
       else
-	cpu_features->preferred[index_arch_Prefer_No_AVX512]
-	  |= bit_arch_Prefer_No_AVX512;
+	{
+	  cpu_features->preferred[index_arch_Prefer_No_AVX512]
+	    |= bit_arch_Prefer_No_AVX512;
+
+	  /* Avoid RTM abort triggered by VZEROUPPER inside a
+	     transactionally executing RTM region.  */
+	  if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+	    cpu_features->preferred[index_arch_Prefer_No_VZEROUPPER]
+	      |= bit_arch_Prefer_No_VZEROUPPER;
+
+	  /* Since to compare 2 32-byte strings, 256-bit EVEX strcmp
+	     requires 2 loads, 3 VPCMPs and 2 KORDs while AVX2 strcmp
+	     requires 1 load, 2 VPCMPEQs, 1 VPMINU and 1 VPMOVMSKB,
+	     AVX2 strcmp is faster than EVEX strcmp.  */
+	  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
+	    cpu_features->preferred[index_arch_Prefer_AVX2_STRCMP]
+	      |= bit_arch_Prefer_AVX2_STRCMP;
+	}
     }
   /* This spells out "AuthenticAMD".  */
   else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
index 3173b2b9..73adbaba 100644
--- a/sysdeps/x86/cpu-tunables.c
+++ b/sysdeps/x86/cpu-tunables.c
@@ -239,6 +239,8 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
 	      CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
 						Fast_Copy_Backward,
 						disable, 18);
+	      CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH
+		(n, cpu_features, Prefer_AVX2_STRCMP, AVX2, disable, 18);
 	    }
 	  break;
 	case 19:
diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
index 17a5cc42..4ca70b40 100644
--- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
+++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
@@ -32,3 +32,4 @@ BIT (Prefer_ERMS)
 BIT (Prefer_FSRM)
 BIT (Prefer_No_AVX512)
 BIT (MathVec_Prefer_No_AVX512)
+BIT (Prefer_AVX2_STRCMP)
--
GitLab
@ -1,26 +0,0 @@
From 3213ed770cbc5821920d16caa93c85e92dd7b9f6 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Wed, 23 Jun 2021 13:29:41 -0700
Subject: Update math: redirect roundeven function

Redirect target specific roundeven functions for aarch64, ldbl-128ibm
and riscv.

Conflicts:
	sysdeps/aarch64/*
	  (not needed)
	sysdeps/riscv/*
	  (not supported)

diff --git a/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c b/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c
index 6701970f4a..90eecf496b 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c
@@ -17,6 +17,7 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */

+#define NO_MATH_REDIRECT
 #include <math.h>
 #include <math_private.h>
File diff suppressed because it is too large
File diff suppressed because it is too large
@ -1,242 +0,0 @@
From 63ad43566f7a25d140dc723598aeb441ad657eed Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Fri, 5 Mar 2021 06:46:08 -0800
Subject: [PATCH] x86-64: Add memmove family functions with 256-bit EVEX
Content-type: text/plain; charset=UTF-8

Update ifunc-memmove.h to select the function optimized with 256-bit EVEX
instructions using YMM16-YMM31 registers to avoid RTM abort with usable
AVX512VL since VZEROUPPER isn't needed at function exit.
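
A self-contained sketch of the GNU IFUNC dispatch these selector changes
plug into (assumes a GCC/glibc toolchain; the stand-in implementations
just call memmove, where glibc would use __memmove_evex_unaligned etc.);
the resolver runs once at startup and returns the variant to bind:

#include <stddef.h>
#include <string.h>

static void *
memmove_evex_standin (void *d, const void *s, size_t n)
{ return memmove (d, s, n); }

static void *
memmove_avx_standin (void *d, const void *s, size_t n)
{ return memmove (d, s, n); }

static void *(*resolve_memmove (void)) (void *, const void *, size_t)
{
  /* Mirrors the new selector order: prefer the EVEX variant when
     AVX512VL is usable, since it needs no vzeroupper at exit.  */
  if (__builtin_cpu_supports ("avx512vl"))
    return memmove_evex_standin;
  return memmove_avx_standin;
}

void *my_memmove (void *, const void *, size_t)
  __attribute__ ((ifunc ("resolve_memmove")));
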
---
 sysdeps/x86_64/multiarch/Makefile           |  1 +
 sysdeps/x86_64/multiarch/ifunc-impl-list.c  | 36 +++++++++++++++++
 sysdeps/x86_64/multiarch/ifunc-memmove.h    | 21 +++++++++--
 .../multiarch/memmove-evex-unaligned-erms.S | 33 +++++++++++++++++
 .../multiarch/memmove-vec-unaligned-erms.S  | 24 ++++++++-----
 5 files changed, 104 insertions(+), 11 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 46783cd1..4563fc56 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -41,6 +41,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
 		   memset-avx2-unaligned-erms \
 		   memset-avx512-unaligned-erms \
 		   memchr-evex \
+		   memmove-evex-unaligned-erms \
 		   memrchr-evex \
 		   rawmemchr-evex \
 		   stpcpy-evex \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 082e4da3..6bd3abfc 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -80,6 +80,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
 			      CPU_FEATURE_USABLE (AVX),
 			      __memmove_chk_avx_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __memmove_chk_evex_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __memmove_chk_evex_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
 			      CPU_FEATURE_USABLE (SSSE3),
 			      __memmove_chk_ssse3_back)
@@ -102,6 +108,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memmove,
 			      CPU_FEATURE_USABLE (AVX),
 			      __memmove_avx_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, memmove,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __memmove_evex_unaligned)
+	      IFUNC_IMPL_ADD (array, i, memmove,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __memmove_evex_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, memmove,
 			      CPU_FEATURE_USABLE (AVX512F),
 			      __memmove_avx512_no_vzeroupper)
@@ -565,6 +577,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
 			      CPU_FEATURE_USABLE (AVX),
 			      __memcpy_chk_avx_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __memcpy_chk_evex_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __memcpy_chk_evex_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
 			      CPU_FEATURE_USABLE (SSSE3),
 			      __memcpy_chk_ssse3_back)
@@ -587,6 +605,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memcpy,
 			      CPU_FEATURE_USABLE (AVX),
 			      __memcpy_avx_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, memcpy,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __memcpy_evex_unaligned)
+	      IFUNC_IMPL_ADD (array, i, memcpy,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __memcpy_evex_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
 			      __memcpy_ssse3_back)
 	      IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
@@ -623,6 +647,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
 			      CPU_FEATURE_USABLE (AVX),
 			      __mempcpy_chk_avx_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __mempcpy_chk_evex_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __mempcpy_chk_evex_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
 			      CPU_FEATURE_USABLE (SSSE3),
 			      __mempcpy_chk_ssse3_back)
@@ -654,6 +684,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, mempcpy,
 			      CPU_FEATURE_USABLE (AVX),
 			      __mempcpy_avx_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, mempcpy,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __mempcpy_evex_unaligned)
+	      IFUNC_IMPL_ADD (array, i, mempcpy,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __mempcpy_evex_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
 			      __mempcpy_ssse3_back)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
index 5e5f0299..6f8bce5f 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
@@ -29,6 +29,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms)
   attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
+  attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned)
   attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms)
@@ -59,10 +63,21 @@ IFUNC_SELECTOR (void)

   if (CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
     {
-      if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
-	return OPTIMIZE (avx_unaligned_erms);
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
+	{
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	    return OPTIMIZE (evex_unaligned_erms);
+
+	  return OPTIMIZE (evex_unaligned);
+	}
+
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+	{
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	    return OPTIMIZE (avx_unaligned_erms);

-      return OPTIMIZE (avx_unaligned);
+	  return OPTIMIZE (avx_unaligned);
+	}
     }

   if (!CPU_FEATURE_USABLE_P (cpu_features, SSSE3)
diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
new file mode 100644
index 00000000..0cbce8f9
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
@@ -0,0 +1,33 @@
+#if IS_IN (libc)
+# define VEC_SIZE	32
+# define XMM0		xmm16
+# define XMM1		xmm17
+# define YMM0		ymm16
+# define YMM1		ymm17
+# define VEC0		ymm16
+# define VEC1		ymm17
+# define VEC2		ymm18
+# define VEC3		ymm19
+# define VEC4		ymm20
+# define VEC5		ymm21
+# define VEC6		ymm22
+# define VEC7		ymm23
+# define VEC8		ymm24
+# define VEC9		ymm25
+# define VEC10		ymm26
+# define VEC11		ymm27
+# define VEC12		ymm28
+# define VEC13		ymm29
+# define VEC14		ymm30
+# define VEC15		ymm31
+# define VEC(i)		VEC##i
+# define VMOVNT		vmovntdq
+# define VMOVU		vmovdqu64
+# define VMOVA		vmovdqa64
+# define VZEROUPPER
+
+# define SECTION(p)		p##.evex
+# define MEMMOVE_SYMBOL(p,s)	p##_evex_##s
+
+# include "memmove-vec-unaligned-erms.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 274aa1c7..08e21692 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -48,6 +48,14 @@
 # define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
 #endif

+#ifndef XMM0
+# define XMM0	xmm0
+#endif
+
+#ifndef YMM0
+# define YMM0	ymm0
+#endif
+
 #ifndef VZEROUPPER
 # if VEC_SIZE > 16
 #  define VZEROUPPER vzeroupper
@@ -277,20 +285,20 @@ L(less_vec):
 #if VEC_SIZE > 32
 L(between_32_63):
 	/* From 32 to 63.  No branch when size == 32.  */
-	vmovdqu	(%rsi), %ymm0
-	vmovdqu	-32(%rsi,%rdx), %ymm1
-	vmovdqu	%ymm0, (%rdi)
-	vmovdqu	%ymm1, -32(%rdi,%rdx)
+	VMOVU	(%rsi), %YMM0
+	VMOVU	-32(%rsi,%rdx), %YMM1
+	VMOVU	%YMM0, (%rdi)
+	VMOVU	%YMM1, -32(%rdi,%rdx)
 	VZEROUPPER
 	ret
 #endif
 #if VEC_SIZE > 16
 	/* From 16 to 31.  No branch when size == 16.  */
 L(between_16_31):
-	vmovdqu	(%rsi), %xmm0
-	vmovdqu	-16(%rsi,%rdx), %xmm1
-	vmovdqu	%xmm0, (%rdi)
-	vmovdqu	%xmm1, -16(%rdi,%rdx)
+	VMOVU	(%rsi), %XMM0
+	VMOVU	-16(%rsi,%rdx), %XMM1
+	VMOVU	%XMM0, (%rdi)
+	VMOVU	%XMM1, -16(%rdi,%rdx)
 	ret
 #endif
 L(between_8_15):
--
GitLab
@ -1,254 +0,0 @@
From 1b968b6b9b3aac702ac2f133e0dd16cfdbb415ee Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Fri, 5 Mar 2021 07:15:03 -0800
Subject: [PATCH] x86-64: Add memset family functions with 256-bit EVEX
Content-type: text/plain; charset=UTF-8

Update ifunc-memset.h/ifunc-wmemset.h to select the function optimized
with 256-bit EVEX instructions using YMM16-YMM31 registers to avoid RTM
abort with usable AVX512VL and AVX512BW since VZEROUPPER isn't needed at
function exit.
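
For illustration only, the broadcast step at the heart of the memset
variants, written with intrinsics (assumes -mavx512vl -mavx512bw; the
MEMSET_VDUP_TO_VEC0_AND_SET_RETURN macro below does the same with
vpbroadcastb/vpbroadcastd into ymm16):

#include <immintrin.h>

/* Splat the fill byte across a 256-bit register; typically compiles
   to vpbroadcastb.  */
static __m256i
memset_splat_byte (int c)
{
  return _mm256_set1_epi8 ((char) c);
}

/* For wmemset the 4-byte wchar_t is splatted instead; typically
   vpbroadcastd.  */
static __m256i
wmemset_splat_wchar (int c)
{
  return _mm256_set1_epi32 (c);
}
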
---
 sysdeps/x86_64/multiarch/Makefile          |  1 +
 sysdeps/x86_64/multiarch/ifunc-impl-list.c | 22 +++++++++++++++++
 sysdeps/x86_64/multiarch/ifunc-memset.h    | 24 +++++++++++++++----
 sysdeps/x86_64/multiarch/ifunc-wmemset.h   | 13 ++++++----
 .../multiarch/memset-evex-unaligned-erms.S | 24 +++++++++++++++++++
 .../multiarch/memset-vec-unaligned-erms.S  | 20 +++++++++++-----
 6 files changed, 90 insertions(+), 14 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 4563fc56..1cc0a10e 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -43,6 +43,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
 		   memchr-evex \
 		   memmove-evex-unaligned-erms \
 		   memrchr-evex \
+		   memset-evex-unaligned-erms \
 		   rawmemchr-evex \
 		   stpcpy-evex \
 		   stpncpy-evex \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 6bd3abfc..7cf83485 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -160,6 +160,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
 			      CPU_FEATURE_USABLE (AVX2),
 			      __memset_chk_avx2_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __memset_chk_evex_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __memset_chk_evex_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
 			      CPU_FEATURE_USABLE (AVX512F),
 			      __memset_chk_avx512_unaligned_erms)
@@ -185,6 +193,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memset,
 			      CPU_FEATURE_USABLE (AVX2),
 			      __memset_avx2_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, memset,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __memset_evex_unaligned)
+	      IFUNC_IMPL_ADD (array, i, memset,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __memset_evex_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, memset,
 			      CPU_FEATURE_USABLE (AVX512F),
 			      __memset_avx512_unaligned_erms)
@@ -555,6 +571,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, wmemset,
 			      CPU_FEATURE_USABLE (AVX2),
 			      __wmemset_avx2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, wmemset,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __wmemset_evex_unaligned)
 	      IFUNC_IMPL_ADD (array, i, wmemset,
 			      CPU_FEATURE_USABLE (AVX512F),
 			      __wmemset_avx512_unaligned))
@@ -723,6 +742,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
 			      CPU_FEATURE_USABLE (AVX2),
 			      __wmemset_chk_avx2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __wmemset_chk_evex_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
 			      CPU_FEATURE_USABLE (AVX512F),
 			      __wmemset_chk_avx512_unaligned))
diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
index 708bd72e..6f31f4dc 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memset.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
@@ -27,6 +27,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms)
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms)
   attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
+  attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned)
   attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms)
@@ -56,10 +60,22 @@ IFUNC_SELECTOR (void)

   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
     {
-      if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
-	return OPTIMIZE (avx2_unaligned_erms);
-      else
-	return OPTIMIZE (avx2_unaligned);
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+	{
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	    return OPTIMIZE (evex_unaligned_erms);
+
+	  return OPTIMIZE (evex_unaligned);
+	}
+
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+	{
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	    return OPTIMIZE (avx2_unaligned_erms);
+
+	  return OPTIMIZE (avx2_unaligned);
+	}
     }

   if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
index eb242210..9290c4bf 100644
--- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h
+++ b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
@@ -20,6 +20,7 @@

 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden;

 static inline void *
@@ -27,14 +28,18 @@ IFUNC_SELECTOR (void)
 {
   const struct cpu_features* cpu_features = __get_cpu_features ();

-  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
-      && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
     {
       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
-	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
+	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
+	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
 	return OPTIMIZE (avx512_unaligned);
-      else
+
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
+	return OPTIMIZE (evex_unaligned);
+
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
 	return OPTIMIZE (avx2_unaligned);
     }

diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
new file mode 100644
index 00000000..ae0a4d6e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
@@ -0,0 +1,24 @@
+#if IS_IN (libc)
+# define VEC_SIZE	32
+# define XMM0		xmm16
+# define YMM0		ymm16
+# define VEC0		ymm16
+# define VEC(i)		VEC##i
+# define VMOVU		vmovdqu64
+# define VMOVA		vmovdqa64
+# define VZEROUPPER
+
+# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+  movq r, %rax; \
+  vpbroadcastb d, %VEC0
+
+# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+  movq r, %rax; \
+  vpbroadcastd d, %VEC0
+
+# define SECTION(p)		p##.evex
+# define MEMSET_SYMBOL(p,s)	p##_evex_##s
+# define WMEMSET_SYMBOL(p,s)	p##_evex_##s
+
+# include "memset-vec-unaligned-erms.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 9a0fd818..71e91a8f 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -34,6 +34,14 @@
 # define WMEMSET_CHK_SYMBOL(p,s) WMEMSET_SYMBOL(p, s)
 #endif

+#ifndef XMM0
+# define XMM0	xmm0
+#endif
+
+#ifndef YMM0
+# define YMM0	ymm0
+#endif
+
 #ifndef VZEROUPPER
 # if VEC_SIZE > 16
 #  define VZEROUPPER vzeroupper
@@ -67,7 +75,7 @@
 ENTRY (__bzero)
 	mov	%RDI_LP, %RAX_LP /* Set return value.  */
 	mov	%RSI_LP, %RDX_LP /* Set n.  */
-	pxor	%xmm0, %xmm0
+	pxor	%XMM0, %XMM0
 	jmp	L(entry_from_bzero)
 END (__bzero)
 weak_alias (__bzero, bzero)
@@ -223,7 +231,7 @@ L(less_vec):
 	cmpb	$16, %dl
 	jae	L(between_16_31)
 # endif
-	MOVQ	%xmm0, %rcx
+	MOVQ	%XMM0, %rcx
 	cmpb	$8, %dl
 	jae	L(between_8_15)
 	cmpb	$4, %dl
@@ -238,16 +246,16 @@ L(less_vec):
 # if VEC_SIZE > 32
 	/* From 32 to 63.  No branch when size == 32.  */
 L(between_32_63):
-	vmovdqu	%ymm0, -32(%rdi,%rdx)
-	vmovdqu	%ymm0, (%rdi)
+	VMOVU	%YMM0, -32(%rdi,%rdx)
+	VMOVU	%YMM0, (%rdi)
 	VZEROUPPER
 	ret
 # endif
 # if VEC_SIZE > 16
 	/* From 16 to 31.  No branch when size == 16.  */
 L(between_16_31):
-	vmovdqu	%xmm0, -16(%rdi,%rdx)
-	vmovdqu	%xmm0, (%rdi)
+	VMOVU	%XMM0, -16(%rdi,%rdx)
+	VMOVU	%XMM0, (%rdi)
 	VZEROUPPER
 	ret
 # endif
--
GitLab
@ -1,561 +0,0 @@
From 91264fe3577fe887b4860923fa6142b5274c8965 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Fri, 5 Mar 2021 07:20:28 -0800
Subject: [PATCH] x86-64: Add memcmp family functions with 256-bit EVEX
Content-type: text/plain; charset=UTF-8

Update ifunc-memcmp.h to select the function optimized with 256-bit EVEX
instructions using YMM16-YMM31 registers to avoid RTM abort with usable
AVX512VL, AVX512BW and MOVBE since VZEROUPPER isn't needed at function
exit.
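
As a sketch of the EVEX comparison style the new file uses (not the
actual implementation; assumes -mavx512vl -mavx512bw): vpcmpeqb writes
its result to a mask register, so a 32-byte compare yields a 32-bit
mask directly, with no vpmovmskb and no vzeroupper needed at exit:

#include <immintrin.h>

/* Return the index of the first differing byte in two 32-byte blocks,
   or -1 if they are equal.  */
static int
first_diff_index_32 (const void *a, const void *b)
{
  __m256i x = _mm256_loadu_si256 ((const __m256i *) a);
  __m256i y = _mm256_loadu_si256 ((const __m256i *) b);
  __mmask32 eq = _mm256_cmpeq_epi8_mask (x, y);   /* EVEX vpcmpeqb */
  unsigned int neq = ~(unsigned int) eq;          /* 1 bits mark differences */
  return neq == 0 ? -1 : __builtin_ctz (neq);
}
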
---
|
|
||||||
sysdeps/x86_64/multiarch/Makefile | 4 +-
|
|
||||||
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 10 +
|
|
||||||
sysdeps/x86_64/multiarch/ifunc-memcmp.h | 13 +-
|
|
||||||
sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 440 ++++++++++++++++++
|
|
||||||
sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S | 4 +
|
|
||||||
5 files changed, 467 insertions(+), 4 deletions(-)
|
|
||||||
create mode 100644 sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
|
|
||||||
create mode 100644 sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 1cc0a10e..9d79b138 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -41,6 +41,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
		   memset-avx2-unaligned-erms \
		   memset-avx512-unaligned-erms \
		   memchr-evex \
+		   memcmp-evex-movbe \
		   memmove-evex-unaligned-erms \
		   memrchr-evex \
		   memset-evex-unaligned-erms \
@@ -81,7 +82,8 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
		   wcsncmp-evex \
		   wcsnlen-evex \
		   wcsrchr-evex \
-		   wmemchr-evex
+		   wmemchr-evex \
+		   wmemcmp-evex-movbe
 endif

 ifeq ($(subdir),debug)
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 7cf83485..c8da910e 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -56,6 +56,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
			      (CPU_FEATURE_USABLE (AVX2)
			       && CPU_FEATURE_USABLE (MOVBE)),
			      __memcmp_avx2_movbe)
+	      IFUNC_IMPL_ADD (array, i, memcmp,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (MOVBE)),
+			      __memcmp_evex_movbe)
	      IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1),
			      __memcmp_sse4_1)
	      IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3),
@@ -558,6 +563,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
			      (CPU_FEATURE_USABLE (AVX2)
			       && CPU_FEATURE_USABLE (MOVBE)),
			      __wmemcmp_avx2_movbe)
+	      IFUNC_IMPL_ADD (array, i, wmemcmp,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (MOVBE)),
+			      __wmemcmp_evex_movbe)
	      IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1),
			      __wmemcmp_sse4_1)
	      IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3),
diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
index 6c1f3153..3ca1f0a6 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
@@ -23,17 +23,24 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden;

 static inline void *
 IFUNC_SELECTOR (void)
 {
   const struct cpu_features* cpu_features = __get_cpu_features ();

-  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
-      && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
       && CPU_FEATURE_USABLE_P (cpu_features, MOVBE)
       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
-    return OPTIMIZE (avx2_movbe);
+    {
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+	return OPTIMIZE (evex_movbe);
+
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+	return OPTIMIZE (avx2_movbe);
+    }

   if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
     return OPTIMIZE (sse4_1);
diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
new file mode 100644
index 00000000..9c093972
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
@@ -0,0 +1,440 @@
+/* memcmp/wmemcmp optimized with 256-bit EVEX instructions.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+/* memcmp/wmemcmp is implemented as:
+   1. For size from 2 to 7 bytes, load as big endian with movbe and bswap
+      to avoid branches.
+   2. Use overlapping compare to avoid branch.
+   3. Use vector compare when size >= 4 bytes for memcmp or size >= 8
+      bytes for wmemcmp.
+   4. If size is 8 * VEC_SIZE or less, unroll the loop.
+   5. Compare 4 * VEC_SIZE at a time with the aligned first memory
+      area.
+   6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
+   7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
+   8. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
+
+# include <sysdep.h>
+
+# ifndef MEMCMP
+#  define MEMCMP __memcmp_evex_movbe
+# endif
+
+# define VMOVU vmovdqu64
+
+# ifdef USE_AS_WMEMCMP
+#  define VPCMPEQ vpcmpeqd
+# else
+#  define VPCMPEQ vpcmpeqb
+# endif
+
+# define XMM1 xmm17
+# define XMM2 xmm18
+# define YMM1 ymm17
+# define YMM2 ymm18
+# define YMM3 ymm19
+# define YMM4 ymm20
+# define YMM5 ymm21
+# define YMM6 ymm22
+
+# define VEC_SIZE 32
+# ifdef USE_AS_WMEMCMP
+#  define VEC_MASK 0xff
+#  define XMM_MASK 0xf
+# else
+#  define VEC_MASK 0xffffffff
+#  define XMM_MASK 0xffff
+# endif
+
+/* Warning!
+           wmemcmp has to use SIGNED comparison for elements.
+           memcmp has to use UNSIGNED comparison for elements.
+*/
+
+	.section .text.evex,"ax",@progbits
+ENTRY (MEMCMP)
+# ifdef USE_AS_WMEMCMP
+	shl	$2, %RDX_LP
+# elif defined __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%edx, %edx
+# endif
+	cmp	$VEC_SIZE, %RDX_LP
+	jb	L(less_vec)
+
+	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
+	VMOVU	(%rsi), %YMM2
+	VPCMPEQ	(%rdi), %YMM2, %k1
+	kmovd	%k1, %eax
+	subl	$VEC_MASK, %eax
+	jnz	L(first_vec)
+
+	cmpq	$(VEC_SIZE * 2), %rdx
+	jbe	L(last_vec)
+
+	/* More than 2 * VEC.  */
+	cmpq	$(VEC_SIZE * 8), %rdx
+	ja	L(more_8x_vec)
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jb	L(last_4x_vec)
+
+	/* From 4 * VEC to 8 * VEC, inclusively.  */
+	VMOVU	(%rsi), %YMM1
+	VPCMPEQ	(%rdi), %YMM1, %k1
+
+	VMOVU	VEC_SIZE(%rsi), %YMM2
+	VPCMPEQ	VEC_SIZE(%rdi), %YMM2, %k2
+
+	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %YMM3, %k3
+
+	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %YMM4, %k4
+
+	kandd	%k1, %k2, %k5
+	kandd	%k3, %k4, %k6
+	kandd	%k5, %k6, %k6
+
+	kmovd	%k6, %eax
+	cmpl	$VEC_MASK, %eax
+	jne	L(4x_vec_end)
+
+	leaq	-(4 * VEC_SIZE)(%rdi, %rdx), %rdi
+	leaq	-(4 * VEC_SIZE)(%rsi, %rdx), %rsi
+	VMOVU	(%rsi), %YMM1
+	VPCMPEQ	(%rdi), %YMM1, %k1
+
+	VMOVU	VEC_SIZE(%rsi), %YMM2
+	VPCMPEQ	VEC_SIZE(%rdi), %YMM2, %k2
+	kandd	%k1, %k2, %k5
+
+	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %YMM3, %k3
+	kandd	%k3, %k5, %k5
+
+	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %YMM4, %k4
+	kandd	%k4, %k5, %k5
+
+	kmovd	%k5, %eax
+	cmpl	$VEC_MASK, %eax
+	jne	L(4x_vec_end)
+	xorl	%eax, %eax
+	ret
+
+	.p2align 4
+L(last_2x_vec):
+	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
+	VMOVU	(%rsi), %YMM2
+	VPCMPEQ	(%rdi), %YMM2, %k2
+	kmovd	%k2, %eax
+	subl	$VEC_MASK, %eax
+	jnz	L(first_vec)
+
+L(last_vec):
+	/* Use overlapping loads to avoid branches.  */
+	leaq	-VEC_SIZE(%rdi, %rdx), %rdi
+	leaq	-VEC_SIZE(%rsi, %rdx), %rsi
+	VMOVU	(%rsi), %YMM2
+	VPCMPEQ	(%rdi), %YMM2, %k2
+	kmovd	%k2, %eax
+	subl	$VEC_MASK, %eax
+	jnz	L(first_vec)
+	ret
+
+	.p2align 4
+L(first_vec):
+	/* A byte or int32 is different within 16 or 32 bytes.  */
+	tzcntl	%eax, %ecx
+# ifdef USE_AS_WMEMCMP
+	xorl	%eax, %eax
+	movl	(%rdi, %rcx, 4), %edx
+	cmpl	(%rsi, %rcx, 4), %edx
+L(wmemcmp_return):
+	setl	%al
+	negl	%eax
+	orl	$1, %eax
+# else
+	movzbl	(%rdi, %rcx), %eax
+	movzbl	(%rsi, %rcx), %edx
+	sub	%edx, %eax
+# endif
+	ret
+
+# ifdef USE_AS_WMEMCMP
+	.p2align 4
+L(4):
+	xorl	%eax, %eax
+	movl	(%rdi), %edx
+	cmpl	(%rsi), %edx
+	jne	L(wmemcmp_return)
+	ret
+# else
+	.p2align 4
+L(between_4_7):
+	/* Load as big endian with overlapping movbe to avoid branches.  */
+	movbe	(%rdi), %eax
+	movbe	(%rsi), %ecx
+	shlq	$32, %rax
+	shlq	$32, %rcx
+	movbe	-4(%rdi, %rdx), %edi
+	movbe	-4(%rsi, %rdx), %esi
+	orq	%rdi, %rax
+	orq	%rsi, %rcx
+	subq	%rcx, %rax
+	je	L(exit)
+	sbbl	%eax, %eax
+	orl	$1, %eax
+	ret
+
+	.p2align 4
+L(exit):
+	ret
+
+	.p2align 4
+L(between_2_3):
+	/* Load as big endian to avoid branches.  */
+	movzwl	(%rdi), %eax
+	movzwl	(%rsi), %ecx
+	shll	$8, %eax
+	shll	$8, %ecx
+	bswap	%eax
+	bswap	%ecx
+	movb	-1(%rdi, %rdx), %al
+	movb	-1(%rsi, %rdx), %cl
+	/* Subtraction is okay because the upper 8 bits are zero.  */
+	subl	%ecx, %eax
+	ret
+
+	.p2align 4
+L(1):
+	movzbl	(%rdi), %eax
+	movzbl	(%rsi), %ecx
+	subl	%ecx, %eax
+	ret
+# endif
+
+	.p2align 4
+L(zero):
+	xorl	%eax, %eax
+	ret
+
+	.p2align 4
+L(less_vec):
+# ifdef USE_AS_WMEMCMP
+	/* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes.  */
+	cmpb	$4, %dl
+	je	L(4)
+	jb	L(zero)
+# else
+	cmpb	$1, %dl
+	je	L(1)
+	jb	L(zero)
+	cmpb	$4, %dl
+	jb	L(between_2_3)
+	cmpb	$8, %dl
+	jb	L(between_4_7)
+# endif
+	cmpb	$16, %dl
+	jae	L(between_16_31)
+	/* It is between 8 and 15 bytes.  */
+	vmovq	(%rdi), %XMM1
+	vmovq	(%rsi), %XMM2
+	VPCMPEQ %XMM1, %XMM2, %k2
+	kmovw	%k2, %eax
+	subl	$XMM_MASK, %eax
+	jnz	L(first_vec)
+	/* Use overlapping loads to avoid branches.  */
+	leaq	-8(%rdi, %rdx), %rdi
+	leaq	-8(%rsi, %rdx), %rsi
+	vmovq	(%rdi), %XMM1
+	vmovq	(%rsi), %XMM2
+	VPCMPEQ %XMM1, %XMM2, %k2
+	kmovw	%k2, %eax
+	subl	$XMM_MASK, %eax
+	jnz	L(first_vec)
+	ret
+
+	.p2align 4
+L(between_16_31):
+	/* From 16 to 31 bytes.  No branch when size == 16.  */
+	VMOVU	(%rsi), %XMM2
+	VPCMPEQ	(%rdi), %XMM2, %k2
+	kmovw	%k2, %eax
+	subl	$XMM_MASK, %eax
+	jnz	L(first_vec)
+
+	/* Use overlapping loads to avoid branches.  */
+	leaq	-16(%rdi, %rdx), %rdi
+	leaq	-16(%rsi, %rdx), %rsi
+	VMOVU	(%rsi), %XMM2
+	VPCMPEQ	(%rdi), %XMM2, %k2
+	kmovw	%k2, %eax
+	subl	$XMM_MASK, %eax
+	jnz	L(first_vec)
+	ret
+
+	.p2align 4
+L(more_8x_vec):
+	/* More than 8 * VEC.  Check the first VEC.  */
+	VMOVU	(%rsi), %YMM2
+	VPCMPEQ	(%rdi), %YMM2, %k2
+	kmovd	%k2, %eax
+	subl	$VEC_MASK, %eax
+	jnz	L(first_vec)
+
+	/* Align the first memory area for aligned loads in the loop.
+	   Compute how much the first memory area is misaligned.  */
+	movq	%rdi, %rcx
+	andl	$(VEC_SIZE - 1), %ecx
+	/* Get the negative of offset for alignment.  */
+	subq	$VEC_SIZE, %rcx
+	/* Adjust the second memory area.  */
+	subq	%rcx, %rsi
+	/* Adjust the first memory area which should be aligned now.  */
+	subq	%rcx, %rdi
+	/* Adjust length.  */
+	addq	%rcx, %rdx
+
+L(loop_4x_vec):
+	/* Compare 4 * VEC at a time forward.  */
+	VMOVU	(%rsi), %YMM1
+	VPCMPEQ	(%rdi), %YMM1, %k1
+
+	VMOVU	VEC_SIZE(%rsi), %YMM2
+	VPCMPEQ	VEC_SIZE(%rdi), %YMM2, %k2
+	kandd	%k2, %k1, %k5
+
+	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %YMM3, %k3
+	kandd	%k3, %k5, %k5
+
+	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %YMM4, %k4
+	kandd	%k4, %k5, %k5
+
+	kmovd	%k5, %eax
+	cmpl	$VEC_MASK, %eax
+	jne	L(4x_vec_end)
+
+	addq	$(VEC_SIZE * 4), %rdi
+	addq	$(VEC_SIZE * 4), %rsi
+
+	subq	$(VEC_SIZE * 4), %rdx
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jae	L(loop_4x_vec)
+
+	/* Less than 4 * VEC.  */
+	cmpq	$VEC_SIZE, %rdx
+	jbe	L(last_vec)
+	cmpq	$(VEC_SIZE * 2), %rdx
+	jbe	L(last_2x_vec)
+
+L(last_4x_vec):
+	/* From 2 * VEC to 4 * VEC.  */
+	VMOVU	(%rsi), %YMM2
+	VPCMPEQ	(%rdi), %YMM2, %k2
+	kmovd	%k2, %eax
+	subl	$VEC_MASK, %eax
+	jnz	L(first_vec)
+
+	addq	$VEC_SIZE, %rdi
+	addq	$VEC_SIZE, %rsi
+	VMOVU	(%rsi), %YMM2
+	VPCMPEQ	(%rdi), %YMM2, %k2
+	kmovd	%k2, %eax
+	subl	$VEC_MASK, %eax
+	jnz	L(first_vec)
+
+	/* Use overlapping loads to avoid branches.  */
+	leaq	-(3 * VEC_SIZE)(%rdi, %rdx), %rdi
+	leaq	-(3 * VEC_SIZE)(%rsi, %rdx), %rsi
+	VMOVU	(%rsi), %YMM2
+	VPCMPEQ	(%rdi), %YMM2, %k2
+	kmovd	%k2, %eax
+	subl	$VEC_MASK, %eax
+	jnz	L(first_vec)
+
+	addq	$VEC_SIZE, %rdi
+	addq	$VEC_SIZE, %rsi
+	VMOVU	(%rsi), %YMM2
+	VPCMPEQ	(%rdi), %YMM2, %k2
+	kmovd	%k2, %eax
+	subl	$VEC_MASK, %eax
+	jnz	L(first_vec)
+	ret
+
+	.p2align 4
+L(4x_vec_end):
+	kmovd	%k1, %eax
+	subl	$VEC_MASK, %eax
+	jnz	L(first_vec)
+	kmovd	%k2, %eax
+	subl	$VEC_MASK, %eax
+	jnz	L(first_vec_x1)
+	kmovd	%k3, %eax
+	subl	$VEC_MASK, %eax
+	jnz	L(first_vec_x2)
+	kmovd	%k4, %eax
+	subl	$VEC_MASK, %eax
+	tzcntl	%eax, %ecx
+# ifdef USE_AS_WMEMCMP
+	xorl	%eax, %eax
+	movl	(VEC_SIZE * 3)(%rdi, %rcx, 4), %edx
+	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, 4), %edx
+	jmp	L(wmemcmp_return)
+# else
+	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
+	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
+	sub	%edx, %eax
+# endif
+	ret
+
+	.p2align 4
+L(first_vec_x1):
+	tzcntl	%eax, %ecx
+# ifdef USE_AS_WMEMCMP
+	xorl	%eax, %eax
+	movl	VEC_SIZE(%rdi, %rcx, 4), %edx
+	cmpl	VEC_SIZE(%rsi, %rcx, 4), %edx
+	jmp	L(wmemcmp_return)
+# else
+	movzbl	VEC_SIZE(%rdi, %rcx), %eax
+	movzbl	VEC_SIZE(%rsi, %rcx), %edx
+	sub	%edx, %eax
+# endif
+	ret
+
+	.p2align 4
+L(first_vec_x2):
+	tzcntl	%eax, %ecx
+# ifdef USE_AS_WMEMCMP
+	xorl	%eax, %eax
+	movl	(VEC_SIZE * 2)(%rdi, %rcx, 4), %edx
+	cmpl	(VEC_SIZE * 2)(%rsi, %rcx, 4), %edx
+	jmp	L(wmemcmp_return)
+# else
+	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
+	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
+	sub	%edx, %eax
+# endif
+	ret
+END (MEMCMP)
+#endif
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S
new file mode 100644
index 00000000..4726d74a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S
@@ -0,0 +1,4 @@
+#define MEMCMP __wmemcmp_evex_movbe
+#define USE_AS_WMEMCMP 1
+
+#include "memcmp-evex-movbe.S"
--
GitLab
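
The L(between_4_7) path above implements steps 1-2 of the algorithm comment in the new memcmp-evex-movbe.S. As a reading aid only, here is a hypothetical C sketch of the same trick; memcmp_4to7 and the use of __builtin_bswap32 are illustrative assumptions, not glibc code. Two overlapping 4-byte loads per buffer are converted to big endian so that one unsigned integer comparison reproduces memcmp's byte-wise ordering with no per-byte branches.

#include <stdint.h>
#include <string.h>

/* Hypothetical model of L(between_4_7) for 4 <= n <= 7: the second
   load overlaps the first, so every byte is covered exactly once in
   the big-endian comparison order.  __builtin_bswap32 compiles to
   bswap or movbe.  */
static int
memcmp_4to7 (const unsigned char *s1, const unsigned char *s2, size_t n)
{
  uint32_t a, b, c, d;
  memcpy (&a, s1, 4);
  memcpy (&b, s1 + n - 4, 4);	/* Overlaps the first load when n < 8.  */
  memcpy (&c, s2, 4);
  memcpy (&d, s2 + n - 4, 4);
  uint64_t x = ((uint64_t) __builtin_bswap32 (a) << 32) | __builtin_bswap32 (b);
  uint64_t y = ((uint64_t) __builtin_bswap32 (c) << 32) | __builtin_bswap32 (d);
  if (x == y)
    return 0;
  return x > y ? 1 : -1;
}

Calling this with n between 4 and 7 gives the same sign as memcmp (s1, s2, n); because the loads overlap, the compare itself contains no length-dependent branch.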
File diff suppressed because it is too large
Load Diff
@ -1,735 +0,0 @@
From 4bd660be40967cd69072f69ebc2ad32bfcc1f206 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Tue, 23 Feb 2021 06:33:10 -0800
Subject: [PATCH] x86: Add string/memory function tests in RTM region
Content-type: text/plain; charset=UTF-8

At function exit, AVX optimized string/memory functions have VZEROUPPER
which triggers RTM abort.  When such functions are called inside a
transactionally executing RTM region, RTM abort causes severe performance
degradation.  Add tests to verify that string/memory functions won't
cause RTM abort in RTM region.
---
 sysdeps/x86/Makefile          | 23 +++++++++++
 sysdeps/x86/tst-memchr-rtm.c  | 54 ++++++++++++++++++++++++++
 sysdeps/x86/tst-memcmp-rtm.c  | 52 +++++++++++++++++++++++++
 sysdeps/x86/tst-memmove-rtm.c | 53 ++++++++++++++++++++++++++
 sysdeps/x86/tst-memrchr-rtm.c | 54 ++++++++++++++++++++++++++
 sysdeps/x86/tst-memset-rtm.c  | 45 ++++++++++++++++++++++
 sysdeps/x86/tst-strchr-rtm.c  | 54 ++++++++++++++++++++++++++
 sysdeps/x86/tst-strcpy-rtm.c  | 53 ++++++++++++++++++++++++++
 sysdeps/x86/tst-string-rtm.h  | 72 +++++++++++++++++++++++++++++++++++
 sysdeps/x86/tst-strlen-rtm.c  | 53 ++++++++++++++++++++++++++
 sysdeps/x86/tst-strncmp-rtm.c | 52 +++++++++++++++++++++++++
 sysdeps/x86/tst-strrchr-rtm.c | 53 ++++++++++++++++++++++++++
 12 files changed, 618 insertions(+)
 create mode 100644 sysdeps/x86/tst-memchr-rtm.c
 create mode 100644 sysdeps/x86/tst-memcmp-rtm.c
 create mode 100644 sysdeps/x86/tst-memmove-rtm.c
 create mode 100644 sysdeps/x86/tst-memrchr-rtm.c
 create mode 100644 sysdeps/x86/tst-memset-rtm.c
 create mode 100644 sysdeps/x86/tst-strchr-rtm.c
 create mode 100644 sysdeps/x86/tst-strcpy-rtm.c
 create mode 100644 sysdeps/x86/tst-string-rtm.h
 create mode 100644 sysdeps/x86/tst-strlen-rtm.c
 create mode 100644 sysdeps/x86/tst-strncmp-rtm.c
 create mode 100644 sysdeps/x86/tst-strrchr-rtm.c

diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
index 59e928e9..5be71ada 100644
--- a/sysdeps/x86/Makefile
+++ b/sysdeps/x86/Makefile
@@ -17,6 +17,29 @@ endif

 ifeq ($(subdir),string)
 sysdep_routines += cacheinfo
+
+tests += \
+  tst-memchr-rtm \
+  tst-memcmp-rtm \
+  tst-memmove-rtm \
+  tst-memrchr-rtm \
+  tst-memset-rtm \
+  tst-strchr-rtm \
+  tst-strcpy-rtm \
+  tst-strlen-rtm \
+  tst-strncmp-rtm \
+  tst-strrchr-rtm
+
+CFLAGS-tst-memchr-rtm.c += -mrtm
+CFLAGS-tst-memcmp-rtm.c += -mrtm
+CFLAGS-tst-memmove-rtm.c += -mrtm
+CFLAGS-tst-memrchr-rtm.c += -mrtm
+CFLAGS-tst-memset-rtm.c += -mrtm
+CFLAGS-tst-strchr-rtm.c += -mrtm
+CFLAGS-tst-strcpy-rtm.c += -mrtm
+CFLAGS-tst-strlen-rtm.c += -mrtm
+CFLAGS-tst-strncmp-rtm.c += -mrtm
+CFLAGS-tst-strrchr-rtm.c += -mrtm
 endif

 ifneq ($(enable-cet),no)
diff --git a/sysdeps/x86/tst-memchr-rtm.c b/sysdeps/x86/tst-memchr-rtm.c
new file mode 100644
index 00000000..e4749401
--- /dev/null
+++ b/sysdeps/x86/tst-memchr-rtm.c
@@ -0,0 +1,54 @@
+/* Test case for memchr inside a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+  memset (string1, 'a', STRING_SIZE);
+  string1[100] = 'c';
+  string1[STRING_SIZE - 100] = 'c';
+  char *p = memchr (string1, 'c', STRING_SIZE);
+  if (p == &string1[100])
+    return EXIT_SUCCESS;
+  else
+    return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+  char *p = memchr (string1, 'c', STRING_SIZE);
+  if (p == &string1[100])
+    return 0;
+  else
+    return 1;
+}
+
+static int
+do_test (void)
+{
+  return do_test_1 ("memchr", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-memcmp-rtm.c b/sysdeps/x86/tst-memcmp-rtm.c
new file mode 100644
index 00000000..e4c8a623
--- /dev/null
+++ b/sysdeps/x86/tst-memcmp-rtm.c
@@ -0,0 +1,52 @@
+/* Test case for memcmp inside a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+char string2[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+  memset (string1, 'a', STRING_SIZE);
+  memset (string2, 'a', STRING_SIZE);
+  if (memcmp (string1, string2, STRING_SIZE) == 0)
+    return EXIT_SUCCESS;
+  else
+    return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+  if (memcmp (string1, string2, STRING_SIZE) == 0)
+    return 0;
+  else
+    return 1;
+}
+
+static int
+do_test (void)
+{
+  return do_test_1 ("memcmp", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-memmove-rtm.c b/sysdeps/x86/tst-memmove-rtm.c
new file mode 100644
index 00000000..4bf97ef1
--- /dev/null
+++ b/sysdeps/x86/tst-memmove-rtm.c
@@ -0,0 +1,53 @@
+/* Test case for memmove inside a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+char string2[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+  memset (string1, 'a', STRING_SIZE);
+  if (memmove (string2, string1, STRING_SIZE) == string2
+      && memcmp (string2, string1, STRING_SIZE) == 0)
+    return EXIT_SUCCESS;
+  else
+    return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+  if (memmove (string2, string1, STRING_SIZE) == string2
+      && memcmp (string2, string1, STRING_SIZE) == 0)
+    return 0;
+  else
+    return 1;
+}
+
+static int
+do_test (void)
+{
+  return do_test_1 ("memmove", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-memrchr-rtm.c b/sysdeps/x86/tst-memrchr-rtm.c
new file mode 100644
index 00000000..a57a5a8e
--- /dev/null
+++ b/sysdeps/x86/tst-memrchr-rtm.c
@@ -0,0 +1,54 @@
+/* Test case for memrchr inside a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+  memset (string1, 'a', STRING_SIZE);
+  string1[100] = 'c';
+  string1[STRING_SIZE - 100] = 'c';
+  char *p = memrchr (string1, 'c', STRING_SIZE);
+  if (p == &string1[STRING_SIZE - 100])
+    return EXIT_SUCCESS;
+  else
+    return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+  char *p = memrchr (string1, 'c', STRING_SIZE);
+  if (p == &string1[STRING_SIZE - 100])
+    return 0;
+  else
+    return 1;
+}
+
+static int
+do_test (void)
+{
+  return do_test_1 ("memrchr", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-memset-rtm.c b/sysdeps/x86/tst-memset-rtm.c
new file mode 100644
index 00000000..bf343a4d
--- /dev/null
+++ b/sysdeps/x86/tst-memset-rtm.c
@@ -0,0 +1,45 @@
+/* Test case for memset inside a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+  memset (string1, 'a', STRING_SIZE);
+  return EXIT_SUCCESS;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+  memset (string1, 'a', STRING_SIZE);
+  return 0;
+}
+
+static int
+do_test (void)
+{
+  return do_test_1 ("memset", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-strchr-rtm.c b/sysdeps/x86/tst-strchr-rtm.c
new file mode 100644
index 00000000..a82e29c0
--- /dev/null
+++ b/sysdeps/x86/tst-strchr-rtm.c
@@ -0,0 +1,54 @@
+/* Test case for strchr inside a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+  memset (string1, 'a', STRING_SIZE - 1);
+  string1[100] = 'c';
+  string1[STRING_SIZE - 100] = 'c';
+  char *p = strchr (string1, 'c');
+  if (p == &string1[100])
+    return EXIT_SUCCESS;
+  else
+    return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+  char *p = strchr (string1, 'c');
+  if (p == &string1[100])
+    return 0;
+  else
+    return 1;
+}
+
+static int
+do_test (void)
+{
+  return do_test_1 ("strchr", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-strcpy-rtm.c b/sysdeps/x86/tst-strcpy-rtm.c
new file mode 100644
index 00000000..2b2a583f
--- /dev/null
+++ b/sysdeps/x86/tst-strcpy-rtm.c
@@ -0,0 +1,53 @@
+/* Test case for strcpy inside a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+char string2[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+  memset (string1, 'a', STRING_SIZE - 1);
+  if (strcpy (string2, string1) == string2
+      && strcmp (string2, string1) == 0)
+    return EXIT_SUCCESS;
+  else
+    return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+  if (strcpy (string2, string1) == string2
+      && strcmp (string2, string1) == 0)
+    return 0;
+  else
+    return 1;
+}
+
+static int
+do_test (void)
+{
+  return do_test_1 ("strcpy", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-string-rtm.h b/sysdeps/x86/tst-string-rtm.h
new file mode 100644
index 00000000..d2470afa
--- /dev/null
+++ b/sysdeps/x86/tst-string-rtm.h
@@ -0,0 +1,72 @@
+/* Test string function in a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <string.h>
+#include <x86intrin.h>
+#include <sys/platform/x86.h>
+#include <support/check.h>
+#include <support/test-driver.h>
+
+static int
+do_test_1 (const char *name, unsigned int loop, int (*prepare) (void),
+	   int (*function) (void))
+{
+  if (!CPU_FEATURE_USABLE (RTM))
+    return EXIT_UNSUPPORTED;
+
+  int status = prepare ();
+  if (status != EXIT_SUCCESS)
+    return status;
+
+  unsigned int i;
+  unsigned int naborts = 0;
+  unsigned int failed = 0;
+  for (i = 0; i < loop; i++)
+    {
+      failed |= function ();
+      if (_xbegin() == _XBEGIN_STARTED)
+	{
+	  failed |= function ();
+	  _xend();
+	}
+      else
+	{
+	  failed |= function ();
+	  ++naborts;
+	}
+    }
+
+  if (failed)
+    FAIL_EXIT1 ("%s() failed", name);
+
+  if (naborts)
+    {
+      /* NB: Low single digit (<= 5%) noise-level aborts are normal for
+	 TSX.  */
+      double rate = 100 * ((double) naborts) / ((double) loop);
+      if (rate > 5)
+	FAIL_EXIT1 ("TSX abort rate: %.2f%% (%d out of %d)",
+		    rate, naborts, loop);
+    }
+
+  return EXIT_SUCCESS;
+}
+
+static int do_test (void);
+
+#include <support/test-driver.c>
diff --git a/sysdeps/x86/tst-strlen-rtm.c b/sysdeps/x86/tst-strlen-rtm.c
new file mode 100644
index 00000000..0dcf14db
--- /dev/null
+++ b/sysdeps/x86/tst-strlen-rtm.c
@@ -0,0 +1,53 @@
+/* Test case for strlen inside a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+  memset (string1, 'a', STRING_SIZE - 1);
+  string1[STRING_SIZE - 100] = '\0';
+  size_t len = strlen (string1);
+  if (len == STRING_SIZE - 100)
+    return EXIT_SUCCESS;
+  else
+    return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+  size_t len = strlen (string1);
+  if (len == STRING_SIZE - 100)
+    return 0;
+  else
+    return 1;
+}
+
+static int
+do_test (void)
+{
+  return do_test_1 ("strlen", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
new file mode 100644
index 00000000..236ad951
--- /dev/null
+++ b/sysdeps/x86/tst-strncmp-rtm.c
@@ -0,0 +1,52 @@
+/* Test case for strncmp inside a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+char string2[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+  memset (string1, 'a', STRING_SIZE - 1);
+  memset (string2, 'a', STRING_SIZE - 1);
+  if (strncmp (string1, string2, STRING_SIZE) == 0)
+    return EXIT_SUCCESS;
+  else
+    return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+  if (strncmp (string1, string2, STRING_SIZE) == 0)
+    return 0;
+  else
+    return 1;
+}
+
+static int
+do_test (void)
+{
+  return do_test_1 ("strncmp", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-strrchr-rtm.c b/sysdeps/x86/tst-strrchr-rtm.c
new file mode 100644
index 00000000..e32bfaf5
--- /dev/null
+++ b/sysdeps/x86/tst-strrchr-rtm.c
@@ -0,0 +1,53 @@
+/* Test case for strrchr inside a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+  memset (string1, 'a', STRING_SIZE - 1);
+  string1[STRING_SIZE - 100] = 'c';
+  char *p = strrchr (string1, 'c');
+  if (p == &string1[STRING_SIZE - 100])
+    return EXIT_SUCCESS;
+  else
+    return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+  char *p = strrchr (string1, 'c');
+  if (p == &string1[STRING_SIZE - 100])
+    return 0;
+  else
+    return 1;
+}
+
+static int
+do_test (void)
+{
+  return do_test_1 ("strrchr", LOOP, prepare, function);
+}
--
GitLab
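
The core of the patch's measurement is the do_test_1 loop above. For readers who want to try it outside the glibc test driver, here is a minimal standalone sketch. It assumes an RTM-capable CPU and compilation with -mrtm; main, the buffer names, and the 3000-iteration count mirror the test's shape but are illustrative choices, not part of the patch.

#include <x86intrin.h>
#include <stdio.h>
#include <string.h>

/* Standalone sketch: run memcmp inside an RTM transaction and count
   aborts.  An implementation that executes VZEROUPPER (or anything
   else that aborts transactions) shows up as a high abort rate.  */
int
main (void)
{
  static char a[1024], b[1024];
  unsigned int aborts = 0, loops = 3000;
  memset (a, 'x', sizeof a);
  memset (b, 'x', sizeof b);
  for (unsigned int i = 0; i < loops; i++)
    {
      if (_xbegin () == _XBEGIN_STARTED)
	{
	  if (memcmp (a, b, sizeof a) != 0)	/* Buffers are equal.  */
	    _xabort (1);
	  _xend ();
	}
      else
	++aborts;
    }
  printf ("TSX abort rate: %.2f%% (%u of %u)\n",
	  100.0 * aborts / loops, aborts, loops);
  return 0;
}

As the header's comment notes, a low single-digit abort rate is normal background noise for TSX; the test only fails above 5%.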
@ -1,148 +0,0 @@
From 4e2d8f352774b56078c34648b14a2412c38384f4 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Sun, 7 Mar 2021 09:44:18 -0800
Subject: [PATCH] x86-64: Use ZMM16-ZMM31 in AVX512 memset family functions
Content-type: text/plain; charset=UTF-8

Update ifunc-memset.h/ifunc-wmemset.h to select the function optimized
with AVX512 instructions using ZMM16-ZMM31 registers to avoid RTM abort
with usable AVX512VL and AVX512BW since VZEROUPPER isn't needed at
function exit.
---
 sysdeps/x86_64/multiarch/ifunc-impl-list.c   | 14 +++++++++-----
 sysdeps/x86_64/multiarch/ifunc-memset.h      | 13 ++++++++-----
 sysdeps/x86_64/multiarch/ifunc-wmemset.h     | 12 ++++++------
 .../multiarch/memset-avx512-unaligned-erms.S | 16 ++++++++--------
 4 files changed, 31 insertions(+), 24 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index c1efeec0..d969a156 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -211,10 +211,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
			       && CPU_FEATURE_USABLE (AVX512BW)),
			      __memset_chk_evex_unaligned_erms)
	      IFUNC_IMPL_ADD (array, i, __memset_chk,
-			      CPU_FEATURE_USABLE (AVX512F),
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
			      __memset_chk_avx512_unaligned_erms)
	      IFUNC_IMPL_ADD (array, i, __memset_chk,
-			      CPU_FEATURE_USABLE (AVX512F),
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
			      __memset_chk_avx512_unaligned)
	      IFUNC_IMPL_ADD (array, i, __memset_chk,
			      CPU_FEATURE_USABLE (AVX512F),
@@ -252,10 +254,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
			       && CPU_FEATURE_USABLE (AVX512BW)),
			      __memset_evex_unaligned_erms)
	      IFUNC_IMPL_ADD (array, i, memset,
-			      CPU_FEATURE_USABLE (AVX512F),
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
			      __memset_avx512_unaligned_erms)
	      IFUNC_IMPL_ADD (array, i, memset,
-			      CPU_FEATURE_USABLE (AVX512F),
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
			      __memset_avx512_unaligned)
	      IFUNC_IMPL_ADD (array, i, memset,
			      CPU_FEATURE_USABLE (AVX512F),
@@ -719,7 +723,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
			      CPU_FEATURE_USABLE (AVX512VL),
			      __wmemset_evex_unaligned)
	      IFUNC_IMPL_ADD (array, i, wmemset,
-			      CPU_FEATURE_USABLE (AVX512F),
+			      CPU_FEATURE_USABLE (AVX512VL),
			      __wmemset_avx512_unaligned))

 #ifdef SHARED
diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
index 6f3375cc..19795938 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memset.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
@@ -53,13 +53,16 @@ IFUNC_SELECTOR (void)
   if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
       && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
     {
-      if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
-	return OPTIMIZE (avx512_no_vzeroupper);
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+	{
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	    return OPTIMIZE (avx512_unaligned_erms);

-      if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
-	return OPTIMIZE (avx512_unaligned_erms);
+	  return OPTIMIZE (avx512_unaligned);
+	}

-      return OPTIMIZE (avx512_unaligned);
+      return OPTIMIZE (avx512_no_vzeroupper);
     }

   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
index bdc94c6c..98c5d406 100644
--- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h
+++ b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
@@ -33,13 +33,13 @@ IFUNC_SELECTOR (void)
   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
     {
-      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
-	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
-	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
-	return OPTIMIZE (avx512_unaligned);
-
       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
-	return OPTIMIZE (evex_unaligned);
+	{
+	  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
+	    return OPTIMIZE (avx512_unaligned);
+
+	  return OPTIMIZE (evex_unaligned);
+	}

       if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
	return OPTIMIZE (avx2_unaligned_rtm);
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
index 0783979c..22e7b187 100644
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -1,22 +1,22 @@
 #if IS_IN (libc)
 # define VEC_SIZE 64
-# define VEC(i) zmm##i
+# define XMM0 xmm16
+# define YMM0 ymm16
+# define VEC0 zmm16
+# define VEC(i) VEC##i
 # define VMOVU vmovdqu64
 # define VMOVA vmovdqa64
+# define VZEROUPPER

 # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  vmovd d, %xmm0; \
   movq r, %rax; \
-  vpbroadcastb %xmm0, %xmm0; \
-  vpbroadcastq %xmm0, %zmm0
+  vpbroadcastb d, %VEC0

 # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  vmovd d, %xmm0; \
   movq r, %rax; \
-  vpbroadcastd %xmm0, %xmm0; \
-  vpbroadcastq %xmm0, %zmm0
+  vpbroadcastd d, %VEC0

-# define SECTION(p) p##.avx512
+# define SECTION(p) p##.evex512
 # define MEMSET_SYMBOL(p,s) p##_avx512_##s
 # define WMEMSET_SYMBOL(p,s) p##_avx512_##s

--
GitLab
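
The reason ZMM16-ZMM31 help is that these registers have no VEX encoding, so touching them never sets the dirty upper-register state that VZEROUPPER exists to clear; the function can return without it and stay RTM-safe. A hypothetical GCC inline-asm sketch of the idea follows (fill64_evex is an illustration, not glibc code; build with -mavx512vl -mavx512bw on an AVX512 machine):

#include <stddef.h>

/* Broadcast a byte through zmm16 and store 64 bytes, then return
   without VZEROUPPER.  Because only the EVEX-only register zmm16 is
   used, no AVX/SSE transition penalty or RTM abort is triggered,
   which is the property the patch relies on.  */
static void
fill64_evex (void *dst, int byte)
{
  __asm__ ("vpbroadcastb %k1, %%zmm16\n\t"
	   "vmovdqu64 %%zmm16, %0"
	   : "=m" (*(char (*)[64]) dst)
	   : "r" (byte)
	   : "xmm16");
}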
@ -1,230 +0,0 @@
From b304fc201d2f6baf52ea790df8643e99772243cd Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Mon, 21 Jan 2019 11:25:56 -0800
Subject: [PATCH] x86-64 memcmp/wmemcmp: Properly handle the length parameter
 [BZ# 24097]
Content-type: text/plain; charset=UTF-8

On x32, the size_t parameter may be passed in the lower 32 bits of a
64-bit register with the non-zero upper 32 bits.  The string/memory
functions written in assembly can only use the lower 32 bits of a
64-bit register as length or must clear the upper 32 bits before using
the full 64-bit register for length.
This patch fixes memcmp/wmemcmp for x32.  Tested on x86-64 and x32.  On
x86-64, libc.so is the same with and without the fix.

	[BZ# 24097]
	CVE-2019-6488
	* sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S: Use RDX_LP for
	length.  Clear the upper 32 bits of RDX register.
	* sysdeps/x86_64/multiarch/memcmp-sse4.S: Likewise.
	* sysdeps/x86_64/multiarch/memcmp-ssse3.S: Likewise.
	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcmp and
	tst-size_t-wmemcmp.
	* sysdeps/x86_64/x32/tst-size_t-memcmp.c: New file.
	* sysdeps/x86_64/x32/tst-size_t-wmemcmp.c: Likewise.
---
 sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S |  7 +-
 sysdeps/x86_64/multiarch/memcmp-sse4.S       |  9 ++-
 sysdeps/x86_64/multiarch/memcmp-ssse3.S      |  7 +-
 sysdeps/x86_64/x32/Makefile                  |  4 +-
 sysdeps/x86_64/x32/tst-size_t-memcmp.c       | 76 ++++++++++++++++++++
 sysdeps/x86_64/x32/tst-size_t-wmemcmp.c      | 20 ++++++
 6 files changed, 114 insertions(+), 9 deletions(-)
 create mode 100644 sysdeps/x86_64/x32/tst-size_t-memcmp.c
 create mode 100644 sysdeps/x86_64/x32/tst-size_t-wmemcmp.c

Conflicts:
	ChangeLog
	  (removed)
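
The one-instruction fix in the hunks below relies on a general x86-64 rule: writing a 32-bit register zero-extends into the full 64-bit register, so `movl %edx, %edx` discards the undefined upper half of the length. A hypothetical C equivalent of that sanitizing idiom (sanitize_x32_length is illustrative, not from the patch):

#include <stdint.h>

/* Truncate an incoming 64-bit register value to its defined low
   32 bits, as the x32 fix does.  The cast typically compiles to
   `movl %edx, %edx`, which zero-extends.  */
static inline uint64_t
sanitize_x32_length (uint64_t rdx)
{
  return (uint32_t) rdx;
}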

diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
index 30f764c3..e3a35b89 100644
--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
@@ -58,9 +58,12 @@
	.section .text.avx,"ax",@progbits
 ENTRY (MEMCMP)
 # ifdef USE_AS_WMEMCMP
-	shl	$2, %rdx
+	shl	$2, %RDX_LP
+# elif defined __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%edx, %edx
 # endif
-	cmpq	$VEC_SIZE, %rdx
+	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)

	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
index 8e164f2c..302900f5 100644
--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
+++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S
@@ -42,13 +42,16 @@
	.section .text.sse4.1,"ax",@progbits
 ENTRY (MEMCMP)
 # ifdef USE_AS_WMEMCMP
-	shl	$2, %rdx
+	shl	$2, %RDX_LP
+# elif defined __ILP32__
+	/* Clear the upper 32 bits.  */
+	mov	%edx, %edx
 # endif
	pxor	%xmm0, %xmm0
-	cmp	$79, %rdx
+	cmp	$79, %RDX_LP
	ja	L(79bytesormore)
 # ifndef USE_AS_WMEMCMP
-	cmp	$1, %rdx
+	cmp	$1, %RDX_LP
	je	L(firstbyte)
 # endif
	add	%rdx, %rsi
diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
index 6f76c641..69d030fc 100644
--- a/sysdeps/x86_64/multiarch/memcmp-ssse3.S
+++ b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
@@ -33,9 +33,12 @@
	atom_text_section
 ENTRY (MEMCMP)
 # ifdef USE_AS_WMEMCMP
-	shl	$2, %rdx
-	test	%rdx, %rdx
+	shl	$2, %RDX_LP
+	test	%RDX_LP, %RDX_LP
	jz	L(equal)
+# elif defined __ILP32__
+	/* Clear the upper 32 bits.  */
+	mov	%edx, %edx
 # endif
	mov	%rdx, %rcx
	mov	%rdi, %rdx
diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
index 7d528889..ddec7f04 100644
--- a/sysdeps/x86_64/x32/Makefile
+++ b/sysdeps/x86_64/x32/Makefile
@@ -6,9 +6,9 @@ CFLAGS-s_llround.c += -fno-builtin-lround
 endif

 ifeq ($(subdir),string)
-tests += tst-size_t-memchr
+tests += tst-size_t-memchr tst-size_t-memcmp
 endif

 ifeq ($(subdir),wcsmbs)
-tests += tst-size_t-wmemchr
+tests += tst-size_t-wmemchr tst-size_t-wmemcmp
 endif
diff --git a/sysdeps/x86_64/x32/tst-size_t-memcmp.c b/sysdeps/x86_64/x32/tst-size_t-memcmp.c
new file mode 100644
index 00000000..9bd6fdb4
--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-memcmp.c
@@ -0,0 +1,76 @@
+/* Test memcmp with size_t in the lower 32 bits of 64-bit register.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define TEST_MAIN
+#ifdef WIDE
+# define TEST_NAME "wmemcmp"
+#else
+# define TEST_NAME "memcmp"
+#endif
+
+#include "test-size_t.h"
+
+#ifdef WIDE
+# include <inttypes.h>
+# include <wchar.h>
+
+# define MEMCMP wmemcmp
+# define CHAR wchar_t
+#else
+# define MEMCMP memcmp
+# define CHAR char
+#endif
+
+IMPL (MEMCMP, 1)
+
+typedef int (*proto_t) (const CHAR *, const CHAR *, size_t);
+
+static int
+__attribute__ ((noinline, noclone))
+do_memcmp (parameter_t a, parameter_t b)
+{
+  return CALL (&b, a.p, b.p, a.len);
+}
+
+static int
+test_main (void)
+{
+  test_init ();
+
+  parameter_t dest = { { page_size / sizeof (CHAR) }, buf1 };
+  parameter_t src = { { 0 }, buf2 };
+
+  memcpy (buf1, buf2, page_size);
+
+  int ret = 0;
+  FOR_EACH_IMPL (impl, 0)
+    {
+      src.fn = impl->fn;
+      int res = do_memcmp (dest, src);
+      if (res)
+	{
+	  error (0, 0, "Wrong result in function %s: %i != 0",
+		 impl->name, res);
+	  ret = 1;
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ return ret ? EXIT_FAILURE : EXIT_SUCCESS;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+#include <support/test-driver.c>
|
|
||||||
diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c b/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..e8b5ffd0
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c
|
|
||||||
@@ -0,0 +1,20 @@
|
|
||||||
+/* Test wmemcmp with size_t in the lower 32 bits of 64-bit register.
|
|
||||||
+ Copyright (C) 2019 Free Software Foundation, Inc.
|
|
||||||
+ This file is part of the GNU C Library.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is free software; you can redistribute it and/or
|
|
||||||
+ modify it under the terms of the GNU Lesser General Public
|
|
||||||
+ License as published by the Free Software Foundation; either
|
|
||||||
+ version 2.1 of the License, or (at your option) any later version.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
||||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
+ Lesser General Public License for more details.
|
|
||||||
+
|
|
||||||
+ You should have received a copy of the GNU Lesser General Public
|
|
||||||
+ License along with the GNU C Library; if not, see
|
|
||||||
+ <http://www.gnu.org/licenses/>. */
|
|
||||||
+
|
|
||||||
+#define WIDE 1
|
|
||||||
+#include "tst-size_t-memcmp.c"
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
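For context on what these x32 tests exercise: under the ILP32 x32 ABI, size_t is 32 bits but is passed in a 64-bit register, and the ABI does not guarantee that the register's upper half is zero. Below is a minimal standalone sketch of the failure mode; it is hypothetical and not the glibc harness (which packs the length into a parameter struct in test-size_t.h), but it uses the same trick of calling memcmp through a mismatched, 64-bit-length pointer type so the register carries junk that a fixed x32 memcmp must ignore.

/* Hypothetical sketch; only meaningful when built for the x32 ABI
   (gcc -mx32).  The pointer cast is deliberate type punning, as in
   the real tests.  */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef int (*memcmp64_t) (const void *, const void *, uint64_t);

int
main (void)
{
  char a[16] = "equal", b[16] = "equal";
  memcmp64_t fn = (memcmp64_t) memcmp;
  /* Real length in the low 32 bits, garbage in the high 32 bits.
     A fixed memcmp compares 16 bytes and returns 0; one that trusts
     the full register would walk far past the buffers.  */
  uint64_t len = (UINT64_C (0xdeadbeef) << 32) | sizeof a;
  int r = fn (a, b, len);
  printf ("memcmp returned %d\n", r);
  return r != 0;
}

The `movl %edx, %edx` added above is the whole fix: a 32-bit move zero-extends into the full 64-bit register, discarding exactly that garbage.
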
@ -1,164 +0,0 @@
From e4fda4631017e49d4ee5a2755db34289b6860fa4 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Sun, 7 Mar 2021 09:45:23 -0800
Subject: [PATCH] x86-64: Use ZMM16-ZMM31 in AVX512 memmove family functions
Content-type: text/plain; charset=UTF-8

Update ifunc-memmove.h to select the function optimized with AVX512
instructions using ZMM16-ZMM31 registers to avoid RTM abort with usable
AVX512VL since VZEROUPPER isn't needed at function exit.
---
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 24 +++++++++---------
sysdeps/x86_64/multiarch/ifunc-memmove.h | 12 +++++----
.../multiarch/memmove-avx512-unaligned-erms.S | 25 +++++++++++++++++--
3 files changed, 42 insertions(+), 19 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index d969a156..fec384f6 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -83,10 +83,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
CPU_FEATURE_USABLE (AVX512F),
__memmove_chk_avx512_no_vzeroupper)
IFUNC_IMPL_ADD (array, i, __memmove_chk,
- CPU_FEATURE_USABLE (AVX512F),
+ CPU_FEATURE_USABLE (AVX512VL),
__memmove_chk_avx512_unaligned)
IFUNC_IMPL_ADD (array, i, __memmove_chk,
- CPU_FEATURE_USABLE (AVX512F),
+ CPU_FEATURE_USABLE (AVX512VL),
__memmove_chk_avx512_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __memmove_chk,
CPU_FEATURE_USABLE (AVX),
@@ -148,10 +148,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
CPU_FEATURE_USABLE (AVX512F),
__memmove_avx512_no_vzeroupper)
IFUNC_IMPL_ADD (array, i, memmove,
- CPU_FEATURE_USABLE (AVX512F),
+ CPU_FEATURE_USABLE (AVX512VL),
__memmove_avx512_unaligned)
IFUNC_IMPL_ADD (array, i, memmove,
- CPU_FEATURE_USABLE (AVX512F),
+ CPU_FEATURE_USABLE (AVX512VL),
__memmove_avx512_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3),
__memmove_ssse3_back)
@@ -733,10 +733,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
CPU_FEATURE_USABLE (AVX512F),
__memcpy_chk_avx512_no_vzeroupper)
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
- CPU_FEATURE_USABLE (AVX512F),
+ CPU_FEATURE_USABLE (AVX512VL),
__memcpy_chk_avx512_unaligned)
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
- CPU_FEATURE_USABLE (AVX512F),
+ CPU_FEATURE_USABLE (AVX512VL),
__memcpy_chk_avx512_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
CPU_FEATURE_USABLE (AVX),
@@ -802,10 +802,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
CPU_FEATURE_USABLE (AVX512F),
__memcpy_avx512_no_vzeroupper)
IFUNC_IMPL_ADD (array, i, memcpy,
- CPU_FEATURE_USABLE (AVX512F),
+ CPU_FEATURE_USABLE (AVX512VL),
__memcpy_avx512_unaligned)
IFUNC_IMPL_ADD (array, i, memcpy,
- CPU_FEATURE_USABLE (AVX512F),
+ CPU_FEATURE_USABLE (AVX512VL),
__memcpy_avx512_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, memcpy, 1,
@@ -819,10 +819,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
CPU_FEATURE_USABLE (AVX512F),
__mempcpy_chk_avx512_no_vzeroupper)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
- CPU_FEATURE_USABLE (AVX512F),
+ CPU_FEATURE_USABLE (AVX512VL),
__mempcpy_chk_avx512_unaligned)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
- CPU_FEATURE_USABLE (AVX512F),
+ CPU_FEATURE_USABLE (AVX512VL),
__mempcpy_chk_avx512_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
CPU_FEATURE_USABLE (AVX),
@@ -864,10 +864,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
CPU_FEATURE_USABLE (AVX512F),
__mempcpy_avx512_no_vzeroupper)
IFUNC_IMPL_ADD (array, i, mempcpy,
- CPU_FEATURE_USABLE (AVX512F),
+ CPU_FEATURE_USABLE (AVX512VL),
__mempcpy_avx512_unaligned)
IFUNC_IMPL_ADD (array, i, mempcpy,
- CPU_FEATURE_USABLE (AVX512F),
+ CPU_FEATURE_USABLE (AVX512VL),
__mempcpy_avx512_unaligned_erms)
IFUNC_IMPL_ADD (array, i, mempcpy,
CPU_FEATURE_USABLE (AVX),
diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
index fa09b9fb..014e95c7 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
@@ -56,13 +56,15 @@ IFUNC_SELECTOR (void)
if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
&& !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
{
- if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
- return OPTIMIZE (avx512_no_vzeroupper);
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
+ {
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ return OPTIMIZE (avx512_unaligned_erms);

- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
- return OPTIMIZE (avx512_unaligned_erms);
+ return OPTIMIZE (avx512_unaligned);
+ }

- return OPTIMIZE (avx512_unaligned);
+ return OPTIMIZE (avx512_no_vzeroupper);
}

if (CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
index aac1515c..848848ab 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
@@ -1,11 +1,32 @@
#if IS_IN (libc)
# define VEC_SIZE 64
-# define VEC(i) zmm##i
+# define XMM0 xmm16
+# define XMM1 xmm17
+# define YMM0 ymm16
+# define YMM1 ymm17
+# define VEC0 zmm16
+# define VEC1 zmm17
+# define VEC2 zmm18
+# define VEC3 zmm19
+# define VEC4 zmm20
+# define VEC5 zmm21
+# define VEC6 zmm22
+# define VEC7 zmm23
+# define VEC8 zmm24
+# define VEC9 zmm25
+# define VEC10 zmm26
+# define VEC11 zmm27
+# define VEC12 zmm28
+# define VEC13 zmm29
+# define VEC14 zmm30
+# define VEC15 zmm31
+# define VEC(i) VEC##i
# define VMOVNT vmovntdq
# define VMOVU vmovdqu64
# define VMOVA vmovdqa64
+# define VZEROUPPER

-# define SECTION(p) p##.avx512
+# define SECTION(p) p##.evex512
# define MEMMOVE_SYMBOL(p,s) p##_avx512_##s

# include "memmove-vec-unaligned-erms.S"
--
GitLab

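The reason AVX512VL gates the zmm16-zmm31 variants: only the EVEX encoding can address registers above zmm15, and code that never touches the upper halves of the legacy ymm0-ymm15 registers needs no VZEROUPPER on exit, which is the instruction that aborts RTM transactions. A rough C sketch of the resulting selection order follows; it uses GCC's real __builtin_cpu_supports, but the mm_* functions are hypothetical stand-ins (in glibc these are separate assembly builds), and the actual selector's ERMS check is omitted because __builtin_cpu_supports has no name for it.

/* Illustrative sketch of the new selector ordering, not the glibc code.  */
#include <stddef.h>
#include <stdio.h>
#include <string.h>

typedef void *(*memmove_fn) (void *, const void *, size_t);

/* Hypothetical stand-ins for the assembly implementations.  */
static void *mm_avx512_unaligned_erms (void *d, const void *s, size_t n)
{ return memmove (d, s, n); }
static void *mm_avx512_no_vzeroupper (void *d, const void *s, size_t n)
{ return memmove (d, s, n); }
static void *mm_fallback (void *d, const void *s, size_t n)
{ return memmove (d, s, n); }

static memmove_fn
select_memmove (void)
{
  if (__builtin_cpu_supports ("avx512f"))
    {
      /* With AVX512VL, the EVEX-encoded variant can keep all of its
         state in zmm16-zmm31, so no VZEROUPPER is needed and the
         function is safe inside RTM transactions.  */
      if (__builtin_cpu_supports ("avx512vl"))
        return mm_avx512_unaligned_erms;
      /* Otherwise fall back to the AVX512F no-vzeroupper variant.  */
      return mm_avx512_no_vzeroupper;
    }
  return mm_fallback;
}

int
main (void)
{
  char src[8] = "abcdefg", dst[8];
  select_memmove () (dst, src, sizeof src);
  puts (dst);
  return 0;
}
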
@ -1,71 +0,0 @@
From 595c22ecd8e87a27fd19270ed30fdbae9ad25426 Mon Sep 17 00:00:00 2001
From: Sunil K Pandey <skpgkp2@gmail.com>
Date: Thu, 1 Apr 2021 15:47:04 -0700
Subject: [PATCH] x86-64: Fix ifdef indentation in strlen-evex.S
Content-type: text/plain; charset=UTF-8

Fix some ifdef indentation in strlen-evex.S that is off by one
and confusing to read.
---
sysdeps/x86_64/multiarch/strlen-evex.S | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S
index cd022509..05838190 100644
--- a/sysdeps/x86_64/multiarch/strlen-evex.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex.S
@@ -276,10 +276,10 @@ L(last_2x_vec):
.p2align 4
L(first_vec_x0_check):
tzcntl %eax, %eax
-# ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
sall $2, %eax
-# endif
+# endif
/* Check the end of data. */
cmpq %rax, %rsi
jbe L(max)
@@ -293,10 +293,10 @@ L(first_vec_x0_check):
.p2align 4
L(first_vec_x1_check):
tzcntl %eax, %eax
-# ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
sall $2, %eax
-# endif
+# endif
/* Check the end of data. */
cmpq %rax, %rsi
jbe L(max)
@@ -311,10 +311,10 @@ L(first_vec_x1_check):
.p2align 4
L(first_vec_x2_check):
tzcntl %eax, %eax
-# ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
sall $2, %eax
-# endif
+# endif
/* Check the end of data. */
cmpq %rax, %rsi
jbe L(max)
@@ -329,10 +329,10 @@ L(first_vec_x2_check):
.p2align 4
L(first_vec_x3_check):
tzcntl %eax, %eax
-# ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
sall $2, %eax
-# endif
+# endif
/* Check the end of data. */
cmpq %rax, %rsi
jbe L(max)
--
GitLab

@ -1,51 +0,0 @@
From 55bf411b451c13f0fb7ff3d3bf9a820020b45df1 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Mon, 19 Apr 2021 07:07:21 -0700
Subject: [PATCH] x86-64: Require BMI2 for __strlen_evex and __strnlen_evex
Content-type: text/plain; charset=UTF-8

Since __strlen_evex and __strnlen_evex added by

commit 1fd8c163a83d96ace1ff78fa6bac7aee084f6f77
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Fri Mar 5 06:24:52 2021 -0800

x86-64: Add ifunc-avx2.h functions with 256-bit EVEX

use sarx:

c4 e2 6a f7 c0 sarx %edx,%eax,%eax

require BMI2 for __strlen_evex and __strnlen_evex in ifunc-impl-list.c.
ifunc-avx2.h already requires BMI2 for EVEX implementation.
---
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index fec384f6..cbfc1a5d 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -293,7 +293,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__strlen_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strlen,
(CPU_FEATURE_USABLE (AVX512VL)
- && CPU_FEATURE_USABLE (AVX512BW)),
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
__strlen_evex)
IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))

@@ -308,7 +309,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__strnlen_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strnlen,
(CPU_FEATURE_USABLE (AVX512VL)
- && CPU_FEATURE_USABLE (AVX512BW)),
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
__strnlen_evex)
IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))

--
GitLab

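The takeaway generalizes: an ifunc selector has to gate on every ISA extension an implementation actually executes (here, sarx from BMI2), not just the headline one. A small sketch of the eligibility test this patch encodes, using GCC's __builtin_cpu_supports; the feature names "avx512vl", "avx512bw", and "bmi2" are all real GCC feature strings, and the printed label is just illustrative:

/* Sketch of the feature gate for the EVEX strlen variants.  */
#include <stdio.h>

int
main (void)
{
  int ok = __builtin_cpu_supports ("avx512vl")
           && __builtin_cpu_supports ("avx512bw")
           && __builtin_cpu_supports ("bmi2");
  printf ("__strlen_evex eligible: %s\n", ok ? "yes" : "no");
  return 0;
}
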
@ -1,584 +0,0 @@
From acfd088a1963ba51cd83c78f95c0ab25ead79e04 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon, 3 May 2021 03:01:58 -0400
Subject: [PATCH] x86: Optimize memchr-avx2.S
Content-type: text/plain; charset=UTF-8

No bug. This commit optimizes memchr-avx2.S. The optimizations include
replacing some branches with cmovcc, avoiding some branches entirely
in the less_4x_vec case, making the page cross logic less strict,
and saving a few instructions in the in-loop return path. test-memchr,
test-rawmemchr, and test-wmemchr are all passing.

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/multiarch/memchr-avx2.S | 425 ++++++++++++++-----------
1 file changed, 247 insertions(+), 178 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
index cf893e77..b377f22e 100644
--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
@@ -26,8 +26,22 @@

# ifdef USE_AS_WMEMCHR
# define VPCMPEQ vpcmpeqd
+# define VPBROADCAST vpbroadcastd
+# define CHAR_SIZE 4
# else
# define VPCMPEQ vpcmpeqb
+# define VPBROADCAST vpbroadcastb
+# define CHAR_SIZE 1
+# endif
+
+# ifdef USE_AS_RAWMEMCHR
+# define ERAW_PTR_REG ecx
+# define RRAW_PTR_REG rcx
+# define ALGN_PTR_REG rdi
+# else
+# define ERAW_PTR_REG edi
+# define RRAW_PTR_REG rdi
+# define ALGN_PTR_REG rcx
# endif

# ifndef VZEROUPPER
@@ -39,6 +53,7 @@
# endif

# define VEC_SIZE 32
+# define PAGE_SIZE 4096

.section SECTION(.text),"ax",@progbits
ENTRY (MEMCHR)
@@ -47,295 +62,349 @@ ENTRY (MEMCHR)
test %RDX_LP, %RDX_LP
jz L(null)
# endif
- movl %edi, %ecx
- /* Broadcast CHAR to YMM0. */
- vmovd %esi, %xmm0
# ifdef USE_AS_WMEMCHR
shl $2, %RDX_LP
- vpbroadcastd %xmm0, %ymm0
# else
# ifdef __ILP32__
/* Clear the upper 32 bits. */
movl %edx, %edx
# endif
- vpbroadcastb %xmm0, %ymm0
# endif
+ /* Broadcast CHAR to YMMMATCH. */
+ vmovd %esi, %xmm0
+ VPBROADCAST %xmm0, %ymm0
/* Check if we may cross page boundary with one vector load. */
- andl $(2 * VEC_SIZE - 1), %ecx
- cmpl $VEC_SIZE, %ecx
- ja L(cros_page_boundary)
+ movl %edi, %eax
+ andl $(PAGE_SIZE - 1), %eax
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ ja L(cross_page_boundary)

/* Check the first VEC_SIZE bytes. */
- VPCMPEQ (%rdi), %ymm0, %ymm1
+ VPCMPEQ (%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
- testl %eax, %eax
-
# ifndef USE_AS_RAWMEMCHR
- jnz L(first_vec_x0_check)
- /* Adjust length and check the end of data. */
- subq $VEC_SIZE, %rdx
- jbe L(zero)
-# else
- jnz L(first_vec_x0)
+ /* If length < CHAR_PER_VEC handle special. */
+ cmpq $VEC_SIZE, %rdx
+ jbe L(first_vec_x0)
# endif
-
- /* Align data for aligned loads in the loop. */
- addq $VEC_SIZE, %rdi
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
+ testl %eax, %eax
+ jz L(aligned_more)
+ tzcntl %eax, %eax
+ addq %rdi, %rax
+ VZEROUPPER_RETURN

# ifndef USE_AS_RAWMEMCHR
- /* Adjust length. */
- addq %rcx, %rdx
+ .p2align 5
+L(first_vec_x0):
+ /* Check if first match was before length. */
+ tzcntl %eax, %eax
+ xorl %ecx, %ecx
+ cmpl %eax, %edx
+ leaq (%rdi, %rax), %rax
+ cmovle %rcx, %rax
+ VZEROUPPER_RETURN

- subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
+L(null):
+ xorl %eax, %eax
+ ret
# endif
- jmp L(more_4x_vec)
-
.p2align 4
-L(cros_page_boundary):
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
- VPCMPEQ (%rdi), %ymm0, %ymm1
+L(cross_page_boundary):
+ /* Save pointer before aligning as its original value is necessary
+ for computer return address if byte is found or adjusting length
+ if it is not and this is memchr. */
+ movq %rdi, %rcx
+ /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and
+ rdi for rawmemchr. */
+ orq $(VEC_SIZE - 1), %ALGN_PTR_REG
+ VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
+# ifndef USE_AS_RAWMEMCHR
+ /* Calculate length until end of page (length checked for a
+ match). */
+ leaq 1(%ALGN_PTR_REG), %rsi
+ subq %RRAW_PTR_REG, %rsi
+# endif
/* Remove the leading bytes. */
- sarl %cl, %eax
- testl %eax, %eax
- jz L(aligned_more)
- tzcntl %eax, %eax
+ sarxl %ERAW_PTR_REG, %eax, %eax
# ifndef USE_AS_RAWMEMCHR
/* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
+ cmpq %rsi, %rdx
+ jbe L(first_vec_x0)
# endif
- addq %rdi, %rax
- addq %rcx, %rax
+ testl %eax, %eax
+ jz L(cross_page_continue)
+ tzcntl %eax, %eax
+ addq %RRAW_PTR_REG, %rax
L(return_vzeroupper):
ZERO_UPPER_VEC_REGISTERS_RETURN

.p2align 4
-L(aligned_more):
-# ifndef USE_AS_RAWMEMCHR
- /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
- instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
- overflow. */
- negq %rcx
- addq $VEC_SIZE, %rcx
+L(first_vec_x1):
+ tzcntl %eax, %eax
+ incq %rdi
+ addq %rdi, %rax
+ VZEROUPPER_RETURN

- /* Check the end of data. */
- subq %rcx, %rdx
- jbe L(zero)
-# endif
+ .p2align 4
+L(first_vec_x2):
+ tzcntl %eax, %eax
+ addq $(VEC_SIZE + 1), %rdi
+ addq %rdi, %rax
+ VZEROUPPER_RETURN
+
+ .p2align 4
+L(first_vec_x3):
+ tzcntl %eax, %eax
+ addq $(VEC_SIZE * 2 + 1), %rdi
+ addq %rdi, %rax
+ VZEROUPPER_RETURN

- addq $VEC_SIZE, %rdi

-# ifndef USE_AS_RAWMEMCHR
- subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
-# endif
+ .p2align 4
+L(first_vec_x4):
+ tzcntl %eax, %eax
+ addq $(VEC_SIZE * 3 + 1), %rdi
+ addq %rdi, %rax
+ VZEROUPPER_RETURN

-L(more_4x_vec):
+ .p2align 4
+L(aligned_more):
/* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
since data is only aligned to VEC_SIZE. */
- VPCMPEQ (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)

- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+# ifndef USE_AS_RAWMEMCHR
+L(cross_page_continue):
+ /* Align data to VEC_SIZE - 1. */
+ xorl %ecx, %ecx
+ subl %edi, %ecx
+ orq $(VEC_SIZE - 1), %rdi
+ /* esi is for adjusting length to see if near the end. */
+ leal (VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
+# else
+ orq $(VEC_SIZE - 1), %rdi
+L(cross_page_continue):
+# endif
+ /* Load first VEC regardless. */
+ VPCMPEQ 1(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
+# ifndef USE_AS_RAWMEMCHR
+ /* Adjust length. If near end handle specially. */
+ subq %rsi, %rdx
+ jbe L(last_4x_vec_or_less)
+# endif
testl %eax, %eax
jnz L(first_vec_x1)

- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x2)

- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x3)

- addq $(VEC_SIZE * 4), %rdi
+ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x4)

# ifndef USE_AS_RAWMEMCHR
+ /* Check if at last VEC_SIZE * 4 length. */
subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
-# endif
-
- /* Align data to 4 * VEC_SIZE. */
- movq %rdi, %rcx
- andl $(4 * VEC_SIZE - 1), %ecx
- andq $-(4 * VEC_SIZE), %rdi
-
-# ifndef USE_AS_RAWMEMCHR
- /* Adjust length. */
+ jbe L(last_4x_vec_or_less_cmpeq)
+ /* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
+ length. */
+ incq %rdi
+ movl %edi, %ecx
+ orq $(VEC_SIZE * 4 - 1), %rdi
+ andl $(VEC_SIZE * 4 - 1), %ecx
addq %rcx, %rdx
+# else
+ /* Align data to VEC_SIZE * 4 - 1 for loop. */
+ incq %rdi
+ orq $(VEC_SIZE * 4 - 1), %rdi
# endif

+ /* Compare 4 * VEC at a time forward. */
.p2align 4
L(loop_4x_vec):
- /* Compare 4 * VEC at a time forward. */
- VPCMPEQ (%rdi), %ymm0, %ymm1
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
-
+ VPCMPEQ 1(%rdi), %ymm0, %ymm1
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm2
+ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm3
+ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm4
vpor %ymm1, %ymm2, %ymm5
vpor %ymm3, %ymm4, %ymm6
vpor %ymm5, %ymm6, %ymm5

- vpmovmskb %ymm5, %eax
- testl %eax, %eax
- jnz L(4x_vec_end)
-
- addq $(VEC_SIZE * 4), %rdi
-
+ vpmovmskb %ymm5, %ecx
# ifdef USE_AS_RAWMEMCHR
- jmp L(loop_4x_vec)
+ subq $-(VEC_SIZE * 4), %rdi
+ testl %ecx, %ecx
+ jz L(loop_4x_vec)
# else
- subq $(VEC_SIZE * 4), %rdx
- ja L(loop_4x_vec)
+ testl %ecx, %ecx
+ jnz L(loop_4x_vec_end)

-L(last_4x_vec_or_less):
- /* Less than 4 * VEC and aligned to VEC_SIZE. */
- addl $(VEC_SIZE * 2), %edx
- jle L(last_2x_vec)
+ subq $-(VEC_SIZE * 4), %rdi

- VPCMPEQ (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
+ subq $(VEC_SIZE * 4), %rdx
+ ja L(loop_4x_vec)

- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+ /* Fall through into less than 4 remaining vectors of length case.
+ */
+ VPCMPEQ (VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
+ .p2align 4
+L(last_4x_vec_or_less):
+ /* Check if first VEC contained match. */
testl %eax, %eax
- jnz L(first_vec_x1)
+ jnz L(first_vec_x1_check)

- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
+ /* If remaining length > VEC_SIZE * 2. */
+ addl $(VEC_SIZE * 2), %edx
+ jg L(last_4x_vec)

- jnz L(first_vec_x2_check)
- subl $VEC_SIZE, %edx
- jle L(zero)
+L(last_2x_vec):
+ /* If remaining length < VEC_SIZE. */
+ addl $VEC_SIZE, %edx
+ jle L(zero_end)

- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+ /* Check VEC2 and compare any match with remaining length. */
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
- testl %eax, %eax
-
- jnz L(first_vec_x3_check)
- xorl %eax, %eax
+ tzcntl %eax, %eax
+ cmpl %eax, %edx
+ jbe L(set_zero_end)
+ addq $(VEC_SIZE + 1), %rdi
+ addq %rdi, %rax
+L(zero_end):
VZEROUPPER_RETURN

.p2align 4
-L(last_2x_vec):
- addl $(VEC_SIZE * 2), %edx
- VPCMPEQ (%rdi), %ymm0, %ymm1
+L(loop_4x_vec_end):
+# endif
+ /* rawmemchr will fall through into this if match was found in
+ loop. */
+
vpmovmskb %ymm1, %eax
testl %eax, %eax
+ jnz L(last_vec_x1_return)

- jnz L(first_vec_x0_check)
- subl $VEC_SIZE, %edx
- jle L(zero)
-
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm2, %eax
testl %eax, %eax
- jnz L(first_vec_x1_check)
- xorl %eax, %eax
- VZEROUPPER_RETURN
+ jnz L(last_vec_x2_return)

- .p2align 4
-L(first_vec_x0_check):
- tzcntl %eax, %eax
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
+ vpmovmskb %ymm3, %eax
+ /* Combine VEC3 matches (eax) with VEC4 matches (ecx). */
+ salq $32, %rcx
+ orq %rcx, %rax
+ tzcntq %rax, %rax
+# ifdef USE_AS_RAWMEMCHR
+ subq $(VEC_SIZE * 2 - 1), %rdi
+# else
+ subq $-(VEC_SIZE * 2 + 1), %rdi
+# endif
addq %rdi, %rax
VZEROUPPER_RETURN
+# ifndef USE_AS_RAWMEMCHR

.p2align 4
L(first_vec_x1_check):
tzcntl %eax, %eax
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
- addq $VEC_SIZE, %rax
+ /* Adjust length. */
+ subl $-(VEC_SIZE * 4), %edx
+ /* Check if match within remaining length. */
+ cmpl %eax, %edx
+ jbe L(set_zero_end)
+ incq %rdi
addq %rdi, %rax
VZEROUPPER_RETURN
+ .p2align 4
+L(set_zero_end):
+ xorl %eax, %eax
+ VZEROUPPER_RETURN
+# endif

.p2align 4
-L(first_vec_x2_check):
+L(last_vec_x1_return):
tzcntl %eax, %eax
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
- addq $(VEC_SIZE * 2), %rax
+# ifdef USE_AS_RAWMEMCHR
+ subq $(VEC_SIZE * 4 - 1), %rdi
+# else
+ incq %rdi
+# endif
addq %rdi, %rax
VZEROUPPER_RETURN

.p2align 4
-L(first_vec_x3_check):
+L(last_vec_x2_return):
tzcntl %eax, %eax
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
- addq $(VEC_SIZE * 3), %rax
+# ifdef USE_AS_RAWMEMCHR
+ subq $(VEC_SIZE * 3 - 1), %rdi
+# else
+ subq $-(VEC_SIZE + 1), %rdi
+# endif
addq %rdi, %rax
VZEROUPPER_RETURN

+# ifndef USE_AS_RAWMEMCHR
.p2align 4
-L(zero):
- xorl %eax, %eax
- jmp L(return_vzeroupper)
+L(last_4x_vec_or_less_cmpeq):
+ VPCMPEQ (VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ subq $-(VEC_SIZE * 4), %rdi
+ /* Check first VEC regardless. */
+ testl %eax, %eax
+ jnz L(first_vec_x1_check)

+ /* If remaining length <= CHAR_PER_VEC * 2. */
+ addl $(VEC_SIZE * 2), %edx
+ jle L(last_2x_vec)
.p2align 4
-L(null):
- xorl %eax, %eax
- ret
-# endif
+L(last_4x_vec):
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x2_return)

- .p2align 4
-L(first_vec_x0):
- tzcntl %eax, %eax
- addq %rdi, %rax
- VZEROUPPER_RETURN
+ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax

- .p2align 4
-L(first_vec_x1):
- tzcntl %eax, %eax
- addq $VEC_SIZE, %rax
- addq %rdi, %rax
- VZEROUPPER_RETURN
+ /* Create mask for possible matches within remaining length. */
+ movq $-1, %rcx
+ bzhiq %rdx, %rcx, %rcx

- .p2align 4
-L(first_vec_x2):
+ /* Test matches in data against length match. */
+ andl %ecx, %eax
+ jnz L(last_vec_x3)
+
+ /* if remaining length <= VEC_SIZE * 3 (Note this is after
+ remaining length was found to be > VEC_SIZE * 2. */
+ subl $VEC_SIZE, %edx
+ jbe L(zero_end2)
+
+ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ /* Shift remaining length mask for last VEC. */
+ shrq $32, %rcx
+ andl %ecx, %eax
+ jz L(zero_end2)
tzcntl %eax, %eax
- addq $(VEC_SIZE * 2), %rax
+ addq $(VEC_SIZE * 3 + 1), %rdi
addq %rdi, %rax
+L(zero_end2):
VZEROUPPER_RETURN

.p2align 4
-L(4x_vec_end):
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
- vpmovmskb %ymm2, %eax
- testl %eax, %eax
- jnz L(first_vec_x1)
- vpmovmskb %ymm3, %eax
- testl %eax, %eax
- jnz L(first_vec_x2)
- vpmovmskb %ymm4, %eax
- testl %eax, %eax
-L(first_vec_x3):
+L(last_vec_x3):
tzcntl %eax, %eax
- addq $(VEC_SIZE * 3), %rax
+ subq $-(VEC_SIZE * 2 + 1), %rdi
addq %rdi, %rax
VZEROUPPER_RETURN
+# endif

END (MEMCHR)
#endif
--
GitLab

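One concrete piece of the "page cross logic less strict" change is visible in the new entry sequence: instead of keying off 2*VEC_SIZE alignment, the code checks the page offset directly, so a full-vector load is only treated as unsafe when it would actually span a 4096-byte page. A small C sketch of that test follows, with PAGE_SIZE and VEC_SIZE as defined in the patch; the function name is just illustrative.

/* Sketch of the relaxed page-cross test: a VEC_SIZE-byte load at p is
   unsafe only if it would span a 4096-byte page boundary.  This mirrors
   the asm "andl $(PAGE_SIZE - 1); cmpl $(PAGE_SIZE - VEC_SIZE); ja".  */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096
#define VEC_SIZE  32

static int
load_crosses_page (const void *p)
{
  return ((uintptr_t) p & (PAGE_SIZE - 1)) > PAGE_SIZE - VEC_SIZE;
}

int
main (void)
{
  /* Offset 4064 + 32 bytes ends exactly at the page edge (safe);
     offset 4065 spills one byte into the next page (unsafe).  */
  printf ("%d %d\n", load_crosses_page ((void *) 4064),
          load_crosses_page ((void *) 4065));   /* prints: 0 1 */
  return 0;
}
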
@ -1,388 +0,0 @@
From 645a158978f9520e74074e8c14047503be4db0f0 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Wed, 9 Jun 2021 16:25:32 -0400
Subject: [PATCH] x86: Fix overflow bug with wmemchr-sse2 and wmemchr-avx2 [BZ
#27974]
Content-type: text/plain; charset=UTF-8

This commit fixes the bug mentioned in the previous commit.

The previous implementations of wmemchr in these files relied
on n * sizeof(wchar_t), which was not guaranteed by the standard.

The new overflow tests added in the previous commit now
pass (as well as all the other tests).

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/memchr.S | 77 +++++++++++++++++++-------
sysdeps/x86_64/multiarch/memchr-avx2.S | 58 +++++++++++++------
2 files changed, 98 insertions(+), 37 deletions(-)

diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
index cb320257..24f9a0c5 100644
--- a/sysdeps/x86_64/memchr.S
+++ b/sysdeps/x86_64/memchr.S
@@ -21,9 +21,11 @@
#ifdef USE_AS_WMEMCHR
# define MEMCHR wmemchr
# define PCMPEQ pcmpeqd
+# define CHAR_PER_VEC 4
#else
# define MEMCHR memchr
# define PCMPEQ pcmpeqb
+# define CHAR_PER_VEC 16
#endif

/* fast SSE2 version with using pmaxub and 64 byte loop */
@@ -33,15 +35,14 @@ ENTRY(MEMCHR)
movd %esi, %xmm1
mov %edi, %ecx

+#ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+ movl %edx, %edx
+#endif
#ifdef USE_AS_WMEMCHR
test %RDX_LP, %RDX_LP
jz L(return_null)
- shl $2, %RDX_LP
#else
-# ifdef __ILP32__
- /* Clear the upper 32 bits. */
- movl %edx, %edx
-# endif
punpcklbw %xmm1, %xmm1
test %RDX_LP, %RDX_LP
jz L(return_null)
@@ -60,13 +61,16 @@ ENTRY(MEMCHR)
test %eax, %eax

jnz L(matches_1)
- sub $16, %rdx
+ sub $CHAR_PER_VEC, %rdx
jbe L(return_null)
add $16, %rdi
and $15, %ecx
and $-16, %rdi
+#ifdef USE_AS_WMEMCHR
+ shr $2, %ecx
+#endif
add %rcx, %rdx
- sub $64, %rdx
+ sub $(CHAR_PER_VEC * 4), %rdx
jbe L(exit_loop)
jmp L(loop_prolog)

@@ -77,16 +81,21 @@ L(crosscache):
movdqa (%rdi), %xmm0

PCMPEQ %xmm1, %xmm0
-/* Check if there is a match. */
+ /* Check if there is a match. */
pmovmskb %xmm0, %eax
-/* Remove the leading bytes. */
+ /* Remove the leading bytes. */
sar %cl, %eax
test %eax, %eax
je L(unaligned_no_match)
-/* Check which byte is a match. */
+ /* Check which byte is a match. */
bsf %eax, %eax
-
+#ifdef USE_AS_WMEMCHR
+ mov %eax, %esi
+ shr $2, %esi
+ sub %rsi, %rdx
+#else
sub %rax, %rdx
+#endif
jbe L(return_null)
add %rdi, %rax
add %rcx, %rax
@@ -94,15 +103,18 @@ L(crosscache):

.p2align 4
L(unaligned_no_match):
- /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using
+ /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using
"rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void
possible addition overflow. */
neg %rcx
add $16, %rcx
+#ifdef USE_AS_WMEMCHR
+ shr $2, %ecx
+#endif
sub %rcx, %rdx
jbe L(return_null)
add $16, %rdi
- sub $64, %rdx
+ sub $(CHAR_PER_VEC * 4), %rdx
jbe L(exit_loop)

.p2align 4
@@ -135,7 +147,7 @@ L(loop_prolog):
test $0x3f, %rdi
jz L(align64_loop)

- sub $64, %rdx
+ sub $(CHAR_PER_VEC * 4), %rdx
jbe L(exit_loop)

movdqa (%rdi), %xmm0
@@ -167,11 +179,14 @@ L(loop_prolog):
mov %rdi, %rcx
and $-64, %rdi
and $63, %ecx
+#ifdef USE_AS_WMEMCHR
+ shr $2, %ecx
+#endif
add %rcx, %rdx

.p2align 4
L(align64_loop):
- sub $64, %rdx
+ sub $(CHAR_PER_VEC * 4), %rdx
jbe L(exit_loop)
movdqa (%rdi), %xmm0
movdqa 16(%rdi), %xmm2
@@ -218,7 +233,7 @@ L(align64_loop):

.p2align 4
L(exit_loop):
- add $32, %edx
+ add $(CHAR_PER_VEC * 2), %edx
jle L(exit_loop_32)

movdqa (%rdi), %xmm0
@@ -238,7 +253,7 @@ L(exit_loop):
pmovmskb %xmm3, %eax
test %eax, %eax
jnz L(matches32_1)
- sub $16, %edx
+ sub $CHAR_PER_VEC, %edx
jle L(return_null)

PCMPEQ 48(%rdi), %xmm1
@@ -250,13 +265,13 @@ L(exit_loop):

.p2align 4
L(exit_loop_32):
- add $32, %edx
+ add $(CHAR_PER_VEC * 2), %edx
movdqa (%rdi), %xmm0
PCMPEQ %xmm1, %xmm0
pmovmskb %xmm0, %eax
test %eax, %eax
jnz L(matches_1)
- sub $16, %edx
+ sub $CHAR_PER_VEC, %edx
jbe L(return_null)

PCMPEQ 16(%rdi), %xmm1
@@ -293,7 +308,13 @@ L(matches32):
.p2align 4
L(matches_1):
bsf %eax, %eax
+#ifdef USE_AS_WMEMCHR
+ mov %eax, %esi
+ shr $2, %esi
+ sub %rsi, %rdx
+#else
sub %rax, %rdx
+#endif
jbe L(return_null)
add %rdi, %rax
ret
@@ -301,7 +322,13 @@ L(matches_1):
.p2align 4
L(matches16_1):
bsf %eax, %eax
+#ifdef USE_AS_WMEMCHR
+ mov %eax, %esi
+ shr $2, %esi
+ sub %rsi, %rdx
+#else
sub %rax, %rdx
+#endif
jbe L(return_null)
lea 16(%rdi, %rax), %rax
ret
@@ -309,7 +336,13 @@ L(matches16_1):
.p2align 4
L(matches32_1):
bsf %eax, %eax
+#ifdef USE_AS_WMEMCHR
+ mov %eax, %esi
+ shr $2, %esi
+ sub %rsi, %rdx
+#else
sub %rax, %rdx
+#endif
jbe L(return_null)
lea 32(%rdi, %rax), %rax
ret
@@ -317,7 +350,13 @@ L(matches32_1):
.p2align 4
L(matches48_1):
bsf %eax, %eax
+#ifdef USE_AS_WMEMCHR
+ mov %eax, %esi
+ shr $2, %esi
+ sub %rsi, %rdx
+#else
sub %rax, %rdx
+#endif
jbe L(return_null)
lea 48(%rdi, %rax), %rax
ret
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
index b377f22e..16027abb 100644
--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
@@ -54,21 +54,19 @@

# define VEC_SIZE 32
# define PAGE_SIZE 4096
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)

.section SECTION(.text),"ax",@progbits
ENTRY (MEMCHR)
# ifndef USE_AS_RAWMEMCHR
/* Check for zero length. */
- test %RDX_LP, %RDX_LP
- jz L(null)
-# endif
-# ifdef USE_AS_WMEMCHR
- shl $2, %RDX_LP
-# else
# ifdef __ILP32__
- /* Clear the upper 32 bits. */
- movl %edx, %edx
+ /* Clear upper bits. */
+ and %RDX_LP, %RDX_LP
+# else
+ test %RDX_LP, %RDX_LP
# endif
+ jz L(null)
# endif
/* Broadcast CHAR to YMMMATCH. */
vmovd %esi, %xmm0
@@ -84,7 +82,7 @@ ENTRY (MEMCHR)
vpmovmskb %ymm1, %eax
# ifndef USE_AS_RAWMEMCHR
/* If length < CHAR_PER_VEC handle special. */
- cmpq $VEC_SIZE, %rdx
+ cmpq $CHAR_PER_VEC, %rdx
jbe L(first_vec_x0)
# endif
testl %eax, %eax
@@ -98,6 +96,10 @@ ENTRY (MEMCHR)
L(first_vec_x0):
/* Check if first match was before length. */
tzcntl %eax, %eax
+# ifdef USE_AS_WMEMCHR
+ /* NB: Multiply length by 4 to get byte count. */
+ sall $2, %edx
+# endif
xorl %ecx, %ecx
cmpl %eax, %edx
leaq (%rdi, %rax), %rax
@@ -110,12 +112,12 @@ L(null):
# endif
.p2align 4
L(cross_page_boundary):
- /* Save pointer before aligning as its original value is necessary
- for computer return address if byte is found or adjusting length
- if it is not and this is memchr. */
+ /* Save pointer before aligning as its original value is
+ necessary for computer return address if byte is found or
+ adjusting length if it is not and this is memchr. */
movq %rdi, %rcx
- /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and
- rdi for rawmemchr. */
+ /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr
+ and rdi for rawmemchr. */
orq $(VEC_SIZE - 1), %ALGN_PTR_REG
VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
@@ -124,6 +126,10 @@ L(cross_page_boundary):
match). */
leaq 1(%ALGN_PTR_REG), %rsi
subq %RRAW_PTR_REG, %rsi
+# ifdef USE_AS_WMEMCHR
+ /* NB: Divide bytes by 4 to get wchar_t count. */
+ shrl $2, %esi
+# endif
# endif
/* Remove the leading bytes. */
sarxl %ERAW_PTR_REG, %eax, %eax
@@ -181,6 +187,10 @@ L(cross_page_continue):
orq $(VEC_SIZE - 1), %rdi
/* esi is for adjusting length to see if near the end. */
leal (VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
+# ifdef USE_AS_WMEMCHR
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %esi
+# endif
# else
orq $(VEC_SIZE - 1), %rdi
L(cross_page_continue):
@@ -213,7 +223,7 @@ L(cross_page_continue):

# ifndef USE_AS_RAWMEMCHR
/* Check if at last VEC_SIZE * 4 length. */
- subq $(VEC_SIZE * 4), %rdx
+ subq $(CHAR_PER_VEC * 4), %rdx
jbe L(last_4x_vec_or_less_cmpeq)
/* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
length. */
@@ -221,6 +231,10 @@ L(cross_page_continue):
movl %edi, %ecx
orq $(VEC_SIZE * 4 - 1), %rdi
andl $(VEC_SIZE * 4 - 1), %ecx
+# ifdef USE_AS_WMEMCHR
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %ecx
+# endif
addq %rcx, %rdx
# else
/* Align data to VEC_SIZE * 4 - 1 for loop. */
@@ -250,15 +264,19 @@ L(loop_4x_vec):

subq $-(VEC_SIZE * 4), %rdi

- subq $(VEC_SIZE * 4), %rdx
+ subq $(CHAR_PER_VEC * 4), %rdx
ja L(loop_4x_vec)

- /* Fall through into less than 4 remaining vectors of length case.
- */
+ /* Fall through into less than 4 remaining vectors of length
+ case. */
VPCMPEQ (VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
.p2align 4
L(last_4x_vec_or_less):
+# ifdef USE_AS_WMEMCHR
+ /* NB: Multiply length by 4 to get byte count. */
+ sall $2, %edx
+# endif
/* Check if first VEC contained match. */
testl %eax, %eax
jnz L(first_vec_x1_check)
@@ -355,6 +373,10 @@ L(last_vec_x2_return):
L(last_4x_vec_or_less_cmpeq):
VPCMPEQ (VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
+# ifdef USE_AS_WMEMCHR
+ /* NB: Multiply length by 4 to get byte count. */
+ sall $2, %edx
+# endif
subq $-(VEC_SIZE * 4), %rdi
/* Check first VEC regardless. */
testl %eax, %eax
--
GitLab

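The arithmetic behind this bug is easy to reproduce: wmemchr takes a count of wchar_t elements, and converting it to bytes with n << 2 can wrap a 64-bit register, after which every length check in the old code reasons about the wrong range. The fixed code above counts in wchar_t units (CHAR_PER_VEC) instead. A tiny sketch of the wraparound, assuming a 4-byte wchar_t as on x86-64 Linux:

/* Demonstrates the overflow the fix addresses: 2^62 elements times
   4 bytes is exactly 2^64, which wraps to 0 in 64-bit arithmetic.  */
#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  uint64_t n = UINT64_C (1) << 62;        /* huge element count */
  uint64_t bytes = n * 4;                 /* old-style byte conversion */
  printf ("n = %llu, n * 4 = %llu\n",
          (unsigned long long) n, (unsigned long long) bytes);
  /* Prints n * 4 = 0: every subsequent "remaining length" check would
     see a zero (or tiny) byte count and stop far too early.  */
  return 0;
}
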
@ -1,767 +0,0 @@
|
|||||||
From aaa23c35071537e2dcf5807e956802ed215210aa Mon Sep 17 00:00:00 2001
|
|
||||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
Date: Mon, 19 Apr 2021 19:36:07 -0400
|
|
||||||
Subject: [PATCH] x86: Optimize strlen-avx2.S
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
No bug. This commit optimizes strlen-avx2.S. The optimizations are
|
|
||||||
mostly small things but they add up to roughly 10-30% performance
|
|
||||||
improvement for strlen. The results for strnlen are bit more
|
|
||||||
ambiguous. test-strlen, test-strnlen, test-wcslen, and test-wcsnlen
|
|
||||||
are all passing.
|
|
||||||
|
|
||||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 +-
|
|
||||||
sysdeps/x86_64/multiarch/strlen-avx2.S | 532 +++++++++++++--------
|
|
||||||
2 files changed, 334 insertions(+), 214 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
||||||
index cbfc1a5d..f1a6460a 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
||||||
@@ -285,10 +285,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
||||||
/* Support sysdeps/x86_64/multiarch/strlen.c. */
|
|
||||||
IFUNC_IMPL (i, name, strlen,
|
|
||||||
IFUNC_IMPL_ADD (array, i, strlen,
|
|
||||||
- CPU_FEATURE_USABLE (AVX2),
|
|
||||||
+ (CPU_FEATURE_USABLE (AVX2)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
|
||||||
__strlen_avx2)
|
|
||||||
IFUNC_IMPL_ADD (array, i, strlen,
|
|
||||||
(CPU_FEATURE_USABLE (AVX2)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)
|
|
||||||
&& CPU_FEATURE_USABLE (RTM)),
|
|
||||||
__strlen_avx2_rtm)
|
|
||||||
IFUNC_IMPL_ADD (array, i, strlen,
|
|
||||||
@@ -301,10 +303,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
||||||
/* Support sysdeps/x86_64/multiarch/strnlen.c. */
|
|
||||||
IFUNC_IMPL (i, name, strnlen,
|
|
||||||
IFUNC_IMPL_ADD (array, i, strnlen,
|
|
||||||
- CPU_FEATURE_USABLE (AVX2),
|
|
||||||
+ (CPU_FEATURE_USABLE (AVX2)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
|
||||||
__strnlen_avx2)
|
|
||||||
IFUNC_IMPL_ADD (array, i, strnlen,
|
|
||||||
(CPU_FEATURE_USABLE (AVX2)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)
|
|
||||||
&& CPU_FEATURE_USABLE (RTM)),
|
|
||||||
__strnlen_avx2_rtm)
|
|
||||||
IFUNC_IMPL_ADD (array, i, strnlen,
|
|
||||||
@@ -640,10 +644,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
||||||
/* Support sysdeps/x86_64/multiarch/wcslen.c. */
|
|
||||||
IFUNC_IMPL (i, name, wcslen,
|
|
||||||
IFUNC_IMPL_ADD (array, i, wcslen,
|
|
||||||
- CPU_FEATURE_USABLE (AVX2),
|
|
||||||
+ (CPU_FEATURE_USABLE (AVX2)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
|
||||||
__wcslen_avx2)
|
|
||||||
IFUNC_IMPL_ADD (array, i, wcslen,
|
|
||||||
(CPU_FEATURE_USABLE (AVX2)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)
|
|
||||||
&& CPU_FEATURE_USABLE (RTM)),
|
|
||||||
__wcslen_avx2_rtm)
|
|
||||||
IFUNC_IMPL_ADD (array, i, wcslen,
|
|
||||||
@@ -656,10 +662,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
||||||
/* Support sysdeps/x86_64/multiarch/wcsnlen.c. */
|
|
||||||
IFUNC_IMPL (i, name, wcsnlen,
|
|
||||||
IFUNC_IMPL_ADD (array, i, wcsnlen,
|
|
||||||
- CPU_FEATURE_USABLE (AVX2),
|
|
||||||
+ (CPU_FEATURE_USABLE (AVX2)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
|
||||||
__wcsnlen_avx2)
|
|
||||||
IFUNC_IMPL_ADD (array, i, wcsnlen,
|
|
||||||
(CPU_FEATURE_USABLE (AVX2)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)
|
|
||||||
&& CPU_FEATURE_USABLE (RTM)),
|
|
||||||
__wcsnlen_avx2_rtm)
|
|
||||||
IFUNC_IMPL_ADD (array, i, wcsnlen,
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
index 82826e10..be8a5db5 100644
--- a/sysdeps/x86_64/multiarch/strlen-avx2.S
+++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
@@ -27,9 +27,11 @@
# ifdef USE_AS_WCSLEN
# define VPCMPEQ vpcmpeqd
# define VPMINU vpminud
+# define CHAR_SIZE 4
# else
# define VPCMPEQ vpcmpeqb
# define VPMINU vpminub
+# define CHAR_SIZE 1
# endif

# ifndef VZEROUPPER
@@ -41,349 +43,459 @@
# endif

# define VEC_SIZE 32
+# define PAGE_SIZE 4096

.section SECTION(.text),"ax",@progbits
ENTRY (STRLEN)
# ifdef USE_AS_STRNLEN
- /* Check for zero length. */
+ /* Check zero length. */
test %RSI_LP, %RSI_LP
jz L(zero)
+ /* Store max len in R8_LP before adjusting if using WCSLEN. */
+ mov %RSI_LP, %R8_LP
# ifdef USE_AS_WCSLEN
shl $2, %RSI_LP
# elif defined __ILP32__
/* Clear the upper 32 bits. */
movl %esi, %esi
# endif
- mov %RSI_LP, %R8_LP
# endif
- movl %edi, %ecx
+ movl %edi, %eax
movq %rdi, %rdx
vpxor %xmm0, %xmm0, %xmm0
-
+ /* Clear high bits from edi. Only keeping bits relevant to page
+ cross check. */
+ andl $(PAGE_SIZE - 1), %eax
/* Check if we may cross page boundary with one vector load. */
- andl $(2 * VEC_SIZE - 1), %ecx
- cmpl $VEC_SIZE, %ecx
- ja L(cros_page_boundary)
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ ja L(cross_page_boundary)

/* Check the first VEC_SIZE bytes. */
- VPCMPEQ (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
-
+ VPCMPEQ (%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
# ifdef USE_AS_STRNLEN
- jnz L(first_vec_x0_check)
- /* Adjust length and check the end of data. */
- subq $VEC_SIZE, %rsi
- jbe L(max)
-# else
- jnz L(first_vec_x0)
+ /* If length < VEC_SIZE handle special. */
+ cmpq $VEC_SIZE, %rsi
+ jbe L(first_vec_x0)
# endif
-
- /* Align data for aligned loads in the loop. */
- addq $VEC_SIZE, %rdi
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
+ /* If empty continue to aligned_more. Otherwise return bit
+ position of first match. */
+ testl %eax, %eax
+ jz L(aligned_more)
+ tzcntl %eax, %eax
+# ifdef USE_AS_WCSLEN
+ shrl $2, %eax
+# endif
+ VZEROUPPER_RETURN

# ifdef USE_AS_STRNLEN
- /* Adjust length. */
- addq %rcx, %rsi
+L(zero):
+ xorl %eax, %eax
+ ret

- subq $(VEC_SIZE * 4), %rsi
- jbe L(last_4x_vec_or_less)
+ .p2align 4
+L(first_vec_x0):
+ /* Set bit for max len so that tzcnt will return min of max len
+ and position of first match. */
+ btsq %rsi, %rax
+ tzcntl %eax, %eax
+# ifdef USE_AS_WCSLEN
+ shrl $2, %eax
+# endif
+ VZEROUPPER_RETURN
# endif
- jmp L(more_4x_vec)

.p2align 4
-L(cros_page_boundary):
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
- VPCMPEQ (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- /* Remove the leading bytes. */
- sarl %cl, %eax
- testl %eax, %eax
- jz L(aligned_more)
+L(first_vec_x1):
tzcntl %eax, %eax
+ /* Safe to use 32 bit instructions as these are only called for
+ size = [1, 159]. */
# ifdef USE_AS_STRNLEN
- /* Check the end of data. */
- cmpq %rax, %rsi
- jbe L(max)
+ /* Use ecx which was computed earlier to compute correct value.
+ */
+ subl $(VEC_SIZE * 4 + 1), %ecx
+ addl %ecx, %eax
+# else
+ subl %edx, %edi
+ incl %edi
+ addl %edi, %eax
# endif
- addq %rdi, %rax
- addq %rcx, %rax
- subq %rdx, %rax
# ifdef USE_AS_WCSLEN
- shrq $2, %rax
+ shrl $2, %eax
# endif
-L(return_vzeroupper):
- ZERO_UPPER_VEC_REGISTERS_RETURN
+ VZEROUPPER_RETURN

.p2align 4
-L(aligned_more):
+L(first_vec_x2):
+ tzcntl %eax, %eax
+ /* Safe to use 32 bit instructions as these are only called for
+ size = [1, 159]. */
# ifdef USE_AS_STRNLEN
- /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE"
- with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE"
- to void possible addition overflow. */
- negq %rcx
- addq $VEC_SIZE, %rcx
-
- /* Check the end of data. */
- subq %rcx, %rsi
- jbe L(max)
+ /* Use ecx which was computed earlier to compute correct value.
+ */
+ subl $(VEC_SIZE * 3 + 1), %ecx
+ addl %ecx, %eax
+# else
+ subl %edx, %edi
+ addl $(VEC_SIZE + 1), %edi
+ addl %edi, %eax
# endif
+# ifdef USE_AS_WCSLEN
+ shrl $2, %eax
+# endif
+ VZEROUPPER_RETURN

- addq $VEC_SIZE, %rdi
+ .p2align 4
+L(first_vec_x3):
+ tzcntl %eax, %eax
+ /* Safe to use 32 bit instructions as these are only called for
+ size = [1, 159]. */
+# ifdef USE_AS_STRNLEN
+ /* Use ecx which was computed earlier to compute correct value.
+ */
+ subl $(VEC_SIZE * 2 + 1), %ecx
+ addl %ecx, %eax
+# else
+ subl %edx, %edi
+ addl $(VEC_SIZE * 2 + 1), %edi
+ addl %edi, %eax
+# endif
+# ifdef USE_AS_WCSLEN
+ shrl $2, %eax
+# endif
+ VZEROUPPER_RETURN

+ .p2align 4
+L(first_vec_x4):
+ tzcntl %eax, %eax
+ /* Safe to use 32 bit instructions as these are only called for
+ size = [1, 159]. */
# ifdef USE_AS_STRNLEN
- subq $(VEC_SIZE * 4), %rsi
- jbe L(last_4x_vec_or_less)
+ /* Use ecx which was computed earlier to compute correct value.
+ */
+ subl $(VEC_SIZE + 1), %ecx
+ addl %ecx, %eax
+# else
+ subl %edx, %edi
+ addl $(VEC_SIZE * 3 + 1), %edi
+ addl %edi, %eax
# endif
+# ifdef USE_AS_WCSLEN
+ shrl $2, %eax
+# endif
+ VZEROUPPER_RETURN

-L(more_4x_vec):
+ .p2align 5
+L(aligned_more):
+ /* Align data to VEC_SIZE - 1. This is the same number of
+ instructions as using andq with -VEC_SIZE but saves 4 bytes of
+ code on the x4 check. */
+ orq $(VEC_SIZE - 1), %rdi
+L(cross_page_continue):
/* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
since data is only aligned to VEC_SIZE. */
- VPCMPEQ (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
-
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+# ifdef USE_AS_STRNLEN
+ /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
+ it simplifies the logic in last_4x_vec_or_less. */
+ leaq (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
+ subq %rdx, %rcx
+# endif
+ /* Load first VEC regardless. */
+ VPCMPEQ 1(%rdi), %ymm0, %ymm1
+# ifdef USE_AS_STRNLEN
+ /* Adjust length. If near end handle specially. */
+ subq %rcx, %rsi
+ jb L(last_4x_vec_or_less)
+# endif
+ vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x1)

- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x2)

- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x3)

- addq $(VEC_SIZE * 4), %rdi
-
-# ifdef USE_AS_STRNLEN
- subq $(VEC_SIZE * 4), %rsi
- jbe L(last_4x_vec_or_less)
-# endif
-
- /* Align data to 4 * VEC_SIZE. */
- movq %rdi, %rcx
- andl $(4 * VEC_SIZE - 1), %ecx
- andq $-(4 * VEC_SIZE), %rdi
+ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x4)

+ /* Align data to VEC_SIZE * 4 - 1. */
# ifdef USE_AS_STRNLEN
- /* Adjust length. */
+ /* Before adjusting length check if at last VEC_SIZE * 4. */
+ cmpq $(VEC_SIZE * 4 - 1), %rsi
+ jbe L(last_4x_vec_or_less_load)
+ incq %rdi
+ movl %edi, %ecx
+ orq $(VEC_SIZE * 4 - 1), %rdi
+ andl $(VEC_SIZE * 4 - 1), %ecx
+ /* Readjust length. */
addq %rcx, %rsi
+# else
+ incq %rdi
+ orq $(VEC_SIZE * 4 - 1), %rdi
# endif
-
+ /* Compare 4 * VEC at a time forward. */
.p2align 4
L(loop_4x_vec):
- /* Compare 4 * VEC at a time forward. */
- vmovdqa (%rdi), %ymm1
- vmovdqa VEC_SIZE(%rdi), %ymm2
- vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3
- vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4
- VPMINU %ymm1, %ymm2, %ymm5
- VPMINU %ymm3, %ymm4, %ymm6
- VPMINU %ymm5, %ymm6, %ymm5
-
- VPCMPEQ %ymm5, %ymm0, %ymm5
- vpmovmskb %ymm5, %eax
- testl %eax, %eax
- jnz L(4x_vec_end)
-
- addq $(VEC_SIZE * 4), %rdi
-
-# ifndef USE_AS_STRNLEN
- jmp L(loop_4x_vec)
-# else
+# ifdef USE_AS_STRNLEN
+ /* Break if at end of length. */
subq $(VEC_SIZE * 4), %rsi
- ja L(loop_4x_vec)
-
-L(last_4x_vec_or_less):
- /* Less than 4 * VEC and aligned to VEC_SIZE. */
- addl $(VEC_SIZE * 2), %esi
- jle L(last_2x_vec)
+ jb L(last_4x_vec_or_less_cmpeq)
+# endif
+ /* Save some code size by microfusing VPMINU with the load. Since
+ the matches in ymm2/ymm4 can only be returned if there were no
+ matches in ymm1/ymm3 respectively, there is no issue with overlap.
+ */
+ vmovdqa 1(%rdi), %ymm1
+ VPMINU (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
+ vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3
+ VPMINU (VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4
+
+ VPMINU %ymm2, %ymm4, %ymm5
+ VPCMPEQ %ymm5, %ymm0, %ymm5
+ vpmovmskb %ymm5, %ecx

- VPCMPEQ (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
+ subq $-(VEC_SIZE * 4), %rdi
+ testl %ecx, %ecx
+ jz L(loop_4x_vec)

- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x1)

- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ VPCMPEQ %ymm1, %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ subq %rdx, %rdi
testl %eax, %eax
+ jnz L(last_vec_return_x0)

- jnz L(first_vec_x2_check)
- subl $VEC_SIZE, %esi
- jle L(max)
-
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ VPCMPEQ %ymm2, %ymm0, %ymm2
+ vpmovmskb %ymm2, %eax
testl %eax, %eax
-
- jnz L(first_vec_x3_check)
- movq %r8, %rax
-# ifdef USE_AS_WCSLEN
+ jnz L(last_vec_return_x1)
+
+ /* Combine last 2 VEC. */
+ VPCMPEQ %ymm3, %ymm0, %ymm3
+ vpmovmskb %ymm3, %eax
+ /* rcx has combined result from all 4 VEC. It will only be used if
+ the first 3 other VEC all did not contain a match. */
+ salq $32, %rcx
+ orq %rcx, %rax
+ tzcntq %rax, %rax
+ subq $(VEC_SIZE * 2 - 1), %rdi
+ addq %rdi, %rax
+# ifdef USE_AS_WCSLEN
shrq $2, %rax
-# endif
+# endif
VZEROUPPER_RETURN

+
+# ifdef USE_AS_STRNLEN
.p2align 4
-L(last_2x_vec):
- addl $(VEC_SIZE * 2), %esi
- VPCMPEQ (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
+L(last_4x_vec_or_less_load):
+ /* Depending on entry adjust rdi / prepare first VEC in ymm1. */
+ subq $-(VEC_SIZE * 4), %rdi
+L(last_4x_vec_or_less_cmpeq):
+ VPCMPEQ 1(%rdi), %ymm0, %ymm1
+L(last_4x_vec_or_less):

- jnz L(first_vec_x0_check)
- subl $VEC_SIZE, %esi
- jle L(max)
+ vpmovmskb %ymm1, %eax
+ /* If remaining length > VEC_SIZE * 2. This works if esi is off by
+ VEC_SIZE * 4. */
+ testl $(VEC_SIZE * 2), %esi
+ jnz L(last_4x_vec)

- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ /* length may have been negative or positive by an offset of
+ VEC_SIZE * 4 depending on where this was called from. This fixes
+ that. */
+ andl $(VEC_SIZE * 4 - 1), %esi
testl %eax, %eax
- jnz L(first_vec_x1_check)
- movq %r8, %rax
-# ifdef USE_AS_WCSLEN
- shrq $2, %rax
-# endif
- VZEROUPPER_RETURN
+ jnz L(last_vec_x1_check)

- .p2align 4
-L(first_vec_x0_check):
+ subl $VEC_SIZE, %esi
+ jb L(max)
+
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
tzcntl %eax, %eax
/* Check the end of data. */
- cmpq %rax, %rsi
- jbe L(max)
+ cmpl %eax, %esi
+ jb L(max)
+ subq %rdx, %rdi
+ addl $(VEC_SIZE + 1), %eax
addq %rdi, %rax
- subq %rdx, %rax
# ifdef USE_AS_WCSLEN
shrq $2, %rax
# endif
VZEROUPPER_RETURN
+# endif

.p2align 4
-L(first_vec_x1_check):
+L(last_vec_return_x0):
tzcntl %eax, %eax
- /* Check the end of data. */
- cmpq %rax, %rsi
- jbe L(max)
- addq $VEC_SIZE, %rax
+ subq $(VEC_SIZE * 4 - 1), %rdi
addq %rdi, %rax
- subq %rdx, %rax
-# ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
shrq $2, %rax
-# endif
+# endif
VZEROUPPER_RETURN

.p2align 4
-L(first_vec_x2_check):
+L(last_vec_return_x1):
tzcntl %eax, %eax
- /* Check the end of data. */
- cmpq %rax, %rsi
- jbe L(max)
- addq $(VEC_SIZE * 2), %rax
+ subq $(VEC_SIZE * 3 - 1), %rdi
addq %rdi, %rax
- subq %rdx, %rax
-# ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
shrq $2, %rax
-# endif
+# endif
VZEROUPPER_RETURN

+# ifdef USE_AS_STRNLEN
.p2align 4
-L(first_vec_x3_check):
+L(last_vec_x1_check):
+
tzcntl %eax, %eax
/* Check the end of data. */
- cmpq %rax, %rsi
- jbe L(max)
- addq $(VEC_SIZE * 3), %rax
+ cmpl %eax, %esi
+ jb L(max)
+ subq %rdx, %rdi
+ incl %eax
addq %rdi, %rax
- subq %rdx, %rax
# ifdef USE_AS_WCSLEN
shrq $2, %rax
# endif
VZEROUPPER_RETURN

- .p2align 4
L(max):
movq %r8, %rax
+ VZEROUPPER_RETURN
+
+ .p2align 4
+L(last_4x_vec):
+ /* Test first 2x VEC normally. */
+ testl %eax, %eax
+ jnz L(last_vec_x1)
+
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x2)
+
+ /* Normalize length. */
+ andl $(VEC_SIZE * 4 - 1), %esi
+ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x3)
+
+ subl $(VEC_SIZE * 3), %esi
+ jb L(max)
+
+ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ tzcntl %eax, %eax
+ /* Check the end of data. */
+ cmpl %eax, %esi
+ jb L(max)
+ subq %rdx, %rdi
+ addl $(VEC_SIZE * 3 + 1), %eax
+ addq %rdi, %rax
# ifdef USE_AS_WCSLEN
shrq $2, %rax
# endif
VZEROUPPER_RETURN

- .p2align 4
-L(zero):
- xorl %eax, %eax
- ret
-# endif

.p2align 4
-L(first_vec_x0):
+L(last_vec_x1):
+ /* Essentially a duplicate of first_vec_x1 but using 64 bit
+ instructions. */
tzcntl %eax, %eax
+ subq %rdx, %rdi
+ incl %eax
addq %rdi, %rax
- subq %rdx, %rax
-# ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
shrq $2, %rax
-# endif
+# endif
VZEROUPPER_RETURN

.p2align 4
-L(first_vec_x1):
+L(last_vec_x2):
+ /* Essentially a duplicate of first_vec_x1 but using 64 bit
+ instructions. */
tzcntl %eax, %eax
- addq $VEC_SIZE, %rax
+ subq %rdx, %rdi
+ addl $(VEC_SIZE + 1), %eax
addq %rdi, %rax
- subq %rdx, %rax
-# ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
shrq $2, %rax
-# endif
+# endif
VZEROUPPER_RETURN

.p2align 4
-L(first_vec_x2):
+L(last_vec_x3):
tzcntl %eax, %eax
- addq $(VEC_SIZE * 2), %rax
+ subl $(VEC_SIZE * 2), %esi
+ /* Check the end of data. */
+ cmpl %eax, %esi
+ jb L(max_end)
+ subq %rdx, %rdi
+ addl $(VEC_SIZE * 2 + 1), %eax
addq %rdi, %rax
- subq %rdx, %rax
-# ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
shrq $2, %rax
-# endif
+# endif
+ VZEROUPPER_RETURN
+L(max_end):
+ movq %r8, %rax
VZEROUPPER_RETURN
+# endif

+ /* Cold case for crossing page with first load. */
.p2align 4
-L(4x_vec_end):
- VPCMPEQ %ymm1, %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
- VPCMPEQ %ymm2, %ymm0, %ymm2
- vpmovmskb %ymm2, %eax
+L(cross_page_boundary):
+ /* Align data to VEC_SIZE - 1. */
+ orq $(VEC_SIZE - 1), %rdi
+ VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
+ so no need to manually mod rdx. */
+ sarxl %edx, %eax, %eax
+# ifdef USE_AS_STRNLEN
testl %eax, %eax
- jnz L(first_vec_x1)
- VPCMPEQ %ymm3, %ymm0, %ymm3
- vpmovmskb %ymm3, %eax
+ jnz L(cross_page_less_vec)
+ leaq 1(%rdi), %rcx
+ subq %rdx, %rcx
+ /* Check length. */
+ cmpq %rsi, %rcx
+ jb L(cross_page_continue)
+ movq %r8, %rax
+# else
testl %eax, %eax
- jnz L(first_vec_x2)
- VPCMPEQ %ymm4, %ymm0, %ymm4
- vpmovmskb %ymm4, %eax
-L(first_vec_x3):
+ jz L(cross_page_continue)
tzcntl %eax, %eax
- addq $(VEC_SIZE * 3), %rax
- addq %rdi, %rax
- subq %rdx, %rax
-# ifdef USE_AS_WCSLEN
- shrq $2, %rax
+# ifdef USE_AS_WCSLEN
+ shrl $2, %eax
+# endif
# endif
+L(return_vzeroupper):
+ ZERO_UPPER_VEC_REGISTERS_RETURN
+
+# ifdef USE_AS_STRNLEN
+ .p2align 4
+L(cross_page_less_vec):
+ tzcntl %eax, %eax
+ cmpq %rax, %rsi
+ cmovb %esi, %eax
+# ifdef USE_AS_WCSLEN
+ shrl $2, %eax
+# endif
VZEROUPPER_RETURN
+# endif

END (STRLEN)
#endif
--
GitLab

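The L(first_vec_x0) path added in the patch above relies on a neat trick: btsq %rsi, %rax sets bit maxlen in the match mask, so the following tzcnt returns min(maxlen, index of first match) with no separate compare-and-branch. A minimal C sketch of the idiom, assuming maxlen < 64 as in the asm (this path is only taken when length <= VEC_SIZE):

#include <stdint.h>

/* mask: one bit per byte, set where the vector compare found a zero
   byte (what vpmovmskb produces).  maxlen: the strnlen bound, < 64.  */
static unsigned
bounded_first_match (uint64_t mask, unsigned maxlen)
{
  mask |= (uint64_t) 1 << maxlen;   /* bts: plant a "match" at maxlen.  */
  return __builtin_ctzll (mask);    /* tzcnt: min (maxlen, first match).  */
}

With mask == 0 the planted bit guarantees the count is maxlen; with a real match below maxlen the trailing-zero count finds it first.
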
@ -1,701 +0,0 @@
From 2a76821c3081d2c0231ecd2618f52662cb48fccd Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon, 3 May 2021 03:03:19 -0400
Subject: [PATCH] x86: Optimize memchr-evex.S
Content-type: text/plain; charset=UTF-8

No bug. This commit optimizes memchr-evex.S. The optimizations include
replacing some branches with cmovcc, avoiding some branches entirely
in the less_4x_vec case, making the page cross logic less strict,
saving some ALU in the alignment process, and most importantly
increasing ILP in the 4x loop. test-memchr, test-rawmemchr, and
test-wmemchr are all passing.

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/multiarch/memchr-evex.S | 547 +++++++++++++++----------
1 file changed, 322 insertions(+), 225 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
index 6dd5d67b..81d5cd64 100644
--- a/sysdeps/x86_64/multiarch/memchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
@@ -26,14 +26,28 @@

# ifdef USE_AS_WMEMCHR
# define VPBROADCAST vpbroadcastd
-# define VPCMP vpcmpd
-# define SHIFT_REG r8d
+# define VPMINU vpminud
+# define VPCMP vpcmpd
+# define VPCMPEQ vpcmpeqd
+# define CHAR_SIZE 4
# else
# define VPBROADCAST vpbroadcastb
-# define VPCMP vpcmpb
-# define SHIFT_REG ecx
+# define VPMINU vpminub
+# define VPCMP vpcmpb
+# define VPCMPEQ vpcmpeqb
+# define CHAR_SIZE 1
# endif

+# ifdef USE_AS_RAWMEMCHR
+# define RAW_PTR_REG rcx
+# define ALGN_PTR_REG rdi
+# else
+# define RAW_PTR_REG rdi
+# define ALGN_PTR_REG rcx
+# endif
+
+# define XMMZERO xmm23
+# define YMMZERO ymm23
# define XMMMATCH xmm16
# define YMMMATCH ymm16
# define YMM1 ymm17
@@ -44,6 +58,8 @@
# define YMM6 ymm22

# define VEC_SIZE 32
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+# define PAGE_SIZE 4096

.section .text.evex,"ax",@progbits
ENTRY (MEMCHR)
@@ -51,11 +67,7 @@ ENTRY (MEMCHR)
/* Check for zero length. */
test %RDX_LP, %RDX_LP
jz L(zero)
-# endif
- movl %edi, %ecx
-# ifdef USE_AS_WMEMCHR
- shl $2, %RDX_LP
-# else
+
# ifdef __ILP32__
/* Clear the upper 32 bits. */
movl %edx, %edx
@@ -64,318 +76,403 @@ ENTRY (MEMCHR)
/* Broadcast CHAR to YMMMATCH. */
VPBROADCAST %esi, %YMMMATCH
/* Check if we may cross page boundary with one vector load. */
- andl $(2 * VEC_SIZE - 1), %ecx
- cmpl $VEC_SIZE, %ecx
- ja L(cros_page_boundary)
+ movl %edi, %eax
+ andl $(PAGE_SIZE - 1), %eax
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ ja L(cross_page_boundary)

/* Check the first VEC_SIZE bytes. */
- VPCMP $0, (%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- testl %eax, %eax
-
+ VPCMP $0, (%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
# ifndef USE_AS_RAWMEMCHR
- jnz L(first_vec_x0_check)
- /* Adjust length and check the end of data. */
- subq $VEC_SIZE, %rdx
- jbe L(zero)
+ /* If length < CHAR_PER_VEC handle special. */
+ cmpq $CHAR_PER_VEC, %rdx
+ jbe L(first_vec_x0)
+# endif
+ testl %eax, %eax
+ jz L(aligned_more)
+ tzcntl %eax, %eax
+# ifdef USE_AS_WMEMCHR
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
# else
- jnz L(first_vec_x0)
+ addq %rdi, %rax
# endif
-
- /* Align data for aligned loads in the loop. */
- addq $VEC_SIZE, %rdi
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
+ ret

# ifndef USE_AS_RAWMEMCHR
- /* Adjust length. */
- addq %rcx, %rdx
-
- subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
-# endif
- jmp L(more_4x_vec)
+L(zero):
+ xorl %eax, %eax
+ ret

+ .p2align 5
+L(first_vec_x0):
+ /* Check if first match was before length. */
+ tzcntl %eax, %eax
+ xorl %ecx, %ecx
+ cmpl %eax, %edx
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
+ cmovle %rcx, %rax
+ ret
+# else
+ /* NB: first_vec_x0 is 17 bytes which will leave
+ cross_page_boundary (which is relatively cold) close enough
+ to ideal alignment. So only realign L(cross_page_boundary) if
+ rawmemchr. */
.p2align 4
-L(cros_page_boundary):
- andl $(VEC_SIZE - 1), %ecx
+# endif
+L(cross_page_boundary):
+ /* Save pointer before aligning as its original value is
+ necessary for computing the return address if byte is found or
+ adjusting length if it is not and this is memchr. */
+ movq %rdi, %rcx
+ /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi
+ for rawmemchr. */
+ andq $-VEC_SIZE, %ALGN_PTR_REG
+ VPCMP $0, (%ALGN_PTR_REG), %YMMMATCH, %k0
+ kmovd %k0, %r8d
# ifdef USE_AS_WMEMCHR
- /* NB: Divide shift count by 4 since each bit in K1 represent 4
+ /* NB: Divide shift count by 4 since each bit in K0 represent 4
bytes. */
- movl %ecx, %SHIFT_REG
- sarl $2, %SHIFT_REG
+ sarl $2, %eax
+# endif
+# ifndef USE_AS_RAWMEMCHR
+ movl $(PAGE_SIZE / CHAR_SIZE), %esi
+ subl %eax, %esi
# endif
- andq $-VEC_SIZE, %rdi
- VPCMP $0, (%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- /* Remove the leading bytes. */
- sarxl %SHIFT_REG, %eax, %eax
- testl %eax, %eax
- jz L(aligned_more)
- tzcntl %eax, %eax
# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
+ andl $(CHAR_PER_VEC - 1), %eax
# endif
+ /* Remove the leading bytes. */
+ sarxl %eax, %r8d, %eax
# ifndef USE_AS_RAWMEMCHR
/* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
+ cmpq %rsi, %rdx
+ jbe L(first_vec_x0)
+# endif
+ testl %eax, %eax
+ jz L(cross_page_continue)
+ tzcntl %eax, %eax
+# ifdef USE_AS_WMEMCHR
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
+ leaq (%RAW_PTR_REG, %rax, CHAR_SIZE), %rax
+# else
+ addq %RAW_PTR_REG, %rax
# endif
- addq %rdi, %rax
- addq %rcx, %rax
ret

.p2align 4
-L(aligned_more):
-# ifndef USE_AS_RAWMEMCHR
- /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
- instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
- overflow. */
- negq %rcx
- addq $VEC_SIZE, %rcx
+L(first_vec_x1):
+ tzcntl %eax, %eax
+ leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+ ret

- /* Check the end of data. */
- subq %rcx, %rdx
- jbe L(zero)
-# endif
+ .p2align 4
+L(first_vec_x2):
+ tzcntl %eax, %eax
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ ret

- addq $VEC_SIZE, %rdi
+ .p2align 4
+L(first_vec_x3):
+ tzcntl %eax, %eax
+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+ ret

-# ifndef USE_AS_RAWMEMCHR
- subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
-# endif
+ .p2align 4
+L(first_vec_x4):
+ tzcntl %eax, %eax
+ leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+ ret

-L(more_4x_vec):
+ .p2align 5
+L(aligned_more):
/* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
since data is only aligned to VEC_SIZE. */
- VPCMP $0, (%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)

- VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
+# ifndef USE_AS_RAWMEMCHR
+ /* Align data to VEC_SIZE. */
+L(cross_page_continue):
+ xorl %ecx, %ecx
+ subl %edi, %ecx
+ andq $-VEC_SIZE, %rdi
+ /* esi is for adjusting length to see if near the end. */
+ leal (VEC_SIZE * 5)(%rdi, %rcx), %esi
+# ifdef USE_AS_WMEMCHR
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %esi
+# endif
+# else
+ andq $-VEC_SIZE, %rdi
+L(cross_page_continue):
+# endif
+ /* Load first VEC regardless. */
+ VPCMP $0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+# ifndef USE_AS_RAWMEMCHR
+ /* Adjust length. If near end handle specially. */
+ subq %rsi, %rdx
+ jbe L(last_4x_vec_or_less)
+# endif
testl %eax, %eax
jnz L(first_vec_x1)

- VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
+ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
testl %eax, %eax
jnz L(first_vec_x2)

- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
+ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
testl %eax, %eax
jnz L(first_vec_x3)

- addq $(VEC_SIZE * 4), %rdi
+ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x4)
+

# ifndef USE_AS_RAWMEMCHR
- subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
-# endif
+ /* Check if at last CHAR_PER_VEC * 4 length. */
+ subq $(CHAR_PER_VEC * 4), %rdx
+ jbe L(last_4x_vec_or_less_cmpeq)
+ addq $VEC_SIZE, %rdi

- /* Align data to 4 * VEC_SIZE. */
- movq %rdi, %rcx
- andl $(4 * VEC_SIZE - 1), %ecx
+ /* Align data to VEC_SIZE * 4 for the loop and readjust length.
+ */
+# ifdef USE_AS_WMEMCHR
+ movl %edi, %ecx
andq $-(4 * VEC_SIZE), %rdi
-
-# ifndef USE_AS_RAWMEMCHR
- /* Adjust length. */
+ andl $(VEC_SIZE * 4 - 1), %ecx
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %ecx
addq %rcx, %rdx
+# else
+ addq %rdi, %rdx
+ andq $-(4 * VEC_SIZE), %rdi
+ subq %rdi, %rdx
+# endif
+# else
+ addq $VEC_SIZE, %rdi
+ andq $-(4 * VEC_SIZE), %rdi
# endif

+ vpxorq %XMMZERO, %XMMZERO, %XMMZERO
+
+ /* Compare 4 * VEC at a time forward. */
.p2align 4
L(loop_4x_vec):
- /* Compare 4 * VEC at a time forward. */
- VPCMP $0, (%rdi), %YMMMATCH, %k1
- VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k2
- kord %k1, %k2, %k5
- VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
-
- kord %k3, %k4, %k6
- kortestd %k5, %k6
- jnz L(4x_vec_end)
-
- addq $(VEC_SIZE * 4), %rdi
-
+ /* It would be possible to save some instructions using 4x VPCMP
+ but the bottleneck on port 5 makes it not worth it. */
+ VPCMP $4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
+ /* xor will set bytes match esi to zero. */
+ vpxorq (VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
+ vpxorq (VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
+ VPCMP $0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
+ /* Reduce VEC2 / VEC3 with min and VEC1 with zero mask. */
+ VPMINU %YMM2, %YMM3, %YMM3 {%k1} {z}
+ VPCMP $0, %YMM3, %YMMZERO, %k2
# ifdef USE_AS_RAWMEMCHR
- jmp L(loop_4x_vec)
+ subq $-(VEC_SIZE * 4), %rdi
+ kortestd %k2, %k3
+ jz L(loop_4x_vec)
# else
- subq $(VEC_SIZE * 4), %rdx
+ kortestd %k2, %k3
+ jnz L(loop_4x_vec_end)
+
+ subq $-(VEC_SIZE * 4), %rdi
+
+ subq $(CHAR_PER_VEC * 4), %rdx
ja L(loop_4x_vec)

+ /* Fall through into less than 4 remaining vectors of length case.
+ */
+ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ addq $(VEC_SIZE * 3), %rdi
+ .p2align 4
L(last_4x_vec_or_less):
- /* Less than 4 * VEC and aligned to VEC_SIZE. */
- addl $(VEC_SIZE * 2), %edx
- jle L(last_2x_vec)
-
- VPCMP $0, (%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
+ /* Check if first VEC contained match. */
testl %eax, %eax
- jnz L(first_vec_x0)
+ jnz L(first_vec_x1_check)

- VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- testl %eax, %eax
- jnz L(first_vec_x1)
+ /* If remaining length > CHAR_PER_VEC * 2. */
+ addl $(CHAR_PER_VEC * 2), %edx
+ jg L(last_4x_vec)

- VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- testl %eax, %eax
+L(last_2x_vec):
+ /* If remaining length < CHAR_PER_VEC. */
+ addl $CHAR_PER_VEC, %edx
+ jle L(zero_end)

- jnz L(first_vec_x2_check)
- subl $VEC_SIZE, %edx
- jle L(zero)
+ /* Check VEC2 and compare any match with remaining length. */
+ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ tzcntl %eax, %eax
+ cmpl %eax, %edx
+ jbe L(set_zero_end)
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+L(zero_end):
+ ret

- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- testl %eax, %eax

- jnz L(first_vec_x3_check)
+ .p2align 4
+L(first_vec_x1_check):
+ tzcntl %eax, %eax
+ /* Adjust length. */
+ subl $-(CHAR_PER_VEC * 4), %edx
+ /* Check if match within remaining length. */
+ cmpl %eax, %edx
+ jbe L(set_zero_end)
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
+ leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+ ret
+L(set_zero_end):
xorl %eax, %eax
ret

.p2align 4
-L(last_2x_vec):
- addl $(VEC_SIZE * 2), %edx
- VPCMP $0, (%rdi), %YMMMATCH, %k1
+L(loop_4x_vec_end):
+# endif
+ /* rawmemchr will fall through into this if match was found in
+ loop. */
+
+ /* k1 has the NOT of the matches with VEC1. */
kmovd %k1, %eax
- testl %eax, %eax
+# ifdef USE_AS_WMEMCHR
+ subl $((1 << CHAR_PER_VEC) - 1), %eax
+# else
+ incl %eax
+# endif
+ jnz L(last_vec_x1_return)

- jnz L(first_vec_x0_check)
- subl $VEC_SIZE, %edx
- jle L(zero)
+ VPCMP $0, %YMM2, %YMMZERO, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x2_return)

- VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
+ kmovd %k2, %eax
testl %eax, %eax
- jnz L(first_vec_x1_check)
- xorl %eax, %eax
- ret
+ jnz L(last_vec_x3_return)

- .p2align 4
-L(first_vec_x0_check):
+ kmovd %k3, %eax
tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
+# ifdef USE_AS_RAWMEMCHR
+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+# else
+ leaq (VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax
# endif
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
- addq %rdi, %rax
ret

.p2align 4
-L(first_vec_x1_check):
+L(last_vec_x1_return):
tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
-# endif
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
- addq $VEC_SIZE, %rax
+# ifdef USE_AS_RAWMEMCHR
+# ifdef USE_AS_WMEMCHR
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
+# else
addq %rdi, %rax
- ret
-
- .p2align 4
-L(first_vec_x2_check):
- tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
+# endif
+# else
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
+ leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
# endif
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
- addq $(VEC_SIZE * 2), %rax
- addq %rdi, %rax
ret

.p2align 4
-L(first_vec_x3_check):
+L(last_vec_x2_return):
tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
+# ifdef USE_AS_RAWMEMCHR
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
+ leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+# else
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
+ leaq (VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax
# endif
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
- addq $(VEC_SIZE * 3), %rax
- addq %rdi, %rax
ret

.p2align 4
-L(zero):
- xorl %eax, %eax
- ret
-# endif
-
- .p2align 4
-L(first_vec_x0):
+L(last_vec_x3_return):
tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq (%rdi, %rax, 4), %rax
+# ifdef USE_AS_RAWMEMCHR
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
# else
- addq %rdi, %rax
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
+ leaq (VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax
# endif
ret

+
+# ifndef USE_AS_RAWMEMCHR
+L(last_4x_vec_or_less_cmpeq):
+ VPCMP $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ subq $-(VEC_SIZE * 4), %rdi
+ /* Check first VEC regardless. */
+ testl %eax, %eax
+ jnz L(first_vec_x1_check)
+
+ /* If remaining length <= CHAR_PER_VEC * 2. */
+ addl $(CHAR_PER_VEC * 2), %edx
+ jle L(last_2x_vec)
+
.p2align 4
-L(first_vec_x1):
+L(last_4x_vec):
+ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x2)
+
+
+ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ /* Create mask for possible matches within remaining length. */
+# ifdef USE_AS_WMEMCHR
+ movl $((1 << (CHAR_PER_VEC * 2)) - 1), %ecx
+ bzhil %edx, %ecx, %ecx
+# else
+ movq $-1, %rcx
+ bzhiq %rdx, %rcx, %rcx
+# endif
+ /* Test matches in data against length match. */
+ andl %ecx, %eax
+ jnz L(last_vec_x3)
+
+ /* If remaining length <= CHAR_PER_VEC * 3 (note this is after
+ remaining length was found to be > CHAR_PER_VEC * 2). */
+ subl $CHAR_PER_VEC, %edx
+ jbe L(zero_end2)
+
+
+ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ /* Shift remaining length mask for last VEC. */
+# ifdef USE_AS_WMEMCHR
+ shrl $CHAR_PER_VEC, %ecx
+# else
+ shrq $CHAR_PER_VEC, %rcx
+# endif
+ andl %ecx, %eax
+ jz L(zero_end2)
tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq VEC_SIZE(%rdi, %rax, 4), %rax
-# else
- addq $VEC_SIZE, %rax
- addq %rdi, %rax
-# endif
+ leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+L(zero_end2):
ret

- .p2align 4
-L(first_vec_x2):
+L(last_vec_x2):
tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax
-# else
- addq $(VEC_SIZE * 2), %rax
- addq %rdi, %rax
-# endif
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
ret

.p2align 4
-L(4x_vec_end):
- kmovd %k1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
- kmovd %k2, %eax
- testl %eax, %eax
- jnz L(first_vec_x1)
- kmovd %k3, %eax
- testl %eax, %eax
- jnz L(first_vec_x2)
- kmovd %k4, %eax
- testl %eax, %eax
-L(first_vec_x3):
+L(last_vec_x3):
tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq (VEC_SIZE * 3)(%rdi, %rax, 4), %rax
-# else
- addq $(VEC_SIZE * 3), %rax
- addq %rdi, %rax
-# endif
+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
ret
+# endif

END (MEMCHR)
#endif
--
GitLab

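One technique worth calling out from the patch above: in L(last_4x_vec), the remaining length is turned into a bitmask with bzhi (movq $-1, %rcx; bzhiq %rdx, %rcx, %rcx) and ANDed against the compare mask, so only matches inside the remaining length survive. A minimal C sketch of the same masking, with plain shifts standing in for bzhi (hardware bzhi leaves the source untouched when the index is >= 64, which the explicit test mimics):

#include <stdint.h>

/* match_mask: one bit per char position from the vector compare.
   remaining: number of chars still inside the buffer.  */
static uint64_t
matches_within_length (uint64_t match_mask, unsigned remaining)
{
  uint64_t keep = remaining >= 64 ? ~(uint64_t) 0
                                  : ((uint64_t) 1 << remaining) - 1;
  return match_mask & keep;   /* nonzero only for in-bounds matches.  */
}

If the result is zero the caller returns NULL; otherwise a trailing-zero count on it gives the in-bounds match position directly.
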
@ -1,30 +0,0 @@
From 6ea916adfa0ab9af6e7dc6adcf6f977dfe017835 Mon Sep 17 00:00:00 2001
From: Alice Xu <alice.d.xu@gmail.com>
Date: Fri, 7 May 2021 19:03:21 -0700
Subject: [PATCH] x86-64: Fix an unknown vector operation in memchr-evex.S
Content-type: text/plain; charset=UTF-8

An unknown vector operation occurred in commit 2a76821c308. Fixed it
by using "ymm{k1}{z}" but not "ymm {k1} {z}".

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/multiarch/memchr-evex.S | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
index 81d5cd64..f3fdad4f 100644
--- a/sysdeps/x86_64/multiarch/memchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
@@ -271,7 +271,7 @@ L(loop_4x_vec):
vpxorq (VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
VPCMP $0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
/* Reduce VEC2 / VEC3 with min and VEC1 with zero mask. */
- VPMINU %YMM2, %YMM3, %YMM3 {%k1} {z}
+ VPMINU %YMM2, %YMM3, %YMM3{%k1}{z}
VPCMP $0, %YMM3, %YMMZERO, %k2
# ifdef USE_AS_RAWMEMCHR
subq $-(VEC_SIZE * 4), %rdi
--
GitLab

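The one-line fix above is purely an assembler-syntax issue; the operation itself is a zero-masked unsigned minimum. For reference, a sketch of what VPMINU %YMM2, %YMM3, %YMM3{%k1}{z} computes, written with AVX-512VL/BW intrinsics (byte-element case; the wmemchr build would use the epu32 form):

#include <immintrin.h>

/* dst[i] = k1[i] ? min (a[i], b[i]) : 0 -- lanes masked off by k1 are
   forced to zero, which is how the loop folds the VEC1 "no match"
   information into the VEC2/VEC3 reduction before the compare with
   YMMZERO.  */
static inline __m256i
maskz_min_epu8 (__mmask32 k1, __m256i a, __m256i b)
{
  return _mm256_maskz_min_epu8 (k1, a, b);
}
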
@ -1,566 +0,0 @@
From a0db678071c60b6c47c468d231dd0b3694ba7a98 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Tue, 22 Jun 2021 20:42:10 -0700
Subject: [PATCH] x86-64: Move strlen.S to multiarch/strlen-vec.S
Content-type: text/plain; charset=UTF-8

Since strlen.S contains SSE2 version of strlen/strnlen and SSE4.1
version of wcslen/wcsnlen, move strlen.S to multiarch/strlen-vec.S
and include multiarch/strlen-vec.S from SSE2 and SSE4.1 variants.
This also removes the unused symbols, __GI___strlen_sse2 and
__GI___wcsnlen_sse4_1.
---
sysdeps/x86_64/multiarch/strlen-sse2.S | 2 +-
sysdeps/x86_64/multiarch/strlen-vec.S | 257 ++++++++++++++++++++++
sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S | 2 +-
sysdeps/x86_64/strlen.S | 243 +-------------------
4 files changed, 262 insertions(+), 242 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/strlen-vec.S

Conflicts:
sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
(Copyright dates, URL)

diff --git a/sysdeps/x86_64/multiarch/strlen-sse2.S b/sysdeps/x86_64/multiarch/strlen-sse2.S
index 7bc57b8d..449c8a7f 100644
--- a/sysdeps/x86_64/multiarch/strlen-sse2.S
+++ b/sysdeps/x86_64/multiarch/strlen-sse2.S
@@ -20,4 +20,4 @@
# define strlen __strlen_sse2
#endif

-#include "../strlen.S"
+#include "strlen-vec.S"
diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
new file mode 100644
index 00000000..8f660bb9
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen-vec.S
@@ -0,0 +1,257 @@
+/* SSE2 version of strlen and SSE4.1 version of wcslen.
+ Copyright (C) 2012-2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#ifdef AS_WCSLEN
+# define PMINU pminud
+# define PCMPEQ pcmpeqd
+# define SHIFT_RETURN shrq $2, %rax
+#else
+# define PMINU pminub
+# define PCMPEQ pcmpeqb
+# define SHIFT_RETURN
+#endif
+
+/* Long lived register in strlen(s), strnlen(s, n) are:
+
+ %xmm3 - zero
+ %rdi - s
+ %r10 (s+n) & (~(64-1))
+ %r11 s+n
+*/
+
+
+.text
+ENTRY(strlen)
+
+/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */
+#define FIND_ZERO \
+ PCMPEQ (%rax), %xmm0; \
+ PCMPEQ 16(%rax), %xmm1; \
+ PCMPEQ 32(%rax), %xmm2; \
+ PCMPEQ 48(%rax), %xmm3; \
+ pmovmskb %xmm0, %esi; \
+ pmovmskb %xmm1, %edx; \
+ pmovmskb %xmm2, %r8d; \
+ pmovmskb %xmm3, %ecx; \
+ salq $16, %rdx; \
+ salq $16, %rcx; \
+ orq %rsi, %rdx; \
+ orq %r8, %rcx; \
+ salq $32, %rcx; \
+ orq %rcx, %rdx;
+
+#ifdef AS_STRNLEN
+/* Do not read anything when n==0. */
+ test %RSI_LP, %RSI_LP
+ jne L(n_nonzero)
+ xor %rax, %rax
+ ret
+L(n_nonzero):
+# ifdef AS_WCSLEN
+ shl $2, %RSI_LP
+# endif
+
+/* Initialize long lived registers. */
+
+ add %RDI_LP, %RSI_LP
+ mov %RSI_LP, %R10_LP
+ and $-64, %R10_LP
+ mov %RSI_LP, %R11_LP
+#endif
+
+ pxor %xmm0, %xmm0
+ pxor %xmm1, %xmm1
+ pxor %xmm2, %xmm2
+ pxor %xmm3, %xmm3
+ movq %rdi, %rax
+ movq %rdi, %rcx
+ andq $4095, %rcx
+/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */
+ cmpq $4047, %rcx
+/* We cannot unify this branching as it would be ~6 cycles slower. */
+ ja L(cross_page)
+
+#ifdef AS_STRNLEN
+/* Test if end is among first 64 bytes. */
+# define STRNLEN_PROLOG \
+ mov %r11, %rsi; \
+ subq %rax, %rsi; \
+ andq $-64, %rax; \
+ testq $-64, %rsi; \
+ je L(strnlen_ret)
+#else
+# define STRNLEN_PROLOG andq $-64, %rax;
+#endif
+
+/* Ignore bits in mask that come before start of string. */
+#define PROLOG(lab) \
+ movq %rdi, %rcx; \
+ xorq %rax, %rcx; \
+ STRNLEN_PROLOG; \
+ sarq %cl, %rdx; \
+ test %rdx, %rdx; \
+ je L(lab); \
+ bsfq %rdx, %rax; \
+ SHIFT_RETURN; \
+ ret
+
+#ifdef AS_STRNLEN
+ andq $-16, %rax
+ FIND_ZERO
+#else
+ /* Test first 16 bytes unaligned. */
+ movdqu (%rax), %xmm4
+ PCMPEQ %xmm0, %xmm4
+ pmovmskb %xmm4, %edx
+ test %edx, %edx
+ je L(next48_bytes)
+ bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */
+ SHIFT_RETURN
+ ret
+
+L(next48_bytes):
+/* Same as FIND_ZERO except we do not check first 16 bytes. */
+ andq $-16, %rax
+ PCMPEQ 16(%rax), %xmm1
+ PCMPEQ 32(%rax), %xmm2
+ PCMPEQ 48(%rax), %xmm3
+ pmovmskb %xmm1, %edx
+ pmovmskb %xmm2, %r8d
+ pmovmskb %xmm3, %ecx
+ salq $16, %rdx
+ salq $16, %rcx
+ orq %r8, %rcx
+ salq $32, %rcx
+ orq %rcx, %rdx
+#endif
+
+ /* When no zero byte is found xmm1-3 are zero so we do not have to
+ zero them. */
+ PROLOG(loop)
+
+ .p2align 4
+L(cross_page):
+ andq $-64, %rax
+ FIND_ZERO
+ PROLOG(loop_init)
+
+#ifdef AS_STRNLEN
+/* We must do this check to correctly handle strnlen (s, -1). */
+L(strnlen_ret):
+ bts %rsi, %rdx
+ sarq %cl, %rdx
+ test %rdx, %rdx
+ je L(loop_init)
+ bsfq %rdx, %rax
+ SHIFT_RETURN
+ ret
+#endif
+ .p2align 4
+L(loop_init):
+ pxor %xmm1, %xmm1
+ pxor %xmm2, %xmm2
+ pxor %xmm3, %xmm3
+#ifdef AS_STRNLEN
+ .p2align 4
+L(loop):
+
+ addq $64, %rax
+ cmpq %rax, %r10
+ je L(exit_end)
+
+ movdqa (%rax), %xmm0
+ PMINU 16(%rax), %xmm0
+ PMINU 32(%rax), %xmm0
+ PMINU 48(%rax), %xmm0
+ PCMPEQ %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ testl %edx, %edx
+ jne L(exit)
+ jmp L(loop)
+
+ .p2align 4
+L(exit_end):
+ cmp %rax, %r11
+ je L(first) /* Do not read when end is at page boundary. */
+ pxor %xmm0, %xmm0
+ FIND_ZERO
+
+L(first):
+ bts %r11, %rdx
+ bsfq %rdx, %rdx
+ addq %rdx, %rax
+ subq %rdi, %rax
+ SHIFT_RETURN
+ ret
+
+ .p2align 4
+L(exit):
+ pxor %xmm0, %xmm0
+ FIND_ZERO
+
+ bsfq %rdx, %rdx
+ addq %rdx, %rax
+ subq %rdi, %rax
+ SHIFT_RETURN
+ ret
+
+#else
+
+ /* Main loop. Unrolled twice to improve L2 cache performance on core2. */
+ .p2align 4
+L(loop):
+
+ movdqa 64(%rax), %xmm0
+ PMINU 80(%rax), %xmm0
+ PMINU 96(%rax), %xmm0
+ PMINU 112(%rax), %xmm0
+ PCMPEQ %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ testl %edx, %edx
+ jne L(exit64)
+
+ subq $-128, %rax
+
+ movdqa (%rax), %xmm0
+ PMINU 16(%rax), %xmm0
+ PMINU 32(%rax), %xmm0
+ PMINU 48(%rax), %xmm0
+ PCMPEQ %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ testl %edx, %edx
+ jne L(exit0)
+ jmp L(loop)
+
+ .p2align 4
+L(exit64):
+ addq $64, %rax
+L(exit0):
+ pxor %xmm0, %xmm0
+ FIND_ZERO
+
+ bsfq %rdx, %rdx
+ addq %rdx, %rax
+ subq %rdi, %rax
+ SHIFT_RETURN
+ ret
+
+#endif
+
+END(strlen)
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
index a8cab0cb..5fa51fe0 100644
--- a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
+++ b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
@@ -2,4 +2,4 @@
#define AS_STRNLEN
#define strlen __wcsnlen_sse4_1

-#include "../strlen.S"
+#include "strlen-vec.S"
diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
index f845f3d4..ad047d84 100644
--- a/sysdeps/x86_64/strlen.S
+++ b/sysdeps/x86_64/strlen.S
@@ -1,5 +1,5 @@
-/* SSE2 version of strlen/wcslen.
- Copyright (C) 2012-2018 Free Software Foundation, Inc.
+/* SSE2 version of strlen.
+ Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.

The GNU C Library is free software; you can redistribute it and/or
@@ -16,243 +16,6 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */

-#include <sysdep.h>
+#include "multiarch/strlen-vec.S"

-#ifdef AS_WCSLEN
-# define PMINU pminud
-# define PCMPEQ pcmpeqd
-# define SHIFT_RETURN shrq $2, %rax
-#else
-# define PMINU pminub
-# define PCMPEQ pcmpeqb
-# define SHIFT_RETURN
-#endif
-
-/* Long lived register in strlen(s), strnlen(s, n) are:
-
- %xmm3 - zero
- %rdi - s
- %r10 (s+n) & (~(64-1))
- %r11 s+n
-*/
-
-
-.text
-ENTRY(strlen)
-
-/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */
-#define FIND_ZERO \
- PCMPEQ (%rax), %xmm0; \
- PCMPEQ 16(%rax), %xmm1; \
- PCMPEQ 32(%rax), %xmm2; \
- PCMPEQ 48(%rax), %xmm3; \
- pmovmskb %xmm0, %esi; \
- pmovmskb %xmm1, %edx; \
- pmovmskb %xmm2, %r8d; \
- pmovmskb %xmm3, %ecx; \
- salq $16, %rdx; \
- salq $16, %rcx; \
- orq %rsi, %rdx; \
- orq %r8, %rcx; \
- salq $32, %rcx; \
- orq %rcx, %rdx;
-
-#ifdef AS_STRNLEN
-/* Do not read anything when n==0. */
- test %RSI_LP, %RSI_LP
- jne L(n_nonzero)
- xor %rax, %rax
- ret
-L(n_nonzero):
-# ifdef AS_WCSLEN
- shl $2, %RSI_LP
-# endif
-
-/* Initialize long lived registers. */
-
- add %RDI_LP, %RSI_LP
- mov %RSI_LP, %R10_LP
- and $-64, %R10_LP
- mov %RSI_LP, %R11_LP
-#endif
-
- pxor %xmm0, %xmm0
- pxor %xmm1, %xmm1
- pxor %xmm2, %xmm2
- pxor %xmm3, %xmm3
- movq %rdi, %rax
- movq %rdi, %rcx
- andq $4095, %rcx
-/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */
|
|
||||||
- cmpq $4047, %rcx
|
|
||||||
-/* We cannot unify this branching as it would be ~6 cycles slower. */
|
|
||||||
- ja L(cross_page)
|
|
||||||
-
|
|
||||||
-#ifdef AS_STRNLEN
|
|
||||||
-/* Test if end is among first 64 bytes. */
|
|
||||||
-# define STRNLEN_PROLOG \
|
|
||||||
- mov %r11, %rsi; \
|
|
||||||
- subq %rax, %rsi; \
|
|
||||||
- andq $-64, %rax; \
|
|
||||||
- testq $-64, %rsi; \
|
|
||||||
- je L(strnlen_ret)
|
|
||||||
-#else
|
|
||||||
-# define STRNLEN_PROLOG andq $-64, %rax;
|
|
||||||
-#endif
|
|
||||||
-
|
|
||||||
-/* Ignore bits in mask that come before start of string. */
|
|
||||||
-#define PROLOG(lab) \
|
|
||||||
- movq %rdi, %rcx; \
|
|
||||||
- xorq %rax, %rcx; \
|
|
||||||
- STRNLEN_PROLOG; \
|
|
||||||
- sarq %cl, %rdx; \
|
|
||||||
- test %rdx, %rdx; \
|
|
||||||
- je L(lab); \
|
|
||||||
- bsfq %rdx, %rax; \
|
|
||||||
- SHIFT_RETURN; \
|
|
||||||
- ret
|
|
||||||
-
|
|
||||||
-#ifdef AS_STRNLEN
|
|
||||||
- andq $-16, %rax
|
|
||||||
- FIND_ZERO
|
|
||||||
-#else
|
|
||||||
- /* Test first 16 bytes unaligned. */
|
|
||||||
- movdqu (%rax), %xmm4
|
|
||||||
- PCMPEQ %xmm0, %xmm4
|
|
||||||
- pmovmskb %xmm4, %edx
|
|
||||||
- test %edx, %edx
|
|
||||||
- je L(next48_bytes)
|
|
||||||
- bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */
|
|
||||||
- SHIFT_RETURN
|
|
||||||
- ret
|
|
||||||
-
|
|
||||||
-L(next48_bytes):
|
|
||||||
-/* Same as FIND_ZERO except we do not check first 16 bytes. */
|
|
||||||
- andq $-16, %rax
|
|
||||||
- PCMPEQ 16(%rax), %xmm1
|
|
||||||
- PCMPEQ 32(%rax), %xmm2
|
|
||||||
- PCMPEQ 48(%rax), %xmm3
|
|
||||||
- pmovmskb %xmm1, %edx
|
|
||||||
- pmovmskb %xmm2, %r8d
|
|
||||||
- pmovmskb %xmm3, %ecx
|
|
||||||
- salq $16, %rdx
|
|
||||||
- salq $16, %rcx
|
|
||||||
- orq %r8, %rcx
|
|
||||||
- salq $32, %rcx
|
|
||||||
- orq %rcx, %rdx
|
|
||||||
-#endif
|
|
||||||
-
|
|
||||||
- /* When no zero byte is found xmm1-3 are zero so we do not have to
|
|
||||||
- zero them. */
|
|
||||||
- PROLOG(loop)
|
|
||||||
-
|
|
||||||
- .p2align 4
|
|
||||||
-L(cross_page):
|
|
||||||
- andq $-64, %rax
|
|
||||||
- FIND_ZERO
|
|
||||||
- PROLOG(loop_init)
|
|
||||||
-
|
|
||||||
-#ifdef AS_STRNLEN
|
|
||||||
-/* We must do this check to correctly handle strnlen (s, -1). */
|
|
||||||
-L(strnlen_ret):
|
|
||||||
- bts %rsi, %rdx
|
|
||||||
- sarq %cl, %rdx
|
|
||||||
- test %rdx, %rdx
|
|
||||||
- je L(loop_init)
|
|
||||||
- bsfq %rdx, %rax
|
|
||||||
- SHIFT_RETURN
|
|
||||||
- ret
|
|
||||||
-#endif
|
|
||||||
- .p2align 4
|
|
||||||
-L(loop_init):
|
|
||||||
- pxor %xmm1, %xmm1
|
|
||||||
- pxor %xmm2, %xmm2
|
|
||||||
- pxor %xmm3, %xmm3
|
|
||||||
-#ifdef AS_STRNLEN
|
|
||||||
- .p2align 4
|
|
||||||
-L(loop):
|
|
||||||
-
|
|
||||||
- addq $64, %rax
|
|
||||||
- cmpq %rax, %r10
|
|
||||||
- je L(exit_end)
|
|
||||||
-
|
|
||||||
- movdqa (%rax), %xmm0
|
|
||||||
- PMINU 16(%rax), %xmm0
|
|
||||||
- PMINU 32(%rax), %xmm0
|
|
||||||
- PMINU 48(%rax), %xmm0
|
|
||||||
- PCMPEQ %xmm3, %xmm0
|
|
||||||
- pmovmskb %xmm0, %edx
|
|
||||||
- testl %edx, %edx
|
|
||||||
- jne L(exit)
|
|
||||||
- jmp L(loop)
|
|
||||||
-
|
|
||||||
- .p2align 4
|
|
||||||
-L(exit_end):
|
|
||||||
- cmp %rax, %r11
|
|
||||||
- je L(first) /* Do not read when end is at page boundary. */
|
|
||||||
- pxor %xmm0, %xmm0
|
|
||||||
- FIND_ZERO
|
|
||||||
-
|
|
||||||
-L(first):
|
|
||||||
- bts %r11, %rdx
|
|
||||||
- bsfq %rdx, %rdx
|
|
||||||
- addq %rdx, %rax
|
|
||||||
- subq %rdi, %rax
|
|
||||||
- SHIFT_RETURN
|
|
||||||
- ret
|
|
||||||
-
|
|
||||||
- .p2align 4
|
|
||||||
-L(exit):
|
|
||||||
- pxor %xmm0, %xmm0
|
|
||||||
- FIND_ZERO
|
|
||||||
-
|
|
||||||
- bsfq %rdx, %rdx
|
|
||||||
- addq %rdx, %rax
|
|
||||||
- subq %rdi, %rax
|
|
||||||
- SHIFT_RETURN
|
|
||||||
- ret
|
|
||||||
-
|
|
||||||
-#else
|
|
||||||
-
|
|
||||||
- /* Main loop. Unrolled twice to improve L2 cache performance on core2. */
|
|
||||||
- .p2align 4
|
|
||||||
-L(loop):
|
|
||||||
-
|
|
||||||
- movdqa 64(%rax), %xmm0
|
|
||||||
- PMINU 80(%rax), %xmm0
|
|
||||||
- PMINU 96(%rax), %xmm0
|
|
||||||
- PMINU 112(%rax), %xmm0
|
|
||||||
- PCMPEQ %xmm3, %xmm0
|
|
||||||
- pmovmskb %xmm0, %edx
|
|
||||||
- testl %edx, %edx
|
|
||||||
- jne L(exit64)
|
|
||||||
-
|
|
||||||
- subq $-128, %rax
|
|
||||||
-
|
|
||||||
- movdqa (%rax), %xmm0
|
|
||||||
- PMINU 16(%rax), %xmm0
|
|
||||||
- PMINU 32(%rax), %xmm0
|
|
||||||
- PMINU 48(%rax), %xmm0
|
|
||||||
- PCMPEQ %xmm3, %xmm0
|
|
||||||
- pmovmskb %xmm0, %edx
|
|
||||||
- testl %edx, %edx
|
|
||||||
- jne L(exit0)
|
|
||||||
- jmp L(loop)
|
|
||||||
-
|
|
||||||
- .p2align 4
|
|
||||||
-L(exit64):
|
|
||||||
- addq $64, %rax
|
|
||||||
-L(exit0):
|
|
||||||
- pxor %xmm0, %xmm0
|
|
||||||
- FIND_ZERO
|
|
||||||
-
|
|
||||||
- bsfq %rdx, %rdx
|
|
||||||
- addq %rdx, %rax
|
|
||||||
- subq %rdi, %rax
|
|
||||||
- SHIFT_RETURN
|
|
||||||
- ret
|
|
||||||
-
|
|
||||||
-#endif
|
|
||||||
-
|
|
||||||
-END(strlen)
|
|
||||||
libc_hidden_builtin_def (strlen)
|
|
||||||
--
|
|
||||||
GitLab
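
[Editor's note: the patch above turns sysdeps/x86_64/strlen.S into a thin
wrapper around the shared body in multiarch/strlen-vec.S, and
wcsnlen-sse4_1.S specializes that same body by defining AS_STRNLEN and
renaming strlen before the include. The C program below is only an
illustrative sketch of that specialize-by-preprocessor idiom; the macro
and function names are hypothetical, not glibc's.]

#include <stddef.h>
#include <stdio.h>

/* One shared "body", expanded by the preprocessor once per variant,
   the way strlen-vec.S is assembled once per wrapper file.  */
#define DEFINE_LEN(NAME) \
  size_t NAME (const char *s) \
  { \
    size_t n = 0; \
    while (s[n] != '\0') \
      n++; \
    return n; \
  }

#define DEFINE_NLEN(NAME) \
  size_t NAME (const char *s, size_t maxlen) \
  { \
    size_t n = 0; \
    while (n < maxlen && s[n] != '\0') \
      n++; \
    return n; \
  }

DEFINE_LEN (my_strlen)   /* plays the role of strlen built from strlen-vec.S */
DEFINE_NLEN (my_strnlen) /* plays the role of __wcsnlen_sse4_1 (AS_STRNLEN set) */

int
main (void)
{
  printf ("%zu %zu\n", my_strlen ("hello"), my_strnlen ("hello", 3));
  return 0;
}

[Prints "5 3"; the point is that one body can serve several entry points
without duplicating the implementation.]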

@ -1,181 +0,0 @@
From 6f573a27b6c8b4236445810a44660612323f5a73 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Wed, 23 Jun 2021 01:19:34 -0400
Subject: [PATCH] x86-64: Add wcslen optimize for sse4.1
Content-type: text/plain; charset=UTF-8

No bug. This commit adds the ifunc / build infrastructure
necessary for wcslen to prefer the sse4.1 implementation
in strlen-vec.S. test-wcslen.c is passing.

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
 sysdeps/x86_64/multiarch/Makefile | 4 +-
 sysdeps/x86_64/multiarch/ifunc-impl-list.c | 3 ++
 sysdeps/x86_64/multiarch/ifunc-wcslen.h | 52 ++++++++++++++++++++++
 sysdeps/x86_64/multiarch/wcslen-sse4_1.S | 4 ++
 sysdeps/x86_64/multiarch/wcslen.c | 2 +-
 sysdeps/x86_64/multiarch/wcsnlen.c | 34 +-------------
 6 files changed, 63 insertions(+), 36 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/ifunc-wcslen.h
 create mode 100644 sysdeps/x86_64/multiarch/wcslen-sse4_1.S

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 491c7698..65fde4eb 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -93,8 +93,8 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
 wcscpy-ssse3 wcscpy-c \
 wcschr-sse2 wcschr-avx2 \
 wcsrchr-sse2 wcsrchr-avx2 \
- wcsnlen-sse4_1 wcsnlen-c \
- wcslen-sse2 wcslen-avx2 wcsnlen-avx2 \
+ wcslen-sse2 wcslen-sse4_1 wcslen-avx2 \
+ wcsnlen-c wcsnlen-sse4_1 wcsnlen-avx2 \
 wcschr-avx2-rtm \
 wcscmp-avx2-rtm \
 wcslen-avx2-rtm \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index f1a6460a..580913ca 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -657,6 +657,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 && CPU_FEATURE_USABLE (AVX512BW)
 && CPU_FEATURE_USABLE (BMI2)),
 __wcslen_evex)
+ IFUNC_IMPL_ADD (array, i, wcsnlen,
+ CPU_FEATURE_USABLE (SSE4_1),
+ __wcsnlen_sse4_1)
 IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_sse2))

 /* Support sysdeps/x86_64/multiarch/wcsnlen.c. */
diff --git a/sysdeps/x86_64/multiarch/ifunc-wcslen.h b/sysdeps/x86_64/multiarch/ifunc-wcslen.h
new file mode 100644
index 00000000..39e33473
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/ifunc-wcslen.h
@@ -0,0 +1,52 @@
+/* Common definition for ifunc selections for wcslen and wcsnlen
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017-2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <init-arch.h>
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+ const struct cpu_features* cpu_features = __get_cpu_features ();
+
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
+ && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+ {
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+ return OPTIMIZE (evex);
+
+ if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+ return OPTIMIZE (avx2_rtm);
+
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ return OPTIMIZE (avx2);
+ }
+
+ if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
+ return OPTIMIZE (sse4_1);
+
+ return OPTIMIZE (sse2);
+}
diff --git a/sysdeps/x86_64/multiarch/wcslen-sse4_1.S b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
new file mode 100644
index 00000000..7e62621a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
@@ -0,0 +1,4 @@
+#define AS_WCSLEN
+#define strlen __wcslen_sse4_1
+
+#include "strlen-vec.S"
diff --git a/sysdeps/x86_64/multiarch/wcslen.c b/sysdeps/x86_64/multiarch/wcslen.c
index 6d06e47c..3b04b75b 100644
--- a/sysdeps/x86_64/multiarch/wcslen.c
+++ b/sysdeps/x86_64/multiarch/wcslen.c
@@ -24,7 +24,7 @@
 # undef __wcslen

 # define SYMBOL_NAME wcslen
-# include "ifunc-avx2.h"
+# include "ifunc-wcslen.h"

 libc_ifunc_redirected (__redirect_wcslen, __wcslen, IFUNC_SELECTOR ());
 weak_alias (__wcslen, wcslen);
diff --git a/sysdeps/x86_64/multiarch/wcsnlen.c b/sysdeps/x86_64/multiarch/wcsnlen.c
index 20b731ae..06736410 100644
--- a/sysdeps/x86_64/multiarch/wcsnlen.c
+++ b/sysdeps/x86_64/multiarch/wcsnlen.c
@@ -24,39 +24,7 @@
 # undef __wcsnlen

 # define SYMBOL_NAME wcsnlen
-# include <init-arch.h>
-
-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
-
-static inline void *
-IFUNC_SELECTOR (void)
-{
- const struct cpu_features* cpu_features = __get_cpu_features ();
-
- if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
- && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
- {
- if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
- && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
- && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
- return OPTIMIZE (evex);
-
- if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
- return OPTIMIZE (avx2_rtm);
-
- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
- return OPTIMIZE (avx2);
- }
-
- if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
- return OPTIMIZE (sse4_1);
-
- return OPTIMIZE (sse2);
-}
+# include "ifunc-wcslen.h"

 libc_ifunc_redirected (__redirect_wcsnlen, __wcsnlen, IFUNC_SELECTOR ());
 weak_alias (__wcsnlen, wcsnlen);
--
GitLab
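
[Editor's note: a minimal sketch of the dispatch shape that ifunc-wcslen.h
implements above; runnable stand-alone. The probe functions and names are
hypothetical — the real selector queries __get_cpu_features () and is
resolved once by the dynamic loader, not on every call.]

#include <stdio.h>
#include <string.h>

/* Hypothetical CPU-feature probes standing in for CPU_FEATURE_USABLE_P.  */
static int has_avx2 (void)   { return 0; }
static int has_sse4_1 (void) { return 0; }

/* Stand-ins for the real SIMD variants; all scalar here.  */
static size_t len_sse2 (const char *s)   { return strlen (s); }
static size_t len_sse4_1 (const char *s) { return strlen (s); }
static size_t len_avx2 (const char *s)   { return strlen (s); }

/* Same ordering as IFUNC_SELECTOR: newest usable extension first,
   baseline SSE2 as the unconditional fallback.  */
static size_t (*select_len (void)) (const char *)
{
  if (has_avx2 ())
    return len_avx2;
  if (has_sse4_1 ())
    return len_sse4_1;
  return len_sse2;
}

int
main (void)
{
  size_t (*len) (const char *) = select_len ();
  printf ("%zu\n", len ("wide"));
  return 0;
}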

@ -1,396 +0,0 @@
From 231c56760c1e2ded21ad96bbb860b1f08c556c7a Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Mon, 21 Jan 2019 11:27:25 -0800
Subject: [PATCH] x86-64 memcpy: Properly handle the length parameter [BZ#
 24097]
Content-type: text/plain; charset=UTF-8

On x32, the size_t parameter may be passed in the lower 32 bits of a
64-bit register with the non-zero upper 32 bits. The string/memory
functions written in assembly can only use the lower 32 bits of a
64-bit register as length or must clear the upper 32 bits before using
the full 64-bit register for length.

This patch fixes memcpy for x32. Tested on x86-64 and x32. On x86-64,
libc.so is the same with and without the fix.

 [BZ# 24097]
 CVE-2019-6488
 * sysdeps/x86_64/multiarch/memcpy-ssse3-back.S: Use RDX_LP for
 length. Clear the upper 32 bits of RDX register.
 * sysdeps/x86_64/multiarch/memcpy-ssse3.S: Likewise.
 * sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S:
 Likewise.
 * sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S:
 Likewise.
 * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcpy.
 tst-size_t-wmemchr.
 * sysdeps/x86_64/x32/tst-size_t-memcpy.c: New file.
---
 sysdeps/x86_64/multiarch/memcpy-ssse3-back.S | 17 ++++--
 sysdeps/x86_64/multiarch/memcpy-ssse3.S | 17 ++++--
 .../multiarch/memmove-avx512-no-vzeroupper.S | 16 +++--
 .../multiarch/memmove-vec-unaligned-erms.S | 54 +++++++++--------
 sysdeps/x86_64/x32/Makefile | 2 +-
 sysdeps/x86_64/x32/tst-size_t-memcpy.c | 58 +++++++++++++++++++
 6 files changed, 122 insertions(+), 42 deletions(-)
 create mode 100644 sysdeps/x86_64/x32/tst-size_t-memcpy.c

Conflicts:
	ChangeLog
	  (removed)

diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
index 3cd11233..568eebd3 100644
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
@@ -45,28 +45,33 @@
 .section .text.ssse3,"ax",@progbits
 #if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
 ENTRY (MEMPCPY_CHK)
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
 jb HIDDEN_JUMPTARGET (__chk_fail)
 END (MEMPCPY_CHK)

 ENTRY (MEMPCPY)
- movq %rdi, %rax
- addq %rdx, %rax
+ mov %RDI_LP, %RAX_LP
+ add %RDX_LP, %RAX_LP
 jmp L(start)
 END (MEMPCPY)
 #endif

 #if !defined USE_AS_BCOPY
 ENTRY (MEMCPY_CHK)
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
 jb HIDDEN_JUMPTARGET (__chk_fail)
 END (MEMCPY_CHK)
 #endif

 ENTRY (MEMCPY)
- mov %rdi, %rax
+ mov %RDI_LP, %RAX_LP
 #ifdef USE_AS_MEMPCPY
- add %rdx, %rax
+ add %RDX_LP, %RAX_LP
+#endif
+
+#ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+ mov %edx, %edx
 #endif

 #ifdef USE_AS_MEMMOVE
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
index 0240bfa3..0bd5ee99 100644
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
@@ -45,28 +45,33 @@
 .section .text.ssse3,"ax",@progbits
 #if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
 ENTRY (MEMPCPY_CHK)
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
 jb HIDDEN_JUMPTARGET (__chk_fail)
 END (MEMPCPY_CHK)

 ENTRY (MEMPCPY)
- movq %rdi, %rax
- addq %rdx, %rax
+ mov %RDI_LP, %RAX_LP
+ add %RDX_LP, %RAX_LP
 jmp L(start)
 END (MEMPCPY)
 #endif

 #if !defined USE_AS_BCOPY
 ENTRY (MEMCPY_CHK)
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
 jb HIDDEN_JUMPTARGET (__chk_fail)
 END (MEMCPY_CHK)
 #endif

 ENTRY (MEMCPY)
- mov %rdi, %rax
+ mov %RDI_LP, %RAX_LP
 #ifdef USE_AS_MEMPCPY
- add %rdx, %rax
+ add %RDX_LP, %RAX_LP
+#endif
+
+#ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+ mov %edx, %edx
 #endif

 #ifdef USE_AS_MEMMOVE
diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
index effc3ac2..6ca2bbc9 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
@@ -24,27 +24,31 @@

 .section .text.avx512,"ax",@progbits
 ENTRY (__mempcpy_chk_avx512_no_vzeroupper)
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
 jb HIDDEN_JUMPTARGET (__chk_fail)
 END (__mempcpy_chk_avx512_no_vzeroupper)

 ENTRY (__mempcpy_avx512_no_vzeroupper)
- movq %rdi, %rax
- addq %rdx, %rax
+ mov %RDI_LP, %RAX_LP
+ add %RDX_LP, %RAX_LP
 jmp L(start)
 END (__mempcpy_avx512_no_vzeroupper)

 ENTRY (__memmove_chk_avx512_no_vzeroupper)
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
 jb HIDDEN_JUMPTARGET (__chk_fail)
 END (__memmove_chk_avx512_no_vzeroupper)

 ENTRY (__memmove_avx512_no_vzeroupper)
- mov %rdi, %rax
+ mov %RDI_LP, %RAX_LP
 # ifdef USE_AS_MEMPCPY
- add %rdx, %rax
+ add %RDX_LP, %RAX_LP
 # endif
 L(start):
+# ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+ mov %edx, %edx
+# endif
 lea (%rsi, %rdx), %rcx
 lea (%rdi, %rdx), %r9
 cmp $512, %rdx
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index c952576c..274aa1c7 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -95,20 +95,20 @@
 .section SECTION(.text),"ax",@progbits
 #if defined SHARED && IS_IN (libc)
 ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
 jb HIDDEN_JUMPTARGET (__chk_fail)
 END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
 #endif

 ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
- movq %rdi, %rax
- addq %rdx, %rax
+ mov %RDI_LP, %RAX_LP
+ add %RDX_LP, %RAX_LP
 jmp L(start)
 END (MEMPCPY_SYMBOL (__mempcpy, unaligned))

 #if defined SHARED && IS_IN (libc)
 ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
 jb HIDDEN_JUMPTARGET (__chk_fail)
 END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
 #endif
@@ -116,9 +116,13 @@ END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
 ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
 movq %rdi, %rax
L(start):
- cmpq $VEC_SIZE, %rdx
+# ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+ movl %edx, %edx
+# endif
+ cmp $VEC_SIZE, %RDX_LP
 jb L(less_vec)
- cmpq $(VEC_SIZE * 2), %rdx
+ cmp $(VEC_SIZE * 2), %RDX_LP
 ja L(more_2x_vec)
 #if !defined USE_MULTIARCH || !IS_IN (libc)
 L(last_2x_vec):
@@ -138,38 +142,38 @@ END (MEMMOVE_SYMBOL (__memmove, unaligned))

 # if VEC_SIZE == 16
 ENTRY (__mempcpy_chk_erms)
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
 jb HIDDEN_JUMPTARGET (__chk_fail)
 END (__mempcpy_chk_erms)

 /* Only used to measure performance of REP MOVSB. */
 ENTRY (__mempcpy_erms)
- movq %rdi, %rax
+ mov %RDI_LP, %RAX_LP
 /* Skip zero length. */
- testq %rdx, %rdx
+ test %RDX_LP, %RDX_LP
 jz 2f
- addq %rdx, %rax
+ add %RDX_LP, %RAX_LP
 jmp L(start_movsb)
 END (__mempcpy_erms)

 ENTRY (__memmove_chk_erms)
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
 jb HIDDEN_JUMPTARGET (__chk_fail)
 END (__memmove_chk_erms)

 ENTRY (__memmove_erms)
 movq %rdi, %rax
 /* Skip zero length. */
- testq %rdx, %rdx
+ test %RDX_LP, %RDX_LP
 jz 2f
 L(start_movsb):
- movq %rdx, %rcx
- cmpq %rsi, %rdi
+ mov %RDX_LP, %RCX_LP
+ cmp %RSI_LP, %RDI_LP
 jb 1f
 /* Source == destination is less common. */
 je 2f
- leaq (%rsi,%rcx), %rdx
- cmpq %rdx, %rdi
+ lea (%rsi,%rcx), %RDX_LP
+ cmp %RDX_LP, %RDI_LP
 jb L(movsb_backward)
 1:
 rep movsb
@@ -189,20 +193,20 @@ strong_alias (__memmove_chk_erms, __memcpy_chk_erms)

 # ifdef SHARED
 ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
 jb HIDDEN_JUMPTARGET (__chk_fail)
 END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
 # endif

 ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
- movq %rdi, %rax
- addq %rdx, %rax
+ mov %RDI_LP, %RAX_LP
+ add %RDX_LP, %RAX_LP
 jmp L(start_erms)
 END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))

 # ifdef SHARED
 ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
 jb HIDDEN_JUMPTARGET (__chk_fail)
 END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
 # endif
@@ -210,9 +214,13 @@ END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
 ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
 movq %rdi, %rax
L(start_erms):
- cmpq $VEC_SIZE, %rdx
+# ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+ movl %edx, %edx
+# endif
+ cmp $VEC_SIZE, %RDX_LP
 jb L(less_vec)
- cmpq $(VEC_SIZE * 2), %rdx
+ cmp $(VEC_SIZE * 2), %RDX_LP
 ja L(movsb_more_2x_vec)
L(last_2x_vec):
 /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
@@ -236,7 +244,7 @@ L(movsb):
 /* Avoid slow backward REP MOVSB. */
 jb L(more_8x_vec_backward)
 1:
- movq %rdx, %rcx
+ mov %RDX_LP, %RCX_LP
 rep movsb
L(nop):
 ret
diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
index ddec7f04..2fe1e5ac 100644
--- a/sysdeps/x86_64/x32/Makefile
+++ b/sysdeps/x86_64/x32/Makefile
@@ -6,7 +6,7 @@ CFLAGS-s_llround.c += -fno-builtin-lround
 endif

 ifeq ($(subdir),string)
-tests += tst-size_t-memchr tst-size_t-memcmp
+tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy
 endif

 ifeq ($(subdir),wcsmbs)
diff --git a/sysdeps/x86_64/x32/tst-size_t-memcpy.c b/sysdeps/x86_64/x32/tst-size_t-memcpy.c
new file mode 100644
index 00000000..66b71e17
--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-memcpy.c
@@ -0,0 +1,58 @@
+/* Test memcpy with size_t in the lower 32 bits of 64-bit register.
+ Copyright (C) 2019 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define TEST_NAME "memcpy"
+#include "test-size_t.h"
+
+IMPL (memcpy, 1)
+
+typedef void *(*proto_t) (void *, const void *, size_t);
+
+static void *
+__attribute__ ((noinline, noclone))
+do_memcpy (parameter_t a, parameter_t b)
+{
+ return CALL (&b, a.p, b.p, a.len);
+}
+
+static int
+test_main (void)
+{
+ test_init ();
+
+ parameter_t dest = { { page_size }, buf1 };
+ parameter_t src = { { 0 }, buf2 };
+
+ int ret = 0;
+ FOR_EACH_IMPL (impl, 0)
+ {
+ src.fn = impl->fn;
+ do_memcpy (dest, src);
+ int res = memcmp (dest.p, src.p, dest.len);
+ if (res)
+ {
+ error (0, 0, "Wrong result in function %s: %i != 0",
+ impl->name, res);
+ ret = 1;
+ }
+ }
+
+ return ret ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#include <support/test-driver.c>
--
GitLab
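
[Editor's note: the sketch below is editorial and illustrates the one fact
the patch above leans on — on x86-64, writing a 32-bit register (as in
"mov %edx, %edx") zero-extends into the full 64-bit register, discarding
whatever garbage an x32 caller left above bit 31. The values are made up.]

#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  /* Pretend %rdx arrived with garbage above bit 31; the low 32 bits
     carry the real x32 size_t value (7 here).  */
  uint64_t rdx = 0xdeadbeef00000007ULL;

  /* C equivalent of "mov %edx, %edx": keep only the low 32 bits.  */
  uint64_t len = (uint32_t) rdx;

  printf ("%llu\n", (unsigned long long) len); /* prints 7 */
  return 0;
}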

@ -1,497 +0,0 @@
From a775a7a3eb1e85b54af0b4ee5ff4dcf66772a1fb Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Wed, 23 Jun 2021 01:56:29 -0400
Subject: [PATCH] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 [BZ
 #27974]
Content-type: text/plain; charset=UTF-8

This commit fixes the bug mentioned in the previous commit.

The previous implementations of wmemchr in these files relied
on maxlen * sizeof(wchar_t) which was not guaranteed by the standard.

The new overflow tests added in the previous commit now
pass (as well as all the other tests).

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
 sysdeps/x86_64/multiarch/strlen-avx2.S | 130 ++++++++++++++++++-------
 sysdeps/x86_64/multiarch/strlen-vec.S | 15 ++-
 2 files changed, 107 insertions(+), 38 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
index be8a5db5..37688966 100644
--- a/sysdeps/x86_64/multiarch/strlen-avx2.S
+++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
@@ -44,21 +44,21 @@

 # define VEC_SIZE 32
 # define PAGE_SIZE 4096
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)

 .section SECTION(.text),"ax",@progbits
 ENTRY (STRLEN)
 # ifdef USE_AS_STRNLEN
 /* Check zero length. */
+# ifdef __ILP32__
+ /* Clear upper bits. */
+ and %RSI_LP, %RSI_LP
+# else
 test %RSI_LP, %RSI_LP
+# endif
 jz L(zero)
 /* Store max len in R8_LP before adjusting if using WCSLEN. */
 mov %RSI_LP, %R8_LP
-# ifdef USE_AS_WCSLEN
- shl $2, %RSI_LP
-# elif defined __ILP32__
- /* Clear the upper 32 bits. */
- movl %esi, %esi
-# endif
 # endif
 movl %edi, %eax
 movq %rdi, %rdx
@@ -72,10 +72,10 @@ ENTRY (STRLEN)

 /* Check the first VEC_SIZE bytes. */
 VPCMPEQ (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
 # ifdef USE_AS_STRNLEN
 /* If length < VEC_SIZE handle special. */
- cmpq $VEC_SIZE, %rsi
+ cmpq $CHAR_PER_VEC, %rsi
 jbe L(first_vec_x0)
 # endif
 /* If empty continue to aligned_more. Otherwise return bit
@@ -84,6 +84,7 @@ ENTRY (STRLEN)
 jz L(aligned_more)
 tzcntl %eax, %eax
 # ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
 shrl $2, %eax
 # endif
 VZEROUPPER_RETURN
@@ -97,9 +98,14 @@ L(zero):
L(first_vec_x0):
 /* Set bit for max len so that tzcnt will return min of max len
 and position of first match. */
+# ifdef USE_AS_WCSLEN
+ /* NB: Multiply length by 4 to get byte count. */
+ sall $2, %esi
+# endif
 btsq %rsi, %rax
 tzcntl %eax, %eax
 # ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
 shrl $2, %eax
 # endif
 VZEROUPPER_RETURN
@@ -113,14 +119,19 @@ L(first_vec_x1):
 # ifdef USE_AS_STRNLEN
 /* Use ecx which was computed earlier to compute correct value.
 */
+# ifdef USE_AS_WCSLEN
+ leal -(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax
+# else
 subl $(VEC_SIZE * 4 + 1), %ecx
 addl %ecx, %eax
+# endif
 # else
 subl %edx, %edi
 incl %edi
 addl %edi, %eax
 # endif
 # ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
 shrl $2, %eax
 # endif
 VZEROUPPER_RETURN
@@ -133,14 +144,19 @@ L(first_vec_x2):
 # ifdef USE_AS_STRNLEN
 /* Use ecx which was computed earlier to compute correct value.
 */
+# ifdef USE_AS_WCSLEN
+ leal -(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax
+# else
 subl $(VEC_SIZE * 3 + 1), %ecx
 addl %ecx, %eax
+# endif
 # else
 subl %edx, %edi
 addl $(VEC_SIZE + 1), %edi
 addl %edi, %eax
 # endif
 # ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
 shrl $2, %eax
 # endif
 VZEROUPPER_RETURN
@@ -153,14 +169,19 @@ L(first_vec_x3):
 # ifdef USE_AS_STRNLEN
 /* Use ecx which was computed earlier to compute correct value.
 */
+# ifdef USE_AS_WCSLEN
+ leal -(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax
+# else
 subl $(VEC_SIZE * 2 + 1), %ecx
 addl %ecx, %eax
+# endif
 # else
 subl %edx, %edi
 addl $(VEC_SIZE * 2 + 1), %edi
 addl %edi, %eax
 # endif
 # ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
 shrl $2, %eax
 # endif
 VZEROUPPER_RETURN
@@ -173,14 +194,19 @@ L(first_vec_x4):
 # ifdef USE_AS_STRNLEN
 /* Use ecx which was computed earlier to compute correct value.
 */
+# ifdef USE_AS_WCSLEN
+ leal -(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax
+# else
 subl $(VEC_SIZE + 1), %ecx
 addl %ecx, %eax
+# endif
 # else
 subl %edx, %edi
 addl $(VEC_SIZE * 3 + 1), %edi
 addl %edi, %eax
 # endif
 # ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
 shrl $2, %eax
 # endif
 VZEROUPPER_RETURN
@@ -195,10 +221,14 @@ L(cross_page_continue):
 /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
 since data is only aligned to VEC_SIZE. */
 # ifdef USE_AS_STRNLEN
- /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
- it simplies the logic in last_4x_vec_or_less. */
+ /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
+ because it simplies the logic in last_4x_vec_or_less. */
 leaq (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
 subq %rdx, %rcx
+# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %ecx
+# endif
 # endif
 /* Load first VEC regardless. */
 VPCMPEQ 1(%rdi), %ymm0, %ymm1
@@ -207,34 +237,38 @@ L(cross_page_continue):
 subq %rcx, %rsi
 jb L(last_4x_vec_or_less)
 # endif
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
 testl %eax, %eax
 jnz L(first_vec_x1)

 VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
 testl %eax, %eax
 jnz L(first_vec_x2)

 VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
 testl %eax, %eax
 jnz L(first_vec_x3)

 VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
 testl %eax, %eax
 jnz L(first_vec_x4)

 /* Align data to VEC_SIZE * 4 - 1. */
 # ifdef USE_AS_STRNLEN
 /* Before adjusting length check if at last VEC_SIZE * 4. */
- cmpq $(VEC_SIZE * 4 - 1), %rsi
+ cmpq $(CHAR_PER_VEC * 4 - 1), %rsi
 jbe L(last_4x_vec_or_less_load)
 incq %rdi
 movl %edi, %ecx
 orq $(VEC_SIZE * 4 - 1), %rdi
 andl $(VEC_SIZE * 4 - 1), %ecx
+# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %ecx
+# endif
 /* Readjust length. */
 addq %rcx, %rsi
 # else
@@ -246,13 +280,13 @@ L(cross_page_continue):
L(loop_4x_vec):
 # ifdef USE_AS_STRNLEN
 /* Break if at end of length. */
- subq $(VEC_SIZE * 4), %rsi
+ subq $(CHAR_PER_VEC * 4), %rsi
 jb L(last_4x_vec_or_less_cmpeq)
 # endif
- /* Save some code size by microfusing VPMINU with the load. Since
- the matches in ymm2/ymm4 can only be returned if there where no
- matches in ymm1/ymm3 respectively there is no issue with overlap.
- */
+ /* Save some code size by microfusing VPMINU with the load.
+ Since the matches in ymm2/ymm4 can only be returned if there
+ where no matches in ymm1/ymm3 respectively there is no issue
+ with overlap. */
 vmovdqa 1(%rdi), %ymm1
 VPMINU (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
 vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3
@@ -260,7 +294,7 @@ L(loop_4x_vec):

 VPMINU %ymm2, %ymm4, %ymm5
 VPCMPEQ %ymm5, %ymm0, %ymm5
- vpmovmskb %ymm5, %ecx
+ vpmovmskb %ymm5, %ecx

 subq $-(VEC_SIZE * 4), %rdi
 testl %ecx, %ecx
@@ -268,27 +302,28 @@ L(loop_4x_vec):


 VPCMPEQ %ymm1, %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
 subq %rdx, %rdi
 testl %eax, %eax
 jnz L(last_vec_return_x0)

 VPCMPEQ %ymm2, %ymm0, %ymm2
- vpmovmskb %ymm2, %eax
+ vpmovmskb %ymm2, %eax
 testl %eax, %eax
 jnz L(last_vec_return_x1)

 /* Combine last 2 VEC. */
 VPCMPEQ %ymm3, %ymm0, %ymm3
- vpmovmskb %ymm3, %eax
- /* rcx has combined result from all 4 VEC. It will only be used if
- the first 3 other VEC all did not contain a match. */
+ vpmovmskb %ymm3, %eax
+ /* rcx has combined result from all 4 VEC. It will only be used
+ if the first 3 other VEC all did not contain a match. */
 salq $32, %rcx
 orq %rcx, %rax
 tzcntq %rax, %rax
 subq $(VEC_SIZE * 2 - 1), %rdi
 addq %rdi, %rax
 # ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
 shrq $2, %rax
 # endif
 VZEROUPPER_RETURN
@@ -297,15 +332,19 @@ L(loop_4x_vec):
 # ifdef USE_AS_STRNLEN
 .p2align 4
L(last_4x_vec_or_less_load):
- /* Depending on entry adjust rdi / prepare first VEC in ymm1. */
+ /* Depending on entry adjust rdi / prepare first VEC in ymm1.
+ */
 subq $-(VEC_SIZE * 4), %rdi
L(last_4x_vec_or_less_cmpeq):
 VPCMPEQ 1(%rdi), %ymm0, %ymm1
L(last_4x_vec_or_less):
-
- vpmovmskb %ymm1, %eax
- /* If remaining length > VEC_SIZE * 2. This works if esi is off by
- VEC_SIZE * 4. */
+# ifdef USE_AS_WCSLEN
+ /* NB: Multiply length by 4 to get byte count. */
+ sall $2, %esi
+# endif
+ vpmovmskb %ymm1, %eax
+ /* If remaining length > VEC_SIZE * 2. This works if esi is off
+ by VEC_SIZE * 4. */
 testl $(VEC_SIZE * 2), %esi
 jnz L(last_4x_vec)

@@ -320,7 +359,7 @@ L(last_4x_vec_or_less):
 jb L(max)

 VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
 tzcntl %eax, %eax
 /* Check the end of data. */
 cmpl %eax, %esi
@@ -329,6 +368,7 @@ L(last_4x_vec_or_less):
 addl $(VEC_SIZE + 1), %eax
 addq %rdi, %rax
 # ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
 shrq $2, %rax
 # endif
 VZEROUPPER_RETURN
@@ -340,6 +380,7 @@ L(last_vec_return_x0):
 subq $(VEC_SIZE * 4 - 1), %rdi
 addq %rdi, %rax
 # ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
 shrq $2, %rax
 # endif
 VZEROUPPER_RETURN
@@ -350,6 +391,7 @@ L(last_vec_return_x1):
 subq $(VEC_SIZE * 3 - 1), %rdi
 addq %rdi, %rax
 # ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
 shrq $2, %rax
 # endif
 VZEROUPPER_RETURN
@@ -366,6 +408,7 @@ L(last_vec_x1_check):
 incl %eax
 addq %rdi, %rax
 # ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
 shrq $2, %rax
 # endif
 VZEROUPPER_RETURN
@@ -381,14 +424,14 @@ L(last_4x_vec):
 jnz L(last_vec_x1)

 VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
 testl %eax, %eax
 jnz L(last_vec_x2)

 /* Normalize length. */
 andl $(VEC_SIZE * 4 - 1), %esi
 VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
 testl %eax, %eax
 jnz L(last_vec_x3)

@@ -396,7 +439,7 @@ L(last_4x_vec):
 jb L(max)

 VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
 tzcntl %eax, %eax
 /* Check the end of data. */
 cmpl %eax, %esi
@@ -405,6 +448,7 @@ L(last_4x_vec):
 addl $(VEC_SIZE * 3 + 1), %eax
 addq %rdi, %rax
 # ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
 shrq $2, %rax
 # endif
 VZEROUPPER_RETURN
@@ -419,6 +463,7 @@ L(last_vec_x1):
 incl %eax
 addq %rdi, %rax
 # ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
 shrq $2, %rax
 # endif
 VZEROUPPER_RETURN
@@ -432,6 +477,7 @@ L(last_vec_x2):
 addl $(VEC_SIZE + 1), %eax
 addq %rdi, %rax
 # ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
 shrq $2, %rax
 # endif
 VZEROUPPER_RETURN
@@ -447,6 +493,7 @@ L(last_vec_x3):
 addl $(VEC_SIZE * 2 + 1), %eax
 addq %rdi, %rax
 # ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
 shrq $2, %rax
 # endif
 VZEROUPPER_RETURN
@@ -455,13 +502,13 @@ L(max_end):
 VZEROUPPER_RETURN
 # endif

- /* Cold case for crossing page with first load. */
+ /* Cold case for crossing page with first load. */
 .p2align 4
L(cross_page_boundary):
 /* Align data to VEC_SIZE - 1. */
 orq $(VEC_SIZE - 1), %rdi
 VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
 /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
 so no need to manually mod rdx. */
 sarxl %edx, %eax, %eax
@@ -470,6 +517,10 @@ L(cross_page_boundary):
 jnz L(cross_page_less_vec)
 leaq 1(%rdi), %rcx
 subq %rdx, %rcx
+# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
+ shrl $2, %ecx
+# endif
 /* Check length. */
 cmpq %rsi, %rcx
 jb L(cross_page_continue)
@@ -479,6 +530,7 @@ L(cross_page_boundary):
 jz L(cross_page_continue)
 tzcntl %eax, %eax
 # ifdef USE_AS_WCSLEN
+ /* NB: Divide length by 4 to get wchar_t count. */
 shrl $2, %eax
 # endif
 # endif
@@ -489,6 +541,10 @@ L(return_vzeroupper):
 .p2align 4
L(cross_page_less_vec):
 tzcntl %eax, %eax
+# ifdef USE_AS_WCSLEN
+ /* NB: Multiply length by 4 to get byte count. */
+ sall $2, %esi
+# endif
 cmpq %rax, %rsi
 cmovb %esi, %eax
 # ifdef USE_AS_WCSLEN
diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
index 8f660bb9..439e486a 100644
--- a/sysdeps/x86_64/multiarch/strlen-vec.S
+++ b/sysdeps/x86_64/multiarch/strlen-vec.S
@@ -65,12 +65,25 @@ ENTRY(strlen)
 ret
L(n_nonzero):
# ifdef AS_WCSLEN
- shl $2, %RSI_LP
+/* Check for overflow from maxlen * sizeof(wchar_t). If it would
+ overflow the only way this program doesn't have undefined behavior
+ is if there is a null terminator in valid memory so wcslen will
+ suffice. */
+ mov %RSI_LP, %R10_LP
+ sar $62, %R10_LP
+ test %R10_LP, %R10_LP
+ jnz __wcslen_sse4_1
+ sal $2, %RSI_LP
# endif

+
/* Initialize long lived registers. */

 add %RDI_LP, %RSI_LP
+# ifdef AS_WCSLEN
+/* Check for overflow again from s + maxlen * sizeof(wchar_t). */
+ jbe __wcslen_sse4_1
+# endif
 mov %RSI_LP, %R10_LP
 and $-64, %R10_LP
 mov %RSI_LP, %R11_LP
--
GitLab
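
[Editor's note: a small C rendering of the guard the patch above adds to
strlen-vec.S — assuming a 64-bit size_t as on x86-64, if either of the top
two bits of maxlen is set then maxlen * sizeof (wchar_t), i.e. maxlen << 2,
would wrap, so the assembly branches to plain __wcslen_sse4_1 instead.
Sketch only, not glibc code.]

#include <stdint.h>
#include <stdio.h>

/* Same predicate as the "sar $62; test; jnz" sequence in the patch.  */
static int
shift_would_overflow (uint64_t maxlen)
{
  return (maxlen >> 62) != 0;
}

int
main (void)
{
  printf ("%d\n", shift_would_overflow (UINT64_MAX)); /* 1: fall back */
  printf ("%d\n", shift_would_overflow (100));        /* 0: shift is safe */
  return 0;
}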

@ -1,745 +0,0 @@
From 4ba65586847751372520a36757c17f114588794e Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon, 19 Apr 2021 19:36:06 -0400
Subject: [PATCH] x86: Optimize strlen-evex.S
Content-type: text/plain; charset=UTF-8

No bug. This commit optimizes strlen-evex.S. The
optimizations are mostly small things but they add up to roughly
10-30% performance improvement for strlen. The results for strnlen are
a bit more ambiguous. test-strlen, test-strnlen, test-wcslen, and
test-wcsnlen are all passing.

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
 sysdeps/x86_64/multiarch/strlen-evex.S | 581 ++++++++++++++-----------
 1 file changed, 317 insertions(+), 264 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S
index 05838190..4bf6874b 100644
--- a/sysdeps/x86_64/multiarch/strlen-evex.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex.S
@@ -29,11 +29,13 @@
 # ifdef USE_AS_WCSLEN
 # define VPCMP vpcmpd
 # define VPMINU vpminud
-# define SHIFT_REG r9d
+# define SHIFT_REG ecx
+# define CHAR_SIZE 4
 # else
 # define VPCMP vpcmpb
 # define VPMINU vpminub
-# define SHIFT_REG ecx
+# define SHIFT_REG edx
+# define CHAR_SIZE 1
 # endif

 # define XMMZERO xmm16
@@ -46,132 +48,165 @@
 # define YMM6 ymm22

 # define VEC_SIZE 32
+# define PAGE_SIZE 4096
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)

 .section .text.evex,"ax",@progbits
ENTRY (STRLEN)
 # ifdef USE_AS_STRNLEN
- /* Check for zero length. */
+ /* Check zero length. */
 test %RSI_LP, %RSI_LP
 jz L(zero)
-# ifdef USE_AS_WCSLEN
- shl $2, %RSI_LP
-# elif defined __ILP32__
+# ifdef __ILP32__
 /* Clear the upper 32 bits. */
 movl %esi, %esi
 # endif
 mov %RSI_LP, %R8_LP
 # endif
- movl %edi, %ecx
- movq %rdi, %rdx
+ movl %edi, %eax
 vpxorq %XMMZERO, %XMMZERO, %XMMZERO
-
+ /* Clear high bits from edi. Only keeping bits relevant to page
+ cross check. */
+ andl $(PAGE_SIZE - 1), %eax
 /* Check if we may cross page boundary with one vector load. */
- andl $(2 * VEC_SIZE - 1), %ecx
- cmpl $VEC_SIZE, %ecx
- ja L(cros_page_boundary)
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ ja L(cross_page_boundary)

 /* Check the first VEC_SIZE bytes. Each bit in K0 represents a
 null byte. */
 VPCMP $0, (%rdi), %YMMZERO, %k0
 kmovd %k0, %eax
- testl %eax, %eax
-
 # ifdef USE_AS_STRNLEN
- jnz L(first_vec_x0_check)
- /* Adjust length and check the end of data. */
- subq $VEC_SIZE, %rsi
- jbe L(max)
-# else
- jnz L(first_vec_x0)
+ /* If length < CHAR_PER_VEC handle special. */
+ cmpq $CHAR_PER_VEC, %rsi
+ jbe L(first_vec_x0)
 # endif
-
- /* Align data for aligned loads in the loop. */
- addq $VEC_SIZE, %rdi
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
-
+ testl %eax, %eax
+ jz L(aligned_more)
+ tzcntl %eax, %eax
+ ret
 # ifdef USE_AS_STRNLEN
- /* Adjust length. */
- addq %rcx, %rsi
+L(zero):
+ xorl %eax, %eax
+ ret

- subq $(VEC_SIZE * 4), %rsi
- jbe L(last_4x_vec_or_less)
+ .p2align 4
+L(first_vec_x0):
+ /* Set bit for max len so that tzcnt will return min of max len
+ and position of first match. */
+ btsq %rsi, %rax
+ tzcntl %eax, %eax
+ ret
 # endif
- jmp L(more_4x_vec)

 .p2align 4
-L(cros_page_boundary):
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
-
-# ifdef USE_AS_WCSLEN
- /* NB: Divide shift count by 4 since each bit in K0 represent 4
- bytes. */
- movl %ecx, %SHIFT_REG
- sarl $2, %SHIFT_REG
+L(first_vec_x1):
+ tzcntl %eax, %eax
+ /* Safe to use 32 bit instructions as these are only called for
+ size = [1, 159]. */
+# ifdef USE_AS_STRNLEN
+ /* Use ecx which was computed earlier to compute correct value.
+ */
+ leal -(CHAR_PER_VEC * 4 + 1)(%rcx, %rax), %eax
+# else
+ subl %edx, %edi
+# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %edi
+# endif
+ leal CHAR_PER_VEC(%rdi, %rax), %eax
 # endif
- VPCMP $0, (%rdi), %YMMZERO, %k0
- kmovd %k0, %eax
+ ret

- /* Remove the leading bytes. */
- sarxl %SHIFT_REG, %eax, %eax
- testl %eax, %eax
- jz L(aligned_more)
+ .p2align 4
+L(first_vec_x2):
 tzcntl %eax, %eax
-# ifdef USE_AS_WCSLEN
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
-# endif
+ /* Safe to use 32 bit instructions as these are only called for
+ size = [1, 159]. */
 # ifdef USE_AS_STRNLEN
- /* Check the end of data. */
- cmpq %rax, %rsi
- jbe L(max)
-# endif
- addq %rdi, %rax
- addq %rcx, %rax
- subq %rdx, %rax
-# ifdef USE_AS_WCSLEN
- shrq $2, %rax
+ /* Use ecx which was computed earlier to compute correct value.
+ */
+ leal -(CHAR_PER_VEC * 3 + 1)(%rcx, %rax), %eax
+# else
+ subl %edx, %edi
+# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %edi
+# endif
+ leal (CHAR_PER_VEC * 2)(%rdi, %rax), %eax
 # endif
 ret

 .p2align 4
-L(aligned_more):
+L(first_vec_x3):
+ tzcntl %eax, %eax
+ /* Safe to use 32 bit instructions as these are only called for
+ size = [1, 159]. */
 # ifdef USE_AS_STRNLEN
- /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE"
- with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE"
- to void possible addition overflow. */
- negq %rcx
- addq $VEC_SIZE, %rcx
-
- /* Check the end of data. */
- subq %rcx, %rsi
- jbe L(max)
+ /* Use ecx which was computed earlier to compute correct value.
+ */
+ leal -(CHAR_PER_VEC * 2 + 1)(%rcx, %rax), %eax
+# else
+ subl %edx, %edi
+# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %edi
+# endif
+ leal (CHAR_PER_VEC * 3)(%rdi, %rax), %eax
 # endif
+ ret

- addq $VEC_SIZE, %rdi
-
+ .p2align 4
+L(first_vec_x4):
+ tzcntl %eax, %eax
+ /* Safe to use 32 bit instructions as these are only called for
+ size = [1, 159]. */
 # ifdef USE_AS_STRNLEN
- subq $(VEC_SIZE * 4), %rsi
- jbe L(last_4x_vec_or_less)
+ /* Use ecx which was computed earlier to compute correct value.
+ */
+ leal -(CHAR_PER_VEC + 1)(%rcx, %rax), %eax
+# else
+ subl %edx, %edi
+# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %edi
+# endif
+ leal (CHAR_PER_VEC * 4)(%rdi, %rax), %eax
 # endif
+ ret

-L(more_4x_vec):
+ .p2align 5
+L(aligned_more):
+ movq %rdi, %rdx
+ /* Align data to VEC_SIZE. */
+ andq $-(VEC_SIZE), %rdi
+L(cross_page_continue):
 /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
 since data is only aligned to VEC_SIZE. */
- VPCMP $0, (%rdi), %YMMZERO, %k0
- kmovd %k0, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
-
+# ifdef USE_AS_STRNLEN
+ /* + CHAR_SIZE because it simplies the logic in
+ last_4x_vec_or_less. */
+ leaq (VEC_SIZE * 5 + CHAR_SIZE)(%rdi), %rcx
+ subq %rdx, %rcx
+# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %ecx
+# endif
+# endif
+ /* Load first VEC regardless. */
 VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0
+# ifdef USE_AS_STRNLEN
+ /* Adjust length. If near end handle specially. */
+ subq %rcx, %rsi
+ jb L(last_4x_vec_or_less)
+# endif
 kmovd %k0, %eax
 testl %eax, %eax
 jnz L(first_vec_x1)

 VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
|
|
||||||
kmovd %k0, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
+ test %eax, %eax
|
|
||||||
jnz L(first_vec_x2)
|
|
||||||
|
|
||||||
VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
|
|
||||||
@@ -179,258 +214,276 @@ L(more_4x_vec):
|
|
||||||
testl %eax, %eax
|
|
||||||
jnz L(first_vec_x3)
|
|
||||||
|
|
||||||
- addq $(VEC_SIZE * 4), %rdi
|
|
||||||
-
|
|
||||||
-# ifdef USE_AS_STRNLEN
|
|
||||||
- subq $(VEC_SIZE * 4), %rsi
|
|
||||||
- jbe L(last_4x_vec_or_less)
|
|
||||||
-# endif
|
|
||||||
-
|
|
||||||
- /* Align data to 4 * VEC_SIZE. */
|
|
||||||
- movq %rdi, %rcx
|
|
||||||
- andl $(4 * VEC_SIZE - 1), %ecx
|
|
||||||
- andq $-(4 * VEC_SIZE), %rdi
|
|
||||||
+ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
|
|
||||||
+ kmovd %k0, %eax
|
|
||||||
+ testl %eax, %eax
|
|
||||||
+ jnz L(first_vec_x4)
|
|
||||||
|
|
||||||
+ addq $VEC_SIZE, %rdi
|
|
||||||
# ifdef USE_AS_STRNLEN
|
|
||||||
- /* Adjust length. */
|
|
||||||
+ /* Check if at last VEC_SIZE * 4 length. */
|
|
||||||
+ cmpq $(CHAR_PER_VEC * 4 - 1), %rsi
|
|
||||||
+ jbe L(last_4x_vec_or_less_load)
|
|
||||||
+ movl %edi, %ecx
|
|
||||||
+ andl $(VEC_SIZE * 4 - 1), %ecx
|
|
||||||
+# ifdef USE_AS_WCSLEN
|
|
||||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
|
||||||
+ sarl $2, %ecx
|
|
||||||
+# endif
|
|
||||||
+ /* Readjust length. */
|
|
||||||
addq %rcx, %rsi
|
|
||||||
# endif
|
|
||||||
+ /* Align data to VEC_SIZE * 4. */
|
|
||||||
+ andq $-(VEC_SIZE * 4), %rdi
|
|
||||||
|
|
||||||
+ /* Compare 4 * VEC at a time forward. */
|
|
||||||
.p2align 4
|
|
||||||
L(loop_4x_vec):
|
|
||||||
- /* Compare 4 * VEC at a time forward. */
|
|
||||||
- VMOVA (%rdi), %YMM1
|
|
||||||
- VMOVA VEC_SIZE(%rdi), %YMM2
|
|
||||||
- VMOVA (VEC_SIZE * 2)(%rdi), %YMM3
|
|
||||||
- VMOVA (VEC_SIZE * 3)(%rdi), %YMM4
|
|
||||||
-
|
|
||||||
- VPMINU %YMM1, %YMM2, %YMM5
|
|
||||||
- VPMINU %YMM3, %YMM4, %YMM6
|
|
||||||
+ /* Load first VEC regardless. */
|
|
||||||
+ VMOVA (VEC_SIZE * 4)(%rdi), %YMM1
|
|
||||||
+# ifdef USE_AS_STRNLEN
|
|
||||||
+ /* Break if at end of length. */
|
|
||||||
+ subq $(CHAR_PER_VEC * 4), %rsi
|
|
||||||
+ jb L(last_4x_vec_or_less_cmpeq)
|
|
||||||
+# endif
|
|
||||||
+ /* Save some code size by microfusing VPMINU with the load. Since
|
|
||||||
+ the matches in ymm2/ymm4 can only be returned if there where no
|
|
||||||
+ matches in ymm1/ymm3 respectively there is no issue with overlap.
|
|
||||||
+ */
|
|
||||||
+ VPMINU (VEC_SIZE * 5)(%rdi), %YMM1, %YMM2
|
|
||||||
+ VMOVA (VEC_SIZE * 6)(%rdi), %YMM3
|
|
||||||
+ VPMINU (VEC_SIZE * 7)(%rdi), %YMM3, %YMM4
|
|
||||||
+
|
|
||||||
+ VPCMP $0, %YMM2, %YMMZERO, %k0
|
|
||||||
+ VPCMP $0, %YMM4, %YMMZERO, %k1
|
|
||||||
+ subq $-(VEC_SIZE * 4), %rdi
|
|
||||||
+ kortestd %k0, %k1
|
|
||||||
+ jz L(loop_4x_vec)
|
|
||||||
+
|
|
||||||
+ /* Check if end was in first half. */
|
|
||||||
+ kmovd %k0, %eax
|
|
||||||
+ subq %rdx, %rdi
|
|
||||||
+# ifdef USE_AS_WCSLEN
|
|
||||||
+ shrq $2, %rdi
|
|
||||||
+# endif
|
|
||||||
+ testl %eax, %eax
|
|
||||||
+ jz L(second_vec_return)
|
|
||||||
|
|
||||||
- VPMINU %YMM5, %YMM6, %YMM5
|
|
||||||
- VPCMP $0, %YMM5, %YMMZERO, %k0
|
|
||||||
- ktestd %k0, %k0
|
|
||||||
- jnz L(4x_vec_end)
|
|
||||||
+ VPCMP $0, %YMM1, %YMMZERO, %k2
|
|
||||||
+ kmovd %k2, %edx
|
|
||||||
+ /* Combine VEC1 matches (edx) with VEC2 matches (eax). */
|
|
||||||
+# ifdef USE_AS_WCSLEN
|
|
||||||
+ sall $CHAR_PER_VEC, %eax
|
|
||||||
+ orl %edx, %eax
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+# else
|
|
||||||
+ salq $CHAR_PER_VEC, %rax
|
|
||||||
+ orq %rdx, %rax
|
|
||||||
+ tzcntq %rax, %rax
|
|
||||||
+# endif
|
|
||||||
+ addq %rdi, %rax
|
|
||||||
+ ret
|
|
||||||
|
|
||||||
- addq $(VEC_SIZE * 4), %rdi
|
|
||||||
|
|
||||||
-# ifndef USE_AS_STRNLEN
|
|
||||||
- jmp L(loop_4x_vec)
|
|
||||||
-# else
|
|
||||||
- subq $(VEC_SIZE * 4), %rsi
|
|
||||||
- ja L(loop_4x_vec)
|
|
||||||
+# ifdef USE_AS_STRNLEN
|
|
||||||
|
|
||||||
+L(last_4x_vec_or_less_load):
|
|
||||||
+ /* Depending on entry adjust rdi / prepare first VEC in YMM1. */
|
|
||||||
+ VMOVA (VEC_SIZE * 4)(%rdi), %YMM1
|
|
||||||
+L(last_4x_vec_or_less_cmpeq):
|
|
||||||
+ VPCMP $0, %YMM1, %YMMZERO, %k0
|
|
||||||
+ addq $(VEC_SIZE * 3), %rdi
|
|
||||||
L(last_4x_vec_or_less):
|
|
||||||
- /* Less than 4 * VEC and aligned to VEC_SIZE. */
|
|
||||||
- addl $(VEC_SIZE * 2), %esi
|
|
||||||
- jle L(last_2x_vec)
|
|
||||||
-
|
|
||||||
- VPCMP $0, (%rdi), %YMMZERO, %k0
|
|
||||||
kmovd %k0, %eax
|
|
||||||
+ /* If remaining length > VEC_SIZE * 2. This works if esi is off by
|
|
||||||
+ VEC_SIZE * 4. */
|
|
||||||
+ testl $(CHAR_PER_VEC * 2), %esi
|
|
||||||
+ jnz L(last_4x_vec)
|
|
||||||
+
|
|
||||||
+ /* length may have been negative or positive by an offset of
|
|
||||||
+ CHAR_PER_VEC * 4 depending on where this was called from. This
|
|
||||||
+ fixes that. */
|
|
||||||
+ andl $(CHAR_PER_VEC * 4 - 1), %esi
|
|
||||||
testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x0)
|
|
||||||
+ jnz L(last_vec_x1_check)
|
|
||||||
|
|
||||||
- VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0
|
|
||||||
- kmovd %k0, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x1)
|
|
||||||
+ /* Check the end of data. */
|
|
||||||
+ subl $CHAR_PER_VEC, %esi
|
|
||||||
+ jb L(max)
|
|
||||||
|
|
||||||
VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
|
|
||||||
kmovd %k0, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x2_check)
|
|
||||||
- subl $VEC_SIZE, %esi
|
|
||||||
- jle L(max)
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+ /* Check the end of data. */
|
|
||||||
+ cmpl %eax, %esi
|
|
||||||
+ jb L(max)
|
|
||||||
|
|
||||||
- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
|
|
||||||
- kmovd %k0, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x3_check)
|
|
||||||
+ subq %rdx, %rdi
|
|
||||||
+# ifdef USE_AS_WCSLEN
|
|
||||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
|
||||||
+ sarq $2, %rdi
|
|
||||||
+# endif
|
|
||||||
+ leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax
|
|
||||||
+ ret
|
|
||||||
+L(max):
|
|
||||||
movq %r8, %rax
|
|
||||||
+ ret
|
|
||||||
+# endif
|
|
||||||
+
|
|
||||||
+ /* Placed here in strnlen so that the jcc L(last_4x_vec_or_less)
|
|
||||||
+ in the 4x VEC loop can use 2 byte encoding. */
|
|
||||||
+ .p2align 4
|
|
||||||
+L(second_vec_return):
|
|
||||||
+ VPCMP $0, %YMM3, %YMMZERO, %k0
|
|
||||||
+ /* Combine YMM3 matches (k0) with YMM4 matches (k1). */
|
|
||||||
+# ifdef USE_AS_WCSLEN
|
|
||||||
+ kunpckbw %k0, %k1, %k0
|
|
||||||
+ kmovd %k0, %eax
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+# else
|
|
||||||
+ kunpckdq %k0, %k1, %k0
|
|
||||||
+ kmovq %k0, %rax
|
|
||||||
+ tzcntq %rax, %rax
|
|
||||||
+# endif
|
|
||||||
+ leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax
|
|
||||||
+ ret
|
|
||||||
+
|
|
||||||
+
|
|
||||||
+# ifdef USE_AS_STRNLEN
|
|
||||||
+L(last_vec_x1_check):
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+ /* Check the end of data. */
|
|
||||||
+ cmpl %eax, %esi
|
|
||||||
+ jb L(max)
|
|
||||||
+ subq %rdx, %rdi
|
|
||||||
# ifdef USE_AS_WCSLEN
|
|
||||||
- shrq $2, %rax
|
|
||||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
|
||||||
+ sarq $2, %rdi
|
|
||||||
# endif
|
|
||||||
+ leaq (CHAR_PER_VEC)(%rdi, %rax), %rax
|
|
||||||
ret
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
-L(last_2x_vec):
|
|
||||||
- addl $(VEC_SIZE * 2), %esi
|
|
||||||
+L(last_4x_vec):
|
|
||||||
+ /* Test first 2x VEC normally. */
|
|
||||||
+ testl %eax, %eax
|
|
||||||
+ jnz L(last_vec_x1)
|
|
||||||
|
|
||||||
- VPCMP $0, (%rdi), %YMMZERO, %k0
|
|
||||||
+ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
|
|
||||||
kmovd %k0, %eax
|
|
||||||
testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x0_check)
|
|
||||||
- subl $VEC_SIZE, %esi
|
|
||||||
- jle L(max)
|
|
||||||
+ jnz L(last_vec_x2)
|
|
||||||
|
|
||||||
- VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0
|
|
||||||
+ /* Normalize length. */
|
|
||||||
+ andl $(CHAR_PER_VEC * 4 - 1), %esi
|
|
||||||
+ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
|
|
||||||
kmovd %k0, %eax
|
|
||||||
testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x1_check)
|
|
||||||
- movq %r8, %rax
|
|
||||||
-# ifdef USE_AS_WCSLEN
|
|
||||||
- shrq $2, %rax
|
|
||||||
-# endif
|
|
||||||
- ret
|
|
||||||
+ jnz L(last_vec_x3)
|
|
||||||
|
|
||||||
- .p2align 4
|
|
||||||
-L(first_vec_x0_check):
|
|
||||||
+ /* Check the end of data. */
|
|
||||||
+ subl $(CHAR_PER_VEC * 3), %esi
|
|
||||||
+ jb L(max)
|
|
||||||
+
|
|
||||||
+ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
|
|
||||||
+ kmovd %k0, %eax
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
-# ifdef USE_AS_WCSLEN
|
|
||||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
||||||
- sall $2, %eax
|
|
||||||
-# endif
|
|
||||||
/* Check the end of data. */
|
|
||||||
- cmpq %rax, %rsi
|
|
||||||
- jbe L(max)
|
|
||||||
- addq %rdi, %rax
|
|
||||||
- subq %rdx, %rax
|
|
||||||
+ cmpl %eax, %esi
|
|
||||||
+ jb L(max_end)
|
|
||||||
+
|
|
||||||
+ subq %rdx, %rdi
|
|
||||||
# ifdef USE_AS_WCSLEN
|
|
||||||
- shrq $2, %rax
|
|
||||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
|
||||||
+ sarq $2, %rdi
|
|
||||||
# endif
|
|
||||||
+ leaq (CHAR_PER_VEC * 4)(%rdi, %rax), %rax
|
|
||||||
ret
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
-L(first_vec_x1_check):
|
|
||||||
+L(last_vec_x1):
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
+ subq %rdx, %rdi
|
|
||||||
# ifdef USE_AS_WCSLEN
|
|
||||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
||||||
- sall $2, %eax
|
|
||||||
-# endif
|
|
||||||
- /* Check the end of data. */
|
|
||||||
- cmpq %rax, %rsi
|
|
||||||
- jbe L(max)
|
|
||||||
- addq $VEC_SIZE, %rax
|
|
||||||
- addq %rdi, %rax
|
|
||||||
- subq %rdx, %rax
|
|
||||||
-# ifdef USE_AS_WCSLEN
|
|
||||||
- shrq $2, %rax
|
|
||||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
|
||||||
+ sarq $2, %rdi
|
|
||||||
# endif
|
|
||||||
+ leaq (CHAR_PER_VEC)(%rdi, %rax), %rax
|
|
||||||
ret
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
-L(first_vec_x2_check):
|
|
||||||
+L(last_vec_x2):
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
+ subq %rdx, %rdi
|
|
||||||
# ifdef USE_AS_WCSLEN
|
|
||||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
||||||
- sall $2, %eax
|
|
||||||
-# endif
|
|
||||||
- /* Check the end of data. */
|
|
||||||
- cmpq %rax, %rsi
|
|
||||||
- jbe L(max)
|
|
||||||
- addq $(VEC_SIZE * 2), %rax
|
|
||||||
- addq %rdi, %rax
|
|
||||||
- subq %rdx, %rax
|
|
||||||
-# ifdef USE_AS_WCSLEN
|
|
||||||
- shrq $2, %rax
|
|
||||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
|
||||||
+ sarq $2, %rdi
|
|
||||||
# endif
|
|
||||||
+ leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax
|
|
||||||
ret
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
-L(first_vec_x3_check):
|
|
||||||
+L(last_vec_x3):
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
-# ifdef USE_AS_WCSLEN
|
|
||||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
||||||
- sall $2, %eax
|
|
||||||
-# endif
|
|
||||||
+ subl $(CHAR_PER_VEC * 2), %esi
|
|
||||||
/* Check the end of data. */
|
|
||||||
- cmpq %rax, %rsi
|
|
||||||
- jbe L(max)
|
|
||||||
- addq $(VEC_SIZE * 3), %rax
|
|
||||||
- addq %rdi, %rax
|
|
||||||
- subq %rdx, %rax
|
|
||||||
+ cmpl %eax, %esi
|
|
||||||
+ jb L(max_end)
|
|
||||||
+ subq %rdx, %rdi
|
|
||||||
# ifdef USE_AS_WCSLEN
|
|
||||||
- shrq $2, %rax
|
|
||||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
|
||||||
+ sarq $2, %rdi
|
|
||||||
# endif
|
|
||||||
+ leaq (CHAR_PER_VEC * 3)(%rdi, %rax), %rax
|
|
||||||
ret
|
|
||||||
-
|
|
||||||
- .p2align 4
|
|
||||||
-L(max):
|
|
||||||
+L(max_end):
|
|
||||||
movq %r8, %rax
|
|
||||||
-# ifdef USE_AS_WCSLEN
|
|
||||||
- shrq $2, %rax
|
|
||||||
-# endif
|
|
||||||
- ret
|
|
||||||
-
|
|
||||||
- .p2align 4
|
|
||||||
-L(zero):
|
|
||||||
- xorl %eax, %eax
|
|
||||||
ret
|
|
||||||
# endif
|
|
||||||
|
|
||||||
+ /* Cold case for crossing page with first load. */
|
|
||||||
.p2align 4
|
|
||||||
-L(first_vec_x0):
|
|
||||||
- tzcntl %eax, %eax
|
|
||||||
-# ifdef USE_AS_WCSLEN
|
|
||||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
||||||
- sall $2, %eax
|
|
||||||
-# endif
|
|
||||||
- addq %rdi, %rax
|
|
||||||
- subq %rdx, %rax
|
|
||||||
+L(cross_page_boundary):
|
|
||||||
+ movq %rdi, %rdx
|
|
||||||
+ /* Align data to VEC_SIZE. */
|
|
||||||
+ andq $-VEC_SIZE, %rdi
|
|
||||||
+ VPCMP $0, (%rdi), %YMMZERO, %k0
|
|
||||||
+ kmovd %k0, %eax
|
|
||||||
+ /* Remove the leading bytes. */
|
|
||||||
# ifdef USE_AS_WCSLEN
|
|
||||||
- shrq $2, %rax
|
|
||||||
+ /* NB: Divide shift count by 4 since each bit in K0 represent 4
|
|
||||||
+ bytes. */
|
|
||||||
+ movl %edx, %ecx
|
|
||||||
+ shrl $2, %ecx
|
|
||||||
+ andl $(CHAR_PER_VEC - 1), %ecx
|
|
||||||
# endif
|
|
||||||
- ret
|
|
||||||
-
|
|
||||||
- .p2align 4
|
|
||||||
-L(first_vec_x1):
|
|
||||||
+ /* SHIFT_REG is ecx for USE_AS_WCSLEN and edx otherwise. */
|
|
||||||
+ sarxl %SHIFT_REG, %eax, %eax
|
|
||||||
+ testl %eax, %eax
|
|
||||||
+# ifndef USE_AS_STRNLEN
|
|
||||||
+ jz L(cross_page_continue)
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
-# ifdef USE_AS_WCSLEN
|
|
||||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
||||||
- sall $2, %eax
|
|
||||||
-# endif
|
|
||||||
- addq $VEC_SIZE, %rax
|
|
||||||
- addq %rdi, %rax
|
|
||||||
- subq %rdx, %rax
|
|
||||||
-# ifdef USE_AS_WCSLEN
|
|
||||||
- shrq $2, %rax
|
|
||||||
-# endif
|
|
||||||
ret
|
|
||||||
-
|
|
||||||
- .p2align 4
|
|
||||||
-L(first_vec_x2):
|
|
||||||
- tzcntl %eax, %eax
|
|
||||||
-# ifdef USE_AS_WCSLEN
|
|
||||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
||||||
- sall $2, %eax
|
|
||||||
-# endif
|
|
||||||
- addq $(VEC_SIZE * 2), %rax
|
|
||||||
- addq %rdi, %rax
|
|
||||||
- subq %rdx, %rax
|
|
||||||
-# ifdef USE_AS_WCSLEN
|
|
||||||
- shrq $2, %rax
|
|
||||||
-# endif
|
|
||||||
+# else
|
|
||||||
+ jnz L(cross_page_less_vec)
|
|
||||||
+# ifndef USE_AS_WCSLEN
|
|
||||||
+ movl %edx, %ecx
|
|
||||||
+ andl $(CHAR_PER_VEC - 1), %ecx
|
|
||||||
+# endif
|
|
||||||
+ movl $CHAR_PER_VEC, %eax
|
|
||||||
+ subl %ecx, %eax
|
|
||||||
+ /* Check the end of data. */
|
|
||||||
+ cmpq %rax, %rsi
|
|
||||||
+ ja L(cross_page_continue)
|
|
||||||
+ movl %esi, %eax
|
|
||||||
ret
|
|
||||||
-
|
|
||||||
- .p2align 4
|
|
||||||
-L(4x_vec_end):
|
|
||||||
- VPCMP $0, %YMM1, %YMMZERO, %k0
|
|
||||||
- kmovd %k0, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x0)
|
|
||||||
- VPCMP $0, %YMM2, %YMMZERO, %k1
|
|
||||||
- kmovd %k1, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x1)
|
|
||||||
- VPCMP $0, %YMM3, %YMMZERO, %k2
|
|
||||||
- kmovd %k2, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x2)
|
|
||||||
- VPCMP $0, %YMM4, %YMMZERO, %k3
|
|
||||||
- kmovd %k3, %eax
|
|
||||||
-L(first_vec_x3):
|
|
||||||
+L(cross_page_less_vec):
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
-# ifdef USE_AS_WCSLEN
|
|
||||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
||||||
- sall $2, %eax
|
|
||||||
-# endif
|
|
||||||
- addq $(VEC_SIZE * 3), %rax
|
|
||||||
- addq %rdi, %rax
|
|
||||||
- subq %rdx, %rax
|
|
||||||
-# ifdef USE_AS_WCSLEN
|
|
||||||
- shrq $2, %rax
|
|
||||||
-# endif
|
|
||||||
+ /* Select min of length and position of first null. */
|
|
||||||
+ cmpq %rax, %rsi
|
|
||||||
+ cmovb %esi, %eax
|
|
||||||
ret
|
|
||||||
+# endif
|
|
||||||
|
|
||||||
END (STRLEN)
|
|
||||||
#endif
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
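Editor's note: the rewritten routine above boils down to one pattern: align the pointer down, vector-compare a full VEC against zero to get a bitmask of null positions, shift off the bits for bytes that precede the start (the sarxl), and tzcnt the first set bit. Below is a minimal C sketch of that pattern with AVX-512 intrinsics. It is an illustration under assumptions (64-byte vectors, an invented function name, strnlen length handling omitted), not the glibc code; build with -mavx512f -mavx512bw -mbmi.

#include <immintrin.h>
#include <stdint.h>
#include <stddef.h>

/* Illustration only: strlen over 64-byte EVEX vectors.  Aligned loads
   cannot cross a page boundary, so reading bytes before S (and past the
   null) within one vector is safe; this mirrors the assembly's
   cross_page_boundary handling.  */
static size_t
strlen_evex_sketch (const char *s)
{
  const __m512i zero = _mm512_setzero_si512 ();
  const char *p = (const char *) ((uintptr_t) s & ~(uintptr_t) 63);
  /* First vector: drop mask bits for bytes before S, like the sarxl.  */
  uint64_t m = _mm512_cmpeq_epi8_mask (_mm512_load_si512 ((const void *) p),
                                       zero)
               >> ((uintptr_t) s & 63);
  if (m != 0)
    return (size_t) _tzcnt_u64 (m);
  for (;;)
    {
      p += 64;
      m = _mm512_cmpeq_epi8_mask (_mm512_load_si512 ((const void *) p), zero);
      if (m != 0)
        return (size_t) (p - s) + (size_t) _tzcnt_u64 (m);
    }
}

The real routine unrolls this loop 4x and folds four compares into two with VPMINU, which is what the microfused loads in the hunk above are about.
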
@@ -1,158 +0,0 @@
From ea8e465a6b8d0f26c72bcbe453a854de3abf68ec Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Wed, 30 Jun 2021 10:47:06 -0700
Subject: [PATCH] x86: Check RTM_ALWAYS_ABORT for RTM [BZ #28033]
Content-type: text/plain; charset=UTF-8

From

https://www.intel.com/content/www/us/en/support/articles/000059422/processors.html

* Intel TSX will be disabled by default.
* The processor will force abort all Restricted Transactional Memory (RTM)
  transactions by default.
* A new CPUID bit CPUID.07H.0H.EDX[11](RTM_ALWAYS_ABORT) will be enumerated,
  which is set to indicate to updated software that the loaded microcode is
  forcing RTM abort.
* On processors that enumerate support for RTM, the CPUID enumeration bits
  for Intel TSX (CPUID.07H.0H.EBX[11] and CPUID.07H.0H.EBX[4]) continue to
  be set by default after microcode update.
* Workloads that were benefited from Intel TSX might experience a change
  in performance.
* System software may use a new bit in Model-Specific Register (MSR) 0x10F
  TSX_FORCE_ABORT[TSX_CPUID_CLEAR] functionality to clear the Hardware Lock
  Elision (HLE) and RTM bits to indicate to software that Intel TSX is
  disabled.

1. Add RTM_ALWAYS_ABORT to CPUID features.
2. Set RTM usable only if RTM_ALWAYS_ABORT isn't set.  This skips the
   string/tst-memchr-rtm etc. testcases on the affected processors, which
   always fail after a microcode update.
3. Check RTM feature, instead of usability, against /proc/cpuinfo.

This fixes BZ #28033.
---
 manual/platform.texi                    | 3 +++
 sysdeps/x86/cpu-features.c              | 5 ++++-
 sysdeps/x86/sys/platform/x86.h          | 6 +++---
 sysdeps/x86/tst-cpu-features-supports.c | 2 +-
 sysdeps/x86/tst-get-cpu-features.c      | 2 ++
 5 files changed, 13 insertions(+), 5 deletions(-)

Conflicts:
	sysdeps/x86/bits/platform/x86.h
	  (doesn't exist)
	sysdeps/x86/bits/platform/x86.h
	  (account for lack of upstream renames)

diff --git a/manual/platform.texi b/manual/platform.texi
index 8fec2933..b7e8aef7 100644
--- a/manual/platform.texi
+++ b/manual/platform.texi
@@ -510,6 +510,9 @@ capability.
@item
@code{RTM} -- RTM instruction extensions.

+@item
+@code{RTM_ALWAYS_ABORT} -- Transactions always abort, making RTM unusable.
+
@item
@code{SDBG} -- IA32_DEBUG_INTERFACE MSR for silicon debug.

diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index 3610ee5c..4889f062 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -74,7 +74,6 @@ update_usable (struct cpu_features *cpu_features)
  CPU_FEATURE_SET_USABLE (cpu_features, HLE);
  CPU_FEATURE_SET_USABLE (cpu_features, BMI2);
  CPU_FEATURE_SET_USABLE (cpu_features, ERMS);
-  CPU_FEATURE_SET_USABLE (cpu_features, RTM);
  CPU_FEATURE_SET_USABLE (cpu_features, RDSEED);
  CPU_FEATURE_SET_USABLE (cpu_features, ADX);
  CPU_FEATURE_SET_USABLE (cpu_features, CLFLUSHOPT);
@@ -90,6 +89,7 @@ update_usable (struct cpu_features *cpu_features)
  CPU_FEATURE_SET_USABLE (cpu_features, MOVDIRI);
  CPU_FEATURE_SET_USABLE (cpu_features, MOVDIR64B);
  CPU_FEATURE_SET_USABLE (cpu_features, FSRM);
+  CPU_FEATURE_SET_USABLE (cpu_features, RTM_ALWAYS_ABORT);
  CPU_FEATURE_SET_USABLE (cpu_features, SERIALIZE);
  CPU_FEATURE_SET_USABLE (cpu_features, TSXLDTRK);
  CPU_FEATURE_SET_USABLE (cpu_features, LAHF64_SAHF64);
@@ -779,6 +779,9 @@ no_cpuid:
    GLRO(dl_platform) = "i586";
#endif

+  if (!CPU_FEATURES_CPU_P (cpu_features, RTM_ALWAYS_ABORT))
+    CPU_FEATURE_SET_USABLE (cpu_features, RTM);
+
#if CET_ENABLED
# if HAVE_TUNABLES
  TUNABLE_GET (x86_ibt, tunable_val_t *,
diff --git a/sysdeps/x86/sys/platform/x86.h b/sysdeps/x86/sys/platform/x86.h
index e5cc7c68..7a434926 100644
--- a/sysdeps/x86/sys/platform/x86.h
+++ b/sysdeps/x86/sys/platform/x86.h
@@ -247,7 +247,7 @@ extern const struct cpu_features *__x86_get_cpu_features (unsigned int)
#define bit_cpu_AVX512_VP2INTERSECT (1u << 8)
#define bit_cpu_INDEX_7_EDX_9	(1u << 9)
#define bit_cpu_MD_CLEAR	(1u << 10)
-#define bit_cpu_INDEX_7_EDX_11	(1u << 11)
+#define bit_cpu_RTM_ALWAYS_ABORT (1u << 11)
#define bit_cpu_INDEX_7_EDX_12	(1u << 12)
#define bit_cpu_INDEX_7_EDX_13	(1u << 13)
#define bit_cpu_SERIALIZE	(1u << 14)
@@ -471,7 +471,7 @@ extern const struct cpu_features *__x86_get_cpu_features (unsigned int)
#define index_cpu_AVX512_VP2INTERSECT COMMON_CPUID_INDEX_7
#define index_cpu_INDEX_7_EDX_9 COMMON_CPUID_INDEX_7
#define index_cpu_MD_CLEAR COMMON_CPUID_INDEX_7
-#define index_cpu_INDEX_7_EDX_11 COMMON_CPUID_INDEX_7
+#define index_cpu_RTM_ALWAYS_ABORT COMMON_CPUID_INDEX_7
#define index_cpu_INDEX_7_EDX_12 COMMON_CPUID_INDEX_7
#define index_cpu_INDEX_7_EDX_13 COMMON_CPUID_INDEX_7
#define index_cpu_SERIALIZE COMMON_CPUID_INDEX_7
@@ -695,7 +695,7 @@ extern const struct cpu_features *__x86_get_cpu_features (unsigned int)
#define reg_AVX512_VP2INTERSECT edx
#define reg_INDEX_7_EDX_9	edx
#define reg_MD_CLEAR	edx
-#define reg_INDEX_7_EDX_11	edx
+#define reg_RTM_ALWAYS_ABORT	edx
#define reg_INDEX_7_EDX_12	edx
#define reg_INDEX_7_EDX_13	edx
#define reg_SERIALIZE	edx
diff --git a/sysdeps/x86/tst-cpu-features-supports.c b/sysdeps/x86/tst-cpu-features-supports.c
index 287cf01f..8100a319 100644
--- a/sysdeps/x86/tst-cpu-features-supports.c
+++ b/sysdeps/x86/tst-cpu-features-supports.c
@@ -152,7 +152,7 @@ do_test (int argc, char **argv)
  fails += CHECK_SUPPORTS (rdpid, RDPID);
  fails += CHECK_SUPPORTS (rdrnd, RDRAND);
  fails += CHECK_SUPPORTS (rdseed, RDSEED);
-  fails += CHECK_SUPPORTS (rtm, RTM);
+  fails += CHECK_CPU_SUPPORTS (rtm, RTM);
  fails += CHECK_SUPPORTS (serialize, SERIALIZE);
  fails += CHECK_SUPPORTS (sha, SHA);
  fails += CHECK_CPU_SUPPORTS (shstk, SHSTK);
diff --git a/sysdeps/x86/tst-get-cpu-features.c b/sysdeps/x86/tst-get-cpu-features.c
index 2763deb6..0717e5d8 100644
--- a/sysdeps/x86/tst-get-cpu-features.c
+++ b/sysdeps/x86/tst-get-cpu-features.c
@@ -183,6 +183,7 @@ do_test (void)
  CHECK_CPU_FEATURE (UINTR);
  CHECK_CPU_FEATURE (AVX512_VP2INTERSECT);
  CHECK_CPU_FEATURE (MD_CLEAR);
+  CHECK_CPU_FEATURE (RTM_ALWAYS_ABORT);
  CHECK_CPU_FEATURE (SERIALIZE);
  CHECK_CPU_FEATURE (HYBRID);
  CHECK_CPU_FEATURE (TSXLDTRK);
@@ -344,6 +345,7 @@ do_test (void)
  CHECK_CPU_FEATURE_USABLE (FSRM);
  CHECK_CPU_FEATURE_USABLE (AVX512_VP2INTERSECT);
  CHECK_CPU_FEATURE_USABLE (MD_CLEAR);
+  CHECK_CPU_FEATURE_USABLE (RTM_ALWAYS_ABORT);
  CHECK_CPU_FEATURE_USABLE (SERIALIZE);
  CHECK_CPU_FEATURE_USABLE (HYBRID);
  CHECK_CPU_FEATURE_USABLE (TSXLDTRK);
--
GitLab

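Editor's note: the CPUID plumbing above reduces to reading bit 11 of EDX from leaf 7, subleaf 0 (the bit position comes straight from the patch description). A sketch of the same check in plain C with GCC's <cpuid.h> helper; the function name is invented for illustration.

#include <cpuid.h>
#include <stdbool.h>

/* Detect RTM_ALWAYS_ABORT (CPUID.07H.0H:EDX[11]).  If set, RTM
   transactions always abort and glibc leaves RTM marked unusable.  */
static bool
rtm_always_abort_p (void)
{
  unsigned int eax, ebx, ecx, edx;
  if (!__get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx))
    return false;               /* Leaf 7 not supported.  */
  return (edx >> 11) & 1;
}
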
@@ -1,51 +0,0 @@
From 0679442defedf7e52a94264975880ab8674736b2 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Thu, 8 Jul 2021 16:13:19 -0400
Subject: [PATCH] x86: Remove wcsnlen-sse4_1 from wcslen ifunc-impl-list [BZ
 #28064]
Content-type: text/plain; charset=UTF-8

The following commit

commit 6f573a27b6c8b4236445810a44660612323f5a73
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date:   Wed Jun 23 01:19:34 2021 -0400

    x86-64: Add wcslen optimize for sse4.1

Added wcsnlen-sse4.1 to the wcslen ifunc implementation list and did
not add wcslen-sse4.1 to wcslen ifunc implementation list. This commit
fixes that by removing wcsnlen-sse4.1 from the wcslen ifunc
implementation list and adding wcslen-sse4.1 to the ifunc
implementation list.

Testing:
test-wcslen.c, test-rsi-wcslen.c, and test-rsi-strlen.c are passing as
well as all other tests in wcsmbs and string.

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
 sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 580913ca..695cdba6 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -657,9 +657,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
			      && CPU_FEATURE_USABLE (AVX512BW)
			      && CPU_FEATURE_USABLE (BMI2)),
			     __wcslen_evex)
-	      IFUNC_IMPL_ADD (array, i, wcsnlen,
+	      IFUNC_IMPL_ADD (array, i, wcslen,
			      CPU_FEATURE_USABLE (SSE4_1),
-			      __wcsnlen_sse4_1)
+			      __wcslen_sse4_1)
	      IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_sse2))

  /* Support sysdeps/x86_64/multiarch/wcsnlen.c.  */
--
GitLab

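Editor's note: the ifunc-impl-list table the patch corrects is glibc's internal enumeration used by the test suite; the underlying mechanism is an IFUNC resolver that binds one implementation at load time. A generic sketch of that mechanism with GCC attributes follows. The names and the trivial loop bodies are placeholders, not glibc's variants or its selector logic.

#include <stddef.h>
#include <wchar.h>

/* Placeholder implementations standing in for __wcslen_sse4_1 and
   __wcslen_sse2; real variants differ in their inner loops.  */
static size_t
wcslen_sse4_1_stub (const wchar_t *s)
{
  size_t n = 0;
  while (s[n] != L'\0')
    n++;
  return n;
}

static size_t
wcslen_sse2_stub (const wchar_t *s)
{
  size_t n = 0;
  while (s[n] != L'\0')
    n++;
  return n;
}

/* Runs once when the binary is loaded, like glibc's wcslen ifunc.  */
static void *
wcslen_resolver (void)
{
  __builtin_cpu_init ();
  return __builtin_cpu_supports ("sse4.1")
         ? (void *) wcslen_sse4_1_stub
         : (void *) wcslen_sse2_stub;
}

size_t my_wcslen (const wchar_t *s)
     __attribute__ ((ifunc ("wcslen_resolver")));

The BZ #28064 bug was precisely a mismatch in this kind of table: the tested entry (wcsnlen-sse4.1) was not the variant the resolver could actually select for wcslen.
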
@@ -1,51 +0,0 @@
From 55c7bcc71b84123d5d4bd2814366a6b05fcf8ebd Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Sat, 9 May 2020 12:04:23 -0700
Subject: [PATCH] x86-64: Use RDX_LP on __x86_shared_non_temporal_threshold [BZ
 #25966]
Content-type: text/plain; charset=UTF-8

Since __x86_shared_non_temporal_threshold is defined as

long int __x86_shared_non_temporal_threshold;

and long int is 4 bytes for x32, use RDX_LP to compare against
__x86_shared_non_temporal_threshold in assembly code.
---
 sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 71f5954d..673b73aa 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -245,7 +245,7 @@ L(return):
#endif

L(movsb):
-	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
+	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
	jae	L(more_8x_vec)
	cmpq	%rsi, %rdi
	jb	1f
@@ -397,7 +397,7 @@ L(more_8x_vec):
	addq	%r8, %rdx
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
-	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
+	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
	ja	L(large_forward)
#endif
L(loop_4x_vec_forward):
@@ -448,7 +448,7 @@ L(more_8x_vec_backward):
	subq	%r8, %rdx
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
-	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
+	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
	ja	L(large_backward)
#endif
L(loop_4x_vec_backward):
--
GitLab

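Editor's note: the bug only bites on the x32 ABI, where long int is 4 bytes, so a 64-bit cmpq against the threshold's memory slot would also read 4 bytes of whatever follows the variable. A small C illustration of the size difference; the symbol name in the patch is real, the demo variable below is not.

#include <stdio.h>

/* Demo stand-in for __x86_shared_non_temporal_threshold, which glibc
   declares as long int.  */
static long int threshold_demo = 0x100000;

int
main (void)
{
  /* Prints 8 on x86-64 LP64 but 4 on the x32 ILP32 ABI, which is why
     the assembly must compare through RDX_LP (%edx on x32).  */
  printf ("sizeof (long int) = %zu\n", sizeof threshold_demo);
  return 0;
}
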
@@ -1,44 +0,0 @@
From a35a59036ebae3efcdf5e8167610e0656fca9770 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Thu, 11 Jun 2020 12:41:18 -0700
Subject: [PATCH] x86_64: Use %xmmN with vpxor to clear a vector register
Content-type: text/plain; charset=UTF-8

Since "vpxor %xmmN, %xmmN, %xmmN" clears the whole vector register, use
%xmmN, instead of %ymmN, with vpxor to clear a vector register.
---
 sysdeps/x86_64/multiarch/strcmp-avx2.S  | 4 ++--
 sysdeps/x86_64/multiarch/strrchr-avx2.S | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
index 433ae047..70d8499b 100644
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
@@ -105,8 +105,8 @@ ENTRY (STRCMP)
# endif
	movl	%edi, %eax
	xorl	%edx, %edx
-	/* Make %ymm7 all zeros in this function.  */
-	vpxor	%ymm7, %ymm7, %ymm7
+	/* Make %xmm7 (%ymm7) all zeros in this function.  */
+	vpxor	%xmm7, %xmm7, %xmm7
	orl	%esi, %eax
	andl	$(PAGE_SIZE - 1), %eax
	cmpl	$(PAGE_SIZE - (VEC_SIZE * 4)), %eax
diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
index 9f22a15e..c949410b 100644
--- a/sysdeps/x86_64/multiarch/strrchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S
@@ -48,7 +48,7 @@ ENTRY (STRRCHR)
	movl	%edi, %ecx
	/* Broadcast CHAR to YMM4.  */
	VPBROADCAST	%xmm4, %ymm4
-	vpxor	%ymm0, %ymm0, %ymm0
+	vpxor	%xmm0, %xmm0, %xmm0

	/* Check if we may cross page boundary with one vector load.  */
	andl	$(2 * VEC_SIZE - 1), %ecx
--
GitLab

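Editor's note: the change is safe because VEX-encoded instructions that write an XMM register zero bits 128 and up of the corresponding YMM register, so the 128-bit form clears just as much while encoding no longer. Compilers rely on the same rule; a one-line C check you can inspect with -mavx2 -S (the function name is illustrative):

#include <immintrin.h>

/* GCC and Clang typically emit "vpxor %xmm0, %xmm0, %xmm0" here, the
   same form the patch switches to, because VEX zeroing of the upper
   bits makes the 128-bit XOR clear the full 256-bit register.  */
__m256i
make_zero (void)
{
  return _mm256_setzero_si256 ();
}
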
@@ -1,359 +0,0 @@
From 1f745ecc2109890886b161d4791e1406fdfc29b8 Mon Sep 17 00:00:00 2001
From: noah <goldstein.w.n@gmail.com>
Date: Wed, 3 Feb 2021 00:38:59 -0500
Subject: [PATCH] x86-64: Refactor and improve performance of strchr-avx2.S
Content-type: text/plain; charset=UTF-8

No bug. Just seemed the performance could be improved a bit. Observed
and expected behavior are unchanged. Optimized body of main
loop. Updated page cross logic and optimized accordingly. Made a few
minor instruction selection modifications. No regressions in test
suite. Both test-strchrnul and test-strchr passed.
---
 sysdeps/x86_64/multiarch/strchr-avx2.S | 225 ++++++++++++-------------
 sysdeps/x86_64/multiarch/strchr.c      |   4 +-
 2 files changed, 114 insertions(+), 115 deletions(-)

Conflicts:
	sysdeps/x86_64/multiarch/strchr.c
	  (account for missing upstream macros)

diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
index da7d2620..919d256c 100644
--- a/sysdeps/x86_64/multiarch/strchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
@@ -27,10 +27,12 @@
# ifdef USE_AS_WCSCHR
#  define VPBROADCAST	vpbroadcastd
#  define VPCMPEQ	vpcmpeqd
+#  define VPMINU	vpminud
#  define CHAR_REG	esi
# else
#  define VPBROADCAST	vpbroadcastb
#  define VPCMPEQ	vpcmpeqb
+#  define VPMINU	vpminub
#  define CHAR_REG	sil
# endif

@@ -43,71 +45,54 @@
# endif

# define VEC_SIZE 32
+# define PAGE_SIZE 4096

	.section SECTION(.text),"ax",@progbits
ENTRY (STRCHR)
	movl	%edi, %ecx
-	/* Broadcast CHAR to YMM0.  */
+# ifndef USE_AS_STRCHRNUL
+	xorl	%edx, %edx
+# endif
+
+	/* Broadcast CHAR to YMM0.  */
	vmovd	%esi, %xmm0
	vpxor	%xmm9, %xmm9, %xmm9
	VPBROADCAST	%xmm0, %ymm0
-	/* Check if we may cross page boundary with one vector load.  */
-	andl	$(2 * VEC_SIZE - 1), %ecx
-	cmpl	$VEC_SIZE, %ecx
-	ja	L(cros_page_boundary)

-	/* Check the first VEC_SIZE bytes.  Search for both CHAR and the
-	   null byte.  */
-	vmovdqu	(%rdi), %ymm8
-	VPCMPEQ	%ymm8, %ymm0, %ymm1
-	VPCMPEQ	%ymm8, %ymm9, %ymm2
-	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
+	/* Check if we cross page boundary with one vector load.  */
+	andl	$(PAGE_SIZE - 1), %ecx
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
+	ja	L(cross_page_boundary)

-	/* Align data for aligned loads in the loop.  */
-	addq	$VEC_SIZE, %rdi
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
-
-	jmp	L(more_4x_vec)
-
-	.p2align 4
-L(cros_page_boundary):
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
+	/* Check the first VEC_SIZE bytes.  Search for both CHAR and the
+	   null byte.  */
	vmovdqu	(%rdi), %ymm8
	VPCMPEQ	%ymm8, %ymm0, %ymm1
	VPCMPEQ	%ymm8, %ymm9, %ymm2
	vpor	%ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %eax
-	/* Remove the leading bytes.  */
-	sarl	%cl, %eax
	testl	%eax, %eax
-	jz	L(aligned_more)
-	/* Found CHAR or the null byte.  */
+	jz	L(more_vecs)
	tzcntl	%eax, %eax
-	addq	%rcx, %rax
-# ifdef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.  */
	addq	%rdi, %rax
-# else
-	xorl	%edx, %edx
-	leaq	(%rdi, %rax), %rax
-	cmp	(%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+	cmp	(%rax), %CHAR_REG
	cmovne	%rdx, %rax
# endif
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

	.p2align 4
+L(more_vecs):
+	/* Align data for aligned loads in the loop.  */
+	andq	$-VEC_SIZE, %rdi
L(aligned_more):
-	addq	$VEC_SIZE, %rdi

-L(more_4x_vec):
-	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
-	   since data is only aligned to VEC_SIZE.  */
-	vmovdqa	(%rdi), %ymm8
+	/* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+	   since data is only aligned to VEC_SIZE.  */
+	vmovdqa	VEC_SIZE(%rdi), %ymm8
+	addq	$VEC_SIZE, %rdi
	VPCMPEQ	%ymm8, %ymm0, %ymm1
	VPCMPEQ	%ymm8, %ymm9, %ymm2
	vpor	%ymm1, %ymm2, %ymm1
@@ -137,61 +122,24 @@ L(more_4x_vec):
	vpor	%ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
-	jnz	L(first_vec_x3)
-
-	addq	$(VEC_SIZE * 4), %rdi
-
-	/* Align data to 4 * VEC_SIZE.  */
-	movq	%rdi, %rcx
-	andl	$(4 * VEC_SIZE - 1), %ecx
-	andq	$-(4 * VEC_SIZE), %rdi
-
-	.p2align 4
-L(loop_4x_vec):
-	/* Compare 4 * VEC at a time forward.  */
-	vmovdqa	(%rdi), %ymm5
-	vmovdqa	VEC_SIZE(%rdi), %ymm6
-	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm7
-	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
-
-	VPCMPEQ	%ymm5, %ymm0, %ymm1
-	VPCMPEQ	%ymm6, %ymm0, %ymm2
-	VPCMPEQ	%ymm7, %ymm0, %ymm3
-	VPCMPEQ	%ymm8, %ymm0, %ymm4
-
-	VPCMPEQ	%ymm5, %ymm9, %ymm5
-	VPCMPEQ	%ymm6, %ymm9, %ymm6
-	VPCMPEQ	%ymm7, %ymm9, %ymm7
-	VPCMPEQ	%ymm8, %ymm9, %ymm8
-
-	vpor	%ymm1, %ymm5, %ymm1
-	vpor	%ymm2, %ymm6, %ymm2
-	vpor	%ymm3, %ymm7, %ymm3
-	vpor	%ymm4, %ymm8, %ymm4
-
-	vpor	%ymm1, %ymm2, %ymm5
-	vpor	%ymm3, %ymm4, %ymm6
-
-	vpor	%ymm5, %ymm6, %ymm5
-
-	vpmovmskb %ymm5, %eax
-	testl	%eax, %eax
-	jnz	L(4x_vec_end)
-
-	addq	$(VEC_SIZE * 4), %rdi
+	jz	L(prep_loop_4x)

-	jmp	L(loop_4x_vec)
+	tzcntl	%eax, %eax
+	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
+# ifndef USE_AS_STRCHRNUL
+	cmp	(%rax), %CHAR_REG
+	cmovne	%rdx, %rax
+# endif
+	VZEROUPPER
+	ret

	.p2align 4
L(first_vec_x0):
-	/* Found CHAR or the null byte.  */
	tzcntl	%eax, %eax
-# ifdef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.  */
	addq	%rdi, %rax
-# else
-	xorl	%edx, %edx
-	leaq	(%rdi, %rax), %rax
-	cmp	(%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+	cmp	(%rax), %CHAR_REG
	cmovne	%rdx, %rax
# endif
	VZEROUPPER_RETURN
@@ -199,13 +147,9 @@ L(first_vec_x0):
	.p2align 4
L(first_vec_x1):
	tzcntl	%eax, %eax
-# ifdef USE_AS_STRCHRNUL
-	addq	$VEC_SIZE, %rax
-	addq	%rdi, %rax
-# else
-	xorl	%edx, %edx
	leaq	VEC_SIZE(%rdi, %rax), %rax
-	cmp	(%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+	cmp	(%rax), %CHAR_REG
	cmovne	%rdx, %rax
# endif
	VZEROUPPER_RETURN
@@ -213,42 +157,97 @@ L(first_vec_x1):
	.p2align 4
L(first_vec_x2):
	tzcntl	%eax, %eax
-# ifdef USE_AS_STRCHRNUL
-	addq	$(VEC_SIZE * 2), %rax
-	addq	%rdi, %rax
-# else
-	xorl	%edx, %edx
+	/* Found CHAR or the null byte.  */
	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
-	cmp	(%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+	cmp	(%rax), %CHAR_REG
	cmovne	%rdx, %rax
# endif
	VZEROUPPER_RETURN

+L(prep_loop_4x):
+	/* Align data to 4 * VEC_SIZE.  */
+	andq	$-(VEC_SIZE * 4), %rdi
+
	.p2align 4
-L(4x_vec_end):
+L(loop_4x_vec):
+	/* Compare 4 * VEC at a time forward.  */
+	vmovdqa	(VEC_SIZE * 4)(%rdi), %ymm5
+	vmovdqa	(VEC_SIZE * 5)(%rdi), %ymm6
+	vmovdqa	(VEC_SIZE * 6)(%rdi), %ymm7
+	vmovdqa	(VEC_SIZE * 7)(%rdi), %ymm8
+
+	/* Leaves only CHARS matching esi as 0.  */
+	vpxor	%ymm5, %ymm0, %ymm1
+	vpxor	%ymm6, %ymm0, %ymm2
+	vpxor	%ymm7, %ymm0, %ymm3
+	vpxor	%ymm8, %ymm0, %ymm4
+
+	VPMINU	%ymm1, %ymm5, %ymm1
+	VPMINU	%ymm2, %ymm6, %ymm2
+	VPMINU	%ymm3, %ymm7, %ymm3
+	VPMINU	%ymm4, %ymm8, %ymm4
+
+	VPMINU	%ymm1, %ymm2, %ymm5
+	VPMINU	%ymm3, %ymm4, %ymm6
+
+	VPMINU	%ymm5, %ymm6, %ymm5
+
+	VPCMPEQ	%ymm5, %ymm9, %ymm5
+	vpmovmskb %ymm5, %eax
+
+	addq	$(VEC_SIZE * 4), %rdi
+	testl	%eax, %eax
+	jz	L(loop_4x_vec)
+
+	VPCMPEQ	%ymm1, %ymm9, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x0)
+
+	VPCMPEQ	%ymm2, %ymm9, %ymm2
	vpmovmskb %ymm2, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1)
-	vpmovmskb %ymm3, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x2)
+
+	VPCMPEQ	%ymm3, %ymm9, %ymm3
+	VPCMPEQ	%ymm4, %ymm9, %ymm4
+	vpmovmskb %ymm3, %ecx
	vpmovmskb %ymm4, %eax
+	salq	$32, %rax
+	orq	%rcx, %rax
+	tzcntq	%rax, %rax
+	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
+# ifndef USE_AS_STRCHRNUL
+	cmp	(%rax), %CHAR_REG
+	cmovne	%rdx, %rax
+# endif
+	VZEROUPPER
+	ret
+
+	/* Cold case for crossing page with first load.  */
+	.p2align 4
+L(cross_page_boundary):
+	andq	$-VEC_SIZE, %rdi
+	andl	$(VEC_SIZE - 1), %ecx
+
+	vmovdqa	(%rdi), %ymm8
+	VPCMPEQ	%ymm8, %ymm0, %ymm1
+	VPCMPEQ	%ymm8, %ymm9, %ymm2
+	vpor	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %eax
+	/* Remove the leading bits.  */
+	sarxl	%ecx, %eax, %eax
	testl	%eax, %eax
-L(first_vec_x3):
+	jz	L(aligned_more)
	tzcntl	%eax, %eax
-# ifdef USE_AS_STRCHRNUL
-	addq	$(VEC_SIZE * 3), %rax
+	addq	%rcx, %rdi
	addq	%rdi, %rax
-# else
-	xorl	%edx, %edx
-	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
-	cmp	(%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+	cmp	(%rax), %CHAR_REG
	cmovne	%rdx, %rax
# endif
	VZEROUPPER_RETURN

END (STRCHR)
-#endif
+# endif
diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
index 7e582f02..5225bd4f 100644
--- a/sysdeps/x86_64/multiarch/strchr.c
+++ b/sysdeps/x86_64/multiarch/strchr.c
@@ -38,11 +38,11 @@ IFUNC_SELECTOR (void)
  const struct cpu_features* cpu_features = __get_cpu_features ();

  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
    {
      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
-	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
-	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
	return OPTIMIZE (evex);

      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
--
GitLab

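Editor's note: the interesting part of the rewritten loop is the vpxor/VPMINU pairing: x ^ c is zero exactly where a byte equals CHAR, and min(x ^ c, x) is zero where either CHAR or the terminating null occurs, so a single compare against zero covers both searches. The same identity in scalar C as a sanity check; byte-at-a-time and purely illustrative.

#include <stdint.h>

/* Returns nonzero iff X is the searched byte C or the null byte, using
   the xor+min identity from the vector loop above.  */
static int
matches (uint8_t x, uint8_t c)
{
  uint8_t diff = x ^ c;               /* 0 iff x == c */
  uint8_t m = diff < x ? diff : x;    /* unsigned min, like vpminub */
  return m == 0;                      /* 0 iff x == c or x == 0 */
}

Compared with the old loop's two VPCMPEQ + vpor per vector, this halves the compare work in the hot path and defers distinguishing CHAR from null until a match is known.
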
@@ -1,67 +0,0 @@
From 3ec5d83d2a237d39e7fd6ef7a0bc8ac4c171a4a5 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Sat, 25 Jan 2020 14:19:40 -0800
Subject: [PATCH] x86-64: Avoid rep movsb with short distance [BZ #27130]
Content-type: text/plain; charset=UTF-8

When copying with "rep movsb", if the distance between source and
destination is N*4GB + [1..63] with N >= 0, performance may be very
slow.  This patch updates memmove-vec-unaligned-erms.S for AVX and
AVX512 versions with the distance in RCX:

	cmpl	$63, %ecx
	// Don't use "rep movsb" if ECX <= 63
	jbe	L(Don't use rep movsb")
	Use "rep movsb"

Benchtests data with bench-memcpy, bench-memcpy-large, bench-memcpy-random
and bench-memcpy-walk on Skylake, Ice Lake and Tiger Lake show that its
performance impact is within noise range as "rep movsb" is only used for
data size >= 4KB.
---
 .../multiarch/memmove-vec-unaligned-erms.S | 21 +++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 673b73aa..c475fed4 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -64,6 +64,13 @@
# endif
#endif

+/* Avoid short distance rep movsb only with non-SSE vector.  */
+#ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
+# define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
+#else
+# define AVOID_SHORT_DISTANCE_REP_MOVSB 0
+#endif
+
#ifndef PREFETCH
# define PREFETCH(addr) prefetcht0 addr
#endif
@@ -255,7 +262,21 @@ L(movsb):
	cmpq	%r9, %rdi
	/* Avoid slow backward REP MOVSB.  */
	jb	L(more_8x_vec_backward)
+# if AVOID_SHORT_DISTANCE_REP_MOVSB
+	movq	%rdi, %rcx
+	subq	%rsi, %rcx
+	jmp	2f
+# endif
1:
+# if AVOID_SHORT_DISTANCE_REP_MOVSB
+	movq	%rsi, %rcx
+	subq	%rdi, %rcx
+2:
+/* Avoid "rep movsb" if RCX, the distance between source and destination,
+   is N*4GB + [1..63] with N >= 0.  */
+	cmpl	$63, %ecx
+	jbe	L(more_2x_vec)	/* Avoid "rep movsb" if ECX <= 63.  */
+# endif
	mov	%RDX_LP, %RCX_LP
	rep movsb
L(nop):
--
GitLab

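Editor's note: the guard can be restated in C: take the low 32 bits of (dst - src); when that value is 63 or less, the copy distance is N*4GB + [0..63] for some N >= 0 and "rep movsb" is skipped in favor of the vector path. A sketch under that reading of the cmpl/jbe pair; the function name is illustrative.

#include <stdbool.h>
#include <stdint.h>

/* True when "rep movsb" is considered safe for this copy.  Note that
   jbe also catches a low-32-bit distance of 0 (an exact 4 GiB
   multiple), so values 0..63 all take the vector fallback.  */
static bool
rep_movsb_distance_ok (const void *dst, const void *src)
{
  uint32_t dist = (uint32_t) ((uintptr_t) dst - (uintptr_t) src);
  return dist > 63;
}
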
@@ -1,449 +0,0 @@
From 1a8605b6cd257e8a74e29b5b71c057211f5fb847 Mon Sep 17 00:00:00 2001
From: noah <goldstein.w.n@gmail.com>
Date: Sat, 3 Apr 2021 04:12:15 -0400
Subject: [PATCH] x86: Update large memcpy case in memmove-vec-unaligned-erms.S
Content-type: text/plain; charset=UTF-8

No Bug. This commit updates the large memcpy case (no overlap). The
update is to perform memcpy on either 2 or 4 contiguous pages at
once. This 1) helps to alleviate the affects of false memory aliasing
when destination and source have a close 4k alignment and 2) In most
cases and for most DRAM units is a modestly more efficient access
pattern. These changes are a clear performance improvement for
VEC_SIZE =16/32, though more ambiguous for VEC_SIZE=64. test-memcpy,
test-memccpy, test-mempcpy, test-memmove, and tst-memmove-overflow all
pass.

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
 .../multiarch/memmove-vec-unaligned-erms.S | 338 ++++++++++++++----
 1 file changed, 265 insertions(+), 73 deletions(-)

Conflicts:
	sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
	  (different number of sections)

diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index c475fed4..3e2dd6bc 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -32,7 +32,16 @@
   overlapping addresses.
   6. If size >= __x86_shared_non_temporal_threshold and there is no
      overlap between destination and source, use non-temporal store
-      instead of aligned store.  */
+      instead of aligned store copying from either 2 or 4 pages at
+      once.
+   8. For point 7) if size < 16 * __x86_shared_non_temporal_threshold
+      and source and destination do not page alias, copy from 2 pages
+      at once using non-temporal stores. Page aliasing in this case is
+      considered true if destination's page alignment - sources' page
+      alignment is less than 8 * VEC_SIZE.
+   9. If size >= 16 * __x86_shared_non_temporal_threshold or source
+      and destination do page alias copy from 4 pages at once using
+      non-temporal stores.  */

#include <sysdep.h>

@@ -64,6 +73,34 @@
# endif
#endif

+#ifndef PAGE_SIZE
+# define PAGE_SIZE 4096
+#endif
+
+#if PAGE_SIZE != 4096
+# error Unsupported PAGE_SIZE
+#endif
+
+#ifndef LOG_PAGE_SIZE
+# define LOG_PAGE_SIZE 12
+#endif
+
+#if PAGE_SIZE != (1 << LOG_PAGE_SIZE)
+# error Invalid LOG_PAGE_SIZE
+#endif
+
+/* Byte per page for large_memcpy inner loop.  */
+#if VEC_SIZE == 64
+# define LARGE_LOAD_SIZE (VEC_SIZE * 2)
+#else
+# define LARGE_LOAD_SIZE (VEC_SIZE * 4)
+#endif
+
+/* Amount to shift rdx by to compare for memcpy_large_4x.  */
+#ifndef LOG_4X_MEMCPY_THRESH
+# define LOG_4X_MEMCPY_THRESH 4
+#endif
+
/* Avoid short distance rep movsb only with non-SSE vector.  */
#ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
# define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
@@ -103,6 +140,28 @@
# error Unsupported PREFETCH_SIZE!
#endif

+#if LARGE_LOAD_SIZE == (VEC_SIZE * 2)
+# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \
+	VMOVU	(offset)base, vec0; \
+	VMOVU	((offset) + VEC_SIZE)base, vec1;
+# define STORE_ONE_SET(base, offset, vec0, vec1, ...) \
+	VMOVNT	vec0, (offset)base; \
+	VMOVNT	vec1, ((offset) + VEC_SIZE)base;
+#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4)
+# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
+	VMOVU	(offset)base, vec0; \
+	VMOVU	((offset) + VEC_SIZE)base, vec1; \
+	VMOVU	((offset) + VEC_SIZE * 2)base, vec2; \
+	VMOVU	((offset) + VEC_SIZE * 3)base, vec3;
+# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
+	VMOVNT	vec0, (offset)base; \
+	VMOVNT	vec1, ((offset) + VEC_SIZE)base; \
+	VMOVNT	vec2, ((offset) + VEC_SIZE * 2)base; \
+	VMOVNT	vec3, ((offset) + VEC_SIZE * 3)base;
+#else
+# error Invalid LARGE_LOAD_SIZE
+#endif
+
#ifndef SECTION
# error SECTION is not defined!
#endif
@@ -390,6 +449,15 @@ L(last_4x_vec):
	VZEROUPPER_RETURN

L(more_8x_vec):
+	/* Check if non-temporal move candidate.  */
+#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
+	/* Check non-temporal store threshold.  */
+	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
+	ja	L(large_memcpy_2x)
+#endif
+	/* Entry if rdx is greater than non-temporal threshold but there
+	   is overlap.  */
+L(more_8x_vec_check):
	cmpq	%rsi, %rdi
	ja	L(more_8x_vec_backward)
	/* Source == destination is less common.  */
@@ -416,24 +484,21 @@ L(more_8x_vec):
	subq	%r8, %rdi
	/* Adjust length.  */
	addq	%r8, %rdx
-#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
-	/* Check non-temporal store threshold.  */
-	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
-	ja	L(large_forward)
-#endif
+
+	.p2align 4
L(loop_4x_vec_forward):
	/* Copy 4 * VEC a time forward.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
-	addq	$(VEC_SIZE * 4), %rsi
-	subq	$(VEC_SIZE * 4), %rdx
+	subq	$-(VEC_SIZE * 4), %rsi
+	addq	$-(VEC_SIZE * 4), %rdx
	VMOVA	%VEC(0), (%rdi)
	VMOVA	%VEC(1), VEC_SIZE(%rdi)
	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
-	addq	$(VEC_SIZE * 4), %rdi
+	subq	$-(VEC_SIZE * 4), %rdi
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_4x_vec_forward)
	/* Store the last 4 * VEC.  */
@@ -467,24 +532,21 @@ L(more_8x_vec_backward):
	subq	%r8, %r9
	/* Adjust length.  */
	subq	%r8, %rdx
-#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
-	/* Check non-temporal store threshold.  */
-	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
-	ja	L(large_backward)
-#endif
+
+	.p2align 4
L(loop_4x_vec_backward):
	/* Copy 4 * VEC a time backward.  */
	VMOVU	(%rcx), %VEC(0)
	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
-	subq	$(VEC_SIZE * 4), %rcx
-	subq	$(VEC_SIZE * 4), %rdx
+	addq	$-(VEC_SIZE * 4), %rcx
+	addq	$-(VEC_SIZE * 4), %rdx
	VMOVA	%VEC(0), (%r9)
	VMOVA	%VEC(1), -VEC_SIZE(%r9)
	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
-	subq	$(VEC_SIZE * 4), %r9
+	addq	$-(VEC_SIZE * 4), %r9
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_4x_vec_backward)
	/* Store the first 4 * VEC.  */
@@ -497,72 +559,202 @@ L(loop_4x_vec_backward):
	VZEROUPPER_RETURN

#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
-L(large_forward):
+	.p2align 4
+L(large_memcpy_2x):
+	/* Compute absolute value of difference between source and
+	   destination.  */
+	movq	%rdi, %r9
+	subq	%rsi, %r9
+	movq	%r9, %r8
+	leaq	-1(%r9), %rcx
+	sarq	$63, %r8
+	xorq	%r8, %r9
+	subq	%r8, %r9
	/* Don't use non-temporal store if there is overlap between
-	   destination and source since destination may be in cache
-	   when source is loaded.  */
-	leaq	(%rdi, %rdx), %r10
-	cmpq	%r10, %rsi
-	jb	L(loop_4x_vec_forward)
-L(loop_large_forward):
+	   destination and source since destination may be in cache when
+	   source is loaded.  */
+	cmpq	%r9, %rdx
+	ja	L(more_8x_vec_check)
+
+	/* Cache align destination. First store the first 64 bytes then
+	   adjust alignments.  */
+	VMOVU	(%rsi), %VEC(8)
+#if VEC_SIZE < 64
+	VMOVU	VEC_SIZE(%rsi), %VEC(9)
+#if VEC_SIZE < 32
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(10)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(11)
+#endif
+#endif
+	VMOVU	%VEC(8), (%rdi)
+#if VEC_SIZE < 64
+	VMOVU	%VEC(9), VEC_SIZE(%rdi)
+#if VEC_SIZE < 32
+	VMOVU	%VEC(10), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VEC(11), (VEC_SIZE * 3)(%rdi)
+#endif
+#endif
+	/* Adjust source, destination, and size.  */
+	movq	%rdi, %r8
+	andq	$63, %r8
+	/* Get the negative of offset for alignment.  */
+	subq	$64, %r8
+	/* Adjust source.  */
+	subq	%r8, %rsi
+	/* Adjust destination which should be aligned now.  */
+	subq	%r8, %rdi
+	/* Adjust length.  */
+	addq	%r8, %rdx
+
+	/* Test if source and destination addresses will alias. If they do
+	   the larger pipeline in large_memcpy_4x alleviated the
+	   performance drop.  */
+	testl	$(PAGE_SIZE - VEC_SIZE * 8), %ecx
+	jz	L(large_memcpy_4x)
+
+	movq	%rdx, %r10
+	shrq	$LOG_4X_MEMCPY_THRESH, %r10
+	cmp	__x86_shared_non_temporal_threshold(%rip), %r10
+	jae	L(large_memcpy_4x)
+
+	/* edx will store remainder size for copying tail.  */
+	andl	$(PAGE_SIZE * 2 - 1), %edx
+	/* r10 stores outer loop counter.  */
+	shrq	$((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10
|
|
||||||
+ /* Copy 4x VEC at a time from 2 pages. */
|
|
||||||
+ .p2align 4
|
|
||||||
+L(loop_large_memcpy_2x_outer):
|
|
||||||
+ /* ecx stores inner loop counter. */
|
|
||||||
+ movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
|
|
||||||
+L(loop_large_memcpy_2x_inner):
|
|
||||||
+ PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
|
|
||||||
+ PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
|
|
||||||
+ PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
|
|
||||||
+ PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2)
|
|
||||||
+ /* Load vectors from rsi. */
|
|
||||||
+ LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
|
|
||||||
+ LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
|
|
||||||
+ subq $-LARGE_LOAD_SIZE, %rsi
|
|
||||||
+ /* Non-temporal store vectors to rdi. */
|
|
||||||
+ STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
|
|
||||||
+ STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
|
|
||||||
+ subq $-LARGE_LOAD_SIZE, %rdi
|
|
||||||
+ decl %ecx
|
|
||||||
+ jnz L(loop_large_memcpy_2x_inner)
|
|
||||||
+ addq $PAGE_SIZE, %rdi
|
|
||||||
+ addq $PAGE_SIZE, %rsi
|
|
||||||
+ decq %r10
|
|
||||||
+ jne L(loop_large_memcpy_2x_outer)
|
|
||||||
+ sfence
|
|
||||||
+
|
|
||||||
+ /* Check if only last 4 loads are needed. */
|
|
||||||
+ cmpl $(VEC_SIZE * 4), %edx
|
|
||||||
+ jbe L(large_memcpy_2x_end)
|
|
||||||
+
|
|
||||||
+ /* Handle the last 2 * PAGE_SIZE bytes. */
|
|
||||||
+L(loop_large_memcpy_2x_tail):
|
|
||||||
/* Copy 4 * VEC a time forward with non-temporal stores. */
|
|
||||||
- PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
|
|
||||||
- PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
|
|
||||||
+ PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
|
|
||||||
+ PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
|
|
||||||
VMOVU (%rsi), %VEC(0)
|
|
||||||
VMOVU VEC_SIZE(%rsi), %VEC(1)
|
|
||||||
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
|
|
||||||
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
|
|
||||||
- addq $PREFETCHED_LOAD_SIZE, %rsi
|
|
||||||
- subq $PREFETCHED_LOAD_SIZE, %rdx
|
|
||||||
- VMOVNT %VEC(0), (%rdi)
|
|
||||||
- VMOVNT %VEC(1), VEC_SIZE(%rdi)
|
|
||||||
- VMOVNT %VEC(2), (VEC_SIZE * 2)(%rdi)
|
|
||||||
- VMOVNT %VEC(3), (VEC_SIZE * 3)(%rdi)
|
|
||||||
- addq $PREFETCHED_LOAD_SIZE, %rdi
|
|
||||||
- cmpq $PREFETCHED_LOAD_SIZE, %rdx
|
|
||||||
- ja L(loop_large_forward)
|
|
||||||
- sfence
|
|
||||||
+ subq $-(VEC_SIZE * 4), %rsi
|
|
||||||
+ addl $-(VEC_SIZE * 4), %edx
|
|
||||||
+ VMOVA %VEC(0), (%rdi)
|
|
||||||
+ VMOVA %VEC(1), VEC_SIZE(%rdi)
|
|
||||||
+ VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)
|
|
||||||
+ VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi)
|
|
||||||
+ subq $-(VEC_SIZE * 4), %rdi
|
|
||||||
+ cmpl $(VEC_SIZE * 4), %edx
|
|
||||||
+ ja L(loop_large_memcpy_2x_tail)
|
|
||||||
+
|
|
||||||
+L(large_memcpy_2x_end):
|
|
||||||
/* Store the last 4 * VEC. */
|
|
||||||
- VMOVU %VEC(5), (%rcx)
|
|
||||||
- VMOVU %VEC(6), -VEC_SIZE(%rcx)
|
|
||||||
- VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx)
|
|
||||||
- VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)
|
|
||||||
- /* Store the first VEC. */
|
|
||||||
- VMOVU %VEC(4), (%r11)
|
|
||||||
+ VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
|
|
||||||
+ VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
|
|
||||||
+ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
|
|
||||||
+ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3)
|
|
||||||
+
|
|
||||||
+ VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
|
|
||||||
+ VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
|
|
||||||
+ VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
|
|
||||||
+ VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx)
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
|
|
||||||
-L(large_backward):
|
|
||||||
- /* Don't use non-temporal store if there is overlap between
|
|
||||||
- destination and source since destination may be in cache
|
|
||||||
- when source is loaded. */
|
|
||||||
- leaq (%rcx, %rdx), %r10
|
|
||||||
- cmpq %r10, %r9
|
|
||||||
- jb L(loop_4x_vec_backward)
|
|
||||||
-L(loop_large_backward):
|
|
||||||
- /* Copy 4 * VEC a time backward with non-temporal stores. */
|
|
||||||
- PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
|
|
||||||
- PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
|
|
||||||
- VMOVU (%rcx), %VEC(0)
|
|
||||||
- VMOVU -VEC_SIZE(%rcx), %VEC(1)
|
|
||||||
- VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
|
|
||||||
- VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
|
|
||||||
- subq $PREFETCHED_LOAD_SIZE, %rcx
|
|
||||||
- subq $PREFETCHED_LOAD_SIZE, %rdx
|
|
||||||
- VMOVNT %VEC(0), (%r9)
|
|
||||||
- VMOVNT %VEC(1), -VEC_SIZE(%r9)
|
|
||||||
- VMOVNT %VEC(2), -(VEC_SIZE * 2)(%r9)
|
|
||||||
- VMOVNT %VEC(3), -(VEC_SIZE * 3)(%r9)
|
|
||||||
- subq $PREFETCHED_LOAD_SIZE, %r9
|
|
||||||
- cmpq $PREFETCHED_LOAD_SIZE, %rdx
|
|
||||||
- ja L(loop_large_backward)
|
|
||||||
+ .p2align 4
|
|
||||||
+L(large_memcpy_4x):
|
|
||||||
+ movq %rdx, %r10
|
|
||||||
+ /* edx will store remainder size for copying tail. */
|
|
||||||
+ andl $(PAGE_SIZE * 4 - 1), %edx
|
|
||||||
+ /* r10 stores outer loop counter. */
|
|
||||||
+ shrq $(LOG_PAGE_SIZE + 2), %r10
|
|
||||||
+ /* Copy 4x VEC at a time from 4 pages. */
|
|
||||||
+ .p2align 4
|
|
||||||
+L(loop_large_memcpy_4x_outer):
|
|
||||||
+ /* ecx stores inner loop counter. */
|
|
||||||
+ movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
|
|
||||||
+L(loop_large_memcpy_4x_inner):
|
|
||||||
+ /* Only one prefetch set per page as doing 4 pages give more time
|
|
||||||
+ for prefetcher to keep up. */
|
|
||||||
+ PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
|
|
||||||
+ PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
|
|
||||||
+ PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
|
|
||||||
+ PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE)
|
|
||||||
+ /* Load vectors from rsi. */
|
|
||||||
+ LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
|
|
||||||
+ LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
|
|
||||||
+ LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
|
|
||||||
+ LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
|
|
||||||
+ subq $-LARGE_LOAD_SIZE, %rsi
|
|
||||||
+ /* Non-temporal store vectors to rdi. */
|
|
||||||
+ STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
|
|
||||||
+ STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
|
|
||||||
+ STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
|
|
||||||
+ STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
|
|
||||||
+ subq $-LARGE_LOAD_SIZE, %rdi
|
|
||||||
+ decl %ecx
|
|
||||||
+ jnz L(loop_large_memcpy_4x_inner)
|
|
||||||
+ addq $(PAGE_SIZE * 3), %rdi
|
|
||||||
+ addq $(PAGE_SIZE * 3), %rsi
|
|
||||||
+ decq %r10
|
|
||||||
+ jne L(loop_large_memcpy_4x_outer)
|
|
||||||
sfence
|
|
||||||
- /* Store the first 4 * VEC. */
|
|
||||||
- VMOVU %VEC(4), (%rdi)
|
|
||||||
- VMOVU %VEC(5), VEC_SIZE(%rdi)
|
|
||||||
- VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi)
|
|
||||||
- VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
|
|
||||||
- /* Store the last VEC. */
|
|
||||||
- VMOVU %VEC(8), (%r11)
|
|
||||||
+ /* Check if only last 4 loads are needed. */
|
|
||||||
+ cmpl $(VEC_SIZE * 4), %edx
|
|
||||||
+ jbe L(large_memcpy_4x_end)
|
|
||||||
+
|
|
||||||
+ /* Handle the last 4 * PAGE_SIZE bytes. */
|
|
||||||
+L(loop_large_memcpy_4x_tail):
|
|
||||||
+ /* Copy 4 * VEC a time forward with non-temporal stores. */
|
|
||||||
+ PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
|
|
||||||
+ PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
|
|
||||||
+ VMOVU (%rsi), %VEC(0)
|
|
||||||
+ VMOVU VEC_SIZE(%rsi), %VEC(1)
|
|
||||||
+ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
|
|
||||||
+ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
|
|
||||||
+ subq $-(VEC_SIZE * 4), %rsi
|
|
||||||
+ addl $-(VEC_SIZE * 4), %edx
|
|
||||||
+ VMOVA %VEC(0), (%rdi)
|
|
||||||
+ VMOVA %VEC(1), VEC_SIZE(%rdi)
|
|
||||||
+ VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)
|
|
||||||
+ VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi)
|
|
||||||
+ subq $-(VEC_SIZE * 4), %rdi
|
|
||||||
+ cmpl $(VEC_SIZE * 4), %edx
|
|
||||||
+ ja L(loop_large_memcpy_4x_tail)
|
|
||||||
+
|
|
||||||
+L(large_memcpy_4x_end):
|
|
||||||
+ /* Store the last 4 * VEC. */
|
|
||||||
+ VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
|
|
||||||
+ VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
|
|
||||||
+ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
|
|
||||||
+ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3)
|
|
||||||
+
|
|
||||||
+ VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
|
|
||||||
+ VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
|
|
||||||
+ VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
|
|
||||||
+ VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx)
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
#endif
|
|
||||||
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
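[Editor's note] The large_memcpy_2x entry above replaces the old leaq/cmpq overlap test with a branchless absolute difference (the movq/subq/sarq/xorq/subq sequence). A minimal C sketch of the same computation, with names that are the editor's and not glibc's:

#include <stddef.h>
#include <stdint.h>

/* Sketch of the sarq/xorq/subq idiom: |dst - src| without a branch.
   Non-temporal stores are only considered when len <= |dst - src|,
   i.e. when the two regions cannot overlap.  */
static int
may_use_nt_stores (const char *dst, const char *src, size_t len)
{
  int64_t diff = (int64_t) (uintptr_t) dst - (int64_t) (uintptr_t) src;
  int64_t sign = diff >> 63;                            /* sarq $63: 0 or -1 */
  uint64_t absdiff = (uint64_t) ((diff ^ sign) - sign); /* xorq; subq */
  return len <= absdiff;
}

The branchless form keeps the hot dispatch path free of an unpredictable compare-and-branch on pointer order.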
@ -1,151 +0,0 @@
From ecd8b842cf37ea112e59cd9085ff1f1b6e208ae0 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Mon, 21 Jan 2019 11:29:58 -0800
Subject: [PATCH] x86-64 memrchr: Properly handle the length parameter [BZ#
 24097]
Content-type: text/plain; charset=UTF-8

On x32, the size_t parameter may be passed in the lower 32 bits of a
64-bit register with the non-zero upper 32 bits.  The string/memory
functions written in assembly can only use the lower 32 bits of a
64-bit register as length or must clear the upper 32 bits before using
the full 64-bit register for length.

This patch fixes memrchr for x32.  Tested on x86-64 and x32.  On x86-64,
libc.so is the same with and without the fix.

	[BZ# 24097]
	CVE-2019-6488
	* sysdeps/x86_64/memrchr.S: Use RDX_LP for length.
	* sysdeps/x86_64/multiarch/memrchr-avx2.S: Likewise.
	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memrchr.
	* sysdeps/x86_64/x32/tst-size_t-memrchr.c: New file.
---
 sysdeps/x86_64/memrchr.S                |  4 +-
 sysdeps/x86_64/multiarch/memrchr-avx2.S |  4 +-
 sysdeps/x86_64/x32/Makefile             |  3 +-
 sysdeps/x86_64/x32/tst-size_t-memrchr.c | 57 +++++++++++++++++++++++++
 4 files changed, 63 insertions(+), 5 deletions(-)
 create mode 100644 sysdeps/x86_64/x32/tst-size_t-memrchr.c

Conflicts:
	ChangeLog
	  (removed)

diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S
index b8e3fa1d..dc82f8f7 100644
--- a/sysdeps/x86_64/memrchr.S
+++ b/sysdeps/x86_64/memrchr.S
@@ -24,13 +24,13 @@
 ENTRY (__memrchr)
	movd	%esi, %xmm1

-	sub	$16, %rdx
+	sub	$16, %RDX_LP
	jbe	L(length_less16)

	punpcklbw	%xmm1, %xmm1
	punpcklbw	%xmm1, %xmm1

-	add	%rdx, %rdi
+	add	%RDX_LP, %RDI_LP
	pshufd	$0, %xmm1, %xmm1

	movdqu	(%rdi), %xmm0
diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S
index b41a58bc..ce488dd9 100644
--- a/sysdeps/x86_64/multiarch/memrchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S
@@ -32,10 +32,10 @@ ENTRY (__memrchr_avx2)
	vmovd	%esi, %xmm0
	vpbroadcastb %xmm0, %ymm0

-	subq	$VEC_SIZE, %rdx
+	sub	$VEC_SIZE, %RDX_LP
	jbe	L(last_vec_or_less)

-	addq	%rdx, %rdi
+	add	%RDX_LP, %RDI_LP

	/* Check the last VEC_SIZE bytes.  */
	vpcmpeqb (%rdi), %ymm0, %ymm1
diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
index 2fe1e5ac..e99dbd7c 100644
--- a/sysdeps/x86_64/x32/Makefile
+++ b/sysdeps/x86_64/x32/Makefile
@@ -6,7 +6,8 @@ CFLAGS-s_llround.c += -fno-builtin-lround
 endif

 ifeq ($(subdir),string)
-tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy
+tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \
+	 tst-size_t-memrchr
 endif

 ifeq ($(subdir),wcsmbs)
diff --git a/sysdeps/x86_64/x32/tst-size_t-memrchr.c b/sysdeps/x86_64/x32/tst-size_t-memrchr.c
new file mode 100644
index 00000000..c83699c0
--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-memrchr.c
@@ -0,0 +1,57 @@
+/* Test memrchr with size_t in the lower 32 bits of 64-bit register.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define TEST_NAME "memrchr"
+#include "test-size_t.h"
+
+IMPL (memchr, 1)
+
+typedef void * (*proto_t) (const void *, int, size_t);
+
+static void *
+__attribute__ ((noinline, noclone))
+do_memrchr (parameter_t a, parameter_t b)
+{
+  return CALL (&b, a.p, (uintptr_t) b.p, a.len);
+}
+
+static int
+test_main (void)
+{
+  test_init ();
+
+  parameter_t src = { { page_size }, buf2 };
+  parameter_t c = { { 0 }, (void *) (uintptr_t) 0x12 };
+
+  int ret = 0;
+  FOR_EACH_IMPL (impl, 0)
+    {
+      c.fn = impl->fn;
+      void * res = do_memrchr (src, c);
+      if (res)
+	{
+	  error (0, 0, "Wrong result in function %s: %p != NULL",
+		 impl->name, res);
+	  ret = 1;
+	}
+    }
+
+  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#include <support/test-driver.c>
--
GitLab

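[Editor's note] The new x32 test exercises the bug class through glibc's test-size_t.h harness, which deliberately calls the function through a mismatched prototype so garbage can be planted in the upper half of the length register. A condensed, hedged sketch of that trick (names hypothetical, not the harness's API):

#include <stdint.h>
#include <string.h>

/* Deliberately mismatched prototype: the length slot is a full 64-bit
   value, so the caller can plant garbage in the register that carries
   size_t, much as test-size_t.h does.  */
typedef void *(*memrchr64_t) (const void *, int, uint64_t);

static void *
call_with_dirty_upper_bits (const void *s, int c, uint32_t n)
{
  memrchr64_t fn = (memrchr64_t) memrchr;
  /* On x32 a correct memrchr must behave as if the length were n;
     an implementation that reads all 64 bits of %rdx would instead
     see an enormous length and read out of bounds.  */
  return fn (s, c, 0xdeadbeef00000000ull | n);
}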
@ -1,92 +0,0 @@
From 83c5b368226c34a2f0a5287df40fc290b2b34359 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Mon, 19 Apr 2021 10:45:07 -0700
Subject: [PATCH] x86-64: Require BMI2 for strchr-avx2.S
Content-type: text/plain; charset=UTF-8

Since strchr-avx2.S updated by

commit 1f745ecc2109890886b161d4791e1406fdfc29b8
Author: noah <goldstein.w.n@gmail.com>
Date:   Wed Feb 3 00:38:59 2021 -0500

    x86-64: Refactor and improve performance of strchr-avx2.S

uses sarx:

c4 e2 72 f7 c0	sarx   %ecx,%eax,%eax

for strchr-avx2 family functions, require BMI2 in ifunc-impl-list.c and
ifunc-avx2.h.
---
 sysdeps/x86_64/multiarch/ifunc-avx2.h      |  4 ++--
 sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 +++++++++---
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/ifunc-avx2.h b/sysdeps/x86_64/multiarch/ifunc-avx2.h
index e0f30e61..ef72b73f 100644
--- a/sysdeps/x86_64/multiarch/ifunc-avx2.h
+++ b/sysdeps/x86_64/multiarch/ifunc-avx2.h
@@ -30,11 +30,11 @@ IFUNC_SELECTOR (void)
   const struct cpu_features* cpu_features = __get_cpu_features ();

   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
     {
       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
-	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
-	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
	return OPTIMIZE (evex);

       if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 695cdba6..85b8863a 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -400,10 +400,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/x86_64/multiarch/strchr.c.  */
   IFUNC_IMPL (i, name, strchr,
	      IFUNC_IMPL_ADD (array, i, strchr,
-			      CPU_FEATURE_USABLE (AVX2),
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)),
			      __strchr_avx2)
	      IFUNC_IMPL_ADD (array, i, strchr,
			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
			       && CPU_FEATURE_USABLE (RTM)),
			      __strchr_avx2_rtm)
	      IFUNC_IMPL_ADD (array, i, strchr,
@@ -417,10 +419,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/x86_64/multiarch/strchrnul.c.  */
   IFUNC_IMPL (i, name, strchrnul,
	      IFUNC_IMPL_ADD (array, i, strchrnul,
-			      CPU_FEATURE_USABLE (AVX2),
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)),
			      __strchrnul_avx2)
	      IFUNC_IMPL_ADD (array, i, strchrnul,
			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
			       && CPU_FEATURE_USABLE (RTM)),
			      __strchrnul_avx2_rtm)
	      IFUNC_IMPL_ADD (array, i, strchrnul,
@@ -574,10 +578,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/x86_64/multiarch/wcschr.c.  */
   IFUNC_IMPL (i, name, wcschr,
	      IFUNC_IMPL_ADD (array, i, wcschr,
-			      CPU_FEATURE_USABLE (AVX2),
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)),
			      __wcschr_avx2)
	      IFUNC_IMPL_ADD (array, i, wcschr,
			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
			       && CPU_FEATURE_USABLE (RTM)),
			      __wcschr_avx2_rtm)
	      IFUNC_IMPL_ADD (array, i, wcschr,
--
GitLab

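[Editor's note] The fix above is purely a dispatch-time constraint: a variant that executes sarx must never be selected on a CPU without BMI2. A rough userland analogue of the tightened selector, using the GCC/Clang builtins rather than glibc's internal cpu_features API (the two strchr_* functions are hypothetical):

extern char *strchr_avx2 (const char *, int);  /* body uses sarx (BMI2) */
extern char *strchr_sse2 (const char *, int);  /* baseline fallback */

static char *(*pick_strchr (void)) (const char *, int)
{
  __builtin_cpu_init ();
  /* AVX2 alone is no longer enough: the AVX2 body executes sarx,
     so BMI2 must be checked as well.  */
  if (__builtin_cpu_supports ("avx2") && __builtin_cpu_supports ("bmi2"))
    return strchr_avx2;
  return strchr_sse2;
}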
@ -1,265 +0,0 @@
From f53790272ce7bdc5ecd14b45f65d0464d2a61a3a Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon, 19 Apr 2021 17:48:10 -0400
Subject: [PATCH] x86: Optimize less_vec evex and avx512
 memset-vec-unaligned-erms.S
Content-type: text/plain; charset=UTF-8

No bug. This commit adds optimized cases for the less_vec memset case
that use the avx512vl/avx512bw mask store, avoiding the excessive
branches. test-memset and test-wmemset are passing.

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
 sysdeps/x86_64/multiarch/ifunc-impl-list.c    | 40 ++++++++++-----
 sysdeps/x86_64/multiarch/ifunc-memset.h       |  6 ++-
 .../multiarch/memset-avx512-unaligned-erms.S  |  2 +-
 .../multiarch/memset-evex-unaligned-erms.S    |  2 +-
 .../multiarch/memset-vec-unaligned-erms.S     | 51 +++++++++++++++----
 5 files changed, 74 insertions(+), 27 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 85b8863a..d59d65f8 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -204,19 +204,23 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
			      __memset_chk_avx2_unaligned_erms_rtm)
	      IFUNC_IMPL_ADD (array, i, __memset_chk,
			      (CPU_FEATURE_USABLE (AVX512VL)
-			       && CPU_FEATURE_USABLE (AVX512BW)),
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
			      __memset_chk_evex_unaligned)
	      IFUNC_IMPL_ADD (array, i, __memset_chk,
			      (CPU_FEATURE_USABLE (AVX512VL)
-			       && CPU_FEATURE_USABLE (AVX512BW)),
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
			      __memset_chk_evex_unaligned_erms)
	      IFUNC_IMPL_ADD (array, i, __memset_chk,
			      (CPU_FEATURE_USABLE (AVX512VL)
-			       && CPU_FEATURE_USABLE (AVX512BW)),
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
			      __memset_chk_avx512_unaligned_erms)
	      IFUNC_IMPL_ADD (array, i, __memset_chk,
			      (CPU_FEATURE_USABLE (AVX512VL)
-			       && CPU_FEATURE_USABLE (AVX512BW)),
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
			      __memset_chk_avx512_unaligned)
	      IFUNC_IMPL_ADD (array, i, __memset_chk,
			      CPU_FEATURE_USABLE (AVX512F),
@@ -247,19 +251,23 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
			      __memset_avx2_unaligned_erms_rtm)
	      IFUNC_IMPL_ADD (array, i, memset,
			      (CPU_FEATURE_USABLE (AVX512VL)
-			       && CPU_FEATURE_USABLE (AVX512BW)),
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
			      __memset_evex_unaligned)
	      IFUNC_IMPL_ADD (array, i, memset,
			      (CPU_FEATURE_USABLE (AVX512VL)
-			       && CPU_FEATURE_USABLE (AVX512BW)),
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
			      __memset_evex_unaligned_erms)
	      IFUNC_IMPL_ADD (array, i, memset,
			      (CPU_FEATURE_USABLE (AVX512VL)
-			       && CPU_FEATURE_USABLE (AVX512BW)),
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
			      __memset_avx512_unaligned_erms)
	      IFUNC_IMPL_ADD (array, i, memset,
			      (CPU_FEATURE_USABLE (AVX512VL)
-			       && CPU_FEATURE_USABLE (AVX512BW)),
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
			      __memset_avx512_unaligned)
	      IFUNC_IMPL_ADD (array, i, memset,
			      CPU_FEATURE_USABLE (AVX512F),
@@ -739,10 +747,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
			       && CPU_FEATURE_USABLE (RTM)),
			      __wmemset_avx2_unaligned_rtm)
	      IFUNC_IMPL_ADD (array, i, wmemset,
-			      CPU_FEATURE_USABLE (AVX512VL),
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
			      __wmemset_evex_unaligned)
	      IFUNC_IMPL_ADD (array, i, wmemset,
-			      CPU_FEATURE_USABLE (AVX512VL),
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
			      __wmemset_avx512_unaligned))

 #ifdef SHARED
@@ -946,10 +958,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
			      CPU_FEATURE_USABLE (AVX2),
			      __wmemset_chk_avx2_unaligned)
	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
-			      CPU_FEATURE_USABLE (AVX512VL),
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
			      __wmemset_chk_evex_unaligned)
	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
-			      CPU_FEATURE_USABLE (AVX512F),
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
			      __wmemset_chk_avx512_unaligned))
 #endif

diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
index 19795938..100e3707 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memset.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
@@ -54,7 +54,8 @@ IFUNC_SELECTOR (void)
       && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
     {
       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
-	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
+	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
	{
	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
	    return OPTIMIZE (avx512_unaligned_erms);
@@ -68,7 +69,8 @@ IFUNC_SELECTOR (void)
   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
     {
       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
-	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
+	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
	{
	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
	    return OPTIMIZE (evex_unaligned_erms);
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
index 22e7b187..8ad842fc 100644
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -19,6 +19,6 @@
 # define SECTION(p)		p##.evex512
 # define MEMSET_SYMBOL(p,s)	p##_avx512_##s
 # define WMEMSET_SYMBOL(p,s)	p##_avx512_##s
-
+# define USE_LESS_VEC_MASK_STORE	1
 # include "memset-vec-unaligned-erms.S"
 #endif
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
index ae0a4d6e..640f0929 100644
--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
@@ -19,6 +19,6 @@
 # define SECTION(p)		p##.evex
 # define MEMSET_SYMBOL(p,s)	p##_evex_##s
 # define WMEMSET_SYMBOL(p,s)	p##_evex_##s
-
+# define USE_LESS_VEC_MASK_STORE	1
 # include "memset-vec-unaligned-erms.S"
 #endif
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index bae5cba4..f877ac9d 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -63,6 +63,8 @@
 # endif
 #endif

+#define PAGE_SIZE 4096
+
 #ifndef SECTION
 # error SECTION is not defined!
 #endif
@@ -213,11 +215,38 @@ L(loop):
	cmpq	%rcx, %rdx
	jne	L(loop)
	VZEROUPPER_SHORT_RETURN
+
+	.p2align 4
 L(less_vec):
	/* Less than 1 VEC.  */
 # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
 #  error Unsupported VEC_SIZE!
 # endif
+# ifdef USE_LESS_VEC_MASK_STORE
+	/* Clear high bits from edi. Only keeping bits relevant to page
+	   cross check. Note that we are using rax which is set in
+	   MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out.
+	 */
+	andl	$(PAGE_SIZE - 1), %edi
+	/* Check if VEC_SIZE store cross page. Mask stores suffer serious
+	   performance degradation when it has to fault suppress.  */
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %edi
+	ja	L(cross_page)
+# if VEC_SIZE > 32
+	movq	$-1, %rcx
+	bzhiq	%rdx, %rcx, %rcx
+	kmovq	%rcx, %k1
+# else
+	movl	$-1, %ecx
+	bzhil	%edx, %ecx, %ecx
+	kmovd	%ecx, %k1
+# endif
+	vmovdqu8	%VEC(0), (%rax) {%k1}
+	VZEROUPPER_RETURN
+
+	.p2align 4
+L(cross_page):
+# endif
 # if VEC_SIZE > 32
	cmpb	$32, %dl
	jae	L(between_32_63)
@@ -234,36 +263,36 @@ L(less_vec):
	cmpb	$1, %dl
	ja	L(between_2_3)
	jb	1f
-	movb	%cl, (%rdi)
+	movb	%cl, (%rax)
 1:
	VZEROUPPER_RETURN
 # if VEC_SIZE > 32
	/* From 32 to 63.  No branch when size == 32.  */
 L(between_32_63):
-	VMOVU	%YMM0, -32(%rdi,%rdx)
-	VMOVU	%YMM0, (%rdi)
+	VMOVU	%YMM0, -32(%rax,%rdx)
+	VMOVU	%YMM0, (%rax)
	VZEROUPPER_RETURN
 # endif
 # if VEC_SIZE > 16
	/* From 16 to 31.  No branch when size == 16.  */
 L(between_16_31):
-	VMOVU	%XMM0, -16(%rdi,%rdx)
-	VMOVU	%XMM0, (%rdi)
+	VMOVU	%XMM0, -16(%rax,%rdx)
+	VMOVU	%XMM0, (%rax)
	VZEROUPPER_RETURN
 # endif
	/* From 8 to 15.  No branch when size == 8.  */
 L(between_8_15):
-	movq	%rcx, -8(%rdi,%rdx)
-	movq	%rcx, (%rdi)
+	movq	%rcx, -8(%rax,%rdx)
+	movq	%rcx, (%rax)
	VZEROUPPER_RETURN
 L(between_4_7):
	/* From 4 to 7.  No branch when size == 4.  */
-	movl	%ecx, -4(%rdi,%rdx)
-	movl	%ecx, (%rdi)
+	movl	%ecx, -4(%rax,%rdx)
+	movl	%ecx, (%rax)
	VZEROUPPER_RETURN
 L(between_2_3):
	/* From 2 to 3.  No branch when size == 2.  */
-	movw	%cx, -2(%rdi,%rdx)
-	movw	%cx, (%rdi)
+	movw	%cx, -2(%rax,%rdx)
+	movw	%cx, (%rax)
	VZEROUPPER_RETURN
 END (MEMSET_SYMBOL (__memset, unaligned_erms))
--
GitLab

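[Editor's note] The new bzhi/kmov pair simply builds "the low len bits set" and uses it as a per-byte write mask for a single masked vmovdqu8 store, replacing the branch ladder for short lengths. A scalar sketch of what the mask computes (valid for 1 <= len <= 64, which is all the less_vec path needs; this is the editor's illustration, not glibc code):

#include <stdint.h>

/* bzhiq %rdx, %rcx, %rcx with %rcx == -1 keeps bits [len-1:0].
   Bit i set means byte i of the vector actually gets stored.  */
static uint64_t
less_vec_store_mask (unsigned len)
{
  return ~0ull >> (64 - len);   /* same value as _bzhi_u64 (~0ull, len) */
}

This is also why the patch adds the page-cross guard first: a masked store whose masked-off bytes fall in an unmapped page must fault-suppress, which is far slower than the plain store paths.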
@ -1,396 +0,0 @@
From ccabe7971f508709d034b63b8672f6f751a3d356 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri, 23 Apr 2021 15:56:24 -0400
Subject: [PATCH] x86: Optimize strchr-avx2.S
Content-type: text/plain; charset=UTF-8

No bug. This commit optimizes strchr-avx2.S. The optimizations are all
small things such as saving an ALU in the alignment process, saving a
few instructions in the loop return, saving some bytes in the main
loop, and increasing the ILP in the return cases. test-strchr,
test-strchrnul, test-wcschr, and test-wcschrnul are all passing.

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
 sysdeps/x86_64/multiarch/strchr-avx2.S | 290 +++++++++++++++----------
 1 file changed, 170 insertions(+), 120 deletions(-)

Conflicts:
	sysdeps/x86_64/multiarch/strchr-avx2.S
	  (rearranged to account for branch changes)

diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
index 919d256c..5884726b 100644
--- a/sysdeps/x86_64/multiarch/strchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
@@ -49,133 +49,144 @@

	.section SECTION(.text),"ax",@progbits
 ENTRY (STRCHR)
-	movl	%edi, %ecx
-# ifndef USE_AS_STRCHRNUL
-	xorl	%edx, %edx
-# endif
-
	/* Broadcast CHAR to YMM0.	*/
	vmovd	%esi, %xmm0
+	movl	%edi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	VPBROADCAST	%xmm0, %ymm0
	vpxor	%xmm9, %xmm9, %xmm9
-	VPBROADCAST %xmm0, %ymm0

	/* Check if we cross page boundary with one vector load.  */
-	andl	$(PAGE_SIZE - 1), %ecx
-	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
-	ja	L(cross_page_boundary)
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(cross_page_boundary)

	/* Check the first VEC_SIZE bytes.	Search for both CHAR and the
	   null byte.  */
	vmovdqu	(%rdi), %ymm8
-	VPCMPEQ %ymm8, %ymm0, %ymm1
-	VPCMPEQ %ymm8, %ymm9, %ymm2
+	VPCMPEQ	%ymm8, %ymm0, %ymm1
+	VPCMPEQ	%ymm8, %ymm9, %ymm2
	vpor	%ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
-	jz	L(more_vecs)
+	jz	L(aligned_more)
	tzcntl	%eax, %eax
+# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero)
+# endif
	addq	%rdi, %rax
+	VZEROUPPER_RETURN
+
+	/* .p2align 5 helps keep performance more consistent if ENTRY()
+	   alignment % 32 was either 16 or 0. As well this makes the
+	   alignment % 32 of the loop_4x_vec fixed which makes tuning it
+	   easier.  */
+	.p2align 5
+L(first_vec_x4):
+	tzcntl	%eax, %eax
+	addq	$(VEC_SIZE * 3 + 1), %rdi
# ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero)
# endif
-L(return_vzeroupper):
-	ZERO_UPPER_VEC_REGISTERS_RETURN
-
-	.p2align 4
-L(more_vecs):
-	/* Align data for aligned loads in the loop.  */
-	andq	$-VEC_SIZE, %rdi
-L(aligned_more):
-
-	/* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time
-	   since data is only aligned to VEC_SIZE.	*/
-	vmovdqa	VEC_SIZE(%rdi), %ymm8
-	addq	$VEC_SIZE, %rdi
-	VPCMPEQ %ymm8, %ymm0, %ymm1
-	VPCMPEQ %ymm8, %ymm9, %ymm2
-	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
-
-	vmovdqa	VEC_SIZE(%rdi), %ymm8
-	VPCMPEQ %ymm8, %ymm0, %ymm1
-	VPCMPEQ %ymm8, %ymm9, %ymm2
-	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x1)
-
-	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm8
-	VPCMPEQ %ymm8, %ymm0, %ymm1
-	VPCMPEQ %ymm8, %ymm9, %ymm2
-	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x2)
-
-	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
-	VPCMPEQ %ymm8, %ymm0, %ymm1
-	VPCMPEQ %ymm8, %ymm9, %ymm2
-	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jz	L(prep_loop_4x)
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN

-	tzcntl	%eax, %eax
-	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
# ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+L(zero):
+	xorl	%eax, %eax
+	VZEROUPPER_RETURN
# endif
-	VZEROUPPER
-	ret
+

	.p2align 4
-L(first_vec_x0):
+L(first_vec_x1):
	tzcntl	%eax, %eax
-	/* Found CHAR or the null byte.	 */
-	addq	%rdi, %rax
+	incq	%rdi
# ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero)
# endif
+	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
-L(first_vec_x1):
+L(first_vec_x2):
	tzcntl	%eax, %eax
-	leaq	VEC_SIZE(%rdi, %rax), %rax
+	addq	$(VEC_SIZE + 1), %rdi
# ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero)
# endif
+	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
-L(first_vec_x2):
+L(first_vec_x3):
	tzcntl	%eax, %eax
-	/* Found CHAR or the null byte.	 */
-	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
+	addq	$(VEC_SIZE * 2 + 1), %rdi
# ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero)
# endif
+	addq	%rdi, %rax
	VZEROUPPER_RETURN

-L(prep_loop_4x):
-	/* Align data to 4 * VEC_SIZE.	*/
-	andq	$-(VEC_SIZE * 4), %rdi
+	.p2align 4
+L(aligned_more):
+	/* Align data to VEC_SIZE - 1. This is the same number of
+	   instructions as using andq -VEC_SIZE but saves 4 bytes of code
+	   on x4 check.  */
+	orq	$(VEC_SIZE - 1), %rdi
+L(cross_page_continue):
+	/* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+	   since data is only aligned to VEC_SIZE.  */
+	vmovdqa	1(%rdi), %ymm8
+	VPCMPEQ	%ymm8, %ymm0, %ymm1
+	VPCMPEQ	%ymm8, %ymm9, %ymm2
+	vpor	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x1)
+
+	vmovdqa	(VEC_SIZE + 1)(%rdi), %ymm8
+	VPCMPEQ	%ymm8, %ymm0, %ymm1
+	VPCMPEQ	%ymm8, %ymm9, %ymm2
+	vpor	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x2)
+
+	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm8
+	VPCMPEQ	%ymm8, %ymm0, %ymm1
+	VPCMPEQ	%ymm8, %ymm9, %ymm2
+	vpor	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x3)

+	vmovdqa	(VEC_SIZE * 3 + 1)(%rdi), %ymm8
+	VPCMPEQ	%ymm8, %ymm0, %ymm1
+	VPCMPEQ	%ymm8, %ymm9, %ymm2
+	vpor	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x4)
+	/* Align data to VEC_SIZE * 4 - 1.	*/
+	addq	$(VEC_SIZE * 4 + 1), %rdi
+	andq	$-(VEC_SIZE * 4), %rdi
	.p2align 4
L(loop_4x_vec):
	/* Compare 4 * VEC at a time forward.  */
-	vmovdqa	(VEC_SIZE * 4)(%rdi), %ymm5
-	vmovdqa	(VEC_SIZE * 5)(%rdi), %ymm6
-	vmovdqa	(VEC_SIZE * 6)(%rdi), %ymm7
-	vmovdqa	(VEC_SIZE * 7)(%rdi), %ymm8
+	vmovdqa	(%rdi), %ymm5
+	vmovdqa	(VEC_SIZE)(%rdi), %ymm6
+	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm7
+	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8

	/* Leaves only CHARS matching esi as 0.	 */
	vpxor	%ymm5, %ymm0, %ymm1
@@ -191,63 +202,102 @@ L(loop_4x_vec):
	VPMINU	%ymm1, %ymm2, %ymm5
	VPMINU	%ymm3, %ymm4, %ymm6

-	VPMINU	%ymm5, %ymm6, %ymm5
+	VPMINU	%ymm5, %ymm6, %ymm6

-	VPCMPEQ %ymm5, %ymm9, %ymm5
-	vpmovmskb %ymm5, %eax
+	VPCMPEQ	%ymm6, %ymm9, %ymm6
+	vpmovmskb %ymm6, %ecx
+	subq	$-(VEC_SIZE * 4), %rdi
+	testl	%ecx, %ecx
+	jz	L(loop_4x_vec)

-	addq	$(VEC_SIZE * 4), %rdi
-	testl	%eax, %eax
-	jz	L(loop_4x_vec)

-	VPCMPEQ %ymm1, %ymm9, %ymm1
+	VPCMPEQ	%ymm1, %ymm9, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
-	jnz	L(first_vec_x0)
+	jnz	L(last_vec_x0)
+

-	VPCMPEQ %ymm2, %ymm9, %ymm2
+	VPCMPEQ	%ymm5, %ymm9, %ymm2
	vpmovmskb %ymm2, %eax
	testl	%eax, %eax
-	jnz	L(first_vec_x1)
+	jnz	L(last_vec_x1)
+
+	VPCMPEQ	%ymm3, %ymm9, %ymm3
+	vpmovmskb %ymm3, %eax
+	/* rcx has combined result from all 4 VEC. It will only be used
+	   if the first 3 other VEC all did not contain a match.  */
+	salq	$32, %rcx
+	orq	%rcx, %rax
+	tzcntq	%rax, %rax
+	subq	$(VEC_SIZE * 2), %rdi
+# ifndef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero_end)
+# endif
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
+
+
+	.p2align 4
+L(last_vec_x0):
+	tzcntl	%eax, %eax
+	addq	$-(VEC_SIZE * 4), %rdi
+# ifndef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero_end)
+# endif
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN

-	VPCMPEQ %ymm3, %ymm9, %ymm3
-	VPCMPEQ %ymm4, %ymm9, %ymm4
-	vpmovmskb %ymm3, %ecx
-	vpmovmskb %ymm4, %eax
-	salq	$32, %rax
-	orq	%rcx, %rax
-	tzcntq	%rax, %rax
-	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
# ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+L(zero_end):
+	xorl	%eax, %eax
+	VZEROUPPER_RETURN
# endif
-	VZEROUPPER
-	ret
+
+	.p2align 4
+L(last_vec_x1):
+	tzcntl	%eax, %eax
+	subq	$(VEC_SIZE * 3), %rdi
+# ifndef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero_end)
+# endif
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
+

	/* Cold case for crossing page with first load.	 */
	.p2align 4
L(cross_page_boundary):
-	andq	$-VEC_SIZE, %rdi
-	andl	$(VEC_SIZE - 1), %ecx
-
-	vmovdqa	(%rdi), %ymm8
-	VPCMPEQ %ymm8, %ymm0, %ymm1
-	VPCMPEQ %ymm8, %ymm9, %ymm2
+	movq	%rdi, %rdx
+	/* Align rdi to VEC_SIZE - 1.  */
+	orq	$(VEC_SIZE - 1), %rdi
+	vmovdqa	-(VEC_SIZE - 1)(%rdi), %ymm8
+	VPCMPEQ	%ymm8, %ymm0, %ymm1
+	VPCMPEQ	%ymm8, %ymm9, %ymm2
	vpor	%ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %eax
-	/* Remove the leading bits.	 */
-	sarxl	%ecx, %eax, %eax
+	/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
+	   so no need to manually mod edx.  */
+	sarxl	%edx, %eax, %eax
	testl	%eax, %eax
-	jz	L(aligned_more)
+	jz	L(cross_page_continue)
	tzcntl	%eax, %eax
-	addq	%rcx, %rdi
-	addq	%rdi, %rax
# ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	xorl	%ecx, %ecx
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdx, %rax), %CHAR_REG
+	leaq	(%rdx, %rax), %rax
+	cmovne	%rcx, %rax
+# else
+	addq	%rdx, %rax
# endif
-	VZEROUPPER_RETURN
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN

END (STRCHR)
# endif
--
GitLab

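[Editor's note] Both this patch and the strchr-evex one that follows guard the first unaligned vector load with the same page test before falling back to the cold cross-page path. A C sketch of that guard, using the patch's PAGE_SIZE/VEC_SIZE constants (the function name is the editor's):

#include <stdint.h>

#define PAGE_SIZE 4096
#define VEC_SIZE  32

/* A VEC_SIZE unaligned load from p is safe unless it would run off the
   end of p's 4 KiB page; mirrors "andl $(PAGE_SIZE - 1); cmpl; ja".  */
static int
crosses_page (const char *p)
{
  return ((uintptr_t) p & (PAGE_SIZE - 1)) > PAGE_SIZE - VEC_SIZE;
}

The cross-page path can then shift the match mask right by the original pointer with sarx; since sarx only reads bits [5:0] of the shift count, the low address bits need no explicit masking, which is where one of the saved ALU operations comes from.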
@ -1,532 +0,0 @@
From 7f3e7c262cab4e2401e4331a6ef29c428de02044 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
Date: Fri, 23 Apr 2021 15:56:25 -0400
|
|
||||||
Subject: [PATCH] x86: Optimize strchr-evex.S
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
No bug. This commit optimizes strchr-evex.S. The optimizations are
|
|
||||||
mostly small things such as save an ALU in the alignment process,
|
|
||||||
saving a few instructions in the loop return. The one significant
|
|
||||||
change is saving 2 instructions in the 4x loop. test-strchr,
|
|
||||||
test-strchrnul, test-wcschr, and test-wcschrnul are all passing.
|
|
||||||
|
|
||||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/multiarch/strchr-evex.S | 392 ++++++++++++++-----------
|
|
||||||
1 file changed, 218 insertions(+), 174 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S
|
|
||||||
index ddc86a70..7f9d4ee4 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/strchr-evex.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/strchr-evex.S
|
|
||||||
@@ -32,13 +32,15 @@
|
|
||||||
# define VPCMP vpcmpd
|
|
||||||
# define VPMINU vpminud
|
|
||||||
# define CHAR_REG esi
|
|
||||||
-# define SHIFT_REG r8d
|
|
||||||
+# define SHIFT_REG ecx
|
|
||||||
+# define CHAR_SIZE 4
|
|
||||||
# else
|
|
||||||
# define VPBROADCAST vpbroadcastb
|
|
||||||
# define VPCMP vpcmpb
|
|
||||||
# define VPMINU vpminub
|
|
||||||
# define CHAR_REG sil
|
|
||||||
-# define SHIFT_REG ecx
|
|
||||||
+# define SHIFT_REG edx
|
|
||||||
+# define CHAR_SIZE 1
|
|
||||||
# endif
|
|
||||||
|
|
||||||
# define XMMZERO xmm16
|
|
||||||
@@ -56,23 +58,20 @@
|
|
||||||
|
|
||||||
# define VEC_SIZE 32
|
|
||||||
# define PAGE_SIZE 4096
|
|
||||||
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
|
|
||||||
|
|
||||||
.section .text.evex,"ax",@progbits
|
|
||||||
ENTRY (STRCHR)
|
|
||||||
- movl %edi, %ecx
|
|
||||||
-# ifndef USE_AS_STRCHRNUL
|
|
||||||
- xorl %edx, %edx
|
|
||||||
-# endif
|
|
||||||
-
|
|
||||||
/* Broadcast CHAR to YMM0. */
|
|
||||||
- VPBROADCAST %esi, %YMM0
|
|
||||||
-
|
|
||||||
+ VPBROADCAST %esi, %YMM0
|
|
||||||
+ movl %edi, %eax
|
|
||||||
+ andl $(PAGE_SIZE - 1), %eax
|
|
||||||
vpxorq %XMMZERO, %XMMZERO, %XMMZERO
|
|
||||||
|
|
||||||
- /* Check if we cross page boundary with one vector load. */
|
|
||||||
- andl $(PAGE_SIZE - 1), %ecx
|
|
||||||
- cmpl $(PAGE_SIZE - VEC_SIZE), %ecx
|
|
||||||
- ja L(cross_page_boundary)
|
|
||||||
+ /* Check if we cross page boundary with one vector load.
|
|
||||||
+ Otherwise it is safe to use an unaligned load. */
|
|
||||||
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
|
|
||||||
+ ja L(cross_page_boundary)
|
|
||||||
|
|
||||||
/* Check the first VEC_SIZE bytes. Search for both CHAR and the
|
|
||||||
null bytes. */
|
|
||||||
@@ -83,251 +82,296 @@ ENTRY (STRCHR)
|
|
||||||
VPMINU %YMM2, %YMM1, %YMM2
|
|
||||||
/* Each bit in K0 represents a CHAR or a null byte in YMM1. */
|
|
||||||
VPCMP $0, %YMMZERO, %YMM2, %k0
|
|
||||||
- ktestd %k0, %k0
|
|
||||||
- jz L(more_vecs)
|
|
||||||
kmovd %k0, %eax
|
|
||||||
+ testl %eax, %eax
|
|
||||||
+ jz L(aligned_more)
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
- /* Found CHAR or the null byte. */
|
|
||||||
# ifdef USE_AS_WCSCHR
|
|
||||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
||||||
- leaq (%rdi, %rax, 4), %rax
|
|
||||||
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes.
|
|
||||||
+ */
|
|
||||||
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
# else
|
|
||||||
addq %rdi, %rax
|
|
||||||
# endif
|
|
||||||
# ifndef USE_AS_STRCHRNUL
|
|
||||||
- cmp (%rax), %CHAR_REG
|
|
||||||
- cmovne %rdx, %rax
|
|
||||||
+ /* Found CHAR or the null byte. */
|
|
||||||
+ cmp (%rax), %CHAR_REG
|
|
||||||
+ jne L(zero)
|
|
||||||
# endif
|
|
||||||
ret
|
|
||||||
|
|
||||||
- .p2align 4
|
|
||||||
-L(more_vecs):
|
|
||||||
- /* Align data for aligned loads in the loop. */
|
|
||||||
- andq $-VEC_SIZE, %rdi
|
|
||||||
-L(aligned_more):
|
|
||||||
-
|
|
||||||
- /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time
|
|
||||||
- since data is only aligned to VEC_SIZE. */
|
|
||||||
- VMOVA VEC_SIZE(%rdi), %YMM1
|
|
||||||
- addq $VEC_SIZE, %rdi
|
|
||||||
-
|
|
||||||
- /* Leaves only CHARS matching esi as 0. */
|
|
||||||
- vpxorq %YMM1, %YMM0, %YMM2
|
|
||||||
- VPMINU %YMM2, %YMM1, %YMM2
|
|
||||||
- /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
|
|
||||||
- VPCMP $0, %YMMZERO, %YMM2, %k0
|
|
||||||
- kmovd %k0, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x0)
|
|
||||||
-
|
|
||||||
- VMOVA VEC_SIZE(%rdi), %YMM1
|
|
||||||
- /* Leaves only CHARS matching esi as 0. */
|
|
||||||
- vpxorq %YMM1, %YMM0, %YMM2
|
|
||||||
- VPMINU %YMM2, %YMM1, %YMM2
|
|
||||||
- /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
|
|
||||||
- VPCMP $0, %YMMZERO, %YMM2, %k0
|
|
||||||
- kmovd %k0, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x1)
|
|
||||||
-
|
|
||||||
- VMOVA (VEC_SIZE * 2)(%rdi), %YMM1
|
|
||||||
- /* Leaves only CHARS matching esi as 0. */
|
|
||||||
- vpxorq %YMM1, %YMM0, %YMM2
|
|
||||||
- VPMINU %YMM2, %YMM1, %YMM2
|
|
||||||
- /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
|
|
||||||
- VPCMP $0, %YMMZERO, %YMM2, %k0
|
|
||||||
- kmovd %k0, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x2)
|
|
||||||
-
|
|
||||||
- VMOVA (VEC_SIZE * 3)(%rdi), %YMM1
|
|
||||||
- /* Leaves only CHARS matching esi as 0. */
|
|
||||||
- vpxorq %YMM1, %YMM0, %YMM2
|
|
||||||
- VPMINU %YMM2, %YMM1, %YMM2
|
|
||||||
- /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
|
|
||||||
- VPCMP $0, %YMMZERO, %YMM2, %k0
|
|
||||||
- ktestd %k0, %k0
|
|
||||||
- jz L(prep_loop_4x)
|
|
||||||
-
|
|
||||||
- kmovd %k0, %eax
|
|
||||||
+ /* .p2align 5 helps keep performance more consistent if ENTRY()
|
|
||||||
+ alignment % 32 was either 16 or 0. As well this makes the
|
|
||||||
+ alignment % 32 of the loop_4x_vec fixed which makes tuning it
|
|
||||||
+ easier. */
|
|
||||||
+ .p2align 5
|
|
||||||
+L(first_vec_x3):
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
+# ifndef USE_AS_STRCHRNUL
|
|
||||||
/* Found CHAR or the null byte. */
|
|
||||||
-# ifdef USE_AS_WCSCHR
|
|
||||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
||||||
- leaq (VEC_SIZE * 3)(%rdi, %rax, 4), %rax
|
|
||||||
-# else
|
|
||||||
- leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
|
|
||||||
+ cmp (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
|
|
||||||
+ jne L(zero)
|
|
||||||
# endif
|
|
||||||
+ /* NB: Multiply sizeof char type (1 or 4) to get the number of
|
|
||||||
+ bytes. */
|
|
||||||
+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
+ ret
|
|
||||||
+
|
|
||||||
# ifndef USE_AS_STRCHRNUL
|
|
||||||
- cmp (%rax), %CHAR_REG
|
|
||||||
- cmovne %rdx, %rax
|
|
||||||
-# endif
|
|
||||||
+L(zero):
|
|
||||||
+ xorl %eax, %eax
|
|
||||||
ret
|
|
||||||
+# endif
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
-L(first_vec_x0):
|
|
||||||
+L(first_vec_x4):
|
|
||||||
+# ifndef USE_AS_STRCHRNUL
|
|
||||||
+ /* Check to see if first match was CHAR (k0) or null (k1). */
|
|
||||||
+ kmovd %k0, %eax
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
- /* Found CHAR or the null byte. */
|
|
||||||
-# ifdef USE_AS_WCSCHR
|
|
||||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
||||||
- leaq (%rdi, %rax, 4), %rax
|
|
||||||
+ kmovd %k1, %ecx
|
|
||||||
+ /* bzhil will not be 0 if first match was null. */
|
|
||||||
+ bzhil %eax, %ecx, %ecx
|
|
||||||
+ jne L(zero)
|
|
||||||
# else
|
|
||||||
- addq %rdi, %rax
|
|
||||||
-# endif
|
|
||||||
-# ifndef USE_AS_STRCHRNUL
|
|
||||||
- cmp (%rax), %CHAR_REG
|
|
||||||
- cmovne %rdx, %rax
|
|
||||||
+ /* Combine CHAR and null matches. */
|
|
||||||
+ kord %k0, %k1, %k0
|
|
||||||
+ kmovd %k0, %eax
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
# endif
|
|
||||||
+ /* NB: Multiply sizeof char type (1 or 4) to get the number of
|
|
||||||
+ bytes. */
|
|
||||||
+ leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
ret
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
L(first_vec_x1):
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
- /* Found CHAR or the null byte. */
|
|
||||||
-# ifdef USE_AS_WCSCHR
|
|
||||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
||||||
- leaq VEC_SIZE(%rdi, %rax, 4), %rax
|
|
||||||
-# else
|
|
||||||
- leaq VEC_SIZE(%rdi, %rax), %rax
|
|
||||||
-# endif
|
|
||||||
# ifndef USE_AS_STRCHRNUL
|
|
||||||
- cmp (%rax), %CHAR_REG
|
|
||||||
- cmovne %rdx, %rax
|
|
||||||
+ /* Found CHAR or the null byte. */
|
|
||||||
+ cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
|
|
||||||
+ jne L(zero)
|
|
||||||
+
|
|
||||||
# endif
|
|
||||||
+ /* NB: Multiply sizeof char type (1 or 4) to get the number of
|
|
||||||
+ bytes. */
|
|
||||||
+ leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
ret
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
L(first_vec_x2):
|
|
||||||
+# ifndef USE_AS_STRCHRNUL
|
|
||||||
+ /* Check to see if first match was CHAR (k0) or null (k1). */
|
|
||||||
+ kmovd %k0, %eax
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
- /* Found CHAR or the null byte. */
|
|
||||||
-# ifdef USE_AS_WCSCHR
|
|
||||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
||||||
- leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax
|
|
||||||
+ kmovd %k1, %ecx
|
|
||||||
+ /* bzhil will not be 0 if first match was null. */
|
|
||||||
+ bzhil %eax, %ecx, %ecx
|
|
||||||
+ jne L(zero)
|
|
||||||
# else
|
|
||||||
- leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
|
|
||||||
-# endif
|
|
||||||
-# ifndef USE_AS_STRCHRNUL
|
|
||||||
- cmp (%rax), %CHAR_REG
|
|
||||||
- cmovne %rdx, %rax
|
|
||||||
+ /* Combine CHAR and null matches. */
|
|
||||||
+ kord %k0, %k1, %k0
|
|
||||||
+ kmovd %k0, %eax
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
# endif
|
|
||||||
+ /* NB: Multiply sizeof char type (1 or 4) to get the number of
|
|
||||||
+ bytes. */
|
|
||||||
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
ret
|
|
||||||
|
|
||||||
-L(prep_loop_4x):
|
|
||||||
- /* Align data to 4 * VEC_SIZE. */
|
|
||||||
+ .p2align 4
|
|
||||||
+L(aligned_more):
|
|
||||||
+ /* Align data to VEC_SIZE. */
|
|
||||||
+ andq $-VEC_SIZE, %rdi
|
|
||||||
+L(cross_page_continue):
|
|
||||||
+ /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time since
|
|
||||||
+ data is only aligned to VEC_SIZE. Use two alternating methods
|
|
||||||
+ for checking VEC to balance latency and port contention. */
|
|
||||||
+
|
|
||||||
+ /* This method has higher latency but has better port
|
|
||||||
+ distribution. */
|
|
||||||
+ VMOVA (VEC_SIZE)(%rdi), %YMM1
|
|
||||||
+ /* Leaves only CHARS matching esi as 0. */
|
|
||||||
+ vpxorq %YMM1, %YMM0, %YMM2
|
|
||||||
+ VPMINU %YMM2, %YMM1, %YMM2
|
|
||||||
+ /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
|
|
||||||
+ VPCMP $0, %YMMZERO, %YMM2, %k0
|
|
||||||
+ kmovd %k0, %eax
|
|
||||||
+ testl %eax, %eax
|
|
||||||
+ jnz L(first_vec_x1)
|
|
||||||
+
|
|
||||||
+      /* This method has lower latency but is more port bound.  */
|
|
||||||
+ VMOVA (VEC_SIZE * 2)(%rdi), %YMM1
|
|
||||||
+ /* Each bit in K0 represents a CHAR in YMM1. */
|
|
||||||
+ VPCMP $0, %YMM1, %YMM0, %k0
|
|
||||||
+      /* Each bit in K1 represents a null byte in YMM1.  */
|
|
||||||
+ VPCMP $0, %YMM1, %YMMZERO, %k1
|
|
||||||
+ kortestd %k0, %k1
|
|
||||||
+ jnz L(first_vec_x2)
|
|
||||||
+
|
|
||||||
+ VMOVA (VEC_SIZE * 3)(%rdi), %YMM1
|
|
||||||
+ /* Leaves only CHARS matching esi as 0. */
|
|
||||||
+ vpxorq %YMM1, %YMM0, %YMM2
|
|
||||||
+ VPMINU %YMM2, %YMM1, %YMM2
|
|
||||||
+ /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
|
|
||||||
+ VPCMP $0, %YMMZERO, %YMM2, %k0
|
|
||||||
+ kmovd %k0, %eax
|
|
||||||
+ testl %eax, %eax
|
|
||||||
+ jnz L(first_vec_x3)
|
|
||||||
+
|
|
||||||
+ VMOVA (VEC_SIZE * 4)(%rdi), %YMM1
|
|
||||||
+ /* Each bit in K0 represents a CHAR in YMM1. */
|
|
||||||
+ VPCMP $0, %YMM1, %YMM0, %k0
|
|
||||||
+ /* Each bit in K1 represents a CHAR in YMM1. */
|
|
||||||
+ VPCMP $0, %YMM1, %YMMZERO, %k1
|
|
||||||
+ kortestd %k0, %k1
|
|
||||||
+ jnz L(first_vec_x4)
|
|
||||||
+
|
|
||||||
+ /* Align data to VEC_SIZE * 4 for the loop. */
|
|
||||||
+ addq $VEC_SIZE, %rdi
|
|
||||||
andq $-(VEC_SIZE * 4), %rdi
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
L(loop_4x_vec):
|
|
||||||
- /* Compare 4 * VEC at a time forward. */
|
|
||||||
+ /* Check 4x VEC at a time. No penalty to imm32 offset with evex
|
|
||||||
+ encoding. */
|
|
||||||
VMOVA (VEC_SIZE * 4)(%rdi), %YMM1
|
|
||||||
VMOVA (VEC_SIZE * 5)(%rdi), %YMM2
|
|
||||||
VMOVA (VEC_SIZE * 6)(%rdi), %YMM3
|
|
||||||
VMOVA (VEC_SIZE * 7)(%rdi), %YMM4
|
|
||||||
|
|
||||||
- /* Leaves only CHARS matching esi as 0. */
|
|
||||||
+ /* For YMM1 and YMM3 use xor to set the CHARs matching esi to
|
|
||||||
+ zero. */
|
|
||||||
vpxorq %YMM1, %YMM0, %YMM5
|
|
||||||
- vpxorq %YMM2, %YMM0, %YMM6
|
|
||||||
+ /* For YMM2 and YMM4 cmp not equals to CHAR and store result in
|
|
||||||
+         k register.  It's possible to save either 1 or 2 instructions
|
|
||||||
+         using the cmp-not-equals method for either YMM1, or YMM1 and YMM3
|
|
||||||
+ respectively but bottleneck on p5 makes it not worth it. */
|
|
||||||
+ VPCMP $4, %YMM0, %YMM2, %k2
|
|
||||||
vpxorq %YMM3, %YMM0, %YMM7
|
|
||||||
- vpxorq %YMM4, %YMM0, %YMM8
|
|
||||||
-
|
|
||||||
- VPMINU %YMM5, %YMM1, %YMM5
|
|
||||||
- VPMINU %YMM6, %YMM2, %YMM6
|
|
||||||
- VPMINU %YMM7, %YMM3, %YMM7
|
|
||||||
- VPMINU %YMM8, %YMM4, %YMM8
|
|
||||||
-
|
|
||||||
- VPMINU %YMM5, %YMM6, %YMM1
|
|
||||||
- VPMINU %YMM7, %YMM8, %YMM2
|
|
||||||
-
|
|
||||||
- VPMINU %YMM1, %YMM2, %YMM1
|
|
||||||
-
|
|
||||||
- /* Each bit in K0 represents a CHAR or a null byte. */
|
|
||||||
- VPCMP $0, %YMMZERO, %YMM1, %k0
|
|
||||||
-
|
|
||||||
- addq $(VEC_SIZE * 4), %rdi
|
|
||||||
-
|
|
||||||
- ktestd %k0, %k0
|
|
||||||
+ VPCMP $4, %YMM0, %YMM4, %k4
|
|
||||||
+
|
|
||||||
+      /* Use min to select all zeros from either the xor or the end of string.
|
|
||||||
+ */
|
|
||||||
+ VPMINU %YMM1, %YMM5, %YMM1
|
|
||||||
+ VPMINU %YMM3, %YMM7, %YMM3
|
|
||||||
+
|
|
||||||
+ /* Use min + zeromask to select for zeros. Since k2 and k4 will
|
|
||||||
+ have 0 as positions that matched with CHAR which will set
|
|
||||||
+ zero in the corresponding destination bytes in YMM2 / YMM4.
|
|
||||||
+ */
|
|
||||||
+ VPMINU %YMM1, %YMM2, %YMM2{%k2}{z}
|
|
||||||
+ VPMINU %YMM3, %YMM4, %YMM4
|
|
||||||
+ VPMINU %YMM2, %YMM4, %YMM4{%k4}{z}
|
|
||||||
+
|
|
||||||
+ VPCMP $0, %YMMZERO, %YMM4, %k1
|
|
||||||
+ kmovd %k1, %ecx
|
|
||||||
+ subq $-(VEC_SIZE * 4), %rdi
|
|
||||||
+ testl %ecx, %ecx
|
|
||||||
jz L(loop_4x_vec)
|
|
||||||
|
|
||||||
- /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
|
|
||||||
- VPCMP $0, %YMMZERO, %YMM5, %k0
|
|
||||||
+ VPCMP $0, %YMMZERO, %YMM1, %k0
|
|
||||||
kmovd %k0, %eax
|
|
||||||
testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x0)
|
|
||||||
+ jnz L(last_vec_x1)
|
|
||||||
|
|
||||||
- /* Each bit in K1 represents a CHAR or a null byte in YMM2. */
|
|
||||||
- VPCMP $0, %YMMZERO, %YMM6, %k1
|
|
||||||
- kmovd %k1, %eax
|
|
||||||
+ VPCMP $0, %YMMZERO, %YMM2, %k0
|
|
||||||
+ kmovd %k0, %eax
|
|
||||||
testl %eax, %eax
|
|
||||||
- jnz L(first_vec_x1)
|
|
||||||
-
|
|
||||||
- /* Each bit in K2 represents a CHAR or a null byte in YMM3. */
|
|
||||||
- VPCMP $0, %YMMZERO, %YMM7, %k2
|
|
||||||
- /* Each bit in K3 represents a CHAR or a null byte in YMM4. */
|
|
||||||
- VPCMP $0, %YMMZERO, %YMM8, %k3
|
|
||||||
+ jnz L(last_vec_x2)
|
|
||||||
|
|
||||||
+ VPCMP $0, %YMMZERO, %YMM3, %k0
|
|
||||||
+ kmovd %k0, %eax
|
|
||||||
+ /* Combine YMM3 matches (eax) with YMM4 matches (ecx). */
|
|
||||||
# ifdef USE_AS_WCSCHR
|
|
||||||
- /* NB: Each bit in K2/K3 represents 4-byte element. */
|
|
||||||
- kshiftlw $8, %k3, %k1
|
|
||||||
+ sall $8, %ecx
|
|
||||||
+ orl %ecx, %eax
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
# else
|
|
||||||
- kshiftlq $32, %k3, %k1
|
|
||||||
+ salq $32, %rcx
|
|
||||||
+ orq %rcx, %rax
|
|
||||||
+ tzcntq %rax, %rax
|
|
||||||
# endif
|
|
||||||
+# ifndef USE_AS_STRCHRNUL
|
|
||||||
+ /* Check if match was CHAR or null. */
|
|
||||||
+ cmp (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
|
|
||||||
+ jne L(zero_end)
|
|
||||||
+# endif
|
|
||||||
+ /* NB: Multiply sizeof char type (1 or 4) to get the number of
|
|
||||||
+ bytes. */
|
|
||||||
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
+ ret
|
|
||||||
|
|
||||||
- /* Each bit in K1 represents a NULL or a mismatch. */
|
|
||||||
- korq %k1, %k2, %k1
|
|
||||||
- kmovq %k1, %rax
|
|
||||||
+# ifndef USE_AS_STRCHRNUL
|
|
||||||
+L(zero_end):
|
|
||||||
+ xorl %eax, %eax
|
|
||||||
+ ret
|
|
||||||
+# endif
|
|
||||||
|
|
||||||
- tzcntq %rax, %rax
|
|
||||||
-# ifdef USE_AS_WCSCHR
|
|
||||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
||||||
- leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax
|
|
||||||
-# else
|
|
||||||
- leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
|
|
||||||
+ .p2align 4
|
|
||||||
+L(last_vec_x1):
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+# ifndef USE_AS_STRCHRNUL
|
|
||||||
+ /* Check if match was null. */
|
|
||||||
+ cmp (%rdi, %rax, CHAR_SIZE), %CHAR_REG
|
|
||||||
+ jne L(zero_end)
|
|
||||||
# endif
|
|
||||||
+ /* NB: Multiply sizeof char type (1 or 4) to get the number of
|
|
||||||
+ bytes. */
|
|
||||||
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
+ ret
|
|
||||||
+
|
|
||||||
+ .p2align 4
|
|
||||||
+L(last_vec_x2):
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
# ifndef USE_AS_STRCHRNUL
|
|
||||||
- cmp (%rax), %CHAR_REG
|
|
||||||
- cmovne %rdx, %rax
|
|
||||||
+ /* Check if match was null. */
|
|
||||||
+ cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
|
|
||||||
+ jne L(zero_end)
|
|
||||||
# endif
|
|
||||||
+ /* NB: Multiply sizeof char type (1 or 4) to get the number of
|
|
||||||
+ bytes. */
|
|
||||||
+ leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
ret
|
|
||||||
|
|
||||||
/* Cold case for crossing page with first load. */
|
|
||||||
.p2align 4
|
|
||||||
L(cross_page_boundary):
|
|
||||||
+ movq %rdi, %rdx
|
|
||||||
+ /* Align rdi. */
|
|
||||||
andq $-VEC_SIZE, %rdi
|
|
||||||
- andl $(VEC_SIZE - 1), %ecx
|
|
||||||
-
|
|
||||||
VMOVA (%rdi), %YMM1
|
|
||||||
-
|
|
||||||
/* Leaves only CHARS matching esi as 0. */
|
|
||||||
vpxorq %YMM1, %YMM0, %YMM2
|
|
||||||
VPMINU %YMM2, %YMM1, %YMM2
|
|
||||||
/* Each bit in K0 represents a CHAR or a null byte in YMM1. */
|
|
||||||
VPCMP $0, %YMMZERO, %YMM2, %k0
|
|
||||||
kmovd %k0, %eax
|
|
||||||
- testl %eax, %eax
|
|
||||||
-
|
|
||||||
+ /* Remove the leading bits. */
|
|
||||||
# ifdef USE_AS_WCSCHR
|
|
||||||
+ movl %edx, %SHIFT_REG
|
|
||||||
/* NB: Divide shift count by 4 since each bit in K1 represent 4
|
|
||||||
bytes. */
|
|
||||||
- movl %ecx, %SHIFT_REG
|
|
||||||
- sarl $2, %SHIFT_REG
|
|
||||||
+ sarl $2, %SHIFT_REG
|
|
||||||
+ andl $(CHAR_PER_VEC - 1), %SHIFT_REG
|
|
||||||
# endif
|
|
||||||
-
|
|
||||||
- /* Remove the leading bits. */
|
|
||||||
sarxl %SHIFT_REG, %eax, %eax
|
|
||||||
+ /* If eax is zero continue. */
|
|
||||||
testl %eax, %eax
|
|
||||||
-
|
|
||||||
- jz L(aligned_more)
|
|
||||||
+ jz L(cross_page_continue)
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
- addq %rcx, %rdi
|
|
||||||
+# ifndef USE_AS_STRCHRNUL
|
|
||||||
+ /* Check to see if match was CHAR or null. */
|
|
||||||
+ cmp (%rdx, %rax, CHAR_SIZE), %CHAR_REG
|
|
||||||
+ jne L(zero_end)
|
|
||||||
+# endif
|
|
||||||
# ifdef USE_AS_WCSCHR
|
|
||||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
||||||
- leaq (%rdi, %rax, 4), %rax
|
|
||||||
+ /* NB: Multiply wchar_t count by 4 to get the number of
|
|
||||||
+ bytes. */
|
|
||||||
+ leaq (%rdx, %rax, CHAR_SIZE), %rax
|
|
||||||
# else
|
|
||||||
- addq %rdi, %rax
|
|
||||||
-# endif
|
|
||||||
-# ifndef USE_AS_STRCHRNUL
|
|
||||||
- cmp (%rax), %CHAR_REG
|
|
||||||
- cmovne %rdx, %rax
|
|
||||||
+      addq    %rdx, %rax
# endif
       ret

--
GitLab
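
The patch above leans on two idioms that are easier to see in scalar form. First, the vpxorq/VPMINU pair: x ^ c is zero only when x == c, and the unsigned minimum of x ^ c and x is zero exactly when x matches CHAR or is the terminating null, so one zero-test covers both. A minimal C sketch of the per-element logic, with invented names (the real code does this a full vector at a time):

    #include <stdint.h>
    #include <stddef.h>

    /* Mirrors the vpxorq/VPMINU pair, one byte at a time.  Requires a
       null-terminated buffer, like strchr itself.  */
    static size_t
    first_char_or_null (const uint8_t *s, uint8_t c)
    {
      for (size_t i = 0;; i++)
        {
          uint8_t x = s[i] ^ c;
          uint8_t m = x < s[i] ? x : s[i];  /* unsigned minimum */
          if (m == 0)
            return i;  /* s[i] == c or s[i] == 0 */
        }
    }

Second, the k0/k1 disambiguation in L(first_vec_x2) and L(first_vec_x4): one mask flags CHAR matches, the other flags null bytes, and bzhil clears the null mask at and above the first CHAR match, leaving a nonzero value only when a null byte comes first. A hedged scalar model (assumes mask0 is nonzero, as it is on those paths):

    /* mask0: bit per CHAR match; mask1: bit per null byte.  */
    static int
    null_before_char (uint32_t mask0, uint32_t mask1)
    {
      unsigned first = (unsigned) __builtin_ctz (mask0);  /* tzcntl */
      return (mask1 & ((1u << first) - 1)) != 0;          /* bzhil */
    }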

@ -1,536 +0,0 @@
From 104c7b1967c3e78435c6f7eab5e225a7eddf9c6e Mon Sep 17 00:00:00 2001
|
|
||||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
Date: Tue, 4 May 2021 19:02:40 -0400
|
|
||||||
Subject: [PATCH] x86: Add EVEX optimized memchr family not safe for RTM
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
No bug.
|
|
||||||
|
|
||||||
This commit adds a new implementation for EVEX memchr that is not safe
|
|
||||||
for RTM because it uses vzeroupper. The benefit is that by using
|
|
||||||
ymm0-ymm15 it can use vpcmpeq and vpternlogd in the 4x loop which is
|
|
||||||
faster than the RTM safe version which cannot use vpcmpeq because
|
|
||||||
there is no EVEX encoding for the instruction. All parts of the
|
|
||||||
implementation aside from the 4x loop are the same for the two
|
|
||||||
versions and the optimization is only relevant for large sizes.
|
|
||||||
|
|
||||||
Tigerlake:
|
|
||||||
size , algn , Pos , Cur T , New T , Win , Dif
|
|
||||||
512 , 6 , 192 , 9.2 , 9.04 , no-RTM , 0.16
|
|
||||||
512 , 7 , 224 , 9.19 , 8.98 , no-RTM , 0.21
|
|
||||||
2048 , 0 , 256 , 10.74 , 10.54 , no-RTM , 0.2
|
|
||||||
2048 , 0 , 512 , 14.81 , 14.87 , RTM , 0.06
|
|
||||||
2048 , 0 , 1024 , 22.97 , 22.57 , no-RTM , 0.4
|
|
||||||
2048 , 0 , 2048 , 37.49 , 34.51 , no-RTM , 2.98 <--
|
|
||||||
|
|
||||||
Icelake:
|
|
||||||
size , algn , Pos , Cur T , New T , Win , Dif
|
|
||||||
512 , 6 , 192 , 7.6 , 7.3 , no-RTM , 0.3
|
|
||||||
512 , 7 , 224 , 7.63 , 7.27 , no-RTM , 0.36
|
|
||||||
2048 , 0 , 256 , 8.48 , 8.38 , no-RTM , 0.1
|
|
||||||
2048 , 0 , 512 , 11.57 , 11.42 , no-RTM , 0.15
|
|
||||||
2048 , 0 , 1024 , 17.92 , 17.38 , no-RTM , 0.54
|
|
||||||
2048 , 0 , 2048 , 30.37 , 27.34 , no-RTM , 3.03 <--
|
|
||||||
|
|
||||||
test-memchr, test-wmemchr, and test-rawmemchr are all passing.
|
|
||||||
|
|
||||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/multiarch/Makefile | 7 +-
|
|
||||||
sysdeps/x86_64/multiarch/ifunc-evex.h | 55 ++++++
|
|
||||||
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 ++
|
|
||||||
sysdeps/x86_64/multiarch/memchr-evex-rtm.S | 8 +
|
|
||||||
sysdeps/x86_64/multiarch/memchr-evex.S | 161 ++++++++++++++----
|
|
||||||
sysdeps/x86_64/multiarch/memchr.c | 2 +-
|
|
||||||
sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S | 3 +
|
|
||||||
sysdeps/x86_64/multiarch/rawmemchr.c | 2 +-
|
|
||||||
sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S | 3 +
|
|
||||||
sysdeps/x86_64/multiarch/wmemchr.c | 2 +-
|
|
||||||
10 files changed, 217 insertions(+), 41 deletions(-)
|
|
||||||
create mode 100644 sysdeps/x86_64/multiarch/ifunc-evex.h
|
|
||||||
create mode 100644 sysdeps/x86_64/multiarch/memchr-evex-rtm.S
|
|
||||||
create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
|
|
||||||
create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
|
|
||||||
index 65fde4eb..26be4095 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/Makefile
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/Makefile
|
|
||||||
@@ -77,7 +77,9 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
|
|
||||||
strncmp-evex \
|
|
||||||
strncpy-evex \
|
|
||||||
strnlen-evex \
|
|
||||||
- strrchr-evex
|
|
||||||
+ strrchr-evex \
|
|
||||||
+ memchr-evex-rtm \
|
|
||||||
+ rawmemchr-evex-rtm
|
|
||||||
CFLAGS-varshift.c += -msse4
|
|
||||||
CFLAGS-strcspn-c.c += -msse4
|
|
||||||
CFLAGS-strpbrk-c.c += -msse4
|
|
||||||
@@ -110,7 +112,8 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
|
|
||||||
wcsnlen-evex \
|
|
||||||
wcsrchr-evex \
|
|
||||||
wmemchr-evex \
|
|
||||||
- wmemcmp-evex-movbe
|
|
||||||
+ wmemcmp-evex-movbe \
|
|
||||||
+ wmemchr-evex-rtm
|
|
||||||
endif
|
|
||||||
|
|
||||||
ifeq ($(subdir),debug)
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-evex.h b/sysdeps/x86_64/multiarch/ifunc-evex.h
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..fc391edb
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/ifunc-evex.h
|
|
||||||
@@ -0,0 +1,55 @@
|
|
||||||
+/* Common definition for ifunc selection optimized with EVEX.
|
|
||||||
+ All versions must be listed in ifunc-impl-list.c.
|
|
||||||
+ Copyright (C) 2017-2021 Free Software Foundation, Inc.
|
|
||||||
+ This file is part of the GNU C Library.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is free software; you can redistribute it and/or
|
|
||||||
+ modify it under the terms of the GNU Lesser General Public
|
|
||||||
+ License as published by the Free Software Foundation; either
|
|
||||||
+ version 2.1 of the License, or (at your option) any later version.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
||||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
+ Lesser General Public License for more details.
|
|
||||||
+
|
|
||||||
+ You should have received a copy of the GNU Lesser General Public
|
|
||||||
+ License along with the GNU C Library; if not, see
|
|
||||||
+ <https://www.gnu.org/licenses/>. */
|
|
||||||
+
|
|
||||||
+#include <init-arch.h>
|
|
||||||
+
|
|
||||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
|
|
||||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
|
|
||||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
|
|
||||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
|
|
||||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_rtm) attribute_hidden;
|
|
||||||
+
|
|
||||||
+
|
|
||||||
+static inline void *
|
|
||||||
+IFUNC_SELECTOR (void)
|
|
||||||
+{
|
|
||||||
+ const struct cpu_features* cpu_features = __get_cpu_features ();
|
|
||||||
+
|
|
||||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
|
|
||||||
+ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
|
|
||||||
+ && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
|
|
||||||
+ {
|
|
||||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
|
|
||||||
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
|
|
||||||
+ {
|
|
||||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
|
|
||||||
+ return OPTIMIZE (evex_rtm);
|
|
||||||
+
|
|
||||||
+ return OPTIMIZE (evex);
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
|
|
||||||
+ return OPTIMIZE (avx2_rtm);
|
|
||||||
+
|
|
||||||
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
|
|
||||||
+ return OPTIMIZE (avx2);
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ return OPTIMIZE (sse2);
|
|
||||||
+}
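
For readers unfamiliar with ifunc dispatch, here is a standalone model of what a selector like this plugs into, using GCC's ifunc attribute rather than the glibc-internal macros (the fallback body and all names are illustrative only):

    #include <stddef.h>

    static void *
    memchr_fallback (const void *s, int c, size_t n)
    {
      const unsigned char *p = s;
      while (n--)
        if (*p++ == (unsigned char) c)
          return (void *) (p - 1);
      return NULL;
    }

    /* The resolver runs once at relocation time; a real one would test
       AVX512VL/AVX512BW/BMI2/RTM as IFUNC_SELECTOR does above.  */
    static void *(*resolve_memchr (void)) (const void *, int, size_t)
    {
      return memchr_fallback;
    }

    void *my_memchr (const void *s, int c, size_t n)
      __attribute__ ((ifunc ("resolve_memchr")));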
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
||||||
index d59d65f8..ac097e8d 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
||||||
@@ -52,6 +52,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
||||||
&& CPU_FEATURE_USABLE (AVX512BW)
|
|
||||||
&& CPU_FEATURE_USABLE (BMI2)),
|
|
||||||
__memchr_evex)
|
|
||||||
+ IFUNC_IMPL_ADD (array, i, memchr,
|
|
||||||
+ (CPU_FEATURE_USABLE (AVX512VL)
|
|
||||||
+ && CPU_FEATURE_USABLE (AVX512BW)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
|
||||||
+ __memchr_evex_rtm)
|
|
||||||
IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_sse2))
|
|
||||||
|
|
||||||
/* Support sysdeps/x86_64/multiarch/memcmp.c. */
|
|
||||||
@@ -288,6 +293,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
||||||
&& CPU_FEATURE_USABLE (AVX512BW)
|
|
||||||
&& CPU_FEATURE_USABLE (BMI2)),
|
|
||||||
__rawmemchr_evex)
|
|
||||||
+ IFUNC_IMPL_ADD (array, i, rawmemchr,
|
|
||||||
+ (CPU_FEATURE_USABLE (AVX512VL)
|
|
||||||
+ && CPU_FEATURE_USABLE (AVX512BW)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
|
||||||
+ __rawmemchr_evex_rtm)
|
|
||||||
IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_sse2))
|
|
||||||
|
|
||||||
/* Support sysdeps/x86_64/multiarch/strlen.c. */
|
|
||||||
@@ -711,6 +721,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
||||||
&& CPU_FEATURE_USABLE (AVX512BW)
|
|
||||||
&& CPU_FEATURE_USABLE (BMI2)),
|
|
||||||
__wmemchr_evex)
|
|
||||||
+ IFUNC_IMPL_ADD (array, i, wmemchr,
|
|
||||||
+ (CPU_FEATURE_USABLE (AVX512VL)
|
|
||||||
+ && CPU_FEATURE_USABLE (AVX512BW)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
|
||||||
+ __wmemchr_evex_rtm)
|
|
||||||
IFUNC_IMPL_ADD (array, i, wmemchr, 1, __wmemchr_sse2))
|
|
||||||
|
|
||||||
/* Support sysdeps/x86_64/multiarch/wmemcmp.c. */
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/memchr-evex-rtm.S b/sysdeps/x86_64/multiarch/memchr-evex-rtm.S
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..19871882
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/memchr-evex-rtm.S
|
|
||||||
@@ -0,0 +1,8 @@
|
|
||||||
+#ifndef MEMCHR
|
|
||||||
+# define MEMCHR __memchr_evex_rtm
|
|
||||||
+#endif
|
|
||||||
+
|
|
||||||
+#define USE_IN_RTM 1
|
|
||||||
+#define SECTION(p) p##.evex.rtm
|
|
||||||
+
|
|
||||||
+#include "memchr-evex.S"
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
|
|
||||||
index f3fdad4f..4d0ed6d1 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/memchr-evex.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
|
|
||||||
@@ -38,10 +38,32 @@
|
|
||||||
# define CHAR_SIZE 1
|
|
||||||
# endif
|
|
||||||
|
|
||||||
+ /* In the 4x loop the RTM and non-RTM versions have data pointer
|
|
||||||
+ off by VEC_SIZE * 4 with RTM version being VEC_SIZE * 4 greater.
|
|
||||||
+ This is represented by BASE_OFFSET. As well because the RTM
|
|
||||||
+ version uses vpcmp which stores a bit per element compared where
|
|
||||||
+ the non-RTM version uses vpcmpeq which stores a bit per byte
|
|
||||||
+ compared RET_SCALE of CHAR_SIZE is only relevant for the RTM
|
|
||||||
+ version. */
|
|
||||||
+# ifdef USE_IN_RTM
|
|
||||||
+# define VZEROUPPER
|
|
||||||
+# define BASE_OFFSET (VEC_SIZE * 4)
|
|
||||||
+# define RET_SCALE CHAR_SIZE
|
|
||||||
+# else
|
|
||||||
+# define VZEROUPPER vzeroupper
|
|
||||||
+# define BASE_OFFSET 0
|
|
||||||
+# define RET_SCALE 1
|
|
||||||
+# endif
|
|
||||||
+
|
|
||||||
+ /* In the return from 4x loop memchr and rawmemchr versions have
|
|
||||||
+ data pointers off by VEC_SIZE * 4 with memchr version being
|
|
||||||
+ VEC_SIZE * 4 greater. */
|
|
||||||
# ifdef USE_AS_RAWMEMCHR
|
|
||||||
+# define RET_OFFSET (BASE_OFFSET - (VEC_SIZE * 4))
|
|
||||||
# define RAW_PTR_REG rcx
|
|
||||||
# define ALGN_PTR_REG rdi
|
|
||||||
# else
|
|
||||||
+# define RET_OFFSET BASE_OFFSET
|
|
||||||
# define RAW_PTR_REG rdi
|
|
||||||
# define ALGN_PTR_REG rcx
|
|
||||||
# endif
|
|
||||||
@@ -57,11 +79,15 @@
|
|
||||||
# define YMM5 ymm21
|
|
||||||
# define YMM6 ymm22
|
|
||||||
|
|
||||||
+# ifndef SECTION
|
|
||||||
+# define SECTION(p) p##.evex
|
|
||||||
+# endif
|
|
||||||
+
|
|
||||||
# define VEC_SIZE 32
|
|
||||||
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
|
|
||||||
# define PAGE_SIZE 4096
|
|
||||||
|
|
||||||
- .section .text.evex,"ax",@progbits
|
|
||||||
+ .section SECTION(.text),"ax",@progbits
|
|
||||||
ENTRY (MEMCHR)
|
|
||||||
# ifndef USE_AS_RAWMEMCHR
|
|
||||||
/* Check for zero length. */
|
|
||||||
@@ -237,14 +263,15 @@ L(cross_page_continue):
|
|
||||||
/* Check if at last CHAR_PER_VEC * 4 length. */
|
|
||||||
subq $(CHAR_PER_VEC * 4), %rdx
|
|
||||||
jbe L(last_4x_vec_or_less_cmpeq)
|
|
||||||
- addq $VEC_SIZE, %rdi
|
|
||||||
+ /* +VEC_SIZE if USE_IN_RTM otherwise +VEC_SIZE * 5. */
|
|
||||||
+ addq $(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi
|
|
||||||
|
|
||||||
/* Align data to VEC_SIZE * 4 for the loop and readjust length.
|
|
||||||
*/
|
|
||||||
# ifdef USE_AS_WMEMCHR
|
|
||||||
movl %edi, %ecx
|
|
||||||
andq $-(4 * VEC_SIZE), %rdi
|
|
||||||
- andl $(VEC_SIZE * 4 - 1), %ecx
|
|
||||||
+ subl %edi, %ecx
|
|
||||||
/* NB: Divide bytes by 4 to get the wchar_t count. */
|
|
||||||
sarl $2, %ecx
|
|
||||||
addq %rcx, %rdx
|
|
||||||
@@ -254,15 +281,28 @@ L(cross_page_continue):
|
|
||||||
subq %rdi, %rdx
|
|
||||||
# endif
|
|
||||||
# else
|
|
||||||
- addq $VEC_SIZE, %rdi
|
|
||||||
+ addq $(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi
|
|
||||||
andq $-(4 * VEC_SIZE), %rdi
|
|
||||||
# endif
|
|
||||||
-
|
|
||||||
+# ifdef USE_IN_RTM
|
|
||||||
vpxorq %XMMZERO, %XMMZERO, %XMMZERO
|
|
||||||
+# else
|
|
||||||
+ /* copy ymmmatch to ymm0 so we can use vpcmpeq which is not
|
|
||||||
+ encodable with EVEX registers (ymm16-ymm31). */
|
|
||||||
+ vmovdqa64 %YMMMATCH, %ymm0
|
|
||||||
+# endif
|
|
||||||
|
|
||||||
/* Compare 4 * VEC at a time forward. */
|
|
||||||
.p2align 4
|
|
||||||
L(loop_4x_vec):
|
|
||||||
+ /* Two versions of the loop. One that does not require
|
|
||||||
+         vzeroupper by not using ymm0-ymm15 and another that does require
|
|
||||||
+ vzeroupper because it uses ymm0-ymm15. The reason why ymm0-ymm15
|
|
||||||
+ is used at all is because there is no EVEX encoding vpcmpeq and
|
|
||||||
+ with vpcmpeq this loop can be performed more efficiently. The
|
|
||||||
+ non-vzeroupper version is safe for RTM while the vzeroupper
|
|
||||||
+         version should be preferred if RTM is not supported.  */
|
|
||||||
+# ifdef USE_IN_RTM
|
|
||||||
/* It would be possible to save some instructions using 4x VPCMP
|
|
||||||
          but bottleneck on port 5 makes it not worth it.  */
|
|
||||||
VPCMP $4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
|
|
||||||
@@ -273,12 +313,55 @@ L(loop_4x_vec):
|
|
||||||
/* Reduce VEC2 / VEC3 with min and VEC1 with zero mask. */
|
|
||||||
VPMINU %YMM2, %YMM3, %YMM3{%k1}{z}
|
|
||||||
VPCMP $0, %YMM3, %YMMZERO, %k2
|
|
||||||
+# else
|
|
||||||
+      /* Since vptern can only take 3x vectors, it is fastest to do 1 vec
|
|
||||||
+         separately with EVEX vpcmp.  */
|
|
||||||
+# ifdef USE_AS_WMEMCHR
|
|
||||||
+ /* vptern can only accept masks for epi32/epi64 so can only save
|
|
||||||
+ instruction using not equals mask on vptern with wmemchr. */
|
|
||||||
+ VPCMP $4, (%rdi), %YMMMATCH, %k1
|
|
||||||
+# else
|
|
||||||
+ VPCMP $0, (%rdi), %YMMMATCH, %k1
|
|
||||||
+# endif
|
|
||||||
+ /* Compare 3x with vpcmpeq and or them all together with vptern.
|
|
||||||
+ */
|
|
||||||
+ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
|
|
||||||
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
|
|
||||||
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
|
|
||||||
+# ifdef USE_AS_WMEMCHR
|
|
||||||
+ /* This takes the not of or between ymm2, ymm3, ymm4 as well as
|
|
||||||
+ combines result from VEC0 with zero mask. */
|
|
||||||
+ vpternlogd $1, %ymm2, %ymm3, %ymm4{%k1}{z}
|
|
||||||
+ vpmovmskb %ymm4, %ecx
|
|
||||||
+# else
|
|
||||||
+ /* 254 is mask for oring ymm2, ymm3, ymm4 into ymm4. */
|
|
||||||
+ vpternlogd $254, %ymm2, %ymm3, %ymm4
|
|
||||||
+ vpmovmskb %ymm4, %ecx
|
|
||||||
+ kmovd %k1, %eax
|
|
||||||
+# endif
|
|
||||||
+# endif
|
|
||||||
+
|
|
||||||
# ifdef USE_AS_RAWMEMCHR
|
|
||||||
subq $-(VEC_SIZE * 4), %rdi
|
|
||||||
+# endif
|
|
||||||
+# ifdef USE_IN_RTM
|
|
||||||
kortestd %k2, %k3
|
|
||||||
+# else
|
|
||||||
+# ifdef USE_AS_WMEMCHR
|
|
||||||
+      /* ecx contains the NOT of the matches.  All 1s means no matches.  incl will
|
|
||||||
+         overflow and set the zero flag if that is the case.  */
|
|
||||||
+ incl %ecx
|
|
||||||
+# else
|
|
||||||
+ /* If either VEC1 (eax) or VEC2-VEC4 (ecx) are not zero. Adding
|
|
||||||
+ to ecx is not an issue because if eax is non-zero it will be
|
|
||||||
+ used for returning the match. If it is zero the add does
|
|
||||||
+ nothing. */
|
|
||||||
+ addq %rax, %rcx
|
|
||||||
+# endif
|
|
||||||
+# endif
|
|
||||||
+# ifdef USE_AS_RAWMEMCHR
|
|
||||||
jz L(loop_4x_vec)
|
|
||||||
# else
|
|
||||||
- kortestd %k2, %k3
|
|
||||||
jnz L(loop_4x_vec_end)
|
|
||||||
|
|
||||||
subq $-(VEC_SIZE * 4), %rdi
|
|
||||||
@@ -288,10 +371,11 @@ L(loop_4x_vec):
|
|
||||||
|
|
||||||
/* Fall through into less than 4 remaining vectors of length case.
|
|
||||||
*/
|
|
||||||
- VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
|
|
||||||
+ VPCMP $0, BASE_OFFSET(%rdi), %YMMMATCH, %k0
|
|
||||||
+ addq $(BASE_OFFSET - VEC_SIZE), %rdi
|
|
||||||
kmovd %k0, %eax
|
|
||||||
- addq $(VEC_SIZE * 3), %rdi
|
|
||||||
- .p2align 4
|
|
||||||
+ VZEROUPPER
|
|
||||||
+
|
|
||||||
L(last_4x_vec_or_less):
|
|
||||||
/* Check if first VEC contained match. */
|
|
||||||
testl %eax, %eax
|
|
||||||
@@ -338,73 +422,78 @@ L(loop_4x_vec_end):
|
|
||||||
/* rawmemchr will fall through into this if match was found in
|
|
||||||
loop. */
|
|
||||||
|
|
||||||
+# if defined USE_IN_RTM || defined USE_AS_WMEMCHR
|
|
||||||
/* k1 has not of matches with VEC1. */
|
|
||||||
kmovd %k1, %eax
|
|
||||||
-# ifdef USE_AS_WMEMCHR
|
|
||||||
+# ifdef USE_AS_WMEMCHR
|
|
||||||
subl $((1 << CHAR_PER_VEC) - 1), %eax
|
|
||||||
-# else
|
|
||||||
+# else
|
|
||||||
incl %eax
|
|
||||||
+# endif
|
|
||||||
+# else
|
|
||||||
+ /* eax already has matches for VEC1. */
|
|
||||||
+ testl %eax, %eax
|
|
||||||
# endif
|
|
||||||
jnz L(last_vec_x1_return)
|
|
||||||
|
|
||||||
+# ifdef USE_IN_RTM
|
|
||||||
VPCMP $0, %YMM2, %YMMZERO, %k0
|
|
||||||
kmovd %k0, %eax
|
|
||||||
+# else
|
|
||||||
+ vpmovmskb %ymm2, %eax
|
|
||||||
+# endif
|
|
||||||
testl %eax, %eax
|
|
||||||
jnz L(last_vec_x2_return)
|
|
||||||
|
|
||||||
+# ifdef USE_IN_RTM
|
|
||||||
kmovd %k2, %eax
|
|
||||||
testl %eax, %eax
|
|
||||||
jnz L(last_vec_x3_return)
|
|
||||||
|
|
||||||
kmovd %k3, %eax
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
-# ifdef USE_AS_RAWMEMCHR
|
|
||||||
- leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
+ leaq (VEC_SIZE * 3 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
# else
|
|
||||||
- leaq (VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
+ vpmovmskb %ymm3, %eax
|
|
||||||
+ /* Combine matches in VEC3 (eax) with matches in VEC4 (ecx). */
|
|
||||||
+ salq $VEC_SIZE, %rcx
|
|
||||||
+ orq %rcx, %rax
|
|
||||||
+ tzcntq %rax, %rax
|
|
||||||
+ leaq (VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax), %rax
|
|
||||||
+ VZEROUPPER
|
|
||||||
# endif
|
|
||||||
ret
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
L(last_vec_x1_return):
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
-# ifdef USE_AS_RAWMEMCHR
|
|
||||||
-# ifdef USE_AS_WMEMCHR
|
|
||||||
+# if defined USE_AS_WMEMCHR || RET_OFFSET != 0
|
|
||||||
/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
|
|
||||||
- leaq (%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
-# else
|
|
||||||
- addq %rdi, %rax
|
|
||||||
-# endif
|
|
||||||
+ leaq RET_OFFSET(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
# else
|
|
||||||
- /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
|
|
||||||
- leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
+ addq %rdi, %rax
|
|
||||||
# endif
|
|
||||||
+ VZEROUPPER
|
|
||||||
ret
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
L(last_vec_x2_return):
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
-# ifdef USE_AS_RAWMEMCHR
|
|
||||||
- /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
|
|
||||||
- leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
-# else
|
|
||||||
- /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
|
|
||||||
- leaq (VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
-# endif
|
|
||||||
+ /* NB: Multiply bytes by RET_SCALE to get the wchar_t count
|
|
||||||
+ if relevant (RET_SCALE = CHAR_SIZE if USE_AS_WMEMCHAR and
|
|
||||||
+ USE_IN_RTM are both defined. Otherwise RET_SCALE = 1. */
|
|
||||||
+ leaq (VEC_SIZE + RET_OFFSET)(%rdi, %rax, RET_SCALE), %rax
|
|
||||||
+ VZEROUPPER
|
|
||||||
ret
|
|
||||||
|
|
||||||
+# ifdef USE_IN_RTM
|
|
||||||
.p2align 4
|
|
||||||
L(last_vec_x3_return):
|
|
||||||
tzcntl %eax, %eax
|
|
||||||
-# ifdef USE_AS_RAWMEMCHR
|
|
||||||
- /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
|
|
||||||
- leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
-# else
|
|
||||||
/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
|
|
||||||
- leaq (VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
-# endif
|
|
||||||
+ leaq (VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
ret
|
|
||||||
-
|
|
||||||
+# endif
|
|
||||||
|
|
||||||
# ifndef USE_AS_RAWMEMCHR
|
|
||||||
L(last_4x_vec_or_less_cmpeq):
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/memchr.c b/sysdeps/x86_64/multiarch/memchr.c
|
|
||||||
index 016f5784..f28aea77 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/memchr.c
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/memchr.c
|
|
||||||
@@ -24,7 +24,7 @@
|
|
||||||
# undef memchr
|
|
||||||
|
|
||||||
# define SYMBOL_NAME memchr
|
|
||||||
-# include "ifunc-avx2.h"
|
|
||||||
+# include "ifunc-evex.h"
|
|
||||||
|
|
||||||
libc_ifunc_redirected (__redirect_memchr, memchr, IFUNC_SELECTOR ());
|
|
||||||
strong_alias (memchr, __memchr)
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..deda1ca3
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
|
|
||||||
@@ -0,0 +1,3 @@
|
|
||||||
+#define MEMCHR __rawmemchr_evex_rtm
|
|
||||||
+#define USE_AS_RAWMEMCHR 1
|
|
||||||
+#include "memchr-evex-rtm.S"
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/rawmemchr.c b/sysdeps/x86_64/multiarch/rawmemchr.c
|
|
||||||
index 8a0bc313..1f764f35 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/rawmemchr.c
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/rawmemchr.c
|
|
||||||
@@ -26,7 +26,7 @@
|
|
||||||
# undef __rawmemchr
|
|
||||||
|
|
||||||
# define SYMBOL_NAME rawmemchr
|
|
||||||
-# include "ifunc-avx2.h"
|
|
||||||
+# include "ifunc-evex.h"
|
|
||||||
|
|
||||||
libc_ifunc_redirected (__redirect_rawmemchr, __rawmemchr,
|
|
||||||
IFUNC_SELECTOR ());
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..a346cd35
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S
|
|
||||||
@@ -0,0 +1,3 @@
|
|
||||||
+#define MEMCHR __wmemchr_evex_rtm
|
|
||||||
+#define USE_AS_WMEMCHR 1
|
|
||||||
+#include "memchr-evex-rtm.S"
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/wmemchr.c b/sysdeps/x86_64/multiarch/wmemchr.c
|
|
||||||
index 6d833702..f9c91915 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/wmemchr.c
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/wmemchr.c
|
|
||||||
@@ -26,7 +26,7 @@
|
|
||||||
# undef __wmemchr
|
|
||||||
|
|
||||||
# define SYMBOL_NAME wmemchr
|
|
||||||
-# include "ifunc-avx2.h"
|
|
||||||
+# include "ifunc-evex.h"
|
|
||||||
|
|
||||||
libc_ifunc_redirected (__redirect_wmemchr, __wmemchr, IFUNC_SELECTOR ());
|
|
||||||
weak_alias (__wmemchr, wmemchr)
--
GitLab

@ -1,873 +0,0 @@
From 16d12015c57701b08d7bbed6ec536641bcafb428 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
Date: Mon, 17 May 2021 13:56:52 -0400
|
|
||||||
Subject: [PATCH] x86: Optimize memcmp-avx2-movbe.S
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
No bug. This commit optimizes memcmp-avx2.S. The optimizations include
|
|
||||||
adding a new vec compare path for small sizes, reorganizing the entry
|
|
||||||
control flow, and removing some unnecessary ALU instructions from the
|
|
||||||
main loop. test-memcmp and test-wmemcmp are both passing.
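
One of the retained tricks, the branchless small-size path, is worth a scalar sketch: overlapping big-endian loads make numeric order equal memcmp's lexicographic order, so no per-length branches are needed. For the 4-to-7-byte case (load_be32 and the function name are inventions of this sketch, standing in for movbe/bswap):

    #include <stdint.h>
    #include <stddef.h>

    static uint32_t
    load_be32 (const unsigned char *p)
    {
      return ((uint32_t) p[0] << 24) | ((uint32_t) p[1] << 16)
             | ((uint32_t) p[2] << 8) | (uint32_t) p[3];
    }

    /* Two overlapping loads cover any n in [4, 7].  */
    static int
    cmp_4_to_7 (const unsigned char *s1, const unsigned char *s2, size_t n)
    {
      uint64_t a = ((uint64_t) load_be32 (s1) << 32) | load_be32 (s1 + n - 4);
      uint64_t b = ((uint64_t) load_be32 (s2) << 32) | load_be32 (s2 + n - 4);
      return (a > b) - (a < b);
    }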
|
|
||||||
|
|
||||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 6 +
|
|
||||||
sysdeps/x86_64/multiarch/ifunc-memcmp.h | 1 +
|
|
||||||
sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 676 +++++++++++--------
|
|
||||||
3 files changed, 402 insertions(+), 281 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
||||||
index ac097e8d..8be0d78a 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
||||||
@@ -63,16 +63,19 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
||||||
IFUNC_IMPL (i, name, memcmp,
|
|
||||||
IFUNC_IMPL_ADD (array, i, memcmp,
|
|
||||||
(CPU_FEATURE_USABLE (AVX2)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)
|
|
||||||
&& CPU_FEATURE_USABLE (MOVBE)),
|
|
||||||
__memcmp_avx2_movbe)
|
|
||||||
IFUNC_IMPL_ADD (array, i, memcmp,
|
|
||||||
(CPU_FEATURE_USABLE (AVX2)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)
|
|
||||||
&& CPU_FEATURE_USABLE (MOVBE)
|
|
||||||
&& CPU_FEATURE_USABLE (RTM)),
|
|
||||||
__memcmp_avx2_movbe_rtm)
|
|
||||||
IFUNC_IMPL_ADD (array, i, memcmp,
|
|
||||||
(CPU_FEATURE_USABLE (AVX512VL)
|
|
||||||
&& CPU_FEATURE_USABLE (AVX512BW)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)
|
|
||||||
&& CPU_FEATURE_USABLE (MOVBE)),
|
|
||||||
__memcmp_evex_movbe)
|
|
||||||
IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1),
|
|
||||||
@@ -732,16 +735,19 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
||||||
IFUNC_IMPL (i, name, wmemcmp,
|
|
||||||
IFUNC_IMPL_ADD (array, i, wmemcmp,
|
|
||||||
(CPU_FEATURE_USABLE (AVX2)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)
|
|
||||||
&& CPU_FEATURE_USABLE (MOVBE)),
|
|
||||||
__wmemcmp_avx2_movbe)
|
|
||||||
IFUNC_IMPL_ADD (array, i, wmemcmp,
|
|
||||||
(CPU_FEATURE_USABLE (AVX2)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)
|
|
||||||
&& CPU_FEATURE_USABLE (MOVBE)
|
|
||||||
&& CPU_FEATURE_USABLE (RTM)),
|
|
||||||
__wmemcmp_avx2_movbe_rtm)
|
|
||||||
IFUNC_IMPL_ADD (array, i, wmemcmp,
|
|
||||||
(CPU_FEATURE_USABLE (AVX512VL)
|
|
||||||
&& CPU_FEATURE_USABLE (AVX512BW)
|
|
||||||
+ && CPU_FEATURE_USABLE (BMI2)
|
|
||||||
&& CPU_FEATURE_USABLE (MOVBE)),
|
|
||||||
__wmemcmp_evex_movbe)
|
|
||||||
IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1),
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
|
|
||||||
index 8043c635..690dffe8 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
|
|
||||||
@@ -33,6 +33,7 @@ IFUNC_SELECTOR (void)
|
|
||||||
|
|
||||||
if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
|
|
||||||
&& CPU_FEATURE_USABLE_P (cpu_features, MOVBE)
|
|
||||||
+ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
|
|
||||||
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
|
|
||||||
{
|
|
||||||
if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
|
|
||||||
index 9d5c9c72..16fc673e 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
|
|
||||||
@@ -19,17 +19,23 @@
|
|
||||||
#if IS_IN (libc)
|
|
||||||
|
|
||||||
/* memcmp/wmemcmp is implemented as:
|
|
||||||
- 1. For size from 2 to 7 bytes, load as big endian with movbe and bswap
|
|
||||||
- to avoid branches.
|
|
||||||
- 2. Use overlapping compare to avoid branch.
|
|
||||||
- 3. Use vector compare when size >= 4 bytes for memcmp or size >= 8
|
|
||||||
- bytes for wmemcmp.
|
|
||||||
- 4. If size is 8 * VEC_SIZE or less, unroll the loop.
|
|
||||||
- 5. Compare 4 * VEC_SIZE at a time with the aligned first memory
|
|
||||||
+ 1. Use ymm vector compares when possible. The only case where
|
|
||||||
+         vector compares are not possible is when size < VEC_SIZE
|
|
||||||
+ and loading from either s1 or s2 would cause a page cross.
|
|
||||||
+ 2. For size from 2 to 7 bytes on page cross, load as big endian
|
|
||||||
+ with movbe and bswap to avoid branches.
|
|
||||||
+ 3. Use xmm vector compare when size >= 4 bytes for memcmp or
|
|
||||||
+ size >= 8 bytes for wmemcmp.
|
|
||||||
+ 4. Optimistically compare up to first 4 * VEC_SIZE one at a
|
|
||||||
+         time to check for early mismatches.  Only do this if it's guaranteed the
|
|
||||||
+ work is not wasted.
|
|
||||||
+ 5. If size is 8 * VEC_SIZE or less, unroll the loop.
|
|
||||||
+ 6. Compare 4 * VEC_SIZE at a time with the aligned first memory
|
|
||||||
area.
|
|
||||||
- 6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
|
|
||||||
- 7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
|
|
||||||
- 8. Use 8 vector compares when size is 8 * VEC_SIZE or less. */
|
|
||||||
+ 7. Use 2 vector compares when size is 2 * VEC_SIZE or less.
|
|
||||||
+ 8. Use 4 vector compares when size is 4 * VEC_SIZE or less.
|
|
||||||
+ 9. Use 8 vector compares when size is 8 * VEC_SIZE or less. */
|
|
||||||
+
|
|
||||||
|
|
||||||
# include <sysdep.h>
|
|
||||||
|
|
||||||
@@ -38,8 +44,10 @@
|
|
||||||
# endif
|
|
||||||
|
|
||||||
# ifdef USE_AS_WMEMCMP
|
|
||||||
+# define CHAR_SIZE 4
|
|
||||||
# define VPCMPEQ vpcmpeqd
|
|
||||||
# else
|
|
||||||
+# define CHAR_SIZE 1
|
|
||||||
# define VPCMPEQ vpcmpeqb
|
|
||||||
# endif
|
|
||||||
|
|
||||||
@@ -52,7 +60,7 @@
|
|
||||||
# endif
|
|
||||||
|
|
||||||
# define VEC_SIZE 32
|
|
||||||
-# define VEC_MASK ((1 << VEC_SIZE) - 1)
|
|
||||||
+# define PAGE_SIZE 4096
|
|
||||||
|
|
||||||
/* Warning!
|
|
||||||
wmemcmp has to use SIGNED comparison for elements.
|
|
||||||
@@ -71,136 +79,359 @@ ENTRY (MEMCMP)
|
|
||||||
jb L(less_vec)
|
|
||||||
|
|
||||||
/* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
|
|
||||||
- vmovdqu (%rsi), %ymm2
|
|
||||||
- VPCMPEQ (%rdi), %ymm2, %ymm2
|
|
||||||
- vpmovmskb %ymm2, %eax
|
|
||||||
- subl $VEC_MASK, %eax
|
|
||||||
- jnz L(first_vec)
|
|
||||||
+ vmovdqu (%rsi), %ymm1
|
|
||||||
+ VPCMPEQ (%rdi), %ymm1, %ymm1
|
|
||||||
+ vpmovmskb %ymm1, %eax
|
|
||||||
+ /* NB: eax must be destination register if going to
|
|
||||||
+         L(return_vec_[0,2]).  For L(return_vec_3) the destination register
|
|
||||||
+ must be ecx. */
|
|
||||||
+ incl %eax
|
|
||||||
+ jnz L(return_vec_0)
|
|
||||||
|
|
||||||
cmpq $(VEC_SIZE * 2), %rdx
|
|
||||||
- jbe L(last_vec)
|
|
||||||
-
|
|
||||||
- VPCMPEQ %ymm0, %ymm0, %ymm0
|
|
||||||
- /* More than 2 * VEC. */
|
|
||||||
- cmpq $(VEC_SIZE * 8), %rdx
|
|
||||||
- ja L(more_8x_vec)
|
|
||||||
- cmpq $(VEC_SIZE * 4), %rdx
|
|
||||||
- jb L(last_4x_vec)
|
|
||||||
-
|
|
||||||
- /* From 4 * VEC to 8 * VEC, inclusively. */
|
|
||||||
- vmovdqu (%rsi), %ymm1
|
|
||||||
- VPCMPEQ (%rdi), %ymm1, %ymm1
|
|
||||||
+ jbe L(last_1x_vec)
|
|
||||||
|
|
||||||
+ /* Check second VEC no matter what. */
|
|
||||||
vmovdqu VEC_SIZE(%rsi), %ymm2
|
|
||||||
- VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
|
|
||||||
+ VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
|
|
||||||
+ vpmovmskb %ymm2, %eax
|
|
||||||
+      /* If all 4 VEC were equal eax will be all 1s so incl will
|
|
||||||
+ overflow and set zero flag. */
|
|
||||||
+ incl %eax
|
|
||||||
+ jnz L(return_vec_1)
|
|
||||||
|
|
||||||
- vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
|
|
||||||
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
|
|
||||||
+ /* Less than 4 * VEC. */
|
|
||||||
+ cmpq $(VEC_SIZE * 4), %rdx
|
|
||||||
+ jbe L(last_2x_vec)
|
|
||||||
|
|
||||||
+ /* Check third and fourth VEC no matter what. */
|
|
||||||
+ vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
|
|
||||||
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
|
|
||||||
+ vpmovmskb %ymm3, %eax
|
|
||||||
+ incl %eax
|
|
||||||
+ jnz L(return_vec_2)
|
|
||||||
vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
|
|
||||||
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
|
|
||||||
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
|
|
||||||
+ vpmovmskb %ymm4, %ecx
|
|
||||||
+ incl %ecx
|
|
||||||
+ jnz L(return_vec_3)
|
|
||||||
|
|
||||||
- vpand %ymm1, %ymm2, %ymm5
|
|
||||||
- vpand %ymm3, %ymm4, %ymm6
|
|
||||||
- vpand %ymm5, %ymm6, %ymm5
|
|
||||||
+ /* Go to 4x VEC loop. */
|
|
||||||
+ cmpq $(VEC_SIZE * 8), %rdx
|
|
||||||
+ ja L(more_8x_vec)
|
|
||||||
|
|
||||||
- vptest %ymm0, %ymm5
|
|
||||||
- jnc L(4x_vec_end)
|
|
||||||
+ /* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
|
|
||||||
+ branches. */
|
|
||||||
|
|
||||||
+ /* Load first two VEC from s2 before adjusting addresses. */
|
|
||||||
+ vmovdqu -(VEC_SIZE * 4)(%rsi, %rdx), %ymm1
|
|
||||||
+ vmovdqu -(VEC_SIZE * 3)(%rsi, %rdx), %ymm2
|
|
||||||
leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi
|
|
||||||
leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi
|
|
||||||
- vmovdqu (%rsi), %ymm1
|
|
||||||
- VPCMPEQ (%rdi), %ymm1, %ymm1
|
|
||||||
|
|
||||||
- vmovdqu VEC_SIZE(%rsi), %ymm2
|
|
||||||
- VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
|
|
||||||
- vpand %ymm2, %ymm1, %ymm5
|
|
||||||
+      /* Wait to load from s1 until addresses are adjusted, due to
|
|
||||||
+ unlamination of microfusion with complex address mode. */
|
|
||||||
+ VPCMPEQ (%rdi), %ymm1, %ymm1
|
|
||||||
+ VPCMPEQ (VEC_SIZE)(%rdi), %ymm2, %ymm2
|
|
||||||
|
|
||||||
vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
|
|
||||||
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
|
|
||||||
- vpand %ymm3, %ymm5, %ymm5
|
|
||||||
-
|
|
||||||
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
|
|
||||||
vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
|
|
||||||
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
|
|
||||||
- vpand %ymm4, %ymm5, %ymm5
|
|
||||||
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
|
|
||||||
|
|
||||||
- vptest %ymm0, %ymm5
|
|
||||||
- jnc L(4x_vec_end)
|
|
||||||
- xorl %eax, %eax
|
|
||||||
+ /* Reduce VEC0 - VEC4. */
|
|
||||||
+ vpand %ymm1, %ymm2, %ymm5
|
|
||||||
+ vpand %ymm3, %ymm4, %ymm6
|
|
||||||
+ vpand %ymm5, %ymm6, %ymm7
|
|
||||||
+ vpmovmskb %ymm7, %ecx
|
|
||||||
+ incl %ecx
|
|
||||||
+ jnz L(return_vec_0_1_2_3)
|
|
||||||
+ /* NB: eax must be zero to reach here. */
|
|
||||||
+ VZEROUPPER_RETURN
|
|
||||||
+
|
|
||||||
+ .p2align 4
|
|
||||||
+L(return_vec_0):
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+# ifdef USE_AS_WMEMCMP
|
|
||||||
+ movl (%rdi, %rax), %ecx
|
|
||||||
+ xorl %edx, %edx
|
|
||||||
+ cmpl (%rsi, %rax), %ecx
|
|
||||||
+ /* NB: no partial register stall here because xorl zero idiom
|
|
||||||
+ above. */
|
|
||||||
+ setg %dl
|
|
||||||
+ leal -1(%rdx, %rdx), %eax
|
|
||||||
+# else
|
|
||||||
+ movzbl (%rsi, %rax), %ecx
|
|
||||||
+ movzbl (%rdi, %rax), %eax
|
|
||||||
+ subl %ecx, %eax
|
|
||||||
+# endif
|
|
||||||
L(return_vzeroupper):
|
|
||||||
ZERO_UPPER_VEC_REGISTERS_RETURN
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
-L(last_2x_vec):
|
|
||||||
- /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
|
|
||||||
- vmovdqu (%rsi), %ymm2
|
|
||||||
- VPCMPEQ (%rdi), %ymm2, %ymm2
|
|
||||||
- vpmovmskb %ymm2, %eax
|
|
||||||
- subl $VEC_MASK, %eax
|
|
||||||
- jnz L(first_vec)
|
|
||||||
+L(return_vec_1):
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+# ifdef USE_AS_WMEMCMP
|
|
||||||
+ movl VEC_SIZE(%rdi, %rax), %ecx
|
|
||||||
+ xorl %edx, %edx
|
|
||||||
+ cmpl VEC_SIZE(%rsi, %rax), %ecx
|
|
||||||
+ setg %dl
|
|
||||||
+ leal -1(%rdx, %rdx), %eax
|
|
||||||
+# else
|
|
||||||
+ movzbl VEC_SIZE(%rsi, %rax), %ecx
|
|
||||||
+ movzbl VEC_SIZE(%rdi, %rax), %eax
|
|
||||||
+ subl %ecx, %eax
|
|
||||||
+# endif
|
|
||||||
+ VZEROUPPER_RETURN
|
|
||||||
+
|
|
||||||
+ .p2align 4
|
|
||||||
+L(return_vec_2):
|
|
||||||
+ tzcntl %eax, %eax
|
|
||||||
+# ifdef USE_AS_WMEMCMP
|
|
||||||
+ movl (VEC_SIZE * 2)(%rdi, %rax), %ecx
|
|
||||||
+ xorl %edx, %edx
|
|
||||||
+ cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx
|
|
||||||
+ setg %dl
|
|
||||||
+ leal -1(%rdx, %rdx), %eax
|
|
||||||
+# else
|
|
||||||
+ movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx
|
|
||||||
+ movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax
|
|
||||||
+ subl %ecx, %eax
|
|
||||||
+# endif
|
|
||||||
+ VZEROUPPER_RETURN
|
|
||||||
+
|
|
||||||
+ /* NB: p2align 5 here to ensure 4x loop is 32 byte aligned. */
|
|
||||||
+ .p2align 5
|
|
||||||
+L(8x_return_vec_0_1_2_3):
|
|
||||||
+ /* Returning from L(more_8x_vec) requires restoring rsi. */
|
|
||||||
+ addq %rdi, %rsi
|
|
||||||
+L(return_vec_0_1_2_3):
|
|
||||||
+ vpmovmskb %ymm1, %eax
|
|
||||||
+ incl %eax
|
|
||||||
+ jnz L(return_vec_0)
|
|
||||||
|
|
||||||
-L(last_vec):
|
|
||||||
- /* Use overlapping loads to avoid branches. */
|
|
||||||
- leaq -VEC_SIZE(%rdi, %rdx), %rdi
|
|
||||||
- leaq -VEC_SIZE(%rsi, %rdx), %rsi
|
|
||||||
- vmovdqu (%rsi), %ymm2
|
|
||||||
- VPCMPEQ (%rdi), %ymm2, %ymm2
|
|
||||||
vpmovmskb %ymm2, %eax
|
|
||||||
- subl $VEC_MASK, %eax
|
|
||||||
- jnz L(first_vec)
|
|
||||||
+ incl %eax
|
|
||||||
+ jnz L(return_vec_1)
|
|
||||||
+
|
|
||||||
+ vpmovmskb %ymm3, %eax
|
|
||||||
+ incl %eax
|
|
||||||
+ jnz L(return_vec_2)
|
|
||||||
+L(return_vec_3):
|
|
||||||
+ tzcntl %ecx, %ecx
|
|
||||||
+# ifdef USE_AS_WMEMCMP
|
|
||||||
+ movl (VEC_SIZE * 3)(%rdi, %rcx), %eax
|
|
||||||
+ xorl %edx, %edx
|
|
||||||
+ cmpl (VEC_SIZE * 3)(%rsi, %rcx), %eax
|
|
||||||
+ setg %dl
|
|
||||||
+ leal -1(%rdx, %rdx), %eax
|
|
||||||
+# else
|
|
||||||
+ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
|
|
||||||
+ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
|
|
||||||
+ subl %ecx, %eax
|
|
||||||
+# endif
|
|
||||||
+ VZEROUPPER_RETURN
|
|
||||||
+
|
|
||||||
+ .p2align 4
|
|
||||||
+L(more_8x_vec):
|
|
||||||
+ /* Set end of s1 in rdx. */
|
|
||||||
+ leaq -(VEC_SIZE * 4)(%rdi, %rdx), %rdx
|
|
||||||
+ /* rsi stores s2 - s1. This allows loop to only update one
|
|
||||||
+ pointer. */
|
|
||||||
+ subq %rdi, %rsi
|
|
||||||
+ /* Align s1 pointer. */
|
|
||||||
+ andq $-VEC_SIZE, %rdi
|
|
||||||
+      /* Adjust because first 4x vec were checked already.  */
|
|
||||||
+ subq $-(VEC_SIZE * 4), %rdi
|
|
||||||
+ .p2align 4
|
|
||||||
+L(loop_4x_vec):
|
|
||||||
+ /* rsi has s2 - s1 so get correct address by adding s1 (in rdi).
|
|
||||||
+ */
|
|
||||||
+ vmovdqu (%rsi, %rdi), %ymm1
|
|
||||||
+ VPCMPEQ (%rdi), %ymm1, %ymm1
|
|
||||||
+
|
|
||||||
+ vmovdqu VEC_SIZE(%rsi, %rdi), %ymm2
|
|
||||||
+ VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
|
|
||||||
+
|
|
||||||
+ vmovdqu (VEC_SIZE * 2)(%rsi, %rdi), %ymm3
|
|
||||||
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
|
|
||||||
+
|
|
||||||
+ vmovdqu (VEC_SIZE * 3)(%rsi, %rdi), %ymm4
|
|
||||||
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
|
|
||||||
+
|
|
||||||
+ vpand %ymm1, %ymm2, %ymm5
|
|
||||||
+ vpand %ymm3, %ymm4, %ymm6
|
|
||||||
+ vpand %ymm5, %ymm6, %ymm7
|
|
||||||
+ vpmovmskb %ymm7, %ecx
|
|
||||||
+ incl %ecx
|
|
||||||
+ jnz L(8x_return_vec_0_1_2_3)
|
|
||||||
+ subq $-(VEC_SIZE * 4), %rdi
|
|
||||||
+ /* Check if s1 pointer at end. */
|
|
||||||
+ cmpq %rdx, %rdi
|
|
||||||
+ jb L(loop_4x_vec)
|
|
||||||
+
|
|
||||||
+ subq %rdx, %rdi
|
|
||||||
+ /* rdi has 4 * VEC_SIZE - remaining length. */
|
|
||||||
+ cmpl $(VEC_SIZE * 3), %edi
|
|
||||||
+ jae L(8x_last_1x_vec)
|
|
||||||
+ /* Load regardless of branch. */
|
|
||||||
+ vmovdqu (VEC_SIZE * 2)(%rsi, %rdx), %ymm3
|
|
||||||
+ cmpl $(VEC_SIZE * 2), %edi
|
|
||||||
+ jae L(8x_last_2x_vec)
|
|
||||||
+
|
|
||||||
+ /* Check last 4 VEC. */
|
|
||||||
+ vmovdqu (%rsi, %rdx), %ymm1
|
|
||||||
+ VPCMPEQ (%rdx), %ymm1, %ymm1
|
|
||||||
+
|
|
||||||
+ vmovdqu VEC_SIZE(%rsi, %rdx), %ymm2
|
|
||||||
+ VPCMPEQ VEC_SIZE(%rdx), %ymm2, %ymm2
|
|
||||||
+
|
|
||||||
+ VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm3, %ymm3
|
|
||||||
+
|
|
||||||
+ vmovdqu (VEC_SIZE * 3)(%rsi, %rdx), %ymm4
|
|
||||||
+ VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm4, %ymm4
|
|
||||||
+
|
|
||||||
+ vpand %ymm1, %ymm2, %ymm5
|
|
||||||
+ vpand %ymm3, %ymm4, %ymm6
|
|
||||||
+ vpand %ymm5, %ymm6, %ymm7
|
|
||||||
+ vpmovmskb %ymm7, %ecx
|
|
||||||
+ /* Restore s1 pointer to rdi. */
|
|
||||||
+ movq %rdx, %rdi
|
|
||||||
+ incl %ecx
|
|
||||||
+ jnz L(8x_return_vec_0_1_2_3)
|
|
||||||
+ /* NB: eax must be zero to reach here. */
|
|
||||||
+ VZEROUPPER_RETURN
|
|
||||||
+
|
|
||||||
+ /* Only entry is from L(more_8x_vec). */
|
|
||||||
+ .p2align 4
|
|
||||||
+L(8x_last_2x_vec):
|
|
||||||
+      /* Check second to last VEC.  rdx stores the end pointer of s1 and
|
|
||||||
+ ymm3 has already been loaded with second to last VEC from s2.
|
|
||||||
+ */
|
|
||||||
+ VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm3, %ymm3
|
|
||||||
+ vpmovmskb %ymm3, %eax
|
|
||||||
+ incl %eax
|
|
||||||
+ jnz L(8x_return_vec_2)
|
|
||||||
+ /* Check last VEC. */
|
|
||||||
+ .p2align 4
|
|
||||||
+L(8x_last_1x_vec):
|
|
||||||
+ vmovdqu (VEC_SIZE * 3)(%rsi, %rdx), %ymm4
|
|
||||||
+ VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm4, %ymm4
|
|
||||||
+ vpmovmskb %ymm4, %eax
|
|
||||||
+ incl %eax
|
|
||||||
+ jnz L(8x_return_vec_3)
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
|
|
||||||
.p2align 4
|
|
||||||
-L(first_vec):
|
|
||||||
- /* A byte or int32 is different within 16 or 32 bytes. */
|
|
||||||
- tzcntl %eax, %ecx
|
|
||||||
+L(last_2x_vec):
|
|
||||||
+ /* Check second to last VEC. */
|
|
||||||
+ vmovdqu -(VEC_SIZE * 2)(%rsi, %rdx), %ymm1
|
|
||||||
+ VPCMPEQ -(VEC_SIZE * 2)(%rdi, %rdx), %ymm1, %ymm1
|
|
||||||
+ vpmovmskb %ymm1, %eax
|
|
||||||
+ incl %eax
|
|
||||||
+ jnz L(return_vec_1_end)
|
|
||||||
+ /* Check last VEC. */
|
|
||||||
+L(last_1x_vec):
|
|
||||||
+ vmovdqu -(VEC_SIZE * 1)(%rsi, %rdx), %ymm1
|
|
||||||
+ VPCMPEQ -(VEC_SIZE * 1)(%rdi, %rdx), %ymm1, %ymm1
|
|
||||||
+ vpmovmskb %ymm1, %eax
|
|
||||||
+ incl %eax
|
|
||||||
+ jnz L(return_vec_0_end)
|
|
||||||
+ VZEROUPPER_RETURN
|
|
||||||
+
|
|
||||||
+ .p2align 4
|
|
||||||
+L(8x_return_vec_2):
|
|
||||||
+ subq $VEC_SIZE, %rdx
|
|
||||||
+L(8x_return_vec_3):
+	tzcntl	%eax, %eax
+	addq	%rdx, %rax
# ifdef USE_AS_WMEMCMP
-	xorl	%eax, %eax
-	movl	(%rdi, %rcx), %edx
-	cmpl	(%rsi, %rcx), %edx
-L(wmemcmp_return):
-	setl	%al
-	negl	%eax
-	orl	$1, %eax
+	movl	(VEC_SIZE * 3)(%rax), %ecx
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
# else
-	movzbl	(%rdi, %rcx), %eax
-	movzbl	(%rsi, %rcx), %edx
-	sub	%edx, %eax
+	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
+	movzbl	(VEC_SIZE * 3)(%rax), %eax
+	subl	%ecx, %eax
# endif
	VZEROUPPER_RETURN

-# ifdef USE_AS_WMEMCMP
	.p2align 4
-L(4):
-	xorl	%eax, %eax
-	movl	(%rdi), %edx
-	cmpl	(%rsi), %edx
-	jne	L(wmemcmp_return)
-	ret
+L(return_vec_1_end):
+	tzcntl	%eax, %eax
+	addl	%edx, %eax
+# ifdef USE_AS_WMEMCMP
+	movl	-(VEC_SIZE * 2)(%rdi, %rax), %ecx
+	xorl	%edx, %edx
+	cmpl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
# else
+	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
+	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
+	VZEROUPPER_RETURN
+
	.p2align 4
-L(between_4_7):
-	/* Load as big endian with overlapping movbe to avoid branches.  */
-	movbe	(%rdi), %eax
-	movbe	(%rsi), %ecx
-	shlq	$32, %rax
-	shlq	$32, %rcx
-	movbe	-4(%rdi, %rdx), %edi
-	movbe	-4(%rsi, %rdx), %esi
-	orq	%rdi, %rax
-	orq	%rsi, %rcx
-	subq	%rcx, %rax
-	je	L(exit)
-	sbbl	%eax, %eax
-	orl	$1, %eax
-	ret
+L(return_vec_0_end):
+	tzcntl	%eax, %eax
+	addl	%edx, %eax
+# ifdef USE_AS_WMEMCMP
+	movl	-VEC_SIZE(%rdi, %rax), %ecx
+	xorl	%edx, %edx
+	cmpl	-VEC_SIZE(%rsi, %rax), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	-VEC_SIZE(%rsi, %rax), %ecx
+	movzbl	-VEC_SIZE(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
+	VZEROUPPER_RETURN

	.p2align 4
-L(exit):
-	ret
+L(less_vec):
+	/* Check if one or less CHAR. This is necessary for size = 0 but
+	   is also faster for size = CHAR_SIZE.  */
+	cmpl	$CHAR_SIZE, %edx
+	jbe	L(one_or_less)
+
+	/* Check if loading one VEC from either s1 or s2 could cause a
+	   page cross. This can have false positives but is by far the
+	   fastest method.  */
+	movl	%edi, %eax
+	orl	%esi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	jg	L(page_cross_less_vec)
+
+	/* No page cross possible.  */
+	vmovdqu	(%rsi), %ymm2
+	VPCMPEQ	(%rdi), %ymm2, %ymm2
+	vpmovmskb	%ymm2, %eax
+	incl	%eax
+	/* Result will be zero if s1 and s2 match. Otherwise first set
+	   bit will be first mismatch.  */
+	bzhil	%edx, %eax, %edx
+	jnz	L(return_vec_0)
+	xorl	%eax, %eax
+	VZEROUPPER_RETURN

	.p2align 4
-L(between_2_3):
+L(page_cross_less_vec):
+	/* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
+	   bytes.  */
+	cmpl	$16, %edx
+	jae	L(between_16_31)
+# ifndef USE_AS_WMEMCMP
+	cmpl	$8, %edx
+	jae	L(between_8_15)
+	cmpl	$4, %edx
+	jae	L(between_4_7)
+
	/* Load as big endian to avoid branches.  */
	movzwl	(%rdi), %eax
	movzwl	(%rsi), %ecx
@@ -208,223 +439,106 @@ L(between_2_3):
	shll	$8, %ecx
	bswap	%eax
	bswap	%ecx
-	movb	-1(%rdi, %rdx), %al
-	movb	-1(%rsi, %rdx), %cl
+	movzbl	-1(%rdi, %rdx), %edi
+	movzbl	-1(%rsi, %rdx), %esi
+	orl	%edi, %eax
+	orl	%esi, %ecx
	/* Subtraction is okay because the upper 8 bits are zero.  */
	subl	%ecx, %eax
+	/* No ymm register was touched.  */
	ret

	.p2align 4
-L(1):
-	movzbl	(%rdi), %eax
+L(one_or_less):
+	jb	L(zero)
	movzbl	(%rsi), %ecx
+	movzbl	(%rdi), %eax
	subl	%ecx, %eax
-	ret
-# endif
-
-	.p2align 4
-L(zero):
-	xorl	%eax, %eax
+	/* No ymm register was touched.  */
	ret

	.p2align 4
-L(less_vec):
-# ifdef USE_AS_WMEMCMP
-	/* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes.  */
-	cmpb	$4, %dl
-	je	L(4)
-	jb	L(zero)
-# else
-	cmpb	$1, %dl
-	je	L(1)
-	jb	L(zero)
-	cmpb	$4, %dl
-	jb	L(between_2_3)
-	cmpb	$8, %dl
-	jb	L(between_4_7)
+L(between_8_15):
# endif
-	cmpb	$16, %dl
-	jae	L(between_16_31)
-	/* It is between 8 and 15 bytes.  */
+	/* If USE_AS_WMEMCMP fall through into 8-15 byte case.  */
	vmovq	(%rdi), %xmm1
	vmovq	(%rsi), %xmm2
-	VPCMPEQ %xmm1, %xmm2, %xmm2
+	VPCMPEQ	%xmm1, %xmm2, %xmm2
	vpmovmskb	%xmm2, %eax
-	subl	$0xffff, %eax
-	jnz	L(first_vec)
+	subl	$0xffff, %eax
+	jnz	L(return_vec_0)
	/* Use overlapping loads to avoid branches.  */
	leaq	-8(%rdi, %rdx), %rdi
	leaq	-8(%rsi, %rdx), %rsi
	vmovq	(%rdi), %xmm1
	vmovq	(%rsi), %xmm2
-	VPCMPEQ %xmm1, %xmm2, %xmm2
+	VPCMPEQ	%xmm1, %xmm2, %xmm2
	vpmovmskb	%xmm2, %eax
-	subl	$0xffff, %eax
-	jnz	L(first_vec)
+	subl	$0xffff, %eax
+	jnz	L(return_vec_0)
+	/* No ymm register was touched.  */
+	ret
+
+	.p2align 4
+L(zero):
+	xorl	%eax, %eax
	ret

	.p2align 4
L(between_16_31):
	/* From 16 to 31 bytes.  No branch when size == 16.  */
	vmovdqu	(%rsi), %xmm2
-	VPCMPEQ (%rdi), %xmm2, %xmm2
+	VPCMPEQ	(%rdi), %xmm2, %xmm2
	vpmovmskb	%xmm2, %eax
-	subl	$0xffff, %eax
-	jnz	L(first_vec)
+	subl	$0xffff, %eax
+	jnz	L(return_vec_0)

	/* Use overlapping loads to avoid branches.  */
+
+	vmovdqu	-16(%rsi, %rdx), %xmm2
	leaq	-16(%rdi, %rdx), %rdi
	leaq	-16(%rsi, %rdx), %rsi
-	vmovdqu	(%rsi), %xmm2
-	VPCMPEQ (%rdi), %xmm2, %xmm2
+	VPCMPEQ	(%rdi), %xmm2, %xmm2
	vpmovmskb	%xmm2, %eax
-	subl	$0xffff, %eax
-	jnz	L(first_vec)
+	subl	$0xffff, %eax
+	jnz	L(return_vec_0)
+	/* No ymm register was touched.  */
	ret

-	.p2align 4
-L(more_8x_vec):
-	/* More than 8 * VEC.  Check the first VEC.  */
-	vmovdqu	(%rsi), %ymm2
-	VPCMPEQ (%rdi), %ymm2, %ymm2
-	vpmovmskb	%ymm2, %eax
-	subl	$VEC_MASK, %eax
-	jnz	L(first_vec)
-
-	/* Align the first memory area for aligned loads in the loop.
-	   Compute how much the first memory area is misaligned.  */
-	movq	%rdi, %rcx
-	andl	$(VEC_SIZE - 1), %ecx
-	/* Get the negative of offset for alignment.  */
-	subq	$VEC_SIZE, %rcx
-	/* Adjust the second memory area.  */
-	subq	%rcx, %rsi
-	/* Adjust the first memory area which should be aligned now.  */
-	subq	%rcx, %rdi
-	/* Adjust length.  */
-	addq	%rcx, %rdx
-
-L(loop_4x_vec):
-	/* Compare 4 * VEC at a time forward.  */
-	vmovdqu	(%rsi), %ymm1
-	VPCMPEQ (%rdi), %ymm1, %ymm1
-
-	vmovdqu	VEC_SIZE(%rsi), %ymm2
-	VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
-	vpand	%ymm2, %ymm1, %ymm5
-
-	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
-	vpand	%ymm3, %ymm5, %ymm5
-
-	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm4
-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
-	vpand	%ymm4, %ymm5, %ymm5
-
-	vptest	%ymm0, %ymm5
-	jnc	L(4x_vec_end)
-
-	addq	$(VEC_SIZE * 4), %rdi
-	addq	$(VEC_SIZE * 4), %rsi
-
-	subq	$(VEC_SIZE * 4), %rdx
-	cmpq	$(VEC_SIZE * 4), %rdx
-	jae	L(loop_4x_vec)
-
-	/* Less than 4 * VEC.  */
-	cmpq	$VEC_SIZE, %rdx
-	jbe	L(last_vec)
-	cmpq	$(VEC_SIZE * 2), %rdx
-	jbe	L(last_2x_vec)
-
-L(last_4x_vec):
-	/* From 2 * VEC to 4 * VEC. */
-	vmovdqu	(%rsi), %ymm2
-	VPCMPEQ (%rdi), %ymm2, %ymm2
-	vpmovmskb	%ymm2, %eax
-	subl	$VEC_MASK, %eax
-	jnz	L(first_vec)
-
-	addq	$VEC_SIZE, %rdi
-	addq	$VEC_SIZE, %rsi
-	vmovdqu	(%rsi), %ymm2
-	VPCMPEQ (%rdi), %ymm2, %ymm2
-	vpmovmskb	%ymm2, %eax
-	subl	$VEC_MASK, %eax
-	jnz	L(first_vec)
-
-	/* Use overlapping loads to avoid branches.  */
-	leaq	-(3 * VEC_SIZE)(%rdi, %rdx), %rdi
-	leaq	-(3 * VEC_SIZE)(%rsi, %rdx), %rsi
-	vmovdqu	(%rsi), %ymm2
-	VPCMPEQ (%rdi), %ymm2, %ymm2
-	vpmovmskb	%ymm2, %eax
-	subl	$VEC_MASK, %eax
-	jnz	L(first_vec)
-
-	addq	$VEC_SIZE, %rdi
-	addq	$VEC_SIZE, %rsi
-	vmovdqu	(%rsi), %ymm2
-	VPCMPEQ (%rdi), %ymm2, %ymm2
-	vpmovmskb	%ymm2, %eax
-	subl	$VEC_MASK, %eax
-	jnz	L(first_vec)
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(4x_vec_end):
-	vpmovmskb	%ymm1, %eax
-	subl	$VEC_MASK, %eax
-	jnz	L(first_vec)
-	vpmovmskb	%ymm2, %eax
-	subl	$VEC_MASK, %eax
-	jnz	L(first_vec_x1)
-	vpmovmskb	%ymm3, %eax
-	subl	$VEC_MASK, %eax
-	jnz	L(first_vec_x2)
-	vpmovmskb	%ymm4, %eax
-	subl	$VEC_MASK, %eax
-	tzcntl	%eax, %ecx
# ifdef USE_AS_WMEMCMP
-	xorl	%eax, %eax
-	movl	(VEC_SIZE * 3)(%rdi, %rcx), %edx
-	cmpl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
-	jmp	L(wmemcmp_return)
-# else
-	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
-	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
-	sub	%edx, %eax
-# endif
-	VZEROUPPER_RETURN
-
	.p2align 4
-L(first_vec_x1):
-	tzcntl	%eax, %ecx
-# ifdef USE_AS_WMEMCMP
-	xorl	%eax, %eax
-	movl	VEC_SIZE(%rdi, %rcx), %edx
-	cmpl	VEC_SIZE(%rsi, %rcx), %edx
-	jmp	L(wmemcmp_return)
+L(one_or_less):
+	jb	L(zero)
+	movl	(%rdi), %ecx
+	xorl	%edx, %edx
+	cmpl	(%rsi), %ecx
+	je	L(zero)
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+	/* No ymm register was touched.  */
+	ret
# else
-	movzbl	VEC_SIZE(%rdi, %rcx), %eax
-	movzbl	VEC_SIZE(%rsi, %rcx), %edx
-	sub	%edx, %eax
-# endif
-	VZEROUPPER_RETURN

	.p2align 4
-L(first_vec_x2):
-	tzcntl	%eax, %ecx
-# ifdef USE_AS_WMEMCMP
-	xorl	%eax, %eax
-	movl	(VEC_SIZE * 2)(%rdi, %rcx), %edx
-	cmpl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
-	jmp	L(wmemcmp_return)
-# else
-	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
-	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
-	sub	%edx, %eax
+L(between_4_7):
+	/* Load as big endian with overlapping movbe to avoid branches.
+	 */
+	movbe	(%rdi), %eax
+	movbe	(%rsi), %ecx
+	shlq	$32, %rax
+	shlq	$32, %rcx
+	movbe	-4(%rdi, %rdx), %edi
+	movbe	-4(%rsi, %rdx), %esi
+	orq	%rdi, %rax
+	orq	%rsi, %rcx
+	subq	%rcx, %rax
+	jz	L(zero_4_7)
+	sbbl	%eax, %eax
+	orl	$1, %eax
+L(zero_4_7):
+	/* No ymm register was touched.  */
+	ret
# endif
-	VZEROUPPER_RETURN
+
END (MEMCMP)
#endif
--
GitLab

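The tail cases above lean on two tricks worth spelling out. The page-cross test ORs the two source addresses, masks to the page offset, and treats anything above PAGE_SIZE - VEC_SIZE as a possible cross: false positives are allowed, misses are not. The return paths then compute the required +1/-1 without branching via setg plus lea. A rough C sketch of both (illustrative only; the helper names are ours, and PAGE_SIZE/VEC_SIZE are assumed to be 4096/32 as in this file):

    #include <stdint.h>

    #define PAGE_SIZE 4096
    #define VEC_SIZE  32

    /* Conservative guard before the unmasked vector load: may report a
       cross that cannot happen, but never misses a real one.  */
    static int may_cross_page (const void *s1, const void *s2)
    {
      uintptr_t off = ((uintptr_t) s1 | (uintptr_t) s2) & (PAGE_SIZE - 1);
      return off > PAGE_SIZE - VEC_SIZE;
    }

    /* Branchless +1/-1 for a known-unequal pair, mirroring
       "setg %dl; leal -1(%rdx, %rdx), %eax": 2 * (a > b) - 1.  */
    static int signed_result (int32_t a, int32_t b)
    {
      int g = a > b;      /* setg */
      return 2 * g - 1;   /* lea -1(g, g) */
    }

Equality is handled separately in the assembly (the je L(zero) path), so signed_result is only ever reached with a != b.
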
@ -1,851 +0,0 @@
From 4ad473e97acdc5f6d811755b67c09f2128a644ce Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon, 17 May 2021 13:57:24 -0400
Subject: [PATCH] x86: Optimize memcmp-evex-movbe.S
Content-type: text/plain; charset=UTF-8

No bug. This commit optimizes memcmp-evex.S. The optimizations include
adding a new vec compare path for small sizes, reorganizing the entry
control flow, removing some unnecessary ALU instructions from the main
loop, and most importantly replacing the heavy use of vpcmp + kand
logic with vpxor + vptern. test-memcmp and test-wmemcmp are both
passing.

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
 sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 710 +++++++++++--------
 1 file changed, 408 insertions(+), 302 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
index 9c093972..654dc7ac 100644
--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
@@ -19,17 +19,22 @@
 #if IS_IN (libc)

 /* memcmp/wmemcmp is implemented as:
-   1. For size from 2 to 7 bytes, load as big endian with movbe and bswap
-      to avoid branches.
-   2. Use overlapping compare to avoid branch.
-   3. Use vector compare when size >= 4 bytes for memcmp or size >= 8
-      bytes for wmemcmp.
-   4. If size is 8 * VEC_SIZE or less, unroll the loop.
-   5. Compare 4 * VEC_SIZE at a time with the aligned first memory
+   1. Use ymm vector compares when possible. The only case where
+      vector compares is not possible for when size < CHAR_PER_VEC
+      and loading from either s1 or s2 would cause a page cross.
+   2. For size from 2 to 7 bytes on page cross, load as big endian
+      with movbe and bswap to avoid branches.
+   3. Use xmm vector compare when size >= 4 bytes for memcmp or
+      size >= 8 bytes for wmemcmp.
+   4. Optimistically compare up to first 4 * CHAR_PER_VEC one at a
+      to check for early mismatches. Only do this if its guranteed the
+      work is not wasted.
+   5. If size is 8 * VEC_SIZE or less, unroll the loop.
+   6. Compare 4 * VEC_SIZE at a time with the aligned first memory
       area.
-   6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
-   7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
-   8. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
+   7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less.
+   8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less.
+   9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less.  */

 # include <sysdep.h>

@@ -40,11 +45,21 @@
 # define VMOVU		vmovdqu64

 # ifdef USE_AS_WMEMCMP
-#  define VPCMPEQ	vpcmpeqd
+#  define CHAR_SIZE	4
+#  define VPCMP	vpcmpd
 # else
-#  define VPCMPEQ	vpcmpeqb
+#  define CHAR_SIZE	1
+#  define VPCMP	vpcmpub
 # endif

+# define VEC_SIZE	32
+# define PAGE_SIZE	4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+
+# define XMM0		xmm16
+# define XMM1		xmm17
+# define XMM2		xmm18
+# define YMM0		ymm16
 # define XMM1		xmm17
 # define XMM2		xmm18
 # define YMM1		ymm17
@@ -54,15 +69,6 @@
 # define YMM5		ymm21
 # define YMM6		ymm22

-# define VEC_SIZE 32
-# ifdef USE_AS_WMEMCMP
-#  define VEC_MASK 0xff
-#  define XMM_MASK 0xf
-# else
-#  define VEC_MASK 0xffffffff
-#  define XMM_MASK 0xffff
-# endif
-
 /* Warning!
            wmemcmp has to use SIGNED comparison for elements.
            memcmp has to use UNSIGNED comparison for elemnts.
@@ -70,145 +76,370 @@

	.section .text.evex,"ax",@progbits
ENTRY (MEMCMP)
-# ifdef USE_AS_WMEMCMP
-	shl	$2, %RDX_LP
-# elif defined __ILP32__
+# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
-	cmp	$VEC_SIZE, %RDX_LP
+	cmp	$CHAR_PER_VEC, %RDX_LP
	jb	L(less_vec)

	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
-	VMOVU	(%rsi), %YMM2
-	VPCMPEQ (%rdi), %YMM2, %k1
+	VMOVU	(%rsi), %YMM1
+	/* Use compare not equals to directly check for mismatch.  */
+	VPCMP	$4, (%rdi), %YMM1, %k1
	kmovd	%k1, %eax
-	subl	$VEC_MASK, %eax
-	jnz	L(first_vec)
-
-	cmpq	$(VEC_SIZE * 2), %rdx
-	jbe	L(last_vec)
-
-	/* More than 2 * VEC.  */
-	cmpq	$(VEC_SIZE * 8), %rdx
-	ja	L(more_8x_vec)
-	cmpq	$(VEC_SIZE * 4), %rdx
-	jb	L(last_4x_vec)
+	/* NB: eax must be destination register if going to
+	   L(return_vec_[0,2]). For L(return_vec_3 destination register
+	   must be ecx.  */
+	testl	%eax, %eax
+	jnz	L(return_vec_0)

-	/* From 4 * VEC to 8 * VEC, inclusively. */
-	VMOVU	(%rsi), %YMM1
-	VPCMPEQ (%rdi), %YMM1, %k1
+	cmpq	$(CHAR_PER_VEC * 2), %rdx
+	jbe	L(last_1x_vec)

+	/* Check second VEC no matter what.  */
	VMOVU	VEC_SIZE(%rsi), %YMM2
-	VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
+	VPCMP	$4, VEC_SIZE(%rdi), %YMM2, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_1)
+
+	/* Less than 4 * VEC.  */
+	cmpq	$(CHAR_PER_VEC * 4), %rdx
+	jbe	L(last_2x_vec)

+	/* Check third and fourth VEC no matter what.  */
	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
+	VPCMP	$4, (VEC_SIZE * 2)(%rdi), %YMM3, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_2)

	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
+	VPCMP	$4, (VEC_SIZE * 3)(%rdi), %YMM4, %k1
+	kmovd	%k1, %ecx
+	testl	%ecx, %ecx
+	jnz	L(return_vec_3)

-	kandd	%k1, %k2, %k5
-	kandd	%k3, %k4, %k6
-	kandd	%k5, %k6, %k6
+	/* Zero YMM0. 4x VEC reduction is done with vpxor + vtern so
+	   compare with zero to get a mask is needed.  */
+	vpxorq	%XMM0, %XMM0, %XMM0

-	kmovd	%k6, %eax
-	cmpl	$VEC_MASK, %eax
-	jne	L(4x_vec_end)
+	/* Go to 4x VEC loop.  */
+	cmpq	$(CHAR_PER_VEC * 8), %rdx
+	ja	L(more_8x_vec)

-	leaq	-(4 * VEC_SIZE)(%rdi, %rdx), %rdi
-	leaq	-(4 * VEC_SIZE)(%rsi, %rdx), %rsi
-	VMOVU	(%rsi), %YMM1
-	VPCMPEQ (%rdi), %YMM1, %k1
+	/* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
+	   branches.  */

-	VMOVU	VEC_SIZE(%rsi), %YMM2
-	VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
-	kandd	%k1, %k2, %k5
+	/* Load first two VEC from s2 before adjusting addresses.  */
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx, CHAR_SIZE), %YMM1
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %YMM2
+	leaq	-(4 * VEC_SIZE)(%rdi, %rdx, CHAR_SIZE), %rdi
+	leaq	-(4 * VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
+
+	/* Wait to load from s1 until addressed adjust due to
+	   unlamination of microfusion with complex address mode.  */
+
+	/* vpxor will be all 0s if s1 and s2 are equal. Otherwise it
+	   will have some 1s.  */
+	vpxorq	(%rdi), %YMM1, %YMM1
+	vpxorq	(VEC_SIZE)(%rdi), %YMM2, %YMM2

	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
-	kandd	%k3, %k5, %k5
+	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
+	/* Or together YMM1, YMM2, and YMM3 into YMM3.  */
+	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3

	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
-	kandd	%k4, %k5, %k5
+	/* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
+	   oring with YMM3. Result is stored in YMM4.  */
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
+	/* Compare YMM4 with 0. If any 1s s1 and s2 don't match.  */
+	VPCMP	$4, %YMM4, %YMM0, %k1
+	kmovd	%k1, %ecx
+	testl	%ecx, %ecx
+	jnz	L(return_vec_0_1_2_3)
+	/* NB: eax must be zero to reach here.  */
+	ret

-	kmovd	%k5, %eax
-	cmpl	$VEC_MASK, %eax
-	jne	L(4x_vec_end)
-	xorl	%eax, %eax
+	/* NB: aligning 32 here allows for the rest of the jump targets
+	   to be tuned for 32 byte alignment. Most important this ensures
+	   the L(more_8x_vec) loop is 32 byte aligned.  */
+	.p2align 5
+L(less_vec):
+	/* Check if one or less CHAR. This is necessary for size = 0 but
+	   is also faster for size = CHAR_SIZE.  */
+	cmpl	$1, %edx
+	jbe	L(one_or_less)
+
+	/* Check if loading one VEC from either s1 or s2 could cause a
+	   page cross. This can have false positives but is by far the
+	   fastest method.  */
+	movl	%edi, %eax
+	orl	%esi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	jg	L(page_cross_less_vec)
+
+	/* No page cross possible.  */
+	VMOVU	(%rsi), %YMM2
+	VPCMP	$4, (%rdi), %YMM2, %k1
+	kmovd	%k1, %eax
+	/* Create mask in ecx for potentially in bound matches.  */
+	bzhil	%edx, %eax, %eax
+	jnz	L(return_vec_0)
	ret

	.p2align 4
-L(last_2x_vec):
-	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
-	VMOVU	(%rsi), %YMM2
-	VPCMPEQ (%rdi), %YMM2, %k2
-	kmovd	%k2, %eax
-	subl	$VEC_MASK, %eax
-	jnz	L(first_vec)
+L(return_vec_0):
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCMP
+	movl	(%rdi, %rax, CHAR_SIZE), %ecx
+	xorl	%edx, %edx
+	cmpl	(%rsi, %rax, CHAR_SIZE), %ecx
+	/* NB: no partial register stall here because xorl zero idiom
+	   above.  */
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	(%rsi, %rax), %ecx
+	movzbl	(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
+	ret

-L(last_vec):
-	/* Use overlapping loads to avoid branches.  */
-	leaq	-VEC_SIZE(%rdi, %rdx), %rdi
-	leaq	-VEC_SIZE(%rsi, %rdx), %rsi
-	VMOVU	(%rsi), %YMM2
-	VPCMPEQ (%rdi), %YMM2, %k2
-	kmovd	%k2, %eax
-	subl	$VEC_MASK, %eax
-	jnz	L(first_vec)
+	/* NB: No p2align necessary. Alignment  % 16 is naturally 1
+	   which is good enough for a target not in a loop.  */
+L(return_vec_1):
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCMP
+	movl	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
+	xorl	%edx, %edx
+	cmpl	VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	VEC_SIZE(%rsi, %rax), %ecx
+	movzbl	VEC_SIZE(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
	ret

-	.p2align 4
-L(first_vec):
-	/* A byte or int32 is different within 16 or 32 bytes.  */
-	tzcntl	%eax, %ecx
+	/* NB: No p2align necessary. Alignment  % 16 is naturally 2
+	   which is good enough for a target not in a loop.  */
+L(return_vec_2):
+	tzcntl	%eax, %eax
# ifdef USE_AS_WMEMCMP
-	xorl	%eax, %eax
-	movl	(%rdi, %rcx, 4), %edx
-	cmpl	(%rsi, %rcx, 4), %edx
-L(wmemcmp_return):
-	setl	%al
-	negl	%eax
-	orl	$1, %eax
+	movl	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
# else
-	movzbl	(%rdi, %rcx), %eax
-	movzbl	(%rsi, %rcx), %edx
-	sub	%edx, %eax
+	movzbl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
+	movzbl	(VEC_SIZE * 2)(%rdi, %rax), %eax
+	subl	%ecx, %eax
# endif
	ret

+	.p2align 4
+L(8x_return_vec_0_1_2_3):
+	/* Returning from L(more_8x_vec) requires restoring rsi.  */
+	addq	%rdi, %rsi
+L(return_vec_0_1_2_3):
+	VPCMP	$4, %YMM1, %YMM0, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_0)
+
+	VPCMP	$4, %YMM2, %YMM0, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_1)
+
+	VPCMP	$4, %YMM3, %YMM0, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_2)
+L(return_vec_3):
+	tzcntl	%ecx, %ecx
# ifdef USE_AS_WMEMCMP
+	movl	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
+	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
+	subl	%ecx, %eax
+# endif
+	ret
+
	.p2align 4
-L(4):
-	xorl	%eax, %eax
-	movl	(%rdi), %edx
-	cmpl	(%rsi), %edx
-	jne	L(wmemcmp_return)
+L(more_8x_vec):
+	/* Set end of s1 in rdx.  */
+	leaq	-(VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rdx
+	/* rsi stores s2 - s1. This allows loop to only update one
+	   pointer.  */
+	subq	%rdi, %rsi
+	/* Align s1 pointer.  */
+	andq	$-VEC_SIZE, %rdi
+	/* Adjust because first 4x vec where check already.  */
+	subq	$-(VEC_SIZE * 4), %rdi
+	.p2align 4
+L(loop_4x_vec):
+	VMOVU	(%rsi, %rdi), %YMM1
+	vpxorq	(%rdi), %YMM1, %YMM1
+
+	VMOVU	VEC_SIZE(%rsi, %rdi), %YMM2
+	vpxorq	VEC_SIZE(%rdi), %YMM2, %YMM2
+
+	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %YMM3
+	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
+	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
+
+	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %YMM4
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
+	VPCMP	$4, %YMM4, %YMM0, %k1
+	kmovd	%k1, %ecx
+	testl	%ecx, %ecx
+	jnz	L(8x_return_vec_0_1_2_3)
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpq	%rdx, %rdi
+	jb	L(loop_4x_vec)
+
+	subq	%rdx, %rdi
+	/* rdi has 4 * VEC_SIZE - remaining length.  */
+	cmpl	$(VEC_SIZE * 3), %edi
+	jae	L(8x_last_1x_vec)
+	/* Load regardless of branch.  */
+	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx), %YMM3
+	cmpl	$(VEC_SIZE * 2), %edi
+	jae	L(8x_last_2x_vec)
+
+	VMOVU	(%rsi, %rdx), %YMM1
+	vpxorq	(%rdx), %YMM1, %YMM1
+
+	VMOVU	VEC_SIZE(%rsi, %rdx), %YMM2
+	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM2
+
+	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
+	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
+
+	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM4
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM3, %YMM4
+	VPCMP	$4, %YMM4, %YMM0, %k1
+	kmovd	%k1, %ecx
+	/* Restore s1 pointer to rdi.  */
+	movq	%rdx, %rdi
+	testl	%ecx, %ecx
+	jnz	L(8x_return_vec_0_1_2_3)
+	/* NB: eax must be zero to reach here.  */
+	ret
+
+	/* Only entry is from L(more_8x_vec).  */
+	.p2align 4
+L(8x_last_2x_vec):
+	VPCMP	$4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(8x_return_vec_2)
+	/* Naturally aligned to 16 bytes.  */
+L(8x_last_1x_vec):
+	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM1
+	VPCMP	$4, (VEC_SIZE * 3)(%rdx), %YMM1, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(8x_return_vec_3)
+	ret
+
+	.p2align 4
+L(last_2x_vec):
+	/* Check second to last VEC.  */
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1
+	VPCMP	$4, -(VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_1_end)
+
+	/* Check last VEC.  */
+	.p2align 4
+L(last_1x_vec):
+	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE), %YMM1
+	VPCMP	$4, -(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_0_end)
	ret
+
+	.p2align 4
+L(8x_return_vec_2):
+	subq	$VEC_SIZE, %rdx
+L(8x_return_vec_3):
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCMP
+	leaq	(%rdx, %rax, CHAR_SIZE), %rax
+	movl	(VEC_SIZE * 3)(%rax), %ecx
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
# else
+	addq	%rdx, %rax
+	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
+	movzbl	(VEC_SIZE * 3)(%rax), %eax
+	subl	%ecx, %eax
+# endif
+	ret
+
	.p2align 4
-L(between_4_7):
-	/* Load as big endian with overlapping movbe to avoid branches.  */
-	movbe	(%rdi), %eax
-	movbe	(%rsi), %ecx
-	shlq	$32, %rax
-	shlq	$32, %rcx
-	movbe	-4(%rdi, %rdx), %edi
-	movbe	-4(%rsi, %rdx), %esi
-	orq	%rdi, %rax
-	orq	%rsi, %rcx
-	subq	%rcx, %rax
-	je	L(exit)
-	sbbl	%eax, %eax
-	orl	$1, %eax
+L(return_vec_0_end):
+	tzcntl	%eax, %eax
+	addl	%edx, %eax
+# ifdef USE_AS_WMEMCMP
+	movl	-VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
+	xorl	%edx, %edx
+	cmpl	-VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	-VEC_SIZE(%rsi, %rax), %ecx
+	movzbl	-VEC_SIZE(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
	ret

	.p2align 4
-L(exit):
+L(return_vec_1_end):
+	tzcntl	%eax, %eax
+	addl	%edx, %eax
+# ifdef USE_AS_WMEMCMP
+	movl	-(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
+	xorl	%edx, %edx
+	cmpl	-(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
+	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
	ret

+
	.p2align 4
+L(page_cross_less_vec):
+	/* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
+	   bytes.  */
+	cmpl	$(16 / CHAR_SIZE), %edx
+	jae	L(between_16_31)
+# ifndef USE_AS_WMEMCMP
+	cmpl	$8, %edx
+	jae	L(between_8_15)
+	cmpl	$4, %edx
+	jae	L(between_4_7)
L(between_2_3):
	/* Load as big endian to avoid branches.  */
	movzwl	(%rdi), %eax
	movzwl	(%rsi), %ecx
@@ -217,224 +448,99 @@ L(between_2_3):
	shll	$8, %ecx
	bswap	%eax
	bswap	%ecx
-	movb	-1(%rdi, %rdx), %al
-	movb	-1(%rsi, %rdx), %cl
+	movzbl	-1(%rdi, %rdx), %edi
+	movzbl	-1(%rsi, %rdx), %esi
+	orl	%edi, %eax
+	orl	%esi, %ecx
	/* Subtraction is okay because the upper 8 bits are zero.  */
	subl	%ecx, %eax
	ret
-
	.p2align 4
-L(1):
-	movzbl	(%rdi), %eax
+L(one_or_less):
+	jb	L(zero)
	movzbl	(%rsi), %ecx
+	movzbl	(%rdi), %eax
	subl	%ecx, %eax
	ret
-# endif
-
-	.p2align 4
-L(zero):
-	xorl	%eax, %eax
-	ret

	.p2align 4
-L(less_vec):
-# ifdef USE_AS_WMEMCMP
-	/* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes.  */
-	cmpb	$4, %dl
-	je	L(4)
-	jb	L(zero)
-# else
-	cmpb	$1, %dl
-	je	L(1)
-	jb	L(zero)
-	cmpb	$4, %dl
-	jb	L(between_2_3)
-	cmpb	$8, %dl
-	jb	L(between_4_7)
+L(between_8_15):
# endif
-	cmpb	$16, %dl
-	jae	L(between_16_31)
-	/* It is between 8 and 15 bytes.  */
+	/* If USE_AS_WMEMCMP fall through into 8-15 byte case.  */
	vmovq	(%rdi), %XMM1
	vmovq	(%rsi), %XMM2
-	VPCMPEQ %XMM1, %XMM2, %k2
-	kmovw	%k2, %eax
-	subl	$XMM_MASK, %eax
-	jnz	L(first_vec)
+	VPCMP	$4, %XMM1, %XMM2, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_0)
	/* Use overlapping loads to avoid branches.  */
-	leaq	-8(%rdi, %rdx), %rdi
-	leaq	-8(%rsi, %rdx), %rsi
+	leaq	-8(%rdi, %rdx, CHAR_SIZE), %rdi
+	leaq	-8(%rsi, %rdx, CHAR_SIZE), %rsi
	vmovq	(%rdi), %XMM1
	vmovq	(%rsi), %XMM2
-	VPCMPEQ %XMM1, %XMM2, %k2
-	kmovw	%k2, %eax
-	subl	$XMM_MASK, %eax
-	jnz	L(first_vec)
+	VPCMP	$4, %XMM1, %XMM2, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_0)
	ret

	.p2align 4
-L(between_16_31):
-	/* From 16 to 31 bytes.  No branch when size == 16.  */
-	VMOVU	(%rsi), %XMM2
-	VPCMPEQ (%rdi), %XMM2, %k2
-	kmovw	%k2, %eax
-	subl	$XMM_MASK, %eax
-	jnz	L(first_vec)
-
-	/* Use overlapping loads to avoid branches.  */
-	leaq	-16(%rdi, %rdx), %rdi
-	leaq	-16(%rsi, %rdx), %rsi
-	VMOVU	(%rsi), %XMM2
-	VPCMPEQ (%rdi), %XMM2, %k2
-	kmovw	%k2, %eax
-	subl	$XMM_MASK, %eax
-	jnz	L(first_vec)
+L(zero):
+	xorl	%eax, %eax
	ret

	.p2align 4
-L(more_8x_vec):
-	/* More than 8 * VEC.  Check the first VEC.  */
-	VMOVU	(%rsi), %YMM2
-	VPCMPEQ (%rdi), %YMM2, %k2
-	kmovd	%k2, %eax
-	subl	$VEC_MASK, %eax
-	jnz	L(first_vec)
-
-	/* Align the first memory area for aligned loads in the loop.
-	   Compute how much the first memory area is misaligned.  */
-	movq	%rdi, %rcx
-	andl	$(VEC_SIZE - 1), %ecx
-	/* Get the negative of offset for alignment.  */
-	subq	$VEC_SIZE, %rcx
-	/* Adjust the second memory area.  */
-	subq	%rcx, %rsi
-	/* Adjust the first memory area which should be aligned now.  */
-	subq	%rcx, %rdi
-	/* Adjust length.  */
-	addq	%rcx, %rdx
-
-L(loop_4x_vec):
-	/* Compare 4 * VEC at a time forward.  */
-	VMOVU	(%rsi), %YMM1
-	VPCMPEQ (%rdi), %YMM1, %k1
-
-	VMOVU	VEC_SIZE(%rsi), %YMM2
-	VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
-	kandd	%k2, %k1, %k5
-
-	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
-	kandd	%k3, %k5, %k5
-
-	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
-	kandd	%k4, %k5, %k5
-
-	kmovd	%k5, %eax
-	cmpl	$VEC_MASK, %eax
-	jne	L(4x_vec_end)
-
-	addq	$(VEC_SIZE * 4), %rdi
-	addq	$(VEC_SIZE * 4), %rsi
-
-	subq	$(VEC_SIZE * 4), %rdx
-	cmpq	$(VEC_SIZE * 4), %rdx
-	jae	L(loop_4x_vec)
-
-	/* Less than 4 * VEC.  */
-	cmpq	$VEC_SIZE, %rdx
-	jbe	L(last_vec)
-	cmpq	$(VEC_SIZE * 2), %rdx
-	jbe	L(last_2x_vec)
-
-L(last_4x_vec):
-	/* From 2 * VEC to 4 * VEC. */
-	VMOVU	(%rsi), %YMM2
-	VPCMPEQ (%rdi), %YMM2, %k2
-	kmovd	%k2, %eax
-	subl	$VEC_MASK, %eax
-	jnz	L(first_vec)
-
-	addq	$VEC_SIZE, %rdi
-	addq	$VEC_SIZE, %rsi
-	VMOVU	(%rsi), %YMM2
-	VPCMPEQ (%rdi), %YMM2, %k2
-	kmovd	%k2, %eax
-	subl	$VEC_MASK, %eax
-	jnz	L(first_vec)
+L(between_16_31):
+	/* From 16 to 31 bytes.  No branch when size == 16.  */
+	VMOVU	(%rsi), %XMM2
+	VPCMP	$4, (%rdi), %XMM2, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_0)

	/* Use overlapping loads to avoid branches.  */
-	leaq	-(3 * VEC_SIZE)(%rdi, %rdx), %rdi
-	leaq	-(3 * VEC_SIZE)(%rsi, %rdx), %rsi
-	VMOVU	(%rsi), %YMM2
-	VPCMPEQ (%rdi), %YMM2, %k2
-	kmovd	%k2, %eax
-	subl	$VEC_MASK, %eax
-	jnz	L(first_vec)

-	addq	$VEC_SIZE, %rdi
-	addq	$VEC_SIZE, %rsi
-	VMOVU	(%rsi), %YMM2
-	VPCMPEQ (%rdi), %YMM2, %k2
-	kmovd	%k2, %eax
-	subl	$VEC_MASK, %eax
-	jnz	L(first_vec)
-	ret
-
-	.p2align 4
-L(4x_vec_end):
+	VMOVU	-16(%rsi, %rdx, CHAR_SIZE), %XMM2
+	leaq	-16(%rdi, %rdx, CHAR_SIZE), %rdi
+	leaq	-16(%rsi, %rdx, CHAR_SIZE), %rsi
+	VPCMP	$4, (%rdi), %XMM2, %k1
	kmovd	%k1, %eax
-	subl	$VEC_MASK, %eax
-	jnz	L(first_vec)
-	kmovd	%k2, %eax
-	subl	$VEC_MASK, %eax
-	jnz	L(first_vec_x1)
-	kmovd	%k3, %eax
-	subl	$VEC_MASK, %eax
-	jnz	L(first_vec_x2)
-	kmovd	%k4, %eax
-	subl	$VEC_MASK, %eax
-	tzcntl	%eax, %ecx
-# ifdef USE_AS_WMEMCMP
-	xorl	%eax, %eax
-	movl	(VEC_SIZE * 3)(%rdi, %rcx, 4), %edx
-	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, 4), %edx
-	jmp	L(wmemcmp_return)
-# else
-	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
-	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
-	sub	%edx, %eax
-# endif
+	testl	%eax, %eax
+	jnz	L(return_vec_0)
	ret

-	.p2align 4
-L(first_vec_x1):
-	tzcntl	%eax, %ecx
# ifdef USE_AS_WMEMCMP
-	xorl	%eax, %eax
-	movl	VEC_SIZE(%rdi, %rcx, 4), %edx
-	cmpl	VEC_SIZE(%rsi, %rcx, 4), %edx
-	jmp	L(wmemcmp_return)
-# else
-	movzbl	VEC_SIZE(%rdi, %rcx), %eax
-	movzbl	VEC_SIZE(%rsi, %rcx), %edx
-	sub	%edx, %eax
-# endif
+	.p2align 4
+L(one_or_less):
+	jb	L(zero)
+	movl	(%rdi), %ecx
+	xorl	%edx, %edx
+	cmpl	(%rsi), %ecx
+	je	L(zero)
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
	ret
+# else

	.p2align 4
-L(first_vec_x2):
-	tzcntl	%eax, %ecx
-# ifdef USE_AS_WMEMCMP
-	xorl	%eax, %eax
-	movl	(VEC_SIZE * 2)(%rdi, %rcx, 4), %edx
-	cmpl	(VEC_SIZE * 2)(%rsi, %rcx, 4), %edx
-	jmp	L(wmemcmp_return)
-# else
-	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
-	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
-	sub	%edx, %eax
-# endif
+L(between_4_7):
+	/* Load as big endian with overlapping movbe to avoid branches.
+	 */
+	movbe	(%rdi), %eax
+	movbe	(%rsi), %ecx
+	shlq	$32, %rax
+	shlq	$32, %rcx
+	movbe	-4(%rdi, %rdx), %edi
+	movbe	-4(%rsi, %rdx), %esi
+	orq	%rdi, %rax
+	orq	%rsi, %rcx
+	subq	%rcx, %rax
+	jz	L(zero_4_7)
+	sbbl	%eax, %eax
+	orl	$1, %eax
+L(zero_4_7):
	ret
+# endif
+
END (MEMCMP)
#endif
--
GitLab

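The core of the rewrite above is the mismatch reduction: xor each pair of vectors (all zero iff they match), fold the four results into one with vpternlogd ORs, and issue a single compare against zero. A loose AVX-512 intrinsics sketch of that idea (ours, not the patch; the real code additionally folds one load+xor into the second vpternlogd using immediate 0xde, and uses masked compares for the result):

    #include <immintrin.h>

    /* Requires AVX512F + AVX512VL + compiling with those ISA flags.
       Checks 4 x 32 bytes of s1 against s2 with one final compare.  */
    static int any_mismatch_4x32 (const char *s1, const char *s2)
    {
      __m256i a = _mm256_xor_si256 (_mm256_loadu_si256 ((const __m256i *) s1),
                                    _mm256_loadu_si256 ((const __m256i *) s2));
      __m256i b = _mm256_xor_si256 (_mm256_loadu_si256 ((const __m256i *) (s1 + 32)),
                                    _mm256_loadu_si256 ((const __m256i *) (s2 + 32)));
      __m256i c = _mm256_xor_si256 (_mm256_loadu_si256 ((const __m256i *) (s1 + 64)),
                                    _mm256_loadu_si256 ((const __m256i *) (s2 + 64)));
      __m256i d = _mm256_xor_si256 (_mm256_loadu_si256 ((const __m256i *) (s1 + 96)),
                                    _mm256_loadu_si256 ((const __m256i *) (s2 + 96)));
      /* vpternlogd with immediate 0xfe computes a | b | c, so two of
         them fold four xor results into one accumulator.  */
      __m256i acc = _mm256_ternarylogic_epi32 (a, b, c, 0xfe);
      acc = _mm256_ternarylogic_epi32 (acc, acc, d, 0xfe);
      /* One compare against zero yields the mismatch mask.  */
      return _mm256_cmpneq_epu32_mask (acc, _mm256_setzero_si256 ()) != 0;
    }

The win over the old kandd chains is that the reduction runs on the vector ALUs and only the final compare produces a mask register result.
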
@ -1,104 +0,0 @@
From 6abf27980a947f9b6e514d6b33b83059d39566ae Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Thu, 20 May 2021 13:13:51 -0400
Subject: [PATCH] x86: Improve memset-vec-unaligned-erms.S
Content-type: text/plain; charset=UTF-8

No bug. This commit makes a few small improvements to
memset-vec-unaligned-erms.S. The changes are 1) only aligning to 64
instead of 128. Either alignment will perform equally well in a loop
and 128 just increases the odds of having to do an extra iteration
which can be significant overhead for small values. 2) Align some
targets and the loop. 3) Remove an ALU from the alignment process. 4)
Reorder the last 4x VEC so that they are stored after the loop. 5)
Move the condition for leq 8x VEC to before the alignment
process. test-memset and test-wmemset are both passing.

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
 .../multiarch/memset-vec-unaligned-erms.S     | 50 +++++++++++--------
 1 file changed, 28 insertions(+), 22 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index f877ac9d..909c33f6 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -173,17 +173,22 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
	VMOVU	%VEC(0), (%rdi)
	VZEROUPPER_RETURN

+	.p2align 4
L(stosb_more_2x_vec):
	cmp	__x86_rep_stosb_threshold(%rip), %RDX_LP
	ja	L(stosb)
+#else
+	.p2align 4
#endif
L(more_2x_vec):
-	cmpq	$(VEC_SIZE * 4), %rdx
-	ja	L(loop_start)
+	/* Stores to first 2x VEC before cmp as any path forward will
+	   require it.  */
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(0), VEC_SIZE(%rdi)
-	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
+	cmpq	$(VEC_SIZE * 4), %rdx
+	ja	L(loop_start)
	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
+	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
L(return):
#if VEC_SIZE > 16
	ZERO_UPPER_VEC_REGISTERS_RETURN
@@ -192,28 +197,29 @@ L(return):
#endif

L(loop_start):
-	leaq	(VEC_SIZE * 4)(%rdi), %rcx
-	VMOVU	%VEC(0), (%rdi)
-	andq	$-(VEC_SIZE * 4), %rcx
-	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
-	VMOVU	%VEC(0), VEC_SIZE(%rdi)
-	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rdi)
-	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx)
	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rdi)
-	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx)
-	addq	%rdi, %rdx
-	andq	$-(VEC_SIZE * 4), %rdx
-	cmpq	%rdx, %rcx
-	je	L(return)
+	cmpq	$(VEC_SIZE * 8), %rdx
+	jbe	L(loop_end)
+	andq	$-(VEC_SIZE * 2), %rdi
+	subq	$-(VEC_SIZE * 4), %rdi
+	leaq	-(VEC_SIZE * 4)(%rax, %rdx), %rcx
+	.p2align 4
L(loop):
-	VMOVA	%VEC(0), (%rcx)
-	VMOVA	%VEC(0), VEC_SIZE(%rcx)
-	VMOVA	%VEC(0), (VEC_SIZE * 2)(%rcx)
-	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rcx)
-	addq	$(VEC_SIZE * 4), %rcx
-	cmpq	%rcx, %rdx
-	jne	L(loop)
+	VMOVA	%VEC(0), (%rdi)
+	VMOVA	%VEC(0), VEC_SIZE(%rdi)
+	VMOVA	%VEC(0), (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rdi)
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpq	%rcx, %rdi
+	jb	L(loop)
+L(loop_end):
+	/* NB: rax is set as ptr in MEMSET_VDUP_TO_VEC0_AND_SET_RETURN.
+	   rdx as length is also unchanged.  */
+	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rax, %rdx)
+	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rax, %rdx)
+	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rax, %rdx)
+	VMOVU	%VEC(0), -VEC_SIZE(%rax, %rdx)
	VZEROUPPER_SHORT_RETURN

	.p2align 4
--
GitLab

|
|||||||
From 1b992204f68af851e905c16016756fd4421e1934 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
Date: Sun, 23 May 2021 19:43:24 -0400
|
|
||||||
Subject: [PATCH] x86: Improve memmove-vec-unaligned-erms.S
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
This patch changes the condition for copy 4x VEC so that if length is
|
|
||||||
exactly equal to 4 * VEC_SIZE it will use the 4x VEC case instead of
|
|
||||||
8x VEC case.
|
|
||||||
|
|
||||||
Results For Skylake memcpy-avx2-erms
|
|
||||||
size, al1 , al2 , Cur T , New T , Win , New / Cur
|
|
||||||
128 , 0 , 0 , 9.137 , 6.873 , New , 75.22
|
|
||||||
128 , 7 , 0 , 12.933 , 7.732 , New , 59.79
|
|
||||||
128 , 0 , 7 , 11.852 , 6.76 , New , 57.04
|
|
||||||
128 , 7 , 7 , 12.587 , 6.808 , New , 54.09
|
|
||||||
|
|
||||||
Results For Icelake memcpy-evex-erms
|
|
||||||
size, al1 , al2 , Cur T , New T , Win , New / Cur
|
|
||||||
128 , 0 , 0 , 9.963 , 5.416 , New , 54.36
|
|
||||||
128 , 7 , 0 , 16.467 , 8.061 , New , 48.95
|
|
||||||
128 , 0 , 7 , 14.388 , 7.644 , New , 53.13
|
|
||||||
128 , 7 , 7 , 14.546 , 7.642 , New , 52.54
|
|
||||||
|
|
||||||
Results For Tigerlake memcpy-evex-erms
|
|
||||||
size, al1 , al2 , Cur T , New T , Win , New / Cur
|
|
||||||
128 , 0 , 0 , 8.979 , 4.95 , New , 55.13
|
|
||||||
128 , 7 , 0 , 14.245 , 7.122 , New , 50.0
|
|
||||||
128 , 0 , 7 , 12.668 , 6.675 , New , 52.69
|
|
||||||
128 , 7 , 7 , 13.042 , 6.802 , New , 52.15
|
|
||||||
|
|
||||||
Results For Skylake memmove-avx2-erms
|
|
||||||
size, al1 , al2 , Cur T , New T , Win , New / Cur
|
|
||||||
128 , 0 , 32 , 6.181 , 5.691 , New , 92.07
|
|
||||||
128 , 32 , 0 , 6.165 , 5.752 , New , 93.3
|
|
||||||
128 , 0 , 7 , 13.923 , 9.37 , New , 67.3
|
|
||||||
128 , 7 , 0 , 12.049 , 10.182 , New , 84.5
|
|
||||||
|
|
||||||
Results For Icelake memmove-evex-erms
|
|
||||||
size, al1 , al2 , Cur T , New T , Win , New / Cur
|
|
||||||
128 , 0 , 32 , 5.479 , 4.889 , New , 89.23
|
|
||||||
128 , 32 , 0 , 5.127 , 4.911 , New , 95.79
|
|
||||||
128 , 0 , 7 , 18.885 , 13.547 , New , 71.73
|
|
||||||
128 , 7 , 0 , 15.565 , 14.436 , New , 92.75
|
|
||||||
|
|
||||||
Results For Tigerlake memmove-evex-erms
|
|
||||||
size, al1 , al2 , Cur T , New T , Win , New / Cur
|
|
||||||
128 , 0 , 32 , 5.275 , 4.815 , New , 91.28
|
|
||||||
128 , 32 , 0 , 5.376 , 4.565 , New , 84.91
|
|
||||||
128 , 0 , 7 , 19.426 , 14.273 , New , 73.47
|
|
||||||
128 , 7 , 0 , 15.924 , 14.951 , New , 93.89
|
|
||||||
|
|
||||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S | 6 +++---
|
|
||||||
1 file changed, 3 insertions(+), 3 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
|
||||||
index 3e2dd6bc..572cef04 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
|
||||||
@@ -417,8 +417,8 @@ L(more_2x_vec):
|
|
||||||
cmpq $(VEC_SIZE * 8), %rdx
|
|
||||||
ja L(more_8x_vec)
|
|
||||||
cmpq $(VEC_SIZE * 4), %rdx
|
|
||||||
- jb L(last_4x_vec)
|
|
||||||
- /* Copy from 4 * VEC to 8 * VEC, inclusively. */
|
|
||||||
+ jbe L(last_4x_vec)
|
|
||||||
+ /* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */
|
|
||||||
VMOVU (%rsi), %VEC(0)
|
|
||||||
VMOVU VEC_SIZE(%rsi), %VEC(1)
|
|
||||||
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
|
|
||||||
@@ -437,7 +437,7 @@ L(more_2x_vec):
|
|
||||||
VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
|
|
||||||
VZEROUPPER_RETURN
|
|
||||||
L(last_4x_vec):
|
|
||||||
- /* Copy from 2 * VEC to 4 * VEC. */
|
|
||||||
+ /* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */
|
|
||||||
VMOVU (%rsi), %VEC(0)
|
|
||||||
VMOVU VEC_SIZE(%rsi), %VEC(1)
|
|
||||||
VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2)
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,55 +0,0 @@
|
|||||||
From 08cbcd4dbc686bb38ec3093aff2f919fbff5ec17 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
Date: Wed, 23 Jun 2021 19:19:34 -0400
|
|
||||||
Subject: [PATCH] x86: Remove unnecessary overflow check from wcsnlen-sse4_1.S
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
No bug. The way wcsnlen will check if near the end of maxlen
|
|
||||||
is the following macro:
|
|
||||||
|
|
||||||
mov %r11, %rsi; \
|
|
||||||
subq %rax, %rsi; \
|
|
||||||
andq $-64, %rax; \
|
|
||||||
testq $-64, %rsi; \
|
|
||||||
je L(strnlen_ret)
|
|
||||||
|
|
||||||
Which words independently of s + maxlen overflowing. So the
|
|
||||||
second overflow check is unnecissary for correctness and
|
|
||||||
just extra overhead in the common no overflow case.
|
|
||||||
|
|
||||||
test-strlen.c, test-wcslen.c, test-strnlen.c and test-wcsnlen.c are
|
|
||||||
all passing
|
|
||||||
|
|
||||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
||||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
|
||||||
---
|
|
||||||
sysdeps/x86_64/multiarch/strlen-vec.S | 7 -------
|
|
||||||
1 file changed, 7 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
|
|
||||||
index 439e486a..b7657282 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/strlen-vec.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/strlen-vec.S
|
|
||||||
@@ -71,19 +71,12 @@ L(n_nonzero):
|
|
||||||
suffice. */
|
|
||||||
mov %RSI_LP, %R10_LP
|
|
||||||
sar $62, %R10_LP
|
|
||||||
- test %R10_LP, %R10_LP
|
|
||||||
jnz __wcslen_sse4_1
|
|
||||||
sal $2, %RSI_LP
|
|
||||||
# endif
|
|
||||||
|
|
||||||
-
|
|
||||||
/* Initialize long lived registers. */
|
|
||||||
-
|
|
||||||
add %RDI_LP, %RSI_LP
|
|
||||||
-# ifdef AS_WCSLEN
|
|
||||||
-/* Check for overflow again from s + maxlen * sizeof(wchar_t). */
|
|
||||||
- jbe __wcslen_sse4_1
|
|
||||||
-# endif
|
|
||||||
mov %RSI_LP, %R10_LP
|
|
||||||
and $-64, %R10_LP
|
|
||||||
mov %RSI_LP, %R11_LP
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
@ -1,290 +0,0 @@
|
|||||||
From 82d0b4a4d76db554eb6757acb790fcea30b19965 Mon Sep 17 00:00:00 2001
|
|
||||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
|
||||||
Date: Mon, 21 Jan 2019 11:32:24 -0800
|
|
||||||
Subject: [PATCH] x86-64 memset/wmemset: Properly handle the length parameter
|
|
||||||
[BZ# 24097]
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
On x32, the size_t parameter may be passed in the lower 32 bits of a
|
|
||||||
64-bit register with the non-zero upper 32 bits. The string/memory
|
|
||||||
functions written in assembly can only use the lower 32 bits of a
|
|
||||||
64-bit register as length or must clear the upper 32 bits before using
|
|
||||||
the full 64-bit register for length.
|
|
||||||
|
|
||||||
This pach fixes memset/wmemset for x32. Tested on x86-64 and x32. On
|
|
||||||
x86-64, libc.so is the same with and withou the fix.
|
|
||||||
|
|
||||||
[BZ# 24097]
|
|
||||||
CVE-2019-6488
|
|
||||||
* sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S: Use
|
|
||||||
RDX_LP for length. Clear the upper 32 bits of RDX register.
|
|
||||||
* sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S: Likewise.
|
|
||||||
* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-wmemset.
|
|
||||||
* sysdeps/x86_64/x32/tst-size_t-memset.c: New file.
|
|
||||||
* sysdeps/x86_64/x32/tst-size_t-wmemset.c: Likewise.
|
|
||||||
---
|
|
||||||
.../multiarch/memset-avx512-no-vzeroupper.S | 6 +-
|
|
||||||
.../multiarch/memset-vec-unaligned-erms.S | 34 +++++----
|
|
||||||
sysdeps/x86_64/x32/Makefile | 4 +-
|
|
||||||
sysdeps/x86_64/x32/tst-size_t-memset.c | 73 +++++++++++++++++++
|
|
||||||
sysdeps/x86_64/x32/tst-size_t-wmemset.c | 20 +++++
|
|
||||||
5 files changed, 121 insertions(+), 16 deletions(-)
|
|
||||||
create mode 100644 sysdeps/x86_64/x32/tst-size_t-memset.c
|
|
||||||
create mode 100644 sysdeps/x86_64/x32/tst-size_t-wmemset.c
|
|
||||||
|
|
||||||
Conflicts:
|
|
||||||
ChangeLog
|
|
||||||
(removed)
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S
|
|
||||||
index 689cc119..99e25519 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S
|
|
||||||
@@ -29,12 +29,16 @@
|
|
||||||
.section .text.avx512,"ax",@progbits
|
|
||||||
#if defined PIC
|
|
||||||
ENTRY (MEMSET_CHK)
|
|
||||||
- cmpq %rdx, %rcx
|
|
||||||
+ cmp %RDX_LP, %RCX_LP
|
|
||||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
|
||||||
END (MEMSET_CHK)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
ENTRY (MEMSET)
|
|
||||||
+# ifdef __ILP32__
|
|
||||||
+ /* Clear the upper 32 bits. */
|
|
||||||
+ mov %edx, %edx
|
|
||||||
+# endif
|
|
||||||
vpxor %xmm0, %xmm0, %xmm0
|
|
||||||
vmovd %esi, %xmm1
|
|
||||||
lea (%rdi, %rdx), %rsi
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
|
||||||
index 270a1d49..9a0fd818 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
|
||||||
@@ -65,8 +65,8 @@
|
|
||||||
.section SECTION(.text),"ax",@progbits
|
|
||||||
#if VEC_SIZE == 16 && IS_IN (libc)
|
|
||||||
ENTRY (__bzero)
|
|
||||||
- movq %rdi, %rax /* Set return value. */
|
|
||||||
- movq %rsi, %rdx /* Set n. */
|
|
||||||
+ mov %RDI_LP, %RAX_LP /* Set return value. */
|
|
||||||
+ mov %RSI_LP, %RDX_LP /* Set n. */
|
|
||||||
pxor %xmm0, %xmm0
|
|
||||||
jmp L(entry_from_bzero)
|
|
||||||
END (__bzero)
|
|
||||||
@@ -76,13 +76,13 @@ weak_alias (__bzero, bzero)
|
|
||||||
#if IS_IN (libc)
|
|
||||||
# if defined SHARED
|
|
||||||
ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
|
|
||||||
- cmpq %rdx, %rcx
|
|
||||||
+ cmp %RDX_LP, %RCX_LP
|
|
||||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
|
||||||
END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
|
|
||||||
# endif
|
|
||||||
|
|
||||||
ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
|
|
||||||
- shlq $2, %rdx
|
|
||||||
+ shl $2, %RDX_LP
|
|
||||||
WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
|
|
||||||
jmp L(entry_from_bzero)
|
|
||||||
END (WMEMSET_SYMBOL (__wmemset, unaligned))
|
|
||||||
@@ -90,13 +90,17 @@ END (WMEMSET_SYMBOL (__wmemset, unaligned))
|
|
||||||
|
|
||||||
#if defined SHARED && IS_IN (libc)
|
|
||||||
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
|
|
||||||
- cmpq %rdx, %rcx
|
|
||||||
+ cmp %RDX_LP, %RCX_LP
|
|
||||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
|
||||||
END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
|
|
||||||
#endif
|
|
||||||
|
|
||||||
ENTRY (MEMSET_SYMBOL (__memset, unaligned))
|
|
||||||
MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
|
|
||||||
+# ifdef __ILP32__
|
|
||||||
+ /* Clear the upper 32 bits. */
|
|
||||||
+ mov %edx, %edx
|
|
||||||
+# endif
|
|
||||||
L(entry_from_bzero):
|
|
||||||
cmpq $VEC_SIZE, %rdx
|
|
||||||
jb L(less_vec)
|
|
||||||
@@ -112,14 +116,14 @@ END (MEMSET_SYMBOL (__memset, unaligned))
|
|
||||||
|
|
||||||
# if VEC_SIZE == 16
|
|
||||||
ENTRY (__memset_chk_erms)
|
|
||||||
- cmpq %rdx, %rcx
|
|
||||||
+ cmp %RDX_LP, %RCX_LP
|
|
||||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
|
||||||
END (__memset_chk_erms)
|
|
||||||
|
|
||||||
/* Only used to measure performance of REP STOSB. */
|
|
||||||
ENTRY (__memset_erms)
|
|
||||||
/* Skip zero length. */
|
|
||||||
- testq %rdx, %rdx
|
|
||||||
+ test %RDX_LP, %RDX_LP
|
|
||||||
jnz L(stosb)
|
|
||||||
movq %rdi, %rax
|
|
||||||
ret
|
|
||||||
@@ -131,11 +135,11 @@ ENTRY (MEMSET_SYMBOL (__memset, erms))
|
|
||||||
L(stosb):
|
|
||||||
/* Issue vzeroupper before rep stosb. */
|
|
||||||
VZEROUPPER
|
|
||||||
- movq %rdx, %rcx
|
|
||||||
+ mov %RDX_LP, %RCX_LP
|
|
||||||
movzbl %sil, %eax
|
|
||||||
- movq %rdi, %rdx
|
|
||||||
+ mov %RDI_LP, %RDX_LP
|
|
||||||
rep stosb
|
|
||||||
- movq %rdx, %rax
|
|
||||||
+ mov %RDX_LP, %RAX_LP
|
|
||||||
ret
|
|
||||||
# if VEC_SIZE == 16
|
|
||||||
END (__memset_erms)
|
|
||||||
@@ -145,16 +149,20 @@ END (MEMSET_SYMBOL (__memset, erms))
|
|
||||||
|
|
||||||
# if defined SHARED && IS_IN (libc)
|
|
||||||
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
|
|
||||||
- cmpq %rdx, %rcx
|
|
||||||
+ cmp %RDX_LP, %RCX_LP
|
|
||||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
|
||||||
END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
|
|
||||||
# endif
|
|
||||||
|
|
||||||
ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
|
|
||||||
MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
|
|
||||||
- cmpq $VEC_SIZE, %rdx
|
|
||||||
+# ifdef __ILP32__
|
|
||||||
+ /* Clear the upper 32 bits. */
|
|
||||||
+ mov %edx, %edx
|
|
||||||
+# endif
|
|
||||||
+ cmp $VEC_SIZE, %RDX_LP
|
|
||||||
jb L(less_vec)
|
|
||||||
- cmpq $(VEC_SIZE * 2), %rdx
|
|
||||||
+ cmp $(VEC_SIZE * 2), %RDX_LP
|
|
||||||
ja L(stosb_more_2x_vec)
|
|
||||||
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
|
|
||||||
VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
|
|
||||||
diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
|
|
||||||
index e99dbd7c..98bd9ae9 100644
|
|
||||||
--- a/sysdeps/x86_64/x32/Makefile
|
|
||||||
+++ b/sysdeps/x86_64/x32/Makefile
|
|
||||||
@@ -7,9 +7,9 @@ endif
|
|
||||||
|
|
||||||
ifeq ($(subdir),string)
|
|
||||||
tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \
|
|
||||||
- tst-size_t-memrchr
|
|
||||||
+ tst-size_t-memrchr tst-size_t-memset
|
|
||||||
endif
|
|
||||||
|
|
||||||
ifeq ($(subdir),wcsmbs)
|
|
||||||
-tests += tst-size_t-wmemchr tst-size_t-wmemcmp
|
|
||||||
+tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset
|
|
||||||
endif
|
|
||||||
diff --git a/sysdeps/x86_64/x32/tst-size_t-memset.c b/sysdeps/x86_64/x32/tst-size_t-memset.c
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..2c367af6
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/x86_64/x32/tst-size_t-memset.c
|
|
||||||
@@ -0,0 +1,73 @@
|
|
||||||
+/* Test memset with size_t in the lower 32 bits of 64-bit register.
|
|
||||||
+ Copyright (C) 2019 Free Software Foundation, Inc.
|
|
||||||
+ This file is part of the GNU C Library.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is free software; you can redistribute it and/or
|
|
||||||
+ modify it under the terms of the GNU Lesser General Public
|
|
||||||
+ License as published by the Free Software Foundation; either
|
|
||||||
+ version 2.1 of the License, or (at your option) any later version.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
||||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
+ Lesser General Public License for more details.
|
|
||||||
+
|
|
||||||
+ You should have received a copy of the GNU Lesser General Public
|
|
||||||
+ License along with the GNU C Library; if not, see
|
|
||||||
+ <http://www.gnu.org/licenses/>. */
|
|
||||||
+
|
|
||||||
+#ifdef WIDE
|
|
||||||
+# define TEST_NAME "wmemset"
|
|
||||||
+#else
|
|
||||||
+# define TEST_NAME "memset"
|
|
||||||
+#endif /* WIDE */
|
|
||||||
+
|
|
||||||
+#include "test-size_t.h"
|
|
||||||
+
|
|
||||||
+#ifdef WIDE
|
|
||||||
+# include <wchar.h>
|
|
||||||
+# define MEMSET wmemset
|
|
||||||
+# define CHAR wchar_t
|
|
||||||
+#else
|
|
||||||
+# define MEMSET memset
|
|
||||||
+# define CHAR char
|
|
||||||
+#endif /* WIDE */
|
|
||||||
+
|
|
||||||
+IMPL (MEMSET, 1)
|
|
||||||
+
|
|
||||||
+typedef CHAR *(*proto_t) (CHAR *, int, size_t);
|
|
||||||
+
|
|
||||||
+static void *
|
|
||||||
+__attribute__ ((noinline, noclone))
|
|
||||||
+do_memset (parameter_t a, parameter_t b)
|
|
||||||
+{
|
|
||||||
+ return CALL (&b, a.p, (uintptr_t) b.p, a.len);
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+static int
|
|
||||||
+test_main (void)
|
|
||||||
+{
|
|
||||||
+ test_init ();
|
|
||||||
+
|
|
||||||
+ CHAR ch = 0x23;
|
|
||||||
+ parameter_t src = { { page_size / sizeof (CHAR) }, buf2 };
|
|
||||||
+ parameter_t c = { { 0 }, (void *) (uintptr_t) ch };
|
|
||||||
+
|
|
||||||
+ int ret = 0;
|
|
||||||
+ FOR_EACH_IMPL (impl, 0)
|
|
||||||
+ {
|
|
||||||
+ c.fn = impl->fn;
|
|
||||||
+ CHAR *p = (CHAR *) do_memset (src, c);
|
|
||||||
+ size_t i;
|
|
||||||
+ for (i = 0; i < src.len; i++)
|
|
||||||
+ if (p[i] != ch)
|
|
||||||
+ {
|
|
||||||
+ error (0, 0, "Wrong result in function %s", impl->name);
|
|
||||||
+ ret = 1;
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ return ret ? EXIT_FAILURE : EXIT_SUCCESS;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+#include <support/test-driver.c>
|
|
||||||
diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemset.c b/sysdeps/x86_64/x32/tst-size_t-wmemset.c
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..955eb488
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/sysdeps/x86_64/x32/tst-size_t-wmemset.c
|
|
||||||
@@ -0,0 +1,20 @@
|
|
||||||
+/* Test wmemset with size_t in the lower 32 bits of 64-bit register.
|
|
||||||
+ Copyright (C) 2019 Free Software Foundation, Inc.
|
|
||||||
+ This file is part of the GNU C Library.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is free software; you can redistribute it and/or
|
|
||||||
+ modify it under the terms of the GNU Lesser General Public
|
|
||||||
+ License as published by the Free Software Foundation; either
|
|
||||||
+ version 2.1 of the License, or (at your option) any later version.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
||||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
+ Lesser General Public License for more details.
|
|
||||||
+
|
|
||||||
+ You should have received a copy of the GNU Lesser General Public
|
|
||||||
+ License along with the GNU C Library; if not, see
|
|
||||||
+ <http://www.gnu.org/licenses/>. */
|
|
||||||
+
|
|
||||||
+#define WIDE 1
|
|
||||||
+#include "tst-size_t-memset.c"
|
|
||||||
--
|
|
||||||
GitLab
|
|
||||||
|
|
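The __ILP32__ hunks above all reduce to one trick: on x86-64, writing a 32-bit register implicitly zero-extends into the full 64-bit register, so "mov %edx, %edx" is the cheapest way to discard whatever the caller left in the upper half of %rdx before the value is used as a 64-bit length. A minimal standalone sketch of that property (illustrative only, not part of the patch; the demo value is made up):

#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  uint64_t len = 0xdeadbeef00000005ULL;  /* x32-style dirty upper bits */
  uint64_t cleared;
  /* The %k operand modifier forces the 32-bit register names, so this
     is exactly the "mov %edx, %edx" pattern from the patch.  */
  __asm__ ("mov %k1, %k0" : "=r" (cleared) : "r" (len));
  printf ("%#llx -> %#llx\n", (unsigned long long) len,
          (unsigned long long) cleared);  /* prints ... -> 0x5 */
  return 0;
}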
@ -1,43 +0,0 @@
From 447954a206837b5f153869cfeeeab44631c3fac9 Mon Sep 17 00:00:00 2001
Author: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com> 2021-05-23 21:43:10
Committer: H.J. Lu <hjl.tools@gmail.com> 2021-06-27 10:56:57
Parent: 2c16cb88a6e5ace0fb7cedca86860ea7bde522a7 (Linux: Move timer helper routines from librt to libc)
Child: 1683249d17e14827b6579529742eb895027dfa84 (x86_64: roundeven with sse4.1 support)
Branches: master, remotes/origin/master and many more (41)
Follows: glibc-2.33.9000
Precedes: glibc-2.34

math: redirect roundeven function

This patch redirects the roundeven function in preparation for further changes.

Signed-off-by: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Conflicts:
	*
	  (rewritten for older branch)

diff --git a/sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c b/sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c
index 7bbbb2dc..8728d0f2 100644
--- a/sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c
+++ b/sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c
@@ -17,6 +17,7 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>.  */

+#define NO_MATH_REDIRECT
#include <math.h>
#include <math_private.h>
#include <libm-alias-double.h>
@@ -67,5 +68,6 @@ __roundeven (double x)
INSERT_WORDS64 (x, ix);
return x;
}
-hidden_def (__roundeven)
+#ifndef __roundeven
libm_alias_double (__roundeven, roundeven)
+#endif
--
GitLab

@ -1,118 +0,0 @@
From 447954a206837b5f153869cfeeeab44631c3fac9 Mon Sep 17 00:00:00 2001
From: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com>
Date: Mon, 24 May 2021 09:43:10 +0800
Subject: [PATCH] math: redirect roundeven function
Content-type: text/plain; charset=UTF-8

This patch redirects the roundeven function in preparation for further changes.

Signed-off-by: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
include/math.h | 3 ++-
sysdeps/ieee754/dbl-64/s_roundeven.c | 4 +++-
sysdeps/ieee754/float128/s_roundevenf128.c | 1 +
sysdeps/ieee754/flt-32/s_roundevenf.c | 3 +++
sysdeps/ieee754/ldbl-128/s_roundevenl.c | 1 +
sysdeps/ieee754/ldbl-96/s_roundevenl.c | 1 +
6 files changed, 11 insertions(+), 2 deletions(-)

Conflicts:
	include/math.h
	  (missing MATH_REDIRECT macros)

diff --git a/include/math.h b/include/math.h
index e21d34b8..1f9f9a54 100644
--- a/include/math.h
+++ b/include/math.h
@@ -38,7 +38,6 @@ libm_hidden_proto (__issignaling)
libm_hidden_proto (__issignalingf)
libm_hidden_proto (__exp)
libm_hidden_proto (__expf)
-libm_hidden_proto (__roundeven)

# ifndef __NO_LONG_DOUBLE_MATH
libm_hidden_proto (__fpclassifyl)
@@ -56,6 +55,8 @@ libm_hidden_proto (__expm1f128)

# if !(defined __FINITE_MATH_ONLY__ && __FINITE_MATH_ONLY__ > 0)
# ifndef NO_MATH_REDIRECT
+float (roundevenf) (float) asm ("__roundevenf");
+double (roundeven) (double) asm ("__roundeven");
/* Declare sqrt for use within GLIBC.  Compilers typically inline sqrt as a
single instruction.  Use an asm to avoid use of PLTs if it doesn't.  */
float (sqrtf) (float) asm ("__ieee754_sqrtf");
diff --git a/sysdeps/ieee754/dbl-64/s_roundeven.c b/sysdeps/ieee754/dbl-64/s_roundeven.c
index 1438e81d..61962184 100644
--- a/sysdeps/ieee754/dbl-64/s_roundeven.c
+++ b/sysdeps/ieee754/dbl-64/s_roundeven.c
@@ -17,6 +17,7 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>.  */

+#define NO_MATH_REDIRECT
#include <math.h>
#include <math_private.h>
#include <libm-alias-double.h>
@@ -101,5 +102,6 @@ __roundeven (double x)
INSERT_WORDS (x, hx, lx);
return x;
}
-hidden_def (__roundeven)
+#ifndef __roundeven
libm_alias_double (__roundeven, roundeven)
+#endif
diff --git a/sysdeps/ieee754/float128/s_roundevenf128.c b/sysdeps/ieee754/float128/s_roundevenf128.c
index 5a9b3f39..e0faf727 100644
--- a/sysdeps/ieee754/float128/s_roundevenf128.c
+++ b/sysdeps/ieee754/float128/s_roundevenf128.c
@@ -1,2 +1,3 @@
+#define NO_MATH_REDIRECT
#include <float128_private.h>
#include "../ldbl-128/s_roundevenl.c"
diff --git a/sysdeps/ieee754/flt-32/s_roundevenf.c b/sysdeps/ieee754/flt-32/s_roundevenf.c
index 90f991d5..a661875e 100644
--- a/sysdeps/ieee754/flt-32/s_roundevenf.c
+++ b/sysdeps/ieee754/flt-32/s_roundevenf.c
@@ -17,6 +17,7 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>.  */

+#define NO_MATH_REDIRECT
#include <math.h>
#include <math_private.h>
#include <libm-alias-float.h>
@@ -67,4 +68,6 @@ __roundevenf (float x)
SET_FLOAT_WORD (x, ix);
return x;
}
+#ifndef __roundevenf
libm_alias_float (__roundeven, roundeven)
+#endif
diff --git a/sysdeps/ieee754/ldbl-128/s_roundevenl.c b/sysdeps/ieee754/ldbl-128/s_roundevenl.c
index 5fc59af4..b9375b6c 100644
--- a/sysdeps/ieee754/ldbl-128/s_roundevenl.c
+++ b/sysdeps/ieee754/ldbl-128/s_roundevenl.c
@@ -17,6 +17,7 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>.  */

+#define NO_MATH_REDIRECT
#include <math.h>
#include <math_private.h>
#include <libm-alias-ldouble.h>
diff --git a/sysdeps/ieee754/ldbl-96/s_roundevenl.c b/sysdeps/ieee754/ldbl-96/s_roundevenl.c
index be2e4fa4..65031ab7 100644
--- a/sysdeps/ieee754/ldbl-96/s_roundevenl.c
+++ b/sysdeps/ieee754/ldbl-96/s_roundevenl.c
@@ -17,6 +17,7 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>.  */

+#define NO_MATH_REDIRECT
#include <math.h>
#include <math_private.h>
#include <libm-alias-ldouble.h>
--
GitLab

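Both roundeven patches hang off the asm-label declarations added to include/math.h: declaring "double (roundeven) (double) asm ("__roundeven");" makes every internal call to roundeven assemble as a direct call to the __roundeven symbol, bypassing the PLT, so later patches can retarget __roundeven freely. A self-contained sketch of the GNU C idiom, with hypothetical names (not the glibc declarations):

#include <stdio.h>

/* Calls to redirected () compile as calls to the symbol real_impl;
   both live in this one file, so the demo links standalone.  */
double redirected (double) __asm__ ("real_impl");

double
real_impl (double x)
{
  return x * 2.0;
}

int
main (void)
{
  printf ("%g\n", redirected (21.0));  /* prints 42 */
  return 0;
}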
@ -1,242 +0,0 @@
From 1683249d17e14827b6579529742eb895027dfa84 Mon Sep 17 00:00:00 2001
From: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com>
Date: Mon, 24 May 2021 09:43:11 +0800
Subject: [PATCH] x86_64: roundeven with sse4.1 support
Content-type: text/plain; charset=UTF-8

This patch adds support for the sse4.1 hardware floating point
roundeven.

Here are some benchmark results on my systems:

=AMD Ryzen 9 3900X 12-Core Processor=

* benchmark result before this commit
|            | roundeven    | roundevenf   |
|------------|--------------|--------------|
| duration   | 3.75587e+09  | 3.75114e+09  |
| iterations | 3.93053e+08  | 4.35402e+08  |
| max        | 52.592       | 58.71        |
| min        | 7.98         | 7.22         |
| mean       | 9.55563      | 8.61535      |

* benchmark result after this commit
|            | roundeven     | roundevenf   |
|------------|---------------|--------------|
| duration   | 3.73815e+09   | 3.73738e+09  |
| iterations | 5.82692e+08   | 5.91498e+08  |
| max        | 56.468        | 51.642       |
| min        | 6.27          | 6.156        |
| mean       | 6.41532       | 6.3185       |

=Intel(R) Pentium(R) CPU D1508 @ 2.20GHz=

* benchmark result before this commit
|            | roundeven    | roundevenf   |
|------------|--------------|--------------|
| duration   | 2.18208e+09  | 2.18258e+09  |
| iterations | 2.39932e+08  | 2.46924e+08  |
| max        | 96.378       | 98.035       |
| min        | 6.776        | 5.94         |
| mean       | 9.09456      | 8.83907      |

* benchmark result after this commit
|            | roundeven    | roundevenf   |
|------------|--------------|--------------|
| duration   | 2.17415e+09  | 2.17005e+09  |
| iterations | 3.56193e+08  | 4.09824e+08  |
| max        | 51.693       | 97.192       |
| min        | 5.926        | 5.093        |
| mean       | 6.10385      | 5.29507      |

Signed-off-by: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/fpu/multiarch/Makefile | 5 +--
sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c | 2 ++
.../x86_64/fpu/multiarch/s_roundeven-sse4_1.S | 24 ++++++++++++++
sysdeps/x86_64/fpu/multiarch/s_roundeven.c | 31 +++++++++++++++++++
sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c | 3 ++
.../fpu/multiarch/s_roundevenf-sse4_1.S | 24 ++++++++++++++
sysdeps/x86_64/fpu/multiarch/s_roundevenf.c | 31 +++++++++++++++++++
7 files changed, 118 insertions(+), 2 deletions(-)
create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c
create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundeven-sse4_1.S
create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundeven.c
create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c
create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundevenf-sse4_1.S
create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundevenf.c

diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
index 9f387248..6ddd1c01 100644
--- a/sysdeps/x86_64/fpu/multiarch/Makefile
+++ b/sysdeps/x86_64/fpu/multiarch/Makefile
@@ -1,11 +1,12 @@
ifeq ($(subdir),math)
libm-sysdep_routines += s_floor-c s_ceil-c s_floorf-c s_ceilf-c \
s_rint-c s_rintf-c s_nearbyint-c s_nearbyintf-c \
- s_trunc-c s_truncf-c
+ s_roundeven-c s_roundevenf-c s_trunc-c s_truncf-c

libm-sysdep_routines += s_ceil-sse4_1 s_ceilf-sse4_1 s_floor-sse4_1 \
s_floorf-sse4_1 s_nearbyint-sse4_1 \
- s_nearbyintf-sse4_1 s_rint-sse4_1 s_rintf-sse4_1 \
+ s_nearbyintf-sse4_1 s_roundeven-sse4_1 \
+ s_roundevenf-sse4_1 s_rint-sse4_1 s_rintf-sse4_1 \
s_trunc-sse4_1 s_truncf-sse4_1

libm-sysdep_routines += e_exp-fma e_log-fma e_pow-fma s_atan-fma \
diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c b/sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c
new file mode 100644
index 00000000..c7be43cb
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c
@@ -0,0 +1,2 @@
+#define __roundeven __roundeven_c
+#include <sysdeps/ieee754/dbl-64/s_roundeven.c>
diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundeven-sse4_1.S b/sysdeps/x86_64/fpu/multiarch/s_roundeven-sse4_1.S
new file mode 100644
index 00000000..6ae8f6b1
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/s_roundeven-sse4_1.S
@@ -0,0 +1,24 @@
+/* Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+ .section .text.sse4.1,"ax",@progbits
+ENTRY(__roundeven_sse41)
+ roundsd $8, %xmm0, %xmm0
+ ret
+END(__roundeven_sse41)
diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundeven.c b/sysdeps/x86_64/fpu/multiarch/s_roundeven.c
new file mode 100644
index 00000000..d92eda65
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/s_roundeven.c
@@ -0,0 +1,31 @@
+/* Multiple versions of __roundeven.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>.  */
+
+#include <libm-alias-double.h>
+
+#define roundeven __redirect_roundeven
+#define __roundeven __redirect___roundeven
+#include <math.h>
+#undef roundeven
+#undef __roundeven
+
+#define SYMBOL_NAME roundeven
+#include "ifunc-sse4_1.h"
+
+libc_ifunc_redirected (__redirect_roundeven, __roundeven, IFUNC_SELECTOR ());
+libm_alias_double (__roundeven, roundeven)
diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c b/sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c
new file mode 100644
index 00000000..72a6e7d1
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c
@@ -0,0 +1,3 @@
+#undef __roundevenf
+#define __roundevenf __roundevenf_c
+#include <sysdeps/ieee754/flt-32/s_roundevenf.c>
diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundevenf-sse4_1.S b/sysdeps/x86_64/fpu/multiarch/s_roundevenf-sse4_1.S
new file mode 100644
index 00000000..a76e1080
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/s_roundevenf-sse4_1.S
@@ -0,0 +1,24 @@
+/* Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+ .section .text.sse4.1,"ax",@progbits
+ENTRY(__roundevenf_sse41)
+ roundss $8, %xmm0, %xmm0
+ ret
+END(__roundevenf_sse41)
diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundevenf.c b/sysdeps/x86_64/fpu/multiarch/s_roundevenf.c
new file mode 100644
index 00000000..2ee196e6
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/s_roundevenf.c
@@ -0,0 +1,31 @@
+/* Multiple versions of __roundevenf.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>.  */
+
+#include <libm-alias-float.h>
+
+#define roundevenf __redirect_roundevenf
+#define __roundevenf __redirect___roundevenf
+#include <math.h>
+#undef roundevenf
+#undef __roundevenf
+
+#define SYMBOL_NAME roundevenf
+#include "ifunc-sse4_1.h"
+
+libc_ifunc_redirected (__redirect_roundevenf, __roundevenf, IFUNC_SELECTOR ());
+libm_alias_float (__roundeven, roundeven)
--
GitLab

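s_roundeven.c and s_roundevenf.c select between the C fallback and the SSE4.1 entry points at load time through glibc's internal libc_ifunc_redirected machinery. The shape of that dispatch can be sketched with the plain GCC ifunc attribute; this is a hedged approximation, not the glibc code, and the fallback assumes the process runs in the default round-to-nearest-even rounding mode:

#include <stdio.h>

static double
my_roundeven_c (double x)
{
  /* rint () honours the current rounding mode; the default mode is
     round-to-nearest-even, which is what roundeven wants.  */
  return __builtin_rint (x);
}

static double
my_roundeven_sse41 (double x)
{
  double r;
  /* $8 = suppress exceptions + round to nearest even, as in the patch.  */
  __asm__ ("roundsd $8, %1, %0" : "=x" (r) : "x" (x));
  return r;
}

/* The resolver runs once at load time and picks an implementation.  */
static double (*resolve_my_roundeven (void)) (double)
{
  __builtin_cpu_init ();
  return __builtin_cpu_supports ("sse4.1")
         ? my_roundeven_sse41 : my_roundeven_c;
}

double my_roundeven (double)
  __attribute__ ((ifunc ("resolve_my_roundeven")));

int
main (void)
{
  printf ("%g %g\n", my_roundeven (2.5), my_roundeven (3.5));  /* 2 4 */
  return 0;
}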
@ -1,41 +0,0 @@
From 7e08db3359c86c94918feb33a1182cd0ff3bb10b Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Sun, 9 Jan 2022 16:02:28 -0600
Subject: [PATCH] x86: Fix __wcsncmp_evex in strcmp-evex.S [BZ# 28755]
Content-type: text/plain; charset=UTF-8

Fixes [BZ# 28755] for wcsncmp by redirecting length >= 2^56 to
__wcscmp_evex. For x86_64 this covers the entire address range so any
length larger could not possibly be used to bound `s1` or `s2`.

test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass.

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
sysdeps/x86_64/multiarch/strcmp-evex.S | 10 ++++++++++
1 file changed, 10 insertions(+)

diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
index 459eeed0..d5aa6daa 100644
--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
@@ -97,6 +97,16 @@ ENTRY (STRCMP)
je L(char0)
jb L(zero)
# ifdef USE_AS_WCSCMP
+# ifndef __ILP32__
+ movq %rdx, %rcx
+ /* Check if length could overflow when multiplied by
+ sizeof(wchar_t). Checking top 8 bits will cover all potential
+ overflow cases as well as redirect cases where its impossible to
+ length to bound a valid memory region. In these cases just use
+ 'wcscmp'. */
+ shrq $56, %rcx
+ jnz __wcscmp_evex
+# endif
/* Convert units: from wide to byte char. */
shl $2, %RDX_LP
# endif
--
GitLab

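The guard added above is a range argument: with a 4-byte wchar_t, any length with a bit set in the top 8 bits of the 64-bit register cannot describe a real buffer once multiplied by sizeof (wchar_t), so the bounded compare can safely fall back to the unbounded one. The same check in C, as a sketch (my_wcsncmp is a hypothetical wrapper, not a glibc function):

#include <stddef.h>
#include <stdio.h>
#include <wchar.h>

static int
my_wcsncmp (const wchar_t *s1, const wchar_t *s2, size_t n)
{
  if (n >> 56)                 /* same test as "shrq $56, %rcx; jnz" */
    return wcscmp (s1, s2);    /* n cannot bound a valid region */
  return wcsncmp (s1, s2, n);
}

int
main (void)
{
  /* SIZE_MAX has all top bits set, so this takes the wcscmp path.  */
  printf ("%d\n", my_wcsncmp (L"abc", L"abd", (size_t) -1));
  return 0;
}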
@ -1,45 +0,0 @@
From bad852b61b79503fcb3c5fc379c70f768df3e1fb Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Sat, 23 Oct 2021 01:26:47 -0400
Subject: [PATCH] x86: Replace sse2 instructions with avx in
memcmp-evex-movbe.S
Content-type: text/plain; charset=UTF-8

This commit replaces two usages of SSE2 'movups' with AVX 'vmovdqu'.

It could potentially be dangerous to use SSE2 if this function is ever
called without using 'vzeroupper' beforehand. While compilers appear
to use 'vzeroupper' before function calls if AVX2 has been used, using
SSE2 here is more brittle. Since it is not absolutely necessary it
should be avoided.

It costs 2 extra bytes, but the extra bytes should only eat into
alignment padding.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
index 2761b54f..640f6757 100644
--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
@@ -561,13 +561,13 @@ L(between_16_31):
/* From 16 to 31 bytes. No branch when size == 16. */

/* Use movups to save code size. */
- movups (%rsi), %xmm2
+ vmovdqu (%rsi), %xmm2
VPCMP $4, (%rdi), %xmm2, %k1
kmovd %k1, %eax
testl %eax, %eax
jnz L(return_vec_0_lv)
/* Use overlapping loads to avoid branches. */
- movups -16(%rsi, %rdx, CHAR_SIZE), %xmm2
+ vmovdqu -16(%rsi, %rdx, CHAR_SIZE), %xmm2
VPCMP $4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1
addl $(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx
kmovd %k1, %eax
--
GitLab

@ -1,300 +0,0 @@
From ee915088a0231cd421054dbd8abab7aadf331153 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Mon, 21 Jan 2019 11:33:52 -0800
Subject: [PATCH] x86-64 strncmp family: Properly handle the length parameter
[BZ# 24097]
Content-type: text/plain; charset=UTF-8

On x32, the size_t parameter may be passed in the lower 32 bits of a
64-bit register with the non-zero upper 32 bits. The string/memory
functions written in assembly can only use the lower 32 bits of a
64-bit register as length or must clear the upper 32 bits before using
the full 64-bit register for length.

This patch fixes the strncmp family for x32. Tested on x86-64 and x32.
On x86-64, libc.so is the same with and without the fix.

[BZ# 24097]
CVE-2019-6488
* sysdeps/x86_64/multiarch/strcmp-avx2.S: Use RDX_LP for length.
* sysdeps/x86_64/multiarch/strcmp-sse42.S: Likewise.
* sysdeps/x86_64/strcmp.S: Likewise.
* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-strncasecmp,
tst-size_t-strncmp and tst-size_t-wcsncmp.
* sysdeps/x86_64/x32/tst-size_t-strncasecmp.c: New file.
* sysdeps/x86_64/x32/tst-size_t-strncmp.c: Likewise.
* sysdeps/x86_64/x32/tst-size_t-wcsncmp.c: Likewise.
---
sysdeps/x86_64/multiarch/strcmp-avx2.S | 6 +-
sysdeps/x86_64/multiarch/strcmp-sse42.S | 6 +-
sysdeps/x86_64/strcmp.S | 6 +-
sysdeps/x86_64/x32/Makefile | 6 +-
sysdeps/x86_64/x32/tst-size_t-strncasecmp.c | 59 ++++++++++++++++
sysdeps/x86_64/x32/tst-size_t-strncmp.c | 78 +++++++++++++++++++++
sysdeps/x86_64/x32/tst-size_t-wcsncmp.c | 20 ++++++
7 files changed, 170 insertions(+), 11 deletions(-)
create mode 100644 sysdeps/x86_64/x32/tst-size_t-strncasecmp.c
create mode 100644 sysdeps/x86_64/x32/tst-size_t-strncmp.c
create mode 100644 sysdeps/x86_64/x32/tst-size_t-wcsncmp.c

Conflicts:
	ChangeLog
	  (removed)

diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
index 327e3d87..156c1949 100644
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
@@ -79,15 +79,15 @@
ENTRY (STRCMP)
# ifdef USE_AS_STRNCMP
/* Check for simple cases (0 or 1) in offset. */
- cmp $1, %rdx
+ cmp $1, %RDX_LP
je L(char0)
jb L(zero)
# ifdef USE_AS_WCSCMP
/* Convert units: from wide to byte char. */
- shl $2, %rdx
+ shl $2, %RDX_LP
# endif
/* Register %r11 tracks the maximum offset. */
- movq %rdx, %r11
+ mov %RDX_LP, %R11_LP
# endif
movl %edi, %eax
xorl %edx, %edx
diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
index d3c07bd2..a1ebea46 100644
--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
+++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S
@@ -156,11 +156,11 @@ STRCMP_SSE42:
#endif

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- test %rdx, %rdx
+ test %RDX_LP, %RDX_LP
je LABEL(strcmp_exitz)
- cmp $1, %rdx
+ cmp $1, %RDX_LP
je LABEL(Byte0)
- mov %rdx, %r11
+ mov %RDX_LP, %R11_LP
#endif
mov %esi, %ecx
mov %edi, %eax
diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
index e16945b9..f47c8ad4 100644
--- a/sysdeps/x86_64/strcmp.S
+++ b/sysdeps/x86_64/strcmp.S
@@ -135,11 +135,11 @@ ENTRY (STRCMP)
* This implementation uses SSE to compare up to 16 bytes at a time.
*/
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- test %rdx, %rdx
+ test %RDX_LP, %RDX_LP
je LABEL(strcmp_exitz)
- cmp $1, %rdx
+ cmp $1, %RDX_LP
je LABEL(Byte0)
- mov %rdx, %r11
+ mov %RDX_LP, %R11_LP
#endif
mov %esi, %ecx
mov %edi, %eax
diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
index 98bd9ae9..db302839 100644
--- a/sysdeps/x86_64/x32/Makefile
+++ b/sysdeps/x86_64/x32/Makefile
@@ -7,9 +7,11 @@ endif

ifeq ($(subdir),string)
tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \
- tst-size_t-memrchr tst-size_t-memset
+ tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \
+ tst-size_t-strncmp
endif

ifeq ($(subdir),wcsmbs)
-tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset
+tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset \
+ tst-size_t-wcsncmp
endif
diff --git a/sysdeps/x86_64/x32/tst-size_t-strncasecmp.c b/sysdeps/x86_64/x32/tst-size_t-strncasecmp.c
new file mode 100644
index 00000000..86233593
--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-strncasecmp.c
@@ -0,0 +1,59 @@
+/* Test strncaecmp with size_t in the lower 32 bits of 64-bit register.
+ Copyright (C) 2019 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>.  */
+
+#define TEST_NAME "strncasecmp"
+#include "test-size_t.h"
+
+IMPL (strncasecmp, 1)
+
+typedef int (*proto_t) (const char *, const char *, size_t);
+
+static int
+__attribute__ ((noinline, noclone))
+do_strncasecmp (parameter_t a, parameter_t b)
+{
+ return CALL (&b, a.p, b.p, a.len);
+}
+
+static int
+test_main (void)
+{
+ test_init ();
+
+ parameter_t dest = { { page_size }, buf1 };
+ parameter_t src = { { 0 }, buf2 };
+
+ strncpy ((char *) buf1, (const char *) buf2, page_size);
+
+ int ret = 0;
+ FOR_EACH_IMPL (impl, 0)
+ {
+ src.fn = impl->fn;
+ int res = do_strncasecmp (dest, src);
+ if (res)
+ {
+ error (0, 0, "Wrong result in function %s: %i != 0",
+ impl->name, res);
+ ret = 1;
+ }
+ }
+
+ return ret ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#include <support/test-driver.c>
diff --git a/sysdeps/x86_64/x32/tst-size_t-strncmp.c b/sysdeps/x86_64/x32/tst-size_t-strncmp.c
new file mode 100644
index 00000000..54e6bd83
--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-strncmp.c
@@ -0,0 +1,78 @@
+/* Test strncmp with size_t in the lower 32 bits of 64-bit register.
+ Copyright (C) 2019 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>.  */
+
+#ifdef WIDE
+# define TEST_NAME "wcsncmp"
+#else
+# define TEST_NAME "strncmp"
+#endif
+
+#include "test-size_t.h"
+
+#ifdef WIDE
+# include <wchar.h>
+
+# define STRNCMP wcsncmp
+# define STRNCPY wcsncpy
+# define CHAR wchar_t
+#else
+# define STRNCMP strncmp
+# define STRNCPY strncpy
+# define CHAR char
+#endif
+
+IMPL (STRNCMP, 1)
+
+typedef int (*proto_t) (const CHAR *, const CHAR *, size_t);
+
+
+static int
+__attribute__ ((noinline, noclone))
+do_strncmp (parameter_t a, parameter_t b)
+{
+ return CALL (&b, a.p, b.p, a.len);
+}
+
+static int
+test_main (void)
+{
+ test_init ();
+
+ size_t size = page_size / sizeof (CHAR);
+ parameter_t dest = { { size }, buf1 };
+ parameter_t src = { { 0 }, buf2 };
+
+ STRNCPY ((CHAR *) buf1, (const CHAR *) buf2, size);
+
+ int ret = 0;
+ FOR_EACH_IMPL (impl, 0)
+ {
+ src.fn = impl->fn;
+ int res = do_strncmp (dest, src);
+ if (res)
+ {
+ error (0, 0, "Wrong result in function %s: %i != 0",
+ impl->name, res);
+ ret = 1;
+ }
+ }
+
+ return ret ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#include <support/test-driver.c>
diff --git a/sysdeps/x86_64/x32/tst-size_t-wcsncmp.c b/sysdeps/x86_64/x32/tst-size_t-wcsncmp.c
new file mode 100644
index 00000000..4829647c
--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-wcsncmp.c
@@ -0,0 +1,20 @@
+/* Test wcsncmp with size_t in the lower 32 bits of 64-bit register.
+ Copyright (C) 2019 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>.  */
+
+#define WIDE 1
+#include "tst-size_t-strncmp.c"
--
GitLab

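Every hunk in this patch is the same substitution: a raw 64-bit length register is replaced by its *_LP macro. The scheme can be sketched as follows (an assumption about how the sysdep.h macros are laid out, not a quotation): on LP64 the macros expand to the 64-bit register names, on x32 to the 32-bit halves, so an instruction like "cmp $1, %RDX_LP" only ever inspects the bits that actually carry the 32-bit size_t.

#ifdef __LP64__
# define RAX_LP rax
# define RDX_LP rdx
# define R11_LP r11
#else
/* x32: ILP32 on x86-64; operate on the 32-bit register halves.  */
# define RAX_LP eax
# define RDX_LP edx
# define R11_LP r11d
#endif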
@ -1,56 +0,0 @@
From cf2c57526ba4b57e6863ad4db8a868e2678adce8 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Fri, 30 Apr 2021 05:58:59 -0700
Subject: [PATCH] x86: Set rep_movsb_threshold to 2112 on processors with FSRM
Content-type: text/plain; charset=UTF-8

The glibc memcpy benchmark on Intel Core i7-1065G7 (Ice Lake) showed
that REP MOVSB became faster after 2112 bytes:

                                   Vector Move   REP MOVSB
length=2112, align1=0, align2=0:   24.20         24.40
length=2112, align1=1, align2=0:   26.07         23.13
length=2112, align1=0, align2=1:   27.18         28.13
length=2112, align1=1, align2=1:   26.23         25.16
length=2176, align1=0, align2=0:   23.18         22.52
length=2176, align1=2, align2=0:   25.45         22.52
length=2176, align1=0, align2=2:   27.14         27.82
length=2176, align1=2, align2=2:   22.73         25.56
length=2240, align1=0, align2=0:   24.62         24.25
length=2240, align1=3, align2=0:   29.77         27.15
length=2240, align1=0, align2=3:   35.55         29.93
length=2240, align1=3, align2=3:   34.49         25.15
length=2304, align1=0, align2=0:   34.75         26.64
length=2304, align1=4, align2=0:   32.09         22.63
length=2304, align1=0, align2=4:   28.43         31.24

Use REP MOVSB for data size > 2112 bytes in memcpy on processors with
fast short REP MOVSB (FSRM).

* sysdeps/x86/dl-cacheinfo.h (dl_init_cacheinfo): Set
rep_movsb_threshold to 2112 on processors with fast short REP
MOVSB (FSRM).
---
sysdeps/x86/cacheinfo.h | 6 ++++++
1 file changed, 6 insertions(+)

diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
index f72f634a..cc3941d3 100644
--- a/sysdeps/x86/cacheinfo.h
+++ b/sysdeps/x86/cacheinfo.h
@@ -430,6 +430,12 @@ init_cacheinfo (void)
rep_movsb_threshold = 2048 * (16 / 16);
minimum_rep_movsb_threshold = 16 * 8;
}
+
+ /* NB: The default REP MOVSB threshold is 2112 on processors with fast
+ short REP MOVSB (FSRM).  */
+ if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
+ rep_movsb_threshold = 2112;
+
if (cpu_features->rep_movsb_threshold > minimum_rep_movsb_threshold)
__x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold;
else
--
GitLab

@ -1,51 +0,0 @@
From d672a98a1af106bd68deb15576710cd61363f7a6 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Tue, 2 Nov 2021 18:33:07 -0700
Subject: [PATCH] Add LLL_MUTEX_READ_LOCK [BZ #28537]
Content-type: text/plain; charset=UTF-8

The CAS instruction is expensive. From the x86 CPU's point of view, getting
a cache line for writing is more expensive than reading. See Appendix
A.2 Spinlock in:

https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/xeon-lock-scaling-analysis-paper.pdf

The full compare and swap will grab the cache line exclusive and cause
excessive cache line bouncing.

Add LLL_MUTEX_READ_LOCK to do an atomic load and skip the CAS in the
spinlock loop if the compare is likely to fail, reducing cache-line
bouncing on contended locks.

Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
---
nptl/pthread_mutex_lock.c | 7 +++++++
1 file changed, 7 insertions(+)

diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
index 60ada70d..eb4d8baa 100644
--- a/nptl/pthread_mutex_lock.c
+++ b/nptl/pthread_mutex_lock.c
@@ -56,6 +56,11 @@
#define FORCE_ELISION(m, s)
#endif

+#ifndef LLL_MUTEX_READ_LOCK
+# define LLL_MUTEX_READ_LOCK(mutex) \
+ atomic_load_relaxed (&(mutex)->__data.__lock)
+#endif
+
static int __pthread_mutex_lock_full (pthread_mutex_t *mutex)
__attribute_noinline__;

@@ -136,6 +141,8 @@ __pthread_mutex_lock (pthread_mutex_t *mutex)
break;
}
atomic_spin_nop ();
+ if (LLL_MUTEX_READ_LOCK (mutex) != 0)
+ continue;
}
while (LLL_MUTEX_TRYLOCK (mutex) != 0);

--
GitLab

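The read-before-CAS idea is the classic test-and-test-and-set refinement: spin on a plain atomic load, which lets contended waiters keep the cache line in the shared state, and attempt the exclusive read-modify-write only once the lock has been observed free. A minimal C11 sketch of that shape (not the glibc lowlevellock code):

#include <stdatomic.h>

typedef atomic_int spinlock_t;

static void
spin_lock (spinlock_t *lock)
{
  /* The exchange is the write-side probe; it is retried only after the
     read-only inner loop has seen the lock free.  */
  while (atomic_exchange_explicit (lock, 1, memory_order_acquire) != 0)
    while (atomic_load_explicit (lock, memory_order_relaxed) != 0)
      ;  /* load-only spin: no exclusive cache-line ownership needed */
}

static void
spin_unlock (spinlock_t *lock)
{
  atomic_store_explicit (lock, 0, memory_order_release);
}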
@ -1,71 +0,0 @@
From 120ac6d238825452e8024e2f627da33b2508dfd3 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Fri, 12 Nov 2021 11:47:42 -0800
Subject: [PATCH] Move assignment out of the CAS condition
Content-type: text/plain; charset=UTF-8

Update

commit 49302b8fdf9103b6fc0a398678668a22fa19574c
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Thu Nov 11 06:54:01 2021 -0800

    Avoid extra load with CAS in __pthread_mutex_clocklock_common [BZ #28537]

    Replace boolean CAS with value CAS to avoid the extra load.

and

commit 0b82747dc48d5bf0871bdc6da8cb6eec1256355f
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Thu Nov 11 06:31:51 2021 -0800

    Avoid extra load with CAS in __pthread_mutex_lock_full [BZ #28537]

    Replace boolean CAS with value CAS to avoid the extra load.

by moving assignment out of the CAS condition.
---
nptl/pthread_mutex_lock.c | 7 +++----
nptl/pthread_mutex_timedlock.c | 7 +++----
2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
index eb4d8baa..a633d95e 100644
--- a/nptl/pthread_mutex_lock.c
+++ b/nptl/pthread_mutex_lock.c
@@ -299,10 +299,9 @@ __pthread_mutex_lock_full (pthread_mutex_t *mutex)
meantime. */
if ((oldval & FUTEX_WAITERS) == 0)
{
- int val;
- if ((val = atomic_compare_and_exchange_val_acq
- (&mutex->__data.__lock, oldval | FUTEX_WAITERS,
- oldval)) != oldval)
+ int val = atomic_compare_and_exchange_val_acq
+ (&mutex->__data.__lock, oldval | FUTEX_WAITERS, oldval);
+ if (val != oldval)
{
oldval = val;
continue;
diff --git a/nptl/pthread_mutex_timedlock.c b/nptl/pthread_mutex_timedlock.c
index c4627ef6..a76c30b7 100644
--- a/nptl/pthread_mutex_timedlock.c
+++ b/nptl/pthread_mutex_timedlock.c
@@ -269,10 +269,9 @@ __pthread_mutex_timedlock (pthread_mutex_t *mutex,
meantime. */
if ((oldval & FUTEX_WAITERS) == 0)
{
- int val;
- if ((val = atomic_compare_and_exchange_val_acq
- (&mutex->__data.__lock, oldval | FUTEX_WAITERS,
- oldval)) != oldval)
+ int val = atomic_compare_and_exchange_val_acq
+ (&mutex->__data.__lock, oldval | FUTEX_WAITERS, oldval);
+ if (val != oldval)
{
oldval = val;
continue;
--
GitLab

@ -1,60 +0,0 @@
From 4df1fa6ddc8925a75f3da644d5da3bb16eb33f02 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri, 3 Dec 2021 15:29:25 -0800
Subject: [PATCH] x86-64: Use notl in EVEX strcmp [BZ #28646]
Content-type: text/plain; charset=UTF-8

Must use notl %edi here as lower bits are for CHAR comparisons
potentially out of range thus can be 0 without indicating mismatch.
This fixes BZ #28646.

Co-Authored-By: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/multiarch/strcmp-evex.S | 14 ++++++------
1 file changed, 8 insertions(+), 6 deletions(-)

Conflicts:
	string/test-strcmp.c
	  (new check omitted)

diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
index 82f12ac8..6f5c4bf9 100644
--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
@@ -656,12 +656,13 @@ L(loop_cross_page):
in YMM3 and 32 bytes at VEC_SIZE(%rdx, %r10). */
VPCMP $0, VEC_SIZE(%rdx, %r10), %YMM3, %k3{%k4}
kmovd %k3, %edi
+ /* Must use notl %edi here as lower bits are for CHAR
+ comparisons potentially out of range thus can be 0 without
+ indicating mismatch. */
+ notl %edi
# ifdef USE_AS_WCSCMP
/* Don't use subl since it is the upper 8 bits of EDI below. */
- notl %edi
andl $0xff, %edi
-# else
- incl %edi
# endif

# ifdef USE_AS_WCSCMP
@@ -743,12 +744,13 @@ L(loop_cross_page_2_vec):
in YMM1 and 32 bytes at (VEC_SIZE * 3)(%rdx, %r10). */
VPCMP $0, (VEC_SIZE * 3)(%rdx, %r10), %YMM1, %k3{%k4}
kmovd %k3, %edi
+ /* Must use notl %edi here as lower bits are for CHAR
+ comparisons potentially out of range thus can be 0 without
+ indicating mismatch. */
+ notl %edi
# ifdef USE_AS_WCSCMP
/* Don't use subl since it is the upper 8 bits of EDI below. */
- notl %edi
andl $0xff, %edi
-# else
- incl %edi
# endif

# ifdef USE_AS_WCSCMP
--
GitLab

@ -1,153 +0,0 @@
|
|||||||
From c7c54f65b080affb87a1513dee449c8ad6143c8b Mon Sep 17 00:00:00 2001
|
|
||||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
|
||||||
Date: Mon, 21 Jan 2019 11:35:18 -0800
|
|
||||||
Subject: [PATCH] x86-64 strncpy: Properly handle the length parameter [BZ#
|
|
||||||
24097]
|
|
||||||
Content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
On x32, the size_t parameter may be passed in the lower 32 bits of a
|
|
||||||
64-bit register with the non-zero upper 32 bits. The string/memory
|
|
||||||
functions written in assembly can only use the lower 32 bits of a
|
|
||||||
64-bit register as length or must clear the upper 32 bits before using
|
|
||||||
the full 64-bit register for length.
|
|
||||||
|
|
||||||
This pach fixes strncpy for x32. Tested on x86-64 and x32. On x86-64,
|
|
||||||
libc.so is the same with and withou the fix.
|
|
||||||
|
|
||||||
[BZ# 24097]
|
|
||||||
CVE-2019-6488
|
|
||||||
* sysdeps/x86_64/multiarch/strcpy-avx2.S: Use RDX_LP for length.
|
|
||||||
* sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S: Likewise.
|
|
||||||
* sysdeps/x86_64/multiarch/strcpy-ssse3.S: Likewise.
|
|
||||||
* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-strncpy.
* sysdeps/x86_64/x32/tst-size_t-strncpy.c: New file.
---
.../x86_64/multiarch/strcpy-sse2-unaligned.S | 4 +-
sysdeps/x86_64/multiarch/strcpy-ssse3.S | 6 +-
sysdeps/x86_64/x32/Makefile | 2 +-
sysdeps/x86_64/x32/tst-size_t-strncpy.c | 58 +++++++++++++++++++
4 files changed, 64 insertions(+), 6 deletions(-)
create mode 100644 sysdeps/x86_64/x32/tst-size_t-strncpy.c

Conflicts:
ChangeLog
(removed)
sysdeps/x86_64/multiarch/strcpy-avx2.S
(skipped, only needed for x32 arch)

diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
index 72bf7e85..50aca22d 100644
--- a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
@@ -40,8 +40,8 @@
.text
ENTRY (STRCPY)
# ifdef USE_AS_STRNCPY
- mov %rdx, %r8
- test %r8, %r8
+ mov %RDX_LP, %R8_LP
+ test %R8_LP, %R8_LP
jz L(ExitZero)
# endif
mov %rsi, %rcx
diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
index 9858d0c4..0a62814a 100644
--- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S
+++ b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
@@ -31,13 +31,13 @@ ENTRY (STRCPY)

mov %rsi, %rcx
# ifdef USE_AS_STRNCPY
- mov %rdx, %r8
+ mov %RDX_LP, %R8_LP
# endif
mov %rdi, %rdx
# ifdef USE_AS_STRNCPY
- test %r8, %r8
+ test %R8_LP, %R8_LP
jz L(Exit0)
- cmp $8, %r8
+ cmp $8, %R8_LP
jbe L(StrncpyExit8Bytes)
# endif
cmpb $0, (%rcx)
diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
index db302839..2a9e20a9 100644
--- a/sysdeps/x86_64/x32/Makefile
+++ b/sysdeps/x86_64/x32/Makefile
@@ -8,7 +8,7 @@ endif
ifeq ($(subdir),string)
tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \
tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \
- tst-size_t-strncmp
+ tst-size_t-strncmp tst-size_t-strncpy
endif

ifeq ($(subdir),wcsmbs)
diff --git a/sysdeps/x86_64/x32/tst-size_t-strncpy.c b/sysdeps/x86_64/x32/tst-size_t-strncpy.c
new file mode 100644
index 00000000..4dec71e6
--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-strncpy.c
@@ -0,0 +1,58 @@
+/* Test strncpy with size_t in the lower 32 bits of 64-bit register.
+ Copyright (C) 2019 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define TEST_NAME "strncpy"
+#include "test-size_t.h"
+
+IMPL (strncpy, 1)
+
+typedef char *(*proto_t) (char *, const char*, size_t);
+
+static void *
+__attribute__ ((noinline, noclone))
+do_strncpy (parameter_t a, parameter_t b)
+{
+ return CALL (&b, a.p, b.p, a.len);
+}
+
+static int
+test_main (void)
+{
+ test_init ();
+
+ parameter_t dest = { { page_size }, buf1 };
+ parameter_t src = { { 0 }, buf2 };
+
+ int ret = 0;
+ FOR_EACH_IMPL (impl, 0)
+ {
+ src.fn = impl->fn;
+ do_strncpy (dest, src);
+ int res = strncmp (dest.p, src.p, dest.len);
+ if (res)
+ {
+ error (0, 0, "Wrong result in function %s: %i != 0",
+ impl->name, res);
+ ret = 1;
+ }
+ }
+
+ return ret ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#include <support/test-driver.c>
--
GitLab

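The tst-size_t-* tests above all rely on the same trick: on x32, a 32-bit size_t travels in a 64-bit register, and the upper half of that register may hold garbage. Below is a minimal standalone C sketch of that hazard; it is hypothetical illustration code, not the glibc test harness (the real parameter_t/CALL machinery lives in test-size_t.h, which is not shown in this diff).

#include <stdint.h>
#include <stdio.h>

/* A union lets us plant garbage next to a valid 32-bit length,
   mimicking what a caller's 64-bit register may contain
   (little-endian layout assumed). */
union len_reg
{
  uint64_t raw;   /* full 64-bit register image */
  uint32_t len;   /* the 32-bit size_t an x32 callee must honor */
};

int main (void)
{
  union len_reg r;
  r.raw = 0xdeadbeef00000010ull;   /* upper 32 bits are garbage */
  /* A correct implementation uses only r.len (16 here); treating
     r.raw as the length would walk far past the buffer. */
  printf ("len = %u, raw = 0x%llx\n", r.len,
          (unsigned long long) r.raw);
  return 0;
}
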
@ -1,43 +0,0 @@
From 6b8dbbd03ac88f169b65b5c7d7278576a11d2e44 Mon Sep 17 00:00:00 2001
From: Jangwoong Kim <6812skiii@gmail.com>
Date: Tue, 14 Dec 2021 21:30:51 +0900
Subject: [PATCH] nptl: Effectively skip CAS in spinlock loop
Content-type: text/plain; charset=UTF-8

The commit:
"Add LLL_MUTEX_READ_LOCK [BZ #28537]"
SHA1: d672a98a1af106bd68deb15576710cd61363f7a6

introduced LLL_MUTEX_READ_LOCK, to skip CAS in spinlock loop
if atomic load fails. But, "continue" inside of do-while loop
does not skip the evaluation of escape expression, thus CAS
is not skipped.

Replace do-while with while and skip LLL_MUTEX_TRYLOCK if
LLL_MUTEX_READ_LOCK fails.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
nptl/pthread_mutex_lock.c | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
index a633d95e..d96a9933 100644
--- a/nptl/pthread_mutex_lock.c
+++ b/nptl/pthread_mutex_lock.c
@@ -141,10 +141,9 @@ __pthread_mutex_lock (pthread_mutex_t *mutex)
break;
}
atomic_spin_nop ();
- if (LLL_MUTEX_READ_LOCK (mutex) != 0)
- continue;
}
- while (LLL_MUTEX_TRYLOCK (mutex) != 0);
+ while (LLL_MUTEX_READ_LOCK (mutex) != 0
+ || LLL_MUTEX_TRYLOCK (mutex) != 0);

mutex->__data.__spins += (cnt - mutex->__data.__spins) / 8;
}
--
GitLab

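The C-language pitfall the patch above describes is easy to reproduce: in a do-while loop, `continue` jumps to the controlling expression, so the "expensive" test in the while clause still runs. A minimal sketch (plain C, not glibc code; the two helper functions stand in for LLL_MUTEX_READ_LOCK and the CAS-based LLL_MUTEX_TRYLOCK):

#include <stdio.h>

static int read_failures = 3;

static int cheap_read (void)      /* stands in for LLL_MUTEX_READ_LOCK */
{
  return read_failures-- > 0;     /* non-zero means "lock looks held" */
}

static int expensive_cas (void)   /* stands in for LLL_MUTEX_TRYLOCK */
{
  puts ("expensive CAS executed");
  return 0;                       /* 0 means "lock acquired" */
}

int main (void)
{
  /* Buggy shape: continue does NOT skip expensive_cas (), because it
     jumps straight to the while condition. */
  do
    {
      if (cheap_read () != 0)
        continue;                 /* falls into the while test anyway */
    }
  while (expensive_cas () != 0);

  /* Fixed shape: short-circuit || keeps the CAS off the path taken
     while the cheap read still reports the lock as held. */
  read_failures = 3;
  while (cheap_read () != 0 || expensive_cas () != 0)
    ;                             /* spin */
  return 0;
}
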
@ -1,37 +0,0 @@
From b98d0bbf747f39770e0caba7e984ce9f8f900330 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri, 18 Feb 2022 17:00:25 -0600
Subject: [PATCH] x86: Fix TEST_NAME to make it a string in tst-strncmp-rtm.c
Content-type: text/plain; charset=UTF-8

Previously TEST_NAME was passing a function pointer. This didn't fail
because of the -Wno-error flag (to allow for overflow sizes passed
to strncmp/wcsncmp).

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86/tst-strncmp-rtm.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
index 4e9f094f..aef9866c 100644
--- a/sysdeps/x86/tst-strncmp-rtm.c
+++ b/sysdeps/x86/tst-strncmp-rtm.c
@@ -23,12 +23,12 @@
# define CHAR wchar_t
# define MEMSET wmemset
# define STRNCMP wcsncmp
-# define TEST_NAME wcsncmp
+# define TEST_NAME "wcsncmp"
#else /* !WIDE */
# define CHAR char
# define MEMSET memset
# define STRNCMP strncmp
-# define TEST_NAME strncmp
+# define TEST_NAME "strncmp"
#endif /* !WIDE */


--
GitLab

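The bug class fixed above is worth seeing in isolation: without quotes, a macro meant to hold a test name expands to an identifier (here, the comparison function itself), not a string. A minimal hypothetical sketch, not the glibc test:

#include <string.h>
#include <stdio.h>

#define TEST_NAME_BAD  strncmp      /* expands to the function itself */
#define TEST_NAME_GOOD "strncmp"    /* a proper string literal */

int main (void)
{
  /* const char *name = TEST_NAME_BAD;
     would initialize a char pointer from a function pointer -- a
     constraint violation that only slipped through the original build
     because warnings were not errors there (-Wno-error). */
  const char *name = TEST_NAME_GOOD;
  printf ("test: %s\n", name);
  return 0;
}
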
@ -1,33 +0,0 @@
From c15efd011cea3d8f0494269eb539583215a1feed Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Fri, 4 Feb 2022 11:09:10 -0800
Subject: [PATCH] x86-64: Fix strcmp-avx2.S
Content-type: text/plain; charset=UTF-8

Change "movl %edx, %rdx" to "movl %edx, %edx" in:

commit b77b06e0e296f1a2276c27a67e1d44f2cfa38d45
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon Jan 10 15:35:38 2022 -0600

x86: Optimize strcmp-avx2.S
---
sysdeps/x86_64/multiarch/strcmp-avx2.S | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
index 554ffe4c..04675aa4 100644
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
@@ -106,7 +106,7 @@ ENTRY(STRCMP)
# ifdef USE_AS_STRNCMP
# ifdef __ILP32__
/* Clear the upper 32 bits. */
- movl %edx, %rdx
+ movl %edx, %edx
# endif
cmp $1, %RDX_LP
/* Signed comparison intentional. We use this branch to also
--
GitLab

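The one-character fix above works because writing a 32-bit register on x86-64 zero-extends into the full 64-bit register, so "movl %edx, %edx" is the idiomatic way to clear the upper half. The C analogue is a truncating cast, sketched here as a standalone illustration:

#include <stdint.h>
#include <stdio.h>

int main (void)
{
  uint64_t rdx = 0xdeadbeef00000020ull;  /* upper 32 bits are garbage */
  uint64_t len = (uint32_t) rdx;         /* like movl %edx, %edx */
  printf ("before: %#llx  after: %#llx\n",
          (unsigned long long) rdx, (unsigned long long) len);
  return 0;
}
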
@ -1,33 +0,0 @@
From 0e0199a9e02ebe42e2b36958964d63f03573c382 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Fri, 4 Feb 2022 11:11:08 -0800
Subject: [PATCH] x86-64: Fix strcmp-evex.S
Content-type: text/plain; charset=UTF-8

Change "movl %edx, %rdx" to "movl %edx, %edx" in:

commit 8418eb3ff4b781d31c4ed5dc6c0bd7356bc45db9
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon Jan 10 15:35:39 2022 -0600

x86: Optimize strcmp-evex.S
---
sysdeps/x86_64/multiarch/strcmp-evex.S | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
index 99d8409a..ed56af8e 100644
--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
@@ -116,7 +116,7 @@ ENTRY(STRCMP)
# ifdef USE_AS_STRNCMP
# ifdef __ILP32__
/* Clear the upper 32 bits. */
- movl %edx, %rdx
+ movl %edx, %edx
# endif
cmp $1, %RDX_LP
/* Signed comparison intentional. We use this branch to also
--
GitLab

@ -1,40 +0,0 @@
From 1b0c60f95bbe2eded80b2bb5be75c0e45b11cde1 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon, 7 Feb 2022 00:32:23 -0600
Subject: [PATCH] x86: Remove SSSE3 instruction for broadcast in memset.S (SSE2
Only)
Content-type: text/plain; charset=UTF-8

commit b62ace2740a106222e124cc86956448fa07abf4d
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Sun Feb 6 00:54:18 2022 -0600

x86: Improve vec generation in memset-vec-unaligned-erms.S

Revert usage of 'pshufb' in broadcast logic as it is an SSSE3
instruction and memset.S is restricted to only SSE2 instructions.
---
sysdeps/x86_64/memset.S | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index 27debd2b..4cb4aa71 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -30,9 +30,10 @@

# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
movd d, %xmm0; \
- pxor %xmm1, %xmm1; \
- pshufb %xmm1, %xmm0; \
- movq r, %rax
+ movq r, %rax; \
+ punpcklbw %xmm0, %xmm0; \
+ punpcklwd %xmm0, %xmm0; \
+ pshufd $0, %xmm0, %xmm0

# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
movd d, %xmm0; \
--
GitLab

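The replacement sequence above is the classic SSE2-only byte broadcast: unpack the byte against itself to 16 bits, then to 32 bits, then splat the low dword; pshufb would do it in one instruction but is SSSE3. A standalone C sketch using the corresponding intrinsics (illustration only, not glibc's memset):

#include <emmintrin.h>   /* SSE2 intrinsics only */
#include <stdio.h>

static __m128i broadcast_byte_sse2 (unsigned char c)
{
  __m128i v = _mm_cvtsi32_si128 (c);   /* movd */
  v = _mm_unpacklo_epi8 (v, v);        /* punpcklbw: c -> cc */
  v = _mm_unpacklo_epi16 (v, v);       /* punpcklwd: cc -> cccc */
  return _mm_shuffle_epi32 (v, 0);     /* pshufd $0: splat low dword */
}

int main (void)
{
  unsigned char out[16];
  _mm_storeu_si128 ((__m128i *) out, broadcast_byte_sse2 (0xab));
  for (int i = 0; i < 16; i++)
    printf ("%02x", out[i]);           /* prints "ab" sixteen times */
  putchar ('\n');
  return 0;
}
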
@ -1,218 +0,0 @@
From 5165de69c0908e28a380cbd4bb054e55ea4abc95 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Mon, 21 Jan 2019 11:36:36 -0800
Subject: [PATCH] x86-64 strnlen/wcsnlen: Properly handle the length parameter
[BZ# 24097]
Content-type: text/plain; charset=UTF-8

On x32, the size_t parameter may be passed in the lower 32 bits of a
64-bit register with the non-zero upper 32 bits. The string/memory
functions written in assembly can only use the lower 32 bits of a
64-bit register as length or must clear the upper 32 bits before using
the full 64-bit register for length.

This patch fixes strnlen/wcsnlen for x32. Tested on x86-64 and x32. On
x86-64, libc.so is the same with and without the fix.

[BZ# 24097]
CVE-2019-6488
* sysdeps/x86_64/multiarch/strlen-avx2.S: Use RSI_LP for length.
Clear the upper 32 bits of RSI register.
* sysdeps/x86_64/strlen.S: Use RSI_LP for length.
* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-strnlen
and tst-size_t-wcsnlen.
* sysdeps/x86_64/x32/tst-size_t-strnlen.c: New file.
* sysdeps/x86_64/x32/tst-size_t-wcsnlen.c: Likewise.
---
sysdeps/x86_64/multiarch/strlen-avx2.S | 9 ++--
sysdeps/x86_64/strlen.S | 12 ++---
sysdeps/x86_64/x32/Makefile | 4 +-
sysdeps/x86_64/x32/tst-size_t-strnlen.c | 72 +++++++++++++++++++++++++
sysdeps/x86_64/x32/tst-size_t-wcsnlen.c | 20 +++++++
5 files changed, 106 insertions(+), 11 deletions(-)
create mode 100644 sysdeps/x86_64/x32/tst-size_t-strnlen.c
create mode 100644 sysdeps/x86_64/x32/tst-size_t-wcsnlen.c

Conflicts:
ChangeLog
(removed)

diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
index fb2418cd..645e0446 100644
--- a/sysdeps/x86_64/multiarch/strlen-avx2.S
+++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
@@ -42,12 +42,15 @@
ENTRY (STRLEN)
# ifdef USE_AS_STRNLEN
/* Check for zero length. */
- testq %rsi, %rsi
+ test %RSI_LP, %RSI_LP
jz L(zero)
# ifdef USE_AS_WCSLEN
- shl $2, %rsi
+ shl $2, %RSI_LP
+# elif defined __ILP32__
+ /* Clear the upper 32 bits. */
+ movl %esi, %esi
# endif
- movq %rsi, %r8
+ mov %RSI_LP, %R8_LP
# endif
movl %edi, %ecx
movq %rdi, %rdx
diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
index 01cb5fa8..f845f3d4 100644
--- a/sysdeps/x86_64/strlen.S
+++ b/sysdeps/x86_64/strlen.S
@@ -59,21 +59,21 @@ ENTRY(strlen)

#ifdef AS_STRNLEN
/* Do not read anything when n==0. */
- test %rsi, %rsi
+ test %RSI_LP, %RSI_LP
jne L(n_nonzero)
xor %rax, %rax
ret
L(n_nonzero):
# ifdef AS_WCSLEN
- shlq $2, %rsi
+ shl $2, %RSI_LP
# endif

/* Initialize long lived registers. */

- add %rdi, %rsi
- mov %rsi, %r10
- and $-64, %r10
- mov %rsi, %r11
+ add %RDI_LP, %RSI_LP
+ mov %RSI_LP, %R10_LP
+ and $-64, %R10_LP
+ mov %RSI_LP, %R11_LP
#endif

pxor %xmm0, %xmm0
diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
index 2a9e20a9..1557724b 100644
--- a/sysdeps/x86_64/x32/Makefile
+++ b/sysdeps/x86_64/x32/Makefile
@@ -8,10 +8,10 @@ endif
ifeq ($(subdir),string)
tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \
tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \
- tst-size_t-strncmp tst-size_t-strncpy
+ tst-size_t-strncmp tst-size_t-strncpy tst-size_t-strnlen
endif

ifeq ($(subdir),wcsmbs)
tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset \
- tst-size_t-wcsncmp
+ tst-size_t-wcsncmp tst-size_t-wcsnlen
endif
diff --git a/sysdeps/x86_64/x32/tst-size_t-strnlen.c b/sysdeps/x86_64/x32/tst-size_t-strnlen.c
new file mode 100644
index 00000000..690a4a8a
--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-strnlen.c
@@ -0,0 +1,72 @@
+/* Test strnlen with size_t in the lower 32 bits of 64-bit register.
+ Copyright (C) 2019 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifdef WIDE
+# define TEST_NAME "wcsnlen"
+#else
+# define TEST_NAME "strnlen"
+#endif /* WIDE */
+
+#include "test-size_t.h"
+
+#ifdef WIDE
+# include <wchar.h>
+# define STRNLEN wcsnlen
+# define CHAR wchar_t
+#else
+# define STRNLEN strnlen
+# define CHAR char
+#endif /* WIDE */
+
+IMPL (STRNLEN, 1)
+
+typedef size_t (*proto_t) (const CHAR *, size_t);
+
+static size_t
+__attribute__ ((noinline, noclone))
+do_strnlen (parameter_t a, parameter_t b)
+{
+ return CALL (&a, a.p, b.len);
+}
+
+static int
+test_main (void)
+{
+ test_init ();
+
+ size_t size = page_size / sizeof (CHAR);
+ parameter_t src = { { 0 }, buf2 };
+ parameter_t c = { { size }, (void *) (uintptr_t) 'a' };
+
+ int ret = 0;
+ FOR_EACH_IMPL (impl, 0)
+ {
+ src.fn = impl->fn;
+ size_t res = do_strnlen (src, c);
+ if (res != size)
+ {
+ error (0, 0, "Wrong result in function %s: 0x%x != 0x%x",
+ impl->name, res, size);
+ ret = 1;
+ }
+ }
+
+ return ret ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#include <support/test-driver.c>
diff --git a/sysdeps/x86_64/x32/tst-size_t-wcsnlen.c b/sysdeps/x86_64/x32/tst-size_t-wcsnlen.c
new file mode 100644
index 00000000..093b4bbe
--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-wcsnlen.c
@@ -0,0 +1,20 @@
+/* Test wcsnlen with size_t in the lower 32 bits of 64-bit register.
+ Copyright (C) 2019 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define WIDE 1
+#include "tst-size_t-strnlen.c"
--
GitLab

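For reference, this is the contract the patch above protects, written out as a minimal portable C sketch: strnlen must never examine more than maxlen bytes and must stop at the first NUL, and the assembly versions must honor the same bound even when the register holding maxlen carries garbage in its upper half on x32.

#include <stddef.h>

static size_t my_strnlen (const char *s, size_t maxlen)
{
  size_t i;
  for (i = 0; i < maxlen && s[i] != '\0'; i++)
    ;
  return i;   /* == maxlen when no NUL was found in range */
}
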
@ -1,33 +0,0 @@
From 7912236f4a597deb092650ca79f33504ddb4af28 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Sat, 12 Feb 2022 00:45:00 -0600
Subject: [PATCH] x86: Set .text section in memset-vec-unaligned-erms
Content-type: text/plain; charset=UTF-8

commit 3d9f171bfb5325bd5f427e9fc386453358c6e840
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Mon Feb 7 05:55:15 2022 -0800

x86-64: Optimize bzero

Remove setting the .text section for the code. This commit
adds that back.
---
sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S | 1 +
1 file changed, 1 insertion(+)

diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 06f5f5d7..4fb475c0 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -114,6 +114,7 @@
# error SECTION is not defined!
#endif

+ .section SECTION(.text), "ax", @progbits
#if IS_IN (libc)
# if defined SHARED
ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
--
GitLab

@ -1,36 +0,0 @@
From 0fb8800029d230b3711bf722b2a47db92d0e273f Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Thu, 10 Feb 2022 11:52:50 -0800
Subject: [PATCH] x86-64: Remove bzero weak alias in SSE2 memset
Content-type: text/plain; charset=UTF-8

commit 3d9f171bfb5325bd5f427e9fc386453358c6e840
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Mon Feb 7 05:55:15 2022 -0800

x86-64: Optimize bzero

added the optimized bzero. Remove bzero weak alias in SSE2 memset to
avoid undefined __bzero in memset-sse2-unaligned-erms.
---
sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
index 8f579ad6..af51362b 100644
--- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
@@ -31,9 +31,7 @@
# endif

# undef weak_alias
-# define weak_alias(original, alias) \
- .weak bzero; bzero = __bzero
-
+# define weak_alias(original, alias)
# undef strong_alias
# define strong_alias(ignored1, ignored2)
#endif
--
GitLab

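For context on the weak_alias macro being neutralized above: on ELF targets, a weak alias is a second symbol name that resolves to an existing definition unless a strong definition takes precedence. A minimal standalone sketch using the GCC attribute form (hypothetical names, GCC/ELF assumed; not glibc's actual macro expansion):

#include <stdio.h>

void real_impl (void)
{
  puts ("real_impl called");
}

/* alias_name resolves to real_impl unless something strong overrides it. */
void alias_name (void) __attribute__ ((weak, alias ("real_impl")));

int main (void)
{
  alias_name ();
  return 0;
}
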
@ -1,29 +0,0 @@
From bf92893a14ebc161b08b28acc24fa06ae6be19cb Mon Sep 17 00:00:00 2001
From: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Date: Thu, 10 Feb 2022 11:23:24 -0300
Subject: [PATCH] x86_64: Remove bcopy optimizations
Content-type: text/plain; charset=UTF-8

The symbol is not present in the current POSIX specification and the
compiler already generates a memmove call.
---
sysdeps/x86_64/multiarch/bcopy.S | 7 -------
1 file changed, 7 deletions(-)
delete mode 100644 sysdeps/x86_64/multiarch/bcopy.S

diff --git a/sysdeps/x86_64/multiarch/bcopy.S b/sysdeps/x86_64/multiarch/bcopy.S
deleted file mode 100644
index 639f02bd..00000000
--- a/sysdeps/x86_64/multiarch/bcopy.S
+++ /dev/null
@@ -1,7 +0,0 @@
-#include <sysdep.h>
-
- .text
-ENTRY(bcopy)
- xchg %rdi, %rsi
- jmp __libc_memmove /* Branch to IFUNC memmove. */
-END(bcopy)
--
GitLab

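The deleted stub swapped %rdi and %rsi and jumped to memmove; in C, the equivalence is just an argument-order swap, as in this illustrative sketch:

#include <string.h>
#include <stdio.h>

static void my_bcopy (const void *src, void *dst, size_t n)
{
  memmove (dst, src, n);   /* bcopy(src, dst, n) == memmove(dst, src, n) */
}

int main (void)
{
  char buf[16] = "hello";
  my_bcopy (buf, buf + 2, 6);   /* overlapping copy is allowed */
  puts (buf);                   /* prints "hehello" */
  return 0;
}
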
@ -1,206 +0,0 @@
From 3f635fb43389b54f682fc9ed2acc0b2aaf4a923d Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Mon, 4 Feb 2019 06:31:01 -0800
Subject: [PATCH] x86-64 memcmp: Use unsigned Jcc instructions on size [BZ
#24155]
Content-type: text/plain; charset=UTF-8

Since the size argument is unsigned, we should use unsigned Jcc
instructions, instead of signed, to check size.

Tested on x86-64 and x32, with and without --disable-multi-arch.

[BZ #24155]
CVE-2019-7309
* NEWS: Updated for CVE-2019-7309.
* sysdeps/x86_64/memcmp.S: Use RDX_LP for size. Clear the
upper 32 bits of RDX register for x32. Use unsigned Jcc
instructions, instead of signed.
* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcmp-2.
* sysdeps/x86_64/x32/tst-size_t-memcmp-2.c: New test.
---
sysdeps/x86_64/memcmp.S | 20 +++---
sysdeps/x86_64/x32/Makefile | 3 +-
sysdeps/x86_64/x32/tst-size_t-memcmp-2.c | 79 ++++++++++++++++++++++++
3 files changed, 93 insertions(+), 9 deletions(-)
create mode 100644 sysdeps/x86_64/x32/tst-size_t-memcmp-2.c

Conflicts:
ChangeLog
(removed)
NEWS
(removed)

diff --git a/sysdeps/x86_64/memcmp.S b/sysdeps/x86_64/memcmp.S
index bcb4a2e8..45918d37 100644
--- a/sysdeps/x86_64/memcmp.S
+++ b/sysdeps/x86_64/memcmp.S
@@ -21,14 +21,18 @@

.text
ENTRY (memcmp)
- test %rdx, %rdx
+#ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+ movl %edx, %edx
+#endif
+ test %RDX_LP, %RDX_LP
jz L(finz)
cmpq $1, %rdx
- jle L(finr1b)
+ jbe L(finr1b)
subq %rdi, %rsi
movq %rdx, %r10
cmpq $32, %r10
- jge L(gt32)
+ jae L(gt32)
/* Handle small chunks and last block of less than 32 bytes. */
L(small):
testq $1, %r10
@@ -156,7 +160,7 @@ L(A32):
movq %r11, %r10
andq $-32, %r10
cmpq %r10, %rdi
- jge L(mt16)
+ jae L(mt16)
/* Pre-unroll to be ready for unrolled 64B loop. */
testq $32, %rdi
jz L(A64)
@@ -178,7 +182,7 @@ L(A64):
movq %r11, %r10
andq $-64, %r10
cmpq %r10, %rdi
- jge L(mt32)
+ jae L(mt32)

L(A64main):
movdqu (%rdi,%rsi), %xmm0
@@ -216,7 +220,7 @@ L(mt32):
movq %r11, %r10
andq $-32, %r10
cmpq %r10, %rdi
- jge L(mt16)
+ jae L(mt16)

L(A32main):
movdqu (%rdi,%rsi), %xmm0
@@ -254,7 +258,7 @@ L(ATR):
movq %r11, %r10
andq $-32, %r10
cmpq %r10, %rdi
- jge L(mt16)
+ jae L(mt16)
testq $16, %rdi
jz L(ATR32)

@@ -325,7 +329,7 @@ L(ATR64main):
movq %r11, %r10
andq $-32, %r10
cmpq %r10, %rdi
- jge L(mt16)
+ jae L(mt16)

L(ATR32res):
movdqa (%rdi,%rsi), %xmm0
diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
index 1557724b..87489565 100644
--- a/sysdeps/x86_64/x32/Makefile
+++ b/sysdeps/x86_64/x32/Makefile
@@ -8,7 +8,8 @@ endif
ifeq ($(subdir),string)
tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \
tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \
- tst-size_t-strncmp tst-size_t-strncpy tst-size_t-strnlen
+ tst-size_t-strncmp tst-size_t-strncpy tst-size_t-strnlen \
+ tst-size_t-memcmp-2
endif

ifeq ($(subdir),wcsmbs)
diff --git a/sysdeps/x86_64/x32/tst-size_t-memcmp-2.c b/sysdeps/x86_64/x32/tst-size_t-memcmp-2.c
new file mode 100644
index 00000000..d8ae1a08
--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-memcmp-2.c
@@ -0,0 +1,79 @@
+/* Test memcmp with size_t in the lower 32 bits of 64-bit register.
+ Copyright (C) 2019 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define TEST_MAIN
+#ifdef WIDE
+# define TEST_NAME "wmemcmp"
+#else
+# define TEST_NAME "memcmp"
+#endif
+
+#include "test-size_t.h"
+
+#ifdef WIDE
+# include <inttypes.h>
+# include <wchar.h>
+
+# define MEMCMP wmemcmp
+# define CHAR wchar_t
+#else
+# define MEMCMP memcmp
+# define CHAR char
+#endif
+
+IMPL (MEMCMP, 1)
+
+typedef int (*proto_t) (const CHAR *, const CHAR *, size_t);
+
+static int
+__attribute__ ((noinline, noclone))
+do_memcmp (parameter_t a, parameter_t b)
+{
+ return CALL (&b, a.p, b.p, a.len);
+}
+
+static int
+test_main (void)
+{
+ test_init ();
+
+ parameter_t dest = { { page_size / sizeof (CHAR) }, buf1 };
+ parameter_t src = { { 0 }, buf2 };
+
+ memcpy (buf1, buf2, page_size);
+
+ CHAR *p = (CHAR *) buf1;
+ p[page_size / sizeof (CHAR) - 1] = (CHAR) 1;
+
+ int ret = 0;
+ FOR_EACH_IMPL (impl, 0)
+ {
+ src.fn = impl->fn;
+ int res = do_memcmp (dest, src);
+ if (res >= 0)
+ {
+ error (0, 0, "Wrong result in function %s: %i >= 0",
+ impl->name, res);
+ ret = 1;
+ }
+ }
+
+ return ret ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#include <support/test-driver.c>
--
GitLab

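The jle/jge to jbe/jae changes above all guard against the same hazard: after a cmp, the signed condition codes misclassify a size whose top bit is set, because such a value is a huge unsigned number but compares as negative when treated as signed. A small standalone C sketch of the same distinction:

#include <stdint.h>
#include <stdio.h>

int main (void)
{
  uint64_t size = 0x8000000000000001ull;   /* a valid, huge size_t */
  /* Signed comparison -- what jle/jge implement after cmp: */
  if ((int64_t) size <= 1)
    puts ("signed: size looks <= 1 -- wrong branch taken");
  /* Unsigned comparison -- what jbe/jae implement: */
  if (size <= 1)
    puts ("unsigned: size <= 1");
  else
    puts ("unsigned: size > 1 -- correct");
  return 0;
}
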
@ -1,31 +0,0 @@
Based on the following commit, adjusted for glibc-2.28 in RHEL-8:

commit 5eabdb6a6ac1599d23dd5966a37417215950245f
Author: Andreas Schwab <schwab@suse.de>
Date: Wed Dec 6 14:48:22 2023 +0100

getaddrinfo: translate ENOMEM to EAI_MEMORY (bug 31163)

When __resolv_context_get returns NULL due to out of memory, translate it
to a return value of EAI_MEMORY.

diff --git a/sysdeps/posix/getaddrinfo.c b/sysdeps/posix/getaddrinfo.c
index 46046504a6858f2e..d0708f3e84e20025 100644
--- a/sysdeps/posix/getaddrinfo.c
+++ b/sysdeps/posix/getaddrinfo.c
@@ -777,7 +777,14 @@ gaih_inet (const char *name, const struct gaih_service *service,
res_ctx = __resolv_context_get ();
res_enable_inet6 = __resolv_context_disable_inet6 (res_ctx);
if (res_ctx == NULL)
- no_more = 1;
+ {
+ if (errno == ENOMEM)
+ {
+ result = -EAI_MEMORY;
+ goto free_and_return;
+ }
+ no_more = 1;
+ }

while (!no_more)
{
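From the caller's side, the effect of the fix above is that an out-of-memory failure inside the resolver surfaces as EAI_MEMORY rather than a misleading lookup failure. A minimal usage sketch (hostname and port are arbitrary placeholders):

#include <sys/types.h>
#include <sys/socket.h>
#include <netdb.h>
#include <stdio.h>

int main (void)
{
  struct addrinfo hints = { .ai_family = AF_UNSPEC,
                            .ai_socktype = SOCK_STREAM };
  struct addrinfo *res;
  int err = getaddrinfo ("localhost", "80", &hints, &res);
  if (err == EAI_MEMORY)
    fprintf (stderr, "out of memory: %s\n", gai_strerror (err));
  else if (err != 0)
    fprintf (stderr, "lookup failed: %s\n", gai_strerror (err));
  else
    freeaddrinfo (res);
  return err != 0;
}
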
@ -1,112 +0,0 @@
This downstream-only patch compensates for the missing backport of
commit 2d651eb9265d1366d7b9e881bfddd46db9c1ecc4 ("x86: Move
x86 processor cache info to cpu_features"). Without it,
ld.so --list-diagnostics prints values that have not been properly
initialized from CPUID data.

diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
index 10ebadd819d9efff..d8421fab83ab08ac 100644
--- a/sysdeps/x86/cacheinfo.h
+++ b/sysdeps/x86/cacheinfo.h
@@ -19,31 +19,42 @@
#include <assert.h>
#include <unistd.h>

+/* When building ld.so, do not export any of the variables. They are
+ only used for diagnostics and are not initialized during regular
+ operation. */
+#if IS_IN (rtld)
+# define CACHEINFO_VARIABLE(name, initializer) \
+ static long int name = initializer
+#else
+# define CACHEINFO_VARIABLE(name, initializer) \
+ long int name attribute_hidden = initializer
+#endif
+
/* Data cache size for use in memory and string routines, typically
L1 size, rounded to multiple of 256 bytes. */
-long int __x86_data_cache_size_half attribute_hidden = 32 * 1024 / 2;
-long int __x86_data_cache_size attribute_hidden = 32 * 1024;
+CACHEINFO_VARIABLE (__x86_data_cache_size_half, 32 * 1024 / 2);
+CACHEINFO_VARIABLE (__x86_data_cache_size, 32 * 1024);
/* Similar to __x86_data_cache_size_half, but not rounded. */
-long int __x86_raw_data_cache_size_half attribute_hidden = 32 * 1024 / 2;
+CACHEINFO_VARIABLE (__x86_raw_data_cache_size_half, 32 * 1024 / 2);
/* Similar to __x86_data_cache_size, but not rounded. */
-long int __x86_raw_data_cache_size attribute_hidden = 32 * 1024;
+CACHEINFO_VARIABLE (__x86_raw_data_cache_size, 32 * 1024);
/* Shared cache size for use in memory and string routines, typically
L2 or L3 size, rounded to multiple of 256 bytes. */
-long int __x86_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2;
-long int __x86_shared_cache_size attribute_hidden = 1024 * 1024;
+CACHEINFO_VARIABLE (__x86_shared_cache_size_half, 1024 * 1024 / 2);
+CACHEINFO_VARIABLE (__x86_shared_cache_size, 1024 * 1024);
/* Similar to __x86_shared_cache_size_half, but not rounded. */
-long int __x86_raw_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2;
+CACHEINFO_VARIABLE (__x86_raw_shared_cache_size_half, 1024 * 1024 / 2);
/* Similar to __x86_shared_cache_size, but not rounded. */
-long int __x86_raw_shared_cache_size attribute_hidden = 1024 * 1024;
+CACHEINFO_VARIABLE (__x86_raw_shared_cache_size, 1024 * 1024);

/* Threshold to use non temporal store. */
-long int __x86_shared_non_temporal_threshold attribute_hidden;
+CACHEINFO_VARIABLE (__x86_shared_non_temporal_threshold, 0);

/* Threshold to use Enhanced REP MOVSB. */
-long int __x86_rep_movsb_threshold attribute_hidden = 2048;
+CACHEINFO_VARIABLE (__x86_rep_movsb_threshold, 2048);

/* Threshold to use Enhanced REP STOSB. */
-long int __x86_rep_stosb_threshold attribute_hidden = 2048;
+CACHEINFO_VARIABLE (__x86_rep_stosb_threshold, 2048);

static void
get_common_cache_info (long int *shared_ptr, long int * shared_per_thread_ptr, unsigned int *threads_ptr,
diff --git a/sysdeps/x86/dl-diagnostics-cpu.c b/sysdeps/x86/dl-diagnostics-cpu.c
index 0ba286a828b69937..9215604ecf22344c 100644
--- a/sysdeps/x86/dl-diagnostics-cpu.c
+++ b/sysdeps/x86/dl-diagnostics-cpu.c
@@ -19,6 +19,13 @@
#include <dl-diagnostics.h>
#include <ldsodefs.h>

+#include <assert.h>
+#include <unistd.h>
+#include <cpu-features.h>
+#include <cpuid.h>
+#include <dl-cacheinfo.h>
+#include <cacheinfo.h>
+
static void
print_cpu_features_value (const char *label, uint64_t value)
{
@@ -81,19 +88,21 @@ _dl_diagnostics_cpu (void)
#include "cpu-features-preferred_feature_index_1.def"
#undef BIT

+ /* The cache information variables are only used for diagnostics and
+ are not initialized during startup. The values used at run time
+ are only in libc.so.6. */
+ init_cacheinfo ();
+
print_cpu_features_value ("xsave_state_size",
cpu_features->xsave_state_size);
print_cpu_features_value ("xsave_state_full_size",
cpu_features->xsave_state_full_size);
- print_cpu_features_value ("data_cache_size", cpu_features->data_cache_size);
- print_cpu_features_value ("shared_cache_size",
- cpu_features->shared_cache_size);
+ print_cpu_features_value ("data_cache_size", __x86_data_cache_size);
+ print_cpu_features_value ("shared_cache_size", __x86_shared_cache_size);
print_cpu_features_value ("non_temporal_threshold",
- cpu_features->non_temporal_threshold);
- print_cpu_features_value ("rep_movsb_threshold",
- cpu_features->rep_movsb_threshold);
- print_cpu_features_value ("rep_stosb_threshold",
- cpu_features->rep_stosb_threshold);
+ __x86_shared_non_temporal_threshold);
+ print_cpu_features_value ("rep_movsb_threshold", __x86_rep_movsb_threshold);
+ print_cpu_features_value ("rep_stosb_threshold", __x86_rep_stosb_threshold);
_Static_assert (offsetof (struct cpu_features, rep_stosb_threshold)
+ sizeof (cpu_features->rep_stosb_threshold)
== sizeof (*cpu_features),
@ -1,22 +0,0 @@
Work around, in the test case, the fact that the RHEL-8 NSS modules
infrastructure incorrectly allows merging in the hosts database. This
is a RHEL-8-only fix.

diff --git a/nss/tst-nss-gai-actions.c b/nss/tst-nss-gai-actions.c
index efca6cd1837a172a..c35e752896eceb2a 100644
--- a/nss/tst-nss-gai-actions.c
+++ b/nss/tst-nss-gai-actions.c
@@ -87,6 +87,13 @@ do_one_test (int action, int family, bool canon)
case ACTION_MERGE:
if (ret == 0)
{
+ if (hints.ai_flags == 0 && hints.ai_family == AF_INET)
+ {
+ printf ("***** RHEL-8 limitation: "
+ "NSS modules infrastructure incorrectly allows MERGE\n");
+ return;
+ }
+
char *formatted = support_format_addrinfo (ai, ret);

printf ("merge unexpectedly succeeded:\n %s\n", formatted);
@ -1,97 +0,0 @@
commit 2ac579f9c25388a7734948d77b03e4dd10f35334
Author: DJ Delorie <dj@redhat.com>
Date: Mon Sep 30 16:04:52 2019 -0400

Add run-one-test convenience target and makefile help text

Adds "make test" for re-running just one test. Also adds
"make help" for help with our Makefile targets, and adds a
mini-help when you just run "make".

Reviewed-by: Carlos O'Donell <carlos@redhat.com>

diff --git a/Makefile b/Makefile
index 6d73241bbc811c13..6518f62ee0676b0d 100644
--- a/Makefile
+++ b/Makefile
@@ -26,8 +26,17 @@ include Makeconfig


# This is the default target; it makes everything except the tests.
-.PHONY: all
-all: lib others
+.PHONY: all help minihelp
+all: minihelp lib others
+
+help:
+ @sed '0,/^help-starts-here$$/d' Makefile.help
+
+minihelp:
+ @echo
+ @echo type \"make help\" for help with common glibc makefile targets
+ @echo
+

ifneq ($(AUTOCONF),no)

@@ -503,3 +512,12 @@ FORCE:

iconvdata/% localedata/% po/%: FORCE
$(MAKE) $(PARALLELMFLAGS) -C $(@D) $(@F)
+
+# Convenience target to rerun one test, from the top of the build tree
+# Example: make test t=wcsmbs/test-wcsnlen
+.PHONY: test
+test :
+ @-rm -f $(objpfx)$t.out
+ $(MAKE) subdir=$(dir $t) -C $(dir $t) ..=../ $(objpfx)$t.out
+ @cat $(objpfx)$t.test-result
+ @cat $(objpfx)$t.out
diff --git a/Makefile.help b/Makefile.help
new file mode 100644
index 0000000000000000..3b043bce013cc2b4
--- /dev/null
+++ b/Makefile.help
@@ -0,0 +1,42 @@
+# Copyright (C) 2019 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <https://www.gnu.org/licenses/>.
+
+This is the file that gets printed when the user runs "make help",
+starting just after the "help-starts-here" line.
+
+help-starts-here
+
+all
+ The usual default; builds everything but doesn't run the
+ tests.
+
+check (or tests)
+ Runs the standard set of tests.
+
+test
+ Runs one test. Use like this:
+ make test t=wcsmbs/test-wcsnlen
+ Note that this will rebuild the test if needed, but will not
+ rebuild what "make all" would have rebuilt.
+
+--
+Other useful hints:
+
+builddir$ rm testroot.pristine/install.stamp
+ Forces the testroot to be reinstalled the next time you run
+ the testsuite (or just rm -rf testroot.pristine)
+
@ -1,34 +0,0 @@
commit 56e098118a31753a9f755948bb5a47bc7111e214
Author: Andreas Schwab <schwab@suse.de>
Date: Thu Aug 15 12:14:35 2019 +0200

Update i386 libm-test-ulps

Conflicts: ChangeLog removed

diff --git a/sysdeps/i386/fpu/libm-test-ulps b/sysdeps/i386/fpu/libm-test-ulps
index e83bae71b4..2232296fe0 100644
--- a/sysdeps/i386/fpu/libm-test-ulps
+++ b/sysdeps/i386/fpu/libm-test-ulps
@@ -1158,8 +1158,8 @@ float128: 4
idouble: 4
ifloat: 5
ifloat128: 4
-ildouble: 7
-ldouble: 7
+ildouble: 8
+ldouble: 8

Function: Imaginary part of "clog10_upward":
double: 2
@@ -2222,8 +2222,8 @@ float128: 8
idouble: 5
ifloat: 5
ifloat128: 8
-ildouble: 5
-ldouble: 5
+ildouble: 6
+ldouble: 6

Function: "log":
double: 1
@ -1,26 +0,0 @@
Author: Patsy Griffin <patsy@redhat.com>

i386: update ulps

This change fixes 3 test failures:
math/test-ildouble-lgamma
math/test-ldouble-finite-lgamma
math/test-ldouble-lgamma

This is a downstream-only patch, as upstream removed entries for
i{float,double,ldouble} by commit 1c15464ca05f36db5c582856d3770d5e8bde9d61.
The ldouble change is already upstream.

--- a/sysdeps/i386/fpu/libm-test-ulps 2024-08-06 15:51:18.182808710 -0400
+++ b/sysdeps/i386/fpu/libm-test-ulps 2024-08-06 18:01:50.579719841 -0400
@@ -2030,8 +2030,8 @@ double: 5
float: 5
idouble: 5
ifloat: 5
-ildouble: 5
-ldouble: 5
+ildouble: 6
+ldouble: 6

Function: "hypot":
double: 1
@ -1,374 +0,0 @@
|
|||||||
commit 03e1378f94173fc192a81e421457198f7b8a34a0
|
|
||||||
Author: Alex Butler <Alex.Butler@arm.com>
|
|
||||||
Date: Tue Jun 16 12:44:24 2020 +0000
|
|
||||||
|
|
||||||
aarch64: MTE compatible strncmp
|
|
||||||
|
|
||||||
Add support for MTE to strncmp. Regression tested with xcheck and benchmarked
|
|
||||||
with glibc's benchtests on the Cortex-A53, Cortex-A72, and Neoverse N1.
|
|
||||||
|
|
||||||
The existing implementation assumes that any access to the pages in which the
|
|
||||||
string resides is safe. This assumption is not true when MTE is enabled. This
|
|
||||||
patch updates the algorithm to ensure that accesses remain within the bounds
|
|
||||||
of an MTE tag (16-byte chunks) and improves overall performance.
|
|
||||||
|
|
||||||
Co-authored-by: Branislav Rankov <branislav.rankov@arm.com>
|
|
||||||
Co-authored-by: Wilco Dijkstra <wilco.dijkstra@arm.com>
|
|
||||||
|
|
||||||
diff --git a/sysdeps/aarch64/strncmp.S b/sysdeps/aarch64/strncmp.S
|
|
||||||
index c5141fab8a..ba2563490e 100644
|
|
||||||
--- a/sysdeps/aarch64/strncmp.S
|
|
||||||
+++ b/sysdeps/aarch64/strncmp.S
|
|
||||||
@@ -25,7 +25,6 @@
|
|
||||||
|
|
||||||
#define REP8_01 0x0101010101010101
|
|
||||||
#define REP8_7f 0x7f7f7f7f7f7f7f7f
|
|
||||||
-#define REP8_80 0x8080808080808080
|
|
||||||
|
|
||||||
/* Parameters and result. */
|
|
||||||
#define src1 x0
|
|
||||||
@@ -46,15 +45,31 @@
|
|
||||||
#define tmp3 x10
|
|
||||||
#define zeroones x11
|
|
||||||
#define pos x12
|
|
||||||
-#define limit_wd x13
|
|
||||||
-#define mask x14
|
|
||||||
-#define endloop x15
|
|
||||||
+#define mask x13
|
|
||||||
+#define endloop x14
|
|
||||||
#define count mask
|
|
||||||
+#define offset pos
|
|
||||||
+#define neg_offset x15
|
|
||||||
|
|
||||||
-ENTRY_ALIGN_AND_PAD (strncmp, 6, 7)
|
|
||||||
- DELOUSE (0)
|
|
||||||
- DELOUSE (1)
|
|
||||||
- DELOUSE (2)
|
|
||||||
+/* Define endian dependent shift operations.
|
|
||||||
+ On big-endian early bytes are at MSB and on little-endian LSB.
|
|
||||||
+ LS_FW means shifting towards early bytes.
|
|
||||||
+ LS_BK means shifting towards later bytes.
|
|
||||||
+ */
|
|
||||||
+#ifdef __AARCH64EB__
|
|
||||||
+#define LS_FW lsl
|
|
||||||
+#define LS_BK lsr
|
|
||||||
+#else
|
|
||||||
+#define LS_FW lsr
|
|
||||||
+#define LS_BK lsl
|
|
||||||
+#endif
|
|
||||||
+
|
|
||||||
+ .text
|
|
||||||
+ .p2align 6
|
|
||||||
+ .rep 9
|
|
||||||
+ nop /* Pad so that the loop below fits a cache line. */
|
|
||||||
+ .endr
|
|
||||||
+ENTRY_ALIGN (strncmp, 0)
|
|
||||||
cbz limit, L(ret0)
|
|
||||||
eor tmp1, src1, src2
|
|
||||||
mov zeroones, #REP8_01
|
|
||||||
@@ -62,9 +77,6 @@ ENTRY_ALIGN_AND_PAD (strncmp, 6, 7)
|
|
||||||
and count, src1, #7
|
|
||||||
b.ne L(misaligned8)
|
|
||||||
cbnz count, L(mutual_align)
|
|
||||||
- /* Calculate the number of full and partial words -1. */
|
|
||||||
- sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
|
|
||||||
- lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */
|
|
||||||
|
|
||||||
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
|
|
||||||
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
|
|
||||||
@@ -74,56 +86,52 @@ L(loop_aligned):
|
|
||||||
ldr data1, [src1], #8
|
|
||||||
ldr data2, [src2], #8
|
|
||||||
L(start_realigned):
|
|
||||||
- subs limit_wd, limit_wd, #1
|
|
||||||
+ subs limit, limit, #8
|
|
||||||
sub tmp1, data1, zeroones
|
|
||||||
orr tmp2, data1, #REP8_7f
|
|
||||||
eor diff, data1, data2 /* Non-zero if differences found. */
|
|
||||||
- csinv endloop, diff, xzr, pl /* Last Dword or differences. */
|
|
||||||
+ csinv endloop, diff, xzr, hi /* Last Dword or differences. */
|
|
||||||
bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
|
|
||||||
ccmp endloop, #0, #0, eq
|
|
||||||
b.eq L(loop_aligned)
|
|
||||||
/* End of performance-critical section -- one 64B cache line. */
|
|
||||||
|
|
||||||
- /* Not reached the limit, must have found the end or a diff. */
|
|
||||||
- tbz limit_wd, #63, L(not_limit)
|
|
||||||
-
|
|
||||||
- /* Limit % 8 == 0 => all bytes significant. */
|
|
||||||
- ands limit, limit, #7
|
|
||||||
- b.eq L(not_limit)
|
|
||||||
-
|
|
||||||
- lsl limit, limit, #3 /* Bits -> bytes. */
|
|
||||||
- mov mask, #~0
|
|
||||||
-#ifdef __AARCH64EB__
|
|
||||||
- lsr mask, mask, limit
|
|
||||||
-#else
|
|
||||||
- lsl mask, mask, limit
|
|
||||||
-#endif
|
|
||||||
- bic data1, data1, mask
|
|
||||||
- bic data2, data2, mask
|
|
||||||
-
|
|
||||||
- /* Make sure that the NUL byte is marked in the syndrome. */
|
|
||||||
- orr has_nul, has_nul, mask
|
|
||||||
-
|
|
||||||
-L(not_limit):
|
|
||||||
+L(full_check):
|
|
||||||
+#ifndef __AARCH64EB__
|
|
||||||
orr syndrome, diff, has_nul
|
|
||||||
-
|
|
||||||
-#ifndef __AARCH64EB__
|
|
||||||
+ add limit, limit, 8 /* Rewind limit to before last subs. */
|
|
||||||
+L(syndrome_check):
|
|
||||||
+ /* Limit was reached. Check if the NUL byte or the difference
|
|
||||||
+ is before the limit. */
|
|
||||||
rev syndrome, syndrome
|
|
||||||
rev data1, data1
|
|
||||||
- /* The MS-non-zero bit of the syndrome marks either the first bit
|
|
||||||
- that is different, or the top bit of the first zero byte.
|
|
||||||
- Shifting left now will bring the critical information into the
|
|
||||||
- top bits. */
|
|
||||||
clz pos, syndrome
|
|
||||||
rev data2, data2
|
|
||||||
lsl data1, data1, pos
|
|
||||||
+ cmp limit, pos, lsr #3
|
|
||||||
lsl data2, data2, pos
|
|
||||||
/* But we need to zero-extend (char is unsigned) the value and then
|
|
||||||
perform a signed 32-bit subtraction. */
|
|
||||||
lsr data1, data1, #56
|
|
||||||
sub result, data1, data2, lsr #56
|
|
||||||
- RET
|
|
||||||
+ csel result, result, xzr, hi
|
|
||||||
+ ret
|
|
||||||
#else
|
|
||||||
+ /* Not reached the limit, must have found the end or a diff. */
|
|
||||||
+ tbz limit, #63, L(not_limit)
|
|
||||||
+ add tmp1, limit, 8
|
|
||||||
+ cbz limit, L(not_limit)
|
|
||||||
+
|
|
||||||
+ lsl limit, tmp1, #3 /* Bits -> bytes. */
|
|
||||||
+ mov mask, #~0
|
|
||||||
+ lsr mask, mask, limit
|
|
||||||
+ bic data1, data1, mask
|
|
||||||
+ bic data2, data2, mask
|
|
||||||
+
|
|
||||||
+ /* Make sure that the NUL byte is marked in the syndrome. */
|
|
||||||
+ orr has_nul, has_nul, mask
|
|
||||||
+
|
|
||||||
+L(not_limit):
|
|
||||||
/* For big-endian we cannot use the trick with the syndrome value
|
|
||||||
as carry-propagation can corrupt the upper bits if the trailing
|
|
||||||
bytes in the string contain 0x01. */
|
|
||||||
@@ -134,7 +142,7 @@ L(not_limit):
|
|
||||||
cmp data1, data2
|
|
||||||
cset result, ne
|
|
||||||
cneg result, result, lo
|
|
||||||
- RET
|
|
||||||
+ ret
|
|
||||||
1:
|
|
||||||
/* Re-compute the NUL-byte detection, using a byte-reversed value. */
|
|
||||||
rev tmp3, data1
|
|
||||||
@@ -144,17 +152,18 @@ L(not_limit):
|
|
||||||
rev has_nul, has_nul
|
|
||||||
orr syndrome, diff, has_nul
|
|
||||||
clz pos, syndrome
|
|
||||||
- /* The MS-non-zero bit of the syndrome marks either the first bit
|
|
||||||
- that is different, or the top bit of the first zero byte.
|
|
||||||
+ /* The most-significant-non-zero bit of the syndrome marks either the
|
|
||||||
+ first bit that is different, or the top bit of the first zero byte.
|
|
||||||
Shifting left now will bring the critical information into the
|
|
||||||
top bits. */
|
|
||||||
+L(end_quick):
|
|
||||||
lsl data1, data1, pos
|
|
||||||
lsl data2, data2, pos
|
|
||||||
/* But we need to zero-extend (char is unsigned) the value and then
|
|
||||||
perform a signed 32-bit subtraction. */
|
|
||||||
lsr data1, data1, #56
|
|
||||||
sub result, data1, data2, lsr #56
|
|
||||||
- RET
|
|
||||||
+ ret
|
|
||||||
#endif
|
|
||||||
|
|
||||||
L(mutual_align):
|
|
||||||
@@ -169,22 +178,12 @@ L(mutual_align):
|
|
||||||
neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */
|
|
||||||
ldr data2, [src2], #8
|
|
||||||
mov tmp2, #~0
|
|
||||||
- sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
|
|
||||||
-#ifdef __AARCH64EB__
|
|
||||||
- /* Big-endian. Early bytes are at MSB. */
|
|
||||||
- lsl tmp2, tmp2, tmp3 /* Shift (count & 63). */
|
|
||||||
-#else
|
|
||||||
- /* Little-endian. Early bytes are at LSB. */
|
|
||||||
- lsr tmp2, tmp2, tmp3 /* Shift (count & 63). */
|
|
||||||
-#endif
|
|
||||||
- and tmp3, limit_wd, #7
|
|
||||||
- lsr limit_wd, limit_wd, #3
|
|
||||||
- /* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */
|
|
||||||
- add limit, limit, count
|
|
||||||
- add tmp3, tmp3, count
|
|
||||||
+ LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */
|
|
||||||
+ /* Adjust the limit and ensure it doesn't overflow. */
|
|
||||||
+ adds limit, limit, count
|
|
||||||
+ csinv limit, limit, xzr, lo
|
|
||||||
orr data1, data1, tmp2
|
|
||||||
orr data2, data2, tmp2
|
|
||||||
- add limit_wd, limit_wd, tmp3, lsr #3
|
|
||||||
b L(start_realigned)
|
|
||||||
|
|
||||||
.p2align 6
|
|
||||||
@@ -203,18 +202,15 @@ L(byte_loop):
|
|
||||||
b.eq L(byte_loop)
|
|
||||||
L(done):
|
|
||||||
sub result, data1, data2
|
|
||||||
- RET
|
|
||||||
-
|
|
||||||
+ ret
|
|
||||||
/* Align the SRC1 to a dword by doing a bytewise compare and then do
|
|
||||||
the dword loop. */
|
|
||||||
L(try_misaligned_words):
|
|
||||||
- lsr limit_wd, limit, #3
|
|
||||||
-	cbz	count, L(do_misaligned)
+	cbz	count, L(src1_aligned)

 	neg	count, count
 	and	count, count, #7
 	sub	limit, limit, count
-	lsr	limit_wd, limit, #3

 L(page_end_loop):
 	ldrb	data1w, [src1], #1
@@ -225,48 +221,98 @@ L(page_end_loop):
 	subs	count, count, #1
 	b.hi	L(page_end_loop)

-L(do_misaligned):
-	/* Prepare ourselves for the next page crossing. Unlike the aligned
-	   loop, we fetch 1 less dword because we risk crossing bounds on
-	   SRC2. */
-	mov	count, #8
-	subs	limit_wd, limit_wd, #1
-	b.lo	L(done_loop)
+	/* The following diagram explains the comparison of misaligned strings.
+	   The bytes are shown in natural order. For little-endian, it is
+	   reversed in the registers. The "x" bytes are before the string.
+	   The "|" separates data that is loaded at one time.
+	   src1 | a a a a a a a a | b b b c c c c c | . . .
+	   src2 | x x x x x a a a a a a a a b b b | c c c c c . . .
+	   After shifting in each step, the data looks like this:
+	      STEP_A           STEP_B           STEP_C
+	   data1 a a a a a a a a  b b b c c c c c  b b b c c c c c
+	   data2 a a a a a a a a  b b b 0 0 0 0 0  0 0 0 c c c c c
+	   The bytes with "0" are eliminated from the syndrome via mask.
+	   Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
+	   time from SRC2. The comparison happens in 3 steps. After each step
+	   the loop can exit, or read from SRC1 or SRC2. */
+L(src1_aligned):
+	/* Calculate offset from 8 byte alignment to string start in bits. No
+	   need to mask offset since shifts are ignoring upper bits. */
+	lsl	offset, src2, #3
+	bic	src2, src2, #0xf
+	mov	mask, -1
+	neg	neg_offset, offset
+	ldr	data1, [src1], #8
+	ldp	tmp1, tmp2, [src2], #16
+	LS_BK	mask, mask, neg_offset
+	and	neg_offset, neg_offset, #63	/* Need actual value for cmp later. */
+	/* Skip the first compare if data in tmp1 is irrelevant. */
+	tbnz	offset, 6, L(misaligned_mid_loop)
+
 L(loop_misaligned):
-	and	tmp2, src2, #0xff8
-	eor	tmp2, tmp2, #0xff8
-	cbz	tmp2, L(page_end_loop)
+	/* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/
+	LS_FW	data2, tmp1, offset
+	LS_BK	tmp1, tmp2, neg_offset
+	subs	limit, limit, #8
+	orr	data2, data2, tmp1	/* 8 bytes from SRC2 combined from two regs.*/
+	sub	has_nul, data1, zeroones
+	eor	diff, data1, data2	/* Non-zero if differences found. */
+	orr	tmp3, data1, #REP8_7f
+	csinv	endloop, diff, xzr, hi	/* If limit, set to all ones. */
+	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL byte found in SRC1. */
+	orr	tmp3, endloop, has_nul
+	cbnz	tmp3, L(full_check)

 	ldr	data1, [src1], #8
-	ldr	data2, [src2], #8
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, #REP8_7f
-	eor	diff, data1, data2	/* Non-zero if differences found. */
-	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator. */
-	ccmp	diff, #0, #0, eq
-	b.ne	L(not_limit)
-	subs	limit_wd, limit_wd, #1
-	b.pl	L(loop_misaligned)
+L(misaligned_mid_loop):
+	/* STEP_B: Compare first part of data1 to second part of tmp2. */
+	LS_FW	data2, tmp2, offset
+#ifdef __AARCH64EB__
+	/* For big-endian we do a byte reverse to avoid carry-propagation
+	   problem described above. This way we can reuse the has_nul in the
+	   next step and also use syndrome value trick at the end. */
+	rev	tmp3, data1
+	#define data1_fixed tmp3
+#else
+	#define data1_fixed data1
+#endif
+	sub	has_nul, data1_fixed, zeroones
+	orr	tmp3, data1_fixed, #REP8_7f
+	eor	diff, data2, data1	/* Non-zero if differences found. */
+	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL terminator. */
+#ifdef __AARCH64EB__
+	rev	has_nul, has_nul
+#endif
+	cmp	limit, neg_offset, lsr #3
+	orr	syndrome, diff, has_nul
+	bic	syndrome, syndrome, mask	/* Ignore later bytes. */
+	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones. */
+	cbnz	tmp3, L(syndrome_check)

-L(done_loop):
-	/* We found a difference or a NULL before the limit was reached. */
-	and	limit, limit, #7
-	cbz	limit, L(not_limit)
-	/* Read the last word. */
-	sub	src1, src1, 8
-	sub	src2, src2, 8
-	ldr	data1, [src1, limit]
-	ldr	data2, [src2, limit]
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, #REP8_7f
-	eor	diff, data1, data2	/* Non-zero if differences found. */
-	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator. */
-	ccmp	diff, #0, #0, eq
-	b.ne	L(not_limit)
+	/* STEP_C: Compare second part of data1 to first part of tmp1. */
+	ldp	tmp1, tmp2, [src2], #16
+	cmp	limit, #8
+	LS_BK	data2, tmp1, neg_offset
+	eor	diff, data2, data1	/* Non-zero if differences found. */
+	orr	syndrome, diff, has_nul
+	and	syndrome, syndrome, mask	/* Ignore earlier bytes. */
+	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones. */
+	cbnz	tmp3, L(syndrome_check)
+
+	ldr	data1, [src1], #8
+	sub	limit, limit, #8
+	b	L(loop_misaligned)
+
+#ifdef __AARCH64EB__
+L(syndrome_check):
+	clz	pos, syndrome
+	cmp	pos, limit, lsl #3
+	b.lo	L(end_quick)
+#endif

 L(ret0):
 	mov	result, #0
-	RET
+	ret

 END (strncmp)
 libc_hidden_builtin_def (strncmp)
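
The STEP_A/STEP_B/STEP_C scheme above never loads across SRC2's next
alignment boundary: SRC2 is read in aligned 16-byte pairs, and each
unaligned 8-byte window is reconstructed with a forward and a backward
shift (the LS_FW/LS_BK macros) followed by an orr. A minimal C sketch of
that recombination on little-endian, assuming 0 < off < 64 (the function
name is illustrative, not taken from the patch):

#include <stdint.h>

/* Combine two aligned 64-bit loads into the unaligned word that starts
   "off" bits past the first one; mirrors LS_FW/LS_BK plus the orr.  */
static uint64_t
combine_unaligned (uint64_t lo, uint64_t hi, unsigned int off)
{
  return (lo >> off) | (hi << (64 - off));
}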
@ -1,845 +0,0 @@
From e1d3312015e8f70344620375aedf91afe7e7e7a4 Mon Sep 17 00:00:00 2001
From: lijianglin <lijianglin2@huawei.com>
Date: Tue, 27 Jun 2023 20:15:49 +0800
Subject: add GB18030-2022 charmap and test the entire GB18030 charmap [BZ
 #30243]

Support GB18030-2022 by adding and changing some of its transcoding
relationships. Details are as follows:
Add 25 transcoding relationships:
UE81E 0x82359037
UE826 0x82359038
UE82B 0x82359039
UE82C 0x82359130
UE832 0x82359131
UE843 0x82359132
UE854 0x82359133
UE864 0x82359134
UE78D 0x84318236
UE78F 0x84318237
UE78E 0x84318238
UE790 0x84318239
UE791 0x84318330
UE792 0x84318331
UE793 0x84318332
UE794 0x84318333
UE795 0x84318334
UE796 0x84318335
UE816 0xfe51
UE817 0xfe52
UE818 0xfe53
UE831 0xfe6c
UE83B 0xfe76
UE855 0xfe91
Change 6 transcoding relationships:
U20087 0x95329031
U20089 0x95329033
U200CC 0x95329730
U215D7 0x9536b937
U2298F 0x9630ba35
U241FE 0x9635b630
Test the entire GB18030 charmap, not only the Unicode BMP part.

Co-authored-by: yangyanchao <yangyanchao6@huawei.com>
Co-authored-by: liqingqing <liqingqing3@huawei.com>
Co-authored-by: Bruno Haible <bruno@clisp.org>
Reviewed-by: Andreas Schwab <schwab@suse.de>
Reviewed-by: Mike FABIAN <mfabian@redhat.com>

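One way to observe the remapping described above is to push a two-byte
sequence through iconv(3): with this patch applied, 0xfe 0x51 decodes to
the private-use code point U+E816 instead of U+20087. A hypothetical
check (error handling trimmed; assumes a glibc that carries this patch):

#include <iconv.h>
#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  iconv_t cd = iconv_open ("UCS-4LE", "GB18030");
  char in[] = "\xfe\x51";
  uint32_t ch = 0;
  char *inp = in, *outp = (char *) &ch;
  size_t inleft = 2, outleft = sizeof ch;
  if (cd == (iconv_t) -1
      || iconv (cd, &inp, &inleft, &outp, &outleft) == (size_t) -1)
    return 1;
  printf ("0xfe51 -> U+%04X\n", ch);  /* Expect U+E816 after the patch.  */
  iconv_close (cd);
  return 0;
}
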
diff --git a/iconvdata/gb18030.c b/iconvdata/gb18030.c
index 9996a59eaf..be6cfe652c 100644
--- a/iconvdata/gb18030.c
+++ b/iconvdata/gb18030.c
@@ -6009,49 +6009,50 @@ static const uint16_t __twobyte_to_ucs[] =
   [0x5dc2] = 0xfa0e, [0x5dc3] = 0xfa0f, [0x5dc4] = 0xfa11, [0x5dc5] = 0xfa13,
   [0x5dc6] = 0xfa14, [0x5dc7] = 0xfa18, [0x5dc8] = 0xfa1f, [0x5dc9] = 0xfa20,
   [0x5dca] = 0xfa21, [0x5dcb] = 0xfa23, [0x5dcc] = 0xfa24, [0x5dcd] = 0xfa27,
-  [0x5dce] = 0xfa28, [0x5dcf] = 0xfa29, [0x5dd0] = 0x2e81, [0x5dd4] = 0x2e84,
-  [0x5dd5] = 0x3473, [0x5dd6] = 0x3447, [0x5dd7] = 0x2e88, [0x5dd8] = 0x2e8b,
-  [0x5dd9] = 0x9fb4, [0x5dda] = 0x359e, [0x5ddb] = 0x361a, [0x5ddc] = 0x360e,
-  [0x5ddd] = 0x2e8c, [0x5dde] = 0x2e97, [0x5ddf] = 0x396e, [0x5de0] = 0x3918,
-  [0x5de1] = 0x9fb5, [0x5de2] = 0x39cf, [0x5de3] = 0x39df, [0x5de4] = 0x3a73,
-  [0x5de5] = 0x39d0, [0x5de6] = 0x9fb6, [0x5de7] = 0x9fb7, [0x5de8] = 0x3b4e,
-  [0x5de9] = 0x3c6e, [0x5dea] = 0x3ce0, [0x5deb] = 0x2ea7, [0x5ded] = 0x9fb8,
+  [0x5dce] = 0xfa28, [0x5dcf] = 0xfa29, [0x5dd0] = 0x2e81, [0x5dd1] = 0xe816,
+  [0x5dd2] = 0xe817, [0x5dd3] = 0xe818, [0x5dd4] = 0x2e84, [0x5dd5] = 0x3473,
+  [0x5dd6] = 0x3447, [0x5dd7] = 0x2e88, [0x5dd8] = 0x2e8b, [0x5dd9] = 0x9fb4,
+  [0x5dda] = 0x359e, [0x5ddb] = 0x361a, [0x5ddc] = 0x360e, [0x5ddd] = 0x2e8c,
+  [0x5dde] = 0x2e97, [0x5ddf] = 0x396e, [0x5de0] = 0x3918, [0x5de1] = 0x9fb5,
+  [0x5de2] = 0x39cf, [0x5de3] = 0x39df, [0x5de4] = 0x3a73, [0x5de5] = 0x39d0,
+  [0x5de6] = 0x9fb6, [0x5de7] = 0x9fb7, [0x5de8] = 0x3b4e, [0x5de9] = 0x3c6e,
+  [0x5dea] = 0x3ce0, [0x5deb] = 0x2ea7, [0x5dec] = 0xe831, [0x5ded] = 0x9fb8,
   [0x5dee] = 0x2eaa, [0x5def] = 0x4056, [0x5df0] = 0x415f, [0x5df1] = 0x2eae,
   [0x5df2] = 0x4337, [0x5df3] = 0x2eb3, [0x5df4] = 0x2eb6, [0x5df5] = 0x2eb7,
-  [0x5df7] = 0x43b1, [0x5df8] = 0x43ac, [0x5df9] = 0x2ebb, [0x5dfa] = 0x43dd,
-  [0x5dfb] = 0x44d6, [0x5dfc] = 0x4661, [0x5dfd] = 0x464c, [0x5dfe] = 0x9fb9,
-  [0x5e00] = 0x4723, [0x5e01] = 0x4729, [0x5e02] = 0x477c, [0x5e03] = 0x478d,
-  [0x5e04] = 0x2eca, [0x5e05] = 0x4947, [0x5e06] = 0x497a, [0x5e07] = 0x497d,
-  [0x5e08] = 0x4982, [0x5e09] = 0x4983, [0x5e0a] = 0x4985, [0x5e0b] = 0x4986,
-  [0x5e0c] = 0x499f, [0x5e0d] = 0x499b, [0x5e0e] = 0x49b7, [0x5e0f] = 0x49b6,
-  [0x5e10] = 0x9fba, [0x5e12] = 0x4ca3, [0x5e13] = 0x4c9f, [0x5e14] = 0x4ca0,
-  [0x5e15] = 0x4ca1, [0x5e16] = 0x4c77, [0x5e17] = 0x4ca2, [0x5e18] = 0x4d13,
-  [0x5e19] = 0x4d14, [0x5e1a] = 0x4d15, [0x5e1b] = 0x4d16, [0x5e1c] = 0x4d17,
-  [0x5e1d] = 0x4d18, [0x5e1e] = 0x4d19, [0x5e1f] = 0x4dae, [0x5e20] = 0x9fbb,
-  [0x5e21] = 0xe468, [0x5e22] = 0xe469, [0x5e23] = 0xe46a, [0x5e24] = 0xe46b,
-  [0x5e25] = 0xe46c, [0x5e26] = 0xe46d, [0x5e27] = 0xe46e, [0x5e28] = 0xe46f,
-  [0x5e29] = 0xe470, [0x5e2a] = 0xe471, [0x5e2b] = 0xe472, [0x5e2c] = 0xe473,
-  [0x5e2d] = 0xe474, [0x5e2e] = 0xe475, [0x5e2f] = 0xe476, [0x5e30] = 0xe477,
-  [0x5e31] = 0xe478, [0x5e32] = 0xe479, [0x5e33] = 0xe47a, [0x5e34] = 0xe47b,
-  [0x5e35] = 0xe47c, [0x5e36] = 0xe47d, [0x5e37] = 0xe47e, [0x5e38] = 0xe47f,
-  [0x5e39] = 0xe480, [0x5e3a] = 0xe481, [0x5e3b] = 0xe482, [0x5e3c] = 0xe483,
-  [0x5e3d] = 0xe484, [0x5e3e] = 0xe485, [0x5e3f] = 0xe486, [0x5e40] = 0xe487,
-  [0x5e41] = 0xe488, [0x5e42] = 0xe489, [0x5e43] = 0xe48a, [0x5e44] = 0xe48b,
-  [0x5e45] = 0xe48c, [0x5e46] = 0xe48d, [0x5e47] = 0xe48e, [0x5e48] = 0xe48f,
-  [0x5e49] = 0xe490, [0x5e4a] = 0xe491, [0x5e4b] = 0xe492, [0x5e4c] = 0xe493,
-  [0x5e4d] = 0xe494, [0x5e4e] = 0xe495, [0x5e4f] = 0xe496, [0x5e50] = 0xe497,
-  [0x5e51] = 0xe498, [0x5e52] = 0xe499, [0x5e53] = 0xe49a, [0x5e54] = 0xe49b,
-  [0x5e55] = 0xe49c, [0x5e56] = 0xe49d, [0x5e57] = 0xe49e, [0x5e58] = 0xe49f,
-  [0x5e59] = 0xe4a0, [0x5e5a] = 0xe4a1, [0x5e5b] = 0xe4a2, [0x5e5c] = 0xe4a3,
-  [0x5e5d] = 0xe4a4, [0x5e5e] = 0xe4a5, [0x5e5f] = 0xe4a6, [0x5e60] = 0xe4a7,
-  [0x5e61] = 0xe4a8, [0x5e62] = 0xe4a9, [0x5e63] = 0xe4aa, [0x5e64] = 0xe4ab,
-  [0x5e65] = 0xe4ac, [0x5e66] = 0xe4ad, [0x5e67] = 0xe4ae, [0x5e68] = 0xe4af,
-  [0x5e69] = 0xe4b0, [0x5e6a] = 0xe4b1, [0x5e6b] = 0xe4b2, [0x5e6c] = 0xe4b3,
-  [0x5e6d] = 0xe4b4, [0x5e6e] = 0xe4b5, [0x5e6f] = 0xe4b6, [0x5e70] = 0xe4b7,
-  [0x5e71] = 0xe4b8, [0x5e72] = 0xe4b9, [0x5e73] = 0xe4ba, [0x5e74] = 0xe4bb,
-  [0x5e75] = 0xe4bc, [0x5e76] = 0xe4bd, [0x5e77] = 0xe4be, [0x5e78] = 0xe4bf,
-  [0x5e79] = 0xe4c0, [0x5e7a] = 0xe4c1, [0x5e7b] = 0xe4c2, [0x5e7c] = 0xe4c3,
-  [0x5e7d] = 0xe4c4, [0x5e7e] = 0xe4c5,
+  [0x5df6] = 0xe83b, [0x5df7] = 0x43b1, [0x5df8] = 0x43ac, [0x5df9] = 0x2ebb,
+  [0x5dfa] = 0x43dd, [0x5dfb] = 0x44d6, [0x5dfc] = 0x4661, [0x5dfd] = 0x464c,
+  [0x5dfe] = 0x9fb9, [0x5e00] = 0x4723, [0x5e01] = 0x4729, [0x5e02] = 0x477c,
+  [0x5e03] = 0x478d, [0x5e04] = 0x2eca, [0x5e05] = 0x4947, [0x5e06] = 0x497a,
+  [0x5e07] = 0x497d, [0x5e08] = 0x4982, [0x5e09] = 0x4983, [0x5e0a] = 0x4985,
+  [0x5e0b] = 0x4986, [0x5e0c] = 0x499f, [0x5e0d] = 0x499b, [0x5e0e] = 0x49b7,
+  [0x5e0f] = 0x49b6, [0x5e10] = 0x9fba, [0x5e11] = 0xe855, [0x5e12] = 0x4ca3,
+  [0x5e13] = 0x4c9f, [0x5e14] = 0x4ca0, [0x5e15] = 0x4ca1, [0x5e16] = 0x4c77,
+  [0x5e17] = 0x4ca2, [0x5e18] = 0x4d13, [0x5e19] = 0x4d14, [0x5e1a] = 0x4d15,
+  [0x5e1b] = 0x4d16, [0x5e1c] = 0x4d17, [0x5e1d] = 0x4d18, [0x5e1e] = 0x4d19,
+  [0x5e1f] = 0x4dae, [0x5e20] = 0x9fbb, [0x5e21] = 0xe468, [0x5e22] = 0xe469,
+  [0x5e23] = 0xe46a, [0x5e24] = 0xe46b, [0x5e25] = 0xe46c, [0x5e26] = 0xe46d,
+  [0x5e27] = 0xe46e, [0x5e28] = 0xe46f, [0x5e29] = 0xe470, [0x5e2a] = 0xe471,
+  [0x5e2b] = 0xe472, [0x5e2c] = 0xe473, [0x5e2d] = 0xe474, [0x5e2e] = 0xe475,
+  [0x5e2f] = 0xe476, [0x5e30] = 0xe477, [0x5e31] = 0xe478, [0x5e32] = 0xe479,
+  [0x5e33] = 0xe47a, [0x5e34] = 0xe47b, [0x5e35] = 0xe47c, [0x5e36] = 0xe47d,
+  [0x5e37] = 0xe47e, [0x5e38] = 0xe47f, [0x5e39] = 0xe480, [0x5e3a] = 0xe481,
+  [0x5e3b] = 0xe482, [0x5e3c] = 0xe483, [0x5e3d] = 0xe484, [0x5e3e] = 0xe485,
+  [0x5e3f] = 0xe486, [0x5e40] = 0xe487, [0x5e41] = 0xe488, [0x5e42] = 0xe489,
+  [0x5e43] = 0xe48a, [0x5e44] = 0xe48b, [0x5e45] = 0xe48c, [0x5e46] = 0xe48d,
+  [0x5e47] = 0xe48e, [0x5e48] = 0xe48f, [0x5e49] = 0xe490, [0x5e4a] = 0xe491,
+  [0x5e4b] = 0xe492, [0x5e4c] = 0xe493, [0x5e4d] = 0xe494, [0x5e4e] = 0xe495,
+  [0x5e4f] = 0xe496, [0x5e50] = 0xe497, [0x5e51] = 0xe498, [0x5e52] = 0xe499,
+  [0x5e53] = 0xe49a, [0x5e54] = 0xe49b, [0x5e55] = 0xe49c, [0x5e56] = 0xe49d,
+  [0x5e57] = 0xe49e, [0x5e58] = 0xe49f, [0x5e59] = 0xe4a0, [0x5e5a] = 0xe4a1,
+  [0x5e5b] = 0xe4a2, [0x5e5c] = 0xe4a3, [0x5e5d] = 0xe4a4, [0x5e5e] = 0xe4a5,
+  [0x5e5f] = 0xe4a6, [0x5e60] = 0xe4a7, [0x5e61] = 0xe4a8, [0x5e62] = 0xe4a9,
+  [0x5e63] = 0xe4aa, [0x5e64] = 0xe4ab, [0x5e65] = 0xe4ac, [0x5e66] = 0xe4ad,
+  [0x5e67] = 0xe4ae, [0x5e68] = 0xe4af, [0x5e69] = 0xe4b0, [0x5e6a] = 0xe4b1,
+  [0x5e6b] = 0xe4b2, [0x5e6c] = 0xe4b3, [0x5e6d] = 0xe4b4, [0x5e6e] = 0xe4b5,
+  [0x5e6f] = 0xe4b6, [0x5e70] = 0xe4b7, [0x5e71] = 0xe4b8, [0x5e72] = 0xe4b9,
+  [0x5e73] = 0xe4ba, [0x5e74] = 0xe4bb, [0x5e75] = 0xe4bc, [0x5e76] = 0xe4bd,
+  [0x5e77] = 0xe4be, [0x5e78] = 0xe4bf, [0x5e79] = 0xe4c0, [0x5e7a] = 0xe4c1,
+  [0x5e7b] = 0xe4c2, [0x5e7c] = 0xe4c3, [0x5e7d] = 0xe4c4, [0x5e7e] = 0xe4c5,
 };

 /* Table for GB18030 -> UCS-4, containing the four-byte characters only,
@@ -8680,7 +8681,9 @@ static const uint16_t __fourbyte_to_ucs[0x99e2 - 6637 - 2110 - 14404 - 4295] =
   [0x2838] = 0x9fa6, [0x2839] = 0x9fa7, [0x283a] = 0x9fa8, [0x283b] = 0x9fa9,
   [0x283c] = 0x9faa, [0x283d] = 0x9fab, [0x283e] = 0x9fac, [0x283f] = 0x9fad,
   [0x2840] = 0x9fae, [0x2841] = 0x9faf, [0x2842] = 0x9fb0, [0x2843] = 0x9fb1,
-  [0x2844] = 0x9fb2, [0x2845] = 0x9fb3, [0x284e] = 0xe76c, [0x284f] = 0xe7c8,
+  [0x2844] = 0x9fb2, [0x2845] = 0x9fb3, [0x2846] = 0xe81e, [0x2847] = 0xe826,
+  [0x2848] = 0xe82b, [0x2849] = 0xe82c, [0x284a] = 0xe832, [0x284b] = 0xe843,
+  [0x284c] = 0xe854, [0x284d] = 0xe864, [0x284e] = 0xe76c, [0x284f] = 0xe7c8,
   [0x2850] = 0xe7e7, [0x2851] = 0xe7e8, [0x2852] = 0xe7e9, [0x2853] = 0xe7ea,
   [0x2854] = 0xe7eb, [0x2855] = 0xe7ec, [0x2856] = 0xe7ed, [0x2857] = 0xe7ee,
   [0x2858] = 0xe7ef, [0x2859] = 0xe7f0, [0x285a] = 0xe7f1, [0x285b] = 0xe7f2,
@@ -9008,84 +9011,86 @@ static const uint16_t __fourbyte_to_ucs[0x99e2 - 6637 - 2110 - 14404 - 4295] =
   [0x2d60] = 0xfe02, [0x2d61] = 0xfe03, [0x2d62] = 0xfe04, [0x2d63] = 0xfe05,
   [0x2d64] = 0xfe06, [0x2d65] = 0xfe07, [0x2d66] = 0xfe08, [0x2d67] = 0xfe09,
   [0x2d68] = 0xfe0a, [0x2d69] = 0xfe0b, [0x2d6a] = 0xfe0c, [0x2d6b] = 0xfe0d,
-  [0x2d6c] = 0xfe0e, [0x2d6d] = 0xfe0f, [0x2d78] = 0xfe1a, [0x2d79] = 0xfe1b,
-  [0x2d7a] = 0xfe1c, [0x2d7b] = 0xfe1d, [0x2d7c] = 0xfe1e, [0x2d7d] = 0xfe1f,
-  [0x2d7e] = 0xfe20, [0x2d7f] = 0xfe21, [0x2d80] = 0xfe22, [0x2d81] = 0xfe23,
-  [0x2d82] = 0xfe24, [0x2d83] = 0xfe25, [0x2d84] = 0xfe26, [0x2d85] = 0xfe27,
-  [0x2d86] = 0xfe28, [0x2d87] = 0xfe29, [0x2d88] = 0xfe2a, [0x2d89] = 0xfe2b,
-  [0x2d8a] = 0xfe2c, [0x2d8b] = 0xfe2d, [0x2d8c] = 0xfe2e, [0x2d8d] = 0xfe2f,
-  [0x2d8e] = 0xfe32, [0x2d8f] = 0xfe45, [0x2d90] = 0xfe46, [0x2d91] = 0xfe47,
-  [0x2d92] = 0xfe48, [0x2d93] = 0xfe53, [0x2d94] = 0xfe58, [0x2d95] = 0xfe67,
-  [0x2d96] = 0xfe6c, [0x2d97] = 0xfe6d, [0x2d98] = 0xfe6e, [0x2d99] = 0xfe6f,
-  [0x2d9a] = 0xfe70, [0x2d9b] = 0xfe71, [0x2d9c] = 0xfe72, [0x2d9d] = 0xfe73,
-  [0x2d9e] = 0xfe74, [0x2d9f] = 0xfe75, [0x2da0] = 0xfe76, [0x2da1] = 0xfe77,
-  [0x2da2] = 0xfe78, [0x2da3] = 0xfe79, [0x2da4] = 0xfe7a, [0x2da5] = 0xfe7b,
-  [0x2da6] = 0xfe7c, [0x2da7] = 0xfe7d, [0x2da8] = 0xfe7e, [0x2da9] = 0xfe7f,
-  [0x2daa] = 0xfe80, [0x2dab] = 0xfe81, [0x2dac] = 0xfe82, [0x2dad] = 0xfe83,
-  [0x2dae] = 0xfe84, [0x2daf] = 0xfe85, [0x2db0] = 0xfe86, [0x2db1] = 0xfe87,
-  [0x2db2] = 0xfe88, [0x2db3] = 0xfe89, [0x2db4] = 0xfe8a, [0x2db5] = 0xfe8b,
-  [0x2db6] = 0xfe8c, [0x2db7] = 0xfe8d, [0x2db8] = 0xfe8e, [0x2db9] = 0xfe8f,
-  [0x2dba] = 0xfe90, [0x2dbb] = 0xfe91, [0x2dbc] = 0xfe92, [0x2dbd] = 0xfe93,
-  [0x2dbe] = 0xfe94, [0x2dbf] = 0xfe95, [0x2dc0] = 0xfe96, [0x2dc1] = 0xfe97,
-  [0x2dc2] = 0xfe98, [0x2dc3] = 0xfe99, [0x2dc4] = 0xfe9a, [0x2dc5] = 0xfe9b,
-  [0x2dc6] = 0xfe9c, [0x2dc7] = 0xfe9d, [0x2dc8] = 0xfe9e, [0x2dc9] = 0xfe9f,
-  [0x2dca] = 0xfea0, [0x2dcb] = 0xfea1, [0x2dcc] = 0xfea2, [0x2dcd] = 0xfea3,
-  [0x2dce] = 0xfea4, [0x2dcf] = 0xfea5, [0x2dd0] = 0xfea6, [0x2dd1] = 0xfea7,
-  [0x2dd2] = 0xfea8, [0x2dd3] = 0xfea9, [0x2dd4] = 0xfeaa, [0x2dd5] = 0xfeab,
-  [0x2dd6] = 0xfeac, [0x2dd7] = 0xfead, [0x2dd8] = 0xfeae, [0x2dd9] = 0xfeaf,
-  [0x2dda] = 0xfeb0, [0x2ddb] = 0xfeb1, [0x2ddc] = 0xfeb2, [0x2ddd] = 0xfeb3,
-  [0x2dde] = 0xfeb4, [0x2ddf] = 0xfeb5, [0x2de0] = 0xfeb6, [0x2de1] = 0xfeb7,
-  [0x2de2] = 0xfeb8, [0x2de3] = 0xfeb9, [0x2de4] = 0xfeba, [0x2de5] = 0xfebb,
-  [0x2de6] = 0xfebc, [0x2de7] = 0xfebd, [0x2de8] = 0xfebe, [0x2de9] = 0xfebf,
-  [0x2dea] = 0xfec0, [0x2deb] = 0xfec1, [0x2dec] = 0xfec2, [0x2ded] = 0xfec3,
-  [0x2dee] = 0xfec4, [0x2def] = 0xfec5, [0x2df0] = 0xfec6, [0x2df1] = 0xfec7,
-  [0x2df2] = 0xfec8, [0x2df3] = 0xfec9, [0x2df4] = 0xfeca, [0x2df5] = 0xfecb,
-  [0x2df6] = 0xfecc, [0x2df7] = 0xfecd, [0x2df8] = 0xfece, [0x2df9] = 0xfecf,
-  [0x2dfa] = 0xfed0, [0x2dfb] = 0xfed1, [0x2dfc] = 0xfed2, [0x2dfd] = 0xfed3,
-  [0x2dfe] = 0xfed4, [0x2dff] = 0xfed5, [0x2e00] = 0xfed6, [0x2e01] = 0xfed7,
-  [0x2e02] = 0xfed8, [0x2e03] = 0xfed9, [0x2e04] = 0xfeda, [0x2e05] = 0xfedb,
-  [0x2e06] = 0xfedc, [0x2e07] = 0xfedd, [0x2e08] = 0xfede, [0x2e09] = 0xfedf,
-  [0x2e0a] = 0xfee0, [0x2e0b] = 0xfee1, [0x2e0c] = 0xfee2, [0x2e0d] = 0xfee3,
-  [0x2e0e] = 0xfee4, [0x2e0f] = 0xfee5, [0x2e10] = 0xfee6, [0x2e11] = 0xfee7,
-  [0x2e12] = 0xfee8, [0x2e13] = 0xfee9, [0x2e14] = 0xfeea, [0x2e15] = 0xfeeb,
-  [0x2e16] = 0xfeec, [0x2e17] = 0xfeed, [0x2e18] = 0xfeee, [0x2e19] = 0xfeef,
-  [0x2e1a] = 0xfef0, [0x2e1b] = 0xfef1, [0x2e1c] = 0xfef2, [0x2e1d] = 0xfef3,
-  [0x2e1e] = 0xfef4, [0x2e1f] = 0xfef5, [0x2e20] = 0xfef6, [0x2e21] = 0xfef7,
-  [0x2e22] = 0xfef8, [0x2e23] = 0xfef9, [0x2e24] = 0xfefa, [0x2e25] = 0xfefb,
-  [0x2e26] = 0xfefc, [0x2e27] = 0xfefd, [0x2e28] = 0xfefe, [0x2e29] = 0xfeff,
-  [0x2e2a] = 0xff00, [0x2e2b] = 0xff5f, [0x2e2c] = 0xff60, [0x2e2d] = 0xff61,
-  [0x2e2e] = 0xff62, [0x2e2f] = 0xff63, [0x2e30] = 0xff64, [0x2e31] = 0xff65,
-  [0x2e32] = 0xff66, [0x2e33] = 0xff67, [0x2e34] = 0xff68, [0x2e35] = 0xff69,
-  [0x2e36] = 0xff6a, [0x2e37] = 0xff6b, [0x2e38] = 0xff6c, [0x2e39] = 0xff6d,
-  [0x2e3a] = 0xff6e, [0x2e3b] = 0xff6f, [0x2e3c] = 0xff70, [0x2e3d] = 0xff71,
-  [0x2e3e] = 0xff72, [0x2e3f] = 0xff73, [0x2e40] = 0xff74, [0x2e41] = 0xff75,
-  [0x2e42] = 0xff76, [0x2e43] = 0xff77, [0x2e44] = 0xff78, [0x2e45] = 0xff79,
-  [0x2e46] = 0xff7a, [0x2e47] = 0xff7b, [0x2e48] = 0xff7c, [0x2e49] = 0xff7d,
-  [0x2e4a] = 0xff7e, [0x2e4b] = 0xff7f, [0x2e4c] = 0xff80, [0x2e4d] = 0xff81,
-  [0x2e4e] = 0xff82, [0x2e4f] = 0xff83, [0x2e50] = 0xff84, [0x2e51] = 0xff85,
-  [0x2e52] = 0xff86, [0x2e53] = 0xff87, [0x2e54] = 0xff88, [0x2e55] = 0xff89,
-  [0x2e56] = 0xff8a, [0x2e57] = 0xff8b, [0x2e58] = 0xff8c, [0x2e59] = 0xff8d,
-  [0x2e5a] = 0xff8e, [0x2e5b] = 0xff8f, [0x2e5c] = 0xff90, [0x2e5d] = 0xff91,
-  [0x2e5e] = 0xff92, [0x2e5f] = 0xff93, [0x2e60] = 0xff94, [0x2e61] = 0xff95,
-  [0x2e62] = 0xff96, [0x2e63] = 0xff97, [0x2e64] = 0xff98, [0x2e65] = 0xff99,
-  [0x2e66] = 0xff9a, [0x2e67] = 0xff9b, [0x2e68] = 0xff9c, [0x2e69] = 0xff9d,
-  [0x2e6a] = 0xff9e, [0x2e6b] = 0xff9f, [0x2e6c] = 0xffa0, [0x2e6d] = 0xffa1,
-  [0x2e6e] = 0xffa2, [0x2e6f] = 0xffa3, [0x2e70] = 0xffa4, [0x2e71] = 0xffa5,
-  [0x2e72] = 0xffa6, [0x2e73] = 0xffa7, [0x2e74] = 0xffa8, [0x2e75] = 0xffa9,
-  [0x2e76] = 0xffaa, [0x2e77] = 0xffab, [0x2e78] = 0xffac, [0x2e79] = 0xffad,
-  [0x2e7a] = 0xffae, [0x2e7b] = 0xffaf, [0x2e7c] = 0xffb0, [0x2e7d] = 0xffb1,
-  [0x2e7e] = 0xffb2, [0x2e7f] = 0xffb3, [0x2e80] = 0xffb4, [0x2e81] = 0xffb5,
-  [0x2e82] = 0xffb6, [0x2e83] = 0xffb7, [0x2e84] = 0xffb8, [0x2e85] = 0xffb9,
-  [0x2e86] = 0xffba, [0x2e87] = 0xffbb, [0x2e88] = 0xffbc, [0x2e89] = 0xffbd,
-  [0x2e8a] = 0xffbe, [0x2e8b] = 0xffbf, [0x2e8c] = 0xffc0, [0x2e8d] = 0xffc1,
-  [0x2e8e] = 0xffc2, [0x2e8f] = 0xffc3, [0x2e90] = 0xffc4, [0x2e91] = 0xffc5,
-  [0x2e92] = 0xffc6, [0x2e93] = 0xffc7, [0x2e94] = 0xffc8, [0x2e95] = 0xffc9,
-  [0x2e96] = 0xffca, [0x2e97] = 0xffcb, [0x2e98] = 0xffcc, [0x2e99] = 0xffcd,
-  [0x2e9a] = 0xffce, [0x2e9b] = 0xffcf, [0x2e9c] = 0xffd0, [0x2e9d] = 0xffd1,
-  [0x2e9e] = 0xffd2, [0x2e9f] = 0xffd3, [0x2ea0] = 0xffd4, [0x2ea1] = 0xffd5,
-  [0x2ea2] = 0xffd6, [0x2ea3] = 0xffd7, [0x2ea4] = 0xffd8, [0x2ea5] = 0xffd9,
-  [0x2ea6] = 0xffda, [0x2ea7] = 0xffdb, [0x2ea8] = 0xffdc, [0x2ea9] = 0xffdd,
-  [0x2eaa] = 0xffde, [0x2eab] = 0xffdf,
+  [0x2d6c] = 0xfe0e, [0x2d6d] = 0xfe0f, [0x2d6e] = 0xe78d, [0x2d6f] = 0xe78f,
+  [0x2d70] = 0xe78e, [0x2d71] = 0xe790, [0x2d72] = 0xe791, [0x2d73] = 0xe792,
+  [0x2d74] = 0xe793, [0x2d75] = 0xe794, [0x2d76] = 0xe795, [0x2d77] = 0xe796,
+  [0x2d78] = 0xfe1a, [0x2d79] = 0xfe1b, [0x2d7a] = 0xfe1c, [0x2d7b] = 0xfe1d,
+  [0x2d7c] = 0xfe1e, [0x2d7d] = 0xfe1f, [0x2d7e] = 0xfe20, [0x2d7f] = 0xfe21,
+  [0x2d80] = 0xfe22, [0x2d81] = 0xfe23, [0x2d82] = 0xfe24, [0x2d83] = 0xfe25,
+  [0x2d84] = 0xfe26, [0x2d85] = 0xfe27, [0x2d86] = 0xfe28, [0x2d87] = 0xfe29,
+  [0x2d88] = 0xfe2a, [0x2d89] = 0xfe2b, [0x2d8a] = 0xfe2c, [0x2d8b] = 0xfe2d,
+  [0x2d8c] = 0xfe2e, [0x2d8d] = 0xfe2f, [0x2d8e] = 0xfe32, [0x2d8f] = 0xfe45,
+  [0x2d90] = 0xfe46, [0x2d91] = 0xfe47, [0x2d92] = 0xfe48, [0x2d93] = 0xfe53,
+  [0x2d94] = 0xfe58, [0x2d95] = 0xfe67, [0x2d96] = 0xfe6c, [0x2d97] = 0xfe6d,
+  [0x2d98] = 0xfe6e, [0x2d99] = 0xfe6f, [0x2d9a] = 0xfe70, [0x2d9b] = 0xfe71,
+  [0x2d9c] = 0xfe72, [0x2d9d] = 0xfe73, [0x2d9e] = 0xfe74, [0x2d9f] = 0xfe75,
+  [0x2da0] = 0xfe76, [0x2da1] = 0xfe77, [0x2da2] = 0xfe78, [0x2da3] = 0xfe79,
+  [0x2da4] = 0xfe7a, [0x2da5] = 0xfe7b, [0x2da6] = 0xfe7c, [0x2da7] = 0xfe7d,
+  [0x2da8] = 0xfe7e, [0x2da9] = 0xfe7f, [0x2daa] = 0xfe80, [0x2dab] = 0xfe81,
+  [0x2dac] = 0xfe82, [0x2dad] = 0xfe83, [0x2dae] = 0xfe84, [0x2daf] = 0xfe85,
+  [0x2db0] = 0xfe86, [0x2db1] = 0xfe87, [0x2db2] = 0xfe88, [0x2db3] = 0xfe89,
+  [0x2db4] = 0xfe8a, [0x2db5] = 0xfe8b, [0x2db6] = 0xfe8c, [0x2db7] = 0xfe8d,
+  [0x2db8] = 0xfe8e, [0x2db9] = 0xfe8f, [0x2dba] = 0xfe90, [0x2dbb] = 0xfe91,
+  [0x2dbc] = 0xfe92, [0x2dbd] = 0xfe93, [0x2dbe] = 0xfe94, [0x2dbf] = 0xfe95,
+  [0x2dc0] = 0xfe96, [0x2dc1] = 0xfe97, [0x2dc2] = 0xfe98, [0x2dc3] = 0xfe99,
+  [0x2dc4] = 0xfe9a, [0x2dc5] = 0xfe9b, [0x2dc6] = 0xfe9c, [0x2dc7] = 0xfe9d,
+  [0x2dc8] = 0xfe9e, [0x2dc9] = 0xfe9f, [0x2dca] = 0xfea0, [0x2dcb] = 0xfea1,
+  [0x2dcc] = 0xfea2, [0x2dcd] = 0xfea3, [0x2dce] = 0xfea4, [0x2dcf] = 0xfea5,
+  [0x2dd0] = 0xfea6, [0x2dd1] = 0xfea7, [0x2dd2] = 0xfea8, [0x2dd3] = 0xfea9,
+  [0x2dd4] = 0xfeaa, [0x2dd5] = 0xfeab, [0x2dd6] = 0xfeac, [0x2dd7] = 0xfead,
+  [0x2dd8] = 0xfeae, [0x2dd9] = 0xfeaf, [0x2dda] = 0xfeb0, [0x2ddb] = 0xfeb1,
+  [0x2ddc] = 0xfeb2, [0x2ddd] = 0xfeb3, [0x2dde] = 0xfeb4, [0x2ddf] = 0xfeb5,
+  [0x2de0] = 0xfeb6, [0x2de1] = 0xfeb7, [0x2de2] = 0xfeb8, [0x2de3] = 0xfeb9,
+  [0x2de4] = 0xfeba, [0x2de5] = 0xfebb, [0x2de6] = 0xfebc, [0x2de7] = 0xfebd,
+  [0x2de8] = 0xfebe, [0x2de9] = 0xfebf, [0x2dea] = 0xfec0, [0x2deb] = 0xfec1,
+  [0x2dec] = 0xfec2, [0x2ded] = 0xfec3, [0x2dee] = 0xfec4, [0x2def] = 0xfec5,
+  [0x2df0] = 0xfec6, [0x2df1] = 0xfec7, [0x2df2] = 0xfec8, [0x2df3] = 0xfec9,
+  [0x2df4] = 0xfeca, [0x2df5] = 0xfecb, [0x2df6] = 0xfecc, [0x2df7] = 0xfecd,
+  [0x2df8] = 0xfece, [0x2df9] = 0xfecf, [0x2dfa] = 0xfed0, [0x2dfb] = 0xfed1,
+  [0x2dfc] = 0xfed2, [0x2dfd] = 0xfed3, [0x2dfe] = 0xfed4, [0x2dff] = 0xfed5,
+  [0x2e00] = 0xfed6, [0x2e01] = 0xfed7, [0x2e02] = 0xfed8, [0x2e03] = 0xfed9,
+  [0x2e04] = 0xfeda, [0x2e05] = 0xfedb, [0x2e06] = 0xfedc, [0x2e07] = 0xfedd,
+  [0x2e08] = 0xfede, [0x2e09] = 0xfedf, [0x2e0a] = 0xfee0, [0x2e0b] = 0xfee1,
+  [0x2e0c] = 0xfee2, [0x2e0d] = 0xfee3, [0x2e0e] = 0xfee4, [0x2e0f] = 0xfee5,
+  [0x2e10] = 0xfee6, [0x2e11] = 0xfee7, [0x2e12] = 0xfee8, [0x2e13] = 0xfee9,
+  [0x2e14] = 0xfeea, [0x2e15] = 0xfeeb, [0x2e16] = 0xfeec, [0x2e17] = 0xfeed,
+  [0x2e18] = 0xfeee, [0x2e19] = 0xfeef, [0x2e1a] = 0xfef0, [0x2e1b] = 0xfef1,
+  [0x2e1c] = 0xfef2, [0x2e1d] = 0xfef3, [0x2e1e] = 0xfef4, [0x2e1f] = 0xfef5,
+  [0x2e20] = 0xfef6, [0x2e21] = 0xfef7, [0x2e22] = 0xfef8, [0x2e23] = 0xfef9,
+  [0x2e24] = 0xfefa, [0x2e25] = 0xfefb, [0x2e26] = 0xfefc, [0x2e27] = 0xfefd,
+  [0x2e28] = 0xfefe, [0x2e29] = 0xfeff, [0x2e2a] = 0xff00, [0x2e2b] = 0xff5f,
+  [0x2e2c] = 0xff60, [0x2e2d] = 0xff61, [0x2e2e] = 0xff62, [0x2e2f] = 0xff63,
+  [0x2e30] = 0xff64, [0x2e31] = 0xff65, [0x2e32] = 0xff66, [0x2e33] = 0xff67,
+  [0x2e34] = 0xff68, [0x2e35] = 0xff69, [0x2e36] = 0xff6a, [0x2e37] = 0xff6b,
+  [0x2e38] = 0xff6c, [0x2e39] = 0xff6d, [0x2e3a] = 0xff6e, [0x2e3b] = 0xff6f,
+  [0x2e3c] = 0xff70, [0x2e3d] = 0xff71, [0x2e3e] = 0xff72, [0x2e3f] = 0xff73,
+  [0x2e40] = 0xff74, [0x2e41] = 0xff75, [0x2e42] = 0xff76, [0x2e43] = 0xff77,
+  [0x2e44] = 0xff78, [0x2e45] = 0xff79, [0x2e46] = 0xff7a, [0x2e47] = 0xff7b,
+  [0x2e48] = 0xff7c, [0x2e49] = 0xff7d, [0x2e4a] = 0xff7e, [0x2e4b] = 0xff7f,
+  [0x2e4c] = 0xff80, [0x2e4d] = 0xff81, [0x2e4e] = 0xff82, [0x2e4f] = 0xff83,
+  [0x2e50] = 0xff84, [0x2e51] = 0xff85, [0x2e52] = 0xff86, [0x2e53] = 0xff87,
+  [0x2e54] = 0xff88, [0x2e55] = 0xff89, [0x2e56] = 0xff8a, [0x2e57] = 0xff8b,
+  [0x2e58] = 0xff8c, [0x2e59] = 0xff8d, [0x2e5a] = 0xff8e, [0x2e5b] = 0xff8f,
+  [0x2e5c] = 0xff90, [0x2e5d] = 0xff91, [0x2e5e] = 0xff92, [0x2e5f] = 0xff93,
+  [0x2e60] = 0xff94, [0x2e61] = 0xff95, [0x2e62] = 0xff96, [0x2e63] = 0xff97,
+  [0x2e64] = 0xff98, [0x2e65] = 0xff99, [0x2e66] = 0xff9a, [0x2e67] = 0xff9b,
+  [0x2e68] = 0xff9c, [0x2e69] = 0xff9d, [0x2e6a] = 0xff9e, [0x2e6b] = 0xff9f,
+  [0x2e6c] = 0xffa0, [0x2e6d] = 0xffa1, [0x2e6e] = 0xffa2, [0x2e6f] = 0xffa3,
+  [0x2e70] = 0xffa4, [0x2e71] = 0xffa5, [0x2e72] = 0xffa6, [0x2e73] = 0xffa7,
+  [0x2e74] = 0xffa8, [0x2e75] = 0xffa9, [0x2e76] = 0xffaa, [0x2e77] = 0xffab,
+  [0x2e78] = 0xffac, [0x2e79] = 0xffad, [0x2e7a] = 0xffae, [0x2e7b] = 0xffaf,
+  [0x2e7c] = 0xffb0, [0x2e7d] = 0xffb1, [0x2e7e] = 0xffb2, [0x2e7f] = 0xffb3,
+  [0x2e80] = 0xffb4, [0x2e81] = 0xffb5, [0x2e82] = 0xffb6, [0x2e83] = 0xffb7,
+  [0x2e84] = 0xffb8, [0x2e85] = 0xffb9, [0x2e86] = 0xffba, [0x2e87] = 0xffbb,
+  [0x2e88] = 0xffbc, [0x2e89] = 0xffbd, [0x2e8a] = 0xffbe, [0x2e8b] = 0xffbf,
+  [0x2e8c] = 0xffc0, [0x2e8d] = 0xffc1, [0x2e8e] = 0xffc2, [0x2e8f] = 0xffc3,
+  [0x2e90] = 0xffc4, [0x2e91] = 0xffc5, [0x2e92] = 0xffc6, [0x2e93] = 0xffc7,
+  [0x2e94] = 0xffc8, [0x2e95] = 0xffc9, [0x2e96] = 0xffca, [0x2e97] = 0xffcb,
+  [0x2e98] = 0xffcc, [0x2e99] = 0xffcd, [0x2e9a] = 0xffce, [0x2e9b] = 0xffcf,
+  [0x2e9c] = 0xffd0, [0x2e9d] = 0xffd1, [0x2e9e] = 0xffd2, [0x2e9f] = 0xffd3,
+  [0x2ea0] = 0xffd4, [0x2ea1] = 0xffd5, [0x2ea2] = 0xffd6, [0x2ea3] = 0xffd7,
+  [0x2ea4] = 0xffd8, [0x2ea5] = 0xffd9, [0x2ea6] = 0xffda, [0x2ea7] = 0xffdb,
+  [0x2ea8] = 0xffdc, [0x2ea9] = 0xffdd, [0x2eaa] = 0xffde, [0x2eab] = 0xffdf,
 };

 /* Table for UCS-4 -> GB18030, for the range U+0080..U+9FBB.
@@ -23437,71 +23442,79 @@ static const unsigned char __ucs_to_gb18030_tab2[][2] =
   [0x0783] = "\xa5\xfd", [0x0784] = "\xa5\xfe", [0x0785] = "\xa6\xb9",
   [0x0786] = "\xa6\xba", [0x0787] = "\xa6\xbb", [0x0788] = "\xa6\xbc",
   [0x0789] = "\xa6\xbd", [0x078a] = "\xa6\xbe", [0x078b] = "\xa6\xbf",
-  [0x078c] = "\xa6\xc0", [0x0797] = "\xa6\xf6", [0x0798] = "\xa6\xf7",
-  [0x0799] = "\xa6\xf8", [0x079a] = "\xa6\xf9", [0x079b] = "\xa6\xfa",
-  [0x079c] = "\xa6\xfb", [0x079d] = "\xa6\xfc", [0x079e] = "\xa6\xfd",
-  [0x079f] = "\xa6\xfe", [0x07a0] = "\xa7\xc2", [0x07a1] = "\xa7\xc3",
-  [0x07a2] = "\xa7\xc4", [0x07a3] = "\xa7\xc5", [0x07a4] = "\xa7\xc6",
-  [0x07a5] = "\xa7\xc7", [0x07a6] = "\xa7\xc8", [0x07a7] = "\xa7\xc9",
-  [0x07a8] = "\xa7\xca", [0x07a9] = "\xa7\xcb", [0x07aa] = "\xa7\xcc",
-  [0x07ab] = "\xa7\xcd", [0x07ac] = "\xa7\xce", [0x07ad] = "\xa7\xcf",
-  [0x07ae] = "\xa7\xd0", [0x07af] = "\xa7\xf2", [0x07b0] = "\xa7\xf3",
-  [0x07b1] = "\xa7\xf4", [0x07b2] = "\xa7\xf5", [0x07b3] = "\xa7\xf6",
-  [0x07b4] = "\xa7\xf7", [0x07b5] = "\xa7\xf8", [0x07b6] = "\xa7\xf9",
-  [0x07b7] = "\xa7\xfa", [0x07b8] = "\xa7\xfb", [0x07b9] = "\xa7\xfc",
-  [0x07ba] = "\xa7\xfd", [0x07bb] = "\xa7\xfe", [0x07bc] = "\xa8\x96",
-  [0x07bd] = "\xa8\x97", [0x07be] = "\xa8\x98", [0x07bf] = "\xa8\x99",
-  [0x07c0] = "\xa8\x9a", [0x07c1] = "\xa8\x9b", [0x07c2] = "\xa8\x9c",
-  [0x07c3] = "\xa8\x9d", [0x07c4] = "\xa8\x9e", [0x07c5] = "\xa8\x9f",
-  [0x07c6] = "\xa8\xa0", [0x07c7] = "\x00\x01", [0x07c8] = "\x65\x9e",
-  [0x07c9] = "\xa8\xc1", [0x07ca] = "\xa8\xc2", [0x07cb] = "\xa8\xc3",
-  [0x07cc] = "\xa8\xc4", [0x07cd] = "\xa8\xea", [0x07ce] = "\xa8\xeb",
-  [0x07cf] = "\xa8\xec", [0x07d0] = "\xa8\xed", [0x07d1] = "\xa8\xee",
-  [0x07d2] = "\xa8\xef", [0x07d3] = "\xa8\xf0", [0x07d4] = "\xa8\xf1",
-  [0x07d5] = "\xa8\xf2", [0x07d6] = "\xa8\xf3", [0x07d7] = "\xa8\xf4",
-  [0x07d8] = "\xa8\xf5", [0x07d9] = "\xa8\xf6", [0x07da] = "\xa8\xf7",
-  [0x07db] = "\xa8\xf8", [0x07dc] = "\xa8\xf9", [0x07dd] = "\xa8\xfa",
-  [0x07de] = "\xa8\xfb", [0x07df] = "\xa8\xfc", [0x07e0] = "\xa8\xfd",
-  [0x07e1] = "\xa8\xfe", [0x07e2] = "\xa9\x58", [0x07e3] = "\xa9\x5b",
-  [0x07e4] = "\xa9\x5d", [0x07e5] = "\xa9\x5e", [0x07e6] = "\xa9\x5f",
-  [0x07e7] = "\x65\x9f", [0x07e8] = "\x65\xa0", [0x07e9] = "\x65\xa1",
-  [0x07ea] = "\x65\xa2", [0x07eb] = "\x65\xa3", [0x07ec] = "\x65\xa4",
-  [0x07ed] = "\x65\xa5", [0x07ee] = "\x65\xa6", [0x07ef] = "\x65\xa7",
-  [0x07f0] = "\x65\xa8", [0x07f1] = "\x65\xa9", [0x07f2] = "\x65\xaa",
-  [0x07f3] = "\x65\xab", [0x07f4] = "\xa9\x97", [0x07f5] = "\xa9\x98",
-  [0x07f6] = "\xa9\x99", [0x07f7] = "\xa9\x9a", [0x07f8] = "\xa9\x9b",
-  [0x07f9] = "\xa9\x9c", [0x07fa] = "\xa9\x9d", [0x07fb] = "\xa9\x9e",
-  [0x07fc] = "\xa9\x9f", [0x07fd] = "\xa9\xa0", [0x07fe] = "\xa9\xa1",
-  [0x07ff] = "\xa9\xa2", [0x0800] = "\xa9\xa3", [0x0801] = "\xa9\xf0",
-  [0x0802] = "\xa9\xf1", [0x0803] = "\xa9\xf2", [0x0804] = "\xa9\xf3",
-  [0x0805] = "\xa9\xf4", [0x0806] = "\xa9\xf5", [0x0807] = "\xa9\xf6",
-  [0x0808] = "\xa9\xf7", [0x0809] = "\xa9\xf8", [0x080a] = "\xa9\xf9",
-  [0x080b] = "\xa9\xfa", [0x080c] = "\xa9\xfb", [0x080d] = "\xa9\xfc",
-  [0x080e] = "\xa9\xfd", [0x080f] = "\xa9\xfe", [0x0810] = "\xd7\xfa",
-  [0x0811] = "\xd7\xfb", [0x0812] = "\xd7\xfc", [0x0813] = "\xd7\xfd",
-  [0x0814] = "\xd7\xfe", [0x0815] = "\x65\xac", [0x0819] = "\x65\xad",
-  [0x081a] = "\x65\xae", [0x081b] = "\x65\xaf", [0x081c] = "\x65\xb0",
-  [0x081d] = "\x65\xb1", [0x081f] = "\x65\xb2", [0x0820] = "\x65\xb3",
-  [0x0821] = "\x65\xb4", [0x0822] = "\x65\xb5", [0x0823] = "\x65\xb6",
-  [0x0824] = "\x65\xb7", [0x0825] = "\x65\xb8", [0x0827] = "\x65\xb9",
+  [0x078c] = "\xa6\xc0", [0x078d] = "\x7b\x84", [0x078e] = "\x7b\x86",
+  [0x078f] = "\x7b\x85", [0x0790] = "\x7b\x87", [0x0791] = "\x7b\x88",
+  [0x0792] = "\x7b\x89", [0x0793] = "\x7b\x8a", [0x0794] = "\x7b\x8b",
+  [0x0795] = "\x7b\x8c", [0x0796] = "\x7b\x8d", [0x0797] = "\xa6\xf6",
+  [0x0798] = "\xa6\xf7", [0x0799] = "\xa6\xf8", [0x079a] = "\xa6\xf9",
+  [0x079b] = "\xa6\xfa", [0x079c] = "\xa6\xfb", [0x079d] = "\xa6\xfc",
+  [0x079e] = "\xa6\xfd", [0x079f] = "\xa6\xfe", [0x07a0] = "\xa7\xc2",
+  [0x07a1] = "\xa7\xc3", [0x07a2] = "\xa7\xc4", [0x07a3] = "\xa7\xc5",
+  [0x07a4] = "\xa7\xc6", [0x07a5] = "\xa7\xc7", [0x07a6] = "\xa7\xc8",
+  [0x07a7] = "\xa7\xc9", [0x07a8] = "\xa7\xca", [0x07a9] = "\xa7\xcb",
+  [0x07aa] = "\xa7\xcc", [0x07ab] = "\xa7\xcd", [0x07ac] = "\xa7\xce",
+  [0x07ad] = "\xa7\xcf", [0x07ae] = "\xa7\xd0", [0x07af] = "\xa7\xf2",
+  [0x07b0] = "\xa7\xf3", [0x07b1] = "\xa7\xf4", [0x07b2] = "\xa7\xf5",
+  [0x07b3] = "\xa7\xf6", [0x07b4] = "\xa7\xf7", [0x07b5] = "\xa7\xf8",
+  [0x07b6] = "\xa7\xf9", [0x07b7] = "\xa7\xfa", [0x07b8] = "\xa7\xfb",
+  [0x07b9] = "\xa7\xfc", [0x07ba] = "\xa7\xfd", [0x07bb] = "\xa7\xfe",
+  [0x07bc] = "\xa8\x96", [0x07bd] = "\xa8\x97", [0x07be] = "\xa8\x98",
+  [0x07bf] = "\xa8\x99", [0x07c0] = "\xa8\x9a", [0x07c1] = "\xa8\x9b",
+  [0x07c2] = "\xa8\x9c", [0x07c3] = "\xa8\x9d", [0x07c4] = "\xa8\x9e",
+  [0x07c5] = "\xa8\x9f", [0x07c6] = "\xa8\xa0", [0x07c7] = "\x00\x01",
+  [0x07c8] = "\x65\x9e", [0x07c9] = "\xa8\xc1", [0x07ca] = "\xa8\xc2",
+  [0x07cb] = "\xa8\xc3", [0x07cc] = "\xa8\xc4", [0x07cd] = "\xa8\xea",
+  [0x07ce] = "\xa8\xeb", [0x07cf] = "\xa8\xec", [0x07d0] = "\xa8\xed",
+  [0x07d1] = "\xa8\xee", [0x07d2] = "\xa8\xef", [0x07d3] = "\xa8\xf0",
+  [0x07d4] = "\xa8\xf1", [0x07d5] = "\xa8\xf2", [0x07d6] = "\xa8\xf3",
+  [0x07d7] = "\xa8\xf4", [0x07d8] = "\xa8\xf5", [0x07d9] = "\xa8\xf6",
+  [0x07da] = "\xa8\xf7", [0x07db] = "\xa8\xf8", [0x07dc] = "\xa8\xf9",
+  [0x07dd] = "\xa8\xfa", [0x07de] = "\xa8\xfb", [0x07df] = "\xa8\xfc",
+  [0x07e0] = "\xa8\xfd", [0x07e1] = "\xa8\xfe", [0x07e2] = "\xa9\x58",
+  [0x07e3] = "\xa9\x5b", [0x07e4] = "\xa9\x5d", [0x07e5] = "\xa9\x5e",
+  [0x07e6] = "\xa9\x5f", [0x07e7] = "\x65\x9f", [0x07e8] = "\x65\xa0",
+  [0x07e9] = "\x65\xa1", [0x07ea] = "\x65\xa2", [0x07eb] = "\x65\xa3",
+  [0x07ec] = "\x65\xa4", [0x07ed] = "\x65\xa5", [0x07ee] = "\x65\xa6",
+  [0x07ef] = "\x65\xa7", [0x07f0] = "\x65\xa8", [0x07f1] = "\x65\xa9",
+  [0x07f2] = "\x65\xaa", [0x07f3] = "\x65\xab", [0x07f4] = "\xa9\x97",
+  [0x07f5] = "\xa9\x98", [0x07f6] = "\xa9\x99", [0x07f7] = "\xa9\x9a",
+  [0x07f8] = "\xa9\x9b", [0x07f9] = "\xa9\x9c", [0x07fa] = "\xa9\x9d",
+  [0x07fb] = "\xa9\x9e", [0x07fc] = "\xa9\x9f", [0x07fd] = "\xa9\xa0",
+  [0x07fe] = "\xa9\xa1", [0x07ff] = "\xa9\xa2", [0x0800] = "\xa9\xa3",
+  [0x0801] = "\xa9\xf0", [0x0802] = "\xa9\xf1", [0x0803] = "\xa9\xf2",
+  [0x0804] = "\xa9\xf3", [0x0805] = "\xa9\xf4", [0x0806] = "\xa9\xf5",
+  [0x0807] = "\xa9\xf6", [0x0808] = "\xa9\xf7", [0x0809] = "\xa9\xf8",
+  [0x080a] = "\xa9\xf9", [0x080b] = "\xa9\xfa", [0x080c] = "\xa9\xfb",
+  [0x080d] = "\xa9\xfc", [0x080e] = "\xa9\xfd", [0x080f] = "\xa9\xfe",
+  [0x0810] = "\xd7\xfa", [0x0811] = "\xd7\xfb", [0x0812] = "\xd7\xfc",
+  [0x0813] = "\xd7\xfd", [0x0814] = "\xd7\xfe", [0x0815] = "\x65\xac",
+  [0x0816] = "\xfe\x51", [0x0817] = "\xfe\x52", [0x0818] = "\xfe\x53",
+  [0x0819] = "\x65\xad", [0x081a] = "\x65\xae", [0x081b] = "\x65\xaf",
+  [0x081c] = "\x65\xb0", [0x081d] = "\x65\xb1", [0x081e] = "\x2d\x51",
+  [0x081f] = "\x65\xb2", [0x0820] = "\x65\xb3", [0x0821] = "\x65\xb4",
+  [0x0822] = "\x65\xb5", [0x0823] = "\x65\xb6", [0x0824] = "\x65\xb7",
+  [0x0825] = "\x65\xb8", [0x0826] = "\x2d\x52", [0x0827] = "\x65\xb9",
   [0x0828] = "\x65\xba", [0x0829] = "\x65\xbb", [0x082a] = "\x65\xbc",
-  [0x082d] = "\x65\xbd", [0x082e] = "\x65\xbe", [0x082f] = "\x65\xbf",
-  [0x0830] = "\x65\xc0", [0x0833] = "\x65\xc1", [0x0834] = "\x65\xc2",
-  [0x0835] = "\x65\xc3", [0x0836] = "\x65\xc4", [0x0837] = "\x65\xc5",
-  [0x0838] = "\x65\xc6", [0x0839] = "\x65\xc7", [0x083a] = "\x65\xc8",
-  [0x083c] = "\x65\xc9", [0x083d] = "\x65\xca", [0x083e] = "\x65\xcb",
-  [0x083f] = "\x65\xcc", [0x0840] = "\x65\xcd", [0x0841] = "\x65\xce",
-  [0x0842] = "\x65\xcf", [0x0844] = "\x65\xd0", [0x0845] = "\x65\xd1",
+  [0x082b] = "\x2d\x53", [0x082c] = "\x2d\x54", [0x082d] = "\x65\xbd",
+  [0x082e] = "\x65\xbe", [0x082f] = "\x65\xbf", [0x0830] = "\x65\xc0",
+  [0x0831] = "\xfe\x6c", [0x0832] = "\x2d\x55", [0x0833] = "\x65\xc1",
+  [0x0834] = "\x65\xc2", [0x0835] = "\x65\xc3", [0x0836] = "\x65\xc4",
+  [0x0837] = "\x65\xc5", [0x0838] = "\x65\xc6", [0x0839] = "\x65\xc7",
+  [0x083a] = "\x65\xc8", [0x083b] = "\xfe\x76", [0x083c] = "\x65\xc9",
+  [0x083d] = "\x65\xca", [0x083e] = "\x65\xcb", [0x083f] = "\x65\xcc",
+  [0x0840] = "\x65\xcd", [0x0841] = "\x65\xce", [0x0842] = "\x65\xcf",
+  [0x0843] = "\x2d\x56", [0x0844] = "\x65\xd0", [0x0845] = "\x65\xd1",
   [0x0846] = "\x65\xd2", [0x0847] = "\x65\xd3", [0x0848] = "\x65\xd4",
   [0x0849] = "\x65\xd5", [0x084a] = "\x65\xd6", [0x084b] = "\x65\xd7",
   [0x084c] = "\x65\xd8", [0x084d] = "\x65\xd9", [0x084e] = "\x65\xda",
   [0x084f] = "\x65\xdb", [0x0850] = "\x65\xdc", [0x0851] = "\x65\xdd",
-  [0x0852] = "\x65\xde", [0x0853] = "\x65\xdf", [0x0856] = "\x65\xe0",
-  [0x0857] = "\x65\xe1", [0x0858] = "\x65\xe2", [0x0859] = "\x65\xe3",
-  [0x085a] = "\x65\xe4", [0x085b] = "\x65\xe5", [0x085c] = "\x65\xe6",
-  [0x085d] = "\x65\xe7", [0x085e] = "\x65\xe8", [0x085f] = "\x65\xe9",
-  [0x0860] = "\x65\xea", [0x0861] = "\x65\xeb", [0x0862] = "\x65\xec",
-  [0x0863] = "\x65\xed", [0x0865] = "\xfd\x9c", [0x0866] = "\x76\xb5",
+  [0x0852] = "\x65\xde", [0x0853] = "\x65\xdf", [0x0854] = "\x2d\x57",
+  [0x0855] = "\xfe\x91", [0x0856] = "\x65\xe0", [0x0857] = "\x65\xe1",
+  [0x0858] = "\x65\xe2", [0x0859] = "\x65\xe3", [0x085a] = "\x65\xe4",
+  [0x085b] = "\x65\xe5", [0x085c] = "\x65\xe6", [0x085d] = "\x65\xe7",
+  [0x085e] = "\x65\xe8", [0x085f] = "\x65\xe9", [0x0860] = "\x65\xea",
+  [0x0861] = "\x65\xeb", [0x0862] = "\x65\xec", [0x0863] = "\x65\xed",
+  [0x0864] = "\x2d\x58", [0x0865] = "\xfd\x9c", [0x0866] = "\x76\xb5",
   [0x0867] = "\x76\xb6", [0x0868] = "\x76\xb7", [0x0869] = "\x76\xb8",
   [0x086a] = "\x76\xb9", [0x086b] = "\x76\xba", [0x086c] = "\x76\xbb",
   [0x086d] = "\x76\xbc", [0x086e] = "\x76\xbd", [0x086f] = "\x76\xbe",
@@ -24211,24 +24224,8 @@ static const unsigned char __ucs_to_gb18030_tab2[][2] =
 	  || (ch = __twobyte_to_ucs[idx],  \
 	      ch == 0 && *inptr != '\0'))  \
 	{  \
-	  /* Handle a few special cases. */  \
-	  if (idx == 0x5dd1)  \
-	    ch = 0x20087;  \
-	  else if (idx == 0x5dd2)  \
-	    ch = 0x20089;  \
-	  else if (idx == 0x5dd3)  \
-	    ch = 0x200cc;  \
-	  else if (idx == 0x5dec)  \
-	    ch = 0x215D7;  \
-	  else if (idx == 0x5df6)  \
-	    ch = 0x2298F;  \
-	  else if (idx == 0x5e11)  \
-	    ch = 0x241FE;  \
-	  else  \
-	    {  \
-	      /* This is an illegal character. */  \
-	      STANDARD_FROM_LOOP_ERR_HANDLER (2);  \
-	    }  \
+	  /* This is an illegal character. */  \
+	  STANDARD_FROM_LOOP_ERR_HANDLER (2);  \
 	}  \
   \
 	inptr += 2;  \
@@ -24320,17 +24317,35 @@ static const unsigned char __ucs_to_gb18030_tab2[][2] =
 	      len = 4;  \
 	    }  \
 	  else if (ch == 0x20087)  \
-	    cp = (const unsigned char *) "\xfe\x51";  \
+	    {  \
+	      idx = 0x3E2CF;  \
+	      len = 4;  \
+	    }  \
 	  else if (ch == 0x20089)  \
-	    cp = (const unsigned char *) "\xfe\x52";  \
+	    {  \
+	      idx = 0x3E2D1;  \
+	      len = 4;  \
+	    }  \
 	  else if (ch == 0x200CC)  \
-	    cp = (const unsigned char *) "\xfe\x53";  \
+	    {  \
+	      idx = 0x3E314;  \
+	      len = 4;  \
+	    }  \
 	  else if (ch == 0x215d7)  \
-	    cp = (const unsigned char *) "\xfe\x6c";  \
+	    {  \
+	      idx = 0x3F81F;  \
+	      len = 4;  \
+	    }  \
 	  else if (ch == 0x2298F)  \
-	    cp = (const unsigned char *) "\xfe\x76";  \
+	    {  \
+	      idx = 0x40BD7;  \
+	      len = 4;  \
+	    }  \
 	  else if (ch == 0x241FE)  \
-	    cp = (const unsigned char *) "\xfe\x91";  \
+	    {  \
+	      idx = 0x42446;  \
+	      len = 4;  \
+	    }  \
 	  else if (ch >= 0x10000 && ch <= 0x10FFFF)  \
 	    {  \
 	      idx = ch + 0x1E248;  \
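
The idx values in the hunk above (for instance idx = ch + 0x1E248 for
U+10000..U+10FFFF) appear to be linear indices into the four-byte GB18030
area, whose bytes run over the digit ranges 0x81..0xFE, 0x30..0x39,
0x81..0xFE, 0x30..0x39. A sketch of how such an index would expand into
bytes (my own helper, not part of the patch); index 0x2E248 comes out as
0x90 0x30 0x81 0x30, the encoding of U+10000:

/* Expand a linear four-byte-area index into a GB18030 byte sequence
   (assumption: idx counts sequences starting from 0x81 0x30 0x81 0x30).  */
static void
gb18030_fourbyte (unsigned int idx, unsigned char out[4])
{
  out[3] = 0x30 + idx % 10;  idx /= 10;
  out[2] = 0x81 + idx % 126; idx /= 126;
  out[1] = 0x30 + idx % 10;  idx /= 10;
  out[0] = 0x81 + idx;
}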
diff --git a/iconvdata/tst-table-from.c b/iconvdata/tst-table-from.c
index 09aaaf0942..55a7113d8c 100644
--- a/iconvdata/tst-table-from.c
+++ b/iconvdata/tst-table-from.c
@@ -194,10 +194,9 @@ main (int argc, char *argv[])
       exit (1);
     }

-  /* When testing UTF-8 or GB18030, stop at 0x10000, otherwise the output
+  /* When testing UTF-8, stop at 0x10000, otherwise the output
      file gets too big. */
-  bmp_only = (strcmp (charset, "UTF-8") == 0
-	      || strcmp (charset, "GB18030") == 0);
+  bmp_only = (strcmp (charset, "UTF-8") == 0);
   search_depth = (strcmp (charset, "UTF-8") == 0 ? 3 : 4);

   {
diff --git a/iconvdata/tst-table-to.c b/iconvdata/tst-table-to.c
index 4dec4acad1..2b75f0c6e8 100644
--- a/iconvdata/tst-table-to.c
+++ b/iconvdata/tst-table-to.c
@@ -32,6 +32,7 @@ main (int argc, char *argv[])
   const char *charset;
   iconv_t cd;
   int bmp_only;
+  int no_tags;

   if (argc != 2)
     {
@@ -47,16 +48,19 @@ main (int argc, char *argv[])
       return 1;
     }

-  /* When testing UTF-8 or GB18030, stop at 0x10000, otherwise the output
+  /* When testing UTF-8, stop at 0x10000, otherwise the output
      file gets too big. */
-  bmp_only = (strcmp (charset, "UTF-8") == 0
+  bmp_only = (strcmp (charset, "UTF-8") == 0);
+  /* When testing any encoding other than UTF-8 or GB18030, stop at 0xE0000,
+     because the conversion drops Unicode tag characters (range
+     U+E0000..U+E007F). */
+  no_tags = !(strcmp (charset, "UTF-8") == 0
	      || strcmp (charset, "GB18030") == 0);

   {
     unsigned int i;
     unsigned char buf[10];
-
-    for (i = 0; i < (bmp_only ? 0x10000 : 0x30000); i++)
+    for (i = 0; i < (bmp_only ? 0x10000 : no_tags ? 0xE0000 : 0x110000); i++)
       {
	unsigned char in[6];
	unsigned int incount =
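
The new loop bound encodes three Unicode landmarks: 0x10000 is the end of
the Basic Multilingual Plane, 0xE0000 is the start of the tag-character
block U+E0000..U+E007F that only UTF-8 and GB18030 round-trip here, and
0x110000 is one past the last code point. The same selection written out
long-hand (a sketch; the helper name is mine):

/* Upper bound of the scan loop in tst-table-to.c, spelled out.  */
static unsigned int
scan_limit (int bmp_only, int no_tags)
{
  if (bmp_only)
    return 0x10000;   /* BMP only, to keep the output file small.  */
  if (no_tags)
    return 0xE0000;   /* Stop before the Unicode tag block.  */
  return 0x110000;    /* The whole Unicode code space.  */
}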
diff --git a/iconvdata/tst-table.sh b/iconvdata/tst-table.sh
|
|
||||||
index bc6f542b24..7ba15bbf5c 100755
|
|
||||||
--- a/iconvdata/tst-table.sh
|
|
||||||
+++ b/iconvdata/tst-table.sh
|
|
||||||
@@ -37,7 +37,8 @@ set -e
|
|
||||||
< ../localedata/charmaps/${charmap:-$charset} \
|
|
||||||
> ${objpfx}tst-${charset}.charmap.table
|
|
||||||
# When the charset is GB18030, truncate this table because for this encoding,
|
|
||||||
-# the tst-table-from and tst-table-to programs scan the Unicode BMP only.
|
|
||||||
+# the charmap contains ranges (<Unnnn>..<Ummmm> notation), which the
|
|
||||||
+# tst-table-charmap.sh script does not grok.
|
|
||||||
if test ${charset} = GB18030; then
|
|
||||||
grep '0x....$' < ${objpfx}tst-${charset}.charmap.table \
|
|
||||||
> ${objpfx}tst-${charset}.truncated.table
|
|
||||||
@@ -73,25 +74,42 @@ diff ${objpfx}tst-${charset}.charmap.table ${objpfx}tst-${charset}.inverse.table
|
|
||||||
|
|
||||||
# Check 1: charmap and iconv forward should be identical, except for
|
|
||||||
# precomposed characters.
|
|
||||||
-if test -f ${precomposed}; then
|
|
||||||
- cat ${objpfx}tst-${charset}.table ${precomposed} | sort | uniq -u \
|
|
||||||
- > ${objpfx}tst-${charset}.tmp.table
|
|
||||||
- cmp -s ${objpfx}tst-${charset}.charmap.table ${objpfx}tst-${charset}.tmp.table ||
|
|
||||||
+{ if test -f ${precomposed}; then
|
|
||||||
+ cat ${objpfx}tst-${charset}.table ${precomposed} | sort | uniq -u
|
|
||||||
+ else
|
|
||||||
+ cat ${objpfx}tst-${charset}.table
|
|
||||||
+ fi
|
|
||||||
+} | { if test ${charset} = GB18030; then grep '0x....$'; else cat; fi; } \
|
|
||||||
+ > ${objpfx}tst-${charset}.tmp1.table
|
|
||||||
+cmp -s ${objpfx}tst-${charset}.charmap.table ${objpfx}tst-${charset}.tmp1.table ||
|
|
||||||
exit 1
|
|
||||||
-else
|
|
||||||
- cmp -s ${objpfx}tst-${charset}.charmap.table ${objpfx}tst-${charset}.table ||
|
|
||||||
- exit 1
|
|
||||||
-fi
|
|
||||||
|
|
||||||
# Check 2: the difference between the charmap and iconv backward.
|
|
||||||
-if test -f ${irreversible}; then
|
|
||||||
- cat ${objpfx}tst-${charset}.charmap.table ${irreversible} | sort | uniq -u \
|
|
||||||
- > ${objpfx}tst-${charset}.tmp.table
|
|
||||||
- cmp -s ${objpfx}tst-${charset}.tmp.table ${objpfx}tst-${charset}.inverse.table ||
|
|
||||||
- exit 1
|
|
||||||
-else
|
|
||||||
- cmp -s ${objpfx}tst-${charset}.charmap.table ${objpfx}tst-${charset}.inverse.table ||
|
|
||||||
+{ if test -f ${irreversible}; then
|
|
||||||
+ cat ${objpfx}tst-${charset}.charmap.table ${irreversible} | sort | uniq -u
|
|
||||||
+ else
|
|
||||||
+ cat ${objpfx}tst-${charset}.charmap.table
|
|
||||||
+ fi
|
|
||||||
+} | { if test ${charset} = GB18030; then grep '0x....$'; else cat; fi; } \
|
|
||||||
+ > ${objpfx}tst-${charset}.tmp2c.table
|
|
||||||
+cat ${objpfx}tst-${charset}.inverse.table \
|
|
||||||
+ | { if test ${charset} = GB18030; then grep '0x....$'; else cat; fi; } \
|
|
||||||
+ > ${objpfx}tst-${charset}.tmp2i.table
|
|
||||||
+cmp -s ${objpfx}tst-${charset}.tmp2c.table ${objpfx}tst-${charset}.tmp2i.table ||
|
|
||||||
exit 1
|
|
||||||
+
|
|
||||||
+# Check 3: the difference between iconv forward and iconv backward. This is
|
|
||||||
+# necessary only for GB18030, because ${objpfx}tst-${charset}.charmap.table
|
|
||||||
+# is truncated for this encoding (see above).
|
|
||||||
+if test ${charset} = GB18030; then
|
|
||||||
+ { if test -f ${irreversible}; then
|
|
||||||
+ cat ${objpfx}tst-${charset}.table ${irreversible} | sort | uniq -u
|
|
||||||
+ else
|
|
||||||
+ cat ${objpfx}tst-${charset}.table
|
|
||||||
+ fi
|
|
||||||
+ } > ${objpfx}tst-${charset}.tmp3.table
|
|
||||||
+ cmp -s ${objpfx}tst-${charset}.tmp3.table ${objpfx}tst-${charset}.inverse.table ||
|
|
||||||
+ exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
exit 0
|
|
||||||
diff --git a/localedata/charmaps/GB18030 b/localedata/charmaps/GB18030
|
|
||||||
index ad6728c5bd..fc3b1d2d40 100644
|
|
||||||
--- a/localedata/charmaps/GB18030
|
|
||||||
+++ b/localedata/charmaps/GB18030
|
|
||||||
@@ -57234,32 +57234,16 @@ CHARMAP
|
|
||||||
<UE78A> /xa6/xbe <Private Use>
|
|
||||||
<UE78B> /xa6/xbf <Private Use>
|
|
||||||
<UE78C> /xa6/xc0 <Private Use>
|
|
||||||
-% The newest GB 18030-2005 standard still uses some private use area
|
|
||||||
-% code points. Any implementation which has Unicode 4.1 or newer
|
|
||||||
-% support should not use these PUA code points, and instead should
|
|
||||||
-% map these entries to their equivalent non-PUA code points. There
|
|
||||||
-% are 24 idiograms in GB 18030-2005 which have non-PUA equivalents.
|
|
||||||
-% In glibc we only support roundtrip code points, and so must choose
|
|
||||||
-% between supporting the old PUA code points, or using the newer
|
|
||||||
-% non-PUA code points. We choose to use the non-PUA code points to
|
|
||||||
-% be compatible with ICU's similar choice. In choosing the non-PUA
|
|
||||||
-% code points we can no longer convert the old PUA code points back
|
|
||||||
-% to GB-18030-2005 (technically only fixable if we added support
|
|
||||||
-% for non-roundtrip code points e.g. ICU's "fallback mapping").
|
|
||||||
-% The recommendation to use the non-PUA code points, where available,
|
|
||||||
-% is based on "CJKV Information Processing" 2nd Ed. by Dr. Ken Lunde.
|
|
||||||
-%
|
|
||||||
-% These 10 PUA mappings use equivalents from <UFE10> to <UFE19>.
|
|
||||||
-% <UE78D> /xa6/xd9 <Private Use>
|
|
||||||
-% <UE78E> /xa6/xda <Private Use>
|
|
||||||
-% <UE78F> /xa6/xdb <Private Use>
|
|
||||||
-% <UE790> /xa6/xdc <Private Use>
|
|
||||||
-% <UE791> /xa6/xdd <Private Use>
|
|
||||||
-% <UE792> /xa6/xde <Private Use>
|
|
||||||
-% <UE793> /xa6/xdf <Private Use>
|
|
||||||
-% <UE794> /xa6/xec <Private Use>
|
|
||||||
-% <UE795> /xa6/xed <Private Use>
|
|
||||||
-% <UE796> /xa6/xf3 <Private Use>
|
|
||||||
+<UE78D> /x84/x31/x82/x36 <Private Use>
|
|
||||||
+<UE78E> /x84/x31/x82/x38 <Private Use>
|
|
||||||
+<UE78F> /x84/x31/x82/x37 <Private Use>
|
|
||||||
+<UE790> /x84/x31/x82/x39 <Private Use>
|
|
||||||
+<UE791> /x84/x31/x83/x30 <Private Use>
|
|
||||||
+<UE792> /x84/x31/x83/x31 <Private Use>
|
|
||||||
+<UE793> /x84/x31/x83/x32 <Private Use>
|
|
||||||
+<UE794> /x84/x31/x83/x33 <Private Use>
|
|
||||||
+<UE795> /x84/x31/x83/x34 <Private Use>
|
|
||||||
+<UE796> /x84/x31/x83/x35 <Private Use>
|
|
||||||
<UE797> /xa6/xf6 <Private Use>
|
|
||||||
<UE798> /xa6/xf7 <Private Use>
|
|
||||||
<UE799> /xa6/xf8 <Private Use>
|
|
||||||
@@ -57387,17 +57371,15 @@ CHARMAP
|
|
||||||
<UE813> /xd7/xfd <Private Use>
|
|
||||||
<UE814> /xd7/xfe <Private Use>
|
|
||||||
<UE815> /x83/x36/xc9/x34 <Private Use>
|
|
||||||
-% These 3 PUA mappings use equivalents <U20087>, <U20089> and <U200CC>.
|
|
||||||
-% <UE816> /xfe/x51 <Private Use>
|
|
||||||
-% <UE817> /xfe/x52 <Private Use>
|
|
||||||
-% <UE818> /xfe/x53 <Private Use>
|
|
||||||
+<UE816> /xfe/x51 <Private Use>
|
|
||||||
+<UE817> /xfe/x52 <Private Use>
|
|
||||||
+<UE818> /xfe/x53 <Private Use>
|
|
||||||
<UE819> /x83/x36/xc9/x35 <Private Use>
|
|
||||||
<UE81A> /x83/x36/xc9/x36 <Private Use>
|
|
||||||
<UE81B> /x83/x36/xc9/x37 <Private Use>
|
|
||||||
<UE81C> /x83/x36/xc9/x38 <Private Use>
|
|
||||||
<UE81D> /x83/x36/xc9/x39 <Private Use>
|
|
||||||
-% This 1 PUA mapping uses the equivalent <U9FB4>.
|
|
||||||
-% <UE81E> /xfe/x59 <Private Use>
|
|
||||||
+<UE81E> /x82/x35/x90/x37 <Private Use>
|
|
||||||
<UE81F> /x83/x36/xca/x30 <Private Use>
|
|
||||||
<UE820> /x83/x36/xca/x31 <Private Use>
|
|
||||||
<UE821> /x83/x36/xca/x32 <Private Use>
|
|
||||||
@@ -57405,22 +57387,19 @@ CHARMAP
|
|
||||||
<UE823> /x83/x36/xca/x34 <Private Use>
|
|
||||||
<UE824> /x83/x36/xca/x35 <Private Use>
|
|
||||||
<UE825> /x83/x36/xca/x36 <Private Use>
|
|
||||||
-% This 1 PUA mapping uses the equivalent <U9FB5>.
|
|
||||||
-% <UE826> /xfe/x61 <Private Use>
|
|
||||||
+<UE826> /x82/x35/x90/x38 <Private Use>
|
|
||||||
<UE827> /x83/x36/xca/x37 <Private Use>
|
|
||||||
<UE828> /x83/x36/xca/x38 <Private Use>
|
|
||||||
<UE829> /x83/x36/xca/x39 <Private Use>
|
|
||||||
<UE82A> /x83/x36/xcb/x30 <Private Use>
|
|
||||||
-% These 2 PUA mappings use the equivalents <U9FB6> and <U9FB7>.
|
|
||||||
-% <UE82B> /xfe/x66 <Private Use>
|
|
||||||
-% <UE82C> /xfe/x67 <Private Use>
|
|
||||||
+<UE82B> /x82/x35/x90/x39 <Private Use>
|
|
||||||
+<UE82C> /x82/x35/x91/x30 <Private Use>
|
|
||||||
<UE82D> /x83/x36/xcb/x31 <Private Use>
|
|
||||||
<UE82E> /x83/x36/xcb/x32 <Private Use>
|
|
||||||
<UE82F> /x83/x36/xcb/x33 <Private Use>
|
|
||||||
<UE830> /x83/x36/xcb/x34 <Private Use>
|
|
||||||
-% These 2 PUA mappings use the equivalents <U215D7> and <U9FB8>.
|
|
||||||
-% <UE831> /xfe/x6c <Private Use>
|
|
||||||
-% <UE832> /xfe/x6d <Private Use>
|
|
||||||
+<UE831> /xfe/x6c <Private Use>
|
|
||||||
+<UE832> /x82/x35/x91/x31 <Private Use>
|
|
||||||
<UE833> /x83/x36/xcb/x35 <Private Use>
|
|
||||||
<UE834> /x83/x36/xcb/x36 <Private Use>
|
|
||||||
<UE835> /x83/x36/xcb/x37 <Private Use>
|
|
||||||
@@ -57429,8 +57408,7 @@ CHARMAP
|
|
||||||
<UE838> /x83/x36/xcc/x30 <Private Use>
|
|
||||||
<UE839> /x83/x36/xcc/x31 <Private Use>
<UE83A> /x83/x36/xcc/x32 <Private Use>
-% This 1 PUA mapping uses the equivalent <U2298F>.
-% <UE83B> /xfe/x76 <Private Use>
+<UE83B> /xfe/x76 <Private Use>
<UE83C> /x83/x36/xcc/x33 <Private Use>
<UE83D> /x83/x36/xcc/x34 <Private Use>
<UE83E> /x83/x36/xcc/x35 <Private Use>
@@ -57438,8 +57416,7 @@ CHARMAP
<UE840> /x83/x36/xcc/x37 <Private Use>
<UE841> /x83/x36/xcc/x38 <Private Use>
<UE842> /x83/x36/xcc/x39 <Private Use>
-% This 1 PUA mapping uses the equivalent <U9FB9>.
-% <UE843> /xfe/x7e <Private Use>
+<UE843> /x82/x35/x91/x32 <Private Use>
<UE844> /x83/x36/xcd/x30 <Private Use>
<UE845> /x83/x36/xcd/x31 <Private Use>
<UE846> /x83/x36/xcd/x32 <Private Use>
@@ -57456,9 +57433,8 @@ CHARMAP
<UE851> /x83/x36/xce/x33 <Private Use>
<UE852> /x83/x36/xce/x34 <Private Use>
<UE853> /x83/x36/xce/x35 <Private Use>
-% These 2 PUA mappings use the equivalents <U9FBA> and <U241FE>.
-% <UE854> /xfe/x90 <Private Use>
-% <UE855> /xfe/x91 <Private Use>
+<UE854> /x82/x35/x91/x33 <Private Use>
+<UE855> /xfe/x91 <Private Use>
<UE856> /x83/x36/xce/x36 <Private Use>
<UE857> /x83/x36/xce/x37 <Private Use>
<UE858> /x83/x36/xce/x38 <Private Use>
@@ -57473,8 +57449,7 @@ CHARMAP
<UE861> /x83/x36/xcf/x37 <Private Use>
<UE862> /x83/x36/xcf/x38 <Private Use>
<UE863> /x83/x36/xcf/x39 <Private Use>
-% This 1 PUA mapping uses the equivalent <U9FBB>.
-% <UE864> /xfe/xa0 <Private Use>
+<UE864> /x82/x35/x91/x34 <Private Use>
<UE865> /x83/x36/xd0/x30 <Private Use>
<UE866> /x83/x36/xd0/x31 <Private Use>
<UE867> /x83/x36/xd0/x32 <Private Use>
@@ -70447,19 +70422,14 @@ CHARMAP
<U00020068>..<U00020071> /x95/x32/x8d/x30 <CJK>
<U00020072>..<U0002007B> /x95/x32/x8e/x30 <CJK>
<U0002007C>..<U00020085> /x95/x32/x8f/x30 <CJK>
-<U00020086> /x95/x32/x90/x30 <CJK>
-<U00020087> /xfe/x51 <CJK>
-<U00020088> /x95/x32/x90/x32 <CJK>
-<U00020089> /xfe/x52 <CJK>
-<U0002008A>..<U0002008F> /x95/x32/x90/x34 <CJK>
+<U00020086>..<U0002008F> /x95/x32/x90/x30 <CJK>
<U00020090>..<U00020099> /x95/x32/x91/x30 <CJK>
<U0002009A>..<U000200A3> /x95/x32/x92/x30 <CJK>
<U000200A4>..<U000200AD> /x95/x32/x93/x30 <CJK>
<U000200AE>..<U000200B7> /x95/x32/x94/x30 <CJK>
<U000200B8>..<U000200C1> /x95/x32/x95/x30 <CJK>
<U000200C2>..<U000200CB> /x95/x32/x96/x30 <CJK>
-<U000200CC> /xfe/x53 <CJK>
-<U000200CD>..<U000200D5> /x95/x32/x97/x31 <CJK>
+<U000200CC>..<U000200D5> /x95/x32/x97/x30 <CJK>
<U000200D6>..<U000200DF> /x95/x32/x98/x30 <CJK>
<U000200E0>..<U000200E9> /x95/x32/x99/x30 <CJK>
<U000200EA>..<U000200F3> /x95/x32/x9a/x30 <CJK>
@@ -70998,8 +70968,7 @@ CHARMAP
<U000215BC>..<U000215C5> /x95/x36/xb7/x30 <CJK>
<U000215C6>..<U000215CF> /x95/x36/xb8/x30 <CJK>
<U000215D0>..<U000215D6> /x95/x36/xb9/x30 <CJK>
-<U000215D7> /xfe/x6c <CJK>
-<U000215D8>..<U000215D9> /x95/x36/xb9/x38 <CJK>
+<U000215D7>..<U000215D9> /x95/x36/xb9/x37 <CJK>
<U000215DA>..<U000215E3> /x95/x36/xba/x30 <CJK>
<U000215E4>..<U000215ED> /x95/x36/xbb/x30 <CJK>
<U000215EE>..<U000215F7> /x95/x36/xbc/x30 <CJK>
@@ -71505,8 +71474,7 @@ CHARMAP
<U00022976>..<U0002297F> /x96/x30/xb8/x30 <CJK>
<U00022980>..<U00022989> /x96/x30/xb9/x30 <CJK>
<U0002298A>..<U0002298E> /x96/x30/xba/x30 <CJK>
-<U0002298F> /xfe/x76 <CJK>
-<U00022990>..<U00022993> /x96/x30/xba/x36 <CJK>
+<U0002298F>..<U00022993> /x96/x30/xba/x35 <CJK>
<U00022994>..<U0002299D> /x96/x30/xbb/x30 <CJK>
<U0002299E>..<U000229A7> /x96/x30/xbc/x30 <CJK>
<U000229A8>..<U000229B1> /x96/x30/xbd/x30 <CJK>
@@ -72132,8 +72100,7 @@ CHARMAP
<U000241E0>..<U000241E9> /x96/x35/xb3/x30 <CJK>
<U000241EA>..<U000241F3> /x96/x35/xb4/x30 <CJK>
<U000241F4>..<U000241FD> /x96/x35/xb5/x30 <CJK>
-<U000241FE> /xfe/x91 <CJK>
-<U000241FF>..<U00024207> /x96/x35/xb6/x31 <CJK>
+<U000241FE>..<U00024207> /x96/x35/xb6/x30 <CJK>
<U00024208>..<U00024211> /x96/x35/xb7/x30 <CJK>
<U00024212>..<U0002421B> /x96/x35/xb8/x30 <CJK>
<U0002421C>..<U00024225> /x96/x35/xb9/x30 <CJK>
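These charmap hunks retire the remaining GB18030 two-byte PUA aliases in favor of the standard mappings; per the hunks above, <U0002298F> now encodes as the four-byte sequence /x96/x30/xba/x35 rather than /xfe/x76, which instead maps to <UE83B>. The effect can be observed from application code with iconv(3); this standalone sketch is illustrative only, and its output depends on which glibc (and which GB18030 table) is installed:

    #include <iconv.h>
    #include <stdio.h>

    int
    main (void)
    {
      /* U+2298F in UTF-8.  With the updated charmap this should yield
         96 30 ba 35 (four-byte GB18030); the older table produced fe 76.  */
      char in[] = "\xf0\xa2\xa6\x8f";
      char out[8];
      char *inp = in, *outp = out;
      size_t inleft = sizeof (in) - 1, outleft = sizeof (out);

      iconv_t cd = iconv_open ("GB18030", "UTF-8");
      if (cd == (iconv_t) -1)
        {
          perror ("iconv_open");
          return 1;
        }
      if (iconv (cd, &inp, &inleft, &outp, &outleft) == (size_t) -1)
        {
          perror ("iconv");
          return 1;
        }
      for (char *p = out; p < outp; ++p)
        printf ("%02x ", (unsigned char) *p);
      putchar ('\n');
      iconv_close (cd);
      return 0;
    }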
@@ -1,114 +0,0 @@
commit c4e4b2e149705559d28b16a9b47ba2f6142d6a6c
Author: Andreas Schwab <schwab@suse.de>
Date: Tue Jun 23 12:55:49 2020 +0200

Correct locking and cancellation cleanup in syslog functions (bug 26100)

Properly serialize the access to the global state shared between the
syslog functions, to avoid races in multithreaded processes. Protect a
local allocation in the __vsyslog_internal function from leaking during
cancellation.

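The pattern the patch introduces (push a cleanup handler first, then take the lock, and let the one handler release both the buffer and the lock) can be sketched with the public pthread API; the names below are illustrative stand-ins, not the glibc-internal ones:

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* One lock for the shared state, as syslog_lock guards LogMask and
       the connection state in syslog.c.  */
    static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;

    struct cleanup_arg
    {
      char *buf;   /* malloc'd buffer to release on cancellation, or NULL */
    };

    /* Runs if the thread is cancelled inside the critical section:
       release the allocation first, then the lock.  */
    static void
    cancel_handler (void *ptr)
    {
      struct cleanup_arg *clarg = ptr;
      if (clarg != NULL)
        free (clarg->buf);
      pthread_mutex_unlock (&state_lock);
    }

    static void
    log_line (const char *msg)
    {
      struct cleanup_arg clarg = { NULL };

      /* Push the handler before taking the lock; pthread_mutex_lock is
         not a cancellation point, but the I/O inside the section is.  */
      pthread_cleanup_push (cancel_handler, &clarg);
      pthread_mutex_lock (&state_lock);

      clarg.buf = strdup (msg);   /* stands in for the memstream buffer */
      if (clarg.buf != NULL)
        fprintf (stderr, "%s\n", clarg.buf);   /* cancellation point */

      free (clarg.buf);
      clarg.buf = NULL;
      pthread_mutex_unlock (&state_lock);
      pthread_cleanup_pop (0);
    }

As in the patched __vsyslog_internal, the handler owns the buffer only while clarg.buf is non-NULL, so the normal path resets it before unlocking.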
diff --git a/misc/syslog.c b/misc/syslog.c
index fd6537edf6..2cc63ef287 100644
--- a/misc/syslog.c
+++ b/misc/syslog.c
@@ -91,14 +91,20 @@ struct cleanup_arg
static void
cancel_handler (void *ptr)
{
-#ifndef NO_SIGPIPE
/* Restore the old signal handler. */
struct cleanup_arg *clarg = (struct cleanup_arg *) ptr;

- if (clarg != NULL && clarg->oldaction != NULL)
- __sigaction (SIGPIPE, clarg->oldaction, NULL);
+ if (clarg != NULL)
+ {
+#ifndef NO_SIGPIPE
+ if (clarg->oldaction != NULL)
+ __sigaction (SIGPIPE, clarg->oldaction, NULL);
#endif

+ /* Free the memstream buffer, */
+ free (clarg->buf);
+ }
+
/* Free the lock. */
__libc_lock_unlock (syslog_lock);
}
@@ -169,9 +175,17 @@ __vsyslog_internal(int pri, const char *fmt, va_list ap,
pri &= LOG_PRIMASK|LOG_FACMASK;
}

+ /* Prepare for multiple users. We have to take care: most
+ syscalls we are using are cancellation points. */
+ struct cleanup_arg clarg;
+ clarg.buf = NULL;
+ clarg.oldaction = NULL;
+ __libc_cleanup_push (cancel_handler, &clarg);
+ __libc_lock_lock (syslog_lock);
+
/* Check priority against setlogmask values. */
if ((LOG_MASK (LOG_PRI (pri)) & LogMask) == 0)
- return;
+ goto out;

/* Set default facility if none specified. */
if ((pri & LOG_FACMASK) == 0)
@@ -235,6 +249,9 @@ __vsyslog_internal(int pri, const char *fmt, va_list ap,
/* Close the memory stream; this will finalize the data
into a malloc'd buffer in BUF. */
fclose (f);
+
+ /* Tell the cancellation handler to free this buffer. */
+ clarg.buf = buf;
}

/* Output to stderr if requested. */
@@ -252,22 +269,10 @@ __vsyslog_internal(int pri, const char *fmt, va_list ap,
v->iov_len = 1;
}

- __libc_cleanup_push (free, buf == failbuf ? NULL : buf);
-
/* writev is a cancellation point. */
(void)__writev(STDERR_FILENO, iov, v - iov + 1);
-
- __libc_cleanup_pop (0);
}

- /* Prepare for multiple users. We have to take care: open and
- write are cancellation points. */
- struct cleanup_arg clarg;
- clarg.buf = buf;
- clarg.oldaction = NULL;
- __libc_cleanup_push (cancel_handler, &clarg);
- __libc_lock_lock (syslog_lock);
-
#ifndef NO_SIGPIPE
/* Prepare for a broken connection. */
memset (&action, 0, sizeof (action));
@@ -320,6 +325,7 @@ __vsyslog_internal(int pri, const char *fmt, va_list ap,
__sigaction (SIGPIPE, &oldaction, (struct sigaction *) NULL);
#endif

+ out:
/* End of critical section. */
__libc_cleanup_pop (0);
__libc_lock_unlock (syslog_lock);
@@ -430,8 +436,14 @@ setlogmask (int pmask)
{
int omask;

+ /* Protect against multiple users. */
+ __libc_lock_lock (syslog_lock);
+
omask = LogMask;
if (pmask != 0)
LogMask = pmask;
+
+ __libc_lock_unlock (syslog_lock);
+
return (omask);
}
@@ -1,463 +0,0 @@
commit 1db84775f831a1494993ce9c118deaf9537cc50a
Author: Frank Barrus <frankbarrus_sw@shaggy.cc>
Date: Wed Dec 4 07:55:02 2024 -0500

pthreads NPTL: lost wakeup fix 2

This fixes the lost wakeup (from a bug in signal stealing) with a change
in the usage of g_signals[] in the condition variable internal state.
It also completely eliminates the concept and handling of signal stealing,
as well as the need for signalers to block to wait for waiters to wake
up every time there is a G1/G2 switch. This greatly reduces the average
and maximum latency for pthread_cond_signal.

The g_signals[] field now contains a signal count that is relative to
the current g1_start value. Since it is a 32-bit field, and the LSB is
still reserved (though not currently used anymore), it has a 31-bit value
that corresponds to the low 31 bits of the sequence number in g1_start.
(since g1_start also has an LSB flag, this means bits 31:1 in g_signals
correspond to bits 31:1 in g1_start, plus the current signal count)

By making the signal count relative to g1_start, there is no longer
any ambiguity or A/B/A issue, and thus any checks before blocking,
including the futex call itself, are guaranteed not to block if the G1/G2
switch occurs, even if the signal count remains the same. This allows
initially safely blocking in G2 until the switch to G1 occurs, and
then transitioning from G1 to a new G1 or G2, and always being able to
distinguish the state change. This removes the race condition and A/B/A
problems that otherwise occurred if a late (pre-empted) waiter were to
resume just as the futex call attempted to block on g_signal since
otherwise there was no last opportunity to re-check things like whether
the current G1 group was already closed.

By fixing these issues, the signal stealing code can be eliminated,
since there is no concept of signal stealing anymore. The code to block
for all waiters to exit g_refs can also be removed, since any waiters
that are still in the g_refs region can be guaranteed to safely wake
up and exit. If there are still any left at this time, they are all
sent one final futex wakeup to ensure that they are not blocked any
longer, but there is no need for the signaller to block and wait for
them to wake up and exit the g_refs region.

The signal count is then effectively "zeroed" but since it is now
relative to g1_start, this is done by advancing it to a new value that
can be observed by any pending blocking waiters. Any late waiters can
always tell the difference, and can thus just cleanly exit if they are
in a stale G1 or G2. They can never steal a signal from the current
G1 if they are not in the current G1, since the signal value that has
to match in the cmpxchg has the low 31 bits of the g1_start value
contained in it, and that's first checked, and then it won't match if
there's a G1/G2 change.

Note: the 31-bit sequence number used in g_signals is designed to
handle wrap-around when checking the signal count, but if the entire
31-bit wraparound (2 billion signals) occurs while there is still a
late waiter that has not yet resumed, and it happens to then match
the current g1_start low bits, and the pre-emption occurs after the
normal "closed group" checks (which are 64-bit) but then hits the
futex syscall and signal consuming code, then an A/B/A issue could
still result and cause an incorrect assumption about whether it
should block. This particular scenario seems unlikely in practice.
Note that once awake from the futex, the waiter would notice the
closed group before consuming the signal (since that's still a 64-bit
check that would not be aliased in the wrap-around in g_signals),
so the biggest impact would be blocking on the futex until the next
full wakeup from a G1/G2 switch.

Signed-off-by: Frank Barrus <frankbarrus_sw@shaggy.cc>
Reviewed-by: Carlos O'Donell <carlos@redhat.com>

# Conflicts:
# nptl/pthread_cond_common.c (timed wait refactor)
# nptl/pthread_cond_wait.c (textual conflicts)

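Concretely, the new invariant can be modeled in a few lines of C. This is an illustration of the arithmetic only, not the glibc source; the waiter-side code additionally substitutes the signals value itself for lowseq while its slot is still acting as G2, so the difference reads as zero there:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Low 31 bits of __g1_start aligned with the futex word: the LSB of
       __g1_start is the G2 index flag and the LSB of __g_signals stays
       reserved (zero), so both are masked off.  */
    static inline uint32_t
    lowseq_of (uint64_t g1_start)
    {
      return (uint32_t) g1_start & ~1U;
    }

    /* One signal is a step of 2 (reserved LSB).  The signed difference
       keeps the comparison meaningful across 32-bit wrap-around.  */
    static inline bool
    signal_available (uint32_t g_signals, uint64_t g1_start)
    {
      return (int32_t) (g_signals - lowseq_of (g1_start)) >= 2;
    }

    int
    main (void)
    {
      uint64_t g1_start = 0xfffffffe;            /* near the 32-bit boundary */
      uint32_t signals = lowseq_of (g1_start);   /* effective count 0 */
      printf ("%d\n", signal_available (signals, g1_start));      /* 0 */
      printf ("%d\n", signal_available (signals + 4, g1_start));  /* 1: wraps */
      return 0;
    }

Because the futex word embeds the group's base sequence, a waiter that blocks with a stale value is woken (or never blocks) as soon as the base advances, which is exactly how the lost wakeup is avoided.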
diff --git a/nptl/pthread_cond_common.c b/nptl/pthread_cond_common.c
index 479e54febb417675..9175e6779ebff244 100644
--- a/nptl/pthread_cond_common.c
+++ b/nptl/pthread_cond_common.c
@@ -341,7 +341,6 @@ static bool __attribute__ ((unused))
__condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
unsigned int *g1index, int private)
{
- const unsigned int maxspin = 0;
unsigned int g1 = *g1index;

/* If there is no waiter in G2, we don't do anything. The expression may
@@ -362,84 +361,46 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
* New waiters arriving concurrently with the group switching will all go
into G2 until we atomically make the switch. Waiters existing in G2
are not affected.
- * Waiters in G1 will be closed out immediately by setting a flag in
- __g_signals, which will prevent waiters from blocking using a futex on
- __g_signals and also notifies them that the group is closed. As a
- result, they will eventually remove their group reference, allowing us
- to close switch group roles. */
-
- /* First, set the closed flag on __g_signals. This tells waiters that are
- about to wait that they shouldn't do that anymore. This basically
- serves as an advance notificaton of the upcoming change to __g1_start;
- waiters interpret it as if __g1_start was larger than their waiter
- sequence position. This allows us to change __g1_start after waiting
- for all existing waiters with group references to leave, which in turn
- makes recovery after stealing a signal simpler because it then can be
- skipped if __g1_start indicates that the group is closed (otherwise,
- we would have to recover always because waiters don't know how big their
- groups are). Relaxed MO is fine. */
- atomic_fetch_or_relaxed (cond->__data.__g_signals + g1, 1);
-
- /* Wait until there are no group references anymore. The fetch-or operation
- injects us into the modification order of __g_refs; release MO ensures
- that waiters incrementing __g_refs after our fetch-or see the previous
- changes to __g_signals and to __g1_start that had to happen before we can
- switch this G1 and alias with an older group (we have two groups, so
- aliasing requires switching group roles twice). Note that nobody else
- can have set the wake-request flag, so we do not have to act upon it.
-
- Also note that it is harmless if older waiters or waiters from this G1
- get a group reference after we have quiesced the group because it will
- remain closed for them either because of the closed flag in __g_signals
- or the later update to __g1_start. New waiters will never arrive here
- but instead continue to go into the still current G2. */
- unsigned r = atomic_fetch_or_release (cond->__data.__g_refs + g1, 0);
- while ((r >> 1) > 0)
- {
- for (unsigned int spin = maxspin; ((r >> 1) > 0) && (spin > 0); spin--)
- {
- /* TODO Back off. */
- r = atomic_load_relaxed (cond->__data.__g_refs + g1);
- }
- if ((r >> 1) > 0)
- {
- /* There is still a waiter after spinning. Set the wake-request
- flag and block. Relaxed MO is fine because this is just about
- this futex word.
-
- Update r to include the set wake-request flag so that the upcoming
- futex_wait only blocks if the flag is still set (otherwise, we'd
- violate the basic client-side futex protocol). */
- r = atomic_fetch_or_relaxed (cond->__data.__g_refs + g1, 1) | 1;
-
- if ((r >> 1) > 0)
- futex_wait_simple (cond->__data.__g_refs + g1, r, private);
- /* Reload here so we eventually see the most recent value even if we
- do not spin. */
- r = atomic_load_relaxed (cond->__data.__g_refs + g1);
- }
- }
- /* Acquire MO so that we synchronize with the release operation that waiters
- use to decrement __g_refs and thus happen after the waiters we waited
- for. */
- atomic_thread_fence_acquire ();
+ * Waiters in G1 will be closed out immediately by the advancing of
+ __g_signals to the next "lowseq" (low 31 bits of the new g1_start),
+ which will prevent waiters from blocking using a futex on
+ __g_signals since it provides enough signals for all possible
+ remaining waiters. As a result, they can each consume a signal
+ and they will eventually remove their group reference. */

/* Update __g1_start, which finishes closing this group. The value we add
will never be negative because old_orig_size can only be zero when we
switch groups the first time after a condvar was initialized, in which
- case G1 will be at index 1 and we will add a value of 1. See above for
- why this takes place after waiting for quiescence of the group.
+ case G1 will be at index 1 and we will add a value of 1.
Relaxed MO is fine because the change comes with no additional
constraints that others would have to observe. */
__condvar_add_g1_start_relaxed (cond,
(old_orig_size << 1) + (g1 == 1 ? 1 : - 1));

- /* Now reopen the group, thus enabling waiters to again block using the
- futex controlled by __g_signals. Release MO so that observers that see
- no signals (and thus can block) also see the write __g1_start and thus
- that this is now a new group (see __pthread_cond_wait_common for the
- matching acquire MO loads). */
- atomic_store_release (cond->__data.__g_signals + g1, 0);
+ unsigned int lowseq = ((old_g1_start + old_orig_size) << 1) & ~1U;
+
+ /* If any waiters still hold group references (and thus could be blocked),
+ then wake them all up now and prevent any running ones from blocking.
+ This is effectively a catch-all for any possible current or future
+ bugs that can allow the group size to reach 0 before all G1 waiters
+ have been awakened or at least given signals to consume, or any
+ other case that can leave blocked (or about to block) older waiters.. */
+ if ((atomic_fetch_or_release (cond->__data.__g_refs + g1, 0) >> 1) > 0)
+ {
+ /* First advance signals to the end of the group (i.e. enough signals
+ for the entire G1 group) to ensure that waiters which have not
+ yet blocked in the futex will not block.
+ Note that in the vast majority of cases, this should never
+ actually be necessary, since __g_signals will have enough
+ signals for the remaining g_refs waiters. As an optimization,
+ we could check this first before proceeding, although that
+ could still leave the potential for futex lost wakeup bugs
+ if the signal count was non-zero but the futex wakeup
+ was somehow lost. */
+ atomic_store_release (cond->__data.__g_signals + g1, lowseq);
+
+ futex_wake (cond->__data.__g_signals + g1, INT_MAX, private);
+ }

/* At this point, the old G1 is now a valid new G2 (but not in use yet).
No old waiter can neither grab a signal nor acquire a reference without
@@ -451,6 +412,10 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
g1 ^= 1;
*g1index ^= 1;

+ /* Now advance the new G1 g_signals to the new lowseq, giving it
+ an effective signal count of 0 to start. */
+ atomic_store_release (cond->__data.__g_signals + g1, lowseq);
+
/* These values are just observed by signalers, and thus protected by the
lock. */
unsigned int orig_size = wseq - (old_g1_start + old_orig_size);
diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c
index ebf07ca82d87de7d..4fb22b28a7a20ecd 100644
--- a/nptl/pthread_cond_wait.c
+++ b/nptl/pthread_cond_wait.c
@@ -239,9 +239,7 @@ __condvar_cleanup_waiting (void *arg)
signaled), and a reference count.

The group reference count is used to maintain the number of waiters that
- are using the group's futex. Before a group can change its role, the
- reference count must show that no waiters are using the futex anymore; this
- prevents ABA issues on the futex word.
+ are using the group's futex.

To represent which intervals in the waiter sequence the groups cover (and
thus also which group slot contains G1 or G2), we use a 64b counter to
@@ -301,11 +299,12 @@ __condvar_cleanup_waiting (void *arg)
last reference.
* Reference count used by waiters concurrently with signalers that have
acquired the condvar-internal lock.
- __g_signals: The number of signals that can still be consumed.
+ __g_signals: The number of signals that can still be consumed, relative to
+ the current g1_start. (i.e. bits 31 to 1 of __g_signals are bits
+ 31 to 1 of g1_start with the signal count added)
* Used as a futex word by waiters. Used concurrently by waiters and
signalers.
- * LSB is true iff this group has been completely signaled (i.e., it is
- closed).
+ * LSB is currently reserved and 0.
__g_size: Waiters remaining in this group (i.e., which have not been
signaled yet.
* Accessed by signalers and waiters that cancel waiting (both do so only
@@ -329,18 +328,6 @@ __condvar_cleanup_waiting (void *arg)
sufficient because if a waiter can see a sufficiently large value, it could
have also consume a signal in the waiters group.

- Waiters try to grab a signal from __g_signals without holding a reference
- count, which can lead to stealing a signal from a more recent group after
- their own group was already closed. They cannot always detect whether they
- in fact did because they do not know when they stole, but they can
- conservatively add a signal back to the group they stole from; if they
- did so unnecessarily, all that happens is a spurious wake-up. To make this
- even less likely, __g1_start contains the index of the current g2 too,
- which allows waiters to check if there aliasing on the group slots; if
- there wasn't, they didn't steal from the current G1, which means that the
- G1 they stole from must have been already closed and they do not need to
- fix anything.
-
It is essential that the last field in pthread_cond_t is __g_signals[1]:
The previous condvar used a pointer-sized field in pthread_cond_t, so a
PTHREAD_COND_INITIALIZER from that condvar implementation might only
@@ -431,6 +418,9 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
{
while (1)
{
+ uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
+ unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
+
/* Spin-wait first.
Note that spinning first without checking whether a timeout
passed might lead to what looks like a spurious wake-up even
@@ -442,35 +432,45 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
having to compare against the current time seems to be the right
choice from a performance perspective for most use cases. */
unsigned int spin = maxspin;
- while (signals == 0 && spin > 0)
+ while (spin > 0 && ((int)(signals - lowseq) < 2))
{
/* Check that we are not spinning on a group that's already
closed. */
- if (seq < (__condvar_load_g1_start_relaxed (cond) >> 1))
- goto done;
+ if (seq < (g1_start >> 1))
+ break;

/* TODO Back off. */

/* Reload signals. See above for MO. */
signals = atomic_load_acquire (cond->__data.__g_signals + g);
+ g1_start = __condvar_load_g1_start_relaxed (cond);
+ lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
spin--;
}

- /* If our group will be closed as indicated by the flag on signals,
- don't bother grabbing a signal. */
- if (signals & 1)
- goto done;
-
- /* If there is an available signal, don't block. */
- if (signals != 0)
+ if (seq < (g1_start >> 1))
+ {
+ /* If the group is closed already,
+ then this waiter originally had enough extra signals to
+ consume, up until the time its group was closed. */
+ goto done;
+ }
+
+ /* If there is an available signal, don't block.
+ If __g1_start has advanced at all, then we must be in G1
+ by now, perhaps in the process of switching back to an older
+ G2, but in either case we're allowed to consume the available
+ signal and should not block anymore. */
+ if ((int)(signals - lowseq) >= 2)
break;

/* No signals available after spinning, so prepare to block.
We first acquire a group reference and use acquire MO for that so
that we synchronize with the dummy read-modify-write in
__condvar_quiesce_and_switch_g1 if we read from that. In turn,
- in this case this will make us see the closed flag on __g_signals
- that designates a concurrent attempt to reuse the group's slot.
+ in this case this will make us see the advancement of __g_signals
+ to the upcoming new g1_start that occurs with a concurrent
+ attempt to reuse the group's slot.
We use acquire MO for the __g_signals check to make the
__g1_start check work (see spinning above).
Note that the group reference acquisition will not mask the
@@ -478,15 +478,24 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
an atomic read-modify-write operation and thus extend the release
sequence. */
atomic_fetch_add_acquire (cond->__data.__g_refs + g, 2);
- if (((atomic_load_acquire (cond->__data.__g_signals + g) & 1) != 0)
- || (seq < (__condvar_load_g1_start_relaxed (cond) >> 1)))
+ signals = atomic_load_acquire (cond->__data.__g_signals + g);
+ g1_start = __condvar_load_g1_start_relaxed (cond);
+ lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
+
+ if (seq < (g1_start >> 1))
{
- /* Our group is closed. Wake up any signalers that might be
- waiting. */
+ /* group is closed already, so don't block */
__condvar_dec_grefs (cond, g, private);
goto done;
}

+ if ((int)(signals - lowseq) >= 2)
+ {
+ /* a signal showed up or G1/G2 switched after we grabbed the refcount */
+ __condvar_dec_grefs (cond, g, private);
+ break;
+ }
+
// Now block.
struct _pthread_cleanup_buffer buffer;
struct _condvar_cleanup_buffer cbuffer;
@@ -500,7 +509,7 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
{
/* Block without a timeout. */
err = futex_wait_cancelable (
- cond->__data.__g_signals + g, 0, private);
+ cond->__data.__g_signals + g, signals, private);
}
else
{
@@ -531,13 +540,13 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
err = ETIMEDOUT;
else
err = futex_reltimed_wait_cancelable
- (cond->__data.__g_signals + g, 0, &rt, private);
+ (cond->__data.__g_signals + g, signals, &rt, private);
}
else
{
/* Use CLOCK_REALTIME. */
err = futex_abstimed_wait_cancelable
- (cond->__data.__g_signals + g, 0, abstime, private);
+ (cond->__data.__g_signals + g, signals, abstime, private);
}
}

@@ -562,6 +571,8 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
signals = atomic_load_acquire (cond->__data.__g_signals + g);
}

+ if (seq < (__condvar_load_g1_start_relaxed (cond) >> 1))
+ goto done;
}
/* Try to grab a signal. Use acquire MO so that we see an up-to-date value
of __g1_start below (see spinning above for a similar case). In
@@ -570,69 +581,6 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
while (!atomic_compare_exchange_weak_acquire (cond->__data.__g_signals + g,
&signals, signals - 2));

- /* We consumed a signal but we could have consumed from a more recent group
- that aliased with ours due to being in the same group slot. If this
- might be the case our group must be closed as visible through
- __g1_start. */
- uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
- if (seq < (g1_start >> 1))
- {
- /* We potentially stole a signal from a more recent group but we do not
- know which group we really consumed from.
- We do not care about groups older than current G1 because they are
- closed; we could have stolen from these, but then we just add a
- spurious wake-up for the current groups.
- We will never steal a signal from current G2 that was really intended
- for G2 because G2 never receives signals (until it becomes G1). We
- could have stolen a signal from G2 that was conservatively added by a
- previous waiter that also thought it stole a signal -- but given that
- that signal was added unnecessarily, it's not a problem if we steal
- it.
- Thus, the remaining case is that we could have stolen from the current
- G1, where "current" means the __g1_start value we observed. However,
- if the current G1 does not have the same slot index as we do, we did
- not steal from it and do not need to undo that. This is the reason
- for putting a bit with G2's index into__g1_start as well. */
- if (((g1_start & 1) ^ 1) == g)
- {
- /* We have to conservatively undo our potential mistake of stealing
- a signal. We can stop trying to do that when the current G1
- changes because other spinning waiters will notice this too and
- __condvar_quiesce_and_switch_g1 has checked that there are no
- futex waiters anymore before switching G1.
- Relaxed MO is fine for the __g1_start load because we need to
- merely be able to observe this fact and not have to observe
- something else as well.
- ??? Would it help to spin for a little while to see whether the
- current G1 gets closed? This might be worthwhile if the group is
- small or close to being closed. */
- unsigned int s = atomic_load_relaxed (cond->__data.__g_signals + g);
- while (__condvar_load_g1_start_relaxed (cond) == g1_start)
- {
- /* Try to add a signal. We don't need to acquire the lock
- because at worst we can cause a spurious wake-up. If the
- group is in the process of being closed (LSB is true), this
- has an effect similar to us adding a signal. */
- if (((s & 1) != 0)
- || atomic_compare_exchange_weak_relaxed
- (cond->__data.__g_signals + g, &s, s + 2))
- {
- /* If we added a signal, we also need to add a wake-up on
- the futex. We also need to do that if we skipped adding
- a signal because the group is being closed because
- while __condvar_quiesce_and_switch_g1 could have closed
- the group, it might stil be waiting for futex waiters to
- leave (and one of those waiters might be the one we stole
- the signal from, which cause it to block using the
- futex). */
- futex_wake (cond->__data.__g_signals + g, 1, private);
- break;
- }
- /* TODO Back off. */
- }
- }
- }
-
done:

/* Confirm that we have been woken. We do that before acquiring the mutex
@@ -1,39 +0,0 @@
Partial revert of commit c36fc50781995e6758cae2b6927839d0157f213c
to restore the layout of pthread_cond_t and avoid a spurious
downstream rpminspect and abidiff (libabigail tooling) warning
about internal ABI changes. Without this change all RHEL developers
using pthread_cond_t would have to audit and waive the warning.
The alternative is to update the suppression lists used in abidiff,
propagate that to the rpminspect service, and wait for that to
complete before doing the update. The more conservative position
is the partial revert of the layout change.

This is a downstream-only change and is not required upstream.

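The revert works because the renamed padding member keeps __g_size and every later member at their old offsets, with the total size unchanged, so the ABI checkers see an identical layout. A hypothetical compile-time check of the property such tooling effectively verifies (not part of the patch; 48 bytes is the documented size of pthread_cond_t on Linux):

    #include <assert.h>
    #include <pthread.h>

    /* Illustrative only: rpminspect/abidiff flag changes to a public
       type's size or member offsets.  */
    static_assert (sizeof (pthread_cond_t) == 48,
                   "pthread_cond_t must keep its historical 48-byte layout");

    int
    main (void)
    {
      return 0;
    }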
diff --git a/sysdeps/nptl/bits/thread-shared-types.h b/sysdeps/nptl/bits/thread-shared-types.h
index dcb799b130178f3f..798e0de31680065b 100644
--- a/sysdeps/nptl/bits/thread-shared-types.h
+++ b/sysdeps/nptl/bits/thread-shared-types.h
@@ -188,7 +188,8 @@ struct __pthread_cond_s
unsigned int __high;
} __g1_start32;
};
- unsigned int __g_size[2] __LOCK_ALIGNMENT;
+ unsigned int __glibc_unused___g_refs[2] __LOCK_ALIGNMENT;
+ unsigned int __g_size[2];
unsigned int __g1_orig_size;
unsigned int __wrefs;
unsigned int __g_signals[2];
diff --git a/sysdeps/nptl/pthread.h b/sysdeps/nptl/pthread.h
index 4f7adccdab1d6e9e..df049abf74d47522 100644
--- a/sysdeps/nptl/pthread.h
+++ b/sysdeps/nptl/pthread.h
@@ -184,7 +184,7 @@ enum


/* Conditional variable handling. */
-#define PTHREAD_COND_INITIALIZER { { {0}, {0}, {0, 0}, 0, 0, {0, 0} } }
+#define PTHREAD_COND_INITIALIZER { { {0}, {0}, {0, 0}, {0, 0}, 0, 0, {0, 0} } }


/* Cleanup buffers */
@@ -1,133 +0,0 @@
commit 0cc973160c23bb67f895bc887dd6942d29f8fee3
Author: Malte Skarupke <malteskarupke@fastmail.fm>
Date: Wed Dec 4 07:55:22 2024 -0500

nptl: Update comments and indentation for new condvar implementation

Some comments were wrong after the most recent commit. This fixes that.

Also fixing indentation where it was using spaces instead of tabs.

Signed-off-by: Malte Skarupke <malteskarupke@fastmail.fm>
Reviewed-by: Carlos O'Donell <carlos@redhat.com>

diff --git a/nptl/pthread_cond_common.c b/nptl/pthread_cond_common.c
index 9175e6779ebff244..36ec30a103390b3e 100644
--- a/nptl/pthread_cond_common.c
+++ b/nptl/pthread_cond_common.c
@@ -361,8 +361,9 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
* New waiters arriving concurrently with the group switching will all go
into G2 until we atomically make the switch. Waiters existing in G2
are not affected.
- * Waiters in G1 will be closed out immediately by the advancing of
- __g_signals to the next "lowseq" (low 31 bits of the new g1_start),
+ * Waiters in G1 have already received a signal and been woken. If they
+ haven't woken yet, they will be closed out immediately by the advancing
+ of __g_signals to the next "lowseq" (low 31 bits of the new g1_start),
which will prevent waiters from blocking using a futex on
__g_signals since it provides enough signals for all possible
remaining waiters. As a result, they can each consume a signal
diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c
index 4fb22b28a7a20ecd..2964c2d1be046b8a 100644
--- a/nptl/pthread_cond_wait.c
+++ b/nptl/pthread_cond_wait.c
@@ -250,7 +250,7 @@ __condvar_cleanup_waiting (void *arg)
figure out whether they are in a group that has already been completely
signaled (i.e., if the current G1 starts at a later position that the
waiter's position). Waiters cannot determine whether they are currently
- in G2 or G1 -- but they do not have too because all they are interested in
+ in G2 or G1 -- but they do not have to because all they are interested in
is whether there are available signals, and they always start in G2 (whose
group slot they know because of the bit in the waiter sequence. Signalers
will simply fill the right group until it is completely signaled and can
@@ -408,7 +408,7 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
}

/* Now wait until a signal is available in our group or it is closed.
- Acquire MO so that if we observe a value of zero written after group
+ Acquire MO so that if we observe (signals == lowseq) after group
switching in __condvar_quiesce_and_switch_g1, we synchronize with that
store and will see the prior update of __g1_start done while switching
groups too. */
@@ -418,8 +418,8 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
{
while (1)
{
- uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
- unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
+ uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
+ unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;

/* Spin-wait first.
Note that spinning first without checking whether a timeout
@@ -443,21 +443,21 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,

/* Reload signals. See above for MO. */
signals = atomic_load_acquire (cond->__data.__g_signals + g);
- g1_start = __condvar_load_g1_start_relaxed (cond);
- lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
+ g1_start = __condvar_load_g1_start_relaxed (cond);
+ lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
spin--;
}

- if (seq < (g1_start >> 1))
+ if (seq < (g1_start >> 1))
{
- /* If the group is closed already,
+ /* If the group is closed already,
then this waiter originally had enough extra signals to
consume, up until the time its group was closed. */
goto done;
- }
+ }

/* If there is an available signal, don't block.
- If __g1_start has advanced at all, then we must be in G1
+ If __g1_start has advanced at all, then we must be in G1
by now, perhaps in the process of switching back to an older
G2, but in either case we're allowed to consume the available
signal and should not block anymore. */
@@ -479,22 +479,23 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
sequence. */
atomic_fetch_add_acquire (cond->__data.__g_refs + g, 2);
signals = atomic_load_acquire (cond->__data.__g_signals + g);
- g1_start = __condvar_load_g1_start_relaxed (cond);
- lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
+ g1_start = __condvar_load_g1_start_relaxed (cond);
+ lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;

- if (seq < (g1_start >> 1))
+ if (seq < (g1_start >> 1))
{
- /* group is closed already, so don't block */
+ /* group is closed already, so don't block */
__condvar_dec_grefs (cond, g, private);
goto done;
}

if ((int)(signals - lowseq) >= 2)
{
- /* a signal showed up or G1/G2 switched after we grabbed the refcount */
+ /* a signal showed up or G1/G2 switched after we grabbed the
+ refcount */
__condvar_dec_grefs (cond, g, private);
break;
- }
+ }

// Now block.
struct _pthread_cleanup_buffer buffer;
@@ -574,10 +575,8 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
if (seq < (__condvar_load_g1_start_relaxed (cond) >> 1))
goto done;
}
- /* Try to grab a signal. Use acquire MO so that we see an up-to-date value
- of __g1_start below (see spinning above for a similar case). In
- particular, if we steal from a more recent group, we will also see a
- more recent __g1_start below. */
+ /* Try to grab a signal. See above for MO. (if we do another loop
+ iteration we need to see the correct value of g1_start) */
while (!atomic_compare_exchange_weak_acquire (cond->__data.__g_signals + g,
&signals, signals - 2));

@@ -1,67 +0,0 @@
commit b42cc6af11062c260c7dfa91f1c89891366fed3e
Author: Malte Skarupke <malteskarupke@fastmail.fm>
Date: Wed Dec 4 07:55:50 2024 -0500

nptl: Remove unnecessary catch-all-wake in condvar group switch

This wake is unnecessary. We only switch groups after every sleeper in a group
has been woken. Sure, they may take a while to actually wake up and may still
hold a reference, but waking them a second time doesn't speed that up. Instead
this just makes the code more complicated and may hide problems.

In particular this safety wake wouldn't even have helped with the bug that was
fixed by Barrus' patch: The bug there was that pthread_cond_signal would not
switch g1 when it should, so we wouldn't even have entered this code path.

Signed-off-by: Malte Skarupke <malteskarupke@fastmail.fm>
Reviewed-by: Carlos O'Donell <carlos@redhat.com>

diff --git a/nptl/pthread_cond_common.c b/nptl/pthread_cond_common.c
index 36ec30a103390b3e..f6d8c72b7f30ecff 100644
--- a/nptl/pthread_cond_common.c
+++ b/nptl/pthread_cond_common.c
@@ -361,13 +361,7 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
* New waiters arriving concurrently with the group switching will all go
into G2 until we atomically make the switch. Waiters existing in G2
are not affected.
- * Waiters in G1 have already received a signal and been woken. If they
- haven't woken yet, they will be closed out immediately by the advancing
- of __g_signals to the next "lowseq" (low 31 bits of the new g1_start),
- which will prevent waiters from blocking using a futex on
- __g_signals since it provides enough signals for all possible
- remaining waiters. As a result, they can each consume a signal
- and they will eventually remove their group reference. */
+ * Waiters in G1 have already received a signal and been woken. */

/* Update __g1_start, which finishes closing this group. The value we add
will never be negative because old_orig_size can only be zero when we
@@ -380,29 +374,6 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq,

unsigned int lowseq = ((old_g1_start + old_orig_size) << 1) & ~1U;

- /* If any waiters still hold group references (and thus could be blocked),
- then wake them all up now and prevent any running ones from blocking.
- This is effectively a catch-all for any possible current or future
- bugs that can allow the group size to reach 0 before all G1 waiters
- have been awakened or at least given signals to consume, or any
- other case that can leave blocked (or about to block) older waiters.. */
- if ((atomic_fetch_or_release (cond->__data.__g_refs + g1, 0) >> 1) > 0)
- {
- /* First advance signals to the end of the group (i.e. enough signals
- for the entire G1 group) to ensure that waiters which have not
- yet blocked in the futex will not block.
- Note that in the vast majority of cases, this should never
- actually be necessary, since __g_signals will have enough
- signals for the remaining g_refs waiters. As an optimization,
- we could check this first before proceeding, although that
- could still leave the potential for futex lost wakeup bugs
- if the signal count was non-zero but the futex wakeup
- was somehow lost. */
- atomic_store_release (cond->__data.__g_signals + g1, lowseq);
-
- futex_wake (cond->__data.__g_signals + g1, INT_MAX, private);
- }
-
/* At this point, the old G1 is now a valid new G2 (but not in use yet).
No old waiter can neither grab a signal nor acquire a reference without
noticing that __g1_start is larger.
@@ -1,107 +0,0 @@
commit 4f7b051f8ee3feff1b53b27a906f245afaa9cee1
Author: Malte Skarupke <malteskarupke@fastmail.fm>
Date: Wed Dec 4 07:56:13 2024 -0500

nptl: Remove unnecessary quadruple check in pthread_cond_wait

pthread_cond_wait was checking whether it was in a closed group no less than
four times. Checking once is enough. Here are the four checks:

1. While spin-waiting. This was dead code: maxspin is set to 0 and has been
for years.
2. Before deciding to go to sleep, and before incrementing grefs: I kept this
3. After incrementing grefs. There is no reason to think that the group would
close while we do an atomic increment. Obviously it could close at any
point, but that doesn't mean we have to recheck after every step. This
check was equally good as check 2, except it has to do more work.
4. When we find ourselves in a group that has a signal. We only get here after
we check that we're not in a closed group. There is no need to check again.
The check would only have helped in cases where the compare_exchange in the
next line would also have failed. Relying on the compare_exchange is fine.

Removing the duplicate checks clarifies the code.

Signed-off-by: Malte Skarupke <malteskarupke@fastmail.fm>
Reviewed-by: Carlos O'Donell <carlos@redhat.com>

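The reasoning in item 4 (relying on the compare_exchange) comes down to how a failed CAS refreshes its expected value. A standalone C11 illustration, separate from the glibc code:

    #include <stdatomic.h>
    #include <stdio.h>

    static _Atomic unsigned int g_signals = 4;

    /* Consume one signal (signals step by 2 because the LSB is
       reserved).  A failed weak CAS writes the current value back into
       EXPECTED, so the retry already operates on fresh state; a
       separate pre-check of the same condition would be redundant.  */
    static void
    consume_signal (void)
    {
      unsigned int expected = atomic_load (&g_signals);
      while (!atomic_compare_exchange_weak (&g_signals, &expected,
                                            expected - 2))
        ;  /* EXPECTED was refreshed; just retry.  */
    }

    int
    main (void)
    {
      consume_signal ();
      printf ("%u\n", atomic_load (&g_signals));  /* prints 2 */
      return 0;
    }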
diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c
index 2964c2d1be046b8a..8358784867f6074a 100644
--- a/nptl/pthread_cond_wait.c
+++ b/nptl/pthread_cond_wait.c
@@ -367,7 +367,6 @@ static __always_inline int
__pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
const struct timespec *abstime)
{
- const int maxspin = 0;
int err;
int result = 0;

@@ -421,33 +420,6 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;

- /* Spin-wait first.
- Note that spinning first without checking whether a timeout
- passed might lead to what looks like a spurious wake-up even
- though we should return ETIMEDOUT (e.g., if the caller provides
- an absolute timeout that is clearly in the past). However,
- (1) spurious wake-ups are allowed, (2) it seems unlikely that a
- user will (ab)use pthread_cond_wait as a check for whether a
- point in time is in the past, and (3) spinning first without
- having to compare against the current time seems to be the right
- choice from a performance perspective for most use cases. */
- unsigned int spin = maxspin;
- while (spin > 0 && ((int)(signals - lowseq) < 2))
- {
- /* Check that we are not spinning on a group that's already
- closed. */
- if (seq < (g1_start >> 1))
- break;
-
- /* TODO Back off. */
-
- /* Reload signals. See above for MO. */
- signals = atomic_load_acquire (cond->__data.__g_signals + g);
- g1_start = __condvar_load_g1_start_relaxed (cond);
- lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
- spin--;
- }
-
if (seq < (g1_start >> 1))
{
/* If the group is closed already,
@@ -478,24 +450,6 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
an atomic read-modify-write operation and thus extend the release
sequence. */
atomic_fetch_add_acquire (cond->__data.__g_refs + g, 2);
- signals = atomic_load_acquire (cond->__data.__g_signals + g);
- g1_start = __condvar_load_g1_start_relaxed (cond);
- lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
-
- if (seq < (g1_start >> 1))
- {
- /* group is closed already, so don't block */
- __condvar_dec_grefs (cond, g, private);
- goto done;
- }
-
- if ((int)(signals - lowseq) >= 2)
- {
- /* a signal showed up or G1/G2 switched after we grabbed the
- refcount */
- __condvar_dec_grefs (cond, g, private);
- break;
- }
-
// Now block.
struct _pthread_cleanup_buffer buffer;
@@ -571,9 +525,6 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
/* Reload signals. See above for MO. */
signals = atomic_load_acquire (cond->__data.__g_signals + g);
}
-
- if (seq < (__condvar_load_g1_start_relaxed (cond) >> 1))
- goto done;
}
/* Try to grab a signal. See above for MO. (if we do another loop
iteration we need to see the correct value of g1_start) */
@@ -1,172 +0,0 @@
commit c36fc50781995e6758cae2b6927839d0157f213c
Author: Malte Skarupke <malteskarupke@fastmail.fm>
Date: Wed Dec 4 07:56:38 2024 -0500

nptl: Remove g_refs from condition variables

This variable used to be needed to wait in group switching until all sleepers
have confirmed that they have woken. This is no longer needed. Nothing waits
on this variable so there is no need to track how many threads are currently
asleep in each group.

Signed-off-by: Malte Skarupke <malteskarupke@fastmail.fm>
Reviewed-by: Carlos O'Donell <carlos@redhat.com>

# Conflicts:
# nptl/tst-cond22.c (64-bit atomic counter refactor missing)
# sysdeps/nptl/bits/thread-shared-types.h (Likewise)

diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c
index 8358784867f6074a..b2bf3bd0a7af43e8 100644
--- a/nptl/pthread_cond_wait.c
+++ b/nptl/pthread_cond_wait.c
@@ -144,23 +144,6 @@ __condvar_cancel_waiting (pthread_cond_t *cond, uint64_t seq, unsigned int g,
}
}

-/* Wake up any signalers that might be waiting. */
-static void
-__condvar_dec_grefs (pthread_cond_t *cond, unsigned int g, int private)
-{
- /* Release MO to synchronize-with the acquire load in
- __condvar_quiesce_and_switch_g1. */
- if (atomic_fetch_add_release (cond->__data.__g_refs + g, -2) == 3)
- {
- /* Clear the wake-up request flag before waking up. We do not need more
- than relaxed MO and it doesn't matter if we apply this for an aliased
- group because we wake all futex waiters right after clearing the
- flag. */
- atomic_fetch_and_relaxed (cond->__data.__g_refs + g, ~(unsigned int) 1);
- futex_wake (cond->__data.__g_refs + g, INT_MAX, private);
- }
-}
-
/* Clean-up for cancellation of waiters waiting for normal signals. We cancel
our registration as a waiter, confirm we have woken up, and re-acquire the
mutex. */
@@ -172,8 +155,6 @@ __condvar_cleanup_waiting (void *arg)
pthread_cond_t *cond = cbuffer->cond;
unsigned g = cbuffer->wseq & 1;

- __condvar_dec_grefs (cond, g, cbuffer->private);
-
__condvar_cancel_waiting (cond, cbuffer->wseq >> 1, g, cbuffer->private);
/* FIXME With the current cancellation implementation, it is possible that
a thread is cancelled after it has returned from a syscall. This could
@@ -328,15 +309,6 @@ __condvar_cleanup_waiting (void *arg)
sufficient because if a waiter can see a sufficiently large value, it could
have also consume a signal in the waiters group.

- It is essential that the last field in pthread_cond_t is __g_signals[1]:
- The previous condvar used a pointer-sized field in pthread_cond_t, so a
- PTHREAD_COND_INITIALIZER from that condvar implementation might only
- initialize 4 bytes to zero instead of the 8 bytes we need (i.e., 44 bytes
- in total instead of the 48 we need). __g_signals[1] is not accessed before
- the first group switch (G2 starts at index 0), which will set its value to
- zero after a harmless fetch-or whose return value is ignored. This
- effectively completes initialization.
-

Limitations:
* This condvar isn't designed to allow for more than
@@ -436,21 +408,6 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
if ((int)(signals - lowseq) >= 2)
break;

- /* No signals available after spinning, so prepare to block.
- We first acquire a group reference and use acquire MO for that so
- that we synchronize with the dummy read-modify-write in
- __condvar_quiesce_and_switch_g1 if we read from that. In turn,
- in this case this will make us see the advancement of __g_signals
- to the upcoming new g1_start that occurs with a concurrent
- attempt to reuse the group's slot.
- We use acquire MO for the __g_signals check to make the
- __g1_start check work (see spinning above).
- Note that the group reference acquisition will not mask the
- release MO when decrementing the reference count because we use
- an atomic read-modify-write operation and thus extend the release
- sequence. */
- atomic_fetch_add_acquire (cond->__data.__g_refs + g, 2);
-
// Now block.
struct _pthread_cleanup_buffer buffer;
struct _condvar_cleanup_buffer cbuffer;
@@ -509,18 +466,11 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,

if (__glibc_unlikely (err == ETIMEDOUT))
{
- __condvar_dec_grefs (cond, g, private);
- /* If we timed out, we effectively cancel waiting. Note that
- we have decremented __g_refs before cancellation, so that a
- deadlock between waiting for quiescence of our group in
- __condvar_quiesce_and_switch_g1 and us trying to acquire
- the lock during cancellation is not possible. */
+ /* If we timed out, we effectively cancel waiting. */
__condvar_cancel_waiting (cond, seq, g, private);
result = ETIMEDOUT;
goto done;
}
- else
- __condvar_dec_grefs (cond, g, private);

/* Reload signals. See above for MO. */
signals = atomic_load_acquire (cond->__data.__g_signals + g);
diff --git a/nptl/tst-cond22.c b/nptl/tst-cond22.c
index 64f19ea0a55af057..ebeeeaf666070076 100644
--- a/nptl/tst-cond22.c
+++ b/nptl/tst-cond22.c
@@ -106,10 +106,10 @@ do_test (void)
status = 1;
}

- printf ("cond = { %llu, %llu, %u/%u/%u, %u/%u/%u, %u, %u }\n",
+ printf ("cond = { %llu, %llu, %u/%u, %u/%u, %u, %u }\n",
c.__data.__wseq, c.__data.__g1_start,
- c.__data.__g_signals[0], c.__data.__g_refs[0], c.__data.__g_size[0],
- c.__data.__g_signals[1], c.__data.__g_refs[1], c.__data.__g_size[1],
+ c.__data.__g_signals[0], c.__data.__g_size[0],
+ c.__data.__g_signals[1], c.__data.__g_size[1],
c.__data.__g1_orig_size, c.__data.__wrefs);

if (pthread_create (&th, NULL, tf, (void *) 1l) != 0)
@@ -149,10 +149,10 @@ do_test (void)
status = 1;
}

- printf ("cond = { %llu, %llu, %u/%u/%u, %u/%u/%u, %u, %u }\n",
+ printf ("cond = { %llu, %llu, %u/%u, %u/%u, %u, %u }\n",
c.__data.__wseq, c.__data.__g1_start,
- c.__data.__g_signals[0], c.__data.__g_refs[0], c.__data.__g_size[0],
- c.__data.__g_signals[1], c.__data.__g_refs[1], c.__data.__g_size[1],
+ c.__data.__g_signals[0], c.__data.__g_size[0],
+ c.__data.__g_signals[1], c.__data.__g_size[1],
c.__data.__g1_orig_size, c.__data.__wrefs);

return status;
diff --git a/sysdeps/nptl/bits/thread-shared-types.h b/sysdeps/nptl/bits/thread-shared-types.h
index 05c94e7a710c0eb9..dcb799b130178f3f 100644
--- a/sysdeps/nptl/bits/thread-shared-types.h
+++ b/sysdeps/nptl/bits/thread-shared-types.h
@@ -188,8 +188,7 @@ struct __pthread_cond_s
unsigned int __high;
} __g1_start32;
};
- unsigned int __g_refs[2] __LOCK_ALIGNMENT;
- unsigned int __g_size[2];
+ unsigned int __g_size[2] __LOCK_ALIGNMENT;
unsigned int __g1_orig_size;
unsigned int __wrefs;
unsigned int __g_signals[2];
diff --git a/sysdeps/nptl/pthread.h b/sysdeps/nptl/pthread.h
index df049abf74d47522..4f7adccdab1d6e9e 100644
--- a/sysdeps/nptl/pthread.h
+++ b/sysdeps/nptl/pthread.h
@@ -184,7 +184,7 @@ enum


/* Conditional variable handling. */
-#define PTHREAD_COND_INITIALIZER { { {0}, {0}, {0, 0}, {0, 0}, 0, 0, {0, 0} } }
+#define PTHREAD_COND_INITIALIZER { { {0}, {0}, {0, 0}, 0, 0, {0, 0} } }


/* Cleanup buffers */
@ -1,94 +0,0 @@
commit 929a4764ac90382616b6a21f099192b2475da674
Author: Malte Skarupke <malteskarupke@fastmail.fm>
Date: Wed Dec 4 08:03:44 2024 -0500

nptl: Use a single loop in pthread_cond_wait instead of a nested loop

The loop was a little more complicated than necessary. There was only one
break statement out of the inner loop, and the outer loop was nearly empty.
So just remove the outer loop, moving its code to the one break statement in
the inner loop. This allows us to replace all gotos with break statements.

Signed-off-by: Malte Skarupke <malteskarupke@fastmail.fm>
Reviewed-by: Carlos O'Donell <carlos@redhat.com>

# Conflicts:
#	nptl/pthread_cond_wait.c (Missing EOVERFLOW checks for y2038)

diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c
index b2bf3bd0a7af43e8..8f12fc4ee288cf4a 100644
--- a/nptl/pthread_cond_wait.c
+++ b/nptl/pthread_cond_wait.c
@@ -378,17 +378,15 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
return err;
}

- /* Now wait until a signal is available in our group or it is closed.
- Acquire MO so that if we observe (signals == lowseq) after group
- switching in __condvar_quiesce_and_switch_g1, we synchronize with that
- store and will see the prior update of __g1_start done while switching
- groups too. */
- unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g);
-
- do
- {
+
while (1)
{
+ /* Now wait until a signal is available in our group or it is closed.
+ Acquire MO so that if we observe (signals == lowseq) after group
+ switching in __condvar_quiesce_and_switch_g1, we synchronize with that
+ store and will see the prior update of __g1_start done while switching
+ groups too. */
+ unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g);
uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;

@@ -397,7 +395,7 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
/* If the group is closed already,
then this waiter originally had enough extra signals to
consume, up until the time its group was closed. */
- goto done;
+ break;
}

/* If there is an available signal, don't block.
@@ -406,7 +404,16 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
G2, but in either case we're allowed to consume the available
signal and should not block anymore. */
if ((int)(signals - lowseq) >= 2)
- break;
+ {
+ /* Try to grab a signal. See above for MO. (if we do another loop
+ iteration we need to see the correct value of g1_start) */
+ if (atomic_compare_exchange_weak_acquire (
+ cond->__data.__g_signals + g,
+ &signals, signals - 2))
+ break;
+ else
+ continue;
+ }

// Now block.
struct _pthread_cleanup_buffer buffer;
@@ -469,19 +476,9 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
/* If we timed out, we effectively cancel waiting. */
__condvar_cancel_waiting (cond, seq, g, private);
result = ETIMEDOUT;
- goto done;
+ break;
}
-
- /* Reload signals. See above for MO. */
- signals = atomic_load_acquire (cond->__data.__g_signals + g);
}
- }
- /* Try to grab a signal. See above for MO. (if we do another loop
- iteration we need to see the correct value of g1_start) */
- while (!atomic_compare_exchange_weak_acquire (cond->__data.__g_signals + g,
- &signals, signals - 2));
-
- done:

/* Confirm that we have been woken. We do that before acquiring the mutex
to allow for execution of pthread_cond_destroy while having acquired the
@ -1,218 +0,0 @@
commit ee6c14ed59d480720721aaacc5fb03213dc153da
Author: Malte Skarupke <malteskarupke@fastmail.fm>
Date: Wed Dec 4 08:04:10 2024 -0500

nptl: Fix indentation

In my previous change I turned a nested loop into a simple loop. I'm doing
the resulting indentation changes in a separate commit to make the diff on
the previous commit easier to review.

Signed-off-by: Malte Skarupke <malteskarupke@fastmail.fm>
Reviewed-by: Carlos O'Donell <carlos@redhat.com>

# Conflicts:
#	nptl/pthread_cond_wait.c (Missing futex_wait_cancelable cleanup)

diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c
index 8f12fc4ee288cf4a..964591449dc57758 100644
--- a/nptl/pthread_cond_wait.c
+++ b/nptl/pthread_cond_wait.c
@@ -379,107 +379,108 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
}


- while (1)
- {
- /* Now wait until a signal is available in our group or it is closed.
- Acquire MO so that if we observe (signals == lowseq) after group
- switching in __condvar_quiesce_and_switch_g1, we synchronize with that
- store and will see the prior update of __g1_start done while switching
- groups too. */
- unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g);
- uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
- unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
-
- if (seq < (g1_start >> 1))
- {
- /* If the group is closed already,
- then this waiter originally had enough extra signals to
- consume, up until the time its group was closed. */
- break;
- }
-
- /* If there is an available signal, don't block.
- If __g1_start has advanced at all, then we must be in G1
- by now, perhaps in the process of switching back to an older
- G2, but in either case we're allowed to consume the available
- signal and should not block anymore. */
- if ((int)(signals - lowseq) >= 2)
- {
- /* Try to grab a signal. See above for MO. (if we do another loop
- iteration we need to see the correct value of g1_start) */
- if (atomic_compare_exchange_weak_acquire (
- cond->__data.__g_signals + g,
+ while (1)
+ {
+ /* Now wait until a signal is available in our group or it is closed.
+ Acquire MO so that if we observe (signals == lowseq) after group
+ switching in __condvar_quiesce_and_switch_g1, we synchronize with that
+ store and will see the prior update of __g1_start done while switching
+ groups too. */
+ unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g);
+ uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
+ unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
+
+ if (seq < (g1_start >> 1))
+ {
+ /* If the group is closed already,
+ then this waiter originally had enough extra signals to
+ consume, up until the time its group was closed. */
+ break;
+ }
+
+ /* If there is an available signal, don't block.
+ If __g1_start has advanced at all, then we must be in G1
+ by now, perhaps in the process of switching back to an older
+ G2, but in either case we're allowed to consume the available
+ signal and should not block anymore. */
+ if ((int)(signals - lowseq) >= 2)
+ {
+ /* Try to grab a signal. See above for MO. (if we do another loop
+ iteration we need to see the correct value of g1_start) */
+ if (atomic_compare_exchange_weak_acquire (
+ cond->__data.__g_signals + g,
&signals, signals - 2))
- break;
- else
- continue;
- }
+ break;
+ else
+ continue;
+ }

- // Now block.
- struct _pthread_cleanup_buffer buffer;
- struct _condvar_cleanup_buffer cbuffer;
- cbuffer.wseq = wseq;
- cbuffer.cond = cond;
- cbuffer.mutex = mutex;
- cbuffer.private = private;
- __pthread_cleanup_push (&buffer, __condvar_cleanup_waiting, &cbuffer);
+ // Now block.
+ struct _pthread_cleanup_buffer buffer;
+ struct _condvar_cleanup_buffer cbuffer;
+ cbuffer.wseq = wseq;
+ cbuffer.cond = cond;
+ cbuffer.mutex = mutex;
+ cbuffer.private = private;
+ __pthread_cleanup_push (&buffer, __condvar_cleanup_waiting, &cbuffer);

- if (abstime == NULL)
- {
- /* Block without a timeout. */
- err = futex_wait_cancelable (
- cond->__data.__g_signals + g, signals, private);
- }
- else
+ if (abstime == NULL)
+ {
+ /* Block without a timeout. */
+ err = futex_wait_cancelable
+ (cond->__data.__g_signals + g, signals, private);
+ }
+ else
+ {
+ /* Block, but with a timeout.
+ Work around the fact that the kernel rejects negative timeout
+ values despite them being valid. */
+ if (__glibc_unlikely (abstime->tv_sec < 0))
+ err = ETIMEDOUT;
+ else if ((flags & __PTHREAD_COND_CLOCK_MONOTONIC_MASK) != 0)
{
- /* Block, but with a timeout.
- Work around the fact that the kernel rejects negative timeout
- values despite them being valid. */
- if (__glibc_unlikely (abstime->tv_sec < 0))
- err = ETIMEDOUT;
-
- else if ((flags & __PTHREAD_COND_CLOCK_MONOTONIC_MASK) != 0)
+ /* CLOCK_MONOTONIC is requested. */
+ struct timespec rt;
+ if (__clock_gettime (CLOCK_MONOTONIC, &rt) != 0)
+ __libc_fatal ("clock_gettime does not support "
+ "CLOCK_MONOTONIC\n");
+ /* Convert the absolute timeout value to a relative
+ timeout. */
+ rt.tv_sec = abstime->tv_sec - rt.tv_sec;
+ rt.tv_nsec = abstime->tv_nsec - rt.tv_nsec;
+ if (rt.tv_nsec < 0)
{
- /* CLOCK_MONOTONIC is requested. */
- struct timespec rt;
- if (__clock_gettime (CLOCK_MONOTONIC, &rt) != 0)
- __libc_fatal ("clock_gettime does not support "
- "CLOCK_MONOTONIC\n");
- /* Convert the absolute timeout value to a relative
- timeout. */
- rt.tv_sec = abstime->tv_sec - rt.tv_sec;
- rt.tv_nsec = abstime->tv_nsec - rt.tv_nsec;
- if (rt.tv_nsec < 0)
- {
- rt.tv_nsec += 1000000000;
- --rt.tv_sec;
- }
- /* Did we already time out? */
- if (__glibc_unlikely (rt.tv_sec < 0))
- err = ETIMEDOUT;
- else
- err = futex_reltimed_wait_cancelable
- (cond->__data.__g_signals + g, signals, &rt, private);
+ rt.tv_nsec += 1000000000;
+ --rt.tv_sec;
}
+ /* Did we already time out? */
+ if (__glibc_unlikely (rt.tv_sec < 0))
+ err = ETIMEDOUT;
else
- {
- /* Use CLOCK_REALTIME. */
- err = futex_abstimed_wait_cancelable
- (cond->__data.__g_signals + g, signals, abstime, private);
- }
+ err = futex_reltimed_wait_cancelable
+ (cond->__data.__g_signals + g, signals,
+ &rt, private);
}
-
- __pthread_cleanup_pop (&buffer, 0);
-
- if (__glibc_unlikely (err == ETIMEDOUT))
+ else
{
- /* If we timed out, we effectively cancel waiting. */
- __condvar_cancel_waiting (cond, seq, g, private);
- result = ETIMEDOUT;
- break;
+ /* Use CLOCK_REALTIME. */
+ err = futex_abstimed_wait_cancelable
+ (cond->__data.__g_signals + g, signals,
+ abstime, private);
}
}

+ __pthread_cleanup_pop (&buffer, 0);
+
+ if (__glibc_unlikely (err == ETIMEDOUT))
+ {
+ /* If we timed out, we effectively cancel waiting. */
+ __condvar_cancel_waiting (cond, seq, g, private);
+ result = ETIMEDOUT;
+ break;
+ }
+ }
+
/* Confirm that we have been woken. We do that before acquiring the mutex
to allow for execution of pthread_cond_destroy while having acquired the
mutex. */
@ -1,147 +0,0 @@
commit 4b79e27a5073c02f6bff9aa8f4791230a0ab1867
Author: Malte Skarupke <malteskarupke@fastmail.fm>
Date: Wed Dec 4 08:04:54 2024 -0500

nptl: rename __condvar_quiesce_and_switch_g1

This function no longer waits for threads to leave g1, so rename it to
__condvar_switch_g1

Signed-off-by: Malte Skarupke <malteskarupke@fastmail.fm>
Reviewed-by: Carlos O'Donell <carlos@redhat.com>

diff --git a/nptl/pthread_cond_broadcast.c b/nptl/pthread_cond_broadcast.c
index e6bcb9b61b0055a5..1ec746ec3df51c4f 100644
--- a/nptl/pthread_cond_broadcast.c
+++ b/nptl/pthread_cond_broadcast.c
@@ -61,7 +61,7 @@ __pthread_cond_broadcast (pthread_cond_t *cond)
cond->__data.__g_size[g1] << 1);
cond->__data.__g_size[g1] = 0;

- /* We need to wake G1 waiters before we quiesce G1 below. */
+ /* We need to wake G1 waiters before we switch G1 below. */
/* TODO Only set it if there are indeed futex waiters. We could
also try to move this out of the critical section in cases when
G2 is empty (and we don't need to quiesce). */
@@ -70,7 +70,7 @@ __pthread_cond_broadcast (pthread_cond_t *cond)

/* G1 is complete. Step (2) is next unless there are no waiters in G2, in
which case we can stop. */
- if (__condvar_quiesce_and_switch_g1 (cond, wseq, &g1, private))
+ if (__condvar_switch_g1 (cond, wseq, &g1, private))
{
/* Step (3): Send signals to all waiters in the old G2 / new G1. */
atomic_fetch_add_relaxed (cond->__data.__g_signals + g1,
diff --git a/nptl/pthread_cond_common.c b/nptl/pthread_cond_common.c
index f6d8c72b7f30ecff..770f8e7c5d347f6b 100644
--- a/nptl/pthread_cond_common.c
+++ b/nptl/pthread_cond_common.c
@@ -329,16 +329,15 @@ __condvar_get_private (int flags)
return FUTEX_SHARED;
}

-/* This closes G1 (whose index is in G1INDEX), waits for all futex waiters to
- leave G1, converts G1 into a fresh G2, and then switches group roles so that
- the former G2 becomes the new G1 ending at the current __wseq value when we
- eventually make the switch (WSEQ is just an observation of __wseq by the
- signaler).
+/* This closes G1 (whose index is in G1INDEX), converts G1 into a fresh G2,
+ and then switches group roles so that the former G2 becomes the new G1
+ ending at the current __wseq value when we eventually make the switch
+ (WSEQ is just an observation of __wseq by the signaler).
If G2 is empty, it will not switch groups because then it would create an
empty G1 which would require switching groups again on the next signal.
Returns false iff groups were not switched because G2 was empty. */
static bool __attribute__ ((unused))
-__condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
+__condvar_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
unsigned int *g1index, int private)
{
unsigned int g1 = *g1index;
@@ -354,8 +353,7 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
+ cond->__data.__g_size[g1 ^ 1]) == 0)
return false;

- /* Now try to close and quiesce G1. We have to consider the following kinds
- of waiters:
+ /* We have to consider the following kinds of waiters:
* Waiters from less recent groups than G1 are not affected because
nothing will change for them apart from __g1_start getting larger.
* New waiters arriving concurrently with the group switching will all go
@@ -363,12 +361,12 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
are not affected.
* Waiters in G1 have already received a signal and been woken. */

- /* Update __g1_start, which finishes closing this group. The value we add
- will never be negative because old_orig_size can only be zero when we
- switch groups the first time after a condvar was initialized, in which
- case G1 will be at index 1 and we will add a value of 1.
- Relaxed MO is fine because the change comes with no additional
- constraints that others would have to observe. */
+ /* Update __g1_start, which closes this group. The value we add will never
+ be negative because old_orig_size can only be zero when we switch groups
+ the first time after a condvar was initialized, in which case G1 will be
+ at index 1 and we will add a value of 1. Relaxed MO is fine because the
+ change comes with no additional constraints that others would have to
+ observe. */
__condvar_add_g1_start_relaxed (cond,
(old_orig_size << 1) + (g1 == 1 ? 1 : - 1));

diff --git a/nptl/pthread_cond_signal.c b/nptl/pthread_cond_signal.c
index 3db3d1fbeb165ea4..24c9d813e7c0ada6 100644
--- a/nptl/pthread_cond_signal.c
+++ b/nptl/pthread_cond_signal.c
@@ -70,18 +70,17 @@ __pthread_cond_signal (pthread_cond_t *cond)
bool do_futex_wake = false;

/* If G1 is still receiving signals, we put the signal there. If not, we
- check if G2 has waiters, and if so, quiesce and switch G1 to the former
- G2; if this results in a new G1 with waiters (G2 might have cancellations
- already, see __condvar_quiesce_and_switch_g1), we put the signal in the
- new G1. */
+ check if G2 has waiters, and if so, switch G1 to the former G2; if this
+ results in a new G1 with waiters (G2 might have cancellations already,
+ see __condvar_switch_g1), we put the signal in the new G1. */
if ((cond->__data.__g_size[g1] != 0)
- || __condvar_quiesce_and_switch_g1 (cond, wseq, &g1, private))
+ || __condvar_switch_g1 (cond, wseq, &g1, private))
{
/* Add a signal. Relaxed MO is fine because signaling does not need to
- establish a happens-before relation (see above). We do not mask the
- release-MO store when initializing a group in
- __condvar_quiesce_and_switch_g1 because we use an atomic
- read-modify-write and thus extend that store's release sequence. */
+ establish a happens-before relation (see above). We do not mask the
+ release-MO store when initializing a group in __condvar_switch_g1
+ because we use an atomic read-modify-write and thus extend that
+ store's release sequence. */
atomic_fetch_add_relaxed (cond->__data.__g_signals + g1, 2);
cond->__data.__g_size[g1]--;
/* TODO Only set it if there are indeed futex waiters. */
diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c
index c97b5d22cb31ca6b..5b82ce639367e0c0 100644
--- a/nptl/pthread_cond_wait.c
+++ b/nptl/pthread_cond_wait.c
@@ -350,8 +350,7 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
because we do not need to establish any happens-before relation with
signalers (see __pthread_cond_signal); modification order alone
establishes a total order of waiters/signals. We do need acquire MO
- to synchronize with group reinitialization in
- __condvar_quiesce_and_switch_g1. */
+ to synchronize with group reinitialization in __condvar_switch_g1. */
uint64_t wseq = __condvar_fetch_add_wseq_acquire (cond, 2);
/* Find our group's index. We always go into what was G2 when we acquired
our position. */
@@ -383,9 +382,9 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
{
/* Now wait until a signal is available in our group or it is closed.
Acquire MO so that if we observe (signals == lowseq) after group
- switching in __condvar_quiesce_and_switch_g1, we synchronize with that
- store and will see the prior update of __g1_start done while switching
- groups too. */
+ switching in __condvar_switch_g1, we synchronize with that store and
+ will see the prior update of __g1_start done while switching groups
+ too. */
unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g);
uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
@ -1,179 +0,0 @@
commit 91bb902f58264a2fd50fbce8f39a9a290dd23706
Author: Malte Skarupke <malteskarupke@fastmail.fm>
Date: Wed Dec 4 08:05:40 2024 -0500

nptl: Use all of g1_start and g_signals

The LSB of g_signals was unused. The LSB of g1_start was used to indicate
which group is G2. This was used to always go to sleep in pthread_cond_wait
if a waiter is in G2. A comment earlier in the file says that this is not
correct to do:

"Waiters cannot determine whether they are currently in G2 or G1 -- but they
do not have to because all they are interested in is whether there are
available signals"

I either would have had to update the comment, or get rid of the check. I
chose to get rid of the check. In fact I don't quite know why it was there.
There will never be available signals for group G2, so we didn't need the
special case. Even if there were, this would just be a spurious wake. This
might have caught some cases where the count has wrapped around, but it
wouldn't reliably do that, (and even if it did, why would you want to force a
sleep in that case?) and we don't support that many concurrent waiters
anyway. Getting rid of it allows us to use one more bit, making us more
robust to wraparound.

Signed-off-by: Malte Skarupke <malteskarupke@fastmail.fm>
Reviewed-by: Carlos O'Donell <carlos@redhat.com>

diff --git a/nptl/pthread_cond_broadcast.c b/nptl/pthread_cond_broadcast.c
index 1ec746ec3df51c4f..14d3e533ad4b24d7 100644
--- a/nptl/pthread_cond_broadcast.c
+++ b/nptl/pthread_cond_broadcast.c
@@ -58,7 +58,7 @@ __pthread_cond_broadcast (pthread_cond_t *cond)
{
/* Add as many signals as the remaining size of the group. */
atomic_fetch_add_relaxed (cond->__data.__g_signals + g1,
- cond->__data.__g_size[g1] << 1);
+ cond->__data.__g_size[g1]);
cond->__data.__g_size[g1] = 0;

/* We need to wake G1 waiters before we switch G1 below. */
@@ -74,7 +74,7 @@ __pthread_cond_broadcast (pthread_cond_t *cond)
{
/* Step (3): Send signals to all waiters in the old G2 / new G1. */
atomic_fetch_add_relaxed (cond->__data.__g_signals + g1,
- cond->__data.__g_size[g1] << 1);
+ cond->__data.__g_size[g1]);
cond->__data.__g_size[g1] = 0;
/* TODO Only set it if there are indeed futex waiters. */
do_futex_wake = true;
diff --git a/nptl/pthread_cond_common.c b/nptl/pthread_cond_common.c
index 770f8e7c5d347f6b..1fe0448d8705c326 100644
--- a/nptl/pthread_cond_common.c
+++ b/nptl/pthread_cond_common.c
@@ -348,9 +348,9 @@ __condvar_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
behavior.
Note that this works correctly for a zero-initialized condvar too. */
unsigned int old_orig_size = __condvar_get_orig_size (cond);
- uint64_t old_g1_start = __condvar_load_g1_start_relaxed (cond) >> 1;
- if (((unsigned) (wseq - old_g1_start - old_orig_size)
- + cond->__data.__g_size[g1 ^ 1]) == 0)
+ uint64_t old_g1_start = __condvar_load_g1_start_relaxed (cond);
+ uint64_t new_g1_start = old_g1_start + old_orig_size;
+ if (((unsigned) (wseq - new_g1_start) + cond->__data.__g_size[g1 ^ 1]) == 0)
return false;

/* We have to consider the following kinds of waiters:
@@ -361,16 +361,10 @@ __condvar_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
are not affected.
* Waiters in G1 have already received a signal and been woken. */

- /* Update __g1_start, which closes this group. The value we add will never
- be negative because old_orig_size can only be zero when we switch groups
- the first time after a condvar was initialized, in which case G1 will be
- at index 1 and we will add a value of 1. Relaxed MO is fine because the
- change comes with no additional constraints that others would have
- to observe. */
- __condvar_add_g1_start_relaxed (cond,
- (old_orig_size << 1) + (g1 == 1 ? 1 : - 1));
-
- unsigned int lowseq = ((old_g1_start + old_orig_size) << 1) & ~1U;
+ /* Update __g1_start, which closes this group. Relaxed MO is fine because
+ the change comes with no additional constraints that others would have
+ to observe. */
+ __condvar_add_g1_start_relaxed (cond, old_orig_size);

/* At this point, the old G1 is now a valid new G2 (but not in use yet).
No old waiter can neither grab a signal nor acquire a reference without
@@ -382,13 +376,13 @@ __condvar_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
g1 ^= 1;
*g1index ^= 1;

- /* Now advance the new G1 g_signals to the new lowseq, giving it
+ /* Now advance the new G1 g_signals to the new g1_start, giving it
an effective signal count of 0 to start. */
- atomic_store_release (cond->__data.__g_signals + g1, lowseq);
+ atomic_store_release (cond->__data.__g_signals + g1, (unsigned)new_g1_start);

/* These values are just observed by signalers, and thus protected by the
lock. */
- unsigned int orig_size = wseq - (old_g1_start + old_orig_size);
+ unsigned int orig_size = wseq - new_g1_start;
__condvar_set_orig_size (cond, orig_size);
/* Use and addition to not loose track of cancellations in what was
previously G2. */
diff --git a/nptl/pthread_cond_signal.c b/nptl/pthread_cond_signal.c
index 24c9d813e7c0ada6..9f04833119fd3f59 100644
--- a/nptl/pthread_cond_signal.c
+++ b/nptl/pthread_cond_signal.c
@@ -81,7 +81,7 @@ __pthread_cond_signal (pthread_cond_t *cond)
release-MO store when initializing a group in __condvar_switch_g1
because we use an atomic read-modify-write and thus extend that
store's release sequence. */
- atomic_fetch_add_relaxed (cond->__data.__g_signals + g1, 2);
+ atomic_fetch_add_relaxed (cond->__data.__g_signals + g1, 1);
cond->__data.__g_size[g1]--;
/* TODO Only set it if there are indeed futex waiters. */
do_futex_wake = true;
diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c
index 5b82ce639367e0c0..031ec717ca64f66f 100644
--- a/nptl/pthread_cond_wait.c
+++ b/nptl/pthread_cond_wait.c
@@ -85,7 +85,7 @@ __condvar_cancel_waiting (pthread_cond_t *cond, uint64_t seq, unsigned int g,
not hold a reference on the group. */
__condvar_acquire_lock (cond, private);

- uint64_t g1_start = __condvar_load_g1_start_relaxed (cond) >> 1;
+ uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
if (g1_start > seq)
{
/* Our group is closed, so someone provided enough signals for it.
@@ -260,7 +260,6 @@ __condvar_cleanup_waiting (void *arg)
* Waiters fetch-add while having acquire the mutex associated with the
condvar. Signalers load it and fetch-xor it concurrently.
__g1_start: Starting position of G1 (inclusive)
- * LSB is index of current G2.
* Modified by signalers while having acquired the condvar-internal lock
and observed concurrently by waiters.
__g1_orig_size: Initial size of G1
@@ -281,11 +280,9 @@ __condvar_cleanup_waiting (void *arg)
* Reference count used by waiters concurrently with signalers that have
acquired the condvar-internal lock.
__g_signals: The number of signals that can still be consumed, relative to
- the current g1_start. (i.e. bits 31 to 1 of __g_signals are bits
- 31 to 1 of g1_start with the signal count added)
+ the current g1_start. (i.e. g1_start with the signal count added)
* Used as a futex word by waiters. Used concurrently by waiters and
signalers.
- * LSB is currently reserved and 0.
__g_size: Waiters remaining in this group (i.e., which have not been
signaled yet.
* Accessed by signalers and waiters that cancel waiting (both do so only
@@ -387,9 +384,8 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
too. */
unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g);
uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
- unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;

- if (seq < (g1_start >> 1))
+ if (seq < g1_start)
{
/* If the group is closed already,
then this waiter originally had enough extra signals to
consume, up until the time its group was closed. */
@@ -402,13 +398,13 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
by now, perhaps in the process of switching back to an older
G2, but in either case we're allowed to consume the available
signal and should not block anymore. */
- if ((int)(signals - lowseq) >= 2)
+ if ((int)(signals - (unsigned int)g1_start) > 0)
{
/* Try to grab a signal. See above for MO. (if we do another loop
iteration we need to see the correct value of g1_start) */
if (atomic_compare_exchange_weak_acquire (
cond->__data.__g_signals + g,
- &signals, signals - 2))
+ &signals, signals - 1))
break;
continue;
@ -1,25 +0,0 @@
Author: Florian Weimer <fweimer@redhat.com>
Date: Wed Jul 4 16:16:57 2018 +0200

Makeconfig (ASFLAGS): Always append required assembler flags.

Submitted upstream here:

https://sourceware.org/ml/libc-alpha/2018-07/msg00077.html

Otherwise, we lose essential flags such as -Wa,--noexecstack due to
the way += interacts with the ASFLAGS command line override in make.

diff --git a/Makeconfig b/Makeconfig
index b0b27f0113ac18b8..92e76d6200bbcd5b 100644
--- a/Makeconfig
+++ b/Makeconfig
@@ -1047,7 +1047,7 @@ endif
ifndef ASFLAGS
ASFLAGS := $(filter -g% -fdebug-prefix-map=%,$(CFLAGS))
endif
-ASFLAGS += -Werror=undef $(ASFLAGS-config) $(asflags-cpu)
+override ASFLAGS += -Werror=undef $(ASFLAGS-config) $(asflags-cpu)

ifndef BUILD_CC
BUILD_CC = $(CC)
@ -1,286 +0,0 @@
Short description: Add C.UTF-8 support.
Author(s): Fedora glibc team <glibc@lists.fedoraproject.org>
Origin: PATCH
Upstream status: not-submitted

This patch needs to go upstream as part of Carlos O'Donell
<carlos@redhat.com>'s work on enabling upstream C.UTF-8 support. This
work is currently blocked on cleaning up the test results to prove that
full code-point sorting is working as intended.

Note that this patch does not provide full code-point sorting as
expected.

This patch needs to go upstream as soon as possible since it would be
nice to have this in F29 and fixed.

From 2eda7b462b415105f5a05c1323372d4e39d46439 Mon Sep 17 00:00:00 2001
From: Mike FABIAN <mfabian@redhat.com>
Date: Mon, 10 Aug 2015 15:58:12 +0200
Subject: [PATCH] Add a C.UTF-8 locale

---
 localedata/SUPPORTED |   1 +
 localedata/locales/C | 238 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 239 insertions(+)
 create mode 100644 localedata/locales/C

diff --git a/localedata/SUPPORTED b/localedata/SUPPORTED
index 8ca023e..2a78391 100644
--- a/localedata/SUPPORTED
+++ b/localedata/SUPPORTED
@@ -1,6 +1,7 @@
# This file names the currently supported and somewhat tested locales.
# If you have any additions please file a glibc bug report.
SUPPORTED-LOCALES=\
+C.UTF-8/UTF-8 \
aa_DJ.UTF-8/UTF-8 \
aa_DJ/ISO-8859-1 \
aa_ER/UTF-8 \
diff --git a/localedata/locales/C b/localedata/locales/C
new file mode 100644
index 0000000..fdf460e
--- /dev/null
+++ b/localedata/locales/C
@@ -0,0 +1,238 @@
+escape_char /
+comment_char %
+% Locale for C locale in UTF-8
+
+LC_IDENTIFICATION
+title "C locale"
+source ""
+address ""
+contact ""
+email "mfabian@redhat.com"
+tel ""
+fax ""
+language "C"
+territory ""
+revision "1.0"
+date "2015-08-10"
+%
+category "i18n:2012";LC_IDENTIFICATION
+category "i18n:2012";LC_CTYPE
+category "i18n:2012";LC_COLLATE
+category "i18n:2012";LC_TIME
+category "i18n:2012";LC_NUMERIC
+category "i18n:2012";LC_MONETARY
+category "i18n:2012";LC_MESSAGES
+category "i18n:2012";LC_PAPER
+category "i18n:2012";LC_NAME
+category "i18n:2012";LC_ADDRESS
+category "i18n:2012";LC_TELEPHONE
+category "i18n:2012";LC_MEASUREMENT
+END LC_IDENTIFICATION
+
+LC_CTYPE
+copy "i18n"
+
+translit_start
+include "translit_combining";""
+translit_end
+
+END LC_CTYPE
+
+LC_COLLATE
+order_start forward
+<U0000>
+..
+<UFFFF>
+<U10000>
+..
+<U1FFFF>
+<U20000>
+..
+<U2FFFF>
+<UE0000>
+..
+<UEFFFF>
+<UF0000>
+..
+<UFFFFF>
+<U100000>
+..
+<U10FFFF>
+UNDEFINED
+order_end
+END LC_COLLATE
+
+LC_MONETARY
+% This is the 14652 i18n fdcc-set definition for
+% the LC_MONETARY category
+% (except for the int_curr_symbol and currency_symbol, they are empty in
+% the 14652 i18n fdcc-set definition and also empty in
+% glibc/locale/C-monetary.c. But localedef complains in that case).
+%
+% Using "USD" for int_curr_symbol. But maybe "XXX" would be better?
+% XXX is "No currency" (https://en.wikipedia.org/wiki/ISO_4217)
+int_curr_symbol "<U0055><U0053><U0044><U0020>"
+% Using "$" for currency_symbol. But maybe <U00A4> would be better?
+% U+00A4 is the "generic currency symbol"
+% (https://en.wikipedia.org/wiki/Currency_sign_%28typography%29)
+currency_symbol "<U0024>"
+mon_decimal_point "<U002E>"
+mon_thousands_sep ""
+mon_grouping -1
+positive_sign ""
+negative_sign "<U002D>"
+int_frac_digits -1
+frac_digits -1
+p_cs_precedes -1
+int_p_sep_by_space -1
+p_sep_by_space -1
+n_cs_precedes -1
+int_n_sep_by_space -1
+n_sep_by_space -1
+p_sign_posn -1
+n_sign_posn -1
+%
+END LC_MONETARY
+
+LC_NUMERIC
+% This is the POSIX Locale definition for
+% the LC_NUMERIC category.
+%
+decimal_point "<U002E>"
+thousands_sep ""
+grouping -1
+END LC_NUMERIC
+
+LC_TIME
+% This is the POSIX Locale definition for
+% the LC_TIME category.
+%
+% Abbreviated weekday names (%a)
+abday "<U0053><U0075><U006E>";"<U004D><U006F><U006E>";/
+ "<U0054><U0075><U0065>";"<U0057><U0065><U0064>";/
+ "<U0054><U0068><U0075>";"<U0046><U0072><U0069>";/
+ "<U0053><U0061><U0074>"
+
+% Full weekday names (%A)
+day "<U0053><U0075><U006E><U0064><U0061><U0079>";/
+ "<U004D><U006F><U006E><U0064><U0061><U0079>";/
+ "<U0054><U0075><U0065><U0073><U0064><U0061><U0079>";/
+ "<U0057><U0065><U0064><U006E><U0065><U0073><U0064><U0061><U0079>";/
+ "<U0054><U0068><U0075><U0072><U0073><U0064><U0061><U0079>";/
+ "<U0046><U0072><U0069><U0064><U0061><U0079>";/
+ "<U0053><U0061><U0074><U0075><U0072><U0064><U0061><U0079>"
+
+% Abbreviated month names (%b)
+abmon "<U004A><U0061><U006E>";"<U0046><U0065><U0062>";/
+ "<U004D><U0061><U0072>";"<U0041><U0070><U0072>";/
+ "<U004D><U0061><U0079>";"<U004A><U0075><U006E>";/
+ "<U004A><U0075><U006C>";"<U0041><U0075><U0067>";/
+ "<U0053><U0065><U0070>";"<U004F><U0063><U0074>";/
+ "<U004E><U006F><U0076>";"<U0044><U0065><U0063>"
+
+% Full month names (%B)
+mon "<U004A><U0061><U006E><U0075><U0061><U0072><U0079>";/
+ "<U0046><U0065><U0062><U0072><U0075><U0061><U0072><U0079>";/
+ "<U004D><U0061><U0072><U0063><U0068>";/
+ "<U0041><U0070><U0072><U0069><U006C>";/
+ "<U004D><U0061><U0079>";/
+ "<U004A><U0075><U006E><U0065>";/
+ "<U004A><U0075><U006C><U0079>";/
+ "<U0041><U0075><U0067><U0075><U0073><U0074>";/
+ "<U0053><U0065><U0070><U0074><U0065><U006D><U0062><U0065><U0072>";/
+ "<U004F><U0063><U0074><U006F><U0062><U0065><U0072>";/
+ "<U004E><U006F><U0076><U0065><U006D><U0062><U0065><U0072>";/
+ "<U0044><U0065><U0063><U0065><U006D><U0062><U0065><U0072>"
+
+% Week description, consists of three fields:
+% 1. Number of days in a week.
+% 2. Gregorian date that is a first weekday (19971130 for Sunday, 19971201 for Monday).
+% 3. The weekday number to be contained in the first week of the year.
+%
+% ISO 8601 conforming applications should use the values 7, 19971201 (a
+% Monday), and 4 (Thursday), respectively.
+week 7;19971201;4
+first_weekday 1
+first_workday 1
+
+% Appropriate date and time representation (%c)
+% "%a %b %e %H:%M:%S %Y"
+d_t_fmt "<U0025><U0061><U0020><U0025><U0062><U0020><U0025><U0065><U0020><U0025><U0048><U003A><U0025><U004D><U003A><U0025><U0053><U0020><U0025><U0059>"
+
+% Appropriate date representation (%x)
+% "%m/%d/%y"
+d_fmt "<U0025><U006D><U002F><U0025><U0064><U002F><U0025><U0079>"
+
+% Appropriate time representation (%X)
+% "%H:%M:%S"
+t_fmt "<U0025><U0048><U003A><U0025><U004D><U003A><U0025><U0053>"
+
+% Appropriate AM/PM time representation (%r)
+% "%I:%M:%S %p"
+t_fmt_ampm "<U0025><U0049><U003A><U0025><U004D><U003A><U0025><U0053><U0020><U0025><U0070>"
+
+% Equivalent of AM/PM (%p) "AM"/"PM"
+%
+am_pm "<U0041><U004D>";"<U0050><U004D>"
+
+% Appropriate date representation (date(1)) "%a %b %e %H:%M:%S %Z %Y"
+date_fmt "<U0025><U0061><U0020><U0025><U0062><U0020><U0025><U0065><U0020><U0025><U0048><U003A><U0025><U004D><U003A><U0025><U0053><U0020><U0025><U005A><U0020><U0025><U0059>"
+END LC_TIME
+
+LC_MESSAGES
+% This is the POSIX Locale definition for
+% the LC_NUMERIC category.
+%
+yesexpr "<U005E><U005B><U0079><U0059><U005D>"
+noexpr "<U005E><U005B><U006E><U004E><U005D>"
+yesstr "<U0059><U0065><U0073>"
+nostr "<U004E><U006F>"
+END LC_MESSAGES
+
+LC_PAPER
+% This is the ISO/IEC 14652 "i18n" definition for
+% the LC_PAPER category.
+% (A4 paper, this is also used in the built in C/POSIX
+% locale in glibc/locale/C-paper.c)
+height 297
+width 210
+END LC_PAPER
+
+LC_NAME
+% This is the ISO/IEC 14652 "i18n" definition for
+% the LC_NAME category.
+% "%p%t%g%t%m%t%f"
+% (also used in the built in C/POSIX locale in glibc/locale/C-name.c)
+name_fmt "<U0025><U0070><U0025><U0074><U0025><U0067><U0025><U0074>/
+<U0025><U006D><U0025><U0074><U0025><U0066>"
+END LC_NAME
+
+LC_ADDRESS
+% This is the ISO/IEC 14652 "i18n" definition for
+% the LC_ADDRESS category.
+% "%a%N%f%N%d%N%b%N%s %h %e %r%N%C-%z %T%N%c%N"
+% (also used in the built in C/POSIX locale in glibc/locale/C-address.c)
+postal_fmt "<U0025><U0061><U0025><U004E><U0025><U0066><U0025><U004E>/
+<U0025><U0064><U0025><U004E><U0025><U0062><U0025><U004E><U0025><U0073>/
+<U0020><U0025><U0068><U0020><U0025><U0065><U0020><U0025><U0072><U0025>/
+<U004E><U0025><U0043><U002D><U0025><U007A><U0020><U0025><U0054><U0025>/
+<U004E><U0025><U0063><U0025><U004E>"
+END LC_ADDRESS
+
+LC_TELEPHONE
+% This is the ISO/IEC 14652 "i18n" definition for
+% the LC_TELEPHONE category.
+% "+%c %a %l"
+tel_int_fmt "<U002B><U0025><U0063><U0020><U0025><U0061><U0020><U0025>/
+<U006C>"
+% (also used in the built in C/POSIX locale in glibc/locale/C-telephone.c)
+END LC_TELEPHONE
+
+LC_MEASUREMENT
+% This is the ISO/IEC 14652 "i18n" definition for
+% the LC_MEASUREMENT category.
+% (same as in the built in C/POSIX locale in glibc/locale/C-measurement.c)
+%metric
+measurement 1
+END LC_MEASUREMENT
+
--
2.4.3

@ -1,15 +0,0 @@
Short description: Adjust CS_PATH return value.
Author(s): Fedora glibc team <glibc@lists.fedoraproject.org>
Origin: PATCH
Upstream status: not-needed

In Fedora we should return only /usr/bin because /bin is just a symlink
to /usr/bin after the MoveToUsr transition (which glibc has not really
completed).

diff -pruN a/sysdeps/unix/confstr.h b/sysdeps/unix/confstr.h
--- a/sysdeps/unix/confstr.h	2012-12-25 08:32:13.000000000 +0530
+++ b/sysdeps/unix/confstr.h	2014-09-05 20:02:55.698275219 +0530
@@ -1 +1 @@
-#define CS_PATH "/bin:/usr/bin"
+#define CS_PATH "/usr/bin"
@ -1,91 +0,0 @@
Short description: Cleanup use of _dl_starting_up.
Author(s): Fedora glibc team <glibc@lists.fedoraproject.org>
Origin: PATCH
Upstream status: https://sourceware.org/ml/libc-alpha/2014-02/msg00589.html

Upstream discussions:
https://sourceware.org/ml/libc-alpha/2014-02/msg00580.html

Based on the following commit:
~~~
From 16552c01a66633c9e412984d9d92616bd4e5303c Mon Sep 17 00:00:00 2001
From: Andreas Schwab <schwab@redhat.com>
Date: Fri, 11 Jun 2010 11:04:11 +0200
Subject: [PATCH] Properly set __libc_multiple_libcs

* elf/rtld.c (_dl_starting_up): Always define.
(dl_main): Always set _dl_starting_up.
* elf/dl-support.c (_dl_starting_up): Always define.
* elf/dl-init.c (_dl_init): Always clear _dl_starting_up.

---
ChangeLog | 7 +++++++
elf/dl-init.c | 4 ----
elf/dl-support.c | 2 --
elf/rtld.c | 4 ----
4 files changed, 7 insertions(+), 10 deletions(-)
~~~

This patch needs to go upstream to get cleaned up, but has always involved
analysis of the GNU/Hurd parts of the change and that stalled out, but
perhaps with build-many-glibcs we can now test these changes more easily.

Index: b/elf/dl-init.c
===================================================================
--- a/elf/dl-init.c
+++ b/elf/dl-init.c
@@ -119,8 +119,6 @@ _dl_init (struct link_map *main_map, int
while (i-- > 0)
call_init (main_map->l_initfini[i], argc, argv, env);

-#ifndef HAVE_INLINED_SYSCALLS
/* Finished starting up. */
_dl_starting_up = 0;
-#endif
}
Index: b/elf/dl-support.c
===================================================================
--- a/elf/dl-support.c
+++ b/elf/dl-support.c
@@ -117,10 +117,8 @@ struct r_scope_elem _dl_initial_searchli
.r_nlist = 1,
};

-#ifndef HAVE_INLINED_SYSCALLS
/* Nonzero during startup. */
int _dl_starting_up = 1;
-#endif

/* Random data provided by the kernel. */
void *_dl_random;
Index: b/elf/rtld.c
===================================================================
--- a/elf/rtld.c
+++ b/elf/rtld.c
@@ -214,7 +214,6 @@ audit_list_iter_next (struct audit_list_
return iter->previous->name;
}

-#ifndef HAVE_INLINED_SYSCALLS
/* Set nonzero during loading and initialization of executable and
libraries, cleared before the executable's entry point runs. This
must not be initialized to nonzero, because the unused dynamic
@@ -224,7 +223,6 @@ audit_list_iter_next (struct audit_list_
never be called. */
int _dl_starting_up = 0;
rtld_hidden_def (_dl_starting_up)
-#endif

/* This is the structure which defines all variables global to ld.so
(except those which cannot be added for some reason). */
@@ -898,10 +896,8 @@ dl_main (const ElfW(Phdr) *phdr,
/* Process the environment variable which control the behaviour. */
process_envvars (&mode);

-#ifndef HAVE_INLINED_SYSCALLS
/* Set up a flag which tells we are just starting. */
_dl_starting_up = 1;
-#endif

if (*user_entry == (ElfW(Addr)) ENTRY_POINT)
{
@ -1,21 +0,0 @@
Short description: Fedora-specific glibc install locale changes.
Author(s): Fedora glibc team <glibc@lists.fedoraproject.org>
Origin: PATCH
Upstream status: not-needed

The Fedora glibc build and install does not need the normal install
behaviour which updates the locale archive. The Fedora install phase
in the spec file of the rpm will handle this manually.

diff --git a/localedata/Makefile b/localedata/Makefile
index a5f3c92d58954dfc..56719c7c714aa0f1 100644
--- a/localedata/Makefile
+++ b/localedata/Makefile
@@ -218,6 +218,7 @@ $(INSTALL-SUPPORTED-LOCALES): install-locales-dir
echo -n '...'; \
input=`echo $$locale | sed 's/\([^.]*\)[^@]*\(.*\)/\1\2/'`; \
$(LOCALEDEF) $$flags --alias-file=../intl/locale.alias \
+ --no-archive \
-i locales/$$input -f charmaps/$$charset \
$(addprefix --prefix=,$(install_root)) $$locale \
&& echo ' done'; \