Compare commits

..

No commits in common. "c8" and "c9-beta" have entirely different histories.
c8 ... c9-beta

2109 changed files with 344239 additions and 230766 deletions

2
.gitignore vendored
View File

@ -1 +1 @@
SOURCES/glibc-2.28.tar.xz
SOURCES/glibc-2.34.tar.xz

View File

@ -1 +1 @@
ccb5dc9e51a9884df8488f86982439d47b283b2a SOURCES/glibc-2.28.tar.xz
7c3b8890a6346793b6334cc5f2fea5d437d307b8 SOURCES/glibc-2.34.tar.xz

File diff suppressed because it is too large Load Diff

View File

@ -1,496 +0,0 @@
# This file names the currently supported and somewhat tested locales.
# If you have any additions please file a glibc bug report.
SUPPORTED-LOCALES=\
C.UTF-8/UTF-8 \
aa_DJ.UTF-8/UTF-8 \
aa_DJ/ISO-8859-1 \
aa_ER/UTF-8 \
aa_ER@saaho/UTF-8 \
aa_ET/UTF-8 \
af_ZA.UTF-8/UTF-8 \
af_ZA/ISO-8859-1 \
agr_PE/UTF-8 \
ak_GH/UTF-8 \
am_ET/UTF-8 \
an_ES.UTF-8/UTF-8 \
an_ES/ISO-8859-15 \
anp_IN/UTF-8 \
ar_AE.UTF-8/UTF-8 \
ar_AE/ISO-8859-6 \
ar_BH.UTF-8/UTF-8 \
ar_BH/ISO-8859-6 \
ar_DZ.UTF-8/UTF-8 \
ar_DZ/ISO-8859-6 \
ar_EG.UTF-8/UTF-8 \
ar_EG/ISO-8859-6 \
ar_IN/UTF-8 \
ar_IQ.UTF-8/UTF-8 \
ar_IQ/ISO-8859-6 \
ar_JO.UTF-8/UTF-8 \
ar_JO/ISO-8859-6 \
ar_KW.UTF-8/UTF-8 \
ar_KW/ISO-8859-6 \
ar_LB.UTF-8/UTF-8 \
ar_LB/ISO-8859-6 \
ar_LY.UTF-8/UTF-8 \
ar_LY/ISO-8859-6 \
ar_MA.UTF-8/UTF-8 \
ar_MA/ISO-8859-6 \
ar_OM.UTF-8/UTF-8 \
ar_OM/ISO-8859-6 \
ar_QA.UTF-8/UTF-8 \
ar_QA/ISO-8859-6 \
ar_SA.UTF-8/UTF-8 \
ar_SA/ISO-8859-6 \
ar_SD.UTF-8/UTF-8 \
ar_SD/ISO-8859-6 \
ar_SS/UTF-8 \
ar_SY.UTF-8/UTF-8 \
ar_SY/ISO-8859-6 \
ar_TN.UTF-8/UTF-8 \
ar_TN/ISO-8859-6 \
ar_YE.UTF-8/UTF-8 \
ar_YE/ISO-8859-6 \
ayc_PE/UTF-8 \
az_AZ/UTF-8 \
az_IR/UTF-8 \
as_IN/UTF-8 \
ast_ES.UTF-8/UTF-8 \
ast_ES/ISO-8859-15 \
be_BY.UTF-8/UTF-8 \
be_BY/CP1251 \
be_BY@latin/UTF-8 \
bem_ZM/UTF-8 \
ber_DZ/UTF-8 \
ber_MA/UTF-8 \
bg_BG.UTF-8/UTF-8 \
bg_BG/CP1251 \
bhb_IN.UTF-8/UTF-8 \
bho_IN/UTF-8 \
bho_NP/UTF-8 \
bi_VU/UTF-8 \
bn_BD/UTF-8 \
bn_IN/UTF-8 \
bo_CN/UTF-8 \
bo_IN/UTF-8 \
br_FR.UTF-8/UTF-8 \
br_FR/ISO-8859-1 \
br_FR@euro/ISO-8859-15 \
brx_IN/UTF-8 \
bs_BA.UTF-8/UTF-8 \
bs_BA/ISO-8859-2 \
byn_ER/UTF-8 \
ca_AD.UTF-8/UTF-8 \
ca_AD/ISO-8859-15 \
ca_ES.UTF-8/UTF-8 \
ca_ES/ISO-8859-1 \
ca_ES@euro/ISO-8859-15 \
ca_ES@valencia/UTF-8 \
ca_FR.UTF-8/UTF-8 \
ca_FR/ISO-8859-15 \
ca_IT.UTF-8/UTF-8 \
ca_IT/ISO-8859-15 \
ce_RU/UTF-8 \
chr_US/UTF-8 \
cmn_TW/UTF-8 \
crh_UA/UTF-8 \
cs_CZ.UTF-8/UTF-8 \
cs_CZ/ISO-8859-2 \
csb_PL/UTF-8 \
cv_RU/UTF-8 \
cy_GB.UTF-8/UTF-8 \
cy_GB/ISO-8859-14 \
da_DK.UTF-8/UTF-8 \
da_DK/ISO-8859-1 \
da_DK.ISO-8859-15/ISO-8859-15 \
de_AT.UTF-8/UTF-8 \
de_AT/ISO-8859-1 \
de_AT@euro/ISO-8859-15 \
de_BE.UTF-8/UTF-8 \
de_BE/ISO-8859-1 \
de_BE@euro/ISO-8859-15 \
de_CH.UTF-8/UTF-8 \
de_CH/ISO-8859-1 \
de_DE.UTF-8/UTF-8 \
de_DE/ISO-8859-1 \
de_DE@euro/ISO-8859-15 \
de_IT.UTF-8/UTF-8 \
de_IT/ISO-8859-1 \
de_LI.UTF-8/UTF-8 \
de_LU.UTF-8/UTF-8 \
de_LU/ISO-8859-1 \
de_LU@euro/ISO-8859-15 \
doi_IN/UTF-8 \
dsb_DE/UTF-8 \
dv_MV/UTF-8 \
dz_BT/UTF-8 \
el_GR.UTF-8/UTF-8 \
el_GR/ISO-8859-7 \
el_GR@euro/ISO-8859-7 \
el_CY.UTF-8/UTF-8 \
el_CY/ISO-8859-7 \
en_AG/UTF-8 \
en_AU.UTF-8/UTF-8 \
en_AU/ISO-8859-1 \
en_BW.UTF-8/UTF-8 \
en_BW/ISO-8859-1 \
en_CA.UTF-8/UTF-8 \
en_CA/ISO-8859-1 \
en_DK.UTF-8/UTF-8 \
en_DK/ISO-8859-1 \
en_GB.UTF-8/UTF-8 \
en_GB/ISO-8859-1 \
en_GB.ISO-8859-15/ISO-8859-15 \
en_HK.UTF-8/UTF-8 \
en_HK/ISO-8859-1 \
en_IE.UTF-8/UTF-8 \
en_IE/ISO-8859-1 \
en_IE@euro/ISO-8859-15 \
en_IL/UTF-8 \
en_IN/UTF-8 \
en_NG/UTF-8 \
en_NZ.UTF-8/UTF-8 \
en_NZ/ISO-8859-1 \
en_PH.UTF-8/UTF-8 \
en_PH/ISO-8859-1 \
en_SC.UTF-8/UTF-8 \
en_SG.UTF-8/UTF-8 \
en_SG/ISO-8859-1 \
en_US.UTF-8/UTF-8 \
en_US/ISO-8859-1 \
en_US.ISO-8859-15/ISO-8859-15 \
en_US@ampm/UTF-8 \
en_US.UTF-8@ampm/UTF-8 \
en_ZA.UTF-8/UTF-8 \
en_ZA/ISO-8859-1 \
en_ZM/UTF-8 \
en_ZW.UTF-8/UTF-8 \
en_ZW/ISO-8859-1 \
eo/UTF-8 \
es_AR.UTF-8/UTF-8 \
es_AR/ISO-8859-1 \
es_BO.UTF-8/UTF-8 \
es_BO/ISO-8859-1 \
es_CL.UTF-8/UTF-8 \
es_CL/ISO-8859-1 \
es_CO.UTF-8/UTF-8 \
es_CO/ISO-8859-1 \
es_CR.UTF-8/UTF-8 \
es_CR/ISO-8859-1 \
es_CU/UTF-8 \
es_DO.UTF-8/UTF-8 \
es_DO/ISO-8859-1 \
es_EC.UTF-8/UTF-8 \
es_EC/ISO-8859-1 \
es_ES.UTF-8/UTF-8 \
es_ES/ISO-8859-1 \
es_ES@euro/ISO-8859-15 \
es_GT.UTF-8/UTF-8 \
es_GT/ISO-8859-1 \
es_HN.UTF-8/UTF-8 \
es_HN/ISO-8859-1 \
es_MX.UTF-8/UTF-8 \
es_MX/ISO-8859-1 \
es_NI.UTF-8/UTF-8 \
es_NI/ISO-8859-1 \
es_PA.UTF-8/UTF-8 \
es_PA/ISO-8859-1 \
es_PE.UTF-8/UTF-8 \
es_PE/ISO-8859-1 \
es_PR.UTF-8/UTF-8 \
es_PR/ISO-8859-1 \
es_PY.UTF-8/UTF-8 \
es_PY/ISO-8859-1 \
es_SV.UTF-8/UTF-8 \
es_SV/ISO-8859-1 \
es_US.UTF-8/UTF-8 \
es_US/ISO-8859-1 \
es_UY.UTF-8/UTF-8 \
es_UY/ISO-8859-1 \
es_VE.UTF-8/UTF-8 \
es_VE/ISO-8859-1 \
et_EE.UTF-8/UTF-8 \
et_EE/ISO-8859-1 \
et_EE.ISO-8859-15/ISO-8859-15 \
eu_ES.UTF-8/UTF-8 \
eu_ES/ISO-8859-1 \
eu_ES@euro/ISO-8859-15 \
fa_IR/UTF-8 \
ff_SN/UTF-8 \
fi_FI.UTF-8/UTF-8 \
fi_FI/ISO-8859-1 \
fi_FI@euro/ISO-8859-15 \
fil_PH/UTF-8 \
fo_FO.UTF-8/UTF-8 \
fo_FO/ISO-8859-1 \
fr_BE.UTF-8/UTF-8 \
fr_BE/ISO-8859-1 \
fr_BE@euro/ISO-8859-15 \
fr_CA.UTF-8/UTF-8 \
fr_CA/ISO-8859-1 \
fr_CH.UTF-8/UTF-8 \
fr_CH/ISO-8859-1 \
fr_FR.UTF-8/UTF-8 \
fr_FR/ISO-8859-1 \
fr_FR@euro/ISO-8859-15 \
fr_LU.UTF-8/UTF-8 \
fr_LU/ISO-8859-1 \
fr_LU@euro/ISO-8859-15 \
fur_IT/UTF-8 \
fy_NL/UTF-8 \
fy_DE/UTF-8 \
ga_IE.UTF-8/UTF-8 \
ga_IE/ISO-8859-1 \
ga_IE@euro/ISO-8859-15 \
gd_GB.UTF-8/UTF-8 \
gd_GB/ISO-8859-15 \
gez_ER/UTF-8 \
gez_ER@abegede/UTF-8 \
gez_ET/UTF-8 \
gez_ET@abegede/UTF-8 \
gl_ES.UTF-8/UTF-8 \
gl_ES/ISO-8859-1 \
gl_ES@euro/ISO-8859-15 \
gu_IN/UTF-8 \
gv_GB.UTF-8/UTF-8 \
gv_GB/ISO-8859-1 \
ha_NG/UTF-8 \
hak_TW/UTF-8 \
he_IL.UTF-8/UTF-8 \
he_IL/ISO-8859-8 \
hi_IN/UTF-8 \
hif_FJ/UTF-8 \
hne_IN/UTF-8 \
hr_HR.UTF-8/UTF-8 \
hr_HR/ISO-8859-2 \
hsb_DE/ISO-8859-2 \
hsb_DE.UTF-8/UTF-8 \
ht_HT/UTF-8 \
hu_HU.UTF-8/UTF-8 \
hu_HU/ISO-8859-2 \
hy_AM/UTF-8 \
hy_AM.ARMSCII-8/ARMSCII-8 \
ia_FR/UTF-8 \
id_ID.UTF-8/UTF-8 \
id_ID/ISO-8859-1 \
ig_NG/UTF-8 \
ik_CA/UTF-8 \
is_IS.UTF-8/UTF-8 \
is_IS/ISO-8859-1 \
it_CH.UTF-8/UTF-8 \
it_CH/ISO-8859-1 \
it_IT.UTF-8/UTF-8 \
it_IT/ISO-8859-1 \
it_IT@euro/ISO-8859-15 \
iu_CA/UTF-8 \
ja_JP.EUC-JP/EUC-JP \
ja_JP.UTF-8/UTF-8 \
ka_GE.UTF-8/UTF-8 \
ka_GE/GEORGIAN-PS \
kab_DZ/UTF-8 \
kk_KZ.UTF-8/UTF-8 \
kk_KZ/PT154 \
kl_GL.UTF-8/UTF-8 \
kl_GL/ISO-8859-1 \
km_KH/UTF-8 \
kn_IN/UTF-8 \
ko_KR.EUC-KR/EUC-KR \
ko_KR.UTF-8/UTF-8 \
kok_IN/UTF-8 \
ks_IN/UTF-8 \
ks_IN@devanagari/UTF-8 \
ku_TR.UTF-8/UTF-8 \
ku_TR/ISO-8859-9 \
kw_GB.UTF-8/UTF-8 \
kw_GB/ISO-8859-1 \
ky_KG/UTF-8 \
lb_LU/UTF-8 \
lg_UG.UTF-8/UTF-8 \
lg_UG/ISO-8859-10 \
li_BE/UTF-8 \
li_NL/UTF-8 \
lij_IT/UTF-8 \
ln_CD/UTF-8 \
lo_LA/UTF-8 \
lt_LT.UTF-8/UTF-8 \
lt_LT/ISO-8859-13 \
lv_LV.UTF-8/UTF-8 \
lv_LV/ISO-8859-13 \
lzh_TW/UTF-8 \
mag_IN/UTF-8 \
mai_IN/UTF-8 \
mai_NP/UTF-8 \
mfe_MU/UTF-8 \
mg_MG.UTF-8/UTF-8 \
mg_MG/ISO-8859-15 \
mhr_RU/UTF-8 \
mi_NZ.UTF-8/UTF-8 \
mi_NZ/ISO-8859-13 \
miq_NI/UTF-8 \
mjw_IN/UTF-8 \
mk_MK.UTF-8/UTF-8 \
mk_MK/ISO-8859-5 \
ml_IN/UTF-8 \
mn_MN/UTF-8 \
mni_IN/UTF-8 \
mr_IN/UTF-8 \
ms_MY.UTF-8/UTF-8 \
ms_MY/ISO-8859-1 \
mt_MT.UTF-8/UTF-8 \
mt_MT/ISO-8859-3 \
my_MM/UTF-8 \
nan_TW/UTF-8 \
nan_TW@latin/UTF-8 \
nb_NO.UTF-8/UTF-8 \
nb_NO/ISO-8859-1 \
nds_DE/UTF-8 \
nds_NL/UTF-8 \
ne_NP/UTF-8 \
nhn_MX/UTF-8 \
niu_NU/UTF-8 \
niu_NZ/UTF-8 \
nl_AW/UTF-8 \
nl_BE.UTF-8/UTF-8 \
nl_BE/ISO-8859-1 \
nl_BE@euro/ISO-8859-15 \
nl_NL.UTF-8/UTF-8 \
nl_NL/ISO-8859-1 \
nl_NL@euro/ISO-8859-15 \
nn_NO.UTF-8/UTF-8 \
nn_NO/ISO-8859-1 \
nr_ZA/UTF-8 \
nso_ZA/UTF-8 \
oc_FR.UTF-8/UTF-8 \
oc_FR/ISO-8859-1 \
om_ET/UTF-8 \
om_KE.UTF-8/UTF-8 \
om_KE/ISO-8859-1 \
or_IN/UTF-8 \
os_RU/UTF-8 \
pa_IN/UTF-8 \
pa_PK/UTF-8 \
pap_AW/UTF-8 \
pap_CW/UTF-8 \
pl_PL.UTF-8/UTF-8 \
pl_PL/ISO-8859-2 \
ps_AF/UTF-8 \
pt_BR.UTF-8/UTF-8 \
pt_BR/ISO-8859-1 \
pt_PT.UTF-8/UTF-8 \
pt_PT/ISO-8859-1 \
pt_PT@euro/ISO-8859-15 \
quz_PE/UTF-8 \
raj_IN/UTF-8 \
ro_RO.UTF-8/UTF-8 \
ro_RO/ISO-8859-2 \
ru_RU.KOI8-R/KOI8-R \
ru_RU.UTF-8/UTF-8 \
ru_RU/ISO-8859-5 \
ru_UA.UTF-8/UTF-8 \
ru_UA/KOI8-U \
rw_RW/UTF-8 \
sa_IN/UTF-8 \
sah_RU/UTF-8 \
sat_IN/UTF-8 \
sc_IT/UTF-8 \
sd_IN/UTF-8 \
sd_IN@devanagari/UTF-8 \
se_NO/UTF-8 \
sgs_LT/UTF-8 \
shn_MM/UTF-8 \
shs_CA/UTF-8 \
si_LK/UTF-8 \
sid_ET/UTF-8 \
sk_SK.UTF-8/UTF-8 \
sk_SK/ISO-8859-2 \
sl_SI.UTF-8/UTF-8 \
sl_SI/ISO-8859-2 \
sm_WS/UTF-8 \
so_DJ.UTF-8/UTF-8 \
so_DJ/ISO-8859-1 \
so_ET/UTF-8 \
so_KE.UTF-8/UTF-8 \
so_KE/ISO-8859-1 \
so_SO.UTF-8/UTF-8 \
so_SO/ISO-8859-1 \
sq_AL.UTF-8/UTF-8 \
sq_AL/ISO-8859-1 \
sq_MK/UTF-8 \
sr_ME/UTF-8 \
sr_RS/UTF-8 \
sr_RS@latin/UTF-8 \
ss_ZA/UTF-8 \
st_ZA.UTF-8/UTF-8 \
st_ZA/ISO-8859-1 \
sv_FI.UTF-8/UTF-8 \
sv_FI/ISO-8859-1 \
sv_FI@euro/ISO-8859-15 \
sv_SE.UTF-8/UTF-8 \
sv_SE/ISO-8859-1 \
sv_SE.ISO-8859-15/ISO-8859-15 \
sw_KE/UTF-8 \
sw_TZ/UTF-8 \
szl_PL/UTF-8 \
ta_IN/UTF-8 \
ta_LK/UTF-8 \
tcy_IN.UTF-8/UTF-8 \
te_IN/UTF-8 \
tg_TJ.UTF-8/UTF-8 \
tg_TJ/KOI8-T \
th_TH.UTF-8/UTF-8 \
th_TH/TIS-620 \
the_NP/UTF-8 \
ti_ER/UTF-8 \
ti_ET/UTF-8 \
tig_ER/UTF-8 \
tk_TM/UTF-8 \
tl_PH.UTF-8/UTF-8 \
tl_PH/ISO-8859-1 \
tn_ZA/UTF-8 \
to_TO/UTF-8 \
tpi_PG/UTF-8 \
tr_CY.UTF-8/UTF-8 \
tr_CY/ISO-8859-9 \
tr_TR.UTF-8/UTF-8 \
tr_TR/ISO-8859-9 \
ts_ZA/UTF-8 \
tt_RU/UTF-8 \
tt_RU@iqtelif/UTF-8 \
ug_CN/UTF-8 \
uk_UA.UTF-8/UTF-8 \
uk_UA/KOI8-U \
unm_US/UTF-8 \
ur_IN/UTF-8 \
ur_PK/UTF-8 \
uz_UZ.UTF-8/UTF-8 \
uz_UZ/ISO-8859-1 \
uz_UZ@cyrillic/UTF-8 \
ve_ZA/UTF-8 \
vi_VN/UTF-8 \
wa_BE/ISO-8859-1 \
wa_BE@euro/ISO-8859-15 \
wa_BE.UTF-8/UTF-8 \
wae_CH/UTF-8 \
wal_ET/UTF-8 \
wo_SN/UTF-8 \
xh_ZA.UTF-8/UTF-8 \
xh_ZA/ISO-8859-1 \
yi_US.UTF-8/UTF-8 \
yi_US/CP1255 \
yo_NG/UTF-8 \
yue_HK/UTF-8 \
yuw_PG/UTF-8 \
zh_CN.GB18030/GB18030 \
zh_CN.GBK/GBK \
zh_CN.UTF-8/UTF-8 \
zh_CN/GB2312 \
zh_HK.UTF-8/UTF-8 \
zh_HK/BIG5-HKSCS \
zh_SG.UTF-8/UTF-8 \
zh_SG.GBK/GBK \
zh_SG/GB2312 \
zh_TW.EUC-TW/EUC-TW \
zh_TW.UTF-8/UTF-8 \
zh_TW/BIG5 \
zu_ZA.UTF-8/UTF-8 \
zu_ZA/ISO-8859-1 \

View File

@ -1,862 +0,0 @@
#define _GNU_SOURCE
#include <assert.h>
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <locale.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <getopt.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
#include "../locale/hashval.h"
#define __LC_LAST 13
#include "../locale/locarchive.h"
#include "../crypt/md5.h"
const char *alias_file = DATADIR "/locale/locale.alias";
const char *locar_file = PREFIX "/lib/locale/locale-archive";
const char *tmpl_file = PREFIX "/lib/locale/locale-archive.tmpl";
const char *loc_path = PREFIX "/lib/locale/";
/* Flags set by `--verbose` option. */
int be_quiet = 1;
int verbose = 0;
int max_locarchive_open_retry = 10;
const char *output_prefix;
/* Endianness should have been taken care of by localedef. We don't need to do
additional swapping. We need this variable exported however, since
locarchive.c uses it to determine if it needs to swap endianness of a value
before writing to or reading from the archive. */
bool swap_endianness_p = false;
static const char *locnames[] =
{
#define DEFINE_CATEGORY(category, category_name, items, a) \
[category] = category_name,
#include "../locale/categories.def"
#undef DEFINE_CATEGORY
};
static int
is_prime (unsigned long candidate)
{
/* No even number and none less than 10 will be passed here. */
unsigned long int divn = 3;
unsigned long int sq = divn * divn;
while (sq < candidate && candidate % divn != 0)
{
++divn;
sq += 4 * divn;
++divn;
}
return candidate % divn != 0;
}
unsigned long
next_prime (unsigned long seed)
{
/* Make it definitely odd. */
seed |= 1;
while (!is_prime (seed))
seed += 2;
return seed;
}
void
error (int status, int errnum, const char *message, ...)
{
va_list args;
va_start (args, message);
fflush (stdout);
fprintf (stderr, "%s: ", program_invocation_name);
vfprintf (stderr, message, args);
va_end (args);
if (errnum)
fprintf (stderr, ": %s", strerror (errnum));
putc ('\n', stderr);
fflush (stderr);
if (status)
exit (errnum == EROFS ? 0 : status);
}
void *
xmalloc (size_t size)
{
void *p = malloc (size);
if (p == NULL)
error (EXIT_FAILURE, errno, "could not allocate %zd bytes of memory", size);
return p;
}
static void
open_tmpl_archive (struct locarhandle *ah)
{
struct stat64 st;
int fd;
struct locarhead head;
const char *archivefname = ah->fname == NULL ? tmpl_file : ah->fname;
/* Open the archive. We must have exclusive write access. */
fd = open64 (archivefname, O_RDONLY);
if (fd == -1)
error (EXIT_FAILURE, errno, "cannot open locale archive template file \"%s\"",
archivefname);
if (fstat64 (fd, &st) < 0)
error (EXIT_FAILURE, errno, "cannot stat locale archive template file \"%s\"",
archivefname);
/* Read the header. */
if (TEMP_FAILURE_RETRY (read (fd, &head, sizeof (head))) != sizeof (head))
error (EXIT_FAILURE, errno, "cannot read archive header");
ah->fd = fd;
ah->mmaped = (head.sumhash_offset
+ head.sumhash_size * sizeof (struct sumhashent));
if (ah->mmaped > (unsigned long) st.st_size)
error (EXIT_FAILURE, 0, "locale archive template file truncated");
ah->mmaped = st.st_size;
ah->reserved = st.st_size;
/* Now we know how large the administrative information part is.
Map all of it. */
ah->addr = mmap64 (NULL, ah->mmaped, PROT_READ, MAP_SHARED, fd, 0);
if (ah->addr == MAP_FAILED)
error (EXIT_FAILURE, errno, "cannot map archive header");
}
/* Open the locale archive. */
extern void open_archive (struct locarhandle *ah, bool readonly);
/* Close the locale archive. */
extern void close_archive (struct locarhandle *ah);
/* Add given locale data to the archive. */
extern int add_locale_to_archive (struct locarhandle *ah, const char *name,
locale_data_t data, bool replace);
extern void add_alias (struct locarhandle *ah, const char *alias,
bool replace, const char *oldname,
uint32_t *locrec_offset_p);
extern struct namehashent *
insert_name (struct locarhandle *ah,
const char *name, size_t name_len, bool replace);
struct nameent
{
char *name;
struct locrecent *locrec;
};
struct dataent
{
const unsigned char *sum;
uint32_t file_offset;
};
static int
nameentcmp (const void *a, const void *b)
{
struct locrecent *la = ((const struct nameent *) a)->locrec;
struct locrecent *lb = ((const struct nameent *) b)->locrec;
uint32_t start_a = -1, end_a = 0;
uint32_t start_b = -1, end_b = 0;
int cnt;
for (cnt = 0; cnt < __LC_LAST; ++cnt)
if (cnt != LC_ALL)
{
if (la->record[cnt].offset < start_a)
start_a = la->record[cnt].offset;
if (la->record[cnt].offset + la->record[cnt].len > end_a)
end_a = la->record[cnt].offset + la->record[cnt].len;
}
assert (start_a != (uint32_t)-1);
assert (end_a != 0);
for (cnt = 0; cnt < __LC_LAST; ++cnt)
if (cnt != LC_ALL)
{
if (lb->record[cnt].offset < start_b)
start_b = lb->record[cnt].offset;
if (lb->record[cnt].offset + lb->record[cnt].len > end_b)
end_b = lb->record[cnt].offset + lb->record[cnt].len;
}
assert (start_b != (uint32_t)-1);
assert (end_b != 0);
if (start_a != start_b)
return (int)start_a - (int)start_b;
return (int)end_a - (int)end_b;
}
static int
dataentcmp (const void *a, const void *b)
{
if (((const struct dataent *) a)->file_offset
< ((const struct dataent *) b)->file_offset)
return -1;
if (((const struct dataent *) a)->file_offset
> ((const struct dataent *) b)->file_offset)
return 1;
return 0;
}
static int
sumsearchfn (const void *key, const void *ent)
{
uint32_t keyn = *(uint32_t *)key;
uint32_t entn = ((struct dataent *)ent)->file_offset;
if (keyn < entn)
return -1;
if (keyn > entn)
return 1;
return 0;
}
static void
compute_data (struct locarhandle *ah, struct nameent *name, size_t sumused,
struct dataent *files, locale_data_t data)
{
int cnt;
struct locrecent *locrec = name->locrec;
struct dataent *file;
data[LC_ALL].addr = ((char *) ah->addr) + locrec->record[LC_ALL].offset;
data[LC_ALL].size = locrec->record[LC_ALL].len;
for (cnt = 0; cnt < __LC_LAST; ++cnt)
if (cnt != LC_ALL)
{
data[cnt].addr = ((char *) ah->addr) + locrec->record[cnt].offset;
data[cnt].size = locrec->record[cnt].len;
if (data[cnt].addr >= data[LC_ALL].addr
&& data[cnt].addr + data[cnt].size
<= data[LC_ALL].addr + data[LC_ALL].size)
__md5_buffer (data[cnt].addr, data[cnt].size, data[cnt].sum);
else
{
file = bsearch (&locrec->record[cnt].offset, files, sumused,
sizeof (*files), sumsearchfn);
if (file == NULL)
error (EXIT_FAILURE, 0, "inconsistent template file");
memcpy (data[cnt].sum, file->sum, sizeof (data[cnt].sum));
}
}
}
static int
fill_archive (struct locarhandle *tmpl_ah,
const char *fname,
size_t install_langs_count, char *install_langs_list[],
size_t nlist, char *list[],
const char *primary)
{
struct locarhandle ah;
struct locarhead *head;
int result = 0;
struct nameent *names;
struct namehashent *namehashtab;
size_t cnt, used;
struct dataent *files;
struct sumhashent *sumhashtab;
size_t sumused;
struct locrecent *primary_locrec = NULL;
struct nameent *primary_nameent = NULL;
head = tmpl_ah->addr;
names = (struct nameent *) malloc (head->namehash_used
* sizeof (struct nameent));
files = (struct dataent *) malloc (head->sumhash_used
* sizeof (struct dataent));
if (names == NULL || files == NULL)
error (EXIT_FAILURE, errno, "could not allocate tables");
namehashtab = (struct namehashent *) ((char *) tmpl_ah->addr
+ head->namehash_offset);
sumhashtab = (struct sumhashent *) ((char *) tmpl_ah->addr
+ head->sumhash_offset);
for (cnt = used = 0; cnt < head->namehash_size; ++cnt)
if (namehashtab[cnt].locrec_offset != 0)
{
char * name;
int i;
assert (used < head->namehash_used);
name = tmpl_ah->addr + namehashtab[cnt].name_offset;
if (install_langs_count == 0)
{
/* Always intstall the entry. */
names[used].name = name;
names[used++].locrec
= (struct locrecent *) ((char *) tmpl_ah->addr +
namehashtab[cnt].locrec_offset);
}
else
{
/* Only install the entry if the user asked for it via
--install-langs. */
for (i = 0; i < install_langs_count; i++)
{
/* Add one for "_" and one for the null terminator. */
size_t len = strlen (install_langs_list[i]) + 2;
char *install_lang = (char *)xmalloc (len);
strcpy (install_lang, install_langs_list[i]);
if (strchr (install_lang, '_') == NULL)
strcat (install_lang, "_");
if (strncmp (name, install_lang, strlen (install_lang)) == 0)
{
names[used].name = name;
names[used++].locrec
= (struct locrecent *) ((char *)tmpl_ah->addr
+ namehashtab[cnt].locrec_offset);
}
free (install_lang);
}
}
}
/* Sort the names. */
qsort (names, used, sizeof (struct nameent), nameentcmp);
for (cnt = sumused = 0; cnt < head->sumhash_size; ++cnt)
if (sumhashtab[cnt].file_offset != 0)
{
assert (sumused < head->sumhash_used);
files[sumused].sum = (const unsigned char *) sumhashtab[cnt].sum;
files[sumused++].file_offset = sumhashtab[cnt].file_offset;
}
/* Sort by file locations. */
qsort (files, sumused, sizeof (struct dataent), dataentcmp);
/* Open the archive. This call never returns if we cannot
successfully open the archive. */
ah.fname = NULL;
if (fname != NULL)
ah.fname = fname;
open_archive (&ah, false);
if (primary != NULL)
{
for (cnt = 0; cnt < used; ++cnt)
if (strcmp (names[cnt].name, primary) == 0)
break;
if (cnt < used)
{
locale_data_t data;
compute_data (tmpl_ah, &names[cnt], sumused, files, data);
result |= add_locale_to_archive (&ah, primary, data, 0);
primary_locrec = names[cnt].locrec;
primary_nameent = &names[cnt];
}
}
for (cnt = 0; cnt < used; ++cnt)
if (&names[cnt] == primary_nameent)
continue;
else if ((cnt > 0 && names[cnt - 1].locrec == names[cnt].locrec)
|| names[cnt].locrec == primary_locrec)
{
const char *oldname;
struct namehashent *namehashent;
uint32_t locrec_offset;
if (names[cnt].locrec == primary_locrec)
oldname = primary;
else
oldname = names[cnt - 1].name;
namehashent = insert_name (&ah, oldname, strlen (oldname), true);
assert (namehashent->name_offset != 0);
assert (namehashent->locrec_offset != 0);
locrec_offset = namehashent->locrec_offset;
add_alias (&ah, names[cnt].name, 0, oldname, &locrec_offset);
}
else
{
locale_data_t data;
compute_data (tmpl_ah, &names[cnt], sumused, files, data);
result |= add_locale_to_archive (&ah, names[cnt].name, data, 0);
}
while (nlist-- > 0)
{
const char *fname = *list++;
size_t fnamelen = strlen (fname);
struct stat64 st;
DIR *dirp;
struct dirent64 *d;
int seen;
locale_data_t data;
int cnt;
/* First see whether this really is a directory and whether it
contains all the require locale category files. */
if (stat64 (fname, &st) < 0)
{
error (0, 0, "stat of \"%s\" failed: %s: ignored", fname,
strerror (errno));
continue;
}
if (!S_ISDIR (st.st_mode))
{
error (0, 0, "\"%s\" is no directory; ignored", fname);
continue;
}
dirp = opendir (fname);
if (dirp == NULL)
{
error (0, 0, "cannot open directory \"%s\": %s: ignored",
fname, strerror (errno));
continue;
}
seen = 0;
while ((d = readdir64 (dirp)) != NULL)
{
for (cnt = 0; cnt < __LC_LAST; ++cnt)
if (cnt != LC_ALL)
if (strcmp (d->d_name, locnames[cnt]) == 0)
{
unsigned char d_type;
/* We have an object of the required name. If it's
a directory we have to look at a file with the
prefix "SYS_". Otherwise we have found what we
are looking for. */
#ifdef _DIRENT_HAVE_D_TYPE
d_type = d->d_type;
if (d_type != DT_REG)
#endif
{
char fullname[fnamelen + 2 * strlen (d->d_name) + 7];
#ifdef _DIRENT_HAVE_D_TYPE
if (d_type == DT_UNKNOWN || d_type == DT_LNK)
#endif
{
strcpy (stpcpy (stpcpy (fullname, fname), "/"),
d->d_name);
if (stat64 (fullname, &st) == -1)
/* We cannot stat the file, ignore it. */
break;
d_type = IFTODT (st.st_mode);
}
if (d_type == DT_DIR)
{
/* We have to do more tests. The file is a
directory and it therefore must contain a
regular file with the same name except a
"SYS_" prefix. */
char *t = stpcpy (stpcpy (fullname, fname), "/");
strcpy (stpcpy (stpcpy (t, d->d_name), "/SYS_"),
d->d_name);
if (stat64 (fullname, &st) == -1)
/* There is no SYS_* file or we cannot
access it. */
break;
d_type = IFTODT (st.st_mode);
}
}
/* If we found a regular file (eventually after
following a symlink) we are successful. */
if (d_type == DT_REG)
++seen;
break;
}
}
closedir (dirp);
if (seen != __LC_LAST - 1)
{
/* We don't have all locale category files. Ignore the name. */
error (0, 0, "incomplete set of locale files in \"%s\"",
fname);
continue;
}
/* Add the files to the archive. To do this we first compute
sizes and the MD5 sums of all the files. */
for (cnt = 0; cnt < __LC_LAST; ++cnt)
if (cnt != LC_ALL)
{
char fullname[fnamelen + 2 * strlen (locnames[cnt]) + 7];
int fd;
strcpy (stpcpy (stpcpy (fullname, fname), "/"), locnames[cnt]);
fd = open64 (fullname, O_RDONLY);
if (fd == -1 || fstat64 (fd, &st) == -1)
{
/* Cannot read the file. */
if (fd != -1)
close (fd);
break;
}
if (S_ISDIR (st.st_mode))
{
char *t;
close (fd);
t = stpcpy (stpcpy (fullname, fname), "/");
strcpy (stpcpy (stpcpy (t, locnames[cnt]), "/SYS_"),
locnames[cnt]);
fd = open64 (fullname, O_RDONLY);
if (fd == -1 || fstat64 (fd, &st) == -1
|| !S_ISREG (st.st_mode))
{
if (fd != -1)
close (fd);
break;
}
}
/* Map the file. */
data[cnt].addr = mmap64 (NULL, st.st_size, PROT_READ, MAP_SHARED,
fd, 0);
if (data[cnt].addr == MAP_FAILED)
{
/* Cannot map it. */
close (fd);
break;
}
data[cnt].size = st.st_size;
__md5_buffer (data[cnt].addr, st.st_size, data[cnt].sum);
/* We don't need the file descriptor anymore. */
close (fd);
}
if (cnt != __LC_LAST)
{
while (cnt-- > 0)
if (cnt != LC_ALL)
munmap (data[cnt].addr, data[cnt].size);
error (0, 0, "cannot read all files in \"%s\": ignored", fname);
continue;
}
result |= add_locale_to_archive (&ah, basename (fname), data, 0);
for (cnt = 0; cnt < __LC_LAST; ++cnt)
if (cnt != LC_ALL)
munmap (data[cnt].addr, data[cnt].size);
}
/* We are done. */
close_archive (&ah);
return result;
}
void usage()
{
printf ("\
Usage: build-locale-archive [OPTION]... [TEMPLATE-FILE] [ARCHIVE-FILE]\n\
Builds a locale archive from a template file.\n\
Options:\n\
-h, --help Print this usage message.\n\
-v, --verbose Verbose execution.\n\
-l, --install-langs=LIST Only include locales given in LIST into the \n\
locale archive. LIST is a colon separated list\n\
of locale prefixes, for example \"de:en:ja\".\n\
The special argument \"all\" means to install\n\
all languages and it must be present by itself.\n\
If \"all\" is present with any other language it\n\
will be treated as the name of a locale.\n\
If the --install-langs option is missing, all\n\
locales are installed. The colon separated list\n\
can contain any strings matching the beginning of\n\
locale names.\n\
If a string does not contain a \"_\", it is added.\n\
Examples:\n\
--install-langs=\"en\"\n\
installs en_US, en_US.iso88591,\n\
en_US.iso885915, en_US.utf8,\n\
en_GB ...\n\
--install-langs=\"en_US.utf8\"\n\
installs only en_US.utf8.\n\
--install-langs=\"ko\"\n\
installs ko_KR, ko_KR.euckr,\n\
ko_KR.utf8 but *not* kok_IN\n\
because \"ko\" does not contain\n\
\"_\" and it is silently added\n\
--install-langs\"ko:kok\"\n\
installs ko_KR, ko_KR.euckr,\n\
ko_KR.utf8, kok_IN, and\n\
kok_IN.utf8.\n\
--install-langs=\"POSIX\" will\n\
installs *no* locales at all\n\
because POSIX matches none of\n\
the locales. Actually, any string\n\
matching nothing will do that.\n\
POSIX and C will always be\n\
available because they are\n\
builtin.\n\
Aliases are installed as well,\n\
i.e. --install-langs=\"de\"\n\
will install not only every locale starting with\n\
\"de\" but also the aliases \"deutsch\"\n\
and and \"german\" although the latter does not\n\
start with \"de\".\n\
\n\
If the arguments TEMPLATE-FILE and ARCHIVE-FILE are not given the locations\n\
where the glibc used expects these files are used by default.\n\
");
}
int main (int argc, char *argv[])
{
char path[4096];
DIR *dirp;
struct dirent64 *d;
struct stat64 st;
char *list[16384], *primary;
char *lang;
int install_langs_count = 0;
int i;
char *install_langs_arg, *ila_start;
char **install_langs_list = NULL;
unsigned int cnt = 0;
struct locarhandle tmpl_ah;
char *new_locar_fname = NULL;
size_t loc_path_len = strlen (loc_path);
while (1)
{
int c;
static struct option long_options[] =
{
{"help", no_argument, 0, 'h'},
{"verbose", no_argument, 0, 'v'},
{"install-langs", required_argument, 0, 'l'},
{0, 0, 0, 0}
};
/* getopt_long stores the option index here. */
int option_index = 0;
c = getopt_long (argc, argv, "vhl:",
long_options, &option_index);
/* Detect the end of the options. */
if (c == -1)
break;
switch (c)
{
case 0:
printf ("unknown option %s", long_options[option_index].name);
if (optarg)
printf (" with arg %s", optarg);
printf ("\n");
usage ();
exit (1);
case 'v':
verbose = 1;
be_quiet = 0;
break;
case 'h':
usage ();
exit (0);
case 'l':
install_langs_arg = ila_start = strdup (optarg);
/* If the argument to --install-lang is "all", do
not limit the list of languages to install and install
them all. We do not support installing a single locale
called "all". */
#define MAGIC_INSTALL_ALL "all"
if (install_langs_arg != NULL
&& install_langs_arg[0] != '\0'
&& !(strncmp(install_langs_arg, MAGIC_INSTALL_ALL,
strlen(MAGIC_INSTALL_ALL)) == 0
&& strlen (install_langs_arg) == 3))
{
/* Count the number of languages we will install. */
while (true)
{
lang = strtok(install_langs_arg, ":;,");
if (lang == NULL)
break;
install_langs_count++;
install_langs_arg = NULL;
}
free (ila_start);
/* Reject an entire string made up of delimiters. */
if (install_langs_count == 0)
break;
/* Copy the list. */
install_langs_list = (char **)xmalloc (sizeof(char *) * install_langs_count);
install_langs_arg = ila_start = strdup (optarg);
install_langs_count = 0;
while (true)
{
lang = strtok(install_langs_arg, ":;,");
if (lang == NULL)
break;
install_langs_list[install_langs_count] = lang;
install_langs_count++;
install_langs_arg = NULL;
}
}
break;
case '?':
/* getopt_long already printed an error message. */
usage ();
exit (0);
default:
abort ();
}
}
tmpl_ah.fname = NULL;
if (optind < argc)
tmpl_ah.fname = argv[optind];
if (optind + 1 < argc)
new_locar_fname = argv[optind + 1];
if (verbose)
{
if (tmpl_ah.fname)
printf("input archive file specified on command line: %s\n",
tmpl_ah.fname);
else
printf("using default input archive file.\n");
if (new_locar_fname)
printf("output archive file specified on command line: %s\n",
new_locar_fname);
else
printf("using default output archive file.\n");
}
dirp = opendir (loc_path);
if (dirp == NULL)
error (EXIT_FAILURE, errno, "cannot open directory \"%s\"", loc_path);
open_tmpl_archive (&tmpl_ah);
if (new_locar_fname)
unlink (new_locar_fname);
else
unlink (locar_file);
primary = getenv ("LC_ALL");
if (primary == NULL)
primary = getenv ("LANG");
if (primary != NULL)
{
if (strncmp (primary, "ja", 2) != 0
&& strncmp (primary, "ko", 2) != 0
&& strncmp (primary, "zh", 2) != 0)
{
char *ptr = malloc (strlen (primary) + strlen (".utf8") + 1), *p, *q;
/* This leads to invalid locales sometimes:
de_DE.iso885915@euro -> de_DE.utf8@euro */
if (ptr != NULL)
{
p = ptr;
q = primary;
while (*q && *q != '.' && *q != '@')
*p++ = *q++;
if (*q == '.')
while (*q && *q != '@')
q++;
p = stpcpy (p, ".utf8");
strcpy (p, q);
primary = ptr;
}
else
primary = NULL;
}
}
memcpy (path, loc_path, loc_path_len);
while ((d = readdir64 (dirp)) != NULL)
{
if (strcmp (d->d_name, ".") == 0 || strcmp (d->d_name, "..") == 0)
continue;
if (strchr (d->d_name, '_') == NULL)
continue;
size_t d_name_len = strlen (d->d_name);
if (loc_path_len + d_name_len + 1 > sizeof (path))
{
error (0, 0, "too long filename \"%s\"", d->d_name);
continue;
}
memcpy (path + loc_path_len, d->d_name, d_name_len + 1);
if (stat64 (path, &st) < 0)
{
error (0, errno, "cannot stat \"%s\"", path);
continue;
}
if (! S_ISDIR (st.st_mode))
continue;
if (cnt == 16384)
{
error (0, 0, "too many directories in \"%s\"", loc_path);
break;
}
list[cnt] = strdup (path);
if (list[cnt] == NULL)
{
error (0, errno, "cannot add file to list \"%s\"", path);
continue;
}
if (primary != NULL && cnt > 0 && strcmp (primary, d->d_name) == 0)
{
char *p = list[0];
list[0] = list[cnt];
list[cnt] = p;
}
cnt++;
}
closedir (dirp);
/* Store the archive to the file specified as the second argument on the
command line or the default locale archive. */
fill_archive (&tmpl_ah, new_locar_fname,
install_langs_count, install_langs_list,
cnt, list, primary);
close_archive (&tmpl_ah);
truncate (tmpl_file, 0);
if (install_langs_count > 0)
{
free (ila_start);
free (install_langs_list);
}
char *tz_argv[] = { "/usr/sbin/tzdata-update", NULL };
execve (tz_argv[0], (char *const *)tz_argv, (char *const *)&tz_argv[1]);
exit (0);
}

View File

@ -0,0 +1,432 @@
From e4ca6de1bc5e4ba3f94cf0c501a293c5bc827b10 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@ozlabs.org>
Date: Tue, 27 Jul 2021 15:47:49 +1000
Subject: powerpc64: Replace some PPC_FEATURE_HAS_VSX with
PPC_FEATURE_ARCH_2_06
We use PPC_FEATURE_HAS_VSX to select a number of POWER7 optimised
functions. These functions don't use any VSX instructions, so
PPC_FEATURE_ARCH_2_06 seems like a better fit.
Reviewed-by: Tulio Magno Quites Machado Filho <tuliom@linux.ibm.com>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index 0acdf22ba3..32564c8f1f 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -95,7 +95,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
#endif
IFUNC_IMPL_ADD (array, i, memset, hwcap2 & PPC_FEATURE2_ARCH_2_07,
__memset_power8)
- IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_HAS_VSX,
+ IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_ARCH_2_06,
__memset_power7)
IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_ARCH_2_05,
__memset_power6)
@@ -139,7 +139,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
#endif
IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_2_07,
__strlen_power8)
- IFUNC_IMPL_ADD (array, i, strlen, hwcap & PPC_FEATURE_HAS_VSX,
+ IFUNC_IMPL_ADD (array, i, strlen, hwcap & PPC_FEATURE_ARCH_2_06,
__strlen_power7)
IFUNC_IMPL_ADD (array, i, strlen, 1,
__strlen_ppc))
@@ -152,7 +152,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
#endif
IFUNC_IMPL_ADD (array, i, strncmp, hwcap2 & PPC_FEATURE2_ARCH_2_07,
__strncmp_power8)
- IFUNC_IMPL_ADD (array, i, strncmp, hwcap & PPC_FEATURE_HAS_VSX,
+ IFUNC_IMPL_ADD (array, i, strncmp, hwcap & PPC_FEATURE_ARCH_2_06,
__strncmp_power7)
IFUNC_IMPL_ADD (array, i, strncmp, hwcap & PPC_FEATURE_POWER4,
__strncmp_power4)
@@ -165,7 +165,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
hwcap2 & PPC_FEATURE2_ARCH_2_07,
__strchr_power8)
IFUNC_IMPL_ADD (array, i, strchr,
- hwcap & PPC_FEATURE_HAS_VSX,
+ hwcap & PPC_FEATURE_ARCH_2_06,
__strchr_power7)
IFUNC_IMPL_ADD (array, i, strchr, 1,
__strchr_ppc))
@@ -176,7 +176,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
hwcap2 & PPC_FEATURE2_ARCH_2_07,
__strchrnul_power8)
IFUNC_IMPL_ADD (array, i, strchrnul,
- hwcap & PPC_FEATURE_HAS_VSX,
+ hwcap & PPC_FEATURE_ARCH_2_06,
__strchrnul_power7)
IFUNC_IMPL_ADD (array, i, strchrnul, 1,
__strchrnul_ppc))
@@ -192,7 +192,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
#endif
IFUNC_IMPL_ADD (array, i, memcmp, hwcap2 & PPC_FEATURE2_ARCH_2_07,
__memcmp_power8)
- IFUNC_IMPL_ADD (array, i, memcmp, hwcap & PPC_FEATURE_HAS_VSX,
+ IFUNC_IMPL_ADD (array, i, memcmp, hwcap & PPC_FEATURE_ARCH_2_06,
__memcmp_power7)
IFUNC_IMPL_ADD (array, i, memcmp, hwcap & PPC_FEATURE_POWER4,
__memcmp_power4)
@@ -244,7 +244,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
hwcap2 & PPC_FEATURE2_ARCH_2_07,
__memchr_power8)
IFUNC_IMPL_ADD (array, i, memchr,
- hwcap & PPC_FEATURE_HAS_VSX,
+ hwcap & PPC_FEATURE_ARCH_2_06,
__memchr_power7)
IFUNC_IMPL_ADD (array, i, memchr, 1,
__memchr_ppc))
@@ -255,7 +255,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
hwcap2 & PPC_FEATURE2_ARCH_2_07,
__memrchr_power8)
IFUNC_IMPL_ADD (array, i, memrchr,
- hwcap & PPC_FEATURE_HAS_VSX,
+ hwcap & PPC_FEATURE_ARCH_2_06,
__memrchr_power7)
IFUNC_IMPL_ADD (array, i, memrchr, 1,
__memrchr_ppc))
@@ -272,7 +272,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__rawmemchr_power9)
#endif
IFUNC_IMPL_ADD (array, i, rawmemchr,
- hwcap & PPC_FEATURE_HAS_VSX,
+ hwcap & PPC_FEATURE_ARCH_2_06,
__rawmemchr_power7)
IFUNC_IMPL_ADD (array, i, rawmemchr, 1,
__rawmemchr_ppc))
@@ -282,7 +282,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, strnlen,
hwcap2 & PPC_FEATURE2_ARCH_2_07,
__strnlen_power8)
- IFUNC_IMPL_ADD (array, i, strnlen, hwcap & PPC_FEATURE_HAS_VSX,
+ IFUNC_IMPL_ADD (array, i, strnlen, hwcap & PPC_FEATURE_ARCH_2_06,
__strnlen_power7)
IFUNC_IMPL_ADD (array, i, strnlen, 1,
__strnlen_ppc))
@@ -293,14 +293,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
hwcap2 & PPC_FEATURE2_ARCH_2_07,
__strcasecmp_power8)
IFUNC_IMPL_ADD (array, i, strcasecmp,
- hwcap & PPC_FEATURE_HAS_VSX,
+ hwcap & PPC_FEATURE_ARCH_2_06,
__strcasecmp_power7)
IFUNC_IMPL_ADD (array, i, strcasecmp, 1, __strcasecmp_ppc))
/* Support sysdeps/powerpc/powerpc64/multiarch/strcasecmp_l.c. */
IFUNC_IMPL (i, name, strcasecmp_l,
IFUNC_IMPL_ADD (array, i, strcasecmp_l,
- hwcap & PPC_FEATURE_HAS_VSX,
+ hwcap & PPC_FEATURE_ARCH_2_06,
__strcasecmp_l_power7)
IFUNC_IMPL_ADD (array, i, strcasecmp_l, 1,
__strcasecmp_l_ppc))
@@ -311,14 +311,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
hwcap2 & PPC_FEATURE2_ARCH_2_07,
__strncasecmp_power8)
IFUNC_IMPL_ADD (array, i, strncasecmp,
- hwcap & PPC_FEATURE_HAS_VSX,
+ hwcap & PPC_FEATURE_ARCH_2_06,
__strncasecmp_power7)
IFUNC_IMPL_ADD (array, i, strncasecmp, 1, __strncasecmp_ppc))
/* Support sysdeps/powerpc/powerpc64/multiarch/strncase_l.c. */
IFUNC_IMPL (i, name, strncasecmp_l,
IFUNC_IMPL_ADD (array, i, strncasecmp_l,
- hwcap & PPC_FEATURE_HAS_VSX,
+ hwcap & PPC_FEATURE_ARCH_2_06,
__strncasecmp_l_power7)
IFUNC_IMPL_ADD (array, i, strncasecmp_l, 1,
__strncasecmp_l_ppc))
@@ -329,7 +329,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
hwcap2 & PPC_FEATURE2_ARCH_2_07,
__strrchr_power8)
IFUNC_IMPL_ADD (array, i, strrchr,
- hwcap & PPC_FEATURE_HAS_VSX,
+ hwcap & PPC_FEATURE_ARCH_2_06,
__strrchr_power7)
IFUNC_IMPL_ADD (array, i, strrchr, 1,
__strrchr_ppc))
@@ -357,7 +357,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
hwcap2 & PPC_FEATURE2_ARCH_2_07,
__strncpy_power8)
IFUNC_IMPL_ADD (array, i, strncpy,
- hwcap & PPC_FEATURE_HAS_VSX,
+ hwcap & PPC_FEATURE_ARCH_2_06,
__strncpy_power7)
IFUNC_IMPL_ADD (array, i, strncpy, 1,
__strncpy_ppc))
@@ -374,7 +374,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
hwcap2 & PPC_FEATURE2_ARCH_2_07,
__stpncpy_power8)
IFUNC_IMPL_ADD (array, i, stpncpy,
- hwcap & PPC_FEATURE_HAS_VSX,
+ hwcap & PPC_FEATURE_ARCH_2_06,
__stpncpy_power7)
IFUNC_IMPL_ADD (array, i, stpncpy, 1,
__stpncpy_ppc))
@@ -390,7 +390,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
hwcap2 & PPC_FEATURE2_ARCH_2_07,
__strcmp_power8)
IFUNC_IMPL_ADD (array, i, strcmp,
- hwcap & PPC_FEATURE_HAS_VSX,
+ hwcap & PPC_FEATURE_ARCH_2_06,
__strcmp_power7)
IFUNC_IMPL_ADD (array, i, strcmp, 1,
__strcmp_ppc))
@@ -425,7 +425,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/strstr.c. */
IFUNC_IMPL (i, name, strstr,
IFUNC_IMPL_ADD (array, i, strstr,
- hwcap & PPC_FEATURE_HAS_VSX,
+ hwcap & PPC_FEATURE_ARCH_2_06,
__strstr_power7)
IFUNC_IMPL_ADD (array, i, strstr, 1,
__strstr_ppc))
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memchr.c b/sysdeps/powerpc/powerpc64/multiarch/memchr.c
index 0c718d4f15..c24186689e 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memchr.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/memchr.c
@@ -30,7 +30,7 @@ extern __typeof (__memchr) __memchr_power8 attribute_hidden;
libc_ifunc (__memchr,
(hwcap2 & PPC_FEATURE2_ARCH_2_07)
? __memchr_power8 :
- (hwcap & PPC_FEATURE_HAS_VSX)
+ (hwcap & PPC_FEATURE_ARCH_2_06)
? __memchr_power7
: __memchr_ppc);
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memcmp.c b/sysdeps/powerpc/powerpc64/multiarch/memcmp.c
index 4fd089aba7..99559bce26 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memcmp.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/memcmp.c
@@ -40,7 +40,7 @@ libc_ifunc_redirected (__redirect_memcmp, memcmp,
#endif
(hwcap2 & PPC_FEATURE2_ARCH_2_07)
? __memcmp_power8 :
- (hwcap & PPC_FEATURE_HAS_VSX)
+ (hwcap & PPC_FEATURE_ARCH_2_06)
? __memcmp_power7
: (hwcap & PPC_FEATURE_POWER4)
? __memcmp_power4
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memrchr.c b/sysdeps/powerpc/powerpc64/multiarch/memrchr.c
index e06d6468b8..16bb6f0042 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memrchr.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/memrchr.c
@@ -30,7 +30,7 @@ extern __typeof (__memrchr) __memrchr_power8 attribute_hidden;
libc_ifunc (__memrchr,
(hwcap2 & PPC_FEATURE2_ARCH_2_07)
? __memrchr_power8 :
- (hwcap & PPC_FEATURE_HAS_VSX)
+ (hwcap & PPC_FEATURE_ARCH_2_06)
? __memrchr_power7
: __memrchr_ppc);
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset.c b/sysdeps/powerpc/powerpc64/multiarch/memset.c
index 5994bf02e6..c1aa143f60 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memset.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/memset.c
@@ -48,7 +48,7 @@ libc_ifunc (__libc_memset,
# endif
(hwcap2 & PPC_FEATURE2_ARCH_2_07)
? __memset_power8 :
- (hwcap & PPC_FEATURE_HAS_VSX)
+ (hwcap & PPC_FEATURE_ARCH_2_06)
? __memset_power7 :
(hwcap & PPC_FEATURE_ARCH_2_05)
? __memset_power6 :
diff --git a/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c
index c0ffea2b93..b5d2d3a635 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c
@@ -41,7 +41,7 @@ libc_ifunc_redirected (__redirect___rawmemchr, __rawmemchr,
(hwcap2 & PPC_FEATURE2_ARCH_3_00)
? __rawmemchr_power9 :
# endif
- (hwcap & PPC_FEATURE_HAS_VSX)
+ (hwcap & PPC_FEATURE_ARCH_2_06)
? __rawmemchr_power7
: __rawmemchr_ppc);
diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
index bebd377fd9..e7035761a7 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
@@ -40,7 +40,7 @@ libc_ifunc_redirected (__redirect___stpncpy, __stpncpy,
# endif
(hwcap2 & PPC_FEATURE2_ARCH_2_07)
? __stpncpy_power8
- : (hwcap & PPC_FEATURE_HAS_VSX)
+ : (hwcap & PPC_FEATURE_ARCH_2_06)
? __stpncpy_power7
: __stpncpy_ppc);
weak_alias (__stpncpy, stpncpy)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcasecmp.c b/sysdeps/powerpc/powerpc64/multiarch/strcasecmp.c
index dcd7774403..55ca6c85c4 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcasecmp.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcasecmp.c
@@ -29,7 +29,7 @@ extern __typeof (__strcasecmp) __strcasecmp_power8 attribute_hidden;
libc_ifunc (__libc_strcasecmp,
(hwcap2 & PPC_FEATURE2_ARCH_2_07)
? __strcasecmp_power8:
- (hwcap & PPC_FEATURE_HAS_VSX)
+ (hwcap & PPC_FEATURE_ARCH_2_06)
? __strcasecmp_power7
: __strcasecmp_ppc);
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcasecmp_l.c b/sysdeps/powerpc/powerpc64/multiarch/strcasecmp_l.c
index 96a70b8b11..1afee5d7fd 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcasecmp_l.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcasecmp_l.c
@@ -32,7 +32,7 @@ extern __typeof (__strcasecmp_l) __strcasecmp_l_power7 attribute_hidden;
extern __typeof (__strcasecmp_l) __libc_strcasecmp_l;
libc_ifunc (__libc_strcasecmp_l,
- (hwcap & PPC_FEATURE_HAS_VSX)
+ (hwcap & PPC_FEATURE_ARCH_2_06)
? __strcasecmp_l_power7
: __strcasecmp_l_ppc);
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strchr.c b/sysdeps/powerpc/powerpc64/multiarch/strchr.c
index ea9ac1134f..27c794c6b7 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strchr.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strchr.c
@@ -35,7 +35,7 @@ extern __typeof (strchr) __strchr_power8 attribute_hidden;
libc_ifunc_redirected (__redirect_strchr, strchr,
(hwcap2 & PPC_FEATURE2_ARCH_2_07)
? __strchr_power8 :
- (hwcap & PPC_FEATURE_HAS_VSX)
+ (hwcap & PPC_FEATURE_ARCH_2_06)
? __strchr_power7
: __strchr_ppc);
weak_alias (strchr, index)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strchrnul.c b/sysdeps/powerpc/powerpc64/multiarch/strchrnul.c
index 4688e7c3f0..4a07b4a242 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strchrnul.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strchrnul.c
@@ -30,7 +30,7 @@ extern __typeof (__strchrnul) __strchrnul_power8 attribute_hidden;
libc_ifunc (__strchrnul,
(hwcap2 & PPC_FEATURE2_ARCH_2_07)
? __strchrnul_power8 :
- (hwcap & PPC_FEATURE_HAS_VSX)
+ (hwcap & PPC_FEATURE_ARCH_2_06)
? __strchrnul_power7
: __strchrnul_ppc);
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
index 72f9a639bf..4b0b25fff6 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
@@ -40,7 +40,7 @@ libc_ifunc_redirected (__redirect_strcmp, strcmp,
# endif
(hwcap2 & PPC_FEATURE2_ARCH_2_07)
? __strcmp_power8
- : (hwcap & PPC_FEATURE_HAS_VSX)
+ : (hwcap & PPC_FEATURE_ARCH_2_06)
? __strcmp_power7
: __strcmp_ppc);
#endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strlen.c b/sysdeps/powerpc/powerpc64/multiarch/strlen.c
index 109c8a90bd..0cd1c6faff 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strlen.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strlen.c
@@ -42,7 +42,7 @@ libc_ifunc (__libc_strlen,
# endif
(hwcap2 & PPC_FEATURE2_ARCH_2_07)
? __strlen_power8 :
- (hwcap & PPC_FEATURE_HAS_VSX)
+ (hwcap & PPC_FEATURE_ARCH_2_06)
? __strlen_power7
: __strlen_ppc);
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncase.c b/sysdeps/powerpc/powerpc64/multiarch/strncase.c
index 2013a5d75a..644046bd74 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strncase.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncase.c
@@ -29,7 +29,7 @@ extern __typeof (__strncasecmp) __strncasecmp_power8 attribute_hidden;
libc_ifunc (__libc_strncasecmp,
(hwcap2 & PPC_FEATURE2_ARCH_2_07)
? __strncasecmp_power8:
- (hwcap & PPC_FEATURE_HAS_VSX)
+ (hwcap & PPC_FEATURE_ARCH_2_06)
? __strncasecmp_power7
: __strncasecmp_ppc);
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncase_l.c b/sysdeps/powerpc/powerpc64/multiarch/strncase_l.c
index cad6da302d..d2d761af72 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strncase_l.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncase_l.c
@@ -34,7 +34,7 @@ extern __typeof (__strncasecmp_l) __strncasecmp_l_power7 attribute_hidden;
ifunc symbol properly. */
extern __typeof (__strncasecmp_l) __libc_strncasecmp_l;
libc_ifunc (__libc_strncasecmp_l,
- (hwcap & PPC_FEATURE_HAS_VSX)
+ (hwcap & PPC_FEATURE_ARCH_2_06)
? __strncasecmp_l_power7
: __strncasecmp_l_ppc);
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncmp.c b/sysdeps/powerpc/powerpc64/multiarch/strncmp.c
index eef524ddfb..1f689e5c05 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strncmp.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncmp.c
@@ -43,7 +43,7 @@ libc_ifunc_redirected (__redirect_strncmp, strncmp,
# endif
(hwcap2 & PPC_FEATURE2_ARCH_2_07)
? __strncmp_power8
- : (hwcap & PPC_FEATURE_HAS_VSX)
+ : (hwcap & PPC_FEATURE_ARCH_2_06)
? __strncmp_power7
: (hwcap & PPC_FEATURE_POWER4)
? __strncmp_power4
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
index 7da9def358..d4d3463bd1 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
@@ -43,7 +43,7 @@ libc_ifunc_redirected (__redirect_strncpy, strncpy,
# endif
(hwcap2 & PPC_FEATURE2_ARCH_2_07)
? __strncpy_power8
- : (hwcap & PPC_FEATURE_HAS_VSX)
+ : (hwcap & PPC_FEATURE_ARCH_2_06)
? __strncpy_power7
: __strncpy_ppc);
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strnlen.c b/sysdeps/powerpc/powerpc64/multiarch/strnlen.c
index 264b7a752d..baf375a75a 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strnlen.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strnlen.c
@@ -31,7 +31,7 @@ extern __typeof (__strnlen) __strnlen_power8 attribute_hidden;
libc_ifunc_redirected (__redirect___strnlen, __strnlen,
(hwcap2 & PPC_FEATURE2_ARCH_2_07)
? __strnlen_power8 :
- (hwcap & PPC_FEATURE_HAS_VSX)
+ (hwcap & PPC_FEATURE_ARCH_2_06)
? __strnlen_power7
: __strnlen_ppc);
weak_alias (__strnlen, strnlen)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strrchr.c b/sysdeps/powerpc/powerpc64/multiarch/strrchr.c
index bb06b93d19..1c9eea1817 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strrchr.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strrchr.c
@@ -33,7 +33,7 @@ extern __typeof (strrchr) __strrchr_power8 attribute_hidden;
libc_ifunc_redirected (__redirect_strrchr, strrchr,
(hwcap2 & PPC_FEATURE2_ARCH_2_07)
? __strrchr_power8 :
- (hwcap & PPC_FEATURE_HAS_VSX)
+ (hwcap & PPC_FEATURE_ARCH_2_06)
? __strrchr_power7
: __strrchr_ppc);
weak_alias (strrchr, rindex)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strstr.c b/sysdeps/powerpc/powerpc64/multiarch/strstr.c
index bb0588844e..6582798dda 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strstr.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strstr.c
@@ -30,7 +30,7 @@ extern __typeof (strstr) __strstr_power7 attribute_hidden;
/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
ifunc symbol properly. */
libc_ifunc_redirected (__redirect_strstr, strstr,
- (hwcap & PPC_FEATURE_HAS_VSX)
+ (hwcap & PPC_FEATURE_ARCH_2_06)
? __strstr_power7
: __strstr_ppc);
#endif

View File

@ -0,0 +1,83 @@
From f2a15dd668913c5a1388ba7e1131b25162b2ea75 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@ozlabs.org>
Date: Tue, 27 Jul 2021 15:47:50 +1000
Subject: powerpc64: Check cacheline size before using optimised memset
routines
A number of optimised memset routines assume the cacheline size is 128B,
so we better check before using them.
Reviewed-by: Tulio Magno Quites Machado Filho <tuliom@linux.ibm.com>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index 32564c8f1f..a3fdcd43bd 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -35,6 +35,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
unsigned long int hwcap = GLRO(dl_hwcap);
unsigned long int hwcap2 = GLRO(dl_hwcap2);
+#ifdef SHARED
+ int cacheline_size = GLRO(dl_cache_line_size);
+#endif
/* hwcap contains only the latest supported ISA, the code checks which is
and fills the previous supported ones. */
@@ -90,16 +93,21 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memset,
hwcap2 & PPC_FEATURE2_ARCH_3_1
&& hwcap2 & PPC_FEATURE2_HAS_ISEL
- && hwcap & PPC_FEATURE_HAS_VSX,
+ && hwcap & PPC_FEATURE_HAS_VSX
+ && cacheline_size == 128,
__memset_power10)
#endif
- IFUNC_IMPL_ADD (array, i, memset, hwcap2 & PPC_FEATURE2_ARCH_2_07,
+ IFUNC_IMPL_ADD (array, i, memset, hwcap2 & PPC_FEATURE2_ARCH_2_07
+ && cacheline_size == 128,
__memset_power8)
- IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_ARCH_2_06,
+ IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_ARCH_2_06
+ && cacheline_size == 128,
__memset_power7)
- IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_ARCH_2_05,
+ IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_ARCH_2_05
+ && cacheline_size == 128,
__memset_power6)
- IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_POWER4,
+ IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_POWER4
+ && cacheline_size == 128,
__memset_power4)
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_ppc))
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset.c b/sysdeps/powerpc/powerpc64/multiarch/memset.c
index c1aa143f60..056e911699 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memset.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/memset.c
@@ -43,16 +43,21 @@ libc_ifunc (__libc_memset,
# ifdef __LITTLE_ENDIAN__
(hwcap2 & PPC_FEATURE2_ARCH_3_1
&& hwcap2 & PPC_FEATURE2_HAS_ISEL
- && hwcap & PPC_FEATURE_HAS_VSX)
+ && hwcap & PPC_FEATURE_HAS_VSX
+ && GLRO(dl_cache_line_size) == 128)
? __memset_power10 :
# endif
- (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+ (hwcap2 & PPC_FEATURE2_ARCH_2_07
+ && GLRO(dl_cache_line_size) == 128)
? __memset_power8 :
- (hwcap & PPC_FEATURE_ARCH_2_06)
+ (hwcap & PPC_FEATURE_ARCH_2_06
+ && GLRO(dl_cache_line_size) == 128)
? __memset_power7 :
- (hwcap & PPC_FEATURE_ARCH_2_05)
+ (hwcap & PPC_FEATURE_ARCH_2_05
+ && GLRO(dl_cache_line_size) == 128)
? __memset_power6 :
- (hwcap & PPC_FEATURE_POWER4)
+ (hwcap & PPC_FEATURE_POWER4
+ && GLRO(dl_cache_line_size) == 128)
? __memset_power4
: __memset_ppc);

View File

@ -0,0 +1,703 @@
From 60b4dd25790342b40e8942e3a4115f511a6b6911 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@ozlabs.org>
Date: Tue, 27 Jul 2021 15:47:51 +1000
Subject: powerpc64: Add checks for Altivec and VSX in ifunc selection
We'd like to support processors without Altivec or VSX, so check
the relevant hwcap bits before selecting them.
Reviewed-by: Tulio Magno Quites Machado Filho <tuliom@linux.ibm.com>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/bzero.c b/sysdeps/powerpc/powerpc64/multiarch/bzero.c
index 660d7dc686..c8ffbea01c 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/bzero.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/bzero.c
@@ -38,11 +38,13 @@ libc_ifunc (__bzero,
&& hwcap & PPC_FEATURE_HAS_VSX)
? __bzero_power10 :
# endif
- (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+ (hwcap2 & PPC_FEATURE2_ARCH_2_07
+ && hwcap & PPC_FEATURE_HAS_ALTIVEC)
? __bzero_power8 :
(hwcap & PPC_FEATURE_HAS_VSX)
? __bzero_power7 :
- (hwcap & PPC_FEATURE_ARCH_2_05)
+ (hwcap & PPC_FEATURE_ARCH_2_05
+ && hwcap & PPC_FEATURE_HAS_ALTIVEC)
? __bzero_power6 :
(hwcap & PPC_FEATURE_POWER4)
? __bzero_power4
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index a3fdcd43bd..c3e25c5981 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -60,9 +60,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& hwcap & PPC_FEATURE_HAS_VSX,
__memcpy_power10)
#endif
- IFUNC_IMPL_ADD (array, i, memcpy, hwcap2 & PPC_FEATURE2_ARCH_2_07,
+ IFUNC_IMPL_ADD (array, i, memcpy, hwcap2 & PPC_FEATURE2_ARCH_2_07
+ && hwcap & PPC_FEATURE_HAS_ALTIVEC,
__memcpy_power8_cached)
- IFUNC_IMPL_ADD (array, i, memcpy, hwcap & PPC_FEATURE_HAS_VSX,
+ IFUNC_IMPL_ADD (array, i, memcpy, hwcap & PPC_FEATURE_ARCH_2_06
+ && hwcap & PPC_FEATURE_HAS_ALTIVEC,
__memcpy_power7)
IFUNC_IMPL_ADD (array, i, memcpy, hwcap & PPC_FEATURE_ARCH_2_06,
__memcpy_a2)
@@ -83,7 +85,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& hwcap & PPC_FEATURE_HAS_VSX,
__memmove_power10)
#endif
- IFUNC_IMPL_ADD (array, i, memmove, hwcap & PPC_FEATURE_HAS_VSX,
+ IFUNC_IMPL_ADD (array, i, memmove, hwcap & PPC_FEATURE_ARCH_2_06
+ && hwcap & PPC_FEATURE_HAS_ALTIVEC,
__memmove_power7)
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_ppc))
@@ -98,6 +101,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__memset_power10)
#endif
IFUNC_IMPL_ADD (array, i, memset, hwcap2 & PPC_FEATURE2_ARCH_2_07
+ && hwcap & PPC_FEATURE_HAS_ALTIVEC
&& cacheline_size == 128,
__memset_power8)
IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_ARCH_2_06
@@ -114,12 +118,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/strcpy.c. */
IFUNC_IMPL (i, name, strcpy,
#ifdef __LITTLE_ENDIAN__
- IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & PPC_FEATURE2_ARCH_3_00,
+ IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & PPC_FEATURE2_ARCH_3_00
+ && hwcap & PPC_FEATURE_HAS_VSX,
__strcpy_power9)
#endif
- IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & PPC_FEATURE2_ARCH_2_07,
+ IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & PPC_FEATURE2_ARCH_2_07
+ && hwcap & PPC_FEATURE_HAS_ALTIVEC,
__strcpy_power8)
- IFUNC_IMPL_ADD (array, i, strcpy, hwcap & PPC_FEATURE_HAS_VSX,
+ IFUNC_IMPL_ADD (array, i, strcpy, hwcap & PPC_FEATURE_ARCH_2_06
+ && hwcap & PPC_FEATURE_HAS_VSX,
__strcpy_power7)
IFUNC_IMPL_ADD (array, i, strcpy, 1,
__strcpy_ppc))
@@ -127,12 +134,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/stpcpy.c. */
IFUNC_IMPL (i, name, stpcpy,
#ifdef __LITTLE_ENDIAN__
- IFUNC_IMPL_ADD (array, i, stpcpy, hwcap2 & PPC_FEATURE2_ARCH_3_00,
+ IFUNC_IMPL_ADD (array, i, stpcpy, hwcap2 & PPC_FEATURE2_ARCH_3_00
+ && hwcap & PPC_FEATURE_HAS_VSX,
__stpcpy_power9)
#endif
- IFUNC_IMPL_ADD (array, i, stpcpy, hwcap2 & PPC_FEATURE2_ARCH_2_07,
+ IFUNC_IMPL_ADD (array, i, stpcpy, hwcap2 & PPC_FEATURE2_ARCH_2_07
+ && hwcap & PPC_FEATURE_HAS_ALTIVEC,
__stpcpy_power8)
- IFUNC_IMPL_ADD (array, i, stpcpy, hwcap & PPC_FEATURE_HAS_VSX,
+ IFUNC_IMPL_ADD (array, i, stpcpy, hwcap & PPC_FEATURE_ARCH_2_06
+ && hwcap & PPC_FEATURE_HAS_VSX,
__stpcpy_power7)
IFUNC_IMPL_ADD (array, i, stpcpy, 1,
__stpcpy_ppc))
@@ -140,12 +150,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/strlen.c. */
IFUNC_IMPL (i, name, strlen,
#ifdef __LITTLE_ENDIAN__
- IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_3_1,
+ IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_3_1
+ && hwcap & PPC_FEATURE_HAS_VSX,
__strlen_power10)
- IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_3_00,
+ IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_3_00
+ && hwcap & PPC_FEATURE_HAS_VSX,
__strlen_power9)
#endif
- IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_2_07,
+ IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_2_07
+ && hwcap & PPC_FEATURE_HAS_ALTIVEC,
__strlen_power8)
IFUNC_IMPL_ADD (array, i, strlen, hwcap & PPC_FEATURE_ARCH_2_06,
__strlen_power7)
@@ -155,7 +168,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/strncmp.c. */
IFUNC_IMPL (i, name, strncmp,
#ifdef __LITTLE_ENDIAN__
- IFUNC_IMPL_ADD (array, i, strncmp, hwcap2 & PPC_FEATURE2_ARCH_3_00,
+ IFUNC_IMPL_ADD (array, i, strncmp, hwcap2 & PPC_FEATURE2_ARCH_3_00
+ && hwcap & PPC_FEATURE_HAS_ALTIVEC,
__strncmp_power9)
#endif
IFUNC_IMPL_ADD (array, i, strncmp, hwcap2 & PPC_FEATURE2_ARCH_2_07,
@@ -170,7 +184,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/strchr.c. */
IFUNC_IMPL (i, name, strchr,
IFUNC_IMPL_ADD (array, i, strchr,
- hwcap2 & PPC_FEATURE2_ARCH_2_07,
+ hwcap2 & PPC_FEATURE2_ARCH_2_07
+ && hwcap & PPC_FEATURE_HAS_ALTIVEC,
__strchr_power8)
IFUNC_IMPL_ADD (array, i, strchr,
hwcap & PPC_FEATURE_ARCH_2_06,
@@ -181,7 +196,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/strchrnul.c. */
IFUNC_IMPL (i, name, strchrnul,
IFUNC_IMPL_ADD (array, i, strchrnul,
- hwcap2 & PPC_FEATURE2_ARCH_2_07,
+ hwcap2 & PPC_FEATURE2_ARCH_2_07
+ && hwcap & PPC_FEATURE_HAS_ALTIVEC,
__strchrnul_power8)
IFUNC_IMPL_ADD (array, i, strchrnul,
hwcap & PPC_FEATURE_ARCH_2_06,
@@ -198,7 +214,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& hwcap & PPC_FEATURE_HAS_VSX,
__memcmp_power10)
#endif
- IFUNC_IMPL_ADD (array, i, memcmp, hwcap2 & PPC_FEATURE2_ARCH_2_07,
+ IFUNC_IMPL_ADD (array, i, memcmp, hwcap2 & PPC_FEATURE2_ARCH_2_07
+ && hwcap & PPC_FEATURE_HAS_ALTIVEC,
__memcmp_power8)
IFUNC_IMPL_ADD (array, i, memcmp, hwcap & PPC_FEATURE_ARCH_2_06,
__memcmp_power7)
@@ -215,11 +232,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& hwcap & PPC_FEATURE_HAS_VSX,
__bzero_power10)
#endif
- IFUNC_IMPL_ADD (array, i, bzero, hwcap2 & PPC_FEATURE2_ARCH_2_07,
+ IFUNC_IMPL_ADD (array, i, bzero, hwcap2 & PPC_FEATURE2_ARCH_2_07
+ && hwcap & PPC_FEATURE_HAS_ALTIVEC,
__bzero_power8)
IFUNC_IMPL_ADD (array, i, bzero, hwcap & PPC_FEATURE_HAS_VSX,
__bzero_power7)
- IFUNC_IMPL_ADD (array, i, bzero, hwcap & PPC_FEATURE_ARCH_2_05,
+ IFUNC_IMPL_ADD (array, i, bzero, hwcap & PPC_FEATURE_ARCH_2_05
+ && hwcap & PPC_FEATURE_HAS_ALTIVEC,
__bzero_power6)
IFUNC_IMPL_ADD (array, i, bzero, hwcap & PPC_FEATURE_POWER4,
__bzero_power4)
@@ -241,7 +260,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/mempcpy.c. */
IFUNC_IMPL (i, name, mempcpy,
IFUNC_IMPL_ADD (array, i, mempcpy,
- hwcap & PPC_FEATURE_HAS_VSX,
+ hwcap & PPC_FEATURE_ARCH_2_06
+ && hwcap & PPC_FEATURE_HAS_ALTIVEC,
__mempcpy_power7)
IFUNC_IMPL_ADD (array, i, mempcpy, 1,
__mempcpy_ppc))
@@ -249,7 +269,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/memchr.c. */
IFUNC_IMPL (i, name, memchr,
IFUNC_IMPL_ADD (array, i, memchr,
- hwcap2 & PPC_FEATURE2_ARCH_2_07,
+ hwcap2 & PPC_FEATURE2_ARCH_2_07
+ && hwcap & PPC_FEATURE_HAS_ALTIVEC,
__memchr_power8)
IFUNC_IMPL_ADD (array, i, memchr,
hwcap & PPC_FEATURE_ARCH_2_06,
@@ -260,7 +281,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/memrchr.c. */
IFUNC_IMPL (i, name, memrchr,
IFUNC_IMPL_ADD (array, i, memrchr,
- hwcap2 & PPC_FEATURE2_ARCH_2_07,
+ hwcap2 & PPC_FEATURE2_ARCH_2_07
+ && hwcap & PPC_FEATURE_HAS_ALTIVEC,
__memrchr_power8)
IFUNC_IMPL_ADD (array, i, memrchr,
hwcap & PPC_FEATURE_ARCH_2_06,
@@ -276,7 +298,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& (hwcap & PPC_FEATURE_HAS_VSX),
__rawmemchr_power10)
IFUNC_IMPL_ADD (array, i, rawmemchr,
- hwcap2 & PPC_FEATURE2_ARCH_3_00,
+ hwcap2 & PPC_FEATURE2_ARCH_3_00
+ && hwcap & PPC_FEATURE_HAS_VSX,
__rawmemchr_power9)
#endif
IFUNC_IMPL_ADD (array, i, rawmemchr,
@@ -288,7 +311,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/strnlen.c. */
IFUNC_IMPL (i, name, strnlen,
IFUNC_IMPL_ADD (array, i, strnlen,
- hwcap2 & PPC_FEATURE2_ARCH_2_07,
+ hwcap2 & PPC_FEATURE2_ARCH_2_07
+ && hwcap & PPC_FEATURE_HAS_ALTIVEC,
__strnlen_power8)
IFUNC_IMPL_ADD (array, i, strnlen, hwcap & PPC_FEATURE_ARCH_2_06,
__strnlen_power7)
@@ -298,7 +322,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/strcasecmp.c. */
IFUNC_IMPL (i, name, strcasecmp,
IFUNC_IMPL_ADD (array, i, strcasecmp,
- hwcap2 & PPC_FEATURE2_ARCH_2_07,
+ hwcap2 & PPC_FEATURE2_ARCH_2_07
+ && hwcap & PPC_FEATURE_HAS_ALTIVEC,
__strcasecmp_power8)
IFUNC_IMPL_ADD (array, i, strcasecmp,
hwcap & PPC_FEATURE_ARCH_2_06,
@@ -316,7 +341,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/strncase.c. */
IFUNC_IMPL (i, name, strncasecmp,
IFUNC_IMPL_ADD (array, i, strncasecmp,
- hwcap2 & PPC_FEATURE2_ARCH_2_07,
+ hwcap2 & PPC_FEATURE2_ARCH_2_07
+ && hwcap & PPC_FEATURE_HAS_ALTIVEC,
__strncasecmp_power8)
IFUNC_IMPL_ADD (array, i, strncasecmp,
hwcap & PPC_FEATURE_ARCH_2_06,
@@ -334,7 +360,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/strrchr.c. */
IFUNC_IMPL (i, name, strrchr,
IFUNC_IMPL_ADD (array, i, strrchr,
- hwcap2 & PPC_FEATURE2_ARCH_2_07,
+ hwcap2 & PPC_FEATURE2_ARCH_2_07
+ && hwcap & PPC_FEATURE_HAS_ALTIVEC,
__strrchr_power8)
IFUNC_IMPL_ADD (array, i, strrchr,
hwcap & PPC_FEATURE_ARCH_2_06,
@@ -345,10 +372,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/strncat.c. */
IFUNC_IMPL (i, name, strncat,
IFUNC_IMPL_ADD (array, i, strncat,
- hwcap2 & PPC_FEATURE2_ARCH_2_07,
+ hwcap2 & PPC_FEATURE2_ARCH_2_07
+ && hwcap & PPC_FEATURE_HAS_VSX,
__strncat_power8)
IFUNC_IMPL_ADD (array, i, strncat,
- hwcap & PPC_FEATURE_HAS_VSX,
+ hwcap & PPC_FEATURE_ARCH_2_06
+ && hwcap & PPC_FEATURE_HAS_VSX,
__strncat_power7)
IFUNC_IMPL_ADD (array, i, strncat, 1,
__strncat_ppc))
@@ -391,7 +420,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL (i, name, strcmp,
#ifdef __LITTLE_ENDIAN__
IFUNC_IMPL_ADD (array, i, strcmp,
- hwcap2 & PPC_FEATURE2_ARCH_3_00,
+ hwcap2 & PPC_FEATURE2_ARCH_3_00
+ && hwcap & PPC_FEATURE_HAS_ALTIVEC,
__strcmp_power9)
#endif
IFUNC_IMPL_ADD (array, i, strcmp,
@@ -406,10 +436,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/strcat.c. */
IFUNC_IMPL (i, name, strcat,
IFUNC_IMPL_ADD (array, i, strcat,
- hwcap2 & PPC_FEATURE2_ARCH_2_07,
+ hwcap2 & PPC_FEATURE2_ARCH_2_07
+ && hwcap & PPC_FEATURE_HAS_VSX,
__strcat_power8)
IFUNC_IMPL_ADD (array, i, strcat,
- hwcap & PPC_FEATURE_HAS_VSX,
+ hwcap & PPC_FEATURE_ARCH_2_06
+ && hwcap & PPC_FEATURE_HAS_VSX,
__strcat_power7)
IFUNC_IMPL_ADD (array, i, strcat, 1,
__strcat_ppc))
@@ -417,7 +449,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/strspn.c. */
IFUNC_IMPL (i, name, strspn,
IFUNC_IMPL_ADD (array, i, strspn,
- hwcap2 & PPC_FEATURE2_ARCH_2_07,
+ hwcap2 & PPC_FEATURE2_ARCH_2_07
+ && hwcap & PPC_FEATURE_HAS_VSX,
__strspn_power8)
IFUNC_IMPL_ADD (array, i, strspn, 1,
__strspn_ppc))
@@ -425,7 +458,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/strcspn.c. */
IFUNC_IMPL (i, name, strcspn,
IFUNC_IMPL_ADD (array, i, strcspn,
- hwcap2 & PPC_FEATURE2_ARCH_2_07,
+ hwcap2 & PPC_FEATURE2_ARCH_2_07
+ && hwcap & PPC_FEATURE_HAS_VSX,
__strcspn_power8)
IFUNC_IMPL_ADD (array, i, strcspn, 1,
__strcspn_ppc))
@@ -442,7 +476,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/strcasestr.c. */
IFUNC_IMPL (i, name, strcasestr,
IFUNC_IMPL_ADD (array, i, strcasestr,
- hwcap2 & PPC_FEATURE2_ARCH_2_07,
+ hwcap2 & PPC_FEATURE2_ARCH_2_07
+ && hwcap & PPC_FEATURE_HAS_ALTIVEC,
__strcasestr_power8)
IFUNC_IMPL_ADD (array, i, strcasestr, 1,
__strcasestr_ppc))
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memchr.c b/sysdeps/powerpc/powerpc64/multiarch/memchr.c
index c24186689e..f40013e061 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memchr.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/memchr.c
@@ -28,7 +28,8 @@ extern __typeof (__memchr) __memchr_power8 attribute_hidden;
/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
ifunc symbol properly. */
libc_ifunc (__memchr,
- (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+ (hwcap2 & PPC_FEATURE2_ARCH_2_07
+ && hwcap & PPC_FEATURE_HAS_ALTIVEC)
? __memchr_power8 :
(hwcap & PPC_FEATURE_ARCH_2_06)
? __memchr_power7
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memcmp.c b/sysdeps/powerpc/powerpc64/multiarch/memcmp.c
index 99559bce26..89b56c103b 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memcmp.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/memcmp.c
@@ -38,7 +38,8 @@ libc_ifunc_redirected (__redirect_memcmp, memcmp,
&& hwcap & PPC_FEATURE_HAS_VSX)
? __memcmp_power10 :
#endif
- (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+ (hwcap2 & PPC_FEATURE2_ARCH_2_07
+ && hwcap & PPC_FEATURE_HAS_ALTIVEC)
? __memcmp_power8 :
(hwcap & PPC_FEATURE_ARCH_2_06)
? __memcmp_power7
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memcpy.c b/sysdeps/powerpc/powerpc64/multiarch/memcpy.c
index 53ab32ef26..684ee064f2 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memcpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/memcpy.c
@@ -45,9 +45,12 @@ libc_ifunc (__libc_memcpy,
(hwcap2 & PPC_FEATURE2_ARCH_3_1 && hwcap & PPC_FEATURE_HAS_VSX)
? __memcpy_power10 :
# endif
- ((hwcap2 & PPC_FEATURE2_ARCH_2_07) && use_cached_memopt)
+ (hwcap2 & PPC_FEATURE2_ARCH_2_07
+ && hwcap & PPC_FEATURE_HAS_ALTIVEC
+ && use_cached_memopt)
? __memcpy_power8_cached :
- (hwcap & PPC_FEATURE_HAS_VSX)
+ (hwcap & PPC_FEATURE_ARCH_2_06
+ && hwcap & PPC_FEATURE_HAS_ALTIVEC)
? __memcpy_power7 :
(hwcap & PPC_FEATURE_ARCH_2_06)
? __memcpy_a2 :
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memmove.c b/sysdeps/powerpc/powerpc64/multiarch/memmove.c
index 637b2cbf7f..50253b4554 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memmove.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/memmove.c
@@ -41,7 +41,8 @@ libc_ifunc (__libc_memmove,
&& hwcap & PPC_FEATURE_HAS_VSX)
? __memmove_power10 :
#endif
- (hwcap & PPC_FEATURE_HAS_VSX)
+ (hwcap & PPC_FEATURE_ARCH_2_06
+ && hwcap & PPC_FEATURE_HAS_ALTIVEC)
? __memmove_power7
: __memmove_ppc);
diff --git a/sysdeps/powerpc/powerpc64/multiarch/mempcpy.c b/sysdeps/powerpc/powerpc64/multiarch/mempcpy.c
index b37e0f35b5..563095a5ec 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/mempcpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/mempcpy.c
@@ -33,7 +33,8 @@ extern __typeof (__mempcpy) __mempcpy_power7 attribute_hidden;
/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
ifunc symbol properly. */
libc_ifunc_redirected (__redirect___mempcpy, __mempcpy,
- (hwcap & PPC_FEATURE_HAS_VSX)
+ (hwcap & PPC_FEATURE_ARCH_2_06
+ && hwcap & PPC_FEATURE_HAS_ALTIVEC)
? __mempcpy_power7
: __mempcpy_ppc);
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memrchr.c b/sysdeps/powerpc/powerpc64/multiarch/memrchr.c
index 16bb6f0042..a8b985b06a 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memrchr.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/memrchr.c
@@ -28,7 +28,8 @@ extern __typeof (__memrchr) __memrchr_power8 attribute_hidden;
/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
ifunc symbol properly. */
libc_ifunc (__memrchr,
- (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+ (hwcap2 & PPC_FEATURE2_ARCH_2_07
+ && hwcap & PPC_FEATURE_HAS_ALTIVEC)
? __memrchr_power8 :
(hwcap & PPC_FEATURE_ARCH_2_06)
? __memrchr_power7
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset.c b/sysdeps/powerpc/powerpc64/multiarch/memset.c
index 056e911699..a2bc223bcc 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memset.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/memset.c
@@ -48,6 +48,7 @@ libc_ifunc (__libc_memset,
? __memset_power10 :
# endif
(hwcap2 & PPC_FEATURE2_ARCH_2_07
+ && hwcap & PPC_FEATURE_HAS_ALTIVEC
&& GLRO(dl_cache_line_size) == 128)
? __memset_power8 :
(hwcap & PPC_FEATURE_ARCH_2_06
diff --git a/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c
index b5d2d3a635..43eb459e02 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c
@@ -38,7 +38,8 @@ libc_ifunc_redirected (__redirect___rawmemchr, __rawmemchr,
(hwcap2 & PPC_FEATURE2_ARCH_3_1)
&& (hwcap & PPC_FEATURE_HAS_VSX)
? __rawmemchr_power10 :
- (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+ (hwcap2 & PPC_FEATURE2_ARCH_3_00
+ && hwcap & PPC_FEATURE_HAS_VSX)
? __rawmemchr_power9 :
# endif
(hwcap & PPC_FEATURE_ARCH_2_06)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpcpy.c b/sysdeps/powerpc/powerpc64/multiarch/stpcpy.c
index d4eb4285fc..5be413405e 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/stpcpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/stpcpy.c
@@ -32,12 +32,15 @@ extern __typeof (__stpcpy) __stpcpy_power9 attribute_hidden;
libc_ifunc_hidden (__stpcpy, __stpcpy,
# ifdef __LITTLE_ENDIAN__
- (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+ (hwcap2 & PPC_FEATURE2_ARCH_3_00
+ && hwcap & PPC_FEATURE_HAS_VSX)
? __stpcpy_power9 :
# endif
- (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+ (hwcap2 & PPC_FEATURE2_ARCH_2_07
+ && hwcap & PPC_FEATURE_HAS_ALTIVEC)
? __stpcpy_power8
- : (hwcap & PPC_FEATURE_HAS_VSX)
+ : (hwcap & PPC_FEATURE_ARCH_2_06
+ && hwcap & PPC_FEATURE_HAS_VSX)
? __stpcpy_power7
: __stpcpy_ppc);
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcasecmp.c b/sysdeps/powerpc/powerpc64/multiarch/strcasecmp.c
index 55ca6c85c4..21ce2d279b 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcasecmp.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcasecmp.c
@@ -27,7 +27,8 @@ extern __typeof (__strcasecmp) __strcasecmp_power7 attribute_hidden;
extern __typeof (__strcasecmp) __strcasecmp_power8 attribute_hidden;
libc_ifunc (__libc_strcasecmp,
- (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+ (hwcap2 & PPC_FEATURE2_ARCH_2_07
+ && hwcap & PPC_FEATURE_HAS_ALTIVEC)
? __strcasecmp_power8:
(hwcap & PPC_FEATURE_ARCH_2_06)
? __strcasecmp_power7
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcasestr.c b/sysdeps/powerpc/powerpc64/multiarch/strcasestr.c
index 7e4bd3b5ac..5bb3016022 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcasestr.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcasestr.c
@@ -27,7 +27,8 @@ extern __typeof (__strcasestr) __strcasestr_power8 attribute_hidden;
/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
ifunc symbol properly. */
libc_ifunc (__strcasestr,
- (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+ (hwcap2 & PPC_FEATURE2_ARCH_2_07
+ && hwcap & PPC_FEATURE_HAS_ALTIVEC)
? __strcasestr_power8
: __strcasestr_ppc);
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcat.c b/sysdeps/powerpc/powerpc64/multiarch/strcat.c
index 6d342324c4..d8d9870824 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcat.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcat.c
@@ -28,9 +28,11 @@ extern __typeof (strcat) __strcat_power8 attribute_hidden;
# undef strcat
libc_ifunc_redirected (__redirect_strcat, strcat,
- (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+ (hwcap2 & PPC_FEATURE2_ARCH_2_07
+ && hwcap & PPC_FEATURE_HAS_VSX)
? __strcat_power8
- : (hwcap & PPC_FEATURE_HAS_VSX)
+ : (hwcap & PPC_FEATURE_ARCH_2_06
+ && hwcap & PPC_FEATURE_HAS_VSX)
? __strcat_power7
: __strcat_ppc);
#endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strchr.c b/sysdeps/powerpc/powerpc64/multiarch/strchr.c
index 27c794c6b7..62b202baf9 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strchr.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strchr.c
@@ -33,7 +33,8 @@ extern __typeof (strchr) __strchr_power8 attribute_hidden;
/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
ifunc symbol properly. */
libc_ifunc_redirected (__redirect_strchr, strchr,
- (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+ (hwcap2 & PPC_FEATURE2_ARCH_2_07
+ && hwcap & PPC_FEATURE_HAS_ALTIVEC)
? __strchr_power8 :
(hwcap & PPC_FEATURE_ARCH_2_06)
? __strchr_power7
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strchrnul.c b/sysdeps/powerpc/powerpc64/multiarch/strchrnul.c
index 4a07b4a242..40e529b9d9 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strchrnul.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strchrnul.c
@@ -28,7 +28,8 @@ extern __typeof (__strchrnul) __strchrnul_power8 attribute_hidden;
/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
ifunc symbol properly. */
libc_ifunc (__strchrnul,
- (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+ (hwcap2 & PPC_FEATURE2_ARCH_2_07
+ && hwcap & PPC_FEATURE_HAS_ALTIVEC)
? __strchrnul_power8 :
(hwcap & PPC_FEATURE_ARCH_2_06)
? __strchrnul_power7
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
index 4b0b25fff6..8132682a99 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
@@ -35,7 +35,8 @@ extern __typeof (strcmp) __strcmp_power9 attribute_hidden;
libc_ifunc_redirected (__redirect_strcmp, strcmp,
# ifdef __LITTLE_ENDIAN__
- (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+ (hwcap2 & PPC_FEATURE2_ARCH_3_00
+ && hwcap & PPC_FEATURE_HAS_ALTIVEC)
? __strcmp_power9 :
# endif
(hwcap2 & PPC_FEATURE2_ARCH_2_07)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c b/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
index b733fa5a23..5af1d45cc1 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
@@ -32,12 +32,15 @@ extern __typeof (strcpy) __strcpy_power9 attribute_hidden;
libc_ifunc_redirected (__redirect_strcpy, strcpy,
# ifdef __LITTLE_ENDIAN__
- (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+ (hwcap2 & PPC_FEATURE2_ARCH_3_00
+ && hwcap & PPC_FEATURE_HAS_VSX)
? __strcpy_power9 :
# endif
- (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+ (hwcap2 & PPC_FEATURE2_ARCH_2_07
+ && hwcap & PPC_FEATURE_HAS_ALTIVEC)
? __strcpy_power8
- : (hwcap & PPC_FEATURE_HAS_VSX)
+ : (hwcap & PPC_FEATURE_ARCH_2_06
+ && hwcap & PPC_FEATURE_HAS_VSX)
? __strcpy_power7
: __strcpy_ppc);
#endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcspn.c b/sysdeps/powerpc/powerpc64/multiarch/strcspn.c
index 683aa104d7..8ba01c13b1 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcspn.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcspn.c
@@ -27,7 +27,8 @@ extern __typeof (strcspn) __strcspn_ppc attribute_hidden;
extern __typeof (strcspn) __strcspn_power8 attribute_hidden;
libc_ifunc (__libc_strcspn,
- (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+ (hwcap2 & PPC_FEATURE2_ARCH_2_07
+ && hwcap & PPC_FEATURE_HAS_VSX)
? __strcspn_power8
: __strcspn_ppc);
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strlen.c b/sysdeps/powerpc/powerpc64/multiarch/strlen.c
index 0cd1c6faff..f1e28414e0 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strlen.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strlen.c
@@ -35,12 +35,15 @@ extern __typeof (__redirect_strlen) __strlen_power10 attribute_hidden;
libc_ifunc (__libc_strlen,
# ifdef __LITTLE_ENDIAN__
- (hwcap2 & PPC_FEATURE2_ARCH_3_1)
+ (hwcap2 & PPC_FEATURE2_ARCH_3_1
+ && hwcap & PPC_FEATURE_HAS_VSX)
? __strlen_power10 :
- (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+ (hwcap2 & PPC_FEATURE2_ARCH_3_00
+ && hwcap & PPC_FEATURE_HAS_VSX)
? __strlen_power9 :
# endif
- (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+ (hwcap2 & PPC_FEATURE2_ARCH_2_07
+ && hwcap & PPC_FEATURE_HAS_ALTIVEC)
? __strlen_power8 :
(hwcap & PPC_FEATURE_ARCH_2_06)
? __strlen_power7
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncase.c b/sysdeps/powerpc/powerpc64/multiarch/strncase.c
index 644046bd74..2802cf2c3f 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strncase.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncase.c
@@ -27,7 +27,8 @@ extern __typeof (__strncasecmp) __strncasecmp_power7 attribute_hidden;
extern __typeof (__strncasecmp) __strncasecmp_power8 attribute_hidden;
libc_ifunc (__libc_strncasecmp,
- (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+ (hwcap2 & PPC_FEATURE2_ARCH_2_07
+ && hwcap & PPC_FEATURE_HAS_ALTIVEC)
? __strncasecmp_power8:
(hwcap & PPC_FEATURE_ARCH_2_06)
? __strncasecmp_power7
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncat.c b/sysdeps/powerpc/powerpc64/multiarch/strncat.c
index 0036fca91a..9ea294a72d 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strncat.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncat.c
@@ -26,9 +26,11 @@ extern __typeof (strncat) __strncat_power7 attribute_hidden;
extern __typeof (strncat) __strncat_power8 attribute_hidden;
libc_ifunc (strncat,
- (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+ (hwcap2 & PPC_FEATURE2_ARCH_2_07
+ && hwcap & PPC_FEATURE_HAS_VSX)
? __strncat_power8
- : (hwcap & PPC_FEATURE_HAS_VSX)
+ : (hwcap & PPC_FEATURE_ARCH_2_06
+ && hwcap & PPC_FEATURE_HAS_VSX)
? __strncat_power7
: __strncat_ppc);
#endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncmp.c b/sysdeps/powerpc/powerpc64/multiarch/strncmp.c
index 1f689e5c05..2d21122854 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strncmp.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncmp.c
@@ -38,7 +38,8 @@ extern __typeof (strncmp) __strncmp_power9 attribute_hidden;
ifunc symbol properly. */
libc_ifunc_redirected (__redirect_strncmp, strncmp,
# ifdef __LITTLE_ENDIAN__
- (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+ (hwcap2 & PPC_FEATURE2_ARCH_3_00
+ && hwcap & PPC_FEATURE_HAS_ALTIVEC)
? __strncmp_power9 :
# endif
(hwcap2 & PPC_FEATURE2_ARCH_2_07)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strnlen.c b/sysdeps/powerpc/powerpc64/multiarch/strnlen.c
index baf375a75a..e68e9d9f88 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strnlen.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strnlen.c
@@ -29,7 +29,8 @@ extern __typeof (__strnlen) __strnlen_power8 attribute_hidden;
# undef strnlen
# undef __strnlen
libc_ifunc_redirected (__redirect___strnlen, __strnlen,
- (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+ (hwcap2 & PPC_FEATURE2_ARCH_2_07
+ && hwcap & PPC_FEATURE_HAS_ALTIVEC)
? __strnlen_power8 :
(hwcap & PPC_FEATURE_ARCH_2_06)
? __strnlen_power7
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strrchr.c b/sysdeps/powerpc/powerpc64/multiarch/strrchr.c
index 1c9eea1817..7f0cf2a1b7 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strrchr.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strrchr.c
@@ -31,7 +31,8 @@ extern __typeof (strrchr) __strrchr_power8 attribute_hidden;
/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
ifunc symbol properly. */
libc_ifunc_redirected (__redirect_strrchr, strrchr,
- (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+ (hwcap2 & PPC_FEATURE2_ARCH_2_07
+ && hwcap & PPC_FEATURE_HAS_ALTIVEC)
? __strrchr_power8 :
(hwcap & PPC_FEATURE_ARCH_2_06)
? __strrchr_power7
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strspn.c b/sysdeps/powerpc/powerpc64/multiarch/strspn.c
index 70167a176b..7613ab3d55 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strspn.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strspn.c
@@ -27,7 +27,8 @@ extern __typeof (strspn) __strspn_ppc attribute_hidden;
extern __typeof (strspn) __strspn_power8 attribute_hidden;
libc_ifunc (__libc_strspn,
- (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+ (hwcap2 & PPC_FEATURE2_ARCH_2_07
+ && hwcap & PPC_FEATURE_HAS_VSX)
? __strspn_power8
: __strspn_ppc);

View File

@ -0,0 +1,652 @@
From 21841f0d562f0e944c4d267a28cc3ebd19c847e9 Mon Sep 17 00:00:00 2001
From: Mahesh Bodapati <bmahi496@linux.ibm.com>
Date: Tue, 1 Aug 2023 07:41:17 -0500
Subject: PowerPC: Influence cpu/arch hwcap features via GLIBC_TUNABLES
This patch enables the option to influence hwcaps used by PowerPC.
The environment variable, GLIBC_TUNABLES=glibc.cpu.hwcaps=-xxx,yyy,-zzz....,
can be used to enable CPU/ARCH feature yyy, disable CPU/ARCH feature xxx
and zzz, where the feature name is case-sensitive and has to match the ones
mentioned in the file{sysdeps/powerpc/dl-procinfo.c}.
Note that the hwcap tunables only used in the IFUNC selection.
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
[rebased to c9s by DJ]
diff -rupN a/manual/tunables.texi b/manual/tunables.texi
--- a/manual/tunables.texi 2023-09-13 01:16:19.979884270 -0400
+++ b/manual/tunables.texi 2023-09-13 01:17:19.217179994 -0400
@@ -476,7 +476,10 @@ On s390x, the supported HWCAP and STFLE
@code{sysdeps/s390/cpu-features.c}. In addition the user can also set
a CPU arch-level like @code{z13} instead of single HWCAP and STFLE features.
-This tunable is specific to i386, x86-64 and s390x.
+On powerpc, the supported HWCAP and HWCAP2 features can be found in
+@code{sysdeps/powerpc/dl-procinfo.c}.
+
+This tunable is specific to i386, x86-64, s390x and powerpc.
@end deftp
@deftp Tunable glibc.cpu.cached_memopt
diff -rupN a/sysdeps/powerpc/cpu-features.c b/sysdeps/powerpc/cpu-features.c
--- a/sysdeps/powerpc/cpu-features.c 2021-08-01 21:33:43.000000000 -0400
+++ b/sysdeps/powerpc/cpu-features.c 1969-12-31 19:00:00.000000000 -0500
@@ -1,39 +0,0 @@
-/* Initialize cpu feature data. PowerPC version.
- Copyright (C) 2017-2021 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#include <stdint.h>
-#include <cpu-features.h>
-
-#if HAVE_TUNABLES
-# include <elf/dl-tunables.h>
-#endif
-
-static inline void
-init_cpu_features (struct cpu_features *cpu_features)
-{
- /* Default is to use aligned memory access on optimized function unless
- tunables is enable, since for this case user can explicit disable
- unaligned optimizations. */
-#if HAVE_TUNABLES
- int32_t cached_memfunc = TUNABLE_GET (glibc, cpu, cached_memopt, int32_t,
- NULL);
- cpu_features->use_cached_memopt = (cached_memfunc > 0);
-#else
- cpu_features->use_cached_memopt = false;
-#endif
-}
diff -rupN a/sysdeps/powerpc/cpu-features.h b/sysdeps/powerpc/cpu-features.h
--- a/sysdeps/powerpc/cpu-features.h 2021-08-01 21:33:43.000000000 -0400
+++ b/sysdeps/powerpc/cpu-features.h 1969-12-31 19:00:00.000000000 -0500
@@ -1,28 +0,0 @@
-/* Initialize cpu feature data. PowerPC version.
- Copyright (C) 2017-2021 Free Software Foundation, Inc.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#ifndef __CPU_FEATURES_POWERPC_H
-# define __CPU_FEATURES_POWERPC_H
-
-#include <stdbool.h>
-
-struct cpu_features
-{
- bool use_cached_memopt;
-};
-
-#endif /* __CPU_FEATURES_H */
diff -rupN a/sysdeps/powerpc/dl-tunables.list b/sysdeps/powerpc/dl-tunables.list
--- a/sysdeps/powerpc/dl-tunables.list 2021-08-01 21:33:43.000000000 -0400
+++ b/sysdeps/powerpc/dl-tunables.list 2023-09-13 01:17:19.226180343 -0400
@@ -24,5 +24,8 @@ glibc {
maxval: 1
default: 0
}
+ hwcaps {
+ type: STRING
+ }
}
}
diff -rupN a/sysdeps/powerpc/hwcapinfo.c b/sysdeps/powerpc/hwcapinfo.c
--- a/sysdeps/powerpc/hwcapinfo.c 2021-08-01 21:33:43.000000000 -0400
+++ b/sysdeps/powerpc/hwcapinfo.c 2023-09-13 01:17:19.229180459 -0400
@@ -19,6 +19,7 @@
#include <unistd.h>
#include <shlib-compat.h>
#include <dl-procinfo.h>
+#include <cpu-features.c>
uint64_t __tcb_hwcap __attribute__ ((visibility ("hidden")));
uint32_t __tcb_platform __attribute__ ((visibility ("hidden")));
@@ -64,6 +65,9 @@ __tcb_parse_hwcap_and_convert_at_platfor
else if (h1 & PPC_FEATURE_POWER5)
h1 |= PPC_FEATURE_POWER4;
+ uint64_t array_hwcaps[] = { h1, h2 };
+ init_cpu_features (&GLRO(dl_powerpc_cpu_features), array_hwcaps);
+
/* Consolidate both HWCAP and HWCAP2 into a single doubleword so that
we can read both in a single load later. */
__tcb_hwcap = h2;
diff -rupN a/sysdeps/powerpc/powerpc32/power4/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc32/power4/multiarch/ifunc-impl-list.c
--- a/sysdeps/powerpc/powerpc32/power4/multiarch/ifunc-impl-list.c 2021-08-01 21:33:43.000000000 -0400
+++ b/sysdeps/powerpc/powerpc32/power4/multiarch/ifunc-impl-list.c 2023-09-13 01:17:19.232180575 -0400
@@ -21,6 +21,7 @@
#include <wchar.h>
#include <ldsodefs.h>
#include <ifunc-impl-list.h>
+#include <cpu-features.h>
/* Maximum number of IFUNC implementations. */
#define MAX_IFUNC 6
@@ -33,7 +34,8 @@ __libc_ifunc_impl_list (const char *name
size_t i = 0;
- unsigned long int hwcap = GLRO(dl_hwcap);
+ const struct cpu_features *features = &GLRO(dl_powerpc_cpu_features);
+ unsigned long int hwcap = features->hwcap;
/* hwcap contains only the latest supported ISA, the code checks which is
and fills the previous supported ones. */
if (hwcap & PPC_FEATURE_ARCH_2_06)
diff -rupN a/sysdeps/powerpc/powerpc32/power4/multiarch/init-arch.h b/sysdeps/powerpc/powerpc32/power4/multiarch/init-arch.h
--- a/sysdeps/powerpc/powerpc32/power4/multiarch/init-arch.h 2021-08-01 21:33:43.000000000 -0400
+++ b/sysdeps/powerpc/powerpc32/power4/multiarch/init-arch.h 2023-09-13 01:17:19.232180575 -0400
@@ -16,6 +16,7 @@
<https://www.gnu.org/licenses/>. */
#include <ldsodefs.h>
+#include <cpu-features.h>
/* The code checks if _rtld_global_ro was realocated before trying to access
the dl_hwcap field. The assembly is to make the compiler not optimize the
@@ -32,11 +33,12 @@
# define __GLRO(value) GLRO(value)
#endif
-/* dl_hwcap contains only the latest supported ISA, the macro checks which is
- and fills the previous ones. */
+/* Get the hardware information post the tunables set, the macro checks
+ it and fills the previous ones. */
#define INIT_ARCH() \
- unsigned long int hwcap = __GLRO(dl_hwcap); \
- unsigned long int __attribute__((unused)) hwcap2 = __GLRO(dl_hwcap2); \
+ const struct cpu_features *features = &GLRO(dl_powerpc_cpu_features); \
+ unsigned long int hwcap = features->hwcap; \
+ unsigned long int __attribute__((unused)) hwcap2 = features->hwcap2; \
bool __attribute__((unused)) use_cached_memopt = \
__GLRO(dl_powerpc_cpu_features.use_cached_memopt); \
if (hwcap & PPC_FEATURE_ARCH_2_06) \
diff -rupN a/sysdeps/powerpc/powerpc64/dl-machine.h b/sysdeps/powerpc/powerpc64/dl-machine.h
--- a/sysdeps/powerpc/powerpc64/dl-machine.h 2023-09-13 01:16:17.582791395 -0400
+++ b/sysdeps/powerpc/powerpc64/dl-machine.h 2023-09-13 01:17:19.236180730 -0400
@@ -27,7 +27,6 @@
#include <dl-tls.h>
#include <sysdep.h>
#include <hwcapinfo.h>
-#include <cpu-features.c>
#include <dl-static-tls.h>
#include <dl-funcdesc.h>
#include <dl-machine-rel.h>
@@ -293,7 +292,6 @@ static inline void __attribute__ ((unuse
dl_platform_init (void)
{
__tcb_parse_hwcap_and_convert_at_platform ();
- init_cpu_features (&GLRO(dl_powerpc_cpu_features));
}
#endif
diff -rupN a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c 2023-09-13 01:16:20.219893569 -0400
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c 2023-09-13 01:19:17.169756083 -0400
@@ -17,6 +17,7 @@
<https://www.gnu.org/licenses/>. */
#include <assert.h>
+#include <cpu-features.h>
#include <string.h>
#include <wchar.h>
#include <ldsodefs.h>
@@ -32,9 +33,9 @@ __libc_ifunc_impl_list (const char *name
assert (max >= MAX_IFUNC);
size_t i = 0;
-
- unsigned long int hwcap = GLRO(dl_hwcap);
- unsigned long int hwcap2 = GLRO(dl_hwcap2);
+ const struct cpu_features *features = &GLRO(dl_powerpc_cpu_features);
+ unsigned long int hwcap = features->hwcap;
+ unsigned long int hwcap2 = features->hwcap2;
#ifdef SHARED
int cacheline_size = GLRO(dl_cache_line_size);
#endif
diff -rupN a/sysdeps/unix/sysv/linux/powerpc/Makefile b/sysdeps/unix/sysv/linux/powerpc/Makefile
--- a/sysdeps/unix/sysv/linux/powerpc/Makefile 2021-08-01 21:33:43.000000000 -0400
+++ b/sysdeps/unix/sysv/linux/powerpc/Makefile 2023-09-13 01:17:19.243181002 -0400
@@ -21,7 +21,12 @@ ifeq ($(subdir),misc)
sysdep_headers += bits/ppc.h
sysdep_routines += get_timebase_freq
tests-static += test-gettimebasefreq-static
-tests += $(tests-static)
-tests += test-gettimebasefreq
-tests += test-powerpc-linux-sysconf
+tests += \
+ $(tests-static) \
+ test-gettimebasefreq \
+ test-powerpc-linux-sysconf \
+ tst-hwcap-tunables \
+ # tests
+
+tst-hwcap-tunables-ARGS = -- $(host-test-program-cmd)
endif
diff -rupN a/sysdeps/unix/sysv/linux/powerpc/cpu-features.c b/sysdeps/unix/sysv/linux/powerpc/cpu-features.c
--- a/sysdeps/unix/sysv/linux/powerpc/cpu-features.c 1969-12-31 19:00:00.000000000 -0500
+++ b/sysdeps/unix/sysv/linux/powerpc/cpu-features.c 2023-09-13 01:17:19.247181157 -0400
@@ -0,0 +1,124 @@
+/* Initialize cpu feature data. PowerPC version.
+ Copyright (C) 2017-2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <array_length.h>
+#include <stdint.h>
+#include <cpu-features.h>
+#include <elf/dl-tunables.h>
+#include <unistd.h>
+#include <string.h>
+
+static void
+TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
+{
+ /* The current IFUNC selection is always using the most recent
+ features which are available via AT_HWCAP or AT_HWCAP2. But in
+ some scenarios it is useful to adjust this selection.
+
+ The environment variable:
+
+ GLIBC_TUNABLES=glibc.cpu.hwcaps=-xxx,yyy,....
+
+ Can be used to enable HWCAP/HWCAP2 feature yyy, disable HWCAP/HWCAP2
+ feature xxx, where the feature name is case-sensitive and has to match
+ the ones mentioned in the file{sysdeps/powerpc/dl-procinfo.c}. */
+
+ /* Copy the features from dl_powerpc_cpu_features, which contains the
+ features provided by AT_HWCAP and AT_HWCAP2. */
+ struct cpu_features *cpu_features = &GLRO(dl_powerpc_cpu_features);
+ unsigned long int tcbv_hwcap = cpu_features->hwcap;
+ unsigned long int tcbv_hwcap2 = cpu_features->hwcap2;
+ const char *token = valp->strval;
+ do
+ {
+ const char *token_end, *feature;
+ bool disable;
+ size_t token_len, i, feature_len, offset = 0;
+ /* Find token separator or end of string. */
+ for (token_end = token; *token_end != ','; token_end++)
+ if (*token_end == '\0')
+ break;
+
+ /* Determine feature. */
+ token_len = token_end - token;
+ if (*token == '-')
+ {
+ disable = true;
+ feature = token + 1;
+ feature_len = token_len - 1;
+ }
+ else
+ {
+ disable = false;
+ feature = token;
+ feature_len = token_len;
+ }
+ for (i = 0; i < array_length (hwcap_tunables); ++i)
+ {
+ const char *hwcap_name = hwcap_names + offset;
+ size_t hwcap_name_len = strlen (hwcap_name);
+ /* Check the tunable name on the supported list. */
+ if (hwcap_name_len == feature_len
+ && memcmp (feature, hwcap_name, feature_len) == 0)
+ {
+ /* Update the hwcap and hwcap2 bits. */
+ if (disable)
+ {
+ /* Id is 1 for hwcap2 tunable. */
+ if (hwcap_tunables[i].id)
+ cpu_features->hwcap2 &= ~(hwcap_tunables[i].mask);
+ else
+ cpu_features->hwcap &= ~(hwcap_tunables[i].mask);
+ }
+ else
+ {
+ /* Enable the features and also check that no unsupported
+ features were enabled by user. */
+ if (hwcap_tunables[i].id)
+ cpu_features->hwcap2 |= (tcbv_hwcap2 & hwcap_tunables[i].mask);
+ else
+ cpu_features->hwcap |= (tcbv_hwcap & hwcap_tunables[i].mask);
+ }
+ break;
+ }
+ offset += hwcap_name_len + 1;
+ }
+ token += token_len;
+ /* ... and skip token separator for next round. */
+ if (*token == ',')
+ token++;
+ }
+ while (*token != '\0');
+}
+
+static inline void
+init_cpu_features (struct cpu_features *cpu_features, uint64_t hwcaps[])
+{
+ /* Fill the cpu_features with the supported hwcaps
+ which are set by __tcb_parse_hwcap_and_convert_at_platform. */
+ cpu_features->hwcap = hwcaps[0];
+ cpu_features->hwcap2 = hwcaps[1];
+ /* Default is to use aligned memory access on optimized function unless
+ tunables is enable, since for this case user can explicit disable
+ unaligned optimizations. */
+ int32_t cached_memfunc = TUNABLE_GET (glibc, cpu, cached_memopt, int32_t,
+ NULL);
+ cpu_features->use_cached_memopt = (cached_memfunc > 0);
+ TUNABLE_GET (glibc, cpu, hwcaps, tunable_val_t *,
+ TUNABLE_CALLBACK (set_hwcaps));
+}
diff -rupN a/sysdeps/unix/sysv/linux/powerpc/cpu-features.h b/sysdeps/unix/sysv/linux/powerpc/cpu-features.h
--- a/sysdeps/unix/sysv/linux/powerpc/cpu-features.h 1969-12-31 19:00:00.000000000 -0500
+++ b/sysdeps/unix/sysv/linux/powerpc/cpu-features.h 2023-09-13 01:17:19.251181312 -0400
@@ -0,0 +1,130 @@
+/* Initialize cpu feature data. PowerPC version.
+ Copyright (C) 2017-2023 Free Software Foundation, Inc.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef __CPU_FEATURES_POWERPC_H
+# define __CPU_FEATURES_POWERPC_H
+
+#include <stdbool.h>
+#include <sys/auxv.h>
+
+struct cpu_features
+{
+ bool use_cached_memopt;
+ unsigned long int hwcap;
+ unsigned long int hwcap2;
+};
+
+static const char hwcap_names[] = {
+ "4xxmac\0"
+ "altivec\0"
+ "arch_2_05\0"
+ "arch_2_06\0"
+ "archpmu\0"
+ "booke\0"
+ "cellbe\0"
+ "dfp\0"
+ "efpdouble\0"
+ "efpsingle\0"
+ "fpu\0"
+ "ic_snoop\0"
+ "mmu\0"
+ "notb\0"
+ "pa6t\0"
+ "power4\0"
+ "power5\0"
+ "power5+\0"
+ "power6x\0"
+ "ppc32\0"
+ "ppc601\0"
+ "ppc64\0"
+ "ppcle\0"
+ "smt\0"
+ "spe\0"
+ "true_le\0"
+ "ucache\0"
+ "vsx\0"
+ "arch_2_07\0"
+ "dscr\0"
+ "ebb\0"
+ "htm\0"
+ "htm-nosc\0"
+ "htm-no-suspend\0"
+ "isel\0"
+ "tar\0"
+ "vcrypto\0"
+ "arch_3_00\0"
+ "ieee128\0"
+ "darn\0"
+ "scv\0"
+ "arch_3_1\0"
+ "mma\0"
+};
+
+static const struct
+{
+ unsigned int mask;
+ bool id;
+} hwcap_tunables[] = {
+ /* AT_HWCAP tunable masks. */
+ { PPC_FEATURE_HAS_4xxMAC, 0 },
+ { PPC_FEATURE_HAS_ALTIVEC, 0 },
+ { PPC_FEATURE_ARCH_2_05, 0 },
+ { PPC_FEATURE_ARCH_2_06, 0 },
+ { PPC_FEATURE_PSERIES_PERFMON_COMPAT, 0 },
+ { PPC_FEATURE_BOOKE, 0 },
+ { PPC_FEATURE_CELL_BE, 0 },
+ { PPC_FEATURE_HAS_DFP, 0 },
+ { PPC_FEATURE_HAS_EFP_DOUBLE, 0 },
+ { PPC_FEATURE_HAS_EFP_SINGLE, 0 },
+ { PPC_FEATURE_HAS_FPU, 0 },
+ { PPC_FEATURE_ICACHE_SNOOP, 0 },
+ { PPC_FEATURE_HAS_MMU, 0 },
+ { PPC_FEATURE_NO_TB, 0 },
+ { PPC_FEATURE_PA6T, 0 },
+ { PPC_FEATURE_POWER4, 0 },
+ { PPC_FEATURE_POWER5, 0 },
+ { PPC_FEATURE_POWER5_PLUS, 0 },
+ { PPC_FEATURE_POWER6_EXT, 0 },
+ { PPC_FEATURE_32, 0 },
+ { PPC_FEATURE_601_INSTR, 0 },
+ { PPC_FEATURE_64, 0 },
+ { PPC_FEATURE_PPC_LE, 0 },
+ { PPC_FEATURE_SMT, 0 },
+ { PPC_FEATURE_HAS_SPE, 0 },
+ { PPC_FEATURE_TRUE_LE, 0 },
+ { PPC_FEATURE_UNIFIED_CACHE, 0 },
+ { PPC_FEATURE_HAS_VSX, 0 },
+
+ /* AT_HWCAP2 tunable masks. */
+ { PPC_FEATURE2_ARCH_2_07, 1 },
+ { PPC_FEATURE2_HAS_DSCR, 1 },
+ { PPC_FEATURE2_HAS_EBB, 1 },
+ { PPC_FEATURE2_HAS_HTM, 1 },
+ { PPC_FEATURE2_HTM_NOSC, 1 },
+ { PPC_FEATURE2_HTM_NO_SUSPEND, 1 },
+ { PPC_FEATURE2_HAS_ISEL, 1 },
+ { PPC_FEATURE2_HAS_TAR, 1 },
+ { PPC_FEATURE2_HAS_VEC_CRYPTO, 1 },
+ { PPC_FEATURE2_ARCH_3_00, 1 },
+ { PPC_FEATURE2_HAS_IEEE128, 1 },
+ { PPC_FEATURE2_DARN, 1 },
+ { PPC_FEATURE2_SCV, 1 },
+ { PPC_FEATURE2_ARCH_3_1, 1 },
+ { PPC_FEATURE2_MMA, 1 },
+};
+
+#endif /* __CPU_FEATURES_H */
diff -rupN a/sysdeps/unix/sysv/linux/powerpc/powerpc64/le/dl-tunables.list b/sysdeps/unix/sysv/linux/powerpc/powerpc64/le/dl-tunables.list
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/le/dl-tunables.list 2023-09-13 01:16:19.989884657 -0400
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/le/dl-tunables.list 2023-09-13 01:17:19.254181428 -0400
@@ -28,3 +28,4 @@
@order glibc.malloc.check
@order glibc.gmon.minarcs
@order glibc.gmon.maxarcs
+@order glibc.cpu.hwcaps
diff -rupN a/sysdeps/unix/sysv/linux/powerpc/tst-hwcap-tunables.c b/sysdeps/unix/sysv/linux/powerpc/tst-hwcap-tunables.c
--- a/sysdeps/unix/sysv/linux/powerpc/tst-hwcap-tunables.c 1969-12-31 19:00:00.000000000 -0500
+++ b/sysdeps/unix/sysv/linux/powerpc/tst-hwcap-tunables.c 2023-09-13 01:17:19.258181583 -0400
@@ -0,0 +1,128 @@
+/* Tests for powerpc GLIBC_TUNABLES=glibc.cpu.hwcaps filter.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <array_length.h>
+#include <getopt.h>
+#include <ifunc-impl-list.h>
+#include <spawn.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <support/check.h>
+#include <support/support.h>
+#include <support/xunistd.h>
+#include <sys/auxv.h>
+#include <sys/wait.h>
+
+/* Nonzero if the program gets called via `exec'. */
+#define CMDLINE_OPTIONS \
+ { "restart", no_argument, &restart, 1 },
+static int restart;
+
+/* Hold the four initial argument used to respawn the process, plus the extra
+ '--direct', '--restart', and the function to check */
+static char *spargs[8];
+static int fc;
+
+/* Called on process re-execution. */
+_Noreturn static void
+handle_restart (int argc, char *argv[])
+{
+ TEST_VERIFY_EXIT (argc == 1);
+ const char *funcname = argv[0];
+
+ struct libc_ifunc_impl impls[32];
+ int cnt = __libc_ifunc_impl_list ("memcpy", impls, array_length (impls));
+ if (cnt == 0)
+ _exit (EXIT_SUCCESS);
+ TEST_VERIFY_EXIT (cnt >= 1);
+ for (int i = 0; i < cnt; i++) {
+ if (strcmp (impls[i].name, funcname) == 0)
+ {
+ TEST_COMPARE (impls[i].usable, false);
+ break;
+ }
+ }
+
+ _exit (EXIT_SUCCESS);
+}
+
+static void
+run_test (const char *filter, const char *funcname)
+{
+ printf ("info: checking filter %s (expect %s ifunc selection to be removed)\n",
+ filter, funcname);
+ char *tunable = xasprintf ("GLIBC_TUNABLES=glibc.cpu.hwcaps=%s", filter);
+ char *const newenvs[] = { (char*) tunable, NULL };
+ spargs[fc] = (char *) funcname;
+
+ pid_t pid;
+ TEST_COMPARE (posix_spawn (&pid, spargs[0], NULL, NULL, spargs, newenvs), 0);
+ int status;
+ TEST_COMPARE (xwaitpid (pid, &status, 0), pid);
+ TEST_VERIFY (WIFEXITED (status));
+ TEST_VERIFY (!WIFSIGNALED (status));
+ TEST_COMPARE (WEXITSTATUS (status), 0);
+
+ free (tunable);
+}
+
+static int
+do_test (int argc, char *argv[])
+{
+ if (restart)
+ handle_restart (argc - 1, &argv[1]);
+
+ TEST_VERIFY_EXIT (argc == 2 || argc == 5);
+
+ int i;
+ for (i = 0; i < argc - 1; i++)
+ spargs[i] = argv[i + 1];
+ spargs[i++] = (char *) "--direct";
+ spargs[i++] = (char *) "--restart";
+ fc = i++;
+ spargs[i] = NULL;
+
+ unsigned long int hwcap = getauxval (AT_HWCAP);
+ unsigned long int hwcap2 = getauxval (AT_HWCAP2);
+ if (__WORDSIZE == 64)
+ {
+ if (hwcap2 & PPC_FEATURE2_ARCH_3_1)
+ run_test ("-arch_3_1", "__memcpy_power10");
+ if (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+ run_test ("-arch_2_07", "__memcpy_power8_cached");
+ if (hwcap & PPC_FEATURE_ARCH_2_06)
+ run_test ("-arch_2_06", "__memcpy_power7");
+ if (hwcap & PPC_FEATURE_ARCH_2_05)
+ run_test ("-arch_2_06,-arch_2_05","__memcpy_power6");
+ run_test ("-arch_2_06,-arch_2_05,-power5+,-power5,-power4", "__memcpy_power4");
+ }
+ else
+ {
+ if (hwcap & PPC_FEATURE_HAS_VSX)
+ run_test ("-vsx", "__memcpy_power7");
+ if (hwcap & PPC_FEATURE_ARCH_2_06)
+ run_test ("-arch_2_06", "__memcpy_a2");
+ if (hwcap & PPC_FEATURE_ARCH_2_05)
+ run_test ("-arch_2_05", "__memcpy_power6");
+ }
+ return 0;
+}
+
+#define TEST_FUNCTION_ARGV do_test
+#include <support/test-driver.c>

View File

@ -0,0 +1,148 @@
commit 8329939a37f483a16013dd8af8303cbcb86d92cb
Author: Florian Weimer <fweimer@redhat.com>
Date: Fri Jul 4 21:46:16 2025 +0200
elf: Introduce _dl_debug_change_state
It combines updating r_state with the debugger notification.
The second change to _dl_open introduces an additional debugger
notification for dlmopen, but debuggers are expected to ignore it.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
diff --git a/elf/dl-close.c b/elf/dl-close.c
index 236d89f67f3bf410..fa3974afba798073 100644
--- a/elf/dl-close.c
+++ b/elf/dl-close.c
@@ -472,8 +472,7 @@ _dl_close_worker (struct link_map *map, bool force)
/* Notify the debugger we are about to remove some loaded objects.
LA_ACT_DELETE has already been signalled above for !unload_any. */
struct r_debug *r = _dl_debug_update (nsid);
- r->r_state = RT_DELETE;
- _dl_debug_state ();
+ _dl_debug_change_state (r, RT_DELETE);
LIBC_PROBE (unmap_start, 2, nsid, r);
if (unload_global)
@@ -762,8 +761,7 @@ _dl_close_worker (struct link_map *map, bool force)
__rtld_lock_unlock_recursive (GL(dl_load_tls_lock));
/* Notify the debugger those objects are finalized and gone. */
- r->r_state = RT_CONSISTENT;
- _dl_debug_state ();
+ _dl_debug_change_state (r, RT_CONSISTENT);
LIBC_PROBE (unmap_complete, 2, nsid, r);
#ifdef SHARED
diff --git a/elf/dl-debug.c b/elf/dl-debug.c
index 649386d5a6b885ed..f840a1b92292968d 100644
--- a/elf/dl-debug.c
+++ b/elf/dl-debug.c
@@ -67,6 +67,13 @@ _dl_debug_update (Lmid_t ns)
return &r->base;
}
+void
+_dl_debug_change_state (struct r_debug *r, int state)
+{
+ atomic_store_release (&r->r_state, state);
+ _dl_debug_state ();
+}
+
/* Initialize _r_debug_extended for the namespace NS. LDBASE is the
run-time load address of the dynamic linker, to be put in
_r_debug_extended.r_ldbase. Return the address of _r_debug. */
diff --git a/elf/dl-load.c b/elf/dl-load.c
index 6714807946b60188..c118db811d8899f6 100644
--- a/elf/dl-load.c
+++ b/elf/dl-load.c
@@ -946,8 +946,7 @@ _dl_notify_new_object (int mode, Lmid_t nsid, struct link_map *l)
/* Notify the debugger we have added some objects. We need to
call _dl_debug_initialize in a static program in case dynamic
linking has not been used before. */
- r->r_state = RT_ADD;
- _dl_debug_state ();
+ _dl_debug_change_state (r, RT_ADD);
LIBC_PROBE (map_start, 2, nsid, r);
}
else
diff --git a/elf/dl-open.c b/elf/dl-open.c
index 1e61e402455da666..df6aa55a8842ee62 100644
--- a/elf/dl-open.c
+++ b/elf/dl-open.c
@@ -787,8 +787,7 @@ dl_open_worker (void *a)
#ifdef SHARED
bool was_not_consistent = r->r_state != RT_CONSISTENT;
#endif
- r->r_state = RT_CONSISTENT;
- _dl_debug_state ();
+ _dl_debug_change_state (r, RT_CONSISTENT);
LIBC_PROBE (map_complete, 3, nsid, r, args->map);
#ifdef SHARED
@@ -866,7 +865,7 @@ no more namespaces available for dlmopen()"));
}
GL(dl_ns)[nsid].libc_map = NULL;
- _dl_debug_update (nsid)->r_state = RT_CONSISTENT;
+ _dl_debug_change_state (_dl_debug_update (nsid), RT_CONSISTENT);
}
/* Never allow loading a DSO in a namespace which is empty. Such
direct placements is only causing problems. Also don't allow
diff --git a/elf/rtld.c b/elf/rtld.c
index cd233174c9d944b2..0fe9986e4c7ed830 100644
--- a/elf/rtld.c
+++ b/elf/rtld.c
@@ -1842,8 +1842,7 @@ dl_main (const ElfW(Phdr) *phdr,
#endif
/* We start adding objects. */
- r->r_state = RT_ADD;
- _dl_debug_state ();
+ _dl_debug_change_state (r, RT_ADD);
LIBC_PROBE (init_start, 2, LM_ID_BASE, r);
/* Auditing checkpoint: we are ready to signal that the initial map
@@ -2527,8 +2526,7 @@ dl_main (const ElfW(Phdr) *phdr,
/* Notify the debugger all new objects are now ready to go. We must re-get
the address since by now the variable might be in another object. */
r = _dl_debug_update (LM_ID_BASE);
- r->r_state = RT_CONSISTENT;
- _dl_debug_state ();
+ _dl_debug_change_state (r, RT_CONSISTENT);
LIBC_PROBE (init_complete, 2, LM_ID_BASE, r);
#ifdef SHARED
diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h
index 21dbe2d21ed8e605..371c32dd79c3ea2b 100644
--- a/sysdeps/generic/ldsodefs.h
+++ b/sysdeps/generic/ldsodefs.h
@@ -1129,8 +1129,14 @@ extern void _dl_debug_state (void);
rtld_hidden_proto (_dl_debug_state)
/* Initialize `struct r_debug_extended' for the namespace NS. LDBASE
- is the run-time load address of the dynamic linker, to be put in the
- `r_ldbase' member. Return the address of the structure. */
+ is the run-time load address of the dynamic linker, to be put in
+ the `r_ldbase' member.
+
+ Return the address of the r_debug structure for the namespace.
+ This is not merely a convenience or optimization, but it is
+ necessary for the LIBC_PROBE Systemtap/debugger probes to work
+ reliably: direct variable access can create probes that tools
+ cannot consume. */
extern struct r_debug *_dl_debug_initialize (ElfW(Addr) ldbase, Lmid_t ns)
attribute_hidden;
@@ -1138,6 +1144,10 @@ extern struct r_debug *_dl_debug_initialize (ElfW(Addr) ldbase, Lmid_t ns)
of the namespace NS. */
extern struct r_debug *_dl_debug_update (Lmid_t ns) attribute_hidden;
+/* Update R->r_state to STATE and notify the debugger by calling
+ _dl_debug_state. */
+void _dl_debug_change_state (struct r_debug *r, int state) attribute_hidden;
+
/* Initialize the basic data structure for the search paths. SOURCE
is either "LD_LIBRARY_PATH" or "--library-path".
GLIBC_HWCAPS_PREPEND adds additional glibc-hwcaps subdirectories to

View File

@ -0,0 +1,264 @@
commit ea85e7d55087075376a29261e722e4fae14ecbe7
Author: Florian Weimer <fweimer@redhat.com>
Date: Fri Jul 4 21:46:30 2025 +0200
elf: Restore support for _r_debug interpositions and copy relocations
The changes in commit a93d9e03a31ec14405cb3a09aa95413b67067380
("Extend struct r_debug to support multiple namespaces [BZ #15971]")
break the dyninst dynamic instrumentation tool. It brings its
own definition of _r_debug (rather than a declaration).
Furthermore, it turns out it is rather hard to use the proposed
handshake for accessing _r_debug via DT_DEBUG. If applications want
to access _r_debug, they can do so directly if the relevant code has
been built as PIC. To protect against harm from accidental copy
relocations due to linker relaxations, this commit restores copy
relocation support by adjusting both copies if interposition or
copy relocations are in play. Therefore, it is possible to
use a hidden reference in ld.so to access _r_debug.
Only perform the copy relocation initialization if libc has been
loaded. Otherwise, the ld.so search scope can be empty, and the
lookup of the _r_debug symbol mail fail.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Conflicts:
elf/rtld.c: Adjust for prelink.
diff --git a/elf/Makefile b/elf/Makefile
index 721f254d121118c0..3eac746d21042ec9 100644
--- a/elf/Makefile
+++ b/elf/Makefile
@@ -404,6 +404,8 @@ tests += \
tst-dlmopen1 \
tst-dlmopen3 \
tst-dlmopen4 \
+ tst-dlmopen4-nonpic \
+ tst-dlmopen4-pic \
tst-dlopen-auditdup \
tst-dlopen-constructor-null \
tst-dlopen-self \
@@ -1986,6 +1988,13 @@ $(objpfx)tst-dlmopen3.out: $(objpfx)tst-dlmopen1mod.so
$(objpfx)tst-dlmopen4.out: $(objpfx)tst-dlmopen1mod.so
+CFLAGS-tst-dlmopen4-pic.c += -fPIC
+$(objpfx)tst-dlmopen4-pic.out: $(objpfx)tst-dlmopen1mod.so
+
+CFLAGS-tst-dlmopen4-nonpic.c += -fno-pie
+tst-dlmopen4-nonpic-no-pie = yes
+$(objpfx)tst-dlmopen4-nonpic.out: $(objpfx)tst-dlmopen1mod.so
+
$(objpfx)tst-audit1.out: $(objpfx)tst-auditmod1.so
tst-audit1-ENV = LD_AUDIT=$(objpfx)tst-auditmod1.so
diff --git a/elf/dl-debug-symbols.S b/elf/dl-debug-symbols.S
index 28456ab1f237ea87..629b0c0c6b2cd9e1 100644
--- a/elf/dl-debug-symbols.S
+++ b/elf/dl-debug-symbols.S
@@ -38,3 +38,4 @@
_r_debug:
_r_debug_extended:
.zero R_DEBUG_EXTENDED_SIZE
+rtld_hidden_def (_r_debug)
diff --git a/elf/dl-debug.c b/elf/dl-debug.c
index f840a1b92292968d..4388a04cdf828898 100644
--- a/elf/dl-debug.c
+++ b/elf/dl-debug.c
@@ -16,6 +16,7 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
+#include <assert.h>
#include <ldsodefs.h>
@@ -37,6 +38,37 @@ extern const int verify_link_map_members[(VERIFY_MEMBER (l_addr)
to LM_ID_BASE + 1. See elf/dl-debug-symbols.S. */
struct r_debug_extended _r_debug_array[DL_NNS - 1];
+/* If not null, pointer to the _r_debug in the main executable. */
+static struct r_debug *_r_debug_main;
+
+void
+_dl_debug_post_relocate (struct link_map *main_map)
+{
+ /* Perform a full symbol search in all objects, to maintain
+ compatibility if interposed _r_debug definitions. The lookup
+ cannot fail because there is a definition in ld.so, and this
+ function is only called if the ld.so search scope is not empty. */
+ const ElfW(Sym) *sym = NULL;
+ lookup_t result =_dl_lookup_symbol_x ("_r_debug", main_map, &sym,
+ main_map->l_scope, NULL, 0, 0, NULL);
+ if (sym->st_size >= sizeof (struct r_debug))
+ {
+ struct r_debug *main_r_debug = DL_SYMBOL_ADDRESS (result, sym);
+ if (main_r_debug != &_r_debug_extended.base)
+ {
+ /* The extended version of the struct is not available in
+ the main executable because a copy relocation has been
+ used. r_map etc. have already been copied as part of the
+ copy relocation processing. */
+ main_r_debug->r_version = 1;
+
+ /* Record that dual updates of the initial link map are
+ required. */
+ _r_debug_main = main_r_debug;
+ }
+ }
+}
+
/* Return the r_debug object for the namespace NS. */
static inline struct r_debug_extended *
get_rdebug (Lmid_t ns)
@@ -71,6 +103,11 @@ void
_dl_debug_change_state (struct r_debug *r, int state)
{
atomic_store_release (&r->r_state, state);
+#ifdef SHARED
+ if (r == &_r_debug_extended.base && _r_debug_main != NULL)
+ /* Update the copy-relocation of _r_debug. */
+ atomic_store_release (&_r_debug_main->r_state, state);
+#endif
_dl_debug_state ();
}
@@ -103,7 +140,9 @@ _dl_debug_initialize (ElfW(Addr) ldbase, Lmid_t ns)
if (ns - 1 == LM_ID_BASE)
{
atomic_store_release (&_r_debug_extended.r_next, r);
- /* Now there are multiple namespaces. */
+ /* Now there are multiple namespaces. Note that this
+ deliberately does not update the copy in the main
+ executable (if it exists). */
atomic_store_release (&_r_debug_extended.base.r_version, 2);
}
else
@@ -116,8 +155,15 @@ _dl_debug_initialize (ElfW(Addr) ldbase, Lmid_t ns)
}
if (r->base.r_map == NULL)
- atomic_store_release (&r->base.r_map,
- (void *) GL(dl_ns)[ns]._ns_loaded);
+ {
+ struct link_map_public *l = (void *) GL(dl_ns)[ns]._ns_loaded;
+ atomic_store_release (&r->base.r_map, l);
+#ifdef SHARED
+ if (ns == LM_ID_BASE && _r_debug_main != NULL)
+ /* Update the copy-relocation of _r_debug. */
+ atomic_store_release (&_r_debug_main->r_map, l);
+#endif
+ }
return &r->base;
}
diff --git a/elf/rtld.c b/elf/rtld.c
index 0fe9986e4c7ed830..dac827e249b2fe14 100644
--- a/elf/rtld.c
+++ b/elf/rtld.c
@@ -2395,6 +2395,9 @@ dl_main (const ElfW(Phdr) *phdr,
/* Likewise for the locking implementation. */
__rtld_mutex_init ();
+ /* Update copy-relocated _r_debug if necessary. */
+ _dl_debug_post_relocate (main_map);
+
/* Mark all the objects so we know they have been already relocated. */
for (struct link_map *l = main_map; l != NULL; l = l->l_next)
{
@@ -2505,6 +2508,9 @@ dl_main (const ElfW(Phdr) *phdr,
__rtld_mutex_init ();
__rtld_malloc_init_real (main_map);
+
+ /* Update copy-relocated _r_debug if necessary. */
+ _dl_debug_post_relocate (main_map);
}
/* All ld.so initialization is complete. Apply RELRO. */
diff --git a/elf/tst-dlmopen4-nonpic.c b/elf/tst-dlmopen4-nonpic.c
new file mode 100644
index 0000000000000000..ad4e40995337f4f9
--- /dev/null
+++ b/elf/tst-dlmopen4-nonpic.c
@@ -0,0 +1,2 @@
+#define BUILD_FOR_NONPIC
+#include "tst-dlmopen4.c"
diff --git a/elf/tst-dlmopen4-pic.c b/elf/tst-dlmopen4-pic.c
new file mode 100644
index 0000000000000000..919fa85c2579fb5d
--- /dev/null
+++ b/elf/tst-dlmopen4-pic.c
@@ -0,0 +1,2 @@
+#define BUILD_FOR_PIC
+#include "tst-dlmopen4.c"
diff --git a/elf/tst-dlmopen4.c b/elf/tst-dlmopen4.c
index 3fe150e50bc259f0..633addf41978cee8 100644
--- a/elf/tst-dlmopen4.c
+++ b/elf/tst-dlmopen4.c
@@ -53,6 +53,15 @@ do_test (void)
TEST_COMPARE (debug->base.r_version, 1);
TEST_VERIFY_EXIT (debug->r_next == NULL);
+#ifdef BUILD_FOR_PIC
+ /* In a PIC build, using _r_debug directly should give us the same
+ object. */
+ TEST_VERIFY (&_r_debug == &debug->base);
+#endif
+#ifdef BUILD_FOR_NONPIC
+ TEST_COMPARE (_r_debug.r_version, 1);
+#endif
+
void *h = xdlmopen (LM_ID_NEWLM, "$ORIGIN/tst-dlmopen1mod.so",
RTLD_LAZY);
@@ -64,6 +73,19 @@ do_test (void)
const char *name = basename (debug->r_next->base.r_map->l_name);
TEST_COMPARE_STRING (name, "tst-dlmopen1mod.so");
+#ifdef BUILD_FOR_NONPIC
+ /* If a copy relocation is used, it must be at version 1. */
+ if (&_r_debug != &debug->base)
+ {
+ TEST_COMPARE (_r_debug.r_version, 1);
+ TEST_COMPARE ((uintptr_t) _r_debug.r_map,
+ (uintptr_t) debug->base.r_map);
+ TEST_COMPARE (_r_debug.r_brk, debug->base.r_brk);
+ TEST_COMPARE (_r_debug.r_state, debug->base.r_state);
+ TEST_COMPARE (_r_debug.r_ldbase, debug->base.r_ldbase);
+ }
+#endif
+
xdlclose (h);
return 0;
diff --git a/include/link.h b/include/link.h
index 0cf130ddb8af2e89..bafac6c9628b183c 100644
--- a/include/link.h
+++ b/include/link.h
@@ -366,6 +366,8 @@ struct auditstate
dynamic linker. */
extern struct r_debug_extended _r_debug_extended attribute_hidden;
+rtld_hidden_proto (_r_debug)
+
#if __ELF_NATIVE_CLASS == 32
# define symbind symbind32
# define LA_SYMBIND "la_symbind32"
diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h
index 371c32dd79c3ea2b..484893c2928db8e7 100644
--- a/sysdeps/generic/ldsodefs.h
+++ b/sysdeps/generic/ldsodefs.h
@@ -1140,6 +1140,10 @@ rtld_hidden_proto (_dl_debug_state)
extern struct r_debug *_dl_debug_initialize (ElfW(Addr) ldbase, Lmid_t ns)
attribute_hidden;
+/* This is called after relocation processing to handle a potential
+ copy relocation for _r_debug. */
+void _dl_debug_post_relocate (struct link_map *main_map) attribute_hidden;
+
/* Update the `r_map' member and return the address of `struct r_debug'
of the namespace NS. */
extern struct r_debug *_dl_debug_update (Lmid_t ns) attribute_hidden;

View File

@ -0,0 +1,51 @@
commit cdcf24ee14c27b77744ff52ab3ae852821207eb0
Author: Florian Weimer <fweimer@redhat.com>
Date: Thu Jul 17 14:44:05 2025 +0200
iconv: iconv -o should not create executable files (bug 33164)
The mistake is that open must use 0666 to pick up the umask,
and not 0777 (which is required by mkdir).
Fixes commit 8ef3cff9d1ceafe369f982d980678d749fb93bd2
("iconv: Support in-place conversions (bug 10460, bug 32033)").
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
diff --git a/iconv/iconv_prog.c b/iconv/iconv_prog.c
index e3b051a309ff142b..08ea99d6adf6ea86 100644
--- a/iconv/iconv_prog.c
+++ b/iconv/iconv_prog.c
@@ -437,7 +437,7 @@ input_error (const char *path)
static void
open_output_direct (void)
{
- output_fd = open64 (output_file, O_WRONLY | O_CREAT | O_TRUNC, 0777);
+ output_fd = open64 (output_file, O_WRONLY | O_CREAT | O_TRUNC, 0666);
if (output_fd < 0)
output_error ();
}
@@ -458,7 +458,7 @@ prepare_output_file (char **argv)
else
{
/* If iconv creates the output file, no overlap is possible. */
- output_fd = open64 (output_file, O_WRONLY | O_CREAT | O_EXCL, 0777);
+ output_fd = open64 (output_file, O_WRONLY | O_CREAT | O_EXCL, 0666);
if (output_fd >= 0)
output_buffer_size = copy_buffer_size;
else
diff --git a/iconv/tst-iconv_prog-buffer.sh b/iconv/tst-iconv_prog-buffer.sh
index 23098ac56a344c48..562f90fe513e94d7 100644
--- a/iconv/tst-iconv_prog-buffer.sh
+++ b/iconv/tst-iconv_prog-buffer.sh
@@ -75,6 +75,10 @@ run_iconv () {
}
check_out_expected () {
+ if test -x "$tmp/out" ; then
+ echo "error: iconv output file is executable"
+ failure=true
+ fi
if ! cmp -s "$tmp/out" "$tmp/expected" ; then
echo "error: iconv output difference" >&$logfd
echo "*** expected ***" >&$logfd

View File

@ -1,112 +0,0 @@
commit 849274d48fc59bfa6db3c713c8ced8026b20f3b7
Author: Florian Weimer <fweimer@redhat.com>
Date: Thu Nov 16 19:55:35 2023 +0100
elf: Fix force_first handling in dlclose (bug 30981)
The force_first parameter was ineffective because the dlclose'd
object was not necessarily the first in the maps array. Also
enable force_first handling unconditionally, regardless of namespace.
The initial object in a namespace should be destructed first, too.
The _dl_sort_maps_dfs function had early returns for relocation
dependency processing which broke force_first handling, too, and
this is fixed in this change as well.
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
diff --git a/elf/dl-close.c b/elf/dl-close.c
index 66524b6708c59f29..8107c2d5f6ad2bc6 100644
--- a/elf/dl-close.c
+++ b/elf/dl-close.c
@@ -182,6 +182,16 @@ _dl_close_worker (struct link_map *map, bool force)
}
assert (idx == nloaded);
+ /* Put the dlclose'd map first, so that its destructor runs first.
+ The map variable is NULL after a retry. */
+ if (map != NULL)
+ {
+ maps[map->l_idx] = maps[0];
+ maps[map->l_idx]->l_idx = map->l_idx;
+ maps[0] = map;
+ maps[0]->l_idx = 0;
+ }
+
/* Keep track of the lowest index link map we have covered already. */
int done_index = -1;
while (++done_index < nloaded)
@@ -255,9 +265,10 @@ _dl_close_worker (struct link_map *map, bool force)
}
}
- /* Sort the entries. We can skip looking for the binary itself which is
- at the front of the search list for the main namespace. */
- _dl_sort_maps (maps, nloaded, (nsid == LM_ID_BASE), true);
+ /* Sort the entries. Unless retrying, the maps[0] object (the
+ original argument to dlclose) needs to remain first, so that its
+ destructor runs first. */
+ _dl_sort_maps (maps, nloaded, /* force_first */ map != NULL, true);
/* Call all termination functions at once. */
bool unload_any = false;
@@ -768,7 +779,11 @@ _dl_close_worker (struct link_map *map, bool force)
/* Recheck if we need to retry, release the lock. */
out:
if (dl_close_state == rerun)
- goto retry;
+ {
+ /* The map may have been deallocated. */
+ map = NULL;
+ goto retry;
+ }
dl_close_state = not_pending;
}
diff --git a/elf/dl-sort-maps.c b/elf/dl-sort-maps.c
index aeb79b40b45054c0..c17ac325eca658ef 100644
--- a/elf/dl-sort-maps.c
+++ b/elf/dl-sort-maps.c
@@ -260,13 +260,12 @@ _dl_sort_maps_dfs (struct link_map **maps, unsigned int nmaps,
The below memcpy is not needed in the do_reldeps case here,
since we wrote back to maps[] during DFS traversal. */
if (maps_head == maps)
- return;
+ break;
}
assert (maps_head == maps);
- return;
}
-
- memcpy (maps, rpo, sizeof (struct link_map *) * nmaps);
+ else
+ memcpy (maps, rpo, sizeof (struct link_map *) * nmaps);
/* Skipping the first object at maps[0] is not valid in general,
since traversing along object dependency-links may "find" that
diff --git a/elf/dso-sort-tests-1.def b/elf/dso-sort-tests-1.def
index 4bf9052db16fb352..cf6453e9eb85ac65 100644
--- a/elf/dso-sort-tests-1.def
+++ b/elf/dso-sort-tests-1.def
@@ -56,14 +56,16 @@ output: b>a>{}<a<b
# relocation(dynamic) dependencies. While this is technically unspecified, the
# presumed reasonable practical behavior is for the destructor order to respect
# the static DT_NEEDED links (here this means the a->b->c->d order).
-# The older dynamic_sort=1 algorithm does not achieve this, while the DFS-based
-# dynamic_sort=2 algorithm does, although it is still arguable whether going
-# beyond spec to do this is the right thing to do.
+# The older dynamic_sort=1 algorithm originally did not achieve this,
+# but this was a bug in the way _dl_sort_maps was called from _dl_close_worker,
+# effectively disabling proper force_first handling.
+# The new dynamic_sort=2 algorithm shows the effect of the simpler force_first
+# handling: the a object is simply moved to the front.
# The below expected outputs are what the two algorithms currently produce
# respectively, for regression testing purposes.
tst-bz15311: {+a;+e;+f;+g;+d;%d;-d;-g;-f;-e;-a};a->b->c->d;d=>[ba];c=>a;b=>e=>a;c=>f=>b;d=>g=>c
-output(glibc.rtld.dynamic_sort=1): {+a[d>c>b>a>];+e[e>];+f[f>];+g[g>];+d[];%d(b(e(a()))a()g(c(a()f(b(e(a()))))));-d[];-g[];-f[];-e[];-a[<a<c<d<g<f<b<e];}
-output(glibc.rtld.dynamic_sort=2): {+a[d>c>b>a>];+e[e>];+f[f>];+g[g>];+d[];%d(b(e(a()))a()g(c(a()f(b(e(a()))))));-d[];-g[];-f[];-e[];-a[<g<f<a<b<c<d<e];}
+output(glibc.rtld.dynamic_sort=1): {+a[d>c>b>a>];+e[e>];+f[f>];+g[g>];+d[];%d(b(e(a()))a()g(c(a()f(b(e(a()))))));-d[];-g[];-f[];-e[];-a[<a<b<c<d<g<f<e];}
+output(glibc.rtld.dynamic_sort=2): {+a[d>c>b>a>];+e[e>];+f[f>];+g[g>];+d[];%d(b(e(a()))a()g(c(a()f(b(e(a()))))));-d[];-g[];-f[];-e[];-a[<a<g<f<b<c<d<e];}
# Test that even in the presence of dependency loops involving dlopen'ed
# object, that object is initialized last (and not unloaded prematurely).

View File

@ -0,0 +1,106 @@
commit 2cac9559e06044ba520e785c151fbbd25011865f
Author: Florian Weimer <fweimer@redhat.com>
Date: Fri Aug 1 10:20:23 2025 +0200
elf: Extract rtld_setup_phdr function from dl_main
Remove historic binutils reference from comment and update
how this data is used by applications.
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Conflicts:
elf/rtld.
(missing __ehdr_start cleanup downstream)
diff --git a/elf/rtld.c b/elf/rtld.c
index 667880e18ae816d8..a9073d4e14b07410 100644
--- a/elf/rtld.c
+++ b/elf/rtld.c
@@ -1306,6 +1306,45 @@ rtld_setup_main_map (struct link_map *main_map)
return has_interp;
}
+/* Set up the program header information for the dynamic linker
+ itself. It can be accessed via _r_debug and dl_iterate_phdr
+ callbacks. */
+static void
+rtld_setup_phdr (void)
+{
+ const ElfW(Ehdr) *rtld_ehdr;
+
+ /* Starting from binutils-2.23, the linker will define the magic symbol
+ __ehdr_start to point to our own ELF header if it is visible in a
+ segment that also includes the phdrs. If that's not available, we use
+ the old method that assumes the beginning of the file is part of the
+ lowest-addressed PT_LOAD segment. */
+#ifdef HAVE_EHDR_START
+ extern const ElfW(Ehdr) __ehdr_start __attribute__ ((visibility ("hidden")));
+ rtld_ehdr = &__ehdr_start;
+#else
+ rtld_ehdr = (void *) GL(dl_rtld_map).l_map_start;
+#endif
+ assert (rtld_ehdr->e_ehsize == sizeof *rtld_ehdr);
+ assert (rtld_ehdr->e_phentsize == sizeof (ElfW(Phdr)));
+
+ const ElfW(Phdr) *rtld_phdr = (const void *) rtld_ehdr + rtld_ehdr->e_phoff;
+
+ GL(dl_rtld_map).l_phdr = rtld_phdr;
+ GL(dl_rtld_map).l_phnum = rtld_ehdr->e_phnum;
+
+
+ /* PT_GNU_RELRO is usually the last phdr. */
+ size_t cnt = rtld_ehdr->e_phnum;
+ while (cnt-- > 0)
+ if (rtld_phdr[cnt].p_type == PT_GNU_RELRO)
+ {
+ GL(dl_rtld_map).l_relro_addr = rtld_phdr[cnt].p_vaddr;
+ GL(dl_rtld_map).l_relro_size = rtld_phdr[cnt].p_memsz;
+ break;
+ }
+}
+
/* Adjusts the contents of the stack and related globals for the user
entry point. The ld.so processed skip_args arguments and bumped
_dl_argv and _dl_argc accordingly. Those arguments are removed from
@@ -1790,39 +1829,7 @@ dl_main (const ElfW(Phdr) *phdr,
if (GLRO(dl_use_load_bias) == (ElfW(Addr)) -2)
GLRO(dl_use_load_bias) = main_map->l_addr == 0 ? -1 : 0;
- /* Set up the program header information for the dynamic linker
- itself. It is needed in the dl_iterate_phdr callbacks. */
- const ElfW(Ehdr) *rtld_ehdr;
-
- /* Starting from binutils-2.23, the linker will define the magic symbol
- __ehdr_start to point to our own ELF header if it is visible in a
- segment that also includes the phdrs. If that's not available, we use
- the old method that assumes the beginning of the file is part of the
- lowest-addressed PT_LOAD segment. */
-#ifdef HAVE_EHDR_START
- extern const ElfW(Ehdr) __ehdr_start __attribute__ ((visibility ("hidden")));
- rtld_ehdr = &__ehdr_start;
-#else
- rtld_ehdr = (void *) GL(dl_rtld_map).l_map_start;
-#endif
- assert (rtld_ehdr->e_ehsize == sizeof *rtld_ehdr);
- assert (rtld_ehdr->e_phentsize == sizeof (ElfW(Phdr)));
-
- const ElfW(Phdr) *rtld_phdr = (const void *) rtld_ehdr + rtld_ehdr->e_phoff;
-
- GL(dl_rtld_map).l_phdr = rtld_phdr;
- GL(dl_rtld_map).l_phnum = rtld_ehdr->e_phnum;
-
-
- /* PT_GNU_RELRO is usually the last phdr. */
- size_t cnt = rtld_ehdr->e_phnum;
- while (cnt-- > 0)
- if (rtld_phdr[cnt].p_type == PT_GNU_RELRO)
- {
- GL(dl_rtld_map).l_relro_addr = rtld_phdr[cnt].p_vaddr;
- GL(dl_rtld_map).l_relro_size = rtld_phdr[cnt].p_memsz;
- break;
- }
+ rtld_setup_phdr ();
/* Add the dynamic linker to the TLS list if it also uses TLS. */
if (GL(dl_rtld_map).l_tls_blocksize != 0)

View File

@ -0,0 +1,457 @@
commit 20681be149b9eb1b6c1f4246bf4bd801221c86cd
Author: Florian Weimer <fweimer@redhat.com>
Date: Fri Aug 1 10:20:23 2025 +0200
elf: Handle ld.so with LOAD segment gaps in _dl_find_object (bug 31943)
Detect if ld.so not contiguous and handle that case in _dl_find_object.
Set l_find_object_processed even for initially loaded link maps,
otherwise dlopen of an initially loaded object adds it to
_dlfo_loaded_mappings (where maps are expected to be contiguous),
in addition to _dlfo_nodelete_mappings.
Test elf/tst-link-map-contiguous-ldso iterates over the loader
image, reading every word to make sure memory is actually mapped.
It only does that if the l_contiguous flag is set for the link map.
Otherwise, it finds gaps with mmap and checks that _dl_find_object
does not return the ld.so mapping for them.
The test elf/tst-link-map-contiguous-main does the same thing for
the libc.so shared object. This only works if the kernel loaded
the main program because the glibc dynamic loader may fill
the gaps with PROT_NONE mappings in some cases, making it contiguous,
but accesses to individual words may still fault.
Test elf/tst-link-map-contiguous-libc is again slightly different
because the dynamic loader always fills the gaps with PROT_NONE
mappings, so a different form of probing has to be used.
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Conflicts:
elf/dl-find_object.h
(missing SFrame support downstream. Update
l_find_object_processed early due to the possible
return from the for loop).
elf/dl-find_object.c
(missing is_rtld_link_map downstream)
elf/rtld.c
(missing GL(dl_rtld_map) refactoring downstream)
diff --git a/elf/Makefile b/elf/Makefile
index b37636c0f865f4e6..190ee83120c498a3 100644
--- a/elf/Makefile
+++ b/elf/Makefile
@@ -519,6 +519,8 @@ tests-internal += \
tst-dl-hwcaps_split \
tst-dlmopen2 \
tst-hash-collision3 \
+ tst-link-map-contiguous-ldso \
+ tst-link-map-contiguous-libc \
tst-ptrguard1 \
tst-stackguard1 \
tst-tls-surplus \
@@ -531,6 +533,10 @@ tests-internal += \
tst-dl_find_object tst-dl_find_object-threads \
# tests-internal
+ifeq ($(build-hardcoded-path-in-tests),yes)
+tests-internal += tst-link-map-contiguous-main
+endif
+
tests-container += \
tst-dlopen-self-container \
tst-dlopen-tlsmodid-container \
diff --git a/elf/dl-find_object.c b/elf/dl-find_object.c
index 62a8a61f6b6032cf..128329aa20ef9ed6 100644
--- a/elf/dl-find_object.c
+++ b/elf/dl-find_object.c
@@ -468,6 +468,37 @@ rtld_hidden_def (__dl_find_object_internal)
strong_alias (__dl_find_object_internal, _dl_find_object)
#endif
+/* Subroutine of _dlfo_process_initial to split out noncontigous link
+ maps. NODELETE is the number of used _dlfo_nodelete_mappings
+ elements. It is incremented as needed, and the new NODELETE value
+ is returned. */
+static size_t
+_dlfo_process_initial_noncontiguous_map (struct link_map *map,
+ size_t nodelete)
+{
+ struct dl_find_object_internal dlfo;
+ _dl_find_object_from_map (map, &dlfo);
+
+ /* PT_LOAD segments for a non-contiguous link map are added to the
+ non-closeable mappings. */
+ const ElfW(Phdr) *ph = map->l_phdr;
+ const ElfW(Phdr) *ph_end = map->l_phdr + map->l_phnum;
+ for (; ph < ph_end; ++ph)
+ if (ph->p_type == PT_LOAD)
+ {
+ if (_dlfo_nodelete_mappings != NULL)
+ {
+ /* Second pass only. */
+ _dlfo_nodelete_mappings[nodelete] = dlfo;
+ ElfW(Addr) start = ph->p_vaddr + map->l_addr;
+ _dlfo_nodelete_mappings[nodelete].map_start = start;
+ _dlfo_nodelete_mappings[nodelete].map_end = start + ph->p_memsz;
+ }
+ ++nodelete;
+ }
+ return nodelete;
+}
+
/* _dlfo_process_initial is called twice. First to compute the array
sizes from the initial loaded mappings. Second to fill in the
bases and infos arrays with the (still unsorted) data. Returns the
@@ -479,29 +510,8 @@ _dlfo_process_initial (void)
size_t nodelete = 0;
if (!main_map->l_contiguous)
- {
- struct dl_find_object_internal dlfo;
- _dl_find_object_from_map (main_map, &dlfo);
-
- /* PT_LOAD segments for a non-contiguous are added to the
- non-closeable mappings. */
- for (const ElfW(Phdr) *ph = main_map->l_phdr,
- *ph_end = main_map->l_phdr + main_map->l_phnum;
- ph < ph_end; ++ph)
- if (ph->p_type == PT_LOAD)
- {
- if (_dlfo_nodelete_mappings != NULL)
- {
- /* Second pass only. */
- _dlfo_nodelete_mappings[nodelete] = dlfo;
- _dlfo_nodelete_mappings[nodelete].map_start
- = ph->p_vaddr + main_map->l_addr;
- _dlfo_nodelete_mappings[nodelete].map_end
- = _dlfo_nodelete_mappings[nodelete].map_start + ph->p_memsz;
- }
- ++nodelete;
- }
- }
+ /* Contiguous case already handled in _dl_find_object_init. */
+ nodelete = _dlfo_process_initial_noncontiguous_map (main_map, nodelete);
size_t loaded = 0;
for (Lmid_t ns = 0; ns < GL(dl_nns); ++ns)
@@ -513,11 +523,22 @@ _dlfo_process_initial (void)
/* lt_library link maps are implicitly NODELETE. */
if (l->l_type == lt_library || l->l_nodelete_active)
{
- if (_dlfo_nodelete_mappings != NULL)
- /* Second pass only. */
- _dl_find_object_from_map
- (l, _dlfo_nodelete_mappings + nodelete);
- ++nodelete;
+ /* The kernel may have loaded ld.so with gaps. */
+ if (!l->l_contiguous
+#ifdef SHARED
+ && l == &GL(dl_rtld_map)
+#endif
+ )
+ nodelete
+ = _dlfo_process_initial_noncontiguous_map (l, nodelete);
+ else
+ {
+ if (_dlfo_nodelete_mappings != NULL)
+ /* Second pass only. */
+ _dl_find_object_from_map
+ (l, _dlfo_nodelete_mappings + nodelete);
+ ++nodelete;
+ }
}
else if (l->l_type == lt_loaded)
{
@@ -767,7 +788,6 @@ _dl_find_object_update_1 (struct link_map **loaded, size_t count)
/* Prefer newly loaded link map. */
assert (loaded_index1 > 0);
_dl_find_object_from_map (loaded[loaded_index1 - 1], dlfo);
- loaded[loaded_index1 - 1]->l_find_object_processed = 1;
--loaded_index1;
}
diff --git a/elf/dl-find_object.h b/elf/dl-find_object.h
index 11569efc9b7daf9c..fae25747edc6dca0 100644
--- a/elf/dl-find_object.h
+++ b/elf/dl-find_object.h
@@ -87,7 +87,7 @@ _dl_find_object_to_external (struct dl_find_object_internal *internal,
}
/* Extract the object location data from a link map and writes it to
- *RESULT using relaxed MO stores. */
+ *RESULT using relaxed MO stores. Set L->l_find_object_processed. */
static void __attribute__ ((unused))
_dl_find_object_from_map (struct link_map *l,
struct dl_find_object_internal *result)
@@ -100,6 +100,8 @@ _dl_find_object_from_map (struct link_map *l,
atomic_store_relaxed (&result->eh_dbase, (void *) l->l_info[DT_PLTGOT]);
#endif
+ l->l_find_object_processed = 1;
+
for (const ElfW(Phdr) *ph = l->l_phdr, *ph_end = l->l_phdr + l->l_phnum;
ph < ph_end; ++ph)
if (ph->p_type == DLFO_EH_SEGMENT_TYPE)
diff --git a/elf/rtld.c b/elf/rtld.c
index a9073d4e14b07410..425003e6c8e452ab 100644
--- a/elf/rtld.c
+++ b/elf/rtld.c
@@ -1308,7 +1308,7 @@ rtld_setup_main_map (struct link_map *main_map)
/* Set up the program header information for the dynamic linker
itself. It can be accessed via _r_debug and dl_iterate_phdr
- callbacks. */
+ callbacks, and it is used by _dl_find_object. */
static void
rtld_setup_phdr (void)
{
@@ -1334,6 +1334,29 @@ rtld_setup_phdr (void)
GL(dl_rtld_map).l_phnum = rtld_ehdr->e_phnum;
+ GL(dl_rtld_map).l_contiguous = 1;
+ /* The linker may not have produced a contiguous object. The kernel
+ will load the object with actual gaps (unlike the glibc loader
+ for shared objects, which always produces a contiguous mapping).
+ See similar logic in rtld_setup_main_map above. */
+ {
+ ElfW(Addr) expected_load_address = 0;
+ for (const ElfW(Phdr) *ph = rtld_phdr; ph < &rtld_phdr[rtld_ehdr->e_phnum];
+ ++ph)
+ if (ph->p_type == PT_LOAD)
+ {
+ ElfW(Addr) mapstart = ph->p_vaddr & ~(GLRO(dl_pagesize) - 1);
+ if (GL(dl_rtld_map).l_contiguous && expected_load_address != 0
+ && expected_load_address != mapstart)
+ GL(dl_rtld_map).l_contiguous = 0;
+ ElfW(Addr) allocend = ph->p_vaddr + ph->p_memsz;
+ /* The next expected address is the page following this load
+ segment. */
+ expected_load_address = ((allocend + GLRO(dl_pagesize) - 1)
+ & ~(GLRO(dl_pagesize) - 1));
+ }
+ }
+
/* PT_GNU_RELRO is usually the last phdr. */
size_t cnt = rtld_ehdr->e_phnum;
while (cnt-- > 0)
diff --git a/elf/tst-link-map-contiguous-ldso.c b/elf/tst-link-map-contiguous-ldso.c
new file mode 100644
index 0000000000000000..04de808bb234fe38
--- /dev/null
+++ b/elf/tst-link-map-contiguous-ldso.c
@@ -0,0 +1,98 @@
+/* Check that _dl_find_object behavior matches up with gaps.
+ Copyright (C) 2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <dlfcn.h>
+#include <gnu/lib-names.h>
+#include <link.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <support/check.h>
+#include <support/xdlfcn.h>
+#include <support/xunistd.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+static int
+do_test (void)
+{
+ struct link_map *l = xdlopen (LD_SO, RTLD_NOW);
+ if (!l->l_contiguous)
+ {
+ puts ("info: ld.so link map is not contiguous");
+
+ /* Try to find holes by probing with mmap. */
+ int pagesize = getpagesize ();
+ bool gap_found = false;
+ ElfW(Addr) addr = l->l_map_start;
+ TEST_COMPARE (addr % pagesize, 0);
+ while (addr < l->l_map_end)
+ {
+ void *expected = (void *) addr;
+ void *ptr = xmmap (expected, 1, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1);
+ struct dl_find_object dlfo;
+ int dlfo_ret = _dl_find_object (expected, &dlfo);
+ if (ptr == expected)
+ {
+ if (dlfo_ret < 0)
+ {
+ TEST_COMPARE (dlfo_ret, -1);
+ printf ("info: hole without mapping data found at %p\n", ptr);
+ }
+ else
+ FAIL ("object \"%s\" found in gap at %p",
+ dlfo.dlfo_link_map->l_name, ptr);
+ gap_found = true;
+ }
+ else if (dlfo_ret == 0)
+ {
+ if ((void *) dlfo.dlfo_link_map != (void *) l)
+ {
+ printf ("info: object \"%s\" found at %p\n",
+ dlfo.dlfo_link_map->l_name, ptr);
+ gap_found = true;
+ }
+ }
+ else
+ TEST_COMPARE (dlfo_ret, -1);
+ xmunmap (ptr, 1);
+ addr += pagesize;
+ }
+ if (!gap_found)
+ FAIL ("no ld.so gap found");
+ }
+ else
+ {
+ puts ("info: ld.so link map is contiguous");
+
+ /* Assert that ld.so is truly contiguous in memory. */
+ volatile long int *p = (volatile long int *) l->l_map_start;
+ volatile long int *end = (volatile long int *) l->l_map_end;
+ while (p < end)
+ {
+ *p;
+ ++p;
+ }
+ }
+
+ xdlclose (l);
+
+ return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/elf/tst-link-map-contiguous-libc.c b/elf/tst-link-map-contiguous-libc.c
new file mode 100644
index 0000000000000000..eb5728c765ac3cfb
--- /dev/null
+++ b/elf/tst-link-map-contiguous-libc.c
@@ -0,0 +1,57 @@
+/* Check that the entire libc.so program image is readable if contiguous.
+ Copyright (C) 2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <gnu/lib-names.h>
+#include <link.h>
+#include <support/check.h>
+#include <support/xdlfcn.h>
+#include <support/xunistd.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+static int
+do_test (void)
+{
+ struct link_map *l = xdlopen (LIBC_SO, RTLD_NOW);
+
+ /* The dynamic loader fills holes with PROT_NONE mappings. */
+ if (!l->l_contiguous)
+ FAIL_EXIT1 ("libc.so link map is not contiguous");
+
+ /* Direct probing does not work because not everything is readable
+ due to PROT_NONE mappings. */
+ int pagesize = getpagesize ();
+ ElfW(Addr) addr = l->l_map_start;
+ TEST_COMPARE (addr % pagesize, 0);
+ while (addr < l->l_map_end)
+ {
+ void *expected = (void *) addr;
+ void *ptr = xmmap (expected, 1, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1);
+ if (ptr == expected)
+ FAIL ("hole in libc.so memory image after %lu bytes",
+ (unsigned long int) (addr - l->l_map_start));
+ xmunmap (ptr, 1);
+ addr += pagesize;
+ }
+
+ xdlclose (l);
+
+ return 0;
+}
+#include <support/test-driver.c>
diff --git a/elf/tst-link-map-contiguous-main.c b/elf/tst-link-map-contiguous-main.c
new file mode 100644
index 0000000000000000..2d1a054f0fbb0855
--- /dev/null
+++ b/elf/tst-link-map-contiguous-main.c
@@ -0,0 +1,45 @@
+/* Check that the entire main program image is readable if contiguous.
+ Copyright (C) 2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <link.h>
+#include <support/check.h>
+#include <support/xdlfcn.h>
+
+static int
+do_test (void)
+{
+ struct link_map *l = xdlopen ("", RTLD_NOW);
+ if (!l->l_contiguous)
+ FAIL_UNSUPPORTED ("main link map is not contiguous");
+
+ /* This check only works if the kernel loaded the main program. The
+ dynamic loader replaces gaps with PROT_NONE mappings, resulting
+ in faults. */
+ volatile long int *p = (volatile long int *) l->l_map_start;
+ volatile long int *end = (volatile long int *) l->l_map_end;
+ while (p < end)
+ {
+ *p;
+ ++p;
+ }
+
+ xdlclose (l);
+
+ return 0;
+}
+#include <support/test-driver.c>

View File

@ -1,236 +0,0 @@
commit 7ea06e994093fa0bcca0d0ee2c1db271d8d7885d
Author: Florian Weimer <fweimer@redhat.com>
Date: Mon Jul 21 21:43:49 2025 +0200
posix: Fix double-free after allocation failure in regcomp (bug 33185)
If a memory allocation failure occurs during bracket expression
parsing in regcomp, a double-free error may result.
Reported-by: Anastasia Belova <abelova@astralinux.ru>
Co-authored-by: Paul Eggert <eggert@cs.ucla.edu>
Reviewed-by: Andreas K. Huettel <dilfridge@gentoo.org>
Conflicts:
posix/Makefile
(tests list not reformatted/sorted downstream)
posix/tst-regcomp-bracket-free.c
(missing strerrorname_np downstream)
diff --git a/posix/Makefile b/posix/Makefile
index 83162123f9c927a0..42a0290370b40fd9 100644
--- a/posix/Makefile
+++ b/posix/Makefile
@@ -96,7 +96,7 @@ tests := test-errno tstgetopt testfnm runtests runptests \
tst-posix_fadvise tst-posix_fadvise64 \
tst-sysconf-empty-chroot tst-glob_symlinks tst-fexecve \
tst-glob-tilde test-ssize-max tst-spawn4 bug-regex37 \
- bug-regex38 tst-regcomp-truncated
+ bug-regex38 tst-regcomp-truncated tst-regcomp-bracket-free
tests-internal := bug-regex5 bug-regex20 bug-regex33 \
tst-rfc3484 tst-rfc3484-2 tst-rfc3484-3 \
tst-glob_lstat_compat tst-spawn4-compat
diff --git a/posix/regcomp.c b/posix/regcomp.c
index 545d188468c376e7..b737b22da8703d6c 100644
--- a/posix/regcomp.c
+++ b/posix/regcomp.c
@@ -3375,6 +3375,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
{
#ifdef RE_ENABLE_I18N
free_charset (mbcset);
+ mbcset = NULL;
#endif
/* Build a tree for simple bracket. */
br_token.type = SIMPLE_BRACKET;
@@ -3390,7 +3391,8 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
parse_bracket_exp_free_return:
re_free (sbcset);
#ifdef RE_ENABLE_I18N
- free_charset (mbcset);
+ if (__glibc_likely (mbcset != NULL))
+ free_charset (mbcset);
#endif /* RE_ENABLE_I18N */
return NULL;
}
diff --git a/posix/tst-regcomp-bracket-free.c b/posix/tst-regcomp-bracket-free.c
new file mode 100644
index 0000000000000000..e6041ddaeba3045c
--- /dev/null
+++ b/posix/tst-regcomp-bracket-free.c
@@ -0,0 +1,176 @@
+/* Test regcomp bracket parsing with injected allocation failures (bug 33185).
+ Copyright (C) 2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+/* This test invokes regcomp multiple times, failing one memory
+ allocation in each call. The function call should fail with
+ REG_ESPACE (or succeed if it can recover from the allocation
+ failure). Previously, there was double-free bug. */
+
+#include <errno.h>
+#include <regex.h>
+#include <stdio.h>
+#include <string.h>
+#include <support/check.h>
+#include <support/namespace.h>
+#include <support/support.h>
+
+/* Data structure allocated via MAP_SHARED, so that writes from the
+ subprocess are visible. */
+struct shared_data
+{
+ /* Number of tracked allocations performed so far. */
+ volatile unsigned int allocation_count;
+
+ /* If this number is reached, one allocation fails. */
+ volatile unsigned int failing_allocation;
+
+ /* The subprocess stores the expected name here. */
+ char name[100];
+};
+
+/* Allocation count in shared mapping. */
+static struct shared_data *shared;
+
+/* Returns true if a failure should be injected for this allocation. */
+static bool
+fail_this_allocation (void)
+{
+ if (shared != NULL)
+ {
+ unsigned int count = shared->allocation_count;
+ shared->allocation_count = count + 1;
+ return count == shared->failing_allocation;
+ }
+ else
+ return false;
+}
+
+/* Failure-injecting wrappers for allocation functions used by glibc. */
+
+void *
+malloc (size_t size)
+{
+ if (fail_this_allocation ())
+ {
+ errno = ENOMEM;
+ return NULL;
+ }
+ extern __typeof (malloc) __libc_malloc;
+ return __libc_malloc (size);
+}
+
+void *
+calloc (size_t a, size_t b)
+{
+ if (fail_this_allocation ())
+ {
+ errno = ENOMEM;
+ return NULL;
+ }
+ extern __typeof (calloc) __libc_calloc;
+ return __libc_calloc (a, b);
+}
+
+void *
+realloc (void *ptr, size_t size)
+{
+ if (fail_this_allocation ())
+ {
+ errno = ENOMEM;
+ return NULL;
+ }
+ extern __typeof (realloc) __libc_realloc;
+ return __libc_realloc (ptr, size);
+}
+
+/* No-op subprocess to verify that support_isolate_in_subprocess does
+ not perform any heap allocations. */
+static void
+no_op (void *ignored)
+{
+}
+
+/* Perform a regcomp call in a subprocess. Used to count its
+ allocations. */
+static void
+initialize (void *regexp1)
+{
+ const char *regexp = regexp1;
+
+ shared->allocation_count = 0;
+
+ regex_t reg;
+ TEST_COMPARE (regcomp (&reg, regexp, 0), 0);
+}
+
+/* Perform regcomp in a subprocess with fault injection. */
+static void
+test_in_subprocess (void *regexp1)
+{
+ const char *regexp = regexp1;
+ unsigned int inject_at = shared->failing_allocation;
+
+ regex_t reg;
+ int ret = regcomp (&reg, regexp, 0);
+
+ if (ret != 0)
+ {
+ TEST_COMPARE (ret, REG_ESPACE);
+ printf ("info: allocation %u failure results in return value %d,"
+ " error %s (%d)\n",
+ inject_at, ret, strerror (errno), errno);
+ }
+}
+
+static int
+do_test (void)
+{
+ char regexp[] = "[:alpha:]";
+
+ shared = support_shared_allocate (sizeof (*shared));
+
+ /* Disable fault injection. */
+ shared->failing_allocation = ~0U;
+
+ support_isolate_in_subprocess (no_op, NULL);
+ TEST_COMPARE (shared->allocation_count, 0);
+
+ support_isolate_in_subprocess (initialize, regexp);
+
+ /* The number of allocations in the successful case, plus some
+ slack. Once the number of expected allocations is exceeded,
+ injecting further failures does not make a difference. */
+ unsigned int maximum_allocation_count = shared->allocation_count;
+ printf ("info: successful call performs %u allocations\n",
+ maximum_allocation_count);
+ maximum_allocation_count += 10;
+
+ for (unsigned int inject_at = 0; inject_at <= maximum_allocation_count;
+ ++inject_at)
+ {
+ shared->allocation_count = 0;
+ shared->failing_allocation = inject_at;
+ support_isolate_in_subprocess (test_in_subprocess, regexp);
+ }
+
+ support_shared_free (shared);
+
+ return 0;
+}
+
+#include <support/test-driver.c>

View File

@ -0,0 +1,234 @@
commit 7ea06e994093fa0bcca0d0ee2c1db271d8d7885d
Author: Florian Weimer <fweimer@redhat.com>
Date: Mon Jul 21 21:43:49 2025 +0200
posix: Fix double-free after allocation failure in regcomp (bug 33185)
If a memory allocation failure occurs during bracket expression
parsing in regcomp, a double-free error may result.
Reported-by: Anastasia Belova <abelova@astralinux.ru>
Co-authored-by: Paul Eggert <eggert@cs.ucla.edu>
Reviewed-by: Andreas K. Huettel <dilfridge@gentoo.org>
Conflicts:
posix/Makefile (New test added)
diff --git a/posix/Makefile b/posix/Makefile
index 7b70b4a736bc1215..562e8cb85fdb6f43 100644
--- a/posix/Makefile
+++ b/posix/Makefile
@@ -112,7 +112,8 @@ tests := test-errno tstgetopt testfnm runtests runptests \
tst-sched_getaffinity \
tst-cpuset-dynamic \
tst-cpuset-static \
- tst-spawn6
+ tst-spawn6 \
+ tst-regcomp-bracket-free
# Test for the glob symbol version that was replaced in glibc 2.27.
ifeq ($(have-GLIBC_2.26)$(build-shared),yesyes)
diff --git a/posix/regcomp.c b/posix/regcomp.c
index 887e5b50684e22f5..005e6459bbe8bd55 100644
--- a/posix/regcomp.c
+++ b/posix/regcomp.c
@@ -3365,6 +3365,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
{
#ifdef RE_ENABLE_I18N
free_charset (mbcset);
+ mbcset = NULL;
#endif
/* Build a tree for simple bracket. */
br_token.type = SIMPLE_BRACKET;
@@ -3380,7 +3381,8 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
parse_bracket_exp_free_return:
re_free (sbcset);
#ifdef RE_ENABLE_I18N
- free_charset (mbcset);
+ if (__glibc_likely (mbcset != NULL))
+ free_charset (mbcset);
#endif /* RE_ENABLE_I18N */
return NULL;
}
diff --git a/posix/tst-regcomp-bracket-free.c b/posix/tst-regcomp-bracket-free.c
new file mode 100644
index 0000000000000000..3c091d8c44ebe56f
--- /dev/null
+++ b/posix/tst-regcomp-bracket-free.c
@@ -0,0 +1,176 @@
+/* Test regcomp bracket parsing with injected allocation failures (bug 33185).
+ Copyright (C) 2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+/* This test invokes regcomp multiple times, failing one memory
+ allocation in each call. The function call should fail with
+ REG_ESPACE (or succeed if it can recover from the allocation
+ failure). Previously, there was double-free bug. */
+
+#include <errno.h>
+#include <regex.h>
+#include <stdio.h>
+#include <string.h>
+#include <support/check.h>
+#include <support/namespace.h>
+#include <support/support.h>
+
+/* Data structure allocated via MAP_SHARED, so that writes from the
+ subprocess are visible. */
+struct shared_data
+{
+ /* Number of tracked allocations performed so far. */
+ volatile unsigned int allocation_count;
+
+ /* If this number is reached, one allocation fails. */
+ volatile unsigned int failing_allocation;
+
+ /* The subprocess stores the expected name here. */
+ char name[100];
+};
+
+/* Allocation count in shared mapping. */
+static struct shared_data *shared;
+
+/* Returns true if a failure should be injected for this allocation. */
+static bool
+fail_this_allocation (void)
+{
+ if (shared != NULL)
+ {
+ unsigned int count = shared->allocation_count;
+ shared->allocation_count = count + 1;
+ return count == shared->failing_allocation;
+ }
+ else
+ return false;
+}
+
+/* Failure-injecting wrappers for allocation functions used by glibc. */
+
+void *
+malloc (size_t size)
+{
+ if (fail_this_allocation ())
+ {
+ errno = ENOMEM;
+ return NULL;
+ }
+ extern __typeof (malloc) __libc_malloc;
+ return __libc_malloc (size);
+}
+
+void *
+calloc (size_t a, size_t b)
+{
+ if (fail_this_allocation ())
+ {
+ errno = ENOMEM;
+ return NULL;
+ }
+ extern __typeof (calloc) __libc_calloc;
+ return __libc_calloc (a, b);
+}
+
+void *
+realloc (void *ptr, size_t size)
+{
+ if (fail_this_allocation ())
+ {
+ errno = ENOMEM;
+ return NULL;
+ }
+ extern __typeof (realloc) __libc_realloc;
+ return __libc_realloc (ptr, size);
+}
+
+/* No-op subprocess to verify that support_isolate_in_subprocess does
+ not perform any heap allocations. */
+static void
+no_op (void *ignored)
+{
+}
+
+/* Perform a regcomp call in a subprocess. Used to count its
+ allocations. */
+static void
+initialize (void *regexp1)
+{
+ const char *regexp = regexp1;
+
+ shared->allocation_count = 0;
+
+ regex_t reg;
+ TEST_COMPARE (regcomp (&reg, regexp, 0), 0);
+}
+
+/* Perform regcomp in a subprocess with fault injection. */
+static void
+test_in_subprocess (void *regexp1)
+{
+ const char *regexp = regexp1;
+ unsigned int inject_at = shared->failing_allocation;
+
+ regex_t reg;
+ int ret = regcomp (&reg, regexp, 0);
+
+ if (ret != 0)
+ {
+ TEST_COMPARE (ret, REG_ESPACE);
+ printf ("info: allocation %u failure results in return value %d,"
+ " error %s (%d)\n",
+ inject_at, ret, strerrorname_np (errno), errno);
+ }
+}
+
+static int
+do_test (void)
+{
+ char regexp[] = "[:alpha:]";
+
+ shared = support_shared_allocate (sizeof (*shared));
+
+ /* Disable fault injection. */
+ shared->failing_allocation = ~0U;
+
+ support_isolate_in_subprocess (no_op, NULL);
+ TEST_COMPARE (shared->allocation_count, 0);
+
+ support_isolate_in_subprocess (initialize, regexp);
+
+ /* The number of allocations in the successful case, plus some
+ slack. Once the number of expected allocations is exceeded,
+ injecting further failures does not make a difference. */
+ unsigned int maximum_allocation_count = shared->allocation_count;
+ printf ("info: successful call performs %u allocations\n",
+ maximum_allocation_count);
+ maximum_allocation_count += 10;
+
+ for (unsigned int inject_at = 0; inject_at <= maximum_allocation_count;
+ ++inject_at)
+ {
+ shared->allocation_count = 0;
+ shared->failing_allocation = inject_at;
+ support_isolate_in_subprocess (test_in_subprocess, regexp);
+ }
+
+ support_shared_free (shared);
+
+ return 0;
+}
+
+#include <support/test-driver.c>

View File

@ -0,0 +1,33 @@
Downstream patch only.
Revert changes made to elf/dl-readonly-area.c compared to
ed6a68bac7cd056abda9008019c71b167f0362dc since `_dl_find_object` has
been backported.
Note: `_dl_find_object` isn't directly made available internally
downstream, we can use `__dl_find_object_internal` instead.
diff --git a/elf/dl-readonly-area.c b/elf/dl-readonly-area.c
index 570b99b11527db13..3b39eed06a379ce3 100644
--- a/elf/dl-readonly-area.c
+++ b/elf/dl-readonly-area.c
@@ -40,16 +40,11 @@ check_relro (const struct link_map *l, uintptr_t start, uintptr_t end)
enum dl_readonly_area_error_type
_dl_readonly_area (const void *ptr, size_t size)
{
- /* Protect against concurrent loads and unloads. */
- __rtld_lock_lock_recursive (GL(dl_load_lock));
-
- const struct link_map *l = _dl_find_dso_for_object ((ElfW(Addr)) ptr);
-
- __rtld_lock_unlock_recursive (GL(dl_load_lock));
-
- if (l == NULL)
+ struct dl_find_object dlfo;
+ if (__dl_find_object_internal ((void *)ptr, &dlfo) != 0)
return dl_readonly_area_not_found;
+ const struct link_map *l = dlfo.dlfo_link_map;
uintptr_t ptr_start = (uintptr_t) ptr;
uintptr_t ptr_end = ptr_start + size;

View File

@ -0,0 +1,80 @@
commit 620f0730f311635cd0e175a3ae4d0fc700c76366
Author: Florian Weimer <fweimer@redhat.com>
Date: Mon Jul 28 14:16:52 2025 +0200
elf: Compile _dl_debug_state separately (bug 33224)
This ensures that the compiler will not inline it, so that
debuggers which do not use the Systemtap probes can reliably
set a breakpoint on it.
Reviewed-by: Andreas K. Huettel <dilfridge@gentoo.org>
Tested-by: Andreas K. Huettel <dilfridge@gentoo.org>
diff --git a/elf/Makefile b/elf/Makefile
index b181150b36773d24..b37636c0f865f4e6 100644
--- a/elf/Makefile
+++ b/elf/Makefile
@@ -56,6 +56,7 @@ dl-routines = \
dl-close \
dl-debug \
dl-debug-symbols \
+ dl-debug_state \
dl-deps \
dl-exception \
dl-execstack \
diff --git a/elf/dl-debug.c b/elf/dl-debug.c
index 4388a04cdf828898..8c1472a84ebfefe7 100644
--- a/elf/dl-debug.c
+++ b/elf/dl-debug.c
@@ -167,14 +167,3 @@ _dl_debug_initialize (ElfW(Addr) ldbase, Lmid_t ns)
return &r->base;
}
-
-
-/* This function exists solely to have a breakpoint set on it by the
- debugger. The debugger is supposed to find this function's address by
- examining the r_brk member of struct r_debug, but GDB 4.15 in fact looks
- for this particular symbol name in the PT_INTERP file. */
-void
-_dl_debug_state (void)
-{
-}
-rtld_hidden_def (_dl_debug_state)
diff --git a/elf/dl-debug_state.c b/elf/dl-debug_state.c
new file mode 100644
index 0000000000000000..40c134a49e2455f3
--- /dev/null
+++ b/elf/dl-debug_state.c
@@ -0,0 +1,30 @@
+/* Debugger hook called after dynamic linker updates.
+ Copyright (C) 1996-2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <ldsodefs.h>
+
+/* This function exists solely to have a breakpoint set on it by the
+ debugger. The debugger is supposed to find this function's address by
+ examining the r_brk member of struct r_debug, but GDB 4.15 in fact looks
+ for this particular symbol name in the PT_INTERP file. Therefore,
+ this function must not be inlined. */
+void
+_dl_debug_state (void)
+{
+}
+rtld_hidden_def (_dl_debug_state)

View File

@ -0,0 +1,34 @@
commit 87afbd7a1ad9c1dd116921817fa97198171045db
Author: Sam James <sam@gentoo.org>
Date: Mon Jul 28 21:55:30 2025 +0100
inet-fortified: fix namespace violation (bug 33227)
We need to use __sz, not sz, as we do elsewhere.
Reviewed-by: Florian Weimer <fweimer@redhat.com>
diff --git a/inet/bits/inet-fortified.h b/inet/bits/inet-fortified.h
index 8420a4b7fb41086f..5d16b1f871c49e6f 100644
--- a/inet/bits/inet-fortified.h
+++ b/inet/bits/inet-fortified.h
@@ -38,15 +38,15 @@ __fortify_function int
__NTH (inet_pton (int __af, const char *__restrict __src,
void * __restrict __dst))
{
- size_t sz = 0;
+ size_t __sz = 0;
if (__af == AF_INET)
- sz = sizeof (struct in_addr);
+ __sz = sizeof (struct in_addr);
else if (__af == AF_INET6)
- sz = sizeof (struct in6_addr);
+ __sz = sizeof (struct in6_addr);
else
return __inet_pton_alias (__af, __src, __dst);
- return __glibc_fortify (inet_pton, sz, sizeof (char),
+ return __glibc_fortify (inet_pton, __sz, sizeof (char),
__glibc_objsize (__dst),
__af, __src, __dst);
};

View File

@ -0,0 +1,154 @@
commit dc1e5eeb25c4bcb1cc0c883a2d67cf93eb252478
Author: Andreas Schwab <schwab@suse.de>
Date: Tue May 31 13:09:38 2022 +0200
x86_64: Optimize sincos where sin/cos is optimized (bug 29193)
The compiler may substitute calls to sin or cos with calls to sincos, thus
we should have the same optimized implementations for sincos. The
optimized implementations may produce results that differ, that also makes
sure that the sincos call aggrees with the sin and cos calls.
diff --git a/sysdeps/ieee754/dbl-64/s_sincos.c b/sysdeps/ieee754/dbl-64/s_sincos.c
index a4a521d74227c06d..afd4179b9c04b932 100644
--- a/sysdeps/ieee754/dbl-64/s_sincos.c
+++ b/sysdeps/ieee754/dbl-64/s_sincos.c
@@ -25,10 +25,15 @@
#include <math-underflow.h>
#include <libm-alias-double.h>
+#ifndef SECTION
+# define SECTION
+#endif
+
#define IN_SINCOS
#include "s_sin.c"
void
+SECTION
__sincos (double x, double *sinx, double *cosx)
{
mynumber u;
@@ -101,4 +106,6 @@ __sincos (double x, double *sinx, double *cosx)
*sinx = *cosx = x / x;
}
+#ifndef __sincos
libm_alias_double (__sincos, sincos)
+#endif
diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
index d425ffd6d32e13ed..4d00a27988073c40 100644
--- a/sysdeps/x86_64/fpu/multiarch/Makefile
+++ b/sysdeps/x86_64/fpu/multiarch/Makefile
@@ -10,7 +10,8 @@ libm-sysdep_routines += s_ceil-sse4_1 s_ceilf-sse4_1 s_floor-sse4_1 \
s_trunc-sse4_1 s_truncf-sse4_1
libm-sysdep_routines += e_exp-fma e_log-fma e_pow-fma s_atan-fma \
- e_asin-fma e_atan2-fma s_sin-fma s_tan-fma
+ e_asin-fma e_atan2-fma s_sin-fma s_tan-fma \
+ s_sincos-fma
CFLAGS-e_asin-fma.c = -mfma -mavx2
CFLAGS-e_atan2-fma.c = -mfma -mavx2
@@ -20,6 +21,7 @@ CFLAGS-e_pow-fma.c = -mfma -mavx2
CFLAGS-s_atan-fma.c = -mfma -mavx2
CFLAGS-s_sin-fma.c = -mfma -mavx2
CFLAGS-s_tan-fma.c = -mfma -mavx2
+CFLAGS-s_sincos-fma.c = -mfma -mavx2
libm-sysdep_routines += s_sinf-sse2 s_cosf-sse2 s_sincosf-sse2
@@ -36,7 +38,8 @@ CFLAGS-s_cosf-fma.c = -mfma -mavx2
CFLAGS-s_sincosf-fma.c = -mfma -mavx2
libm-sysdep_routines += e_exp-fma4 e_log-fma4 e_pow-fma4 s_atan-fma4 \
- e_asin-fma4 e_atan2-fma4 s_sin-fma4 s_tan-fma4
+ e_asin-fma4 e_atan2-fma4 s_sin-fma4 s_tan-fma4 \
+ s_sincos-fma4
CFLAGS-e_asin-fma4.c = -mfma4
CFLAGS-e_atan2-fma4.c = -mfma4
@@ -46,9 +49,11 @@ CFLAGS-e_pow-fma4.c = -mfma4
CFLAGS-s_atan-fma4.c = -mfma4
CFLAGS-s_sin-fma4.c = -mfma4
CFLAGS-s_tan-fma4.c = -mfma4
+CFLAGS-s_sincos-fma4.c = -mfma4
libm-sysdep_routines += e_exp-avx e_log-avx s_atan-avx \
- e_atan2-avx s_sin-avx s_tan-avx
+ e_atan2-avx s_sin-avx s_tan-avx \
+ s_sincos-avx
CFLAGS-e_atan2-avx.c = -msse2avx -DSSE2AVX
CFLAGS-e_exp-avx.c = -msse2avx -DSSE2AVX
@@ -56,6 +61,7 @@ CFLAGS-e_log-avx.c = -msse2avx -DSSE2AVX
CFLAGS-s_atan-avx.c = -msse2avx -DSSE2AVX
CFLAGS-s_sin-avx.c = -msse2avx -DSSE2AVX
CFLAGS-s_tan-avx.c = -msse2avx -DSSE2AVX
+CFLAGS-s_sincos-avx.c = -msse2avx -DSSE2AVX
endif
ifeq ($(subdir),mathvec)
diff --git a/sysdeps/x86_64/fpu/multiarch/s_sincos-avx.c b/sysdeps/x86_64/fpu/multiarch/s_sincos-avx.c
new file mode 100644
index 0000000000000000..debea0f619ab1062
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/s_sincos-avx.c
@@ -0,0 +1,3 @@
+#define __sincos __sincos_avx
+#define SECTION __attribute__ ((section (".text.avx")))
+#include <sysdeps/ieee754/dbl-64/s_sincos.c>
diff --git a/sysdeps/x86_64/fpu/multiarch/s_sincos-fma.c b/sysdeps/x86_64/fpu/multiarch/s_sincos-fma.c
new file mode 100644
index 0000000000000000..31610b335613500e
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/s_sincos-fma.c
@@ -0,0 +1,3 @@
+#define __sincos __sincos_fma
+#define SECTION __attribute__ ((section (".text.fma")))
+#include <sysdeps/ieee754/dbl-64/s_sincos.c>
diff --git a/sysdeps/x86_64/fpu/multiarch/s_sincos-fma4.c b/sysdeps/x86_64/fpu/multiarch/s_sincos-fma4.c
new file mode 100644
index 0000000000000000..7e8b8e64846a9b96
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/s_sincos-fma4.c
@@ -0,0 +1,3 @@
+#define __sincos __sincos_fma4
+#define SECTION __attribute__ ((section (".text.fma4")))
+#include <sysdeps/ieee754/dbl-64/s_sincos.c>
diff --git a/sysdeps/x86_64/fpu/multiarch/s_sincos.c b/sysdeps/x86_64/fpu/multiarch/s_sincos.c
new file mode 100644
index 0000000000000000..67c2817d688379ab
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/s_sincos.c
@@ -0,0 +1,30 @@
+/* Multiple versions of sincos.
+ Copyright (C) 2017-2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <libm-alias-double.h>
+
+extern void __redirect_sincos (double, double *, double *);
+
+#define SYMBOL_NAME sincos
+#include "ifunc-fma4.h"
+
+libc_ifunc_redirected (__redirect_sincos, __sincos, IFUNC_SELECTOR ());
+libm_alias_double (__sincos, sincos)
+
+#define __sincos __sincos_sse2
+#include <sysdeps/ieee754/dbl-64/s_sincos.c>

View File

@ -0,0 +1,123 @@
commit 1b214630ce6f7e0099b8b6f87246246739b079cf
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Fri Aug 11 08:04:08 2023 -0700
x86_64: Add expm1 with FMA
On Skylake, it improves expm1 bench performance by:
Before After Improvement
max 70.204 68.054 3%
min 20.709 16.2 22%
mean 22.1221 16.7367 24%
NB: Add
extern long double __expm1l (long double);
extern long double __expm1f128 (long double);
for __typeof (__expm1l) and __typeof (__expm1f128) when __expm1 is
defined since __expm1 may be expanded in their declarations which
causes the build failure.
diff --git a/sysdeps/ieee754/dbl-64/s_expm1.c b/sysdeps/ieee754/dbl-64/s_expm1.c
index 8f1c95bd04bdb0bc..1cafeca9c02381a1 100644
--- a/sysdeps/ieee754/dbl-64/s_expm1.c
+++ b/sysdeps/ieee754/dbl-64/s_expm1.c
@@ -130,6 +130,11 @@ static const double
4.00821782732936239552e-06, /* 3ED0CFCA 86E65239 */
-2.01099218183624371326e-07 }; /* BE8AFDB7 6E09C32D */
+#ifndef SECTION
+# define SECTION
+#endif
+
+SECTION
double
__expm1 (double x)
{
@@ -258,4 +263,6 @@ __expm1 (double x)
}
return y;
}
+#ifndef __expm1
libm_alias_double (__expm1, expm1)
+#endif
diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
index 56ef0482a2a8f498..069ea227dbd1965e 100644
--- a/sysdeps/x86_64/fpu/multiarch/Makefile
+++ b/sysdeps/x86_64/fpu/multiarch/Makefile
@@ -37,6 +37,7 @@ libm-sysdep_routines += \
e_log2-fma \
e_pow-fma \
s_atan-fma \
+ s_expm1-fma \
s_sin-fma \
s_sincos-fma \
s_tan-fma \
@@ -49,6 +50,7 @@ CFLAGS-e_log-fma.c = -mfma -mavx2
CFLAGS-e_log2-fma.c = -mfma -mavx2
CFLAGS-e_pow-fma.c = -mfma -mavx2
CFLAGS-s_atan-fma.c = -mfma -mavx2
+CFLAGS-s_expm1-fma.c = -mfma -mavx2
CFLAGS-s_sin-fma.c = -mfma -mavx2
CFLAGS-s_tan-fma.c = -mfma -mavx2
CFLAGS-s_sincos-fma.c = -mfma -mavx2
diff --git a/sysdeps/x86_64/fpu/multiarch/s_expm1-fma.c b/sysdeps/x86_64/fpu/multiarch/s_expm1-fma.c
new file mode 100644
index 0000000000000000..3ee2bd804ebdea4d
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/s_expm1-fma.c
@@ -0,0 +1,10 @@
+#define __expm1 __expm1_fma
+
+/* NB: __expm1 may be expanded to __expm1_fma in the following
+ prototypes. */
+extern long double __expm1l (long double);
+extern long double __expm1f128 (long double);
+
+#define SECTION __attribute__ ((section (".text.fma")))
+
+#include <sysdeps/ieee754/dbl-64/s_expm1.c>
diff --git a/sysdeps/x86_64/fpu/multiarch/s_expm1.c b/sysdeps/x86_64/fpu/multiarch/s_expm1.c
new file mode 100644
index 0000000000000000..2cae83fb7f21bb99
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/s_expm1.c
@@ -0,0 +1,36 @@
+/* Multiple versions of expm1.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <libm-alias-double.h>
+
+extern double __redirect_expm1 (double);
+
+#define SYMBOL_NAME expm1
+#include "ifunc-fma.h"
+
+libc_ifunc_redirected (__redirect_expm1, __expm1, IFUNC_SELECTOR ());
+libm_alias_double (__expm1, expm1)
+
+#define __expm1 __expm1_sse2
+
+/* NB: __expm1 may be expanded to __expm1_sse2 in the following
+ prototypes. */
+extern long double __expm1l (long double);
+extern long double __expm1f128 (long double);
+
+#include <sysdeps/ieee754/dbl-64/s_expm1.c>

View File

@ -0,0 +1,128 @@
commit a8ecb126d4c26c52f4ad828c566afe4043a28155
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Thu Aug 17 09:42:29 2023 -0700
x86_64: Add log1p with FMA
On Skylake, it changes log1p bench performance by:
Before After Improvement
max 63.349 58.347 8%
min 4.448 5.651 -30%
mean 12.0674 10.336 14%
The minimum code path is
if (hx < 0x3FDA827A) /* x < 0.41422 */
{
if (__glibc_unlikely (ax >= 0x3ff00000)) /* x <= -1.0 */
{
...
}
if (__glibc_unlikely (ax < 0x3e200000)) /* |x| < 2**-29 */
{
math_force_eval (two54 + x); /* raise inexact */
if (ax < 0x3c900000) /* |x| < 2**-54 */
{
...
}
else
return x - x * x * 0.5;
FMA and non-FMA code sequences look similar. Non-FMA version is slightly
faster. Since log1p is called by asinh and atanh, it improves asinh
performance by:
Before After Improvement
max 75.645 63.135 16%
min 10.074 10.071 0%
mean 15.9483 14.9089 6%
and improves atanh performance by:
Before After Improvement
max 91.768 75.081 18%
min 15.548 13.883 10%
mean 18.3713 16.8011 8%
diff --git a/sysdeps/ieee754/dbl-64/s_log1p.c b/sysdeps/ieee754/dbl-64/s_log1p.c
index e6476a826039ea8c..eeb0af859ffce7da 100644
--- a/sysdeps/ieee754/dbl-64/s_log1p.c
+++ b/sysdeps/ieee754/dbl-64/s_log1p.c
@@ -99,6 +99,11 @@ static const double
static const double zero = 0.0;
+#ifndef SECTION
+# define SECTION
+#endif
+
+SECTION
double
__log1p (double x)
{
diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
index 069ea227dbd1965e..29ab16d2f3d1e38f 100644
--- a/sysdeps/x86_64/fpu/multiarch/Makefile
+++ b/sysdeps/x86_64/fpu/multiarch/Makefile
@@ -38,6 +38,7 @@ libm-sysdep_routines += \
e_pow-fma \
s_atan-fma \
s_expm1-fma \
+ s_log1p-fma \
s_sin-fma \
s_sincos-fma \
s_tan-fma \
@@ -51,6 +52,7 @@ CFLAGS-e_log2-fma.c = -mfma -mavx2
CFLAGS-e_pow-fma.c = -mfma -mavx2
CFLAGS-s_atan-fma.c = -mfma -mavx2
CFLAGS-s_expm1-fma.c = -mfma -mavx2
+CFLAGS-s_log1p-fma.c = -mfma -mavx2
CFLAGS-s_sin-fma.c = -mfma -mavx2
CFLAGS-s_tan-fma.c = -mfma -mavx2
CFLAGS-s_sincos-fma.c = -mfma -mavx2
diff --git a/sysdeps/x86_64/fpu/multiarch/s_log1p-fma.c b/sysdeps/x86_64/fpu/multiarch/s_log1p-fma.c
new file mode 100644
index 0000000000000000..8952df8f9e08d49a
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/s_log1p-fma.c
@@ -0,0 +1,4 @@
+#define __log1p __log1p_fma
+#define SECTION __attribute__ ((section (".text.fma")))
+
+#include <sysdeps/ieee754/dbl-64/s_log1p.c>
diff --git a/sysdeps/x86_64/fpu/multiarch/s_log1p.c b/sysdeps/x86_64/fpu/multiarch/s_log1p.c
new file mode 100644
index 0000000000000000..6ce5198d6d844c1c
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/s_log1p.c
@@ -0,0 +1,29 @@
+/* Multiple versions of log1p.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <libm-alias-double.h>
+
+extern double __redirect_log1p (double);
+
+#define SYMBOL_NAME log1p
+#include "ifunc-fma.h"
+
+libc_ifunc_redirected (__redirect_log1p, __log1p, IFUNC_SELECTOR ());
+
+#define __log1p __log1p_sse2
+#include <sysdeps/ieee754/dbl-64/s_log1p.c>

View File

@ -0,0 +1,177 @@
commit 7e03e0de7e7c2de975b5c5e18f5a4b0c75816674
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Thu Dec 7 09:00:11 2023 -0800
sysdeps/x86/Makefile: Split and sort tests
Put each test on a separate line and sort tests.
Conflicts:
sysdeps/x86/Makefile
(tunables are still optional downstream)
diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
index cd94e683afd5b4a4..60feef5d87fef1fd 100644
--- a/sysdeps/x86/Makefile
+++ b/sysdeps/x86/Makefile
@@ -10,34 +10,49 @@ sysdep_headers += sys/platform/x86.h bits/platform/x86.h
CFLAGS-dl-get-cpu-features.os += $(rtld-early-cflags)
CFLAGS-get-cpuid-feature-leaf.o += $(no-stack-protector)
-tests += tst-get-cpu-features tst-get-cpu-features-static \
- tst-cpu-features-cpuinfo tst-cpu-features-cpuinfo-static \
- tst-cpu-features-supports tst-cpu-features-supports-static
-tests-static += tst-get-cpu-features-static \
- tst-cpu-features-cpuinfo-static \
- tst-cpu-features-supports-static
+tests += \
+ tst-get-cpu-features \
+ tst-get-cpu-features-static \
+ tst-cpu-features-cpuinfo \
+ tst-cpu-features-cpuinfo-static \
+ tst-cpu-features-supports \
+ tst-cpu-features-supports-static \
+# tests
+tests-static += \
+ tst-get-cpu-features-static \
+ tst-cpu-features-cpuinfo-static \
+ tst-cpu-features-supports-static \
+# tests-static
ifeq (yes,$(have-ifunc))
ifeq (yes,$(have-gcc-ifunc))
tests += \
tst-ifunc-isa-1 \
- tst-ifunc-isa-1-static
-tests-static += \
- tst-ifunc-isa-1-static
+ tst-ifunc-isa-1-static \
+# tests
ifneq ($(have-tunables),no)
+tests-static += \
+ tst-ifunc-isa-1-static \
+# tests-static
tests += \
tst-ifunc-isa-2 \
- tst-ifunc-isa-2-static
+ tst-ifunc-isa-2-static \
+# tests
tests-static += \
- tst-ifunc-isa-2-static
+ tst-ifunc-isa-2-static \
+# tests-static
endif
endif
endif
ifeq (yes,$(enable-x86-isa-level))
-tests += tst-isa-level-1
-modules-names += tst-isa-level-mod-1-baseline \
- tst-isa-level-mod-1-v2 \
- tst-isa-level-mod-1-v3 \
- tst-isa-level-mod-1-v4 \
+tests += \
+ tst-isa-level-1 \
+# tests
+modules-names += \
+ tst-isa-level-mod-1-baseline \
+ tst-isa-level-mod-1-v2 \
+ tst-isa-level-mod-1-v3 \
+ tst-isa-level-mod-1-v4 \
+# modules-names
# X86 ISA level baseline
CFLAGS-tst-isa-level-mod-1-baseline.c += -DINCLUDE_X86_ISA_LEVEL \
@@ -68,7 +83,9 @@ endif
endif
ifeq ($(subdir),math)
-tests += tst-ldbl-nonnormal-printf
+tests += \
+ tst-ldbl-nonnormal-printf \
+# tests
endif # $(subdir) == math
ifeq ($(subdir),setjmp)
@@ -76,7 +93,9 @@ gen-as-const-headers += jmp_buf-ssp.sym
sysdep_routines += __longjmp_cancel
ifneq ($(enable-cet),no)
ifneq ($(have-tunables),no)
-tests += tst-setjmp-cet
+tests += \
+ tst-setjmp-cet \
+# tests
tst-setjmp-cet-ENV = GLIBC_TUNABLES=glibc.cpu.x86_ibt=on:glibc.cpu.x86_shstk=on
endif
endif
@@ -116,22 +135,47 @@ ifneq ($(enable-cet),no)
ifeq ($(subdir),elf)
sysdep-dl-routines += dl-cet
-tests += tst-cet-legacy-1 tst-cet-legacy-1a tst-cet-legacy-2 \
- tst-cet-legacy-2a tst-cet-legacy-3 tst-cet-legacy-4 \
- tst-cet-legacy-5a tst-cet-legacy-6a tst-cet-legacy-7 \
- tst-cet-legacy-8 tst-cet-legacy-9 tst-cet-legacy-9-static \
- tst-cet-legacy-10 tst-cet-legacy-10-static
-tests-static += tst-cet-legacy-9-static tst-cet-legacy-10-static
+tests += \
+ tst-cet-legacy-1 \
+ tst-cet-legacy-1a \
+ tst-cet-legacy-2 \
+ tst-cet-legacy-2a \
+ tst-cet-legacy-3 \
+ tst-cet-legacy-4 \
+ tst-cet-legacy-5a \
+ tst-cet-legacy-6a \
+ tst-cet-legacy-7 \
+ tst-cet-legacy-8 \
+ tst-cet-legacy-9 \
+ tst-cet-legacy-9-static \
+ tst-cet-legacy-10 \
+ tst-cet-legacy-10-static \
+# tests
+tests-static += \
+ tst-cet-legacy-9-static \
+ tst-cet-legacy-10-static \
+# tests-static
tst-cet-legacy-1a-ARGS = -- $(host-test-program-cmd)
ifneq (no,$(have-tunables))
-tests += tst-cet-legacy-4a tst-cet-legacy-4b tst-cet-legacy-4c \
- tst-cet-legacy-5b tst-cet-legacy-6b
+tests += \
+ tst-cet-legacy-4a \
+ tst-cet-legacy-4b \
+ tst-cet-legacy-4c \
+ tst-cet-legacy-5b \
+ tst-cet-legacy-6b \
+# tests
endif
-modules-names += tst-cet-legacy-mod-1 tst-cet-legacy-mod-2 \
- tst-cet-legacy-mod-4 tst-cet-legacy-mod-5a \
- tst-cet-legacy-mod-5b tst-cet-legacy-mod-5c \
- tst-cet-legacy-mod-6a tst-cet-legacy-mod-6b \
- tst-cet-legacy-mod-6c
+modules-names += \
+ tst-cet-legacy-mod-1 \
+ tst-cet-legacy-mod-2 \
+ tst-cet-legacy-mod-4 \
+ tst-cet-legacy-mod-5a \
+ tst-cet-legacy-mod-5b \
+ tst-cet-legacy-mod-5c \
+ tst-cet-legacy-mod-6a \
+ tst-cet-legacy-mod-6b \
+ tst-cet-legacy-mod-6c \
+# modules-names
CFLAGS-tst-cet-legacy-2.c += -fcf-protection=branch
CFLAGS-tst-cet-legacy-2a.c += -fcf-protection
@@ -241,7 +285,9 @@ endif
ifeq ($(subdir),posix)
tests += \
tst-sysconf-cache-linesize \
- tst-sysconf-cache-linesize-static
+ tst-sysconf-cache-linesize-static \
+# tests
tests-static += \
- tst-sysconf-cache-linesize-static
+ tst-sysconf-cache-linesize-static \
+# tests-static
endif

View File

@ -0,0 +1,100 @@
commit ef7f4b1fef67430a8f3cfc77fa6aada2add851d7
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Thu Feb 15 11:19:56 2024 -0800
Apply the Makefile sorting fix
Apply the Makefile sorting fix generated by sort-makefile-lines.py.
Conflicts:
sysdeps/loongarch/lp64/multiarch/Makefile
(loongarch does not exist downstream)
sysdeps/x86/Makefile
(missing commit 2a969b53c0b02fed7e43473a92f219d737fd217a)
sysdeps/x86_64/Makefile
(CET support is not specific to x86-64 downstream)
sysdeps/x86_64/multiarch/Makefile
(memchr-evex512, rawmemchr-evex512 not present downstream)
diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
index 60feef5d87fef1fd..e938d2f821d7f839 100644
--- a/sysdeps/x86/Makefile
+++ b/sysdeps/x86/Makefile
@@ -11,17 +11,17 @@ CFLAGS-dl-get-cpu-features.os += $(rtld-early-cflags)
CFLAGS-get-cpuid-feature-leaf.o += $(no-stack-protector)
tests += \
- tst-get-cpu-features \
- tst-get-cpu-features-static \
tst-cpu-features-cpuinfo \
tst-cpu-features-cpuinfo-static \
tst-cpu-features-supports \
tst-cpu-features-supports-static \
+ tst-get-cpu-features \
+ tst-get-cpu-features-static \
# tests
tests-static += \
- tst-get-cpu-features-static \
tst-cpu-features-cpuinfo-static \
tst-cpu-features-supports-static \
+ tst-get-cpu-features-static \
# tests-static
ifeq (yes,$(have-ifunc))
ifeq (yes,$(have-gcc-ifunc))
diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
index 29ab16d2f3d1e38f..6843bb99ca99a07d 100644
--- a/sysdeps/x86_64/fpu/multiarch/Makefile
+++ b/sysdeps/x86_64/fpu/multiarch/Makefile
@@ -4,10 +4,10 @@ libm-sysdep_routines += \
s_ceilf-c \
s_floor-c \
s_floorf-c \
- s_rint-c \
- s_rintf-c \
s_nearbyint-c \
s_nearbyintf-c \
+ s_rint-c \
+ s_rintf-c \
s_roundeven-c \
s_roundevenf-c \
s_trunc-c \
@@ -21,10 +21,10 @@ libm-sysdep_routines += \
s_floorf-sse4_1 \
s_nearbyint-sse4_1 \
s_nearbyintf-sse4_1 \
- s_roundeven-sse4_1 \
- s_roundevenf-sse4_1 \
s_rint-sse4_1 \
s_rintf-sse4_1 \
+ s_roundeven-sse4_1 \
+ s_roundevenf-sse4_1 \
s_trunc-sse4_1 \
s_truncf-sse4_1 \
# libm-sysdep_routines
@@ -84,12 +84,12 @@ CFLAGS-s_cosf-fma.c = -mfma -mavx2
CFLAGS-s_sincosf-fma.c = -mfma -mavx2
libm-sysdep_routines += \
+ e_asin-fma4 \
+ e_atan2-fma4 \
e_exp-fma4 \
e_log-fma4 \
e_pow-fma4 \
- e_asin-fma4 \
s_atan-fma4 \
- e_atan2-fma4 \
s_sin-fma4 \
s_sincos-fma4 \
s_tan-fma4 \
@@ -106,10 +106,10 @@ CFLAGS-s_tan-fma4.c = -mfma4
CFLAGS-s_sincos-fma4.c = -mfma4
libm-sysdep_routines += \
+ e_atan2-avx \
e_exp-avx \
e_log-avx \
s_atan-avx \
- e_atan2-avx \
s_sin-avx \
s_sincos-avx \
s_tan-avx \

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,191 @@
commit 9e1f4aef865ddeffeb4b5f6578fefab606783120
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Thu Apr 4 15:43:50 2024 -0700
x86-64: Exclude FMA4 IFUNC functions for -mapxf
When -mapxf is used to build glibc, the resulting glibc will never run
on FMA4 machines. Exclude FMA4 IFUNC functions when -mapxf is used.
This requires GCC which defines __APX_F__ for -mapxf with commit:
1df56719bd8 x86: Define __APX_F__ for -mapxf
Reviewed-by: Sunil K Pandey <skpgkp2@gmail.com>
diff --git a/config.h.in b/config.h.in
index 790038fec60eb049..c95d510f078136b7 100644
--- a/config.h.in
+++ b/config.h.in
@@ -298,4 +298,7 @@
/* Define if -mmovbe is enabled by default on x86. */
#undef HAVE_X86_MOVBE
+/* Define if -mapxf is enabled by default on x86. */
+#undef HAVE_X86_APX
+
#endif
diff --git a/sysdeps/x86_64/configure b/sysdeps/x86_64/configure
index 75c96d60d4da5543..642386d8d185b37d 100755
--- a/sysdeps/x86_64/configure
+++ b/sysdeps/x86_64/configure
@@ -113,5 +113,37 @@ $as_echo "#define PI_STATIC_AND_HIDDEN 1" >>confdefs.h
$as_echo "#define SUPPORT_STATIC_PIE 1" >>confdefs.h
+# Check if -mapxf is enabled.
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether -mapxf is enabled" >&5
+printf %s "checking whether -mapxf is enabled... " >&6; }
+if test ${libc_cv_x86_have_apx+y}
+then :
+ printf %s "(cached) " >&6
+else $as_nop
+ cat > conftest.c <<EOF
+#ifndef __APX_F__
+# error APX isn't enabled
+#endif
+EOF
+ libc_cv_x86_have_apx=no
+ if { ac_try='${CC-cc} -c $CFLAGS conftest.c -o conftest.o 1>&5'
+ { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+ (eval $ac_try) 2>&5
+ ac_status=$?
+ printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+ test $ac_status = 0; }; }; then
+ libc_cv_x86_have_apx=yes
+ fi
+ rm -rf conftest*
+fi
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $libc_cv_x86_have_apx" >&5
+printf "%s\n" "$libc_cv_x86_have_apx" >&6; }
+if test $libc_cv_x86_have_apx = yes; then
+ printf "%s\n" "#define HAVE_X86_APX 1" >>confdefs.h
+
+fi
+config_vars="$config_vars
+have-x86-apx = $libc_cv_x86_have_apx"
+
test -n "$critic_missing" && as_fn_error $? "
*** $critic_missing" "$LINENO" 5
diff --git a/sysdeps/x86_64/configure.ac b/sysdeps/x86_64/configure.ac
index 66219e7ce5e190ae..53259c24a9f49822 100644
--- a/sysdeps/x86_64/configure.ac
+++ b/sysdeps/x86_64/configure.ac
@@ -60,5 +60,23 @@ AC_DEFINE(PI_STATIC_AND_HIDDEN)
dnl Static PIE is supported.
AC_DEFINE(SUPPORT_STATIC_PIE)
+# Check if -mapxf is enabled.
+AC_CACHE_CHECK(whether -mapxf is enabled,
+ libc_cv_x86_have_apx, [dnl
+cat > conftest.c <<EOF
+#ifndef __APX_F__
+# error APX isn't enabled
+#endif
+EOF
+ libc_cv_x86_have_apx=no
+ if AC_TRY_COMMAND(${CC-cc} -c $CFLAGS conftest.c -o conftest.o 1>&AS_MESSAGE_LOG_FD); then
+ libc_cv_x86_have_apx=yes
+ fi
+ rm -rf conftest*])
+if test $libc_cv_x86_have_apx = yes; then
+ AC_DEFINE(HAVE_X86_APX)
+fi
+LIBC_CONFIG_VAR([have-x86-apx], [$libc_cv_x86_have_apx])
+
test -n "$critic_missing" && AC_MSG_ERROR([
*** $critic_missing])
diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
index 73f9af871b9bad45..e94c2c3ed78ca462 100644
--- a/sysdeps/x86_64/fpu/multiarch/Makefile
+++ b/sysdeps/x86_64/fpu/multiarch/Makefile
@@ -38,29 +38,36 @@ libm-sysdep_routines += \
s_truncf-avx \
# libm-sysdep_routines
else
+ifeq (no,$(have-x86-apx))
libm-sysdep_routines += \
- e_asin-fma \
e_asin-fma4 \
+ e_atan2-fma4 \
+ e_exp-fma4 \
+ e_log-fma4 \
+ e_pow-fma4 \
+ s_atan-fma4 \
+ s_sin-fma4 \
+ s_sincos-fma4 \
+ s_tan-fma4 \
+# libm-sysdep_routines
+endif
+libm-sysdep_routines += \
+ e_asin-fma \
e_atan2-avx \
e_atan2-fma \
- e_atan2-fma4 \
e_exp-avx \
e_exp-fma \
- e_exp-fma4 \
e_exp2f-fma \
e_expf-fma \
e_log-avx \
e_log-fma \
- e_log-fma4 \
e_log2-fma \
e_log2f-fma \
e_logf-fma \
e_pow-fma \
- e_pow-fma4 \
e_powf-fma \
s_atan-avx \
s_atan-fma \
- s_atan-fma4 \
s_ceil-sse4_1 \
s_ceilf-sse4_1 \
s_cosf-fma \
@@ -77,17 +84,14 @@ libm-sysdep_routines += \
s_roundevenf-sse4_1 \
s_sin-avx \
s_sin-fma \
- s_sin-fma4 \
s_sincos-avx \
s_sincos-fma \
- s_sincos-fma4 \
s_sincosf-fma \
s_sincosf-sse2 \
s_sinf-fma \
s_sinf-sse2 \
s_tan-avx \
s_tan-fma \
- s_tan-fma4 \
s_trunc-sse4_1 \
s_truncf-sse4_1 \
# libm-sysdep_routines
diff --git a/sysdeps/x86_64/fpu/multiarch/ifunc-avx-fma4.h b/sysdeps/x86_64/fpu/multiarch/ifunc-avx-fma4.h
index 997602b3b5ad5383..e1c01e6e07ea4a27 100644
--- a/sysdeps/x86_64/fpu/multiarch/ifunc-avx-fma4.h
+++ b/sysdeps/x86_64/fpu/multiarch/ifunc-avx-fma4.h
@@ -33,8 +33,10 @@ IFUNC_SELECTOR (void)
&& CPU_FEATURE_USABLE_P (cpu_features, AVX2))
return OPTIMIZE (fma);
+#ifndef HAVE_X86_APX
if (CPU_FEATURE_USABLE_P (cpu_features, FMA4))
return OPTIMIZE (fma4);
+#endif
if (CPU_FEATURE_USABLE_P (cpu_features, AVX))
return OPTIMIZE (avx);
diff --git a/sysdeps/x86_64/fpu/multiarch/ifunc-fma4.h b/sysdeps/x86_64/fpu/multiarch/ifunc-fma4.h
index 20219ad2a38e6699..1299763fecea57ab 100644
--- a/sysdeps/x86_64/fpu/multiarch/ifunc-fma4.h
+++ b/sysdeps/x86_64/fpu/multiarch/ifunc-fma4.h
@@ -32,8 +32,10 @@ IFUNC_SELECTOR (void)
&& CPU_FEATURE_USABLE_P (cpu_features, AVX2))
return OPTIMIZE (fma);
+#ifndef HAVE_X86_APX
if (CPU_FEATURE_USABLE_P (cpu_features, FMA4))
return OPTIMIZE (fma4);
+#endif
return OPTIMIZE (sse2);
}

View File

@ -0,0 +1,105 @@
commit c6352111c72a20b3588ae304dd99b63e25dd6d85
Author: Sunil K Pandey <skpgkp2@gmail.com>
Date: Mon Mar 10 10:24:07 2025 -0700
x86_64: Add tanh with FMA
On Skylake, it improves tanh bench performance by:
Before After Improvement
max 110.89 95.826 14%
min 20.966 20.157 4%
mean 30.9601 29.8431 4%
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
diff --git a/sysdeps/ieee754/dbl-64/s_tanh.c b/sysdeps/ieee754/dbl-64/s_tanh.c
index 673a97102de292fd..13063db04ebb198c 100644
--- a/sysdeps/ieee754/dbl-64/s_tanh.c
+++ b/sysdeps/ieee754/dbl-64/s_tanh.c
@@ -46,6 +46,11 @@ static char rcsid[] = "$NetBSD: s_tanh.c,v 1.7 1995/05/10 20:48:22 jtc Exp $";
static const double one = 1.0, two = 2.0, tiny = 1.0e-300;
+#ifndef SECTION
+# define SECTION
+#endif
+
+SECTION
double
__tanh (double x)
{
diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
index e94c2c3ed78ca462..75842a7d0a845600 100644
--- a/sysdeps/x86_64/fpu/multiarch/Makefile
+++ b/sysdeps/x86_64/fpu/multiarch/Makefile
@@ -10,6 +10,7 @@ CFLAGS-s_expm1-fma.c = -mfma -mavx2
CFLAGS-s_log1p-fma.c = -mfma -mavx2
CFLAGS-s_sin-fma.c = -mfma -mavx2
CFLAGS-s_tan-fma.c = -mfma -mavx2
+CFLAGS-s_tanh-fma.c = -mfma -mavx2
CFLAGS-s_sincos-fma.c = -mfma -mavx2
CFLAGS-e_exp2f-fma.c = -mfma -mavx2
@@ -92,6 +93,7 @@ libm-sysdep_routines += \
s_sinf-sse2 \
s_tan-avx \
s_tan-fma \
+ s_tanh-fma \
s_trunc-sse4_1 \
s_truncf-sse4_1 \
# libm-sysdep_routines
diff --git a/sysdeps/x86_64/fpu/multiarch/s_tanh-fma.c b/sysdeps/x86_64/fpu/multiarch/s_tanh-fma.c
new file mode 100644
index 0000000000000000..1b808b1227f50cf5
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/s_tanh-fma.c
@@ -0,0 +1,11 @@
+#define __tanh __tanh_fma
+#define __expm1 __expm1_fma
+
+/* NB: __expm1 may be expanded to __expm1_fma in the following
+ prototypes. */
+extern long double __expm1l (long double);
+extern long double __expm1f128 (long double);
+
+#define SECTION __attribute__ ((section (".text.fma")))
+
+#include <sysdeps/ieee754/dbl-64/s_tanh.c>
diff --git a/sysdeps/x86_64/fpu/multiarch/s_tanh.c b/sysdeps/x86_64/fpu/multiarch/s_tanh.c
new file mode 100644
index 0000000000000000..5539b6c61c63548d
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/s_tanh.c
@@ -0,0 +1,31 @@
+/* Multiple versions of tanh.
+ Copyright (C) 2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdeps/x86/isa-level.h>
+#if MINIMUM_X86_ISA_LEVEL < AVX2_X86_ISA_LEVEL
+
+extern double __redirect_tanh (double);
+
+# define SYMBOL_NAME tanh
+# include "ifunc-fma.h"
+
+libc_ifunc_redirected (__redirect_tanh, __tanh, IFUNC_SELECTOR ());
+
+# define __tanh __tanh_sse2
+#endif
+#include <sysdeps/ieee754/dbl-64/s_tanh.c>

View File

@ -0,0 +1,129 @@
commit dded0d20f67ba1925ccbcb9cf28f0c75febe0dbe
Author: Sunil K Pandey <skpgkp2@gmail.com>
Date: Sat Mar 8 08:51:10 2025 -0800
x86_64: Add sinh with FMA
On SPR, it improves sinh bench performance by:
Before After Improvement
reciprocal-throughput 14.2017 11.815 17%
latency 36.4917 35.2114 4%
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
diff --git a/benchtests/sinh-inputs b/benchtests/sinh-inputs
index 7b1ac46a39c0a0b0..2fcb2fabf82ce778 100644
--- a/benchtests/sinh-inputs
+++ b/benchtests/sinh-inputs
@@ -1,6 +1,7 @@
## args: double
## ret: double
## includes: math.h
+## name: workload-random
0x1.bcb6129b5ff2bp8
-0x1.63057386325ebp9
0x1.62f1d7dc4e8bfp9
diff --git a/sysdeps/ieee754/dbl-64/e_sinh.c b/sysdeps/ieee754/dbl-64/e_sinh.c
index b4b5857dddf90f7a..3f787967f93d72f0 100644
--- a/sysdeps/ieee754/dbl-64/e_sinh.c
+++ b/sysdeps/ieee754/dbl-64/e_sinh.c
@@ -41,6 +41,11 @@ static char rcsid[] = "$NetBSD: e_sinh.c,v 1.7 1995/05/10 20:46:13 jtc Exp $";
static const double one = 1.0, shuge = 1.0e307;
+#ifndef SECTION
+# define SECTION
+#endif
+
+SECTION
double
__ieee754_sinh (double x)
{
@@ -90,4 +95,7 @@ __ieee754_sinh (double x)
/* |x| > overflowthresold, sinh(x) overflow */
return math_narrow_eval (x * shuge);
}
+
+#ifndef __ieee754_sinh
libm_alias_finite (__ieee754_sinh, __sinh)
+#endif
diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
index 75842a7d0a845600..40a9654ecadca713 100644
--- a/sysdeps/x86_64/fpu/multiarch/Makefile
+++ b/sysdeps/x86_64/fpu/multiarch/Makefile
@@ -5,6 +5,7 @@ CFLAGS-e_exp-fma.c = -mfma -mavx2
CFLAGS-e_log-fma.c = -mfma -mavx2
CFLAGS-e_log2-fma.c = -mfma -mavx2
CFLAGS-e_pow-fma.c = -mfma -mavx2
+CFLAGS-e_sinh-fma.c = -mfma -mavx2
CFLAGS-s_atan-fma.c = -mfma -mavx2
CFLAGS-s_expm1-fma.c = -mfma -mavx2
CFLAGS-s_log1p-fma.c = -mfma -mavx2
@@ -67,6 +68,7 @@ libm-sysdep_routines += \
e_logf-fma \
e_pow-fma \
e_powf-fma \
+ e_sinh-fma \
s_atan-avx \
s_atan-fma \
s_ceil-sse4_1 \
diff --git a/sysdeps/x86_64/fpu/multiarch/e_sinh-fma.c b/sysdeps/x86_64/fpu/multiarch/e_sinh-fma.c
new file mode 100644
index 0000000000000000..e0e1e39a7a606dc8
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/e_sinh-fma.c
@@ -0,0 +1,12 @@
+#define __ieee754_sinh __ieee754_sinh_fma
+#define __ieee754_exp __ieee754_exp_fma
+#define __expm1 __expm1_fma
+
+/* NB: __expm1 may be expanded to __expm1_fma in the following
+ prototypes. */
+extern long double __expm1l (long double);
+extern long double __expm1f128 (long double);
+
+#define SECTION __attribute__ ((section (".text.fma")))
+
+#include <sysdeps/ieee754/dbl-64/e_sinh.c>
diff --git a/sysdeps/x86_64/fpu/multiarch/e_sinh.c b/sysdeps/x86_64/fpu/multiarch/e_sinh.c
new file mode 100644
index 0000000000000000..3d3c18ccdf1d437a
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/e_sinh.c
@@ -0,0 +1,35 @@
+/* Multiple versions of sinh.
+ Copyright (C) 2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdeps/x86/isa-level.h>
+#if MINIMUM_X86_ISA_LEVEL < AVX2_X86_ISA_LEVEL
+# include <libm-alias-finite.h>
+
+extern double __redirect_ieee754_sinh (double);
+
+# define SYMBOL_NAME ieee754_sinh
+# include "ifunc-fma.h"
+
+libc_ifunc_redirected (__redirect_ieee754_sinh, __ieee754_sinh,
+ IFUNC_SELECTOR ());
+
+libm_alias_finite (__ieee754_sinh, __sinh)
+
+# define __ieee754_sinh __ieee754_sinh_sse2
+#endif
+#include <sysdeps/ieee754/dbl-64/e_sinh.c>

View File

@ -0,0 +1,122 @@
commit c7c4a5906f326f1290b1c2413a83c530564ec4b8
Author: Sunil K Pandey <skpgkp2@gmail.com>
Date: Wed Mar 5 16:13:38 2025 -0800
x86_64: Add atanh with FMA
On SPR, it improves atanh bench performance by:
Before After Improvement
reciprocal-throughput 15.1715 14.8628 2%
latency 57.1941 56.1883 2%
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
diff --git a/benchtests/atanh-inputs b/benchtests/atanh-inputs
index 455aa65b6500bccb..498529325436d48f 100644
--- a/benchtests/atanh-inputs
+++ b/benchtests/atanh-inputs
@@ -1,6 +1,7 @@
## args: double
## ret: double
## includes: math.h
+## name: workload-random
0x1.5a2730bacd94ap-1
-0x1.b57eb40fc048ep-21
-0x1.c0b185fb450e2p-17
diff --git a/sysdeps/ieee754/dbl-64/e_atanh.c b/sysdeps/ieee754/dbl-64/e_atanh.c
index db1f69f953e62166..fbfabf0ca0a35e92 100644
--- a/sysdeps/ieee754/dbl-64/e_atanh.c
+++ b/sysdeps/ieee754/dbl-64/e_atanh.c
@@ -45,6 +45,11 @@
static const double huge = 1e300;
+#ifndef SECTION
+# define SECTION
+#endif
+
+SECTION
double
__ieee754_atanh (double x)
{
@@ -74,4 +79,7 @@ __ieee754_atanh (double x)
return copysign (t, x);
}
+
+#ifndef __ieee754_atanh
libm_alias_finite (__ieee754_atanh, __atanh)
+#endif
diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
index 40a9654ecadca713..9ebc7f62f7479eed 100644
--- a/sysdeps/x86_64/fpu/multiarch/Makefile
+++ b/sysdeps/x86_64/fpu/multiarch/Makefile
@@ -1,6 +1,7 @@
ifeq ($(subdir),math)
CFLAGS-e_asin-fma.c = -mfma -mavx2
CFLAGS-e_atan2-fma.c = -mfma -mavx2
+CFLAGS-e_atanh-fma.c = -mfma -mavx2
CFLAGS-e_exp-fma.c = -mfma -mavx2
CFLAGS-e_log-fma.c = -mfma -mavx2
CFLAGS-e_log2-fma.c = -mfma -mavx2
@@ -57,6 +58,7 @@ libm-sysdep_routines += \
e_asin-fma \
e_atan2-avx \
e_atan2-fma \
+ e_atanh-fma \
e_exp-avx \
e_exp-fma \
e_exp2f-fma \
diff --git a/sysdeps/x86_64/fpu/multiarch/e_atanh-fma.c b/sysdeps/x86_64/fpu/multiarch/e_atanh-fma.c
new file mode 100644
index 0000000000000000..c3f2f9e5506ae363
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/e_atanh-fma.c
@@ -0,0 +1,6 @@
+#define __ieee754_atanh __ieee754_atanh_fma
+#define __log1p __log1p_fma
+
+#define SECTION __attribute__ ((section (".text.fma")))
+
+#include <sysdeps/ieee754/dbl-64/e_atanh.c>
diff --git a/sysdeps/x86_64/fpu/multiarch/e_atanh.c b/sysdeps/x86_64/fpu/multiarch/e_atanh.c
new file mode 100644
index 0000000000000000..d2b785dfc0268df8
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/e_atanh.c
@@ -0,0 +1,34 @@
+/* Multiple versions of atanh.
+ Copyright (C) 2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdeps/x86/isa-level.h>
+#if MINIMUM_X86_ISA_LEVEL < AVX2_X86_ISA_LEVEL
+# include <libm-alias-finite.h>
+
+extern double __redirect_ieee754_atanh (double);
+
+# define SYMBOL_NAME ieee754_atanh
+# include "ifunc-fma.h"
+
+libc_ifunc_redirected (__redirect_ieee754_atanh, __ieee754_atanh, IFUNC_SELECTOR ());
+
+libm_alias_finite (__ieee754_atanh, __atanh)
+
+# define __ieee754_atanh __ieee754_atanh_sse2
+#endif
+#include <sysdeps/ieee754/dbl-64/e_atanh.c>

View File

@ -0,0 +1,321 @@
commit 703f4341083afa7d71987aa96a35eab81309e634
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Wed Jun 22 16:51:19 2022 -0700
x86: Add defines / utilities for making ISA specific x86 builds
1. Factor out some of the ISA level defines in isa-level.c to
standalone header isa-level.h
2. Add new headers with ISA level dependent macros for handling
ifuncs.
Note, this file does not change any code.
Tested with and without multiarch on x86_64 for ISA levels:
{generic, x86-64-v2, x86-64-v3, x86-64-v4}
And m32 with and without multiarch.
diff --git a/sysdeps/x86/init-arch.h b/sysdeps/x86/init-arch.h
index 2e8141ffbaecca69..df1bce2aae11450b 100644
--- a/sysdeps/x86/init-arch.h
+++ b/sysdeps/x86/init-arch.h
@@ -19,7 +19,9 @@
#include <ifunc-init.h>
#include <isa.h>
-#ifndef __x86_64__
+#ifdef __x86_64__
+# include <isa-ifunc-macros.h>
+#else
/* Due to the reordering and the other nifty extensions in i686, it is
not really good to use heavily i586 optimized code on an i686. It's
better to use i486 code if it isn't an i586. */
diff --git a/sysdeps/x86/isa-ifunc-macros.h b/sysdeps/x86/isa-ifunc-macros.h
new file mode 100644
index 0000000000000000..ba6826d518501396
--- /dev/null
+++ b/sysdeps/x86/isa-ifunc-macros.h
@@ -0,0 +1,70 @@
+/* Common ifunc selection utils
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef _ISA_IFUNC_MACROS_H
+#define _ISA_IFUNC_MACROS_H 1
+
+#include <isa-level.h>
+#include <sys/cdefs.h>
+#include <stdlib.h>
+
+/* Only include at the level of the minimum build ISA or higher. I.e
+ if built with ISA=V1, then include all implementations. On the
+ other hand if built with ISA=V3 only include V3/V4
+ implementations. If there is no implementation at or above the
+ minimum build ISA level, then include the highest ISA level
+ implementation. */
+#if MINIMUM_X86_ISA_LEVEL <= 4
+# define X86_IFUNC_IMPL_ADD_V4(...) IFUNC_IMPL_ADD (__VA_ARGS__)
+#endif
+#if MINIMUM_X86_ISA_LEVEL <= 3
+# define X86_IFUNC_IMPL_ADD_V3(...) IFUNC_IMPL_ADD (__VA_ARGS__)
+#endif
+#if MINIMUM_X86_ISA_LEVEL <= 2
+# define X86_IFUNC_IMPL_ADD_V2(...) IFUNC_IMPL_ADD (__VA_ARGS__)
+#endif
+#if MINIMUM_X86_ISA_LEVEL <= 1
+# define X86_IFUNC_IMPL_ADD_V1(...) IFUNC_IMPL_ADD (__VA_ARGS__)
+#endif
+
+#ifndef X86_IFUNC_IMPL_ADD_V4
+# define X86_IFUNC_IMPL_ADD_V4(...)
+#endif
+#ifndef X86_IFUNC_IMPL_ADD_V3
+# define X86_IFUNC_IMPL_ADD_V3(...)
+#endif
+#ifndef X86_IFUNC_IMPL_ADD_V2
+# define X86_IFUNC_IMPL_ADD_V2(...)
+#endif
+#ifndef X86_IFUNC_IMPL_ADD_V1
+# define X86_IFUNC_IMPL_ADD_V1(...)
+#endif
+
+#define X86_ISA_CPU_FEATURE_CONST_CHECK_ENABLED(name) \
+ ((name##_X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL)
+
+#define X86_ISA_CPU_FEATURE_USABLE_P(ptr, name) \
+ (X86_ISA_CPU_FEATURE_CONST_CHECK_ENABLED (name) \
+ || CPU_FEATURE_USABLE_P (ptr, name))
+
+#define X86_ISA_CPU_FEATURES_ARCH_P(ptr, name) \
+ (X86_ISA_CPU_FEATURE_CONST_CHECK_ENABLED (name) \
+ || CPU_FEATURES_ARCH_P (ptr, name))
+
+#endif
diff --git a/sysdeps/x86/isa-level.c b/sysdeps/x86/isa-level.c
index 07815381122c94c3..9bd52eadf8f2382b 100644
--- a/sysdeps/x86/isa-level.c
+++ b/sysdeps/x86/isa-level.c
@@ -26,38 +26,31 @@
<https://www.gnu.org/licenses/>. */
#include <elf.h>
-
+#include <sysdeps/x86/isa-level.h>
/* ELF program property for x86 ISA level. */
#ifdef INCLUDE_X86_ISA_LEVEL
-# if defined __SSE__ && defined __SSE2__
+# if MINIMUM_X86_ISA_LEVEL >= 1
/* NB: ISAs, excluding MMX, in x86-64 ISA level baseline are used. */
# define ISA_BASELINE GNU_PROPERTY_X86_ISA_1_BASELINE
# else
# define ISA_BASELINE 0
# endif
-# if ISA_BASELINE && defined __GCC_HAVE_SYNC_COMPARE_AND_SWAP_16 \
- && defined HAVE_X86_LAHF_SAHF && defined __POPCNT__ \
- && defined __SSE3__ && defined __SSSE3__ && defined __SSE4_1__ \
- && defined __SSE4_2__
+# if MINIMUM_X86_ISA_LEVEL >= 2
/* NB: ISAs in x86-64 ISA level v2 are used. */
# define ISA_V2 GNU_PROPERTY_X86_ISA_1_V2
# else
# define ISA_V2 0
# endif
-# if ISA_V2 && defined __AVX__ && defined __AVX2__ && defined __F16C__ \
- && defined __FMA__ && defined __LZCNT__ && defined HAVE_X86_MOVBE \
- && defined __BMI__ && defined __BMI2__
+# if MINIMUM_X86_ISA_LEVEL >= 3
/* NB: ISAs in x86-64 ISA level v3 are used. */
# define ISA_V3 GNU_PROPERTY_X86_ISA_1_V3
# else
# define ISA_V3 0
# endif
-# if ISA_V3 && defined __AVX512F__ && defined __AVX512BW__ \
- && defined __AVX512CD__ && defined __AVX512DQ__ \
- && defined __AVX512VL__
+# if MINIMUM_X86_ISA_LEVEL >= 4
/* NB: ISAs in x86-64 ISA level v4 are used. */
# define ISA_V4 GNU_PROPERTY_X86_ISA_1_V4
# else
diff --git a/sysdeps/x86/isa-level.h b/sysdeps/x86/isa-level.h
new file mode 100644
index 0000000000000000..7cae11c2289dd54d
--- /dev/null
+++ b/sysdeps/x86/isa-level.h
@@ -0,0 +1,102 @@
+/* Header defining the minimum x86 ISA level
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ In addition to the permissions in the GNU Lesser General Public
+ License, the Free Software Foundation gives you unlimited
+ permission to link the compiled version of this file with other
+ programs, and to distribute those programs without any restriction
+ coming from the use of this file. (The Lesser General Public
+ License restrictions do apply in other respects; for example, they
+ cover modification of the file, and distribution when not linked
+ into another program.)
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef _ISA_LEVEL_H
+#define _ISA_LEVEL_H
+
+#if defined __SSE__ && defined __SSE2__
+/* NB: ISAs, excluding MMX, in x86-64 ISA level baseline are used. */
+# define __X86_ISA_V1 1
+#else
+# define __X86_ISA_V1 0
+#endif
+
+#if __X86_ISA_V1 && defined __GCC_HAVE_SYNC_COMPARE_AND_SWAP_16 \
+ && defined HAVE_X86_LAHF_SAHF && defined __POPCNT__ && defined __SSE3__ \
+ && defined __SSSE3__ && defined __SSE4_1__ && defined __SSE4_2__
+/* NB: ISAs in x86-64 ISA level v2 are used. */
+# define __X86_ISA_V2 1
+#else
+# define __X86_ISA_V2 0
+#endif
+
+#if __X86_ISA_V2 && defined __AVX__ && defined __AVX2__ && defined __F16C__ \
+ && defined __FMA__ && defined __LZCNT__ && defined HAVE_X86_MOVBE \
+ && defined __BMI__ && defined __BMI2__
+/* NB: ISAs in x86-64 ISA level v3 are used. */
+# define __X86_ISA_V3 1
+#else
+# define __X86_ISA_V3 0
+#endif
+
+#if __X86_ISA_V3 && defined __AVX512F__ && defined __AVX512BW__ \
+ && defined __AVX512CD__ && defined __AVX512DQ__ && defined __AVX512VL__
+/* NB: ISAs in x86-64 ISA level v4 are used. */
+# define __X86_ISA_V4 1
+#else
+# define __X86_ISA_V4 0
+#endif
+
+#define MINIMUM_X86_ISA_LEVEL \
+ (__X86_ISA_V1 + __X86_ISA_V2 + __X86_ISA_V3 + __X86_ISA_V4)
+
+
+/*
+ * CPU Features that are hard coded as enabled depending on ISA build
+ * level.
+ * - Values > 0 features are always ENABLED if:
+ * Value >= MINIMUM_X86_ISA_LEVEL
+ */
+
+
+/* ISA level >= 4 guaranteed includes. */
+#define AVX512VL_X86_ISA_LEVEL 4
+#define AVX512BW_X86_ISA_LEVEL 4
+
+/* ISA level >= 3 guaranteed includes. */
+#define AVX2_X86_ISA_LEVEL 3
+#define BMI2_X86_ISA_LEVEL 3
+
+/*
+ * NB: This may not be fully assumable for ISA level >= 3. From
+ * looking over the architectures supported in cpu-features.h the
+ * following CPUs may have an issue with this being default set:
+ * - AMD Excavator
+ */
+#define AVX_Fast_Unaligned_Load_X86_ISA_LEVEL 3
+
+/*
+ * KNL (the only cpu that sets this supported in cpu-features.h)
+ * builds with ISA V1 so this shouldn't harm any architectures.
+ */
+#define Prefer_No_VZEROUPPER_X86_ISA_LEVEL 3
+
+#define ISA_SHOULD_BUILD(isa_build_level) \
+ (MINIMUM_X86_ISA_LEVEL <= (isa_build_level) && IS_IN (libc)) \
+ || defined ISA_DEFAULT_IMPL
+
+#endif
diff --git a/sysdeps/x86_64/isa-default-impl.h b/sysdeps/x86_64/isa-default-impl.h
new file mode 100644
index 0000000000000000..34634668e56669c9
--- /dev/null
+++ b/sysdeps/x86_64/isa-default-impl.h
@@ -0,0 +1,49 @@
+/* Utility for including proper default function based on ISA level
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <isa-level.h>
+
+#ifndef DEFAULT_IMPL_V1
+# error "Must have at least ISA V1 Version"
+#endif
+
+#ifndef DEFAULT_IMPL_V2
+# define DEFAULT_IMPL_V2 DEFAULT_IMPL_V1
+#endif
+
+#ifndef DEFAULT_IMPL_V3
+# define DEFAULT_IMPL_V3 DEFAULT_IMPL_V2
+#endif
+
+#ifndef DEFAULT_IMPL_V4
+# define DEFAULT_IMPL_V4 DEFAULT_IMPL_V3
+#endif
+
+#if MINIMUM_X86_ISA_LEVEL == 1
+# define ISA_DEFAULT_IMPL DEFAULT_IMPL_V1
+#elif MINIMUM_X86_ISA_LEVEL == 2
+# define ISA_DEFAULT_IMPL DEFAULT_IMPL_V2
+#elif MINIMUM_X86_ISA_LEVEL == 3
+# define ISA_DEFAULT_IMPL DEFAULT_IMPL_V3
+#elif MINIMUM_X86_ISA_LEVEL == 4
+# define ISA_DEFAULT_IMPL DEFAULT_IMPL_V4
+#else
+# error "Unsupported ISA Level!"
+#endif
+
+#include ISA_DEFAULT_IMPL

View File

@ -0,0 +1,109 @@
Partial backport of:
commit 4fc321dc58b29217e322526b49549930d0807179
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri Jun 24 16:15:42 2022 -0700
x86: Fix backwards Prefer_No_VZEROUPPER check in ifunc-evex.h
Add third argument to X86_ISA_CPU_FEATURES_ARCH_P macro so the runtime
CPU_FEATURES_ARCH_P check can be inverted if the
MINIMUM_X86_ISA_LEVEL is not high enough to constantly evaluate
the check.
Use this new macro to correct the backwards check in ifunc-evex.h
Conflicts:
sysdeps/x86_64/multiarch/ifunc-evex.h
(skipped; EVEX variant selection not enabled downstream)
diff --git a/sysdeps/x86/isa-ifunc-macros.h b/sysdeps/x86/isa-ifunc-macros.h
index ba6826d518501396..d69905689b0963ec 100644
--- a/sysdeps/x86/isa-ifunc-macros.h
+++ b/sysdeps/x86/isa-ifunc-macros.h
@@ -56,15 +56,31 @@
# define X86_IFUNC_IMPL_ADD_V1(...)
#endif
-#define X86_ISA_CPU_FEATURE_CONST_CHECK_ENABLED(name) \
- ((name##_X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL)
+/* Both X86_ISA_CPU_FEATURE_USABLE_P and X86_ISA_CPU_FEATURES_ARCH_P
+ macros are wrappers for the the respective
+ CPU_FEATURE{S}_{USABLE|ARCH}_P runtime checks. They differ in two
+ ways.
+
+ 1. The USABLE_P version is evaluated to true when the feature
+ is enabled.
+
+ 2. The ARCH_P version has a third argument `not`. The `not`
+ argument can either be '!' or empty. If the feature is
+ enabled above an ISA level, the third argument should be empty
+ and the expression is evaluated to true when the feature is
+ enabled. If the feature is disabled above an ISA level, the
+ third argument should be `!` and the expression is evaluated
+ to true when the feature is disabled.
+ */
#define X86_ISA_CPU_FEATURE_USABLE_P(ptr, name) \
- (X86_ISA_CPU_FEATURE_CONST_CHECK_ENABLED (name) \
+ (((name##_X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL) \
|| CPU_FEATURE_USABLE_P (ptr, name))
-#define X86_ISA_CPU_FEATURES_ARCH_P(ptr, name) \
- (X86_ISA_CPU_FEATURE_CONST_CHECK_ENABLED (name) \
- || CPU_FEATURES_ARCH_P (ptr, name))
+
+#define X86_ISA_CPU_FEATURES_ARCH_P(ptr, name, not) \
+ (((name##_X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL) \
+ || not CPU_FEATURES_ARCH_P (ptr, name))
+
#endif
diff --git a/sysdeps/x86/isa-level.h b/sysdeps/x86/isa-level.h
index 7cae11c2289dd54d..075e7c6ee17777db 100644
--- a/sysdeps/x86/isa-level.h
+++ b/sysdeps/x86/isa-level.h
@@ -64,14 +64,8 @@
#define MINIMUM_X86_ISA_LEVEL \
(__X86_ISA_V1 + __X86_ISA_V2 + __X86_ISA_V3 + __X86_ISA_V4)
-
-/*
- * CPU Features that are hard coded as enabled depending on ISA build
- * level.
- * - Values > 0 features are always ENABLED if:
- * Value >= MINIMUM_X86_ISA_LEVEL
- */
-
+/* Depending on the minimum ISA level, a feature check result can be a
+ compile-time constant.. */
/* ISA level >= 4 guaranteed includes. */
#define AVX512VL_X86_ISA_LEVEL 4
@@ -81,18 +75,16 @@
#define AVX2_X86_ISA_LEVEL 3
#define BMI2_X86_ISA_LEVEL 3
-/*
- * NB: This may not be fully assumable for ISA level >= 3. From
- * looking over the architectures supported in cpu-features.h the
- * following CPUs may have an issue with this being default set:
- * - AMD Excavator
- */
+/* NB: This feature is enabled when ISA level >= 3, which was disabled
+ for the following CPUs:
+ - AMD Excavator
+ when ISA level < 3. */
#define AVX_Fast_Unaligned_Load_X86_ISA_LEVEL 3
-/*
- * KNL (the only cpu that sets this supported in cpu-features.h)
- * builds with ISA V1 so this shouldn't harm any architectures.
- */
+/* NB: This feature is disabled when ISA level >= 3, which was enabled
+ for the following CPUs:
+ - Intel KNL
+ when ISA level < 3. */
#define Prefer_No_VZEROUPPER_X86_ISA_LEVEL 3
#define ISA_SHOULD_BUILD(isa_build_level) \

View File

@ -0,0 +1,80 @@
commit f56c497d2b640577f0a8a41f04d4f2c25a8800bd
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Mon Jun 27 12:52:58 2022 -0700
x86: Move CPU_FEATURE{S}_{USABLE|ARCH}_P to isa-level.h
Move X86_ISA_CPU_FEATURE_USABLE_P and X86_ISA_CPU_FEATURES_ARCH_P to
where MINIMUM_X86_ISA_LEVEL and XXX_X86_ISA_LEVEL are defined.
diff --git a/sysdeps/x86/isa-ifunc-macros.h b/sysdeps/x86/isa-ifunc-macros.h
index d69905689b0963ec..f967a1bec6fd57d2 100644
--- a/sysdeps/x86/isa-ifunc-macros.h
+++ b/sysdeps/x86/isa-ifunc-macros.h
@@ -56,31 +56,4 @@
# define X86_IFUNC_IMPL_ADD_V1(...)
#endif
-/* Both X86_ISA_CPU_FEATURE_USABLE_P and X86_ISA_CPU_FEATURES_ARCH_P
- macros are wrappers for the the respective
- CPU_FEATURE{S}_{USABLE|ARCH}_P runtime checks. They differ in two
- ways.
-
- 1. The USABLE_P version is evaluated to true when the feature
- is enabled.
-
- 2. The ARCH_P version has a third argument `not`. The `not`
- argument can either be '!' or empty. If the feature is
- enabled above an ISA level, the third argument should be empty
- and the expression is evaluated to true when the feature is
- enabled. If the feature is disabled above an ISA level, the
- third argument should be `!` and the expression is evaluated
- to true when the feature is disabled.
- */
-
-#define X86_ISA_CPU_FEATURE_USABLE_P(ptr, name) \
- (((name##_X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL) \
- || CPU_FEATURE_USABLE_P (ptr, name))
-
-
-#define X86_ISA_CPU_FEATURES_ARCH_P(ptr, name, not) \
- (((name##_X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL) \
- || not CPU_FEATURES_ARCH_P (ptr, name))
-
-
#endif
diff --git a/sysdeps/x86/isa-level.h b/sysdeps/x86/isa-level.h
index 075e7c6ee17777db..c6156e7f7ac7ece5 100644
--- a/sysdeps/x86/isa-level.h
+++ b/sysdeps/x86/isa-level.h
@@ -87,6 +87,30 @@
when ISA level < 3. */
#define Prefer_No_VZEROUPPER_X86_ISA_LEVEL 3
+/* Both X86_ISA_CPU_FEATURE_USABLE_P and X86_ISA_CPU_FEATURES_ARCH_P
+ macros are wrappers for the respective CPU_FEATURE{S}_{USABLE|ARCH}_P
+ runtime checks. They differ in two ways.
+
+ 1. The USABLE_P version is evaluated to true when the feature
+ is enabled.
+
+ 2. The ARCH_P version has a third argument `not`. The `not`
+ argument can either be `!` or empty. If the feature is
+ enabled above an ISA level, the third argument should be empty
+ and the expression is evaluated to true when the feature is
+ enabled. If the feature is disabled above an ISA level, the
+ third argument should be `!` and the expression is evaluated
+ to true when the feature is disabled.
+ */
+
+#define X86_ISA_CPU_FEATURE_USABLE_P(ptr, name) \
+ (((name##_X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL) \
+ || CPU_FEATURE_USABLE_P (ptr, name))
+
+#define X86_ISA_CPU_FEATURES_ARCH_P(ptr, name, not) \
+ (((name##_X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL) \
+ || not CPU_FEATURES_ARCH_P (ptr, name))
+
#define ISA_SHOULD_BUILD(isa_build_level) \
(MINIMUM_X86_ISA_LEVEL <= (isa_build_level) && IS_IN (libc)) \
|| defined ISA_DEFAULT_IMPL

View File

@ -0,0 +1,153 @@
commit cfdc4df66ce1464611e1b508f7a5a8f38afd5337
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Mon Jun 27 11:36:28 2022 -0700
x86-64: Only define used SSE/AVX/AVX512 run-time resolvers
When glibc is built with x86-64 ISA level v3, SSE run-time resolvers
aren't used. For x86-64 ISA level v4 build, both SSE and AVX resolvers
are unused. Check the minimum x86-64 ISA level to exclude the unused
run-time resolvers.
diff --git a/sysdeps/x86/isa-level.h b/sysdeps/x86/isa-level.h
index c6156e7f7ac7ece5..f293aea9068cc2a9 100644
--- a/sysdeps/x86/isa-level.h
+++ b/sysdeps/x86/isa-level.h
@@ -68,10 +68,12 @@
compile-time constant.. */
/* ISA level >= 4 guaranteed includes. */
+#define AVX512F_X86_ISA_LEVEL 4
#define AVX512VL_X86_ISA_LEVEL 4
#define AVX512BW_X86_ISA_LEVEL 4
/* ISA level >= 3 guaranteed includes. */
+#define AVX_X86_ISA_LEVEL 3
#define AVX2_X86_ISA_LEVEL 3
#define BMI2_X86_ISA_LEVEL 3
diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
index 742682517179fab5..5da493ea4097886d 100644
--- a/sysdeps/x86_64/dl-machine.h
+++ b/sysdeps/x86_64/dl-machine.h
@@ -29,6 +29,7 @@
#include <dl-tlsdesc.h>
#include <dl-static-tls.h>
#include <dl-machine-rel.h>
+#include <isa-level.h>
/* Return nonzero iff ELF header is compatible with the running host. */
static inline int __attribute__ ((unused))
@@ -94,6 +95,8 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
/* Identify this shared object. */
*(ElfW(Addr) *) (got + 1) = (ElfW(Addr)) l;
+ const struct cpu_features* cpu_features = __get_cpu_features ();
+
/* The got[2] entry contains the address of a function which gets
called to get the address of a so far unresolved function and
jump to it. The profiling extension of the dynamic linker allows
@@ -102,9 +105,9 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
end in this function. */
if (__glibc_unlikely (profile))
{
- if (CPU_FEATURE_USABLE (AVX512F))
+ if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512F))
*(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx512;
- else if (CPU_FEATURE_USABLE (AVX))
+ else if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX))
*(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx;
else
*(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_sse;
@@ -120,9 +123,10 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
/* This function will get called to fix up the GOT entry
indicated by the offset on the stack, and then jump to
the resolved address. */
- if (GLRO(dl_x86_cpu_features).xsave_state_size != 0)
+ if (MINIMUM_X86_ISA_LEVEL >= AVX_X86_ISA_LEVEL
+ || GLRO(dl_x86_cpu_features).xsave_state_size != 0)
*(ElfW(Addr) *) (got + 2)
- = (CPU_FEATURE_USABLE (XSAVEC)
+ = (CPU_FEATURE_USABLE_P (cpu_features, XSAVEC)
? (ElfW(Addr)) &_dl_runtime_resolve_xsavec
: (ElfW(Addr)) &_dl_runtime_resolve_xsave);
else
diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
index 90b2a6ce56d1d370..3cf517f84c87cf22 100644
--- a/sysdeps/x86_64/dl-trampoline.S
+++ b/sysdeps/x86_64/dl-trampoline.S
@@ -20,6 +20,7 @@
#include <sysdep.h>
#include <cpu-features-offsets.h>
#include <link-defines.h>
+#include <isa-level.h>
#ifndef DL_STACK_ALIGNMENT
/* Due to GCC bug:
@@ -71,35 +72,39 @@
#undef VMOVA
#undef VEC_SIZE
-#define VEC_SIZE 32
-#define VMOVA vmovdqa
-#define VEC(i) ymm##i
-#define _dl_runtime_profile _dl_runtime_profile_avx
-#include "dl-trampoline.h"
-#undef _dl_runtime_profile
-#undef VEC
-#undef VMOVA
-#undef VEC_SIZE
+#if MINIMUM_X86_ISA_LEVEL <= AVX_X86_ISA_LEVEL
+# define VEC_SIZE 32
+# define VMOVA vmovdqa
+# define VEC(i) ymm##i
+# define _dl_runtime_profile _dl_runtime_profile_avx
+# include "dl-trampoline.h"
+# undef _dl_runtime_profile
+# undef VEC
+# undef VMOVA
+# undef VEC_SIZE
+#endif
+#if MINIMUM_X86_ISA_LEVEL < AVX_X86_ISA_LEVEL
/* movaps/movups is 1-byte shorter. */
-#define VEC_SIZE 16
-#define VMOVA movaps
-#define VEC(i) xmm##i
-#define _dl_runtime_profile _dl_runtime_profile_sse
-#undef RESTORE_AVX
-#include "dl-trampoline.h"
-#undef _dl_runtime_profile
-#undef VEC
-#undef VMOVA
-#undef VEC_SIZE
-
-#define USE_FXSAVE
-#define STATE_SAVE_ALIGNMENT 16
-#define _dl_runtime_resolve _dl_runtime_resolve_fxsave
-#include "dl-trampoline.h"
-#undef _dl_runtime_resolve
-#undef USE_FXSAVE
-#undef STATE_SAVE_ALIGNMENT
+# define VEC_SIZE 16
+# define VMOVA movaps
+# define VEC(i) xmm##i
+# define _dl_runtime_profile _dl_runtime_profile_sse
+# undef RESTORE_AVX
+# include "dl-trampoline.h"
+# undef _dl_runtime_profile
+# undef VEC
+# undef VMOVA
+# undef VEC_SIZE
+
+# define USE_FXSAVE
+# define STATE_SAVE_ALIGNMENT 16
+# define _dl_runtime_resolve _dl_runtime_resolve_fxsave
+# include "dl-trampoline.h"
+# undef _dl_runtime_resolve
+# undef USE_FXSAVE
+# undef STATE_SAVE_ALIGNMENT
+#endif
#define USE_XSAVE
#define STATE_SAVE_ALIGNMENT 64

View File

@ -0,0 +1,51 @@
commit a3563f3f369878467dd74aeb360448119a7a4b41
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon Jun 27 21:07:03 2022 -0700
x86: Add more feature definitions to isa-level.h
This commit doesn't change anything in itself. It is just to add
definitions that will be needed by future patches.
diff --git a/sysdeps/x86/isa-level.h b/sysdeps/x86/isa-level.h
index f293aea9068cc2a9..77f9e2c0c3ccd41d 100644
--- a/sysdeps/x86/isa-level.h
+++ b/sysdeps/x86/isa-level.h
@@ -67,15 +67,27 @@
/* Depending on the minimum ISA level, a feature check result can be a
compile-time constant.. */
+
+/* For CPU_FEATURE_USABLE_P. */
+
/* ISA level >= 4 guaranteed includes. */
#define AVX512F_X86_ISA_LEVEL 4
#define AVX512VL_X86_ISA_LEVEL 4
#define AVX512BW_X86_ISA_LEVEL 4
+#define AVX512DQ_X86_ISA_LEVEL 4
/* ISA level >= 3 guaranteed includes. */
#define AVX_X86_ISA_LEVEL 3
#define AVX2_X86_ISA_LEVEL 3
#define BMI2_X86_ISA_LEVEL 3
+#define MOVBE_X86_ISA_LEVEL 3
+
+/* ISA level >= 2 guaranteed includes. */
+#define SSE4_2_X86_ISA_LEVEL 2
+#define SSSE3_X86_ISA_LEVEL 2
+
+
+/* For X86_ISA_CPU_FEATURES_ARCH_P. */
/* NB: This feature is enabled when ISA level >= 3, which was disabled
for the following CPUs:
@@ -89,6 +101,9 @@
when ISA level < 3. */
#define Prefer_No_VZEROUPPER_X86_ISA_LEVEL 3
+/* Feature(s) enabled when ISA level >= 2. */
+#define Fast_Unaligned_Load_X86_ISA_LEVEL 2
+
/* Both X86_ISA_CPU_FEATURE_USABLE_P and X86_ISA_CPU_FEATURES_ARCH_P
macros are wrappers for the respective CPU_FEATURE{S}_{USABLE|ARCH}_P
runtime checks. They differ in two ways.

View File

@ -0,0 +1,100 @@
Backport the sysdeps/x86/isa-level.h changes from:
commit ceabdcd130ca7043b0fcf2676183d79431d10493
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Wed Jul 13 16:32:59 2022 -0700
x86: Add support to build strcmp/strlen/strchr with explicit ISA level
1. Add default ISA level selection in non-multiarch/rtld
implementations.
2. Add ISA level build guards to different implementations.
- I.e strcmp-avx2.S which is ISA level 3 will only build if
compiled ISA level <= 3. Otherwise there is no reason to
include it as we will always use one of the ISA level 4
implementations (strcmp-evex.S).
3. Refactor the ifunc selector and ifunc implementation list to use
the ISA level aware wrapper macros that allow functions below the
compiled ISA level (with a guranteed replacement) to be skipped.
Tested with and without multiarch on x86_64 for ISA levels:
{generic, x86-64-v2, x86-64-v3, x86-64-v4}
And m32 with and without multiarch.
Conflicts:
support/xpthread_cond_signal.c
sysdeps/x86_64/memrchr.S
sysdeps/x86_64/multiarch/Makefile
sysdeps/x86_64/multiarch/ifunc-avx2.h
sysdeps/x86_64/multiarch/ifunc-impl-list.c
sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
sysdeps/x86_64/multiarch/ifunc-wcslen.h
sysdeps/x86_64/multiarch/memrchr-sse2.S
sysdeps/x86_64/multiarch/strcasecmp_l-sse2.S
sysdeps/x86_64/multiarch/strchr-sse2.S
sysdeps/x86_64/multiarch/strchrnul-sse2.S
sysdeps/x86_64/multiarch/strcmp-avx2.S
sysdeps/x86_64/multiarch/strcmp-evex.S
sysdeps/x86_64/multiarch/strcmp-sse2.S
sysdeps/x86_64/multiarch/strcmp-sse4_2.S
sysdeps/x86_64/multiarch/strcmp.c
sysdeps/x86_64/multiarch/strlen-sse2.S
sysdeps/x86_64/multiarch/strncmp.c
sysdeps/x86_64/multiarch/strnlen-sse2.S
sysdeps/x86_64/multiarch/strrchr-sse2.S
sysdeps/x86_64/multiarch/wcschr-sse2.S
sysdeps/x86_64/multiarch/wcscmp-sse2.S
sysdeps/x86_64/multiarch/wcslen-sse2.S
sysdeps/x86_64/multiarch/wcslen-sse4_1.S
sysdeps/x86_64/multiarch/wcsncmp-sse2.c
sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
sysdeps/x86_64/multiarch/wcsrchr-sse2.S
sysdeps/x86_64/strcasecmp_l.S
sysdeps/x86_64/strchr.S
sysdeps/x86_64/strchrnul.S
sysdeps/x86_64/strcmp.S
sysdeps/x86_64/strlen.S
sysdeps/x86_64/strncase_l.S
sysdeps/x86_64/strncmp.S
sysdeps/x86_64/strnlen.S
sysdeps/x86_64/strrchr.S
sysdeps/x86_64/wcschr.S
sysdeps/x86_64/wcscmp.S
sysdeps/x86_64/wcslen.S
sysdeps/x86_64/wcsrchr.S
(no backport of string function changes)
diff --git a/sysdeps/x86/isa-level.h b/sysdeps/x86/isa-level.h
index 77f9e2c0c3ccd41d..3c4480aba70193a2 100644
--- a/sysdeps/x86/isa-level.h
+++ b/sysdeps/x86/isa-level.h
@@ -84,6 +84,7 @@
/* ISA level >= 2 guaranteed includes. */
#define SSE4_2_X86_ISA_LEVEL 2
+#define SSE4_1_X86_ISA_LEVEL 2
#define SSSE3_X86_ISA_LEVEL 2
@@ -101,9 +102,18 @@
when ISA level < 3. */
#define Prefer_No_VZEROUPPER_X86_ISA_LEVEL 3
+/* NB: This feature is disable when ISA level >= 3. All CPUs with
+ this feature don't run on glibc built with ISA level >= 3. */
+#define Slow_SSE42_X86_ISA_LEVEL 3
+
/* Feature(s) enabled when ISA level >= 2. */
#define Fast_Unaligned_Load_X86_ISA_LEVEL 2
+/* NB: This feature is disable when ISA level >= 2, which was enabled
+ for the early Atom CPUs. */
+#define Slow_BSF_X86_ISA_LEVEL 2
+
+
/* Both X86_ISA_CPU_FEATURE_USABLE_P and X86_ISA_CPU_FEATURES_ARCH_P
macros are wrappers for the respective CPU_FEATURE{S}_{USABLE|ARCH}_P
runtime checks. They differ in two ways.

View File

@ -0,0 +1,137 @@
commit 881546979d0219c18337e1b4f4d00cfacab13c40
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Wed Aug 9 11:08:52 2023 -0700
x86_64: Sort fpu/multiarch/Makefile
Sort Makefile variables using scripts/sort-makefile-lines.py.
No code generation changes observed in libm. No regressions on x86_64.
diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
index 4d00a27988073c40..3c49c7b27778d9e8 100644
--- a/sysdeps/x86_64/fpu/multiarch/Makefile
+++ b/sysdeps/x86_64/fpu/multiarch/Makefile
@@ -1,17 +1,45 @@
ifeq ($(subdir),math)
-libm-sysdep_routines += s_floor-c s_ceil-c s_floorf-c s_ceilf-c \
- s_rint-c s_rintf-c s_nearbyint-c s_nearbyintf-c \
- s_roundeven-c s_roundevenf-c s_trunc-c s_truncf-c
+libm-sysdep_routines += \
+ s_ceil-c \
+ s_ceilf-c \
+ s_floor-c \
+ s_floorf-c \
+ s_rint-c \
+ s_rintf-c \
+ s_nearbyint-c \
+ s_nearbyintf-c \
+ s_roundeven-c \
+ s_roundevenf-c \
+ s_trunc-c \
+ s_truncf-c \
+# libm-sysdep_routines
-libm-sysdep_routines += s_ceil-sse4_1 s_ceilf-sse4_1 s_floor-sse4_1 \
- s_floorf-sse4_1 s_nearbyint-sse4_1 \
- s_nearbyintf-sse4_1 s_roundeven-sse4_1 \
- s_roundevenf-sse4_1 s_rint-sse4_1 s_rintf-sse4_1 \
- s_trunc-sse4_1 s_truncf-sse4_1
+libm-sysdep_routines += \
+ s_ceil-sse4_1 \
+ s_ceilf-sse4_1 \
+ s_floor-sse4_1 \
+ s_floorf-sse4_1 \
+ s_nearbyint-sse4_1 \
+ s_nearbyintf-sse4_1 \
+ s_roundeven-sse4_1 \
+ s_roundevenf-sse4_1 \
+ s_rint-sse4_1 \
+ s_rintf-sse4_1 \
+ s_trunc-sse4_1 \
+ s_truncf-sse4_1 \
+# libm-sysdep_routines
-libm-sysdep_routines += e_exp-fma e_log-fma e_pow-fma s_atan-fma \
- e_asin-fma e_atan2-fma s_sin-fma s_tan-fma \
- s_sincos-fma
+libm-sysdep_routines += \
+ e_asin-fma \
+ e_atan2-fma \
+ e_exp-fma \
+ e_log-fma \
+ e_pow-fma \
+ s_atan-fma \
+ s_sin-fma \
+ s_sincos-fma \
+ s_tan-fma \
+# libm-sysdep_routines
CFLAGS-e_asin-fma.c = -mfma -mavx2
CFLAGS-e_atan2-fma.c = -mfma -mavx2
@@ -23,10 +51,22 @@ CFLAGS-s_sin-fma.c = -mfma -mavx2
CFLAGS-s_tan-fma.c = -mfma -mavx2
CFLAGS-s_sincos-fma.c = -mfma -mavx2
-libm-sysdep_routines += s_sinf-sse2 s_cosf-sse2 s_sincosf-sse2
+libm-sysdep_routines += \
+ s_cosf-sse2 \
+ s_sincosf-sse2 \
+ s_sinf-sse2 \
+# libm-sysdep_routines
-libm-sysdep_routines += e_exp2f-fma e_expf-fma e_log2f-fma e_logf-fma \
- e_powf-fma s_sinf-fma s_cosf-fma s_sincosf-fma
+libm-sysdep_routines += \
+ e_exp2f-fma \
+ e_expf-fma \
+ e_log2f-fma \
+ e_logf-fma \
+ e_powf-fma \
+ s_cosf-fma \
+ s_sincosf-fma \
+ s_sinf-fma \
+# libm-sysdep_routines
CFLAGS-e_exp2f-fma.c = -mfma -mavx2
CFLAGS-e_expf-fma.c = -mfma -mavx2
@@ -37,9 +77,17 @@ CFLAGS-s_sinf-fma.c = -mfma -mavx2
CFLAGS-s_cosf-fma.c = -mfma -mavx2
CFLAGS-s_sincosf-fma.c = -mfma -mavx2
-libm-sysdep_routines += e_exp-fma4 e_log-fma4 e_pow-fma4 s_atan-fma4 \
- e_asin-fma4 e_atan2-fma4 s_sin-fma4 s_tan-fma4 \
- s_sincos-fma4
+libm-sysdep_routines += \
+ e_exp-fma4 \
+ e_log-fma4 \
+ e_pow-fma4 \
+ e_asin-fma4 \
+ s_atan-fma4 \
+ e_atan2-fma4 \
+ s_sin-fma4 \
+ s_sincos-fma4 \
+ s_tan-fma4 \
+# libm-sysdep_routines
CFLAGS-e_asin-fma4.c = -mfma4
CFLAGS-e_atan2-fma4.c = -mfma4
@@ -51,9 +99,15 @@ CFLAGS-s_sin-fma4.c = -mfma4
CFLAGS-s_tan-fma4.c = -mfma4
CFLAGS-s_sincos-fma4.c = -mfma4
-libm-sysdep_routines += e_exp-avx e_log-avx s_atan-avx \
- e_atan2-avx s_sin-avx s_tan-avx \
- s_sincos-avx
+libm-sysdep_routines += \
+ e_exp-avx \
+ e_log-avx \
+ s_atan-avx \
+ e_atan2-avx \
+ s_sin-avx \
+ s_sincos-avx \
+ s_tan-avx \
+# libm-sysdep_routines
CFLAGS-e_atan2-avx.c = -msse2avx -DSSE2AVX
CFLAGS-e_exp-avx.c = -msse2avx -DSSE2AVX

View File

@ -0,0 +1,91 @@
commit f6b10ed8e9a00de49d0951e760cc2b5288862b47
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Thu Aug 10 11:24:30 2023 -0700
x86_64: Add log2 with FMA
On Skylake, it improves log2 bench performance by:
Before After Improvement
max 208.779 63.827 69%
min 9.977 6.55 34%
mean 10.366 6.8191 34%
diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
index 3c49c7b27778d9e8..56ef0482a2a8f498 100644
--- a/sysdeps/x86_64/fpu/multiarch/Makefile
+++ b/sysdeps/x86_64/fpu/multiarch/Makefile
@@ -34,6 +34,7 @@ libm-sysdep_routines += \
e_atan2-fma \
e_exp-fma \
e_log-fma \
+ e_log2-fma \
e_pow-fma \
s_atan-fma \
s_sin-fma \
@@ -45,6 +46,7 @@ CFLAGS-e_asin-fma.c = -mfma -mavx2
CFLAGS-e_atan2-fma.c = -mfma -mavx2
CFLAGS-e_exp-fma.c = -mfma -mavx2
CFLAGS-e_log-fma.c = -mfma -mavx2
+CFLAGS-e_log2-fma.c = -mfma -mavx2
CFLAGS-e_pow-fma.c = -mfma -mavx2
CFLAGS-s_atan-fma.c = -mfma -mavx2
CFLAGS-s_sin-fma.c = -mfma -mavx2
diff --git a/sysdeps/x86_64/fpu/multiarch/e_log2-fma.c b/sysdeps/x86_64/fpu/multiarch/e_log2-fma.c
new file mode 100644
index 0000000000000000..9fbebc1b4725c41d
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/e_log2-fma.c
@@ -0,0 +1,3 @@
+#define __log2 __log2_fma
+
+#include <sysdeps/ieee754/dbl-64/e_log2.c>
diff --git a/sysdeps/x86_64/fpu/multiarch/e_log2.c b/sysdeps/x86_64/fpu/multiarch/e_log2.c
new file mode 100644
index 0000000000000000..c0320caf368b1bf0
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/e_log2.c
@@ -0,0 +1,43 @@
+/* Multiple versions of log2.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <libm-alias-double.h>
+#include <libm-alias-finite.h>
+
+extern double __redirect_log2 (double);
+
+#define SYMBOL_NAME log2
+#include "ifunc-fma.h"
+
+libc_ifunc_redirected (__redirect_log2, __log2, IFUNC_SELECTOR ());
+
+#ifdef SHARED
+__hidden_ver1 (__log2, __GI___log2, __redirect_log2)
+ __attribute__ ((visibility ("hidden")));
+
+versioned_symbol (libm, __ieee754_log2, log2, GLIBC_2_29);
+libm_alias_double_other (__log2, log2)
+#else
+libm_alias_double (__log2, log2)
+#endif
+
+strong_alias (__log2, __ieee754_log2)
+libm_alias_finite (__log2, __log2)
+
+#define __log2 __log2_sse2
+#include <sysdeps/ieee754/dbl-64/e_log2.c>

View File

@ -0,0 +1,39 @@
commit e5363e6f460c2d58809bf10fc96d70fd1ef8b5b2
Author: Jens Remus <jremus@linux.ibm.com>
Date: Fri Jul 25 15:40:03 2025 +0200
Use TLS initial-exec model for __libc_tsd_CTYPE_* thread variables [BZ #33234]
Commit 10a66a8e421b ("Remove <libc-tsd.h>") removed the TLS initial-exec
(IE) model attribute from the __libc_tsd_CTYPE_* thread variable declarations
and definitions. Commit a894f04d8776 ("Optimize __libc_tsd_* thread
variable access") restored it on declarations.
Restore the TLS initial-exec model attribute on __libc_tsd_CTYPE_* thread
variable definitions.
This resolves test tst-locale1 failure on s390 32-bit, when using a
GNU linker without the fix from GNU binutils commit aefebe82dc89
("IBM zSystems: Fix offset relative to static TLS").
Reviewed-by: Florian Weimer <fweimer@redhat.com>
diff --git a/ctype/ctype-info.c b/ctype/ctype-info.c
index e0752b4a1af6df15..315cedcfaa357ead 100644
--- a/ctype/ctype-info.c
+++ b/ctype/ctype-info.c
@@ -24,11 +24,11 @@
__ctype_init before user code runs, but this does not happen for
threads in secondary namespaces. With the initializers, secondary
namespaces at least get locale data from the C locale. */
-__thread const uint16_t * __libc_tsd_CTYPE_B
+__thread const uint16_t * __libc_tsd_CTYPE_B attribute_tls_model_ie
= (const uint16_t *) _nl_C_LC_CTYPE_class + 128;
-__thread const int32_t * __libc_tsd_CTYPE_TOLOWER
+__thread const int32_t * __libc_tsd_CTYPE_TOLOWER attribute_tls_model_ie
= (const int32_t *) _nl_C_LC_CTYPE_tolower + 128;
-__thread const int32_t * __libc_tsd_CTYPE_TOUPPER
+__thread const int32_t * __libc_tsd_CTYPE_TOUPPER attribute_tls_model_ie
= (const int32_t *) _nl_C_LC_CTYPE_toupper + 128;

View File

@ -0,0 +1,35 @@
Downstream-only patch correcting the GLIBC_PRIVATE ABI preservation
change in glibc-RHEL-93320-19.patch.
This patch adds a symbol alias that supplies the expect external
name _dl_find_object within libc.a, where the forwarder in
elf/libc-dl_find_object.c no longer exists after the changes
in glibc-RHEL-93320-19.patch. It also brings back the
tst-dl_find_object-static static linking test.
diff --git a/elf/Makefile b/elf/Makefile
index ba11f3a8b81e7218..7b75afda32bcd579 100644
--- a/elf/Makefile
+++ b/elf/Makefile
@@ -281,6 +281,7 @@ tests-static-normal := \
# tests-static-normal
tests-static-internal := \
+ tst-dl_find_object-static \
tst-ptrguard1-static \
tst-stackguard1-static \
tst-tls1-static \
diff --git a/elf/dl-find_object.c b/elf/dl-find_object.c
index 99797580066cdce8..62a8a61f6b6032cf 100644
--- a/elf/dl-find_object.c
+++ b/elf/dl-find_object.c
@@ -464,6 +464,9 @@ __dl_find_object_internal (void *pc1, struct dl_find_object *result)
} /* Transaction retry loop. */
}
rtld_hidden_def (__dl_find_object_internal)
+#ifndef SHARED
+strong_alias (__dl_find_object_internal, _dl_find_object)
+#endif
/* _dlfo_process_initial is called twice. First to compute the array
sizes from the initial loaded mappings. Second to fill in the

View File

@ -0,0 +1,18 @@
Downstream-only patch to remove duplicate Makefile target entry for
tst-dlmopen1mod.
Conflict resolution in glibc-RHEL-93320-4.patch led to the duplicate
tst-dlmopen1mod entry. Therefore remove it.
diff --git a/elf/Makefile b/elf/Makefile
index 190ee83120c498a3..e8587b10c1a8bedd 100644
--- a/elf/Makefile
+++ b/elf/Makefile
@@ -811,7 +811,6 @@ modules-names = \
tst-dl_find_object-mod7 \
tst-dl_find_object-mod8 \
tst-dl_find_object-mod9 \
- tst-dlmopen1mod \
tst-dlclose-lazy-mod1 \
tst-dlclose-lazy-mod2 \
tst-dlmopen-dlerror-mod \

View File

@ -0,0 +1,36 @@
In glibc-RHEL-93320-9.patch, the elf/tst-dl_find_object test was
added. It was later disabled in glibc-RHEL-93320-19.patch,
as noted in the patch description. This was missed when the
patch was re-added in glibc-RHEL-107564.patch. The test remains
valuable because we do not test _dl_find_object in libc.a elsewhere
in the glibc build, so this patch disables just the failing subtest,
and puts an explanation directly into the test.
diff --git a/elf/tst-dl_find_object.c b/elf/tst-dl_find_object.c
index d8c217545d116453..6bfda7bd23a34c0d 100644
--- a/elf/tst-dl_find_object.c
+++ b/elf/tst-dl_find_object.c
@@ -231,6 +231,7 @@ do_test (void)
check (map_start, &expected, __LINE__);
check (map_end, &expected, __LINE__);
+#ifndef FOR_STATIC
/* Check that _dl_find_object works from a shared object (mostly for
static dlopen). */
__typeof (_dl_find_object) *find_object
@@ -238,6 +239,15 @@ do_test (void)
struct dl_find_object actual;
TEST_COMPARE (find_object (&main_program_data, &actual), 0);
check (&main_program_data, &actual, __LINE__); /* Reversed check. */
+#else
+ /* Downstream, _dl_find_object does not work after static dlopen
+ because the ld.so copy loaded as part of static dlopen is not
+ initialized. Upstream, we redirect _dl_find_object to the
+ statically version from the main program by patching a function
+ pointer in _rtld_global_ro. Downstream, we have not changed the
+ layout of _rtld_global_ro, so this patching is missing. */
+ printf ("info: skipping dlopen-based test for static build\n");
+#endif
return 0;
}

View File

@ -0,0 +1,80 @@
commit 399384e0c8193e31aea014220ccfa24300ae5938
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Thu Aug 14 07:03:20 2025 -0700
x86-64: Add GLIBC_ABI_DT_X86_64_PLT [BZ #33212]
When the linker -z mark-plt option is used to add DT_X86_64_PLT,
DT_X86_64_PLTSZ and DT_X86_64_PLTENT, the r_addend field of the
R_X86_64_JUMP_SLOT relocation stores the offset of the indirect
branch instruction. However, glibc versions without the commit:
commit f8587a61892cbafd98ce599131bf4f103466f084
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Fri May 20 19:21:48 2022 -0700
x86-64: Ignore r_addend for R_X86_64_GLOB_DAT/R_X86_64_JUMP_SLOT
According to x86-64 psABI, r_addend should be ignored for R_X86_64_GLOB_DAT
and R_X86_64_JUMP_SLOT. Since linkers always set their r_addends to 0, we
can ignore their r_addends.
Reviewed-by: Fangrui Song <maskray@google.com>
won't ignore the r_addend value in the R_X86_64_JUMP_SLOT relocation.
Such programs and shared libraries will fail at run-time randomly.
Add GLIBC_ABI_DT_X86_64_PLT version to indicate that glibc is compatible
with DT_X86_64_PLT.
The linker can add the glibc GLIBC_ABI_DT_X86_64_PLT version dependency
whenever -z mark-plt is passed to the linker. The resulting programs and
shared libraries will fail to load at run-time against libc.so without the
GLIBC_ABI_DT_X86_64_PLT version, instead of fail randomly.
This fixes BZ #33212.
Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
Reviewed-by: Sam James <sam@gentoo.org>
Conflicts:
sysdeps/x86_64/Makefile
(Adapt to missing 848746e88ec2aa22e8dea25f2110e2b2c59c712e)
sysdeps/x86_64/Versions
(Adapt to missing 9df8fa397d515dc86ff5565f6c45625e672d539e)
diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile
index 22c5b63ab5afa452..da919b181a1d891f 100644
--- a/sysdeps/x86_64/Makefile
+++ b/sysdeps/x86_64/Makefile
@@ -183,6 +183,14 @@ endif
tests-internal += tst-x86-64-tls-1
+tests-special += $(objpfx)check-dt-x86-64-plt.out
+
+$(objpfx)check-dt-x86-64-plt.out: $(common-objpfx)libc.so
+ LC_ALL=C $(READELF) -V -W $< \
+ | sed -ne '/.gnu.version_d/, /.gnu.version_r/ p' \
+ | grep GLIBC_ABI_DT_X86_64_PLT > $@; \
+ $(evaluate-test)
+generated += check-dt-x86-64-plt.out
endif # $(subdir) == elf
ifeq ($(subdir),csu)
diff --git a/sysdeps/x86_64/Versions b/sysdeps/x86_64/Versions
index e94758b23643a905..6a989ad3b373cdf6 100644
--- a/sysdeps/x86_64/Versions
+++ b/sysdeps/x86_64/Versions
@@ -5,6 +5,11 @@ libc {
GLIBC_2.13 {
__fentry__;
}
+ GLIBC_ABI_DT_X86_64_PLT {
+ # This symbol is used only for empty version map and will be removed
+ # by scripts/versions.awk.
+ __placeholder_only_for_empty_version_map;
+ }
}
libm {
GLIBC_2.1 {

View File

@ -0,0 +1,581 @@
commit 88ce558a31c041778bd14d177ed700f2f268daea
Author: Arjun Shankar <arjun@redhat.com>
Date: Mon Oct 13 15:51:09 2025 +0200
string: Add tests for unique strerror and strsignal strings
strerror, strsignal, and their variants should return unique strings for
each known (and, depending on the function, unknown) error/signal. Add
tests to verify this for strerror, strerror_r (GNU and XSI compliant
variants), and strerror_l (for the C locale), strerrordesc_np,
strsignal, sigabbrev_np, and sigdescr_np.
Co-authored-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Reviewed-by: Florian Weimer <fweimer@redhat.com>
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Conflicts:
sysdeps/unix/sysv/linux/tst-strerror-strings.c: ERR_MAP is not supported
sysdeps/unix/sysv/linux/tst-strerrordesc_np-strings.c: ERR_MAP is not supported
sysdeps/unix/sysv/linux/tst-xsi-strerror_r-mod.c: add define of _ISOMAC
diff --git a/sysdeps/unix/sysv/linux/Makefile b/sysdeps/unix/sysv/linux/Makefile
index 2c5bf42236..d1ac6409b1 100644
--- a/sysdeps/unix/sysv/linux/Makefile 2025-12-10 10:36:31.630511707 -0500
+++ b/sysdeps/unix/sysv/linux/Makefile 2025-12-10 10:40:52.152371031 -0500
@@ -423,3 +423,21 @@ tests += tst-align-clone tst-getpid1 \
# __NR_rseq from the internal system call list.
tests-internal += tst-rseq-nptl
endif
+
+ifeq ($(subdir),string)
+modules-names += \
+ tst-xsi-strerror_r-mod \
+ # modules-names
+
+tests += \
+ tst-sigabbrev_np-strings \
+ tst-strerror-strings \
+ tst-strerror_l-strings \
+ tst-strerror_r-strings \
+ tst-strerrordesc_np-strings \
+ tst-strsignal-strings \
+ tst-xsi-strerror_r-strings \
+ # tests
+
+$(objpfx)tst-xsi-strerror_r-strings: $(objpfx)tst-xsi-strerror_r-mod.so
+endif # $(subdir) == string
--- a/sysdeps/unix/sysv/linux/Makefile
+++ b/sysdeps/unix/sysv/linux/Makefile
diff --git a/sysdeps/unix/sysv/linux/tst-sigabbrev_np-strings.c b/sysdeps/unix/sysv/linux/tst-sigabbrev_np-strings.c
new file mode 100644
index 0000000000..2c0f28c682
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/tst-sigabbrev_np-strings.c
@@ -0,0 +1,61 @@
+/* Test that sig{abbrev,descr}_np return unique strings for each signal.
+
+ Copyright (C) 2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <string.h>
+#include <stdlib.h>
+
+#include <support/support.h>
+#include <support/check.h>
+
+#include "tst-verify-unique-strings.c"
+
+/* Both functions should return NULL for (unknown) signal numbers outside
+ [1, 31]. */
+
+static int
+do_test (void)
+{
+ char *str_sigabbrev[31];
+ char *str_sigdescr[31];
+
+ for (int i = -128; i <= 128; i++)
+ if (i >= 1 && i <= 31)
+ {
+ str_sigabbrev[i-1] = xstrdup (sigabbrev_np (i));
+ str_sigdescr[i-1] = xstrdup (sigdescr_np (i));
+ }
+ else
+ {
+ TEST_VERIFY_EXIT (sigabbrev_np (i) == NULL);
+ TEST_VERIFY_EXIT (sigdescr_np (i) == NULL);
+ }
+
+ VERIFY_UNIQUE_STRINGS (str_sigabbrev, 31);
+ VERIFY_UNIQUE_STRINGS (str_sigdescr, 31);
+
+ for (int i = 0; i < 31; i++)
+ {
+ free (str_sigabbrev[i]);
+ free (str_sigdescr[i]);
+ }
+
+ return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/sysdeps/unix/sysv/linux/tst-strerror-strings.c b/sysdeps/unix/sysv/linux/tst-strerror-strings.c
--- a/sysdeps/unix/sysv/linux/tst-strerror-strings.c 1969-12-31 19:00:00.000000000 -0500
+++ b/sysdeps/unix/sysv/linux/tst-strerror-strings.c 1969-12-31 19:00:00.000000000 -0500
@@ -0,0 +1,78 @@
+/* Test that strerror variants return unique strings for each errnum.
+
+ Copyright (C) 2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <array_length.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include <support/support.h>
+#include <support/check.h>
+
+#include "tst-verify-unique-strings.c"
+
+/* As defined by stdio-common/errlist-data-gen.c */
+#include <errno.h>
+#define N_(msgid) msgid
+const char *const errlist_internal[] __attribute_maybe_unused__ =
+ {
+#define _S(n, str) [n] = str,
+#include <errlist.h>
+#undef _S
+ };
+const int errlist_internal_len = array_length (errlist_internal);
+
+static int
+do_test (void)
+{
+ char *string[2 * errlist_internal_len + 1];
+
+ /* Convenient indexing for error strings from -errlist_internal_len to
+ errlist_internal_len. */
+ char **err_str = string + errlist_internal_len;
+
+ unsetenv ("LANGUAGE");
+
+ xsetlocale (LC_ALL, "C");
+
+ for (int i = -errlist_internal_len; i <= errlist_internal_len; i++)
+ {
+
+#ifdef TEST_STRERROR_VARIANT
+ /* Used for testing strerror_r and strerror_l. */
+ err_str[i] = TEST_STRERROR_VARIANT (i);
+#else
+ err_str[i] = xstrdup (strerror (i));
+#endif
+
+ int is_unknown_error
+ = (strstr (err_str[i], "Unknown error ") == err_str[i]);
+ TEST_VERIFY_EXIT ((i >= 0 && i < errlist_internal_len)
+ || is_unknown_error);
+ }
+
+ /* We check for and fail on duplicate strings. */
+ VERIFY_UNIQUE_STRINGS (string, 2 * errlist_internal_len + 1);
+
+ for (int i = -errlist_internal_len; i <= errlist_internal_len; i++)
+ free (err_str[i]);
+
+ return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/sysdeps/unix/sysv/linux/tst-strerror_l-strings.c b/sysdeps/unix/sysv/linux/tst-strerror_l-strings.c
new file mode 100644
index 0000000000..65c5a2f08c
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/tst-strerror_l-strings.c
@@ -0,0 +1,40 @@
+/* Test that strerror_l returns unique strings for each errnum.
+
+ Copyright (C) 2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <stdlib.h>
+#include <string.h>
+#include <support/support.h>
+#include <support/check.h>
+
+/* newlocale returns (locale_t) 0 upon error, so it makes for a good initial
+ value that is different from any valid locale_t. */
+static locale_t loc = (locale_t) 0;
+
+/* Wrap strerror_l to be plugged into the equivalent strerror test. */
+static char *
+wrap_strerror_l (int errnum)
+{
+ if (loc == (locale_t) 0)
+ loc = xnewlocale (LC_ALL_MASK, "C", (locale_t) 0);
+
+ return xstrdup (strerror_l (errnum, loc));
+}
+
+#define TEST_STRERROR_VARIANT wrap_strerror_l
+#include "tst-strerror-strings.c"
diff --git a/sysdeps/unix/sysv/linux/tst-strerror_r-strings.c b/sysdeps/unix/sysv/linux/tst-strerror_r-strings.c
new file mode 100644
index 0000000000..dea9415eb8
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/tst-strerror_r-strings.c
@@ -0,0 +1,43 @@
+/* Test that GNU strerror_r returns unique strings for each errnum.
+
+ Copyright (C) 2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <string.h>
+#include <support/support.h>
+#include <support/check.h>
+
+/* Wrap strerror_r into a checked variant that can be plugged into the
+ equivalent strerror test. */
+static char *
+test_and_return_strerror_r (int errnum)
+{
+ char buf[1024];
+
+ char *ret = strerror_r (errnum, buf, sizeof (buf));
+
+ /* User supplied buffer used for and only for "Unknown error" strings. */
+ if (strstr (ret, "Unknown error ") == ret)
+ TEST_VERIFY_EXIT (ret == buf);
+ else
+ TEST_VERIFY_EXIT (ret != buf);
+
+ return xstrdup (ret);
+}
+
+#define TEST_STRERROR_VARIANT test_and_return_strerror_r
+#include "tst-strerror-strings.c"
diff --git a/sysdeps/unix/sysv/linux/tst-strerrordesc_np-strings.c b/sysdeps/unix/sysv/linux/tst-strerrordesc_np-strings.c
--- a/sysdeps/unix/sysv/linux/tst-strerrordesc_np-strings.c 1969-12-31 19:00:00.000000000 -0500
+++ b/sysdeps/unix/sysv/linux/tst-strerrordesc_np-strings.c 1969-12-31 19:00:00.000000000 -0500
@@ -0,0 +1,72 @@
+/* Test that strerrordesc_np returns unique strings for each errnum.
+
+ Copyright (C) 2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <array_length.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include <support/support.h>
+#include <support/check.h>
+
+#include "tst-verify-unique-strings.c"
+
+/* As defined by stdio-common/errlist-data-gen.c */
+#include <errno.h>
+#define N_(msgid) msgid
+const char *const errlist_internal[] __attribute_maybe_unused__ =
+ {
+#define _S(n, str) [n] = str,
+#include <errlist.h>
+#undef _S
+ };
+const int errlist_internal_len = array_length (errlist_internal);
+
+static int
+do_test (void)
+{
+ char *string[2 * errlist_internal_len + 1];
+
+ int i, s;
+ for (i = -errlist_internal_len, s = 0; i <= errlist_internal_len; i++)
+ {
+ const char *ret = strerrordesc_np (i);
+
+ /* Range of known errors. Some errnums could still be unused. */
+ if (i >= 0 && i < errlist_internal_len)
+ {
+ if (ret != NULL)
+ {
+ TEST_VERIFY_EXIT (strcasestr (ret, "Unknown error") == NULL);
+ string[s++] = xstrdup (ret);
+ }
+ }
+ else
+ TEST_VERIFY_EXIT (ret == NULL);
+ }
+
+ /* We check for and fail on duplicate strings. */
+ VERIFY_UNIQUE_STRINGS (string, s);
+
+ for (int i = 0; i < s; i++)
+ free (string[i]);
+
+ return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/sysdeps/unix/sysv/linux/tst-strsignal-strings.c b/sysdeps/unix/sysv/linux/tst-strsignal-strings.c
new file mode 100644
index 0000000000..476437aaa4
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/tst-strsignal-strings.c
@@ -0,0 +1,73 @@
+/* Test that strsignal returns unique strings for each signal.
+
+ Copyright (C) 2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <string.h>
+#include <stdlib.h>
+#include <signal.h>
+
+#include <support/support.h>
+#include <support/check.h>
+
+#include "tst-verify-unique-strings.c"
+
+#define NSTRINGS 257
+
+static int
+do_test (void)
+{
+ char *string[NSTRINGS];
+ /* Convenient indexing for signal strings from -128 to 128. */
+ char **sig_str = string + 128;
+
+ unsetenv ("LANGUAGE");
+
+ xsetlocale (LC_ALL, "C");
+
+ for (int i = -128; i <= 128; i++)
+ {
+ sig_str[i] = xstrdup (strsignal (i));
+
+ if (i > 0 && i <= 31)
+ {
+ /* Signals between 1 and 31 are known. */
+ TEST_VERIFY_EXIT (strstr (sig_str[i], "Unknown signal ")
+ == NULL);
+ TEST_VERIFY_EXIT (strstr (sig_str[i], "Real-time signal ")
+ == NULL);
+ }
+ else if ((i <= 0)
+ || (i > 31 && i < SIGRTMIN)
+ || (i > SIGRTMAX))
+ TEST_VERIFY_EXIT (strstr (sig_str[i], "Unknown signal ")
+ == sig_str[i]);
+
+ else if (i >= SIGRTMIN && i <= SIGRTMAX)
+ TEST_VERIFY_EXIT (strstr (sig_str[i], "Real-time signal ")
+ == sig_str[i]);
+ }
+
+ VERIFY_UNIQUE_STRINGS (string, NSTRINGS);
+
+ for (int i = -128; i <= 128; i++)
+ free (sig_str[i]);
+
+ return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/sysdeps/unix/sysv/linux/tst-verify-unique-strings.c b/sysdeps/unix/sysv/linux/tst-verify-unique-strings.c
new file mode 100644
index 0000000000..75a2bc38ff
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/tst-verify-unique-strings.c
@@ -0,0 +1,40 @@
+/* Test that an array of strings does not contain duplicates.
+
+ Copyright (C) 2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <string.h>
+#include <stdlib.h>
+#include <support/check.h>
+
+static int
+compare_strings (const void *a, const void *b)
+{
+ const char *stra = * (const char **) a;
+ const char *strb = * (const char **) b;
+
+ int ret = strcmp (stra, strb);
+
+ if (!ret)
+ FAIL_EXIT1 ("Found duplicate strings: \"%s\"\n", stra);
+
+ return ret;
+}
+
+/* We check for and fail on duplicate strings in the comparator. */
+#define VERIFY_UNIQUE_STRINGS(strarray, narray) \
+ qsort ((strarray), (narray), sizeof (char *), compare_strings)
diff --git a/sysdeps/unix/sysv/linux/tst-xsi-strerror_r-mod.c b/sysdeps/unix/sysv/linux/tst-xsi-strerror_r-mod.c
--- a/sysdeps/unix/sysv/linux/tst-xsi-strerror_r-mod.c 1969-12-31 19:00:00.000000000 -0500
+++ b/sysdeps/unix/sysv/linux/tst-xsi-strerror_r-mod.c 1969-12-31 19:00:00.000000000 -0500
@@ -0,0 +1,31 @@
+/* A module that provides XSI compliant strerror_r for testing.
+
+ Copyright (C) 2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+/* This allows us to compile the rest of the test with GNU extensions. */
+
+#undef _GNU_SOURCE
+#define _DEFAULT_SOURCE
+#define _ISOMAC
+#include <string.h>
+
+int
+xsi_strerror_r (int errnum, char *buf, size_t buflen)
+{
+ return strerror_r (errnum, buf, buflen);
+}
diff --git a/sysdeps/unix/sysv/linux/tst-xsi-strerror_r-strings.c b/sysdeps/unix/sysv/linux/tst-xsi-strerror_r-strings.c
new file mode 100644
index 0000000000..49f05c1c0a
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/tst-xsi-strerror_r-strings.c
@@ -0,0 +1,46 @@
+/* Test that XSI strerror_r returns unique strings for each errnum.
+
+ Copyright (C) 2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <string.h>
+#include <support/support.h>
+#include <support/check.h>
+
+extern int
+xsi_strerror_r (int errnum, char *buf, size_t buflen);
+
+/* Wrap strerror_r into a checked variant that can be plugged into the
+ equivalent strerror test. */
+static char *
+test_and_return_xsi_strerror_r (int errnum)
+{
+ char buf[1024];
+
+ int ret = xsi_strerror_r (errnum, buf, sizeof (buf));
+
+ /* Unknown errnums lead to a positive error returned from strerror_r. */
+ if (strstr (buf, "Unknown error ") == buf)
+ TEST_VERIFY (ret > 0);
+ else
+ TEST_VERIFY (ret == 0);
+
+ return xstrdup (buf);
+}
+
+#define TEST_STRERROR_VARIANT test_and_return_xsi_strerror_r
+#include "tst-strerror-strings.c"

View File

@ -0,0 +1,69 @@
commit 1493622f4f9048ffede3fbedb64695efa49d662a
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Mon Aug 28 12:08:14 2023 -0700
x86: Check the lower byte of EAX of CPUID leaf 2 [BZ #30643]
The old Intel software developer manual specified that the low byte of
EAX of CPUID leaf 2 returned 1 which indicated the number of rounds of
CPUDID leaf 2 was needed to retrieve the complete cache information. The
newer Intel manual has been changed to that it should always return 1
and be ignored. If the lower byte isn't 1, CPUID leaf 2 can't be used.
In this case, we ignore CPUID leaf 2 and use CPUID leaf 4 instead. If
CPUID leaf 4 doesn't contain the cache information, cache information
isn't available at all. This addresses BZ #30643.
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index f950e488cfbe42dd..bd2f2b65f78056ca 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -187,7 +187,7 @@ intel_check_word (int name, unsigned int value, bool *has_level_2,
++round;
}
/* There is no other cache information anywhere else. */
- break;
+ return -1;
}
else
{
@@ -257,28 +257,23 @@ handle_intel (int name, const struct cpu_features *cpu_features)
/* OK, we can use the CPUID instruction to get all info about the
caches. */
- unsigned int cnt = 0;
- unsigned int max = 1;
long int result = 0;
bool no_level_2_or_3 = false;
bool has_level_2 = false;
+ unsigned int eax;
+ unsigned int ebx;
+ unsigned int ecx;
+ unsigned int edx;
+ __cpuid (2, eax, ebx, ecx, edx);
- while (cnt++ < max)
+ /* The low byte of EAX of CPUID leaf 2 should always return 1 and it
+ should be ignored. If it isn't 1, use CPUID leaf 4 instead. */
+ if ((eax & 0xff) != 1)
+ return intel_check_word (name, 0xff, &has_level_2, &no_level_2_or_3,
+ cpu_features);
+ else
{
- unsigned int eax;
- unsigned int ebx;
- unsigned int ecx;
- unsigned int edx;
- __cpuid (2, eax, ebx, ecx, edx);
-
- /* The low byte of EAX in the first round contain the number of
- rounds we have to make. At least one, the one we are already
- doing. */
- if (cnt == 1)
- {
- max = eax & 0xff;
- eax &= 0xffffff00;
- }
+ eax &= 0xffffff00;
/* Process the individual registers' value. */
result = intel_check_word (name, eax, &has_level_2,

View File

@ -1,83 +0,0 @@
commit c00b984fcd53f679ca2dafcd1aee2c89836e6e73
Author: Florian Weimer <fweimer@redhat.com>
Date: Tue Aug 29 08:28:31 2023 +0200
nscd: Skip unusable entries in first pass in prune_cache (bug 30800)
Previously, if an entry was marked unusable for any reason, but had
not timed out yet, the assert would trigger.
One way to get into such state is if a data change is detected during
re-validation of an entry. This causes the entry to be marked as not
usable. If exits nscd soon after that, then the clock jumps
backwards, and nscd restarted, the cache re-validation run after
startup triggers the removed assert.
The change is more complicated than just the removal of the assert
because entries marked as not usable should be garbage-collected in
the second pass. To make this happen, it is necessary to update some
book-keeping data.
Reviewed-by: DJ Delorie <dj@redhat.com>
diff --git a/nscd/cache.c b/nscd/cache.c
index efe4214d953edb30..2fd3f78ebb567bbe 100644
--- a/nscd/cache.c
+++ b/nscd/cache.c
@@ -371,8 +371,11 @@ prune_cache (struct database_dyn *table, time_t now, int fd)
serv2str[runp->type], str, dh->timeout);
}
- /* Check whether the entry timed out. */
- if (dh->timeout < now)
+ /* Check whether the entry timed out. Timed out entries
+ will be revalidated. For unusable records, it is still
+ necessary to record that the bucket needs to be scanned
+ again below. */
+ if (dh->timeout < now || !dh->usable)
{
/* This hash bucket could contain entries which need to
be looked at. */
@@ -384,7 +387,7 @@ prune_cache (struct database_dyn *table, time_t now, int fd)
/* We only have to look at the data of the first entries
since the count information is kept in the data part
which is shared. */
- if (runp->first)
+ if (runp->first && dh->usable)
{
/* At this point there are two choices: we reload the
@@ -400,9 +403,6 @@ prune_cache (struct database_dyn *table, time_t now, int fd)
{
/* Remove the value. */
dh->usable = false;
-
- /* We definitely have some garbage entries now. */
- any = true;
}
else
{
@@ -414,18 +414,15 @@ prune_cache (struct database_dyn *table, time_t now, int fd)
time_t timeout = readdfcts[runp->type] (table, runp, dh);
next_timeout = MIN (next_timeout, timeout);
-
- /* If the entry has been replaced, we might need
- cleanup. */
- any |= !dh->usable;
}
}
+
+ /* If the entry has been replaced, we might need cleanup. */
+ any |= !dh->usable;
}
else
- {
- assert (dh->usable);
- next_timeout = MIN (next_timeout, dh->timeout);
- }
+ /* Entry has not timed out and is usable. */
+ next_timeout = MIN (next_timeout, dh->timeout);
run = runp->next;
}

View File

@ -0,0 +1,39 @@
commit 6f999af332c91035350390ef8af96388b8f4fd2c
Author: Arjun Shankar <arjun@redhat.com>
Date: Mon Aug 18 15:33:13 2025 +0200
support: Handle FUSE_GETXATTR during FUSE FS mount
When testing with some kernel versions, support FUSE infrastructure
encounters a FUSE_GETXATTR request, leading to FUSE tests hanging until
timed out. Therefore, pass FUSE_GETXATTR requests from
support_fuse_handle_mountpoint to support_fuse_handle_directory, and
adjust support_fuse_handle_directory to return ENOSYS so that tests can
proceed.
Reviewed-by: Florian Weimer <fweimer@redhat.com>
diff --git a/support/support_fuse.c b/support/support_fuse.c
index f6c063b549e2c26c..5af2f7d8ab93a5ea 100644
--- a/support/support_fuse.c
+++ b/support/support_fuse.c
@@ -212,6 +212,9 @@ support_fuse_handle_directory (struct support_fuse *f)
support_fuse_reply_prepared (f);
}
return true;
+ case FUSE_GETXATTR:
+ support_fuse_reply_error (f, ENOSYS);
+ return true;
default:
return false;
}
@@ -222,7 +225,8 @@ support_fuse_handle_mountpoint (struct support_fuse *f)
{
TEST_VERIFY (f->inh != NULL);
/* 1 is the root node. */
- if (f->inh->opcode == FUSE_GETATTR && f->inh->nodeid == 1)
+ if ((f->inh->opcode == FUSE_GETATTR || f->inh->opcode == FUSE_GETXATTR)
+ && f->inh->nodeid == 1)
return support_fuse_handle_directory (f);
return false;
}

View File

@ -0,0 +1,710 @@
commit e067e53080386e93dcf8b07e25fb6656f2c8941e
Author: Florian Weimer <fweimer@redhat.com>
Date: Mon Nov 17 11:15:13 2025 +0100
Add COPYINGv3 with the GPL version 3 text
The license is referenced in various headers, so we should ship it.
The text was copied from gnulib commit d64d66cc4897d605f543257dcd0,
file doc/COPYINGv3.
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
Reviewed-by: Collin Funk <collin.funk1@gmail.com>
Signed-off-by: Florian Weimer <fweimer@redhat.com>
Conflicts:
SHARED-FILES (not backported)
diff --git a/COPYINGv3 b/COPYINGv3
new file mode 100644
index 0000000000000000..f288702d2fa16d3c
--- /dev/null
+++ b/COPYINGv3
@@ -0,0 +1,674 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+ The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works. By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users. We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors. You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+ To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights. Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received. You must make sure that they, too, receive
+or can get the source code. And you must show them these terms so they
+know their rights.
+
+ Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+ For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software. For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+ Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so. This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software. The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable. Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products. If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+ Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary. To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ TERMS AND CONDITIONS
+
+ 0. Definitions.
+
+ "This License" refers to version 3 of the GNU General Public License.
+
+ "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+ "The Program" refers to any copyrightable work licensed under this
+License. Each licensee is addressed as "you". "Licensees" and
+"recipients" may be individuals or organizations.
+
+ To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy. The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+ A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+ To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy. Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+ To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies. Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+ An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License. If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+ 1. Source Code.
+
+ The "source code" for a work means the preferred form of the work
+for making modifications to it. "Object code" means any non-source
+form of a work.
+
+ A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+ The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form. A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+ The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities. However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work. For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+ The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+ The Corresponding Source for a work in source code form is that
+same work.
+
+ 2. Basic Permissions.
+
+ All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met. This License explicitly affirms your unlimited
+permission to run the unmodified Program. The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work. This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+ You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force. You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright. Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+ Conveying under any other circumstances is permitted solely under
+the conditions stated below. Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+ No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+ When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+ 4. Conveying Verbatim Copies.
+
+ You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+ You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+ 5. Conveying Modified Source Versions.
+
+ You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+ a) The work must carry prominent notices stating that you modified
+ it, and giving a relevant date.
+
+ b) The work must carry prominent notices stating that it is
+ released under this License and any conditions added under section
+ 7. This requirement modifies the requirement in section 4 to
+ "keep intact all notices".
+
+ c) You must license the entire work, as a whole, under this
+ License to anyone who comes into possession of a copy. This
+ License will therefore apply, along with any applicable section 7
+ additional terms, to the whole of the work, and all its parts,
+ regardless of how they are packaged. This License gives no
+ permission to license the work in any other way, but it does not
+ invalidate such permission if you have separately received it.
+
+ d) If the work has interactive user interfaces, each must display
+ Appropriate Legal Notices; however, if the Program has interactive
+ interfaces that do not display Appropriate Legal Notices, your
+ work need not make them do so.
+
+ A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit. Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+ 6. Conveying Non-Source Forms.
+
+ You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+ a) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by the
+ Corresponding Source fixed on a durable physical medium
+ customarily used for software interchange.
+
+ b) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by a
+ written offer, valid for at least three years and valid for as
+ long as you offer spare parts or customer support for that product
+ model, to give anyone who possesses the object code either (1) a
+ copy of the Corresponding Source for all the software in the
+ product that is covered by this License, on a durable physical
+ medium customarily used for software interchange, for a price no
+ more than your reasonable cost of physically performing this
+ conveying of source, or (2) access to copy the
+ Corresponding Source from a network server at no charge.
+
+ c) Convey individual copies of the object code with a copy of the
+ written offer to provide the Corresponding Source. This
+ alternative is allowed only occasionally and noncommercially, and
+ only if you received the object code with such an offer, in accord
+ with subsection 6b.
+
+ d) Convey the object code by offering access from a designated
+ place (gratis or for a charge), and offer equivalent access to the
+ Corresponding Source in the same way through the same place at no
+ further charge. You need not require recipients to copy the
+ Corresponding Source along with the object code. If the place to
+ copy the object code is a network server, the Corresponding Source
+ may be on a different server (operated by you or a third party)
+ that supports equivalent copying facilities, provided you maintain
+ clear directions next to the object code saying where to find the
+ Corresponding Source. Regardless of what server hosts the
+ Corresponding Source, you remain obligated to ensure that it is
+ available for as long as needed to satisfy these requirements.
+
+ e) Convey the object code using peer-to-peer transmission, provided
+ you inform other peers where the object code and Corresponding
+ Source of the work are being offered to the general public at no
+ charge under subsection 6d.
+
+ A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+ A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling. In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage. For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product. A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+ "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source. The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+ If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information. But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+ The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed. Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+ Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+ 7. Additional Terms.
+
+ "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law. If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+ When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it. (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.) You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+ Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+ a) Disclaiming warranty or limiting liability differently from the
+ terms of sections 15 and 16 of this License; or
+
+ b) Requiring preservation of specified reasonable legal notices or
+ author attributions in that material or in the Appropriate Legal
+ Notices displayed by works containing it; or
+
+ c) Prohibiting misrepresentation of the origin of that material, or
+ requiring that modified versions of such material be marked in
+ reasonable ways as different from the original version; or
+
+ d) Limiting the use for publicity purposes of names of licensors or
+ authors of the material; or
+
+ e) Declining to grant rights under trademark law for use of some
+ trade names, trademarks, or service marks; or
+
+ f) Requiring indemnification of licensors and authors of that
+ material by anyone who conveys the material (or modified versions of
+ it) with contractual assumptions of liability to the recipient, for
+ any liability that these contractual assumptions directly impose on
+ those licensors and authors.
+
+ All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10. If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term. If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+ If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+ Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+ 8. Termination.
+
+ You may not propagate or modify a covered work except as expressly
+provided under this License. Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+ However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+ Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+ Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License. If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+ 9. Acceptance Not Required for Having Copies.
+
+ You are not required to accept this License in order to receive or
+run a copy of the Program. Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance. However,
+nothing other than this License grants you permission to propagate or
+modify any covered work. These actions infringe copyright if you do
+not accept this License. Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+ 10. Automatic Licensing of Downstream Recipients.
+
+ Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License. You are not responsible
+for enforcing compliance by third parties with this License.
+
+ An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations. If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+ You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License. For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+ 11. Patents.
+
+ A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based. The
+work thus licensed is called the contributor's "contributor version".
+
+ A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version. For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+ In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement). To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+ If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients. "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+ If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+ A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License. You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+ Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+ 12. No Surrender of Others' Freedom.
+
+ If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all. For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+ 13. Use with the GNU Affero General Public License.
+
+ Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work. The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+
+ 14. Revised Versions of this License.
+
+ The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+ Each version is given a distinguishing version number. If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation. If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+ If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+ Later license versions may give you additional or different
+permissions. However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+ 15. Disclaimer of Warranty.
+
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+ 16. Limitation of Liability.
+
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+ 17. Interpretation of Sections 15 and 16.
+
+ If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+ If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+ <program> Copyright (C) <year> <name of author>
+ This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+ You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<https://www.gnu.org/licenses/>.
+
+ The GNU General Public License does not permit incorporating your program
+into proprietary programs. If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library. If this is what you want to do, use the GNU Lesser General
+Public License instead of this License. But first, please read
+<https://www.gnu.org/licenses/why-not-lgpl.html>.
diff --git a/scripts/update-copyrights b/scripts/update-copyrights
index a56cc03f1d9cf3d1..9d8bc5de51b8dc81 100755
--- a/scripts/update-copyrights
+++ b/scripts/update-copyrights
@@ -44,7 +44,7 @@ files=$(find . -type f | sed 's|^\./||' | grep -v '^\.git/')
for f in $files; do
case $f in
- COPYING | COPYING.LIB | manual/fdl-1.3.texi | manual/lgpl-2.1.texi)
+ COPYING* | manual/fdl-1.3.texi | manual/lgpl-2.1.texi)
# Licenses imported verbatim from FSF sources.
;;
manual/texinfo.tex | scripts/config.guess | scripts/config.sub \

View File

@ -0,0 +1,31 @@
commit 78fdb2d6b1c34ea8e779fd48f9436dfbd50b6387
Author: Florian Weimer <fweimer@redhat.com>
Date: Thu Jan 8 12:35:08 2026 +0100
Switch currency symbol for the bg_BG locale to euro
Bulgaria joined the eurozone on 2026-01-01.
Suggested-by: Йордан Гигов <jgigov@abv.bg>
Reviewed-by: Collin Funk <collin.funk1@gmail.com>
Conflicts:
localedata/locales/bg_BG
(Adjust for missing UTF-8 conversion and separator/grouping
fixes downstream)
diff --git a/localedata/locales/bg_BG b/localedata/locales/bg_BG
index 7dcd03fafc1fb394..c8a2e6a70857c80e 100644
--- a/localedata/locales/bg_BG
+++ b/localedata/locales/bg_BG
@@ -248,8 +248,8 @@ reorder-end
END LC_COLLATE
LC_MONETARY
-int_curr_symbol "BGN "
-currency_symbol "<U043B><U0432>."
+int_curr_symbol "EUR "
+currency_symbol "<U20AC>"
mon_decimal_point ","
mon_thousands_sep "<U202F>"
mon_grouping 3;3

View File

@ -1,72 +0,0 @@
commit 2aa0974d2573441bffd596b07bff8698b1f2f18c
Author: Florian Weimer <fweimer@redhat.com>
Date: Fri Oct 20 14:29:50 2023 +0200
elf: ldconfig should skip temporary files created by package managers
This avoids crashes due to partially written files, after a package
update is interrupted.
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Conflicts:
elf/ldconfig.c
(missing alloca removal downstream)
diff --git a/elf/ldconfig.c b/elf/ldconfig.c
index 8c66d7e5426d8cc4..51de08f91fbaf093 100644
--- a/elf/ldconfig.c
+++ b/elf/ldconfig.c
@@ -771,6 +771,31 @@ struct dlib_entry
struct dlib_entry *next;
};
+/* Skip some temporary DSO files. These files may be partially written
+ and lead to ldconfig crashes when examined. */
+static bool
+skip_dso_based_on_name (const char *name, size_t len)
+{
+ /* Skip temporary files created by the prelink program. Files with
+ names like these are never really DSOs we want to look at. */
+ if (len >= sizeof (".#prelink#") - 1)
+ {
+ if (strcmp (name + len - sizeof (".#prelink#") + 1,
+ ".#prelink#") == 0)
+ return true;
+ if (len >= sizeof (".#prelink#.XXXXXX") - 1
+ && memcmp (name + len - sizeof (".#prelink#.XXXXXX")
+ + 1, ".#prelink#.", sizeof (".#prelink#.") - 1) == 0)
+ return true;
+ }
+ /* Skip temporary files created by RPM. */
+ if (memchr (name, len, ';') != NULL)
+ return true;
+ /* Skip temporary files created by dpkg. */
+ if (len > 4 && memcmp (name + len - 4, ".tmp", 4) == 0)
+ return true;
+ return false;
+}
static void
search_dir (const struct dir_entry *entry)
@@ -849,18 +874,8 @@ search_dir (const struct dir_entry *entry)
continue;
size_t len = strlen (direntry->d_name);
- /* Skip temporary files created by the prelink program. Files with
- names like these are never really DSOs we want to look at. */
- if (len >= sizeof (".#prelink#") - 1)
- {
- if (strcmp (direntry->d_name + len - sizeof (".#prelink#") + 1,
- ".#prelink#") == 0)
- continue;
- if (len >= sizeof (".#prelink#.XXXXXX") - 1
- && memcmp (direntry->d_name + len - sizeof (".#prelink#.XXXXXX")
- + 1, ".#prelink#.", sizeof (".#prelink#.") - 1) == 0)
- continue;
- }
+ if (skip_dso_based_on_name (direntry->d_name, len))
+ continue;
len += strlen (entry->path) + 2;
if (len > file_name_len)
{

View File

@ -1,61 +0,0 @@
commit cfb5a97a93ea656e3b2263e42142a4032986d9ba
Author: Florian Weimer <fweimer@redhat.com>
Date: Mon Oct 23 12:53:16 2023 +0200
ldconfig: Fixes for skipping temporary files.
Arguments to a memchr call were swapped, causing incorrect skipping
of files.
Files related to dpkg have different names: they actually end in
.dpkg-new and .dpkg-tmp, not .tmp as I mistakenly assumed.
Fixes commit 2aa0974d2573441bffd59 ("elf: ldconfig should skip
temporary files created by package managers").
diff --git a/elf/ldconfig.c b/elf/ldconfig.c
index 51de08f91fbaf093..fb19dd68d41c07a4 100644
--- a/elf/ldconfig.c
+++ b/elf/ldconfig.c
@@ -771,6 +771,17 @@ struct dlib_entry
struct dlib_entry *next;
};
+/* Return true if the N bytes at NAME end with with the characters in
+ the string SUFFIX. (NAME[N + 1] does not have to be a null byte.)
+ Expected to be called with a string literal for SUFFIX. */
+static inline bool
+endswithn (const char *name, size_t n, const char *suffix)
+{
+ return (n >= strlen (suffix)
+ && memcmp (name + n - strlen (suffix), suffix,
+ strlen (suffix)) == 0);
+}
+
/* Skip some temporary DSO files. These files may be partially written
and lead to ldconfig crashes when examined. */
static bool
@@ -780,8 +791,7 @@ skip_dso_based_on_name (const char *name, size_t len)
names like these are never really DSOs we want to look at. */
if (len >= sizeof (".#prelink#") - 1)
{
- if (strcmp (name + len - sizeof (".#prelink#") + 1,
- ".#prelink#") == 0)
+ if (endswithn (name, len, ".#prelink#"))
return true;
if (len >= sizeof (".#prelink#.XXXXXX") - 1
&& memcmp (name + len - sizeof (".#prelink#.XXXXXX")
@@ -789,10 +799,11 @@ skip_dso_based_on_name (const char *name, size_t len)
return true;
}
/* Skip temporary files created by RPM. */
- if (memchr (name, len, ';') != NULL)
+ if (memchr (name, ';', len) != NULL)
return true;
/* Skip temporary files created by dpkg. */
- if (len > 4 && memcmp (name + len - 4, ".tmp", 4) == 0)
+ if (endswithn (name, len, ".dpkg-new")
+ || endswithn (name, len, ".dpkg-tmp"))
return true;
return false;
}

View File

@ -0,0 +1,86 @@
commit c9188d333717d3ceb7e3020011651f424f749f93
Author: Siddhesh Poyarekar <siddhesh@gotplt.org>
Date: Thu Jan 15 06:06:40 2026 -0500
memalign: reinstate alignment overflow check (CVE-2026-0861)
The change to cap valid sizes to PTRDIFF_MAX inadvertently dropped the
overflow check for alignment in memalign functions, _mid_memalign and
_int_memalign. Reinstate the overflow check in _int_memalign, aligned
with the PTRDIFF_MAX change since that is directly responsible for the
CVE. The missing _mid_memalign check is not relevant (and does not have
a security impact) and may need a different approach to fully resolve,
so it has been omitted.
CVE-Id: CVE-2026-0861
Vulnerable-Commit: 9bf8e29ca136094f73f69f725f15c51facc97206
Reported-by: Igor Morgenstern, Aisle Research
Fixes: BZ #33796
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
Signed-off-by: Siddhesh Poyarekar <siddhesh@gotplt.org>
Conflicts:
malloc/malloc.c
(old checked_request2size interface downstream)
diff --git a/malloc/malloc.c b/malloc/malloc.c
index fe80b8239756a7c9..8d2ede60d93e433f 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -4815,7 +4815,7 @@ _int_memalign (mstate av, size_t alignment, size_t bytes)
- if (!checked_request2size (bytes, &nb))
+ if (!checked_request2size (bytes, &nb) || alignment > PTRDIFF_MAX)
{
__set_errno (ENOMEM);
return NULL;
@@ -4826,8 +4826,10 @@ _int_memalign (mstate av, size_t alignment, size_t bytes)
request, and then possibly free the leading and trailing space.
*/
- /* Call malloc with worst case padding to hit alignment. */
-
+ /* Call malloc with worst case padding to hit alignment. ALIGNMENT is a
+ power of 2, so it tops out at (PTRDIFF_MAX >> 1) + 1, leaving plenty of
+ space to add MINSIZE and whatever checked_request2size adds to BYTES to
+ get NB. Consequently, total below also does not overflow. */
m = (char *) (_int_malloc (av, nb + alignment + MINSIZE));
if (m == 0)
diff --git a/malloc/tst-malloc-too-large.c b/malloc/tst-malloc-too-large.c
index 328b4a2a4fd72cf4..593381520c40cb84 100644
--- a/malloc/tst-malloc-too-large.c
+++ b/malloc/tst-malloc-too-large.c
@@ -151,7 +151,6 @@ test_large_allocations (size_t size)
}
-static long pagesize;
/* This function tests the following aligned memory allocation functions
using several valid alignments and precedes each allocation test with a
@@ -170,8 +169,8 @@ test_large_aligned_allocations (size_t size)
/* All aligned memory allocation functions expect an alignment that is a
power of 2. Given this, we test each of them with every valid
- alignment from 1 thru PAGESIZE. */
- for (align = 1; align <= pagesize; align *= 2)
+ alignment for the type of ALIGN, i.e. until it wraps to 0. */
+ for (align = 1; align > 0; align <<= 1)
{
test_setup ();
#if __GNUC_PREREQ (7, 0)
@@ -264,11 +263,6 @@ do_test (void)
DIAG_IGNORE_NEEDS_COMMENT (7, "-Walloc-size-larger-than=");
#endif
- /* Aligned memory allocation functions need to be tested up to alignment
- size equivalent to page size, which should be a power of 2. */
- pagesize = sysconf (_SC_PAGESIZE);
- TEST_VERIFY_EXIT (powerof2 (pagesize));
-
/* Loop 1: Ensure that all allocations with SIZE close to SIZE_MAX, i.e.
in the range (SIZE_MAX - 2^14, SIZE_MAX], fail.

View File

@ -1,70 +0,0 @@
commit e56ff82d5034ec66c6a78f517af6faa427f65b0b
Author: Carlos O'Donell <carlos@redhat.com>
Date: Thu Jan 15 15:09:38 2026 -0500
resolv: Fix NSS DNS backend for getnetbyaddr (CVE-2026-0915)
The default network value of zero for net was never tested for and
results in a DNS query constructed from uninitialized stack bytes.
The solution is to provide a default query for the case where net
is zero.
Adding a test case for this was straight forward given the existence of
tst-resolv-network and if the test is added without the fix you observe
this failure:
FAIL: resolv/tst-resolv-network
original exit status 1
error: tst-resolv-network.c:174: invalid QNAME: \146\218\129\128
error: 1 test failures
With a random QNAME resulting from the use of uninitialized stack bytes.
After the fix the test passes.
Additionally verified using wireshark before and after to ensure
on-the-wire bytes for the DNS query were as expected.
No regressions on x86_64.
Reviewed-by: Florian Weimer <fweimer@redhat.com>
diff --git a/resolv/nss_dns/dns-network.c b/resolv/nss_dns/dns-network.c
index 61bddd754f2d73c0..cbab554f1e761016 100644
--- a/resolv/nss_dns/dns-network.c
+++ b/resolv/nss_dns/dns-network.c
@@ -207,6 +207,10 @@ _nss_dns_getnetbyaddr_r (uint32_t net, int type, struct netent *result,
sprintf (qbuf, "%u.%u.%u.%u.in-addr.arpa", net_bytes[3], net_bytes[2],
net_bytes[1], net_bytes[0]);
break;
+ default:
+ /* Default network (net is originally zero). */
+ strcpy (qbuf, "0.0.0.0.in-addr.arpa");
+ break;
}
net_buffer.buf = orig_net_buffer = (querybuf *) alloca (1024);
diff --git a/resolv/tst-resolv-network.c b/resolv/tst-resolv-network.c
index 4b862d57e65276e5..afc1874160179fcc 100644
--- a/resolv/tst-resolv-network.c
+++ b/resolv/tst-resolv-network.c
@@ -46,6 +46,9 @@ handle_code (const struct resolv_response_context *ctx,
{
switch (code)
{
+ case 0:
+ send_ptr (b, qname, qclass, qtype, "0.in-addr.arpa");
+ break;
case 1:
send_ptr (b, qname, qclass, qtype, "1.in-addr.arpa");
break;
@@ -259,6 +262,9 @@ do_test (void)
"error: NO_RECOVERY\n");
/* Lookup by address, success cases. */
+ check_reverse (0,
+ "name: 0.in-addr.arpa\n"
+ "net: 0x00000000\n");
check_reverse (1,
"name: 1.in-addr.arpa\n"
"net: 0x00000001\n");

View File

@ -0,0 +1,70 @@
commit e56ff82d5034ec66c6a78f517af6faa427f65b0b
Author: Carlos O'Donell <carlos@redhat.com>
Date: Thu Jan 15 15:09:38 2026 -0500
resolv: Fix NSS DNS backend for getnetbyaddr (CVE-2026-0915)
The default network value of zero for net was never tested for and
results in a DNS query constructed from uninitialized stack bytes.
The solution is to provide a default query for the case where net
is zero.
Adding a test case for this was straight forward given the existence of
tst-resolv-network and if the test is added without the fix you observe
this failure:
FAIL: resolv/tst-resolv-network
original exit status 1
error: tst-resolv-network.c:174: invalid QNAME: \146\218\129\128
error: 1 test failures
With a random QNAME resulting from the use of uninitialized stack bytes.
After the fix the test passes.
Additionally verified using wireshark before and after to ensure
on-the-wire bytes for the DNS query were as expected.
No regressions on x86_64.
Reviewed-by: Florian Weimer <fweimer@redhat.com>
diff --git a/resolv/nss_dns/dns-network.c b/resolv/nss_dns/dns-network.c
index 74b78959c230a8e3..1912fef6f2284c2d 100644
--- a/resolv/nss_dns/dns-network.c
+++ b/resolv/nss_dns/dns-network.c
@@ -208,6 +208,10 @@ _nss_dns_getnetbyaddr_r (uint32_t net, int type, struct netent *result,
sprintf (qbuf, "%u.%u.%u.%u.in-addr.arpa", net_bytes[3], net_bytes[2],
net_bytes[1], net_bytes[0]);
break;
+ default:
+ /* Default network (net is originally zero). */
+ strcpy (qbuf, "0.0.0.0.in-addr.arpa");
+ break;
}
net_buffer.buf = orig_net_buffer = (querybuf *) alloca (1024);
diff --git a/resolv/tst-resolv-network.c b/resolv/tst-resolv-network.c
index 156cb5692b6a9a28..febb447f60814498 100644
--- a/resolv/tst-resolv-network.c
+++ b/resolv/tst-resolv-network.c
@@ -46,6 +46,9 @@ handle_code (const struct resolv_response_context *ctx,
{
switch (code)
{
+ case 0:
+ send_ptr (b, qname, qclass, qtype, "0.in-addr.arpa");
+ break;
case 1:
send_ptr (b, qname, qclass, qtype, "1.in-addr.arpa");
break;
@@ -265,6 +268,9 @@ do_test (void)
"error: TRY_AGAIN\n");
/* Lookup by address, success cases. */
+ check_reverse (0,
+ "name: 0.in-addr.arpa\n"
+ "net: 0x00000000\n");
check_reverse (1,
"name: 1.in-addr.arpa\n"
"net: 0x00000001\n");

View File

@ -1,84 +0,0 @@
commit 7b543dcdf97d07fd4346feb17916e08fe83ad0ae
Author: Florian Weimer <fweimer@redhat.com>
Date: Thu Jan 15 22:29:46 2026 +0100
elf: Ignore LD_PROFILE if LD_PROFILE_OUTPUT is not set (bug 33797)
The previous default for LD_PROFILE_OUTPUT, /var/tmp, is insecure
because it's typically a 1777 directory, and other systems could
place malicious files there which interfere with execution.
Requiring the user to specify a profiling directory mitigates
the impact of bug 33797. Clear LD_PROFILE_OUTPUT alongside
with LD_PROFILE.
Rework the test not to use predictable file names.
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
Conflicts:
elf/rtld.c
(different implementation of environment variable filtering
downstream, incorporate changes from upstream commit
4a133885a7c8ae7ebe34e36fcdb353f8e94c810f, adjust for
GLRO(_dl_profile_output) use in glibc-rh2047981-44.patch)
elf/tst-env-setuid.c
(no LD_PROFILE test downstream)
diff --git a/elf/rtld.c b/elf/rtld.c
index 48698f93a4873a6d..848f6f51d093f313 100644
--- a/elf/rtld.c
+++ b/elf/rtld.c
@@ -2684,11 +2684,9 @@ process_envvars (struct dl_main_state *state)
char *envline;
char *debug_output = NULL;
- /* This is the default place for profiling data file. As a side
- effect, this marks ld.so as initialized, so that the rtld_active
- function returns true from now on. */
- GLRO(dl_profile_output)
- = &"/var/tmp\0/var/profile"[__libc_enable_secure ? 9 : 0];
+ /* This marks ld.so as initialized, so that the rtld_active function
+ returns true from now on. "" means no default. */
+ GLRO(dl_profile_output) = "";
while ((envline = _dl_next_ld_env_entry (&runp)) != NULL)
{
@@ -2738,7 +2736,8 @@ process_envvars (struct dl_main_state *state)
}
/* Which shared object shall be profiled. */
- if (memcmp (envline, "PROFILE", 7) == 0 && envline[8] != '\0')
+ if (!__libc_enable_secure
+ && memcmp (envline, "PROFILE", 7) == 0 && envline[8] != '\0')
GLRO(dl_profile) = &envline[8];
break;
@@ -2899,6 +2898,15 @@ process_envvars (struct dl_main_state *state)
/* We use standard output if opening the file failed. */
GLRO(dl_debug_fd) = STDOUT_FILENO;
}
+
+ /* There is no fixed, safe directory to store profiling data, so
+ activate LD_PROFILE only if LD_PROFILE_OUTPUT is set as well. */
+ if (GLRO(dl_profile) != NULL && *GLRO(dl_profile_output) == '\0')
+ {
+ _dl_error_printf ("\
+warning: LD_PROFILE ignored because LD_PROFILE_OUTPUT not specified\n");
+ GLRO(dl_profile) = NULL;
+ }
}
#if HP_TIMING_INLINE
diff --git a/sysdeps/generic/unsecvars.h b/sysdeps/generic/unsecvars.h
index 5ea8a4a259ef753c..0b84642f71ae9351 100644
--- a/sysdeps/generic/unsecvars.h
+++ b/sysdeps/generic/unsecvars.h
@@ -21,6 +21,7 @@
"LD_ORIGIN_PATH\0" \
"LD_PRELOAD\0" \
"LD_PROFILE\0" \
+ "LD_PROFILE_OUTPUT\0" \
"LD_SHOW_AUXV\0" \
"LD_USE_LOAD_BIAS\0" \
"LOCALDOMAIN\0" \

View File

@ -1,174 +0,0 @@
commit 80cc58ea2de214f85b0a1d902a3b668ad2ecb302
Author: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Date: Thu Jan 15 10:32:19 2026 -0300
posix: Reset wordexp_t fields with WRDE_REUSE (CVE-2025-15281 / BZ 33814)
The wordexp fails to properly initialize the input wordexp_t when
WRDE_REUSE is used. The wordexp_t struct is properly freed, but
reuses the old wc_wordc value and updates the we_wordv in the
wrong position. A later wordfree will then call free with an
invalid pointer.
Checked on x86_64-linux-gnu and i686-linux-gnu.
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
Conflicts:
posix/Makefile
(Makefile not sorted downstream)
diff --git a/posix/Makefile b/posix/Makefile
index 42a0290370b40fd9..e546b8d667b9c6c4 100644
--- a/posix/Makefile
+++ b/posix/Makefile
@@ -96,7 +96,8 @@ tests := test-errno tstgetopt testfnm runtests runptests \
tst-posix_fadvise tst-posix_fadvise64 \
tst-sysconf-empty-chroot tst-glob_symlinks tst-fexecve \
tst-glob-tilde test-ssize-max tst-spawn4 bug-regex37 \
- bug-regex38 tst-regcomp-truncated tst-regcomp-bracket-free
+ bug-regex38 tst-regcomp-truncated tst-regcomp-bracket-free \
+ tst-wordexp-reuse
tests-internal := bug-regex5 bug-regex20 bug-regex33 \
tst-rfc3484 tst-rfc3484-2 tst-rfc3484-3 \
tst-glob_lstat_compat tst-spawn4-compat
@@ -128,7 +129,8 @@ generated += $(addprefix wordexp-test-result, 1 2 3 4 5 6 7 8 9 10) \
tst-boost.mtrace bug-ga2.mtrace bug-ga2-mem.out \
bug-glob2.mtrace bug-glob2-mem.out tst-vfork3-mem.out \
tst-vfork3.mtrace getconf.speclist tst-fnmatch-mem.out \
- tst-fnmatch.mtrace bug-regex36.mtrace
+ tst-fnmatch.mtrace bug-regex36.mtrace \
+ tst-wordexp-reuse-mem.out tst-wordexp-reuse.mtrace
ifeq ($(run-built-tests),yes)
ifeq (yes,$(build-shared))
@@ -146,7 +148,8 @@ tests-special += $(objpfx)bug-regex2-mem.out $(objpfx)bug-regex14-mem.out \
$(objpfx)tst-boost-mem.out $(objpfx)tst-getconf.out \
$(objpfx)bug-glob2-mem.out $(objpfx)tst-vfork3-mem.out \
$(objpfx)tst-fnmatch-mem.out $(objpfx)bug-regex36-mem.out \
- $(objpfx)tst-glob-tilde-mem.out
+ $(objpfx)tst-glob-tilde-mem.out \
+ $(objpfx)tst-wordexp-reuse.out
xtests-special += $(objpfx)bug-ga2-mem.out
endif
@@ -387,3 +390,10 @@ $(objpfx)posix-conf-vars-def.h: $(..)scripts/gen-posix-conf-vars.awk \
$(make-target-directory)
$(AWK) -f $(filter-out Makefile, $^) > $@.tmp
mv -f $@.tmp $@
+
+tst-wordexp-reuse-ENV += MALLOC_TRACE=$(objpfx)tst-wordexp-reuse.mtrace \
+ LD_PRELOAD=$(common-objpfx)/malloc/libc_malloc_debug.so
+
+$(objpfx)tst-wordexp-reuse-mem.out: $(objpfx)tst-wordexp-reuse.out
+ $(common-objpfx)malloc/mtrace $(objpfx)tst-wordexp-reuse.mtrace > $@; \
+ $(evaluate-test)
diff --git a/posix/tst-wordexp-reuse.c b/posix/tst-wordexp-reuse.c
new file mode 100644
index 0000000000000000..3926b9f5576750ac
--- /dev/null
+++ b/posix/tst-wordexp-reuse.c
@@ -0,0 +1,89 @@
+/* Test for wordexp with WRDE_REUSE flag.
+ Copyright (C) 2026 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <wordexp.h>
+#include <mcheck.h>
+
+#include <support/check.h>
+
+static int
+do_test (void)
+{
+ mtrace ();
+
+ {
+ wordexp_t p = { 0 };
+ TEST_COMPARE (wordexp ("one", &p, 0), 0);
+ TEST_COMPARE (p.we_wordc, 1);
+ TEST_COMPARE_STRING (p.we_wordv[0], "one");
+ TEST_COMPARE (wordexp ("two", &p, WRDE_REUSE), 0);
+ TEST_COMPARE (p.we_wordc, 1);
+ TEST_COMPARE_STRING (p.we_wordv[0], "two");
+ wordfree (&p);
+ }
+
+ {
+ wordexp_t p = { .we_offs = 2 };
+ TEST_COMPARE (wordexp ("one", &p, 0), 0);
+ TEST_COMPARE (p.we_wordc, 1);
+ TEST_COMPARE_STRING (p.we_wordv[0], "one");
+ TEST_COMPARE (wordexp ("two", &p, WRDE_REUSE | WRDE_DOOFFS), 0);
+ TEST_COMPARE (p.we_wordc, 1);
+ TEST_COMPARE_STRING (p.we_wordv[p.we_offs + 0], "two");
+ wordfree (&p);
+ }
+
+ {
+ wordexp_t p = { 0 };
+ TEST_COMPARE (wordexp ("one", &p, 0), 0);
+ TEST_COMPARE (p.we_wordc, 1);
+ TEST_COMPARE_STRING (p.we_wordv[0], "one");
+ TEST_COMPARE (wordexp ("two", &p, WRDE_REUSE | WRDE_APPEND), 0);
+ TEST_COMPARE (p.we_wordc, 1);
+ TEST_COMPARE_STRING (p.we_wordv[0], "two");
+ wordfree (&p);
+ }
+
+ {
+ wordexp_t p = { .we_offs = 2 };
+ TEST_COMPARE (wordexp ("one", &p, WRDE_DOOFFS), 0);
+ TEST_COMPARE (p.we_wordc, 1);
+ TEST_COMPARE_STRING (p.we_wordv[p.we_offs + 0], "one");
+ TEST_COMPARE (wordexp ("two", &p, WRDE_REUSE
+ | WRDE_DOOFFS), 0);
+ TEST_COMPARE (p.we_wordc, 1);
+ TEST_COMPARE_STRING (p.we_wordv[p.we_offs + 0], "two");
+ wordfree (&p);
+ }
+
+ {
+ wordexp_t p = { .we_offs = 2 };
+ TEST_COMPARE (wordexp ("one", &p, WRDE_DOOFFS), 0);
+ TEST_COMPARE (p.we_wordc, 1);
+ TEST_COMPARE_STRING (p.we_wordv[p.we_offs + 0], "one");
+ TEST_COMPARE (wordexp ("two", &p, WRDE_REUSE
+ | WRDE_DOOFFS | WRDE_APPEND), 0);
+ TEST_COMPARE (p.we_wordc, 1);
+ TEST_COMPARE_STRING (p.we_wordv[p.we_offs + 0], "two");
+ wordfree (&p);
+ }
+
+ return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/posix/wordexp.c b/posix/wordexp.c
index 4061969c720f1f34..0f503b1877d2ce5b 100644
--- a/posix/wordexp.c
+++ b/posix/wordexp.c
@@ -2241,7 +2241,9 @@ wordexp (const char *words, wordexp_t *pwordexp, int flags)
{
/* Minimal implementation of WRDE_REUSE for now */
wordfree (pwordexp);
+ old_word.we_wordc = 0;
old_word.we_wordv = NULL;
+ pwordexp->we_wordc = 0;
}
if ((flags & WRDE_APPEND) == 0)

View File

@ -1,29 +0,0 @@
commit bed2db02f3183e93f21d506786c5f884a1dec9e7
Author: Florian Weimer <fweimer@redhat.com>
Date: Mon Jan 26 17:12:37 2026 +0100
posix: Run tst-wordexp-reuse-mem test
The test was not properly scheduled for execution with a Makefile
dependency.
Fixes commit 80cc58ea2de214f85b0a1d902a3b668ad2ecb302 ("posix: Reset
wordexp_t fields with WRDE_REUSE (CVE-2025-15281 / BZ 33814").
Conflicts:
posix/Makefile
(Makefile not sorted downstream)
diff --git a/posix/Makefile b/posix/Makefile
index e546b8d667b9c6c4..b399b1dab0a8cb9c 100644
--- a/posix/Makefile
+++ b/posix/Makefile
@@ -149,7 +149,7 @@ tests-special += $(objpfx)bug-regex2-mem.out $(objpfx)bug-regex14-mem.out \
$(objpfx)bug-glob2-mem.out $(objpfx)tst-vfork3-mem.out \
$(objpfx)tst-fnmatch-mem.out $(objpfx)bug-regex36-mem.out \
$(objpfx)tst-glob-tilde-mem.out \
- $(objpfx)tst-wordexp-reuse.out
+ $(objpfx)tst-wordexp-reuse-mem.out
xtests-special += $(objpfx)bug-ga2-mem.out
endif

View File

@ -0,0 +1,172 @@
commit 80cc58ea2de214f85b0a1d902a3b668ad2ecb302
Author: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Date: Thu Jan 15 10:32:19 2026 -0300
posix: Reset wordexp_t fields with WRDE_REUSE (CVE-2025-15281 / BZ 33814)
The wordexp fails to properly initialize the input wordexp_t when
WRDE_REUSE is used. The wordexp_t struct is properly freed, but
reuses the old wc_wordc value and updates the we_wordv in the
wrong position. A later wordfree will then call free with an
invalid pointer.
Checked on x86_64-linux-gnu and i686-linux-gnu.
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
Conflicts:
posix/Makefile
(Makefile not sorted downstream)
diff --git a/posix/Makefile b/posix/Makefile
index 562e8cb85fdb6f43..2dfd687430b205a8 100644
--- a/posix/Makefile
+++ b/posix/Makefile
@@ -113,7 +113,8 @@ tests := test-errno tstgetopt testfnm runtests runptests \
tst-cpuset-dynamic \
tst-cpuset-static \
tst-spawn6 \
- tst-regcomp-bracket-free
+ tst-regcomp-bracket-free \
+ tst-wordexp-reuse
# Test for the glob symbol version that was replaced in glibc 2.27.
ifeq ($(have-GLIBC_2.26)$(build-shared),yesyes)
@@ -160,6 +161,7 @@ generated += $(addprefix wordexp-test-result, 1 2 3 4 5 6 7 8 9 10) \
bug-glob2.mtrace bug-glob2-mem.out tst-vfork3-mem.out \
tst-vfork3.mtrace getconf.speclist tst-fnmatch-mem.out \
tst-fnmatch.mtrace bug-regex36.mtrace \
+ tst-wordexp-reuse-mem.out tst-wordexp-reuse.mtrace \
testcases.h ptestcases.h
ifeq ($(run-built-tests),yes)
@@ -178,7 +180,8 @@ tests-special += $(objpfx)bug-regex2-mem.out $(objpfx)bug-regex14-mem.out \
$(objpfx)tst-boost-mem.out $(objpfx)tst-getconf.out \
$(objpfx)bug-glob2-mem.out $(objpfx)tst-vfork3-mem.out \
$(objpfx)tst-fnmatch-mem.out $(objpfx)bug-regex36-mem.out \
- $(objpfx)tst-glob-tilde-mem.out $(objpfx)bug-ga2-mem.out
+ $(objpfx)tst-glob-tilde-mem.out $(objpfx)bug-ga2-mem.out \
+ $(objpfx)tst-wordexp-reuse.out
endif
include ../Rules
@@ -455,3 +458,10 @@ $(objpfx)posix-conf-vars-def.h: $(..)scripts/gen-posix-conf-vars.awk \
$(make-target-directory)
$(AWK) -f $(filter-out Makefile, $^) > $@.tmp
mv -f $@.tmp $@
+
+tst-wordexp-reuse-ENV += MALLOC_TRACE=$(objpfx)tst-wordexp-reuse.mtrace \
+ LD_PRELOAD=$(common-objpfx)/malloc/libc_malloc_debug.so
+
+$(objpfx)tst-wordexp-reuse-mem.out: $(objpfx)tst-wordexp-reuse.out
+ $(common-objpfx)malloc/mtrace $(objpfx)tst-wordexp-reuse.mtrace > $@; \
+ $(evaluate-test)
diff --git a/posix/tst-wordexp-reuse.c b/posix/tst-wordexp-reuse.c
new file mode 100644
index 0000000000000000..3926b9f5576750ac
--- /dev/null
+++ b/posix/tst-wordexp-reuse.c
@@ -0,0 +1,89 @@
+/* Test for wordexp with WRDE_REUSE flag.
+ Copyright (C) 2026 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <wordexp.h>
+#include <mcheck.h>
+
+#include <support/check.h>
+
+static int
+do_test (void)
+{
+ mtrace ();
+
+ {
+ wordexp_t p = { 0 };
+ TEST_COMPARE (wordexp ("one", &p, 0), 0);
+ TEST_COMPARE (p.we_wordc, 1);
+ TEST_COMPARE_STRING (p.we_wordv[0], "one");
+ TEST_COMPARE (wordexp ("two", &p, WRDE_REUSE), 0);
+ TEST_COMPARE (p.we_wordc, 1);
+ TEST_COMPARE_STRING (p.we_wordv[0], "two");
+ wordfree (&p);
+ }
+
+ {
+ wordexp_t p = { .we_offs = 2 };
+ TEST_COMPARE (wordexp ("one", &p, 0), 0);
+ TEST_COMPARE (p.we_wordc, 1);
+ TEST_COMPARE_STRING (p.we_wordv[0], "one");
+ TEST_COMPARE (wordexp ("two", &p, WRDE_REUSE | WRDE_DOOFFS), 0);
+ TEST_COMPARE (p.we_wordc, 1);
+ TEST_COMPARE_STRING (p.we_wordv[p.we_offs + 0], "two");
+ wordfree (&p);
+ }
+
+ {
+ wordexp_t p = { 0 };
+ TEST_COMPARE (wordexp ("one", &p, 0), 0);
+ TEST_COMPARE (p.we_wordc, 1);
+ TEST_COMPARE_STRING (p.we_wordv[0], "one");
+ TEST_COMPARE (wordexp ("two", &p, WRDE_REUSE | WRDE_APPEND), 0);
+ TEST_COMPARE (p.we_wordc, 1);
+ TEST_COMPARE_STRING (p.we_wordv[0], "two");
+ wordfree (&p);
+ }
+
+ {
+ wordexp_t p = { .we_offs = 2 };
+ TEST_COMPARE (wordexp ("one", &p, WRDE_DOOFFS), 0);
+ TEST_COMPARE (p.we_wordc, 1);
+ TEST_COMPARE_STRING (p.we_wordv[p.we_offs + 0], "one");
+ TEST_COMPARE (wordexp ("two", &p, WRDE_REUSE
+ | WRDE_DOOFFS), 0);
+ TEST_COMPARE (p.we_wordc, 1);
+ TEST_COMPARE_STRING (p.we_wordv[p.we_offs + 0], "two");
+ wordfree (&p);
+ }
+
+ {
+ wordexp_t p = { .we_offs = 2 };
+ TEST_COMPARE (wordexp ("one", &p, WRDE_DOOFFS), 0);
+ TEST_COMPARE (p.we_wordc, 1);
+ TEST_COMPARE_STRING (p.we_wordv[p.we_offs + 0], "one");
+ TEST_COMPARE (wordexp ("two", &p, WRDE_REUSE
+ | WRDE_DOOFFS | WRDE_APPEND), 0);
+ TEST_COMPARE (p.we_wordc, 1);
+ TEST_COMPARE_STRING (p.we_wordv[p.we_offs + 0], "two");
+ wordfree (&p);
+ }
+
+ return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/posix/wordexp.c b/posix/wordexp.c
index 1f3b09f721bbee5c..b23608d4ee1cccd8 100644
--- a/posix/wordexp.c
+++ b/posix/wordexp.c
@@ -2220,7 +2220,9 @@ wordexp (const char *words, wordexp_t *pwordexp, int flags)
{
/* Minimal implementation of WRDE_REUSE for now */
wordfree (pwordexp);
+ old_word.we_wordc = 0;
old_word.we_wordv = NULL;
+ pwordexp->we_wordc = 0;
}
if ((flags & WRDE_APPEND) == 0)

View File

@ -0,0 +1,29 @@
commit bed2db02f3183e93f21d506786c5f884a1dec9e7
Author: Florian Weimer <fweimer@redhat.com>
Date: Mon Jan 26 17:12:37 2026 +0100
posix: Run tst-wordexp-reuse-mem test
The test was not properly scheduled for execution with a Makefile
dependency.
Fixes commit 80cc58ea2de214f85b0a1d902a3b668ad2ecb302 ("posix: Reset
wordexp_t fields with WRDE_REUSE (CVE-2025-15281 / BZ 33814").
Conflicts:
posix/Makefile
(Makefile not sorted downstream)
diff --git a/posix/Makefile b/posix/Makefile
index 2dfd687430b205a8..837fc26868ed74a8 100644
--- a/posix/Makefile
+++ b/posix/Makefile
@@ -181,7 +181,7 @@ tests-special += $(objpfx)bug-regex2-mem.out $(objpfx)bug-regex14-mem.out \
$(objpfx)bug-glob2-mem.out $(objpfx)tst-vfork3-mem.out \
$(objpfx)tst-fnmatch-mem.out $(objpfx)bug-regex36-mem.out \
$(objpfx)tst-glob-tilde-mem.out $(objpfx)bug-ga2-mem.out \
- $(objpfx)tst-wordexp-reuse.out
+ $(objpfx)tst-wordexp-reuse-mem.out
endif
include ../Rules

View File

@ -0,0 +1,72 @@
commit 2aa0974d2573441bffd596b07bff8698b1f2f18c
Author: Florian Weimer <fweimer@redhat.com>
Date: Fri Oct 20 14:29:50 2023 +0200
elf: ldconfig should skip temporary files created by package managers
This avoids crashes due to partially written files, after a package
update is interrupted.
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Conflicts:
elf/ldconfig.c
(missing alloca removal downstream)
diff --git a/elf/ldconfig.c b/elf/ldconfig.c
index be47ad8c2d7f89f3..f0c811001965cc46 100644
--- a/elf/ldconfig.c
+++ b/elf/ldconfig.c
@@ -778,6 +778,31 @@ struct dlib_entry
struct dlib_entry *next;
};
+/* Skip some temporary DSO files. These files may be partially written
+ and lead to ldconfig crashes when examined. */
+static bool
+skip_dso_based_on_name (const char *name, size_t len)
+{
+ /* Skip temporary files created by the prelink program. Files with
+ names like these are never really DSOs we want to look at. */
+ if (len >= sizeof (".#prelink#") - 1)
+ {
+ if (strcmp (name + len - sizeof (".#prelink#") + 1,
+ ".#prelink#") == 0)
+ return true;
+ if (len >= sizeof (".#prelink#.XXXXXX") - 1
+ && memcmp (name + len - sizeof (".#prelink#.XXXXXX")
+ + 1, ".#prelink#.", sizeof (".#prelink#.") - 1) == 0)
+ return true;
+ }
+ /* Skip temporary files created by RPM. */
+ if (memchr (name, len, ';') != NULL)
+ return true;
+ /* Skip temporary files created by dpkg. */
+ if (len > 4 && memcmp (name + len - 4, ".tmp", 4) == 0)
+ return true;
+ return false;
+}
static void
search_dir (const struct dir_entry *entry)
@@ -854,18 +879,8 @@ search_dir (const struct dir_entry *entry)
continue;
size_t len = strlen (direntry->d_name);
- /* Skip temporary files created by the prelink program. Files with
- names like these are never really DSOs we want to look at. */
- if (len >= sizeof (".#prelink#") - 1)
- {
- if (strcmp (direntry->d_name + len - sizeof (".#prelink#") + 1,
- ".#prelink#") == 0)
- continue;
- if (len >= sizeof (".#prelink#.XXXXXX") - 1
- && memcmp (direntry->d_name + len - sizeof (".#prelink#.XXXXXX")
- + 1, ".#prelink#.", sizeof (".#prelink#.") - 1) == 0)
- continue;
- }
+ if (skip_dso_based_on_name (direntry->d_name, len))
+ continue;
len += strlen (entry->path) + 2;
if (len > file_name_len)
{

View File

@ -0,0 +1,61 @@
commit cfb5a97a93ea656e3b2263e42142a4032986d9ba
Author: Florian Weimer <fweimer@redhat.com>
Date: Mon Oct 23 12:53:16 2023 +0200
ldconfig: Fixes for skipping temporary files.
Arguments to a memchr call were swapped, causing incorrect skipping
of files.
Files related to dpkg have different names: they actually end in
.dpkg-new and .dpkg-tmp, not .tmp as I mistakenly assumed.
Fixes commit 2aa0974d2573441bffd59 ("elf: ldconfig should skip
temporary files created by package managers").
diff --git a/elf/ldconfig.c b/elf/ldconfig.c
index f0c811001965cc46..4a96c409994d96c8 100644
--- a/elf/ldconfig.c
+++ b/elf/ldconfig.c
@@ -778,6 +778,17 @@ struct dlib_entry
struct dlib_entry *next;
};
+/* Return true if the N bytes at NAME end with with the characters in
+ the string SUFFIX. (NAME[N + 1] does not have to be a null byte.)
+ Expected to be called with a string literal for SUFFIX. */
+static inline bool
+endswithn (const char *name, size_t n, const char *suffix)
+{
+ return (n >= strlen (suffix)
+ && memcmp (name + n - strlen (suffix), suffix,
+ strlen (suffix)) == 0);
+}
+
/* Skip some temporary DSO files. These files may be partially written
and lead to ldconfig crashes when examined. */
static bool
@@ -787,8 +798,7 @@ skip_dso_based_on_name (const char *name, size_t len)
names like these are never really DSOs we want to look at. */
if (len >= sizeof (".#prelink#") - 1)
{
- if (strcmp (name + len - sizeof (".#prelink#") + 1,
- ".#prelink#") == 0)
+ if (endswithn (name, len, ".#prelink#"))
return true;
if (len >= sizeof (".#prelink#.XXXXXX") - 1
&& memcmp (name + len - sizeof (".#prelink#.XXXXXX")
@@ -796,10 +806,11 @@ skip_dso_based_on_name (const char *name, size_t len)
return true;
}
/* Skip temporary files created by RPM. */
- if (memchr (name, len, ';') != NULL)
+ if (memchr (name, ';', len) != NULL)
return true;
/* Skip temporary files created by dpkg. */
- if (len > 4 && memcmp (name + len - 4, ".tmp", 4) == 0)
+ if (endswithn (name, len, ".dpkg-new")
+ || endswithn (name, len, ".dpkg-tmp"))
return true;
return false;
}

View File

@ -0,0 +1,26 @@
commit 1626d8a521c7c771d4118b1328421fea113cab64
Author: Joe Simmons-Talbott <josimmon@redhat.com>
Date: Fri Apr 21 09:24:22 2023 -0400
string: Allow use of test-string.h for non-ifunc implementations.
Mark two variables as unused to silence warning when using
test-string.h for non-ifunc implementations.
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
diff --git a/string/test-string.h b/string/test-string.h
index 41de973479..8bcb8afd0a 100644
--- a/string/test-string.h
+++ b/string/test-string.h
@@ -130,8 +130,8 @@ cmdline_process_function (int c)
/* Increase size of FUNC_LIST if assert is triggered at run-time. */
static struct libc_ifunc_impl func_list[32];
static int func_count;
-static int impl_count = -1;
-static impl_t *impl_array;
+static int impl_count __attribute__ ((unused)) = -1;
+static impl_t *impl_array __attribute__ ((unused));
# define FOR_EACH_IMPL(impl, notall) \
impl_t *impl; \

View File

@ -0,0 +1,233 @@
commit eaaad78db41724e5a18a42becb238bfc4e683998
Author: Joe Simmons-Talbott <josimmon@redhat.com>
Date: Fri Apr 21 09:24:23 2023 -0400
string: Add tests for strdup (BZ #30266)
Copy strcpy tests for strdup. Covers some basic testcases with random
strings. Add a zero-length string testcase.
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Conflicts:
string/Makefile
(different test backport order)
diff -Nrup a/string/Makefile b/string/Makefile
--- a/string/Makefile 2023-11-30 10:59:16.400251685 -0500
+++ b/string/Makefile 2023-11-30 11:16:42.829613344 -0500
@@ -63,7 +63,8 @@ tests := tester inl-tester noinl-tester
tst-strtok_r bug-strcoll2 tst-cmp tst-xbzero-opt \
test-endian-types test-endian-file-scope \
test-endian-sign-conversion tst-memmove-overflow \
- test-sig_np tst-strerror-fail
+ test-sig_np tst-strerror-fail \
+ test-strdup
# Both tests require the .mo translation files generated by msgfmt.
tests-translation := tst-strsignal \
diff -Nrup a/string/test-strdup.c b/string/test-strdup.c
--- a/string/test-strdup.c 1969-12-31 19:00:00.000000000 -0500
+++ b/string/test-strdup.c 2023-11-30 11:11:32.850447614 -0500
@@ -0,0 +1,201 @@
+/* Test and measure strdup functions.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <support/check.h>
+
+#ifdef WIDE
+# include <wchar.h>
+# define CHAR wchar_t
+# define sfmt "ls"
+# define BIG_CHAR WCHAR_MAX
+# define SMALL_CHAR 1273
+# define STRCMP wcscmp
+# define MEMCMP wmemcmp
+# define MEMSET wmemset
+# define TCS TEST_COMPARE_STRING_WIDE
+#else
+# define CHAR char
+# define sfmt "s"
+# define BIG_CHAR CHAR_MAX
+# define SMALL_CHAR 127
+# define STRCMP strcmp
+# define MEMCMP memcmp
+# define MEMSET memset
+# define TCS TEST_COMPARE_STRING
+#endif
+
+#ifndef STRDUP_RESULT
+# define STRDUP_RESULT(dst, len) dst
+# define TEST_MAIN
+# ifndef WIDE
+# define TEST_NAME "strdup"
+# else
+# define TEST_NAME "wcsdup"
+# endif
+# include "test-string.h"
+# ifndef WIDE
+# define STRDUP strdup
+# else
+# define STRDUP wcsdup
+# endif
+#endif
+
+typedef CHAR *(*proto_t) (const CHAR *);
+
+static void
+do_zero_len_test (void)
+{
+ CHAR src[1] = { '\0' };
+ CHAR *dst = STRDUP (src);
+
+ TCS (dst, src);
+ free (dst);
+}
+
+static void
+do_one_test (const CHAR *src,
+ size_t len __attribute__((unused)))
+{
+ CHAR *dst = STRDUP (src);
+
+ if (STRCMP (dst, src) != 0)
+ {
+ error (0, 0,
+ "Wrong result in function %s dst \"%" sfmt "\" src \"%" sfmt "\"",
+ TEST_NAME, dst, src);
+ ret = 1;
+ free (dst);
+ return;
+ }
+ free (dst);
+}
+
+static void
+do_test (size_t align1, size_t align2, size_t len, int max_char)
+{
+ size_t i;
+ CHAR *s1;
+/* For wcsdup: align1 and align2 here mean alignment not in bytes,
+ but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t))
+ len for wcschr here isn't in bytes but it's number of wchar_t symbols. */
+ align1 &= 7;
+ if ((align1 + len) * sizeof (CHAR) >= page_size)
+ return;
+
+ align2 &= 7;
+ if ((align2 + len) * sizeof (CHAR) >= page_size)
+ return;
+
+ s1 = (CHAR *) (buf1) + align1;
+
+ for (i = 0; i < len; i++)
+ s1[i] = 32 + 23 * i % (max_char - 32);
+ s1[len] = 0;
+
+ do_one_test (s1, len);
+}
+
+static void
+do_random_tests (void)
+{
+ size_t i, j, n, align1, align2, len;
+ CHAR *p1 = (CHAR *)(buf1 + page_size) - 512;
+ CHAR *res;
+
+ for (n = 0; n < ITERATIONS; n++)
+ {
+ /* align1 and align2 are expressed as wchar_t and not in bytes for wide
+ char test, and thus it will be equal to align times wchar_t size.
+
+ For non wide version we need to check all alignments from 0 to 63
+ since some assembly implementations have separate prolog for alignments
+ more 48. */
+
+ align1 = random () & (63 / sizeof (CHAR));
+ if (random () & 1)
+ align2 = random () & (63 / sizeof (CHAR));
+ else
+ align2 = align1 + (random () & 24);
+ len = random () & 511;
+ j = align1;
+ if (align2 > j)
+ j = align2;
+ if (len + j >= 511)
+ len = 510 - j - (random () & 7);
+ j = len + align1 + 64;
+ if (j > 512)
+ j = 512;
+ for (i = 0; i < j; i++)
+ {
+ if (i == len + align1)
+ p1[i] = 0;
+ else
+ {
+ p1[i] = random () & BIG_CHAR;
+ if (i >= align1 && i < len + align1 && !p1[i])
+ p1[i] = (random () & SMALL_CHAR) + 3;
+ }
+ }
+
+ res = STRDUP(p1 + align1);
+ TCS (res, (p1 + align1));
+ free (res);
+ }
+}
+
+
+int
+test_main (void)
+{
+ size_t i;
+
+ test_init ();
+
+ printf ("%23s", "");
+ printf ("\t%s", TEST_NAME);
+ putchar ('\n');
+
+ for (i = 0; i < 16; ++i)
+ {
+ do_test (0, 0, i, SMALL_CHAR);
+ do_test (0, 0, i, BIG_CHAR);
+ do_test (0, i, i, SMALL_CHAR);
+ do_test (i, 0, i, BIG_CHAR);
+ }
+
+ for (i = 1; i < 8; ++i)
+ {
+ do_test (0, 0, 8 << i, SMALL_CHAR);
+ do_test (8 - i, 2 * i, 8 << i, SMALL_CHAR);
+ }
+
+ for (i = 1; i < 8; ++i)
+ {
+ do_test (i, 2 * i, 8 << i, SMALL_CHAR);
+ do_test (2 * i, i, 8 << i, BIG_CHAR);
+ do_test (i, i, 8 << i, SMALL_CHAR);
+ do_test (i, i, 8 << i, BIG_CHAR);
+ }
+
+ do_zero_len_test ();
+ do_random_tests ();
+
+ return ret;
+}
+
+#include <support/test-driver.c>

View File

@ -0,0 +1,232 @@
commit 0c48aa0551151ea201f7f528492e89a0b08a6890
Author: Joe Simmons-Talbott <josimmon@redhat.com>
Date: Fri Apr 21 09:24:24 2023 -0400
string: Add tests for strndup (BZ #30266)
Copy strncpy tests for strndup. Covers some basic testcases with random
strings. Remove tests that set the destination's bytes and checked the
resulting buffer's bytes. Remove wide character test support since
wcsndup() doesn't exist.
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Conflicts:
string/Makefile
(different test backport order)
diff -Nrup a/string/Makefile b/string/Makefile
--- a/string/Makefile 2023-11-30 11:55:02.263010916 -0500
+++ b/string/Makefile 2023-11-30 11:58:29.238954539 -0500
@@ -64,7 +64,7 @@ tests := tester inl-tester noinl-tester
test-endian-types test-endian-file-scope \
test-endian-sign-conversion tst-memmove-overflow \
test-sig_np tst-strerror-fail \
- test-strdup
+ test-strdup test-strndup
# Both tests require the .mo translation files generated by msgfmt.
tests-translation := tst-strsignal \
diff -Nrup a/string/test-strndup.c b/string/test-strndup.c
--- a/string/test-strndup.c 1969-12-31 19:00:00.000000000 -0500
+++ b/string/test-strndup.c 2023-11-30 11:56:24.986388053 -0500
@@ -0,0 +1,200 @@
+/* Test strndup functions.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <support/check.h>
+
+#define TEST_MAIN
+#include "test-string.h"
+
+static void
+do_one_test (const char *src, size_t len, size_t n)
+{
+ char *dst = strndup (src, n);
+ size_t s = (len > n ? n: len) * sizeof (char);
+
+ TEST_COMPARE_BLOB (dst, s, src, s);
+
+ free (dst);
+}
+
+static void
+do_test (size_t align1, size_t align2, size_t len, size_t n, int max_char)
+{
+ size_t i;
+ char *s1;
+
+ align1 &= 7;
+ if ((align1 + len) * sizeof (char) >= page_size)
+ return;
+
+ align2 &= 7;
+ if ((align2 + len) * sizeof (char) >= page_size)
+ return;
+
+ s1 = (char *) (buf1) + align1;
+
+ for (i = 0; i < len; ++i)
+ s1[i] = 32 + 23 * i % (max_char - 32);
+ s1[len] = 0;
+ for (i = len + 1; (i + align1) * sizeof (char) < page_size && i < len + 64;
+ ++i)
+ s1[i] = 32 + 32 * i % (max_char - 32);
+
+ do_one_test (s1, len, n);
+}
+
+static void
+do_page_tests (void)
+{
+ char *s1;
+ const size_t maxoffset = 64;
+
+ /* Put s1 at the maxoffset from the edge of buf1's last page. */
+ s1 = (char *) buf1 + BUF1PAGES * page_size / sizeof (char) - maxoffset;
+
+ memset (s1, 'a', maxoffset - 1);
+ s1[maxoffset - 1] = '\0';
+
+ /* Both strings are bounded to a page with read/write access and the next
+ page is protected with PROT_NONE (meaning that any access outside of the
+ page regions will trigger an invalid memory access).
+
+ The loop copies the string s1 for all possible offsets up to maxoffset
+ for both inputs with a size larger than s1 (so memory access outside the
+ expected memory regions might trigger invalid access). */
+
+ for (size_t off1 = 0; off1 < maxoffset; off1++)
+ for (size_t off2 = 0; off2 < maxoffset; off2++)
+ do_one_test (s1 + off1, maxoffset - off1 - 1,
+ maxoffset + (maxoffset - off2));
+}
+
+static void
+do_random_tests (void)
+{
+ size_t i, j, n, align1, align2, len, size, mode;
+ char *p1 = (char *) (buf1 + page_size) - 512;
+ char *res;
+
+ for (n = 0; n < ITERATIONS; n++)
+ {
+ mode = random ();
+ if (mode & 1)
+ {
+ size = random () & 255;
+ align1 = 512 - size - (random () & 15);
+ if (mode & 2)
+ align2 = align1 - (random () & 24);
+ else
+ align2 = align1 - (random () & 31);
+ if (mode & 4)
+ {
+ j = align1;
+ align1 = align2;
+ align2 = j;
+ }
+ if (mode & 8)
+ len = size - (random () & 31);
+ else
+ len = 512;
+ if (len >= 512)
+ len = random () & 511;
+ }
+ else
+ {
+ align1 = random () & 31;
+ if (mode & 2)
+ align2 = random () & 31;
+ else
+ align2 = align1 + (random () & 24);
+ len = random () & 511;
+ j = align1;
+ if (align2 > j)
+ j = align2;
+ if (mode & 4)
+ {
+ size = random () & 511;
+ if (size + j > 512)
+ size = 512 - j - (random () & 31);
+ }
+ else
+ size = 512 - j;
+ if ((mode & 8) && len + j >= 512)
+ len = 512 - j - (random () & 7);
+ }
+ j = len + align1 + 64;
+ if (j > 512)
+ j = 512;
+ for (i = 0; i < j; i++)
+ {
+ if (i == len + align1)
+ p1[i] = 0;
+ else
+ {
+ p1[i] = random () & CHAR_MAX;
+ if (i >= align1 && i < len + align1 && !p1[i])
+ p1[i] = (random () & 127) + 3;
+ }
+ }
+
+ res = (char *) strndup ((char *) (p1 + align1), size);
+ j = len + 1;
+ if (size < j)
+ j = size;
+ TEST_COMPARE_BLOB (res, j, (char *) (p1 + align1), j);
+ free (res);
+ }
+}
+
+int
+test_main (void)
+{
+ size_t i;
+
+ test_init ();
+
+ printf ("%28s", "");
+ printf ("\t%s", "strndup");
+ putchar ('\n');
+
+ for (i = 1; i < 8; ++i)
+ {
+ do_test (i, i, 16, 16, 127);
+ do_test (i, i, 16, 16, CHAR_MAX);
+ do_test (i, 2 * i, 16, 16, 127);
+ do_test (2 * i, i, 16, 16, CHAR_MAX);
+ do_test (8 - i, 2 * i, 1 << i, 2 << i, 127);
+ do_test (2 * i, 8 - i, 2 << i, 1 << i, 127);
+ do_test (8 - i, 2 * i, 1 << i, 2 << i, CHAR_MAX);
+ do_test (2 * i, 8 - i, 2 << i, 1 << i, CHAR_MAX);
+ }
+
+ for (i = 1; i < 8; ++i)
+ {
+ do_test (0, 0, 4 << i, 8 << i, 127);
+ do_test (0, 0, 16 << i, 8 << i, 127);
+ do_test (8 - i, 2 * i, 4 << i, 8 << i, 127);
+ do_test (8 - i, 2 * i, 16 << i, 8 << i, 127);
+ }
+
+ do_random_tests ();
+ do_page_tests ();
+ return ret;
+}
+
+#include <support/test-driver.c>

View File

@ -0,0 +1,33 @@
commit 0aa5b28a504c6f1f17b387d8147715d1496fff62
Author: Joe Simmons-Talbott <josimmon@redhat.com>
Date: Fri Apr 21 09:24:25 2023 -0400
wcsmbs: Add wcsdup() tests. (BZ #30266)
Enable wide character testcases for wcsdup().
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Conflicts:
wcsmbs/Makefile
(different test backport order)
diff -Nrup a/wcsmbs/Makefile b/wcsmbs/Makefile
--- a/wcsmbs/Makefile 2023-11-30 14:14:18.755010508 -0500
+++ b/wcsmbs/Makefile 2023-11-30 14:38:18.511131851 -0500
@@ -53,7 +53,8 @@ tests := tst-wcstof wcsmbs-tst1 tst-wcsn
tst-c16c32-1 wcsatcliff tst-wcstol-locale tst-wcstod-nan-locale \
tst-wcstod-round test-char-types tst-fgetwc-after-eof \
tst-wcstod-nan-sign tst-c16-surrogate tst-c32-state \
- $(addprefix test-,$(strop-tests)) tst-mbstowcs
+ $(addprefix test-,$(strop-tests)) tst-mbstowcs \
+ test-wcsdup
include ../Rules
diff -Nrup a/wcsmbs/test-wcsdup.c b/wcsmbs/test-wcsdup.c
--- a/wcsmbs/test-wcsdup.c 1969-12-31 19:00:00.000000000 -0500
+++ b/wcsmbs/test-wcsdup.c 2023-11-30 14:14:48.869138712 -0500
@@ -0,0 +1,2 @@
+#define WIDE 1
+#include "../string/test-strdup.c"

View File

@ -1,259 +0,0 @@
From 97700a34f36721b11a754cf37a1cc40695ece1fd Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Mon, 21 Jan 2019 11:23:59 -0800
Subject: [PATCH] x86-64 memchr/wmemchr: Properly handle the length parameter
[BZ# 24097]
Content-type: text/plain; charset=UTF-8
On x32, the size_t parameter may be passed in the lower 32 bits of a
64-bit register with the non-zero upper 32 bits. The string/memory
functions written in assembly can only use the lower 32 bits of a
64-bit register as length or must clear the upper 32 bits before using
the full 64-bit register for length.
This pach fixes memchr/wmemchr for x32. Tested on x86-64 and x32. On
x86-64, libc.so is the same with and withou the fix.
[BZ# 24097]
CVE-2019-6488
* sysdeps/x86_64/memchr.S: Use RDX_LP for length. Clear the
upper 32 bits of RDX register.
* sysdeps/x86_64/multiarch/memchr-avx2.S: Likewise.
* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memchr and
tst-size_t-wmemchr.
* sysdeps/x86_64/x32/test-size_t.h: New file.
* sysdeps/x86_64/x32/tst-size_t-memchr.c: Likewise.
* sysdeps/x86_64/x32/tst-size_t-wmemchr.c: Likewise.
---
sysdeps/x86_64/memchr.S | 10 ++--
sysdeps/x86_64/multiarch/memchr-avx2.S | 8 ++-
sysdeps/x86_64/x32/Makefile | 8 +++
sysdeps/x86_64/x32/test-size_t.h | 35 ++++++++++++
sysdeps/x86_64/x32/tst-size_t-memchr.c | 72 +++++++++++++++++++++++++
sysdeps/x86_64/x32/tst-size_t-wmemchr.c | 20 +++++++
6 files changed, 148 insertions(+), 5 deletions(-)
create mode 100644 sysdeps/x86_64/x32/test-size_t.h
create mode 100644 sysdeps/x86_64/x32/tst-size_t-memchr.c
create mode 100644 sysdeps/x86_64/x32/tst-size_t-wmemchr.c
Conflicts:
ChangeLog
(removed)
NEWS
(removed)
diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
index feef5d4f..cb320257 100644
--- a/sysdeps/x86_64/memchr.S
+++ b/sysdeps/x86_64/memchr.S
@@ -34,12 +34,16 @@ ENTRY(MEMCHR)
mov %edi, %ecx
#ifdef USE_AS_WMEMCHR
- test %rdx, %rdx
+ test %RDX_LP, %RDX_LP
jz L(return_null)
- shl $2, %rdx
+ shl $2, %RDX_LP
#else
+# ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+ movl %edx, %edx
+# endif
punpcklbw %xmm1, %xmm1
- test %rdx, %rdx
+ test %RDX_LP, %RDX_LP
jz L(return_null)
punpcklbw %xmm1, %xmm1
#endif
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
index 5f5e7725..c81da19b 100644
--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
@@ -40,16 +40,20 @@
ENTRY (MEMCHR)
# ifndef USE_AS_RAWMEMCHR
/* Check for zero length. */
- testq %rdx, %rdx
+ test %RDX_LP, %RDX_LP
jz L(null)
# endif
movl %edi, %ecx
/* Broadcast CHAR to YMM0. */
vmovd %esi, %xmm0
# ifdef USE_AS_WMEMCHR
- shl $2, %rdx
+ shl $2, %RDX_LP
vpbroadcastd %xmm0, %ymm0
# else
+# ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+ movl %edx, %edx
+# endif
vpbroadcastb %xmm0, %ymm0
# endif
/* Check if we may cross page boundary with one vector load. */
diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
index f2ebc24f..7d528889 100644
--- a/sysdeps/x86_64/x32/Makefile
+++ b/sysdeps/x86_64/x32/Makefile
@@ -4,3 +4,11 @@ ifeq ($(subdir),math)
# 64-bit llround. Add -fno-builtin-lround to silence the compiler.
CFLAGS-s_llround.c += -fno-builtin-lround
endif
+
+ifeq ($(subdir),string)
+tests += tst-size_t-memchr
+endif
+
+ifeq ($(subdir),wcsmbs)
+tests += tst-size_t-wmemchr
+endif
diff --git a/sysdeps/x86_64/x32/test-size_t.h b/sysdeps/x86_64/x32/test-size_t.h
new file mode 100644
index 00000000..78a94086
--- /dev/null
+++ b/sysdeps/x86_64/x32/test-size_t.h
@@ -0,0 +1,35 @@
+/* Test string/memory functions with size_t in the lower 32 bits of
+ 64-bit register.
+ Copyright (C) 2019 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define TEST_MAIN
+#include <string/test-string.h>
+
+/* On x32, parameter_t may be passed in a 64-bit register with the LEN
+ field in the lower 32 bits. When the LEN field of 64-bit register
+ is passed to string/memory function as the size_t parameter, only
+ the lower 32 bits can be used. */
+typedef struct
+{
+ union
+ {
+ size_t len;
+ void (*fn) (void);
+ };
+ void *p;
+} parameter_t;
diff --git a/sysdeps/x86_64/x32/tst-size_t-memchr.c b/sysdeps/x86_64/x32/tst-size_t-memchr.c
new file mode 100644
index 00000000..29a3daf1
--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-memchr.c
@@ -0,0 +1,72 @@
+/* Test memchr with size_t in the lower 32 bits of 64-bit register.
+ Copyright (C) 2019 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef WIDE
+# define TEST_NAME "memchr"
+#else
+# define TEST_NAME "wmemchr"
+#endif /* WIDE */
+#include "test-size_t.h"
+
+#ifndef WIDE
+# define MEMCHR memchr
+# define CHAR char
+# define UCHAR unsigned char
+#else
+# include <wchar.h>
+# define MEMCHR wmemchr
+# define CHAR wchar_t
+# define UCHAR wchar_t
+#endif /* WIDE */
+
+IMPL (MEMCHR, 1)
+
+typedef CHAR * (*proto_t) (const CHAR*, int, size_t);
+
+static CHAR *
+__attribute__ ((noinline, noclone))
+do_memchr (parameter_t a, parameter_t b)
+{
+ return CALL (&b, a.p, (uintptr_t) b.p, a.len);
+}
+
+static int
+test_main (void)
+{
+ test_init ();
+
+ parameter_t src = { { page_size / sizeof (CHAR) }, buf2 };
+ parameter_t c = { { 0 }, (void *) (uintptr_t) 0x12 };
+
+ int ret = 0;
+ FOR_EACH_IMPL (impl, 0)
+ {
+ c.fn = impl->fn;
+ CHAR *res = do_memchr (src, c);
+ if (res)
+ {
+ error (0, 0, "Wrong result in function %s: %p != NULL",
+ impl->name, res);
+ ret = 1;
+ }
+ }
+
+ return ret ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#include <support/test-driver.c>
diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemchr.c b/sysdeps/x86_64/x32/tst-size_t-wmemchr.c
new file mode 100644
index 00000000..877801d6
--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-wmemchr.c
@@ -0,0 +1,20 @@
+/* Test wmemchr with size_t in the lower 32 bits of 64-bit register.
+ Copyright (C) 2019 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define WIDE 1
+#include "tst-size_t-memchr.c"
--
GitLab

View File

@ -1,41 +0,0 @@
From ddf0992cf57a93200e0c782e2a94d0733a5a0b87 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Sun, 9 Jan 2022 16:02:21 -0600
Subject: [PATCH] x86: Fix __wcsncmp_avx2 in strcmp-avx2.S [BZ# 28755]
Content-type: text/plain; charset=UTF-8
Fixes [BZ# 28755] for wcsncmp by redirecting length >= 2^56 to
__wcscmp_avx2. For x86_64 this covers the entire address range so any
length larger could not possibly be used to bound `s1` or `s2`.
test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
sysdeps/x86_64/multiarch/strcmp-avx2.S | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
index 156c1949..8fb8eedc 100644
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
@@ -83,6 +83,16 @@ ENTRY (STRCMP)
je L(char0)
jb L(zero)
# ifdef USE_AS_WCSCMP
+# ifndef __ILP32__
+ movq %rdx, %rcx
+ /* Check if length could overflow when multiplied by
+ sizeof(wchar_t). Checking top 8 bits will cover all potential
+ overflow cases as well as redirect cases where its impossible to
+ length to bound a valid memory region. In these cases just use
+ 'wcscmp'. */
+ shrq $56, %rcx
+ jnz __wcscmp_avx2
+# endif
/* Convert units: from wide to byte char. */
shl $2, %RDX_LP
# endif
--
GitLab

View File

@ -1,257 +0,0 @@
From 244b415d386487521882debb845a040a4758cb18 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri, 25 Mar 2022 17:13:33 -0500
Subject: [PATCH] x86: Small improvements for wcslen
Content-type: text/plain; charset=UTF-8
Just a few QOL changes.
1. Prefer `add` > `lea` as it has high execution units it can run
on.
2. Don't break macro-fusion between `test` and `jcc`
3. Reduce code size by removing gratuitous padding bytes (-90
bytes).
geometric_mean(N=20) of all benchmarks New / Original: 0.959
All string/memory tests pass.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/wcslen.S | 86 ++++++++++++++++++++---------------------
1 file changed, 41 insertions(+), 45 deletions(-)
diff --git a/sysdeps/x86_64/wcslen.S b/sysdeps/x86_64/wcslen.S
index 9f5f7232..254bb030 100644
--- a/sysdeps/x86_64/wcslen.S
+++ b/sysdeps/x86_64/wcslen.S
@@ -41,82 +41,82 @@ ENTRY (__wcslen)
pxor %xmm0, %xmm0
lea 32(%rdi), %rax
- lea 16(%rdi), %rcx
+ addq $16, %rdi
and $-16, %rax
pcmpeqd (%rax), %xmm0
pmovmskb %xmm0, %edx
pxor %xmm1, %xmm1
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm1
pmovmskb %xmm1, %edx
pxor %xmm2, %xmm2
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm2
pmovmskb %xmm2, %edx
pxor %xmm3, %xmm3
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm3
pmovmskb %xmm3, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm0
pmovmskb %xmm0, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm1
pmovmskb %xmm1, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm2
pmovmskb %xmm2, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm3
pmovmskb %xmm3, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm0
pmovmskb %xmm0, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm1
pmovmskb %xmm1, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm2
pmovmskb %xmm2, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm3
pmovmskb %xmm3, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
and $-0x40, %rax
@@ -133,104 +133,100 @@ L(aligned_64_loop):
pminub %xmm0, %xmm2
pcmpeqd %xmm3, %xmm2
pmovmskb %xmm2, %edx
+ addq $64, %rax
test %edx, %edx
- lea 64(%rax), %rax
jz L(aligned_64_loop)
pcmpeqd -64(%rax), %xmm3
pmovmskb %xmm3, %edx
+ addq $48, %rdi
test %edx, %edx
- lea 48(%rcx), %rcx
jnz L(exit)
pcmpeqd %xmm1, %xmm3
pmovmskb %xmm3, %edx
+ addq $-16, %rdi
test %edx, %edx
- lea -16(%rcx), %rcx
jnz L(exit)
pcmpeqd -32(%rax), %xmm3
pmovmskb %xmm3, %edx
+ addq $-16, %rdi
test %edx, %edx
- lea -16(%rcx), %rcx
jnz L(exit)
pcmpeqd %xmm6, %xmm3
pmovmskb %xmm3, %edx
+ addq $-16, %rdi
test %edx, %edx
- lea -16(%rcx), %rcx
- jnz L(exit)
-
- jmp L(aligned_64_loop)
+ jz L(aligned_64_loop)
.p2align 4
L(exit):
- sub %rcx, %rax
+ sub %rdi, %rax
shr $2, %rax
test %dl, %dl
jz L(exit_high)
- mov %dl, %cl
- and $15, %cl
+ andl $15, %edx
jz L(exit_1)
ret
- .p2align 4
+ /* No align here. Naturally aligned % 16 == 1. */
L(exit_high):
- mov %dh, %ch
- and $15, %ch
+ andl $(15 << 8), %edx
jz L(exit_3)
add $2, %rax
ret
- .p2align 4
+ .p2align 3
L(exit_1):
add $1, %rax
ret
- .p2align 4
+ .p2align 3
L(exit_3):
add $3, %rax
ret
- .p2align 4
+ .p2align 3
L(exit_tail0):
- xor %rax, %rax
+ xorl %eax, %eax
ret
- .p2align 4
+ .p2align 3
L(exit_tail1):
- mov $1, %rax
+ movl $1, %eax
ret
- .p2align 4
+ .p2align 3
L(exit_tail2):
- mov $2, %rax
+ movl $2, %eax
ret
- .p2align 4
+ .p2align 3
L(exit_tail3):
- mov $3, %rax
+ movl $3, %eax
ret
- .p2align 4
+ .p2align 3
L(exit_tail4):
- mov $4, %rax
+ movl $4, %eax
ret
- .p2align 4
+ .p2align 3
L(exit_tail5):
- mov $5, %rax
+ movl $5, %eax
ret
- .p2align 4
+ .p2align 3
L(exit_tail6):
- mov $6, %rax
+ movl $6, %eax
ret
- .p2align 4
+ .p2align 3
L(exit_tail7):
- mov $7, %rax
+ movl $7, %eax
ret
END (__wcslen)
--
GitLab

View File

@ -1,964 +0,0 @@
From 7cbc03d03091d5664060924789afe46d30a5477e Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri, 15 Apr 2022 12:28:00 -0500
Subject: [PATCH] x86: Remove memcmp-sse4.S
Content-type: text/plain; charset=UTF-8
Code didn't actually use any sse4 instructions since `ptest` was
removed in:
commit 2f9062d7171850451e6044ef78d91ff8c017b9c0
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Wed Nov 10 16:18:56 2021 -0600
x86: Shrink memcmp-sse4.S code size
The new memcmp-sse2 implementation is also faster.
geometric_mean(N=20) of page cross cases SSE2 / SSE4: 0.905
Note there are two regressions preferring SSE2 for Size = 1 and Size =
65.
Size = 1:
size, align0, align1, ret, New Time/Old Time
1, 1, 1, 0, 1.2
1, 1, 1, 1, 1.197
1, 1, 1, -1, 1.2
This is intentional. Size == 1 is significantly less hot based on
profiles of GCC11 and Python3 than sizes [4, 8] (which is made
hotter).
Python3 Size = 1 -> 13.64%
Python3 Size = [4, 8] -> 60.92%
GCC11 Size = 1 -> 1.29%
GCC11 Size = [4, 8] -> 33.86%
size, align0, align1, ret, New Time/Old Time
4, 4, 4, 0, 0.622
4, 4, 4, 1, 0.797
4, 4, 4, -1, 0.805
5, 5, 5, 0, 0.623
5, 5, 5, 1, 0.777
5, 5, 5, -1, 0.802
6, 6, 6, 0, 0.625
6, 6, 6, 1, 0.813
6, 6, 6, -1, 0.788
7, 7, 7, 0, 0.625
7, 7, 7, 1, 0.799
7, 7, 7, -1, 0.795
8, 8, 8, 0, 0.625
8, 8, 8, 1, 0.848
8, 8, 8, -1, 0.914
9, 9, 9, 0, 0.625
Size = 65:
size, align0, align1, ret, New Time/Old Time
65, 0, 0, 0, 1.103
65, 0, 0, 1, 1.216
65, 0, 0, -1, 1.227
65, 65, 0, 0, 1.091
65, 0, 65, 1, 1.19
65, 65, 65, -1, 1.215
This is because A) the checks in range [65, 96] are now unrolled 2x
and B) because smaller values <= 16 are now given a hotter path. By
contrast the SSE4 version has a branch for Size = 80. The unrolled
version has get better performance for returns which need both
comparisons.
size, align0, align1, ret, New Time/Old Time
128, 4, 8, 0, 0.858
128, 4, 8, 1, 0.879
128, 4, 8, -1, 0.888
As well, out of microbenchmark environments that are not full
predictable the branch will have a real-cost.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/multiarch/Makefile | 2 -
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 -
sysdeps/x86_64/multiarch/ifunc-memcmp.h | 4 -
sysdeps/x86_64/multiarch/memcmp-sse4.S | 804 ---------------------
4 files changed, 814 deletions(-)
delete mode 100644 sysdeps/x86_64/multiarch/memcmp-sse4.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index bca82e38..b503e4b8 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -11,7 +11,6 @@ sysdep_routines += \
memcmp-avx2-movbe-rtm \
memcmp-evex-movbe \
memcmp-sse2 \
- memcmp-sse4 \
memcmp-ssse3 \
memcpy-ssse3 \
memcpy-ssse3-back \
@@ -174,7 +173,6 @@ sysdep_routines += \
wmemcmp-avx2-movbe-rtm \
wmemcmp-c \
wmemcmp-evex-movbe \
- wmemcmp-sse4 \
wmemcmp-ssse3 \
# sysdep_routines
endif
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 14314367..450a2917 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -78,8 +78,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (MOVBE)),
__memcmp_evex_movbe)
- IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1),
- __memcmp_sse4_1)
IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3),
__memcmp_ssse3)
IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2))
@@ -824,8 +822,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (MOVBE)),
__wmemcmp_evex_movbe)
- IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1),
- __wmemcmp_sse4_1)
IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3),
__wmemcmp_ssse3)
IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2))
diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
index 690dffe8..0bc47a7f 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
@@ -21,7 +21,6 @@
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden;
@@ -47,9 +46,6 @@ IFUNC_SELECTOR (void)
return OPTIMIZE (avx2_movbe);
}
- if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
- return OPTIMIZE (sse4_1);
-
if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
return OPTIMIZE (ssse3);
diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
deleted file mode 100644
index 50060006..00000000
--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
+++ /dev/null
@@ -1,804 +0,0 @@
-/* memcmp with SSE4.1, wmemcmp with SSE4.1
- Copyright (C) 2010-2018 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#if IS_IN (libc)
-
-# include <sysdep.h>
-
-# ifndef MEMCMP
-# define MEMCMP __memcmp_sse4_1
-# endif
-
-#ifdef USE_AS_WMEMCMP
-# define CMPEQ pcmpeqd
-# define CHAR_SIZE 4
-#else
-# define CMPEQ pcmpeqb
-# define CHAR_SIZE 1
-#endif
-
-
-/* Warning!
- wmemcmp has to use SIGNED comparison for elements.
- memcmp has to use UNSIGNED comparison for elemnts.
-*/
-
- .section .text.sse4.1,"ax",@progbits
-ENTRY (MEMCMP)
-# ifdef USE_AS_WMEMCMP
- shl $2, %RDX_LP
-# elif defined __ILP32__
- /* Clear the upper 32 bits. */
- mov %edx, %edx
-# endif
- cmp $79, %RDX_LP
- ja L(79bytesormore)
-
- cmp $CHAR_SIZE, %RDX_LP
- jbe L(firstbyte)
-
- /* N in (CHAR_SIZE, 79) bytes. */
- cmpl $32, %edx
- ja L(more_32_bytes)
-
- cmpl $16, %edx
- jae L(16_to_32_bytes)
-
-# ifndef USE_AS_WMEMCMP
- cmpl $8, %edx
- jae L(8_to_16_bytes)
-
- cmpl $4, %edx
- jb L(2_to_3_bytes)
-
- movl (%rdi), %eax
- movl (%rsi), %ecx
-
- bswap %eax
- bswap %ecx
-
- shlq $32, %rax
- shlq $32, %rcx
-
- movl -4(%rdi, %rdx), %edi
- movl -4(%rsi, %rdx), %esi
-
- bswap %edi
- bswap %esi
-
- orq %rdi, %rax
- orq %rsi, %rcx
- subq %rcx, %rax
- cmovne %edx, %eax
- sbbl %ecx, %ecx
- orl %ecx, %eax
- ret
-
- .p2align 4,, 8
-L(2_to_3_bytes):
- movzwl (%rdi), %eax
- movzwl (%rsi), %ecx
- shll $8, %eax
- shll $8, %ecx
- bswap %eax
- bswap %ecx
- movzbl -1(%rdi, %rdx), %edi
- movzbl -1(%rsi, %rdx), %esi
- orl %edi, %eax
- orl %esi, %ecx
- subl %ecx, %eax
- ret
-
- .p2align 4,, 8
-L(8_to_16_bytes):
- movq (%rdi), %rax
- movq (%rsi), %rcx
-
- bswap %rax
- bswap %rcx
-
- subq %rcx, %rax
- jne L(8_to_16_bytes_done)
-
- movq -8(%rdi, %rdx), %rax
- movq -8(%rsi, %rdx), %rcx
-
- bswap %rax
- bswap %rcx
-
- subq %rcx, %rax
-
-L(8_to_16_bytes_done):
- cmovne %edx, %eax
- sbbl %ecx, %ecx
- orl %ecx, %eax
- ret
-# else
- xorl %eax, %eax
- movl (%rdi), %ecx
- cmpl (%rsi), %ecx
- jne L(8_to_16_bytes_done)
- movl 4(%rdi), %ecx
- cmpl 4(%rsi), %ecx
- jne L(8_to_16_bytes_done)
- movl -4(%rdi, %rdx), %ecx
- cmpl -4(%rsi, %rdx), %ecx
- jne L(8_to_16_bytes_done)
- ret
-# endif
-
- .p2align 4,, 3
-L(ret_zero):
- xorl %eax, %eax
-L(zero):
- ret
-
- .p2align 4,, 8
-L(firstbyte):
- jb L(ret_zero)
-# ifdef USE_AS_WMEMCMP
- xorl %eax, %eax
- movl (%rdi), %ecx
- cmpl (%rsi), %ecx
- je L(zero)
-L(8_to_16_bytes_done):
- setg %al
- leal -1(%rax, %rax), %eax
-# else
- movzbl (%rdi), %eax
- movzbl (%rsi), %ecx
- sub %ecx, %eax
-# endif
- ret
-
- .p2align 4
-L(vec_return_begin_48):
- addq $16, %rdi
- addq $16, %rsi
-L(vec_return_begin_32):
- bsfl %eax, %eax
-# ifdef USE_AS_WMEMCMP
- movl 32(%rdi, %rax), %ecx
- xorl %edx, %edx
- cmpl 32(%rsi, %rax), %ecx
- setg %dl
- leal -1(%rdx, %rdx), %eax
-# else
- movzbl 32(%rsi, %rax), %ecx
- movzbl 32(%rdi, %rax), %eax
- subl %ecx, %eax
-# endif
- ret
-
- .p2align 4
-L(vec_return_begin_16):
- addq $16, %rdi
- addq $16, %rsi
-L(vec_return_begin):
- bsfl %eax, %eax
-# ifdef USE_AS_WMEMCMP
- movl (%rdi, %rax), %ecx
- xorl %edx, %edx
- cmpl (%rsi, %rax), %ecx
- setg %dl
- leal -1(%rdx, %rdx), %eax
-# else
- movzbl (%rsi, %rax), %ecx
- movzbl (%rdi, %rax), %eax
- subl %ecx, %eax
-# endif
- ret
-
- .p2align 4
-L(vec_return_end_16):
- subl $16, %edx
-L(vec_return_end):
- bsfl %eax, %eax
- addl %edx, %eax
-# ifdef USE_AS_WMEMCMP
- movl -16(%rdi, %rax), %ecx
- xorl %edx, %edx
- cmpl -16(%rsi, %rax), %ecx
- setg %dl
- leal -1(%rdx, %rdx), %eax
-# else
- movzbl -16(%rsi, %rax), %ecx
- movzbl -16(%rdi, %rax), %eax
- subl %ecx, %eax
-# endif
- ret
-
- .p2align 4,, 8
-L(more_32_bytes):
- movdqu (%rdi), %xmm0
- movdqu (%rsi), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin)
-
- movdqu 16(%rdi), %xmm0
- movdqu 16(%rsi), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_16)
-
- cmpl $64, %edx
- jbe L(32_to_64_bytes)
- movdqu 32(%rdi), %xmm0
- movdqu 32(%rsi), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_32)
-
- .p2align 4,, 6
-L(32_to_64_bytes):
- movdqu -32(%rdi, %rdx), %xmm0
- movdqu -32(%rsi, %rdx), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_end_16)
-
- movdqu -16(%rdi, %rdx), %xmm0
- movdqu -16(%rsi, %rdx), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_end)
- ret
-
- .p2align 4
-L(16_to_32_bytes):
- movdqu (%rdi), %xmm0
- movdqu (%rsi), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin)
-
- movdqu -16(%rdi, %rdx), %xmm0
- movdqu -16(%rsi, %rdx), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_end)
- ret
-
-
- .p2align 4
-L(79bytesormore):
- movdqu (%rdi), %xmm0
- movdqu (%rsi), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin)
-
-
- mov %rsi, %rcx
- and $-16, %rsi
- add $16, %rsi
- sub %rsi, %rcx
-
- sub %rcx, %rdi
- add %rcx, %rdx
- test $0xf, %rdi
- jz L(2aligned)
-
- cmp $128, %rdx
- ja L(128bytesormore)
-
- .p2align 4,, 6
-L(less128bytes):
- movdqu (%rdi), %xmm1
- CMPEQ (%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin)
-
- movdqu 16(%rdi), %xmm1
- CMPEQ 16(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_16)
-
- movdqu 32(%rdi), %xmm1
- CMPEQ 32(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_32)
-
- movdqu 48(%rdi), %xmm1
- CMPEQ 48(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_48)
-
- cmp $96, %rdx
- jb L(32_to_64_bytes)
-
- addq $64, %rdi
- addq $64, %rsi
- subq $64, %rdx
-
- .p2align 4,, 6
-L(last_64_bytes):
- movdqu (%rdi), %xmm1
- CMPEQ (%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin)
-
- movdqu 16(%rdi), %xmm1
- CMPEQ 16(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_16)
-
- movdqu -32(%rdi, %rdx), %xmm0
- movdqu -32(%rsi, %rdx), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_end_16)
-
- movdqu -16(%rdi, %rdx), %xmm0
- movdqu -16(%rsi, %rdx), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_end)
- ret
-
- .p2align 4
-L(128bytesormore):
- cmp $256, %rdx
- ja L(unaligned_loop)
-L(less256bytes):
- movdqu (%rdi), %xmm1
- CMPEQ (%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin)
-
- movdqu 16(%rdi), %xmm1
- CMPEQ 16(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_16)
-
- movdqu 32(%rdi), %xmm1
- CMPEQ 32(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_32)
-
- movdqu 48(%rdi), %xmm1
- CMPEQ 48(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_48)
-
- addq $64, %rdi
- addq $64, %rsi
-
- movdqu (%rdi), %xmm1
- CMPEQ (%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin)
-
- movdqu 16(%rdi), %xmm1
- CMPEQ 16(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_16)
-
- movdqu 32(%rdi), %xmm1
- CMPEQ 32(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_32)
-
- movdqu 48(%rdi), %xmm1
- CMPEQ 48(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_48)
-
- addq $-128, %rdx
- subq $-64, %rsi
- subq $-64, %rdi
-
- cmp $64, %rdx
- ja L(less128bytes)
-
- cmp $32, %rdx
- ja L(last_64_bytes)
-
- movdqu -32(%rdi, %rdx), %xmm0
- movdqu -32(%rsi, %rdx), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_end_16)
-
- movdqu -16(%rdi, %rdx), %xmm0
- movdqu -16(%rsi, %rdx), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_end)
- ret
-
- .p2align 4
-L(unaligned_loop):
-# ifdef DATA_CACHE_SIZE_HALF
- mov $DATA_CACHE_SIZE_HALF, %R8_LP
-# else
- mov __x86_data_cache_size_half(%rip), %R8_LP
-# endif
- movq %r8, %r9
- addq %r8, %r8
- addq %r9, %r8
- cmpq %r8, %rdx
- ja L(L2_L3_cache_unaligned)
- sub $64, %rdx
- .p2align 4
-L(64bytesormore_loop):
- movdqu (%rdi), %xmm0
- movdqu 16(%rdi), %xmm1
- movdqu 32(%rdi), %xmm2
- movdqu 48(%rdi), %xmm3
-
- CMPEQ (%rsi), %xmm0
- CMPEQ 16(%rsi), %xmm1
- CMPEQ 32(%rsi), %xmm2
- CMPEQ 48(%rsi), %xmm3
-
- pand %xmm0, %xmm1
- pand %xmm2, %xmm3
- pand %xmm1, %xmm3
-
- pmovmskb %xmm3, %eax
- incw %ax
- jnz L(64bytesormore_loop_end)
-
- add $64, %rsi
- add $64, %rdi
- sub $64, %rdx
- ja L(64bytesormore_loop)
-
- .p2align 4,, 6
-L(loop_tail):
- addq %rdx, %rdi
- movdqu (%rdi), %xmm0
- movdqu 16(%rdi), %xmm1
- movdqu 32(%rdi), %xmm2
- movdqu 48(%rdi), %xmm3
-
- addq %rdx, %rsi
- movdqu (%rsi), %xmm4
- movdqu 16(%rsi), %xmm5
- movdqu 32(%rsi), %xmm6
- movdqu 48(%rsi), %xmm7
-
- CMPEQ %xmm4, %xmm0
- CMPEQ %xmm5, %xmm1
- CMPEQ %xmm6, %xmm2
- CMPEQ %xmm7, %xmm3
-
- pand %xmm0, %xmm1
- pand %xmm2, %xmm3
- pand %xmm1, %xmm3
-
- pmovmskb %xmm3, %eax
- incw %ax
- jnz L(64bytesormore_loop_end)
- ret
-
-L(L2_L3_cache_unaligned):
- subq $64, %rdx
- .p2align 4
-L(L2_L3_unaligned_128bytes_loop):
- prefetchnta 0x1c0(%rdi)
- prefetchnta 0x1c0(%rsi)
-
- movdqu (%rdi), %xmm0
- movdqu 16(%rdi), %xmm1
- movdqu 32(%rdi), %xmm2
- movdqu 48(%rdi), %xmm3
-
- CMPEQ (%rsi), %xmm0
- CMPEQ 16(%rsi), %xmm1
- CMPEQ 32(%rsi), %xmm2
- CMPEQ 48(%rsi), %xmm3
-
- pand %xmm0, %xmm1
- pand %xmm2, %xmm3
- pand %xmm1, %xmm3
-
- pmovmskb %xmm3, %eax
- incw %ax
- jnz L(64bytesormore_loop_end)
-
- add $64, %rsi
- add $64, %rdi
- sub $64, %rdx
- ja L(L2_L3_unaligned_128bytes_loop)
- jmp L(loop_tail)
-
-
- /* This case is for machines which are sensitive for unaligned
- * instructions. */
- .p2align 4
-L(2aligned):
- cmp $128, %rdx
- ja L(128bytesormorein2aligned)
-L(less128bytesin2aligned):
- movdqa (%rdi), %xmm1
- CMPEQ (%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin)
-
- movdqa 16(%rdi), %xmm1
- CMPEQ 16(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_16)
-
- movdqa 32(%rdi), %xmm1
- CMPEQ 32(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_32)
-
- movdqa 48(%rdi), %xmm1
- CMPEQ 48(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_48)
-
- cmp $96, %rdx
- jb L(32_to_64_bytes)
-
- addq $64, %rdi
- addq $64, %rsi
- subq $64, %rdx
-
- .p2align 4,, 6
-L(aligned_last_64_bytes):
- movdqa (%rdi), %xmm1
- CMPEQ (%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin)
-
- movdqa 16(%rdi), %xmm1
- CMPEQ 16(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_16)
-
- movdqu -32(%rdi, %rdx), %xmm0
- movdqu -32(%rsi, %rdx), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_end_16)
-
- movdqu -16(%rdi, %rdx), %xmm0
- movdqu -16(%rsi, %rdx), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_end)
- ret
-
- .p2align 4
-L(128bytesormorein2aligned):
- cmp $256, %rdx
- ja L(aligned_loop)
-L(less256bytesin2alinged):
- movdqa (%rdi), %xmm1
- CMPEQ (%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin)
-
- movdqa 16(%rdi), %xmm1
- CMPEQ 16(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_16)
-
- movdqa 32(%rdi), %xmm1
- CMPEQ 32(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_32)
-
- movdqa 48(%rdi), %xmm1
- CMPEQ 48(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_48)
-
- addq $64, %rdi
- addq $64, %rsi
-
- movdqa (%rdi), %xmm1
- CMPEQ (%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin)
-
- movdqa 16(%rdi), %xmm1
- CMPEQ 16(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_16)
-
- movdqa 32(%rdi), %xmm1
- CMPEQ 32(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_32)
-
- movdqa 48(%rdi), %xmm1
- CMPEQ 48(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_48)
-
- addq $-128, %rdx
- subq $-64, %rsi
- subq $-64, %rdi
-
- cmp $64, %rdx
- ja L(less128bytesin2aligned)
-
- cmp $32, %rdx
- ja L(aligned_last_64_bytes)
-
- movdqu -32(%rdi, %rdx), %xmm0
- movdqu -32(%rsi, %rdx), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_end_16)
-
- movdqu -16(%rdi, %rdx), %xmm0
- movdqu -16(%rsi, %rdx), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_end)
- ret
-
- .p2align 4
-L(aligned_loop):
-# ifdef DATA_CACHE_SIZE_HALF
- mov $DATA_CACHE_SIZE_HALF, %R8_LP
-# else
- mov __x86_data_cache_size_half(%rip), %R8_LP
-# endif
- movq %r8, %r9
- addq %r8, %r8
- addq %r9, %r8
- cmpq %r8, %rdx
- ja L(L2_L3_cache_aligned)
-
- sub $64, %rdx
- .p2align 4
-L(64bytesormore_loopin2aligned):
- movdqa (%rdi), %xmm0
- movdqa 16(%rdi), %xmm1
- movdqa 32(%rdi), %xmm2
- movdqa 48(%rdi), %xmm3
-
- CMPEQ (%rsi), %xmm0
- CMPEQ 16(%rsi), %xmm1
- CMPEQ 32(%rsi), %xmm2
- CMPEQ 48(%rsi), %xmm3
-
- pand %xmm0, %xmm1
- pand %xmm2, %xmm3
- pand %xmm1, %xmm3
-
- pmovmskb %xmm3, %eax
- incw %ax
- jnz L(64bytesormore_loop_end)
- add $64, %rsi
- add $64, %rdi
- sub $64, %rdx
- ja L(64bytesormore_loopin2aligned)
- jmp L(loop_tail)
-
-L(L2_L3_cache_aligned):
- subq $64, %rdx
- .p2align 4
-L(L2_L3_aligned_128bytes_loop):
- prefetchnta 0x1c0(%rdi)
- prefetchnta 0x1c0(%rsi)
- movdqa (%rdi), %xmm0
- movdqa 16(%rdi), %xmm1
- movdqa 32(%rdi), %xmm2
- movdqa 48(%rdi), %xmm3
-
- CMPEQ (%rsi), %xmm0
- CMPEQ 16(%rsi), %xmm1
- CMPEQ 32(%rsi), %xmm2
- CMPEQ 48(%rsi), %xmm3
-
- pand %xmm0, %xmm1
- pand %xmm2, %xmm3
- pand %xmm1, %xmm3
-
- pmovmskb %xmm3, %eax
- incw %ax
- jnz L(64bytesormore_loop_end)
-
- addq $64, %rsi
- addq $64, %rdi
- subq $64, %rdx
- ja L(L2_L3_aligned_128bytes_loop)
- jmp L(loop_tail)
-
- .p2align 4
-L(64bytesormore_loop_end):
- pmovmskb %xmm0, %ecx
- incw %cx
- jnz L(loop_end_ret)
-
- pmovmskb %xmm1, %ecx
- notw %cx
- sall $16, %ecx
- jnz L(loop_end_ret)
-
- pmovmskb %xmm2, %ecx
- notw %cx
- shlq $32, %rcx
- jnz L(loop_end_ret)
-
- addq $48, %rdi
- addq $48, %rsi
- movq %rax, %rcx
-
- .p2align 4,, 6
-L(loop_end_ret):
- bsfq %rcx, %rcx
-# ifdef USE_AS_WMEMCMP
- movl (%rdi, %rcx), %eax
- xorl %edx, %edx
- cmpl (%rsi, %rcx), %eax
- setg %dl
- leal -1(%rdx, %rdx), %eax
-# else
- movzbl (%rdi, %rcx), %eax
- movzbl (%rsi, %rcx), %ecx
- subl %ecx, %eax
-# endif
- ret
-END (MEMCMP)
-#endif
--
GitLab

View File

@ -1,263 +0,0 @@
From 23102686ec67b856a2d4fd25ddaa1c0b8d175c4f Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri, 15 Apr 2022 12:28:01 -0500
Subject: [PATCH] x86: Cleanup page cross code in memcmp-avx2-movbe.S
Content-type: text/plain; charset=UTF-8
Old code was both inefficient and wasted code size. New code (-62
bytes) and comparable or better performance in the page cross case.
geometric_mean(N=20) of page cross cases New / Original: 0.960
size, align0, align1, ret, New Time/Old Time
1, 4095, 0, 0, 1.001
1, 4095, 0, 1, 0.999
1, 4095, 0, -1, 1.0
2, 4094, 0, 0, 1.0
2, 4094, 0, 1, 1.0
2, 4094, 0, -1, 1.0
3, 4093, 0, 0, 1.0
3, 4093, 0, 1, 1.0
3, 4093, 0, -1, 1.0
4, 4092, 0, 0, 0.987
4, 4092, 0, 1, 1.0
4, 4092, 0, -1, 1.0
5, 4091, 0, 0, 0.984
5, 4091, 0, 1, 1.002
5, 4091, 0, -1, 1.005
6, 4090, 0, 0, 0.993
6, 4090, 0, 1, 1.001
6, 4090, 0, -1, 1.003
7, 4089, 0, 0, 0.991
7, 4089, 0, 1, 1.0
7, 4089, 0, -1, 1.001
8, 4088, 0, 0, 0.875
8, 4088, 0, 1, 0.881
8, 4088, 0, -1, 0.888
9, 4087, 0, 0, 0.872
9, 4087, 0, 1, 0.879
9, 4087, 0, -1, 0.883
10, 4086, 0, 0, 0.878
10, 4086, 0, 1, 0.886
10, 4086, 0, -1, 0.873
11, 4085, 0, 0, 0.878
11, 4085, 0, 1, 0.881
11, 4085, 0, -1, 0.879
12, 4084, 0, 0, 0.873
12, 4084, 0, 1, 0.889
12, 4084, 0, -1, 0.875
13, 4083, 0, 0, 0.873
13, 4083, 0, 1, 0.863
13, 4083, 0, -1, 0.863
14, 4082, 0, 0, 0.838
14, 4082, 0, 1, 0.869
14, 4082, 0, -1, 0.877
15, 4081, 0, 0, 0.841
15, 4081, 0, 1, 0.869
15, 4081, 0, -1, 0.876
16, 4080, 0, 0, 0.988
16, 4080, 0, 1, 0.99
16, 4080, 0, -1, 0.989
17, 4079, 0, 0, 0.978
17, 4079, 0, 1, 0.981
17, 4079, 0, -1, 0.98
18, 4078, 0, 0, 0.981
18, 4078, 0, 1, 0.98
18, 4078, 0, -1, 0.985
19, 4077, 0, 0, 0.977
19, 4077, 0, 1, 0.979
19, 4077, 0, -1, 0.986
20, 4076, 0, 0, 0.977
20, 4076, 0, 1, 0.986
20, 4076, 0, -1, 0.984
21, 4075, 0, 0, 0.977
21, 4075, 0, 1, 0.983
21, 4075, 0, -1, 0.988
22, 4074, 0, 0, 0.983
22, 4074, 0, 1, 0.994
22, 4074, 0, -1, 0.993
23, 4073, 0, 0, 0.98
23, 4073, 0, 1, 0.992
23, 4073, 0, -1, 0.995
24, 4072, 0, 0, 0.989
24, 4072, 0, 1, 0.989
24, 4072, 0, -1, 0.991
25, 4071, 0, 0, 0.99
25, 4071, 0, 1, 0.999
25, 4071, 0, -1, 0.996
26, 4070, 0, 0, 0.993
26, 4070, 0, 1, 0.995
26, 4070, 0, -1, 0.998
27, 4069, 0, 0, 0.993
27, 4069, 0, 1, 0.999
27, 4069, 0, -1, 1.0
28, 4068, 0, 0, 0.997
28, 4068, 0, 1, 1.0
28, 4068, 0, -1, 0.999
29, 4067, 0, 0, 0.996
29, 4067, 0, 1, 0.999
29, 4067, 0, -1, 0.999
30, 4066, 0, 0, 0.991
30, 4066, 0, 1, 1.001
30, 4066, 0, -1, 0.999
31, 4065, 0, 0, 0.988
31, 4065, 0, 1, 0.998
31, 4065, 0, -1, 0.998
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 98 ++++++++++++--------
1 file changed, 61 insertions(+), 37 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
index 16fc673e..99258cf5 100644
--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
@@ -429,22 +429,21 @@ L(page_cross_less_vec):
# ifndef USE_AS_WMEMCMP
cmpl $8, %edx
jae L(between_8_15)
+ /* Fall through for [4, 7]. */
cmpl $4, %edx
- jae L(between_4_7)
+ jb L(between_2_3)
- /* Load as big endian to avoid branches. */
- movzwl (%rdi), %eax
- movzwl (%rsi), %ecx
- shll $8, %eax
- shll $8, %ecx
- bswap %eax
- bswap %ecx
- movzbl -1(%rdi, %rdx), %edi
- movzbl -1(%rsi, %rdx), %esi
- orl %edi, %eax
- orl %esi, %ecx
- /* Subtraction is okay because the upper 8 bits are zero. */
- subl %ecx, %eax
+ movbe (%rdi), %eax
+ movbe (%rsi), %ecx
+ shlq $32, %rax
+ shlq $32, %rcx
+ movbe -4(%rdi, %rdx), %edi
+ movbe -4(%rsi, %rdx), %esi
+ orq %rdi, %rax
+ orq %rsi, %rcx
+ subq %rcx, %rax
+ /* Fast path for return zero. */
+ jnz L(ret_nonzero)
/* No ymm register was touched. */
ret
@@ -457,9 +456,33 @@ L(one_or_less):
/* No ymm register was touched. */
ret
+ .p2align 4,, 5
+L(ret_nonzero):
+ sbbl %eax, %eax
+ orl $1, %eax
+ /* No ymm register was touched. */
+ ret
+
+ .p2align 4,, 2
+L(zero):
+ xorl %eax, %eax
+ /* No ymm register was touched. */
+ ret
+
.p2align 4
L(between_8_15):
-# endif
+ movbe (%rdi), %rax
+ movbe (%rsi), %rcx
+ subq %rcx, %rax
+ jnz L(ret_nonzero)
+ movbe -8(%rdi, %rdx), %rax
+ movbe -8(%rsi, %rdx), %rcx
+ subq %rcx, %rax
+ /* Fast path for return zero. */
+ jnz L(ret_nonzero)
+ /* No ymm register was touched. */
+ ret
+# else
/* If USE_AS_WMEMCMP fall through into 8-15 byte case. */
vmovq (%rdi), %xmm1
vmovq (%rsi), %xmm2
@@ -475,16 +498,13 @@ L(between_8_15):
VPCMPEQ %xmm1, %xmm2, %xmm2
vpmovmskb %xmm2, %eax
subl $0xffff, %eax
+ /* Fast path for return zero. */
jnz L(return_vec_0)
/* No ymm register was touched. */
ret
+# endif
- .p2align 4
-L(zero):
- xorl %eax, %eax
- ret
-
- .p2align 4
+ .p2align 4,, 10
L(between_16_31):
/* From 16 to 31 bytes. No branch when size == 16. */
vmovdqu (%rsi), %xmm2
@@ -501,11 +521,17 @@ L(between_16_31):
VPCMPEQ (%rdi), %xmm2, %xmm2
vpmovmskb %xmm2, %eax
subl $0xffff, %eax
+ /* Fast path for return zero. */
jnz L(return_vec_0)
/* No ymm register was touched. */
ret
# ifdef USE_AS_WMEMCMP
+ .p2align 4,, 2
+L(zero):
+ xorl %eax, %eax
+ ret
+
.p2align 4
L(one_or_less):
jb L(zero)
@@ -520,22 +546,20 @@ L(one_or_less):
# else
.p2align 4
-L(between_4_7):
- /* Load as big endian with overlapping movbe to avoid branches.
- */
- movbe (%rdi), %eax
- movbe (%rsi), %ecx
- shlq $32, %rax
- shlq $32, %rcx
- movbe -4(%rdi, %rdx), %edi
- movbe -4(%rsi, %rdx), %esi
- orq %rdi, %rax
- orq %rsi, %rcx
- subq %rcx, %rax
- jz L(zero_4_7)
- sbbl %eax, %eax
- orl $1, %eax
-L(zero_4_7):
+L(between_2_3):
+ /* Load as big endian to avoid branches. */
+ movzwl (%rdi), %eax
+ movzwl (%rsi), %ecx
+ bswap %eax
+ bswap %ecx
+ shrl %eax
+ shrl %ecx
+ movzbl -1(%rdi, %rdx), %edi
+ movzbl -1(%rsi, %rdx), %esi
+ orl %edi, %eax
+ orl %esi, %ecx
+ /* Subtraction is okay because the upper bit is zero. */
+ subl %ecx, %eax
/* No ymm register was touched. */
ret
# endif
--
GitLab

View File

@ -1,876 +0,0 @@
From 5307aa9c1800f36a64c183c091c9af392c1fa75c Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Thu, 21 Apr 2022 20:52:28 -0500
Subject: [PATCH] x86: Optimize {str|wcs}rchr-sse2
Content-type: text/plain; charset=UTF-8
The new code unrolls the main loop slightly without adding too much
overhead and minimizes the comparisons for the search CHAR.
Geometric Mean of all benchmarks New / Old: 0.741
See email for all results.
Full xcheck passes on x86_64 with and without multiarch enabled.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/multiarch/strrchr-sse2.S | 2 +-
sysdeps/x86_64/multiarch/wcsrchr-sse2.S | 3 +-
sysdeps/x86_64/strrchr.S | 510 +++++++++++++++---------
sysdeps/x86_64/wcsrchr.S | 266 +-----------
4 files changed, 338 insertions(+), 443 deletions(-)
Conflicts:
sysdeps/x86_64/wcsrchr.S
(copyright header)
diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S
index 0ec76fe9..6bb1284b 100644
--- a/sysdeps/x86_64/multiarch/strrchr-sse2.S
+++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S
@@ -17,7 +17,7 @@
<http://www.gnu.org/licenses/>. */
#if IS_IN (libc)
-# define strrchr __strrchr_sse2
+# define STRRCHR __strrchr_sse2
# undef weak_alias
# define weak_alias(strrchr, rindex)
diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
index d015e953..f26d53b5 100644
--- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
+++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
@@ -17,7 +17,6 @@
<http://www.gnu.org/licenses/>. */
#if IS_IN (libc)
-# define wcsrchr __wcsrchr_sse2
+# define STRRCHR __wcsrchr_sse2
#endif
-
#include "../wcsrchr.S"
diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
index aca98e7e..a58cc220 100644
--- a/sysdeps/x86_64/strrchr.S
+++ b/sysdeps/x86_64/strrchr.S
@@ -19,210 +19,360 @@
#include <sysdep.h>
+#ifndef STRRCHR
+# define STRRCHR strrchr
+#endif
+
+#ifdef USE_AS_WCSRCHR
+# define PCMPEQ pcmpeqd
+# define CHAR_SIZE 4
+# define PMINU pminud
+#else
+# define PCMPEQ pcmpeqb
+# define CHAR_SIZE 1
+# define PMINU pminub
+#endif
+
+#define PAGE_SIZE 4096
+#define VEC_SIZE 16
+
.text
-ENTRY (strrchr)
- movd %esi, %xmm1
+ENTRY(STRRCHR)
+ movd %esi, %xmm0
movq %rdi, %rax
- andl $4095, %eax
- punpcklbw %xmm1, %xmm1
- cmpq $4032, %rax
- punpcklwd %xmm1, %xmm1
- pshufd $0, %xmm1, %xmm1
+ andl $(PAGE_SIZE - 1), %eax
+#ifndef USE_AS_WCSRCHR
+ punpcklbw %xmm0, %xmm0
+ punpcklwd %xmm0, %xmm0
+#endif
+ pshufd $0, %xmm0, %xmm0
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
ja L(cross_page)
- movdqu (%rdi), %xmm0
+
+L(cross_page_continue):
+ movups (%rdi), %xmm1
pxor %xmm2, %xmm2
- movdqa %xmm0, %xmm3
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm3
- pmovmskb %xmm0, %ecx
- pmovmskb %xmm3, %edx
- testq %rdx, %rdx
- je L(next_48_bytes)
- leaq -1(%rdx), %rax
- xorq %rdx, %rax
- andq %rcx, %rax
- je L(exit)
- bsrq %rax, %rax
+ PCMPEQ %xmm1, %xmm2
+ pmovmskb %xmm2, %ecx
+ testl %ecx, %ecx
+ jz L(aligned_more)
+
+ PCMPEQ %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+ leal -1(%rcx), %edx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(ret0)
+ bsrl %eax, %eax
addq %rdi, %rax
+ /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
+ search CHAR is zero we are correct. Either way `andq
+ -CHAR_SIZE, %rax` gets the correct result. */
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+L(ret0):
ret
+ /* Returns for first vec x1/x2 have hard coded backward search
+ path for earlier matches. */
.p2align 4
-L(next_48_bytes):
- movdqu 16(%rdi), %xmm4
- movdqa %xmm4, %xmm5
- movdqu 32(%rdi), %xmm3
- pcmpeqb %xmm1, %xmm4
- pcmpeqb %xmm2, %xmm5
- movdqu 48(%rdi), %xmm0
- pmovmskb %xmm5, %edx
- movdqa %xmm3, %xmm5
- pcmpeqb %xmm1, %xmm3
- pcmpeqb %xmm2, %xmm5
- pcmpeqb %xmm0, %xmm2
- salq $16, %rdx
- pmovmskb %xmm3, %r8d
- pmovmskb %xmm5, %eax
- pmovmskb %xmm2, %esi
- salq $32, %r8
- salq $32, %rax
- pcmpeqb %xmm1, %xmm0
- orq %rdx, %rax
- movq %rsi, %rdx
- pmovmskb %xmm4, %esi
- salq $48, %rdx
- salq $16, %rsi
- orq %r8, %rsi
- orq %rcx, %rsi
- pmovmskb %xmm0, %ecx
- salq $48, %rcx
- orq %rcx, %rsi
- orq %rdx, %rax
- je L(loop_header2)
- leaq -1(%rax), %rcx
- xorq %rax, %rcx
- andq %rcx, %rsi
- je L(exit)
- bsrq %rsi, %rsi
- leaq (%rdi,%rsi), %rax
+L(first_vec_x0_test):
+ PCMPEQ %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+ testl %eax, %eax
+ jz L(ret0)
+ bsrl %eax, %eax
+ addq %r8, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
ret
.p2align 4
-L(loop_header2):
- testq %rsi, %rsi
- movq %rdi, %rcx
- je L(no_c_found)
-L(loop_header):
- addq $64, %rdi
- pxor %xmm7, %xmm7
- andq $-64, %rdi
- jmp L(loop_entry)
+L(first_vec_x1):
+ PCMPEQ %xmm0, %xmm2
+ pmovmskb %xmm2, %eax
+ leal -1(%rcx), %edx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(first_vec_x0_test)
+ bsrl %eax, %eax
+ leaq (VEC_SIZE)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+ ret
.p2align 4
-L(loop64):
- testq %rdx, %rdx
- cmovne %rdx, %rsi
- cmovne %rdi, %rcx
- addq $64, %rdi
-L(loop_entry):
- movdqa 32(%rdi), %xmm3
- pxor %xmm6, %xmm6
- movdqa 48(%rdi), %xmm2
- movdqa %xmm3, %xmm0
- movdqa 16(%rdi), %xmm4
- pminub %xmm2, %xmm0
- movdqa (%rdi), %xmm5
- pminub %xmm4, %xmm0
- pminub %xmm5, %xmm0
- pcmpeqb %xmm7, %xmm0
- pmovmskb %xmm0, %eax
- movdqa %xmm5, %xmm0
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %r9d
- movdqa %xmm4, %xmm0
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %edx
- movdqa %xmm3, %xmm0
- pcmpeqb %xmm1, %xmm0
- salq $16, %rdx
- pmovmskb %xmm0, %r10d
- movdqa %xmm2, %xmm0
- pcmpeqb %xmm1, %xmm0
- salq $32, %r10
- orq %r10, %rdx
- pmovmskb %xmm0, %r8d
- orq %r9, %rdx
- salq $48, %r8
- orq %r8, %rdx
+L(first_vec_x1_test):
+ PCMPEQ %xmm0, %xmm2
+ pmovmskb %xmm2, %eax
testl %eax, %eax
- je L(loop64)
- pcmpeqb %xmm6, %xmm4
- pcmpeqb %xmm6, %xmm3
- pcmpeqb %xmm6, %xmm5
- pmovmskb %xmm4, %eax
- pmovmskb %xmm3, %r10d
- pcmpeqb %xmm6, %xmm2
- pmovmskb %xmm5, %r9d
- salq $32, %r10
- salq $16, %rax
- pmovmskb %xmm2, %r8d
- orq %r10, %rax
- orq %r9, %rax
- salq $48, %r8
- orq %r8, %rax
- leaq -1(%rax), %r8
- xorq %rax, %r8
- andq %r8, %rdx
- cmovne %rdi, %rcx
- cmovne %rdx, %rsi
- bsrq %rsi, %rsi
- leaq (%rcx,%rsi), %rax
+ jz L(first_vec_x0_test)
+ bsrl %eax, %eax
+ leaq (VEC_SIZE)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+ ret
+
+ .p2align 4
+L(first_vec_x2):
+ PCMPEQ %xmm0, %xmm3
+ pmovmskb %xmm3, %eax
+ leal -1(%rcx), %edx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(first_vec_x1_test)
+ bsrl %eax, %eax
+ leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+ ret
+
+ .p2align 4
+L(aligned_more):
+ /* Save original pointer if match was in VEC 0. */
+ movq %rdi, %r8
+ andq $-VEC_SIZE, %rdi
+
+ movaps VEC_SIZE(%rdi), %xmm2
+ pxor %xmm3, %xmm3
+ PCMPEQ %xmm2, %xmm3
+ pmovmskb %xmm3, %ecx
+ testl %ecx, %ecx
+ jnz L(first_vec_x1)
+
+ movaps (VEC_SIZE * 2)(%rdi), %xmm3
+ pxor %xmm4, %xmm4
+ PCMPEQ %xmm3, %xmm4
+ pmovmskb %xmm4, %ecx
+ testl %ecx, %ecx
+ jnz L(first_vec_x2)
+
+ addq $VEC_SIZE, %rdi
+ /* Save pointer again before realigning. */
+ movq %rdi, %rsi
+ andq $-(VEC_SIZE * 2), %rdi
+ .p2align 4
+L(first_loop):
+ /* Do 2x VEC at a time. */
+ movaps (VEC_SIZE * 2)(%rdi), %xmm4
+ movaps (VEC_SIZE * 3)(%rdi), %xmm5
+ /* Since SSE2 no pminud so wcsrchr needs seperate logic for
+ detecting zero. Note if this is found to be a bottleneck it
+ may be worth adding an SSE4.1 wcsrchr implementation. */
+#ifdef USE_AS_WCSRCHR
+ movaps %xmm5, %xmm6
+ pxor %xmm8, %xmm8
+
+ PCMPEQ %xmm8, %xmm5
+ PCMPEQ %xmm4, %xmm8
+ por %xmm5, %xmm8
+#else
+ movaps %xmm5, %xmm6
+ PMINU %xmm4, %xmm5
+#endif
+
+ movaps %xmm4, %xmm9
+ PCMPEQ %xmm0, %xmm4
+ PCMPEQ %xmm0, %xmm6
+ movaps %xmm6, %xmm7
+ por %xmm4, %xmm6
+#ifndef USE_AS_WCSRCHR
+ pxor %xmm8, %xmm8
+ PCMPEQ %xmm5, %xmm8
+#endif
+ pmovmskb %xmm8, %ecx
+ pmovmskb %xmm6, %eax
+
+ addq $(VEC_SIZE * 2), %rdi
+ /* Use `addl` 1) so we can undo it with `subl` and 2) it can
+ macro-fuse with `jz`. */
+ addl %ecx, %eax
+ jz L(first_loop)
+
+ /* Check if there is zero match. */
+ testl %ecx, %ecx
+ jz L(second_loop_match)
+
+ /* Check if there was a match in last iteration. */
+ subl %ecx, %eax
+ jnz L(new_match)
+
+L(first_loop_old_match):
+ PCMPEQ %xmm0, %xmm2
+ PCMPEQ %xmm0, %xmm3
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm3, %eax
+ addl %eax, %ecx
+ jz L(first_vec_x0_test)
+ /* NB: We could move this shift to before the branch and save a
+ bit of code size / performance on the fall through. The
+ branch leads to the null case which generally seems hotter
+ than char in first 3x VEC. */
+ sall $16, %eax
+ orl %ecx, %eax
+
+ bsrl %eax, %eax
+ addq %rsi, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+ ret
+
+ .p2align 4
+L(new_match):
+ pxor %xmm6, %xmm6
+ PCMPEQ %xmm9, %xmm6
+ pmovmskb %xmm6, %eax
+ sall $16, %ecx
+ orl %eax, %ecx
+
+ /* We can't reuse either of the old comparisons as since we mask
+ of zeros after first zero (instead of using the full
+ comparison) we can't gurantee no interference between match
+ after end of string and valid match. */
+ pmovmskb %xmm4, %eax
+ pmovmskb %xmm7, %edx
+ sall $16, %edx
+ orl %edx, %eax
+
+ leal -1(%ecx), %edx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(first_loop_old_match)
+ bsrl %eax, %eax
+ addq %rdi, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
ret
+ /* Save minimum state for getting most recent match. We can
+ throw out all previous work. */
.p2align 4
-L(no_c_found):
- movl $1, %esi
- xorl %ecx, %ecx
- jmp L(loop_header)
+L(second_loop_match):
+ movq %rdi, %rsi
+ movaps %xmm4, %xmm2
+ movaps %xmm7, %xmm3
.p2align 4
-L(exit):
- xorl %eax, %eax
+L(second_loop):
+ movaps (VEC_SIZE * 2)(%rdi), %xmm4
+ movaps (VEC_SIZE * 3)(%rdi), %xmm5
+ /* Since SSE2 no pminud so wcsrchr needs seperate logic for
+ detecting zero. Note if this is found to be a bottleneck it
+ may be worth adding an SSE4.1 wcsrchr implementation. */
+#ifdef USE_AS_WCSRCHR
+ movaps %xmm5, %xmm6
+ pxor %xmm8, %xmm8
+
+ PCMPEQ %xmm8, %xmm5
+ PCMPEQ %xmm4, %xmm8
+ por %xmm5, %xmm8
+#else
+ movaps %xmm5, %xmm6
+ PMINU %xmm4, %xmm5
+#endif
+
+ movaps %xmm4, %xmm9
+ PCMPEQ %xmm0, %xmm4
+ PCMPEQ %xmm0, %xmm6
+ movaps %xmm6, %xmm7
+ por %xmm4, %xmm6
+#ifndef USE_AS_WCSRCHR
+ pxor %xmm8, %xmm8
+ PCMPEQ %xmm5, %xmm8
+#endif
+
+ pmovmskb %xmm8, %ecx
+ pmovmskb %xmm6, %eax
+
+ addq $(VEC_SIZE * 2), %rdi
+ /* Either null term or new occurence of CHAR. */
+ addl %ecx, %eax
+ jz L(second_loop)
+
+ /* No null term so much be new occurence of CHAR. */
+ testl %ecx, %ecx
+ jz L(second_loop_match)
+
+
+ subl %ecx, %eax
+ jnz L(second_loop_new_match)
+
+L(second_loop_old_match):
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm3, %eax
+ sall $16, %eax
+ orl %ecx, %eax
+ bsrl %eax, %eax
+ addq %rsi, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
ret
.p2align 4
+L(second_loop_new_match):
+ pxor %xmm6, %xmm6
+ PCMPEQ %xmm9, %xmm6
+ pmovmskb %xmm6, %eax
+ sall $16, %ecx
+ orl %eax, %ecx
+
+ /* We can't reuse either of the old comparisons as since we mask
+ of zeros after first zero (instead of using the full
+ comparison) we can't gurantee no interference between match
+ after end of string and valid match. */
+ pmovmskb %xmm4, %eax
+ pmovmskb %xmm7, %edx
+ sall $16, %edx
+ orl %edx, %eax
+
+ leal -1(%ecx), %edx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(second_loop_old_match)
+ bsrl %eax, %eax
+ addq %rdi, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+ ret
+
+ .p2align 4,, 4
L(cross_page):
- movq %rdi, %rax
- pxor %xmm0, %xmm0
- andq $-64, %rax
- movdqu (%rax), %xmm5
- movdqa %xmm5, %xmm6
- movdqu 16(%rax), %xmm4
- pcmpeqb %xmm1, %xmm5
- pcmpeqb %xmm0, %xmm6
- movdqu 32(%rax), %xmm3
- pmovmskb %xmm6, %esi
- movdqa %xmm4, %xmm6
- movdqu 48(%rax), %xmm2
- pcmpeqb %xmm1, %xmm4
- pcmpeqb %xmm0, %xmm6
- pmovmskb %xmm6, %edx
- movdqa %xmm3, %xmm6
- pcmpeqb %xmm1, %xmm3
- pcmpeqb %xmm0, %xmm6
- pcmpeqb %xmm2, %xmm0
- salq $16, %rdx
- pmovmskb %xmm3, %r9d
- pmovmskb %xmm6, %r8d
- pmovmskb %xmm0, %ecx
- salq $32, %r9
- salq $32, %r8
- pcmpeqb %xmm1, %xmm2
- orq %r8, %rdx
- salq $48, %rcx
- pmovmskb %xmm5, %r8d
- orq %rsi, %rdx
- pmovmskb %xmm4, %esi
- orq %rcx, %rdx
- pmovmskb %xmm2, %ecx
- salq $16, %rsi
- salq $48, %rcx
- orq %r9, %rsi
- orq %r8, %rsi
- orq %rcx, %rsi
+ movq %rdi, %rsi
+ andq $-VEC_SIZE, %rsi
+ movaps (%rsi), %xmm1
+ pxor %xmm2, %xmm2
+ PCMPEQ %xmm1, %xmm2
+ pmovmskb %xmm2, %edx
movl %edi, %ecx
- subl %eax, %ecx
- shrq %cl, %rdx
- shrq %cl, %rsi
- testq %rdx, %rdx
- je L(loop_header2)
- leaq -1(%rdx), %rax
- xorq %rdx, %rax
- andq %rax, %rsi
- je L(exit)
- bsrq %rsi, %rax
+ andl $(VEC_SIZE - 1), %ecx
+ sarl %cl, %edx
+ jz L(cross_page_continue)
+ PCMPEQ %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+ sarl %cl, %eax
+ leal -1(%rdx), %ecx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(ret1)
+ bsrl %eax, %eax
addq %rdi, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+L(ret1):
ret
-END (strrchr)
+END(STRRCHR)
-weak_alias (strrchr, rindex)
-libc_hidden_builtin_def (strrchr)
+#ifndef USE_AS_WCSRCHR
+ weak_alias (STRRCHR, rindex)
+ libc_hidden_builtin_def (STRRCHR)
+#endif
diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
index 2f388537..ae3cfa7d 100644
--- a/sysdeps/x86_64/wcsrchr.S
+++ b/sysdeps/x86_64/wcsrchr.S
@@ -17,266 +17,12 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#include <sysdep.h>
- .text
-ENTRY (wcsrchr)
+#define USE_AS_WCSRCHR 1
+#define NO_PMINU 1
- movd %rsi, %xmm1
- mov %rdi, %rcx
- punpckldq %xmm1, %xmm1
- pxor %xmm2, %xmm2
- punpckldq %xmm1, %xmm1
- and $63, %rcx
- cmp $48, %rcx
- ja L(crosscache)
+#ifndef STRRCHR
+# define STRRCHR wcsrchr
+#endif
- movdqu (%rdi), %xmm0
- pcmpeqd %xmm0, %xmm2
- pcmpeqd %xmm1, %xmm0
- pmovmskb %xmm2, %rcx
- pmovmskb %xmm0, %rax
- add $16, %rdi
-
- test %rax, %rax
- jnz L(unaligned_match1)
-
- test %rcx, %rcx
- jnz L(return_null)
-
- and $-16, %rdi
- xor %r8, %r8
- jmp L(loop)
-
- .p2align 4
-L(unaligned_match1):
- test %rcx, %rcx
- jnz L(prolog_find_zero_1)
-
- mov %rax, %r8
- mov %rdi, %rsi
- and $-16, %rdi
- jmp L(loop)
-
- .p2align 4
-L(crosscache):
- and $15, %rcx
- and $-16, %rdi
- pxor %xmm3, %xmm3
- movdqa (%rdi), %xmm0
- pcmpeqd %xmm0, %xmm3
- pcmpeqd %xmm1, %xmm0
- pmovmskb %xmm3, %rdx
- pmovmskb %xmm0, %rax
- shr %cl, %rdx
- shr %cl, %rax
- add $16, %rdi
-
- test %rax, %rax
- jnz L(unaligned_match)
-
- test %rdx, %rdx
- jnz L(return_null)
-
- xor %r8, %r8
- jmp L(loop)
-
- .p2align 4
-L(unaligned_match):
- test %rdx, %rdx
- jnz L(prolog_find_zero)
-
- mov %rax, %r8
- lea (%rdi, %rcx), %rsi
-
-/* Loop start on aligned string. */
- .p2align 4
-L(loop):
- movdqa (%rdi), %xmm0
- pcmpeqd %xmm0, %xmm2
- add $16, %rdi
- pcmpeqd %xmm1, %xmm0
- pmovmskb %xmm2, %rcx
- pmovmskb %xmm0, %rax
- or %rax, %rcx
- jnz L(matches)
-
- movdqa (%rdi), %xmm3
- pcmpeqd %xmm3, %xmm2
- add $16, %rdi
- pcmpeqd %xmm1, %xmm3
- pmovmskb %xmm2, %rcx
- pmovmskb %xmm3, %rax
- or %rax, %rcx
- jnz L(matches)
-
- movdqa (%rdi), %xmm4
- pcmpeqd %xmm4, %xmm2
- add $16, %rdi
- pcmpeqd %xmm1, %xmm4
- pmovmskb %xmm2, %rcx
- pmovmskb %xmm4, %rax
- or %rax, %rcx
- jnz L(matches)
-
- movdqa (%rdi), %xmm5
- pcmpeqd %xmm5, %xmm2
- add $16, %rdi
- pcmpeqd %xmm1, %xmm5
- pmovmskb %xmm2, %rcx
- pmovmskb %xmm5, %rax
- or %rax, %rcx
- jz L(loop)
-
- .p2align 4
-L(matches):
- test %rax, %rax
- jnz L(match)
-L(return_value):
- test %r8, %r8
- jz L(return_null)
- mov %r8, %rax
- mov %rsi, %rdi
-
- test $15 << 4, %ah
- jnz L(match_fourth_wchar)
- test %ah, %ah
- jnz L(match_third_wchar)
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(match):
- pmovmskb %xmm2, %rcx
- test %rcx, %rcx
- jnz L(find_zero)
- mov %rax, %r8
- mov %rdi, %rsi
- jmp L(loop)
-
- .p2align 4
-L(find_zero):
- test $15, %cl
- jnz L(find_zero_in_first_wchar)
- test %cl, %cl
- jnz L(find_zero_in_second_wchar)
- test $15, %ch
- jnz L(find_zero_in_third_wchar)
-
- and $1 << 13 - 1, %rax
- jz L(return_value)
-
- test $15 << 4, %ah
- jnz L(match_fourth_wchar)
- test %ah, %ah
- jnz L(match_third_wchar)
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(find_zero_in_first_wchar):
- test $1, %rax
- jz L(return_value)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(find_zero_in_second_wchar):
- and $1 << 5 - 1, %rax
- jz L(return_value)
-
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(find_zero_in_third_wchar):
- and $1 << 9 - 1, %rax
- jz L(return_value)
-
- test %ah, %ah
- jnz L(match_third_wchar)
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(prolog_find_zero):
- add %rcx, %rdi
- mov %rdx, %rcx
-L(prolog_find_zero_1):
- test $15, %cl
- jnz L(prolog_find_zero_in_first_wchar)
- test %cl, %cl
- jnz L(prolog_find_zero_in_second_wchar)
- test $15, %ch
- jnz L(prolog_find_zero_in_third_wchar)
-
- and $1 << 13 - 1, %rax
- jz L(return_null)
-
- test $15 << 4, %ah
- jnz L(match_fourth_wchar)
- test %ah, %ah
- jnz L(match_third_wchar)
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(prolog_find_zero_in_first_wchar):
- test $1, %rax
- jz L(return_null)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(prolog_find_zero_in_second_wchar):
- and $1 << 5 - 1, %rax
- jz L(return_null)
-
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(prolog_find_zero_in_third_wchar):
- and $1 << 9 - 1, %rax
- jz L(return_null)
-
- test %ah, %ah
- jnz L(match_third_wchar)
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(match_second_wchar):
- lea -12(%rdi), %rax
- ret
-
- .p2align 4
-L(match_third_wchar):
- lea -8(%rdi), %rax
- ret
-
- .p2align 4
-L(match_fourth_wchar):
- lea -4(%rdi), %rax
- ret
-
- .p2align 4
-L(return_null):
- xor %rax, %rax
- ret
-
-END (wcsrchr)
+#include "../strrchr.S"
--
GitLab

View File

@ -1,501 +0,0 @@
From df7e295d18ffa34f629578c0017a9881af7620f6 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Thu, 21 Apr 2022 20:52:29 -0500
Subject: [PATCH] x86: Optimize {str|wcs}rchr-avx2
Content-type: text/plain; charset=UTF-8
The new code unrolls the main loop slightly without adding too much
overhead and minimizes the comparisons for the search CHAR.
Geometric Mean of all benchmarks New / Old: 0.832
See email for all results.
Full xcheck passes on x86_64 with and without multiarch enabled.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/multiarch/strrchr-avx2.S | 426 +++++++++++++++---------
1 file changed, 269 insertions(+), 157 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
index c949410b..3d26fad4 100644
--- a/sysdeps/x86_64/multiarch/strrchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S
@@ -27,9 +27,13 @@
# ifdef USE_AS_WCSRCHR
# define VPBROADCAST vpbroadcastd
# define VPCMPEQ vpcmpeqd
+# define VPMIN vpminud
+# define CHAR_SIZE 4
# else
# define VPBROADCAST vpbroadcastb
# define VPCMPEQ vpcmpeqb
+# define VPMIN vpminub
+# define CHAR_SIZE 1
# endif
# ifndef VZEROUPPER
@@ -41,196 +45,304 @@
# endif
# define VEC_SIZE 32
+# define PAGE_SIZE 4096
- .section SECTION(.text),"ax",@progbits
-ENTRY (STRRCHR)
- movd %esi, %xmm4
- movl %edi, %ecx
+ .section SECTION(.text), "ax", @progbits
+ENTRY(STRRCHR)
+ movd %esi, %xmm7
+ movl %edi, %eax
/* Broadcast CHAR to YMM4. */
- VPBROADCAST %xmm4, %ymm4
+ VPBROADCAST %xmm7, %ymm7
vpxor %xmm0, %xmm0, %xmm0
- /* Check if we may cross page boundary with one vector load. */
- andl $(2 * VEC_SIZE - 1), %ecx
- cmpl $VEC_SIZE, %ecx
- ja L(cros_page_boundary)
+ /* Shift here instead of `andl` to save code size (saves a fetch
+ block). */
+ sall $20, %eax
+ cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax
+ ja L(cross_page)
+L(page_cross_continue):
vmovdqu (%rdi), %ymm1
- VPCMPEQ %ymm1, %ymm0, %ymm2
- VPCMPEQ %ymm1, %ymm4, %ymm3
- vpmovmskb %ymm2, %ecx
- vpmovmskb %ymm3, %eax
- addq $VEC_SIZE, %rdi
+ /* Check end of string match. */
+ VPCMPEQ %ymm1, %ymm0, %ymm6
+ vpmovmskb %ymm6, %ecx
+ testl %ecx, %ecx
+ jz L(aligned_more)
+
+ /* Only check match with search CHAR if needed. */
+ VPCMPEQ %ymm1, %ymm7, %ymm1
+ vpmovmskb %ymm1, %eax
+ /* Check if match before first zero. */
+ blsmskl %ecx, %ecx
+ andl %ecx, %eax
+ jz L(ret0)
+ bsrl %eax, %eax
+ addq %rdi, %rax
+ /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
+ search CHAR is zero we are correct. Either way `andq
+ -CHAR_SIZE, %rax` gets the correct result. */
+# ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+# endif
+L(ret0):
+L(return_vzeroupper):
+ ZERO_UPPER_VEC_REGISTERS_RETURN
+
+ /* Returns for first vec x1/x2 have hard coded backward search
+ path for earlier matches. */
+ .p2align 4,, 10
+L(first_vec_x1):
+ VPCMPEQ %ymm2, %ymm7, %ymm6
+ vpmovmskb %ymm6, %eax
+ blsmskl %ecx, %ecx
+ andl %ecx, %eax
+ jnz L(first_vec_x1_return)
+
+ .p2align 4,, 4
+L(first_vec_x0_test):
+ VPCMPEQ %ymm1, %ymm7, %ymm6
+ vpmovmskb %ymm6, %eax
+ testl %eax, %eax
+ jz L(ret1)
+ bsrl %eax, %eax
+ addq %r8, %rax
+# ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+# endif
+L(ret1):
+ VZEROUPPER_RETURN
+ .p2align 4,, 10
+L(first_vec_x0_x1_test):
+ VPCMPEQ %ymm2, %ymm7, %ymm6
+ vpmovmskb %ymm6, %eax
+ /* Check ymm2 for search CHAR match. If no match then check ymm1
+ before returning. */
testl %eax, %eax
- jnz L(first_vec)
+ jz L(first_vec_x0_test)
+ .p2align 4,, 4
+L(first_vec_x1_return):
+ bsrl %eax, %eax
+ leaq 1(%rdi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+# endif
+ VZEROUPPER_RETURN
- testl %ecx, %ecx
- jnz L(return_null)
- andq $-VEC_SIZE, %rdi
- xorl %edx, %edx
- jmp L(aligned_loop)
+ .p2align 4,, 10
+L(first_vec_x2):
+ VPCMPEQ %ymm3, %ymm7, %ymm6
+ vpmovmskb %ymm6, %eax
+ blsmskl %ecx, %ecx
+ /* If no in-range search CHAR match in ymm3 then need to check
+ ymm1/ymm2 for an earlier match (we delay checking search
+ CHAR matches until needed). */
+ andl %ecx, %eax
+ jz L(first_vec_x0_x1_test)
+ bsrl %eax, %eax
+ leaq (VEC_SIZE + 1)(%rdi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+# endif
+ VZEROUPPER_RETURN
+
.p2align 4
-L(first_vec):
- /* Check if there is a nul CHAR. */
+L(aligned_more):
+ /* Save original pointer if match was in VEC 0. */
+ movq %rdi, %r8
+
+ /* Align src. */
+ orq $(VEC_SIZE - 1), %rdi
+ vmovdqu 1(%rdi), %ymm2
+ VPCMPEQ %ymm2, %ymm0, %ymm6
+ vpmovmskb %ymm6, %ecx
testl %ecx, %ecx
- jnz L(char_and_nul_in_first_vec)
+ jnz L(first_vec_x1)
- /* Remember the match and keep searching. */
- movl %eax, %edx
- movq %rdi, %rsi
- andq $-VEC_SIZE, %rdi
- jmp L(aligned_loop)
+ vmovdqu (VEC_SIZE + 1)(%rdi), %ymm3
+ VPCMPEQ %ymm3, %ymm0, %ymm6
+ vpmovmskb %ymm6, %ecx
+ testl %ecx, %ecx
+ jnz L(first_vec_x2)
+ /* Save pointer again before realigning. */
+ movq %rdi, %rsi
+ addq $(VEC_SIZE + 1), %rdi
+ andq $-(VEC_SIZE * 2), %rdi
.p2align 4
-L(cros_page_boundary):
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
- vmovdqa (%rdi), %ymm1
- VPCMPEQ %ymm1, %ymm0, %ymm2
- VPCMPEQ %ymm1, %ymm4, %ymm3
- vpmovmskb %ymm2, %edx
- vpmovmskb %ymm3, %eax
- shrl %cl, %edx
- shrl %cl, %eax
- addq $VEC_SIZE, %rdi
-
- /* Check if there is a CHAR. */
+L(first_aligned_loop):
+ /* Do 2x VEC at a time. Any more and the cost of finding the
+ match outweights loop benefit. */
+ vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4
+ vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5
+
+ VPCMPEQ %ymm4, %ymm7, %ymm6
+ VPMIN %ymm4, %ymm5, %ymm8
+ VPCMPEQ %ymm5, %ymm7, %ymm10
+ vpor %ymm6, %ymm10, %ymm5
+ VPCMPEQ %ymm8, %ymm0, %ymm8
+ vpor %ymm5, %ymm8, %ymm9
+
+ vpmovmskb %ymm9, %eax
+ addq $(VEC_SIZE * 2), %rdi
+ /* No zero or search CHAR. */
testl %eax, %eax
- jnz L(found_char)
-
- testl %edx, %edx
- jnz L(return_null)
+ jz L(first_aligned_loop)
- jmp L(aligned_loop)
-
- .p2align 4
-L(found_char):
- testl %edx, %edx
- jnz L(char_and_nul)
+ /* If no zero CHAR then go to second loop (this allows us to
+ throw away all prior work). */
+ vpmovmskb %ymm8, %ecx
+ testl %ecx, %ecx
+ jz L(second_aligned_loop_prep)
- /* Remember the match and keep searching. */
- movl %eax, %edx
- leaq (%rdi, %rcx), %rsi
+ /* Search char could be zero so we need to get the true match.
+ */
+ vpmovmskb %ymm5, %eax
+ testl %eax, %eax
+ jnz L(first_aligned_loop_return)
- .p2align 4
-L(aligned_loop):
- vmovdqa (%rdi), %ymm1
- VPCMPEQ %ymm1, %ymm0, %ymm2
- addq $VEC_SIZE, %rdi
- VPCMPEQ %ymm1, %ymm4, %ymm3
- vpmovmskb %ymm2, %ecx
- vpmovmskb %ymm3, %eax
- orl %eax, %ecx
- jnz L(char_nor_null)
-
- vmovdqa (%rdi), %ymm1
- VPCMPEQ %ymm1, %ymm0, %ymm2
- add $VEC_SIZE, %rdi
- VPCMPEQ %ymm1, %ymm4, %ymm3
- vpmovmskb %ymm2, %ecx
+ .p2align 4,, 4
+L(first_vec_x1_or_x2):
+ VPCMPEQ %ymm3, %ymm7, %ymm3
+ VPCMPEQ %ymm2, %ymm7, %ymm2
vpmovmskb %ymm3, %eax
- orl %eax, %ecx
- jnz L(char_nor_null)
-
- vmovdqa (%rdi), %ymm1
- VPCMPEQ %ymm1, %ymm0, %ymm2
- addq $VEC_SIZE, %rdi
- VPCMPEQ %ymm1, %ymm4, %ymm3
- vpmovmskb %ymm2, %ecx
- vpmovmskb %ymm3, %eax
- orl %eax, %ecx
- jnz L(char_nor_null)
-
- vmovdqa (%rdi), %ymm1
- VPCMPEQ %ymm1, %ymm0, %ymm2
- addq $VEC_SIZE, %rdi
- VPCMPEQ %ymm1, %ymm4, %ymm3
- vpmovmskb %ymm2, %ecx
- vpmovmskb %ymm3, %eax
- orl %eax, %ecx
- jz L(aligned_loop)
-
- .p2align 4
-L(char_nor_null):
- /* Find a CHAR or a nul CHAR in a loop. */
- testl %eax, %eax
- jnz L(match)
-L(return_value):
- testl %edx, %edx
- jz L(return_null)
- movl %edx, %eax
- movq %rsi, %rdi
+ vpmovmskb %ymm2, %edx
+ /* Use add for macro-fusion. */
+ addq %rax, %rdx
+ jz L(first_vec_x0_test)
+ /* NB: We could move this shift to before the branch and save a
+ bit of code size / performance on the fall through. The
+ branch leads to the null case which generally seems hotter
+ than char in first 3x VEC. */
+ salq $32, %rax
+ addq %rdx, %rax
+ bsrq %rax, %rax
+ leaq 1(%rsi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+# endif
+ VZEROUPPER_RETURN
+ .p2align 4,, 8
+L(first_aligned_loop_return):
+ VPCMPEQ %ymm4, %ymm0, %ymm4
+ vpmovmskb %ymm4, %edx
+ salq $32, %rcx
+ orq %rdx, %rcx
+
+ vpmovmskb %ymm10, %eax
+ vpmovmskb %ymm6, %edx
+ salq $32, %rax
+ orq %rdx, %rax
+ blsmskq %rcx, %rcx
+ andq %rcx, %rax
+ jz L(first_vec_x1_or_x2)
+
+ bsrq %rax, %rax
+ leaq -(VEC_SIZE * 2)(%rdi, %rax), %rax
# ifdef USE_AS_WCSRCHR
- /* Keep the first bit for each matching CHAR for bsr. */
- andl $0x11111111, %eax
+ andq $-CHAR_SIZE, %rax
# endif
- bsrl %eax, %eax
- leaq -VEC_SIZE(%rdi, %rax), %rax
-L(return_vzeroupper):
- ZERO_UPPER_VEC_REGISTERS_RETURN
+ VZEROUPPER_RETURN
+ /* Search char cannot be zero. */
.p2align 4
-L(match):
- /* Find a CHAR. Check if there is a nul CHAR. */
- vpmovmskb %ymm2, %ecx
- testl %ecx, %ecx
- jnz L(find_nul)
-
- /* Remember the match and keep searching. */
- movl %eax, %edx
+L(second_aligned_loop_set_furthest_match):
+ /* Save VEC and pointer from most recent match. */
+L(second_aligned_loop_prep):
movq %rdi, %rsi
- jmp L(aligned_loop)
+ vmovdqu %ymm6, %ymm2
+ vmovdqu %ymm10, %ymm3
.p2align 4
-L(find_nul):
-# ifdef USE_AS_WCSRCHR
- /* Keep the first bit for each matching CHAR for bsr. */
- andl $0x11111111, %ecx
- andl $0x11111111, %eax
-# endif
- /* Mask out any matching bits after the nul CHAR. */
- movl %ecx, %r8d
- subl $1, %r8d
- xorl %ecx, %r8d
- andl %r8d, %eax
+L(second_aligned_loop):
+ /* Search 2x at at time. */
+ vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4
+ vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5
+
+ VPCMPEQ %ymm4, %ymm7, %ymm6
+ VPMIN %ymm4, %ymm5, %ymm1
+ VPCMPEQ %ymm5, %ymm7, %ymm10
+ vpor %ymm6, %ymm10, %ymm5
+ VPCMPEQ %ymm1, %ymm0, %ymm1
+ vpor %ymm5, %ymm1, %ymm9
+
+ vpmovmskb %ymm9, %eax
+ addq $(VEC_SIZE * 2), %rdi
testl %eax, %eax
- /* If there is no CHAR here, return the remembered one. */
- jz L(return_value)
- bsrl %eax, %eax
- leaq -VEC_SIZE(%rdi, %rax), %rax
- VZEROUPPER_RETURN
-
- .p2align 4
-L(char_and_nul):
- /* Find both a CHAR and a nul CHAR. */
- addq %rcx, %rdi
- movl %edx, %ecx
-L(char_and_nul_in_first_vec):
-# ifdef USE_AS_WCSRCHR
- /* Keep the first bit for each matching CHAR for bsr. */
- andl $0x11111111, %ecx
- andl $0x11111111, %eax
-# endif
- /* Mask out any matching bits after the nul CHAR. */
- movl %ecx, %r8d
- subl $1, %r8d
- xorl %ecx, %r8d
- andl %r8d, %eax
+ jz L(second_aligned_loop)
+ vpmovmskb %ymm1, %ecx
+ testl %ecx, %ecx
+ jz L(second_aligned_loop_set_furthest_match)
+ vpmovmskb %ymm5, %eax
testl %eax, %eax
- /* Return null pointer if the nul CHAR comes first. */
- jz L(return_null)
- bsrl %eax, %eax
- leaq -VEC_SIZE(%rdi, %rax), %rax
+ jnz L(return_new_match)
+
+ /* This is the hot patch. We know CHAR is inbounds and that
+ ymm3/ymm2 have latest match. */
+ .p2align 4,, 4
+L(return_old_match):
+ vpmovmskb %ymm3, %eax
+ vpmovmskb %ymm2, %edx
+ salq $32, %rax
+ orq %rdx, %rax
+ bsrq %rax, %rax
+ /* Search char cannot be zero so safe to just use lea for
+ wcsrchr. */
+ leaq (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rsi, %rax), %rax
VZEROUPPER_RETURN
- .p2align 4
-L(return_null):
- xorl %eax, %eax
+ /* Last iteration also potentially has a match. */
+ .p2align 4,, 8
+L(return_new_match):
+ VPCMPEQ %ymm4, %ymm0, %ymm4
+ vpmovmskb %ymm4, %edx
+ salq $32, %rcx
+ orq %rdx, %rcx
+
+ vpmovmskb %ymm10, %eax
+ vpmovmskb %ymm6, %edx
+ salq $32, %rax
+ orq %rdx, %rax
+ blsmskq %rcx, %rcx
+ andq %rcx, %rax
+ jz L(return_old_match)
+ bsrq %rax, %rax
+ /* Search char cannot be zero so safe to just use lea for
+ wcsrchr. */
+ leaq (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rdi, %rax), %rax
VZEROUPPER_RETURN
-END (STRRCHR)
+ .p2align 4,, 4
+L(cross_page):
+ movq %rdi, %rsi
+ andq $-VEC_SIZE, %rsi
+ vmovdqu (%rsi), %ymm1
+ VPCMPEQ %ymm1, %ymm0, %ymm6
+ vpmovmskb %ymm6, %ecx
+ /* Shift out zero CHAR matches that are before the begining of
+ src (rdi). */
+ shrxl %edi, %ecx, %ecx
+ testl %ecx, %ecx
+ jz L(page_cross_continue)
+ VPCMPEQ %ymm1, %ymm7, %ymm1
+ vpmovmskb %ymm1, %eax
+
+ /* Shift out search CHAR matches that are before the begining of
+ src (rdi). */
+ shrxl %edi, %eax, %eax
+ blsmskl %ecx, %ecx
+ /* Check if any search CHAR match in range. */
+ andl %ecx, %eax
+ jz L(ret2)
+ bsrl %eax, %eax
+ addq %rdi, %rax
+# ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+# endif
+L(ret2):
+ VZEROUPPER_RETURN
+END(STRRCHR)
#endif
--
GitLab

View File

@ -1,558 +0,0 @@
From c966099cdc3e0fdf92f63eac09b22fa7e5f5f02d Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Thu, 21 Apr 2022 20:52:30 -0500
Subject: [PATCH] x86: Optimize {str|wcs}rchr-evex
Content-type: text/plain; charset=UTF-8
The new code unrolls the main loop slightly without adding too much
overhead and minimizes the comparisons for the search CHAR.
Geometric Mean of all benchmarks New / Old: 0.755
See email for all results.
Full xcheck passes on x86_64 with and without multiarch enabled.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/multiarch/strrchr-evex.S | 471 +++++++++++++++---------
1 file changed, 290 insertions(+), 181 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S
index f920b5a5..f5b6d755 100644
--- a/sysdeps/x86_64/multiarch/strrchr-evex.S
+++ b/sysdeps/x86_64/multiarch/strrchr-evex.S
@@ -24,242 +24,351 @@
# define STRRCHR __strrchr_evex
# endif
-# define VMOVU vmovdqu64
-# define VMOVA vmovdqa64
+# define VMOVU vmovdqu64
+# define VMOVA vmovdqa64
# ifdef USE_AS_WCSRCHR
+# define SHIFT_REG esi
+
+# define kunpck kunpckbw
+# define kmov_2x kmovd
+# define maskz_2x ecx
+# define maskm_2x eax
+# define CHAR_SIZE 4
+# define VPMIN vpminud
+# define VPTESTN vptestnmd
# define VPBROADCAST vpbroadcastd
-# define VPCMP vpcmpd
-# define SHIFT_REG r8d
+# define VPCMP vpcmpd
# else
+# define SHIFT_REG edi
+
+# define kunpck kunpckdq
+# define kmov_2x kmovq
+# define maskz_2x rcx
+# define maskm_2x rax
+
+# define CHAR_SIZE 1
+# define VPMIN vpminub
+# define VPTESTN vptestnmb
# define VPBROADCAST vpbroadcastb
-# define VPCMP vpcmpb
-# define SHIFT_REG ecx
+# define VPCMP vpcmpb
# endif
# define XMMZERO xmm16
# define YMMZERO ymm16
# define YMMMATCH ymm17
-# define YMM1 ymm18
+# define YMMSAVE ymm18
+
+# define YMM1 ymm19
+# define YMM2 ymm20
+# define YMM3 ymm21
+# define YMM4 ymm22
+# define YMM5 ymm23
+# define YMM6 ymm24
+# define YMM7 ymm25
+# define YMM8 ymm26
-# define VEC_SIZE 32
- .section .text.evex,"ax",@progbits
-ENTRY (STRRCHR)
- movl %edi, %ecx
+# define VEC_SIZE 32
+# define PAGE_SIZE 4096
+ .section .text.evex, "ax", @progbits
+ENTRY(STRRCHR)
+ movl %edi, %eax
/* Broadcast CHAR to YMMMATCH. */
VPBROADCAST %esi, %YMMMATCH
- vpxorq %XMMZERO, %XMMZERO, %XMMZERO
-
- /* Check if we may cross page boundary with one vector load. */
- andl $(2 * VEC_SIZE - 1), %ecx
- cmpl $VEC_SIZE, %ecx
- ja L(cros_page_boundary)
+ andl $(PAGE_SIZE - 1), %eax
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ jg L(cross_page_boundary)
+L(page_cross_continue):
VMOVU (%rdi), %YMM1
-
- /* Each bit in K0 represents a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM1, %k0
- /* Each bit in K1 represents a CHAR in YMM1. */
- VPCMP $0, %YMMMATCH, %YMM1, %k1
+ /* k0 has a 1 for each zero CHAR in YMM1. */
+ VPTESTN %YMM1, %YMM1, %k0
kmovd %k0, %ecx
- kmovd %k1, %eax
-
- addq $VEC_SIZE, %rdi
-
- testl %eax, %eax
- jnz L(first_vec)
-
testl %ecx, %ecx
- jnz L(return_null)
-
- andq $-VEC_SIZE, %rdi
- xorl %edx, %edx
- jmp L(aligned_loop)
-
- .p2align 4
-L(first_vec):
- /* Check if there is a null byte. */
- testl %ecx, %ecx
- jnz L(char_and_nul_in_first_vec)
-
- /* Remember the match and keep searching. */
- movl %eax, %edx
- movq %rdi, %rsi
- andq $-VEC_SIZE, %rdi
- jmp L(aligned_loop)
-
- .p2align 4
-L(cros_page_boundary):
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
+ jz L(aligned_more)
+ /* fallthrough: zero CHAR in first VEC. */
+ /* K1 has a 1 for each search CHAR match in YMM1. */
+ VPCMP $0, %YMMMATCH, %YMM1, %k1
+ kmovd %k1, %eax
+ /* Build mask up until first zero CHAR (used to mask of
+ potential search CHAR matches past the end of the string).
+ */
+ blsmskl %ecx, %ecx
+ andl %ecx, %eax
+ jz L(ret0)
+ /* Get last match (the `andl` removed any out of bounds
+ matches). */
+ bsrl %eax, %eax
# ifdef USE_AS_WCSRCHR
- /* NB: Divide shift count by 4 since each bit in K1 represent 4
- bytes. */
- movl %ecx, %SHIFT_REG
- sarl $2, %SHIFT_REG
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
+# else
+ addq %rdi, %rax
# endif
+L(ret0):
+ ret
- VMOVA (%rdi), %YMM1
-
- /* Each bit in K0 represents a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM1, %k0
- /* Each bit in K1 represents a CHAR in YMM1. */
+ /* Returns for first vec x1/x2/x3 have hard coded backward
+ search path for earlier matches. */
+ .p2align 4,, 6
+L(first_vec_x1):
+ VPCMP $0, %YMMMATCH, %YMM2, %k1
+ kmovd %k1, %eax
+ blsmskl %ecx, %ecx
+ /* eax non-zero if search CHAR in range. */
+ andl %ecx, %eax
+ jnz L(first_vec_x1_return)
+
+ /* fallthrough: no match in YMM2 then need to check for earlier
+ matches (in YMM1). */
+ .p2align 4,, 4
+L(first_vec_x0_test):
VPCMP $0, %YMMMATCH, %YMM1, %k1
- kmovd %k0, %edx
kmovd %k1, %eax
-
- shrxl %SHIFT_REG, %edx, %edx
- shrxl %SHIFT_REG, %eax, %eax
- addq $VEC_SIZE, %rdi
-
- /* Check if there is a CHAR. */
testl %eax, %eax
- jnz L(found_char)
-
- testl %edx, %edx
- jnz L(return_null)
-
- jmp L(aligned_loop)
-
- .p2align 4
-L(found_char):
- testl %edx, %edx
- jnz L(char_and_nul)
-
- /* Remember the match and keep searching. */
- movl %eax, %edx
- leaq (%rdi, %rcx), %rsi
+ jz L(ret1)
+ bsrl %eax, %eax
+# ifdef USE_AS_WCSRCHR
+ leaq (%rsi, %rax, CHAR_SIZE), %rax
+# else
+ addq %rsi, %rax
+# endif
+L(ret1):
+ ret
- .p2align 4
-L(aligned_loop):
- VMOVA (%rdi), %YMM1
- addq $VEC_SIZE, %rdi
+ .p2align 4,, 10
+L(first_vec_x1_or_x2):
+ VPCMP $0, %YMM3, %YMMMATCH, %k3
+ VPCMP $0, %YMM2, %YMMMATCH, %k2
+ /* K2 and K3 have 1 for any search CHAR match. Test if any
+ matches between either of them. Otherwise check YMM1. */
+ kortestd %k2, %k3
+ jz L(first_vec_x0_test)
+
+ /* Guranteed that YMM2 and YMM3 are within range so merge the
+ two bitmasks then get last result. */
+ kunpck %k2, %k3, %k3
+ kmovq %k3, %rax
+ bsrq %rax, %rax
+ leaq (VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax
+ ret
- /* Each bit in K0 represents a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM1, %k0
- /* Each bit in K1 represents a CHAR in YMM1. */
- VPCMP $0, %YMMMATCH, %YMM1, %k1
- kmovd %k0, %ecx
+ .p2align 4,, 6
+L(first_vec_x3):
+ VPCMP $0, %YMMMATCH, %YMM4, %k1
kmovd %k1, %eax
- orl %eax, %ecx
- jnz L(char_nor_null)
+ blsmskl %ecx, %ecx
+ /* If no search CHAR match in range check YMM1/YMM2/YMM3. */
+ andl %ecx, %eax
+ jz L(first_vec_x1_or_x2)
+ bsrl %eax, %eax
+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
- VMOVA (%rdi), %YMM1
- add $VEC_SIZE, %rdi
+ .p2align 4,, 6
+L(first_vec_x0_x1_test):
+ VPCMP $0, %YMMMATCH, %YMM2, %k1
+ kmovd %k1, %eax
+ /* Check YMM2 for last match first. If no match try YMM1. */
+ testl %eax, %eax
+ jz L(first_vec_x0_test)
+ .p2align 4,, 4
+L(first_vec_x1_return):
+ bsrl %eax, %eax
+ leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
- /* Each bit in K0 represents a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM1, %k0
- /* Each bit in K1 represents a CHAR in YMM1. */
- VPCMP $0, %YMMMATCH, %YMM1, %k1
- kmovd %k0, %ecx
+ .p2align 4,, 10
+L(first_vec_x2):
+ VPCMP $0, %YMMMATCH, %YMM3, %k1
kmovd %k1, %eax
- orl %eax, %ecx
- jnz L(char_nor_null)
+ blsmskl %ecx, %ecx
+ /* Check YMM3 for last match first. If no match try YMM2/YMM1.
+ */
+ andl %ecx, %eax
+ jz L(first_vec_x0_x1_test)
+ bsrl %eax, %eax
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
- VMOVA (%rdi), %YMM1
- addq $VEC_SIZE, %rdi
- /* Each bit in K0 represents a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM1, %k0
- /* Each bit in K1 represents a CHAR in YMM1. */
- VPCMP $0, %YMMMATCH, %YMM1, %k1
+ .p2align 4
+L(aligned_more):
+ /* Need to keep original pointer incase YMM1 has last match. */
+ movq %rdi, %rsi
+ andq $-VEC_SIZE, %rdi
+ VMOVU VEC_SIZE(%rdi), %YMM2
+ VPTESTN %YMM2, %YMM2, %k0
kmovd %k0, %ecx
- kmovd %k1, %eax
- orl %eax, %ecx
- jnz L(char_nor_null)
+ testl %ecx, %ecx
+ jnz L(first_vec_x1)
- VMOVA (%rdi), %YMM1
- addq $VEC_SIZE, %rdi
+ VMOVU (VEC_SIZE * 2)(%rdi), %YMM3
+ VPTESTN %YMM3, %YMM3, %k0
+ kmovd %k0, %ecx
+ testl %ecx, %ecx
+ jnz L(first_vec_x2)
- /* Each bit in K0 represents a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM1, %k0
- /* Each bit in K1 represents a CHAR in YMM1. */
- VPCMP $0, %YMMMATCH, %YMM1, %k1
+ VMOVU (VEC_SIZE * 3)(%rdi), %YMM4
+ VPTESTN %YMM4, %YMM4, %k0
kmovd %k0, %ecx
- kmovd %k1, %eax
- orl %eax, %ecx
- jz L(aligned_loop)
+ movq %rdi, %r8
+ testl %ecx, %ecx
+ jnz L(first_vec_x3)
+ andq $-(VEC_SIZE * 2), %rdi
.p2align 4
-L(char_nor_null):
- /* Find a CHAR or a null byte in a loop. */
+L(first_aligned_loop):
+ /* Preserve YMM1, YMM2, YMM3, and YMM4 until we can gurantee
+ they don't store a match. */
+ VMOVA (VEC_SIZE * 4)(%rdi), %YMM5
+ VMOVA (VEC_SIZE * 5)(%rdi), %YMM6
+
+ VPCMP $0, %YMM5, %YMMMATCH, %k2
+ vpxord %YMM6, %YMMMATCH, %YMM7
+
+ VPMIN %YMM5, %YMM6, %YMM8
+ VPMIN %YMM8, %YMM7, %YMM7
+
+ VPTESTN %YMM7, %YMM7, %k1
+ subq $(VEC_SIZE * -2), %rdi
+ kortestd %k1, %k2
+ jz L(first_aligned_loop)
+
+ VPCMP $0, %YMM6, %YMMMATCH, %k3
+ VPTESTN %YMM8, %YMM8, %k1
+ ktestd %k1, %k1
+ jz L(second_aligned_loop_prep)
+
+ kortestd %k2, %k3
+ jnz L(return_first_aligned_loop)
+
+ .p2align 4,, 6
+L(first_vec_x1_or_x2_or_x3):
+ VPCMP $0, %YMM4, %YMMMATCH, %k4
+ kmovd %k4, %eax
testl %eax, %eax
- jnz L(match)
-L(return_value):
- testl %edx, %edx
- jz L(return_null)
- movl %edx, %eax
- movq %rsi, %rdi
+ jz L(first_vec_x1_or_x2)
bsrl %eax, %eax
-# ifdef USE_AS_WCSRCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq -VEC_SIZE(%rdi, %rax, 4), %rax
-# else
- leaq -VEC_SIZE(%rdi, %rax), %rax
-# endif
+ leaq (VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax
ret
- .p2align 4
-L(match):
- /* Find a CHAR. Check if there is a null byte. */
- kmovd %k0, %ecx
- testl %ecx, %ecx
- jnz L(find_nul)
+ .p2align 4,, 8
+L(return_first_aligned_loop):
+ VPTESTN %YMM5, %YMM5, %k0
+ kunpck %k0, %k1, %k0
+ kmov_2x %k0, %maskz_2x
+
+ blsmsk %maskz_2x, %maskz_2x
+ kunpck %k2, %k3, %k3
+ kmov_2x %k3, %maskm_2x
+ and %maskz_2x, %maskm_2x
+ jz L(first_vec_x1_or_x2_or_x3)
- /* Remember the match and keep searching. */
- movl %eax, %edx
+ bsr %maskm_2x, %maskm_2x
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
+
+ .p2align 4
+ /* We can throw away the work done for the first 4x checks here
+ as we have a later match. This is the 'fast' path persay.
+ */
+L(second_aligned_loop_prep):
+L(second_aligned_loop_set_furthest_match):
movq %rdi, %rsi
- jmp L(aligned_loop)
+ kunpck %k2, %k3, %k4
.p2align 4
-L(find_nul):
- /* Mask out any matching bits after the null byte. */
- movl %ecx, %r8d
- subl $1, %r8d
- xorl %ecx, %r8d
- andl %r8d, %eax
- testl %eax, %eax
- /* If there is no CHAR here, return the remembered one. */
- jz L(return_value)
- bsrl %eax, %eax
+L(second_aligned_loop):
+ VMOVU (VEC_SIZE * 4)(%rdi), %YMM1
+ VMOVU (VEC_SIZE * 5)(%rdi), %YMM2
+
+ VPCMP $0, %YMM1, %YMMMATCH, %k2
+ vpxord %YMM2, %YMMMATCH, %YMM3
+
+ VPMIN %YMM1, %YMM2, %YMM4
+ VPMIN %YMM3, %YMM4, %YMM3
+
+ VPTESTN %YMM3, %YMM3, %k1
+ subq $(VEC_SIZE * -2), %rdi
+ kortestd %k1, %k2
+ jz L(second_aligned_loop)
+
+ VPCMP $0, %YMM2, %YMMMATCH, %k3
+ VPTESTN %YMM4, %YMM4, %k1
+ ktestd %k1, %k1
+ jz L(second_aligned_loop_set_furthest_match)
+
+ kortestd %k2, %k3
+ /* branch here because there is a significant advantage interms
+ of output dependency chance in using edx. */
+ jnz L(return_new_match)
+L(return_old_match):
+ kmovq %k4, %rax
+ bsrq %rax, %rax
+ leaq (VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %rax
+ ret
+
+L(return_new_match):
+ VPTESTN %YMM1, %YMM1, %k0
+ kunpck %k0, %k1, %k0
+ kmov_2x %k0, %maskz_2x
+
+ blsmsk %maskz_2x, %maskz_2x
+ kunpck %k2, %k3, %k3
+ kmov_2x %k3, %maskm_2x
+ and %maskz_2x, %maskm_2x
+ jz L(return_old_match)
+
+ bsr %maskm_2x, %maskm_2x
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
+
+L(cross_page_boundary):
+ /* eax contains all the page offset bits of src (rdi). `xor rdi,
+ rax` sets pointer will all page offset bits cleared so
+ offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
+ before page cross (guranteed to be safe to read). Doing this
+ as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
+ a bit of code size. */
+ xorq %rdi, %rax
+ VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %YMM1
+ VPTESTN %YMM1, %YMM1, %k0
+ kmovd %k0, %ecx
+
+ /* Shift out zero CHAR matches that are before the begining of
+ src (rdi). */
# ifdef USE_AS_WCSRCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq -VEC_SIZE(%rdi, %rax, 4), %rax
-# else
- leaq -VEC_SIZE(%rdi, %rax), %rax
+ movl %edi, %esi
+ andl $(VEC_SIZE - 1), %esi
+ shrl $2, %esi
# endif
- ret
+ shrxl %SHIFT_REG, %ecx, %ecx
- .p2align 4
-L(char_and_nul):
- /* Find both a CHAR and a null byte. */
- addq %rcx, %rdi
- movl %edx, %ecx
-L(char_and_nul_in_first_vec):
- /* Mask out any matching bits after the null byte. */
- movl %ecx, %r8d
- subl $1, %r8d
- xorl %ecx, %r8d
- andl %r8d, %eax
- testl %eax, %eax
- /* Return null pointer if the null byte comes first. */
- jz L(return_null)
+ testl %ecx, %ecx
+ jz L(page_cross_continue)
+
+ /* Found zero CHAR so need to test for search CHAR. */
+ VPCMP $0, %YMMMATCH, %YMM1, %k1
+ kmovd %k1, %eax
+ /* Shift out search CHAR matches that are before the begining of
+ src (rdi). */
+ shrxl %SHIFT_REG, %eax, %eax
+
+ /* Check if any search CHAR match in range. */
+ blsmskl %ecx, %ecx
+ andl %ecx, %eax
+ jz L(ret3)
bsrl %eax, %eax
# ifdef USE_AS_WCSRCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq -VEC_SIZE(%rdi, %rax, 4), %rax
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
# else
- leaq -VEC_SIZE(%rdi, %rax), %rax
+ addq %rdi, %rax
# endif
+L(ret3):
ret
- .p2align 4
-L(return_null):
- xorl %eax, %eax
- ret
-
-END (STRRCHR)
+END(STRRCHR)
#endif
--
GitLab

View File

@ -1,73 +0,0 @@
From 911c63a51c690dd1a97dfc587097277029baf00f Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Wed, 27 Apr 2022 15:13:02 -0500
Subject: [PATCH] sysdeps: Add 'get_fast_jitter' interace in fast-jitter.h
Content-type: text/plain; charset=UTF-8
'get_fast_jitter' is meant to be used purely for performance
purposes. In all cases it's used it should be acceptable to get no
randomness (see default case). An example use case is in setting
jitter for retries between threads at a lock. There is a
performance benefit to having jitter, but only if the jitter can
be generated very quickly and ultimately there is no serious issue
if no jitter is generated.
The implementation generally uses 'HP_TIMING_NOW' iff it is
inlined (avoid any potential syscall paths).
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/generic/fast-jitter.h | 42 +++++++++++++++++++++++++++++++++++
1 file changed, 42 insertions(+)
create mode 100644 sysdeps/generic/fast-jitter.h
diff --git a/sysdeps/generic/fast-jitter.h b/sysdeps/generic/fast-jitter.h
new file mode 100644
index 00000000..4dd53e34
--- /dev/null
+++ b/sysdeps/generic/fast-jitter.h
@@ -0,0 +1,42 @@
+/* Fallback for fast jitter just return 0.
+ Copyright (C) 2019-2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef _FAST_JITTER_H
+# define _FAST_JITTER_H
+
+# include <stdint.h>
+# include <hp-timing.h>
+
+/* Baseline just return 0. We could create jitter using a clock or
+ 'random_bits' but that may imply a syscall and the goal of
+ 'get_fast_jitter' is minimal overhead "randomness" when such
+ randomness helps performance. Adding high overhead the function
+ defeats the purpose. */
+static inline uint32_t
+get_fast_jitter (void)
+{
+# if HP_TIMING_INLINE
+ hp_timing_t jitter;
+ HP_TIMING_NOW (jitter);
+ return (uint32_t) jitter;
+# else
+ return 0;
+# endif
+}
+
+#endif
--
GitLab

View File

@ -1,226 +0,0 @@
From 8162147872491bb5b48e91543b19c49a29ae6b6d Mon Sep 17 00:00:00 2001
From: Wangyang Guo <wangyang.guo@intel.com>
Date: Fri, 6 May 2022 01:50:10 +0000
Subject: [PATCH] nptl: Add backoff mechanism to spinlock loop
Content-type: text/plain; charset=UTF-8
When mutiple threads waiting for lock at the same time, once lock owner
releases the lock, waiters will see lock available and all try to lock,
which may cause an expensive CAS storm.
Binary exponential backoff with random jitter is introduced. As try-lock
attempt increases, there is more likely that a larger number threads
compete for adaptive mutex lock, so increase wait time in exponential.
A random jitter is also added to avoid synchronous try-lock from other
threads.
v2: Remove read-check before try-lock for performance.
v3:
1. Restore read-check since it works well in some platform.
2. Make backoff arch dependent, and enable it for x86_64.
3. Limit max backoff to reduce latency in large critical section.
v4: Fix strict-prototypes error in sysdeps/nptl/pthread_mutex_backoff.h
v5: Commit log updated for regression in large critical section.
Result of pthread-mutex-locks bench
Test Platform: Xeon 8280L (2 socket, 112 CPUs in total)
First Row: thread number
First Col: critical section length
Values: backoff vs upstream, time based, low is better
non-critical-length: 1
1 2 4 8 16 32 64 112 140
0 0.99 0.58 0.52 0.49 0.43 0.44 0.46 0.52 0.54
1 0.98 0.43 0.56 0.50 0.44 0.45 0.50 0.56 0.57
2 0.99 0.41 0.57 0.51 0.45 0.47 0.48 0.60 0.61
4 0.99 0.45 0.59 0.53 0.48 0.49 0.52 0.64 0.65
8 1.00 0.66 0.71 0.63 0.56 0.59 0.66 0.72 0.71
16 0.97 0.78 0.91 0.73 0.67 0.70 0.79 0.80 0.80
32 0.95 1.17 0.98 0.87 0.82 0.86 0.89 0.90 0.90
64 0.96 0.95 1.01 1.01 0.98 1.00 1.03 0.99 0.99
128 0.99 1.01 1.01 1.17 1.08 1.12 1.02 0.97 1.02
non-critical-length: 32
1 2 4 8 16 32 64 112 140
0 1.03 0.97 0.75 0.65 0.58 0.58 0.56 0.70 0.70
1 0.94 0.95 0.76 0.65 0.58 0.58 0.61 0.71 0.72
2 0.97 0.96 0.77 0.66 0.58 0.59 0.62 0.74 0.74
4 0.99 0.96 0.78 0.66 0.60 0.61 0.66 0.76 0.77
8 0.99 0.99 0.84 0.70 0.64 0.66 0.71 0.80 0.80
16 0.98 0.97 0.95 0.76 0.70 0.73 0.81 0.85 0.84
32 1.04 1.12 1.04 0.89 0.82 0.86 0.93 0.91 0.91
64 0.99 1.15 1.07 1.00 0.99 1.01 1.05 0.99 0.99
128 1.00 1.21 1.20 1.22 1.25 1.31 1.12 1.10 0.99
non-critical-length: 128
1 2 4 8 16 32 64 112 140
0 1.02 1.00 0.99 0.67 0.61 0.61 0.61 0.74 0.73
1 0.95 0.99 1.00 0.68 0.61 0.60 0.60 0.74 0.74
2 1.00 1.04 1.00 0.68 0.59 0.61 0.65 0.76 0.76
4 1.00 0.96 0.98 0.70 0.63 0.63 0.67 0.78 0.77
8 1.01 1.02 0.89 0.73 0.65 0.67 0.71 0.81 0.80
16 0.99 0.96 0.96 0.79 0.71 0.73 0.80 0.84 0.84
32 0.99 0.95 1.05 0.89 0.84 0.85 0.94 0.92 0.91
64 1.00 0.99 1.16 1.04 1.00 1.02 1.06 0.99 0.99
128 1.00 1.06 0.98 1.14 1.39 1.26 1.08 1.02 0.98
There is regression in large critical section. But adaptive mutex is
aimed for "quick" locks. Small critical section is more common when
users choose to use adaptive pthread_mutex.
Signed-off-by: Wangyang Guo <wangyang.guo@intel.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Conflicts:
pthreadP.h
(had been moved)
nptl/pthread_mutex_lock.c
(max_adaptive_count renamed)
---
nptl/pthreadP.h | 1 +
nptl/pthread_mutex_lock.c | 16 +++++++--
sysdeps/nptl/pthread_mutex_backoff.h | 35 ++++++++++++++++++
sysdeps/x86_64/nptl/pthread_mutex_backoff.h | 39 +++++++++++++++++++++
4 files changed, 89 insertions(+), 2 deletions(-)
create mode 100644 sysdeps/nptl/pthread_mutex_backoff.h
create mode 100644 sysdeps/x86_64/nptl/pthread_mutex_backoff.h
diff --git a/nptl/pthreadP.h b/nptl/pthreadP.h
index 7ddc166c..1550e3b6 100644
--- a/nptl/pthreadP.h
+++ b/nptl/pthreadP.h
@@ -33,6 +33,7 @@
#include <kernel-features.h>
#include <errno.h>
#include <internal-signals.h>
+#include <pthread_mutex_backoff.h>
/* Atomic operations on TLS memory. */
diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
index d96a9933..c7770fc9 100644
--- a/nptl/pthread_mutex_lock.c
+++ b/nptl/pthread_mutex_lock.c
@@ -133,14 +133,26 @@ __pthread_mutex_lock (pthread_mutex_t *mutex)
int cnt = 0;
int max_cnt = MIN (MAX_ADAPTIVE_COUNT,
mutex->__data.__spins * 2 + 10);
+ int spin_count, exp_backoff = 1;
+ unsigned int jitter = get_jitter ();
do
{
- if (cnt++ >= max_cnt)
+ /* In each loop, spin count is exponential backoff plus
+ random jitter, random range is [0, exp_backoff-1]. */
+ spin_count = exp_backoff + (jitter & (exp_backoff - 1));
+ cnt += spin_count;
+ if (cnt >= max_cnt)
{
+ /* If cnt exceeds max spin count, just go to wait
+ queue. */
LLL_MUTEX_LOCK (mutex);
break;
}
- atomic_spin_nop ();
+ do
+ atomic_spin_nop ();
+ while (--spin_count > 0);
+ /* Prepare for next loop. */
+ exp_backoff = get_next_backoff (exp_backoff);
}
while (LLL_MUTEX_READ_LOCK (mutex) != 0
|| LLL_MUTEX_TRYLOCK (mutex) != 0);
diff --git a/sysdeps/nptl/pthread_mutex_backoff.h b/sysdeps/nptl/pthread_mutex_backoff.h
new file mode 100644
index 00000000..5b26c22a
--- /dev/null
+++ b/sysdeps/nptl/pthread_mutex_backoff.h
@@ -0,0 +1,35 @@
+/* Pthread mutex backoff configuration.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+#ifndef _PTHREAD_MUTEX_BACKOFF_H
+#define _PTHREAD_MUTEX_BACKOFF_H 1
+
+static inline unsigned int
+get_jitter (void)
+{
+ /* Arch dependent random jitter, return 0 disables random. */
+ return 0;
+}
+
+static inline int
+get_next_backoff (int backoff)
+{
+ /* Next backoff, return 1 disables mutex backoff. */
+ return 1;
+}
+
+#endif
diff --git a/sysdeps/x86_64/nptl/pthread_mutex_backoff.h b/sysdeps/x86_64/nptl/pthread_mutex_backoff.h
new file mode 100644
index 00000000..ec74c3d9
--- /dev/null
+++ b/sysdeps/x86_64/nptl/pthread_mutex_backoff.h
@@ -0,0 +1,39 @@
+/* Pthread mutex backoff configuration.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+#ifndef _PTHREAD_MUTEX_BACKOFF_H
+#define _PTHREAD_MUTEX_BACKOFF_H 1
+
+#include <fast-jitter.h>
+
+static inline unsigned int
+get_jitter (void)
+{
+ return get_fast_jitter ();
+}
+
+#define MAX_BACKOFF 16
+
+static inline int
+get_next_backoff (int backoff)
+{
+ /* Binary expontial backoff. Limiting max backoff
+ can reduce latency in large critical section. */
+ return (backoff < MAX_BACKOFF) ? backoff << 1 : backoff;
+}
+
+#endif
--
GitLab

View File

@ -1,55 +0,0 @@
From c6272098323153db373f2986c67786ea8c85f1cf Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Tue, 15 Feb 2022 08:18:15 -0600
Subject: [PATCH] x86: Fallback {str|wcs}cmp RTM in the ncmp overflow case [BZ
#28896]
Content-type: text/plain; charset=UTF-8
In the overflow fallback strncmp-avx2-rtm and wcsncmp-avx2-rtm would
call strcmp-avx2 and wcscmp-avx2 respectively. This would have
not checks around vzeroupper and would trigger spurious
aborts. This commit fixes that.
test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass on
AVX2 machines with and without RTM.
Co-authored-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/multiarch/strcmp-avx2.S | 8 ++------
1 file changed, 2 insertions(+), 6 deletions(-)
Conflicts:
sysdeps/x86_64/multiarch/strcmp-avx2.S
(split into two patches due to upstream bug differences)
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
index 28cc98b6..e267c6cb 100644
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
@@ -345,10 +345,10 @@ L(one_or_less):
movq %LOCALE_REG, %rdx
# endif
jb L(ret_zero)
-# ifdef USE_AS_WCSCMP
/* 'nbe' covers the case where length is negative (large
unsigned). */
- jnbe __wcscmp_avx2
+ jnbe OVERFLOW_STRCMP
+# ifdef USE_AS_WCSCMP
movl (%rdi), %edx
xorl %eax, %eax
cmpl (%rsi), %edx
@@ -357,10 +357,6 @@ L(one_or_less):
negl %eax
orl $1, %eax
# else
- /* 'nbe' covers the case where length is negative (large
- unsigned). */
-
- jnbe __strcmp_avx2
movzbl (%rdi), %eax
movzbl (%rsi), %ecx
TOLOWER_gpr (%rax, %eax)
--
GitLab

View File

@ -1,60 +0,0 @@
From 259a17cc98058d2576511201f85d28cb5d9de2a2 Mon Sep 17 00:00:00 2001
From: Stefan Liebler <stli@linux.ibm.com>
Date: Mon, 28 Jun 2021 13:01:07 +0200
Subject: s390x: Update math: redirect roundeven function
After recent commit
447954a206837b5f153869cfeeeab44631c3fac9
"math: redirect roundeven function", building on
s390x fails with:
Error: symbol `__roundevenl' is already defined
Similar to aarch64/riscv fix, this patch redirects target
specific functions for s390x:
commit 3213ed770cbc5821920d16caa93c85e92dd7b9f6
"Update math: redirect roundeven function"
diff --git a/sysdeps/s390/fpu/s_roundeven.c b/sysdeps/s390/fpu/s_roundeven.c
index 40b07e054b..0773adfed0 100644
--- a/sysdeps/s390/fpu/s_roundeven.c
+++ b/sysdeps/s390/fpu/s_roundeven.c
@@ -18,6 +18,7 @@
<https://www.gnu.org/licenses/>. */
#ifdef HAVE_S390_MIN_Z196_ZARCH_ASM_SUPPORT
+# define NO_MATH_REDIRECT
# include <math.h>
# include <libm-alias-double.h>
@@ -31,7 +32,6 @@ __roundeven (double x)
__asm__ ("fidbra %0,4,%1,4" : "=f" (y) : "f" (x));
return y;
}
-hidden_def (__roundeven)
libm_alias_double (__roundeven, roundeven)
#else
diff --git a/sysdeps/s390/fpu/s_roundevenf.c b/sysdeps/s390/fpu/s_roundevenf.c
index d2fbf3d2b6..289785bc4a 100644
--- a/sysdeps/s390/fpu/s_roundevenf.c
+++ b/sysdeps/s390/fpu/s_roundevenf.c
@@ -18,6 +18,7 @@
<https://www.gnu.org/licenses/>. */
#ifdef HAVE_S390_MIN_Z196_ZARCH_ASM_SUPPORT
+# define NO_MATH_REDIRECT
# include <math.h>
# include <libm-alias-float.h>
diff --git a/sysdeps/s390/fpu/s_roundevenl.c b/sysdeps/s390/fpu/s_roundevenl.c
index 29ab7a8616..94b6459ab4 100644
--- a/sysdeps/s390/fpu/s_roundevenl.c
+++ b/sysdeps/s390/fpu/s_roundevenl.c
@@ -18,6 +18,7 @@
<https://www.gnu.org/licenses/>. */
#ifdef HAVE_S390_MIN_Z196_ZARCH_ASM_SUPPORT
+# define NO_MATH_REDIRECT
# include <math.h>
# include <math_private.h>
# include <libm-alias-ldouble.h>

View File

@ -1,74 +0,0 @@
From 1da50d4bda07f04135dca39f40e79fc9eabed1f8 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Fri, 26 Feb 2021 05:36:59 -0800
Subject: [PATCH] x86: Set Prefer_No_VZEROUPPER and add Prefer_AVX2_STRCMP
Content-type: text/plain; charset=UTF-8
1. Set Prefer_No_VZEROUPPER if RTM is usable to avoid RTM abort triggered
by VZEROUPPER inside a transactionally executing RTM region.
2. Since to compare 2 32-byte strings, 256-bit EVEX strcmp requires 2
loads, 3 VPCMPs and 2 KORDs while AVX2 strcmp requires 1 load, 2 VPCMPEQs,
1 VPMINU and 1 VPMOVMSKB, AVX2 strcmp is faster than EVEX strcmp. Add
Prefer_AVX2_STRCMP to prefer AVX2 strcmp family functions.
---
sysdeps/x86/cpu-features.c | 20 +++++++++++++++++--
sysdeps/x86/cpu-tunables.c | 2 ++
...cpu-features-preferred_feature_index_1.def | 1 +
3 files changed, 21 insertions(+), 2 deletions(-)
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index 91042505..3610ee5c 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -524,8 +524,24 @@ init_cpu_features (struct cpu_features *cpu_features)
cpu_features->preferred[index_arch_Prefer_No_VZEROUPPER]
|= bit_arch_Prefer_No_VZEROUPPER;
else
- cpu_features->preferred[index_arch_Prefer_No_AVX512]
- |= bit_arch_Prefer_No_AVX512;
+ {
+ cpu_features->preferred[index_arch_Prefer_No_AVX512]
+ |= bit_arch_Prefer_No_AVX512;
+
+ /* Avoid RTM abort triggered by VZEROUPPER inside a
+ transactionally executing RTM region. */
+ if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+ cpu_features->preferred[index_arch_Prefer_No_VZEROUPPER]
+ |= bit_arch_Prefer_No_VZEROUPPER;
+
+ /* Since to compare 2 32-byte strings, 256-bit EVEX strcmp
+ requires 2 loads, 3 VPCMPs and 2 KORDs while AVX2 strcmp
+ requires 1 load, 2 VPCMPEQs, 1 VPMINU and 1 VPMOVMSKB,
+ AVX2 strcmp is faster than EVEX strcmp. */
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
+ cpu_features->preferred[index_arch_Prefer_AVX2_STRCMP]
+ |= bit_arch_Prefer_AVX2_STRCMP;
+ }
}
/* This spells out "AuthenticAMD". */
else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
index 3173b2b9..73adbaba 100644
--- a/sysdeps/x86/cpu-tunables.c
+++ b/sysdeps/x86/cpu-tunables.c
@@ -239,6 +239,8 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
Fast_Copy_Backward,
disable, 18);
+ CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH
+ (n, cpu_features, Prefer_AVX2_STRCMP, AVX2, disable, 18);
}
break;
case 19:
diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
index 17a5cc42..4ca70b40 100644
--- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
+++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
@@ -32,3 +32,4 @@ BIT (Prefer_ERMS)
BIT (Prefer_FSRM)
BIT (Prefer_No_AVX512)
BIT (MathVec_Prefer_No_AVX512)
+BIT (Prefer_AVX2_STRCMP)
--
GitLab

View File

@ -1,26 +0,0 @@
From 3213ed770cbc5821920d16caa93c85e92dd7b9f6 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Wed, 23 Jun 2021 13:29:41 -0700
Subject: Update math: redirect roundeven function
Redirect target specific roundeven functions for aarch64, ldbl-128ibm
and riscv.
Conflicts:
sysdeps/aarch64/*
(not needed)
sysdeps/riscv/*
(not supported)
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c b/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c
index 6701970f4a..90eecf496b 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c
@@ -17,6 +17,7 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
+#define NO_MATH_REDIRECT
#include <math.h>
#include <math_private.h>

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,242 +0,0 @@
From 63ad43566f7a25d140dc723598aeb441ad657eed Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Fri, 5 Mar 2021 06:46:08 -0800
Subject: [PATCH] x86-64: Add memmove family functions with 256-bit EVEX
Content-type: text/plain; charset=UTF-8
Update ifunc-memmove.h to select the function optimized with 256-bit EVEX
instructions using YMM16-YMM31 registers to avoid RTM abort with usable
AVX512VL since VZEROUPPER isn't needed at function exit.
---
sysdeps/x86_64/multiarch/Makefile | 1 +
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 36 +++++++++++++++++++
sysdeps/x86_64/multiarch/ifunc-memmove.h | 21 +++++++++--
.../multiarch/memmove-evex-unaligned-erms.S | 33 +++++++++++++++++
.../multiarch/memmove-vec-unaligned-erms.S | 24 ++++++++-----
5 files changed, 104 insertions(+), 11 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 46783cd1..4563fc56 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -41,6 +41,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
memset-avx2-unaligned-erms \
memset-avx512-unaligned-erms \
memchr-evex \
+ memmove-evex-unaligned-erms \
memrchr-evex \
rawmemchr-evex \
stpcpy-evex \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 082e4da3..6bd3abfc 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -80,6 +80,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __memmove_chk,
CPU_FEATURE_USABLE (AVX),
__memmove_chk_avx_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __memmove_chk_evex_unaligned)
+ IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __memmove_chk_evex_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __memmove_chk,
CPU_FEATURE_USABLE (SSSE3),
__memmove_chk_ssse3_back)
@@ -102,6 +108,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memmove,
CPU_FEATURE_USABLE (AVX),
__memmove_avx_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, memmove,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __memmove_evex_unaligned)
+ IFUNC_IMPL_ADD (array, i, memmove,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __memmove_evex_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memmove,
CPU_FEATURE_USABLE (AVX512F),
__memmove_avx512_no_vzeroupper)
@@ -565,6 +577,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
CPU_FEATURE_USABLE (AVX),
__memcpy_chk_avx_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __memcpy_chk_evex_unaligned)
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __memcpy_chk_evex_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
CPU_FEATURE_USABLE (SSSE3),
__memcpy_chk_ssse3_back)
@@ -587,6 +605,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memcpy,
CPU_FEATURE_USABLE (AVX),
__memcpy_avx_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, memcpy,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __memcpy_evex_unaligned)
+ IFUNC_IMPL_ADD (array, i, memcpy,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __memcpy_evex_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
__memcpy_ssse3_back)
IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
@@ -623,6 +647,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
CPU_FEATURE_USABLE (AVX),
__mempcpy_chk_avx_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __mempcpy_chk_evex_unaligned)
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __mempcpy_chk_evex_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
CPU_FEATURE_USABLE (SSSE3),
__mempcpy_chk_ssse3_back)
@@ -654,6 +684,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, mempcpy,
CPU_FEATURE_USABLE (AVX),
__mempcpy_avx_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, mempcpy,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __mempcpy_evex_unaligned)
+ IFUNC_IMPL_ADD (array, i, mempcpy,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __mempcpy_evex_unaligned_erms)
IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
__mempcpy_ssse3_back)
IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
index 5e5f0299..6f8bce5f 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
@@ -29,6 +29,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms)
attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
+ attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
+ attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned)
attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms)
@@ -59,10 +63,21 @@ IFUNC_SELECTOR (void)
if (CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
{
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
- return OPTIMIZE (avx_unaligned_erms);
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
+ {
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ return OPTIMIZE (evex_unaligned_erms);
+
+ return OPTIMIZE (evex_unaligned);
+ }
+
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ {
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ return OPTIMIZE (avx_unaligned_erms);
- return OPTIMIZE (avx_unaligned);
+ return OPTIMIZE (avx_unaligned);
+ }
}
if (!CPU_FEATURE_USABLE_P (cpu_features, SSSE3)
diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
new file mode 100644
index 00000000..0cbce8f9
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
@@ -0,0 +1,33 @@
+#if IS_IN (libc)
+# define VEC_SIZE 32
+# define XMM0 xmm16
+# define XMM1 xmm17
+# define YMM0 ymm16
+# define YMM1 ymm17
+# define VEC0 ymm16
+# define VEC1 ymm17
+# define VEC2 ymm18
+# define VEC3 ymm19
+# define VEC4 ymm20
+# define VEC5 ymm21
+# define VEC6 ymm22
+# define VEC7 ymm23
+# define VEC8 ymm24
+# define VEC9 ymm25
+# define VEC10 ymm26
+# define VEC11 ymm27
+# define VEC12 ymm28
+# define VEC13 ymm29
+# define VEC14 ymm30
+# define VEC15 ymm31
+# define VEC(i) VEC##i
+# define VMOVNT vmovntdq
+# define VMOVU vmovdqu64
+# define VMOVA vmovdqa64
+# define VZEROUPPER
+
+# define SECTION(p) p##.evex
+# define MEMMOVE_SYMBOL(p,s) p##_evex_##s
+
+# include "memmove-vec-unaligned-erms.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 274aa1c7..08e21692 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -48,6 +48,14 @@
# define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif
+#ifndef XMM0
+# define XMM0 xmm0
+#endif
+
+#ifndef YMM0
+# define YMM0 ymm0
+#endif
+
#ifndef VZEROUPPER
# if VEC_SIZE > 16
# define VZEROUPPER vzeroupper
@@ -277,20 +285,20 @@ L(less_vec):
#if VEC_SIZE > 32
L(between_32_63):
/* From 32 to 63. No branch when size == 32. */
- vmovdqu (%rsi), %ymm0
- vmovdqu -32(%rsi,%rdx), %ymm1
- vmovdqu %ymm0, (%rdi)
- vmovdqu %ymm1, -32(%rdi,%rdx)
+ VMOVU (%rsi), %YMM0
+ VMOVU -32(%rsi,%rdx), %YMM1
+ VMOVU %YMM0, (%rdi)
+ VMOVU %YMM1, -32(%rdi,%rdx)
VZEROUPPER
ret
#endif
#if VEC_SIZE > 16
/* From 16 to 31. No branch when size == 16. */
L(between_16_31):
- vmovdqu (%rsi), %xmm0
- vmovdqu -16(%rsi,%rdx), %xmm1
- vmovdqu %xmm0, (%rdi)
- vmovdqu %xmm1, -16(%rdi,%rdx)
+ VMOVU (%rsi), %XMM0
+ VMOVU -16(%rsi,%rdx), %XMM1
+ VMOVU %XMM0, (%rdi)
+ VMOVU %XMM1, -16(%rdi,%rdx)
ret
#endif
L(between_8_15):
--
GitLab

View File

@ -1,254 +0,0 @@
From 1b968b6b9b3aac702ac2f133e0dd16cfdbb415ee Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Fri, 5 Mar 2021 07:15:03 -0800
Subject: [PATCH] x86-64: Add memset family functions with 256-bit EVEX
Content-type: text/plain; charset=UTF-8
Update ifunc-memset.h/ifunc-wmemset.h to select the function optimized
with 256-bit EVEX instructions using YMM16-YMM31 registers to avoid RTM
abort with usable AVX512VL and AVX512BW since VZEROUPPER isn't needed at
function exit.
---
sysdeps/x86_64/multiarch/Makefile | 1 +
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 22 +++++++++++++++++
sysdeps/x86_64/multiarch/ifunc-memset.h | 24 +++++++++++++++----
sysdeps/x86_64/multiarch/ifunc-wmemset.h | 13 ++++++----
.../multiarch/memset-evex-unaligned-erms.S | 24 +++++++++++++++++++
.../multiarch/memset-vec-unaligned-erms.S | 20 +++++++++++-----
6 files changed, 90 insertions(+), 14 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 4563fc56..1cc0a10e 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -43,6 +43,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
memchr-evex \
memmove-evex-unaligned-erms \
memrchr-evex \
+ memset-evex-unaligned-erms \
rawmemchr-evex \
stpcpy-evex \
stpncpy-evex \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 6bd3abfc..7cf83485 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -160,6 +160,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __memset_chk,
CPU_FEATURE_USABLE (AVX2),
__memset_chk_avx2_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, __memset_chk,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)),
+ __memset_chk_evex_unaligned)
+ IFUNC_IMPL_ADD (array, i, __memset_chk,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)),
+ __memset_chk_evex_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __memset_chk,
CPU_FEATURE_USABLE (AVX512F),
__memset_chk_avx512_unaligned_erms)
@@ -185,6 +193,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memset,
CPU_FEATURE_USABLE (AVX2),
__memset_avx2_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, memset,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)),
+ __memset_evex_unaligned)
+ IFUNC_IMPL_ADD (array, i, memset,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)),
+ __memset_evex_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memset,
CPU_FEATURE_USABLE (AVX512F),
__memset_avx512_unaligned_erms)
@@ -555,6 +571,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, wmemset,
CPU_FEATURE_USABLE (AVX2),
__wmemset_avx2_unaligned)
+ IFUNC_IMPL_ADD (array, i, wmemset,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __wmemset_evex_unaligned)
IFUNC_IMPL_ADD (array, i, wmemset,
CPU_FEATURE_USABLE (AVX512F),
__wmemset_avx512_unaligned))
@@ -723,6 +742,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __wmemset_chk,
CPU_FEATURE_USABLE (AVX2),
__wmemset_chk_avx2_unaligned)
+ IFUNC_IMPL_ADD (array, i, __wmemset_chk,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __wmemset_chk_evex_unaligned)
IFUNC_IMPL_ADD (array, i, __wmemset_chk,
CPU_FEATURE_USABLE (AVX512F),
__wmemset_chk_avx512_unaligned))
diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
index 708bd72e..6f31f4dc 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memset.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
@@ -27,6 +27,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms)
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms)
attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
+ attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
+ attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned)
attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms)
@@ -56,10 +60,22 @@ IFUNC_SELECTOR (void)
if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
{
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
- return OPTIMIZE (avx2_unaligned_erms);
- else
- return OPTIMIZE (avx2_unaligned);
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+ {
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ return OPTIMIZE (evex_unaligned_erms);
+
+ return OPTIMIZE (evex_unaligned);
+ }
+
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ {
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ return OPTIMIZE (avx2_unaligned_erms);
+
+ return OPTIMIZE (avx2_unaligned);
+ }
}
if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
index eb242210..9290c4bf 100644
--- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h
+++ b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
@@ -20,6 +20,7 @@
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden;
static inline void *
@@ -27,14 +28,18 @@ IFUNC_SELECTOR (void)
{
const struct cpu_features* cpu_features = __get_cpu_features ();
- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
- && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
{
if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
- && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
+ && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
+ && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
return OPTIMIZE (avx512_unaligned);
- else
+
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
+ return OPTIMIZE (evex_unaligned);
+
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
return OPTIMIZE (avx2_unaligned);
}
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
new file mode 100644
index 00000000..ae0a4d6e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
@@ -0,0 +1,24 @@
+#if IS_IN (libc)
+# define VEC_SIZE 32
+# define XMM0 xmm16
+# define YMM0 ymm16
+# define VEC0 ymm16
+# define VEC(i) VEC##i
+# define VMOVU vmovdqu64
+# define VMOVA vmovdqa64
+# define VZEROUPPER
+
+# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+ movq r, %rax; \
+ vpbroadcastb d, %VEC0
+
+# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+ movq r, %rax; \
+ vpbroadcastd d, %VEC0
+
+# define SECTION(p) p##.evex
+# define MEMSET_SYMBOL(p,s) p##_evex_##s
+# define WMEMSET_SYMBOL(p,s) p##_evex_##s
+
+# include "memset-vec-unaligned-erms.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 9a0fd818..71e91a8f 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -34,6 +34,14 @@
# define WMEMSET_CHK_SYMBOL(p,s) WMEMSET_SYMBOL(p, s)
#endif
+#ifndef XMM0
+# define XMM0 xmm0
+#endif
+
+#ifndef YMM0
+# define YMM0 ymm0
+#endif
+
#ifndef VZEROUPPER
# if VEC_SIZE > 16
# define VZEROUPPER vzeroupper
@@ -67,7 +75,7 @@
ENTRY (__bzero)
mov %RDI_LP, %RAX_LP /* Set return value. */
mov %RSI_LP, %RDX_LP /* Set n. */
- pxor %xmm0, %xmm0
+ pxor %XMM0, %XMM0
jmp L(entry_from_bzero)
END (__bzero)
weak_alias (__bzero, bzero)
@@ -223,7 +231,7 @@ L(less_vec):
cmpb $16, %dl
jae L(between_16_31)
# endif
- MOVQ %xmm0, %rcx
+ MOVQ %XMM0, %rcx
cmpb $8, %dl
jae L(between_8_15)
cmpb $4, %dl
@@ -238,16 +246,16 @@ L(less_vec):
# if VEC_SIZE > 32
/* From 32 to 63. No branch when size == 32. */
L(between_32_63):
- vmovdqu %ymm0, -32(%rdi,%rdx)
- vmovdqu %ymm0, (%rdi)
+ VMOVU %YMM0, -32(%rdi,%rdx)
+ VMOVU %YMM0, (%rdi)
VZEROUPPER
ret
# endif
# if VEC_SIZE > 16
/* From 16 to 31. No branch when size == 16. */
L(between_16_31):
- vmovdqu %xmm0, -16(%rdi,%rdx)
- vmovdqu %xmm0, (%rdi)
+ VMOVU %XMM0, -16(%rdi,%rdx)
+ VMOVU %XMM0, (%rdi)
VZEROUPPER
ret
# endif
--
GitLab

View File

@ -1,561 +0,0 @@
From 91264fe3577fe887b4860923fa6142b5274c8965 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Fri, 5 Mar 2021 07:20:28 -0800
Subject: [PATCH] x86-64: Add memcmp family functions with 256-bit EVEX
Content-type: text/plain; charset=UTF-8
Update ifunc-memcmp.h to select the function optimized with 256-bit EVEX
instructions using YMM16-YMM31 registers to avoid RTM abort with usable
AVX512VL, AVX512BW and MOVBE since VZEROUPPER isn't needed at function
exit.
---
sysdeps/x86_64/multiarch/Makefile | 4 +-
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 10 +
sysdeps/x86_64/multiarch/ifunc-memcmp.h | 13 +-
sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 440 ++++++++++++++++++
sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S | 4 +
5 files changed, 467 insertions(+), 4 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
create mode 100644 sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 1cc0a10e..9d79b138 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -41,6 +41,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
memset-avx2-unaligned-erms \
memset-avx512-unaligned-erms \
memchr-evex \
+ memcmp-evex-movbe \
memmove-evex-unaligned-erms \
memrchr-evex \
memset-evex-unaligned-erms \
@@ -81,7 +82,8 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
wcsncmp-evex \
wcsnlen-evex \
wcsrchr-evex \
- wmemchr-evex
+ wmemchr-evex \
+ wmemcmp-evex-movbe
endif
ifeq ($(subdir),debug)
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 7cf83485..c8da910e 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -56,6 +56,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (MOVBE)),
__memcmp_avx2_movbe)
+ IFUNC_IMPL_ADD (array, i, memcmp,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (MOVBE)),
+ __memcmp_evex_movbe)
IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1),
__memcmp_sse4_1)
IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3),
@@ -558,6 +563,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (MOVBE)),
__wmemcmp_avx2_movbe)
+ IFUNC_IMPL_ADD (array, i, wmemcmp,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (MOVBE)),
+ __wmemcmp_evex_movbe)
IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1),
__wmemcmp_sse4_1)
IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3),
diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
index 6c1f3153..3ca1f0a6 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
@@ -23,17 +23,24 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden;
static inline void *
IFUNC_SELECTOR (void)
{
const struct cpu_features* cpu_features = __get_cpu_features ();
- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
- && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
&& CPU_FEATURE_USABLE_P (cpu_features, MOVBE)
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
- return OPTIMIZE (avx2_movbe);
+ {
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+ return OPTIMIZE (evex_movbe);
+
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ return OPTIMIZE (avx2_movbe);
+ }
if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
return OPTIMIZE (sse4_1);
diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
new file mode 100644
index 00000000..9c093972
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
@@ -0,0 +1,440 @@
+/* memcmp/wmemcmp optimized with 256-bit EVEX instructions.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+/* memcmp/wmemcmp is implemented as:
+ 1. For size from 2 to 7 bytes, load as big endian with movbe and bswap
+ to avoid branches.
+ 2. Use overlapping compare to avoid branch.
+ 3. Use vector compare when size >= 4 bytes for memcmp or size >= 8
+ bytes for wmemcmp.
+ 4. If size is 8 * VEC_SIZE or less, unroll the loop.
+ 5. Compare 4 * VEC_SIZE at a time with the aligned first memory
+ area.
+ 6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
+ 7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
+ 8. Use 8 vector compares when size is 8 * VEC_SIZE or less. */
+
+# include <sysdep.h>
+
+# ifndef MEMCMP
+# define MEMCMP __memcmp_evex_movbe
+# endif
+
+# define VMOVU vmovdqu64
+
+# ifdef USE_AS_WMEMCMP
+# define VPCMPEQ vpcmpeqd
+# else
+# define VPCMPEQ vpcmpeqb
+# endif
+
+# define XMM1 xmm17
+# define XMM2 xmm18
+# define YMM1 ymm17
+# define YMM2 ymm18
+# define YMM3 ymm19
+# define YMM4 ymm20
+# define YMM5 ymm21
+# define YMM6 ymm22
+
+# define VEC_SIZE 32
+# ifdef USE_AS_WMEMCMP
+# define VEC_MASK 0xff
+# define XMM_MASK 0xf
+# else
+# define VEC_MASK 0xffffffff
+# define XMM_MASK 0xffff
+# endif
+
+/* Warning!
+ wmemcmp has to use SIGNED comparison for elements.
+ memcmp has to use UNSIGNED comparison for elemnts.
+*/
+
+ .section .text.evex,"ax",@progbits
+ENTRY (MEMCMP)
+# ifdef USE_AS_WMEMCMP
+ shl $2, %RDX_LP
+# elif defined __ILP32__
+ /* Clear the upper 32 bits. */
+ movl %edx, %edx
+# endif
+ cmp $VEC_SIZE, %RDX_LP
+ jb L(less_vec)
+
+ /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
+ VMOVU (%rsi), %YMM2
+ VPCMPEQ (%rdi), %YMM2, %k1
+ kmovd %k1, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec)
+
+ cmpq $(VEC_SIZE * 2), %rdx
+ jbe L(last_vec)
+
+ /* More than 2 * VEC. */
+ cmpq $(VEC_SIZE * 8), %rdx
+ ja L(more_8x_vec)
+ cmpq $(VEC_SIZE * 4), %rdx
+ jb L(last_4x_vec)
+
+ /* From 4 * VEC to 8 * VEC, inclusively. */
+ VMOVU (%rsi), %YMM1
+ VPCMPEQ (%rdi), %YMM1, %k1
+
+ VMOVU VEC_SIZE(%rsi), %YMM2
+ VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
+
+ VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
+
+ VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
+
+ kandd %k1, %k2, %k5
+ kandd %k3, %k4, %k6
+ kandd %k5, %k6, %k6
+
+ kmovd %k6, %eax
+ cmpl $VEC_MASK, %eax
+ jne L(4x_vec_end)
+
+ leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi
+ leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi
+ VMOVU (%rsi), %YMM1
+ VPCMPEQ (%rdi), %YMM1, %k1
+
+ VMOVU VEC_SIZE(%rsi), %YMM2
+ VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
+ kandd %k1, %k2, %k5
+
+ VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
+ kandd %k3, %k5, %k5
+
+ VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
+ kandd %k4, %k5, %k5
+
+ kmovd %k5, %eax
+ cmpl $VEC_MASK, %eax
+ jne L(4x_vec_end)
+ xorl %eax, %eax
+ ret
+
+ .p2align 4
+L(last_2x_vec):
+ /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
+ VMOVU (%rsi), %YMM2
+ VPCMPEQ (%rdi), %YMM2, %k2
+ kmovd %k2, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec)
+
+L(last_vec):
+ /* Use overlapping loads to avoid branches. */
+ leaq -VEC_SIZE(%rdi, %rdx), %rdi
+ leaq -VEC_SIZE(%rsi, %rdx), %rsi
+ VMOVU (%rsi), %YMM2
+ VPCMPEQ (%rdi), %YMM2, %k2
+ kmovd %k2, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec)
+ ret
+
+ .p2align 4
+L(first_vec):
+ /* A byte or int32 is different within 16 or 32 bytes. */
+ tzcntl %eax, %ecx
+# ifdef USE_AS_WMEMCMP
+ xorl %eax, %eax
+ movl (%rdi, %rcx, 4), %edx
+ cmpl (%rsi, %rcx, 4), %edx
+L(wmemcmp_return):
+ setl %al
+ negl %eax
+ orl $1, %eax
+# else
+ movzbl (%rdi, %rcx), %eax
+ movzbl (%rsi, %rcx), %edx
+ sub %edx, %eax
+# endif
+ ret
+
+# ifdef USE_AS_WMEMCMP
+ .p2align 4
+L(4):
+ xorl %eax, %eax
+ movl (%rdi), %edx
+ cmpl (%rsi), %edx
+ jne L(wmemcmp_return)
+ ret
+# else
+ .p2align 4
+L(between_4_7):
+ /* Load as big endian with overlapping movbe to avoid branches. */
+ movbe (%rdi), %eax
+ movbe (%rsi), %ecx
+ shlq $32, %rax
+ shlq $32, %rcx
+ movbe -4(%rdi, %rdx), %edi
+ movbe -4(%rsi, %rdx), %esi
+ orq %rdi, %rax
+ orq %rsi, %rcx
+ subq %rcx, %rax
+ je L(exit)
+ sbbl %eax, %eax
+ orl $1, %eax
+ ret
+
+ .p2align 4
+L(exit):
+ ret
+
+ .p2align 4
+L(between_2_3):
+ /* Load as big endian to avoid branches. */
+ movzwl (%rdi), %eax
+ movzwl (%rsi), %ecx
+ shll $8, %eax
+ shll $8, %ecx
+ bswap %eax
+ bswap %ecx
+ movb -1(%rdi, %rdx), %al
+ movb -1(%rsi, %rdx), %cl
+ /* Subtraction is okay because the upper 8 bits are zero. */
+ subl %ecx, %eax
+ ret
+
+ .p2align 4
+L(1):
+ movzbl (%rdi), %eax
+ movzbl (%rsi), %ecx
+ subl %ecx, %eax
+ ret
+# endif
+
+ .p2align 4
+L(zero):
+ xorl %eax, %eax
+ ret
+
+ .p2align 4
+L(less_vec):
+# ifdef USE_AS_WMEMCMP
+ /* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes. */
+ cmpb $4, %dl
+ je L(4)
+ jb L(zero)
+# else
+ cmpb $1, %dl
+ je L(1)
+ jb L(zero)
+ cmpb $4, %dl
+ jb L(between_2_3)
+ cmpb $8, %dl
+ jb L(between_4_7)
+# endif
+ cmpb $16, %dl
+ jae L(between_16_31)
+ /* It is between 8 and 15 bytes. */
+ vmovq (%rdi), %XMM1
+ vmovq (%rsi), %XMM2
+ VPCMPEQ %XMM1, %XMM2, %k2
+ kmovw %k2, %eax
+ subl $XMM_MASK, %eax
+ jnz L(first_vec)
+ /* Use overlapping loads to avoid branches. */
+ leaq -8(%rdi, %rdx), %rdi
+ leaq -8(%rsi, %rdx), %rsi
+ vmovq (%rdi), %XMM1
+ vmovq (%rsi), %XMM2
+ VPCMPEQ %XMM1, %XMM2, %k2
+ kmovw %k2, %eax
+ subl $XMM_MASK, %eax
+ jnz L(first_vec)
+ ret
+
+ .p2align 4
+L(between_16_31):
+ /* From 16 to 31 bytes. No branch when size == 16. */
+ VMOVU (%rsi), %XMM2
+ VPCMPEQ (%rdi), %XMM2, %k2
+ kmovw %k2, %eax
+ subl $XMM_MASK, %eax
+ jnz L(first_vec)
+
+ /* Use overlapping loads to avoid branches. */
+ leaq -16(%rdi, %rdx), %rdi
+ leaq -16(%rsi, %rdx), %rsi
+ VMOVU (%rsi), %XMM2
+ VPCMPEQ (%rdi), %XMM2, %k2
+ kmovw %k2, %eax
+ subl $XMM_MASK, %eax
+ jnz L(first_vec)
+ ret
+
+ .p2align 4
+L(more_8x_vec):
+ /* More than 8 * VEC. Check the first VEC. */
+ VMOVU (%rsi), %YMM2
+ VPCMPEQ (%rdi), %YMM2, %k2
+ kmovd %k2, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec)
+
+ /* Align the first memory area for aligned loads in the loop.
+ Compute how much the first memory area is misaligned. */
+ movq %rdi, %rcx
+ andl $(VEC_SIZE - 1), %ecx
+ /* Get the negative of offset for alignment. */
+ subq $VEC_SIZE, %rcx
+ /* Adjust the second memory area. */
+ subq %rcx, %rsi
+ /* Adjust the first memory area which should be aligned now. */
+ subq %rcx, %rdi
+ /* Adjust length. */
+ addq %rcx, %rdx
+
+L(loop_4x_vec):
+ /* Compare 4 * VEC at a time forward. */
+ VMOVU (%rsi), %YMM1
+ VPCMPEQ (%rdi), %YMM1, %k1
+
+ VMOVU VEC_SIZE(%rsi), %YMM2
+ VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
+ kandd %k2, %k1, %k5
+
+ VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
+ kandd %k3, %k5, %k5
+
+ VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
+ kandd %k4, %k5, %k5
+
+ kmovd %k5, %eax
+ cmpl $VEC_MASK, %eax
+ jne L(4x_vec_end)
+
+ addq $(VEC_SIZE * 4), %rdi
+ addq $(VEC_SIZE * 4), %rsi
+
+ subq $(VEC_SIZE * 4), %rdx
+ cmpq $(VEC_SIZE * 4), %rdx
+ jae L(loop_4x_vec)
+
+ /* Less than 4 * VEC. */
+ cmpq $VEC_SIZE, %rdx
+ jbe L(last_vec)
+ cmpq $(VEC_SIZE * 2), %rdx
+ jbe L(last_2x_vec)
+
+L(last_4x_vec):
+ /* From 2 * VEC to 4 * VEC. */
+ VMOVU (%rsi), %YMM2
+ VPCMPEQ (%rdi), %YMM2, %k2
+ kmovd %k2, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec)
+
+ addq $VEC_SIZE, %rdi
+ addq $VEC_SIZE, %rsi
+ VMOVU (%rsi), %YMM2
+ VPCMPEQ (%rdi), %YMM2, %k2
+ kmovd %k2, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec)
+
+ /* Use overlapping loads to avoid branches. */
+ leaq -(3 * VEC_SIZE)(%rdi, %rdx), %rdi
+ leaq -(3 * VEC_SIZE)(%rsi, %rdx), %rsi
+ VMOVU (%rsi), %YMM2
+ VPCMPEQ (%rdi), %YMM2, %k2
+ kmovd %k2, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec)
+
+ addq $VEC_SIZE, %rdi
+ addq $VEC_SIZE, %rsi
+ VMOVU (%rsi), %YMM2
+ VPCMPEQ (%rdi), %YMM2, %k2
+ kmovd %k2, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec)
+ ret
+
+ .p2align 4
+L(4x_vec_end):
+ kmovd %k1, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec)
+ kmovd %k2, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec_x1)
+ kmovd %k3, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec_x2)
+ kmovd %k4, %eax
+ subl $VEC_MASK, %eax
+ tzcntl %eax, %ecx
+# ifdef USE_AS_WMEMCMP
+ xorl %eax, %eax
+ movl (VEC_SIZE * 3)(%rdi, %rcx, 4), %edx
+ cmpl (VEC_SIZE * 3)(%rsi, %rcx, 4), %edx
+ jmp L(wmemcmp_return)
+# else
+ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
+ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx
+ sub %edx, %eax
+# endif
+ ret
+
+ .p2align 4
+L(first_vec_x1):
+ tzcntl %eax, %ecx
+# ifdef USE_AS_WMEMCMP
+ xorl %eax, %eax
+ movl VEC_SIZE(%rdi, %rcx, 4), %edx
+ cmpl VEC_SIZE(%rsi, %rcx, 4), %edx
+ jmp L(wmemcmp_return)
+# else
+ movzbl VEC_SIZE(%rdi, %rcx), %eax
+ movzbl VEC_SIZE(%rsi, %rcx), %edx
+ sub %edx, %eax
+# endif
+ ret
+
+ .p2align 4
+L(first_vec_x2):
+ tzcntl %eax, %ecx
+# ifdef USE_AS_WMEMCMP
+ xorl %eax, %eax
+ movl (VEC_SIZE * 2)(%rdi, %rcx, 4), %edx
+ cmpl (VEC_SIZE * 2)(%rsi, %rcx, 4), %edx
+ jmp L(wmemcmp_return)
+# else
+ movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
+ movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx
+ sub %edx, %eax
+# endif
+ ret
+END (MEMCMP)
+#endif
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S
new file mode 100644
index 00000000..4726d74a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S
@@ -0,0 +1,4 @@
+#define MEMCMP __wmemcmp_evex_movbe
+#define USE_AS_WMEMCMP 1
+
+#include "memcmp-evex-movbe.S"
--
GitLab

File diff suppressed because it is too large Load Diff

View File

@ -1,735 +0,0 @@
From 4bd660be40967cd69072f69ebc2ad32bfcc1f206 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Tue, 23 Feb 2021 06:33:10 -0800
Subject: [PATCH] x86: Add string/memory function tests in RTM region
Content-type: text/plain; charset=UTF-8
At function exit, AVX optimized string/memory functions have VZEROUPPER
which triggers RTM abort. When such functions are called inside a
transactionally executing RTM region, RTM abort causes severe performance
degradation. Add tests to verify that string/memory functions won't
cause RTM abort in RTM region.
---
sysdeps/x86/Makefile | 23 +++++++++++
sysdeps/x86/tst-memchr-rtm.c | 54 ++++++++++++++++++++++++++
sysdeps/x86/tst-memcmp-rtm.c | 52 +++++++++++++++++++++++++
sysdeps/x86/tst-memmove-rtm.c | 53 ++++++++++++++++++++++++++
sysdeps/x86/tst-memrchr-rtm.c | 54 ++++++++++++++++++++++++++
sysdeps/x86/tst-memset-rtm.c | 45 ++++++++++++++++++++++
sysdeps/x86/tst-strchr-rtm.c | 54 ++++++++++++++++++++++++++
sysdeps/x86/tst-strcpy-rtm.c | 53 ++++++++++++++++++++++++++
sysdeps/x86/tst-string-rtm.h | 72 +++++++++++++++++++++++++++++++++++
sysdeps/x86/tst-strlen-rtm.c | 53 ++++++++++++++++++++++++++
sysdeps/x86/tst-strncmp-rtm.c | 52 +++++++++++++++++++++++++
sysdeps/x86/tst-strrchr-rtm.c | 53 ++++++++++++++++++++++++++
12 files changed, 618 insertions(+)
create mode 100644 sysdeps/x86/tst-memchr-rtm.c
create mode 100644 sysdeps/x86/tst-memcmp-rtm.c
create mode 100644 sysdeps/x86/tst-memmove-rtm.c
create mode 100644 sysdeps/x86/tst-memrchr-rtm.c
create mode 100644 sysdeps/x86/tst-memset-rtm.c
create mode 100644 sysdeps/x86/tst-strchr-rtm.c
create mode 100644 sysdeps/x86/tst-strcpy-rtm.c
create mode 100644 sysdeps/x86/tst-string-rtm.h
create mode 100644 sysdeps/x86/tst-strlen-rtm.c
create mode 100644 sysdeps/x86/tst-strncmp-rtm.c
create mode 100644 sysdeps/x86/tst-strrchr-rtm.c
diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
index 59e928e9..5be71ada 100644
--- a/sysdeps/x86/Makefile
+++ b/sysdeps/x86/Makefile
@@ -17,6 +17,29 @@ endif
ifeq ($(subdir),string)
sysdep_routines += cacheinfo
+
+tests += \
+ tst-memchr-rtm \
+ tst-memcmp-rtm \
+ tst-memmove-rtm \
+ tst-memrchr-rtm \
+ tst-memset-rtm \
+ tst-strchr-rtm \
+ tst-strcpy-rtm \
+ tst-strlen-rtm \
+ tst-strncmp-rtm \
+ tst-strrchr-rtm
+
+CFLAGS-tst-memchr-rtm.c += -mrtm
+CFLAGS-tst-memcmp-rtm.c += -mrtm
+CFLAGS-tst-memmove-rtm.c += -mrtm
+CFLAGS-tst-memrchr-rtm.c += -mrtm
+CFLAGS-tst-memset-rtm.c += -mrtm
+CFLAGS-tst-strchr-rtm.c += -mrtm
+CFLAGS-tst-strcpy-rtm.c += -mrtm
+CFLAGS-tst-strlen-rtm.c += -mrtm
+CFLAGS-tst-strncmp-rtm.c += -mrtm
+CFLAGS-tst-strrchr-rtm.c += -mrtm
endif
ifneq ($(enable-cet),no)
diff --git a/sysdeps/x86/tst-memchr-rtm.c b/sysdeps/x86/tst-memchr-rtm.c
new file mode 100644
index 00000000..e4749401
--- /dev/null
+++ b/sysdeps/x86/tst-memchr-rtm.c
@@ -0,0 +1,54 @@
+/* Test case for memchr inside a transactionally executing RTM region.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+ memset (string1, 'a', STRING_SIZE);
+ string1[100] = 'c';
+ string1[STRING_SIZE - 100] = 'c';
+ char *p = memchr (string1, 'c', STRING_SIZE);
+ if (p == &string1[100])
+ return EXIT_SUCCESS;
+ else
+ return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+ char *p = memchr (string1, 'c', STRING_SIZE);
+ if (p == &string1[100])
+ return 0;
+ else
+ return 1;
+}
+
+static int
+do_test (void)
+{
+ return do_test_1 ("memchr", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-memcmp-rtm.c b/sysdeps/x86/tst-memcmp-rtm.c
new file mode 100644
index 00000000..e4c8a623
--- /dev/null
+++ b/sysdeps/x86/tst-memcmp-rtm.c
@@ -0,0 +1,52 @@
+/* Test case for memcmp inside a transactionally executing RTM region.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+char string2[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+ memset (string1, 'a', STRING_SIZE);
+ memset (string2, 'a', STRING_SIZE);
+ if (memcmp (string1, string2, STRING_SIZE) == 0)
+ return EXIT_SUCCESS;
+ else
+ return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+ if (memcmp (string1, string2, STRING_SIZE) == 0)
+ return 0;
+ else
+ return 1;
+}
+
+static int
+do_test (void)
+{
+ return do_test_1 ("memcmp", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-memmove-rtm.c b/sysdeps/x86/tst-memmove-rtm.c
new file mode 100644
index 00000000..4bf97ef1
--- /dev/null
+++ b/sysdeps/x86/tst-memmove-rtm.c
@@ -0,0 +1,53 @@
+/* Test case for memmove inside a transactionally executing RTM region.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+char string2[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+ memset (string1, 'a', STRING_SIZE);
+ if (memmove (string2, string1, STRING_SIZE) == string2
+ && memcmp (string2, string1, STRING_SIZE) == 0)
+ return EXIT_SUCCESS;
+ else
+ return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+ if (memmove (string2, string1, STRING_SIZE) == string2
+ && memcmp (string2, string1, STRING_SIZE) == 0)
+ return 0;
+ else
+ return 1;
+}
+
+static int
+do_test (void)
+{
+ return do_test_1 ("memmove", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-memrchr-rtm.c b/sysdeps/x86/tst-memrchr-rtm.c
new file mode 100644
index 00000000..a57a5a8e
--- /dev/null
+++ b/sysdeps/x86/tst-memrchr-rtm.c
@@ -0,0 +1,54 @@
+/* Test case for memrchr inside a transactionally executing RTM region.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+ memset (string1, 'a', STRING_SIZE);
+ string1[100] = 'c';
+ string1[STRING_SIZE - 100] = 'c';
+ char *p = memrchr (string1, 'c', STRING_SIZE);
+ if (p == &string1[STRING_SIZE - 100])
+ return EXIT_SUCCESS;
+ else
+ return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+ char *p = memrchr (string1, 'c', STRING_SIZE);
+ if (p == &string1[STRING_SIZE - 100])
+ return 0;
+ else
+ return 1;
+}
+
+static int
+do_test (void)
+{
+ return do_test_1 ("memrchr", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-memset-rtm.c b/sysdeps/x86/tst-memset-rtm.c
new file mode 100644
index 00000000..bf343a4d
--- /dev/null
+++ b/sysdeps/x86/tst-memset-rtm.c
@@ -0,0 +1,45 @@
+/* Test case for memset inside a transactionally executing RTM region.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+ memset (string1, 'a', STRING_SIZE);
+ return EXIT_SUCCESS;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+ memset (string1, 'a', STRING_SIZE);
+ return 0;
+}
+
+static int
+do_test (void)
+{
+ return do_test_1 ("memset", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-strchr-rtm.c b/sysdeps/x86/tst-strchr-rtm.c
new file mode 100644
index 00000000..a82e29c0
--- /dev/null
+++ b/sysdeps/x86/tst-strchr-rtm.c
@@ -0,0 +1,54 @@
+/* Test case for strchr inside a transactionally executing RTM region.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+ memset (string1, 'a', STRING_SIZE - 1);
+ string1[100] = 'c';
+ string1[STRING_SIZE - 100] = 'c';
+ char *p = strchr (string1, 'c');
+ if (p == &string1[100])
+ return EXIT_SUCCESS;
+ else
+ return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+ char *p = strchr (string1, 'c');
+ if (p == &string1[100])
+ return 0;
+ else
+ return 1;
+}
+
+static int
+do_test (void)
+{
+ return do_test_1 ("strchr", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-strcpy-rtm.c b/sysdeps/x86/tst-strcpy-rtm.c
new file mode 100644
index 00000000..2b2a583f
--- /dev/null
+++ b/sysdeps/x86/tst-strcpy-rtm.c
@@ -0,0 +1,53 @@
+/* Test case for strcpy inside a transactionally executing RTM region.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+char string2[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+ memset (string1, 'a', STRING_SIZE - 1);
+ if (strcpy (string2, string1) == string2
+ && strcmp (string2, string1) == 0)
+ return EXIT_SUCCESS;
+ else
+ return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+ if (strcpy (string2, string1) == string2
+ && strcmp (string2, string1) == 0)
+ return 0;
+ else
+ return 1;
+}
+
+static int
+do_test (void)
+{
+ return do_test_1 ("strcpy", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-string-rtm.h b/sysdeps/x86/tst-string-rtm.h
new file mode 100644
index 00000000..d2470afa
--- /dev/null
+++ b/sysdeps/x86/tst-string-rtm.h
@@ -0,0 +1,72 @@
+/* Test string function in a transactionally executing RTM region.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <string.h>
+#include <x86intrin.h>
+#include <sys/platform/x86.h>
+#include <support/check.h>
+#include <support/test-driver.h>
+
+static int
+do_test_1 (const char *name, unsigned int loop, int (*prepare) (void),
+ int (*function) (void))
+{
+ if (!CPU_FEATURE_USABLE (RTM))
+ return EXIT_UNSUPPORTED;
+
+ int status = prepare ();
+ if (status != EXIT_SUCCESS)
+ return status;
+
+ unsigned int i;
+ unsigned int naborts = 0;
+ unsigned int failed = 0;
+ for (i = 0; i < loop; i++)
+ {
+ failed |= function ();
+ if (_xbegin() == _XBEGIN_STARTED)
+ {
+ failed |= function ();
+ _xend();
+ }
+ else
+ {
+ failed |= function ();
+ ++naborts;
+ }
+ }
+
+ if (failed)
+ FAIL_EXIT1 ("%s() failed", name);
+
+ if (naborts)
+ {
+ /* NB: Low single digit (<= 5%) noise-level aborts are normal for
+ TSX. */
+ double rate = 100 * ((double) naborts) / ((double) loop);
+ if (rate > 5)
+ FAIL_EXIT1 ("TSX abort rate: %.2f%% (%d out of %d)",
+ rate, naborts, loop);
+ }
+
+ return EXIT_SUCCESS;
+}
+
+static int do_test (void);
+
+#include <support/test-driver.c>
diff --git a/sysdeps/x86/tst-strlen-rtm.c b/sysdeps/x86/tst-strlen-rtm.c
new file mode 100644
index 00000000..0dcf14db
--- /dev/null
+++ b/sysdeps/x86/tst-strlen-rtm.c
@@ -0,0 +1,53 @@
+/* Test case for strlen inside a transactionally executing RTM region.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+ memset (string1, 'a', STRING_SIZE - 1);
+ string1[STRING_SIZE - 100] = '\0';
+ size_t len = strlen (string1);
+ if (len == STRING_SIZE - 100)
+ return EXIT_SUCCESS;
+ else
+ return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+ size_t len = strlen (string1);
+ if (len == STRING_SIZE - 100)
+ return 0;
+ else
+ return 1;
+}
+
+static int
+do_test (void)
+{
+ return do_test_1 ("strlen", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
new file mode 100644
index 00000000..236ad951
--- /dev/null
+++ b/sysdeps/x86/tst-strncmp-rtm.c
@@ -0,0 +1,52 @@
+/* Test case for strncmp inside a transactionally executing RTM region.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+char string2[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+ memset (string1, 'a', STRING_SIZE - 1);
+ memset (string2, 'a', STRING_SIZE - 1);
+ if (strncmp (string1, string2, STRING_SIZE) == 0)
+ return EXIT_SUCCESS;
+ else
+ return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+ if (strncmp (string1, string2, STRING_SIZE) == 0)
+ return 0;
+ else
+ return 1;
+}
+
+static int
+do_test (void)
+{
+ return do_test_1 ("strncmp", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-strrchr-rtm.c b/sysdeps/x86/tst-strrchr-rtm.c
new file mode 100644
index 00000000..e32bfaf5
--- /dev/null
+++ b/sysdeps/x86/tst-strrchr-rtm.c
@@ -0,0 +1,53 @@
+/* Test case for strrchr inside a transactionally executing RTM region.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+ memset (string1, 'a', STRING_SIZE - 1);
+ string1[STRING_SIZE - 100] = 'c';
+ char *p = strrchr (string1, 'c');
+ if (p == &string1[STRING_SIZE - 100])
+ return EXIT_SUCCESS;
+ else
+ return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+ char *p = strrchr (string1, 'c');
+ if (p == &string1[STRING_SIZE - 100])
+ return 0;
+ else
+ return 1;
+}
+
+static int
+do_test (void)
+{
+ return do_test_1 ("strrchr", LOOP, prepare, function);
+}
--
GitLab

View File

@ -1,148 +0,0 @@
From 4e2d8f352774b56078c34648b14a2412c38384f4 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Sun, 7 Mar 2021 09:44:18 -0800
Subject: [PATCH] x86-64: Use ZMM16-ZMM31 in AVX512 memset family functions
Content-type: text/plain; charset=UTF-8
Update ifunc-memset.h/ifunc-wmemset.h to select the function optimized
with AVX512 instructions using ZMM16-ZMM31 registers to avoid RTM abort
with usable AVX512VL and AVX512BW since VZEROUPPER isn't needed at
function exit.
---
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 14 +++++++++-----
sysdeps/x86_64/multiarch/ifunc-memset.h | 13 ++++++++-----
sysdeps/x86_64/multiarch/ifunc-wmemset.h | 12 ++++++------
.../multiarch/memset-avx512-unaligned-erms.S | 16 ++++++++--------
4 files changed, 31 insertions(+), 24 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index c1efeec0..d969a156 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -211,10 +211,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (AVX512BW)),
__memset_chk_evex_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __memset_chk,
- CPU_FEATURE_USABLE (AVX512F),
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)),
__memset_chk_avx512_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __memset_chk,
- CPU_FEATURE_USABLE (AVX512F),
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)),
__memset_chk_avx512_unaligned)
IFUNC_IMPL_ADD (array, i, __memset_chk,
CPU_FEATURE_USABLE (AVX512F),
@@ -252,10 +254,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (AVX512BW)),
__memset_evex_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memset,
- CPU_FEATURE_USABLE (AVX512F),
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)),
__memset_avx512_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memset,
- CPU_FEATURE_USABLE (AVX512F),
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)),
__memset_avx512_unaligned)
IFUNC_IMPL_ADD (array, i, memset,
CPU_FEATURE_USABLE (AVX512F),
@@ -719,7 +723,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
CPU_FEATURE_USABLE (AVX512VL),
__wmemset_evex_unaligned)
IFUNC_IMPL_ADD (array, i, wmemset,
- CPU_FEATURE_USABLE (AVX512F),
+ CPU_FEATURE_USABLE (AVX512VL),
__wmemset_avx512_unaligned))
#ifdef SHARED
diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
index 6f3375cc..19795938 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memset.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
@@ -53,13 +53,16 @@ IFUNC_SELECTOR (void)
if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
&& !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
{
- if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
- return OPTIMIZE (avx512_no_vzeroupper);
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+ {
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ return OPTIMIZE (avx512_unaligned_erms);
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
- return OPTIMIZE (avx512_unaligned_erms);
+ return OPTIMIZE (avx512_unaligned);
+ }
- return OPTIMIZE (avx512_unaligned);
+ return OPTIMIZE (avx512_no_vzeroupper);
}
if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
index bdc94c6c..98c5d406 100644
--- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h
+++ b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
@@ -33,13 +33,13 @@ IFUNC_SELECTOR (void)
if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
{
- if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
- && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
- && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
- return OPTIMIZE (avx512_unaligned);
-
if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
- return OPTIMIZE (evex_unaligned);
+ {
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
+ return OPTIMIZE (avx512_unaligned);
+
+ return OPTIMIZE (evex_unaligned);
+ }
if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
return OPTIMIZE (avx2_unaligned_rtm);
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
index 0783979c..22e7b187 100644
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -1,22 +1,22 @@
#if IS_IN (libc)
# define VEC_SIZE 64
-# define VEC(i) zmm##i
+# define XMM0 xmm16
+# define YMM0 ymm16
+# define VEC0 zmm16
+# define VEC(i) VEC##i
# define VMOVU vmovdqu64
# define VMOVA vmovdqa64
+# define VZEROUPPER
# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
- vmovd d, %xmm0; \
movq r, %rax; \
- vpbroadcastb %xmm0, %xmm0; \
- vpbroadcastq %xmm0, %zmm0
+ vpbroadcastb d, %VEC0
# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
- vmovd d, %xmm0; \
movq r, %rax; \
- vpbroadcastd %xmm0, %xmm0; \
- vpbroadcastq %xmm0, %zmm0
+ vpbroadcastd d, %VEC0
-# define SECTION(p) p##.avx512
+# define SECTION(p) p##.evex512
# define MEMSET_SYMBOL(p,s) p##_avx512_##s
# define WMEMSET_SYMBOL(p,s) p##_avx512_##s
--
GitLab

View File

@ -1,230 +0,0 @@
From b304fc201d2f6baf52ea790df8643e99772243cd Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Mon, 21 Jan 2019 11:25:56 -0800
Subject: [PATCH] x86-64 memcmp/wmemcmp: Properly handle the length parameter
[BZ# 24097]
Content-type: text/plain; charset=UTF-8
On x32, the size_t parameter may be passed in the lower 32 bits of a
64-bit register with the non-zero upper 32 bits. The string/memory
functions written in assembly can only use the lower 32 bits of a
64-bit register as length or must clear the upper 32 bits before using
the full 64-bit register for length.
This pach fixes memcmp/wmemcmp for x32. Tested on x86-64 and x32. On
x86-64, libc.so is the same with and withou the fix.
[BZ# 24097]
CVE-2019-6488
* sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S: Use RDX_LP for
length. Clear the upper 32 bits of RDX register.
* sysdeps/x86_64/multiarch/memcmp-sse4.S: Likewise.
* sysdeps/x86_64/multiarch/memcmp-ssse3.S: Likewise.
* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcmp and
tst-size_t-wmemcmp.
* sysdeps/x86_64/x32/tst-size_t-memcmp.c: New file.
* sysdeps/x86_64/x32/tst-size_t-wmemcmp.c: Likewise.
---
sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 7 +-
sysdeps/x86_64/multiarch/memcmp-sse4.S | 9 ++-
sysdeps/x86_64/multiarch/memcmp-ssse3.S | 7 +-
sysdeps/x86_64/x32/Makefile | 4 +-
sysdeps/x86_64/x32/tst-size_t-memcmp.c | 76 ++++++++++++++++++++
sysdeps/x86_64/x32/tst-size_t-wmemcmp.c | 20 ++++++
6 files changed, 114 insertions(+), 9 deletions(-)
create mode 100644 sysdeps/x86_64/x32/tst-size_t-memcmp.c
create mode 100644 sysdeps/x86_64/x32/tst-size_t-wmemcmp.c
Conflicts:
ChangeLog
(removed)
diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
index 30f764c3..e3a35b89 100644
--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
@@ -58,9 +58,12 @@
.section .text.avx,"ax",@progbits
ENTRY (MEMCMP)
# ifdef USE_AS_WMEMCMP
- shl $2, %rdx
+ shl $2, %RDX_LP
+# elif defined __ILP32__
+ /* Clear the upper 32 bits. */
+ movl %edx, %edx
# endif
- cmpq $VEC_SIZE, %rdx
+ cmp $VEC_SIZE, %RDX_LP
jb L(less_vec)
/* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
index 8e164f2c..302900f5 100644
--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
+++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S
@@ -42,13 +42,16 @@
.section .text.sse4.1,"ax",@progbits
ENTRY (MEMCMP)
# ifdef USE_AS_WMEMCMP
- shl $2, %rdx
+ shl $2, %RDX_LP
+# elif defined __ILP32__
+ /* Clear the upper 32 bits. */
+ mov %edx, %edx
# endif
pxor %xmm0, %xmm0
- cmp $79, %rdx
+ cmp $79, %RDX_LP
ja L(79bytesormore)
# ifndef USE_AS_WMEMCMP
- cmp $1, %rdx
+ cmp $1, %RDX_LP
je L(firstbyte)
# endif
add %rdx, %rsi
diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
index 6f76c641..69d030fc 100644
--- a/sysdeps/x86_64/multiarch/memcmp-ssse3.S
+++ b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
@@ -33,9 +33,12 @@
atom_text_section
ENTRY (MEMCMP)
# ifdef USE_AS_WMEMCMP
- shl $2, %rdx
- test %rdx, %rdx
+ shl $2, %RDX_LP
+ test %RDX_LP, %RDX_LP
jz L(equal)
+# elif defined __ILP32__
+ /* Clear the upper 32 bits. */
+ mov %edx, %edx
# endif
mov %rdx, %rcx
mov %rdi, %rdx
diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
index 7d528889..ddec7f04 100644
--- a/sysdeps/x86_64/x32/Makefile
+++ b/sysdeps/x86_64/x32/Makefile
@@ -6,9 +6,9 @@ CFLAGS-s_llround.c += -fno-builtin-lround
endif
ifeq ($(subdir),string)
-tests += tst-size_t-memchr
+tests += tst-size_t-memchr tst-size_t-memcmp
endif
ifeq ($(subdir),wcsmbs)
-tests += tst-size_t-wmemchr
+tests += tst-size_t-wmemchr tst-size_t-wmemcmp
endif
diff --git a/sysdeps/x86_64/x32/tst-size_t-memcmp.c b/sysdeps/x86_64/x32/tst-size_t-memcmp.c
new file mode 100644
index 00000000..9bd6fdb4
--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-memcmp.c
@@ -0,0 +1,76 @@
+/* Test memcmp with size_t in the lower 32 bits of 64-bit register.
+ Copyright (C) 2019 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define TEST_MAIN
+#ifdef WIDE
+# define TEST_NAME "wmemcmp"
+#else
+# define TEST_NAME "memcmp"
+#endif
+
+#include "test-size_t.h"
+
+#ifdef WIDE
+# include <inttypes.h>
+# include <wchar.h>
+
+# define MEMCMP wmemcmp
+# define CHAR wchar_t
+#else
+# define MEMCMP memcmp
+# define CHAR char
+#endif
+
+IMPL (MEMCMP, 1)
+
+typedef int (*proto_t) (const CHAR *, const CHAR *, size_t);
+
+static int
+__attribute__ ((noinline, noclone))
+do_memcmp (parameter_t a, parameter_t b)
+{
+ return CALL (&b, a.p, b.p, a.len);
+}
+
+static int
+test_main (void)
+{
+ test_init ();
+
+ parameter_t dest = { { page_size / sizeof (CHAR) }, buf1 };
+ parameter_t src = { { 0 }, buf2 };
+
+ memcpy (buf1, buf2, page_size);
+
+ int ret = 0;
+ FOR_EACH_IMPL (impl, 0)
+ {
+ src.fn = impl->fn;
+ int res = do_memcmp (dest, src);
+ if (res)
+ {
+ error (0, 0, "Wrong result in function %s: %i != 0",
+ impl->name, res);
+ ret = 1;
+ }
+ }
+
+ return ret ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#include <support/test-driver.c>
diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c b/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c
new file mode 100644
index 00000000..e8b5ffd0
--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c
@@ -0,0 +1,20 @@
+/* Test wmemcmp with size_t in the lower 32 bits of 64-bit register.
+ Copyright (C) 2019 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define WIDE 1
+#include "tst-size_t-memcmp.c"
--
GitLab

View File

@ -1,164 +0,0 @@
From e4fda4631017e49d4ee5a2755db34289b6860fa4 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Sun, 7 Mar 2021 09:45:23 -0800
Subject: [PATCH] x86-64: Use ZMM16-ZMM31 in AVX512 memmove family functions
Content-type: text/plain; charset=UTF-8
Update ifunc-memmove.h to select the function optimized with AVX512
instructions using ZMM16-ZMM31 registers to avoid RTM abort with usable
AVX512VL since VZEROUPPER isn't needed at function exit.
---
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 24 +++++++++---------
sysdeps/x86_64/multiarch/ifunc-memmove.h | 12 +++++----
.../multiarch/memmove-avx512-unaligned-erms.S | 25 +++++++++++++++++--
3 files changed, 42 insertions(+), 19 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index d969a156..fec384f6 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -83,10 +83,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
CPU_FEATURE_USABLE (AVX512F),
__memmove_chk_avx512_no_vzeroupper)
IFUNC_IMPL_ADD (array, i, __memmove_chk,
- CPU_FEATURE_USABLE (AVX512F),
+ CPU_FEATURE_USABLE (AVX512VL),
__memmove_chk_avx512_unaligned)
IFUNC_IMPL_ADD (array, i, __memmove_chk,
- CPU_FEATURE_USABLE (AVX512F),
+ CPU_FEATURE_USABLE (AVX512VL),
__memmove_chk_avx512_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __memmove_chk,
CPU_FEATURE_USABLE (AVX),
@@ -148,10 +148,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
CPU_FEATURE_USABLE (AVX512F),
__memmove_avx512_no_vzeroupper)
IFUNC_IMPL_ADD (array, i, memmove,
- CPU_FEATURE_USABLE (AVX512F),
+ CPU_FEATURE_USABLE (AVX512VL),
__memmove_avx512_unaligned)
IFUNC_IMPL_ADD (array, i, memmove,
- CPU_FEATURE_USABLE (AVX512F),
+ CPU_FEATURE_USABLE (AVX512VL),
__memmove_avx512_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3),
__memmove_ssse3_back)
@@ -733,10 +733,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
CPU_FEATURE_USABLE (AVX512F),
__memcpy_chk_avx512_no_vzeroupper)
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
- CPU_FEATURE_USABLE (AVX512F),
+ CPU_FEATURE_USABLE (AVX512VL),
__memcpy_chk_avx512_unaligned)
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
- CPU_FEATURE_USABLE (AVX512F),
+ CPU_FEATURE_USABLE (AVX512VL),
__memcpy_chk_avx512_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
CPU_FEATURE_USABLE (AVX),
@@ -802,10 +802,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
CPU_FEATURE_USABLE (AVX512F),
__memcpy_avx512_no_vzeroupper)
IFUNC_IMPL_ADD (array, i, memcpy,
- CPU_FEATURE_USABLE (AVX512F),
+ CPU_FEATURE_USABLE (AVX512VL),
__memcpy_avx512_unaligned)
IFUNC_IMPL_ADD (array, i, memcpy,
- CPU_FEATURE_USABLE (AVX512F),
+ CPU_FEATURE_USABLE (AVX512VL),
__memcpy_avx512_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, memcpy, 1,
@@ -819,10 +819,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
CPU_FEATURE_USABLE (AVX512F),
__mempcpy_chk_avx512_no_vzeroupper)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
- CPU_FEATURE_USABLE (AVX512F),
+ CPU_FEATURE_USABLE (AVX512VL),
__mempcpy_chk_avx512_unaligned)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
- CPU_FEATURE_USABLE (AVX512F),
+ CPU_FEATURE_USABLE (AVX512VL),
__mempcpy_chk_avx512_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
CPU_FEATURE_USABLE (AVX),
@@ -864,10 +864,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
CPU_FEATURE_USABLE (AVX512F),
__mempcpy_avx512_no_vzeroupper)
IFUNC_IMPL_ADD (array, i, mempcpy,
- CPU_FEATURE_USABLE (AVX512F),
+ CPU_FEATURE_USABLE (AVX512VL),
__mempcpy_avx512_unaligned)
IFUNC_IMPL_ADD (array, i, mempcpy,
- CPU_FEATURE_USABLE (AVX512F),
+ CPU_FEATURE_USABLE (AVX512VL),
__mempcpy_avx512_unaligned_erms)
IFUNC_IMPL_ADD (array, i, mempcpy,
CPU_FEATURE_USABLE (AVX),
diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
index fa09b9fb..014e95c7 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
@@ -56,13 +56,15 @@ IFUNC_SELECTOR (void)
if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
&& !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
{
- if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
- return OPTIMIZE (avx512_no_vzeroupper);
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
+ {
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ return OPTIMIZE (avx512_unaligned_erms);
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
- return OPTIMIZE (avx512_unaligned_erms);
+ return OPTIMIZE (avx512_unaligned);
+ }
- return OPTIMIZE (avx512_unaligned);
+ return OPTIMIZE (avx512_no_vzeroupper);
}
if (CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
index aac1515c..848848ab 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
@@ -1,11 +1,32 @@
#if IS_IN (libc)
# define VEC_SIZE 64
-# define VEC(i) zmm##i
+# define XMM0 xmm16
+# define XMM1 xmm17
+# define YMM0 ymm16
+# define YMM1 ymm17
+# define VEC0 zmm16
+# define VEC1 zmm17
+# define VEC2 zmm18
+# define VEC3 zmm19
+# define VEC4 zmm20
+# define VEC5 zmm21
+# define VEC6 zmm22
+# define VEC7 zmm23
+# define VEC8 zmm24
+# define VEC9 zmm25
+# define VEC10 zmm26
+# define VEC11 zmm27
+# define VEC12 zmm28
+# define VEC13 zmm29
+# define VEC14 zmm30
+# define VEC15 zmm31
+# define VEC(i) VEC##i
# define VMOVNT vmovntdq
# define VMOVU vmovdqu64
# define VMOVA vmovdqa64
+# define VZEROUPPER
-# define SECTION(p) p##.avx512
+# define SECTION(p) p##.evex512
# define MEMMOVE_SYMBOL(p,s) p##_avx512_##s
# include "memmove-vec-unaligned-erms.S"
--
GitLab

View File

@ -1,71 +0,0 @@
From 595c22ecd8e87a27fd19270ed30fdbae9ad25426 Mon Sep 17 00:00:00 2001
From: Sunil K Pandey <skpgkp2@gmail.com>
Date: Thu, 1 Apr 2021 15:47:04 -0700
Subject: [PATCH] x86-64: Fix ifdef indentation in strlen-evex.S
Content-type: text/plain; charset=UTF-8
Fix some indentations of ifdef in file strlen-evex.S which are off by 1
and confusing to read.
---
sysdeps/x86_64/multiarch/strlen-evex.S | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S
index cd022509..05838190 100644
--- a/sysdeps/x86_64/multiarch/strlen-evex.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex.S
@@ -276,10 +276,10 @@ L(last_2x_vec):
.p2align 4
L(first_vec_x0_check):
tzcntl %eax, %eax
-# ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
sall $2, %eax
-# endif
+# endif
/* Check the end of data. */
cmpq %rax, %rsi
jbe L(max)
@@ -293,10 +293,10 @@ L(first_vec_x0_check):
.p2align 4
L(first_vec_x1_check):
tzcntl %eax, %eax
-# ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
sall $2, %eax
-# endif
+# endif
/* Check the end of data. */
cmpq %rax, %rsi
jbe L(max)
@@ -311,10 +311,10 @@ L(first_vec_x1_check):
.p2align 4
L(first_vec_x2_check):
tzcntl %eax, %eax
-# ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
sall $2, %eax
-# endif
+# endif
/* Check the end of data. */
cmpq %rax, %rsi
jbe L(max)
@@ -329,10 +329,10 @@ L(first_vec_x2_check):
.p2align 4
L(first_vec_x3_check):
tzcntl %eax, %eax
-# ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
sall $2, %eax
-# endif
+# endif
/* Check the end of data. */
cmpq %rax, %rsi
jbe L(max)
--
GitLab

View File

@ -1,51 +0,0 @@
From 55bf411b451c13f0fb7ff3d3bf9a820020b45df1 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Mon, 19 Apr 2021 07:07:21 -0700
Subject: [PATCH] x86-64: Require BMI2 for __strlen_evex and __strnlen_evex
Content-type: text/plain; charset=UTF-8
Since __strlen_evex and __strnlen_evex added by
commit 1fd8c163a83d96ace1ff78fa6bac7aee084f6f77
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Fri Mar 5 06:24:52 2021 -0800
x86-64: Add ifunc-avx2.h functions with 256-bit EVEX
use sarx:
c4 e2 6a f7 c0 sarx %edx,%eax,%eax
require BMI2 for __strlen_evex and __strnlen_evex in ifunc-impl-list.c.
ifunc-avx2.h already requires BMI2 for EVEX implementation.
---
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index fec384f6..cbfc1a5d 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -293,7 +293,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__strlen_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strlen,
(CPU_FEATURE_USABLE (AVX512VL)
- && CPU_FEATURE_USABLE (AVX512BW)),
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
__strlen_evex)
IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
@@ -308,7 +309,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__strnlen_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strnlen,
(CPU_FEATURE_USABLE (AVX512VL)
- && CPU_FEATURE_USABLE (AVX512BW)),
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
__strnlen_evex)
IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
--
GitLab

View File

@ -1,584 +0,0 @@
From acfd088a1963ba51cd83c78f95c0ab25ead79e04 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon, 3 May 2021 03:01:58 -0400
Subject: [PATCH] x86: Optimize memchr-avx2.S
Content-type: text/plain; charset=UTF-8
No bug. This commit optimizes memchr-avx2.S. The optimizations include
replacing some branches with cmovcc, avoiding some branches entirely
in the less_4x_vec case, making the page cross logic less strict,
asaving a few instructions the in loop return loop. test-memchr,
test-rawmemchr, and test-wmemchr are all passing.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/multiarch/memchr-avx2.S | 425 ++++++++++++++-----------
1 file changed, 247 insertions(+), 178 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
index cf893e77..b377f22e 100644
--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
@@ -26,8 +26,22 @@
# ifdef USE_AS_WMEMCHR
# define VPCMPEQ vpcmpeqd
+# define VPBROADCAST vpbroadcastd
+# define CHAR_SIZE 4
# else
# define VPCMPEQ vpcmpeqb
+# define VPBROADCAST vpbroadcastb
+# define CHAR_SIZE 1
+# endif
+
+# ifdef USE_AS_RAWMEMCHR
+# define ERAW_PTR_REG ecx
+# define RRAW_PTR_REG rcx
+# define ALGN_PTR_REG rdi
+# else
+# define ERAW_PTR_REG edi
+# define RRAW_PTR_REG rdi
+# define ALGN_PTR_REG rcx
# endif
# ifndef VZEROUPPER
@@ -39,6 +53,7 @@
# endif
# define VEC_SIZE 32
+# define PAGE_SIZE 4096
.section SECTION(.text),"ax",@progbits
ENTRY (MEMCHR)
@@ -47,295 +62,349 @@ ENTRY (MEMCHR)
test %RDX_LP, %RDX_LP
jz L(null)
# endif
- movl %edi, %ecx
- /* Broadcast CHAR to YMM0. */
- vmovd %esi, %xmm0
# ifdef USE_AS_WMEMCHR
shl $2, %RDX_LP
- vpbroadcastd %xmm0, %ymm0
# else
# ifdef __ILP32__
/* Clear the upper 32 bits. */
movl %edx, %edx
# endif
- vpbroadcastb %xmm0, %ymm0
# endif
+ /* Broadcast CHAR to YMMMATCH. */
+ vmovd %esi, %xmm0
+ VPBROADCAST %xmm0, %ymm0
/* Check if we may cross page boundary with one vector load. */
- andl $(2 * VEC_SIZE - 1), %ecx
- cmpl $VEC_SIZE, %ecx
- ja L(cros_page_boundary)
+ movl %edi, %eax
+ andl $(PAGE_SIZE - 1), %eax
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ ja L(cross_page_boundary)
/* Check the first VEC_SIZE bytes. */
- VPCMPEQ (%rdi), %ymm0, %ymm1
+ VPCMPEQ (%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
- testl %eax, %eax
-
# ifndef USE_AS_RAWMEMCHR
- jnz L(first_vec_x0_check)
- /* Adjust length and check the end of data. */
- subq $VEC_SIZE, %rdx
- jbe L(zero)
-# else
- jnz L(first_vec_x0)
+ /* If length < CHAR_PER_VEC handle special. */
+ cmpq $VEC_SIZE, %rdx
+ jbe L(first_vec_x0)
# endif
-
- /* Align data for aligned loads in the loop. */
- addq $VEC_SIZE, %rdi
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
+ testl %eax, %eax
+ jz L(aligned_more)
+ tzcntl %eax, %eax
+ addq %rdi, %rax
+ VZEROUPPER_RETURN
# ifndef USE_AS_RAWMEMCHR
- /* Adjust length. */
- addq %rcx, %rdx
+ .p2align 5
+L(first_vec_x0):
+ /* Check if first match was before length. */
+ tzcntl %eax, %eax
+ xorl %ecx, %ecx
+ cmpl %eax, %edx
+ leaq (%rdi, %rax), %rax
+ cmovle %rcx, %rax
+ VZEROUPPER_RETURN
- subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
+L(null):
+ xorl %eax, %eax
+ ret
# endif
- jmp L(more_4x_vec)
-
.p2align 4
-L(cros_page_boundary):
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
- VPCMPEQ (%rdi), %ymm0, %ymm1
+L(cross_page_boundary):
+ /* Save pointer before aligning as its original value is necessary
+ for computer return address if byte is found or adjusting length
+ if it is not and this is memchr. */
+ movq %rdi, %rcx
+ /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and
+ rdi for rawmemchr. */
+ orq $(VEC_SIZE - 1), %ALGN_PTR_REG
+ VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
+# ifndef USE_AS_RAWMEMCHR
+ /* Calculate length until end of page (length checked for a
+ match). */
+ leaq 1(%ALGN_PTR_REG), %rsi
+ subq %RRAW_PTR_REG, %rsi
+# endif
/* Remove the leading bytes. */
- sarl %cl, %eax
- testl %eax, %eax
- jz L(aligned_more)
- tzcntl %eax, %eax
+ sarxl %ERAW_PTR_REG, %eax, %eax
# ifndef USE_AS_RAWMEMCHR
/* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
+ cmpq %rsi, %rdx
+ jbe L(first_vec_x0)
# endif
- addq %rdi, %rax
- addq %rcx, %rax
+ testl %eax, %eax
+ jz L(cross_page_continue)
+ tzcntl %eax, %eax
+ addq %RRAW_PTR_REG, %rax
L(return_vzeroupper):
ZERO_UPPER_VEC_REGISTERS_RETURN
.p2align 4
-L(aligned_more):
-# ifndef USE_AS_RAWMEMCHR
- /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
- instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
- overflow. */
- negq %rcx
- addq $VEC_SIZE, %rcx
+L(first_vec_x1):
+ tzcntl %eax, %eax
+ incq %rdi
+ addq %rdi, %rax
+ VZEROUPPER_RETURN
- /* Check the end of data. */
- subq %rcx, %rdx
- jbe L(zero)
-# endif
+ .p2align 4
+L(first_vec_x2):
+ tzcntl %eax, %eax
+ addq $(VEC_SIZE + 1), %rdi
+ addq %rdi, %rax
+ VZEROUPPER_RETURN
+
+ .p2align 4
+L(first_vec_x3):
+ tzcntl %eax, %eax
+ addq $(VEC_SIZE * 2 + 1), %rdi
+ addq %rdi, %rax
+ VZEROUPPER_RETURN
- addq $VEC_SIZE, %rdi
-# ifndef USE_AS_RAWMEMCHR
- subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
-# endif
+ .p2align 4
+L(first_vec_x4):
+ tzcntl %eax, %eax
+ addq $(VEC_SIZE * 3 + 1), %rdi
+ addq %rdi, %rax
+ VZEROUPPER_RETURN
-L(more_4x_vec):
+ .p2align 4
+L(aligned_more):
/* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
since data is only aligned to VEC_SIZE. */
- VPCMPEQ (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+# ifndef USE_AS_RAWMEMCHR
+L(cross_page_continue):
+ /* Align data to VEC_SIZE - 1. */
+ xorl %ecx, %ecx
+ subl %edi, %ecx
+ orq $(VEC_SIZE - 1), %rdi
+ /* esi is for adjusting length to see if near the end. */
+ leal (VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
+# else
+ orq $(VEC_SIZE - 1), %rdi
+L(cross_page_continue):
+# endif
+ /* Load first VEC regardless. */
+ VPCMPEQ 1(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
+# ifndef USE_AS_RAWMEMCHR
+ /* Adjust length. If near end handle specially. */
+ subq %rsi, %rdx
+ jbe L(last_4x_vec_or_less)
+# endif
testl %eax, %eax
jnz L(first_vec_x1)
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x2)
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x3)
- addq $(VEC_SIZE * 4), %rdi
+ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x4)
# ifndef USE_AS_RAWMEMCHR
+ /* Check if at last VEC_SIZE * 4 length. */
subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
-# endif
-
- /* Align data to 4 * VEC_SIZE. */
- movq %rdi, %rcx
- andl $(4 * VEC_SIZE - 1), %ecx
- andq $-(4 * VEC_SIZE), %rdi
-
-# ifndef USE_AS_RAWMEMCHR
- /* Adjust length. */
+ jbe L(last_4x_vec_or_less_cmpeq)
+ /* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
+ length. */
+ incq %rdi
+ movl %edi, %ecx
+ orq $(VEC_SIZE * 4 - 1), %rdi
+ andl $(VEC_SIZE * 4 - 1), %ecx
addq %rcx, %rdx
+# else
+ /* Align data to VEC_SIZE * 4 - 1 for loop. */
+ incq %rdi
+ orq $(VEC_SIZE * 4 - 1), %rdi
# endif
+ /* Compare 4 * VEC at a time forward. */
.p2align 4
L(loop_4x_vec):
- /* Compare 4 * VEC at a time forward. */
- VPCMPEQ (%rdi), %ymm0, %ymm1
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
-
+ VPCMPEQ 1(%rdi), %ymm0, %ymm1
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm2
+ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm3
+ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm4
vpor %ymm1, %ymm2, %ymm5
vpor %ymm3, %ymm4, %ymm6
vpor %ymm5, %ymm6, %ymm5
- vpmovmskb %ymm5, %eax
- testl %eax, %eax
- jnz L(4x_vec_end)
-
- addq $(VEC_SIZE * 4), %rdi
-
+ vpmovmskb %ymm5, %ecx
# ifdef USE_AS_RAWMEMCHR
- jmp L(loop_4x_vec)
+ subq $-(VEC_SIZE * 4), %rdi
+ testl %ecx, %ecx
+ jz L(loop_4x_vec)
# else
- subq $(VEC_SIZE * 4), %rdx
- ja L(loop_4x_vec)
+ testl %ecx, %ecx
+ jnz L(loop_4x_vec_end)
-L(last_4x_vec_or_less):
- /* Less than 4 * VEC and aligned to VEC_SIZE. */
- addl $(VEC_SIZE * 2), %edx
- jle L(last_2x_vec)
+ subq $-(VEC_SIZE * 4), %rdi
- VPCMPEQ (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
+ subq $(VEC_SIZE * 4), %rdx
+ ja L(loop_4x_vec)
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+ /* Fall through into less than 4 remaining vectors of length case.
+ */
+ VPCMPEQ (VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
+ .p2align 4
+L(last_4x_vec_or_less):
+ /* Check if first VEC contained match. */
testl %eax, %eax
- jnz L(first_vec_x1)
+ jnz L(first_vec_x1_check)
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
+ /* If remaining length > VEC_SIZE * 2. */
+ addl $(VEC_SIZE * 2), %edx
+ jg L(last_4x_vec)
- jnz L(first_vec_x2_check)
- subl $VEC_SIZE, %edx
- jle L(zero)
+L(last_2x_vec):
+ /* If remaining length < VEC_SIZE. */
+ addl $VEC_SIZE, %edx
+ jle L(zero_end)
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+ /* Check VEC2 and compare any match with remaining length. */
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
- testl %eax, %eax
-
- jnz L(first_vec_x3_check)
- xorl %eax, %eax
+ tzcntl %eax, %eax
+ cmpl %eax, %edx
+ jbe L(set_zero_end)
+ addq $(VEC_SIZE + 1), %rdi
+ addq %rdi, %rax
+L(zero_end):
VZEROUPPER_RETURN
.p2align 4
-L(last_2x_vec):
- addl $(VEC_SIZE * 2), %edx
- VPCMPEQ (%rdi), %ymm0, %ymm1
+L(loop_4x_vec_end):
+# endif
+ /* rawmemchr will fall through into this if match was found in
+ loop. */
+
vpmovmskb %ymm1, %eax
testl %eax, %eax
+ jnz L(last_vec_x1_return)
- jnz L(first_vec_x0_check)
- subl $VEC_SIZE, %edx
- jle L(zero)
-
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm2, %eax
testl %eax, %eax
- jnz L(first_vec_x1_check)
- xorl %eax, %eax
- VZEROUPPER_RETURN
+ jnz L(last_vec_x2_return)
- .p2align 4
-L(first_vec_x0_check):
- tzcntl %eax, %eax
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
+ vpmovmskb %ymm3, %eax
+ /* Combine VEC3 matches (eax) with VEC4 matches (ecx). */
+ salq $32, %rcx
+ orq %rcx, %rax
+ tzcntq %rax, %rax
+# ifdef USE_AS_RAWMEMCHR
+ subq $(VEC_SIZE * 2 - 1), %rdi
+# else
+ subq $-(VEC_SIZE * 2 + 1), %rdi
+# endif
addq %rdi, %rax
VZEROUPPER_RETURN
+# ifndef USE_AS_RAWMEMCHR
.p2align 4
L(first_vec_x1_check):
tzcntl %eax, %eax
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
- addq $VEC_SIZE, %rax
+ /* Adjust length. */
+ subl $-(VEC_SIZE * 4), %edx
+ /* Check if match within remaining length. */
+ cmpl %eax, %edx
+ jbe L(set_zero_end)
+ incq %rdi
addq %rdi, %rax
VZEROUPPER_RETURN
+ .p2align 4
+L(set_zero_end):
+ xorl %eax, %eax
+ VZEROUPPER_RETURN
+# endif
.p2align 4
-L(first_vec_x2_check):
+L(last_vec_x1_return):
tzcntl %eax, %eax
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
- addq $(VEC_SIZE * 2), %rax
+# ifdef USE_AS_RAWMEMCHR
+ subq $(VEC_SIZE * 4 - 1), %rdi
+# else
+ incq %rdi
+# endif
addq %rdi, %rax
VZEROUPPER_RETURN
.p2align 4
-L(first_vec_x3_check):
+L(last_vec_x2_return):
tzcntl %eax, %eax
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
- addq $(VEC_SIZE * 3), %rax
+# ifdef USE_AS_RAWMEMCHR
+ subq $(VEC_SIZE * 3 - 1), %rdi
+# else
+ subq $-(VEC_SIZE + 1), %rdi
+# endif
addq %rdi, %rax
VZEROUPPER_RETURN
+# ifndef USE_AS_RAWMEMCHR
.p2align 4
-L(zero):
- xorl %eax, %eax
- jmp L(return_vzeroupper)
+L(last_4x_vec_or_less_cmpeq):
+ VPCMPEQ (VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ subq $-(VEC_SIZE * 4), %rdi
+ /* Check first VEC regardless. */
+ testl %eax, %eax
+ jnz L(first_vec_x1_check)
+ /* If remaining length <= CHAR_PER_VEC * 2. */
+ addl $(VEC_SIZE * 2), %edx
+ jle L(last_2x_vec)
.p2align 4
-L(null):
- xorl %eax, %eax
- ret
-# endif
+L(last_4x_vec):
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x2_return)
- .p2align 4
-L(first_vec_x0):
- tzcntl %eax, %eax
- addq %rdi, %rax
- VZEROUPPER_RETURN
+ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
- .p2align 4
-L(first_vec_x1):
- tzcntl %eax, %eax
- addq $VEC_SIZE, %rax
- addq %rdi, %rax
- VZEROUPPER_RETURN
+ /* Create mask for possible matches within remaining length. */
+ movq $-1, %rcx
+ bzhiq %rdx, %rcx, %rcx
- .p2align 4
-L(first_vec_x2):
+ /* Test matches in data against length match. */
+ andl %ecx, %eax
+ jnz L(last_vec_x3)
+
+ /* if remaining length <= VEC_SIZE * 3 (Note this is after
+ remaining length was found to be > VEC_SIZE * 2. */
+ subl $VEC_SIZE, %edx
+ jbe L(zero_end2)
+
+ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ /* Shift remaining length mask for last VEC. */
+ shrq $32, %rcx
+ andl %ecx, %eax
+ jz L(zero_end2)
tzcntl %eax, %eax
- addq $(VEC_SIZE * 2), %rax
+ addq $(VEC_SIZE * 3 + 1), %rdi
addq %rdi, %rax
+L(zero_end2):
VZEROUPPER_RETURN
.p2align 4
-L(4x_vec_end):
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
- vpmovmskb %ymm2, %eax
- testl %eax, %eax
- jnz L(first_vec_x1)
- vpmovmskb %ymm3, %eax
- testl %eax, %eax
- jnz L(first_vec_x2)
- vpmovmskb %ymm4, %eax
- testl %eax, %eax
-L(first_vec_x3):
+L(last_vec_x3):
tzcntl %eax, %eax
- addq $(VEC_SIZE * 3), %rax
+ subq $-(VEC_SIZE * 2 + 1), %rdi
addq %rdi, %rax
VZEROUPPER_RETURN
+# endif
END (MEMCHR)
#endif
--
GitLab

View File

@ -1,388 +0,0 @@
From 645a158978f9520e74074e8c14047503be4db0f0 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Wed, 9 Jun 2021 16:25:32 -0400
Subject: [PATCH] x86: Fix overflow bug with wmemchr-sse2 and wmemchr-avx2 [BZ
#27974]
Content-type: text/plain; charset=UTF-8
This commit fixes the bug mentioned in the previous commit.
The previous implementations of wmemchr in these files relied
on n * sizeof(wchar_t) which was not guranteed by the standard.
The new overflow tests added in the previous commit now
pass (As well as all the other tests).
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/memchr.S | 77 +++++++++++++++++++-------
sysdeps/x86_64/multiarch/memchr-avx2.S | 58 +++++++++++++------
2 files changed, 98 insertions(+), 37 deletions(-)
diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
index cb320257..24f9a0c5 100644
--- a/sysdeps/x86_64/memchr.S
+++ b/sysdeps/x86_64/memchr.S
@@ -21,9 +21,11 @@
#ifdef USE_AS_WMEMCHR
# define MEMCHR wmemchr
# define PCMPEQ pcmpeqd
+# define CHAR_PER_VEC 4
#else
# define MEMCHR memchr
# define PCMPEQ pcmpeqb
+# define CHAR_PER_VEC 16
#endif
/* fast SSE2 version with using pmaxub and 64 byte loop */
@@ -33,15 +35,14 @@ ENTRY(MEMCHR)
movd %esi, %xmm1
mov %edi, %ecx
+#ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+ movl %edx, %edx
+#endif
#ifdef USE_AS_WMEMCHR
test %RDX_LP, %RDX_LP
jz L(return_null)
- shl $2, %RDX_LP
#else
-# ifdef __ILP32__
- /* Clear the upper 32 bits. */
- movl %edx, %edx
-# endif
punpcklbw %xmm1, %xmm1
test %RDX_LP, %RDX_LP
jz L(return_null)
@@ -60,13 +61,16 @@ ENTRY(MEMCHR)
test %eax, %eax
jnz L(matches_1)
- sub $16, %rdx
+ sub $CHAR_PER_VEC, %rdx
jbe L(return_null)
add $16, %rdi
and $15, %ecx
and $-16, %rdi
+#ifdef USE_AS_WMEMCHR
+ shr $2, %ecx
+#endif
add %rcx, %rdx
- sub $64, %rdx
+ sub $(CHAR_PER_VEC * 4), %rdx
jbe L(exit_loop)
jmp L(loop_prolog)
@@ -77,16 +81,21 @@ L(crosscache):
movdqa (%rdi), %xmm0
PCMPEQ %xmm1, %xmm0
-/* Check if there is a match. */
+ /* Check if there is a match. */
pmovmskb %xmm0, %eax
-/* Remove the leading bytes. */
+ /* Remove the leading bytes. */
sar %cl, %eax
test %eax, %eax
je L(unaligned_no_match)
-/* Check which byte is a match. */
+ /* Check which byte is a match. */
bsf %eax, %eax
-
+#ifdef USE_AS_WMEMCHR
+ mov %eax, %esi
+ shr $2, %esi
+ sub %rsi, %rdx
+#else
sub %rax, %rdx
+#endif
jbe L(return_null)
add %rdi, %rax
add %rcx, %rax
@@ -94,15 +103,18 @@ L(crosscache):
.p2align 4
L(unaligned_no_match):
- /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using
+ /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using
"rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void
possible addition overflow. */
neg %rcx
add $16, %rcx
+#ifdef USE_AS_WMEMCHR
+ shr $2, %ecx
+#endif
sub %rcx, %rdx
jbe L(return_null)
add $16, %rdi
- sub $64, %rdx
+ sub $(CHAR_PER_VEC * 4), %rdx
jbe L(exit_loop)
.p2align 4
@@ -135,7 +147,7 @@ L(loop_prolog):
test $0x3f, %rdi
jz L(align64_loop)
- sub $64, %rdx
+ sub $(CHAR_PER_VEC * 4), %rdx
jbe L(exit_loop)
movdqa (%rdi), %xmm0
@@ -167,11 +179,14 @@ L(loop_prolog):
mov %rdi, %rcx
and $-64, %rdi
and $63, %ecx
+#ifdef USE_AS_WMEMCHR
+ shr $2, %ecx
+#endif
add %rcx, %rdx
.p2align 4
L(align64_loop):
- sub $64, %rdx
+ sub $(CHAR_PER_VEC * 4), %rdx
jbe L(exit_loop)
movdqa (%rdi), %xmm0
movdqa 16(%rdi), %xmm2
@@ -218,7 +233,7 @@ L(align64_loop):
.p2align 4
L(exit_loop):
- add $32, %edx
+ add $(CHAR_PER_VEC * 2), %edx
jle L(exit_loop_32)
movdqa (%rdi), %xmm0
@@ -238,7 +253,7 @@ L(exit_loop):
pmovmskb %xmm3, %eax
test %eax, %eax
jnz L(matches32_1)
- sub $16, %edx
+ sub $CHAR_PER_VEC, %edx
jle L(return_null)
PCMPEQ 48(%rdi), %xmm1
@@ -250,13 +265,13 @@ L(exit_loop):
.p2align 4
L(exit_loop_32):
- add $32, %edx
+ add $(CHAR_PER_VEC * 2), %edx
movdqa (%rdi), %xmm0
PCMPEQ %xmm1, %xmm0
pmovmskb %xmm0, %eax
test %eax, %eax
jnz L(matches_1)
- sub $16, %edx
+ sub $CHAR_PER_VEC, %edx
jbe L(return_null)
PCMPEQ 16(%rdi), %xmm1
@@ -293,7 +308,13 @@ L(matches32):
.p2align 4
L(matches_1):
bsf %eax, %eax
+#ifdef USE_AS_WMEMCHR
+ mov %eax, %esi
+ shr $2, %esi
+ sub %rsi, %rdx
+#else
sub %rax, %rdx
+#endif
jbe L(return_null)
add %rdi, %rax
ret
@@ -301,7 +322,13 @@ L(matches_1):
.p2align 4
L(matches16_1):
bsf %eax, %eax
+#ifdef USE_AS_WMEMCHR
+ mov %eax, %esi
+ shr $2, %esi
+ sub %rsi, %rdx
+#else
sub %rax, %rdx
+#endif
jbe L(return_null)
lea 16(%rdi, %rax), %rax
ret
@@ -309,7 +336,13 @@ L(matches16_1):
.p2align 4
L(matches32_1):
bsf %eax, %eax
+#ifdef USE_AS_WMEMCHR
+ mov %eax, %esi
+ shr $2, %esi
+ sub %rsi, %rdx
+#else
sub %rax, %rdx
+#endif
jbe L(return_null)
lea 32(%rdi, %rax), %rax
ret
@@ -317,7 +350,13 @@ L(matches32_1):
.p2align 4
L(matches48_1):
bsf %eax, %eax
+#ifdef USE_AS_WMEMCHR
+ mov %eax, %esi
+ shr $2, %esi
+ sub %rsi, %rdx
+#else
sub %rax, %rdx
+#endif
jbe L(return_null)
lea 48(%rdi, %rax), %rax
ret
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
index b377f22e..16027abb 100644
--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
@@ -54,21 +54,19 @@
# define VEC_SIZE 32
# define PAGE_SIZE 4096
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
.section SECTION(.text),"ax",@progbits
ENTRY (MEMCHR)
# ifndef USE_AS_RAWMEMCHR
/* Check for zero length. */
- test %RDX_LP, %RDX_LP
- jz L(null)
-# endif
-# ifdef USE_AS_WMEMCHR
- shl $2, %RDX_LP
-# else
# ifdef __ILP32__
- /* Clear the upper 32 bits. */
- movl %edx, %edx
+ /* Clear upper bits. */
+ and %RDX_LP, %RDX_LP
+# else
+ test %RDX_LP, %RDX_LP
# endif
+ jz L(null)
# endif
/* Broadcast CHAR to YMMMATCH. */
vmovd %esi, %xmm0
@@ -84,7 +82,7 @@ ENTRY (MEMCHR)
vpmovmskb %ymm1, %eax
# ifndef USE_AS_RAWMEMCHR
/* If length < CHAR_PER_VEC handle special. */
- cmpq $VEC_SIZE, %rdx
+ cmpq $CHAR_PER_VEC, %rdx
jbe L(first_vec_x0)
# endif
testl %eax, %eax
@@ -98,6 +96,10 @@ ENTRY (MEMCHR)
L(first_vec_x0):
/* Check if first match was before length. */
tzcntl %eax, %eax
+# ifdef USE_AS_WMEMCHR
+ /* NB: Multiply length by 4 to get byte count. */
+ sall $2, %edx
+# endif
xorl %ecx, %ecx
cmpl %eax, %edx
leaq (%rdi, %rax), %rax
@@ -110,12 +112,12 @@ L(null):
# endif
.p2align 4
L(cross_page_boundary):
- /* Save pointer before aligning as its original value is necessary
- for computer return address if byte is found or adjusting length
- if it is not and this is memchr. */
+ /* Save pointer before aligning as its original value is
+ necessary for computer return address if byte is found or
+ adjusting length if it is not and this is memchr. */
movq %rdi, %rcx
- /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and
- rdi for rawmemchr. */
+ /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr
+ and rdi for rawmemchr. */
orq $(VEC_SIZE - 1), %ALGN_PTR_REG
VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
@@ -124,6 +126,10 @@ L(cross_page_boundary):
match). */
leaq 1(%ALGN_PTR_REG), %rsi
subq %RRAW_PTR_REG, %rsi
+# ifdef USE_AS_WMEMCHR
+ /* NB: Divide bytes by 4 to get wchar_t count. */
+ shrl $2, %esi
+# endif
# endif
/* Remove the leading bytes. */
sarxl %ERAW_PTR_REG, %eax, %eax
@@ -181,6 +187,10 @@ L(cross_page_continue):
orq $(VEC_SIZE - 1), %rdi
/* esi is for adjusting length to see if near the end. */
leal (VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
+# ifdef USE_AS_WMEMCHR
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %esi
+# endif
# else
orq $(VEC_SIZE - 1), %rdi
L(cross_page_continue):
@@ -213,7 +223,7 @@ L(cross_page_continue):
# ifndef USE_AS_RAWMEMCHR
/* Check if at last VEC_SIZE * 4 length. */
- subq $(VEC_SIZE * 4), %rdx
+ subq $(CHAR_PER_VEC * 4), %rdx
jbe L(last_4x_vec_or_less_cmpeq)
/* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
length. */
@@ -221,6 +231,10 @@ L(cross_page_continue):
movl %edi, %ecx
orq $(VEC_SIZE * 4 - 1), %rdi
andl $(VEC_SIZE * 4 - 1), %ecx
+# ifdef USE_AS_WMEMCHR
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %ecx
+# endif
addq %rcx, %rdx
# else
/* Align data to VEC_SIZE * 4 - 1 for loop. */
@@ -250,15 +264,19 @@ L(loop_4x_vec):
subq $-(VEC_SIZE * 4), %rdi
- subq $(VEC_SIZE * 4), %rdx
+ subq $(CHAR_PER_VEC * 4), %rdx
ja L(loop_4x_vec)
- /* Fall through into less than 4 remaining vectors of length case.
- */
+ /* Fall through into less than 4 remaining vectors of length
+ case. */
VPCMPEQ (VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
.p2align 4
L(last_4x_vec_or_less):
+# ifdef USE_AS_WMEMCHR
+ /* NB: Multiply length by 4 to get byte count. */
+ sall $2, %edx
+# endif
/* Check if first VEC contained match. */
testl %eax, %eax
jnz L(first_vec_x1_check)
@@ -355,6 +373,10 @@ L(last_vec_x2_return):
L(last_4x_vec_or_less_cmpeq):
VPCMPEQ (VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
+# ifdef USE_AS_WMEMCHR
+ /* NB: Multiply length by 4 to get byte count. */
+ sall $2, %edx
+# endif
subq $-(VEC_SIZE * 4), %rdi
/* Check first VEC regardless. */
testl %eax, %eax
--
GitLab

View File

@ -1,767 +0,0 @@
From aaa23c35071537e2dcf5807e956802ed215210aa Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon, 19 Apr 2021 19:36:07 -0400
Subject: [PATCH] x86: Optimize strlen-avx2.S
Content-type: text/plain; charset=UTF-8
No bug. This commit optimizes strlen-avx2.S. The optimizations are
mostly small things but they add up to roughly 10-30% performance
improvement for strlen. The results for strnlen are bit more
ambiguous. test-strlen, test-strnlen, test-wcslen, and test-wcsnlen
are all passing.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 +-
sysdeps/x86_64/multiarch/strlen-avx2.S | 532 +++++++++++++--------
2 files changed, 334 insertions(+), 214 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index cbfc1a5d..f1a6460a 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -285,10 +285,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strlen.c. */
IFUNC_IMPL (i, name, strlen,
IFUNC_IMPL_ADD (array, i, strlen,
- CPU_FEATURE_USABLE (AVX2),
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (BMI2)),
__strlen_avx2)
IFUNC_IMPL_ADD (array, i, strlen,
(CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (RTM)),
__strlen_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strlen,
@@ -301,10 +303,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strnlen.c. */
IFUNC_IMPL (i, name, strnlen,
IFUNC_IMPL_ADD (array, i, strnlen,
- CPU_FEATURE_USABLE (AVX2),
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (BMI2)),
__strnlen_avx2)
IFUNC_IMPL_ADD (array, i, strnlen,
(CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (RTM)),
__strnlen_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strnlen,
@@ -640,10 +644,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/wcslen.c. */
IFUNC_IMPL (i, name, wcslen,
IFUNC_IMPL_ADD (array, i, wcslen,
- CPU_FEATURE_USABLE (AVX2),
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (BMI2)),
__wcslen_avx2)
IFUNC_IMPL_ADD (array, i, wcslen,
(CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (RTM)),
__wcslen_avx2_rtm)
IFUNC_IMPL_ADD (array, i, wcslen,
@@ -656,10 +662,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/wcsnlen.c. */
IFUNC_IMPL (i, name, wcsnlen,
IFUNC_IMPL_ADD (array, i, wcsnlen,
- CPU_FEATURE_USABLE (AVX2),
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (BMI2)),
__wcsnlen_avx2)
IFUNC_IMPL_ADD (array, i, wcsnlen,
(CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (RTM)),
__wcsnlen_avx2_rtm)
IFUNC_IMPL_ADD (array, i, wcsnlen,
diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
index 82826e10..be8a5db5 100644
--- a/sysdeps/x86_64/multiarch/strlen-avx2.S
+++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
@@ -27,9 +27,11 @@
# ifdef USE_AS_WCSLEN
# define VPCMPEQ vpcmpeqd
# define VPMINU vpminud
+# define CHAR_SIZE 4
# else
# define VPCMPEQ vpcmpeqb
# define VPMINU vpminub
+# define CHAR_SIZE 1
# endif
# ifndef VZEROUPPER
@@ -41,349 +43,459 @@
# endif
# define VEC_SIZE 32
+# define PAGE_SIZE 4096
.section SECTION(.text),"ax",@progbits
ENTRY (STRLEN)
# ifdef USE_AS_STRNLEN
- /* Check for zero length. */
+ /* Check zero length. */
test %RSI_LP, %RSI_LP
jz L(zero)
+ /* Store max len in R8_LP before adjusting if using WCSLEN. */
+ mov %RSI_LP, %R8_LP
# ifdef USE_AS_WCSLEN
shl $2, %RSI_LP
# elif defined __ILP32__
/* Clear the upper 32 bits. */
movl %esi, %esi
# endif
- mov %RSI_LP, %R8_LP
# endif
- movl %edi, %ecx
+ movl %edi, %eax
movq %rdi, %rdx
vpxor %xmm0, %xmm0, %xmm0
-
+ /* Clear high bits from edi. Only keeping bits relevant to page
+ cross check. */
+ andl $(PAGE_SIZE - 1), %eax
/* Check if we may cross page boundary with one vector load. */
- andl $(2 * VEC_SIZE - 1), %ecx
- cmpl $VEC_SIZE, %ecx
- ja L(cros_page_boundary)
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ ja L(cross_page_boundary)
/* Check the first VEC_SIZE bytes. */
- VPCMPEQ (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
-
+ VPCMPEQ (%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
# ifdef USE_AS_STRNLEN
- jnz L(first_vec_x0_check)
- /* Adjust length and check the end of data. */
- subq $VEC_SIZE, %rsi
- jbe L(max)
-# else
- jnz L(first_vec_x0)
+ /* If length < VEC_SIZE handle special. */
+ cmpq $VEC_SIZE, %rsi
+ jbe L(first_vec_x0)
# endif
-
- /* Align data for aligned loads in the loop. */
- addq $VEC_SIZE, %rdi
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
+ /* If empty continue to aligned_more. Otherwise return bit
+ position of first match. */
+ testl %eax, %eax
+ jz L(aligned_more)
+ tzcntl %eax, %eax
+# ifdef USE_AS_WCSLEN
+ shrl $2, %eax
+# endif
+ VZEROUPPER_RETURN
# ifdef USE_AS_STRNLEN
- /* Adjust length. */
- addq %rcx, %rsi
+L(zero):
+ xorl %eax, %eax
+ ret
- subq $(VEC_SIZE * 4), %rsi
- jbe L(last_4x_vec_or_less)
+ .p2align 4
+L(first_vec_x0):
+ /* Set bit for max len so that tzcnt will return min of max len
+ and position of first match. */
+ btsq %rsi, %rax
+ tzcntl %eax, %eax
+# ifdef USE_AS_WCSLEN
+ shrl $2, %eax
+# endif
+ VZEROUPPER_RETURN
# endif
- jmp L(more_4x_vec)
.p2align 4
-L(cros_page_boundary):
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
- VPCMPEQ (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- /* Remove the leading bytes. */
- sarl %cl, %eax
- testl %eax, %eax
- jz L(aligned_more)
+L(first_vec_x1):
tzcntl %eax, %eax
+ /* Safe to use 32 bit instructions as these are only called for
+ size = [1, 159]. */
# ifdef USE_AS_STRNLEN
- /* Check the end of data. */
- cmpq %rax, %rsi
- jbe L(max)
+ /* Use ecx which was computed earlier to compute correct value.
+ */
+ subl $(VEC_SIZE * 4 + 1), %ecx
+ addl %ecx, %eax
+# else
+ subl %edx, %edi
+ incl %edi
+ addl %edi, %eax
# endif
- addq %rdi, %rax
- addq %rcx, %rax
- subq %rdx, %rax
# ifdef USE_AS_WCSLEN
- shrq $2, %rax
+ shrl $2, %eax
# endif
-L(return_vzeroupper):
- ZERO_UPPER_VEC_REGISTERS_RETURN
+ VZEROUPPER_RETURN
.p2align 4
-L(aligned_more):
+L(first_vec_x2):
+ tzcntl %eax, %eax
+ /* Safe to use 32 bit instructions as these are only called for
+ size = [1, 159]. */
# ifdef USE_AS_STRNLEN
- /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE"
- with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE"
- to void possible addition overflow. */
- negq %rcx
- addq $VEC_SIZE, %rcx
-
- /* Check the end of data. */
- subq %rcx, %rsi
- jbe L(max)
+ /* Use ecx which was computed earlier to compute correct value.
+ */
+ subl $(VEC_SIZE * 3 + 1), %ecx
+ addl %ecx, %eax
+# else
+ subl %edx, %edi
+ addl $(VEC_SIZE + 1), %edi
+ addl %edi, %eax
# endif
+# ifdef USE_AS_WCSLEN
+ shrl $2, %eax
+# endif
+ VZEROUPPER_RETURN
- addq $VEC_SIZE, %rdi
+ .p2align 4
+L(first_vec_x3):
+ tzcntl %eax, %eax
+ /* Safe to use 32 bit instructions as these are only called for
+ size = [1, 159]. */
+# ifdef USE_AS_STRNLEN
+ /* Use ecx which was computed earlier to compute correct value.
+ */
+ subl $(VEC_SIZE * 2 + 1), %ecx
+ addl %ecx, %eax
+# else
+ subl %edx, %edi
+ addl $(VEC_SIZE * 2 + 1), %edi
+ addl %edi, %eax
+# endif
+# ifdef USE_AS_WCSLEN
+ shrl $2, %eax
+# endif
+ VZEROUPPER_RETURN
+ .p2align 4
+L(first_vec_x4):
+ tzcntl %eax, %eax
+ /* Safe to use 32 bit instructions as these are only called for
+ size = [1, 159]. */
# ifdef USE_AS_STRNLEN
- subq $(VEC_SIZE * 4), %rsi
- jbe L(last_4x_vec_or_less)
+ /* Use ecx which was computed earlier to compute correct value.
+ */
+ subl $(VEC_SIZE + 1), %ecx
+ addl %ecx, %eax
+# else
+ subl %edx, %edi
+ addl $(VEC_SIZE * 3 + 1), %edi
+ addl %edi, %eax
# endif
+# ifdef USE_AS_WCSLEN
+ shrl $2, %eax
+# endif
+ VZEROUPPER_RETURN
-L(more_4x_vec):
+ .p2align 5
+L(aligned_more):
+ /* Align data to VEC_SIZE - 1. This is the same number of
+ instructions as using andq with -VEC_SIZE but saves 4 bytes of
+ code on the x4 check. */
+ orq $(VEC_SIZE - 1), %rdi
+L(cross_page_continue):
/* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
since data is only aligned to VEC_SIZE. */
- VPCMPEQ (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
-
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+# ifdef USE_AS_STRNLEN
+ /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
+ it simplies the logic in last_4x_vec_or_less. */
+ leaq (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
+ subq %rdx, %rcx
+# endif
+ /* Load first VEC regardless. */
+ VPCMPEQ 1(%rdi), %ymm0, %ymm1
+# ifdef USE_AS_STRNLEN
+ /* Adjust length. If near end handle specially. */
+ subq %rcx, %rsi
+ jb L(last_4x_vec_or_less)
+# endif
+ vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x1)
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x2)
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x3)
- addq $(VEC_SIZE * 4), %rdi
-
-# ifdef USE_AS_STRNLEN
- subq $(VEC_SIZE * 4), %rsi
- jbe L(last_4x_vec_or_less)
-# endif
-
- /* Align data to 4 * VEC_SIZE. */
- movq %rdi, %rcx
- andl $(4 * VEC_SIZE - 1), %ecx
- andq $-(4 * VEC_SIZE), %rdi
+ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x4)
+ /* Align data to VEC_SIZE * 4 - 1. */
# ifdef USE_AS_STRNLEN
- /* Adjust length. */
+ /* Before adjusting length check if at last VEC_SIZE * 4. */
+ cmpq $(VEC_SIZE * 4 - 1), %rsi
+ jbe L(last_4x_vec_or_less_load)
+ incq %rdi
+ movl %edi, %ecx
+ orq $(VEC_SIZE * 4 - 1), %rdi
+ andl $(VEC_SIZE * 4 - 1), %ecx
+ /* Readjust length. */
addq %rcx, %rsi
+# else
+ incq %rdi
+ orq $(VEC_SIZE * 4 - 1), %rdi
# endif
-
+ /* Compare 4 * VEC at a time forward. */
.p2align 4
L(loop_4x_vec):
- /* Compare 4 * VEC at a time forward. */
- vmovdqa (%rdi), %ymm1
- vmovdqa VEC_SIZE(%rdi), %ymm2
- vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3
- vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4
- VPMINU %ymm1, %ymm2, %ymm5
- VPMINU %ymm3, %ymm4, %ymm6
- VPMINU %ymm5, %ymm6, %ymm5
-
- VPCMPEQ %ymm5, %ymm0, %ymm5
- vpmovmskb %ymm5, %eax
- testl %eax, %eax
- jnz L(4x_vec_end)
-
- addq $(VEC_SIZE * 4), %rdi
-
-# ifndef USE_AS_STRNLEN
- jmp L(loop_4x_vec)
-# else
+# ifdef USE_AS_STRNLEN
+ /* Break if at end of length. */
subq $(VEC_SIZE * 4), %rsi
- ja L(loop_4x_vec)
-
-L(last_4x_vec_or_less):
- /* Less than 4 * VEC and aligned to VEC_SIZE. */
- addl $(VEC_SIZE * 2), %esi
- jle L(last_2x_vec)
+ jb L(last_4x_vec_or_less_cmpeq)
+# endif
+ /* Save some code size by microfusing VPMINU with the load. Since
+ the matches in ymm2/ymm4 can only be returned if there where no
+ matches in ymm1/ymm3 respectively there is no issue with overlap.
+ */
+ vmovdqa 1(%rdi), %ymm1
+ VPMINU (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
+ vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3
+ VPMINU (VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4
+
+ VPMINU %ymm2, %ymm4, %ymm5
+ VPCMPEQ %ymm5, %ymm0, %ymm5
+ vpmovmskb %ymm5, %ecx
- VPCMPEQ (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
+ subq $-(VEC_SIZE * 4), %rdi
+ testl %ecx, %ecx
+ jz L(loop_4x_vec)
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x1)
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ VPCMPEQ %ymm1, %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ subq %rdx, %rdi
testl %eax, %eax
+ jnz L(last_vec_return_x0)
- jnz L(first_vec_x2_check)
- subl $VEC_SIZE, %esi
- jle L(max)
-
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ VPCMPEQ %ymm2, %ymm0, %ymm2
+ vpmovmskb %ymm2, %eax
testl %eax, %eax
-
- jnz L(first_vec_x3_check)
- movq %r8, %rax
-# ifdef USE_AS_WCSLEN
+ jnz L(last_vec_return_x1)
+
+ /* Combine last 2 VEC. */
+ VPCMPEQ %ymm3, %ymm0, %ymm3
+ vpmovmskb %ymm3, %eax
+ /* rcx has combined result from all 4 VEC. It will only be used if
+ the first 3 other VEC all did not contain a match. */
+ salq $32, %rcx
+ orq %rcx, %rax
+ tzcntq %rax, %rax
+ subq $(VEC_SIZE * 2 - 1), %rdi
+ addq %rdi, %rax
+# ifdef USE_AS_WCSLEN
shrq $2, %rax
-# endif
+# endif
VZEROUPPER_RETURN
+
+# ifdef USE_AS_STRNLEN
.p2align 4
-L(last_2x_vec):
- addl $(VEC_SIZE * 2), %esi
- VPCMPEQ (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
+L(last_4x_vec_or_less_load):
+ /* Depending on entry adjust rdi / prepare first VEC in ymm1. */
+ subq $-(VEC_SIZE * 4), %rdi
+L(last_4x_vec_or_less_cmpeq):
+ VPCMPEQ 1(%rdi), %ymm0, %ymm1
+L(last_4x_vec_or_less):
- jnz L(first_vec_x0_check)
- subl $VEC_SIZE, %esi
- jle L(max)
+ vpmovmskb %ymm1, %eax
+ /* If remaining length > VEC_SIZE * 2. This works if esi is off by
+ VEC_SIZE * 4. */
+ testl $(VEC_SIZE * 2), %esi
+ jnz L(last_4x_vec)
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ /* length may have been negative or positive by an offset of
+ VEC_SIZE * 4 depending on where this was called from. This fixes
+ that. */
+ andl $(VEC_SIZE * 4 - 1), %esi
testl %eax, %eax
- jnz L(first_vec_x1_check)
- movq %r8, %rax
-# ifdef USE_AS_WCSLEN
- shrq $2, %rax
-# endif
- VZEROUPPER_RETURN
+ jnz L(last_vec_x1_check)
- .p2align 4
-L(first_vec_x0_check):
+ subl $VEC_SIZE, %esi
+ jb L(max)
+
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
tzcntl %eax, %eax
/* Check the end of data. */
- cmpq %rax, %rsi
- jbe L(max)
+ cmpl %eax, %esi
+ jb L(max)
+ subq %rdx, %rdi
+ addl $(VEC_SIZE + 1), %eax
addq %rdi, %rax
- subq %rdx, %rax
# ifdef USE_AS_WCSLEN
shrq $2, %rax
# endif
VZEROUPPER_RETURN
+# endif
.p2align 4
-L(first_vec_x1_check):
+L(last_vec_return_x0):
tzcntl %eax, %eax
- /* Check the end of data. */
- cmpq %rax, %rsi
- jbe L(max)
- addq $VEC_SIZE, %rax
+ subq $(VEC_SIZE * 4 - 1), %rdi
addq %rdi, %rax
- subq %rdx, %rax
-# ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
shrq $2, %rax
-# endif
+# endif
VZEROUPPER_RETURN
.p2align 4
-L(first_vec_x2_check):
+L(last_vec_return_x1):
tzcntl %eax, %eax
- /* Check the end of data. */
- cmpq %rax, %rsi
- jbe L(max)
- addq $(VEC_SIZE * 2), %rax
+ subq $(VEC_SIZE * 3 - 1), %rdi
addq %rdi, %rax
- subq %rdx, %rax
-# ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
shrq $2, %rax
-# endif
+# endif
VZEROUPPER_RETURN
+# ifdef USE_AS_STRNLEN
.p2align 4
-L(first_vec_x3_check):
+L(last_vec_x1_check):
+
tzcntl %eax, %eax
/* Check the end of data. */
- cmpq %rax, %rsi
- jbe L(max)
- addq $(VEC_SIZE * 3), %rax
+ cmpl %eax, %esi
+ jb L(max)
+ subq %rdx, %rdi
+ incl %eax
addq %rdi, %rax
- subq %rdx, %rax
# ifdef USE_AS_WCSLEN
shrq $2, %rax
# endif
VZEROUPPER_RETURN
- .p2align 4
L(max):
movq %r8, %rax
+ VZEROUPPER_RETURN
+
+ .p2align 4
+L(last_4x_vec):
+ /* Test first 2x VEC normally. */
+ testl %eax, %eax
+ jnz L(last_vec_x1)
+
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x2)
+
+ /* Normalize length. */
+ andl $(VEC_SIZE * 4 - 1), %esi
+ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x3)
+
+ subl $(VEC_SIZE * 3), %esi
+ jb L(max)
+
+ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ tzcntl %eax, %eax
+ /* Check the end of data. */
+ cmpl %eax, %esi
+ jb L(max)
+ subq %rdx, %rdi
+ addl $(VEC_SIZE * 3 + 1), %eax
+ addq %rdi, %rax
# ifdef USE_AS_WCSLEN
shrq $2, %rax
# endif
VZEROUPPER_RETURN
- .p2align 4
-L(zero):
- xorl %eax, %eax
- ret
-# endif
.p2align 4
-L(first_vec_x0):
+L(last_vec_x1):
+ /* essentially duplicates of first_vec_x1 but use 64 bit
+ instructions. */
tzcntl %eax, %eax
+ subq %rdx, %rdi
+ incl %eax
addq %rdi, %rax
- subq %rdx, %rax
-# ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
shrq $2, %rax
-# endif
+# endif
VZEROUPPER_RETURN
.p2align 4
-L(first_vec_x1):
+L(last_vec_x2):
+ /* essentially duplicates of first_vec_x1 but use 64 bit
+ instructions. */
tzcntl %eax, %eax
- addq $VEC_SIZE, %rax
+ subq %rdx, %rdi
+ addl $(VEC_SIZE + 1), %eax
addq %rdi, %rax
- subq %rdx, %rax
-# ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
shrq $2, %rax
-# endif
+# endif
VZEROUPPER_RETURN
.p2align 4
-L(first_vec_x2):
+L(last_vec_x3):
tzcntl %eax, %eax
- addq $(VEC_SIZE * 2), %rax
+ subl $(VEC_SIZE * 2), %esi
+ /* Check the end of data. */
+ cmpl %eax, %esi
+ jb L(max_end)
+ subq %rdx, %rdi
+ addl $(VEC_SIZE * 2 + 1), %eax
addq %rdi, %rax
- subq %rdx, %rax
-# ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
shrq $2, %rax
-# endif
+# endif
+ VZEROUPPER_RETURN
+L(max_end):
+ movq %r8, %rax
VZEROUPPER_RETURN
+# endif
+ /* Cold case for crossing page with first load. */
.p2align 4
-L(4x_vec_end):
- VPCMPEQ %ymm1, %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
- VPCMPEQ %ymm2, %ymm0, %ymm2
- vpmovmskb %ymm2, %eax
+L(cross_page_boundary):
+ /* Align data to VEC_SIZE - 1. */
+ orq $(VEC_SIZE - 1), %rdi
+ VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
+ so no need to manually mod rdx. */
+ sarxl %edx, %eax, %eax
+# ifdef USE_AS_STRNLEN
testl %eax, %eax
- jnz L(first_vec_x1)
- VPCMPEQ %ymm3, %ymm0, %ymm3
- vpmovmskb %ymm3, %eax
+ jnz L(cross_page_less_vec)
+ leaq 1(%rdi), %rcx
+ subq %rdx, %rcx
+ /* Check length. */
+ cmpq %rsi, %rcx
+ jb L(cross_page_continue)
+ movq %r8, %rax
+# else
testl %eax, %eax
- jnz L(first_vec_x2)
- VPCMPEQ %ymm4, %ymm0, %ymm4
- vpmovmskb %ymm4, %eax
-L(first_vec_x3):
+ jz L(cross_page_continue)
tzcntl %eax, %eax
- addq $(VEC_SIZE * 3), %rax
- addq %rdi, %rax
- subq %rdx, %rax
-# ifdef USE_AS_WCSLEN
- shrq $2, %rax
+# ifdef USE_AS_WCSLEN
+ shrl $2, %eax
+# endif
# endif
+L(return_vzeroupper):
+ ZERO_UPPER_VEC_REGISTERS_RETURN
+
+# ifdef USE_AS_STRNLEN
+ .p2align 4
+L(cross_page_less_vec):
+ tzcntl %eax, %eax
+ cmpq %rax, %rsi
+ cmovb %esi, %eax
+# ifdef USE_AS_WCSLEN
+ shrl $2, %eax
+# endif
VZEROUPPER_RETURN
+# endif
END (STRLEN)
#endif
--
GitLab

View File

@ -1,701 +0,0 @@
From 2a76821c3081d2c0231ecd2618f52662cb48fccd Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon, 3 May 2021 03:03:19 -0400
Subject: [PATCH] x86: Optimize memchr-evex.S
Content-type: text/plain; charset=UTF-8
No bug. This commit optimizes memchr-evex.S. The optimizations include
replacing some branches with cmovcc, avoiding some branches entirely
in the less_4x_vec case, making the page cross logic less strict,
saving some ALU in the alignment process, and most importantly
increasing ILP in the 4x loop. test-memchr, test-rawmemchr, and
test-wmemchr are all passing.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/multiarch/memchr-evex.S | 547 +++++++++++++++----------
1 file changed, 322 insertions(+), 225 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
index 6dd5d67b..81d5cd64 100644
--- a/sysdeps/x86_64/multiarch/memchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
@@ -26,14 +26,28 @@
# ifdef USE_AS_WMEMCHR
# define VPBROADCAST vpbroadcastd
-# define VPCMP vpcmpd
-# define SHIFT_REG r8d
+# define VPMINU vpminud
+# define VPCMP vpcmpd
+# define VPCMPEQ vpcmpeqd
+# define CHAR_SIZE 4
# else
# define VPBROADCAST vpbroadcastb
-# define VPCMP vpcmpb
-# define SHIFT_REG ecx
+# define VPMINU vpminub
+# define VPCMP vpcmpb
+# define VPCMPEQ vpcmpeqb
+# define CHAR_SIZE 1
# endif
+# ifdef USE_AS_RAWMEMCHR
+# define RAW_PTR_REG rcx
+# define ALGN_PTR_REG rdi
+# else
+# define RAW_PTR_REG rdi
+# define ALGN_PTR_REG rcx
+# endif
+
+# define XMMZERO xmm23
+# define YMMZERO ymm23
# define XMMMATCH xmm16
# define YMMMATCH ymm16
# define YMM1 ymm17
@@ -44,6 +58,8 @@
# define YMM6 ymm22
# define VEC_SIZE 32
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+# define PAGE_SIZE 4096
.section .text.evex,"ax",@progbits
ENTRY (MEMCHR)
@@ -51,11 +67,7 @@ ENTRY (MEMCHR)
/* Check for zero length. */
test %RDX_LP, %RDX_LP
jz L(zero)
-# endif
- movl %edi, %ecx
-# ifdef USE_AS_WMEMCHR
- shl $2, %RDX_LP
-# else
+
# ifdef __ILP32__
/* Clear the upper 32 bits. */
movl %edx, %edx
@@ -64,318 +76,403 @@ ENTRY (MEMCHR)
/* Broadcast CHAR to YMMMATCH. */
VPBROADCAST %esi, %YMMMATCH
/* Check if we may cross page boundary with one vector load. */
- andl $(2 * VEC_SIZE - 1), %ecx
- cmpl $VEC_SIZE, %ecx
- ja L(cros_page_boundary)
+ movl %edi, %eax
+ andl $(PAGE_SIZE - 1), %eax
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ ja L(cross_page_boundary)
/* Check the first VEC_SIZE bytes. */
- VPCMP $0, (%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- testl %eax, %eax
-
+ VPCMP $0, (%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
# ifndef USE_AS_RAWMEMCHR
- jnz L(first_vec_x0_check)
- /* Adjust length and check the end of data. */
- subq $VEC_SIZE, %rdx
- jbe L(zero)
+ /* If length < CHAR_PER_VEC handle special. */
+ cmpq $CHAR_PER_VEC, %rdx
+ jbe L(first_vec_x0)
+# endif
+ testl %eax, %eax
+ jz L(aligned_more)
+ tzcntl %eax, %eax
+# ifdef USE_AS_WMEMCHR
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
# else
- jnz L(first_vec_x0)
+ addq %rdi, %rax
# endif
-
- /* Align data for aligned loads in the loop. */
- addq $VEC_SIZE, %rdi
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
+ ret
# ifndef USE_AS_RAWMEMCHR
- /* Adjust length. */
- addq %rcx, %rdx
-
- subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
-# endif
- jmp L(more_4x_vec)
+L(zero):
+ xorl %eax, %eax
+ ret
+ .p2align 5
+L(first_vec_x0):
+ /* Check if first match was before length. */
+ tzcntl %eax, %eax
+ xorl %ecx, %ecx
+ cmpl %eax, %edx
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
+ cmovle %rcx, %rax
+ ret
+# else
+ /* NB: first_vec_x0 is 17 bytes which will leave
+ cross_page_boundary (which is relatively cold) close enough
+ to ideal alignment. So only realign L(cross_page_boundary) if
+ rawmemchr. */
.p2align 4
-L(cros_page_boundary):
- andl $(VEC_SIZE - 1), %ecx
+# endif
+L(cross_page_boundary):
+ /* Save pointer before aligning as its original value is
+ necessary for computer return address if byte is found or
+ adjusting length if it is not and this is memchr. */
+ movq %rdi, %rcx
+ /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi
+ for rawmemchr. */
+ andq $-VEC_SIZE, %ALGN_PTR_REG
+ VPCMP $0, (%ALGN_PTR_REG), %YMMMATCH, %k0
+ kmovd %k0, %r8d
# ifdef USE_AS_WMEMCHR
- /* NB: Divide shift count by 4 since each bit in K1 represent 4
+ /* NB: Divide shift count by 4 since each bit in K0 represent 4
bytes. */
- movl %ecx, %SHIFT_REG
- sarl $2, %SHIFT_REG
+ sarl $2, %eax
+# endif
+# ifndef USE_AS_RAWMEMCHR
+ movl $(PAGE_SIZE / CHAR_SIZE), %esi
+ subl %eax, %esi
# endif
- andq $-VEC_SIZE, %rdi
- VPCMP $0, (%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- /* Remove the leading bytes. */
- sarxl %SHIFT_REG, %eax, %eax
- testl %eax, %eax
- jz L(aligned_more)
- tzcntl %eax, %eax
# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
+ andl $(CHAR_PER_VEC - 1), %eax
# endif
+ /* Remove the leading bytes. */
+ sarxl %eax, %r8d, %eax
# ifndef USE_AS_RAWMEMCHR
/* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
+ cmpq %rsi, %rdx
+ jbe L(first_vec_x0)
+# endif
+ testl %eax, %eax
+ jz L(cross_page_continue)
+ tzcntl %eax, %eax
+# ifdef USE_AS_WMEMCHR
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
+ leaq (%RAW_PTR_REG, %rax, CHAR_SIZE), %rax
+# else
+ addq %RAW_PTR_REG, %rax
# endif
- addq %rdi, %rax
- addq %rcx, %rax
ret
.p2align 4
-L(aligned_more):
-# ifndef USE_AS_RAWMEMCHR
- /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
- instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
- overflow. */
- negq %rcx
- addq $VEC_SIZE, %rcx
+L(first_vec_x1):
+ tzcntl %eax, %eax
+ leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+ ret
- /* Check the end of data. */
- subq %rcx, %rdx
- jbe L(zero)
-# endif
+ .p2align 4
+L(first_vec_x2):
+ tzcntl %eax, %eax
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
- addq $VEC_SIZE, %rdi
+ .p2align 4
+L(first_vec_x3):
+ tzcntl %eax, %eax
+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
-# ifndef USE_AS_RAWMEMCHR
- subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
-# endif
+ .p2align 4
+L(first_vec_x4):
+ tzcntl %eax, %eax
+ leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
-L(more_4x_vec):
+ .p2align 5
+L(aligned_more):
/* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
since data is only aligned to VEC_SIZE. */
- VPCMP $0, (%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
- VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
+# ifndef USE_AS_RAWMEMCHR
+ /* Align data to VEC_SIZE. */
+L(cross_page_continue):
+ xorl %ecx, %ecx
+ subl %edi, %ecx
+ andq $-VEC_SIZE, %rdi
+ /* esi is for adjusting length to see if near the end. */
+ leal (VEC_SIZE * 5)(%rdi, %rcx), %esi
+# ifdef USE_AS_WMEMCHR
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %esi
+# endif
+# else
+ andq $-VEC_SIZE, %rdi
+L(cross_page_continue):
+# endif
+ /* Load first VEC regardless. */
+ VPCMP $0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+# ifndef USE_AS_RAWMEMCHR
+ /* Adjust length. If near end handle specially. */
+ subq %rsi, %rdx
+ jbe L(last_4x_vec_or_less)
+# endif
testl %eax, %eax
jnz L(first_vec_x1)
- VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
+ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
testl %eax, %eax
jnz L(first_vec_x2)
- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
+ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
testl %eax, %eax
jnz L(first_vec_x3)
- addq $(VEC_SIZE * 4), %rdi
+ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x4)
+
# ifndef USE_AS_RAWMEMCHR
- subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
-# endif
+ /* Check if at last CHAR_PER_VEC * 4 length. */
+ subq $(CHAR_PER_VEC * 4), %rdx
+ jbe L(last_4x_vec_or_less_cmpeq)
+ addq $VEC_SIZE, %rdi
- /* Align data to 4 * VEC_SIZE. */
- movq %rdi, %rcx
- andl $(4 * VEC_SIZE - 1), %ecx
+ /* Align data to VEC_SIZE * 4 for the loop and readjust length.
+ */
+# ifdef USE_AS_WMEMCHR
+ movl %edi, %ecx
andq $-(4 * VEC_SIZE), %rdi
-
-# ifndef USE_AS_RAWMEMCHR
- /* Adjust length. */
+ andl $(VEC_SIZE * 4 - 1), %ecx
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %ecx
addq %rcx, %rdx
+# else
+ addq %rdi, %rdx
+ andq $-(4 * VEC_SIZE), %rdi
+ subq %rdi, %rdx
+# endif
+# else
+ addq $VEC_SIZE, %rdi
+ andq $-(4 * VEC_SIZE), %rdi
# endif
+ vpxorq %XMMZERO, %XMMZERO, %XMMZERO
+
+ /* Compare 4 * VEC at a time forward. */
.p2align 4
L(loop_4x_vec):
- /* Compare 4 * VEC at a time forward. */
- VPCMP $0, (%rdi), %YMMMATCH, %k1
- VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k2
- kord %k1, %k2, %k5
- VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
-
- kord %k3, %k4, %k6
- kortestd %k5, %k6
- jnz L(4x_vec_end)
-
- addq $(VEC_SIZE * 4), %rdi
-
+ /* It would be possible to save some instructions using 4x VPCMP
+ but bottleneck on port 5 makes it not woth it. */
+ VPCMP $4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
+ /* xor will set bytes match esi to zero. */
+ vpxorq (VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
+ vpxorq (VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
+ VPCMP $0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
+ /* Reduce VEC2 / VEC3 with min and VEC1 with zero mask. */
+ VPMINU %YMM2, %YMM3, %YMM3 {%k1} {z}
+ VPCMP $0, %YMM3, %YMMZERO, %k2
# ifdef USE_AS_RAWMEMCHR
- jmp L(loop_4x_vec)
+ subq $-(VEC_SIZE * 4), %rdi
+ kortestd %k2, %k3
+ jz L(loop_4x_vec)
# else
- subq $(VEC_SIZE * 4), %rdx
+ kortestd %k2, %k3
+ jnz L(loop_4x_vec_end)
+
+ subq $-(VEC_SIZE * 4), %rdi
+
+ subq $(CHAR_PER_VEC * 4), %rdx
ja L(loop_4x_vec)
+ /* Fall through into less than 4 remaining vectors of length case.
+ */
+ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ addq $(VEC_SIZE * 3), %rdi
+ .p2align 4
L(last_4x_vec_or_less):
- /* Less than 4 * VEC and aligned to VEC_SIZE. */
- addl $(VEC_SIZE * 2), %edx
- jle L(last_2x_vec)
-
- VPCMP $0, (%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
+ /* Check if first VEC contained match. */
testl %eax, %eax
- jnz L(first_vec_x0)
+ jnz L(first_vec_x1_check)
- VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- testl %eax, %eax
- jnz L(first_vec_x1)
+ /* If remaining length > CHAR_PER_VEC * 2. */
+ addl $(CHAR_PER_VEC * 2), %edx
+ jg L(last_4x_vec)
- VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- testl %eax, %eax
+L(last_2x_vec):
+ /* If remaining length < CHAR_PER_VEC. */
+ addl $CHAR_PER_VEC, %edx
+ jle L(zero_end)
- jnz L(first_vec_x2_check)
- subl $VEC_SIZE, %edx
- jle L(zero)
+ /* Check VEC2 and compare any match with remaining length. */
+ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ tzcntl %eax, %eax
+ cmpl %eax, %edx
+ jbe L(set_zero_end)
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+L(zero_end):
+ ret
- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- testl %eax, %eax
- jnz L(first_vec_x3_check)
+ .p2align 4
+L(first_vec_x1_check):
+ tzcntl %eax, %eax
+ /* Adjust length. */
+ subl $-(CHAR_PER_VEC * 4), %edx
+ /* Check if match within remaining length. */
+ cmpl %eax, %edx
+ jbe L(set_zero_end)
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
+ leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+ ret
+L(set_zero_end):
xorl %eax, %eax
ret
.p2align 4
-L(last_2x_vec):
- addl $(VEC_SIZE * 2), %edx
- VPCMP $0, (%rdi), %YMMMATCH, %k1
+L(loop_4x_vec_end):
+# endif
+ /* rawmemchr will fall through into this if match was found in
+ loop. */
+
+ /* k1 has not of matches with VEC1. */
kmovd %k1, %eax
- testl %eax, %eax
+# ifdef USE_AS_WMEMCHR
+ subl $((1 << CHAR_PER_VEC) - 1), %eax
+# else
+ incl %eax
+# endif
+ jnz L(last_vec_x1_return)
- jnz L(first_vec_x0_check)
- subl $VEC_SIZE, %edx
- jle L(zero)
+ VPCMP $0, %YMM2, %YMMZERO, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x2_return)
- VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
+ kmovd %k2, %eax
testl %eax, %eax
- jnz L(first_vec_x1_check)
- xorl %eax, %eax
- ret
+ jnz L(last_vec_x3_return)
- .p2align 4
-L(first_vec_x0_check):
+ kmovd %k3, %eax
tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
+# ifdef USE_AS_RAWMEMCHR
+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+# else
+ leaq (VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax
# endif
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
- addq %rdi, %rax
ret
.p2align 4
-L(first_vec_x1_check):
+L(last_vec_x1_return):
tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
-# endif
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
- addq $VEC_SIZE, %rax
+# ifdef USE_AS_RAWMEMCHR
+# ifdef USE_AS_WMEMCHR
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
+# else
addq %rdi, %rax
- ret
-
- .p2align 4
-L(first_vec_x2_check):
- tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
+# endif
+# else
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
+ leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
# endif
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
- addq $(VEC_SIZE * 2), %rax
- addq %rdi, %rax
ret
.p2align 4
-L(first_vec_x3_check):
+L(last_vec_x2_return):
tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
+# ifdef USE_AS_RAWMEMCHR
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
+ leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+# else
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
+ leaq (VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax
# endif
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
- addq $(VEC_SIZE * 3), %rax
- addq %rdi, %rax
ret
.p2align 4
-L(zero):
- xorl %eax, %eax
- ret
-# endif
-
- .p2align 4
-L(first_vec_x0):
+L(last_vec_x3_return):
tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq (%rdi, %rax, 4), %rax
+# ifdef USE_AS_RAWMEMCHR
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
# else
- addq %rdi, %rax
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
+ leaq (VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax
# endif
ret
+
+# ifndef USE_AS_RAWMEMCHR
+L(last_4x_vec_or_less_cmpeq):
+ VPCMP $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ subq $-(VEC_SIZE * 4), %rdi
+ /* Check first VEC regardless. */
+ testl %eax, %eax
+ jnz L(first_vec_x1_check)
+
+ /* If remaining length <= CHAR_PER_VEC * 2. */
+ addl $(CHAR_PER_VEC * 2), %edx
+ jle L(last_2x_vec)
+
.p2align 4
-L(first_vec_x1):
+L(last_4x_vec):
+ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x2)
+
+
+ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ /* Create mask for possible matches within remaining length. */
+# ifdef USE_AS_WMEMCHR
+ movl $((1 << (CHAR_PER_VEC * 2)) - 1), %ecx
+ bzhil %edx, %ecx, %ecx
+# else
+ movq $-1, %rcx
+ bzhiq %rdx, %rcx, %rcx
+# endif
+ /* Test matches in data against length match. */
+ andl %ecx, %eax
+ jnz L(last_vec_x3)
+
+ /* if remaining length <= CHAR_PER_VEC * 3 (Note this is after
+ remaining length was found to be > CHAR_PER_VEC * 2. */
+ subl $CHAR_PER_VEC, %edx
+ jbe L(zero_end2)
+
+
+ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ /* Shift remaining length mask for last VEC. */
+# ifdef USE_AS_WMEMCHR
+ shrl $CHAR_PER_VEC, %ecx
+# else
+ shrq $CHAR_PER_VEC, %rcx
+# endif
+ andl %ecx, %eax
+ jz L(zero_end2)
tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq VEC_SIZE(%rdi, %rax, 4), %rax
-# else
- addq $VEC_SIZE, %rax
- addq %rdi, %rax
-# endif
+ leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+L(zero_end2):
ret
- .p2align 4
-L(first_vec_x2):
+L(last_vec_x2):
tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax
-# else
- addq $(VEC_SIZE * 2), %rax
- addq %rdi, %rax
-# endif
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
ret
.p2align 4
-L(4x_vec_end):
- kmovd %k1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
- kmovd %k2, %eax
- testl %eax, %eax
- jnz L(first_vec_x1)
- kmovd %k3, %eax
- testl %eax, %eax
- jnz L(first_vec_x2)
- kmovd %k4, %eax
- testl %eax, %eax
-L(first_vec_x3):
+L(last_vec_x3):
tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq (VEC_SIZE * 3)(%rdi, %rax, 4), %rax
-# else
- addq $(VEC_SIZE * 3), %rax
- addq %rdi, %rax
-# endif
+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
ret
+# endif
END (MEMCHR)
#endif
--
GitLab

View File

@ -1,30 +0,0 @@
From 6ea916adfa0ab9af6e7dc6adcf6f977dfe017835 Mon Sep 17 00:00:00 2001
From: Alice Xu <alice.d.xu@gmail.com>
Date: Fri, 7 May 2021 19:03:21 -0700
Subject: [PATCH] x86-64: Fix an unknown vector operation in memchr-evex.S
Content-type: text/plain; charset=UTF-8
An unknown vector operation occurred in commit 2a76821c308. Fixed it
by using "ymm{k1}{z}" but not "ymm {k1} {z}".
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/multiarch/memchr-evex.S | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
index 81d5cd64..f3fdad4f 100644
--- a/sysdeps/x86_64/multiarch/memchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
@@ -271,7 +271,7 @@ L(loop_4x_vec):
vpxorq (VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
VPCMP $0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
/* Reduce VEC2 / VEC3 with min and VEC1 with zero mask. */
- VPMINU %YMM2, %YMM3, %YMM3 {%k1} {z}
+ VPMINU %YMM2, %YMM3, %YMM3{%k1}{z}
VPCMP $0, %YMM3, %YMMZERO, %k2
# ifdef USE_AS_RAWMEMCHR
subq $-(VEC_SIZE * 4), %rdi
--
GitLab

View File

@ -1,566 +0,0 @@
From a0db678071c60b6c47c468d231dd0b3694ba7a98 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Tue, 22 Jun 2021 20:42:10 -0700
Subject: [PATCH] x86-64: Move strlen.S to multiarch/strlen-vec.S
Content-type: text/plain; charset=UTF-8
Since strlen.S contains SSE2 version of strlen/strnlen and SSE4.1
version of wcslen/wcsnlen, move strlen.S to multiarch/strlen-vec.S
and include multiarch/strlen-vec.S from SSE2 and SSE4.1 variants.
This also removes the unused symbols, __GI___strlen_sse2 and
__GI___wcsnlen_sse4_1.
---
sysdeps/x86_64/multiarch/strlen-sse2.S | 2 +-
sysdeps/x86_64/multiarch/strlen-vec.S | 257 ++++++++++++++++++++++
sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S | 2 +-
sysdeps/x86_64/strlen.S | 243 +-------------------
4 files changed, 262 insertions(+), 242 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/strlen-vec.S
Conflicts:
sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
(Copyright dates, URL)
diff --git a/sysdeps/x86_64/multiarch/strlen-sse2.S b/sysdeps/x86_64/multiarch/strlen-sse2.S
index 7bc57b8d..449c8a7f 100644
--- a/sysdeps/x86_64/multiarch/strlen-sse2.S
+++ b/sysdeps/x86_64/multiarch/strlen-sse2.S
@@ -20,4 +20,4 @@
# define strlen __strlen_sse2
#endif
-#include "../strlen.S"
+#include "strlen-vec.S"
diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
new file mode 100644
index 00000000..8f660bb9
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen-vec.S
@@ -0,0 +1,257 @@
+/* SSE2 version of strlen and SSE4.1 version of wcslen.
+ Copyright (C) 2012-2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#ifdef AS_WCSLEN
+# define PMINU pminud
+# define PCMPEQ pcmpeqd
+# define SHIFT_RETURN shrq $2, %rax
+#else
+# define PMINU pminub
+# define PCMPEQ pcmpeqb
+# define SHIFT_RETURN
+#endif
+
+/* Long lived register in strlen(s), strnlen(s, n) are:
+
+ %xmm3 - zero
+ %rdi - s
+ %r10 (s+n) & (~(64-1))
+ %r11 s+n
+*/
+
+
+.text
+ENTRY(strlen)
+
+/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */
+#define FIND_ZERO \
+ PCMPEQ (%rax), %xmm0; \
+ PCMPEQ 16(%rax), %xmm1; \
+ PCMPEQ 32(%rax), %xmm2; \
+ PCMPEQ 48(%rax), %xmm3; \
+ pmovmskb %xmm0, %esi; \
+ pmovmskb %xmm1, %edx; \
+ pmovmskb %xmm2, %r8d; \
+ pmovmskb %xmm3, %ecx; \
+ salq $16, %rdx; \
+ salq $16, %rcx; \
+ orq %rsi, %rdx; \
+ orq %r8, %rcx; \
+ salq $32, %rcx; \
+ orq %rcx, %rdx;
+
+#ifdef AS_STRNLEN
+/* Do not read anything when n==0. */
+ test %RSI_LP, %RSI_LP
+ jne L(n_nonzero)
+ xor %rax, %rax
+ ret
+L(n_nonzero):
+# ifdef AS_WCSLEN
+ shl $2, %RSI_LP
+# endif
+
+/* Initialize long lived registers. */
+
+ add %RDI_LP, %RSI_LP
+ mov %RSI_LP, %R10_LP
+ and $-64, %R10_LP
+ mov %RSI_LP, %R11_LP
+#endif
+
+ pxor %xmm0, %xmm0
+ pxor %xmm1, %xmm1
+ pxor %xmm2, %xmm2
+ pxor %xmm3, %xmm3
+ movq %rdi, %rax
+ movq %rdi, %rcx
+ andq $4095, %rcx
+/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */
+ cmpq $4047, %rcx
+/* We cannot unify this branching as it would be ~6 cycles slower. */
+ ja L(cross_page)
+
+#ifdef AS_STRNLEN
+/* Test if end is among first 64 bytes. */
+# define STRNLEN_PROLOG \
+ mov %r11, %rsi; \
+ subq %rax, %rsi; \
+ andq $-64, %rax; \
+ testq $-64, %rsi; \
+ je L(strnlen_ret)
+#else
+# define STRNLEN_PROLOG andq $-64, %rax;
+#endif
+
+/* Ignore bits in mask that come before start of string. */
+#define PROLOG(lab) \
+ movq %rdi, %rcx; \
+ xorq %rax, %rcx; \
+ STRNLEN_PROLOG; \
+ sarq %cl, %rdx; \
+ test %rdx, %rdx; \
+ je L(lab); \
+ bsfq %rdx, %rax; \
+ SHIFT_RETURN; \
+ ret
+
+#ifdef AS_STRNLEN
+ andq $-16, %rax
+ FIND_ZERO
+#else
+ /* Test first 16 bytes unaligned. */
+ movdqu (%rax), %xmm4
+ PCMPEQ %xmm0, %xmm4
+ pmovmskb %xmm4, %edx
+ test %edx, %edx
+ je L(next48_bytes)
+ bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */
+ SHIFT_RETURN
+ ret
+
+L(next48_bytes):
+/* Same as FIND_ZERO except we do not check first 16 bytes. */
+ andq $-16, %rax
+ PCMPEQ 16(%rax), %xmm1
+ PCMPEQ 32(%rax), %xmm2
+ PCMPEQ 48(%rax), %xmm3
+ pmovmskb %xmm1, %edx
+ pmovmskb %xmm2, %r8d
+ pmovmskb %xmm3, %ecx
+ salq $16, %rdx
+ salq $16, %rcx
+ orq %r8, %rcx
+ salq $32, %rcx
+ orq %rcx, %rdx
+#endif
+
+ /* When no zero byte is found xmm1-3 are zero so we do not have to
+ zero them. */
+ PROLOG(loop)
+
+ .p2align 4
+L(cross_page):
+ andq $-64, %rax
+ FIND_ZERO
+ PROLOG(loop_init)
+
+#ifdef AS_STRNLEN
+/* We must do this check to correctly handle strnlen (s, -1). */
+L(strnlen_ret):
+ bts %rsi, %rdx
+ sarq %cl, %rdx
+ test %rdx, %rdx
+ je L(loop_init)
+ bsfq %rdx, %rax
+ SHIFT_RETURN
+ ret
+#endif
+ .p2align 4
+L(loop_init):
+ pxor %xmm1, %xmm1
+ pxor %xmm2, %xmm2
+ pxor %xmm3, %xmm3
+#ifdef AS_STRNLEN
+ .p2align 4
+L(loop):
+
+ addq $64, %rax
+ cmpq %rax, %r10
+ je L(exit_end)
+
+ movdqa (%rax), %xmm0
+ PMINU 16(%rax), %xmm0
+ PMINU 32(%rax), %xmm0
+ PMINU 48(%rax), %xmm0
+ PCMPEQ %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ testl %edx, %edx
+ jne L(exit)
+ jmp L(loop)
+
+ .p2align 4
+L(exit_end):
+ cmp %rax, %r11
+ je L(first) /* Do not read when end is at page boundary. */
+ pxor %xmm0, %xmm0
+ FIND_ZERO
+
+L(first):
+ bts %r11, %rdx
+ bsfq %rdx, %rdx
+ addq %rdx, %rax
+ subq %rdi, %rax
+ SHIFT_RETURN
+ ret
+
+ .p2align 4
+L(exit):
+ pxor %xmm0, %xmm0
+ FIND_ZERO
+
+ bsfq %rdx, %rdx
+ addq %rdx, %rax
+ subq %rdi, %rax
+ SHIFT_RETURN
+ ret
+
+#else
+
+ /* Main loop. Unrolled twice to improve L2 cache performance on core2. */
+ .p2align 4
+L(loop):
+
+ movdqa 64(%rax), %xmm0
+ PMINU 80(%rax), %xmm0
+ PMINU 96(%rax), %xmm0
+ PMINU 112(%rax), %xmm0
+ PCMPEQ %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ testl %edx, %edx
+ jne L(exit64)
+
+ subq $-128, %rax
+
+ movdqa (%rax), %xmm0
+ PMINU 16(%rax), %xmm0
+ PMINU 32(%rax), %xmm0
+ PMINU 48(%rax), %xmm0
+ PCMPEQ %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ testl %edx, %edx
+ jne L(exit0)
+ jmp L(loop)
+
+ .p2align 4
+L(exit64):
+ addq $64, %rax
+L(exit0):
+ pxor %xmm0, %xmm0
+ FIND_ZERO
+
+ bsfq %rdx, %rdx
+ addq %rdx, %rax
+ subq %rdi, %rax
+ SHIFT_RETURN
+ ret
+
+#endif
+
+END(strlen)
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
index a8cab0cb..5fa51fe0 100644
--- a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
+++ b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
@@ -2,4 +2,4 @@
#define AS_STRNLEN
#define strlen __wcsnlen_sse4_1
-#include "../strlen.S"
+#include "strlen-vec.S"
diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
index f845f3d4..ad047d84 100644
--- a/sysdeps/x86_64/strlen.S
+++ b/sysdeps/x86_64/strlen.S
@@ -1,5 +1,5 @@
-/* SSE2 version of strlen/wcslen.
- Copyright (C) 2012-2018 Free Software Foundation, Inc.
+/* SSE2 version of strlen.
+ Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -16,243 +16,6 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#include <sysdep.h>
+#include "multiarch/strlen-vec.S"
-#ifdef AS_WCSLEN
-# define PMINU pminud
-# define PCMPEQ pcmpeqd
-# define SHIFT_RETURN shrq $2, %rax
-#else
-# define PMINU pminub
-# define PCMPEQ pcmpeqb
-# define SHIFT_RETURN
-#endif
-
-/* Long lived register in strlen(s), strnlen(s, n) are:
-
- %xmm3 - zero
- %rdi - s
- %r10 (s+n) & (~(64-1))
- %r11 s+n
-*/
-
-
-.text
-ENTRY(strlen)
-
-/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */
-#define FIND_ZERO \
- PCMPEQ (%rax), %xmm0; \
- PCMPEQ 16(%rax), %xmm1; \
- PCMPEQ 32(%rax), %xmm2; \
- PCMPEQ 48(%rax), %xmm3; \
- pmovmskb %xmm0, %esi; \
- pmovmskb %xmm1, %edx; \
- pmovmskb %xmm2, %r8d; \
- pmovmskb %xmm3, %ecx; \
- salq $16, %rdx; \
- salq $16, %rcx; \
- orq %rsi, %rdx; \
- orq %r8, %rcx; \
- salq $32, %rcx; \
- orq %rcx, %rdx;
-
-#ifdef AS_STRNLEN
-/* Do not read anything when n==0. */
- test %RSI_LP, %RSI_LP
- jne L(n_nonzero)
- xor %rax, %rax
- ret
-L(n_nonzero):
-# ifdef AS_WCSLEN
- shl $2, %RSI_LP
-# endif
-
-/* Initialize long lived registers. */
-
- add %RDI_LP, %RSI_LP
- mov %RSI_LP, %R10_LP
- and $-64, %R10_LP
- mov %RSI_LP, %R11_LP
-#endif
-
- pxor %xmm0, %xmm0
- pxor %xmm1, %xmm1
- pxor %xmm2, %xmm2
- pxor %xmm3, %xmm3
- movq %rdi, %rax
- movq %rdi, %rcx
- andq $4095, %rcx
-/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */
- cmpq $4047, %rcx
-/* We cannot unify this branching as it would be ~6 cycles slower. */
- ja L(cross_page)
-
-#ifdef AS_STRNLEN
-/* Test if end is among first 64 bytes. */
-# define STRNLEN_PROLOG \
- mov %r11, %rsi; \
- subq %rax, %rsi; \
- andq $-64, %rax; \
- testq $-64, %rsi; \
- je L(strnlen_ret)
-#else
-# define STRNLEN_PROLOG andq $-64, %rax;
-#endif
-
-/* Ignore bits in mask that come before start of string. */
-#define PROLOG(lab) \
- movq %rdi, %rcx; \
- xorq %rax, %rcx; \
- STRNLEN_PROLOG; \
- sarq %cl, %rdx; \
- test %rdx, %rdx; \
- je L(lab); \
- bsfq %rdx, %rax; \
- SHIFT_RETURN; \
- ret
-
-#ifdef AS_STRNLEN
- andq $-16, %rax
- FIND_ZERO
-#else
- /* Test first 16 bytes unaligned. */
- movdqu (%rax), %xmm4
- PCMPEQ %xmm0, %xmm4
- pmovmskb %xmm4, %edx
- test %edx, %edx
- je L(next48_bytes)
- bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */
- SHIFT_RETURN
- ret
-
-L(next48_bytes):
-/* Same as FIND_ZERO except we do not check first 16 bytes. */
- andq $-16, %rax
- PCMPEQ 16(%rax), %xmm1
- PCMPEQ 32(%rax), %xmm2
- PCMPEQ 48(%rax), %xmm3
- pmovmskb %xmm1, %edx
- pmovmskb %xmm2, %r8d
- pmovmskb %xmm3, %ecx
- salq $16, %rdx
- salq $16, %rcx
- orq %r8, %rcx
- salq $32, %rcx
- orq %rcx, %rdx
-#endif
-
- /* When no zero byte is found xmm1-3 are zero so we do not have to
- zero them. */
- PROLOG(loop)
-
- .p2align 4
-L(cross_page):
- andq $-64, %rax
- FIND_ZERO
- PROLOG(loop_init)
-
-#ifdef AS_STRNLEN
-/* We must do this check to correctly handle strnlen (s, -1). */
-L(strnlen_ret):
- bts %rsi, %rdx
- sarq %cl, %rdx
- test %rdx, %rdx
- je L(loop_init)
- bsfq %rdx, %rax
- SHIFT_RETURN
- ret
-#endif
- .p2align 4
-L(loop_init):
- pxor %xmm1, %xmm1
- pxor %xmm2, %xmm2
- pxor %xmm3, %xmm3
-#ifdef AS_STRNLEN
- .p2align 4
-L(loop):
-
- addq $64, %rax
- cmpq %rax, %r10
- je L(exit_end)
-
- movdqa (%rax), %xmm0
- PMINU 16(%rax), %xmm0
- PMINU 32(%rax), %xmm0
- PMINU 48(%rax), %xmm0
- PCMPEQ %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- testl %edx, %edx
- jne L(exit)
- jmp L(loop)
-
- .p2align 4
-L(exit_end):
- cmp %rax, %r11
- je L(first) /* Do not read when end is at page boundary. */
- pxor %xmm0, %xmm0
- FIND_ZERO
-
-L(first):
- bts %r11, %rdx
- bsfq %rdx, %rdx
- addq %rdx, %rax
- subq %rdi, %rax
- SHIFT_RETURN
- ret
-
- .p2align 4
-L(exit):
- pxor %xmm0, %xmm0
- FIND_ZERO
-
- bsfq %rdx, %rdx
- addq %rdx, %rax
- subq %rdi, %rax
- SHIFT_RETURN
- ret
-
-#else
-
- /* Main loop. Unrolled twice to improve L2 cache performance on core2. */
- .p2align 4
-L(loop):
-
- movdqa 64(%rax), %xmm0
- PMINU 80(%rax), %xmm0
- PMINU 96(%rax), %xmm0
- PMINU 112(%rax), %xmm0
- PCMPEQ %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- testl %edx, %edx
- jne L(exit64)
-
- subq $-128, %rax
-
- movdqa (%rax), %xmm0
- PMINU 16(%rax), %xmm0
- PMINU 32(%rax), %xmm0
- PMINU 48(%rax), %xmm0
- PCMPEQ %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- testl %edx, %edx
- jne L(exit0)
- jmp L(loop)
-
- .p2align 4
-L(exit64):
- addq $64, %rax
-L(exit0):
- pxor %xmm0, %xmm0
- FIND_ZERO
-
- bsfq %rdx, %rdx
- addq %rdx, %rax
- subq %rdi, %rax
- SHIFT_RETURN
- ret
-
-#endif
-
-END(strlen)
libc_hidden_builtin_def (strlen)
--
GitLab

View File

@ -1,181 +0,0 @@
From 6f573a27b6c8b4236445810a44660612323f5a73 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Wed, 23 Jun 2021 01:19:34 -0400
Subject: [PATCH] x86-64: Add wcslen optimize for sse4.1
Content-type: text/plain; charset=UTF-8
No bug. This comment adds the ifunc / build infrastructure
necessary for wcslen to prefer the sse4.1 implementation
in strlen-vec.S. test-wcslen.c is passing.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/multiarch/Makefile | 4 +-
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 3 ++
sysdeps/x86_64/multiarch/ifunc-wcslen.h | 52 ++++++++++++++++++++++
sysdeps/x86_64/multiarch/wcslen-sse4_1.S | 4 ++
sysdeps/x86_64/multiarch/wcslen.c | 2 +-
sysdeps/x86_64/multiarch/wcsnlen.c | 34 +-------------
6 files changed, 63 insertions(+), 36 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/ifunc-wcslen.h
create mode 100644 sysdeps/x86_64/multiarch/wcslen-sse4_1.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 491c7698..65fde4eb 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -93,8 +93,8 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
wcscpy-ssse3 wcscpy-c \
wcschr-sse2 wcschr-avx2 \
wcsrchr-sse2 wcsrchr-avx2 \
- wcsnlen-sse4_1 wcsnlen-c \
- wcslen-sse2 wcslen-avx2 wcsnlen-avx2 \
+ wcslen-sse2 wcslen-sse4_1 wcslen-avx2 \
+ wcsnlen-c wcsnlen-sse4_1 wcsnlen-avx2 \
wcschr-avx2-rtm \
wcscmp-avx2-rtm \
wcslen-avx2-rtm \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index f1a6460a..580913ca 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -657,6 +657,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__wcslen_evex)
+ IFUNC_IMPL_ADD (array, i, wcsnlen,
+ CPU_FEATURE_USABLE (SSE4_1),
+ __wcsnlen_sse4_1)
IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_sse2))
/* Support sysdeps/x86_64/multiarch/wcsnlen.c. */
diff --git a/sysdeps/x86_64/multiarch/ifunc-wcslen.h b/sysdeps/x86_64/multiarch/ifunc-wcslen.h
new file mode 100644
index 00000000..39e33473
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/ifunc-wcslen.h
@@ -0,0 +1,52 @@
+/* Common definition for ifunc selections for wcslen and wcsnlen
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017-2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <init-arch.h>
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+ const struct cpu_features* cpu_features = __get_cpu_features ();
+
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
+ && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+ {
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+ return OPTIMIZE (evex);
+
+ if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+ return OPTIMIZE (avx2_rtm);
+
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ return OPTIMIZE (avx2);
+ }
+
+ if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
+ return OPTIMIZE (sse4_1);
+
+ return OPTIMIZE (sse2);
+}
diff --git a/sysdeps/x86_64/multiarch/wcslen-sse4_1.S b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
new file mode 100644
index 00000000..7e62621a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
@@ -0,0 +1,4 @@
+#define AS_WCSLEN
+#define strlen __wcslen_sse4_1
+
+#include "strlen-vec.S"
diff --git a/sysdeps/x86_64/multiarch/wcslen.c b/sysdeps/x86_64/multiarch/wcslen.c
index 6d06e47c..3b04b75b 100644
--- a/sysdeps/x86_64/multiarch/wcslen.c
+++ b/sysdeps/x86_64/multiarch/wcslen.c
@@ -24,7 +24,7 @@
# undef __wcslen
# define SYMBOL_NAME wcslen
-# include "ifunc-avx2.h"
+# include "ifunc-wcslen.h"
libc_ifunc_redirected (__redirect_wcslen, __wcslen, IFUNC_SELECTOR ());
weak_alias (__wcslen, wcslen);
diff --git a/sysdeps/x86_64/multiarch/wcsnlen.c b/sysdeps/x86_64/multiarch/wcsnlen.c
index 20b731ae..06736410 100644
--- a/sysdeps/x86_64/multiarch/wcsnlen.c
+++ b/sysdeps/x86_64/multiarch/wcsnlen.c
@@ -24,39 +24,7 @@
# undef __wcsnlen
# define SYMBOL_NAME wcsnlen
-# include <init-arch.h>
-
-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
-
-static inline void *
-IFUNC_SELECTOR (void)
-{
- const struct cpu_features* cpu_features = __get_cpu_features ();
-
- if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
- && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
- {
- if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
- && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
- && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
- return OPTIMIZE (evex);
-
- if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
- return OPTIMIZE (avx2_rtm);
-
- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
- return OPTIMIZE (avx2);
- }
-
- if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
- return OPTIMIZE (sse4_1);
-
- return OPTIMIZE (sse2);
-}
+# include "ifunc-wcslen.h"
libc_ifunc_redirected (__redirect_wcsnlen, __wcsnlen, IFUNC_SELECTOR ());
weak_alias (__wcsnlen, wcsnlen);
--
GitLab

View File

@ -1,396 +0,0 @@
From 231c56760c1e2ded21ad96bbb860b1f08c556c7a Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Mon, 21 Jan 2019 11:27:25 -0800
Subject: [PATCH] x86-64 memcpy: Properly handle the length parameter [BZ#
24097]
Content-type: text/plain; charset=UTF-8
On x32, the size_t parameter may be passed in the lower 32 bits of a
64-bit register with the non-zero upper 32 bits. The string/memory
functions written in assembly can only use the lower 32 bits of a
64-bit register as length or must clear the upper 32 bits before using
the full 64-bit register for length.
This pach fixes memcpy for x32. Tested on x86-64 and x32. On x86-64,
libc.so is the same with and withou the fix.
[BZ# 24097]
CVE-2019-6488
* sysdeps/x86_64/multiarch/memcpy-ssse3-back.S: Use RDX_LP for
length. Clear the upper 32 bits of RDX register.
* sysdeps/x86_64/multiarch/memcpy-ssse3.S: Likewise.
* sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S:
Likewise.
* sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S:
Likewise.
* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcpy.
tst-size_t-wmemchr.
* sysdeps/x86_64/x32/tst-size_t-memcpy.c: New file.
---
sysdeps/x86_64/multiarch/memcpy-ssse3-back.S | 17 ++++--
sysdeps/x86_64/multiarch/memcpy-ssse3.S | 17 ++++--
.../multiarch/memmove-avx512-no-vzeroupper.S | 16 +++--
.../multiarch/memmove-vec-unaligned-erms.S | 54 +++++++++--------
sysdeps/x86_64/x32/Makefile | 2 +-
sysdeps/x86_64/x32/tst-size_t-memcpy.c | 58 +++++++++++++++++++
6 files changed, 122 insertions(+), 42 deletions(-)
create mode 100644 sysdeps/x86_64/x32/tst-size_t-memcpy.c
Conflicts:
ChangeLog
(removed)
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
index 3cd11233..568eebd3 100644
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
@@ -45,28 +45,33 @@
.section .text.ssse3,"ax",@progbits
#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
ENTRY (MEMPCPY_CHK)
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMPCPY_CHK)
ENTRY (MEMPCPY)
- movq %rdi, %rax
- addq %rdx, %rax
+ mov %RDI_LP, %RAX_LP
+ add %RDX_LP, %RAX_LP
jmp L(start)
END (MEMPCPY)
#endif
#if !defined USE_AS_BCOPY
ENTRY (MEMCPY_CHK)
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMCPY_CHK)
#endif
ENTRY (MEMCPY)
- mov %rdi, %rax
+ mov %RDI_LP, %RAX_LP
#ifdef USE_AS_MEMPCPY
- add %rdx, %rax
+ add %RDX_LP, %RAX_LP
+#endif
+
+#ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+ mov %edx, %edx
#endif
#ifdef USE_AS_MEMMOVE
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
index 0240bfa3..0bd5ee99 100644
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
@@ -45,28 +45,33 @@
.section .text.ssse3,"ax",@progbits
#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
ENTRY (MEMPCPY_CHK)
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMPCPY_CHK)
ENTRY (MEMPCPY)
- movq %rdi, %rax
- addq %rdx, %rax
+ mov %RDI_LP, %RAX_LP
+ add %RDX_LP, %RAX_LP
jmp L(start)
END (MEMPCPY)
#endif
#if !defined USE_AS_BCOPY
ENTRY (MEMCPY_CHK)
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMCPY_CHK)
#endif
ENTRY (MEMCPY)
- mov %rdi, %rax
+ mov %RDI_LP, %RAX_LP
#ifdef USE_AS_MEMPCPY
- add %rdx, %rax
+ add %RDX_LP, %RAX_LP
+#endif
+
+#ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+ mov %edx, %edx
#endif
#ifdef USE_AS_MEMMOVE
diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
index effc3ac2..6ca2bbc9 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
@@ -24,27 +24,31 @@
.section .text.avx512,"ax",@progbits
ENTRY (__mempcpy_chk_avx512_no_vzeroupper)
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (__mempcpy_chk_avx512_no_vzeroupper)
ENTRY (__mempcpy_avx512_no_vzeroupper)
- movq %rdi, %rax
- addq %rdx, %rax
+ mov %RDI_LP, %RAX_LP
+ add %RDX_LP, %RAX_LP
jmp L(start)
END (__mempcpy_avx512_no_vzeroupper)
ENTRY (__memmove_chk_avx512_no_vzeroupper)
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (__memmove_chk_avx512_no_vzeroupper)
ENTRY (__memmove_avx512_no_vzeroupper)
- mov %rdi, %rax
+ mov %RDI_LP, %RAX_LP
# ifdef USE_AS_MEMPCPY
- add %rdx, %rax
+ add %RDX_LP, %RAX_LP
# endif
L(start):
+# ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+ mov %edx, %edx
+# endif
lea (%rsi, %rdx), %rcx
lea (%rdi, %rdx), %r9
cmp $512, %rdx
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index c952576c..274aa1c7 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -95,20 +95,20 @@
.section SECTION(.text),"ax",@progbits
#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
#endif
ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
- movq %rdi, %rax
- addq %rdx, %rax
+ mov %RDI_LP, %RAX_LP
+ add %RDX_LP, %RAX_LP
jmp L(start)
END (MEMPCPY_SYMBOL (__mempcpy, unaligned))
#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
#endif
@@ -116,9 +116,13 @@ END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
movq %rdi, %rax
L(start):
- cmpq $VEC_SIZE, %rdx
+# ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+ movl %edx, %edx
+# endif
+ cmp $VEC_SIZE, %RDX_LP
jb L(less_vec)
- cmpq $(VEC_SIZE * 2), %rdx
+ cmp $(VEC_SIZE * 2), %RDX_LP
ja L(more_2x_vec)
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(last_2x_vec):
@@ -138,38 +142,38 @@ END (MEMMOVE_SYMBOL (__memmove, unaligned))
# if VEC_SIZE == 16
ENTRY (__mempcpy_chk_erms)
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (__mempcpy_chk_erms)
/* Only used to measure performance of REP MOVSB. */
ENTRY (__mempcpy_erms)
- movq %rdi, %rax
+ mov %RDI_LP, %RAX_LP
/* Skip zero length. */
- testq %rdx, %rdx
+ test %RDX_LP, %RDX_LP
jz 2f
- addq %rdx, %rax
+ add %RDX_LP, %RAX_LP
jmp L(start_movsb)
END (__mempcpy_erms)
ENTRY (__memmove_chk_erms)
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (__memmove_chk_erms)
ENTRY (__memmove_erms)
movq %rdi, %rax
/* Skip zero length. */
- testq %rdx, %rdx
+ test %RDX_LP, %RDX_LP
jz 2f
L(start_movsb):
- movq %rdx, %rcx
- cmpq %rsi, %rdi
+ mov %RDX_LP, %RCX_LP
+ cmp %RSI_LP, %RDI_LP
jb 1f
/* Source == destination is less common. */
je 2f
- leaq (%rsi,%rcx), %rdx
- cmpq %rdx, %rdi
+ lea (%rsi,%rcx), %RDX_LP
+ cmp %RDX_LP, %RDI_LP
jb L(movsb_backward)
1:
rep movsb
@@ -189,20 +193,20 @@ strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
# endif
ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
- movq %rdi, %rax
- addq %rdx, %rax
+ mov %RDI_LP, %RAX_LP
+ add %RDX_LP, %RAX_LP
jmp L(start_erms)
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
# endif
@@ -210,9 +214,13 @@ END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
movq %rdi, %rax
L(start_erms):
- cmpq $VEC_SIZE, %rdx
+# ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+ movl %edx, %edx
+# endif
+ cmp $VEC_SIZE, %RDX_LP
jb L(less_vec)
- cmpq $(VEC_SIZE * 2), %rdx
+ cmp $(VEC_SIZE * 2), %RDX_LP
ja L(movsb_more_2x_vec)
L(last_2x_vec):
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
@@ -236,7 +244,7 @@ L(movsb):
/* Avoid slow backward REP MOVSB. */
jb L(more_8x_vec_backward)
1:
- movq %rdx, %rcx
+ mov %RDX_LP, %RCX_LP
rep movsb
L(nop):
ret
diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
index ddec7f04..2fe1e5ac 100644
--- a/sysdeps/x86_64/x32/Makefile
+++ b/sysdeps/x86_64/x32/Makefile
@@ -6,7 +6,7 @@ CFLAGS-s_llround.c += -fno-builtin-lround
endif
ifeq ($(subdir),string)
-tests += tst-size_t-memchr tst-size_t-memcmp
+tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy
endif
ifeq ($(subdir),wcsmbs)
diff --git a/sysdeps/x86_64/x32/tst-size_t-memcpy.c b/sysdeps/x86_64/x32/tst-size_t-memcpy.c
new file mode 100644
index 00000000..66b71e17
--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-memcpy.c
@@ -0,0 +1,58 @@
+/* Test memcpy with size_t in the lower 32 bits of 64-bit register.
+ Copyright (C) 2019 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define TEST_NAME "memcpy"
+#include "test-size_t.h"
+
+IMPL (memcpy, 1)
+
+typedef void *(*proto_t) (void *, const void *, size_t);
+
+static void *
+__attribute__ ((noinline, noclone))
+do_memcpy (parameter_t a, parameter_t b)
+{
+ return CALL (&b, a.p, b.p, a.len);
+}
+
+static int
+test_main (void)
+{
+ test_init ();
+
+ parameter_t dest = { { page_size }, buf1 };
+ parameter_t src = { { 0 }, buf2 };
+
+ int ret = 0;
+ FOR_EACH_IMPL (impl, 0)
+ {
+ src.fn = impl->fn;
+ do_memcpy (dest, src);
+ int res = memcmp (dest.p, src.p, dest.len);
+ if (res)
+ {
+ error (0, 0, "Wrong result in function %s: %i != 0",
+ impl->name, res);
+ ret = 1;
+ }
+ }
+
+ return ret ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#include <support/test-driver.c>
--
GitLab

Some files were not shown because too many files have changed in this diff Show More