Resolves: #2120692 - speedup magic matching

This commit is contained in:
Vincent Mihalkovic 2022-08-28 20:32:33 +02:00
parent 8fa5828ebd
commit 9882f5663a
8 changed files with 1038 additions and 1 deletions

View File

@ -0,0 +1,377 @@
diff --git a/src/apprentice.c b/src/apprentice.c
index b609dd1..21eac1e 100644
--- a/src/apprentice.c
+++ b/src/apprentice.c
@@ -423,7 +423,15 @@ add_mlist(struct mlist *mlp, struct magic_map *map, size_t idx)
ml->map = idx == 0 ? map : NULL;
ml->magic = map->magic[idx];
ml->nmagic = map->nmagic[idx];
-
+ if (ml->nmagic) {
+ ml->magic_rxcomp = CAST(file_regex_t **,
+ calloc(ml->nmagic, sizeof(*ml->magic_rxcomp)));
+ if (ml->magic_rxcomp == NULL) {
+ free(ml);
+ return -1;
+ }
+ } else
+ ml->magic_rxcomp = NULL;
mlp->prev->next = ml;
ml->prev = mlp->prev;
ml->next = mlp;
@@ -607,8 +615,19 @@ mlist_free_all(struct magic_set *ms)
private void
mlist_free_one(struct mlist *ml)
{
+ size_t i;
+
if (ml->map)
apprentice_unmap(CAST(struct magic_map *, ml->map));
+
+ for (i = 0; i < ml->nmagic; ++i) {
+ if (ml->magic_rxcomp[i]) {
+ file_regfree(ml->magic_rxcomp[i]);
+ free(ml->magic_rxcomp[i]);
+ }
+ }
+ free(ml->magic_rxcomp);
+ ml->magic_rxcomp = NULL;
free(ml);
}
@@ -3492,16 +3511,16 @@ file_magicfind(struct magic_set *ms, const char *name, struct mlist *v)
for (ml = mlist->next; ml != mlist; ml = ml->next) {
struct magic *ma = ml->magic;
- uint32_t nma = ml->nmagic;
- for (i = 0; i < nma; i++) {
+ for (i = 0; i < ml->nmagic; i++) {
if (ma[i].type != FILE_NAME)
continue;
if (strcmp(ma[i].value.s, name) == 0) {
v->magic = &ma[i];
- for (j = i + 1; j < nma; j++)
+ for (j = i + 1; j < ml->nmagic; j++)
if (ma[j].cont_level == 0)
break;
v->nmagic = j - i;
+ v->magic_rxcomp = ml->magic_rxcomp;
return 0;
}
}
diff --git a/src/file.h b/src/file.h
index 48f4b69..c0b5a7c 100644
--- a/src/file.h
+++ b/src/file.h
@@ -88,6 +88,10 @@
/* Do this here and now, because struct stat gets re-defined on solaris */
#include <sys/stat.h>
#include <stdarg.h>
+#include <locale.h>
+#if defined(HAVE_XLOCALE_H)
+#include <xlocale.h>
+#endif
#define ENABLE_CONDITIONALS
@@ -167,6 +171,19 @@
#define FILE_COMPILE 2
#define FILE_LIST 3
+typedef struct {
+ const char *pat;
+#if defined(HAVE_NEWLOCALE) && defined(HAVE_USELOCALE) && defined(HAVE_FREELOCALE)
+#define USE_C_LOCALE
+ locale_t old_lc_ctype;
+ locale_t c_lc_ctype;
+#else
+ char *old_lc_ctype;
+#endif
+ int rc;
+ regex_t rx;
+} file_regex_t;
+
struct buffer {
int fd;
struct stat st;
@@ -394,7 +411,8 @@ struct magic {
/* list of magic entries */
struct mlist {
struct magic *magic; /* array of magic entries */
- uint32_t nmagic; /* number of entries in array */
+ file_regex_t **magic_rxcomp; /* array of compiled regexps */
+ size_t nmagic; /* number of entries in array */
void *map; /* internal resources used by entry */
struct mlist *next, *prev;
};
@@ -554,23 +572,7 @@ protected void buffer_init(struct buffer *, int, const struct stat *,
protected void buffer_fini(struct buffer *);
protected int buffer_fill(const struct buffer *);
-#include <locale.h>
-#if defined(HAVE_XLOCALE_H)
-#include <xlocale.h>
-#endif
-typedef struct {
- const char *pat;
-#if defined(HAVE_NEWLOCALE) && defined(HAVE_USELOCALE) && defined(HAVE_FREELOCALE)
-#define USE_C_LOCALE
- locale_t old_lc_ctype;
- locale_t c_lc_ctype;
-#else
- char *old_lc_ctype;
-#endif
- int rc;
- regex_t rx;
-} file_regex_t;
protected int file_regcomp(file_regex_t *, const char *, int);
protected int file_regexec(file_regex_t *, const char *, size_t, regmatch_t *,
diff --git a/src/softmagic.c b/src/softmagic.c
index 95061e5..834dfe3 100644
--- a/src/softmagic.c
+++ b/src/softmagic.c
@@ -43,7 +43,7 @@ FILE_RCSID("@(#)$File: softmagic.c,v 1.299 2020/06/07 21:58:01 christos Exp $")
#include <time.h>
#include "der.h"
-private int match(struct magic_set *, struct magic *, uint32_t,
+private int match(struct magic_set *, struct magic *, file_regex_t **, uint32_t,
const struct buffer *, size_t, int, int, int, uint16_t *,
uint16_t *, int *, int *, int *, int *);
private int mget(struct magic_set *, struct magic *, const struct buffer *,
@@ -52,7 +52,7 @@ private int mget(struct magic_set *, struct magic *, const struct buffer *,
uint16_t *, int *, int *, int *, int *);
private int msetoffset(struct magic_set *, struct magic *, struct buffer *,
const struct buffer *, size_t, unsigned int);
-private int magiccheck(struct magic_set *, struct magic *);
+private int magiccheck(struct magic_set *, struct magic *, file_regex_t **);
private int32_t mprint(struct magic_set *, struct magic *);
private int moffset(struct magic_set *, struct magic *, const struct buffer *,
int32_t *);
@@ -131,8 +131,8 @@ file_softmagic(struct magic_set *ms, const struct buffer *b,
}
for (ml = ms->mlist[0]->next; ml != ms->mlist[0]; ml = ml->next)
- if ((rv = match(ms, ml->magic, ml->nmagic, b, 0, mode,
- text, 0, indir_count, name_count,
+ if ((rv = match(ms, ml->magic, ml->magic_rxcomp, ml->nmagic, b,
+ 0, mode, text, 0, indir_count, name_count,
&printed_something, &need_separator, NULL, NULL)) != 0)
return rv;
@@ -191,8 +191,8 @@ file_fmtcheck(struct magic_set *ms, const char *desc, const char *def,
* so that higher-level continuations are processed.
*/
private int
-match(struct magic_set *ms, struct magic *magic, uint32_t nmagic,
- const struct buffer *b, size_t offset, int mode, int text,
+match(struct magic_set *ms, struct magic *magic, file_regex_t **magic_rxcomp,
+ uint32_t nmagic, const struct buffer *b, size_t offset, int mode, int text,
int flip, uint16_t *indir_count, uint16_t *name_count,
int *printed_something, int *need_separator, int *returnval,
int *found_match)
@@ -220,6 +220,7 @@ match(struct magic_set *ms, struct magic *magic, uint32_t nmagic,
for (magindex = 0; magindex < nmagic; magindex++) {
int flush = 0;
struct magic *m = &magic[magindex];
+ file_regex_t **m_rxcomp = &magic_rxcomp[magindex];
if (m->type != FILE_NAME)
if ((IS_STRING(m->type) &&
@@ -257,7 +258,7 @@ flush:
*returnval = 1;
}
- switch (magiccheck(ms, m)) {
+ switch (magiccheck(ms, m, m_rxcomp)) {
case -1:
return -1;
case 0:
@@ -317,6 +318,7 @@ flush:
while (magindex + 1 < nmagic &&
magic[magindex + 1].cont_level != 0) {
m = &magic[++magindex];
+ m_rxcomp = &magic_rxcomp[magindex];
ms->line = m->lineno; /* for messages */
if (cont_level < m->cont_level)
@@ -370,7 +372,7 @@ flush:
break;
}
- switch (flush ? 1 : magiccheck(ms, m)) {
+ switch (flush ? 1 : magiccheck(ms, m, m_rxcomp)) {
case -1:
return -1;
case 0:
@@ -1880,8 +1882,8 @@ mget(struct magic_set *ms, struct magic *m, const struct buffer *b,
oneed_separator = *need_separator;
if (m->flag & NOSPACE)
*need_separator = 0;
- rv = match(ms, ml.magic, ml.nmagic, b, offset + o,
- mode, text, flip, indir_count, name_count,
+ rv = match(ms, ml.magic, ml.magic_rxcomp, ml.nmagic, b,
+ offset + o, mode, text, flip, indir_count, name_count,
printed_something, need_separator, returnval, found_match);
(*name_count)--;
if (rv != 1)
@@ -1989,8 +1991,31 @@ file_strncmp16(const char *a, const char *b, size_t len, size_t maxlen,
return file_strncmp(a, b, len, maxlen, flags);
}
+private file_regex_t *
+alloc_regex(struct magic_set *ms, struct magic *m)
+{
+ int rc;
+ file_regex_t *rx = CAST(file_regex_t *, malloc(sizeof(*rx)));
+
+ if (rx == NULL) {
+ file_error(ms, errno, "can't allocate %" SIZE_T_FORMAT
+ "u bytes", sizeof(*rx));
+ return NULL;
+ }
+
+ rc = file_regcomp(rx, m->value.s, REG_EXTENDED | REG_NEWLINE |
+ ((m->str_flags & STRING_IGNORE_CASE) ? REG_ICASE : 0));
+ if (rc == 0)
+ return rx;
+
+ file_regerror(rx, rc, ms);
+ file_regfree(rx);
+ free(rx);
+ return NULL;
+}
+
private int
-magiccheck(struct magic_set *ms, struct magic *m)
+magiccheck(struct magic_set *ms, struct magic *m, file_regex_t **m_cache)
{
uint64_t l = m->value.q;
uint64_t v;
@@ -2068,8 +2093,8 @@ magiccheck(struct magic_set *ms, struct magic *m)
break;
default:
- file_magerror(ms, "cannot happen with float: invalid relation `%c'",
- m->reln);
+ file_magerror(ms, "cannot happen with float: "
+ "invalid relation `%c'", m->reln);
return -1;
}
return matched;
@@ -2101,7 +2126,8 @@ magiccheck(struct magic_set *ms, struct magic *m)
break;
default:
- file_magerror(ms, "cannot happen with double: invalid relation `%c'", m->reln);
+ file_magerror(ms, "cannot happen with double: "
+ "invalid relation `%c'", m->reln);
return -1;
}
return matched;
@@ -2169,62 +2195,57 @@ magiccheck(struct magic_set *ms, struct magic *m)
}
case FILE_REGEX: {
int rc;
- file_regex_t rx;
+ file_regex_t *rx = *m_cache;
const char *search;
+ regmatch_t pmatch;
+ size_t slen = ms->search.s_len;
+ char *copy;
if (ms->search.s == NULL)
return 0;
+ if (rx == NULL) {
+ rx = *m_cache = alloc_regex(ms, m);
+ if (rx == NULL)
+ return -1;
+ }
l = 0;
- rc = file_regcomp(&rx, m->value.s,
- REG_EXTENDED|REG_NEWLINE|
- ((m->str_flags & STRING_IGNORE_CASE) ? REG_ICASE : 0));
- if (rc) {
- file_regerror(&rx, rc, ms);
- v = CAST(uint64_t, -1);
+ if (slen != 0) {
+ copy = CAST(char *, malloc(slen));
+ if (copy == NULL) {
+ file_error(ms, errno,
+ "can't allocate %" SIZE_T_FORMAT "u bytes",
+ slen);
+ return -1;
+ }
+ memcpy(copy, ms->search.s, slen);
+ copy[--slen] = '\0';
+ search = copy;
} else {
- regmatch_t pmatch;
- size_t slen = ms->search.s_len;
- char *copy;
- if (slen != 0) {
- copy = CAST(char *, malloc(slen));
- if (copy == NULL) {
- file_regfree(&rx);
- file_error(ms, errno,
- "can't allocate %" SIZE_T_FORMAT "u bytes",
- slen);
- return -1;
- }
- memcpy(copy, ms->search.s, slen);
- copy[--slen] = '\0';
- search = copy;
- } else {
- search = CCAST(char *, "");
- copy = NULL;
- }
- rc = file_regexec(&rx, RCAST(const char *, search),
- 1, &pmatch, 0);
- free(copy);
- switch (rc) {
- case 0:
- ms->search.s += CAST(int, pmatch.rm_so);
- ms->search.offset += CAST(size_t, pmatch.rm_so);
- ms->search.rm_len = CAST(size_t,
- pmatch.rm_eo - pmatch.rm_so);
- v = 0;
- break;
+ search = CCAST(char *, "");
+ copy = NULL;
+ }
+ rc = file_regexec(rx, RCAST(const char *, search),
+ 1, &pmatch, 0);
+ free(copy);
+ switch (rc) {
+ case 0:
+ ms->search.s += CAST(int, pmatch.rm_so);
+ ms->search.offset += CAST(size_t, pmatch.rm_so);
+ ms->search.rm_len = CAST(size_t,
+ pmatch.rm_eo - pmatch.rm_so);
+ v = 0;
+ break;
- case REG_NOMATCH:
- v = 1;
- break;
+ case REG_NOMATCH:
+ v = 1;
+ break;
- default:
- file_regerror(&rx, rc, ms);
- v = CAST(uint64_t, -1);
- break;
- }
+ default:
+ file_regerror(rx, rc, ms);
+ v = CAST(uint64_t, -1);
+ break;
}
- file_regfree(&rx);
if (v == CAST(uint64_t, -1))
return -1;
break;

View File

@ -0,0 +1,226 @@
diff --git a/src/apprentice.c b/src/apprentice.c
index 21eac1e..781c5e1 100644
--- a/src/apprentice.c
+++ b/src/apprentice.c
@@ -513,6 +513,9 @@ file_ms_free(struct magic_set *ms)
free(ms->o.pbuf);
free(ms->o.buf);
free(ms->c.li);
+#ifdef USE_C_LOCALE
+ freelocale(ms->c_lc_ctype);
+#endif
free(ms);
}
@@ -551,6 +554,10 @@ file_ms_alloc(int flags)
ms->elf_notes_max = FILE_ELF_NOTES_MAX;
ms->regex_max = FILE_REGEX_MAX;
ms->bytes_max = FILE_BYTES_MAX;
+#ifdef USE_C_LOCALE
+ ms->c_lc_ctype = newlocale(LC_CTYPE_MASK, "C", 0);
+ assert(ms->c_lc_ctype != NULL);
+#endif
return ms;
free:
free(ms);
@@ -624,6 +631,7 @@ mlist_free_one(struct mlist *ml)
if (ml->magic_rxcomp[i]) {
file_regfree(ml->magic_rxcomp[i]);
free(ml->magic_rxcomp[i]);
+ ml->magic_rxcomp[i] = NULL;
}
}
free(ml->magic_rxcomp);
@@ -2714,7 +2722,8 @@ getvalue(struct magic_set *ms, struct magic *m, const char **p, int action)
}
if (m->type == FILE_REGEX) {
file_regex_t rx;
- int rc = file_regcomp(&rx, m->value.s, REG_EXTENDED);
+ int rc = file_regcomp(ms, &rx, m->value.s,
+ REG_EXTENDED);
if (rc) {
if (ms->flags & MAGIC_CHECK)
file_regerror(&rx, rc, ms);
diff --git a/src/file.h b/src/file.h
index c0b5a7c..f049446 100644
--- a/src/file.h
+++ b/src/file.h
@@ -173,13 +173,6 @@
typedef struct {
const char *pat;
-#if defined(HAVE_NEWLOCALE) && defined(HAVE_USELOCALE) && defined(HAVE_FREELOCALE)
-#define USE_C_LOCALE
- locale_t old_lc_ctype;
- locale_t c_lc_ctype;
-#else
- char *old_lc_ctype;
-#endif
int rc;
regex_t rx;
} file_regex_t;
@@ -487,6 +480,10 @@ struct magic_set {
#define FILE_INDIR_MAX 50
#define FILE_NAME_MAX 50
#define FILE_REGEX_MAX 8192
+#if defined(HAVE_NEWLOCALE) && defined(HAVE_USELOCALE) && defined(HAVE_FREELOCALE)
+#define USE_C_LOCALE
+ locale_t c_lc_ctype;
+#endif
};
/* Type for Unicode characters */
@@ -574,9 +571,10 @@ protected int buffer_fill(const struct buffer *);
-protected int file_regcomp(file_regex_t *, const char *, int);
-protected int file_regexec(file_regex_t *, const char *, size_t, regmatch_t *,
+protected int file_regcomp(struct magic_set *, file_regex_t *, const char *,
int);
+protected int file_regexec(struct magic_set *, file_regex_t *, const char *,
+ size_t, regmatch_t *, int);
protected void file_regfree(file_regex_t *);
protected void file_regerror(file_regex_t *, int, struct magic_set *);
diff --git a/src/funcs.c b/src/funcs.c
index 6320cf2..1391a44 100644
--- a/src/funcs.c
+++ b/src/funcs.c
@@ -613,13 +613,13 @@ file_replace(struct magic_set *ms, const char *pat, const char *rep)
file_regex_t rx;
int rc, rv = -1;
- rc = file_regcomp(&rx, pat, REG_EXTENDED);
+ rc = file_regcomp(ms, &rx, pat, REG_EXTENDED);
if (rc) {
file_regerror(&rx, rc, ms);
} else {
regmatch_t rm;
int nm = 0;
- while (file_regexec(&rx, ms->o.buf, 1, &rm, 0) == 0) {
+ while (file_regexec(ms, &rx, ms->o.buf, 1, &rm, 0) == 0) {
ms->o.buf[rm.rm_so] = '\0';
if (file_printf(ms, "%s%s", rep,
rm.rm_eo != 0 ? ms->o.buf + rm.rm_eo : "") == -1)
@@ -634,34 +634,52 @@ out:
}
protected int
-file_regcomp(file_regex_t *rx, const char *pat, int flags)
+file_regcomp(struct magic_set *ms, file_regex_t *rx, const char *pat, int flags)
{
#ifdef USE_C_LOCALE
- rx->c_lc_ctype = newlocale(LC_CTYPE_MASK, "C", 0);
- assert(rx->c_lc_ctype != NULL);
- rx->old_lc_ctype = uselocale(rx->c_lc_ctype);
- assert(rx->old_lc_ctype != NULL);
+ locale_t old = uselocale(ms->c_lc_ctype);
+ assert(old != NULL);
#else
- rx->old_lc_ctype = setlocale(LC_CTYPE, NULL);
- assert(rx->old_lc_ctype != NULL);
- rx->old_lc_ctype = strdup(rx->old_lc_ctype);
- assert(rx->old_lc_ctype != NULL);
+ char old[1024];
+ strlcpy(old, setlocale(LC_CTYPE, NULL), sizeof(old));
(void)setlocale(LC_CTYPE, "C");
#endif
rx->pat = pat;
- return rx->rc = regcomp(&rx->rx, pat, flags);
+ rx->rc = regcomp(&rx->rx, pat, flags);
+
+#ifdef USE_C_LOCALE
+ uselocale(old);
+#else
+ (void)setlocale(LC_CTYPE, old);
+#endif
+ return rx->rc;
}
protected int
-file_regexec(file_regex_t *rx, const char *str, size_t nmatch,
- regmatch_t* pmatch, int eflags)
+file_regexec(struct magic_set *ms, file_regex_t *rx, const char *str,
+ size_t nmatch, regmatch_t* pmatch, int eflags)
{
+#ifdef USE_C_LOCALE
+ locale_t old = uselocale(ms->c_lc_ctype);
+ assert(old != NULL);
+#else
+ char old[1024];
+ strlcpy(old, setlocale(LC_CTYPE, NULL), sizeof(old));
+ (void)setlocale(LC_CTYPE, "C");
+#endif
+ int rc;
assert(rx->rc == 0);
/* XXX: force initialization because glibc does not always do this */
if (nmatch != 0)
memset(pmatch, 0, nmatch * sizeof(*pmatch));
- return regexec(&rx->rx, str, nmatch, pmatch, eflags);
+ rc = regexec(&rx->rx, str, nmatch, pmatch, eflags);
+#ifdef USE_C_LOCALE
+ uselocale(old);
+#else
+ (void)setlocale(LC_CTYPE, old);
+#endif
+ return rc;
}
protected void
@@ -669,13 +687,6 @@ file_regfree(file_regex_t *rx)
{
if (rx->rc == 0)
regfree(&rx->rx);
-#ifdef USE_C_LOCALE
- (void)uselocale(rx->old_lc_ctype);
- freelocale(rx->c_lc_ctype);
-#else
- (void)setlocale(LC_CTYPE, rx->old_lc_ctype);
- free(rx->old_lc_ctype);
-#endif
}
protected void
diff --git a/src/softmagic.c b/src/softmagic.c
index 43338fc..b4052a6 100644
--- a/src/softmagic.c
+++ b/src/softmagic.c
@@ -478,11 +478,11 @@ check_fmt(struct magic_set *ms, const char *fmt)
if (strchr(fmt, '%') == NULL)
return 0;
- rc = file_regcomp(&rx, "%[-0-9\\.]*s", REG_EXTENDED|REG_NOSUB);
+ rc = file_regcomp(ms, &rx, "%[-0-9\\.]*s", REG_EXTENDED|REG_NOSUB);
if (rc) {
file_regerror(&rx, rc, ms);
} else {
- rc = file_regexec(&rx, fmt, 0, 0, 0);
+ rc = file_regexec(ms, &rx, fmt, 0, 0, 0);
rv = !rc;
}
file_regfree(&rx);
@@ -2003,11 +2003,12 @@ alloc_regex(struct magic_set *ms, struct magic *m)
return NULL;
}
- rc = file_regcomp(rx, m->value.s, REG_EXTENDED | REG_NEWLINE |
+ rc = file_regcomp(ms, rx, m->value.s, REG_EXTENDED | REG_NEWLINE |
((m->str_flags & STRING_IGNORE_CASE) ? REG_ICASE : 0));
if (rc == 0)
return rx;
+fprintf(stderr, "regcomp %s %d\n", m->value.s, rc);
file_regerror(rx, rc, ms);
file_regfree(rx);
free(rx);
@@ -2225,7 +2226,7 @@ magiccheck(struct magic_set *ms, struct magic *m, file_regex_t **m_cache)
search = CCAST(char *, "");
copy = NULL;
}
- rc = file_regexec(rx, RCAST(const char *, search),
+ rc = file_regexec(ms, rx, RCAST(const char *, search),
1, &pmatch, 0);
free(copy);
switch (rc) {

View File

@ -0,0 +1,28 @@
From d1a00ae92b2cf09298615cf3aba474d8fec7380f Mon Sep 17 00:00:00 2001
From: Christos Zoulas <christos@zoulas.com>
Date: Mon, 18 Apr 2022 21:46:43 +0000
Subject: [PATCH] From Dirk Mueller:
when name/use was used, the regex caching table was incorrectly
initialized, which led to false or missing matches.
---
src/apprentice.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/apprentice.c b/src/apprentice.c
index 804c0e33e..992102b4e 100644
--- a/src/apprentice.c
+++ b/src/apprentice.c
@@ -3678,11 +3678,11 @@ file_magicfind(struct magic_set *ms, const char *name, struct mlist *v)
continue;
if (strcmp(ma[i].value.s, name) == 0) {
v->magic = &ma[i];
+ v->magic_rxcomp = &(ml->magic_rxcomp[i]);
for (j = i + 1; j < ml->nmagic; j++)
if (ma[j].cont_level == 0)
break;
v->nmagic = j - i;
- v->magic_rxcomp = ml->magic_rxcomp;
return 0;
}
}

View File

@ -0,0 +1,164 @@
diff --git a/src/apprentice.c b/src/apprentice.c
index 781c5e1..50a91cf 100644
--- a/src/apprentice.c
+++ b/src/apprentice.c
@@ -2724,11 +2724,9 @@ getvalue(struct magic_set *ms, struct magic *m, const char **p, int action)
file_regex_t rx;
int rc = file_regcomp(ms, &rx, m->value.s,
REG_EXTENDED);
- if (rc) {
- if (ms->flags & MAGIC_CHECK)
- file_regerror(&rx, rc, ms);
+ if (rc == 0) {
+ file_regfree(&rx);
}
- file_regfree(&rx);
return rc ? -1 : 0;
}
return 0;
diff --git a/src/file.h b/src/file.h
index f049446..ee15855 100644
--- a/src/file.h
+++ b/src/file.h
@@ -171,11 +171,7 @@
#define FILE_COMPILE 2
#define FILE_LIST 3
-typedef struct {
- const char *pat;
- int rc;
- regex_t rx;
-} file_regex_t;
+typedef regex_t file_regex_t;
struct buffer {
int fd;
@@ -576,7 +572,6 @@ protected int file_regcomp(struct magic_set *, file_regex_t *, const char *,
protected int file_regexec(struct magic_set *, file_regex_t *, const char *,
size_t, regmatch_t *, int);
protected void file_regfree(file_regex_t *);
-protected void file_regerror(file_regex_t *, int, struct magic_set *);
typedef struct {
char *buf;
diff --git a/src/funcs.c b/src/funcs.c
index 1391a44..df436eb 100644
--- a/src/funcs.c
+++ b/src/funcs.c
@@ -614,9 +614,7 @@ file_replace(struct magic_set *ms, const char *pat, const char *rep)
int rc, rv = -1;
rc = file_regcomp(ms, &rx, pat, REG_EXTENDED);
- if (rc) {
- file_regerror(&rx, rc, ms);
- } else {
+ if (rc == 0) {
regmatch_t rm;
int nm = 0;
while (file_regexec(ms, &rx, ms->o.buf, 1, &rm, 0) == 0) {
@@ -644,16 +642,22 @@ file_regcomp(struct magic_set *ms, file_regex_t *rx, const char *pat, int flags)
strlcpy(old, setlocale(LC_CTYPE, NULL), sizeof(old));
(void)setlocale(LC_CTYPE, "C");
#endif
- rx->pat = pat;
-
- rx->rc = regcomp(&rx->rx, pat, flags);
+ int rc;
+ rc = regcomp(rx, pat, flags);
#ifdef USE_C_LOCALE
uselocale(old);
#else
(void)setlocale(LC_CTYPE, old);
#endif
- return rx->rc;
+ if (rc > 0 && (ms->flags & MAGIC_CHECK)) {
+ char errmsg[512];
+
+ (void)regerror(rc, rx, errmsg, sizeof(errmsg));
+ file_magerror(ms, "regex error %d for `%s', (%s)", rc, pat,
+ errmsg);
+ }
+ return rc;
}
protected int
@@ -669,11 +673,10 @@ file_regexec(struct magic_set *ms, file_regex_t *rx, const char *str,
(void)setlocale(LC_CTYPE, "C");
#endif
int rc;
- assert(rx->rc == 0);
/* XXX: force initialization because glibc does not always do this */
if (nmatch != 0)
memset(pmatch, 0, nmatch * sizeof(*pmatch));
- rc = regexec(&rx->rx, str, nmatch, pmatch, eflags);
+ rc = regexec(rx, str, nmatch, pmatch, eflags);
#ifdef USE_C_LOCALE
uselocale(old);
#else
@@ -685,18 +688,7 @@ file_regexec(struct magic_set *ms, file_regex_t *rx, const char *str,
protected void
file_regfree(file_regex_t *rx)
{
- if (rx->rc == 0)
- regfree(&rx->rx);
-}
-
-protected void
-file_regerror(file_regex_t *rx, int rc, struct magic_set *ms)
-{
- char errmsg[512];
-
- (void)regerror(rc, &rx->rx, errmsg, sizeof(errmsg));
- file_magerror(ms, "regex error %d for `%s', (%s)", rc, rx->pat,
- errmsg);
+ regfree(rx);
}
protected file_pushbuf_t *
diff --git a/src/softmagic.c b/src/softmagic.c
index b4052a6..f469a12 100644
--- a/src/softmagic.c
+++ b/src/softmagic.c
@@ -474,14 +474,13 @@ check_fmt(struct magic_set *ms, const char *fmt)
{
file_regex_t rx;
int rc, rv = -1;
+ const char* pat = "%[-0-9\\.]*s";
if (strchr(fmt, '%') == NULL)
return 0;
- rc = file_regcomp(ms, &rx, "%[-0-9\\.]*s", REG_EXTENDED|REG_NOSUB);
- if (rc) {
- file_regerror(&rx, rc, ms);
- } else {
+ rc = file_regcomp(ms, &rx, pat, REG_EXTENDED|REG_NOSUB);
+ if (rc == 0) {
rc = file_regexec(ms, &rx, fmt, 0, 0, 0);
rv = !rc;
}
@@ -2008,9 +2007,6 @@ alloc_regex(struct magic_set *ms, struct magic *m)
if (rc == 0)
return rx;
-fprintf(stderr, "regcomp %s %d\n", m->value.s, rc);
- file_regerror(rx, rc, ms);
- file_regfree(rx);
free(rx);
return NULL;
}
@@ -2243,12 +2239,9 @@ magiccheck(struct magic_set *ms, struct magic *m, file_regex_t **m_cache)
break;
default:
- file_regerror(rx, rc, ms);
- v = CAST(uint64_t, -1);
+ return -1;
break;
}
- if (v == CAST(uint64_t, -1))
- return -1;
break;
}
case FILE_INDIRECT:

View File

@ -0,0 +1,58 @@
From a4b3abc4104d8a4eeb84a6242f2f1a830204a539 Mon Sep 17 00:00:00 2001
From: Christos Zoulas <christos@zoulas.com>
Date: Sat, 12 Mar 2022 15:09:47 +0000
Subject: [PATCH] Combine regex's to improve performance adjusting strength to
preserve ranking (Dirk Mueller)
---
magic/Magdir/make | 29 +++++++----------------------
1 file changed, 7 insertions(+), 22 deletions(-)
diff --git a/magic/Magdir/make b/magic/Magdir/make
index f522b4f18..1abdf7a3e 100644
--- a/magic/Magdir/make
+++ b/magic/Magdir/make
@@ -1,36 +1,21 @@
#------------------------------------------------------------------------------
-# $File: make,v 1.4 2018/05/29 17:26:02 christos Exp $
+# $File: make,v 1.5 2022/03/12 15:09:47 christos Exp $
# make: file(1) magic for makefiles
#
# URL: https://en.wikipedia.org/wiki/Make_(software)
-0 regex/100l \^CFLAGS makefile script text
-!:mime text/x-makefile
-0 regex/100l \^VPATH makefile script text
-!:mime text/x-makefile
-0 regex/100l \^LDFLAGS makefile script text
-!:mime text/x-makefile
-0 regex/100l \^all: makefile script text
-!:mime text/x-makefile
-0 regex/100l \^\\.PRECIOUS makefile script text
+0 regex/100l \^(CFLAGS|VPATH|LDFLAGS|all:|\\.PRECIOUS) makefile script text
!:mime text/x-makefile
+!:strength -15
# Update: Joerg Jenderek
# Reference: https://www.freebsd.org/cgi/man.cgi?make(1)
# exclude grub-core\lib\libgcrypt\mpi\Makefile.am with "#BEGIN_ASM_LIST"
# by additional escaping point character
-0 regex/100l \^\\.BEGIN BSD makefile script text
-!:mime text/x-makefile
-!:ext /mk
-!:strength +10
# exclude MS Windows help file CoNtenT with ":include FOOBAR.CNT"
# and NSIS script with "!include" by additional escaping point character
-0 regex/100l \^\\.include BSD makefile script text
-!:mime text/x-makefile
-!:ext /mk
-!:strength +10
-0 regex/100l \^\\.endif BSD makefile script text
+0 regex/100l \^\\.(BEGIN|endif|include) BSD makefile script text
!:mime text/x-makefile
!:ext /mk
-!:strength +10
-0 regex/100l \^SUBDIRS automake makefile script text
+!:strength -10
+0 regex/100l \^SUBDIRS[[:space:]]+= automake makefile script text
!:mime text/x-makefile
-!:strength +10
+!:strength -15

View File

@ -0,0 +1,86 @@
diff --git a/magic/Magdir/archive b/magic/Magdir/archive
index 99798b0..945f536 100644
--- a/magic/Magdir/archive
+++ b/magic/Magdir/archive
@@ -150,7 +150,7 @@
# Incremental snapshot gnu-tar format from:
# https://www.gnu.org/software/tar/manual/html_node/Snapshot-Files.html
0 string GNU\ tar- GNU tar incremental snapshot data
->&0 regex [0-9]\.[0-9]+-[0-9]+ version %s
+>&0 regex [0-9]\\.[0-9]+-[0-9]+ version %s
# cpio archives
#
diff --git a/magic/Magdir/ctf b/magic/Magdir/ctf
index ebea8f3..d91684d 100644
--- a/magic/Magdir/ctf
+++ b/magic/Magdir/ctf
@@ -20,4 +20,4 @@
# CTF metadata (plain text)
0 string /*\x20CTF\x20 Common Trace Format (CTF) plain text metadata
!:strength + 5 # this is to make sure we beat C
->&0 regex [0-9]+\.[0-9]+ \b, v%s
+>&0 regex [0-9]+\\.[0-9]+ \b, v%s
diff --git a/magic/Magdir/geo b/magic/Magdir/geo
index d72e514..dda5f73 100644
--- a/magic/Magdir/geo
+++ b/magic/Magdir/geo
@@ -1,6 +1,6 @@
#------------------------------------------------------------------------------
-# $File: geo,v 1.7 2019/04/19 00:42:27 christos Exp $
+# $File: geo,v 1.8 2022/03/24 15:48:58 christos Exp $
# Geo- files from Kurt Schwehr <schwehr@ccom.unh.edu>
######################################################################
@@ -28,8 +28,8 @@
# Knudsen subbottom chirp profiler - Binary File Format: B9
# KEB D409-03167 V1.75 Huffman
0 string KEB\ Knudsen seismic KEL binary (KEB) -
->4 regex [-A-Z0-9]* Software: %s
->>&1 regex V[0-9]*\.[0-9]* version %s
+>4 regex [-A-Z0-9]+ Software: %s
+>>&1 regex V[0-9]+\\.[0-9]+ version %s
######################################################################
#
@@ -40,7 +40,7 @@
# Caris LIDAR format for LADS comes as two parts... ascii location file and binary waveform data
0 string HCA LADS Caris Ascii Format (CAF) bathymetric lidar
->4 regex [0-9]*\.[0-9]* version %s
+>4 regex [0-9]+\\.[0-9]+ version %s
0 string HCB LADS Caris Binary Format (CBF) bathymetric lidar waveform data
>3 byte x version %d .
@@ -69,7 +69,7 @@
# mb121 https://www.saic.com/maritime/gsf/
8 string GSF-v SAIC generic sensor format (GSF) sonar data,
->&0 regex [0-9]*\.[0-9]* version %s
+>&0 regex [0-9]+\\.[0-9]+ version %s
# MGD77 - https://www.ngdc.noaa.gov/mgg/dat/geodas/docs/mgd77.htm
# mb161
diff --git a/magic/Magdir/windows b/magic/Magdir/windows
index 8a7923f..1247e03 100644
--- a/magic/Magdir/windows
+++ b/magic/Magdir/windows
@@ -358,7 +358,7 @@
# skip space at beginning
>0 string \040
# name without extension and greater character or name with hlp extension
->>1 regex/c \^([^\xd>]*|.*\.hlp) MS Windows help file Content, based "%s"
+>>1 regex/c \^([^\xd>]*|.*\\.hlp) MS Windows help file Content, based "%s"
!:mime text/plain
!:apple ????TEXT
!:ext cnt
@@ -535,7 +535,7 @@
# http://www.winfaq.de/faq_html/Content/tip2500/onlinefaq.php?h=tip2653.htm
# https://msdn.microsoft.com/en-us/library/windows/desktop/cc144102.aspx
# .ShellClassInfo DeleteOnCopy LocalizedFileNames ASCII coded case-independent
->>&0 regex/c \^(\.ShellClassInfo|DeleteOnCopy|LocalizedFileNames)] Windows desktop.ini
+>>&0 regex/1024c \^(\\.ShellClassInfo|DeleteOnCopy|LocalizedFileNames)] Windows desktop.ini
!:mime application/x-wine-extension-ini
#!:mime text/plain
# https://support.microsoft.com/kb/84709/

View File

@ -0,0 +1,79 @@
From 14b5d7aa0b55275969809fdf84e8a8caee857c0f Mon Sep 17 00:00:00 2001
From: Christos Zoulas <christos@zoulas.com>
Date: Mon, 18 Apr 2022 21:38:10 +0000
Subject: [PATCH] From Dirk Mueller: * regex rules need literal dots escaped,
otherwise they are considered any character * literal search strings can be
searched using search rather than the much more expensive regex * use
standard xml declaration search as used in other format matchers * only match
the first 1024 bytes, the information we look for should be in the very
first tag * remove unnecessary parentheses
---
magic/Magdir/dataone | 28 ++++++++++++++--------------
1 file changed, 14 insertions(+), 14 deletions(-)
diff --git a/magic/Magdir/dataone b/magic/Magdir/dataone
index 8ef3f7981..566633eff 100644
--- a/magic/Magdir/dataone
+++ b/magic/Magdir/dataone
@@ -1,6 +1,6 @@
#------------------------------------------------------------------------------
-# $File: dataone,v 1.2 2019/04/19 00:42:27 christos Exp $
+# $File: dataone,v 1.3 2022/04/18 21:38:10 christos Exp $
#
# DataONE- files from Dave Vieglais <dave.vieglais@gmail.com> &
# Pratik Shrivastava <pratikshrivastava23@gmail.com>
@@ -9,39 +9,39 @@
#------------------------------------------------------------------------------
# EML (Ecological Metadata Language Format)
-0 string <?xml
->&0 regex (eml)-[0-9].[0-9].[0-9]+ eml://ecoinformatics.org/%s
+0 string \<?xml\ version=
+>&0 regex/1024 eml-[0-9]\\.[0-9]\\.[0-9]+ eml://ecoinformatics.org/%s
# onedcx (DataONE Dublin Core Extended v1.0)
->&0 regex (onedcx/v)[0-9].[0-9]+ https://ns.dataone.org/metadata/schema/onedcx/v1.0
+>&0 regex/1024 onedcx/v[0-9]\\.[0-9]+ https://ns.dataone.org/metadata/schema/onedcx/v1.0
# FGDC-STD-001-1998 (Content Standard for Digital Geospatial Metadata,
# version 001-1998)
->&0 regex fgdc FGDC-STD-001-1998
+>&0 search/1024 fgdc FGDC-STD-001-1998
# Mercury (Oak Ridge National Lab Mercury Metadata version 1.0)
->&0 regex (mercury/terms/v)[0-9].[0-9] https://purl.org/ornl/schema/mercury/terms/v1.0
+>&0 regex/1024 mercury/terms/v[0-9]\\.[0-9] https://purl.org/ornl/schema/mercury/terms/v1.0
# ISOTC211 (Geographic MetaData (GMD) Extensible Markup Language)
->&0 regex isotc211
->>&0 regex eng;USA https://www.isotc211.org/2005/gmd
+>&0 search/1024 isotc211
+>>&0 search/1024 eng;USA https://www.isotc211.org/2005/gmd
# ISOTC211 (NOAA Variant Geographic MetaData (GMD) Extensible Markup Language)
->>&0 regex gov.noaa.nodc:[0-9]+ https://www.isotc211.org/2005/gmd-noaa
+>>&0 regex/1024 gov\\.noaa\\.nodc:[0-9]+ https://www.isotc211.org/2005/gmd-noaa
# ISOTC211 PANGAEA Variant Geographic MetaData (GMD) Extensible Markup Language
->>&0 regex pangaea.dataset[0-9][0-9][0-9][0-9][0-9][0-9]+ https://www.isotc211.org/2005/gmd-pangaea
+>>&0 regex/1024 pangaea\\.dataset[0-9][0-9][0-9][0-9][0-9][0-9]+ https://www.isotc211.org/2005/gmd-pangaea
!:mime text/xml
# Object Reuse and Exchange Vocabulary
-0 string <?xml
->&0 regex rdf
->>&0 regex openarchives https://www.openarchives.org/ore/terms
+0 string \<?xml\ version=
+>&0 search/1024 rdf
+>>&0 search/1024 openarchives https://www.openarchives.org/ore/terms
!:mime application/rdf+xml
# Dryad Metadata Application Profile Version 3.1
0 string <DryadData
->&0 regex (dryad-bibo/v)[0-9].[0-9] https://datadryad.org/profile/v3.1
+>&0 regex/1024 dryad-bibo/v[0-9]\\.[0-9] https://datadryad.org/profile/v3.1
!:mime text/xml

View File

@ -15,7 +15,7 @@
Summary: Utility for determining file types
Name: file
Version: 5.39
Release: 9%{?dist}
Release: 10%{?dist}
License: BSD
Source0: http://ftp.astron.com/pub/file/file-%{version}.tar.gz
@ -35,6 +35,21 @@ Patch4: file-5.40-magic-python.patch
# not yet upstream
Patch5: file-5.33-fix-compression.patch
# Upstream commit 4c8a4d8dbab1e73bfb30e391dcec49fcf269f84d (#2120692)
Patch6: file-5.39-regex-caching-1.patch
# Upstream commit 7d438e28c16773e28a3707935c8e5d9927a515a7 (#2120692)
Patch7: file-5.39-regex-caching-2.patch
# Upstream commit d1a00ae92b2cf09298615cf3aba474d8fec7380f (#2120692)
Patch8: file-5.39-regex-caching-3.patch
# Upstream commit 4254a0afecfd2ae200c694dfcf93f4b4ac21652e (#2120692)
Patch9: file-5.39-regex-caching-4.patch
# Upstream commit a4b3abc4104d8a4eeb84a6242f2f1a830204a539 (#2120692)
Patch10: file-5.39-regex-combinations.patch
# Upstream commit d17d8e9ff8ad8e95fdf66239ccdcc2133d1ce5ce (#2120692)
Patch11: file-5.39-regex-escape.patch
# Upstream commit 14b5d7aa0b55275969809fdf84e8a8caee857c0f (#2120692)
Patch12: file-5.39-regex-optimalizations.patch
URL: https://www.darwinsys.com/file/
Requires: file-libs%{?_isa} = %{version}-%{release}
BuildRequires: zlib-devel
@ -213,6 +228,10 @@ cd %{py3dir}
%endif
%changelog
* Wed Aug 24 2022 Vincent Mihalkovic <vmihalko@redhat.com> - 5.39-10
- speedup magic matching
Resolves: #2120692
* Wed Aug 17 2022 Vincent Mihalkovic <vmihalko@redhat.com> - 5.39-9
- fix recognition (src/compress.c) of compressed empty files
Resolves: #2121694