diff --git a/string/Makefile b/string/Makefile index aaad743..f70dc7a 100644 --- a/string/Makefile +++ b/string/Makefile @@ -57,6 +57,8 @@ tests := tester inl-tester noinl-tester testcopy test-ffs \ tests-ifunc := $(strop-tests:%=test-%-ifunc) tests += $(tests-ifunc) +xtests = tst-strcoll-overflow + include ../Rules tester-ENV = LANGUAGE=C diff --git a/string/strcoll_l.c b/string/strcoll_l.c index 50ed84d..4ee101a 100644 --- a/string/strcoll_l.c +++ b/string/strcoll_l.c @@ -45,7 +45,7 @@ typedef struct { int len; /* Length of the current sequence. */ - int val; /* Position of the sequence relative to the + size_t val; /* Position of the sequence relative to the previous non-ignored sequence. */ size_t idxnow; /* Current index in sequences. */ size_t idxmax; /* Maximum index in sequences. */ @@ -55,6 +55,12 @@ typedef struct const USTRING_TYPE *us; /* The string. */ int32_t *idxarr; /* Array to cache weight indices. */ unsigned char *rulearr; /* Array to cache rules. */ + unsigned char rule; /* Saved rule for the first sequence. */ + int32_t idx; /* Index to weight of the current sequence. */ + int32_t save_idx; /* Save looked up index of a forward + sequence after the last backward + sequence. */ + const USTRING_TYPE *back_us; /* Beginning of the backward sequence. */ } coll_seq; /* Get next sequence. The weight indices are cached, so we don't need to @@ -64,7 +70,7 @@ get_next_seq_cached (coll_seq *seq, int nrules, int pass, const unsigned char *rulesets, const USTRING_TYPE *weights) { - int val = seq->val = 0; + size_t val = seq->val = 0; int len = seq->len; size_t backw_stop = seq->backw_stop; size_t backw = seq->backw; @@ -146,7 +152,7 @@ get_next_seq (coll_seq *seq, int nrules, const unsigned char *rulesets, const USTRING_TYPE *extra, const int32_t *indirect) { #include WEIGHT_H - int val = seq->val = 0; + size_t val = seq->val = 0; int len = seq->len; size_t backw_stop = seq->backw_stop; size_t backw = seq->backw; @@ -162,7 +168,7 @@ get_next_seq (coll_seq *seq, int nrules, const unsigned char *rulesets, ++val; if (backw_stop != ~0ul) { - /* The is something pushed. */ + /* There is something pushed. */ if (backw == backw_stop) { /* The last pushed character was handled. Continue @@ -227,15 +233,199 @@ get_next_seq (coll_seq *seq, int nrules, const unsigned char *rulesets, seq->us = us; } -/* Compare two sequences. */ +/* Get next sequence. Traverse the string as required. This function does not + set or use any index or rule cache. */ +static void +get_next_seq_nocache (coll_seq *seq, int nrules, const unsigned char *rulesets, + const USTRING_TYPE *weights, const int32_t *table, + const USTRING_TYPE *extra, const int32_t *indirect, + int pass) +{ +#include WEIGHT_H + size_t val = seq->val = 0; + int len = seq->len; + size_t backw_stop = seq->backw_stop; + size_t backw = seq->backw; + size_t idxcnt = seq->idxcnt; + size_t idxmax = seq->idxmax; + int32_t idx = seq->idx; + const USTRING_TYPE *us = seq->us; + + while (len == 0) + { + ++val; + if (backw_stop != ~0ul) + { + /* There is something pushed. */ + if (backw == backw_stop) + { + /* The last pushed character was handled. Continue + with forward characters. */ + if (idxcnt < idxmax) + { + idx = seq->save_idx; + backw_stop = ~0ul; + } + else + { + /* Nothing anymore. The backward sequence ended with + the last sequence in the string. Note that len is + still zero. */ + idx = 0; + break; + } + } + else + { + /* XXX Traverse BACKW sequences from the beginning of + BACKW_STOP to get the next sequence. Is ther a quicker way + to do this? */ + size_t i = backw_stop; + us = seq->back_us; + while (i < backw) + { + int32_t tmp = findidx (&us, -1); + idx = tmp & 0xffffff; + i++; + } + --backw; + us = seq->us; + } + } + else + { + backw_stop = idxmax; + int32_t prev_idx = idx; + + while (*us != L('\0')) + { + int32_t tmp = findidx (&us, -1); + unsigned char rule = tmp >> 24; + prev_idx = idx; + idx = tmp & 0xffffff; + idxcnt = idxmax++; + + /* Save the rule for the first sequence. */ + if (__glibc_unlikely (idxcnt == 0)) + seq->rule = rule; + + if ((rulesets[rule * nrules + pass] + & sort_backward) == 0) + /* No more backward characters to push. */ + break; + ++idxcnt; + } + + if (backw_stop >= idxcnt) + { + /* No sequence at all or just one. */ + if (idxcnt == idxmax || backw_stop > idxcnt) + /* Note that len is still zero. */ + break; + + backw_stop = ~0ul; + } + else + { + /* We pushed backward sequences. If the stream ended with the + backward sequence, then we process the last sequence we + found. Otherwise we process the sequence before the last + one since the last one was a forward sequence. */ + seq->back_us = seq->us; + seq->us = us; + backw = idxcnt; + if (idxmax > idxcnt) + { + backw--; + seq->save_idx = idx; + idx = prev_idx; + } + if (backw > backw_stop) + backw--; + } + } + + len = weights[idx++]; + /* Skip over indices of previous levels. */ + for (int i = 0; i < pass; i++) + { + idx += len; + len = weights[idx]; + idx++; + } + } + + /* Update the structure. */ + seq->val = val; + seq->len = len; + seq->backw_stop = backw_stop; + seq->backw = backw; + seq->idxcnt = idxcnt; + seq->idxmax = idxmax; + seq->us = us; + seq->idx = idx; +} + +/* Compare two sequences. This version does not use the index and rules + cache. */ +static int +do_compare_nocache (coll_seq *seq1, coll_seq *seq2, int position, + const USTRING_TYPE *weights) +{ + int seq1len = seq1->len; + int seq2len = seq2->len; + size_t val1 = seq1->val; + size_t val2 = seq2->val; + int idx1 = seq1->idx; + int idx2 = seq2->idx; + int result = 0; + + /* Test for position if necessary. */ + if (position && val1 != val2) + { + result = val1 > val2 ? 1 : -1; + goto out; + } + + /* Compare the two sequences. */ + do + { + if (weights[idx1] != weights[idx2]) + { + /* The sequences differ. */ + result = weights[idx1] - weights[idx2]; + goto out; + } + + /* Increment the offsets. */ + ++idx1; + ++idx2; + + --seq1len; + --seq2len; + } + while (seq1len > 0 && seq2len > 0); + + if (position && seq1len != seq2len) + result = seq1len - seq2len; + +out: + seq1->len = seq1len; + seq2->len = seq2len; + seq1->idx = idx1; + seq2->idx = idx2; + return result; +} + +/* Compare two sequences using the index cache. */ static int do_compare (coll_seq *seq1, coll_seq *seq2, int position, const USTRING_TYPE *weights) { int seq1len = seq1->len; int seq2len = seq2->len; - int val1 = seq1->val; - int val2 = seq2->val; + size_t val1 = seq1->val; + size_t val2 = seq2->val; int32_t *idx1arr = seq1->idxarr; int32_t *idx2arr = seq2->idxarr; int idx1now = seq1->idxnow; @@ -245,7 +435,7 @@ do_compare (coll_seq *seq1, coll_seq *seq2, int position, /* Test for position if necessary. */ if (position && val1 != val2) { - result = val1 - val2; + result = val1 > val2 ? 1 : -1; goto out; } @@ -334,57 +524,70 @@ STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, __locale_t l) memset (&seq1, 0, sizeof (seq1)); seq2 = seq1; - /* We need the elements of the strings as unsigned values since they - are used as indices. */ - seq1.us = (const USTRING_TYPE *) s1; - seq2.us = (const USTRING_TYPE *) s2; + size_t size_max = SIZE_MAX / (sizeof (int32_t) + 1); - if (! __libc_use_alloca ((s1len + s2len) * (sizeof (int32_t) + 1))) + if (MIN (s1len, s2len) > size_max + || MAX (s1len, s2len) > size_max - MIN (s1len, s2len)) + { + /* If the strings are long enough to cause overflow in the size request, + then skip the allocation and proceed with the non-cached routines. */ + } + else if (! __libc_use_alloca ((s1len + s2len) * (sizeof (int32_t) + 1))) { seq1.idxarr = (int32_t *) malloc ((s1len + s2len) * (sizeof (int32_t) + 1)); - seq2.idxarr = &seq1.idxarr[s1len]; - seq1.rulearr = (unsigned char *) &seq2.idxarr[s2len]; - seq2.rulearr = &seq1.rulearr[s1len]; - - if (seq1.idxarr == NULL) - /* No memory. Well, go with the stack then. - - XXX Once this implementation is stable we will handle this - differently. Instead of precomputing the indices we will - do this in time. This means, though, that this happens for - every pass again. */ - goto try_stack; - use_malloc = true; + + /* If we failed to allocate memory, we leave everything as NULL so that + we use the nocache version of traversal and comparison functions. */ + if (seq1.idxarr != NULL) + { + seq2.idxarr = &seq1.idxarr[s1len]; + seq1.rulearr = (unsigned char *) &seq2.idxarr[s2len]; + seq2.rulearr = &seq1.rulearr[s1len]; + use_malloc = true; + } } else { - try_stack: seq1.idxarr = (int32_t *) alloca (s1len * sizeof (int32_t)); seq2.idxarr = (int32_t *) alloca (s2len * sizeof (int32_t)); seq1.rulearr = (unsigned char *) alloca (s1len); seq2.rulearr = (unsigned char *) alloca (s2len); } - seq1.rulearr[0] = 0; + int rule = 0; /* Cache values in the first pass and if needed, use them in subsequent passes. */ for (int pass = 0; pass < nrules; ++pass) { seq1.idxcnt = 0; + seq1.idx = 0; + seq2.idx = 0; seq1.backw_stop = ~0ul; seq1.backw = ~0ul; seq2.idxcnt = 0; seq2.backw_stop = ~0ul; seq2.backw = ~0ul; + /* We need the elements of the strings as unsigned values since they + are used as indices. */ + seq1.us = (const USTRING_TYPE *) s1; + seq2.us = (const USTRING_TYPE *) s2; + /* We assume that if a rule has defined `position' in one section this is true for all of them. */ - int position = rulesets[seq1.rulearr[0] * nrules + pass] & sort_position; + int position = rulesets[rule * nrules + pass] & sort_position; while (1) { - if (pass == 0) + if (__glibc_unlikely (seq1.idxarr == NULL)) + { + get_next_seq_nocache (&seq1, nrules, rulesets, weights, table, + extra, indirect, pass); + get_next_seq_nocache (&seq2, nrules, rulesets, weights, table, + extra, indirect, pass); + } + else if (pass == 0) { get_next_seq (&seq1, nrules, rulesets, weights, table, extra, indirect); @@ -411,10 +614,18 @@ STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, __locale_t l) goto free_and_return; } - result = do_compare (&seq1, &seq2, position, weights); + if (__glibc_unlikely (seq1.idxarr == NULL)) + result = do_compare_nocache (&seq1, &seq2, position, weights); + else + result = do_compare (&seq1, &seq2, position, weights); if (result != 0) goto free_and_return; } + + if (__glibc_likely (seq1.rulearr != NULL)) + rule = seq1.rulearr[0]; + else + rule = seq1.rule; } /* Free the memory if needed. */ diff --git a/string/tst-strcoll-overflow.c b/string/tst-strcoll-overflow.c new file mode 100644 index 0000000..bb665ac --- /dev/null +++ b/string/tst-strcoll-overflow.c @@ -0,0 +1,61 @@ +/* Copyright (C) 2013 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include +#include +#include +#include + +/* Verify that strcoll does not crash for large strings for which it cannot + cache weight lookup results. The size is large enough to cause integer + overflows on 32-bit as well as buffer overflows on 64-bit. The test should + work reasonably reliably when overcommit is disabled, but it obviously + depends on how much memory the system has. There's a limitation to this + test in that it does not run to completion. Actually collating such a + large string can take days and we can't have xcheck running that long. For + that reason, we run the test for about 5 minutes and then assume that + everything is fine if there are no crashes. */ +#define SIZE 0x40000000ul + +int +do_test (void) +{ + if (setlocale (LC_COLLATE, "en_GB.UTF-8") == NULL) + { + puts ("setlocale failed, cannot test for overflow"); + return 0; + } + + char *p = malloc (SIZE); + + if (p == NULL) + { + puts ("could not allocate memory"); + return 1; + } + + memset (p, 'x', SIZE - 1); + p[SIZE - 1] = 0; + printf ("%d\n", strcoll (p, p)); + return 0; +} + +#define TIMEOUT 300 +#define EXPECTED_SIGNAL SIGALRM +#define TEST_FUNCTION do_test () +#include "../test-skeleton.c"