sort: fix multibyte incompatibilities (#821264)

This commit is contained in:
Ondrej Oprala 2013-08-14 19:01:16 +02:00
parent bce5991c5f
commit 0dcc5a0d5e
2 changed files with 128 additions and 100 deletions

View File

@ -2419,8 +2419,8 @@ diff -urNp coreutils-8.21-orig/src/pr.c coreutils-8.21/src/pr.c
looking for more options and printing the next batch of files.
diff -urNp coreutils-8.21-orig/src/sort.c coreutils-8.21/src/sort.c
--- coreutils-8.21-orig/src/sort.c 2013-01-31 01:46:24.000000000 +0100
+++ coreutils-8.21/src/sort.c 2013-02-15 14:25:07.828467769 +0100
--- coreutils-8.21-orig/src/sort.c 2013-08-14 18:14:06.172216606 +0200
+++ coreutils-8.21/src/sort.c 2013-08-14 18:13:30.295247905 +0200
@@ -29,6 +29,14 @@
#include <sys/wait.h>
#include <signal.h>
@ -2440,8 +2440,8 @@ diff -urNp coreutils-8.21-orig/src/sort.c coreutils-8.21/src/sort.c
/* Nonzero if the corresponding locales are hard. */
static bool hard_LC_COLLATE;
-#if HAVE_NL_LANGINFO
+#if HAVE_LANGINFO_CODESET
-#if HAVE_NL_LANGINFO
static bool hard_LC_TIME;
#endif
@ -2476,16 +2476,16 @@ diff -urNp coreutils-8.21-orig/src/sort.c coreutils-8.21/src/sort.c
they were read if all keys compare equal. */
static bool stable;
+/* Tab character separating fields. If tab_length is 0, then fields are
-/* If TAB has this value, blanks separate fields. */
-enum { TAB_DEFAULT = CHAR_MAX + 1 };
-
-/* Tab character separating fields. If TAB_DEFAULT, then fields are
+/* Tab character separating fields. If tab_length is 0, then fields are
separated by the empty string between a non-blank character and a blank
character. */
-static int tab = TAB_DEFAULT;
+static char tab[MB_LEN_MAX + 1];
+static size_t tab_length = 0;
-static int tab = TAB_DEFAULT;
/* Flag to remove consecutive duplicate lines from the output.
Only the last of a sequence of equal lines will be output. */
@ -2540,8 +2540,8 @@ diff -urNp coreutils-8.21-orig/src/sort.c coreutils-8.21/src/sort.c
free (node);
}
-#if HAVE_NL_LANGINFO
+#if HAVE_LANGINFO_CODESET
-#if HAVE_NL_LANGINFO
static int
struct_month_cmp (void const *m1, void const *m2)
@ -2549,17 +2549,17 @@ diff -urNp coreutils-8.21-orig/src/sort.c coreutils-8.21/src/sort.c
/* Initialize the character class tables. */
static void
-inittables (void)
+inittables_uni (void)
-inittables (void)
{
size_t i;
@@ -1250,7 +1318,7 @@ inittables (void)
@@ -1250,7 +1318,7 @@ inittables_uni (void)
fold_toupper[i] = toupper (i);
}
-#if HAVE_NL_LANGINFO
+#if HAVE_LANGINFO_CODESET
-#if HAVE_NL_LANGINFO
/* If we're not in the "C" locale, read different names for months. */
if (hard_LC_TIME)
{
@ -2652,25 +2652,25 @@ diff -urNp coreutils-8.21-orig/src/sort.c coreutils-8.21/src/sort.c
by KEY in LINE. */
static char *
-begfield (struct line const *line, struct keyfield const *key)
+begfield_uni (const struct line *line, const struct keyfield *key)
-begfield (struct line const *line, struct keyfield const *key)
{
char *ptr = line->text, *lim = ptr + line->length - 1;
size_t sword = key->sword;
@@ -1573,10 +1719,10 @@ begfield (struct line const *line, struc
@@ -1573,10 +1719,10 @@ begfield_uni (const struct line *line, c
/* The leading field separator itself is included in a field when -t
is absent. */
- if (tab != TAB_DEFAULT)
+ if (tab_length)
- if (tab != TAB_DEFAULT)
while (ptr < lim && sword--)
{
- while (ptr < lim && *ptr != tab)
+ while (ptr < lim && *ptr != tab[0])
- while (ptr < lim && *ptr != tab)
++ptr;
if (ptr < lim)
++ptr;
@@ -1602,11 +1748,70 @@ begfield (struct line const *line, struc
@@ -1602,11 +1748,70 @@ begfield_uni (const struct line *line, c
return ptr;
}
@ -2737,38 +2737,38 @@ diff -urNp coreutils-8.21-orig/src/sort.c coreutils-8.21/src/sort.c
in LINE specified by KEY. */
static char *
-limfield (struct line const *line, struct keyfield const *key)
+limfield_uni (const struct line *line, const struct keyfield *key)
-limfield (struct line const *line, struct keyfield const *key)
{
char *ptr = line->text, *lim = ptr + line->length - 1;
size_t eword = key->eword, echar = key->echar;
@@ -1621,10 +1826,10 @@ limfield (struct line const *line, struc
@@ -1621,10 +1826,10 @@ limfield_uni (const struct line *line, c
'beginning' is the first character following the delimiting TAB.
Otherwise, leave PTR pointing at the first 'blank' character after
the preceding field. */
- if (tab != TAB_DEFAULT)
+ if (tab_length)
- if (tab != TAB_DEFAULT)
while (ptr < lim && eword--)
{
- while (ptr < lim && *ptr != tab)
+ while (ptr < lim && *ptr != tab[0])
- while (ptr < lim && *ptr != tab)
++ptr;
if (ptr < lim && (eword || echar))
++ptr;
@@ -1670,10 +1875,10 @@ limfield (struct line const *line, struc
@@ -1670,10 +1875,10 @@ limfield_uni (const struct line *line, c
*/
/* Make LIM point to the end of (one byte past) the current field. */
- if (tab != TAB_DEFAULT)
+ if (tab_length)
- if (tab != TAB_DEFAULT)
{
char *newlim;
- newlim = memchr (ptr, tab, lim - ptr);
+ newlim = memchr (ptr, tab[0], lim - ptr);
- newlim = memchr (ptr, tab, lim - ptr);
if (newlim)
lim = newlim;
}
@@ -1704,6 +1909,130 @@ limfield (struct line const *line, struc
@@ -1704,6 +1909,130 @@ limfield_uni (const struct line *line, c
return ptr;
}
@ -2857,7 +2857,7 @@ diff -urNp coreutils-8.21-orig/src/sort.c coreutils-8.21/src/sort.c
+ {
+ /* If we're skipping leading blanks, don't start counting characters
+ * until after skipping past any leading blanks. */
+ if (key->skipsblanks)
+ if (key->skipeblanks)
+ while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength))
+ ptr += mblength;
+
@ -2903,8 +2903,6 @@ diff -urNp coreutils-8.21-orig/src/sort.c coreutils-8.21/src/sort.c
else
{
if (key->skipsblanks)
- while (blanks[to_uchar (*line_start)])
- line_start++;
+ {
+#if HAVE_MBRTOWC
+ if (MB_CUR_MAX > 1)
@ -2921,6 +2919,8 @@ diff -urNp coreutils-8.21-orig/src/sort.c coreutils-8.21/src/sort.c
+ while (blanks[to_uchar (*line_start)])
+ line_start++;
+ }
- while (blanks[to_uchar (*line_start)])
- line_start++;
line->keybeg = line_start;
}
}
@ -2928,12 +2928,12 @@ diff -urNp coreutils-8.21-orig/src/sort.c coreutils-8.21/src/sort.c
hideously fast. */
static int
-numcompare (char const *a, char const *b)
+numcompare_uni (const char *a, const char *b)
-numcompare (char const *a, char const *b)
{
while (blanks[to_uchar (*a)])
a++;
@@ -1922,6 +2265,25 @@ numcompare (char const *a, char const *b
@@ -1922,6 +2265,25 @@ numcompare_uni (const char *a, const cha
return strnumcmp (a, b, decimal_point, thousands_sep);
}
@ -2963,8 +2963,8 @@ diff -urNp coreutils-8.21-orig/src/sort.c coreutils-8.21/src/sort.c
Return 0 if the name in S is not recognized. */
static int
-getmonth (char const *month, char **ea)
+getmonth_uni (char const *month, size_t len, char **ea)
-getmonth (char const *month, char **ea)
{
size_t lo = 0;
size_t hi = MONTHS_PER_YEAR;
@ -2972,17 +2972,17 @@ diff -urNp coreutils-8.21-orig/src/sort.c coreutils-8.21/src/sort.c
char saved = *lim;
*lim = '\0';
+ skipblanks (&beg, lim);
- while (blanks[to_uchar (*beg)])
- beg++;
+ skipblanks (&beg, lim);
char *tighter_lim = beg;
if (lim < beg)
tighter_lim = lim;
else if (key->month)
- getmonth (beg, &tighter_lim);
+ getmonth (beg, lim-beg, &tighter_lim);
- getmonth (beg, &tighter_lim);
else if (key->general_numeric)
ignore_value (strtold (beg, &tighter_lim));
else if (key->numeric || key->human_numeric)
@ -2990,8 +2990,8 @@ diff -urNp coreutils-8.21-orig/src/sort.c coreutils-8.21/src/sort.c
bool maybe_space_aligned = !hard_LC_COLLATE && default_key_compare (key)
&& !(key->schar || key->echar);
bool line_offset = key->eword == 0 && key->echar != 0; /* -k1.x,1.y */
- if (!gkey_only && tab == TAB_DEFAULT && !line_offset
+ if (!gkey_only && !tab_length && !line_offset
- if (!gkey_only && tab == TAB_DEFAULT && !line_offset
&& ((!key->skipsblanks && !(implicit_skip || maybe_space_aligned))
|| (!key->skipsblanks && key->schar)
|| (!key->skipeblanks && key->echar)))
@ -3079,21 +3079,21 @@ diff -urNp coreutils-8.21-orig/src/sort.c coreutils-8.21/src/sort.c
are no more keys or a difference is found. */
static int
-keycompare (struct line const *a, struct line const *b)
+keycompare_uni (const struct line *a, const struct line *b)
-keycompare (struct line const *a, struct line const *b)
{
struct keyfield *key = keylist;
@@ -2546,7 +2983,7 @@ keycompare (struct line const *a, struct
@@ -2546,7 +2983,7 @@ keycompare_uni (const struct line *a, co
else if (key->human_numeric)
diff = human_numcompare (ta, tb);
else if (key->month)
- diff = getmonth (ta, NULL) - getmonth (tb, NULL);
+ diff = getmonth (ta, tlena, NULL) - getmonth (tb, tlenb, NULL);
- diff = getmonth (ta, NULL) - getmonth (tb, NULL);
else if (key->random)
diff = compare_random (ta, tlena, tb, tlenb);
else if (key->version)
@@ -2662,6 +3099,181 @@ keycompare (struct line const *a, struct
@@ -2662,6 +3099,191 @@ keycompare_uni (const struct line *a, co
return key->reverse ? -diff : diff;
}
@ -3114,45 +3114,14 @@ diff -urNp coreutils-8.21-orig/src/sort.c coreutils-8.21/src/sort.c
+ wchar_t wc_a, wc_b;
+ mbstate_t state_a, state_b;
+
+ int diff;
+ int diff = 0;
+
+ memset (&state_a, '\0', sizeof(mbstate_t));
+ memset (&state_b, '\0', sizeof(mbstate_t));
+ /* Ignore keys with start after end. */
+ if (a->keybeg - a->keylim > 0)
+ return 0;
+
+ for (;;)
+ {
+ char const *translate = key->translate;
+ bool const *ignore = key->ignore;
+
+ /* Find the lengths. */
+ size_t lena = lima <= texta ? 0 : lima - texta;
+ size_t lenb = limb <= textb ? 0 : limb - textb;
+
+ /* Actually compare the fields. */
+ if (key->random)
+ diff = compare_random (texta, lena, textb, lenb);
+ else if (key->numeric | key->general_numeric | key->human_numeric)
+ {
+ char savea = *lima, saveb = *limb;
+
+ *lima = *limb = '\0';
+ diff = (key->numeric ? numcompare (texta, textb)
+ : key->general_numeric ? general_numcompare (texta, textb)
+ : human_numcompare (texta, textb));
+ *lima = savea, *limb = saveb;
+ }
+ else if (key->version)
+ diff = filevercmp (texta, textb);
+ else if (key->month)
+ diff = getmonth (texta, lena, NULL) - getmonth (textb, lenb, NULL);
+ else
+ {
+ if (ignore || translate)
+ {
+ char *copy_a = (char *) xmalloc (lena + 1 + lenb + 1);
+ char *copy_b = copy_a + lena + 1;
+ size_t new_len_a, new_len_b;
+ size_t i, j;
+
+ /* Ignore and/or translate chars before comparing. */
+# define IGNORE_CHARS(NEW_LEN, LEN, TEXT, COPY, WC, MBLENGTH, STATE) \
@ -3220,21 +3189,63 @@ diff -urNp coreutils-8.21-orig/src/sort.c coreutils-8.21/src/sort.c
+ COPY[NEW_LEN] = '\0'; \
+ } \
+ while (0)
+
+ /* Actually compare the fields. */
+
+ for (;;)
+ {
+ /* Find the lengths. */
+ size_t lena = lima <= texta ? 0 : lima - texta;
+ size_t lenb = limb <= textb ? 0 : limb - textb;
+
+ char const *translate = key->translate;
+ bool const *ignore = key->ignore;
+
+ if (ignore || translate)
+ {
+ char *copy_a = (char *) xmalloc (lena + 1 + lenb + 1);
+ char *copy_b = copy_a + lena + 1;
+ size_t new_len_a, new_len_b;
+ size_t i, j;
+
+ IGNORE_CHARS (new_len_a, lena, texta, copy_a,
+ wc_a, mblength_a, state_a);
+ IGNORE_CHARS (new_len_b, lenb, textb, copy_b,
+ wc_b, mblength_b, state_b);
+ diff = xmemcoll (copy_a, new_len_a, copy_b, new_len_b);
+ free(copy_a);
+ texta = copy_a; textb = copy_b;
+ lena = new_len_a; lenb = new_len_b;
+ }
+
+ if (key->random)
+ diff = compare_random (texta, lena, textb, lenb);
+ else if (key->numeric | key->general_numeric | key->human_numeric)
+ {
+ char savea = *lima, saveb = *limb;
+
+ *lima = *limb = '\0';
+ diff = (key->numeric ? numcompare (texta, textb)
+ : key->general_numeric ? general_numcompare (texta, textb)
+ : human_numcompare (texta, textb));
+ *lima = savea, *limb = saveb;
+ }
+ else if (key->version)
+ diff = filevercmp (texta, textb);
+ else if (key->month)
+ diff = getmonth (texta, lena, NULL) - getmonth (textb, lenb, NULL);
+ else if (lena == 0)
+ diff = - NONZERO (lenb);
+ else if (lenb == 0)
+ goto greater;
+ diff = 1;
+ else
+ {
+ diff = memcmp (texta, textb, MIN (lena,lenb));
+ if (!diff)
+ diff = xmemcoll (texta, lena, textb, lenb);
+ }
+
+ if (ignore || translate)
+ free (texta);
+
+ if (diff)
+ goto not_equal;
+
@ -3263,28 +3274,42 @@ diff -urNp coreutils-8.21-orig/src/sort.c coreutils-8.21/src/sort.c
+ }
+ }
+
+ return 0;
+
+greater:
+ diff = 1;
+not_equal:
+ return key->reverse ? -diff : diff;
+ if (key && key->reverse)
+ return -diff;
+ else
+ return diff;
+}
+#endif
+
/* Compare two lines A and B, returning negative, zero, or positive
depending on whether A compares less than, equal to, or greater than B. */
@@ -4157,7 +4769,7 @@ main (int argc, char **argv)
@@ -2689,14 +3311,6 @@ compare (struct line const *a, struct li
diff = - NONZERO (blen);
else if (blen == 0)
diff = 1;
- else if (hard_LC_COLLATE)
- {
- /* Note xmemcoll0 is a performance enhancement as
- it will not unconditionally write '\0' after the
- passed in buffers, which was seen to give around
- a 3% increase in performance for short lines. */
- diff = xmemcoll0 (a->text, alen + 1, b->text, blen + 1);
- }
else if (! (diff = memcmp (a->text, b->text, MIN (alen, blen))))
diff = alen < blen ? -1 : alen != blen;
@@ -4157,7 +4771,7 @@ main (int argc, char **argv)
initialize_exit_failure (SORT_FAILURE);
hard_LC_COLLATE = hard_locale (LC_COLLATE);
-#if HAVE_NL_LANGINFO
+#if HAVE_LANGINFO_CODESET
-#if HAVE_NL_LANGINFO
hard_LC_TIME = hard_locale (LC_TIME);
#endif
@@ -4178,6 +4790,29 @@ main (int argc, char **argv)
@@ -4178,6 +4792,29 @@ main (int argc, char **argv)
thousands_sep = -1;
}
@ -3314,18 +3339,17 @@ diff -urNp coreutils-8.21-orig/src/sort.c coreutils-8.21/src/sort.c
have_read_stdin = false;
inittables ();
@@ -4452,13 +5087,34 @@ main (int argc, char **argv)
@@ -4452,13 +5089,34 @@ main (int argc, char **argv)
case 't':
{
- char newtab = optarg[0];
- if (! newtab)
+ char newtab[MB_LEN_MAX + 1];
+ size_t newtab_length = 1;
+ strncpy (newtab, optarg, MB_LEN_MAX);
+ if (! newtab[0])
- char newtab = optarg[0];
- if (! newtab)
error (SORT_FAILURE, 0, _("empty tab"));
- if (optarg[1])
+#if HAVE_MBRTOWC
+ if (MB_CUR_MAX > 1)
+ {
@ -3346,25 +3370,26 @@ diff -urNp coreutils-8.21-orig/src/sort.c coreutils-8.21/src/sort.c
+ }
+#endif
+ if (newtab_length == 1 && optarg[1])
- if (optarg[1])
{
if (STREQ (optarg, "\\0"))
- newtab = '\0';
+ newtab[0] = '\0';
- newtab = '\0';
else
{
/* Provoke with 'sort -txx'. Complain about
@@ -4469,9 +5125,12 @@ main (int argc, char **argv)
@@ -4469,9 +5127,12 @@ main (int argc, char **argv)
quote (optarg));
}
}
- if (tab != TAB_DEFAULT && tab != newtab)
+ if (tab_length
+ && (tab_length != newtab_length
+ || memcmp (tab, newtab, tab_length) != 0))
- if (tab != TAB_DEFAULT && tab != newtab)
error (SORT_FAILURE, 0, _("incompatible tabs"));
- tab = newtab;
+ memcpy (tab, newtab, newtab_length);
+ tab_length = newtab_length;
- tab = newtab;
}
break;

View File

@ -1,7 +1,7 @@
Summary: A set of basic GNU tools commonly used in shell scripts
Name: coreutils
Version: 8.21
Release: 16%{?dist}
Release: 17%{?dist}
License: GPLv3+
Group: System Environment/Base
Url: http://www.gnu.org/software/coreutils/
@ -375,6 +375,9 @@ fi
%{_sbindir}/chroot
%changelog
* Wed Aug 14 2013 Ondrej Oprala <ooprala@redhat.com> 8.21-17
- Fix sort multibyte incompatibilities
* Sat Aug 03 2013 Fedora Release Engineering <rel-eng@lists.fedoraproject.org> - 8.21-16
- Rebuilt for https://fedoraproject.org/wiki/Fedora_20_Mass_Rebuild