Resolves: #1355780 - fix 'sort -h -k' in locales that use blank as thousands separator

This commit is contained in:
Kamil Dudka 2016-07-19 13:52:45 +02:00
parent 6cf6cd48dd
commit 8b01f2371c
2 changed files with 342 additions and 1 deletions

View File

@ -0,0 +1,332 @@
From c479153d77b419a6cae4551b63d2b73096c1130e Mon Sep 17 00:00:00 2001
From: Kamil Dudka <kdudka@redhat.com>
Date: Mon, 18 Jul 2016 19:04:43 +0200
Subject: [PATCH 1/3] maint: sort.c: deduplicate code for traversing numbers
* src/sort.c (traverse_raw_number): New function for traversing numbers.
(find_unit_order): Use traverse_raw_number() instead of open-coding it.
(debug_key): Likewise.
---
src/sort.c | 63 ++++++++++++++++++++++++++++++++++----------------------------
1 file changed, 35 insertions(+), 28 deletions(-)
diff --git a/src/sort.c b/src/sort.c
index 5b02343..e28bb6c 100644
--- a/src/sort.c
+++ b/src/sort.c
@@ -2231,18 +2231,16 @@ static char const unit_order[UCHAR_LIM] =
#endif
};
-/* Return an integer that represents the order of magnitude of the
- unit following the number. The number may contain thousands
- separators and a decimal point, but it may not contain leading blanks.
- Negative numbers get negative orders; zero numbers have a zero order. */
-
-static int _GL_ATTRIBUTE_PURE
-find_unit_order (char const *number)
+/* Traverse the number at *number, which consists of digits, thousands_sep,
+ and decimal_point chars only. Return the highest digit found in the number,
+ or '\0' if no digit has been found. Upon return, *number points at the
+ character that immediately follows the number. */
+static unsigned char
+traverse_raw_number (char const **number)
{
- bool minus_sign = (*number == '-');
- char const *p = number + minus_sign;
- int nonzero = 0;
+ char const *p = *number;
unsigned char ch;
+ unsigned char max_digit = '\0';
/* Scan to end of number.
Decimals or separators not followed by digits stop the scan.
@@ -2253,16 +2251,34 @@ find_unit_order (char const *number)
do
{
while (ISDIGIT (ch = *p++))
- nonzero |= ch - '0';
+ if (max_digit < ch)
+ max_digit = ch;
}
while (ch == thousands_sep);
if (ch == decimal_point)
while (ISDIGIT (ch = *p++))
- nonzero |= ch - '0';
+ if (max_digit < ch)
+ max_digit = ch;
+
+ *number = p - 1;
+ return max_digit;
+}
+
+/* Return an integer that represents the order of magnitude of the
+ unit following the number. The number may contain thousands
+ separators and a decimal point, but it may not contain leading blanks.
+ Negative numbers get negative orders; zero numbers have a zero order. */
- if (nonzero)
+static int _GL_ATTRIBUTE_PURE
+find_unit_order (char const *number)
+{
+ bool minus_sign = (*number == '-');
+ char const *p = number + minus_sign;
+ unsigned char max_digit = traverse_raw_number (&p);
+ if ('0' < max_digit)
{
+ unsigned char ch = *p;
int order = unit_order[ch];
return (minus_sign ? -order : order);
}
@@ -2655,23 +2671,14 @@ debug_key (struct line const *line, struct keyfield const *key)
ignore_value (strtold (beg, &tighter_lim));
else if (key->numeric || key->human_numeric)
{
- char *p = beg + (beg < lim && *beg == '-');
- bool found_digit = false;
- unsigned char ch;
-
- do
+ char const *p = beg + (beg < lim && *beg == '-');
+ unsigned char max_digit = traverse_raw_number (&p);
+ if ('0' <= max_digit)
{
- while (ISDIGIT (ch = *p++))
- found_digit = true;
+ unsigned char ch = *p;
+ tighter_lim = (char *) p
+ + (key->human_numeric && unit_order[ch]);
}
- while (ch == thousands_sep);
-
- if (ch == decimal_point)
- while (ISDIGIT (ch = *p++))
- found_digit = true;
-
- if (found_digit)
- tighter_lim = p - ! (key->human_numeric && unit_order[ch]);
}
else
tighter_lim = lim;
--
2.5.5
From 8c39465a5b0343ff7a21286dd69ed5430685d2f7 Mon Sep 17 00:00:00 2001
From: Kamil Dudka <kdudka@redhat.com>
Date: Mon, 18 Jul 2016 19:04:44 +0200
Subject: [PATCH 2/3] sort: make -h work with -k and blank used as thousands
separator
* src/sort.c (traverse_raw_number): Skip at most one occurrence of
thousands_sep after each digit, so that the unit is not taken from the
next column when thousands_sep is a blank that also delimits columns.
* tests/misc/sort-h-thousands-sep.sh: Add regression test for this bug.
* tests/local.mk: Reference the test.
* NEWS: Mention the bug fix.
Reported at https://bugzilla.redhat.com/1355780
Fixes http://bugs.gnu.org/24015
---
src/sort.c | 14 ++++++++----
tests/local.mk | 1 +
tests/misc/sort-h-thousands-sep.sh | 47 ++++++++++++++++++++++++++++++++++++++
3 files changed, 57 insertions(+), 5 deletions(-)
create mode 100755 tests/misc/sort-h-thousands-sep.sh
diff --git a/src/sort.c b/src/sort.c
index e28bb6c..dd3ba58 100644
--- a/src/sort.c
+++ b/src/sort.c
@@ -2248,13 +2248,17 @@ traverse_raw_number (char const **number)
to be lacking in units.
FIXME: add support for multibyte thousands_sep and decimal_point. */
- do
+ while (ISDIGIT (ch = *p++))
{
- while (ISDIGIT (ch = *p++))
- if (max_digit < ch)
- max_digit = ch;
+ if (max_digit < ch)
+ max_digit = ch;
+
+ /* Allow to skip only one occurrence of thousands_sep to avoid finding
+ the unit in the next column in case thousands_sep matches as blank
+ and is used as column delimiter. */
+ if (*p == thousands_sep)
+ ++p;
}
- while (ch == thousands_sep);
if (ch == decimal_point)
while (ISDIGIT (ch = *p++))
diff --git a/tests/local.mk b/tests/local.mk
index 42d39f2..dccff8d 100644
--- a/tests/local.mk
+++ b/tests/local.mk
@@ -344,6 +344,7 @@ all_tests = \
tests/misc/sort-discrim.sh \
tests/misc/sort-files0-from.pl \
tests/misc/sort-float.sh \
+ tests/misc/sort-h-thousands-sep.sh \
tests/misc/sort-mb-tests.sh \
tests/i18n/sort.sh \
tests/misc/sort-merge.pl \
diff --git a/tests/misc/sort-h-thousands-sep.sh b/tests/misc/sort-h-thousands-sep.sh
new file mode 100755
index 0000000..17f1b6c
--- /dev/null
+++ b/tests/misc/sort-h-thousands-sep.sh
@@ -0,0 +1,47 @@
+#!/bin/sh
+# exercise 'sort -h' in locales where thousands separator is blank
+
+# Copyright (C) 2016 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
+print_ver_ sort
+test "$(LC_ALL=sv_SE locale thousands_sep)" = ' ' \
+ || skip_ 'The Swedish locale with blank thousands separator is unavailable.'
+
+tee exp1 > in << _EOF_
+1 1k 4 003 1M
+2k 2M 4 002 2
+3M 3 4 001 3k
+_EOF_
+
+cat > exp2 << _EOF_
+3M 3 4 001 3k
+1 1k 4 003 1M
+2k 2M 4 002 2
+_EOF_
+
+cat > exp3 << _EOF_
+3M 3 4 001 3k
+2k 2M 4 002 2
+1 1k 4 003 1M
+_EOF_
+
+for i in 1 2 3; do
+ LC_ALL="sv_SE.utf8" sort -h -k $i "in" > "out${i}" || fail=1
+ compare "exp${i}" "out${i}" || fail=1
+done
+
+Exit $fail
--
2.5.5
From 46ef53f558e7bc1c0bc0abd62a86b40b4141e058 Mon Sep 17 00:00:00 2001
From: Kamil Dudka <kdudka@redhat.com>
Date: Mon, 18 Jul 2016 19:04:45 +0200
Subject: [PATCH 3/3] sort: with -h, disallow thousands separator between
number and unit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
* src/sort.c (traverse_raw_number): Accept a thousands separator only
if it is immediately followed by a digit.
* tests/misc/sort-h-thousands-sep.sh: Cover the fix for this bug.
Suggested by Pádraig Brady in http://bugs.gnu.org/24015
---
src/sort.c | 11 ++++++++++-
tests/misc/sort-h-thousands-sep.sh | 25 +++++++++++++------------
2 files changed, 23 insertions(+), 13 deletions(-)
diff --git a/src/sort.c b/src/sort.c
index dd3ba58..69ef75f 100644
--- a/src/sort.c
+++ b/src/sort.c
@@ -2241,6 +2241,7 @@ traverse_raw_number (char const **number)
char const *p = *number;
unsigned char ch;
unsigned char max_digit = '\0';
+ bool ends_with_thousands_sep = false;
/* Scan to end of number.
Decimals or separators not followed by digits stop the scan.
@@ -2256,10 +2257,18 @@ traverse_raw_number (char const **number)
/* Allow to skip only one occurrence of thousands_sep to avoid finding
the unit in the next column in case thousands_sep matches as blank
and is used as column delimiter. */
- if (*p == thousands_sep)
+ ends_with_thousands_sep = (*p == thousands_sep);
+ if (ends_with_thousands_sep)
++p;
}
+ if (ends_with_thousands_sep)
+ {
+ /* thousands_sep not followed by digit is not allowed. */
+ *number = p - 2;
+ return max_digit;
+ }
+
if (ch == decimal_point)
while (ISDIGIT (ch = *p++))
if (max_digit < ch)
diff --git a/tests/misc/sort-h-thousands-sep.sh b/tests/misc/sort-h-thousands-sep.sh
index 17f1b6c..3ffa89e 100755
--- a/tests/misc/sort-h-thousands-sep.sh
+++ b/tests/misc/sort-h-thousands-sep.sh
@@ -18,28 +18,29 @@
. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
print_ver_ sort
+
test "$(LC_ALL=sv_SE locale thousands_sep)" = ' ' \
|| skip_ 'The Swedish locale with blank thousands separator is unavailable.'
-tee exp1 > in << _EOF_
-1 1k 4 003 1M
-2k 2M 4 002 2
-3M 3 4 001 3k
+tee exp1 exp3 > in << _EOF_
+1 1k 1 M 4 003 1M
+2k 2M 2 k 4 002 2
+3M 3 3 G 4 001 3k
_EOF_
cat > exp2 << _EOF_
-3M 3 4 001 3k
-1 1k 4 003 1M
-2k 2M 4 002 2
+3M 3 3 G 4 001 3k
+1 1k 1 M 4 003 1M
+2k 2M 2 k 4 002 2
_EOF_
-cat > exp3 << _EOF_
-3M 3 4 001 3k
-2k 2M 4 002 2
-1 1k 4 003 1M
+cat > exp5 << _EOF_
+3M 3 3 G 4 001 3k
+2k 2M 2 k 4 002 2
+1 1k 1 M 4 003 1M
_EOF_
-for i in 1 2 3; do
+for i in 1 2 3 5; do
LC_ALL="sv_SE.utf8" sort -h -k $i "in" > "out${i}" || fail=1
compare "exp${i}" "out${i}" || fail=1
done
--
2.5.5

View File

@ -1,7 +1,7 @@
Summary: A set of basic GNU tools commonly used in shell scripts Summary: A set of basic GNU tools commonly used in shell scripts
Name: coreutils Name: coreutils
Version: 8.25 Version: 8.25
Release: 13%{?dist} Release: 14%{?dist}
License: GPLv3+ License: GPLv3+
Group: System Environment/Base Group: System Environment/Base
Url: http://www.gnu.org/software/coreutils/ Url: http://www.gnu.org/software/coreutils/
@ -19,6 +19,8 @@ Source10: coreutils-find-requires.sh
# From upstream # From upstream
Patch952: coreutils-8.25-intall-Z-selinux.patch Patch952: coreutils-8.25-intall-Z-selinux.patch
# fix 'sort -h -k' in locales that use blank as thousands separator (#1355780)
Patch953: coreutils-8.25-sort-thousands-sep.patch
# Our patches # Our patches
#general patch to workaround koji build system issues #general patch to workaround koji build system issues
@ -209,9 +211,13 @@ tee DIR_COLORS{,.256color,.lightbgcolor} <src/dircolors.hin >/dev/null
%patch951 -p1 -b .selinuxman %patch951 -p1 -b .selinuxman
%patch952 -p1 %patch952 -p1
# upstream patches
%patch953 -p1
chmod a+x \ chmod a+x \
tests/df/direct.sh \ tests/df/direct.sh \
tests/install/install-Z-selinux.sh \ tests/install/install-Z-selinux.sh \
tests/misc/sort-h-thousands-sep.sh \
tests/misc/sort-mb-tests.sh \ tests/misc/sort-mb-tests.sh \
|| : || :
@ -352,6 +358,9 @@ fi
%license COPYING %license COPYING
%changelog %changelog
* Tue Jul 19 2016 Kamil Dudka <kdudka@redhat.com> - 8.25-14
- fix 'sort -h -k' in locales that use blank as thousands separator (#1355780)
* Thu Jul 14 2016 Kamil Dudka <kdudka@redhat.com> - 8.25-13 * Thu Jul 14 2016 Kamil Dudka <kdudka@redhat.com> - 8.25-13
- make 'sort -h' work for arbitrary column even when using UTF-8 locales - make 'sort -h' work for arbitrary column even when using UTF-8 locales