2015-07-05 07:17:02 +00:00
diff -urNp coreutils-8.24-orig/lib/linebuffer.h coreutils-8.24/lib/linebuffer.h
--- coreutils-8.24-orig/lib/linebuffer.h 2015-06-16 07:00:37.000000000 +0200
+++ coreutils-8.24/lib/linebuffer.h 2015-07-05 09:04:33.027546943 +0200
2009-10-07 08:11:44 +00:00
@@ -21,6 +21,11 @@
2007-01-09 19:29:30 +00:00
# include <stdio.h>
+/* Get mbstate_t. */
+# if HAVE_WCHAR_H
+# include <wchar.h>
+# endif
+
2012-01-07 19:47:10 +00:00
/* A 'struct linebuffer' holds a line of text. */
2007-01-09 19:29:30 +00:00
struct linebuffer
2009-10-07 08:11:44 +00:00
@@ -28,6 +33,9 @@ struct linebuffer
2010-01-08 08:10:16 +00:00
size_t size; /* Allocated. */
size_t length; /* Used. */
2007-01-09 19:29:30 +00:00
char *buffer;
+# if HAVE_WCHAR_H
+ mbstate_t state;
+# endif
};
/* Initialize linebuffer LINEBUFFER for use. */
2015-07-05 07:17:02 +00:00
diff -urNp coreutils-8.24-orig/src/cut.c coreutils-8.24/src/cut.c
--- coreutils-8.24-orig/src/cut.c 2015-06-26 19:05:22.000000000 +0200
+++ coreutils-8.24/src/cut.c 2015-07-05 09:04:33.028546950 +0200
2009-10-07 08:11:44 +00:00
@@ -28,6 +28,11 @@
#include <assert.h>
2004-09-09 03:58:39 +00:00
#include <getopt.h>
#include <sys/types.h>
+
2009-10-07 08:11:44 +00:00
+/* Get mbstate_t, mbrtowc(). */
2004-09-09 03:58:39 +00:00
+#if HAVE_WCHAR_H
+# include <wchar.h>
+#endif
#include "system.h"
2009-10-07 08:11:44 +00:00
2004-09-09 03:58:39 +00:00
#include "error.h"
2010-10-20 12:03:53 +00:00
@@ -37,6 +42,18 @@
2004-09-09 03:58:39 +00:00
#include "quote.h"
#include "xstrndup.h"
+/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
2009-10-07 08:11:44 +00:00
+ installation; work around this configuration error. */
2004-09-09 03:58:39 +00:00
+#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
2009-10-07 08:11:44 +00:00
+# undef MB_LEN_MAX
2004-09-09 03:58:39 +00:00
+# define MB_LEN_MAX 16
+#endif
+
+/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
+#if HAVE_MBRTOWC && defined mbstate_t
+# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
+#endif
+
2012-03-26 16:23:32 +00:00
/* The official name of this program (e.g., no 'g' prefix). */
2009-10-07 08:11:44 +00:00
#define PROGRAM_NAME "cut"
2004-09-09 03:58:39 +00:00
2013-12-14 17:41:07 +00:00
@@ -53,6 +70,52 @@
} \
2009-10-07 08:11:44 +00:00
while (0)
2004-09-09 03:58:39 +00:00
2009-10-07 08:11:44 +00:00
+/* Refill the buffer BUF to get a multibyte character. */
+#define REFILL_BUFFER(BUF, BUFPOS, BUFLEN, STREAM) \
+ do \
+ { \
+ if (BUFLEN < MB_LEN_MAX && !feof (STREAM) && !ferror (STREAM)) \
+ { \
+ memmove (BUF, BUFPOS, BUFLEN); \
+ BUFLEN += fread (BUF + BUFLEN, sizeof(char), BUFSIZ, STREAM); \
+ BUFPOS = BUF; \
+ } \
+ } \
+ while (0)
2004-09-09 03:58:39 +00:00
+
2009-10-07 08:11:44 +00:00
+/* Get wide character on BUFPOS. BUFPOS is not included after that.
2014-01-04 21:48:09 +00:00
+ If byte sequence is not valid as a character, CONVFAIL is true. Otherwise false. */
2009-10-07 08:11:44 +00:00
+#define GET_NEXT_WC_FROM_BUFFER(WC, BUFPOS, BUFLEN, MBLENGTH, STATE, CONVFAIL) \
+ do \
+ { \
+ mbstate_t state_bak; \
+ \
+ if (BUFLEN < 1) \
+ { \
+ WC = WEOF; \
+ break; \
+ } \
+ \
+ /* Get a wide character. */ \
2014-01-04 21:48:09 +00:00
+ CONVFAIL = false; \
2009-10-07 08:11:44 +00:00
+ state_bak = STATE; \
+ MBLENGTH = mbrtowc ((wchar_t *)&WC, BUFPOS, BUFLEN, &STATE); \
+ \
+ switch (MBLENGTH) \
+ { \
+ case (size_t)-1: \
+ case (size_t)-2: \
2014-01-04 21:48:09 +00:00
+ CONVFAIL = true; \
2009-10-07 08:11:44 +00:00
+ STATE = state_bak; \
+ /* Fall througn. */ \
+ \
+ case 0: \
+ MBLENGTH = 1; \
+ break; \
+ } \
+ } \
+ while (0)
2004-09-09 03:58:39 +00:00
+
2013-12-14 17:41:07 +00:00
2009-10-07 08:11:44 +00:00
struct range_pair
{
2014-01-06 15:32:53 +00:00
@@ -75,6 +138,8 @@ static size_t n_rp;
/* Number of `struct range_pair's allocated. */
static size_t n_rp_allocated;
+/* Length of the delimiter given as argument to -d. */
+size_t delimlen;
/* Append LOW, HIGH to the list RP of range pairs, allocating additional
space if necessary. Update global variable N_RP. When allocating,
@@ -106,15 +171,25 @@ enum operating_mode
2009-10-07 08:11:44 +00:00
{
undefined_mode,
- /* Output characters that are in the given bytes. */
+ /* Output bytes that are at the given positions. */
byte_mode,
+ /* Output characters that are at the given positions. */
+ character_mode,
2004-09-09 03:58:39 +00:00
+
2014-07-22 12:01:39 +00:00
/* Output the given delimiter-separated fields. */
2009-10-07 08:11:44 +00:00
field_mode
};
static enum operating_mode operating_mode;
+/* If nonzero, when in byte mode, don't split multibyte characters. */
+static int byte_mode_character_aware;
2004-09-09 03:58:39 +00:00
+
2009-10-07 08:11:44 +00:00
+/* If nonzero, the function for single byte locale is work
+ if this program runs on multibyte locale. */
+static int force_singlebyte_mode;
2004-09-09 03:58:39 +00:00
+
2014-07-22 12:01:39 +00:00
/* If true do not output lines containing no delimiter characters.
2009-10-07 08:11:44 +00:00
Otherwise, all such lines are printed. This option is valid only
with field mode. */
2014-01-06 15:32:53 +00:00
@@ -126,6 +201,9 @@ static bool complement;
2009-10-07 08:11:44 +00:00
2014-07-22 12:01:39 +00:00
/* The delimiter character for field mode. */
2009-10-07 08:11:44 +00:00
static unsigned char delim;
+#if HAVE_WCHAR_H
+static wchar_t wcdelim;
+#endif
/* True if the --output-delimiter=STRING option was specified. */
static bool output_delimiter_specified;
2015-07-05 07:17:02 +00:00
@@ -189,7 +267,7 @@ Print selected parts of lines from each
2009-10-07 08:11:44 +00:00
-f, --fields=LIST select only these fields; also print any line\n\
that contains no delimiter character, unless\n\
the -s option is specified\n\
- -n (ignored)\n\
+ -n with -b: don't split multibyte characters\n\
"), stdout);
fputs (_("\
--complement complement the set of selected bytes, characters\n\
2015-07-05 07:17:02 +00:00
@@ -380,6 +458,9 @@ set_fields (const char *fieldstr)
2009-10-07 08:11:44 +00:00
if (operating_mode == byte_mode)
error (0, 0,
_("byte offset %s is too large"), quote (bad_num));
+ else if (operating_mode == character_mode)
+ error (0, 0,
+ _("character offset %s is too large"), quote (bad_num));
else
error (0, 0,
_("field number %s is too large"), quote (bad_num));
2015-07-05 07:17:02 +00:00
@@ -504,6 +585,82 @@ cut_bytes (FILE *stream)
2009-10-07 08:11:44 +00:00
}
}
+#if HAVE_MBRTOWC
+/* This function is in use for the following case.
2004-09-09 03:58:39 +00:00
+
2009-10-07 08:11:44 +00:00
+ 1. Read from the stream STREAM, printing to standard output any selected
2011-09-05 06:18:24 +00:00
+ characters.
2004-09-09 03:58:39 +00:00
+
2009-10-07 08:11:44 +00:00
+ 2. Read from stream STREAM, printing to standard output any selected bytes,
+ without splitting multibyte characters. */
2011-09-05 06:18:24 +00:00
+
2009-10-07 08:11:44 +00:00
+static void
+cut_characters_or_cut_bytes_no_split (FILE *stream)
+{
2013-12-17 21:24:10 +00:00
+ size_t idx; /* number of bytes or characters in the line so far. */
2009-10-07 08:11:44 +00:00
+ char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */
+ char *bufpos; /* Next read position of BUF. */
+ size_t buflen; /* The length of the byte sequence in buf. */
+ wint_t wc; /* A gotten wide character. */
+ size_t mblength; /* The byte size of a multibyte character which shows
+ as same character as WC. */
+ mbstate_t state; /* State of the stream. */
2014-01-04 21:48:09 +00:00
+ bool convfail = false; /* true, when conversion failed. Otherwise false. */
2011-09-05 06:18:24 +00:00
+ /* Whether to begin printing delimiters between ranges for the current line.
+ Set after we've begun printing data corresponding to the first range. */
2011-09-09 12:53:00 +00:00
+ bool print_delimiter = false;
2004-09-09 03:58:39 +00:00
+
2009-10-07 08:11:44 +00:00
+ idx = 0;
+ buflen = 0;
+ bufpos = buf;
+ memset (&state, '\0', sizeof(mbstate_t));
2004-09-09 03:58:39 +00:00
+
2013-12-14 17:41:07 +00:00
+ current_rp = rp;
+
2009-10-07 08:11:44 +00:00
+ while (1)
+ {
+ REFILL_BUFFER (buf, bufpos, buflen, stream);
2004-09-09 03:58:39 +00:00
+
2009-10-07 08:11:44 +00:00
+ GET_NEXT_WC_FROM_BUFFER (wc, bufpos, buflen, mblength, state, convfail);
2014-01-04 21:48:09 +00:00
+ (void) convfail; /* ignore unused */
2009-10-07 08:11:44 +00:00
+
+ if (wc == WEOF)
+ {
+ if (idx > 0)
+ putchar ('\n');
+ break;
+ }
+ else if (wc == L'\n')
+ {
+ putchar ('\n');
+ idx = 0;
2011-09-05 06:18:24 +00:00
+ print_delimiter = false;
2013-12-15 08:05:15 +00:00
+ current_rp = rp;
2009-10-07 08:11:44 +00:00
+ }
+ else
+ {
2013-12-14 17:41:07 +00:00
+ next_item (&idx);
+ if (print_kth (idx))
2011-09-05 06:18:24 +00:00
+ {
2013-12-15 08:05:15 +00:00
+ if (output_delimiter_specified)
2011-09-05 06:18:24 +00:00
+ {
2013-12-15 08:05:15 +00:00
+ if (print_delimiter && is_range_start_index (idx))
2014-01-04 21:34:02 +00:00
+ {
+ fwrite (output_delimiter_string, sizeof (char),
+ output_delimiter_length, stdout);
+ }
+ print_delimiter = true;
2013-12-15 08:05:15 +00:00
+ }
2011-09-09 12:53:00 +00:00
+ fwrite (bufpos, mblength, sizeof(char), stdout);
2011-09-05 06:18:24 +00:00
+ }
2009-10-07 08:11:44 +00:00
+ }
2004-09-09 03:58:39 +00:00
+
2006-05-15 14:10:12 +00:00
+ buflen -= mblength;
+ bufpos += mblength;
2004-09-09 03:58:39 +00:00
+ }
+}
+#endif
2011-09-05 06:18:24 +00:00
+
2009-10-07 08:11:44 +00:00
/* Read from stream STREAM, printing to standard output any selected fields. */
2006-05-15 14:10:12 +00:00
2009-10-07 08:11:44 +00:00
static void
2015-07-05 07:17:02 +00:00
@@ -648,13 +805,211 @@ cut_fields (FILE *stream)
2009-10-07 08:11:44 +00:00
}
}
2006-05-15 14:10:12 +00:00
2004-09-09 03:58:39 +00:00
+#if HAVE_MBRTOWC
2009-10-07 08:11:44 +00:00
+static void
+cut_fields_mb (FILE *stream)
+{
+ int c;
2013-12-17 21:24:10 +00:00
+ size_t field_idx;
2009-10-07 08:11:44 +00:00
+ int found_any_selected_field;
+ int buffer_first_field;
+ int empty_input;
+ char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */
+ char *bufpos; /* Next read position of BUF. */
+ size_t buflen; /* The length of the byte sequence in buf. */
+ wint_t wc = 0; /* A gotten wide character. */
+ size_t mblength; /* The byte size of a multibyte character which shows
+ as same character as WC. */
+ mbstate_t state; /* State of the stream. */
2014-01-04 21:48:09 +00:00
+ bool convfail = false; /* true, when conversion failed. Otherwise false. */
2004-09-09 03:58:39 +00:00
+
2013-12-14 17:41:07 +00:00
+ current_rp = rp;
+
2009-10-07 08:11:44 +00:00
+ found_any_selected_field = 0;
+ field_idx = 1;
+ bufpos = buf;
+ buflen = 0;
+ memset (&state, '\0', sizeof(mbstate_t));
2004-09-09 03:58:39 +00:00
+
2009-10-07 08:11:44 +00:00
+ c = getc (stream);
+ empty_input = (c == EOF);
+ if (c != EOF)
2011-09-05 06:18:24 +00:00
+ {
2009-10-07 08:11:44 +00:00
+ ungetc (c, stream);
2011-09-05 06:18:24 +00:00
+ wc = 0;
+ }
2009-10-07 08:11:44 +00:00
+ else
+ wc = WEOF;
2004-09-09 03:58:39 +00:00
+
2009-10-07 08:11:44 +00:00
+ /* To support the semantics of the -s flag, we may have to buffer
+ all of the first field to determine whether it is `delimited.'
+ But that is unnecessary if all non-delimited lines must be printed
+ and the first field has been selected, or if non-delimited lines
+ must be suppressed and the first field has *not* been selected.
+ That is because a non-delimited line has exactly one field. */
2013-12-14 17:41:07 +00:00
+ buffer_first_field = (suppress_non_delimited ^ !print_kth (1));
2004-09-09 03:58:39 +00:00
+
2009-10-07 08:11:44 +00:00
+ while (1)
2004-09-09 03:58:39 +00:00
+ {
2009-10-07 08:11:44 +00:00
+ if (field_idx == 1 && buffer_first_field)
+ {
+ int len = 0;
2007-01-09 19:29:30 +00:00
+
2009-10-07 08:11:44 +00:00
+ while (1)
+ {
+ REFILL_BUFFER (buf, bufpos, buflen, stream);
2007-01-09 19:29:30 +00:00
+
2009-10-07 08:11:44 +00:00
+ GET_NEXT_WC_FROM_BUFFER
+ (wc, bufpos, buflen, mblength, state, convfail);
2007-01-09 19:29:30 +00:00
+
2009-10-07 08:11:44 +00:00
+ if (wc == WEOF)
+ break;
2007-01-09 19:29:30 +00:00
+
2009-10-07 08:11:44 +00:00
+ field_1_buffer = xrealloc (field_1_buffer, len + mblength);
+ memcpy (field_1_buffer + len, bufpos, mblength);
+ len += mblength;
+ buflen -= mblength;
+ bufpos += mblength;
2004-09-09 03:58:39 +00:00
+
2009-10-07 08:11:44 +00:00
+ if (!convfail && (wc == L'\n' || wc == wcdelim))
+ break;
+ }
+
2012-08-20 11:57:53 +00:00
+ if (len <= 0 && wc == WEOF)
2007-01-09 19:29:30 +00:00
+ break;
2004-09-09 03:58:39 +00:00
+
2009-10-07 08:11:44 +00:00
+ /* If the first field extends to the end of line (it is not
+ delimited) and we are printing all non-delimited lines,
+ print this one. */
+ if (convfail || (!convfail && wc != wcdelim))
+ {
+ if (suppress_non_delimited)
+ {
+ /* Empty. */
+ }
+ else
+ {
+ fwrite (field_1_buffer, sizeof (char), len, stdout);
+ /* Make sure the output line is newline terminated. */
+ if (convfail || (!convfail && wc != L'\n'))
+ putchar ('\n');
+ }
+ continue;
+ }
2007-01-09 19:29:30 +00:00
+
2013-12-14 17:41:07 +00:00
+ if (print_kth (1))
2009-10-07 08:11:44 +00:00
+ {
+ /* Print the field, but not the trailing delimiter. */
+ fwrite (field_1_buffer, sizeof (char), len - 1, stdout);
+ found_any_selected_field = 1;
+ }
2014-01-04 21:48:09 +00:00
+ next_item (&field_idx);
2009-10-07 08:11:44 +00:00
+ }
2007-01-09 19:29:30 +00:00
+
2009-10-07 08:11:44 +00:00
+ if (wc != WEOF)
+ {
2013-12-14 17:41:07 +00:00
+ if (print_kth (field_idx))
2009-10-07 08:11:44 +00:00
+ {
+ if (found_any_selected_field)
+ {
+ fwrite (output_delimiter_string, sizeof (char),
+ output_delimiter_length, stdout);
+ }
+ found_any_selected_field = 1;
+ }
2004-09-09 03:58:39 +00:00
+
2009-10-07 08:11:44 +00:00
+ while (1)
+ {
+ REFILL_BUFFER (buf, bufpos, buflen, stream);
2007-01-09 19:29:30 +00:00
+
2009-10-07 08:11:44 +00:00
+ GET_NEXT_WC_FROM_BUFFER
+ (wc, bufpos, buflen, mblength, state, convfail);
2004-09-09 03:58:39 +00:00
+
2009-10-07 08:11:44 +00:00
+ if (wc == WEOF)
+ break;
+ else if (!convfail && (wc == wcdelim || wc == L'\n'))
+ {
+ buflen -= mblength;
+ bufpos += mblength;
+ break;
+ }
2004-09-09 03:58:39 +00:00
+
2013-12-14 17:41:07 +00:00
+ if (print_kth (field_idx))
2009-10-07 08:11:44 +00:00
+ fwrite (bufpos, mblength, sizeof(char), stdout);
2007-01-09 19:29:30 +00:00
+
2009-10-07 08:11:44 +00:00
+ buflen -= mblength;
+ bufpos += mblength;
+ }
+ }
2004-09-09 03:58:39 +00:00
+
2009-10-07 08:11:44 +00:00
+ if ((!convfail || wc == L'\n') && buflen < 1)
+ wc = WEOF;
2007-01-09 19:29:30 +00:00
+
2009-10-07 08:11:44 +00:00
+ if (!convfail && wc == wcdelim)
2014-01-04 21:48:09 +00:00
+ next_item (&field_idx);
2009-10-07 08:11:44 +00:00
+ else if (wc == WEOF || (!convfail && wc == L'\n'))
+ {
+ if (found_any_selected_field
+ || (!empty_input && !(suppress_non_delimited && field_idx == 1)))
+ putchar ('\n');
+ if (wc == WEOF)
+ break;
+ field_idx = 1;
2013-12-15 08:05:15 +00:00
+ current_rp = rp;
2009-10-07 08:11:44 +00:00
+ found_any_selected_field = 0;
+ }
+ }
2007-01-09 19:29:30 +00:00
+}
+#endif
+
static void
2009-10-07 08:11:44 +00:00
cut_stream (FILE *stream)
2004-09-09 03:58:39 +00:00
{
2009-10-07 08:11:44 +00:00
- if (operating_mode == byte_mode)
- cut_bytes (stream);
+#if HAVE_MBRTOWC
+ if (MB_CUR_MAX > 1 && !force_singlebyte_mode)
+ {
+ switch (operating_mode)
+ {
+ case byte_mode:
+ if (byte_mode_character_aware)
+ cut_characters_or_cut_bytes_no_split (stream);
+ else
+ cut_bytes (stream);
+ break;
+
+ case character_mode:
+ cut_characters_or_cut_bytes_no_split (stream);
+ break;
+
+ case field_mode:
2014-01-06 15:32:53 +00:00
+ if (delimlen == 1)
2014-01-10 15:06:15 +00:00
+ {
2014-01-13 11:49:45 +00:00
+ /* Check if we have utf8 multibyte locale, so we can use this
+ optimization because of uniqueness of characters, which is
+ not true for e.g. SJIS */
2014-01-10 15:06:15 +00:00
+ char * loc = setlocale(LC_CTYPE, NULL);
2014-01-13 11:49:45 +00:00
+ if (loc && (strstr (loc, "UTF-8") || strstr (loc, "utf-8") ||
+ strstr (loc, "UTF8") || strstr (loc, "utf8")))
2014-01-10 15:06:15 +00:00
+ {
+ cut_fields (stream);
+ break;
+ }
+ }
+ cut_fields_mb (stream);
2009-10-07 08:11:44 +00:00
+ break;
+
+ default:
+ abort ();
+ }
+ }
else
- cut_fields (stream);
+#endif
+ {
+ if (operating_mode == field_mode)
+ cut_fields (stream);
+ else
+ cut_bytes (stream);
+ }
}
2004-09-09 03:58:39 +00:00
2009-10-07 08:11:44 +00:00
/* Process file FILE to standard output.
2015-07-05 07:17:02 +00:00
@@ -706,6 +1061,7 @@ main (int argc, char **argv)
2009-10-07 08:11:44 +00:00
bool ok;
bool delim_specified = false;
2010-10-20 12:03:53 +00:00
char *spec_list_string IF_LINT ( = NULL);
2009-10-07 08:11:44 +00:00
+ char mbdelim[MB_LEN_MAX + 1];
2004-09-09 03:58:39 +00:00
2009-10-07 08:11:44 +00:00
initialize_main (&argc, &argv);
set_program_name (argv[0]);
2015-07-05 07:17:02 +00:00
@@ -728,7 +1084,6 @@ main (int argc, char **argv)
2009-10-07 08:11:44 +00:00
switch (optc)
2009-09-12 09:28:49 +00:00
{
2009-10-07 08:11:44 +00:00
case 'b':
- case 'c':
/* Build the byte list. */
if (operating_mode != undefined_mode)
FATAL_ERROR (_("only one type of list may be specified"));
2015-07-05 07:17:02 +00:00
@@ -736,6 +1091,14 @@ main (int argc, char **argv)
2009-10-07 08:11:44 +00:00
spec_list_string = optarg;
break;
2004-09-09 03:58:39 +00:00
2009-10-07 08:11:44 +00:00
+ case 'c':
+ /* Build the character list. */
+ if (operating_mode != undefined_mode)
+ FATAL_ERROR (_("only one type of list may be specified"));
+ operating_mode = character_mode;
+ spec_list_string = optarg;
+ break;
+
case 'f':
/* Build the field list. */
if (operating_mode != undefined_mode)
2015-07-05 07:17:02 +00:00
@@ -747,10 +1110,38 @@ main (int argc, char **argv)
2009-10-07 08:11:44 +00:00
case 'd':
/* New delimiter. */
2012-03-26 16:23:32 +00:00
/* Interpret -d '' to mean 'use the NUL byte as the delimiter.' */
2009-10-07 08:11:44 +00:00
- if (optarg[0] != '\0' && optarg[1] != '\0')
- FATAL_ERROR (_("the delimiter must be a single character"));
- delim = optarg[0];
- delim_specified = true;
+ {
2007-01-09 19:29:30 +00:00
+#if HAVE_MBRTOWC
2009-10-07 08:11:44 +00:00
+ if(MB_CUR_MAX > 1)
+ {
+ mbstate_t state;
2009-09-12 09:28:49 +00:00
+
2009-10-07 08:11:44 +00:00
+ memset (&state, '\0', sizeof(mbstate_t));
+ delimlen = mbrtowc (&wcdelim, optarg, strnlen(optarg, MB_LEN_MAX), &state);
+
+ if (delimlen == (size_t)-1 || delimlen == (size_t)-2)
+ ++force_singlebyte_mode;
+ else
+ {
+ delimlen = (delimlen < 1) ? 1 : delimlen;
+ if (wcdelim != L'\0' && *(optarg + delimlen) != '\0')
+ FATAL_ERROR (_("the delimiter must be a single character"));
+ memcpy (mbdelim, optarg, delimlen);
2013-02-16 20:13:13 +00:00
+ mbdelim[delimlen] = '\0';
2014-01-06 15:32:53 +00:00
+ if (delimlen == 1)
+ delim = *optarg;
2009-10-07 08:11:44 +00:00
+ }
+ }
+
+ if (MB_CUR_MAX <= 1 || force_singlebyte_mode)
2009-09-12 09:28:49 +00:00
+#endif
2009-10-07 08:11:44 +00:00
+ {
+ if (optarg[0] != '\0' && optarg[1] != '\0')
+ FATAL_ERROR (_("the delimiter must be a single character"));
+ delim = (unsigned char) optarg[0];
+ }
+ delim_specified = true;
+ }
2009-09-12 09:28:49 +00:00
break;
2009-10-07 08:11:44 +00:00
case OUTPUT_DELIMITER_OPTION:
2015-07-05 07:17:02 +00:00
@@ -763,6 +1154,7 @@ main (int argc, char **argv)
2009-10-07 08:11:44 +00:00
break;
2008-04-23 09:58:23 +00:00
2009-10-07 08:11:44 +00:00
case 'n':
+ byte_mode_character_aware = 1;
break;
2008-04-23 09:58:23 +00:00
2009-10-07 08:11:44 +00:00
case 's':
2015-07-05 07:17:02 +00:00
@@ -802,15 +1194,34 @@ main (int argc, char **argv)
2008-04-23 09:58:23 +00:00
}
2009-10-07 08:11:44 +00:00
if (!delim_specified)
- delim = '\t';
+ {
+ delim = '\t';
+#ifdef HAVE_MBRTOWC
+ wcdelim = L'\t';
+ mbdelim[0] = '\t';
+ mbdelim[1] = '\0';
+ delimlen = 1;
+#endif
+ }
2008-04-23 09:58:23 +00:00
2009-10-07 08:11:44 +00:00
if (output_delimiter_string == NULL)
2008-04-23 09:58:23 +00:00
{
2009-10-07 08:11:44 +00:00
- static char dummy[2];
- dummy[0] = delim;
- dummy[1] = '\0';
- output_delimiter_string = dummy;
- output_delimiter_length = 1;
2008-04-23 09:58:23 +00:00
+#ifdef HAVE_MBRTOWC
2009-10-07 08:11:44 +00:00
+ if (MB_CUR_MAX > 1 && !force_singlebyte_mode)
+ {
+ output_delimiter_string = xstrdup(mbdelim);
+ output_delimiter_length = delimlen;
+ }
2008-04-23 09:58:23 +00:00
+
2009-10-07 08:11:44 +00:00
+ if (MB_CUR_MAX <= 1 || force_singlebyte_mode)
+#endif
+ {
2011-09-05 06:18:24 +00:00
+ static char dummy[2];
2009-10-07 08:11:44 +00:00
+ dummy[0] = delim;
+ dummy[1] = '\0';
+ output_delimiter_string = dummy;
+ output_delimiter_length = 1;
+ }
}
if (optind == argc)
2015-07-05 07:17:02 +00:00
diff -urNp coreutils-8.24-orig/src/expand.c coreutils-8.24/src/expand.c
--- coreutils-8.24-orig/src/expand.c 2015-06-26 19:05:22.000000000 +0200
+++ coreutils-8.24/src/expand.c 2015-07-05 09:04:33.028546950 +0200
2014-01-08 12:58:02 +00:00
@@ -37,12 +37,34 @@
2009-11-18 14:48:00 +00:00
#include <stdio.h>
#include <getopt.h>
#include <sys/types.h>
2008-04-23 09:58:23 +00:00
+
2009-11-18 14:48:00 +00:00
+/* Get mbstate_t, mbrtowc(), wcwidth(). */
+#if HAVE_WCHAR_H
+# include <wchar.h>
+#endif
2014-01-04 21:48:09 +00:00
+
+/* Get iswblank(). */
+#if HAVE_WCTYPE_H
+# include <wctype.h>
+#endif
2008-04-23 09:58:23 +00:00
+
2009-11-18 14:48:00 +00:00
#include "system.h"
#include "error.h"
2010-10-20 12:03:53 +00:00
#include "fadvise.h"
2009-11-18 14:48:00 +00:00
#include "quote.h"
#include "xstrndup.h"
+/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
+ installation; work around this configuration error. */
+#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
+# define MB_LEN_MAX 16
+#endif
2008-04-23 09:58:23 +00:00
+
2009-11-18 14:48:00 +00:00
+/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
+#if HAVE_MBRTOWC && defined mbstate_t
+# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
+#endif
2008-04-23 09:58:23 +00:00
+
2012-03-26 16:23:32 +00:00
/* The official name of this program (e.g., no 'g' prefix). */
2009-11-18 14:48:00 +00:00
#define PROGRAM_NAME "expand"
2014-01-08 12:58:02 +00:00
@@ -357,6 +379,142 @@ expand (void)
2009-11-18 14:48:00 +00:00
}
}
+#if HAVE_MBRTOWC
+static void
+expand_multibyte (void)
+{
+ FILE *fp; /* Input strem. */
+ mbstate_t i_state; /* Current shift state of the input stream. */
+ mbstate_t i_state_bak; /* Back up the I_STATE. */
+ mbstate_t o_state; /* Current shift state of the output stream. */
+ char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */
2011-03-14 08:43:52 +00:00
+ char *bufpos = buf; /* Next read position of BUF. */
2009-11-18 14:48:00 +00:00
+ size_t buflen = 0; /* The length of the byte sequence in buf. */
+ wchar_t wc; /* A gotten wide character. */
+ size_t mblength; /* The byte size of a multibyte character
+ which shows as same character as WC. */
+ int tab_index = 0; /* Index in `tab_list' of next tabstop. */
+ int column = 0; /* Column on screen of the next char. */
+ int next_tab_column; /* Column the next tab stop is on. */
+ int convert = 1; /* If nonzero, perform translations. */
2008-04-23 09:58:23 +00:00
+
2009-11-18 14:48:00 +00:00
+ fp = next_file ((FILE *) NULL);
+ if (fp == NULL)
+ return;
2004-09-09 03:58:39 +00:00
+
2009-11-18 14:48:00 +00:00
+ memset (&o_state, '\0', sizeof(mbstate_t));
+ memset (&i_state, '\0', sizeof(mbstate_t));
2004-09-09 03:58:39 +00:00
+
2009-11-18 14:48:00 +00:00
+ for (;;)
+ {
+ /* Refill the buffer BUF. */
+ if (buflen < MB_LEN_MAX && !feof(fp) && !ferror(fp))
+ {
+ memmove (buf, bufpos, buflen);
+ buflen += fread (buf + buflen, sizeof(char), BUFSIZ, fp);
+ bufpos = buf;
+ }
2007-01-09 19:29:30 +00:00
+
2009-11-18 14:48:00 +00:00
+ /* No character is left in BUF. */
+ if (buflen < 1)
+ {
+ fp = next_file (fp);
2007-01-09 19:29:30 +00:00
+
2009-11-18 14:48:00 +00:00
+ if (fp == NULL)
+ break; /* No more files. */
+ else
+ {
+ memset (&i_state, '\0', sizeof(mbstate_t));
+ continue;
+ }
+ }
2004-09-09 03:58:39 +00:00
+
2009-11-18 14:48:00 +00:00
+ /* Get a wide character. */
+ i_state_bak = i_state;
+ mblength = mbrtowc (&wc, bufpos, buflen, &i_state);
2004-09-09 03:58:39 +00:00
+
2009-11-18 14:48:00 +00:00
+ switch (mblength)
+ {
+ case (size_t)-1: /* illegal byte sequence. */
+ case (size_t)-2:
+ mblength = 1;
+ i_state = i_state_bak;
+ if (convert)
+ {
+ ++column;
2012-08-20 11:57:53 +00:00
+ if (convert_entire_line == 0 && !isblank(*bufpos))
2009-11-18 14:48:00 +00:00
+ convert = 0;
+ }
+ putchar (*bufpos);
+ break;
2004-09-09 03:58:39 +00:00
+
2009-11-18 14:48:00 +00:00
+ case 0: /* null. */
+ mblength = 1;
+ if (convert && convert_entire_line == 0)
+ convert = 0;
+ putchar ('\0');
+ break;
2004-09-09 03:58:39 +00:00
+
2009-11-18 14:48:00 +00:00
+ default:
+ if (wc == L'\n') /* LF. */
+ {
+ tab_index = 0;
+ column = 0;
+ convert = 1;
+ putchar ('\n');
+ }
+ else if (wc == L'\t' && convert) /* Tab. */
+ {
+ if (tab_size == 0)
+ {
+ /* Do not let tab_index == first_free_tab;
+ stop when it is 1 less. */
+ while (tab_index < first_free_tab - 1
+ && column >= tab_list[tab_index])
+ tab_index++;
+ next_tab_column = tab_list[tab_index];
+ if (tab_index < first_free_tab - 1)
+ tab_index++;
+ if (column >= next_tab_column)
+ next_tab_column = column + 1;
+ }
+ else
+ next_tab_column = column + tab_size - column % tab_size;
2006-05-15 14:10:12 +00:00
+
2009-11-18 14:48:00 +00:00
+ while (column < next_tab_column)
+ {
+ putchar (' ');
+ ++column;
+ }
+ }
+ else /* Others. */
+ {
+ if (convert)
+ {
+ if (wc == L'\b')
+ {
+ if (column > 0)
+ --column;
+ }
+ else
+ {
+ int width; /* The width of WC. */
2007-01-09 19:29:30 +00:00
+
2009-11-18 14:48:00 +00:00
+ width = wcwidth (wc);
+ column += (width > 0) ? width : 0;
2012-08-20 11:57:53 +00:00
+ if (convert_entire_line == 0 && !iswblank(wc))
2009-11-18 14:48:00 +00:00
+ convert = 0;
+ }
+ }
+ fwrite (bufpos, sizeof(char), mblength, stdout);
+ }
+ }
+ buflen -= mblength;
+ bufpos += mblength;
+ }
+}
+#endif
2006-05-15 14:10:12 +00:00
+
2009-11-18 14:48:00 +00:00
int
main (int argc, char **argv)
{
2014-01-08 12:58:02 +00:00
@@ -421,7 +579,12 @@ main (int argc, char **argv)
2009-11-18 14:48:00 +00:00
file_list = (optind < argc ? &argv[optind] : stdin_argv);
- expand ();
+#if HAVE_MBRTOWC
+ if (MB_CUR_MAX > 1)
+ expand_multibyte ();
+ else
+#endif
+ expand ();
if (have_read_stdin && fclose (stdin) != 0)
error (EXIT_FAILURE, errno, "-");
2015-07-05 07:17:02 +00:00
diff -urNp coreutils-8.24-orig/src/fold.c coreutils-8.24/src/fold.c
--- coreutils-8.24-orig/src/fold.c 2015-06-26 19:05:22.000000000 +0200
+++ coreutils-8.24/src/fold.c 2015-07-05 09:04:33.029546958 +0200
@@ -22,11 +22,33 @@
2009-11-18 14:48:00 +00:00
#include <getopt.h>
#include <sys/types.h>
+/* Get mbstate_t, mbrtowc(), wcwidth(). */
+#if HAVE_WCHAR_H
+# include <wchar.h>
+#endif
2007-01-09 19:29:30 +00:00
+
2009-11-18 14:48:00 +00:00
+/* Get iswprint(), iswblank(), wcwidth(). */
+#if HAVE_WCTYPE_H
+# include <wctype.h>
+#endif
2004-09-09 03:58:39 +00:00
+
2009-11-18 14:48:00 +00:00
#include "system.h"
#include "error.h"
2010-10-20 12:03:53 +00:00
#include "fadvise.h"
2015-07-05 07:17:02 +00:00
#include "xdectoint.h"
2009-11-18 14:48:00 +00:00
+/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
+ installation; work around this configuration error. */
+#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
+# undef MB_LEN_MAX
+# define MB_LEN_MAX 16
+#endif
2007-01-09 19:29:30 +00:00
+
2009-11-18 14:48:00 +00:00
+/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
+#if HAVE_MBRTOWC && defined mbstate_t
+# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
+#endif
2004-09-09 03:58:39 +00:00
+
2009-11-18 14:48:00 +00:00
#define TAB_WIDTH 8
2012-03-26 16:23:32 +00:00
/* The official name of this program (e.g., no 'g' prefix). */
2015-07-05 07:17:02 +00:00
@@ -34,20 +56,41 @@
2009-11-18 14:48:00 +00:00
#define AUTHORS proper_name ("David MacKenzie")
+#define FATAL_ERROR(Message) \
+ do \
+ { \
+ error (0, 0, (Message)); \
+ usage (2); \
+ } \
+ while (0)
2004-09-09 03:58:39 +00:00
+
2009-11-18 14:48:00 +00:00
+enum operating_mode
+{
+ /* Fold texts by columns that are at the given positions. */
+ column_mode,
2004-09-09 03:58:39 +00:00
+
2009-11-18 14:48:00 +00:00
+ /* Fold texts by bytes that are at the given positions. */
+ byte_mode,
2007-01-09 19:29:30 +00:00
+
2009-11-18 14:48:00 +00:00
+ /* Fold texts by characters that are at the given positions. */
+ character_mode,
2009-10-07 08:11:44 +00:00
+};
2007-01-09 19:29:30 +00:00
+
2009-11-18 14:48:00 +00:00
+/* The argument shows current mode. (Default: column_mode) */
+static enum operating_mode operating_mode;
2004-09-09 03:58:39 +00:00
+
2009-11-18 14:48:00 +00:00
/* If nonzero, try to break on whitespace. */
static bool break_spaces;
-/* If nonzero, count bytes, not column positions. */
-static bool count_bytes;
-
/* If nonzero, at least one of the files we read was standard input. */
static bool have_read_stdin;
-static char const shortopts[] = "bsw:0::1::2::3::4::5::6::7::8::9::";
+static char const shortopts[] = "bcsw:0::1::2::3::4::5::6::7::8::9::";
static struct option const longopts[] =
{
{"bytes", no_argument, NULL, 'b'},
+ {"characters", no_argument, NULL, 'c'},
{"spaces", no_argument, NULL, 's'},
{"width", required_argument, NULL, 'w'},
{GETOPT_HELP_OPTION_DECL},
2015-07-05 07:17:02 +00:00
@@ -75,6 +118,7 @@ Wrap input lines in each FILE, writing t
2013-02-15 16:50:47 +00:00
2009-11-18 14:48:00 +00:00
fputs (_("\
-b, --bytes count bytes rather than columns\n\
+ -c, --characters count characters rather than columns\n\
-s, --spaces break at spaces\n\
-w, --width=WIDTH use WIDTH columns instead of 80\n\
"), stdout);
2015-07-05 07:17:02 +00:00
@@ -92,7 +136,7 @@ Wrap input lines in each FILE, writing t
2009-11-18 14:48:00 +00:00
static size_t
adjust_column (size_t column, char c)
{
- if (!count_bytes)
+ if (operating_mode != byte_mode)
{
if (c == '\b')
{
2015-07-05 07:17:02 +00:00
@@ -115,30 +159,14 @@ adjust_column (size_t column, char c)
2009-11-18 14:48:00 +00:00
to stdout, with maximum line length WIDTH.
Return true if successful. */
-static bool
-fold_file (char const *filename, size_t width)
+static void
+fold_text (FILE *istream, size_t width, int *saved_errno)
{
- FILE *istream;
int c;
size_t column = 0; /* Screen column where next char will go. */
2012-03-26 16:23:32 +00:00
size_t offset_out = 0; /* Index in 'line_out' for next char. */
2009-11-18 14:48:00 +00:00
static char *line_out = NULL;
static size_t allocated_out = 0;
- int saved_errno;
-
- if (STREQ (filename, "-"))
- {
- istream = stdin;
- have_read_stdin = true;
- }
- else
- istream = fopen (filename, "r");
-
- if (istream == NULL)
- {
- error (0, errno, "%s", filename);
- return false;
- }
2010-11-15 12:16:04 +00:00
fadvise (istream, FADVISE_SEQUENTIAL);
2010-10-20 12:03:53 +00:00
2015-07-05 07:17:02 +00:00
@@ -168,6 +196,15 @@ fold_file (char const *filename, size_t
2009-11-18 14:48:00 +00:00
bool found_blank = false;
size_t logical_end = offset_out;
+ /* If LINE_OUT has no wide character,
+ put a new wide character in LINE_OUT
+ if column is bigger than width. */
+ if (offset_out == 0)
+ {
+ line_out[offset_out++] = c;
+ continue;
+ }
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
/* Look for the last blank. */
while (logical_end)
{
2015-07-05 07:17:02 +00:00
@@ -214,11 +251,221 @@ fold_file (char const *filename, size_t
2009-11-18 14:48:00 +00:00
line_out[offset_out++] = c;
}
- saved_errno = errno;
+ *saved_errno = errno;
2015-07-05 07:17:02 +00:00
+
+ if (offset_out)
+ fwrite (line_out, sizeof (char), (size_t) offset_out, stdout);
+
2009-10-07 08:11:44 +00:00
+}
+
2009-11-18 14:48:00 +00:00
+#if HAVE_MBRTOWC
+static void
+fold_multibyte_text (FILE *istream, size_t width, int *saved_errno)
2009-10-07 08:11:44 +00:00
+{
2009-11-18 14:48:00 +00:00
+ char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */
+ size_t buflen = 0; /* The length of the byte sequence in buf. */
2011-03-14 08:43:52 +00:00
+ char *bufpos = buf; /* Next read position of BUF. */
2009-11-18 14:48:00 +00:00
+ wint_t wc; /* A gotten wide character. */
+ size_t mblength; /* The byte size of a multibyte character which shows
+ as same character as WC. */
+ mbstate_t state, state_bak; /* State of the stream. */
2011-03-14 08:43:52 +00:00
+ int convfail = 0; /* 1, when conversion is failed. Otherwise 0. */
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ static char *line_out = NULL;
+ size_t offset_out = 0; /* Index in `line_out' for next char. */
+ static size_t allocated_out = 0;
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ int increment;
+ size_t column = 0;
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ size_t last_blank_pos;
+ size_t last_blank_column;
+ int is_blank_seen;
+ int last_blank_increment = 0;
+ int is_bs_following_last_blank;
+ size_t bs_following_last_blank_num;
+ int is_cr_after_last_blank;
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+#define CLEAR_FLAGS \
+ do \
+ { \
+ last_blank_pos = 0; \
+ last_blank_column = 0; \
+ is_blank_seen = 0; \
+ is_bs_following_last_blank = 0; \
+ bs_following_last_blank_num = 0; \
+ is_cr_after_last_blank = 0; \
+ } \
+ while (0)
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+#define START_NEW_LINE \
+ do \
+ { \
+ putchar ('\n'); \
+ column = 0; \
+ offset_out = 0; \
+ CLEAR_FLAGS; \
+ } \
+ while (0)
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ CLEAR_FLAGS;
+ memset (&state, '\0', sizeof(mbstate_t));
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ for (;; bufpos += mblength, buflen -= mblength)
+ {
+ if (buflen < MB_LEN_MAX && !feof (istream) && !ferror (istream))
+ {
+ memmove (buf, bufpos, buflen);
+ buflen += fread (buf + buflen, sizeof(char), BUFSIZ, istream);
+ bufpos = buf;
+ }
2004-09-09 03:58:39 +00:00
+
2009-11-18 14:48:00 +00:00
+ if (buflen < 1)
+ break;
2004-09-09 03:58:39 +00:00
+
2009-11-18 14:48:00 +00:00
+ /* Get a wide character. */
+ state_bak = state;
+ mblength = mbrtowc ((wchar_t *)&wc, bufpos, buflen, &state);
2004-09-09 03:58:39 +00:00
+
2009-11-18 14:48:00 +00:00
+ switch (mblength)
2009-09-12 09:28:49 +00:00
+ {
2009-11-18 14:48:00 +00:00
+ case (size_t)-1:
+ case (size_t)-2:
+ convfail++;
+ state = state_bak;
+ /* Fall through. */
+
+ case 0:
+ mblength = 1;
+ break;
2009-10-07 08:11:44 +00:00
+ }
2009-11-18 14:48:00 +00:00
+
+rescan:
+ if (operating_mode == byte_mode) /* byte mode */
+ increment = mblength;
+ else if (operating_mode == character_mode) /* character mode */
+ increment = 1;
+ else /* column mode */
2009-10-07 08:11:44 +00:00
+ {
2009-11-18 14:48:00 +00:00
+ if (convfail)
+ increment = 1;
+ else
2009-09-12 09:28:49 +00:00
+ {
2009-11-18 14:48:00 +00:00
+ switch (wc)
2009-10-07 08:11:44 +00:00
+ {
2009-11-18 14:48:00 +00:00
+ case L'\n':
+ fwrite (line_out, sizeof(char), offset_out, stdout);
+ START_NEW_LINE;
+ continue;
+
+ case L'\b':
+ increment = (column > 0) ? -1 : 0;
+ break;
2009-09-12 09:28:49 +00:00
+
2009-11-18 14:48:00 +00:00
+ case L'\r':
+ increment = -1 * column;
+ break;
2009-09-12 09:28:49 +00:00
+
2009-11-18 14:48:00 +00:00
+ case L'\t':
+ increment = 8 - column % 8;
+ break;
2004-09-09 03:58:39 +00:00
+
2009-11-18 14:48:00 +00:00
+ default:
+ increment = wcwidth (wc);
+ increment = (increment < 0) ? 0 : increment;
+ }
2009-10-07 08:11:44 +00:00
+ }
+ }
2004-09-09 03:58:39 +00:00
+
2009-11-18 14:48:00 +00:00
+ if (column + increment > width && break_spaces && last_blank_pos)
2009-10-07 08:11:44 +00:00
+ {
2009-11-18 14:48:00 +00:00
+ fwrite (line_out, sizeof(char), last_blank_pos, stdout);
2009-10-07 08:11:44 +00:00
+ putchar ('\n');
2009-11-18 14:48:00 +00:00
+
+ offset_out = offset_out - last_blank_pos;
+ column = column - last_blank_column + ((is_cr_after_last_blank)
+ ? last_blank_increment : bs_following_last_blank_num);
+ memmove (line_out, line_out + last_blank_pos, offset_out);
+ CLEAR_FLAGS;
+ goto rescan;
2009-10-07 08:11:44 +00:00
+ }
2009-11-18 14:48:00 +00:00
+
+ if (column + increment > width && column != 0)
2009-10-07 08:11:44 +00:00
+ {
2009-11-18 14:48:00 +00:00
+ fwrite (line_out, sizeof(char), offset_out, stdout);
+ START_NEW_LINE;
+ goto rescan;
2009-10-07 08:11:44 +00:00
+ }
2007-01-09 19:29:30 +00:00
+
2009-11-18 14:48:00 +00:00
+ if (allocated_out < offset_out + mblength)
2009-09-12 09:28:49 +00:00
+ {
2009-11-18 14:48:00 +00:00
+ line_out = X2REALLOC (line_out, &allocated_out);
2009-09-12 09:28:49 +00:00
+ }
2004-09-09 03:58:39 +00:00
+
2009-11-18 14:48:00 +00:00
+ memcpy (line_out + offset_out, bufpos, mblength);
+ offset_out += mblength;
+ column += increment;
2009-09-12 09:28:49 +00:00
+
2009-11-18 14:48:00 +00:00
+ if (is_blank_seen && !convfail && wc == L'\r')
+ is_cr_after_last_blank = 1;
2004-09-09 03:58:39 +00:00
+
2009-11-18 14:48:00 +00:00
+ if (is_bs_following_last_blank && !convfail && wc == L'\b')
+ ++bs_following_last_blank_num;
+ else
+ is_bs_following_last_blank = 0;
2004-09-09 03:58:39 +00:00
+
2009-11-18 14:48:00 +00:00
+ if (break_spaces && !convfail && iswblank (wc))
2009-09-12 09:28:49 +00:00
+ {
2009-11-18 14:48:00 +00:00
+ last_blank_pos = offset_out;
+ last_blank_column = column;
+ is_blank_seen = 1;
+ last_blank_increment = increment;
+ is_bs_following_last_blank = 1;
+ bs_following_last_blank_num = 0;
+ is_cr_after_last_blank = 0;
2009-09-12 09:28:49 +00:00
+ }
2009-10-07 08:11:44 +00:00
+ }
2004-09-09 03:58:39 +00:00
+
2009-11-18 14:48:00 +00:00
+ *saved_errno = errno;
2015-07-05 07:17:02 +00:00
if (offset_out)
fwrite (line_out, sizeof (char), (size_t) offset_out, stdout);
2009-10-07 08:11:44 +00:00
+}
2009-11-18 14:48:00 +00:00
+#endif
2006-05-15 14:10:12 +00:00
+
2009-11-18 14:48:00 +00:00
+/* Fold file FILENAME, or standard input if FILENAME is "-",
+ to stdout, with maximum line length WIDTH.
+ Return 0 if successful, 1 if an error occurs. */
2007-01-09 19:29:30 +00:00
+
2009-10-07 08:11:44 +00:00
+static bool
2014-01-04 21:48:09 +00:00
+fold_file (char const *filename, size_t width)
2009-10-07 08:11:44 +00:00
+{
2009-11-18 14:48:00 +00:00
+ FILE *istream;
+ int saved_errno;
2006-05-15 14:10:12 +00:00
+
2009-11-18 14:48:00 +00:00
+ if (STREQ (filename, "-"))
2009-10-07 08:11:44 +00:00
+ {
2009-11-18 14:48:00 +00:00
+ istream = stdin;
+ have_read_stdin = 1;
2009-10-07 08:11:44 +00:00
+ }
+ else
2009-11-18 14:48:00 +00:00
+ istream = fopen (filename, "r");
2004-10-18 21:31:43 +00:00
+
2009-11-18 14:48:00 +00:00
+ if (istream == NULL)
2009-10-07 08:11:44 +00:00
+ {
2009-11-18 14:48:00 +00:00
+ error (0, errno, "%s", filename);
+ return 1;
2009-10-07 08:11:44 +00:00
+ }
+
2009-11-18 14:48:00 +00:00
+ /* Define how ISTREAM is being folded. */
+#if HAVE_MBRTOWC
+ if (MB_CUR_MAX > 1)
+ fold_multibyte_text (istream, width, &saved_errno);
+ else
+#endif
+ fold_text (istream, width, &saved_errno);
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
if (ferror (istream))
{
error (0, saved_errno, "%s", filename);
2015-07-05 07:17:02 +00:00
@@ -251,7 +498,8 @@ main (int argc, char **argv)
2009-11-18 14:48:00 +00:00
atexit (close_stdout);
- break_spaces = count_bytes = have_read_stdin = false;
+ operating_mode = column_mode;
+ break_spaces = have_read_stdin = false;
while ((optc = getopt_long (argc, argv, shortopts, longopts, NULL)) != -1)
{
2015-07-05 07:17:02 +00:00
@@ -260,7 +508,15 @@ main (int argc, char **argv)
2009-11-18 14:48:00 +00:00
switch (optc)
{
case 'b': /* Count bytes rather than columns. */
- count_bytes = true;
+ if (operating_mode != column_mode)
+ FATAL_ERROR (_("only one way of folding may be specified"));
2009-10-07 08:11:44 +00:00
+ operating_mode = byte_mode;
+ break;
+
2009-11-18 14:48:00 +00:00
+ case 'c':
+ if (operating_mode != column_mode)
+ FATAL_ERROR (_("only one way of folding may be specified"));
+ operating_mode = character_mode;
break;
case 's': /* Break at word boundaries. */
2015-07-05 07:17:02 +00:00
diff -urNp coreutils-8.24-orig/src/join.c coreutils-8.24/src/join.c
--- coreutils-8.24-orig/src/join.c 2015-06-26 19:05:22.000000000 +0200
+++ coreutils-8.24/src/join.c 2015-07-05 09:04:33.029546958 +0200
2010-10-20 12:03:53 +00:00
@@ -22,18 +22,32 @@
2009-11-18 14:48:00 +00:00
#include <sys/types.h>
#include <getopt.h>
+/* Get mbstate_t, mbrtowc(), mbrtowc(), wcwidth(). */
+#if HAVE_WCHAR_H
+# include <wchar.h>
+#endif
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+/* Get iswblank(), towupper. */
+#if HAVE_WCTYPE_H
+# include <wctype.h>
+#endif
2004-09-09 03:58:39 +00:00
+
2007-01-09 19:29:30 +00:00
#include "system.h"
#include "error.h"
2010-10-20 12:03:53 +00:00
#include "fadvise.h"
2009-11-18 14:48:00 +00:00
#include "hard-locale.h"
#include "linebuffer.h"
-#include "memcasecmp.h"
2009-10-07 08:11:44 +00:00
#include "quote.h"
2009-11-18 14:48:00 +00:00
#include "stdio--.h"
#include "xmemcoll.h"
#include "xstrtol.h"
#include "argmatch.h"
2006-05-15 14:10:12 +00:00
2009-10-07 08:11:44 +00:00
+/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
+#if HAVE_MBRTOWC && defined mbstate_t
+# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
+#endif
+
2012-03-26 16:23:32 +00:00
/* The official name of this program (e.g., no 'g' prefix). */
2009-11-18 14:48:00 +00:00
#define PROGRAM_NAME "join"
2007-01-09 19:29:30 +00:00
2011-09-09 11:16:47 +00:00
@@ -135,10 +149,12 @@ static struct outlist outlist_head;
2012-03-26 16:23:32 +00:00
/* Last element in 'outlist', where a new element can be added. */
2009-11-18 14:48:00 +00:00
static struct outlist *outlist_end = &outlist_head;
-/* Tab character separating fields. If negative, fields are separated
- by any nonempty string of blanks, otherwise by exactly one
- tab character whose value (when cast to unsigned char) equals TAB. */
-static int tab = -1;
+/* Tab character separating fields. If NULL, fields are separated
+ by any nonempty string of blanks. */
+static char *tab = NULL;
+
+/* The number of bytes used for tab. */
+static size_t tablen = 0;
/* If nonzero, check that the input is correctly ordered. */
static enum
2015-07-05 07:17:02 +00:00
@@ -275,13 +291,14 @@ xfields (struct line *line)
2009-11-18 14:48:00 +00:00
if (ptr == lim)
return;
2010-10-20 12:03:53 +00:00
- if (0 <= tab && tab != '\n')
2009-11-18 14:48:00 +00:00
+ if (tab != NULL)
{
+ unsigned char t = tab[0];
char *sep;
- for (; (sep = memchr (ptr, tab, lim - ptr)) != NULL; ptr = sep + 1)
+ for (; (sep = memchr (ptr, t, lim - ptr)) != NULL; ptr = sep + 1)
extract_field (line, ptr, sep - ptr);
2009-10-07 08:11:44 +00:00
}
2010-10-20 12:03:53 +00:00
- else if (tab < 0)
+ else
{
/* Skip leading blanks before the first field. */
while (isblank (to_uchar (*ptr)))
2015-07-05 07:17:02 +00:00
@@ -305,6 +322,147 @@ xfields (struct line *line)
2009-11-18 14:48:00 +00:00
extract_field (line, ptr, lim - ptr);
2009-10-07 08:11:44 +00:00
}
2007-01-09 19:29:30 +00:00
2006-05-15 14:10:12 +00:00
+#if HAVE_MBRTOWC
2009-10-07 08:11:44 +00:00
+static void
2009-11-18 14:48:00 +00:00
+xfields_multibyte (struct line *line)
2006-05-15 14:10:12 +00:00
+{
2009-11-18 14:48:00 +00:00
+ char *ptr = line->buf.buffer;
+ char const *lim = ptr + line->buf.length - 1;
+ wchar_t wc = 0;
+ size_t mblength = 1;
+ mbstate_t state, state_bak;
2004-09-09 03:58:39 +00:00
+
2009-11-18 14:48:00 +00:00
+ memset (&state, 0, sizeof (mbstate_t));
2004-09-09 03:58:39 +00:00
+
2009-11-18 14:48:00 +00:00
+ if (ptr >= lim)
+ return;
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ if (tab != NULL)
2006-05-15 14:10:12 +00:00
+ {
2009-11-18 14:48:00 +00:00
+ char *sep = ptr;
+ for (; ptr < lim; ptr = sep + mblength)
2009-10-07 08:11:44 +00:00
+ {
2009-11-18 14:48:00 +00:00
+ sep = ptr;
+ while (sep < lim)
+ {
+ state_bak = state;
+ mblength = mbrtowc (&wc, sep, lim - sep + 1, &state);
2006-05-15 14:10:12 +00:00
+
2009-11-18 14:48:00 +00:00
+ if (mblength == (size_t)-1 || mblength == (size_t)-2)
+ {
+ mblength = 1;
+ state = state_bak;
+ }
+ mblength = (mblength < 1) ? 1 : mblength;
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ if (mblength == tablen && !memcmp (sep, tab, mblength))
+ break;
+ else
+ {
+ sep += mblength;
+ continue;
+ }
2009-10-07 08:11:44 +00:00
+ }
2009-11-18 14:48:00 +00:00
+
+ if (sep >= lim)
+ break;
+
+ extract_field (line, ptr, sep - ptr);
2009-10-07 08:11:44 +00:00
+ }
2009-11-18 14:48:00 +00:00
+ }
+ else
+ {
+ /* Skip leading blanks before the first field. */
+ while(ptr < lim)
+ {
+ state_bak = state;
+ mblength = mbrtowc (&wc, ptr, lim - ptr + 1, &state);
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ if (mblength == (size_t)-1 || mblength == (size_t)-2)
+ {
+ mblength = 1;
+ state = state_bak;
+ break;
+ }
+ mblength = (mblength < 1) ? 1 : mblength;
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ if (!iswblank(wc))
+ break;
+ ptr += mblength;
+ }
+
+ do
2009-10-07 08:11:44 +00:00
+ {
2009-11-18 14:48:00 +00:00
+ char *sep;
+ state_bak = state;
+ mblength = mbrtowc (&wc, ptr, lim - ptr + 1, &state);
+ if (mblength == (size_t)-1 || mblength == (size_t)-2)
2009-10-07 08:11:44 +00:00
+ {
2009-11-18 14:48:00 +00:00
+ mblength = 1;
+ state = state_bak;
+ break;
2009-10-07 08:11:44 +00:00
+ }
2009-11-18 14:48:00 +00:00
+ mblength = (mblength < 1) ? 1 : mblength;
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ sep = ptr + mblength;
+ while (sep < lim)
2009-10-07 08:11:44 +00:00
+ {
2009-11-18 14:48:00 +00:00
+ state_bak = state;
+ mblength = mbrtowc (&wc, sep, lim - sep + 1, &state);
+ if (mblength == (size_t)-1 || mblength == (size_t)-2)
2009-10-07 08:11:44 +00:00
+ {
2009-11-18 14:48:00 +00:00
+ mblength = 1;
+ state = state_bak;
+ break;
2009-10-07 08:11:44 +00:00
+ }
2009-11-18 14:48:00 +00:00
+ mblength = (mblength < 1) ? 1 : mblength;
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ if (iswblank (wc))
+ break;
+
+ sep += mblength;
2009-10-07 08:11:44 +00:00
+ }
+
2009-11-18 14:48:00 +00:00
+ extract_field (line, ptr, sep - ptr);
+ if (sep >= lim)
+ return;
+
+ state_bak = state;
+ mblength = mbrtowc (&wc, sep, lim - sep + 1, &state);
+ if (mblength == (size_t)-1 || mblength == (size_t)-2)
+ {
+ mblength = 1;
+ state = state_bak;
+ break;
+ }
+ mblength = (mblength < 1) ? 1 : mblength;
+
+ ptr = sep + mblength;
+ while (ptr < lim)
+ {
+ state_bak = state;
+ mblength = mbrtowc (&wc, ptr, lim - ptr + 1, &state);
+ if (mblength == (size_t)-1 || mblength == (size_t)-2)
+ {
+ mblength = 1;
+ state = state_bak;
+ break;
2009-10-07 08:11:44 +00:00
+ }
2009-11-18 14:48:00 +00:00
+ mblength = (mblength < 1) ? 1 : mblength;
+
+ if (!iswblank (wc))
+ break;
+
+ ptr += mblength;
2009-10-07 08:11:44 +00:00
+ }
+ }
2009-11-18 14:48:00 +00:00
+ while (ptr < lim);
2009-10-07 08:11:44 +00:00
+ }
2009-11-18 14:48:00 +00:00
+
+ extract_field (line, ptr, lim - ptr);
2004-09-09 03:58:39 +00:00
+}
+#endif
2009-11-18 14:48:00 +00:00
+
static void
freeline (struct line *line)
2006-05-15 14:10:12 +00:00
{
2015-07-05 07:17:02 +00:00
@@ -326,56 +484,133 @@ keycmp (struct line const *line1, struct
2009-11-18 14:48:00 +00:00
size_t jf_1, size_t jf_2)
{
/* Start of field to compare in each file. */
- char *beg1;
- char *beg2;
-
- size_t len1;
- size_t len2; /* Length of fields to compare. */
+ char *beg[2];
+ char *copy[2];
+ size_t len[2]; /* Length of fields to compare. */
int diff;
+ int i, j;
2013-01-23 09:04:52 +00:00
+ int mallocd = 0;
2006-05-15 14:10:12 +00:00
2009-11-18 14:48:00 +00:00
if (jf_1 < line1->nfields)
{
- beg1 = line1->fields[jf_1].beg;
- len1 = line1->fields[jf_1].len;
+ beg[0] = line1->fields[jf_1].beg;
+ len[0] = line1->fields[jf_1].len;
}
else
{
- beg1 = NULL;
- len1 = 0;
+ beg[0] = NULL;
+ len[0] = 0;
}
2006-05-15 14:10:12 +00:00
2009-11-18 14:48:00 +00:00
if (jf_2 < line2->nfields)
{
- beg2 = line2->fields[jf_2].beg;
- len2 = line2->fields[jf_2].len;
+ beg[1] = line2->fields[jf_2].beg;
+ len[1] = line2->fields[jf_2].len;
}
else
{
- beg2 = NULL;
- len2 = 0;
+ beg[1] = NULL;
+ len[1] = 0;
}
2009-10-07 08:11:44 +00:00
2009-11-18 14:48:00 +00:00
- if (len1 == 0)
- return len2 == 0 ? 0 : -1;
- if (len2 == 0)
+ if (len[0] == 0)
+ return len[1] == 0 ? 0 : -1;
+ if (len[1] == 0)
return 1;
if (ignore_case)
{
- /* FIXME: ignore_case does not work with NLS (in particular,
- with multibyte chars). */
- diff = memcasecmp (beg1, beg2, MIN (len1, len2));
+#ifdef HAVE_MBRTOWC
+ if (MB_CUR_MAX > 1)
+ {
+ size_t mblength;
+ wchar_t wc, uwc;
+ mbstate_t state, state_bak;
2007-01-09 19:29:30 +00:00
+
2009-11-18 14:48:00 +00:00
+ memset (&state, '\0', sizeof (mbstate_t));
2006-05-15 14:10:12 +00:00
+
2009-11-18 14:48:00 +00:00
+ for (i = 0; i < 2; i++)
+ {
2013-01-23 09:04:52 +00:00
+ mallocd = 1;
2013-12-22 15:39:13 +00:00
+ copy[i] = xmalloc (len[i] + 1);
+ memset (copy[i], '\0',len[i] + 1);
2004-09-09 03:58:39 +00:00
+
2009-11-18 14:48:00 +00:00
+ for (j = 0; j < MIN (len[0], len[1]);)
+ {
+ state_bak = state;
+ mblength = mbrtowc (&wc, beg[i] + j, len[i] - j, &state);
2004-09-09 03:58:39 +00:00
+
2009-11-18 14:48:00 +00:00
+ switch (mblength)
+ {
+ case (size_t) -1:
+ case (size_t) -2:
+ state = state_bak;
+ /* Fall through */
+ case 0:
+ mblength = 1;
+ break;
2004-09-09 03:58:39 +00:00
+
2009-11-18 14:48:00 +00:00
+ default:
+ uwc = towupper (wc);
2004-09-09 03:58:39 +00:00
+
2009-11-18 14:48:00 +00:00
+ if (uwc != wc)
+ {
+ mbstate_t state_wc;
2014-01-04 21:48:09 +00:00
+ size_t mblen;
2004-09-09 03:58:39 +00:00
+
2009-11-18 14:48:00 +00:00
+ memset (&state_wc, '\0', sizeof (mbstate_t));
2014-01-04 21:48:09 +00:00
+ mblen = wcrtomb (copy[i] + j, uwc, &state_wc);
+ assert (mblen != (size_t)-1);
2009-11-18 14:48:00 +00:00
+ }
+ else
+ memcpy (copy[i] + j, beg[i] + j, mblength);
+ }
+ j += mblength;
+ }
+ copy[i][j] = '\0';
+ }
+ }
2007-01-09 19:29:30 +00:00
+ else
2009-11-18 14:48:00 +00:00
+#endif
+ {
+ for (i = 0; i < 2; i++)
+ {
2013-01-23 09:04:52 +00:00
+ mallocd = 1;
+ copy[i] = xmalloc (len[i] + 1);
2004-09-09 03:58:39 +00:00
+
2009-11-18 14:48:00 +00:00
+ for (j = 0; j < MIN (len[0], len[1]); j++)
+ copy[i][j] = toupper (beg[i][j]);
2009-03-02 12:42:55 +00:00
+
2009-11-18 14:48:00 +00:00
+ copy[i][j] = '\0';
+ }
+ }
}
else
{
- if (hard_LC_COLLATE)
- return xmemcoll (beg1, len1, beg2, len2);
- diff = memcmp (beg1, beg2, MIN (len1, len2));
2014-01-04 21:48:09 +00:00
+ copy[0] = beg[0];
+ copy[1] = beg[1];
2015-07-05 07:17:02 +00:00
+ }
+
2009-11-18 14:48:00 +00:00
+ if (hard_LC_COLLATE)
2013-01-23 09:04:52 +00:00
+ {
+ diff = xmemcoll ((char *) copy[0], len[0], (char *) copy[1], len[1]);
+
+ if (mallocd)
+ for (i = 0; i < 2; i++)
+ free (copy[i]);
+
+ return diff;
2015-07-05 07:17:02 +00:00
}
2009-11-18 14:48:00 +00:00
+ diff = memcmp (copy[0], copy[1], MIN (len[0], len[1]));
2004-09-09 03:58:39 +00:00
+
2013-01-23 09:04:52 +00:00
+ if (mallocd)
+ for (i = 0; i < 2; i++)
+ free (copy[i]);
+
2015-07-05 07:17:02 +00:00
2009-11-18 14:48:00 +00:00
if (diff)
return diff;
- return len1 < len2 ? -1 : len1 != len2;
+ return len[0] - len[1];
}
/* Check that successive input lines PREV and CURRENT from input file
2015-07-05 07:17:02 +00:00
@@ -467,6 +702,11 @@ get_line (FILE *fp, struct line **linep,
2009-11-18 14:48:00 +00:00
}
2011-09-09 11:16:47 +00:00
++line_no[which - 1];
2009-11-18 14:48:00 +00:00
+#if HAVE_MBRTOWC
+ if (MB_CUR_MAX > 1)
+ xfields_multibyte (line);
+ else
+#endif
xfields (line);
if (prevline[which - 1])
2015-07-05 07:17:02 +00:00
@@ -566,21 +806,28 @@ prfield (size_t n, struct line const *li
2009-11-18 14:48:00 +00:00
2011-02-04 19:34:45 +00:00
/* Output all the fields in line, other than the join field. */
2009-11-18 14:48:00 +00:00
+#define PUT_TAB_CHAR \
+ do \
+ { \
+ (tab != NULL) ? \
+ fwrite(tab, sizeof(char), tablen, stdout) : putchar (' '); \
+ } \
2011-02-04 19:34:45 +00:00
+ while (0)
2007-01-09 19:29:30 +00:00
+
2009-11-18 14:48:00 +00:00
static void
2011-02-04 19:34:45 +00:00
prfields (struct line const *line, size_t join_field, size_t autocount)
{
size_t i;
size_t nfields = autoformat ? autocount : line->nfields;
- char output_separator = tab < 0 ? ' ' : tab;
for (i = 0; i < join_field && i < nfields; ++i)
{
- putchar (output_separator);
+ PUT_TAB_CHAR;
prfield (i, line);
}
for (i = join_field + 1; i < nfields; ++i)
{
- putchar (output_separator);
+ PUT_TAB_CHAR;
prfield (i, line);
}
}
2015-07-05 07:17:02 +00:00
@@ -591,7 +838,6 @@ static void
2009-11-18 14:48:00 +00:00
prjoin (struct line const *line1, struct line const *line2)
{
const struct outlist *outlist;
- char output_separator = tab < 0 ? ' ' : tab;
2011-02-04 19:34:45 +00:00
size_t field;
struct line const *line;
2009-11-18 14:48:00 +00:00
2015-07-05 07:17:02 +00:00
@@ -625,7 +871,7 @@ prjoin (struct line const *line1, struct
2009-11-18 14:48:00 +00:00
o = o->next;
if (o == NULL)
break;
- putchar (output_separator);
+ PUT_TAB_CHAR;
}
2013-12-14 17:41:07 +00:00
putchar (eolchar);
2009-11-18 14:48:00 +00:00
}
2015-07-05 07:17:02 +00:00
@@ -1103,21 +1349,46 @@ main (int argc, char **argv)
2009-11-18 14:48:00 +00:00
case 't':
{
- unsigned char newtab = optarg[0];
2011-09-05 06:18:24 +00:00
+ char *newtab = NULL;
2009-11-18 14:48:00 +00:00
+ size_t newtablen;
+ newtab = xstrdup (optarg);
+#if HAVE_MBRTOWC
+ if (MB_CUR_MAX > 1)
+ {
+ mbstate_t state;
2006-05-15 14:10:12 +00:00
+
2009-11-18 14:48:00 +00:00
+ memset (&state, 0, sizeof (mbstate_t));
+ newtablen = mbrtowc (NULL, newtab,
+ strnlen (newtab, MB_LEN_MAX),
+ &state);
+ if (newtablen == (size_t) 0
+ || newtablen == (size_t) -1
+ || newtablen == (size_t) -2)
+ newtablen = 1;
+ }
+ else
+#endif
+ newtablen = 1;
2010-04-26 13:15:37 +00:00
if (! newtab)
2011-09-05 06:18:24 +00:00
- newtab = '\n'; /* '' => process the whole line. */
2012-03-07 20:29:40 +00:00
+ {
2014-01-04 21:48:09 +00:00
+ newtab = (char*)"\n"; /* '' => process the whole line. */
2010-04-26 13:15:37 +00:00
+ }
else if (optarg[1])
2009-11-18 14:48:00 +00:00
{
- if (STREQ (optarg, "\\0"))
- newtab = '\0';
- else
- error (EXIT_FAILURE, 0, _("multi-character tab %s"),
- quote (optarg));
2010-04-26 13:15:37 +00:00
+ if (newtablen == 1 && newtab[1])
+ {
+ if (STREQ (newtab, "\\0"))
+ newtab[0] = '\0';
+ }
+ }
+ if (tab != NULL && strcmp (tab, newtab))
+ {
2009-11-18 14:48:00 +00:00
+ free (newtab);
+ error (EXIT_FAILURE, 0, _("incompatible tabs"));
}
- if (0 <= tab && tab != newtab)
- error (EXIT_FAILURE, 0, _("incompatible tabs"));
tab = newtab;
2010-04-26 13:15:37 +00:00
- }
2009-11-18 14:48:00 +00:00
+ tablen = newtablen;
2010-04-26 13:15:37 +00:00
+ }
2009-11-18 14:48:00 +00:00
break;
2013-12-14 17:41:07 +00:00
case 'z':
2015-07-05 07:17:02 +00:00
diff -urNp coreutils-8.24-orig/src/pr.c coreutils-8.24/src/pr.c
--- coreutils-8.24-orig/src/pr.c 2015-06-26 19:05:22.000000000 +0200
+++ coreutils-8.24/src/pr.c 2015-07-05 09:04:33.030546965 +0200
2014-01-08 12:58:02 +00:00
@@ -312,6 +312,24 @@
2009-11-18 14:48:00 +00:00
#include <getopt.h>
#include <sys/types.h>
2006-05-15 14:10:12 +00:00
+
2009-11-18 14:48:00 +00:00
+/* Get MB_LEN_MAX. */
+#include <limits.h>
+/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
+ installation; work around this configuration error. */
+#if !defined MB_LEN_MAX || MB_LEN_MAX == 1
+# define MB_LEN_MAX 16
+#endif
2006-05-15 14:10:12 +00:00
+
2009-11-18 14:48:00 +00:00
+/* Get MB_CUR_MAX. */
+#include <stdlib.h>
2006-05-15 14:10:12 +00:00
+
2009-11-18 14:48:00 +00:00
+/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
2009-10-07 08:11:44 +00:00
+/* Get mbstate_t, mbrtowc(), wcwidth(). */
+#if HAVE_WCHAR_H
+# include <wchar.h>
2004-09-09 03:58:39 +00:00
+#endif
+
2009-10-07 08:11:44 +00:00
#include "system.h"
#include "error.h"
2010-10-20 12:03:53 +00:00
#include "fadvise.h"
2015-07-05 07:17:02 +00:00
@@ -324,6 +342,18 @@
2009-10-07 08:11:44 +00:00
#include "xstrtol.h"
2015-07-05 07:17:02 +00:00
#include "xdectoint.h"
2009-10-07 08:11:44 +00:00
+/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
+#if HAVE_MBRTOWC && defined mbstate_t
+# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
+#endif
2004-09-09 03:58:39 +00:00
+
2009-11-18 14:48:00 +00:00
+#ifndef HAVE_DECL_WCWIDTH
+"this configure-time declaration test was not run"
+#endif
+#if !HAVE_DECL_WCWIDTH
+extern int wcwidth ();
+#endif
+
2012-03-26 16:23:32 +00:00
/* The official name of this program (e.g., no 'g' prefix). */
2009-11-18 14:48:00 +00:00
#define PROGRAM_NAME "pr"
2009-10-07 08:11:44 +00:00
2015-07-05 07:17:02 +00:00
@@ -416,7 +446,20 @@ struct COLUMN
2009-10-07 08:11:44 +00:00
2009-11-18 14:48:00 +00:00
typedef struct COLUMN COLUMN;
-static int char_to_clump (char c);
+/* Funtion pointers to switch functions for single byte locale or for
+ multibyte locale. If multibyte functions do not exist in your sysytem,
+ these pointers always point the function for single byte locale. */
+static void (*print_char) (char c);
+static int (*char_to_clump) (char c);
2007-01-09 19:29:30 +00:00
+
2009-11-18 14:48:00 +00:00
+/* Functions for single byte locale. */
+static void print_char_single (char c);
+static int char_to_clump_single (char c);
2004-09-09 03:58:39 +00:00
+
2009-11-18 14:48:00 +00:00
+/* Functions for multibyte locale. */
+static void print_char_multi (char c);
+static int char_to_clump_multi (char c);
2009-09-12 09:28:49 +00:00
+
2009-11-18 14:48:00 +00:00
static bool read_line (COLUMN *p);
static bool print_page (void);
static bool print_stored (COLUMN *p);
2015-07-05 07:17:02 +00:00
@@ -428,6 +471,7 @@ static void add_line_number (COLUMN *p);
static void getoptnum (const char *n_str, int min, int *num,
const char *errfmt);
2009-11-18 14:48:00 +00:00
static void getoptarg (char *arg, char switch_char, char *character,
+ int *character_length, int *character_width,
int *number);
static void print_files (int number_of_files, char **av);
2012-01-07 19:47:10 +00:00
static void init_parameters (int number_of_files);
2015-07-05 07:17:02 +00:00
@@ -441,7 +485,6 @@ static void store_char (char c);
static void pad_down (unsigned int lines);
2009-11-18 14:48:00 +00:00
static void read_rest_of_line (COLUMN *p);
static void skip_read (COLUMN *p, int column_number);
-static void print_char (char c);
static void cleanup (void);
static void print_sep_string (void);
static void separator_string (const char *optarg_S);
2015-07-05 07:17:02 +00:00
@@ -453,7 +496,7 @@ static COLUMN *column_vector;
2009-11-18 14:48:00 +00:00
we store the leftmost columns contiguously in buff.
To print a line from buff, get the index of the first character
from line_vector[i], and print up to line_vector[i + 1]. */
-static char *buff;
+static unsigned char *buff;
2009-10-07 08:11:44 +00:00
2009-11-18 14:48:00 +00:00
/* Index of the position in buff where the next character
will be stored. */
2015-07-05 07:17:02 +00:00
@@ -557,7 +600,7 @@ static int chars_per_column;
2009-11-18 14:48:00 +00:00
static bool untabify_input = false;
2009-10-07 08:11:44 +00:00
2009-11-18 14:48:00 +00:00
/* (-e) The input tab character. */
-static char input_tab_char = '\t';
+static char input_tab_char[MB_LEN_MAX] = "\t";
2009-10-07 08:11:44 +00:00
2009-11-18 14:48:00 +00:00
/* (-e) Tabstops are at chars_per_tab, 2*chars_per_tab, 3*chars_per_tab, ...
where the leftmost column is 1. */
2015-07-05 07:17:02 +00:00
@@ -567,7 +610,10 @@ static int chars_per_input_tab = 8;
2009-11-18 14:48:00 +00:00
static bool tabify_output = false;
2009-10-07 08:11:44 +00:00
2009-11-18 14:48:00 +00:00
/* (-i) The output tab character. */
-static char output_tab_char = '\t';
+static char output_tab_char[MB_LEN_MAX] = "\t";
+
+/* (-i) The byte length of output tab character. */
+static int output_tab_char_length = 1;
2009-10-07 08:11:44 +00:00
2009-11-18 14:48:00 +00:00
/* (-i) The width of the output tab. */
static int chars_per_output_tab = 8;
2015-07-05 07:17:02 +00:00
@@ -637,7 +683,13 @@ static int line_number;
2009-11-18 14:48:00 +00:00
static bool numbered_lines = false;
2009-10-07 08:11:44 +00:00
2009-11-18 14:48:00 +00:00
/* (-n) Character which follows each line number. */
-static char number_separator = '\t';
+static char number_separator[MB_LEN_MAX] = "\t";
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+/* (-n) The byte length of the character which follows each line number. */
+static int number_separator_length = 1;
+
+/* (-n) The character width of the character which follows each line number. */
+static int number_separator_width = 0;
2009-10-07 08:11:44 +00:00
2009-11-18 14:48:00 +00:00
/* (-n) line counting starts with 1st line of input file (not with 1st
line of 1st page printed). */
2015-07-05 07:17:02 +00:00
@@ -690,6 +742,7 @@ static bool use_col_separator = false;
2012-03-26 16:23:32 +00:00
-a|COLUMN|-m is a 'space' and with the -J option a 'tab'. */
2009-11-18 14:48:00 +00:00
static char *col_sep_string = (char *) "";
static int col_sep_length = 0;
+static int col_sep_width = 0;
static char *column_separator = (char *) " ";
static char *line_separator = (char *) "\t";
2009-10-07 08:11:44 +00:00
2015-07-05 07:17:02 +00:00
@@ -840,6 +893,13 @@ separator_string (const char *optarg_S)
2009-11-18 14:48:00 +00:00
col_sep_length = (int) strlen (optarg_S);
col_sep_string = xmalloc (col_sep_length + 1);
strcpy (col_sep_string, optarg_S);
2009-10-07 08:11:44 +00:00
+
+#if HAVE_MBRTOWC
2009-11-18 14:48:00 +00:00
+ if (MB_CUR_MAX > 1)
+ col_sep_width = mbswidth (col_sep_string, 0);
+ else
+#endif
+ col_sep_width = col_sep_length;
}
int
2015-07-05 07:17:02 +00:00
@@ -864,6 +924,21 @@ main (int argc, char **argv)
2009-11-18 14:48:00 +00:00
atexit (close_stdout);
+/* Define which functions are used, the ones for single byte locale or the ones
+ for multibyte locale. */
+#if HAVE_MBRTOWC
+ if (MB_CUR_MAX > 1)
+ {
+ print_char = print_char_multi;
+ char_to_clump = char_to_clump_multi;
+ }
+ else
+#endif
+ {
+ print_char = print_char_single;
+ char_to_clump = char_to_clump_single;
+ }
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
n_files = 0;
file_names = (argc > 1
? xmalloc ((argc - 1) * sizeof (char *))
2015-07-05 07:17:02 +00:00
@@ -940,8 +1015,12 @@ main (int argc, char **argv)
2009-11-18 14:48:00 +00:00
break;
case 'e':
if (optarg)
- getoptarg (optarg, 'e', &input_tab_char,
- &chars_per_input_tab);
+ {
+ int dummy_length, dummy_width;
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ getoptarg (optarg, 'e', input_tab_char, &dummy_length,
+ &dummy_width, &chars_per_input_tab);
+ }
/* Could check tab width > 0. */
untabify_input = true;
break;
2015-07-05 07:17:02 +00:00
@@ -954,8 +1033,12 @@ main (int argc, char **argv)
2009-11-18 14:48:00 +00:00
break;
case 'i':
if (optarg)
- getoptarg (optarg, 'i', &output_tab_char,
- &chars_per_output_tab);
+ {
+ int dummy_width;
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ getoptarg (optarg, 'i', output_tab_char, &output_tab_char_length,
+ &dummy_width, &chars_per_output_tab);
+ }
/* Could check tab width > 0. */
tabify_output = true;
break;
2015-07-05 07:17:02 +00:00
@@ -973,8 +1056,8 @@ main (int argc, char **argv)
2009-11-18 14:48:00 +00:00
case 'n':
numbered_lines = true;
if (optarg)
- getoptarg (optarg, 'n', &number_separator,
- &chars_per_number);
+ getoptarg (optarg, 'n', number_separator, &number_separator_length,
+ &number_separator_width, &chars_per_number);
break;
case 'N':
skip_count = false;
2015-07-05 07:17:02 +00:00
@@ -998,7 +1081,7 @@ main (int argc, char **argv)
2009-11-18 14:48:00 +00:00
old_s = false;
/* Reset an additional input of -s, -S dominates -s */
col_sep_string = bad_cast ("");
- col_sep_length = 0;
+ col_sep_length = col_sep_width = 0;
use_col_separator = true;
if (optarg)
separator_string (optarg);
2015-07-05 07:17:02 +00:00
@@ -1152,10 +1235,45 @@ getoptnum (const char *n_str, int min, i
2009-11-18 14:48:00 +00:00
a number. */
static void
-getoptarg (char *arg, char switch_char, char *character, int *number)
+getoptarg (char *arg, char switch_char, char *character, int *character_length,
+ int *character_width, int *number)
{
if (!ISDIGIT (*arg))
- *character = *arg++;
2009-10-07 08:11:44 +00:00
+ {
2009-11-18 14:48:00 +00:00
+#ifdef HAVE_MBRTOWC
+ if (MB_CUR_MAX > 1) /* for multibyte locale. */
2009-10-07 08:11:44 +00:00
+ {
2009-11-18 14:48:00 +00:00
+ wchar_t wc;
+ size_t mblength;
+ int width;
+ mbstate_t state = {'\0'};
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ mblength = mbrtowc (&wc, arg, strnlen(arg, MB_LEN_MAX), &state);
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ if (mblength == (size_t)-1 || mblength == (size_t)-2)
+ {
+ *character_length = 1;
+ *character_width = 1;
+ }
2009-10-07 08:11:44 +00:00
+ else
+ {
2009-11-18 14:48:00 +00:00
+ *character_length = (mblength < 1) ? 1 : mblength;
+ width = wcwidth (wc);
+ *character_width = (width < 0) ? 0 : width;
2009-10-07 08:11:44 +00:00
+ }
+
2009-11-18 14:48:00 +00:00
+ strncpy (character, arg, *character_length);
+ arg += *character_length;
2009-10-07 08:11:44 +00:00
+ }
2009-11-18 14:48:00 +00:00
+ else /* for single byte locale. */
+#endif
2009-10-07 08:11:44 +00:00
+ {
2009-11-18 14:48:00 +00:00
+ *character = *arg++;
+ *character_length = 1;
+ *character_width = 1;
2009-10-07 08:11:44 +00:00
+ }
+ }
+
2009-11-18 14:48:00 +00:00
if (*arg)
2009-10-07 08:11:44 +00:00
{
2009-11-18 14:48:00 +00:00
long int tmp_long;
2015-07-05 07:17:02 +00:00
@@ -1177,6 +1295,11 @@ static void
2012-01-16 14:38:16 +00:00
init_parameters (int number_of_files)
{
int chars_used_by_number = 0;
+ int mb_len = 1;
+#if HAVE_MBRTOWC
+ if (MB_CUR_MAX > 1)
+ mb_len = MB_LEN_MAX;
+#endif
lines_per_body = lines_per_page - lines_per_header - lines_per_footer;
if (lines_per_body <= 0)
2015-07-05 07:17:02 +00:00
@@ -1214,7 +1337,7 @@ init_parameters (int number_of_files)
2009-11-18 14:48:00 +00:00
else
col_sep_string = column_separator;
2009-10-07 08:11:44 +00:00
2009-11-18 14:48:00 +00:00
- col_sep_length = 1;
+ col_sep_length = col_sep_width = 1;
use_col_separator = true;
}
/* It's rather pointless to define a TAB separator with column
2015-07-05 07:17:02 +00:00
@@ -1244,11 +1367,11 @@ init_parameters (int number_of_files)
2012-05-11 07:58:08 +00:00
+ TAB_WIDTH (chars_per_input_tab, chars_per_number); */
2009-10-07 08:11:44 +00:00
2009-11-18 14:48:00 +00:00
/* Estimate chars_per_text without any margin and keep it constant. */
- if (number_separator == '\t')
+ if (number_separator[0] == '\t')
2012-05-11 07:58:08 +00:00
number_width = (chars_per_number
+ TAB_WIDTH (chars_per_default_tab, chars_per_number));
2009-11-18 14:48:00 +00:00
else
- number_width = chars_per_number + 1;
+ number_width = chars_per_number + number_separator_width;
2009-10-07 08:11:44 +00:00
2009-11-18 14:48:00 +00:00
/* The number is part of the column width unless we are
printing files in parallel. */
2015-07-05 07:17:02 +00:00
@@ -1257,7 +1380,7 @@ init_parameters (int number_of_files)
2009-11-18 14:48:00 +00:00
}
2009-10-07 08:11:44 +00:00
2012-05-11 07:58:08 +00:00
chars_per_column = (chars_per_line - chars_used_by_number
- - (columns - 1) * col_sep_length) / columns;
+ - (columns - 1) * col_sep_width) / columns;
2009-10-07 08:11:44 +00:00
2009-11-18 14:48:00 +00:00
if (chars_per_column < 1)
error (EXIT_FAILURE, 0, _("page width too narrow"));
2015-07-05 07:17:02 +00:00
@@ -1275,7 +1398,7 @@ init_parameters (int number_of_files)
2012-01-16 14:38:16 +00:00
We've to use 8 as the lower limit, if we use chars_per_default_tab = 8
to expand a tab which is not an input_tab-char. */
free (clump_buff);
- clump_buff = xmalloc (MAX (8, chars_per_input_tab));
+ clump_buff = xmalloc (mb_len * MAX (8, chars_per_input_tab));
}
/* Open the necessary files,
2015-07-05 07:17:02 +00:00
@@ -1383,7 +1506,7 @@ init_funcs (void)
2009-10-07 08:11:44 +00:00
2009-11-18 14:48:00 +00:00
/* Enlarge p->start_position of first column to use the same form of
padding_not_printed with all columns. */
- h = h + col_sep_length;
+ h = h + col_sep_width;
2009-10-07 08:11:44 +00:00
2009-11-18 14:48:00 +00:00
/* This loop takes care of all but the rightmost column. */
2009-10-07 08:11:44 +00:00
2015-07-05 07:17:02 +00:00
@@ -1417,7 +1540,7 @@ init_funcs (void)
2009-11-18 14:48:00 +00:00
}
else
{
- h = h_next + col_sep_length;
+ h = h_next + col_sep_width;
h_next = h + chars_per_column;
}
}
2015-07-05 07:17:02 +00:00
@@ -1708,9 +1831,9 @@ static void
2009-11-18 14:48:00 +00:00
align_column (COLUMN *p)
{
padding_not_printed = p->start_position;
- if (padding_not_printed - col_sep_length > 0)
+ if (padding_not_printed - col_sep_width > 0)
2009-10-07 08:11:44 +00:00
{
2009-11-18 14:48:00 +00:00
- pad_across_to (padding_not_printed - col_sep_length);
+ pad_across_to (padding_not_printed - col_sep_width);
padding_not_printed = ANYWHERE;
2009-10-07 08:11:44 +00:00
}
2009-11-18 14:48:00 +00:00
2015-07-05 07:17:02 +00:00
@@ -1981,13 +2104,13 @@ store_char (char c)
2009-11-18 14:48:00 +00:00
/* May be too generous. */
buff = X2REALLOC (buff, &buff_allocated);
}
- buff[buff_current++] = c;
+ buff[buff_current++] = (unsigned char) c;
2009-10-07 08:11:44 +00:00
}
static void
2009-11-18 14:48:00 +00:00
add_line_number (COLUMN *p)
2009-10-07 08:11:44 +00:00
{
2009-11-18 14:48:00 +00:00
- int i;
2009-10-07 08:11:44 +00:00
+ int i, j;
2009-11-18 14:48:00 +00:00
char *s;
2013-02-15 16:50:47 +00:00
int num_width;
2009-10-07 08:11:44 +00:00
2015-07-05 07:17:02 +00:00
@@ -2004,22 +2127,24 @@ add_line_number (COLUMN *p)
2009-11-18 14:48:00 +00:00
/* Tabification is assumed for multiple columns, also for n-separators,
2012-03-26 16:23:32 +00:00
but 'default n-separator = TAB' hasn't been given priority over
2009-11-18 14:48:00 +00:00
equal column_width also specified by POSIX. */
- if (number_separator == '\t')
+ if (number_separator[0] == '\t')
{
i = number_width - chars_per_number;
while (i-- > 0)
(p->char_func) (' ');
}
else
- (p->char_func) (number_separator);
+ for (j = 0; j < number_separator_length; j++)
+ (p->char_func) (number_separator[j]);
2009-10-07 08:11:44 +00:00
}
else
2009-11-18 14:48:00 +00:00
/* To comply with POSIX, we avoid any expansion of default TAB
separator with a single column output. No column_width requirement
has to be considered. */
2009-10-07 08:11:44 +00:00
{
2009-11-18 14:48:00 +00:00
- (p->char_func) (number_separator);
- if (number_separator == '\t')
+ for (j = 0; j < number_separator_length; j++)
+ (p->char_func) (number_separator[j]);
+ if (number_separator[0] == '\t')
output_position = POS_AFTER_TAB (chars_per_output_tab,
output_position);
2009-10-07 08:11:44 +00:00
}
2015-07-05 07:17:02 +00:00
@@ -2180,7 +2305,7 @@ print_white_space (void)
2009-11-18 14:48:00 +00:00
while (goal - h_old > 1
&& (h_new = POS_AFTER_TAB (chars_per_output_tab, h_old)) <= goal)
2009-10-07 08:11:44 +00:00
{
2009-11-18 14:48:00 +00:00
- putchar (output_tab_char);
+ fwrite (output_tab_char, sizeof(char), output_tab_char_length, stdout);
h_old = h_new;
2009-10-07 08:11:44 +00:00
}
2009-11-18 14:48:00 +00:00
while (++h_old <= goal)
2015-07-05 07:17:02 +00:00
@@ -2200,6 +2325,7 @@ print_sep_string (void)
2009-11-18 14:48:00 +00:00
{
char *s;
int l = col_sep_length;
+ int not_space_flag;
2009-10-07 08:11:44 +00:00
2009-11-18 14:48:00 +00:00
s = col_sep_string;
2009-10-07 08:11:44 +00:00
2015-07-05 07:17:02 +00:00
@@ -2213,6 +2339,7 @@ print_sep_string (void)
2009-10-07 08:11:44 +00:00
{
2009-11-18 14:48:00 +00:00
for (; separators_not_printed > 0; --separators_not_printed)
2009-10-07 08:11:44 +00:00
{
2009-11-18 14:48:00 +00:00
+ not_space_flag = 0;
while (l-- > 0)
{
/* 3 types of sep_strings: spaces only, spaces and chars,
2015-07-05 07:17:02 +00:00
@@ -2226,12 +2353,15 @@ print_sep_string (void)
2009-11-18 14:48:00 +00:00
}
else
{
+ not_space_flag = 1;
if (spaces_not_printed > 0)
print_white_space ();
putchar (*s++);
- ++output_position;
}
}
+ if (not_space_flag)
+ output_position += col_sep_width;
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
/* sep_string ends with some spaces */
if (spaces_not_printed > 0)
print_white_space ();
2015-07-05 07:17:02 +00:00
@@ -2259,7 +2389,7 @@ print_clump (COLUMN *p, int n, char *clu
2009-11-18 14:48:00 +00:00
required number of tabs and spaces. */
static void
-print_char (char c)
+print_char_single (char c)
{
if (tabify_output)
{
2015-07-05 07:17:02 +00:00
@@ -2283,6 +2413,74 @@ print_char (char c)
2009-11-18 14:48:00 +00:00
putchar (c);
}
+#ifdef HAVE_MBRTOWC
+static void
+print_char_multi (char c)
2009-10-07 08:11:44 +00:00
+{
2009-11-18 14:48:00 +00:00
+ static size_t mbc_pos = 0;
+ static char mbc[MB_LEN_MAX] = {'\0'};
+ static mbstate_t state = {'\0'};
+ mbstate_t state_bak;
+ wchar_t wc;
+ size_t mblength;
+ int width;
+
+ if (tabify_output)
2009-10-07 08:11:44 +00:00
+ {
2009-11-18 14:48:00 +00:00
+ state_bak = state;
+ mbc[mbc_pos++] = c;
+ mblength = mbrtowc (&wc, mbc, mbc_pos, &state);
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ while (mbc_pos > 0)
2009-10-07 08:11:44 +00:00
+ {
2009-11-18 14:48:00 +00:00
+ switch (mblength)
+ {
+ case (size_t)-2:
+ state = state_bak;
+ return;
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ case (size_t)-1:
+ state = state_bak;
+ ++output_position;
+ putchar (mbc[0]);
+ memmove (mbc, mbc + 1, MB_CUR_MAX - 1);
+ --mbc_pos;
+ break;
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ case 0:
+ mblength = 1;
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ default:
+ if (wc == L' ')
+ {
+ memmove (mbc, mbc + mblength, MB_CUR_MAX - mblength);
+ --mbc_pos;
+ ++spaces_not_printed;
+ return;
+ }
+ else if (spaces_not_printed > 0)
+ print_white_space ();
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ /* Nonprintables are assumed to have width 0, except L'\b'. */
+ if ((width = wcwidth (wc)) < 1)
2009-10-07 08:11:44 +00:00
+ {
2009-11-18 14:48:00 +00:00
+ if (wc == L'\b')
+ --output_position;
2009-10-07 08:11:44 +00:00
+ }
2009-11-18 14:48:00 +00:00
+ else
+ output_position += width;
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ fwrite (mbc, sizeof(char), mblength, stdout);
+ memmove (mbc, mbc + mblength, MB_CUR_MAX - mblength);
+ mbc_pos -= mblength;
+ }
2009-10-07 08:11:44 +00:00
+ }
2009-11-18 14:48:00 +00:00
+ return;
2009-10-07 08:11:44 +00:00
+ }
2009-11-18 14:48:00 +00:00
+ putchar (c);
2009-10-07 08:11:44 +00:00
+}
2009-11-18 14:48:00 +00:00
+#endif
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
/* Skip to page PAGE before printing.
PAGE may be larger than total number of pages. */
2015-07-05 07:17:02 +00:00
@@ -2462,9 +2660,9 @@ read_line (COLUMN *p)
2009-11-18 14:48:00 +00:00
align_empty_cols = false;
}
- if (padding_not_printed - col_sep_length > 0)
+ if (padding_not_printed - col_sep_width > 0)
{
- pad_across_to (padding_not_printed - col_sep_length);
+ pad_across_to (padding_not_printed - col_sep_width);
padding_not_printed = ANYWHERE;
}
2015-07-05 07:17:02 +00:00
@@ -2534,7 +2732,7 @@ print_stored (COLUMN *p)
2014-01-04 21:48:09 +00:00
int i;
int line = p->current_line++;
- char *first = &buff[line_vector[line]];
+ unsigned char *first = &buff[line_vector[line]];
/* FIXME
UMR: Uninitialized memory read:
* This is occurring while in:
2015-07-05 07:17:02 +00:00
@@ -2546,7 +2744,7 @@ print_stored (COLUMN *p)
2014-01-04 21:48:09 +00:00
xmalloc [xmalloc.c:94]
init_store_cols [pr.c:1648]
*/
- char *last = &buff[line_vector[line + 1]];
+ unsigned char *last = &buff[line_vector[line + 1]];
pad_vertically = true;
2015-07-05 07:17:02 +00:00
@@ -2565,9 +2763,9 @@ print_stored (COLUMN *p)
2009-11-18 14:48:00 +00:00
}
}
- if (padding_not_printed - col_sep_length > 0)
+ if (padding_not_printed - col_sep_width > 0)
{
- pad_across_to (padding_not_printed - col_sep_length);
+ pad_across_to (padding_not_printed - col_sep_width);
padding_not_printed = ANYWHERE;
}
2015-07-05 07:17:02 +00:00
@@ -2580,8 +2778,8 @@ print_stored (COLUMN *p)
2009-11-18 14:48:00 +00:00
if (spaces_not_printed == 0)
{
output_position = p->start_position + end_vector[line];
- if (p->start_position - col_sep_length == chars_per_margin)
- output_position -= col_sep_length;
+ if (p->start_position - col_sep_width == chars_per_margin)
+ output_position -= col_sep_width;
}
return true;
2015-07-05 07:17:02 +00:00
@@ -2600,7 +2798,7 @@ print_stored (COLUMN *p)
2009-11-18 14:48:00 +00:00
number of characters is 1.) */
static int
-char_to_clump (char c)
+char_to_clump_single (char c)
{
unsigned char uc = c;
char *s = clump_buff;
2015-07-05 07:17:02 +00:00
@@ -2610,10 +2808,10 @@ char_to_clump (char c)
2009-11-18 14:48:00 +00:00
int chars;
int chars_per_c = 8;
- if (c == input_tab_char)
+ if (c == input_tab_char[0])
chars_per_c = chars_per_input_tab;
- if (c == input_tab_char || c == '\t')
+ if (c == input_tab_char[0] || c == '\t')
{
width = TAB_WIDTH (chars_per_c, input_position);
2015-07-05 07:17:02 +00:00
@@ -2694,6 +2892,164 @@ char_to_clump (char c)
2009-11-18 14:48:00 +00:00
return chars;
}
+#ifdef HAVE_MBRTOWC
+static int
+char_to_clump_multi (char c)
2009-10-07 08:11:44 +00:00
+{
2009-11-18 14:48:00 +00:00
+ static size_t mbc_pos = 0;
+ static char mbc[MB_LEN_MAX] = {'\0'};
+ static mbstate_t state = {'\0'};
+ mbstate_t state_bak;
+ wchar_t wc;
+ size_t mblength;
+ int wc_width;
+ register char *s = clump_buff;
+ register int i, j;
+ char esc_buff[4];
+ int width;
+ int chars;
+ int chars_per_c = 8;
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ state_bak = state;
+ mbc[mbc_pos++] = c;
+ mblength = mbrtowc (&wc, mbc, mbc_pos, &state);
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ width = 0;
+ chars = 0;
+ while (mbc_pos > 0)
2009-10-07 08:11:44 +00:00
+ {
2009-11-18 14:48:00 +00:00
+ switch (mblength)
2009-10-07 08:11:44 +00:00
+ {
2009-11-18 14:48:00 +00:00
+ case (size_t)-2:
+ state = state_bak;
+ return 0;
+
+ case (size_t)-1:
+ state = state_bak;
+ mblength = 1;
+
+ if (use_esc_sequence || use_cntrl_prefix)
2009-10-07 08:11:44 +00:00
+ {
2009-11-18 14:48:00 +00:00
+ width = +4;
+ chars = +4;
+ *s++ = '\\';
2012-01-16 14:38:16 +00:00
+ sprintf (esc_buff, "%03o", (unsigned char) mbc[0]);
2009-11-18 14:48:00 +00:00
+ for (i = 0; i <= 2; ++i)
+ *s++ = (int) esc_buff[i];
2009-10-07 08:11:44 +00:00
+ }
2009-11-18 14:48:00 +00:00
+ else
2009-10-07 08:11:44 +00:00
+ {
2009-11-18 14:48:00 +00:00
+ width += 1;
+ chars += 1;
+ *s++ = mbc[0];
2009-10-07 08:11:44 +00:00
+ }
2009-11-18 14:48:00 +00:00
+ break;
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ case 0:
+ mblength = 1;
+ /* Fall through */
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ default:
+ if (memcmp (mbc, input_tab_char, mblength) == 0)
+ chars_per_c = chars_per_input_tab;
+
+ if (memcmp (mbc, input_tab_char, mblength) == 0 || c == '\t')
2009-10-07 08:11:44 +00:00
+ {
2009-11-18 14:48:00 +00:00
+ int width_inc;
+
+ width_inc = TAB_WIDTH (chars_per_c, input_position);
+ width += width_inc;
+
+ if (untabify_input)
+ {
+ for (i = width_inc; i; --i)
+ *s++ = ' ';
+ chars += width_inc;
+ }
+ else
+ {
+ for (i = 0; i < mblength; i++)
+ *s++ = mbc[i];
+ chars += mblength;
+ }
2009-10-07 08:11:44 +00:00
+ }
2009-11-18 14:48:00 +00:00
+ else if ((wc_width = wcwidth (wc)) < 1)
2009-10-07 08:11:44 +00:00
+ {
2009-11-18 14:48:00 +00:00
+ if (use_esc_sequence)
+ {
+ for (i = 0; i < mblength; i++)
+ {
+ width += 4;
+ chars += 4;
+ *s++ = '\\';
2012-01-16 14:38:16 +00:00
+ sprintf (esc_buff, "%03o", (unsigned char) mbc[i]);
2009-11-18 14:48:00 +00:00
+ for (j = 0; j <= 2; ++j)
+ *s++ = (int) esc_buff[j];
+ }
+ }
+ else if (use_cntrl_prefix)
+ {
+ if (wc < 0200)
+ {
+ width += 2;
+ chars += 2;
+ *s++ = '^';
+ *s++ = wc ^ 0100;
+ }
+ else
+ {
+ for (i = 0; i < mblength; i++)
+ {
+ width += 4;
+ chars += 4;
+ *s++ = '\\';
2012-01-16 14:38:16 +00:00
+ sprintf (esc_buff, "%03o", (unsigned char) mbc[i]);
2009-11-18 14:48:00 +00:00
+ for (j = 0; j <= 2; ++j)
+ *s++ = (int) esc_buff[j];
+ }
+ }
+ }
+ else if (wc == L'\b')
2009-10-07 08:11:44 +00:00
+ {
2009-11-18 14:48:00 +00:00
+ width += -1;
+ chars += 1;
+ *s++ = c;
2009-10-07 08:11:44 +00:00
+ }
2009-11-18 14:48:00 +00:00
+ else
2009-10-07 08:11:44 +00:00
+ {
2009-11-18 14:48:00 +00:00
+ width += 0;
+ chars += mblength;
+ for (i = 0; i < mblength; i++)
+ *s++ = mbc[i];
2009-10-07 08:11:44 +00:00
+ }
+ }
+ else
+ {
2009-11-18 14:48:00 +00:00
+ width += wc_width;
+ chars += mblength;
+ for (i = 0; i < mblength; i++)
+ *s++ = mbc[i];
2009-10-07 08:11:44 +00:00
+ }
+ }
2009-11-18 14:48:00 +00:00
+ memmove (mbc, mbc + mblength, MB_CUR_MAX - mblength);
+ mbc_pos -= mblength;
2009-10-07 08:11:44 +00:00
+ }
+
2013-08-15 11:26:11 +00:00
+ /* Too many backspaces must put us in position 0 -- never negative. */
+ if (width < 0 && input_position == 0)
+ {
+ chars = 0;
+ input_position = 0;
+ }
+ else if (width < 0 && input_position <= -width)
+ input_position = 0;
+ else
+ input_position += width;
+
2009-11-18 14:48:00 +00:00
+ return chars;
2009-10-07 08:11:44 +00:00
+}
2009-11-18 14:48:00 +00:00
+#endif
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
/* We've just printed some files and need to clean up things before
looking for more options and printing the next batch of files.
2015-07-05 07:17:02 +00:00
diff -urNp coreutils-8.24-orig/src/sort.c coreutils-8.24/src/sort.c
--- coreutils-8.24-orig/src/sort.c 2015-06-26 19:05:22.000000000 +0200
+++ coreutils-8.24/src/sort.c 2015-07-05 09:04:33.032546980 +0200
2012-08-12 22:14:40 +00:00
@@ -29,6 +29,14 @@
2009-11-18 14:48:00 +00:00
#include <sys/wait.h>
#include <signal.h>
2012-08-12 22:14:40 +00:00
#include <assert.h>
2009-11-18 14:48:00 +00:00
+#if HAVE_WCHAR_H
+# include <wchar.h>
+#endif
+/* Get isw* functions. */
+#if HAVE_WCTYPE_H
+# include <wctype.h>
+#endif
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
#include "system.h"
#include "argmatch.h"
#include "error.h"
2014-01-02 20:29:20 +00:00
@@ -164,14 +172,39 @@ static int decimal_point;
/* Thousands separator; if -1, then there isn't one. */
static int thousands_sep;
2009-11-18 14:48:00 +00:00
2014-01-02 20:29:20 +00:00
+/* True if -f is specified. */
+static bool folding;
+
2009-11-18 14:48:00 +00:00
/* Nonzero if the corresponding locales are hard. */
static bool hard_LC_COLLATE;
2013-08-14 17:01:16 +00:00
-#if HAVE_NL_LANGINFO
2013-12-16 16:48:21 +00:00
+#if HAVE_LANGINFO_CODESET
2009-11-18 14:48:00 +00:00
static bool hard_LC_TIME;
#endif
#define NONZERO(x) ((x) != 0)
+/* get a multibyte character's byte length. */
+#define GET_BYTELEN_OF_CHAR(LIM, PTR, MBLENGTH, STATE) \
+ do \
+ { \
+ wchar_t wc; \
+ mbstate_t state_bak; \
+ \
+ state_bak = STATE; \
+ mblength = mbrtowc (&wc, PTR, LIM - PTR, &STATE); \
+ \
+ switch (MBLENGTH) \
+ { \
+ case (size_t)-1: \
+ case (size_t)-2: \
+ STATE = state_bak; \
+ /* Fall through. */ \
+ case 0: \
+ MBLENGTH = 1; \
+ } \
+ } \
+ while (0)
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
/* The kind of blanks for '-b' to skip in various options. */
enum blanktype { bl_start, bl_end, bl_both };
2014-01-02 20:29:20 +00:00
@@ -345,13 +378,11 @@ static bool reverse;
2009-11-18 14:48:00 +00:00
they were read if all keys compare equal. */
static bool stable;
-/* If TAB has this value, blanks separate fields. */
-enum { TAB_DEFAULT = CHAR_MAX + 1 };
-
-/* Tab character separating fields. If TAB_DEFAULT, then fields are
2013-12-16 16:48:21 +00:00
+/* Tab character separating fields. If tab_length is 0, then fields are
2009-11-18 14:48:00 +00:00
separated by the empty string between a non-blank character and a blank
character. */
2013-12-16 16:48:21 +00:00
-static int tab = TAB_DEFAULT;
2009-11-18 14:48:00 +00:00
+static char tab[MB_LEN_MAX + 1];
+static size_t tab_length = 0;
/* Flag to remove consecutive duplicate lines from the output.
Only the last of a sequence of equal lines will be output. */
2015-07-05 07:17:02 +00:00
@@ -810,6 +841,46 @@ reap_all (void)
2010-12-23 13:13:28 +00:00
reap (-1);
2009-11-18 14:48:00 +00:00
}
+/* Function pointers. */
2009-10-07 08:11:44 +00:00
+static void
2009-11-18 14:48:00 +00:00
+(*inittables) (void);
+static char *
+(*begfield) (const struct line*, const struct keyfield *);
+static char *
+(*limfield) (const struct line*, const struct keyfield *);
2010-10-26 16:53:51 +00:00
+static void
2010-11-03 11:16:08 +00:00
+(*skipblanks) (char **ptr, char *lim);
2009-11-18 14:48:00 +00:00
+static int
2010-10-20 12:03:53 +00:00
+(*getmonth) (char const *, size_t, char **);
2009-11-18 14:48:00 +00:00
+static int
+(*keycompare) (const struct line *, const struct line *);
+static int
+(*numcompare) (const char *, const char *);
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+/* Test for white space multibyte character.
+ Set LENGTH the byte length of investigated multibyte character. */
+#if HAVE_MBRTOWC
+static int
+ismbblank (const char *str, size_t len, size_t *length)
2009-10-07 08:11:44 +00:00
+{
2009-11-18 14:48:00 +00:00
+ size_t mblength;
+ wchar_t wc;
+ mbstate_t state;
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ memset (&state, '\0', sizeof(mbstate_t));
+ mblength = mbrtowc (&wc, str, len, &state);
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ if (mblength == (size_t)-1 || mblength == (size_t)-2)
2009-10-07 08:11:44 +00:00
+ {
2009-11-18 14:48:00 +00:00
+ *length = 1;
+ return 0;
2009-10-07 08:11:44 +00:00
+ }
+
2009-11-18 14:48:00 +00:00
+ *length = (mblength < 1) ? 1 : mblength;
+ return iswblank (wc);
2009-10-07 08:11:44 +00:00
+}
2009-11-18 14:48:00 +00:00
+#endif
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
/* Clean up any remaining temporary files. */
static void
2015-07-05 07:17:02 +00:00
@@ -1254,7 +1325,7 @@ zaptemp (char const *name)
2009-11-18 14:48:00 +00:00
free (node);
}
2013-08-14 17:01:16 +00:00
-#if HAVE_NL_LANGINFO
2013-12-16 16:48:21 +00:00
+#if HAVE_LANGINFO_CODESET
2009-11-18 14:48:00 +00:00
static int
2010-10-20 12:03:53 +00:00
struct_month_cmp (void const *m1, void const *m2)
2015-07-05 07:17:02 +00:00
@@ -1269,7 +1340,7 @@ struct_month_cmp (void const *m1, void c
2009-11-18 14:48:00 +00:00
/* Initialize the character class tables. */
static void
2013-08-14 17:01:16 +00:00
-inittables (void)
2013-12-16 16:48:21 +00:00
+inittables_uni (void)
2009-11-18 14:48:00 +00:00
{
size_t i;
2015-07-05 07:17:02 +00:00
@@ -1281,7 +1352,7 @@ inittables (void)
2009-11-18 14:48:00 +00:00
fold_toupper[i] = toupper (i);
}
2013-08-14 17:01:16 +00:00
-#if HAVE_NL_LANGINFO
2013-12-16 16:48:21 +00:00
+#if HAVE_LANGINFO_CODESET
2009-11-18 14:48:00 +00:00
/* If we're not in the "C" locale, read different names for months. */
if (hard_LC_TIME)
{
2015-07-05 07:17:02 +00:00
@@ -1363,6 +1434,84 @@ specify_nmerge (int oi, char c, char con
2009-11-18 14:48:00 +00:00
xstrtol_fatal (e, oi, c, long_options, s);
}
+#if HAVE_MBRTOWC
2009-10-07 08:11:44 +00:00
+static void
2009-11-18 14:48:00 +00:00
+inittables_mb (void)
2009-10-07 08:11:44 +00:00
+{
2009-11-18 14:48:00 +00:00
+ int i, j, k, l;
2010-11-03 11:16:08 +00:00
+ char *name, *s, *lc_time, *lc_ctype;
2009-11-18 14:48:00 +00:00
+ size_t s_len, mblength;
+ char mbc[MB_LEN_MAX];
+ wchar_t wc, pwc;
+ mbstate_t state_mb, state_wc;
2009-10-07 08:11:44 +00:00
+
2010-11-03 11:16:08 +00:00
+ lc_time = setlocale (LC_TIME, "");
+ if (lc_time)
+ lc_time = xstrdup (lc_time);
+
+ lc_ctype = setlocale (LC_CTYPE, "");
+ if (lc_ctype)
+ lc_ctype = xstrdup (lc_ctype);
+
+ if (lc_time && lc_ctype)
+ /* temporarily set LC_CTYPE to match LC_TIME, so that we can convert
+ * the names of months to upper case */
+ setlocale (LC_CTYPE, lc_time);
+
2009-11-18 14:48:00 +00:00
+ for (i = 0; i < MONTHS_PER_YEAR; i++)
2009-10-07 08:11:44 +00:00
+ {
2009-11-18 14:48:00 +00:00
+ s = (char *) nl_langinfo (ABMON_1 + i);
+ s_len = strlen (s);
+ monthtab[i].name = name = (char *) xmalloc (s_len + 1);
+ monthtab[i].val = i + 1;
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ memset (&state_mb, '\0', sizeof (mbstate_t));
+ memset (&state_wc, '\0', sizeof (mbstate_t));
+
+ for (j = 0; j < s_len;)
2009-10-07 08:11:44 +00:00
+ {
2009-11-18 14:48:00 +00:00
+ if (!ismbblank (s + j, s_len - j, &mblength))
+ break;
+ j += mblength;
2009-10-07 08:11:44 +00:00
+ }
+
2009-11-18 14:48:00 +00:00
+ for (k = 0; j < s_len;)
2009-10-07 08:11:44 +00:00
+ {
2009-11-18 14:48:00 +00:00
+ mblength = mbrtowc (&wc, (s + j), (s_len - j), &state_mb);
+ assert (mblength != (size_t)-1 && mblength != (size_t)-2);
+ if (mblength == 0)
+ break;
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ pwc = towupper (wc);
+ if (pwc == wc)
2009-10-07 08:11:44 +00:00
+ {
2009-11-18 14:48:00 +00:00
+ memcpy (mbc, s + j, mblength);
+ j += mblength;
2009-10-07 08:11:44 +00:00
+ }
+ else
+ {
2009-11-18 14:48:00 +00:00
+ j += mblength;
+ mblength = wcrtomb (mbc, pwc, &state_wc);
+ assert (mblength != (size_t)0 && mblength != (size_t)-1);
2009-10-07 08:11:44 +00:00
+ }
+
2009-11-18 14:48:00 +00:00
+ for (l = 0; l < mblength; l++)
+ name[k++] = mbc[l];
2009-10-07 08:11:44 +00:00
+ }
2009-11-18 14:48:00 +00:00
+ name[k] = '\0';
2009-10-07 08:11:44 +00:00
+ }
2009-11-18 14:48:00 +00:00
+ qsort ((void *) monthtab, MONTHS_PER_YEAR,
+ sizeof (struct month), struct_month_cmp);
2010-11-03 11:16:08 +00:00
+
+ if (lc_time && lc_ctype)
+ /* restore the original locales */
+ setlocale (LC_CTYPE, lc_ctype);
+
+ free (lc_ctype);
+ free (lc_time);
2009-11-18 14:48:00 +00:00
+}
+#endif
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
/* Specify the amount of main memory to use when sorting. */
static void
specify_sort_size (int oi, char c, char const *s)
2015-07-05 07:17:02 +00:00
@@ -1596,7 +1745,7 @@ buffer_linelim (struct buffer const *buf
2009-11-18 14:48:00 +00:00
by KEY in LINE. */
static char *
2013-08-14 17:01:16 +00:00
-begfield (struct line const *line, struct keyfield const *key)
2013-12-16 16:48:21 +00:00
+begfield_uni (const struct line *line, const struct keyfield *key)
2009-11-18 14:48:00 +00:00
{
char *ptr = line->text, *lim = ptr + line->length - 1;
size_t sword = key->sword;
2015-07-05 07:17:02 +00:00
@@ -1605,10 +1754,10 @@ begfield (struct line const *line, struc
2009-11-18 14:48:00 +00:00
/* The leading field separator itself is included in a field when -t
is absent. */
2013-08-14 17:01:16 +00:00
- if (tab != TAB_DEFAULT)
2013-12-16 16:48:21 +00:00
+ if (tab_length)
2009-11-18 14:48:00 +00:00
while (ptr < lim && sword--)
{
2013-08-14 17:01:16 +00:00
- while (ptr < lim && *ptr != tab)
2013-12-16 16:48:21 +00:00
+ while (ptr < lim && *ptr != tab[0])
2009-11-18 14:48:00 +00:00
++ptr;
if (ptr < lim)
++ptr;
2015-07-05 07:17:02 +00:00
@@ -1634,11 +1783,70 @@ begfield (struct line const *line, struc
2009-11-18 14:48:00 +00:00
return ptr;
}
+#if HAVE_MBRTOWC
+static char *
+begfield_mb (const struct line *line, const struct keyfield *key)
+{
+ int i;
+ char *ptr = line->text, *lim = ptr + line->length - 1;
+ size_t sword = key->sword;
+ size_t schar = key->schar;
+ size_t mblength;
+ mbstate_t state;
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ memset (&state, '\0', sizeof(mbstate_t));
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ if (tab_length)
+ while (ptr < lim && sword--)
+ {
+ while (ptr < lim && memcmp (ptr, tab, tab_length) != 0)
+ {
+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
+ ptr += mblength;
+ }
+ if (ptr < lim)
+ {
+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
+ ptr += mblength;
+ }
+ }
+ else
+ while (ptr < lim && sword--)
+ {
+ while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength))
+ ptr += mblength;
+ if (ptr < lim)
+ {
+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
+ ptr += mblength;
+ }
+ while (ptr < lim && !ismbblank (ptr, lim - ptr, &mblength))
+ ptr += mblength;
+ }
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ if (key->skipsblanks)
+ while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength))
+ ptr += mblength;
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ for (i = 0; i < schar; i++)
+ {
+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ if (ptr + mblength > lim)
+ break;
+ else
+ ptr += mblength;
+ }
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ return ptr;
2009-10-07 08:11:44 +00:00
+}
2009-11-18 14:48:00 +00:00
+#endif
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
/* Return the limit of (a pointer to the first character after) the field
in LINE specified by KEY. */
static char *
2013-08-14 17:01:16 +00:00
-limfield (struct line const *line, struct keyfield const *key)
2013-12-16 16:48:21 +00:00
+limfield_uni (const struct line *line, const struct keyfield *key)
2009-11-18 14:48:00 +00:00
{
char *ptr = line->text, *lim = ptr + line->length - 1;
size_t eword = key->eword, echar = key->echar;
2015-07-05 07:17:02 +00:00
@@ -1653,10 +1861,10 @@ limfield (struct line const *line, struc
2012-03-26 16:23:32 +00:00
'beginning' is the first character following the delimiting TAB.
Otherwise, leave PTR pointing at the first 'blank' character after
2009-11-18 14:48:00 +00:00
the preceding field. */
2013-08-14 17:01:16 +00:00
- if (tab != TAB_DEFAULT)
2013-12-16 16:48:21 +00:00
+ if (tab_length)
2009-11-18 14:48:00 +00:00
while (ptr < lim && eword--)
{
2013-08-14 17:01:16 +00:00
- while (ptr < lim && *ptr != tab)
2013-12-16 16:48:21 +00:00
+ while (ptr < lim && *ptr != tab[0])
2009-11-18 14:48:00 +00:00
++ptr;
if (ptr < lim && (eword || echar))
++ptr;
2015-07-05 07:17:02 +00:00
@@ -1702,10 +1910,10 @@ limfield (struct line const *line, struc
2009-11-18 14:48:00 +00:00
*/
/* Make LIM point to the end of (one byte past) the current field. */
2013-08-14 17:01:16 +00:00
- if (tab != TAB_DEFAULT)
2013-12-16 16:48:21 +00:00
+ if (tab_length)
2009-11-18 14:48:00 +00:00
{
char *newlim;
2013-08-14 17:01:16 +00:00
- newlim = memchr (ptr, tab, lim - ptr);
2013-12-16 16:48:21 +00:00
+ newlim = memchr (ptr, tab[0], lim - ptr);
2009-11-18 14:48:00 +00:00
if (newlim)
lim = newlim;
}
2015-07-05 07:17:02 +00:00
@@ -1736,6 +1944,130 @@ limfield (struct line const *line, struc
2009-11-18 14:48:00 +00:00
return ptr;
}
+#if HAVE_MBRTOWC
+static char *
+limfield_mb (const struct line *line, const struct keyfield *key)
2009-10-07 08:11:44 +00:00
+{
2009-11-18 14:48:00 +00:00
+ char *ptr = line->text, *lim = ptr + line->length - 1;
+ size_t eword = key->eword, echar = key->echar;
+ int i;
+ size_t mblength;
+ mbstate_t state;
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ if (echar == 0)
+ eword++; /* skip all of end field. */
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ memset (&state, '\0', sizeof(mbstate_t));
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ if (tab_length)
+ while (ptr < lim && eword--)
+ {
+ while (ptr < lim && memcmp (ptr, tab, tab_length) != 0)
+ {
+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
+ ptr += mblength;
+ }
+ if (ptr < lim && (eword | echar))
+ {
+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
+ ptr += mblength;
+ }
+ }
+ else
+ while (ptr < lim && eword--)
2009-10-07 08:11:44 +00:00
+ {
2009-11-18 14:48:00 +00:00
+ while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength))
+ ptr += mblength;
+ if (ptr < lim)
+ {
+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
+ ptr += mblength;
+ }
+ while (ptr < lim && !ismbblank (ptr, lim - ptr, &mblength))
+ ptr += mblength;
2009-10-07 08:11:44 +00:00
+ }
+
+
2009-11-18 14:48:00 +00:00
+# ifdef POSIX_UNSPECIFIED
+ /* Make LIM point to the end of (one byte past) the current field. */
+ if (tab_length)
2009-10-07 08:11:44 +00:00
+ {
2009-11-18 14:48:00 +00:00
+ char *newlim, *p;
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ newlim = NULL;
+ for (p = ptr; p < lim;)
+ {
+ if (memcmp (p, tab, tab_length) == 0)
+ {
+ newlim = p;
+ break;
+ }
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
+ p += mblength;
+ }
+ }
+ else
2009-10-07 08:11:44 +00:00
+ {
2009-11-18 14:48:00 +00:00
+ char *newlim;
+ newlim = ptr;
+
+ while (newlim < lim && ismbblank (newlim, lim - newlim, &mblength))
+ newlim += mblength;
+ if (ptr < lim)
2009-10-07 08:11:44 +00:00
+ {
2009-11-18 14:48:00 +00:00
+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
+ ptr += mblength;
2009-10-07 08:11:44 +00:00
+ }
2009-11-18 14:48:00 +00:00
+ while (newlim < lim && !ismbblank (newlim, lim - newlim, &mblength))
+ newlim += mblength;
+ lim = newlim;
2009-10-07 08:11:44 +00:00
+ }
2009-11-18 14:48:00 +00:00
+# endif
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ if (echar != 0)
2009-10-07 08:11:44 +00:00
+ {
2009-11-18 14:48:00 +00:00
+ /* If we're skipping leading blanks, don't start counting characters
+ * until after skipping past any leading blanks. */
2013-08-14 17:01:16 +00:00
+ if (key->skipeblanks)
2009-11-18 14:48:00 +00:00
+ while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength))
+ ptr += mblength;
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ memset (&state, '\0', sizeof(mbstate_t));
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ /* Advance PTR by ECHAR (if possible), but no further than LIM. */
+ for (i = 0; i < echar; i++)
+ {
+ GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ if (ptr + mblength > lim)
+ break;
+ else
+ ptr += mblength;
2009-10-07 08:11:44 +00:00
+ }
+ }
+
2009-11-18 14:48:00 +00:00
+ return ptr;
+}
+#endif
2010-10-26 16:53:51 +00:00
+
+static void
2010-11-03 11:16:08 +00:00
+skipblanks_uni (char **ptr, char *lim)
2010-10-26 16:53:51 +00:00
+{
+ while (*ptr < lim && blanks[to_uchar (**ptr)])
+ ++(*ptr);
+}
+
+#if HAVE_MBRTOWC
+static void
2010-11-03 11:16:08 +00:00
+skipblanks_mb (char **ptr, char *lim)
2010-10-26 16:53:51 +00:00
+{
+ size_t mblength;
+ while (*ptr < lim && ismbblank (*ptr, lim - *ptr, &mblength))
+ (*ptr) += mblength;
+}
+#endif
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
/* Fill BUF reading from FP, moving buf->left bytes from the end
of buf->buf to the beginning first. If EOF is reached and the
file wasn't terminated by a newline, supply one. Set up BUF's line
2015-07-05 07:17:02 +00:00
@@ -1822,8 +2154,22 @@ fillbuf (struct buffer *buf, FILE *fp, c
2009-11-18 14:48:00 +00:00
else
{
if (key->skipsblanks)
2013-12-16 16:48:21 +00:00
- while (blanks[to_uchar (*line_start)])
- line_start++;
2009-10-07 08:11:44 +00:00
+ {
2009-11-18 14:48:00 +00:00
+#if HAVE_MBRTOWC
+ if (MB_CUR_MAX > 1)
+ {
+ size_t mblength;
+ while (line_start < line->keylim &&
+ ismbblank (line_start,
+ line->keylim - line_start,
+ &mblength))
+ line_start += mblength;
+ }
+ else
+#endif
+ while (blanks[to_uchar (*line_start)])
+ line_start++;
2009-10-07 08:11:44 +00:00
+ }
2009-11-18 14:48:00 +00:00
line->keybeg = line_start;
}
}
2015-07-05 07:17:02 +00:00
@@ -1944,7 +2290,7 @@ human_numcompare (char const *a, char co
2009-11-18 14:48:00 +00:00
hideously fast. */
static int
2013-08-14 17:01:16 +00:00
-numcompare (char const *a, char const *b)
2013-12-16 16:48:21 +00:00
+numcompare_uni (const char *a, const char *b)
2009-11-18 14:48:00 +00:00
{
while (blanks[to_uchar (*a)])
a++;
2015-07-05 07:17:02 +00:00
@@ -1954,6 +2300,25 @@ numcompare (char const *a, char const *b
2010-10-20 12:03:53 +00:00
return strnumcmp (a, b, decimal_point, thousands_sep);
2009-11-18 14:48:00 +00:00
}
+#if HAVE_MBRTOWC
+static int
+numcompare_mb (const char *a, const char *b)
+{
+ size_t mblength, len;
+ len = strlen (a); /* okay for UTF-8 */
+ while (*a && ismbblank (a, len > MB_CUR_MAX ? MB_CUR_MAX : len, &mblength))
+ {
+ a += mblength;
+ len -= mblength;
+ }
+ len = strlen (b); /* okay for UTF-8 */
+ while (*b && ismbblank (b, len > MB_CUR_MAX ? MB_CUR_MAX : len, &mblength))
+ b += mblength;
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ return strnumcmp (a, b, decimal_point, thousands_sep);
+}
+#endif /* HAV_EMBRTOWC */
2009-10-07 08:11:44 +00:00
+
2011-10-12 14:13:12 +00:00
/* Work around a problem whereby the long double value returned by glibc's
strtold ("NaN", ...) contains uninitialized bits: clear all bytes of
A and B before calling strtold. FIXME: remove this function once
2015-07-05 07:17:02 +00:00
@@ -2004,7 +2369,7 @@ general_numcompare (char const *sa, char
2009-11-18 14:48:00 +00:00
Return 0 if the name in S is not recognized. */
static int
2013-08-14 17:01:16 +00:00
-getmonth (char const *month, char **ea)
2013-12-16 16:48:21 +00:00
+getmonth_uni (char const *month, size_t len, char **ea)
2009-11-18 14:48:00 +00:00
{
size_t lo = 0;
size_t hi = MONTHS_PER_YEAR;
2015-07-05 07:17:02 +00:00
@@ -2279,15 +2644,14 @@ debug_key (struct line const *line, stru
2010-12-23 13:13:28 +00:00
char saved = *lim;
*lim = '\0';
2010-10-26 16:53:51 +00:00
- while (blanks[to_uchar (*beg)])
- beg++;
2013-12-16 16:48:21 +00:00
+ skipblanks (&beg, lim);
2010-10-26 16:53:51 +00:00
2010-10-20 12:03:53 +00:00
char *tighter_lim = beg;
2011-02-04 19:34:45 +00:00
if (lim < beg)
tighter_lim = lim;
else if (key->month)
2013-08-14 17:01:16 +00:00
- getmonth (beg, &tighter_lim);
2013-12-16 16:48:21 +00:00
+ getmonth (beg, lim-beg, &tighter_lim);
2010-10-20 12:03:53 +00:00
else if (key->general_numeric)
ignore_value (strtold (beg, &tighter_lim));
else if (key->numeric || key->human_numeric)
2015-07-05 07:17:02 +00:00
@@ -2431,7 +2795,7 @@ key_warnings (struct keyfield const *gke
2010-10-20 12:03:53 +00:00
bool maybe_space_aligned = !hard_LC_COLLATE && default_key_compare (key)
&& !(key->schar || key->echar);
bool line_offset = key->eword == 0 && key->echar != 0; /* -k1.x,1.y */
2013-08-14 17:01:16 +00:00
- if (!gkey_only && tab == TAB_DEFAULT && !line_offset
2013-12-16 16:48:21 +00:00
+ if (!gkey_only && !tab_length && !line_offset
2010-10-20 12:03:53 +00:00
&& ((!key->skipsblanks && !(implicit_skip || maybe_space_aligned))
|| (!key->skipsblanks && key->schar)
|| (!key->skipeblanks && key->echar)))
2015-07-05 07:17:02 +00:00
@@ -2489,11 +2853,87 @@ key_warnings (struct keyfield const *gke
2012-03-26 16:23:32 +00:00
error (0, 0, _("option '-r' only applies to last-resort comparison"));
2009-11-18 14:48:00 +00:00
}
+#if HAVE_MBRTOWC
+static int
2010-10-20 12:03:53 +00:00
+getmonth_mb (const char *s, size_t len, char **ea)
2009-11-18 14:48:00 +00:00
+{
+ char *month;
+ register size_t i;
+ register int lo = 0, hi = MONTHS_PER_YEAR, result;
+ char *tmp;
+ size_t wclength, mblength;
+ const char **pp;
+ const wchar_t **wpp;
+ wchar_t *month_wcs;
+ mbstate_t state;
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ while (len > 0 && ismbblank (s, len, &mblength))
+ {
+ s += mblength;
+ len -= mblength;
+ }
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ if (len == 0)
+ return 0;
2009-09-12 09:28:49 +00:00
+
2013-01-23 09:04:52 +00:00
+ month = (char *) xmalloc (len + 1);
2007-01-09 19:29:30 +00:00
+
2013-01-23 09:04:52 +00:00
+ tmp = (char *) xmalloc (len + 1);
2009-11-18 14:48:00 +00:00
+ memcpy (tmp, s, len);
+ tmp[len] = '\0';
+ pp = (const char **)&tmp;
2013-01-23 09:04:52 +00:00
+ month_wcs = (wchar_t *) xmalloc ((len + 1) * sizeof (wchar_t));
2009-11-18 14:48:00 +00:00
+ memset (&state, '\0', sizeof(mbstate_t));
2007-01-09 19:29:30 +00:00
+
2009-11-18 14:48:00 +00:00
+ wclength = mbsrtowcs (month_wcs, pp, len + 1, &state);
2010-09-20 11:19:42 +00:00
+ if (wclength == (size_t)-1 || *pp != NULL)
+ error (SORT_FAILURE, 0, _("Invalid multibyte input %s."), quote(s));
2007-01-09 19:29:30 +00:00
+
2009-11-18 14:48:00 +00:00
+ for (i = 0; i < wclength; i++)
+ {
+ month_wcs[i] = towupper(month_wcs[i]);
+ if (iswblank (month_wcs[i]))
+ {
+ month_wcs[i] = L'\0';
2009-10-07 08:11:44 +00:00
+ break;
2009-11-18 14:48:00 +00:00
+ }
+ }
2007-01-09 19:29:30 +00:00
+
2009-11-18 14:48:00 +00:00
+ wpp = (const wchar_t **)&month_wcs;
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ mblength = wcsrtombs (month, wpp, len + 1, &state);
+ assert (mblength != (-1) && *wpp == NULL);
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ do
+ {
+ int ix = (lo + hi) / 2;
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ if (strncmp (month, monthtab[ix].name, strlen (monthtab[ix].name)) < 0)
+ hi = ix;
+ else
+ lo = ix;
+ }
+ while (hi - lo > 1);
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ result = (!strncmp (month, monthtab[lo].name, strlen (monthtab[lo].name))
+ ? monthtab[lo].val : 0);
2009-10-07 08:11:44 +00:00
+
2012-03-07 20:29:40 +00:00
+ if (ea && result)
2014-01-04 21:48:09 +00:00
+ *ea = (char*) s + strlen (monthtab[lo].name);
2012-03-07 20:29:40 +00:00
+
2013-01-23 09:04:52 +00:00
+ free (month);
+ free (tmp);
+ free (month_wcs);
+
2009-11-18 14:48:00 +00:00
+ return result;
+}
+#endif
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
/* Compare two lines A and B trying every key in sequence until there
are no more keys or a difference is found. */
static int
2013-08-14 17:01:16 +00:00
-keycompare (struct line const *a, struct line const *b)
2013-12-16 16:48:21 +00:00
+keycompare_uni (const struct line *a, const struct line *b)
2009-11-18 14:48:00 +00:00
{
struct keyfield *key = keylist;
2015-07-05 07:17:02 +00:00
@@ -2578,7 +3018,7 @@ keycompare (struct line const *a, struct
2010-10-20 12:03:53 +00:00
else if (key->human_numeric)
diff = human_numcompare (ta, tb);
else if (key->month)
2013-08-14 17:01:16 +00:00
- diff = getmonth (ta, NULL) - getmonth (tb, NULL);
2013-12-16 16:48:21 +00:00
+ diff = getmonth (ta, tlena, NULL) - getmonth (tb, tlenb, NULL);
2010-10-20 12:03:53 +00:00
else if (key->random)
diff = compare_random (ta, tlena, tb, tlenb);
else if (key->version)
2015-07-05 07:17:02 +00:00
@@ -2694,6 +3134,209 @@ keycompare (struct line const *a, struct
2009-11-18 14:48:00 +00:00
return key->reverse ? -diff : diff;
}
+#if HAVE_MBRTOWC
+static int
+keycompare_mb (const struct line *a, const struct line *b)
+{
+ struct keyfield *key = keylist;
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ /* For the first iteration only, the key positions have been
+ precomputed for us. */
+ char *texta = a->keybeg;
+ char *textb = b->keybeg;
+ char *lima = a->keylim;
+ char *limb = b->keylim;
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ size_t mblength_a, mblength_b;
+ wchar_t wc_a, wc_b;
+ mbstate_t state_a, state_b;
2009-10-07 08:11:44 +00:00
+
2013-08-14 17:01:16 +00:00
+ int diff = 0;
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ memset (&state_a, '\0', sizeof(mbstate_t));
+ memset (&state_b, '\0', sizeof(mbstate_t));
2013-08-14 17:01:16 +00:00
+ /* Ignore keys with start after end. */
+ if (a->keybeg - a->keylim > 0)
+ return 0;
2007-01-09 19:29:30 +00:00
+
2009-11-18 14:48:00 +00:00
+
+ /* Ignore and/or translate chars before comparing. */
+# define IGNORE_CHARS(NEW_LEN, LEN, TEXT, COPY, WC, MBLENGTH, STATE) \
+ do \
+ { \
+ wchar_t uwc; \
+ char mbc[MB_LEN_MAX]; \
+ mbstate_t state_wc; \
+ \
+ for (NEW_LEN = i = 0; i < LEN;) \
+ { \
+ mbstate_t state_bak; \
+ \
+ state_bak = STATE; \
+ MBLENGTH = mbrtowc (&WC, TEXT + i, LEN - i, &STATE); \
+ \
+ if (MBLENGTH == (size_t)-2 || MBLENGTH == (size_t)-1 \
+ || MBLENGTH == 0) \
+ { \
+ if (MBLENGTH == (size_t)-2 || MBLENGTH == (size_t)-1) \
+ STATE = state_bak; \
+ if (!ignore) \
2011-09-05 06:18:24 +00:00
+ COPY[NEW_LEN++] = TEXT[i]; \
+ i++; \
2009-11-18 14:48:00 +00:00
+ continue; \
+ } \
+ \
+ if (ignore) \
+ { \
+ if ((ignore == nonprinting && !iswprint (WC)) \
+ || (ignore == nondictionary \
+ && !iswalnum (WC) && !iswblank (WC))) \
+ { \
+ i += MBLENGTH; \
+ continue; \
+ } \
+ } \
+ \
+ if (translate) \
+ { \
+ \
+ uwc = towupper(WC); \
+ if (WC == uwc) \
+ { \
+ memcpy (mbc, TEXT + i, MBLENGTH); \
+ i += MBLENGTH; \
+ } \
+ else \
+ { \
+ i += MBLENGTH; \
+ WC = uwc; \
+ memset (&state_wc, '\0', sizeof (mbstate_t)); \
+ \
+ MBLENGTH = wcrtomb (mbc, WC, &state_wc); \
+ assert (MBLENGTH != (size_t)-1 && MBLENGTH != 0); \
+ } \
+ \
+ for (j = 0; j < MBLENGTH; j++) \
+ COPY[NEW_LEN++] = mbc[j]; \
+ } \
+ else \
+ for (j = 0; j < MBLENGTH; j++) \
+ COPY[NEW_LEN++] = TEXT[i++]; \
+ } \
+ COPY[NEW_LEN] = '\0'; \
+ } \
+ while (0)
2013-08-14 17:01:16 +00:00
+
+ /* Actually compare the fields. */
+
+ for (;;)
+ {
+ /* Find the lengths. */
+ size_t lena = lima <= texta ? 0 : lima - texta;
+ size_t lenb = limb <= textb ? 0 : limb - textb;
+
2014-10-01 12:49:30 +00:00
+ char enda IF_LINT (= 0);
+ char endb IF_LINT (= 0);
+
2013-08-14 17:01:16 +00:00
+ char const *translate = key->translate;
+ bool const *ignore = key->ignore;
+
+ if (ignore || translate)
+ {
2015-05-13 08:53:55 +00:00
+ char *copy_a = (char *) xmalloc ((lena + lenb) * MB_CUR_MAX + 2);
+ char *copy_b = copy_a + lena * MB_CUR_MAX + 1;
2013-08-14 17:01:16 +00:00
+ size_t new_len_a, new_len_b;
+ size_t i, j;
+
+ IGNORE_CHARS (new_len_a, lena, texta, copy_a,
+ wc_a, mblength_a, state_a);
+ IGNORE_CHARS (new_len_b, lenb, textb, copy_b,
+ wc_b, mblength_b, state_b);
+ texta = copy_a; textb = copy_b;
+ lena = new_len_a; lenb = new_len_b;
2009-10-07 08:11:44 +00:00
+ }
2014-10-01 12:49:30 +00:00
+ else
+ {
+ /* Use the keys in-place, temporarily null-terminated. */
+ enda = texta[lena]; texta[lena] = '\0';
+ endb = textb[lenb]; textb[lenb] = '\0';
+ }
2009-10-07 08:11:44 +00:00
+
2013-08-14 17:01:16 +00:00
+ if (key->random)
+ diff = compare_random (texta, lena, textb, lenb);
+ else if (key->numeric | key->general_numeric | key->human_numeric)
+ {
+ char savea = *lima, saveb = *limb;
+
+ *lima = *limb = '\0';
+ diff = (key->numeric ? numcompare (texta, textb)
+ : key->general_numeric ? general_numcompare (texta, textb)
+ : human_numcompare (texta, textb));
+ *lima = savea, *limb = saveb;
+ }
+ else if (key->version)
+ diff = filevercmp (texta, textb);
+ else if (key->month)
+ diff = getmonth (texta, lena, NULL) - getmonth (textb, lenb, NULL);
+ else if (lena == 0)
+ diff = - NONZERO (lenb);
+ else if (lenb == 0)
+ diff = 1;
2014-01-06 10:49:28 +00:00
+ else if (hard_LC_COLLATE && !folding)
+ {
2014-10-01 12:49:30 +00:00
+ diff = xmemcoll0 (texta, lena + 1, textb, lenb + 1);
2014-01-06 10:49:28 +00:00
+ }
2013-08-14 17:01:16 +00:00
+ else
2014-10-01 12:49:30 +00:00
+ {
+ diff = memcmp (texta, textb, MIN (lena, lenb));
+ if (diff == 0)
+ diff = lena < lenb ? -1 : lena != lenb;
+ }
2013-08-14 17:01:16 +00:00
+
+ if (ignore || translate)
+ free (texta);
2014-10-01 12:49:30 +00:00
+ else
+ {
+ texta[lena] = enda;
+ textb[lenb] = endb;
+ }
2013-08-14 17:01:16 +00:00
+
2009-11-18 14:48:00 +00:00
+ if (diff)
+ goto not_equal;
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ key = key->next;
+ if (! key)
+ break;
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ /* Find the beginning and limit of the next field. */
+ if (key->eword != -1)
+ lima = limfield (a, key), limb = limfield (b, key);
2009-10-07 08:11:44 +00:00
+ else
2009-11-18 14:48:00 +00:00
+ lima = a->text + a->length - 1, limb = b->text + b->length - 1;
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ if (key->sword != -1)
+ texta = begfield (a, key), textb = begfield (b, key);
+ else
2009-10-07 08:11:44 +00:00
+ {
2009-11-18 14:48:00 +00:00
+ texta = a->text, textb = b->text;
+ if (key->skipsblanks)
+ {
+ while (texta < lima && ismbblank (texta, lima - texta, &mblength_a))
+ texta += mblength_a;
+ while (textb < limb && ismbblank (textb, limb - textb, &mblength_b))
+ textb += mblength_b;
+ }
2009-10-07 08:11:44 +00:00
+ }
+ }
+
2009-11-18 14:48:00 +00:00
+not_equal:
2013-08-14 17:01:16 +00:00
+ if (key && key->reverse)
+ return -diff;
+ else
+ return diff;
2009-11-18 14:48:00 +00:00
+}
+#endif
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
/* Compare two lines A and B, returning negative, zero, or positive
depending on whether A compares less than, equal to, or greater than B. */
2015-07-05 07:17:02 +00:00
@@ -2721,7 +3364,7 @@ compare (struct line const *a, struct li
2013-08-14 17:01:16 +00:00
diff = - NONZERO (blen);
else if (blen == 0)
diff = 1;
- else if (hard_LC_COLLATE)
2014-01-02 20:29:20 +00:00
+ else if (hard_LC_COLLATE && !folding)
{
/* Note xmemcoll0 is a performance enhancement as
it will not unconditionally write '\0' after the
2015-07-05 07:17:02 +00:00
@@ -4120,6 +4763,7 @@ set_ordering (char const *s, struct keyf
2014-01-02 20:29:20 +00:00
break;
case 'f':
key->translate = fold_toupper;
+ folding = true;
break;
case 'g':
key->general_numeric = true;
2015-07-05 07:17:02 +00:00
@@ -4197,7 +4841,7 @@ main (int argc, char **argv)
2009-11-18 14:48:00 +00:00
initialize_exit_failure (SORT_FAILURE);
hard_LC_COLLATE = hard_locale (LC_COLLATE);
2013-08-14 17:01:16 +00:00
-#if HAVE_NL_LANGINFO
2013-12-16 16:48:21 +00:00
+#if HAVE_LANGINFO_CODESET
2009-11-18 14:48:00 +00:00
hard_LC_TIME = hard_locale (LC_TIME);
#endif
2015-07-05 07:17:02 +00:00
@@ -4218,6 +4862,29 @@ main (int argc, char **argv)
2009-11-18 14:48:00 +00:00
thousands_sep = -1;
}
+#if HAVE_MBRTOWC
+ if (MB_CUR_MAX > 1)
2009-10-07 08:11:44 +00:00
+ {
2009-11-18 14:48:00 +00:00
+ inittables = inittables_mb;
+ begfield = begfield_mb;
+ limfield = limfield_mb;
2010-10-26 16:53:51 +00:00
+ skipblanks = skipblanks_mb;
2009-11-18 14:48:00 +00:00
+ getmonth = getmonth_mb;
+ keycompare = keycompare_mb;
+ numcompare = numcompare_mb;
2009-10-07 08:11:44 +00:00
+ }
2009-11-18 14:48:00 +00:00
+ else
+#endif
2009-10-07 08:11:44 +00:00
+ {
2009-11-18 14:48:00 +00:00
+ inittables = inittables_uni;
+ begfield = begfield_uni;
+ limfield = limfield_uni;
2010-10-26 16:53:51 +00:00
+ skipblanks = skipblanks_uni;
2009-11-18 14:48:00 +00:00
+ getmonth = getmonth_uni;
+ keycompare = keycompare_uni;
+ numcompare = numcompare_uni;
2009-10-07 08:11:44 +00:00
+ }
+
2009-11-18 14:48:00 +00:00
have_read_stdin = false;
inittables ();
2015-07-05 07:17:02 +00:00
@@ -4492,13 +5159,34 @@ main (int argc, char **argv)
2009-11-18 14:48:00 +00:00
case 't':
{
2013-12-16 16:48:21 +00:00
- char newtab = optarg[0];
- if (! newtab)
2009-11-18 14:48:00 +00:00
+ char newtab[MB_LEN_MAX + 1];
+ size_t newtab_length = 1;
+ strncpy (newtab, optarg, MB_LEN_MAX);
+ if (! newtab[0])
error (SORT_FAILURE, 0, _("empty tab"));
2013-12-16 16:48:21 +00:00
- if (optarg[1])
2009-11-18 14:48:00 +00:00
+#if HAVE_MBRTOWC
+ if (MB_CUR_MAX > 1)
+ {
+ wchar_t wc;
+ mbstate_t state;
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ memset (&state, '\0', sizeof (mbstate_t));
+ newtab_length = mbrtowc (&wc, newtab, strnlen (newtab,
+ MB_LEN_MAX),
+ &state);
+ switch (newtab_length)
+ {
+ case (size_t) -1:
+ case (size_t) -2:
+ case 0:
+ newtab_length = 1;
+ }
+ }
+#endif
+ if (newtab_length == 1 && optarg[1])
{
if (STREQ (optarg, "\\0"))
2013-08-14 17:01:16 +00:00
- newtab = '\0';
2013-12-16 16:48:21 +00:00
+ newtab[0] = '\0';
2009-11-18 14:48:00 +00:00
else
{
2012-03-26 16:23:32 +00:00
/* Provoke with 'sort -txx'. Complain about
2015-07-05 07:17:02 +00:00
@@ -4509,9 +5197,12 @@ main (int argc, char **argv)
2009-11-18 14:48:00 +00:00
quote (optarg));
}
}
2013-12-16 16:48:21 +00:00
- if (tab != TAB_DEFAULT && tab != newtab)
2009-11-18 14:48:00 +00:00
+ if (tab_length
+ && (tab_length != newtab_length
+ || memcmp (tab, newtab, tab_length) != 0))
error (SORT_FAILURE, 0, _("incompatible tabs"));
2013-12-16 16:48:21 +00:00
- tab = newtab;
2009-11-18 14:48:00 +00:00
+ memcpy (tab, newtab, newtab_length);
+ tab_length = newtab_length;
}
break;
2015-07-05 07:17:02 +00:00
diff -urNp coreutils-8.24-orig/src/unexpand.c coreutils-8.24/src/unexpand.c
--- coreutils-8.24-orig/src/unexpand.c 2015-06-26 19:05:22.000000000 +0200
+++ coreutils-8.24/src/unexpand.c 2015-07-05 09:04:33.032546980 +0200
2012-03-26 16:23:32 +00:00
@@ -38,12 +38,29 @@
2007-01-09 19:29:30 +00:00
#include <stdio.h>
#include <getopt.h>
#include <sys/types.h>
+
+/* Get mbstate_t, mbrtowc(), wcwidth(). */
+#if HAVE_WCHAR_H
+# include <wchar.h>
+#endif
+
#include "system.h"
#include "error.h"
2010-10-20 12:03:53 +00:00
#include "fadvise.h"
2007-01-09 19:29:30 +00:00
#include "quote.h"
#include "xstrndup.h"
+/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
+ installation; work around this configuration error. */
+#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
+# define MB_LEN_MAX 16
+#endif
+
+/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
+#if HAVE_MBRTOWC && defined mbstate_t
+# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
+#endif
+
2012-03-26 16:23:32 +00:00
/* The official name of this program (e.g., no 'g' prefix). */
2007-01-09 19:29:30 +00:00
#define PROGRAM_NAME "unexpand"
2013-02-20 15:33:47 +00:00
@@ -103,6 +120,210 @@ static struct option const longopts[] =
2007-01-09 19:29:30 +00:00
{NULL, 0, NULL, 0}
};
+static FILE *next_file (FILE *fp);
+
+#if HAVE_MBRTOWC
+static void
+unexpand_multibyte (void)
+{
+ FILE *fp; /* Input stream. */
+ mbstate_t i_state; /* Current shift state of the input stream. */
+ mbstate_t i_state_bak; /* Back up the I_STATE. */
+ mbstate_t o_state; /* Current shift state of the output stream. */
+ char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */
2011-03-14 08:43:52 +00:00
+ char *bufpos = buf; /* Next read position of BUF. */
2007-01-09 19:29:30 +00:00
+ size_t buflen = 0; /* The length of the byte sequence in buf. */
+ wint_t wc; /* A gotten wide character. */
+ size_t mblength; /* The byte size of a multibyte character
+ which shows as same character as WC. */
2013-02-20 15:33:47 +00:00
+ bool prev_tab = false;
2007-01-09 19:29:30 +00:00
+
+ /* Index in `tab_list' of next tabstop: */
+ int tab_index = 0; /* For calculating width of pending tabs. */
+ int print_tab_index = 0; /* For printing as many tabs as possible. */
+ unsigned int column = 0; /* Column on screen of next char. */
+ int next_tab_column; /* Column the next tab stop is on. */
+ int convert = 1; /* If nonzero, perform translations. */
+ unsigned int pending = 0; /* Pending columns of blanks. */
+
+ fp = next_file ((FILE *) NULL);
+ if (fp == NULL)
+ return;
+
+ memset (&o_state, '\0', sizeof(mbstate_t));
+ memset (&i_state, '\0', sizeof(mbstate_t));
+
+ for (;;)
+ {
+ if (buflen < MB_LEN_MAX && !feof(fp) && !ferror(fp))
+ {
+ memmove (buf, bufpos, buflen);
+ buflen += fread (buf + buflen, sizeof(char), BUFSIZ, fp);
+ bufpos = buf;
+ }
+
+ /* Get a wide character. */
+ if (buflen < 1)
+ {
+ mblength = 1;
+ wc = WEOF;
+ }
+ else
+ {
+ i_state_bak = i_state;
+ mblength = mbrtowc ((wchar_t *)&wc, bufpos, buflen, &i_state);
+ }
+
+ if (mblength == (size_t)-1 || mblength == (size_t)-2)
+ {
+ i_state = i_state_bak;
+ wc = L'\0';
+ }
+
+ if (wc == L' ' && convert && column < INT_MAX)
+ {
+ ++pending;
+ ++column;
+ }
+ else if (wc == L'\t' && convert)
+ {
+ if (tab_size == 0)
+ {
+ /* Do not let tab_index == first_free_tab;
+ stop when it is 1 less. */
+ while (tab_index < first_free_tab - 1
+ && column >= tab_list[tab_index])
+ tab_index++;
+ next_tab_column = tab_list[tab_index];
+ if (tab_index < first_free_tab - 1)
+ tab_index++;
+ if (column >= next_tab_column)
+ {
+ convert = 0; /* Ran out of tab stops. */
+ goto flush_pend_mb;
+ }
+ }
+ else
+ {
+ next_tab_column = column + tab_size - column % tab_size;
+ }
+ pending += next_tab_column - column;
+ column = next_tab_column;
+ }
+ else
+ {
+flush_pend_mb:
+ /* Flush pending spaces. Print as many tabs as possible,
+ then print the rest as spaces. */
2013-02-20 15:33:47 +00:00
+ if (pending == 1 && column != 1 && !prev_tab)
2007-01-09 19:29:30 +00:00
+ {
+ putchar (' ');
+ pending = 0;
+ }
+ column -= pending;
+ while (pending > 0)
+ {
+ if (tab_size == 0)
+ {
+ /* Do not let print_tab_index == first_free_tab;
+ stop when it is 1 less. */
+ while (print_tab_index < first_free_tab - 1
2004-09-09 03:58:39 +00:00
+ && column >= tab_list[print_tab_index])
+ print_tab_index++;
+ next_tab_column = tab_list[print_tab_index];
+ if (print_tab_index < first_free_tab - 1)
+ print_tab_index++;
+ }
+ else
+ {
+ next_tab_column =
+ column + tab_size - column % tab_size;
+ }
+ if (next_tab_column - column <= pending)
+ {
+ putchar ('\t');
+ pending -= next_tab_column - column;
+ column = next_tab_column;
+ }
+ else
+ {
2007-01-09 19:29:30 +00:00
+ --print_tab_index;
+ column += pending;
+ while (pending != 0)
+ {
+ putchar (' ');
+ pending--;
+ }
+ }
+ }
+
+ if (wc == WEOF)
+ {
+ fp = next_file (fp);
+ if (fp == NULL)
+ break; /* No more files. */
+ else
+ {
+ memset (&i_state, '\0', sizeof(mbstate_t));
+ continue;
+ }
+ }
+
+ if (mblength == (size_t)-1 || mblength == (size_t)-2)
+ {
+ if (convert)
+ {
+ ++column;
+ if (convert_entire_line == 0)
+ convert = 0;
+ }
+ mblength = 1;
+ putchar (buf[0]);
+ }
+ else if (mblength == 0)
+ {
2009-11-18 14:48:00 +00:00
+ if (convert && convert_entire_line == 0)
+ convert = 0;
+ mblength = 1;
+ putchar ('\0');
+ }
+ else
+ {
+ if (convert)
+ {
+ if (wc == L'\b')
+ {
+ if (column > 0)
+ --column;
+ }
+ else
+ {
+ int width; /* The width of WC. */
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ width = wcwidth (wc);
+ column += (width > 0) ? width : 0;
+ if (convert_entire_line == 0)
+ convert = 0;
+ }
+ }
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
+ if (wc == L'\n')
+ {
+ tab_index = print_tab_index = 0;
+ column = pending = 0;
+ convert = 1;
+ }
+ fwrite (bufpos, sizeof(char), mblength, stdout);
+ }
+ }
2013-02-20 15:33:47 +00:00
+ prev_tab = wc == L'\t';
2009-11-18 14:48:00 +00:00
+ buflen -= mblength;
+ bufpos += mblength;
2004-09-09 03:58:39 +00:00
+ }
+}
2009-11-18 14:48:00 +00:00
+#endif
2004-09-09 03:58:39 +00:00
+
2009-10-07 08:11:44 +00:00
+
2009-11-18 14:48:00 +00:00
void
usage (int status)
{
2013-12-16 16:48:21 +00:00
@@ -523,7 +744,12 @@ main (int argc, char **argv)
2009-11-18 14:48:00 +00:00
file_list = (optind < argc ? &argv[optind] : stdin_argv);
- unexpand ();
+#if HAVE_MBRTOWC
+ if (MB_CUR_MAX > 1)
+ unexpand_multibyte ();
2009-10-07 08:11:44 +00:00
+ else
2009-11-18 14:48:00 +00:00
+#endif
+ unexpand ();
if (have_read_stdin && fclose (stdin) != 0)
error (EXIT_FAILURE, errno, "-");
2015-07-05 07:17:02 +00:00
diff -urNp coreutils-8.24-orig/src/uniq.c coreutils-8.24/src/uniq.c
--- coreutils-8.24-orig/src/uniq.c 2015-06-26 19:04:19.000000000 +0200
+++ coreutils-8.24/src/uniq.c 2015-07-05 09:04:33.032546980 +0200
2014-01-08 12:58:02 +00:00
@@ -21,6 +21,17 @@
2009-10-07 08:11:44 +00:00
#include <getopt.h>
#include <sys/types.h>
2006-05-15 14:10:12 +00:00
2009-10-07 08:11:44 +00:00
+/* Get mbstate_t, mbrtowc(). */
+#if HAVE_WCHAR_H
+# include <wchar.h>
+#endif
+
+/* Get isw* functions. */
+#if HAVE_WCTYPE_H
+# include <wctype.h>
+#endif
2014-01-04 21:48:09 +00:00
+#include <assert.h>
2009-10-07 08:11:44 +00:00
+
#include "system.h"
#include "argmatch.h"
#include "linebuffer.h"
2014-01-04 21:48:09 +00:00
@@ -32,7 +43,19 @@
2009-11-27 13:24:09 +00:00
#include "stdio--.h"
2009-10-07 08:11:44 +00:00
#include "xmemcoll.h"
#include "xstrtol.h"
-#include "memcasecmp.h"
+#include "xmemcoll.h"
+
+/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
+ installation; work around this configuration error. */
+#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
+# define MB_LEN_MAX 16
+#endif
+
+/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
+#if HAVE_MBRTOWC && defined mbstate_t
+# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
+#endif
+
2006-05-15 14:10:12 +00:00
2012-03-26 16:23:32 +00:00
/* The official name of this program (e.g., no 'g' prefix). */
2009-10-07 08:11:44 +00:00
#define PROGRAM_NAME "uniq"
2014-01-08 12:58:02 +00:00
@@ -143,6 +166,10 @@ enum
2013-12-14 17:41:07 +00:00
GROUP_OPTION = CHAR_MAX + 1
};
2007-01-09 19:29:30 +00:00
2009-10-07 08:11:44 +00:00
+/* Function pointers. */
+static char *
+(*find_field) (struct linebuffer *line);
+
static struct option const longopts[] =
2007-01-09 19:29:30 +00:00
{
2009-10-07 08:11:44 +00:00
{"count", no_argument, NULL, 'c'},
2014-07-22 12:01:39 +00:00
@@ -251,7 +278,7 @@ size_opt (char const *opt, char const *m
2009-10-07 08:11:44 +00:00
return a pointer to the beginning of the line's field to be compared. */
2005-10-28 15:08:16 +00:00
2011-09-09 11:16:47 +00:00
static char * _GL_ATTRIBUTE_PURE
2009-10-07 08:11:44 +00:00
-find_field (struct linebuffer const *line)
+find_field_uni (struct linebuffer *line)
{
size_t count;
char const *lp = line->buffer;
2014-07-22 12:01:39 +00:00
@@ -271,6 +298,83 @@ find_field (struct linebuffer const *lin
2009-10-07 08:11:44 +00:00
return line->buffer + i;
2007-01-09 19:29:30 +00:00
}
2005-10-28 15:08:16 +00:00
2009-10-07 08:11:44 +00:00
+#if HAVE_MBRTOWC
+
+# define MBCHAR_TO_WCHAR(WC, MBLENGTH, LP, POS, SIZE, STATEP, CONVFAIL) \
+ do \
+ { \
+ mbstate_t state_bak; \
+ \
+ CONVFAIL = 0; \
+ state_bak = *STATEP; \
+ \
+ MBLENGTH = mbrtowc (&WC, LP + POS, SIZE - POS, STATEP); \
+ \
+ switch (MBLENGTH) \
+ { \
+ case (size_t)-2: \
+ case (size_t)-1: \
+ *STATEP = state_bak; \
+ CONVFAIL++; \
+ /* Fall through */ \
+ case 0: \
+ MBLENGTH = 1; \
+ } \
+ } \
+ while (0)
+
+static char *
+find_field_multi (struct linebuffer *line)
2005-10-28 15:08:16 +00:00
+{
2009-10-07 08:11:44 +00:00
+ size_t count;
+ char *lp = line->buffer;
+ size_t size = line->length - 1;
+ size_t pos;
2007-01-09 19:29:30 +00:00
+ size_t mblength;
2009-10-07 08:11:44 +00:00
+ wchar_t wc;
+ mbstate_t *statep;
2011-03-14 08:43:52 +00:00
+ int convfail = 0;
2009-10-07 08:11:44 +00:00
+
+ pos = 0;
+ statep = &(line->state);
+
+ /* skip fields. */
+ for (count = 0; count < skip_fields && pos < size; count++)
+ {
+ while (pos < size)
+ {
+ MBCHAR_TO_WCHAR (wc, mblength, lp, pos, size, statep, convfail);
+
+ if (convfail || !iswblank (wc))
+ {
+ pos += mblength;
+ break;
+ }
+ pos += mblength;
+ }
+
+ while (pos < size)
+ {
+ MBCHAR_TO_WCHAR (wc, mblength, lp, pos, size, statep, convfail);
+
+ if (!convfail && iswblank (wc))
+ break;
+
+ pos += mblength;
+ }
+ }
+
+ /* skip fields. */
+ for (count = 0; count < skip_chars && pos < size; count++)
+ {
+ MBCHAR_TO_WCHAR (wc, mblength, lp, pos, size, statep, convfail);
+ pos += mblength;
+ }
+
+ return lp + pos;
+}
+#endif
2005-10-28 15:08:16 +00:00
+
2009-10-07 08:11:44 +00:00
/* Return false if two strings OLD and NEW match, true if not.
OLD and NEW point not to the beginnings of the lines
but rather to the beginnings of the fields to compare.
2014-07-22 12:01:39 +00:00
@@ -279,6 +383,8 @@ find_field (struct linebuffer const *lin
2009-10-07 08:11:44 +00:00
static bool
different (char *old, char *new, size_t oldlen, size_t newlen)
{
+ char *copy_old, *copy_new;
2005-10-28 15:08:16 +00:00
+
2009-10-07 08:11:44 +00:00
if (check_chars < oldlen)
oldlen = check_chars;
if (check_chars < newlen)
2015-07-05 07:17:02 +00:00
@@ -286,15 +392,104 @@ different (char *old, char *new, size_t
2009-10-07 08:11:44 +00:00
if (ignore_case)
{
- /* FIXME: This should invoke strcoll somehow. */
- return oldlen != newlen || memcasecmp (old, new, oldlen);
+ size_t i;
2006-05-15 14:10:12 +00:00
+
2013-01-23 09:04:52 +00:00
+ copy_old = xmalloc (oldlen + 1);
+ copy_new = xmalloc (oldlen + 1);
2006-05-15 14:10:12 +00:00
+
2009-10-07 08:11:44 +00:00
+ for (i = 0; i < oldlen; i++)
+ {
+ copy_old[i] = toupper (old[i]);
+ copy_new[i] = toupper (new[i]);
+ }
2013-01-23 09:04:52 +00:00
+ bool rc = xmemcoll (copy_old, oldlen, copy_new, newlen);
+ free (copy_old);
+ free (copy_new);
+ return rc;
2009-10-07 08:11:44 +00:00
}
- else if (hard_LC_COLLATE)
- return xmemcoll (old, oldlen, new, newlen) != 0;
else
- return oldlen != newlen || memcmp (old, new, oldlen);
+ {
+ copy_old = (char *)old;
+ copy_new = (char *)new;
+ }
2006-05-15 14:10:12 +00:00
+
2009-10-07 08:11:44 +00:00
+ return xmemcoll (copy_old, oldlen, copy_new, newlen);
2013-01-23 09:04:52 +00:00
+
2015-07-05 07:17:02 +00:00
}
2009-10-07 08:11:44 +00:00
+#if HAVE_MBRTOWC
+static int
+different_multi (const char *old, const char *new, size_t oldlen, size_t newlen, mbstate_t oldstate, mbstate_t newstate)
+{
+ size_t i, j, chars;
+ const char *str[2];
+ char *copy[2];
+ size_t len[2];
+ mbstate_t state[2];
+ size_t mblength;
+ wchar_t wc, uwc;
+ mbstate_t state_bak;
2007-01-09 19:29:30 +00:00
+
2009-10-07 08:11:44 +00:00
+ str[0] = old;
+ str[1] = new;
+ len[0] = oldlen;
+ len[1] = newlen;
+ state[0] = oldstate;
+ state[1] = newstate;
2009-09-12 09:28:49 +00:00
+
2009-10-07 08:11:44 +00:00
+ for (i = 0; i < 2; i++)
+ {
2013-12-22 15:39:13 +00:00
+ copy[i] = xmalloc (len[i] + 1);
+ memset (copy[i], '\0', len[i] + 1);
2009-09-12 09:28:49 +00:00
+
2009-10-07 08:11:44 +00:00
+ for (j = 0, chars = 0; j < len[i] && chars < check_chars; chars++)
+ {
+ state_bak = state[i];
+ mblength = mbrtowc (&wc, str[i] + j, len[i] - j, &(state[i]));
+
+ switch (mblength)
2009-09-12 09:28:49 +00:00
+ {
2009-10-07 08:11:44 +00:00
+ case (size_t)-1:
+ case (size_t)-2:
+ state[i] = state_bak;
+ /* Fall through */
+ case 0:
+ mblength = 1;
+ break;
+
+ default:
+ if (ignore_case)
2009-09-12 09:28:49 +00:00
+ {
2009-10-07 08:11:44 +00:00
+ uwc = towupper (wc);
+
+ if (uwc != wc)
2009-09-12 09:28:49 +00:00
+ {
2009-10-07 08:11:44 +00:00
+ mbstate_t state_wc;
2014-01-04 21:48:09 +00:00
+ size_t mblen;
2009-10-07 08:11:44 +00:00
+
+ memset (&state_wc, '\0', sizeof(mbstate_t));
2014-01-04 21:48:09 +00:00
+ mblen = wcrtomb (copy[i] + j, uwc, &state_wc);
+ assert (mblen != (size_t)-1);
2009-09-12 09:28:49 +00:00
+ }
+ else
2009-10-07 08:11:44 +00:00
+ memcpy (copy[i] + j, str[i] + j, mblength);
2009-09-12 09:28:49 +00:00
+ }
+ else
2009-10-07 08:11:44 +00:00
+ memcpy (copy[i] + j, str[i] + j, mblength);
2009-09-12 09:28:49 +00:00
+ }
2009-10-07 08:11:44 +00:00
+ j += mblength;
+ }
+ copy[i][j] = '\0';
+ len[i] = j;
+ }
2013-01-23 09:04:52 +00:00
+ int rc = xmemcoll (copy[0], len[0], copy[1], len[1]);
+ free (copy[0]);
+ free (copy[1]);
+ return rc;
2009-10-07 08:11:44 +00:00
+
2015-07-05 07:17:02 +00:00
+}
2009-10-07 08:11:44 +00:00
+#endif
2015-07-05 07:17:02 +00:00
+
2009-10-07 08:11:44 +00:00
/* Output the line in linebuffer LINE to standard output
provided that the switches say it should be output.
2015-07-05 07:17:02 +00:00
MATCH is true if the line matches the previous line.
2014-07-22 12:01:39 +00:00
@@ -358,19 +553,38 @@ check_file (const char *infile, const ch
2010-10-20 12:03:53 +00:00
char *prevfield IF_LINT ( = NULL);
size_t prevlen IF_LINT ( = 0);
2013-12-14 17:41:07 +00:00
bool first_group_printed = false;
2009-10-07 08:11:44 +00:00
+#if HAVE_MBRTOWC
+ mbstate_t prevstate;
+
+ memset (&prevstate, '\0', sizeof (mbstate_t));
+#endif
while (!feof (stdin))
{
char *thisfield;
size_t thislen;
2013-12-14 17:41:07 +00:00
bool new_group;
2009-10-07 08:11:44 +00:00
+#if HAVE_MBRTOWC
+ mbstate_t thisstate;
+#endif
2013-12-14 17:41:07 +00:00
2009-10-07 08:11:44 +00:00
if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
break;
2013-12-14 17:41:07 +00:00
2009-10-07 08:11:44 +00:00
thisfield = find_field (thisline);
thislen = thisline->length - 1 - (thisfield - thisline->buffer);
+#if HAVE_MBRTOWC
2014-01-04 21:53:29 +00:00
+ if (MB_CUR_MAX > 1)
2009-10-07 08:11:44 +00:00
+ {
2014-01-04 21:53:29 +00:00
+ thisstate = thisline->state;
2013-12-14 17:41:07 +00:00
2014-01-04 21:53:29 +00:00
+ new_group = (prevline->length == 0
+ || different_multi (thisfield, prevfield,
+ thislen, prevlen,
+ thisstate, prevstate));
+ }
+ else
+#endif
2013-12-14 17:41:07 +00:00
new_group = (prevline->length == 0
|| different (thisfield, prevfield, thislen, prevlen));
2014-01-04 21:53:29 +00:00
2014-07-22 12:01:39 +00:00
@@ -388,6 +602,10 @@ check_file (const char *infile, const ch
2014-01-04 21:53:29 +00:00
SWAP_LINES (prevline, thisline);
prevfield = thisfield;
2013-12-16 16:48:21 +00:00
prevlen = thislen;
+#if HAVE_MBRTOWC
2014-01-04 21:53:29 +00:00
+ if (MB_CUR_MAX > 1)
+ prevstate = thisstate;
2013-12-16 16:48:21 +00:00
+#endif
2014-01-04 21:53:29 +00:00
first_group_printed = true;
}
2013-12-16 16:48:21 +00:00
}
2014-07-22 12:01:39 +00:00
@@ -400,17 +618,26 @@ check_file (const char *infile, const ch
2009-10-07 08:11:44 +00:00
size_t prevlen;
uintmax_t match_count = 0;
bool first_delimiter = true;
+#if HAVE_MBRTOWC
+ mbstate_t prevstate;
+#endif
if (readlinebuffer_delim (prevline, stdin, delimiter) == 0)
goto closefiles;
prevfield = find_field (prevline);
prevlen = prevline->length - 1 - (prevfield - prevline->buffer);
+#if HAVE_MBRTOWC
+ prevstate = prevline->state;
+#endif
while (!feof (stdin))
{
bool match;
char *thisfield;
size_t thislen;
+#if HAVE_MBRTOWC
2011-03-14 08:43:52 +00:00
+ mbstate_t thisstate = thisline->state;
2009-10-07 08:11:44 +00:00
+#endif
if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
{
if (ferror (stdin))
2014-07-22 12:01:39 +00:00
@@ -419,6 +646,14 @@ check_file (const char *infile, const ch
2009-10-07 08:11:44 +00:00
}
thisfield = find_field (thisline);
thislen = thisline->length - 1 - (thisfield - thisline->buffer);
+#if HAVE_MBRTOWC
+ if (MB_CUR_MAX > 1)
2009-09-12 09:28:49 +00:00
+ {
2009-10-07 08:11:44 +00:00
+ match = !different_multi (thisfield, prevfield,
+ thislen, prevlen, thisstate, prevstate);
2009-09-12 09:28:49 +00:00
+ }
2009-10-07 08:11:44 +00:00
+ else
2005-10-28 15:08:16 +00:00
+#endif
2009-10-07 08:11:44 +00:00
match = !different (thisfield, prevfield, thislen, prevlen);
match_count += match;
2006-05-15 14:10:12 +00:00
2014-07-22 12:01:39 +00:00
@@ -451,6 +686,9 @@ check_file (const char *infile, const ch
2009-10-07 08:11:44 +00:00
SWAP_LINES (prevline, thisline);
prevfield = thisfield;
prevlen = thislen;
+#if HAVE_MBRTOWC
+ prevstate = thisstate;
2006-05-15 14:10:12 +00:00
+#endif
2009-10-07 08:11:44 +00:00
if (!match)
match_count = 0;
}
2014-07-22 12:01:39 +00:00
@@ -497,6 +735,19 @@ main (int argc, char **argv)
2006-05-15 14:10:12 +00:00
2009-10-07 08:11:44 +00:00
atexit (close_stdout);
2006-05-15 14:10:12 +00:00
2009-10-07 08:11:44 +00:00
+#if HAVE_MBRTOWC
+ if (MB_CUR_MAX > 1)
+ {
+ find_field = find_field_multi;
+ }
+ else
2006-05-15 14:10:12 +00:00
+#endif
2009-10-07 08:11:44 +00:00
+ {
+ find_field = find_field_uni;
+ }
2005-10-28 15:08:16 +00:00
+
2006-05-15 14:10:12 +00:00
+
+
2009-10-07 08:11:44 +00:00
skip_chars = 0;
skip_fields = 0;
check_chars = SIZE_MAX;
2015-07-05 07:17:02 +00:00
diff -urNp coreutils-8.24-orig/tests/i18n/sort.sh coreutils-8.24/tests/i18n/sort.sh
--- coreutils-8.24-orig/tests/i18n/sort.sh 1970-01-01 01:00:00.000000000 +0100
+++ coreutils-8.24/tests/i18n/sort.sh 2015-07-05 09:04:33.032546980 +0200
@@ -0,0 +1,29 @@
+#!/bin/sh
+# Verify sort's multi-byte support.
+
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
+print_ver_ sort
+
+export LC_ALL=en_US.UTF-8
+locale -k LC_CTYPE | grep -q "charmap.*UTF-8" \
+ || skip_ "No UTF-8 locale available"
+
+# Enable heap consistency checkng on older systems
+export MALLOC_CHECK_=2
+
+
+# check buffer overflow issue due to
+# expanding multi-byte representation due to case conversion
+# https://bugzilla.suse.com/show_bug.cgi?id=928749
+cat <<EOF > exp
+.
+ɑ
+EOF
+cat <<EOF | sort -f > out || fail=1
+.
+ɑ
+EOF
+compare exp out || { fail=1; cat out; }
+
+
+Exit $fail
diff -urNp coreutils-8.24-orig/tests/local.mk coreutils-8.24/tests/local.mk
--- coreutils-8.24-orig/tests/local.mk 2015-07-05 09:00:46.526859558 +0200
+++ coreutils-8.24/tests/local.mk 2015-07-05 09:04:33.033546987 +0200
@@ -341,6 +341,8 @@ all_tests = \
2013-02-07 17:08:05 +00:00
tests/misc/sort-discrim.sh \
tests/misc/sort-files0-from.pl \
tests/misc/sort-float.sh \
+ tests/misc/sort-mb-tests.sh \
2015-05-13 08:53:55 +00:00
+ tests/i18n/sort.sh \
2013-02-07 17:08:05 +00:00
tests/misc/sort-merge.pl \
tests/misc/sort-merge-fdlimit.sh \
tests/misc/sort-month.sh \
2015-07-05 07:17:02 +00:00
diff -urNp coreutils-8.24-orig/tests/misc/cut.pl coreutils-8.24/tests/misc/cut.pl
--- coreutils-8.24-orig/tests/misc/cut.pl 2015-06-26 19:04:19.000000000 +0200
+++ coreutils-8.24/tests/misc/cut.pl 2015-07-05 09:04:33.033546987 +0200
2013-11-28 00:24:16 +00:00
@@ -23,9 +23,11 @@ use strict;
2011-09-09 11:16:47 +00:00
# Turn off localization of executable's output.
@ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3;
-my $mb_locale = $ENV{LOCALE_FR_UTF8};
2013-11-28 00:24:16 +00:00
+my $mb_locale;
+# uncommented enable multibyte paths
2013-11-28 00:35:10 +00:00
+$mb_locale = $ENV{LOCALE_FR_UTF8};
2013-11-28 00:24:16 +00:00
! defined $mb_locale || $mb_locale eq 'none'
2011-09-09 11:16:47 +00:00
- and $mb_locale = 'C';
2013-11-28 00:24:16 +00:00
+ and $mb_locale = 'C';
2011-09-09 11:16:47 +00:00
2009-10-07 08:11:44 +00:00
my $prog = 'cut';
2012-03-26 16:23:32 +00:00
my $try = "Try '$prog --help' for more information.\n";
2014-07-22 12:01:39 +00:00
@@ -227,6 +229,7 @@ if ($mb_locale ne 'C')
2013-11-28 00:35:10 +00:00
my @new_t = @$t;
my $test_name = shift @new_t;
+ next if ($test_name =~ "newline-[12][0-9]");
push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
}
push @Tests, @new;
2015-07-05 07:17:02 +00:00
diff -urNp coreutils-8.24-orig/tests/misc/expand.pl coreutils-8.24/tests/misc/expand.pl
--- coreutils-8.24-orig/tests/misc/expand.pl 2015-06-26 19:04:19.000000000 +0200
+++ coreutils-8.24/tests/misc/expand.pl 2015-07-05 09:04:33.033546987 +0200
2012-08-20 11:57:53 +00:00
@@ -23,6 +23,15 @@ use strict;
# Turn off localization of executable's output.
@ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3;
2013-11-28 00:24:16 +00:00
+#comment out next line to disable multibyte tests
2012-08-20 11:57:53 +00:00
+my $mb_locale = $ENV{LOCALE_FR_UTF8};
+! defined $mb_locale || $mb_locale eq 'none'
+ and $mb_locale = 'C';
+
+my $prog = 'expand';
+my $try = "Try \`$prog --help' for more information.\n";
+my $inval = "$prog: invalid byte, character or field list\n$try";
+
my @Tests =
(
['t1', '--tabs=3', {IN=>"a\tb"}, {OUT=>"a b"}],
@@ -31,6 +40,37 @@ my @Tests =
['i2', '--tabs=3 -i', {IN=>" \ta\tb"}, {OUT=>" a\tb"}],
);
+if ($mb_locale ne 'C')
+ {
+ # Duplicate each test vector, appending "-mb" to the test name and
+ # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
+ # provide coverage for the distro-added multi-byte code paths.
+ my @new;
+ foreach my $t (@Tests)
+ {
+ my @new_t = @$t;
+ my $test_name = shift @new_t;
+
+ # Depending on whether expand is multi-byte-patched,
+ # it emits different diagnostics:
+ # non-MB: invalid byte or field list
+ # MB: invalid byte, character or field list
+ # Adjust the expected error output accordingly.
+ if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
+ (@new_t))
+ {
+ my $sub = {ERR_SUBST => 's/, character//'};
+ push @new_t, $sub;
+ push @$t, $sub;
+ }
+ push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
+ }
+ push @Tests, @new;
+ }
+
+
+@Tests = triple_test \@Tests;
+
my $save_temps = $ENV{DEBUG};
my $verbose = $ENV{VERBOSE};
2015-07-05 07:17:02 +00:00
diff -urNp coreutils-8.24-orig/tests/misc/fold.pl coreutils-8.24/tests/misc/fold.pl
--- coreutils-8.24-orig/tests/misc/fold.pl 2015-06-26 19:04:19.000000000 +0200
+++ coreutils-8.24/tests/misc/fold.pl 2015-07-05 09:04:33.033546987 +0200
2013-11-28 00:24:16 +00:00
@@ -20,9 +20,18 @@ use strict;
(my $program_name = $0) =~ s|.*/||;
+my $prog = 'fold';
+my $try = "Try \`$prog --help' for more information.\n";
+my $inval = "$prog: invalid byte, character or field list\n$try";
+
# Turn off localization of executable's output.
@ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3;
+# uncommented to enable multibyte paths
+my $mb_locale = $ENV{LOCALE_FR_UTF8};
+! defined $mb_locale || $mb_locale eq 'none'
+ and $mb_locale = 'C';
+
my @Tests =
(
['s1', '-w2 -s', {IN=>"a\t"}, {OUT=>"a\n\t"}],
@@ -31,9 +40,48 @@ my @Tests =
['s4', '-w4 -s', {IN=>"abc ef\n"}, {OUT=>"abc \nef\n"}],
);
+# Add _POSIX2_VERSION=199209 to the environment of each test
+# that uses an old-style option like +1.
+if ($mb_locale ne 'C')
+ {
+ # Duplicate each test vector, appending "-mb" to the test name and
+ # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
+ # provide coverage for the distro-added multi-byte code paths.
+ my @new;
+ foreach my $t (@Tests)
+ {
+ my @new_t = @$t;
+ my $test_name = shift @new_t;
+
+ # Depending on whether fold is multi-byte-patched,
+ # it emits different diagnostics:
+ # non-MB: invalid byte or field list
+ # MB: invalid byte, character or field list
+ # Adjust the expected error output accordingly.
+ if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
+ (@new_t))
+ {
+ my $sub = {ERR_SUBST => 's/, character//'};
+ push @new_t, $sub;
+ push @$t, $sub;
+ }
+ push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
+ }
+ push @Tests, @new;
+ }
+
+@Tests = triple_test \@Tests;
+
+# Remember that triple_test creates from each test with exactly one "IN"
+# file two more tests (.p and .r suffix on name) corresponding to reading
+# input from a file and from a pipe. The pipe-reading test would fail
+# due to a race condition about 1 in 20 times.
+# Remove the IN_PIPE version of the "output-is-input" test above.
+# The others aren't susceptible because they have three inputs each.
+@Tests = grep {$_->[0] ne 'output-is-input.p'} @Tests;
+
my $save_temps = $ENV{DEBUG};
my $verbose = $ENV{VERBOSE};
-my $prog = 'fold';
my $fail = run_tests ($program_name, $prog, \@Tests, $save_temps, $verbose);
exit $fail;
2015-07-05 07:17:02 +00:00
diff -urNp coreutils-8.24-orig/tests/misc/join.pl coreutils-8.24/tests/misc/join.pl
--- coreutils-8.24-orig/tests/misc/join.pl 2015-06-26 19:04:19.000000000 +0200
+++ coreutils-8.24/tests/misc/join.pl 2015-07-05 09:04:33.033546987 +0200
2013-11-28 00:24:16 +00:00
@@ -25,6 +25,15 @@ my $limits = getlimits ();
my $prog = 'join';
+my $try = "Try \`$prog --help' for more information.\n";
+my $inval = "$prog: invalid byte, character or field list\n$try";
+
+my $mb_locale;
+#Comment out next line to disable multibyte tests
+$mb_locale = $ENV{LOCALE_FR_UTF8};
+! defined $mb_locale || $mb_locale eq 'none'
+ and $mb_locale = 'C';
+
my $delim = chr 0247;
sub t_subst ($)
{
2013-12-16 16:48:21 +00:00
@@ -326,8 +335,49 @@ foreach my $t (@tv)
2013-11-28 00:24:16 +00:00
push @Tests, $new_ent;
}
+# Add _POSIX2_VERSION=199209 to the environment of each test
+# that uses an old-style option like +1.
+if ($mb_locale ne 'C')
+ {
+ # Duplicate each test vector, appending "-mb" to the test name and
+ # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
+ # provide coverage for the distro-added multi-byte code paths.
+ my @new;
+ foreach my $t (@Tests)
+ {
+ my @new_t = @$t;
+ my $test_name = shift @new_t;
+
+ # Depending on whether join is multi-byte-patched,
+ # it emits different diagnostics:
+ # non-MB: invalid byte or field list
+ # MB: invalid byte, character or field list
+ # Adjust the expected error output accordingly.
+ if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
+ (@new_t))
+ {
+ my $sub = {ERR_SUBST => 's/, character//'};
+ push @new_t, $sub;
+ push @$t, $sub;
+ }
+ #Adjust the output some error messages including test_name for mb
+ if (grep {ref $_ eq 'HASH' && exists $_->{ERR}}
+ (@new_t))
+ {
+ my $sub2 = {ERR_SUBST => "s/$test_name-mb/$test_name/"};
+ push @new_t, $sub2;
+ push @$t, $sub2;
+ }
+ push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
+ }
+ push @Tests, @new;
+ }
+
@Tests = triple_test \@Tests;
+#skip invalid-j-mb test, it is failing because of the format
+@Tests = grep {$_->[0] ne 'invalid-j-mb'} @Tests;
+
my $save_temps = $ENV{DEBUG};
my $verbose = $ENV{VERBOSE};
2015-07-05 07:17:02 +00:00
diff -urNp coreutils-8.24-orig/tests/misc/sort-mb-tests.sh coreutils-8.24/tests/misc/sort-mb-tests.sh
--- coreutils-8.24-orig/tests/misc/sort-mb-tests.sh 1970-01-01 01:00:00.000000000 +0100
+++ coreutils-8.24/tests/misc/sort-mb-tests.sh 2015-07-05 09:04:33.034546995 +0200
2013-02-18 15:59:45 +00:00
@@ -0,0 +1,45 @@
+#!/bin/sh
+# Verify sort's multi-byte support.
2013-02-07 17:08:05 +00:00
+
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
2013-02-18 15:59:45 +00:00
+print_ver_ sort
2013-02-07 17:08:05 +00:00
+
2013-02-18 15:59:45 +00:00
+export LC_ALL=en_US.UTF-8
+locale -k LC_CTYPE | grep -q "charmap.*UTF-8" \
+ || skip_ "No UTF-8 locale available"
2013-02-07 17:08:05 +00:00
+
2009-09-12 09:28:49 +00:00
+
2013-02-18 15:59:45 +00:00
+cat <<EOF > exp
+Banana@ 5
+Apple@ 10
+Citrus@ 20
+Cherry@ 30
+EOF
+
+cat <<EOF | sort -t @ -k2 -n > out || fail=1
+Apple@ 10
+Banana@ 5
+Citrus@ 20
+Cherry@ 30
+EOF
+
+compare exp out || { fail=1; cat out; }
+
+
+cat <<EOF > exp
+Citrus@ A A 20@ @ 5
+Cherry@ A A 30@ @ 10
+Apple@ A A 10@ @ 20
+Banana@ A A 5@ @ 30
+EOF
+
+cat <<EOF | sort -t @ -k4 -n > out || fail=1
+Apple@ A A 10@ @ 20
+Banana@ A A 5@ @ 30
+Citrus@ A A 20@ @ 5
+Cherry@ A A 30@ @ 10
+EOF
+
+compare exp out || { fail=1; cat out; }
+
+Exit $fail
2015-07-05 07:17:02 +00:00
diff -urNp coreutils-8.24-orig/tests/misc/sort-merge.pl coreutils-8.24/tests/misc/sort-merge.pl
--- coreutils-8.24-orig/tests/misc/sort-merge.pl 2015-06-26 19:04:19.000000000 +0200
+++ coreutils-8.24/tests/misc/sort-merge.pl 2015-07-05 09:04:33.034546995 +0200
2013-11-28 00:24:16 +00:00
@@ -26,6 +26,15 @@ my $prog = 'sort';
# Turn off localization of executable's output.
@ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3;
+my $mb_locale;
+# uncommented according to upstream commit enabling multibyte paths
2013-11-28 00:35:10 +00:00
+$mb_locale = $ENV{LOCALE_FR_UTF8};
2013-11-28 00:24:16 +00:00
+! defined $mb_locale || $mb_locale eq 'none'
+ and $mb_locale = 'C';
+
+my $try = "Try \`$prog --help' for more information.\n";
+my $inval = "$prog: invalid byte, character or field list\n$try";
+
# three empty files and one that says 'foo'
my @inputs = (+(map{{IN=> {"empty$_"=> ''}}}1..3), {IN=> {foo=> "foo\n"}});
@@ -77,6 +86,39 @@ my @Tests =
{OUT=>$big_input}],
);
+# Add _POSIX2_VERSION=199209 to the environment of each test
+# that uses an old-style option like +1.
+if ($mb_locale ne 'C')
+ {
+ # Duplicate each test vector, appending "-mb" to the test name and
+ # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
+ # provide coverage for the distro-added multi-byte code paths.
+ my @new;
+ foreach my $t (@Tests)
+ {
+ my @new_t = @$t;
+ my $test_name = shift @new_t;
+
+ # Depending on whether sort is multi-byte-patched,
+ # it emits different diagnostics:
+ # non-MB: invalid byte or field list
+ # MB: invalid byte, character or field list
+ # Adjust the expected error output accordingly.
+ if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
+ (@new_t))
+ {
+ my $sub = {ERR_SUBST => 's/, character//'};
+ push @new_t, $sub;
+ push @$t, $sub;
+ }
2013-11-28 00:35:10 +00:00
+ next if ($test_name =~ "nmerge-.");
2013-11-28 00:24:16 +00:00
+ push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
+ }
+ push @Tests, @new;
+ }
+
+@Tests = triple_test \@Tests;
+
my $save_temps = $ENV{DEBUG};
my $verbose = $ENV{VERBOSE};
2015-07-05 07:17:02 +00:00
diff -urNp coreutils-8.24-orig/tests/misc/sort.pl coreutils-8.24/tests/misc/sort.pl
--- coreutils-8.24-orig/tests/misc/sort.pl 2015-06-26 19:04:19.000000000 +0200
+++ coreutils-8.24/tests/misc/sort.pl 2015-07-05 09:04:33.034546995 +0200
2013-11-28 00:24:16 +00:00
@@ -24,10 +24,15 @@ my $prog = 'sort';
# Turn off localization of executable's output.
@ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3;
-my $mb_locale = $ENV{LOCALE_FR_UTF8};
+my $mb_locale;
+#Comment out next line to disable multibyte tests
+$mb_locale = $ENV{LOCALE_FR_UTF8};
! defined $mb_locale || $mb_locale eq 'none'
and $mb_locale = 'C';
+my $try = "Try \`$prog --help' for more information.\n";
+my $inval = "$prog: invalid byte, character or field list\n$try";
+
# Since each test is run with a file name and with redirected stdin,
# the name in the diagnostic is either the file name or "-".
# Normalize each diagnostic to use '-'.
2015-07-05 07:17:02 +00:00
@@ -419,6 +428,38 @@ foreach my $t (@Tests)
2013-11-28 00:24:16 +00:00
}
}
2013-12-16 16:48:21 +00:00
2013-11-28 00:24:16 +00:00
+if ($mb_locale ne 'C')
+ {
+ # Duplicate each test vector, appending "-mb" to the test name and
+ # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
+ # provide coverage for the distro-added multi-byte code paths.
+ my @new;
+ foreach my $t (@Tests)
+ {
+ my @new_t = @$t;
+ my $test_name = shift @new_t;
+
+ # Depending on whether sort is multi-byte-patched,
+ # it emits different diagnostics:
+ # non-MB: invalid byte or field list
+ # MB: invalid byte, character or field list
+ # Adjust the expected error output accordingly.
+ if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
+ (@new_t))
+ {
+ my $sub = {ERR_SUBST => 's/, character//'};
+ push @new_t, $sub;
+ push @$t, $sub;
+ }
+ #disable several failing tests until investigation, disable all tests with envvars set
+ next if (grep {ref $_ eq 'HASH' && exists $_->{ENV}} (@new_t));
2014-01-02 20:29:20 +00:00
+ next if ($test_name =~ "18g" or $test_name =~ "sort-numeric" or $test_name =~ "08[ab]" or $test_name =~ "03[def]" or $test_name =~ "h4" or $test_name =~ "n1" or $test_name =~ "2[01]a");
2014-10-01 12:49:30 +00:00
+ next if ($test_name =~ "11[ab]"); # avoid FP: expected result differs to MB result due to collation rules.
2013-11-28 00:24:16 +00:00
+ push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
+ }
+ push @Tests, @new;
+ }
2013-12-16 16:48:21 +00:00
+
2013-11-28 00:24:16 +00:00
@Tests = triple_test \@Tests;
2013-12-16 16:48:21 +00:00
# Remember that triple_test creates from each test with exactly one "IN"
2015-07-05 07:17:02 +00:00
@@ -428,6 +469,7 @@ foreach my $t (@Tests)
2013-12-12 12:34:32 +00:00
# Remove the IN_PIPE version of the "output-is-input" test above.
# The others aren't susceptible because they have three inputs each.
@Tests = grep {$_->[0] ne 'output-is-input.p'} @Tests;
+@Tests = grep {$_->[0] ne 'output-is-input-mb.p'} @Tests;
my $save_temps = $ENV{DEBUG};
my $verbose = $ENV{VERBOSE};
2015-07-05 07:17:02 +00:00
diff -urNp coreutils-8.24-orig/tests/misc/unexpand.pl coreutils-8.24/tests/misc/unexpand.pl
--- coreutils-8.24-orig/tests/misc/unexpand.pl 2015-06-26 19:04:19.000000000 +0200
+++ coreutils-8.24/tests/misc/unexpand.pl 2015-07-05 09:04:33.034546995 +0200
2013-11-28 00:24:16 +00:00
@@ -27,6 +27,14 @@ my $limits = getlimits ();
my $prog = 'unexpand';
+# comment out next line to disable multibyte tests
+my $mb_locale = $ENV{LOCALE_FR_UTF8};
+! defined $mb_locale || $mb_locale eq 'none'
+ and $mb_locale = 'C';
+
+my $try = "Try \`$prog --help' for more information.\n";
+my $inval = "$prog: invalid byte, character or field list\n$try";
+
my @Tests =
(
['a1', {IN=> ' 'x 1 ."y\n"}, {OUT=> ' 'x 1 ."y\n"}],
@@ -92,6 +100,37 @@ my @Tests =
{EXIT => 1}, {ERR => "$prog: tab stop value is too large\n"}],
);
+if ($mb_locale ne 'C')
+ {
+ # Duplicate each test vector, appending "-mb" to the test name and
+ # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
+ # provide coverage for the distro-added multi-byte code paths.
+ my @new;
+ foreach my $t (@Tests)
+ {
+ my @new_t = @$t;
+ my $test_name = shift @new_t;
+
+ # Depending on whether unexpand is multi-byte-patched,
+ # it emits different diagnostics:
+ # non-MB: invalid byte or field list
+ # MB: invalid byte, character or field list
+ # Adjust the expected error output accordingly.
+ if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
+ (@new_t))
+ {
+ my $sub = {ERR_SUBST => 's/, character//'};
+ push @new_t, $sub;
+ push @$t, $sub;
+ }
+ next if ($test_name =~ 'b-1');
+ push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
+ }
+ push @Tests, @new;
+ }
+
+@Tests = triple_test \@Tests;
+
my $save_temps = $ENV{DEBUG};
my $verbose = $ENV{VERBOSE};
2015-07-05 07:17:02 +00:00
diff -urNp coreutils-8.24-orig/tests/misc/uniq.pl coreutils-8.24/tests/misc/uniq.pl
--- coreutils-8.24-orig/tests/misc/uniq.pl 2015-06-26 19:04:19.000000000 +0200
+++ coreutils-8.24/tests/misc/uniq.pl 2015-07-05 09:04:33.035547002 +0200
2013-11-28 00:24:16 +00:00
@@ -23,9 +23,17 @@ my $limits = getlimits ();
my $prog = 'uniq';
my $try = "Try '$prog --help' for more information.\n";
+my $inval = "$prog: invalid byte, character or field list\n$try";
+
# Turn off localization of executable's output.
@ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3;
+my $mb_locale;
+#Comment out next line to disable multibyte tests
+$mb_locale = $ENV{LOCALE_FR_UTF8};
+! defined $mb_locale || $mb_locale eq 'none'
+ and $mb_locale = 'C';
+
# When possible, create a "-z"-testing variant of each test.
sub add_z_variants($)
{
2014-01-08 13:55:05 +00:00
@@ -261,6 +269,53 @@ foreach my $t (@Tests)
2013-11-28 00:24:16 +00:00
and push @$t, {ENV=>'_POSIX2_VERSION=199209'};
}
2013-12-16 16:48:21 +00:00
2013-11-28 00:24:16 +00:00
+if ($mb_locale ne 'C')
+ {
+ # Duplicate each test vector, appending "-mb" to the test name and
+ # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
+ # provide coverage for the distro-added multi-byte code paths.
+ my @new;
+ foreach my $t (@Tests)
+ {
+ my @new_t = @$t;
+ my $test_name = shift @new_t;
+
+ # Depending on whether uniq is multi-byte-patched,
+ # it emits different diagnostics:
+ # non-MB: invalid byte or field list
+ # MB: invalid byte, character or field list
+ # Adjust the expected error output accordingly.
+ if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
+ (@new_t))
+ {
+ my $sub = {ERR_SUBST => 's/, character//'};
+ push @new_t, $sub;
+ push @$t, $sub;
+ }
2014-01-04 21:52:24 +00:00
+ # In test #145, replace the each ‘ ...’ by '...'.
+ if ($test_name =~ "145")
+ {
+ my $sub = { ERR_SUBST => "s/‘ ([^’ ]+)’ /'\$1'/g"};
+ push @new_t, $sub;
+ push @$t, $sub;
+ }
+ next if ( $test_name =~ "schar"
+ or $test_name =~ "^obs-plus"
+ or $test_name =~ "119");
2013-11-28 00:24:16 +00:00
+ push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
+ }
+ push @Tests, @new;
+ }
2013-12-16 16:48:21 +00:00
+
2013-11-28 00:24:16 +00:00
+# Remember that triple_test creates from each test with exactly one "IN"
+# file two more tests (.p and .r suffix on name) corresponding to reading
+# input from a file and from a pipe. The pipe-reading test would fail
+# due to a race condition about 1 in 20 times.
+# Remove the IN_PIPE version of the "output-is-input" test above.
+# The others aren't susceptible because they have three inputs each.
+
+@Tests = grep {$_->[0] ne 'output-is-input.p'} @Tests;
+
@Tests = add_z_variants \@Tests;
@Tests = triple_test \@Tests;
2015-07-05 07:17:02 +00:00
diff -urNp coreutils-8.24-orig/tests/pr/pr-tests.pl coreutils-8.24/tests/pr/pr-tests.pl
--- coreutils-8.24-orig/tests/pr/pr-tests.pl 2015-06-26 19:04:19.000000000 +0200
+++ coreutils-8.24/tests/pr/pr-tests.pl 2015-07-05 09:04:33.035547002 +0200
@@ -24,6 +24,15 @@ use strict;
2013-11-28 00:24:16 +00:00
my $prog = 'pr';
2015-07-05 07:17:02 +00:00
my $normalize_strerror = "s/': .*/'/";
2013-11-28 00:24:16 +00:00
+my $mb_locale;
+#Uncomment the following line to enable multibyte tests
+$mb_locale = $ENV{LOCALE_FR_UTF8};
+! defined $mb_locale || $mb_locale eq 'none'
+ and $mb_locale = 'C';
+
+my $try = "Try \`$prog --help' for more information.\n";
+my $inval = "$prog: invalid byte, character or field list\n$try";
+
my @tv = (
# -b option is no longer an official option. But it's still working to
2015-07-05 07:17:02 +00:00
@@ -467,8 +476,48 @@ push @Tests,
2013-11-28 00:24:16 +00:00
{IN=>{3=>"x\ty\tz\n"}},
{OUT=>join("\t", qw(a b c m n o x y z)) . "\n"} ];
+# Add _POSIX2_VERSION=199209 to the environment of each test
+# that uses an old-style option like +1.
+if ($mb_locale ne 'C')
+ {
+ # Duplicate each test vector, appending "-mb" to the test name and
+ # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
+ # provide coverage for the distro-added multi-byte code paths.
+ my @new;
+ foreach my $t (@Tests)
+ {
+ my @new_t = @$t;
+ my $test_name = shift @new_t;
+
+ # Depending on whether pr is multi-byte-patched,
+ # it emits different diagnostics:
+ # non-MB: invalid byte or field list
+ # MB: invalid byte, character or field list
+ # Adjust the expected error output accordingly.
+ if (grep {ref $_ eq 'HASH' && exists $_->{ERR} && $_->{ERR} eq $inval}
+ (@new_t))
+ {
+ my $sub = {ERR_SUBST => 's/, character//'};
+ push @new_t, $sub;
+ push @$t, $sub;
+ }
+ #temporarily skip some failing tests
+ next if ($test_name =~ "col-0" or $test_name =~ "col-inval");
+ push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
+ }
+ push @Tests, @new;
+ }
+
@Tests = triple_test \@Tests;
+# Remember that triple_test creates from each test with exactly one "IN"
+# file two more tests (.p and .r suffix on name) corresponding to reading
+# input from a file and from a pipe. The pipe-reading test would fail
+# due to a race condition about 1 in 20 times.
+# Remove the IN_PIPE version of the "output-is-input" test above.
+# The others aren't susceptible because they have three inputs each.
+@Tests = grep {$_->[0] ne 'output-is-input.p'} @Tests;
+
my $save_temps = $ENV{DEBUG};
my $verbose = $ENV{VERBOSE};