coreutils/coreutils-i18n-cut.patch

586 lines
17 KiB
Diff
Raw Normal View History

--- coreutils-8.24/src/cut.c 2015-06-26 19:05:22.000000000 +0200
+++ cut.c 2016-01-15 10:15:04.863804121 +0100
@@ -28,6 +28,11 @@
#include <assert.h>
#include <getopt.h>
#include <sys/types.h>
+
+#include <mbfile.h>
+#include <mbiter.h>
+#include <string.h>
+
#include "system.h"
#include "error.h"
@@ -90,25 +95,16 @@ add_range_pair (size_t lo, size_t hi)
CURRENT_RP.HI then we make CURRENT_RP to point to the next range pair. */
static struct field_range_pair *current_rp;
-/* This buffer is used to support the semantics of the -s option
- (or lack of same) when the specified field list includes (does
- not include) the first field. In both of those cases, the entire
- first field must be read into this buffer to determine whether it
- is followed by a delimiter or a newline before any of it may be
- output. Otherwise, cut_fields can do the job without using this
- buffer. */
-static char *field_1_buffer;
-
-/* The number of bytes allocated for FIELD_1_BUFFER. */
-static size_t field_1_bufsize;
-
enum operating_mode
{
undefined_mode,
- /* Output characters that are in the given bytes. */
+ /* Output the given bytes. */
byte_mode,
+ /* Output characters that are in the given positions. */
+ char_mode,
+
/* Output the given delimiter-separated fields. */
field_mode
};
@@ -120,12 +116,16 @@ static enum operating_mode operating_mod
with field mode. */
static bool suppress_non_delimited;
+/* Unless true, we do not recognize multibyte characters in byte-splitting
+ mode. */
+static bool no_break_mb_chars;
+
/* If true, print all bytes, characters, or fields _except_
those that were specified. */
static bool complement;
/* The delimiter character for field mode. */
-static unsigned char delim;
+static mbf_char_t delim;
/* The delimiter for each line/record. */
static unsigned char line_delim = '\n';
@@ -135,7 +135,7 @@ static size_t output_delimiter_length;
/* The output field separator string. Defaults to the 1-character
string consisting of the input delimiter. */
-static char *output_delimiter_string;
+static char const *output_delimiter_string;
/* True if we have ever read standard input. */
static bool have_read_stdin;
@@ -189,7 +189,7 @@ Print selected parts of lines from each
-f, --fields=LIST select only these fields; also print any line\n\
that contains no delimiter character, unless\n\
the -s option is specified\n\
- -n (ignored)\n\
+ -n with -b, don't split multibyte characters\n\
"), stdout);
fputs (_("\
--complement complement the set of selected bytes, characters\n\
@@ -435,6 +435,12 @@ next_item (size_t *item_idx)
current_rp++;
}
+/* Advance *ITEM_IDX by N positions.  Each step goes through next_item,
+   so whatever bookkeeping next_item does on CURRENT_RP happens once per
+   position.  N == 0 is a no-op.  */
+static inline void
+next_item_n (size_t *item_idx, size_t n)
+{
+  for (size_t step = 0; step < n; step++)
+    next_item (item_idx);
+}
/* Return nonzero if the K'th field or byte is printable. */
static inline bool
@@ -443,6 +449,15 @@ print_kth (size_t k)
return current_rp->lo <= k;
}
+/* Return true if the inclusive span [LO, HI] overlaps the range pair that
+   CURRENT_RP points at.  Callers pass the position of a character's first
+   unit in LO and the position of its last unit in HI (not its size).  */
+static inline bool
+rp_intersect (size_t lo, size_t hi)
+{
+  /* Two closed intervals overlap exactly when each one starts no later
+     than the other ends.  Testing only whether LO or HI falls inside the
+     range would miss a range lying strictly between them, e.g. the range
+     3-3 against a character occupying positions 2-4.  */
+  return current_rp->lo <= hi && lo <= current_rp->hi;
+}
+
/* Return nonzero if K'th byte is the beginning of a range. */
static inline bool
@@ -505,23 +520,215 @@ cut_bytes (FILE *stream)
}
/* Read from stream STREAM, printing to standard output any selected fields. */
+/* Multibyte-character analogue of getndelim2.
+
+   Read characters from STREAM into the array *LINEPTR, whose capacity is
+   *LINESIZE elements, up to and including the first character equal to
+   DELIM1 or DELIM2, or until end of input.  If *LINEPTR is null a buffer
+   is allocated (and *LINESIZE is not read).  The buffer is grown as
+   needed but never beyond NMAX elements; characters that do not fit are
+   read and discarded.  On return through the normal exit paths *LINEPTR
+   and *LINESIZE describe the possibly relocated buffer, which the caller
+   owns and must free.
+
+   Return the number of characters stored, or -1 if memory cannot be
+   obtained or end of input is reached before any character is stored.
+   No terminating element is written, although one element of the buffer
+   is always left unused.  */
+extern ssize_t
+mb_getndelim2 (mbf_char_t **lineptr, size_t *linesize, size_t nmax,
+               mbf_char_t delim1, mbf_char_t delim2, mb_file_t *stream)
+{
+/* The maximum value that getndelim2 can return without suffering from
+   overflow problems, either internally (because of pointer
+   subtraction overflow) or due to the API (because of ssize_t).  */
+#define GETNDELIM2_MAXIMUM (PTRDIFF_MAX < SSIZE_MAX ? PTRDIFF_MAX : SSIZE_MAX)
+
+/* Try to add at least this many elements when extending the buffer.  */
+#define MIN_CHUNK 64
+
+  size_t nchars_avail;          /* Allocated but unused elements in PTR.  */
+  mbf_char_t *read_pos;         /* Where the next character is stored.  */
+  ssize_t chars_stored = -1;
+  mbf_char_t *ptr = *lineptr;
+  /* *LINESIZE is meaningful only when the caller supplied a buffer.  */
+  size_t size = ptr ? *linesize : 0;
+  bool found_delimiter = false;
+
+  if (!ptr)
+    {
+      size = nmax < MIN_CHUNK ? nmax : MIN_CHUNK;
+      ptr = malloc (size * sizeof *ptr);
+      if (!ptr)
+        return -1;
+    }
+
+  nchars_avail = size;
+  read_pos = ptr;
+
+  if (nchars_avail == 0 && nmax <= size)
+    goto done;
+
+  /* Treat an EOF-valued delimiter as absent by replacing it with the
+     other delimiter.  */
+  if (mb_iseof (delim1))
+    mb_copy (&delim1, &delim2);
+  else if (mb_iseof (delim2))
+    mb_copy (&delim2, &delim1);
+
+  /* Lock the underlying stdio stream, not the multibyte wrapper.
+     NOTE(review): assumes mb_file_t exposes the FILE it was initialized
+     with as its `fp' member -- confirm against gnulib's mbfile.h.  */
+  flockfile (stream->fp);
+
+  do
+    {
+      /* Here always ptr + size == read_pos + nchars_avail.  */
+      mbf_char_t c;
+
+      mbf_getc (c, *stream);
+      if (mb_iseof (c))
+        {
+          /* Return partial line, if any.  */
+          if (read_pos == ptr)
+            goto unlock_done;
+          break;
+        }
+      if (mb_equal (c, delim1) || mb_equal (c, delim2))
+        found_delimiter = true;
+
+      /* One element is always kept spare, so grow as soon as fewer than
+         two are free.  Waiting until none are free would drop the
+         character read while exactly one element remained.  */
+      if (nchars_avail < 2 && size < nmax)
+        {
+          size_t const max_elems = GETNDELIM2_MAXIMUM / sizeof *ptr;
+          size_t const used = read_pos - ptr;
+          /* Grow size proportionally, not linearly, to avoid O(n^2)
+             running time.  */
+          size_t newsize = size < MIN_CHUNK ? size + MIN_CHUNK : 2 * size;
+          mbf_char_t *newptr;
+
+          /* Respect nmax.  This handles possible integer overflow.  */
+          if (! (size < newsize && newsize <= nmax))
+            newsize = nmax;
+
+          /* Keep the byte count of the allocation representable.  */
+          if (max_elems < newsize)
+            {
+              if (max_elems <= size)
+                goto unlock_done;
+              newsize = max_elems;
+            }
+
+          /* Move the stored characters one by one with mb_copy instead
+             of letting realloc relocate them as raw bytes.
+             NOTE(review): relies on gnulib's mbchar.h, where a character
+             may point into its own inline buffer and so must not be
+             moved bitwise -- confirm.  */
+          newptr = malloc (newsize * sizeof *newptr);
+          if (!newptr)
+            goto unlock_done;
+          for (size_t i = 0; i < used; i++)
+            mb_copy (&newptr[i], &ptr[i]);
+          free (ptr);
+
+          ptr = newptr;
+          size = newsize;
+          read_pos = ptr + used;
+          nchars_avail = size - used;
+        }
+
+      /* Here, if size < nmax, nchars_avail >= 2.  Otherwise the buffer
+         is at its limit and the character is discarded.  */
+      if (1 < nchars_avail)
+        {
+          mb_copy (read_pos++, &c);
+          nchars_avail--;
+        }
+    }
+  while (!found_delimiter);
+
+  chars_stored = read_pos - ptr;
+
+ unlock_done:
+  funlockfile (stream->fp);
+
+ done:
+  *lineptr = ptr;
+  *linesize = size;
+  return chars_stored;
+}
+
+/* Read from stream STREAM, printing to standard output any selected
+   characters.  Input is decoded one multibyte character at a time and a
+   selected character is always written whole.
+
+   Positions in the range list are counted in bytes when
+   NO_BREAK_MB_CHARS is set (the -n option) and in characters otherwise.
+   A character is selected when any position it occupies falls in the
+   current range, as decided by rp_intersect.  */
+static void
+cut_chars (FILE *stream)
+{
+  size_t char_idx = 0;          /* Position reached in the current line.  */
+  bool print_delimiter = false;
+  mb_file_t mbf;
+
+  current_rp = frp;
+
+  mbf_init (mbf, stream);
+  while (true)
+    {
+      mbf_char_t c;
+
+      mbf_getc (c, mbf);
+
+      if (mb_iseq (c, line_delim))
+        {
+          /* End of record: emit the delimiter and start over.  */
+          putc (line_delim, stdout);
+          char_idx = 0;
+          print_delimiter = false;
+          current_rp = frp;
+        }
+      else if (mb_iseof (c))
+        {
+          /* Terminate a final record that lacks its delimiter.  */
+          if (char_idx > 0)
+            putc (line_delim, stdout);
+          break;
+        }
+      else
+        {
+          /* Number of list positions this character occupies: its byte
+             length when positions are bytes, otherwise exactly one.
+             Using the byte length in character mode would let a wide
+             character match ranges that belong to later characters.  */
+          size_t width = no_break_mb_chars ? mb_len (c) : 1;
+
+          /* Step onto the character's first position.  */
+          next_item (&char_idx);
+
+          if (rp_intersect (char_idx, char_idx + width - 1))
+            {
+              if (output_delimiter_specified)
+                {
+                  if (print_delimiter && is_range_start_index (char_idx))
+                    {
+                      fwrite (output_delimiter_string, sizeof (char),
+                              output_delimiter_length, stdout);
+                    }
+                  print_delimiter = true;
+                }
+              mb_putc (c, stdout);
+            }
+
+          /* Step over the character's remaining positions, if any.  */
+          next_item_n (&char_idx, width - 1);
+        }
+    }
+}
static void
cut_fields (FILE *stream)
{
- int c;
+
+ /* This buffer is used to support the semantics of the -s option
+ (or lack of same) when the specified field list includes (does
+ not include) the first field. In both of those cases, the entire
+ first field must be read into this buffer to determine whether it
+ is followed by a delimiter or a newline before any of it may be
+ output. Otherwise, cut_fields can do the job without using this
+ buffer. */
+ mbf_char_t *field_1_buffer = 0;
+ /* The number of mbf_char_t elements allocated for FIELD_1_BUFFER. */
+ size_t field_1_bufsize;
+
+
+ mbf_char_t c, d;
+ mb_file_t mbf;
size_t field_idx = 1;
bool found_any_selected_field = false;
bool buffer_first_field;
current_rp = frp;
- c = getc (stream);
- if (c == EOF)
+ mbf_init (mbf, stream);
+ mbf_getc (c, mbf);
+ if (mb_iseof (c))
return;
- ungetc (c, stream);
- c = 0;
+ mbf_ungetc (c, mbf);
+ mb_setascii (&c, 0);
+ mb_copy (&d, &delim);
/* To support the semantics of the -s flag, we may have to buffer
all of the first field to determine whether it is 'delimited.'
@@ -536,10 +744,14 @@ cut_fields (FILE *stream)
if (field_idx == 1 && buffer_first_field)
{
ssize_t len;
- size_t n_bytes;
+ size_t n_chars;
+ mbf_char_t nl;
+ mb_setascii (&nl, line_delim);
+
+ len = mb_getndelim2 (&field_1_buffer, &field_1_bufsize,
+ GETNLINE_NO_LIMIT, d, nl, &mbf);
+
- len = getndelim2 (&field_1_buffer, &field_1_bufsize, 0,
- GETNLINE_NO_LIMIT, delim, line_delim, stream);
if (len < 0)
{
free (field_1_buffer);
@@ -549,15 +761,15 @@ cut_fields (FILE *stream)
xalloc_die ();
}
- n_bytes = len;
- assert (n_bytes != 0);
+ n_chars = len;
+ //assert (n_chars != 0);
- c = 0;
+ mb_setascii (&c, 0);
/* If the first field extends to the end of line (it is not
delimited) and we are printing all non-delimited lines,
print this one. */
- if (to_uchar (field_1_buffer[n_bytes - 1]) != delim)
+ if (!mb_equal (field_1_buffer[n_chars - 1], d))
{
if (suppress_non_delimited)
{
@@ -565,26 +777,30 @@ cut_fields (FILE *stream)
}
else
{
- fwrite (field_1_buffer, sizeof (char), n_bytes, stdout);
+ for (int i = 0; i < n_chars; ++i)
+ mb_putc (field_1_buffer[i], stdout);
+
/* Make sure the output line is newline terminated. */
- if (field_1_buffer[n_bytes - 1] != line_delim)
+ if (!mb_iseq (field_1_buffer[n_chars - 1], line_delim))
putchar (line_delim);
- c = line_delim;
+ mb_setascii (&c, line_delim);
}
continue;
}
if (print_kth (1))
{
/* Print the field, but not the trailing delimiter. */
- fwrite (field_1_buffer, sizeof (char), n_bytes - 1, stdout);
+ for (int i = 0; i < n_chars - 1; ++i)
+ mb_putc (field_1_buffer[i], stdout);
/* With -d$'\n' don't treat the last '\n' as a delimiter. */
- if (delim == line_delim)
+ if (mb_iseq (d, line_delim))
{
- int last_c = getc (stream);
- if (last_c != EOF)
+ mbf_char_t last_c;
+ mbf_getc (last_c, mbf);
+ if (!mb_iseof (last_c))
{
- ungetc (last_c, stream);
+ mbf_ungetc (last_c, mbf);
found_any_selected_field = true;
}
}
@@ -594,7 +810,8 @@ cut_fields (FILE *stream)
next_item (&field_idx);
}
- int prev_c = c;
+ mbf_char_t prev_c;
+ mb_copy (&prev_c, &c);
if (print_kth (field_idx))
{
@@ -605,42 +822,46 @@ cut_fields (FILE *stream)
}
found_any_selected_field = true;
- while ((c = getc (stream)) != delim && c != line_delim && c != EOF)
+ mbf_getc (c, mbf);
+ while (!mb_equal (c, d) && !mb_iseq (c, line_delim) && !mb_iseof (c))
{
- putchar (c);
- prev_c = c;
+ mb_putc (c, stdout);
+ mb_copy (&prev_c, &c);
+ mbf_getc (c, mbf);
}
}
else
{
- while ((c = getc (stream)) != delim && c != line_delim && c != EOF)
+ mbf_getc (c, mbf);
+ while (!mb_equal (c, d) && !mb_iseq (c, line_delim) && !mb_iseof (c))
{
- prev_c = c;
+ mb_copy (&prev_c, &c);
+ mbf_getc (c, mbf);
}
}
/* With -d$'\n' don't treat the last '\n' as a delimiter. */
- if (delim == line_delim && c == delim)
+ if (mb_iseq (d, line_delim) && mb_equal (c, d))
{
- int last_c = getc (stream);
- if (last_c != EOF)
- ungetc (last_c, stream);
+ mbf_char_t last_c;
+ mbf_getc (last_c, mbf);
+ if (!mb_iseof (last_c))
+ mbf_ungetc (last_c, mbf);
else
- c = last_c;
+ mb_copy (&c, &last_c);
}
- if (c == delim)
+ if (mb_equal (c, d))
next_item (&field_idx);
- else if (c == line_delim || c == EOF)
+ else if (mb_iseq (c, line_delim) || mb_iseof (c))
{
if (found_any_selected_field
|| !(suppress_non_delimited && field_idx == 1))
{
- if (c == line_delim || prev_c != line_delim
- || delim == line_delim)
+ if (mb_iseq (c, line_delim) || !mb_iseq (prev_c, line_delim) || mb_iseq (d, line_delim))
putchar (line_delim);
}
- if (c == EOF)
+ if (mb_iseof (c))
break;
field_idx = 1;
current_rp = frp;
@@ -652,7 +874,14 @@ static void
cut_stream (FILE *stream)
{
if (operating_mode == byte_mode)
- cut_bytes (stream);
+ {
+ if (no_break_mb_chars)
+ cut_chars (stream);
+ else
+ cut_bytes (stream);
+ }
+ else if (operating_mode == char_mode)
+ cut_chars (stream);
else
cut_fields (stream);
}
@@ -706,6 +935,7 @@ main (int argc, char **argv)
bool ok;
bool delim_specified = false;
char *spec_list_string IF_LINT ( = NULL);
+ mbi_iterator_t iter;
initialize_main (&argc, &argv);
set_program_name (argv[0]);
@@ -719,8 +949,10 @@ main (int argc, char **argv)
/* By default, all non-delimited lines are printed. */
suppress_non_delimited = false;
+ /* Default behaviour for -b, unless -n is also specified. */
+ no_break_mb_chars = false;
- delim = '\0';
+ mb_setascii (&delim, '\0');
have_read_stdin = false;
while ((optc = getopt_long (argc, argv, "b:c:d:f:nsz", longopts, NULL)) != -1)
@@ -728,7 +960,6 @@ main (int argc, char **argv)
switch (optc)
{
case 'b':
- case 'c':
/* Build the byte list. */
if (operating_mode != undefined_mode)
FATAL_ERROR (_("only one type of list may be specified"));
@@ -736,6 +967,14 @@ main (int argc, char **argv)
spec_list_string = optarg;
break;
+ case 'c':
+ /* Build the char list. */
+ if (operating_mode != undefined_mode)
+ FATAL_ERROR (_("only one type of list may be specified"));
+ operating_mode = char_mode;
+ spec_list_string = optarg;
+ break;
+
case 'f':
/* Build the field list. */
if (operating_mode != undefined_mode)
@@ -747,9 +986,17 @@ main (int argc, char **argv)
case 'd':
/* New delimiter. */
/* Interpret -d '' to mean 'use the NUL byte as the delimiter.' */
- if (optarg[0] != '\0' && optarg[1] != '\0')
+ mbi_init (iter, optarg, strlen (optarg));
+ if (!mbi_avail (iter))
+ mb_setascii (&delim, '\0');
+ else
+ {
+ mb_copy (&delim, &mbi_cur (iter));
+
+ mbi_advance (iter);
+ if (mbi_avail (iter))
FATAL_ERROR (_("the delimiter must be a single character"));
+ }
- delim = optarg[0];
delim_specified = true;
break;
@@ -763,6 +1008,7 @@ main (int argc, char **argv)
break;
case 'n':
+ no_break_mb_chars = true;
break;
case 's':
@@ -802,15 +1048,12 @@ main (int argc, char **argv)
| (complement ? SETFLD_COMPLEMENT : 0) );
if (!delim_specified)
- delim = '\t';
+ mb_setascii (&delim, '\t');
if (output_delimiter_string == NULL)
{
- static char dummy[2];
- dummy[0] = delim;
- dummy[1] = '\0';
- output_delimiter_string = dummy;
- output_delimiter_length = 1;
+ output_delimiter_string = mb_ptr (delim);
+ output_delimiter_length = mb_len (delim);
}
if (optind == argc)