324 lines
9.1 KiB
Diff
324 lines
9.1 KiB
Diff
commit fa1b0d5e9f6e0353e16339430770a7a8824c0468
|
|
Author: Florian Weimer <fweimer@redhat.com>
|
|
Date: Fri Sep 20 13:10:54 2024 +0200
|
|
|
|
iconv: Input buffering for the iconv program (bug 6050)
|
|
|
|
Do not read the entire input file into memory.
|
|
|
|
Reviewed-by: DJ Delorie <dj@redhat.com>
|
|
|
|
diff --git a/iconv/iconv_prog.c b/iconv/iconv_prog.c
|
|
index dd4bc3a59a20799a..a2f1d34e4579f80f 100644
|
|
--- a/iconv/iconv_prog.c
|
|
+++ b/iconv/iconv_prog.c
|
|
@@ -118,8 +118,9 @@ static size_t output_buffer_size = 1024 * 1024;
|
|
|
|
/* Prototypes for the functions doing the actual work. */
|
|
static void prepare_output_file (char **argv);
|
|
-static void close_output_file (int status);
|
|
-static int process_block (iconv_t cd, char *addr, size_t len);
|
|
+static void close_output_file (__gconv_t cd, int status);
|
|
+static int process_block (iconv_t cd, char **addr, size_t *len,
|
|
+ off64_t file_offset, bool *incomplete);
|
|
static int process_fd (iconv_t cd, int fd);
|
|
static int process_file (iconv_t cd, FILE *input);
|
|
static void print_known_names (void);
|
|
@@ -311,7 +312,7 @@ conversions from `%s' and to `%s' are not supported"),
|
|
status = EXIT_FAILURE;
|
|
|
|
/* Close the output file now. */
|
|
- close_output_file (status);
|
|
+ close_output_file (cd, status);
|
|
}
|
|
|
|
return status;
|
|
@@ -599,7 +600,7 @@ flush_output (void)
|
|
}
|
|
|
|
static void
|
|
-close_output_file (int status)
|
|
+close_output_file (__gconv_t cd, int status)
|
|
{
|
|
/* Do not perform a flush if a temporary file or the in-memory
|
|
buffer is in use and there was an error. It would clobber the
|
|
@@ -608,10 +609,28 @@ close_output_file (int status)
|
|
(output_using_temporary_file || output_fd < 0))
|
|
return;
|
|
|
|
- /* The current_input_file_index variable is now larger than
|
|
- last_overlapping_file_index, so the flush_output call switches
|
|
+ /* All the input text is processed. For state-dependent character
|
|
+ sets we have to flush the state now.
|
|
+
|
|
+ The current_input_file_index variable is now larger than
|
|
+ last_overlapping_file_index, so the flush_output calls switch
|
|
away from the temporary file. */
|
|
+ size_t n = iconv (cd, NULL, NULL,
|
|
+ &output_buffer_current, &output_buffer_remaining);
|
|
+ if (n == (size_t) -1 && errno == E2BIG)
|
|
+ {
|
|
+ /* Try again if the state flush exceeded the buffer space. */
|
|
+ flush_output ();
|
|
+ n = iconv (cd, NULL, NULL,
|
|
+ &output_buffer_current, &output_buffer_remaining);
|
|
+ }
|
|
+ int saved_errno = errno;
|
|
flush_output ();
|
|
+ if (n == (size_t) -1 && !omit_invalid)
|
|
+ {
|
|
+ errno = saved_errno;
|
|
+ output_error ();
|
|
+ }
|
|
|
|
if (output_fd == STDOUT_FILENO)
|
|
{
|
|
@@ -625,51 +644,35 @@ close_output_file (int status)
|
|
output_error ();
|
|
}
|
|
|
|
+/* CD is the iconv handle. Input processing starts at *ADDR, and
|
|
+ consumes upto *LEN bytes. *ADDR and *LEN are updated. FILE_OFFSET
|
|
+ is the file offset of the data initially at ADDR. *INCOMPLETE is
|
|
+ set to true if conversion stops due to an incomplete input
|
|
+ sequence. */
|
|
static int
|
|
-process_block (iconv_t cd, char *addr, size_t len)
|
|
+process_block (iconv_t cd, char **addr, size_t *len, off64_t file_offset,
|
|
+ bool *incomplete)
|
|
{
|
|
- const char *start = addr;
|
|
+ const char *start = *addr;
|
|
size_t n;
|
|
int ret = 0;
|
|
|
|
- while (len > 0)
|
|
+ while (*len > 0)
|
|
{
|
|
- n = iconv (cd, &addr, &len,
|
|
+ n = iconv (cd, addr, len,
|
|
&output_buffer_current, &output_buffer_remaining);
|
|
|
|
if (n == (size_t) -1 && omit_invalid && errno == EILSEQ)
|
|
{
|
|
ret = 1;
|
|
- if (len == 0)
|
|
+ if (*len == 0)
|
|
n = 0;
|
|
else
|
|
errno = E2BIG;
|
|
}
|
|
|
|
if (n != (size_t) -1)
|
|
- {
|
|
- /* All the input test is processed. For state-dependent
|
|
- character sets we have to flush the state now. */
|
|
- n = iconv (cd, NULL, NULL,
|
|
- &output_buffer_current, &output_buffer_remaining);
|
|
- if (n == (size_t) -1 && errno == E2BIG)
|
|
- {
|
|
- /* Try again if the state flush exceeded the buffer space. */
|
|
- flush_output ();
|
|
- n = iconv (cd, NULL, NULL,
|
|
- &output_buffer_current, &output_buffer_remaining);
|
|
- }
|
|
- bool errno_is_EILSEQ = errno == EILSEQ;
|
|
-
|
|
- if (n != (size_t) -1)
|
|
- break;
|
|
-
|
|
- if (omit_invalid && errno_is_EILSEQ)
|
|
- {
|
|
- ret = 1;
|
|
- break;
|
|
- }
|
|
- }
|
|
+ break;
|
|
|
|
if (errno == E2BIG)
|
|
flush_output ();
|
|
@@ -680,13 +683,12 @@ process_block (iconv_t cd, char *addr, size_t len)
|
|
{
|
|
case EILSEQ:
|
|
if (! omit_invalid)
|
|
- error (0, 0, _("illegal input sequence at position %ld"),
|
|
- (long int) (addr - start));
|
|
+ error (0, 0, _("illegal input sequence at position %lld"),
|
|
+ (long long int) (file_offset + (*addr - start)));
|
|
break;
|
|
case EINVAL:
|
|
- error (0, 0, _("\
|
|
-incomplete character or shift sequence at end of buffer"));
|
|
- break;
|
|
+ *incomplete = true;
|
|
+ return ret;
|
|
case EBADF:
|
|
error (0, 0, _("internal error (illegal descriptor)"));
|
|
break;
|
|
@@ -706,79 +708,49 @@ incomplete character or shift sequence at end of buffer"));
|
|
static int
|
|
process_fd (iconv_t cd, int fd)
|
|
{
|
|
- /* we have a problem with reading from a descriptor since we must not
|
|
- provide the iconv() function an incomplete character or shift
|
|
- sequence at the end of the buffer. Since we have to deal with
|
|
- arbitrary encodings we must read the whole text in a buffer and
|
|
- process it in one step. */
|
|
- static char *inbuf = NULL;
|
|
- static size_t maxlen = 0;
|
|
- char *inptr = inbuf;
|
|
- size_t actlen = 0;
|
|
-
|
|
- while (actlen < maxlen)
|
|
+ char inbuf[BUFSIZ];
|
|
+ char *inbuf_end = inbuf + sizeof (inbuf);
|
|
+ size_t inbuf_used = 0;
|
|
+ off64_t file_offset = 0;
|
|
+ int status = 0;
|
|
+ bool incomplete = false;
|
|
+
|
|
+ while (true)
|
|
{
|
|
- ssize_t n = read (fd, inptr, maxlen - actlen);
|
|
-
|
|
- if (n == 0)
|
|
- /* No more text to read. */
|
|
- break;
|
|
-
|
|
- if (n == -1)
|
|
+ char *p = inbuf + inbuf_used;
|
|
+ ssize_t read_ret = read (fd, p, inbuf_end - p);
|
|
+ if (read_ret == 0)
|
|
+ {
|
|
+ /* On EOF, check if the previous iconv invocation saw an
|
|
+ incomplete sequence. */
|
|
+ if (incomplete)
|
|
+ {
|
|
+ error (0, 0, _("\
|
|
+incomplete character or shift sequence at end of buffer"));
|
|
+ return 1;
|
|
+ }
|
|
+ return 0;
|
|
+ }
|
|
+ if (read_ret < 0)
|
|
{
|
|
- /* Error while reading. */
|
|
error (0, errno, _("error while reading the input"));
|
|
return -1;
|
|
}
|
|
-
|
|
- inptr += n;
|
|
- actlen += n;
|
|
+ inbuf_used += read_ret;
|
|
+ incomplete = false;
|
|
+ p = inbuf;
|
|
+ int ret = process_block (cd, &p, &inbuf_used, file_offset, &incomplete);
|
|
+ if (ret != 0)
|
|
+ {
|
|
+ status = ret;
|
|
+ if (ret < 0)
|
|
+ break;
|
|
+ }
|
|
+ /* The next loop iteration consumes the leftover bytes. */
|
|
+ memmove (inbuf, p, inbuf_used);
|
|
+ file_offset += read_ret - inbuf_used;
|
|
}
|
|
-
|
|
- if (actlen == maxlen)
|
|
- while (1)
|
|
- {
|
|
- ssize_t n;
|
|
- char *new_inbuf;
|
|
-
|
|
- /* Increase the buffer. */
|
|
- new_inbuf = (char *) realloc (inbuf, maxlen + 32768);
|
|
- if (new_inbuf == NULL)
|
|
- {
|
|
- error (0, errno, _("unable to allocate buffer for input"));
|
|
- return -1;
|
|
- }
|
|
- inbuf = new_inbuf;
|
|
- maxlen += 32768;
|
|
- inptr = inbuf + actlen;
|
|
-
|
|
- do
|
|
- {
|
|
- n = read (fd, inptr, maxlen - actlen);
|
|
-
|
|
- if (n == 0)
|
|
- /* No more text to read. */
|
|
- break;
|
|
-
|
|
- if (n == -1)
|
|
- {
|
|
- /* Error while reading. */
|
|
- error (0, errno, _("error while reading the input"));
|
|
- return -1;
|
|
- }
|
|
-
|
|
- inptr += n;
|
|
- actlen += n;
|
|
- }
|
|
- while (actlen < maxlen);
|
|
-
|
|
- if (n == 0)
|
|
- /* Break again so we leave both loops. */
|
|
- break;
|
|
- }
|
|
-
|
|
- /* Now we have all the input in the buffer. Process it in one run. */
|
|
- return process_block (cd, inbuf, actlen);
|
|
+ return status;
|
|
}
|
|
|
|
|
|
diff --git a/iconv/tst-iconv_prog-buffer.sh b/iconv/tst-iconv_prog-buffer.sh
|
|
index a9c3729d948b4679..23098ac56a344c48 100644
|
|
--- a/iconv/tst-iconv_prog-buffer.sh
|
|
+++ b/iconv/tst-iconv_prog-buffer.sh
|
|
@@ -50,6 +50,9 @@ echo OUT > "$tmp/out-template"
|
|
: > "$tmp/empty"
|
|
printf '\xff' > "$tmp/0xff"
|
|
|
|
+# Length should be a prime number, to help with buffer alignment testing.
|
|
+printf '\xc3\xa4\xe2\x80\x94\xe2\x80\x94\xc3\xa4\n' > "$tmp/utf8-sequence"
|
|
+
|
|
# Double all files to produce larger buffers.
|
|
for p in "$tmp"/* ; do
|
|
i=0
|
|
@@ -270,6 +273,34 @@ expect_exit 1 run_iconv -o "$tmp/out" "$tmp/abc" - < "$tmp/0xff" "$tmp/def"
|
|
run_iconv -o "$tmp/out" "$tmp/xy" - - "$tmp/zt" < "$tmp/abc"
|
|
expect_files xy abc zt
|
|
|
|
+# NB: Extra iconv args are ignored after this point. Actual
|
|
+# multi-byte conversion does not work with tiny buffers.
|
|
+iconv_args="-f UTF-8 -t ASCII"
|
|
+
|
|
+printf 'x\n\xc3' > "$tmp/incomplete"
|
|
+expect_exit 1 run_iconv -o "$tmp/out" "$tmp/incomplete"
|
|
+check_out <<EOF
|
|
+x
|
|
+EOF
|
|
+
|
|
+# Test buffering behavior if the buffer ends with an incomplete
|
|
+# multi-byte sequence.
|
|
+prefix=""
|
|
+prefix_length=0
|
|
+while test $prefix_length -lt 12; do
|
|
+ echo "info: testing prefix length $prefix_length" 2>&$logfd
|
|
+ printf "%s" "$prefix" > "$tmp/prefix"
|
|
+ cat "$tmp/prefix" "$tmp/utf8-sequence" > "$tmp/tmp"
|
|
+ iconv_args="-f UTF-8 -t UCS-4"
|
|
+ run_iconv -o "$tmp/out1" "$tmp/tmp"
|
|
+ iconv_args="-f UCS-4 -t UTF-8"
|
|
+ run_iconv -o "$tmp/out" "$tmp/out1"
|
|
+ expect_files prefix utf8-sequence
|
|
+
|
|
+ prefix="$prefix@"
|
|
+ prefix_length=$(($prefix_length + 1))
|
|
+done
|
|
+
|
|
if $failure ; then
|
|
exit 1
|
|
fi
|