Fix an unintended upgrade to UTF-8 in the middle of a transliteration

2019-11-12 16:57:00 +01:00 · 2019-11-12 16:57:00 +01:00 · eadda09063
commit eadda09063
parent c33e239bcc
3 changed files with 136 additions and 0 deletions
--- a/perl-5.31.5-toke.c-Fix-bug-tr-upgrading-to-UTF-8-in-middle.patch
+++ b/perl-5.31.5-toke.c-Fix-bug-tr-upgrading-to-UTF-8-in-middle.patch
@ -0,0 +1,78 @@
+From 0c311b7c345769239f38d0139ea7738feec5ca4d Mon Sep 17 00:00:00 2001
+From: Karl Williamson <khw@cpan.org>
+Date: Sat, 2 Nov 2019 13:59:38 -0600
+Subject: [PATCH] toke.c: Fix bug tr/// upgrading to UTF-8 in middle
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Consider tr/\x{ff}-\x{100}/AB/.
+
+While parsing a range, the code keeps an offset from the beginning of the
+output to the beginning of the second number in the range.  This is purely
+for speed, so that it doesn't have to re-find the beginning of that value
+when it already knows it.
+
+But the example above shows the folly of this shortcut.  The second
+number in the range causes the output to be upgraded to UTF-8, which in
+general invalidates that offset.  Change the code to re-find the beginning.
+
+Signed-off-by: Petr Písař <ppisar@redhat.com>
+---
+ t/op/tr.t | 12 +++++++++++-
+ toke.c    |  4 +++-
+ 2 files changed, 14 insertions(+), 2 deletions(-)
+
+diff --git a/t/op/tr.t b/t/op/tr.t
+index 47d603d4fd..25125c5bc7 100644
+--- a/t/op/tr.t
+++ b/t/op/tr.t
+@@ -13,7 +13,7 @@ BEGIN {
+ 
+ use utf8;
+ 
+-plan tests => 301;
+plan tests => 304;
+ 
+ # Test this first before we extend the stack with other operations.
+ # This caused an asan failure due to a bad write past the end of the stack.
+@@ -1145,4 +1145,14 @@ for ("", nullrocow) {
+                     'RT #133880 illegal \N{}');
+ }
+ 
+{
+    my $c = "\xff";
+    my $d = "\x{104}";
+    eval '$c =~ tr/\x{ff}-\x{104}/\x{100}-\x{105}/';
+    is($@, "", 'tr/\x{ff}-\x{104}/\x{100}-\x{105}/ compiled');
+    is($c, "\x{100}", 'ff -> 100');
+    eval '$d =~ tr/\x{ff}-\x{104}/\x{100}-\x{105}/';
+    is($d, "\x{105}", '104 -> 105');
+}
+
+ 1;
+diff --git a/toke.c b/toke.c
+index 2995737af2..28f305c62c 100644
+--- a/toke.c
+++ b/toke.c
+@@ -3044,7 +3044,7 @@ S_scan_const(pTHX_ char *start)
+                  * 'offset_to_max' is the offset in 'sv' at which the character
+                  *      (the range's maximum end point) before 'd'  begins.
+                  */
+-                char * max_ptr = SvPVX(sv) + offset_to_max;
+                char * max_ptr;
+                 char * min_ptr;
+                 IV range_min;
+ 		IV range_max;	/* last character in range */
+@@ -3056,6 +3056,8 @@ S_scan_const(pTHX_ char *start)
+                 IV real_range_max = 0;
+ #endif
+                 /* Get the code point values of the range ends. */
+                max_ptr = (d_is_utf8) ? (char *) utf8_hop( (U8*) d, -1) : d - 1;
+                offset_to_max = max_ptr - SvPVX_const(sv);
+                 if (d_is_utf8) {
+                     /* We know the utf8 is valid, because we just constructed
+                      * it ourselves in previous loop iterations */
+-- 
+2.21.0
+
--- a/perl-5.31.5-toke.c-comment-changes.patch
+++ b/perl-5.31.5-toke.c-comment-changes.patch
@ -0,0 +1,48 @@
+From d7f7b0e39a10a6e3e0bd81d15473ee522a064016 Mon Sep 17 00:00:00 2001
+From: Karl Williamson <khw@cpan.org>
+Date: Mon, 4 Nov 2019 21:55:53 -0700
+Subject: [PATCH] toke.c: comment changes
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+These should have been included in
+0c311b7c345769239f38d0139ea7738feec5ca4d
+
+Signed-off-by: Petr Písař <ppisar@redhat.com>
+---
+ toke.c | 11 ++---------
+ 1 file changed, 2 insertions(+), 9 deletions(-)
+
+diff --git a/toke.c b/toke.c
+index 3f376640ef..9c1e77f9db 100644
+--- a/toke.c
+++ b/toke.c
+@@ -3032,13 +3032,8 @@ S_scan_const(pTHX_ char *start)
+                     s++;    /* Skip past the hyphen */
+ 
+                     /* d now points to where the end-range character will be
+-                     * placed.  Save it so won't have to go finding it later,
+-                     * and drop down to get that character.  (Actually we
+-                     * instead save the offset, to handle the case where a
+-                     * realloc in the meantime could change the actual
+-                     * pointer).  We'll finish processing the range the next
+-                     * time through the loop */
+-                    offset_to_max = d - SvPVX_const(sv);
+                     * placed.  Drop down to get that character.  We'll finish
+                     * processing the range the next time through the loop */
+ 
+                     if (s_is_utf8 && UTF8_IS_ABOVE_LATIN1(*s)) {
+                         has_above_latin1 = TRUE;
+@@ -3055,8 +3050,6 @@ S_scan_const(pTHX_ char *start)
+                  *      are the range start and range end, in order.
+                  * 'd'  points to just beyond the range end in the 'sv' string,
+                  *      where we would next place something
+-                 * 'offset_to_max' is the offset in 'sv' at which the character
+-                 *      (the range's maximum end point) before 'd'  begins.
+                  */
+                 char * max_ptr;
+                 char * min_ptr;
+-- 
+2.21.0
+
--- a/perl.spec
+++ b/perl.spec
@ -276,6 +276,11 @@ Patch66:        perl-5.31.5-Be-clearer-about-taint-s-effect-on-INC.patch
 # in upstream after 5.31.5
 Patch67:        perl-5.31.5-Tie-StdHandle-BINMODE-handle-layer-argument.patch

+# Fix an unintended upgrade to UTF-8 in the middle of a transliteration,
+# in upstream after 5.31.5
+Patch68:        perl-5.31.5-toke.c-Fix-bug-tr-upgrading-to-UTF-8-in-middle.patch
+Patch69:        perl-5.31.5-toke.c-comment-changes.patch
+
 # Link XS modules to libperl.so with EU::CBuilder on Linux, bug #960048
 Patch200:       perl-5.16.3-Link-XS-modules-to-libperl.so-with-EU-CBuilder-on-Li.patch

@ -2864,6 +2869,8 @@ rm -rf .git # Perl tests examine a git repository
 %patch65 -p1
 %patch66 -p1
 %patch67 -p1
+%patch68 -p1
+%patch69 -p1
 %patch200 -p1
 %patch201 -p1

@ -2926,6 +2933,8 @@ perl -x patchlevel.h \
    'Fedora Patch65: Fix taint mode documentation regarding @INC' \
    'Fedora Patch66: Fix taint mode documentation regarding @INC' \
    'Fedora Patch67: Fix handling a layer argument in Tie::StdHandle::BINMODE() (RT#132475)' \
+    'Fedora Patch68: Fix an unintended upgrade to UTF-8 in the middle of a transliteration' \
+    'Fedora Patch69: Fix an unintended upgrade to UTF-8 in the middle of a transliteration' \
    'Fedora Patch200: Link XS modules to libperl.so with EU::CBuilder on Linux' \
    'Fedora Patch201: Link XS modules to libperl.so with EU::MM on Linux' \
    %{nil}
@ -5176,6 +5185,7 @@ popd
 - Fix handling undefined array members in Dumpvalue (RT#134441)
 - Fix taint mode documentation regarding @INC
 - Fix handling a layer argument in Tie::StdHandle::BINMODE() (RT#132475)
+- Fix an unintended upgrade to UTF-8 in the middle of a transliteration

 * Mon Nov 11 2019 Jitka Plesnikova <jplesnik@redhat.com> - 4:5.30.1-447
 - 5.30.1 bump (see <https://metacpan.org/pod/release/SHAY/perl-5.30.1/pod/perldelta.pod>