From a0ea65ae2690d1e397dffe0282791fb9787fab47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=20P=C3=ADsa=C5=99?= Date: Wed, 9 Aug 2017 14:50:40 +0200 Subject: [PATCH] Fix splitting non-ASCII strings if unicode_strings feature is enabled --- ...-130907-Fix-the-Unicode-Bug-in-split.patch | 226 ++++++++++++++++++ perl.spec | 7 + 2 files changed, 233 insertions(+) create mode 100644 perl-5.27.1-RT-130907-Fix-the-Unicode-Bug-in-split.patch diff --git a/perl-5.27.1-RT-130907-Fix-the-Unicode-Bug-in-split.patch b/perl-5.27.1-RT-130907-Fix-the-Unicode-Bug-in-split.patch new file mode 100644 index 0000000..52a797a --- /dev/null +++ b/perl-5.27.1-RT-130907-Fix-the-Unicode-Bug-in-split.patch @@ -0,0 +1,226 @@ +From 5aca16e032861ea3dfcc96ad417ea87e2b1552e5 Mon Sep 17 00:00:00 2001 +From: Aaron Crane +Date: Sat, 4 Mar 2017 12:50:58 +0000 +Subject: [PATCH] RT #130907: Fix the Unicode Bug in split " " +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Ported to 5.26.0: + +commit 20ae58f7a9bbf84d043d6e90f5988b6e3ca4ee3d +Author: Aaron Crane +Date: Sat Mar 4 12:50:58 2017 +0000 + + RT #130907: Fix the Unicode Bug in split " " + +Signed-off-by: Petr Písař +--- + lib/feature.pm | 5 +++-- + pod/perldelta.pod | 9 +++++++++ + pod/perlfunc.pod | 8 ++++++++ + pod/perlunicode.pod | 11 +++++++++++ + pod/perluniintro.pod | 5 +++-- + pp.c | 13 +++++++++++++ + regen/feature.pl | 5 +++-- + t/op/split.t | 20 +++++++++++++++++++- + 8 files changed, 69 insertions(+), 7 deletions(-) + +diff --git a/lib/feature.pm b/lib/feature.pm +index ed13273..93e020b 100644 +--- a/lib/feature.pm ++++ b/lib/feature.pm +@@ -175,8 +175,9 @@ C subpragma is B recommended. + + This feature is available starting with Perl 5.12; was almost fully + implemented in Perl 5.14; and extended in Perl 5.16 to cover C; +-and extended further in Perl 5.26 to cover L. ++was extended further in Perl 5.26 to cover L; and was extended again in Perl 5.28 to ++cover L. 
+ + =head2 The 'unicode_eval' and 'evalbytes' features + +diff --git a/pod/perldelta.pod b/pod/perldelta.pod +index 06dcd1d..d31335f 100644 +--- a/pod/perldelta.pod ++++ b/pod/perldelta.pod +@@ -3206,6 +3206,15 @@ calls. + Parsing bad POSIX charclasses no longer leaks memory. + L<[perl #128313]|https://rt.perl.org/Public/Bug/Display.html?id=128313> + ++=item * ++ ++C now correctly handles the argument being split when in the ++scope of the L<< C|feature/"The 'unicode_strings' feature" ++>> feature. Previously, when a string using the single-byte internal ++representation contained characters that are whitespace by Unicode rules but ++not by ASCII rules, it treated those characters as part of fields rather ++than as field separators. [perl #130907] ++ + =back + + =head1 Known Problems +diff --git a/pod/perlfunc.pod b/pod/perlfunc.pod +index b8dca6e..9abadf4 100644 +--- a/pod/perlfunc.pod ++++ b/pod/perlfunc.pod +@@ -7616,6 +7616,14 @@ special case was restricted to the use of a plain S> as the + pattern argument to split; in Perl 5.18.0 and later this special case is + triggered by any expression which evaluates to the simple string S>. + ++As of Perl 5.28, this special-cased whitespace splitting works as expected in ++the scope of L<< S>|feature/The ++'unicode_strings' feature >>. In previous versions, and outside the scope of ++that feature, it exhibits L: characters that are ++whitespace according to Unicode rules but not according to ASCII rules can be ++treated as part of fields rather than as field separators, depending on the ++string's internal encoding. ++ + If omitted, PATTERN defaults to a single space, S>, triggering + the previously described I emulation. 
+ +diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod +index 9c13c35..2e84e95 100644 +--- a/pod/perlunicode.pod ++++ b/pod/perlunicode.pod +@@ -1835,6 +1835,17 @@ outside its scope, it could produce strings whose length in characters + exceeded that of the right-hand side, where the right-hand side took up more + bytes than the correct range endpoint. + ++=item * ++ ++In L<< C's special-case whitespace splitting|perlfunc/split >>. ++ ++Starting in Perl 5.28.0, the C function with a pattern specified as ++a string containing a single space handles whitespace characters consistently ++within the scope of C. Prior to that, or outside its scope, ++characters that are whitespace according to Unicode rules but not according to ++ASCII rules were treated as field contents rather than field separators when ++they appear in byte-encoded strings. ++ + =back + + You can see from the above that the effect of C +diff --git a/pod/perluniintro.pod b/pod/perluniintro.pod +index d35de34..595ec46 100644 +--- a/pod/perluniintro.pod ++++ b/pod/perluniintro.pod +@@ -151,11 +151,12 @@ serious Unicode work. The maintenance release 5.6.1 fixed many of the + problems of the initial Unicode implementation, but for example + regular expressions still do not work with Unicode in 5.6.1. + Perl v5.14.0 is the first release where Unicode support is +-(almost) seamlessly integrable without some gotchas. (There are two ++(almost) seamlessly integrable without some gotchas. (There are a few + exceptions. Firstly, some differences in L + were fixed starting in Perl 5.16.0. Secondly, some differences in + L were fixed starting in +-Perl 5.26.0.) ++Perl 5.26.0. Thirdly, some differences in L were fixed ++starting in Perl 5.28.0.) 
+ + To enable this + seamless support, you should C (which is +diff --git a/pp.c b/pp.c +index cc4cb59..d9dd005 100644 +--- a/pp.c ++++ b/pp.c +@@ -5740,6 +5740,7 @@ PP(pp_split) + STRLEN len; + const char *s = SvPV_const(sv, len); + const bool do_utf8 = DO_UTF8(sv); ++ const bool in_uni_8_bit = IN_UNI_8_BIT; + const char *strend = s + len; + PMOP *pm = cPMOPx(PL_op); + REGEXP *rx; +@@ -5826,6 +5827,10 @@ PP(pp_split) + while (s < strend && isSPACE_LC(*s)) + s++; + } ++ else if (in_uni_8_bit) { ++ while (s < strend && isSPACE_L1(*s)) ++ s++; ++ } + else { + while (s < strend && isSPACE(*s)) + s++; +@@ -5857,6 +5862,10 @@ PP(pp_split) + { + while (m < strend && !isSPACE_LC(*m)) + ++m; ++ } ++ else if (in_uni_8_bit) { ++ while (m < strend && !isSPACE_L1(*m)) ++ ++m; + } else { + while (m < strend && !isSPACE(*m)) + ++m; +@@ -5891,6 +5900,10 @@ PP(pp_split) + { + while (s < strend && isSPACE_LC(*s)) + ++s; ++ } ++ else if (in_uni_8_bit) { ++ while (s < strend && isSPACE_L1(*s)) ++ ++s; + } else { + while (s < strend && isSPACE(*s)) + ++s; +diff --git a/regen/feature.pl b/regen/feature.pl +index 579120e..8a4ce63 100755 +--- a/regen/feature.pl ++++ b/regen/feature.pl +@@ -485,8 +485,9 @@ C subpragma is B recommended. + + This feature is available starting with Perl 5.12; was almost fully + implemented in Perl 5.14; and extended in Perl 5.16 to cover C; +-and extended further in Perl 5.26 to cover L. ++was extended further in Perl 5.26 to cover L; and was extended again in Perl 5.28 to ++cover L. + + =head2 The 'unicode_eval' and 'evalbytes' features + +diff --git a/t/op/split.t b/t/op/split.t +index d60bcaf..038c5d7 100644 +--- a/t/op/split.t ++++ b/t/op/split.t +@@ -7,7 +7,7 @@ BEGIN { + set_up_inc('../lib'); + } + +-plan tests => 163; ++plan tests => 172; + + $FS = ':'; + +@@ -480,6 +480,24 @@ is($cnt, scalar(@ary)); + qq{split(\$cond ? 
qr/ / : " ", "$exp") behaves as expected over repeated similar patterns}; + } + ++SKIP: { ++ # RT #130907: unicode_strings feature doesn't work with split ' ' ++ ++ my ($sp) = grep /\s/u, map chr, reverse 128 .. 255 # prefer \xA0 over \x85 ++ or skip 'no unicode whitespace found in high-8-bit range', 9; ++ ++ for (["$sp$sp. /", "leading unicode whitespace"], ++ [".$sp$sp/", "unicode whitespace separator"], ++ [". /$sp$sp", "trailing unicode whitespace"]) { ++ my ($str, $desc) = @$_; ++ use feature "unicode_strings"; ++ my @got = split " ", $str; ++ is @got, 2, "whitespace split: $desc: field count"; ++ is $got[0], '.', "whitespace split: $desc: field 0"; ++ is $got[1], '/', "whitespace split: $desc: field 1"; ++ } ++} ++ + { + # 'RT #116086: split "\x20" does not work as documented'; + my @results; +-- +2.9.4 + diff --git a/perl.spec b/perl.spec index 5d3cc9a..868fda7 100644 --- a/perl.spec +++ b/perl.spec @@ -212,6 +212,10 @@ Patch52: perl-5.26.0-perl-131588-be-a-little-more-careful-in-arybase-_tie # in upstream adter 5.27.1 Patch53: perl-5.27.1-perl-131597-ensure-the-GV-slot-is-filled-for-our-foo.patch +# Fix splitting non-ASCII strings if unicode_strings feature is enabled, +# RT#130907 in upstream after 5.27.1 +Patch54: perl-5.27.1-RT-130907-Fix-the-Unicode-Bug-in-split.patch + # Link XS modules to libperl.so with EU::CBuilder on Linux, bug #960048 Patch200: perl-5.16.3-Link-XS-modules-to-libperl.so-with-EU-CBuilder-on-Li.patch @@ -2795,6 +2799,7 @@ Perl extension for Version Objects %patch51 -p1 %patch52 -p1 %patch53 -p1 +%patch54 -p1 %patch200 -p1 %patch201 -p1 @@ -2833,6 +2838,7 @@ perl -x patchlevel.h \ 'Fedora Patch51: Fix error message for "our sub foo::bar" (RT#131679)' \ 'Fedora Patch52: Fix executing arybase::_tie_it() in Safe compartement (RT#131588)' \ 'Fedora Patch53: Fix handling attribute specification on our variables (RT#131597)' \ + 'Fedora Patch54: Fix splitting non-ASCII strings if unicode_strings feature is enabled (RT#130907)' \ 'Fedora 
Patch200: Link XS modules to libperl.so with EU::CBuilder on Linux' \ 'Fedora Patch201: Link XS modules to libperl.so with EU::MM on Linux' \ %{nil} @@ -5127,6 +5133,7 @@ popd - Fix error message for "our sub foo::bar" (RT#131679) - Fix executing arybase::_tie_it() in Safe compartement (RT#131588) - Fix handling attribute specification on our variables (RT#131597) +- Fix splitting non-ASCII strings if unicode_strings feature is enabled (RT#130907) * Sat Jul 29 2017 Igor Gnatenko - 4:5.26.0-397 - Enable separate debuginfo back