From a0ea65ae2690d1e397dffe0282791fb9787fab47 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Petr=20P=C3=ADsa=C5=99?= <ppisar@redhat.com>
Date: Wed, 9 Aug 2017 14:50:40 +0200
Subject: [PATCH] Fix splitting non-ASCII strings if unicode_strings feature is
 enabled

---
 ...-130907-Fix-the-Unicode-Bug-in-split.patch | 226 ++++++++++++++++++
 perl.spec                                     |   7 +
 2 files changed, 233 insertions(+)
 create mode 100644 perl-5.27.1-RT-130907-Fix-the-Unicode-Bug-in-split.patch
diff --git a/perl-5.27.1-RT-130907-Fix-the-Unicode-Bug-in-split.patch b/perl-5.27.1-RT-130907-Fix-the-Unicode-Bug-in-split.patch
new file mode 100644
index 0000000..52a797a
--- /dev/null
+++ b/perl-5.27.1-RT-130907-Fix-the-Unicode-Bug-in-split.patch
@@ -0,0 +1,226 @@
+From 5aca16e032861ea3dfcc96ad417ea87e2b1552e5 Mon Sep 17 00:00:00 2001
+From: Aaron Crane <arc@cpan.org>
+Date: Sat, 4 Mar 2017 12:50:58 +0000
+Subject: [PATCH] RT #130907: Fix the Unicode Bug in split " "
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Ported to 5.26.0:
+
+commit 20ae58f7a9bbf84d043d6e90f5988b6e3ca4ee3d
+Author: Aaron Crane <arc@cpan.org>
+Date:   Sat Mar 4 12:50:58 2017 +0000
+
+    RT #130907: Fix the Unicode Bug in split " "
+
+Signed-off-by: Petr Písař <ppisar@redhat.com>
+---
+ lib/feature.pm       |  5 +++--
+ pod/perldelta.pod    |  9 +++++++++
+ pod/perlfunc.pod     |  8 ++++++++
+ pod/perlunicode.pod  | 11 +++++++++++
+ pod/perluniintro.pod |  5 +++--
+ pp.c                 | 13 +++++++++++++
+ regen/feature.pl     |  5 +++--
+ t/op/split.t         | 20 +++++++++++++++++++-
+ 8 files changed, 69 insertions(+), 7 deletions(-)
+
+diff --git a/lib/feature.pm b/lib/feature.pm
+index ed13273..93e020b 100644
+--- a/lib/feature.pm
++++ b/lib/feature.pm
+@@ -175,8 +175,9 @@ C<use feature 'unicode_strings'> subpragma is B<strongly> recommended.
+ 
+ This feature is available starting with Perl 5.12; was almost fully
+ implemented in Perl 5.14; and extended in Perl 5.16 to cover C<quotemeta>;
+-and extended further in Perl 5.26 to cover L<the range
+-operator|perlop/Range Operators>.
++was extended further in Perl 5.26 to cover L<the range
++operator|perlop/Range Operators>; and was extended again in Perl 5.28 to
++cover L<special-cased whitespace splitting|perlfunc/split>.
+ 
+ =head2 The 'unicode_eval' and 'evalbytes' features
+ 
+diff --git a/pod/perldelta.pod b/pod/perldelta.pod
+index 06dcd1d..d31335f 100644
+--- a/pod/perldelta.pod
++++ b/pod/perldelta.pod
+@@ -3206,6 +3206,15 @@ calls.
+ Parsing bad POSIX charclasses no longer leaks memory.
+ L<[perl #128313]|https://rt.perl.org/Public/Bug/Display.html?id=128313>
+ 
++=item *
++
++C<split ' '> now correctly handles the argument being split when in the
++scope of the L<< C<unicode_strings>|feature/"The 'unicode_strings' feature"
++>> feature. Previously, when a string using the single-byte internal
++representation contained characters that are whitespace by Unicode rules but
++not by ASCII rules, it treated those characters as part of fields rather
++than as field separators.  [perl #130907]
++
+ =back
+ 
+ =head1 Known Problems
+diff --git a/pod/perlfunc.pod b/pod/perlfunc.pod
+index b8dca6e..9abadf4 100644
+--- a/pod/perlfunc.pod
++++ b/pod/perlfunc.pod
+@@ -7616,6 +7616,14 @@ special case was restricted to the use of a plain S<C<" ">> as the
+ pattern argument to split; in Perl 5.18.0 and later this special case is
+ triggered by any expression which evaluates to the simple string S<C<" ">>.
+ 
++As of Perl 5.28, this special-cased whitespace splitting works as expected in
++the scope of L<< S<C<"use feature 'unicode_strings">>|feature/The
++'unicode_strings' feature >>. In previous versions, and outside the scope of
++that feature, it exhibits L<perlunicode/The "Unicode Bug">: characters that are
++whitespace according to Unicode rules but not according to ASCII rules can be
++treated as part of fields rather than as field separators, depending on the
++string's internal encoding.
++
+ If omitted, PATTERN defaults to a single space, S<C<" ">>, triggering
+ the previously described I<awk> emulation.
+ 
+diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod
+index 9c13c35..2e84e95 100644
+--- a/pod/perlunicode.pod
++++ b/pod/perlunicode.pod
+@@ -1835,6 +1835,17 @@ outside its scope, it could produce strings whose length in characters
+ exceeded that of the right-hand side, where the right-hand side took up more
+ bytes than the correct range endpoint.
+ 
++=item *
++
++In L<< C<split>'s special-case whitespace splitting|perlfunc/split >>.
++
++Starting in Perl 5.28.0, the C<split> function with a pattern specified as
++a string containing a single space handles whitespace characters consistently
++within the scope of of C<unicode_strings>. Prior to that, or outside its scope,
++characters that are whitespace according to Unicode rules but not according to
++ASCII rules were treated as field contents rather than field separators when
++they appear in byte-encoded strings.
++
+ =back
+ 
+ You can see from the above that the effect of C<unicode_strings>
+diff --git a/pod/perluniintro.pod b/pod/perluniintro.pod
+index d35de34..595ec46 100644
+--- a/pod/perluniintro.pod
++++ b/pod/perluniintro.pod
+@@ -151,11 +151,12 @@ serious Unicode work.  The maintenance release 5.6.1 fixed many of the
+ problems of the initial Unicode implementation, but for example
+ regular expressions still do not work with Unicode in 5.6.1.
+ Perl v5.14.0 is the first release where Unicode support is
+-(almost) seamlessly integrable without some gotchas. (There are two
++(almost) seamlessly integrable without some gotchas. (There are a few
+ exceptions. Firstly, some differences in L<quotemeta|perlfunc/quotemeta>
+ were fixed starting in Perl 5.16.0. Secondly, some differences in
+ L<the range operator|perlop/Range Operators> were fixed starting in
+-Perl 5.26.0.)
++Perl 5.26.0. Thirdly, some differences in L<split|perlfunc/split> were fixed
++started in Perl 5.28.0.)
+ 
+ To enable this
+ seamless support, you should C<use feature 'unicode_strings'> (which is
+diff --git a/pp.c b/pp.c
+index cc4cb59..d9dd005 100644
+--- a/pp.c
++++ b/pp.c
+@@ -5740,6 +5740,7 @@ PP(pp_split)
+     STRLEN len;
+     const char *s = SvPV_const(sv, len);
+     const bool do_utf8 = DO_UTF8(sv);
++    const bool in_uni_8_bit = IN_UNI_8_BIT;
+     const char *strend = s + len;
+     PMOP *pm = cPMOPx(PL_op);
+     REGEXP *rx;
+@@ -5826,6 +5827,10 @@ PP(pp_split)
+ 	    while (s < strend && isSPACE_LC(*s))
+ 		s++;
+ 	}
++        else if (in_uni_8_bit) {
++            while (s < strend && isSPACE_L1(*s))
++                s++;
++        }
+ 	else {
+ 	    while (s < strend && isSPACE(*s))
+ 		s++;
+@@ -5857,6 +5862,10 @@ PP(pp_split)
+             {
+ 	        while (m < strend && !isSPACE_LC(*m))
+ 		    ++m;
++            }
++            else if (in_uni_8_bit) {
++                while (m < strend && !isSPACE_L1(*m))
++                    ++m;
+             } else {
+                 while (m < strend && !isSPACE(*m))
+                     ++m;
+@@ -5891,6 +5900,10 @@ PP(pp_split)
+             {
+ 	        while (s < strend && isSPACE_LC(*s))
+ 		    ++s;
++            }
++            else if (in_uni_8_bit) {
++                while (s < strend && isSPACE_L1(*s))
++                    ++s;
+             } else {
+                 while (s < strend && isSPACE(*s))
+                     ++s;
+diff --git a/regen/feature.pl b/regen/feature.pl
+index 579120e..8a4ce63 100755
+--- a/regen/feature.pl
++++ b/regen/feature.pl
+@@ -485,8 +485,9 @@ C<use feature 'unicode_strings'> subpragma is B<strongly> recommended.
+ 
+ This feature is available starting with Perl 5.12; was almost fully
+ implemented in Perl 5.14; and extended in Perl 5.16 to cover C<quotemeta>;
+-and extended further in Perl 5.26 to cover L<the range
+-operator|perlop/Range Operators>.
++was extended further in Perl 5.26 to cover L<the range
++operator|perlop/Range Operators>; and was extended again in Perl 5.28 to
++cover L<special-cased whitespace splitting|perlfunc/split>.
+ 
+ =head2 The 'unicode_eval' and 'evalbytes' features
+ 
+diff --git a/t/op/split.t b/t/op/split.t
+index d60bcaf..038c5d7 100644
+--- a/t/op/split.t
++++ b/t/op/split.t
+@@ -7,7 +7,7 @@ BEGIN {
+     set_up_inc('../lib');
+ }
+ 
+-plan tests => 163;
++plan tests => 172;
+ 
+ $FS = ':';
+ 
+@@ -480,6 +480,24 @@ is($cnt, scalar(@ary));
+         qq{split(\$cond ? qr/ / : " ", "$exp") behaves as expected over repeated similar patterns};
+ }
+ 
++SKIP: {
++    # RT #130907: unicode_strings feature doesn't work with split ' '
++
++    my ($sp) = grep /\s/u, map chr, reverse 128 .. 255 # prefer \xA0 over \x85
++        or skip 'no unicode whitespace found in high-8-bit range', 9;
++
++    for (["$sp$sp. /", "leading unicode whitespace"],
++         [".$sp$sp/",  "unicode whitespace separator"],
++         [". /$sp$sp", "trailing unicode whitespace"]) {
++        my ($str, $desc) = @$_;
++        use feature "unicode_strings";
++        my @got = split " ", $str;
++        is @got, 2, "whitespace split: $desc: field count";
++        is $got[0], '.', "whitespace split: $desc: field 0";
++        is $got[1], '/', "whitespace split: $desc: field 1";
++    }
++}
++
+ {
+     # 'RT #116086: split "\x20" does not work as documented';
+     my @results;
+-- 
+2.9.4
+
diff --git a/perl.spec b/perl.spec
index 5d3cc9a..868fda7 100644
--- a/perl.spec
+++ b/perl.spec
@@ -212,6 +212,10 @@ Patch52:        perl-5.26.0-perl-131588-be-a-little-more-careful-in-arybase-_tie
 # in upstream adter 5.27.1
 Patch53:        perl-5.27.1-perl-131597-ensure-the-GV-slot-is-filled-for-our-foo.patch
 
+# Fix splitting non-ASCII strings if unicode_strings feature is enabled,
+# RT#130907 in upstream after 5.27.1
+Patch54:        perl-5.27.1-RT-130907-Fix-the-Unicode-Bug-in-split.patch
+
 # Link XS modules to libperl.so with EU::CBuilder on Linux, bug #960048
 Patch200:       perl-5.16.3-Link-XS-modules-to-libperl.so-with-EU-CBuilder-on-Li.patch
 
@@ -2795,6 +2799,7 @@ Perl extension for Version Objects
 %patch51 -p1
 %patch52 -p1
 %patch53 -p1
+%patch54 -p1
 %patch200 -p1
 %patch201 -p1
 
@@ -2833,6 +2838,7 @@ perl -x patchlevel.h \
     'Fedora Patch51: Fix error message for "our sub foo::bar" (RT#131679)' \
     'Fedora Patch52: Fix executing arybase::_tie_it() in Safe compartement (RT#131588)' \
     'Fedora Patch53: Fix handling attribute specification on our variables (RT#131597)' \
+    'Fedora Patch54: Fix splitting non-ASCII strings if unicode_strings feature is enabled (RT#130907)' \
     'Fedora Patch200: Link XS modules to libperl.so with EU::CBuilder on Linux' \
     'Fedora Patch201: Link XS modules to libperl.so with EU::MM on Linux' \
     %{nil}
@@ -5127,6 +5133,7 @@ popd
 - Fix error message for "our sub foo::bar" (RT#131679)
 - Fix executing arybase::_tie_it() in Safe compartement (RT#131588)
 - Fix handling attribute specification on our variables (RT#131597)
+- Fix splitting non-ASCII strings if unicode_strings feature is enabled (RT#130907)
 
 * Sat Jul 29 2017 Igor Gnatenko <ignatenkobrain@fedoraproject.org> - 4:5.26.0-397
 - Enable separate debuginfo back