From 281d2faaadfab697923a8bd0a3d9fe28194385e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=20P=C3=ADsa=C5=99?= Date: Thu, 3 Nov 2016 13:33:10 +0100 Subject: [PATCH] Fix firstchar bitmap under UTF-8 with prefix optimization --- ...rl-129950-fix-firstchar-bitmap-under.patch | 97 +++++++++++++++++++ perl.spec | 8 ++ 2 files changed, 105 insertions(+) create mode 100644 perl-5.24.0-regcomp.c-fix-perl-129950-fix-firstchar-bitmap-under.patch diff --git a/perl-5.24.0-regcomp.c-fix-perl-129950-fix-firstchar-bitmap-under.patch b/perl-5.24.0-regcomp.c-fix-perl-129950-fix-firstchar-bitmap-under.patch new file mode 100644 index 0000000..82eeea6 --- /dev/null +++ b/perl-5.24.0-regcomp.c-fix-perl-129950-fix-firstchar-bitmap-under.patch @@ -0,0 +1,97 @@ +From 1b90dad20879f0e7a3eced5da0e0aacda93708ed Mon Sep 17 00:00:00 2001 +From: Yves Orton +Date: Thu, 27 Oct 2016 13:52:24 +0200 +Subject: [PATCH] regcomp.c: fix perl #129950 - fix firstchar bitmap under utf8 + with prefix optimisation +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Ported to 5.24.0: + +commit da42332b10691ba7af7550035ffc7f46c87e4e66 +Author: Yves Orton +Date: Thu Oct 27 13:52:24 2016 +0200 + + regcomp.c: fix perl #129950 - fix firstchar bitmap under utf8 with prefix optimisation + + The trie code contains a number of sub optimisations, one of which + extracts common prefixes from alternations, and another which is a + bitmap of the possible matching first chars. + + The bitmap needs to contain the possible first octets of the string + which the trie can match, and for codepoints which might have a different + first octet under utf8 or non-utf8 we need to register BOTH codepoints. + + So for instance in the pattern (?:a|a\x{E4}) we should restructure this + as a(|\x{E4}), and the bitmap for the trie should contain both \x{E4} AND + \x{C3} as \x{C3} is the first byte of \x{E4} expressed as utf8. 
+ +Signed-off-by: Petr Písař +--- + regcomp.c | 14 ++++++++++++++ + t/re/pat.t | 9 ++++++++- + 2 files changed, 22 insertions(+), 1 deletion(-) + +diff --git a/regcomp.c b/regcomp.c +index 7462885..bcb8db5 100644 +--- a/regcomp.c ++++ b/regcomp.c +@@ -3272,6 +3272,13 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, + TRIE_BITMAP_SET(trie,*ch); + if ( folder ) + TRIE_BITMAP_SET(trie, folder[ *ch ]); ++ if ( !UTF ) { ++ /* store first byte of utf8 representation of ++ variant codepoints */ ++ if (! UVCHR_IS_INVARIANT(*ch)) { ++ TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(*ch)); ++ } ++ } + DEBUG_OPTIMISE_r( + Perl_re_printf( aTHX_ "%s", (char*)ch) + ); +@@ -3280,6 +3287,13 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, + TRIE_BITMAP_SET(trie,*ch); + if ( folder ) + TRIE_BITMAP_SET(trie,folder[ *ch ]); ++ if ( !UTF ) { ++ /* store first byte of utf8 representation of ++ variant codepoints */ ++ if (! UVCHR_IS_INVARIANT(*ch)) { ++ TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(*ch)); ++ } ++ } + DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch)); + } + idx = ofs; +diff --git a/t/re/pat.t b/t/re/pat.t +index 295a9f7..4aa77cf 100644 +--- a/t/re/pat.t ++++ b/t/re/pat.t +@@ -23,7 +23,7 @@ BEGIN { + skip_all_without_unicode_tables(); + } + +-plan tests => 789; # Update this when adding/deleting tests. ++plan tests => 791; # Update this when adding/deleting tests. 
+ + run_tests() unless caller; + +@@ -1758,6 +1758,13 @@ EOP + fresh_perl_is($code, $expect, {}, "$bug - $test_name" ); + } + } ++ ++ { ++ my $str = "a\xE4"; ++ ok( $str =~ m{^(a|a\x{e4})$}, "fix [perl #129950] - latin1 case" ); ++ utf8::upgrade($str); ++ ok( $str =~ m{^(a|a\x{e4})$}, "fix [perl #129950] - utf8 case" ); ++ } + } # End of sub run_tests + + 1; +-- +2.7.4 + diff --git a/perl.spec b/perl.spec index 0893966..42c36de 100644 --- a/perl.spec +++ b/perl.spec @@ -206,6 +206,11 @@ Patch54: perl-5.24.0-perl-129350-anchored-floating-substrings-must-be-utf # Fix parsing perl options in shell bang line, RT#129336, # in upstream after 5.25.5 Patch55: perl-5.24.0-rt-129336-perl-i-u-erroneously-interpreted-as-u.patch + +# Fix firstchar bitmap under UTF-8 with prefix optimization, RT#129950, +# in upstream after 5.25.6 +Patch56: perl-5.24.0-regcomp.c-fix-perl-129950-fix-firstchar-bitmap-under.patch + # Link XS modules to libperl.so with EU::CBuilder on Linux, bug #960048 Patch200: perl-5.16.3-Link-XS-modules-to-libperl.so-with-EU-CBuilder-on-Li.patch @@ -2884,6 +2889,7 @@ Perl extension for Version Objects %patch53 -p1 %patch54 -p1 %patch55 -p1 +%patch56 -p1 %patch200 -p1 %patch201 -p1 @@ -2930,6 +2936,7 @@ perl -x patchlevel.h \ 'Fedora Patch53: Fix string overrun in Perl_gv_fetchmethod_pvn_flags (RT#129267)' \ 'Fedora Patch54: Fix crash when matching UTF-8 string with non-UTF-8 substrings (RT#129350)' \ 'Fedora Patch55: Fix parsing perl options in shell bang line (RT#129336)' \ + 'Fedora Patch56: Fix firstchar bitmap under UTF-8 with prefix optimization (RT#129950)' \ 'Fedora Patch200: Link XS modules to libperl.so with EU::CBuilder on Linux' \ 'Fedora Patch201: Link XS modules to libperl.so with EU::MM on Linux' \ %{nil} @@ -5214,6 +5221,7 @@ popd - Fix string overrun in Perl_gv_fetchmethod_pvn_flags (RT#129267) - Fix crash when matching UTF-8 string with non-UTF-8 substrings (RT#129350) - Fix parsing perl options in shell bang line (RT#129336) +- Fix firstchar 
bitmap under UTF-8 with prefix optimization (RT#129950) * Fri Sep 02 2016 Petr Pisar - 4:5.24.0-378 - perl-core depends on Parse::CPAN::Meta module instead of package name to allow