perl/perl-5.24.0-regcomp.c-fix-perl-129950-fix-firstchar-bitmap-under.patch

98 lines
3.8 KiB
Diff
Raw Normal View History

From 1b90dad20879f0e7a3eced5da0e0aacda93708ed Mon Sep 17 00:00:00 2001
From: Yves Orton <demerphq@gmail.com>
Date: Thu, 27 Oct 2016 13:52:24 +0200
Subject: [PATCH] regcomp.c: fix perl #129950 - fix firstchar bitmap under utf8
with prefix optimisation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Ported to 5.24.0:
commit da42332b10691ba7af7550035ffc7f46c87e4e66
Author: Yves Orton <demerphq@gmail.com>
Date: Thu Oct 27 13:52:24 2016 +0200
regcomp.c: fix perl #129950 - fix firstchar bitmap under utf8 with prefix optimisation
The trie code contains a number of sub optimisations, one of which
extracts common prefixes from alternations, and another which is a
bitmap of the possible matching first chars.
The bitmap needs to contain the possible first octets of the string
which the trie can match, and for codepoints which might have a different
first octet under utf8 or non-utf8 we need to register BOTH first octets.
So for instance in the pattern (?:a|a\x{E4}) we should restructure this
as a(|\x{E4}), and the bitmap for the trie should contain both \x{E4} AND
\x{C3} as \x{C3} is the first byte of \x{E4} expressed as utf8.
Signed-off-by: Petr Písař <ppisar@redhat.com>
---
regcomp.c | 14 ++++++++++++++
t/re/pat.t | 9 ++++++++-
2 files changed, 22 insertions(+), 1 deletion(-)
diff --git a/regcomp.c b/regcomp.c
index 7462885..bcb8db5 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -3272,6 +3272,13 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
TRIE_BITMAP_SET(trie,*ch);
if ( folder )
TRIE_BITMAP_SET(trie, folder[ *ch ]);
+ if ( !UTF ) {
+ /* store first byte of utf8 representation of
+ variant codepoints */
+ if (! UVCHR_IS_INVARIANT(*ch)) {
+ TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(*ch));
+ }
+ }
DEBUG_OPTIMISE_r(
Perl_re_printf( aTHX_ "%s", (char*)ch)
);
@@ -3280,6 +3287,13 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
TRIE_BITMAP_SET(trie,*ch);
if ( folder )
TRIE_BITMAP_SET(trie,folder[ *ch ]);
+ if ( !UTF ) {
+ /* store first byte of utf8 representation of
+ variant codepoints */
+ if (! UVCHR_IS_INVARIANT(*ch)) {
+ TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(*ch));
+ }
+ }
DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch));
}
idx = ofs;
diff --git a/t/re/pat.t b/t/re/pat.t
index 295a9f7..4aa77cf 100644
--- a/t/re/pat.t
+++ b/t/re/pat.t
@@ -23,7 +23,7 @@ BEGIN {
skip_all_without_unicode_tables();
}
-plan tests => 789; # Update this when adding/deleting tests.
+plan tests => 791; # Update this when adding/deleting tests.
run_tests() unless caller;
@@ -1758,6 +1758,13 @@ EOP
fresh_perl_is($code, $expect, {}, "$bug - $test_name" );
}
}
+
+ {
+ my $str = "a\xE4";
+ ok( $str =~ m{^(a|a\x{e4})$}, "fix [perl #129950] - latin1 case" );
+ utf8::upgrade($str);
+ ok( $str =~ m{^(a|a\x{e4})$}, "fix [perl #129950] - utf8 case" );
+ }
} # End of sub run_tests
1;
--
2.7.4