Fix case-less match if cases differ in encoding length

This commit is contained in:
Petr Písař 2011-12-02 11:47:19 +01:00
parent eadca49929
commit 789dda6d1e
2 changed files with 158 additions and 1 deletions

View File

@ -0,0 +1,150 @@
From 72a4bb52e09d46af0b00dd4064f93e9948fdad51 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Petr=20P=C3=ADsa=C5=99?= <ppisar@redhat.com>
Date: Fri, 2 Dec 2011 11:36:54 +0100
Subject: [PATCH] Fix caseless match if cases differ in encoding length
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
From:
r778 | ph10 | 2011-12-01 18:38:47 +0100 (Čt, 01 pro 2011) | 3 lines
Fix bug with caseless matching of characters of different lengths when
the shorter is right at the end of the subject.
Petr Pisar: Changelog entry removed.
---
pcre_exec.c | 32 ++++++++++++++++----------------
testdata/testinput6 | 14 ++++++++++++++
testdata/testoutput6 | 22 ++++++++++++++++++++++
3 files changed, 52 insertions(+), 16 deletions(-)
diff --git a/pcre_exec.c b/pcre_exec.c
index 2e763d1..9881bdd 100644
--- a/pcre_exec.c
+++ b/pcre_exec.c
@@ -427,7 +427,7 @@ returns a negative (error) response, the outer incarnation must also return the
same response. */
/* These macros pack up tests that are used for partial matching, and which
-appears several times in the code. We set the "hit end" flag if the pointer is
+appear several times in the code. We set the "hit end" flag if the pointer is
at the end of the subject and also past the start of the subject (i.e.
something has been matched). For hard partial matching, we then return
immediately. The second one is used when we already know we are past the end of
@@ -3039,31 +3039,36 @@ for (;;)
}
break;
- /* Match a single character, caselessly */
+ /* Match a single character, caselessly. If we are at the end of the
+ subject, give up immediately. */
case OP_CHARI:
+ if (eptr >= md->end_subject)
+ {
+ SCHECK_PARTIAL();
+ MRRETURN(MATCH_NOMATCH);
+ }
+
#ifdef SUPPORT_UTF8
if (utf8)
{
length = 1;
ecode++;
GETCHARLEN(fc, ecode, length);
-
- if (length > md->end_subject - eptr)
- {
- CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
- MRRETURN(MATCH_NOMATCH);
- }
-
+
/* If the pattern character's value is < 128, we have only one byte, and
- can use the fast lookup table. */
+ we know that its other case must also be one byte long, so we can use the
+ fast lookup table. We know that there is at least one byte left in the
+ subject. */
if (fc < 128)
{
if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
}
- /* Otherwise we must pick up the subject character */
+ /* Otherwise we must pick up the subject character. Note that we cannot
+ use the value of "length" to check for sufficient bytes left, because the
+ other case of the character may have more or fewer bytes. */
else
{
@@ -3088,11 +3093,6 @@ for (;;)
/* Non-UTF-8 mode */
{
- if (md->end_subject - eptr < 1)
- {
- SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
- MRRETURN(MATCH_NOMATCH);
- }
if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
ecode += 2;
}
diff --git a/testdata/testinput6 b/testdata/testinput6
index e5fc0e9..6b0d2f7 100644
--- a/testdata/testinput6
+++ b/testdata/testinput6
@@ -802,4 +802,18 @@
** Failers
a\xFCb
+/ⱥ/8i
+ ⱥ
+ Ⱥx
+ Ⱥ
+
+/[ⱥ]/8i
+ ⱥ
+ Ⱥx
+ Ⱥ
+
+/Ⱥ/8i
+ Ⱥ
+ ⱥ
+
/-- End of testinput6 --/
diff --git a/testdata/testoutput6 b/testdata/testoutput6
index 1acaa23..68c0a46 100644
--- a/testdata/testoutput6
+++ b/testdata/testoutput6
@@ -1353,4 +1353,26 @@ No match
a\xFCb
No match
+/ⱥ/8i
+ ⱥ
+ 0: \x{2c65}
+ Ⱥx
+ 0: \x{23a}
+ Ⱥ
+ 0: \x{23a}
+
+/[ⱥ]/8i
+ ⱥ
+ 0: \x{2c65}
+ Ⱥx
+ 0: \x{23a}
+ Ⱥ
+ 0: \x{23a}
+
+/Ⱥ/8i
+ Ⱥ
+ 0: \x{23a}
+ ⱥ
+ 0: \x{2c65}
+
/-- End of testinput6 --/
--
1.7.7.4

View File

@ -1,7 +1,7 @@
# This is stable release: %%global rcversion RC3
Name: pcre
Version: 8.20
-Release: %{?rcversion:0.}6%{?rcversion:.%rcversion}%{?dist}
+Release: %{?rcversion:0.}7%{?rcversion:.%rcversion}%{?dist}
%global myversion %{version}%{?rcversion:-%rcversion}
Summary: Perl-compatible regular expression library
Group: System Environment/Libraries
@ -19,6 +19,9 @@ Patch3: pcre-8.20-lookbehind-2.patch
Patch4: pcre-8.20-forward_reference.patch
# Fix cache-flush in JIT on PPC, in upstream after 8.20.
Patch5: pcre-8.20-ppcjit.patch
+# Fix case-less match if cases differ in encoding length, in upstream after
+# 8.20.
+Patch6: pcre-8.20-caseless_different_length.patch
BuildRequires: readline-devel
# New libtool to get rid of rpath
BuildRequires: autoconf, automake, libtool
@ -63,6 +66,7 @@ libtoolize --copy --force && autoreconf
%patch3 -p1 -b .lookbehind2
%patch4 -p0 -b .forward_reference
%patch5 -p0 -b .ppcjit
+%patch6 -p1 -b .caseless_different_length
# One contributor's name is non-UTF-8
for F in ChangeLog; do
iconv -f latin1 -t utf8 "$F" >"${F}.utf8"
@ -133,6 +137,9 @@ make check
%{_mandir}/man1/pcretest.*
%changelog
+* Fri Dec 02 2011 Petr Pisar <ppisar@redhat.com> - 8.20-7
+- Fix case-less match if cases differ in encoding length (bug #756675)
* Fri Nov 25 2011 Petr Pisar <ppisar@redhat.com> - 8.20-6
- Fix cache-flush in JIT on PPC