Fix case-less match if cases differ in encoding length

2011-12-02 11:47:19 +01:00 · 2011-12-02 11:47:19 +01:00 · 789dda6d1e
commit 789dda6d1e
parent eadca49929
2 changed files with 158 additions and 1 deletions
--- a/pcre-8.20-caseless_different_length.patch
+++ b/pcre-8.20-caseless_different_length.patch
@ -0,0 +1,150 @@
 From 72a4bb52e09d46af0b00dd4064f93e9948fdad51 Mon Sep 17 00:00:00 2001
 From: =?UTF-8?q?Petr=20P=C3=ADsa=C5=99?= <ppisar@redhat.com>
 Date: Fri, 2 Dec 2011 11:36:54 +0100
 Subject: [PATCH] Fix caseless match if cases differ in encoding length
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 From:
 r778 | ph10 | 2011-12-01 18:38:47 +0100 (Čt, 01 pro 2011) | 3 lines
 Fix bug with caseless matching of characters of different lengths when
 the shorter is right at the end of the subject.
 Petr Pisar: Changelog entry removed.
 ---
 pcre_exec.c          |   32 ++++++++++++++++----------------
 testdata/testinput6  |   14 ++++++++++++++
 testdata/testoutput6 |   22 ++++++++++++++++++++++
 3 files changed, 52 insertions(+), 16 deletions(-)
 diff --git a/pcre_exec.c b/pcre_exec.c
 index 2e763d1..9881bdd 100644
 --- a/pcre_exec.c
 +++ b/pcre_exec.c
@@ -427,7 +427,7 @@ returns a negative (error) response, the outer incarnation must also return the
 same response. */
 /* These macros pack up tests that are used for partial matching, and which
 -appears several times in the code. We set the "hit end" flag if the pointer is
 +appear several times in the code. We set the "hit end" flag if the pointer is
 at the end of the subject and also past the start of the subject (i.e.
 something has been matched). For hard partial matching, we then return
 immediately. The second one is used when we already know we are past the end of
@@ -3039,31 +3039,36 @@ for (;;)
       }
     break;
 -    /* Match a single character, caselessly */
 +    /* Match a single character, caselessly. If we are at the end of the 
 +    subject, give up immediately. */
     case OP_CHARI:
 +    if (eptr >= md->end_subject)
 +      {
 +      SCHECK_PARTIAL(); 
 +      MRRETURN(MATCH_NOMATCH); 
 +      }   
 + 
 #ifdef SUPPORT_UTF8
     if (utf8)
       {
       length = 1;
       ecode++;
       GETCHARLEN(fc, ecode, length);
 -
 -      if (length > md->end_subject - eptr)
 -        {
 -        CHECK_PARTIAL();             /* Not SCHECK_PARTIAL() */
 -        MRRETURN(MATCH_NOMATCH);
 -        }
 -
 + 
       /* If the pattern character's value is < 128, we have only one byte, and
 -      can use the fast lookup table. */
 +      we know that its other case must also be one byte long, so we can use the
 +      fast lookup table. We know that there is at least one byte left in the 
 +      subject. */
       if (fc < 128)
         {
         if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
         }
 -      /* Otherwise we must pick up the subject character */
 +      /* Otherwise we must pick up the subject character. Note that we cannot
 +      use the value of "length" to check for sufficient bytes left, because the
 +      other case of the character may have more or fewer bytes.  */
       else
         {
@@ -3088,11 +3093,6 @@ for (;;)
     /* Non-UTF-8 mode */
       {
 -      if (md->end_subject - eptr < 1)
 -        {
 -        SCHECK_PARTIAL();            /* This one can use SCHECK_PARTIAL() */
 -        MRRETURN(MATCH_NOMATCH);
 -        }
       if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
       ecode += 2;
       }
 diff --git a/testdata/testinput6 b/testdata/testinput6
 index e5fc0e9..6b0d2f7 100644
 --- a/testdata/testinput6
 +++ b/testdata/testinput6
@@ -802,4 +802,18 @@
     ** Failers 
     a\xFCb   
 +/ⱥ/8i
 +    ⱥ
 +    Ⱥx 
 +    Ⱥ 
 +
 +/[ⱥ]/8i
 +    ⱥ
 +    Ⱥx 
 +    Ⱥ 
 +
 +/Ⱥ/8i
 +    Ⱥ
 +    ⱥ
 +
 /-- End of testinput6 --/
 diff --git a/testdata/testoutput6 b/testdata/testoutput6
 index 1acaa23..68c0a46 100644
 --- a/testdata/testoutput6
 +++ b/testdata/testoutput6
@@ -1353,4 +1353,26 @@ No match
     a\xFCb   
 No match
 +/ⱥ/8i
 +    ⱥ
 + 0: \x{2c65}
 +    Ⱥx 
 + 0: \x{23a}
 +    Ⱥ 
 + 0: \x{23a}
 +
 +/[ⱥ]/8i
 +    ⱥ
 + 0: \x{2c65}
 +    Ⱥx 
 + 0: \x{23a}
 +    Ⱥ 
 + 0: \x{23a}
 +
 +/Ⱥ/8i
 +    Ⱥ
 + 0: \x{23a}
 +    ⱥ
 + 0: \x{2c65}
 +
 /-- End of testinput6 --/
 -- 
 1.7.7.4
--- a/pcre.spec
+++ b/pcre.spec
@ -1,7 +1,7 @@
 # This is stable release: %%global rcversion RC3
 Name: pcre
 Version: 8.20
-Release: %{?rcversion:0.}6%{?rcversion:.%rcversion}%{?dist}
+Release: %{?rcversion:0.}7%{?rcversion:.%rcversion}%{?dist}
 %global myversion %{version}%{?rcversion:-%rcversion}
 Summary: Perl-compatible regular expression library
 Group: System Environment/Libraries
@ -19,6 +19,9 @@ Patch3: pcre-8.20-lookbehind-2.patch
 Patch4: pcre-8.20-forward_reference.patch
 # Fix cache-flush in JIT on PPC, in upstream after 8.20.
 Patch5: pcre-8.20-ppcjit.patch
+# Fix case-less match if cases differ in encoding length, in upstream after
+# 8.20.
+Patch6: pcre-8.20-caseless_different_length.patch
 BuildRequires: readline-devel
 # New libtool to get rid of rpath
 BuildRequires: autoconf, automake, libtool
@ -63,6 +66,7 @@ libtoolize --copy --force && autoreconf
 %patch3 -p1 -b .lookbehind2
 %patch4 -p0 -b .forward_reference
 %patch5 -p0 -b .ppcjit
+%patch6 -p1 -b .caseless_different_length
 # One contributor's name is non-UTF-8
 for F in ChangeLog; do
    iconv -f latin1 -t utf8 "$F" >"${F}.utf8"
@ -133,6 +137,9 @@ make check
 %{_mandir}/man1/pcretest.*
 %changelog
+* Fri Dec 02 2011 Petr Pisar <ppisar@redhat.com> - 8.20-7
+- Fix case-less match if cases differ in encoding length (bug #756675)
 * Fri Nov 25 2011 Petr Pisar <ppisar@redhat.com> - 8.20-6
 - Fix cache-flush in JIT on PPC