pcre2/pcre2-10.30-Fix-PCRE2_FIRSTLINE-bug-when-a-pattern-match-starts-.patch

From f3b22988611cca57770a705f05c0d9ef583d605a Mon Sep 17 00:00:00 2001
From: ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>
Date: Mon, 1 Jan 2018 14:12:35 +0000
Subject: [PATCH] Fix PCRE2_FIRSTLINE bug when a pattern match starts with the
 first code unit of a newline sequence.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@899 6239d852-aaf2-0410-a92c-79f79f948069
Petr Písař: Ported to 10.30

Signed-off-by: Petr Písař <ppisar@redhat.com>
---
 src/pcre2_dfa_match.c | 21 +++++++++++++++------
 src/pcre2_match.c     | 19 ++++++++++++++-----
 testdata/testinput2   | 10 ++++++++++
 testdata/testinput6   | 10 ++++++++++
 testdata/testoutput2  | 13 +++++++++++++
 testdata/testoutput6  | 13 +++++++++++++
 6 files changed, 75 insertions(+), 11 deletions(-)

diff --git a/src/pcre2_dfa_match.c b/src/pcre2_dfa_match.c
index 5ae1394..7bbd6d3 100644
--- a/src/pcre2_dfa_match.c
+++ b/src/pcre2_dfa_match.c
@@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
 
                        Written by Philip Hazel
      Original API code Copyright (c) 1997-2012 University of Cambridge
-          New API code Copyright (c) 2016-2017 University of Cambridge
+          New API code Copyright (c) 2016-2018 University of Cambridge
 
 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
@@ -3558,9 +3558,11 @@ for (;;)
 
     /* If firstline is TRUE, the start of the match is constrained to the first
     line of a multiline string. That is, the match must be before or at the
-    first newline. Implement this by temporarily adjusting end_subject so that
-    we stop the optimization scans for a first code unit at a newline. If the
-    match fails at the newline, later code breaks this loop. */
+    first newline following the start of matching. Temporarily adjust
+    end_subject so that we stop the optimization scans for a first code unit
+    immediately after the first character of a newline (the first code unit can
+    legitimately be a newline). If the match fails at the newline, later code
+    breaks this loop. */
 
     if (firstline)
       {
@@ -3568,7 +3570,7 @@ for (;;)
 #ifdef SUPPORT_UNICODE
       if (utf)
         {
-        while (t < mb->end_subject && !IS_NEWLINE(t))
+        while (t < end_subject && !IS_NEWLINE(t))
           {
           t++;
           ACROSSCHAR(t < end_subject, *t, t++);
@@ -3576,7 +3578,14 @@ for (;;)
         }
       else
 #endif
-      while (t < mb->end_subject && !IS_NEWLINE(t)) t++;
+      while (t < end_subject && !IS_NEWLINE(t)) t++;
+
+      /* Note that we only need to advance by one code unit if we found a
+      newline. If the newline is CRLF, a first code unit of LF should not
+      match, because it is not at or before the newline. Similarly, only the
+      first code unit of a Unicode newline might be relevant. */
+
+      if (t < end_subject) t++;
       end_subject = t;
       }
 
diff --git a/src/pcre2_match.c b/src/pcre2_match.c
index 050b7e9..8afb0d8 100644
--- a/src/pcre2_match.c
+++ b/src/pcre2_match.c
@@ -6541,9 +6541,11 @@ for(;;)
 
     /* If firstline is TRUE, the start of the match is constrained to the first
     line of a multiline string. That is, the match must be before or at the
-    first newline. Implement this by temporarily adjusting end_subject so that
-    we stop the optimization scans for a first code unit at a newline. If the
-    match fails at the newline, later code breaks this loop. */
+    first newline following the start of matching. Temporarily adjust
+    end_subject so that we stop the optimization scans for a first code unit
+    immediately after the first character of a newline (the first code unit can
+    legitimately be a newline). If the match fails at the newline, later code
+    breaks this loop. */
 
     if (firstline)
       {
@@ -6551,7 +6553,7 @@ for(;;)
 #ifdef SUPPORT_UNICODE
       if (utf)
         {
-        while (t < mb->end_subject && !IS_NEWLINE(t))
+        while (t < end_subject && !IS_NEWLINE(t))
           {
           t++;
           ACROSSCHAR(t < end_subject, *t, t++);
@@ -6559,7 +6561,14 @@ for(;;)
         }
       else
 #endif
-      while (t < mb->end_subject && !IS_NEWLINE(t)) t++;
+      while (t < end_subject && !IS_NEWLINE(t)) t++;
+
+      /* Note that we only need to advance by one code unit if we found a
+      newline. If the newline is CRLF, a first code unit of LF should not
+      match, because it is not at or before the newline. Similarly, only the
+      first code unit of a Unicode newline might be relevant. */
+
+      if (t < end_subject) t++;
       end_subject = t;
       }
 
diff --git a/testdata/testinput2 b/testdata/testinput2
index 695f0a4..b173fe0 100644
--- a/testdata/testinput2
+++ b/testdata/testinput2
@@ -5385,4 +5385,14 @@ a)"xI
     ab
     aaab 
 
+/\n/firstline
+    xyz\nabc
+
+/\nabc/firstline
+    xyz\nabc
+
+/\x{0a}abc/firstline,newline=crlf
+\= Expect no match
+    xyz\r\nabc
+
 # End of testinput2
diff --git a/testdata/testinput6 b/testdata/testinput6
index ce2e082..614c3a0 100644
--- a/testdata/testinput6
+++ b/testdata/testinput6
@@ -4932,4 +4932,14 @@
 /(*LIMIT_MATCH=100).*(?![|H]?.*(?![|H]?););.*(?![|H]?.*(?![|H]?););\x00\x00\x00\x00\x00\x00\x00(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?![|);)?.*(![|H]?);)?.*(?![|H]?);)?.*(?![|H]?);)?.*(?![|H]););![|H]?););[|H]?);|H]?);)\x00\x00\x00\x00\x00\x00H]?););?![|H]?);)?.*(?![|H]?););[||H]?);)?.*(?![|H]?););[|H]?);(?![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););;[\x00\x00\x00\x00\x00\x00\x00![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););/no_dotstar_anchor
 .*(?![|H]?.*(?![|H]?););.*(?![|H]?.*(?![|H]?););\x00\x00\x00\x00\x00\x00\x00(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?![|);)?.*(![|H]?);)?.*(?![|H]?);)?.*(?![|H]?);)?.*(?![|H]););![|H]?););[|H]?);|H]?);)\x00\x00\x00\x00\x00\x00H]?););?![|H]?);)?.*(?![|H]?););[||H]?);)?.*(?![|H]?););[|H]?);(?![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););;[\x00\x00\x00\x00\x00\x00\x00![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););
 
+/\n/firstline
+    xyz\nabc
+
+/\nabc/firstline
+    xyz\nabc
+
+/\x{0a}abc/firstline,newline=crlf
+\= Expect no match
+    xyz\r\nabc
+
 # End of testinput6
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index 31ccfbe..c19c270 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -16368,6 +16368,19 @@ Subject length lower bound = 1
  0: ab
  1: a
 
+/\n/firstline
+    xyz\nabc
+ 0: \x0a
+
+/\nabc/firstline
+    xyz\nabc
+ 0: \x0aabc
+
+/\x{0a}abc/firstline,newline=crlf
+\= Expect no match
+    xyz\r\nabc
+No match
+
 # End of testinput2
 Error -65: PCRE2_ERROR_BADDATA (unknown error number)
 Error -62: bad serialized data
diff --git a/testdata/testoutput6 b/testdata/testoutput6
index b912944..2d321d5 100644
--- a/testdata/testoutput6
+++ b/testdata/testoutput6
@@ -7753,4 +7753,17 @@ No match
 .*(?![|H]?.*(?![|H]?););.*(?![|H]?.*(?![|H]?););\x00\x00\x00\x00\x00\x00\x00(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?![|);)?.*(![|H]?);)?.*(?![|H]?);)?.*(?![|H]?);)?.*(?![|H]););![|H]?););[|H]?);|H]?);)\x00\x00\x00\x00\x00\x00H]?););?![|H]?);)?.*(?![|H]?););[||H]?);)?.*(?![|H]?););[|H]?);(?![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););;[\x00\x00\x00\x00\x00\x00\x00![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););
 Failed: error -47: match limit exceeded
 
+/\n/firstline
+    xyz\nabc
+ 0: \x0a
+
+/\nabc/firstline
+    xyz\nabc
+ 0: \x0aabc
+
+/\x{0a}abc/firstline,newline=crlf
+\= Expect no match
+    xyz\r\nabc
+No match
+
 # End of testinput6
-- 
2.13.6