pcre2/pcre2-10.30-Update-pcre2demo-to-deal-with-various-K-inside-asser.patch

97 lines
3.6 KiB
Diff

From f442210323e228a407dfda75b6bb7a62e91111ee Mon Sep 17 00:00:00 2001
From: ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>
Date: Sat, 23 Dec 2017 17:15:51 +0000
Subject: [PATCH 1/3] Update pcre2demo to deal with various \K inside assertion
anomalies.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@894 6239d852-aaf2-0410-a92c-79f79f948069
Petr Písař: Ported to 10.30.
---
src/pcre2demo.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++
diff --git a/src/pcre2demo.c b/src/pcre2demo.c
index 8ae49f1..5d9b321 100644
--- a/src/pcre2demo.c
+++ b/src/pcre2demo.c
@@ -211,6 +211,21 @@ pcre2_match_data_create_from_pattern() above. */
if (rc == 0)
printf("ovector was not big enough for all the captured substrings\n");
+/* We must guard against patterns such as /(?=.\K)/ that use \K in an assertion
+to set the start of a match later than its end. In this demonstration program,
+we just detect this case and give up. */
+
+if (ovector[0] > ovector[1])
+ {
+ printf("\\K was used in an assertion to set the match start after its end.\n"
+ "From end to start the match was: %.*s\n", (int)(ovector[0] - ovector[1]),
+ (char *)(subject + ovector[1]));
+ printf("Run abandoned\n");
+ pcre2_match_data_free(match_data);
+ pcre2_code_free(re);
+ return 1;
+ }
+
/* Show substrings stored in the output vector by number. Obviously, in a real
application you might want to do things other than print them. */
@@ -338,6 +353,29 @@ for (;;)
options = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED;
}
+ /* If the previous match was not an empty string, there is one tricky case to
+ consider. If a pattern contains \K within a lookbehind assertion at the
+ start, the end of the matched string can be at the offset where the match
+ started. Without special action, this leads to a loop that keeps on matching
+ the same substring. We must detect this case and arrange to move the start on
+ by one character. The pcre2_get_startchar() function returns the offset of
+ the character at which the previous successful match started. */
+
+ else
+ {
+ PCRE2_SIZE startchar = pcre2_get_startchar(match_data);
+ if (start_offset <= startchar)
+ {
+ if (startchar >= subject_length) break; /* Reached end of subject. */
+ start_offset = startchar + 1; /* Advance by one character. */
+ if (utf8) /* If UTF-8, it may be more */
+ { /* than one code unit. */
+ for (; start_offset < subject_length; start_offset++)
+ if ((subject[start_offset] & 0xc0) != 0x80) break;
+ }
+ }
+ }
+
/* Run the next matching operation */
rc = pcre2_match(
@@ -402,6 +440,21 @@ for (;;)
if (rc == 0)
printf("ovector was not big enough for all the captured substrings\n");
+ /* We must guard against patterns such as /(?=.\K)/ that use \K in an
+ assertion to set the start of a match later than its end. In this
+ demonstration program, we just detect this case and give up. */
+
+ if (ovector[0] > ovector[1])
+ {
+ printf("\\K was used in an assertion to set the match start after its end.\n"
+ "From end to start the match was: %.*s\n", (int)(ovector[0] - ovector[1]),
+ (char *)(subject + ovector[1]));
+ printf("Run abandoned\n");
+ pcre2_match_data_free(match_data);
+ pcre2_code_free(re);
+ return 1;
+ }
+
/* As before, show substrings stored in the output vector by number, and then
also any named substrings. */
--
2.13.6