ruby/rubygem-strscan-1.0.2-Accept-String-as-a-pattern.patch
2024-06-25 10:26:27 +02:00

265 lines
8.5 KiB
Diff

From e56ac27d19cc3acdf6c1cb13b14224c43df5f5f6 Mon Sep 17 00:00:00 2001
From: Kouhei Sutou <kou@clear-code.com>
Date: Thu, 4 Apr 2019 17:52:50 +0900
Subject: [PATCH] Accept String as a pattern
It's only for head only match case such as StringScanner#scan.
If we use a String as a pattern, we can improve match performance.
Here is a result of the including benchmark. It shows String as a
pattern is 1.25x faster than Regexp as a pattern.
% rake benchmark
/tmp/local/bin/ruby -S benchmark-driver benchmark/scan.yaml
Warming up --------------------------------------
regexp 12.094M i/s - 12.242M times in 1.012250s (82.69ns/i, 277clocks/i)
string 14.653M i/s - 14.889M times in 1.016124s (68.25ns/i, 252clocks/i)
Calculating -------------------------------------
regexp 14.713M i/s - 36.281M times in 2.465970s (67.97ns/i, 254clocks/i)
string 18.422M i/s - 43.959M times in 2.386255s (54.28ns/i, 201clocks/i)
Comparison:
string: 18421631.8 i/s
regexp: 14712660.7 i/s - 1.25x slower
====
Backport https://github.com/ruby/strscan/pull/4 for strscan.
REXML fixes for CVE-2024-35716 depend on this feature.
---
ext/strscan/strscan.c | 92 +++++++++++++++++++-----------
test/strscan/test_stringscanner.rb | 45 ++++++++++++++-
2 files changed, 100 insertions(+), 37 deletions(-)
diff --git a/ext/strscan/strscan.c b/ext/strscan/strscan.c
index d6168a0d4f..43319b672e 100644
--- a/ext/strscan/strscan.c
+++ b/ext/strscan/strscan.c
@@ -447,15 +447,18 @@ strscan_set_pos(VALUE self, VALUE v)
}
static VALUE
-strscan_do_scan(VALUE self, VALUE regex, int succptr, int getstr, int headonly)
+strscan_do_scan(VALUE self, VALUE pattern, int succptr, int getstr, int headonly)
{
- regex_t *rb_reg_prepare_re(VALUE re, VALUE str);
struct strscanner *p;
- regex_t *re;
- long ret;
- int tmpreg;
- Check_Type(regex, T_REGEXP);
+ if (headonly) {
+ if (!RB_TYPE_P(pattern, T_REGEXP)) {
+ StringValue(pattern);
+ }
+ }
+ else {
+ Check_Type(pattern, T_REGEXP);
+ }
GET_SCANNER(self, p);
CLEAR_MATCH_STATUS(p);
@@ -463,37 +466,55 @@ strscan_do_scan(VALUE self, VALUE regex, int succptr, int getstr, int headonly)
return Qnil;
}
- p->regex = regex;
- re = rb_reg_prepare_re(regex, p->str);
- tmpreg = re != RREGEXP_PTR(regex);
- if (!tmpreg) RREGEXP(regex)->usecnt++;
+ if (RB_TYPE_P(pattern, T_REGEXP)) {
+ regex_t *rb_reg_prepare_re(VALUE re, VALUE str);
+ regex_t *re;
+ long ret;
+ int tmpreg;
+
+ p->regex = pattern;
+ re = rb_reg_prepare_re(pattern, p->str);
+ tmpreg = re != RREGEXP_PTR(pattern);
+ if (!tmpreg) RREGEXP(pattern)->usecnt++;
+
+ if (headonly) {
+ ret = onig_match(re, (UChar* )CURPTR(p),
+ (UChar* )(CURPTR(p) + S_RESTLEN(p)),
+ (UChar* )CURPTR(p), &(p->regs), ONIG_OPTION_NONE);
+ }
+ else {
+ ret = onig_search(re,
+ (UChar* )CURPTR(p), (UChar* )(CURPTR(p) + S_RESTLEN(p)),
+ (UChar* )CURPTR(p), (UChar* )(CURPTR(p) + S_RESTLEN(p)),
+ &(p->regs), ONIG_OPTION_NONE);
+ }
+ if (!tmpreg) RREGEXP(pattern)->usecnt--;
+ if (tmpreg) {
+ if (RREGEXP(pattern)->usecnt) {
+ onig_free(re);
+ }
+ else {
+ onig_free(RREGEXP_PTR(pattern));
+ RREGEXP_PTR(pattern) = re;
+ }
+ }
- if (headonly) {
- ret = onig_match(re, (UChar* )CURPTR(p),
- (UChar* )(CURPTR(p) + S_RESTLEN(p)),
- (UChar* )CURPTR(p), &(p->regs), ONIG_OPTION_NONE);
+ if (ret == -2) rb_raise(ScanError, "regexp buffer overflow");
+ if (ret < 0) {
+ /* not matched */
+ return Qnil;
+ }
}
else {
- ret = onig_search(re,
- (UChar* )CURPTR(p), (UChar* )(CURPTR(p) + S_RESTLEN(p)),
- (UChar* )CURPTR(p), (UChar* )(CURPTR(p) + S_RESTLEN(p)),
- &(p->regs), ONIG_OPTION_NONE);
- }
- if (!tmpreg) RREGEXP(regex)->usecnt--;
- if (tmpreg) {
- if (RREGEXP(regex)->usecnt) {
- onig_free(re);
+ rb_enc_check(p->str, pattern);
+ if (S_RESTLEN(p) < RSTRING_LEN(pattern)) {
+ return Qnil;
}
- else {
- onig_free(RREGEXP_PTR(regex));
- RREGEXP_PTR(regex) = re;
+ if (memcmp(CURPTR(p), RSTRING_PTR(pattern), RSTRING_LEN(pattern)) != 0) {
+ return Qnil;
}
- }
-
- if (ret == -2) rb_raise(ScanError, "regexp buffer overflow");
- if (ret < 0) {
- /* not matched */
- return Qnil;
+ onig_region_clear(&(p->regs));
+ onig_region_set(&(p->regs), 0, 0, RSTRING_LEN(pattern));
}
MATCHED(p);
@@ -520,7 +541,8 @@ strscan_do_scan(VALUE self, VALUE regex, int succptr, int getstr, int headonly)
* p s.scan(/\w+/) # -> "test"
* p s.scan(/\w+/) # -> nil
* p s.scan(/\s+/) # -> " "
- * p s.scan(/\w+/) # -> "string"
+ * p s.scan("str") # -> "str"
+ * p s.scan(/\w+/) # -> "ing"
* p s.scan(/./) # -> nil
*
*/
@@ -539,6 +561,7 @@ strscan_scan(VALUE self, VALUE re)
* s = StringScanner.new('test string')
* p s.match?(/\w+/) # -> 4
* p s.match?(/\w+/) # -> 4
+ * p s.match?("test") # -> 4
* p s.match?(/\s+/) # -> nil
*/
static VALUE
@@ -560,7 +583,8 @@ strscan_match_p(VALUE self, VALUE re)
* p s.skip(/\w+/) # -> 4
* p s.skip(/\w+/) # -> nil
* p s.skip(/\s+/) # -> 1
- * p s.skip(/\w+/) # -> 6
+ * p s.skip("st") # -> 2
+ * p s.skip(/\w+/) # -> 4
* p s.skip(/./) # -> nil
*
*/
diff --git a/test/strscan/test_stringscanner.rb b/test/strscan/test_stringscanner.rb
index 3423f9cfed..63b1ce1f9b 100644
--- a/test/strscan/test_stringscanner.rb
+++ b/test/strscan/test_stringscanner.rb
@@ -282,6 +282,22 @@ def test_scan
assert_equal "", s.scan(//)
end
+ def test_scan_string
+ s = StringScanner.new('stra strb strc')
+ assert_equal 'str', s.scan('str')
+ assert_equal 'str', s[0]
+ assert_equal 3, s.pos
+ assert_equal false, s.tainted?
+ assert_equal 'a ', s.scan('a ')
+
+ str = 'stra strb strc'.dup
+ str.taint
+ s = StringScanner.new(str, false)
+ matched = s.scan('str')
+ assert_equal 'str', matched
+ assert_equal true, matched.tainted?
+ end
+
def test_skip
s = StringScanner.new('stra strb strc', true)
assert_equal 4, s.skip(/\w+/)
@@ -367,8 +383,10 @@ def test_matched
assert_equal false, s.matched.tainted?
s.scan(/\s+/)
assert_equal ' ', s.matched
+ s.scan('st')
+ assert_equal 'st', s.matched
s.scan(/\w+/)
- assert_equal 'strb', s.matched
+ assert_equal 'rb', s.matched
s.scan(/\s+/)
assert_equal ' ', s.matched
s.scan(/\w+/)
@@ -483,7 +501,7 @@ def test_pre_match
s.skip(/\s/)
assert_equal 'a', s.pre_match
assert_equal false, s.pre_match.tainted?
- s.scan(/\w/)
+ s.scan('b')
assert_equal 'a ', s.pre_match
s.scan_until(/c/)
assert_equal 'a b ', s.pre_match
@@ -513,7 +531,7 @@ def test_post_match
assert_equal ' b c d e', s.post_match
s.skip(/\s/)
assert_equal 'b c d e', s.post_match
- s.scan(/\w/)
+ s.scan('b')
assert_equal ' c d e', s.post_match
s.scan_until(/c/)
assert_equal ' d e', s.post_match
@@ -589,6 +607,20 @@ def test_encoding
assert_equal(Encoding::EUC_JP, ss.scan(/./e).encoding)
end
+ def test_encoding_string
+ str = "\xA1\xA2".dup.force_encoding("euc-jp")
+ ss = StringScanner.new(str)
+ assert_equal(str.dup, ss.scan(str.dup))
+ end
+
+ def test_invalid_encoding_string
+ str = "\xA1\xA2".dup.force_encoding("euc-jp")
+ ss = StringScanner.new(str)
+ assert_raise(Encoding::CompatibilityError) do
+ ss.scan(str.encode("UTF-8"))
+ end
+ end
+
def test_generic_regexp
ss = StringScanner.new("\xA1\xA2".dup.force_encoding("euc-jp"))
t = ss.scan(/./)
@@ -643,6 +675,13 @@ def test_exist_p
assert_equal(nil, s.exist?(/e/))
end
+ def test_exist_p_string
+ s = StringScanner.new("test string")
+ assert_raise(TypeError) do
+ s.exist?(" ")
+ end
+ end
+
def test_skip_until
s = StringScanner.new("Foo Bar Baz")
assert_equal(3, s.skip_until(/Foo/))