136 lines
3.9 KiB
Diff
136 lines
3.9 KiB
Diff
From cbdb75742d763ab4133942014747132400c6ddfb Mon Sep 17 00:00:00 2001
|
|
From: Pali <pali@cpan.org>
|
|
Date: Tue, 16 Aug 2016 19:05:17 +0200
|
|
Subject: [PATCH] Encode::utf8: Fix processing invalid UTF-8 subsequences
|
|
MIME-Version: 1.0
|
|
Content-Type: text/plain; charset=UTF-8
|
|
Content-Transfer-Encoding: 8bit
|
|
|
|
Skip the number of characters that were actually processed by convert_utf8_multi_seq
|
|
and not just the expected number of characters. Also simplify the check for strict
|
|
UTF-8 mode.
|
|
|
|
Signed-off-by: Petr Písař <ppisar@redhat.com>
|
|
---
|
|
Encode.xs | 46 ++++++++++++++++++++++++++--------------------
|
|
t/fallback.t | 2 +-
|
|
2 files changed, 27 insertions(+), 21 deletions(-)
|
|
|
|
diff --git a/Encode.xs b/Encode.xs
|
|
index 6b4fae9..60de62c 100644
|
|
--- a/Encode.xs
|
|
+++ b/Encode.xs
|
|
@@ -325,13 +325,20 @@ strict_utf8(pTHX_ SV* sv)
|
|
#define UNICODE_IS_NONCHAR(c) ((c >= 0xFDD0 && c <= 0xFDEF) || (c & 0xFFFE) == 0xFFFE)
|
|
#endif
|
|
|
|
+#ifndef UNICODE_IS_SUPER
|
|
+#define UNICODE_IS_SUPER(c) (c > PERL_UNICODE_MAX)
|
|
+#endif
|
|
+
|
|
+#define UNICODE_IS_STRICT(c) (!UNICODE_IS_SURROGATE(c) && !UNICODE_IS_NONCHAR(c) && !UNICODE_IS_SUPER(c))
|
|
+
|
|
+/*
|
|
+ * Convert a non-strict UTF-8 sequence of len >= 2 to a Unicode code point
|
|
+ */
|
|
static UV
|
|
-convert_utf8_multi_seq(U8* s, STRLEN len, bool strict)
|
|
+convert_utf8_multi_seq(U8* s, STRLEN len, STRLEN *rlen)
|
|
{
|
|
UV uv;
|
|
-
|
|
- if (strict && len > 4)
|
|
- return 0;
|
|
+ U8 *ptr = s;
|
|
|
|
uv = NATIVE_TO_UTF(*s) & UTF_START_MASK(len);
|
|
|
|
@@ -339,15 +346,15 @@ convert_utf8_multi_seq(U8* s, STRLEN len, bool strict)
|
|
s++;
|
|
|
|
while (len--) {
|
|
- if (!UTF8_IS_CONTINUATION(*s))
|
|
+ if (!UTF8_IS_CONTINUATION(*s)) {
|
|
+ *rlen = s-ptr;
|
|
return 0;
|
|
+ }
|
|
uv = UTF8_ACCUMULATE(uv, *s);
|
|
s++;
|
|
}
|
|
|
|
- if (strict && (UNICODE_IS_SURROGATE(uv) || UNICODE_IS_NONCHAR(uv) || uv > PERL_UNICODE_MAX))
|
|
- return 0;
|
|
-
|
|
+ *rlen = s-ptr;
|
|
return uv;
|
|
}
|
|
|
|
@@ -384,32 +391,30 @@ process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv,
|
|
continue;
|
|
}
|
|
|
|
+ ulen = 1;
|
|
if (UTF8_IS_START(*s)) {
|
|
U8 skip = UTF8SKIP(s);
|
|
if ((s + skip) > e) {
|
|
if (stop_at_partial || (check & ENCODE_STOP_AT_PARTIAL)) {
|
|
const U8 *p = s + 1;
|
|
for (; p < e; p++) {
|
|
- if (!UTF8_IS_CONTINUATION(*p))
|
|
+ if (!UTF8_IS_CONTINUATION(*p)) {
|
|
+ ulen = p-s;
|
|
goto malformed_byte;
|
|
+ }
|
|
}
|
|
break;
|
|
}
|
|
|
|
+ ulen = e-s;
|
|
goto malformed_byte;
|
|
}
|
|
|
|
- ulen = skip;
|
|
- uv = convert_utf8_multi_seq(s, skip, strict);
|
|
- if (uv == 0) {
|
|
- if (strict) {
|
|
- uv = convert_utf8_multi_seq(s, skip, 0);
|
|
- if (uv == 0)
|
|
- goto malformed_byte;
|
|
- goto malformed;
|
|
- }
|
|
+ uv = convert_utf8_multi_seq(s, skip, &ulen);
|
|
+ if (uv == 0)
|
|
goto malformed_byte;
|
|
- }
|
|
+ else if (strict && !UNICODE_IS_STRICT(uv))
|
|
+ goto malformed;
|
|
|
|
|
|
/* Whole char is good */
|
|
@@ -422,7 +427,8 @@ process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv,
|
|
/* If we get here there is something wrong with alleged UTF-8 */
|
|
malformed_byte:
|
|
uv = (UV)*s;
|
|
- ulen = 1;
|
|
+ if (ulen == 0)
|
|
+ ulen = 1;
|
|
|
|
malformed:
|
|
if (check & ENCODE_DIE_ON_ERR){
|
|
diff --git a/t/fallback.t b/t/fallback.t
|
|
index 8ef8ab3..86605ef 100644
|
|
--- a/t/fallback.t
|
|
+++ b/t/fallback.t
|
|
@@ -35,7 +35,7 @@ for my $i (0x80..0xff){
|
|
$uo .= chr($i);
|
|
$residue .= chr($i);
|
|
$af .= '?';
|
|
- $uf .= "\x{FFFD}";
|
|
+ $uf .= "\x{FFFD}" if $i < 0xfd;
|
|
$ap .= sprintf("\\x{%04x}", $i);
|
|
$up .= sprintf("\\x%02X", $i);
|
|
$ah .= sprintf("&#%d;", $i);
|
|
--
|
|
2.7.4
|
|
|