Refuse non-shortest UTF-8 representations in strict mode
This commit is contained in:
parent
b6a844858c
commit
e02bca8997
@ -0,0 +1,95 @@
|
||||
From 2aac84fb885fb8560294c1080fc6bbf9be35e731 Mon Sep 17 00:00:00 2001
|
||||
From: Pali <pali@cpan.org>
|
||||
Date: Tue, 16 Aug 2016 18:34:37 +0200
|
||||
Subject: [PATCH] Encode::utf8: Add tests for Malformed and Overlong UTF-8
|
||||
sequences
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
EBCDIC tests are not included yet.
|
||||
|
||||
Signed-off-by: Petr Písař <ppisar@redhat.com>
|
||||
---
|
||||
t/utf8strict.t | 51 ++++++++++++++++++++++++++++++++++++++++++++++-----
|
||||
1 file changed, 46 insertions(+), 5 deletions(-)
|
||||
|
||||
diff --git a/t/utf8strict.t b/t/utf8strict.t
|
||||
index 3f362f4..39293d3 100644
|
||||
--- a/t/utf8strict.t
|
||||
+++ b/t/utf8strict.t
|
||||
@@ -47,8 +47,8 @@ BEGIN {
|
||||
qq/dd 67 41 41/ => 0, # 2.3.2
|
||||
qq/ee 42 73 73 71/ => 0, # 2.3.3
|
||||
qq/f4 90 80 80/ => 1, # 2.3.4 -- out of range so NG
|
||||
- # "3 Malformed sequences" are checked by perl.
|
||||
- # "4 Overlong sequences" are checked by perl.
|
||||
+ # EBCDIC TODO: "3 Malformed sequences"
|
||||
+ # EBCDIC TODO: "4 Overlong sequences"
|
||||
);
|
||||
} else {
|
||||
%SEQ = (
|
||||
@@ -56,8 +56,49 @@ BEGIN {
|
||||
qq/ee 80 80/ => 0, # 2.3.2
|
||||
qq/f4 8f bf bd/ => 0, # 2.3.3
|
||||
qq/f4 90 80 80/ => 1, # 2.3.4 -- out of range so NG
|
||||
- # "3 Malformed sequences" are checked by perl.
|
||||
- # "4 Overlong sequences" are checked by perl.
|
||||
+ qq/80/ => 1, # 3.1.1
|
||||
+ qq/bf/ => 1, # 3.1.2
|
||||
+ qq/80 bf/ => 1, # 3.1.3
|
||||
+ qq/80 bf 80/ => 1, # 3.1.4
|
||||
+ qq/80 bf 80 bf/ => 1, # 3.1.5
|
||||
+ qq/80 bf 80 bf 80/ => 1, # 3.1.6
|
||||
+ qq/80 bf 80 bf 80 bf/ => 1, # 3.1.7
|
||||
+ qq/80 bf 80 bf 80 bf 80/ => 1, # 3.1.8
|
||||
+ qq/80 81 82 83 84 85 86 87 88 89 8a 8b 8c 8d 8e 8f 90 91 92 93 94 95 96 97 98 99 9a 9b 9c 9d 9e 9f a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aa ab ac ad ae af b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 ba bb bc bd be bf/ => 1, # 3.1.9
|
||||
+ qq/c0 20 c1 20 c2 20 c3 20 c4 20 c5 20 c6 20 c7 20 c8 20 c9 20 ca 20 cb 20 cc 20 cd 20 ce 20 cf 20 d0 20 d1 20 d2 20 d3 20 d4 20 d5 20 d6 20 d7 20 d8 20 d9 20 da 20 db 20 dc 20 dd 20 de 20 df 20/ => 1, # 3.2.1
|
||||
+ qq/e0 20 e1 20 e2 20 e3 20 e4 20 e5 20 e6 20 e7 20 e8 20 e9 20 ea 20 eb 20 ec 20 ed 20 ee 20 ef 20/ => 1, # 3.2.2
|
||||
+ qq/f0 20 f1 20 f2 20 f3 20 f4 20 f5 20 f6 20 f7 20/ => 1, # 3.2.3
|
||||
+ qq/f8 20 f9 20 fa 20 fb 20/ => 1, # 3.2.4
|
||||
+ qq/fc 20 fd 20/ => 1, # 3.2.5
|
||||
+ qq/c0/ => 1, # 3.3.1
|
||||
+ qq/e0 80/ => 1, # 3.3.2
|
||||
+ qq/f0 80 80/ => 1, # 3.3.3
|
||||
+ qq/f8 80 80 80/ => 1, # 3.3.4
|
||||
+ qq/fc 80 80 80 80/ => 1, # 3.3.5
|
||||
+ qq/df/ => 1, # 3.3.6
|
||||
+ qq/ef bf/ => 1, # 3.3.7
|
||||
+ qq/f7 bf bf/ => 1, # 3.3.8
|
||||
+ qq/fb bf bf bf/ => 1, # 3.3.9
|
||||
+ qq/fd bf bf bf bf/ => 1, # 3.3.10
|
||||
+ qq/c0 e0 80 f0 80 80 f8 80 80 80 fc 80 80 80 80 df ef bf f7 bf bf fb bf bf bf fd bf bf bf bf/ => 1, # 3.4.1
|
||||
+ qq/fe/ => 1, # 3.5.1
|
||||
+ qq/ff/ => 1, # 3.5.2
|
||||
+ qq/fe fe ff ff/ => 1, # 3.5.3
|
||||
+ qq/c0 af/ => 1, # 4.1.1
|
||||
+ qq/e0 80 af/ => 1, # 4.1.2
|
||||
+ qq/f0 80 80 af/ => 1, # 4.1.3
|
||||
+ qq/f8 80 80 80 af/ => 1, # 4.1.4
|
||||
+ qq/fc 80 80 80 80 af/ => 1, # 4.1.5
|
||||
+ qq/c1 bf/ => 1, # 4.2.1
|
||||
+ qq/e0 9f bf/ => 1, # 4.2.2
|
||||
+ qq/f0 8f bf bf/ => 1, # 4.2.3
|
||||
+ qq/f8 87 bf bf bf/ => 1, # 4.2.4
|
||||
+ qq/fc 83 bf bf bf bf/ => 1, # 4.2.5
|
||||
+ qq/c0 80/ => 1, # 4.3.1
|
||||
+ qq/e0 80 80/ => 1, # 4.3.2
|
||||
+ qq/f0 80 80 80/ => 1, # 4.3.3
|
||||
+ qq/f8 80 80 80 80/ => 1, # 4.3.4
|
||||
+ qq/fc 80 80 80 80 80/ => 1, # 4.3.5
|
||||
);
|
||||
}
|
||||
$NTESTS += scalar keys %SEQ;
|
||||
@@ -82,7 +123,7 @@ for my $s (sort keys %SEQ){
|
||||
eval { $d->decode($o,1) };
|
||||
$DEBUG and $@ and warn $@;
|
||||
my $t = $@ ? 1 : 0;
|
||||
- is($t, $SEQ{$s}, $s);
|
||||
+ is($t, $SEQ{$s}, "sequence: $s");
|
||||
}
|
||||
|
||||
__END__
|
||||
--
|
||||
2.7.4
|
||||
|
@ -0,0 +1,61 @@
|
||||
From b7fc82093961f282b986a62c582b4ca2fcc303db Mon Sep 17 00:00:00 2001
|
||||
From: Pali <pali@cpan.org>
|
||||
Date: Tue, 16 Aug 2016 19:07:31 +0200
|
||||
Subject: [PATCH] Encode::utf8: Check for overflowed and overlong UTF-8
|
||||
sequences
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
Signed-off-by: Petr Písař <ppisar@redhat.com>
|
||||
---
|
||||
Encode.xs | 14 ++++++++++++++
|
||||
1 file changed, 14 insertions(+)
|
||||
|
||||
diff --git a/Encode.xs b/Encode.xs
|
||||
index 60de62c..1906f0c 100644
|
||||
--- a/Encode.xs
|
||||
+++ b/Encode.xs
|
||||
@@ -331,6 +331,13 @@ strict_utf8(pTHX_ SV* sv)
|
||||
|
||||
#define UNICODE_IS_STRICT(c) (!UNICODE_IS_SURROGATE(c) && !UNICODE_IS_NONCHAR(c) && !UNICODE_IS_SUPER(c))
|
||||
|
||||
+#ifndef UTF_ACCUMULATION_OVERFLOW_MASK
|
||||
+#ifndef CHARBITS
|
||||
+#define CHARBITS CHAR_BIT
|
||||
+#endif
|
||||
+#define UTF_ACCUMULATION_OVERFLOW_MASK (((UV) UTF_CONTINUATION_MASK) << ((sizeof(UV) * CHARBITS) - UTF_ACCUMULATION_SHIFT))
|
||||
+#endif
|
||||
+
|
||||
/*
|
||||
* Convert non strict utf8 sequence of len >= 2 to unicode codepoint
|
||||
*/
|
||||
@@ -339,6 +346,7 @@ convert_utf8_multi_seq(U8* s, STRLEN len, STRLEN *rlen)
|
||||
{
|
||||
UV uv;
|
||||
U8 *ptr = s;
|
||||
+ bool overflowed = 0;
|
||||
|
||||
uv = NATIVE_TO_UTF(*s) & UTF_START_MASK(len);
|
||||
|
||||
@@ -350,11 +358,17 @@ convert_utf8_multi_seq(U8* s, STRLEN len, STRLEN *rlen)
|
||||
*rlen = s-ptr;
|
||||
return 0;
|
||||
}
|
||||
+ if (uv & UTF_ACCUMULATION_OVERFLOW_MASK)
|
||||
+ overflowed = 1;
|
||||
uv = UTF8_ACCUMULATE(uv, *s);
|
||||
s++;
|
||||
}
|
||||
|
||||
*rlen = s-ptr;
|
||||
+
|
||||
+ if (overflowed || *rlen > (STRLEN)UNISKIP(uv))
|
||||
+ return 0;
|
||||
+
|
||||
return uv;
|
||||
}
|
||||
|
||||
--
|
||||
2.7.4
|
||||
|
@ -0,0 +1,33 @@
|
||||
From 3cf4b7e53c84d91fa819d89d2504be2db90dee11 Mon Sep 17 00:00:00 2001
|
||||
From: Pali <pali@cpan.org>
|
||||
Date: Fri, 19 Aug 2016 10:58:56 +0200
|
||||
Subject: [PATCH] Encode::utf8: Fix count of replacement characters for
|
||||
overflowed and overlong UTF-8 sequences
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
Signed-off-by: Petr Písař <ppisar@redhat.com>
|
||||
---
|
||||
Encode.xs | 4 +++-
|
||||
1 file changed, 3 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/Encode.xs b/Encode.xs
|
||||
index 1906f0c..49a3846 100644
|
||||
--- a/Encode.xs
|
||||
+++ b/Encode.xs
|
||||
@@ -366,8 +366,10 @@ convert_utf8_multi_seq(U8* s, STRLEN len, STRLEN *rlen)
|
||||
|
||||
*rlen = s-ptr;
|
||||
|
||||
- if (overflowed || *rlen > (STRLEN)UNISKIP(uv))
|
||||
+ if (overflowed || *rlen > (STRLEN)UNISKIP(uv)) {
|
||||
+ *rlen = 1;
|
||||
return 0;
|
||||
+ }
|
||||
|
||||
return uv;
|
||||
}
|
||||
--
|
||||
2.7.4
|
||||
|
@ -0,0 +1,135 @@
|
||||
From cbdb75742d763ab4133942014747132400c6ddfb Mon Sep 17 00:00:00 2001
|
||||
From: Pali <pali@cpan.org>
|
||||
Date: Tue, 16 Aug 2016 19:05:17 +0200
|
||||
Subject: [PATCH] Encode::utf8: Fix processing invalid UTF-8 subsequences
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
Skip the number of characters that were really processed by convert_utf8_multi_seq,
|
||||
not just the expected number of characters. Also simplify the check for strict
|
||||
UTF-8 mode.
|
||||
|
||||
Signed-off-by: Petr Písař <ppisar@redhat.com>
|
||||
---
|
||||
Encode.xs | 46 ++++++++++++++++++++++++++--------------------
|
||||
t/fallback.t | 2 +-
|
||||
2 files changed, 27 insertions(+), 21 deletions(-)
|
||||
|
||||
diff --git a/Encode.xs b/Encode.xs
|
||||
index 6b4fae9..60de62c 100644
|
||||
--- a/Encode.xs
|
||||
+++ b/Encode.xs
|
||||
@@ -325,13 +325,20 @@ strict_utf8(pTHX_ SV* sv)
|
||||
#define UNICODE_IS_NONCHAR(c) ((c >= 0xFDD0 && c <= 0xFDEF) || (c & 0xFFFE) == 0xFFFE)
|
||||
#endif
|
||||
|
||||
+#ifndef UNICODE_IS_SUPER
|
||||
+#define UNICODE_IS_SUPER(c) (c > PERL_UNICODE_MAX)
|
||||
+#endif
|
||||
+
|
||||
+#define UNICODE_IS_STRICT(c) (!UNICODE_IS_SURROGATE(c) && !UNICODE_IS_NONCHAR(c) && !UNICODE_IS_SUPER(c))
|
||||
+
|
||||
+/*
|
||||
+ * Convert non strict utf8 sequence of len >= 2 to unicode codepoint
|
||||
+ */
|
||||
static UV
|
||||
-convert_utf8_multi_seq(U8* s, STRLEN len, bool strict)
|
||||
+convert_utf8_multi_seq(U8* s, STRLEN len, STRLEN *rlen)
|
||||
{
|
||||
UV uv;
|
||||
-
|
||||
- if (strict && len > 4)
|
||||
- return 0;
|
||||
+ U8 *ptr = s;
|
||||
|
||||
uv = NATIVE_TO_UTF(*s) & UTF_START_MASK(len);
|
||||
|
||||
@@ -339,15 +346,15 @@ convert_utf8_multi_seq(U8* s, STRLEN len, bool strict)
|
||||
s++;
|
||||
|
||||
while (len--) {
|
||||
- if (!UTF8_IS_CONTINUATION(*s))
|
||||
+ if (!UTF8_IS_CONTINUATION(*s)) {
|
||||
+ *rlen = s-ptr;
|
||||
return 0;
|
||||
+ }
|
||||
uv = UTF8_ACCUMULATE(uv, *s);
|
||||
s++;
|
||||
}
|
||||
|
||||
- if (strict && (UNICODE_IS_SURROGATE(uv) || UNICODE_IS_NONCHAR(uv) || uv > PERL_UNICODE_MAX))
|
||||
- return 0;
|
||||
-
|
||||
+ *rlen = s-ptr;
|
||||
return uv;
|
||||
}
|
||||
|
||||
@@ -384,32 +391,30 @@ process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv,
|
||||
continue;
|
||||
}
|
||||
|
||||
+ ulen = 1;
|
||||
if (UTF8_IS_START(*s)) {
|
||||
U8 skip = UTF8SKIP(s);
|
||||
if ((s + skip) > e) {
|
||||
if (stop_at_partial || (check & ENCODE_STOP_AT_PARTIAL)) {
|
||||
const U8 *p = s + 1;
|
||||
for (; p < e; p++) {
|
||||
- if (!UTF8_IS_CONTINUATION(*p))
|
||||
+ if (!UTF8_IS_CONTINUATION(*p)) {
|
||||
+ ulen = p-s;
|
||||
goto malformed_byte;
|
||||
+ }
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
+ ulen = e-s;
|
||||
goto malformed_byte;
|
||||
}
|
||||
|
||||
- ulen = skip;
|
||||
- uv = convert_utf8_multi_seq(s, skip, strict);
|
||||
- if (uv == 0) {
|
||||
- if (strict) {
|
||||
- uv = convert_utf8_multi_seq(s, skip, 0);
|
||||
- if (uv == 0)
|
||||
- goto malformed_byte;
|
||||
- goto malformed;
|
||||
- }
|
||||
+ uv = convert_utf8_multi_seq(s, skip, &ulen);
|
||||
+ if (uv == 0)
|
||||
goto malformed_byte;
|
||||
- }
|
||||
+ else if (strict && !UNICODE_IS_STRICT(uv))
|
||||
+ goto malformed;
|
||||
|
||||
|
||||
/* Whole char is good */
|
||||
@@ -422,7 +427,8 @@ process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv,
|
||||
/* If we get here there is something wrong with alleged UTF-8 */
|
||||
malformed_byte:
|
||||
uv = (UV)*s;
|
||||
- ulen = 1;
|
||||
+ if (ulen == 0)
|
||||
+ ulen = 1;
|
||||
|
||||
malformed:
|
||||
if (check & ENCODE_DIE_ON_ERR){
|
||||
diff --git a/t/fallback.t b/t/fallback.t
|
||||
index 8ef8ab3..86605ef 100644
|
||||
--- a/t/fallback.t
|
||||
+++ b/t/fallback.t
|
||||
@@ -35,7 +35,7 @@ for my $i (0x80..0xff){
|
||||
$uo .= chr($i);
|
||||
$residue .= chr($i);
|
||||
$af .= '?';
|
||||
- $uf .= "\x{FFFD}";
|
||||
+ $uf .= "\x{FFFD}" if $i < 0xfd;
|
||||
$ap .= sprintf("\\x{%04x}", $i);
|
||||
$up .= sprintf("\\x%02X", $i);
|
||||
$ah .= sprintf("&#%d;", $i);
|
||||
--
|
||||
2.7.4
|
||||
|
@ -20,6 +20,18 @@ Source0: http://www.cpan.org/authors/id/D/DA/DANKOGAI/Encode-%{cpan_versi
|
||||
# Fix Encode::encode_utf8(undef) to return undef, CPAN RT#116904,
|
||||
# in upstream after 2.86
|
||||
Patch0: Encode-2.86-Fix-return-value-of-Encode-encode_utf8-undef.patch
|
||||
# Tests for refusing non-shortest UTF-8 representations,
|
||||
# <https://github.com/dankogai/p5-encode/issues/64>, in upstream after 2.86
|
||||
Patch1: Encode-2.86-Encode-utf8-Add-tests-for-Malformed-and-Overlong-UTF.patch
|
||||
# 1/3 Refusing non-shortest UTF-8 representations,
|
||||
# <https://github.com/dankogai/p5-encode/issues/64>, in upstream after 2.86
|
||||
Patch2: Encode-2.86-Encode-utf8-Fix-processing-invalid-UTF-8-subsequence.patch
|
||||
# 2/3 Refusing non-shortest UTF-8 representations,
|
||||
# <https://github.com/dankogai/p5-encode/issues/64>, in upstream after 2.86
|
||||
Patch3: Encode-2.86-Encode-utf8-Check-for-overflowed-and-overlong-UTF-8-.patch
|
||||
# 3/3 Refusing non-shortest UTF-8 representations,
|
||||
# <https://github.com/dankogai/p5-encode/issues/64>, in upstream after 2.86
|
||||
Patch4: Encode-2.86-Encode-utf8-Fix-count-of-replacement-characters-for-.patch
|
||||
BuildRequires: coreutils
|
||||
BuildRequires: findutils
|
||||
BuildRequires: make
|
||||
@ -131,6 +143,10 @@ your own encoding to perl. No knowledge of XS is necessary.
|
||||
%prep
|
||||
%setup -q -n Encode-%{cpan_version}
|
||||
%patch0 -p1
|
||||
%patch1 -p1
|
||||
%patch2 -p1
|
||||
%patch3 -p1
|
||||
%patch4 -p1
|
||||
|
||||
%build
|
||||
# Additional scripts can be installed by appending MORE_SCRIPTS, UCM files by
|
||||
@ -174,6 +190,7 @@ make test
|
||||
%changelog
|
||||
* Fri Sep 30 2016 Petr Pisar <ppisar@redhat.com> - 4:2.86-3
|
||||
- Fix Encode::encode_utf8(undef) to return undef (CPAN RT#116904)
|
||||
- Refuse non-shortest UTF-8 representations in strict mode
|
||||
|
||||
* Fri Sep 16 2016 Petr Pisar <ppisar@redhat.com> - 4:2.86-2
|
||||
- Add Artistic 2.0 into license tag because of encguess tool
|
||||
|
Loading…
Reference in New Issue
Block a user