From e02bca89973eb682d0336a4bb6e1235865cd42b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=20P=C3=ADsa=C5=99?= Date: Fri, 30 Sep 2016 10:51:47 +0200 Subject: [PATCH] Refuse non-shortests UTF-8 representations in strict mode --- ...tests-for-Malformed-and-Overlong-UTF.patch | 95 ++++++++++++ ...k-for-overflowed-and-overlong-UTF-8-.patch | 61 ++++++++ ...count-of-replacement-characters-for-.patch | 33 +++++ ...processing-invalid-UTF-8-subsequence.patch | 135 ++++++++++++++++++ perl-Encode.spec | 17 +++ 5 files changed, 341 insertions(+) create mode 100644 Encode-2.86-Encode-utf8-Add-tests-for-Malformed-and-Overlong-UTF.patch create mode 100644 Encode-2.86-Encode-utf8-Check-for-overflowed-and-overlong-UTF-8-.patch create mode 100644 Encode-2.86-Encode-utf8-Fix-count-of-replacement-characters-for-.patch create mode 100644 Encode-2.86-Encode-utf8-Fix-processing-invalid-UTF-8-subsequence.patch diff --git a/Encode-2.86-Encode-utf8-Add-tests-for-Malformed-and-Overlong-UTF.patch b/Encode-2.86-Encode-utf8-Add-tests-for-Malformed-and-Overlong-UTF.patch new file mode 100644 index 0000000..5d6b3d5 --- /dev/null +++ b/Encode-2.86-Encode-utf8-Add-tests-for-Malformed-and-Overlong-UTF.patch @@ -0,0 +1,95 @@ +From 2aac84fb885fb8560294c1080fc6bbf9be35e731 Mon Sep 17 00:00:00 2001 +From: Pali +Date: Tue, 16 Aug 2016 18:34:37 +0200 +Subject: [PATCH] Encode::utf8: Add tests for Malformed and Overlong UTF-8 + sequences +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +EBCDIC tests are not included yet. 
+ +Signed-off-by: Petr Písař +--- + t/utf8strict.t | 51 ++++++++++++++++++++++++++++++++++++++++++++++----- + 1 file changed, 46 insertions(+), 5 deletions(-) + +diff --git a/t/utf8strict.t b/t/utf8strict.t +index 3f362f4..39293d3 100644 +--- a/t/utf8strict.t ++++ b/t/utf8strict.t +@@ -47,8 +47,8 @@ BEGIN { + qq/dd 67 41 41/ => 0, # 2.3.2 + qq/ee 42 73 73 71/ => 0, # 2.3.3 + qq/f4 90 80 80/ => 1, # 2.3.4 -- out of range so NG +- # "3 Malformed sequences" are checked by perl. +- # "4 Overlong sequences" are checked by perl. ++ # EBCDIC TODO: "3 Malformed sequences" ++ # EBCDIC TODO: "4 Overlong sequences" + ); + } else { + %SEQ = ( +@@ -56,8 +56,49 @@ BEGIN { + qq/ee 80 80/ => 0, # 2.3.2 + qq/f4 8f bf bd/ => 0, # 2.3.3 + qq/f4 90 80 80/ => 1, # 2.3.4 -- out of range so NG +- # "3 Malformed sequences" are checked by perl. +- # "4 Overlong sequences" are checked by perl. ++ qq/80/ => 1, # 3.1.1 ++ qq/bf/ => 1, # 3.1.2 ++ qq/80 bf/ => 1, # 3.1.3 ++ qq/80 bf 80/ => 1, # 3.1.4 ++ qq/80 bf 80 bf/ => 1, # 3.1.5 ++ qq/80 bf 80 bf 80/ => 1, # 3.1.6 ++ qq/80 bf 80 bf 80 bf/ => 1, # 3.1.7 ++ qq/80 bf 80 bf 80 bf 80/ => 1, # 3.1.8 ++ qq/80 81 82 83 84 85 86 87 88 89 8a 8b 8c 8d 8e 8f 90 91 92 93 94 95 96 97 98 99 9a 9b 9c 9d 9e 9f a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aa ab ac ad ae af b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 ba bb bc bd be bf/ => 1, # 3.1.9 ++ qq/c0 20 c1 20 c2 20 c3 20 c4 20 c5 20 c6 20 c7 20 c8 20 c9 20 ca 20 cb 20 cc 20 cd 20 ce 20 cf 20 d0 20 d1 20 d2 20 d3 20 d4 20 d5 20 d6 20 d7 20 d8 20 d9 20 da 20 db 20 dc 20 dd 20 de 20 df 20/ => 1, # 3.2.1 ++ qq/e0 20 e1 20 e2 20 e3 20 e4 20 e5 20 e6 20 e7 20 e8 20 e9 20 ea 20 eb 20 ec 20 ed 20 ee 20 ef 20/ => 1, # 3.2.2 ++ qq/f0 20 f1 20 f2 20 f3 20 f4 20 f5 20 f6 20 f7 20/ => 1, # 3.2.3 ++ qq/f8 20 f9 20 fa 20 fb 20/ => 1, # 3.2.4 ++ qq/fc 20 fd 20/ => 1, # 3.2.5 ++ qq/c0/ => 1, # 3.3.1 ++ qq/e0 80/ => 1, # 3.3.2 ++ qq/f0 80 80/ => 1, # 3.3.3 ++ qq/f8 80 80 80/ => 1, # 3.3.4 ++ qq/fc 80 80 80 80/ => 1, # 3.3.5 ++ qq/df/ => 1, # 3.3.6 
++ qq/ef bf/ => 1, # 3.3.7 ++ qq/f7 bf bf/ => 1, # 3.3.8 ++ qq/fb bf bf bf/ => 1, # 3.3.9 ++ qq/fd bf bf bf bf/ => 1, # 3.3.10 ++ qq/c0 e0 80 f0 80 80 f8 80 80 80 fc 80 80 80 80 df ef bf f7 bf bf fb bf bf bf fd bf bf bf bf/ => 1, # 3.4.1 ++ qq/fe/ => 1, # 3.5.1 ++ qq/ff/ => 1, # 3.5.2 ++ qq/fe fe ff ff/ => 1, # 3.5.3 ++ qq/c0 af/ => 1, # 4.1.1 ++ qq/e0 80 af/ => 1, # 4.1.2 ++ qq/f0 80 80 af/ => 1, # 4.1.3 ++ qq/f8 80 80 80 af/ => 1, # 4.1.4 ++ qq/fc 80 80 80 80 af/ => 1, # 4.1.5 ++ qq/c1 bf/ => 1, # 4.2.1 ++ qq/e0 9f bf/ => 1, # 4.2.2 ++ qq/f0 8f bf bf/ => 1, # 4.2.3 ++ qq/f8 87 bf bf bf/ => 1, # 4.2.4 ++ qq/fc 83 bf bf bf bf/ => 1, # 4.2.5 ++ qq/c0 80/ => 1, # 4.3.1 ++ qq/e0 80 80/ => 1, # 4.3.2 ++ qq/f0 80 80 80/ => 1, # 4.3.3 ++ qq/f8 80 80 80 80/ => 1, # 4.3.4 ++ qq/fc 80 80 80 80 80/ => 1, # 4.3.5 + ); + } + $NTESTS += scalar keys %SEQ; +@@ -82,7 +123,7 @@ for my $s (sort keys %SEQ){ + eval { $d->decode($o,1) }; + $DEBUG and $@ and warn $@; + my $t = $@ ? 1 : 0; +- is($t, $SEQ{$s}, $s); ++ is($t, $SEQ{$s}, "sequence: $s"); + } + + __END__ +-- +2.7.4 + diff --git a/Encode-2.86-Encode-utf8-Check-for-overflowed-and-overlong-UTF-8-.patch b/Encode-2.86-Encode-utf8-Check-for-overflowed-and-overlong-UTF-8-.patch new file mode 100644 index 0000000..5fbf2b4 --- /dev/null +++ b/Encode-2.86-Encode-utf8-Check-for-overflowed-and-overlong-UTF-8-.patch @@ -0,0 +1,61 @@ +From b7fc82093961f282b986a62c582b4ca2fcc303db Mon Sep 17 00:00:00 2001 +From: Pali +Date: Tue, 16 Aug 2016 19:07:31 +0200 +Subject: [PATCH] Encode::utf8: Check for overflowed and overlong UTF-8 + sequences +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Signed-off-by: Petr Písař +--- + Encode.xs | 14 ++++++++++++++ + 1 file changed, 14 insertions(+) + +diff --git a/Encode.xs b/Encode.xs +index 60de62c..1906f0c 100644 +--- a/Encode.xs ++++ b/Encode.xs +@@ -331,6 +331,13 @@ strict_utf8(pTHX_ SV* sv) + + #define UNICODE_IS_STRICT(c) (!UNICODE_IS_SURROGATE(c) && 
!UNICODE_IS_NONCHAR(c) && !UNICODE_IS_SUPER(c)) + ++#ifndef UTF_ACCUMULATION_OVERFLOW_MASK ++#ifndef CHARBITS ++#define CHARBITS CHAR_BIT ++#endif ++#define UTF_ACCUMULATION_OVERFLOW_MASK (((UV) UTF_CONTINUATION_MASK) << ((sizeof(UV) * CHARBITS) - UTF_ACCUMULATION_SHIFT)) ++#endif ++ + /* + * Convert non strict utf8 sequence of len >= 2 to unicode codepoint + */ +@@ -339,6 +346,7 @@ convert_utf8_multi_seq(U8* s, STRLEN len, STRLEN *rlen) + { + UV uv; + U8 *ptr = s; ++ bool overflowed = 0; + + uv = NATIVE_TO_UTF(*s) & UTF_START_MASK(len); + +@@ -350,11 +358,17 @@ convert_utf8_multi_seq(U8* s, STRLEN len, STRLEN *rlen) + *rlen = s-ptr; + return 0; + } ++ if (uv & UTF_ACCUMULATION_OVERFLOW_MASK) ++ overflowed = 1; + uv = UTF8_ACCUMULATE(uv, *s); + s++; + } + + *rlen = s-ptr; ++ ++ if (overflowed || *rlen > (STRLEN)UNISKIP(uv)) ++ return 0; ++ + return uv; + } + +-- +2.7.4 + diff --git a/Encode-2.86-Encode-utf8-Fix-count-of-replacement-characters-for-.patch b/Encode-2.86-Encode-utf8-Fix-count-of-replacement-characters-for-.patch new file mode 100644 index 0000000..e3f14a8 --- /dev/null +++ b/Encode-2.86-Encode-utf8-Fix-count-of-replacement-characters-for-.patch @@ -0,0 +1,33 @@ +From 3cf4b7e53c84d91fa819d89d2504be2db90dee11 Mon Sep 17 00:00:00 2001 +From: Pali +Date: Fri, 19 Aug 2016 10:58:56 +0200 +Subject: [PATCH] Encode::utf8: Fix count of replacement characters for + overflowed and overlong UTF-8 sequences +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Signed-off-by: Petr Písař +--- + Encode.xs | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/Encode.xs b/Encode.xs +index 1906f0c..49a3846 100644 +--- a/Encode.xs ++++ b/Encode.xs +@@ -366,8 +366,10 @@ convert_utf8_multi_seq(U8* s, STRLEN len, STRLEN *rlen) + + *rlen = s-ptr; + +- if (overflowed || *rlen > (STRLEN)UNISKIP(uv)) ++ if (overflowed || *rlen > (STRLEN)UNISKIP(uv)) { ++ *rlen = 1; + return 0; ++ } + + return uv; + } +-- +2.7.4 + diff 
--git a/Encode-2.86-Encode-utf8-Fix-processing-invalid-UTF-8-subsequence.patch b/Encode-2.86-Encode-utf8-Fix-processing-invalid-UTF-8-subsequence.patch new file mode 100644 index 0000000..975753a --- /dev/null +++ b/Encode-2.86-Encode-utf8-Fix-processing-invalid-UTF-8-subsequence.patch @@ -0,0 +1,135 @@ +From cbdb75742d763ab4133942014747132400c6ddfb Mon Sep 17 00:00:00 2001 +From: Pali +Date: Tue, 16 Aug 2016 19:05:17 +0200 +Subject: [PATCH] Encode::utf8: Fix processing invalid UTF-8 subsequences +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Skip number characters which was really processed by convert_utf8_multi_seq +and not just expected number of characters. Also simplify check for strict +UTF-8 mode. + +Signed-off-by: Petr Písař +--- + Encode.xs | 46 ++++++++++++++++++++++++++-------------------- + t/fallback.t | 2 +- + 2 files changed, 27 insertions(+), 21 deletions(-) + +diff --git a/Encode.xs b/Encode.xs +index 6b4fae9..60de62c 100644 +--- a/Encode.xs ++++ b/Encode.xs +@@ -325,13 +325,20 @@ strict_utf8(pTHX_ SV* sv) + #define UNICODE_IS_NONCHAR(c) ((c >= 0xFDD0 && c <= 0xFDEF) || (c & 0xFFFE) == 0xFFFE) + #endif + ++#ifndef UNICODE_IS_SUPER ++#define UNICODE_IS_SUPER(c) (c > PERL_UNICODE_MAX) ++#endif ++ ++#define UNICODE_IS_STRICT(c) (!UNICODE_IS_SURROGATE(c) && !UNICODE_IS_NONCHAR(c) && !UNICODE_IS_SUPER(c)) ++ ++/* ++ * Convert non strict utf8 sequence of len >= 2 to unicode codepoint ++ */ + static UV +-convert_utf8_multi_seq(U8* s, STRLEN len, bool strict) ++convert_utf8_multi_seq(U8* s, STRLEN len, STRLEN *rlen) + { + UV uv; +- +- if (strict && len > 4) +- return 0; ++ U8 *ptr = s; + + uv = NATIVE_TO_UTF(*s) & UTF_START_MASK(len); + +@@ -339,15 +346,15 @@ convert_utf8_multi_seq(U8* s, STRLEN len, bool strict) + s++; + + while (len--) { +- if (!UTF8_IS_CONTINUATION(*s)) ++ if (!UTF8_IS_CONTINUATION(*s)) { ++ *rlen = s-ptr; + return 0; ++ } + uv = UTF8_ACCUMULATE(uv, *s); + s++; + } + +- if (strict && 
(UNICODE_IS_SURROGATE(uv) || UNICODE_IS_NONCHAR(uv) || uv > PERL_UNICODE_MAX)) +- return 0; +- ++ *rlen = s-ptr; + return uv; + } + +@@ -384,32 +391,30 @@ process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv, + continue; + } + ++ ulen = 1; + if (UTF8_IS_START(*s)) { + U8 skip = UTF8SKIP(s); + if ((s + skip) > e) { + if (stop_at_partial || (check & ENCODE_STOP_AT_PARTIAL)) { + const U8 *p = s + 1; + for (; p < e; p++) { +- if (!UTF8_IS_CONTINUATION(*p)) ++ if (!UTF8_IS_CONTINUATION(*p)) { ++ ulen = p-s; + goto malformed_byte; ++ } + } + break; + } + ++ ulen = e-s; + goto malformed_byte; + } + +- ulen = skip; +- uv = convert_utf8_multi_seq(s, skip, strict); +- if (uv == 0) { +- if (strict) { +- uv = convert_utf8_multi_seq(s, skip, 0); +- if (uv == 0) +- goto malformed_byte; +- goto malformed; +- } ++ uv = convert_utf8_multi_seq(s, skip, &ulen); ++ if (uv == 0) + goto malformed_byte; +- } ++ else if (strict && !UNICODE_IS_STRICT(uv)) ++ goto malformed; + + + /* Whole char is good */ +@@ -422,7 +427,8 @@ process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv, + /* If we get here there is something wrong with alleged UTF-8 */ + malformed_byte: + uv = (UV)*s; +- ulen = 1; ++ if (ulen == 0) ++ ulen = 1; + + malformed: + if (check & ENCODE_DIE_ON_ERR){ +diff --git a/t/fallback.t b/t/fallback.t +index 8ef8ab3..86605ef 100644 +--- a/t/fallback.t ++++ b/t/fallback.t +@@ -35,7 +35,7 @@ for my $i (0x80..0xff){ + $uo .= chr($i); + $residue .= chr($i); + $af .= '?'; +- $uf .= "\x{FFFD}"; ++ $uf .= "\x{FFFD}" if $i < 0xfd; + $ap .= sprintf("\\x{%04x}", $i); + $up .= sprintf("\\x%02X", $i); + $ah .= sprintf("&#%d;", $i); +-- +2.7.4 + diff --git a/perl-Encode.spec b/perl-Encode.spec index 25c84ef..5da3294 100644 --- a/perl-Encode.spec +++ b/perl-Encode.spec @@ -20,6 +20,18 @@ Source0: http://www.cpan.org/authors/id/D/DA/DANKOGAI/Encode-%{cpan_versi # Fix Encode::encode_utf8(undef) to return undef, CPAN RT#116904, # in upstream after 2.86 Patch0: 
Encode-2.86-Fix-return-value-of-Encode-encode_utf8-undef.patch +# Tests for refusing non-shortest UTF-8 representations, +# , in upstream after 2.86 +Patch1: Encode-2.86-Encode-utf8-Add-tests-for-Malformed-and-Overlong-UTF.patch +# 1/3 Refusing non-shortest UTF-8 representations, +# , in upstream after 2.86 +Patch2: Encode-2.86-Encode-utf8-Fix-processing-invalid-UTF-8-subsequence.patch +# 2/3 Refusing non-shortest UTF-8 representations, +# , in upstream after 2.86 +Patch3: Encode-2.86-Encode-utf8-Check-for-overflowed-and-overlong-UTF-8-.patch +# 3/3 Refusing non-shortest UTF-8 representations, +# , in upstream after 2.86 +Patch4: Encode-2.86-Encode-utf8-Fix-count-of-replacement-characters-for-.patch BuildRequires: coreutils BuildRequires: findutils BuildRequires: make @@ -131,6 +143,10 @@ your own encoding to perl. No knowledge of XS is necessary. %prep %setup -q -n Encode-%{cpan_version} %patch0 -p1 +%patch1 -p1 +%patch2 -p1 +%patch3 -p1 +%patch4 -p1 %build # Additional scripts can be installed by appending MORE_SCRIPTS, UCM files by @@ -174,6 +190,7 @@ make test %changelog * Fri Sep 30 2016 Petr Pisar - 4:2.86-3 - Fix Encode::encode_utf8(undef) to return undef (CPAN RT#116904) +- Refuse non-shortest UTF-8 representations in strict mode * Fri Sep 16 2016 Petr Pisar - 4:2.86-2 - Add Artistic 2.0 into license tag because of encguess tool