2009-04-06  Stepan Kasal

	* t/util-58.t: Add tests reflecting common usage.
	* CGI/Util.pm (escape): State what conversions are needed, in
	accordance with the common usage mentioned above; and code it.

diff -ur perl-5.10.0/lib/CGI/Util.pm perl-5.10.0/lib/CGI/Util.pm
--- perl-5.10.0/lib/CGI/Util.pm	2008-09-08 15:58:52.000000000 +0200
+++ perl-5.10.0/lib/CGI/Util.pm	2009-04-04 16:30:29.000000000 +0200
@@ -210,7 +210,6 @@
   my $todecode = shift;
   return undef unless defined($todecode);
   $todecode =~ tr/+/ /;       # pluses become spaces
-  $EBCDIC = "\t" ne "\011";
   if ($EBCDIC) {
     $todecode =~ s/%([0-9a-fA-F]{2})/chr $A2E[hex($1)]/ge;
   } else {
@@ -232,16 +231,24 @@
 }
 
 # URL-encode data
+#
+# We cannot use the %u escapes; they were rejected by W3C, so the official
+# way is %XX-escaped utf-8 encoding.
+# Naturally, Unicode strings have to be converted to their utf-8 byte
+# representation.  (No action is required on 5.6.)
+# Byte strings were traditionally used directly as a sequence of octets.
+# This worked if they actually represented binary data (e.g. in CGI::Compress).
+# This also worked if these byte strings were actually utf-8 encoded; e.g.,
+# when the source file used utf-8 without the appropriate "use utf8;".
+# This fails if the byte string is actually a Latin 1 encoded string, but it
+# was always so and cannot be fixed without breaking the binary data case.
+# -- Stepan Kasal
+#
 sub escape {
   shift() if @_ > 1 and ( ref($_[0]) || (defined $_[1] && $_[0] eq $CGI::DefaultClass));
   my $toencode = shift;
   return undef unless defined($toencode);
-  $toencode = eval { pack("C*", unpack("U0C*", $toencode))} || pack("C*", unpack("C*", $toencode));
-
-  # force bytes while preserving backward compatibility -- dankogai
-  # but commented out because it was breaking CGI::Compress -- lstein
-  # $toencode = eval { pack("U*", unpack("U0C*", $toencode))} || pack("C*", unpack("C*", $toencode));
-
+  utf8::encode($toencode) if ($] > 5.007 && utf8::is_utf8($toencode));
   if ($EBCDIC) {
     $toencode=~s/([^a-zA-Z0-9_.~-])/uc sprintf("%%%02x",$E2A[ord($1)])/eg;
   } else {
diff -ur perl-5.10.0/lib/CGI/t/util-58.t perl-5.10.0/lib/CGI/t/util-58.t
--- perl-5.10.0/lib/CGI/t/util-58.t	2003-04-14 20:32:22.000000000 +0200
+++ perl-5.10.0/lib/CGI/t/util-58.t	2009-04-06 16:49:42.000000000 +0200
@@ -1,16 +1,29 @@
+# test CGI::Util::escape
+use Test::More tests => 4;
+use_ok("CGI::Util");
+
+# Byte strings should be escaped byte by byte:
+# 1) not a valid utf-8 sequence:
+my $uri = "pe\x{f8}\x{ed}\x{e8}ko.ogg";
+is(CGI::Util::escape($uri), "pe%F8%ED%E8ko.ogg", "Escape a Latin-2 string");
+
+# 2) is a valid utf-8 sequence, but not a UTF-8-flagged string.
+# This happens often: people write utf-8 strings to the source, but forget
+# to tell perl about it with "use utf8;"--this is obviously wrong, but we
+# have to handle it gracefully, for compatibility with CGI.pm under
+# perl-5.8.x
 #
-# This tests CGI::Util::escape() when fed with UTF-8-flagged string
-# -- dankogai
-BEGIN {
-    if ($] < 5.008) {
-       print "1..0 # \$] == $] < 5.008\n";
-       exit(0);
-    }
-}
+$uri = "pe\x{c5}\x{99}\x{c3}\x{ad}\x{c4}\x{8d}ko.ogg";
+is(CGI::Util::escape($uri), "pe%C5%99%C3%AD%C4%8Dko.ogg",
+   "Escape a utf-8 byte string");
 
-use Test::More tests => 2;
-use_ok("CGI::Util");
-my $uri = "\x{5c0f}\x{98fc} \x{5f3e}.txt"; # KOGAI, Dan, in Kanji
-is(CGI::Util::escape($uri), "%E5%B0%8F%E9%A3%BC%20%E5%BC%BE.txt",
-   "# Escape string with UTF-8 flag");
+SKIP:
+{
+    # This tests CGI::Util::escape() when fed with UTF-8-flagged string
+    # -- dankogai
+    skip("Unicode strings not available in $]", 1) if ($] < 5.008);
+    $uri = "\x{5c0f}\x{98fc} \x{5f3e}.txt"; # KOGAI, Dan, in Kanji
+    is(CGI::Util::escape($uri), "%E5%B0%8F%E9%A3%BC%20%E5%BC%BE.txt",
+       "Escape string with UTF-8 flag");
+}
 __END__