2009-04-06  Stepan Kasal

	* t/util-58.t: Add tests reflecting common usage.
	* CGI/Util.pm (escape): State what conversions are needed, in
	accordance with the common usage mentioned above; and code it.

diff -ur perl-5.10.0/lib/CGI/Util.pm perl-5.10.0/lib/CGI/Util.pm
--- perl-5.10.0/lib/CGI/Util.pm	2008-09-08 15:58:52.000000000 +0200
+++ perl-5.10.0/lib/CGI/Util.pm	2009-04-04 16:30:29.000000000 +0200
@@ -210,7 +210,6 @@
   my $todecode = shift;
   return undef unless defined($todecode);
   $todecode =~ tr/+/ /;       # pluses become spaces
-  $EBCDIC = "\t" ne "\011";
   if ($EBCDIC) {
     $todecode =~ s/%([0-9a-fA-F]{2})/chr $A2E[hex($1)]/ge;
   } else {
@@ -232,16 +231,24 @@
 }
 
 # URL-encode data
+#
+# We cannot use the %u escapes; they were rejected by W3C, so the official
+# way is %XX-escaped utf-8 encoding.
+# Naturally, Unicode strings have to be converted to their utf-8 byte
+# representation.  (No action is required on 5.6.)
+# Byte strings were traditionally used directly as a sequence of octets.
+# This worked if they actually represented binary data (e.g. in CGI::Compress).
+# This also worked if these byte strings were actually utf-8 encoded; e.g.,
+# when the source file used utf-8 without the appropriate "use utf8;".
+# This fails if the byte string is actually a Latin 1 encoded string, but it
+# was always so and cannot be fixed without breaking the binary data case.
+# -- Stepan Kasal
+#
 sub escape {
   shift() if @_ > 1 and ( ref($_[0]) || (defined $_[1] && $_[0] eq $CGI::DefaultClass));
   my $toencode = shift;
   return undef unless defined($toencode);
-  $toencode = eval { pack("C*", unpack("U0C*", $toencode))} || pack("C*", unpack("C*", $toencode));
-
-  # force bytes while preserving backward compatibility -- dankogai
-  # but commented out because it was breaking CGI::Compress -- lstein
-  # $toencode = eval { pack("U*", unpack("U0C*", $toencode))} || pack("C*", unpack("C*", $toencode));
-
+  utf8::encode($toencode) if ($] > 5.007 && utf8::is_utf8($toencode));
   if ($EBCDIC) {
     $toencode=~s/([^a-zA-Z0-9_.~-])/uc sprintf("%%%02x",$E2A[ord($1)])/eg;
   } else {
diff -ur perl-5.10.0/lib/CGI/t/util-58.t perl-5.10.0/lib/CGI/t/util-58.t
--- perl-5.10.0/lib/CGI/t/util-58.t	2003-04-14 20:32:22.000000000 +0200
+++ perl-5.10.0/lib/CGI/t/util-58.t	2009-04-06 16:49:42.000000000 +0200
@@ -1,16 +1,29 @@
+# test CGI::Util::escape
+use Test::More tests => 4;
+use_ok("CGI::Util");
+
+# Byte strings should be escaped byte by byte:
+# 1) not a valid utf-8 sequence:
+my $uri = "pe\x{f8}\x{ed}\x{e8}ko.ogg";
+is(CGI::Util::escape($uri), "pe%F8%ED%E8ko.ogg", "Escape a Latin-2 string");
+
+# 2) is a valid utf-8 sequence, but not a UTF-8-flagged string.
+# This happens often: people write utf-8 strings to the source, but forget
+# to tell perl about it with "use utf8;"--this is obviously wrong, but we
+# have to handle it gracefully, for compatibility with CGI.pm under
+# perl-5.8.x
 #
-# This tests CGI::Util::escape() when fed with UTF-8-flagged string
-# -- dankogai
-BEGIN {
-    if ($] < 5.008) {
-       print "1..0 # \$] == $] < 5.008\n";
-       exit(0);
-    }
-}
+$uri = "pe\x{c5}\x{99}\x{c3}\x{ad}\x{c4}\x{8d}ko.ogg";
+is(CGI::Util::escape($uri), "pe%C5%99%C3%AD%C4%8Dko.ogg",
+   "Escape a utf-8 byte string");
 
-use Test::More tests => 2;
-use_ok("CGI::Util");
-my $uri = "\x{5c0f}\x{98fc} \x{5f3e}.txt"; # KOGAI, Dan, in Kanji
-is(CGI::Util::escape($uri), "%E5%B0%8F%E9%A3%BC%20%E5%BC%BE.txt",
-   "# Escape string with UTF-8 flag");
+SKIP:
+{
+    # This tests CGI::Util::escape() when fed with UTF-8-flagged string
+    # -- dankogai
+    skip("Unicode strings not available in $]", 1) if ($] < 5.008);
+    $uri = "\x{5c0f}\x{98fc} \x{5f3e}.txt"; # KOGAI, Dan, in Kanji
+    is(CGI::Util::escape($uri), "%E5%B0%8F%E9%A3%BC%20%E5%BC%BE.txt",
+       "Escape string with UTF-8 flag");
+}
 __END__