From eb90e29c72489febb6502f53800fb37626f36313 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Zaoral?= Date: Thu, 9 Nov 2023 13:25:20 +0100 Subject: [PATCH] fix UTF-8 quoting in xtrace Resolves: RHEL-5684 --- ksh-20120801-xtrace-utf8-quoting.patch | 141 +++++++++++++++++++++++++ ksh.spec | 10 +- 2 files changed, 150 insertions(+), 1 deletion(-) create mode 100644 ksh-20120801-xtrace-utf8-quoting.patch diff --git a/ksh-20120801-xtrace-utf8-quoting.patch b/ksh-20120801-xtrace-utf8-quoting.patch new file mode 100644 index 0000000..f2e10c6 --- /dev/null +++ b/ksh-20120801-xtrace-utf8-quoting.patch @@ -0,0 +1,141 @@ +From f9d28935bb93fe7336ba8c5eab4231050de2e11e Mon Sep 17 00:00:00 2001 +From: Martijn Dekker +Date: Fri, 10 Jul 2020 01:38:13 +0100 +Subject: [PATCH] Fix UTF-8 shellquoting for xtrace, printf %q, etc. +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This fixes an annoying issue in the shell's quoting algorithm +(used for xtrace (set -x), printf %q, and other things) for UTF-8 +locales, that caused it to encode perfectly printable UTF-8 +characters unnecessarily and inconsistently. For example: + +$ (set -x; : 'aeu aéu') ++ : $'aeu a\u[e9]u' +$ (set -x; : 'aéu aeu') ++ : 'aéu aeu' +$ (set -x; : '正常終了 aeu') ++ : '正常終了 aeu' +$ (set -x; : 'aeu 正常終了') ++ : $'aeu \u[6b63]\u[5e38]\u[7d42]\u[4e86]' + +This issue was originally reported by lijo george in May 2017: +https://www.mail-archive.com/ast-developers@lists.research.att.com/msg01958.html + +src/cmd/ksh93/sh/string.c: +- Add is_invisible() function that returns true if a character is a + Unicode invisible (non-graph) character, excluding ASCII space. + Ref.: https://unicode.org/charts/PDF/U2000.pdf +- Use a fallback in is_invisible() if we cannot use the system's + iswgraph(3); this is the case for the ksh C.UTF-8 locale if the + OS doesn't support that. 
Fall back to a hardcoded blacklist of + invisible and control characters and put up with not encoding + nonexistent characters into \u[xxxx] escapes. + Ref.: https://unicode.org/charts/PDF/U2000.pdf +- When deciding whether to switch to $'...' quoting mode (state=2), + use is_invisible() instead of testing for ASCII 0-127 range. +- In $'...' quoting mode, use is_invisible() to decide whether to + encode wide characters into \u[xxxx] escapes. + +src/cmd/ksh93/tests/builtins.sh: +- Add regression tests for shellquoting Arabic, Japanese and Latin + UTF-8 characters, to be run only in a UTF-8 locale. The Arabic + sample text[*] contains a couple of direction markers that are + expected to be encoded into \u[xxxx] escapes. + +[*] source: https://r12a.github.io/scripts/tutorial/summaries/arabic + +Upstream-commit: f9d28935bb93fe7336ba8c5eab4231050de2e11e +Cherry-picked-by: Lukáš Zaoral +--- + src/cmd/ksh93/sh/string.c | 32 ++++++++++++++++++++++++++++++-- + src/cmd/ksh93/tests/builtins.sh | 18 ++++++++++++++++++ + 2 files changed, 48 insertions(+), 2 deletions(-) + +diff --git a/src/cmd/ksh93/sh/string.c b/src/cmd/ksh93/sh/string.c +index 5eb124b75b23..fd620a09e9b0 100644 +--- a/src/cmd/ksh93/sh/string.c ++++ b/src/cmd/ksh93/sh/string.c +@@ -325,6 +325,34 @@ static char *sh_fmtcsv(const char *string) + return(stakptr(offset)); + } + ++#if SHOPT_MULTIBYTE ++/* ++ * Returns true if c is an invisible Unicode character, excluding ASCII space. ++ * Use iswgraph(3) if possible. In the ksh-specific C.UTF-8 locale, this is ++ * generally not possible as the OS-provided iswgraph(3) doesn't support that ++ * locale. So do a quick test and do our best with a fallback if necessary. ++ */ ++static int is_invisible(int c) ++{ ++ if(!mbwide()) /* not in multibyte locale? */ ++ return(c != ' ' && !isgraph(c)); /* use plain isgraph(3) */ ++ else if(iswgraph(0x5E38) && !iswgraph(0xFEFF)) /* can we use iswgraph(3)? 
*/ ++ return(c != ' ' && !iswgraph(c)); /* use iswgraph(3) */ ++ else /* fallback: */ ++ return( c <= 0x001F || /* control characters */ ++ c >= 0x007F && c <= 0x009F || /* control characters */ ++ c == 0x00A0 || /* non-breaking space */ ++ c == 0x061C || /* arabic letter mark */ ++ c == 0x1680 || /* ogham space mark */ ++ c == 0x180E || /* mongolian vowel separator */ ++ c >= 0x2000 && c <= 0x200F || /* spaces and format characters */ ++ c >= 0x2028 && c <= 0x202F || /* separators and format characters */ ++ c >= 0x205F && c <= 0x206F || /* various format characters */ ++ c == 0x3000 || /* ideographic space */ ++ c == 0xFEFF ); /* zero-width non-breaking space */ ++} ++#endif /* SHOPT_MULTIBYTE */ ++ + /* + * print quoting chars so that it can be read by the shell + * puts null terminated result on stack, but doesn't freeze it +@@ -363,7 +391,7 @@ char *sh_fmtq(const char *string) + for(;c;c= mbchar(cp)) + { + #if SHOPT_MULTIBYTE +- if(c=='\'' || c>=128 || c<0 || !iswprint(c)) ++ if(c=='\'' || is_invisible(c)) + #else + if(c=='\'' || !isprint(c)) + #endif /* SHOPT_MULTIBYTE */ +@@ -426,7 +454,7 @@ char *sh_fmtq(const char *string) + cp = op+1; + isbyte = 1; + } +- if(mbwide() && ((cp-op)>1)) ++ if(mbwide() && is_invisible(c)) + { + sfprintf(staksp,"\\u[%x]",c); + continue; +diff --git a/src/cmd/ksh93/tests/builtins.sh b/src/cmd/ksh93/tests/builtins.sh +index 66d465e0a205..34ef9c914f29 100755 +--- a/src/cmd/ksh93/tests/builtins.sh ++++ b/src/cmd/ksh93/tests/builtins.sh +@@ -318,6 +318,24 @@ + then err_exit "printf '%..*s' not working" + fi + [[ $(printf '%q\n') == '' ]] || err_exit 'printf "%q" with missing arguments' ++# shell-quoting UTF-8 characters: check for unnecessary encoding ++case ${LC_ALL:-${LC_CTYPE:-${LANG:-}}} in ++( *[Uu][Tt][Ff]8* | *[Uu][Tt][Ff]-8* ) ++ expect=$'$\'عندما يريد العالم أن \\u[202a]يتكلّم \\u[202c] ، فهو يتحدّث بلغة يونيكود.\'' ++ actual=$(printf %q 'عندما يريد العالم أن ‪يتكلّم ‬ ، فهو يتحدّث بلغة يونيكود.') ++ [[ $actual == "$expect" 
]] || err_exit 'shell-quoting: Arabic UTF-8 characters' \ ++ "(expected $expect; got $actual)" ++ expect="'正常終了 正常終了'" ++ actual=$(printf %q '正常終了 正常終了') ++ [[ $actual == "$expect" ]] || err_exit 'shell-quoting: Japanese UTF-8 characters' \ ++ "(expected $expect; got $actual)" ++ expect="'aeu aéu'" ++ actual=$(printf %q 'aeu aéu') ++ [[ $actual == "$expect" ]] || err_exit 'shell-quoting: Latin UTF-8 characters' \ ++ "(expected $expect; got $actual)" ++ ;; ++esac ++ + # we won't get hit by the one second boundary twice, right? + [[ $(printf '%T\n' now | sed 's/GMT/UTC/') == "$(date)" ]] || + [[ $(printf '%T\n' now | sed 's/GMT/UTC/') == "$(date)" ]] || diff --git a/ksh.spec b/ksh.spec index 6c27b82..138d5ab 100644 --- a/ksh.spec +++ b/ksh.spec @@ -6,7 +6,7 @@ Summary: The Original ATT Korn Shell URL: http://www.kornshell.com/ License: EPL-1.0 Version: %{releasedate} -Release: 262%{?dist} +Release: 263%{?dist} Source0: http://www.research.att.com/~gsf/download/tgz/ast-ksh.%{release_date}.tgz Source1: http://www.research.att.com/~gsf/download/tgz/INIT.%{release_date}.tgz Source2: kshcomp.conf @@ -248,6 +248,10 @@ Patch96: ksh-20120801-segfault-strdup.patch # upstream commit: https://github.com/ksh93/ksh/commit/035a4cb3f453271b7ae63bcb53a7963b8dbe4c41 Patch97: ksh-20120801-segfault-cd-paths.patch +# RHEL-5684 +# upstream commit: https://github.com/ksh93/ksh/commit/f9d28935bb93fe7336ba8c5eab4231050de2e11e +Patch98: ksh-20120801-xtrace-utf8-quoting.patch + Conflicts: pdksh Requires: coreutils, diffutils, chkconfig BuildRequires: bison @@ -401,6 +405,10 @@ fi %config(noreplace) %{_sysconfdir}/binfmt.d/kshcomp.conf %changelog +* Thu Nov 09 2023 Lukáš Zaoral - 20120801-263 +- fix UTF-8 quoting in xtrace + Resolves: RHEL-5684 + * Wed Nov 08 2023 Lukáš Zaoral - 20120801-262 - fix segfault in subshell if $PATH contains a .paths directory Resolves: RHEL-12011