From ac8991e5257978a6359c001b7fa227c334fd9e18 Mon Sep 17 00:00:00 2001 From: Martijn Dekker Date: Wed, 5 Aug 2020 18:22:22 +0100 Subject: [PATCH] Fix shellquoting of invalid multibyte char (re: f9d28935, 8c7c60ec) This commit fixes two bugs in the generation of $'...' shellquoted strings: 1. A bug introduced in f9d28935. In UTF-8 locales, a byte that is invalid in UTF-8, e.g. hex byte 86, would be shellquoted as \u[86], which is not the same as the correct quoting, \x86. 2. A bug inherited from 93u+. Single bytes (e.g. hex 11) were always quoted as \x11 and not \x[11], even if a subsequent character was a hexadecimal digit. However, the parser reads past two hexadecimal digits, so we got: $ printf '%q\n' $'\x[11]1' $'\x111' $ printf $'\x111' | od -t x1 0000000 c4 91 0000002 After the bug fix, this works correctly: $ printf '%q\n' $'\x[11]1' $'\x[11]1' $ printf $'\x[11]1' | od -t x1 0000000 11 31 0000002 src/cmd/ksh93/sh/string.c: sh_fmtq(): - Make the multibyte code for $'...' more readable, eliminating the 'isbyte' flag. - When in a multibyte locale, make sure to shellquote both invalid multibyte characters and unprintable ASCII characters as hexadecimal bytes (\xNN). This reinstates 93u+ behaviour. - When quoting bytes, use isxdigit(3) to determine if the next character is a hex digit, and if so, protect the quoted byte with square brackets. src/cmd/ksh93/tests/quoting2.sh: - Move the 'printf %q' shellquoting regression tests here from builtins.sh; they test the shellquoting algorithm, not so much the printf builtin itself. - Add regression tests for these bugs. --- NEWS | 6 +++++ src/cmd/ksh93/sh/string.c | 38 +++++++++++++++++----------- src/cmd/ksh93/tests/builtins.sh | 33 ------------------------- src/cmd/ksh93/tests/quoting2.sh | 44 +++++++++++++++++++++++++++++++++ 4 files changed, 73 insertions(+), 48 deletions(-) diff --git a/NEWS b/NEWS index f752478f0..0f2183fdf 100644 --- a/NEWS +++ b/NEWS @@ -11,6 +11,12 @@ Any uppercase BUG_* names are modernish shell bug IDs. - Fixed a bug that caused scripts to continue running after over-shifting in a function when the function call had a redirection. +- When generating shellquoted strings (such as with 'printf %q'), the + hexadecimal value of a quoted unprintable character was not protected with + square braces, e.g. 0x12 followed by '3' would be quoted as '\x123', which + is a different value. Such strings are now quoted like '\x[12]3' if the + next character is a hexadecimal digit. + 2020-07-31: - Fixed a bug that caused multidimensional associative arrays to be created diff --git a/src/cmd/ksh93/sh/string.c b/src/cmd/ksh93/sh/string.c index e7c102f4a..6c3720649 100644 --- a/src/cmd/ksh93/sh/string.c +++ b/src/cmd/ksh93/sh/string.c @@ -40,6 +40,10 @@ # define iswprint(c) (((c)&~0377) || isprint(c)) #endif +#ifndef isxdigit +# define isxdigit(c) ((c)>='0'&&(c)<='9'||(c)>='a'&&(c)<='f'||(c)>='A'&&(c)<='F') +#endif + /* * Table lookup routine @@ -410,7 +414,6 @@ char *sh_fmtq(const char *string) } else { - int isbyte=0; stakwrite("$'",2); cp = string; #if SHOPT_MULTIBYTE @@ -447,24 +450,29 @@ char *sh_fmtq(const char *string) break; default: #if SHOPT_MULTIBYTE - isbyte = 0; - if(c<0) + if(mbwide()) { - c = *((unsigned char *)op); - cp = op+1; - isbyte = 1; + /* We're in a multibyte locale */ + if(c<0 || c<128 && !isprint(c)) + { + /* Invalid multibyte char, or unprintable ASCII char: quote as hex byte */ + c = *((unsigned char *)op); + cp = op+1; + goto quote_one_byte; + } + if(is_invisible(c)) + { + /* Unicode hex code */ + sfprintf(staksp,"\\u[%x]",c); + continue; + } } - if(mbwide() && is_invisible(c)) - { - sfprintf(staksp,"\\u[%x]",c); - continue; - } - else if(isbyte) -#else + else +#endif /* SHOPT_MULTIBYTE */ if(!isprint(c)) -#endif { - sfprintf(staksp,"\\x%.2x",c); + quote_one_byte: + sfprintf(staksp, isxdigit(*cp) ? "\\x[%.2x]" : "\\x%.2x", c); continue; } state=0; diff --git a/src/cmd/ksh93/tests/builtins.sh b/src/cmd/ksh93/tests/builtins.sh index bc865c8e6..1952c867a 100755 --- a/src/cmd/ksh93/tests/builtins.sh +++ b/src/cmd/ksh93/tests/builtins.sh @@ -303,39 +303,6 @@ if [[ $(printf '%..*s\n' : abc def) != abc:def ]] then err_exit "printf '%..*s' not working" fi -# ====== -# shell-quoting using printf %q (same algorithm used for xtrace and output of 'set', 'trap', ...) - -[[ $(printf '%q\n') == '' ]] || err_exit 'printf "%q" with missing arguments' - -# the following fails on 2012-08-01 in UTF-8 locales -expect="'shell-quoted string'" -actual=$( - print -nr $'\303\274' | read -n1 foo # interrupt processing of 2-byte UTF-8 char after reading 1 byte - printf '%q\n' "shell-quoted string" -) -LC_CTYPE=POSIX true # on buggy ksh, a locale re-init via temp assignment restores correct shellquoting -[[ $actual == "$expect" ]] || err_exit 'shell-quoting corrupted after interrupted processing of UTF-8 char' \ - "(expected $expect; got $actual)" - -# shell-quoting UTF-8 characters: check for unnecessary encoding -case ${LC_ALL:-${LC_CTYPE:-${LANG:-}}} in -( *[Uu][Tt][Ff]8* | *[Uu][Tt][Ff]-8* ) - expect=$'$\'عندما يريد العالم أن \\u[202a]يتكلّم \\u[202c] ، فهو يتحدّث بلغة يونيكود.\'' - actual=$(printf %q 'عندما يريد العالم أن ‪يتكلّم ‬ ، فهو يتحدّث بلغة يونيكود.') - [[ $actual == "$expect" ]] || err_exit 'shell-quoting: Arabic UTF-8 characters' \ - "(expected $expect; got $actual)" - expect="'正常終了 正常終了'" - actual=$(printf %q '正常終了 正常終了') - [[ $actual == "$expect" ]] || err_exit 'shell-quoting: Japanese UTF-8 characters' \ - "(expected $expect; got $actual)" - expect="'aeu aéu'" - actual=$(printf %q 'aeu aéu') - [[ $actual == "$expect" ]] || err_exit 'shell-quoting: Latin UTF-8 characters' \ - "(expected $expect; got $actual)" - ;; -esac - # ====== # we won't get hit by the one second boundary twice, right? expect= actual= diff --git a/src/cmd/ksh93/tests/quoting2.sh b/src/cmd/ksh93/tests/quoting2.sh index 0ffcb4fae..a2b808af8 100755 --- a/src/cmd/ksh93/tests/quoting2.sh +++ b/src/cmd/ksh93/tests/quoting2.sh @@ -215,4 +215,48 @@ exp='ac' got=$'a\0b'c [[ $got == "$exp" ]] || err_exit "\$'a\\0b'c expansion failed -- expected '$exp', got '$got'" +# ====== +# generating shell-quoted strings using printf %q (same algorithm used for xtrace and output of 'set', 'trap', ...) + +[[ $(printf '%q\n') == '' ]] || err_exit 'printf "%q" with missing arguments yields non-empty result' + +# the following fails on 2012-08-01 in UTF-8 locales +expect="'shell-quoted string'" +actual=$( + print -nr $'\303\274' | read -n1 foo # interrupt processing of 2-byte UTF-8 char after reading 1 byte + printf '%q\n' "shell-quoted string" +) +LC_CTYPE=POSIX true # on buggy ksh, a locale re-init via temp assignment restores correct shellquoting +[[ $actual == "$expect" ]] || err_exit 'shell-quoting corrupted after interrupted processing of UTF-8 char' \ + "(expected $expect; got $actual)" + +# shell-quoting UTF-8 characters: check for unnecessary encoding +case ${LC_ALL:-${LC_CTYPE:-${LANG:-}}} in +( *[Uu][Tt][Ff]8* | *[Uu][Tt][Ff]-8* ) + expect=$'$\'عندما يريد العالم أن \\u[202a]يتكلّم \\u[202c] ، فهو يتحدّث بلغة يونيكود.\'' + actual=$(printf %q 'عندما يريد العالم أن ‪يتكلّم ‬ ، فهو يتحدّث بلغة يونيكود.') + [[ $actual == "$expect" ]] || err_exit 'shell-quoting: Arabic UTF-8 characters' \ + "(expected $expect; got $actual)" + expect="'正常終了 正常終了'" + actual=$(printf %q '正常終了 正常終了') + [[ $actual == "$expect" ]] || err_exit 'shell-quoting: Japanese UTF-8 characters' \ + "(expected $expect; got $actual)" + expect="'aeu aéu'" + actual=$(printf %q 'aeu aéu') + [[ $actual == "$expect" ]] || err_exit 'shell-quoting: Latin UTF-8 characters' \ + "(expected $expect; got $actual)" + expect=$'$\'\\x86\\u[86]\\xf0\\x96v\\xa7\\xb5\'' + actual=$(printf %q $'\x86\u86\xF0\x96\x76\xA7\xB5') + [[ $actual == "$expect" ]] || err_exit 'shell-quoting: invalid UTF-8 characters not encoded with \xNN' \ + "(expected $expect; got $actual)" + ;; +esac + +# check that hex bytes are protected with square braces if needed +expect=$'$\'1\\x[11]1\'' +actual=$(printf %q $'1\x[11]1') +[[ $actual == "$expect" ]] || err_exit 'shell-quoting: hex bytes not protected from subsequent hex-like chars' \ + "(expected $expect; got $actual)" + +# ====== exit $((Errors<125?Errors:125))