From ac8991e5257978a6359c001b7fa227c334fd9e18 Mon Sep 17 00:00:00 2001
From: Martijn Dekker <martijn@inlv.org>
Date: Wed, 5 Aug 2020 18:22:22 +0100
Subject: [PATCH] Fix shellquoting of invalid multibyte char (re: f9d28935,
 8c7c60ec)

This commit fixes two bugs in the generation of $'...' shellquoted
strings:
1. A bug introduced in f9d28935. In UTF-8 locales, a byte that is
   invalid in UTF-8, e.g. hex byte 86, would be shellquoted as
   \u[86], which is not the same as the correct quoting, \x86.
2. A bug inherited from 93u+. Single bytes (e.g. hex 11) were
   always quoted as \x11 and not \x[11], even if a subsequent
   character was a hexadecimal digit. However, the parser reads
   past two hexadecimal digits, so we got:
	$ printf '%q\n' $'\x[11]1'
	$'\x111'
	$ printf $'\x111' | od -t x1
	0000000    c4  91
	0000002
   After the bug fix, this works correctly:
	$ printf '%q\n' $'\x[11]1'
	$'\x[11]1'
	$ printf $'\x[11]1' | od -t x1
	0000000    11  31
	0000002

src/cmd/ksh93/sh/string.c: sh_fmtq():
- Make the multibyte code for $'...' more readable, eliminating the
  'isbyte' flag.
- When in a multibyte locale, make sure to shellquote both invalid
  multibyte characters and unprintable ASCII characters as
  hexadecimal bytes (\xNN). This reinstates 93u+ behaviour.
- When quoting bytes, use isxdigit(3) to determine if the next
  character is a hex digit, and if so, protect the quoted byte with
  square brackets.

src/cmd/ksh93/tests/quoting2.sh:
- Move the 'printf %q' shellquoting regression tests here from
  builtins.sh; they test the shellquoting algorithm, not so much
  the printf builtin itself.
- Add regression tests for these bugs.
---
 NEWS                            |  6 +++++
 src/cmd/ksh93/sh/string.c       | 38 +++++++++++++++++-----------
 src/cmd/ksh93/tests/builtins.sh | 33 -------------------------
 src/cmd/ksh93/tests/quoting2.sh | 44 +++++++++++++++++++++++++++++++++
 4 files changed, 73 insertions(+), 48 deletions(-)

diff --git a/NEWS b/NEWS
index f752478f0..0f2183fdf 100644
--- a/NEWS
+++ b/NEWS
@@ -11,6 +11,12 @@ Any uppercase BUG_* names are modernish shell bug IDs.
 - Fixed a bug that caused scripts to continue running after over-shifting
   in a function when the function call had a redirection.
 
+- When generating shellquoted strings (such as with 'printf %q'), the
+  hexadecimal value of a quoted unprintable character was not protected with
+  square braces, e.g. 0x12 followed by '3' would be quoted as '\x123', which
+  is a different value. Such strings are now quoted like '\x[12]3' if the
+  next character is a hexadecimal digit.
+
 2020-07-31:
 
 - Fixed a bug that caused multidimensional associative arrays to be created
diff --git a/src/cmd/ksh93/sh/string.c b/src/cmd/ksh93/sh/string.c
index e7c102f4a..6c3720649 100644
--- a/src/cmd/ksh93/sh/string.c
+++ b/src/cmd/ksh93/sh/string.c
@@ -40,6 +40,10 @@
 #   define iswprint(c)		(((c)&~0377) || isprint(c))
 #endif
 
+#ifndef isxdigit
+#   define isxdigit(c)		((c)>='0'&&(c)<='9'||(c)>='a'&&(c)<='f'||(c)>='A'&&(c)<='F')
+#endif
+
 
 /*
  *  Table lookup routine
@@ -410,7 +414,6 @@ char	*sh_fmtq(const char *string)
 	}
 	else
 	{
-		int isbyte=0;
 		stakwrite("$'",2);
 		cp = string;
 #if SHOPT_MULTIBYTE
@@ -447,24 +450,29 @@ char	*sh_fmtq(const char *string)
 				break;
 			    default:
 #if SHOPT_MULTIBYTE
-				isbyte = 0;
-				if(c<0)
+				if(mbwide())
 				{
-					c = *((unsigned char *)op);
-					cp = op+1;
-					isbyte = 1;
+					/* We're in a multibyte locale */
+					if(c<0 || c<128 && !isprint(c))
+					{
+						/* Invalid multibyte char, or unprintable ASCII char: quote as hex byte */
+						c = *((unsigned char *)op);
+						cp = op+1;
+						goto quote_one_byte;
+					}
+					if(is_invisible(c))
+					{
+						/* Unicode hex code */
+						sfprintf(staksp,"\\u[%x]",c);
+						continue;
+					}
 				}
-				if(mbwide() && is_invisible(c))
-				{
-					sfprintf(staksp,"\\u[%x]",c);
-					continue;
-				}
-				else if(isbyte)
-#else
+				else
+#endif /* SHOPT_MULTIBYTE */
 				if(!isprint(c))
-#endif
 				{
-					sfprintf(staksp,"\\x%.2x",c);
+				quote_one_byte:
+					sfprintf(staksp, isxdigit(*cp) ? "\\x[%.2x]" : "\\x%.2x", c);
 					continue;
 				}
 				state=0;
diff --git a/src/cmd/ksh93/tests/builtins.sh b/src/cmd/ksh93/tests/builtins.sh
index bc865c8e6..1952c867a 100755
--- a/src/cmd/ksh93/tests/builtins.sh
+++ b/src/cmd/ksh93/tests/builtins.sh
@@ -303,39 +303,6 @@ if	[[ $(printf '%..*s\n' : abc def) != abc:def ]]
 then	err_exit "printf '%..*s' not working"
 fi
 
-# ======
-# shell-quoting using printf %q (same algorithm used for xtrace and output of 'set', 'trap', ...)
-
-[[ $(printf '%q\n') == '' ]] || err_exit 'printf "%q" with missing arguments'
-
-# the following fails on 2012-08-01 in UTF-8 locales
-expect="'shell-quoted string'"
-actual=$(
-	print -nr $'\303\274' | read -n1 foo  # interrupt processing of 2-byte UTF-8 char after reading 1 byte
-	printf '%q\n' "shell-quoted string"
-)
-LC_CTYPE=POSIX true	    # on buggy ksh, a locale re-init via temp assignment restores correct shellquoting
-[[ $actual == "$expect" ]] || err_exit 'shell-quoting corrupted after interrupted processing of UTF-8 char' \
-				"(expected $expect; got $actual)"
-
-# shell-quoting UTF-8 characters: check for unnecessary encoding
-case ${LC_ALL:-${LC_CTYPE:-${LANG:-}}} in
-( *[Uu][Tt][Ff]8* | *[Uu][Tt][Ff]-8* )
-	expect=$'$\'عندما يريد العالم أن \\u[202a]يتكلّم \\u[202c] ، فهو يتحدّث بلغة يونيكود.\''
-	actual=$(printf %q 'عندما يريد العالم أن ‪يتكلّم ‬ ، فهو يتحدّث بلغة يونيكود.')
-	[[ $actual == "$expect" ]] || err_exit 'shell-quoting: Arabic UTF-8 characters' \
-				"(expected $expect; got $actual)"
-	expect="'正常終了 正常終了'"
-	actual=$(printf %q '正常終了 正常終了')
-	[[ $actual == "$expect" ]] || err_exit 'shell-quoting: Japanese UTF-8 characters' \
-				"(expected $expect; got $actual)"
-	expect="'aeu aéu'"
-	actual=$(printf %q 'aeu aéu')
-	[[ $actual == "$expect" ]] || err_exit 'shell-quoting: Latin UTF-8 characters' \
-				"(expected $expect; got $actual)"
-	;;
-esac
-
 # ======
 # we won't get hit by the one second boundary twice, right?
 expect= actual=
diff --git a/src/cmd/ksh93/tests/quoting2.sh b/src/cmd/ksh93/tests/quoting2.sh
index 0ffcb4fae..a2b808af8 100755
--- a/src/cmd/ksh93/tests/quoting2.sh
+++ b/src/cmd/ksh93/tests/quoting2.sh
@@ -215,4 +215,48 @@ exp='ac'
 got=$'a\0b'c
 [[ $got == "$exp" ]] || err_exit "\$'a\\0b'c expansion failed -- expected '$exp', got '$got'"
 
+# ======
+# generating shell-quoted strings using printf %q (same algorithm used for xtrace and output of 'set', 'trap', ...)
+
+[[ $(printf '%q\n') == '' ]] || err_exit 'printf "%q" with missing arguments yields non-empty result'
+
+# the following fails on 2012-08-01 in UTF-8 locales
+expect="'shell-quoted string'"
+actual=$(
+	print -nr $'\303\274' | read -n1 foo  # interrupt processing of 2-byte UTF-8 char after reading 1 byte
+	printf '%q\n' "shell-quoted string"
+)
+LC_CTYPE=POSIX true	    # on buggy ksh, a locale re-init via temp assignment restores correct shellquoting
+[[ $actual == "$expect" ]] || err_exit 'shell-quoting corrupted after interrupted processing of UTF-8 char' \
+				"(expected $expect; got $actual)"
+
+# shell-quoting UTF-8 characters: check for unnecessary encoding
+case ${LC_ALL:-${LC_CTYPE:-${LANG:-}}} in
+( *[Uu][Tt][Ff]8* | *[Uu][Tt][Ff]-8* )
+	expect=$'$\'عندما يريد العالم أن \\u[202a]يتكلّم \\u[202c] ، فهو يتحدّث بلغة يونيكود.\''
+	actual=$(printf %q 'عندما يريد العالم أن ‪يتكلّم ‬ ، فهو يتحدّث بلغة يونيكود.')
+	[[ $actual == "$expect" ]] || err_exit 'shell-quoting: Arabic UTF-8 characters' \
+				"(expected $expect; got $actual)"
+	expect="'正常終了 正常終了'"
+	actual=$(printf %q '正常終了 正常終了')
+	[[ $actual == "$expect" ]] || err_exit 'shell-quoting: Japanese UTF-8 characters' \
+				"(expected $expect; got $actual)"
+	expect="'aeu aéu'"
+	actual=$(printf %q 'aeu aéu')
+	[[ $actual == "$expect" ]] || err_exit 'shell-quoting: Latin UTF-8 characters' \
+				"(expected $expect; got $actual)"
+	expect=$'$\'\\x86\\u[86]\\xf0\\x96v\\xa7\\xb5\''
+	actual=$(printf %q $'\x86\u86\xF0\x96\x76\xA7\xB5')
+	[[ $actual == "$expect" ]] || err_exit 'shell-quoting: invalid UTF-8 characters not encoded with \xNN' \
+				"(expected $expect; got $actual)"
+	;;
+esac
+
+# check that hex bytes are protected with square braces if needed
+expect=$'$\'1\\x[11]1\''
+actual=$(printf %q $'1\x[11]1')
+[[ $actual == "$expect" ]] || err_exit 'shell-quoting: hex bytes not protected from subsequent hex-like chars' \
+				"(expected $expect; got $actual)"
+
+# ======
 exit $((Errors<125?Errors:125))