From 8b5f11dcd7cfc05ee88d5be89ea1cd973e095b74 Mon Sep 17 00:00:00 2001
From: Johnothan King <johnothanking@protonmail.com>
Date: Sat, 25 Jul 2020 11:46:11 -0700
Subject: [PATCH] Add support for multibyte characters to $IFS (#92)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add support for multibyte characters to $IFS

This commit fixes BUG_MULTIBIFS, which had two bug reports in the ksh2020 branch.

src/cmd/ksh93/sh/macro.c:
- Backport Eric Scrivner's fix for multibyte IFS characters (slightly modified
  for compatibility with C89). Explanation from https://github.com/att/ast/pull/737:

  Previously, the varsub method used for the macro expansion of $param, ${param},
  and ${param op word} would incorrectly expand the internal field separator (IFS)
  if it was a multibyte character. This was due to truncation based on the
  incorrect assumption that the IFS would never be larger than a single byte.

  This change fixes this issue by carefully tracking the number of bytes that
  should be persisted in the IFS case and ensuring that all bytes are written
  during expansion and substitution.

  Bug report: https://github.com/att/ast/issues/13

- Fix another bug that caused multibyte characters with the same initial byte
  to be treated as the same character during IFS field splitting. This bug
  occurred because the first byte of a multibyte character wasn't written to
  the stack when it matched the initial byte of the IFS delimiter:

  $ IFS=£
  $ v='§'
  $ set -- $v
  $ v="${1-}"
  $ echo "$v" | hd # The first byte should be c2, but it isn't due to the bug
  00000000  a7 0a                                             |..|
  00000002

  Bug report: https://github.com/att/ast/issues/1372

src/cmd/ksh93/tests/variables.sh:
- Add (reworked) regression tests from ksh2020 for the multibyte IFS bugs.
- Add a regression test for att/ast#1372 based on the reproducer.
---
 NEWS                             |  8 ++++++++
 TODO                             |  6 ------
 src/cmd/ksh93/include/version.h  |  2 +-
 src/cmd/ksh93/sh/macro.c         | 33 ++++++++++++++++++++++++++++++++
 src/cmd/ksh93/tests/variables.sh | 33 ++++++++++++++++++++++++++++++++
 5 files changed, 75 insertions(+), 7 deletions(-)

diff --git a/NEWS b/NEWS
index e4d97c321..3ea2ded9d 100644
--- a/NEWS
+++ b/NEWS
@@ -3,6 +3,14 @@ For full details, see the git log at: https://github.com/ksh93/ksh
 
 Any uppercase BUG_* names are modernish shell bug IDs.
 
+2020-07-25:
+
+- Fixed BUG_MULTIBIFS: Multibyte characters can now be used as IFS
+  delimiters. "$*" was incorrectly joining positional parameters on
+  the first byte of a multibyte character. This was due to truncation
+  based on the incorrect assumption that the IFS would never be larger
+  than a single byte.
+
 2020-07-23:
 
 - Fixed an infinite loop that could occur when ksh is the system's /bin/sh.
diff --git a/TODO b/TODO
index f1e7aa0e7..36026fac2 100644
--- a/TODO
+++ b/TODO
@@ -54,9 +54,3 @@ https://github.com/modernish/modernish/tree/0.16/lib/modernish/cap/
   between 'while'/'until' and 'do'), the exit status passed down from the
   previous command is ignored and the function returns with status 0
   instead.
-
-- BUG_MULTIBIFS: We're on a UTF-8 locale and the shell supports UTF-8
-  characters in general (i.e. we don't have WRN_MULTIBYTE) – however, using
-  multi-byte characters as IFS field delimiters still doesn't work. For
-  example, "$*" joins positional parameters on the first byte of IFS instead
-  of the first character.
diff --git a/src/cmd/ksh93/include/version.h b/src/cmd/ksh93/include/version.h
index 2f87d7609..066f85373 100644
--- a/src/cmd/ksh93/include/version.h
+++ b/src/cmd/ksh93/include/version.h
@@ -17,4 +17,4 @@
 *                  David Korn <dgk@research.att.com>                   *
 *                                                                      *
 ***********************************************************************/
-#define SH_RELEASE	"93u+m 2020-07-23"
+#define SH_RELEASE	"93u+m 2020-07-25"
diff --git a/src/cmd/ksh93/sh/macro.c b/src/cmd/ksh93/sh/macro.c
index 530da993c..1fb67ffb3 100644
--- a/src/cmd/ksh93/sh/macro.c
+++ b/src/cmd/ksh93/sh/macro.c
@@ -1954,10 +1954,34 @@ retry2:
 			}
 			else if(d)
 			{
+#if SHOPT_MULTIBYTE
+				Sfio_t *sfio_ptr = (mp->sp) ? mp->sp : stkp;
+
+				/*
+				 * We know from above that if we are not performing @-expansion
+				 * then we assigned `d` the value of `mp->ifs`, here we check
+				 * whether or not we have a valid string of IFS characters to
+				 * write as it is possible for `d` to be set to `mp->ifs` and
+				 * yet `mp->ifsp` to be NULL.
+				 */
+				if(mode != '@' && mp->ifsp)
+				{
+					/*
+					 * Handle multi-byte characters being used for the internal
+					 * field separator (IFS).
+					 */
+					int i;
+					for(i = 0; i < mbsize(mp->ifsp); i++)
+						sfputc(sfio_ptr,mp->ifsp[i]);
+				}
+				else
+					sfputc(sfio_ptr,d);
+#else
 				if(mp->sp)
 					sfputc(mp->sp,d);
 				else
 					sfputc(stkp,d);
+#endif
 			}
 		}
 		if(arrmax)
@@ -2403,7 +2427,16 @@ static void mac_copy(register Mac_t *mp,register const char *str, register int s
 				if(n==S_MBYTE)
 				{
 					if(sh_strchr(mp->ifsp,cp-1)<0)
+					{
+						/*
+						 * The multi-byte character that was found has the same initial
+						 * byte as the IFS delimiter, but it's a different character. Put
+						 * the first byte onto the stack and continue; multi-byte characters
+						 * otherwise lose their initial byte.
+						 */
+						sfputc(stkp,c);
 						continue;
+					}
 					n = mbsize(cp-1) - 1;
 					if(n==-2)
 						n = 0;
diff --git a/src/cmd/ksh93/tests/variables.sh b/src/cmd/ksh93/tests/variables.sh
index c6e8c4203..6afe9022b 100755
--- a/src/cmd/ksh93/tests/variables.sh
+++ b/src/cmd/ksh93/tests/variables.sh
@@ -435,6 +435,39 @@ case $(unset IFS; set -- $v; print $#) in
 *)	err_exit 'BUG_KUNSETIFS detection failed'
 esac
 
+# Multi-byte characters should work with $IFS
+(
+	LC_ALL=C.UTF-8  # The multi-byte tests are pointless without UTF-8
+
+	# Test the following characters:
+	# Lowercase accented e  (two bytes)
+	# Roman sestertius sign (four bytes)
+	for delim in é 𐆘; do
+		IFS="$delim"
+		set : :
+		[ "$*" == ":$delim:" ] || err_exit "IFS failed with multi-byte character $delim (expected :$delim:, got $*)"
+
+		read -r first second third <<< "one${delim}two${delim}three"
+		[[ $first == one ]] || err_exit "IFS failed with multi-byte character $delim (expected one, got $first)"
+		[[ $second == two ]] || err_exit "IFS failed with multi-byte character $delim (expected two, got $second)"
+		[[ $third == three ]] || err_exit "IFS failed with multi-byte character $delim (expected three, got $third)"
+
+		# Ensure subshells don't get corrupted when IFS becomes a multi-byte character
+		expected_output="$(printf ":$delim:\\ntrap -- 'echo end' EXIT\\nend")"
+		output="$(LANG=C.UTF-8; IFS=$delim; set : :; echo "$*"; trap "echo end" EXIT; trap)"
+		[[ $output == "$expected_output" ]] || err_exit "IFS in subshell failed with multi-byte character $delim (expected $expected_output, got $output)"
+	done
+
+	# Multibyte characters with the same initial byte shouldn't be parsed as the same
+	# character if they are different. The regression test below tests two characters
+	# with the same initial byte (0xC2).
+	IFS='£'  # £ = C2 A3
+	v='abc§def ghi§jkl'  # § = C2 A7 (same initial byte)
+	set -- $v
+	v="${#},${1-},${2-},${3-}"
+	[[ $v == '1,abc§def ghi§jkl,,' ]] || err_exit "IFS treats £ (C2 A3) and § (C2 A7) as the same character"
+)
+
 # ^^^ end: IFS tests ^^^
 # restore default split:
 unset IFS