NOTE(review): this patch was re-flowed from a whitespace-collapsed copy, so
the indentation of context and added lines is reconstructed; verify it
against the target tree before applying (or apply with --ignore-whitespace).
The variables.sh hunk also corrects the `$three` typo in an error message to
`$third`, so the blob id on that file's "index" line no longer matches.

diff --git a/NEWS b/NEWS
index e4d97c321..3ea2ded9d 100644
--- a/NEWS
+++ b/NEWS
@@ -3,6 +3,14 @@ For full details, see the git log at: https://github.com/ksh93/ksh
 
 Any uppercase BUG_* names are modernish shell bug IDs.
 
+2020-07-25:
+
+- Fixed BUG_MULTIBIFS: Multibyte characters can now be used as IFS
+  delimiters. "$*" was incorrectly joining positional parameters on
+  the first byte of a multibyte character. This was due to truncation
+  based on the incorrect assumption the IFS would never be larger
+  than a single byte.
+
 2020-07-23:
 
 - Fixed an infinite loop that could occur when ksh is the system's /bin/sh.
diff --git a/TODO b/TODO
index f1e7aa0e7..36026fac2 100644
--- a/TODO
+++ b/TODO
@@ -54,9 +54,3 @@ https://github.com/modernish/modernish/tree/0.16/lib/modernish/cap/
   between 'while'/'until' and 'do'), the exit status passed down from the
   previous command is ignored and the function returns with status 0
   instead.
-
-- BUG_MULTIBIFS: We're on a UTF-8 locale and the shell supports UTF-8
-  characters in general (i.e. we don't have WRN_MULTIBYTE) – however, using
-  multi-byte characters as IFS field delimiters still doesn't work. For
-  example, "$*" joins positional parameters on the first byte of IFS instead
-  of the first character.
diff --git a/src/cmd/ksh93/include/version.h b/src/cmd/ksh93/include/version.h
index 2f87d7609..066f85373 100644
--- a/src/cmd/ksh93/include/version.h
+++ b/src/cmd/ksh93/include/version.h
@@ -17,4 +17,4 @@
 *                  David Korn                                          *
 *                                                                      *
 ***********************************************************************/
-#define SH_RELEASE "93u+m 2020-07-23"
+#define SH_RELEASE "93u+m 2020-07-25"
diff --git a/src/cmd/ksh93/sh/macro.c b/src/cmd/ksh93/sh/macro.c
index 530da993c..1fb67ffb3 100644
--- a/src/cmd/ksh93/sh/macro.c
+++ b/src/cmd/ksh93/sh/macro.c
@@ -1954,10 +1954,34 @@ retry2:
 			}
 			else if(d)
 			{
+#if SHOPT_MULTIBYTE
+				Sfio_t *sfio_ptr = (mp->sp) ? mp->sp : stkp;
+
+				/*
+				 * We know from above that if we are not performing @-expansion
+				 * then we assigned `d` the value of `mp->ifs`, here we check
+				 * whether or not we have a valid string of IFS characters to
+				 * write as it is possible for `d` to be set to `mp->ifs` and
+				 * yet `mp->ifsp` to be NULL.
+				 */
+				if(mode != '@' && mp->ifsp)
+				{
+					/*
+					 * Handle multi-byte characters being used for the internal
+					 * field separator (IFS).
+					 */
+					int i;
+					for(i = 0; i < mbsize(mp->ifsp); i++)
+						sfputc(sfio_ptr,mp->ifsp[i]);
+				}
+				else
+					sfputc(sfio_ptr,d);
+#else
 				if(mp->sp)
 					sfputc(mp->sp,d);
 				else
 					sfputc(stkp,d);
+#endif
 			}
 		}
 		if(arrmax)
@@ -2403,7 +2427,16 @@ static void mac_copy(register Mac_t *mp,register const char *str, register int s
 			if(n==S_MBYTE)
 			{
 				if(sh_strchr(mp->ifsp,cp-1)<0)
+				{
+					/*
+					 * The multi-byte character that was found has the same initial
+					 * byte as the IFS delimiter, but it's a different character. Put
+					 * the first byte onto the stack and continue; multi-byte characters
+					 * otherwise lose their initial byte.
+					 */
+					sfputc(stkp,c);
 					continue;
+				}
 				n = mbsize(cp-1) - 1;
 				if(n==-2)
 					n = 0;
diff --git a/src/cmd/ksh93/tests/variables.sh b/src/cmd/ksh93/tests/variables.sh
index c6e8c4203..6afe9022b 100755
--- a/src/cmd/ksh93/tests/variables.sh
+++ b/src/cmd/ksh93/tests/variables.sh
@@ -435,6 +435,39 @@ case $(unset IFS; set -- $v; print $#) in
 *) err_exit 'BUG_KUNSETIFS detection failed'
 esac
 
+# Multi-byte characters should work with $IFS
+(
+	LC_ALL=C.UTF-8 # The multi-byte tests are pointless without UTF-8
+
+	# Test the following characters:
+	# Lowercase accented e (two bytes)
+	# Roman sestertius sign (four bytes)
+	for delim in é 𐆘; do
+		IFS="$delim"
+		set : :
+		[ "$*" == ":$delim:" ] || err_exit "IFS failed with multi-byte character $delim (expected :$delim:, got $*)"
+
+		read -r first second third <<< "one${delim}two${delim}three"
+		[[ $first == one ]] || err_exit "IFS failed with multi-byte character $delim (expected one, got $first)"
+		[[ $second == two ]] || err_exit "IFS failed with multi-byte character $delim (expected two, got $second)"
+		[[ $third == three ]] || err_exit "IFS failed with multi-byte character $delim (expected three, got $third)"
+
+		# Ensure subshells don't get corrupted when IFS becomes a multi-byte character
+		expected_output="$(printf ":$delim:\\ntrap -- 'echo end' EXIT\\nend")"
+		output="$(LANG=C.UTF-8; IFS=$delim; set : :; echo "$*"; trap "echo end" EXIT; trap)"
+		[[ $output == $expected_output ]] || err_exit "IFS in subshell failed with multi-byte character $delim (expected $expected_output, got $output)"
+	done
+
+	# Multibyte characters with the same initial byte shouldn't be parsed as the same
+	# character if they are different. The regression test below tests two characters
+	# with the same initial byte (0xC2).
+	IFS='£' # £ = C2 A3
+	v='abc§def ghi§jkl' # § = C2 A7 (same initial byte)
+	set -- $v
+	v="${#},${1-},${2-},${3-}"
+	[[ $v == '1,abc§def ghi§jkl,,' ]] || err_exit "IFS treats £ (C2 A3) and § (C2 A7) as the same character"
+)
+
 # ^^^ end: IFS tests ^^^
 # restore default split:
 unset IFS