From ce6488ccb3789fb7e8ae0d751e3b49bd4e985375 Mon Sep 17 00:00:00 2001 From: Adam Saponara Date: Thu, 8 Feb 2024 20:40:24 -0500 Subject: [PATCH] stop reading at null-terminator in `tb_utf8_char_to_unicode` (#63) if this occurs in `tb_print_ex`, replace the invalid character with a U+FFFD. --- termbox2.h | 36 ++++++++++++++++++++------- tests/test_invalid_utf8/expected.ansi | 24 ++++++++++++++++++ tests/test_invalid_utf8/test.php | 8 ++++++ 3 files changed, 59 insertions(+), 9 deletions(-) create mode 100644 tests/test_invalid_utf8/expected.ansi create mode 100755 tests/test_invalid_utf8/test.php diff --git a/termbox2.h b/termbox2.h index 9da4ef1..0a8b44d 100644 --- a/termbox2.h +++ b/termbox2.h @@ -632,7 +632,8 @@ int tb_poll_event(struct tb_event *event); int tb_get_fds(int *ttyfd, int *resizefd); /* Print and printf functions. Specify param out_w to determine width of printed - * string. + * string. Incomplete trailing UTF-8 byte sequences are replaced with U+FFFD. + * For finer control, use tb_set_cell(). */ int tb_print(int x, int y, uintattr_t fg, uintattr_t bg, const char *str); int tb_printf(int x, int y, uintattr_t fg, uintattr_t bg, const char *fmt, ...); @@ -660,6 +661,17 @@ int tb_set_func(int fn_type, int (*fn)(struct tb_event *, size_t *)); /* Utility functions. */ int tb_utf8_char_length(char c); + +/* Convert UTF-8 null-terminated byte sequence to UTF-32 code point. + * + * If `c` is an empty C string, return 0. `out` is left unchanged. + * + * If a null byte is encountered in the middle of the code point, return a + * negative number indicating how many bytes were processed. `out` is left + * unchanged. + * + * Otherwise, return byte length of code point (1-6). 
+ */ int tb_utf8_char_to_unicode(uint32_t *out, const char *c); int tb_utf8_unicode_to_char(char *out, uint32_t c); int tb_last_errno(void); @@ -1815,11 +1827,17 @@ int tb_print_ex(int x, int y, uintattr_t fg, uintattr_t bg, size_t *out_w, *out_w = 0; } while (*str) { - str += tb_utf8_char_to_unicode(&uni, str); - w = wcwidth((wchar_t)uni); - if (w < 0) { - w = 1; + rv = tb_utf8_char_to_unicode(&uni, str); + if (rv < 0) { + uni = 0xfffd; // replace invalid UTF-8 char with U+FFFD + str += rv * -1; + } else if (rv > 0) { + str += rv; + } else { + break; // shouldn't get here } + w = wcwidth((wchar_t)uni); + if (w < 0) w = 1; if (w == 0 && x > ix) { if_err_return(rv, tb_extend_cell(x - 1, y, uni)); } else { @@ -1892,19 +1910,19 @@ int tb_utf8_char_length(char c) { } int tb_utf8_char_to_unicode(uint32_t *out, const char *c) { - if (*c == 0) { - return TB_ERR; - } + if (*c == '\0') return 0; int i; unsigned char len = tb_utf8_char_length(*c); unsigned char mask = utf8_mask[len - 1]; uint32_t result = c[0] & mask; - for (i = 1; i < len; ++i) { + for (i = 1; i < len && c[i] != '\0'; ++i) { result <<= 6; result |= c[i] & 0x3f; } + if (i != len) return i * -1; + *out = result; return (int)len; } diff --git a/tests/test_invalid_utf8/expected.ansi b/tests/test_invalid_utf8/expected.ansi new file mode 100644 index 0000000..b440b63 --- /dev/null +++ b/tests/test_invalid_utf8/expected.ansi @@ -0,0 +1,24 @@ +#5foo� + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/test_invalid_utf8/test.php b/tests/test_invalid_utf8/test.php new file mode 100755 index 0000000..76480dd --- /dev/null +++ b/tests/test_invalid_utf8/test.php @@ -0,0 +1,8 @@ +<?php +declare(strict_types=1); + +$test->ffi->tb_init(); +$test->ffi->tb_print_ex(0, 0, 0, 0, NULL, "foo\xc2\x00password"); +$test->ffi->tb_present(); + +$test->screencap(); -- 2.39.5