From ce6488ccb3789fb7e8ae0d751e3b49bd4e985375 Mon Sep 17 00:00:00 2001
From: Adam Saponara <as@php.net>
Date: Thu, 8 Feb 2024 20:40:24 -0500
Subject: [PATCH] stop reading at null-terminator in `tb_utf8_char_to_unicode`
 (#63)

If this occurs in `tb_print_ex`, replace the truncated sequence with U+FFFD.
---
 termbox2.h                            | 36 ++++++++++++++++++++-------
 tests/test_invalid_utf8/expected.ansi | 24 ++++++++++++++++++
 tests/test_invalid_utf8/test.php      |  8 ++++++
 3 files changed, 59 insertions(+), 9 deletions(-)
 create mode 100644 tests/test_invalid_utf8/expected.ansi
 create mode 100755 tests/test_invalid_utf8/test.php

diff --git a/termbox2.h b/termbox2.h
index 9da4ef1..0a8b44d 100644
--- a/termbox2.h
+++ b/termbox2.h
@@ -632,7 +632,8 @@ int tb_poll_event(struct tb_event *event);
 int tb_get_fds(int *ttyfd, int *resizefd);
 
 /* Print and printf functions. Specify param out_w to determine width of printed
- * string.
+ * string. Incomplete trailing UTF-8 byte sequences are replaced with U+FFFD.
+ * For finer control, use tb_set_cell().
  */
 int tb_print(int x, int y, uintattr_t fg, uintattr_t bg, const char *str);
 int tb_printf(int x, int y, uintattr_t fg, uintattr_t bg, const char *fmt, ...);
@@ -660,6 +661,17 @@ int tb_set_func(int fn_type, int (*fn)(struct tb_event *, size_t *));
 
 /* Utility functions. */
 int tb_utf8_char_length(char c);
+
+/* Convert a null-terminated UTF-8 byte sequence to a UTF-32 code point.
+ *
+ * If `c` is an empty C string, return 0. `out` is left unchanged.
+ *
+ * If a null byte is encountered in the middle of the code point, return the
+ * negated number of bytes consumed before it (e.g. -1). `out` is left
+ * unchanged.
+ *
+ * Otherwise, store the code point in `out` and return its byte length (1-6).
+ */
 int tb_utf8_char_to_unicode(uint32_t *out, const char *c);
 int tb_utf8_unicode_to_char(char *out, uint32_t c);
 int tb_last_errno(void);
@@ -1815,11 +1827,17 @@ int tb_print_ex(int x, int y, uintattr_t fg, uintattr_t bg, size_t *out_w,
         *out_w = 0;
     }
     while (*str) {
-        str += tb_utf8_char_to_unicode(&uni, str);
-        w = wcwidth((wchar_t)uni);
-        if (w < 0) {
-            w = 1;
+        rv = tb_utf8_char_to_unicode(&uni, str);
+        if (rv < 0) {
+            uni = 0xfffd; // replace invalid UTF-8 char with U+FFFD
+            str += rv * -1;
+        } else if (rv > 0) {
+            str += rv;
+        } else {
+            break; // shouldn't get here
         }
+        w = wcwidth((wchar_t)uni);
+        if (w < 0) w = 1;
         if (w == 0 && x > ix) {
             if_err_return(rv, tb_extend_cell(x - 1, y, uni));
         } else {
@@ -1892,19 +1910,19 @@ int tb_utf8_char_length(char c) {
 }
 
 int tb_utf8_char_to_unicode(uint32_t *out, const char *c) {
-    if (*c == 0) {
-        return TB_ERR;
-    }
+    if (*c == '\0') return 0;
 
     int i;
     unsigned char len = tb_utf8_char_length(*c);
     unsigned char mask = utf8_mask[len - 1];
     uint32_t result = c[0] & mask;
-    for (i = 1; i < len; ++i) {
+    for (i = 1; i < len && c[i] != '\0'; ++i) {
         result <<= 6;
         result |= c[i] & 0x3f;
     }
 
+    if (i != len) return i * -1;
+
     *out = result;
     return (int)len;
 }
diff --git a/tests/test_invalid_utf8/expected.ansi b/tests/test_invalid_utf8/expected.ansi
new file mode 100644
index 0000000..b440b63
--- /dev/null
+++ b/tests/test_invalid_utf8/expected.ansi
@@ -0,0 +1,24 @@
+#5[0mfooï¿½[0m
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/tests/test_invalid_utf8/test.php b/tests/test_invalid_utf8/test.php
new file mode 100755
index 0000000..76480dd
--- /dev/null
+++ b/tests/test_invalid_utf8/test.php
@@ -0,0 +1,8 @@
+<?php
+declare(strict_types=1);
+
+$test->ffi->tb_init();
+$test->ffi->tb_print_ex(0, 0, 0, 0, NULL, "foo\xc2\x00password");
+$test->ffi->tb_present();
+
+$test->screencap();
-- 
2.39.5