Make guf_utf8_char 4 bytes instead of 5

(Null-termination was superfluous here.)
2025-05-25 16:04:26 +02:00 · 2025-05-25 16:04:26 +02:00 · ae83ee66e1
commit ae83ee66e1
parent c54fc75221
3 changed files with 5 additions and 11 deletions
--- a/src/guf_utf8.h
+++ b/src/guf_utf8.h
@@ -16,7 +16,7 @@

 // Corresponds to one unicode codepoint (NOTE: one guf_utf8_char does not necessarily correspond to one printable character, e.g. combining characters).
 typedef struct guf_utf8_char { 
-    char bytes[5];
+    char bytes[4];
 } guf_utf8_char; 

 typedef enum guf_utf8_stat {
@@ -68,7 +68,7 @@ const char* const GUF_UTF8_COMMON_PUNCT[32] =
    ".", ",", ";", ":", "(", ")", "[", "]", "!", "?", "¿", "¡", "&", "+", "-", "/", "*", "\"", "'", "„", "“", "´", "»", "«", "`", "\\", "%", "‒", "–", "—", "—", "_"
 };

-const guf_utf8_char GUF_UTF8_REPLACEMENT_CHAR  = {.bytes = {'\xEF','\xBF','\xBD', '\0', '\0'}};
+const guf_utf8_char GUF_UTF8_REPLACEMENT_CHAR  = {.bytes = {'\xEF','\xBF','\xBD', '\0'}};

 #ifndef GUF_FN_KEYWORDS
    #define GUF_FN_KEYWORDS
--- a/src/test/test_utf8.cpp
+++ b/src/test/test_utf8.cpp
@@ -317,35 +317,31 @@ void UTF8Test::encode_decode()
    TEST_CHECK(guf_utf8_encode(&utf8, 0x1F308)); // "🌈" (Rainbow)
    TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 4);
    TEST_CHECK(utf8.bytes[0] == '\xF0' && utf8.bytes[1] == '\x9F' && utf8.bytes[2] == '\x8C' && utf8.bytes[3] == '\x88');
-    TEST_CHECK(utf8.bytes[4] == '\0');
    TEST_CHECK(guf_utf8_decode(&utf8) == 0x1F308);

    TEST_CHECK(guf_utf8_encode(&utf8, 0x130B8)); // "𓂸" (Egyptian Hieroglyph D052)
    TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 4);
    TEST_CHECK(utf8.bytes[0] == '\xF0' && utf8.bytes[1] == '\x93' && utf8.bytes[2] == '\x82' && utf8.bytes[3] == '\xB8');
-    TEST_CHECK(utf8.bytes[4] == '\0');
    TEST_CHECK(guf_utf8_decode(&utf8) == 0x130B8);

    TEST_CHECK(guf_utf8_encode(&utf8, 0x1F97A)); // "🥺" (Face with Pleading Eyes)
    TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 4);
    TEST_CHECK(utf8.bytes[0] == '\xF0' && utf8.bytes[1] == '\x9F' && utf8.bytes[2] == '\xA5' && utf8.bytes[3] == '\xBA');
-    TEST_CHECK(utf8.bytes[4] == '\0');
    TEST_CHECK(guf_utf8_decode(&utf8) == 0x1F97A);

    TEST_CHECK(guf_utf8_encode(&utf8, 0x1F980)); // "🦀" (Crab)
    TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 4);
    TEST_CHECK(utf8.bytes[0] == '\xF0' && utf8.bytes[1] == '\x9F' && utf8.bytes[2] == '\xA6' && utf8.bytes[3] == '\x80');
-    TEST_CHECK(utf8.bytes[4] == '\0');
    TEST_CHECK(guf_utf8_decode(&utf8) == 0x1F980);

    // Invalid characters:
-    utf8 = {.bytes = {'\xC0', '\x80', 0, 0, 0}};
+    utf8 = {.bytes = {'\xC0', '\x80', 0, 0}};
    TEST_CHECK(guf_utf8_decode(&utf8) < 0);

-    utf8 = {.bytes = {'\xC0', 0, 0, 0, 0}};
+    utf8 = {.bytes = {'\xC0', 0, 0, 0}};
    TEST_CHECK(guf_utf8_decode(&utf8) < 0);

-    utf8 = {.bytes = {'\x80', 0, 0, 0, 0}};
+    utf8 = {.bytes = {'\x80', 0, 0, 0}};
    TEST_CHECK(guf_utf8_decode(&utf8) < 0);

    // "The definition of UTF-8 prohibits encoding character numbers between U+D800 and U+DFFF" (surrogate pairs).
--- a/todo.txt
+++ b/todo.txt
@@ -1,7 +1,5 @@
 - fix readonly str/uninit ?

- make guf_utf8_char 4 bytes (non-null terminated)
-
 - guf_stack, guf_queue, guf_dqueue, guf_prio_queue (using a heap), guf_ringbuf
 - sort: add cpp #ifdef to remove restrict from declaration