Add utf-8 handling

2025-02-27 07:02:09 +01:00 · 2025-02-27 07:02:09 +01:00 · d450cd8a45
commit d450cd8a45
parent 60e2849b01
8 changed files with 192 additions and 70 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -40,6 +40,9 @@ endif ()

 set_target_properties(libguf_example libguf_test PROPERTIES DEBUG_POSTFIX ${CMAKE_DEBUG_POSTFIX})

+# Directory with the test input files, without a trailing slash: the tests append "/<file>" themselves.
+target_compile_definitions(libguf_test PUBLIC "TEST_DATA_DIR=\"${CMAKE_CURRENT_SOURCE_DIR}/src/test/data\"")
+
 include(CheckIPOSupported)
 check_ipo_supported(RESULT ipo_available)
 if (ipo_available AND (CMAKE_BUILD_TYPE STREQUAL "Release" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo"))
@ -51,7 +54,6 @@ endif()

 if (TARGET libguf_test)
    message(STATUS "Configure libguf_test...")
-
    target_compile_options(libguf_test PRIVATE ${WARNING_FLAGS_CXX} $<$<CONFIG:Debug>: ${DBG_FLAGS}>)
    target_link_options(libguf_test PRIVATE ${WARNING_FLAGS_CXX} $<$<CONFIG:Debug>: ${DBG_FLAGS}> )

--- a/src/guf_str.h
+++ b/src/guf_str.h
@ -36,7 +36,7 @@ typedef struct guf_str {

 typedef struct guf_str_view {
    const char *str; 
-    size_t len;
+    ptrdiff_t len; // length of str in bytes (GUF_CSTR_TO_VIEW fills it from strlen)
 } guf_str_view;

 #define GUF_CSTR_TO_VIEW(CSTR) ((guf_str_view){.str = (CSTR), .len = strlen((CSTR))})
--- a/src/guf_utf8.h
+++ b/src/guf_utf8.h
@ -7,17 +7,30 @@
 #ifndef GUF_UTF8_H
 #define GUF_UTF8_H
    #include "guf_common.h"
+    #include "guf_str.h"

    typedef struct guf_utf8_char {
-        unsigned char bytes[4];
+        char bytes[5]; // encoded bytes of one character; guf_utf8_char_next() zero-fills the unused tail
    } guf_utf8_char; 

+    typedef enum guf_utf8_stat { // result of guf_utf8_char_next()
+        GUF_UTF8_READ_DONE,      // the input view was empty; nothing was read
+        GUF_UTF8_READ_VALID,     // a complete, well-formed character was read
+        GUF_UTF8_READ_INVALID,   // the bytes read do not form a valid UTF-8 character
+        GUF_UTF8_READ_TRUNCATED, // the input ended in the middle of a multi-byte character
+    } guf_utf8_stat;
+
    static inline bool guf_char_is_ascii(int c) {return c <= 0 && c <= 127;}
    static inline bool guf_uchar_is_ascii(unsigned char c) {return c <= 127;}

    GUF_FN_KEYWORDS int guf_utf8_num_bytes(unsigned char c);
+    GUF_FN_KEYWORDS int guf_utf8_char_num_bytes(guf_utf8_char *c);

+    GUF_FN_KEYWORDS guf_utf8_char guf_utf8_char_new(const char *bytes, int num_bytes);
    GUF_FN_KEYWORDS bool guf_utf8_char_is_valid(const guf_utf8_char *c);
+    GUF_FN_KEYWORDS bool guf_utf8_char_is_whitespace(const guf_utf8_char *c);
+
+    GUF_FN_KEYWORDS guf_utf8_stat guf_utf8_char_next(guf_utf8_char *res, guf_str_view *str);

 #endif

@ -26,6 +39,46 @@
 #include "guf_common.h"
 #include "guf_assert.h"

+// Read the next UTF-8 character from the front of *str into *res and advance the view.
+//
+// res: receives the bytes that were read; every byte after them is set to '\0'.
+//      Left untouched when GUF_UTF8_READ_DONE is returned.
+// str: view to read from. str->str is advanced and str->len reduced by the number of
+//      bytes consumed; once the view is exhausted, str->str is set to NULL.
+//
+// Returns GUF_UTF8_READ_DONE if the view is empty (nothing is consumed),
+// GUF_UTF8_READ_VALID for a well-formed character, GUF_UTF8_READ_TRUNCATED if the
+// view ends in the middle of a multi-byte sequence, and GUF_UTF8_READ_INVALID otherwise.
+// A byte that cannot continue the current sequence is NOT consumed: the next call
+// starts decoding at that byte, so one broken sequence does not swallow the
+// (possibly valid) character that follows it.
+GUF_FN_KEYWORDS guf_utf8_stat guf_utf8_char_next(guf_utf8_char *res, guf_str_view *str)
+{
+    GUF_ASSERT_RELEASE(res);
+    GUF_ASSERT_RELEASE(str);
+
+    if (str->len <= 0 || str->str == NULL) {
+        return GUF_UTF8_READ_DONE;
+    }
+
+    // The lead byte is always consumed, even if it turns out to be invalid.
+    int consumed = 0;
+    res->bytes[consumed++] = str->str[0];
+    str->len--;
+    str->str = str->len ? str->str + 1 : NULL;
+
+    for (size_t i = 1; i < GUF_STATIC_BUF_SIZE(res->bytes); ++i) {
+        res->bytes[i] = '\0';
+    }
+
+    const int num_bytes = guf_utf8_char_num_bytes(res);
+
+    if (!num_bytes) {
+        return GUF_UTF8_READ_INVALID;
+    }
+
+    while (consumed < num_bytes && str->len > 0) {
+        // Continuation bytes have the form 10xxxxxx; anything else starts a new
+        // character and must be left in the view for the next call.
+        const unsigned char next = (unsigned char)str->str[0];
+        if ((next & 0xC0) != 0x80) {
+            return GUF_UTF8_READ_INVALID;
+        }
+        res->bytes[consumed++] = str->str[0];
+        str->len--;
+        str->str = str->len ? str->str + 1 : NULL;
+    }
+
+    if (consumed < num_bytes) {
+        // Only reachable when the view ran out of bytes.
+        return GUF_UTF8_READ_TRUNCATED;
+    } else if (guf_utf8_char_is_valid(res)) {
+        return GUF_UTF8_READ_VALID;
+    } else {
+        return GUF_UTF8_READ_INVALID;
+    }
+}
+
+
 // cf. https://www.rfc-editor.org/rfc/rfc3629#page-4
 GUF_FN_KEYWORDS int guf_utf8_num_bytes(unsigned char c)
 {
@ -42,6 +95,13 @@ GUF_FN_KEYWORDS int guf_utf8_num_bytes(unsigned char c)
    }
 }

+// Length of the UTF-8 sequence introduced by the first byte of *c,
+// as reported by guf_utf8_num_bytes().
+GUF_FN_KEYWORDS int guf_utf8_char_num_bytes(guf_utf8_char *c)
+{
+    GUF_ASSERT(c);
+    const unsigned char lead = (unsigned char)c->bytes[0];
+    return guf_utf8_num_bytes(lead);
+}
+
+
 GUF_FN_KEYWORDS bool guf_utf8_char_is_valid(const guf_utf8_char *c)
 {
    const int num_bytes = guf_utf8_num_bytes(c->bytes[0]);
@ -50,9 +110,11 @@ GUF_FN_KEYWORDS bool guf_utf8_char_is_valid(const guf_utf8_char *c)
        return false;
    }

+    const unsigned char *bytes = (const unsigned char*)c->bytes;
+
    for (int i = 0; i < num_bytes; ++i) {
        // "The octet values C0, C1, F5 to FF never appear.", cf. https://www.rfc-editor.org/rfc/rfc3629#page-5
-        if (c->bytes[i] == 0xC0 || c->bytes[i] == 0xC1 || (c->bytes[i] >= 0xF5 && c->bytes[i] <= 0xFF)) {
+        if (bytes[i] == 0xC0 || bytes[i] == 0xC1 || (bytes[i] >= 0xF5 && bytes[i] <= 0xFF)) {
            return false;
        }
    }
@ -63,37 +125,35 @@ GUF_FN_KEYWORDS bool guf_utf8_char_is_valid(const guf_utf8_char *c)
    // cf. https://datatracker.ietf.org/doc/html/rfc3629#page-5
    switch (num_bytes)
    { 
-    case 1:
-        GUF_ASSERT(c->bytes[0] <= 0x7F);
+    case 1: 
        return true;
        
    case 2:
-        GUF_ASSERT(c->bytes[0] >= 0xC2 && c->bytes[0] <= 0xDF);
-        return guf_valid_tail(c->bytes[1]);
+        return guf_valid_tail(bytes[1]);

    case 3: 
-        if ((c->bytes[0] == 0xE0) && (c->bytes[1] >= 0xA0 && c->bytes[1] <= 0xBF) && guf_valid_tail(c->bytes[2])) {
+        if ((bytes[0] == 0xE0) && (bytes[1] >= 0xA0 && bytes[1] <= 0xBF) && guf_valid_tail(bytes[2])) {
            return true;
        }
-        if ((c->bytes[0] >= 0xE1 && c->bytes[0] <= 0xEC) && guf_valid_tail(c->bytes[1]) && guf_valid_tail(c->bytes[2])) {
+        if ((bytes[0] >= 0xE1 && bytes[0] <= 0xEC) && guf_valid_tail(bytes[1]) && guf_valid_tail(bytes[2])) {
            return true;
        }
-        if ((c->bytes[0] == 0xED) && (c->bytes[1] >= 0x80 && c->bytes[1] <= 0x9F) && guf_valid_tail(c->bytes[2])) {
+        if ((bytes[0] == 0xED) && (bytes[1] >= 0x80 && bytes[1] <= 0x9F) && guf_valid_tail(bytes[2])) {
            return true;
        }
-        if ((c->bytes[0] >= 0xEE && c->bytes[0] <= 0xEF) && guf_valid_tail(c->bytes[1]) && guf_valid_tail(c->bytes[2])) {
+        if ((bytes[0] >= 0xEE && bytes[0] <= 0xEF) && guf_valid_tail(bytes[1]) && guf_valid_tail(bytes[2])) {
            return true;
        }
        return false;

    case 4:
-        if ((c->bytes[0] == 0xF0) && (c->bytes[1] >= 0x90 && c->bytes[1] <= 0xBF) && guf_valid_tail(c->bytes[2]) && guf_valid_tail(c->bytes[3])) {
+        if ((bytes[0] == 0xF0) && (bytes[1] >= 0x90 && bytes[1] <= 0xBF) && guf_valid_tail(bytes[2]) && guf_valid_tail(bytes[3])) {
            return true;
        }
-        if ((c->bytes[0] >= 0xF1 && c->bytes[0] <= 0xF3) && guf_valid_tail(c->bytes[1]) && guf_valid_tail(c->bytes[2]) && guf_valid_tail(c->bytes[3])) {
+        if ((bytes[0] >= 0xF1 && bytes[0] <= 0xF3) && guf_valid_tail(bytes[1]) && guf_valid_tail(bytes[2]) && guf_valid_tail(bytes[3])) {
            return true;
        }
-        if ((c->bytes[0] == 0xF4) && (c->bytes[1] >= 0x80 && c->bytes[1] <= 0x8F) && guf_valid_tail(c->bytes[2]) && guf_valid_tail(c->bytes[3])) {
+        if ((bytes[0] == 0xF4) && (bytes[1] >= 0x80 && bytes[1] <= 0x8F) && guf_valid_tail(bytes[2]) && guf_valid_tail(bytes[3])) {
            return true;
        }
        return false;
@ -101,10 +161,49 @@ GUF_FN_KEYWORDS bool guf_utf8_char_is_valid(const guf_utf8_char *c)
    default:
        return false;
    }
-
    #undef guf_valid_tail
 }

+// Return true if *c encodes one of the whitespace characters listed below.
+// The sequence length is taken from guf_utf8_num_bytes(c->bytes[0]) and only that many
+// bytes are compared; c is not validated first. Any length other than 1, 2 or 3 yields false.
+GUF_FN_KEYWORDS bool guf_utf8_char_is_whitespace(const guf_utf8_char *c)
+{
+    // cf. https://en.wikipedia.org/wiki/Whitespace_character#Unicode (last-retrieved 2025-02-27)
+    // U+0020, U+000A, U+0009, U+000D, U+000B, U+000C
+    const char *ws_one_byte[]    = {" ", "\n", "\t", "\r", "\v", "\f"};
+    // U+0085, U+00A0
+    const char *ws_two_bytes[]   = {"\xC2\x85", "\xC2\xA0"};
+    // U+1680, U+2000..U+200A, U+2028, U+2029, U+202F, U+205F, U+3000
+    const char *ws_three_bytes[] = {"\xE1\x9A\x80", "\xE2\x80\x80", "\xE2\x80\x81", "\xE2\x80\x82", "\xE2\x80\x83", "\xE2\x80\x84", "\xE2\x80\x85", "\xE2\x80\x86", "\xE2\x80\x87", "\xE2\x80\x88", "\xE2\x80\x89", "\xE2\x80\x8A", "\xE2\x80\xA8", "\xE2\x80\xA9", "\xE2\x80\xAF", "\xE2\x81\x9F", "\xE3\x80\x80"};
+
+    const int num_bytes = guf_utf8_num_bytes(c->bytes[0]);
+
+    switch (num_bytes)
+    {
+    case 1:
+        for (size_t i = 0; i < GUF_STATIC_BUF_SIZE(ws_one_byte); ++i) {
+            if (c->bytes[0] == ws_one_byte[i][0]) {
+                return true;
+            }
+        }
+        return false;
+
+    case 2: 
+        for (size_t i = 0; i < GUF_STATIC_BUF_SIZE(ws_two_bytes); ++i) {
+            if (c->bytes[0] == ws_two_bytes[i][0] && c->bytes[1] == ws_two_bytes[i][1]) {
+                return true;
+            } 
+        }
+        return false;
+
+    case 3: 
+        for (size_t i = 0; i < GUF_STATIC_BUF_SIZE(ws_three_bytes); ++i) {
+            if (c->bytes[0] == ws_three_bytes[i][0] && c->bytes[1] == ws_three_bytes[i][1] && c->bytes[2] == ws_three_bytes[i][2]) {
+                return true;
+            }
+        }
+        return false;
+
+    default:
+        return false;
+    }
+}
+
 #endif

 #undef GUF_FN_KEYWORDS
--- a/src/test/data/data_01.txt
+++ b/src/test/data/data_01.txt
@ -1,13 +0,0 @@
-„Ich weiß nicht“, rief ich ohne Klang „ich weiß ja nicht. Wenn
-niemand kommt, dann kommt eben niemand. Ich habe niemandem etwas
-Böses getan, niemand hat mir etwas Böses getan, niemand aber will
-mir helfen. Lauter niemand. Aber so ist es doch nicht. Nur daß mir
-niemand hilft —, sonst wäre lauter niemand hübsch. Ich würde ganz
-gern — warum denn nicht — einen Ausflug mit einer Gesellschaft von
-lauter Niemand machen. Natürlich ins Gebirge, wohin denn sonst? Wie
-sich diese Niemand aneinander drängen, diese vielen quer gestreckten
-und eingehängten Arme, diese vielen Füße, durch winzige Schritte
-getrennt! Versteht sich, daß alle in Frack sind. Wir gehen so lala,
-der Wind fährt durch die Lücken, die wir und unsere Gliedmaßen offen
-lassen. Die Hälse werden im Gebirge frei! Es ist ein Wunder, daß
-wir nicht singen.“
--- a/src/test/data/utf8-test.txt
+++ b/src/test/data/utf8-test.txt
@ -0,0 +1,49 @@
+„Ich weiß nicht“, rief ich ohne Klang „ich weiß ja nicht. Wenn
+niemand kommt, dann kommt eben niemand. Ich habe niemandem etwas
+Böses getan, niemand hat mir etwas Böses getan, niemand aber will
+mir helfen. Lauter niemand. Aber so ist es doch nicht. Nur daß mir
+niemand hilft —, sonst wäre lauter niemand hübsch. Ich würde ganz
+gern — warum denn nicht — einen Ausflug mit einer Gesellschaft von
+lauter Niemand machen. Natürlich ins Gebirge, wohin denn sonst? Wie
+sich diese Niemand aneinander drängen, diese vielen quer gestreckten
+und eingehängten Arme, diese vielen Füße, durch winzige Schritte
+getrennt! Versteht sich, daß alle in Frack sind. Wir gehen so lala,
+der Wind fährt durch die Lücken, die wir und unsere Gliedmaßen offen
+lassen. Die Hälse werden im Gebirge frei! Es ist ein Wunder, daß
+wir nicht singen.“
+
+Det var i den Tid, jeg gik omkring og sulted i Kristiania, denne forunderlige By,
+som ingen forlader, før han har fået Mærker af den . . . .
+Jeg ligger vågen på min Kvist og hører en Klokke nedenunder mig slå seks Slag; det var allerede ganske lyst,
+og Folk begyndte at færdes op og ned i Trapperne. Nede ved Døren, hvor mit Rum var tapetseret med gamle Numre 
+af »Morgenbladet«, kunde jeg så tydelig se en Bekendtgørelse fra Fyrdirektøren, og lidt tilvenstre derfra et fedt,
+bugnende Avertissement fra Bager Fabian Olsen om nybagt Brød. 
+
+The quick brown fox jumps over the lazy dog.
+
+Quizdeltagerne spiste jordbær med fløde, mens cirkusklovnen Wolther spillede på xylofon.
+
+Falsches Üben von Xylophonmusik quält jeden größeren Zwerg.
+
+Ξεσκεπάζω τὴν ψυχοφθόρα βδελυγμία.
+
+El pingüino Wenceslao hizo kilómetros bajo exhaustiva lluvia y frío, añoraba a su querido cachorro.
+
+Le cœur déçu mais l'âme plutôt naïve, Louÿs rêva de crapaüter en 
+canoë au delà des îles, près du mälström où brûlent les novæ.
+
+D'fhuascail Íosa, Úrmhac na hÓighe Beannaithe, pór Éava agus Ádhaimh.
+
+Árvíztűrő tükörfúrógép.
+
+Pchnąć w tę łódź jeża lub ośm skrzyń fig.
+
+Kæmi ný öxi hér ykist þjófum nú bæði víl og ádrepa.
+
+В чащах юга жил бы цитрус? Да, но фальшивый экземпляр!
+
+Pijamalı hasta, yağız şoföre çabucak güvendi.
+
+ᚠᛇᚻ᛫ᛒᛦᚦ᛫ᚠᚱᚩᚠᚢᚱ᛫ᚠᛁᚱᚪ᛫ᚷᛖᚻᚹᛦᛚᚳᚢᛗ
+ᛋᚳᛖᚪᛚ᛫ᚦᛖᚪᚻ᛫ᛗᚪᚾᚾᚪ᛫ᚷᛖᚻᚹᛦᛚᚳ᛫ᛗᛁᚳᛚᚢᚾ᛫ᚻᛦᛏ᛫ᛞᚫᛚᚪᚾ
+ᚷᛁᚠ᛫ᚻᛖ᛫ᚹᛁᛚᛖ᛫ᚠᚩᚱ᛫ᛞᚱᛁᚻᛏᚾᛖ᛫ᛞᚩᛗᛖᛋ᛫ᚻᛚᛇᛏᚪᚾ᛬
--- a/src/test/guf_dbuf_impl.c
+++ b/src/test/guf_dbuf_impl.c
@ -6,8 +6,8 @@
 #define GUF_IMPL
 #include "guf_dbuf.h"

-#define GUF_CNT_NAME dbuf_uchar
-#define GUF_T uchar
+#define GUF_CNT_NAME dbuf_char
+#define GUF_T char
 #define GUF_T_IS_INTEGRAL_TYPE
 #define GUF_IMPL
 #include "guf_dbuf.h"
--- a/src/test/guf_dbuf_impl.h
+++ b/src/test/guf_dbuf_impl.h
@ -10,8 +10,8 @@

 typedef unsigned char uchar;

-#define GUF_CNT_NAME dbuf_uchar
-#define GUF_T uchar
+#define GUF_CNT_NAME dbuf_char
+#define GUF_T char
 #define GUF_T_IS_INTEGRAL_TYPE
 #include "guf_dbuf.h"

--- a/src/test/test_dict.hpp
+++ b/src/test/test_dict.hpp
@ -16,7 +16,7 @@ struct DictCstrToIntTest : public Test

    private:

-    dbuf_uchar text_buf {};
+    dbuf_char text_buf {};
    std::vector<char> text_vec {};

    void insert_lookup()
@ -25,34 +25,22 @@ struct DictCstrToIntTest : public Test
        dict_cstr_int word_cnt_dict {};
        dict_cstr_int_init(&word_cnt_dict, &guf_allocator_libc);

-        ptrdiff_t len = 0;
+        ptrdiff_t valid_chars = 0, invalid_chars = 0, bytes = text_buf.size;
+        guf_str_view input_str = {.str = text_buf.data, .len = text_buf.size};
+        guf_utf8_char ch = {};

-        for (dbuf_uchar_iter it = dbuf_uchar_begin(&text_buf); !dbuf_uchar_iter_is_end(&text_buf, it); it = dbuf_uchar_iter_next(&text_buf, it, 1)) {
-            const unsigned char c = *it.ptr;
-            guf_utf8_char utf8_c = {.bytes = {c, 0, 0, 0}};
-            const int num_bytes = guf_utf8_num_bytes(c);
-
-            if (!num_bytes) {
-                continue;
-            }
-
-            int consumed = 1;
-            while (consumed < num_bytes && ((it = dbuf_uchar_iter_next(&text_buf, it, 1)), !dbuf_uchar_iter_is_end(&text_buf, it)) ) {
-                utf8_c.bytes[consumed++] = *it.ptr;
-            }
-            if (consumed < num_bytes) {
-                printf("Invalid utf-8: file is truncated\n");
-                break;
-            }
-
-            if (guf_utf8_char_is_valid(&utf8_c) && utf8_c.bytes[0] != '\0') {
-                char str[5] {}; 
-                memcpy(str, utf8_c.bytes, num_bytes);
-                printf("%s", str);
-                ++len;
+        for (guf_utf8_stat stat = guf_utf8_char_next(&ch, &input_str); stat != GUF_UTF8_READ_DONE; stat = guf_utf8_char_next(&ch, &input_str)) {
+            if (stat == GUF_UTF8_READ_VALID) {
+                ++valid_chars; 
+                printf("%s", ch.bytes);
+            } else {
+                ++invalid_chars;
+                printf("::INVALID_UTF8_CHAR::");
            }
        }
-        printf("\nread %td utf-8 characters\n", len);
+        TEST_CHECK(input_str.len == 0 && input_str.str == NULL);
+        printf("\nread %td bytes\n", bytes);
+        printf("read %td valid and %td invalid utf-8 characters\n", valid_chars, invalid_chars);

        dict_cstr_int_free(&word_cnt_dict, NULL);
        bool dbuf_null = !word_cnt_dict.kv_elems.data && !word_cnt_dict.kv_elems.allocator && !word_cnt_dict.kv_elems.capacity && !word_cnt_dict.kv_elems.size;
@ -61,30 +49,27 @@ struct DictCstrToIntTest : public Test

    bool load_file()
    {
-        #define TEST_DATA_DIR "/Users/joni/Desktop/libguf/src/test/data"
        FILE *in_file {nullptr};
        if (!in_file) {
-            in_file = fopen(TEST_DATA_DIR "/data_01.txt", "r");
+            in_file = fopen(TEST_DATA_DIR "/utf8-test.txt", "r");
        }

        if (!in_file) {
            return false;
        }
        
-        dbuf_uchar_init(&text_buf, 128, &guf_allocator_libc);
+        dbuf_char_init(&text_buf, 128, &guf_allocator_libc);

        int c = EOF;
        while ((c = fgetc(in_file)) != EOF) {
-            dbuf_uchar_push_val(&text_buf, (unsigned char)c);
-            text_vec.push_back((unsigned char)c);
+            dbuf_char_push_val(&text_buf, (char)c);
+            text_vec.push_back((char)c);
        }
        fclose(in_file);

-        if (*dbuf_uchar_back(&text_buf) != '\0') {
-            dbuf_uchar_push_val(&text_buf, '\0');
-            text_vec.push_back('\0');
-        }
-
+        // dbuf_char_insert_val(&text_buf, '\xC0', 1);
+        // text_vec.insert(text_vec.cbegin() + 1, '\xC0');
+        
        return TEST_CHECK(std::ssize(text_vec) == text_buf.size);
    }

@ -103,7 +88,7 @@ struct DictCstrToIntTest : public Test
        insert_lookup();
           
        end:
-        dbuf_uchar_free(&text_buf, NULL);
+        dbuf_char_free(&text_buf, NULL);
        text_buf = {};

        passed = (num_failed_checks == 0);