diff --git a/CMakeLists.txt b/CMakeLists.txt index 3cc4758..58e080f 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -40,6 +40,9 @@ endif () set_target_properties(libguf_example libguf_test PROPERTIES DEBUG_POSTFIX ${CMAKE_DEBUG_POSTFIX}) +target_compile_definitions(libguf_test PUBLIC TEST_DATA_DIR="${CMAKE_CURRENT_SOURCE_DIR}/src/test/data/") + + include(CheckIPOSupported) check_ipo_supported(RESULT ipo_available) if (ipo_available AND (CMAKE_BUILD_TYPE STREQUAL "Release" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")) @@ -51,7 +54,6 @@ endif() if (TARGET libguf_test) message(STATUS "Configure libguf_test...") - target_compile_options(libguf_test PRIVATE ${WARNING_FLAGS_CXX} $<$: ${DBG_FLAGS}>) target_link_options(libguf_test PRIVATE ${WARNING_FLAGS_CXX} $<$: ${DBG_FLAGS}> ) diff --git a/src/guf_str.h b/src/guf_str.h index 95ca926..be2c088 100644 --- a/src/guf_str.h +++ b/src/guf_str.h @@ -36,7 +36,7 @@ typedef struct guf_str { typedef struct guf_str_view { const char *str; - size_t len; + ptrdiff_t len; } guf_str_view; #define GUF_CSTR_TO_VIEW(CSTR) ((guf_str_view){.str = (CSTR), .len = strlen((CSTR))}) diff --git a/src/guf_utf8.h b/src/guf_utf8.h index 0478da9..b4f53c8 100644 --- a/src/guf_utf8.h +++ b/src/guf_utf8.h @@ -7,17 +7,30 @@ #ifndef GUF_UTF8_H #define GUF_UTF8_H #include "guf_common.h" + #include "guf_str.h" typedef struct guf_utf8_char { - unsigned char bytes[4]; + char bytes[5]; } guf_utf8_char; + typedef enum guf_utf8_stat { + GUF_UTF8_READ_DONE, + GUF_UTF8_READ_VALID, + GUF_UTF8_READ_INVALID, + GUF_UTF8_READ_TRUNCATED, + } guf_utf8_stat; + static inline bool guf_char_is_ascii(int c) {return c >= 0 && c <= 127;} /* true iff c lies in the 7-bit ASCII range 0..127; negative values are rejected */ static inline bool guf_uchar_is_ascii(unsigned char c) {return c <= 127;} GUF_FN_KEYWORDS int guf_utf8_num_bytes(unsigned char c); + GUF_FN_KEYWORDS int guf_utf8_char_num_bytes(guf_utf8_char *c); + GUF_FN_KEYWORDS guf_utf8_char guf_utf8_char_new(const char *bytes, int num_bytes); GUF_FN_KEYWORDS bool guf_utf8_char_is_valid(const 
guf_utf8_char *c); + GUF_FN_KEYWORDS bool guf_utf8_char_is_whitespace(const guf_utf8_char *c); + + GUF_FN_KEYWORDS guf_utf8_stat guf_utf8_char_next(guf_utf8_char *res, guf_str_view *str); #endif @@ -26,6 +39,46 @@ #include "guf_common.h" #include "guf_assert.h" +GUF_FN_KEYWORDS guf_utf8_stat guf_utf8_char_next(guf_utf8_char *res, guf_str_view *str) +{ + GUF_ASSERT_RELEASE(res); + GUF_ASSERT_RELEASE(str); + + if (str->len <= 0 || str->str == NULL) { + return GUF_UTF8_READ_DONE; + } + + int consumed = 0; + res->bytes[consumed++] = str->str[0]; + str->len--; + str->str = str->len ? str->str + 1 : NULL; + + for (size_t i = 1; i < GUF_STATIC_BUF_SIZE(res->bytes); ++i) { + res->bytes[i] = '\0'; + } + + const int num_bytes = guf_utf8_char_num_bytes(res); + + if (!num_bytes) { + return GUF_UTF8_READ_INVALID; + } + + while (consumed < num_bytes && str->len > 0) { + res->bytes[consumed++] = str->str[0]; + str->len--; + str->str = str->len ? str->str + 1 : NULL; + } + + if (consumed < num_bytes) { + return GUF_UTF8_READ_TRUNCATED; + } else if (guf_utf8_char_is_valid(res)) { + return GUF_UTF8_READ_VALID; + } else { + return GUF_UTF8_READ_INVALID; + } +} + + // cf. https://www.rfc-editor.org/rfc/rfc3629#page-4 GUF_FN_KEYWORDS int guf_utf8_num_bytes(unsigned char c) { @@ -42,6 +95,13 @@ GUF_FN_KEYWORDS int guf_utf8_num_bytes(unsigned char c) } } +GUF_FN_KEYWORDS int guf_utf8_char_num_bytes(guf_utf8_char *c) +{ + GUF_ASSERT(c); + return guf_utf8_num_bytes(c->bytes[0]); +} + + GUF_FN_KEYWORDS bool guf_utf8_char_is_valid(const guf_utf8_char *c) { const int num_bytes = guf_utf8_num_bytes(c->bytes[0]); @@ -50,9 +110,11 @@ GUF_FN_KEYWORDS bool guf_utf8_char_is_valid(const guf_utf8_char *c) return false; } + const unsigned char *bytes = (const unsigned char*)c->bytes; + for (int i = 0; i < num_bytes; ++i) { // "The octet values C0, C1, F5 to FF never appear.", cf. 
https://www.rfc-editor.org/rfc/rfc3629#page-5 - if (c->bytes[i] == 0xC0 || c->bytes[i] == 0xC1 || (c->bytes[i] >= 0xF5 && c->bytes[i] <= 0xFF)) { + if (bytes[i] == 0xC0 || bytes[i] == 0xC1 || (bytes[i] >= 0xF5 && bytes[i] <= 0xFF)) { return false; } } @@ -63,37 +125,35 @@ GUF_FN_KEYWORDS bool guf_utf8_char_is_valid(const guf_utf8_char *c) // cf. https://datatracker.ietf.org/doc/html/rfc3629#page-5 switch (num_bytes) { - case 1: - GUF_ASSERT(c->bytes[0] <= 0x7F); + case 1: return true; case 2: - GUF_ASSERT(c->bytes[0] >= 0xC2 && c->bytes[0] <= 0xDF); - return guf_valid_tail(c->bytes[1]); + return guf_valid_tail(bytes[1]); case 3: - if ((c->bytes[0] == 0xE0) && (c->bytes[1] >= 0xA0 && c->bytes[1] <= 0xBF) && guf_valid_tail(c->bytes[2])) { + if ((bytes[0] == 0xE0) && (bytes[1] >= 0xA0 && bytes[1] <= 0xBF) && guf_valid_tail(bytes[2])) { return true; } - if ((c->bytes[0] >= 0xE1 && c->bytes[0] <= 0xEC) && guf_valid_tail(c->bytes[1]) && guf_valid_tail(c->bytes[2])) { + if ((bytes[0] >= 0xE1 && bytes[0] <= 0xEC) && guf_valid_tail(bytes[1]) && guf_valid_tail(bytes[2])) { return true; } - if ((c->bytes[0] == 0xED) && (c->bytes[1] >= 0x80 && c->bytes[1] <= 0x9F) && guf_valid_tail(c->bytes[2])) { + if ((bytes[0] == 0xED) && (bytes[1] >= 0x80 && bytes[1] <= 0x9F) && guf_valid_tail(bytes[2])) { return true; } - if ((c->bytes[0] >= 0xEE && c->bytes[0] <= 0xEF) && guf_valid_tail(c->bytes[1]) && guf_valid_tail(c->bytes[2])) { + if ((bytes[0] >= 0xEE && bytes[0] <= 0xEF) && guf_valid_tail(bytes[1]) && guf_valid_tail(bytes[2])) { return true; } return false; case 4: - if ((c->bytes[0] == 0xF0) && (c->bytes[1] >= 0x90 && c->bytes[1] <= 0xBF) && guf_valid_tail(c->bytes[2]) && guf_valid_tail(c->bytes[3])) { + if ((bytes[0] == 0xF0) && (bytes[1] >= 0x90 && bytes[1] <= 0xBF) && guf_valid_tail(bytes[2]) && guf_valid_tail(bytes[3])) { return true; } - if ((c->bytes[0] >= 0xF1 && c->bytes[0] <= 0xF3) && guf_valid_tail(c->bytes[1]) && guf_valid_tail(c->bytes[2]) && 
guf_valid_tail(c->bytes[3])) { + if ((bytes[0] >= 0xF1 && bytes[0] <= 0xF3) && guf_valid_tail(bytes[1]) && guf_valid_tail(bytes[2]) && guf_valid_tail(bytes[3])) { return true; } - if ((c->bytes[0] == 0xF4) && (c->bytes[1] >= 0x80 && c->bytes[1] <= 0x8F) && guf_valid_tail(c->bytes[2]) && guf_valid_tail(c->bytes[3])) { + if ((bytes[0] == 0xF4) && (bytes[1] >= 0x80 && bytes[1] <= 0x8F) && guf_valid_tail(bytes[2]) && guf_valid_tail(bytes[3])) { return true; } return false; @@ -101,10 +161,49 @@ GUF_FN_KEYWORDS bool guf_utf8_char_is_valid(const guf_utf8_char *c) default: return false; } - #undef guf_valid_tail } +GUF_FN_KEYWORDS bool guf_utf8_char_is_whitespace(const guf_utf8_char *c) +{ + // cf. https://en.wikipedia.org/wiki/Whitespace_character#Unicode (last-retrieved 2025-02-27) + const char *ws_one_byte[] = {" ", "\n", "\t", "\r", "\v", "\f"}; /* U+0020 SPACE, U+000A LF, U+0009 TAB, U+000D CR, U+000B VT, U+000C FF */ + const char *ws_two_bytes[] = {"\xC2\x85", "\xC2\xA0"}; + const char *ws_three_bytes[] = {"\xE1\x9A\x80", "\xE2\x80\x80", "\xE2\x80\x81", "\xE2\x80\x82", "\xE2\x80\x83", "\xE2\x80\x84", "\xE2\x80\x85", "\xE2\x80\x86", "\xE2\x80\x87", "\xE2\x80\x88", "\xE2\x80\x89", "\xE2\x80\x8A", "\xE2\x80\xA8", "\xE2\x80\xA9", "\xE2\x80\xAF", "\xE2\x81\x9F", "\xE3\x80\x80"}; + + const int num_bytes = guf_utf8_num_bytes(c->bytes[0]); + + switch (num_bytes) + { + case 1: + for (size_t i = 0; i < GUF_STATIC_BUF_SIZE(ws_one_byte); ++i) { + if (c->bytes[0] == ws_one_byte[i][0]) { + return true; + } + } + return false; + + case 2: + for (size_t i = 0; i < GUF_STATIC_BUF_SIZE(ws_two_bytes); ++i) { + if (c->bytes[0] == ws_two_bytes[i][0] && c->bytes[1] == ws_two_bytes[i][1]) { + return true; + } + } + return false; + + case 3: + for (size_t i = 0; i < GUF_STATIC_BUF_SIZE(ws_three_bytes); ++i) { + if (c->bytes[0] == ws_three_bytes[i][0] && c->bytes[1] == ws_three_bytes[i][1] && c->bytes[2] == ws_three_bytes[i][2]) { + return true; + } + } + return false; + + default: + return false; + } +} + #endif #undef GUF_FN_KEYWORDS diff --git 
a/src/test/data/data_01.txt b/src/test/data/data_01.txt deleted file mode 100644 index 66a6a6c..0000000 --- a/src/test/data/data_01.txt +++ /dev/null @@ -1,13 +0,0 @@ -„Ich weiß nicht“, rief ich ohne Klang „ich weiß ja nicht. Wenn -niemand kommt, dann kommt eben niemand. Ich habe niemandem etwas -Böses getan, niemand hat mir etwas Böses getan, niemand aber will -mir helfen. Lauter niemand. Aber so ist es doch nicht. Nur daß mir -niemand hilft —, sonst wäre lauter niemand hübsch. Ich würde ganz -gern — warum denn nicht — einen Ausflug mit einer Gesellschaft von -lauter Niemand machen. Natürlich ins Gebirge, wohin denn sonst? Wie -sich diese Niemand aneinander drängen, diese vielen quer gestreckten -und eingehängten Arme, diese vielen Füße, durch winzige Schritte -getrennt! Versteht sich, daß alle in Frack sind. Wir gehen so lala, -der Wind fährt durch die Lücken, die wir und unsere Gliedmaßen offen -lassen. Die Hälse werden im Gebirge frei! Es ist ein Wunder, daß -wir nicht singen.“ \ No newline at end of file diff --git a/src/test/data/utf8-test.txt b/src/test/data/utf8-test.txt new file mode 100644 index 0000000..47042a8 --- /dev/null +++ b/src/test/data/utf8-test.txt @@ -0,0 +1,49 @@ +„Ich weiß nicht“, rief ich ohne Klang „ich weiß ja nicht. Wenn +niemand kommt, dann kommt eben niemand. Ich habe niemandem etwas +Böses getan, niemand hat mir etwas Böses getan, niemand aber will +mir helfen. Lauter niemand. Aber so ist es doch nicht. Nur daß mir +niemand hilft —, sonst wäre lauter niemand hübsch. Ich würde ganz +gern — warum denn nicht — einen Ausflug mit einer Gesellschaft von +lauter Niemand machen. Natürlich ins Gebirge, wohin denn sonst? Wie +sich diese Niemand aneinander drängen, diese vielen quer gestreckten +und eingehängten Arme, diese vielen Füße, durch winzige Schritte +getrennt! Versteht sich, daß alle in Frack sind. Wir gehen so lala, +der Wind fährt durch die Lücken, die wir und unsere Gliedmaßen offen +lassen. Die Hälse werden im Gebirge frei! 
Es ist ein Wunder, daß +wir nicht singen.“ + +Det var i den Tid, jeg gik omkring og sulted i Kristiania, denne forunderlige By, +som ingen forlader, før han har fået Mærker af den . . . . +Jeg ligger vågen på min Kvist og hører en Klokke nedenunder mig slå seks Slag; det var allerede ganske lyst, +og Folk begyndte at færdes op og ned i Trapperne. Nede ved Døren, hvor mit Rum var tapetseret med gamle Numre +af »Morgenbladet«, kunde jeg så tydelig se en Bekendtgørelse fra Fyrdirektøren, og lidt tilvenstre derfra et fedt, +bugnende Avertissement fra Bager Fabian Olsen om nybagt Brød. + +The quick brown fox jumps over the lazy dog. + +Quizdeltagerne spiste jordbær med fløde, mens cirkusklovnen Wolther spillede på xylofon. + +Falsches Üben von Xylophonmusik quält jeden größeren Zwerg. + +Ξεσκεπάζω τὴν ψυχοφθόρα βδελυγμία. + +El pingüino Wenceslao hizo kilómetros bajo exhaustiva lluvia y frío, añoraba a su querido cachorro. + +Le cœur déçu mais l'âme plutôt naïve, Louÿs rêva de crapaüter en +canoë au delà des îles, près du mälström où brûlent les novæ. + +D'fhuascail Íosa, Úrmhac na hÓighe Beannaithe, pór Éava agus Ádhaimh. + +Árvíztűrő tükörfúrógép. + +Pchnąć w tę łódź jeża lub ośm skrzyń fig. + +Kæmi ný öxi hér ykist þjófum nú bæði víl og ádrepa. + +В чащах юга жил бы цитрус? Да, но фальшивый экземпляр! + +Pijamalı hasta, yağız şoföre çabucak güvendi. 
+ +ᚠᛇᚻ᛫ᛒᛦᚦ᛫ᚠᚱᚩᚠᚢᚱ᛫ᚠᛁᚱᚪ᛫ᚷᛖᚻᚹᛦᛚᚳᚢᛗ +ᛋᚳᛖᚪᛚ᛫ᚦᛖᚪᚻ᛫ᛗᚪᚾᚾᚪ᛫ᚷᛖᚻᚹᛦᛚᚳ᛫ᛗᛁᚳᛚᚢᚾ᛫ᚻᛦᛏ᛫ᛞᚫᛚᚪᚾ +ᚷᛁᚠ᛫ᚻᛖ᛫ᚹᛁᛚᛖ᛫ᚠᚩᚱ᛫ᛞᚱᛁᚻᛏᚾᛖ᛫ᛞᚩᛗᛖᛋ᛫ᚻᛚᛇᛏᚪᚾ᛬ \ No newline at end of file diff --git a/src/test/guf_dbuf_impl.c b/src/test/guf_dbuf_impl.c index 7718c8e..8e1cdf9 100644 --- a/src/test/guf_dbuf_impl.c +++ b/src/test/guf_dbuf_impl.c @@ -6,8 +6,8 @@ #define GUF_IMPL #include "guf_dbuf.h" -#define GUF_CNT_NAME dbuf_uchar -#define GUF_T uchar +#define GUF_CNT_NAME dbuf_char +#define GUF_T char #define GUF_T_IS_INTEGRAL_TYPE #define GUF_IMPL #include "guf_dbuf.h" diff --git a/src/test/guf_dbuf_impl.h b/src/test/guf_dbuf_impl.h index d278cf8..af2ad41 100644 --- a/src/test/guf_dbuf_impl.h +++ b/src/test/guf_dbuf_impl.h @@ -10,8 +10,8 @@ typedef unsigned char uchar; -#define GUF_CNT_NAME dbuf_uchar -#define GUF_T uchar +#define GUF_CNT_NAME dbuf_char +#define GUF_T char #define GUF_T_IS_INTEGRAL_TYPE #include "guf_dbuf.h" diff --git a/src/test/test_dict.hpp b/src/test/test_dict.hpp index 3561be2..9c55add 100644 --- a/src/test/test_dict.hpp +++ b/src/test/test_dict.hpp @@ -16,7 +16,7 @@ struct DictCstrToIntTest : public Test private: - dbuf_uchar text_buf {}; + dbuf_char text_buf {}; std::vector text_vec {}; void insert_lookup() @@ -25,34 +25,22 @@ struct DictCstrToIntTest : public Test dict_cstr_int word_cnt_dict {}; dict_cstr_int_init(&word_cnt_dict, &guf_allocator_libc); - ptrdiff_t len = 0; + ptrdiff_t valid_chars = 0, invalid_chars = 0, bytes = text_buf.size; + guf_str_view input_str = {.str = text_buf.data, .len = text_buf.size}; + guf_utf8_char ch = {}; - for (dbuf_uchar_iter it = dbuf_uchar_begin(&text_buf); !dbuf_uchar_iter_is_end(&text_buf, it); it = dbuf_uchar_iter_next(&text_buf, it, 1)) { - const unsigned char c = *it.ptr; - guf_utf8_char utf8_c = {.bytes = {c, 0, 0, 0}}; - const int num_bytes = guf_utf8_num_bytes(c); - - if (!num_bytes) { - continue; - } - - int consumed = 1; - while (consumed < num_bytes && ((it = dbuf_uchar_iter_next(&text_buf, it, 1)), !dbuf_uchar_iter_is_end(&text_buf, 
it)) ) { - utf8_c.bytes[consumed++] = *it.ptr; - } - if (consumed < num_bytes) { - printf("Invalid utf-8: file is truncated\n"); - break; - } - - if (guf_utf8_char_is_valid(&utf8_c) && utf8_c.bytes[0] != '\0') { - char str[5] {}; - memcpy(str, utf8_c.bytes, num_bytes); - printf("%s", str); - ++len; + for (guf_utf8_stat stat = guf_utf8_char_next(&ch, &input_str); stat != GUF_UTF8_READ_DONE; stat = guf_utf8_char_next(&ch, &input_str)) { + if (stat == GUF_UTF8_READ_VALID) { + ++valid_chars; + printf("%s", ch.bytes); + } else { + ++invalid_chars; + printf("::INVALID_UTF8_CHAR::"); } } - printf("\nread %td utf-8 characters\n", len); + TEST_CHECK(input_str.len == 0 && input_str.str == NULL); + printf("\nread %td bytes\n", bytes); + printf("read %td valid and %td invalid utf-8 characters\n", valid_chars, invalid_chars); dict_cstr_int_free(&word_cnt_dict, NULL); bool dbuf_null = !word_cnt_dict.kv_elems.data && !word_cnt_dict.kv_elems.allocator && !word_cnt_dict.kv_elems.capacity && !word_cnt_dict.kv_elems.size; @@ -61,30 +49,27 @@ struct DictCstrToIntTest : public Test bool load_file() { - #define TEST_DATA_DIR "/Users/joni/Desktop/libguf/src/test/data" FILE *in_file {nullptr}; if (!in_file) { - in_file = fopen(TEST_DATA_DIR "/data_01.txt", "r"); + in_file = fopen(TEST_DATA_DIR "/utf8-test.txt", "rb"); /* binary mode: the test counts raw UTF-8 bytes, so no newline translation may occur */ } if (!in_file) { return false; } - dbuf_uchar_init(&text_buf, 128, &guf_allocator_libc); + dbuf_char_init(&text_buf, 128, &guf_allocator_libc); int c = EOF; while ((c = fgetc(in_file)) != EOF) { - dbuf_uchar_push_val(&text_buf, (unsigned char)c); - text_vec.push_back((unsigned char)c); + dbuf_char_push_val(&text_buf, (char)c); + text_vec.push_back((char)c); } fclose(in_file); - if (*dbuf_uchar_back(&text_buf) != '\0') { - dbuf_uchar_push_val(&text_buf, '\0'); - text_vec.push_back('\0'); - } - + // dbuf_char_insert_val(&text_buf, '\xC0', 1); + // text_vec.insert(text_vec.cbegin() + 1, '\xC0'); + return TEST_CHECK(std::ssize(text_vec) == text_buf.size); } @@ -103,7 +88,7 @@ 
struct DictCstrToIntTest : public Test insert_lookup(); end: - dbuf_uchar_free(&text_buf, NULL); + dbuf_char_free(&text_buf, NULL); text_buf = {}; passed = (num_failed_checks == 0);