diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1971536..3cc4758 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -21,7 +21,7 @@ endif ()
 add_executable(libguf_example src/test/example.c src/test/guf_dict_impl.c)
 target_include_directories(libguf_example PRIVATE src src/test)
 
-add_executable(libguf_test src/test/test.cpp src/test/guf_dbuf_impl.c src/test/guf_dict_impl.c src/test/guf_rand_impl.c src/test/guf_sort_impl.c)
+add_executable(libguf_test src/test/test.cpp src/test/guf_dbuf_impl.c src/test/guf_dict_impl.c src/test/guf_rand_impl.c src/test/guf_sort_impl.c src/test/guf_utf8_impl.c)
 target_include_directories(libguf_test PRIVATE src src/test)
 
 if (NOT DEFINED MSVC)
diff --git a/src/guf_str.c b/src/guf_str.c
index fcbe932..c86ea6a 100644
--- a/src/guf_str.c
+++ b/src/guf_str.c
@@ -687,9 +687,14 @@ guf_str_codepoint_utf8 guf_str_iterate_utf8(const guf_str *str, size_t *idx)
 
     guf_str_codepoint_utf8 cp = {.num_bytes = 1, .bytes = {'\0', '\0', '\0', '\0', '\0'}, .valid = true};
 
-    const unsigned char four_bytes_mask = 240;
-    const unsigned char three_bytes_mask = 224;
-    const unsigned char two_bytes_mask = 192;
+    const unsigned char four_bytes_val = 240; // 0b1111.0xxx
+    const unsigned char four_bytes_mask = 248; // 0b1111.1000
+
+    const unsigned char three_bytes_val = 224; // 0b1110.xxxx
+    const unsigned char three_bytes_mask = 240; // 0b1111.0000
+
+    const unsigned char two_bytes_val = 192; // 0b110x.xxxx
+    const unsigned char two_bytes_mask = 224 ; // 0b1110.0000
 
     size_t i = *idx;
     if (guf_str_char_is_ascii(c_str[i])) {
@@ -732,7 +737,7 @@ guf_str_codepoint_utf8 guf_str_iterate_utf8(const guf_str *str, size_t *idx)
             size_t id = i + j;
             assert(id < len);
             unsigned char byte = c_str[id];
-            if (byte >= 128 && byte < 192) { // Binary: 10......
+            if (byte >= 128 && byte < 192) { // 0b10xx.xxxx
                 cp.bytes[id] = byte;
             } else {
                 cp.valid = false;
diff --git a/src/guf_utf8.h b/src/guf_utf8.h
new file mode 100644
index 0000000..0478da9
--- /dev/null
+++ b/src/guf_utf8.h
@@ -0,0 +1,127 @@
+// NOTE(review): this tests GUF_STATIC_IMPL, while the implementation section below tests (and
+// later undefines) GUF_IMPL_STATIC; confirm which spelling is intended.
+#if defined(GUF_STATIC) || defined(GUF_STATIC_IMPL)
+    #define GUF_FN_KEYWORDS static
+#else
+    #define GUF_FN_KEYWORDS
+#endif
+
+#ifndef GUF_UTF8_H
+#define GUF_UTF8_H
+    #include "guf_common.h"
+
+    // Storage for one UTF-8 encoded character: up to four bytes, lead byte in bytes[0].
+    typedef struct guf_utf8_char {
+        unsigned char bytes[4];
+    } guf_utf8_char;
+
+    // True if c lies in the 7-bit ASCII range [0, 127].
+    static inline bool guf_char_is_ascii(int c) {return c >= 0 && c <= 127;}
+    static inline bool guf_uchar_is_ascii(unsigned char c) {return c <= 127;}
+
+    // Length in bytes (1 to 4) of the sequence introduced by lead byte c, or 0 if c is not a valid lead byte.
+    GUF_FN_KEYWORDS int guf_utf8_num_bytes(unsigned char c);
+
+    // True if the sequence starting at c->bytes[0] is well-formed UTF-8 (RFC 3629).
+    // Only the bytes belonging to that sequence are inspected.
+    GUF_FN_KEYWORDS bool guf_utf8_char_is_valid(const guf_utf8_char *c);
+
+#endif
+
+#if defined(GUF_IMPL) || defined(GUF_IMPL_STATIC)
+
+#include "guf_common.h"
+#include "guf_assert.h"
+
+// Classifies a lead byte by value range; continuation bytes (0x80 to 0xBF) also yield 0.
+// cf. https://www.rfc-editor.org/rfc/rfc3629#page-4
+GUF_FN_KEYWORDS int guf_utf8_num_bytes(unsigned char c)
+{
+    if (c <= 0x7F) { // bits: 0xxx.xxxx
+        return 1;
+    } else if (c >= 0xC2 && c <= 0xDF) { // bits: 110x.xxxx (without 0xC0 and 0xC1)
+        return 2;
+    } else if (c >= 0xE0 && c <= 0xEF) { // bits: 1110.xxxx
+        return 3;
+    } else if (c >= 0xF0 && c <= 0xF4) { // bits: 1111.0xxx (without 0xF5 to 0xFF)
+        return 4;
+    } else {
+        return 0; // Invalid byte.
+    }
+}
+
+// Validates the lead byte first, then checks each following byte against the ranges RFC 3629
+// allows for that lead byte.
+GUF_FN_KEYWORDS bool guf_utf8_char_is_valid(const guf_utf8_char *c)
+{
+    const int num_bytes = guf_utf8_num_bytes(c->bytes[0]);
+
+    if (!num_bytes) {
+        return false;
+    }
+
+    for (int i = 0; i < num_bytes; ++i) {
+        // "The octet values C0, C1, F5 to FF never appear.", cf. https://www.rfc-editor.org/rfc/rfc3629#page-5
+        if (c->bytes[i] == 0xC0 || c->bytes[i] == 0xC1 || (c->bytes[i] >= 0xF5 && c->bytes[i] <= 0xFF)) {
+            return false;
+        }
+    }
+
+    // Binary: 10xx.xxxx
+    #define guf_valid_tail(byte) ((byte) >= 0x80 && (byte) <= 0xBF)
+
+    // cf. https://datatracker.ietf.org/doc/html/rfc3629#page-5
+    switch (num_bytes)
+    {
+    case 1:
+        GUF_ASSERT(c->bytes[0] <= 0x7F);
+        return true;
+
+    case 2:
+        GUF_ASSERT(c->bytes[0] >= 0xC2 && c->bytes[0] <= 0xDF);
+        return guf_valid_tail(c->bytes[1]);
+
+    case 3:
+        // Lead 0xE0: second byte starts at 0xA0, which excludes overlong encodings.
+        if ((c->bytes[0] == 0xE0) && (c->bytes[1] >= 0xA0 && c->bytes[1] <= 0xBF) && guf_valid_tail(c->bytes[2])) {
+            return true;
+        }
+        if ((c->bytes[0] >= 0xE1 && c->bytes[0] <= 0xEC) && guf_valid_tail(c->bytes[1]) && guf_valid_tail(c->bytes[2])) {
+            return true;
+        }
+        // Lead 0xED: second byte stops at 0x9F, which excludes the surrogate range U+D800 to U+DFFF.
+        if ((c->bytes[0] == 0xED) && (c->bytes[1] >= 0x80 && c->bytes[1] <= 0x9F) && guf_valid_tail(c->bytes[2])) {
+            return true;
+        }
+        if ((c->bytes[0] >= 0xEE && c->bytes[0] <= 0xEF) && guf_valid_tail(c->bytes[1]) && guf_valid_tail(c->bytes[2])) {
+            return true;
+        }
+        return false;
+
+    case 4:
+        // Lead 0xF0: second byte starts at 0x90, which excludes overlong encodings.
+        if ((c->bytes[0] == 0xF0) && (c->bytes[1] >= 0x90 && c->bytes[1] <= 0xBF) && guf_valid_tail(c->bytes[2]) && guf_valid_tail(c->bytes[3])) {
+            return true;
+        }
+        if ((c->bytes[0] >= 0xF1 && c->bytes[0] <= 0xF3) && guf_valid_tail(c->bytes[1]) && guf_valid_tail(c->bytes[2]) && guf_valid_tail(c->bytes[3])) {
+            return true;
+        }
+        // Lead 0xF4: second byte stops at 0x8F, which excludes code points above U+10FFFF.
+        if ((c->bytes[0] == 0xF4) && (c->bytes[1] >= 0x80 && c->bytes[1] <= 0x8F) && guf_valid_tail(c->bytes[2]) && guf_valid_tail(c->bytes[3])) {
+            return true;
+        }
+        return false;
+
+    default:
+        return false;
+    }
+
+    #undef guf_valid_tail
+}
+
+#endif
+
+#undef GUF_FN_KEYWORDS
+#undef GUF_IMPL
+#undef GUF_IMPL_STATIC
+#undef GUF_STATIC
diff --git a/src/test/guf_dbuf_impl.c b/src/test/guf_dbuf_impl.c
index 01b9545..7718c8e 100644
--- a/src/test/guf_dbuf_impl.c
+++ b/src/test/guf_dbuf_impl.c
@@ -6,6 +6,12 @@
 #define GUF_IMPL
 #include "guf_dbuf.h"
 
+#define GUF_CNT_NAME dbuf_uchar
+#define GUF_T uchar
+#define GUF_T_IS_INTEGRAL_TYPE
+#define GUF_IMPL
+#include "guf_dbuf.h"
+
 #define GUF_CNT_NAME dbuf_float
 #define GUF_T float
 #define GUF_T_IS_INTEGRAL_TYPE
diff --git a/src/test/guf_dbuf_impl.h b/src/test/guf_dbuf_impl.h
index 14b4d9d..d278cf8 100644
--- 
a/src/test/guf_dbuf_impl.h +++ b/src/test/guf_dbuf_impl.h @@ -8,6 +8,13 @@ #define GUF_T_IS_INTEGRAL_TYPE #include "guf_dbuf.h" +typedef unsigned char uchar; + +#define GUF_CNT_NAME dbuf_uchar +#define GUF_T uchar +#define GUF_T_IS_INTEGRAL_TYPE +#include "guf_dbuf.h" + #define GUF_CNT_NAME dbuf_float #define GUF_T float #define GUF_T_IS_INTEGRAL_TYPE diff --git a/src/test/guf_dict_impl.h b/src/test/guf_dict_impl.h index c8245ca..84845f2 100644 --- a/src/test/guf_dict_impl.h +++ b/src/test/guf_dict_impl.h @@ -16,12 +16,10 @@ static inline guf_hash_size_t int32_hash(const int32_t *a) { return guf_hash(a, sizeof(int32_t), GUF_HASH_INIT); } - static inline bool int32_eq(const int32_t *a, const int32_t *b) { return *a == *b; } - #define GUF_DICT_KEY_T int32_t #define GUF_DICT_KEY_HASH int32_hash #define GUF_DICT_KEY_T_EQ int32_eq diff --git a/src/test/guf_utf8_impl.c b/src/test/guf_utf8_impl.c new file mode 100644 index 0000000..b0b2362 --- /dev/null +++ b/src/test/guf_utf8_impl.c @@ -0,0 +1,4 @@ +#include "guf_utf8.h" + +#define GUF_IMPL +#include "guf_utf8.h" diff --git a/src/test/test.cpp b/src/test/test.cpp index e2c9677..14ab8f9 100644 --- a/src/test/test.cpp +++ b/src/test/test.cpp @@ -8,6 +8,7 @@ extern "C" { } #include "test_dbuf.hpp" +#include "test_dict.hpp" std::unordered_set> g_tests {}; @@ -20,6 +21,10 @@ void init_tests() test = std::make_unique("DbufCstringTest"); GUF_ASSERT_RELEASE(test.get()); g_tests.insert(std::move(test)); + + test = std::make_unique("DictCstrToIntTest"); + GUF_ASSERT_RELEASE(test.get()); + g_tests.insert(std::move(test)); } int main() diff --git a/src/test/test_dbuf.hpp b/src/test/test_dbuf.hpp index 1c19f97..ff49c9d 100644 --- a/src/test/test_dbuf.hpp +++ b/src/test/test_dbuf.hpp @@ -10,7 +10,7 @@ extern "C" struct DbufIntTest : public Test { - DbufIntTest(std::string name) : Test(name) {}; + DbufIntTest(const std::string& name) : Test(name) {}; private: diff --git a/src/test/test_dict.hpp b/src/test/test_dict.hpp new file mode 
100644
index 0000000..3561be2
--- /dev/null
+++ b/src/test/test_dict.hpp
@@ -0,0 +1,125 @@
+#pragma once
+#include <cstdio>
+#include <cstring>
+#include <iterator>
+#include <string>
+#include <unordered_map>
+#include <vector>
+#include "test.hpp"
+
+extern "C"
+{
+    #include "guf_alloc_libc.h"
+    #include "guf_dict_impl.h"
+    #include "guf_utf8.h"
+}
+
+// Directory holding the test input; the default is a machine-specific absolute path.
+// Define TEST_DATA_DIR on the compiler command line to point the test somewhere else.
+#ifndef TEST_DATA_DIR
+    #define TEST_DATA_DIR "/Users/joni/Desktop/libguf/src/test/data"
+#endif
+
+// Loads data_01.txt into a dbuf_uchar, walks it as UTF-8 and checks the state of a dict_cstr_int after it is freed.
+struct DictCstrToIntTest : public Test
+{
+
+    DictCstrToIntTest(const std::string& name) : Test(name) {};
+
+    private:
+
+    dbuf_uchar text_buf {};                 // File contents, NUL-terminated by load_file().
+    std::vector<unsigned char> text_vec {}; // Mirror of text_buf, used to cross-check its size.
+
+    // Decodes text_buf one UTF-8 character at a time, echoing every valid character to stdout.
+    void insert_lookup()
+    {
+        // NOTE(review): the template arguments were missing in this copy of the patch; <std::string, int> is assumed, confirm.
+        std::unordered_map<std::string, int> word_cnt_map {};
+        dict_cstr_int word_cnt_dict {};
+        dict_cstr_int_init(&word_cnt_dict, &guf_allocator_libc);
+
+        ptrdiff_t len = 0;
+
+        for (dbuf_uchar_iter it = dbuf_uchar_begin(&text_buf); !dbuf_uchar_iter_is_end(&text_buf, it); it = dbuf_uchar_iter_next(&text_buf, it, 1)) {
+            const unsigned char c = *it.ptr;
+            guf_utf8_char utf8_c = {.bytes = {c, 0, 0, 0}};
+            const int num_bytes = guf_utf8_num_bytes(c);
+
+            if (!num_bytes) { // Not a valid lead byte: skip it.
+                continue;
+            }
+
+            // Pull in the remaining bytes of the sequence, stopping early if the buffer ends.
+            int consumed = 1;
+            while (consumed < num_bytes && ((it = dbuf_uchar_iter_next(&text_buf, it, 1)), !dbuf_uchar_iter_is_end(&text_buf, it)) ) {
+                utf8_c.bytes[consumed++] = *it.ptr;
+            }
+            if (consumed < num_bytes) {
+                printf("Invalid utf-8: file is truncated\n");
+                break;
+            }
+
+            if (guf_utf8_char_is_valid(&utf8_c) && utf8_c.bytes[0] != '\0') {
+                char str[5] {}; // Up to four bytes plus the terminating NUL.
+                memcpy(str, utf8_c.bytes, num_bytes);
+                printf("%s", str);
+                ++len;
+            }
+        }
+        printf("\nread %td utf-8 characters\n", len);
+
+        dict_cstr_int_free(&word_cnt_dict, NULL);
+        bool dbuf_null = !word_cnt_dict.kv_elems.data && !word_cnt_dict.kv_elems.allocator && !word_cnt_dict.kv_elems.capacity && !word_cnt_dict.kv_elems.size;
+        // NOTE(review): this passes only if kv_elems is NOT fully zeroed after the free; confirm that !dbuf_null is intended.
+        TEST_CHECK(!dbuf_null && !word_cnt_dict.kv_indices && !word_cnt_dict.kv_indices_cap && !word_cnt_dict.max_probelen && !word_cnt_dict.num_tombstones);
+    }
+
+    // Reads TEST_DATA_DIR/data_01.txt into text_buf and text_vec; returns false if the file cannot be opened, otherwise the result of the size cross-check.
+    bool load_file()
+    {
+        FILE *in_file = fopen(TEST_DATA_DIR "/data_01.txt", "r");
+        if (!in_file) {
+            return false;
+        }
+
+        dbuf_uchar_init(&text_buf, 128, &guf_allocator_libc);
+
+        int c = EOF;
+        while ((c = fgetc(in_file)) != EOF) {
+            dbuf_uchar_push_val(&text_buf, static_cast<unsigned char>(c));
+            text_vec.push_back(static_cast<unsigned char>(c));
+        }
+        fclose(in_file);
+
+        // Terminate the text; an empty file has no last element to inspect, so check the size first.
+        if (text_buf.size == 0 || *dbuf_uchar_back(&text_buf) != '\0') {
+            dbuf_uchar_push_val(&text_buf, '\0');
+            text_vec.push_back('\0');
+        }
+
+        return TEST_CHECK(std::ssize(text_vec) == text_buf.size);
+    }
+
+    public:
+
+    // Runs the test once and caches the verdict; later calls return the cached result.
+    bool run() override
+    {
+        if (done) {
+            return passed;
+        }
+
+        if (TEST_CHECK(load_file())) {
+            insert_lookup();
+        }
+
+        dbuf_uchar_free(&text_buf, NULL);
+        text_buf = {};
+
+        passed = (num_failed_checks == 0);
+        done = true;
+
+        return passed;
+    }
+};
diff --git a/todo.txt b/todo.txt
new file mode 100644
index 0000000..f4847fe
--- /dev/null
+++ b/todo.txt
@@ -0,0 +1,2 @@
+- guf_stack, guf_queue, guf_ringbuf
+- guf_rand etc.: move guf_fn_keywords out of header guard?
\ No newline at end of file