#pragma once #include #include "test.hpp" extern "C" { #include "guf_alloc_libc.h" #include "guf_dict_impl.h" #include "guf_utf8.h" #include "guf_str.h" } struct DictCstrToIntTest : public Test { DictCstrToIntTest(const std::string& name) : Test(name) {}; private: dbuf_char text_buf {}; std::vector text_vec {}; void insert_lookup() { std::unordered_map word_cnt_map {}; dict_cstr_int word_cnt_dict {}; dict_cstr_int_init(&word_cnt_dict, &guf_allocator_libc); dbuf_str_view delims = dbuf_str_view_new(&guf_allocator_libc); for (size_t i = 0; i < GUF_STATIC_BUF_SIZE(guf_utf8_whitespace); ++i) { guf_str_view d = {.len = (ptrdiff_t)strlen(guf_utf8_whitespace[i]), .str = guf_utf8_whitespace[i]}; dbuf_str_view_push_val(&delims, d); } for (size_t i = 0; i < GUF_STATIC_BUF_SIZE(guf_utf8_punctuation); ++i) { guf_str_view d = {.len = (ptrdiff_t)strlen(guf_utf8_punctuation[i]), .str = guf_utf8_punctuation[i]}; dbuf_str_view_push_val(&delims, d); } guf_str_view input_str = {.str = text_buf.data, .len = text_buf.size}; guf_str_view tok; while ((tok = guf_str_next_tok(&input_str, delims.data, delims.size, NULL, -1)).len) { // printf("tok_len: %td ", tok.len); printf("'%.*s'\n", (int)tok.len, tok.str); } dbuf_str_view_free(&delims, NULL); // ptrdiff_t valid_chars = 0, invalid_chars = 0, bytes = 0; // guf_utf8_char ch = {}; // for (guf_utf8_stat stat = guf_utf8_char_next(&ch, &input_str); stat != GUF_UTF8_READ_DONE; stat = guf_utf8_char_next(&ch, &input_str)) { // if (stat == GUF_UTF8_READ_VALID) { // ++valid_chars; // printf("%s", ch.bytes); // } else { // ++invalid_chars; // printf("::INVALID_UTF8_CHAR::"); // } // bytes += guf_utf8_char_num_bytes(&ch); // } // TEST_CHECK(input_str.len == 0 && input_str.str == NULL); // printf("\nread %td bytes\n", bytes); // printf("read %td valid and %td invalid utf-8 characters\n", valid_chars, invalid_chars); dict_cstr_int_free(&word_cnt_dict, NULL); bool dbuf_null = !word_cnt_dict.kv_elems.data && !word_cnt_dict.kv_elems.allocator && !word_cnt_dict.kv_elems.capacity && !word_cnt_dict.kv_elems.size; TEST_CHECK(!dbuf_null && !word_cnt_dict.kv_indices && !word_cnt_dict.kv_indices_cap && !word_cnt_dict.max_probelen && !word_cnt_dict.num_tombstones); } bool load_file() { FILE *in_file {nullptr}; if (!in_file) { in_file = fopen(TEST_DATA_DIR "/utf8-test.txt", "r"); } if (!in_file) { return false; } dbuf_char_init(&text_buf, 128, &guf_allocator_libc); int c = EOF; while ((c = fgetc(in_file)) != EOF) { dbuf_char_push_val(&text_buf, (char)c); text_vec.push_back((char)c); } fclose(in_file); // dbuf_char_insert_val(&text_buf, '\xC0', 1); // text_vec.insert(text_vec.cbegin() + 1, '\xC0'); return TEST_CHECK(std::ssize(text_vec) == text_buf.size); } public: bool run() override { if (done) { return passed; } if (!TEST_CHECK(load_file())) { goto end; } insert_lookup(); end: dbuf_char_free(&text_buf, NULL); text_buf = {}; passed = (num_failed_checks == 0); done = true; return passed; } };