// libguf/src/test/test_dict.hpp

#pragma once
#include <unordered_map>
#include "test.hpp"

extern "C"
{
    #include "guf_alloc_libc.h"
    #include "guf_dict_impl.h"
    #include "guf_utf8.h"
    #include "guf_str.h"
}

struct DictCstrToIntTest : public Test
{

    DictCstrToIntTest(const std::string& name) : Test(name) {};

    private:

    dbuf_char text_buf {};
    std::vector<char> text_vec {};

    void insert_lookup()
    {
        std::unordered_map<std::string, int> word_cnt_map {};
        dict_cstr_int word_cnt_dict {};
        dict_cstr_int_init(&word_cnt_dict, &guf_allocator_libc);

        dbuf_str_view delims = dbuf_str_view_new(&guf_allocator_libc);
        for (size_t i = 0; i < GUF_STATIC_BUF_SIZE(guf_utf8_whitespace); ++i) {
            guf_str_view d = {.len = (ptrdiff_t)strlen(guf_utf8_whitespace[i]), .str = guf_utf8_whitespace[i]};
            dbuf_str_view_push_val(&delims, d);
        }
        for (size_t i = 0; i < GUF_STATIC_BUF_SIZE(guf_utf8_punctuation); ++i) {
            guf_str_view d = {.len = (ptrdiff_t)strlen(guf_utf8_punctuation[i]), .str = guf_utf8_punctuation[i]};
            dbuf_str_view_push_val(&delims, d);
        }
        guf_str_view input_str = {.str = text_buf.data, .len = text_buf.size};
        guf_str_view tok;
        while ((tok = guf_str_next_tok(&input_str, delims.data, delims.size, NULL, -1)).len) {
            // printf("tok_len: %td ", tok.len);
            printf("'%.*s'\n", (int)tok.len, tok.str);
        }
        dbuf_str_view_free(&delims, NULL);

        // ptrdiff_t valid_chars = 0, invalid_chars = 0, bytes = 0;
        // guf_utf8_char ch = {};
        // for (guf_utf8_stat stat = guf_utf8_char_next(&ch, &input_str); stat != GUF_UTF8_READ_DONE; stat = guf_utf8_char_next(&ch, &input_str)) {
        //     if (stat == GUF_UTF8_READ_VALID) {
        //         ++valid_chars;
        //         printf("%s", ch.bytes);
        //     } else {
        //         ++invalid_chars;
        //         printf("::INVALID_UTF8_CHAR::");
        //     }
        //     bytes += guf_utf8_char_num_bytes(&ch);
        // }
        // TEST_CHECK(input_str.len == 0 && input_str.str == NULL);
        // printf("\nread %td bytes\n", bytes);
        // printf("read %td valid and %td invalid utf-8 characters\n", valid_chars, invalid_chars);

        dict_cstr_int_free(&word_cnt_dict, NULL);
        bool dbuf_null = !word_cnt_dict.kv_elems.data && !word_cnt_dict.kv_elems.allocator && !word_cnt_dict.kv_elems.capacity && !word_cnt_dict.kv_elems.size;
        TEST_CHECK(!dbuf_null && !word_cnt_dict.kv_indices && !word_cnt_dict.kv_indices_cap && !word_cnt_dict.max_probelen && !word_cnt_dict.num_tombstones);
    }

    bool load_file()
    {
        FILE *in_file {nullptr};
        if (!in_file) {
            in_file = fopen(TEST_DATA_DIR "/utf8-test.txt", "r");
        }

        if (!in_file) {
            return false;
        }

        dbuf_char_init(&text_buf, 128, &guf_allocator_libc);

        int c = EOF;
        while ((c = fgetc(in_file)) != EOF) {
            dbuf_char_push_val(&text_buf, (char)c);
            text_vec.push_back((char)c);
        }
        fclose(in_file);

        // dbuf_char_insert_val(&text_buf, '\xC0', 1);
        // text_vec.insert(text_vec.cbegin() + 1, '\xC0');

        return TEST_CHECK(std::ssize(text_vec) == text_buf.size);
    }

    public:

    bool run() override
    {
        if (done) {
            return passed;
        }

        if (!TEST_CHECK(load_file())) {
            goto end;
        }

        insert_lookup();

        end:
        dbuf_char_free(&text_buf, NULL);
        text_buf = {};

        passed = (num_failed_checks == 0);
        done = true;

        return passed;
    }
};