libguf/src/test/test_dict.hpp
2025-03-04 08:01:41 +01:00

365 lines
16 KiB
C++

#pragma once
#include <unordered_map>
#include <cstring>
#include "test.hpp"
extern "C"
{
#include "guf_alloc_libc.h"
#include "guf_dict_impl.h"
#include "guf_str.h"
}
struct DictSvToIntTest : public Test
{
DictSvToIntTest(const std::string& name) : Test(name) {};
private:
dbuf_char text_buf {};
std::vector<char> text_vec {};
void insert_lookup()
{
std::unordered_map<std::string_view, int32_t> word_cnt_map {};
dict_sv_i32 word_cnt_dict {};
dict_sv_i32_init(&word_cnt_dict, &guf_allocator_libc);
dbuf_str_view delims = dbuf_str_view_new(&guf_allocator_libc);
for (size_t i = 0; i < GUF_STATIC_BUF_SIZE(GUF_UTF8_WHITESPACE); ++i) {
guf_str_view d = {.len = (ptrdiff_t)strlen(GUF_UTF8_WHITESPACE[i]), .str = GUF_UTF8_WHITESPACE[i]};
dbuf_str_view_push_val(&delims, d);
}
for (size_t i = 0; i < GUF_STATIC_BUF_SIZE(GUF_UTF8_COMMON_PUNCT); ++i) {
guf_str_view d = {.len = (ptrdiff_t)strlen(GUF_UTF8_COMMON_PUNCT[i]), .str = GUF_UTF8_COMMON_PUNCT[i]};
dbuf_str_view_push_val(&delims, d);
}
guf_str_view input_str = {.str = text_buf.data, .len = text_buf.size};
guf_str_view tok;
while ((tok = guf_str_next_tok(&input_str, delims.data, delims.size, NULL, -1)).len) {
if (tok.len <= 0) {
continue;
}
std::string_view sv(tok.str, tok.len);
TEST_CHECK(dict_sv_i32_contains(&word_cnt_dict, &tok) == word_cnt_map.contains(sv));
if (!dict_sv_i32_contains(&word_cnt_dict, &tok)) {
dict_sv_i32_insert_val_arg(&word_cnt_dict, tok, 1, GUF_CPY_VALUE, GUF_CPY_VALUE);
word_cnt_map.insert({sv, 1});
} else {
int32_t *cnt = dict_sv_i32_at_val_arg(&word_cnt_dict, tok);
if (TEST_CHECK(cnt)) {
*cnt += 1;
}
word_cnt_map.at(sv) += 1;
}
// printf("tok_len: %td ", tok.len);
// printf("'%.*s'\n", (int)tok.len, tok.str);
TEST_CHECK(dict_sv_i32_debug_valid_size(&word_cnt_dict));
}
dbuf_str_view_free(&delims, NULL);
TEST_CHECK(dict_sv_i32_size(&word_cnt_dict) == std::ssize(word_cnt_map));
TEST_CHECK(dict_sv_i32_debug_valid_size(&word_cnt_dict));
for (const auto & [word, cnt] : word_cnt_map ) {
guf_str_view sv = {.str = word.data(), .len = (ptrdiff_t)word.size()};
int32_t *res = dict_sv_i32_at(&word_cnt_dict, &sv);
TEST_CHECK(res && *res == cnt);
}
ptrdiff_t i = 0;
GUF_CNT_FOREACH(&word_cnt_dict, dict_sv_i32, kv_it) {
const dict_sv_i32_kv *kv = kv_it.ptr;
if (TEST_CHECK(kv)) {
const int32_t cnt = kv->val;
// printf("%.*s: %d\n", (int)kv->key.len, kv->key.str, cnt);
const std::string_view sv(kv->key.str, kv->key.len);
if (TEST_CHECK(word_cnt_map.contains(sv))) {
TEST_CHECK(word_cnt_map.at(sv) == cnt);
}
}
++i;
}
TEST_CHECK(i == dict_sv_i32_size(&word_cnt_dict));
TEST_CHECK(i == std::ssize(word_cnt_map));
TEST_CHECK(dict_sv_i32_debug_valid_size(&word_cnt_dict));
// std::cout << "load fac: " << dict_sv_i32_load_factor(&word_cnt_dict) << ", cap: " << word_cnt_dict.kv_indices_cap << "\n";
// std::cout << "size: " << dict_sv_i32_size(&word_cnt_dict) << ", max probelen: " << word_cnt_dict.max_probelen << "\n";
// Erase tests:
const double load_fac_before_erase = dict_sv_i32_load_factor(&word_cnt_dict);
const ptrdiff_t size_before_erase = dict_sv_i32_size(&word_cnt_dict);
ptrdiff_t num_del = 0;
while (dict_sv_i32_size(&word_cnt_dict) > size_before_erase / 2) {
dict_sv_i32_kv *kv = NULL;
if (num_del % 2) {
dict_sv_i32_iter it = dict_sv_i32_begin(&word_cnt_dict);
GUF_ASSERT_RELEASE(!dict_sv_i32_iter_is_end(&word_cnt_dict, it));
kv = it.ptr;
} else {
dict_sv_i32_iter rit = dict_sv_i32_rbegin(&word_cnt_dict);
GUF_ASSERT_RELEASE(!dict_sv_i32_iter_is_end(&word_cnt_dict, rit));
kv = rit.ptr;
}
GUF_ASSERT_RELEASE(kv);
const guf_str_view key = kv->key;
const bool del_success = dict_sv_i32_erase(&word_cnt_dict, &key);
TEST_CHECK(del_success);
TEST_CHECK(!dict_sv_i32_contains(&word_cnt_dict, &key));
std::string_view sv(key.str, (size_t)key.len);
if (TEST_CHECK(word_cnt_map.contains(sv))) {
word_cnt_map.erase(sv);
}
TEST_CHECK(!word_cnt_map.contains(sv));
if (del_success) {
++num_del;
}
}
TEST_CHECK(dict_sv_i32_size(&word_cnt_dict) >= 0);
TEST_CHECK(size_before_erase - num_del == dict_sv_i32_size(&word_cnt_dict));
TEST_CHECK(std::ssize(word_cnt_map) == dict_sv_i32_size(&word_cnt_dict));
if (dict_sv_i32_size(&word_cnt_dict) != 0) {
TEST_CHECK(load_fac_before_erase == dict_sv_i32_load_factor(&word_cnt_dict));
} else {
TEST_CHECK(dict_sv_i32_load_factor(&word_cnt_dict) == 0);
}
if (dict_sv_i32_size(&word_cnt_dict) >= 4) {
dict_sv_i32_kv_dbuf_iter it = dict_sv_i32_begin(&word_cnt_dict);
it = dict_sv_i32_iter_next(&word_cnt_dict, it, 1);
GUF_ASSERT_RELEASE(!dict_sv_i32_iter_is_end(&word_cnt_dict, it));
guf_str_view key = it.ptr->key;
bool del_success = dict_sv_i32_erase(&word_cnt_dict, &key);
TEST_CHECK(del_success);
TEST_CHECK(!dict_sv_i32_contains(&word_cnt_dict, &key));
std::string_view sv(key.str, (size_t)key.len);
if (TEST_CHECK(word_cnt_map.contains(sv))) {
word_cnt_map.erase(sv);
}
it = dict_sv_i32_rbegin(&word_cnt_dict);
it = dict_sv_i32_iter_next(&word_cnt_dict, it, 1);
GUF_ASSERT_RELEASE(!dict_sv_i32_iter_is_end(&word_cnt_dict, it));
key = it.ptr->key;
del_success = dict_sv_i32_erase(&word_cnt_dict, &key);
TEST_CHECK(del_success);
TEST_CHECK(!dict_sv_i32_contains(&word_cnt_dict, &key));
sv = std::string_view(key.str, (size_t)key.len);
if (TEST_CHECK(word_cnt_map.contains(sv))) {
word_cnt_map.erase(sv);
}
}
TEST_CHECK(std::ssize(word_cnt_map) == dict_sv_i32_size(&word_cnt_dict));
i = 0;
GUF_CNT_FOREACH(&word_cnt_dict, dict_sv_i32, kv_it) {
const dict_sv_i32_kv *kv = kv_it.ptr;
if (TEST_CHECK(kv)) {
const int32_t cnt = kv->val;
const std::string_view sv(kv->key.str, (size_t)kv->key.len);
if (TEST_CHECK(word_cnt_map.contains(sv))) {
TEST_CHECK(word_cnt_map.at(sv) == cnt);
}
++i;
}
}
TEST_CHECK(i == word_cnt_dict.kv_elems.size);
TEST_CHECK(i == std::ssize(word_cnt_map));
while (dict_sv_i32_size(&word_cnt_dict) > 0) {
const dict_sv_i32_iter beg = dict_sv_i32_begin(&word_cnt_dict);
if (TEST_CHECK(!dict_sv_i32_iter_is_end(&word_cnt_dict, beg))) {
const guf_str_view key = beg.ptr->key;
if (TEST_CHECK(dict_sv_i32_contains(&word_cnt_dict, &key))) {
const bool del_success = dict_sv_i32_erase(&word_cnt_dict, &key);
TEST_CHECK(del_success);
TEST_CHECK(!dict_sv_i32_contains(&word_cnt_dict, &key));
}
const std::string_view sv(key.str, (size_t)key.len);
if (TEST_CHECK(word_cnt_map.contains(sv))) {
word_cnt_map.erase(sv);
}
}
}
TEST_CHECK(dict_sv_i32_size(&word_cnt_dict) == 0 && word_cnt_map.size() == 0);
TEST_CHECK(word_cnt_dict.num_tombstones == 0);
TEST_CHECK(dict_sv_i32_load_factor(&word_cnt_dict) == 0);
dict_sv_i32_insert_val_arg(&word_cnt_dict, GUF_CSTR_TO_VIEW_CPP("Hej"), (size_t)64, GUF_CPY_VALUE, GUF_CPY_VALUE);
dict_sv_i32_insert_val_arg(&word_cnt_dict, GUF_CSTR_TO_VIEW_CPP("verden!"), (size_t)128, GUF_CPY_VALUE, GUF_CPY_VALUE);
dict_sv_i32_insert_val_arg(&word_cnt_dict, GUF_CSTR_TO_VIEW_CPP("Flødeskum"), (size_t)256, GUF_CPY_VALUE, GUF_CPY_VALUE);
dict_sv_i32_insert_val_arg(&word_cnt_dict, GUF_CSTR_TO_VIEW_CPP("med"), (size_t)512, GUF_CPY_VALUE, GUF_CPY_VALUE);
dict_sv_i32_insert_val_arg(&word_cnt_dict, GUF_CSTR_TO_VIEW_CPP("Faxe Kondi."), (size_t)1024, GUF_CPY_VALUE, GUF_CPY_VALUE);
TEST_CHECK(dict_sv_i32_size(&word_cnt_dict) == 5);
int32_t *val = dict_sv_i32_at_val_arg(&word_cnt_dict, GUF_CSTR_TO_VIEW_CPP("Hej"));
TEST_CHECK(val && *val == 64);
val = dict_sv_i32_at_val_arg(&word_cnt_dict, GUF_CSTR_TO_VIEW_CPP("Flødeskum"));
TEST_CHECK(val && *val == 256);
val = dict_sv_i32_at_val_arg(&word_cnt_dict, GUF_CSTR_TO_VIEW_CPP("Faxe Kondi."));
TEST_CHECK(val && *val == 1024);
val = dict_sv_i32_at_val_arg(&word_cnt_dict, GUF_CSTR_TO_VIEW_CPP("verden!"));
TEST_CHECK(val && *val == 128);
val = dict_sv_i32_at_val_arg(&word_cnt_dict, GUF_CSTR_TO_VIEW_CPP("med"));
TEST_CHECK(val && *val == 512);
TEST_CHECK(word_cnt_dict.kv_elems.size == 5);
TEST_CHECK(word_cnt_dict.kv_elems.data[0].val == 64 && std::strcmp(word_cnt_dict.kv_elems.data[0].key.str, "Hej") == 0);
TEST_CHECK(word_cnt_dict.kv_elems.data[1].val == 128 && std::strcmp(word_cnt_dict.kv_elems.data[1].key.str, "verden!") == 0);
TEST_CHECK(word_cnt_dict.kv_elems.data[2].val == 256 && std::strcmp(word_cnt_dict.kv_elems.data[2].key.str, "Flødeskum") == 0);
TEST_CHECK(word_cnt_dict.kv_elems.data[3].val == 512 && std::strcmp(word_cnt_dict.kv_elems.data[3].key.str, "med") == 0);
TEST_CHECK(word_cnt_dict.kv_elems.data[4].val == 1024 && std::strcmp(word_cnt_dict.kv_elems.data[4].key.str, "Faxe Kondi.") == 0);
const double load_fac_beg = dict_sv_i32_load_factor(&word_cnt_dict);
const ptrdiff_t cap_begin = word_cnt_dict.kv_indices_cap;
ptrdiff_t del = 0;
TEST_CHECK(dict_sv_i32_erase_val_arg(&word_cnt_dict, GUF_CSTR_TO_VIEW_CPP("Hej")));
TEST_CHECK(word_cnt_dict.num_tombstones == ++del);
TEST_CHECK(dict_sv_i32_load_factor(&word_cnt_dict) == load_fac_beg);
for (ptrdiff_t n = 0; n < cap_begin + 128; ++n) {
dict_sv_i32_insert_val_arg(&word_cnt_dict, GUF_CSTR_TO_VIEW_CPP("Hej"), 64, GUF_CPY_VALUE, GUF_CPY_VALUE);
TEST_CHECK(word_cnt_dict.num_tombstones == --del);
TEST_CHECK(dict_sv_i32_load_factor(&word_cnt_dict) == load_fac_beg);
TEST_CHECK(dict_sv_i32_erase_val_arg(&word_cnt_dict, GUF_CSTR_TO_VIEW_CPP("Hej")));
TEST_CHECK(word_cnt_dict.num_tombstones == ++del);
TEST_CHECK(dict_sv_i32_load_factor(&word_cnt_dict) == load_fac_beg);
}
TEST_CHECK(word_cnt_dict.kv_indices_cap == cap_begin);
TEST_CHECK(dict_sv_i32_erase_val_arg(&word_cnt_dict, GUF_CSTR_TO_VIEW_CPP("Faxe Kondi.")));
TEST_CHECK(word_cnt_dict.num_tombstones == ++del);
TEST_CHECK(dict_sv_i32_load_factor(&word_cnt_dict) == load_fac_beg);
for (ptrdiff_t n = 0; n < 256; ++n) {
dict_sv_i32_insert_val_arg(&word_cnt_dict, GUF_CSTR_TO_VIEW_CPP("Faxe Kondi."), 128, GUF_CPY_VALUE, GUF_CPY_VALUE);
TEST_CHECK(word_cnt_dict.num_tombstones == --del);
TEST_CHECK(dict_sv_i32_load_factor(&word_cnt_dict) == load_fac_beg);
TEST_CHECK(dict_sv_i32_erase_val_arg(&word_cnt_dict, GUF_CSTR_TO_VIEW_CPP("Faxe Kondi.")));
TEST_CHECK(word_cnt_dict.num_tombstones == ++del);
TEST_CHECK(dict_sv_i32_load_factor(&word_cnt_dict) == load_fac_beg);
}
TEST_CHECK(word_cnt_dict.kv_indices_cap == cap_begin);
TEST_CHECK(dict_sv_i32_erase_val_arg(&word_cnt_dict, GUF_CSTR_TO_VIEW_CPP("med")));
TEST_CHECK(word_cnt_dict.num_tombstones == ++del);
TEST_CHECK(dict_sv_i32_load_factor(&word_cnt_dict) == load_fac_beg);
for (ptrdiff_t n = 0; n < 512 + cap_begin; ++n) {
dict_sv_i32_insert_val_arg(&word_cnt_dict, GUF_CSTR_TO_VIEW_CPP("med"), 256, GUF_CPY_VALUE, GUF_CPY_VALUE);
TEST_CHECK(word_cnt_dict.num_tombstones == --del);
TEST_CHECK(dict_sv_i32_load_factor(&word_cnt_dict) == load_fac_beg);
TEST_CHECK(dict_sv_i32_erase_val_arg(&word_cnt_dict, GUF_CSTR_TO_VIEW_CPP("med")));
TEST_CHECK(word_cnt_dict.num_tombstones == ++del);
TEST_CHECK(dict_sv_i32_load_factor(&word_cnt_dict) == load_fac_beg);
}
TEST_CHECK(word_cnt_dict.kv_indices_cap == cap_begin);
TEST_CHECK(dict_sv_i32_erase_val_arg(&word_cnt_dict, GUF_CSTR_TO_VIEW_CPP("Flødeskum")));
TEST_CHECK(word_cnt_dict.num_tombstones == ++del);
TEST_CHECK(dict_sv_i32_load_factor(&word_cnt_dict) == load_fac_beg);
for (ptrdiff_t n = 0; n < 71; ++n) {
dict_sv_i32_insert_val_arg(&word_cnt_dict, GUF_CSTR_TO_VIEW_CPP("Flødeskum"), 512, GUF_CPY_VALUE, GUF_CPY_VALUE);
TEST_CHECK(word_cnt_dict.num_tombstones == --del);
TEST_CHECK(dict_sv_i32_load_factor(&word_cnt_dict) == load_fac_beg);
TEST_CHECK(dict_sv_i32_erase_val_arg(&word_cnt_dict, GUF_CSTR_TO_VIEW_CPP("Flødeskum")));
TEST_CHECK(word_cnt_dict.num_tombstones == ++del);
TEST_CHECK(dict_sv_i32_load_factor(&word_cnt_dict) == load_fac_beg);
}
TEST_CHECK(word_cnt_dict.kv_indices_cap == cap_begin);
TEST_CHECK(dict_sv_i32_erase_val_arg(&word_cnt_dict, GUF_CSTR_TO_VIEW_CPP("verden!")));
TEST_CHECK(word_cnt_dict.num_tombstones == 0);
TEST_CHECK(dict_sv_i32_load_factor(&word_cnt_dict) == 0);
for (ptrdiff_t n = 0; n < 201; ++n) {
dict_sv_i32_insert_val_arg(&word_cnt_dict, GUF_CSTR_TO_VIEW_CPP("verden!"), 128, GUF_CPY_VALUE, GUF_CPY_VALUE);
TEST_CHECK(word_cnt_dict.num_tombstones == 0);
TEST_CHECK(dict_sv_i32_load_factor(&word_cnt_dict) > 0);
TEST_CHECK(dict_sv_i32_erase_val_arg(&word_cnt_dict, GUF_CSTR_TO_VIEW_CPP("verden!")));
TEST_CHECK(word_cnt_dict.num_tombstones == 0);
TEST_CHECK(dict_sv_i32_load_factor(&word_cnt_dict) == 0);
}
TEST_CHECK(word_cnt_dict.kv_indices_cap == cap_begin);
TEST_CHECK(word_cnt_dict.kv_elems.size == 0);
TEST_CHECK(dict_sv_i32_size(&word_cnt_dict) == 0);
dict_sv_i32_free(&word_cnt_dict, NULL);
bool dbuf_null = !word_cnt_dict.kv_elems.data && !word_cnt_dict.kv_elems.allocator && !word_cnt_dict.kv_elems.capacity && !word_cnt_dict.kv_elems.size;
TEST_CHECK(dbuf_null && !word_cnt_dict.kv_indices && !word_cnt_dict.kv_indices_cap && !word_cnt_dict.max_probelen && !word_cnt_dict.num_tombstones);
}
bool load_file(const char *fname)
{
FILE *in_file {nullptr};
if (!in_file) {
in_file = fopen(fname, "r");
}
GUF_ASSERT_RELEASE(in_file);
dbuf_char_init(&text_buf, 128, &guf_allocator_libc);
int c = EOF;
while ((c = fgetc(in_file)) != EOF) {
dbuf_char_push_val(&text_buf, (char)c);
text_vec.push_back((char)c);
}
fclose(in_file);
// dbuf_char_insert_val(&text_buf, '\xC0', 1);
// text_vec.insert(text_vec.cbegin() + 1, '\xC0');
return TEST_CHECK(std::ssize(text_vec) == text_buf.size);
}
void free_file()
{
dbuf_char_free(&text_buf, NULL);
text_buf = {};
text_vec.clear();
}
public:
bool run() override
{
if (done) {
return passed;
}
if (TEST_CHECK(load_file(TEST_DATA_DIR "/utf8-test.txt"))) {
insert_lookup();
}
free_file();
if (TEST_CHECK(load_file(TEST_DATA_DIR "/bartleby.txt"))) {
insert_lookup();
}
free_file();
passed = (num_failed_checks == 0);
done = true;
return passed;
}
};