Add more dict tests

This commit is contained in:
jun 2025-03-04 06:41:44 +01:00
parent 8e6ffcdc70
commit 9d62df6a83
6 changed files with 2105 additions and 25 deletions

View File

@ -602,10 +602,9 @@ GUF_DICT_KWRDS bool GUF_CAT(GUF_DICT_NAME, _erase)(GUF_DICT_NAME *ht, const GUF_
GUF_DICT_KV_NAME *last_kv = GUF_CAT(GUF_DICT_KV_DBUF, _back)(&ht->kv_elems); GUF_DICT_KV_NAME *last_kv = GUF_CAT(GUF_DICT_KV_DBUF, _back)(&ht->kv_elems);
GUF_ASSERT(last_kv); GUF_ASSERT(last_kv);
GUF_ASSERT(kv != last_kv); GUF_ASSERT(kv != last_kv);
*kv = *last_kv; *kv = *last_kv;
GUF_ASSERT(!GUF_DICT_KEY_T_EQ(key, &last_kv->key)); // GUF_ASSERT(!GUF_DICT_KEY_T_EQ(key, &last_kv->key));
// 2.) Update kv_index. // 2.) Update kv_index.
bool last_key_exists = false; bool last_key_exists = false;
@ -622,7 +621,7 @@ GUF_DICT_KWRDS bool GUF_CAT(GUF_DICT_NAME, _erase)(GUF_DICT_NAME *ht, const GUF_
GUF_ASSERT(ht->kv_elems.size >= 0); GUF_ASSERT(ht->kv_elems.size >= 0);
GUF_ASSERT(ht->num_tombstones <= ht->kv_indices_cap); GUF_ASSERT(ht->num_tombstones <= ht->kv_indices_cap);
GUF_ASSERT(!GUF_CAT(GUF_DICT_NAME, _contains)(ht, key)); // GUF_ASSERT(!GUF_CAT(GUF_DICT_NAME, _contains)(ht, key));
if (ht->kv_elems.size == 0 && ht->num_tombstones > 0) { // Optimisation: We can delete all tombstones here. if (ht->kv_elems.size == 0 && ht->num_tombstones > 0) { // Optimisation: We can delete all tombstones here.
ptrdiff_t del_tombstone_cnt = 0; ptrdiff_t del_tombstone_cnt = 0;

View File

@ -38,7 +38,7 @@ GUF_UTF8_KWRDS bool guf_utf8_equal(const guf_utf8_char *a, const guf_utf8_char *
GUF_UTF8_KWRDS guf_utf8_stat guf_utf8_char_next(guf_utf8_char *res, guf_str_view *str); GUF_UTF8_KWRDS guf_utf8_stat guf_utf8_char_next(guf_utf8_char *res, guf_str_view *str);
extern const char* const GUF_UTF8_WHITESPACE[25]; extern const char* const GUF_UTF8_WHITESPACE[25];
extern const char* const GUF_UTF8_COMMON_PUNCT[31]; extern const char* const GUF_UTF8_COMMON_PUNCT[32];
extern const guf_utf8_char GUF_UTF8_REPLACEMENT_CHAR; // Replacement character "<22>" (U+FFFD) extern const guf_utf8_char GUF_UTF8_REPLACEMENT_CHAR; // Replacement character "<22>" (U+FFFD)
#define GUF_UTF8_REPLACEMENT_CHAR_CODEPOINT UINT32_C(0xFFFD) #define GUF_UTF8_REPLACEMENT_CHAR_CODEPOINT UINT32_C(0xFFFD)
@ -57,9 +57,9 @@ const char* const GUF_UTF8_WHITESPACE[25] =
}; };
// Common punctuation (TODO: make more exhaustive; use \x escapes) // Common punctuation (TODO: make more exhaustive; use \x escapes)
const char* const GUF_UTF8_COMMON_PUNCT[31] = const char* const GUF_UTF8_COMMON_PUNCT[32] =
{ {
".", ",", ";", ":", "(", ")", "[", "]", "!", "?", "¿", "¡", "&", "+", "-", "/", "*", "\"", "'", "", "", "´", "»", "«", "`", "\\", "%", "", "", "", "" ".", ",", ";", ":", "(", ")", "[", "]", "!", "?", "¿", "¡", "&", "+", "-", "/", "*", "\"", "'", "", "", "´", "»", "«", "`", "\\", "%", "", "", "", "", "_"
}; };
const guf_utf8_char GUF_UTF8_REPLACEMENT_CHAR = {.bytes = {'\xEF','\xBF','\xBD', '\0', '\0'}}; const guf_utf8_char GUF_UTF8_REPLACEMENT_CHAR = {.bytes = {'\xEF','\xBF','\xBD', '\0', '\0'}};

1941
src/test/data/bartleby.txt Normal file

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,6 @@
#pragma once #pragma once
#include <unordered_map> #include <unordered_map>
#include <cstring>
#include "test.hpp" #include "test.hpp"
extern "C" extern "C"
@ -84,8 +85,10 @@ struct DictSvToIntTest : public Test
TEST_CHECK(i == std::ssize(word_cnt_map)); TEST_CHECK(i == std::ssize(word_cnt_map));
TEST_CHECK(dict_sv_i32_debug_valid_size(&word_cnt_dict)); TEST_CHECK(dict_sv_i32_debug_valid_size(&word_cnt_dict));
std::cout << "load fac: " << dict_sv_i32_load_factor(&word_cnt_dict) << ", cap: " << word_cnt_dict.kv_indices_cap << "\n"; // std::cout << "load fac: " << dict_sv_i32_load_factor(&word_cnt_dict) << ", cap: " << word_cnt_dict.kv_indices_cap << "\n";
std::cout << "size: " << dict_sv_i32_size(&word_cnt_dict) << ", max probelen: " << word_cnt_dict.max_probelen << "\n"; // std::cout << "size: " << dict_sv_i32_size(&word_cnt_dict) << ", max probelen: " << word_cnt_dict.max_probelen << "\n";
// Erase tests:
const double load_fac_before_erase = dict_sv_i32_load_factor(&word_cnt_dict); const double load_fac_before_erase = dict_sv_i32_load_factor(&word_cnt_dict);
const ptrdiff_t size_before_erase = dict_sv_i32_size(&word_cnt_dict); const ptrdiff_t size_before_erase = dict_sv_i32_size(&word_cnt_dict);
@ -176,22 +179,146 @@ struct DictSvToIntTest : public Test
TEST_CHECK(i == word_cnt_dict.kv_elems.size); TEST_CHECK(i == word_cnt_dict.kv_elems.size);
TEST_CHECK(i == std::ssize(word_cnt_map)); TEST_CHECK(i == std::ssize(word_cnt_map));
while (dict_sv_i32_size(&word_cnt_dict) > 0) {
const dict_sv_i32_iter beg = dict_sv_i32_begin(&word_cnt_dict);
if (TEST_CHECK(!dict_sv_i32_iter_is_end(&word_cnt_dict, beg))) {
const guf_str_view key = beg.ptr->key;
if (TEST_CHECK(dict_sv_i32_contains(&word_cnt_dict, &key))) {
const bool del_success = dict_sv_i32_erase(&word_cnt_dict, &key);
TEST_CHECK(del_success);
TEST_CHECK(!dict_sv_i32_contains(&word_cnt_dict, &key));
}
const std::string_view sv(key.str, (size_t)key.len);
if (TEST_CHECK(word_cnt_map.contains(sv))) {
word_cnt_map.erase(sv);
}
}
}
TEST_CHECK(dict_sv_i32_size(&word_cnt_dict) == 0 && word_cnt_map.size() == 0);
TEST_CHECK(word_cnt_dict.num_tombstones == 0);
TEST_CHECK(dict_sv_i32_load_factor(&word_cnt_dict) == 0);
dict_sv_i32_insert_val_arg(&word_cnt_dict, {.str = "Hej", .len = (ptrdiff_t)strlen("Hej")}, (size_t)64, GUF_CPY_VALUE, GUF_CPY_VALUE);
dict_sv_i32_insert_val_arg(&word_cnt_dict, {.str = "verden!", .len = (ptrdiff_t)strlen("verden!")}, (size_t)128, GUF_CPY_VALUE, GUF_CPY_VALUE);
dict_sv_i32_insert_val_arg(&word_cnt_dict, {.str = "Flødeskum", .len = (ptrdiff_t)strlen("Flødeskum")}, (size_t)256, GUF_CPY_VALUE, GUF_CPY_VALUE);
dict_sv_i32_insert_val_arg(&word_cnt_dict, {.str = "med", .len = (ptrdiff_t)strlen("med")}, (size_t)512, GUF_CPY_VALUE, GUF_CPY_VALUE);
dict_sv_i32_insert_val_arg(&word_cnt_dict, {.str = "Faxe Kondi.", .len = (ptrdiff_t)strlen("Faxe Kondi.")}, (size_t)1024, GUF_CPY_VALUE, GUF_CPY_VALUE);
TEST_CHECK(dict_sv_i32_size(&word_cnt_dict) == 5);
int32_t *val = dict_sv_i32_at_val_arg(&word_cnt_dict, {.str = "Hej", .len = (ptrdiff_t)strlen("Hej")});
TEST_CHECK(val && *val == 64);
val = dict_sv_i32_at_val_arg(&word_cnt_dict, {.str = "Flødeskum", .len = (ptrdiff_t)strlen("Flødeskum")});
TEST_CHECK(val && *val == 256);
val = dict_sv_i32_at_val_arg(&word_cnt_dict, {.str = "Faxe Kondi.", .len = (ptrdiff_t)strlen("Faxe Kondi.")});
TEST_CHECK(val && *val == 1024);
val = dict_sv_i32_at_val_arg(&word_cnt_dict, {.str = "verden!", .len = (ptrdiff_t)strlen("verden!")});
TEST_CHECK(val && *val == 128);
val = dict_sv_i32_at_val_arg(&word_cnt_dict, {.str = "med", .len = (ptrdiff_t)strlen("med")});
TEST_CHECK(val && *val == 512);
TEST_CHECK(word_cnt_dict.kv_elems.size == 5);
TEST_CHECK(word_cnt_dict.kv_elems.data[0].val == 64 && std::strcmp(word_cnt_dict.kv_elems.data[0].key.str, "Hej") == 0);
TEST_CHECK(word_cnt_dict.kv_elems.data[1].val == 128 && std::strcmp(word_cnt_dict.kv_elems.data[1].key.str, "verden!") == 0);
TEST_CHECK(word_cnt_dict.kv_elems.data[2].val == 256 && std::strcmp(word_cnt_dict.kv_elems.data[2].key.str, "Flødeskum") == 0);
TEST_CHECK(word_cnt_dict.kv_elems.data[3].val == 512 && std::strcmp(word_cnt_dict.kv_elems.data[3].key.str, "med") == 0);
TEST_CHECK(word_cnt_dict.kv_elems.data[4].val == 1024 && std::strcmp(word_cnt_dict.kv_elems.data[4].key.str, "Faxe Kondi.") == 0);
ptrdiff_t del = 0;
const double load_fac_beg = dict_sv_i32_load_factor(&word_cnt_dict);
const ptrdiff_t cap_begin = word_cnt_dict.kv_indices_cap;
TEST_CHECK(dict_sv_i32_erase_val_arg(&word_cnt_dict, {.str = "Hej", .len = (ptrdiff_t)strlen("Hej")}));
TEST_CHECK(word_cnt_dict.num_tombstones == ++del);
TEST_CHECK(dict_sv_i32_load_factor(&word_cnt_dict) == load_fac_beg);
for (ptrdiff_t n = 0; n < cap_begin + 128; ++n) {
dict_sv_i32_insert_val_arg(&word_cnt_dict, {.str = "Hej", .len = (ptrdiff_t)strlen("Hej")}, 64, GUF_CPY_VALUE, GUF_CPY_VALUE);
TEST_CHECK(word_cnt_dict.num_tombstones == --del);
TEST_CHECK(dict_sv_i32_load_factor(&word_cnt_dict) == load_fac_beg);
TEST_CHECK(dict_sv_i32_erase_val_arg(&word_cnt_dict, {.str = "Hej", .len = (ptrdiff_t)strlen("Hej")}));
TEST_CHECK(word_cnt_dict.num_tombstones == ++del);
TEST_CHECK(dict_sv_i32_load_factor(&word_cnt_dict) == load_fac_beg);
}
TEST_CHECK(word_cnt_dict.kv_indices_cap == cap_begin);
TEST_CHECK(dict_sv_i32_erase_val_arg(&word_cnt_dict, {.str = "Faxe Kondi.", .len = (ptrdiff_t)strlen("Faxe Kondi.")}));
TEST_CHECK(word_cnt_dict.num_tombstones == ++del);
TEST_CHECK(dict_sv_i32_load_factor(&word_cnt_dict) == load_fac_beg);
for (ptrdiff_t n = 0; n < 256; ++n) {
dict_sv_i32_insert_val_arg(&word_cnt_dict, {.str = "Faxe Kondi.", .len = (ptrdiff_t)strlen("Faxe Kondi.")}, 128, GUF_CPY_VALUE, GUF_CPY_VALUE);
TEST_CHECK(word_cnt_dict.num_tombstones == --del);
TEST_CHECK(dict_sv_i32_load_factor(&word_cnt_dict) == load_fac_beg);
TEST_CHECK(dict_sv_i32_erase_val_arg(&word_cnt_dict, {.str = "Faxe Kondi.", .len = (ptrdiff_t)strlen("Faxe Kondi.")}));
TEST_CHECK(word_cnt_dict.num_tombstones == ++del);
TEST_CHECK(dict_sv_i32_load_factor(&word_cnt_dict) == load_fac_beg);
}
TEST_CHECK(word_cnt_dict.kv_indices_cap == cap_begin);
TEST_CHECK(dict_sv_i32_erase_val_arg(&word_cnt_dict, {.str = "med", .len = (ptrdiff_t)strlen("med")}));
TEST_CHECK(word_cnt_dict.num_tombstones == ++del);
TEST_CHECK(dict_sv_i32_load_factor(&word_cnt_dict) == load_fac_beg);
for (ptrdiff_t n = 0; n < 512; ++n) {
dict_sv_i32_insert_val_arg(&word_cnt_dict, {.str = "med", .len = (ptrdiff_t)strlen("med")}, 256, GUF_CPY_VALUE, GUF_CPY_VALUE);
TEST_CHECK(word_cnt_dict.num_tombstones == --del);
TEST_CHECK(dict_sv_i32_load_factor(&word_cnt_dict) == load_fac_beg);
TEST_CHECK(dict_sv_i32_erase_val_arg(&word_cnt_dict, {.str = "med", .len = (ptrdiff_t)strlen("med")}));
TEST_CHECK(word_cnt_dict.num_tombstones == ++del);
TEST_CHECK(dict_sv_i32_load_factor(&word_cnt_dict) == load_fac_beg);
}
TEST_CHECK(word_cnt_dict.kv_indices_cap == cap_begin);
TEST_CHECK(dict_sv_i32_erase_val_arg(&word_cnt_dict, {.str = "Flødeskum", .len = (ptrdiff_t)strlen("Flødeskum")}));
TEST_CHECK(word_cnt_dict.num_tombstones == ++del);
TEST_CHECK(dict_sv_i32_load_factor(&word_cnt_dict) == load_fac_beg);
for (ptrdiff_t n = 0; n < 71; ++n) {
dict_sv_i32_insert_val_arg(&word_cnt_dict, {.str = "Flødeskum", .len = (ptrdiff_t)strlen("Flødeskum")}, 512, GUF_CPY_VALUE, GUF_CPY_VALUE);
TEST_CHECK(word_cnt_dict.num_tombstones == --del);
TEST_CHECK(dict_sv_i32_load_factor(&word_cnt_dict) == load_fac_beg);
TEST_CHECK(dict_sv_i32_erase_val_arg(&word_cnt_dict, {.str = "Flødeskum", .len = (ptrdiff_t)strlen("Flødeskum")}));
TEST_CHECK(word_cnt_dict.num_tombstones == ++del);
TEST_CHECK(dict_sv_i32_load_factor(&word_cnt_dict) == load_fac_beg);
}
TEST_CHECK(word_cnt_dict.kv_indices_cap == cap_begin);
TEST_CHECK(dict_sv_i32_erase_val_arg(&word_cnt_dict, {.str = "verden!", .len = (ptrdiff_t)strlen("verden!")}));
TEST_CHECK(word_cnt_dict.num_tombstones == 0);
TEST_CHECK(dict_sv_i32_load_factor(&word_cnt_dict) == 0);
for (ptrdiff_t n = 0; n < 201; ++n) {
dict_sv_i32_insert_val_arg(&word_cnt_dict, {.str = "verden!", .len = (ptrdiff_t)strlen("verden!")}, 128, GUF_CPY_VALUE, GUF_CPY_VALUE);
TEST_CHECK(word_cnt_dict.num_tombstones == 0);
TEST_CHECK(dict_sv_i32_load_factor(&word_cnt_dict) > 0);
TEST_CHECK(dict_sv_i32_erase_val_arg(&word_cnt_dict, {.str = "verden!", .len = (ptrdiff_t)strlen("verden!")}));
TEST_CHECK(word_cnt_dict.num_tombstones == 0);
TEST_CHECK(dict_sv_i32_load_factor(&word_cnt_dict) == 0);
}
TEST_CHECK(word_cnt_dict.kv_indices_cap == cap_begin);
TEST_CHECK(word_cnt_dict.kv_elems.size == 0);
TEST_CHECK(dict_sv_i32_size(&word_cnt_dict) == 0);
dict_sv_i32_free(&word_cnt_dict, NULL); dict_sv_i32_free(&word_cnt_dict, NULL);
bool dbuf_null = !word_cnt_dict.kv_elems.data && !word_cnt_dict.kv_elems.allocator && !word_cnt_dict.kv_elems.capacity && !word_cnt_dict.kv_elems.size; bool dbuf_null = !word_cnt_dict.kv_elems.data && !word_cnt_dict.kv_elems.allocator && !word_cnt_dict.kv_elems.capacity && !word_cnt_dict.kv_elems.size;
TEST_CHECK(dbuf_null && !word_cnt_dict.kv_indices && !word_cnt_dict.kv_indices_cap && !word_cnt_dict.max_probelen && !word_cnt_dict.num_tombstones); TEST_CHECK(dbuf_null && !word_cnt_dict.kv_indices && !word_cnt_dict.kv_indices_cap && !word_cnt_dict.max_probelen && !word_cnt_dict.num_tombstones);
} }
bool load_file() bool load_file(const char *fname)
{ {
FILE *in_file {nullptr}; FILE *in_file {nullptr};
if (!in_file) { if (!in_file) {
in_file = fopen(TEST_DATA_DIR "/utf8-test.txt", "r"); in_file = fopen(fname, "r");
} }
if (!in_file) { GUF_ASSERT_RELEASE(in_file);
return false;
}
dbuf_char_init(&text_buf, 128, &guf_allocator_libc); dbuf_char_init(&text_buf, 128, &guf_allocator_libc);
@ -208,6 +335,13 @@ struct DictSvToIntTest : public Test
return TEST_CHECK(std::ssize(text_vec) == text_buf.size); return TEST_CHECK(std::ssize(text_vec) == text_buf.size);
} }
// Releases everything load_file() populated so that another file can be
// loaded afterwards (run() calls load_file()/free_file() in pairs).
void free_file()
{
    // Drop the std-side copy of the text first; it does not depend on text_buf.
    text_vec.clear();

    // Free the dbuf storage, then reset the struct to its zero state so a
    // later dbuf_char_init() starts from a clean value.
    dbuf_char_free(&text_buf, NULL);
    text_buf = {};
}
public: public:
bool run() override bool run() override
@ -216,19 +350,18 @@ struct DictSvToIntTest : public Test
return passed; return passed;
} }
if (!TEST_CHECK(load_file())) { if (TEST_CHECK(load_file(TEST_DATA_DIR "/utf8-test.txt"))) {
goto end;
}
insert_lookup(); insert_lookup();
}
free_file();
end: if (TEST_CHECK(load_file(TEST_DATA_DIR "/bartleby.txt"))) {
dbuf_char_free(&text_buf, NULL); insert_lookup();
text_buf = {}; }
free_file();
passed = (num_failed_checks == 0); passed = (num_failed_checks == 0);
done = true; done = true;
return passed; return passed;
} }
}; };

View File

@ -363,6 +363,9 @@ struct UTF8Test : public Test
read_utf8_chars(TEST_DATA_DIR "/" "utf8-test.txt", &valid, &invalid); read_utf8_chars(TEST_DATA_DIR "/" "utf8-test.txt", &valid, &invalid);
TEST_CHECK(valid == 2634 && invalid == 0); TEST_CHECK(valid == 2634 && invalid == 0);
read_utf8_chars(TEST_DATA_DIR "/" "bartleby.txt", &valid, &invalid);
TEST_CHECK(valid > 16000 && invalid == 0);
dbuf_str_view delims = dbuf_str_view_new(&guf_allocator_libc); dbuf_str_view delims = dbuf_str_view_new(&guf_allocator_libc);
for (size_t i = 0; i < GUF_STATIC_BUF_SIZE(GUF_UTF8_WHITESPACE); ++i) { for (size_t i = 0; i < GUF_STATIC_BUF_SIZE(GUF_UTF8_WHITESPACE); ++i) {
guf_str_view d = {.len = (ptrdiff_t)strlen(GUF_UTF8_WHITESPACE[i]), .str = GUF_UTF8_WHITESPACE[i]}; guf_str_view d = {.len = (ptrdiff_t)strlen(GUF_UTF8_WHITESPACE[i]), .str = GUF_UTF8_WHITESPACE[i]};
@ -376,11 +379,15 @@ struct UTF8Test : public Test
TEST_CHECK(words == 422); TEST_CHECK(words == 422);
int words_with_delims = count_words_with_delims(TEST_DATA_DIR "/" "utf8-test.txt", &delims); int words_with_delims = count_words_with_delims(TEST_DATA_DIR "/" "utf8-test.txt", &delims);
TEST_CHECK(words_with_delims == 949); TEST_CHECK(words_with_delims == 949);
int words2 = count_words(TEST_DATA_DIR "/" "bartleby.txt", &delims);
TEST_CHECK(words2 > 2048);
dbuf_str_view_free(&delims, NULL); dbuf_str_view_free(&delims, NULL);
encode_decode(); encode_decode();
encode_decode_file(TEST_DATA_DIR "/" "utf8-test.txt"); encode_decode_file(TEST_DATA_DIR "/" "utf8-test.txt");
encode_decode_file(TEST_DATA_DIR "/" "bartleby.txt");
done = true; done = true;
passed = (num_failed_checks == 0); passed = (num_failed_checks == 0);

View File

@ -1,6 +1,6 @@
- guf_stack, guf_queue, guf_ringbuf - guf_stack, guf_queue, guf_ringbuf
- guf_rand etc.: move guf_fn_keywords out of header guard? (-> no, add a GUF_WITHOUT_TYPES)
- unicode normalisation - unicode normalisation
- fix 32-bit dict (and add 32/64 bit defs in common.h) - track allocs for test (implement alloc tracker)
- guf_dict: allow manual resize (and possibly resize if load fac gets too high after erase) - handle right-to-left text properly
- fix 32-bit dict (and add 32/64 bit defs and 32/64-bit platform detection in common.h)