Add more dict tests
This commit is contained in:
parent
8e6ffcdc70
commit
9d62df6a83
@ -602,10 +602,9 @@ GUF_DICT_KWRDS bool GUF_CAT(GUF_DICT_NAME, _erase)(GUF_DICT_NAME *ht, const GUF_
|
||||
GUF_DICT_KV_NAME *last_kv = GUF_CAT(GUF_DICT_KV_DBUF, _back)(&ht->kv_elems);
|
||||
GUF_ASSERT(last_kv);
|
||||
GUF_ASSERT(kv != last_kv);
|
||||
|
||||
*kv = *last_kv;
|
||||
|
||||
GUF_ASSERT(!GUF_DICT_KEY_T_EQ(key, &last_kv->key));
|
||||
// GUF_ASSERT(!GUF_DICT_KEY_T_EQ(key, &last_kv->key));
|
||||
|
||||
// 2.) Update kv_index.
|
||||
bool last_key_exists = false;
|
||||
@ -622,7 +621,7 @@ GUF_DICT_KWRDS bool GUF_CAT(GUF_DICT_NAME, _erase)(GUF_DICT_NAME *ht, const GUF_
|
||||
GUF_ASSERT(ht->kv_elems.size >= 0);
|
||||
GUF_ASSERT(ht->num_tombstones <= ht->kv_indices_cap);
|
||||
|
||||
GUF_ASSERT(!GUF_CAT(GUF_DICT_NAME, _contains)(ht, key));
|
||||
// GUF_ASSERT(!GUF_CAT(GUF_DICT_NAME, _contains)(ht, key));
|
||||
|
||||
if (ht->kv_elems.size == 0 && ht->num_tombstones > 0) { // Optimisation: We can delete all tombstones here.
|
||||
ptrdiff_t del_tombstone_cnt = 0;
|
||||
|
||||
@ -38,7 +38,7 @@ GUF_UTF8_KWRDS bool guf_utf8_equal(const guf_utf8_char *a, const guf_utf8_char *
|
||||
GUF_UTF8_KWRDS guf_utf8_stat guf_utf8_char_next(guf_utf8_char *res, guf_str_view *str);
|
||||
|
||||
extern const char* const GUF_UTF8_WHITESPACE[25];
|
||||
extern const char* const GUF_UTF8_COMMON_PUNCT[31];
|
||||
extern const char* const GUF_UTF8_COMMON_PUNCT[32];
|
||||
|
||||
extern const guf_utf8_char GUF_UTF8_REPLACEMENT_CHAR; // Replacement character "<22>" (U+FFFD)
|
||||
#define GUF_UTF8_REPLACEMENT_CHAR_CODEPOINT UINT32_C(0xFFFD)
|
||||
@ -57,9 +57,9 @@ const char* const GUF_UTF8_WHITESPACE[25] =
|
||||
};
|
||||
|
||||
// Common punctuation (TODO: make more exhaustive; use \x escapes)
|
||||
const char* const GUF_UTF8_COMMON_PUNCT[31] =
|
||||
const char* const GUF_UTF8_COMMON_PUNCT[32] =
|
||||
{
|
||||
".", ",", ";", ":", "(", ")", "[", "]", "!", "?", "¿", "¡", "&", "+", "-", "/", "*", "\"", "'", "„", "“", "´", "»", "«", "`", "\\", "%", "‒", "–", "—", "—"
|
||||
".", ",", ";", ":", "(", ")", "[", "]", "!", "?", "¿", "¡", "&", "+", "-", "/", "*", "\"", "'", "„", "“", "´", "»", "«", "`", "\\", "%", "‒", "–", "—", "—", "_"
|
||||
};
|
||||
|
||||
const guf_utf8_char GUF_UTF8_REPLACEMENT_CHAR = {.bytes = {'\xEF','\xBF','\xBD', '\0', '\0'}};
|
||||
|
||||
1941
src/test/data/bartleby.txt
Normal file
1941
src/test/data/bartleby.txt
Normal file
File diff suppressed because it is too large
Load Diff
@ -1,5 +1,6 @@
|
||||
#pragma once
|
||||
#include <unordered_map>
|
||||
#include <cstring>
|
||||
#include "test.hpp"
|
||||
|
||||
extern "C"
|
||||
@ -84,8 +85,10 @@ struct DictSvToIntTest : public Test
|
||||
TEST_CHECK(i == std::ssize(word_cnt_map));
|
||||
TEST_CHECK(dict_sv_i32_debug_valid_size(&word_cnt_dict));
|
||||
|
||||
std::cout << "load fac: " << dict_sv_i32_load_factor(&word_cnt_dict) << ", cap: " << word_cnt_dict.kv_indices_cap << "\n";
|
||||
std::cout << "size: " << dict_sv_i32_size(&word_cnt_dict) << ", max probelen: " << word_cnt_dict.max_probelen << "\n";
|
||||
// std::cout << "load fac: " << dict_sv_i32_load_factor(&word_cnt_dict) << ", cap: " << word_cnt_dict.kv_indices_cap << "\n";
|
||||
// std::cout << "size: " << dict_sv_i32_size(&word_cnt_dict) << ", max probelen: " << word_cnt_dict.max_probelen << "\n";
|
||||
|
||||
// Erase tests:
|
||||
|
||||
const double load_fac_before_erase = dict_sv_i32_load_factor(&word_cnt_dict);
|
||||
const ptrdiff_t size_before_erase = dict_sv_i32_size(&word_cnt_dict);
|
||||
@ -176,22 +179,146 @@ struct DictSvToIntTest : public Test
|
||||
TEST_CHECK(i == word_cnt_dict.kv_elems.size);
|
||||
TEST_CHECK(i == std::ssize(word_cnt_map));
|
||||
|
||||
while (dict_sv_i32_size(&word_cnt_dict) > 0) {
|
||||
const dict_sv_i32_iter beg = dict_sv_i32_begin(&word_cnt_dict);
|
||||
if (TEST_CHECK(!dict_sv_i32_iter_is_end(&word_cnt_dict, beg))) {
|
||||
const guf_str_view key = beg.ptr->key;
|
||||
if (TEST_CHECK(dict_sv_i32_contains(&word_cnt_dict, &key))) {
|
||||
const bool del_success = dict_sv_i32_erase(&word_cnt_dict, &key);
|
||||
TEST_CHECK(del_success);
|
||||
TEST_CHECK(!dict_sv_i32_contains(&word_cnt_dict, &key));
|
||||
}
|
||||
const std::string_view sv(key.str, (size_t)key.len);
|
||||
if (TEST_CHECK(word_cnt_map.contains(sv))) {
|
||||
word_cnt_map.erase(sv);
|
||||
}
|
||||
}
|
||||
}
|
||||
TEST_CHECK(dict_sv_i32_size(&word_cnt_dict) == 0 && word_cnt_map.size() == 0);
|
||||
TEST_CHECK(word_cnt_dict.num_tombstones == 0);
|
||||
TEST_CHECK(dict_sv_i32_load_factor(&word_cnt_dict) == 0);
|
||||
|
||||
dict_sv_i32_insert_val_arg(&word_cnt_dict, {.str = "Hej", .len = (ptrdiff_t)strlen("Hej")}, (size_t)64, GUF_CPY_VALUE, GUF_CPY_VALUE);
|
||||
dict_sv_i32_insert_val_arg(&word_cnt_dict, {.str = "verden!", .len = (ptrdiff_t)strlen("verden!")}, (size_t)128, GUF_CPY_VALUE, GUF_CPY_VALUE);
|
||||
dict_sv_i32_insert_val_arg(&word_cnt_dict, {.str = "Flødeskum", .len = (ptrdiff_t)strlen("Flødeskum")}, (size_t)256, GUF_CPY_VALUE, GUF_CPY_VALUE);
|
||||
dict_sv_i32_insert_val_arg(&word_cnt_dict, {.str = "med", .len = (ptrdiff_t)strlen("med")}, (size_t)512, GUF_CPY_VALUE, GUF_CPY_VALUE);
|
||||
dict_sv_i32_insert_val_arg(&word_cnt_dict, {.str = "Faxe Kondi.", .len = (ptrdiff_t)strlen("Faxe Kondi.")}, (size_t)1024, GUF_CPY_VALUE, GUF_CPY_VALUE);
|
||||
|
||||
TEST_CHECK(dict_sv_i32_size(&word_cnt_dict) == 5);
|
||||
|
||||
int32_t *val = dict_sv_i32_at_val_arg(&word_cnt_dict, {.str = "Hej", .len = (ptrdiff_t)strlen("Hej")});
|
||||
TEST_CHECK(val && *val == 64);
|
||||
val = dict_sv_i32_at_val_arg(&word_cnt_dict, {.str = "Flødeskum", .len = (ptrdiff_t)strlen("Flødeskum")});
|
||||
TEST_CHECK(val && *val == 256);
|
||||
val = dict_sv_i32_at_val_arg(&word_cnt_dict, {.str = "Faxe Kondi.", .len = (ptrdiff_t)strlen("Faxe Kondi.")});
|
||||
TEST_CHECK(val && *val == 1024);
|
||||
val = dict_sv_i32_at_val_arg(&word_cnt_dict, {.str = "verden!", .len = (ptrdiff_t)strlen("verden!")});
|
||||
TEST_CHECK(val && *val == 128);
|
||||
val = dict_sv_i32_at_val_arg(&word_cnt_dict, {.str = "med", .len = (ptrdiff_t)strlen("med")});
|
||||
TEST_CHECK(val && *val == 512);
|
||||
|
||||
TEST_CHECK(word_cnt_dict.kv_elems.size == 5);
|
||||
|
||||
TEST_CHECK(word_cnt_dict.kv_elems.data[0].val == 64 && std::strcmp(word_cnt_dict.kv_elems.data[0].key.str, "Hej") == 0);
|
||||
TEST_CHECK(word_cnt_dict.kv_elems.data[1].val == 128 && std::strcmp(word_cnt_dict.kv_elems.data[1].key.str, "verden!") == 0);
|
||||
TEST_CHECK(word_cnt_dict.kv_elems.data[2].val == 256 && std::strcmp(word_cnt_dict.kv_elems.data[2].key.str, "Flødeskum") == 0);
|
||||
TEST_CHECK(word_cnt_dict.kv_elems.data[3].val == 512 && std::strcmp(word_cnt_dict.kv_elems.data[3].key.str, "med") == 0);
|
||||
TEST_CHECK(word_cnt_dict.kv_elems.data[4].val == 1024 && std::strcmp(word_cnt_dict.kv_elems.data[4].key.str, "Faxe Kondi.") == 0);
|
||||
|
||||
|
||||
ptrdiff_t del = 0;
|
||||
|
||||
const double load_fac_beg = dict_sv_i32_load_factor(&word_cnt_dict);
|
||||
const ptrdiff_t cap_begin = word_cnt_dict.kv_indices_cap;
|
||||
|
||||
TEST_CHECK(dict_sv_i32_erase_val_arg(&word_cnt_dict, {.str = "Hej", .len = (ptrdiff_t)strlen("Hej")}));
|
||||
TEST_CHECK(word_cnt_dict.num_tombstones == ++del);
|
||||
TEST_CHECK(dict_sv_i32_load_factor(&word_cnt_dict) == load_fac_beg);
|
||||
for (ptrdiff_t n = 0; n < cap_begin + 128; ++n) {
|
||||
dict_sv_i32_insert_val_arg(&word_cnt_dict, {.str = "Hej", .len = (ptrdiff_t)strlen("Hej")}, 64, GUF_CPY_VALUE, GUF_CPY_VALUE);
|
||||
TEST_CHECK(word_cnt_dict.num_tombstones == --del);
|
||||
TEST_CHECK(dict_sv_i32_load_factor(&word_cnt_dict) == load_fac_beg);
|
||||
|
||||
TEST_CHECK(dict_sv_i32_erase_val_arg(&word_cnt_dict, {.str = "Hej", .len = (ptrdiff_t)strlen("Hej")}));
|
||||
TEST_CHECK(word_cnt_dict.num_tombstones == ++del);
|
||||
TEST_CHECK(dict_sv_i32_load_factor(&word_cnt_dict) == load_fac_beg);
|
||||
}
|
||||
TEST_CHECK(word_cnt_dict.kv_indices_cap == cap_begin);
|
||||
|
||||
|
||||
TEST_CHECK(dict_sv_i32_erase_val_arg(&word_cnt_dict, {.str = "Faxe Kondi.", .len = (ptrdiff_t)strlen("Faxe Kondi.")}));
|
||||
TEST_CHECK(word_cnt_dict.num_tombstones == ++del);
|
||||
TEST_CHECK(dict_sv_i32_load_factor(&word_cnt_dict) == load_fac_beg);
|
||||
for (ptrdiff_t n = 0; n < 256; ++n) {
|
||||
dict_sv_i32_insert_val_arg(&word_cnt_dict, {.str = "Faxe Kondi.", .len = (ptrdiff_t)strlen("Faxe Kondi.")}, 128, GUF_CPY_VALUE, GUF_CPY_VALUE);
|
||||
TEST_CHECK(word_cnt_dict.num_tombstones == --del);
|
||||
TEST_CHECK(dict_sv_i32_load_factor(&word_cnt_dict) == load_fac_beg);
|
||||
|
||||
TEST_CHECK(dict_sv_i32_erase_val_arg(&word_cnt_dict, {.str = "Faxe Kondi.", .len = (ptrdiff_t)strlen("Faxe Kondi.")}));
|
||||
TEST_CHECK(word_cnt_dict.num_tombstones == ++del);
|
||||
TEST_CHECK(dict_sv_i32_load_factor(&word_cnt_dict) == load_fac_beg);
|
||||
}
|
||||
TEST_CHECK(word_cnt_dict.kv_indices_cap == cap_begin);
|
||||
|
||||
TEST_CHECK(dict_sv_i32_erase_val_arg(&word_cnt_dict, {.str = "med", .len = (ptrdiff_t)strlen("med")}));
|
||||
TEST_CHECK(word_cnt_dict.num_tombstones == ++del);
|
||||
TEST_CHECK(dict_sv_i32_load_factor(&word_cnt_dict) == load_fac_beg);
|
||||
for (ptrdiff_t n = 0; n < 512; ++n) {
|
||||
dict_sv_i32_insert_val_arg(&word_cnt_dict, {.str = "med", .len = (ptrdiff_t)strlen("med")}, 256, GUF_CPY_VALUE, GUF_CPY_VALUE);
|
||||
TEST_CHECK(word_cnt_dict.num_tombstones == --del);
|
||||
TEST_CHECK(dict_sv_i32_load_factor(&word_cnt_dict) == load_fac_beg);
|
||||
|
||||
TEST_CHECK(dict_sv_i32_erase_val_arg(&word_cnt_dict, {.str = "med", .len = (ptrdiff_t)strlen("med")}));
|
||||
TEST_CHECK(word_cnt_dict.num_tombstones == ++del);
|
||||
TEST_CHECK(dict_sv_i32_load_factor(&word_cnt_dict) == load_fac_beg);
|
||||
}
|
||||
TEST_CHECK(word_cnt_dict.kv_indices_cap == cap_begin);
|
||||
|
||||
TEST_CHECK(dict_sv_i32_erase_val_arg(&word_cnt_dict, {.str = "Flødeskum", .len = (ptrdiff_t)strlen("Flødeskum")}));
|
||||
TEST_CHECK(word_cnt_dict.num_tombstones == ++del);
|
||||
TEST_CHECK(dict_sv_i32_load_factor(&word_cnt_dict) == load_fac_beg);
|
||||
for (ptrdiff_t n = 0; n < 71; ++n) {
|
||||
dict_sv_i32_insert_val_arg(&word_cnt_dict, {.str = "Flødeskum", .len = (ptrdiff_t)strlen("Flødeskum")}, 512, GUF_CPY_VALUE, GUF_CPY_VALUE);
|
||||
TEST_CHECK(word_cnt_dict.num_tombstones == --del);
|
||||
TEST_CHECK(dict_sv_i32_load_factor(&word_cnt_dict) == load_fac_beg);
|
||||
|
||||
TEST_CHECK(dict_sv_i32_erase_val_arg(&word_cnt_dict, {.str = "Flødeskum", .len = (ptrdiff_t)strlen("Flødeskum")}));
|
||||
TEST_CHECK(word_cnt_dict.num_tombstones == ++del);
|
||||
TEST_CHECK(dict_sv_i32_load_factor(&word_cnt_dict) == load_fac_beg);
|
||||
}
|
||||
TEST_CHECK(word_cnt_dict.kv_indices_cap == cap_begin);
|
||||
|
||||
TEST_CHECK(dict_sv_i32_erase_val_arg(&word_cnt_dict, {.str = "verden!", .len = (ptrdiff_t)strlen("verden!")}));
|
||||
TEST_CHECK(word_cnt_dict.num_tombstones == 0);
|
||||
TEST_CHECK(dict_sv_i32_load_factor(&word_cnt_dict) == 0);
|
||||
for (ptrdiff_t n = 0; n < 201; ++n) {
|
||||
dict_sv_i32_insert_val_arg(&word_cnt_dict, {.str = "verden!", .len = (ptrdiff_t)strlen("verden!")}, 128, GUF_CPY_VALUE, GUF_CPY_VALUE);
|
||||
TEST_CHECK(word_cnt_dict.num_tombstones == 0);
|
||||
TEST_CHECK(dict_sv_i32_load_factor(&word_cnt_dict) > 0);
|
||||
|
||||
TEST_CHECK(dict_sv_i32_erase_val_arg(&word_cnt_dict, {.str = "verden!", .len = (ptrdiff_t)strlen("verden!")}));
|
||||
TEST_CHECK(word_cnt_dict.num_tombstones == 0);
|
||||
TEST_CHECK(dict_sv_i32_load_factor(&word_cnt_dict) == 0);
|
||||
}
|
||||
TEST_CHECK(word_cnt_dict.kv_indices_cap == cap_begin);
|
||||
|
||||
TEST_CHECK(word_cnt_dict.kv_elems.size == 0);
|
||||
TEST_CHECK(dict_sv_i32_size(&word_cnt_dict) == 0);
|
||||
|
||||
|
||||
dict_sv_i32_free(&word_cnt_dict, NULL);
|
||||
bool dbuf_null = !word_cnt_dict.kv_elems.data && !word_cnt_dict.kv_elems.allocator && !word_cnt_dict.kv_elems.capacity && !word_cnt_dict.kv_elems.size;
|
||||
TEST_CHECK(dbuf_null && !word_cnt_dict.kv_indices && !word_cnt_dict.kv_indices_cap && !word_cnt_dict.max_probelen && !word_cnt_dict.num_tombstones);
|
||||
}
|
||||
|
||||
bool load_file()
|
||||
bool load_file(const char *fname)
|
||||
{
|
||||
FILE *in_file {nullptr};
|
||||
if (!in_file) {
|
||||
in_file = fopen(TEST_DATA_DIR "/utf8-test.txt", "r");
|
||||
in_file = fopen(fname, "r");
|
||||
}
|
||||
|
||||
if (!in_file) {
|
||||
return false;
|
||||
}
|
||||
GUF_ASSERT_RELEASE(in_file);
|
||||
|
||||
dbuf_char_init(&text_buf, 128, &guf_allocator_libc);
|
||||
|
||||
@ -208,6 +335,13 @@ struct DictSvToIntTest : public Test
|
||||
return TEST_CHECK(std::ssize(text_vec) == text_buf.size);
|
||||
}
|
||||
|
||||
// Releases the text loaded by load_file() so that another file can be loaded afterwards.
void free_file()
|
||||
{
|
||||
dbuf_char_free(&text_buf, NULL); // Free the buffer that load_file() set up via dbuf_char_init.
|
||||
text_buf = {}; // Reset text_buf to an empty (value-initialised) state.
|
||||
text_vec.clear(); // Also drop text_vec, whose size load_file() checks against text_buf.size.
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
bool run() override
|
||||
@ -216,19 +350,18 @@ struct DictSvToIntTest : public Test
|
||||
return passed;
|
||||
}
|
||||
|
||||
if (!TEST_CHECK(load_file())) {
|
||||
goto end;
|
||||
if (TEST_CHECK(load_file(TEST_DATA_DIR "/utf8-test.txt"))) {
|
||||
insert_lookup();
|
||||
}
|
||||
free_file();
|
||||
|
||||
insert_lookup();
|
||||
|
||||
end:
|
||||
dbuf_char_free(&text_buf, NULL);
|
||||
text_buf = {};
|
||||
if (TEST_CHECK(load_file(TEST_DATA_DIR "/bartleby.txt"))) {
|
||||
insert_lookup();
|
||||
}
|
||||
free_file();
|
||||
|
||||
passed = (num_failed_checks == 0);
|
||||
done = true;
|
||||
|
||||
return passed;
|
||||
}
|
||||
};
|
||||
|
||||
@ -363,6 +363,9 @@ struct UTF8Test : public Test
|
||||
read_utf8_chars(TEST_DATA_DIR "/" "utf8-test.txt", &valid, &invalid);
|
||||
TEST_CHECK(valid == 2634 && invalid == 0);
|
||||
|
||||
read_utf8_chars(TEST_DATA_DIR "/" "bartleby.txt", &valid, &invalid);
|
||||
TEST_CHECK(valid > 16000 && invalid == 0);
|
||||
|
||||
dbuf_str_view delims = dbuf_str_view_new(&guf_allocator_libc);
|
||||
for (size_t i = 0; i < GUF_STATIC_BUF_SIZE(GUF_UTF8_WHITESPACE); ++i) {
|
||||
guf_str_view d = {.len = (ptrdiff_t)strlen(GUF_UTF8_WHITESPACE[i]), .str = GUF_UTF8_WHITESPACE[i]};
|
||||
@ -376,11 +379,15 @@ struct UTF8Test : public Test
|
||||
TEST_CHECK(words == 422);
|
||||
int words_with_delims = count_words_with_delims(TEST_DATA_DIR "/" "utf8-test.txt", &delims);
|
||||
TEST_CHECK(words_with_delims == 949);
|
||||
|
||||
int words2 = count_words(TEST_DATA_DIR "/" "bartleby.txt", &delims);
|
||||
TEST_CHECK(words2 > 2048);
|
||||
|
||||
dbuf_str_view_free(&delims, NULL);
|
||||
|
||||
encode_decode();
|
||||
encode_decode_file(TEST_DATA_DIR "/" "utf8-test.txt");
|
||||
|
||||
encode_decode_file(TEST_DATA_DIR "/" "bartleby.txt");
|
||||
|
||||
done = true;
|
||||
passed = (num_failed_checks == 0);
|
||||
|
||||
6
todo.txt
6
todo.txt
@ -1,6 +1,6 @@
|
||||
- guf_stack, guf_queue, guf_ringbuf
|
||||
- guf_rand etc.: move guf_fn_keywords out of header guard? (-> no, add a GUF_WITHOUT_TYPES)
|
||||
|
||||
- unicode normalisation
|
||||
- fix 32-bit dict (and add 32/64 bit defs in common.h)
|
||||
- guf_dict: allow manual resize (and possibly resize if load fac gets too high after erase)
|
||||
- track allocs for test (implement alloc tracker)
|
||||
- handle right-to-left text properly
|
||||
- fix 32-bit dict (and add 32/64 bit defs and 32/64-bit platform detection in common.h)
|
||||
Loading…
x
Reference in New Issue
Block a user