From 8e6ffcdc70611367d7852440dcaf4c0b0782df2e Mon Sep 17 00:00:00 2001 From: jun <83899451+zeichensystem@users.noreply.github.com> Date: Mon, 3 Mar 2025 16:14:13 +0100 Subject: [PATCH] Add dict_erase tests --- src/guf_dict.h | 126 ++++++++++++++++++++++++++++++++--------- src/test/test_dict.hpp | 94 ++++++++++++++++++++++++++++++ todo.txt | 3 +- 3 files changed, 196 insertions(+), 27 deletions(-) diff --git a/src/guf_dict.h b/src/guf_dict.h index 4f7e131..fe6b24f 100755 --- a/src/guf_dict.h +++ b/src/guf_dict.h @@ -152,6 +152,7 @@ GUF_DICT_KWRDS ptrdiff_t GUF_CAT(GUF_DICT_NAME, _iter_to_idx)(const GUF_DICT_NAM GUF_DICT_KWRDS GUF_CAT(GUF_DICT_NAME, _iter) GUF_CAT(GUF_DICT_NAME, _find_val_if)(GUF_DICT_NAME *ht, GUF_CAT(GUF_DICT_NAME, _iter) begin, GUF_CAT(GUF_DICT_NAME, _iter) end, bool (*predicate)(const GUF_DICT_VAL_T *)); #endif +GUF_DICT_KWRDS bool GUF_CAT(GUF_DICT_NAME, _debug_valid_size)(const GUF_DICT_NAME *ht); // #define GUF_DICT_IMPL /* DEBUGGGGGGGGG */ @@ -159,6 +160,17 @@ GUF_DICT_KWRDS ptrdiff_t GUF_CAT(GUF_DICT_NAME, _iter_to_idx)(const GUF_DICT_NAM #include "guf_assert.h" +GUF_DICT_KWRDS bool GUF_CAT(GUF_DICT_NAME, _debug_valid_size)(const GUF_DICT_NAME *ht) +{ + ptrdiff_t cnt = 0; + for (ptrdiff_t i = 0; i < ht->kv_indices_cap; ++i) { + if (ht->kv_indices[i].kv_idx != GUF_DICT_KV_IDX_NULL && ht->kv_indices[i].kv_idx != GUF_DICT_KV_IDX_TOMBSTONE) { + ++cnt; + } + } + return cnt == ht->kv_elems.size; +} + static inline void GUF_CAT(GUF_DICT_KV_NAME, _free)(GUF_DICT_KV_NAME *kv, void *ctx) { (void)ctx; @@ -278,7 +290,7 @@ static size_t GUF_CAT(GUF_DICT_NAME, _find_idx)(GUF_DICT_NAME *ht, const GUF_DIC const size_t start_idx = idx; size_t first_tombstone_idx = SIZE_MAX; size_t probe_len = 0; - + // size_t seen_occupied = 0; // This allows us to bail out early once we visited every non-null/non-tombstone kv_idx. do { if (ht->kv_indices[idx].kv_idx == GUF_DICT_KV_IDX_NULL) { // 1.) Empty. 
if (first_tombstone_idx != SIZE_MAX) { @@ -293,15 +305,17 @@ static size_t GUF_CAT(GUF_DICT_NAME, _find_idx)(GUF_DICT_NAME *ht, const GUF_DIC first_tombstone_idx = idx; } goto probe; - } else if (hash == ht->kv_indices[idx].key_hash && GUF_DICT_KEY_T_EQ(key, &GUF_CAT(GUF_DICT_KV_DBUF, _at)(&ht->kv_elems, ht->kv_indices[idx].kv_idx)->key)) { // 3.) Key already exists. + } else if (hash == ht->kv_indices[idx].key_hash && GUF_DICT_KEY_T_EQ(key, &(GUF_CAT(GUF_DICT_KV_DBUF, _at)(&ht->kv_elems, ht->kv_indices[idx].kv_idx)->key))) { // 3.) Key already exists. ht->max_probelen = GUF_MAX((ptrdiff_t)probe_len, ht->max_probelen); *key_exists = true; return idx; - } else { // 4.) Have to probe due to hash-collision (idx is already occupied, but not by the key). + } else { // 4.) Have to probe due to hash-collision/tombstone. probe: - ++probe_len; - // NOTE: Add the probe_offset to start_idx and not to idx. - idx = GUF_MOD_CAP(start_idx + GUF_CAT(GUF_DICT_NAME, _probe_offset)(probe_len)); + ++probe_len; + // if (ht->kv_indices[idx].kv_idx != GUF_DICT_KV_IDX_NULL && ht->kv_indices[idx].kv_idx != GUF_DICT_KV_IDX_TOMBSTONE) { + // ++seen_occupied; // && seen_occupied <= (size_t)ht->kv_elems.size + // } + idx = GUF_MOD_CAP(start_idx + GUF_CAT(GUF_DICT_NAME, _probe_offset)(probe_len)); // NOTE: Add probe_offset to start_idx and not to idx. 
GUF_ASSERT((ptrdiff_t)probe_len <= (ht->kv_elems.size + ht->num_tombstones)); } } while (idx != start_idx && probe_len < (size_t)ht->kv_indices_cap); @@ -317,19 +331,10 @@ static size_t GUF_CAT(GUF_DICT_NAME, _find_idx)(GUF_DICT_NAME *ht, const GUF_DIC #undef GUF_MOD_CAP } -GUF_DICT_KWRDS void GUF_CAT(GUF_DICT_NAME, _try_insert)(GUF_DICT_NAME *ht, GUF_DICT_KEY_T *key, GUF_DICT_VAL_T *val, guf_cpy_opt key_opt, guf_cpy_opt val_opt, guf_err *err) + +static void GUF_CAT(GUF_DICT_NAME, _try_grow_if_necessary)(GUF_DICT_NAME *ht, guf_err *err) { GUF_ASSERT_RELEASE(GUF_CAT(GUF_DICT_NAME, _valid)(ht)); - - if (!key || !val) { - guf_err_set_or_panic(err, GUF_ERR_INVALID_ARG, GUF_ERR_MSG("in function dict_try_insert: key or val argument is NULL")); - return; - } - if ((size_t)ht->kv_elems.size == GUF_DICT_MAX_SIZE) { - guf_err_set_or_panic(err, GUF_ERR_INT_OVERFLOW, GUF_ERR_MSG("in function dict_try_insert: dict has reached its max size (UINT64_MAX - 2 or UINT32_MAX - 2)")); - return; - } - #ifdef GUF_DICT_PROBE_LINEAR const double MAX_LOAD_FAC = 0.6; #else @@ -343,7 +348,7 @@ GUF_DICT_KWRDS void GUF_CAT(GUF_DICT_NAME, _try_insert)(GUF_DICT_NAME *ht, GUF_D if (ht->kv_indices_cap == 0) { // 1.a) Allocate initial kv-index-buffer. GUF_DICT_KV_META_T *new_kv_indices = allocator->alloc(KV_META_START_CAP * sizeof(GUF_DICT_KV_META_T), allocator->ctx); if (new_kv_indices == NULL) { - guf_err_set_or_panic(err, GUF_ERR_ALLOC_FAIL, GUF_ERR_MSG("in function dict_try_insert: Initial allocation failed")); + guf_err_set_or_panic(err, GUF_ERR_ALLOC_FAIL, GUF_ERR_MSG("in function dict_try_grow: Initial allocation failed")); return; } ht->kv_indices = new_kv_indices; @@ -352,7 +357,7 @@ GUF_DICT_KWRDS void GUF_CAT(GUF_DICT_NAME, _try_insert)(GUF_DICT_NAME *ht, GUF_D new_kv_indices[i].kv_idx = GUF_DICT_KV_IDX_NULL; new_kv_indices[i].key_hash = 0; } - } else if (GUF_CAT(GUF_DICT_NAME, _load_factor)(ht) > MAX_LOAD_FAC) { // 1.b) Grow kv-index-buffer. 
+ } else if (GUF_CAT(GUF_DICT_NAME, _load_factor)(ht) > MAX_LOAD_FAC) { // 1.b) Grow kv-index-buffer if necessary. GUF_ASSERT(ht->kv_indices); const ptrdiff_t old_size = ht->kv_indices_cap * sizeof(GUF_DICT_KV_META_T); ptrdiff_t new_size = 0; @@ -371,13 +376,13 @@ GUF_DICT_KWRDS void GUF_CAT(GUF_DICT_NAME, _try_insert)(GUF_DICT_NAME *ht, GUF_D ht->kv_indices_cap = ht->kv_indices_cap * KV_META_GROWTH_FAC; ht->num_tombstones = 0; // ht->max_probelen = 0; - + for (ptrdiff_t i = 0; i < ht->kv_indices_cap; ++i) { ht->kv_indices[i].kv_idx = GUF_DICT_KV_IDX_NULL; ht->kv_indices[i].key_hash = 0; } - for (ptrdiff_t kv_idx = 0; kv_idx < ht->kv_elems.size; ++kv_idx) { + for (ptrdiff_t kv_idx = 0; kv_idx < ht->kv_elems.size; ++kv_idx) { // Re-insert keys. const GUF_DICT_KV_NAME *kv = GUF_CAT(GUF_DICT_KV_DBUF, _at)(&ht->kv_elems, kv_idx); GUF_ASSERT(kv); bool key_exists = false; @@ -389,11 +394,33 @@ GUF_DICT_KWRDS void GUF_CAT(GUF_DICT_NAME, _try_insert)(GUF_DICT_NAME *ht, GUF_D } } + guf_err_set_if_not_null(err, GUF_ERR_NONE); GUF_ASSERT(GUF_CAT(GUF_DICT_NAME, _load_factor)(ht) <= MAX_LOAD_FAC); +} - // 2.) Insert new key-value pair. + +GUF_DICT_KWRDS void GUF_CAT(GUF_DICT_NAME, _try_insert)(GUF_DICT_NAME *ht, GUF_DICT_KEY_T *key, GUF_DICT_VAL_T *val, guf_cpy_opt key_opt, guf_cpy_opt val_opt, guf_err *err) +{ + GUF_ASSERT_RELEASE(GUF_CAT(GUF_DICT_NAME, _valid)(ht)); + + if (!key || !val) { + guf_err_set_or_panic(err, GUF_ERR_INVALID_ARG, GUF_ERR_MSG("in function dict_try_insert: key or val argument is NULL")); + return; + } + if ((size_t)ht->kv_elems.size == GUF_DICT_MAX_SIZE) { + guf_err_set_or_panic(err, GUF_ERR_INT_OVERFLOW, GUF_ERR_MSG("in function dict_try_insert: dict has reached its max size (UINT64_MAX - 2 or UINT32_MAX - 2)")); + return; + } + + // 1.) Grow kv-index-buffer if necessary (or make the initial allocation.) 
+ GUF_CAT(GUF_DICT_NAME, _try_grow_if_necessary)(ht, err); + if (err != NULL && *err != GUF_ERR_NONE) { + guf_err_set_or_panic(err, *err, GUF_ERR_MSG("in function dict_try_insert: try_grow failed.")); + return; + } GUF_ASSERT_RELEASE(ht->kv_indices_cap > ht->kv_elems.size); + // 2.) Insert new key-value pair. bool key_exists = false; size_t idx = GUF_CAT(GUF_DICT_NAME, _find_idx)(ht, key, &key_exists); if (key_exists) { @@ -463,6 +490,16 @@ GUF_DICT_KWRDS void GUF_CAT(GUF_DICT_NAME, _try_insert)(GUF_DICT_NAME *ht, GUF_D GUF_DICT_KV_NAME kv = {.key = key_cpy, .val = val_cpy}; GUF_CAT(GUF_DICT_KV_DBUF, _try_push_val)(&ht->kv_elems, kv, err); + + if (err && *err != GUF_ERR_NONE) { // Insertion failed. + GUF_ASSERT(*err != GUF_ERR_IDX_RANGE && *err != GUF_ERR_INVALID_ARG); + #ifdef GUF_DICT_KEY_T_FREE + GUF_DICT_KEY_T_FREE(&kv.key, NULL); + #endif + #ifdef GUF_DICT_VAL_T_FREE + GUF_DICT_VAL_T_FREE(&kv.val, NULL); + #endif + } } GUF_DICT_KWRDS void GUF_CAT(GUF_DICT_NAME, _insert)(GUF_DICT_NAME *ht, GUF_DICT_KEY_T *key, GUF_DICT_VAL_T *val, guf_cpy_opt key_opt, guf_cpy_opt val_opt) @@ -517,6 +554,11 @@ GUF_DICT_KWRDS bool GUF_CAT(GUF_DICT_NAME, _contains)(GUF_DICT_NAME *ht, const G } bool key_exists = false; const size_t idx = GUF_CAT(GUF_DICT_NAME, _find_idx)(ht, key, &key_exists); + if (key_exists) { + GUF_ASSERT(idx != SIZE_MAX); + GUF_ASSERT(ht->kv_indices[idx].kv_idx != GUF_DICT_KV_IDX_TOMBSTONE); + GUF_ASSERT(ht->kv_indices[idx].kv_idx != GUF_DICT_KV_IDX_NULL); + } (void)idx; return key_exists; } @@ -540,13 +582,16 @@ GUF_DICT_KWRDS bool GUF_CAT(GUF_DICT_NAME, _erase)(GUF_DICT_NAME *ht, const GUF_ if (!key_exists) { return false; } - GUF_ASSERT((ptrdiff_t)idx < ht->kv_indices_cap); + const size_t kv_idx = ht->kv_indices[idx].kv_idx; + GUF_ASSERT(kv_idx < (size_t)ht->kv_elems.size); ht->kv_indices[idx].kv_idx = GUF_DICT_KV_IDX_TOMBSTONE; ht->kv_indices[idx].key_hash = 0; + ht->num_tombstones += 1; + GUF_DICT_KV_NAME *kv = GUF_CAT(GUF_DICT_KV_DBUF, 
_at)(&ht->kv_elems, kv_idx); GUF_ASSERT(kv); @@ -556,16 +601,45 @@ GUF_DICT_KWRDS bool GUF_CAT(GUF_DICT_NAME, _erase)(GUF_DICT_NAME *ht, const GUF_ // 1.) Switch kv_elem. GUF_DICT_KV_NAME *last_kv = GUF_CAT(GUF_DICT_KV_DBUF, _back)(&ht->kv_elems); GUF_ASSERT(last_kv); + GUF_ASSERT(kv != last_kv); + *kv = *last_kv; + + GUF_ASSERT(!GUF_DICT_KEY_T_EQ(key, &last_kv->key)); + // 2.) Update kv_index. bool last_key_exists = false; - const size_t last_idx = GUF_CAT(GUF_DICT_NAME, _find_idx)(ht, &kv->key, &last_key_exists); + const size_t last_idx = GUF_CAT(GUF_DICT_NAME, _find_idx)(ht, &last_kv->key, &last_key_exists); + GUF_ASSERT(last_idx != idx); GUF_ASSERT(last_key_exists && (ptrdiff_t)last_idx < ht->kv_indices_cap); + GUF_ASSERT((ptrdiff_t)ht->kv_indices[last_idx].kv_idx == ht->kv_elems.size - 1); + GUF_ASSERT(ht->kv_indices[last_idx].kv_idx != GUF_DICT_KV_IDX_TOMBSTONE && ht->kv_indices[last_idx].kv_idx != GUF_DICT_KV_IDX_NULL); ht->kv_indices[last_idx].kv_idx = kv_idx; } - ht->kv_elems.size--; - ht->num_tombstones++; + ht->kv_elems.size -= 1; + + GUF_ASSERT(ht->kv_elems.size >= 0); + GUF_ASSERT(ht->num_tombstones <= ht->kv_indices_cap); + + GUF_ASSERT(!GUF_CAT(GUF_DICT_NAME, _contains)(ht, key)); + + if (ht->kv_elems.size == 0 && ht->num_tombstones > 0) { // Optimisation: We can delete all tombstones here. 
+ ptrdiff_t del_tombstone_cnt = 0; + for (ptrdiff_t i = 0; i < ht->kv_indices_cap && del_tombstone_cnt < ht->num_tombstones; ++i) { + GUF_ASSERT(ht->kv_indices[i].kv_idx == GUF_DICT_KV_IDX_TOMBSTONE || ht->kv_indices[i].kv_idx == GUF_DICT_KV_IDX_NULL); + if (ht->kv_indices[i].kv_idx == GUF_DICT_KV_IDX_TOMBSTONE) { + ht->kv_indices[i].kv_idx = GUF_DICT_KV_IDX_NULL; + ht->kv_indices[i].key_hash = 0; + ++del_tombstone_cnt; + } else { + GUF_ASSERT(ht->kv_indices[i].kv_idx == GUF_DICT_KV_IDX_NULL); + } + } + GUF_ASSERT(del_tombstone_cnt == ht->num_tombstones); + ht->num_tombstones = 0; + } + return true; } diff --git a/src/test/test_dict.hpp b/src/test/test_dict.hpp index 01141d1..a0138d2 100644 --- a/src/test/test_dict.hpp +++ b/src/test/test_dict.hpp @@ -33,6 +33,7 @@ struct DictSvToIntTest : public Test guf_str_view d = {.len = (ptrdiff_t)strlen(GUF_UTF8_COMMON_PUNCT[i]), .str = GUF_UTF8_COMMON_PUNCT[i]}; dbuf_str_view_push_val(&delims, d); } + guf_str_view input_str = {.str = text_buf.data, .len = text_buf.size}; guf_str_view tok; while ((tok = guf_str_next_tok(&input_str, delims.data, delims.size, NULL, -1)).len) { @@ -53,10 +54,12 @@ struct DictSvToIntTest : public Test } // printf("tok_len: %td ", tok.len); // printf("'%.*s'\n", (int)tok.len, tok.str); + TEST_CHECK(dict_sv_i32_debug_valid_size(&word_cnt_dict)); } dbuf_str_view_free(&delims, NULL); TEST_CHECK(dict_sv_i32_size(&word_cnt_dict) == std::ssize(word_cnt_map)); + TEST_CHECK(dict_sv_i32_debug_valid_size(&word_cnt_dict)); for (const auto & [word, cnt] : word_cnt_map ) { guf_str_view sv = {.str = word.data(), .len = (ptrdiff_t)word.size()}; @@ -79,10 +82,101 @@ struct DictSvToIntTest : public Test } TEST_CHECK(i == dict_sv_i32_size(&word_cnt_dict)); TEST_CHECK(i == std::ssize(word_cnt_map)); + TEST_CHECK(dict_sv_i32_debug_valid_size(&word_cnt_dict)); std::cout << "load fac: " << dict_sv_i32_load_factor(&word_cnt_dict) << ", cap: " << word_cnt_dict.kv_indices_cap << "\n"; std::cout << "size: " << 
dict_sv_i32_size(&word_cnt_dict) << ", max probelen: " << word_cnt_dict.max_probelen << "\n"; + const double load_fac_before_erase = dict_sv_i32_load_factor(&word_cnt_dict); + const ptrdiff_t size_before_erase = dict_sv_i32_size(&word_cnt_dict); + ptrdiff_t num_del = 0; + while (dict_sv_i32_size(&word_cnt_dict) > size_before_erase / 2) { + dict_sv_i32_kv *kv = NULL; + if (num_del % 2) { + dict_sv_i32_iter it = dict_sv_i32_begin(&word_cnt_dict); + GUF_ASSERT_RELEASE(!dict_sv_i32_iter_is_end(&word_cnt_dict, it)); + kv = it.ptr; + } else { + dict_sv_i32_iter rit = dict_sv_i32_rbegin(&word_cnt_dict); + GUF_ASSERT_RELEASE(!dict_sv_i32_iter_is_end(&word_cnt_dict, rit)); + kv = rit.ptr; + } + GUF_ASSERT_RELEASE(kv); + + const guf_str_view key = kv->key; + + const bool del_success = dict_sv_i32_erase(&word_cnt_dict, &key); + TEST_CHECK(del_success); + TEST_CHECK(!dict_sv_i32_contains(&word_cnt_dict, &key)); + + std::string_view sv(key.str, (size_t)key.len); + if (TEST_CHECK(word_cnt_map.contains(sv))) { + word_cnt_map.erase(sv); + } + TEST_CHECK(!word_cnt_map.contains(sv)); + + if (del_success) { + ++num_del; + } + } + TEST_CHECK(dict_sv_i32_size(&word_cnt_dict) >= 0); + TEST_CHECK(size_before_erase - num_del == dict_sv_i32_size(&word_cnt_dict)); + TEST_CHECK(std::ssize(word_cnt_map) == dict_sv_i32_size(&word_cnt_dict)); + + if (dict_sv_i32_size(&word_cnt_dict) != 0) { + TEST_CHECK(load_fac_before_erase == dict_sv_i32_load_factor(&word_cnt_dict)); + } else { + TEST_CHECK(dict_sv_i32_load_factor(&word_cnt_dict) == 0); + } + + if (dict_sv_i32_size(&word_cnt_dict) >= 4) { + dict_sv_i32_kv_dbuf_iter it = dict_sv_i32_begin(&word_cnt_dict); + it = dict_sv_i32_iter_next(&word_cnt_dict, it, 1); + GUF_ASSERT_RELEASE(!dict_sv_i32_iter_is_end(&word_cnt_dict, it)); + + guf_str_view key = it.ptr->key; + + bool del_success = dict_sv_i32_erase(&word_cnt_dict, &key); + TEST_CHECK(del_success); + TEST_CHECK(!dict_sv_i32_contains(&word_cnt_dict, &key)); + + std::string_view sv(key.str, 
(size_t)key.len); + if (TEST_CHECK(word_cnt_map.contains(sv))) { + word_cnt_map.erase(sv); + } + + it = dict_sv_i32_rbegin(&word_cnt_dict); + it = dict_sv_i32_iter_next(&word_cnt_dict, it, 1); + GUF_ASSERT_RELEASE(!dict_sv_i32_iter_is_end(&word_cnt_dict, it)); + key = it.ptr->key; + + del_success = dict_sv_i32_erase(&word_cnt_dict, &key); + TEST_CHECK(del_success); + TEST_CHECK(!dict_sv_i32_contains(&word_cnt_dict, &key)); + + sv = std::string_view(key.str, (size_t)key.len); + if (TEST_CHECK(word_cnt_map.contains(sv))) { + word_cnt_map.erase(sv); + } + } + TEST_CHECK(std::ssize(word_cnt_map) == dict_sv_i32_size(&word_cnt_dict)); + + i = 0; + GUF_CNT_FOREACH(&word_cnt_dict, dict_sv_i32, kv_it) { + const dict_sv_i32_kv *kv = kv_it.ptr; + if (TEST_CHECK(kv)) { + const int32_t cnt = kv->val; + const std::string_view sv(kv->key.str, (size_t)kv->key.len); + if (TEST_CHECK(word_cnt_map.contains(sv))) { + TEST_CHECK(word_cnt_map.at(sv) == cnt); + } + ++i; + } + } + TEST_CHECK(i == word_cnt_dict.kv_elems.size); + TEST_CHECK(i == std::ssize(word_cnt_map)); + + dict_sv_i32_free(&word_cnt_dict, NULL); bool dbuf_null = !word_cnt_dict.kv_elems.data && !word_cnt_dict.kv_elems.allocator && !word_cnt_dict.kv_elems.capacity && !word_cnt_dict.kv_elems.size; TEST_CHECK(dbuf_null && !word_cnt_dict.kv_indices && !word_cnt_dict.kv_indices_cap && !word_cnt_dict.max_probelen && !word_cnt_dict.num_tombstones); diff --git a/todo.txt b/todo.txt index 230f7a0..9d58cfb 100644 --- a/todo.txt +++ b/todo.txt @@ -2,4 +2,5 @@ - guf_rand etc.: move guf_fn_keywors out of header guard? (-> no, add a GUF_WITHOUT_TYPES) - unicode normalisation -- fix 32-bit dict (and add 32/64 bit defs in common.h) \ No newline at end of file +- fix 32-bit dict (and add 32/64 bit defs in common.h) +- guf_dict: allow manual resize (and possibly resize if load fac gets too high after erase) \ No newline at end of file