From 4d97725bba98f5e042c024b32ca8382fa8453d5e Mon Sep 17 00:00:00 2001 From: jun <83899451+zeichensystem@users.noreply.github.com> Date: Sun, 2 Mar 2025 21:28:43 +0100 Subject: [PATCH] Fix guf_dict growth bug --- CMakeLists.txt | 4 +-- src/guf_dict.h | 11 +++++--- src/guf_str.h | 23 ++++++++++++++--- src/guf_utf8.h | 2 +- src/test/guf_dict_impl.c | 9 +++++++ src/test/guf_dict_impl.h | 9 +++++++ src/test/test.cpp | 2 +- src/test/test.hpp | 2 +- src/test/test_dict.hpp | 56 +++++++++++++++++++++++++++++++++++----- src/test/test_utf8.hpp | 1 - 10 files changed, 99 insertions(+), 20 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d568885..ff76fcf 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -18,10 +18,10 @@ if (NOT DEFINED CMAKE_RUNTIME_OUTPUT_DIRECTORY) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/bin) endif () -add_executable(libguf_example src/test/example.c src/test/guf_dict_impl.c) +add_executable(libguf_example src/test/example.c src/test/guf_str_impl.c src/test/guf_dict_impl.c) target_include_directories(libguf_example PRIVATE src src/test) -add_executable(libguf_test src/test/test.cpp src/test/guf_dbuf_impl.c src/test/guf_dict_impl.c src/test/guf_rand_impl.c src/test/guf_sort_impl.c src/test/guf_str_impl.c) +add_executable(libguf_test src/test/test.cpp src/test/guf_dbuf_impl.c src/test/guf_str_impl.c src/test/guf_dict_impl.c src/test/guf_rand_impl.c src/test/guf_sort_impl.c) target_include_directories(libguf_test PRIVATE src src/test) if (NOT DEFINED MSVC) diff --git a/src/guf_dict.h b/src/guf_dict.h index 52c0bcb..79c1f8a 100755 --- a/src/guf_dict.h +++ b/src/guf_dict.h @@ -300,6 +300,7 @@ static size_t GUF_CAT(GUF_DICT_NAME, _find_idx)(GUF_DICT_NAME *ht, const GUF_DIC probe: idx = GUF_MOD_CAP(idx + GUF_CAT(GUF_DICT_NAME, _probe_offset)(probe_len)); ++probe_len; + // printf("sz %td cap %td proble-len %td\n", ht->kv_elems.size, ht->kv_indices_cap, probe_len); GUF_ASSERT((ptrdiff_t)probe_len <= (ht->kv_elems.size + ht->num_tombstones)); } } while (idx != start_idx); @@ -328,7 +329,8 @@ GUF_DICT_KWRDS void GUF_CAT(GUF_DICT_NAME, _try_insert)(GUF_DICT_NAME *ht, GUF_D return; } - const ptrdiff_t KV_META_START_CAP = 64; // Must be a power of two > 0. + const double MAX_LOAD_FAC = 0.66; + const ptrdiff_t KV_META_START_CAP = 32; // Must be a power of two > 0. const ptrdiff_t KV_META_GROWTH_FAC = 2; // Must be a power of two > 0. guf_allocator *allocator = ht->kv_elems.allocator; @@ -345,7 +347,7 @@ GUF_DICT_KWRDS void GUF_CAT(GUF_DICT_NAME, _try_insert)(GUF_DICT_NAME *ht, GUF_D new_kv_indices[i].kv_idx = GUF_DICT_KV_IDX_NULL; new_kv_indices[i].key_hash = 0; } - } else if (GUF_CAT(GUF_DICT_NAME, _load_factor)(ht) > 0.6) { // 1.b) Grow kv-index-buffer. + } else if (GUF_CAT(GUF_DICT_NAME, _load_factor)(ht) > MAX_LOAD_FAC) { // 1.b) Grow kv-index-buffer. GUF_ASSERT(ht->kv_indices); const ptrdiff_t old_size = ht->kv_indices_cap * sizeof(GUF_DICT_KV_META_T); ptrdiff_t new_size = 0; @@ -371,12 +373,15 @@ GUF_DICT_KWRDS void GUF_CAT(GUF_DICT_NAME, _try_insert)(GUF_DICT_NAME *ht, GUF_D ht->kv_indices = new_kv_indices; ht->kv_indices_cap = new_kv_meta_cap; ht->num_tombstones = 0; + ht->max_probelen = 0; ptrdiff_t cnt = 0; for (ptrdiff_t i = 0; i < old_kv_indices_cap; ++i) { // Copy old kv-indices into new kv-index-buffer. if (old_kv_indices[i].kv_idx != GUF_DICT_KV_IDX_NULL && old_kv_indices[i].kv_idx != GUF_DICT_KV_IDX_TOMBSTONE) { bool key_exists = false; - size_t new_idx = GUF_CAT(GUF_DICT_NAME, _find_idx)(ht, key, &key_exists); + const GUF_DICT_KV_NAME *kv = GUF_CAT(GUF_DICT_KV_DBUF, _at)(&ht->kv_elems, old_kv_indices[i].kv_idx); + GUF_ASSERT(kv); + size_t new_idx = GUF_CAT(GUF_DICT_NAME, _find_idx)(ht, &kv->key, &key_exists); GUF_ASSERT(!key_exists); GUF_ASSERT(new_idx < SIZE_MAX); ht->kv_indices[new_idx] = old_kv_indices[i]; diff --git a/src/guf_str.h b/src/guf_str.h index e574a30..880b07c 100644 --- a/src/guf_str.h +++ b/src/guf_str.h @@ -10,6 +10,7 @@ #include "guf_alloc.h" #include "guf_str_view_type.h" #include "guf_utf8.h" +#include "guf_hash.h" typedef enum guf_str_state { GUF_STR_STATE_INIT = 0, @@ -85,6 +86,8 @@ GUF_STR_KWRDS bool guf_str_is_stack_allocated(const guf_str *str); GUF_STR_KWRDS bool guf_str_is_valid(const guf_str *str); GUF_STR_KWRDS bool guf_str_alloc_success(const guf_str *str); +GUF_STR_KWRDS guf_hash_size_t guf_str_view_hash(const guf_str_view *sv); + // Comparison: GUF_STR_KWRDS bool guf_str_view_equal(const guf_str_view* a, const guf_str_view* b); GUF_STR_KWRDS bool guf_str_equal(const guf_str *a, const guf_str *b); @@ -194,20 +197,32 @@ GUF_STR_KWRDS guf_str_view guf_substr_view(guf_str_view str, ptrdiff_t pos, ptrd return (guf_str_view){.str = str.str + pos, .len = substr_len}; } +GUF_STR_KWRDS guf_hash_size_t guf_str_view_hash(const guf_str_view *sv) +{ + GUF_ASSERT(sv); + if (!sv->str || sv->len <= 0) { + return GUF_HASH_INIT; + } + + return guf_hash(sv->str, sv->len, GUF_HASH_INIT); +} + // Comparison: GUF_STR_KWRDS bool guf_str_view_equal(const guf_str_view* a, const guf_str_view* b) { GUF_ASSERT_RELEASE(a && b); - GUF_ASSERT_RELEASE(a->str && b->str); if (a->len != b->len) { return false; } - GUF_ASSERT_RELEASE(a->len >= 0); - if (a->len == 0) { - return a->str == b->str; // Compare pointers by value here. + if ((!a->str && b->str) || (!b->str && a->str)) { + return false; + } else if (!a->str && !b->str) { + return a->len == b->len; } + GUF_ASSERT_RELEASE(a->len >= 0); + return 0 == memcmp(a->str, b->str, a->len); } diff --git a/src/guf_utf8.h b/src/guf_utf8.h index 41df69c..6b4b6d1 100644 --- a/src/guf_utf8.h +++ b/src/guf_utf8.h @@ -133,7 +133,7 @@ GUF_UTF8_KWRDS bool guf_utf8_encode(guf_utf8_char *result, uint32_t cp) for (int byte_n = num_bytes - 1; byte_n >= 0 && cp > 0; --byte_n) { const int bits = (byte_n == 0) ? first_byte_bits : tail_byte_bits; const uint32_t cp_mask = (UINT32_C(1) << bits) - 1; - result->bytes[byte_n] = (unsigned char)result->bytes[byte_n] | (cp & cp_mask); + result->bytes[byte_n] = (char)((unsigned char)result->bytes[byte_n] | (cp & cp_mask)); cp = cp >> bits; cp_bits += bits; } diff --git a/src/test/guf_dict_impl.c b/src/test/guf_dict_impl.c index e56e05d..e978418 100644 --- a/src/test/guf_dict_impl.c +++ b/src/test/guf_dict_impl.c @@ -9,6 +9,15 @@ #define GUF_DICT_IMPL #include "guf_dict.h" +#define GUF_DICT_KEY_T guf_str_view +#define GUF_DICT_KEY_HASH guf_str_view_hash +#define GUF_DICT_KEY_T_EQ guf_str_view_equal +#define GUF_DICT_VAL_T int32_t +#define GUF_DICT_VAL_T_IS_INTEGRAL_TYPE +#define GUF_DICT_NAME dict_sv_i32 +#define GUF_DICT_IMPL +#include "guf_dict.h" + #define GUF_DICT_KEY_T int32_t #define GUF_DICT_KEY_HASH int32_hash #define GUF_DICT_KEY_T_EQ int32_eq diff --git a/src/test/guf_dict_impl.h b/src/test/guf_dict_impl.h index 84845f2..ab37b63 100644 --- a/src/test/guf_dict_impl.h +++ b/src/test/guf_dict_impl.h @@ -3,6 +3,7 @@ #include "guf_common.h" #include "guf_cstr.h" +#include "guf_str.h" #define GUF_DICT_KEY_T guf_cstr_const #define GUF_DICT_KEY_HASH guf_cstr_const_hash @@ -12,6 +13,14 @@ #define GUF_DICT_NAME dict_cstr_int #include "guf_dict.h" +#define GUF_DICT_KEY_T guf_str_view +#define GUF_DICT_KEY_HASH guf_str_view_hash +#define GUF_DICT_KEY_T_EQ guf_str_view_equal +#define GUF_DICT_VAL_T int32_t +#define GUF_DICT_VAL_T_IS_INTEGRAL_TYPE +#define GUF_DICT_NAME dict_sv_i32 +#include "guf_dict.h" + static inline guf_hash_size_t int32_hash(const int32_t *a) { return guf_hash(a, sizeof(int32_t), GUF_HASH_INIT); diff --git a/src/test/test.cpp b/src/test/test.cpp index fde5183..21e143e 100644 --- a/src/test/test.cpp +++ b/src/test/test.cpp @@ -23,7 +23,7 @@ void init_tests() GUF_ASSERT_RELEASE(test.get()); g_tests.insert(std::move(test)); - test = std::make_unique("DictCstrToIntTest"); + test = std::make_unique("DictSvToIntTest"); GUF_ASSERT_RELEASE(test.get()); g_tests.insert(std::move(test)); diff --git a/src/test/test.hpp b/src/test/test.hpp index 2f3978a..a5f8b52 100644 --- a/src/test/test.hpp +++ b/src/test/test.hpp @@ -10,7 +10,7 @@ #include "guf_common.h" -#define TEST_CHECK(COND) check((COND), GUF_STRINGIFY(COND), __LINE__, __FILE__) +#define TEST_CHECK(COND) (check((COND), GUF_STRINGIFY(COND), __LINE__, __FILE__)) struct Test { diff --git a/src/test/test_dict.hpp b/src/test/test_dict.hpp index 707230b..282b78d 100644 --- a/src/test/test_dict.hpp +++ b/src/test/test_dict.hpp @@ -9,10 +9,10 @@ extern "C" #include "guf_str.h" } -struct DictCstrToIntTest : public Test +struct DictSvToIntTest : public Test { - DictCstrToIntTest(const std::string& name) : Test(name) {}; + DictSvToIntTest(const std::string& name) : Test(name) {}; private: @@ -21,9 +21,9 @@ struct DictCstrToIntTest : public Test void insert_lookup() { - std::unordered_map word_cnt_map {}; - dict_cstr_int word_cnt_dict {}; - dict_cstr_int_init(&word_cnt_dict, &guf_allocator_libc); + std::unordered_map word_cnt_map {}; + dict_sv_i32 word_cnt_dict {}; + dict_sv_i32_init(&word_cnt_dict, &guf_allocator_libc); dbuf_str_view delims = dbuf_str_view_new(&guf_allocator_libc); for (size_t i = 0; i < GUF_STATIC_BUF_SIZE(GUF_UTF8_WHITESPACE); ++i) { @@ -37,14 +37,56 @@ struct DictCstrToIntTest : public Test guf_str_view input_str = {.str = text_buf.data, .len = text_buf.size}; guf_str_view tok; while ((tok = guf_str_next_tok(&input_str, delims.data, delims.size, NULL, -1)).len) { + if (tok.len <= 0) { + continue; + } + std::string_view sv(tok.str, tok.len); + TEST_CHECK(dict_sv_i32_contains(&word_cnt_dict, &tok) == word_cnt_map.contains(sv)); + if (!dict_sv_i32_contains(&word_cnt_dict, &tok)) { + dict_sv_i32_insert_val_arg(&word_cnt_dict, tok, 1, GUF_CPY_VALUE, GUF_CPY_VALUE); + word_cnt_map.insert({sv, 1}); + } else { + int32_t *cnt = dict_sv_i32_at_val_arg(&word_cnt_dict, tok); + if (TEST_CHECK(cnt)) { + *cnt += 1; + } + word_cnt_map.at(sv) += 1; + } // printf("tok_len: %td ", tok.len); // printf("'%.*s'\n", (int)tok.len, tok.str); } dbuf_str_view_free(&delims, NULL); - dict_cstr_int_free(&word_cnt_dict, NULL); + TEST_CHECK(dict_sv_i32_size(&word_cnt_dict) == std::ssize(word_cnt_map)); + + for (const auto & [word, cnt] : word_cnt_map ) { + guf_str_view sv = {.str = word.data(), .len = (ptrdiff_t)word.size()}; + int32_t *res = dict_sv_i32_at(&word_cnt_dict, &sv); + TEST_CHECK(res && *res == cnt); + } + + ptrdiff_t i = 0; + GUF_CNT_FOREACH(&word_cnt_dict, dict_sv_i32, kv_it) { + const dict_sv_i32_kv *kv = kv_it.ptr; + if (TEST_CHECK(kv)) { + const int32_t cnt = kv->val; + // printf("%.*s: %d\n", (int)kv->key.len, kv->key.str, cnt); + const std::string_view sv(kv->key.str, kv->key.len); + if (TEST_CHECK(word_cnt_map.contains(sv))) { + TEST_CHECK(word_cnt_map.at(sv) == cnt); + } + } + ++i; + } + TEST_CHECK(i == dict_sv_i32_size(&word_cnt_dict)); + TEST_CHECK(i == std::ssize(word_cnt_map)); + + // std::cout << "load fac: " << dict_sv_i32_load_factor(&word_cnt_dict) << ", cap: " << word_cnt_dict.kv_indices_cap << "\n"; + // std::cout << "size: " << dict_sv_i32_size(&word_cnt_dict) << ", max probelen: " << word_cnt_dict.max_probelen << "\n"; + + dict_sv_i32_free(&word_cnt_dict, NULL); bool dbuf_null = !word_cnt_dict.kv_elems.data && !word_cnt_dict.kv_elems.allocator && !word_cnt_dict.kv_elems.capacity && !word_cnt_dict.kv_elems.size; - TEST_CHECK(!dbuf_null && !word_cnt_dict.kv_indices && !word_cnt_dict.kv_indices_cap && !word_cnt_dict.max_probelen && !word_cnt_dict.num_tombstones); + TEST_CHECK(dbuf_null && !word_cnt_dict.kv_indices && !word_cnt_dict.kv_indices_cap && !word_cnt_dict.max_probelen && !word_cnt_dict.num_tombstones); } bool load_file() diff --git a/src/test/test_utf8.hpp b/src/test/test_utf8.hpp index ce70ac1..1fe759e 100644 --- a/src/test/test_utf8.hpp +++ b/src/test/test_utf8.hpp @@ -386,5 +386,4 @@ struct UTF8Test : public Test passed = (num_failed_checks == 0); return passed; } - };