Fix guf_dict growth bug

This commit is contained in:
jun 2025-03-02 21:28:43 +01:00
parent cd1c1cd5db
commit 4d97725bba
10 changed files with 99 additions and 20 deletions

View File

@ -18,10 +18,10 @@ if (NOT DEFINED CMAKE_RUNTIME_OUTPUT_DIRECTORY)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/bin) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/bin)
endif () endif ()
add_executable(libguf_example src/test/example.c src/test/guf_dict_impl.c) add_executable(libguf_example src/test/example.c src/test/guf_str_impl.c src/test/guf_dict_impl.c)
target_include_directories(libguf_example PRIVATE src src/test) target_include_directories(libguf_example PRIVATE src src/test)
add_executable(libguf_test src/test/test.cpp src/test/guf_dbuf_impl.c src/test/guf_dict_impl.c src/test/guf_rand_impl.c src/test/guf_sort_impl.c src/test/guf_str_impl.c) add_executable(libguf_test src/test/test.cpp src/test/guf_dbuf_impl.c src/test/guf_str_impl.c src/test/guf_dict_impl.c src/test/guf_rand_impl.c src/test/guf_sort_impl.c)
target_include_directories(libguf_test PRIVATE src src/test) target_include_directories(libguf_test PRIVATE src src/test)
if (NOT DEFINED MSVC) if (NOT DEFINED MSVC)

View File

@ -300,6 +300,7 @@ static size_t GUF_CAT(GUF_DICT_NAME, _find_idx)(GUF_DICT_NAME *ht, const GUF_DIC
probe: probe:
idx = GUF_MOD_CAP(idx + GUF_CAT(GUF_DICT_NAME, _probe_offset)(probe_len)); idx = GUF_MOD_CAP(idx + GUF_CAT(GUF_DICT_NAME, _probe_offset)(probe_len));
++probe_len; ++probe_len;
// printf("sz %td cap %td probe-len %td\n", ht->kv_elems.size, ht->kv_indices_cap, probe_len);
GUF_ASSERT((ptrdiff_t)probe_len <= (ht->kv_elems.size + ht->num_tombstones)); GUF_ASSERT((ptrdiff_t)probe_len <= (ht->kv_elems.size + ht->num_tombstones));
} }
} while (idx != start_idx); } while (idx != start_idx);
@ -328,7 +329,8 @@ GUF_DICT_KWRDS void GUF_CAT(GUF_DICT_NAME, _try_insert)(GUF_DICT_NAME *ht, GUF_D
return; return;
} }
const ptrdiff_t KV_META_START_CAP = 64; // Must be a power of two > 0. const double MAX_LOAD_FAC = 0.66;
const ptrdiff_t KV_META_START_CAP = 32; // Must be a power of two > 0.
const ptrdiff_t KV_META_GROWTH_FAC = 2; // Must be a power of two > 0. const ptrdiff_t KV_META_GROWTH_FAC = 2; // Must be a power of two > 0.
guf_allocator *allocator = ht->kv_elems.allocator; guf_allocator *allocator = ht->kv_elems.allocator;
@ -345,7 +347,7 @@ GUF_DICT_KWRDS void GUF_CAT(GUF_DICT_NAME, _try_insert)(GUF_DICT_NAME *ht, GUF_D
new_kv_indices[i].kv_idx = GUF_DICT_KV_IDX_NULL; new_kv_indices[i].kv_idx = GUF_DICT_KV_IDX_NULL;
new_kv_indices[i].key_hash = 0; new_kv_indices[i].key_hash = 0;
} }
} else if (GUF_CAT(GUF_DICT_NAME, _load_factor)(ht) > 0.6) { // 1.b) Grow kv-index-buffer. } else if (GUF_CAT(GUF_DICT_NAME, _load_factor)(ht) > MAX_LOAD_FAC) { // 1.b) Grow kv-index-buffer.
GUF_ASSERT(ht->kv_indices); GUF_ASSERT(ht->kv_indices);
const ptrdiff_t old_size = ht->kv_indices_cap * sizeof(GUF_DICT_KV_META_T); const ptrdiff_t old_size = ht->kv_indices_cap * sizeof(GUF_DICT_KV_META_T);
ptrdiff_t new_size = 0; ptrdiff_t new_size = 0;
@ -371,12 +373,15 @@ GUF_DICT_KWRDS void GUF_CAT(GUF_DICT_NAME, _try_insert)(GUF_DICT_NAME *ht, GUF_D
ht->kv_indices = new_kv_indices; ht->kv_indices = new_kv_indices;
ht->kv_indices_cap = new_kv_meta_cap; ht->kv_indices_cap = new_kv_meta_cap;
ht->num_tombstones = 0; ht->num_tombstones = 0;
ht->max_probelen = 0;
ptrdiff_t cnt = 0; ptrdiff_t cnt = 0;
for (ptrdiff_t i = 0; i < old_kv_indices_cap; ++i) { // Copy old kv-indices into new kv-index-buffer. for (ptrdiff_t i = 0; i < old_kv_indices_cap; ++i) { // Copy old kv-indices into new kv-index-buffer.
if (old_kv_indices[i].kv_idx != GUF_DICT_KV_IDX_NULL && old_kv_indices[i].kv_idx != GUF_DICT_KV_IDX_TOMBSTONE) { if (old_kv_indices[i].kv_idx != GUF_DICT_KV_IDX_NULL && old_kv_indices[i].kv_idx != GUF_DICT_KV_IDX_TOMBSTONE) {
bool key_exists = false; bool key_exists = false;
size_t new_idx = GUF_CAT(GUF_DICT_NAME, _find_idx)(ht, key, &key_exists); const GUF_DICT_KV_NAME *kv = GUF_CAT(GUF_DICT_KV_DBUF, _at)(&ht->kv_elems, old_kv_indices[i].kv_idx);
GUF_ASSERT(kv);
size_t new_idx = GUF_CAT(GUF_DICT_NAME, _find_idx)(ht, &kv->key, &key_exists);
GUF_ASSERT(!key_exists); GUF_ASSERT(!key_exists);
GUF_ASSERT(new_idx < SIZE_MAX); GUF_ASSERT(new_idx < SIZE_MAX);
ht->kv_indices[new_idx] = old_kv_indices[i]; ht->kv_indices[new_idx] = old_kv_indices[i];

View File

@ -10,6 +10,7 @@
#include "guf_alloc.h" #include "guf_alloc.h"
#include "guf_str_view_type.h" #include "guf_str_view_type.h"
#include "guf_utf8.h" #include "guf_utf8.h"
#include "guf_hash.h"
typedef enum guf_str_state { typedef enum guf_str_state {
GUF_STR_STATE_INIT = 0, GUF_STR_STATE_INIT = 0,
@ -85,6 +86,8 @@ GUF_STR_KWRDS bool guf_str_is_stack_allocated(const guf_str *str);
GUF_STR_KWRDS bool guf_str_is_valid(const guf_str *str); GUF_STR_KWRDS bool guf_str_is_valid(const guf_str *str);
GUF_STR_KWRDS bool guf_str_alloc_success(const guf_str *str); GUF_STR_KWRDS bool guf_str_alloc_success(const guf_str *str);
GUF_STR_KWRDS guf_hash_size_t guf_str_view_hash(const guf_str_view *sv);
// Comparison: // Comparison:
GUF_STR_KWRDS bool guf_str_view_equal(const guf_str_view* a, const guf_str_view* b); GUF_STR_KWRDS bool guf_str_view_equal(const guf_str_view* a, const guf_str_view* b);
GUF_STR_KWRDS bool guf_str_equal(const guf_str *a, const guf_str *b); GUF_STR_KWRDS bool guf_str_equal(const guf_str *a, const guf_str *b);
@ -194,20 +197,32 @@ GUF_STR_KWRDS guf_str_view guf_substr_view(guf_str_view str, ptrdiff_t pos, ptrd
return (guf_str_view){.str = str.str + pos, .len = substr_len}; return (guf_str_view){.str = str.str + pos, .len = substr_len};
} }
// Hash function for guf_str_view keys.
// - sv must not be NULL (checked with GUF_ASSERT).
// - A view with a NULL data pointer or a non-positive length yields GUF_HASH_INIT.
// - Otherwise the result is guf_hash(sv->str, sv->len, GUF_HASH_INIT).
GUF_STR_KWRDS guf_hash_size_t guf_str_view_hash(const guf_str_view *sv)
{
    GUF_ASSERT(sv);
    // Only call guf_hash when there is at least one byte to read.
    const bool has_bytes = sv->str && sv->len > 0;
    return has_bytes ? guf_hash(sv->str, sv->len, GUF_HASH_INIT) : GUF_HASH_INIT;
}
// Comparison: // Comparison:
GUF_STR_KWRDS bool guf_str_view_equal(const guf_str_view* a, const guf_str_view* b) GUF_STR_KWRDS bool guf_str_view_equal(const guf_str_view* a, const guf_str_view* b)
{ {
GUF_ASSERT_RELEASE(a && b); GUF_ASSERT_RELEASE(a && b);
GUF_ASSERT_RELEASE(a->str && b->str);
if (a->len != b->len) { if (a->len != b->len) {
return false; return false;
} }
GUF_ASSERT_RELEASE(a->len >= 0); if ((!a->str && b->str) || (!b->str && a->str)) {
if (a->len == 0) { return false;
return a->str == b->str; // Compare pointers by value here. } else if (!a->str && !b->str) {
return a->len == b->len;
} }
GUF_ASSERT_RELEASE(a->len >= 0);
return 0 == memcmp(a->str, b->str, a->len); return 0 == memcmp(a->str, b->str, a->len);
} }

View File

@ -133,7 +133,7 @@ GUF_UTF8_KWRDS bool guf_utf8_encode(guf_utf8_char *result, uint32_t cp)
for (int byte_n = num_bytes - 1; byte_n >= 0 && cp > 0; --byte_n) { for (int byte_n = num_bytes - 1; byte_n >= 0 && cp > 0; --byte_n) {
const int bits = (byte_n == 0) ? first_byte_bits : tail_byte_bits; const int bits = (byte_n == 0) ? first_byte_bits : tail_byte_bits;
const uint32_t cp_mask = (UINT32_C(1) << bits) - 1; const uint32_t cp_mask = (UINT32_C(1) << bits) - 1;
result->bytes[byte_n] = (unsigned char)result->bytes[byte_n] | (cp & cp_mask); result->bytes[byte_n] = (char)((unsigned char)result->bytes[byte_n] | (cp & cp_mask));
cp = cp >> bits; cp = cp >> bits;
cp_bits += bits; cp_bits += bits;
} }

View File

@ -9,6 +9,15 @@
#define GUF_DICT_IMPL #define GUF_DICT_IMPL
#include "guf_dict.h" #include "guf_dict.h"
// Parameter block for a guf_dict.h inclusion named dict_sv_i32:
//   key type   = guf_str_view (hashed with guf_str_view_hash, compared with guf_str_view_equal)
//   value type = int32_t
// guf_dict.h builds its identifiers by concatenating onto GUF_DICT_NAME
// (e.g. dict_sv_i32_init, dict_sv_i32_contains).
// NOTE(review): GUF_DICT_VAL_T_IS_INTEGRAL_TYPE and GUF_DICT_IMPL are consumed inside guf_dict.h;
// presumably GUF_DICT_IMPL requests the function definitions for this translation unit -- confirm in guf_dict.h.
#define GUF_DICT_KEY_T guf_str_view
#define GUF_DICT_KEY_HASH guf_str_view_hash
#define GUF_DICT_KEY_T_EQ guf_str_view_equal
#define GUF_DICT_VAL_T int32_t
#define GUF_DICT_VAL_T_IS_INTEGRAL_TYPE
#define GUF_DICT_NAME dict_sv_i32
#define GUF_DICT_IMPL
#include "guf_dict.h"
#define GUF_DICT_KEY_T int32_t #define GUF_DICT_KEY_T int32_t
#define GUF_DICT_KEY_HASH int32_hash #define GUF_DICT_KEY_HASH int32_hash
#define GUF_DICT_KEY_T_EQ int32_eq #define GUF_DICT_KEY_T_EQ int32_eq

View File

@ -3,6 +3,7 @@
#include "guf_common.h" #include "guf_common.h"
#include "guf_cstr.h" #include "guf_cstr.h"
#include "guf_str.h"
#define GUF_DICT_KEY_T guf_cstr_const #define GUF_DICT_KEY_T guf_cstr_const
#define GUF_DICT_KEY_HASH guf_cstr_const_hash #define GUF_DICT_KEY_HASH guf_cstr_const_hash
@ -12,6 +13,14 @@
#define GUF_DICT_NAME dict_cstr_int #define GUF_DICT_NAME dict_cstr_int
#include "guf_dict.h" #include "guf_dict.h"
// Parameter block for a guf_dict.h inclusion named dict_sv_i32:
//   key type   = guf_str_view (hashed with guf_str_view_hash, compared with guf_str_view_equal)
//   value type = int32_t
// GUF_DICT_IMPL is not defined for this inclusion.
// NOTE(review): presumably this makes guf_dict.h emit declarations only -- confirm in guf_dict.h.
#define GUF_DICT_KEY_T guf_str_view
#define GUF_DICT_KEY_HASH guf_str_view_hash
#define GUF_DICT_KEY_T_EQ guf_str_view_equal
#define GUF_DICT_VAL_T int32_t
#define GUF_DICT_VAL_T_IS_INTEGRAL_TYPE
#define GUF_DICT_NAME dict_sv_i32
#include "guf_dict.h"
static inline guf_hash_size_t int32_hash(const int32_t *a) static inline guf_hash_size_t int32_hash(const int32_t *a)
{ {
return guf_hash(a, sizeof(int32_t), GUF_HASH_INIT); return guf_hash(a, sizeof(int32_t), GUF_HASH_INIT);

View File

@ -23,7 +23,7 @@ void init_tests()
GUF_ASSERT_RELEASE(test.get()); GUF_ASSERT_RELEASE(test.get());
g_tests.insert(std::move(test)); g_tests.insert(std::move(test));
test = std::make_unique<DictCstrToIntTest>("DictCstrToIntTest"); test = std::make_unique<DictSvToIntTest>("DictSvToIntTest");
GUF_ASSERT_RELEASE(test.get()); GUF_ASSERT_RELEASE(test.get());
g_tests.insert(std::move(test)); g_tests.insert(std::move(test));

View File

@ -10,7 +10,7 @@
#include "guf_common.h" #include "guf_common.h"
#define TEST_CHECK(COND) check((COND), GUF_STRINGIFY(COND), __LINE__, __FILE__) #define TEST_CHECK(COND) (check((COND), GUF_STRINGIFY(COND), __LINE__, __FILE__))
struct Test struct Test
{ {

View File

@ -9,10 +9,10 @@ extern "C"
#include "guf_str.h" #include "guf_str.h"
} }
struct DictCstrToIntTest : public Test struct DictSvToIntTest : public Test
{ {
DictCstrToIntTest(const std::string& name) : Test(name) {}; DictSvToIntTest(const std::string& name) : Test(name) {};
private: private:
@ -21,9 +21,9 @@ struct DictCstrToIntTest : public Test
void insert_lookup() void insert_lookup()
{ {
std::unordered_map<std::string, int> word_cnt_map {}; std::unordered_map<std::string_view, int32_t> word_cnt_map {};
dict_cstr_int word_cnt_dict {}; dict_sv_i32 word_cnt_dict {};
dict_cstr_int_init(&word_cnt_dict, &guf_allocator_libc); dict_sv_i32_init(&word_cnt_dict, &guf_allocator_libc);
dbuf_str_view delims = dbuf_str_view_new(&guf_allocator_libc); dbuf_str_view delims = dbuf_str_view_new(&guf_allocator_libc);
for (size_t i = 0; i < GUF_STATIC_BUF_SIZE(GUF_UTF8_WHITESPACE); ++i) { for (size_t i = 0; i < GUF_STATIC_BUF_SIZE(GUF_UTF8_WHITESPACE); ++i) {
@ -37,14 +37,56 @@ struct DictCstrToIntTest : public Test
guf_str_view input_str = {.str = text_buf.data, .len = text_buf.size}; guf_str_view input_str = {.str = text_buf.data, .len = text_buf.size};
guf_str_view tok; guf_str_view tok;
while ((tok = guf_str_next_tok(&input_str, delims.data, delims.size, NULL, -1)).len) { while ((tok = guf_str_next_tok(&input_str, delims.data, delims.size, NULL, -1)).len) {
if (tok.len <= 0) {
continue;
}
std::string_view sv(tok.str, tok.len);
TEST_CHECK(dict_sv_i32_contains(&word_cnt_dict, &tok) == word_cnt_map.contains(sv));
if (!dict_sv_i32_contains(&word_cnt_dict, &tok)) {
dict_sv_i32_insert_val_arg(&word_cnt_dict, tok, 1, GUF_CPY_VALUE, GUF_CPY_VALUE);
word_cnt_map.insert({sv, 1});
} else {
int32_t *cnt = dict_sv_i32_at_val_arg(&word_cnt_dict, tok);
if (TEST_CHECK(cnt)) {
*cnt += 1;
}
word_cnt_map.at(sv) += 1;
}
// printf("tok_len: %td ", tok.len); // printf("tok_len: %td ", tok.len);
// printf("'%.*s'\n", (int)tok.len, tok.str); // printf("'%.*s'\n", (int)tok.len, tok.str);
} }
dbuf_str_view_free(&delims, NULL); dbuf_str_view_free(&delims, NULL);
dict_cstr_int_free(&word_cnt_dict, NULL); TEST_CHECK(dict_sv_i32_size(&word_cnt_dict) == std::ssize(word_cnt_map));
for (const auto & [word, cnt] : word_cnt_map ) {
guf_str_view sv = {.str = word.data(), .len = (ptrdiff_t)word.size()};
int32_t *res = dict_sv_i32_at(&word_cnt_dict, &sv);
TEST_CHECK(res && *res == cnt);
}
ptrdiff_t i = 0;
GUF_CNT_FOREACH(&word_cnt_dict, dict_sv_i32, kv_it) {
const dict_sv_i32_kv *kv = kv_it.ptr;
if (TEST_CHECK(kv)) {
const int32_t cnt = kv->val;
// printf("%.*s: %d\n", (int)kv->key.len, kv->key.str, cnt);
const std::string_view sv(kv->key.str, kv->key.len);
if (TEST_CHECK(word_cnt_map.contains(sv))) {
TEST_CHECK(word_cnt_map.at(sv) == cnt);
}
}
++i;
}
TEST_CHECK(i == dict_sv_i32_size(&word_cnt_dict));
TEST_CHECK(i == std::ssize(word_cnt_map));
// std::cout << "load fac: " << dict_sv_i32_load_factor(&word_cnt_dict) << ", cap: " << word_cnt_dict.kv_indices_cap << "\n";
// std::cout << "size: " << dict_sv_i32_size(&word_cnt_dict) << ", max probelen: " << word_cnt_dict.max_probelen << "\n";
dict_sv_i32_free(&word_cnt_dict, NULL);
bool dbuf_null = !word_cnt_dict.kv_elems.data && !word_cnt_dict.kv_elems.allocator && !word_cnt_dict.kv_elems.capacity && !word_cnt_dict.kv_elems.size; bool dbuf_null = !word_cnt_dict.kv_elems.data && !word_cnt_dict.kv_elems.allocator && !word_cnt_dict.kv_elems.capacity && !word_cnt_dict.kv_elems.size;
TEST_CHECK(!dbuf_null && !word_cnt_dict.kv_indices && !word_cnt_dict.kv_indices_cap && !word_cnt_dict.max_probelen && !word_cnt_dict.num_tombstones); TEST_CHECK(dbuf_null && !word_cnt_dict.kv_indices && !word_cnt_dict.kv_indices_cap && !word_cnt_dict.max_probelen && !word_cnt_dict.num_tombstones);
} }
bool load_file() bool load_file()

View File

@ -386,5 +386,4 @@ struct UTF8Test : public Test
passed = (num_failed_checks == 0); passed = (num_failed_checks == 0);
return passed; return passed;
} }
}; };