diff --git a/src/guf_dict.h b/src/guf_dict.h index 79c1f8a..4f7e131 100755 --- a/src/guf_dict.h +++ b/src/guf_dict.h @@ -254,13 +254,13 @@ static inline size_t GUF_CAT(GUF_DICT_NAME, _probe_offset)(size_t probe_len) { GUF_ASSERT(probe_len > 0); #ifdef GUF_DICT_PROBE_LINEAR - return 1; + return probe_len; // 1, 2, 3, 4, 5, ... #else /* Guaranteed to visit each index once for capacities which are powers of two. cf. https://fgiesen.wordpress.com/2015/02/22/triangular-numbers-mod-2n/ (last-retrieved 2024-07-29) */ - return probe_len * (probe_len + 1) / 2; // 1, 3, 6, 10, 15, ... (starting from probe_len == 1) + return probe_len * (probe_len + 1) / 2; // 1, 3, 6, 10, 15, 21 ... (starting from probe_len == 1) #endif } @@ -277,7 +277,8 @@ static size_t GUF_CAT(GUF_DICT_NAME, _find_idx)(GUF_DICT_NAME *ht, const GUF_DIC size_t idx = GUF_MOD_CAP(hash); const size_t start_idx = idx; size_t first_tombstone_idx = SIZE_MAX; - size_t probe_len = 1; + size_t probe_len = 0; + do { if (ht->kv_indices[idx].kv_idx == GUF_DICT_KV_IDX_NULL) { // 1.) Empty. if (first_tombstone_idx != SIZE_MAX) { @@ -298,12 +299,12 @@ static size_t GUF_CAT(GUF_DICT_NAME, _find_idx)(GUF_DICT_NAME *ht, const GUF_DIC return idx; } else { // 4.) Have to probe due to hash-collision (idx is already occupied, but not by the key). probe: - idx = GUF_MOD_CAP(idx + GUF_CAT(GUF_DICT_NAME, _probe_offset)(probe_len)); ++probe_len; - // printf("sz %td cap %td proble-len %td\n", ht->kv_elems.size, ht->kv_indices_cap, probe_len); + // NOTE: Add the probe_offset to start_idx and not to idx. + idx = GUF_MOD_CAP(start_idx + GUF_CAT(GUF_DICT_NAME, _probe_offset)(probe_len)); GUF_ASSERT((ptrdiff_t)probe_len <= (ht->kv_elems.size + ht->num_tombstones)); } - } while (idx != start_idx); + } while (idx != start_idx && probe_len < (size_t)ht->kv_indices_cap); *key_exists = false; if (first_tombstone_idx != SIZE_MAX) { // Edge case: No empty slots, but found tombstone. @@ -329,9 +330,13 @@ GUF_DICT_KWRDS void GUF_CAT(GUF_DICT_NAME, _try_insert)(GUF_DICT_NAME *ht, GUF_D return; } - const double MAX_LOAD_FAC = 0.66; + #ifdef GUF_DICT_PROBE_LINEAR + const double MAX_LOAD_FAC = 0.6; + #else + const double MAX_LOAD_FAC = 0.5; + #endif const ptrdiff_t KV_META_START_CAP = 32; // Must be a power of two > 0. - const ptrdiff_t KV_META_GROWTH_FAC = 2; // Must be a power of two > 0. + const ptrdiff_t KV_META_GROWTH_FAC = 2; // Must be a power of two > 1. guf_allocator *allocator = ht->kv_elems.allocator; @@ -355,47 +360,37 @@ GUF_DICT_KWRDS void GUF_CAT(GUF_DICT_NAME, _try_insert)(GUF_DICT_NAME *ht, GUF_D guf_err_set_or_panic(err, GUF_ERR_ALLOC_FAIL, GUF_ERR_MSG("in function dict_try_insert: New capacity would overflow)")); return; } + // TODO: Not sure if alloc and free is better here than realloc (since we do not copy ht->kv_indices anyway.) GUF_DICT_KV_META_T *new_kv_indices = allocator->alloc(new_size, allocator->ctx); if (new_kv_indices == NULL) { - guf_err_set_or_panic(err, GUF_ERR_ALLOC_FAIL, GUF_ERR_MSG("in function dict_try_insert: allocation failed")); + guf_err_set_or_panic(err, GUF_ERR_ALLOC_FAIL, GUF_ERR_MSG("in function dict_try_insert: re-allocation failed")); return; } - - const ptrdiff_t new_kv_meta_cap = ht->kv_indices_cap * KV_META_GROWTH_FAC; - for (ptrdiff_t i = 0; i < new_kv_meta_cap; ++i) { - new_kv_indices[i].kv_idx = GUF_DICT_KV_IDX_NULL; - new_kv_indices[i].key_hash = 0; - } - - GUF_DICT_KV_META_T *old_kv_indices = ht->kv_indices; - ptrdiff_t old_kv_indices_cap = ht->kv_indices_cap; - + allocator->free(ht->kv_indices, old_size, allocator->ctx); ht->kv_indices = new_kv_indices; - ht->kv_indices_cap = new_kv_meta_cap; + ht->kv_indices_cap = ht->kv_indices_cap * KV_META_GROWTH_FAC; ht->num_tombstones = 0; - ht->max_probelen = 0; + // ht->max_probelen = 0; - ptrdiff_t cnt = 0; - for (ptrdiff_t i = 0; i < old_kv_indices_cap; ++i) { // Copy old kv-indices into new kv-index-buffer. - if (old_kv_indices[i].kv_idx != GUF_DICT_KV_IDX_NULL && old_kv_indices[i].kv_idx != GUF_DICT_KV_IDX_TOMBSTONE) { - bool key_exists = false; - const GUF_DICT_KV_NAME *kv = GUF_CAT(GUF_DICT_KV_DBUF, _at)(&ht->kv_elems, old_kv_indices[i].kv_idx); - GUF_ASSERT(kv); - size_t new_idx = GUF_CAT(GUF_DICT_NAME, _find_idx)(ht, &kv->key, &key_exists); - GUF_ASSERT(!key_exists); - GUF_ASSERT(new_idx < SIZE_MAX); - ht->kv_indices[new_idx] = old_kv_indices[i]; - ++cnt; - } + for (ptrdiff_t i = 0; i < ht->kv_indices_cap; ++i) { + ht->kv_indices[i].kv_idx = GUF_DICT_KV_IDX_NULL; + ht->kv_indices[i].key_hash = 0; } - GUF_ASSERT(cnt == ht->kv_elems.size); - (void)cnt; - allocator->free(old_kv_indices, old_size, allocator->ctx); - - GUF_ASSERT(GUF_CAT(GUF_DICT_NAME, _load_factor)(ht) < 0.6); + for (ptrdiff_t kv_idx = 0; kv_idx < ht->kv_elems.size; ++kv_idx) { + const GUF_DICT_KV_NAME *kv = GUF_CAT(GUF_DICT_KV_DBUF, _at)(&ht->kv_elems, kv_idx); + GUF_ASSERT(kv); + bool key_exists = false; + const size_t new_idx = GUF_CAT(GUF_DICT_NAME, _find_idx)(ht, &kv->key, &key_exists); + GUF_ASSERT(!key_exists); + GUF_ASSERT(new_idx < SIZE_MAX); + ht->kv_indices[new_idx].kv_idx = kv_idx; + ht->kv_indices[new_idx].key_hash = GUF_DICT_KEY_HASH(&kv->key); + } } + GUF_ASSERT(GUF_CAT(GUF_DICT_NAME, _load_factor)(ht) <= MAX_LOAD_FAC); + // 2.) Insert new key-value pair. GUF_ASSERT_RELEASE(ht->kv_indices_cap > ht->kv_elems.size); diff --git a/src/guf_utf8.h b/src/guf_utf8.h index 6b4b6d1..fd7d3b9 100644 --- a/src/guf_utf8.h +++ b/src/guf_utf8.h @@ -38,7 +38,7 @@ GUF_UTF8_KWRDS bool guf_utf8_equal(const guf_utf8_char *a, const guf_utf8_char * GUF_UTF8_KWRDS guf_utf8_stat guf_utf8_char_next(guf_utf8_char *res, guf_str_view *str); extern const char* const GUF_UTF8_WHITESPACE[25]; -extern const char* const GUF_UTF8_COMMON_PUNCT[29]; +extern const char* const GUF_UTF8_COMMON_PUNCT[31]; extern const guf_utf8_char GUF_UTF8_REPLACEMENT_CHAR; // Replacement character "�" (U+FFFD) #define GUF_UTF8_REPLACEMENT_CHAR_CODEPOINT UINT32_C(0xFFFD) @@ -57,9 +57,9 @@ const char* const GUF_UTF8_WHITESPACE[25] = }; // Common punctuation (TODO: make more exhaustive; use \x escapes) -const char* const GUF_UTF8_COMMON_PUNCT[29] = +const char* const GUF_UTF8_COMMON_PUNCT[31] = { - ".", ",", ";", ":", "(", ")", "[", "]", "!", "?", "¿", "¡", "&", "+", "-", "/", "*", "\"", "'", "„", "“", "´", "`", "\\", "%", "‒", "–", "—", "—" + ".", ",", ";", ":", "(", ")", "[", "]", "!", "?", "¿", "¡", "&", "+", "-", "/", "*", "\"", "'", "„", "“", "´", "»", "«", "`", "\\", "%", "‒", "–", "—", "—" }; const guf_utf8_char GUF_UTF8_REPLACEMENT_CHAR = {.bytes = {'\xEF','\xBF','\xBD', '\0', '\0'}}; diff --git a/src/test/guf_dict_impl.c b/src/test/guf_dict_impl.c index e978418..68df3bf 100644 --- a/src/test/guf_dict_impl.c +++ b/src/test/guf_dict_impl.c @@ -15,6 +15,7 @@ #define GUF_DICT_VAL_T int32_t #define GUF_DICT_VAL_T_IS_INTEGRAL_TYPE #define GUF_DICT_NAME dict_sv_i32 +// #define GUF_DICT_PROBE_LINEAR #define GUF_DICT_IMPL #include "guf_dict.h" diff --git a/src/test/test_dict.hpp b/src/test/test_dict.hpp index 282b78d..01141d1 100644 --- a/src/test/test_dict.hpp +++ b/src/test/test_dict.hpp @@ -11,7 +11,6 @@ extern "C" struct DictSvToIntTest : public Test { - DictSvToIntTest(const std::string& name) : Test(name) {}; private: @@ -81,8 +80,8 @@ struct DictSvToIntTest : public Test TEST_CHECK(i == dict_sv_i32_size(&word_cnt_dict)); TEST_CHECK(i == std::ssize(word_cnt_map)); - // std::cout << "load fac: " << dict_sv_i32_load_factor(&word_cnt_dict) << ", cap: " << word_cnt_dict.kv_indices_cap << "\n"; - // std::cout << "size: " << dict_sv_i32_size(&word_cnt_dict) << ", max probelen: " << word_cnt_dict.max_probelen << "\n"; + std::cout << "load fac: " << dict_sv_i32_load_factor(&word_cnt_dict) << ", cap: " << word_cnt_dict.kv_indices_cap << "\n"; + std::cout << "size: " << dict_sv_i32_size(&word_cnt_dict) << ", max probelen: " << word_cnt_dict.max_probelen << "\n"; dict_sv_i32_free(&word_cnt_dict, NULL); bool dbuf_null = !word_cnt_dict.kv_elems.data && !word_cnt_dict.kv_elems.allocator && !word_cnt_dict.kv_elems.capacity && !word_cnt_dict.kv_elems.size; diff --git a/src/test/test_utf8.hpp b/src/test/test_utf8.hpp index 1fe759e..45c1071 100644 --- a/src/test/test_utf8.hpp +++ b/src/test/test_utf8.hpp @@ -375,7 +375,7 @@ struct UTF8Test : public Test int words = count_words(TEST_DATA_DIR "/" "utf8-test.txt", &delims); TEST_CHECK(words == 422); int words_with_delims = count_words_with_delims(TEST_DATA_DIR "/" "utf8-test.txt", &delims); - TEST_CHECK(words_with_delims == 947); + TEST_CHECK(words_with_delims == 949); dbuf_str_view_free(&delims, NULL); encode_decode();