Fix guf_dict probing

This commit is contained in:
jun 2025-03-03 11:28:40 +01:00
parent 4d97725bba
commit a41ff868f2
5 changed files with 40 additions and 45 deletions

View File

@ -254,13 +254,13 @@ static inline size_t GUF_CAT(GUF_DICT_NAME, _probe_offset)(size_t probe_len)
{
GUF_ASSERT(probe_len > 0);
#ifdef GUF_DICT_PROBE_LINEAR
return 1;
return probe_len; // 1, 2, 3, 4, 5, ...
#else
/*
Guaranteed to visit each index once for capacities which are powers of two.
cf. https://fgiesen.wordpress.com/2015/02/22/triangular-numbers-mod-2n/ (last-retrieved 2024-07-29)
*/
return probe_len * (probe_len + 1) / 2; // 1, 3, 6, 10, 15, ... (starting from probe_len == 1)
return probe_len * (probe_len + 1) / 2; // 1, 3, 6, 10, 15, 21 ... (starting from probe_len == 1)
#endif
}
@ -277,7 +277,8 @@ static size_t GUF_CAT(GUF_DICT_NAME, _find_idx)(GUF_DICT_NAME *ht, const GUF_DIC
size_t idx = GUF_MOD_CAP(hash);
const size_t start_idx = idx;
size_t first_tombstone_idx = SIZE_MAX;
size_t probe_len = 1;
size_t probe_len = 0;
do {
if (ht->kv_indices[idx].kv_idx == GUF_DICT_KV_IDX_NULL) { // 1.) Empty.
if (first_tombstone_idx != SIZE_MAX) {
@ -298,12 +299,12 @@ static size_t GUF_CAT(GUF_DICT_NAME, _find_idx)(GUF_DICT_NAME *ht, const GUF_DIC
return idx;
} else { // 4.) Have to probe due to hash-collision (idx is already occupied, but not by the key).
probe:
idx = GUF_MOD_CAP(idx + GUF_CAT(GUF_DICT_NAME, _probe_offset)(probe_len));
++probe_len;
// printf("sz %td cap %td probe-len %td\n", ht->kv_elems.size, ht->kv_indices_cap, probe_len);
// NOTE: Add the probe_offset to start_idx and not to idx.
idx = GUF_MOD_CAP(start_idx + GUF_CAT(GUF_DICT_NAME, _probe_offset)(probe_len));
GUF_ASSERT((ptrdiff_t)probe_len <= (ht->kv_elems.size + ht->num_tombstones));
}
} while (idx != start_idx);
} while (idx != start_idx && probe_len < (size_t)ht->kv_indices_cap);
*key_exists = false;
if (first_tombstone_idx != SIZE_MAX) { // Edge case: No empty slots, but found tombstone.
@ -329,9 +330,13 @@ GUF_DICT_KWRDS void GUF_CAT(GUF_DICT_NAME, _try_insert)(GUF_DICT_NAME *ht, GUF_D
return;
}
const double MAX_LOAD_FAC = 0.66;
#ifdef GUF_DICT_PROBE_LINEAR
const double MAX_LOAD_FAC = 0.6;
#else
const double MAX_LOAD_FAC = 0.5;
#endif
const ptrdiff_t KV_META_START_CAP = 32; // Must be a power of two > 0.
const ptrdiff_t KV_META_GROWTH_FAC = 2; // Must be a power of two > 0.
const ptrdiff_t KV_META_GROWTH_FAC = 2; // Must be a power of two > 1.
guf_allocator *allocator = ht->kv_elems.allocator;
@ -355,46 +360,36 @@ GUF_DICT_KWRDS void GUF_CAT(GUF_DICT_NAME, _try_insert)(GUF_DICT_NAME *ht, GUF_D
guf_err_set_or_panic(err, GUF_ERR_ALLOC_FAIL, GUF_ERR_MSG("in function dict_try_insert: New capacity would overflow)"));
return;
}
// TODO: Not sure if alloc and free is better here than realloc (since we do not copy ht->kv_indices anyway.)
GUF_DICT_KV_META_T *new_kv_indices = allocator->alloc(new_size, allocator->ctx);
if (new_kv_indices == NULL) {
guf_err_set_or_panic(err, GUF_ERR_ALLOC_FAIL, GUF_ERR_MSG("in function dict_try_insert: allocation failed"));
guf_err_set_or_panic(err, GUF_ERR_ALLOC_FAIL, GUF_ERR_MSG("in function dict_try_insert: re-allocation failed"));
return;
}
allocator->free(ht->kv_indices, old_size, allocator->ctx);
ht->kv_indices = new_kv_indices;
ht->kv_indices_cap = ht->kv_indices_cap * KV_META_GROWTH_FAC;
ht->num_tombstones = 0;
// ht->max_probelen = 0;
const ptrdiff_t new_kv_meta_cap = ht->kv_indices_cap * KV_META_GROWTH_FAC;
for (ptrdiff_t i = 0; i < new_kv_meta_cap; ++i) {
new_kv_indices[i].kv_idx = GUF_DICT_KV_IDX_NULL;
new_kv_indices[i].key_hash = 0;
for (ptrdiff_t i = 0; i < ht->kv_indices_cap; ++i) {
ht->kv_indices[i].kv_idx = GUF_DICT_KV_IDX_NULL;
ht->kv_indices[i].key_hash = 0;
}
GUF_DICT_KV_META_T *old_kv_indices = ht->kv_indices;
ptrdiff_t old_kv_indices_cap = ht->kv_indices_cap;
ht->kv_indices = new_kv_indices;
ht->kv_indices_cap = new_kv_meta_cap;
ht->num_tombstones = 0;
ht->max_probelen = 0;
ptrdiff_t cnt = 0;
for (ptrdiff_t i = 0; i < old_kv_indices_cap; ++i) { // Copy old kv-indices into new kv-index-buffer.
if (old_kv_indices[i].kv_idx != GUF_DICT_KV_IDX_NULL && old_kv_indices[i].kv_idx != GUF_DICT_KV_IDX_TOMBSTONE) {
bool key_exists = false;
const GUF_DICT_KV_NAME *kv = GUF_CAT(GUF_DICT_KV_DBUF, _at)(&ht->kv_elems, old_kv_indices[i].kv_idx);
for (ptrdiff_t kv_idx = 0; kv_idx < ht->kv_elems.size; ++kv_idx) {
const GUF_DICT_KV_NAME *kv = GUF_CAT(GUF_DICT_KV_DBUF, _at)(&ht->kv_elems, kv_idx);
GUF_ASSERT(kv);
size_t new_idx = GUF_CAT(GUF_DICT_NAME, _find_idx)(ht, &kv->key, &key_exists);
bool key_exists = false;
const size_t new_idx = GUF_CAT(GUF_DICT_NAME, _find_idx)(ht, &kv->key, &key_exists);
GUF_ASSERT(!key_exists);
GUF_ASSERT(new_idx < SIZE_MAX);
ht->kv_indices[new_idx] = old_kv_indices[i];
++cnt;
ht->kv_indices[new_idx].kv_idx = kv_idx;
ht->kv_indices[new_idx].key_hash = GUF_DICT_KEY_HASH(&kv->key);
}
}
GUF_ASSERT(cnt == ht->kv_elems.size);
(void)cnt;
allocator->free(old_kv_indices, old_size, allocator->ctx);
GUF_ASSERT(GUF_CAT(GUF_DICT_NAME, _load_factor)(ht) < 0.6);
}
GUF_ASSERT(GUF_CAT(GUF_DICT_NAME, _load_factor)(ht) <= MAX_LOAD_FAC);
// 2.) Insert new key-value pair.
GUF_ASSERT_RELEASE(ht->kv_indices_cap > ht->kv_elems.size);

View File

@ -38,7 +38,7 @@ GUF_UTF8_KWRDS bool guf_utf8_equal(const guf_utf8_char *a, const guf_utf8_char *
GUF_UTF8_KWRDS guf_utf8_stat guf_utf8_char_next(guf_utf8_char *res, guf_str_view *str);
extern const char* const GUF_UTF8_WHITESPACE[25];
extern const char* const GUF_UTF8_COMMON_PUNCT[29];
extern const char* const GUF_UTF8_COMMON_PUNCT[31];
extern const guf_utf8_char GUF_UTF8_REPLACEMENT_CHAR; // Replacement character "<22>" (U+FFFD)
#define GUF_UTF8_REPLACEMENT_CHAR_CODEPOINT UINT32_C(0xFFFD)
@ -57,9 +57,9 @@ const char* const GUF_UTF8_WHITESPACE[25] =
};
// Common punctuation (TODO: make more exhaustive; use \x escapes)
const char* const GUF_UTF8_COMMON_PUNCT[29] =
const char* const GUF_UTF8_COMMON_PUNCT[31] =
{
".", ",", ";", ":", "(", ")", "[", "]", "!", "?", "¿", "¡", "&", "+", "-", "/", "*", "\"", "'", "", "", "´", "`", "\\", "%", "", "", "", ""
".", ",", ";", ":", "(", ")", "[", "]", "!", "?", "¿", "¡", "&", "+", "-", "/", "*", "\"", "'", "", "", "´", "»", "«", "`", "\\", "%", "", "", "", ""
};
const guf_utf8_char GUF_UTF8_REPLACEMENT_CHAR = {.bytes = {'\xEF','\xBF','\xBD', '\0', '\0'}};

View File

@ -15,6 +15,7 @@
#define GUF_DICT_VAL_T int32_t
#define GUF_DICT_VAL_T_IS_INTEGRAL_TYPE
#define GUF_DICT_NAME dict_sv_i32
// #define GUF_DICT_PROBE_LINEAR
#define GUF_DICT_IMPL
#include "guf_dict.h"

View File

@ -11,7 +11,6 @@ extern "C"
struct DictSvToIntTest : public Test
{
DictSvToIntTest(const std::string& name) : Test(name) {};
private:
@ -81,8 +80,8 @@ struct DictSvToIntTest : public Test
TEST_CHECK(i == dict_sv_i32_size(&word_cnt_dict));
TEST_CHECK(i == std::ssize(word_cnt_map));
// std::cout << "load fac: " << dict_sv_i32_load_factor(&word_cnt_dict) << ", cap: " << word_cnt_dict.kv_indices_cap << "\n";
// std::cout << "size: " << dict_sv_i32_size(&word_cnt_dict) << ", max probelen: " << word_cnt_dict.max_probelen << "\n";
std::cout << "load fac: " << dict_sv_i32_load_factor(&word_cnt_dict) << ", cap: " << word_cnt_dict.kv_indices_cap << "\n";
std::cout << "size: " << dict_sv_i32_size(&word_cnt_dict) << ", max probelen: " << word_cnt_dict.max_probelen << "\n";
dict_sv_i32_free(&word_cnt_dict, NULL);
bool dbuf_null = !word_cnt_dict.kv_elems.data && !word_cnt_dict.kv_elems.allocator && !word_cnt_dict.kv_elems.capacity && !word_cnt_dict.kv_elems.size;

View File

@ -375,7 +375,7 @@ struct UTF8Test : public Test
int words = count_words(TEST_DATA_DIR "/" "utf8-test.txt", &delims);
TEST_CHECK(words == 422);
int words_with_delims = count_words_with_delims(TEST_DATA_DIR "/" "utf8-test.txt", &delims);
TEST_CHECK(words_with_delims == 947);
TEST_CHECK(words_with_delims == 949);
dbuf_str_view_free(&delims, NULL);
encode_decode();