Fix guf_dict probing
This commit is contained in:
parent
4d97725bba
commit
a41ff868f2
@ -254,13 +254,13 @@ static inline size_t GUF_CAT(GUF_DICT_NAME, _probe_offset)(size_t probe_len)
|
||||
{
|
||||
GUF_ASSERT(probe_len > 0);
|
||||
#ifdef GUF_DICT_PROBE_LINEAR
|
||||
return 1;
|
||||
return probe_len; // 1, 2, 3, 4, 5, ...
|
||||
#else
|
||||
/*
|
||||
Guaranteed to visit each index once for capacities which are powers of two.
|
||||
cf. https://fgiesen.wordpress.com/2015/02/22/triangular-numbers-mod-2n/ (last-retrieved 2024-07-29)
|
||||
*/
|
||||
return probe_len * (probe_len + 1) / 2; // 1, 3, 6, 10, 15, ... (starting from probe_len == 1)
|
||||
return probe_len * (probe_len + 1) / 2; // 1, 3, 6, 10, 15, 21 ... (starting from probe_len == 1)
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -277,7 +277,8 @@ static size_t GUF_CAT(GUF_DICT_NAME, _find_idx)(GUF_DICT_NAME *ht, const GUF_DIC
|
||||
size_t idx = GUF_MOD_CAP(hash);
|
||||
const size_t start_idx = idx;
|
||||
size_t first_tombstone_idx = SIZE_MAX;
|
||||
size_t probe_len = 1;
|
||||
size_t probe_len = 0;
|
||||
|
||||
do {
|
||||
if (ht->kv_indices[idx].kv_idx == GUF_DICT_KV_IDX_NULL) { // 1.) Empty.
|
||||
if (first_tombstone_idx != SIZE_MAX) {
|
||||
@ -298,12 +299,12 @@ static size_t GUF_CAT(GUF_DICT_NAME, _find_idx)(GUF_DICT_NAME *ht, const GUF_DIC
|
||||
return idx;
|
||||
} else { // 4.) Have to probe due to hash-collision (idx is already occupied, but not by the key).
|
||||
probe:
|
||||
idx = GUF_MOD_CAP(idx + GUF_CAT(GUF_DICT_NAME, _probe_offset)(probe_len));
|
||||
++probe_len;
|
||||
// printf("sz %td cap %td probe-len %td\n", ht->kv_elems.size, ht->kv_indices_cap, probe_len);
|
||||
// NOTE: Add the probe_offset to start_idx and not to idx.
|
||||
idx = GUF_MOD_CAP(start_idx + GUF_CAT(GUF_DICT_NAME, _probe_offset)(probe_len));
|
||||
GUF_ASSERT((ptrdiff_t)probe_len <= (ht->kv_elems.size + ht->num_tombstones));
|
||||
}
|
||||
} while (idx != start_idx);
|
||||
} while (idx != start_idx && probe_len < (size_t)ht->kv_indices_cap);
|
||||
|
||||
*key_exists = false;
|
||||
if (first_tombstone_idx != SIZE_MAX) { // Edge case: No empty slots, but found tombstone.
|
||||
@ -329,9 +330,13 @@ GUF_DICT_KWRDS void GUF_CAT(GUF_DICT_NAME, _try_insert)(GUF_DICT_NAME *ht, GUF_D
|
||||
return;
|
||||
}
|
||||
|
||||
const double MAX_LOAD_FAC = 0.66;
|
||||
#ifdef GUF_DICT_PROBE_LINEAR
|
||||
const double MAX_LOAD_FAC = 0.6;
|
||||
#else
|
||||
const double MAX_LOAD_FAC = 0.5;
|
||||
#endif
|
||||
const ptrdiff_t KV_META_START_CAP = 32; // Must be a power of two > 0.
|
||||
const ptrdiff_t KV_META_GROWTH_FAC = 2; // Must be a power of two > 0.
|
||||
const ptrdiff_t KV_META_GROWTH_FAC = 2; // Must be a power of two > 1.
|
||||
|
||||
guf_allocator *allocator = ht->kv_elems.allocator;
|
||||
|
||||
@ -355,46 +360,36 @@ GUF_DICT_KWRDS void GUF_CAT(GUF_DICT_NAME, _try_insert)(GUF_DICT_NAME *ht, GUF_D
|
||||
guf_err_set_or_panic(err, GUF_ERR_ALLOC_FAIL, GUF_ERR_MSG("in function dict_try_insert: New capacity would overflow)"));
|
||||
return;
|
||||
}
|
||||
// TODO: Not sure if alloc and free is better here than realloc (since we do not copy ht->kv_indices anyway.)
|
||||
GUF_DICT_KV_META_T *new_kv_indices = allocator->alloc(new_size, allocator->ctx);
|
||||
if (new_kv_indices == NULL) {
|
||||
guf_err_set_or_panic(err, GUF_ERR_ALLOC_FAIL, GUF_ERR_MSG("in function dict_try_insert: allocation failed"));
|
||||
guf_err_set_or_panic(err, GUF_ERR_ALLOC_FAIL, GUF_ERR_MSG("in function dict_try_insert: re-allocation failed"));
|
||||
return;
|
||||
}
|
||||
allocator->free(ht->kv_indices, old_size, allocator->ctx);
|
||||
ht->kv_indices = new_kv_indices;
|
||||
ht->kv_indices_cap = ht->kv_indices_cap * KV_META_GROWTH_FAC;
|
||||
ht->num_tombstones = 0;
|
||||
// ht->max_probelen = 0;
|
||||
|
||||
const ptrdiff_t new_kv_meta_cap = ht->kv_indices_cap * KV_META_GROWTH_FAC;
|
||||
for (ptrdiff_t i = 0; i < new_kv_meta_cap; ++i) {
|
||||
new_kv_indices[i].kv_idx = GUF_DICT_KV_IDX_NULL;
|
||||
new_kv_indices[i].key_hash = 0;
|
||||
for (ptrdiff_t i = 0; i < ht->kv_indices_cap; ++i) {
|
||||
ht->kv_indices[i].kv_idx = GUF_DICT_KV_IDX_NULL;
|
||||
ht->kv_indices[i].key_hash = 0;
|
||||
}
|
||||
|
||||
GUF_DICT_KV_META_T *old_kv_indices = ht->kv_indices;
|
||||
ptrdiff_t old_kv_indices_cap = ht->kv_indices_cap;
|
||||
|
||||
ht->kv_indices = new_kv_indices;
|
||||
ht->kv_indices_cap = new_kv_meta_cap;
|
||||
ht->num_tombstones = 0;
|
||||
ht->max_probelen = 0;
|
||||
|
||||
ptrdiff_t cnt = 0;
|
||||
for (ptrdiff_t i = 0; i < old_kv_indices_cap; ++i) { // Copy old kv-indices into new kv-index-buffer.
|
||||
if (old_kv_indices[i].kv_idx != GUF_DICT_KV_IDX_NULL && old_kv_indices[i].kv_idx != GUF_DICT_KV_IDX_TOMBSTONE) {
|
||||
bool key_exists = false;
|
||||
const GUF_DICT_KV_NAME *kv = GUF_CAT(GUF_DICT_KV_DBUF, _at)(&ht->kv_elems, old_kv_indices[i].kv_idx);
|
||||
for (ptrdiff_t kv_idx = 0; kv_idx < ht->kv_elems.size; ++kv_idx) {
|
||||
const GUF_DICT_KV_NAME *kv = GUF_CAT(GUF_DICT_KV_DBUF, _at)(&ht->kv_elems, kv_idx);
|
||||
GUF_ASSERT(kv);
|
||||
size_t new_idx = GUF_CAT(GUF_DICT_NAME, _find_idx)(ht, &kv->key, &key_exists);
|
||||
bool key_exists = false;
|
||||
const size_t new_idx = GUF_CAT(GUF_DICT_NAME, _find_idx)(ht, &kv->key, &key_exists);
|
||||
GUF_ASSERT(!key_exists);
|
||||
GUF_ASSERT(new_idx < SIZE_MAX);
|
||||
ht->kv_indices[new_idx] = old_kv_indices[i];
|
||||
++cnt;
|
||||
ht->kv_indices[new_idx].kv_idx = kv_idx;
|
||||
ht->kv_indices[new_idx].key_hash = GUF_DICT_KEY_HASH(&kv->key);
|
||||
}
|
||||
}
|
||||
GUF_ASSERT(cnt == ht->kv_elems.size);
|
||||
(void)cnt;
|
||||
|
||||
allocator->free(old_kv_indices, old_size, allocator->ctx);
|
||||
|
||||
GUF_ASSERT(GUF_CAT(GUF_DICT_NAME, _load_factor)(ht) < 0.6);
|
||||
}
|
||||
GUF_ASSERT(GUF_CAT(GUF_DICT_NAME, _load_factor)(ht) <= MAX_LOAD_FAC);
|
||||
|
||||
// 2.) Insert new key-value pair.
|
||||
GUF_ASSERT_RELEASE(ht->kv_indices_cap > ht->kv_elems.size);
|
||||
|
||||
@ -38,7 +38,7 @@ GUF_UTF8_KWRDS bool guf_utf8_equal(const guf_utf8_char *a, const guf_utf8_char *
|
||||
GUF_UTF8_KWRDS guf_utf8_stat guf_utf8_char_next(guf_utf8_char *res, guf_str_view *str);
|
||||
|
||||
extern const char* const GUF_UTF8_WHITESPACE[25];
|
||||
extern const char* const GUF_UTF8_COMMON_PUNCT[29];
|
||||
extern const char* const GUF_UTF8_COMMON_PUNCT[31];
|
||||
|
||||
extern const guf_utf8_char GUF_UTF8_REPLACEMENT_CHAR; // Replacement character "<22>" (U+FFFD)
|
||||
#define GUF_UTF8_REPLACEMENT_CHAR_CODEPOINT UINT32_C(0xFFFD)
|
||||
@ -57,9 +57,9 @@ const char* const GUF_UTF8_WHITESPACE[25] =
|
||||
};
|
||||
|
||||
// Common punctuation (TODO: make more exhaustive; use \x escapes)
|
||||
const char* const GUF_UTF8_COMMON_PUNCT[29] =
|
||||
const char* const GUF_UTF8_COMMON_PUNCT[31] =
|
||||
{
|
||||
".", ",", ";", ":", "(", ")", "[", "]", "!", "?", "¿", "¡", "&", "+", "-", "/", "*", "\"", "'", "„", "“", "´", "`", "\\", "%", "‒", "–", "—", "—"
|
||||
".", ",", ";", ":", "(", ")", "[", "]", "!", "?", "¿", "¡", "&", "+", "-", "/", "*", "\"", "'", "„", "“", "´", "»", "«", "`", "\\", "%", "‒", "–", "—", "—"
|
||||
};
|
||||
|
||||
const guf_utf8_char GUF_UTF8_REPLACEMENT_CHAR = {.bytes = {'\xEF','\xBF','\xBD', '\0', '\0'}};
|
||||
|
||||
@ -15,6 +15,7 @@
|
||||
#define GUF_DICT_VAL_T int32_t
|
||||
#define GUF_DICT_VAL_T_IS_INTEGRAL_TYPE
|
||||
#define GUF_DICT_NAME dict_sv_i32
|
||||
// #define GUF_DICT_PROBE_LINEAR
|
||||
#define GUF_DICT_IMPL
|
||||
#include "guf_dict.h"
|
||||
|
||||
|
||||
@ -11,7 +11,6 @@ extern "C"
|
||||
|
||||
struct DictSvToIntTest : public Test
|
||||
{
|
||||
|
||||
DictSvToIntTest(const std::string& name) : Test(name) {};
|
||||
|
||||
private:
|
||||
@ -81,8 +80,8 @@ struct DictSvToIntTest : public Test
|
||||
TEST_CHECK(i == dict_sv_i32_size(&word_cnt_dict));
|
||||
TEST_CHECK(i == std::ssize(word_cnt_map));
|
||||
|
||||
// std::cout << "load fac: " << dict_sv_i32_load_factor(&word_cnt_dict) << ", cap: " << word_cnt_dict.kv_indices_cap << "\n";
|
||||
// std::cout << "size: " << dict_sv_i32_size(&word_cnt_dict) << ", max probelen: " << word_cnt_dict.max_probelen << "\n";
|
||||
std::cout << "load fac: " << dict_sv_i32_load_factor(&word_cnt_dict) << ", cap: " << word_cnt_dict.kv_indices_cap << "\n";
|
||||
std::cout << "size: " << dict_sv_i32_size(&word_cnt_dict) << ", max probelen: " << word_cnt_dict.max_probelen << "\n";
|
||||
|
||||
dict_sv_i32_free(&word_cnt_dict, NULL);
|
||||
bool dbuf_null = !word_cnt_dict.kv_elems.data && !word_cnt_dict.kv_elems.allocator && !word_cnt_dict.kv_elems.capacity && !word_cnt_dict.kv_elems.size;
|
||||
|
||||
@ -375,7 +375,7 @@ struct UTF8Test : public Test
|
||||
int words = count_words(TEST_DATA_DIR "/" "utf8-test.txt", &delims);
|
||||
TEST_CHECK(words == 422);
|
||||
int words_with_delims = count_words_with_delims(TEST_DATA_DIR "/" "utf8-test.txt", &delims);
|
||||
TEST_CHECK(words_with_delims == 947);
|
||||
TEST_CHECK(words_with_delims == 949);
|
||||
dbuf_str_view_free(&delims, NULL);
|
||||
|
||||
encode_decode();
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user