Fix guf_dict probing
This commit is contained in:
parent
4d97725bba
commit
a41ff868f2
@ -254,13 +254,13 @@ static inline size_t GUF_CAT(GUF_DICT_NAME, _probe_offset)(size_t probe_len)
|
|||||||
{
|
{
|
||||||
GUF_ASSERT(probe_len > 0);
|
GUF_ASSERT(probe_len > 0);
|
||||||
#ifdef GUF_DICT_PROBE_LINEAR
|
#ifdef GUF_DICT_PROBE_LINEAR
|
||||||
return 1;
|
return probe_len; // 1, 2, 3, 4, 5, ...
|
||||||
#else
|
#else
|
||||||
/*
|
/*
|
||||||
Guaranteed to visit each index once for capacities which are powers of two.
|
Guaranteed to visit each index once for capacities which are powers of two.
|
||||||
cf. https://fgiesen.wordpress.com/2015/02/22/triangular-numbers-mod-2n/ (last-retrieved 2024-07-29)
|
cf. https://fgiesen.wordpress.com/2015/02/22/triangular-numbers-mod-2n/ (last-retrieved 2024-07-29)
|
||||||
*/
|
*/
|
||||||
return probe_len * (probe_len + 1) / 2; // 1, 3, 6, 10, 15, ... (starting from probe_len == 1)
|
return probe_len * (probe_len + 1) / 2; // 1, 3, 6, 10, 15, 21 ... (starting from probe_len == 1)
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -277,7 +277,8 @@ static size_t GUF_CAT(GUF_DICT_NAME, _find_idx)(GUF_DICT_NAME *ht, const GUF_DIC
|
|||||||
size_t idx = GUF_MOD_CAP(hash);
|
size_t idx = GUF_MOD_CAP(hash);
|
||||||
const size_t start_idx = idx;
|
const size_t start_idx = idx;
|
||||||
size_t first_tombstone_idx = SIZE_MAX;
|
size_t first_tombstone_idx = SIZE_MAX;
|
||||||
size_t probe_len = 1;
|
size_t probe_len = 0;
|
||||||
|
|
||||||
do {
|
do {
|
||||||
if (ht->kv_indices[idx].kv_idx == GUF_DICT_KV_IDX_NULL) { // 1.) Empty.
|
if (ht->kv_indices[idx].kv_idx == GUF_DICT_KV_IDX_NULL) { // 1.) Empty.
|
||||||
if (first_tombstone_idx != SIZE_MAX) {
|
if (first_tombstone_idx != SIZE_MAX) {
|
||||||
@ -298,12 +299,12 @@ static size_t GUF_CAT(GUF_DICT_NAME, _find_idx)(GUF_DICT_NAME *ht, const GUF_DIC
|
|||||||
return idx;
|
return idx;
|
||||||
} else { // 4.) Have to probe due to hash-collision (idx is already occupied, but not by the key).
|
} else { // 4.) Have to probe due to hash-collision (idx is already occupied, but not by the key).
|
||||||
probe:
|
probe:
|
||||||
idx = GUF_MOD_CAP(idx + GUF_CAT(GUF_DICT_NAME, _probe_offset)(probe_len));
|
|
||||||
++probe_len;
|
++probe_len;
|
||||||
// printf("sz %td cap %td proble-len %td\n", ht->kv_elems.size, ht->kv_indices_cap, probe_len);
|
// NOTE: Add the probe_offset to start_idx and not to idx.
|
||||||
|
idx = GUF_MOD_CAP(start_idx + GUF_CAT(GUF_DICT_NAME, _probe_offset)(probe_len));
|
||||||
GUF_ASSERT((ptrdiff_t)probe_len <= (ht->kv_elems.size + ht->num_tombstones));
|
GUF_ASSERT((ptrdiff_t)probe_len <= (ht->kv_elems.size + ht->num_tombstones));
|
||||||
}
|
}
|
||||||
} while (idx != start_idx);
|
} while (idx != start_idx && probe_len < (size_t)ht->kv_indices_cap);
|
||||||
|
|
||||||
*key_exists = false;
|
*key_exists = false;
|
||||||
if (first_tombstone_idx != SIZE_MAX) { // Edge case: No empty slots, but found tombstone.
|
if (first_tombstone_idx != SIZE_MAX) { // Edge case: No empty slots, but found tombstone.
|
||||||
@ -329,9 +330,13 @@ GUF_DICT_KWRDS void GUF_CAT(GUF_DICT_NAME, _try_insert)(GUF_DICT_NAME *ht, GUF_D
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const double MAX_LOAD_FAC = 0.66;
|
#ifdef GUF_DICT_PROBE_LINEAR
|
||||||
|
const double MAX_LOAD_FAC = 0.6;
|
||||||
|
#else
|
||||||
|
const double MAX_LOAD_FAC = 0.5;
|
||||||
|
#endif
|
||||||
const ptrdiff_t KV_META_START_CAP = 32; // Must be a power of two > 0.
|
const ptrdiff_t KV_META_START_CAP = 32; // Must be a power of two > 0.
|
||||||
const ptrdiff_t KV_META_GROWTH_FAC = 2; // Must be a power of two > 0.
|
const ptrdiff_t KV_META_GROWTH_FAC = 2; // Must be a power of two > 1.
|
||||||
|
|
||||||
guf_allocator *allocator = ht->kv_elems.allocator;
|
guf_allocator *allocator = ht->kv_elems.allocator;
|
||||||
|
|
||||||
@ -355,47 +360,37 @@ GUF_DICT_KWRDS void GUF_CAT(GUF_DICT_NAME, _try_insert)(GUF_DICT_NAME *ht, GUF_D
|
|||||||
guf_err_set_or_panic(err, GUF_ERR_ALLOC_FAIL, GUF_ERR_MSG("in function dict_try_insert: New capacity would overflow)"));
|
guf_err_set_or_panic(err, GUF_ERR_ALLOC_FAIL, GUF_ERR_MSG("in function dict_try_insert: New capacity would overflow)"));
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
// TODO: Not sure whether alloc and free is better here than realloc (since we do not copy ht->kv_indices anyway).
|
||||||
GUF_DICT_KV_META_T *new_kv_indices = allocator->alloc(new_size, allocator->ctx);
|
GUF_DICT_KV_META_T *new_kv_indices = allocator->alloc(new_size, allocator->ctx);
|
||||||
if (new_kv_indices == NULL) {
|
if (new_kv_indices == NULL) {
|
||||||
guf_err_set_or_panic(err, GUF_ERR_ALLOC_FAIL, GUF_ERR_MSG("in function dict_try_insert: allocation failed"));
|
guf_err_set_or_panic(err, GUF_ERR_ALLOC_FAIL, GUF_ERR_MSG("in function dict_try_insert: re-allocation failed"));
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
allocator->free(ht->kv_indices, old_size, allocator->ctx);
|
||||||
const ptrdiff_t new_kv_meta_cap = ht->kv_indices_cap * KV_META_GROWTH_FAC;
|
|
||||||
for (ptrdiff_t i = 0; i < new_kv_meta_cap; ++i) {
|
|
||||||
new_kv_indices[i].kv_idx = GUF_DICT_KV_IDX_NULL;
|
|
||||||
new_kv_indices[i].key_hash = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
GUF_DICT_KV_META_T *old_kv_indices = ht->kv_indices;
|
|
||||||
ptrdiff_t old_kv_indices_cap = ht->kv_indices_cap;
|
|
||||||
|
|
||||||
ht->kv_indices = new_kv_indices;
|
ht->kv_indices = new_kv_indices;
|
||||||
ht->kv_indices_cap = new_kv_meta_cap;
|
ht->kv_indices_cap = ht->kv_indices_cap * KV_META_GROWTH_FAC;
|
||||||
ht->num_tombstones = 0;
|
ht->num_tombstones = 0;
|
||||||
ht->max_probelen = 0;
|
// ht->max_probelen = 0;
|
||||||
|
|
||||||
ptrdiff_t cnt = 0;
|
for (ptrdiff_t i = 0; i < ht->kv_indices_cap; ++i) {
|
||||||
for (ptrdiff_t i = 0; i < old_kv_indices_cap; ++i) { // Copy old kv-indices into new kv-index-buffer.
|
ht->kv_indices[i].kv_idx = GUF_DICT_KV_IDX_NULL;
|
||||||
if (old_kv_indices[i].kv_idx != GUF_DICT_KV_IDX_NULL && old_kv_indices[i].kv_idx != GUF_DICT_KV_IDX_TOMBSTONE) {
|
ht->kv_indices[i].key_hash = 0;
|
||||||
bool key_exists = false;
|
|
||||||
const GUF_DICT_KV_NAME *kv = GUF_CAT(GUF_DICT_KV_DBUF, _at)(&ht->kv_elems, old_kv_indices[i].kv_idx);
|
|
||||||
GUF_ASSERT(kv);
|
|
||||||
size_t new_idx = GUF_CAT(GUF_DICT_NAME, _find_idx)(ht, &kv->key, &key_exists);
|
|
||||||
GUF_ASSERT(!key_exists);
|
|
||||||
GUF_ASSERT(new_idx < SIZE_MAX);
|
|
||||||
ht->kv_indices[new_idx] = old_kv_indices[i];
|
|
||||||
++cnt;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
GUF_ASSERT(cnt == ht->kv_elems.size);
|
|
||||||
(void)cnt;
|
|
||||||
|
|
||||||
allocator->free(old_kv_indices, old_size, allocator->ctx);
|
for (ptrdiff_t kv_idx = 0; kv_idx < ht->kv_elems.size; ++kv_idx) {
|
||||||
|
const GUF_DICT_KV_NAME *kv = GUF_CAT(GUF_DICT_KV_DBUF, _at)(&ht->kv_elems, kv_idx);
|
||||||
GUF_ASSERT(GUF_CAT(GUF_DICT_NAME, _load_factor)(ht) < 0.6);
|
GUF_ASSERT(kv);
|
||||||
|
bool key_exists = false;
|
||||||
|
const size_t new_idx = GUF_CAT(GUF_DICT_NAME, _find_idx)(ht, &kv->key, &key_exists);
|
||||||
|
GUF_ASSERT(!key_exists);
|
||||||
|
GUF_ASSERT(new_idx < SIZE_MAX);
|
||||||
|
ht->kv_indices[new_idx].kv_idx = kv_idx;
|
||||||
|
ht->kv_indices[new_idx].key_hash = GUF_DICT_KEY_HASH(&kv->key);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
GUF_ASSERT(GUF_CAT(GUF_DICT_NAME, _load_factor)(ht) <= MAX_LOAD_FAC);
|
||||||
|
|
||||||
// 2.) Insert new key-value pair.
|
// 2.) Insert new key-value pair.
|
||||||
GUF_ASSERT_RELEASE(ht->kv_indices_cap > ht->kv_elems.size);
|
GUF_ASSERT_RELEASE(ht->kv_indices_cap > ht->kv_elems.size);
|
||||||
|
|
||||||
|
|||||||
@ -38,7 +38,7 @@ GUF_UTF8_KWRDS bool guf_utf8_equal(const guf_utf8_char *a, const guf_utf8_char *
|
|||||||
GUF_UTF8_KWRDS guf_utf8_stat guf_utf8_char_next(guf_utf8_char *res, guf_str_view *str);
|
GUF_UTF8_KWRDS guf_utf8_stat guf_utf8_char_next(guf_utf8_char *res, guf_str_view *str);
|
||||||
|
|
||||||
extern const char* const GUF_UTF8_WHITESPACE[25];
|
extern const char* const GUF_UTF8_WHITESPACE[25];
|
||||||
extern const char* const GUF_UTF8_COMMON_PUNCT[29];
|
extern const char* const GUF_UTF8_COMMON_PUNCT[31];
|
||||||
|
|
||||||
extern const guf_utf8_char GUF_UTF8_REPLACEMENT_CHAR; // Replacement character "<22>" (U+FFFD)
|
extern const guf_utf8_char GUF_UTF8_REPLACEMENT_CHAR; // Replacement character "<22>" (U+FFFD)
|
||||||
#define GUF_UTF8_REPLACEMENT_CHAR_CODEPOINT UINT32_C(0xFFFD)
|
#define GUF_UTF8_REPLACEMENT_CHAR_CODEPOINT UINT32_C(0xFFFD)
|
||||||
@ -57,9 +57,9 @@ const char* const GUF_UTF8_WHITESPACE[25] =
|
|||||||
};
|
};
|
||||||
|
|
||||||
// Common punctuation (TODO: make more exhaustive; use \x escapes)
|
// Common punctuation (TODO: make more exhaustive; use \x escapes)
|
||||||
const char* const GUF_UTF8_COMMON_PUNCT[29] =
|
const char* const GUF_UTF8_COMMON_PUNCT[31] =
|
||||||
{
|
{
|
||||||
".", ",", ";", ":", "(", ")", "[", "]", "!", "?", "¿", "¡", "&", "+", "-", "/", "*", "\"", "'", "„", "“", "´", "`", "\\", "%", "‒", "–", "—", "—"
|
".", ",", ";", ":", "(", ")", "[", "]", "!", "?", "¿", "¡", "&", "+", "-", "/", "*", "\"", "'", "„", "“", "´", "»", "«", "`", "\\", "%", "‒", "–", "—", "—"
|
||||||
};
|
};
|
||||||
|
|
||||||
const guf_utf8_char GUF_UTF8_REPLACEMENT_CHAR = {.bytes = {'\xEF','\xBF','\xBD', '\0', '\0'}};
|
const guf_utf8_char GUF_UTF8_REPLACEMENT_CHAR = {.bytes = {'\xEF','\xBF','\xBD', '\0', '\0'}};
|
||||||
|
|||||||
@ -15,6 +15,7 @@
|
|||||||
#define GUF_DICT_VAL_T int32_t
|
#define GUF_DICT_VAL_T int32_t
|
||||||
#define GUF_DICT_VAL_T_IS_INTEGRAL_TYPE
|
#define GUF_DICT_VAL_T_IS_INTEGRAL_TYPE
|
||||||
#define GUF_DICT_NAME dict_sv_i32
|
#define GUF_DICT_NAME dict_sv_i32
|
||||||
|
// #define GUF_DICT_PROBE_LINEAR
|
||||||
#define GUF_DICT_IMPL
|
#define GUF_DICT_IMPL
|
||||||
#include "guf_dict.h"
|
#include "guf_dict.h"
|
||||||
|
|
||||||
|
|||||||
@ -11,7 +11,6 @@ extern "C"
|
|||||||
|
|
||||||
struct DictSvToIntTest : public Test
|
struct DictSvToIntTest : public Test
|
||||||
{
|
{
|
||||||
|
|
||||||
DictSvToIntTest(const std::string& name) : Test(name) {};
|
DictSvToIntTest(const std::string& name) : Test(name) {};
|
||||||
|
|
||||||
private:
|
private:
|
||||||
@ -81,8 +80,8 @@ struct DictSvToIntTest : public Test
|
|||||||
TEST_CHECK(i == dict_sv_i32_size(&word_cnt_dict));
|
TEST_CHECK(i == dict_sv_i32_size(&word_cnt_dict));
|
||||||
TEST_CHECK(i == std::ssize(word_cnt_map));
|
TEST_CHECK(i == std::ssize(word_cnt_map));
|
||||||
|
|
||||||
// std::cout << "load fac: " << dict_sv_i32_load_factor(&word_cnt_dict) << ", cap: " << word_cnt_dict.kv_indices_cap << "\n";
|
std::cout << "load fac: " << dict_sv_i32_load_factor(&word_cnt_dict) << ", cap: " << word_cnt_dict.kv_indices_cap << "\n";
|
||||||
// std::cout << "size: " << dict_sv_i32_size(&word_cnt_dict) << ", max probelen: " << word_cnt_dict.max_probelen << "\n";
|
std::cout << "size: " << dict_sv_i32_size(&word_cnt_dict) << ", max probelen: " << word_cnt_dict.max_probelen << "\n";
|
||||||
|
|
||||||
dict_sv_i32_free(&word_cnt_dict, NULL);
|
dict_sv_i32_free(&word_cnt_dict, NULL);
|
||||||
bool dbuf_null = !word_cnt_dict.kv_elems.data && !word_cnt_dict.kv_elems.allocator && !word_cnt_dict.kv_elems.capacity && !word_cnt_dict.kv_elems.size;
|
bool dbuf_null = !word_cnt_dict.kv_elems.data && !word_cnt_dict.kv_elems.allocator && !word_cnt_dict.kv_elems.capacity && !word_cnt_dict.kv_elems.size;
|
||||||
|
|||||||
@ -375,7 +375,7 @@ struct UTF8Test : public Test
|
|||||||
int words = count_words(TEST_DATA_DIR "/" "utf8-test.txt", &delims);
|
int words = count_words(TEST_DATA_DIR "/" "utf8-test.txt", &delims);
|
||||||
TEST_CHECK(words == 422);
|
TEST_CHECK(words == 422);
|
||||||
int words_with_delims = count_words_with_delims(TEST_DATA_DIR "/" "utf8-test.txt", &delims);
|
int words_with_delims = count_words_with_delims(TEST_DATA_DIR "/" "utf8-test.txt", &delims);
|
||||||
TEST_CHECK(words_with_delims == 947);
|
TEST_CHECK(words_with_delims == 949);
|
||||||
dbuf_str_view_free(&delims, NULL);
|
dbuf_str_view_free(&delims, NULL);
|
||||||
|
|
||||||
encode_decode();
|
encode_decode();
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user