From 1013616b2d6c38d346635c191c8259bb1edc9ced Mon Sep 17 00:00:00 2001 From: jun <83899451+zeichensystem@users.noreply.github.com> Date: Fri, 2 May 2025 21:54:33 +0200 Subject: [PATCH] Re-implement guf_str tokeniser --- src/guf_str.h | 262 +++++++++++++++++++++++++++--------- src/test/data/utf8-test.txt | 2 +- src/test/test_dict.hpp | 15 ++- src/test/test_utf8.hpp | 38 +++--- 4 files changed, 226 insertions(+), 91 deletions(-) diff --git a/src/guf_str.h b/src/guf_str.h index f0630ff..0d88743 100644 --- a/src/guf_str.h +++ b/src/guf_str.h @@ -50,6 +50,20 @@ typedef struct guf_str { guf_allocator *allocator; // Wasteful (8 bytes on 64-bit platforms...), but keeping this pointer also allows us to have "read-only strings" (a string is read-only if allocator == NULL) } guf_str; // Total: 32 bytes on 64-bit platforms, 16 bytes on 32-bit platforms. +typedef enum guf_str_tok_delim_opt { /* Delimiter-matching policy for guf_str_tok_state_new; it decides how the delims array is ordered before tokenising. */ + GUF_STR_TOK_DELIM_OPT_MATCH_LONGEST = 0, /* delims get sorted by descending length */ + GUF_STR_TOK_DELIM_OPT_MATCH_SHORTEST, /* delims get sorted by ascending length */ + GUF_STR_TOK_DELIM_OPT_MATCH_IN_ORDER, /* delims are left in the caller's order */ +} guf_str_tok_delim_opt; + +typedef struct guf_str_tok_state { /* Iteration state for guf_str_tok_next. */ + guf_str_view input; /* Not-yet-consumed remainder of the string; shrinks as tokens and delimiters are read. */ + guf_str_view cur_tok, cur_delim; /* Results of the most recent guf_str_tok_next call. */ + const guf_str_view *delims; /* Delimiter array supplied to guf_str_tok_state_new (not copied). */ + const ptrdiff_t delim_count; /* Number of entries in delims; 0 if there is no usable delimiter. */ + ptrdiff_t num_toks_read, num_delims_read; /* Running counters updated by guf_str_tok_next; num_delims_read also counts delimiters that are skipped when preserve_delims is false. */ + bool done; /* Set once the input is exhausted; later guf_str_tok_next calls return false. */ +} guf_str_tok_state; #define GUF_CSTR_TO_VIEW(CSTR) ((guf_str_view){.str = (CSTR), .len = (ptrdiff_t)strlen((CSTR))}) #define GUF_CSTR_LIT_TO_VIEW(CSTR) ((guf_str_view){.str = (CSTR), .len = (ptrdiff_t)sizeof((CSTR)) - 1}) @@ -88,8 +102,29 @@ GUF_STR_KWRDS guf_str_view guf_str_view_trim_right_ascii(guf_str_view sv); // Return true if sv does not violate any of its invariants (.len must be >= 0, .str must not be NULL unless len is 0) GUF_STR_KWRDS bool guf_str_view_is_valid(guf_str_view sv); -// Return the guf_str_view corresponding to the next token (delimiters (each can be more than once character) given by delims, preserved_delims are delimiters which are returned as tokens when encountered) -GUF_STR_KWRDS guf_str_view 
guf_str_next_tok(guf_str_view *input, const guf_str_view *delims, ptrdiff_t num_delims, const guf_str_view *preserved_delims, ptrdiff_t num_preserved_delims); +/* + Return the substring up to the first delimiter "delim" and advance src to one past the delim (so the function can be called repeatedly) + cf. "str_pop_first_split": + - https://accu.org/conf-docs/PDFs_2021/luca_sass_modern_c_and_what_we_can_learn_from_it.pdf ("String handling in Modern C", page 128 of the pdf) + - https://youtu.be/QpAhX-gsHMs?si=lCvm6o60LrYHaAHc&t=3059 (last-retrieved 2025-04-30) +*/ +GUF_STR_KWRDS guf_str_view guf_str_view_pop_split(guf_str_view *src, guf_str_view delim); + +// Create a new tokeniser-state for guf_str_tok_next. +GUF_STR_KWRDS guf_str_tok_state guf_str_tok_state_new(guf_str_view str, guf_str_view *delims, ptrdiff_t delim_count, guf_str_tok_delim_opt delim_match_opt); /* NOTE: for MATCH_LONGEST / MATCH_SHORTEST the delims array is sorted in place with qsort, and the returned state keeps pointing at that array. */ +/* + Return true when the next token (or delimiter if preserve_delims == true) was encountered. + Put the current token into state->cur_tok. + If preserve_delims is true, every delimiter will be put into state->cur_delim. + If preserve_delims is false, delimiters will only be put into state->cur_delim if the current token is not empty + (otherwise, empty tokens are skipped for preserve_delims == false). + + Example: delims = ["-", "+"] + - "-1+2": tok_next(preserve_delims=false) will set state->cur_tok = "1" and state->cur_delim = "+" + - "-1+2": tok_next(preserve_delims=true) will set state->cur_tok = "" and state->cur_delim = "-" + Set preserve_delims to false if you don't care about processing the delimiters +*/ +GUF_STR_KWRDS bool guf_str_tok_next(guf_str_tok_state *state, bool preserve_delims); // 2.) guf_str: @@ -1178,76 +1213,171 @@ GUF_STR_KWRDS bool guf_str_view_is_valid(guf_str_view sv) } } -GUF_STR_KWRDS guf_str_view guf_str_next_tok(guf_str_view *input, const guf_str_view *delims, ptrdiff_t num_delims, const guf_str_view *preserved_delims, ptrdiff_t num_preserved_delims) +/* + cf. 
"str_pop_first_split": + - https://accu.org/conf-docs/PDFs_2021/luca_sass_modern_c_and_what_we_can_learn_from_it.pdf ("String handling in Modern C", page 128 of the pdf) + - https://youtu.be/QpAhX-gsHMs?si=lCvm6o60LrYHaAHc&t=3059 (last-retrieved 2025-04-30) +*/ + +GUF_STR_KWRDS guf_str_view guf_str_view_pop_split(guf_str_view *src, guf_str_view delim) { - if (input->len <= 0 || input->str == NULL) { - return (guf_str_view){.str = NULL, .len = 0}; + GUF_ASSERT(src); + GUF_ASSERT_RELEASE(guf_str_view_is_valid(*src)); + GUF_ASSERT_RELEASE(guf_str_view_is_valid(delim)); + + if (delim.len <= 0) { + goto delim_not_found; /* An empty delimiter never matches; the whole of src is returned below. */ } - ptrdiff_t max_delim_len = -1; - for (ptrdiff_t i = 0; i < num_delims; ++i) { - if (delims[i].len > max_delim_len) { - max_delim_len = delims[i].len; - } - } - - guf_str_view tok = {.str = input->str, .len = 0}; - guf_str_view prev_input = *input; - guf_utf8_char ch = {0}; - - for (guf_utf8_stat stat = guf_utf8_char_next(&ch, input); stat != GUF_UTF8_READ_DONE; stat = guf_utf8_char_next(&ch, input)) { - if (stat != GUF_UTF8_READ_VALID) { - prev_input = *input; - continue; - } - - const int num_bytes = guf_utf8_char_num_bytes(&ch); - - for (ptrdiff_t delim_len = GUF_MIN(max_delim_len, prev_input.len); delim_len > 0; --delim_len) { - guf_str_view delim_candidate = guf_str_view_substr(prev_input, 0, delim_len); - for (ptrdiff_t delim_i = 0; delim_i < num_delims; ++delim_i) { - if (guf_str_view_equal(&delim_candidate, delims + delim_i)) { // Found delim. - bool preserved = false; - if (preserved_delims && num_preserved_delims > 0) { - for (ptrdiff_t preserved_i = 0; preserved_i < num_preserved_delims; ++preserved_i) { - if (guf_str_view_equal(&delim_candidate, preserved_delims + preserved_i)) { - preserved = true; - break; - } - } - } - if (!preserved) { - input->len = prev_input.len - delim_len; - input->str = prev_input.len > 0 ? 
prev_input.str + delim_len : NULL; - GUF_ASSERT(input->len >= 0); - } else { - input->str -= num_bytes; - input->len += num_bytes; - } - - if (tok.len == 0) { - if (preserved) { - input->str += num_bytes; - input->len -= num_bytes; - return delim_candidate; - } - tok.str = input->str; - goto end; - } else { - return tok; - } - } + for (ptrdiff_t src_idx = 0; src_idx < src->len; ++src_idx) { /* Try every byte offset of src as a possible start of delim. */ + ptrdiff_t num_matched = 0; + for (ptrdiff_t delim_idx = 0; delim_idx < delim.len && (src_idx + delim.len <= src->len); ++delim_idx) { + if (delim.str[delim_idx] != src->str[src_idx + delim_idx]) { + break; } + ++num_matched; } - tok.len += num_bytes; - - end:; - prev_input = *input; + if (num_matched == delim.len) { // Delimiter found in interval [src_idx, src_idx + delim.len) + const guf_str_view popped = guf_str_view_substr(*src, 0, src_idx); + const ptrdiff_t advance_len = popped.len + delim.len; + GUF_ASSERT(advance_len > 0 && advance_len >= delim.len); + src->len -= advance_len; + GUF_ASSERT(src->len >= 0); + src->str = src->len > 0 ? src->str + advance_len : NULL; /* src becomes {NULL, 0} when nothing is left after the delimiter. */ + return popped; + } } - - return tok; + +delim_not_found:; /* No delimiter in src: return all of src and leave src empty. */ + const guf_str_view popped = *src; + src->str = NULL; + src->len = 0; + return popped; } +static inline int guf_str_view_cmp_asc_by_len_(const void *a, const void *b) /* qsort comparator: orders guf_str_view elements by ascending .len. */ +{ + const guf_str_view *asv = (const guf_str_view*)a; + const guf_str_view *bsv = (const guf_str_view*)b; + if (asv->len < bsv->len) { + return -1; + } else if (asv->len > bsv->len) { + return 1; + } else { + return 0; + } +} + +static inline int guf_str_view_cmp_desc_by_len_(const void *a, const void *b) /* qsort comparator: orders guf_str_view elements by descending .len. */ +{ + return -guf_str_view_cmp_asc_by_len_(a, b); +} + +GUF_STR_KWRDS guf_str_tok_state guf_str_tok_state_new(guf_str_view str, guf_str_view *delims, ptrdiff_t delim_count, guf_str_tok_delim_opt delim_match_opt) +{ + GUF_ASSERT_RELEASE(guf_str_view_is_valid(str)); + GUF_ASSERT_RELEASE(delim_count > 0 ? 
delims != NULL : true); + + ptrdiff_t max_delim_len = 0; + if (delims && delim_count > 0) { + for (ptrdiff_t i = 0; i < delim_count; ++i) { + GUF_ASSERT_RELEASE(guf_str_view_is_valid(delims[i])); + max_delim_len = guf_max_ptrdiff_t(max_delim_len, delims[i].len); + } + } + if (max_delim_len <= 0 || delim_count <= 0 || delims == NULL) { /* No delimiter with a non-zero length: tokenise without delimiters. */ + delims = NULL; + delim_count = 0; + } else { + switch (delim_match_opt) { + case GUF_STR_TOK_DELIM_OPT_MATCH_LONGEST: + qsort(delims, delim_count, sizeof(delims[0]), guf_str_view_cmp_desc_by_len_); /* Sorts the caller's array in place. */ + break; + case GUF_STR_TOK_DELIM_OPT_MATCH_SHORTEST: + qsort(delims, delim_count, sizeof(delims[0]), guf_str_view_cmp_asc_by_len_); /* Sorts the caller's array in place. */ + break; + case GUF_STR_TOK_DELIM_OPT_MATCH_IN_ORDER: + break; + default: + GUF_ASSERT(false); + break; + } + } + return (guf_str_tok_state) { + .input = str, + .delims = delims, + .delim_count = delim_count, + .num_toks_read = 0, + .num_delims_read = 0, + .cur_tok = (guf_str_view){.len = 0, .str = NULL}, + .cur_delim = (guf_str_view){.len = 0, .str = NULL}, + .done = false + }; +} + +GUF_STR_KWRDS bool guf_str_tok_next(guf_str_tok_state *state, bool preserve_delims) +{ + GUF_ASSERT(state); + GUF_ASSERT(guf_str_view_is_valid(state->input)); + GUF_ASSERT(state->num_toks_read >= 0 && state->num_delims_read >= 0 && state->delim_count >= 0); + GUF_ASSERT(state->delim_count > 0 ? 
state->delims != NULL : true); + + if (state->done || state->input.len <= 0 || state->input.str == NULL) { /* Nothing left to read. */ + state->done = true; + state->cur_tok = (guf_str_view){.str = NULL, .len = 0}; + state->cur_delim = (guf_str_view){.str = NULL, .len = 0}; + return false; + } else if (state->delim_count <= 0 || state->delims == NULL) { /* No delimiters: the whole remaining input is the single (last) token. */ + state->done = true; + state->cur_tok = state->input; + state->cur_delim = (guf_str_view){.str = NULL, .len = 0}; + return state->cur_tok.len > 0; + } + +find_next_delim_begin: /* Start a fresh, empty token at the current input position. */ + state->cur_tok = state->cur_delim = (guf_str_view){.str = state->input.str, .len = 0}; + while (state->input.len > 0) { + GUF_ASSERT(state->input.str != NULL); + for (ptrdiff_t delim_idx = 0; delim_idx < state->delim_count; ++delim_idx) { // If state->delims is sorted descending/ascending by length, match the longest/shortest possible delim + const guf_str_view delim = state->delims[delim_idx]; + GUF_ASSERT(guf_str_view_is_valid(delim)); + if (delim.len > state->input.len || delim.len <= 0) { // Current delim cannot possibly match. + continue; + } + const guf_str_view delim_candidate = guf_str_view_substr(state->input, 0, delim.len); + if (guf_str_view_equal(&delim_candidate, &delim)) { // a) Matched the current delim: + GUF_ASSERT(state->input.len >= delim.len); + GUF_ASSERT(state->cur_tok.len >= 0); + state->cur_delim = delim; + state->num_delims_read += 1; + + state->input.len -= delim.len; + state->input.str = state->input.len > 0 ? state->input.str + delim.len : NULL; + + if (!preserve_delims && state->cur_tok.len == 0) { /* Delimiter with no token before it and the caller does not want delimiters: skip it and keep scanning. */ + goto find_next_delim_begin; + } + + state->num_toks_read += state->cur_tok.len > 0 ? 1 : 0; + state->done = state->input.len <= 0; + GUF_ASSERT(state->cur_tok.len > 0 || state->cur_delim.len > 0); + return true; + } + } + // b) Could not match any of the delims: + state->cur_tok.len += 1; /* Extend the current token by one byte. */ + state->input.len -= 1; + state->input.str = state->input.len > 0 ? 
state->input.str + 1 : NULL; + } + + state->done = true; /* Input ran out without another delimiter: cur_tok (if non-empty) is the final token. */ + GUF_ASSERT(state->cur_tok.len >= 0); + state->cur_delim = (guf_str_view){.str = NULL, .len = 0}; + if (state->cur_tok.len > 0) { + state->num_toks_read += 1; + } + return state->cur_tok.len > 0; +} GUF_STR_KWRDS guf_str_view guf_str_view_trim_left_ascii(guf_str_view sv) { @@ -1294,7 +1424,9 @@ GUF_STR_KWRDS guf_str_view guf_str_view_substr(guf_str_view str, ptrdiff_t pos, GUF_ASSERT(substr_len >= 0); GUF_ASSERT(substr_len <= str.len); - return (guf_str_view){.str = str.str + pos, .len = substr_len}; + const guf_str_view sub_sv = {.str = str.str + pos, .len = substr_len}; + GUF_ASSERT(guf_str_view_is_valid(sub_sv)); + return sub_sv; } GUF_STR_KWRDS guf_hash_size_t guf_str_view_hash(const guf_str_view *sv) diff --git a/src/test/data/utf8-test.txt b/src/test/data/utf8-test.txt index dbbacd7..c5e99ce 100644 --- a/src/test/data/utf8-test.txt +++ b/src/test/data/utf8-test.txt @@ -64,4 +64,4 @@ Albert osti fagotin ja töräytti puhkuvan melodian. 
ᚠᛇᚻ᛫ᛒᛦᚦ᛫ᚠᚱᚩᚠᚢᚱ᛫ᚠᛁᚱᚪ᛫ᚷᛖᚻᚹᛦᛚᚳᚢᛗ ᛋᚳᛖᚪᛚ᛫ᚦᛖᚪᚻ᛫ᛗᚪᚾᚾᚪ᛫ᚷᛖᚻᚹᛦᛚᚳ᛫ᛗᛁᚳᛚᚢᚾ᛫ᚻᛦᛏ᛫ᛞᚫᛚᚪᚾ -ᚷᛁᚠ᛫ᚻᛖ᛫ᚹᛁᛚᛖ᛫ᚠᚩᚱ᛫ᛞᚱᛁᚻᛏᚾᛖ᛫ᛞᚩᛗᛖᛋ᛫ᚻᛚᛇᛏᚪᚾ᛬ \ No newline at end of file +ᚷᛁᚠ᛫ᚻᛖ᛫ᚹᛁᛚᛖ᛫ᚠᚩᚱ᛫ᛞᚱᛁᚻᛏᚾᛖ᛫ᛞᚩᛗᛖᛋ᛫ᚻᛚᛇᛏᚪᚾ᛬ diff --git a/src/test/test_dict.hpp b/src/test/test_dict.hpp index 47e14ad..fb158c5 100644 --- a/src/test/test_dict.hpp +++ b/src/test/test_dict.hpp @@ -38,13 +38,14 @@ private: dbuf_str_view_push_val(&delims, d); } - guf_str_view input_str = {.str = text_buf.data, .len = text_buf.size}; - guf_str_view tok; - while ((tok = guf_str_next_tok(&input_str, delims.data, delims.size, NULL, -1)).len) { - if (tok.len <= 0) { - continue; - } - std::string_view sv(tok.str, tok.len); + guf_str_tok_state tok_state = guf_str_tok_state_new(guf_str_view{.str = text_buf.data, .len = text_buf.size}, delims.data, delims.size, GUF_STR_TOK_DELIM_OPT_MATCH_LONGEST); + while (guf_str_tok_next(&tok_state, true)) { + guf_str_view tok = tok_state.cur_tok; + // if (tok.len <= 0) { + // continue; + // } + std::string_view sv(tok.str , tok.len); + //std::cout << sv << std::string_view(tok_state.cur_delim.str, tok_state.cur_delim.len); TEST_CHECK(dict_sv_i32_contains(&word_cnt_dict, &tok) == word_cnt_map.contains(sv)); if (!dict_sv_i32_contains(&word_cnt_dict, &tok)) { dict_sv_i32_insert_val_arg(&word_cnt_dict, tok, 1, GUF_CPY_VALUE, GUF_CPY_VALUE); diff --git a/src/test/test_utf8.hpp b/src/test/test_utf8.hpp index 40d3c4a..a2dd213 100644 --- a/src/test/test_utf8.hpp +++ b/src/test/test_utf8.hpp @@ -85,11 +85,9 @@ private: int num_words = 0; - guf_str_view input_str = {.str = text_buf.data, .len = text_buf.size}; - guf_str_view tok; - while ((tok = guf_str_next_tok(&input_str, delims->data, delims->size, NULL, -1)).len) { - // printf("tok_len: %td ", tok.len); - // printf("'%.*s'\n", (int)tok.len, tok.str); + guf_str_tok_state tok_state = guf_str_tok_state_new(guf_str_view{.str = text_buf.data, .len = text_buf.size}, delims->data, delims->size, GUF_STR_TOK_DELIM_OPT_MATCH_LONGEST); + while (guf_str_tok_next(&tok_state, 
false)) { + TEST_CHECK(tok_state.cur_tok.len > 0); ++num_words; } @@ -101,19 +99,23 @@ private: { GUF_ASSERT_RELEASE(load_text(fname)); - int num_words = 0; - guf_str_view input_str = {.str = text_buf.data, .len = text_buf.size}; - guf_str_view tok; - while ((tok = guf_str_next_tok(&input_str, delims->data, delims->size, delims->data, delims->size)).len) { - // if (tok.str[0] == '\n') { - // printf("'\\n'\n"); - // } else { - // printf("'%.*s'\n", (int)tok.len, tok.str); - // } - ++num_words; + int num_words = 0, num_delims = 0; + guf_str_tok_state tok_state = guf_str_tok_state_new(guf_str_view{.str = text_buf.data, .len = text_buf.size}, delims->data, delims->size, GUF_STR_TOK_DELIM_OPT_MATCH_LONGEST); + while (guf_str_tok_next(&tok_state, true)) { + if (tok_state.cur_tok.len) { + ++num_words; + // printf("'%.*s'\n", (int)tok_state.cur_tok.len, tok_state.cur_tok.str); + } + if (tok_state.cur_delim.len) { + ++num_delims; + // if (tok_state.cur_delim.str[0] == '\n') + // printf("'\\n'\n"); + // else + // printf("'%.*s'\n", (int)tok_state.cur_delim.len, tok_state.cur_delim.str); + } } free_text(); - return num_words; + return num_words + num_delims; } void encode_decode_file(const char *fname) @@ -360,7 +362,7 @@ public: ptrdiff_t valid = 0, invalid = 0; read_utf8_chars(TEST_DATA_DIR "/" "utf8-test.txt", &valid, &invalid); - TEST_CHECK(valid == 2634 && invalid == 0); + TEST_CHECK(valid == 2635 && invalid == 0); read_utf8_chars(TEST_DATA_DIR "/" "bartleby.txt", &valid, &invalid); TEST_CHECK(valid > 16000 && invalid == 0); @@ -377,7 +379,7 @@ public: int words = count_words(TEST_DATA_DIR "/" "utf8-test.txt", &delims); TEST_CHECK(words == 422); int words_with_delims = count_words_with_delims(TEST_DATA_DIR "/" "utf8-test.txt", &delims); - TEST_CHECK(words_with_delims == 949); + TEST_CHECK(words_with_delims == 950); int words2 = count_words(TEST_DATA_DIR "/" "bartleby.txt", &delims); TEST_CHECK(words2 > 2048);