From 8b02eff3b7211c1ab1d8e985502bb263b744335b Mon Sep 17 00:00:00 2001 From: jun <83899451+zeichensystem@users.noreply.github.com> Date: Thu, 27 Feb 2025 13:02:28 +0100 Subject: [PATCH] Make preserved_delims work (in guf_str_next_tok) --- src/guf_utf8.h | 24 +++++- src/test/data/utf8-test.txt | 18 +++++ src/test/test.cpp | 5 ++ src/test/test_dict.hpp | 18 +---- src/test/test_utf8.hpp | 156 ++++++++++++++++++++++++++++++++++++ 5 files changed, 201 insertions(+), 20 deletions(-) create mode 100644 src/test/test_utf8.hpp diff --git a/src/guf_utf8.h b/src/guf_utf8.h index d92b729..f96c824 100644 --- a/src/guf_utf8.h +++ b/src/guf_utf8.h @@ -257,12 +257,30 @@ GUF_FN_KEYWORDS guf_str_view guf_str_next_tok(guf_str_view *input, const guf_str guf_str_view delim_candidate = guf_substr_view(prev_input, 0, delim_len); for (ptrdiff_t delim_i = 0; delim_i < num_delims; ++delim_i) { if (guf_str_view_equal(&delim_candidate, delims + delim_i)) { // Found delim. - input->len = prev_input.len - delim_len; - input->str = prev_input.len > 0 ? prev_input.str + delim_len : NULL; + bool preserved = false; if (preserved_delims && num_preserved_delims > 0) { - return delim_candidate; + for (ptrdiff_t preserved_i = 0; preserved_i < num_preserved_delims; ++preserved_i) { + if (guf_str_view_equal(&delim_candidate, preserved_delims + preserved_i)) { + preserved = true; + break; + } + } } + if (!preserved) { + input->len = prev_input.len - delim_len; + input->str = prev_input.len > 0 ? prev_input.str + delim_len : NULL; + GUF_ASSERT(input->len >= 0); + } else { + input->str -= num_bytes; + input->len += num_bytes; + } + if (tok.len == 0) { + if (preserved) { + input->str += num_bytes; + input->len -= num_bytes; + return delim_candidate; + } tok.str = input->str; goto end; } else { diff --git a/src/test/data/utf8-test.txt b/src/test/data/utf8-test.txt index f7e4dbf..dbbacd7 100644 --- a/src/test/data/utf8-test.txt +++ b/src/test/data/utf8-test.txt @@ -44,6 +44,24 @@ Kæmi ný öxi hér ykist þjófum nú bæði víl og ádrepa. Pijamalı hasta, yağız şoföre çabucak güvendi. +Albert osti fagotin ja töräytti puhkuvan melodian. + +דג סקרן שט בים מאוכזב ולפתע מצא חברה + +نص حكيم له سر قاطع وذو شأن عظيم مكتوب على ثوب أخضر ومغلف بجلد أزرق + +بر اثر چنین تلقین و شستشوی مغزی جامعی، سطح و پایهٔ ذهن و فهم و نظر بعضی اشخاص واژگونه و معکوس می‌شود + +키스의 고유조건은 입술끼리 만나야 하고 특별한 기술은 필요치 않다. + +いろはにほへとちりぬるを +わかよたれそつねならむ +うゐのおくやまけふこえて +あさきゆめみしゑひもせす + +イロハニホヘト チリヌルヲ ワカヨタレソ ツネナラム +ウヰノオクヤマ ケフコエテ アサキユメミシ ヱヒモセスン + ᚠᛇᚻ᛫ᛒᛦᚦ᛫ᚠᚱᚩᚠᚢᚱ᛫ᚠᛁᚱᚪ᛫ᚷᛖᚻᚹᛦᛚᚳᚢᛗ ᛋᚳᛖᚪᛚ᛫ᚦᛖᚪᚻ᛫ᛗᚪᚾᚾᚪ᛫ᚷᛖᚻᚹᛦᛚᚳ᛫ᛗᛁᚳᛚᚢᚾ᛫ᚻᛦᛏ᛫ᛞᚫᛚᚪᚾ ᚷᛁᚠ᛫ᚻᛖ᛫ᚹᛁᛚᛖ᛫ᚠᚩᚱ᛫ᛞᚱᛁᚻᛏᚾᛖ᛫ᛞᚩᛗᛖᛋ᛫ᚻᛚᛇᛏᚪᚾ᛬ \ No newline at end of file diff --git a/src/test/test.cpp b/src/test/test.cpp index 14ab8f9..fde5183 100644 --- a/src/test/test.cpp +++ b/src/test/test.cpp @@ -9,6 +9,7 @@ extern "C" { #include "test_dbuf.hpp" #include "test_dict.hpp" +#include "test_utf8.hpp" std::unordered_set> g_tests {}; @@ -25,6 +26,10 @@ void init_tests() test = std::make_unique("DictCstrToIntTest"); GUF_ASSERT_RELEASE(test.get()); g_tests.insert(std::move(test)); + + test = std::make_unique("UTF8Test"); + GUF_ASSERT_RELEASE(test.get()); + g_tests.insert(std::move(test)); } int main() diff --git a/src/test/test_dict.hpp b/src/test/test_dict.hpp index b93dd3c..12465b3 100644 --- a/src/test/test_dict.hpp +++ b/src/test/test_dict.hpp @@ -39,26 +39,10 @@ struct DictCstrToIntTest : public Test guf_str_view tok; while ((tok = guf_str_next_tok(&input_str, delims.data, delims.size, NULL, -1)).len) { // printf("tok_len: %td ", tok.len); - printf("'%.*s'\n", (int)tok.len, tok.str); + // printf("'%.*s'\n", (int)tok.len, tok.str); } dbuf_str_view_free(&delims, NULL); - // ptrdiff_t valid_chars = 0, invalid_chars = 0, bytes = 0; - // guf_utf8_char ch = {}; - // for (guf_utf8_stat stat = guf_utf8_char_next(&ch, &input_str); stat != GUF_UTF8_READ_DONE; stat = guf_utf8_char_next(&ch, &input_str)) { - // if (stat == GUF_UTF8_READ_VALID) { - // ++valid_chars; - // printf("%s", ch.bytes); - // } else { - // ++invalid_chars; - // printf("::INVALID_UTF8_CHAR::"); - // } - // bytes += guf_utf8_char_num_bytes(&ch); - // } - // TEST_CHECK(input_str.len == 0 && input_str.str == NULL); - // printf("\nread %td bytes\n", bytes); - // printf("read %td valid and %td invalid utf-8 characters\n", valid_chars, invalid_chars); - dict_cstr_int_free(&word_cnt_dict, NULL); bool dbuf_null = !word_cnt_dict.kv_elems.data && !word_cnt_dict.kv_elems.allocator && !word_cnt_dict.kv_elems.capacity && !word_cnt_dict.kv_elems.size; TEST_CHECK(!dbuf_null && !word_cnt_dict.kv_indices && !word_cnt_dict.kv_indices_cap && !word_cnt_dict.max_probelen && !word_cnt_dict.num_tombstones); diff --git a/src/test/test_utf8.hpp b/src/test/test_utf8.hpp new file mode 100644 index 0000000..02a1830 --- /dev/null +++ b/src/test/test_utf8.hpp @@ -0,0 +1,156 @@ +#include +#include "test.hpp" + +extern "C" +{ + #include "guf_alloc_libc.h" + #include "guf_dict_impl.h" + #include "guf_dbuf_impl.h" + #include "guf_utf8.h" + #include "guf_str.h" +} + +struct UTF8Test : public Test +{ + + UTF8Test(const std::string& name) : Test(name) {}; + + private: + dbuf_char text_buf {}; + std::vector text_vec; + + bool load_text(const char *fname) + { + FILE *in_file {nullptr}; + if (!in_file) { + in_file = fopen(fname, "r"); + } + + if (!in_file) { + return false; + } + + dbuf_char_init(&text_buf, 128, &guf_allocator_libc); + + int c = EOF; + while ((c = fgetc(in_file)) != EOF) { + dbuf_char_push_val(&text_buf, (char)c); + text_vec.push_back((char)c); + } + fclose(in_file); + + return TEST_CHECK(std::ssize(text_vec) == text_buf.size); + } + + void free_text() + { + dbuf_char_free(&text_buf, NULL); + text_vec.clear(); + } + + + void read_utf8_chars(const char *fname, ptrdiff_t *n_valid, ptrdiff_t *n_invalid) + { + GUF_ASSERT_RELEASE(load_text(fname)); + + ptrdiff_t valid_chars = 0, invalid_chars = 0, bytes = 0; + guf_str_view input_str = {.str = text_buf.data, .len = text_buf.size}; + guf_utf8_char ch = {}; + for (guf_utf8_stat stat = guf_utf8_char_next(&ch, &input_str); stat != GUF_UTF8_READ_DONE; stat = guf_utf8_char_next(&ch, &input_str)) { + if (stat == GUF_UTF8_READ_VALID) { + ++valid_chars; + // printf("%s", ch.bytes); + } else { + ++invalid_chars; + // printf("::INVALID_UTF8_CHAR::"); + } + bytes += guf_utf8_char_num_bytes(&ch); + } + TEST_CHECK(input_str.len == 0 && input_str.str == NULL); + TEST_CHECK(bytes == text_buf.size); + + // printf("\nread %td bytes\n", bytes); + // printf("read %td valid and %td invalid utf-8 characters\n", valid_chars, invalid_chars); + + free_text(); + + if (n_valid) + *n_valid = valid_chars; + if (n_invalid) + *n_invalid = invalid_chars; + } + + int count_words(const char *fname, const dbuf_str_view *delims) + { + GUF_ASSERT_RELEASE(load_text(fname)); + + int num_words = 0; + + guf_str_view input_str = {.str = text_buf.data, .len = text_buf.size}; + guf_str_view tok; + while ((tok = guf_str_next_tok(&input_str, delims->data, delims->size, NULL, -1)).len) { + // printf("tok_len: %td ", tok.len); + // printf("'%.*s'\n", (int)tok.len, tok.str); + ++num_words; + } + + free_text(); + return num_words; + } + + int count_words_with_delims(const char *fname, const dbuf_str_view *delims) + { + GUF_ASSERT_RELEASE(load_text(fname)); + + int num_words = 0; + guf_str_view input_str = {.str = text_buf.data, .len = text_buf.size}; + guf_str_view tok; + while ((tok = guf_str_next_tok(&input_str, delims->data, delims->size, delims->data, delims->size)).len) { + // if (tok.str[0] == '\n') { + // printf("'\\n'\n"); + // } else { + // printf("'%.*s'\n", (int)tok.len, tok.str); + // } + ++num_words; + } + free_text(); + return num_words; + } + + public: + + bool run() + { + if (done) { + return passed; + } + + ptrdiff_t valid = 0, invalid = 0; + read_utf8_chars(TEST_DATA_DIR "/" "utf8-test.txt", &valid, &invalid); + TEST_CHECK(valid == 2634 && invalid == 0); + + dbuf_str_view delims = dbuf_str_view_new(&guf_allocator_libc); + for (size_t i = 0; i < GUF_STATIC_BUF_SIZE(guf_utf8_whitespace); ++i) { + guf_str_view d = {.len = (ptrdiff_t)strlen(guf_utf8_whitespace[i]), .str = guf_utf8_whitespace[i]}; + dbuf_str_view_push_val(&delims, d); + } + for (size_t i = 0; i < GUF_STATIC_BUF_SIZE(guf_utf8_punctuation); ++i) { + guf_str_view d = {.len = (ptrdiff_t)strlen(guf_utf8_punctuation[i]), .str = guf_utf8_punctuation[i]}; + dbuf_str_view_push_val(&delims, d); + } + + int words = count_words(TEST_DATA_DIR "/" "utf8-test.txt", &delims); + printf("words %d\n", words); + TEST_CHECK(words == 422); + + int words_with_delims = count_words_with_delims(TEST_DATA_DIR "/" "utf8-test.txt", &delims); + TEST_CHECK(words_with_delims == 947); + + dbuf_str_view_free(&delims, NULL); + + done = true; + passed = (num_failed_checks == 0); + return passed; + } + +};