Make preserved_delims work (in guf_str_next_tok)

This commit is contained in:
jun 2025-02-27 13:02:28 +01:00
parent 217622d816
commit 8b02eff3b7
5 changed files with 201 additions and 20 deletions

View File

@ -257,12 +257,30 @@ GUF_FN_KEYWORDS guf_str_view guf_str_next_tok(guf_str_view *input, const guf_str
guf_str_view delim_candidate = guf_substr_view(prev_input, 0, delim_len); guf_str_view delim_candidate = guf_substr_view(prev_input, 0, delim_len);
for (ptrdiff_t delim_i = 0; delim_i < num_delims; ++delim_i) { for (ptrdiff_t delim_i = 0; delim_i < num_delims; ++delim_i) {
if (guf_str_view_equal(&delim_candidate, delims + delim_i)) { // Found delim. if (guf_str_view_equal(&delim_candidate, delims + delim_i)) { // Found delim.
bool preserved = false;
if (preserved_delims && num_preserved_delims > 0) {
for (ptrdiff_t preserved_i = 0; preserved_i < num_preserved_delims; ++preserved_i) {
if (guf_str_view_equal(&delim_candidate, preserved_delims + preserved_i)) {
preserved = true;
break;
}
}
}
if (!preserved) {
input->len = prev_input.len - delim_len; input->len = prev_input.len - delim_len;
input->str = prev_input.len > 0 ? prev_input.str + delim_len : NULL; input->str = prev_input.len > 0 ? prev_input.str + delim_len : NULL;
if (preserved_delims && num_preserved_delims > 0) { GUF_ASSERT(input->len >= 0);
} else {
input->str -= num_bytes;
input->len += num_bytes;
}
if (tok.len == 0) {
if (preserved) {
input->str += num_bytes;
input->len -= num_bytes;
return delim_candidate; return delim_candidate;
} }
if (tok.len == 0) {
tok.str = input->str; tok.str = input->str;
goto end; goto end;
} else { } else {

View File

@ -44,6 +44,24 @@ Kæmi ný öxi hér ykist þjófum nú bæði víl og ádrepa.
Pijamalı hasta, yağız şoföre çabucak güvendi. Pijamalı hasta, yağız şoföre çabucak güvendi.
Albert osti fagotin ja töräytti puhkuvan melodian.
דג סקרן שט בים מאוכזב ולפתע מצא חברה
نص حكيم له سر قاطع وذو شأن عظيم مكتوب على ثوب أخضر ومغلف بجلد أزرق
بر اثر چنین تلقین و شستشوی مغزی جامعی، سطح و پایهٔ ذهن و فهم و نظر بعضی اشخاص واژگونه و معکوس می‌شود
키스의 고유조건은 입술끼리 만나야 하고 특별한 기술은 필요치 않다.
いろはにほへとちりぬるを
わかよたれそつねならむ
うゐのおくやまけふこえて
あさきゆめみしゑひもせす
イロハニホヘト チリヌルヲ ワカヨタレソ ツネナラム
ウヰノオクヤマ ケフコエテ アサキユメミシ ヱヒモセスン
ᚠᛇᚻ᛫ᛒᛦᚦ᛫ᚠᚱᚩᚠᚢᚱ᛫ᚠᛁᚱᚪ᛫ᚷᛖᚻᚹᛦᛚᚳᚢᛗ ᚠᛇᚻ᛫ᛒᛦᚦ᛫ᚠᚱᚩᚠᚢᚱ᛫ᚠᛁᚱᚪ᛫ᚷᛖᚻᚹᛦᛚᚳᚢᛗ
ᛋᚳᛖᚪᛚ᛫ᚦᛖᚪᚻ᛫ᛗᚪᚾᚾᚪ᛫ᚷᛖᚻᚹᛦᛚᚳ᛫ᛗᛁᚳᛚᚢᚾ᛫ᚻᛦᛏ᛫ᛞᚫᛚᚪᚾ ᛋᚳᛖᚪᛚ᛫ᚦᛖᚪᚻ᛫ᛗᚪᚾᚾᚪ᛫ᚷᛖᚻᚹᛦᛚᚳ᛫ᛗᛁᚳᛚᚢᚾ᛫ᚻᛦᛏ᛫ᛞᚫᛚᚪᚾ
ᚷᛁᚠ᛫ᚻᛖ᛫ᚹᛁᛚᛖ᛫ᚠᚩᚱ᛫ᛞᚱᛁᚻᛏᚾᛖ᛫ᛞᚩᛗᛖᛋ᛫ᚻᛚᛇᛏᚪᚾ᛬ ᚷᛁᚠ᛫ᚻᛖ᛫ᚹᛁᛚᛖ᛫ᚠᚩᚱ᛫ᛞᚱᛁᚻᛏᚾᛖ᛫ᛞᚩᛗᛖᛋ᛫ᚻᛚᛇᛏᚪᚾ᛬

View File

@ -9,6 +9,7 @@ extern "C" {
#include "test_dbuf.hpp" #include "test_dbuf.hpp"
#include "test_dict.hpp" #include "test_dict.hpp"
#include "test_utf8.hpp"
std::unordered_set<std::unique_ptr<Test>> g_tests {}; std::unordered_set<std::unique_ptr<Test>> g_tests {};
@ -25,6 +26,10 @@ void init_tests()
test = std::make_unique<DictCstrToIntTest>("DictCstrToIntTest"); test = std::make_unique<DictCstrToIntTest>("DictCstrToIntTest");
GUF_ASSERT_RELEASE(test.get()); GUF_ASSERT_RELEASE(test.get());
g_tests.insert(std::move(test)); g_tests.insert(std::move(test));
test = std::make_unique<UTF8Test>("UTF8Test");
GUF_ASSERT_RELEASE(test.get());
g_tests.insert(std::move(test));
} }
int main() int main()

View File

@ -39,26 +39,10 @@ struct DictCstrToIntTest : public Test
guf_str_view tok; guf_str_view tok;
while ((tok = guf_str_next_tok(&input_str, delims.data, delims.size, NULL, -1)).len) { while ((tok = guf_str_next_tok(&input_str, delims.data, delims.size, NULL, -1)).len) {
// printf("tok_len: %td ", tok.len); // printf("tok_len: %td ", tok.len);
printf("'%.*s'\n", (int)tok.len, tok.str); // printf("'%.*s'\n", (int)tok.len, tok.str);
} }
dbuf_str_view_free(&delims, NULL); dbuf_str_view_free(&delims, NULL);
// ptrdiff_t valid_chars = 0, invalid_chars = 0, bytes = 0;
// guf_utf8_char ch = {};
// for (guf_utf8_stat stat = guf_utf8_char_next(&ch, &input_str); stat != GUF_UTF8_READ_DONE; stat = guf_utf8_char_next(&ch, &input_str)) {
// if (stat == GUF_UTF8_READ_VALID) {
// ++valid_chars;
// printf("%s", ch.bytes);
// } else {
// ++invalid_chars;
// printf("::INVALID_UTF8_CHAR::");
// }
// bytes += guf_utf8_char_num_bytes(&ch);
// }
// TEST_CHECK(input_str.len == 0 && input_str.str == NULL);
// printf("\nread %td bytes\n", bytes);
// printf("read %td valid and %td invalid utf-8 characters\n", valid_chars, invalid_chars);
dict_cstr_int_free(&word_cnt_dict, NULL); dict_cstr_int_free(&word_cnt_dict, NULL);
bool dbuf_null = !word_cnt_dict.kv_elems.data && !word_cnt_dict.kv_elems.allocator && !word_cnt_dict.kv_elems.capacity && !word_cnt_dict.kv_elems.size; bool dbuf_null = !word_cnt_dict.kv_elems.data && !word_cnt_dict.kv_elems.allocator && !word_cnt_dict.kv_elems.capacity && !word_cnt_dict.kv_elems.size;
TEST_CHECK(!dbuf_null && !word_cnt_dict.kv_indices && !word_cnt_dict.kv_indices_cap && !word_cnt_dict.max_probelen && !word_cnt_dict.num_tombstones); TEST_CHECK(!dbuf_null && !word_cnt_dict.kv_indices && !word_cnt_dict.kv_indices_cap && !word_cnt_dict.max_probelen && !word_cnt_dict.num_tombstones);

156
src/test/test_utf8.hpp Normal file
View File

@ -0,0 +1,156 @@
#include <vector>
#include "test.hpp"
extern "C"
{
#include "guf_alloc_libc.h"
#include "guf_dict_impl.h"
#include "guf_dbuf_impl.h"
#include "guf_utf8.h"
#include "guf_str.h"
}
struct UTF8Test : public Test
{
UTF8Test(const std::string& name) : Test(name) {};
private:
dbuf_char text_buf {};
std::vector<char> text_vec;
bool load_text(const char *fname)
{
FILE *in_file {nullptr};
if (!in_file) {
in_file = fopen(fname, "r");
}
if (!in_file) {
return false;
}
dbuf_char_init(&text_buf, 128, &guf_allocator_libc);
int c = EOF;
while ((c = fgetc(in_file)) != EOF) {
dbuf_char_push_val(&text_buf, (char)c);
text_vec.push_back((char)c);
}
fclose(in_file);
return TEST_CHECK(std::ssize(text_vec) == text_buf.size);
}
void free_text()
{
dbuf_char_free(&text_buf, NULL);
text_vec.clear();
}
void read_utf8_chars(const char *fname, ptrdiff_t *n_valid, ptrdiff_t *n_invalid)
{
GUF_ASSERT_RELEASE(load_text(fname));
ptrdiff_t valid_chars = 0, invalid_chars = 0, bytes = 0;
guf_str_view input_str = {.str = text_buf.data, .len = text_buf.size};
guf_utf8_char ch = {};
for (guf_utf8_stat stat = guf_utf8_char_next(&ch, &input_str); stat != GUF_UTF8_READ_DONE; stat = guf_utf8_char_next(&ch, &input_str)) {
if (stat == GUF_UTF8_READ_VALID) {
++valid_chars;
// printf("%s", ch.bytes);
} else {
++invalid_chars;
// printf("::INVALID_UTF8_CHAR::");
}
bytes += guf_utf8_char_num_bytes(&ch);
}
TEST_CHECK(input_str.len == 0 && input_str.str == NULL);
TEST_CHECK(bytes == text_buf.size);
// printf("\nread %td bytes\n", bytes);
// printf("read %td valid and %td invalid utf-8 characters\n", valid_chars, invalid_chars);
free_text();
if (n_valid)
*n_valid = valid_chars;
if (n_invalid)
*n_invalid = invalid_chars;
}
int count_words(const char *fname, const dbuf_str_view *delims)
{
GUF_ASSERT_RELEASE(load_text(fname));
int num_words = 0;
guf_str_view input_str = {.str = text_buf.data, .len = text_buf.size};
guf_str_view tok;
while ((tok = guf_str_next_tok(&input_str, delims->data, delims->size, NULL, -1)).len) {
// printf("tok_len: %td ", tok.len);
// printf("'%.*s'\n", (int)tok.len, tok.str);
++num_words;
}
free_text();
return num_words;
}
int count_words_with_delims(const char *fname, const dbuf_str_view *delims)
{
GUF_ASSERT_RELEASE(load_text(fname));
int num_words = 0;
guf_str_view input_str = {.str = text_buf.data, .len = text_buf.size};
guf_str_view tok;
while ((tok = guf_str_next_tok(&input_str, delims->data, delims->size, delims->data, delims->size)).len) {
// if (tok.str[0] == '\n') {
// printf("'\\n'\n");
// } else {
// printf("'%.*s'\n", (int)tok.len, tok.str);
// }
++num_words;
}
free_text();
return num_words;
}
public:
bool run()
{
if (done) {
return passed;
}
ptrdiff_t valid = 0, invalid = 0;
read_utf8_chars(TEST_DATA_DIR "/" "utf8-test.txt", &valid, &invalid);
TEST_CHECK(valid == 2634 && invalid == 0);
dbuf_str_view delims = dbuf_str_view_new(&guf_allocator_libc);
for (size_t i = 0; i < GUF_STATIC_BUF_SIZE(guf_utf8_whitespace); ++i) {
guf_str_view d = {.len = (ptrdiff_t)strlen(guf_utf8_whitespace[i]), .str = guf_utf8_whitespace[i]};
dbuf_str_view_push_val(&delims, d);
}
for (size_t i = 0; i < GUF_STATIC_BUF_SIZE(guf_utf8_punctuation); ++i) {
guf_str_view d = {.len = (ptrdiff_t)strlen(guf_utf8_punctuation[i]), .str = guf_utf8_punctuation[i]};
dbuf_str_view_push_val(&delims, d);
}
int words = count_words(TEST_DATA_DIR "/" "utf8-test.txt", &delims);
printf("words %d\n", words);
TEST_CHECK(words == 422);
int words_with_delims = count_words_with_delims(TEST_DATA_DIR "/" "utf8-test.txt", &delims);
TEST_CHECK(words_with_delims == 947);
dbuf_str_view_free(&delims, NULL);
done = true;
passed = (num_failed_checks == 0);
return passed;
}
};