Make preserved_delims work (in guf_str_next_tok)
This commit is contained in:
parent
217622d816
commit
8b02eff3b7
@ -257,12 +257,30 @@ GUF_FN_KEYWORDS guf_str_view guf_str_next_tok(guf_str_view *input, const guf_str
|
||||
guf_str_view delim_candidate = guf_substr_view(prev_input, 0, delim_len);
|
||||
for (ptrdiff_t delim_i = 0; delim_i < num_delims; ++delim_i) {
|
||||
if (guf_str_view_equal(&delim_candidate, delims + delim_i)) { // Found delim.
|
||||
bool preserved = false;
|
||||
if (preserved_delims && num_preserved_delims > 0) {
|
||||
for (ptrdiff_t preserved_i = 0; preserved_i < num_preserved_delims; ++preserved_i) {
|
||||
if (guf_str_view_equal(&delim_candidate, preserved_delims + preserved_i)) {
|
||||
preserved = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!preserved) {
|
||||
input->len = prev_input.len - delim_len;
|
||||
input->str = prev_input.len > 0 ? prev_input.str + delim_len : NULL;
|
||||
if (preserved_delims && num_preserved_delims > 0) {
|
||||
GUF_ASSERT(input->len >= 0);
|
||||
} else {
|
||||
input->str -= num_bytes;
|
||||
input->len += num_bytes;
|
||||
}
|
||||
|
||||
if (tok.len == 0) {
|
||||
if (preserved) {
|
||||
input->str += num_bytes;
|
||||
input->len -= num_bytes;
|
||||
return delim_candidate;
|
||||
}
|
||||
if (tok.len == 0) {
|
||||
tok.str = input->str;
|
||||
goto end;
|
||||
} else {
|
||||
|
||||
@ -44,6 +44,24 @@ Kæmi ný öxi hér ykist þjófum nú bæði víl og ádrepa.
|
||||
|
||||
Pijamalı hasta, yağız şoföre çabucak güvendi.
|
||||
|
||||
Albert osti fagotin ja töräytti puhkuvan melodian.
|
||||
|
||||
דג סקרן שט בים מאוכזב ולפתע מצא חברה
|
||||
|
||||
نص حكيم له سر قاطع وذو شأن عظيم مكتوب على ثوب أخضر ومغلف بجلد أزرق
|
||||
|
||||
بر اثر چنین تلقین و شستشوی مغزی جامعی، سطح و پایهٔ ذهن و فهم و نظر بعضی اشخاص واژگونه و معکوس میشود
|
||||
|
||||
키스의 고유조건은 입술끼리 만나야 하고 특별한 기술은 필요치 않다.
|
||||
|
||||
いろはにほへとちりぬるを
|
||||
わかよたれそつねならむ
|
||||
うゐのおくやまけふこえて
|
||||
あさきゆめみしゑひもせす
|
||||
|
||||
イロハニホヘト チリヌルヲ ワカヨタレソ ツネナラム
|
||||
ウヰノオクヤマ ケフコエテ アサキユメミシ ヱヒモセスン
|
||||
|
||||
ᚠᛇᚻ᛫ᛒᛦᚦ᛫ᚠᚱᚩᚠᚢᚱ᛫ᚠᛁᚱᚪ᛫ᚷᛖᚻᚹᛦᛚᚳᚢᛗ
|
||||
ᛋᚳᛖᚪᛚ᛫ᚦᛖᚪᚻ᛫ᛗᚪᚾᚾᚪ᛫ᚷᛖᚻᚹᛦᛚᚳ᛫ᛗᛁᚳᛚᚢᚾ᛫ᚻᛦᛏ᛫ᛞᚫᛚᚪᚾ
|
||||
ᚷᛁᚠ᛫ᚻᛖ᛫ᚹᛁᛚᛖ᛫ᚠᚩᚱ᛫ᛞᚱᛁᚻᛏᚾᛖ᛫ᛞᚩᛗᛖᛋ᛫ᚻᛚᛇᛏᚪᚾ᛬
|
||||
@ -9,6 +9,7 @@ extern "C" {
|
||||
|
||||
#include "test_dbuf.hpp"
|
||||
#include "test_dict.hpp"
|
||||
#include "test_utf8.hpp"
|
||||
|
||||
std::unordered_set<std::unique_ptr<Test>> g_tests {};
|
||||
|
||||
@ -25,6 +26,10 @@ void init_tests()
|
||||
test = std::make_unique<DictCstrToIntTest>("DictCstrToIntTest");
|
||||
GUF_ASSERT_RELEASE(test.get());
|
||||
g_tests.insert(std::move(test));
|
||||
|
||||
test = std::make_unique<UTF8Test>("UTF8Test");
|
||||
GUF_ASSERT_RELEASE(test.get());
|
||||
g_tests.insert(std::move(test));
|
||||
}
|
||||
|
||||
int main()
|
||||
|
||||
@ -39,26 +39,10 @@ struct DictCstrToIntTest : public Test
|
||||
guf_str_view tok;
|
||||
while ((tok = guf_str_next_tok(&input_str, delims.data, delims.size, NULL, -1)).len) {
|
||||
// printf("tok_len: %td ", tok.len);
|
||||
printf("'%.*s'\n", (int)tok.len, tok.str);
|
||||
// printf("'%.*s'\n", (int)tok.len, tok.str);
|
||||
}
|
||||
dbuf_str_view_free(&delims, NULL);
|
||||
|
||||
// ptrdiff_t valid_chars = 0, invalid_chars = 0, bytes = 0;
|
||||
// guf_utf8_char ch = {};
|
||||
// for (guf_utf8_stat stat = guf_utf8_char_next(&ch, &input_str); stat != GUF_UTF8_READ_DONE; stat = guf_utf8_char_next(&ch, &input_str)) {
|
||||
// if (stat == GUF_UTF8_READ_VALID) {
|
||||
// ++valid_chars;
|
||||
// printf("%s", ch.bytes);
|
||||
// } else {
|
||||
// ++invalid_chars;
|
||||
// printf("::INVALID_UTF8_CHAR::");
|
||||
// }
|
||||
// bytes += guf_utf8_char_num_bytes(&ch);
|
||||
// }
|
||||
// TEST_CHECK(input_str.len == 0 && input_str.str == NULL);
|
||||
// printf("\nread %td bytes\n", bytes);
|
||||
// printf("read %td valid and %td invalid utf-8 characters\n", valid_chars, invalid_chars);
|
||||
|
||||
dict_cstr_int_free(&word_cnt_dict, NULL);
|
||||
bool dbuf_null = !word_cnt_dict.kv_elems.data && !word_cnt_dict.kv_elems.allocator && !word_cnt_dict.kv_elems.capacity && !word_cnt_dict.kv_elems.size;
|
||||
TEST_CHECK(!dbuf_null && !word_cnt_dict.kv_indices && !word_cnt_dict.kv_indices_cap && !word_cnt_dict.max_probelen && !word_cnt_dict.num_tombstones);
|
||||
|
||||
156
src/test/test_utf8.hpp
Normal file
156
src/test/test_utf8.hpp
Normal file
@ -0,0 +1,156 @@
|
||||
#include <vector>
|
||||
#include "test.hpp"
|
||||
|
||||
extern "C"
|
||||
{
|
||||
#include "guf_alloc_libc.h"
|
||||
#include "guf_dict_impl.h"
|
||||
#include "guf_dbuf_impl.h"
|
||||
#include "guf_utf8.h"
|
||||
#include "guf_str.h"
|
||||
}
|
||||
|
||||
struct UTF8Test : public Test
|
||||
{
|
||||
|
||||
UTF8Test(const std::string& name) : Test(name) {};
|
||||
|
||||
private:
|
||||
dbuf_char text_buf {};
|
||||
std::vector<char> text_vec;
|
||||
|
||||
bool load_text(const char *fname)
|
||||
{
|
||||
FILE *in_file {nullptr};
|
||||
if (!in_file) {
|
||||
in_file = fopen(fname, "r");
|
||||
}
|
||||
|
||||
if (!in_file) {
|
||||
return false;
|
||||
}
|
||||
|
||||
dbuf_char_init(&text_buf, 128, &guf_allocator_libc);
|
||||
|
||||
int c = EOF;
|
||||
while ((c = fgetc(in_file)) != EOF) {
|
||||
dbuf_char_push_val(&text_buf, (char)c);
|
||||
text_vec.push_back((char)c);
|
||||
}
|
||||
fclose(in_file);
|
||||
|
||||
return TEST_CHECK(std::ssize(text_vec) == text_buf.size);
|
||||
}
|
||||
|
||||
void free_text()
|
||||
{
|
||||
dbuf_char_free(&text_buf, NULL);
|
||||
text_vec.clear();
|
||||
}
|
||||
|
||||
|
||||
void read_utf8_chars(const char *fname, ptrdiff_t *n_valid, ptrdiff_t *n_invalid)
|
||||
{
|
||||
GUF_ASSERT_RELEASE(load_text(fname));
|
||||
|
||||
ptrdiff_t valid_chars = 0, invalid_chars = 0, bytes = 0;
|
||||
guf_str_view input_str = {.str = text_buf.data, .len = text_buf.size};
|
||||
guf_utf8_char ch = {};
|
||||
for (guf_utf8_stat stat = guf_utf8_char_next(&ch, &input_str); stat != GUF_UTF8_READ_DONE; stat = guf_utf8_char_next(&ch, &input_str)) {
|
||||
if (stat == GUF_UTF8_READ_VALID) {
|
||||
++valid_chars;
|
||||
// printf("%s", ch.bytes);
|
||||
} else {
|
||||
++invalid_chars;
|
||||
// printf("::INVALID_UTF8_CHAR::");
|
||||
}
|
||||
bytes += guf_utf8_char_num_bytes(&ch);
|
||||
}
|
||||
TEST_CHECK(input_str.len == 0 && input_str.str == NULL);
|
||||
TEST_CHECK(bytes == text_buf.size);
|
||||
|
||||
// printf("\nread %td bytes\n", bytes);
|
||||
// printf("read %td valid and %td invalid utf-8 characters\n", valid_chars, invalid_chars);
|
||||
|
||||
free_text();
|
||||
|
||||
if (n_valid)
|
||||
*n_valid = valid_chars;
|
||||
if (n_invalid)
|
||||
*n_invalid = invalid_chars;
|
||||
}
|
||||
|
||||
int count_words(const char *fname, const dbuf_str_view *delims)
|
||||
{
|
||||
GUF_ASSERT_RELEASE(load_text(fname));
|
||||
|
||||
int num_words = 0;
|
||||
|
||||
guf_str_view input_str = {.str = text_buf.data, .len = text_buf.size};
|
||||
guf_str_view tok;
|
||||
while ((tok = guf_str_next_tok(&input_str, delims->data, delims->size, NULL, -1)).len) {
|
||||
// printf("tok_len: %td ", tok.len);
|
||||
// printf("'%.*s'\n", (int)tok.len, tok.str);
|
||||
++num_words;
|
||||
}
|
||||
|
||||
free_text();
|
||||
return num_words;
|
||||
}
|
||||
|
||||
int count_words_with_delims(const char *fname, const dbuf_str_view *delims)
|
||||
{
|
||||
GUF_ASSERT_RELEASE(load_text(fname));
|
||||
|
||||
int num_words = 0;
|
||||
guf_str_view input_str = {.str = text_buf.data, .len = text_buf.size};
|
||||
guf_str_view tok;
|
||||
while ((tok = guf_str_next_tok(&input_str, delims->data, delims->size, delims->data, delims->size)).len) {
|
||||
// if (tok.str[0] == '\n') {
|
||||
// printf("'\\n'\n");
|
||||
// } else {
|
||||
// printf("'%.*s'\n", (int)tok.len, tok.str);
|
||||
// }
|
||||
++num_words;
|
||||
}
|
||||
free_text();
|
||||
return num_words;
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
bool run()
|
||||
{
|
||||
if (done) {
|
||||
return passed;
|
||||
}
|
||||
|
||||
ptrdiff_t valid = 0, invalid = 0;
|
||||
read_utf8_chars(TEST_DATA_DIR "/" "utf8-test.txt", &valid, &invalid);
|
||||
TEST_CHECK(valid == 2634 && invalid == 0);
|
||||
|
||||
dbuf_str_view delims = dbuf_str_view_new(&guf_allocator_libc);
|
||||
for (size_t i = 0; i < GUF_STATIC_BUF_SIZE(guf_utf8_whitespace); ++i) {
|
||||
guf_str_view d = {.len = (ptrdiff_t)strlen(guf_utf8_whitespace[i]), .str = guf_utf8_whitespace[i]};
|
||||
dbuf_str_view_push_val(&delims, d);
|
||||
}
|
||||
for (size_t i = 0; i < GUF_STATIC_BUF_SIZE(guf_utf8_punctuation); ++i) {
|
||||
guf_str_view d = {.len = (ptrdiff_t)strlen(guf_utf8_punctuation[i]), .str = guf_utf8_punctuation[i]};
|
||||
dbuf_str_view_push_val(&delims, d);
|
||||
}
|
||||
|
||||
int words = count_words(TEST_DATA_DIR "/" "utf8-test.txt", &delims);
|
||||
printf("words %d\n", words);
|
||||
TEST_CHECK(words == 422);
|
||||
|
||||
int words_with_delims = count_words_with_delims(TEST_DATA_DIR "/" "utf8-test.txt", &delims);
|
||||
TEST_CHECK(words_with_delims == 947);
|
||||
|
||||
dbuf_str_view_free(&delims, NULL);
|
||||
|
||||
done = true;
|
||||
passed = (num_failed_checks == 0);
|
||||
return passed;
|
||||
}
|
||||
|
||||
};
|
||||
Loading…
x
Reference in New Issue
Block a user