Re-implement guf_str tokeniser
This commit is contained in:
parent
c6b0aa8d89
commit
1013616b2d
248
src/guf_str.h
248
src/guf_str.h
@ -50,6 +50,20 @@ typedef struct guf_str {
|
|||||||
guf_allocator *allocator; // Wasteful (8 bytes on 64-bit platforms...), but keeping this pointer also allows us to have "read-only strings" (a string is read-only if allocator == NULL)
|
guf_allocator *allocator; // Wasteful (8 bytes on 64-bit platforms...), but keeping this pointer also allows us to have "read-only strings" (a string is read-only if allocator == NULL)
|
||||||
} guf_str; // Total: 32 bytes on 64-bit platforms, 16 bytes on 32-bit platforms.
|
} guf_str; // Total: 32 bytes on 64-bit platforms, 16 bytes on 32-bit platforms.
|
||||||
|
|
||||||
|
typedef enum guf_str_tok_delim_opt {
|
||||||
|
GUF_STR_TOK_DELIM_OPT_MATCH_LONGEST = 0,
|
||||||
|
GUF_STR_TOK_DELIM_OPT_MATCH_SHORTEST,
|
||||||
|
GUF_STR_TOK_DELIM_OPT_MATCH_IN_ORDER,
|
||||||
|
} guf_str_tok_delim_opt;
|
||||||
|
|
||||||
|
typedef struct guf_str_tok_state {
|
||||||
|
guf_str_view input;
|
||||||
|
guf_str_view cur_tok, cur_delim;
|
||||||
|
const guf_str_view *delims;
|
||||||
|
const ptrdiff_t delim_count;
|
||||||
|
ptrdiff_t num_toks_read, num_delims_read;
|
||||||
|
bool done;
|
||||||
|
} guf_str_tok_state;
|
||||||
|
|
||||||
#define GUF_CSTR_TO_VIEW(CSTR) ((guf_str_view){.str = (CSTR), .len = (ptrdiff_t)strlen((CSTR))})
|
#define GUF_CSTR_TO_VIEW(CSTR) ((guf_str_view){.str = (CSTR), .len = (ptrdiff_t)strlen((CSTR))})
|
||||||
#define GUF_CSTR_LIT_TO_VIEW(CSTR) ((guf_str_view){.str = (CSTR), .len = (ptrdiff_t)sizeof((CSTR)) - 1})
|
#define GUF_CSTR_LIT_TO_VIEW(CSTR) ((guf_str_view){.str = (CSTR), .len = (ptrdiff_t)sizeof((CSTR)) - 1})
|
||||||
@ -88,8 +102,29 @@ GUF_STR_KWRDS guf_str_view guf_str_view_trim_right_ascii(guf_str_view sv);
|
|||||||
// Return true if sv does not violate any of its invariants (.len must be >= 0, .str must not be NULL unless len is 0)
|
// Return true if sv does not violate any of its invariants (.len must be >= 0, .str must not be NULL unless len is 0)
|
||||||
GUF_STR_KWRDS bool guf_str_view_is_valid(guf_str_view sv);
|
GUF_STR_KWRDS bool guf_str_view_is_valid(guf_str_view sv);
|
||||||
|
|
||||||
// Return the guf_str_view corresponding to the next token. Delimiters (each of which can be more than one character) are given by delims; preserved_delims are delimiters which are returned as tokens when encountered.
|
/*
|
||||||
GUF_STR_KWRDS guf_str_view guf_str_next_tok(guf_str_view *input, const guf_str_view *delims, ptrdiff_t num_delims, const guf_str_view *preserved_delims, ptrdiff_t num_preserved_delims);
|
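Return the substring of src up to (but not including) the first occurrence of the delimiter "delim" and advance src to one past the delim (so the function can be called repeatedly). If delim is empty or does not occur in src, the whole of src is returned and src becomes empty (.str = NULL, .len = 0).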
|
||||||
|
cf. "str_pop_first_split":
|
||||||
|
- https://accu.org/conf-docs/PDFs_2021/luca_sass_modern_c_and_what_we_can_learn_from_it.pdf ("String handling in Modern C", page 128 of the pdf)
|
||||||
|
- https://youtu.be/QpAhX-gsHMs?si=lCvm6o60LrYHaAHc&t=3059 (last-retrieved 2025-04-30)
|
||||||
|
*/
|
||||||
|
GUF_STR_KWRDS guf_str_view guf_str_view_pop_split(guf_str_view *src, guf_str_view delim);
|
||||||
|
|
||||||
|
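// Create a new tokeniser-state for guf_str_tok_next. Note: for MATCH_LONGEST/MATCH_SHORTEST the delims array is sorted in place by length (via qsort), and the state keeps a pointer to delims rather than a copy.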
|
||||||
|
GUF_STR_KWRDS guf_str_tok_state guf_str_tok_state_new(guf_str_view str, guf_str_view *delims, ptrdiff_t delim_count, guf_str_tok_delim_opt delim_match_opt);
|
||||||
|
/*
|
||||||
|
Return true when the next token (or delimiter if preserve_delims == true) was encountered.
|
||||||
|
Put the current token into state->cur_tok.
|
||||||
|
If preserve_delims is true, every delimiter will be put into state->cur_delim.
|
||||||
|
If preserve_delims is false, delimiters will only be put into state->cur_delim if the current token is not empty
|
||||||
|
(i.e. for preserve_delims == false, a delimiter that is not preceded by a non-empty token is skipped, so empty tokens are never returned).
|
||||||
|
|
||||||
|
Example: delims = ["-", "+"]
|
||||||
|
- "-1+2": tok_next(preserve_delims=false) will set state->cur_tok = "1" and state->cur_delim = "+"
|
||||||
|
- "-1+2": tok_next(preserve_delims=true) will set state->cur_tok = "" and state->cur_delim = "-"
|
||||||
|
Set preserve_delims to false if you don't care about processing the delimiters
|
||||||
|
*/
|
||||||
|
GUF_STR_KWRDS bool guf_str_tok_next(guf_str_tok_state *state, bool preserve_delims);
|
||||||
|
|
||||||
|
|
||||||
// 2.) guf_str:
|
// 2.) guf_str:
|
||||||
@ -1178,76 +1213,171 @@ GUF_STR_KWRDS bool guf_str_view_is_valid(guf_str_view sv)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
GUF_STR_KWRDS guf_str_view guf_str_next_tok(guf_str_view *input, const guf_str_view *delims, ptrdiff_t num_delims, const guf_str_view *preserved_delims, ptrdiff_t num_preserved_delims)
|
/*
|
||||||
|
cf. "str_pop_first_split":
|
||||||
|
- https://accu.org/conf-docs/PDFs_2021/luca_sass_modern_c_and_what_we_can_learn_from_it.pdf ("String handling in Modern C", page 128 of the pdf)
|
||||||
|
- https://youtu.be/QpAhX-gsHMs?si=lCvm6o60LrYHaAHc&t=3059 (last-retrieved 2025-04-30)
|
||||||
|
*/
|
||||||
|
|
||||||
|
GUF_STR_KWRDS guf_str_view guf_str_view_pop_split(guf_str_view *src, guf_str_view delim)
|
||||||
{
|
{
|
||||||
if (input->len <= 0 || input->str == NULL) {
|
GUF_ASSERT(src);
|
||||||
return (guf_str_view){.str = NULL, .len = 0};
|
GUF_ASSERT_RELEASE(guf_str_view_is_valid(*src));
|
||||||
|
GUF_ASSERT_RELEASE(guf_str_view_is_valid(delim));
|
||||||
|
|
||||||
|
if (delim.len <= 0) {
|
||||||
|
goto delim_not_found;
|
||||||
}
|
}
|
||||||
|
|
||||||
ptrdiff_t max_delim_len = -1;
|
for (ptrdiff_t src_idx = 0; src_idx < src->len; ++src_idx) {
|
||||||
for (ptrdiff_t i = 0; i < num_delims; ++i) {
|
ptrdiff_t num_matched = 0;
|
||||||
if (delims[i].len > max_delim_len) {
|
for (ptrdiff_t delim_idx = 0; delim_idx < delim.len && (src_idx + delim.len <= src->len); ++delim_idx) {
|
||||||
max_delim_len = delims[i].len;
|
if (delim.str[delim_idx] != src->str[src_idx + delim_idx]) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
++num_matched;
|
||||||
|
}
|
||||||
|
if (num_matched == delim.len) { // Delimiter found in interval [src_idx, src_idx + delim.len)
|
||||||
|
const guf_str_view popped = guf_str_view_substr(*src, 0, src_idx);
|
||||||
|
const ptrdiff_t advance_len = popped.len + delim.len;
|
||||||
|
GUF_ASSERT(advance_len > 0 && advance_len >= delim.len);
|
||||||
|
src->len -= advance_len;
|
||||||
|
GUF_ASSERT(src->len >= 0);
|
||||||
|
src->str = src->len > 0 ? src->str + advance_len : NULL;
|
||||||
|
return popped;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
guf_str_view tok = {.str = input->str, .len = 0};
|
delim_not_found:;
|
||||||
guf_str_view prev_input = *input;
|
const guf_str_view popped = *src;
|
||||||
guf_utf8_char ch = {0};
|
src->str = NULL;
|
||||||
|
src->len = 0;
|
||||||
|
return popped;
|
||||||
|
}
|
||||||
|
|
||||||
for (guf_utf8_stat stat = guf_utf8_char_next(&ch, input); stat != GUF_UTF8_READ_DONE; stat = guf_utf8_char_next(&ch, input)) {
|
static inline int guf_str_view_cmp_asc_by_len_(const void *a, const void *b)
|
||||||
if (stat != GUF_UTF8_READ_VALID) {
|
{
|
||||||
prev_input = *input;
|
const guf_str_view *asv = (const guf_str_view*)a;
|
||||||
continue;
|
const guf_str_view *bsv = (const guf_str_view*)b;
|
||||||
|
if (asv->len < bsv->len) {
|
||||||
|
return -1;
|
||||||
|
} else if (asv->len > bsv->len) {
|
||||||
|
return 1;
|
||||||
|
} else {
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const int num_bytes = guf_utf8_char_num_bytes(&ch);
|
static inline int guf_str_view_cmp_desc_by_len_(const void *a, const void *b)
|
||||||
|
{
|
||||||
|
return -guf_str_view_cmp_asc_by_len_(a, b);
|
||||||
|
}
|
||||||
|
|
||||||
for (ptrdiff_t delim_len = GUF_MIN(max_delim_len, prev_input.len); delim_len > 0; --delim_len) {
|
GUF_STR_KWRDS guf_str_tok_state guf_str_tok_state_new(guf_str_view str, guf_str_view *delims, ptrdiff_t delim_count, guf_str_tok_delim_opt delim_match_opt)
|
||||||
guf_str_view delim_candidate = guf_str_view_substr(prev_input, 0, delim_len);
|
{
|
||||||
for (ptrdiff_t delim_i = 0; delim_i < num_delims; ++delim_i) {
|
GUF_ASSERT_RELEASE(guf_str_view_is_valid(str));
|
||||||
if (guf_str_view_equal(&delim_candidate, delims + delim_i)) { // Found delim.
|
GUF_ASSERT_RELEASE(delim_count > 0 ? delims != NULL : true);
|
||||||
bool preserved = false;
|
|
||||||
if (preserved_delims && num_preserved_delims > 0) {
|
ptrdiff_t max_delim_len = 0;
|
||||||
for (ptrdiff_t preserved_i = 0; preserved_i < num_preserved_delims; ++preserved_i) {
|
if (delims && delim_count > 0) {
|
||||||
if (guf_str_view_equal(&delim_candidate, preserved_delims + preserved_i)) {
|
for (ptrdiff_t i = 0; i < delim_count; ++i) {
|
||||||
preserved = true;
|
GUF_ASSERT_RELEASE(guf_str_view_is_valid(delims[i]));
|
||||||
|
max_delim_len = guf_max_ptrdiff_t(max_delim_len, delims[i].len);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (max_delim_len <= 0 || delim_count <= 0 || delims == NULL) {
|
||||||
|
delims = NULL;
|
||||||
|
delim_count = 0;
|
||||||
|
} else {
|
||||||
|
switch (delim_match_opt) {
|
||||||
|
case GUF_STR_TOK_DELIM_OPT_MATCH_LONGEST:
|
||||||
|
qsort(delims, delim_count, sizeof(delims[0]), guf_str_view_cmp_desc_by_len_);
|
||||||
|
break;
|
||||||
|
case GUF_STR_TOK_DELIM_OPT_MATCH_SHORTEST:
|
||||||
|
qsort(delims, delim_count, sizeof(delims[0]), guf_str_view_cmp_asc_by_len_);
|
||||||
|
break;
|
||||||
|
case GUF_STR_TOK_DELIM_OPT_MATCH_IN_ORDER:
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
GUF_ASSERT(false);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
return (guf_str_tok_state) {
|
||||||
if (!preserved) {
|
.input = str,
|
||||||
input->len = prev_input.len - delim_len;
|
.delims = delims,
|
||||||
input->str = prev_input.len > 0 ? prev_input.str + delim_len : NULL;
|
.delim_count = delim_count,
|
||||||
GUF_ASSERT(input->len >= 0);
|
.num_toks_read = 0,
|
||||||
} else {
|
.num_delims_read = 0,
|
||||||
input->str -= num_bytes;
|
.cur_tok = (guf_str_view){.len = 0, .str = NULL},
|
||||||
input->len += num_bytes;
|
.cur_delim = (guf_str_view){.len = 0, .str = NULL},
|
||||||
}
|
.done = false
|
||||||
|
};
|
||||||
if (tok.len == 0) {
|
|
||||||
if (preserved) {
|
|
||||||
input->str += num_bytes;
|
|
||||||
input->len -= num_bytes;
|
|
||||||
return delim_candidate;
|
|
||||||
}
|
|
||||||
tok.str = input->str;
|
|
||||||
goto end;
|
|
||||||
} else {
|
|
||||||
return tok;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
tok.len += num_bytes;
|
|
||||||
|
|
||||||
end:;
|
|
||||||
prev_input = *input;
|
|
||||||
}
|
|
||||||
|
|
||||||
return tok;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
GUF_STR_KWRDS bool guf_str_tok_next(guf_str_tok_state *state, bool preserve_delims)
|
||||||
|
{
|
||||||
|
GUF_ASSERT(state);
|
||||||
|
GUF_ASSERT(guf_str_view_is_valid(state->input));
|
||||||
|
GUF_ASSERT(state->num_toks_read >= 0 && state->num_delims_read >= 0 && state->delim_count >= 0);
|
||||||
|
GUF_ASSERT(state->delim_count > 0 ? state->delims != NULL : true);
|
||||||
|
|
||||||
|
if (state->done || state->input.len <= 0 || state->input.str == NULL) {
|
||||||
|
state->done = true;
|
||||||
|
state->cur_tok = (guf_str_view){.str = NULL, .len = 0};
|
||||||
|
state->cur_delim = (guf_str_view){.str = NULL, .len = 0};
|
||||||
|
return false;
|
||||||
|
} else if (state->delim_count <= 0 || state->delims == NULL) {
|
||||||
|
state->done = true;
|
||||||
|
state->cur_tok = state->input;
|
||||||
|
state->cur_delim = (guf_str_view){.str = NULL, .len = 0};
|
||||||
|
return state->cur_tok.len > 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
find_next_delim_begin:
|
||||||
|
state->cur_tok = state->cur_delim = (guf_str_view){.str = state->input.str, .len = 0};
|
||||||
|
while (state->input.len > 0) {
|
||||||
|
GUF_ASSERT(state->input.str != NULL);
|
||||||
|
for (ptrdiff_t delim_idx = 0; delim_idx < state->delim_count; ++delim_idx) { // If state->delims is sorted descending/ascending by length, match the longest/shortest possible delim
|
||||||
|
const guf_str_view delim = state->delims[delim_idx];
|
||||||
|
GUF_ASSERT(guf_str_view_is_valid(delim));
|
||||||
|
if (delim.len > state->input.len || delim.len <= 0) { // Current delim cannot possibly match.
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
const guf_str_view delim_candidate = guf_str_view_substr(state->input, 0, delim.len);
|
||||||
|
if (guf_str_view_equal(&delim_candidate, &delim)) { // a) Matched the current delim:
|
||||||
|
GUF_ASSERT(state->input.len >= delim.len);
|
||||||
|
GUF_ASSERT(state->cur_tok.len >= 0);
|
||||||
|
state->cur_delim = delim;
|
||||||
|
state->num_delims_read += 1;
|
||||||
|
|
||||||
|
state->input.len -= delim.len;
|
||||||
|
state->input.str = state->input.len > 0 ? state->input.str + delim.len : NULL;
|
||||||
|
|
||||||
|
if (!preserve_delims && state->cur_tok.len == 0) {
|
||||||
|
goto find_next_delim_begin;
|
||||||
|
}
|
||||||
|
|
||||||
|
state->num_toks_read += state->cur_tok.len > 0 ? 1 : 0;
|
||||||
|
state->done = state->input.len <= 0;
|
||||||
|
GUF_ASSERT(state->cur_tok.len > 0 || state->cur_delim.len > 0);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// b) Could not match any of the delims:
|
||||||
|
state->cur_tok.len += 1;
|
||||||
|
state->input.len -= 1;
|
||||||
|
state->input.str = state->input.len > 0 ? state->input.str + 1 : NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
state->done = true;
|
||||||
|
GUF_ASSERT(state->cur_tok.len >= 0);
|
||||||
|
state->cur_delim = (guf_str_view){.str = NULL, .len = 0};
|
||||||
|
if (state->cur_tok.len > 0) {
|
||||||
|
state->num_toks_read += 1;
|
||||||
|
}
|
||||||
|
return state->cur_tok.len > 0;
|
||||||
|
}
|
||||||
|
|
||||||
GUF_STR_KWRDS guf_str_view guf_str_view_trim_left_ascii(guf_str_view sv)
|
GUF_STR_KWRDS guf_str_view guf_str_view_trim_left_ascii(guf_str_view sv)
|
||||||
{
|
{
|
||||||
@ -1294,7 +1424,9 @@ GUF_STR_KWRDS guf_str_view guf_str_view_substr(guf_str_view str, ptrdiff_t pos,
|
|||||||
GUF_ASSERT(substr_len >= 0);
|
GUF_ASSERT(substr_len >= 0);
|
||||||
GUF_ASSERT(substr_len <= str.len);
|
GUF_ASSERT(substr_len <= str.len);
|
||||||
|
|
||||||
return (guf_str_view){.str = str.str + pos, .len = substr_len};
|
const guf_str_view sub_sv = {.str = str.str + pos, .len = substr_len};
|
||||||
|
GUF_ASSERT(guf_str_view_is_valid(sub_sv));
|
||||||
|
return sub_sv;
|
||||||
}
|
}
|
||||||
|
|
||||||
GUF_STR_KWRDS guf_hash_size_t guf_str_view_hash(const guf_str_view *sv)
|
GUF_STR_KWRDS guf_hash_size_t guf_str_view_hash(const guf_str_view *sv)
|
||||||
|
|||||||
@ -38,13 +38,14 @@ private:
|
|||||||
dbuf_str_view_push_val(&delims, d);
|
dbuf_str_view_push_val(&delims, d);
|
||||||
}
|
}
|
||||||
|
|
||||||
guf_str_view input_str = {.str = text_buf.data, .len = text_buf.size};
|
guf_str_tok_state tok_state = guf_str_tok_state_new(guf_str_view{.str = text_buf.data, .len = text_buf.size}, delims.data, delims.size, GUF_STR_TOK_DELIM_OPT_MATCH_LONGEST);
|
||||||
guf_str_view tok;
|
while (guf_str_tok_next(&tok_state, true)) {
|
||||||
while ((tok = guf_str_next_tok(&input_str, delims.data, delims.size, NULL, -1)).len) {
|
guf_str_view tok = tok_state.cur_tok;
|
||||||
if (tok.len <= 0) {
|
// if (tok.len <= 0) {
|
||||||
continue;
|
// continue;
|
||||||
}
|
// }
|
||||||
std::string_view sv(tok.str, tok.len);
|
std::string_view sv(tok.str , tok.len);
|
||||||
|
//std::cout << sv << std::string_view(tok_state.cur_delim.str, tok_state.cur_delim.len);
|
||||||
TEST_CHECK(dict_sv_i32_contains(&word_cnt_dict, &tok) == word_cnt_map.contains(sv));
|
TEST_CHECK(dict_sv_i32_contains(&word_cnt_dict, &tok) == word_cnt_map.contains(sv));
|
||||||
if (!dict_sv_i32_contains(&word_cnt_dict, &tok)) {
|
if (!dict_sv_i32_contains(&word_cnt_dict, &tok)) {
|
||||||
dict_sv_i32_insert_val_arg(&word_cnt_dict, tok, 1, GUF_CPY_VALUE, GUF_CPY_VALUE);
|
dict_sv_i32_insert_val_arg(&word_cnt_dict, tok, 1, GUF_CPY_VALUE, GUF_CPY_VALUE);
|
||||||
|
|||||||
@ -85,11 +85,9 @@ private:
|
|||||||
|
|
||||||
int num_words = 0;
|
int num_words = 0;
|
||||||
|
|
||||||
guf_str_view input_str = {.str = text_buf.data, .len = text_buf.size};
|
guf_str_tok_state tok_state = guf_str_tok_state_new(guf_str_view{.str = text_buf.data, .len = text_buf.size}, delims->data, delims->size, GUF_STR_TOK_DELIM_OPT_MATCH_LONGEST);
|
||||||
guf_str_view tok;
|
while (guf_str_tok_next(&tok_state, false)) {
|
||||||
while ((tok = guf_str_next_tok(&input_str, delims->data, delims->size, NULL, -1)).len) {
|
TEST_CHECK(tok_state.cur_tok.len > 0);
|
||||||
// printf("tok_len: %td ", tok.len);
|
|
||||||
// printf("'%.*s'\n", (int)tok.len, tok.str);
|
|
||||||
++num_words;
|
++num_words;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -101,19 +99,23 @@ private:
|
|||||||
{
|
{
|
||||||
GUF_ASSERT_RELEASE(load_text(fname));
|
GUF_ASSERT_RELEASE(load_text(fname));
|
||||||
|
|
||||||
int num_words = 0;
|
int num_words = 0, num_delims = 0;
|
||||||
guf_str_view input_str = {.str = text_buf.data, .len = text_buf.size};
|
guf_str_tok_state tok_state = guf_str_tok_state_new(guf_str_view{.str = text_buf.data, .len = text_buf.size}, delims->data, delims->size, GUF_STR_TOK_DELIM_OPT_MATCH_LONGEST);
|
||||||
guf_str_view tok;
|
while (guf_str_tok_next(&tok_state, true)) {
|
||||||
while ((tok = guf_str_next_tok(&input_str, delims->data, delims->size, delims->data, delims->size)).len) {
|
if (tok_state.cur_tok.len) {
|
||||||
// if (tok.str[0] == '\n') {
|
|
||||||
// printf("'\\n'\n");
|
|
||||||
// } else {
|
|
||||||
// printf("'%.*s'\n", (int)tok.len, tok.str);
|
|
||||||
// }
|
|
||||||
++num_words;
|
++num_words;
|
||||||
|
// printf("'%.*s'\n", (int)tok_state.cur_tok.len, tok_state.cur_tok.str);
|
||||||
|
}
|
||||||
|
if (tok_state.cur_delim.len) {
|
||||||
|
++num_delims;
|
||||||
|
// if (tok_state.cur_delim.str[0] == '\n')
|
||||||
|
// printf("'\\n'\n");
|
||||||
|
// else
|
||||||
|
// printf("'%.*s'\n", (int)tok_state.cur_delim.len, tok_state.cur_delim.str);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
free_text();
|
free_text();
|
||||||
return num_words;
|
return num_words + num_delims;
|
||||||
}
|
}
|
||||||
|
|
||||||
void encode_decode_file(const char *fname)
|
void encode_decode_file(const char *fname)
|
||||||
@ -360,7 +362,7 @@ public:
|
|||||||
|
|
||||||
ptrdiff_t valid = 0, invalid = 0;
|
ptrdiff_t valid = 0, invalid = 0;
|
||||||
read_utf8_chars(TEST_DATA_DIR "/" "utf8-test.txt", &valid, &invalid);
|
read_utf8_chars(TEST_DATA_DIR "/" "utf8-test.txt", &valid, &invalid);
|
||||||
TEST_CHECK(valid == 2634 && invalid == 0);
|
TEST_CHECK(valid == 2635 && invalid == 0);
|
||||||
|
|
||||||
read_utf8_chars(TEST_DATA_DIR "/" "bartleby.txt", &valid, &invalid);
|
read_utf8_chars(TEST_DATA_DIR "/" "bartleby.txt", &valid, &invalid);
|
||||||
TEST_CHECK(valid > 16000 && invalid == 0);
|
TEST_CHECK(valid > 16000 && invalid == 0);
|
||||||
@ -377,7 +379,7 @@ public:
|
|||||||
int words = count_words(TEST_DATA_DIR "/" "utf8-test.txt", &delims);
|
int words = count_words(TEST_DATA_DIR "/" "utf8-test.txt", &delims);
|
||||||
TEST_CHECK(words == 422);
|
TEST_CHECK(words == 422);
|
||||||
int words_with_delims = count_words_with_delims(TEST_DATA_DIR "/" "utf8-test.txt", &delims);
|
int words_with_delims = count_words_with_delims(TEST_DATA_DIR "/" "utf8-test.txt", &delims);
|
||||||
TEST_CHECK(words_with_delims == 949);
|
TEST_CHECK(words_with_delims == 950);
|
||||||
|
|
||||||
int words2 = count_words(TEST_DATA_DIR "/" "bartleby.txt", &delims);
|
int words2 = count_words(TEST_DATA_DIR "/" "bartleby.txt", &delims);
|
||||||
TEST_CHECK(words2 > 2048);
|
TEST_CHECK(words2 > 2048);
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user