Add tokeniser
This commit is contained in:
parent
d450cd8a45
commit
217622d816
@ -760,5 +760,3 @@ size_t guf_str_len_utf8(const guf_str *str)
|
|||||||
assert(n >= 1);
|
assert(n >= 1);
|
||||||
return n - 1;
|
return n - 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
// guf_str_tokenise (const guf_str *str, const char *delims, const char *preserved_delims, )
|
|
||||||
|
|||||||
146
src/guf_str.h
146
src/guf_str.h
@ -3,12 +3,11 @@
|
|||||||
#include "guf_common.h"
|
#include "guf_common.h"
|
||||||
#include "guf_alloc.h"
|
#include "guf_alloc.h"
|
||||||
|
|
||||||
// #define GUF_T char
|
#if defined(GUF_STATIC) || defined(GUF_IMPL_STATIC)
|
||||||
// #define GUF_CNT_NAME guf_dbuf_char
|
#define GUF_FN_KEYWORDS static
|
||||||
// #define GUF_T_IS_INTEGRAL_TYPE
|
#else
|
||||||
// #include "guf_dbuf.h"
|
#define GUF_FN_KEYWORDS
|
||||||
|
#endif
|
||||||
#define GUF_STR_ABORT_ON_ALLOC_FAILURE 1
|
|
||||||
|
|
||||||
typedef enum guf_str_state {
|
typedef enum guf_str_state {
|
||||||
GUF_STR_STATE_INIT = 0,
|
GUF_STR_STATE_INIT = 0,
|
||||||
@ -43,62 +42,101 @@ typedef struct guf_str_view {
|
|||||||
#define GUF_STR_TO_VIEW(GUF_STR_PTR) ((guf_str_view){.str = guf_str_const_cstr((GUF_STR_PTR)), .len = guf_str_len((GUF_STR_PTR))})
|
#define GUF_STR_TO_VIEW(GUF_STR_PTR) ((guf_str_view){.str = guf_str_const_cstr((GUF_STR_PTR)), .len = guf_str_len((GUF_STR_PTR))})
|
||||||
#define GUF_CSTR_TO_READONLY_STR(CSTR) ((guf_str){.state = GUF_STR_STATE_VIEW, .allocator = NULL, .data.heap.c_str = CSTR, .data.heap.len = strlen(CSTR), .data.heap.capacity = 0})
|
#define GUF_CSTR_TO_READONLY_STR(CSTR) ((guf_str){.state = GUF_STR_STATE_VIEW, .allocator = NULL, .data.heap.c_str = CSTR, .data.heap.len = strlen(CSTR), .data.heap.capacity = 0})
|
||||||
|
|
||||||
extern const guf_str GUF_STR_UNINITIALISED;
|
// Creation:
|
||||||
extern const guf_str GUF_STR_UNINITIALISED_FAILED_ALLOC;
|
GUF_FN_KEYWORDS guf_str *guf_str_init(guf_str *str, guf_str_view str_view);
|
||||||
|
GUF_FN_KEYWORDS guf_str *guf_str_init_from_cstr(guf_str *str, const char* c_str);
|
||||||
|
GUF_FN_KEYWORDS guf_str *guf_str_init_empty_with_capacity(guf_str *str, size_t capacity);
|
||||||
|
// guf_str_new functions return GUF_STR_UNINITIALISED or GUF_STR_UNINITIALISED_FAILED_ALLOC on failure (can be checked with guf_str_alloc_success)
|
||||||
|
GUF_FN_KEYWORDS guf_str guf_str_new(guf_str_view str_view);
|
||||||
|
GUF_FN_KEYWORDS guf_str guf_str_new_substr(guf_str_view str_view, ptrdiff_t pos, ptrdiff_t len);
|
||||||
|
|
||||||
|
GUF_FN_KEYWORDS guf_str guf_str_new_from_cstr(const char *c_str);
|
||||||
|
GUF_FN_KEYWORDS guf_str guf_str_new_empty_with_capacity(size_t capacity);
|
||||||
|
|
||||||
|
// Destruction:
|
||||||
|
GUF_FN_KEYWORDS void guf_str_free(guf_str *str);
|
||||||
|
|
||||||
|
// Modification:
|
||||||
|
GUF_FN_KEYWORDS guf_str *guf_str_append(guf_str *str, guf_str_view to_append);
|
||||||
|
GUF_FN_KEYWORDS guf_str *guf_str_append_cstr(guf_str *str, const char *cstr_to_append); // Not necessary
|
||||||
|
GUF_FN_KEYWORDS guf_str *guf_str_substr(guf_str* str, size_t pos, size_t count);
|
||||||
|
|
||||||
|
GUF_FN_KEYWORDS guf_str *guf_str_reserve(guf_str *str, size_t bufsize);
|
||||||
|
GUF_FN_KEYWORDS guf_str *guf_str_shrink_capacity(guf_str *str, size_t shrink_trigger_fac, bool shrink_exact);
|
||||||
|
|
||||||
|
GUF_FN_KEYWORDS char guf_str_pop_back(guf_str *str);
|
||||||
|
GUF_FN_KEYWORDS char guf_str_pop_front(guf_str *str);
|
||||||
|
|
||||||
|
// Copying and viewing:
|
||||||
|
GUF_FN_KEYWORDS guf_str guf_str_substr_cpy(guf_str_view str, size_t pos, size_t count); // not necessary
|
||||||
|
GUF_FN_KEYWORDS guf_str_view guf_substr_view(guf_str_view str, ptrdiff_t pos, ptrdiff_t count);
|
||||||
|
|
||||||
|
// Indexing:
|
||||||
|
GUF_FN_KEYWORDS char *guf_str_at(guf_str *str, size_t idx);
|
||||||
|
GUF_FN_KEYWORDS char *guf_str_back(guf_str *str);
|
||||||
|
GUF_FN_KEYWORDS char *guf_str_front(guf_str *str);
|
||||||
|
GUF_FN_KEYWORDS const char *guf_str_const_cstr(const guf_str *str);
|
||||||
|
|
||||||
|
// Metadata retrieval:
|
||||||
|
GUF_FN_KEYWORDS size_t guf_str_len(const guf_str *str); // The size (in chars) without the final zero-terminator (size - 1).
|
||||||
|
GUF_FN_KEYWORDS size_t guf_str_capacity(const guf_str *str);
|
||||||
|
GUF_FN_KEYWORDS bool guf_str_is_stack_allocated(const guf_str *str);
|
||||||
|
GUF_FN_KEYWORDS bool guf_str_is_valid(const guf_str *str);
|
||||||
|
GUF_FN_KEYWORDS bool guf_str_alloc_success(const guf_str *str);
|
||||||
|
|
||||||
|
// Comparison:
|
||||||
|
GUF_FN_KEYWORDS bool guf_str_view_equal(const guf_str_view* a, const guf_str_view* b);
|
||||||
|
GUF_FN_KEYWORDS bool guf_str_equal(const guf_str *a, const guf_str *b);
|
||||||
|
GUF_FN_KEYWORDS bool guf_str_equals_cstr(const guf_str *a, const char *c_str);
|
||||||
|
GUF_FN_KEYWORDS bool guf_str_equals_strview(const guf_str *a, guf_str_view b);
|
||||||
|
GUF_FN_KEYWORDS int guf_str_view_cmp(const void *str_view_a, const void *str_view_b); // For qsort etc.
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(GUF_IMPL) || defined(GUF_IMPL_STATIC)
|
||||||
|
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
#ifndef GUF_FN_KEYWORDS
|
||||||
|
#define GUF_FN_KEYWORDS
|
||||||
|
#endif
|
||||||
|
|
||||||
// TODO: find_first_of and tokenise -> for parsing, see aoclib.
|
// TODO: find_first_of and tokenise -> for parsing, see aoclib.
|
||||||
|
|
||||||
// Creation:
|
|
||||||
guf_str *guf_str_init(guf_str *str, guf_str_view str_view);
|
|
||||||
guf_str *guf_str_init_from_cstr(guf_str *str, const char* c_str);
|
|
||||||
guf_str *guf_str_init_empty_with_capacity(guf_str *str, size_t capacity);
|
|
||||||
// guf_str_new functions return GUF_DICT_UNINITIALISED or GUF_STR_UNINITIALISED_FAILED_ALLOC on failure (can be checked with guf_str_alloc_success)
|
|
||||||
guf_str guf_str_new(guf_str_view str_view);
|
|
||||||
guf_str guf_str_new_substr(guf_str_view str_view, ptrdiff_t pos, ptrdiff_t len);
|
|
||||||
|
|
||||||
guf_str guf_str_new_from_cstr(const char *c_str);
|
GUF_FN_KEYWORDS guf_str_view guf_substr_view(guf_str_view str, ptrdiff_t pos, ptrdiff_t count)
|
||||||
guf_str guf_str_new_empty_with_capacity(size_t capacity);
|
{
|
||||||
|
GUF_ASSERT(str.str);
|
||||||
|
GUF_ASSERT(pos >= 0);
|
||||||
|
GUF_ASSERT(count >= 0);
|
||||||
|
|
||||||
// Destruction:
|
if (str.len == 0 || count == 0 || pos >= str.len || str.str == NULL) {
|
||||||
void guf_str_free(guf_str *str);
|
return (guf_str_view){.str = str.str, .len = 0};
|
||||||
|
}
|
||||||
|
|
||||||
// Modification:
|
const ptrdiff_t substr_len = pos + count > str.len ? str.len - pos : count;
|
||||||
guf_str *guf_str_append(guf_str *str, guf_str_view to_append);
|
GUF_ASSERT(substr_len >= 0);
|
||||||
guf_str *guf_str_append_cstr(guf_str *str, const char *cstr_to_append); // Not necessary
|
GUF_ASSERT(substr_len <= str.len);
|
||||||
guf_str *guf_str_substr(guf_str* str, size_t pos, size_t count);
|
|
||||||
|
|
||||||
guf_str *guf_str_reserve(guf_str *str, size_t bufsize);
|
return (guf_str_view){.str = str.str + pos, .len = substr_len};
|
||||||
guf_str *guf_str_shrink_capacity(guf_str *str, size_t shrink_trigger_fac, bool shrink_exact);
|
}
|
||||||
|
|
||||||
char guf_str_pop_back(guf_str *str);
|
|
||||||
char guf_str_pop_front(guf_str *str);
|
|
||||||
|
|
||||||
// Copying and viewing:
|
|
||||||
guf_str guf_str_substr_cpy(guf_str_view str, size_t pos, size_t count); // not necessary
|
|
||||||
guf_str_view guf_str_substr_view(guf_str_view str, size_t pos, size_t count);
|
|
||||||
|
|
||||||
// Indexing:
|
|
||||||
char *guf_str_at(guf_str *str, size_t idx);
|
|
||||||
char *guf_str_back(guf_str *str);
|
|
||||||
char *guf_str_front(guf_str *str);
|
|
||||||
const char *guf_str_const_cstr(const guf_str *str);
|
|
||||||
|
|
||||||
// Metadata retrieval:
|
|
||||||
size_t guf_str_len(const guf_str *str); // The size (in chars) without the final zero-terminator (size - 1).
|
|
||||||
size_t guf_str_capacity(const guf_str *str);
|
|
||||||
bool guf_str_is_stack_allocated(const guf_str *str);
|
|
||||||
bool guf_str_is_valid(const guf_str *str);
|
|
||||||
bool guf_str_alloc_success(const guf_str *str);
|
|
||||||
|
|
||||||
// Comparison:
|
// Comparison:
|
||||||
bool guf_str_view_equal(guf_str_view a, guf_str_view b);
|
GUF_FN_KEYWORDS bool guf_str_view_equal(const guf_str_view* a, const guf_str_view* b)
|
||||||
bool guf_str_equal(const guf_str *a, const guf_str *b);
|
{
|
||||||
bool guf_str_equals_cstr(const guf_str *a, const char *c_str);
|
GUF_ASSERT_RELEASE(a && b);
|
||||||
bool guf_str_equals_strview(const guf_str *a, guf_str_view b);
|
GUF_ASSERT_RELEASE(a->str && b->str);
|
||||||
int guf_str_view_cmp(const void *str_view_a, const void *str_view_b); // For qsort etc.
|
if (a->len != b->len) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
GUF_ASSERT_RELEASE(a->len >= 0);
|
||||||
|
|
||||||
// UTF-8 operations.
|
return 0 == memcmp(a->str, b->str, a->len);
|
||||||
bool guf_str_char_is_ascii(char c);
|
}
|
||||||
bool guf_str_is_ascii(const guf_str *str);
|
|
||||||
|
|
||||||
#endif
|
#undef GUF_IMPL
|
||||||
|
#undef GUF_IMPL_STATIC
|
||||||
|
#undef GUF_STATIC
|
||||||
|
#undef GUF_FN_KEYWORDS
|
||||||
|
|
||||||
|
#endif /* end impl */
|
||||||
|
|||||||
@ -1,14 +1,14 @@
|
|||||||
|
#ifndef GUF_UTF8_H
|
||||||
|
#define GUF_UTF8_H
|
||||||
|
#include "guf_common.h"
|
||||||
|
#include "guf_str.h"
|
||||||
|
|
||||||
// Linkage keyword prepended to every function of this header:
// `static` when the includer requests a static build (GUF_STATIC) or a static
// implementation (GUF_IMPL_STATIC), nothing (external linkage) otherwise.
// The macro tested here has to be GUF_IMPL_STATIC, i.e. the same name the
// implementation section of this header is guarded with.
#if defined(GUF_STATIC) || defined(GUF_IMPL_STATIC)
    #define GUF_FN_KEYWORDS static
#else
    #define GUF_FN_KEYWORDS
#endif
|
||||||
|
|
||||||
#ifndef GUF_UTF8_H
|
|
||||||
#define GUF_UTF8_H
|
|
||||||
#include "guf_common.h"
|
|
||||||
#include "guf_str.h"
|
|
||||||
|
|
||||||
typedef struct guf_utf8_char {
|
typedef struct guf_utf8_char {
|
||||||
char bytes[5];
|
char bytes[5];
|
||||||
} guf_utf8_char;
|
} guf_utf8_char;
|
||||||
@ -32,6 +32,12 @@
|
|||||||
|
|
||||||
GUF_FN_KEYWORDS guf_utf8_stat guf_utf8_char_next(guf_utf8_char *res, guf_str_view *str);
|
GUF_FN_KEYWORDS guf_utf8_stat guf_utf8_char_next(guf_utf8_char *res, guf_str_view *str);
|
||||||
|
|
||||||
|
GUF_FN_KEYWORDS guf_str_view guf_str_next_tok(guf_str_view *input, const guf_str_view *delims, ptrdiff_t num_delims, const guf_str_view *preserved_delims, ptrdiff_t num_preserved_delims);
|
||||||
|
|
||||||
|
|
||||||
|
extern const char* const guf_utf8_whitespace[25];
|
||||||
|
extern const char* const guf_utf8_punctuation[29];
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(GUF_IMPL) || defined(GUF_IMPL_STATIC)
|
#if defined(GUF_IMPL) || defined(GUF_IMPL_STATIC)
|
||||||
@ -39,6 +45,22 @@
|
|||||||
#include "guf_common.h"
|
#include "guf_common.h"
|
||||||
#include "guf_assert.h"
|
#include "guf_assert.h"
|
||||||
|
|
||||||
|
/*
    UTF-8 encodings (zero-terminated) of the 25 code points that carry the
    Unicode White_Space property. Every entry is unique, so the table can be
    used directly as a delimiter list.
*/
const char* const guf_utf8_whitespace[25] =
{
    // ASCII: U+0020 space, U+000A LF, U+0009 tab, U+000D CR, U+000B VT, U+000C FF
    " ", "\n", "\t", "\r", "\v", "\f",
    // Two-byte sequences: U+0085 next line, U+00A0 no-break space
    "\xC2\x85", "\xC2\xA0",
    // Three-byte sequences: U+1680, U+2000..U+200A, U+2028, U+2029, U+202F, U+205F, U+3000
    "\xE1\x9A\x80",
    "\xE2\x80\x80", "\xE2\x80\x81", "\xE2\x80\x82", "\xE2\x80\x83", "\xE2\x80\x84", "\xE2\x80\x85",
    "\xE2\x80\x86", "\xE2\x80\x87", "\xE2\x80\x88", "\xE2\x80\x89", "\xE2\x80\x8A",
    "\xE2\x80\xA8", "\xE2\x80\xA9", "\xE2\x80\xAF", "\xE2\x81\x9F", "\xE3\x80\x80"
};
|
||||||
|
|
||||||
|
/*
    UTF-8 encodings (zero-terminated) of common punctuation characters, usable
    as a delimiter list. Non-ASCII entries are spelled as explicit byte escapes
    (like guf_utf8_whitespace) so the table does not depend on the encoding the
    source file is saved or compiled in.
*/
const char* const guf_utf8_punctuation[29] =
{
    ".", ",", ";", ":", "(", ")", "[", "]", "!", "?",
    "\xC2\xBF",     // U+00BF inverted question mark
    "\xC2\xA1",     // U+00A1 inverted exclamation mark
    "&", "+", "-", "/", "*", "\"", "'",
    "\xE2\x80\x9E", // U+201E double low-9 quotation mark
    "\xE2\x80\x9C", // U+201C left double quotation mark
    "\xC2\xB4",     // U+00B4 acute accent
    "`", "\\", "%",
    "\xE2\x80\x92", // U+2012 figure dash
    "\xE2\x80\x93", // U+2013 en dash
    "\xE2\x80\x94", // U+2014 em dash
    // NOTE(review): this slot previously repeated the em dash; U+2015 continues
    // the dash sequence and is presumably what was meant -- confirm.
    "\xE2\x80\x95"  // U+2015 horizontal bar
};
|
||||||
|
|
||||||
|
#ifndef GUF_FN_KEYWORDS
|
||||||
|
#define GUF_FN_KEYWORDS
|
||||||
|
#endif
|
||||||
|
|
||||||
GUF_FN_KEYWORDS guf_utf8_stat guf_utf8_char_next(guf_utf8_char *res, guf_str_view *str)
|
GUF_FN_KEYWORDS guf_utf8_stat guf_utf8_char_next(guf_utf8_char *res, guf_str_view *str)
|
||||||
{
|
{
|
||||||
GUF_ASSERT_RELEASE(res);
|
GUF_ASSERT_RELEASE(res);
|
||||||
@ -204,6 +226,60 @@ GUF_FN_KEYWORDS bool guf_utf8_char_is_whitespace(const guf_utf8_char *c)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
GUF_FN_KEYWORDS guf_str_view guf_str_next_tok(guf_str_view *input, const guf_str_view *delims, ptrdiff_t num_delims, const guf_str_view *preserved_delims, ptrdiff_t num_preserved_delims)
|
||||||
|
{
|
||||||
|
if (input->len <= 0 || input->str == NULL) {
|
||||||
|
return (guf_str_view){.str = NULL, .len = 0};
|
||||||
|
}
|
||||||
|
|
||||||
|
ptrdiff_t max_delim_len = -1;
|
||||||
|
for (ptrdiff_t i = 0; i < num_delims; ++i) {
|
||||||
|
if (delims[i].len > max_delim_len) {
|
||||||
|
max_delim_len = delims[i].len;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
guf_str_view tok = {.str = input->str, .len = 0};
|
||||||
|
|
||||||
|
guf_utf8_char ch = {0};
|
||||||
|
|
||||||
|
guf_str_view prev_input = *input;
|
||||||
|
|
||||||
|
for (guf_utf8_stat stat = guf_utf8_char_next(&ch, input); stat != GUF_UTF8_READ_DONE; stat = guf_utf8_char_next(&ch, input)) {
|
||||||
|
if (stat != GUF_UTF8_READ_VALID) {
|
||||||
|
prev_input = *input;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
const int num_bytes = guf_utf8_char_num_bytes(&ch);
|
||||||
|
|
||||||
|
for (ptrdiff_t delim_len = GUF_MIN(max_delim_len, prev_input.len); delim_len > 0; --delim_len) {
|
||||||
|
guf_str_view delim_candidate = guf_substr_view(prev_input, 0, delim_len);
|
||||||
|
for (ptrdiff_t delim_i = 0; delim_i < num_delims; ++delim_i) {
|
||||||
|
if (guf_str_view_equal(&delim_candidate, delims + delim_i)) { // Found delim.
|
||||||
|
input->len = prev_input.len - delim_len;
|
||||||
|
input->str = prev_input.len > 0 ? prev_input.str + delim_len : NULL;
|
||||||
|
if (preserved_delims && num_preserved_delims > 0) {
|
||||||
|
return delim_candidate;
|
||||||
|
}
|
||||||
|
if (tok.len == 0) {
|
||||||
|
tok.str = input->str;
|
||||||
|
goto end;
|
||||||
|
} else {
|
||||||
|
return tok;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
tok.len += num_bytes;
|
||||||
|
|
||||||
|
end:;
|
||||||
|
prev_input = *input;
|
||||||
|
}
|
||||||
|
|
||||||
|
return tok;
|
||||||
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#undef GUF_FN_KEYWORDS
|
#undef GUF_FN_KEYWORDS
|
||||||
|
|||||||
@ -32,3 +32,9 @@
|
|||||||
#define GUF_T_EQ guf_cstr_const_eq
|
#define GUF_T_EQ guf_cstr_const_eq
|
||||||
#define GUF_IMPL
|
#define GUF_IMPL
|
||||||
#include "guf_dbuf.h"
|
#include "guf_dbuf.h"
|
||||||
|
|
||||||
|
#define GUF_T guf_str_view
|
||||||
|
#define GUF_CNT_NAME dbuf_str_view
|
||||||
|
#define GUF_T_EQ guf_str_view_equal
|
||||||
|
#define GUF_IMPL
|
||||||
|
#include "guf_dbuf.h"
|
||||||
|
|||||||
@ -2,6 +2,7 @@
|
|||||||
#define GUF_DBUF_IMPL_H
|
#define GUF_DBUF_IMPL_H
|
||||||
|
|
||||||
#include "guf_cstr.h"
|
#include "guf_cstr.h"
|
||||||
|
#include "guf_str.h"
|
||||||
|
|
||||||
#define GUF_CNT_NAME dbuf_int
|
#define GUF_CNT_NAME dbuf_int
|
||||||
#define GUF_T int
|
#define GUF_T int
|
||||||
@ -33,4 +34,9 @@ typedef unsigned char uchar;
|
|||||||
#define GUF_T_EQ guf_cstr_const_eq
|
#define GUF_T_EQ guf_cstr_const_eq
|
||||||
#include "guf_dbuf.h"
|
#include "guf_dbuf.h"
|
||||||
|
|
||||||
|
#define GUF_T guf_str_view
|
||||||
|
#define GUF_CNT_NAME dbuf_str_view
|
||||||
|
#define GUF_T_EQ guf_str_view_equal
|
||||||
|
#include "guf_dbuf.h"
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@ -2,3 +2,6 @@
|
|||||||
|
|
||||||
#define GUF_IMPL
|
#define GUF_IMPL
|
||||||
#include "guf_utf8.h"
|
#include "guf_utf8.h"
|
||||||
|
|
||||||
|
#define GUF_IMPL
|
||||||
|
#include "guf_str.h"
|
||||||
|
|||||||
@ -7,6 +7,7 @@ extern "C"
|
|||||||
#include "guf_alloc_libc.h"
|
#include "guf_alloc_libc.h"
|
||||||
#include "guf_dict_impl.h"
|
#include "guf_dict_impl.h"
|
||||||
#include "guf_utf8.h"
|
#include "guf_utf8.h"
|
||||||
|
#include "guf_str.h"
|
||||||
}
|
}
|
||||||
|
|
||||||
struct DictCstrToIntTest : public Test
|
struct DictCstrToIntTest : public Test
|
||||||
@ -25,22 +26,38 @@ struct DictCstrToIntTest : public Test
|
|||||||
dict_cstr_int word_cnt_dict {};
|
dict_cstr_int word_cnt_dict {};
|
||||||
dict_cstr_int_init(&word_cnt_dict, &guf_allocator_libc);
|
dict_cstr_int_init(&word_cnt_dict, &guf_allocator_libc);
|
||||||
|
|
||||||
ptrdiff_t valid_chars = 0, invalid_chars = 0, bytes = text_buf.size;
|
dbuf_str_view delims = dbuf_str_view_new(&guf_allocator_libc);
|
||||||
|
for (size_t i = 0; i < GUF_STATIC_BUF_SIZE(guf_utf8_whitespace); ++i) {
|
||||||
|
guf_str_view d = {.len = (ptrdiff_t)strlen(guf_utf8_whitespace[i]), .str = guf_utf8_whitespace[i]};
|
||||||
|
dbuf_str_view_push_val(&delims, d);
|
||||||
|
}
|
||||||
|
for (size_t i = 0; i < GUF_STATIC_BUF_SIZE(guf_utf8_punctuation); ++i) {
|
||||||
|
guf_str_view d = {.len = (ptrdiff_t)strlen(guf_utf8_punctuation[i]), .str = guf_utf8_punctuation[i]};
|
||||||
|
dbuf_str_view_push_val(&delims, d);
|
||||||
|
}
|
||||||
guf_str_view input_str = {.str = text_buf.data, .len = text_buf.size};
|
guf_str_view input_str = {.str = text_buf.data, .len = text_buf.size};
|
||||||
guf_utf8_char ch = {};
|
guf_str_view tok;
|
||||||
|
while ((tok = guf_str_next_tok(&input_str, delims.data, delims.size, NULL, -1)).len) {
|
||||||
|
// printf("tok_len: %td ", tok.len);
|
||||||
|
printf("'%.*s'\n", (int)tok.len, tok.str);
|
||||||
|
}
|
||||||
|
dbuf_str_view_free(&delims, NULL);
|
||||||
|
|
||||||
for (guf_utf8_stat stat = guf_utf8_char_next(&ch, &input_str); stat != GUF_UTF8_READ_DONE; stat = guf_utf8_char_next(&ch, &input_str)) {
|
// ptrdiff_t valid_chars = 0, invalid_chars = 0, bytes = 0;
|
||||||
if (stat == GUF_UTF8_READ_VALID) {
|
// guf_utf8_char ch = {};
|
||||||
++valid_chars;
|
// for (guf_utf8_stat stat = guf_utf8_char_next(&ch, &input_str); stat != GUF_UTF8_READ_DONE; stat = guf_utf8_char_next(&ch, &input_str)) {
|
||||||
printf("%s", ch.bytes);
|
// if (stat == GUF_UTF8_READ_VALID) {
|
||||||
} else {
|
// ++valid_chars;
|
||||||
++invalid_chars;
|
// printf("%s", ch.bytes);
|
||||||
printf("::INVALID_UTF8_CHAR::");
|
// } else {
|
||||||
}
|
// ++invalid_chars;
|
||||||
}
|
// printf("::INVALID_UTF8_CHAR::");
|
||||||
TEST_CHECK(input_str.len == 0 && input_str.str == NULL);
|
// }
|
||||||
printf("\nread %td bytes\n", bytes);
|
// bytes += guf_utf8_char_num_bytes(&ch);
|
||||||
printf("read %td valid and %td invalid utf-8 characters\n", valid_chars, invalid_chars);
|
// }
|
||||||
|
// TEST_CHECK(input_str.len == 0 && input_str.str == NULL);
|
||||||
|
// printf("\nread %td bytes\n", bytes);
|
||||||
|
// printf("read %td valid and %td invalid utf-8 characters\n", valid_chars, invalid_chars);
|
||||||
|
|
||||||
dict_cstr_int_free(&word_cnt_dict, NULL);
|
dict_cstr_int_free(&word_cnt_dict, NULL);
|
||||||
bool dbuf_null = !word_cnt_dict.kv_elems.data && !word_cnt_dict.kv_elems.allocator && !word_cnt_dict.kv_elems.capacity && !word_cnt_dict.kv_elems.size;
|
bool dbuf_null = !word_cnt_dict.kv_elems.data && !word_cnt_dict.kv_elems.allocator && !word_cnt_dict.kv_elems.capacity && !word_cnt_dict.kv_elems.size;
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user