Add tokeniser

This commit is contained in:
jun 2025-02-27 10:21:21 +01:00
parent d450cd8a45
commit 217622d816
9 changed files with 224 additions and 80 deletions

View File

@ -760,5 +760,3 @@ size_t guf_str_len_utf8(const guf_str *str)
assert(n >= 1); assert(n >= 1);
return n - 1; return n - 1;
} }
// guf_str_tokenise (const guf_str *str, const char *delims, const char *preserved_delims, )

View File

@ -3,12 +3,11 @@
#include "guf_common.h" #include "guf_common.h"
#include "guf_alloc.h" #include "guf_alloc.h"
// #define GUF_T char #if defined(GUF_STATIC) || defined(GUF_IMPL_STATIC)
// #define GUF_CNT_NAME guf_dbuf_char #define GUF_FN_KEYWORDS static
// #define GUF_T_IS_INTEGRAL_TYPE #else
// #include "guf_dbuf.h" #define GUF_FN_KEYWORDS
#endif
#define GUF_STR_ABORT_ON_ALLOC_FAILURE 1
typedef enum guf_str_state { typedef enum guf_str_state {
GUF_STR_STATE_INIT = 0, GUF_STR_STATE_INIT = 0,
@ -43,62 +42,101 @@ typedef struct guf_str_view {
#define GUF_STR_TO_VIEW(GUF_STR_PTR) ((guf_str_view){.str = guf_str_const_cstr((GUF_STR_PTR)), .len = guf_str_len((GUF_STR_PTR))}) #define GUF_STR_TO_VIEW(GUF_STR_PTR) ((guf_str_view){.str = guf_str_const_cstr((GUF_STR_PTR)), .len = guf_str_len((GUF_STR_PTR))})
#define GUF_CSTR_TO_READONLY_STR(CSTR) ((guf_str){.state = GUF_STR_STATE_VIEW, .allocator = NULL, .data.heap.c_str = CSTR, .data.heap.len = strlen(CSTR), .data.heap.capacity = 0}) #define GUF_CSTR_TO_READONLY_STR(CSTR) ((guf_str){.state = GUF_STR_STATE_VIEW, .allocator = NULL, .data.heap.c_str = CSTR, .data.heap.len = strlen(CSTR), .data.heap.capacity = 0})
extern const guf_str GUF_STR_UNINITIALISED; // Creation:
extern const guf_str GUF_STR_UNINITIALISED_FAILED_ALLOC; GUF_FN_KEYWORDS guf_str *guf_str_init(guf_str *str, guf_str_view str_view);
GUF_FN_KEYWORDS guf_str *guf_str_init_from_cstr(guf_str *str, const char* c_str);
GUF_FN_KEYWORDS guf_str *guf_str_init_empty_with_capacity(guf_str *str, size_t capacity);
// guf_str_new functions return GUF_STR_UNINITIALISED or GUF_STR_UNINITIALISED_FAILED_ALLOC on failure (can be checked with guf_str_alloc_success)
GUF_FN_KEYWORDS guf_str guf_str_new(guf_str_view str_view);
GUF_FN_KEYWORDS guf_str guf_str_new_substr(guf_str_view str_view, ptrdiff_t pos, ptrdiff_t len);
GUF_FN_KEYWORDS guf_str guf_str_new_from_cstr(const char *c_str);
GUF_FN_KEYWORDS guf_str guf_str_new_empty_with_capacity(size_t capacity);
// Destruction:
GUF_FN_KEYWORDS void guf_str_free(guf_str *str);
// Modification:
GUF_FN_KEYWORDS guf_str *guf_str_append(guf_str *str, guf_str_view to_append);
GUF_FN_KEYWORDS guf_str *guf_str_append_cstr(guf_str *str, const char *cstr_to_append); // Not necessary
GUF_FN_KEYWORDS guf_str *guf_str_substr(guf_str* str, size_t pos, size_t count);
GUF_FN_KEYWORDS guf_str *guf_str_reserve(guf_str *str, size_t bufsize);
GUF_FN_KEYWORDS guf_str *guf_str_shrink_capacity(guf_str *str, size_t shrink_trigger_fac, bool shrink_exact);
GUF_FN_KEYWORDS char guf_str_pop_back(guf_str *str);
GUF_FN_KEYWORDS char guf_str_pop_front(guf_str *str);
// Copying and viewing:
GUF_FN_KEYWORDS guf_str guf_str_substr_cpy(guf_str_view str, size_t pos, size_t count); // not necessary
GUF_FN_KEYWORDS guf_str_view guf_substr_view(guf_str_view str, ptrdiff_t pos, ptrdiff_t count);
// Indexing:
GUF_FN_KEYWORDS char *guf_str_at(guf_str *str, size_t idx);
GUF_FN_KEYWORDS char *guf_str_back(guf_str *str);
GUF_FN_KEYWORDS char *guf_str_front(guf_str *str);
GUF_FN_KEYWORDS const char *guf_str_const_cstr(const guf_str *str);
// Metadata retrieval:
GUF_FN_KEYWORDS size_t guf_str_len(const guf_str *str); // The size (in chars) without the final zero-terminator (size - 1).
GUF_FN_KEYWORDS size_t guf_str_capacity(const guf_str *str);
GUF_FN_KEYWORDS bool guf_str_is_stack_allocated(const guf_str *str);
GUF_FN_KEYWORDS bool guf_str_is_valid(const guf_str *str);
GUF_FN_KEYWORDS bool guf_str_alloc_success(const guf_str *str);
// Comparison:
GUF_FN_KEYWORDS bool guf_str_view_equal(const guf_str_view* a, const guf_str_view* b);
GUF_FN_KEYWORDS bool guf_str_equal(const guf_str *a, const guf_str *b);
GUF_FN_KEYWORDS bool guf_str_equals_cstr(const guf_str *a, const char *c_str);
GUF_FN_KEYWORDS bool guf_str_equals_strview(const guf_str *a, guf_str_view b);
GUF_FN_KEYWORDS int guf_str_view_cmp(const void *str_view_a, const void *str_view_b); // For qsort etc.
#endif
#if defined(GUF_IMPL) || defined(GUF_IMPL_STATIC)
#include <string.h>
#ifndef GUF_FN_KEYWORDS
#define GUF_FN_KEYWORDS
#endif
// TODO: find_first_of and tokenise -> for parsing, see aoclib. // TODO: find_first_of and tokenise -> for parsing, see aoclib.
// Creation:
guf_str *guf_str_init(guf_str *str, guf_str_view str_view);
guf_str *guf_str_init_from_cstr(guf_str *str, const char* c_str);
guf_str *guf_str_init_empty_with_capacity(guf_str *str, size_t capacity);
// guf_str_new functions return GUF_STR_UNINITIALISED or GUF_STR_UNINITIALISED_FAILED_ALLOC on failure (can be checked with guf_str_alloc_success)
guf_str guf_str_new(guf_str_view str_view);
guf_str guf_str_new_substr(guf_str_view str_view, ptrdiff_t pos, ptrdiff_t len);
guf_str guf_str_new_from_cstr(const char *c_str); GUF_FN_KEYWORDS guf_str_view guf_substr_view(guf_str_view str, ptrdiff_t pos, ptrdiff_t count)
guf_str guf_str_new_empty_with_capacity(size_t capacity); {
GUF_ASSERT(str.str);
GUF_ASSERT(pos >= 0);
GUF_ASSERT(count >= 0);
// Destruction: if (str.len == 0 || count == 0 || pos >= str.len || str.str == NULL) {
void guf_str_free(guf_str *str); return (guf_str_view){.str = str.str, .len = 0};
}
// Modification: const ptrdiff_t substr_len = pos + count > str.len ? str.len - pos : count;
guf_str *guf_str_append(guf_str *str, guf_str_view to_append); GUF_ASSERT(substr_len >= 0);
guf_str *guf_str_append_cstr(guf_str *str, const char *cstr_to_append); // Not necessary GUF_ASSERT(substr_len <= str.len);
guf_str *guf_str_substr(guf_str* str, size_t pos, size_t count);
guf_str *guf_str_reserve(guf_str *str, size_t bufsize); return (guf_str_view){.str = str.str + pos, .len = substr_len};
guf_str *guf_str_shrink_capacity(guf_str *str, size_t shrink_trigger_fac, bool shrink_exact); }
char guf_str_pop_back(guf_str *str);
char guf_str_pop_front(guf_str *str);
// Copying and viewing:
guf_str guf_str_substr_cpy(guf_str_view str, size_t pos, size_t count); // not necessary
guf_str_view guf_str_substr_view(guf_str_view str, size_t pos, size_t count);
// Indexing:
char *guf_str_at(guf_str *str, size_t idx);
char *guf_str_back(guf_str *str);
char *guf_str_front(guf_str *str);
const char *guf_str_const_cstr(const guf_str *str);
// Metadata retrieval:
size_t guf_str_len(const guf_str *str); // The size (in chars) without the final zero-terminator (size - 1).
size_t guf_str_capacity(const guf_str *str);
bool guf_str_is_stack_allocated(const guf_str *str);
bool guf_str_is_valid(const guf_str *str);
bool guf_str_alloc_success(const guf_str *str);
// Comparison: // Comparison:
bool guf_str_view_equal(guf_str_view a, guf_str_view b); GUF_FN_KEYWORDS bool guf_str_view_equal(const guf_str_view* a, const guf_str_view* b)
bool guf_str_equal(const guf_str *a, const guf_str *b); {
bool guf_str_equals_cstr(const guf_str *a, const char *c_str); GUF_ASSERT_RELEASE(a && b);
bool guf_str_equals_strview(const guf_str *a, guf_str_view b); GUF_ASSERT_RELEASE(a->str && b->str);
int guf_str_view_cmp(const void *str_view_a, const void *str_view_b); // For qsort etc. if (a->len != b->len) {
return false;
}
GUF_ASSERT_RELEASE(a->len >= 0);
// UTF-8 operations. return 0 == memcmp(a->str, b->str, a->len);
bool guf_str_char_is_ascii(char c); }
bool guf_str_is_ascii(const guf_str *str);
#endif #undef GUF_IMPL
#undef GUF_IMPL_STATIC
#undef GUF_STATIC
#undef GUF_FN_KEYWORDS
#endif /* end impl */

View File

@ -1,14 +1,14 @@
#ifndef GUF_UTF8_H
#define GUF_UTF8_H
#include "guf_common.h"
#include "guf_str.h"
#if defined(GUF_STATIC) || defined(GUF_STATIC_IMPL) #if defined(GUF_STATIC) || defined(GUF_STATIC_IMPL)
#define GUF_FN_KEYWORDS static #define GUF_FN_KEYWORDS static
#else #else
#define GUF_FN_KEYWORDS #define GUF_FN_KEYWORDS
#endif #endif
#ifndef GUF_UTF8_H
#define GUF_UTF8_H
#include "guf_common.h"
#include "guf_str.h"
typedef struct guf_utf8_char { typedef struct guf_utf8_char {
char bytes[5]; char bytes[5];
} guf_utf8_char; } guf_utf8_char;
@ -32,6 +32,12 @@
GUF_FN_KEYWORDS guf_utf8_stat guf_utf8_char_next(guf_utf8_char *res, guf_str_view *str); GUF_FN_KEYWORDS guf_utf8_stat guf_utf8_char_next(guf_utf8_char *res, guf_str_view *str);
GUF_FN_KEYWORDS guf_str_view guf_str_next_tok(guf_str_view *input, const guf_str_view *delims, ptrdiff_t num_delims, const guf_str_view *preserved_delims, ptrdiff_t num_preserved_delims);
extern const char* const guf_utf8_whitespace[25];
extern const char* const guf_utf8_punctuation[29];
#endif #endif
#if defined(GUF_IMPL) || defined(GUF_IMPL_STATIC) #if defined(GUF_IMPL) || defined(GUF_IMPL_STATIC)
@ -39,6 +45,22 @@
#include "guf_common.h" #include "guf_common.h"
#include "guf_assert.h" #include "guf_assert.h"
/*
    UTF-8 encodings of the 25 code points that carry the Unicode White_Space property,
    one zero-terminated string per code point:
      - ASCII:   U+0020, U+000A, U+0009, U+000D, U+000B, U+000C
      - 2 bytes: U+0085 (NEL), U+00A0 (NBSP)
      - 3 bytes: U+1680, U+2000..U+200A, U+2028, U+2029, U+202F, U+205F, U+3000
    The array length must stay in sync with the extern declaration in the header part.
*/
const char* const guf_utf8_whitespace[25] =
{
    " ", "\n", "\t", "\r", "\v", "\f", /* "\r" was missing; "\t" used to be listed twice. */
    "\xC2\x85", "\xC2\xA0",
    "\xE1\x9A\x80", "\xE2\x80\x80", "\xE2\x80\x81", "\xE2\x80\x82", "\xE2\x80\x83", "\xE2\x80\x84", "\xE2\x80\x85", "\xE2\x80\x86", "\xE2\x80\x87", "\xE2\x80\x88", "\xE2\x80\x89", "\xE2\x80\x8A", "\xE2\x80\xA8", "\xE2\x80\xA9", "\xE2\x80\xAF", "\xE2\x81\x9F", "\xE3\x80\x80"
};
// Table of 29 zero-terminated strings treated as punctuation (each entry is one UTF-8 encoded character).
// NOTE(review): six entries are empty strings (""). They look like non-ASCII characters that were lost
// to an encoding problem (presumably typographic quotes/dashes) -- confirm against the intended list.
// An empty entry has length 0, and guf_str_next_tok only tries candidate lengths > 0, so as written
// these entries can never match anything.
const char* const guf_utf8_punctuation[29] =
{
".", ",", ";", ":", "(", ")", "[", "]", "!", "?", "¿", "¡", "&", "+", "-", "/", "*", "\"", "'", "", "", "´", "`", "\\", "%", "", "", "", ""
};
#ifndef GUF_FN_KEYWORDS
#define GUF_FN_KEYWORDS
#endif
GUF_FN_KEYWORDS guf_utf8_stat guf_utf8_char_next(guf_utf8_char *res, guf_str_view *str) GUF_FN_KEYWORDS guf_utf8_stat guf_utf8_char_next(guf_utf8_char *res, guf_str_view *str)
{ {
GUF_ASSERT_RELEASE(res); GUF_ASSERT_RELEASE(res);
@ -204,6 +226,60 @@ GUF_FN_KEYWORDS bool guf_utf8_char_is_whitespace(const guf_utf8_char *c)
} }
} }
// Returns a view of the next token in *input, i.e. the next run of bytes that is not matched by any entry of delims,
// and advances *input past what was consumed (the token plus the delimiter that terminated it).
//
// input:                 View that is read from and updated in place; must not be a NULL pointer (it is dereferenced unconditionally).
// delims:                Array of num_delims delimiter views; each is compared byte-wise against the front of the remaining input.
// preserved_delims:      See NOTE(review) below; only tested for being non-NULL, its contents are never read.
// num_preserved_delims:  Only tested for being > 0.
//
// Return value:
//  - {.str = NULL, .len = 0} if input->len <= 0 or input->str is NULL on entry.
//  - Otherwise the token view; its .len is 0 if the end of the input was reached without accumulating any bytes
//    (callers in this commit loop until .len == 0).
GUF_FN_KEYWORDS guf_str_view guf_str_next_tok(guf_str_view *input, const guf_str_view *delims, ptrdiff_t num_delims, const guf_str_view *preserved_delims, ptrdiff_t num_preserved_delims)
{
if (input->len <= 0 || input->str == NULL) {
return (guf_str_view){.str = NULL, .len = 0};
}
// Longest delimiter length; bounds the candidate lengths tried below. Stays -1 if num_delims <= 0, which disables matching.
ptrdiff_t max_delim_len = -1;
for (ptrdiff_t i = 0; i < num_delims; ++i) {
if (delims[i].len > max_delim_len) {
max_delim_len = delims[i].len;
}
}
guf_str_view tok = {.str = input->str, .len = 0};
guf_utf8_char ch = {0};
// prev_input is the state of *input before the most recent guf_utf8_char_next call, i.e. it starts at the character held in ch.
guf_str_view prev_input = *input;
for (guf_utf8_stat stat = guf_utf8_char_next(&ch, input); stat != GUF_UTF8_READ_DONE; stat = guf_utf8_char_next(&ch, input)) {
if (stat != GUF_UTF8_READ_VALID) {
// Anything that is neither VALID nor DONE is skipped without being counted in tok.len.
// NOTE(review): tok.str is not moved here, so after a skipped character in the middle of a token
// the returned [tok.str, tok.str + tok.len) range presumably no longer lines up with the valid characters -- confirm intent.
prev_input = *input;
continue;
}
const int num_bytes = guf_utf8_char_num_bytes(&ch);
// Try the longest candidate first so that a longer delimiter wins over a shorter one that is its prefix.
for (ptrdiff_t delim_len = GUF_MIN(max_delim_len, prev_input.len); delim_len > 0; --delim_len) {
guf_str_view delim_candidate = guf_substr_view(prev_input, 0, delim_len);
for (ptrdiff_t delim_i = 0; delim_i < num_delims; ++delim_i) {
if (guf_str_view_equal(&delim_candidate, delims + delim_i)) { // Found delim.
// Reposition *input directly behind the delimiter (which may span more than the single character just read).
input->len = prev_input.len - delim_len;
// NOTE(review): prev_input.len >= delim_len > 0 appears to always hold here, so the NULL branch looks unreachable;
// possibly input->len > 0 was meant -- confirm.
input->str = prev_input.len > 0 ? prev_input.str + delim_len : NULL;
if (preserved_delims && num_preserved_delims > 0) {
// NOTE(review): returns the matched delimiter itself without checking that it is contained in preserved_delims,
// and drops any bytes already accumulated in tok -- looks unfinished, confirm.
return delim_candidate;
}
if (tok.len == 0) {
// Delimiter before any token byte: restart the token behind it and keep scanning.
tok.str = input->str;
goto end;
} else {
return tok;
}
}
}
}
// No delimiter starts at this character, so it belongs to the token.
tok.len += num_bytes;
end:;
prev_input = *input;
}
// Input exhausted: return whatever was accumulated (possibly an empty view).
return tok;
}
#endif #endif
#undef GUF_FN_KEYWORDS #undef GUF_FN_KEYWORDS

View File

@ -32,3 +32,9 @@
#define GUF_T_EQ guf_cstr_const_eq #define GUF_T_EQ guf_cstr_const_eq
#define GUF_IMPL #define GUF_IMPL
#include "guf_dbuf.h" #include "guf_dbuf.h"
#define GUF_T guf_str_view
#define GUF_CNT_NAME dbuf_str_view
#define GUF_T_EQ guf_str_view_equal
#define GUF_IMPL
#include "guf_dbuf.h"

View File

@ -2,6 +2,7 @@
#define GUF_DBUF_IMPL_H #define GUF_DBUF_IMPL_H
#include "guf_cstr.h" #include "guf_cstr.h"
#include "guf_str.h"
#define GUF_CNT_NAME dbuf_int #define GUF_CNT_NAME dbuf_int
#define GUF_T int #define GUF_T int
@ -33,4 +34,9 @@ typedef unsigned char uchar;
#define GUF_T_EQ guf_cstr_const_eq #define GUF_T_EQ guf_cstr_const_eq
#include "guf_dbuf.h" #include "guf_dbuf.h"
#define GUF_T guf_str_view
#define GUF_CNT_NAME dbuf_str_view
#define GUF_T_EQ guf_str_view_equal
#include "guf_dbuf.h"
#endif #endif

View File

@ -2,3 +2,6 @@
#define GUF_IMPL #define GUF_IMPL
#include "guf_utf8.h" #include "guf_utf8.h"
#define GUF_IMPL
#include "guf_str.h"

View File

@ -7,6 +7,7 @@ extern "C"
#include "guf_alloc_libc.h" #include "guf_alloc_libc.h"
#include "guf_dict_impl.h" #include "guf_dict_impl.h"
#include "guf_utf8.h" #include "guf_utf8.h"
#include "guf_str.h"
} }
struct DictCstrToIntTest : public Test struct DictCstrToIntTest : public Test
@ -25,22 +26,38 @@ struct DictCstrToIntTest : public Test
dict_cstr_int word_cnt_dict {}; dict_cstr_int word_cnt_dict {};
dict_cstr_int_init(&word_cnt_dict, &guf_allocator_libc); dict_cstr_int_init(&word_cnt_dict, &guf_allocator_libc);
ptrdiff_t valid_chars = 0, invalid_chars = 0, bytes = text_buf.size; dbuf_str_view delims = dbuf_str_view_new(&guf_allocator_libc);
for (size_t i = 0; i < GUF_STATIC_BUF_SIZE(guf_utf8_whitespace); ++i) {
guf_str_view d = {.len = (ptrdiff_t)strlen(guf_utf8_whitespace[i]), .str = guf_utf8_whitespace[i]};
dbuf_str_view_push_val(&delims, d);
}
for (size_t i = 0; i < GUF_STATIC_BUF_SIZE(guf_utf8_punctuation); ++i) {
guf_str_view d = {.len = (ptrdiff_t)strlen(guf_utf8_punctuation[i]), .str = guf_utf8_punctuation[i]};
dbuf_str_view_push_val(&delims, d);
}
guf_str_view input_str = {.str = text_buf.data, .len = text_buf.size}; guf_str_view input_str = {.str = text_buf.data, .len = text_buf.size};
guf_utf8_char ch = {}; guf_str_view tok;
while ((tok = guf_str_next_tok(&input_str, delims.data, delims.size, NULL, -1)).len) {
// printf("tok_len: %td ", tok.len);
printf("'%.*s'\n", (int)tok.len, tok.str);
}
dbuf_str_view_free(&delims, NULL);
for (guf_utf8_stat stat = guf_utf8_char_next(&ch, &input_str); stat != GUF_UTF8_READ_DONE; stat = guf_utf8_char_next(&ch, &input_str)) { // ptrdiff_t valid_chars = 0, invalid_chars = 0, bytes = 0;
if (stat == GUF_UTF8_READ_VALID) { // guf_utf8_char ch = {};
++valid_chars; // for (guf_utf8_stat stat = guf_utf8_char_next(&ch, &input_str); stat != GUF_UTF8_READ_DONE; stat = guf_utf8_char_next(&ch, &input_str)) {
printf("%s", ch.bytes); // if (stat == GUF_UTF8_READ_VALID) {
} else { // ++valid_chars;
++invalid_chars; // printf("%s", ch.bytes);
printf("::INVALID_UTF8_CHAR::"); // } else {
} // ++invalid_chars;
} // printf("::INVALID_UTF8_CHAR::");
TEST_CHECK(input_str.len == 0 && input_str.str == NULL); // }
printf("\nread %td bytes\n", bytes); // bytes += guf_utf8_char_num_bytes(&ch);
printf("read %td valid and %td invalid utf-8 characters\n", valid_chars, invalid_chars); // }
// TEST_CHECK(input_str.len == 0 && input_str.str == NULL);
// printf("\nread %td bytes\n", bytes);
// printf("read %td valid and %td invalid utf-8 characters\n", valid_chars, invalid_chars);
dict_cstr_int_free(&word_cnt_dict, NULL); dict_cstr_int_free(&word_cnt_dict, NULL);
bool dbuf_null = !word_cnt_dict.kv_elems.data && !word_cnt_dict.kv_elems.allocator && !word_cnt_dict.kv_elems.capacity && !word_cnt_dict.kv_elems.size; bool dbuf_null = !word_cnt_dict.kv_elems.data && !word_cnt_dict.kv_elems.allocator && !word_cnt_dict.kv_elems.capacity && !word_cnt_dict.kv_elems.size;

View File

@ -1,2 +1,2 @@
- guf_stack, guf_queue, guf_ringbuf - guf_stack, guf_queue, guf_ringbuf
- guf_rand etc.: move guf_fn_keywords out of header guard? - guf_rand etc.: move guf_fn_keywords out of header guard? (-> no, add a GUF_WITHOUT_TYPES)