Add tokeniser

This commit is contained in:
jun 2025-02-27 10:21:21 +01:00
parent d450cd8a45
commit 217622d816
9 changed files with 224 additions and 80 deletions

View File

@ -760,5 +760,3 @@ size_t guf_str_len_utf8(const guf_str *str)
assert(n >= 1);
return n - 1;
}
// guf_str_tokenise (const guf_str *str, const char *delims, const char *preserved_delims, )

View File

@ -3,12 +3,11 @@
#include "guf_common.h"
#include "guf_alloc.h"
// #define GUF_T char
// #define GUF_CNT_NAME guf_dbuf_char
// #define GUF_T_IS_INTEGRAL_TYPE
// #include "guf_dbuf.h"
#define GUF_STR_ABORT_ON_ALLOC_FAILURE 1
// GUF_FN_KEYWORDS is prepended to the function declarations of this header:
// it expands to `static` when GUF_STATIC or GUF_IMPL_STATIC is defined, and to nothing otherwise.
#if defined(GUF_STATIC) || defined(GUF_IMPL_STATIC)
#define GUF_FN_KEYWORDS static
#else
#define GUF_FN_KEYWORDS
#endif
typedef enum guf_str_state {
GUF_STR_STATE_INIT = 0,
@ -43,62 +42,101 @@ typedef struct guf_str_view {
#define GUF_STR_TO_VIEW(GUF_STR_PTR) ((guf_str_view){.str = guf_str_const_cstr((GUF_STR_PTR)), .len = guf_str_len((GUF_STR_PTR))})
#define GUF_CSTR_TO_READONLY_STR(CSTR) ((guf_str){.state = GUF_STR_STATE_VIEW, .allocator = NULL, .data.heap.c_str = CSTR, .data.heap.len = strlen(CSTR), .data.heap.capacity = 0})
extern const guf_str GUF_STR_UNINITIALISED;
extern const guf_str GUF_STR_UNINITIALISED_FAILED_ALLOC;
// Creation:
GUF_FN_KEYWORDS guf_str *guf_str_init(guf_str *str, guf_str_view str_view);
GUF_FN_KEYWORDS guf_str *guf_str_init_from_cstr(guf_str *str, const char* c_str);
GUF_FN_KEYWORDS guf_str *guf_str_init_empty_with_capacity(guf_str *str, size_t capacity);
// guf_str_new functions return GUF_STR_UNINITIALISED or GUF_STR_UNINITIALISED_FAILED_ALLOC on failure (can be checked with guf_str_alloc_success)
GUF_FN_KEYWORDS guf_str guf_str_new(guf_str_view str_view);
GUF_FN_KEYWORDS guf_str guf_str_new_substr(guf_str_view str_view, ptrdiff_t pos, ptrdiff_t len);
GUF_FN_KEYWORDS guf_str guf_str_new_from_cstr(const char *c_str);
GUF_FN_KEYWORDS guf_str guf_str_new_empty_with_capacity(size_t capacity);
// Destruction:
GUF_FN_KEYWORDS void guf_str_free(guf_str *str);
// Modification:
GUF_FN_KEYWORDS guf_str *guf_str_append(guf_str *str, guf_str_view to_append);
GUF_FN_KEYWORDS guf_str *guf_str_append_cstr(guf_str *str, const char *cstr_to_append); // Not necessary
GUF_FN_KEYWORDS guf_str *guf_str_substr(guf_str* str, size_t pos, size_t count);
GUF_FN_KEYWORDS guf_str *guf_str_reserve(guf_str *str, size_t bufsize);
GUF_FN_KEYWORDS guf_str *guf_str_shrink_capacity(guf_str *str, size_t shrink_trigger_fac, bool shrink_exact);
GUF_FN_KEYWORDS char guf_str_pop_back(guf_str *str);
GUF_FN_KEYWORDS char guf_str_pop_front(guf_str *str);
// Copying and viewing:
GUF_FN_KEYWORDS guf_str guf_str_substr_cpy(guf_str_view str, size_t pos, size_t count); // not necessary
GUF_FN_KEYWORDS guf_str_view guf_substr_view(guf_str_view str, ptrdiff_t pos, ptrdiff_t count);
// Indexing:
GUF_FN_KEYWORDS char *guf_str_at(guf_str *str, size_t idx);
GUF_FN_KEYWORDS char *guf_str_back(guf_str *str);
GUF_FN_KEYWORDS char *guf_str_front(guf_str *str);
GUF_FN_KEYWORDS const char *guf_str_const_cstr(const guf_str *str);
// Metadata retrieval:
GUF_FN_KEYWORDS size_t guf_str_len(const guf_str *str); // The size (in chars) without the final zero-terminator (size - 1).
GUF_FN_KEYWORDS size_t guf_str_capacity(const guf_str *str);
GUF_FN_KEYWORDS bool guf_str_is_stack_allocated(const guf_str *str);
GUF_FN_KEYWORDS bool guf_str_is_valid(const guf_str *str);
GUF_FN_KEYWORDS bool guf_str_alloc_success(const guf_str *str);
// Comparison:
GUF_FN_KEYWORDS bool guf_str_view_equal(const guf_str_view* a, const guf_str_view* b);
GUF_FN_KEYWORDS bool guf_str_equal(const guf_str *a, const guf_str *b);
GUF_FN_KEYWORDS bool guf_str_equals_cstr(const guf_str *a, const char *c_str);
GUF_FN_KEYWORDS bool guf_str_equals_strview(const guf_str *a, guf_str_view b);
GUF_FN_KEYWORDS int guf_str_view_cmp(const void *str_view_a, const void *str_view_b); // For qsort etc.
#endif
#if defined(GUF_IMPL) || defined(GUF_IMPL_STATIC)
#include <string.h>
#ifndef GUF_FN_KEYWORDS
#define GUF_FN_KEYWORDS
#endif
// TODO: find_first_of and tokenise -> for parsing, see aoclib.
// Creation:
guf_str *guf_str_init(guf_str *str, guf_str_view str_view);
guf_str *guf_str_init_from_cstr(guf_str *str, const char* c_str);
guf_str *guf_str_init_empty_with_capacity(guf_str *str, size_t capacity);
// guf_str_new functions return GUF_STR_UNINITIALISED or GUF_STR_UNINITIALISED_FAILED_ALLOC on failure (can be checked with guf_str_alloc_success)
guf_str guf_str_new(guf_str_view str_view);
guf_str guf_str_new_substr(guf_str_view str_view, ptrdiff_t pos, ptrdiff_t len);
guf_str guf_str_new_from_cstr(const char *c_str);
guf_str guf_str_new_empty_with_capacity(size_t capacity);
// Return a non-owning view of at most `count` chars of `str`, starting at offset `pos`.
// The result is clamped to the end of `str`. It is empty (len 0, still pointing at str.str)
// when `str` is empty, `count` is 0, or `pos` lies at/after the end of `str`.
// `str.str` must be non-NULL and `pos`/`count` non-negative (asserted via GUF_ASSERT).
// NOTE(review): `pos + count` is evaluated in ptrdiff_t and could overflow for a very large
// `count`; comparing `count > str.len - pos` instead would avoid that -- confirm.
// NOTE(review): the prototypes without GUF_FN_KEYWORDS interleaved in this body look like the
// removed side of the diff hunk rather than part of the function -- confirm.
GUF_FN_KEYWORDS guf_str_view guf_substr_view(guf_str_view str, ptrdiff_t pos, ptrdiff_t count)
{
GUF_ASSERT(str.str);
GUF_ASSERT(pos >= 0);
GUF_ASSERT(count >= 0);
// Destruction:
void guf_str_free(guf_str *str);
// Nothing to view: return an empty view anchored at the start of str.
if (str.len == 0 || count == 0 || pos >= str.len || str.str == NULL) {
return (guf_str_view){.str = str.str, .len = 0};
}
// Modification:
guf_str *guf_str_append(guf_str *str, guf_str_view to_append);
guf_str *guf_str_append_cstr(guf_str *str, const char *cstr_to_append); // Not necessary
guf_str *guf_str_substr(guf_str* str, size_t pos, size_t count);
// Clamp the requested length so the view never extends past the end of str.
const ptrdiff_t substr_len = pos + count > str.len ? str.len - pos : count;
GUF_ASSERT(substr_len >= 0);
GUF_ASSERT(substr_len <= str.len);
guf_str *guf_str_reserve(guf_str *str, size_t bufsize);
guf_str *guf_str_shrink_capacity(guf_str *str, size_t shrink_trigger_fac, bool shrink_exact);
char guf_str_pop_back(guf_str *str);
char guf_str_pop_front(guf_str *str);
// Copying and viewing:
guf_str guf_str_substr_cpy(guf_str_view str, size_t pos, size_t count); // not necessary
guf_str_view guf_str_substr_view(guf_str_view str, size_t pos, size_t count);
// Indexing:
char *guf_str_at(guf_str *str, size_t idx);
char *guf_str_back(guf_str *str);
char *guf_str_front(guf_str *str);
const char *guf_str_const_cstr(const guf_str *str);
// Metadata retrieval:
size_t guf_str_len(const guf_str *str); // The size (in chars) without the final zero-terminator (size - 1).
size_t guf_str_capacity(const guf_str *str);
bool guf_str_is_stack_allocated(const guf_str *str);
bool guf_str_is_valid(const guf_str *str);
bool guf_str_alloc_success(const guf_str *str);
return (guf_str_view){.str = str.str + pos, .len = substr_len};
}
// Comparison:
bool guf_str_view_equal(guf_str_view a, guf_str_view b);
bool guf_str_equal(const guf_str *a, const guf_str *b);
bool guf_str_equals_cstr(const guf_str *a, const char *c_str);
bool guf_str_equals_strview(const guf_str *a, guf_str_view b);
int guf_str_view_cmp(const void *str_view_a, const void *str_view_b); // For qsort etc.
// Compare two string views for equality.
// Returns true iff both views have the same length and memcmp over that many bytes reports no difference.
// `a`, `b` and both of their `str` pointers must be non-NULL (checked with GUF_ASSERT_RELEASE).
// NOTE(review): the prototypes interleaved before the final return look like the removed side
// of the diff hunk rather than part of the function -- confirm.
GUF_FN_KEYWORDS bool guf_str_view_equal(const guf_str_view* a, const guf_str_view* b)
{
GUF_ASSERT_RELEASE(a && b);
GUF_ASSERT_RELEASE(a->str && b->str);
// Views of different length can never be equal; this also keeps memcmp within both buffers.
if (a->len != b->len) {
return false;
}
// The length is passed to memcmp below, so it must not be negative.
GUF_ASSERT_RELEASE(a->len >= 0);
// UTF-8 operations.
bool guf_str_char_is_ascii(char c);
bool guf_str_is_ascii(const guf_str *str);
return 0 == memcmp(a->str, b->str, a->len);
}
#endif
#undef GUF_IMPL
#undef GUF_IMPL_STATIC
#undef GUF_STATIC
#undef GUF_FN_KEYWORDS
#endif /* end impl */

View File

@ -1,14 +1,14 @@
// Linkage of the guf_utf8 functions: GUF_FN_KEYWORDS expands to `static` for static builds
// and to nothing otherwise.
// The implementation section is enabled by GUF_IMPL_STATIC, so that spelling must be honoured
// here as well; GUF_STATIC_IMPL (the spelling this check used before) is still accepted.
#if defined(GUF_STATIC) || defined(GUF_IMPL_STATIC) || defined(GUF_STATIC_IMPL)
    #define GUF_FN_KEYWORDS static
#else
    #define GUF_FN_KEYWORDS
#endif
#ifndef GUF_UTF8_H
#define GUF_UTF8_H
#include "guf_common.h"
#include "guf_str.h"
// Linkage of the guf_utf8 functions: GUF_FN_KEYWORDS expands to `static` for static builds
// and to nothing otherwise.
// The implementation section is enabled by GUF_IMPL_STATIC, so that spelling must be honoured
// here as well; GUF_STATIC_IMPL (the spelling this check used before) is still accepted.
#if defined(GUF_STATIC) || defined(GUF_IMPL_STATIC) || defined(GUF_STATIC_IMPL)
    #define GUF_FN_KEYWORDS static
#else
    #define GUF_FN_KEYWORDS
#endif
typedef struct guf_utf8_char {
char bytes[5];
} guf_utf8_char;
@ -32,6 +32,12 @@
GUF_FN_KEYWORDS guf_utf8_stat guf_utf8_char_next(guf_utf8_char *res, guf_str_view *str);
GUF_FN_KEYWORDS guf_str_view guf_str_next_tok(guf_str_view *input, const guf_str_view *delims, ptrdiff_t num_delims, const guf_str_view *preserved_delims, ptrdiff_t num_preserved_delims);
extern const char* const guf_utf8_whitespace[25];
extern const char* const guf_utf8_punctuation[29];
#endif
#if defined(GUF_IMPL) || defined(GUF_IMPL_STATIC)
@ -39,6 +45,22 @@
#include "guf_common.h"
#include "guf_assert.h"
/*
    The 25 code points carrying the Unicode White_Space property, each stored as a
    zero-terminated UTF-8 string.
    Carriage return (U+000D) is listed exactly once; "\t" must not appear twice.
*/
const char* const guf_utf8_whitespace[25] =
{
    // U+0020, U+000A, U+0009, U+000D, U+000B, U+000C
    " ", "\n", "\t", "\r", "\v", "\f",
    // U+0085 (next line), U+00A0 (no-break space)
    "\xC2\x85", "\xC2\xA0",
    // U+1680
    "\xE1\x9A\x80",
    // U+2000 .. U+200A
    "\xE2\x80\x80", "\xE2\x80\x81", "\xE2\x80\x82", "\xE2\x80\x83", "\xE2\x80\x84", "\xE2\x80\x85",
    "\xE2\x80\x86", "\xE2\x80\x87", "\xE2\x80\x88", "\xE2\x80\x89", "\xE2\x80\x8A",
    // U+2028 (line separator), U+2029 (paragraph separator)
    "\xE2\x80\xA8", "\xE2\x80\xA9",
    // U+202F, U+205F, U+3000
    "\xE2\x80\xAF", "\xE2\x81\x9F", "\xE3\x80\x80"
};
/*
    Punctuation marks as zero-terminated UTF-8 strings (29 entries).
    NOTE(review): six entries are empty strings; they may be placeholders or characters
    lost in transcription -- confirm against the intended list.
*/
const char* const guf_utf8_punctuation[29] =
{
    // Sentence punctuation and brackets.
    ".", ",", ";", ":",
    "(", ")", "[", "]",
    "!", "?", "¿", "¡",
    // Operators.
    "&", "+", "-", "/", "*",
    // Quotes and accents.
    "\"", "'", "", "",
    "´", "`",
    // Miscellaneous.
    "\\", "%",
    "", "", "", ""
};
#ifndef GUF_FN_KEYWORDS
#define GUF_FN_KEYWORDS
#endif
GUF_FN_KEYWORDS guf_utf8_stat guf_utf8_char_next(guf_utf8_char *res, guf_str_view *str)
{
GUF_ASSERT_RELEASE(res);
@ -204,6 +226,60 @@ GUF_FN_KEYWORDS bool guf_utf8_char_is_whitespace(const guf_utf8_char *c)
}
}
/*
    Split the next token off the front of *input.

    input:  In/out view of the text still to be tokenised; it is advanced as bytes are consumed.
            A NULL or empty input yields the empty view {NULL, 0}.
    delims / num_delims:
            Delimiter strings. At every character boundary the longest matching delimiter wins.
            A NULL delims is treated like an empty delimiter list.
    preserved_delims / num_preserved_delims:
            If non-NULL and num_preserved_delims > 0, a delimiter is returned as a token of its
            own (after any text that preceded it has been returned by the previous call).
            NOTE(review): the contents of preserved_delims are not consulted; in this mode every
            entry of delims is returned -- confirm whether only listed delimiters should be kept.

    Returns a view into the caller's buffer (nothing is copied). Without preserved delimiters,
    leading delimiters are skipped and the token ends right before the next delimiter, which is
    consumed. The returned token has len 0 when only delimiters were left.
*/
GUF_FN_KEYWORDS guf_str_view guf_str_next_tok(guf_str_view *input, const guf_str_view *delims, ptrdiff_t num_delims, const guf_str_view *preserved_delims, ptrdiff_t num_preserved_delims)
{
    if (input == NULL || input->len <= 0 || input->str == NULL) {
        return (guf_str_view){.str = NULL, .len = 0};
    }

    // Never index into a NULL delimiter array.
    const ptrdiff_t delim_cnt = delims ? num_delims : 0;

    // Longest delimiter length bounds the candidate window below; stays -1 without delimiters.
    ptrdiff_t max_delim_len = -1;
    for (ptrdiff_t i = 0; i < delim_cnt; ++i) {
        if (delims[i].len > max_delim_len) {
            max_delim_len = delims[i].len;
        }
    }

    guf_str_view tok = {.str = input->str, .len = 0};
    guf_utf8_char ch = {0};
    guf_str_view prev_input = *input; // State of *input before the current character was read.

    for (guf_utf8_stat stat = guf_utf8_char_next(&ch, input); stat != GUF_UTF8_READ_DONE; stat = guf_utf8_char_next(&ch, input)) {
        if (stat != GUF_UTF8_READ_VALID) {
            // Keep the token contiguous: bytes skipped as invalid UTF-8 still belong to it.
            // Assumes guf_utf8_char_next shrinks input->len by the number of bytes it consumed.
            tok.len += prev_input.len - input->len;
            prev_input = *input;
            continue;
        }

        const int num_bytes = guf_utf8_char_num_bytes(&ch);

        // Try the longest possible delimiter first, starting at the character just read.
        for (ptrdiff_t delim_len = GUF_MIN(max_delim_len, prev_input.len); delim_len > 0; --delim_len) {
            guf_str_view delim_candidate = guf_substr_view(prev_input, 0, delim_len);
            for (ptrdiff_t delim_i = 0; delim_i < delim_cnt; ++delim_i) {
                if (!guf_str_view_equal(&delim_candidate, delims + delim_i)) {
                    continue;
                }
                // Found a delimiter at the start of prev_input.
                if (preserved_delims && num_preserved_delims > 0) {
                    if (tok.len > 0) {
                        // Hand out the pending token first; rewind so that the next call
                        // starts at the delimiter and returns it.
                        *input = prev_input;
                        return tok;
                    }
                    input->len = prev_input.len - delim_len;
                    input->str = prev_input.str + delim_len;
                    return delim_candidate;
                }

                // Consume the delimiter (delim_len <= prev_input.len, so this stays in bounds).
                input->len = prev_input.len - delim_len;
                input->str = prev_input.str + delim_len;
                if (tok.len > 0) {
                    return tok;
                }
                // Leading delimiter: restart the (still empty) token right after it.
                tok.str = input->str;
                goto next_char;
            }
        }

        tok.len += num_bytes;
    next_char:;
        prev_input = *input;
    }

    return tok;
}
#endif
#undef GUF_FN_KEYWORDS

View File

@ -15,9 +15,9 @@ wir nicht singen.“
Det var i den Tid, jeg gik omkring og sulted i Kristiania, denne forunderlige By,
som ingen forlader, før han har fået Mærker af den . . . .
Jeg ligger vågen på min Kvist og hører en Klokke nedenunder mig slå seks Slag; det var allerede ganske lyst,
og Folk begyndte at færdes op og ned i Trapperne. Nede ved Døren, hvor mit Rum var tapetseret med gamle Numre
og Folk begyndte at færdes op og ned i Trapperne. Nede ved Døren, hvor mit Rum var tapetseret med gamle Numre
af »Morgenbladet«, kunde jeg så tydelig se en Bekendtgørelse fra Fyrdirektøren, og lidt tilvenstre derfra et fedt,
bugnende Avertissement fra Bager Fabian Olsen om nybagt Brød.
bugnende Avertissement fra Bager Fabian Olsen om nybagt Brød.
The quick brown fox jumps over the lazy dog.

View File

@ -32,3 +32,9 @@
#define GUF_T_EQ guf_cstr_const_eq
#define GUF_IMPL
#include "guf_dbuf.h"
#define GUF_T guf_str_view
#define GUF_CNT_NAME dbuf_str_view
#define GUF_T_EQ guf_str_view_equal
#define GUF_IMPL
#include "guf_dbuf.h"

View File

@ -2,6 +2,7 @@
#define GUF_DBUF_IMPL_H
#include "guf_cstr.h"
#include "guf_str.h"
#define GUF_CNT_NAME dbuf_int
#define GUF_T int
@ -33,4 +34,9 @@ typedef unsigned char uchar;
#define GUF_T_EQ guf_cstr_const_eq
#include "guf_dbuf.h"
#define GUF_T guf_str_view
#define GUF_CNT_NAME dbuf_str_view
#define GUF_T_EQ guf_str_view_equal
#include "guf_dbuf.h"
#endif

View File

@ -2,3 +2,6 @@
#define GUF_IMPL
#include "guf_utf8.h"
#define GUF_IMPL
#include "guf_str.h"

View File

@ -7,6 +7,7 @@ extern "C"
#include "guf_alloc_libc.h"
#include "guf_dict_impl.h"
#include "guf_utf8.h"
#include "guf_str.h"
}
struct DictCstrToIntTest : public Test
@ -25,22 +26,38 @@ struct DictCstrToIntTest : public Test
dict_cstr_int word_cnt_dict {};
dict_cstr_int_init(&word_cnt_dict, &guf_allocator_libc);
ptrdiff_t valid_chars = 0, invalid_chars = 0, bytes = text_buf.size;
guf_str_view input_str = {.str = text_buf.data, .len = text_buf.size};
guf_utf8_char ch = {};
for (guf_utf8_stat stat = guf_utf8_char_next(&ch, &input_str); stat != GUF_UTF8_READ_DONE; stat = guf_utf8_char_next(&ch, &input_str)) {
if (stat == GUF_UTF8_READ_VALID) {
++valid_chars;
printf("%s", ch.bytes);
} else {
++invalid_chars;
printf("::INVALID_UTF8_CHAR::");
}
dbuf_str_view delims = dbuf_str_view_new(&guf_allocator_libc);
for (size_t i = 0; i < GUF_STATIC_BUF_SIZE(guf_utf8_whitespace); ++i) {
guf_str_view d = {.len = (ptrdiff_t)strlen(guf_utf8_whitespace[i]), .str = guf_utf8_whitespace[i]};
dbuf_str_view_push_val(&delims, d);
}
TEST_CHECK(input_str.len == 0 && input_str.str == NULL);
printf("\nread %td bytes\n", bytes);
printf("read %td valid and %td invalid utf-8 characters\n", valid_chars, invalid_chars);
for (size_t i = 0; i < GUF_STATIC_BUF_SIZE(guf_utf8_punctuation); ++i) {
guf_str_view d = {.len = (ptrdiff_t)strlen(guf_utf8_punctuation[i]), .str = guf_utf8_punctuation[i]};
dbuf_str_view_push_val(&delims, d);
}
guf_str_view input_str = {.str = text_buf.data, .len = text_buf.size};
guf_str_view tok;
while ((tok = guf_str_next_tok(&input_str, delims.data, delims.size, NULL, -1)).len) {
// printf("tok_len: %td ", tok.len);
printf("'%.*s'\n", (int)tok.len, tok.str);
}
dbuf_str_view_free(&delims, NULL);
// ptrdiff_t valid_chars = 0, invalid_chars = 0, bytes = 0;
// guf_utf8_char ch = {};
// for (guf_utf8_stat stat = guf_utf8_char_next(&ch, &input_str); stat != GUF_UTF8_READ_DONE; stat = guf_utf8_char_next(&ch, &input_str)) {
// if (stat == GUF_UTF8_READ_VALID) {
// ++valid_chars;
// printf("%s", ch.bytes);
// } else {
// ++invalid_chars;
// printf("::INVALID_UTF8_CHAR::");
// }
// bytes += guf_utf8_char_num_bytes(&ch);
// }
// TEST_CHECK(input_str.len == 0 && input_str.str == NULL);
// printf("\nread %td bytes\n", bytes);
// printf("read %td valid and %td invalid utf-8 characters\n", valid_chars, invalid_chars);
dict_cstr_int_free(&word_cnt_dict, NULL);
bool dbuf_null = !word_cnt_dict.kv_elems.data && !word_cnt_dict.kv_elems.allocator && !word_cnt_dict.kv_elems.capacity && !word_cnt_dict.kv_elems.size;

View File

@ -1,2 +1,2 @@
- guf_stack, guf_queue, guf_ringbuf
- guf_rand etc.: move guf_fn_keywords out of header guard?
- guf_rand etc.: move guf_fn_keywords out of header guard? (-> no, add a GUF_WITHOUT_TYPES)