diff --git a/src/guf_str.c b/src/guf_str.c index c86ea6a..c752093 100644 --- a/src/guf_str.c +++ b/src/guf_str.c @@ -760,5 +760,3 @@ size_t guf_str_len_utf8(const guf_str *str) assert(n >= 1); return n - 1; } - -// guf_str_tokenise (const guf_str *str, const char *delims, const char *preserved_delims, ) diff --git a/src/guf_str.h b/src/guf_str.h index be2c088..9af8789 100644 --- a/src/guf_str.h +++ b/src/guf_str.h @@ -3,12 +3,11 @@ #include "guf_common.h" #include "guf_alloc.h" -// #define GUF_T char -// #define GUF_CNT_NAME guf_dbuf_char -// #define GUF_T_IS_INTEGRAL_TYPE -// #include "guf_dbuf.h" - -#define GUF_STR_ABORT_ON_ALLOC_FAILURE 1 +#if defined(GUF_STATIC) || defined(GUF_IMPL_STATIC) + #define GUF_FN_KEYWORDS static +#else + #define GUF_FN_KEYWORDS +#endif typedef enum guf_str_state { GUF_STR_STATE_INIT = 0, @@ -43,62 +42,101 @@ typedef struct guf_str_view { #define GUF_STR_TO_VIEW(GUF_STR_PTR) ((guf_str_view){.str = guf_str_const_cstr((GUF_STR_PTR)), .len = guf_str_len((GUF_STR_PTR))}) #define GUF_CSTR_TO_READONLY_STR(CSTR) ((guf_str){.state = GUF_STR_STATE_VIEW, .allocator = NULL, .data.heap.c_str = CSTR, .data.heap.len = strlen(CSTR), .data.heap.capacity = 0}) -extern const guf_str GUF_STR_UNINITIALISED; -extern const guf_str GUF_STR_UNINITIALISED_FAILED_ALLOC; +// Creation: +GUF_FN_KEYWORDS guf_str *guf_str_init(guf_str *str, guf_str_view str_view); +GUF_FN_KEYWORDS guf_str *guf_str_init_from_cstr(guf_str *str, const char* c_str); +GUF_FN_KEYWORDS guf_str *guf_str_init_empty_with_capacity(guf_str *str, size_t capacity); +// guf_str_new functions return GUF_DICT_UNINITIALISED or GUF_STR_UNINITIALISED_FAILED_ALLOC on failure (can be checked with guf_str_alloc_success) +GUF_FN_KEYWORDS guf_str guf_str_new(guf_str_view str_view); +GUF_FN_KEYWORDS guf_str guf_str_new_substr(guf_str_view str_view, ptrdiff_t pos, ptrdiff_t len); + +GUF_FN_KEYWORDS guf_str guf_str_new_from_cstr(const char *c_str); +GUF_FN_KEYWORDS guf_str 
guf_str_new_empty_with_capacity(size_t capacity); + +// Destruction: +GUF_FN_KEYWORDS void guf_str_free(guf_str *str); + +// Modification: +GUF_FN_KEYWORDS guf_str *guf_str_append(guf_str *str, guf_str_view to_append); +GUF_FN_KEYWORDS guf_str *guf_str_append_cstr(guf_str *str, const char *cstr_to_append); // Not necessary +GUF_FN_KEYWORDS guf_str *guf_str_substr(guf_str* str, size_t pos, size_t count); + +GUF_FN_KEYWORDS guf_str *guf_str_reserve(guf_str *str, size_t bufsize); +GUF_FN_KEYWORDS guf_str *guf_str_shrink_capacity(guf_str *str, size_t shrink_trigger_fac, bool shrink_exact); + +GUF_FN_KEYWORDS char guf_str_pop_back(guf_str *str); +GUF_FN_KEYWORDS char guf_str_pop_front(guf_str *str); + +// Copying and viewing: +GUF_FN_KEYWORDS guf_str guf_str_substr_cpy(guf_str_view str, size_t pos, size_t count); // not necessary +GUF_FN_KEYWORDS guf_str_view guf_substr_view(guf_str_view str, ptrdiff_t pos, ptrdiff_t count); + +// Indexing: +GUF_FN_KEYWORDS char *guf_str_at(guf_str *str, size_t idx); +GUF_FN_KEYWORDS char *guf_str_back(guf_str *str); +GUF_FN_KEYWORDS char *guf_str_front(guf_str *str); +GUF_FN_KEYWORDS const char *guf_str_const_cstr(const guf_str *str); + +// Metadata retrieval: +GUF_FN_KEYWORDS size_t guf_str_len(const guf_str *str); // The size (in chars) without the final zero-terminator (size - 1). 
+GUF_FN_KEYWORDS size_t guf_str_capacity(const guf_str *str); +GUF_FN_KEYWORDS bool guf_str_is_stack_allocated(const guf_str *str); +GUF_FN_KEYWORDS bool guf_str_is_valid(const guf_str *str); +GUF_FN_KEYWORDS bool guf_str_alloc_success(const guf_str *str); + +// Comparison: +GUF_FN_KEYWORDS bool guf_str_view_equal(const guf_str_view* a, const guf_str_view* b); +GUF_FN_KEYWORDS bool guf_str_equal(const guf_str *a, const guf_str *b); +GUF_FN_KEYWORDS bool guf_str_equals_cstr(const guf_str *a, const char *c_str); +GUF_FN_KEYWORDS bool guf_str_equals_strview(const guf_str *a, guf_str_view b); +GUF_FN_KEYWORDS int guf_str_view_cmp(const void *str_view_a, const void *str_view_b); // For qsort etc. + +#endif + +#if defined(GUF_IMPL) || defined(GUF_IMPL_STATIC) + +#include <string.h> + +#ifndef GUF_FN_KEYWORDS + #define GUF_FN_KEYWORDS +#endif // TODO: find_first_of and tokenise -> for parsing, see aoclib. -// Creation: -guf_str *guf_str_init(guf_str *str, guf_str_view str_view); -guf_str *guf_str_init_from_cstr(guf_str *str, const char* c_str); -guf_str *guf_str_init_empty_with_capacity(guf_str *str, size_t capacity); -// guf_str_new functions return GUF_DICT_UNINITIALISED or GUF_STR_UNINITIALISED_FAILED_ALLOC on failure (can be checked with guf_str_alloc_success) -guf_str guf_str_new(guf_str_view str_view); -guf_str guf_str_new_substr(guf_str_view str_view, ptrdiff_t pos, ptrdiff_t len); -guf_str guf_str_new_from_cstr(const char *c_str); -guf_str guf_str_new_empty_with_capacity(size_t capacity); +GUF_FN_KEYWORDS guf_str_view guf_substr_view(guf_str_view str, ptrdiff_t pos, ptrdiff_t count) +{ + GUF_ASSERT(str.str); + GUF_ASSERT(pos >= 0); + GUF_ASSERT(count >= 0); -// Destruction: -void guf_str_free(guf_str *str); + if (str.len == 0 || count == 0 || pos >= str.len || str.str == NULL) { + return (guf_str_view){.str = str.str, .len = 0}; + } -// Modification: -guf_str *guf_str_append(guf_str *str, guf_str_view to_append); -guf_str *guf_str_append_cstr(guf_str *str, const char 
*cstr_to_append); // Not necessary -guf_str *guf_str_substr(guf_str* str, size_t pos, size_t count); + const ptrdiff_t substr_len = pos + count > str.len ? str.len - pos : count; + GUF_ASSERT(substr_len >= 0); + GUF_ASSERT(substr_len <= str.len); -guf_str *guf_str_reserve(guf_str *str, size_t bufsize); -guf_str *guf_str_shrink_capacity(guf_str *str, size_t shrink_trigger_fac, bool shrink_exact); - -char guf_str_pop_back(guf_str *str); -char guf_str_pop_front(guf_str *str); - -// Copying and viewing: -guf_str guf_str_substr_cpy(guf_str_view str, size_t pos, size_t count); // not necessary -guf_str_view guf_str_substr_view(guf_str_view str, size_t pos, size_t count); - -// Indexing: -char *guf_str_at(guf_str *str, size_t idx); -char *guf_str_back(guf_str *str); -char *guf_str_front(guf_str *str); -const char *guf_str_const_cstr(const guf_str *str); - -// Metadata retrieval: -size_t guf_str_len(const guf_str *str); // The size (in chars) without the final zero-terminator (size - 1). -size_t guf_str_capacity(const guf_str *str); -bool guf_str_is_stack_allocated(const guf_str *str); -bool guf_str_is_valid(const guf_str *str); -bool guf_str_alloc_success(const guf_str *str); + return (guf_str_view){.str = str.str + pos, .len = substr_len}; +} // Comparison: -bool guf_str_view_equal(guf_str_view a, guf_str_view b); -bool guf_str_equal(const guf_str *a, const guf_str *b); -bool guf_str_equals_cstr(const guf_str *a, const char *c_str); -bool guf_str_equals_strview(const guf_str *a, guf_str_view b); -int guf_str_view_cmp(const void *str_view_a, const void *str_view_b); // For qsort etc. +GUF_FN_KEYWORDS bool guf_str_view_equal(const guf_str_view* a, const guf_str_view* b) +{ + GUF_ASSERT_RELEASE(a && b); + GUF_ASSERT_RELEASE(a->str && b->str); + if (a->len != b->len) { + return false; + } + GUF_ASSERT_RELEASE(a->len >= 0); -// UTF-8 operations. 
-bool guf_str_char_is_ascii(char c); -bool guf_str_is_ascii(const guf_str *str); + return 0 == memcmp(a->str, b->str, a->len); +} -#endif +#undef GUF_IMPL +#undef GUF_IMPL_STATIC +#undef GUF_STATIC +#undef GUF_FN_KEYWORDS + +#endif /* end impl */ diff --git a/src/guf_utf8.h b/src/guf_utf8.h index b4f53c8..d92b729 100644 --- a/src/guf_utf8.h +++ b/src/guf_utf8.h @@ -1,14 +1,14 @@ -#if defined(GUF_STATIC) || defined(GUF_STATIC_IMPL) - #define GUF_FN_KEYWORDS static -#else - #define GUF_FN_KEYWORDS -#endif - #ifndef GUF_UTF8_H #define GUF_UTF8_H #include "guf_common.h" #include "guf_str.h" + #if defined(GUF_STATIC) || defined(GUF_STATIC_IMPL) + #define GUF_FN_KEYWORDS static + #else + #define GUF_FN_KEYWORDS + #endif + typedef struct guf_utf8_char { char bytes[5]; } guf_utf8_char; @@ -32,6 +32,12 @@ GUF_FN_KEYWORDS guf_utf8_stat guf_utf8_char_next(guf_utf8_char *res, guf_str_view *str); + GUF_FN_KEYWORDS guf_str_view guf_str_next_tok(guf_str_view *input, const guf_str_view *delims, ptrdiff_t num_delims, const guf_str_view *preserved_delims, ptrdiff_t num_preserved_delims); + + + extern const char* const guf_utf8_whitespace[25]; + extern const char* const guf_utf8_punctuation[29]; + #endif #if defined(GUF_IMPL) || defined(GUF_IMPL_STATIC) @@ -39,6 +45,22 @@ #include "guf_common.h" #include "guf_assert.h" +const char* const guf_utf8_whitespace[25] = +{ + " ", "\n", "\t", "\t", "\v", "\f", + "\xC2\x85", "\xC2\xA0", + "\xE1\x9A\x80", "\xE2\x80\x80", "\xE2\x80\x81", "\xE2\x80\x82", "\xE2\x80\x83", "\xE2\x80\x84", "\xE2\x80\x85", "\xE2\x80\x86", "\xE2\x80\x87", "\xE2\x80\x88", "\xE2\x80\x89", "\xE2\x80\x8A", "\xE2\x80\xA8", "\xE2\x80\xA9", "\xE2\x80\xAF", "\xE2\x81\x9F", "\xE3\x80\x80" +}; + +const char* const guf_utf8_punctuation[29] = +{ + ".", ",", ";", ":", "(", ")", "[", "]", "!", "?", "¿", "¡", "&", "+", "-", "/", "*", "\"", "'", "„", "“", "´", "`", "\\", "%", "‒", "–", "—", "—" +}; + +#ifndef GUF_FN_KEYWORDS + #define GUF_FN_KEYWORDS +#endif + GUF_FN_KEYWORDS 
guf_utf8_stat guf_utf8_char_next(guf_utf8_char *res, guf_str_view *str) { GUF_ASSERT_RELEASE(res); @@ -204,6 +226,60 @@ GUF_FN_KEYWORDS bool guf_utf8_char_is_whitespace(const guf_utf8_char *c) } } +GUF_FN_KEYWORDS guf_str_view guf_str_next_tok(guf_str_view *input, const guf_str_view *delims, ptrdiff_t num_delims, const guf_str_view *preserved_delims, ptrdiff_t num_preserved_delims) +{ + if (input->len <= 0 || input->str == NULL) { + return (guf_str_view){.str = NULL, .len = 0}; + } + + ptrdiff_t max_delim_len = -1; + for (ptrdiff_t i = 0; i < num_delims; ++i) { + if (delims[i].len > max_delim_len) { + max_delim_len = delims[i].len; + } + } + + guf_str_view tok = {.str = input->str, .len = 0}; + + guf_utf8_char ch = {0}; + + guf_str_view prev_input = *input; + + for (guf_utf8_stat stat = guf_utf8_char_next(&ch, input); stat != GUF_UTF8_READ_DONE; stat = guf_utf8_char_next(&ch, input)) { + if (stat != GUF_UTF8_READ_VALID) { + prev_input = *input; + continue; + } + + const int num_bytes = guf_utf8_char_num_bytes(&ch); + + for (ptrdiff_t delim_len = GUF_MIN(max_delim_len, prev_input.len); delim_len > 0; --delim_len) { + guf_str_view delim_candidate = guf_substr_view(prev_input, 0, delim_len); + for (ptrdiff_t delim_i = 0; delim_i < num_delims; ++delim_i) { + if (guf_str_view_equal(&delim_candidate, delims + delim_i)) { // Found delim. + input->len = prev_input.len - delim_len; + input->str = prev_input.len > 0 ? 
prev_input.str + delim_len : NULL; + if (preserved_delims && num_preserved_delims > 0) { + return delim_candidate; + } + if (tok.len == 0) { + tok.str = input->str; + goto end; + } else { + return tok; + } + } + } + } + tok.len += num_bytes; + + end:; + prev_input = *input; + } + + return tok; +} + #endif #undef GUF_FN_KEYWORDS diff --git a/src/test/data/utf8-test.txt b/src/test/data/utf8-test.txt index 47042a8..f7e4dbf 100644 --- a/src/test/data/utf8-test.txt +++ b/src/test/data/utf8-test.txt @@ -15,9 +15,9 @@ wir nicht singen.“ Det var i den Tid, jeg gik omkring og sulted i Kristiania, denne forunderlige By, som ingen forlader, før han har fået Mærker af den . . . . Jeg ligger vågen på min Kvist og hører en Klokke nedenunder mig slå seks Slag; det var allerede ganske lyst, -og Folk begyndte at færdes op og ned i Trapperne. Nede ved Døren, hvor mit Rum var tapetseret med gamle Numre +og Folk begyndte at færdes op og ned i Trapperne. Nede ved Døren, hvor mit Rum var tapetseret med gamle Numre af »Morgenbladet«, kunde jeg så tydelig se en Bekendtgørelse fra Fyrdirektøren, og lidt tilvenstre derfra et fedt, -bugnende Avertissement fra Bager Fabian Olsen om nybagt Brød. +bugnende Avertissement fra Bager Fabian Olsen om nybagt Brød. The quick brown fox jumps over the lazy dog. 
diff --git a/src/test/guf_dbuf_impl.c b/src/test/guf_dbuf_impl.c index 8e1cdf9..2265246 100644 --- a/src/test/guf_dbuf_impl.c +++ b/src/test/guf_dbuf_impl.c @@ -32,3 +32,9 @@ #define GUF_T_EQ guf_cstr_const_eq #define GUF_IMPL #include "guf_dbuf.h" + +#define GUF_T guf_str_view +#define GUF_CNT_NAME dbuf_str_view +#define GUF_T_EQ guf_str_view_equal +#define GUF_IMPL +#include "guf_dbuf.h" diff --git a/src/test/guf_dbuf_impl.h b/src/test/guf_dbuf_impl.h index af2ad41..e448e69 100644 --- a/src/test/guf_dbuf_impl.h +++ b/src/test/guf_dbuf_impl.h @@ -2,6 +2,7 @@ #define GUF_DBUF_IMPL_H #include "guf_cstr.h" +#include "guf_str.h" #define GUF_CNT_NAME dbuf_int #define GUF_T int @@ -33,4 +34,9 @@ typedef unsigned char uchar; #define GUF_T_EQ guf_cstr_const_eq #include "guf_dbuf.h" +#define GUF_T guf_str_view +#define GUF_CNT_NAME dbuf_str_view +#define GUF_T_EQ guf_str_view_equal +#include "guf_dbuf.h" + #endif diff --git a/src/test/guf_utf8_impl.c b/src/test/guf_utf8_impl.c index b0b2362..d4e6cc1 100644 --- a/src/test/guf_utf8_impl.c +++ b/src/test/guf_utf8_impl.c @@ -2,3 +2,6 @@ #define GUF_IMPL #include "guf_utf8.h" + +#define GUF_IMPL +#include "guf_str.h" diff --git a/src/test/test_dict.hpp b/src/test/test_dict.hpp index 9c55add..b93dd3c 100644 --- a/src/test/test_dict.hpp +++ b/src/test/test_dict.hpp @@ -7,6 +7,7 @@ extern "C" #include "guf_alloc_libc.h" #include "guf_dict_impl.h" #include "guf_utf8.h" + #include "guf_str.h" } struct DictCstrToIntTest : public Test @@ -25,22 +26,38 @@ struct DictCstrToIntTest : public Test dict_cstr_int word_cnt_dict {}; dict_cstr_int_init(&word_cnt_dict, &guf_allocator_libc); - ptrdiff_t valid_chars = 0, invalid_chars = 0, bytes = text_buf.size; - guf_str_view input_str = {.str = text_buf.data, .len = text_buf.size}; - guf_utf8_char ch = {}; - - for (guf_utf8_stat stat = guf_utf8_char_next(&ch, &input_str); stat != GUF_UTF8_READ_DONE; stat = guf_utf8_char_next(&ch, &input_str)) { - if (stat == GUF_UTF8_READ_VALID) { - 
++valid_chars; - printf("%s", ch.bytes); - } else { - ++invalid_chars; - printf("::INVALID_UTF8_CHAR::"); - } + dbuf_str_view delims = dbuf_str_view_new(&guf_allocator_libc); + for (size_t i = 0; i < GUF_STATIC_BUF_SIZE(guf_utf8_whitespace); ++i) { + guf_str_view d = {.len = (ptrdiff_t)strlen(guf_utf8_whitespace[i]), .str = guf_utf8_whitespace[i]}; + dbuf_str_view_push_val(&delims, d); } - TEST_CHECK(input_str.len == 0 && input_str.str == NULL); - printf("\nread %td bytes\n", bytes); - printf("read %td valid and %td invalid utf-8 characters\n", valid_chars, invalid_chars); + for (size_t i = 0; i < GUF_STATIC_BUF_SIZE(guf_utf8_punctuation); ++i) { + guf_str_view d = {.len = (ptrdiff_t)strlen(guf_utf8_punctuation[i]), .str = guf_utf8_punctuation[i]}; + dbuf_str_view_push_val(&delims, d); + } + guf_str_view input_str = {.str = text_buf.data, .len = text_buf.size}; + guf_str_view tok; + while ((tok = guf_str_next_tok(&input_str, delims.data, delims.size, NULL, -1)).len) { + // printf("tok_len: %td ", tok.len); + printf("'%.*s'\n", (int)tok.len, tok.str); + } + dbuf_str_view_free(&delims, NULL); + + // ptrdiff_t valid_chars = 0, invalid_chars = 0, bytes = 0; + // guf_utf8_char ch = {}; + // for (guf_utf8_stat stat = guf_utf8_char_next(&ch, &input_str); stat != GUF_UTF8_READ_DONE; stat = guf_utf8_char_next(&ch, &input_str)) { + // if (stat == GUF_UTF8_READ_VALID) { + // ++valid_chars; + // printf("%s", ch.bytes); + // } else { + // ++invalid_chars; + // printf("::INVALID_UTF8_CHAR::"); + // } + // bytes += guf_utf8_char_num_bytes(&ch); + // } + // TEST_CHECK(input_str.len == 0 && input_str.str == NULL); + // printf("\nread %td bytes\n", bytes); + // printf("read %td valid and %td invalid utf-8 characters\n", valid_chars, invalid_chars); dict_cstr_int_free(&word_cnt_dict, NULL); bool dbuf_null = !word_cnt_dict.kv_elems.data && !word_cnt_dict.kv_elems.allocator && !word_cnt_dict.kv_elems.capacity && !word_cnt_dict.kv_elems.size; diff --git a/todo.txt b/todo.txt index 
f4847fe..48fe1c6 100644 --- a/todo.txt +++ b/todo.txt @@ -1,2 +1,2 @@ - guf_stack, guf_queue, guf_ringbuf -- guf_rand etc.: move guf_fn_keywors out of header guard? \ No newline at end of file +- guf_rand etc.: move guf_fn_keywords out of header guard? (-> no, add a GUF_WITHOUT_TYPES) \ No newline at end of file