libguf/src/guf_utf8.h

#ifndef GUF_UTF8_H
#define GUF_UTF8_H
    #include "guf_common.h"
    #include "guf_str.h"

    // Linkage prefix applied to every function this header declares and defines.
    // The implementation section is switched on by GUF_IMPL or GUF_IMPL_STATIC, so
    // GUF_IMPL_STATIC has to select `static` here as well; GUF_STATIC and the
    // spelling GUF_STATIC_IMPL that was checked here before are still accepted.
    #if defined(GUF_STATIC) || defined(GUF_STATIC_IMPL) || defined(GUF_IMPL_STATIC)
        #define GUF_FN_KEYWORDS static
    #else
        #define GUF_FN_KEYWORDS
    #endif

    // Storage for one UTF-8 encoded character.
    // guf_utf8_char_next writes the lead byte to bytes[0], zeroes bytes[1..4] and then
    // stores at most three further bytes, so a character filled that way is followed
    // by at least one '\0' byte.
    typedef struct guf_utf8_char {
        char bytes[5];
    } guf_utf8_char;

    // Status returned by guf_utf8_char_next.
    typedef enum guf_utf8_stat {
        // The string view was empty (len <= 0 or str == NULL); nothing was read.
        GUF_UTF8_READ_DONE,
        // All bytes announced by the lead byte were read and guf_utf8_char_is_valid accepted them.
        GUF_UTF8_READ_VALID,
        // The lead byte cannot start a sequence, or the bytes read were rejected by guf_utf8_char_is_valid.
        GUF_UTF8_READ_INVALID,
        // The string view ended before all bytes announced by the lead byte could be read.
        GUF_UTF8_READ_TRUNCATED,
    } guf_utf8_stat;

    // Returns true if c lies in the 7-bit ASCII range [0, 127]. Negative values
    // (e.g. a signed char holding a byte >= 0x80) are not ASCII.
    static inline bool guf_char_is_ascii(int c) {return c >= 0 && c <= 127;}
    // Returns true if the byte value is below 0x80, i.e. inside the ASCII range [0, 127].
    static inline bool guf_uchar_is_ascii(unsigned char c) { return c < 0x80; }

    // Length in bytes (1 to 4) of the UTF-8 sequence introduced by lead byte c,
    // or 0 if c cannot start a sequence (0x80..0xC1 and 0xF5..0xFF).
    GUF_FN_KEYWORDS int guf_utf8_num_bytes(unsigned char c);
    // Same as guf_utf8_num_bytes, applied to c->bytes[0].
    GUF_FN_KEYWORDS int guf_utf8_char_num_bytes(guf_utf8_char *c);

    // NOTE(review): no definition of guf_utf8_char_new appears in the implementation
    // section of this file; confirm that it is provided elsewhere.
    GUF_FN_KEYWORDS guf_utf8_char guf_utf8_char_new(const char *bytes, int num_bytes);
    // Returns true if c holds a well-formed UTF-8 sequence according to the RFC 3629 syntax.
    GUF_FN_KEYWORDS bool guf_utf8_char_is_valid(const guf_utf8_char *c);
    // Returns true if c matches one of the one-, two- or three-byte white-space encodings
    // listed in the implementation.
    GUF_FN_KEYWORDS bool guf_utf8_char_is_whitespace(const guf_utf8_char *c);

    // Reads the next character from the front of *str into *res and advances *str past
    // the bytes it consumed; see guf_utf8_stat for the possible results.
    GUF_FN_KEYWORDS guf_utf8_stat guf_utf8_char_next(guf_utf8_char *res, guf_str_view *str);

    // Returns the next token of *input and advances *input. Tokens are separated by the
    // strings in delims; a delimiter that also occurs in preserved_delims is itself
    // returned as a token instead of being skipped.
    GUF_FN_KEYWORDS guf_str_view guf_str_next_tok(guf_str_view *input, const guf_str_view *delims, ptrdiff_t num_delims, const guf_str_view *preserved_delims, ptrdiff_t num_preserved_delims);


    // Tables of UTF-8 encoded string literals; both are defined in the implementation
    // section of this header with external linkage (GUF_FN_KEYWORDS is not applied to them).
    extern const char* const guf_utf8_whitespace[25];
    extern const char* const guf_utf8_punctuation[29];

#endif

#if defined(GUF_IMPL) || defined(GUF_IMPL_STATIC)

#include "guf_common.h"
#include "guf_assert.h"

// UTF-8 encodings of the 25 Unicode white-space characters, grouped by encoded length:
//   one byte:    space, LF, TAB, CR, VT, FF
//   two bytes:   U+0085, U+00A0
//   three bytes: U+1680, U+2000..U+200A, U+2028, U+2029, U+202F, U+205F, U+3000
const char* const guf_utf8_whitespace[25] =
{
    " ", "\n", "\t", "\r", "\v", "\f",
    "\xC2\x85", "\xC2\xA0",
    "\xE1\x9A\x80", "\xE2\x80\x80", "\xE2\x80\x81", "\xE2\x80\x82", "\xE2\x80\x83", "\xE2\x80\x84", "\xE2\x80\x85", "\xE2\x80\x86", "\xE2\x80\x87", "\xE2\x80\x88", "\xE2\x80\x89", "\xE2\x80\x8A", "\xE2\x80\xA8", "\xE2\x80\xA9", "\xE2\x80\xAF", "\xE2\x81\x9F", "\xE3\x80\x80"
};

// 29 punctuation marks as UTF-8 encoded string literals: ASCII punctuation, the inverted
// question/exclamation marks, two typographic quotation marks, the acute accent and
// several dashes.
// NOTE(review): the last two entries appear to be the same character; confirm whether a
// different dash was intended for one of them.
const char* const guf_utf8_punctuation[29] =
{
    ".", ",", ";", ":", "(", ")", "[", "]", "!", "?", "¿", "¡", "&", "+", "-", "/", "*", "\"", "'", "„", "“", "´", "`", "\\", "%", "‒", "–", "—", "—"
};

// Fallback for compiling the implementation section while GUF_FN_KEYWORDS is not defined:
// the declaration section is skipped whenever GUF_UTF8_H is already defined, and the macro
// is #undef'd at the end of this file. In that case the functions get no linkage prefix.
#ifndef GUF_FN_KEYWORDS
    #define GUF_FN_KEYWORDS
#endif

/*
    Reads one UTF-8 character from the front of *str into *res and advances *str.

    res: receives the bytes that were read; all unused slots are set to '\0'.
    str: view to read from. Its len is decremented for every consumed byte and its str
         pointer is set to NULL as soon as len reaches zero.

    Returns
      GUF_UTF8_READ_DONE      if the view is empty (nothing is consumed, *res is untouched),
      GUF_UTF8_READ_INVALID   if the lead byte cannot start a sequence (only that byte is
                              consumed) or the complete sequence fails guf_utf8_char_is_valid,
      GUF_UTF8_READ_TRUNCATED if the view ends before the announced number of bytes was read,
      GUF_UTF8_READ_VALID     otherwise.

    Note: the trailing bytes are consumed without looking at them; they are only validated
    after the whole sequence has been read.
*/
GUF_FN_KEYWORDS guf_utf8_stat guf_utf8_char_next(guf_utf8_char *res, guf_str_view *str)
{
    GUF_ASSERT_RELEASE(res);
    GUF_ASSERT_RELEASE(str);

    if (str->str == NULL || str->len <= 0) {
        return GUF_UTF8_READ_DONE;
    }

    // Take the lead byte first ...
    res->bytes[0] = *str->str;
    int num_read = 1;
    str->len -= 1;
    str->str = (str->len != 0) ? (str->str + 1) : NULL;

    // ... and clear every remaining slot of the result.
    for (size_t slot = 1; slot < GUF_STATIC_BUF_SIZE(res->bytes); ++slot) {
        res->bytes[slot] = '\0';
    }

    const int expected = guf_utf8_char_num_bytes(res);
    if (expected == 0) {
        return GUF_UTF8_READ_INVALID;
    }

    // Pull in the trailing bytes for as long as the view has bytes left.
    for (; num_read < expected && str->len > 0; ++num_read) {
        res->bytes[num_read] = *str->str;
        str->len -= 1;
        str->str = (str->len != 0) ? (str->str + 1) : NULL;
    }

    if (num_read < expected) {
        return GUF_UTF8_READ_TRUNCATED;
    }
    return guf_utf8_char_is_valid(res) ? GUF_UTF8_READ_VALID : GUF_UTF8_READ_INVALID;
}


// Maps a lead byte to the length of the UTF-8 sequence it introduces.
// Returns 1 to 4, or 0 if c can never start a sequence.
// cf. https://www.rfc-editor.org/rfc/rfc3629#page-4
GUF_FN_KEYWORDS int guf_utf8_num_bytes(unsigned char c)
{
    if (c < 0x80) {   // 0x00..0x7F, bits: 0xxx.xxxx
        return 1;
    }
    if (c < 0xC2) {   // 0x80..0xBF are tail bytes; 0xC0 and 0xC1 never appear.
        return 0;
    }
    if (c < 0xE0) {   // 0xC2..0xDF, bits: 110x.xxxx
        return 2;
    }
    if (c < 0xF0) {   // 0xE0..0xEF, bits: 1110.xxxx
        return 3;
    }
    if (c < 0xF5) {   // 0xF0..0xF4, bits: 1111.0xxx
        return 4;
    }
    return 0;         // 0xF5..0xFF never appear.
}

// Length (1 to 4) of the sequence announced by the first byte stored in c,
// or 0 if that byte cannot start a UTF-8 sequence. c must not be NULL.
GUF_FN_KEYWORDS int guf_utf8_char_num_bytes(guf_utf8_char *c)
{
    GUF_ASSERT(c);
    const unsigned char lead = (unsigned char)c->bytes[0];
    return guf_utf8_num_bytes(lead);
}


/*
    Returns true if c starts with a well-formed UTF-8 sequence as defined by the syntax
    table of RFC 3629 (cf. https://datatracker.ietf.org/doc/html/rfc3629#page-5).

    Only the bytes announced by the lead byte are inspected. The octets 0xC0, 0xC1 and
    0xF5..0xFF never pass: guf_utf8_num_bytes rejects them as lead bytes and they lie
    outside of every range accepted for the following bytes.
*/
GUF_FN_KEYWORDS bool guf_utf8_char_is_valid(const guf_utf8_char *c)
{
    const unsigned char *octets = (const unsigned char*)c->bytes;
    const int len = guf_utf8_num_bytes(octets[0]);

    if (len == 0) { // Not a possible lead byte.
        return false;
    }
    if (len == 1) { // Plain ASCII.
        return true;
    }

    // The second byte is a regular tail byte (binary 10xx.xxxx, i.e. 0x80..0xBF), except
    // after four lead bytes where the range is narrowed to exclude overlong encodings,
    // surrogates and code points above U+10FFFF.
    unsigned char second_min = 0x80;
    unsigned char second_max = 0xBF;
    switch (octets[0])
    {
    case 0xE0:
        second_min = 0xA0;
        break;
    case 0xED:
        second_max = 0x9F;
        break;
    case 0xF0:
        second_min = 0x90;
        break;
    case 0xF4:
        second_max = 0x8F;
        break;
    default:
        break;
    }

    if (octets[1] < second_min || octets[1] > second_max) {
        return false;
    }

    // Every byte after the second one must be a regular tail byte.
    for (int i = 2; i < len; ++i) {
        if (octets[i] < 0x80 || octets[i] > 0xBF) {
            return false;
        }
    }
    return true;
}

/*
    Returns true if c is one of the Unicode white-space characters listed below.

    The length announced by the lead byte selects the table that is searched; only that
    many bytes of c are compared. Characters with an invalid lead byte or a four-byte
    encoding are never white-space.
*/
GUF_FN_KEYWORDS bool guf_utf8_char_is_whitespace(const guf_utf8_char *c)
{
    // cf. https://en.wikipedia.org/wiki/Whitespace_character#Unicode (last-retrieved 2025-02-27)
    // One byte: space, LF, TAB, CR, VT, FF.
    const char *ws_one_byte[]    = {" ", "\n", "\t", "\r", "\v", "\f"};
    // Two bytes: U+0085, U+00A0.
    const char *ws_two_bytes[]   = {"\xC2\x85", "\xC2\xA0"};
    // Three bytes: U+1680, U+2000..U+200A, U+2028, U+2029, U+202F, U+205F, U+3000.
    const char *ws_three_bytes[] = {"\xE1\x9A\x80", "\xE2\x80\x80", "\xE2\x80\x81", "\xE2\x80\x82", "\xE2\x80\x83", "\xE2\x80\x84", "\xE2\x80\x85", "\xE2\x80\x86", "\xE2\x80\x87", "\xE2\x80\x88", "\xE2\x80\x89", "\xE2\x80\x8A", "\xE2\x80\xA8", "\xE2\x80\xA9", "\xE2\x80\xAF", "\xE2\x81\x9F", "\xE3\x80\x80"};

    const int num_bytes = guf_utf8_num_bytes(c->bytes[0]);

    switch (num_bytes)
    {
    case 1:
        for (size_t i = 0; i < GUF_STATIC_BUF_SIZE(ws_one_byte); ++i) {
            if (c->bytes[0] == ws_one_byte[i][0]) {
                return true;
            }
        }
        return false;

    case 2:
        for (size_t i = 0; i < GUF_STATIC_BUF_SIZE(ws_two_bytes); ++i) {
            if (c->bytes[0] == ws_two_bytes[i][0] && c->bytes[1] == ws_two_bytes[i][1]) {
                return true;
            }
        }
        return false;

    case 3:
        for (size_t i = 0; i < GUF_STATIC_BUF_SIZE(ws_three_bytes); ++i) {
            if (c->bytes[0] == ws_three_bytes[i][0] && c->bytes[1] == ws_three_bytes[i][1] && c->bytes[2] == ws_three_bytes[i][2]) {
                return true;
            }
        }
        return false;

    default:
        return false;
    }
}

/*
    Splits the next token off the front of *input and advances *input.

    input:                view to tokenise; updated so that the next call continues
                          behind the returned token.
    delims:               num_delims strings that separate tokens. At every position the
                          longest candidate is tried first.
    preserved_delims:     num_preserved_delims strings (may be NULL/0). A delimiter that
                          is also listed here is not skipped but returned as a token of
                          its own.

    Returns a view into the original buffer. The view is {NULL, 0} if *input is empty on
    entry; a token of length 0 is returned when only skipped delimiters were left.

    Bytes that guf_utf8_char_next does not report as a valid character are consumed, but
    they are neither compared against the delimiters nor added to the token length.
    NOTE(review): a token that spans such bytes is therefore shorter than the bytes it
    covers; confirm that this is intended.
*/
GUF_FN_KEYWORDS guf_str_view guf_str_next_tok(guf_str_view *input, const guf_str_view *delims, ptrdiff_t num_delims, const guf_str_view *preserved_delims, ptrdiff_t num_preserved_delims)
{
    if (input->len <= 0 || input->str == NULL) {
        return (guf_str_view){.str = NULL, .len = 0};
    }

    // The longest delimiter bounds how many bytes have to be compared at each position.
    ptrdiff_t max_delim_len = -1;
    for (ptrdiff_t i = 0; i < num_delims; ++i) {
        if (delims[i].len > max_delim_len) {
            max_delim_len = delims[i].len;
        }
    }

    guf_str_view tok = {.str = input->str, .len = 0};

    guf_utf8_char ch = {0};

    // The input as it was right before the current character was read.
    guf_str_view prev_input = *input;

    for (guf_utf8_stat stat = guf_utf8_char_next(&ch, input); stat != GUF_UTF8_READ_DONE; stat = guf_utf8_char_next(&ch, input)) {
        if (stat != GUF_UTF8_READ_VALID) {
            prev_input = *input;
            continue;
        }

        const int num_bytes = guf_utf8_char_num_bytes(&ch);

        for (ptrdiff_t delim_len = GUF_MIN(max_delim_len, prev_input.len); delim_len > 0; --delim_len) {
            guf_str_view delim_candidate = guf_substr_view(prev_input, 0, delim_len);
            for (ptrdiff_t delim_i = 0; delim_i < num_delims; ++delim_i) {
                if (guf_str_view_equal(&delim_candidate, delims + delim_i)) { // Found delim.
                    bool preserved = false;
                    if (preserved_delims && num_preserved_delims > 0) {
                        for (ptrdiff_t preserved_i = 0; preserved_i < num_preserved_delims; ++preserved_i) {
                            if (guf_str_view_equal(&delim_candidate, preserved_delims + preserved_i)) {
                                preserved = true;
                                break;
                            }
                        }
                    }

                    if (tok.len > 0) {
                        // A token is pending; it ends right in front of the delimiter.
                        if (preserved) {
                            // Leave the delimiter in the input so that the next call returns it.
                            // The saved view is restored instead of stepping input->str back,
                            // because guf_utf8_char_next sets input->str to NULL once the input
                            // is exhausted and arithmetic on that pointer is undefined.
                            *input = prev_input;
                        } else {
                            input->len = prev_input.len - delim_len;
                            input->str = prev_input.str + delim_len;
                            GUF_ASSERT(input->len >= 0);
                        }
                        return tok;
                    }

                    if (preserved) {
                        // No token pending: the delimiter itself is the token. Consume all of
                        // it (not only its first character) so that the next call starts
                        // behind the delimiter.
                        input->len = prev_input.len - delim_len;
                        input->str = input->len > 0 ? prev_input.str + delim_len : NULL;
                        GUF_ASSERT(input->len >= 0);
                        return delim_candidate;
                    }

                    // Leading delimiter: skip it and let the token start behind it.
                    input->len = prev_input.len - delim_len;
                    input->str = prev_input.str + delim_len;
                    GUF_ASSERT(input->len >= 0);
                    tok.str = input->str;
                    goto end;
                }
            }
        }
        tok.len += num_bytes;

        end:;
        prev_input = *input;
    }

    return tok;
}

#endif

#undef GUF_FN_KEYWORDS
#undef GUF_IMPL
#undef GUF_IMPL_STATIC
#undef GUF_STATIC