// libguf/src/guf_utf8.h

// Keywords prepended to the function declarations/definitions marked GUF_UTF8_KWRDS in this file:
// `static` when GUF_UTF8_IMPL_STATIC is defined by the includer, otherwise nothing.
// (The macro is #undef'd again at the very end of this file.)
#if defined(GUF_UTF8_IMPL_STATIC)
    #define GUF_UTF8_KWRDS static
#else
    #define GUF_UTF8_KWRDS
#endif

#ifndef GUF_UTF8_H
#define GUF_UTF8_H
#include "guf_common.h"
#include "guf_str_view_type.h"

// Corresponds to one unicode codepoint (NOTE: one guf_utf8_char does not necessarily correspond to one printable character, e.g. combining characters).
// Holds the raw UTF-8 bytes of that codepoint, starting at bytes[0].
typedef struct guf_utf8_char {
    // guf_utf8_encode and guf_utf8_char_next write at most 4 bytes here and zero all remaining ones,
    // so after either call bytes[4] is '\0' and the buffer can be read as a C string.
    char bytes[5];
} guf_utf8_char;

// Status codes returned by guf_utf8_char_next.
typedef enum guf_utf8_stat {
    GUF_UTF8_READ_DONE,      // Nothing was read: the string view was empty (len <= 0) or its pointer was NULL.
    GUF_UTF8_READ_VALID,     // A complete sequence was read and it passed guf_utf8_char_is_valid.
    GUF_UTF8_READ_INVALID,   // The first byte cannot start a sequence, or the complete sequence failed validation.
    GUF_UTF8_READ_TRUNCATED, // The string view ended before all bytes announced by the first byte could be read.
} guf_utf8_stat;

// Returns true iff c lies in the 7-bit ASCII range [0, 127]. Negative values (e.g. a sign-extended char with the high bit set) are not ASCII.
static inline bool guf_char_is_ascii(int c) {return c >= 0 && c <= 127;}
// Returns true iff the unsigned byte c lies in the 7-bit ASCII range, i.e. is below 0x80.
static inline bool guf_uchar_is_ascii(unsigned char c)
{
    return c < 0x80;
}

// Length (1 to 4) of the UTF-8 sequence introduced by the lead byte c, or 0 if c cannot start a sequence.
GUF_UTF8_KWRDS int guf_utf8_num_bytes(unsigned char c);
// Same as guf_utf8_num_bytes, applied to the first byte of c.
GUF_UTF8_KWRDS int guf_utf8_char_num_bytes(const guf_utf8_char *c);
// True if c starts with a well-formed UTF-8 sequence according to the RFC 3629 byte ranges.
GUF_UTF8_KWRDS bool guf_utf8_char_is_valid(const guf_utf8_char *c);
// True if the leading bytes of c match one of the whitespace encodings tabulated in the implementation.
GUF_UTF8_KWRDS bool guf_utf8_char_is_whitespace(const guf_utf8_char *c);

// Encoding/decoding between codepoints and guf_utf8_char. On failure, guf_utf8_encode also stores GUF_UTF8_REPLACEMENT_CHAR in *result.
GUF_UTF8_KWRDS guf_utf8_char guf_utf8_char_new(uint32_t codepoint);             // Returns GUF_UTF8_REPLACEMENT_CHAR for invalid codepoints (and for GUF_UTF8_REPLACEMENT_CHAR_CODEPOINT).
GUF_UTF8_KWRDS bool guf_utf8_encode(guf_utf8_char *result, uint32_t codepoint); // Returns false for invalid codepoints.
GUF_UTF8_KWRDS int32_t guf_utf8_decode(const guf_utf8_char *utf8);              // Returns -1 for invalid utf-8.

// True if a and b announce the same sequence length and their bytes are identical over that length
// (if the first byte of both is not a valid lead byte, the first 4 bytes are compared).
GUF_UTF8_KWRDS bool guf_utf8_equal(const guf_utf8_char *a, const guf_utf8_char *b);

// Reads the next UTF-8 character from the front of *str into *res and advances *str past every byte it consumed.
// See guf_utf8_stat for the meaning of the return value.
GUF_UTF8_KWRDS guf_utf8_stat guf_utf8_char_next(guf_utf8_char *res, guf_str_view *str);

// Tables of UTF-8 encoded strings (one character per entry); defined in the implementation section of this file.
extern const char* const GUF_UTF8_WHITESPACE[25];
extern const char* const GUF_UTF8_COMMON_PUNCT[32];

extern const guf_utf8_char GUF_UTF8_REPLACEMENT_CHAR; // Replacement character U+FFFD, stored as the UTF-8 bytes EF BF BD
#define GUF_UTF8_REPLACEMENT_CHAR_CODEPOINT UINT32_C(0xFFFD)

#endif

#if defined(GUF_UTF8_IMPL) || defined(GUF_UTF8_IMPL_STATIC)
#include <string.h>

// All Unicode whitespace characters, UTF-8 encoded, cf. https://en.wikipedia.org/wiki/Whitespace_character#Unicode (last-retrieved 2025-02-27)
const char* const GUF_UTF8_WHITESPACE[25] =
{
    // One byte: U+0020 (space), U+000A (LF), U+0009 (tab), U+000D (CR), U+000B (VT), U+000C (FF)
    " ", "\n", "\t", "\r", "\v", "\f",
    // Two bytes: U+0085, U+00A0
    "\xC2\x85", "\xC2\xA0",
    // Three bytes: U+1680, U+2000 to U+200A, U+2028, U+2029, U+202F, U+205F, U+3000
    "\xE1\x9A\x80", "\xE2\x80\x80", "\xE2\x80\x81", "\xE2\x80\x82", "\xE2\x80\x83", "\xE2\x80\x84", "\xE2\x80\x85", "\xE2\x80\x86", "\xE2\x80\x87", "\xE2\x80\x88", "\xE2\x80\x89", "\xE2\x80\x8A", "\xE2\x80\xA8", "\xE2\x80\xA9", "\xE2\x80\xAF", "\xE2\x81\x9F", "\xE3\x80\x80"
};

// Common punctuation (TODO: make more exhaustive; use \x escapes)
// Each entry is one punctuation character as a NUL-terminated string; the non-ASCII entries are written as literal characters,
// so their byte values depend on this source file's encoding (presumably UTF-8, given the table's name).
// NOTE(review): the last two dash entries before "_" are identical as written here; one of them may have been meant to be a different dash character (confirm).
const char* const GUF_UTF8_COMMON_PUNCT[32] =
{
    ".", ",", ";", ":", "(", ")", "[", "]", "!", "?", "¿", "¡", "&", "+", "-", "/", "*", "\"", "'", "„", "“", "´", "»", "«", "`", "\\", "%", "‒", "–", "—", "—", "_"
};

const guf_utf8_char GUF_UTF8_REPLACEMENT_CHAR  = {.bytes = {'\xEF','\xBF','\xBD', '\0', '\0'}};

// Fallback: make sure GUF_FN_KEYWORDS is defined (as empty) from this point on.
// NOTE(review): nothing in the visible part of this file references GUF_FN_KEYWORDS (the functions here use GUF_UTF8_KWRDS);
// possibly a leftover, confirm before removing.
#ifndef GUF_FN_KEYWORDS
    #define GUF_FN_KEYWORDS
#endif

// Compares two characters byte-wise.
// Returns true if both announce the same sequence length (derived from their first byte) and all bytes
// within that length are identical. If both first bytes are invalid lead bytes (length 0), the first
// four bytes of each buffer are compared instead.
GUF_UTF8_KWRDS bool guf_utf8_equal(const guf_utf8_char *a, const guf_utf8_char *b)
{
    const int len_a = guf_utf8_char_num_bytes(a);
    if (len_a != guf_utf8_char_num_bytes(b)) {
        return false;
    }

    // Number of bytes to compare: the announced length, or the full 4 payload bytes for invalid lead bytes.
    int cmp_len = 4;
    if (len_a != 0) {
        cmp_len = GUF_CLAMP(len_a, 1, 4);
    }

    return memcmp(a->bytes, b->bytes, (size_t)cmp_len) == 0;
}

// cf. https://datatracker.ietf.org/doc/html/rfc3629#section-3 (last-retrieved 2025-03-02)
// Encodes the codepoint cp as UTF-8 into *result (unused bytes of the buffer are zeroed).
// Returns true on success. For codepoints that cannot be encoded (above U+10FFFF or in the surrogate
// range U+D800 to U+DFFF), *result is set to GUF_UTF8_REPLACEMENT_CHAR and false is returned.
GUF_UTF8_KWRDS bool guf_utf8_encode(guf_utf8_char *result, uint32_t cp)
{
    GUF_ASSERT(result);

    // "The definition of UTF-8 prohibits encoding character numbers between U+D800 and U+DFFF" (surrogate pairs).
    const bool is_surrogate = (cp >= 0xD800 && cp <= 0xDFFF);
    if (is_surrogate || cp > 0x10FFFF) {
        *result = GUF_UTF8_REPLACEMENT_CHAR;
        return false;
    }

    // Pick sequence length, the fixed high bits of the lead byte and how many payload bits the lead byte carries.
    int seq_len = 0, lead_payload_bits = 0;
    unsigned char lead_prefix = 0;
    if (cp <= 0x7F) {           // binary: 0xxx.xxxx
        seq_len = 1;
        lead_prefix = 0x00;
        lead_payload_bits = 7;
    } else if (cp <= 0x7FF) {   // binary: 110x.xxxx 10xx.xxxx
        seq_len = 2;
        lead_prefix = 0xC0;
        lead_payload_bits = 5;
    } else if (cp <= 0xFFFF) {  // binary: 1110.xxxx 10xx.xxxx 10xx.xxxx
        seq_len = 3;
        lead_prefix = 0xE0;
        lead_payload_bits = 4;
    } else {                    // binary: 1111.0xxx 10xx.xxxx 10xx.xxxx 10xx.xxxx (cp <= 0x10FFFF was checked above)
        seq_len = 4;
        lead_prefix = 0xF0;
        lead_payload_bits = 3;
    }

    // Start from an all-zero buffer, then lay down the marker bits of every byte.
    memset(result->bytes, '\0', GUF_STATIC_BUF_SIZE(result->bytes));
    result->bytes[0] = (char)lead_prefix;
    for (int i = 1; i < seq_len; ++i) {
        result->bytes[i] = (char)0x80; // binary: 10xx.xxxx
    }

    // Distribute the codepoint's bits from the last byte towards the lead byte; stop early once no bits are left.
    const int tail_payload_bits = 6;
    int bits_written = 0;
    int idx = seq_len - 1;
    while (idx >= 0 && cp > 0) {
        const int width = (idx > 0) ? tail_payload_bits : lead_payload_bits;
        const uint32_t low_bits = cp & ((UINT32_C(1) << width) - 1);
        result->bytes[idx] = (char)((unsigned char)result->bytes[idx] | low_bits);
        cp >>= width;
        bits_written += width;
        --idx;
    }
    GUF_ASSERT(bits_written <= lead_payload_bits + (seq_len - 1) * tail_payload_bits);
    GUF_ASSERT(bits_written <= 21);
    (void)bits_written;

    // Final sanity check on the produced sequence.
    if (!guf_utf8_char_is_valid(result)) {
        *result = GUF_UTF8_REPLACEMENT_CHAR;
        return false;
    }
    return true;
}

// Convenience wrapper around guf_utf8_encode that returns the encoded character by value.
// For codepoints guf_utf8_encode rejects, the returned character is GUF_UTF8_REPLACEMENT_CHAR.
GUF_UTF8_KWRDS guf_utf8_char guf_utf8_char_new(uint32_t codepoint)
{
    guf_utf8_char encoded = GUF_UTF8_REPLACEMENT_CHAR;
    // The boolean result is deliberately ignored: on failure guf_utf8_encode itself stores the replacement character.
    (void)guf_utf8_encode(&encoded, codepoint);
    return encoded;
}

// cf. https://datatracker.ietf.org/doc/html/rfc3629#section-3 (last-retrieved 2025-03-02)
// Decodes the UTF-8 sequence stored in *c and returns its codepoint.
// Returns -1 if the sequence does not pass guf_utf8_char_is_valid or decodes to a value
// above U+10FFFF or inside the surrogate range U+D800 to U+DFFF.
GUF_UTF8_KWRDS int32_t guf_utf8_decode(const guf_utf8_char *c)
{
    if (!guf_utf8_char_is_valid(c)) {
        return -1;
    }

    // Payload bits carried by the lead byte, indexed by sequence length (index 0 is unused):
    //   1: 0xxx.xxxx   2: 110x.xxxx   3: 1110.xxxx   4: 1111.0xxx
    static const int lead_payload_bits[5] = {0, 7, 5, 4, 3};
    const int tail_payload_bits = 6; // binary: 10xx.xxxx

    const int seq_len = guf_utf8_char_num_bytes(c);
    if (seq_len < 1 || seq_len > 4) {
        return -1;
    }

    // Accumulate from the lead byte to the last tail byte, shifting previous bits up by 6 for every tail byte.
    const unsigned char *raw = (const unsigned char*)c->bytes;
    int total_bits = lead_payload_bits[seq_len];
    uint32_t cp = raw[0] & ((UINT32_C(1) << total_bits) - 1);
    for (int i = 1; i < seq_len; ++i) {
        cp = (cp << tail_payload_bits) | (raw[i] & UINT32_C(0x3F));
        total_bits += tail_payload_bits;
    }
    GUF_ASSERT(total_bits == lead_payload_bits[seq_len] + (seq_len - 1) * tail_payload_bits);
    GUF_ASSERT(total_bits <= 21);
    (void)total_bits;

    const bool is_surrogate = (cp >= 0xD800 && cp <= 0xDFFF);
    if (cp > 0x10FFFF || is_surrogate) {
        return -1;
    }

    GUF_ASSERT(cp <= INT32_MAX);
    return (int32_t)cp;
}


// Reads one UTF-8 character from the front of *str into *res and advances *str past the consumed bytes
// (str->str becomes NULL once str->len reaches 0).
// Returns:
//   GUF_UTF8_READ_DONE      if *str is empty (len <= 0 or NULL pointer); nothing is read or written.
//   GUF_UTF8_READ_INVALID   if the first byte cannot start a sequence (only that byte is consumed),
//                           or if the complete sequence fails guf_utf8_char_is_valid (all its bytes are consumed).
//   GUF_UTF8_READ_TRUNCATED if *str ends before the announced number of bytes could be read.
//   GUF_UTF8_READ_VALID     otherwise.
GUF_UTF8_KWRDS guf_utf8_stat guf_utf8_char_next(guf_utf8_char *res, guf_str_view *str)
{
    GUF_ASSERT(res);
    GUF_ASSERT(str);

    const bool exhausted = (str->len <= 0) || (str->str == NULL);
    if (exhausted) {
        return GUF_UTF8_READ_DONE;
    }

    // Take the lead byte and step the view forward by one.
    res->bytes[0] = str->str[0];
    str->len -= 1;
    str->str = (str->len != 0) ? (str->str + 1) : NULL;

    // Clear everything after the lead byte so stale data never leaks into the result.
    size_t k = 1;
    while (k < GUF_STATIC_BUF_SIZE(res->bytes)) {
        res->bytes[k] = '\0';
        ++k;
    }

    const int expected = guf_utf8_char_num_bytes(res);
    if (expected == 0) {
        return GUF_UTF8_READ_INVALID;
    }

    // Pull in the remaining bytes announced by the lead byte, as far as the view allows.
    int filled = 1;
    for (; filled < expected && str->len > 0; ++filled) {
        res->bytes[filled] = str->str[0];
        str->len -= 1;
        str->str = (str->len != 0) ? (str->str + 1) : NULL;
    }

    if (filled < expected) {
        return GUF_UTF8_READ_TRUNCATED;
    }
    if (guf_utf8_char_is_valid(res)) {
        return GUF_UTF8_READ_VALID;
    }
    // TODO: this means str will point one past the last read character (maybe it would be better to skip to one past the first?)
    return GUF_UTF8_READ_INVALID;
}

// cf. https://www.rfc-editor.org/rfc/rfc3629#page-4
// Returns the total length (1 to 4) of the UTF-8 sequence introduced by the lead byte c,
// or 0 if c can never be the first byte of a sequence.
GUF_UTF8_KWRDS int guf_utf8_num_bytes(unsigned char c)
{
    if (c < 0x80) {
        return 1; // bits: 0xxx.xxxx
    }
    if (c < 0xC2) {
        return 0; // 0x80 to 0xBF are tail bytes; 0xC0 and 0xC1 never appear.
    }
    if (c < 0xE0) {
        return 2; // bits: 110x.xxxx
    }
    if (c < 0xF0) {
        return 3; // bits: 1110.xxxx
    }
    if (c < 0xF5) {
        return 4; // bits: 1111.0xxx (0xF0 to 0xF4 only)
    }
    return 0;     // 0xF5 to 0xFF never appear.
}

// Returns the sequence length announced by the first byte of *c (see guf_utf8_num_bytes); 0 if that byte is not a valid lead byte.
GUF_UTF8_KWRDS int guf_utf8_char_num_bytes(const guf_utf8_char *c)
{
    GUF_ASSERT(c);
    const unsigned char lead = (unsigned char)c->bytes[0];
    return guf_utf8_num_bytes(lead);
}


// Checks whether *c starts with a well-formed UTF-8 sequence, cf. https://datatracker.ietf.org/doc/html/rfc3629#page-5
// The length is taken from the first byte; only that many bytes are inspected.
// "The octet values C0, C1, F5 to FF never appear.": as lead bytes they yield length 0, and as
// following bytes they lie outside every accepted range below.
GUF_UTF8_KWRDS bool guf_utf8_char_is_valid(const guf_utf8_char *c)
{
    // It's important to look at the bytes as unsigned char: plain char may be signed, which would break the range checks.
    const unsigned char *seq = (const unsigned char*)c->bytes;

    const int seq_len = guf_utf8_num_bytes(seq[0]);
    if (seq_len == 0) {
        return false; // Not a legal lead byte.
    }
    if (seq_len == 1) {
        return true;  // Plain ASCII.
    }

    // Allowed range for the second byte. It is the generic tail range 10xx.xxxx (0x80 to 0xBF),
    // narrowed for the four lead bytes where RFC 3629 restricts it.
    unsigned char second_min = 0x80;
    unsigned char second_max = 0xBF;
    switch (seq[0])
    {
    case 0xE0:
        second_min = 0xA0; // E0 80..9F would be an overlong 3-byte form.
        break;
    case 0xED:
        second_max = 0x9F; // ED A0..BF would encode the surrogates U+D800 to U+DFFF.
        break;
    case 0xF0:
        second_min = 0x90; // F0 80..8F would be an overlong 4-byte form.
        break;
    case 0xF4:
        second_max = 0x8F; // F4 90..BF would lie above U+10FFFF.
        break;
    default:
        break;
    }

    if (seq[1] < second_min || seq[1] > second_max) {
        return false;
    }

    // All further bytes must be ordinary tail bytes.
    for (int i = 2; i < seq_len; ++i) {
        if (seq[i] < 0x80 || seq[i] > 0xBF) {
            return false;
        }
    }
    return true;
}

// Returns true if *c holds the UTF-8 encoding of one of the Unicode whitespace characters listed below.
// The sequence length is taken from the first byte; only that many bytes are compared, and *c is not
// validated beyond that. Sequences of length 0 (invalid lead byte) or 4 are never whitespace.
GUF_UTF8_KWRDS bool guf_utf8_char_is_whitespace(const guf_utf8_char *c)
{
    GUF_ASSERT(c);

    // cf. https://en.wikipedia.org/wiki/Whitespace_character#Unicode (last-retrieved 2025-02-27)
    // One byte: space, LF, tab, CR, VT, FF.
    static const char *const ws_one_byte[]    = {" ", "\n", "\t", "\r", "\v", "\f"};
    // Two bytes: U+0085, U+00A0.
    static const char *const ws_two_bytes[]   = {"\xC2\x85", "\xC2\xA0"};
    // Three bytes: U+1680, U+2000 to U+200A, U+2028, U+2029, U+202F, U+205F, U+3000.
    static const char *const ws_three_bytes[] = {"\xE1\x9A\x80", "\xE2\x80\x80", "\xE2\x80\x81", "\xE2\x80\x82", "\xE2\x80\x83", "\xE2\x80\x84", "\xE2\x80\x85", "\xE2\x80\x86", "\xE2\x80\x87", "\xE2\x80\x88", "\xE2\x80\x89", "\xE2\x80\x8A", "\xE2\x80\xA8", "\xE2\x80\xA9", "\xE2\x80\xAF", "\xE2\x81\x9F", "\xE3\x80\x80"};

    const int num_bytes = guf_utf8_num_bytes(c->bytes[0]);

    // Select the table whose entries have the same encoded length as *c.
    const char *const *table = NULL;
    size_t table_len = 0;
    switch (num_bytes)
    {
    case 1:
        table = ws_one_byte;
        table_len = GUF_STATIC_BUF_SIZE(ws_one_byte);
        break;
    case 2:
        table = ws_two_bytes;
        table_len = GUF_STATIC_BUF_SIZE(ws_two_bytes);
        break;
    case 3:
        table = ws_three_bytes;
        table_len = GUF_STATIC_BUF_SIZE(ws_three_bytes);
        break;
    default:
        return false; // No whitespace character is encoded with 4 bytes; 0 means invalid lead byte.
    }

    for (size_t i = 0; i < table_len; ++i) {
        if (memcmp(c->bytes, table[i], (size_t)num_bytes) == 0) {
            return true;
        }
    }
    return false;
}

#undef GUF_UTF8_IMPL
#undef GUF_UTF8_IMPL_STATIC
#endif /* end impl */

#undef GUF_UTF8_KWRDS