/*
 * libguf/src/guf_utf8.h
 *
 * Note: this file contains Unicode characters that might be confused with
 * other characters (some string literals below hold non-ASCII punctuation).
 */

#ifndef GUF_UTF8_H
#define GUF_UTF8_H
#include "guf_common.h"
#include "guf_str.h"
#if defined(GUF_STATIC) || defined(GUF_STATIC_IMPL)
#define GUF_FN_KEYWORDS static
#else
#define GUF_FN_KEYWORDS
#endif
// One UTF-8 encoded character, stored as raw bytes.
// guf_utf8_num_bytes never reports more than 4 bytes per character and
// guf_utf8_char_next zeroes every slot it does not fill, so a character read
// through that function always has '\0' in its last slot.
typedef struct guf_utf8_char guf_utf8_char;
struct guf_utf8_char {
    char bytes[5];
};
// Result of reading one character with guf_utf8_char_next.
typedef enum guf_utf8_stat {
    GUF_UTF8_READ_DONE      = 0, // Nothing left to read (empty or NULL input).
    GUF_UTF8_READ_VALID     = 1, // A complete, well-formed character was read.
    GUF_UTF8_READ_INVALID   = 2, // The bytes read do not form a valid UTF-8 character.
    GUF_UTF8_READ_TRUNCATED = 3, // Input ended before the character was complete.
} guf_utf8_stat;
// Return true if c is an ASCII code, i.e. lies in [0, 127].
// Takes an int so that negative values (e.g. a sign-extended char holding a
// non-ASCII byte) are rejected instead of wrapping around.
static inline bool guf_char_is_ascii(int c)
{
    return c >= 0 && c <= 127;
}
// Return true if the byte c is an ASCII code (strictly below 0x80).
static inline bool guf_uchar_is_ascii(unsigned char c)
{
    return c < 0x80;
}
// Length in bytes (1 to 4) of the UTF-8 sequence introduced by lead byte c,
// or 0 if c cannot start a sequence.
GUF_FN_KEYWORDS int guf_utf8_num_bytes(unsigned char c);
// Same as guf_utf8_num_bytes, applied to the first byte of *c.
GUF_FN_KEYWORDS int guf_utf8_char_num_bytes(guf_utf8_char *c);
// NOTE(review): no definition of guf_utf8_char_new is visible in the
// implementation section of this file; confirm it is defined elsewhere.
GUF_FN_KEYWORDS guf_utf8_char guf_utf8_char_new(const char *bytes, int num_bytes);
// True if *c holds a well-formed UTF-8 sequence (cf. RFC 3629).
GUF_FN_KEYWORDS bool guf_utf8_char_is_valid(const guf_utf8_char *c);
// True if *c matches one of the whitespace encodings known to the implementation.
GUF_FN_KEYWORDS bool guf_utf8_char_is_whitespace(const guf_utf8_char *c);
// Read the next character from the front of *str into *res and advance *str
// past the bytes consumed. The return value tells whether a character was
// read and whether it is valid; GUF_UTF8_READ_DONE means *str was empty.
GUF_FN_KEYWORDS guf_utf8_stat guf_utf8_char_next(guf_utf8_char *res, guf_str_view *str);
// Split the next token off the front of *input, using delims (num_delims
// entries) as separators. Delimiters that also appear in preserved_delims
// (num_preserved_delims entries, may be NULL) are returned as tokens themselves.
GUF_FN_KEYWORDS guf_str_view guf_str_next_tok(guf_str_view *input, const guf_str_view *delims, ptrdiff_t num_delims, const guf_str_view *preserved_delims, ptrdiff_t num_preserved_delims);
// Tables of UTF-8 encoded whitespace and punctuation strings (NUL-terminated).
extern const char* const guf_utf8_whitespace[25];
extern const char* const guf_utf8_punctuation[29];
#endif
#if defined(GUF_IMPL) || defined(GUF_IMPL_STATIC)
#include "guf_common.h"
#include "guf_assert.h"
// UTF-8 encodings of the 25 Unicode whitespace characters, grouped by encoded length:
//   1 byte:  U+0020 space, U+000A LF, U+0009 tab, U+000D CR, U+000B VT, U+000C FF
//   2 bytes: U+0085 (next line), U+00A0 (no-break space)
//   3 bytes: U+1680, U+2000..U+200A, U+2028, U+2029, U+202F, U+205F, U+3000
const char* const guf_utf8_whitespace[25] =
{
    " ", "\n", "\t", "\r", "\v", "\f",
    "\xC2\x85", "\xC2\xA0",
    "\xE1\x9A\x80", "\xE2\x80\x80", "\xE2\x80\x81", "\xE2\x80\x82", "\xE2\x80\x83", "\xE2\x80\x84", "\xE2\x80\x85", "\xE2\x80\x86", "\xE2\x80\x87", "\xE2\x80\x88", "\xE2\x80\x89", "\xE2\x80\x8A", "\xE2\x80\xA8", "\xE2\x80\xA9", "\xE2\x80\xAF", "\xE2\x81\x9F", "\xE3\x80\x80"
};
// Punctuation strings (UTF-8 encoded, NUL-terminated); 29 entries.
// NOTE(review): six entries below are empty strings. They presumably held
// non-ASCII punctuation (e.g. typographic quotes) that was lost when this
// copy of the file was produced -- restore them from the upstream source.
const char* const guf_utf8_punctuation[29] =
{
".", ",", ";", ":", "(", ")", "[", "]", "!", "?", "¿", "¡", "&", "+", "-", "/", "*", "\"", "'", "", "", "´", "`", "\\", "%", "", "", "", ""
};
#ifndef GUF_FN_KEYWORDS
#define GUF_FN_KEYWORDS
#endif
// Read one UTF-8 character from the front of *str into *res.
//
// *str is advanced past every byte that was consumed; once its length reaches
// zero its pointer is set to NULL. All bytes of *res that are not filled are
// set to '\0'.
//
// Returns:
//   GUF_UTF8_READ_DONE       *str was empty (len <= 0 or NULL pointer); nothing consumed.
//   GUF_UTF8_READ_VALID      a complete, valid character was stored in *res.
//   GUF_UTF8_READ_INVALID    the lead byte cannot start a character (one byte consumed),
//                            or the complete sequence failed validation.
//   GUF_UTF8_READ_TRUNCATED  the input ended before the sequence was complete.
GUF_FN_KEYWORDS guf_utf8_stat guf_utf8_char_next(guf_utf8_char *res, guf_str_view *str)
{
    GUF_ASSERT_RELEASE(res);
    GUF_ASSERT_RELEASE(str);

    if (str->str == NULL || str->len <= 0) {
        return GUF_UTF8_READ_DONE;
    }

    // The lead byte is always consumed, even if it turns out to be invalid.
    const char lead = str->str[0];
    str->len -= 1;
    str->str = (str->len != 0) ? str->str + 1 : NULL;

    // Store the lead byte and clear every other slot.
    res->bytes[0] = lead;
    for (size_t slot = 1; slot < GUF_STATIC_BUF_SIZE(res->bytes); ++slot) {
        res->bytes[slot] = '\0';
    }

    const int expected = guf_utf8_char_num_bytes(res);
    if (expected == 0) {
        return GUF_UTF8_READ_INVALID;
    }

    // Pull in the remaining bytes of the sequence, as far as the input allows.
    int have = 1;
    for (; have < expected && str->len > 0; ++have) {
        res->bytes[have] = str->str[0];
        str->len -= 1;
        str->str = (str->len != 0) ? str->str + 1 : NULL;
    }

    if (have < expected) {
        return GUF_UTF8_READ_TRUNCATED;
    }
    return guf_utf8_char_is_valid(res) ? GUF_UTF8_READ_VALID : GUF_UTF8_READ_INVALID;
}
// Classify a byte as the lead byte of a UTF-8 sequence.
// Returns the total sequence length (1 to 4), or 0 if c can never start a
// sequence. cf. https://www.rfc-editor.org/rfc/rfc3629#page-4
GUF_FN_KEYWORDS int guf_utf8_num_bytes(unsigned char c)
{
    if (c < 0x80) { // 0xxx.xxxx: single byte (ASCII)
        return 1;
    }
    if (c < 0xC2) { // 0x80..0xBF are continuation bytes; 0xC0 and 0xC1 never appear
        return 0;
    }
    if (c < 0xE0) { // 110x.xxxx
        return 2;
    }
    if (c < 0xF0) { // 1110.xxxx
        return 3;
    }
    if (c < 0xF5) { // 1111.0xxx, limited to 0xF0..0xF4
        return 4;
    }
    return 0; // 0xF5..0xFF never appear
}
// Sequence length implied by the first byte of *c: 1 to 4, or 0 if that byte
// cannot start a UTF-8 sequence. Only the first byte is inspected.
GUF_FN_KEYWORDS int guf_utf8_char_num_bytes(guf_utf8_char *c)
{
    GUF_ASSERT(c);
    const unsigned char lead = (unsigned char)c->bytes[0];
    return guf_utf8_num_bytes(lead);
}
// Check whether *c holds a well-formed UTF-8 sequence.
// cf. https://datatracker.ietf.org/doc/html/rfc3629#page-5
//
// The sequence length comes from the lead byte via guf_utf8_num_bytes. Every
// following byte must be a continuation byte (0x80..0xBF); for some lead bytes
// the second byte is restricted further to rule out overlong encodings,
// surrogates and code points above U+10FFFF.
GUF_FN_KEYWORDS bool guf_utf8_char_is_valid(const guf_utf8_char *c)
{
    const unsigned char *b = (const unsigned char*)c->bytes;
    const int len = guf_utf8_num_bytes(c->bytes[0]);
    if (len < 1 || len > 4) {
        return false;
    }
    if (len == 1) {
        return true;
    }

    // Allowed range for the second byte. guf_utf8_num_bytes only reports a
    // length of 2/3/4 for lead bytes C2..DF / E0..EF / F0..F4, so the lead byte
    // alone decides which row of the RFC 3629 table applies.
    unsigned char lo = 0x80;
    unsigned char hi = 0xBF;
    switch (b[0])
    {
    case 0xE0: // Anything below A0 would be an overlong 3-byte form.
        lo = 0xA0;
        break;
    case 0xED: // Anything above 9F would encode a surrogate (U+D800..U+DFFF).
        hi = 0x9F;
        break;
    case 0xF0: // Anything below 90 would be an overlong 4-byte form.
        lo = 0x90;
        break;
    case 0xF4: // Anything above 8F would exceed U+10FFFF.
        hi = 0x8F;
        break;
    default:
        break;
    }
    if (b[1] < lo || b[1] > hi) {
        return false;
    }

    // Remaining bytes are plain continuation bytes (binary 10xx.xxxx). These
    // range checks also reject the octets C0, C1 and F5..FF, which never
    // appear in valid UTF-8.
    for (int i = 2; i < len; ++i) {
        if (b[i] < 0x80 || b[i] > 0xBF) {
            return false;
        }
    }
    return true;
}
// Return true if *c is one of the Unicode whitespace characters.
// cf. https://en.wikipedia.org/wiki/Whitespace_character#Unicode (last-retrieved 2025-02-27)
//
// The candidates are grouped by encoded length so that only encodings of the
// same length as *c (as implied by its first byte) have to be compared.
// Characters with a 4-byte encoding, or an invalid lead byte, are never whitespace.
GUF_FN_KEYWORDS bool guf_utf8_char_is_whitespace(const guf_utf8_char *c)
{
    // U+0020 space, U+000A LF, U+0009 tab, U+000D CR, U+000B VT, U+000C FF
    static const char *const ws_one_byte[] = {" ", "\n", "\t", "\r", "\v", "\f"};
    // U+0085 (next line), U+00A0 (no-break space)
    static const char *const ws_two_bytes[] = {"\xC2\x85", "\xC2\xA0"};
    // U+1680, U+2000..U+200A, U+2028, U+2029, U+202F, U+205F, U+3000
    static const char *const ws_three_bytes[] = {"\xE1\x9A\x80", "\xE2\x80\x80", "\xE2\x80\x81", "\xE2\x80\x82", "\xE2\x80\x83", "\xE2\x80\x84", "\xE2\x80\x85", "\xE2\x80\x86", "\xE2\x80\x87", "\xE2\x80\x88", "\xE2\x80\x89", "\xE2\x80\x8A", "\xE2\x80\xA8", "\xE2\x80\xA9", "\xE2\x80\xAF", "\xE2\x81\x9F", "\xE3\x80\x80"};

    const int num_bytes = guf_utf8_num_bytes(c->bytes[0]);
    switch (num_bytes)
    {
    case 1:
        for (size_t i = 0; i < GUF_STATIC_BUF_SIZE(ws_one_byte); ++i) {
            if (c->bytes[0] == ws_one_byte[i][0]) {
                return true;
            }
        }
        return false;
    case 2:
        for (size_t i = 0; i < GUF_STATIC_BUF_SIZE(ws_two_bytes); ++i) {
            if (c->bytes[0] == ws_two_bytes[i][0] && c->bytes[1] == ws_two_bytes[i][1]) {
                return true;
            }
        }
        return false;
    case 3:
        for (size_t i = 0; i < GUF_STATIC_BUF_SIZE(ws_three_bytes); ++i) {
            if (c->bytes[0] == ws_three_bytes[i][0] && c->bytes[1] == ws_three_bytes[i][1] && c->bytes[2] == ws_three_bytes[i][2]) {
                return true;
            }
        }
        return false;
    default:
        return false;
    }
}
// Split the next token off the front of *input.
//
// input:            remaining text; advanced past everything that was consumed.
// delims:           num_delims delimiter strings that separate tokens.
// preserved_delims: optional (may be NULL) list of num_preserved_delims strings;
//                   a delimiter that also appears here is not discarded but
//                   returned as a token of its own.
//
// Returns a view into the buffer *input refers to:
//   - {NULL, 0} if *input is already empty on entry;
//   - a preserved delimiter, if one is found at the start of a token;
//   - otherwise the bytes up to (not including) the next delimiter or the end
//     of the input. Leading non-preserved delimiters are skipped; the length
//     may be 0 if nothing but such delimiters was left.
//
// At each position the longest matching delimiter wins. Bytes that do not
// form a valid UTF-8 character are never treated as delimiters; they are kept
// in the token so that the returned view covers every byte between its start
// and the delimiter that ends it.
GUF_FN_KEYWORDS guf_str_view guf_str_next_tok(guf_str_view *input, const guf_str_view *delims, ptrdiff_t num_delims, const guf_str_view *preserved_delims, ptrdiff_t num_preserved_delims)
{
    if (input->len <= 0 || input->str == NULL) {
        return (guf_str_view){.str = NULL, .len = 0};
    }

    // No candidate longer than the longest delimiter can ever match.
    ptrdiff_t max_delim_len = -1;
    for (ptrdiff_t i = 0; i < num_delims; ++i) {
        if (delims[i].len > max_delim_len) {
            max_delim_len = delims[i].len;
        }
    }

    guf_str_view tok = {.str = input->str, .len = 0};
    guf_utf8_char ch = {0};
    guf_str_view prev_input = *input; // *input as it was before the current character was read.

    for (guf_utf8_stat stat = guf_utf8_char_next(&ch, input); stat != GUF_UTF8_READ_DONE; stat = guf_utf8_char_next(&ch, input)) {
        if (stat != GUF_UTF8_READ_VALID) {
            // Invalid or truncated sequence: count the consumed bytes as part
            // of the token, otherwise tok would end before the bytes that follow.
            tok.len += prev_input.len - input->len;
            prev_input = *input;
            continue;
        }
        const int num_bytes = guf_utf8_char_num_bytes(&ch);

        // Look for a delimiter starting at the current character, longest candidate first.
        for (ptrdiff_t delim_len = GUF_MIN(max_delim_len, prev_input.len); delim_len > 0; --delim_len) {
            guf_str_view delim_candidate = guf_substr_view(prev_input, 0, delim_len);
            for (ptrdiff_t delim_i = 0; delim_i < num_delims; ++delim_i) {
                if (!guf_str_view_equal(&delim_candidate, delims + delim_i)) {
                    continue;
                }
                // Found delim.
                bool preserved = false;
                if (preserved_delims && num_preserved_delims > 0) {
                    for (ptrdiff_t preserved_i = 0; preserved_i < num_preserved_delims; ++preserved_i) {
                        if (guf_str_view_equal(&delim_candidate, preserved_delims + preserved_i)) {
                            preserved = true;
                            break;
                        }
                    }
                }

                if (preserved) {
                    if (tok.len > 0) {
                        // The token ends right before the preserved delimiter.
                        // Rewind to the start of the delimiter so that the next
                        // call returns it. Restoring the saved view (instead of
                        // doing arithmetic on input->str) is required because
                        // guf_utf8_char_next sets input->str to NULL once the
                        // input is exhausted.
                        *input = prev_input;
                        return tok;
                    }
                    // Preserved delimiter at the start of a token: consume the
                    // whole delimiter (it may span several characters) and return it.
                    input->len = prev_input.len - delim_len;
                    input->str = input->len > 0 ? prev_input.str + delim_len : NULL;
                    return delim_candidate;
                }

                // Ordinary delimiter: drop it from the input.
                input->len = prev_input.len - delim_len;
                input->str = prev_input.len > 0 ? prev_input.str + delim_len : NULL;
                GUF_ASSERT(input->len >= 0);
                if (tok.len > 0) {
                    return tok;
                }
                // Leading delimiter: the token starts after it; keep scanning.
                tok.str = input->str;
                goto end;
            }
        }
        tok.len += num_bytes;
        end:;
        prev_input = *input;
    }
    return tok;
}
#endif
#undef GUF_FN_KEYWORDS
#undef GUF_IMPL
#undef GUF_IMPL_STATIC
#undef GUF_STATIC