213 lines
6.4 KiB
C
213 lines
6.4 KiB
C
// Linkage selector: every declaration and definition in this header is prefixed
// with GUF_FN_KEYWORDS. The implementation guard further down and the trailing
// #undef list spell the static-implementation switch GUF_IMPL_STATIC, so that
// spelling is accepted here too; otherwise defining GUF_IMPL_STATIC alone would
// emit the implementation with external linkage. GUF_STATIC_IMPL is kept for
// backward compatibility.
#if defined(GUF_STATIC) || defined(GUF_STATIC_IMPL) || defined(GUF_IMPL_STATIC)
    #define GUF_FN_KEYWORDS static
#else
    #define GUF_FN_KEYWORDS
#endif
|
|
|
|
#ifndef GUF_UTF8_H
|
|
#define GUF_UTF8_H
|
|
#include "guf_common.h"
|
|
#include "guf_str.h"
|
|
|
|
// One UTF-8 encoded character stored inline.
// guf_utf8_num_bytes never reports more than 4 bytes per sequence, and
// guf_utf8_char_next zeroes every slot after the lead byte before it reads
// continuation bytes, so a character filled in by it ends in at least one '\0'.
typedef struct guf_utf8_char {
    char bytes[5];
} guf_utf8_char;
|
|
|
|
// Result of a single guf_utf8_char_next call.
typedef enum guf_utf8_stat {
    GUF_UTF8_READ_DONE,       // The string view was empty (or its pointer NULL); nothing was read.
    GUF_UTF8_READ_VALID,      // A complete sequence was read and passed guf_utf8_char_is_valid.
    GUF_UTF8_READ_INVALID,    // The lead byte cannot start a sequence, or the sequence read is malformed.
    GUF_UTF8_READ_TRUNCATED,  // The string view ended before the sequence was complete.
} guf_utf8_stat;
|
|
|
|
// True if c is a 7-bit ASCII code (0..127). Negative values, e.g. a sign-extended
// high byte read through a signed char, are rejected.
static inline bool guf_char_is_ascii(int c) {return c >= 0 && c <= 127;}
|
|
// True if the byte has its most significant bit clear, i.e. its value lies in 0..127.
static inline bool guf_uchar_is_ascii(unsigned char c) {return (c & 0x80u) == 0;}
|
|
|
|
GUF_FN_KEYWORDS int guf_utf8_num_bytes(unsigned char c);
|
|
GUF_FN_KEYWORDS int guf_utf8_char_num_bytes(guf_utf8_char *c);
|
|
|
|
GUF_FN_KEYWORDS guf_utf8_char guf_utf8_char_new(const char *bytes, int num_bytes);
|
|
GUF_FN_KEYWORDS bool guf_utf8_char_is_valid(const guf_utf8_char *c);
|
|
GUF_FN_KEYWORDS bool guf_utf8_char_is_whitespace(const guf_utf8_char *c);
|
|
|
|
GUF_FN_KEYWORDS guf_utf8_stat guf_utf8_char_next(guf_utf8_char *res, guf_str_view *str);
|
|
|
|
#endif
|
|
|
|
#if defined(GUF_IMPL) || defined(GUF_IMPL_STATIC)
|
|
|
|
#include "guf_common.h"
|
|
#include "guf_assert.h"
|
|
|
|
// Pops the next UTF-8 character off the front of str and stores it in res.
//
// res: receives the bytes read; every slot after the lead byte is zeroed before
//      continuation bytes are copied in.
// str: advanced past each consumed byte; str->str is set to NULL once str->len
//      reaches 0.
//
// Returns:
//   GUF_UTF8_READ_DONE       str has no bytes left or a NULL pointer (res and str untouched).
//   GUF_UTF8_READ_INVALID    the lead byte cannot start a sequence (one byte consumed),
//                            or the assembled sequence fails guf_utf8_char_is_valid.
//   GUF_UTF8_READ_TRUNCATED  str ran out before the sequence was complete.
//   GUF_UTF8_READ_VALID      a complete, well-formed sequence was read.
GUF_FN_KEYWORDS guf_utf8_stat guf_utf8_char_next(guf_utf8_char *res, guf_str_view *str)
{
    GUF_ASSERT_RELEASE(res);
    GUF_ASSERT_RELEASE(str);

    if (str->len <= 0 || str->str == NULL) {
        return GUF_UTF8_READ_DONE;
    }

    // Take the lead byte.
    res->bytes[0] = str->str[0];
    str->len--;
    str->str = str->len ? str->str + 1 : NULL;

    // Clear the remaining slots so shorter sequences are zero-padded.
    for (size_t i = 1; i < GUF_STATIC_BUF_SIZE(res->bytes); ++i) {
        res->bytes[i] = '\0';
    }

    const int num_bytes = guf_utf8_char_num_bytes(res);
    if (!num_bytes) {
        return GUF_UTF8_READ_INVALID;
    }

    // Copy the bytes that follow the lead byte. They are not checked for being
    // continuation bytes here; validation happens afterwards, so a malformed
    // sequence still consumes up to num_bytes bytes of str.
    int consumed = 1;
    for (; consumed < num_bytes && str->len > 0; ++consumed) {
        res->bytes[consumed] = str->str[0];
        str->len--;
        str->str = str->len ? str->str + 1 : NULL;
    }

    if (consumed < num_bytes) {
        return GUF_UTF8_READ_TRUNCATED;
    }
    return guf_utf8_char_is_valid(res) ? GUF_UTF8_READ_VALID : GUF_UTF8_READ_INVALID;
}
|
|
|
|
|
|
// Sequence length implied by a UTF-8 lead byte, cf. https://www.rfc-editor.org/rfc/rfc3629#page-4
// Returns 1..4 for a byte that may start a sequence, and 0 for a byte that may not
// (0x80..0xBF, the never-valid octets 0xC0 and 0xC1, and 0xF5..0xFF).
GUF_FN_KEYWORDS int guf_utf8_num_bytes(unsigned char c)
{
    // The thresholds are checked in ascending order, so each test only needs
    // the upper bound of its range.
    if (c < 0x80) {         // bits: 0xxx.xxxx
        return 1;
    }
    if (c < 0xC2) {         // 0x80..0xBF, plus 0xC0 and 0xC1
        return 0;           // Invalid byte.
    }
    if (c < 0xE0) {         // bits: 110x.xxxx (without 0xC0 and 0xC1)
        return 2;
    }
    if (c < 0xF0) {         // bits: 1110.xxxx
        return 3;
    }
    if (c < 0xF5) {         // bits: 1111.0xxx (without 0xF5 to 0xFF)
        return 4;
    }
    return 0;               // Invalid byte.
}
|
|
|
|
// Sequence length implied by the first byte stored in c: 1..4, or 0 if that byte
// cannot start a UTF-8 sequence. c is checked with GUF_ASSERT.
GUF_FN_KEYWORDS int guf_utf8_char_num_bytes(guf_utf8_char *c)
{
    GUF_ASSERT(c);
    // bytes[] is plain char; the argument is converted to unsigned char by the call.
    return guf_utf8_num_bytes(c->bytes[0]);
}
|
|
|
|
|
|
GUF_FN_KEYWORDS bool guf_utf8_char_is_valid(const guf_utf8_char *c)
|
|
{
|
|
const int num_bytes = guf_utf8_num_bytes(c->bytes[0]);
|
|
|
|
if (!num_bytes) {
|
|
return false;
|
|
}
|
|
|
|
const unsigned char *bytes = (const unsigned char*)c->bytes;
|
|
|
|
for (int i = 0; i < num_bytes; ++i) {
|
|
// "The octet values C0, C1, F5 to FF never appear.", cf. https://www.rfc-editor.org/rfc/rfc3629#page-5
|
|
if (bytes[i] == 0xC0 || bytes[i] == 0xC1 || (bytes[i] >= 0xF5 && bytes[i] <= 0xFF)) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// Binary: 10xx.xxxx
|
|
#define guf_valid_tail(byte) ((byte) >= 0x80 && (byte) <= 0xBF)
|
|
|
|
// cf. https://datatracker.ietf.org/doc/html/rfc3629#page-5
|
|
switch (num_bytes)
|
|
{
|
|
case 1:
|
|
return true;
|
|
|
|
case 2:
|
|
return guf_valid_tail(bytes[1]);
|
|
|
|
case 3:
|
|
if ((bytes[0] == 0xE0) && (bytes[1] >= 0xA0 && bytes[1] <= 0xBF) && guf_valid_tail(bytes[2])) {
|
|
return true;
|
|
}
|
|
if ((bytes[0] >= 0xE1 && bytes[0] <= 0xEC) && guf_valid_tail(bytes[1]) && guf_valid_tail(bytes[2])) {
|
|
return true;
|
|
}
|
|
if ((bytes[0] == 0xED) && (bytes[1] >= 0x80 && bytes[1] <= 0x9F) && guf_valid_tail(bytes[2])) {
|
|
return true;
|
|
}
|
|
if ((bytes[0] >= 0xEE && bytes[0] <= 0xEF) && guf_valid_tail(bytes[1]) && guf_valid_tail(bytes[2])) {
|
|
return true;
|
|
}
|
|
return false;
|
|
|
|
case 4:
|
|
if ((bytes[0] == 0xF0) && (bytes[1] >= 0x90 && bytes[1] <= 0xBF) && guf_valid_tail(bytes[2]) && guf_valid_tail(bytes[3])) {
|
|
return true;
|
|
}
|
|
if ((bytes[0] >= 0xF1 && bytes[0] <= 0xF3) && guf_valid_tail(bytes[1]) && guf_valid_tail(bytes[2]) && guf_valid_tail(bytes[3])) {
|
|
return true;
|
|
}
|
|
if ((bytes[0] == 0xF4) && (bytes[1] >= 0x80 && bytes[1] <= 0x8F) && guf_valid_tail(bytes[2]) && guf_valid_tail(bytes[3])) {
|
|
return true;
|
|
}
|
|
return false;
|
|
|
|
default:
|
|
return false;
|
|
}
|
|
#undef guf_valid_tail
|
|
}
|
|
|
|
GUF_FN_KEYWORDS bool guf_utf8_char_is_whitespace(const guf_utf8_char *c)
|
|
{
|
|
// cf. https://en.wikipedia.org/wiki/Whitespace_character#Unicode (last-retrieved 2025-02-27)
|
|
const char *ws_one_byte[] = {" ", "\n", "\t", "\t", "\v", "\f"};
|
|
const char *ws_two_bytes[] = {"\xC2\x85", "\xC2\xA0"};
|
|
const char *ws_three_bytes[] = {"\xE1\x9A\x80", "\xE2\x80\x80", "\xE2\x80\x81", "\xE2\x80\x82", "\xE2\x80\x83", "\xE2\x80\x84", "\xE2\x80\x85", "\xE2\x80\x86", "\xE2\x80\x87", "\xE2\x80\x88", "\xE2\x80\x89", "\xE2\x80\x8A", "\xE2\x80\xA8", "\xE2\x80\xA9", "\xE2\x80\xAF", "\xE2\x81\x9F", "\xE3\x80\x80"};
|
|
|
|
const int num_bytes = guf_utf8_num_bytes(c->bytes[0]);
|
|
|
|
switch (num_bytes)
|
|
{
|
|
case 1:
|
|
for (size_t i = 0; i < GUF_STATIC_BUF_SIZE(ws_one_byte); ++i) {
|
|
if (c->bytes[0] == ws_one_byte[i][0]) {
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
|
|
case 2:
|
|
for (size_t i = 0; i < GUF_STATIC_BUF_SIZE(ws_two_bytes); ++i) {
|
|
if (c->bytes[0] == ws_two_bytes[i][0] && c->bytes[1] == ws_two_bytes[i][1]) {
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
|
|
case 3:
|
|
for (size_t i = 0; i < GUF_STATIC_BUF_SIZE(ws_three_bytes); ++i) {
|
|
if (c->bytes[0] == ws_three_bytes[i][0] && c->bytes[1] == ws_three_bytes[i][1] && c->bytes[2] == ws_three_bytes[i][2]) {
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
|
|
default:
|
|
return false;
|
|
}
|
|
}
|
|
|
|
#endif
|
|
|
|
// Drop the per-include configuration macros: after this point none of them is
// defined, so each include of this header has to set the ones it wants again.
// NOTE(review): GUF_STATIC_IMPL is tested at the top of this header but is not
// undefined here -- confirm whether that is intentional.
#undef GUF_FN_KEYWORDS
#undef GUF_IMPL
#undef GUF_IMPL_STATIC
#undef GUF_STATIC
|