libguf/src/guf_utf8.h
jun 57f0e47efc Refactor to use portable minimum-width integers.
The signed and unsigned fixed-width integers (int32_t, uint32_t etc.) are optional
in C99 (and above). Use the non-optional minimum-width integers (int_fast32_t, uint_fast32_t and int_least32_t, uint_least32_t etc.) instead.

To simulate unsigned wrap-around, use the GUF_UWRAP macros in guf_common.h

cf. https://en.cppreference.com/w/c/types/integer (last-retrieved: 2025-05-18)
2025-05-18 22:03:03 +02:00

388 lines
13 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
is parametrized: no
NOTE: don't include if you already use guf_str.h
*/
// GUF_UTF8_KWRDS is prepended to every function declaration and definition in this header:
// it expands to `static` when GUF_UTF8_IMPL_STATIC is defined and to nothing otherwise.
// (It is #undef'd again at the very end of the file.)
#if defined(GUF_UTF8_IMPL_STATIC)
#define GUF_UTF8_KWRDS static
#else
#define GUF_UTF8_KWRDS
#endif
#ifndef GUF_UTF8_H
#define GUF_UTF8_H
#include "guf_common.h"
#include "guf_str_view_type.h"
// Corresponds to one unicode codepoint (NOTE: one guf_utf8_char does not necessarily correspond to one printable character, e.g. combining characters).
// The sequence length is not stored; it is derived from the first byte (cf. guf_utf8_char_num_bytes).
typedef struct guf_utf8_char {
char bytes[5]; // Up to 4 bytes of UTF-8; the writers in this file zero the unused slots, so bytes[4] stays '\0'.
} guf_utf8_char;
// Status returned by guf_utf8_char_next.
typedef enum guf_utf8_stat {
GUF_UTF8_READ_DONE, // The view was empty (length <= 0 or NULL pointer); nothing was read.
GUF_UTF8_READ_VALID, // A complete sequence was read and passed guf_utf8_char_is_valid.
GUF_UTF8_READ_INVALID, // The first byte cannot start a sequence, or the completed sequence failed validation.
GUF_UTF8_READ_TRUNCATED, // The view ended before all bytes announced by the first byte could be read.
} guf_utf8_stat;
// True iff c is a 7-bit ASCII value, i.e. lies in the range [0, 127].
// Negative values (e.g. a signed char holding a byte >= 0x80) and values above 127 are rejected.
static inline bool guf_char_is_ascii(int c)
{
    return c >= 0 && c <= 127;
}
// True iff the byte c is a 7-bit ASCII value (0 to 127).
static inline bool guf_uchar_is_ascii(unsigned char c)
{
    return c < 0x80;
}
// True iff c is one of the six ASCII whitespace characters: space, \n, \t, \v, \f or \r.
static inline bool guf_char_isspace_ascii(int c)
{
    switch (c) {
        case ' ':
        case '\n':
        case '\t':
        case '\v':
        case '\f':
        case '\r':
            return true;
        default:
            return false;
    }
}
// Length (1 to 4) of the UTF-8 sequence that starts with byte c, or 0 if c cannot start a sequence.
GUF_UTF8_KWRDS int guf_utf8_num_bytes(unsigned char c);
// Same as guf_utf8_num_bytes, applied to the first byte of c.
GUF_UTF8_KWRDS int guf_utf8_char_num_bytes(const guf_utf8_char *c);
// True iff c holds a well-formed UTF-8 byte sequence (cf. RFC 3629).
GUF_UTF8_KWRDS bool guf_utf8_char_is_valid(const guf_utf8_char *c);
// True iff the sequence in c matches one of the whitespace encodings known to the implementation.
GUF_UTF8_KWRDS bool guf_utf8_char_is_whitespace(const guf_utf8_char *c);
GUF_UTF8_KWRDS guf_utf8_char guf_utf8_char_new(uint_fast32_t codepoint); // Returns GUF_UTF8_REPLACEMENT_CHAR for invalid codepoints (and for GUF_UTF8_REPLACEMENT_CHAR_CODEPOINT).
GUF_UTF8_KWRDS bool guf_utf8_encode(guf_utf8_char *result, uint_fast32_t codepoint); // Returns false for invalid codepoints.
GUF_UTF8_KWRDS int_fast32_t guf_utf8_decode(const guf_utf8_char *utf8); // Returns -1 for invalid utf-8.
// True iff a and b announce the same sequence length and the bytes of that sequence are identical
// (if both first bytes are invalid, the first 4 bytes are compared).
GUF_UTF8_KWRDS bool guf_utf8_equal(const guf_utf8_char *a, const guf_utf8_char *b);
// Reads the next sequence from the front of str into res and advances str past every byte consumed.
GUF_UTF8_KWRDS guf_utf8_stat guf_utf8_char_next(guf_utf8_char *res, guf_str_view *str);
// UTF-8 encodings of 25 whitespace codepoints, each stored as a NUL-terminated string.
extern const char* const GUF_UTF8_WHITESPACE[25];
// 32 common punctuation strings (not exhaustive).
extern const char* const GUF_UTF8_COMMON_PUNCT[32];
extern const guf_utf8_char GUF_UTF8_REPLACEMENT_CHAR; // Replacement character U+FFFD (UTF-8 bytes: EF BF BD)
#define GUF_UTF8_REPLACEMENT_CHAR_CODEPOINT UINT32_C(0xFFFD)
#endif
#if defined(GUF_UTF8_IMPL) || defined(GUF_UTF8_IMPL_STATIC)
#include <string.h>
// All utf-8 whitespace, cf. https://en.wikipedia.org/wiki/Whitespace_character#Unicode (last-retrieved 2025-02-27)
// One entry per codepoint, ordered by encoded length (1-byte, 2-byte, 3-byte).
const char* const GUF_UTF8_WHITESPACE[25] =
{
    // One byte (ASCII).
    " ",            // U+0020 space
    "\n",           // U+000A line feed
    "\t",           // U+0009 horizontal tab
    "\r",           // U+000D carriage return
    "\v",           // U+000B vertical tab
    "\f",           // U+000C form feed
    // Two bytes.
    "\xC2\x85",     // U+0085 next line
    "\xC2\xA0",     // U+00A0 no-break space
    // Three bytes.
    "\xE1\x9A\x80", // U+1680 ogham space mark
    "\xE2\x80\x80", // U+2000 en quad
    "\xE2\x80\x81", // U+2001 em quad
    "\xE2\x80\x82", // U+2002 en space
    "\xE2\x80\x83", // U+2003 em space
    "\xE2\x80\x84", // U+2004 three-per-em space
    "\xE2\x80\x85", // U+2005 four-per-em space
    "\xE2\x80\x86", // U+2006 six-per-em space
    "\xE2\x80\x87", // U+2007 figure space
    "\xE2\x80\x88", // U+2008 punctuation space
    "\xE2\x80\x89", // U+2009 thin space
    "\xE2\x80\x8A", // U+200A hair space
    "\xE2\x80\xA8", // U+2028 line separator
    "\xE2\x80\xA9", // U+2029 paragraph separator
    "\xE2\x80\xAF", // U+202F narrow no-break space
    "\xE2\x81\x9F", // U+205F medium mathematical space
    "\xE3\x80\x80"  // U+3000 ideographic space
};
// Common punctuation (TODO: make more exhaustive; use \x escapes)
// The table mixes ASCII punctuation with non-ASCII characters written as raw string literals,
// so its bytes depend on the encoding this file is saved in.
// NOTE(review): several entries read as empty strings here; presumably they are non-ASCII punctuation
// (e.g. typographic quotes) that did not survive a copy or display step. Verify against the original file.
const char* const GUF_UTF8_COMMON_PUNCT[32] =
{
".", ",", ";", ":", "(", ")", "[", "]", "!", "?", "¿", "¡", "&", "+", "-", "/", "*", "\"", "'", "", "", "´", "»", "«", "`", "\\", "%", "", "", "", "", "_"
};
// U+FFFD (replacement character) as a three-byte UTF-8 sequence EF BF BD; the two unused bytes are '\0'.
const guf_utf8_char GUF_UTF8_REPLACEMENT_CHAR = {.bytes = {'\xEF','\xBF','\xBD', '\0', '\0'}};
// Define GUF_FN_KEYWORDS as empty unless the includer already defined it.
// NOTE(review): nothing in the visible part of this file uses GUF_FN_KEYWORDS (the functions here use
// GUF_UTF8_KWRDS); presumably it is kept for other guf headers. Confirm before removing.
#ifndef GUF_FN_KEYWORDS
#define GUF_FN_KEYWORDS
#endif
/*
    Compare two guf_utf8_chars for equality.
    They are equal iff their first bytes announce the same sequence length and every
    byte of that sequence matches. Bytes past the sequence are ignored.
*/
GUF_UTF8_KWRDS bool guf_utf8_equal(const guf_utf8_char *a, const guf_utf8_char *b)
{
    const int len_a = guf_utf8_char_num_bytes(a);
    const int len_b = guf_utf8_char_num_bytes(b);
    if (len_a != len_b) {
        return false;
    }

    // A length of 0 means both first bytes are invalid; compare all 4 possible sequence bytes then.
    const int cmp_len = (len_a == 0) ? 4 : GUF_CLAMP(len_a, 1, 4);
    return memcmp(a->bytes, b->bytes, (size_t)cmp_len) == 0;
}
// cf. https://datatracker.ietf.org/doc/html/rfc3629#section-3 (last-retrieved 2025-03-02)
/*
    Encode the codepoint cp as UTF-8 into *result; bytes not used by the sequence are set to '\0'.
    Returns true on success.
    Returns false and sets *result to GUF_UTF8_REPLACEMENT_CHAR if cp is above U+10FFFF, is a
    surrogate (U+D800 to U+DFFF), or if the produced sequence fails guf_utf8_char_is_valid.
*/
GUF_UTF8_KWRDS bool guf_utf8_encode(guf_utf8_char *result, uint_fast32_t cp)
{
    GUF_ASSERT(result);

    // "The definition of UTF-8 prohibits encoding character numbers between U+D800 and U+DFFF" (surrogate pairs).
    const bool is_surrogate = (cp >= 0xD800 && cp <= 0xDFFF);
    if (cp > 0x10FFFF || is_surrogate) {
        *result = GUF_UTF8_REPLACEMENT_CHAR;
        return false;
    }

    // Pick the sequence length, the marker bits of the lead byte and how many payload bits the lead byte carries.
    int seq_len = 0;
    int lead_payload_bits = 0;
    unsigned lead_marker = 0;
    if (cp <= 0x7F) {           // binary: 0xxx.xxxx
        seq_len = 1;
        lead_marker = 0x00;
        lead_payload_bits = 7;
    } else if (cp <= 0x7FF) {   // binary: 110x.xxxx 10xx.xxxx
        seq_len = 2;
        lead_marker = 0xC0;
        lead_payload_bits = 5;
    } else if (cp <= 0xFFFF) {  // binary: 1110.xxxx 10xx.xxxx 10xx.xxxx
        seq_len = 3;
        lead_marker = 0xE0;
        lead_payload_bits = 4;
    } else {                    // binary: 1111.0xxx 10xx.xxxx 10xx.xxxx 10xx.xxxx (cp <= 0x10FFFF was checked above)
        seq_len = 4;
        lead_marker = 0xF0;
        lead_payload_bits = 3;
    }

    memset(result->bytes, '\0', GUF_ARR_SIZE(result->bytes));

    // Continuation bytes (binary: 10xx.xxxx) take 6 payload bits each, filled from the last byte backwards.
    const int tail_payload_bits = 6;
    uint_fast32_t rest = cp;
    for (int i = seq_len - 1; i >= 1; --i) {
        result->bytes[i] = (char)(0x80u | (rest & 0x3Fu));
        rest >>= tail_payload_bits;
    }

    // Whatever is left belongs into the lead byte.
    const uint_fast32_t lead_mask = GUF_UWRAP_32( (UINT32_C(1) << lead_payload_bits) - 1 );
    GUF_ASSERT((rest & lead_mask) == rest);
    result->bytes[0] = (char)(lead_marker | (rest & lead_mask));

    if (!guf_utf8_char_is_valid(result)) {
        *result = GUF_UTF8_REPLACEMENT_CHAR;
        return false;
    }
    return true;
}
// Build a guf_utf8_char from a codepoint by value; yields GUF_UTF8_REPLACEMENT_CHAR if the codepoint cannot be encoded.
GUF_UTF8_KWRDS guf_utf8_char guf_utf8_char_new(uint_fast32_t codepoint)
{
    guf_utf8_char encoded = GUF_UTF8_REPLACEMENT_CHAR;
    const bool ok = guf_utf8_encode(&encoded, codepoint);
    return ok ? encoded : GUF_UTF8_REPLACEMENT_CHAR;
}
// cf. https://datatracker.ietf.org/doc/html/rfc3629#section-3 (last-retrieved 2025-03-02)
/*
    Decode the UTF-8 sequence stored in c into its codepoint.
    Returns the codepoint (at most 0x10FFFF and never a surrogate), or -1 if c is not valid UTF-8.
*/
GUF_UTF8_KWRDS int_fast32_t guf_utf8_decode(const guf_utf8_char *c)
{
    if (!guf_utf8_char_is_valid(c)) {
        return -1;
    }

    const int seq_len = guf_utf8_char_num_bytes(c);
    if (seq_len < 1 || seq_len > 4) {
        return -1;
    }

    // Payload mask of the lead byte, indexed by sequence length (index 0 is unused):
    // 0xxx.xxxx -> 7 bits, 110x.xxxx -> 5 bits, 1110.xxxx -> 4 bits, 1111.0xxx -> 3 bits.
    static const unsigned char lead_payload_mask[5] = {0x00, 0x7F, 0x1F, 0x0F, 0x07};
    const unsigned char tail_payload_mask = 0x3F; // binary: 10xx.xxxx
    const int tail_payload_bits = 6;

    // Accumulate front to back: every continuation byte shifts in 6 more payload bits.
    const unsigned char *raw = (const unsigned char*)c->bytes;
    uint_fast32_t cp = (uint_fast32_t)(raw[0] & lead_payload_mask[seq_len]);
    for (int i = 1; i < seq_len; ++i) {
        cp = GUF_UWRAP_32( (cp << tail_payload_bits) | (uint_fast32_t)(raw[i] & tail_payload_mask) );
    }

    const bool is_surrogate = (cp >= 0xD800 && cp <= 0xDFFF);
    if (cp > 0x10FFFF || is_surrogate) {
        return -1;
    }

    #ifdef INT32_MAX
    GUF_ASSERT(cp <= INT32_MAX);
    #endif
    GUF_ASSERT(cp <= INT_FAST32_MAX);
    return (int_fast32_t)cp;
}
/*
    Read the next UTF-8 sequence from the front of str into res and advance str past every byte consumed.
    When the last byte of the view is consumed, str->str is set to NULL.

    Returns:
        GUF_UTF8_READ_DONE       the view is empty (nothing is read and res is left untouched)
        GUF_UTF8_READ_VALID      res holds a valid sequence
        GUF_UTF8_READ_INVALID    the first byte cannot start a sequence (one byte consumed), or the
                                 completed sequence is not valid UTF-8 (all of its bytes consumed)
        GUF_UTF8_READ_TRUNCATED  the view ended before the sequence was complete
*/
GUF_UTF8_KWRDS guf_utf8_stat guf_utf8_char_next(guf_utf8_char *res, guf_str_view *str)
{
    GUF_ASSERT(res);
    GUF_ASSERT(str);

    if (str->str == NULL || str->len <= 0) {
        return GUF_UTF8_READ_DONE;
    }

    // First byte: store it in an otherwise zeroed res and step the view forward.
    const char first = str->str[0];
    memset(res->bytes, '\0', GUF_ARR_SIZE(res->bytes));
    res->bytes[0] = first;
    str->len -= 1;
    str->str = (str->len != 0) ? str->str + 1 : NULL;

    const int expected = guf_utf8_char_num_bytes(res);
    if (expected == 0) {
        return GUF_UTF8_READ_INVALID;
    }

    // Remaining bytes: taken as they come (they are only validated once the sequence is complete).
    int taken = 1;
    for (; taken < expected && str->len > 0; ++taken) {
        res->bytes[taken] = str->str[0];
        str->len -= 1;
        str->str = (str->len != 0) ? str->str + 1 : NULL;
    }

    if (taken < expected) {
        return GUF_UTF8_READ_TRUNCATED;
    }
    if (guf_utf8_char_is_valid(res)) {
        return GUF_UTF8_READ_VALID;
    }
    // TODO: this means str will point one past the last read character (maybe it would be better to skip to one past the first?)
    return GUF_UTF8_READ_INVALID;
}
// cf. https://www.rfc-editor.org/rfc/rfc3629#page-4
/*
    Length of the UTF-8 sequence that starts with byte c:
    1 to 4 for a possible first byte, 0 if c can never start a sequence.
*/
GUF_UTF8_KWRDS int guf_utf8_num_bytes(unsigned char c)
{
    if (c < 0x80) { // bits: 0xxx.xxxx
        return 1;
    }
    if (c < 0xC2) { // 0x80 to 0xBF are continuation bytes; 0xC0 and 0xC1 never appear.
        return 0;
    }
    if (c < 0xE0) { // bits: 110x.xxxx (0xC2 to 0xDF)
        return 2;
    }
    if (c < 0xF0) { // bits: 1110.xxxx (0xE0 to 0xEF)
        return 3;
    }
    if (c < 0xF5) { // bits: 1111.0xxx (0xF0 to 0xF4)
        return 4;
    }
    return 0; // 0xF5 to 0xFF never appear.
}
// Length (1 to 4) of the sequence announced by the first byte of c, or 0 if that byte cannot start a sequence.
GUF_UTF8_KWRDS int guf_utf8_char_num_bytes(const guf_utf8_char *c)
{
    GUF_ASSERT(c);
    const unsigned char first = (unsigned char)c->bytes[0];
    return guf_utf8_num_bytes(first);
}
// cf. https://datatracker.ietf.org/doc/html/rfc3629#page-5
/*
    True iff c holds a well-formed UTF-8 byte sequence.
    The first byte fixes the length. The second byte has a narrower range after E0, ED, F0 and F4,
    which excludes overlong forms, surrogates and values above U+10FFFF.
    All other continuation bytes must be 0x80 to 0xBF.
*/
GUF_UTF8_KWRDS bool guf_utf8_char_is_valid(const guf_utf8_char *c)
{
    const int seq_len = guf_utf8_num_bytes(c->bytes[0]);
    if (seq_len == 0) {
        return false;
    }
    if (seq_len == 1) {
        return true;
    }

    const unsigned char *b = (const unsigned char*)c->bytes; // It's important to cast to unsigned char* here!

    // Allowed range of the second byte; the default is an ordinary continuation byte (binary: 10xx.xxxx).
    unsigned char second_min = 0x80;
    unsigned char second_max = 0xBF;
    switch (b[0])
    {
        case 0xE0:
            second_min = 0xA0;
            break;
        case 0xED:
            second_max = 0x9F;
            break;
        case 0xF0:
            second_min = 0x90;
            break;
        case 0xF4:
            second_max = 0x8F;
            break;
        default:
            break;
    }
    if (b[1] < second_min || b[1] > second_max) {
        return false;
    }

    // "The octet values C0, C1, F5 to FF never appear.": none of them lies within 0x80 to 0xBF.
    for (int i = 2; i < seq_len; ++i) {
        if (b[i] < 0x80 || b[i] > 0xBF) {
            return false;
        }
    }
    return true;
}
/*
    True iff c encodes one of the whitespace codepoints listed below (the same 25 codepoints as GUF_UTF8_WHITESPACE).
    Only the bytes of the sequence announced by the first byte are compared; c is not validated otherwise.
    Four-byte sequences and invalid first bytes are never whitespace.
*/
GUF_UTF8_KWRDS bool guf_utf8_char_is_whitespace(const guf_utf8_char *c)
{
    GUF_ASSERT(c);
    // cf. https://en.wikipedia.org/wiki/Whitespace_character#Unicode (last-retrieved 2025-02-27)
    // Keep these tables in sync with GUF_UTF8_WHITESPACE; they are split by length so only candidates of the right size are compared.
    static const char *const ws_one_byte[] = {" ", "\n", "\t", "\r", "\v", "\f"};
    static const char *const ws_two_bytes[] = {"\xC2\x85", "\xC2\xA0"};
    static const char *const ws_three_bytes[] = {"\xE1\x9A\x80", "\xE2\x80\x80", "\xE2\x80\x81", "\xE2\x80\x82", "\xE2\x80\x83", "\xE2\x80\x84", "\xE2\x80\x85", "\xE2\x80\x86", "\xE2\x80\x87", "\xE2\x80\x88", "\xE2\x80\x89", "\xE2\x80\x8A", "\xE2\x80\xA8", "\xE2\x80\xA9", "\xE2\x80\xAF", "\xE2\x81\x9F", "\xE3\x80\x80"};

    const int num_bytes = guf_utf8_num_bytes(c->bytes[0]);
    switch (num_bytes)
    {
        case 1:
            for (size_t i = 0; i < GUF_ARR_SIZE(ws_one_byte); ++i) {
                if (c->bytes[0] == ws_one_byte[i][0]) {
                    return true;
                }
            }
            return false;

        case 2:
            for (size_t i = 0; i < GUF_ARR_SIZE(ws_two_bytes); ++i) {
                if (c->bytes[0] == ws_two_bytes[i][0] && c->bytes[1] == ws_two_bytes[i][1]) {
                    return true;
                }
            }
            return false;

        case 3:
            for (size_t i = 0; i < GUF_ARR_SIZE(ws_three_bytes); ++i) {
                if (c->bytes[0] == ws_three_bytes[i][0] && c->bytes[1] == ws_three_bytes[i][1] && c->bytes[2] == ws_three_bytes[i][2]) {
                    return true;
                }
            }
            return false;

        default:
            return false;
    }
}
#undef GUF_UTF8_IMPL
#undef GUF_UTF8_IMPL_STATIC
#endif /* end impl */
#undef GUF_UTF8_KWRDS