Make guf_utf8_char 4 bytes instead of 5

(The fifth byte only held a null terminator, which was superfluous here: a UTF-8 encoded code point is at most 4 bytes long.)
This commit is contained in:
jun 2025-05-25 16:04:26 +02:00
parent c54fc75221
commit ae83ee66e1
3 changed files with 5 additions and 11 deletions

View File

@ -16,7 +16,7 @@
// Corresponds to one unicode codepoint (NOTE: one guf_utf8_char does not necessarily correspond to one printable character, e.g. combining characters). // Corresponds to one unicode codepoint (NOTE: one guf_utf8_char does not necessarily correspond to one printable character, e.g. combining characters).
typedef struct guf_utf8_char { typedef struct guf_utf8_char {
char bytes[5]; char bytes[4];
} guf_utf8_char; } guf_utf8_char;
typedef enum guf_utf8_stat { typedef enum guf_utf8_stat {
@ -68,7 +68,7 @@ const char* const GUF_UTF8_COMMON_PUNCT[32] =
".", ",", ";", ":", "(", ")", "[", "]", "!", "?", "¿", "¡", "&", "+", "-", "/", "*", "\"", "'", "", "", "´", "»", "«", "`", "\\", "%", "", "", "", "", "_" ".", ",", ";", ":", "(", ")", "[", "]", "!", "?", "¿", "¡", "&", "+", "-", "/", "*", "\"", "'", "", "", "´", "»", "«", "`", "\\", "%", "", "", "", "", "_"
}; };
const guf_utf8_char GUF_UTF8_REPLACEMENT_CHAR = {.bytes = {'\xEF','\xBF','\xBD', '\0', '\0'}}; const guf_utf8_char GUF_UTF8_REPLACEMENT_CHAR = {.bytes = {'\xEF','\xBF','\xBD', '\0'}};
#ifndef GUF_FN_KEYWORDS #ifndef GUF_FN_KEYWORDS
#define GUF_FN_KEYWORDS #define GUF_FN_KEYWORDS

View File

@ -317,35 +317,31 @@ void UTF8Test::encode_decode()
TEST_CHECK(guf_utf8_encode(&utf8, 0x1F308)); // "🌈" (Rainbow) TEST_CHECK(guf_utf8_encode(&utf8, 0x1F308)); // "🌈" (Rainbow)
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 4); TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 4);
TEST_CHECK(utf8.bytes[0] == '\xF0' && utf8.bytes[1] == '\x9F' && utf8.bytes[2] == '\x8C' && utf8.bytes[3] == '\x88'); TEST_CHECK(utf8.bytes[0] == '\xF0' && utf8.bytes[1] == '\x9F' && utf8.bytes[2] == '\x8C' && utf8.bytes[3] == '\x88');
TEST_CHECK(utf8.bytes[4] == '\0');
TEST_CHECK(guf_utf8_decode(&utf8) == 0x1F308); TEST_CHECK(guf_utf8_decode(&utf8) == 0x1F308);
TEST_CHECK(guf_utf8_encode(&utf8, 0x130B8)); // "𓂸" (Egyptian Hieroglyph D052) TEST_CHECK(guf_utf8_encode(&utf8, 0x130B8)); // "𓂸" (Egyptian Hieroglyph D052)
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 4); TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 4);
TEST_CHECK(utf8.bytes[0] == '\xF0' && utf8.bytes[1] == '\x93' && utf8.bytes[2] == '\x82' && utf8.bytes[3] == '\xB8'); TEST_CHECK(utf8.bytes[0] == '\xF0' && utf8.bytes[1] == '\x93' && utf8.bytes[2] == '\x82' && utf8.bytes[3] == '\xB8');
TEST_CHECK(utf8.bytes[4] == '\0');
TEST_CHECK(guf_utf8_decode(&utf8) == 0x130B8); TEST_CHECK(guf_utf8_decode(&utf8) == 0x130B8);
TEST_CHECK(guf_utf8_encode(&utf8, 0x1F97A)); // "🥺" (Face with Pleading Eyes) TEST_CHECK(guf_utf8_encode(&utf8, 0x1F97A)); // "🥺" (Face with Pleading Eyes)
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 4); TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 4);
TEST_CHECK(utf8.bytes[0] == '\xF0' && utf8.bytes[1] == '\x9F' && utf8.bytes[2] == '\xA5' && utf8.bytes[3] == '\xBA'); TEST_CHECK(utf8.bytes[0] == '\xF0' && utf8.bytes[1] == '\x9F' && utf8.bytes[2] == '\xA5' && utf8.bytes[3] == '\xBA');
TEST_CHECK(utf8.bytes[4] == '\0');
TEST_CHECK(guf_utf8_decode(&utf8) == 0x1F97A); TEST_CHECK(guf_utf8_decode(&utf8) == 0x1F97A);
TEST_CHECK(guf_utf8_encode(&utf8, 0x1F980)); // "🦀" (Crab) TEST_CHECK(guf_utf8_encode(&utf8, 0x1F980)); // "🦀" (Crab)
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 4); TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 4);
TEST_CHECK(utf8.bytes[0] == '\xF0' && utf8.bytes[1] == '\x9F' && utf8.bytes[2] == '\xA6' && utf8.bytes[3] == '\x80'); TEST_CHECK(utf8.bytes[0] == '\xF0' && utf8.bytes[1] == '\x9F' && utf8.bytes[2] == '\xA6' && utf8.bytes[3] == '\x80');
TEST_CHECK(utf8.bytes[4] == '\0');
TEST_CHECK(guf_utf8_decode(&utf8) == 0x1F980); TEST_CHECK(guf_utf8_decode(&utf8) == 0x1F980);
// Invalid characters: // Invalid characters:
utf8 = {.bytes = {'\xC0', '\x80', 0, 0, 0}}; utf8 = {.bytes = {'\xC0', '\x80', 0, 0}};
TEST_CHECK(guf_utf8_decode(&utf8) < 0); TEST_CHECK(guf_utf8_decode(&utf8) < 0);
utf8 = {.bytes = {'\xC0', 0, 0, 0, 0}}; utf8 = {.bytes = {'\xC0', 0, 0, 0}};
TEST_CHECK(guf_utf8_decode(&utf8) < 0); TEST_CHECK(guf_utf8_decode(&utf8) < 0);
utf8 = {.bytes = {'\x80', 0, 0, 0, 0}}; utf8 = {.bytes = {'\x80', 0, 0, 0}};
TEST_CHECK(guf_utf8_decode(&utf8) < 0); TEST_CHECK(guf_utf8_decode(&utf8) < 0);
// "The definition of UTF-8 prohibits encoding character numbers between U+D800 and U+DFFF" (surrogate pairs). // "The definition of UTF-8 prohibits encoding character numbers between U+D800 and U+DFFF" (surrogate pairs).

View File

@ -1,7 +1,5 @@
- fix readonly str/uninit ? - fix readonly str/uninit ?
- make guf_utf8_char 4 bytes (non-null terminated)
- guf_stack, guf_queue, guf_dqueue, guf_prio_queue (using a heap), guf_ringbuf - guf_stack, guf_queue, guf_dqueue, guf_prio_queue (using a heap), guf_ringbuf
- sort: add cpp #ifdef to remove restrict from declaration - sort: add cpp #ifdef to remove restrict from declaration