Make guf_utf8_char 4 bytes instead of 5
(Null-termination was superfluous here.)
This commit is contained in:
parent
c54fc75221
commit
ae83ee66e1
@ -16,7 +16,7 @@
|
||||
|
||||
// Corresponds to one Unicode code point (NOTE: one guf_utf8_char does not necessarily correspond to one printable character, e.g. when combining characters are involved).
|
||||
typedef struct guf_utf8_char {
|
||||
char bytes[5];
|
||||
char bytes[4];
|
||||
} guf_utf8_char;
|
||||
|
||||
typedef enum guf_utf8_stat {
|
||||
@ -68,7 +68,7 @@ const char* const GUF_UTF8_COMMON_PUNCT[32] =
|
||||
".", ",", ";", ":", "(", ")", "[", "]", "!", "?", "¿", "¡", "&", "+", "-", "/", "*", "\"", "'", "„", "“", "´", "»", "«", "`", "\\", "%", "‒", "–", "—", "—", "_"
|
||||
};
|
||||
|
||||
const guf_utf8_char GUF_UTF8_REPLACEMENT_CHAR = {.bytes = {'\xEF','\xBF','\xBD', '\0', '\0'}};
|
||||
const guf_utf8_char GUF_UTF8_REPLACEMENT_CHAR = {.bytes = {'\xEF','\xBF','\xBD', '\0'}};
|
||||
|
||||
#ifndef GUF_FN_KEYWORDS
|
||||
#define GUF_FN_KEYWORDS
|
||||
|
||||
@ -317,35 +317,31 @@ void UTF8Test::encode_decode()
|
||||
TEST_CHECK(guf_utf8_encode(&utf8, 0x1F308)); // "🌈" (Rainbow)
|
||||
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 4);
|
||||
TEST_CHECK(utf8.bytes[0] == '\xF0' && utf8.bytes[1] == '\x9F' && utf8.bytes[2] == '\x8C' && utf8.bytes[3] == '\x88');
|
||||
TEST_CHECK(utf8.bytes[4] == '\0');
|
||||
TEST_CHECK(guf_utf8_decode(&utf8) == 0x1F308);
|
||||
|
||||
TEST_CHECK(guf_utf8_encode(&utf8, 0x130B8)); // "𓂸" (Egyptian Hieroglyph D052)
|
||||
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 4);
|
||||
TEST_CHECK(utf8.bytes[0] == '\xF0' && utf8.bytes[1] == '\x93' && utf8.bytes[2] == '\x82' && utf8.bytes[3] == '\xB8');
|
||||
TEST_CHECK(utf8.bytes[4] == '\0');
|
||||
TEST_CHECK(guf_utf8_decode(&utf8) == 0x130B8);
|
||||
|
||||
TEST_CHECK(guf_utf8_encode(&utf8, 0x1F97A)); // "🥺" (Face with Pleading Eyes)
|
||||
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 4);
|
||||
TEST_CHECK(utf8.bytes[0] == '\xF0' && utf8.bytes[1] == '\x9F' && utf8.bytes[2] == '\xA5' && utf8.bytes[3] == '\xBA');
|
||||
TEST_CHECK(utf8.bytes[4] == '\0');
|
||||
TEST_CHECK(guf_utf8_decode(&utf8) == 0x1F97A);
|
||||
|
||||
TEST_CHECK(guf_utf8_encode(&utf8, 0x1F980)); // "🦀" (Crab)
|
||||
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 4);
|
||||
TEST_CHECK(utf8.bytes[0] == '\xF0' && utf8.bytes[1] == '\x9F' && utf8.bytes[2] == '\xA6' && utf8.bytes[3] == '\x80');
|
||||
TEST_CHECK(utf8.bytes[4] == '\0');
|
||||
TEST_CHECK(guf_utf8_decode(&utf8) == 0x1F980);
|
||||
|
||||
// Invalid characters:
|
||||
utf8 = {.bytes = {'\xC0', '\x80', 0, 0, 0}};
|
||||
utf8 = {.bytes = {'\xC0', '\x80', 0, 0}};
|
||||
TEST_CHECK(guf_utf8_decode(&utf8) < 0);
|
||||
|
||||
utf8 = {.bytes = {'\xC0', 0, 0, 0, 0}};
|
||||
utf8 = {.bytes = {'\xC0', 0, 0, 0}};
|
||||
TEST_CHECK(guf_utf8_decode(&utf8) < 0);
|
||||
|
||||
utf8 = {.bytes = {'\x80', 0, 0, 0, 0}};
|
||||
utf8 = {.bytes = {'\x80', 0, 0, 0}};
|
||||
TEST_CHECK(guf_utf8_decode(&utf8) < 0);
|
||||
|
||||
// "The definition of UTF-8 prohibits encoding character numbers between U+D800 and U+DFFF" (the range reserved for UTF-16 surrogates).
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user