Make guf_utf8_char 4 bytes instead of 5
(Null-termination was superfluous here.)
This commit is contained in:
parent
c54fc75221
commit
ae83ee66e1
@ -16,7 +16,7 @@
|
|||||||
|
|
||||||
// Corresponds to one unicode codepoint (NOTE: one guf_utf8_char does not necessarily correspond to one printable character, e.g. combining characters).
|
// Corresponds to one unicode codepoint (NOTE: one guf_utf8_char does not necessarily correspond to one printable character, e.g. combining characters).
|
||||||
typedef struct guf_utf8_char {
|
typedef struct guf_utf8_char {
|
||||||
char bytes[5];
|
char bytes[4];
|
||||||
} guf_utf8_char;
|
} guf_utf8_char;
|
||||||
|
|
||||||
typedef enum guf_utf8_stat {
|
typedef enum guf_utf8_stat {
|
||||||
@ -68,7 +68,7 @@ const char* const GUF_UTF8_COMMON_PUNCT[32] =
|
|||||||
".", ",", ";", ":", "(", ")", "[", "]", "!", "?", "¿", "¡", "&", "+", "-", "/", "*", "\"", "'", "„", "“", "´", "»", "«", "`", "\\", "%", "‒", "–", "—", "—", "_"
|
".", ",", ";", ":", "(", ")", "[", "]", "!", "?", "¿", "¡", "&", "+", "-", "/", "*", "\"", "'", "„", "“", "´", "»", "«", "`", "\\", "%", "‒", "–", "—", "—", "_"
|
||||||
};
|
};
|
||||||
|
|
||||||
const guf_utf8_char GUF_UTF8_REPLACEMENT_CHAR = {.bytes = {'\xEF','\xBF','\xBD', '\0', '\0'}};
|
const guf_utf8_char GUF_UTF8_REPLACEMENT_CHAR = {.bytes = {'\xEF','\xBF','\xBD', '\0'}};
|
||||||
|
|
||||||
#ifndef GUF_FN_KEYWORDS
|
#ifndef GUF_FN_KEYWORDS
|
||||||
#define GUF_FN_KEYWORDS
|
#define GUF_FN_KEYWORDS
|
||||||
|
|||||||
@ -317,35 +317,31 @@ void UTF8Test::encode_decode()
|
|||||||
TEST_CHECK(guf_utf8_encode(&utf8, 0x1F308)); // "🌈" (Rainbow)
|
TEST_CHECK(guf_utf8_encode(&utf8, 0x1F308)); // "🌈" (Rainbow)
|
||||||
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 4);
|
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 4);
|
||||||
TEST_CHECK(utf8.bytes[0] == '\xF0' && utf8.bytes[1] == '\x9F' && utf8.bytes[2] == '\x8C' && utf8.bytes[3] == '\x88');
|
TEST_CHECK(utf8.bytes[0] == '\xF0' && utf8.bytes[1] == '\x9F' && utf8.bytes[2] == '\x8C' && utf8.bytes[3] == '\x88');
|
||||||
TEST_CHECK(utf8.bytes[4] == '\0');
|
|
||||||
TEST_CHECK(guf_utf8_decode(&utf8) == 0x1F308);
|
TEST_CHECK(guf_utf8_decode(&utf8) == 0x1F308);
|
||||||
|
|
||||||
TEST_CHECK(guf_utf8_encode(&utf8, 0x130B8)); // "𓂸" (Egyptian Hieroglyph D052)
|
TEST_CHECK(guf_utf8_encode(&utf8, 0x130B8)); // "𓂸" (Egyptian Hieroglyph D052)
|
||||||
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 4);
|
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 4);
|
||||||
TEST_CHECK(utf8.bytes[0] == '\xF0' && utf8.bytes[1] == '\x93' && utf8.bytes[2] == '\x82' && utf8.bytes[3] == '\xB8');
|
TEST_CHECK(utf8.bytes[0] == '\xF0' && utf8.bytes[1] == '\x93' && utf8.bytes[2] == '\x82' && utf8.bytes[3] == '\xB8');
|
||||||
TEST_CHECK(utf8.bytes[4] == '\0');
|
|
||||||
TEST_CHECK(guf_utf8_decode(&utf8) == 0x130B8);
|
TEST_CHECK(guf_utf8_decode(&utf8) == 0x130B8);
|
||||||
|
|
||||||
TEST_CHECK(guf_utf8_encode(&utf8, 0x1F97A)); // "🥺" (Face with Pleading Eyes)
|
TEST_CHECK(guf_utf8_encode(&utf8, 0x1F97A)); // "🥺" (Face with Pleading Eyes)
|
||||||
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 4);
|
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 4);
|
||||||
TEST_CHECK(utf8.bytes[0] == '\xF0' && utf8.bytes[1] == '\x9F' && utf8.bytes[2] == '\xA5' && utf8.bytes[3] == '\xBA');
|
TEST_CHECK(utf8.bytes[0] == '\xF0' && utf8.bytes[1] == '\x9F' && utf8.bytes[2] == '\xA5' && utf8.bytes[3] == '\xBA');
|
||||||
TEST_CHECK(utf8.bytes[4] == '\0');
|
|
||||||
TEST_CHECK(guf_utf8_decode(&utf8) == 0x1F97A);
|
TEST_CHECK(guf_utf8_decode(&utf8) == 0x1F97A);
|
||||||
|
|
||||||
TEST_CHECK(guf_utf8_encode(&utf8, 0x1F980)); // "🦀" (Crab)
|
TEST_CHECK(guf_utf8_encode(&utf8, 0x1F980)); // "🦀" (Crab)
|
||||||
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 4);
|
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 4);
|
||||||
TEST_CHECK(utf8.bytes[0] == '\xF0' && utf8.bytes[1] == '\x9F' && utf8.bytes[2] == '\xA6' && utf8.bytes[3] == '\x80');
|
TEST_CHECK(utf8.bytes[0] == '\xF0' && utf8.bytes[1] == '\x9F' && utf8.bytes[2] == '\xA6' && utf8.bytes[3] == '\x80');
|
||||||
TEST_CHECK(utf8.bytes[4] == '\0');
|
|
||||||
TEST_CHECK(guf_utf8_decode(&utf8) == 0x1F980);
|
TEST_CHECK(guf_utf8_decode(&utf8) == 0x1F980);
|
||||||
|
|
||||||
// Invalid characters:
|
// Invalid characters:
|
||||||
utf8 = {.bytes = {'\xC0', '\x80', 0, 0, 0}};
|
utf8 = {.bytes = {'\xC0', '\x80', 0, 0}};
|
||||||
TEST_CHECK(guf_utf8_decode(&utf8) < 0);
|
TEST_CHECK(guf_utf8_decode(&utf8) < 0);
|
||||||
|
|
||||||
utf8 = {.bytes = {'\xC0', 0, 0, 0, 0}};
|
utf8 = {.bytes = {'\xC0', 0, 0, 0}};
|
||||||
TEST_CHECK(guf_utf8_decode(&utf8) < 0);
|
TEST_CHECK(guf_utf8_decode(&utf8) < 0);
|
||||||
|
|
||||||
utf8 = {.bytes = {'\x80', 0, 0, 0, 0}};
|
utf8 = {.bytes = {'\x80', 0, 0, 0}};
|
||||||
TEST_CHECK(guf_utf8_decode(&utf8) < 0);
|
TEST_CHECK(guf_utf8_decode(&utf8) < 0);
|
||||||
|
|
||||||
// "The definition of UTF-8 prohibits encoding character numbers between U+D800 and U+DFFF" (surrogate pairs).
|
// "The definition of UTF-8 prohibits encoding character numbers between U+D800 and U+DFFF" (surrogate pairs).
|
||||||
|
|||||||
2
todo.txt
2
todo.txt
@ -1,7 +1,5 @@
|
|||||||
- fix readonly str/uninit ?
|
- fix readonly str/uninit ?
|
||||||
|
|
||||||
- make guf_utf8_char 4 bytes (non-null terminated)
|
|
||||||
|
|
||||||
- guf_stack, guf_queue, guf_dqueue, guf_prio_queue (using a heap), guf_ringbuf
|
- guf_stack, guf_queue, guf_dqueue, guf_prio_queue (using a heap), guf_ringbuf
|
||||||
- sort: add cpp #ifdef to remove restrict from declaration
|
- sort: add cpp #ifdef to remove restrict from declaration
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user