#include #include "test.hpp" extern "C" { #include "guf_alloc_libc.h" #include "guf_dict_impl.h" #include "guf_dbuf_impl.h" #include "guf_str.h" } struct UTF8Test : public Test { UTF8Test(const std::string& name) : Test(name) {}; private: dbuf_char text_buf {}; std::vector text_vec; bool load_text(const char *fname) { FILE *in_file {nullptr}; if (!in_file) { in_file = fopen(fname, "r"); } if (!in_file) { return false; } dbuf_char_init(&text_buf, 128, &guf_allocator_libc); int c = EOF; while ((c = fgetc(in_file)) != EOF) { dbuf_char_push_val(&text_buf, (char)c); text_vec.push_back((char)c); } fclose(in_file); return TEST_CHECK(std::ssize(text_vec) == text_buf.size); } void free_text() { dbuf_char_free(&text_buf, NULL); text_vec.clear(); } void read_utf8_chars(const char *fname, ptrdiff_t *n_valid, ptrdiff_t *n_invalid) { GUF_ASSERT_RELEASE(load_text(fname)); ptrdiff_t valid_chars = 0, invalid_chars = 0, bytes = 0; guf_str_view input_str = {.str = text_buf.data, .len = text_buf.size}; guf_utf8_char ch = {}; for (guf_utf8_stat stat = guf_utf8_char_next(&ch, &input_str); stat != GUF_UTF8_READ_DONE; stat = guf_utf8_char_next(&ch, &input_str)) { if (stat == GUF_UTF8_READ_VALID) { ++valid_chars; // printf("%s", ch.bytes); } else { ++invalid_chars; // printf("::INVALID_UTF8_CHAR::"); } bytes += guf_utf8_char_num_bytes(&ch); } TEST_CHECK(input_str.len == 0 && input_str.str == NULL); TEST_CHECK(bytes == text_buf.size); // printf("\nread %td bytes\n", bytes); // printf("read %td valid and %td invalid utf-8 characters\n", valid_chars, invalid_chars); free_text(); if (n_valid) *n_valid = valid_chars; if (n_invalid) *n_invalid = invalid_chars; } int count_words(const char *fname, const dbuf_str_view *delims) { GUF_ASSERT_RELEASE(load_text(fname)); int num_words = 0; guf_str_view input_str = {.str = text_buf.data, .len = text_buf.size}; guf_str_view tok; while ((tok = guf_str_next_tok(&input_str, delims->data, delims->size, NULL, -1)).len) { // printf("tok_len: %td ", tok.len); // printf("'%.*s'\n", (int)tok.len, tok.str); ++num_words; } free_text(); return num_words; } int count_words_with_delims(const char *fname, const dbuf_str_view *delims) { GUF_ASSERT_RELEASE(load_text(fname)); int num_words = 0; guf_str_view input_str = {.str = text_buf.data, .len = text_buf.size}; guf_str_view tok; while ((tok = guf_str_next_tok(&input_str, delims->data, delims->size, delims->data, delims->size)).len) { // if (tok.str[0] == '\n') { // printf("'\\n'\n"); // } else { // printf("'%.*s'\n", (int)tok.len, tok.str); // } ++num_words; } free_text(); return num_words; } void encode_decode_file(const char *fname) { GUF_ASSERT_RELEASE(load_text(fname)); dbuf_i32 cp_buf = dbuf_i32_new(&guf_allocator_libc); ptrdiff_t valid_chars = 0, invalid_chars = 0; guf_str_view input_str = {.str = text_buf.data, .len = text_buf.size}; guf_utf8_char ch = {}; for (guf_utf8_stat stat = guf_utf8_char_next(&ch, &input_str); stat != GUF_UTF8_READ_DONE; stat = guf_utf8_char_next(&ch, &input_str)) { if (stat == GUF_UTF8_READ_VALID) { ++valid_chars; const int32_t codepoint = guf_utf8_decode(&ch); TEST_CHECK(codepoint >= 0); dbuf_i32_push_val(&cp_buf, codepoint); } else { ++invalid_chars; const int32_t codepoint = guf_utf8_decode(&ch); TEST_CHECK(codepoint < 0); dbuf_i32_push_val(&cp_buf, -1); } } TEST_CHECK(cp_buf.size == valid_chars + invalid_chars); guf_str_view in_str = {.str = text_buf.data, .len = text_buf.size}; GUF_CNT_FOREACH(&cp_buf, dbuf_i32, it) { GUF_ASSERT_RELEASE(it.ptr); const int32_t codepoint = *it.ptr; guf_utf8_char utf8_ch = {}; const guf_utf8_stat stat = guf_utf8_char_next(&utf8_ch, &in_str); if (codepoint >= 0) { TEST_CHECK(stat == GUF_UTF8_READ_VALID); guf_utf8_char encoded_ch = {}; TEST_CHECK(guf_utf8_encode(&encoded_ch, codepoint)); TEST_CHECK(guf_utf8_equal(&encoded_ch, &utf8_ch)); } } guf_utf8_char utf8_ch = {}; const guf_utf8_stat stat = guf_utf8_char_next(&utf8_ch, &in_str); TEST_CHECK(stat == GUF_UTF8_READ_DONE); dbuf_i32_free(&cp_buf, NULL); free_text(); } void encode_decode() { guf_utf8_char utf8 = {0}; // 1 byte characters. for (uint8_t ascii = 0; ascii <= 0x7F; ++ascii) { TEST_CHECK(guf_utf8_encode(&utf8, ascii)); TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 1); TEST_CHECK(utf8.bytes[0] == ascii); TEST_CHECK(utf8.bytes[1] == '\0'); TEST_CHECK(guf_utf8_decode(&utf8) == ascii); } // 2 byte characters: TEST_CHECK(guf_utf8_encode(&utf8, 0x00E6)); // "æ" (Latin Small Letter Ae) TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 2); TEST_CHECK(utf8.bytes[0] == '\xC3' && utf8.bytes[1] == '\xA6'); TEST_CHECK(utf8.bytes[2] == '\0'); TEST_CHECK(guf_utf8_decode(&utf8) == 0x00E6); TEST_CHECK(guf_utf8_encode(&utf8, 0x00E5)); // "å" (Latin Small Letter A with Ring Above) TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 2); TEST_CHECK(utf8.bytes[0] == '\xC3' && utf8.bytes[1] == '\xA5'); TEST_CHECK(utf8.bytes[2] == '\0'); TEST_CHECK(guf_utf8_decode(&utf8) == 0x00E5); TEST_CHECK(guf_utf8_encode(&utf8, 0x00F8)); // "ø" (Latin Small Letter O with Stroke) TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 2); TEST_CHECK(utf8.bytes[0] == '\xC3' && utf8.bytes[1] == '\xB8'); TEST_CHECK(utf8.bytes[2] == '\0'); TEST_CHECK(guf_utf8_decode(&utf8) == 0x00F8); TEST_CHECK(guf_utf8_encode(&utf8, 0x00E4)); // "ä" (Latin Small Letter A with Diaeresis) TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 2); TEST_CHECK(utf8.bytes[0] == '\xC3' && utf8.bytes[1] == '\xA4'); TEST_CHECK(utf8.bytes[2] == '\0'); TEST_CHECK(guf_utf8_decode(&utf8) == 0x00E4); TEST_CHECK(guf_utf8_encode(&utf8, 0x00F6)); // "ö" (Latin Small Letter O with Diaeresis) TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 2); TEST_CHECK(utf8.bytes[0] == '\xC3' && utf8.bytes[1] == '\xB6'); TEST_CHECK(utf8.bytes[2] == '\0'); TEST_CHECK(guf_utf8_decode(&utf8) == 0x00F6); TEST_CHECK(guf_utf8_encode(&utf8, 0x00D6)); // "Ö" (Latin Capital Letter O with Diaeresis) TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 2); TEST_CHECK(utf8.bytes[0] == '\xC3' && utf8.bytes[1] == '\x96'); TEST_CHECK(utf8.bytes[2] == '\0'); TEST_CHECK(guf_utf8_decode(&utf8) == 0x00D6); TEST_CHECK(guf_utf8_encode(&utf8, 0x00FC)); // "ü" (Latin Small Letter U with Diaeresis) TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 2); TEST_CHECK(utf8.bytes[0] == '\xC3' && utf8.bytes[1] == '\xBC'); TEST_CHECK(utf8.bytes[2] == '\0'); TEST_CHECK(guf_utf8_decode(&utf8) == 0x00FC); TEST_CHECK(guf_utf8_encode(&utf8, 0x00B5)); // "µ" (Micro Sign) TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 2); TEST_CHECK(utf8.bytes[0] == '\xC2' && utf8.bytes[1] == '\xB5'); TEST_CHECK(utf8.bytes[2] == '\0'); TEST_CHECK(guf_utf8_decode(&utf8) == 0x00B5); TEST_CHECK(guf_utf8_encode(&utf8, 0x030A)); // "◌̊" (Combining Ring Above) TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 2); TEST_CHECK(utf8.bytes[0] == '\xCC' && utf8.bytes[1] == '\x8A'); TEST_CHECK(utf8.bytes[2] == '\0'); TEST_CHECK(guf_utf8_decode(&utf8) == 0x030A); // 3 byte characters: TEST_CHECK(guf_utf8_encode(&utf8, 0x7121)); // "無" (Nothingness; CJK Unified Ideograph-7121) TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 3); TEST_CHECK(!guf_utf8_equal(&utf8, &GUF_UTF8_REPLACEMENT_CHAR)); TEST_CHECK(utf8.bytes[0] == '\xE7' && utf8.bytes[1] == '\x84' && utf8.bytes[2] == '\xA1'); TEST_CHECK(utf8.bytes[3] == '\0'); TEST_CHECK(guf_utf8_decode(&utf8) == 0x7121); TEST_CHECK(guf_utf8_encode(&utf8, 0x201E)); // "„" (Double Low-9 Quotation Mark) TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 3); TEST_CHECK(!guf_utf8_equal(&utf8, &GUF_UTF8_REPLACEMENT_CHAR)); TEST_CHECK(utf8.bytes[0] == '\xE2' && utf8.bytes[1] == '\x80' && utf8.bytes[2] == '\x9E'); TEST_CHECK(utf8.bytes[3] == '\0'); TEST_CHECK(guf_utf8_decode(&utf8) == 0x201E); TEST_CHECK(guf_utf8_encode(&utf8, 0x20AC)); // "€" (Euro Sign) TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 3); TEST_CHECK(!guf_utf8_equal(&utf8, &GUF_UTF8_REPLACEMENT_CHAR)); TEST_CHECK(utf8.bytes[0] == '\xE2' && utf8.bytes[1] == '\x82' && utf8.bytes[2] == '\xAC'); TEST_CHECK(utf8.bytes[3] == '\0'); TEST_CHECK(guf_utf8_decode(&utf8) == 0x20AC); TEST_CHECK(guf_utf8_encode(&utf8, 0xFC51)); // "ﱑ" (Arabic Ligature Heh with Jeem Isolated Form) TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 3); TEST_CHECK(!guf_utf8_equal(&utf8, &GUF_UTF8_REPLACEMENT_CHAR)); TEST_CHECK(utf8.bytes[0] == '\xEF' && utf8.bytes[1] == '\xB1' && utf8.bytes[2] == '\x91'); TEST_CHECK(utf8.bytes[3] == '\0'); TEST_CHECK(guf_utf8_decode(&utf8) == 0xFC51); TEST_CHECK(guf_utf8_encode(&utf8, 0x1AA3)); // "᪣" (Tai Tham Sign Keow) TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 3); TEST_CHECK(!guf_utf8_equal(&utf8, &GUF_UTF8_REPLACEMENT_CHAR)); TEST_CHECK(utf8.bytes[0] == '\xE1' && utf8.bytes[1] == '\xAA' && utf8.bytes[2] == '\xA3'); TEST_CHECK(utf8.bytes[3] == '\0'); TEST_CHECK(guf_utf8_decode(&utf8) == 0x1AA3); TEST_CHECK(guf_utf8_encode(&utf8, GUF_UTF8_REPLACEMENT_CHAR_CODEPOINT)); // "�" (Replacement Character) TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 3); TEST_CHECK(utf8.bytes[0] == '\xEF' && utf8.bytes[1] == '\xBF' && utf8.bytes[2] == '\xBD'); TEST_CHECK(utf8.bytes[3] == '\0'); TEST_CHECK(guf_utf8_equal(&utf8, &GUF_UTF8_REPLACEMENT_CHAR)); TEST_CHECK(guf_utf8_decode(&utf8) == GUF_UTF8_REPLACEMENT_CHAR_CODEPOINT); // 4 byte characters: TEST_CHECK(guf_utf8_encode(&utf8, 0x1F308)); // "🌈" (Rainbow) TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 4); TEST_CHECK(utf8.bytes[0] == '\xF0' && utf8.bytes[1] == '\x9F' && utf8.bytes[2] == '\x8C' && utf8.bytes[3] == '\x88'); TEST_CHECK(utf8.bytes[4] == '\0'); TEST_CHECK(guf_utf8_decode(&utf8) == 0x1F308); TEST_CHECK(guf_utf8_encode(&utf8, 0x130B8)); // "𓂸" (Egyptian Hieroglyph D052) TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 4); TEST_CHECK(utf8.bytes[0] == '\xF0' && utf8.bytes[1] == '\x93' && utf8.bytes[2] == '\x82' && utf8.bytes[3] == '\xB8'); TEST_CHECK(utf8.bytes[4] == '\0'); TEST_CHECK(guf_utf8_decode(&utf8) == 0x130B8); TEST_CHECK(guf_utf8_encode(&utf8, 0x1F97A)); // "🥺" (Face with Pleading Eyes) TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 4); TEST_CHECK(utf8.bytes[0] == '\xF0' && utf8.bytes[1] == '\x9F' && utf8.bytes[2] == '\xA5' && utf8.bytes[3] == '\xBA'); TEST_CHECK(utf8.bytes[4] == '\0'); TEST_CHECK(guf_utf8_decode(&utf8) == 0x1F97A); TEST_CHECK(guf_utf8_encode(&utf8, 0x1F980)); // "🦀" (Crab) TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 4); TEST_CHECK(utf8.bytes[0] == '\xF0' && utf8.bytes[1] == '\x9F' && utf8.bytes[2] == '\xA6' && utf8.bytes[3] == '\x80'); TEST_CHECK(utf8.bytes[4] == '\0'); TEST_CHECK(guf_utf8_decode(&utf8) == 0x1F980); // Invalid characters: utf8 = {.bytes = {'\xC0', '\x80', 0, 0, 0}}; TEST_CHECK(guf_utf8_decode(&utf8) < 0); utf8 = {.bytes = {'\xC0', 0, 0, 0, 0}}; TEST_CHECK(guf_utf8_decode(&utf8) < 0); utf8 = {.bytes = {'\x80', 0, 0, 0, 0}}; TEST_CHECK(guf_utf8_decode(&utf8) < 0); // "The definition of UTF-8 prohibits encoding character numbers between U+D800 and U+DFFF" (surrogate pairs). TEST_CHECK(!guf_utf8_encode(&utf8, 0xD800)); TEST_CHECK(guf_utf8_equal(&utf8, &GUF_UTF8_REPLACEMENT_CHAR)); TEST_CHECK(guf_utf8_decode(&utf8) == GUF_UTF8_REPLACEMENT_CHAR_CODEPOINT); TEST_CHECK(!guf_utf8_encode(&utf8, 0xDFFF)); TEST_CHECK(guf_utf8_equal(&utf8, &GUF_UTF8_REPLACEMENT_CHAR)); TEST_CHECK(guf_utf8_decode(&utf8) == GUF_UTF8_REPLACEMENT_CHAR_CODEPOINT); TEST_CHECK(!guf_utf8_encode(&utf8, 0xDA00)); TEST_CHECK(guf_utf8_equal(&utf8, &GUF_UTF8_REPLACEMENT_CHAR)); TEST_CHECK(guf_utf8_decode(&utf8) == GUF_UTF8_REPLACEMENT_CHAR_CODEPOINT); char buf[] = {'\x2F', '\xC0', '\xAE', '\x2E', '\x2F'}; guf_str_view input_str = {.str = buf, .len = GUF_STATIC_BUF_SIZE(buf)}; guf_utf8_char ch = {}; int valid_chars = 0, invalid_chars = 0; for (guf_utf8_stat stat = guf_utf8_char_next(&ch, &input_str); stat != GUF_UTF8_READ_DONE; stat = guf_utf8_char_next(&ch, &input_str)) { if (stat == GUF_UTF8_READ_VALID) { ++valid_chars; } else { ++invalid_chars; } } TEST_CHECK(invalid_chars == 2 && valid_chars == 3); char buf2[] = {'\xE0', '\x80', 'a', 'b', 'c'}; // 1 invalid 3-byte-character, 2 valid 1-byte-characters input_str = {.str = buf2, .len = GUF_STATIC_BUF_SIZE(buf2)}; ch = {}; valid_chars = invalid_chars = 0; for (guf_utf8_stat stat = guf_utf8_char_next(&ch, &input_str); stat != GUF_UTF8_READ_DONE; stat = guf_utf8_char_next(&ch, &input_str)) { if (stat == GUF_UTF8_READ_VALID) { // printf("%s", ch.bytes); ++valid_chars; } else { // printf("%s", GUF_UTF8_REPLACEMENT_CHAR.bytes); ++invalid_chars; } } TEST_CHECK(invalid_chars == 1 && valid_chars == 2); } public: bool run() { if (done) { return passed; } ptrdiff_t valid = 0, invalid = 0; read_utf8_chars(TEST_DATA_DIR "/" "utf8-test.txt", &valid, &invalid); TEST_CHECK(valid == 2634 && invalid == 0); dbuf_str_view delims = dbuf_str_view_new(&guf_allocator_libc); for (size_t i = 0; i < GUF_STATIC_BUF_SIZE(GUF_UTF8_WHITESPACE); ++i) { guf_str_view d = {.len = (ptrdiff_t)strlen(GUF_UTF8_WHITESPACE[i]), .str = GUF_UTF8_WHITESPACE[i]}; dbuf_str_view_push_val(&delims, d); } for (size_t i = 0; i < GUF_STATIC_BUF_SIZE(GUF_UTF8_COMMON_PUNCT); ++i) { guf_str_view d = {.len = (ptrdiff_t)strlen(GUF_UTF8_COMMON_PUNCT[i]), .str = GUF_UTF8_COMMON_PUNCT[i]}; dbuf_str_view_push_val(&delims, d); } int words = count_words(TEST_DATA_DIR "/" "utf8-test.txt", &delims); TEST_CHECK(words == 422); int words_with_delims = count_words_with_delims(TEST_DATA_DIR "/" "utf8-test.txt", &delims); TEST_CHECK(words_with_delims == 947); dbuf_str_view_free(&delims, NULL); encode_decode(); encode_decode_file(TEST_DATA_DIR "/" "utf8-test.txt"); done = true; passed = (num_failed_checks == 0); return passed; } };