libguf/src/test/test_utf8.cpp
2025-05-14 17:08:15 +02:00

392 lines
15 KiB
C++
Raw Blame History

#include "test_utf8.hpp"
extern "C"
{
#include "guf_alloc_libc.h"
#include "guf_str.h"
#include "impls/dict_impl.h"
}
/*
UTF8Test:
*/
void UTF8Test::run()
{
if (done) {
return;
}
push_check_name("read_utf8_chars");
ptrdiff_t valid = 0, invalid = 0;
read_utf8_chars(TEST_DATA_DIR "/" "utf8-test.txt", &valid, &invalid);
TEST_CHECK(valid == 2635 && invalid == 0);
read_utf8_chars(TEST_DATA_DIR "/" "bartleby.txt", &valid, &invalid);
TEST_CHECK(valid > 16000 && invalid == 0);
pop_check_name();
push_check_name("count_words");
dbuf_str_view delims = dbuf_str_view_new(&allocator);
for (size_t i = 0; i < GUF_ARR_SIZE(GUF_UTF8_WHITESPACE); ++i) {
guf_str_view d = {.len = (ptrdiff_t)strlen(GUF_UTF8_WHITESPACE[i]), .str = GUF_UTF8_WHITESPACE[i]};
dbuf_str_view_push_val(&delims, d);
}
for (size_t i = 0; i < GUF_ARR_SIZE(GUF_UTF8_COMMON_PUNCT); ++i) {
guf_str_view d = {.len = (ptrdiff_t)strlen(GUF_UTF8_COMMON_PUNCT[i]), .str = GUF_UTF8_COMMON_PUNCT[i]};
dbuf_str_view_push_val(&delims, d);
}
int words = count_words(TEST_DATA_DIR "/" "utf8-test.txt", &delims);
TEST_CHECK(words == 422);
int words_with_delims = count_words_with_delims(TEST_DATA_DIR "/" "utf8-test.txt", &delims);
TEST_CHECK(words_with_delims == 950);
int words2 = count_words(TEST_DATA_DIR "/" "bartleby.txt", &delims);
TEST_CHECK(words2 > 2048);
dbuf_str_view_free(&delims, NULL);
pop_check_name();
push_check_name("encode_decode");
encode_decode();
encode_decode_file(TEST_DATA_DIR "/" "utf8-test.txt");
encode_decode_file(TEST_DATA_DIR "/" "bartleby.txt");
pop_check_name();
//guf_alloc_tracker_print(&allocator_ctx.tracker, NULL);
TEST_CHECK(!guf_alloc_tracker_found_leak(&allocator_ctx.tracker));
}
bool UTF8Test::load_text(const char *fname)
{
FILE *in_file {nullptr};
if (!in_file) {
in_file = fopen(fname, "r");
}
if (!in_file) {
return false;
}
dbuf_char_init(&text_buf, 128, &allocator);
int c = EOF;
while ((c = fgetc(in_file)) != EOF) {
dbuf_char_push_val(&text_buf, (char)c);
text_vec.push_back((char)c);
}
fclose(in_file);
return TEST_CHECK(std::ssize(text_vec) == text_buf.size);
}
void UTF8Test::free_text()
{
dbuf_char_free(&text_buf, NULL);
text_vec.clear();
}
void UTF8Test::read_utf8_chars(const char *fname, ptrdiff_t *n_valid, ptrdiff_t *n_invalid)
{
GUF_ASSERT_RELEASE(load_text(fname));
ptrdiff_t valid_chars = 0, invalid_chars = 0, bytes = 0;
guf_str_view input_str = {.str = text_buf.data, .len = text_buf.size};
guf_utf8_char ch = {};
for (guf_utf8_stat stat = guf_utf8_char_next(&ch, &input_str); stat != GUF_UTF8_READ_DONE; stat = guf_utf8_char_next(&ch, &input_str)) {
if (stat == GUF_UTF8_READ_VALID) {
++valid_chars;
// printf("%s", ch.bytes);
} else {
++invalid_chars;
// printf("::INVALID_UTF8_CHAR::");
}
bytes += guf_utf8_char_num_bytes(&ch);
}
TEST_CHECK(input_str.len == 0 && input_str.str == NULL);
TEST_CHECK(bytes == text_buf.size);
// printf("\nread %td bytes\n", bytes);
// printf("read %td valid and %td invalid utf-8 characters\n", valid_chars, invalid_chars);
free_text();
if (n_valid)
*n_valid = valid_chars;
if (n_invalid)
*n_invalid = invalid_chars;
}
int UTF8Test::count_words(const char *fname, const dbuf_str_view *delims)
{
GUF_ASSERT_RELEASE(load_text(fname));
int num_words = 0;
guf_str_tok_state tok_state = guf_str_tok_state_new(guf_str_view{.str = text_buf.data, .len = text_buf.size}, delims->data, delims->size, GUF_STR_TOK_DELIM_OPT_MATCH_LONGEST);
while (guf_str_tok_next(&tok_state, false)) {
TEST_CHECK(tok_state.cur_tok.len > 0);
++num_words;
}
free_text();
return num_words;
}
int UTF8Test::count_words_with_delims(const char *fname, const dbuf_str_view *delims)
{
GUF_ASSERT_RELEASE(load_text(fname));
int num_words = 0, num_delims = 0;
guf_str_tok_state tok_state = guf_str_tok_state_new(guf_str_view{.str = text_buf.data, .len = text_buf.size}, delims->data, delims->size, GUF_STR_TOK_DELIM_OPT_MATCH_LONGEST);
while (guf_str_tok_next(&tok_state, true)) {
if (tok_state.cur_tok.len) {
++num_words;
// printf("'%.*s'\n", (int)tok_state.cur_tok.len, tok_state.cur_tok.str);
}
if (tok_state.cur_delim.len) {
++num_delims;
// if (tok_state.cur_delim.str[0] == '\n')
// printf("'\\n'\n");
// else
// printf("'%.*s'\n", (int)tok_state.cur_delim.len, tok_state.cur_delim.str);
}
}
free_text();
return num_words + num_delims;
}
void UTF8Test::encode_decode_file(const char *fname)
{
GUF_ASSERT_RELEASE(load_text(fname));
dbuf_i32 cp_buf = dbuf_i32_new(&allocator);
ptrdiff_t valid_chars = 0, invalid_chars = 0;
guf_str_view input_str = {.str = text_buf.data, .len = text_buf.size};
guf_utf8_char ch = {};
for (guf_utf8_stat stat = guf_utf8_char_next(&ch, &input_str); stat != GUF_UTF8_READ_DONE; stat = guf_utf8_char_next(&ch, &input_str)) {
if (stat == GUF_UTF8_READ_VALID) {
++valid_chars;
const int32_t codepoint = guf_utf8_decode(&ch);
TEST_CHECK(codepoint >= 0);
dbuf_i32_push_val(&cp_buf, codepoint);
} else {
++invalid_chars;
const int32_t codepoint = guf_utf8_decode(&ch);
TEST_CHECK(codepoint < 0);
dbuf_i32_push_val(&cp_buf, -1);
}
}
TEST_CHECK(cp_buf.size == valid_chars + invalid_chars);
guf_str_view in_str = {.str = text_buf.data, .len = text_buf.size};
GUF_CNT_FOREACH(&cp_buf, dbuf_i32, it) {
GUF_ASSERT_RELEASE(it.ptr);
const int32_t codepoint = *it.ptr;
guf_utf8_char utf8_ch = {};
const guf_utf8_stat stat = guf_utf8_char_next(&utf8_ch, &in_str);
if (codepoint >= 0) {
TEST_CHECK(stat == GUF_UTF8_READ_VALID);
guf_utf8_char encoded_ch = {};
TEST_CHECK(guf_utf8_encode(&encoded_ch, codepoint));
TEST_CHECK(guf_utf8_equal(&encoded_ch, &utf8_ch));
}
}
guf_utf8_char utf8_ch = {};
const guf_utf8_stat stat = guf_utf8_char_next(&utf8_ch, &in_str);
TEST_CHECK(stat == GUF_UTF8_READ_DONE);
dbuf_i32_free(&cp_buf, NULL);
free_text();
}
void UTF8Test::encode_decode()
{
guf_utf8_char utf8 = {0};
// 1 byte characters.
for (uint8_t ascii = 0; ascii <= 0x7F; ++ascii) {
TEST_CHECK(guf_utf8_encode(&utf8, ascii));
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 1);
TEST_CHECK(utf8.bytes[0] == ascii);
TEST_CHECK(utf8.bytes[1] == '\0');
TEST_CHECK(guf_utf8_decode(&utf8) == ascii);
}
// 2 byte characters:
TEST_CHECK(guf_utf8_encode(&utf8, 0x00E6)); // "æ" (Latin Small Letter Ae)
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 2);
TEST_CHECK(utf8.bytes[0] == '\xC3' && utf8.bytes[1] == '\xA6');
TEST_CHECK(utf8.bytes[2] == '\0');
TEST_CHECK(guf_utf8_decode(&utf8) == 0x00E6);
TEST_CHECK(guf_utf8_encode(&utf8, 0x00E5)); // "å" (Latin Small Letter A with Ring Above)
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 2);
TEST_CHECK(utf8.bytes[0] == '\xC3' && utf8.bytes[1] == '\xA5');
TEST_CHECK(utf8.bytes[2] == '\0');
TEST_CHECK(guf_utf8_decode(&utf8) == 0x00E5);
TEST_CHECK(guf_utf8_encode(&utf8, 0x00F8)); // "ø" (Latin Small Letter O with Stroke)
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 2);
TEST_CHECK(utf8.bytes[0] == '\xC3' && utf8.bytes[1] == '\xB8');
TEST_CHECK(utf8.bytes[2] == '\0');
TEST_CHECK(guf_utf8_decode(&utf8) == 0x00F8);
TEST_CHECK(guf_utf8_encode(&utf8, 0x00E4)); // "ä" (Latin Small Letter A with Diaeresis)
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 2);
TEST_CHECK(utf8.bytes[0] == '\xC3' && utf8.bytes[1] == '\xA4');
TEST_CHECK(utf8.bytes[2] == '\0');
TEST_CHECK(guf_utf8_decode(&utf8) == 0x00E4);
TEST_CHECK(guf_utf8_encode(&utf8, 0x00F6)); // "ö" (Latin Small Letter O with Diaeresis)
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 2);
TEST_CHECK(utf8.bytes[0] == '\xC3' && utf8.bytes[1] == '\xB6');
TEST_CHECK(utf8.bytes[2] == '\0');
TEST_CHECK(guf_utf8_decode(&utf8) == 0x00F6);
TEST_CHECK(guf_utf8_encode(&utf8, 0x00D6)); // "Ö" (Latin Capital Letter O with Diaeresis)
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 2);
TEST_CHECK(utf8.bytes[0] == '\xC3' && utf8.bytes[1] == '\x96');
TEST_CHECK(utf8.bytes[2] == '\0');
TEST_CHECK(guf_utf8_decode(&utf8) == 0x00D6);
TEST_CHECK(guf_utf8_encode(&utf8, 0x00FC)); // "ü" (Latin Small Letter U with Diaeresis)
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 2);
TEST_CHECK(utf8.bytes[0] == '\xC3' && utf8.bytes[1] == '\xBC');
TEST_CHECK(utf8.bytes[2] == '\0');
TEST_CHECK(guf_utf8_decode(&utf8) == 0x00FC);
TEST_CHECK(guf_utf8_encode(&utf8, 0x00B5)); // "µ" (Micro Sign)
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 2);
TEST_CHECK(utf8.bytes[0] == '\xC2' && utf8.bytes[1] == '\xB5');
TEST_CHECK(utf8.bytes[2] == '\0');
TEST_CHECK(guf_utf8_decode(&utf8) == 0x00B5);
TEST_CHECK(guf_utf8_encode(&utf8, 0x030A)); // "◌̊" (Combining Ring Above)
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 2);
TEST_CHECK(utf8.bytes[0] == '\xCC' && utf8.bytes[1] == '\x8A');
TEST_CHECK(utf8.bytes[2] == '\0');
TEST_CHECK(guf_utf8_decode(&utf8) == 0x030A);
// 3 byte characters:
TEST_CHECK(guf_utf8_encode(&utf8, 0x7121)); // "無" (Nothingness; CJK Unified Ideograph-7121)
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 3);
TEST_CHECK(!guf_utf8_equal(&utf8, &GUF_UTF8_REPLACEMENT_CHAR));
TEST_CHECK(utf8.bytes[0] == '\xE7' && utf8.bytes[1] == '\x84' && utf8.bytes[2] == '\xA1');
TEST_CHECK(utf8.bytes[3] == '\0');
TEST_CHECK(guf_utf8_decode(&utf8) == 0x7121);
TEST_CHECK(guf_utf8_encode(&utf8, 0x201E)); // "„" (Double Low-9 Quotation Mark)
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 3);
TEST_CHECK(!guf_utf8_equal(&utf8, &GUF_UTF8_REPLACEMENT_CHAR));
TEST_CHECK(utf8.bytes[0] == '\xE2' && utf8.bytes[1] == '\x80' && utf8.bytes[2] == '\x9E');
TEST_CHECK(utf8.bytes[3] == '\0');
TEST_CHECK(guf_utf8_decode(&utf8) == 0x201E);
TEST_CHECK(guf_utf8_encode(&utf8, 0x20AC)); // "€" (Euro Sign)
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 3);
TEST_CHECK(!guf_utf8_equal(&utf8, &GUF_UTF8_REPLACEMENT_CHAR));
TEST_CHECK(utf8.bytes[0] == '\xE2' && utf8.bytes[1] == '\x82' && utf8.bytes[2] == '\xAC');
TEST_CHECK(utf8.bytes[3] == '\0');
TEST_CHECK(guf_utf8_decode(&utf8) == 0x20AC);
TEST_CHECK(guf_utf8_encode(&utf8, 0xFC51)); // "ﱑ" (Arabic Ligature Heh with Jeem Isolated Form)
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 3);
TEST_CHECK(!guf_utf8_equal(&utf8, &GUF_UTF8_REPLACEMENT_CHAR));
TEST_CHECK(utf8.bytes[0] == '\xEF' && utf8.bytes[1] == '\xB1' && utf8.bytes[2] == '\x91');
TEST_CHECK(utf8.bytes[3] == '\0');
TEST_CHECK(guf_utf8_decode(&utf8) == 0xFC51);
TEST_CHECK(guf_utf8_encode(&utf8, 0x1AA3)); // "᪣" (Tai Tham Sign Keow)
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 3);
TEST_CHECK(!guf_utf8_equal(&utf8, &GUF_UTF8_REPLACEMENT_CHAR));
TEST_CHECK(utf8.bytes[0] == '\xE1' && utf8.bytes[1] == '\xAA' && utf8.bytes[2] == '\xA3');
TEST_CHECK(utf8.bytes[3] == '\0');
TEST_CHECK(guf_utf8_decode(&utf8) == 0x1AA3);
TEST_CHECK(guf_utf8_encode(&utf8, GUF_UTF8_REPLACEMENT_CHAR_CODEPOINT)); // "<22>" (Replacement Character)
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 3);
TEST_CHECK(utf8.bytes[0] == '\xEF' && utf8.bytes[1] == '\xBF' && utf8.bytes[2] == '\xBD');
TEST_CHECK(utf8.bytes[3] == '\0');
TEST_CHECK(guf_utf8_equal(&utf8, &GUF_UTF8_REPLACEMENT_CHAR));
TEST_CHECK(guf_utf8_decode(&utf8) == GUF_UTF8_REPLACEMENT_CHAR_CODEPOINT);
// 4 byte characters:
TEST_CHECK(guf_utf8_encode(&utf8, 0x1F308)); // "🌈" (Rainbow)
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 4);
TEST_CHECK(utf8.bytes[0] == '\xF0' && utf8.bytes[1] == '\x9F' && utf8.bytes[2] == '\x8C' && utf8.bytes[3] == '\x88');
TEST_CHECK(utf8.bytes[4] == '\0');
TEST_CHECK(guf_utf8_decode(&utf8) == 0x1F308);
TEST_CHECK(guf_utf8_encode(&utf8, 0x130B8)); // "𓂸" (Egyptian Hieroglyph D052)
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 4);
TEST_CHECK(utf8.bytes[0] == '\xF0' && utf8.bytes[1] == '\x93' && utf8.bytes[2] == '\x82' && utf8.bytes[3] == '\xB8');
TEST_CHECK(utf8.bytes[4] == '\0');
TEST_CHECK(guf_utf8_decode(&utf8) == 0x130B8);
TEST_CHECK(guf_utf8_encode(&utf8, 0x1F97A)); // "🥺" (Face with Pleading Eyes)
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 4);
TEST_CHECK(utf8.bytes[0] == '\xF0' && utf8.bytes[1] == '\x9F' && utf8.bytes[2] == '\xA5' && utf8.bytes[3] == '\xBA');
TEST_CHECK(utf8.bytes[4] == '\0');
TEST_CHECK(guf_utf8_decode(&utf8) == 0x1F97A);
TEST_CHECK(guf_utf8_encode(&utf8, 0x1F980)); // "🦀" (Crab)
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 4);
TEST_CHECK(utf8.bytes[0] == '\xF0' && utf8.bytes[1] == '\x9F' && utf8.bytes[2] == '\xA6' && utf8.bytes[3] == '\x80');
TEST_CHECK(utf8.bytes[4] == '\0');
TEST_CHECK(guf_utf8_decode(&utf8) == 0x1F980);
// Invalid characters:
utf8 = {.bytes = {'\xC0', '\x80', 0, 0, 0}};
TEST_CHECK(guf_utf8_decode(&utf8) < 0);
utf8 = {.bytes = {'\xC0', 0, 0, 0, 0}};
TEST_CHECK(guf_utf8_decode(&utf8) < 0);
utf8 = {.bytes = {'\x80', 0, 0, 0, 0}};
TEST_CHECK(guf_utf8_decode(&utf8) < 0);
// "The definition of UTF-8 prohibits encoding character numbers between U+D800 and U+DFFF" (surrogate pairs).
TEST_CHECK(!guf_utf8_encode(&utf8, 0xD800));
TEST_CHECK(guf_utf8_equal(&utf8, &GUF_UTF8_REPLACEMENT_CHAR));
TEST_CHECK(guf_utf8_decode(&utf8) == GUF_UTF8_REPLACEMENT_CHAR_CODEPOINT);
TEST_CHECK(!guf_utf8_encode(&utf8, 0xDFFF));
TEST_CHECK(guf_utf8_equal(&utf8, &GUF_UTF8_REPLACEMENT_CHAR));
TEST_CHECK(guf_utf8_decode(&utf8) == GUF_UTF8_REPLACEMENT_CHAR_CODEPOINT);
TEST_CHECK(!guf_utf8_encode(&utf8, 0xDA00));
TEST_CHECK(guf_utf8_equal(&utf8, &GUF_UTF8_REPLACEMENT_CHAR));
TEST_CHECK(guf_utf8_decode(&utf8) == GUF_UTF8_REPLACEMENT_CHAR_CODEPOINT);
char buf[] = {'\x2F', '\xC0', '\xAE', '\x2E', '\x2F'};
guf_str_view input_str = {.str = buf, .len = GUF_ARR_SIZE(buf)};
guf_utf8_char ch = {};
int valid_chars = 0, invalid_chars = 0;
for (guf_utf8_stat stat = guf_utf8_char_next(&ch, &input_str); stat != GUF_UTF8_READ_DONE; stat = guf_utf8_char_next(&ch, &input_str)) {
if (stat == GUF_UTF8_READ_VALID) {
++valid_chars;
} else {
++invalid_chars;
}
}
TEST_CHECK(invalid_chars == 2 && valid_chars == 3);
char buf2[] = {'\xE0', '\x80', 'a', 'b', 'c'}; // 1 invalid 3-byte-character, 2 valid 1-byte-characters
input_str = {.str = buf2, .len = GUF_ARR_SIZE(buf2)};
ch = {};
valid_chars = invalid_chars = 0;
for (guf_utf8_stat stat = guf_utf8_char_next(&ch, &input_str); stat != GUF_UTF8_READ_DONE; stat = guf_utf8_char_next(&ch, &input_str)) {
if (stat == GUF_UTF8_READ_VALID) {
// printf("%s", ch.bytes);
++valid_chars;
} else {
// printf("%s", GUF_UTF8_REPLACEMENT_CHAR.bytes);
++invalid_chars;
}
}
TEST_CHECK(invalid_chars == 1 && valid_chars == 2);
}