392 lines
15 KiB
C++
392 lines
15 KiB
C++
#include "test_utf8.hpp"
|
||
extern "C"
|
||
{
|
||
#include "guf_alloc_libc.h"
|
||
#include "guf_str.h"
|
||
#include "impls/dict_impl.h"
|
||
}
|
||
|
||
/*
    UTF8Test:
*/
|
||
|
||
void UTF8Test::run()
|
||
{
|
||
if (done) {
|
||
return;
|
||
}
|
||
|
||
push_check_name("read_utf8_chars");
|
||
ptrdiff_t valid = 0, invalid = 0;
|
||
read_utf8_chars(TEST_DATA_DIR "/" "utf8-test.txt", &valid, &invalid);
|
||
TEST_CHECK(valid == 2635 && invalid == 0);
|
||
read_utf8_chars(TEST_DATA_DIR "/" "bartleby.txt", &valid, &invalid);
|
||
TEST_CHECK(valid > 16000 && invalid == 0);
|
||
pop_check_name();
|
||
|
||
push_check_name("count_words");
|
||
dbuf_str_view delims = dbuf_str_view_new(&allocator);
|
||
for (size_t i = 0; i < GUF_ARR_SIZE(GUF_UTF8_WHITESPACE); ++i) {
|
||
guf_str_view d = {.len = (ptrdiff_t)strlen(GUF_UTF8_WHITESPACE[i]), .str = GUF_UTF8_WHITESPACE[i]};
|
||
dbuf_str_view_push_val(&delims, d);
|
||
}
|
||
for (size_t i = 0; i < GUF_ARR_SIZE(GUF_UTF8_COMMON_PUNCT); ++i) {
|
||
guf_str_view d = {.len = (ptrdiff_t)strlen(GUF_UTF8_COMMON_PUNCT[i]), .str = GUF_UTF8_COMMON_PUNCT[i]};
|
||
dbuf_str_view_push_val(&delims, d);
|
||
}
|
||
int words = count_words(TEST_DATA_DIR "/" "utf8-test.txt", &delims);
|
||
TEST_CHECK(words == 422);
|
||
int words_with_delims = count_words_with_delims(TEST_DATA_DIR "/" "utf8-test.txt", &delims);
|
||
TEST_CHECK(words_with_delims == 950);
|
||
|
||
int words2 = count_words(TEST_DATA_DIR "/" "bartleby.txt", &delims);
|
||
TEST_CHECK(words2 > 2048);
|
||
|
||
dbuf_str_view_free(&delims, NULL);
|
||
pop_check_name();
|
||
|
||
push_check_name("encode_decode");
|
||
encode_decode();
|
||
encode_decode_file(TEST_DATA_DIR "/" "utf8-test.txt");
|
||
encode_decode_file(TEST_DATA_DIR "/" "bartleby.txt");
|
||
pop_check_name();
|
||
|
||
//guf_alloc_tracker_print(&allocator_ctx.tracker, NULL);
|
||
TEST_CHECK(!guf_alloc_tracker_found_leak(&allocator_ctx.tracker));
|
||
}
|
||
|
||
|
||
bool UTF8Test::load_text(const char *fname)
|
||
{
|
||
FILE *in_file {nullptr};
|
||
if (!in_file) {
|
||
in_file = fopen(fname, "r");
|
||
}
|
||
|
||
if (!in_file) {
|
||
return false;
|
||
}
|
||
|
||
dbuf_char_init(&text_buf, 128, &allocator);
|
||
|
||
int c = EOF;
|
||
while ((c = fgetc(in_file)) != EOF) {
|
||
dbuf_char_push_val(&text_buf, (char)c);
|
||
text_vec.push_back((char)c);
|
||
}
|
||
fclose(in_file);
|
||
|
||
return TEST_CHECK(std::ssize(text_vec) == text_buf.size);
|
||
}
|
||
|
||
void UTF8Test::free_text()
|
||
{
|
||
dbuf_char_free(&text_buf, NULL);
|
||
text_vec.clear();
|
||
}
|
||
|
||
|
||
void UTF8Test::read_utf8_chars(const char *fname, ptrdiff_t *n_valid, ptrdiff_t *n_invalid)
|
||
{
|
||
GUF_ASSERT_RELEASE(load_text(fname));
|
||
|
||
ptrdiff_t valid_chars = 0, invalid_chars = 0, bytes = 0;
|
||
guf_str_view input_str = {.str = text_buf.data, .len = text_buf.size};
|
||
guf_utf8_char ch = {};
|
||
for (guf_utf8_stat stat = guf_utf8_char_next(&ch, &input_str); stat != GUF_UTF8_READ_DONE; stat = guf_utf8_char_next(&ch, &input_str)) {
|
||
if (stat == GUF_UTF8_READ_VALID) {
|
||
++valid_chars;
|
||
// printf("%s", ch.bytes);
|
||
} else {
|
||
++invalid_chars;
|
||
// printf("::INVALID_UTF8_CHAR::");
|
||
}
|
||
bytes += guf_utf8_char_num_bytes(&ch);
|
||
}
|
||
TEST_CHECK(input_str.len == 0 && input_str.str == NULL);
|
||
TEST_CHECK(bytes == text_buf.size);
|
||
|
||
// printf("\nread %td bytes\n", bytes);
|
||
// printf("read %td valid and %td invalid utf-8 characters\n", valid_chars, invalid_chars);
|
||
|
||
free_text();
|
||
|
||
if (n_valid)
|
||
*n_valid = valid_chars;
|
||
if (n_invalid)
|
||
*n_invalid = invalid_chars;
|
||
}
|
||
|
||
int UTF8Test::count_words(const char *fname, const dbuf_str_view *delims)
|
||
{
|
||
GUF_ASSERT_RELEASE(load_text(fname));
|
||
|
||
int num_words = 0;
|
||
|
||
guf_str_tok_state tok_state = guf_str_tok_state_new(guf_str_view{.str = text_buf.data, .len = text_buf.size}, delims->data, delims->size, GUF_STR_TOK_DELIM_OPT_MATCH_LONGEST);
|
||
while (guf_str_tok_next(&tok_state, false)) {
|
||
TEST_CHECK(tok_state.cur_tok.len > 0);
|
||
++num_words;
|
||
}
|
||
|
||
free_text();
|
||
return num_words;
|
||
}
|
||
|
||
int UTF8Test::count_words_with_delims(const char *fname, const dbuf_str_view *delims)
|
||
{
|
||
GUF_ASSERT_RELEASE(load_text(fname));
|
||
|
||
int num_words = 0, num_delims = 0;
|
||
guf_str_tok_state tok_state = guf_str_tok_state_new(guf_str_view{.str = text_buf.data, .len = text_buf.size}, delims->data, delims->size, GUF_STR_TOK_DELIM_OPT_MATCH_LONGEST);
|
||
while (guf_str_tok_next(&tok_state, true)) {
|
||
if (tok_state.cur_tok.len) {
|
||
++num_words;
|
||
// printf("'%.*s'\n", (int)tok_state.cur_tok.len, tok_state.cur_tok.str);
|
||
}
|
||
if (tok_state.cur_delim.len) {
|
||
++num_delims;
|
||
// if (tok_state.cur_delim.str[0] == '\n')
|
||
// printf("'\\n'\n");
|
||
// else
|
||
// printf("'%.*s'\n", (int)tok_state.cur_delim.len, tok_state.cur_delim.str);
|
||
}
|
||
}
|
||
free_text();
|
||
return num_words + num_delims;
|
||
}
|
||
|
||
void UTF8Test::encode_decode_file(const char *fname)
|
||
{
|
||
GUF_ASSERT_RELEASE(load_text(fname));
|
||
|
||
dbuf_i32 cp_buf = dbuf_i32_new(&allocator);
|
||
|
||
ptrdiff_t valid_chars = 0, invalid_chars = 0;
|
||
guf_str_view input_str = {.str = text_buf.data, .len = text_buf.size};
|
||
guf_utf8_char ch = {};
|
||
for (guf_utf8_stat stat = guf_utf8_char_next(&ch, &input_str); stat != GUF_UTF8_READ_DONE; stat = guf_utf8_char_next(&ch, &input_str)) {
|
||
if (stat == GUF_UTF8_READ_VALID) {
|
||
++valid_chars;
|
||
const int32_t codepoint = guf_utf8_decode(&ch);
|
||
TEST_CHECK(codepoint >= 0);
|
||
dbuf_i32_push_val(&cp_buf, codepoint);
|
||
} else {
|
||
++invalid_chars;
|
||
const int32_t codepoint = guf_utf8_decode(&ch);
|
||
TEST_CHECK(codepoint < 0);
|
||
dbuf_i32_push_val(&cp_buf, -1);
|
||
}
|
||
}
|
||
TEST_CHECK(cp_buf.size == valid_chars + invalid_chars);
|
||
|
||
guf_str_view in_str = {.str = text_buf.data, .len = text_buf.size};
|
||
GUF_CNT_FOREACH(&cp_buf, dbuf_i32, it) {
|
||
GUF_ASSERT_RELEASE(it.ptr);
|
||
const int32_t codepoint = *it.ptr;
|
||
guf_utf8_char utf8_ch = {};
|
||
const guf_utf8_stat stat = guf_utf8_char_next(&utf8_ch, &in_str);
|
||
if (codepoint >= 0) {
|
||
TEST_CHECK(stat == GUF_UTF8_READ_VALID);
|
||
guf_utf8_char encoded_ch = {};
|
||
TEST_CHECK(guf_utf8_encode(&encoded_ch, codepoint));
|
||
TEST_CHECK(guf_utf8_equal(&encoded_ch, &utf8_ch));
|
||
}
|
||
}
|
||
guf_utf8_char utf8_ch = {};
|
||
const guf_utf8_stat stat = guf_utf8_char_next(&utf8_ch, &in_str);
|
||
TEST_CHECK(stat == GUF_UTF8_READ_DONE);
|
||
|
||
dbuf_i32_free(&cp_buf, NULL);
|
||
|
||
free_text();
|
||
}
|
||
|
||
void UTF8Test::encode_decode()
|
||
{
|
||
guf_utf8_char utf8 = {0};
|
||
|
||
// 1 byte characters.
|
||
for (uint8_t ascii = 0; ascii <= 0x7F; ++ascii) {
|
||
TEST_CHECK(guf_utf8_encode(&utf8, ascii));
|
||
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 1);
|
||
TEST_CHECK(utf8.bytes[0] == ascii);
|
||
TEST_CHECK(utf8.bytes[1] == '\0');
|
||
TEST_CHECK(guf_utf8_decode(&utf8) == ascii);
|
||
}
|
||
|
||
// 2 byte characters:
|
||
TEST_CHECK(guf_utf8_encode(&utf8, 0x00E6)); // "æ" (Latin Small Letter Ae)
|
||
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 2);
|
||
TEST_CHECK(utf8.bytes[0] == '\xC3' && utf8.bytes[1] == '\xA6');
|
||
TEST_CHECK(utf8.bytes[2] == '\0');
|
||
TEST_CHECK(guf_utf8_decode(&utf8) == 0x00E6);
|
||
|
||
TEST_CHECK(guf_utf8_encode(&utf8, 0x00E5)); // "å" (Latin Small Letter A with Ring Above)
|
||
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 2);
|
||
TEST_CHECK(utf8.bytes[0] == '\xC3' && utf8.bytes[1] == '\xA5');
|
||
TEST_CHECK(utf8.bytes[2] == '\0');
|
||
TEST_CHECK(guf_utf8_decode(&utf8) == 0x00E5);
|
||
|
||
TEST_CHECK(guf_utf8_encode(&utf8, 0x00F8)); // "ø" (Latin Small Letter O with Stroke)
|
||
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 2);
|
||
TEST_CHECK(utf8.bytes[0] == '\xC3' && utf8.bytes[1] == '\xB8');
|
||
TEST_CHECK(utf8.bytes[2] == '\0');
|
||
TEST_CHECK(guf_utf8_decode(&utf8) == 0x00F8);
|
||
|
||
TEST_CHECK(guf_utf8_encode(&utf8, 0x00E4)); // "ä" (Latin Small Letter A with Diaeresis)
|
||
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 2);
|
||
TEST_CHECK(utf8.bytes[0] == '\xC3' && utf8.bytes[1] == '\xA4');
|
||
TEST_CHECK(utf8.bytes[2] == '\0');
|
||
TEST_CHECK(guf_utf8_decode(&utf8) == 0x00E4);
|
||
|
||
TEST_CHECK(guf_utf8_encode(&utf8, 0x00F6)); // "ö" (Latin Small Letter O with Diaeresis)
|
||
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 2);
|
||
TEST_CHECK(utf8.bytes[0] == '\xC3' && utf8.bytes[1] == '\xB6');
|
||
TEST_CHECK(utf8.bytes[2] == '\0');
|
||
TEST_CHECK(guf_utf8_decode(&utf8) == 0x00F6);
|
||
|
||
TEST_CHECK(guf_utf8_encode(&utf8, 0x00D6)); // "Ö" (Latin Capital Letter O with Diaeresis)
|
||
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 2);
|
||
TEST_CHECK(utf8.bytes[0] == '\xC3' && utf8.bytes[1] == '\x96');
|
||
TEST_CHECK(utf8.bytes[2] == '\0');
|
||
TEST_CHECK(guf_utf8_decode(&utf8) == 0x00D6);
|
||
|
||
TEST_CHECK(guf_utf8_encode(&utf8, 0x00FC)); // "ü" (Latin Small Letter U with Diaeresis)
|
||
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 2);
|
||
TEST_CHECK(utf8.bytes[0] == '\xC3' && utf8.bytes[1] == '\xBC');
|
||
TEST_CHECK(utf8.bytes[2] == '\0');
|
||
TEST_CHECK(guf_utf8_decode(&utf8) == 0x00FC);
|
||
|
||
TEST_CHECK(guf_utf8_encode(&utf8, 0x00B5)); // "µ" (Micro Sign)
|
||
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 2);
|
||
TEST_CHECK(utf8.bytes[0] == '\xC2' && utf8.bytes[1] == '\xB5');
|
||
TEST_CHECK(utf8.bytes[2] == '\0');
|
||
TEST_CHECK(guf_utf8_decode(&utf8) == 0x00B5);
|
||
|
||
TEST_CHECK(guf_utf8_encode(&utf8, 0x030A)); // "◌̊" (Combining Ring Above)
|
||
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 2);
|
||
TEST_CHECK(utf8.bytes[0] == '\xCC' && utf8.bytes[1] == '\x8A');
|
||
TEST_CHECK(utf8.bytes[2] == '\0');
|
||
TEST_CHECK(guf_utf8_decode(&utf8) == 0x030A);
|
||
|
||
// 3 byte characters:
|
||
TEST_CHECK(guf_utf8_encode(&utf8, 0x7121)); // "無" (Nothingness; CJK Unified Ideograph-7121)
|
||
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 3);
|
||
TEST_CHECK(!guf_utf8_equal(&utf8, &GUF_UTF8_REPLACEMENT_CHAR));
|
||
TEST_CHECK(utf8.bytes[0] == '\xE7' && utf8.bytes[1] == '\x84' && utf8.bytes[2] == '\xA1');
|
||
TEST_CHECK(utf8.bytes[3] == '\0');
|
||
TEST_CHECK(guf_utf8_decode(&utf8) == 0x7121);
|
||
|
||
TEST_CHECK(guf_utf8_encode(&utf8, 0x201E)); // "„" (Double Low-9 Quotation Mark)
|
||
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 3);
|
||
TEST_CHECK(!guf_utf8_equal(&utf8, &GUF_UTF8_REPLACEMENT_CHAR));
|
||
TEST_CHECK(utf8.bytes[0] == '\xE2' && utf8.bytes[1] == '\x80' && utf8.bytes[2] == '\x9E');
|
||
TEST_CHECK(utf8.bytes[3] == '\0');
|
||
TEST_CHECK(guf_utf8_decode(&utf8) == 0x201E);
|
||
|
||
TEST_CHECK(guf_utf8_encode(&utf8, 0x20AC)); // "€" (Euro Sign)
|
||
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 3);
|
||
TEST_CHECK(!guf_utf8_equal(&utf8, &GUF_UTF8_REPLACEMENT_CHAR));
|
||
TEST_CHECK(utf8.bytes[0] == '\xE2' && utf8.bytes[1] == '\x82' && utf8.bytes[2] == '\xAC');
|
||
TEST_CHECK(utf8.bytes[3] == '\0');
|
||
TEST_CHECK(guf_utf8_decode(&utf8) == 0x20AC);
|
||
|
||
TEST_CHECK(guf_utf8_encode(&utf8, 0xFC51)); // "ﱑ" (Arabic Ligature Heh with Jeem Isolated Form)
|
||
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 3);
|
||
TEST_CHECK(!guf_utf8_equal(&utf8, &GUF_UTF8_REPLACEMENT_CHAR));
|
||
TEST_CHECK(utf8.bytes[0] == '\xEF' && utf8.bytes[1] == '\xB1' && utf8.bytes[2] == '\x91');
|
||
TEST_CHECK(utf8.bytes[3] == '\0');
|
||
TEST_CHECK(guf_utf8_decode(&utf8) == 0xFC51);
|
||
|
||
TEST_CHECK(guf_utf8_encode(&utf8, 0x1AA3)); // "᪣" (Tai Tham Sign Keow)
|
||
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 3);
|
||
TEST_CHECK(!guf_utf8_equal(&utf8, &GUF_UTF8_REPLACEMENT_CHAR));
|
||
TEST_CHECK(utf8.bytes[0] == '\xE1' && utf8.bytes[1] == '\xAA' && utf8.bytes[2] == '\xA3');
|
||
TEST_CHECK(utf8.bytes[3] == '\0');
|
||
TEST_CHECK(guf_utf8_decode(&utf8) == 0x1AA3);
|
||
|
||
TEST_CHECK(guf_utf8_encode(&utf8, GUF_UTF8_REPLACEMENT_CHAR_CODEPOINT)); // "<22>" (Replacement Character)
|
||
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 3);
|
||
TEST_CHECK(utf8.bytes[0] == '\xEF' && utf8.bytes[1] == '\xBF' && utf8.bytes[2] == '\xBD');
|
||
TEST_CHECK(utf8.bytes[3] == '\0');
|
||
TEST_CHECK(guf_utf8_equal(&utf8, &GUF_UTF8_REPLACEMENT_CHAR));
|
||
TEST_CHECK(guf_utf8_decode(&utf8) == GUF_UTF8_REPLACEMENT_CHAR_CODEPOINT);
|
||
|
||
// 4 byte characters:
|
||
TEST_CHECK(guf_utf8_encode(&utf8, 0x1F308)); // "🌈" (Rainbow)
|
||
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 4);
|
||
TEST_CHECK(utf8.bytes[0] == '\xF0' && utf8.bytes[1] == '\x9F' && utf8.bytes[2] == '\x8C' && utf8.bytes[3] == '\x88');
|
||
TEST_CHECK(utf8.bytes[4] == '\0');
|
||
TEST_CHECK(guf_utf8_decode(&utf8) == 0x1F308);
|
||
|
||
TEST_CHECK(guf_utf8_encode(&utf8, 0x130B8)); // "𓂸" (Egyptian Hieroglyph D052)
|
||
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 4);
|
||
TEST_CHECK(utf8.bytes[0] == '\xF0' && utf8.bytes[1] == '\x93' && utf8.bytes[2] == '\x82' && utf8.bytes[3] == '\xB8');
|
||
TEST_CHECK(utf8.bytes[4] == '\0');
|
||
TEST_CHECK(guf_utf8_decode(&utf8) == 0x130B8);
|
||
|
||
TEST_CHECK(guf_utf8_encode(&utf8, 0x1F97A)); // "🥺" (Face with Pleading Eyes)
|
||
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 4);
|
||
TEST_CHECK(utf8.bytes[0] == '\xF0' && utf8.bytes[1] == '\x9F' && utf8.bytes[2] == '\xA5' && utf8.bytes[3] == '\xBA');
|
||
TEST_CHECK(utf8.bytes[4] == '\0');
|
||
TEST_CHECK(guf_utf8_decode(&utf8) == 0x1F97A);
|
||
|
||
TEST_CHECK(guf_utf8_encode(&utf8, 0x1F980)); // "🦀" (Crab)
|
||
TEST_CHECK(guf_utf8_char_num_bytes(&utf8) == 4);
|
||
TEST_CHECK(utf8.bytes[0] == '\xF0' && utf8.bytes[1] == '\x9F' && utf8.bytes[2] == '\xA6' && utf8.bytes[3] == '\x80');
|
||
TEST_CHECK(utf8.bytes[4] == '\0');
|
||
TEST_CHECK(guf_utf8_decode(&utf8) == 0x1F980);
|
||
|
||
// Invalid characters:
|
||
utf8 = {.bytes = {'\xC0', '\x80', 0, 0, 0}};
|
||
TEST_CHECK(guf_utf8_decode(&utf8) < 0);
|
||
|
||
utf8 = {.bytes = {'\xC0', 0, 0, 0, 0}};
|
||
TEST_CHECK(guf_utf8_decode(&utf8) < 0);
|
||
|
||
utf8 = {.bytes = {'\x80', 0, 0, 0, 0}};
|
||
TEST_CHECK(guf_utf8_decode(&utf8) < 0);
|
||
|
||
// "The definition of UTF-8 prohibits encoding character numbers between U+D800 and U+DFFF" (surrogate pairs).
|
||
TEST_CHECK(!guf_utf8_encode(&utf8, 0xD800));
|
||
TEST_CHECK(guf_utf8_equal(&utf8, &GUF_UTF8_REPLACEMENT_CHAR));
|
||
TEST_CHECK(guf_utf8_decode(&utf8) == GUF_UTF8_REPLACEMENT_CHAR_CODEPOINT);
|
||
|
||
TEST_CHECK(!guf_utf8_encode(&utf8, 0xDFFF));
|
||
TEST_CHECK(guf_utf8_equal(&utf8, &GUF_UTF8_REPLACEMENT_CHAR));
|
||
TEST_CHECK(guf_utf8_decode(&utf8) == GUF_UTF8_REPLACEMENT_CHAR_CODEPOINT);
|
||
|
||
TEST_CHECK(!guf_utf8_encode(&utf8, 0xDA00));
|
||
TEST_CHECK(guf_utf8_equal(&utf8, &GUF_UTF8_REPLACEMENT_CHAR));
|
||
TEST_CHECK(guf_utf8_decode(&utf8) == GUF_UTF8_REPLACEMENT_CHAR_CODEPOINT);
|
||
|
||
char buf[] = {'\x2F', '\xC0', '\xAE', '\x2E', '\x2F'};
|
||
guf_str_view input_str = {.str = buf, .len = GUF_ARR_SIZE(buf)};
|
||
guf_utf8_char ch = {};
|
||
int valid_chars = 0, invalid_chars = 0;
|
||
for (guf_utf8_stat stat = guf_utf8_char_next(&ch, &input_str); stat != GUF_UTF8_READ_DONE; stat = guf_utf8_char_next(&ch, &input_str)) {
|
||
if (stat == GUF_UTF8_READ_VALID) {
|
||
++valid_chars;
|
||
} else {
|
||
++invalid_chars;
|
||
}
|
||
}
|
||
TEST_CHECK(invalid_chars == 2 && valid_chars == 3);
|
||
|
||
char buf2[] = {'\xE0', '\x80', 'a', 'b', 'c'}; // 1 invalid 3-byte-character, 2 valid 1-byte-characters
|
||
input_str = {.str = buf2, .len = GUF_ARR_SIZE(buf2)};
|
||
ch = {};
|
||
valid_chars = invalid_chars = 0;
|
||
for (guf_utf8_stat stat = guf_utf8_char_next(&ch, &input_str); stat != GUF_UTF8_READ_DONE; stat = guf_utf8_char_next(&ch, &input_str)) {
|
||
if (stat == GUF_UTF8_READ_VALID) {
|
||
// printf("%s", ch.bytes);
|
||
++valid_chars;
|
||
} else {
|
||
// printf("%s", GUF_UTF8_REPLACEMENT_CHAR.bytes);
|
||
++invalid_chars;
|
||
}
|
||
}
|
||
TEST_CHECK(invalid_chars == 1 && valid_chars == 2);
|
||
}
|