Add utf-8 handling
This commit is contained in:
parent
60e2849b01
commit
d450cd8a45
@ -40,6 +40,9 @@ endif ()
|
|||||||
|
|
||||||
set_target_properties(libguf_example libguf_test PROPERTIES DEBUG_POSTFIX ${CMAKE_DEBUG_POSTFIX})
|
set_target_properties(libguf_example libguf_test PROPERTIES DEBUG_POSTFIX ${CMAKE_DEBUG_POSTFIX})
|
||||||
|
|
||||||
|
target_compile_definitions(libguf_test PUBLIC TEST_DATA_DIR="${CMAKE_CURRENT_SOURCE_DIR}/src/test/data/")
|
||||||
|
|
||||||
|
|
||||||
include(CheckIPOSupported)
|
include(CheckIPOSupported)
|
||||||
check_ipo_supported(RESULT ipo_available)
|
check_ipo_supported(RESULT ipo_available)
|
||||||
if (ipo_available AND (CMAKE_BUILD_TYPE STREQUAL "Release" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo"))
|
if (ipo_available AND (CMAKE_BUILD_TYPE STREQUAL "Release" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo"))
|
||||||
@ -51,7 +54,6 @@ endif()
|
|||||||
|
|
||||||
if (TARGET libguf_test)
|
if (TARGET libguf_test)
|
||||||
message(STATUS "Configure libguf_test...")
|
message(STATUS "Configure libguf_test...")
|
||||||
|
|
||||||
target_compile_options(libguf_test PRIVATE ${WARNING_FLAGS_CXX} $<$<CONFIG:Debug>: ${DBG_FLAGS}>)
|
target_compile_options(libguf_test PRIVATE ${WARNING_FLAGS_CXX} $<$<CONFIG:Debug>: ${DBG_FLAGS}>)
|
||||||
target_link_options(libguf_test PRIVATE ${WARNING_FLAGS_CXX} $<$<CONFIG:Debug>: ${DBG_FLAGS}> )
|
target_link_options(libguf_test PRIVATE ${WARNING_FLAGS_CXX} $<$<CONFIG:Debug>: ${DBG_FLAGS}> )
|
||||||
|
|
||||||
|
|||||||
@ -36,7 +36,7 @@ typedef struct guf_str {
|
|||||||
|
|
||||||
typedef struct guf_str_view {
|
typedef struct guf_str_view {
|
||||||
const char *str;
|
const char *str;
|
||||||
size_t len;
|
ptrdiff_t len;
|
||||||
} guf_str_view;
|
} guf_str_view;
|
||||||
|
|
||||||
#define GUF_CSTR_TO_VIEW(CSTR) ((guf_str_view){.str = (CSTR), .len = strlen((CSTR))})
|
#define GUF_CSTR_TO_VIEW(CSTR) ((guf_str_view){.str = (CSTR), .len = strlen((CSTR))})
|
||||||
|
|||||||
127
src/guf_utf8.h
127
src/guf_utf8.h
@ -7,17 +7,30 @@
|
|||||||
#ifndef GUF_UTF8_H
|
#ifndef GUF_UTF8_H
|
||||||
#define GUF_UTF8_H
|
#define GUF_UTF8_H
|
||||||
#include "guf_common.h"
|
#include "guf_common.h"
|
||||||
|
#include "guf_str.h"
|
||||||
|
|
||||||
typedef struct guf_utf8_char {
|
typedef struct guf_utf8_char {
|
||||||
unsigned char bytes[4];
|
char bytes[5];
|
||||||
} guf_utf8_char;
|
} guf_utf8_char;
|
||||||
|
|
||||||
|
/* Status codes returned by guf_utf8_char_next() for each read attempt. */
typedef enum guf_utf8_stat {
|
||||||
|
GUF_UTF8_READ_DONE, /* The string view was empty (len <= 0 or NULL pointer); nothing was read. */
|
||||||
|
GUF_UTF8_READ_VALID, /* A complete sequence was read and guf_utf8_char_is_valid() accepted it. */
|
||||||
|
GUF_UTF8_READ_INVALID, /* The lead byte has no valid sequence length, or the read sequence failed validation. */
|
||||||
|
GUF_UTF8_READ_TRUNCATED, /* The input ended before all bytes announced by the lead byte could be read. */
|
||||||
|
} guf_utf8_stat;
|
||||||
|
|
||||||
static inline bool guf_char_is_ascii(int c) {return c <= 0 && c <= 127;}
|
/* Return true iff c lies in the 7-bit ASCII range [0, 127]; negative values (e.g. sign-extended chars) are rejected. */
static inline bool guf_char_is_ascii(int c) {return c >= 0 && c <= 127;}
|
||||||
static inline bool guf_uchar_is_ascii(unsigned char c) {return c <= 127;}
|
/* Return true iff the byte c has its high bit clear, i.e. it is a 7-bit ASCII value. */
static inline bool guf_uchar_is_ascii(unsigned char c)
{
    return c < 128;
}
|
||||||
|
|
||||||
GUF_FN_KEYWORDS int guf_utf8_num_bytes(unsigned char c);
|
GUF_FN_KEYWORDS int guf_utf8_num_bytes(unsigned char c);
|
||||||
|
GUF_FN_KEYWORDS int guf_utf8_char_num_bytes(guf_utf8_char *c);
|
||||||
|
|
||||||
|
GUF_FN_KEYWORDS guf_utf8_char guf_utf8_char_new(const char *bytes, int num_bytes);
|
||||||
GUF_FN_KEYWORDS bool guf_utf8_char_is_valid(const guf_utf8_char *c);
|
GUF_FN_KEYWORDS bool guf_utf8_char_is_valid(const guf_utf8_char *c);
|
||||||
|
GUF_FN_KEYWORDS bool guf_utf8_char_is_whitespace(const guf_utf8_char *c);
|
||||||
|
|
||||||
|
GUF_FN_KEYWORDS guf_utf8_stat guf_utf8_char_next(guf_utf8_char *res, guf_str_view *str);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -26,6 +39,46 @@
|
|||||||
#include "guf_common.h"
|
#include "guf_common.h"
|
||||||
#include "guf_assert.h"
|
#include "guf_assert.h"
|
||||||
|
|
||||||
|
/*
 * Read the next UTF-8 encoded character from the front of *str into *res and
 * advance the view past every byte that was consumed.
 *
 * res: receives the bytes of the character; all unused trailing bytes are set
 *      to '\0', so res->bytes can be printed as a C string.
 * str: view to read from; str->str is advanced and str->len decremented for
 *      each consumed byte. Once the view is exhausted, str->str is set to NULL.
 *
 * Returns:
 *   GUF_UTF8_READ_DONE       the view was empty (len <= 0 or NULL); nothing was consumed.
 *   GUF_UTF8_READ_VALID      a complete sequence was read and passed guf_utf8_char_is_valid().
 *   GUF_UTF8_READ_INVALID    the lead byte has no sequence length (only that byte is consumed),
 *                            a non-continuation byte interrupted the sequence, or the
 *                            completed sequence failed validation.
 *   GUF_UTF8_READ_TRUNCATED  the view ended before the sequence was complete.
 */
GUF_FN_KEYWORDS guf_utf8_stat guf_utf8_char_next(guf_utf8_char *res, guf_str_view *str)
{
    GUF_ASSERT_RELEASE(res);
    GUF_ASSERT_RELEASE(str);

    if (str->len <= 0 || str->str == NULL) {
        return GUF_UTF8_READ_DONE;
    }

    // Always consume the lead byte, even if it turns out to be invalid, so callers make progress.
    int consumed = 0;
    res->bytes[consumed++] = str->str[0];
    str->len--;
    str->str = str->len ? str->str + 1 : NULL;

    // Clear the remaining bytes (including the terminator slot).
    for (size_t i = 1; i < GUF_STATIC_BUF_SIZE(res->bytes); ++i) {
        res->bytes[i] = '\0';
    }

    const int num_bytes = guf_utf8_char_num_bytes(res);

    if (!num_bytes) {
        return GUF_UTF8_READ_INVALID;
    }

    while (consumed < num_bytes && str->len > 0) {
        // Only bytes of the form 10xxxxxx can continue a sequence (cf. RFC 3629, section 3).
        // Anything else starts a new character and must stay in the view, otherwise one
        // broken sequence would also swallow the valid character(s) that follow it.
        const unsigned char peek = (unsigned char)str->str[0];
        if ((peek & 0xC0) != 0x80) {
            return GUF_UTF8_READ_INVALID;
        }
        res->bytes[consumed++] = str->str[0];
        str->len--;
        str->str = str->len ? str->str + 1 : NULL;
    }

    if (consumed < num_bytes) {
        return GUF_UTF8_READ_TRUNCATED;
    } else if (guf_utf8_char_is_valid(res)) {
        return GUF_UTF8_READ_VALID;
    } else {
        return GUF_UTF8_READ_INVALID;
    }
}
|
||||||
|
|
||||||
|
|
||||||
// cf. https://www.rfc-editor.org/rfc/rfc3629#page-4
|
// cf. https://www.rfc-editor.org/rfc/rfc3629#page-4
|
||||||
GUF_FN_KEYWORDS int guf_utf8_num_bytes(unsigned char c)
|
GUF_FN_KEYWORDS int guf_utf8_num_bytes(unsigned char c)
|
||||||
{
|
{
|
||||||
@ -42,6 +95,13 @@ GUF_FN_KEYWORDS int guf_utf8_num_bytes(unsigned char c)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * Return the sequence length that the first byte of *c announces, as computed
 * by guf_utf8_num_bytes(). Only c->bytes[0] is inspected; the remaining bytes
 * are not checked here.
 */
GUF_FN_KEYWORDS int guf_utf8_char_num_bytes(guf_utf8_char *c)
{
    GUF_ASSERT(c);

    // bytes[] holds plain char; make the conversion to the unsigned parameter explicit.
    const unsigned char lead = (unsigned char)c->bytes[0];
    return guf_utf8_num_bytes(lead);
}
|
||||||
|
|
||||||
|
|
||||||
GUF_FN_KEYWORDS bool guf_utf8_char_is_valid(const guf_utf8_char *c)
|
GUF_FN_KEYWORDS bool guf_utf8_char_is_valid(const guf_utf8_char *c)
|
||||||
{
|
{
|
||||||
const int num_bytes = guf_utf8_num_bytes(c->bytes[0]);
|
const int num_bytes = guf_utf8_num_bytes(c->bytes[0]);
|
||||||
@ -50,9 +110,11 @@ GUF_FN_KEYWORDS bool guf_utf8_char_is_valid(const guf_utf8_char *c)
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const unsigned char *bytes = (const unsigned char*)c->bytes;
|
||||||
|
|
||||||
for (int i = 0; i < num_bytes; ++i) {
|
for (int i = 0; i < num_bytes; ++i) {
|
||||||
// "The octet values C0, C1, F5 to FF never appear.", cf. https://www.rfc-editor.org/rfc/rfc3629#page-5
|
// "The octet values C0, C1, F5 to FF never appear.", cf. https://www.rfc-editor.org/rfc/rfc3629#page-5
|
||||||
if (c->bytes[i] == 0xC0 || c->bytes[i] == 0xC1 || (c->bytes[i] >= 0xF5 && c->bytes[i] <= 0xFF)) {
|
if (bytes[i] == 0xC0 || bytes[i] == 0xC1 || (bytes[i] >= 0xF5 && bytes[i] <= 0xFF)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -63,37 +125,35 @@ GUF_FN_KEYWORDS bool guf_utf8_char_is_valid(const guf_utf8_char *c)
|
|||||||
// cf. https://datatracker.ietf.org/doc/html/rfc3629#page-5
|
// cf. https://datatracker.ietf.org/doc/html/rfc3629#page-5
|
||||||
switch (num_bytes)
|
switch (num_bytes)
|
||||||
{
|
{
|
||||||
case 1:
|
case 1:
|
||||||
GUF_ASSERT(c->bytes[0] <= 0x7F);
|
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
case 2:
|
case 2:
|
||||||
GUF_ASSERT(c->bytes[0] >= 0xC2 && c->bytes[0] <= 0xDF);
|
return guf_valid_tail(bytes[1]);
|
||||||
return guf_valid_tail(c->bytes[1]);
|
|
||||||
|
|
||||||
case 3:
|
case 3:
|
||||||
if ((c->bytes[0] == 0xE0) && (c->bytes[1] >= 0xA0 && c->bytes[1] <= 0xBF) && guf_valid_tail(c->bytes[2])) {
|
if ((bytes[0] == 0xE0) && (bytes[1] >= 0xA0 && bytes[1] <= 0xBF) && guf_valid_tail(bytes[2])) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
if ((c->bytes[0] >= 0xE1 && c->bytes[0] <= 0xEC) && guf_valid_tail(c->bytes[1]) && guf_valid_tail(c->bytes[2])) {
|
if ((bytes[0] >= 0xE1 && bytes[0] <= 0xEC) && guf_valid_tail(bytes[1]) && guf_valid_tail(bytes[2])) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
if ((c->bytes[0] == 0xED) && (c->bytes[1] >= 0x80 && c->bytes[1] <= 0x9F) && guf_valid_tail(c->bytes[2])) {
|
if ((bytes[0] == 0xED) && (bytes[1] >= 0x80 && bytes[1] <= 0x9F) && guf_valid_tail(bytes[2])) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
if ((c->bytes[0] >= 0xEE && c->bytes[0] <= 0xEF) && guf_valid_tail(c->bytes[1]) && guf_valid_tail(c->bytes[2])) {
|
if ((bytes[0] >= 0xEE && bytes[0] <= 0xEF) && guf_valid_tail(bytes[1]) && guf_valid_tail(bytes[2])) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
case 4:
|
case 4:
|
||||||
if ((c->bytes[0] == 0xF0) && (c->bytes[1] >= 0x90 && c->bytes[1] <= 0xBF) && guf_valid_tail(c->bytes[2]) && guf_valid_tail(c->bytes[3])) {
|
if ((bytes[0] == 0xF0) && (bytes[1] >= 0x90 && bytes[1] <= 0xBF) && guf_valid_tail(bytes[2]) && guf_valid_tail(bytes[3])) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
if ((c->bytes[0] >= 0xF1 && c->bytes[0] <= 0xF3) && guf_valid_tail(c->bytes[1]) && guf_valid_tail(c->bytes[2]) && guf_valid_tail(c->bytes[3])) {
|
if ((bytes[0] >= 0xF1 && bytes[0] <= 0xF3) && guf_valid_tail(bytes[1]) && guf_valid_tail(bytes[2]) && guf_valid_tail(bytes[3])) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
if ((c->bytes[0] == 0xF4) && (c->bytes[1] >= 0x80 && c->bytes[1] <= 0x8F) && guf_valid_tail(c->bytes[2]) && guf_valid_tail(c->bytes[3])) {
|
if ((bytes[0] == 0xF4) && (bytes[1] >= 0x80 && bytes[1] <= 0x8F) && guf_valid_tail(bytes[2]) && guf_valid_tail(bytes[3])) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
@ -101,10 +161,49 @@ GUF_FN_KEYWORDS bool guf_utf8_char_is_valid(const guf_utf8_char *c)
|
|||||||
default:
|
default:
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
#undef guf_valid_tail
|
#undef guf_valid_tail
|
||||||
}
|
}
|
||||||
|
|
||||||
|
GUF_FN_KEYWORDS bool guf_utf8_char_is_whitespace(const guf_utf8_char *c)
|
||||||
|
{
|
||||||
|
// cf. https://en.wikipedia.org/wiki/Whitespace_character#Unicode (last-retrieved 2025-02-27)
|
||||||
|
const char *ws_one_byte[] = {" ", "\n", "\t", "\t", "\v", "\f"};
|
||||||
|
const char *ws_two_bytes[] = {"\xC2\x85", "\xC2\xA0"};
|
||||||
|
const char *ws_three_bytes[] = {"\xE1\x9A\x80", "\xE2\x80\x80", "\xE2\x80\x81", "\xE2\x80\x82", "\xE2\x80\x83", "\xE2\x80\x84", "\xE2\x80\x85", "\xE2\x80\x86", "\xE2\x80\x87", "\xE2\x80\x88", "\xE2\x80\x89", "\xE2\x80\x8A", "\xE2\x80\xA8", "\xE2\x80\xA9", "\xE2\x80\xAF", "\xE2\x81\x9F", "\xE3\x80\x80"};
|
||||||
|
|
||||||
|
const int num_bytes = guf_utf8_num_bytes(c->bytes[0]);
|
||||||
|
|
||||||
|
switch (num_bytes)
|
||||||
|
{
|
||||||
|
case 1:
|
||||||
|
for (size_t i = 0; i < GUF_STATIC_BUF_SIZE(ws_one_byte); ++i) {
|
||||||
|
if (c->bytes[0] == ws_one_byte[i][0]) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
|
||||||
|
case 2:
|
||||||
|
for (size_t i = 0; i < GUF_STATIC_BUF_SIZE(ws_two_bytes); ++i) {
|
||||||
|
if (c->bytes[0] == ws_two_bytes[i][0] && c->bytes[1] == ws_two_bytes[i][1]) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
|
||||||
|
case 3:
|
||||||
|
for (size_t i = 0; i < GUF_STATIC_BUF_SIZE(ws_three_bytes); ++i) {
|
||||||
|
if (c->bytes[0] == ws_three_bytes[i][0] && c->bytes[1] == ws_three_bytes[i][1] && c->bytes[2] == ws_three_bytes[i][2]) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
|
||||||
|
default:
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#undef GUF_FN_KEYWORDS
|
#undef GUF_FN_KEYWORDS
|
||||||
|
|||||||
@ -1,13 +0,0 @@
|
|||||||
„Ich weiß nicht“, rief ich ohne Klang „ich weiß ja nicht. Wenn
|
|
||||||
niemand kommt, dann kommt eben niemand. Ich habe niemandem etwas
|
|
||||||
Böses getan, niemand hat mir etwas Böses getan, niemand aber will
|
|
||||||
mir helfen. Lauter niemand. Aber so ist es doch nicht. Nur daß mir
|
|
||||||
niemand hilft —, sonst wäre lauter niemand hübsch. Ich würde ganz
|
|
||||||
gern — warum denn nicht — einen Ausflug mit einer Gesellschaft von
|
|
||||||
lauter Niemand machen. Natürlich ins Gebirge, wohin denn sonst? Wie
|
|
||||||
sich diese Niemand aneinander drängen, diese vielen quer gestreckten
|
|
||||||
und eingehängten Arme, diese vielen Füße, durch winzige Schritte
|
|
||||||
getrennt! Versteht sich, daß alle in Frack sind. Wir gehen so lala,
|
|
||||||
der Wind fährt durch die Lücken, die wir und unsere Gliedmaßen offen
|
|
||||||
lassen. Die Hälse werden im Gebirge frei! Es ist ein Wunder, daß
|
|
||||||
wir nicht singen.“
|
|
||||||
49
src/test/data/utf8-test.txt
Normal file
49
src/test/data/utf8-test.txt
Normal file
@ -0,0 +1,49 @@
|
|||||||
|
„Ich weiß nicht“, rief ich ohne Klang „ich weiß ja nicht. Wenn
|
||||||
|
niemand kommt, dann kommt eben niemand. Ich habe niemandem etwas
|
||||||
|
Böses getan, niemand hat mir etwas Böses getan, niemand aber will
|
||||||
|
mir helfen. Lauter niemand. Aber so ist es doch nicht. Nur daß mir
|
||||||
|
niemand hilft —, sonst wäre lauter niemand hübsch. Ich würde ganz
|
||||||
|
gern — warum denn nicht — einen Ausflug mit einer Gesellschaft von
|
||||||
|
lauter Niemand machen. Natürlich ins Gebirge, wohin denn sonst? Wie
|
||||||
|
sich diese Niemand aneinander drängen, diese vielen quer gestreckten
|
||||||
|
und eingehängten Arme, diese vielen Füße, durch winzige Schritte
|
||||||
|
getrennt! Versteht sich, daß alle in Frack sind. Wir gehen so lala,
|
||||||
|
der Wind fährt durch die Lücken, die wir und unsere Gliedmaßen offen
|
||||||
|
lassen. Die Hälse werden im Gebirge frei! Es ist ein Wunder, daß
|
||||||
|
wir nicht singen.“
|
||||||
|
|
||||||
|
Det var i den Tid, jeg gik omkring og sulted i Kristiania, denne forunderlige By,
|
||||||
|
som ingen forlader, før han har fået Mærker af den . . . .
|
||||||
|
Jeg ligger vågen på min Kvist og hører en Klokke nedenunder mig slå seks Slag; det var allerede ganske lyst,
|
||||||
|
og Folk begyndte at færdes op og ned i Trapperne. Nede ved Døren, hvor mit Rum var tapetseret med gamle Numre
|
||||||
|
af »Morgenbladet«, kunde jeg så tydelig se en Bekendtgørelse fra Fyrdirektøren, og lidt tilvenstre derfra et fedt,
|
||||||
|
bugnende Avertissement fra Bager Fabian Olsen om nybagt Brød.
|
||||||
|
|
||||||
|
The quick brown fox jumps over the lazy dog.
|
||||||
|
|
||||||
|
Quizdeltagerne spiste jordbær med fløde, mens cirkusklovnen Wolther spillede på xylofon.
|
||||||
|
|
||||||
|
Falsches Üben von Xylophonmusik quält jeden größeren Zwerg.
|
||||||
|
|
||||||
|
Ξεσκεπάζω τὴν ψυχοφθόρα βδελυγμία.
|
||||||
|
|
||||||
|
El pingüino Wenceslao hizo kilómetros bajo exhaustiva lluvia y frío, añoraba a su querido cachorro.
|
||||||
|
|
||||||
|
Le cœur déçu mais l'âme plutôt naïve, Louÿs rêva de crapaüter en
|
||||||
|
canoë au delà des îles, près du mälström où brûlent les novæ.
|
||||||
|
|
||||||
|
D'fhuascail Íosa, Úrmhac na hÓighe Beannaithe, pór Éava agus Ádhaimh.
|
||||||
|
|
||||||
|
Árvíztűrő tükörfúrógép.
|
||||||
|
|
||||||
|
Pchnąć w tę łódź jeża lub ośm skrzyń fig.
|
||||||
|
|
||||||
|
Kæmi ný öxi hér ykist þjófum nú bæði víl og ádrepa.
|
||||||
|
|
||||||
|
В чащах юга жил бы цитрус? Да, но фальшивый экземпляр!
|
||||||
|
|
||||||
|
Pijamalı hasta, yağız şoföre çabucak güvendi.
|
||||||
|
|
||||||
|
ᚠᛇᚻ᛫ᛒᛦᚦ᛫ᚠᚱᚩᚠᚢᚱ᛫ᚠᛁᚱᚪ᛫ᚷᛖᚻᚹᛦᛚᚳᚢᛗ
|
||||||
|
ᛋᚳᛖᚪᛚ᛫ᚦᛖᚪᚻ᛫ᛗᚪᚾᚾᚪ᛫ᚷᛖᚻᚹᛦᛚᚳ᛫ᛗᛁᚳᛚᚢᚾ᛫ᚻᛦᛏ᛫ᛞᚫᛚᚪᚾ
|
||||||
|
ᚷᛁᚠ᛫ᚻᛖ᛫ᚹᛁᛚᛖ᛫ᚠᚩᚱ᛫ᛞᚱᛁᚻᛏᚾᛖ᛫ᛞᚩᛗᛖᛋ᛫ᚻᛚᛇᛏᚪᚾ᛬
|
||||||
@ -6,8 +6,8 @@
|
|||||||
#define GUF_IMPL
|
#define GUF_IMPL
|
||||||
#include "guf_dbuf.h"
|
#include "guf_dbuf.h"
|
||||||
|
|
||||||
#define GUF_CNT_NAME dbuf_uchar
|
#define GUF_CNT_NAME dbuf_char
|
||||||
#define GUF_T uchar
|
#define GUF_T char
|
||||||
#define GUF_T_IS_INTEGRAL_TYPE
|
#define GUF_T_IS_INTEGRAL_TYPE
|
||||||
#define GUF_IMPL
|
#define GUF_IMPL
|
||||||
#include "guf_dbuf.h"
|
#include "guf_dbuf.h"
|
||||||
|
|||||||
@ -10,8 +10,8 @@
|
|||||||
|
|
||||||
typedef unsigned char uchar;
|
typedef unsigned char uchar;
|
||||||
|
|
||||||
#define GUF_CNT_NAME dbuf_uchar
|
#define GUF_CNT_NAME dbuf_char
|
||||||
#define GUF_T uchar
|
#define GUF_T char
|
||||||
#define GUF_T_IS_INTEGRAL_TYPE
|
#define GUF_T_IS_INTEGRAL_TYPE
|
||||||
#include "guf_dbuf.h"
|
#include "guf_dbuf.h"
|
||||||
|
|
||||||
|
|||||||
@ -16,7 +16,7 @@ struct DictCstrToIntTest : public Test
|
|||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
||||||
dbuf_uchar text_buf {};
|
dbuf_char text_buf {};
|
||||||
std::vector<char> text_vec {};
|
std::vector<char> text_vec {};
|
||||||
|
|
||||||
void insert_lookup()
|
void insert_lookup()
|
||||||
@ -25,34 +25,22 @@ struct DictCstrToIntTest : public Test
|
|||||||
dict_cstr_int word_cnt_dict {};
|
dict_cstr_int word_cnt_dict {};
|
||||||
dict_cstr_int_init(&word_cnt_dict, &guf_allocator_libc);
|
dict_cstr_int_init(&word_cnt_dict, &guf_allocator_libc);
|
||||||
|
|
||||||
ptrdiff_t len = 0;
|
ptrdiff_t valid_chars = 0, invalid_chars = 0, bytes = text_buf.size;
|
||||||
|
guf_str_view input_str = {.str = text_buf.data, .len = text_buf.size};
|
||||||
|
guf_utf8_char ch = {};
|
||||||
|
|
||||||
for (dbuf_uchar_iter it = dbuf_uchar_begin(&text_buf); !dbuf_uchar_iter_is_end(&text_buf, it); it = dbuf_uchar_iter_next(&text_buf, it, 1)) {
|
for (guf_utf8_stat stat = guf_utf8_char_next(&ch, &input_str); stat != GUF_UTF8_READ_DONE; stat = guf_utf8_char_next(&ch, &input_str)) {
|
||||||
const unsigned char c = *it.ptr;
|
if (stat == GUF_UTF8_READ_VALID) {
|
||||||
guf_utf8_char utf8_c = {.bytes = {c, 0, 0, 0}};
|
++valid_chars;
|
||||||
const int num_bytes = guf_utf8_num_bytes(c);
|
printf("%s", ch.bytes);
|
||||||
|
} else {
|
||||||
if (!num_bytes) {
|
++invalid_chars;
|
||||||
continue;
|
printf("::INVALID_UTF8_CHAR::");
|
||||||
}
|
|
||||||
|
|
||||||
int consumed = 1;
|
|
||||||
while (consumed < num_bytes && ((it = dbuf_uchar_iter_next(&text_buf, it, 1)), !dbuf_uchar_iter_is_end(&text_buf, it)) ) {
|
|
||||||
utf8_c.bytes[consumed++] = *it.ptr;
|
|
||||||
}
|
|
||||||
if (consumed < num_bytes) {
|
|
||||||
printf("Invalid utf-8: file is truncated\n");
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (guf_utf8_char_is_valid(&utf8_c) && utf8_c.bytes[0] != '\0') {
|
|
||||||
char str[5] {};
|
|
||||||
memcpy(str, utf8_c.bytes, num_bytes);
|
|
||||||
printf("%s", str);
|
|
||||||
++len;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
printf("\nread %td utf-8 characters\n", len);
|
TEST_CHECK(input_str.len == 0 && input_str.str == NULL);
|
||||||
|
printf("\nread %td bytes\n", bytes);
|
||||||
|
printf("read %td valid and %td invalid utf-8 characters\n", valid_chars, invalid_chars);
|
||||||
|
|
||||||
dict_cstr_int_free(&word_cnt_dict, NULL);
|
dict_cstr_int_free(&word_cnt_dict, NULL);
|
||||||
bool dbuf_null = !word_cnt_dict.kv_elems.data && !word_cnt_dict.kv_elems.allocator && !word_cnt_dict.kv_elems.capacity && !word_cnt_dict.kv_elems.size;
|
bool dbuf_null = !word_cnt_dict.kv_elems.data && !word_cnt_dict.kv_elems.allocator && !word_cnt_dict.kv_elems.capacity && !word_cnt_dict.kv_elems.size;
|
||||||
@ -61,30 +49,27 @@ struct DictCstrToIntTest : public Test
|
|||||||
|
|
||||||
bool load_file()
|
bool load_file()
|
||||||
{
|
{
|
||||||
#define TEST_DATA_DIR "/Users/joni/Desktop/libguf/src/test/data"
|
|
||||||
FILE *in_file {nullptr};
|
FILE *in_file {nullptr};
|
||||||
if (!in_file) {
|
if (!in_file) {
|
||||||
in_file = fopen(TEST_DATA_DIR "/data_01.txt", "r");
|
in_file = fopen(TEST_DATA_DIR "/utf8-test.txt", "r");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!in_file) {
|
if (!in_file) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
dbuf_uchar_init(&text_buf, 128, &guf_allocator_libc);
|
dbuf_char_init(&text_buf, 128, &guf_allocator_libc);
|
||||||
|
|
||||||
int c = EOF;
|
int c = EOF;
|
||||||
while ((c = fgetc(in_file)) != EOF) {
|
while ((c = fgetc(in_file)) != EOF) {
|
||||||
dbuf_uchar_push_val(&text_buf, (unsigned char)c);
|
dbuf_char_push_val(&text_buf, (char)c);
|
||||||
text_vec.push_back((unsigned char)c);
|
text_vec.push_back((char)c);
|
||||||
}
|
}
|
||||||
fclose(in_file);
|
fclose(in_file);
|
||||||
|
|
||||||
if (*dbuf_uchar_back(&text_buf) != '\0') {
|
// dbuf_char_insert_val(&text_buf, '\xC0', 1);
|
||||||
dbuf_uchar_push_val(&text_buf, '\0');
|
// text_vec.insert(text_vec.cbegin() + 1, '\xC0');
|
||||||
text_vec.push_back('\0');
|
|
||||||
}
|
|
||||||
|
|
||||||
return TEST_CHECK(std::ssize(text_vec) == text_buf.size);
|
return TEST_CHECK(std::ssize(text_vec) == text_buf.size);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -103,7 +88,7 @@ struct DictCstrToIntTest : public Test
|
|||||||
insert_lookup();
|
insert_lookup();
|
||||||
|
|
||||||
end:
|
end:
|
||||||
dbuf_uchar_free(&text_buf, NULL);
|
dbuf_char_free(&text_buf, NULL);
|
||||||
text_buf = {};
|
text_buf = {};
|
||||||
|
|
||||||
passed = (num_failed_checks == 0);
|
passed = (num_failed_checks == 0);
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user