Work on utf-8

This commit is contained in:
jun 2025-02-26 03:57:04 +01:00
parent 7a990c810e
commit 60e2849b01
11 changed files with 262 additions and 8 deletions

View File

@ -21,7 +21,7 @@ endif ()
add_executable(libguf_example src/test/example.c src/test/guf_dict_impl.c)
target_include_directories(libguf_example PRIVATE src src/test)
add_executable(libguf_test src/test/test.cpp src/test/guf_dbuf_impl.c src/test/guf_dict_impl.c src/test/guf_rand_impl.c src/test/guf_sort_impl.c)
add_executable(libguf_test src/test/test.cpp src/test/guf_dbuf_impl.c src/test/guf_dict_impl.c src/test/guf_rand_impl.c src/test/guf_sort_impl.c src/test/guf_utf8_impl.c)
target_include_directories(libguf_test PRIVATE src src/test)
if (NOT DEFINED MSVC)

View File

@ -687,9 +687,14 @@ guf_str_codepoint_utf8 guf_str_iterate_utf8(const guf_str *str, size_t *idx)
guf_str_codepoint_utf8 cp = {.num_bytes = 1, .bytes = {'\0', '\0', '\0', '\0', '\0'}, .valid = true};
const unsigned char four_bytes_mask = 240;
const unsigned char three_bytes_mask = 224;
const unsigned char two_bytes_mask = 192;
const unsigned char four_bytes_val = 240; // 0b1111.0xxx
const unsigned char four_bytes_mask = 248; // 0b1111.1000
const unsigned char three_bytes_val = 224; // 0b1110.xxxx
const unsigned char three_bytes_mask = 240; // 0b1111.0000
const unsigned char two_bytes_val = 192; // 0b110x.xxxx
const unsigned char two_bytes_mask = 224 ; // 0b1110.0000
size_t i = *idx;
if (guf_str_char_is_ascii(c_str[i])) {
@ -732,7 +737,7 @@ guf_str_codepoint_utf8 guf_str_iterate_utf8(const guf_str *str, size_t *idx)
size_t id = i + j;
assert(id < len);
unsigned char byte = c_str[id];
if (byte >= 128 && byte < 192) { // Binary: 10......
if (byte >= 128 && byte < 192) { // 0b10xx.xxxx
cp.bytes[id] = byte;
} else {
cp.valid = false;

113
src/guf_utf8.h Normal file
View File

@ -0,0 +1,113 @@
// GUF_FN_KEYWORDS: storage-class prefix put in front of every function this
// header declares/defines. It expands to `static` when a static build is
// requested and to nothing otherwise.
//
// The implementation section below is enabled by GUF_IMPL or GUF_IMPL_STATIC,
// so GUF_IMPL_STATIC must select `static` here as well; otherwise the
// definitions would be emitted with external linkage. The GUF_STATIC_IMPL
// spelling is still accepted so existing users of it keep working.
#if defined(GUF_STATIC) || defined(GUF_IMPL_STATIC) || defined(GUF_STATIC_IMPL)
#define GUF_FN_KEYWORDS static
#else
#define GUF_FN_KEYWORDS
#endif
#ifndef GUF_UTF8_H
#define GUF_UTF8_H
#include "guf_common.h"
// Storage for one UTF-8 encoded character.
// bytes[0] is the leading octet; guf_utf8_char_is_valid() passes it to
// guf_utf8_num_bytes() to decide how many of the four octets are inspected.
typedef struct guf_utf8_char {
unsigned char bytes[4];
} guf_utf8_char;
// Returns true if c lies in the 7-bit ASCII range [0, 127].
// Negative values (e.g. a sign-extended plain char >= 0x80, or EOF) are rejected.
static inline bool guf_char_is_ascii(int c) {return c >= 0 && c <= 127;}
// Returns true if c is at most 127, i.e. a 7-bit ASCII value.
static inline bool guf_uchar_is_ascii(unsigned char c) {return c < 128;}
// Returns the length in bytes (1 to 4) of the UTF-8 sequence introduced by
// the leading octet c, or 0 if c cannot start a sequence.
GUF_FN_KEYWORDS int guf_utf8_num_bytes(unsigned char c);
// Returns true if the octets stored in c (as many as the leading octet
// announces) form a well-formed UTF-8 sequence, false otherwise.
GUF_FN_KEYWORDS bool guf_utf8_char_is_valid(const guf_utf8_char *c);
#endif
#if defined(GUF_IMPL) || defined(GUF_IMPL_STATIC)
#include "guf_common.h"
#include "guf_assert.h"
// cf. https://www.rfc-editor.org/rfc/rfc3629#page-4
// Classifies a leading octet and returns the length of the UTF-8 sequence it
// introduces:
//   0x00..0x7F -> 1   (bits: 0xxx.xxxx)
//   0xC2..0xDF -> 2   (bits: 110x.xxxx, without 0xC0 and 0xC1)
//   0xE0..0xEF -> 3   (bits: 1110.xxxx)
//   0xF0..0xF4 -> 4   (bits: 1111.0xxx, without 0xF5 to 0xFF)
// Every other value (0x80..0xC1 and 0xF5..0xFF) yields 0: invalid leading octet.
GUF_FN_KEYWORDS int guf_utf8_num_bytes(unsigned char c)
{
    // The thresholds are checked in ascending order, so each test only needs
    // the upper bound of its range.
    if (c < 0x80) {
        return 1;
    }
    if (c < 0xC2) {
        return 0; // 0x80..0xC1 cannot start a sequence.
    }
    if (c < 0xE0) {
        return 2;
    }
    if (c < 0xF0) {
        return 3;
    }
    if (c < 0xF5) {
        return 4;
    }
    return 0; // 0xF5..0xFF: invalid byte.
}
// Checks whether c holds a well-formed UTF-8 byte sequence according to the
// syntax table in RFC 3629 (cf. https://datatracker.ietf.org/doc/html/rfc3629#page-5).
// Only the octets announced by the leading octet are inspected.
// Returns true for a well-formed sequence, false otherwise.
GUF_FN_KEYWORDS bool guf_utf8_char_is_valid(const guf_utf8_char *c)
{
    // Inclusive range test.
    #define guf_in_range(byte, lo, hi) ((byte) >= (lo) && (byte) <= (hi))
    // Continuation octet, binary: 10xx.xxxx
    #define guf_valid_tail(byte) guf_in_range((byte), 0x80, 0xBF)

    const unsigned char *octets = c->bytes;
    const unsigned char lead = octets[0];

    const int n = guf_utf8_num_bytes(lead);
    if (n == 0) {
        return false;
    }

    // "The octet values C0, C1, F5 to FF never appear.", cf. https://www.rfc-editor.org/rfc/rfc3629#page-5
    for (int k = 0; k < n; ++k) {
        const unsigned char octet = octets[k];
        if (octet == 0xC0 || octet == 0xC1 || guf_in_range(octet, 0xF5, 0xFF)) {
            return false;
        }
    }

    if (n == 1) {
        GUF_ASSERT(lead <= 0x7F);
        return true;
    }

    if (n == 2) {
        GUF_ASSERT(lead >= 0xC2 && lead <= 0xDF);
        return guf_valid_tail(octets[1]);
    }

    if (n == 3) {
        // Only the second octet's permitted range depends on the leading octet.
        bool second_ok;
        if (lead == 0xE0) {
            second_ok = guf_in_range(octets[1], 0xA0, 0xBF);
        } else if (lead == 0xED) {
            second_ok = guf_in_range(octets[1], 0x80, 0x9F);
        } else { // 0xE1..0xEC and 0xEE..0xEF
            second_ok = guf_valid_tail(octets[1]);
        }
        return second_ok && guf_valid_tail(octets[2]);
    }

    if (n == 4) {
        bool second_ok;
        if (lead == 0xF0) {
            second_ok = guf_in_range(octets[1], 0x90, 0xBF);
        } else if (lead == 0xF4) {
            second_ok = guf_in_range(octets[1], 0x80, 0x8F);
        } else { // 0xF1..0xF3
            second_ok = guf_valid_tail(octets[1]);
        }
        return second_ok && guf_valid_tail(octets[2]) && guf_valid_tail(octets[3]);
    }

    return false;

    #undef guf_valid_tail
    #undef guf_in_range
}
#endif
#undef GUF_FN_KEYWORDS
#undef GUF_IMPL
#undef GUF_IMPL_STATIC
#undef GUF_STATIC

View File

@ -6,6 +6,12 @@
#define GUF_IMPL
#include "guf_dbuf.h"
#define GUF_CNT_NAME dbuf_uchar
#define GUF_T uchar
#define GUF_T_IS_INTEGRAL_TYPE
#define GUF_IMPL
#include "guf_dbuf.h"
#define GUF_CNT_NAME dbuf_float
#define GUF_T float
#define GUF_T_IS_INTEGRAL_TYPE

View File

@ -8,6 +8,13 @@
#define GUF_T_IS_INTEGRAL_TYPE
#include "guf_dbuf.h"
typedef unsigned char uchar;
#define GUF_CNT_NAME dbuf_uchar
#define GUF_T uchar
#define GUF_T_IS_INTEGRAL_TYPE
#include "guf_dbuf.h"
#define GUF_CNT_NAME dbuf_float
#define GUF_T float
#define GUF_T_IS_INTEGRAL_TYPE

View File

@ -16,12 +16,10 @@ static inline guf_hash_size_t int32_hash(const int32_t *a)
{
return guf_hash(a, sizeof(int32_t), GUF_HASH_INIT);
}
static inline bool int32_eq(const int32_t *a, const int32_t *b)
{
return *a == *b;
}
#define GUF_DICT_KEY_T int32_t
#define GUF_DICT_KEY_HASH int32_hash
#define GUF_DICT_KEY_T_EQ int32_eq

4
src/test/guf_utf8_impl.c Normal file
View File

@ -0,0 +1,4 @@
#include "guf_utf8.h"
#define GUF_IMPL
#include "guf_utf8.h"

View File

@ -8,6 +8,7 @@ extern "C" {
}
#include "test_dbuf.hpp"
#include "test_dict.hpp"
std::unordered_set<std::unique_ptr<Test>> g_tests {};
@ -20,6 +21,10 @@ void init_tests()
test = std::make_unique<DbufCstringTest>("DbufCstringTest");
GUF_ASSERT_RELEASE(test.get());
g_tests.insert(std::move(test));
test = std::make_unique<DictCstrToIntTest>("DictCstrToIntTest");
GUF_ASSERT_RELEASE(test.get());
g_tests.insert(std::move(test));
}
int main()

View File

@ -10,7 +10,7 @@ extern "C"
struct DbufIntTest : public Test
{
DbufIntTest(std::string name) : Test(name) {};
DbufIntTest(const std::string& name) : Test(name) {};
private:

114
src/test/test_dict.hpp Normal file
View File

@ -0,0 +1,114 @@
#pragma once
#include <unordered_map>
#include "test.hpp"
extern "C"
{
#include "guf_alloc_libc.h"
#include "guf_dict_impl.h"
#include "guf_utf8.h"
}
// Test case that reads a text file into a dbuf_uchar (mirrored in a
// std::vector<char>), walks the buffer one UTF-8 sequence at a time, prints
// every valid non-NUL character, and finally checks the state of a
// dict_cstr_int after it has been initialised and freed again.
struct DictCstrToIntTest : public Test
{
    DictCstrToIntTest(const std::string& name) : Test(name) {};

    private:

    dbuf_uchar text_buf {};         // Raw bytes of the input file (NUL-terminated by load_file).
    std::vector<char> text_vec {};  // Mirror of text_buf, used to cross-check the element count.

    // Iterates over text_buf, assembling one guf_utf8_char per leading octet,
    // and counts/prints the valid ones. Bytes that cannot start a sequence are
    // skipped. Afterwards frees the (never filled) dictionary and checks its fields.
    void insert_lookup()
    {
        std::unordered_map<std::string, int> word_cnt_map {};
        dict_cstr_int word_cnt_dict {};
        dict_cstr_int_init(&word_cnt_dict, &guf_allocator_libc);

        ptrdiff_t len = 0;
        for (dbuf_uchar_iter it = dbuf_uchar_begin(&text_buf); !dbuf_uchar_iter_is_end(&text_buf, it); it = dbuf_uchar_iter_next(&text_buf, it, 1)) {
            const unsigned char c = *it.ptr;
            guf_utf8_char utf8_c = {.bytes = {c, 0, 0, 0}};

            const int num_bytes = guf_utf8_num_bytes(c);
            if (!num_bytes) {
                continue; // Not a valid leading octet.
            }

            // Pull in the remaining octets of this sequence. `it` is left on the
            // last octet consumed, so the for-loop's increment moves on to the
            // next leading octet.
            int consumed = 1;
            while (consumed < num_bytes) {
                it = dbuf_uchar_iter_next(&text_buf, it, 1);
                if (dbuf_uchar_iter_is_end(&text_buf, it)) {
                    break;
                }
                utf8_c.bytes[consumed++] = *it.ptr;
            }

            if (consumed < num_bytes) {
                printf("Invalid utf-8: file is truncated\n");
                break;
            }

            if (guf_utf8_char_is_valid(&utf8_c) && utf8_c.bytes[0] != '\0') {
                char str[5] {}; // At most 4 octets plus the terminating NUL.
                memcpy(str, utf8_c.bytes, static_cast<size_t>(num_bytes));
                printf("%s", str);
                ++len;
            }
        }
        printf("\nread %td utf-8 characters\n", len);

        dict_cstr_int_free(&word_cnt_dict, NULL);
        bool dbuf_null = !word_cnt_dict.kv_elems.data && !word_cnt_dict.kv_elems.allocator && !word_cnt_dict.kv_elems.capacity && !word_cnt_dict.kv_elems.size;
        // NOTE(review): this requires kv_elems NOT to be fully zeroed after the free while
        // every other field is zero; `!dbuf_null` looks inverted -- confirm against dict_cstr_int_free.
        TEST_CHECK(!dbuf_null && !word_cnt_dict.kv_indices && !word_cnt_dict.kv_indices_cap && !word_cnt_dict.max_probelen && !word_cnt_dict.num_tombstones);
    }

    // Reads TEST_DATA_DIR "/data_01.txt" byte by byte into text_buf and text_vec
    // and makes sure both end with a NUL byte.
    // Returns false if the file cannot be opened or the two element counts differ.
    bool load_file()
    {
        // The build system may supply the data directory (-DTEST_DATA_DIR="...");
        // the original hard-coded location is only the fallback.
        #ifndef TEST_DATA_DIR
        #define TEST_DATA_DIR "/Users/joni/Desktop/libguf/src/test/data"
        #endif

        FILE *in_file = fopen(TEST_DATA_DIR "/data_01.txt", "r");
        if (!in_file) {
            return false;
        }

        dbuf_uchar_init(&text_buf, 128, &guf_allocator_libc);

        int c = EOF;
        while ((c = fgetc(in_file)) != EOF) {
            dbuf_uchar_push_val(&text_buf, (unsigned char)c);
            text_vec.push_back((unsigned char)c);
        }
        fclose(in_file);

        // Check for an empty buffer first: an empty file leaves nothing for
        // dbuf_uchar_back() to return.
        if (text_buf.size == 0 || *dbuf_uchar_back(&text_buf) != '\0') {
            dbuf_uchar_push_val(&text_buf, '\0');
            text_vec.push_back('\0');
        }

        return TEST_CHECK(std::ssize(text_vec) == text_buf.size);
    }

    public:

    // Runs the test once; later calls return the cached result.
    bool run() override
    {
        if (done) {
            return passed;
        }

        if (TEST_CHECK(load_file())) {
            insert_lookup();
        }

        // Clean-up runs whether or not loading succeeded.
        dbuf_uchar_free(&text_buf, NULL);
        text_buf = {};

        passed = (num_failed_checks == 0);
        done = true;
        return passed;
    }
};

2
todo.txt Normal file
View File

@ -0,0 +1,2 @@
- guf_stack, guf_queue, guf_ringbuf
- guf_rand etc.: move GUF_FN_KEYWORDS out of header guard?