Work on utf-8
This commit is contained in:
parent
7a990c810e
commit
60e2849b01
@ -21,7 +21,7 @@ endif ()
|
||||
add_executable(libguf_example src/test/example.c src/test/guf_dict_impl.c)
|
||||
target_include_directories(libguf_example PRIVATE src src/test)
|
||||
|
||||
add_executable(libguf_test src/test/test.cpp src/test/guf_dbuf_impl.c src/test/guf_dict_impl.c src/test/guf_rand_impl.c src/test/guf_sort_impl.c)
|
||||
add_executable(libguf_test src/test/test.cpp src/test/guf_dbuf_impl.c src/test/guf_dict_impl.c src/test/guf_rand_impl.c src/test/guf_sort_impl.c src/test/guf_utf8_impl.c)
|
||||
target_include_directories(libguf_test PRIVATE src src/test)
|
||||
|
||||
if (NOT DEFINED MSVC)
|
||||
|
||||
@ -687,9 +687,14 @@ guf_str_codepoint_utf8 guf_str_iterate_utf8(const guf_str *str, size_t *idx)
|
||||
|
||||
guf_str_codepoint_utf8 cp = {.num_bytes = 1, .bytes = {'\0', '\0', '\0', '\0', '\0'}, .valid = true};
|
||||
|
||||
const unsigned char four_bytes_mask = 240;
|
||||
const unsigned char three_bytes_mask = 224;
|
||||
const unsigned char two_bytes_mask = 192;
|
||||
const unsigned char four_bytes_val = 240; // 0b1111.0xxx
|
||||
const unsigned char four_bytes_mask = 248; // 0b1111.1000
|
||||
|
||||
const unsigned char three_bytes_val = 224; // 0b1110.xxxx
|
||||
const unsigned char three_bytes_mask = 240; // 0b1111.0000
|
||||
|
||||
const unsigned char two_bytes_val = 192; // 0b110x.xxxx
|
||||
const unsigned char two_bytes_mask = 224 ; // 0b1110.0000
|
||||
|
||||
size_t i = *idx;
|
||||
if (guf_str_char_is_ascii(c_str[i])) {
|
||||
@ -732,7 +737,7 @@ guf_str_codepoint_utf8 guf_str_iterate_utf8(const guf_str *str, size_t *idx)
|
||||
size_t id = i + j;
|
||||
assert(id < len);
|
||||
unsigned char byte = c_str[id];
|
||||
if (byte >= 128 && byte < 192) { // Binary: 10......
|
||||
if (byte >= 128 && byte < 192) { // 0b10xx.xxxx
|
||||
cp.bytes[id] = byte;
|
||||
} else {
|
||||
cp.valid = false;
|
||||
|
||||
113
src/guf_utf8.h
Normal file
113
src/guf_utf8.h
Normal file
@ -0,0 +1,113 @@
|
||||
// Linkage prefix placed in front of every function this header declares or
// defines. It expands to `static` when a static build is requested and to
// nothing otherwise.
// GUF_IMPL_STATIC is accepted in addition to GUF_STATIC_IMPL because the
// implementation guard and the trailing #undef list of this header spell the
// macro GUF_IMPL_STATIC; without it, an implementation requested through
// GUF_IMPL_STATIC would be emitted with external linkage.
#if defined(GUF_STATIC) || defined(GUF_STATIC_IMPL) || defined(GUF_IMPL_STATIC)
    #define GUF_FN_KEYWORDS static
#else
    #define GUF_FN_KEYWORDS
#endif
|
||||
|
||||
#ifndef GUF_UTF8_H
|
||||
#define GUF_UTF8_H
|
||||
#include "guf_common.h"
|
||||
|
||||
// Raw storage for a single UTF-8 encoded character.
typedef struct guf_utf8_char {
|
||||
unsigned char bytes[4]; // Encoded bytes, lead byte first; guf_utf8_char_is_valid reads only as many as the lead byte announces (at most four).
|
||||
} guf_utf8_char;
|
||||
|
||||
// Returns true iff c is a 7-bit ASCII code, i.e. lies in 0..127.
// Negative inputs (such as EOF or a sign-extended high-bit char) are rejected.
static inline bool guf_char_is_ascii(int c) {return c >= 0 && c <= 127;}
|
||||
// Reports whether the byte value is below 128, i.e. a 7-bit ASCII code.
static inline bool guf_uchar_is_ascii(unsigned char c)
{
    return c < 0x80;
}
|
||||
|
||||
// Returns the length in bytes (1 to 4) of the UTF-8 sequence that starts with lead byte c, or 0 if c cannot start a sequence.
GUF_FN_KEYWORDS int guf_utf8_num_bytes(unsigned char c);
|
||||
|
||||
// Returns true if the leading bytes of c->bytes form one well-formed UTF-8 sequence; bytes beyond the length announced by the lead byte are not inspected.
GUF_FN_KEYWORDS bool guf_utf8_char_is_valid(const guf_utf8_char *c);
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(GUF_IMPL) || defined(GUF_IMPL_STATIC)
|
||||
|
||||
#include "guf_common.h"
|
||||
#include "guf_assert.h"
|
||||
|
||||
// cf. https://www.rfc-editor.org/rfc/rfc3629#page-4
|
||||
// Classifies c as the lead byte of a UTF-8 sequence.
// Returns the total sequence length in bytes (1..4), or 0 when c can never
// start a sequence (continuation bytes, 0xC0, 0xC1, and 0xF5..0xFF).
GUF_FN_KEYWORDS int guf_utf8_num_bytes(unsigned char c)
{
    // The thresholds are tested in ascending order, so each branch only needs
    // the upper end of its range.
    if (c < 0x80) {         // bits: 0xxx.xxxx
        return 1;
    }
    if (c < 0xC2) {         // 0x80..0xBF (bits: 10xx.xxxx) plus 0xC0 and 0xC1
        return 0;
    }
    if (c < 0xE0) {         // bits: 110x.xxxx
        return 2;
    }
    if (c < 0xF0) {         // bits: 1110.xxxx
        return 3;
    }
    if (c < 0xF5) {         // bits: 1111.0xxx (without 0xF5 to 0xF7)
        return 4;
    }
    return 0;               // 0xF5..0xFF: invalid byte.
}
|
||||
|
||||
GUF_FN_KEYWORDS bool guf_utf8_char_is_valid(const guf_utf8_char *c)
|
||||
{
|
||||
const int num_bytes = guf_utf8_num_bytes(c->bytes[0]);
|
||||
|
||||
if (!num_bytes) {
|
||||
return false;
|
||||
}
|
||||
|
||||
for (int i = 0; i < num_bytes; ++i) {
|
||||
// "The octet values C0, C1, F5 to FF never appear.", cf. https://www.rfc-editor.org/rfc/rfc3629#page-5
|
||||
if (c->bytes[i] == 0xC0 || c->bytes[i] == 0xC1 || (c->bytes[i] >= 0xF5 && c->bytes[i] <= 0xFF)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Binary: 10xx.xxxx
|
||||
#define guf_valid_tail(byte) ((byte) >= 0x80 && (byte) <= 0xBF)
|
||||
|
||||
// cf. https://datatracker.ietf.org/doc/html/rfc3629#page-5
|
||||
switch (num_bytes)
|
||||
{
|
||||
case 1:
|
||||
GUF_ASSERT(c->bytes[0] <= 0x7F);
|
||||
return true;
|
||||
|
||||
case 2:
|
||||
GUF_ASSERT(c->bytes[0] >= 0xC2 && c->bytes[0] <= 0xDF);
|
||||
return guf_valid_tail(c->bytes[1]);
|
||||
|
||||
case 3:
|
||||
if ((c->bytes[0] == 0xE0) && (c->bytes[1] >= 0xA0 && c->bytes[1] <= 0xBF) && guf_valid_tail(c->bytes[2])) {
|
||||
return true;
|
||||
}
|
||||
if ((c->bytes[0] >= 0xE1 && c->bytes[0] <= 0xEC) && guf_valid_tail(c->bytes[1]) && guf_valid_tail(c->bytes[2])) {
|
||||
return true;
|
||||
}
|
||||
if ((c->bytes[0] == 0xED) && (c->bytes[1] >= 0x80 && c->bytes[1] <= 0x9F) && guf_valid_tail(c->bytes[2])) {
|
||||
return true;
|
||||
}
|
||||
if ((c->bytes[0] >= 0xEE && c->bytes[0] <= 0xEF) && guf_valid_tail(c->bytes[1]) && guf_valid_tail(c->bytes[2])) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
|
||||
case 4:
|
||||
if ((c->bytes[0] == 0xF0) && (c->bytes[1] >= 0x90 && c->bytes[1] <= 0xBF) && guf_valid_tail(c->bytes[2]) && guf_valid_tail(c->bytes[3])) {
|
||||
return true;
|
||||
}
|
||||
if ((c->bytes[0] >= 0xF1 && c->bytes[0] <= 0xF3) && guf_valid_tail(c->bytes[1]) && guf_valid_tail(c->bytes[2]) && guf_valid_tail(c->bytes[3])) {
|
||||
return true;
|
||||
}
|
||||
if ((c->bytes[0] == 0xF4) && (c->bytes[1] >= 0x80 && c->bytes[1] <= 0x8F) && guf_valid_tail(c->bytes[2]) && guf_valid_tail(c->bytes[3])) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
||||
#undef guf_valid_tail
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#undef GUF_FN_KEYWORDS
|
||||
#undef GUF_IMPL
|
||||
#undef GUF_IMPL_STATIC
|
||||
#undef GUF_STATIC
|
||||
@ -6,6 +6,12 @@
|
||||
#define GUF_IMPL
|
||||
#include "guf_dbuf.h"
|
||||
|
||||
#define GUF_CNT_NAME dbuf_uchar
|
||||
#define GUF_T uchar
|
||||
#define GUF_T_IS_INTEGRAL_TYPE
|
||||
#define GUF_IMPL
|
||||
#include "guf_dbuf.h"
|
||||
|
||||
#define GUF_CNT_NAME dbuf_float
|
||||
#define GUF_T float
|
||||
#define GUF_T_IS_INTEGRAL_TYPE
|
||||
|
||||
@ -8,6 +8,13 @@
|
||||
#define GUF_T_IS_INTEGRAL_TYPE
|
||||
#include "guf_dbuf.h"
|
||||
|
||||
typedef unsigned char uchar;
|
||||
|
||||
#define GUF_CNT_NAME dbuf_uchar
|
||||
#define GUF_T uchar
|
||||
#define GUF_T_IS_INTEGRAL_TYPE
|
||||
#include "guf_dbuf.h"
|
||||
|
||||
#define GUF_CNT_NAME dbuf_float
|
||||
#define GUF_T float
|
||||
#define GUF_T_IS_INTEGRAL_TYPE
|
||||
|
||||
@ -16,12 +16,10 @@ static inline guf_hash_size_t int32_hash(const int32_t *a)
|
||||
{
|
||||
return guf_hash(a, sizeof(int32_t), GUF_HASH_INIT);
|
||||
}
|
||||
|
||||
// Equality predicate for int32_t keys: compares the two pointed-to values.
static inline bool int32_eq(const int32_t *a, const int32_t *b)
{
    const int32_t lhs = *a;
    const int32_t rhs = *b;
    return lhs == rhs;
}
|
||||
|
||||
#define GUF_DICT_KEY_T int32_t
|
||||
#define GUF_DICT_KEY_HASH int32_hash
|
||||
#define GUF_DICT_KEY_T_EQ int32_eq
|
||||
|
||||
4
src/test/guf_utf8_impl.c
Normal file
4
src/test/guf_utf8_impl.c
Normal file
@ -0,0 +1,4 @@
|
||||
#include "guf_utf8.h"
|
||||
|
||||
#define GUF_IMPL
|
||||
#include "guf_utf8.h"
|
||||
@ -8,6 +8,7 @@ extern "C" {
|
||||
}
|
||||
|
||||
#include "test_dbuf.hpp"
|
||||
#include "test_dict.hpp"
|
||||
|
||||
std::unordered_set<std::unique_ptr<Test>> g_tests {};
|
||||
|
||||
@ -20,6 +21,10 @@ void init_tests()
|
||||
test = std::make_unique<DbufCstringTest>("DbufCstringTest");
|
||||
GUF_ASSERT_RELEASE(test.get());
|
||||
g_tests.insert(std::move(test));
|
||||
|
||||
test = std::make_unique<DictCstrToIntTest>("DictCstrToIntTest");
|
||||
GUF_ASSERT_RELEASE(test.get());
|
||||
g_tests.insert(std::move(test));
|
||||
}
|
||||
|
||||
int main()
|
||||
|
||||
@ -10,7 +10,7 @@ extern "C"
|
||||
|
||||
struct DbufIntTest : public Test
|
||||
{
|
||||
DbufIntTest(std::string name) : Test(name) {};
|
||||
DbufIntTest(const std::string& name) : Test(name) {};
|
||||
|
||||
private:
|
||||
|
||||
|
||||
114
src/test/test_dict.hpp
Normal file
114
src/test/test_dict.hpp
Normal file
@ -0,0 +1,114 @@
|
||||
#pragma once
|
||||
#include <unordered_map>
|
||||
#include "test.hpp"
|
||||
|
||||
extern "C"
|
||||
{
|
||||
#include "guf_alloc_libc.h"
|
||||
#include "guf_dict_impl.h"
|
||||
#include "guf_utf8.h"
|
||||
}
|
||||
|
||||
struct DictCstrToIntTest : public Test
|
||||
{
|
||||
|
||||
DictCstrToIntTest(const std::string& name) : Test(name) {};
|
||||
|
||||
private:
|
||||
|
||||
dbuf_uchar text_buf {};
|
||||
std::vector<char> text_vec {};
|
||||
|
||||
void insert_lookup()
|
||||
{
|
||||
std::unordered_map<std::string, int> word_cnt_map {};
|
||||
dict_cstr_int word_cnt_dict {};
|
||||
dict_cstr_int_init(&word_cnt_dict, &guf_allocator_libc);
|
||||
|
||||
ptrdiff_t len = 0;
|
||||
|
||||
for (dbuf_uchar_iter it = dbuf_uchar_begin(&text_buf); !dbuf_uchar_iter_is_end(&text_buf, it); it = dbuf_uchar_iter_next(&text_buf, it, 1)) {
|
||||
const unsigned char c = *it.ptr;
|
||||
guf_utf8_char utf8_c = {.bytes = {c, 0, 0, 0}};
|
||||
const int num_bytes = guf_utf8_num_bytes(c);
|
||||
|
||||
if (!num_bytes) {
|
||||
continue;
|
||||
}
|
||||
|
||||
int consumed = 1;
|
||||
while (consumed < num_bytes && ((it = dbuf_uchar_iter_next(&text_buf, it, 1)), !dbuf_uchar_iter_is_end(&text_buf, it)) ) {
|
||||
utf8_c.bytes[consumed++] = *it.ptr;
|
||||
}
|
||||
if (consumed < num_bytes) {
|
||||
printf("Invalid utf-8: file is truncated\n");
|
||||
break;
|
||||
}
|
||||
|
||||
if (guf_utf8_char_is_valid(&utf8_c) && utf8_c.bytes[0] != '\0') {
|
||||
char str[5] {};
|
||||
memcpy(str, utf8_c.bytes, num_bytes);
|
||||
printf("%s", str);
|
||||
++len;
|
||||
}
|
||||
}
|
||||
printf("\nread %td utf-8 characters\n", len);
|
||||
|
||||
dict_cstr_int_free(&word_cnt_dict, NULL);
|
||||
bool dbuf_null = !word_cnt_dict.kv_elems.data && !word_cnt_dict.kv_elems.allocator && !word_cnt_dict.kv_elems.capacity && !word_cnt_dict.kv_elems.size;
|
||||
TEST_CHECK(!dbuf_null && !word_cnt_dict.kv_indices && !word_cnt_dict.kv_indices_cap && !word_cnt_dict.max_probelen && !word_cnt_dict.num_tombstones);
|
||||
}
|
||||
|
||||
bool load_file()
|
||||
{
|
||||
#define TEST_DATA_DIR "/Users/joni/Desktop/libguf/src/test/data"
|
||||
FILE *in_file {nullptr};
|
||||
if (!in_file) {
|
||||
in_file = fopen(TEST_DATA_DIR "/data_01.txt", "r");
|
||||
}
|
||||
|
||||
if (!in_file) {
|
||||
return false;
|
||||
}
|
||||
|
||||
dbuf_uchar_init(&text_buf, 128, &guf_allocator_libc);
|
||||
|
||||
int c = EOF;
|
||||
while ((c = fgetc(in_file)) != EOF) {
|
||||
dbuf_uchar_push_val(&text_buf, (unsigned char)c);
|
||||
text_vec.push_back((unsigned char)c);
|
||||
}
|
||||
fclose(in_file);
|
||||
|
||||
if (*dbuf_uchar_back(&text_buf) != '\0') {
|
||||
dbuf_uchar_push_val(&text_buf, '\0');
|
||||
text_vec.push_back('\0');
|
||||
}
|
||||
|
||||
return TEST_CHECK(std::ssize(text_vec) == text_buf.size);
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
bool run() override
|
||||
{
|
||||
if (done) {
|
||||
return passed;
|
||||
}
|
||||
|
||||
if (!TEST_CHECK(load_file())) {
|
||||
goto end;
|
||||
}
|
||||
|
||||
insert_lookup();
|
||||
|
||||
end:
|
||||
dbuf_uchar_free(&text_buf, NULL);
|
||||
text_buf = {};
|
||||
|
||||
passed = (num_failed_checks == 0);
|
||||
done = true;
|
||||
|
||||
return passed;
|
||||
}
|
||||
};
|
||||
Loading…
x
Reference in New Issue
Block a user