mirror of
https://forge.fsky.io/lda/Parsee.git
synced 2026-03-13 21:05:10 +00:00
269 lines
5.1 KiB
C
269 lines
5.1 KiB
C
#include <Unistring.h>
|
|
|
|
#include <Cytoplasm/Memory.h>
|
|
#include <Cytoplasm/Str.h>
|
|
#include <Cytoplasm/Log.h>
|
|
#include <Cytoplasm/Log.h>
|
|
|
|
#include <string.h>
|
|
#include <stdarg.h>
|
|
|
|
/*
 * A string held as an array of decoded Unicode codepoints instead of
 * UTF-8 bytes.
 */
struct Unistr
{
    /* Number of codepoints currently stored in `codepoints`. */
    size_t length;

    /* Array of `length` codepoints; NULL until the first one is added. */
    uint32_t *codepoints;
};
|
|
|
|
void
|
|
UnistrAddch(Unistr *unistr, uint32_t u)
|
|
{
|
|
if (!unistr || !u)
|
|
{
|
|
return;
|
|
}
|
|
unistr->length++;
|
|
unistr->codepoints = Realloc(
|
|
unistr->codepoints,
|
|
unistr->length * sizeof(*unistr->codepoints)
|
|
);
|
|
|
|
unistr->codepoints[unistr->length - 1] = u;
|
|
}
|
|
|
|
/*
 * Checks whether the bytes at `off` have the shape of an `n`-byte UTF-8
 * sequence.
 *
 *  off:       first byte of the candidate sequence.
 *  available: number of readable bytes starting at `off`.
 *  n:         sequence length to test for.
 *  pc:        value the lead byte must have once shifted right by
 *             (8 - n - 1) bits, i.e. its length-marker prefix.
 *
 * Returns true only when at least `n` bytes are available, the lead byte
 * carries the prefix `pc`, and each of the following n - 1 bytes is a
 * continuation byte (top two bits 10). The decoded value itself is not
 * validated here.
 */
static bool
UTFIsN(char *off, size_t available, size_t n, uint8_t pc)
{
    uint8_t *bytes = (uint8_t *) off;
    size_t i;

    /* n == 0 would make `n - 1` wrap around and walk off the buffer;
     * n > 7 would make the shift count below wrap around. Neither can
     * describe a sequence, so reject them up front. */
    if (n == 0 || n > 7)
    {
        return false;
    }

    /* The length test comes first so the lead byte is never read from an
     * empty window. */
    if (available < n || (bytes[0] >> (8 - n - 1)) != pc)
    {
        return false;
    }

    for (i = 1; i < n; i++)
    {
        if ((bytes[i] >> 6) != 0x2)
        {
            return false;
        }
    }

    return true;
}
|
|
|
|
Unistr *
|
|
UnistrCreate(char *src)
|
|
{
|
|
size_t len, i;
|
|
Unistr *str;
|
|
if (!src)
|
|
{
|
|
return NULL;
|
|
}
|
|
|
|
len = strlen(src);
|
|
str = Malloc(sizeof(*str));
|
|
str->length = 0;
|
|
str->codepoints = NULL;
|
|
|
|
/* We can't just set the length to {len}. */
|
|
for (i = 0; i < len; i++)
|
|
{
|
|
char byte = src[i];
|
|
size_t available = len - i;
|
|
if ((byte & 0x80) == 0)
|
|
{
|
|
/* This is a regular codepoint */
|
|
UnistrAddch(str, byte & 0x7F);
|
|
continue;
|
|
}
|
|
else if (UTFIsN(&src[i], available, 2, 0x06))
|
|
{
|
|
char a = src[i+0] & 0x1F;
|
|
char b = src[i+1] & 0x3F;
|
|
uint32_t u = (a << (6 * 1)) | b;
|
|
|
|
/* Overlongs are errors. */
|
|
if (u < 0x0080 || u > 0x07FF)
|
|
{
|
|
UnistrFree(str);
|
|
return NULL;
|
|
}
|
|
|
|
UnistrAddch(str, u);
|
|
i += 2 - 1;
|
|
continue;
|
|
}
|
|
else if (UTFIsN(&src[i], available, 3, 0x0E))
|
|
{
|
|
char a = src[i+0] & 0x0F;
|
|
char b = src[i+1] & 0x3F;
|
|
char c = src[i+2] & 0x3F;
|
|
uint32_t u =
|
|
(a << (6 * 2)) |
|
|
(b << (6 * 1)) |
|
|
(c << (6 * 0)) ;
|
|
|
|
/* Overlongs are errors. */
|
|
if (u < 0x0800 || u > 0xFFFF)
|
|
{
|
|
UnistrFree(str);
|
|
return NULL;
|
|
}
|
|
|
|
UnistrAddch(str, u);
|
|
i += 3 - 1;
|
|
continue;
|
|
}
|
|
else if (UTFIsN(&src[i], available, 4, 0x1E))
|
|
{
|
|
char a = src[i+0] & 0x07;
|
|
char b = src[i+1] & 0x3F;
|
|
char c = src[i+2] & 0x3F;
|
|
char d = src[i+3] & 0x3F;
|
|
uint32_t u =
|
|
(a << (6 * 3)) |
|
|
(b << (6 * 2)) |
|
|
(c << (6 * 1)) |
|
|
(d << (6 * 0)) ;
|
|
|
|
/* Overlongs are errors. */
|
|
if (u < 0x10000 || u > 0x10FFFF)
|
|
{
|
|
UnistrFree(str);
|
|
return NULL;
|
|
}
|
|
|
|
UnistrAddch(str, u);
|
|
i += 4 - 1;
|
|
continue;
|
|
|
|
}
|
|
}
|
|
|
|
return str;
|
|
}
|
|
void
|
|
UnistrFree(Unistr *unistr)
|
|
{
|
|
if (!unistr)
|
|
{
|
|
return;
|
|
}
|
|
|
|
Free(unistr->codepoints);
|
|
Free(unistr);
|
|
}
|
|
char *
|
|
UnistrC(Unistr *unistr)
|
|
{
|
|
char *ret, *tmp, *utf;
|
|
size_t i;
|
|
if (!unistr)
|
|
{
|
|
return NULL;
|
|
}
|
|
|
|
ret = NULL;
|
|
for (i = 0; i < unistr->length; i++)
|
|
{
|
|
uint32_t code = unistr->codepoints[i];
|
|
utf = StrUtf8Encode(code);
|
|
|
|
tmp = ret;
|
|
ret = StrConcat(2, ret, utf);
|
|
Free(tmp);
|
|
Free(utf);
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
size_t
|
|
UnistrSize(Unistr *unistr)
|
|
{
|
|
return unistr ? unistr->length : 0;
|
|
}
|
|
uint32_t
|
|
UnistrGetch(Unistr *unistr, size_t i)
|
|
{
|
|
if (!unistr)
|
|
{
|
|
return 0;
|
|
}
|
|
|
|
return i < unistr->length ? unistr->codepoints[i] : 0;
|
|
}
|
|
/*
 * Tells whether `u` lies in the Basic Multilingual Plane, i.e. is at most
 * U+FFFF. The NUL codepoint (0) is deliberately reported as false.
 */
bool
UnistrIsBMP(uint32_t u)
{
    /* The function returns a bool, so the "no" answer is `false`, not the
     * pointer constant NULL. */
    if (u == 0)
    {
        return false;
    }

    return u <= 0xFFFF;
}
|
|
Unistr *
|
|
UnistrFilter(Unistr *str, UnistrFilterFunc filter)
|
|
{
|
|
Unistr *unistr;
|
|
size_t i;
|
|
if (!str || !filter)
|
|
{
|
|
return NULL;
|
|
}
|
|
|
|
unistr = UnistrCreate("");
|
|
for (i = 0; i < UnistrSize(str); i++)
|
|
{
|
|
uint32_t code = UnistrGetch(str, i);
|
|
if (!filter(code))
|
|
{
|
|
continue;
|
|
}
|
|
UnistrAddch(unistr, code);
|
|
}
|
|
|
|
return unistr;
|
|
}
|
|
|
|
/*
 * Concatenates `n` Unistr arguments, in the order given, into a new
 * Unistr owned by the caller.
 *
 * Every variadic argument must be a `Unistr *`. Each one is read through
 * UnistrSize()/UnistrGetch() and copied codepoint by codepoint; the
 * inputs are not modified.
 */
Unistr *
UnistrConcat(size_t n, ...)
{
    Unistr *joined = UnistrCreate("");
    va_list args;
    size_t remaining;

    va_start(args, n);
    for (remaining = n; remaining > 0; remaining--)
    {
        Unistr *piece = va_arg(args, Unistr *);
        size_t k = 0;

        while (k < UnistrSize(piece))
        {
            UnistrAddch(joined, UnistrGetch(piece, k));
            k++;
        }
    }
    va_end(args);

    return joined;
}
|
|
size_t
|
|
UnistrGetOffset(Unistr *str, uint32_t sep)
|
|
{
|
|
size_t i;
|
|
uint32_t prev = 0x0A;
|
|
if (!str || !sep)
|
|
{
|
|
return 0;
|
|
}
|
|
|
|
for (i = 0; i < str->length; i++)
|
|
{
|
|
uint32_t curr = str->codepoints[i];
|
|
if (prev == 0x0A && curr != sep)
|
|
{
|
|
return i;
|
|
}
|
|
prev = curr;
|
|
}
|
|
return 0;
|
|
}
|