Parsee/src/Unistr.c
lda 389870c5d3 [FIX] Ignore unavailable statuses
They aren't that useful, especially in MUCs.
2025-01-25 12:20:47 +00:00

279 lines
5.2 KiB
C

#include <Unistring.h>
#include <Cytoplasm/Memory.h>
#include <Cytoplasm/Str.h>
#include <Cytoplasm/Log.h>
#include <Cytoplasm/Log.h>
#include <string.h>
#include <stdarg.h>
/* Concrete layout behind the Unistr handle: a string stored as an array of
 * decoded codepoints rather than as encoded bytes. */
struct Unistr {
/* Number of codepoints currently stored in the array below. */
size_t length;
/* Codepoint array. It is NULL for a freshly created string, is grown with
 * Realloc() by UnistrAddch(), and is released by UnistrFree(). */
uint32_t *codepoints;
};
/**
 * Appends a single codepoint to the end of a Unistr.
 *
 * A NULL string and the NUL codepoint (0) are both ignored, so a stored
 * codepoint is never 0. If the backing array cannot be grown (Realloc
 * returns NULL), the string is left exactly as it was.
 */
void
UnistrAddch(Unistr *unistr, uint32_t u)
{
    uint32_t *grown;

    if (!unistr || !u)
    {
        return;
    }

    /* Grow through a temporary and only commit on success: assigning the
     * result straight to unistr->codepoints would lose the old array and
     * then write through NULL if the reallocation failed. */
    grown = Realloc(
        unistr->codepoints,
        (unistr->length + 1) * sizeof(*unistr->codepoints)
    );
    if (!grown)
    {
        return;
    }

    grown[unistr->length] = u;
    unistr->codepoints = grown;
    unistr->length++;
}
/**
 * Checks whether the bytes at `off` look like an `n`-byte UTF-8 sequence.
 *
 * off:       first byte of the candidate sequence.
 * available: number of readable bytes starting at `off`.
 * n:         expected sequence length in bytes (lead byte included).
 * pc:        expected value of the lead byte once shifted right by
 *            (8 - n - 1) bits, i.e. its length-marking prefix.
 *
 * Returns true when at least `n` bytes are available, the lead byte carries
 * the expected prefix, and each of the following n - 1 bytes has the form
 * 10xxxxxx. This only validates the framing; it does not decode or
 * range-check the codepoint.
 */
static bool
UTFIsN(char *off, size_t available, size_t n, uint8_t pc)
{
    const uint8_t *bytes = (const uint8_t *) off;
    size_t i;

    /* n == 0 would make the trailing-byte count (n - 1) wrap around, and
     * n >= 8 would make the shift count below wrap; neither can describe
     * a sequence, so reject them outright. */
    if (n == 0 || n > 7)
    {
        return false;
    }

    /* Check the length first so the lead byte is never read when nothing
     * is available. */
    if (available < n)
    {
        return false;
    }
    if ((bytes[0] >> (8 - n - 1)) != pc)
    {
        return false;
    }

    /* Every trailing byte must be a continuation byte (top bits 10). */
    for (i = 1; i < n; i++)
    {
        if ((bytes[i] >> 6) != 0x2)
        {
            return false;
        }
    }
    return true;
}
/**
 * Decodes a NUL-terminated UTF-8 C string into a newly allocated Unistr.
 *
 * Returns NULL when `src` is NULL, when the Unistr itself cannot be
 * allocated, or when a well-framed multi-byte sequence decodes to a value
 * outside the range allowed for its length (overlong encodings, or
 * anything above U+10FFFF). On success the result is released with
 * UnistrFree().
 *
 * Bytes that match none of the 1- to 4-byte patterns (a stray continuation
 * byte, a truncated sequence, ...) are skipped without being reported.
 *
 * NOTE(review): codepoints in the surrogate range U+D800..U+DFFF are
 * accepted by the 3-byte branch; confirm whether callers rely on that.
 */
Unistr *
UnistrCreate(char *src)
{
    size_t len, i;
    Unistr *str;

    if (!src)
    {
        return NULL;
    }

    len = strlen(src);
    str = Malloc(sizeof(*str));
    if (!str)
    {
        /* Nothing to clean up yet; just report the failure. */
        return NULL;
    }
    str->length = 0;
    str->codepoints = NULL;

    /* The codepoint count cannot be taken from {len}: a codepoint may span
     * several bytes, so the array is grown one decoded codepoint at a
     * time. */
    for (i = 0; i < len; i++)
    {
        /* Work on unsigned values so masks and shifts never depend on the
         * signedness of plain char. */
        uint8_t lead = (uint8_t) src[i];
        size_t available = len - i;
        uint32_t u;

        if ((lead & 0x80) == 0)
        {
            /* Single byte: the codepoint is the byte itself. */
            UnistrAddch(str, lead & 0x7F);
            continue;
        }

        if (UTFIsN(&src[i], available, 2, 0x06))
        {
            uint32_t a = lead & 0x1F;
            uint32_t b = (uint8_t) src[i + 1] & 0x3F;

            u = (a << (6 * 1)) | b;

            /* Overlongs are errors. */
            if (u < 0x0080 || u > 0x07FF)
            {
                UnistrFree(str);
                return NULL;
            }
            UnistrAddch(str, u);
            i += 2 - 1; /* The loop increment consumes the lead byte. */
            continue;
        }

        if (UTFIsN(&src[i], available, 3, 0x0E))
        {
            uint32_t a = lead & 0x0F;
            uint32_t b = (uint8_t) src[i + 1] & 0x3F;
            uint32_t c = (uint8_t) src[i + 2] & 0x3F;

            u = (a << (6 * 2)) |
                (b << (6 * 1)) |
                (c << (6 * 0));

            /* Overlongs are errors. */
            if (u < 0x0800 || u > 0xFFFF)
            {
                UnistrFree(str);
                return NULL;
            }
            UnistrAddch(str, u);
            i += 3 - 1;
            continue;
        }

        if (UTFIsN(&src[i], available, 4, 0x1E))
        {
            uint32_t a = lead & 0x07;
            uint32_t b = (uint8_t) src[i + 1] & 0x3F;
            uint32_t c = (uint8_t) src[i + 2] & 0x3F;
            uint32_t d = (uint8_t) src[i + 3] & 0x3F;

            u = (a << (6 * 3)) |
                (b << (6 * 2)) |
                (c << (6 * 1)) |
                (d << (6 * 0));

            /* Overlongs and values past the last codepoint are errors. */
            if (u < 0x10000 || u > 0x10FFFF)
            {
                UnistrFree(str);
                return NULL;
            }
            UnistrAddch(str, u);
            i += 4 - 1;
            continue;
        }

        /* No pattern matched: drop this byte and move on to the next. */
    }

    return str;
}
/**
 * Releases a Unistr: first its codepoint array, then the structure itself.
 * Passing NULL does nothing.
 */
void
UnistrFree(Unistr *unistr)
{
    if (unistr)
    {
        Free(unistr->codepoints);
        Free(unistr);
    }
}
/**
 * Builds a C string from a Unistr by encoding each codepoint with
 * StrUtf8Encode() and appending it to the result with StrConcat().
 *
 * Returns NULL when `unistr` is NULL or holds no codepoints. Otherwise it
 * returns the string produced by the last StrConcat() call; presumably the
 * caller is expected to Free() it (confirm against the header).
 */
char *
UnistrC(Unistr *unistr)
{
    char *out = NULL;
    size_t idx = 0;

    if (!unistr)
    {
        return NULL;
    }

    while (idx < unistr->length)
    {
        char *encoded = StrUtf8Encode(unistr->codepoints[idx]);
        char *joined = StrConcat(2, out, encoded);

        /* The previous partial result and the encoded piece are both
         * superseded by the freshly joined string. */
        Free(out);
        Free(encoded);

        out = joined;
        idx++;
    }

    return out;
}
/**
 * Returns the number of codepoints stored in a Unistr, or 0 for NULL.
 */
size_t
UnistrSize(Unistr *unistr)
{
    if (!unistr)
    {
        return 0;
    }
    return unistr->length;
}
/**
 * Returns the codepoint stored at index `i`.
 *
 * Yields 0 when `unistr` is NULL or `i` is past the last codepoint.
 */
uint32_t
UnistrGetch(Unistr *unistr, size_t i)
{
    if (!unistr || i >= unistr->length)
    {
        return 0;
    }
    return unistr->codepoints[i];
}
/**
 * Tells whether a codepoint lies in the ASCII range U+0001..U+007F.
 *
 * The NUL codepoint is rejected on purpose. U+007F (DEL) is the last ASCII
 * codepoint and is accepted.
 */
bool
UnistrIsASCII(uint32_t u)
{
    if (u == 0)
    {
        return false;
    }
    return u <= 0x7F;
}
/**
 * Tells whether a codepoint lies in the Basic Multilingual Plane, i.e. in
 * U+0001..U+FFFF. The NUL codepoint is rejected on purpose.
 */
bool
UnistrIsBMP(uint32_t u)
{
    if (u == 0)
    {
        return false;
    }
    return u <= 0xFFFF;
}
/**
 * Creates a new Unistr holding, in order, every codepoint of `str` for
 * which `filter` returns a non-zero value.
 *
 * Returns NULL when either argument is NULL. The input string is only read
 * through UnistrSize() and UnistrGetch().
 */
Unistr *
UnistrFilter(Unistr *str, UnistrFilterFunc filter)
{
    Unistr *kept;
    size_t pos;

    if (!str || !filter)
    {
        return NULL;
    }

    kept = UnistrCreate("");
    for (pos = 0; pos < UnistrSize(str); pos++)
    {
        uint32_t cp = UnistrGetch(str, pos);

        if (filter(cp))
        {
            UnistrAddch(kept, cp);
        }
    }

    return kept;
}
/**
 * Joins `n` Unistr arguments, in order, into a newly created Unistr.
 *
 * Every variadic argument must be a `Unistr *`. A NULL argument adds
 * nothing, because UnistrSize() reports 0 for it.
 */
Unistr *
UnistrConcat(size_t n, ...)
{
    Unistr *joined = UnistrCreate("");
    va_list args;
    size_t remaining;

    va_start(args, n);
    for (remaining = n; remaining > 0; remaining--)
    {
        Unistr *part = va_arg(args, Unistr *);
        size_t pos = 0;

        while (pos < UnistrSize(part))
        {
            UnistrAddch(joined, UnistrGetch(part, pos));
            pos++;
        }
    }
    va_end(args);

    return joined;
}
/**
 * Finds the first line of `str` that does not begin with `sep`.
 *
 * A line begins at index 0 and right after every 0x0A (line feed)
 * codepoint. The function returns the index of the first line-starting
 * codepoint that differs from `sep`.
 *
 * Returns 0 when `str` is NULL, when `sep` is 0, or when no such line
 * exists; this is indistinguishable from a match at index 0.
 */
size_t
UnistrGetOffset(Unistr *str, uint32_t sep)
{
    size_t pos;
    bool at_line_start = true; /* Index 0 always opens a line. */

    if (!str || !sep)
    {
        return 0;
    }

    for (pos = 0; pos < str->length; pos++)
    {
        uint32_t here = str->codepoints[pos];

        if (at_line_start && here != sep)
        {
            return pos;
        }
        /* The next codepoint opens a line only if this one is a line
         * feed. */
        at_line_start = (here == 0x0A);
    }

    return 0;
}