#include #include #include #include #include #include #include struct Unistr { size_t length; uint32_t *codepoints; }; void UnistrAddch(Unistr *unistr, uint32_t u) { if (!unistr || !u) { return; } unistr->length++; unistr->codepoints = Realloc( unistr->codepoints, unistr->length * sizeof(*unistr->codepoints) ); unistr->codepoints[unistr->length - 1] = u; } static bool UTFIsN(char *off, size_t available, size_t n, uint8_t pc) { size_t i; uint8_t *offu = (uint8_t *) off; if (((available < n) || ((*offu >> (8-n-1)) != pc)) && (n >= 1)) { return false; } for (i = 0; i < n - 1; i++) { if ((offu[i+1] >> 6) != 0x2) { return false; } } return true; } Unistr * UnistrCreate(char *src) { size_t len, i; Unistr *str; if (!src) { return NULL; } len = strlen(src); str = Malloc(sizeof(*str)); str->length = 0; str->codepoints = NULL; /* We can't just set the length to {len}. */ for (i = 0; i < len; i++) { char byte = src[i]; size_t available = len - i; if ((byte & 0x80) == 0) { /* This is a regular codepoint */ UnistrAddch(str, byte & 0x7F); continue; } else if (UTFIsN(&src[i], available, 2, 0x06)) { char a = src[i+0] & 0x1F; char b = src[i+1] & 0x3F; uint32_t u = (a << (6 * 1)) | b; /* Overlongs are errors. */ if (u < 0x0080 || u > 0x07FF) { UnistrFree(str); return NULL; } UnistrAddch(str, u); i += 2 - 1; continue; } else if (UTFIsN(&src[i], available, 3, 0x0E)) { char a = src[i+0] & 0x0F; char b = src[i+1] & 0x3F; char c = src[i+2] & 0x3F; uint32_t u = (a << (6 * 2)) | (b << (6 * 1)) | (c << (6 * 0)) ; /* Overlongs are errors. */ if (u < 0x0800 || u > 0xFFFF) { UnistrFree(str); return NULL; } UnistrAddch(str, u); i += 3 - 1; continue; } else if (UTFIsN(&src[i], available, 4, 0x1E)) { char a = src[i+0] & 0x07; char b = src[i+1] & 0x3F; char c = src[i+2] & 0x3F; char d = src[i+3] & 0x3F; uint32_t u = (a << (6 * 3)) | (b << (6 * 2)) | (c << (6 * 1)) | (d << (6 * 0)) ; /* Overlongs are errors. */ if (u < 0x10000 || u > 0x10FFFF) { UnistrFree(str); return NULL; } UnistrAddch(str, u); i += 4 - 1; continue; } } return str; } void UnistrFree(Unistr *unistr) { if (!unistr) { return; } Free(unistr->codepoints); Free(unistr); } char * UnistrC(Unistr *unistr) { char *ret, *tmp, *utf; size_t i; if (!unistr) { return NULL; } ret = NULL; for (i = 0; i < unistr->length; i++) { uint32_t code = unistr->codepoints[i]; utf = StrUtf8Encode(code); tmp = ret; ret = StrConcat(2, ret, utf); Free(tmp); Free(utf); } return ret; } size_t UnistrSize(Unistr *unistr) { return unistr ? unistr->length : 0; } uint32_t UnistrGetch(Unistr *unistr, size_t i) { if (!unistr) { return 0; } return i < unistr->length ? unistr->codepoints[i] : 0; } bool UnistrIsASCII(uint32_t u) { if (u == 0) { return NULL; } return u < 0x7F; } bool UnistrIsBMP(uint32_t u) { if (u == 0) { return NULL; } return u <= 0xFFFF; } Unistr * UnistrFilter(Unistr *str, UnistrFilterFunc filter) { Unistr *unistr; size_t i; if (!str || !filter) { return NULL; } unistr = UnistrCreate(""); for (i = 0; i < UnistrSize(str); i++) { uint32_t code = UnistrGetch(str, i); if (!filter(code)) { continue; } UnistrAddch(unistr, code); } return unistr; } Unistr * UnistrConcat(size_t n, ...) { va_list list; size_t i; Unistr *ret = UnistrCreate(""); va_start(list, n); for (i = 0; i < n; i++) { Unistr *to_concat = va_arg(list, Unistr *); size_t j; for (j = 0; j < UnistrSize(to_concat); j++) { UnistrAddch(ret, UnistrGetch(to_concat, j)); } } va_end(list); return ret; } size_t UnistrGetOffset(Unistr *str, uint32_t sep) { size_t i; uint32_t prev = 0x0A; if (!str || !sep) { return 0; } for (i = 0; i < str->length; i++) { uint32_t curr = str->codepoints[i]; if (prev == 0x0A && curr != sep) { return i; } prev = curr; } return 0; }