[MOD/WIP] Mess a bit with the XEP-0393 parser

It took a comical amount of time for me to do that LMAO
This commit is contained in:
LDA 2024-08-01 10:31:59 +02:00
commit cb0e77e7a4
5 changed files with 345 additions and 148 deletions

View file

@ -1,7 +1,10 @@
#include <XEP393.h>
#include <StringSplit.h>
#include <Cytoplasm/Memory.h>
#include <Cytoplasm/Array.h>
#include <Cytoplasm/Str.h>
#include <Cytoplasm/Log.h>
#include <string.h>
@ -62,190 +65,223 @@ XEP393FreeElement(XEP393Element *element)
XEP393FreeElementBase(element, false);
}
/* A window over a character buffer, delimited by two pointers.
 *
 * The view covers the half-open range [start, end): `end` points one
 * past the last character, so `end - start` is the number of characters
 * in the view. */
typedef struct StrView {
    char *start;    /* First character covered by the view. */
    char *end;      /* One past the last character covered by the view. */
    bool heap_free; /* NOTE(review): presumably marks views whose buffer
                     * has to be freed by the holder - confirm against
                     * the users of this flag. */
} StrView;

/* Number of characters covered by the view `v`.
 *
 * The argument is parenthesised so that any expression yielding a
 * StrView (for example `*ptr`) expands correctly. `v` is evaluated
 * twice, so it must not have side effects. */
#define ViewLength(v) ((size_t) (((v).end) - ((v).start)))
static char *
StringifyView(StrView v)
static StringRect
DecodeQuote(StringRect rect, size_t *skip)
{
char *r;
size_t len;
if (!v.start || v.start > v.end)
StringRect ret = StrFullRect(NULL);
int lines = 0;
/* C abuse of chaining operations */
while ((StrGet(rect, lines, 0) == '>') && ++lines)
{
return NULL;
}
len = ViewLength(v);
r = Malloc(len + 1);
memcpy(r, v.start, len);
r[len] = '\0';
return r;
}
static StrView
CreateStaticView(char *str)
{
StrView view = {
.start = str,
.end = str + strlen(str),
.heap_free = false
};
return view;
}
static bool
IdentifySpans(char span_tag, char end_tag, StrView in, StrView *view)
{
size_t length;
bool found = false, equal, flag;
char prev = '\0';
if (in.start >= in.end)
{
return false;
}
if (ViewLength(in) < 2)
{
return false;
}
equal = span_tag == end_tag;
flag = equal && isspace(*(in.start + 1));
if (*in.start != span_tag || flag)
{
/* The opening styling directive MUST NOT be followed
* by a whitespace character */
return false;
}
view->start = in.start + 1;
in.start += 1;
for (length = 0; ViewLength(in) > 0; length++, in.start++)
{
if (*in.start == end_tag)
if (!ret.source_lines)
{
found = true;
break;
int shift_by = 1, ch;
ret = rect;
ret.end_line = 0;
while ((ch = StrGet(rect, lines - 1, shift_by)) && isspace(ch))
{
shift_by++;
}
if (ch)
{
ret = StrShift(ret, shift_by);
}
continue;
}
ret.end_line++;
}
if (!lines)
{
return StrFullRect(NULL);
}
if (skip)
{
*skip = lines;
}
return ret;
}
static StringRect
DecodeSpan(StringRect rect, char del, size_t *skip)
{
StringRect ret = StrFullRect(NULL);
int chars = 0;
char c;
if (StrGet(rect, 0, 0) != del)
{
return ret;
}
rect = StrShift(rect, 1);
/* C abuse of chaining operations */
while (((c = StrGet(rect, 0, chars)) != del) && ++chars)
{
if (!c)
{
return StrFullRect(NULL);
}
if (!ret.source_lines && isspace(c))
{
return StrFullRect(NULL);
}
prev = *in.start;
if (!ret.source_lines)
{
ret = rect;
ret.end_char = ret.start_char;
continue;
}
ret.end_char++;
}
if (!found || !length || (prev && equal && isspace(prev)))
ret.end_char++;
if (!chars)
{
/* the closing styling directive MUST NOT be preceded
* by a whitespace character. */
return false;
return StrFullRect(NULL);
}
view->end = in.start;
return true;
{
char *temp, *gen = NULL, chara[2] = { 0, '\0' };
size_t i;
for (i = 0; i < StrViewChars(ret, 0); i++)
{
*chara = StrGet(ret, 0, i);
if (!*chara)
{
break;
}
temp = gen;
gen = StrConcat(2, gen, chara);
Free(temp);
}
Free(gen);
}
if (skip)
{
*skip = chars;
}
return ret;
}
#define IdentifySpan(span_tag, in, view) IdentifySpans(span_tag, span_tag, in, view)
#define BLOCK_QUOTE (1 << 0)
#define BLOCK_CODES (1 << 1)
static void
XEP393Decode(StrView view, XEP393Element *root)
ParseLine(XEP393Element *elem, StringRect line)
{
StrView subview = view;
StrView textview = view;
XEP393Element *text, *span;
bool managed = false;
char prev = '\0', curr = '\0';
XEP393Element *span_item, *line_item;
StringRect shifted;
size_t ch_idx, chars = StrViewChars(line, 0);
size_t text_start = 0;
size_t i;
textview.end = subview.start;
for (; subview.start < subview.end; subview.start++)
for (ch_idx = 0; ch_idx < chars; ch_idx++)
{
bool sol = false;
StrView span_view;
managed = false;
curr = *subview.start;
if (prev == '\0' || prev == '\n')
{
/* TODO: Start of line, start parsing blocks. */
sol = true;
}
#define Spanify(xep_symbol) \
managed = true; \
textview.end = subview.start; \
text = CreateElementVessel( \
root, XEP393_TEXT \
); \
text->text_data = StringifyView(textview); \
\
/* Found a span. */ \
span = CreateElementVessel( \
root, xep_symbol \
); \
\
XEP393Decode(span_view, span); \
\
/* Update subview */ \
subview.start = span_view.end + 1; \
\
/* Update textview */ \
textview.start = subview.start; \
textview.end = subview.start
if (IdentifySpan('_', subview, &span_view))
{
Spanify(XEP393_ITALIC);
}
else if (IdentifySpan('*', subview, &span_view))
{
Spanify(XEP393_EMPH);
}
else if (IdentifySpan('`', subview, &span_view))
{
Spanify(XEP393_MONO);
}
else if (curr == '\n')
{
/* TODO: Remove this */
span_view.start = subview.start;
span_view.end = subview.start;
Spanify(XEP393_NL);
}
else if (sol && IdentifySpans('>', '\n', subview, &span_view))
{
/* TODO: This doesn't work with more than one line of quotes. */
Spanify(XEP393_QUOT);
}
else
{
/* Text character: update end */
textview.end = subview.start;
char curr = StrGet(line, 0, ch_idx);
StringRect span;
shifted = line;
shifted.start_char += ch_idx;
#define HandleSpan(del, sym) \
if (curr == del && \
(span = DecodeSpan(shifted, del, NULL)).source_lines) \
{ \
size_t text_end = ch_idx; \
\
{ \
char *temp, *gen = NULL, chara[2] = { 0, '\0' }; \
for (i = text_start; i < text_end; i++) \
{ \
*chara = StrGet(line, 0, i); \
\
temp = gen; \
gen = StrConcat(2, gen, chara); \
Free(temp); \
} \
line_item = CreateElementVessel(elem, XEP393_TEXT); \
line_item->text_data = gen; \
} \
\
span_item = CreateElementVessel(elem, sym); \
ParseLine(span_item, span); \
text_start = span.end_char - line.start_char + 1; \
ch_idx = span.end_char; \
continue; \
}
prev = curr;
HandleSpan('*', XEP393_EMPH);
HandleSpan('_', XEP393_ITALIC);
HandleSpan('~', XEP393_SRKE);
HandleSpan('`', XEP393_MONO);
}
if (!managed)
{
textview.end = subview.start;
text = CreateElementVessel(
root, XEP393_TEXT
);
text->text_data = StringifyView(textview);
char *temp, *gen = NULL, chara[2] = { 0, '\0' };
for (i = text_start; i < chars; i++)
{
*chara = StrGet(line, 0, i);
temp = gen;
gen = StrConcat(2, gen, chara);
Free(temp);
}
line_item = CreateElementVessel(elem, XEP393_TEXT);
line_item->text_data = gen;
}
}
/* Walks `region` line by line and appends one child element to `root`
 * per block that is recognised.
 *
 * root:   element that receives the parsed children.
 * region: the lines to parse.
 * flags:  bitmask of BLOCK_* values. BLOCK_QUOTE enables block quote
 *         detection. While BLOCK_CODES is set, lines that are not
 *         quotes are not emitted at all (see the TODO below).
 *
 * A line starting with '>' is handed to DecodeQuote(), which reports
 * through `jump_by` how many lines the quote spans. The quote's content
 * is parsed recursively into a XEP393_QUOT element and the consumed
 * lines are skipped. Every other line becomes a XEP393_LINE element
 * filled in by ParseLine(). */
static void
XEP393Parse(XEP393Element *root, StringRect region, int flags)
{
    size_t i, lines = StrViewLines(region);

    for (i = 0; i < lines; i++)
    {
        /* NOTE(review): the boolean presumably selects whether the
         * returned rectangle extends past line i - confirm against
         * StrGetl()'s definition. */
        StringRect extend_line = StrGetl(region, i, true);
        StringRect single_line = StrGetl(region, i, false);
        XEP393Element *sub;

        if ((flags & BLOCK_QUOTE) && (StrGet(single_line, 0, 0) == '>'))
        {
            size_t jump_by = 0;
            StringRect quote = DecodeQuote(extend_line, &jump_by);

            /* DecodeQuote() leaves `jump_by` untouched when it matches
             * no line. Computing `jump_by - 1` would then wrap around
             * and the loop would never advance, so only treat this as
             * a quote when at least one line was actually consumed. */
            if (jump_by > 0)
            {
                sub = CreateElementVessel(root, XEP393_QUOT);
                XEP393Parse(sub, quote, flags);

                /* The loop increment accounts for the last quote line. */
                i += jump_by - 1;
                continue;
            }
            /* Not a quote after all: handle it as a regular line. */
        }

        /* TODO: Parse the single line properly. */
        if (!(flags & BLOCK_CODES))
        {
            sub = CreateElementVessel(root, XEP393_LINE);
            ParseLine(sub, single_line);
        }
    }
}
XEP393Element *
XEP393(char *message)
{
StrView view = CreateStaticView(message);
char **lines = StrSplitLines(message);
StringRect view = StrFullRect(lines);
XEP393Element *root = CreateElementVessel(NULL, XEP393_ROOT);
/* TODO: Parse blocks first, *then* spans. Considering the
* current architecture, this shouldn't be too hard to integrate,
* given how string views already manage boundaries, and elements
* can already be used to contain blocks I think.
*
* Actually, nevermind, these would be pure pain. Nested blocks,
* unterminated ones, QUOTES. Just hell. I hate parsing this shit. */
XEP393Decode(view, root);
XEP393Parse(root, view, BLOCK_QUOTE);
StrFreeLines(lines);
return root;
}
@ -267,8 +303,13 @@ ShoveXML(XEP393Element *element, XMLElement *xmlparent)
head = XMLCreateTag("i");
XMLAddChild(xmlparent, head);
break;
case XEP393_NL:
XMLAddChild(xmlparent, XMLCreateTag("br"));
case XEP393_LINE:
head = XMLCreateTag("p");
XMLAddChild(xmlparent, head);
break;
case XEP393_SRKE:
head = XMLCreateTag("s");
XMLAddChild(xmlparent, head);
break;
case XEP393_QUOT:
head = XMLCreateTag("blockquote");