[ADD/WIP] Start making a simple SAX parser, ASwerk

This commit is contained in:
LDA 2024-06-15 12:29:34 +02:00
commit 79217d3608
14 changed files with 1066 additions and 26 deletions

702
src/XML/SAX.c Normal file
View file

@ -0,0 +1,702 @@
#include <XML.h>
#include <Cytoplasm/HashMap.h>
#include <Cytoplasm/Memory.h>
#include <Cytoplasm/Array.h>
#include <Cytoplasm/Str.h>
#include <Cytoplasm/Log.h>
#include <string.h>
#include <ctype.h>
/* State of the streaming XML lexer that XMLCrank() advances. */
struct XMLexer {
/* Stream the lexer reads its input from. */
Stream *stream;
/* When true, XMLFreeLexer() also closes the stream. */
bool autofree;
/* Record of the characters read through XMLGetc() while recording is
 * active, so that they can be pushed back onto the stream or turned
 * into a string. NULL while recording is inactive. */
int *buffer;
/* Number of elements allocated for the buffer; -1 means that
 * recording is inactive. */
ssize_t length;
/* Number of characters currently recorded in the buffer; -1 while
 * recording is inactive. */
ssize_t top;
/* What XMLCrank() expects to read next. */
enum {
/* Outside of any markup: character data or the start of markup. */
XML_STATE_NONE = 0,
/* Right after "<!--", until "-->" is found. */
XML_STATE_COMMENT,
/* Right after "<": an element name and its attributes follow. */
XML_STATE_ATTR,
/* NOTE(review): not referenced by the code visible in this file. */
XML_STATE_ATTRHEAD_PROP,
/* Right after "</": the name of the closing element follows. */
XML_STATE_ATTRTAIL
} state;
struct {
/* Stack of the names of the currently open elements (innermost
 * last); the strings are freed by XMLFreeLexer(). */
Array *elements;
/* Character data accumulated so far and not yet handed out in a
 * data event, or NULL. */
char *str;
} data;
};
/* "Looks ahead" in the XML stream for a NUL-terminated string.
 * If it was found and skip is set, then also skips over it. */
static bool XMLookahead(XMLexer *lexer, const char *str, bool skip);
/* NOTE(review): no definition of XMLIsStart appears in the visible
 * part of this file. */
static bool XMLIsStart(XMLexer *lexer);
/* Parses an XML "name"; returns a heap-allocated string, or NULL
 * (with the input pushed back) if no name starts here. */
static char * XMLParseName(XMLexer *lexer);
/* Skips over whitespace and returns whether any was skipped. */
static bool XMLSkipSpace(XMLexer *lexer);
/* Parses a single- or double-quoted attribute value; NULL on failure. */
static char * XMLParseAttValue(XMLexer *lexer);
/* Starts recording read characters (if not already recording) and
 * returns the current position in the record. */
static ssize_t XMLInitialiseBuffer(XMLexer *lexer);
/* Reads one character from the stream, recording it if recording is
 * active. */
static int XMLGetc(XMLexer *lexer);
/* Pushes one character back onto the stream. */
static void XMLUngetc(XMLexer *lexer, int ch);
/* Stops recording and frees the record. */
static void XMLEndBuffer(XMLexer *lexer);
/* Pushes every recorded character back onto the stream, then stops
 * recording. */
static void XMLRollback(XMLexer *lexer);
/* Pushes the characters recorded from position loc onwards back onto
 * the stream, and keeps recording. */
static void XMLReset(XMLexer *lexer, ssize_t loc);
/* Turns the characters recorded from position point onwards into a
 * heap-allocated string, then stops recording. */
static char * XMLStringify(XMLexer *lexer, ssize_t point);
/* Reads the attributes of a start tag into a new map of
 * name -> heap-allocated value. */
static HashMap * XMLReadProps(XMLexer *l);
/* Element-name stack helpers; the stack takes ownership of e, and the
 * popped string belongs to the caller. */
static void XMLPushElement(XMLexer *lexer, char *e);
static char * XMLPopElement(XMLexer *lexer);
/* Event constructors; each returns a heap-allocated event. */
static XMLEvent * XMLCreateEmptyElem(XMLexer *lexer, HashMap *attrs);
static XMLEvent * XMLCreateStart(XMLexer *lexer, HashMap *attrs);
static XMLEvent * XMLCreateRelax(XMLexer *lexer);
static XMLEvent * XMLCreateEnd(XMLexer *lexer, char *end);
static XMLEvent * XMLCreateData(XMLexer *lexer);
/* Creates a lexer that reads XML from the given stream.
 *
 * If autofree is set, XMLFreeLexer() closes the stream together with
 * the lexer. Returns NULL if no stream was given or if memory could
 * not be allocated; in that case the stream is left untouched, even
 * when autofree is set. */
XMLexer *
XMLCreateLexer(Stream *stream, bool autofree)
{
    XMLexer *lexer;

    if (!stream)
    {
        return NULL;
    }

    lexer = Malloc(sizeof(*lexer));
    if (!lexer)
    {
        return NULL;
    }

    lexer->stream = stream;
    lexer->autofree = autofree;
    lexer->state = XML_STATE_NONE;

    /* Recording is inactive until XMLInitialiseBuffer() is called. */
    lexer->buffer = NULL;
    lexer->length = -1;
    lexer->top = -1;

    lexer->data.str = NULL;
    lexer->data.elements = ArrayCreate();
    if (!lexer->data.elements)
    {
        /* Every event constructor reads the element stack, so a lexer
         * without one is unusable. */
        Free(lexer);
        return NULL;
    }

    return lexer;
}
/* Releases a lexer and everything it owns: the recording buffer, any
 * pending character data, the names left on the element stack and,
 * when the lexer was created with autofree, the stream itself.
 * Passing NULL is allowed and does nothing. */
void
XMLFreeLexer(XMLexer *lexer)
{
    Array *stack;

    if (lexer == NULL)
    {
        return;
    }

    if (lexer->autofree)
    {
        StreamClose(lexer->stream);
    }

    if (lexer->buffer != NULL)
    {
        Free(lexer->buffer);
    }

    if (lexer->data.str != NULL)
    {
        Free(lexer->data.str);
    }

    stack = lexer->data.elements;
    if (stack != NULL)
    {
        size_t count = ArraySize(stack);
        size_t idx = 0;

        /* The stack owns the names of the elements still open. */
        while (idx < count)
        {
            Free(ArrayGet(stack, idx));
            idx++;
        }
        ArrayFree(stack);
    }

    Free(lexer);
}
XMLEvent *
XMLCrank(XMLexer *lexer)
{
XMLEvent *event = NULL;
char c;
char *attrname;
HashMap *props;
char *key, *val;
char cbuf[2] = { 0 };
char *tmp;
if (!lexer)
{
return NULL;
}
if (StreamEof(lexer->stream))
{
return NULL;
}
event = XMLCreateRelax(lexer);
switch (lexer->state)
{
case XML_STATE_NONE:
if (XMLookahead(lexer, "<!--", true))
{
if (lexer->data.str)
{
XMLFreeEvent(event);
event = XMLCreateData(lexer);
}
lexer->state = XML_STATE_COMMENT;
break;
}
else if (XMLookahead(lexer, "</", true))
{
if (lexer->data.str)
{
XMLFreeEvent(event);
event = XMLCreateData(lexer);
}
lexer->state = XML_STATE_ATTRTAIL;
break;
}
else if (XMLookahead(lexer, "<", true))
{
if (lexer->data.str)
{
XMLFreeEvent(event);
event = XMLCreateData(lexer);
}
lexer->state = XML_STATE_ATTR;
break;
}
/* TODO: Try storing character into str buffer. */
cbuf[0] = XMLGetc(lexer);
tmp = lexer->data.str;
lexer->data.str = StrConcat(2, tmp, cbuf);
Free(tmp);
break;
case XML_STATE_COMMENT:
if (XMLookahead(lexer, "-->", true))
{
lexer->state = XML_STATE_NONE;
}
else if (XMLookahead(lexer, "--", false))
{
/* Throw error */
return NULL;
}
break;
case XML_STATE_ATTR:
attrname = XMLParseName(lexer);
if (!attrname)
{
/* TODO: Throw error */
}
XMLPushElement(lexer, attrname);
props = XMLReadProps(lexer);
XMLSkipSpace(lexer);
if (XMLookahead(lexer, "/>", true))
{
lexer->state = XML_STATE_NONE;
XMLFreeEvent(event);
event = XMLCreateEmptyElem(lexer, props);
Free(XMLPopElement(lexer));
break;
}
else if (XMLookahead(lexer, ">", true))
{
lexer->state = XML_STATE_NONE;
XMLFreeEvent(event);
event = XMLCreateStart(lexer, props);
break;
}
break;
case XML_STATE_ATTRTAIL:
attrname = XMLParseName(lexer);
Free(XMLPopElement(lexer));
if (!XMLookahead(lexer, ">", true))
{
/* TODO: Throw error. */
break;
}
lexer->state = XML_STATE_NONE;
XMLFreeEvent(event);
event = XMLCreateEnd(lexer, attrname);
break;
}
/* TODO: Crank our XML parser. */
return event;
}
/* Releases an event together with its element name, its character
 * data and its attribute map, including every value stored in the
 * map. Passing NULL is allowed and does nothing. */
void
XMLFreeEvent(XMLEvent *event)
{
    if (event == NULL)
    {
        return;
    }

    if (event->element != NULL)
    {
        Free(event->element);
    }

    if (event->attrs != NULL)
    {
        char *name;
        void *value;

        /* The map does not own its values, so free them one by one. */
        while (HashMapIterate(event->attrs, &name, &value))
        {
            Log(LOG_INFO, "Trying to free %s", value);
            Free(value);
        }
        HashMapFree(event->attrs);
    }

    if (event->data != NULL)
    {
        Free(event->data);
    }

    Free(event);
}
/* Checks whether the upcoming input is exactly the string str.
 *
 * When the string matches and skip is set, the matched characters stay
 * consumed. In every other case all characters read by this function
 * are pushed back, in reverse order, so the stream is left where it
 * was. Returns true on a match; an empty string always matches.
 * Returns false without reading anything if lexer or str is NULL, or
 * if the scratch memory cannot be allocated. */
static bool
XMLookahead(XMLexer *lexer, const char *str, bool skip)
{
    int *seen;
    size_t len, count, i;
    bool matched = true;

    if (!lexer || !str)
    {
        return false;
    }

    len = strlen(str);
    if (len == 0)
    {
        return true;
    }

    seen = Malloc(len * sizeof(*seen));
    if (!seen)
    {
        return false;
    }

    count = 0;
    for (i = 0; i < len; i++)
    {
        int got = XMLGetc(lexer);

        /* Remember mismatching characters (and EOF) as well, so that
         * they get pushed back along with the rest. */
        seen[count++] = got;

        /* Compare as unsigned char: this assumes the stream yields
         * byte values in 0..255 like fgetc() does, which a plain
         * (possibly signed) char above 0x7F would never equal. */
        if (got == EOF || got != (unsigned char) str[i])
        {
            matched = false;
            break;
        }
    }

    if (!matched || !skip)
    {
        while (count > 0)
        {
            XMLUngetc(lexer, seen[--count]);
        }
    }

    Free(seen);
    return matched;
}
/* Character classes for XML names, limited to what <ctype.h> can
 * classify. A name starts with a letter, ':' or '_', and continues
 * with those, digits, '-' or '.'. */
#define IsNamestart(c) (((c) == ':') || isalpha(c) || ((c) == '_'))
#define IsNamepart(c) (IsNamestart(c) || ((c) == '-') || ((c) == '.') || isdigit(c))
/* Parses an XML name at the current position.
 *
 * Returns the name as a heap-allocated string that the caller frees.
 * If the next character cannot start a name, returns NULL after
 * pushing every recorded character back onto the stream. Recording is
 * stopped in both cases. */
static char *
XMLParseName(XMLexer *lexer)
{
    int c;
    ssize_t point;

    point = XMLInitialiseBuffer(lexer);

    c = XMLGetc(lexer);
    if (!IsNamestart(c))
    {
        XMLRollback(lexer);
        return NULL;
    }

    /* Consume name characters; EOF and NUL are not name characters,
     * so either one ends the loop as well. */
    do
    {
        c = XMLGetc(lexer);
    }
    while (IsNamepart(c));

    /* The character that ended the name is not part of it. */
    XMLUngetc(lexer, c);

    return XMLStringify(lexer, point);
}
/* Makes sure that the characters read from now on are recorded.
 *
 * If recording is already active, nothing changes and the current
 * position in the record is returned. Otherwise an empty record is
 * started and 0 is returned. Returns -1 if no lexer was given. */
static ssize_t
XMLInitialiseBuffer(XMLexer *lexer)
{
    if (lexer == NULL)
    {
        return -1;
    }

    if (lexer->length == -1)
    {
        /* Not recording yet: start from scratch. The storage itself
         * is allocated by the first XMLGetc() call. */
        lexer->buffer = NULL;
        lexer->top = 0;
        lexer->length = 0;
        return 0;
    }

    return lexer->top;
}
/* Reads the next character from the lexer's stream and returns it.
 * While recording is active, the character (whatever it is) is also
 * appended to the record, which grows 8 elements at a time. */
static int
XMLGetc(XMLexer *lexer)
{
    int got = StreamGetc(lexer->stream);

    if (lexer->length != -1)
    {
        if (lexer->top >= lexer->length)
        {
            lexer->length += 8;
            lexer->buffer = Realloc(
                lexer->buffer,
                lexer->length * sizeof(*lexer->buffer)
            );
        }
        lexer->buffer[lexer->top] = got;
        lexer->top++;
    }

    return got;
}
/* Undoes the most recent XMLGetc(): pushes ch back onto the stream
 * and, while recording is active, drops the last recorded character.
 *
 * EOF is dropped from the record but never pushed onto the stream,
 * mirroring what ungetc() does, so that it cannot come back as a data
 * byte. Does nothing if no lexer was given. */
static void
XMLUngetc(XMLexer *lexer, int ch)
{
    if (!lexer)
    {
        return;
    }

    if (ch != EOF)
    {
        StreamUngetc(lexer->stream, ch);
    }

    /* An empty record only means that ch was read before recording
     * started; it still has to go back to the stream (done above). */
    if (lexer->length != -1 && lexer->top > 0)
    {
        lexer->top--;
    }
}
/* Stops recording and frees the record. The recorded characters stay
 * consumed. Does nothing if no lexer was given or if recording is not
 * active. */
static void
XMLEndBuffer(XMLexer *lexer)
{
    if (lexer == NULL)
    {
        return;
    }
    if (lexer->length == -1)
    {
        return;
    }

    Free(lexer->buffer);
    lexer->buffer = NULL;

    /* -1 in both fields marks recording as inactive. */
    lexer->length = -1;
    lexer->top = -1;
}
/* Pushes every recorded character back onto the stream, last one
 * first, and then stops recording. Does nothing if no lexer was given
 * or if recording is not active. */
static void
XMLRollback(XMLexer *lexer)
{
    ssize_t remaining;

    if (lexer == NULL || lexer->length == -1)
    {
        return;
    }

    remaining = lexer->top;
    while (remaining-- > 0)
    {
        StreamUngetc(lexer->stream, lexer->buffer[remaining]);
    }

    XMLEndBuffer(lexer);
}
/* Rewinds the input to position loc of the record: every character
 * recorded at loc or later is pushed back onto the stream, last one
 * first. Recording stays active, with loc as the new position. Does
 * nothing if no lexer was given or if recording is not active. */
void
XMLReset(XMLexer *lexer, ssize_t loc)
{
    ssize_t pos;

    if (lexer == NULL || lexer->length == -1)
    {
        return;
    }

    for (pos = lexer->top; pos > loc; pos--)
    {
        StreamUngetc(lexer->stream, lexer->buffer[pos - 1]);
    }

    lexer->top = loc;
}
/* Copies the characters recorded from position point onwards into a
 * new NUL-terminated string and stops recording. The caller frees the
 * string. Returns NULL if no lexer was given or if recording is not
 * active. */
char *
XMLStringify(XMLexer *lexer, ssize_t point)
{
    char *out;
    char *dst;
    ssize_t src;

    if (lexer == NULL || lexer->length == -1)
    {
        return NULL;
    }

    /* Zero-filled, so the copy below is terminated automatically. */
    out = Malloc(lexer->top + 1);
    memset(out, '\0', lexer->top + 1);

    dst = out;
    for (src = point; src < lexer->top; src++)
    {
        *dst++ = (char) lexer->buffer[src];
    }

    XMLEndBuffer(lexer);
    return out;
}
/* XML whitespace: space, tab, carriage return and line feed. */
#define IsSpace(c) (((c) == ' ') || ((c) == 0x09) || ((c) == 0x0D) || ((c) == 0x0A))
/* Consumes whitespace up to the next non-whitespace character (or the
 * end of the input), which is handed to XMLUngetc(). Returns true if
 * at least one whitespace character was consumed. */
bool
XMLSkipSpace(XMLexer *lexer)
{
    bool skipped = false;
    int next;

    for (;;)
    {
        next = XMLGetc(lexer);
        if (next == EOF || !IsSpace(next))
        {
            break;
        }
        skipped = true;
    }

    XMLUngetc(lexer, next);
    return skipped;
}
/* Reads the attributes of a start tag: zero or more whitespace-led
 * `name` or `name=value` pairs, where value is a quoted string.
 *
 * Returns a new map from attribute name to heap-allocated value; an
 * attribute without "=" maps to an empty string, and a repeated name
 * keeps its last value. The caller owns the map and its values.
 * Reading stops at the first thing that is not an attribute. If a
 * value is malformed, whatever is still recorded is pushed back and
 * the attributes read so far are returned. Returns NULL if no lexer
 * was given or if the map cannot be created. */
HashMap *
XMLReadProps(XMLexer *lexer)
{
    ssize_t point;
    HashMap *map;
    bool ownsBuffer;
    bool error = false;

    if (!lexer)
    {
        return NULL;
    }

    /* Only stop recording on the way out if it was started here. */
    ownsBuffer = (lexer->length == -1);
    point = XMLInitialiseBuffer(lexer);

    map = HashMapCreate();
    if (!map)
    {
        if (ownsBuffer)
        {
            XMLEndBuffer(lexer);
        }
        return NULL;
    }

    while (true)
    {
        char *name;
        char *value = NULL;
        void *previous;

        if (!XMLSkipSpace(lexer))
        {
            /* A lack of space is totally expected */
            break;
        }
        name = XMLParseName(lexer);
        if (!name)
        {
            /* A lack of name is totally expected */
            break;
        }

        if (XMLookahead(lexer, "=", true))
        {
            value = XMLParseAttValue(lexer);
            if (!value)
            {
                error = true;
                Free(name);
                break;
            }
        }
        else
        {
            /* An attribute without a value maps to "". */
            value = StrDuplicate("");
        }

        /* HashMapSet() hands back the value it replaced; free it so
         * that a repeated attribute name does not leak. */
        previous = HashMapSet(map, name, value);
        if (previous)
        {
            Free(previous);
        }
        Free(name);
    }

    if (error)
    {
        XMLReset(lexer, point);
    }
    if (ownsBuffer)
    {
        /* Otherwise everything read afterwards would keep piling up
         * in the record. No-op if recording has already stopped. */
        XMLEndBuffer(lexer);
    }
    return map;
}
/* Builds the event for a start tag. The element name is a copy of the
 * name on top of the element stack; the event takes ownership of the
 * attribute map. */
static XMLEvent *
XMLCreateStart(XMLexer *lexer, HashMap *attrs)
{
    Array *stack = lexer->data.elements;
    XMLEvent *ev = Malloc(sizeof(*ev));
    size_t last = ArraySize(stack) - 1;

    ev->type = XML_LEXER_STARTELEM;
    ev->element = StrDuplicate(ArrayGet(stack, last));
    ev->attrs = attrs;
    ev->data = NULL;

    /* TODO: position tracking */
    ev->line = 0;
    ev->col = 0;
    ev->offset = 0;

    return ev;
}
/* Builds the event for an end tag. The event takes ownership of the
 * given name; the lexer is not consulted. */
XMLEvent *
XMLCreateEnd(XMLexer *lexer, char *end)
{
    XMLEvent *ev;

    (void) lexer;

    ev = Malloc(sizeof(*ev));
    ev->type = XML_LEXER_ENDELEM;
    ev->element = end;
    ev->attrs = NULL;
    ev->data = NULL;

    /* TODO: position tracking */
    ev->line = 0;
    ev->col = 0;
    ev->offset = 0;

    return ev;
}
/* Builds the event for an empty-element tag. The element name is a
 * copy of the name on top of the element stack; the event takes
 * ownership of the attribute map. */
static XMLEvent *
XMLCreateEmptyElem(XMLexer *lexer, HashMap *attrs)
{
    Array *stack = lexer->data.elements;
    XMLEvent *ev = Malloc(sizeof(*ev));
    size_t last = ArraySize(stack) - 1;

    ev->type = XML_LEXER_ELEM;
    ev->element = StrDuplicate(ArrayGet(stack, last));
    ev->attrs = attrs;
    ev->data = NULL;

    /* TODO: position tracking */
    ev->line = 0;
    ev->col = 0;
    ev->offset = 0;

    return ev;
}
/* Builds the event for a run of character data. The pending text
 * moves from the lexer into the event, leaving the lexer with none.
 * The element name is a copy of the name on top of the element stack,
 * or NULL when the stack is empty. */
XMLEvent *
XMLCreateData(XMLexer *lexer)
{
    Array *stack = lexer->data.elements;
    XMLEvent *ev = Malloc(sizeof(*ev));
    size_t depth = ArraySize(stack);

    ev->type = XML_LEXER_DATA;
    if (depth)
    {
        ev->element = StrDuplicate(ArrayGet(stack, depth - 1));
    }
    else
    {
        ev->element = NULL;
    }
    ev->attrs = NULL;

    /* Hand the accumulated text over to the event. */
    ev->data = lexer->data.str;

    /* TODO: position tracking */
    ev->line = 0;
    ev->col = 0;
    ev->offset = 0;

    lexer->data.str = NULL;
    return ev;
}
/* Builds the "nothing to report" event. The element name is a copy of
 * the name on top of the element stack, or NULL when the stack is
 * empty. */
static XMLEvent *
XMLCreateRelax(XMLexer *lexer)
{
    Array *stack = lexer->data.elements;
    XMLEvent *ev = Malloc(sizeof(*ev));
    size_t depth = ArraySize(stack);

    ev->type = XML_RELAX;
    if (depth)
    {
        ev->element = StrDuplicate(ArrayGet(stack, depth - 1));
    }
    else
    {
        ev->element = NULL;
    }
    ev->attrs = NULL;
    ev->data = NULL;

    /* TODO: position tracking */
    ev->line = 0;
    ev->col = 0;
    ev->offset = 0;

    return ev;
}
/* Pushes an element name onto the stack of open elements. The pointer
 * is stored as is, not copied. */
static void
XMLPushElement(XMLexer *lexer, char *e)
{
    Array *stack = lexer->data.elements;

    ArrayAdd(stack, e);
}
/* Removes the innermost element name from the stack of open elements
 * and returns it, or returns NULL when the stack is empty. */
static char *
XMLPopElement(XMLexer *lexer)
{
    Array *stack = lexer->data.elements;
    size_t depth = ArraySize(stack);

    if (depth > 0)
    {
        return ArrayDelete(stack, depth - 1);
    }

    return NULL;
}
/* Characters that may appear as is inside a single-quoted (Q) or a
 * double-quoted (D) attribute value. */
#define IsNormalQ(c) (((c) != '<') && ((c) != '&') && ((c) != '\''))
#define IsNormalD(c) (((c) != '<') && ((c) != '&') && ((c) != '"'))
/* Parses the rest of a single-quoted attribute value; the opening
 * quote has already been consumed by the caller.
 *
 * Returns the value without the closing quote as a heap-allocated
 * string that the caller frees. Returns NULL if the value is cut short
 * by '<', '&', a NUL byte or the end of the input; the offending
 * character is pushed back and recording is left active so that the
 * caller can rewind. */
static char *
XMLParseAttQuote(XMLexer *lexer)
{
    int c;
    ssize_t point;
    char *str;
    size_t len;

    point = XMLInitialiseBuffer(lexer);

    /* EOF has to be tested explicitly: it passes IsNormalQ(), so an
     * unterminated value would otherwise spin here forever. */
    do
    {
        c = XMLGetc(lexer);
    }
    while (c != EOF && c != '\0' && IsNormalQ(c));

    if (c != '\'')
    {
        XMLUngetc(lexer, c);
        return NULL;
    }

    /* TODO: Decode the string */
    str = XMLStringify(lexer, point);
    if (!str)
    {
        return NULL;
    }

    /* The closing quote was recorded last; trim it. */
    len = strlen(str);
    if (len > 0)
    {
        str[len - 1] = '\0';
    }
    return str;
}
/* Parses the rest of a double-quoted attribute value; the opening
 * quote has already been consumed by the caller.
 *
 * Returns the value without the closing quote as a heap-allocated
 * string that the caller frees. Returns NULL if the value is cut short
 * by '<', '&', a NUL byte or the end of the input; the offending
 * character is pushed back and recording is left active so that the
 * caller can rewind. */
static char *
XMLParseAttDouble(XMLexer *lexer)
{
    int c;
    ssize_t point;
    char *str;
    size_t len;

    point = XMLInitialiseBuffer(lexer);

    /* EOF has to be tested explicitly: it passes IsNormalD(), so an
     * unterminated value would otherwise spin here forever. */
    do
    {
        c = XMLGetc(lexer);
    }
    while (c != EOF && c != '\0' && IsNormalD(c));

    if (c != '"')
    {
        XMLUngetc(lexer, c);
        return NULL;
    }

    /* TODO: Decode the string */
    str = XMLStringify(lexer, point);
    if (!str)
    {
        return NULL;
    }

    /* The closing quote was recorded last; trim it. */
    len = strlen(str);
    if (len > 0)
    {
        str[len - 1] = '\0';
    }
    return str;
}
/* Parses an attribute value, which must be enclosed in single or
 * double quotes. Returns the value as a heap-allocated string, or
 * NULL if no quote follows or if the quoted value is malformed. */
static char *
XMLParseAttValue(XMLexer *lexer)
{
    /* Start recording before the opening quote is consumed. */
    (void) XMLInitialiseBuffer(lexer);

    if (XMLookahead(lexer, "'", true))
    {
        return XMLParseAttQuote(lexer);
    }

    if (XMLookahead(lexer, "\"", true))
    {
        return XMLParseAttDouble(lexer);
    }

    return NULL;
}