mirror of
https://forge.fsky.io/lda/Parsee.git
synced 2026-03-13 21:35:10 +00:00
[ADD/WIP] Start making a simple SAX parser, ASwerk
This commit is contained in:
parent
0fa95c2d14
commit
79217d3608
14 changed files with 1066 additions and 26 deletions
702
src/XML/SAX.c
Normal file
702
src/XML/SAX.c
Normal file
|
|
@ -0,0 +1,702 @@
|
|||
#include <XML.h>
|
||||
|
||||
#include <Cytoplasm/HashMap.h>
|
||||
#include <Cytoplasm/Memory.h>
|
||||
#include <Cytoplasm/Array.h>
|
||||
#include <Cytoplasm/Str.h>
|
||||
#include <Cytoplasm/Log.h>
|
||||
|
||||
#include <string.h>
|
||||
#include <ctype.h>
|
||||
|
||||
struct XMLexer {
|
||||
Stream *stream;
|
||||
bool autofree;
|
||||
|
||||
/* A temporary buffer */
|
||||
int *buffer;
|
||||
ssize_t length;
|
||||
ssize_t top;
|
||||
|
||||
enum {
|
||||
XML_STATE_NONE = 0,
|
||||
XML_STATE_COMMENT,
|
||||
|
||||
XML_STATE_ATTR,
|
||||
XML_STATE_ATTRHEAD_PROP,
|
||||
XML_STATE_ATTRTAIL
|
||||
} state;
|
||||
|
||||
struct {
|
||||
Array *elements;
|
||||
char *str;
|
||||
} data;
|
||||
};
|
||||
|
||||
/* "Looks ahead" in the XML stream for a NUL-terminated string.
|
||||
* If it was found and skip is set, then also skips over it. */
|
||||
static bool XMLookahead(XMLexer *lexer, const char *str, bool skip);
|
||||
|
||||
/* Parses an XML "name" */
|
||||
static bool XMLIsStart(XMLexer *lexer);
|
||||
static char * XMLParseName(XMLexer *lexer);
|
||||
static bool XMLSkipSpace(XMLexer *lexer);
|
||||
static char * XMLParseAttValue(XMLexer *lexer);
|
||||
|
||||
static ssize_t XMLInitialiseBuffer(XMLexer *lexer);
|
||||
static int XMLGetc(XMLexer *lexer);
|
||||
static void XMLUngetc(XMLexer *lexer, int ch);
|
||||
static void XMLEndBuffer(XMLexer *lexer);
|
||||
static void XMLRollback(XMLexer *lexer);
|
||||
static void XMLReset(XMLexer *lexer, ssize_t loc);
|
||||
static char * XMLStringify(XMLexer *lexer, ssize_t point);
|
||||
static HashMap * XMLReadProps(XMLexer *l);
|
||||
static void XMLPushElement(XMLexer *lexer, char *e);
|
||||
static char * XMLPopElement(XMLexer *lexer);
|
||||
|
||||
static XMLEvent * XMLCreateEmptyElem(XMLexer *lexer, HashMap *attrs);
|
||||
static XMLEvent * XMLCreateStart(XMLexer *lexer, HashMap *attrs);
|
||||
static XMLEvent * XMLCreateRelax(XMLexer *lexer);
|
||||
static XMLEvent * XMLCreateEnd(XMLexer *lexer, char *end);
|
||||
static XMLEvent * XMLCreateData(XMLexer *lexer);
|
||||
|
||||
XMLexer *
|
||||
XMLCreateLexer(Stream *stream, bool autofree)
|
||||
{
|
||||
XMLexer *lexer;
|
||||
if (!stream)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
lexer = Malloc(sizeof(*lexer));
|
||||
lexer->stream = stream;
|
||||
lexer->autofree = autofree;
|
||||
lexer->state = XML_STATE_NONE;
|
||||
|
||||
lexer->buffer = NULL;
|
||||
lexer->length = -1;
|
||||
lexer->top = -1;
|
||||
|
||||
lexer->data.elements = ArrayCreate();
|
||||
lexer->data.str = NULL;
|
||||
|
||||
return lexer;
|
||||
}
|
||||
void
|
||||
XMLFreeLexer(XMLexer *lexer)
|
||||
{
|
||||
if (!lexer)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if (lexer->autofree)
|
||||
{
|
||||
StreamClose(lexer->stream);
|
||||
}
|
||||
if (lexer->buffer)
|
||||
{
|
||||
Free(lexer->buffer);
|
||||
}
|
||||
if (lexer->data.str)
|
||||
{
|
||||
Free(lexer->data.str);
|
||||
}
|
||||
if (lexer->data.elements)
|
||||
{
|
||||
size_t i;
|
||||
Array *elems = lexer->data.elements;
|
||||
for (i = 0; i < ArraySize(elems); i++)
|
||||
{
|
||||
Free(ArrayGet(elems, i));
|
||||
}
|
||||
ArrayFree(lexer->data.elements);
|
||||
}
|
||||
Free(lexer);
|
||||
}
|
||||
|
||||
|
||||
XMLEvent *
|
||||
XMLCrank(XMLexer *lexer)
|
||||
{
|
||||
XMLEvent *event = NULL;
|
||||
char c;
|
||||
char *attrname;
|
||||
HashMap *props;
|
||||
char *key, *val;
|
||||
char cbuf[2] = { 0 };
|
||||
char *tmp;
|
||||
if (!lexer)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
if (StreamEof(lexer->stream))
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
event = XMLCreateRelax(lexer);
|
||||
switch (lexer->state)
|
||||
{
|
||||
case XML_STATE_NONE:
|
||||
if (XMLookahead(lexer, "<!--", true))
|
||||
{
|
||||
if (lexer->data.str)
|
||||
{
|
||||
XMLFreeEvent(event);
|
||||
event = XMLCreateData(lexer);
|
||||
}
|
||||
|
||||
lexer->state = XML_STATE_COMMENT;
|
||||
break;
|
||||
}
|
||||
else if (XMLookahead(lexer, "</", true))
|
||||
{
|
||||
if (lexer->data.str)
|
||||
{
|
||||
XMLFreeEvent(event);
|
||||
event = XMLCreateData(lexer);
|
||||
}
|
||||
|
||||
lexer->state = XML_STATE_ATTRTAIL;
|
||||
break;
|
||||
}
|
||||
else if (XMLookahead(lexer, "<", true))
|
||||
{
|
||||
if (lexer->data.str)
|
||||
{
|
||||
XMLFreeEvent(event);
|
||||
event = XMLCreateData(lexer);
|
||||
}
|
||||
|
||||
lexer->state = XML_STATE_ATTR;
|
||||
break;
|
||||
}
|
||||
/* TODO: Try storing character into str buffer. */
|
||||
cbuf[0] = XMLGetc(lexer);
|
||||
tmp = lexer->data.str;
|
||||
lexer->data.str = StrConcat(2, tmp, cbuf);
|
||||
Free(tmp);
|
||||
break;
|
||||
case XML_STATE_COMMENT:
|
||||
if (XMLookahead(lexer, "-->", true))
|
||||
{
|
||||
lexer->state = XML_STATE_NONE;
|
||||
}
|
||||
else if (XMLookahead(lexer, "--", false))
|
||||
{
|
||||
/* Throw error */
|
||||
return NULL;
|
||||
}
|
||||
break;
|
||||
case XML_STATE_ATTR:
|
||||
attrname = XMLParseName(lexer);
|
||||
if (!attrname)
|
||||
{
|
||||
/* TODO: Throw error */
|
||||
}
|
||||
XMLPushElement(lexer, attrname);
|
||||
|
||||
props = XMLReadProps(lexer);
|
||||
|
||||
XMLSkipSpace(lexer);
|
||||
if (XMLookahead(lexer, "/>", true))
|
||||
{
|
||||
lexer->state = XML_STATE_NONE;
|
||||
XMLFreeEvent(event);
|
||||
event = XMLCreateEmptyElem(lexer, props);
|
||||
|
||||
Free(XMLPopElement(lexer));
|
||||
break;
|
||||
}
|
||||
else if (XMLookahead(lexer, ">", true))
|
||||
{
|
||||
lexer->state = XML_STATE_NONE;
|
||||
XMLFreeEvent(event);
|
||||
event = XMLCreateStart(lexer, props);
|
||||
break;
|
||||
}
|
||||
break;
|
||||
case XML_STATE_ATTRTAIL:
|
||||
attrname = XMLParseName(lexer);
|
||||
Free(XMLPopElement(lexer));
|
||||
if (!XMLookahead(lexer, ">", true))
|
||||
{
|
||||
/* TODO: Throw error. */
|
||||
break;
|
||||
}
|
||||
lexer->state = XML_STATE_NONE;
|
||||
XMLFreeEvent(event);
|
||||
event = XMLCreateEnd(lexer, attrname);
|
||||
break;
|
||||
}
|
||||
/* TODO: Crank our XML parser. */
|
||||
return event;
|
||||
}
|
||||
void
|
||||
XMLFreeEvent(XMLEvent *event)
|
||||
{
|
||||
if (!event)
|
||||
{
|
||||
return;
|
||||
}
|
||||
if (event->element)
|
||||
{
|
||||
Free(event->element);
|
||||
}
|
||||
if (event->attrs)
|
||||
{
|
||||
char *key;
|
||||
void *val;
|
||||
while (HashMapIterate(event->attrs, &key, &val))
|
||||
{
|
||||
Log(LOG_INFO, "Trying to free %s", val);
|
||||
Free(val);
|
||||
}
|
||||
HashMapFree(event->attrs);
|
||||
}
|
||||
if (event->data)
|
||||
{
|
||||
Free(event->data);
|
||||
}
|
||||
Free(event);
|
||||
}
|
||||
|
||||
static bool
|
||||
XMLookahead(XMLexer *lexer, const char *str, bool skip)
|
||||
{
|
||||
int *stack;
|
||||
size_t top, i;
|
||||
ssize_t ntop;
|
||||
bool ret = false;
|
||||
if (!lexer || !str)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
top = 0;
|
||||
stack = Malloc(strlen(str) * sizeof(*stack));
|
||||
|
||||
for (i = 0; i < strlen(str); i++)
|
||||
{
|
||||
char c = str[i];
|
||||
int getc = XMLGetc(lexer);
|
||||
|
||||
stack[top++] = getc;
|
||||
if (getc != c || getc == EOF)
|
||||
{
|
||||
goto seekback;
|
||||
}
|
||||
}
|
||||
|
||||
/* We have been able to seek the string properly */
|
||||
ret = true;
|
||||
if (!skip)
|
||||
{
|
||||
goto seekback;
|
||||
}
|
||||
Free(stack);
|
||||
return ret;
|
||||
|
||||
seekback:
|
||||
for (ntop = top - 1; ntop >= 0; ntop--)
|
||||
{
|
||||
XMLUngetc(lexer, stack[ntop]);
|
||||
}
|
||||
Free(stack);
|
||||
return ret;
|
||||
}
|
||||
|
||||
#define IsNamestart(c) ((c == ':') || isalpha(c) || (c == '_'))
|
||||
#define IsNamepart(c) (IsNamestart(c) || (c == '-') || isdigit(c))
|
||||
static char *
|
||||
XMLParseName(XMLexer *lexer)
|
||||
{
|
||||
int c;
|
||||
ssize_t point;
|
||||
|
||||
point = XMLInitialiseBuffer(lexer);
|
||||
c = XMLGetc(lexer);
|
||||
if (!IsNamestart(c))
|
||||
{
|
||||
XMLRollback(lexer);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
while ((c = XMLGetc(lexer)))
|
||||
{
|
||||
if (!IsNamepart(c))
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
XMLUngetc(lexer, c);
|
||||
|
||||
return XMLStringify(lexer, point);
|
||||
}
|
||||
|
||||
static ssize_t
|
||||
XMLInitialiseBuffer(XMLexer *lexer)
|
||||
{
|
||||
if (!lexer)
|
||||
{
|
||||
return -1;
|
||||
}
|
||||
if (lexer->length != -1)
|
||||
{
|
||||
return lexer->top;
|
||||
}
|
||||
|
||||
lexer->length = 0;
|
||||
lexer->top = 0;
|
||||
lexer->buffer = NULL;
|
||||
return 0;
|
||||
}
|
||||
static int
|
||||
XMLGetc(XMLexer *lexer)
|
||||
{
|
||||
int ch;
|
||||
if (lexer->length == -1)
|
||||
{
|
||||
return StreamGetc(lexer->stream);
|
||||
}
|
||||
|
||||
ch = StreamGetc(lexer->stream);
|
||||
if (lexer->top >= lexer->length)
|
||||
{
|
||||
lexer->length += 8;
|
||||
lexer->buffer = Realloc(
|
||||
lexer->buffer,
|
||||
lexer->length * sizeof(*lexer->buffer)
|
||||
);
|
||||
}
|
||||
lexer->buffer[lexer->top++] = ch;
|
||||
return ch;
|
||||
}
|
||||
static void
|
||||
XMLUngetc(XMLexer *lexer, int ch)
|
||||
{
|
||||
if (!lexer || lexer->length == -1)
|
||||
{
|
||||
StreamUngetc(lexer->stream, ch);
|
||||
return;
|
||||
}
|
||||
if (lexer->top == 0)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
StreamUngetc(lexer->stream, ch);
|
||||
lexer->top--;
|
||||
}
|
||||
static void
|
||||
XMLEndBuffer(XMLexer *lexer)
|
||||
{
|
||||
if (!lexer || lexer->length == -1)
|
||||
{
|
||||
return;
|
||||
}
|
||||
lexer->top = -1;
|
||||
lexer->length = -1;
|
||||
|
||||
Free(lexer->buffer);
|
||||
lexer->buffer = NULL;
|
||||
}
|
||||
static void
|
||||
XMLRollback(XMLexer *lexer)
|
||||
{
|
||||
ssize_t i;
|
||||
if (!lexer || lexer->length == -1)
|
||||
{
|
||||
return;
|
||||
}
|
||||
for (i = lexer->top - 1; i >= 0; i--)
|
||||
{
|
||||
StreamUngetc(lexer->stream, lexer->buffer[i]);
|
||||
}
|
||||
XMLEndBuffer(lexer);
|
||||
}
|
||||
void
|
||||
XMLReset(XMLexer *lexer, ssize_t loc)
|
||||
{
|
||||
ssize_t i;
|
||||
if (!lexer || lexer->length == -1)
|
||||
{
|
||||
return;
|
||||
}
|
||||
for (i = lexer->top - 1; i >= loc; i--)
|
||||
{
|
||||
StreamUngetc(lexer->stream, lexer->buffer[i]);
|
||||
}
|
||||
lexer->top = loc;
|
||||
}
|
||||
char *
|
||||
XMLStringify(XMLexer *lexer, ssize_t point)
|
||||
{
|
||||
ssize_t i;
|
||||
char *str;
|
||||
if (!lexer || lexer->length == -1)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
str = Malloc(lexer->top + 1);
|
||||
memset(str, '\0', lexer->top + 1);
|
||||
for (i = point; i < lexer->top; i++)
|
||||
{
|
||||
str[i - point] = (char) lexer->buffer[i];
|
||||
}
|
||||
XMLEndBuffer(lexer);
|
||||
|
||||
return str;
|
||||
}
|
||||
|
||||
#define IsSpace(c) ((c == ' ') || (c == 0x09) || (c == 0x0D) || (c == 0x0A))
|
||||
bool
|
||||
XMLSkipSpace(XMLexer *lexer)
|
||||
{
|
||||
int c;
|
||||
bool r = false;
|
||||
while ((c = XMLGetc(lexer)) != EOF)
|
||||
{
|
||||
if (!IsSpace(c))
|
||||
{
|
||||
break;
|
||||
}
|
||||
r = true;
|
||||
}
|
||||
XMLUngetc(lexer, c);
|
||||
|
||||
return r;
|
||||
}
|
||||
HashMap *
|
||||
XMLReadProps(XMLexer *lexer)
|
||||
{
|
||||
ssize_t point = XMLInitialiseBuffer(lexer);
|
||||
HashMap *map = HashMapCreate();
|
||||
bool error = false;
|
||||
while (true)
|
||||
{
|
||||
char *name;
|
||||
char *value = NULL;
|
||||
|
||||
if (!XMLSkipSpace(lexer))
|
||||
{
|
||||
/* A lack of space is totally excepted */
|
||||
break;
|
||||
}
|
||||
|
||||
name = XMLParseName(lexer);
|
||||
if (!name)
|
||||
{
|
||||
/* A lack of name is totally excepted */
|
||||
break;
|
||||
}
|
||||
|
||||
value = StrDuplicate("");
|
||||
if (XMLookahead(lexer, "=", true))
|
||||
{
|
||||
/* TODO: Values aren't names. */
|
||||
Free(value);
|
||||
value = XMLParseAttValue(lexer);
|
||||
if (!value)
|
||||
{
|
||||
error = true;
|
||||
Free(name);
|
||||
break;
|
||||
}
|
||||
}
|
||||
HashMapSet(map, name, value);
|
||||
Free(name);
|
||||
}
|
||||
if (error)
|
||||
{
|
||||
XMLReset(lexer, point);
|
||||
}
|
||||
return map;
|
||||
}
|
||||
static XMLEvent *
|
||||
XMLCreateStart(XMLexer *lexer, HashMap *attrs)
|
||||
{
|
||||
XMLEvent *event = Malloc(sizeof(*event));
|
||||
size_t elements = ArraySize(lexer->data.elements) - 1;
|
||||
char *h_element = StrDuplicate(ArrayGet(lexer->data.elements, elements));
|
||||
|
||||
event->type = XML_LEXER_STARTELEM;
|
||||
event->element = h_element;
|
||||
event->attrs = attrs;
|
||||
event->data = NULL;
|
||||
|
||||
/* TODO */
|
||||
event->line = 0;
|
||||
event->col = 0;
|
||||
event->offset = 0;
|
||||
|
||||
return event;
|
||||
}
|
||||
XMLEvent *
|
||||
XMLCreateEnd(XMLexer *lexer, char *end)
|
||||
{
|
||||
XMLEvent *event = Malloc(sizeof(*event));
|
||||
|
||||
event->type = XML_LEXER_ENDELEM;
|
||||
event->element = end;
|
||||
event->attrs = NULL;
|
||||
event->data = NULL;
|
||||
|
||||
/* TODO */
|
||||
event->line = 0;
|
||||
event->col = 0;
|
||||
event->offset = 0;
|
||||
|
||||
return event;
|
||||
}
|
||||
static XMLEvent *
|
||||
XMLCreateEmptyElem(XMLexer *lexer, HashMap *attrs)
|
||||
{
|
||||
XMLEvent *event = Malloc(sizeof(*event));
|
||||
size_t elements = ArraySize(lexer->data.elements) - 1;
|
||||
char *h_element = StrDuplicate(ArrayGet(lexer->data.elements, elements));
|
||||
|
||||
event->type = XML_LEXER_ELEM;
|
||||
event->element = h_element;
|
||||
event->attrs = attrs;
|
||||
event->data = NULL;
|
||||
|
||||
/* TODO */
|
||||
event->line = 0;
|
||||
event->col = 0;
|
||||
event->offset = 0;
|
||||
|
||||
return event;
|
||||
}
|
||||
XMLEvent *
|
||||
XMLCreateData(XMLexer *lexer)
|
||||
{
|
||||
XMLEvent *event = Malloc(sizeof(*event));
|
||||
size_t elements = ArraySize(lexer->data.elements);
|
||||
|
||||
event->type = XML_LEXER_DATA;
|
||||
event->element = elements ?
|
||||
StrDuplicate(ArrayGet(lexer->data.elements, elements - 1)) :
|
||||
NULL;
|
||||
event->attrs = NULL;
|
||||
event->data = lexer->data.str;
|
||||
|
||||
/* TODO */
|
||||
event->line = 0;
|
||||
event->col = 0;
|
||||
event->offset = 0;
|
||||
|
||||
lexer->data.str = NULL;
|
||||
|
||||
return event;
|
||||
}
|
||||
static XMLEvent *
|
||||
XMLCreateRelax(XMLexer *lexer)
|
||||
{
|
||||
XMLEvent *event = Malloc(sizeof(*event));
|
||||
size_t elements = ArraySize(lexer->data.elements);
|
||||
|
||||
event->type = XML_RELAX;
|
||||
event->element = elements ?
|
||||
StrDuplicate(ArrayGet(lexer->data.elements, elements - 1)) :
|
||||
NULL;
|
||||
event->attrs = NULL;
|
||||
event->data = NULL;
|
||||
|
||||
/* TODO */
|
||||
event->line = 0;
|
||||
event->col = 0;
|
||||
event->offset = 0;
|
||||
|
||||
return event;
|
||||
}
|
||||
|
||||
static void
|
||||
XMLPushElement(XMLexer *lexer, char *e)
|
||||
{
|
||||
ArrayAdd(lexer->data.elements, e);
|
||||
}
|
||||
static char *
|
||||
XMLPopElement(XMLexer *lexer)
|
||||
{
|
||||
size_t n = ArraySize(lexer->data.elements);
|
||||
if (n == 0)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return ArrayDelete(lexer->data.elements, n - 1);
|
||||
}
|
||||
|
||||
#define IsNormalQ(c) ((c != '<') && (c != '&') && (c != '\''))
|
||||
#define IsNormalD(c) ((c != '<') && (c != '&') && (c != '"'))
|
||||
static char *
|
||||
XMLParseAttQuote(XMLexer *lexer)
|
||||
{
|
||||
int c;
|
||||
ssize_t point;
|
||||
char *str;
|
||||
|
||||
point = XMLInitialiseBuffer(lexer);
|
||||
|
||||
while ((c = XMLGetc(lexer)))
|
||||
{
|
||||
if (!IsNormalQ(c))
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (c != '\'')
|
||||
{
|
||||
XMLUngetc(lexer, c);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* TODO: Decode the string */
|
||||
str = XMLStringify(lexer, point);
|
||||
str[strlen(str) - 1] = '\0'; /* Trim the quote. */
|
||||
return str;
|
||||
}
|
||||
static char *
|
||||
XMLParseAttDouble(XMLexer *lexer)
|
||||
{
|
||||
int c;
|
||||
ssize_t point;
|
||||
char *str;
|
||||
|
||||
point = XMLInitialiseBuffer(lexer);
|
||||
|
||||
while ((c = XMLGetc(lexer)))
|
||||
{
|
||||
if (!IsNormalD(c))
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (c != '"')
|
||||
{
|
||||
XMLUngetc(lexer, c);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* TODO: Decode the string */
|
||||
str = XMLStringify(lexer, point);
|
||||
str[strlen(str) - 1] = '\0'; /* Trim the quote. */
|
||||
return str;
|
||||
}
|
||||
static char *
|
||||
XMLParseAttValue(XMLexer *lexer)
|
||||
{
|
||||
ssize_t point = XMLInitialiseBuffer(lexer);
|
||||
|
||||
if (XMLookahead(lexer, "'", true))
|
||||
{
|
||||
return XMLParseAttQuote(lexer);
|
||||
}
|
||||
else if (XMLookahead(lexer, "\"", true))
|
||||
{
|
||||
return XMLParseAttDouble(lexer);
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue