/*====================================================================*
*
* xmlscan.c - markup scanner;
*
* node.h
*
* scan XML source and create a parse tree;
*
* Motley Tools by Charles Maier <cmaier@cmassoc.net>;
* Copyright (c) 2001-2006 by Charles Maier Associates;
* Licensed under the Internet Software Consortium License;
*
*--------------------------------------------------------------------*/
#ifndef XMLSCAN_SOURCE
#define XMLSCAN_SOURCE
/*====================================================================*
* system header files;
*--------------------------------------------------------------------*/
#include <string.h>
#include <ctype.h>
/*====================================================================*
* custom header files;
*--------------------------------------------------------------------*/
#include "../nodes/node.h"
#include "../tools/number.h"
#include "../tools/error.h"
/*====================================================================*
 *
 *   char * advance (char * string, unsigned * lineno);
 *
 *   overwrite each leading whitespace character with NUL, increment
 *   *lineno once for every newline overwritten and return the address
 *   of the first character that is not whitespace; that character
 *   may be the NUL terminator;
 *
 *   this function is critical to the XML parsing engine because it
 *   ensures that node strings are NUL terminated and line counts
 *   are accurate;
 *
 *   the character is converted to unsigned char before it is
 *   classified because passing a negative value other than EOF to
 *   isspace() is undefined behavior;
 *
 *   Motley Tools by Charles Maier <cmaier@cmassoc.net>;
 *   Copyright (c) 2001-2006 by Charles Maier Associates;
 *   Licensed under the Internet Software Consortium License;
 *
 *--------------------------------------------------------------------*/
static char * advance (char * string, unsigned * lineno)
{
    while (isspace ((unsigned char)(*string)))
    {
        if (*string == '\n')
        {
            (*lineno)++;
        }

        /* erase the blank so that the preceding token is terminated */
        *string++ = (char)(0);
    }
    return (string);
}
/*====================================================================*
 *
 *   char * discard (char * string, unsigned * lineno);
 *
 *   replace the character at string with NUL, then let advance()
 *   erase any whitespace that follows it and count the newlines in
 *   *lineno; return whatever address advance() returns;
 *
 *   Motley Tools by Charles Maier <cmaier@cmassoc.net>;
 *   Copyright (c) 2001-2006 by Charles Maier Associates;
 *   Licensed under the Internet Software Consortium License;
 *
 *--------------------------------------------------------------------*/
static char * discard (char * string, unsigned * lineno)
{
    char * following = string + 1;

    /* terminate whatever token ended immediately before this character */
    *string = (char)(0);
    return (advance (following, lineno));
}
/*====================================================================*
 *
 *   char * nmtoken (char * string);
 *
 *   skip over a name token and return the address of the first
 *   character that does not belong to it; the text is not modified;
 *
 *   a name character is anything accepted by isalnum() plus hyphen,
 *   underscore, period and colon; other name characters permitted by
 *   the w3c xml 1.0 specification are not recognized here;
 *
 *   the character is converted to unsigned char before it is
 *   classified because passing a negative value other than EOF to
 *   isalnum() is undefined behavior;
 *
 *   Motley Tools by Charles Maier <cmaier@cmassoc.net>;
 *   Copyright (c) 2001-2006 by Charles Maier Associates;
 *   Licensed under the Internet Software Consortium License;
 *
 *--------------------------------------------------------------------*/
static char * nmtoken (char * string)
{
    while (isalnum ((unsigned char)(*string)) || (*string == '-') || (*string == '_') || (*string == '.') || (*string == ':'))
    {
        string++;
    }
    return (string);
}
/*====================================================================*
 *
 *   char * content (char * string, char quote, unsigned * lineno);
 *
 *   collect a quoted string and strip its quotes; an opening quote
 *   at string is replaced with NUL; the text is then scanned up to
 *   the next quote or the end of the string, counting newlines in
 *   *lineno; a closing quote, when found, is replaced with NUL;
 *
 *   whitespace between the quotes is left untouched; the address
 *   returned is that of the character after the closing quote or
 *   that of the NUL terminator when no closing quote exists;
 *
 *   Motley Tools by Charles Maier <cmaier@cmassoc.net>;
 *   Copyright (c) 2001-2006 by Charles Maier Associates;
 *   Licensed under the Internet Software Consortium License;
 *
 *--------------------------------------------------------------------*/
static char * content (char * string, char quote, unsigned * lineno)
{
    char * cursor = string;

    /* erase the opening quote, if there is one */
    if (*cursor == quote)
    {
        *cursor = (char)(0);
        cursor++;
    }
    for (; (*cursor != (char)(0)) && (*cursor != quote); cursor++)
    {
        if (*cursor == '\n')
        {
            ++*lineno;
        }
    }

    /* erase the closing quote so that the collected text is terminated */
    if (*cursor == quote)
    {
        *cursor = (char)(0);
        cursor++;
    }
    return (cursor);
}
/*====================================================================*
 *
 *   char * collect (char * string);
 *
 *   skip over an entity and return the address of the character that
 *   ends it; an entity ends at the first '<', '=', '/', '?' or '>',
 *   at the first whitespace character or at the end of the string;
 *   the text is not modified;
 *
 *   the character is converted to unsigned char before it is
 *   classified because passing a negative value other than EOF to
 *   isspace() is undefined behavior;
 *
 *   Motley Tools by Charles Maier <cmaier@cmassoc.net>;
 *   Copyright (c) 2001-2006 by Charles Maier Associates;
 *   Licensed under the Internet Software Consortium License;
 *
 *--------------------------------------------------------------------*/
static char * collect (char * string)
{
    while (*string)
    {
        /* common tag punctuation ends the entity */
        if ((*string == '<') || (*string == '=') || (*string == '/') || (*string == '?') || (*string == '>'))
        {
            break;
        }
        if (isspace ((unsigned char)(*string)))
        {
            break;
        }
        string++;
    }
    return (string);
}
/*====================================================================*
 *
 *   static char * comment (char * string, unsigned * lineno);
 *
 *   collect comment;
 *   preserve delimiters;
 *   preserve whitespace;
 *   count newlines;
 *
 *   string is expected to address a '-'; that character is skipped;
 *   when another '-' follows, the whole run of opening dashes is
 *   skipped and the text is scanned, counting newlines in *lineno,
 *   until a '-' is found that is followed by another '-' or by the
 *   end of the string; a lone '-' inside the comment is stepped
 *   over; the run of closing dashes is then skipped and the address
 *   of the next character is returned; the text is not modified;
 *
 *   an unterminated comment ends at the NUL terminator; the scan
 *   never steps beyond it;
 *
 *   NOTE(review): the opening run consumes every consecutive dash,
 *   so an empty comment written as four dashes has no closing run
 *   left and the scan continues to the next dash run in the text;
 *   confirm whether such input must be supported;
 *
 *   Motley Tools by Charles Maier <cmaier@cmassoc.net>;
 *   Copyright (c) 2001-2006 by Charles Maier Associates;
 *   Licensed under the Internet Software Consortium License;
 *
 *--------------------------------------------------------------------*/
static char * comment (char * string, unsigned * lineno)
{
    string++;
    if (*string == '-')
    {
        /* opening dashes */
        while (*string == '-')
        {
            string++;
        }
        while ((*string) && (*string != '-'))
        {
            while ((*string) && (*string != '-'))
            {
                if (*string == '\n')
                {
                    (*lineno)++;
                }
                string++;
            }

            /* step over the dash that stopped the scan but not over the terminator */
            if (*string)
            {
                string++;
            }
        }

        /* closing dashes */
        while (*string == '-')
        {
            string++;
        }
    }
    return (string);
}
/*====================================================================*
 *
 *   char * literal (char * string, char quote, unsigned * lineno);
 *
 *   collect literal;
 *   preserve delimiters;
 *   preserve whitespace;
 *   count newlines;
 *
 *   skip the opening quote at string, when present, then scan to the
 *   matching closing quote or to the end of the string, counting
 *   newlines in *lineno; return the address of the character after
 *   the closing quote or that of the NUL terminator when the literal
 *   is not closed;
 *
 *   unlike content(), nothing is written to the text; erasing the
 *   quotes here would truncate the enclosing text at the first
 *   quoted string it contains;
 *
 *   Motley Tools by Charles Maier <cmaier@cmassoc.net>;
 *   Copyright (c) 2001-2006 by Charles Maier Associates;
 *   Licensed under the Internet Software Consortium License;
 *
 *--------------------------------------------------------------------*/
static char * literal (char * string, char quote, unsigned * lineno)
{
    /* step over the opening quote without erasing it */
    if (*string == quote)
    {
        string++;
    }
    while ((*string) && (*string != quote))
    {
        if (*string == '\n')
        {
            (*lineno)++;
        }
        string++;
    }

    /* step over the closing quote without erasing it */
    if (*string == quote)
    {
        string++;
    }
    return (string);
}
/*====================================================================*
 *
 *   char * context (char * string, signed c, unsigned * lineno);
 *
 *   collect a bracketed context; the character at string is skipped
 *   unconditionally and the text is scanned for the closing
 *   character c; nested '{', '(' and '[' groups are collected by
 *   recursion with their own closing character and quoted strings
 *   are handed to literal(); newlines are counted in *lineno;
 *
 *   the address returned is that of the character after the closing
 *   character or that of the NUL terminator when the context is not
 *   closed; this function itself writes nothing to the text;
 *
 *   Motley Tools by Charles Maier <cmaier@cmassoc.net>;
 *   Copyright (c) 2001-2006 by Charles Maier Associates;
 *   Licensed under the Internet Software Consortium License;
 *
 *--------------------------------------------------------------------*/
static char * context (char * string, signed c, unsigned * lineno)
{
    char const closer = (char)(c);
    char * cursor = string + 1;
    while (*cursor != (char)(0))
    {
        /* the closing character is tested first, before any nesting rule */
        if (*cursor == closer)
        {
            return (cursor + 1);
        }
        switch (*cursor)
        {
        case '{':
            cursor = context (cursor, '}', lineno);
            break;
        case '(':
            cursor = context (cursor, ')', lineno);
            break;
        case '[':
            cursor = context (cursor, ']', lineno);
            break;
        case '\"':
        case '\'':
            cursor = literal (cursor, *cursor, lineno);
            break;
        case '\n':
            (*lineno)++;
            cursor++;
            break;
        default:
            cursor++;
            break;
        }
    }
    return (cursor);
}
/*====================================================================*
 *
 *   signed xmlscan (NODE * node);
 *
 *   node.h
 *
 *   scan the NUL terminated markup at node->text and hang the parse
 *   tree from node; the text is edited in place: tag punctuation and
 *   whitespace are overwritten with NUL so that every node->text is
 *   a terminated string inside the original buffer, and runs of
 *   whitespace inside character data are squeezed to one space;
 *
 *   nodes are obtained from xmlnode(), which is defined elsewhere;
 *   it is presumed to create a new node below its argument;
 *   each node records the line it started on and one of the types
 *   NODE_ELEM, NODE_ATTR, NODE_VALU, NODE_DATA, NODE_SGML, NODE_INST
 *   or NODE_ETAG;
 *
 *   error() is called with a non-zero status when node or its text
 *   is null, when '>' appears before any element, when an end tag
 *   has attributes or content, or when an end tag does not match the
 *   open element; error() is presumed not to return in that case;
 *
 *   return 0 once the end of the text is reached; return -1 if
 *   error() returns after reporting a null node or null text;
 *
 *   Motley Tools by Charles Maier <cmaier@cmassoc.net>;
 *   Copyright (c) 2001-2006 by Charles Maier Associates;
 *   Licensed under the Internet Software Consortium License;
 *
 *--------------------------------------------------------------------*/
signed xmlscan (NODE * node)
{
    NODE * section = node;
    NODE * element = NULL;
    NODE * attribute;
    NODE * value;
    char prefix = (char)(0);
    char suffix = (char)(0);
    char * string;
    unsigned lineno = 1;
    if (!section)
    {
        error (1, EFAULT, "section is null");
        return (-1);
    }

    /* read the text only after node is known to be valid */
    string = node->text;
    if (!string)
    {
        error (1, EFAULT, "string is null");
        return (-1);
    }
    while (*string)
    {
        if (*string == '<')
        {
            /* tag start; remember the markup character that follows '<', if any */
            prefix = '<';
            suffix = '>';
            string = discard (string, &lineno);
            if ((*string == '/') || (*string == '?') || (*string == '!'))
            {
                prefix = *string;
                string = discard (string, &lineno);
            }
            element = xmlnode (section);
            element->line = lineno;
            element->type = NODE_ELEM;
            element->text = string;

            /* unsigned char cast: negative values are undefined for isalpha() */
            if (isalpha ((unsigned char)(*string)))
            {
                string = nmtoken (string);
            }
            else if (*string == '-')
            {
                string = comment (string, &lineno);
            }
            else if (*string == '[')
            {
                string = context (string, ']', &lineno);
            }
            else
            {
                string = collect (string);
            }
            string = advance (string, &lineno);

            /* everything up to the tag end is an attribute with an optional value */
            while ((*string) && (*string != '<') && (*string != '/') && (*string != '?') && (*string != '>'))
            {
                attribute = xmlnode (element);
                attribute->line = lineno;
                attribute->type = NODE_ATTR;
                attribute->text = string;
                if (isalpha ((unsigned char)(*string)))
                {
                    string = nmtoken (string);
                }
                else if (*string == '-')
                {
                    string = comment (string, &lineno);
                }
                else if (*string == '[')
                {
                    string = context (string, ']', &lineno);
                }
                else if ((*string == '\"') || (*string == '\''))
                {
                    string = content (string, *string, &lineno);

                    /* content() erased the opening quote; start after it */
                    attribute->text++;
                }
                else
                {
                    string = collect (string);
                }
                string = advance (string, &lineno);
                if (*string == '=')
                {
                    string = discard (string, &lineno);
                    value = xmlnode (attribute);
                    value->line = lineno;
                    value->type = NODE_VALU;
                    value->text = string;
                    if ((*string == '\"') || (*string == '\''))
                    {
                        string = content (string, *string, &lineno);

                        /* content() erased the opening quote; start after it */
                        value->text++;
                    }
                    else
                    {
                        string = collect (string);
                    }
                    string = advance (string, &lineno);
                }
            }
            if ((*string == '/') || (*string == '?'))
            {
                suffix = *string;
                string = discard (string, &lineno);
            }
        }
        else if (*string == '>')
        {
            /* a '>' that precedes every '<' has no element to close */
            if (!element)
            {
                error (1, 0, "Unexpected '>' on line %u", lineno);
                string = discard (string, &lineno);
                continue;
            }
            string = discard (string, &lineno);
            if (prefix == '!')
            {
                element->type = NODE_SGML;
            }
            else if (prefix == '?')
            {
                element->type = NODE_INST;
            }
            else if (suffix == '?')
            {
                /* "?>" without a leading "<?"; the node stays NODE_ELEM */
            }
            else if (prefix == '/')
            {
                element->type = NODE_ETAG;
                if (element->below)
                {
                    error (1, 0, "Element </%s> on line %d has attributes or content.", element->text, element->line);
                }
                if (strcmp (section->text, element->text))
                {
                    error (1, 0, "Element <%s> on line %d terminated by </%s> on line %d", section->text, section->line, element->text, element->line);
                }

                /* the end tag closes the open section; climb back to its parent */
                if (section->above)
                {
                    section = section->above;
                }
            }
            else if (suffix == '/')
            {
                /* self-closing element; it does not become the open section */
            }
            else
            {
                /* start tag; later nodes are created below this element */
                section = element;
            }
        }
        else
        {
            /* character data; copy it down over itself, one space per whitespace run */
            signed space = 0;
            char * output = string;
            NODE * segment = xmlnode (section);
            segment->line = lineno;
            segment->type = NODE_DATA;
            segment->text = string;
            while (*string)
            {
                if (*string == '<')
                {
                    break;
                }
                if (isspace ((unsigned char)(*string)))
                {
                    string = advance (string, &lineno);
                    space++;
                    continue;
                }
                if (space)
                {
                    *output++ = ' ';
                    space--;
                }
                *output++ = *string++;
            }

            /* terminate the squeezed text when it became shorter than the source */
            if (output < string)
            {
                *output = (char)(0);
            }
        }
    }
    return (0);
}
#endif