// XmlParser.cpp - an implementation of the XIP parsing interface


// Copyright  2003 by Jef Poskanzer <jef@mail.acme.com>.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
// OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
// SUCH DAMAGE.
//
// For commentary on this license please see http://www.acme.com/license.html


#include "XmlParser.h"


// Abstract supplier of input characters for the parser.
class XmlParserSource
    {
    public:
    // Virtual so that deleting a concrete source through an
    // XmlParserSource* (as XmlParser's destructor does) is well defined.
    virtual ~XmlParserSource() {}
    // Unlike istreams, you should check eof() *before* calling get().
    virtual bool eof( void ) = 0;
    // Returns the next character and advances past it.
    virtual char get( void ) = 0;
    };


// Character source backed by a caller-supplied std::istream.  One character
// of lookahead is kept so that eof() can be answered before get() is called.
class XmlParserIStreamSource : public XmlParserSource
    {
    public:
    std::istream* is;	// stream being read; this class never deletes it
    bool needChar;	// true while ch holds no unread character
    char ch;		// lookahead character, meaningful only when ! needChar
    XmlParserIStreamSource( std::istream* _is ) :
	is( _is ), needChar( true )
	{
	}
    // True once the stream has failed or has no further character.  Reads
    // one character ahead into ch when none is pending.
    bool eof( void )
	{
	if ( is->fail() )
	    return true;
	if ( needChar )
	    {
	    ch = is->get();
	    needChar = false;
	    }
	return is->eof();
	}
    // Hands out the pending lookahead character, or reads a fresh one when
    // eof() was not called first.
    char get( void )
	{
	if ( needChar )
	    ch = is->get();
	else
	    needChar = true;
	return ch;
	}
    };


// Character source that walks a std::string.  Only a reference is kept, so
// the referenced string has to stay alive for as long as this object is used.
class XmlParserStringSource : public XmlParserSource
    {
    public:
    const std::string& str;
    size_t i, len;	// i: index of the next character; len: length captured at construction
    XmlParserStringSource( const std::string& _str ) :
	str( _str ), i( 0 ), len( _str.length() )
	{
	}
    // True once every character has been handed out.
    bool eof( void )
	{
	return ! ( i < len );
	}
    // Returns the character at the cursor and moves the cursor forward.
    char get( void )
	{
	const char c = str[i];
	++i;
	return c;
	}
    };


// Character source over a caller-owned byte buffer of known length.  The
// buffer is neither copied nor freed here.
class XmlParserByteSource : public XmlParserSource
    {
    public:
    const char* bytes;
    size_t i, len;	// i: index of the next byte; len: total number of bytes
    // Initializers are listed in member declaration order.
    XmlParserByteSource( const char* _bytes, size_t _len ) :
	bytes( _bytes ), i( 0 ), len( _len )
	{
	}
    // True once every byte has been handed out.
    bool eof( void )
	{
	return ! ( i < len );
	}
    // Returns the byte at the cursor and moves the cursor forward.
    char get( void )
	{
	const char c = bytes[i];
	++i;
	return c;
	}
    };


class XmlParserPrivate
    {
    public:
    XmlParserSource* source;
    size_t pos;
    int line;
    enum FSMState
	{
	st_text,
	st_text_cr,
	st_lt,
	st_lt_bang,
	st_lt_bang_dash,
	st_comment,
	st_comment_dash,
	st_comment_dash_dash,
	st_lt_bang_lbrack,
	st_lt_bang_lbrack_c,
	st_lt_bang_lbrack_cd,
	st_lt_bang_lbrack_cda,
	st_lt_bang_lbrack_cdat,
	st_lt_bang_lbrack_cdata,
	st_cdata,
	st_cdata_cr,
	st_cdata_rbrack,
	st_cdata_rbrack_rbrack,
	st_elementname,
	st_bang_elementname,
	st_element,
	st_bang_element,
	st_attrname,
	st_attrname_equal,
	st_attrvalue_dq,
	st_attrvalue_sq,
	st_element_slash,
	st_element_qmark,
	st_end_elementname
	};
    FSMState state;
    char nextCh;
    bool gotNextCh, gotNextText, popElement;
    std::string elementname, attrname, attrvalue;
    XipText nextText;
    int elementId;
    void realGetNextText( void ) throw ( XipException );
    void addElement( void );
    void addAttr( void );

    XmlParserPrivate( void ) :
	pos( 0 ), line( 1 ), state( st_text ), gotNextCh( false ), gotNextText( false ), popElement( false ), elementId( 0 )
	{
	nextText.elements.reserve( 10 );	// arbitrary initial size
	}
    };


// Builds a parser that reads its input from the given stream.
XmlParser::XmlParser( std::istream* is )
    {
    pp = new XmlParserPrivate();
    try
	{
	pp->source = new XmlParserIStreamSource( is );
	}
    catch ( ... )
	{
	// The destructor does not run for a half-constructed XmlParser, so
	// the private state has to be released here before rethrowing.
	delete pp;
	throw;
	}
    }


// Builds a parser that reads its input from the given string.  The string
// source keeps a reference to str rather than a copy, so str has to outlive
// the parser.
XmlParser::XmlParser( const std::string& str )
    {
    pp = new XmlParserPrivate();
    try
	{
	pp->source = new XmlParserStringSource( str );
	}
    catch ( ... )
	{
	// The destructor does not run for a half-constructed XmlParser, so
	// the private state has to be released here before rethrowing.
	delete pp;
	throw;
	}
    }


// Builds a parser that reads len bytes starting at bytes.  The buffer is
// not copied, so it has to outlive the parser.
XmlParser::XmlParser( const char* bytes, size_t len )
    {
    pp = new XmlParserPrivate();
    try
	{
	pp->source = new XmlParserByteSource( bytes, len );
	}
    catch ( ... )
	{
	// The destructor does not run for a half-constructed XmlParser, so
	// the private state has to be released here before rethrowing.
	delete pp;
	throw;
	}
    }


// Releases the input source first, then the private state that points at it.
XmlParser::~XmlParser()
    {
    XmlParserPrivate* const priv = pp;
    delete priv->source;
    delete priv;
    }


// Expands XML entity references in str, in place.
//
// Handled forms: the five predefined entities (&lt; &gt; &amp; &apos;
// &quot;) and numeric character references, decimal (&#65;) or hexadecimal
// (&#x41; - an upper-case X is accepted too).
//
// A numeric reference is only replaced when it is well formed: at least one
// digit, nothing but digits valid for its base, a closing ';', and a value
// no larger than 0x10FFFF.  Anything else - including unknown named
// entities and a bare '&' - is left exactly as written, and scanning carries
// on with the following character so later references are still expanded.
//
// Text produced by a substitution is never rescanned, so "&amp;lt;" yields
// "&lt;" rather than "<".
//
// NOTE(review): the result of a numeric reference is a single char, so values
// above 0xFF keep only their low byte.  Emitting a multi-byte encoding
// instead would change the output encoding - confirm with callers first.
static void substituteEntities( std::string& str )
    {
    for ( size_t i = 0; i < str.length(); ++i )
	{
	if ( str[i] != '&' )
	    continue;
	// compare() is bounded by the string length, so a reference cut short
	// by the end of the string simply fails to match.
	if ( str.compare( i, 4, "&lt;" ) == 0 )
	    str.replace( i, 4, "<" );
	else if ( str.compare( i, 4, "&gt;" ) == 0 )
	    str.replace( i, 4, ">" );
	else if ( str.compare( i, 5, "&amp;" ) == 0 )
	    str.replace( i, 5, "&" );
	else if ( str.compare( i, 6, "&apos;" ) == 0 )
	    str.replace( i, 6, "'" );
	else if ( str.compare( i, 6, "&quot;" ) == 0 )
	    str.replace( i, 6, "\"" );
	else if ( i + 1 < str.length() && str[i+1] == '#' )
	    {
	    unsigned long val = 0;
	    unsigned long base = 10;
	    size_t ndigits = 0;
	    bool wellFormed = true;
	    size_t j = i + 2;
	    if ( j < str.length() && ( str[j] == 'x' || str[j] == 'X' ) )
		{
		base = 16;
		++j;
		}
	    for ( ; j < str.length() && str[j] != ';'; ++j )
		{
		const char c = str[j];
		unsigned long digit;
		if ( c >= '0' && c <= '9' )
		    digit = c - '0';
		else if ( base == 16 && c >= 'a' && c <= 'f' )
		    digit = c - 'a' + 10;
		else if ( base == 16 && c >= 'A' && c <= 'F' )
		    digit = c - 'A' + 10;
		else
		    {
		    wellFormed = false;
		    break;
		    }
		val = val * base + digit;
		// Checking after every digit keeps val far below the point
		// where the multiplication could wrap.
		if ( val > 0x10FFFFUL )
		    {
		    wellFormed = false;
		    break;
		    }
		++ndigits;
		}
	    // j < length means the scan stopped on the terminating ';'.
	    if ( wellFormed && ndigits > 0 && j < str.length() )
		str.replace( i, j - i + 1, 1, static_cast<char>( val ) );
	    }
	}
    }


// Reports whether another text chunk is available, scanning ahead for one
// when none is already pending.  Scanning may throw XipException.
bool XmlParser::hasMoreText( void ) throw ( XipException )
    {
    if ( pp->gotNextText )
	return true;
    pp->realGetNextText();
    return pp->gotNextText;
    }


// Returns the next text chunk, scanning for it first when none is pending.
// Entities are expanded in the chunk's text unless it came from a CDATA
// section.  The returned reference points at the parser's internal chunk.
//
// Throws XipException when no chunk could be produced, or when the chunk
// has no enclosing element.
const XipText& XmlParser::getNextText( void ) throw ( XipException )
    {
    if ( ! pp->gotNextText )
	{
	pp->realGetNextText();
	if ( ! pp->gotNextText )
	    throw XipException( "unexpected parser failure", pp->pos, pp->line );
	}
    XipText& chunk = pp->nextText;
    if ( ! chunk.isCdata )
	substituteEntities( chunk.text );
    // Mark the chunk as consumed before the root-element check below, so
    // the flag is cleared even when that check throws.
    pp->gotNextText = false;
    if ( chunk.elements.empty() )
	throw XipException( "text outside of the root element", pp->pos, pp->line );
    return chunk;
    }


// Returns true when str holds nothing but space, tab, LF and CR characters.
// An empty string counts as all-whitespace.  The argument is taken by
// reference so that no copy is made on each call.
static bool allWhitespace( const std::string& str )
    {
    return str.find_first_not_of( " \t\n\r" ) == std::string::npos;
    }


// Runs the state machine over the input until either a text chunk is ready
// (gotNextText is true and nextText describes it) or the input is exhausted.
//
// - Chunks made only of whitespace are discarded rather than returned.
// - CR and CR LF are both turned into a single '\n' in text and CDATA.
// - Self-closing elements, processing instructions and <!...> declarations
//   are reported as an empty chunk whose element stack still includes that
//   element; popElement makes the following call remove it again.
// - CDATA chunks are returned with isCdata set.
//
// Throws XipException for a malformed or unquoted attribute, junk after '/'
// or '?' in a tag, an unmatched or wrongly nested closing tag, and elements
// still open at end of input.
void XmlParserPrivate::realGetNextText( void ) throw ( XipException )
    {
    char ch;
    nextText.isCdata = false;
    if ( popElement )
	{
	nextText.elements.pop_back();
	popElement = false;
	}
    for (;;)
	{
	// Get next character.
	if ( gotNextCh )
	    ch = nextCh;
	else
	    {
	    if ( source->eof() )
		{
		if ( gotNextText && allWhitespace( nextText.text ) )
		    gotNextText = false;
		if ( nextText.elements.size() != 0 )
		    throw XipException( "unterminated elements at EOF", pos, line );
		return;
		}
	    ch = source->get();
	    }
	++pos;
	if ( ch == '\n' )
	    ++line;
	// Run state machine.
	switch ( state )
	    {
	    case st_text_cr:
	    // The previous character was a CR, for which a '\n' has already
	    // been stored.  An LF right after it is the second half of a
	    // CR LF pair and adds nothing.  Every other character - notably
	    // a '<' that opens a tag - is handled exactly like ordinary text.
	    state = st_text;
	    if ( ch == '\n' )
		break;
	    // FALLTHROUGH
	    case st_text:
	    switch ( ch )
		{
		case '<':
		state = st_lt;
		if ( gotNextText )
		    {
		    if ( allWhitespace( nextText.text ) )
			gotNextText = false;
		    else
			return;
		    }
		break;
		default:
		// Accumulate text.
		if ( ! gotNextText )
		    {
		    nextText.text.erase();
		    nextText.startpos = pos - 1;
		    nextText.startline = line;
		    gotNextText = true;
		    }
		if ( ch == '\r' )
		    {
		    nextText.text += '\n';
		    state = st_text_cr;
		    }
		else
		    nextText.text += ch;
		break;
		}
	    break;

	    case st_lt:
	    elementname.erase();
	    elementname += ch;
	    switch ( ch )
		{
		case '!': state = st_lt_bang; break;
		case '/':
		state = st_end_elementname;
		elementname.erase();
		break;
		case '>': state = st_text; break;
		default: state = st_elementname; break;
		}
	    break;
	    case st_lt_bang:
	    elementname += ch;
	    switch ( ch )
		{
		case '-': state = st_lt_bang_dash; break;
		case '[': state = st_lt_bang_lbrack; break;
		case '>': state = st_text; break;
		default: state = st_bang_elementname; break;
		}
	    break;

	    case st_lt_bang_dash:
	    elementname += ch;
	    switch ( ch )
		{
		case '-': state = st_comment; break;
		case '>': state = st_text; break;
		default: state = st_elementname; break;
		}
	    break;
	    case st_comment:
	    switch ( ch )
		{
		case '-': state = st_comment_dash; break;
		}
	    break;
	    case st_comment_dash:
	    // Only two adjacent dashes may lead to the end of the comment; a
	    // lone dash followed by anything else is plain comment content.
	    switch ( ch )
		{
		case '-': state = st_comment_dash_dash; break;
		default: state = st_comment; break;
		}
	    break;
	    case st_comment_dash_dash:
	    switch ( ch )
		{
		case '>': state = st_text; break;
		// A further dash still leaves "--" as the last two characters.
		case '-': break;
		// Anything else means the "--" was not the terminator.
		default: state = st_comment; break;
		}
	    break;

	    case st_lt_bang_lbrack:
	    elementname += ch;
	    switch ( ch )
		{
		case 'C': state = st_lt_bang_lbrack_c; break;
		default: state = st_bang_elementname; break;
		}
	    break;
	    case st_lt_bang_lbrack_c:
	    elementname += ch;
	    switch ( ch )
		{
		case 'D': state = st_lt_bang_lbrack_cd; break;
		default: state = st_bang_elementname; break;
		}
	    break;
	    case st_lt_bang_lbrack_cd:
	    elementname += ch;
	    switch ( ch )
		{
		case 'A': state = st_lt_bang_lbrack_cda; break;
		default: state = st_bang_elementname; break;
		}
	    break;
	    case st_lt_bang_lbrack_cda:
	    elementname += ch;
	    switch ( ch )
		{
		case 'T': state = st_lt_bang_lbrack_cdat; break;
		default: state = st_bang_elementname; break;
		}
	    break;
	    case st_lt_bang_lbrack_cdat:
	    elementname += ch;
	    switch ( ch )
		{
		case 'A': state = st_lt_bang_lbrack_cdata; break;
		default: state = st_bang_elementname; break;
		}
	    break;
	    case st_lt_bang_lbrack_cdata:
	    elementname += ch;
	    switch ( ch )
		{
		case '[':
		state = st_cdata;
		// Accumulate text.
		nextText.text.erase();
		nextText.startpos = pos - 1;
		nextText.startline = line;
		gotNextText = true;
		break;
		default: state = st_bang_elementname; break;
		}
	    break;
	    case st_cdata_cr:
	    // Same idea as st_text_cr: swallow the LF of a CR LF pair, and
	    // treat anything else - notably a ']' that may start the "]]>"
	    // terminator - as ordinary CDATA input.
	    state = st_cdata;
	    if ( ch == '\n' )
		break;
	    // FALLTHROUGH
	    case st_cdata:
	    switch ( ch )
		{
		case ']': state = st_cdata_rbrack; break;
		case '\r': nextText.text += '\n'; state = st_cdata_cr; break;
		default: nextText.text += ch; break;
		}
	    break;
	    case st_cdata_rbrack:
	    // One ']' is being held back in case it starts "]]>".
	    switch ( ch )
		{
		case ']': state = st_cdata_rbrack_rbrack; break;
		case '\r':
		nextText.text += "]\n";
		state = st_cdata_cr;
		break;
		default:
		nextText.text += ']';
		nextText.text += ch;
		state = st_cdata;
		break;
		}
	    break;
	    case st_cdata_rbrack_rbrack:
	    // Two ']' are being held back in case '>' follows.
	    switch ( ch )
		{
		case '>':
		state = st_text;
		if ( allWhitespace( nextText.text ) )
		    gotNextText = false;
		else
		    {
		    nextText.isCdata = true;
		    return;
		    }
		break;
		// A third ']' releases the oldest one; two stay held back.
		case ']': nextText.text += ']'; break;
		case '\r':
		nextText.text += "]]\n";
		state = st_cdata_cr;
		break;
		default:
		// Not a terminator after all: both held-back brackets are
		// content and have to be emitted.
		nextText.text += "]]";
		nextText.text += ch;
		state = st_cdata;
		break;
		}
	    break;

	    case st_elementname:
	    switch ( ch )
		{
		case ' ': case '\t': case '\n': case '\r':
		addElement();
		state = st_element;
		break;
		case '>':
		addElement();
		state = st_text;
		break;
		case '/':
		addElement();
		state = st_element_slash;
		break;
		case '?':
		if ( elementname.length() == 0 )
		    elementname += ch;
		else
		    {
		    addElement();
		    state = st_element_qmark;
		    }
		break;
		default:
		elementname += ch;
		break;
		}
	    break;
	    case st_bang_elementname:
	    switch ( ch )
		{
		case ' ': case '\t': case '\n': case '\r':
		addElement();
		state = st_bang_element;
		break;
		case '>':
		// Return an empty string.
		addElement();
		nextText.text.erase();
		nextText.startpos = pos;
		nextText.startline = line;
		gotNextText = true;
		popElement = true;
		state = st_text;
		return;
		default:
		elementname += ch;
		break;
		}
	    break;
	    case st_element:
	    switch ( ch )
		{
		case ' ': case '\t': case '\n': case '\r':
		break;
		case '/': state = st_element_slash; break;
		case '?': state = st_element_qmark; break;
		case '>': state = st_text; break;
		default:
		state = st_attrname;
		attrname.erase();
		attrname += ch;
		break;
		}
	    break;

	    case st_bang_element:
	    // This actually needs to be much more complicated
	    // to handle !DOCTYPE correctly.  For one thing,
	    // it can have nested <>s.
	    switch ( ch )
		{
		case '>':
		// Return an empty string.
		nextText.text.erase();
		nextText.startpos = pos;
		nextText.startline = line;
		gotNextText = true;
		popElement = true;
		state = st_text;
		return;
		default:
		break;
		}
	    break;

	    case st_attrname:
	    switch ( ch )
		{
		case '=':
		state = st_attrname_equal;
		attrvalue.erase();
		break;
		case ' ': case '\t': case '\n': case '\r':
		case '/': case '?': case '>':
		throw XipException( "malformed attribute", pos, line );
		default:
		attrname += ch;
		break;
		}
	    break;
	    case st_attrname_equal:
	    switch ( ch )
		{
		case '"': state = st_attrvalue_dq; break;
		case '\'': state = st_attrvalue_sq; break;
		default:
		throw XipException( "unquoted attribute value", pos, line );
		}
	    break;
	    case st_attrvalue_dq:
	    switch ( ch )
		{
		case '"':
		addAttr();
		state = st_element;
		break;
		default:
		attrvalue += ch;
		break;
		}
	    break;
	    case st_attrvalue_sq:
	    switch ( ch )
		{
		case '\'':
		addAttr();
		state = st_element;
		break;
		default:
		attrvalue += ch;
		break;
		}
	    break;

	    case st_element_slash:
	    switch ( ch )
		{
		case '>':
		// Return an empty string.
		nextText.text.erase();
		nextText.startpos = pos;
		nextText.startline = line;
		gotNextText = true;
		popElement = true;
		state = st_text;
		return;
		default:
		throw XipException( "junk after / in self-closing element", pos, line );
		}
	    break;
	    case st_element_qmark:
	    switch ( ch )
		{
		case '>':
		// Return an empty string.
		nextText.text.erase();
		nextText.startpos = pos;
		nextText.startline = line;
		gotNextText = true;
		popElement = true;
		state = st_text;
		return;
		default:
		throw XipException( "junk after ? in processing-instruction element", pos, line );
		}
	    break;
	    case st_end_elementname:
	    switch ( ch )
		{
		case '>':
		if ( nextText.elements.size() == 0 )
		    throw XipException( "unmatched closing element", pos, line );
		if ( nextText.elements.back().name != elementname )
		    throw XipException( "incorrect element nesting", pos, line );
		nextText.elements.pop_back();
		state = st_text;
		break;
		default:
		elementname += ch;
		break;
		}
	    break;
	    }
	}
    }


void XmlParserPrivate::addElement( void )
    {
    XipElement element;
    element.id = elementId++;
    element.name = elementname;
    nextText.elements.push_back( element );
    }


void XmlParserPrivate::addAttr( void )
    {
    substituteEntities( attrvalue );
    nextText.elements.back().attributes[attrname] = attrvalue;
    }
