// XTPMarkupParser.cpp: implementation of the CXTPMarkupParser class.
//
// This file is a part of the XTREME TOOLKIT PRO MFC class library.
// (c)1998-2012 Codejock Software, All Rights Reserved.
//
// THIS SOURCE FILE IS THE PROPERTY OF CODEJOCK SOFTWARE AND IS NOT TO BE
// RE-DISTRIBUTED BY ANY MEANS WHATSOEVER WITHOUT THE EXPRESSED WRITTEN
// CONSENT OF CODEJOCK SOFTWARE.
//
// THIS SOURCE CODE CAN ONLY BE USED UNDER THE TERMS AND CONDITIONS OUTLINED
// IN THE XTREME TOOLKIT PRO LICENSE AGREEMENT. CODEJOCK SOFTWARE GRANTS TO
// YOU (ONE SOFTWARE DEVELOPER) THE LIMITED RIGHT TO USE THIS SOFTWARE ON A
// SINGLE COMPUTER.
//
// CONTACT INFORMATION:
// support@codejock.com
// http://www.codejock.com
//
/////////////////////////////////////////////////////////////////////////////

#include "stdafx.h"

#include "Common/XTPVc80Helpers.h"
#include "Common/XTPSystemHelpers.h"

#include "XTPMarkupParser.h"

// Based on code of Andrew Fedoniouk @ terrainformatica.com

#ifdef _DEBUG
#undef THIS_FILE
static char THIS_FILE[]=__FILE__;
#define new DEBUG_NEW
#endif

//////////////////////////////////////////////////////////////////////
// Construction/Destruction
//////////////////////////////////////////////////////////////////////

CXTPMarkupParser::CXTPMarkupParser()
:   m_cInputChar(0),
	m_nValueLength(0),
	m_nTagNameLength(0),
	m_nAttributeNameLength(0),
	m_bGotTail(FALSE)
{
	m_lpszPos = NULL;
	m_lpszEnd = NULL;

	m_nLine = 0;
	m_nPosition = 0;

	m_bUnicode = FALSE;
	m_nEncoding = CP_ACP;

	m_lpszValue = new WCHAR[m_nValueAlloc = XTP_MAX_TOKEN_SIZE];

	m_scan = &CXTPMarkupParser::ScanBody;
}

CXTPMarkupParser::~CXTPMarkupParser()
{
	delete[] m_lpszValue;

}

void CXTPMarkupParser::SetBuffer(LPCSTR lpszStart, LPCSTR lpszEnd)
{
	m_lpszPos = lpszStart;
	m_lpszEnd = lpszEnd;
	m_bUnicode = FALSE;
}

void CXTPMarkupParser::SetBuffer(LPCWSTR lpszStart, LPCWSTR lpszEnd)
{
	m_lpszPos = (LPCSTR)lpszStart;
	m_lpszEnd = (LPCSTR)lpszEnd;
	m_bUnicode = TRUE;
}

CXTPMarkupParser::TokenType CXTPMarkupParser::GetNextToken()
{
	return (this->*m_scan)();
}

const WCHAR* CXTPMarkupParser::GetValue()
{
	m_lpszValue[m_nValueLength] = 0;
	return m_lpszValue;
}

const WCHAR* CXTPMarkupParser::GetAttributeName()
{
	m_lpszAttributeName[m_nAttributeNameLength] = 0;
	return m_lpszAttributeName;
}

const WCHAR* CXTPMarkupParser::GetTagName()
{
	m_lpszTagName[m_nTagNameLength] = 0;
	return m_lpszTagName;
}

CXTPMarkupParser::TokenType CXTPMarkupParser::ReportError(LPCWSTR lpszError)
{
	WCSNCPY_S(m_lpszValue, 1024, lpszError, 1024);
	m_nValueLength = (int)wcslen(m_lpszValue);

	return tokenError;
}

BOOL CXTPMarkupParser::FindFirstTag()
{
	WCHAR c = GetChar();

	while (c != 0)
	{
		if (c == '<')
		{
			PushBack(c);
			return TRUE;
		}
		else if (!IsWhitespace(c))
		{
			return FALSE;
		}
		c = GetChar();
	}

	return FALSE;
}

CXTPMarkupParser::TokenType CXTPMarkupParser::ScanBody()
{
	WCHAR c = GetChar();

	m_nValueLength = 0;

	BOOL ws = FALSE;

	if (c == 0) return tokenEof;
	else if (c == '<') return ScanTag();
	else if (c == '&')
		c = ScanEntity();
	else
		ws = IsWhitespace(c);

	while (TRUE)
	{
		AppendValue(c);
		c = GetNextChar();
		if (c == 0)  { PushBack(c); break; }
		if (c == '<') { PushBack(c); break; }
		if (c == '&') { PushBack(c); break; }

		if (IsWhitespace(c) != ws)
		{
			PushBack(c);
			break;
		}

	}
	return ws? tokenSpace : tokenWord;
}

CXTPMarkupParser::TokenType CXTPMarkupParser::ScanHead()
{
	WCHAR c = SkipWhitespace();

	if (c == '>') { m_scan = &CXTPMarkupParser::ScanBody; return ScanBody(); }
	if (c == '/')
	{
		WCHAR t = GetChar();
		if (t == '>')   { m_scan = &CXTPMarkupParser::ScanBody; return tokenTagEnd; }
		else { PushBack(t); return ReportError(L"Unexpected token. The expected token is '>'"); }
	}

	m_nAttributeNameLength = 0;
	m_nValueLength = 0;

	// attribute name...
	while (c != '=')
	{
		if ( c == 0) return tokenEof;
		if ( c == '>' ) return ReportError(L"'>' is an unexpected token. The expected token is '='");
		if ( IsWhitespace(c) )
		{
			c = SkipWhitespace();
			if (c != '=') return ReportError(L"Unexpected token. The expected token is '='");
			else break;
		}
		if ( c == '<') return ReportError(L"'<' is an unexpected token. The expected token is '='");
		AppendAttributeName(c);
		c = GetChar();
	}

	c = SkipWhitespace();
	// attribute m_lpszValue...

	if (c == '\"')
	{
		while ((c = GetChar()) != NULL)
		{
			if (c == '\"') return tokenAttribute;
			if (c == '&')
				c = ScanEntity();
			AppendValue(c);
		}
	}
	else if (c == '\'')
	{
		while ((c = GetChar()) != NULL)
		{
			if (c == '\'') return tokenAttribute;
			if (c == '&')
				c = ScanEntity();
			AppendValue(c);
		}
	}

	return ReportError(L"Unexpected token. The expected token is '\"' or '''");
}

// caller already consumed '<'
// scan header start or tag tail
CXTPMarkupParser::TokenType CXTPMarkupParser::ScanTag()
{
	m_nTagNameLength = 0;

	WCHAR c = GetChar();

	BOOL is_tail = c == '/';
	if (is_tail) c = GetChar();
	else if ( c == '?' )
	{
		m_scan = &CXTPMarkupParser::ScanPI;
		return tokenPIStart;
	}

	while (c)
	{
		if (IsWhitespace(c)) { c = SkipWhitespace(); break; }
		if (c == '/' || c == '>') break;
		AppendTagName(c);

		switch (m_nTagNameLength)
		{
		case 3:
			if (wcsncmp(m_lpszTagName, L"!--", 3) == 0)  { m_scan = &CXTPMarkupParser::ScanComment; return tokenCommentStart; }
			break;

		case 8:
			if ( wcsncmp(m_lpszTagName, L"![CDATA[", 8) == 0 ) { m_scan = &CXTPMarkupParser::ScanCData; return tokenCDataStart; }
			break;
		}

		c = GetChar();
	}

	if (c == 0) return ReportError(L"Unexpected end of file has occurred.");

	if (is_tail)
	{
		if (c == '>') return tokenTagEnd;
		return ReportError(L"Unexpected token. The expected token is '>'");
	}
	else
		PushBack(c);

	m_scan = &CXTPMarkupParser::ScanHead;
	return tokenTagStart;
}

// skip whitespaces.
// returns first non-whitespace WCHAR
WCHAR CXTPMarkupParser::SkipWhitespace()
{
	for (WCHAR c = GetChar(); c != 0; c = GetChar())
	{
		if (!IsWhitespace(c)) return c;
	}

	return 0;
}

void CXTPMarkupParser::PushBack(WCHAR c)
{
	m_cInputChar = c;
}

WCHAR CXTPMarkupParser::GetNextChar()
{
	if (m_lpszPos >= m_lpszEnd)
		return NULL;

	WCHAR c = 0;

	if (m_bUnicode)
	{
		c = *((LPCWSTR)m_lpszPos);
		m_lpszPos += sizeof(WCHAR);
	}
	else
	{
		char t = *m_lpszPos;

		if (m_nEncoding == CP_UTF8)
		{
			if ( 0 == ( t & '\x80' ) )
			{
				c = t;
			}
			else if ('\xF0' == (t & '\xF0')) // 1111 - error, more than 16-bit char
			{

			}
			else if ( '\xE0' == (t & '\xF0')) // 1110xxxx 10xxxxxx 10xxxxxx
			{
				char t2 = *(++m_lpszPos);
				char t3 = *(++m_lpszPos);

				c = (WCHAR)((WCHAR(t & '\x0F') << 12 ) | ( WCHAR(t2 & '\x3F' ) << 6 ) | WCHAR(t3 & '\x3F' ));
			}
			else if ( '\xC0' == (t & '\xE0')) // 110xxxxx 10xxxxxx
			{
				char t2 = *(++m_lpszPos);
				c = (WCHAR)((WCHAR( t & '\x1F' ) << 6 ) | ( t2 & '\x3F' ));
			}
			else
			{
			}
		}
		else if (XTPSystemVersion()->GetMaxCharSize() > 1 && _istlead(t))
		{
			MultiByteToWideChar(m_nEncoding, 0, m_lpszPos, 2, &c, 1);
			m_lpszPos++;
		}
		else if (t > 0 && t < 128)
		{
			c = t;
		}
		else
		{
			MultiByteToWideChar(m_nEncoding, 0, m_lpszPos, 1, &c, 1);
		}
		m_lpszPos++;
	}

	m_nPosition++;
	if (c == '\r' || c == '\n')
	{
		m_nLine++;
		m_nPosition = 0;
	}

	return c;
}

WCHAR CXTPMarkupParser::GetChar()
{
	if (m_cInputChar) { WCHAR t(m_cInputChar); m_cInputChar = 0; return t; }
	return GetNextChar();
}

WCHAR CXTPMarkupParser::ResolveEntity(const WCHAR* buf, int buf_size)
{
	if (buf[0] == '#')
	{
		int nAscii = 0;

		if (buf[1] == 'x' && buf_size > 2)
		{
			if (WSCANF_S(buf + 2, L"%x", &nAscii) != 1)
				return 0;

			return (WCHAR)nAscii;
		}
		else
		{
			if (WSCANF_S(buf + 1, L"%i", &nAscii) != 1)
				return 0;

			return (WCHAR)nAscii;
		}
	}

	return 0;
}

// caller consumed '&'
WCHAR CXTPMarkupParser::ScanEntity()
{
	WCHAR buf[32];
	int i = 0;
	WCHAR t;
	for (; i < 31 ; ++i )
	{
		t = GetChar();
		if (t == 0) return tokenEof;

		buf[i] = t;
		if (t == ';')
			break;
	}
	buf[i] = 0;
	if (i == 2)
	{
		if (wcsncmp(buf, L"gt", 2) == 0) return '>';
		if (wcsncmp(buf, L"lt", 2) == 0) return '<';
	}
	else if (i == 3 && (wcsncmp(buf, L"amp", 3) == 0))
		return '&';
	else if (i == 4)
	{
		if (wcsncmp(buf, L"apos", 4) == 0) return '\'';
		if (wcsncmp(buf, L"quot", 4) == 0) return '\"';
	}
	t = ResolveEntity(buf, i);
	if (t) return t;
	// no luck ...
	AppendValue('&');
	for (int n = 0; n < i; ++n)
		AppendValue(buf[n]);
	return ';';
}

BOOL CXTPMarkupParser::IsWhitespace(WCHAR c) const
{
	return c <= ' '
		&& (c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f');
}

void CXTPMarkupParser::AppendValue(WCHAR c)
{
	if (m_nValueLength >= m_nValueAlloc - 1)
	{
		m_nValueAlloc *= 2;
		WCHAR* lpszValue = new WCHAR[m_nValueAlloc];
		MEMCPY_S(lpszValue, m_lpszValue, m_nValueLength * sizeof(WCHAR));

		delete[] m_lpszValue;
		m_lpszValue = lpszValue;
	}

	m_lpszValue[m_nValueLength++] = c;
}

void CXTPMarkupParser::AppendAttributeName(WCHAR c)
{
	if (m_nAttributeNameLength < (XTP_MAX_NAME_SIZE - 1))
		m_lpszAttributeName[m_nAttributeNameLength++] = c;
}

void CXTPMarkupParser::AppendTagName(WCHAR c)
{
	if (m_nTagNameLength < (XTP_MAX_NAME_SIZE - 1))
		m_lpszTagName[m_nTagNameLength++] = c;
}

CXTPMarkupParser::TokenType CXTPMarkupParser::ScanComment()
{
	if (m_bGotTail)
	{
		m_scan = &CXTPMarkupParser::ScanBody;
		m_bGotTail = FALSE;
		return tokenCommentEnd;
	}
	for (m_nValueLength = 0; m_nValueLength < (XTP_MAX_TOKEN_SIZE - 1); ++m_nValueLength)
	{
		WCHAR c = GetChar();
		if ( c == 0) return tokenEof;
		m_lpszValue[m_nValueLength] = c;

		if (m_nValueLength >= 2
			&& m_lpszValue[m_nValueLength] == '>'
			&& m_lpszValue[m_nValueLength - 1] == '-'
			&& m_lpszValue[m_nValueLength - 2] == '-')
		{
			m_bGotTail = TRUE;
			m_nValueLength -= 2;
			break;
		}
	}
	return tokenData;
}

CXTPMarkupParser::TokenType CXTPMarkupParser::ScanCData()
{
	if (m_bGotTail)
	{
		m_scan = &CXTPMarkupParser::ScanBody;
		m_bGotTail = FALSE;
		return tokenCDataEnd;
	}
	for (m_nValueLength = 0; m_nValueLength < (XTP_MAX_TOKEN_SIZE - 1); ++m_nValueLength)
	{
		WCHAR c = GetChar();
		if ( c == 0) return tokenEof;
		m_lpszValue[m_nValueLength] = c;

		if (m_nValueLength >= 2
			&& m_lpszValue[m_nValueLength] == '>'
			&& m_lpszValue[m_nValueLength - 1] == ']'
			&& m_lpszValue[m_nValueLength - 2] == ']')
		{
			m_bGotTail = TRUE;
			m_nValueLength -= 2;
			break;
		}
	}
	return tokenData;
}

CXTPMarkupParser::TokenType CXTPMarkupParser::ScanPI()
{
	if (m_bGotTail)
	{
		m_scan = &CXTPMarkupParser::ScanBody;
		m_bGotTail = FALSE;
		return tokenPIEnd;
	}
	for (m_nValueLength = 0; m_nValueLength < (XTP_MAX_TOKEN_SIZE - 1); ++m_nValueLength)
	{
		WCHAR c = GetChar();
		if ( c == 0)
			return tokenEof;

		if (IsWhitespace(c))
		{
			m_nValueLength--;
			continue;
		}

		m_lpszValue[m_nValueLength] = c;

		if (m_nValueLength >= 1
			&& m_lpszValue[m_nValueLength] == '>'
			&& m_lpszValue[m_nValueLength - 1] == '?')
		{
			m_bGotTail = TRUE;
			m_nValueLength -= 1;
			break;
		}
	}
	return tokenData;
}