//------------------------------------------------------------------------
// ZipRC.exe - HtmlParse.cs
//
// Licensed under the wxWidgets license, see LICENSE.txt for details.
// (c) Dr. Harald Meyer auf'm Hofe
//
// $Id: HtmlParse.cs,v 1.2 2007/09/22 21:47:43 harald_meyer Exp $
//------------------------------------------------------------------------
using System;
using System.Collections;
using System.Text;
namespace Contrib.Html{
/** Yet another small HTML parser.
* This class uses HtmlLex to scan for tokens in an HTML text stream.
* It calls some virtual methods like OnEnterElement() on certain events
* in parsing the text. Additionally, specializations can store a stack
* of state information that grows with parsing into nested tags and
* decreases with leaving nested tags either parsing the corresponding end tag
* or an end tag of a tag deeper in the stack.
*
* This parser is optimized for robustness rather than for conformance to
* standards. In fact, nearly everything defined by the W3C is ignored here. The only
* tested purpose is the adaptation of doxygen output for the \e wx.NET help viewer.
*/
public class HtmlParse
{
    /** The scanner that delivers the tokens to be parsed.
    */
    private readonly HtmlLex _src;

    /** This class represents an entry in the element stack.
    */
    protected internal class StackEntry
    {
        /** The name of the element in lower case letters (e.g. "ul").
        */
        internal string Element;
        /** This is the state info that will be inherited from higher nodes to lower ones.
        * A new entry receives a clone of the state of its containing element if
        * that state is not \c null.
        */
        internal ICloneable State;
        /** This is the state that exclusively belongs to this element.
        */
        internal object Prop;

        /** Creates an entry for the element of the given name without any state.
        */
        public StackEntry(string element)
        {
            this.Element = element;
            this.State = null;
        }
    }

    /** A stack of instances of StackEntry. Index 0 is the top of the stack,
    * i.e. the current (innermost) element.
    */
    private readonly IList _stack;

    /** Creates a parser reading its tokens from \c src.
    * \param src is the scanner to read from. Must not be \c null.
    */
    public HtmlParse(HtmlLex src)
    {
        if (src == null)
        {
            throw new ArgumentNullException("src");
        }
        this._src = src;
        this._stack = new ArrayList();
    }

    /** This is the depth of the internal stack representing the nested structure of tags.
    */
    protected int Depth { get { return this._stack.Count; } }

    /** This will access the internal stack element according to the given depth.
    * \c this[0] returns the entry on the current element. \c this[1] will return the
    * entry on the element containing the current element or \c null if there is no
    * containing element.
    *
    * An element without explicit end tag stays on the stack and is considered to
    * contain everything that follows until the end tag of an enclosing element
    * is parsed. So,
    * \verbatim
<div>
<br>
Some text.
</div>
\endverbatim
    * will be parsed like
    * \verbatim
<div>
<br>
Some text.
</br>
</div>
\endverbatim
    * The only exception is a start tag that repeats the element on top of the
    * stack. This closes the element on top of the stack implicitly. So,
    * \verbatim
<p>
A paragraph.
<p>
Another paragraph.
\endverbatim
    * will be parsed like
    * \verbatim
<p>
A paragraph.
</p>
<p>
Another paragraph.
\endverbatim
    */
    protected StackEntry this[int depth]
    {
        get
        {
            if (depth < 0 || depth >= this._stack.Count)
            {
                return null;
            }
            return (StackEntry) this._stack[depth];
        }
    }

    /** This will be called before any other event on starting a new element tag.
    * The new element has already been pushed, so \c this[0] is its entry.
    * \param currentElement is the element's name in lower case letters (e.g. "ul").
    */
    protected virtual void OnStartElementTag(string currentElement)
    {
    }

    /** Overload this to react on the end of a start tag, i.e. after all attributes
    * of the element have been read. \c this[0] will always be this element.
    * \param currentElement is the element's name in lower case letters (e.g. "ul").
    * \param tagString is the full tag string including the attributes accepted or
    * introduced by OnAttribute(), like for instance '<ul class="test" >'.
    * \param attributes maps the names of the attributes accepted or introduced by
    * OnAttribute() to their values.
    */
    protected virtual void OnEndElementTag(string currentElement, string tagString, IDictionary attributes)
    {
    }

    /** This will be called on defining an attribute of an element.
    * \c this[0] will always be the current element. OnEndElementTag() has not yet been called.
    * The value will be stripped of surrounding double quotes if necessary.
    *
    * \param attributeName is the name of the attribute
    * \param value is the value (without quotes) or empty if \c attributeName is an attribute without value.
    * \param attributes is an initially empty table. Add entries to this table
    * to inline new attributes that have not been parsed but that shall be processed by OnEndElementTag().
    * \result with \c true this method tells the parser to add the received attribute to the list of attributes to
    * be passed to OnEndElementTag().
    */
    protected virtual bool OnAttribute(string attributeName, string value, IDictionary attributes)
    {
        return true;
    }

    /** This will be called when an element is closed, either by its end tag or
    * implicitly by a start tag repeating the element on top of the stack.
    * \c this[0] is always the \c currentElement. Unclosed elements that are dropped
    * because the end tag of an enclosing element has been parsed do not raise this event.
    * \param currentElement is the element's name in lower case letters.
    * \param tag is the text of the end tag.
    */
    protected virtual void OnClosingElement(string currentElement, string tag)
    {
    }

    /** This will be called whenever the token is not a part of more specific events.
    */
    protected virtual void OnDefaultEvent(string token)
    {
    }

    /** This is called whenever the parser has read a remark.
    * \param remark is the remark text without surrounding tags.
    */
    protected virtual void OnRemark(string remark)
    {
    }

    /** Starts the parser, reads from the source until it returns \c null, and raises events.
    */
    public void Parse()
    {
        IDictionary attributes = null;          // non-null only while reading a start tag
        string attributeName = null;            // name read but not yet reported
        bool readingAttributeValue = false;     // an equal sign followed attributeName
        StringBuilder fullElementString = null; // text of the start tag read so far

        for (string token = this._src.NextToken();
             token != null;
             token = this._src.NextToken())
        {
            string trimmed = token.Trim();

            if (trimmed.StartsWith("<!--", StringComparison.Ordinal))
            {
                // Remarks do not touch the tag state. The loop continues with the
                // next token, so consecutive remarks are all reported as remarks.
                this.OnRemark(ExtractRemarkText(trimmed));
            }
            else if (trimmed.StartsWith("</", StringComparison.Ordinal))
            {
                // end tag. decrease stack.
                string element = trimmed.Substring(2).ToLowerInvariant();
                // The token following the element name is taken as the rest of the end tag.
                string rest = this._src.NextToken();
                string tag = (rest == null) ? trimmed : trimmed + rest.Trim();
                this.CloseElement(element, tag);
                // An end tag terminates any start tag that was left unfinished.
                attributes = null;
                fullElementString = null;
                attributeName = null;
                readingAttributeValue = false;
                if (rest == null)
                {
                    return; // the source ended within the end tag
                }
            }
            else if (trimmed.StartsWith("<", StringComparison.Ordinal))
            {
                // start tag. stack grows.
                string element = trimmed.Substring(1).ToLowerInvariant();
                if (this.Depth > 0 && this[0].Element == element)
                {
                    // the new element is equal to the old element.
                    // act as if the old element has been closed explicitly.
                    this.OnClosingElement(this[0].Element, string.Format("</{0}>", this[0].Element));
                    this._stack.RemoveAt(0);
                }
                StackEntry entry = new StackEntry(element);
                this._stack.Insert(0, entry);
                if (this.Depth > 1)
                {
                    // State inheritance. The containing element may have no state at all.
                    ICloneable inherited = this[1].State;
                    if (inherited != null)
                    {
                        entry.State = (ICloneable) inherited.Clone();
                    }
                }
                this.OnStartElementTag(element);
                attributes = new Hashtable(); // Start reading attributes
                fullElementString = new StringBuilder();
                fullElementString.AppendFormat("{0} ", trimmed);
                attributeName = null;
                readingAttributeValue = false;
            }
            else if (trimmed == ">")
            {
                // closing the element definition. Only interesting if reading attributes.
                if (attributes != null && fullElementString != null && this.Depth > 0)
                {
                    if (attributeName != null)
                    {
                        // An attribute without value directly in front of the closing bracket.
                        this.ReportAttribute(attributeName, "", false, attributes, fullElementString);
                    }
                    this.OnEndElementTag(this[0].Element, fullElementString.ToString() + trimmed, attributes);
                }
                fullElementString = null;
                attributes = null;
                attributeName = null;
                readingAttributeValue = false;
            }
            else if (attributes != null && attributeName != null && readingAttributeValue)
            {
                // the token is an attribute value.
                this.ReportAttribute(attributeName, StripQuotes(trimmed), true, attributes, fullElementString);
                readingAttributeValue = false;
                attributeName = null;
            }
            else if (attributes != null && attributeName == null)
            {
                // the token is an attribute name
                attributeName = trimmed;
            }
            else if (attributes != null)
            {
                // this branch expects an equal sign.
                if (trimmed == "=")
                {
                    readingAttributeValue = true;
                }
                else
                {
                    // The pending attribute has no value. The current token is
                    // already the name of the next attribute and must not be lost.
                    this.ReportAttribute(attributeName, "", false, attributes, fullElementString);
                    readingAttributeValue = false;
                    attributeName = trimmed;
                }
            }
            else
            {
                this.OnDefaultEvent(token);
            }
        }
    }

    /** Returns the text of a remark token without the leading "<!--", without
    * the trailing "-->" if present, and without surrounding white space.
    */
    private static string ExtractRemarkText(string remarkToken)
    {
        string text = remarkToken.Substring(4);
        if (text.EndsWith("-->", StringComparison.Ordinal))
        {
            text = text.Substring(0, text.Length - 3);
        }
        return text.Trim();
    }

    /** Removes one pair of surrounding double quotes from an attribute value if present.
    */
    private static string StripQuotes(string attributeValue)
    {
        if (attributeValue.Length > 1 && attributeValue.StartsWith("\"", StringComparison.Ordinal) && attributeValue.EndsWith("\"", StringComparison.Ordinal))
        {
            return attributeValue.Substring(1, attributeValue.Length - 2);
        }
        return attributeValue;
    }

    /** Closes the innermost open element of the given name. Unclosed elements
    * nested in this element are dropped without an event. An end tag without
    * matching open element leaves the stack untouched.
    */
    private void CloseElement(string element, string tag)
    {
        int match = -1;
        for (int i = 0; i < this._stack.Count; ++i)
        {
            if (((StackEntry) this._stack[i]).Element == element)
            {
                match = i;
                break;
            }
        }
        if (match < 0)
        {
            return; // stray end tag
        }
        for (int i = 0; i < match; ++i)
        {
            this._stack.RemoveAt(0);
        }
        // The event is raised while the closed element is still this[0].
        this.OnClosingElement(element, tag);
        this._stack.RemoveAt(0);
    }

    /** Passes a parsed attribute to OnAttribute() and collects the result: attributes
    * introduced by OnAttribute() and, if accepted, the parsed attribute are added
    * to \c attributes and appended to \c tagText.
    * \param hasValue is \c false for attributes that have been defined without value.
    */
    private void ReportAttribute(string name, string value, bool hasValue, IDictionary attributes, StringBuilder tagText)
    {
        IDictionary introduced = new Hashtable();
        bool accepted = this.OnAttribute(name, value, introduced);
        foreach (DictionaryEntry entry in introduced)
        {
            // The indexer replaces an already known attribute. Add() would throw on duplicates.
            attributes[entry.Key] = entry.Value;
            if (entry.Value == null || entry.Value.ToString().Length > 0)
            {
                tagText.AppendFormat("{0}=\"{1}\" ", entry.Key, entry.Value);
            }
            else
            {
                tagText.AppendFormat("{0} ", entry.Key);
            }
        }
        if (accepted)
        {
            attributes[name] = value;
            if (hasValue)
            {
                tagText.AppendFormat("{0}=\"{1}\" ", name, value);
            }
            else
            {
                tagText.AppendFormat("{0} ", name);
            }
        }
    }
}
}
|