HtmlParse.cs :  » GUI » wx-NET » Contrib » Html » C# / CSharp Open Source

Home
C# / CSharp Open Source
1.2.6.4 mono .net core
2.2.6.4 mono core
3.Aspect Oriented Frameworks
4.Bloggers
5.Build Systems
6.Business Application
7.Charting Reporting Tools
8.Chat Servers
9.Code Coverage Tools
10.Content Management Systems CMS
11.CRM ERP
12.Database
13.Development
14.Email
15.Forum
16.Game
17.GIS
18.GUI
19.IDEs
20.Installers Generators
21.Inversion of Control Dependency Injection
22.Issue Tracking
23.Logging Tools
24.Message
25.Mobile
26.Network Clients
27.Network Servers
28.Office
29.PDF
30.Persistence Frameworks
31.Portals
32.Profilers
33.Project Management
34.RSS RDF
35.Rule Engines
36.Script
37.Search Engines
38.Sound Audio
39.Source Control
40.SQL Clients
41.Template Engines
42.Testing
43.UML
44.Web Frameworks
45.Web Service
46.Web Testing
47.Wiki Engines
48.Windows Presentation Foundation
49.Workflows
50.XML Parsers
C# / C Sharp
C# / C Sharp by API
C# / CSharp Tutorial
C# / CSharp Open Source » GUI » wx NET 
wx NET » Contrib » Html » HtmlParse.cs
//------------------------------------------------------------------------
// wx.NET Contrib.Html - HtmlParse.cs
// 
// Licensed under the wxWidgets license, see LICENSE.txt for details.
// (c) Dr. Harald Meyer auf'm Hofe
//
// $Id: HtmlParse.cs,v 1.2 2007/09/22 21:47:43 harald_meyer Exp $
//------------------------------------------------------------------------

using System;
using System.Collections;
using System.Text;

namespace Contrib.Html{
    /** Yet another small HTML parser.
     * This class uses HtmlLex to scan for tokens in an HTML text stream.
     * It calls some virtual methods like OnEnterElement() on certain events
     * in parsing the text. Additionally, specializations can store a stack
     * of state information that grows with parsing into nested tags and
     * decreases with leaving nested tags either parsing the corresponding end tag 
     * or an end tag of a tag deeper in the stack.
     * 
     * This parser is optimized for robustness rather than for compliance with
     * standards. In fact, nearly everything defined by the W3C is ignored here. The only
     * tested purpose is the adaptation of doxygen output to the \e wx.NET help viewer.
     */
    public class HtmlParse
    {
        /** The token source that Parse() reads from.
         */
        HtmlLex _src;

        /** This class represents an entry in the element stack.
         */
        internal protected class StackEntry
        {
            /** The element's name as stored by the parser (lower case, e.g. "ul").
             */
            internal string Element;
            /** This is the state info that will be inherited from higher nodes to lower ones.
             * It is \c null until a specialization assigns it.
             */
            internal ICloneable State;
            /** This is the state that exclusively belongs to this element.
             */
            internal object Prop;
            public StackEntry(string element)
            {
                this.Element = element;
                this.State = null;
            }
        }

        /** A stack of instances of StackEntry. Index 0 is the top of the stack,
         * i.e. the current (innermost) element.
         */
        IList _stack;

        /** Creates a parser that will read its tokens from \c src.
         */
        public HtmlParse(HtmlLex src)
        {
            this._src = src;
            this._stack = new ArrayList();
        }

        /** This is the depth of the internal stack representing the nested structure of tags.
         */
        protected int Depth { get { return this._stack.Count; } }

        /** This will access the internal stack element according to the given depth.
         * \c this[0] returns the entry on the current element. \c this[1] will return the
         * entry on the element containing the current element or \c null if there is no
         * containing element. Any \c depth outside of the stack (including negative values)
         * yields \c null. All elements without explicit end tag are considered to
         * contain everything that their predecessor contains. So, 
         * \verbatim
         <p>
         A paragraph.
         <div>
         Some text.
         \endverbatim
         * will be parsed like
         * \verbatim
         <p>
         A paragraph.
         <div>
         Some text.
         </div>
         </p>
         \endverbatim
         * rather than
         * \verbatim
         <p>
         A paragraph.
         </p>
         <div>
         Some text.
         </div>
         \endverbatim
         * The only exception is a start tag that directly repeats the current element
         * (e.g. \c p following \c p): the current element is closed implicitly first.
         */
        protected StackEntry this[int depth]
        {
            get
            {
                if (depth < 0 || depth >= this._stack.Count)
                    return null;
                return (StackEntry) this._stack[depth];
            }
        }

        /** This will be called before any other event on starting a new element tag.
         * The new element has already been pushed, so \c this[0] is its stack entry.
         * \param currentElement is the element's name in lower case letters (e.g. "ul").
         */
        protected virtual void OnStartElementTag(string currentElement)
        {
        }

        /** Overload this to react on the end of an element's start tag (the closing ">").
         * \c this[0] will always be this element.
         * \param currentElement is the current element's name in lower case letters like "ul".
         * \param tagString is the full tag string including the attributes accepted by OnAttribute(),
         *        like for instance '<ul class="test" >'.
         * \param attributes maps the names of the parsed attributes to their values.
         */
        protected virtual void OnEndElementTag(string currentElement, string tagString, IDictionary attributes)
        {
        }

        /** This will be called on defining an attribute of an element.
         * \c this[0] will always be the current element. OnEndElementTag() has not yet been called.
         * The value will be stripped of surrounding double quotes if necessary.
         * 
         * \param attributeName is the name of the attribute
         * \param value is the value (without quotes) or empty if \c attributeName is an attribute without value.
         * \param attributes is an initially empty dictionary. Everything added here will be inlined as
         *        additional attribute into the tag string and the attributes passed to OnEndElementTag().
         * \result with \c true this method tells the parser to add the received attribute to the attributes
         *         to be passed to OnEndElementTag().
         */
        protected virtual bool OnAttribute(string attributeName, string value, IDictionary attributes)
        {
            return true;
        }

        /** This will be called when an element is closed, either by its own end tag
         * or implicitly by a start tag repeating the current element.
         * \c this[0] is always the \c currentElement. Elements that are dropped from the
         * stack because an end tag of an outer element has been read are removed silently.
         * \param currentElement is the name of the element that is closed.
         * \param tag is the text of the end tag.
         */
        protected virtual void OnClosingElement(string currentElement, string tag)
        {
        }

        /** This will be called whenever a token is not a part of more specific events.
         * \param token is the token exactly as delivered by the source (not trimmed).
         */
        protected virtual void OnDefaultEvent(string token)
        {
        }

        /** This is called whenever the parser has read a remark.
         * \param remark is the remark text without surrounding tags.
         */
        protected virtual void OnRemark(string remark)
        {
        }

        /** Passes one attribute to OnAttribute() and records the outcome.
         * Attributes inlined by OnAttribute() as well as the attribute itself (if accepted)
         * are stored in \c attributes and appended to \c fullElementString.
         * \param hasValue tells whether the attribute has been written with an equal sign;
         *        attributes without value are appended to the tag string as plain name.
         */
        private void ReportAttribute(string attributeName, string attributeValue, bool hasValue,
                                     IDictionary attributes, StringBuilder fullElementString)
        {
            IDictionary inlined = new Hashtable();
            bool accepted = this.OnAttribute(attributeName, attributeValue, inlined);
            foreach (DictionaryEntry entry in inlined)
            {
                // The indexer is used instead of Add(): repeated attribute names must not throw.
                attributes[entry.Key] = entry.Value;
                if (entry.Value == null || entry.Value.ToString().Length > 0)
                    fullElementString.AppendFormat("{0}=\"{1}\" ", entry.Key, entry.Value);
                else
                    fullElementString.AppendFormat("{0} ", entry.Key);
            }
            if (accepted)
            {
                // OnEndElementTag() is documented to receive the parsed attributes.
                attributes[attributeName] = attributeValue;
                if (hasValue)
                    fullElementString.AppendFormat("{0}=\"{1}\" ", attributeName, attributeValue);
                else
                    fullElementString.AppendFormat("{0} ", attributeName);
            }
        }

        /** Starts the parser, reads from the source until it returns \c null, and raises events.
         */
        public void Parse()
        {
            IDictionary attributes = null;          // non-null while reading the attributes of a start tag
            string attributeName = null;            // attribute name that still waits for "=" or a value
            bool readingAttributeValue = false;     // true after "=" has been read for attributeName
            StringBuilder fullElementString = null; // text of the start tag collected so far
            for (string token = this._src.NextToken();
                token != null;
                token = this._src.NextToken())
            {
                string trimmed = token.Trim();
                if (trimmed.StartsWith("<!--", StringComparison.Ordinal))
                {
                    // processing remarks. Strip the delimiters without assuming
                    // that the remark has been terminated properly.
                    string remark = trimmed.Substring(4);
                    if (remark.EndsWith("-->", StringComparison.Ordinal))
                        remark = remark.Substring(0, remark.Length - 3);
                    this.OnRemark(remark.Trim());
                    // Continue with the loop head, so a directly following remark
                    // is recognized as remark and not as start tag.
                    continue;
                }
                if (trimmed.StartsWith("</", StringComparison.Ordinal))
                {
                    // end tag. decrease stack.
                    string element = trimmed.Substring(2).ToLowerInvariant();
                    // The following token is expected to terminate the end tag.
                    string terminator = this._src.NextToken();
                    string endTag = terminator == null ? trimmed : trimmed + terminator.Trim();
                    while (this._stack.Count > 0)
                    {
                        StackEntry current = (StackEntry) this._stack[0];
                        if (current.Element == element)
                        {
                            this.OnClosingElement(current.Element, endTag);
                            this._stack.RemoveAt(0);
                            break;
                        }
                        // not the requested element: drop it silently.
                        this._stack.RemoveAt(0);
                    }
                    // An end tag also terminates a start tag that has not been closed properly.
                    attributes = null;
                    fullElementString = null;
                    attributeName = null;
                    readingAttributeValue = false;
                    if (terminator == null)
                        break; // the source ended within the end tag.
                }
                else if (trimmed.StartsWith("<", StringComparison.Ordinal))
                {
                    // start tag. stack grows.
                    string element = trimmed.Substring(1).ToLowerInvariant();
                    if (this.Depth > 0 && this[0].Element == element)
                    {
                        // the new element is equal to the old element.
                        // act as if the old element has been closed explicitly.
                        this.OnClosingElement(this[0].Element, string.Format("</{0}>", this[0].Element));
                        this._stack.RemoveAt(0);
                    }
                    StackEntry pushed = new StackEntry(element);
                    this._stack.Insert(0, pushed);
                    if (this.Depth > 1)
                    {
                        // State inheritance. The containing element need not have a state.
                        ICloneable inherited = this[1].State;
                        if (inherited != null)
                            pushed.State = (ICloneable) inherited.Clone();
                    }
                    this.OnStartElementTag(element);
                    attributes = new Hashtable(); // Start reading attributes
                    attributeName = null;
                    readingAttributeValue = false;
                    fullElementString = new StringBuilder();
                    fullElementString.AppendFormat("{0} ", trimmed);
                }
                else if (trimmed == ">")
                {
                    // closing the element definition. Only interesting if reading attributes.
                    if (attributes != null && fullElementString != null)
                    {
                        if (attributeName != null)
                        {
                            // the last attribute has not been completed by a value: report it now.
                            this.ReportAttribute(attributeName, "", readingAttributeValue, attributes, fullElementString);
                        }
                        this.OnEndElementTag(this[0].Element, fullElementString.ToString() + trimmed, attributes);
                    }
                    fullElementString = null;
                    attributes = null;
                    attributeName = null;
                    readingAttributeValue = false;
                }
                else if (attributes != null && attributeName != null && readingAttributeValue)
                {
                    // the token is an attribute value.
                    string attributeValue = trimmed;
                    if (attributeValue.Length > 1
                        && attributeValue.StartsWith("\"", StringComparison.Ordinal)
                        && attributeValue.EndsWith("\"", StringComparison.Ordinal))
                    {
                        attributeValue = attributeValue.Substring(1, attributeValue.Length - 2);
                    }
                    this.ReportAttribute(attributeName, attributeValue, true, attributes, fullElementString);
                    readingAttributeValue = false;
                    attributeName = null;
                }
                else if (attributes != null && attributeName == null)
                {
                    // the token is an attribute name
                    attributeName = trimmed;
                }
                else if (attributes != null)
                {
                    // this branch expects an equal sign.
                    if (trimmed == "=")
                    {
                        readingAttributeValue = true;
                    }
                    else
                    {
                        // The pending attribute has no value. Report it and
                        // keep the current token as name of the next attribute.
                        this.ReportAttribute(attributeName, "", false, attributes, fullElementString);
                        readingAttributeValue = false;
                        attributeName = trimmed.Length > 0 ? trimmed : null;
                    }
                }
                else
                {
                    this.OnDefaultEvent(token);
                }
            }
        }
    }
}
www.java2v.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.