//------------------------------------------------------------------------
// ZipRC.exe - HtmlLex.cs
//
// Licensed under the wxWidgets license, see LICENSE.txt for details.
// (c) Dr. Harald Meyer auf'm Hofe
//
// $Id: HtmlLex.cs,v 1.2 2007/09/22 21:47:43 harald_meyer Exp $
//------------------------------------------------------------------------
using System;
using System.Text;
using System.IO;
namespace Contrib.Html{
/** Fast and rough lexical analysis of HTML.
* Without exact regard for the actual lexical structure of HTML or SGML, this simply
* uses blanks (or characters with smaller codes) and the assignment operator as
* separators. Additionally, "<" always starts a new token and the current token
* is ended by ">". Finally, this
* scanner recognizes strings in such a way that everything in between two quotes "
* is collected into one token.
*
* Line endings are also tokens. This simply enables users of this class
* to retain the line endings when converting an HTML source into a new HTML text.
*/
public class HtmlLex
{
    /** Character sequence that opens an HTML remark. */
    private const string RemarkStart = "<!--";
    /** Character sequence that closes an HTML remark. */
    private const string RemarkEnd = "-->";

    /** Text being scanned; set to \c null as soon as it has been consumed completely. */
    string _src;
    /** Index into \c _src of the next character to examine. */
    int _pos;
    /** Token produced by the last call of NextToken(), \c null if there is none. */
    string _token = null;

    /** Generates a lexer listening on the provided data source.
    * \param src the text to scan. With \c null the lexer behaves like one
    *            that has already read everything.
    */
    public HtmlLex(string src)
    {
        this._src = src;
        this._pos = 0;
    }

    /** Returns \c true iff you can read a token from this.
    * Note that this reads the first token if no token has been read so far
    * (see CurrentToken).
    */
    public bool HasToken { get { return this.CurrentToken != null; } }

    /** This returns the current token.
    * If no token is available yet, this reads one by calling NextToken().
    * This is \c null if everything has been read.
    */
    public string CurrentToken
    {
        get
        {
            if (this._token == null)
            {
                return NextToken();
            }
            return this._token;
        }
    }

    /** This will jump to the next token that is not a "\n".
    * Use this whenever you do not want to deal with line feeds in
    * the original data source.
    * \return the token found or \c null if everything has been read.
    */
    public string NextTokenNoCR()
    {
        string token;
        do
        {
            token = NextToken();
        }
        while (token == "\n");
        return token;
    }

    /** This will jump to the next token.
    * \return the new current token. This will be \c null if everything has been read.
    */
    public string NextToken()
    {
        if (this._src == null)
        {
            // Drop the last token, otherwise CurrentToken and HasToken would
            // report it forever and never signal the end of the input.
            this._token = null;
            return null;
        }
        this._token = ScanToken();
        // ScanToken() may already have dropped the source (unterminated remark).
        if (this._src != null && this._pos >= this._src.Length)
        {
            this._src = null;
        }
        return this._token;
    }

    /** Scans one token starting at the cursor and moves the cursor behind it.
    * Expects \c _src to be non-null. Sets \c _src to \c null if an unterminated
    * remark swallowed the rest of the text.
    * \return the scanned token or \c null if the remaining text did not
    *         contain any character that belongs to a token.
    */
    private string ScanToken()
    {
        StringBuilder collected = new StringBuilder();
        for (; this._pos < this._src.Length; ++this._pos)
        {
            char c = this._src[this._pos];

            // CompareOrdinal copes with fewer than four remaining characters,
            // so a "<" close to the end of the text is just an ordinary "<".
            if (c == '<'
                && collected.Length == 0
                && string.CompareOrdinal(this._src, this._pos, RemarkStart, 0, RemarkStart.Length) == 0)
            {
                // A remark becomes one token including its delimiters.
                string remark;
                int end = this._src.IndexOf(RemarkEnd, this._pos + RemarkStart.Length, StringComparison.Ordinal);
                if (end < 0)
                {
                    // Unterminated remark: it takes all the remaining text.
                    remark = this._src.Substring(this._pos);
                    this._src = null;
                }
                else
                {
                    end += RemarkEnd.Length;
                    remark = this._src.Substring(this._pos, end - this._pos);
                    this._pos = end;
                }
                return remark;
            }

            if (c == '>' || c == '\n' || c == '=')
            {
                if (collected.Length > 0)
                {
                    // Finish the pending token first; the separator is not
                    // consumed and becomes a token of its own on the next call.
                    return collected.ToString();
                }
                ++this._pos;
                return c.ToString();
            }

            if (collected.Length > 0
                && (c == '"' || c == '<' || c <= ' '))
            {
                // Stop with the current token because a quote, "<" or a blank
                // (or control character) starts a new one.
                return collected.ToString();
            }

            // Control characters are skipped. A blank that begins a token is
            // collected into it, so blanks of the source are not lost.
            if (c >= ' ')
            {
                collected.Append(c);
            }

            if (c == '"')
            {
                // Read a string: take everything up to and including the
                // closing quote. The outer loop then steps over that quote.
                for (++this._pos; this._pos < this._src.Length; ++this._pos)
                {
                    c = this._src[this._pos];
                    collected.Append(c);
                    if (c == '"')
                    {
                        break;
                    }
                }
            }
        }
        // End of text reached: whatever has been collected is the last token.
        return collected.Length > 0 ? collected.ToString() : null;
    }

    /** This will find the next occurrence of \c findThis, put the cursor to the end of this occurrence, and return the string of chars this procedure jumped over.
    * The returned string includes the occurrence of \c findThis itself.
    * If \c findThis cannot be found, this will return the complete not yet scanned text
    * and nothing is left to read afterwards. If everything has been read already,
    * this returns the empty string.
    * The search compares characters ordinally. The value of CurrentToken is
    * not changed by this call.
    * */
    public string GotoNext(string findThis)
    {
        if (this._src == null)
        {
            // Everything has been read: there is no text left to jump over.
            return string.Empty;
        }
        int newPos = this._src.IndexOf(findThis, this._pos, StringComparison.Ordinal);
        string result;
        if (newPos < 0)
        {
            result = this._src.Substring(this._pos);
            this._src = null;
        }
        else
        {
            newPos += findThis.Length;
            result = this._src.Substring(this._pos, newPos - this._pos);
            this._pos = newPos;
        }
        return result;
    }
}
}
|