using System;
using System.Threading;
using System.Collections;
using System.Xml;
using System.Xml.Xsl;
using System.Xml.XPath;
using System.Net;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
namespace Bitworking{
/// <summary>
/// Wraps another exception and exposes a message that is augmented with
/// hints meant to be useful to an end user.
/// </summary>
public class HelpfulException : Exception {
    private string message;

    /// <summary>
    /// The message of the wrapped exception, followed by any hints that
    /// were added by the constructor.
    /// </summary>
    public override string Message { get { return message; } }

    /// <summary>
    /// Builds a user-oriented message out of <paramref name="e"/>.
    /// </summary>
    /// <param name="e">The exception to describe. It is kept as
    /// <see cref="Exception.InnerException"/> so that the original stack
    /// trace is not lost.</param>
    /// <exception cref="ArgumentNullException"><paramref name="e"/> is null.</exception>
    public HelpfulException( Exception e ) : base( e == null ? null : e.Message, e ) {
        if ( e == null )
            throw new ArgumentNullException( "e" );

        // Users do not care about the outer invocation exceptions, so look
        // at what the reflective call actually threw. A
        // TargetInvocationException may legally carry no inner exception;
        // in that case we describe the outer one instead of crashing.
        if ( e is System.Reflection.TargetInvocationException && e.InnerException != null )
            e = e.InnerException;

        message = e.Message;

        System.Net.WebException web = e as System.Net.WebException;
        if ( web != null ) {
            if ( web.Status == System.Net.WebExceptionStatus.NameResolutionFailure )
                message += "\r\n(Try checking that the URL is okay)";
            if ( web.Response != null && web.Response.ResponseUri != null )
                message += "\r\n(While attempting to access the following URI: " + web.Response.ResponseUri + ")";
        }
        // TODO: Is there anything interesting that System.UriFormatException
        // provides which could be appended here (e.g. the URI that failed
        // to parse)?
    }
} // class HelpfulException
/// <summary>
/// A collection of static utility methods.
/// </summary>
public class StringUtils {
/// <summary>
/// Tells whether <paramref name="str"/> equals one of the string
/// elements of <paramref name="collection"/>. Elements that are null or
/// are not strings are skipped.
/// </summary>
/// <param name="str">String to look for. A null value never matches.</param>
/// <param name="collection">Collection to search; may be null and may
/// contain non-string elements.</param>
/// <returns>true if a string element equal to <paramref name="str"/>
/// exists, false otherwise (including when the collection is null).</returns>
static public bool IsInCollection( string str, ICollection collection ) {
    bool found = false;
    if ( collection != null ) {
        foreach ( object candidate in collection ) {
            string text = candidate as string;
            if ( text == null )
                continue; // null or not a string
            if ( text == str ) {
                found = true;
                break;
            }
        }
    }
    return found;
} // IsInCollection
/// <summary>
/// Defuses the opening of script, meta and object tags in a piece of
/// markup. Every occurrence of '&lt;', optional white space and one of
/// those tag names (matched case-insensitively) is rewritten to
/// "&lt; script", "&lt; meta" or "&lt; object" respectively.
/// </summary>
/// <param name="s">String to sanitize; may be null or empty.</param>
/// <returns>The rewritten string. A null or empty input is returned
/// unchanged.</returns>
static public string SanitizeMarkup( string s ) {
    if ( s == null || s == "" )
        return s;
    string defused = findScript_.Replace( s, "< script" );
    defused = findMeta_.Replace( defused, "< meta" );
    return findObject_.Replace( defused, "< object" );
}
// One pre-built pattern per tag name handled by SanitizeMarkup.
private static Regex findScript_ = new Regex( @"<\s*script", RegexOptions.IgnoreCase );
private static Regex findMeta_ = new Regex( @"<\s*meta", RegexOptions.IgnoreCase );
private static Regex findObject_ = new Regex( @"<\s*object", RegexOptions.IgnoreCase );
/// <summary>
/// Chooses between a value and a substitute: the value wins whenever it
/// has substance, i.e. it is neither null nor the empty string.
/// </summary>
/// <param name="val">The preferred string.</param>
/// <param name="fallback">The string to use when <paramref name="val"/>
/// is null or empty. It is returned as-is, even if it is null.</param>
/// <returns><paramref name="val"/> if it is non-null and non-empty,
/// <paramref name="fallback"/> otherwise.</returns>
static public string StringOrFallback( string val, string fallback ) {
    bool lacksSubstance = ( val == null || val == "" );
    return lacksSubstance ? fallback : val;
}
/// <summary>
/// Evaluates the XPath <paramref name="childPath"/> relative to
/// <paramref name="node"/> and returns the inner XML of the first node
/// it selects.
/// </summary>
/// <param name="nsmanager">Namespace manager used to resolve prefixes in
/// <paramref name="childPath"/>.</param>
/// <param name="node">The node to start from; may be null.</param>
/// <param name="childPath">XPath expression, relative to
/// <paramref name="node"/>.</param>
/// <returns>The selected node's InnerXml, or the empty string when
/// <paramref name="node"/> is null or nothing is selected.</returns>
static public string GetChildInnerXml( XmlNamespaceManager nsmanager, XmlNode node, string childPath ) {
    XmlNode target = null;
    if ( node != null )
        target = node.SelectSingleNode( childPath, nsmanager );
    return ( target != null ) ? target.InnerXml : "";
} // GetChildInnerXml
/// <summary>
/// Evaluates the XPath <paramref name="childPath"/> relative to
/// <paramref name="node"/> and returns the content of the first selected
/// node as text: Text and CDATA children contribute their value, element
/// children contribute their outer XML, and every other kind of child is
/// ignored (and reported on the debug output).
/// </summary>
/// <remarks>
/// TODO: Check whether the whole concept can be made redundant through
/// an XML normalization phase prior to this level of processing.
/// </remarks>
/// <param name="nsmanager">Namespace manager used to resolve prefixes in
/// <paramref name="childPath"/>.</param>
/// <param name="node">The node to start from; may be null.</param>
/// <param name="childPath">XPath expression, relative to
/// <paramref name="node"/>.</param>
/// <returns>The text described above. Never null: the empty string is
/// returned when <paramref name="node"/> is null, when nothing is
/// selected, or when the selected node has neither children nor a value
/// (for example an empty element).</returns>
static public string GetChildAsText( XmlNamespaceManager nsmanager, XmlNode node, string childPath ) {
    if ( node == null )
        return "";
    XmlNode child = node.SelectSingleNode( childPath, nsmanager );
    if ( child == null )
        return "";

    XmlNodeList grandchildren = child.ChildNodes;
    if ( grandchildren == null || grandchildren.Count == 0 ) {
        // A leaf: text/CDATA/attribute nodes carry their content in Value,
        // but an empty element has a null Value. Callers get "" for every
        // other "nothing there" case, so do not leak a null here.
        string ownValue = child.Value;
        return ( ownValue != null ) ? ownValue : "";
    }

    // Concatenate the children in document order.
    StringBuilder text = new StringBuilder();
    foreach ( XmlNode grandchild in grandchildren ) {
        XmlNodeType type = grandchild.NodeType;
        if ( type == XmlNodeType.CDATA || type == XmlNodeType.Text ) {
            text.Append( grandchild.Value );
        }
        else if ( type == XmlNodeType.Element ) {
            // Embedded markup is kept as markup
            text.Append( grandchild.OuterXml );
        }
        else {
            System.Diagnostics.Debug.WriteLine( "*** Ignoring XmlNodeType: " + type + ", OuterXml: " + grandchild.OuterXml );
        }
    }
    return text.ToString();
} // GetChildAsText
/// <summary>
/// Reads a stream that holds an XML resource to its end and returns its
/// content as a string, choosing the text encoding from the first bytes
/// of the stream or from the XML declaration.
/// </summary>
/// <remarks>
/// The algorithm is:
/// 1. Read the 4-byte signature starting the stream.
/// 2. If the first byte is 0xFE, 0xFF or 0xEF, use UTF-16BE, UTF-16LE
///    or UTF-8 respectively.
/// 3. Otherwise, if the stream starts with "&lt;?xm", read the XML
///    declaration and use its 'encoding' attribute, if present.
/// 4. If nothing was determined, use UTF-8.
/// The stream is then rewound to position 0 and decoded in full, so it
/// must support seeking. The stream is left open.
/// </remarks>
/// <param name="stream">A seekable stream to read the XML resource from.</param>
/// <returns>The string representation of the XML resource.</returns>
/// <exception cref="ArgumentNullException"><paramref name="stream"/> is null.</exception>
/// <exception cref="Exception">The stream holds fewer than 4 bytes.</exception>
static public string LoadXmlStreamIntoString( Stream stream ) {
    if ( stream == null )
        throw new ArgumentNullException( "stream" );

    // 1.
    // A single Read call may return fewer bytes than requested without
    // the stream being exhausted, so keep reading until we have all four
    // bytes or hit the real end of the stream.
    byte[] signature = new byte[4];
    int bytesRead = 0;
    while ( bytesRead < signature.Length ) {
        int got = stream.Read( signature, bytesRead, signature.Length - bytesRead );
        if ( got <= 0 )
            break;
        bytesRead += got;
    }
    if ( bytesRead != signature.Length ) {
        throw new Exception( "XML stream ended abruptly" );
    }

    // 2.
    // Note that not all the signatures are handled here. (For example,
    // UCS-4 types are not.) See the XML spec for additional possible
    // signatures which currently go unused.
    Encoding encoding = null;
    byte s0 = signature[0];
    if ( s0 == 0xFE )
        encoding = Encoding.GetEncoding( "UTF-16BE" );
    else if ( s0 == 0xFF )
        encoding = Encoding.GetEncoding( "UTF-16LE" );
    else if ( s0 == 0xEF )
        encoding = Encoding.UTF8;
    // 3.
    // TODO:
    // 00/3C/00/3F - UTF-16BE, but read PI
    // 3C/00/3F/00 - UTF-16LE, but read PI
    // 4C/6F/A7/94 - EBCDIC
    else if ( s0 == 0x3C && signature[1] == 0x3F && signature[2] == 0x78 && signature[3] == 0x6D ) {
        // No BOM, but the stream starts with "<?xm": read the rest of the
        // declaration, up to "?>", to find its 'encoding' attribute.
        encoding = Encoding.UTF8;
        // Not disposed on purpose: disposing the reader would close the
        // caller's stream, which is still needed below.
        StreamReader piReader = new StreamReader( stream, encoding, false );
        StringBuilder piText = new StringBuilder();
        string line;
        // ReadLine returns null at the end of the stream; a declaration
        // that is never terminated must not crash the loop.
        while ( ( line = piReader.ReadLine() ) != null ) {
            piText.Append( line ).Append( "\r\n" );
            if ( line.IndexOf( "?>" ) >= 0 )
                break;
        }
        string xmlPi = piText.ToString();
        // Only look inside the declaration itself: the line that closes it
        // may go on with document content that has its own attributes.
        int piEnd = xmlPi.IndexOf( "?>" );
        if ( piEnd >= 0 )
            xmlPi = xmlPi.Substring( 0, piEnd );

        // Ignore everything in the PI except the value of the encoding attribute
        Match match = Regex.Match( xmlPi, "encoding\\W*=\\W*('|\")" );
        if ( match.Success ) {
            string enc = xmlPi.Substring( match.Index + match.Length );
            match = Regex.Match( enc, "('|\")" );
            if ( match.Success ) {
                enc = enc.Substring( 0, match.Index );
                // Get the encoding
                encoding = Encoding.GetEncoding( enc );
            }
        }
    }
    if ( encoding == null ) {
        // 4.
        encoding = Encoding.UTF8;
    }

    // Start over and decode the whole resource with the chosen encoding.
    stream.Position = 0;
    StreamReader reader = new StreamReader( stream, encoding, false );
    return reader.ReadToEnd();
} // LoadXmlStreamIntoString
/// <summary>
/// Returns a copy of a string in which every character that XML 1.0
/// does not allow is replaced by a question mark. The string keeps its
/// length, so character positions remain comparable with the source.
/// </summary>
/// <remarks>
/// Handled: the control characters below U+0020 other than tab, line
/// feed and carriage return, and the non-characters U+FFFE and U+FFFF.
/// Unpaired surrogates are not detected.
/// </remarks>
/// <param name="str">Source string to sanitize; a null is returned as null.</param>
/// <param name="comments">Optional storage for adding a comment on each
/// replaced character; may be null.</param>
/// <returns>The character-sanitized string.</returns>
private static string RemoveXmlInvalidCharacters( string str, StringBuilder comments ) {
    if ( str == null )
        return str;
    StringBuilder buf = new StringBuilder( str );
    for ( int i = 0; i < str.Length; i++ ) {
        char c = str[i];
        // XML 1.0 allows only #x9, #xA and #xD below #x20. Note that
        // Char.IsWhiteSpace is not a usable test here: it also accepts
        // vertical tab (#xB) and form feed (#xC), which XML rejects.
        bool invalid = ( c < 32 && c != '\t' && c != '\n' && c != '\r' )
            || c == '\uFFFE' || c == '\uFFFF';
        if ( !invalid )
            continue;
        buf[i] = '?';
        if ( comments != null )
            comments.Append( "<loose>Invalid character (code "
                + (int)c + ") found at character number " + i +
                ".</loose>\r\n" );
    }
    return buf.ToString();
} // RemoveXmlInvalidCharacters
/// <summary>
/// Load a file that holds a combination of HTML embedded in XML
/// into an XML document. Handle possible well-formedness violations
/// that are valid behavior in HTML but not in XML.
/// </summary>
/// <remarks>
/// Synopsis:
/// 1. Try loading using the XML parser, and return if successful.
/// 2. Otherwise read the file as text and replace any invalid character.
/// 3. Walk the text again and repair entity references: HTML named
///    entities are replaced by the character they stand for and stray
///    ampersands are escaped. CDATA sections are copied untouched.
/// 4. Load the repaired text into the document.
/// Step 2 opens <paramref name="filename"/> with a FileStream, so the
/// repair path only works for local files.
/// </remarks>
/// <param name="doc">The XML document to load to.</param>
/// <param name="filename">Location of the XML resource.</param>
/// <returns>A comment string made of &lt;strict&gt; and &lt;loose&gt;
/// elements. If non-empty, it indicates that the document was not
/// well-formed and had to be repaired.</returns>
/// <exception cref="Exception">The document is still not well-formed
/// after the repairs; the message lists the collected comments.</exception>
/// <exception cref="OverflowException">The text contains an empty entity
/// reference ("&amp;;").</exception>
static public string LoadXmlHtmlCombo( ref XmlDocument doc, string filename ) {
    StringBuilder comments = new StringBuilder();

    // 1. Try loading using the XML parser, and return if successful
    try {
        doc.Load( filename );
        return ""; // The file loaded okay; no further manipulations needed
    }
    catch ( XmlException xe ) {
        // The message issued by the parser might have an invalid XML character
        // itself. We need to encode it, AND make sure no XML-invalids are in it.
        string strict = "<strict>" +
            System.Web.HttpUtility.HtmlEncode( xe.Message ) + "</strict>\r\n";
        strict = RemoveXmlInvalidCharacters( strict, null );
        comments.Append( strict );
    }
    catch ( Exception e ) {
        // Deliberately ignored: the file is read again below, where a
        // persisting problem (e.g. a missing file) will surface.
        System.Diagnostics.Debug.WriteLine( "LoadXmlHtmlCombo ignoring unplanned exception.\r\n" + e.Message );
    }

    // Read the raw text. The stream is closed as soon as it has been read.
    string strDoc;
    using ( Stream stream = new FileStream( filename, FileMode.Open, FileAccess.Read ) ) {
        strDoc = LoadXmlStreamIntoString( stream );
    }
    int length = strDoc.Length;

    // Debugging support: Identify streams with a particular substring
    //if ( strDoc.IndexOf( "SUBSTRING_HERE" ) > 0 )
    //    System.Diagnostics.Debugger.Break();

    // 2. Walk the string once, and replace any invalid character
    // (a one-for-one replacement, so 'length' stays valid)
    strDoc = RemoveXmlInvalidCharacters( strDoc, comments );

    // 3. Walk the string again, and replace any invalid entity
    // Model:
    // - The string can be looked-at as a series of extents and entities
    // - An extent starts with a non-&-char, and extends until the first
    //   &-char (exclusive)
    // - An entity starts with a &-char, and ends with a ;-char
    // - Some people put stray &-char which are not a part of an entity
    // - A CDATA section is opaque: ampersands inside it are literal
    // Algorithm:
    // - Start from the string's beginning
    // - Determine the longest extent that you find and copy that to the output
    // - If not end-of-string, we are on a CDATA section (copy it whole) or
    //   on a &-char (look for the corresponding ;-char)
    // - Repeat
    StringBuilder buf = new StringBuilder( length + length/4 );
    int i = 0;
    while ( i < length ) {
        // i is our progress indicator -- it tells where we are in the string
        // k indicates the start of the current extent
        int k = i;
        int nextAmpersand = strDoc.IndexOf( '&', k );
        if ( nextAmpersand < 0 )
            nextAmpersand = length;
        // Only a CDATA section that starts before the ampersand matters
        int nextCDATA = strDoc.IndexOf( "<![CDATA[", k, nextAmpersand-k );
        if ( nextCDATA < 0 )
            nextCDATA = length;
        i = System.Math.Min( nextAmpersand, nextCDATA );

        // The extent is strDoc[k..i). Note that it might be empty (i==k)
        buf.Append( strDoc, k, i-k );
        if ( i == length )
            break; // The extent reached the end of the string

        if ( strDoc[i] == '<' ) {
            // CDATA: copy it verbatim, terminator included. 'end' is the
            // index just past the section; an unterminated section runs
            // to the end of the string.
            int end = strDoc.IndexOf( "]]>", i );
            end = ( end < 0 ) ? length : end + 3;
            buf.Append( strDoc, i, end-i );
            i = end;
            continue;
        }
        if ( strDoc[i] != '&' ) {
            throw new Exception( "Bug. Expecting either an entity reference or a CDATA" );
        }

        // Entity
        // Locate the end of the entity reference, looking at most 15
        // characters past the ampersand. The bound is tested first so we
        // never index past the end of the string.
        int j = i+1;
        int jmax = System.Math.Min( i+16, length );
        while ( j < jmax && strDoc[j] != ';' )
            j++;
        if ( j == jmax ) {
            // We assume that we've encountered a stray '&' hiding in a
            // string. This is not valid XML, but many people don't care.
            // Emit it escaped so that the result is well-formed.
            buf.Append( "&amp;" );
            // The comments are XML fragments themselves, hence the escaping
            comments.Append( "<loose>A stray ampersand character ('&amp;') was found at character number " + i
                + ". Please use '&amp;amp;' instead.</loose>\r\n" );
            i++;
            continue;
        }
        // strDoc[i, i+1, ...] now looks like:
        //   &name  ;...
        //   &#dddd ;...
        //   &#xhhhh;...
        //   ^      ^
        //   i      j
        if ( i+1 == j )
            throw new OverflowException( "Entity reference cannot be empty (&;)" );

        string str = strDoc.Substring( i, j-i+1 );
        switch ( str ) {
            // We preserve the 5 XML entities
            case "&quot;":
            case "&amp;":
            case "&lt;":
            case "&gt;":
            case "&apos;":
                break;
            default:
                if ( str[1] == '#' ) {
                    // A numeric character reference is valid XML as it
                    // stands. Decoding it here would put raw '&' or '<'
                    // into the output for references like &#38; or &#60;.
                }
                else {
                    comments.Append( "<loose>The HTTP character entity '" +
                        System.Web.HttpUtility.HtmlEncode( str ) +
                        "' (which is not a valid XML character entity) was found at character number " + i +
                        ".</loose>\r\n" );
                    // Translate the HTML entity into the character it names
                    str = System.Web.HttpUtility.HtmlDecode( str );
                }
                break;
        }
        buf.Append( str );
        i = j+1;
    } // while i < length

    // 4. Finally, we can read the new string into the XmlDocument
    // System.Diagnostics.Debug.WriteLine( buf.ToString() );
    try {
        doc.LoadXml( buf.ToString() );
    }
    catch ( XmlException stillInvalid ) {
        // Nope, the XML is still invalid. Nothing to do but report the errors.
        const int MaxMessageBack = 1024;
        if ( comments.Length > MaxMessageBack ) {
            comments.Remove( MaxMessageBack, comments.Length - MaxMessageBack );
            comments.Append( "...(rest cut)" );
        }
        throw new Exception( "Invalid XML document:\r\n" + comments.ToString(), stillInvalid );
    }
    return comments.ToString();
} // LoadXmlHtmlCombo
} // class StringUtils
}
|