Utility.cs : » RSS-RDF » Aggie » Bitworking » C# / CSharp Open Source

1.	2.6.4 mono .net core
2.	2.6.4 mono core
3.	Aspect Oriented Frameworks
4.	Bloggers
5.	Build Systems
6.	Business Application
7.	Charting Reporting Tools
8.	Chat Servers
9.	Code Coverage Tools
10.	Content Management Systems CMS
11.	CRM ERP
12.	Database
13.	Development
14.	Email
15.	Forum
16.	Game
17.	GIS
18.	GUI
19.	IDEs
20.	Installers Generators
21.	Inversion of Control Dependency Injection
22.	Issue Tracking
23.	Logging Tools
24.	Message
25.	Mobile
26.	Network Clients
27.	Network Servers
28.	Office
29.	PDF
30.	Persistence Frameworks
31.	Portals
32.	Profilers
33.	Project Management
34.	RSS RDF
35.	Rule Engines
36.	Script
37.	Search Engines
38.	Sound Audio
39.	Source Control
40.	SQL Clients
41.	Template Engines
42.	Testing
43.	UML
44.	Web Frameworks
45.	Web Service
46.	Web Testing
47.	Wiki Engines
48.	Windows Presentation Foundation
49.	Workflows
50.	XML Parsers
C# / C Sharp
C# / C Sharp by API
C# / CSharp Tutorial
C# / CSharp Open Source » RSS RDF » Aggie
Aggie » Bitworking » Utility.cs

using System;
using System.Threading;
using System.Collections;
using System.Xml;
using System.Xml.Xsl;
using System.Xml.XPath;
using System.Net;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;

namespace Bitworking{

  /// <summary>
  /// Wraps another exception and exposes a message aimed at end users:
  /// the reflection wrapper (TargetInvocationException) is peeled off, and
  /// web failures get extra hints appended to the original message.
  /// </summary>
  public class HelpfulException : Exception {
    private string message;

    /// <summary>The user-oriented message computed by the constructor.</summary>
    public override string Message { get { return message; } }

    /// <summary>
    /// Builds the helpful message from the given exception.
    /// </summary>
    /// <param name="e">The exception to describe. May be null, in which
    /// case the default message of the base class is used.</param>
    public HelpfulException( Exception e ) {
      if ( e == null ) {
        // Throwing from here would mask whatever error is being reported,
        // so fall back to the framework's default message instead.
        message = base.Message;
        return;
      }

      // Users do not care about the outer invocation exceptions. The inner
      // exception may legitimately be null, so only unwrap when there is
      // something to unwrap to.
      if ( e is System.Reflection.TargetInvocationException && e.InnerException != null )
        e = e.InnerException;

      message = e.Message;

      System.Net.WebException webError = e as System.Net.WebException;
      if ( webError != null ) {
        if ( webError.Status == System.Net.WebExceptionStatus.NameResolutionFailure )
          message += "\r\n(Try checking that the URL is okay)";
        if ( webError.Response != null && webError.Response.ResponseUri != null )
          message += "\r\n(While attempting to access the following URI: " + webError.Response.ResponseUri + ")";
      }

      // TODO: Is there anything interesting that System.UriFormatException
      //       provides which could be appended to the message here?
    }
  } // class HelpfulException

  /// <summary>
  /// A collection of static utility methods.
  /// </summary>
  public class StringUtils {

    /// <summary>
    /// Tells whether a collection contains a string element equal to the
    /// given string. Elements that are not strings are skipped.
    /// </summary>
    /// <param name="str">String to look for. A null value never matches.</param>
    /// <param name="collection">Collection to scan; may be null and may
    /// hold elements of any type.</param>
    /// <returns>true when an equal string element exists, false otherwise
    /// (including when the collection is null).</returns>
    static public bool IsInCollection( string str, ICollection collection ) {
      if ( collection != null ) {
        foreach ( object element in collection ) {
          string candidate = element as string;
          if ( candidate == null )
            continue; // Either a null entry or not a string at all
          if ( candidate == str )
            return true;
        }
      }
      return false;
    } // IsInCollection

    /// <summary>
    /// Defuses the opening of script, meta and object tags in a piece of
    /// markup. Each opening angle bracket that is followed (after optional
    /// whitespace) by one of those tag names, in any letter case, is
    /// rewritten as the escaped bracket, a space and the lower-case tag name.
    /// </summary>
    /// <param name="s">String to sanitize; may be null or empty.</param>
    /// <returns>The sanitized string. A null or empty input is returned
    /// unchanged.</returns>
    static public string SanitizeMarkup( string s ) {
      // Nothing to neutralize in a null or empty string
      if ( s == null || s.Length == 0 )
        return s;

      string withoutScript = findScript_.Replace( s, "&lt; script" );
      string withoutMeta = findMeta_.Replace( withoutScript, "&lt; meta" );
      return findObject_.Replace( withoutMeta, "&lt; object" );
    }

    // Patterns locating the start of the tags neutralized by SanitizeMarkup
    private static Regex findScript_ = new Regex("<\\s*script", RegexOptions.IgnoreCase);
    private static Regex findMeta_ = new Regex("<\\s*meta", RegexOptions.IgnoreCase);
    private static Regex findObject_ = new Regex("<\\s*object", RegexOptions.IgnoreCase);


    /// <summary>
    /// Chooses between a preferred string and a fallback, depending on
    /// whether the preferred string carries any content.
    /// </summary>
    /// <param name="val">The preferred string.</param>
    /// <param name="fallback">The string used when val is null or empty.</param>
    /// <returns>val when it is neither null nor empty; fallback otherwise.</returns>
    static public string StringOrFallback( string val, string fallback ) {
      bool hasSubstance = val != null && val.Length > 0;
      return hasSubstance ? val : fallback;
    }

    /// <summary>
    /// Follows the XPath 'childPath' from 'node' and returns the inner XML
    /// of the first node it selects.
    /// </summary>
    /// <param name="nsmanager">Namespace manager used to resolve prefixes in childPath.</param>
    /// <param name="node">Starting node; may be null.</param>
    /// <param name="childPath">XPath expression evaluated relative to node.</param>
    /// <returns>The selected node's InnerXml, or an empty string when node
    /// is null or the path selects nothing.</returns>
    static public string GetChildInnerXml( XmlNamespaceManager nsmanager, XmlNode node, string childPath ) {
      XmlNode target = ( node == null ) ? null : node.SelectSingleNode( childPath, nsmanager );
      return ( target == null ) ? "" : target.InnerXml;
    } // GetChildInnerXml

    /// <summary>
    /// Follows the XPath 'childPath' from 'node' and flattens the content of
    /// the first selected node into one string: text and CDATA children
    /// contribute their value, element children contribute their outer XML,
    /// and every other kind of child is skipped (with a debug trace).
    /// </summary>
    /// <param name="nsmanager">Namespace manager used to resolve prefixes in childPath.</param>
    /// <param name="node">Starting node; may be null.</param>
    /// <param name="childPath">XPath expression evaluated relative to node.</param>
    /// <returns>An empty string when node is null or nothing is selected;
    /// the selected node's own Value when it has no children; otherwise the
    /// concatenation described above.</returns>
    // TODO: Check whether the whole concept can be made redundant through
    //       XML normalization phase prior to this level of processing.
    static public string GetChildAsText( XmlNamespaceManager nsmanager, XmlNode node, string childPath ) {
      XmlNode child = ( node == null ) ? null : node.SelectSingleNode( childPath, nsmanager );
      if ( child == null )
        return "";

      XmlNodeList parts = child.ChildNodes;
      bool hasParts = parts != null && parts.Count != 0;
      if ( !hasParts )
        return child.Value;

      // A lone Text/CDATA child simply yields its value; several children
      // are glued together in document order.
      StringBuilder flattened = new StringBuilder( child.OuterXml.Length );
      foreach ( XmlNode part in parts ) {
        switch ( part.NodeType ) {
          case XmlNodeType.CDATA:
          case XmlNodeType.Text:
            flattened.Append( part.Value );
            break;
          case XmlNodeType.Element:
            flattened.Append( part.OuterXml );
            break;
          default:
            System.Diagnostics.Debug.WriteLine( "*** Ignoring XmlNodeType: " + part.NodeType + ", OuterXml: " + part.OuterXml );
            break;
        }
      }

      return flattened.ToString();
    } // GetChildAsText

    /// <summary>
    /// Read a stream to its end. Assuming that the stream holds an XML
    /// resource, put its content into a string, taking into account the
    /// various encodings an XML resource might be in.
    /// </summary>
    /// <param name="stream">A stream to read the XML resource from. Its
    /// Position is reset to 0 before the final read, so it has to support
    /// seeking.</param>
    /// <returns>The string representation of the XML resource.</returns>
    /// <exception cref="Exception">The stream holds fewer than four bytes.</exception>
    /// <exception cref="ArgumentException">The XML declaration names an
    /// encoding that Encoding.GetEncoding does not recognize.</exception>
    static public string LoadXmlStreamIntoString( Stream stream ) {
      // Our algorithm is:
      // 1. Read the signature starting the stream
      // 2. If the signature is matched by a known encoding, use that encoding
      // 3. Otherwise, assume ASCII and look for the XML PI's encoding attribute
      // 4. If none found, the stream is UTF-8

      // 1.
      // Stream.Read may hand back fewer bytes than requested without being
      // at the end of the stream, so keep reading until we have all four.
      byte[] signature = new Byte[4];
      int bytesRead = 0;
      while ( bytesRead < signature.Length ) {
        int chunk = stream.Read( signature, bytesRead, signature.Length - bytesRead );
        if ( chunk <= 0 )
          break;
        bytesRead += chunk;
      }
      if ( bytesRead != signature.Length ) {
        throw new Exception( "XML stream ended abruptly" );
      }

      // 2.
      // Note that currently I haven't put all the signatures here.
      // (For example, UCS-4 types are not found here.) See the XML spec
      // for additional possible signatures which currently go unused.
      // The complete byte order mark is compared, not just its first byte.
      Encoding encoding = null;
      if ( signature[0] == 0xFE && signature[1] == 0xFF )
        encoding = Encoding.GetEncoding( "UTF-16BE" );
      else if ( signature[0] == 0xFF && signature[1] == 0xFE )
        encoding = Encoding.GetEncoding( "UTF-16LE" );
      else if ( signature[0] == 0xEF && signature[1] == 0xBB && signature[2] == 0xBF )
        encoding = Encoding.UTF8;

      // 3.
      // TODO:
      // 00/3C/00/3F - UTF-16BE, but read PI
      // 3C/00/3F/00 - UTF-16LE, but read PI
      // 4C/6F/A7/94 - EBCDIC
      else if ( signature[0] == 0x3C && signature[1] == 0x3F && signature[2] == 0x78 && signature[3] == 0x6D ) {
        // We don't have a BOM, we do have an XML PI ("<?xm..."), and we need
        // to read the PI's 'encoding' attribute to determine the encoding.
        // The reader is deliberately not disposed: that would close the
        // caller's stream.
        StreamReader piReader = new StreamReader( stream, Encoding.UTF8, false );

        // Collect text up to (not including) the end of the PI "?>". Stop at
        // the end of the stream too, in case the PI is never terminated.
        StringBuilder piText = new StringBuilder();
        string line;
        while ( ( line = piReader.ReadLine() ) != null ) {
          int piEnd = line.IndexOf( "?>" );
          if ( piEnd >= 0 ) {
            // Cut at the terminator so that an 'encoding' attribute in the
            // document body on the same line cannot be mistaken for the PI's.
            piText.Append( line, 0, piEnd );
            break;
          }
          piText.Append( line ).Append( "\r\n" );
        }
        string xmlPi = piText.ToString();

        // Ignore everything in the PI except the value of the encoding attribute
        Match match = Regex.Match( xmlPi, "encoding\\W*=\\W*('|\")" );
        if ( match.Success ) {
          string enc = xmlPi.Substring( match.Index + match.Length );
          match = Regex.Match( enc, "('|\")" );
          if ( match.Success ) {
            enc = enc.Substring( 0, match.Index );

            // Get the encoding
            encoding = Encoding.GetEncoding( enc );
          }
        }
      }

      if ( encoding == null ) {
        // 4.
        encoding = Encoding.UTF8;
      }

      // Start over and decode the complete stream with the chosen encoding
      stream.Position = 0;
      StreamReader reader = new StreamReader( stream, encoding, false );
      return reader.ReadToEnd();
    } // LoadXmlStreamIntoString

    /// <summary>
    /// Returns a copy of the string in which every ASCII control character
    /// that is not allowed in XML is replaced by a question mark. The length
    /// of the string, and therefore every character position, is preserved.
    /// </summary>
    /// <param name="str">Source string to sanitize.</param>
    /// <param name="comments">Optional storage for adding comments on replaced
    /// characters; may be null.</param>
    /// <returns>The character-sanitized string.</returns>
    private static string RemoveXmlInvalidCharacters( string str, StringBuilder comments ) {
      StringBuilder buf = new StringBuilder( str );
      for ( int i = 0; i < str.Length; i++ ) {
        char c = str[i];
        // Below 0x20, XML 1.0 permits only TAB, LF and CR. Char.IsWhiteSpace
        // is not a suitable test here: it also accepts VT (0x0B) and
        // FF (0x0C), which an XML parser rejects.
        if ( c < 32 && c != '\t' && c != '\n' && c != '\r' ) {
          buf[i] = '?';
          if ( comments != null )
            comments.Append( "<loose>Invalid character (code "
              + (int)c + ") found at character number  " + i +
              ".</loose>\r\n" );
        }
      }
      return buf.ToString();
    } // RemoveXmlInvalidCharacters

    /// <summary>
    /// Load a file that holds a combination of HTML embedded in XML
    /// into an XML document. Handle possible well-formedness violations
    /// that are valid behavior in HTML but not in XML.
    /// </summary>
    /// <param name="doc">The XML document to load to.</param>
    /// <param name="filename">Location of the XML resource. It is first
    /// handed to XmlDocument.Load; if strict parsing fails it is opened as
    /// a file for the lenient pass.</param>
    /// <returns>A comment string. If non-empty, it indicates that the
    /// document was not well-formed.</returns>
    /// <exception cref="OverflowException">An empty entity reference
    /// (an ampersand immediately followed by a semicolon) was found.</exception>
    /// <exception cref="Exception">The document is still not well-formed
    /// after the repairs; the message carries the collected comments.</exception>
    static public string LoadXmlHtmlCombo( ref XmlDocument doc, string filename ) {
      // Synopsis:
      // 1. Try loading using the XML parser, and return if successful
      // 2. Walk the string once, and replace any invalid character
      // 3. Walk the string again, and replace any invalid entity
      StringBuilder comments = new StringBuilder();

      // 1. Try loading using the XML parser, and return if successful
      try {
        doc.Load( filename );
        return "";  // The file loaded okay; no further manipulation is needed
      }
      catch ( XmlException xe ) {
        // The message issued by the parser might have an invalid XML character
        // itself. We need to encode it, AND make sure no XML-invalids are in it.
        string strict = "<strict>" + 
          System.Web.HttpUtility.HtmlEncode( xe.Message ) + "</strict>\r\n";
        strict = RemoveXmlInvalidCharacters( strict, null );
        comments.Append( strict );
      }
      catch ( Exception e ) {
        // Currently do nothing
        System.Diagnostics.Debug.WriteLine( "LoadXmlHtmlCombo ignoring unplanned exception.\r\n" + e.Message );
      }

      // Read the raw text for the lenient pass. The using statement makes
      // sure the file handle is released even if decoding throws.
      string strDoc;
      using ( Stream stream = new FileStream( filename, FileMode.Open, FileAccess.Read ) ) {
        strDoc = LoadXmlStreamIntoString( stream );
      }
      int length = strDoc.Length;

      // Debugging support: Identify streams with a particular substring
      //if ( strDoc.IndexOf( "SUBSTRING_HERE" ) > 0 )
      //  System.Diagnostics.Debugger.Break();

      // 2. Walk the string once, and replace any invalid character
      //    (a one-for-one replacement, so 'length' stays valid)
      strDoc = RemoveXmlInvalidCharacters( strDoc, comments );

      StringBuilder buf = new StringBuilder( length + length/4 );

      // 3. Walk the string again, and replace any invalid entity
      // Model:
      // - The string can be looked-at as a series of extents and entities
      // - An extent starts with a non-&-char, and extends until the first
      //   &-char (exclusive)
      // - An entity starts with a &-char, and ends with a ;-char
      // - Some people put stray &-char which are not a part of an entity
      // - CDATA sections are copied verbatim; ampersands inside them are literal
      // Algorithm:
      // - Start from the string's beginning
      // - Determine the longest extent that you find and copy that to the output
      // - If not end-of-string, we are on a &-char and looking for corresponding ;-char
      // - Repeat

      int i = 0;
      while ( i < length ) {
        // i is our progress indicator -- it tells where we are in the string
        // k indicates the start of the current extent
        int k = i;

        int nextAmpersand = strDoc.IndexOf( '&', k );
        if ( nextAmpersand < 0 )
          nextAmpersand = length;

        // Only a CDATA section that starts before the next ampersand matters
        int nextCDATA = strDoc.IndexOf( "<![CDATA[", k, nextAmpersand-k );
        if ( nextCDATA < 0 )
          nextCDATA = length;

        i = System.Math.Min( nextAmpersand, nextCDATA );
        if ( i == length ) {
          // Extent reaches from k to the end
          buf.Append( strDoc, k, length-k );
          break;
        }

        // Extent is strDoc.Substring(k,i-k). Note that it might be empty (i==k)
        buf.Append( strDoc, k, i-k );

        // Now that we have dealt with the extent, we need to find out whether
        // we have an entity ('&...;') or a CDATA ('<![CDATA[...]]>')
        if ( strDoc[i] == '<' ) {
          // CDATA: j becomes the index just past the terminating "]]>", or
          // the end of the string when the section is never closed.
          int j = strDoc.IndexOf( "]]>", i );
          if ( j < 0 )
            j = length;
          else
            j += 3; // Include the terminating sequence in the bunch
          // Copy exactly [i, j) and resume at j. Copying one character more
          // would run past the end of the string when the section reaches
          // it, and would otherwise smuggle an unprocessed character through.
          buf.Append( strDoc, i, j-i );
          i = j;
        }
        else if ( strDoc[i] == '&' ) {
          // Entity
          // Locate the end of the entity reference, looking at most 16
          // characters ahead. The bound is tested before indexing so that an
          // ampersand close to the end of the string cannot index past it.
          int j = i+1;
          int jmax = i+16;
          if ( jmax > length )
            jmax = length;
          while ( j < jmax && strDoc[j] != ';' )
            j++;

          if ( j == jmax ) {
            // We assume that we've encountered a stray '&' hiding in a 
            // string. This is not valid XML, but many people don't care.
            buf.Append( "&amp;" );
            i++;
            comments.Append( "<loose>A stray ampersand character ('&amp;') was found at character number " + i 
              + ". Please use '&amp;amp;' instead.</loose>\r\n" );
            continue;
          }

          // strDoc[i, i+1, ...] now looks like:
          //   &name  ;...
          //   &#dddd ;...
          //   &#xhhhh;...
          //   ^      ^
          //   i      j
          // NOTE(review): the exception type is kept as-is because callers
          // may be catching it.
          if ( i+1 == j )
            throw new OverflowException( "Entity reference cannot be empty (&;)" );

          // This is an entity reference. Translate it into a character
          string str = strDoc.Substring( i, j-i+1 );

          switch ( str ) {
              // We preserve the 5 XML entities
            case "&quot;":
            case "&amp;":
            case "&lt;":
            case "&gt;":
            case "&apos;":
              break;
              // The others are passed to a CLR function (both numeric code and others)
            default:
              if ( str[1] != '#' ) // Only comment if not numeric code
                comments.Append( "<loose>The HTTP character entity '" + 
                  System.Web.HttpUtility.HtmlEncode( str ) + 
                  "' (which is not a valid XML character entity) was found at character number " + i + 
                  ".</loose>\r\n" );
              string decoded = System.Web.HttpUtility.HtmlDecode( str );
              // A reference that decodes to a markup-significant character
              // (for example a numeric reference to '<' or '&') has to stay a
              // reference; emitting the raw character would corrupt the markup.
              if ( decoded != "<" && decoded != ">" && decoded != "&" && decoded != "\"" && decoded != "'" )
                str = decoded;
              break;
          }
          buf.Append( str );
          i = j+1;
        } // if entity
        else {
          throw new Exception( "Bug. Expecting either an entity reference or a CDATA" );
        }
      } // while i < length

      // Finally, we can read the new string into the XmlDocument
      try {
        doc.LoadXml( buf.ToString() );
      }
      catch ( XmlException ) {
        // Nope, the XML is still invalid. Nothing to do but report the errors,
        // capped so that the exception message stays readable.
        int MaxMessageBack = 1024;
        if ( comments.Length > MaxMessageBack ) {
          comments.Remove( MaxMessageBack, comments.Length - MaxMessageBack );
          comments.Append( "...(rest cut)" );
        }
        throw new Exception( "Invalid XML document:\r\n" + comments.ToString() );
      }

      return comments.ToString();
    } // LoadXmlHtmlCombo

  } // class StringUtils
}
www.java2java.com | Contact Us
All other trademarks are property of their respective owners.