0001: /*
0002: * $HeadURL: https://svn.apache.org/repos/asf/jakarta/httpcomponents/oac.hc3x/tags/HTTPCLIENT_3_1/src/java/org/apache/commons/httpclient/URI.java $
0003: * $Revision: 564973 $
0004: * $Date: 2007-08-11 22:51:47 +0200 (Sat, 11 Aug 2007) $
0005: *
0006: * ====================================================================
0007: *
0008: * Licensed to the Apache Software Foundation (ASF) under one or more
0009: * contributor license agreements. See the NOTICE file distributed with
0010: * this work for additional information regarding copyright ownership.
0011: * The ASF licenses this file to You under the Apache License, Version 2.0
0012: * (the "License"); you may not use this file except in compliance with
0013: * the License. You may obtain a copy of the License at
0014: *
0015: * http://www.apache.org/licenses/LICENSE-2.0
0016: *
0017: * Unless required by applicable law or agreed to in writing, software
0018: * distributed under the License is distributed on an "AS IS" BASIS,
0019: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0020: * See the License for the specific language governing permissions and
0021: * limitations under the License.
0022: * ====================================================================
0023: *
0024: * This software consists of voluntary contributions made by many
0025: * individuals on behalf of the Apache Software Foundation. For more
0026: * information on the Apache Software Foundation, please see
0027: * <http://www.apache.org/>.
0028: *
0029: */
0030:
0031: package org.apache.commons.httpclient;
0032:
0033: import java.io.IOException;
0034: import java.io.ObjectInputStream;
0035: import java.io.ObjectOutputStream;
0036: import java.io.Serializable;
0037: import java.util.Arrays;
0038: import java.util.Locale;
0039: import java.util.BitSet;
0040: import java.util.Hashtable;
0041:
0042: import org.apache.commons.codec.DecoderException;
0043: import org.apache.commons.codec.net.URLCodec;
0044: import org.apache.commons.httpclient.util.EncodingUtil;
0045:
0046: /**
0047: * The interface for the URI (Uniform Resource Identifiers) version of RFC 2396.
0048: * The purpose of this class is to support parsing a URI reference so that it
0049: * can be extended for any specific protocol, taking into account the character
0050: * encoding of the protocol to be transported and the charset of the document.
0051: * <p>
0052: * A URI is always in an "escaped" form, since escaping or unescaping a
0053: * completed URI might change its semantics.
0054: * <p>
0055: * Implementers should be careful not to escape or unescape the same string
0056: * more than once, since unescaping an already unescaped string might lead to
0057: * misinterpreting a percent data character as another escaped character,
0058: * or vice versa in the case of escaping an already escaped string.
0059: * <p>
0060: * In order to avoid these problems, data types used as follows:
0061: * <p><blockquote><pre>
0062: * URI character sequence: char
0063: * octet sequence: byte
0064: * original character sequence: String
0065: * </pre></blockquote><p>
0066: *
0067: * So, a URI is a sequence of characters as an array of a char type, which
0068: * is not always represented as a sequence of octets as an array of byte.
0069: * <p>
0070: *
0071: * URI Syntactic Components
0072: * <p><blockquote><pre>
0073: * - In general, written as follows:
0074: * Absolute URI = <scheme>:<scheme-specific-part>
0075: * Generic URI = <scheme>://<authority><path>?<query>
0076: *
0077: * - Syntax
0078: * absoluteURI = scheme ":" ( hier_part | opaque_part )
0079: * hier_part = ( net_path | abs_path ) [ "?" query ]
0080: * net_path = "//" authority [ abs_path ]
0081: * abs_path = "/" path_segments
0082: * </pre></blockquote><p>
0083: *
0084: * The following examples illustrate URI that are in common use.
0085: * <pre>
0086: * ftp://ftp.is.co.za/rfc/rfc1808.txt
0087: * -- ftp scheme for File Transfer Protocol services
0088: * gopher://spinaltap.micro.umn.edu/00/Weather/California/Los%20Angeles
0089: * -- gopher scheme for Gopher and Gopher+ Protocol services
0090: * http://www.math.uio.no/faq/compression-faq/part1.html
0091: * -- http scheme for Hypertext Transfer Protocol services
0092: * mailto:mduerst@ifi.unizh.ch
0093: * -- mailto scheme for electronic mail addresses
0094: * news:comp.infosystems.www.servers.unix
0095: * -- news scheme for USENET news groups and articles
0096: * telnet://melvyl.ucop.edu/
0097: * -- telnet scheme for interactive services via the TELNET Protocol
0098: * </pre>
0099: * Please, notice that there are many modifications from URL(RFC 1738) and
0100: * relative URL(RFC 1808).
0101: * <p>
0102: * <b>The expressions for a URI</b>
0103: * <p><pre>
0104: * For escaped URI forms
0105: * - URI(char[]) // constructor
0106: * - char[] getRawXxx() // method
0107: * - String getEscapedXxx() // method
0108: * - String toString() // method
0109: * <p>
0110: * For unescaped URI forms
0111: * - URI(String) // constructor
0112: * - String getXXX() // method
0113: * </pre><p>
0114: *
0115: * @author <a href="mailto:jericho@apache.org">Sung-Gu</a>
0116: * @author <a href="mailto:mbowler@GargoyleSoftware.com">Mike Bowler</a>
0117: * @version $Revision: 564973 $ $Date: 2002/03/14 15:14:01
0118: */
0119: public class URI implements Cloneable, Comparable, Serializable {
0120:
0121: // ----------------------------------------------------------- Constructors
0122:
/**
 * Creates an instance for internal use. Nothing is parsed; every
 * component keeps the value given by its field declaration.
 */
protected URI() {
}
0126:
/**
 * Builds a URI by parsing the given string and records the supplied
 * charset as the protocol charset of this instance. The string may be in
 * either escaped or unescaped form.
 *
 * @param s the URI character sequence
 * @param escaped <tt>true</tt> if <code>s</code> is in escaped form,
 *                <tt>false</tt> otherwise
 * @param charset the charset string to do escape encoding, if required
 *
 * @throws URIException if the URI cannot be created
 * @throws NullPointerException if the input string is <code>null</code>
 *
 * @see #getProtocolCharset
 *
 * @since 3.0
 */
public URI(String s, boolean escaped, String charset)
        throws URIException, NullPointerException {
    // Record the charset first; the string is parsed afterwards.
    this.protocolCharset = charset;
    this.parseUriReference(s, escaped);
}
0148:
/**
 * Builds a URI by parsing the given string. The string may be in either
 * escaped or unescaped form. No charset is supplied, so the protocol
 * charset field of this instance is not assigned here.
 *
 * @param s the URI character sequence
 * @param escaped <tt>true</tt> if <code>s</code> is in escaped form,
 *                <tt>false</tt> otherwise
 *
 * @throws URIException if the URI cannot be created
 * @throws NullPointerException if the input string is <code>null</code>
 *
 * @see #getProtocolCharset
 *
 * @since 3.0
 */
public URI(String s, boolean escaped)
        throws URIException, NullPointerException {
    this.parseUriReference(s, escaped);
}
0168:
0169: /**
0170: * Construct a URI as an escaped form of a character array with the given
0171: * charset.
0172: *
0173: * @param escaped the URI character sequence
0174: * @param charset the charset string to do escape encoding
0175: * @throws URIException If the URI cannot be created.
0176: * @throws NullPointerException if <code>escaped</code> is <code>null</code>
0177: * @see #getProtocolCharset
0178: *
0179: * @deprecated Use #URI(String, boolean, String)
0180: */
0181: public URI(char[] escaped, String charset) throws URIException,
0182: NullPointerException {
0183: protocolCharset = charset;
0184: parseUriReference(new String(escaped), true);
0185: }
0186:
0187: /**
0188: * Construct a URI as an escaped form of a character array.
0189: * An URI can be placed within double-quotes or angle brackets like
0190: * "http://test.com/" and <http://test.com/>
0191: *
0192: * @param escaped the URI character sequence
0193: * @throws URIException If the URI cannot be created.
0194: * @throws NullPointerException if <code>escaped</code> is <code>null</code>
0195: * @see #getDefaultProtocolCharset
0196: *
0197: * @deprecated Use #URI(String, boolean)
0198: */
0199: public URI(char[] escaped) throws URIException,
0200: NullPointerException {
0201: parseUriReference(new String(escaped), true);
0202: }
0203:
/**
 * Builds a URI from an unescaped string, recording the supplied charset
 * as the protocol charset.
 *
 * @param original the string to be represented as a URI character
 *                 sequence; it is one of absoluteURI and relativeURI
 * @param charset the charset string to do escape encoding
 * @throws URIException if the URI cannot be created
 * @see #getProtocolCharset
 *
 * @deprecated Use {@link #URI(String, boolean, String)}
 */
public URI(String original, String charset) throws URIException {
    this.protocolCharset = charset;
    // 'false': the input is treated as being in unescaped form.
    this.parseUriReference(original, false);
}
0219:
/**
 * Builds a URI from an unescaped string.
 * <p><blockquote><pre>
 *   URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
 * </pre></blockquote><p>
 * A URI can be placed within double-quotes or angle brackets like
 * "http://test.com/" and &lt;http://test.com/&gt;
 *
 * @param original the string to be represented as a URI character
 *                 sequence; it is one of absoluteURI and relativeURI
 * @throws URIException if the URI cannot be created
 * @see #getDefaultProtocolCharset
 *
 * @deprecated Use {@link #URI(String, boolean)}
 */
public URI(String original) throws URIException {
    // 'false': the input is treated as being in unescaped form.
    this.parseUriReference(original, false);
}
0238:
/**
 * Construct an opaque URI from the given components.
 * <p><blockquote><pre>
 *   URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
 *   absoluteURI   = scheme ":" ( hier_part | opaque_part )
 *   opaque_part   = uric_no_slash *uric
 * </pre></blockquote><p>
 * It's for absolute URI = &lt;scheme&gt;:&lt;scheme-specific-part&gt;#
 * &lt;fragment&gt;.
 * <p>
 * The scheme is lower-cased and validated; the scheme-specific part is
 * encoded with the protocol charset; the fragment is stored as given.
 *
 * @param scheme the scheme string, must not be <code>null</code>
 * @param schemeSpecificPart scheme_specific_part
 * @param fragment the fragment string, may be <code>null</code>
 * @throws URIException if the scheme is <code>null</code> or invalid, or
 *         the URI cannot otherwise be created
 * @see #getDefaultProtocolCharset
 */
public URI(String scheme, String schemeSpecificPart, String fragment)
        throws URIException {

    // validate and construct the URI character sequence
    if (scheme == null) {
        throw new URIException(URIException.PARSING, "scheme required");
    }
    // Lower-case with a fixed locale. With the default locale an ASCII
    // letter may map to a non-ASCII one (e.g. 'I' under Turkish casing
    // rules), which would make a legal scheme fail validation.
    char[] s = scheme.toLowerCase(Locale.ENGLISH).toCharArray();
    if (validate(s, URI.scheme)) {
        _scheme = s; // is_absoluteURI
    } else {
        throw new URIException(URIException.PARSING, "incorrect scheme");
    }
    _opaque = encode(schemeSpecificPart, allowed_opaque_part,
            getProtocolCharset());
    // Set flag
    _is_opaque_part = true;
    _fragment = fragment == null ? null : fragment.toCharArray();
    setURI();
}
0277:
0278: /**
0279: * Construct a general URI from the given components.
0280: * <p><blockquote><pre>
0281: * URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
0282: * absoluteURI = scheme ":" ( hier_part | opaque_part )
0283: * relativeURI = ( net_path | abs_path | rel_path ) [ "?" query ]
0284: * hier_part = ( net_path | abs_path ) [ "?" query ]
0285: * </pre></blockquote><p>
0286: * It's for absolute URI = <scheme>:<path>?<query>#<
0287: * fragment> and relative URI = <path>?<query>#<fragment
0288: * >.
0289: *
0290: * @param scheme the scheme string
0291: * @param authority the authority string
0292: * @param path the path string
0293: * @param query the query string
0294: * @param fragment the fragment string
0295: * @throws URIException If the new URI cannot be created.
0296: * @see #getDefaultProtocolCharset
0297: */
0298: public URI(String scheme, String authority, String path,
0299: String query, String fragment) throws URIException {
0300:
0301: // validate and contruct the URI character sequence
0302: StringBuffer buff = new StringBuffer();
0303: if (scheme != null) {
0304: buff.append(scheme);
0305: buff.append(':');
0306: }
0307: if (authority != null) {
0308: buff.append("//");
0309: buff.append(authority);
0310: }
0311: if (path != null) { // accept empty path
0312: if ((scheme != null || authority != null)
0313: && !path.startsWith("/")) {
0314: throw new URIException(URIException.PARSING,
0315: "abs_path requested");
0316: }
0317: buff.append(path);
0318: }
0319: if (query != null) {
0320: buff.append('?');
0321: buff.append(query);
0322: }
0323: if (fragment != null) {
0324: buff.append('#');
0325: buff.append(fragment);
0326: }
0327: parseUriReference(buff.toString(), false);
0328: }
0329:
/**
 * Construct a general URI from a scheme and the parts of a server
 * authority. Delegates to the seven-argument constructor with
 * <code>null</code> path, query and fragment.
 *
 * @param scheme the scheme string
 * @param userinfo the userinfo string
 * @param host the host string
 * @param port the port number
 * @throws URIException if the new URI cannot be created
 * @see #getDefaultProtocolCharset
 */
public URI(String scheme, String userinfo, String host, int port)
        throws URIException {
    this(scheme, userinfo, host, port, null, null, null);
}
0345:
/**
 * Construct a general URI from a scheme, the parts of a server authority
 * and a path. Delegates to the seven-argument constructor with
 * <code>null</code> query and fragment.
 *
 * @param scheme the scheme string
 * @param userinfo the userinfo string
 * @param host the host string
 * @param port the port number
 * @param path the path string
 * @throws URIException if the new URI cannot be created
 * @see #getDefaultProtocolCharset
 */
public URI(String scheme, String userinfo, String host, int port,
        String path) throws URIException {
    this(scheme, userinfo, host, port, path, null, null);
}
0362:
/**
 * Construct a general URI from a scheme, the parts of a server
 * authority, a path and a query. Delegates to the seven-argument
 * constructor with a <code>null</code> fragment.
 *
 * @param scheme the scheme string
 * @param userinfo the userinfo string
 * @param host the host string
 * @param port the port number
 * @param path the path string
 * @param query the query string
 * @throws URIException if the new URI cannot be created
 * @see #getDefaultProtocolCharset
 */
public URI(String scheme, String userinfo, String host, int port,
        String path, String query) throws URIException {
    this(scheme, userinfo, host, port, path, query, null);
}
0380:
/**
 * Construct a general URI from the given components.
 * <p>
 * The authority handed to the five-argument constructor is
 * <code>null</code> when <code>host</code> is <code>null</code>;
 * otherwise it is <code>[userinfo "@"] host [":" port]</code>, where the
 * userinfo part is omitted when <code>userinfo</code> is
 * <code>null</code> and the port part is omitted when <code>port</code>
 * is -1.
 *
 * @param scheme the scheme string
 * @param userinfo the userinfo string
 * @param host the host string
 * @param port the port number
 * @param path the path string
 * @param query the query string
 * @param fragment the fragment string
 * @throws URIException if the new URI cannot be created
 * @see #getDefaultProtocolCharset
 */
public URI(String scheme, String userinfo, String host, int port,
        String path, String query, String fragment)
        throws URIException {
    this(scheme,
            (host == null)
                    ? null
                    : ((userinfo != null) ? userinfo + '@' : "")
                            + host
                            + ((port != -1) ? ":" + port : ""),
            path,
            query,
            fragment);
}
0403:
/**
 * Construct a general URI from the given components. Delegates to the
 * five-argument constructor, passing <code>host</code> as the authority
 * and <code>null</code> as the query.
 *
 * @param scheme the scheme string
 * @param host the host string
 * @param path the path string
 * @param fragment the fragment string
 * @throws URIException if the new URI cannot be created
 * @see #getDefaultProtocolCharset
 */
public URI(String scheme, String host, String path, String fragment)
        throws URIException {
    this(scheme, host, path, null, fragment);
}
0419:
/**
 * Construct a general URI by resolving a relative URI string against a
 * base URI. The relative string is first turned into a URI with the
 * single-string constructor.
 *
 * @param base the base URI
 * @param relative the relative URI string
 * @throws URIException if the new URI cannot be created
 *
 * @deprecated Use {@link #URI(URI, String, boolean)}
 */
public URI(URI base, String relative) throws URIException {
    this(base, new URI(relative));
}
0432:
/**
 * Construct a general URI by resolving a relative URI string against a
 * base URI. The relative string is first turned into a URI using the
 * given <code>escaped</code> flag.
 *
 * @param base the base URI
 * @param relative the relative URI string
 * @param escaped <tt>true</tt> if the relative URI character sequence is
 *                in escaped form, <tt>false</tt> otherwise
 *
 * @throws URIException if the new URI cannot be created
 *
 * @since 3.0
 */
public URI(URI base, String relative, boolean escaped)
        throws URIException {
    this(base, new URI(relative, escaped));
}
0449:
0450: /**
0451: * Construct a general URI with the given relative URI.
0452: * <p><blockquote><pre>
0453: * URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
0454: * relativeURI = ( net_path | abs_path | rel_path ) [ "?" query ]
0455: * </pre></blockquote><p>
0456: * Resolving Relative References to Absolute Form.
0457: *
0458: * <strong>Examples of Resolving Relative URI References</strong>
0459: *
0460: * Within an object with a well-defined base URI of
0461: * <p><blockquote><pre>
0462: * http://a/b/c/d;p?q
0463: * </pre></blockquote><p>
0464: * the relative URI would be resolved as follows:
0465: *
0466: * Normal Examples
0467: *
0468: * <p><blockquote><pre>
0469: * g:h = g:h
0470: * g = http://a/b/c/g
0471: * ./g = http://a/b/c/g
0472: * g/ = http://a/b/c/g/
0473: * /g = http://a/g
0474: * //g = http://g
0475: * ?y = http://a/b/c/?y
0476: * g?y = http://a/b/c/g?y
0477: * #s = (current document)#s
0478: * g#s = http://a/b/c/g#s
0479: * g?y#s = http://a/b/c/g?y#s
0480: * ;x = http://a/b/c/;x
0481: * g;x = http://a/b/c/g;x
0482: * g;x?y#s = http://a/b/c/g;x?y#s
0483: * . = http://a/b/c/
0484: * ./ = http://a/b/c/
0485: * .. = http://a/b/
0486: * ../ = http://a/b/
0487: * ../g = http://a/b/g
0488: * ../.. = http://a/
0489: * ../../ = http://a/
0490: * ../../g = http://a/g
0491: * </pre></blockquote><p>
0492: *
0493: * Some URI schemes do not allow a hierarchical syntax matching the
0494: * <hier_part> syntax, and thus cannot use relative references.
0495: *
0496: * @param base the base URI
0497: * @param relative the relative URI
0498: * @throws URIException If the new URI cannot be created.
0499: */
0500: public URI(URI base, URI relative) throws URIException {
0501:
0502: if (base._scheme == null) {
0503: throw new URIException(URIException.PARSING,
0504: "base URI required");
0505: }
0506: if (base._scheme != null) {
0507: this ._scheme = base._scheme;
0508: this ._authority = base._authority;
0509: this ._is_net_path = base._is_net_path;
0510: }
0511: if (base._is_opaque_part || relative._is_opaque_part) {
0512: this ._scheme = base._scheme;
0513: this ._is_opaque_part = base._is_opaque_part
0514: || relative._is_opaque_part;
0515: this ._opaque = relative._opaque;
0516: this ._fragment = relative._fragment;
0517: this .setURI();
0518: return;
0519: }
0520: boolean schemesEqual = Arrays.equals(base._scheme,
0521: relative._scheme);
0522: if (relative._scheme != null
0523: && (!schemesEqual || relative._authority != null)) {
0524: this ._scheme = relative._scheme;
0525: this ._is_net_path = relative._is_net_path;
0526: this ._authority = relative._authority;
0527: if (relative._is_server) {
0528: this ._is_server = relative._is_server;
0529: this ._userinfo = relative._userinfo;
0530: this ._host = relative._host;
0531: this ._port = relative._port;
0532: } else if (relative._is_reg_name) {
0533: this ._is_reg_name = relative._is_reg_name;
0534: }
0535: this ._is_abs_path = relative._is_abs_path;
0536: this ._is_rel_path = relative._is_rel_path;
0537: this ._path = relative._path;
0538: } else if (base._authority != null && relative._scheme == null) {
0539: this ._is_net_path = base._is_net_path;
0540: this ._authority = base._authority;
0541: if (base._is_server) {
0542: this ._is_server = base._is_server;
0543: this ._userinfo = base._userinfo;
0544: this ._host = base._host;
0545: this ._port = base._port;
0546: } else if (base._is_reg_name) {
0547: this ._is_reg_name = base._is_reg_name;
0548: }
0549: }
0550: if (relative._authority != null) {
0551: this ._is_net_path = relative._is_net_path;
0552: this ._authority = relative._authority;
0553: if (relative._is_server) {
0554: this ._is_server = relative._is_server;
0555: this ._userinfo = relative._userinfo;
0556: this ._host = relative._host;
0557: this ._port = relative._port;
0558: } else if (relative._is_reg_name) {
0559: this ._is_reg_name = relative._is_reg_name;
0560: }
0561: this ._is_abs_path = relative._is_abs_path;
0562: this ._is_rel_path = relative._is_rel_path;
0563: this ._path = relative._path;
0564: }
0565: // resolve the path and query if necessary
0566: if (relative._authority == null
0567: && (relative._scheme == null || schemesEqual)) {
0568: if ((relative._path == null || relative._path.length == 0)
0569: && relative._query == null) {
0570: // handle a reference to the current document, see RFC 2396
0571: // section 5.2 step 2
0572: this ._path = base._path;
0573: this ._query = base._query;
0574: } else {
0575: this ._path = resolvePath(base._path, relative._path);
0576: }
0577: }
0578: // base._query removed
0579: if (relative._query != null) {
0580: this ._query = relative._query;
0581: }
0582: // base._fragment removed
0583: if (relative._fragment != null) {
0584: this ._fragment = relative._fragment;
0585: }
0586: this .setURI();
0587: // reparse the newly built URI, this will ensure that all flags are set correctly.
0588: // TODO there must be a better way to do this
0589: parseUriReference(new String(_uri), true);
0590: }
0591:
0592: // --------------------------------------------------- Instance Variables
0593:
0594: /** Version ID for serialization */
0595: static final long serialVersionUID = 604752400577948726L;
0596:
0597: /**
0598: * Cache the hash code for this URI.
0599: */
0600: protected int hash = 0;
0601:
0602: /**
0603: * This Uniform Resource Identifier (URI).
0604: * The URI is always in an "escaped" form, since escaping or unescaping
0605: * a completed URI might change its semantics.
0606: */
0607: protected char[] _uri = null;
0608:
0609: /**
0610: * The charset of the protocol used by this URI instance.
0611: */
0612: protected String protocolCharset = null;
0613:
0614: /**
0615: * The default charset of the protocol. RFC 2277, 2396
0616: */
0617: protected static String defaultProtocolCharset = "UTF-8";
0618:
0619: /**
0620: * The default charset of the document. RFC 2277, 2396
0621: * The platform's charset is used for the document by default.
0622: */
0623: protected static String defaultDocumentCharset = null;
0624: protected static String defaultDocumentCharsetByLocale = null;
0625: protected static String defaultDocumentCharsetByPlatform = null;
0626: // Static initializer for defaultDocumentCharset
0627: static {
0628: Locale locale = Locale.getDefault();
0629: // in order to support backward compatiblity
0630: if (locale != null) {
0631: defaultDocumentCharsetByLocale = LocaleToCharsetMap
0632: .getCharset(locale);
0633: // set the default document charset
0634: defaultDocumentCharset = defaultDocumentCharsetByLocale;
0635: }
0636: // in order to support platform encoding
0637: try {
0638: defaultDocumentCharsetByPlatform = System
0639: .getProperty("file.encoding");
0640: } catch (SecurityException ignore) {
0641: }
0642: if (defaultDocumentCharset == null) {
0643: // set the default document charset
0644: defaultDocumentCharset = defaultDocumentCharsetByPlatform;
0645: }
0646: }
0647:
0648: /**
0649: * The scheme.
0650: */
0651: protected char[] _scheme = null;
0652:
0653: /**
0654: * The opaque.
0655: */
0656: protected char[] _opaque = null;
0657:
0658: /**
0659: * The authority.
0660: */
0661: protected char[] _authority = null;
0662:
0663: /**
0664: * The userinfo.
0665: */
0666: protected char[] _userinfo = null;
0667:
0668: /**
0669: * The host.
0670: */
0671: protected char[] _host = null;
0672:
0673: /**
0674: * The port.
0675: */
0676: protected int _port = -1;
0677:
0678: /**
0679: * The path.
0680: */
0681: protected char[] _path = null;
0682:
0683: /**
0684: * The query.
0685: */
0686: protected char[] _query = null;
0687:
0688: /**
0689: * The fragment.
0690: */
0691: protected char[] _fragment = null;
0692:
0693: /**
0694: * The root path.
0695: */
0696: protected static final char[] rootPath = { '/' };
0697:
0698: // ---------------------- Generous characters for each component validation
0699:
/**
 * BitSet holding only the percent "%" character. It always has the
 * reserved purpose of being the escape indicator and must be escaped as
 * "%25" in order to be used as data within a URI.
 */
protected static final BitSet percent = new BitSet(256);
static {
    final char escapeIndicator = '%';
    percent.set(escapeIndicator);
}
0710:
/**
 * BitSet for digit.
 * <p><blockquote><pre>
 * digit    = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" |
 *            "8" | "9"
 * </pre></blockquote><p>
 */
protected static final BitSet digit = new BitSet(256);
static {
    for (char c = '0'; c <= '9'; c++) {
        digit.set(c);
    }
}
0725:
/**
 * BitSet for alpha: the ASCII letters of both cases.
 * <p><blockquote><pre>
 * alpha         = lowalpha | upalpha
 * </pre></blockquote><p>
 */
protected static final BitSet alpha = new BitSet(256);
static {
    // Set each lower-case letter together with its upper-case twin.
    for (char lower = 'a'; lower <= 'z'; lower++) {
        alpha.set(lower);
        alpha.set(lower - 'a' + 'A');
    }
}
0742:
/**
 * BitSet for alphanum, the union of alpha and digit.
 * <p><blockquote><pre>
 *  alphanum      = alpha | digit
 * </pre></blockquote><p>
 */
protected static final BitSet alphanum = new BitSet(256);
static {
    alphanum.or(digit);
    alphanum.or(alpha);
}
0755:
/**
 * BitSet for hex: the digits plus the letters a-f in both cases.
 * <p><blockquote><pre>
 * hex           = digit | "A" | "B" | "C" | "D" | "E" | "F" |
 *                         "a" | "b" | "c" | "d" | "e" | "f"
 * </pre></blockquote><p>
 */
protected static final BitSet hex = new BitSet(256);
static {
    hex.or(digit);
    final String hexLetters = "abcdefABCDEF";
    for (int i = 0; i < hexLetters.length(); i++) {
        hex.set(hexLetters.charAt(i));
    }
}
0774:
/**
 * BitSet for escaped: every character that can occur in an escape
 * triplet, i.e. the percent sign and the hex characters.
 * <p><blockquote><pre>
 * escaped       = "%" hex hex
 * </pre></blockquote><p>
 */
protected static final BitSet escaped = new BitSet(256);
static {
    escaped.or(hex);
    escaped.or(percent);
}
0787:
/**
 * BitSet for mark.
 * <p><blockquote><pre>
 * mark          = "-" | "_" | "." | "!" | "~" | "*" | "'" |
 *                 "(" | ")"
 * </pre></blockquote><p>
 */
protected static final BitSet mark = new BitSet(256);
static {
    final String markChars = "-_.!~*'()";
    for (int i = 0; i < markChars.length(); i++) {
        mark.set(markChars.charAt(i));
    }
}
0808:
/**
 * BitSet for unreserved: data characters that are allowed in a URI but
 * do not have a reserved purpose.
 * <p><blockquote><pre>
 * unreserved    = alphanum | mark
 * </pre></blockquote><p>
 */
protected static final BitSet unreserved = new BitSet(256);
static {
    unreserved.or(mark);
    unreserved.or(alphanum);
}
0822:
/**
 * BitSet for reserved.
 * <p><blockquote><pre>
 * reserved      = ";" | "/" | "?" | ":" | "@" | "&amp;" | "=" | "+" |
 *                 "$" | ","
 * </pre></blockquote><p>
 */
protected static final BitSet reserved = new BitSet(256);
static {
    final String reservedChars = ";/?:@&=+$,";
    for (int i = 0; i < reservedChars.length(); i++) {
        reserved.set(reservedChars.charAt(i));
    }
}
0844:
/**
 * BitSet for uric, the union of the reserved, unreserved and escaped
 * characters.
 * <p><blockquote><pre>
 * uric          = reserved | unreserved | escaped
 * </pre></blockquote><p>
 */
protected static final BitSet uric = new BitSet(256);
static {
    uric.or(escaped);
    uric.or(unreserved);
    uric.or(reserved);
}
0858:
0859: /**
0860: * BitSet for fragment (alias for uric).
0861: * <p><blockquote><pre>
0862: * fragment = *uric
0863: * </pre></blockquote><p>
0864: */
0865: protected static final BitSet fragment = uric;
0866:
0867: /**
0868: * BitSet for query (alias for uric).
0869: * <p><blockquote><pre>
0870: * query = *uric
0871: * </pre></blockquote><p>
0872: */
0873: protected static final BitSet query = uric;
0874:
/**
 * BitSet for pchar.
 * <p><blockquote><pre>
 * pchar         = unreserved | escaped |
 *                 ":" | "@" | "&amp;" | "=" | "+" | "$" | ","
 * </pre></blockquote><p>
 */
protected static final BitSet pchar = new BitSet(256);
static {
    pchar.or(unreserved);
    pchar.or(escaped);
    final String extraChars = ":@&=+$,";
    for (int i = 0; i < extraChars.length(); i++) {
        pchar.set(extraChars.charAt(i));
    }
}
0895:
0896: /**
0897: * BitSet for param (alias for pchar).
0898: * <p><blockquote><pre>
0899: * param = *pchar
0900: * </pre></blockquote><p>
0901: */
0902: protected static final BitSet param = pchar;
0903:
/**
 * BitSet for segment: the pchar characters, the ";" separator and the
 * param characters.
 * <p><blockquote><pre>
 * segment       = *pchar *( ";" param )
 * </pre></blockquote><p>
 */
protected static final BitSet segment = new BitSet(256);
static {
    segment.set(';');
    segment.or(pchar);
    // param is the same set as pchar; it is listed to mirror the grammar
    segment.or(param);
}
0917:
/**
 * BitSet for path segments: the segment characters plus the "/"
 * separator.
 * <p><blockquote><pre>
 * path_segments = segment *( "/" segment )
 * </pre></blockquote><p>
 */
protected static final BitSet path_segments = new BitSet(256);
static {
    path_segments.or(segment);
    path_segments.set('/');
}
0930:
/**
 * BitSet for the URI absolute path.
 * <p><blockquote><pre>
 * abs_path      = "/" path_segments
 * </pre></blockquote><p>
 */
protected static final BitSet abs_path = new BitSet(256);
static {
    abs_path.or(path_segments);
    // the leading "/" of the grammar, set explicitly
    abs_path.set('/');
}
0943:
/**
 * URI bitset for encoding typical non-slash characters.
 * <p><blockquote><pre>
 * uric_no_slash = unreserved | escaped | ";" | "?" | ":" | "@" |
 *                 "&amp;" | "=" | "+" | "$" | ","
 * </pre></blockquote><p>
 */
protected static final BitSet uric_no_slash = new BitSet(256);
// Static initializer for uric_no_slash
static {
    uric_no_slash.or(unreserved);
    uric_no_slash.or(escaped);
    uric_no_slash.set(';');
    uric_no_slash.set('?');
    // The grammar lists ":" here; the original set ";" a second time
    // and left ":" out of the set.
    uric_no_slash.set(':');
    uric_no_slash.set('@');
    uric_no_slash.set('&');
    uric_no_slash.set('=');
    uric_no_slash.set('+');
    uric_no_slash.set('$');
    uric_no_slash.set(',');
}
0966:
/**
 * URI bitset for the opaque part, the union of uric_no_slash and uric.
 * <p><blockquote><pre>
 * opaque_part   = uric_no_slash *uric
 * </pre></blockquote><p>
 */
protected static final BitSet opaque_part = new BitSet(256);
static {
    // This set is generous: the grammar forbids a slash only as the
    // first character, which a character set cannot express.
    opaque_part.or(uric);
    opaque_part.or(uric_no_slash);
}
0980:
/**
 * Characters that may occur in a path, whether absolute or opaque.
 * <p><blockquote><pre>
 * path = [ abs_path | opaque_part ]
 * </pre></blockquote><p>
 */
protected static final BitSet path = new BitSet(256);
// path is the union of the abs_path and opaque_part sets.
static {
    path.or(opaque_part);
    path.or(abs_path);
}
0993:
0994: /**
0995: * Port, a logical alias for digit.
0996: */
0997: protected static final BitSet port = digit;
0998:
/**
 * Characters that may occur in an IPv4address: digits and the dot.
 * <p><blockquote><pre>
 * IPv4address = 1*digit "." 1*digit "." 1*digit "." 1*digit
 * </pre></blockquote><p>
 */
protected static final BitSet IPv4address = new BitSet(256);
// IPv4address is the "." separator plus all digits.
static {
    IPv4address.set('.');
    IPv4address.or(digit);
}
1011:
/**
 * Characters that may occur in an IPv6address (RFC 2373).
 * <p><blockquote><pre>
 * IPv6address = hexpart [ ":" IPv4address ]
 * </pre></blockquote><p>
 */
protected static final BitSet IPv6address = new BitSet(256);
// IPv6address is the ":" separator, the IPv4address characters and the
// hex digits that make up hexpart.
static {
    IPv6address.set(':');
    IPv6address.or(IPv4address);
    IPv6address.or(hex);
}
1025:
/**
 * Characters that may occur in an IPv6reference (RFC 2732, RFC 2373).
 * <p><blockquote><pre>
 * IPv6reference = "[" IPv6address "]"
 * </pre></blockquote><p>
 */
protected static final BitSet IPv6reference = new BitSet(256);
// IPv6reference is the enclosing brackets plus the IPv6address characters.
static {
    IPv6reference.set('[');
    IPv6reference.set(']');
    IPv6reference.or(IPv6address);
}
1039:
/**
 * Characters that may occur in a toplabel.
 * <p><blockquote><pre>
 * toplabel = alpha | alpha *( alphanum | "-" ) alphanum
 * </pre></blockquote><p>
 */
protected static final BitSet toplabel = new BitSet(256);
// toplabel is the hyphen plus all alphanumerics; the positional rules of
// the grammar are not expressed by this set.
static {
    toplabel.set('-');
    toplabel.or(alphanum);
}
1052:
1053: /**
1054: * BitSet for domainlabel.
1055: * <p><blockquote><pre>
1056: * domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum
1057: * </pre></blockquote><p>
1058: */
1059: protected static final BitSet domainlabel = toplabel;
1060:
/**
 * Characters that may occur in a hostname.
 * <p><blockquote><pre>
 * hostname = *( domainlabel "." ) toplabel [ "." ]
 * </pre></blockquote><p>
 */
protected static final BitSet hostname = new BitSet(256);
// hostname is the "." separator plus the toplabel characters. domainlabel
// needs no separate merge because it is the same BitSet object as toplabel.
static {
    hostname.set('.');
    hostname.or(toplabel);
}
1074:
/**
 * Characters that may occur in a host.
 * <p><blockquote><pre>
 * host = hostname | IPv4address | IPv6reference
 * </pre></blockquote><p>
 */
protected static final BitSet host = new BitSet(256);
// host is the union of hostname and IPv6reference. IPv4address is not
// merged explicitly: IPv6reference is built from IPv6address, which
// already contains every IPv4address character.
static {
    host.or(IPv6reference);
    host.or(hostname);
}
1088:
/**
 * Characters that may occur in a hostport.
 * <p><blockquote><pre>
 * hostport = host [ ":" port ]
 * </pre></blockquote><p>
 */
protected static final BitSet hostport = new BitSet(256);
// hostport is the ":" separator plus the port and host characters.
static {
    hostport.set(':');
    hostport.or(port);
    hostport.or(host);
}
1102:
/**
 * Characters that may occur in the userinfo component.
 * <p><blockquote><pre>
 * userinfo = *( unreserved | escaped |
 *               ";" | ":" | "&" | "=" | "+" | "$" | "," )
 * </pre></blockquote><p>
 */
protected static final BitSet userinfo = new BitSet(256);
// userinfo is unreserved, escaped and the punctuation listed in the grammar.
static {
    final String marks = ";:&=+$,";
    for (int i = 0; i < marks.length(); i++) {
        userinfo.set(marks.charAt(i));
    }
    userinfo.or(escaped);
    userinfo.or(unreserved);
}
1123:
/**
 * Characters usable inside a single piece of the userinfo component, such
 * as the user name or the password.
 */
public static final BitSet within_userinfo = new BitSet(256);
// Start from userinfo, then remove the characters that are reserved
// within the authority component.
static {
    within_userinfo.or(userinfo);
    final String reservedInAuthority = ";:@?/";
    for (int i = 0; i < reservedInAuthority.length(); i++) {
        within_userinfo.clear(reservedInAuthority.charAt(i));
    }
}
1137:
/**
 * Characters that may occur in the server part.
 * <p><blockquote><pre>
 * server = [ [ userinfo "@" ] hostport ]
 * </pre></blockquote><p>
 */
protected static final BitSet server = new BitSet(256);
// server is the "@" separator plus the hostport and userinfo characters.
static {
    server.set('@');
    server.or(hostport);
    server.or(userinfo);
}
1151:
/**
 * Characters that may occur in a registry-based naming authority.
 * <p><blockquote><pre>
 * reg_name = 1*( unreserved | escaped | "$" | "," |
 *                ";" | ":" | "@" | "&" | "=" | "+" )
 * </pre></blockquote><p>
 */
protected static final BitSet reg_name = new BitSet(256);
// reg_name is unreserved, escaped and the punctuation listed in the grammar.
static {
    final String marks = "$,;:@&=+";
    for (int i = 0; i < marks.length(); i++) {
        reg_name.set(marks.charAt(i));
    }
    reg_name.or(escaped);
    reg_name.or(unreserved);
}
1173:
/**
 * Characters that may occur in the authority component.
 * <p><blockquote><pre>
 * authority = server | reg_name
 * </pre></blockquote><p>
 */
protected static final BitSet authority = new BitSet(256);
// authority is the union of the reg_name and server sets.
static {
    authority.or(reg_name);
    authority.or(server);
}
1186:
/**
 * Characters that may occur in a scheme name.
 * <p><blockquote><pre>
 * scheme = alpha *( alpha | digit | "+" | "-" | "." )
 * </pre></blockquote><p>
 */
protected static final BitSet scheme = new BitSet(256);
// scheme is letters, digits and the three marks "+", "-" and ".". The rule
// that the first character must be a letter is not expressed by this set.
static {
    final String marks = "+-.";
    for (int i = 0; i < marks.length(); i++) {
        scheme.set(marks.charAt(i));
    }
    scheme.or(digit);
    scheme.or(alpha);
}
1202:
/**
 * Characters that may occur in the first segment of a relative path.
 * <p><blockquote><pre>
 * rel_segment = 1*( unreserved | escaped |
 *                   ";" | "@" | "&" | "=" | "+" | "$" | "," )
 * </pre></blockquote><p>
 */
protected static final BitSet rel_segment = new BitSet(256);
// rel_segment is unreserved, escaped and the punctuation listed in the
// grammar.
static {
    final String marks = ";@&=+$,";
    for (int i = 0; i < marks.length(); i++) {
        rel_segment.set(marks.charAt(i));
    }
    rel_segment.or(escaped);
    rel_segment.or(unreserved);
}
1223:
/**
 * Characters that may occur in a relative path.
 * <p><blockquote><pre>
 * rel_path = rel_segment [ abs_path ]
 * </pre></blockquote><p>
 */
protected static final BitSet rel_path = new BitSet(256);
// rel_path is the union of the abs_path and rel_segment sets.
static {
    rel_path.or(abs_path);
    rel_path.or(rel_segment);
}
1236:
/**
 * Characters that may occur in a network path.
 * <p><blockquote><pre>
 * net_path = "//" authority [ abs_path ]
 * </pre></blockquote><p>
 */
protected static final BitSet net_path = new BitSet(256);
// net_path is the "/" of the leading "//" plus the abs_path and authority
// characters.
static {
    net_path.or(abs_path);
    net_path.or(authority);
    net_path.set('/');
}
1250:
/**
 * Characters that may occur in a hierarchical part.
 * <p><blockquote><pre>
 * hier_part = ( net_path | abs_path ) [ "?" query ]
 * </pre></blockquote><p>
 */
protected static final BitSet hier_part = new BitSet(256);
// hier_part is the union of the query, abs_path and net_path sets. "?" is
// not set explicitly; the original author notes it is already included
// (presumably through the query set, which is defined elsewhere).
static {
    hier_part.or(query);
    hier_part.or(abs_path);
    hier_part.or(net_path);
}
1265:
/**
 * Characters that may occur in a relative URI.
 * <p><blockquote><pre>
 * relativeURI = ( net_path | abs_path | rel_path ) [ "?" query ]
 * </pre></blockquote><p>
 */
protected static final BitSet relativeURI = new BitSet(256);
// relativeURI is the union of the query, rel_path, abs_path and net_path
// sets. "?" is not set explicitly; the original author notes it is already
// included (presumably through the query set, which is defined elsewhere).
static {
    relativeURI.or(query);
    relativeURI.or(rel_path);
    relativeURI.or(abs_path);
    relativeURI.or(net_path);
}
1281:
/**
 * Characters that may occur in an absolute URI.
 * <p><blockquote><pre>
 * absoluteURI = scheme ":" ( hier_part | opaque_part )
 * </pre></blockquote><p>
 */
protected static final BitSet absoluteURI = new BitSet(256);
// absoluteURI is the ":" separator plus the scheme, hier_part and
// opaque_part characters.
static {
    absoluteURI.set(':');
    absoluteURI.or(opaque_part);
    absoluteURI.or(hier_part);
    absoluteURI.or(scheme);
}
1296:
/**
 * Characters that may occur anywhere in a URI-reference.
 * <p><blockquote><pre>
 * URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
 * </pre></blockquote><p>
 */
protected static final BitSet URI_reference = new BitSet(256);
// URI_reference is the "#" separator plus the fragment, relativeURI and
// absoluteURI characters.
static {
    URI_reference.set('#');
    URI_reference.or(fragment);
    URI_reference.or(relativeURI);
    URI_reference.or(absoluteURI);
}
1311:
1312: // ---------------------------- Characters disallowed within the URI syntax
1313: // Excluded US-ASCII Characters are like control, space, delims and unwise
1314:
/**
 * US-ASCII control characters: codes 0x00 through 0x1F, plus 0x7F.
 */
public static final BitSet control = new BitSet(256);
// Fill in the contiguous range first, then the single 0x7F code.
static {
    int code = 0x00;
    while (code < 0x20) {
        control.set(code);
        code++;
    }
    control.set(0x7F);
}
1326:
/**
 * The US-ASCII space character (code 0x20) as a one-element set.
 */
public static final BitSet space = new BitSet(256);
// Only the blank itself belongs to this set.
static {
    space.set(' ');
}
1335:
/**
 * Delimiter characters: the angle brackets, the hash, the percent sign and
 * the double quote.
 */
public static final BitSet delims = new BitSet(256);
// Register each delimiter character.
static {
    final String marks = "<>#%\"";
    for (int i = 0; i < marks.length(); i++) {
        delims.set(marks.charAt(i));
    }
}
1348:
/**
 * The "unwise" characters: braces, vertical bar, backslash, caret, square
 * brackets and the grave accent.
 */
public static final BitSet unwise = new BitSet(256);
// Register each unwise character.
static {
    final String marks = "{}|\\^[]`";
    for (int i = 0; i < marks.length(); i++) {
        unwise.set(marks.charAt(i));
    }
}
1364:
/**
 * Characters that rule out a rel_path when they appear in a not yet
 * escaped string: every uric character that is not also in rel_path.
 */
public static final BitSet disallowed_rel_path = new BitSet(256);
// Set difference: uric minus rel_path.
static {
    disallowed_rel_path.or(uric);
    disallowed_rel_path.andNot(rel_path);
}
1374:
/**
 * Characters that rule out an opaque_part when they appear in a not yet
 * escaped string: every uric character that is not also in opaque_part.
 */
public static final BitSet disallowed_opaque_part = new BitSet(256);
// Set difference: uric minus opaque_part.
static {
    disallowed_opaque_part.or(uric);
    disallowed_opaque_part.andNot(opaque_part);
}
1384:
1385: // ----------------------- Characters allowed within and for each component
1386:
/**
 * Characters allowed for the authority component: the authority set
 * without the percent sign.
 */
public static final BitSet allowed_authority = new BitSet(256);
// Copy authority, then drop '%'.
static {
    allowed_authority.or(authority);
    allowed_authority.clear('%');
}
1396:
/**
 * Characters allowed for the opaque_part: the opaque_part set without the
 * percent sign.
 */
public static final BitSet allowed_opaque_part = new BitSet(256);
// Copy opaque_part, then drop '%'.
static {
    allowed_opaque_part.or(opaque_part);
    allowed_opaque_part.clear('%');
}
1406:
/**
 * Characters allowed for the reg_name: the reg_name set without the
 * percent sign.
 */
public static final BitSet allowed_reg_name = new BitSet(256);
// Copy reg_name, then drop '%'.
static {
    allowed_reg_name.or(reg_name);
    allowed_reg_name.clear('%');
}
1417:
/**
 * Characters allowed for the userinfo component: the userinfo set without
 * the percent sign.
 */
public static final BitSet allowed_userinfo = new BitSet(256);
// Copy userinfo, then drop '%'.
static {
    allowed_userinfo.or(userinfo);
    allowed_userinfo.clear('%');
}
1428:
/**
 * Characters allowed within the userinfo component: the within_userinfo
 * set without the percent sign.
 */
public static final BitSet allowed_within_userinfo = new BitSet(256);
// Copy within_userinfo, then drop '%'.
static {
    allowed_within_userinfo.or(within_userinfo);
    allowed_within_userinfo.clear('%');
}
1438:
/**
 * Characters allowed for the IPv6reference component: the IPv6reference
 * set with the enclosing '[' and ']' removed.
 */
public static final BitSet allowed_IPv6reference = new BitSet(256);
// Copy IPv6reference, then drop the two brackets.
static {
    allowed_IPv6reference.or(IPv6reference);
    allowed_IPv6reference.clear(']');
    allowed_IPv6reference.clear('[');
}
1451:
/**
 * Characters allowed for the host component: hostname characters plus the
 * bracket-free IPv6reference characters.
 */
public static final BitSet allowed_host = new BitSet(256);
// Union of allowed_IPv6reference and hostname.
static {
    allowed_host.or(allowed_IPv6reference);
    allowed_host.or(hostname);
}
1462:
/**
 * Characters allowed within the authority component: the union of server
 * and reg_name with ';', ':', '@', '?' and '/' removed.
 */
public static final BitSet allowed_within_authority = new BitSet(256);
// Build the union first, then remove the excluded characters.
static {
    allowed_within_authority.or(reg_name);
    allowed_within_authority.or(server);
    final String excluded = ";:@?/";
    for (int i = 0; i < excluded.length(); i++) {
        allowed_within_authority.clear(excluded.charAt(i));
    }
}
1478:
/**
 * Characters allowed for the abs_path: the abs_path set without '+' and
 * without the members of the <code>percent</code> set.
 */
public static final BitSet allowed_abs_path = new BitSet(256);
// Copy abs_path (which already contains '/'), then remove '+' and the
// percent set.
static {
    allowed_abs_path.or(abs_path);
    allowed_abs_path.clear('+');
    allowed_abs_path.andNot(percent);
}
1490:
/**
 * Characters allowed for the rel_path: the rel_path set without '%' and
 * '+'.
 */
public static final BitSet allowed_rel_path = new BitSet(256);
// Copy rel_path, then drop '+' and '%'.
static {
    allowed_rel_path.or(rel_path);
    allowed_rel_path.clear('+');
    allowed_rel_path.clear('%');
}
1501:
/**
 * Characters allowed within the path: the abs_path set with '/', ';', '='
 * and '?' removed.
 */
public static final BitSet allowed_within_path = new BitSet(256);
// Copy abs_path, then remove the excluded characters.
static {
    allowed_within_path.or(abs_path);
    final String excluded = "/;=?";
    for (int i = 0; i < excluded.length(); i++) {
        allowed_within_path.clear(excluded.charAt(i));
    }
}
1514:
/**
 * Characters allowed for the query component: the uric set without the
 * percent sign.
 */
public static final BitSet allowed_query = new BitSet(256);
// Copy uric, then drop '%'.
static {
    allowed_query.or(uric);
    allowed_query.clear('%');
}
1524:
/**
 * Characters allowed within the query component: allowed_query with every
 * member of the <code>reserved</code> set removed.
 */
public static final BitSet allowed_within_query = new BitSet(256);
// Set difference: allowed_query minus reserved.
static {
    allowed_within_query.or(allowed_query);
    allowed_within_query.andNot(reserved);
}
1534:
/**
 * Characters allowed for the fragment component: the uric set without the
 * percent sign.
 */
public static final BitSet allowed_fragment = new BitSet(256);
// Copy uric, then drop '%'.
static {
    allowed_fragment.or(uric);
    allowed_fragment.clear('%');
}
1544:
1545: // ------------------------------------------- Flags for this URI-reference
1546:
1547: // TODO: Figure out what all these variables are for and provide javadoc
1548:
1549: // URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
1550: // absoluteURI = scheme ":" ( hier_part | opaque_part )
1551: protected boolean _is_hier_part;
1552: protected boolean _is_opaque_part;
1553: // relativeURI = ( net_path | abs_path | rel_path ) [ "?" query ]
1554: // hier_part = ( net_path | abs_path ) [ "?" query ]
1555: protected boolean _is_net_path;
1556: protected boolean _is_abs_path;
1557: protected boolean _is_rel_path;
1558: // net_path = "//" authority [ abs_path ]
1559: // authority = server | reg_name
1560: protected boolean _is_reg_name;
1561: protected boolean _is_server; // = _has_server
1562: // server = [ [ userinfo "@" ] hostport ]
1563: // host = hostname | IPv4address | IPv6reference
1564: protected boolean _is_hostname;
1565: protected boolean _is_IPv4address;
1566: protected boolean _is_IPv6reference;
1567:
1568: // ------------------------------------------ Character and escape encoding
1569:
1570: /**
1571: * Encodes URI string.
1572: *
1573: * This is a two mapping, one from original characters to octets, and
1574: * subsequently a second from octets to URI characters:
1575: * <p><blockquote><pre>
1576: * original character sequence->octet sequence->URI character sequence
1577: * </pre></blockquote><p>
1578: *
1579: * An escaped octet is encoded as a character triplet, consisting of the
1580: * percent character "%" followed by the two hexadecimal digits
1581: * representing the octet code. For example, "%20" is the escaped
1582: * encoding for the US-ASCII space character.
1583: * <p>
1584: * Conversion from the local filesystem character set to UTF-8 will
1585: * normally involve a two step process. First convert the local character
1586: * set to the UCS; then convert the UCS to UTF-8.
1587: * The first step in the process can be performed by maintaining a mapping
1588: * table that includes the local character set code and the corresponding
1589: * UCS code.
1590: * The next step is to convert the UCS character code to the UTF-8 encoding.
1591: * <p>
1592: * Mapping between vendor codepages can be done in a very similar manner
1593: * as described above.
1594: * <p>
1595: * The only time escape encodings can allowedly be made is when a URI is
1596: * being created from its component parts. The escape and validate methods
1597: * are internally performed within this method.
1598: *
1599: * @param original the original character sequence
1600: * @param allowed those characters that are allowed within a component
1601: * @param charset the protocol charset
1602: * @return URI character sequence
1603: * @throws URIException null component or unsupported character encoding
1604: */
1605:
1606: protected static char[] encode(String original, BitSet allowed,
1607: String charset) throws URIException {
1608: if (original == null) {
1609: throw new IllegalArgumentException(
1610: "Original string may not be null");
1611: }
1612: if (allowed == null) {
1613: throw new IllegalArgumentException(
1614: "Allowed bitset may not be null");
1615: }
1616: byte[] rawdata = URLCodec.encodeUrl(allowed, EncodingUtil
1617: .getBytes(original, charset));
1618: return EncodingUtil.getAsciiString(rawdata).toCharArray();
1619: }
1620:
1621: /**
1622: * Decodes URI encoded string.
1623: *
1624: * This is a two mapping, one from URI characters to octets, and
1625: * subsequently a second from octets to original characters:
1626: * <p><blockquote><pre>
1627: * URI character sequence->octet sequence->original character sequence
1628: * </pre></blockquote><p>
1629: *
1630: * A URI must be separated into its components before the escaped
1631: * characters within those components can be allowedly decoded.
1632: * <p>
1633: * Notice that there is a chance that URI characters that are non UTF-8
1634: * may be parsed as valid UTF-8. A recent non-scientific analysis found
1635: * that EUC encoded Japanese words had a 2.7% false reading; SJIS had a
1636: * 0.0005% false reading; other encoding such as ASCII or KOI-8 have a 0%
1637: * false reading.
1638: * <p>
1639: * The percent "%" character always has the reserved purpose of being
1640: * the escape indicator, it must be escaped as "%25" in order to be used
1641: * as data within a URI.
1642: * <p>
1643: * The unescape method is internally performed within this method.
1644: *
1645: * @param component the URI character sequence
1646: * @param charset the protocol charset
1647: * @return original character sequence
1648: * @throws URIException incomplete trailing escape pattern or unsupported
1649: * character encoding
1650: */
1651: protected static String decode(char[] component, String charset)
1652: throws URIException {
1653: if (component == null) {
1654: throw new IllegalArgumentException(
1655: "Component array of chars may not be null");
1656: }
1657: return decode(new String(component), charset);
1658: }
1659:
1660: /**
1661: * Decodes URI encoded string.
1662: *
1663: * This is a two mapping, one from URI characters to octets, and
1664: * subsequently a second from octets to original characters:
1665: * <p><blockquote><pre>
1666: * URI character sequence->octet sequence->original character sequence
1667: * </pre></blockquote><p>
1668: *
1669: * A URI must be separated into its components before the escaped
1670: * characters within those components can be allowedly decoded.
1671: * <p>
1672: * Notice that there is a chance that URI characters that are non UTF-8
1673: * may be parsed as valid UTF-8. A recent non-scientific analysis found
1674: * that EUC encoded Japanese words had a 2.7% false reading; SJIS had a
1675: * 0.0005% false reading; other encoding such as ASCII or KOI-8 have a 0%
1676: * false reading.
1677: * <p>
1678: * The percent "%" character always has the reserved purpose of being
1679: * the escape indicator, it must be escaped as "%25" in order to be used
1680: * as data within a URI.
1681: * <p>
1682: * The unescape method is internally performed within this method.
1683: *
1684: * @param component the URI character sequence
1685: * @param charset the protocol charset
1686: * @return original character sequence
1687: * @throws URIException incomplete trailing escape pattern or unsupported
1688: * character encoding
1689: *
1690: * @since 3.0
1691: */
1692: protected static String decode(String component, String charset)
1693: throws URIException {
1694: if (component == null) {
1695: throw new IllegalArgumentException(
1696: "Component array of chars may not be null");
1697: }
1698: byte[] rawdata = null;
1699: try {
1700: rawdata = URLCodec.decodeUrl(EncodingUtil
1701: .getAsciiBytes(component));
1702: } catch (DecoderException e) {
1703: throw new URIException(e.getMessage());
1704: }
1705: return EncodingUtil.getString(rawdata, charset);
1706: }
1707:
1708: /**
1709: * Pre-validate the unescaped URI string within a specific component.
1710: *
1711: * @param component the component string within the component
1712: * @param disallowed those characters disallowed within the component
1713: * @return if true, it doesn't have the disallowed characters
1714: * if false, the component is undefined or an incorrect one
1715: */
1716: protected boolean prevalidate(String component, BitSet disallowed) {
1717: // prevalidate the given component by disallowed characters
1718: if (component == null) {
1719: return false; // undefined
1720: }
1721: char[] target = component.toCharArray();
1722: for (int i = 0; i < target.length; i++) {
1723: if (disallowed.get(target[i])) {
1724: return false;
1725: }
1726: }
1727: return true;
1728: }
1729:
1730: /**
1731: * Validate the URI characters within a specific component.
1732: * The component must be performed after escape encoding. Or it doesn't
1733: * include escaped characters.
1734: *
1735: * @param component the characters sequence within the component
1736: * @param generous those characters that are allowed within a component
1737: * @return if true, it's the correct URI character sequence
1738: */
1739: protected boolean validate(char[] component, BitSet generous) {
1740: // validate each component by generous characters
1741: return validate(component, 0, -1, generous);
1742: }
1743:
1744: /**
1745: * Validate the URI characters within a specific component.
1746: * The component must be performed after escape encoding. Or it doesn't
1747: * include escaped characters.
1748: * <p>
1749: * It's not that much strict, generous. The strict validation might be
1750: * performed before being called this method.
1751: *
1752: * @param component the characters sequence within the component
1753: * @param soffset the starting offset of the given component
1754: * @param eoffset the ending offset of the given component
1755: * if -1, it means the length of the component
1756: * @param generous those characters that are allowed within a component
1757: * @return if true, it's the correct URI character sequence
1758: */
1759: protected boolean validate(char[] component, int soffset,
1760: int eoffset, BitSet generous) {
1761: // validate each component by generous characters
1762: if (eoffset == -1) {
1763: eoffset = component.length - 1;
1764: }
1765: for (int i = soffset; i <= eoffset; i++) {
1766: if (!generous.get(component[i])) {
1767: return false;
1768: }
1769: }
1770: return true;
1771: }
1772:
1773: /**
1774: * In order to avoid any possilbity of conflict with non-ASCII characters,
1775: * Parse a URI reference as a <code>String</code> with the character
1776: * encoding of the local system or the document.
1777: * <p>
1778: * The following line is the regular expression for breaking-down a URI
1779: * reference into its components.
1780: * <p><blockquote><pre>
1781: * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
1782: * 12 3 4 5 6 7 8 9
1783: * </pre></blockquote><p>
1784: * For example, matching the above expression to
1785: * http://jakarta.apache.org/ietf/uri/#Related
1786: * results in the following subexpression matches:
1787: * <p><blockquote><pre>
1788: * $1 = http:
1789: * scheme = $2 = http
1790: * $3 = //jakarta.apache.org
1791: * authority = $4 = jakarta.apache.org
1792: * path = $5 = /ietf/uri/
1793: * $6 = <undefined>
1794: * query = $7 = <undefined>
1795: * $8 = #Related
1796: * fragment = $9 = Related
1797: * </pre></blockquote><p>
1798: *
1799: * @param original the original character sequence
1800: * @param escaped <code>true</code> if <code>original</code> is escaped
1801: * @throws URIException If an error occurs.
1802: */
1803: protected void parseUriReference(String original, boolean escaped)
1804: throws URIException {
1805:
1806: // validate and contruct the URI character sequence
1807: if (original == null) {
1808: throw new URIException("URI-Reference required");
1809: }
1810:
1811: /* @
1812: * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
1813: */
1814: String tmp = original.trim();
1815:
1816: /*
1817: * The length of the string sequence of characters.
1818: * It may not be equal to the length of the byte array.
1819: */
1820: int length = tmp.length();
1821:
1822: /*
1823: * Remove the delimiters like angle brackets around an URI.
1824: */
1825: if (length > 0) {
1826: char[] firstDelimiter = { tmp.charAt(0) };
1827: if (validate(firstDelimiter, delims)) {
1828: if (length >= 2) {
1829: char[] lastDelimiter = { tmp.charAt(length - 1) };
1830: if (validate(lastDelimiter, delims)) {
1831: tmp = tmp.substring(1, length - 1);
1832: length = length - 2;
1833: }
1834: }
1835: }
1836: }
1837:
1838: /*
1839: * The starting index
1840: */
1841: int from = 0;
1842:
1843: /*
1844: * The test flag whether the URI is started from the path component.
1845: */
1846: boolean isStartedFromPath = false;
1847: int atColon = tmp.indexOf(':');
1848: int atSlash = tmp.indexOf('/');
1849: if ((atColon <= 0 && !tmp.startsWith("//"))
1850: || (atSlash >= 0 && atSlash < atColon)) {
1851: isStartedFromPath = true;
1852: }
1853:
1854: /*
1855: * <p><blockquote><pre>
1856: * @@@@@@@@
1857: * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
1858: * </pre></blockquote><p>
1859: */
1860: int at = indexFirstOf(tmp, isStartedFromPath ? "/?#" : ":/?#",
1861: from);
1862: if (at == -1) {
1863: at = 0;
1864: }
1865:
1866: /*
1867: * Parse the scheme.
1868: * <p><blockquote><pre>
1869: * scheme = $2 = http
1870: * @
1871: * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
1872: * </pre></blockquote><p>
1873: */
1874: if (at > 0 && at < length && tmp.charAt(at) == ':') {
1875: char[] target = tmp.substring(0, at).toLowerCase()
1876: .toCharArray();
1877: if (validate(target, scheme)) {
1878: _scheme = target;
1879: } else {
1880: throw new URIException("incorrect scheme");
1881: }
1882: from = ++at;
1883: }
1884:
1885: /*
1886: * Parse the authority component.
1887: * <p><blockquote><pre>
1888: * authority = $4 = jakarta.apache.org
1889: * @@
1890: * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
1891: * </pre></blockquote><p>
1892: */
1893: // Reset flags
1894: _is_net_path = _is_abs_path = _is_rel_path = _is_hier_part = false;
1895: if (0 <= at && at < length && tmp.charAt(at) == '/') {
1896: // Set flag
1897: _is_hier_part = true;
1898: if (at + 2 < length && tmp.charAt(at + 1) == '/'
1899: && !isStartedFromPath) {
1900: // the temporary index to start the search from
1901: int next = indexFirstOf(tmp, "/?#", at + 2);
1902: if (next == -1) {
1903: next = (tmp.substring(at + 2).length() == 0) ? at + 2
1904: : tmp.length();
1905: }
1906: parseAuthority(tmp.substring(at + 2, next), escaped);
1907: from = at = next;
1908: // Set flag
1909: _is_net_path = true;
1910: }
1911: if (from == at) {
1912: // Set flag
1913: _is_abs_path = true;
1914: }
1915: }
1916:
1917: /*
1918: * Parse the path component.
1919: * <p><blockquote><pre>
1920: * path = $5 = /ietf/uri/
1921: * @@@@@@
1922: * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
1923: * </pre></blockquote><p>
1924: */
1925: if (from < length) {
1926: // rel_path = rel_segment [ abs_path ]
1927: int next = indexFirstOf(tmp, "?#", from);
1928: if (next == -1) {
1929: next = tmp.length();
1930: }
1931: if (!_is_abs_path) {
1932: if (!escaped
1933: && prevalidate(tmp.substring(from, next),
1934: disallowed_rel_path)
1935: || escaped
1936: && validate(tmp.substring(from, next)
1937: .toCharArray(), rel_path)) {
1938: // Set flag
1939: _is_rel_path = true;
1940: } else if (!escaped
1941: && prevalidate(tmp.substring(from, next),
1942: disallowed_opaque_part)
1943: || escaped
1944: && validate(tmp.substring(from, next)
1945: .toCharArray(), opaque_part)) {
1946: // Set flag
1947: _is_opaque_part = true;
1948: } else {
1949: // the path component may be empty
1950: _path = null;
1951: }
1952: }
1953: String s = tmp.substring(from, next);
1954: if (escaped) {
1955: setRawPath(s.toCharArray());
1956: } else {
1957: setPath(s);
1958: }
1959: at = next;
1960: }
1961:
1962: // set the charset to do escape encoding
1963: String charset = getProtocolCharset();
1964:
1965: /*
1966: * Parse the query component.
1967: * <p><blockquote><pre>
1968: * query = $7 = <undefined>
1969: * @@@@@@@@@
1970: * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
1971: * </pre></blockquote><p>
1972: */
1973: if (0 <= at && at + 1 < length && tmp.charAt(at) == '?') {
1974: int next = tmp.indexOf('#', at + 1);
1975: if (next == -1) {
1976: next = tmp.length();
1977: }
1978: if (escaped) {
1979: _query = tmp.substring(at + 1, next).toCharArray();
1980: if (!validate(_query, uric)) {
1981: throw new URIException("Invalid query");
1982: }
1983: } else {
1984: _query = encode(tmp.substring(at + 1, next),
1985: allowed_query, charset);
1986: }
1987: at = next;
1988: }
1989:
1990: /*
1991: * Parse the fragment component.
1992: * <p><blockquote><pre>
1993: * fragment = $9 = Related
1994: * @@@@@@@@
1995: * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
1996: * </pre></blockquote><p>
1997: */
1998: if (0 <= at && at + 1 <= length && tmp.charAt(at) == '#') {
1999: if (at + 1 == length) { // empty fragment
2000: _fragment = "".toCharArray();
2001: } else {
2002: _fragment = (escaped) ? tmp.substring(at + 1)
2003: .toCharArray() : encode(tmp.substring(at + 1),
2004: allowed_fragment, charset);
2005: }
2006: }
2007:
2008: // set this URI.
2009: setURI();
2010: }
2011:
2012: /**
2013: * Get the earlier index that to be searched for the first occurrance in
2014: * one of any of the given string.
2015: *
2016: * @param s the string to be indexed
2017: * @param delims the delimiters used to index
2018: * @return the earlier index if there are delimiters
2019: */
2020: protected int indexFirstOf(String s, String delims) {
2021: return indexFirstOf(s, delims, -1);
2022: }
2023:
2024: /**
2025: * Get the earlier index that to be searched for the first occurrance in
2026: * one of any of the given string.
2027: *
2028: * @param s the string to be indexed
2029: * @param delims the delimiters used to index
2030: * @param offset the from index
2031: * @return the earlier index if there are delimiters
2032: */
2033: protected int indexFirstOf(String s, String delims, int offset) {
2034: if (s == null || s.length() == 0) {
2035: return -1;
2036: }
2037: if (delims == null || delims.length() == 0) {
2038: return -1;
2039: }
2040: // check boundaries
2041: if (offset < 0) {
2042: offset = 0;
2043: } else if (offset > s.length()) {
2044: return -1;
2045: }
2046: // s is never null
2047: int min = s.length();
2048: char[] delim = delims.toCharArray();
2049: for (int i = 0; i < delim.length; i++) {
2050: int at = s.indexOf(delim[i], offset);
2051: if (at >= 0 && at < min) {
2052: min = at;
2053: }
2054: }
2055: return (min == s.length()) ? -1 : min;
2056: }
2057:
2058: /**
2059: * Get the earlier index that to be searched for the first occurrance in
2060: * one of any of the given array.
2061: *
2062: * @param s the character array to be indexed
2063: * @param delim the delimiter used to index
2064: * @return the ealier index if there are a delimiter
2065: */
2066: protected int indexFirstOf(char[] s, char delim) {
2067: return indexFirstOf(s, delim, 0);
2068: }
2069:
2070: /**
2071: * Get the earlier index that to be searched for the first occurrance in
2072: * one of any of the given array.
2073: *
2074: * @param s the character array to be indexed
2075: * @param delim the delimiter used to index
2076: * @param offset The offset.
2077: * @return the ealier index if there is a delimiter
2078: */
2079: protected int indexFirstOf(char[] s, char delim, int offset) {
2080: if (s == null || s.length == 0) {
2081: return -1;
2082: }
2083: // check boundaries
2084: if (offset < 0) {
2085: offset = 0;
2086: } else if (offset > s.length) {
2087: return -1;
2088: }
2089: for (int i = offset; i < s.length; i++) {
2090: if (s[i] == delim) {
2091: return i;
2092: }
2093: }
2094: return -1;
2095: }
2096:
2097: /**
2098: * Parse the authority component.
2099: *
2100: * @param original the original character sequence of authority component
2101: * @param escaped <code>true</code> if <code>original</code> is escaped
2102: * @throws URIException If an error occurs.
2103: */
2104: protected void parseAuthority(String original, boolean escaped)
2105: throws URIException {
2106:
2107: // Reset flags
2108: _is_reg_name = _is_server = _is_hostname = _is_IPv4address = _is_IPv6reference = false;
2109:
2110: // set the charset to do escape encoding
2111: String charset = getProtocolCharset();
2112:
2113: boolean hasPort = true;
2114: int from = 0;
2115: int next = original.indexOf('@');
2116: if (next != -1) { // neither -1 and 0
2117: // each protocol extented from URI supports the specific userinfo
2118: _userinfo = (escaped) ? original.substring(0, next)
2119: .toCharArray() : encode(
2120: original.substring(0, next), allowed_userinfo,
2121: charset);
2122: from = next + 1;
2123: }
2124: next = original.indexOf('[', from);
2125: if (next >= from) {
2126: next = original.indexOf(']', from);
2127: if (next == -1) {
2128: throw new URIException(URIException.PARSING,
2129: "IPv6reference");
2130: } else {
2131: next++;
2132: }
2133: // In IPv6reference, '[', ']' should be excluded
2134: _host = (escaped) ? original.substring(from, next)
2135: .toCharArray() : encode(original.substring(from,
2136: next), allowed_IPv6reference, charset);
2137: // Set flag
2138: _is_IPv6reference = true;
2139: } else { // only for !_is_IPv6reference
2140: next = original.indexOf(':', from);
2141: if (next == -1) {
2142: next = original.length();
2143: hasPort = false;
2144: }
2145: // REMINDME: it doesn't need the pre-validation
2146: _host = original.substring(from, next).toCharArray();
2147: if (validate(_host, IPv4address)) {
2148: // Set flag
2149: _is_IPv4address = true;
2150: } else if (validate(_host, hostname)) {
2151: // Set flag
2152: _is_hostname = true;
2153: } else {
2154: // Set flag
2155: _is_reg_name = true;
2156: }
2157: }
2158: if (_is_reg_name) {
2159: // Reset flags for a server-based naming authority
2160: _is_server = _is_hostname = _is_IPv4address = _is_IPv6reference = false;
2161: // set a registry-based naming authority
2162: if (escaped) {
2163: _authority = original.toCharArray();
2164: if (!validate(_authority, reg_name)) {
2165: throw new URIException("Invalid authority");
2166: }
2167: } else {
2168: _authority = encode(original, allowed_reg_name, charset);
2169: }
2170: } else {
2171: if (original.length() - 1 > next && hasPort
2172: && original.charAt(next) == ':') { // not empty
2173: from = next + 1;
2174: try {
2175: _port = Integer.parseInt(original.substring(from));
2176: } catch (NumberFormatException error) {
2177: throw new URIException(URIException.PARSING,
2178: "invalid port number");
2179: }
2180: }
2181: // set a server-based naming authority
2182: StringBuffer buf = new StringBuffer();
2183: if (_userinfo != null) { // has_userinfo
2184: buf.append(_userinfo);
2185: buf.append('@');
2186: }
2187: if (_host != null) {
2188: buf.append(_host);
2189: if (_port != -1) {
2190: buf.append(':');
2191: buf.append(_port);
2192: }
2193: }
2194: _authority = buf.toString().toCharArray();
2195: // Set flag
2196: _is_server = true;
2197: }
2198: }
2199:
2200: /**
2201: * Once it's parsed successfully, set this URI.
2202: *
2203: * @see #getRawURI
2204: */
2205: protected void setURI() {
2206: // set _uri
2207: StringBuffer buf = new StringBuffer();
2208: // ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
2209: if (_scheme != null) {
2210: buf.append(_scheme);
2211: buf.append(':');
2212: }
2213: if (_is_net_path) {
2214: buf.append("//");
2215: if (_authority != null) { // has_authority
2216: buf.append(_authority);
2217: }
2218: }
2219: if (_opaque != null && _is_opaque_part) {
2220: buf.append(_opaque);
2221: } else if (_path != null) {
2222: // _is_hier_part or _is_relativeURI
2223: if (_path.length != 0) {
2224: buf.append(_path);
2225: }
2226: }
2227: if (_query != null) { // has_query
2228: buf.append('?');
2229: buf.append(_query);
2230: }
2231: // ignore the fragment identifier
2232: _uri = buf.toString().toCharArray();
2233: hash = 0;
2234: }
2235:
2236: // ----------------------------------------------------------- Test methods
2237:
2238: /**
2239: * Tell whether or not this URI is absolute.
2240: *
2241: * @return true iif this URI is absoluteURI
2242: */
2243: public boolean isAbsoluteURI() {
2244: return (_scheme != null);
2245: }
2246:
2247: /**
2248: * Tell whether or not this URI is relative.
2249: *
2250: * @return true iif this URI is relativeURI
2251: */
2252: public boolean isRelativeURI() {
2253: return (_scheme == null);
2254: }
2255:
2256: /**
2257: * Tell whether or not the absoluteURI of this URI is hier_part.
2258: *
2259: * @return true iif the absoluteURI is hier_part
2260: */
2261: public boolean isHierPart() {
2262: return _is_hier_part;
2263: }
2264:
2265: /**
2266: * Tell whether or not the absoluteURI of this URI is opaque_part.
2267: *
2268: * @return true iif the absoluteURI is opaque_part
2269: */
2270: public boolean isOpaquePart() {
2271: return _is_opaque_part;
2272: }
2273:
2274: /**
2275: * Tell whether or not the relativeURI or heir_part of this URI is net_path.
2276: * It's the same function as the has_authority() method.
2277: *
2278: * @return true iif the relativeURI or heir_part is net_path
2279: * @see #hasAuthority
2280: */
2281: public boolean isNetPath() {
2282: return _is_net_path || (_authority != null);
2283: }
2284:
2285: /**
2286: * Tell whether or not the relativeURI or hier_part of this URI is abs_path.
2287: *
2288: * @return true iif the relativeURI or hier_part is abs_path
2289: */
2290: public boolean isAbsPath() {
2291: return _is_abs_path;
2292: }
2293:
2294: /**
2295: * Tell whether or not the relativeURI of this URI is rel_path.
2296: *
2297: * @return true iif the relativeURI is rel_path
2298: */
2299: public boolean isRelPath() {
2300: return _is_rel_path;
2301: }
2302:
2303: /**
2304: * Tell whether or not this URI has authority.
2305: * It's the same function as the is_net_path() method.
2306: *
2307: * @return true iif this URI has authority
2308: * @see #isNetPath
2309: */
2310: public boolean hasAuthority() {
2311: return (_authority != null) || _is_net_path;
2312: }
2313:
2314: /**
2315: * Tell whether or not the authority component of this URI is reg_name.
2316: *
2317: * @return true iif the authority component is reg_name
2318: */
2319: public boolean isRegName() {
2320: return _is_reg_name;
2321: }
2322:
2323: /**
2324: * Tell whether or not the authority component of this URI is server.
2325: *
2326: * @return true iif the authority component is server
2327: */
2328: public boolean isServer() {
2329: return _is_server;
2330: }
2331:
2332: /**
2333: * Tell whether or not this URI has userinfo.
2334: *
2335: * @return true iif this URI has userinfo
2336: */
2337: public boolean hasUserinfo() {
2338: return (_userinfo != null);
2339: }
2340:
2341: /**
2342: * Tell whether or not the host part of this URI is hostname.
2343: *
2344: * @return true iif the host part is hostname
2345: */
2346: public boolean isHostname() {
2347: return _is_hostname;
2348: }
2349:
2350: /**
2351: * Tell whether or not the host part of this URI is IPv4address.
2352: *
2353: * @return true iif the host part is IPv4address
2354: */
2355: public boolean isIPv4address() {
2356: return _is_IPv4address;
2357: }
2358:
2359: /**
2360: * Tell whether or not the host part of this URI is IPv6reference.
2361: *
2362: * @return true iif the host part is IPv6reference
2363: */
2364: public boolean isIPv6reference() {
2365: return _is_IPv6reference;
2366: }
2367:
2368: /**
2369: * Tell whether or not this URI has query.
2370: *
2371: * @return true iif this URI has query
2372: */
2373: public boolean hasQuery() {
2374: return (_query != null);
2375: }
2376:
2377: /**
2378: * Tell whether or not this URI has fragment.
2379: *
2380: * @return true iif this URI has fragment
2381: */
2382: public boolean hasFragment() {
2383: return (_fragment != null);
2384: }
2385:
2386: // ---------------------------------------------------------------- Charset
2387:
2388: /**
2389: * Set the default charset of the protocol.
2390: * <p>
2391: * The character set used to store files SHALL remain a local decision and
2392: * MAY depend on the capability of local operating systems. Prior to the
2393: * exchange of URIs they SHOULD be converted into a ISO/IEC 10646 format
2394: * and UTF-8 encoded. This approach, while allowing international exchange
2395: * of URIs, will still allow backward compatibility with older systems
2396: * because the code set positions for ASCII characters are identical to the
2397: * one byte sequence in UTF-8.
2398: * <p>
2399: * An individual URI scheme may require a single charset, define a default
2400: * charset, or provide a way to indicate the charset used.
2401: *
2402: * <p>
2403: * Always all the time, the setter method is always succeeded and throws
2404: * <code>DefaultCharsetChanged</code> exception.
2405: *
2406: * So API programmer must follow the following way:
2407: * <code><pre>
2408: * import org.apache.util.URI$DefaultCharsetChanged;
2409: * .
2410: * .
2411: * .
2412: * try {
2413: * URI.setDefaultProtocolCharset("UTF-8");
2414: * } catch (DefaultCharsetChanged cc) {
2415: * // CASE 1: the exception could be ignored, when it is set by user
2416: * if (cc.getReasonCode() == DefaultCharsetChanged.PROTOCOL_CHARSET) {
2417: * // CASE 2: let user know the default protocol charset changed
2418: * } else {
2419: * // CASE 2: let user know the default document charset changed
2420: * }
2421: * }
2422: * </pre></code>
2423: *
2424: * The API programmer is responsible to set the correct charset.
2425: * And each application should remember its own charset to support.
2426: *
2427: * @param charset the default charset for each protocol
2428: * @throws DefaultCharsetChanged default charset changed
2429: */
2430: public static void setDefaultProtocolCharset(String charset)
2431: throws DefaultCharsetChanged {
2432:
2433: defaultProtocolCharset = charset;
2434: throw new DefaultCharsetChanged(
2435: DefaultCharsetChanged.PROTOCOL_CHARSET,
2436: "the default protocol charset changed");
2437: }
2438:
2439: /**
2440: * Get the default charset of the protocol.
2441: * <p>
2442: * An individual URI scheme may require a single charset, define a default
2443: * charset, or provide a way to indicate the charset used.
2444: * <p>
2445: * To work globally either requires support of a number of character sets
2446: * and to be able to convert between them, or the use of a single preferred
2447: * character set.
2448: * For support of global compatibility it is STRONGLY RECOMMENDED that
2449: * clients and servers use UTF-8 encoding when exchanging URIs.
2450: *
2451: * @return the default charset string
2452: */
2453: public static String getDefaultProtocolCharset() {
2454: return defaultProtocolCharset;
2455: }
2456:
2457: /**
2458: * Get the protocol charset used by this current URI instance.
2459: * It was set by the constructor for this instance. If it was not set by
2460: * contructor, it will return the default protocol charset.
2461: *
2462: * @return the protocol charset string
2463: * @see #getDefaultProtocolCharset
2464: */
2465: public String getProtocolCharset() {
2466: return (protocolCharset != null) ? protocolCharset
2467: : defaultProtocolCharset;
2468: }
2469:
2470: /**
2471: * Set the default charset of the document.
2472: * <p>
2473: * Notice that it will be possible to contain mixed characters (e.g.
2474: * ftp://host/KoreanNamespace/ChineseResource). To handle the Bi-directional
2475: * display of these character sets, the protocol charset could be simply
2476: * used again. Because it's not yet implemented that the insertion of BIDI
2477: * control characters at different points during composition is extracted.
2478: * <p>
2479: *
2480: * Always all the time, the setter method is always succeeded and throws
2481: * <code>DefaultCharsetChanged</code> exception.
2482: *
2483: * So API programmer must follow the following way:
2484: * <code><pre>
2485: * import org.apache.util.URI$DefaultCharsetChanged;
2486: * .
2487: * .
2488: * .
2489: * try {
2490: * URI.setDefaultDocumentCharset("EUC-KR");
2491: * } catch (DefaultCharsetChanged cc) {
2492: * // CASE 1: the exception could be ignored, when it is set by user
2493: * if (cc.getReasonCode() == DefaultCharsetChanged.DOCUMENT_CHARSET) {
2494: * // CASE 2: let user know the default document charset changed
2495: * } else {
2496: * // CASE 2: let user know the default protocol charset changed
2497: * }
2498: * }
2499: * </pre></code>
2500: *
2501: * The API programmer is responsible to set the correct charset.
2502: * And each application should remember its own charset to support.
2503: *
2504: * @param charset the default charset for the document
2505: * @throws DefaultCharsetChanged default charset changed
2506: */
2507: public static void setDefaultDocumentCharset(String charset)
2508: throws DefaultCharsetChanged {
2509:
2510: defaultDocumentCharset = charset;
2511: throw new DefaultCharsetChanged(
2512: DefaultCharsetChanged.DOCUMENT_CHARSET,
2513: "the default document charset changed");
2514: }
2515:
2516: /**
2517: * Get the recommended default charset of the document.
2518: *
2519: * @return the default charset string
2520: */
2521: public static String getDefaultDocumentCharset() {
2522: return defaultDocumentCharset;
2523: }
2524:
2525: /**
2526: * Get the default charset of the document by locale.
2527: *
2528: * @return the default charset string by locale
2529: */
2530: public static String getDefaultDocumentCharsetByLocale() {
2531: return defaultDocumentCharsetByLocale;
2532: }
2533:
2534: /**
2535: * Get the default charset of the document by platform.
2536: *
2537: * @return the default charset string by platform
2538: */
2539: public static String getDefaultDocumentCharsetByPlatform() {
2540: return defaultDocumentCharsetByPlatform;
2541: }
2542:
2543: // ------------------------------------------------------------- The scheme
2544:
2545: /**
2546: * Get the scheme.
2547: *
2548: * @return the scheme
2549: */
2550: public char[] getRawScheme() {
2551: return _scheme;
2552: }
2553:
2554: /**
2555: * Get the scheme.
2556: *
2557: * @return the scheme
2558: * null if undefined scheme
2559: */
2560: public String getScheme() {
2561: return (_scheme == null) ? null : new String(_scheme);
2562: }
2563:
2564: // ---------------------------------------------------------- The authority
2565:
2566: /**
2567: * Set the authority. It can be one type of server, hostport, hostname,
2568: * IPv4address, IPv6reference and reg_name.
2569: * <p><blockquote><pre>
2570: * authority = server | reg_name
2571: * </pre></blockquote><p>
2572: *
2573: * @param escapedAuthority the raw escaped authority
2574: * @throws URIException If {@link
2575: * #parseAuthority(java.lang.String,boolean)} fails
2576: * @throws NullPointerException null authority
2577: */
2578: public void setRawAuthority(char[] escapedAuthority)
2579: throws URIException, NullPointerException {
2580:
2581: parseAuthority(new String(escapedAuthority), true);
2582: setURI();
2583: }
2584:
2585: /**
2586: * Set the authority. It can be one type of server, hostport, hostname,
2587: * IPv4address, IPv6reference and reg_name.
2588: * Note that there is no setAuthority method by the escape encoding reason.
2589: *
2590: * @param escapedAuthority the escaped authority string
2591: * @throws URIException If {@link
2592: * #parseAuthority(java.lang.String,boolean)} fails
2593: */
2594: public void setEscapedAuthority(String escapedAuthority)
2595: throws URIException {
2596:
2597: parseAuthority(escapedAuthority, true);
2598: setURI();
2599: }
2600:
2601: /**
2602: * Get the raw-escaped authority.
2603: *
2604: * @return the raw-escaped authority
2605: */
2606: public char[] getRawAuthority() {
2607: return _authority;
2608: }
2609:
2610: /**
2611: * Get the escaped authority.
2612: *
2613: * @return the escaped authority
2614: */
2615: public String getEscapedAuthority() {
2616: return (_authority == null) ? null : new String(_authority);
2617: }
2618:
2619: /**
2620: * Get the authority.
2621: *
2622: * @return the authority
2623: * @throws URIException If {@link #decode} fails
2624: */
2625: public String getAuthority() throws URIException {
2626: return (_authority == null) ? null : decode(_authority,
2627: getProtocolCharset());
2628: }
2629:
2630: // ----------------------------------------------------------- The userinfo
2631:
2632: /**
2633: * Get the raw-escaped userinfo.
2634: *
2635: * @return the raw-escaped userinfo
2636: * @see #getAuthority
2637: */
2638: public char[] getRawUserinfo() {
2639: return _userinfo;
2640: }
2641:
2642: /**
2643: * Get the escaped userinfo.
2644: *
2645: * @return the escaped userinfo
2646: * @see #getAuthority
2647: */
2648: public String getEscapedUserinfo() {
2649: return (_userinfo == null) ? null : new String(_userinfo);
2650: }
2651:
2652: /**
2653: * Get the userinfo.
2654: *
2655: * @return the userinfo
2656: * @throws URIException If {@link #decode} fails
2657: * @see #getAuthority
2658: */
2659: public String getUserinfo() throws URIException {
2660: return (_userinfo == null) ? null : decode(_userinfo,
2661: getProtocolCharset());
2662: }
2663:
2664: // --------------------------------------------------------------- The host
2665:
2666: /**
2667: * Get the host.
2668: * <p><blockquote><pre>
2669: * host = hostname | IPv4address | IPv6reference
2670: * </pre></blockquote><p>
2671: *
2672: * @return the host
2673: * @see #getAuthority
2674: */
2675: public char[] getRawHost() {
2676: return _host;
2677: }
2678:
2679: /**
2680: * Get the host.
2681: * <p><blockquote><pre>
2682: * host = hostname | IPv4address | IPv6reference
2683: * </pre></blockquote><p>
2684: *
2685: * @return the host
2686: * @throws URIException If {@link #decode} fails
2687: * @see #getAuthority
2688: */
2689: public String getHost() throws URIException {
2690: if (_host != null) {
2691: return decode(_host, getProtocolCharset());
2692: } else {
2693: return null;
2694: }
2695: }
2696:
2697: // --------------------------------------------------------------- The port
2698:
2699: /**
2700: * Get the port. In order to get the specfic default port, the specific
2701: * protocol-supported class extended from the URI class should be used.
2702: * It has the server-based naming authority.
2703: *
2704: * @return the port
2705: * if -1, it has the default port for the scheme or the server-based
2706: * naming authority is not supported in the specific URI.
2707: */
2708: public int getPort() {
2709: return _port;
2710: }
2711:
2712: // --------------------------------------------------------------- The path
2713:
2714: /**
2715: * Set the raw-escaped path.
2716: *
2717: * @param escapedPath the path character sequence
2718: * @throws URIException encoding error or not proper for initial instance
2719: * @see #encode
2720: */
2721: public void setRawPath(char[] escapedPath) throws URIException {
2722: if (escapedPath == null || escapedPath.length == 0) {
2723: _path = _opaque = escapedPath;
2724: setURI();
2725: return;
2726: }
2727: // remove the fragment identifier
2728: escapedPath = removeFragmentIdentifier(escapedPath);
2729: if (_is_net_path || _is_abs_path) {
2730: if (escapedPath[0] != '/') {
2731: throw new URIException(URIException.PARSING,
2732: "not absolute path");
2733: }
2734: if (!validate(escapedPath, abs_path)) {
2735: throw new URIException(URIException.ESCAPING,
2736: "escaped absolute path not valid");
2737: }
2738: _path = escapedPath;
2739: } else if (_is_rel_path) {
2740: int at = indexFirstOf(escapedPath, '/');
2741: if (at == 0) {
2742: throw new URIException(URIException.PARSING,
2743: "incorrect path");
2744: }
2745: if (at > 0
2746: && !validate(escapedPath, 0, at - 1, rel_segment)
2747: && !validate(escapedPath, at, -1, abs_path)
2748: || at < 0
2749: && !validate(escapedPath, 0, -1, rel_segment)) {
2750:
2751: throw new URIException(URIException.ESCAPING,
2752: "escaped relative path not valid");
2753: }
2754: _path = escapedPath;
2755: } else if (_is_opaque_part) {
2756: if (!uric_no_slash.get(escapedPath[0])
2757: && !validate(escapedPath, 1, -1, uric)) {
2758: throw new URIException(URIException.ESCAPING,
2759: "escaped opaque part not valid");
2760: }
2761: _opaque = escapedPath;
2762: } else {
2763: throw new URIException(URIException.PARSING,
2764: "incorrect path");
2765: }
2766: setURI();
2767: }
2768:
2769: /**
2770: * Set the escaped path.
2771: *
2772: * @param escapedPath the escaped path string
2773: * @throws URIException encoding error or not proper for initial instance
2774: * @see #encode
2775: */
2776: public void setEscapedPath(String escapedPath) throws URIException {
2777: if (escapedPath == null) {
2778: _path = _opaque = null;
2779: setURI();
2780: return;
2781: }
2782: setRawPath(escapedPath.toCharArray());
2783: }
2784:
2785: /**
2786: * Set the path.
2787: *
2788: * @param path the path string
2789: * @throws URIException set incorrectly or fragment only
2790: * @see #encode
2791: */
2792: public void setPath(String path) throws URIException {
2793:
2794: if (path == null || path.length() == 0) {
2795: _path = _opaque = (path == null) ? null : path
2796: .toCharArray();
2797: setURI();
2798: return;
2799: }
2800: // set the charset to do escape encoding
2801: String charset = getProtocolCharset();
2802:
2803: if (_is_net_path || _is_abs_path) {
2804: _path = encode(path, allowed_abs_path, charset);
2805: } else if (_is_rel_path) {
2806: StringBuffer buff = new StringBuffer(path.length());
2807: int at = path.indexOf('/');
2808: if (at == 0) { // never 0
2809: throw new URIException(URIException.PARSING,
2810: "incorrect relative path");
2811: }
2812: if (at > 0) {
2813: buff.append(encode(path.substring(0, at),
2814: allowed_rel_path, charset));
2815: buff.append(encode(path.substring(at),
2816: allowed_abs_path, charset));
2817: } else {
2818: buff.append(encode(path, allowed_rel_path, charset));
2819: }
2820: _path = buff.toString().toCharArray();
2821: } else if (_is_opaque_part) {
2822: StringBuffer buf = new StringBuffer();
2823: buf.insert(0, encode(path.substring(0, 1), uric_no_slash,
2824: charset));
2825: buf.insert(1, encode(path.substring(1), uric, charset));
2826: _opaque = buf.toString().toCharArray();
2827: } else {
2828: throw new URIException(URIException.PARSING,
2829: "incorrect path");
2830: }
2831: setURI();
2832: }
2833:
2834: /**
2835: * Resolve the base and relative path.
2836: *
2837: * @param basePath a character array of the basePath
2838: * @param relPath a character array of the relPath
2839: * @return the resolved path
2840: * @throws URIException no more higher path level to be resolved
2841: */
2842: protected char[] resolvePath(char[] basePath, char[] relPath)
2843: throws URIException {
2844:
2845: // REMINDME: paths are never null
2846: String base = (basePath == null) ? "" : new String(basePath);
2847:
2848: // _path could be empty
2849: if (relPath == null || relPath.length == 0) {
2850: return normalize(basePath);
2851: } else if (relPath[0] == '/') {
2852: return normalize(relPath);
2853: } else {
2854: int at = base.lastIndexOf('/');
2855: if (at != -1) {
2856: basePath = base.substring(0, at + 1).toCharArray();
2857: }
2858: StringBuffer buff = new StringBuffer(base.length()
2859: + relPath.length);
2860: buff.append((at != -1) ? base.substring(0, at + 1) : "/");
2861: buff.append(relPath);
2862: return normalize(buff.toString().toCharArray());
2863: }
2864: }
2865:
2866: /**
2867: * Get the raw-escaped current hierarchy level in the given path.
2868: * If the last namespace is a collection, the slash mark ('/') should be
2869: * ended with at the last character of the path string.
2870: *
2871: * @param path the path
2872: * @return the current hierarchy level
2873: * @throws URIException no hierarchy level
2874: */
2875: protected char[] getRawCurrentHierPath(char[] path)
2876: throws URIException {
2877:
2878: if (_is_opaque_part) {
2879: throw new URIException(URIException.PARSING,
2880: "no hierarchy level");
2881: }
2882: if (path == null) {
2883: throw new URIException(URIException.PARSING, "empty path");
2884: }
2885: String buff = new String(path);
2886: int first = buff.indexOf('/');
2887: int last = buff.lastIndexOf('/');
2888: if (last == 0) {
2889: return rootPath;
2890: } else if (first != last && last != -1) {
2891: return buff.substring(0, last).toCharArray();
2892: }
2893: // FIXME: it could be a document on the server side
2894: return path;
2895: }
2896:
2897: /**
2898: * Get the raw-escaped current hierarchy level.
2899: *
2900: * @return the raw-escaped current hierarchy level
2901: * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails.
2902: */
2903: public char[] getRawCurrentHierPath() throws URIException {
2904: return (_path == null) ? null : getRawCurrentHierPath(_path);
2905: }
2906:
2907: /**
2908: * Get the escaped current hierarchy level.
2909: *
2910: * @return the escaped current hierarchy level
2911: * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails.
2912: */
2913: public String getEscapedCurrentHierPath() throws URIException {
2914: char[] path = getRawCurrentHierPath();
2915: return (path == null) ? null : new String(path);
2916: }
2917:
2918: /**
2919: * Get the current hierarchy level.
2920: *
2921: * @return the current hierarchy level
2922: * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails.
2923: * @see #decode
2924: */
2925: public String getCurrentHierPath() throws URIException {
2926: char[] path = getRawCurrentHierPath();
2927: return (path == null) ? null : decode(path,
2928: getProtocolCharset());
2929: }
2930:
2931: /**
2932: * Get the level above the this hierarchy level.
2933: *
2934: * @return the raw above hierarchy level
2935: * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails.
2936: */
2937: public char[] getRawAboveHierPath() throws URIException {
2938: char[] path = getRawCurrentHierPath();
2939: return (path == null) ? null : getRawCurrentHierPath(path);
2940: }
2941:
2942: /**
2943: * Get the level above the this hierarchy level.
2944: *
2945: * @return the raw above hierarchy level
2946: * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails.
2947: */
2948: public String getEscapedAboveHierPath() throws URIException {
2949: char[] path = getRawAboveHierPath();
2950: return (path == null) ? null : new String(path);
2951: }
2952:
2953: /**
2954: * Get the level above the this hierarchy level.
2955: *
2956: * @return the above hierarchy level
2957: * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails.
2958: * @see #decode
2959: */
2960: public String getAboveHierPath() throws URIException {
2961: char[] path = getRawAboveHierPath();
2962: return (path == null) ? null : decode(path,
2963: getProtocolCharset());
2964: }
2965:
2966: /**
2967: * Get the raw-escaped path.
2968: * <p><blockquote><pre>
2969: * path = [ abs_path | opaque_part ]
2970: * </pre></blockquote><p>
2971: *
2972: * @return the raw-escaped path
2973: */
2974: public char[] getRawPath() {
2975: return _is_opaque_part ? _opaque : _path;
2976: }
2977:
2978: /**
2979: * Get the escaped path.
2980: * <p><blockquote><pre>
2981: * path = [ abs_path | opaque_part ]
2982: * abs_path = "/" path_segments
2983: * opaque_part = uric_no_slash *uric
2984: * </pre></blockquote><p>
2985: *
2986: * @return the escaped path string
2987: */
2988: public String getEscapedPath() {
2989: char[] path = getRawPath();
2990: return (path == null) ? null : new String(path);
2991: }
2992:
2993: /**
2994: * Get the path.
2995: * <p><blockquote><pre>
2996: * path = [ abs_path | opaque_part ]
2997: * </pre></blockquote><p>
2998: * @return the path string
2999: * @throws URIException If {@link #decode} fails.
3000: * @see #decode
3001: */
3002: public String getPath() throws URIException {
3003: char[] path = getRawPath();
3004: return (path == null) ? null : decode(path,
3005: getProtocolCharset());
3006: }
3007:
3008: /**
3009: * Get the raw-escaped basename of the path.
3010: *
3011: * @return the raw-escaped basename
3012: */
3013: public char[] getRawName() {
3014: if (_path == null) {
3015: return null;
3016: }
3017:
3018: int at = 0;
3019: for (int i = _path.length - 1; i >= 0; i--) {
3020: if (_path[i] == '/') {
3021: at = i + 1;
3022: break;
3023: }
3024: }
3025: int len = _path.length - at;
3026: char[] basename = new char[len];
3027: System.arraycopy(_path, at, basename, 0, len);
3028: return basename;
3029: }
3030:
3031: /**
3032: * Get the escaped basename of the path.
3033: *
3034: * @return the escaped basename string
3035: */
3036: public String getEscapedName() {
3037: char[] basename = getRawName();
3038: return (basename == null) ? null : new String(basename);
3039: }
3040:
3041: /**
3042: * Get the basename of the path.
3043: *
3044: * @return the basename string
3045: * @throws URIException incomplete trailing escape pattern or unsupported
3046: * character encoding
3047: * @see #decode
3048: */
3049: public String getName() throws URIException {
3050: char[] basename = getRawName();
3051: return (basename == null) ? null : decode(getRawName(),
3052: getProtocolCharset());
3053: }
3054:
3055: // ----------------------------------------------------- The path and query
3056:
3057: /**
3058: * Get the raw-escaped path and query.
3059: *
3060: * @return the raw-escaped path and query
3061: */
3062: public char[] getRawPathQuery() {
3063:
3064: if (_path == null && _query == null) {
3065: return null;
3066: }
3067: StringBuffer buff = new StringBuffer();
3068: if (_path != null) {
3069: buff.append(_path);
3070: }
3071: if (_query != null) {
3072: buff.append('?');
3073: buff.append(_query);
3074: }
3075: return buff.toString().toCharArray();
3076: }
3077:
3078: /**
3079: * Get the escaped query.
3080: *
3081: * @return the escaped path and query string
3082: */
3083: public String getEscapedPathQuery() {
3084: char[] rawPathQuery = getRawPathQuery();
3085: return (rawPathQuery == null) ? null : new String(rawPathQuery);
3086: }
3087:
3088: /**
3089: * Get the path and query.
3090: *
3091: * @return the path and query string.
3092: * @throws URIException incomplete trailing escape pattern or unsupported
3093: * character encoding
3094: * @see #decode
3095: */
3096: public String getPathQuery() throws URIException {
3097: char[] rawPathQuery = getRawPathQuery();
3098: return (rawPathQuery == null) ? null : decode(rawPathQuery,
3099: getProtocolCharset());
3100: }
3101:
3102: // -------------------------------------------------------------- The query
3103:
3104: /**
3105: * Set the raw-escaped query.
3106: *
3107: * @param escapedQuery the raw-escaped query
3108: * @throws URIException escaped query not valid
3109: */
3110: public void setRawQuery(char[] escapedQuery) throws URIException {
3111: if (escapedQuery == null || escapedQuery.length == 0) {
3112: _query = escapedQuery;
3113: setURI();
3114: return;
3115: }
3116: // remove the fragment identifier
3117: escapedQuery = removeFragmentIdentifier(escapedQuery);
3118: if (!validate(escapedQuery, query)) {
3119: throw new URIException(URIException.ESCAPING,
3120: "escaped query not valid");
3121: }
3122: _query = escapedQuery;
3123: setURI();
3124: }
3125:
3126: /**
3127: * Set the escaped query string.
3128: *
3129: * @param escapedQuery the escaped query string
3130: * @throws URIException escaped query not valid
3131: */
3132: public void setEscapedQuery(String escapedQuery)
3133: throws URIException {
3134: if (escapedQuery == null) {
3135: _query = null;
3136: setURI();
3137: return;
3138: }
3139: setRawQuery(escapedQuery.toCharArray());
3140: }
3141:
3142: /**
3143: * Set the query.
3144: * <p>
3145: * When a query string is not misunderstood the reserved special characters
3146: * ("&", "=", "+", ",", and "$") within a query component, it is
3147: * recommended to use in encoding the whole query with this method.
3148: * <p>
3149: * The additional APIs for the special purpose using by the reserved
3150: * special characters used in each protocol are implemented in each protocol
3151: * classes inherited from <code>URI</code>. So refer to the same-named APIs
3152: * implemented in each specific protocol instance.
3153: *
3154: * @param query the query string.
3155: * @throws URIException incomplete trailing escape pattern or unsupported
3156: * character encoding
3157: * @see #encode
3158: */
3159: public void setQuery(String query) throws URIException {
3160: if (query == null || query.length() == 0) {
3161: _query = (query == null) ? null : query.toCharArray();
3162: setURI();
3163: return;
3164: }
3165: setRawQuery(encode(query, allowed_query, getProtocolCharset()));
3166: }
3167:
3168: /**
3169: * Get the raw-escaped query.
3170: *
3171: * @return the raw-escaped query
3172: */
3173: public char[] getRawQuery() {
3174: return _query;
3175: }
3176:
3177: /**
3178: * Get the escaped query.
3179: *
3180: * @return the escaped query string
3181: */
3182: public String getEscapedQuery() {
3183: return (_query == null) ? null : new String(_query);
3184: }
3185:
3186: /**
3187: * Get the query.
3188: *
3189: * @return the query string.
3190: * @throws URIException incomplete trailing escape pattern or unsupported
3191: * character encoding
3192: * @see #decode
3193: */
3194: public String getQuery() throws URIException {
3195: return (_query == null) ? null : decode(_query,
3196: getProtocolCharset());
3197: }
3198:
3199: // ----------------------------------------------------------- The fragment
3200:
3201: /**
3202: * Set the raw-escaped fragment.
3203: *
3204: * @param escapedFragment the raw-escaped fragment
3205: * @throws URIException escaped fragment not valid
3206: */
3207: public void setRawFragment(char[] escapedFragment)
3208: throws URIException {
3209: if (escapedFragment == null || escapedFragment.length == 0) {
3210: _fragment = escapedFragment;
3211: hash = 0;
3212: return;
3213: }
3214: if (!validate(escapedFragment, fragment)) {
3215: throw new URIException(URIException.ESCAPING,
3216: "escaped fragment not valid");
3217: }
3218: _fragment = escapedFragment;
3219: hash = 0;
3220: }
3221:
3222: /**
3223: * Set the escaped fragment string.
3224: *
3225: * @param escapedFragment the escaped fragment string
3226: * @throws URIException escaped fragment not valid
3227: */
3228: public void setEscapedFragment(String escapedFragment)
3229: throws URIException {
3230: if (escapedFragment == null) {
3231: _fragment = null;
3232: hash = 0;
3233: return;
3234: }
3235: setRawFragment(escapedFragment.toCharArray());
3236: }
3237:
3238: /**
3239: * Set the fragment.
3240: *
3241: * @param fragment the fragment string.
3242: * @throws URIException If an error occurs.
3243: */
3244: public void setFragment(String fragment) throws URIException {
3245: if (fragment == null || fragment.length() == 0) {
3246: _fragment = (fragment == null) ? null : fragment
3247: .toCharArray();
3248: hash = 0;
3249: return;
3250: }
3251: _fragment = encode(fragment, allowed_fragment,
3252: getProtocolCharset());
3253: hash = 0;
3254: }
3255:
3256: /**
3257: * Get the raw-escaped fragment.
3258: * <p>
3259: * The optional fragment identifier is not part of a URI, but is often used
3260: * in conjunction with a URI.
3261: * <p>
3262: * The format and interpretation of fragment identifiers is dependent on
3263: * the media type [RFC2046] of the retrieval result.
3264: * <p>
3265: * A fragment identifier is only meaningful when a URI reference is
3266: * intended for retrieval and the result of that retrieval is a document
3267: * for which the identified fragment is consistently defined.
3268: *
3269: * @return the raw-escaped fragment
3270: */
3271: public char[] getRawFragment() {
3272: return _fragment;
3273: }
3274:
3275: /**
3276: * Get the escaped fragment.
3277: *
3278: * @return the escaped fragment string
3279: */
3280: public String getEscapedFragment() {
3281: return (_fragment == null) ? null : new String(_fragment);
3282: }
3283:
3284: /**
3285: * Get the fragment.
3286: *
3287: * @return the fragment string
3288: * @throws URIException incomplete trailing escape pattern or unsupported
3289: * character encoding
3290: * @see #decode
3291: */
3292: public String getFragment() throws URIException {
3293: return (_fragment == null) ? null : decode(_fragment,
3294: getProtocolCharset());
3295: }
3296:
3297: // ------------------------------------------------------------- Utilities
3298:
3299: /**
3300: * Remove the fragment identifier of the given component.
3301: *
3302: * @param component the component that a fragment may be included
3303: * @return the component that the fragment identifier is removed
3304: */
3305: protected char[] removeFragmentIdentifier(char[] component) {
3306: if (component == null) {
3307: return null;
3308: }
3309: int lastIndex = new String(component).indexOf('#');
3310: if (lastIndex != -1) {
3311: component = new String(component).substring(0, lastIndex)
3312: .toCharArray();
3313: }
3314: return component;
3315: }
3316:
3317: /**
3318: * Normalize the given hier path part.
3319: *
3320: * <p>Algorithm taken from URI reference parser at
3321: * http://www.apache.org/~fielding/uri/rev-2002/issues.html.
3322: *
3323: * @param path the path to normalize
3324: * @return the normalized path
3325: * @throws URIException no more higher path level to be normalized
3326: */
3327: protected char[] normalize(char[] path) throws URIException {
3328:
3329: if (path == null) {
3330: return null;
3331: }
3332:
3333: String normalized = new String(path);
3334:
3335: // If the buffer begins with "./" or "../", the "." or ".." is removed.
3336: if (normalized.startsWith("./")) {
3337: normalized = normalized.substring(1);
3338: } else if (normalized.startsWith("../")) {
3339: normalized = normalized.substring(2);
3340: } else if (normalized.startsWith("..")) {
3341: normalized = normalized.substring(2);
3342: }
3343:
3344: // All occurrences of "/./" in the buffer are replaced with "/"
3345: int index = -1;
3346: while ((index = normalized.indexOf("/./")) != -1) {
3347: normalized = normalized.substring(0, index)
3348: + normalized.substring(index + 2);
3349: }
3350:
3351: // If the buffer ends with "/.", the "." is removed.
3352: if (normalized.endsWith("/.")) {
3353: normalized = normalized.substring(0,
3354: normalized.length() - 1);
3355: }
3356:
3357: int startIndex = 0;
3358:
3359: // All occurrences of "/<segment>/../" in the buffer, where ".."
3360: // and <segment> are complete path segments, are iteratively replaced
3361: // with "/" in order from left to right until no matching pattern remains.
3362: // If the buffer ends with "/<segment>/..", that is also replaced
3363: // with "/". Note that <segment> may be empty.
3364: while ((index = normalized.indexOf("/../", startIndex)) != -1) {
3365: int slashIndex = normalized.lastIndexOf('/', index - 1);
3366: if (slashIndex >= 0) {
3367: normalized = normalized.substring(0, slashIndex)
3368: + normalized.substring(index + 3);
3369: } else {
3370: startIndex = index + 3;
3371: }
3372: }
3373: if (normalized.endsWith("/..")) {
3374: int slashIndex = normalized.lastIndexOf('/', normalized
3375: .length() - 4);
3376: if (slashIndex >= 0) {
3377: normalized = normalized.substring(0, slashIndex + 1);
3378: }
3379: }
3380:
3381: // All prefixes of "<segment>/../" in the buffer, where ".."
3382: // and <segment> are complete path segments, are iteratively replaced
3383: // with "/" in order from left to right until no matching pattern remains.
3384: // If the buffer ends with "<segment>/..", that is also replaced
3385: // with "/". Note that <segment> may be empty.
3386: while ((index = normalized.indexOf("/../")) != -1) {
3387: int slashIndex = normalized.lastIndexOf('/', index - 1);
3388: if (slashIndex >= 0) {
3389: break;
3390: } else {
3391: normalized = normalized.substring(index + 3);
3392: }
3393: }
3394: if (normalized.endsWith("/..")) {
3395: int slashIndex = normalized.lastIndexOf('/', normalized
3396: .length() - 4);
3397: if (slashIndex < 0) {
3398: normalized = "/";
3399: }
3400: }
3401:
3402: return normalized.toCharArray();
3403: }
3404:
3405: /**
3406: * Normalizes the path part of this URI. Normalization is only meant to be performed on
3407: * URIs with an absolute path. Calling this method on a relative path URI will have no
3408: * effect.
3409: *
3410: * @throws URIException no more higher path level to be normalized
3411: *
3412: * @see #isAbsPath()
3413: */
3414: public void normalize() throws URIException {
3415: if (isAbsPath()) {
3416: _path = normalize(_path);
3417: setURI();
3418: }
3419: }
3420:
3421: /**
3422: * Test if the first array is equal to the second array.
3423: *
3424: * @param first the first character array
3425: * @param second the second character array
3426: * @return true if they're equal
3427: */
3428: protected boolean equals(char[] first, char[] second) {
3429:
3430: if (first == null && second == null) {
3431: return true;
3432: }
3433: if (first == null || second == null) {
3434: return false;
3435: }
3436: if (first.length != second.length) {
3437: return false;
3438: }
3439: for (int i = 0; i < first.length; i++) {
3440: if (first[i] != second[i]) {
3441: return false;
3442: }
3443: }
3444: return true;
3445: }
3446:
3447: /**
3448: * Test an object if this URI is equal to another.
3449: *
3450: * @param obj an object to compare
3451: * @return true if two URI objects are equal
3452: */
3453: public boolean equals(Object obj) {
3454:
3455: // normalize and test each components
3456: if (obj == this ) {
3457: return true;
3458: }
3459: if (!(obj instanceof URI)) {
3460: return false;
3461: }
3462: URI another = (URI) obj;
3463: // scheme
3464: if (!equals(_scheme, another._scheme)) {
3465: return false;
3466: }
3467: // is_opaque_part or is_hier_part? and opaque
3468: if (!equals(_opaque, another._opaque)) {
3469: return false;
3470: }
3471: // is_hier_part
3472: // has_authority
3473: if (!equals(_authority, another._authority)) {
3474: return false;
3475: }
3476: // path
3477: if (!equals(_path, another._path)) {
3478: return false;
3479: }
3480: // has_query
3481: if (!equals(_query, another._query)) {
3482: return false;
3483: }
3484: // has_fragment? should be careful of the only fragment case.
3485: if (!equals(_fragment, another._fragment)) {
3486: return false;
3487: }
3488: return true;
3489: }
3490:
3491: // ---------------------------------------------------------- Serialization
3492:
3493: /**
3494: * Write the content of this URI.
3495: *
3496: * @param oos the object-output stream
3497: * @throws IOException If an IO problem occurs.
3498: */
3499: private void writeObject(ObjectOutputStream oos) throws IOException {
3500:
3501: oos.defaultWriteObject();
3502: }
3503:
3504: /**
3505: * Read a URI.
3506: *
3507: * @param ois the object-input stream
3508: * @throws ClassNotFoundException If one of the classes specified in the
3509: * input stream cannot be found.
3510: * @throws IOException If an IO problem occurs.
3511: */
3512: private void readObject(ObjectInputStream ois)
3513: throws ClassNotFoundException, IOException {
3514:
3515: ois.defaultReadObject();
3516: }
3517:
3518: // -------------------------------------------------------------- Hash code
3519:
3520: /**
3521: * Return a hash code for this URI.
3522: *
3523: * @return a has code value for this URI
3524: */
3525: public int hashCode() {
3526: if (hash == 0) {
3527: char[] c = _uri;
3528: if (c != null) {
3529: for (int i = 0, len = c.length; i < len; i++) {
3530: hash = 31 * hash + c[i];
3531: }
3532: }
3533: c = _fragment;
3534: if (c != null) {
3535: for (int i = 0, len = c.length; i < len; i++) {
3536: hash = 31 * hash + c[i];
3537: }
3538: }
3539: }
3540: return hash;
3541: }
3542:
3543: // ------------------------------------------------------------- Comparison
3544:
3545: /**
3546: * Compare this URI to another object.
3547: *
3548: * @param obj the object to be compared.
3549: * @return 0, if it's same,
3550: * -1, if failed, first being compared with in the authority component
3551: * @throws ClassCastException not URI argument
3552: */
3553: public int compareTo(Object obj) throws ClassCastException {
3554:
3555: URI another = (URI) obj;
3556: if (!equals(_authority, another.getRawAuthority())) {
3557: return -1;
3558: }
3559: return toString().compareTo(another.toString());
3560: }
3561:
3562: // ------------------------------------------------------------------ Clone
3563:
3564: /**
3565: * Create and return a copy of this object, the URI-reference containing
3566: * the userinfo component. Notice that the whole URI-reference including
3567: * the userinfo component counld not be gotten as a <code>String</code>.
3568: * <p>
3569: * To copy the identical <code>URI</code> object including the userinfo
3570: * component, it should be used.
3571: *
3572: * @return a clone of this instance
3573: */
3574: public synchronized Object clone()
3575: throws CloneNotSupportedException {
3576:
3577: URI instance = (URI) super .clone();
3578:
3579: instance._uri = _uri;
3580: instance._scheme = _scheme;
3581: instance._opaque = _opaque;
3582: instance._authority = _authority;
3583: instance._userinfo = _userinfo;
3584: instance._host = _host;
3585: instance._port = _port;
3586: instance._path = _path;
3587: instance._query = _query;
3588: instance._fragment = _fragment;
3589: // the charset to do escape encoding for this instance
3590: instance.protocolCharset = protocolCharset;
3591: // flags
3592: instance._is_hier_part = _is_hier_part;
3593: instance._is_opaque_part = _is_opaque_part;
3594: instance._is_net_path = _is_net_path;
3595: instance._is_abs_path = _is_abs_path;
3596: instance._is_rel_path = _is_rel_path;
3597: instance._is_reg_name = _is_reg_name;
3598: instance._is_server = _is_server;
3599: instance._is_hostname = _is_hostname;
3600: instance._is_IPv4address = _is_IPv4address;
3601: instance._is_IPv6reference = _is_IPv6reference;
3602:
3603: return instance;
3604: }
3605:
3606: // ------------------------------------------------------------ Get the URI
3607:
3608: /**
3609: * It can be gotten the URI character sequence. It's raw-escaped.
3610: * For the purpose of the protocol to be transported, it will be useful.
3611: * <p>
3612: * It is clearly unwise to use a URL that contains a password which is
3613: * intended to be secret. In particular, the use of a password within
3614: * the 'userinfo' component of a URL is strongly disrecommended except
3615: * in those rare cases where the 'password' parameter is intended to be
3616: * public.
3617: * <p>
3618: * When you want to get each part of the userinfo, you need to use the
3619: * specific methods in the specific URL. It depends on the specific URL.
3620: *
3621: * @return the URI character sequence
3622: */
3623: public char[] getRawURI() {
3624: return _uri;
3625: }
3626:
3627: /**
3628: * It can be gotten the URI character sequence. It's escaped.
3629: * For the purpose of the protocol to be transported, it will be useful.
3630: *
3631: * @return the escaped URI string
3632: */
3633: public String getEscapedURI() {
3634: return (_uri == null) ? null : new String(_uri);
3635: }
3636:
3637: /**
3638: * It can be gotten the URI character sequence.
3639: *
3640: * @return the original URI string
3641: * @throws URIException incomplete trailing escape pattern or unsupported
3642: * character encoding
3643: * @see #decode
3644: */
3645: public String getURI() throws URIException {
3646: return (_uri == null) ? null : decode(_uri,
3647: getProtocolCharset());
3648: }
3649:
3650: /**
3651: * Get the URI reference character sequence.
3652: *
3653: * @return the URI reference character sequence
3654: */
3655: public char[] getRawURIReference() {
3656: if (_fragment == null) {
3657: return _uri;
3658: }
3659: if (_uri == null) {
3660: return _fragment;
3661: }
3662: // if _uri != null && _fragment != null
3663: String uriReference = new String(_uri) + "#"
3664: + new String(_fragment);
3665: return uriReference.toCharArray();
3666: }
3667:
3668: /**
3669: * Get the escaped URI reference string.
3670: *
3671: * @return the escaped URI reference string
3672: */
3673: public String getEscapedURIReference() {
3674: char[] uriReference = getRawURIReference();
3675: return (uriReference == null) ? null : new String(uriReference);
3676: }
3677:
3678: /**
3679: * Get the original URI reference string.
3680: *
3681: * @return the original URI reference string
3682: * @throws URIException If {@link #decode} fails.
3683: */
3684: public String getURIReference() throws URIException {
3685: char[] uriReference = getRawURIReference();
3686: return (uriReference == null) ? null : decode(uriReference,
3687: getProtocolCharset());
3688: }
3689:
3690: /**
3691: * Get the escaped URI string.
3692: * <p>
3693: * On the document, the URI-reference form is only used without the userinfo
3694: * component like http://jakarta.apache.org/ by the security reason.
3695: * But the URI-reference form with the userinfo component could be parsed.
3696: * <p>
3697: * In other words, this URI and any its subclasses must not expose the
3698: * URI-reference expression with the userinfo component like
3699: * http://user:password@hostport/restricted_zone.<br>
3700: * It means that the API client programmer should extract each user and
3701: * password to access manually. Probably it will be supported in the each
3702: * subclass, however, not a whole URI-reference expression.
3703: *
3704: * @return the escaped URI string
3705: * @see #clone()
3706: */
3707: public String toString() {
3708: return getEscapedURI();
3709: }
3710:
3711: // ------------------------------------------------------------ Inner class
3712:
3713: /**
3714: * The charset-changed normal operation to represent to be required to
3715: * alert to user the fact the default charset is changed.
3716: */
3717: public static class DefaultCharsetChanged extends RuntimeException {
3718:
3719: // ------------------------------------------------------- constructors
3720:
3721: /**
3722: * The constructor with a reason string and its code arguments.
3723: *
3724: * @param reasonCode the reason code
3725: * @param reason the reason
3726: */
3727: public DefaultCharsetChanged(int reasonCode, String reason) {
3728: super (reason);
3729: this .reason = reason;
3730: this .reasonCode = reasonCode;
3731: }
3732:
3733: // ---------------------------------------------------------- constants
3734:
3735: /** No specified reason code. */
3736: public static final int UNKNOWN = 0;
3737:
3738: /** Protocol charset changed. */
3739: public static final int PROTOCOL_CHARSET = 1;
3740:
3741: /** Document charset changed. */
3742: public static final int DOCUMENT_CHARSET = 2;
3743:
3744: // ------------------------------------------------- instance variables
3745:
3746: /** The reason code. */
3747: private int reasonCode;
3748:
3749: /** The reason message. */
3750: private String reason;
3751:
3752: // ------------------------------------------------------------ methods
3753:
3754: /**
3755: * Get the reason code.
3756: *
3757: * @return the reason code
3758: */
3759: public int getReasonCode() {
3760: return reasonCode;
3761: }
3762:
3763: /**
3764: * Get the reason message.
3765: *
3766: * @return the reason message
3767: */
3768: public String getReason() {
3769: return reason;
3770: }
3771:
3772: }
3773:
3774: /**
3775: * A mapping to determine the (somewhat arbitrarily) preferred charset for a
3776: * given locale. Supports all locales recognized in JDK 1.1.
3777: * <p>
3778: * The distribution of this class is Servlets.com. It was originally
3779: * written by Jason Hunter [jhunter at acm.org] and used by with permission.
3780: */
3781: public static class LocaleToCharsetMap {
3782:
3783: /** A mapping of language code to charset */
3784: private static final Hashtable LOCALE_TO_CHARSET_MAP;
3785: static {
3786: LOCALE_TO_CHARSET_MAP = new Hashtable();
3787: LOCALE_TO_CHARSET_MAP.put("ar", "ISO-8859-6");
3788: LOCALE_TO_CHARSET_MAP.put("be", "ISO-8859-5");
3789: LOCALE_TO_CHARSET_MAP.put("bg", "ISO-8859-5");
3790: LOCALE_TO_CHARSET_MAP.put("ca", "ISO-8859-1");
3791: LOCALE_TO_CHARSET_MAP.put("cs", "ISO-8859-2");
3792: LOCALE_TO_CHARSET_MAP.put("da", "ISO-8859-1");
3793: LOCALE_TO_CHARSET_MAP.put("de", "ISO-8859-1");
3794: LOCALE_TO_CHARSET_MAP.put("el", "ISO-8859-7");
3795: LOCALE_TO_CHARSET_MAP.put("en", "ISO-8859-1");
3796: LOCALE_TO_CHARSET_MAP.put("es", "ISO-8859-1");
3797: LOCALE_TO_CHARSET_MAP.put("et", "ISO-8859-1");
3798: LOCALE_TO_CHARSET_MAP.put("fi", "ISO-8859-1");
3799: LOCALE_TO_CHARSET_MAP.put("fr", "ISO-8859-1");
3800: LOCALE_TO_CHARSET_MAP.put("hr", "ISO-8859-2");
3801: LOCALE_TO_CHARSET_MAP.put("hu", "ISO-8859-2");
3802: LOCALE_TO_CHARSET_MAP.put("is", "ISO-8859-1");
3803: LOCALE_TO_CHARSET_MAP.put("it", "ISO-8859-1");
3804: LOCALE_TO_CHARSET_MAP.put("iw", "ISO-8859-8");
3805: LOCALE_TO_CHARSET_MAP.put("ja", "Shift_JIS");
3806: LOCALE_TO_CHARSET_MAP.put("ko", "EUC-KR");
3807: LOCALE_TO_CHARSET_MAP.put("lt", "ISO-8859-2");
3808: LOCALE_TO_CHARSET_MAP.put("lv", "ISO-8859-2");
3809: LOCALE_TO_CHARSET_MAP.put("mk", "ISO-8859-5");
3810: LOCALE_TO_CHARSET_MAP.put("nl", "ISO-8859-1");
3811: LOCALE_TO_CHARSET_MAP.put("no", "ISO-8859-1");
3812: LOCALE_TO_CHARSET_MAP.put("pl", "ISO-8859-2");
3813: LOCALE_TO_CHARSET_MAP.put("pt", "ISO-8859-1");
3814: LOCALE_TO_CHARSET_MAP.put("ro", "ISO-8859-2");
3815: LOCALE_TO_CHARSET_MAP.put("ru", "ISO-8859-5");
3816: LOCALE_TO_CHARSET_MAP.put("sh", "ISO-8859-5");
3817: LOCALE_TO_CHARSET_MAP.put("sk", "ISO-8859-2");
3818: LOCALE_TO_CHARSET_MAP.put("sl", "ISO-8859-2");
3819: LOCALE_TO_CHARSET_MAP.put("sq", "ISO-8859-2");
3820: LOCALE_TO_CHARSET_MAP.put("sr", "ISO-8859-5");
3821: LOCALE_TO_CHARSET_MAP.put("sv", "ISO-8859-1");
3822: LOCALE_TO_CHARSET_MAP.put("tr", "ISO-8859-9");
3823: LOCALE_TO_CHARSET_MAP.put("uk", "ISO-8859-5");
3824: LOCALE_TO_CHARSET_MAP.put("zh", "GB2312");
3825: LOCALE_TO_CHARSET_MAP.put("zh_TW", "Big5");
3826: }
3827:
3828: /**
3829: * Get the preferred charset for the given locale.
3830: *
3831: * @param locale the locale
3832: * @return the preferred charset or null if the locale is not
3833: * recognized.
3834: */
3835: public static String getCharset(Locale locale) {
3836: // try for an full name match (may include country)
3837: String charset = (String) LOCALE_TO_CHARSET_MAP.get(locale
3838: .toString());
3839: if (charset != null) {
3840: return charset;
3841: }
3842:
3843: // if a full name didn't match, try just the language
3844: charset = (String) LOCALE_TO_CHARSET_MAP.get(locale
3845: .getLanguage());
3846: return charset; // may be null
3847: }
3848:
3849: }
3850:
3851: }
|