001: /*
002: * (c) Copyright 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 Hewlett-Packard Development Company, LP
003: * All rights reserved.
004: *
005: * Redistribution and use in source and binary forms, with or without
006: * modification, are permitted provided that the following conditions
007: * are met:
008: * 1. Redistributions of source code must retain the above copyright
009: * notice, this list of conditions and the following disclaimer.
010: * 2. Redistributions in binary form must reproduce the above copyright
011: * notice, this list of conditions and the following disclaimer in the
012: * documentation and/or other materials provided with the distribution.
013: * 3. The name of the author may not be used to endorse or promote products
014: * derived from this software without specific prior written permission.
015:
016: * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
017: * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
018: * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
019: * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
020: * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
021: * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
022: * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
023: * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
024: * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
025: * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
026:
027: * * $Id: URIref.java,v 1.7 2008/01/02 12:07:43 andy_seaborne Exp $
028:
029: AUTHOR: Jeremy J. Carroll
030: *//*
031: * URIref.java
032: *
033: * Created on September 20, 2001, 12:04 PM
034: */
035:
036: package com.hp.hpl.jena.util;
037:
/**
 * Encodes and decodes URI references in accordance with
 * http://www.w3.org/TR/charmod/#sec-URIs .
 * The details of how the algorithms handle '%' are captured in
 * http://lists.w3.org/Archives/Public/uri/2001Sep/0009.html
 *
 * <p>All methods are static and keep no state between calls.
 *
 * @author jjc
 */
public class URIref {

    /** Message of the {@link Error} raised if a mandatory charset is missing. */
    private static final String MISSING_CHARSET =
        "The JVM is required to support UTF-8 and US-ASCII encodings.";

    /**
     * Every character that {@link #encode(String)} copies through unchanged:
     * ASCII letters and digits, the RFC 2396 reserved and mark characters,
     * the fragment separator '#', and the brackets '[' and ']'.
     */
    private static final String UNESCAPED_CHARS =
        "abcdefghijklmnopqrstuvwxyz"
        + "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        + "0123456789"
        + ";/?:@&=+$,"
        + "-_.!~*'()"
        + "#[]";

    /** Lookup table indexed by ASCII code; true for members of UNESCAPED_CHARS. */
    private static final boolean[] UNESCAPED = new boolean[128];

    static {
        for (int i = 0; i < UNESCAPED_CHARS.length(); i++) {
            UNESCAPED[UNESCAPED_CHARS.charAt(i)] = true;
        }
    }

    /**
     * Convert a Unicode string, first to UTF-8 and then to
     * an RFC 2396 compliant URI with optional fragment identifier
     * using the %NN escape mechanism as appropriate.
     *
     * <p>A '%' followed by two hexadecimal digits is assumed to already
     * indicate an escaped byte; it is kept, with its digits normalised to
     * upper case. A '%' followed by two characters that are not both
     * hexadecimal digits is escaped as "%25" and a warning is written to
     * {@code System.err}. A '%' with fewer than two characters after it is
     * escaped as "%25" without a warning.
     *
     * @param unicode The URI reference to encode; must not be null.
     * @return The corresponding URI, consisting of US-ASCII characters only.
     */
    public static String encode(String unicode) {
        try {
            byte[] utf8 = unicode.getBytes("UTF-8");
            // Worst case every input byte becomes a three byte %NN escape.
            byte[] ascii = new byte[utf8.length * 3];
            int in = 0;
            int out = 0;
            while (in < utf8.length) {
                byte b = utf8[in];

                // Bytes of multi-byte UTF-8 sequences are negative and are
                // therefore never looked up in the table.
                if (b >= 0 && UNESCAPED[b]) {
                    ascii[out++] = b;
                    in++;
                    continue;
                }

                if (b == (byte) '%' && in + 2 < utf8.length) {
                    int high = hexValue(utf8[in + 1]);
                    int low = hexValue(utf8[in + 2]);
                    if (high >= 0 && low >= 0) {
                        // Existing escape: keep it, upper-casing the digits.
                        ascii[out++] = (byte) '%';
                        ascii[out++] = hexEncode(high);
                        ascii[out++] = hexEncode(low);
                        in += 3;
                        continue;
                    }
                    // Ill-formed escape: warn, then escape the '%' itself below.
                    System.err
                        .println("Confusing IRI to encode - contains literal '%': "
                            + unicode);
                }

                // Escape this single byte; mask to get rid of the sign.
                int c = b & 255;
                ascii[out++] = (byte) '%';
                ascii[out++] = hexEncode(c / 16);
                ascii[out++] = hexEncode(c % 16);
                in++;
            }
            return new String(ascii, 0, out, "US-ASCII");
        } catch (java.io.UnsupportedEncodingException e) {
            throw new Error(MISSING_CHARSET, e);
        }
    }

    /**
     * Convert a URI, in US-ASCII, with escaped characters taken from UTF-8,
     * to the corresponding Unicode string.
     * On ill-formed input the results are undefined, specifically if
     * the unescaped version is not a UTF-8 String, some String will be
     * returned.
     * Escaped '%' characters (i.e. "%25") are left unchanged.
     *
     * @param uri The uri, in characters specified by RFC 2396 + '#';
     *            must not be null.
     * @return The corresponding Unicode String.
     * @exception IllegalArgumentException If a % hex sequence is ill-formed,
     *            i.e. it is cut short by the end of the string or contains
     *            a character that is not a hexadecimal digit.
     */
    public static String decode(String uri) {
        try {
            byte[] ascii = uri.getBytes("US-ASCII");
            // Decoding never produces more bytes than it consumes.
            byte[] utf8 = new byte[ascii.length];
            int in = 0;
            int out = 0;
            while (in < ascii.length) {
                if (ascii[in] != (byte) '%') {
                    utf8[out++] = ascii[in++];
                    continue;
                }

                if (in + 1 >= ascii.length) {
                    throw incompleteEscape(uri);
                }
                if (ascii[in + 1] == (byte) '2') {
                    if (in + 2 >= ascii.length) {
                        throw incompleteEscape(uri);
                    }
                    if (ascii[in + 2] == (byte) '5') {
                        // "%25" stays escaped: copy the '%' here, the "25"
                        // is copied as ordinary characters by the next
                        // iterations.
                        utf8[out++] = ascii[in++];
                        continue;
                    }
                }

                // The first digit is validated before the length check for
                // the second, so "%G" reports a bad digit and "%4" reports
                // an incomplete escape.
                int high = hexDecode(ascii[in + 1]);
                if (in + 2 >= ascii.length) {
                    throw incompleteEscape(uri);
                }
                int low = hexDecode(ascii[in + 2]);
                utf8[out++] = (byte) (high * 16 | low);
                in += 3;
            }
            return new String(utf8, 0, out, "UTF-8");
        } catch (java.io.UnsupportedEncodingException e) {
            throw new Error(MISSING_CHARSET, e);
        }
    }

    /** Builds the exception reported when a '%' escape runs off the end of the input. */
    private static IllegalArgumentException incompleteEscape(String uri) {
        return new IllegalArgumentException(
            "Incomplete Hex escape sequence in " + uri);
    }

    /**
     * Returns the upper-case hexadecimal digit, as an ASCII byte, for a
     * value in the range 0 to 15. Other values are not checked.
     */
    private static byte hexEncode(int i) {
        if (i < 10) {
            return (byte) ('0' + i);
        }
        return (byte) ('A' + i - 10);
    }

    /**
     * Returns the value (0 to 15) of an ASCII hexadecimal digit of either
     * case, or -1 if the byte is not a hexadecimal digit.
     */
    private static int hexValue(byte b) {
        if (b >= '0' && b <= '9') {
            return b - '0';
        }
        if (b >= 'a' && b <= 'f') {
            return b - 'a' + 10;
        }
        if (b >= 'A' && b <= 'F') {
            return b - 'A' + 10;
        }
        return -1;
    }

    /**
     * Returns the value (0 to 15) of an ASCII hexadecimal digit of either case.
     *
     * @exception IllegalArgumentException If the byte is not a hexadecimal digit.
     */
    private static int hexDecode(byte b) {
        int value = hexValue(b);
        if (value < 0) {
            throw new IllegalArgumentException(
                "Bad Hex escape character: " + (b & 255));
        }
        return value;
    }

    /**
     * For simple testing: prints each argument, its decoded form, and the
     * re-encoded form of that decoding.
     */
    public static void main(String[] args) {
        for (int i = 0; i < args.length; i++) {
            String decoded = decode(args[i]);
            System.out.println(args[i] + " => " + decoded
                + " => " + encode(decoded));
        }
    }

}
|