001: /*
002: * Copyright Aduna (http://www.aduna-software.com/) (c) 1997-2006.
003: *
004: * Licensed under the Aduna BSD-style license.
005: */
006: package org.openrdf.rio.turtle;
007:
008: import info.aduna.text.ASCIIUtil;
009: import info.aduna.text.StringUtil;
010:
/**
 * Utility methods for Turtle encoding/decoding.
 * <p>
 * The character-class tests in this class take Unicode code points (not UTF-16
 * code units), and the string-level methods iterate their input by code point
 * so that characters outside the Basic Multilingual Plane are classified
 * correctly.
 */
public class TurtleUtil {

    /**
     * Tries to find an index where the supplied URI can be split into a
     * namespace and a local name that comply with the serialization
     * constraints of the Turtle format. Specifically, the local name should
     * adhere to Turtle's <a
     * href="http://www.dajobe.org/2004/01/turtle/#name">name</a> production
     * rule.
     *
     * @param uri
     *        The URI to split.
     * @return The index where the supplied URI can be split, or <tt>-1</tt> if
     *         the URI cannot be split. A returned index is always greater than
     *         zero and smaller than the length of the URI, so both parts are
     *         non-empty.
     */
    public static int findURISplitIndex(String uri) {
        int uriLength = uri.length();

        // Walk backwards over the trailing run of name characters; idx ends up
        // at the first code unit of that run.
        int idx = uriLength;
        while (idx > 0) {
            int codePoint = uri.codePointBefore(idx);
            if (!isNameChar(codePoint)) {
                break;
            }
            idx -= Character.charCount(codePoint);
        }

        // Local names need to start with a 'nameStartChar', so skip leading
        // characters of the run that are name characters but not start
        // characters (digits, '-', combining marks, ...).
        while (idx < uriLength) {
            int codePoint = uri.codePointAt(idx);
            if (isNameStartChar(codePoint)) {
                break;
            }
            idx += Character.charCount(codePoint);
        }

        if (idx > 0 && idx < uriLength) {
            // A valid split index has been found
            return idx;
        }

        // No valid local name has been found
        return -1;
    }

    /**
     * Checks whether the supplied character is Turtle whitespace: space, tab,
     * newline or carriage return.
     *
     * @param c
     *        The code point to test.
     * @return <tt>true</tt> if the character is a whitespace character.
     */
    public static boolean isWhitespace(int c) {
        return c == 0x20 || c == 0x9 || c == 0xA || c == 0xD;
    }

    /**
     * Checks whether the supplied character may start a namespace prefix. This
     * is the set of name start characters without the underscore.
     *
     * @param c
     *        The code point to test.
     * @return <tt>true</tt> if the character is an ASCII letter or lies in one
     *         of the non-ASCII ranges listed in the method body.
     */
    public static boolean isPrefixStartChar(int c) {
        return isAsciiLetter(c)
            || inRange(c, 0x00C0, 0x00D6)
            || inRange(c, 0x00D8, 0x00F6)
            || inRange(c, 0x00F8, 0x02FF)
            || inRange(c, 0x0370, 0x037D)
            || inRange(c, 0x037F, 0x1FFF)
            || inRange(c, 0x200C, 0x200D)
            || inRange(c, 0x2070, 0x218F)
            || inRange(c, 0x2C00, 0x2FEF)
            || inRange(c, 0x3001, 0xD7FF)
            || inRange(c, 0xF900, 0xFDCF)
            || inRange(c, 0xFDF0, 0xFFFD)
            || inRange(c, 0x10000, 0xEFFFF);
    }

    /**
     * Checks whether the supplied character may start a name: an underscore or
     * any prefix start character.
     *
     * @param c
     *        The code point to test.
     * @return <tt>true</tt> if the character may start a name.
     */
    public static boolean isNameStartChar(int c) {
        return c == '_' || isPrefixStartChar(c);
    }

    /**
     * Checks whether the supplied character may occur inside a name: any name
     * start character, an ASCII digit, a hyphen, U+00B7, or a character in the
     * ranges U+0300..U+036F and U+203F..U+2040.
     *
     * @param c
     *        The code point to test.
     * @return <tt>true</tt> if the character is a name character.
     */
    public static boolean isNameChar(int c) {
        return isNameStartChar(c)
            || isAsciiDigit(c)
            || c == '-'
            || c == 0x00B7
            || inRange(c, 0x0300, 0x036F)
            || inRange(c, 0x203F, 0x2040);
    }

    /**
     * Checks whether the supplied character may occur inside a namespace
     * prefix. This is the same set as the name characters.
     *
     * @param c
     *        The code point to test.
     * @return <tt>true</tt> if the character is a prefix character.
     */
    public static boolean isPrefixChar(int c) {
        return isNameChar(c);
    }

    /**
     * Checks whether the supplied character may start a language tag, i.e.
     * whether it is an ASCII letter.
     *
     * @param c
     *        The code point to test.
     * @return <tt>true</tt> if the character may start a language tag.
     */
    public static boolean isLanguageStartChar(int c) {
        return isAsciiLetter(c);
    }

    /**
     * Checks whether the supplied character may occur inside a language tag,
     * i.e. whether it is an ASCII letter, an ASCII digit or a hyphen.
     *
     * @param c
     *        The code point to test.
     * @return <tt>true</tt> if the character is a language tag character.
     */
    public static boolean isLanguageChar(int c) {
        return isAsciiLetter(c) || isAsciiDigit(c) || c == '-';
    }

    /**
     * Checks whether the supplied string is a legal namespace prefix: it must
     * be non-empty, start with a prefix start character and continue with
     * prefix characters only.
     *
     * @param prefix
     *        The prefix to check, must not be <tt>null</tt>.
     * @return <tt>true</tt> if the prefix is legal.
     */
    public static boolean isLegalPrefix(String prefix) {
        if (prefix.length() == 0) {
            return false;
        }

        int first = prefix.codePointAt(0);
        if (!isPrefixStartChar(first)) {
            return false;
        }

        int i = Character.charCount(first);
        while (i < prefix.length()) {
            int codePoint = prefix.codePointAt(i);
            if (!isPrefixChar(codePoint)) {
                return false;
            }
            i += Character.charCount(codePoint);
        }
        return true;
    }

    /**
     * Checks whether the supplied string is a legal name: it must be
     * non-empty, start with a name start character and continue with name
     * characters only.
     *
     * @param name
     *        The name to check, must not be <tt>null</tt>.
     * @return <tt>true</tt> if the name is legal.
     */
    public static boolean isLegalName(String name) {
        if (name.length() == 0) {
            return false;
        }

        int first = name.codePointAt(0);
        if (!isNameStartChar(first)) {
            return false;
        }

        int i = Character.charCount(first);
        while (i < name.length()) {
            int codePoint = name.codePointAt(i);
            if (!isNameChar(codePoint)) {
                return false;
            }
            i += Character.charCount(codePoint);
        }
        return true;
    }

    /**
     * Encodes the supplied string for inclusion as a 'normal' string in a
     * Turtle document. Backslashes, tabs, newlines, carriage returns and
     * double quotes are replaced by their backslash escapes.
     *
     * @param s
     *        The string to encode.
     * @return The encoded string, or <tt>null</tt> if <tt>s</tt> is
     *         <tt>null</tt>.
     */
    public static String encodeString(String s) {
        if (s == null) {
            return null;
        }
        // The backslash must be handled first, otherwise the backslashes
        // introduced by the other replacements would be doubled.
        return s.replace("\\", "\\\\")
            .replace("\t", "\\t")
            .replace("\n", "\\n")
            .replace("\r", "\\r")
            .replace("\"", "\\\"");
    }

    /**
     * Encodes the supplied string for inclusion as a long string in a Turtle
     * document. Only backslashes and double quotes are escaped; line breaks
     * and tabs are left as they are.
     *
     * @param s
     *        The string to encode.
     * @return The encoded string, or <tt>null</tt> if <tt>s</tt> is
     *         <tt>null</tt>.
     */
    public static String encodeLongString(String s) {
        // TODO: not all double quotes need to be escaped. It suffices to encode
        // the ones that form sequences of 3 or more double quotes, and the ones
        // at the end of a string.
        if (s == null) {
            return null;
        }
        return s.replace("\\", "\\\\").replace("\"", "\\\"");
    }

    /**
     * Encodes the supplied string for inclusion as a (relative) URI in a
     * Turtle document. Backslashes and '&gt;' characters are escaped.
     *
     * @param s
     *        The string to encode.
     * @return The encoded string, or <tt>null</tt> if <tt>s</tt> is
     *         <tt>null</tt>.
     */
    public static String encodeURIString(String s) {
        if (s == null) {
            return null;
        }
        return s.replace("\\", "\\\\").replace(">", "\\>");
    }

    /**
     * Decodes an encoded Turtle string. Any backslash escape sequences are
     * substituted with their decoded value. Recognized escapes are the
     * backslash followed by one of <tt>t</tt>, <tt>r</tt>, <tt>n</tt>, a
     * double quote, <tt>&gt;</tt> or another backslash, and the two numeric
     * forms: a lower-case 'u' followed by exactly 4 hexadecimal digits and an
     * upper-case 'U' followed by exactly 8 hexadecimal digits.
     * <p>
     * Numeric escapes are decoded as Unicode code points, so a value above
     * U+FFFF yields a surrogate pair in the result.
     *
     * @param s
     *        An encoded Turtle string.
     * @return The unencoded string; the very same instance is returned when
     *         <tt>s</tt> contains no backslash.
     * @exception IllegalArgumentException
     *            If the supplied string is not a correctly encoded Turtle
     *            string: a backslash that does not start a known escape, a
     *            numeric escape that is too short, contains a non-hexadecimal
     *            character, or denotes a value above U+10FFFF.
     */
    public static String decodeString(String s) {
        int backSlashIdx = s.indexOf('\\');

        if (backSlashIdx == -1) {
            // No escaped characters found
            return s;
        }

        int startIdx = 0;
        int sLength = s.length();
        StringBuilder sb = new StringBuilder(sLength);

        while (backSlashIdx != -1) {
            // Copy the literal text between the previous escape and this one.
            sb.append(s, startIdx, backSlashIdx);

            if (backSlashIdx + 1 >= sLength) {
                throw new IllegalArgumentException("Unescaped backslash in: " + s);
            }

            char c = s.charAt(backSlashIdx + 1);

            switch (c) {
                case 't':
                    sb.append('\t');
                    startIdx = backSlashIdx + 2;
                    break;
                case 'r':
                    sb.append('\r');
                    startIdx = backSlashIdx + 2;
                    break;
                case 'n':
                    sb.append('\n');
                    startIdx = backSlashIdx + 2;
                    break;
                case '"':
                    sb.append('"');
                    startIdx = backSlashIdx + 2;
                    break;
                case '>':
                    sb.append('>');
                    startIdx = backSlashIdx + 2;
                    break;
                case '\\':
                    sb.append('\\');
                    startIdx = backSlashIdx + 2;
                    break;
                case 'u':
                    // 4 hex digits
                    startIdx = appendUnicodeEscape(s, backSlashIdx, 4, sb);
                    break;
                case 'U':
                    // 8 hex digits
                    startIdx = appendUnicodeEscape(s, backSlashIdx, 8, sb);
                    break;
                default:
                    throw new IllegalArgumentException("Unescaped backslash in: " + s);
            }

            backSlashIdx = s.indexOf('\\', startIdx);
        }

        sb.append(s, startIdx, sLength);

        return sb.toString();
    }

    /**
     * Decodes the numeric escape that starts at <tt>backSlashIdx</tt> and
     * appends the resulting code point to <tt>sb</tt>.
     *
     * @return The index of the first character after the escape sequence.
     */
    private static int appendUnicodeEscape(String s, int backSlashIdx, int digitCount, StringBuilder sb) {
        int hexStart = backSlashIdx + 2;
        int hexEnd = hexStart + digitCount;

        if (hexEnd > s.length()) {
            throw new IllegalArgumentException("Incomplete Unicode escape sequence in: " + s);
        }

        // Accumulate in a long: 8 hex digits can exceed Integer.MAX_VALUE.
        long value = 0;
        boolean wellFormed = true;
        for (int i = hexStart; i < hexEnd && wellFormed; i++) {
            int digit = hexValue(s.charAt(i));
            if (digit < 0) {
                wellFormed = false;
            }
            else {
                value = (value << 4) | digit;
            }
        }

        if (!wellFormed || value > Character.MAX_CODE_POINT) {
            throw new IllegalArgumentException("Illegal Unicode escape sequence '\\"
                + s.charAt(backSlashIdx + 1) + s.substring(hexStart, hexEnd) + "' in: " + s);
        }

        sb.appendCodePoint((int)value);
        return hexEnd;
    }

    /**
     * Returns the numeric value of an ASCII hexadecimal digit (either case),
     * or <tt>-1</tt> if the character is not one. Signs and non-ASCII digits
     * are deliberately not accepted.
     */
    private static int hexValue(char c) {
        if (c >= '0' && c <= '9') {
            return c - '0';
        }
        if (c >= 'a' && c <= 'f') {
            return c - 'a' + 10;
        }
        if (c >= 'A' && c <= 'F') {
            return c - 'A' + 10;
        }
        return -1;
    }

    /** Tests whether <tt>c</tt> is one of the ASCII letters a-z or A-Z. */
    private static boolean isAsciiLetter(int c) {
        return inRange(c, 'a', 'z') || inRange(c, 'A', 'Z');
    }

    /** Tests whether <tt>c</tt> is one of the ASCII digits 0-9. */
    private static boolean isAsciiDigit(int c) {
        return inRange(c, '0', '9');
    }

    /** Tests whether <tt>c</tt> lies in the inclusive range [low, high]. */
    private static boolean inRange(int c, int low, int high) {
        return c >= low && c <= high;
    }
}
|