001: /*
002: * Copyright Aduna (http://www.aduna-software.com/) (c) 1997-2006.
003: *
004: * Licensed under the Aduna BSD-style license.
005: */
006: package org.openrdf.rio.ntriples;
007:
008: import org.openrdf.model.BNode;
009: import org.openrdf.model.Literal;
010: import org.openrdf.model.Resource;
011: import org.openrdf.model.URI;
012: import org.openrdf.model.Value;
013: import org.openrdf.model.ValueFactory;
014:
015: /**
016: * Utility methods for N-Triples encoding/decoding.
017: */
018: public class NTriplesUtil {
019:
020: /**
021: * Parses an N-Triples value, creates an object for it using the
022: * supplied ValueFactory and returns this object.
023: *
024: * @param nTriplesValue The N-Triples value to parse.
025: * @param valueFactory The ValueFactory to use for creating the
026: * object.
027: * @return An object representing the parsed value.
028: * @throws IllegalArgumentException If the supplied value could not be
029: * parsed correctly.
030: */
031: public static Value parseValue(String nTriplesValue,
032: ValueFactory valueFactory) throws IllegalArgumentException {
033: if (nTriplesValue.startsWith("<")) {
034: return parseURI(nTriplesValue, valueFactory);
035: } else if (nTriplesValue.startsWith("_:")) {
036: return parseBNode(nTriplesValue, valueFactory);
037: } else if (nTriplesValue.startsWith("\"")) {
038: return parseLiteral(nTriplesValue, valueFactory);
039: } else {
040: throw new IllegalArgumentException(
041: "Not a legal N-Triples value: " + nTriplesValue);
042: }
043: }
044:
045: /**
046: * Parses an N-Triples resource, creates an object for it using
047: * the supplied ValueFactory and returns this object.
048: *
049: * @param nTriplesResource The N-Triples resource to parse.
050: * @param valueFactory The ValueFactory to use for creating the
051: * object.
052: * @return An object representing the parsed resource.
053: * @throws IllegalArgumentException If the supplied resource could not be
054: * parsed correctly.
055: */
056: public static Resource parseResource(String nTriplesResource,
057: ValueFactory valueFactory) throws IllegalArgumentException {
058: if (nTriplesResource.startsWith("<")) {
059: return parseURI(nTriplesResource, valueFactory);
060: } else if (nTriplesResource.startsWith("_:")) {
061: return parseBNode(nTriplesResource, valueFactory);
062: } else {
063: throw new IllegalArgumentException(
064: "Not a legal N-Triples resource: "
065: + nTriplesResource);
066: }
067: }
068:
069: /**
070: * Parses an N-Triples URI, creates an object for it using the
071: * supplied ValueFactory and returns this object.
072: *
073: * @param nTriplesURI The N-Triples URI to parse.
074: * @param valueFactory The ValueFactory to use for creating the
075: * object.
076: * @return An object representing the parsed URI.
077: * @throws IllegalArgumentException If the supplied URI could not be
078: * parsed correctly.
079: */
080: public static URI parseURI(String nTriplesURI,
081: ValueFactory valueFactory) throws IllegalArgumentException {
082: if (nTriplesURI.startsWith("<") && nTriplesURI.endsWith(">")) {
083: String uri = nTriplesURI.substring(1,
084: nTriplesURI.length() - 1);
085: uri = unescapeString(uri);
086: return valueFactory.createURI(uri);
087: } else {
088: throw new IllegalArgumentException(
089: "Not a legal N-Triples URI: " + nTriplesURI);
090: }
091: }
092:
093: /**
094: * Parses an N-Triples bNode, creates an object for it using the
095: * supplied ValueFactory and returns this object.
096: *
097: * @param nTriplesBNode The N-Triples bNode to parse.
098: * @param valueFactory The ValueFactory to use for creating the
099: * object.
100: * @return An object representing the parsed bNode.
101: * @throws IllegalArgumentException If the supplied bNode could not be
102: * parsed correctly.
103: */
104: public static BNode parseBNode(String nTriplesBNode,
105: ValueFactory valueFactory) throws IllegalArgumentException {
106: if (nTriplesBNode.startsWith("_:")) {
107: return valueFactory.createBNode(nTriplesBNode.substring(2));
108: } else {
109: throw new IllegalArgumentException(
110: "Not a legal N-Triples URI: " + nTriplesBNode);
111: }
112: }
113:
114: /**
115: * Parses an N-Triples literal, creates an object for it using the
116: * supplied ValueFactory and returns this object.
117: *
118: * @param nTriplesLiteral The N-Triples literal to parse.
119: * @param valueFactory The ValueFactory to use for creating the
120: * object.
121: * @return An object representing the parsed literal.
122: * @throws IllegalArgumentException If the supplied literal could not be
123: * parsed correctly.
124: */
125: public static Literal parseLiteral(String nTriplesLiteral,
126: ValueFactory valueFactory) throws IllegalArgumentException {
127: if (nTriplesLiteral.startsWith("\"")) {
128: // Find string separation points
129: int endLabelIdx = findEndOfLabel(nTriplesLiteral);
130:
131: if (endLabelIdx != -1) {
132: int startLangIdx = nTriplesLiteral.indexOf("@",
133: endLabelIdx);
134: int startDtIdx = nTriplesLiteral.indexOf("^^",
135: endLabelIdx);
136:
137: if (startLangIdx != -1 && startDtIdx != -1) {
138: throw new IllegalArgumentException(
139: "Literals can not have both a language and a datatype");
140: }
141:
142: // Get label
143: String label = nTriplesLiteral
144: .substring(1, endLabelIdx);
145: label = unescapeString(label);
146:
147: if (startLangIdx != -1) {
148: // Get language
149: String language = nTriplesLiteral
150: .substring(startLangIdx + 1);
151: return valueFactory.createLiteral(label, language);
152: } else if (startDtIdx != -1) {
153: // Get datatype
154: String datatype = nTriplesLiteral
155: .substring(startDtIdx + 2);
156: URI dtURI = parseURI(datatype, valueFactory);
157: return valueFactory.createLiteral(label, dtURI);
158: } else {
159: return valueFactory.createLiteral(label);
160: }
161: }
162: }
163:
164: throw new IllegalArgumentException(
165: "Not a legal N-Triples literal: " + nTriplesLiteral);
166: }
167:
168: /**
169: * Finds the end of the label in a literal string. This method
170: * takes into account that characters can be escaped using
171: * backslashes.
172: *
173: * @return The index of the double quote ending the label, or
174: * <tt>-1</tt> if it could not be found.
175: */
176: private static int findEndOfLabel(String nTriplesLiteral) {
177: // First character of literal is guaranteed to be a double
178: // quote, start search at second character.
179:
180: boolean previousWasBackslash = false;
181:
182: for (int i = 1; i < nTriplesLiteral.length(); i++) {
183: char c = nTriplesLiteral.charAt(i);
184:
185: if (c == '"' && !previousWasBackslash) {
186: return i;
187: } else if (c == '\\' && !previousWasBackslash) {
188: // start of escape
189: previousWasBackslash = true;
190: } else if (previousWasBackslash) {
191: // c was escaped
192: previousWasBackslash = false;
193: }
194: }
195:
196: return -1;
197: }
198:
199: /**
200: * Creates an N-Triples string for the supplied value.
201: */
202: public static String toNTriplesString(Value value) {
203: if (value instanceof Resource) {
204: return toNTriplesString((Resource) value);
205: } else if (value instanceof Literal) {
206: return toNTriplesString((Literal) value);
207: } else {
208: throw new IllegalArgumentException("Unknown value type: "
209: + value.getClass());
210: }
211: }
212:
213: /**
214: * Creates an N-Triples string for the supplied resource.
215: */
216: public static String toNTriplesString(Resource resource) {
217: if (resource instanceof URI) {
218: return toNTriplesString((URI) resource);
219: } else if (resource instanceof BNode) {
220: return toNTriplesString((BNode) resource);
221: } else {
222: throw new IllegalArgumentException(
223: "Unknown resource type: " + resource.getClass());
224: }
225: }
226:
227: /**
228: * Creates an N-Triples string for the supplied URI.
229: */
230: public static String toNTriplesString(URI uri) {
231: return "<" + escapeString(uri.toString()) + ">";
232: }
233:
234: /**
235: * Creates an N-Triples string for the supplied bNode.
236: */
237: public static String toNTriplesString(BNode bNode) {
238: return "_:" + bNode.getID();
239: }
240:
241: /**
242: * Creates an N-Triples string for the supplied literal.
243: */
244: public static String toNTriplesString(Literal lit) {
245: // Do some character escaping on the label:
246: StringBuilder sb = new StringBuilder(128);
247: sb.append("\"");
248: sb.append(escapeString(lit.getLabel()));
249: sb.append("\"");
250:
251: if (lit.getDatatype() != null) {
252: // Append the literal's datatype
253: sb.append("^^");
254: sb.append(toNTriplesString(lit.getDatatype()));
255: } else if (lit.getLanguage() != null) {
256: // Append the literal's language
257: sb.append("@");
258: sb.append(lit.getLanguage());
259: }
260:
261: return sb.toString();
262: }
263:
264: /**
265: * Checks whether the supplied character is a letter or number
266: * according to the N-Triples specification.
267: * @see #isLetter
268: * @see #isNumber
269: */
270: public static boolean isLetterOrNumber(int c) {
271: return isLetter(c) || isNumber(c);
272: }
273:
274: /**
275: * Checks whether the supplied character is a letter according to
276: * the N-Triples specification. N-Triples letters are A - Z and a - z.
277: */
278: public static boolean isLetter(int c) {
279: return (c >= 65 && c <= 90) || // A - Z
280: (c >= 97 && c <= 122); // a - z
281: }
282:
283: /**
284: * Checks whether the supplied character is a number according to
285: * the N-Triples specification. N-Triples numbers are 0 - 9.
286: */
287: public static boolean isNumber(int c) {
288: return (c >= 48 && c <= 57); // 0 - 9
289: }
290:
291: /**
292: * Escapes a Unicode string to an all-ASCII character sequence. Any special
293: * characters are escaped using backslashes (<tt>"</tt> becomes <tt>\"</tt>,
294: * etc.), and non-ascii/non-printable characters are escaped using Unicode
295: * escapes (<tt>\uxxxx</tt> and <tt>\Uxxxxxxxx</tt>).
296: */
297: public static String escapeString(String label) {
298: int labelLength = label.length();
299: StringBuilder sb = new StringBuilder(2 * labelLength);
300:
301: for (int i = 0; i < labelLength; i++) {
302: char c = label.charAt(i);
303: int cInt = c;
304:
305: if (c == '\\') {
306: sb.append("\\\\");
307: } else if (c == '"') {
308: sb.append("\\\"");
309: } else if (c == '\n') {
310: sb.append("\\n");
311: } else if (c == '\r') {
312: sb.append("\\r");
313: } else if (c == '\t') {
314: sb.append("\\t");
315: } else if (cInt >= 0x0 && cInt <= 0x8 || cInt == 0xB
316: || cInt == 0xC || cInt >= 0xE && cInt <= 0x1F
317: || cInt >= 0x7F && cInt <= 0xFFFF) {
318: sb.append("\\u");
319: sb.append(toHexString(cInt, 4));
320: } else if (cInt >= 0x10000 && cInt <= 0x10FFFF) {
321: sb.append("\\U");
322: sb.append(toHexString(cInt, 8));
323: } else {
324: sb.append(c);
325: }
326: }
327:
328: return sb.toString();
329: }
330:
331: /**
332: * Unescapes an escaped Unicode string. Any Unicode sequences
333: * (<tt>\uxxxx</tt> and <tt>\Uxxxxxxxx</tt>) are restored to the
334: * value indicated by the hexadecimal argument and any backslash-escapes
335: * (<tt>\"</tt>, <tt>\\</tt>, etc.) are decoded to their original form.
336: *
337: * @param s An escaped Unicode string.
338: * @return The unescaped string.
339: * @throws IllegalArgumentException If the supplied string is not a
340: * correctly escaped N-Triples string.
341: */
342: public static String unescapeString(String s) {
343: int backSlashIdx = s.indexOf('\\');
344:
345: if (backSlashIdx == -1) {
346: // No escaped characters found
347: return s;
348: }
349:
350: int startIdx = 0;
351: int sLength = s.length();
352: StringBuilder sb = new StringBuilder(sLength);
353:
354: while (backSlashIdx != -1) {
355: sb.append(s.substring(startIdx, backSlashIdx));
356:
357: if (backSlashIdx + 1 >= sLength) {
358: throw new IllegalArgumentException(
359: "Unescaped backslash in: " + s);
360: }
361:
362: char c = s.charAt(backSlashIdx + 1);
363:
364: if (c == 't') {
365: sb.append('\t');
366: startIdx = backSlashIdx + 2;
367: } else if (c == 'r') {
368: sb.append('\r');
369: startIdx = backSlashIdx + 2;
370: } else if (c == 'n') {
371: sb.append('\n');
372: startIdx = backSlashIdx + 2;
373: } else if (c == '"') {
374: sb.append('"');
375: startIdx = backSlashIdx + 2;
376: } else if (c == '\\') {
377: sb.append('\\');
378: startIdx = backSlashIdx + 2;
379: } else if (c == 'u') {
380: // \\uxxxx
381:if (backSlashIdx + 5 >= sLength) {
382: throw new IllegalArgumentException(
383: "Incomplete Unicode escape sequence in: "
384: + s);
385: }
386: String xx = s.substring(backSlashIdx + 2,
387: backSlashIdx + 6);
388:
389: try {
390: c = (char) Integer.parseInt(xx, 16);
391: sb.append(c);
392:
393: startIdx = backSlashIdx + 6;
394: } catch (NumberFormatException e) {
395: throw new IllegalArgumentException(
396: "Illegal Unicode escape sequence '\\u" + xx
397: + "' in: " + s);
398: }
399: } else if (c == 'U') {
400: // \\Uxxxxxxxx
401: if (backSlashIdx + 9 >= sLength) {
402: throw new IllegalArgumentException(
403: "Incomplete Unicode escape sequence in: "
404: + s);
405: }
406: String xx = s.substring(backSlashIdx + 2,
407: backSlashIdx + 10);
408:
409: try {
410: c = (char) Integer.parseInt(xx, 16);
411: sb.append(c);
412:
413: startIdx = backSlashIdx + 10;
414: } catch (NumberFormatException e) {
415: throw new IllegalArgumentException(
416: "Illegal Unicode escape sequence '\\U" + xx
417: + "' in: " + s);
418: }
419: } else {
420: throw new IllegalArgumentException(
421: "Unescaped backslash in: " + s);
422: }
423:
424: backSlashIdx = s.indexOf('\\', startIdx);
425: }
426:
427: sb.append(s.substring(startIdx));
428:
429: return sb.toString();
430: }
431:
432: /**
433: * Converts a decimal value to a hexadecimal string represention
434: * of the specified length.
435: *
436: * @param decimal A decimal value.
437: * @param stringLength The length of the resulting string.
438: */
439: public static String toHexString(int decimal, int stringLength) {
440: StringBuilder sb = new StringBuilder(stringLength);
441:
442: String hexVal = Integer.toHexString(decimal).toUpperCase();
443:
444: // insert zeros if hexVal has less than stringLength characters:
445: int nofZeros = stringLength - hexVal.length();
446: for (int i = 0; i < nofZeros; i++) {
447: sb.append('0');
448: }
449:
450: sb.append(hexVal);
451:
452: return sb.toString();
453: }
454: }
|