001: /*
002: * Copyright Aduna (http://www.aduna-software.com/) (c) 1997-2007.
003: *
004: * Licensed under the Aduna BSD-style license.
005: */
006: package org.openrdf.rio.ntriples;
007:
008: import java.io.IOException;
009: import java.io.InputStream;
010: import java.io.InputStreamReader;
011: import java.io.Reader;
012: import java.io.UnsupportedEncodingException;
013:
014: import org.openrdf.model.Literal;
015: import org.openrdf.model.Resource;
016: import org.openrdf.model.Statement;
017: import org.openrdf.model.URI;
018: import org.openrdf.model.Value;
019: import org.openrdf.model.ValueFactory;
020: import org.openrdf.model.impl.ValueFactoryImpl;
021: import org.openrdf.rio.RDFFormat;
022: import org.openrdf.rio.RDFHandlerException;
023: import org.openrdf.rio.RDFParseException;
024: import org.openrdf.rio.helpers.RDFParserBase;
025:
026: /**
027: * RDF parser for N-Triples files. A specification of NTriples can be found in
028: * <a href="http://www.w3.org/TR/rdf-testcases/#ntriples">this section</a> of
029: * the RDF Test Cases document. This parser is not thread-safe, therefore its
030: * public methods are synchronized.
031: *
032: * @author Arjohn Kampman
033: */
034: public class NTriplesParser extends RDFParserBase {
035:
036: /*-----------*
037: * Variables *
038: *-----------*/
039:
040: private Reader reader;
041:
042: private int lineNo;
043:
044: private Resource subject;
045:
046: private URI predicate;
047:
048: private Value object;
049:
050: /*--------------*
051: * Constructors *
052: *--------------*/
053:
054: /**
055: * Creates a new NTriplesParser that will use a {@link ValueFactoryImpl} to
056: * create object for resources, bNodes and literals.
057: */
058: public NTriplesParser() {
059: super ();
060: }
061:
062: /**
063: * Creates a new NTriplesParser that will use the supplied
064: * <tt>ValueFactory</tt> to create RDF model objects.
065: *
066: * @param valueFactory
067: * A ValueFactory.
068: */
069: public NTriplesParser(ValueFactory valueFactory) {
070: super (valueFactory);
071: }
072:
073: /*---------*
074: * Methods *
075: *---------*/
076:
077: // implements RDFParser.getRDFFormat()
078: public final RDFFormat getRDFFormat() {
079: return RDFFormat.NTRIPLES;
080: }
081:
082: /**
083: * Implementation of the <tt>parse(InputStream, String)</tt> method defined
084: * in the RDFParser interface.
085: *
086: * @param in
087: * The InputStream from which to read the data, must not be
088: * <tt>null</tt>. The InputStream is supposed to contain 7-bit
089: * US-ASCII characters, as per the N-Triples specification.
090: * @param baseURI
091: * The URI associated with the data in the InputStream, must not be
092: * <tt>null</tt>.
093: * @throws IOException
094: * If an I/O error occurred while data was read from the InputStream.
095: * @throws RDFParseException
096: * If the parser has found an unrecoverable parse error.
097: * @throws RDFHandlerException
098: * If the configured statement handler encountered an unrecoverable
099: * error.
100: * @throws IllegalArgumentException
101: * If the supplied input stream or base URI is <tt>null</tt>.
102: */
103: public synchronized void parse(InputStream in, String baseURI)
104: throws IOException, RDFParseException, RDFHandlerException {
105: if (in == null) {
106: throw new IllegalArgumentException(
107: "Input stream can not be 'null'");
108: }
109: // Note: baseURI will be checked in parse(Reader, String)
110:
111: try {
112: parse(new InputStreamReader(in, "US-ASCII"), baseURI);
113: } catch (UnsupportedEncodingException e) {
114: // Every platform should support the US-ASCII encoding...
115: throw new RuntimeException(e);
116: }
117: }
118:
119: /**
120: * Implementation of the <tt>parse(Reader, String)</tt> method defined in
121: * the RDFParser interface.
122: *
123: * @param reader
124: * The Reader from which to read the data, must not be <tt>null</tt>.
125: * @param baseURI
126: * The URI associated with the data in the Reader, must not be
127: * <tt>null</tt>.
128: * @throws IOException
129: * If an I/O error occurred while data was read from the InputStream.
130: * @throws RDFParseException
131: * If the parser has found an unrecoverable parse error.
132: * @throws RDFHandlerException
133: * If the configured statement handler encountered an unrecoverable
134: * error.
135: * @throws IllegalArgumentException
136: * If the supplied reader or base URI is <tt>null</tt>.
137: */
138: public synchronized void parse(Reader reader, String baseURI)
139: throws IOException, RDFParseException, RDFHandlerException {
140: if (reader == null) {
141: throw new IllegalArgumentException(
142: "Reader can not be 'null'");
143: }
144: if (baseURI == null) {
145: throw new IllegalArgumentException(
146: "base URI can not be 'null'");
147: }
148:
149: rdfHandler.startRDF();
150:
151: this .reader = reader;
152: lineNo = 1;
153:
154: reportLocation(lineNo, 1);
155:
156: try {
157: int c = reader.read();
158: c = skipWhitespace(c);
159:
160: while (c != -1) {
161: if (c == '#') {
162: // Comment, ignore
163: c = skipLine(c);
164: } else if (c == '\r' || c == '\n') {
165: // Empty line, ignore
166: c = skipLine(c);
167: } else {
168: c = parseTriple(c);
169: }
170:
171: c = skipWhitespace(c);
172: }
173: } finally {
174: clear();
175: }
176:
177: rdfHandler.endRDF();
178: }
179:
180: /**
181: * Reads characters from reader until it finds a character that is not a
182: * space or tab, and returns this last character. In case the end of the
183: * character stream has been reached, -1 is returned.
184: */
185: private int skipWhitespace(int c) throws IOException {
186: while (c == ' ' || c == '\t') {
187: c = reader.read();
188: }
189:
190: return c;
191: }
192:
193: /**
194: * Reads characters from reader until the first EOL has been read. The first
195: * character after the EOL is returned. In case the end of the character
196: * stream has been reached, -1 is returned.
197: */
198: private int skipLine(int c) throws IOException {
199: while (c != -1 && c != '\r' && c != '\n') {
200: c = reader.read();
201: }
202:
203: // c is equal to -1, \r or \n. In case of a \r, we should
204: // check whether it is followed by a \n.
205:
206: if (c == '\n') {
207: c = reader.read();
208:
209: lineNo++;
210:
211: reportLocation(lineNo, 1);
212: } else if (c == '\r') {
213: c = reader.read();
214:
215: if (c == '\n') {
216: c = reader.read();
217: }
218:
219: lineNo++;
220:
221: reportLocation(lineNo, 1);
222: }
223:
224: return c;
225: }
226:
227: private int parseTriple(int c) throws IOException,
228: RDFParseException, RDFHandlerException {
229: c = parseSubject(c);
230:
231: c = skipWhitespace(c);
232:
233: c = parsePredicate(c);
234:
235: c = skipWhitespace(c);
236:
237: c = parseObject(c);
238:
239: c = skipWhitespace(c);
240:
241: if (c == -1) {
242: throwEOFException();
243: } else if (c != '.') {
244: reportFatalError("Expected '.', found: " + (char) c);
245: }
246:
247: c = skipLine(c);
248:
249: Statement st = createStatement(subject, predicate, object);
250: rdfHandler.handleStatement(st);
251:
252: subject = null;
253: predicate = null;
254: object = null;
255:
256: return c;
257: }
258:
259: private int parseSubject(int c) throws IOException,
260: RDFParseException {
261: StringBuilder sb = new StringBuilder(100);
262:
263: // subject is either an uriref (<foo://bar>) or a nodeID (_:node1)
264: if (c == '<') {
265: // subject is an uriref
266: c = parseUriRef(c, sb);
267: subject = createURI(sb.toString());
268: } else if (c == '_') {
269: // subject is a bNode
270: c = parseNodeID(c, sb);
271: subject = createBNode(sb.toString());
272: } else if (c == -1) {
273: throwEOFException();
274: } else {
275: reportFatalError("Expected '<' or '_', found: " + (char) c);
276: }
277:
278: return c;
279: }
280:
281: private int parsePredicate(int c) throws IOException,
282: RDFParseException {
283: StringBuilder sb = new StringBuilder(100);
284:
285: // predicate must be an uriref (<foo://bar>)
286: if (c == '<') {
287: // predicate is an uriref
288: c = parseUriRef(c, sb);
289: predicate = createURI(sb.toString());
290: } else if (c == -1) {
291: throwEOFException();
292: } else {
293: reportFatalError("Expected '<', found: " + (char) c);
294: }
295:
296: return c;
297: }
298:
299: private int parseObject(int c) throws IOException,
300: RDFParseException {
301: StringBuilder sb = new StringBuilder(100);
302:
303: // object is either an uriref (<foo://bar>), a nodeID (_:node1) or a
304: // literal ("foo"-en or "1"^^<xsd:integer>).
305: if (c == '<') {
306: // object is an uriref
307: c = parseUriRef(c, sb);
308: object = createURI(sb.toString());
309: } else if (c == '_') {
310: // object is a bNode
311: c = parseNodeID(c, sb);
312: object = createBNode(sb.toString());
313: } else if (c == '"') {
314: // object is a literal
315: StringBuilder lang = new StringBuilder(8);
316: StringBuilder datatype = new StringBuilder(40);
317: c = parseLiteral(c, sb, lang, datatype);
318: object = createLiteral(sb.toString(), lang.toString(),
319: datatype.toString());
320: } else if (c == -1) {
321: throwEOFException();
322: } else {
323: reportFatalError("Expected '<', '_' or '\"', found: "
324: + (char) c);
325: }
326:
327: return c;
328: }
329:
330: private int parseUriRef(int c, StringBuilder uriRef)
331: throws IOException, RDFParseException {
332: assert c == '<' : "Supplied char should be a '<', is: " + c;
333:
334: // Read up to the next '>' character
335: c = reader.read();
336: while (c != '>') {
337: if (c == -1) {
338: throwEOFException();
339: }
340: uriRef.append((char) c);
341: c = reader.read();
342: }
343:
344: // c == '>', read next char
345: c = reader.read();
346:
347: return c;
348: }
349:
350: private int parseNodeID(int c, StringBuilder name)
351: throws IOException, RDFParseException {
352: assert c == '_' : "Supplied char should be a '_', is: " + c;
353:
354: c = reader.read();
355: if (c == -1) {
356: throwEOFException();
357: } else if (c != ':') {
358: reportError("Expected ':', found: " + (char) c);
359: }
360:
361: c = reader.read();
362: if (c == -1) {
363: throwEOFException();
364: } else if (!NTriplesUtil.isLetter(c)) {
365: reportError("Expected a letter, found: " + (char) c);
366: }
367: name.append((char) c);
368:
369: // Read all following letter and numbers, they are part of the name
370: c = reader.read();
371: while (c != -1 && NTriplesUtil.isLetterOrNumber(c)) {
372: name.append((char) c);
373: c = reader.read();
374: }
375:
376: return c;
377: }
378:
379: private int parseLiteral(int c, StringBuilder value,
380: StringBuilder lang, StringBuilder datatype)
381: throws IOException, RDFParseException {
382: assert c == '"' : "Supplied char should be a '\"', is: " + c;
383:
384: // Read up to the next '"' character
385: c = reader.read();
386: while (c != '"') {
387: if (c == -1) {
388: throwEOFException();
389: }
390: value.append((char) c);
391:
392: if (c == '\\') {
393: // This escapes the next character, which might be a double quote
394: c = reader.read();
395: if (c == -1) {
396: throwEOFException();
397: }
398: value.append((char) c);
399: }
400:
401: c = reader.read();
402: }
403:
404: // c == '"', read next char
405: c = reader.read();
406:
407: if (c == '@') {
408: // Read language
409: c = reader.read();
410: while (c != -1 && c != '.' && c != '^' && c != ' '
411: && c != '\t') {
412: lang.append((char) c);
413: c = reader.read();
414: }
415: } else if (c == '^') {
416: // Read datatype
417: c = reader.read();
418:
419: // c should be another '^'
420: if (c == -1) {
421: throwEOFException();
422: } else if (c != '^') {
423: reportError("Expected '^', found: " + (char) c);
424: }
425:
426: c = reader.read();
427:
428: // c should be a '<'
429: if (c == -1) {
430: throwEOFException();
431: } else if (c != '<') {
432: reportError("Expected '<', found: " + (char) c);
433: }
434:
435: c = parseUriRef(c, datatype);
436: }
437:
438: return c;
439: }
440:
441: @Override
442: protected URI createURI(String uri) throws RDFParseException {
443: try {
444: uri = NTriplesUtil.unescapeString(uri);
445: } catch (IllegalArgumentException e) {
446: reportError(e.getMessage());
447: }
448:
449: return super .createURI(uri);
450: }
451:
452: protected Literal createLiteral(String label, String lang,
453: String datatype) throws RDFParseException {
454: try {
455: label = NTriplesUtil.unescapeString(label);
456: } catch (IllegalArgumentException e) {
457: reportError(e.getMessage());
458: }
459:
460: if (lang.length() == 0) {
461: lang = null;
462: }
463:
464: if (datatype.length() == 0) {
465: datatype = null;
466: }
467:
468: URI dtURI = null;
469: if (datatype != null) {
470: dtURI = createURI(datatype);
471: }
472:
473: return super .createLiteral(label, lang, dtURI);
474: }
475:
476: /**
477: * Overrides {@link RDFParserBase#reportWarning(String)}, adding line number
478: * information to the error.
479: */
480: @Override
481: protected void reportWarning(String msg) {
482: reportWarning(msg, lineNo, -1);
483: }
484:
485: /**
486: * Overrides {@link RDFParserBase#reportError(String)}, adding line number
487: * information to the error.
488: */
489: @Override
490: protected void reportError(String msg) throws RDFParseException {
491: reportError(msg, lineNo, -1);
492: }
493:
494: /**
495: * Overrides {@link RDFParserBase#reportFatalError(String)}, adding line
496: * number information to the error.
497: */
498: @Override
499: protected void reportFatalError(String msg)
500: throws RDFParseException {
501: reportFatalError(msg, lineNo, -1);
502: }
503:
504: /**
505: * Overrides {@link RDFParserBase#reportFatalError(Exception)}, adding line
506: * number information to the error.
507: */
508: @Override
509: protected void reportFatalError(Exception e)
510: throws RDFParseException {
511: reportFatalError(e, lineNo, -1);
512: }
513:
514: private void throwEOFException() throws RDFParseException {
515: throw new RDFParseException("Unexpected end of file");
516: }
517: }
|