0001: /*
0002: * Copyright Aduna (http://www.aduna-software.com/) (c) 1997-2007.
0003: *
0004: * Licensed under the Aduna BSD-style license.
0005: */
0006: package org.openrdf.rio.turtle;
0007:
0008: import java.io.IOException;
0009: import java.io.InputStream;
0010: import java.io.InputStreamReader;
0011: import java.io.LineNumberReader;
0012: import java.io.PushbackReader;
0013: import java.io.Reader;
0014: import java.io.UnsupportedEncodingException;
0015:
0016: import info.aduna.text.ASCIIUtil;
0017:
0018: import org.openrdf.model.BNode;
0019: import org.openrdf.model.Literal;
0020: import org.openrdf.model.Resource;
0021: import org.openrdf.model.Statement;
0022: import org.openrdf.model.URI;
0023: import org.openrdf.model.Value;
0024: import org.openrdf.model.ValueFactory;
0025: import org.openrdf.model.impl.ValueFactoryImpl;
0026: import org.openrdf.model.vocabulary.RDF;
0027: import org.openrdf.model.vocabulary.XMLSchema;
0028: import org.openrdf.rio.RDFFormat;
0029: import org.openrdf.rio.RDFHandlerException;
0030: import org.openrdf.rio.RDFParseException;
0031: import org.openrdf.rio.helpers.RDFParserBase;
0032:
0033: /**
0034: * RDF parser for <a href="http://www.dajobe.org/2004/01/turtle/">Turtle</a>
0035: * files. This parser is not thread-safe, therefore its public methods are
0036: * synchronized.
0037: * <p>
0038: * This implementation is based on the 2006/01/02 version of the Turtle
0039: * specification, with slight deviations:
0040: * <ul>
 * <li>Normalization of integer, floating point and boolean values is dependent
 * on the specified datatype handling. According to the specification, integers
 * and booleans should be normalized, but floats should not.</li>
0044: * <li>Comments can be used anywhere in the document, and extend to the end of
0045: * the line. The Turtle grammar doesn't allow comments to be used inside triple
0046: * constructs that extend over multiple lines, but the author's own parser
0047: * deviates from this too.</li>
0048: * </ul>
0049: *
0050: * @author Arjohn Kampman
0051: */
0052: public class TurtleParser extends RDFParserBase {
0053:
0054: /*-----------*
0055: * Variables *
0056: *-----------*/
0057:
// Wraps the reader handed to parse(Reader, String); its line counter is what
// reportLocation() and the report*() overrides read.
private LineNumberReader lineReader;

// Push-back reader layered over lineReader; read(), unread() and peek() all
// go through it.
private PushbackReader reader;

// Subject used for the statements reported by parseObject().
private Resource subject;

// Predicate used for the statements reported by parseObject().
private URI predicate;

// Object value most recently parsed by parseObject().
private Value object;
0067:
0068: /*--------------*
0069: * Constructors *
0070: *--------------*/
0071:
/**
 * Creates a new TurtleParser through the no-argument constructor of
 * {@link RDFParserBase}. RDF model objects are created by whatever factory
 * that constructor installs (documented as a {@link ValueFactoryImpl}).
 */
public TurtleParser() {
    super();
}
0079:
/**
 * Creates a new TurtleParser that hands the supplied ValueFactory to
 * {@link RDFParserBase}, to be used for creating RDF model objects.
 *
 * @param valueFactory
 *        The ValueFactory to create RDF model objects with.
 */
public TurtleParser(ValueFactory valueFactory) {
    super(valueFactory);
}
0090:
0091: /*---------*
0092: * Methods *
0093: *---------*/
0094:
0095: public RDFFormat getRDFFormat() {
0096: return RDFFormat.TURTLE;
0097: }
0098:
0099: /**
0100: * Implementation of the <tt>parse(InputStream, String)</tt> method defined
0101: * in the RDFParser interface.
0102: *
0103: * @param in
0104: * The InputStream from which to read the data, must not be
0105: * <tt>null</tt>. The InputStream is supposed to contain UTF-8
0106: * encoded Unicode characters, as per the Turtle specification.
0107: * @param baseURI
0108: * The URI associated with the data in the InputStream, must not be
0109: * <tt>null</tt>.
0110: * @throws IOException
0111: * If an I/O error occurred while data was read from the InputStream.
0112: * @throws RDFParseException
0113: * If the parser has found an unrecoverable parse error.
0114: * @throws RDFHandlerException
0115: * If the configured statement handler encountered an unrecoverable
0116: * error.
0117: * @throws IllegalArgumentException
0118: * If the supplied input stream or base URI is <tt>null</tt>.
0119: */
0120: public synchronized void parse(InputStream in, String baseURI)
0121: throws IOException, RDFParseException, RDFHandlerException {
0122: if (in == null) {
0123: throw new IllegalArgumentException(
0124: "Input stream must not be 'null'");
0125: }
0126: // Note: baseURI will be checked in parse(Reader, String)
0127:
0128: try {
0129: parse(new InputStreamReader(in, "UTF-8"), baseURI);
0130: } catch (UnsupportedEncodingException e) {
0131: // Every platform should support the UTF-8 encoding...
0132: throw new RuntimeException(e);
0133: }
0134: }
0135:
0136: /**
0137: * Implementation of the <tt>parse(Reader, String)</tt> method defined in
0138: * the RDFParser interface.
0139: *
0140: * @param reader
0141: * The Reader from which to read the data, must not be <tt>null</tt>.
0142: * @param baseURI
0143: * The URI associated with the data in the Reader, must not be
0144: * <tt>null</tt>.
0145: * @throws IOException
0146: * If an I/O error occurred while data was read from the InputStream.
0147: * @throws RDFParseException
0148: * If the parser has found an unrecoverable parse error.
0149: * @throws RDFHandlerException
0150: * If the configured statement handler encountered an unrecoverable
0151: * error.
0152: * @throws IllegalArgumentException
0153: * If the supplied reader or base URI is <tt>null</tt>.
0154: */
0155: public synchronized void parse(Reader reader, String baseURI)
0156: throws IOException, RDFParseException, RDFHandlerException {
0157: if (reader == null) {
0158: throw new IllegalArgumentException(
0159: "Reader must not be 'null'");
0160: }
0161: if (baseURI == null) {
0162: throw new IllegalArgumentException(
0163: "base URI must not be 'null'");
0164: }
0165:
0166: rdfHandler.startRDF();
0167:
0168: lineReader = new LineNumberReader(reader);
0169: // Start counting lines at 1:
0170: lineReader.setLineNumber(1);
0171:
0172: // Allow at most 2 characters to be pushed back:
0173: this .reader = new PushbackReader(lineReader, 2);
0174:
0175: // Store normalized base URI
0176: setBaseURI(baseURI);
0177:
0178: reportLocation();
0179:
0180: try {
0181: int c = skipWSC();
0182:
0183: while (c != -1) {
0184: parseStatement();
0185: c = skipWSC();
0186: }
0187: } finally {
0188: clear();
0189: }
0190:
0191: rdfHandler.endRDF();
0192: }
0193:
0194: protected void parseStatement() throws IOException,
0195: RDFParseException, RDFHandlerException {
0196: int c = peek();
0197:
0198: if (c == '@') {
0199: parseDirective();
0200: skipWSC();
0201: verifyCharacter(read(), ".");
0202: } else {
0203: parseTriples();
0204: skipWSC();
0205: verifyCharacter(read(), ".");
0206: }
0207: }
0208:
0209: protected void parseDirective() throws IOException,
0210: RDFParseException, RDFHandlerException {
0211: // Verify that the first characters form the string "prefix"
0212: verifyCharacter(read(), "@");
0213:
0214: StringBuilder sb = new StringBuilder(8);
0215:
0216: int c = read();
0217: while (c != -1 && !TurtleUtil.isWhitespace(c)) {
0218: sb.append((char) c);
0219: c = read();
0220: }
0221:
0222: String directive = sb.toString();
0223: if (directive.equals("prefix")) {
0224: parsePrefixID();
0225: } else if (directive.equals("base")) {
0226: parseBase();
0227: } else if (directive.length() == 0) {
0228: reportFatalError("Directive name is missing, expected @prefix or @base");
0229: } else {
0230: reportFatalError("Unknown directive \"@" + directive + "\"");
0231: }
0232: }
0233:
0234: protected void parsePrefixID() throws IOException,
0235: RDFParseException, RDFHandlerException {
0236: skipWSC();
0237:
0238: // Read prefix ID (e.g. "rdf:" or ":")
0239: StringBuilder prefixID = new StringBuilder(8);
0240:
0241: while (true) {
0242: int c = read();
0243:
0244: if (c == ':') {
0245: unread(c);
0246: break;
0247: } else if (TurtleUtil.isWhitespace(c)) {
0248: break;
0249: } else if (c == -1) {
0250: throwEOFException();
0251: }
0252:
0253: prefixID.append((char) c);
0254: }
0255:
0256: skipWSC();
0257:
0258: verifyCharacter(read(), ":");
0259:
0260: skipWSC();
0261:
0262: // Read the namespace URI
0263: URI namespace = parseURI();
0264:
0265: // Store and report this namespace mapping
0266: String prefixStr = prefixID.toString();
0267: String namespaceStr = namespace.toString();
0268:
0269: setNamespace(prefixStr, namespaceStr);
0270:
0271: rdfHandler.handleNamespace(prefixStr, namespaceStr);
0272: }
0273:
0274: protected void parseBase() throws IOException, RDFParseException,
0275: RDFHandlerException {
0276: skipWSC();
0277:
0278: URI baseURI = parseURI();
0279:
0280: setBaseURI(baseURI.toString());
0281: }
0282:
0283: protected void parseTriples() throws IOException,
0284: RDFParseException, RDFHandlerException {
0285: parseSubject();
0286: skipWSC();
0287: parsePredicateObjectList();
0288:
0289: subject = null;
0290: predicate = null;
0291: object = null;
0292: }
0293:
0294: protected void parsePredicateObjectList() throws IOException,
0295: RDFParseException, RDFHandlerException {
0296: predicate = parsePredicate();
0297:
0298: skipWSC();
0299:
0300: parseObjectList();
0301:
0302: while (skipWSC() == ';') {
0303: read();
0304:
0305: int c = skipWSC();
0306:
0307: if (c == '.' || // end of triple
0308: c == ']') // end of predicateObjectList inside blank node
0309: {
0310: break;
0311: }
0312:
0313: predicate = parsePredicate();
0314:
0315: skipWSC();
0316:
0317: parseObjectList();
0318: }
0319: }
0320:
0321: protected void parseObjectList() throws IOException,
0322: RDFParseException, RDFHandlerException {
0323: parseObject();
0324:
0325: while (skipWSC() == ',') {
0326: read();
0327: skipWSC();
0328: parseObject();
0329: }
0330: }
0331:
0332: protected void parseSubject() throws IOException,
0333: RDFParseException, RDFHandlerException {
0334: int c = peek();
0335:
0336: if (c == '(') {
0337: subject = parseCollection();
0338: } else if (c == '[') {
0339: subject = parseImplicitBlank();
0340: } else {
0341: Value value = parseValue();
0342:
0343: if (value instanceof Resource) {
0344: subject = (Resource) value;
0345: } else {
0346: reportFatalError("Illegal subject value: " + value);
0347: }
0348: }
0349: }
0350:
0351: protected URI parsePredicate() throws IOException,
0352: RDFParseException {
0353: // Check if the short-cut 'a' is used
0354: int c1 = read();
0355:
0356: if (c1 == 'a') {
0357: int c2 = read();
0358:
0359: if (TurtleUtil.isWhitespace(c2)) {
0360: // Short-cut is used, return the rdf:type URI
0361: return RDF.TYPE;
0362: }
0363:
0364: // Short-cut is not used, unread all characters
0365: unread(c2);
0366: }
0367: unread(c1);
0368:
0369: // Predicate is a normal resource
0370: Value predicate = parseValue();
0371: if (predicate instanceof URI) {
0372: return (URI) predicate;
0373: } else {
0374: reportFatalError("Illegal predicate value: " + predicate);
0375: return null;
0376: }
0377: }
0378:
0379: protected void parseObject() throws IOException, RDFParseException,
0380: RDFHandlerException {
0381: int c = peek();
0382:
0383: if (c == '(') {
0384: object = parseCollection();
0385: } else if (c == '[') {
0386: object = parseImplicitBlank();
0387: } else {
0388: object = parseValue();
0389: }
0390:
0391: reportStatement(subject, predicate, object);
0392: }
0393:
0394: /**
0395: * Parses a collection, e.g. <tt>( item1 item2 item3 )</tt>.
0396: */
0397: protected Resource parseCollection() throws IOException,
0398: RDFParseException, RDFHandlerException {
0399: verifyCharacter(read(), "(");
0400:
0401: int c = skipWSC();
0402:
0403: if (c == ')') {
0404: // Empty list
0405: read();
0406: return RDF.NIL;
0407: } else {
0408: BNode listRoot = createBNode();
0409:
0410: // Remember current subject and predicate
0411: Resource oldSubject = subject;
0412: URI oldPredicate = predicate;
0413:
0414: // generated bNode becomes subject, predicate becomes rdf:first
0415: subject = listRoot;
0416: predicate = RDF.FIRST;
0417:
0418: parseObject();
0419:
0420: BNode bNode = listRoot;
0421:
0422: while (skipWSC() != ')') {
0423: // Create another list node and link it to the previous
0424: BNode newNode = createBNode();
0425: reportStatement(bNode, RDF.REST, newNode);
0426:
0427: // New node becomes the current
0428: subject = bNode = newNode;
0429:
0430: parseObject();
0431: }
0432:
0433: // Skip ')'
0434: read();
0435:
0436: // Close the list
0437: reportStatement(bNode, RDF.REST, RDF.NIL);
0438:
0439: // Restore previous subject and predicate
0440: subject = oldSubject;
0441: predicate = oldPredicate;
0442:
0443: return listRoot;
0444: }
0445: }
0446:
0447: /**
0448: * Parses an implicit blank node. This method parses the token <tt>[]</tt>
0449: * and predicateObjectLists that are surrounded by square brackets.
0450: */
0451: protected Resource parseImplicitBlank() throws IOException,
0452: RDFParseException, RDFHandlerException {
0453: verifyCharacter(read(), "[");
0454:
0455: BNode bNode = createBNode();
0456:
0457: int c = read();
0458: if (c != ']') {
0459: unread(c);
0460:
0461: // Remember current subject and predicate
0462: Resource oldSubject = subject;
0463: URI oldPredicate = predicate;
0464:
0465: // generated bNode becomes subject
0466: subject = bNode;
0467:
0468: // Enter recursion with nested predicate-object list
0469: skipWSC();
0470:
0471: parsePredicateObjectList();
0472:
0473: skipWSC();
0474:
0475: // Read closing bracket
0476: verifyCharacter(read(), "]");
0477:
0478: // Restore previous subject and predicate
0479: subject = oldSubject;
0480: predicate = oldPredicate;
0481: }
0482:
0483: return bNode;
0484: }
0485:
0486: /**
0487: * Parses an RDF value. This method parses uriref, qname, node ID, quoted
0488: * literal, integer, double and boolean.
0489: */
0490: protected Value parseValue() throws IOException, RDFParseException {
0491: int c = peek();
0492:
0493: if (c == '<') {
0494: // uriref, e.g. <foo://bar>
0495: return parseURI();
0496: } else if (c == ':' || TurtleUtil.isPrefixStartChar(c)) {
0497: // qname or boolean
0498: return parseQNameOrBoolean();
0499: } else if (c == '_') {
0500: // node ID, e.g. _:n1
0501: return parseNodeID();
0502: } else if (c == '"') {
0503: // quoted literal, e.g. "foo" or """foo"""
0504: return parseQuotedLiteral();
0505: } else if (ASCIIUtil.isNumber(c) || c == '.' || c == '+'
0506: || c == '-') {
0507: // integer or double, e.g. 123 or 1.2e3
0508: return parseNumber();
0509: } else if (c == -1) {
0510: throwEOFException();
0511: return null;
0512: } else {
0513: reportFatalError("Expected an RDF value here, found '"
0514: + (char) c + "'");
0515: return null;
0516: }
0517: }
0518:
0519: /**
0520: * Parses a quoted string, optionally followed by a language tag or datatype.
0521: */
0522: protected Literal parseQuotedLiteral() throws IOException,
0523: RDFParseException {
0524: String label = parseQuotedString();
0525:
0526: // Check for presence of a language tag or datatype
0527: int c = peek();
0528:
0529: if (c == '@') {
0530: read();
0531:
0532: // Read language
0533: StringBuilder lang = new StringBuilder(8);
0534:
0535: c = read();
0536: if (c == -1) {
0537: throwEOFException();
0538: }
0539: if (!TurtleUtil.isLanguageStartChar(c)) {
0540: reportError("Expected a letter, found '" + (char) c
0541: + "'");
0542: }
0543:
0544: lang.append((char) c);
0545:
0546: c = read();
0547: while (TurtleUtil.isLanguageChar(c)) {
0548: lang.append((char) c);
0549: c = read();
0550: }
0551:
0552: unread(c);
0553:
0554: return createLiteral(label, lang.toString(), null);
0555: } else if (c == '^') {
0556: read();
0557:
0558: // next character should be another '^'
0559: verifyCharacter(read(), "^");
0560:
0561: // Read datatype
0562: Value datatype = parseValue();
0563: if (datatype instanceof URI) {
0564: return createLiteral(label, null, (URI) datatype);
0565: } else {
0566: reportFatalError("Illegal datatype value: " + datatype);
0567: return null;
0568: }
0569: } else {
0570: return createLiteral(label, null, null);
0571: }
0572: }
0573:
0574: /**
0575: * Parses a quoted string, which is either a "normal string" or a """long
0576: * string""".
0577: */
0578: protected String parseQuotedString() throws IOException,
0579: RDFParseException {
0580: String result = null;
0581:
0582: // First character should be '"'
0583: verifyCharacter(read(), "\"");
0584:
0585: // Check for long-string, which starts and ends with three double quotes
0586: int c2 = read();
0587: int c3 = read();
0588:
0589: if (c2 == '"' && c3 == '"') {
0590: // Long string
0591: result = parseLongString();
0592: } else {
0593: // Normal string
0594: unread(c3);
0595: unread(c2);
0596:
0597: result = parseString();
0598: }
0599:
0600: // Unescape any escape sequences
0601: try {
0602: result = TurtleUtil.decodeString(result);
0603: } catch (IllegalArgumentException e) {
0604: reportError(e.getMessage());
0605: }
0606:
0607: return result;
0608: }
0609:
0610: /**
0611: * Parses a "normal string". This method assumes that the first double quote
0612: * has already been parsed.
0613: */
0614: protected String parseString() throws IOException,
0615: RDFParseException {
0616: StringBuilder sb = new StringBuilder(32);
0617:
0618: while (true) {
0619: int c = read();
0620:
0621: if (c == '"') {
0622: break;
0623: } else if (c == -1) {
0624: throwEOFException();
0625: }
0626:
0627: sb.append((char) c);
0628:
0629: if (c == '\\') {
0630: // This escapes the next character, which might be a '"'
0631: c = read();
0632: if (c == -1) {
0633: throwEOFException();
0634: }
0635: sb.append((char) c);
0636: }
0637: }
0638:
0639: return sb.toString();
0640: }
0641:
0642: /**
0643: * Parses a """long string""". This method assumes that the first three
0644: * double quotes have already been parsed.
0645: */
0646: protected String parseLongString() throws IOException,
0647: RDFParseException {
0648: StringBuilder sb = new StringBuilder(1024);
0649:
0650: int doubleQuoteCount = 0;
0651: int c;
0652:
0653: while (doubleQuoteCount < 3) {
0654: c = read();
0655:
0656: if (c == -1) {
0657: throwEOFException();
0658: } else if (c == '"') {
0659: doubleQuoteCount++;
0660: } else {
0661: doubleQuoteCount = 0;
0662: }
0663:
0664: sb.append((char) c);
0665:
0666: if (c == '\\') {
0667: // This escapes the next character, which might be a '"'
0668: c = read();
0669: if (c == -1) {
0670: throwEOFException();
0671: }
0672: sb.append((char) c);
0673: }
0674: }
0675:
0676: return sb.substring(0, sb.length() - 3);
0677: }
0678:
0679: protected Literal parseNumber() throws IOException,
0680: RDFParseException {
0681: StringBuilder value = new StringBuilder(8);
0682: URI datatype = XMLSchema.INTEGER;
0683:
0684: int c = read();
0685:
0686: // read optional sign character
0687: if (c == '+' || c == '-') {
0688: value.append((char) c);
0689: c = read();
0690: }
0691:
0692: while (ASCIIUtil.isNumber(c)) {
0693: value.append((char) c);
0694: c = read();
0695: }
0696:
0697: if (c == '.' || c == 'e' || c == 'E') {
0698: // We're parsing a decimal or a double
0699: datatype = XMLSchema.DECIMAL;
0700:
0701: // read optional fractional digits
0702: if (c == '.') {
0703: value.append((char) c);
0704:
0705: c = read();
0706: while (ASCIIUtil.isNumber(c)) {
0707: value.append((char) c);
0708: c = read();
0709: }
0710:
0711: if (value.length() == 1) {
0712: // We've only parsed a '.'
0713: reportFatalError("Object for statement missing");
0714: }
0715: } else {
0716: if (value.length() == 0) {
0717: // We've only parsed an 'e' or 'E'
0718: reportFatalError("Object for statement missing");
0719: }
0720: }
0721:
0722: // read optional exponent
0723: if (c == 'e' || c == 'E') {
0724: datatype = XMLSchema.DOUBLE;
0725: value.append((char) c);
0726:
0727: c = read();
0728: if (c == '+' || c == '-') {
0729: value.append((char) c);
0730: c = read();
0731: }
0732:
0733: if (!ASCIIUtil.isNumber(c)) {
0734: reportError("Exponent value missing");
0735: }
0736:
0737: value.append((char) c);
0738:
0739: c = read();
0740: while (ASCIIUtil.isNumber(c)) {
0741: value.append((char) c);
0742: c = read();
0743: }
0744: }
0745: }
0746:
0747: // Unread last character, it isn't part of the number
0748: unread(c);
0749:
0750: // String label = value.toString();
0751: // if (datatype.equals(XMLSchema.INTEGER)) {
0752: // try {
0753: // label = XMLDatatypeUtil.normalizeInteger(label);
0754: // }
0755: // catch (IllegalArgumentException e) {
0756: // // Note: this should never happen because of the parse constraints
0757: // reportError("Illegal integer value: " + label);
0758: // }
0759: // }
0760: // return createLiteral(label, null, datatype);
0761:
0762: // Return result as a typed literal
0763: return createLiteral(value.toString(), null, datatype);
0764: }
0765:
0766: protected URI parseURI() throws IOException, RDFParseException {
0767: StringBuilder uriBuf = new StringBuilder(100);
0768:
0769: // First character should be '<'
0770: int c = read();
0771: verifyCharacter(c, "<");
0772:
0773: // Read up to the next '>' character
0774: while (true) {
0775: c = read();
0776:
0777: if (c == '>') {
0778: break;
0779: } else if (c == -1) {
0780: throwEOFException();
0781: }
0782:
0783: uriBuf.append((char) c);
0784:
0785: if (c == '\\') {
0786: // This escapes the next character, which might be a '>'
0787: c = read();
0788: if (c == -1) {
0789: throwEOFException();
0790: }
0791: uriBuf.append((char) c);
0792: }
0793: }
0794:
0795: String uri = uriBuf.toString();
0796:
0797: // Unescape any escape sequences
0798: try {
0799: uri = TurtleUtil.decodeString(uri);
0800: } catch (IllegalArgumentException e) {
0801: reportError(e.getMessage());
0802: }
0803:
0804: return super .resolveURI(uri);
0805: }
0806:
0807: /**
0808: * Parses qnames and boolean values, which have equivalent starting
0809: * characters.
0810: */
0811: protected Value parseQNameOrBoolean() throws IOException,
0812: RDFParseException {
0813: // First character should be a ':' or a letter
0814: int c = read();
0815: if (c == -1) {
0816: throwEOFException();
0817: }
0818: if (c != ':' && !TurtleUtil.isPrefixStartChar(c)) {
0819: reportError("Expected a ':' or a letter, found '"
0820: + (char) c + "'");
0821: }
0822:
0823: String namespace = null;
0824:
0825: if (c == ':') {
0826: // qname using default namespace
0827: namespace = getNamespace("");
0828: if (namespace == null) {
0829: reportError("Default namespace used but not defined");
0830: }
0831: } else {
0832: // c is the first letter of the prefix
0833: StringBuilder prefix = new StringBuilder(8);
0834: prefix.append((char) c);
0835:
0836: c = read();
0837: while (TurtleUtil.isPrefixChar(c)) {
0838: prefix.append((char) c);
0839: c = read();
0840: }
0841:
0842: if (c != ':') {
0843: // prefix may actually be a boolean value
0844: String value = prefix.toString();
0845:
0846: if (value.equals("true") || value.equals("false")) {
0847: return createLiteral(value, null, XMLSchema.BOOLEAN);
0848: }
0849: }
0850:
0851: verifyCharacter(c, ":");
0852:
0853: namespace = getNamespace(prefix.toString());
0854: if (namespace == null) {
0855: reportError("Namespace prefix '" + prefix.toString()
0856: + "' used but not defined");
0857: }
0858: }
0859:
0860: // c == ':', read optional local name
0861: StringBuilder localName = new StringBuilder(16);
0862: c = read();
0863: if (TurtleUtil.isNameStartChar(c)) {
0864: localName.append((char) c);
0865:
0866: c = read();
0867: while (TurtleUtil.isNameChar(c)) {
0868: localName.append((char) c);
0869: c = read();
0870: }
0871: }
0872:
0873: // Unread last character
0874: unread(c);
0875:
0876: // Note: namespace has already been resolved
0877: return createURI(namespace + localName.toString());
0878: }
0879:
0880: /**
0881: * Parses a blank node ID, e.g. <tt>_:node1</tt>.
0882: */
0883: protected BNode parseNodeID() throws IOException, RDFParseException {
0884: // Node ID should start with "_:"
0885: verifyCharacter(read(), "_");
0886: verifyCharacter(read(), ":");
0887:
0888: // Read the node ID
0889: int c = read();
0890: if (c == -1) {
0891: throwEOFException();
0892: } else if (!TurtleUtil.isNameStartChar(c)) {
0893: reportError("Expected a letter, found '" + (char) c + "'");
0894: }
0895:
0896: StringBuilder name = new StringBuilder(32);
0897: name.append((char) c);
0898:
0899: // Read all following letter and numbers, they are part of the name
0900: c = read();
0901: while (TurtleUtil.isNameChar(c)) {
0902: name.append((char) c);
0903: c = read();
0904: }
0905:
0906: unread(c);
0907:
0908: return createBNode(name.toString());
0909: }
0910:
0911: protected void reportStatement(Resource subj, URI pred, Value obj)
0912: throws RDFParseException, RDFHandlerException {
0913: Statement st = createStatement(subj, pred, obj);
0914: rdfHandler.handleStatement(st);
0915: }
0916:
0917: /**
0918: * Verifies that the supplied character <tt>c</tt> is one of the expected
0919: * characters specified in <tt>expected</tt>. This method will throw a
0920: * <tt>ParseException</tt> if this is not the case.
0921: */
0922: protected void verifyCharacter(int c, String expected)
0923: throws RDFParseException {
0924: if (c == -1) {
0925: throwEOFException();
0926: } else if (expected.indexOf((char) c) == -1) {
0927: StringBuilder msg = new StringBuilder(32);
0928: msg.append("Expected ");
0929: for (int i = 0; i < expected.length(); i++) {
0930: if (i > 0) {
0931: msg.append(" or ");
0932: }
0933: msg.append('\'');
0934: msg.append(expected.charAt(i));
0935: msg.append('\'');
0936: }
0937: msg.append(", found '");
0938: msg.append((char) c);
0939: msg.append("'");
0940:
0941: reportError(msg.toString());
0942: }
0943: }
0944:
0945: /**
0946: * Consumes any white space characters (space, tab, line feed, newline) and
0947: * comments (#-style) from <tt>reader</tt>. After this method has been
0948: * called, the first character that is returned by <tt>reader</tt> is
0949: * either a non-ignorable character, or EOF. For convenience, this character
0950: * is also returned by this method.
0951: *
0952: * @return The next character that will be returned by <tt>reader</tt>.
0953: */
0954: protected int skipWSC() throws IOException {
0955: int c = read();
0956: while (TurtleUtil.isWhitespace(c) || c == '#') {
0957: if (c == '#') {
0958: skipLine();
0959: }
0960:
0961: c = read();
0962: }
0963:
0964: unread(c);
0965:
0966: return c;
0967: }
0968:
0969: /**
0970: * Consumes characters from reader until the first EOL has been read.
0971: */
0972: protected void skipLine() throws IOException {
0973: int c = read();
0974: while (c != -1 && c != 0xD && c != 0xA) {
0975: c = read();
0976: }
0977:
0978: // c is equal to -1, \r or \n.
0979: // In case c is equal to \r, we should also read a following \n.
0980: if (c == 0xD) {
0981: c = read();
0982:
0983: if (c != 0xA) {
0984: unread(c);
0985: }
0986: }
0987:
0988: reportLocation();
0989: }
0990:
0991: protected int read() throws IOException {
0992: return reader.read();
0993: }
0994:
0995: protected void unread(int c) throws IOException {
0996: if (c != -1) {
0997: reader.unread(c);
0998: }
0999: }
1000:
1001: protected int peek() throws IOException {
1002: int result = read();
1003: unread(result);
1004: return result;
1005: }
1006:
1007: protected void reportLocation() {
1008: reportLocation(lineReader.getLineNumber(), -1);
1009: }
1010:
1011: /**
1012: * Overrides {@link RDFParserBase#reportWarning(String)}, adding line number
1013: * information to the error.
1014: */
1015: @Override
1016: protected void reportWarning(String msg) {
1017: reportWarning(msg, lineReader.getLineNumber(), -1);
1018: }
1019:
1020: /**
1021: * Overrides {@link RDFParserBase#reportError(String)}, adding line number
1022: * information to the error.
1023: */
1024: @Override
1025: protected void reportError(String msg) throws RDFParseException {
1026: reportError(msg, lineReader.getLineNumber(), -1);
1027: }
1028:
1029: /**
1030: * Overrides {@link RDFParserBase#reportFatalError(String)}, adding line
1031: * number information to the error.
1032: */
1033: @Override
1034: protected void reportFatalError(String msg)
1035: throws RDFParseException {
1036: reportFatalError(msg, lineReader.getLineNumber(), -1);
1037: }
1038:
1039: /**
1040: * Overrides {@link RDFParserBase#reportFatalError(Exception)}, adding line
1041: * number information to the error.
1042: */
1043: @Override
1044: protected void reportFatalError(Exception e)
1045: throws RDFParseException {
1046: reportFatalError(e, lineReader.getLineNumber(), -1);
1047: }
1048:
1049: protected void throwEOFException() throws RDFParseException {
1050: throw new RDFParseException("Unexpected end of file");
1051: }
1052: }
|