0001: package com.rimfaxe.xml.xmlreader;
0002:
0003: import java.io.*;
0004: import java.util.*;
0005:
0006: /** An XML character stream that has been parsed into a DOM tree. This
0007: class encapsulates the Sparta XML parsing.
0008:
0009: <blockquote><small> Copyright (C) 2002 Hewlett-Packard Company.
0010: This file is part of Sparta, an XML Parser, DOM, and XPath library.
0011: This library is free software; you can redistribute it and/or
0012: modify it under the terms of the GNU Lesser General Public License
0013: as published by the Free Software Foundation; either version 2.1 of
0014: the License, or (at your option) any later version. This library
0015: is distributed in the hope that it will be useful, but WITHOUT ANY
0016: WARRANTY; without even the implied warranty of MERCHANTABILITY or
0017: FITNESS FOR A PARTICULAR PURPOSE.</small></blockquote>
0018: @see <a href="doc-files/LGPL.txt">GNU Lesser General Public License</a>
0019: @version $Date: 2003/01/27 23:30:58 $ $Revision: 1.4 $
0020: @author Eamonn O'Brien-Strain
0021: @author Sergio Marti
0022: */
0023: class ParseCharStream implements ParseSource {
0024:
0025: private final boolean DEBUG = true;
0026: private final boolean H_DEBUG = false;
0027:
/**
 * Parses XML that is already held in memory as a character array.
 * Delegates to the general constructor with no reader.
 */
public ParseCharStream(String systemId, char[] xmlData, ParseLog log,
        String encoding, ParseHandler handler)
        throws ParseException, EncodingMismatchException, IOException {
    this(systemId, (Reader) null, xmlData, log, encoding, handler);
}
0035:
/**
 * Parses XML supplied by a character stream.
 * Delegates to the general constructor with no in-memory character array.
 */
public ParseCharStream(String systemId, Reader reader, ParseLog log,
        String encoding, ParseHandler handler)
        throws ParseException, EncodingMismatchException, IOException {
    this(systemId, reader, (char[]) null, log, encoding, handler);
}
0043:
0044: /** Parse XML document from characters stream according to W3C grammar.
0045: * [1] document ::= prolog element Misc*
0046: * @see <a href="http://www.w3.org/TR/2000/REC-xml-20001006">
0047: * http://www.w3.org/TR/2000/REC-xml-20001006
0048: * </a>
0049: */
0050:
0051: public ParseCharStream(String systemId, Reader reader,
0052: char[] xmlData, ParseLog log, String encoding,
0053: ParseHandler handler) throws ParseException,
0054: EncodingMismatchException, IOException {
0055: if (DEBUG)
0056: lineNumber_ = 1;
0057: if (H_DEBUG) {
0058: history_ = new CharCircBuffer(HISTORY_LENGTH);
0059: history_.addString("1:");
0060: } else
0061: history_ = null;
0062:
0063: log_ = (log == null) ? DEFAULT_LOG : log;
0064: encoding_ = encoding;
0065:
0066: //http://www.w3.org/TR/2000/REC-xml-20001006#sec-predefined-ent
0067: entities_.put("lt", "<");
0068: entities_.put("gt", ">");
0069: entities_.put("amp", "&");
0070: entities_.put("apos", "\'");
0071: entities_.put("quot", "\"");
0072:
0073: // Set input stream buffer. Either use string char array or
0074: // fill from character reader
0075: if (xmlData != null) {
0076: cbuf_ = xmlData;
0077: curPos_ = 0;
0078: endPos_ = cbuf_.length;
0079: eos_ = true;
0080: reader_ = null;
0081: } else {
0082: reader_ = reader;
0083: cbuf_ = new char[CBUF_SIZE];
0084: fillBuf();
0085: }
0086:
0087: systemId_ = systemId;
0088:
0089: // Set the ParseHandler for parsing
0090: handler_ = handler;
0091: handler_.setParseSource(this );
0092:
0093: /*
0094: try {
0095: */
0096:
0097: readProlog();
0098:
0099: handler_.startDocument();
0100:
0101: Element rootElement = this .readElement(null);
0102:
0103: if (docTypeName_ != null
0104: && !docTypeName_.equals(rootElement.getTagName()))
0105: log_.warning("DOCTYPE name \"" + docTypeName_
0106: + "\" not same as tag name, \""
0107: + rootElement.getTagName() + "\" of root element",
0108: systemId_, getLineNumber());
0109: while (isMisc())
0110: readMisc();
0111:
0112: if (reader_ != null)
0113: reader_.close();
0114:
0115: handler_.endDocument();
0116: }
0117:
0118: public String toString() {
0119: return systemId_;
0120: }
0121:
0122: public String getSystemId() {
0123: return systemId_;
0124: }
0125:
0126: /** Last line number read by parser. */
0127: public int getLineNumber() {
0128: return lineNumber_;
0129: }
0130:
0131: int getLastCharRead() {
0132: return ch_;
0133: }
0134:
0135: String getHistory() {
0136: if (H_DEBUG)
0137: return history_.toString();
0138: else
0139: return "";
0140: }
0141:
0142: private int fillBuf() throws IOException {
0143: if (eos_)
0144: return -1;
0145:
0146: if (endPos_ == cbuf_.length) {
0147: if (curPos_ != endPos_)
0148: throw new Error(
0149: "Assertion failed in Sparta: curPos_ != (endPos_ == cbuf_.length)");
0150: curPos_ = endPos_ = 0;
0151: }
0152:
0153: int count = reader_
0154: .read(cbuf_, endPos_, cbuf_.length - endPos_);
0155: if (count <= 0) {
0156: eos_ = true;
0157: return -1;
0158: }
0159: endPos_ += count;
0160: return count;
0161: }
0162:
0163: private int fillBuf(int min) throws IOException {
0164: if (eos_)
0165: return -1;
0166:
0167: int count = 0;
0168: if (cbuf_.length - curPos_ < min) {
0169: for (int i = 0; curPos_ + i < endPos_; i++)
0170: cbuf_[i] = cbuf_[curPos_ + i];
0171: count = endPos_ - curPos_;
0172: endPos_ = count;
0173: curPos_ = 0;
0174: }
0175: int res = fillBuf();
0176: if (res == -1)
0177: if (count == 0)
0178: return -1;
0179: else
0180: return count;
0181: else
0182: return count + res;
0183:
0184: }
0185:
0186: /** [2] Char ::=
0187: #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
0188: */
0189: char readChar() throws ParseException, IOException {
0190: //APPROXIMATION
0191: if (curPos_ >= endPos_)
0192: if (fillBuf() == -1)
0193: throw new ParseException(this ,
0194: "unexpected end of expression.");
0195: if (DEBUG)
0196: if (cbuf_[curPos_] == '\n')
0197: lineNumber_++;
0198: if (H_DEBUG) {
0199: history_.addChar(cbuf_[curPos_]);
0200: if (cbuf_[curPos_] == '\n') {
0201: history_.addInt(lineNumber_);
0202: history_.addChar(':');
0203: }
0204: }
0205:
0206: return cbuf_[curPos_++];
0207: }
0208:
0209: char peekChar() throws ParseException, IOException {
0210: //APPROXIMATION
0211: if (curPos_ >= endPos_)
0212: if (fillBuf() == -1)
0213: throw new ParseException(this ,
0214: "unexpected end of expression.");
0215: return cbuf_[curPos_];
0216: }
0217:
0218: void readChar(char expected) throws ParseException, IOException {
0219: char ch = readChar();
0220: if (ch != expected)
0221: throw new ParseException(this , ch, expected);
0222: }
0223:
0224: boolean isChar(char expected) throws ParseException, IOException {
0225: if (curPos_ >= endPos_)
0226: if (fillBuf() == -1)
0227: throw new ParseException(this ,
0228: "unexpected end of expression.");
0229: return (cbuf_[curPos_] == expected);
0230: }
0231:
0232: char readChar(char[] expected) throws ParseException, IOException {
0233: char ch = readChar();
0234: if (!isIn(ch, expected))
0235: throw new ParseException(this , ch, expected);
0236: return ch;
0237: }
0238:
0239: boolean isChar(char[] expected) throws ParseException, IOException {
0240: if (curPos_ >= endPos_)
0241: if (fillBuf() == -1)
0242: return false;
0243: return isIn(cbuf_[curPos_], expected);
0244: }
0245:
0246: static boolean isIn(char ch, char[] expected) {
0247: for (int i = 0; i < expected.length; ++i)
0248: if (ch == expected[i])
0249: return true;
0250: return false;
0251: }
0252:
0253: /** [3] S ::= (#x20 | #x9 | #xD | #xA)+ */
0254: void readS() throws ParseException, IOException {
0255: readChar(S_CHARS);
0256: while (isChar(S_CHARS))
0257: readChar();
0258: }
0259:
0260: static private final char[] S_CHARS = { ' ', '\t', '\r', '\n' };
0261:
0262: boolean isS() throws ParseException, IOException {
0263: return isChar(S_CHARS);
0264: }
0265:
0266: static private final char[] NAME_PUNCT_CHARS = { '.', '-', '_', ':' };
0267:
0268: /** [4] NameChar
0269: ::= Letter | Digit | '.' | '-' | '_' | ':' | CombiningChar | Extender
0270: */
0271: private boolean isNameChar() throws ParseException, IOException {
0272: char ch = peekChar();
0273: return Character.isUnicodeIdentifierPart(ch)
0274: || isIn(ch, NAME_PUNCT_CHARS)
0275: || Character.getType(ch) == Character.COMBINING_SPACING_MARK
0276: || isExtender(ch);
0277: }
0278:
0279: /** [89] Extender
0280: ::= #x00B7 | #x02D0 | #x02D1 | #x0387 | #x0640 | #x0E46 | #x0EC6
0281: | #x3005 | [#x3031-#x3035] | [#x309D-#x309E] | [#x30FC-#x30FE]
0282: */
0283: static private boolean isExtender(char ch) {
0284: //verbose but efficient
0285: switch (ch) {
0286: case '\u00B7':
0287: case '\u02D0':
0288: case '\u02D1':
0289: case '\u0387':
0290: case '\u0640':
0291: case '\u0E46':
0292: case '\u0EC6':
0293: case '\u3005':
0294: case '\u3031':
0295: case '\u3032':
0296: case '\u3033':
0297: case '\u3034':
0298: case '\u3035':
0299: case '\u309D':
0300: case '\u309E':
0301: case '\u30FC':
0302: case '\u30FD':
0303: case '\u30FE':
0304: return true;
0305: default:
0306: return false;
0307: }
0308: }
0309:
0310: /** [5] Name ::= (Letter | '_' | ':') (NameChar)*
0311: * [84] Letter ::= BaseChar | Ideographic
0312: */
0313: String readName() throws ParseException, IOException {
0314: StringBuffer result = null;
0315: int i = 0;
0316: tmpBuf_[i++] = readNameStartChar();
0317: while (isNameChar()) {
0318: if (i >= TMP_BUF_SIZE) {
0319: if (result == null) {
0320: result = new StringBuffer(i);
0321: result.append(tmpBuf_, 0, i);
0322: } else
0323: result.append(tmpBuf_, 0, i);
0324: i = 0;
0325: }
0326: tmpBuf_[i++] = readChar();
0327: }
0328: if (result == null)
0329: return new String(tmpBuf_, 0, i);
0330: else {
0331: result.append(tmpBuf_, 0, i);
0332: return result.toString();
0333: }
0334: }
0335:
0336: private char readNameStartChar() throws ParseException, IOException {
0337: char ch = readChar();
0338: if (!Character.isUnicodeIdentifierStart(ch) && ch != '_'
0339: && ch != ':')
0340: throw new ParseException(this , ch,
0341: "letter, underscore, colon");
0342: return ch;
0343: }
0344:
0345: /** [9] EntityValue ::=
0346: * '"'
0347: * (
0348: * [^%&"] | PEReference | Reference
0349: * )*
0350: * '"'
0351: */
0352: String readEntityValue() throws ParseException, IOException {
0353: //grammar allows only double quote, but many xmlconf examples
0354: //use single quotes
0355: char quote = readChar(QUOTE_CHARS);
0356: StringBuffer result = new StringBuffer();
0357: while (!isChar(quote)) {
0358: if (isPeReference())
0359: result.append(readPeReference());
0360: else if (isReference())
0361: result.append(readReference());
0362: else
0363: result.append(readChar());
0364: }
0365: readChar(quote);
0366: return result.toString();
0367: }
0368:
0369: boolean isEntityValue() throws ParseException, IOException {
0370: return isChar(QUOTE_CHARS);
0371: }
0372:
0373: /** [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") */
0374: void readSystemLiteral() throws ParseException, IOException {
0375: char quote = readChar();
0376: while (peekChar() != quote)
0377: readChar();
0378: readChar(quote);
0379: }
0380:
0381: /** [12] PubidLiteral
0382: ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
0383: */
0384: void readPubidLiteral() throws ParseException, IOException {
0385: //APPROXIMATION
0386: readSystemLiteral();
0387: }
0388:
0389: private boolean isMisc() throws ParseException, IOException {
0390: return isComment() || isPi() || isS();
0391: }
0392:
0393: private void readMisc() throws ParseException, IOException {
0394: if (isComment())
0395: readComment();
0396: else if (isPi())
0397: readPi();
0398: else if (isS())
0399: readS();
0400: else
0401: throw new ParseException(this ,
0402: "expecting comment or processing instruction or space");
0403: }
0404:
0405: static private final char[] COMMENT_BEGIN = "<!--".toCharArray();
0406: static private final char[] COMMENT_END = "-->".toCharArray();
0407:
0408: /** [15] Comment
0409: ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
0410: */
0411: void readComment() throws ParseException, IOException {
0412: //This is actually less strict than the spec because it allows
0413: //embedded -- and comments ending with --->
0414: readSymbol(COMMENT_BEGIN);
0415: while (!isSymbol(COMMENT_END))
0416: readChar();
0417: readSymbol(COMMENT_END);
0418: }
0419:
0420: boolean isComment() throws ParseException, IOException {
0421: return isSymbol(COMMENT_BEGIN);
0422: }
0423:
0424: static private final char[] PI_BEGIN = "<?".toCharArray();
0425: static private final char[] QU_END = "?>".toCharArray();
0426:
0427: /** [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>' */
0428: void readPi() throws ParseException, IOException {
0429: //APPROXIMATION -- treat as comment
0430: readSymbol(PI_BEGIN);
0431: while (!isSymbol(QU_END))
0432: readChar();
0433: readSymbol(QU_END);
0434: }
0435:
0436: boolean isPi() throws ParseException, IOException {
0437: return isSymbol(PI_BEGIN);
0438: }
0439:
0440: /** www.w3.org/TR/2000/REC-xml-20001006#NT-prolog */
0441: private void readProlog() throws ParseException,
0442: EncodingMismatchException, IOException {
0443: if (isXmlDecl())
0444: readXmlDecl();
0445: while (isMisc())
0446: readMisc();
0447: if (isDocTypeDecl()) {
0448: readDocTypeDecl();
0449: while (isMisc())
0450: readMisc();
0451: }
0452: }
0453:
0454: static private final char[] DOCTYPE_BEGIN = "<!DOCTYPE"
0455: .toCharArray();
0456:
0457: private boolean isDocTypeDecl() throws ParseException, IOException {
0458: return isSymbol(DOCTYPE_BEGIN);
0459: }
0460:
0461: static private final char[] XML_BEGIN = "<?xml".toCharArray();
0462:
0463: /** [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>' */
0464: private void readXmlDecl() throws ParseException,
0465: EncodingMismatchException, IOException {
0466: readSymbol(XML_BEGIN);
0467: readVersionInfo();
0468: if (isS())
0469: readS();
0470: if (isEncodingDecl()) {
0471: String encodingDeclared = readEncodingDecl();
0472: if (encoding_ != null
0473: && !encodingDeclared.equalsIgnoreCase(encoding_))
0474: throw new EncodingMismatchException(systemId_,
0475: encodingDeclared, encoding_);
0476: }
0477: //APPROXIMATION:
0478: while (!isSymbol(QU_END))
0479: readChar();
0480: readSymbol(QU_END);
0481: }
0482:
0483: private boolean isXmlDecl() throws ParseException, IOException {
0484: return isSymbol(XML_BEGIN);
0485: }
0486:
0487: static private final char[] ENCODING = "encoding".toCharArray();
0488:
0489: private boolean isEncodingDecl() throws ParseException, IOException {
0490: return isSymbol(ENCODING);
0491: }
0492:
0493: /** [80] EncodingDecl ::= S 'encoding' Eq
0494: ('"' EncName '"' | "'" EncName "'" )
0495: [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
0496: */
0497: private String readEncodingDecl() throws ParseException,
0498: IOException {
0499: readSymbol(ENCODING);
0500: readEq();
0501: char quote = readChar(QUOTE_CHARS);
0502: StringBuffer result = new StringBuffer();
0503: while (!isChar(quote))
0504: result.append(readChar());
0505: readChar(quote);
0506: return result.toString();
0507: }
0508:
0509: static final char[] QUOTE_CHARS = { '\'', '\"' };
0510: static private final char[] VERSION = "version".toCharArray();
0511:
0512: /** [24] VersionInfo
0513: ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')
0514: */
0515: private void readVersionInfo() throws ParseException, IOException {
0516: readS();
0517: readSymbol(VERSION);
0518: readEq();
0519: char quote = readChar(QUOTE_CHARS);
0520: readVersionNum();
0521: readChar(quote);
0522: }
0523:
0524: /** [25] Eq ::= S? '=' S? */
0525: void readEq() throws ParseException, IOException {
0526: if (isS())
0527: readS();
0528: readChar('=');
0529: if (isS())
0530: readS();
0531: }
0532:
0533: static private final char[] VERSIONNUM_PUNC_CHARS = { '_', '.',
0534: ':', '-' };
0535:
0536: private boolean isVersionNumChar() throws ParseException,
0537: IOException {
0538: char ch = peekChar();
0539: //APPROXIMATION: allows non 7-bit-ASCII letters
0540: return Character.isLetterOrDigit(ch)
0541: || isIn(ch, VERSIONNUM_PUNC_CHARS);
0542: }
0543:
0544: /** [26] VersionNum ::= ([a-zA-Z0-9_.:] | '-')+ */
0545: private void readVersionNum() throws ParseException, IOException {
0546: readChar();
0547: while (isVersionNumChar())
0548: readChar();
0549: }
0550:
0551: /** [28] doctypedecl ::=
0552: * '<!DOCTYPE'
0553: * S
0554: * Name
0555: *(1) ( S ExternalID )?
0556: *(2) S?
0557: *(3) ( '[' (markupdecl|DeclSep)* ']' S? )?
0558: * '>'
0559: */
0560: private void readDocTypeDecl() throws ParseException, IOException {
0561: readSymbol(DOCTYPE_BEGIN);
0562: readS();
0563: docTypeName_ = readName();
0564: if (isS()) {
0565: //either at (1) or (2)
0566: readS();
0567: if (!isChar('>') && !isChar('[')) {
0568: //was at (1)
0569: isExternalDtd_ = true; //less checking of entity references
0570: readExternalId();
0571: //now at (2)
0572: if (isS())
0573: readS();
0574: }
0575: }
0576: //now at (3)
0577: if (isChar('[')) {
0578: readChar();
0579: while (!isChar(']')) {
0580: if (isDeclSep())
0581: readDeclSep();
0582: else
0583: readMarkupDecl();
0584: }
0585: readChar(']');
0586: if (isS())
0587: readS();
0588: }
0589: readChar('>');
0590: }
0591:
0592: /** [28a] DeclSep ::= PEReference | S */
0593: private void readDeclSep() throws ParseException, IOException {
0594: if (isPeReference())
0595: readPeReference();
0596: else
0597: readS();
0598: }
0599:
0600: private boolean isDeclSep() throws ParseException, IOException {
0601: return isPeReference() || isS();
0602: }
0603:
0604: static private final char[] MARKUPDECL_BEGIN = "<!".toCharArray();
0605:
0606: /** [29] markupdecl
0607: ::= elementdecl|AttlistDecl|EntityDecl|NotationDecl|PI|Comment
0608: */
0609: private void readMarkupDecl() throws ParseException, IOException {
0610: if (isPi())
0611: readPi();
0612: else if (isComment())
0613: readComment();
0614: else if (isEntityDecl())
0615: readEntityDecl();
0616: else if (isSymbol(MARKUPDECL_BEGIN)) { // (element-|Attlist-|Entity-|Notation-)Decl
0617: while (!isChar('>')) {
0618: if (isChar(QUOTE_CHARS)) {
0619: char quote = readChar();
0620: while (!isChar(quote))
0621: readChar();
0622: readChar(quote);
0623: } else
0624: readChar();
0625: }
0626: readChar('>');
0627: } else
0628: throw new ParseException(this ,
0629: "expecting processing instruction, comment, or \"<!\"");
0630: }
0631:
0632: static private final char[] CHARREF_BEGIN = "&#".toCharArray();
0633:
0634: /** [66] CharRef ::= '&#' [0-9]+ ';'
0635: */
0636: private char readCharRef() throws ParseException, IOException {
0637: readSymbol(CHARREF_BEGIN);
0638: int radix = 10;
0639: if (isChar('x')) {
0640: readChar();
0641: radix = 16;
0642: }
0643: int i = 0;
0644: while (!isChar(';')) {
0645: tmpBuf_[i++] = readChar();
0646: if (i >= TMP_BUF_SIZE) {
0647: log_.warning("Tmp buffer overflow on readCharRef",
0648: systemId_, getLineNumber());
0649: return ' ';
0650: }
0651: }
0652: readChar(';');
0653: String num = new String(tmpBuf_, 0, i);
0654: try {
0655: return (char) Integer.parseInt(num, radix);
0656: } catch (NumberFormatException e) {
0657: log_.warning("\"" + num + "\" is not a valid "
0658: + (radix == 16 ? "hexadecimal" : "decimal")
0659: + " number", systemId_, getLineNumber());
0660: return ' ';
0661: }
0662: }
0663:
0664: /** [67] Reference ::= EntityRef | CharRef
0665: */
0666: String readReference() throws ParseException, IOException {
0667: if (isSymbol(CHARREF_BEGIN))
0668: return readCharRef() + "";
0669: else
0670: return readEntityRef();
0671: }
0672:
0673: boolean isReference() throws ParseException, IOException {
0674: return isChar('&');
0675: }
0676:
0677: /** [68] EntityRef ::= '&' Name ';'
0678: */
0679: private String readEntityRef() throws ParseException, IOException {
0680: readChar('&');
0681: String name = readName();
0682: String result = (String) entities_.get(name);
0683: //http://www.w3.org/TR/2000/REC-xml-20001006#vc-entdeclared
0684: if (result == null) {
0685: result = "";
0686: if (isExternalDtd_)
0687: log_
0688: .warning(
0689: "&"
0690: + name
0691: + "; not found -- possibly defined in external DTD)",
0692: systemId_, getLineNumber());
0693: else
0694: log_.warning("No declaration of &" + name + ";",
0695: systemId_, getLineNumber());
0696: }
0697: readChar(';');
0698: return result;
0699: }
0700:
0701: /* Old methods
0702: private void appendText(Element element, String string) {
0703: handler_.characters(string);
0704: }
0705:
0706: private void appendText(Element element, char ch){
0707: handler_.character(ch);
0708: }
0709: */
0710:
0711: /** [69] PEReference ::= '%' Name ';' */
0712: private String readPeReference() throws ParseException, IOException {
0713: readChar('%');
0714: String name = readName();
0715: String result = (String) pes_.get(name);
0716: //http://www.w3.org/TR/2000/REC-xml-20001006#vc-entdeclared
0717: if (result == null) {
0718: result = "";
0719: log_.warning("No declaration of %" + name + ";", systemId_,
0720: getLineNumber());
0721: }
0722: readChar(';');
0723: return result;
0724: }
0725:
0726: private boolean isPeReference() throws ParseException, IOException {
0727: return isChar('%');
0728: }
0729:
0730: static private final char[] ENTITY_BEGIN = "<!ENTITY".toCharArray();
0731: static private final char[] NDATA = "NDATA".toCharArray();
0732:
0733: /** [70] EntityDecl ::= GEDecl | PEDecl
0734: [71] GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>'
0735: [72] PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>'
0736: [73] EntityDef ::= EntityValue | (ExternalID NDataDecl?)
0737: [74] PEDef ::= EntityValue | ExternalID
0738: [76] NDataDecl ::= S 'NDATA' S Name
0739: */
0740: private void readEntityDecl() throws ParseException, IOException {
0741: readSymbol(ENTITY_BEGIN);
0742: readS();
0743: if (isChar('%')) {
0744: readChar('%');
0745: readS();
0746: String name = readName();
0747: readS();
0748: String value;
0749: if (isEntityValue())
0750: value = readEntityValue();
0751: else
0752: value = readExternalId();
0753: pes_.put(name, value);
0754: } else {
0755: String name = readName();
0756: readS();
0757: String value;
0758: if (isEntityValue())
0759: value = readEntityValue();
0760: else if (isExternalId()) {
0761: value = readExternalId();
0762: if (isS())
0763: readS();
0764: if (isSymbol(NDATA)) {
0765: readSymbol(NDATA);
0766: readS();
0767: readName();
0768: }
0769: } else
0770: throw new ParseException(
0771: this ,
0772: "expecting double-quote, \"PUBLIC\" or \"SYSTEM\" while reading entity declaration");
0773: entities_.put(name, value);
0774: }
0775: if (isS())
0776: readS();
0777: readChar('>');
0778: }
0779:
0780: private boolean isEntityDecl() throws ParseException, IOException {
0781: return isSymbol(ENTITY_BEGIN);
0782: }
0783:
0784: static private final char[] SYSTEM = "SYSTEM".toCharArray();
0785: static private final char[] PUBLIC = "PUBLIC".toCharArray();
0786:
0787: /** [75] ExternalID ::=
0788: * 'SYSTEM' S SystemLiteral
0789: * | 'PUBLIC' S PubidLiteral S SystemLiteral
0790: */
0791: private String readExternalId() throws ParseException, IOException {
0792: if (isSymbol(SYSTEM))
0793: readSymbol(SYSTEM);
0794: else if (isSymbol(PUBLIC)) {
0795: readSymbol(PUBLIC);
0796: readS();
0797: readPubidLiteral();
0798: } else
0799: throw new ParseException(this ,
0800: "expecting \"SYSTEM\" or \"PUBLIC\" while reading external ID");
0801: readS();
0802: readSystemLiteral();
0803: return "(WARNING: external ID not read)"; //not implemented
0804: }
0805:
0806: private boolean isExternalId() throws ParseException, IOException {
0807: return isSymbol(SYSTEM) || isSymbol(PUBLIC);
0808: }
0809:
0810: void readSymbol(char[] expected) throws ParseException, IOException {
0811: int n = expected.length;
0812: if (endPos_ - curPos_ < n) {
0813: if (fillBuf(n) <= 0) {
0814: ch_ = -1;
0815: throw new ParseException(this , "end of XML file",
0816: expected);
0817: }
0818: }
0819: ch_ = cbuf_[endPos_ - 1];
0820:
0821: if (endPos_ - curPos_ < n)
0822: throw new ParseException(this , "end of XML file", expected);
0823:
0824: //compare actual with expected
0825: for (int i = 0; i < n; ++i) {
0826: if (H_DEBUG)
0827: history_.addChar(cbuf_[curPos_ + i]);
0828:
0829: if (cbuf_[curPos_ + i] != expected[i])
0830: throw new ParseException(this , new String(cbuf_,
0831: curPos_, n), expected);
0832: }
0833:
0834: curPos_ += n;
0835: }
0836:
0837: boolean isSymbol(char[] expected) throws ParseException,
0838: IOException {
0839: int n = expected.length;
0840: if (endPos_ - curPos_ < n) {
0841: if (fillBuf(n) <= 0) {
0842: ch_ = -1;
0843: return false;
0844: }
0845: }
0846: ch_ = cbuf_[endPos_ - 1];
0847:
0848: if (endPos_ - curPos_ < n)
0849: return false;
0850:
0851: //compare actual with expected
0852: //int startPos = curPos_;
0853: for (int i = 0; i < n; ++i)
0854: if (cbuf_[curPos_ + i] != expected[i])
0855: return false;
0856:
0857: return true;
0858: }
0859:
0860: //////////////////////////////////////////////////////////////
0861: /** [10] AttValue ::= '"' ([^<&"] | Reference)* '"'
0862: * | "'" ([^<&'] | Reference)* "'"
0863: */
0864: private String readAttValue() throws ParseException, IOException {
0865: char quote = readChar(QUOTE_CHARS);
0866: StringBuffer result = new StringBuffer();
0867: while (!isChar(quote)) {
0868: if (isReference())
0869: result.append(readReference());
0870: else
0871: result.append(readChar());
0872: }
0873: readChar(quote);
0874: return result.toString();
0875: }
0876:
0877: static private final char[] BEGIN_CDATA = "<![CDATA[".toCharArray();
0878: static private final char[] END_CDATA = "]]>".toCharArray();
0879:
0880: /** [14] CharData ::= [^<&]* - (
0881: * [^<&]*
0882: * ']]>'
0883: * [^<&]*
0884: * )
0885: */
0886: private void readPossibleCharData(Element element)
0887: throws ParseException, IOException {
0888: int i = 0;
0889: //StringBuffer buf = new StringBuffer();
0890: while (!isChar('<') && !isChar('&') && !isSymbol(END_CDATA)) {
0891:
0892: tmpBuf_[i] = readChar();
0893:
0894: //convert DOS line endings to UNIX
0895: if (tmpBuf_[i] == '\r' && peekChar() == '\n')
0896: tmpBuf_[i] = readChar();
0897:
0898: //convert DOS line endings to UNIX
0899: /*if( ch == '\n' ){
0900: int iLast = buf.length()-1;
0901: if( iLast>=0 && buf.charAt(iLast)=='\r' )
0902: buf.deleteCharAt(iLast);
0903: }*/
0904:
0905: //buf.append( ch );
0906: i++;
0907: if (i >= TMP_BUF_SIZE) {
0908: handler_.characters(tmpBuf_, 0, i);
0909: i = 0;
0910: }
0911: // appendText( element, ch );
0912: }
0913: if (i > 0)
0914: handler_.characters(tmpBuf_, 0, i);
0915:
0916: //if( buf.length() > 0 )
0917: // appendText( element, buf.toString() );
0918: }
0919:
0920: /**
0921: * [18] CDSect ::= CDStart CData CDEnd
0922: * [19] CDStart ::= '<![CDATA['
0923: * [20] CData ::= (Char* - (Char* ']]>' Char*))
0924: * [21] CDEnd ::= ']]>'
0925: */
0926: private void readCdSect(Element element) throws ParseException,
0927: IOException {
0928: StringBuffer result = null;
0929: readSymbol(BEGIN_CDATA);
0930: int i = 0;
0931: while (!isSymbol(END_CDATA)) {
0932: if (i >= TMP_BUF_SIZE) {
0933: if (result == null) {
0934: result = new StringBuffer(i);
0935: result.append(tmpBuf_, 0, i);
0936: } else
0937: result.append(tmpBuf_, 0, i);
0938: i = 0;
0939: }
0940: tmpBuf_[i++] = readChar();
0941: }
0942: readSymbol(END_CDATA);
0943:
0944: if (result != null) {
0945: result.append(tmpBuf_, 0, i);
0946: char[] cdSect = result.toString().toCharArray();
0947: handler_.characters(cdSect, 0, cdSect.length);
0948: } else {
0949: handler_.characters(tmpBuf_, 0, i);
0950: }
0951:
0952: /* Old style
0953: StringBuffer buf = new StringBuffer();
0954: readSymbol(BEGIN_CDATA);
0955: while( !isSymbol(END_CDATA) )
0956: buf.append( readChar() );
0957: readSymbol(END_CDATA);
0958: if( buf.length() > 0 ) {
0959: char cdSect[] = buf.toString().toCharArray();
0960: handler_.characters(cdSect, 0, cdSect.length);
0961: }
0962: */
0963: }
0964:
0965: private boolean isCdSect() throws ParseException, IOException {
0966: return isSymbol(BEGIN_CDATA);
0967: }
0968:
0969: /** Parse element using stream in document.
0970: * [39] element ::= EmptyElemTag
0971: * | STag content ETag
0972: */
0973: Element readElement(Element parentElement) throws ParseException,
0974: IOException {
0975: Element element = new Element();
0976:
0977: boolean isSTag = readEmptyElementTagOrSTag(element);
0978:
0979: handler_.startElement(element);
0980:
0981: if (isSTag) {
0982: readContent(element);
0983: readETag(element);
0984: }
0985:
0986: handler_.endElement(element);
0987:
0988: //element.normalize();
0989: return element;
0990: }
0991:
0992: ParseLog getLog() {
0993: return log_;
0994: }
0995:
0996: static private final char[] ENDTAG_CHARS = { '/', '>' };
0997: static private final char[] END_EMPTYTAG = "/>".toCharArray();
0998:
0999: /** Return if this is a STag
1000: * [40] STag ::= '<' Name (S Attribute)* S? '>'
1001: * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
1002: */
1003: private boolean readEmptyElementTagOrSTag(Element element)
1004: throws ParseException, IOException {
1005: readChar('<');
1006: element.setTagName(readName());
1007: while (isS()) {
1008: readS();
1009: if (!isChar(ENDTAG_CHARS))
1010: readAttribute(element);
1011: }
1012: if (isS())
1013: readS();
1014: boolean isSTag = isChar('>');
1015: if (isSTag)
1016: readChar('>');
1017: else
1018: readSymbol(END_EMPTYTAG);
1019: return isSTag;
1020:
1021: }
1022:
1023: /** [41] Attribute ::= Name Eq AttValue */
1024: private void readAttribute(Element element) throws ParseException,
1025: IOException {
1026: String name = readName();
1027: readEq();
1028: String value = readAttValue();
1029: //http://www.w3.org/TR/2000/REC-xml-20001006#uniqattspec
1030: if (element.getAttribute(name) != null)
1031: log_.warning("Element " + this + " contains attribute "
1032: + name + "more than once", systemId_,
1033: getLineNumber());
1034: element.setAttribute(name, value);
1035: }
1036:
1037: static private final char[] BEGIN_ETAG = "</".toCharArray();
1038:
1039: /** [42] ETag ::= '</' Name S? '>' */
1040: private void readETag(Element element) throws ParseException,
1041: IOException {
1042: readSymbol(BEGIN_ETAG);
1043: String name = readName();
1044: //http://www.w3.org/TR/2000/REC-xml-20001006#GIMatch
1045: if (!name.equals(element.getTagName()))
1046: log_.warning("end tag (" + name
1047: + ") does not match begin tag ("
1048: + element.getTagName() + ")", systemId_,
1049: getLineNumber());
1050: if (isS())
1051: readS();
1052: readChar('>');
1053: }
1054:
1055: private boolean isETag() throws ParseException, IOException {
1056: return isSymbol(BEGIN_ETAG);
1057: }
1058:
1059: /** [43] content ::=
1060: * CharData? (
1061: * (element | Reference | CDSect | PI | Comment) CharData?
1062: * )*
1063: */
1064: private void readContent(Element element) throws ParseException,
1065: IOException {
1066: readPossibleCharData(element);
1067: boolean keepGoing = true;
1068: while (keepGoing) {
1069: if (isETag())
1070: keepGoing = false;
1071: else if (isReference()) {
1072: // appendText( element, readReference() );
1073: char ref[] = readReference().toCharArray();
1074: handler_.characters(ref, 0, ref.length);
1075: } else if (isCdSect())
1076: readCdSect(element);
1077: else if (isPi())
1078: readPi();
1079: else if (isComment())
1080: readComment();
1081: else if (isChar('<'))
1082: readElement(element);
1083: else
1084: keepGoing = false;
1085: readPossibleCharData(element);
1086: }
1087:
1088: }
1089:
1090: //////////////////////////////////////////////////////////////
1091:
1092: /**
1093: * @link aggregationByValue
1094: */
1095: private String systemId_; // Temp not final
1096: private String docTypeName_ = null;
1097:
1098: /**
1099: * @link aggregationByValue
1100: */
1101: private final Reader reader_;
1102: private final char[] buf_ = new char[LOOKAHEAD];
1103: private final Map entities_ = new HashMap();
1104: private final Map pes_ = new HashMap();
1105: private final ParseLog log_;
1106: private final String encoding_;
1107: private int ch_ = -2; //last char read
1108: private boolean isExternalDtd_ = false;
1109:
1110: static private final int LOOKAHEAD = 9;
1111:
1112: /**
1113: * Added by Sergio Marti.
1114: */
1115:
1116: /** Replaced PeekReader with character array. 10X speed improvement */
1117: private final int CBUF_SIZE = 1024;
1118: private final char[] cbuf_;
1119: private int curPos_ = 0;
1120: private int endPos_ = 0;
1121: private boolean eos_ = false; // End of stream identifier
1122:
1123: // Empty char buffer used to fill with char data
1124: static private final int TMP_BUF_SIZE = 255;
1125: private final char tmpBuf_[] = new char[TMP_BUF_SIZE];
1126:
1127: // Debug information
1128: private int lineNumber_ = -1;
1129: private final CharCircBuffer history_;
1130: static public final int HISTORY_LENGTH = 100;
1131:
1132: // SAX Parser like handler.
1133: private final ParseHandler handler_;
1134:
1135: }
1136:
1137: // $Log: ParseCharStream.java,v $
1138: // Revision 1.4 2003/01/27 23:30:58 yuhongx
1139: // Replaced Hashtable with HashMap.
1140: //
1141: // Revision 1.3 2002/11/06 02:57:59 eobrain
1142: // Organize imports to remove unused imports. Remove some unused local variables.
1143: //
1144: // Revision 1.2 2002/08/21 20:18:12 eobrain
1145: // Ignore case when comparing encodings.
1146: //
1147: // Revision 1.1.1.1 2002/08/19 05:04:00 eobrain
1148: // import from HP Labs internal CVS
1149: //
1150: // Revision 1.17 2002/08/18 04:36:59 eob
1151: // Make interface package-private so as not to clutter up the javadoc.
1152: //
1153: // Revision 1.16 2002/08/17 00:54:14 sermarti
1154: //
1155: // Revision 1.15 2002/08/15 23:40:22 sermarti
1156: //
1157: // Revision 1.14 2002/08/05 20:04:32 sermarti
1158: //
1159: // Revision 1.13 2002/08/01 23:36:52 sermarti
1160: // Sparta minor update: Now with debug really enabled.
1161: //
1162: // Revision 1.12 2002/08/01 23:29:17 sermarti
1163: // Much faster Sparta parsing.
1164: // Has debug features enabled by default. Currently toggled
1165: // in ParseCharStream.java and recompiled.
1166: //
1167: // Revision 1.11 2002/07/25 21:10:15 sermarti
1168: // Adding files that mysteriously weren't added from Sparta before.
1169: //
1170: // Revision 1.10 2002/05/23 21:28:25 eob
1171: // Make misc optimizations because performance profiling showed that this
1172: // class is heavily used. Avoid use char arrays instead of strings in
1173: // symbol comparison. Remove deprecated methods.
1174: //
1175: // Revision 1.9 2002/05/09 16:50:06 eob
1176: // Add history for better error reporting.
1177: //
1178: // Revision 1.8 2002/03/21 23:52:21 eob
1179: // Deprecate functionality moved to Parser facade class.
1180: //
1181: // Revision 1.7 2002/02/23 02:06:51 eob
1182: // Add constructor that takes a File.
1183: //
1184: // Revision 1.6 2002/02/06 23:32:40 eob
1185: // Better error message.
1186: //
1187: // Revision 1.5 2002/02/01 21:56:23 eob
1188: // Tweak error messages. Add no-log constructor.
1189: //
1190: // Revision 1.4 2002/01/09 00:53:02 eob
1191: // Formatting changes only.
1192: //
1193: // Revision 1.3 2002/01/09 00:48:24 eob
1194: // Replace well-formed errors with warnings.
1195: //
1196: // Revision 1.2 2002/01/08 19:56:51 eob
1197: // Comment change only.
1198: //
1199: // Revision 1.1 2002/01/08 19:29:31 eob
1200: // Factored out ParseSource functionality into ParseCharStream and
1201: // ParseByteStream.
|