0001: // XmlParser.java: the main parser class.
0002: // NO WARRANTY! See README, and copyright below.
0003: // $Id: XmlParser.java 4972 2004-02-22 20:00:54Z spestov $
0004:
0005: package com.microstar.xml;
0006:
0007: import java.io.BufferedInputStream;
0008: import java.io.EOFException;
0009: import java.io.InputStream;
0010: import java.io.Reader;
0011: import java.net.URL;
0012: import java.net.URLConnection;
0013: import java.util.Enumeration;
0014: import java.util.Hashtable;
0015: import java.util.Stack;
0016:
0017: /**
0018: * Parse XML documents and return parse events through call-backs.
0019: * <p>You need to define a class implementing the <code>XmlHandler</code>
0020: * interface: an object belonging to this class will receive the
0021: * callbacks for the events. (As an alternative to implementing
0022: * the full XmlHandler interface, you can simply extend the
0023: * <code>HandlerBase</code> convenience class.)
0024: * <p>Usage (assuming that <code>MyHandler</code> is your implementation
0025: * of the <code>XmlHandler</code> interface):
 * <pre>
 * XmlHandler handler = new MyHandler();
 * XmlParser parser = new XmlParser();
 * parser.setHandler(handler);
 * try {
 *   parser.parse("http://www.host.com/doc.xml", null, null);
 * } catch (Exception e) {
 *   [do something interesting]
 * }
 * </pre>
0036: * <p>Alternatively, you can use the standard SAX interfaces
0037: * with the <code>SAXDriver</code> class as your entry point.
0038: * @author Copyright (c) 1997, 1998 by Microstar Software Ltd.
 * @author Written by David Megginson &lt;dmeggins@microstar.com&gt;
0040: * @version 1.1
0041: * @see XmlHandler
0042: * @see HandlerBase
0043: * @see SAXDriver
0044: */
0045: public class XmlParser {
0046:
0047: //
0048: // Use special cheats that speed up the code (currently about 50%),
0049: // but may cause problems with future maintenance and add to the
0050: // class file size (about 500 bytes).
0051: //
0052: private final static boolean USE_CHEATS = true;
0053:
0054: //////////////////////////////////////////////////////////////////////
0055: // Constructors.
0056: ////////////////////////////////////////////////////////////////////////
0057:
0058: /**
0059: * Construct a new parser with no associated handler.
0060: * @see #setHandler
0061: * @see #parse
0062: */
0063: public XmlParser() {
0064: }
0065:
0066: /**
0067: * Set the handler that will receive parsing events.
0068: * @param handler The handler to receive callback events.
0069: * @see #parse
0070: * @see XmlHandler
0071: */
0072: public void setHandler(XmlHandler handler) {
0073: this .handler = handler;
0074: }
0075:
0076: /**
0077: * Parse an XML document from a URI.
0078: * <p>You may parse a document more than once, but only one thread
0079: * may call this method for an object at one time.
0080: * @param systemId The URI of the document.
0081: * @param publicId The public identifier of the document, or null.
0082: * @param encoding The suggested encoding, or null if unknown.
0083: * @exception java.lang.Exception Any exception thrown by your
0084: * own handlers, or any derivation of java.io.IOException
0085: * thrown by the parser itself.
0086: */
0087: public void parse(String systemId, String publicId, String encoding)
0088: throws java.lang.Exception {
0089: doParse(systemId, publicId, null, null, encoding);
0090: }
0091:
0092: /**
0093: * Parse an XML document from a byte stream.
0094: * <p>The URI that you supply will become the base URI for
0095: * resolving relative links, but Ælfred will actually read
0096: * the document from the supplied input stream.
0097: * <p>You may parse a document more than once, but only one thread
0098: * may call this method for an object at one time.
0099: * @param systemId The base URI of the document, or null if not
0100: * known.
0101: * @param publicId The public identifier of the document, or null
0102: * if not known.
0103: * @param stream A byte input stream.
0104: * @param encoding The suggested encoding, or null if unknown.
0105: * @exception java.lang.Exception Any exception thrown by your
0106: * own handlers, or any derivation of java.io.IOException
0107: * thrown by the parser itself.
0108: */
0109: public void parse(String systemId, String publicId,
0110: InputStream stream, String encoding)
0111: throws java.lang.Exception {
0112: doParse(systemId, publicId, null, stream, encoding);
0113: }
0114:
0115: /**
0116: * Parse an XML document from a character stream.
0117: * <p>The URI that you supply will become the base URI for
0118: * resolving relative links, but Ælfred will actually read
0119: * the document from the supplied input stream.
0120: * <p>You may parse a document more than once, but only one thread
0121: * may call this method for an object at one time.
0122: * @param systemId The base URI of the document, or null if not
0123: * known.
0124: * @param publicId The public identifier of the document, or null
0125: * if not known.
0126: * @param reader A character stream.
0127: * @exception java.lang.Exception Any exception thrown by your
0128: * own handlers, or any derivation of java.io.IOException
0129: * thrown by the parser itself.
0130: */
0131: public void parse(String systemId, String publicId, Reader reader)
0132: throws java.lang.Exception {
0133: doParse(systemId, publicId, reader, null, null);
0134: }
0135:
0136: private synchronized void doParse(String systemId, String publicId,
0137: Reader reader, InputStream stream, String encoding)
0138: throws java.lang.Exception {
0139: basePublicId = publicId;
0140: baseURI = systemId;
0141: baseReader = reader;
0142: baseInputStream = stream;
0143:
0144: initializeVariables();
0145:
0146: // Set the default entities here.
0147: setInternalEntity(intern("amp"), "&");
0148: setInternalEntity(intern("lt"), "<");
0149: setInternalEntity(intern("gt"), ">");
0150: setInternalEntity(intern("apos"), "'");
0151: setInternalEntity(intern("quot"), """);
0152:
0153: if (handler != null) {
0154: handler.startDocument();
0155: }
0156:
0157: pushURL("[document]", basePublicId, baseURI, baseReader,
0158: baseInputStream, encoding);
0159:
0160: parseDocument();
0161:
0162: if (handler != null) {
0163: handler.endDocument();
0164: }
0165: cleanupVariables();
0166: }
0167:
0168: ////////////////////////////////////////////////////////////////////////
0169: // Constants.
0170: ////////////////////////////////////////////////////////////////////////
0171:
0172: //
0173: // Constants for element content type.
0174: //
0175:
0176: /**
0177: * Constant: an element has not been declared.
0178: * @see #getElementContentType
0179: */
0180: public final static int CONTENT_UNDECLARED = 0;
0181:
0182: /**
0183: * Constant: the element has a content model of ANY.
0184: * @see #getElementContentType
0185: */
0186: public final static int CONTENT_ANY = 1;
0187:
0188: /**
0189: * Constant: the element has declared content of EMPTY.
0190: * @see #getElementContentType
0191: */
0192: public final static int CONTENT_EMPTY = 2;
0193:
0194: /**
0195: * Constant: the element has mixed content.
0196: * @see #getElementContentType
0197: */
0198: public final static int CONTENT_MIXED = 3;
0199:
0200: /**
0201: * Constant: the element has element content.
0202: * @see #getElementContentType
0203: */
0204: public final static int CONTENT_ELEMENTS = 4;
0205:
0206: //
0207: // Constants for the entity type.
0208: //
0209:
0210: /**
0211: * Constant: the entity has not been declared.
0212: * @see #getEntityType
0213: */
0214: public final static int ENTITY_UNDECLARED = 0;
0215:
0216: /**
0217: * Constant: the entity is internal.
0218: * @see #getEntityType
0219: */
0220: public final static int ENTITY_INTERNAL = 1;
0221:
0222: /**
0223: * Constant: the entity is external, non-XML data.
0224: * @see #getEntityType
0225: */
0226: public final static int ENTITY_NDATA = 2;
0227:
0228: /**
0229: * Constant: the entity is external XML data.
0230: * @see #getEntityType
0231: */
0232: public final static int ENTITY_TEXT = 3;
0233:
0234: //
0235: // Constants for attribute type.
0236: //
0237:
0238: /**
0239: * Constant: the attribute has not been declared for this element type.
0240: * @see #getAttributeType
0241: */
0242: public final static int ATTRIBUTE_UNDECLARED = 0;
0243:
0244: /**
0245: * Constant: the attribute value is a string value.
0246: * @see #getAttributeType
0247: */
0248: public final static int ATTRIBUTE_CDATA = 1;
0249:
0250: /**
0251: * Constant: the attribute value is a unique identifier.
0252: * @see #getAttributeType
0253: */
0254: public final static int ATTRIBUTE_ID = 2;
0255:
0256: /**
0257: * Constant: the attribute value is a reference to a unique identifier.
0258: * @see #getAttributeType
0259: */
0260: public final static int ATTRIBUTE_IDREF = 3;
0261:
0262: /**
0263: * Constant: the attribute value is a list of ID references.
0264: * @see #getAttributeType
0265: */
0266: public final static int ATTRIBUTE_IDREFS = 4;
0267:
0268: /**
0269: * Constant: the attribute value is the name of an entity.
0270: * @see #getAttributeType
0271: */
0272: public final static int ATTRIBUTE_ENTITY = 5;
0273:
0274: /**
0275: * Constant: the attribute value is a list of entity names.
0276: * @see #getAttributeType
0277: */
0278: public final static int ATTRIBUTE_ENTITIES = 6;
0279:
0280: /**
0281: * Constant: the attribute value is a name token.
0282: * @see #getAttributeType
0283: */
0284: public final static int ATTRIBUTE_NMTOKEN = 7;
0285:
0286: /**
0287: * Constant: the attribute value is a list of name tokens.
0288: * @see #getAttributeType
0289: */
0290: public final static int ATTRIBUTE_NMTOKENS = 8;
0291:
0292: /**
0293: * Constant: the attribute value is a token from an enumeration.
0294: * @see #getAttributeType
0295: */
0296: public final static int ATTRIBUTE_ENUMERATED = 9;
0297:
0298: /**
0299: * Constant: the attribute is the name of a notation.
0300: * @see #getAttributeType
0301: */
0302: public final static int ATTRIBUTE_NOTATION = 10;
0303:
0304: //
0305: // When the class is loaded, populate the hash table of
0306: // attribute types.
0307: //
0308:
0309: /**
0310: * Hash table of attribute types.
0311: */
0312: private static Hashtable attributeTypeHash;
0313: static {
0314: attributeTypeHash = new Hashtable();
0315: attributeTypeHash.put("CDATA", new Integer(ATTRIBUTE_CDATA));
0316: attributeTypeHash.put("ID", new Integer(ATTRIBUTE_ID));
0317: attributeTypeHash.put("IDREF", new Integer(ATTRIBUTE_IDREF));
0318: attributeTypeHash.put("IDREFS", new Integer(ATTRIBUTE_IDREFS));
0319: attributeTypeHash.put("ENTITY", new Integer(ATTRIBUTE_ENTITY));
0320: attributeTypeHash.put("ENTITIES", new Integer(
0321: ATTRIBUTE_ENTITIES));
0322: attributeTypeHash
0323: .put("NMTOKEN", new Integer(ATTRIBUTE_NMTOKEN));
0324: attributeTypeHash.put("NMTOKENS", new Integer(
0325: ATTRIBUTE_NMTOKENS));
0326: attributeTypeHash.put("NOTATION", new Integer(
0327: ATTRIBUTE_NOTATION));
0328: }
0329:
0330: //
0331: // Constants for supported encodings.
0332: //
0333: private final static int ENCODING_UTF_8 = 1;
0334: private final static int ENCODING_ISO_8859_1 = 2;
0335: private final static int ENCODING_UCS_2_12 = 3;
0336: private final static int ENCODING_UCS_2_21 = 4;
0337: private final static int ENCODING_UCS_4_1234 = 5;
0338: private final static int ENCODING_UCS_4_4321 = 6;
0339: private final static int ENCODING_UCS_4_2143 = 7;
0340: private final static int ENCODING_UCS_4_3412 = 8;
0341:
0342: //
0343: // Constants for attribute default value.
0344: //
0345:
0346: /**
0347: * Constant: the attribute is not declared.
0348: * @see #getAttributeDefaultValueType
0349: */
0350: public final static int ATTRIBUTE_DEFAULT_UNDECLARED = 0;
0351:
0352: /**
0353: * Constant: the attribute has a literal default value specified.
0354: * @see #getAttributeDefaultValueType
0355: * @see #getAttributeDefaultValue
0356: */
0357: public final static int ATTRIBUTE_DEFAULT_SPECIFIED = 1;
0358:
0359: /**
0360: * Constant: the attribute was declared #IMPLIED.
0361: * @see #getAttributeDefaultValueType
0362: */
0363: public final static int ATTRIBUTE_DEFAULT_IMPLIED = 2;
0364:
0365: /**
0366: * Constant: the attribute was declared #REQUIRED.
0367: * @see #getAttributeDefaultValueType
0368: */
0369: public final static int ATTRIBUTE_DEFAULT_REQUIRED = 3;
0370:
0371: /**
0372: * Constant: the attribute was declared #FIXED.
0373: * @see #getAttributeDefaultValueType
0374: * @see #getAttributeDefaultValue
0375: */
0376: public final static int ATTRIBUTE_DEFAULT_FIXED = 4;
0377:
0378: //
0379: // Constants for input.
0380: //
0381: private final static int INPUT_NONE = 0;
0382: private final static int INPUT_INTERNAL = 1;
0383: private final static int INPUT_EXTERNAL = 2;
0384: private final static int INPUT_STREAM = 3;
0385: private final static int INPUT_BUFFER = 4;
0386: private final static int INPUT_READER = 5;
0387:
0388: //
0389: // Flags for reading literals.
0390: //
0391: private final static int LIT_CHAR_REF = 1;
0392: private final static int LIT_ENTITY_REF = 2;
0393: private final static int LIT_PE_REF = 4;
0394: private final static int LIT_NORMALIZE = 8;
0395:
0396: //
0397: // Flags for parsing context.
0398: //
0399: private final static int CONTEXT_NONE = 0;
0400: private final static int CONTEXT_DTD = 1;
0401: private final static int CONTEXT_ENTITYVALUE = 2;
0402: private final static int CONTEXT_ATTRIBUTEVALUE = 3;
0403:
0404: //////////////////////////////////////////////////////////////////////
0405: // Error reporting.
0406: //////////////////////////////////////////////////////////////////////
0407:
0408: /**
0409: * Report an error.
0410: * @param message The error message.
0411: * @param textFound The text that caused the error (or null).
0412: * @see XmlHandler#error
0413: * @see #line
0414: */
0415: void error(String message, String textFound, String textExpected)
0416: throws java.lang.Exception {
0417: errorCount++;
0418: if (textFound != null) {
0419: message = message + " (found \"" + textFound + "\")";
0420: }
0421: if (textExpected != null) {
0422: message = message + " (expected \"" + textExpected + "\")";
0423: }
0424: if (handler != null) {
0425: String uri = null;
0426:
0427: if (externalEntity != null) {
0428: uri = externalEntity.getURL().toString();
0429: }
0430: handler.error(message, uri, line, column);
0431: }
0432: }
0433:
0434: /**
0435: * Report a serious error.
0436: * @param message The error message.
0437: * @param textFound The text that caused the error (or null).
0438: */
0439: void error(String message, char textFound, String textExpected)
0440: throws java.lang.Exception {
0441: error(message, new Character(textFound).toString(),
0442: textExpected);
0443: }
0444:
0445: //////////////////////////////////////////////////////////////////////
0446: // Major syntactic productions.
0447: //////////////////////////////////////////////////////////////////////
0448:
0449: /**
0450: * Parse an XML document.
0451: * <pre>
0452: * [1] document ::= prolog element Misc*
0453: * </pre>
0454: * <p>This is the top-level parsing function for a single XML
0455: * document. As a minimum, a well-formed document must have
0456: * a document element, and a valid document must have a prolog
0457: * as well.
0458: */
0459: void parseDocument() throws java.lang.Exception {
0460: char c;
0461:
0462: parseProlog();
0463: require('<');
0464: parseElement();
0465: try {
0466: parseMisc(); //skip all white, PIs, and comments
0467: c = readCh(); //if this doesn't throw an exception...
0468: error("unexpected characters after document end", c, null);
0469: } catch (EOFException e) {
0470: return;
0471: }
0472: }
0473:
0474: /**
0475: * Skip a comment.
0476: * <pre>
0477: * [18] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* "-->"
0478: * </pre>
0479: * <p>(The <code><!--</code> has already been read.)
0480: */
0481: void parseComment() throws java.lang.Exception {
0482: skipUntil("-->");
0483: }
0484:
0485: /**
0486: * Parse a processing instruction and do a call-back.
0487: * <pre>
0488: * [19] PI ::= '<?' Name (S (Char* - (Char* '?>' Char*)))? '?>'
0489: * </pre>
0490: * <p>(The <code><?</code> has already been read.)
0491: * <p>An XML processing instruction <em>must</em> begin with
0492: * a Name, which is the instruction's target.
0493: */
0494: void parsePI() throws java.lang.Exception {
0495: String name;
0496:
0497: name = readNmtoken(true);
0498: if (!tryRead("?>")) {
0499: requireWhitespace();
0500: parseUntil("?>");
0501: }
0502: if (handler != null) {
0503: handler.processingInstruction(name, dataBufferToString());
0504: }
0505: }
0506:
0507: /**
0508: * Parse a CDATA marked section.
0509: * <pre>
0510: * [20] CDSect ::= CDStart CData CDEnd
0511: * [21] CDStart ::= '<![CDATA['
0512: * [22] CData ::= (Char* - (Char* ']]>' Char*))
0513: * [23] CDEnd ::= ']]>'
0514: * </pre>
0515: * <p>(The '<![CDATA[' has already been read.)
0516: * <p>Note that this just appends characters to the dataBuffer,
0517: * without actually generating an event.
0518: */
0519: void parseCDSect() throws java.lang.Exception {
0520: parseUntil("]]>");
0521: }
0522:
0523: /**
0524: * Parse the prolog of an XML document.
0525: * <pre>
0526: * [24] prolog ::= XMLDecl? Misc* (Doctypedecl Misc*)?
0527: * </pre>
0528: * <p>There are a couple of tricks here. First, it is necessary to
0529: * declare the XML default attributes after the DTD (if present)
0530: * has been read. Second, it is not possible to expand general
0531: * references in attribute value literals until after the entire
0532: * DTD (if present) has been parsed.
0533: * <p>We do not look for the XML declaration here, because it is
0534: * handled by pushURL().
0535: * @see pushURL
0536: */
0537: void parseProlog() throws java.lang.Exception {
0538: parseMisc();
0539:
0540: if (tryRead("<!DOCTYPE")) {
0541: parseDoctypedecl();
0542: parseMisc();
0543: }
0544: }
0545:
0546: /**
0547: * Parse the XML declaration.
0548: * <pre>
0549: * [25] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
0550: * [26] VersionInfo ::= S 'version' Eq ('"1.0"' | "'1.0'")
0551: * [33] SDDecl ::= S 'standalone' Eq "'" ('yes' | 'no') "'"
0552: * | S 'standalone' Eq '"' ("yes" | "no") '"'
0553: * [78] EncodingDecl ::= S 'encoding' Eq QEncoding
0554: * </pre>
0555: * <p>([80] to [82] are also significant.)
0556: * <p>(The <code><?xml</code> and whitespace have already been read.)
0557: * <p>TODO: validate value of standalone.
0558: * @see #parseTextDecl
0559: * @see #checkEncoding
0560: */
0561: void parseXMLDecl(boolean ignoreEncoding)
0562: throws java.lang.Exception {
0563: String version;
0564: String encodingName = null;
0565: String standalone = null;
0566:
0567: // Read the version.
0568: require("version");
0569: parseEq();
0570: version = readLiteral(0);
0571: if (!version.equals("1.0")) {
0572: error("unsupported XML version", version, "1.0");
0573: }
0574:
0575: // Try reading an encoding declaration.
0576: skipWhitespace();
0577: if (tryRead("encoding")) {
0578: parseEq();
0579: encodingName = readLiteral(0);
0580: checkEncoding(encodingName, ignoreEncoding);
0581: }
0582:
0583: // Try reading a standalone declaration
0584: skipWhitespace();
0585: if (tryRead("standalone")) {
0586: parseEq();
0587: standalone = readLiteral(0);
0588: }
0589:
0590: skipWhitespace();
0591: require("?>");
0592: }
0593:
0594: /**
0595: * Parse the Encoding PI.
0596: * <pre>
0597: * [78] EncodingDecl ::= S 'encoding' Eq QEncoding
0598: * [79] EncodingPI ::= '<?xml' S 'encoding' Eq QEncoding S? '?>'
0599: * [80] QEncoding ::= '"' Encoding '"' | "'" Encoding "'"
0600: * [81] Encoding ::= LatinName
0601: * [82] LatinName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
0602: * </pre>
0603: * <p>(The <code><?xml</code>' and whitespace have already been read.)
0604: * @see #parseXMLDecl
0605: * @see #checkEncoding
0606: */
0607: void parseTextDecl(boolean ignoreEncoding)
0608: throws java.lang.Exception {
0609: String encodingName = null;
0610:
0611: // Read an optional version.
0612: if (tryRead("version")) {
0613: String version;
0614: parseEq();
0615: version = readLiteral(0);
0616: if (!version.equals("1.0")) {
0617: error("unsupported XML version", version, "1.0");
0618: }
0619: requireWhitespace();
0620: }
0621:
0622: // Read the encoding.
0623: require("encoding");
0624: parseEq();
0625: encodingName = readLiteral(0);
0626: checkEncoding(encodingName, ignoreEncoding);
0627:
0628: skipWhitespace();
0629: require("?>");
0630: }
0631:
0632: /**
0633: * Check that the encoding specified makes sense.
0634: * <p>Compare what the author has specified in the XML declaration
0635: * or encoding PI with what we have detected.
0636: * <p>This is also important for distinguishing among the various
0637: * 7- and 8-bit encodings, such as ISO-LATIN-1 (I cannot autodetect
0638: * those).
0639: * @param encodingName The name of the encoding specified by the user.
0640: * @see #parseXMLDecl
0641: * @see #parseTextDecl
0642: */
0643: void checkEncoding(String encodingName, boolean ignoreEncoding)
0644: throws java.lang.Exception {
0645: encodingName = encodingName.toUpperCase();
0646:
0647: if (ignoreEncoding) {
0648: return;
0649: }
0650:
0651: switch (encoding) {
0652: // 8-bit encodings
0653: case ENCODING_UTF_8:
0654: if (encodingName.equals("ISO-8859-1")) {
0655: encoding = ENCODING_ISO_8859_1;
0656: } else if (!encodingName.equals("UTF-8")) {
0657: error("unsupported 8-bit encoding", encodingName,
0658: "UTF-8 or ISO-8859-1");
0659: }
0660: break;
0661: // 16-bit encodings
0662: case ENCODING_UCS_2_12:
0663: case ENCODING_UCS_2_21:
0664: if (!encodingName.equals("ISO-10646-UCS-2")
0665: && !encodingName.equals("UTF-16")) {
0666: error("unsupported 16-bit encoding", encodingName,
0667: "ISO-10646-UCS-2");
0668: }
0669: break;
0670: // 32-bit encodings
0671: case ENCODING_UCS_4_1234:
0672: case ENCODING_UCS_4_4321:
0673: case ENCODING_UCS_4_2143:
0674: case ENCODING_UCS_4_3412:
0675: if (!encodingName.equals("ISO-10646-UCS-4")) {
0676: error("unsupported 32-bit encoding", encodingName,
0677: "ISO-10646-UCS-4");
0678: }
0679: }
0680: }
0681:
0682: /**
0683: * Parse miscellaneous markup outside the document element and DOCTYPE
0684: * declaration.
0685: * <pre>
0686: * [27] Misc ::= Comment | PI | S
0687: * </pre>
0688: */
0689: void parseMisc() throws java.lang.Exception {
0690: while (true) {
0691: skipWhitespace();
0692: if (tryRead("<?")) {
0693: parsePI();
0694: } else if (tryRead("<!--")) {
0695: parseComment();
0696: } else {
0697: return;
0698: }
0699: }
0700: }
0701:
0702: /**
0703: * Parse a document type declaration.
0704: * <pre>
0705: * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
0706: * ('[' %markupdecl* ']' S?)? '>'
0707: * </pre>
0708: * <p>(The <code><!DOCTYPE</code> has already been read.)
0709: */
0710: void parseDoctypedecl() throws java.lang.Exception {
0711: char c;
0712: String doctypeName, ids[];
0713:
0714: // Read the document type name.
0715: requireWhitespace();
0716: doctypeName = readNmtoken(true);
0717:
0718: // Read the ExternalIDs.
0719: skipWhitespace();
0720: ids = readExternalIds(false);
0721:
0722: // Look for a declaration subset.
0723: skipWhitespace();
0724: if (tryRead('[')) {
0725:
0726: // loop until the subset ends
0727: while (true) {
0728: context = CONTEXT_DTD;
0729: skipWhitespace();
0730: context = CONTEXT_NONE;
0731: if (tryRead(']')) {
0732: break; // end of subset
0733: } else {
0734: context = CONTEXT_DTD;
0735: parseMarkupdecl();
0736: context = CONTEXT_NONE;
0737: }
0738: }
0739: }
0740:
0741: // Read the external subset, if any
0742: if (ids[1] != null) {
0743: pushURL("[external subset]", ids[0], ids[1], null, null,
0744: null);
0745:
0746: // Loop until we end up back at '>'
0747: while (true) {
0748: context = CONTEXT_DTD;
0749: skipWhitespace();
0750: context = CONTEXT_NONE;
0751: if (tryRead('>')) {
0752: break;
0753: } else {
0754: context = CONTEXT_DTD;
0755: parseMarkupdecl();
0756: context = CONTEXT_NONE;
0757: }
0758: }
0759: } else {
0760: // No external subset.
0761: skipWhitespace();
0762: require('>');
0763: }
0764:
0765: if (handler != null) {
0766: handler.doctypeDecl(doctypeName, ids[0], ids[1]);
0767: }
0768:
0769: // Expand general entities in
0770: // default values of attributes.
0771: // (Do this after the doctypeDecl
0772: // event!).
0773: // expandAttributeDefaultValues();
0774: }
0775:
0776: /**
0777: * Parse a markup declaration in the internal or external DTD subset.
0778: * <pre>
0779: * [29] markupdecl ::= ( %elementdecl | %AttlistDecl | %EntityDecl |
0780: * %NotationDecl | %PI | %S | %Comment |
0781: * InternalPERef )
0782: * [30] InternalPERef ::= PEReference
0783: * [31] extSubset ::= (%markupdecl | %conditionalSect)*
0784: * </pre>
0785: */
0786: void parseMarkupdecl() throws java.lang.Exception {
0787: if (tryRead("<!ELEMENT")) {
0788: parseElementdecl();
0789: } else if (tryRead("<!ATTLIST")) {
0790: parseAttlistDecl();
0791: } else if (tryRead("<!ENTITY")) {
0792: parseEntityDecl();
0793: } else if (tryRead("<!NOTATION")) {
0794: parseNotationDecl();
0795: } else if (tryRead("<?")) {
0796: parsePI();
0797: } else if (tryRead("<!--")) {
0798: parseComment();
0799: } else if (tryRead("<![")) {
0800: parseConditionalSect();
0801: } else {
0802: error("expected markup declaration", null, null);
0803: }
0804: }
0805:
0806: /**
0807: * Parse an element, with its tags.
0808: * <pre>
0809: * [33] STag ::= '<' Name (S Attribute)* S? '>' [WFC: unique Att spec]
0810: * [38] element ::= EmptyElement | STag content ETag
0811: * [39] EmptyElement ::= '<' Name (S Attribute)* S? '/>'
0812: * [WFC: unique Att spec]
0813: * </pre>
0814: * <p>(The '<' has already been read.)
0815: * <p>NOTE: this method actually chains onto parseContent(), if necessary,
0816: * and parseContent() will take care of calling parseETag().
0817: */
0818: void parseElement() throws java.lang.Exception {
0819: String gi;
0820: char c;
0821: int oldElementContent = currentElementContent;
0822: String oldElement = currentElement;
0823:
0824: // This is the (global) counter for the
0825: // array of specified attributes.
0826: tagAttributePos = 0;
0827:
0828: // Read the element type name.
0829: gi = readNmtoken(true);
0830:
0831: // Determine the current content type.
0832: currentElement = gi;
0833: currentElementContent = getElementContentType(gi);
0834: if (currentElementContent == CONTENT_UNDECLARED) {
0835: currentElementContent = CONTENT_ANY;
0836: }
0837:
0838: // Read the attributes, if any.
0839: // After this loop, we should be just
0840: // in front of the closing delimiter.
0841: skipWhitespace();
0842: c = readCh();
0843: while (c != '/' && c != '>') {
0844: unread(c);
0845: parseAttribute(gi);
0846: skipWhitespace();
0847: c = readCh();
0848: }
0849: unread(c);
0850:
0851: // Supply any defaulted attributes.
0852: Enumeration atts = declaredAttributes(gi);
0853: if (atts != null) {
0854: String aname;
0855: loop: while (atts.hasMoreElements()) {
0856: aname = (String) atts.nextElement();
0857: // See if it was specified.
0858: for (int i = 0; i < tagAttributePos; i++) {
0859: if (tagAttributes[i] == aname) {
0860: continue loop;
0861: }
0862: }
0863: // I guess not...
0864: if (handler != null) {
0865: handler.attribute(aname, getAttributeExpandedValue(
0866: gi, aname), false);
0867: }
0868: }
0869: }
0870:
0871: // Figure out if this is a start tag
0872: // or an empty element, and dispatch an
0873: // event accordingly.
0874: c = readCh();
0875: switch (c) {
0876: case '>':
0877: if (handler != null) {
0878: handler.startElement(gi);
0879: }
0880: parseContent();
0881: break;
0882: case '/':
0883: require('>');
0884: if (handler != null) {
0885: handler.startElement(gi);
0886: handler.endElement(gi);
0887: }
0888: break;
0889: }
0890:
0891: // Restore the previous state.
0892: currentElement = oldElement;
0893: currentElementContent = oldElementContent;
0894: }
0895:
0896: /**
0897: * Parse an attribute assignment.
0898: * <pre>
0899: * [34] Attribute ::= Name Eq AttValue
0900: * </pre>
0901: * @param name The name of the attribute's element.
0902: * @see XmlHandler#attribute
0903: */
0904: void parseAttribute(String name) throws java.lang.Exception {
0905: String aname;
0906: int type;
0907: String value;
0908:
0909: // Read the attribute name.
0910: aname = readNmtoken(true).intern();
0911: type = getAttributeDefaultValueType(name, aname);
0912:
0913: // Parse '='
0914: parseEq();
0915:
0916: // Read the value, normalizing whitespace
0917: // if it is not CDATA.
0918: if (type == ATTRIBUTE_CDATA || type == ATTRIBUTE_UNDECLARED) {
0919: value = readLiteral(LIT_CHAR_REF | LIT_ENTITY_REF);
0920: } else {
0921: value = readLiteral(LIT_CHAR_REF | LIT_ENTITY_REF
0922: | LIT_NORMALIZE);
0923: }
0924:
0925: // Inform the handler about the
0926: // attribute.
0927: if (handler != null) {
0928: handler.attribute(aname, value, true);
0929: }
0930: dataBufferPos = 0;
0931:
0932: // Note that the attribute has been
0933: // specified.
0934: if (tagAttributePos == tagAttributes.length) {
0935: String newAttrib[] = new String[tagAttributes.length * 2];
0936: System.arraycopy(tagAttributes, 0, newAttrib, 0,
0937: tagAttributePos);
0938: tagAttributes = newAttrib;
0939: }
0940: tagAttributes[tagAttributePos++] = aname;
0941: }
0942:
0943: /**
0944: * Parse an equals sign surrounded by optional whitespace.
0945: * [35] Eq ::= S? '=' S?
0946: */
0947: void parseEq() throws java.lang.Exception {
0948: skipWhitespace();
0949: require('=');
0950: skipWhitespace();
0951: }
0952:
0953: /**
0954: * Parse an end tag.
0955: * [36] ETag ::= '</' Name S? '>'
0956: * *NOTE: parseContent() chains to here.
0957: */
0958: void parseETag() throws java.lang.Exception {
0959: String name;
0960: name = readNmtoken(true);
0961: if (name != currentElement) {
0962: error("mismatched end tag", name, currentElement);
0963: }
0964: skipWhitespace();
0965: require('>');
0966: if (handler != null) {
0967: handler.endElement(name);
0968: }
0969: }
0970:
0971: /**
0972: * Parse the content of an element.
0973: * [37] content ::= (element | PCData | Reference | CDSect | PI | Comment)*
0974: * [68] Reference ::= EntityRef | CharRef
0975: */
0976: void parseContent() throws java.lang.Exception {
0977: String data;
0978: char c;
0979:
0980: while (true) {
0981:
0982: switch (currentElementContent) {
0983: case CONTENT_ANY:
0984: case CONTENT_MIXED:
0985: parsePCData();
0986: break;
0987: case CONTENT_ELEMENTS:
0988: parseWhitespace();
0989: break;
0990: }
0991:
0992: // Handle delimiters
0993: c = readCh();
0994: switch (c) {
0995:
0996: case '&': // Found "&"
0997: c = readCh();
0998: if (c == '#') {
0999: parseCharRef();
1000: } else {
1001: unread(c);
1002: parseEntityRef(true);
1003: }
1004: break;
1005:
1006: case '<': // Found "<"
1007:
1008: c = readCh();
1009: switch (c) {
1010:
1011: case '!': // Found "<!"
1012: c = readCh();
1013: switch (c) {
1014: case '-': // Found "<!-"
1015: require('-');
1016: parseComment();
1017: break;
1018: case '[': // Found "<!["
1019: require("CDATA[");
1020: parseCDSect();
1021: break;
1022: default:
1023: error("expected comment or CDATA section", c,
1024: null);
1025: break;
1026: }
1027: break;
1028:
1029: case '?': // Found "<?"
1030: dataBufferFlush();
1031: parsePI();
1032: break;
1033:
1034: case '/': // Found "</"
1035: dataBufferFlush();
1036: parseETag();
1037: return;
1038:
1039: default: // Found "<" followed by something else
1040: dataBufferFlush();
1041: unread(c);
1042: parseElement();
1043: break;
1044: }
1045: }
1046: }
1047: }
1048:
1049: /**
1050: * Parse an element type declaration.
1051: * [40] elementdecl ::= '<!ELEMENT' S %Name S (%S S)? %contentspec S? '>'
1052: * [VC: Unique Element Declaration]
1053: * *NOTE: the '<!ELEMENT' has already been read.
1054: */
1055: void parseElementdecl() throws java.lang.Exception {
1056: String name;
1057:
1058: requireWhitespace();
1059: // Read the element type name.
1060: name = readNmtoken(true);
1061:
1062: requireWhitespace();
1063: // Read the content model.
1064: parseContentspec(name);
1065:
1066: skipWhitespace();
1067: require('>');
1068: }
1069:
1070: /**
1071: * Content specification.
1072: * [41] contentspec ::= 'EMPTY' | 'ANY' | Mixed | elements
1073: */
1074: void parseContentspec(String name) throws java.lang.Exception {
1075: if (tryRead("EMPTY")) {
1076: setElement(name, CONTENT_EMPTY, null, null);
1077: return;
1078: } else if (tryRead("ANY")) {
1079: setElement(name, CONTENT_ANY, null, null);
1080: return;
1081: } else {
1082: require('(');
1083: dataBufferAppend('(');
1084: skipWhitespace();
1085: if (tryRead("#PCDATA")) {
1086: dataBufferAppend("#PCDATA");
1087: parseMixed();
1088: setElement(name, CONTENT_MIXED, dataBufferToString(),
1089: null);
1090: } else {
1091: parseElements();
1092: setElement(name, CONTENT_ELEMENTS,
1093: dataBufferToString(), null);
1094: }
1095: }
1096: }
1097:
1098: /**
1099: * Parse an element-content model.
1100: * [42] elements ::= (choice | seq) ('?' | '*' | '+')?
1101: * [44] cps ::= S? %cp S?
1102: * [45] choice ::= '(' S? %ctokplus (S? '|' S? %ctoks)* S? ')'
1103: * [46] ctokplus ::= cps ('|' cps)+
1104: * [47] ctoks ::= cps ('|' cps)*
1105: * [48] seq ::= '(' S? %stoks (S? ',' S? %stoks)* S? ')'
1106: * [49] stoks ::= cps (',' cps)*
1107: * *NOTE: the opening '(' and S have already been read.
1108: * *TODO: go over parameter entity boundaries more carefully.
1109: */
1110: void parseElements() throws java.lang.Exception {
1111: char c;
1112: char sep;
1113:
1114: // Parse the first content particle
1115: skipWhitespace();
1116: parseCp();
1117:
1118: // Check for end or for a separator.
1119: skipWhitespace();
1120: c = readCh();
1121: switch (c) {
1122: case ')':
1123: dataBufferAppend(')');
1124: c = readCh();
1125: switch (c) {
1126: case '*':
1127: case '+':
1128: case '?':
1129: dataBufferAppend(c);
1130: break;
1131: default:
1132: unread(c);
1133: }
1134: return;
1135: case ',': // Register the separator.
1136: case '|':
1137: sep = c;
1138: dataBufferAppend(c);
1139: break;
1140: default:
1141: error("bad separator in content model", c, null);
1142: return;
1143: }
1144:
1145: // Parse the rest of the content model.
1146: while (true) {
1147: skipWhitespace();
1148: parseCp();
1149: skipWhitespace();
1150: c = readCh();
1151: if (c == ')') {
1152: dataBufferAppend(')');
1153: break;
1154: } else if (c != sep) {
1155: error("bad separator in content model", c, null);
1156: return;
1157: } else {
1158: dataBufferAppend(c);
1159: }
1160: }
1161:
1162: // Check for the occurrence indicator.
1163: c = readCh();
1164: switch (c) {
1165: case '?':
1166: case '*':
1167: case '+':
1168: dataBufferAppend(c);
1169: return;
1170: default:
1171: unread(c);
1172: return;
1173: }
1174: }
1175:
1176: /**
1177: * Parse a content particle.
1178: * [43] cp ::= (Name | choice | seq) ('?' | '*' | '+')
1179: * *NOTE: I actually use a slightly different production here:
1180: * cp ::= (elements | (Name ('?' | '*' | '+')?))
1181: */
1182: void parseCp() throws java.lang.Exception {
1183: char c;
1184:
1185: if (tryRead('(')) {
1186: dataBufferAppend('(');
1187: parseElements();
1188: } else {
1189: dataBufferAppend(readNmtoken(true));
1190: c = readCh();
1191: switch (c) {
1192: case '?':
1193: case '*':
1194: case '+':
1195: dataBufferAppend(c);
1196: break;
1197: default:
1198: unread(c);
1199: break;
1200: }
1201: }
1202: }
1203:
1204: /**
1205: * Parse mixed content.
1206: * [50] Mixed ::= '(' S? %( %'#PCDATA' (S? '|' S? %Mtoks)* ) S? ')*'
1207: * | '(' S? %('#PCDATA') S? ')'
1208: * [51] Mtoks ::= %Name (S? '|' S? %Name)*
1209: * *NOTE: the S and '#PCDATA' have already been read.
1210: */
1211: void parseMixed() throws java.lang.Exception {
1212: char c;
1213:
1214: // Check for PCDATA alone.
1215: skipWhitespace();
1216: if (tryRead(')')) {
1217: dataBufferAppend(")*");
1218: tryRead('*');
1219: return;
1220: }
1221:
1222: // Parse mixed content.
1223: skipWhitespace();
1224: while (!tryRead(")*")) {
1225: require('|');
1226: dataBufferAppend('|');
1227: skipWhitespace();
1228: dataBufferAppend(readNmtoken(true));
1229: skipWhitespace();
1230: }
1231: dataBufferAppend(")*");
1232: }
1233:
1234: /**
1235: * Parse an attribute list declaration.
1236: * [52] AttlistDecl ::= '<!ATTLIST' S %Name S? %AttDef+ S? '>'
1237: * *NOTE: the '<!ATTLIST' has already been read.
1238: */
1239: void parseAttlistDecl() throws java.lang.Exception {
1240: String elementName;
1241:
1242: requireWhitespace();
1243: elementName = readNmtoken(true);
1244: requireWhitespace();
1245: while (!tryRead('>')) {
1246: parseAttDef(elementName);
1247: skipWhitespace();
1248: }
1249: }
1250:
1251: /**
1252: * Parse a single attribute definition.
1253: * [53] AttDef ::= S %Name S %AttType S %Default
1254: */
1255: void parseAttDef(String elementName) throws java.lang.Exception {
1256: String name;
1257: int type;
1258: String enumeration = null;
1259:
1260: // Read the attribute name.
1261: name = readNmtoken(true);
1262:
1263: // Read the attribute type.
1264: requireWhitespace();
1265: type = readAttType();
1266:
1267: // Get the string of enumerated values
1268: // if necessary.
1269: if (type == ATTRIBUTE_ENUMERATED || type == ATTRIBUTE_NOTATION) {
1270: enumeration = dataBufferToString();
1271: }
1272:
1273: // Read the default value.
1274: requireWhitespace();
1275: parseDefault(elementName, name, type, enumeration);
1276: }
1277:
1278: /**
1279: * Parse the attribute type.
1280: * [54] AttType ::= StringType | TokenizedType | EnumeratedType
1281: * [55] StringType ::= 'CDATA'
1282: * [56] TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS' | 'ENTITY' | 'ENTITIES' |
1283: * 'NMTOKEN' | 'NMTOKENS'
1284: * [57] EnumeratedType ::= NotationType | Enumeration
1285: * *TODO: validate the type!!
1286: */
1287: int readAttType() throws java.lang.Exception {
1288: String typeString;
1289: Integer type;
1290:
1291: if (tryRead('(')) {
1292: parseEnumeration();
1293: return ATTRIBUTE_ENUMERATED;
1294: } else {
1295: typeString = readNmtoken(true);
1296: if (typeString.equals("NOTATION")) {
1297: parseNotationType();
1298: }
1299: type = (Integer) attributeTypeHash.get(typeString);
1300: if (type == null) {
1301: error("illegal attribute type", typeString, null);
1302: return ATTRIBUTE_UNDECLARED;
1303: } else {
1304: return type.intValue();
1305: }
1306: }
1307: }
1308:
1309: /**
1310: * Parse an enumeration.
1311: * [60] Enumeration ::= '(' S? %Etoks (S? '|' S? %Etoks)* S? ')'
1312: * [61] Etoks ::= %Nmtoken (S? '|' S? %Nmtoken)*
1313: * *NOTE: the '(' has already been read.
1314: */
1315: void parseEnumeration() throws java.lang.Exception {
1316: char c;
1317:
1318: dataBufferAppend('(');
1319:
1320: // Read the first token.
1321: skipWhitespace();
1322: dataBufferAppend(readNmtoken(true));
1323: // Read the remaining tokens.
1324: skipWhitespace();
1325: while (!tryRead(')')) {
1326: require('|');
1327: dataBufferAppend('|');
1328: skipWhitespace();
1329: dataBufferAppend(readNmtoken(true));
1330: skipWhitespace();
1331: }
1332: dataBufferAppend(')');
1333: }
1334:
1335: /**
1336: * Parse a notation type for an attribute.
1337: * [58] NotationType ::= %'NOTATION' S '(' S? %Ntoks (S? '|' S? %Ntoks)*
1338: * S? ')'
1339: * [59] Ntoks ::= %Name (S? '|' S? %Name)
1340: * *NOTE: the 'NOTATION' has already been read
1341: */
1342: void parseNotationType() throws java.lang.Exception {
1343: requireWhitespace();
1344: require('(');
1345:
1346: parseEnumeration();
1347: }
1348:
1349: /**
1350: * Parse the default value for an attribute.
1351: * [62] Default ::= '#REQUIRED' | '#IMPLIED' | ((%'#FIXED' S)? %AttValue
1352: */
1353: void parseDefault(String elementName, String name, int type,
1354: String enumeration) throws java.lang.Exception {
1355: int valueType = ATTRIBUTE_DEFAULT_SPECIFIED;
1356: String value = null;
1357: boolean normalizeWSFlag;
1358:
1359: if (tryRead('#')) {
1360: if (tryRead("FIXED")) {
1361: valueType = ATTRIBUTE_DEFAULT_FIXED;
1362: requireWhitespace();
1363: context = CONTEXT_ATTRIBUTEVALUE;
1364: value = readLiteral(LIT_CHAR_REF);
1365: context = CONTEXT_DTD;
1366: } else if (tryRead("REQUIRED")) {
1367: valueType = ATTRIBUTE_DEFAULT_REQUIRED;
1368: } else if (tryRead("IMPLIED")) {
1369: valueType = ATTRIBUTE_DEFAULT_IMPLIED;
1370: } else {
1371: error("illegal keyword for attribute default value",
1372: null, null);
1373: }
1374: } else {
1375: context = CONTEXT_ATTRIBUTEVALUE;
1376: value = readLiteral(LIT_CHAR_REF);
1377: context = CONTEXT_DTD;
1378: }
1379: setAttribute(elementName, name, type, enumeration, value,
1380: valueType);
1381: }
1382:
1383: /**
1384: * Parse a conditional section.
1385: * [63] conditionalSect ::= includeSect || ignoreSect
1386: * [64] includeSect ::= '<![' %'INCLUDE' '[' (%markupdecl*)* ']]>'
1387: * [65] ignoreSect ::= '<![' %'IGNORE' '[' ignoreSectContents* ']]>'
1388: * [66] ignoreSectContents ::= ((SkipLit | Comment | PI) -(Char* ']]>'))
1389: * | ('<![' ignoreSectContents* ']]>')
1390: * | (Char - (']' | [<'"]))
1391: * | ('<!' (Char - ('-' | '[')))
1392: * *NOTE: the '<![' has already been read.
1393: * *TODO: verify that I am handling ignoreSectContents right.
1394: */
1395: void parseConditionalSect() throws java.lang.Exception {
1396: skipWhitespace();
1397: if (tryRead("INCLUDE")) {
1398: skipWhitespace();
1399: require('[');
1400: skipWhitespace();
1401: while (!tryRead("]]>")) {
1402: parseMarkupdecl();
1403: skipWhitespace();
1404: }
1405: } else if (tryRead("IGNORE")) {
1406: skipWhitespace();
1407: require('[');
1408: int nesting = 1;
1409: char c;
1410: for (int nest = 1; nest > 0;) {
1411: c = readCh();
1412: switch (c) {
1413: case '<':
1414: if (tryRead("![")) {
1415: nest++;
1416: }
1417: case ']':
1418: if (tryRead("]>")) {
1419: nest--;
1420: }
1421: }
1422: }
1423: } else {
1424: error(
1425: "conditional section must begin with INCLUDE or IGNORE",
1426: null, null);
1427: }
1428: }
1429:
1430: /**
1431: * Read a character reference.
1432: * [67] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
1433: * *NOTE: the '&#' has already been read.
1434: */
1435: void parseCharRef() throws java.lang.Exception {
1436: int value = 0;
1437: char c;
1438:
1439: if (tryRead('x')) {
1440: loop1: while (true) {
1441: c = readCh();
1442: switch (c) {
1443: case '0':
1444: case '1':
1445: case '2':
1446: case '3':
1447: case '4':
1448: case '5':
1449: case '6':
1450: case '7':
1451: case '8':
1452: case '9':
1453: case 'a':
1454: case 'A':
1455: case 'b':
1456: case 'B':
1457: case 'c':
1458: case 'C':
1459: case 'd':
1460: case 'D':
1461: case 'e':
1462: case 'E':
1463: case 'f':
1464: case 'F':
1465: value *= 16;
1466: value += Integer.parseInt(new Character(c)
1467: .toString(), 16);
1468: break;
1469: case ';':
1470: break loop1;
1471: default:
1472: error("illegal character in character reference",
1473: c, null);
1474: break loop1;
1475: }
1476: }
1477: } else {
1478: loop2: while (true) {
1479: c = readCh();
1480: switch (c) {
1481: case '0':
1482: case '1':
1483: case '2':
1484: case '3':
1485: case '4':
1486: case '5':
1487: case '6':
1488: case '7':
1489: case '8':
1490: case '9':
1491: value *= 10;
1492: value += Integer.parseInt(new Character(c)
1493: .toString(), 10);
1494: break;
1495: case ';':
1496: break loop2;
1497: default:
1498: error("illegal character in character reference",
1499: c, null);
1500: break loop2;
1501: }
1502: }
1503: }
1504:
1505: // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz
1506: // (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz:
1507: if (value <= 0x0000ffff) {
1508: // no surrogates needed
1509: dataBufferAppend((char) value);
1510: } else if (value <= 0x000fffff) {
1511: // > 16 bits, surrogate needed
1512: dataBufferAppend((char) (0xd8 | ((value & 0x000ffc00) >> 10)));
1513: dataBufferAppend((char) (0xdc | (value & 0x0003ff)));
1514: } else {
1515: // too big for surrogate
1516: error("character reference " + value
1517: + " is too large for UTF-16", new Integer(value)
1518: .toString(), null);
1519: }
1520: }
1521:
1522: /**
1523: * Parse a reference.
1524: * [69] EntityRef ::= '&' Name ';'
1525: * *NOTE: the '&' has already been read.
1526: * @param externalAllowed External entities are allowed here.
1527: */
1528: void parseEntityRef(boolean externalAllowed)
1529: throws java.lang.Exception {
1530: String name;
1531:
1532: name = readNmtoken(true);
1533: require(';');
1534: switch (getEntityType(name)) {
1535: case ENTITY_UNDECLARED:
1536: error("reference to undeclared entity", name, null);
1537: break;
1538: case ENTITY_INTERNAL:
1539: pushString(name, getEntityValue(name));
1540: break;
1541: case ENTITY_TEXT:
1542: if (externalAllowed) {
1543: pushURL(name, getEntityPublicId(name),
1544: getEntitySystemId(name), null, null, null);
1545: } else {
1546: error(
1547: "reference to external entity in attribute value.",
1548: name, null);
1549: }
1550: break;
1551: case ENTITY_NDATA:
1552: if (externalAllowed) {
1553: error("data entity reference in content", name, null);
1554: } else {
1555: error(
1556: "reference to external entity in attribute value.",
1557: name, null);
1558: }
1559: break;
1560: }
1561: }
1562:
1563: /**
1564: * Parse a parameter entity reference.
1565: * [70] PEReference ::= '%' Name ';'
1566: * *NOTE: the '%' has already been read.
1567: */
1568: void parsePEReference(boolean isEntityValue)
1569: throws java.lang.Exception {
1570: String name;
1571:
1572: name = "%" + readNmtoken(true);
1573: require(';');
1574: switch (getEntityType(name)) {
1575: case ENTITY_UNDECLARED:
1576: error("reference to undeclared parameter entity", name,
1577: null);
1578: break;
1579: case ENTITY_INTERNAL:
1580: if (isEntityValue) {
1581: pushString(name, getEntityValue(name));
1582: } else {
1583: pushString(name, " " + getEntityValue(name) + ' ');
1584: }
1585: break;
1586: case ENTITY_TEXT:
1587: if (isEntityValue) {
1588: pushString(null, " ");
1589: }
1590: pushURL(name, getEntityPublicId(name),
1591: getEntitySystemId(name), null, null, null);
1592: if (isEntityValue) {
1593: pushString(null, " ");
1594: }
1595: break;
1596: }
1597: }
1598:
1599: /**
1600: * Parse an entity declaration.
1601: * [71] EntityDecl ::= '<!ENTITY' S %Name S %EntityDef S? '>'
1602: * | '<!ENTITY' S '%' S %Name S %EntityDef S? '>'
1603: * [72] EntityDef ::= EntityValue | ExternalDef
1604: * [73] ExternalDef ::= ExternalID %NDataDecl?
1605: * [74] ExternalID ::= 'SYSTEM' S SystemLiteral
1606: * | 'PUBLIC' S PubidLiteral S SystemLiteral
1607: * [75] NDataDecl ::= S %'NDATA' S %Name
1608: * *NOTE: the '<!ENTITY' has already been read.
1609: */
1610: void parseEntityDecl() throws java.lang.Exception {
1611: char c;
1612: boolean peFlag = false;
1613: String name, value, notationName, ids[];
1614:
1615: // Check for a parameter entity.
1616: requireWhitespace();
1617: if (tryRead('%')) {
1618: peFlag = true;
1619: requireWhitespace();
1620: }
1621:
1622: // Read the entity name, and prepend
1623: // '%' if necessary.
1624: name = readNmtoken(true);
1625: if (peFlag) {
1626: name = "%" + name;
1627: }
1628:
1629: // Read the entity value.
1630: requireWhitespace();
1631: c = readCh();
1632: unread(c);
1633: if (c == '"' || c == '\'') {
1634: // Internal entity.
1635: context = CONTEXT_ENTITYVALUE;
1636: value = readLiteral(LIT_CHAR_REF | LIT_PE_REF);
1637: context = CONTEXT_DTD;
1638: setInternalEntity(name, value);
1639: } else {
1640: // Read the external IDs
1641: ids = readExternalIds(false);
1642: if (ids[1] == null) {
1643: error("system identifer missing", name, null);
1644: }
1645:
1646: // Check for NDATA declaration.
1647: skipWhitespace();
1648: if (tryRead("NDATA")) {
1649: requireWhitespace();
1650: notationName = readNmtoken(true);
1651: setExternalDataEntity(name, ids[0], ids[1],
1652: notationName);
1653: } else {
1654: setExternalTextEntity(name, ids[0], ids[1]);
1655: }
1656: }
1657:
1658: // Finish the declaration.
1659: skipWhitespace();
1660: require('>');
1661: }
1662:
1663: /**
1664: * Parse a notation declaration.
1665: * [81] NotationDecl ::= '<!NOTATION' S %Name S %ExternalID S? '>'
1666: * *NOTE: the '<!NOTATION' has already been read.
1667: */
1668: void parseNotationDecl() throws java.lang.Exception {
1669: String nname, ids[];
1670:
1671: requireWhitespace();
1672: nname = readNmtoken(true);
1673:
1674: requireWhitespace();
1675:
1676: // Read the external identifiers.
1677: ids = readExternalIds(true);
1678: if (ids[0] == null && ids[1] == null) {
1679: error("external identifer missing", nname, null);
1680: }
1681:
1682: // Register the notation.
1683: setNotation(nname, ids[0], ids[1]);
1684:
1685: skipWhitespace();
1686: require('>');
1687: }
1688:
1689: /**
1690: * Parse PCDATA.
1691: * <pre>
1692: * [16] PCData ::= [^<&]*
1693: * </pre>
1694: * <p>The trick here is that the data stays in the dataBuffer without
1695: * necessarily being converted to a string right away.
1696: */
1697: void parsePCData() throws java.lang.Exception {
1698: char c;
1699:
1700: // Start with a little cheat -- in most
1701: // cases, the entire sequence of
1702: // character data will already be in
1703: // the readBuffer; if not, fall through to
1704: // the normal approach.
1705: if (USE_CHEATS) {
1706: int lineAugment = 0;
1707: int columnAugment = 0;
1708:
1709: loop: for (int i = readBufferPos; i < readBufferLength; i++) {
1710: switch (readBuffer[i]) {
1711: case '\n':
1712: lineAugment++;
1713: columnAugment = 0;
1714: break;
1715: case '&':
1716: case '<':
1717: int start = readBufferPos;
1718: columnAugment++;
1719: readBufferPos = i;
1720: if (lineAugment > 0) {
1721: line += lineAugment;
1722: column = columnAugment;
1723: } else {
1724: column += columnAugment;
1725: }
1726: dataBufferAppend(readBuffer, start, i - start);
1727: return;
1728: default:
1729: columnAugment++;
1730: }
1731: }
1732: }
1733:
1734: // OK, the cheat didn't work; start over
1735: // and do it by the book.
1736: while (true) {
1737: c = readCh();
1738: switch (c) {
1739: case '<':
1740: case '&':
1741: unread(c);
1742: return;
1743: default:
1744: dataBufferAppend(c);
1745: break;
1746: }
1747: }
1748: }
1749:
1750: //////////////////////////////////////////////////////////////////////
1751: // High-level reading and scanning methods.
1752: //////////////////////////////////////////////////////////////////////
1753:
1754: /**
1755: * Require whitespace characters.
1756: * [1] S ::= (#x20 | #x9 | #xd | #xa)+
1757: */
1758: void requireWhitespace() throws java.lang.Exception {
1759: char c = readCh();
1760: if (isWhitespace(c)) {
1761: skipWhitespace();
1762: } else {
1763: error("whitespace expected", c, null);
1764: }
1765: }
1766:
1767: /**
1768: * Parse whitespace characters, and leave them in the data buffer.
1769: */
1770: void parseWhitespace() throws java.lang.Exception {
1771: char c = readCh();
1772: while (isWhitespace(c)) {
1773: dataBufferAppend(c);
1774: c = readCh();
1775: }
1776: unread(c);
1777: }
1778:
1779: /**
1780: * Skip whitespace characters.
1781: * [1] S ::= (#x20 | #x9 | #xd | #xa)+
1782: */
1783: void skipWhitespace() throws java.lang.Exception {
1784: // Start with a little cheat. Most of
1785: // the time, the white space will fall
1786: // within the current read buffer; if
1787: // not, then fall through.
1788: if (USE_CHEATS) {
1789: int lineAugment = 0;
1790: int columnAugment = 0;
1791:
1792: loop: for (int i = readBufferPos; i < readBufferLength; i++) {
1793: switch (readBuffer[i]) {
1794: case ' ':
1795: case '\t':
1796: case '\r':
1797: columnAugment++;
1798: break;
1799: case '\n':
1800: lineAugment++;
1801: columnAugment = 0;
1802: break;
1803: case '%':
1804: if (context == CONTEXT_DTD
1805: || context == CONTEXT_ENTITYVALUE) {
1806: break loop;
1807: } // else fall through...
1808: default:
1809: readBufferPos = i;
1810: if (lineAugment > 0) {
1811: line += lineAugment;
1812: column = columnAugment;
1813: } else {
1814: column += columnAugment;
1815: }
1816: return;
1817: }
1818: }
1819: }
1820:
1821: // OK, do it by the book.
1822: char c = readCh();
1823: while (isWhitespace(c)) {
1824: c = readCh();
1825: }
1826: unread(c);
1827: }
1828:
1829: /**
1830: * Read a name or name token.
1831: * [5] Name ::= (Letter | '_' | ':') (NameChar)*
1832: * [7] Nmtoken ::= (NameChar)+
1833: * *NOTE: [6] is implemented implicitly where required.
1834: */
1835: String readNmtoken(boolean isName) throws java.lang.Exception {
1836: char c;
1837:
1838: if (USE_CHEATS) {
1839: loop: for (int i = readBufferPos; i < readBufferLength; i++) {
1840: switch (readBuffer[i]) {
1841: case '%':
1842: if (context == CONTEXT_DTD
1843: || context == CONTEXT_ENTITYVALUE) {
1844: break loop;
1845: } // else fall through...
1846: case '<':
1847: case '>':
1848: case '&':
1849: case ',':
1850: case '|':
1851: case '*':
1852: case '+':
1853: case '?':
1854: case ')':
1855: case '=':
1856: case '\'':
1857: case '"':
1858: case '[':
1859: case ' ':
1860: case '\t':
1861: case '\r':
1862: case '\n':
1863: case ';':
1864: case '/':
1865: case '#':
1866: int start = readBufferPos;
1867: if (i == start) {
1868: error("name expected", readBuffer[i], null);
1869: }
1870: readBufferPos = i;
1871: return intern(readBuffer, start, i - start);
1872: }
1873: }
1874: }
1875:
1876: nameBufferPos = 0;
1877:
1878: // Read the first character.
1879: loop: while (true) {
1880: c = readCh();
1881: switch (c) {
1882: case '%':
1883: case '<':
1884: case '>':
1885: case '&':
1886: case ',':
1887: case '|':
1888: case '*':
1889: case '+':
1890: case '?':
1891: case ')':
1892: case '=':
1893: case '\'':
1894: case '"':
1895: case '[':
1896: case ' ':
1897: case '\t':
1898: case '\n':
1899: case '\r':
1900: case ';':
1901: case '/':
1902: unread(c);
1903: if (nameBufferPos == 0) {
1904: error("name expected", null, null);
1905: }
1906: String s = intern(nameBuffer, 0, nameBufferPos);
1907: nameBufferPos = 0;
1908: return s;
1909: default:
1910: nameBuffer = (char[]) extendArray(nameBuffer,
1911: nameBuffer.length, nameBufferPos);
1912: nameBuffer[nameBufferPos++] = c;
1913: }
1914: }
1915: }
1916:
1917: /**
1918: * Read a literal.
1919: * [10] AttValue ::= '"' ([^<&"] | Reference)* '"'
1920: * | "'" ([^<&'] | Reference)* "'"
1921: * [11] SystemLiteral ::= '"' URLchar* '"' | "'" (URLchar - "'")* "'"
1922: * [13] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
1923: * [9] EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"'
1924: * | "'" ([^%&'] | PEReference | Reference)* "'"
1925: */
1926: String readLiteral(int flags) throws java.lang.Exception {
1927: char delim, c;
1928: int startLine = line;
1929:
1930: // Find the delimiter.
1931: delim = readCh();
1932: if (delim != '"' && delim != '\'' && delim != (char) 0) {
1933: error("expected '\"' or \"'\"", delim, null);
1934: return null;
1935: }
1936:
1937: // Read the literal.
1938: try {
1939: c = readCh();
1940:
1941: loop: while (c != delim) {
1942: switch (c) {
1943: // Literals never have line ends
1944: case '\n':
1945: case '\r':
1946: c = ' ';
1947: break;
1948: // References may be allowed
1949: case '&':
1950: if ((flags & LIT_CHAR_REF) > 0) {
1951: c = readCh();
1952: if (c == '#') {
1953: parseCharRef();
1954: c = readCh();
1955: continue loop; // check the next character
1956: } else if ((flags & LIT_ENTITY_REF) > 0) {
1957: unread(c);
1958: parseEntityRef(false);
1959: c = readCh();
1960: continue loop;
1961: } else {
1962: dataBufferAppend('&');
1963: }
1964: }
1965: break;
1966:
1967: default:
1968: break;
1969: }
1970: dataBufferAppend(c);
1971: c = readCh();
1972: }
1973: } catch (EOFException e) {
1974: error(
1975: "end of input while looking for delimiter (started on line "
1976: + startLine + ')', null, new Character(
1977: delim).toString());
1978: }
1979:
1980: // Normalise whitespace if necessary.
1981: if ((flags & LIT_NORMALIZE) > 0) {
1982: dataBufferNormalize();
1983: }
1984:
1985: // Return the value.
1986: return dataBufferToString();
1987: }
1988:
1989: /**
1990: * Try reading external identifiers.
1991: * <p>The system identifier is not required for notations.
1992: * @param inNotation Are we in a notation?
1993: * @return A two-member String array containing the identifiers.
1994: */
1995: String[] readExternalIds(boolean inNotation)
1996: throws java.lang.Exception {
1997: char c;
1998: String ids[] = new String[2];
1999:
2000: if (tryRead("PUBLIC")) {
2001: requireWhitespace();
2002: ids[0] = readLiteral(LIT_NORMALIZE); // public id
2003: if (inNotation) {
2004: skipWhitespace();
2005: if (tryRead('"') || tryRead('\'')) {
2006: ids[1] = readLiteral(0);
2007: }
2008: } else {
2009: requireWhitespace();
2010: ids[1] = readLiteral(0); // system id
2011: }
2012: } else if (tryRead("SYSTEM")) {
2013: requireWhitespace();
2014: ids[1] = readLiteral(0); // system id
2015: }
2016:
2017: return ids;
2018: }
2019:
2020: /**
2021: * Test if a character is whitespace.
2022: * <pre>
2023: * [1] S ::= (#x20 | #x9 | #xd | #xa)+
2024: * </pre>
2025: * @param c The character to test.
2026: * @return true if the character is whitespace.
2027: */
2028: final boolean isWhitespace(char c) {
2029: switch ((int) c) {
2030: case 0x20:
2031: case 0x09:
2032: case 0x0d:
2033: case 0x0a:
2034: return true;
2035: default:
2036: return false;
2037: }
2038: }
2039:
2040: //////////////////////////////////////////////////////////////////////
2041: // Utility routines.
2042: //////////////////////////////////////////////////////////////////////
2043:
2044: /**
2045: * Add a character to the data buffer.
2046: */
2047: void dataBufferAppend(char c) {
2048: // Expand buffer if necessary.
2049: dataBuffer = (char[]) extendArray(dataBuffer,
2050: dataBuffer.length, dataBufferPos);
2051: dataBuffer[dataBufferPos++] = c;
2052: }
2053:
2054: /**
2055: * Add a string to the data buffer.
2056: */
2057: void dataBufferAppend(String s) {
2058: dataBufferAppend(s.toCharArray(), 0, s.length());
2059: }
2060:
2061: /**
2062: * Append (part of) a character array to the data buffer.
2063: */
2064: void dataBufferAppend(char ch[], int start, int length) {
2065: dataBuffer = (char[]) extendArray(dataBuffer,
2066: dataBuffer.length, dataBufferPos + length);
2067: System.arraycopy((Object) ch, start, (Object) dataBuffer,
2068: dataBufferPos, length);
2069: dataBufferPos += length;
2070: }
2071:
2072: /**
2073: * Normalise whitespace in the data buffer.
2074: */
2075: void dataBufferNormalize() {
2076: int i = 0;
2077: int j = 0;
2078: int end = dataBufferPos;
2079:
2080: // Skip whitespace at the start.
2081: while (j < end && isWhitespace(dataBuffer[j])) {
2082: j++;
2083: }
2084:
2085: // Skip whitespace at the end.
2086: while (end > j && isWhitespace(dataBuffer[end - 1])) {
2087: end--;
2088: }
2089:
2090: // Start copying to the left.
2091: while (j < end) {
2092:
2093: char c = dataBuffer[j++];
2094:
2095: // Normalise all other whitespace to
2096: // a single space.
2097: if (isWhitespace(c)) {
2098: while (j < end && isWhitespace(dataBuffer[j++])) {
2099: }
2100: dataBuffer[i++] = ' ';
2101: dataBuffer[i++] = dataBuffer[j - 1];
2102: } else {
2103: dataBuffer[i++] = c;
2104: }
2105: }
2106:
2107: // The new length is <= the old one.
2108: dataBufferPos = i;
2109: }
2110:
2111: /**
2112: * Convert the data buffer to a string.
2113: * @param internFlag true if the contents should be interned.
2114: * @see #intern(char[],int,int)
2115: */
2116: String dataBufferToString() {
2117: String s = new String(dataBuffer, 0, dataBufferPos);
2118: dataBufferPos = 0;
2119: return s;
2120: }
2121:
2122: /**
2123: * Flush the contents of the data buffer to the handler, if
2124: * appropriate, and reset the buffer for new input.
2125: */
2126: void dataBufferFlush() throws java.lang.Exception {
2127: if (dataBufferPos > 0) {
2128: switch (currentElementContent) {
2129: case CONTENT_UNDECLARED:
2130: case CONTENT_EMPTY:
2131: // do nothing
2132: break;
2133: case CONTENT_MIXED:
2134: case CONTENT_ANY:
2135: if (handler != null) {
2136: handler.charData(dataBuffer, 0, dataBufferPos);
2137: }
2138: break;
2139: case CONTENT_ELEMENTS:
2140: if (handler != null) {
2141: handler.ignorableWhitespace(dataBuffer, 0,
2142: dataBufferPos);
2143: }
2144: break;
2145: }
2146: dataBufferPos = 0;
2147: }
2148: }
2149:
2150: /**
2151: * Require a string to appear, or throw an exception.
2152: */
2153: void require(String delim) throws java.lang.Exception {
2154: char ch[] = delim.toCharArray();
2155: for (int i = 0; i < ch.length; i++) {
2156: require(ch[i]);
2157: }
2158: }
2159:
2160: /**
2161: * Require a character to appear, or throw an exception.
2162: */
2163: void require(char delim) throws java.lang.Exception {
2164: char c = readCh();
2165:
2166: if (c != delim) {
2167: error("expected character", c, new Character(delim)
2168: .toString());
2169: }
2170: }
2171:
2172: /**
2173: * Return an internalised version of a string.
2174: * <p>Ælfred uses this method to create an internalised version
2175: * of all names and attribute values, so that it can test equality
2176: * with <code>==</code> instead of <code>String.equals()</code>.
2177: * <p>If you want to be able to test for equality in the same way,
2178: * you can use this method to internalise your own strings first:
2179: * <pre>
2180: * String PARA = handler.intern("PARA");
2181: * </pre>
2182: * <p>Note that this will not return the same results as String.intern().
2183: * @param s The string to internalise.
2184: * @return An internalised version of the string.
2185: * @see #intern(char[],int,int)
2186: * @see java.lang.String#intern
2187: */
2188: public String intern(String s) {
2189: char ch[] = s.toCharArray();
2190: return intern(ch, 0, ch.length);
2191: }
2192:
2193: /**
2194: * Create an internalised string from a character array.
2195: * <p>This is much more efficient than constructing a non-internalised
2196: * string first, and then internalising it.
2197: * <p>Note that this will not return the same results as String.intern().
2198: * @param ch an array of characters for building the string.
2199: * @param start the starting position in the array.
2200: * @param length the number of characters to place in the string.
2201: * @return an internalised string.
2202: * @see #intern(String)
2203: * @see java.lang.String#intern
2204: */
2205: public String intern(char ch[], int start, int length) {
2206: int index;
2207: int hash = 0;
2208:
2209: // Generate a hash code.
2210: for (int i = start; i < start + length; i++) {
2211: hash = ((hash << 1) & 0xffffff) + (int) ch[i];
2212: }
2213:
2214: hash = hash % SYMBOL_TABLE_LENGTH;
2215:
2216: // Get the bucket.
2217: Object bucket[] = (Object[]) symbolTable[hash];
2218: if (bucket == null) {
2219: symbolTable[hash] = bucket = new Object[8];
2220: }
2221:
2222: // Search for a matching tuple, and
2223: // return the string if we find one.
2224: for (index = 0; index < bucket.length; index += 2) {
2225: char chFound[] = (char[]) bucket[index];
2226:
2227: // Stop when we hit a null index.
2228: if (chFound == null) {
2229: break;
2230: }
2231:
2232: // If they're the same length,
2233: // check for a match.
2234: // If the loop finishes, 'index' will
2235: // contain the current bucket
2236: // position.
2237: if (chFound.length == length) {
2238: for (int i = 0; i < chFound.length; i++) {
2239: // Stop if there are no more tuples.
2240: if (ch[start + i] != chFound[i]) {
2241: break;
2242: } else if (i == length - 1) {
2243: // That's it, we have a match!
2244: return (String) bucket[index + 1];
2245: }
2246: }
2247: }
2248: }
2249:
2250: // Not found -- we'll have to add it.
2251:
2252: // Do we have to grow the bucket?
2253: bucket = (Object[]) extendArray(bucket, bucket.length, index);
2254:
2255: // OK, add it to the end of the
2256: // bucket.
2257: String s = new String(ch, start, length);
2258: bucket[index] = s.toCharArray();
2259: bucket[index + 1] = s;
2260: symbolTable[hash] = bucket;
2261: return s;
2262: }
2263:
2264: /**
2265: * Ensure the capacity of an array, allocating a new one if
2266: * necessary.
2267: */
2268: Object extendArray(Object array, int currentSize, int requiredSize) {
2269: if (requiredSize < currentSize) {
2270: return array;
2271: } else {
2272: Object newArray = null;
2273: int newSize = currentSize * 2;
2274:
2275: if (newSize <= requiredSize) {
2276: newSize = requiredSize + 1;
2277: }
2278:
2279: if (array instanceof char[]) {
2280: newArray = new char[newSize];
2281: } else if (array instanceof Object[]) {
2282: newArray = new Object[newSize];
2283: }
2284:
2285: System.arraycopy(array, 0, newArray, 0, currentSize);
2286: return newArray;
2287: }
2288: }
2289:
2290: //////////////////////////////////////////////////////////////////////
2291: // XML query routines.
2292: //////////////////////////////////////////////////////////////////////
2293:
2294: //
2295: // Elements
2296: //
2297:
2298: /**
2299: * Get the declared elements for an XML document.
2300: * <p>The results will be valid only after the DTD (if any) has been
2301: * parsed.
2302: * @return An enumeration of all element types declared for this
2303: * document (as Strings).
2304: * @see #getElementContentType
2305: * @see #getElementContentModel
2306: */
2307: public Enumeration declaredElements() {
2308: return elementInfo.keys();
2309: }
2310:
2311: /**
2312: * Look up the content type of an element.
2313: * @param name The element type name.
2314: * @return An integer constant representing the content type.
2315: * @see #getElementContentModel
2316: * @see #CONTENT_UNDECLARED
2317: * @see #CONTENT_ANY
2318: * @see #CONTENT_EMPTY
2319: * @see #CONTENT_MIXED
2320: * @see #CONTENT_ELEMENTS
2321: */
2322: public int getElementContentType(String name) {
2323: Object element[] = (Object[]) elementInfo.get(name);
2324: if (element == null) {
2325: return CONTENT_UNDECLARED;
2326: } else {
2327: return ((Integer) element[0]).intValue();
2328: }
2329: }
2330:
2331: /**
2332: * Look up the content model of an element.
2333: * <p>The result will always be null unless the content type is
2334: * CONTENT_ELEMENTS or CONTENT_MIXED.
2335: * @param name The element type name.
2336: * @return The normalised content model, as a string.
2337: * @see #getElementContentType
2338: */
2339: public String getElementContentModel(String name) {
2340: Object element[] = (Object[]) elementInfo.get(name);
2341: if (element == null) {
2342: return null;
2343: } else {
2344: return (String) element[1];
2345: }
2346: }
2347:
2348: /**
2349: * Register an element.
 * Array format:
 * [0] content type (Integer)
 * [1] content model (String, may be null)
 * [2] attribute hash table (may be null)
2353: */
2354: void setElement(String name, int contentType, String contentModel,
2355: Hashtable attributes) throws java.lang.Exception {
2356: Object element[];
2357:
2358: // Try looking up the element
2359: element = (Object[]) elementInfo.get(name);
2360:
2361: // Make a new one if necessary.
2362: if (element == null) {
2363: element = new Object[3];
2364: element[0] = new Integer(CONTENT_UNDECLARED);
2365: element[1] = null;
2366: element[2] = null;
2367: } else if (contentType != CONTENT_UNDECLARED
2368: && ((Integer) element[0]).intValue() != CONTENT_UNDECLARED) {
2369: error("multiple declarations for element type", name, null);
2370: return;
2371: }
2372:
2373: // Insert the content type, if any.
2374: if (contentType != CONTENT_UNDECLARED) {
2375: element[0] = new Integer(contentType);
2376: }
2377:
2378: // Insert the content model, if any.
2379: if (contentModel != null) {
2380: element[1] = contentModel;
2381: }
2382:
2383: // Insert the attributes, if any.
2384: if (attributes != null) {
2385: element[2] = attributes;
2386: }
2387:
2388: // Save the element info.
2389: elementInfo.put(name, element);
2390: }
2391:
2392: /**
2393: * Look up the attribute hash table for an element.
 * The hash table is the third item (index 2) in the element array.
2395: */
2396: Hashtable getElementAttributes(String name) {
2397: Object element[] = (Object[]) elementInfo.get(name);
2398: if (element == null) {
2399: return null;
2400: } else {
2401: return (Hashtable) element[2];
2402: }
2403: }
2404:
2405: //
2406: // Attributes
2407: //
2408:
2409: /**
2410: * Get the declared attributes for an element type.
2411: * @param elname The name of the element type.
2412: * @return An Enumeration of all the attributes declared for
2413: * a specific element type. The results will be valid only
2414: * after the DTD (if any) has been parsed.
2415: * @see #getAttributeType
2416: * @see #getAttributeEnumeration
2417: * @see #getAttributeDefaultValueType
2418: * @see #getAttributeDefaultValue
2419: * @see #getAttributeExpandedValue
2420: */
2421: public Enumeration declaredAttributes(String elname) {
2422: Hashtable attlist = getElementAttributes(elname);
2423:
2424: if (attlist == null) {
2425: return null;
2426: } else {
2427: return attlist.keys();
2428: }
2429: }
2430:
2431: /**
2432: * Retrieve the declared type of an attribute.
2433: * @param name The name of the associated element.
2434: * @param aname The name of the attribute.
2435: * @return An integer constant representing the attribute type.
2436: * @see #ATTRIBUTE_UNDECLARED
2437: * @see #ATTRIBUTE_CDATA
2438: * @see #ATTRIBUTE_ID
2439: * @see #ATTRIBUTE_IDREF
2440: * @see #ATTRIBUTE_IDREFS
2441: * @see #ATTRIBUTE_ENTITY
2442: * @see #ATTRIBUTE_ENTITIES
2443: * @see #ATTRIBUTE_NMTOKEN
2444: * @see #ATTRIBUTE_NMTOKENS
2445: * @see #ATTRIBUTE_ENUMERATED
2446: * @see #ATTRIBUTE_NOTATION
2447: */
2448: public int getAttributeType(String name, String aname) {
2449: Object attribute[] = getAttribute(name, aname);
2450: if (attribute == null) {
2451: return ATTRIBUTE_UNDECLARED;
2452: } else {
2453: return ((Integer) attribute[0]).intValue();
2454: }
2455: }
2456:
2457: /**
2458: * Retrieve the allowed values for an enumerated attribute type.
2459: * @param name The name of the associated element.
2460: * @param aname The name of the attribute.
2461: * @return A string containing the token list.
2462: * @see #ATTRIBUTE_ENUMERATED
2463: * @see #ATTRIBUTE_NOTATION
2464: */
2465: public String getAttributeEnumeration(String name, String aname) {
2466: Object attribute[] = getAttribute(name, aname);
2467: if (attribute == null) {
2468: return null;
2469: } else {
2470: return (String) attribute[3];
2471: }
2472: }
2473:
2474: /**
2475: * Retrieve the default value of a declared attribute.
2476: * @param name The name of the associated element.
2477: * @param aname The name of the attribute.
2478: * @return The default value, or null if the attribute was
2479: * #IMPLIED or simply undeclared and unspecified.
2480: * @see #getAttributeExpandedValue
2481: */
2482: public String getAttributeDefaultValue(String name, String aname) {
2483: Object attribute[] = getAttribute(name, aname);
2484: if (attribute == null) {
2485: return null;
2486: } else {
2487: return (String) attribute[1];
2488: }
2489: }
2490:
2491: /**
2492: * Retrieve the expanded value of a declared attribute.
2493: * <p>All general entities will be expanded.
2494: * @param name The name of the associated element.
2495: * @param aname The name of the attribute.
2496: * @return The expanded default value, or null if the attribute was
2497: * #IMPLIED or simply undeclared
2498: * @see #getAttributeDefaultValue
2499: */
2500: public String getAttributeExpandedValue(String name, String aname) {
2501: Object attribute[] = getAttribute(name, aname);
2502: if (attribute == null) {
2503: return null;
2504: } else if (attribute[4] == null && attribute[1] != null) {
2505: try {
2506: pushString(null, (char) 0 + (String) attribute[1]
2507: + (char) 0);
2508: attribute[4] = readLiteral(LIT_NORMALIZE | LIT_CHAR_REF
2509: | LIT_ENTITY_REF);
2510: } catch (Exception e) {
2511: }
2512: }
2513: return (String) attribute[4];
2514: }
2515:
2516: /**
2517: * Retrieve the default value type of a declared attribute.
2518: * @see #ATTRIBUTE_DEFAULT_SPECIFIED
2519: * @see #ATTRIBUTE_DEFAULT_IMPLIED
2520: * @see #ATTRIBUTE_DEFAULT_REQUIRED
2521: * @see #ATTRIBUTE_DEFAULT_FIXED
2522: */
2523: public int getAttributeDefaultValueType(String name, String aname) {
2524: Object attribute[] = getAttribute(name, aname);
2525: if (attribute == null) {
2526: return ATTRIBUTE_DEFAULT_UNDECLARED;
2527: } else {
2528: return ((Integer) attribute[2]).intValue();
2529: }
2530: }
2531:
2532: /**
2533: * Register an attribute declaration for later retrieval.
 * Format:
 * - [0] Integer attribute type
 * - [1] String default value
 * - [2] Integer default value type
 * - [3] String enumeration (token list)
 * - [4] String expanded default value (initially null)
 * TODO: do something with attribute types.
2539: */
2540: void setAttribute(String elName, String name, int type,
2541: String enumeration, String value, int valueType)
2542: throws java.lang.Exception {
2543: Hashtable attlist;
2544: Object attribute[];
2545:
2546: // Create a new hashtable if necessary.
2547: attlist = getElementAttributes(elName);
2548: if (attlist == null) {
2549: attlist = new Hashtable();
2550: }
2551:
2552: // Check that the attribute doesn't
2553: // already exist!
2554: if (attlist.get(name) != null) {
2555: return;
2556: } else {
2557: attribute = new Object[5];
2558: attribute[0] = new Integer(type);
2559: attribute[1] = value;
2560: attribute[2] = new Integer(valueType);
2561: attribute[3] = enumeration;
2562: attribute[4] = null;
2563: attlist.put(name.intern(), attribute);
2564:
2565: // Use CONTENT_UNDECLARED to avoid overwriting
2566: // existing element declaration.
2567: setElement(elName, CONTENT_UNDECLARED, null, attlist);
2568: }
2569: }
2570:
2571: /**
 * Retrieve the five-member array representing an
 * attribute declaration, or null if the element or the
 * attribute has not been declared.
2574: */
2575: Object[] getAttribute(String elName, String name) {
2576: Hashtable attlist;
2577: Object attribute[];
2578:
2579: attlist = getElementAttributes(elName);
2580: if (attlist == null) {
2581: return null;
2582: }
2583:
2584: attribute = (Object[]) attlist.get(name);
2585: return attribute;
2586: }
2587:
2588: //
2589: // Entities
2590: //
2591:
2592: /**
2593: * Get declared entities.
2594: * @return An Enumeration of all the entities declared for
2595: * this XML document. The results will be valid only
2596: * after the DTD (if any) has been parsed.
2597: * @see #getEntityType
2598: * @see #getEntityPublicId
2599: * @see #getEntitySystemId
2600: * @see #getEntityValue
2601: * @see #getEntityNotationName
2602: */
2603: public Enumeration declaredEntities() {
2604: return entityInfo.keys();
2605: }
2606:
2607: /**
2608: * Find the type of an entity.
 * @param ename The name of the entity.
 * @return An integer constant representing the entity type.
2610: * @see #ENTITY_UNDECLARED
2611: * @see #ENTITY_INTERNAL
2612: * @see #ENTITY_NDATA
2613: * @see #ENTITY_TEXT
2614: */
2615: public int getEntityType(String ename) {
2616: Object entity[] = (Object[]) entityInfo.get(ename);
2617: if (entity == null) {
2618: return ENTITY_UNDECLARED;
2619: } else {
2620: return ((Integer) entity[0]).intValue();
2621: }
2622: }
2623:
2624: /**
2625: * Return an external entity's public identifier, if any.
2626: * @param ename The name of the external entity.
 * @return The entity's public identifier, or null if the
2628: * entity was not declared, if it is not an
2629: * external entity, or if no public identifier was
2630: * provided.
2631: * @see #getEntityType
2632: */
2633: public String getEntityPublicId(String ename) {
2634: Object entity[] = (Object[]) entityInfo.get(ename);
2635: if (entity == null) {
2636: return null;
2637: } else {
2638: return (String) entity[1];
2639: }
2640: }
2641:
2642: /**
2643: * Return an external entity's system identifier.
2644: * @param ename The name of the external entity.
2645: * @return The entity's system identifier, or null if the
2646: * entity was not declared, or if it is not an
2647: * external entity.
2648: * @see #getEntityType
2649: */
2650: public String getEntitySystemId(String ename) {
2651: Object entity[] = (Object[]) entityInfo.get(ename);
2652: if (entity == null) {
2653: return null;
2654: } else {
2655: return (String) entity[2];
2656: }
2657: }
2658:
2659: /**
2660: * Return the value of an internal entity.
2661: * @param ename The name of the internal entity.
2662: * @return The entity's value, or null if the entity was
2663: * not declared, or if it is not an internal entity.
2664: * @see #getEntityType
2665: */
2666: public String getEntityValue(String ename) {
2667: Object entity[] = (Object[]) entityInfo.get(ename);
2668: if (entity == null) {
2669: return null;
2670: } else {
2671: return (String) entity[3];
2672: }
2673: }
2674:
2675: /**
2676: * Get the notation name associated with an NDATA entity.
 * @param eName The NDATA entity name.
2678: * @return The associated notation name, or null if the
2679: * entity was not declared, or if it is not an
2680: * NDATA entity.
2681: * @see #getEntityType
2682: */
2683: public String getEntityNotationName(String eName) {
2684: Object entity[] = (Object[]) entityInfo.get(eName);
2685: if (entity == null) {
2686: return null;
2687: } else {
2688: return (String) entity[4];
2689: }
2690: }
2691:
2692: /**
2693: * Register an entity declaration for later retrieval.
2694: */
2695: void setInternalEntity(String eName, String value) {
2696: setEntity(eName, ENTITY_INTERNAL, null, null, value, null);
2697: }
2698:
2699: /**
2700: * Register an external data entity.
2701: */
2702: void setExternalDataEntity(String eName, String pubid,
2703: String sysid, String nName) {
2704: setEntity(eName, ENTITY_NDATA, pubid, sysid, null, nName);
2705: }
2706:
2707: /**
2708: * Register an external text entity.
2709: */
2710: void setExternalTextEntity(String eName, String pubid, String sysid) {
2711: setEntity(eName, ENTITY_TEXT, pubid, sysid, null, null);
2712: }
2713:
2714: /**
2715: * Register an entity declaration for later retrieval.
2716: */
2717: void setEntity(String eName, int eClass, String pubid,
2718: String sysid, String value, String nName) {
2719: Object entity[];
2720:
2721: if (entityInfo.get(eName) == null) {
2722: entity = new Object[5];
2723: entity[0] = new Integer(eClass);
2724: entity[1] = pubid;
2725: entity[2] = sysid;
2726: entity[3] = value;
2727: entity[4] = nName;
2728:
2729: entityInfo.put(eName, entity);
2730: }
2731: }
2732:
2733: //
2734: // Notations.
2735: //
2736:
2737: /**
2738: * Get declared notations.
2739: * @return An Enumeration of all the notations declared for
2740: * this XML document. The results will be valid only
2741: * after the DTD (if any) has been parsed.
2742: * @see #getNotationPublicId
2743: * @see #getNotationSystemId
2744: */
2745: public Enumeration declaredNotations() {
2746: return notationInfo.keys();
2747: }
2748:
2749: /**
2750: * Look up the public identifier for a notation.
2751: * You will normally use this method to look up a notation
2752: * that was provided as an attribute value or for an NDATA entity.
2753: * @param nname The name of the notation.
2754: * @return A string containing the public identifier, or null
2755: * if none was provided or if no such notation was
2756: * declared.
2757: * @see #getNotationSystemId
2758: */
2759: public String getNotationPublicId(String nname) {
2760: Object notation[] = (Object[]) notationInfo.get(nname);
2761: if (notation == null) {
2762: return null;
2763: } else {
2764: return (String) notation[0];
2765: }
2766: }
2767:
2768: /**
2769: * Look up the system identifier for a notation.
2770: * You will normally use this method to look up a notation
2771: * that was provided as an attribute value or for an NDATA entity.
2772: * @param nname The name of the notation.
2773: * @return A string containing the system identifier, or null
2774: * if no such notation was declared.
2775: * @see #getNotationPublicId
2776: */
2777: public String getNotationSystemId(String nname) {
2778: Object notation[] = (Object[]) notationInfo.get(nname);
2779: if (notation == null) {
2780: return null;
2781: } else {
2782: return (String) notation[1];
2783: }
2784: }
2785:
2786: /**
2787: * Register a notation declaration for later retrieval.
2788: * Format:
2789: * - public id
2790: * - system id
2791: */
2792: void setNotation(String nname, String pubid, String sysid)
2793: throws java.lang.Exception {
2794: Object notation[];
2795:
2796: if (notationInfo.get(nname) == null) {
2797: notation = new Object[2];
2798: notation[0] = pubid;
2799: notation[1] = sysid;
2800: notationInfo.put(nname, notation);
2801: } else {
2802: error("multiple declarations of notation", nname, null);
2803: }
2804: }
2805:
2806: //
2807: // Location.
2808: //
2809:
2810: /**
2811: * Return the current line number.
2812: */
2813: public int getLineNumber() {
2814: return line;
2815: }
2816:
2817: /**
2818: * Return the current column number.
2819: */
2820: public int getColumnNumber() {
2821: return column;
2822: }
2823:
2824: //////////////////////////////////////////////////////////////////////
2825: // High-level I/O.
2826: //////////////////////////////////////////////////////////////////////
2827:
2828: /**
2829: * Read a single character from the readBuffer.
2830: * <p>The readDataChunk() method maintains the buffer.
2831: * <p>If we hit the end of an entity, try to pop the stack and
2832: * keep going.
2833: * <p>(This approach doesn't really enforce XML's rules about
2834: * entity boundaries, but this is not currently a validating
2835: * parser).
2836: * <p>This routine also attempts to keep track of the current
2837: * position in external entities, but it's not entirely accurate.
 * @return The next available input character from the current
 *         input source.
 * @see #unread(char)
 * @see #unread(String)
 * @see #readDataChunk
 * @see #readBuffer
 * @see #line
2845: */
2846: char readCh() throws java.lang.Exception {
2847: char c;
2848:
2849: // As long as there's nothing in the
2850: // read buffer, try reading more data
2851: // (for an external entity) or popping
2852: // the entity stack (for either).
2853: while (readBufferPos >= readBufferLength) {
2854: switch (sourceType) {
2855: case INPUT_READER:
2856: case INPUT_EXTERNAL:
2857: case INPUT_STREAM:
2858: readDataChunk();
2859: while (readBufferLength < 1) {
2860: popInput();
2861: if (readBufferLength < 1) {
2862: readDataChunk();
2863: }
2864: }
2865: break;
2866:
2867: default:
2868: popInput();
2869: break;
2870: }
2871: }
2872:
2873: c = readBuffer[readBufferPos++];
2874:
2875: // This is a particularly nasty bit
2876: // of code, that checks for a parameter
2877: // entity reference but peeks ahead to
2878: // catch the '%' in parameter entity
2879: // declarations.
2880: if (c == '%'
2881: && (context == CONTEXT_DTD || context == CONTEXT_ENTITYVALUE)) {
2882: char c2 = readCh();
2883: unread(c2);
2884: if (!isWhitespace(c2)) {
2885: parsePEReference(context == CONTEXT_ENTITYVALUE);
2886: return readCh();
2887: }
2888: }
2889:
2890: if (c == '\n') {
2891: line++;
2892: column = 0;
2893: } else {
2894: column++;
2895: }
2896:
2897: return c;
2898: }
2899:
2900: /**
2901: * Push a single character back onto the current input stream.
2902: * <p>This method usually pushes the character back onto
2903: * the readBuffer, while the unread(String) method treats the
2904: * string as a new internal entity.
2905: * <p>I don't think that this would ever be called with
2906: * readBufferPos = 0, because the methods always reads a character
2907: * before unreading it, but just in case, I've added a boundary
2908: * condition.
2909: * @param c The character to push back.
2910: * @see #readCh
2911: * @see #unread(String)
 * @see #unread(char[], int)
2913: * @see #readBuffer
2914: */
2915: void unread(char c) throws java.lang.Exception {
2916: // Normal condition.
2917: if (c == '\n') {
2918: line--;
2919: column = -1;
2920: }
2921: if (readBufferPos > 0) {
2922: readBuffer[--readBufferPos] = c;
2923: } else {
2924: pushString(null, new Character(c).toString());
2925: }
2926: }
2927:
2928: /**
2929: * Push a char array back onto the current input stream.
2930: * <p>NOTE: you must <em>never</em> push back characters that you
2931: * haven't actually read: use pushString() instead.
2932: * @see #readCh
2933: * @see #unread(char)
2934: * @see #unread(String)
2935: * @see #readBuffer
2936: * @see #pushString
2937: */
2938: void unread(char ch[], int length) throws java.lang.Exception {
2939: for (int i = 0; i < length; i++) {
2940: if (ch[i] == '\n') {
2941: line--;
2942: column = -1;
2943: }
2944: }
2945: if (length < readBufferPos) {
2946: readBufferPos -= length;
2947: } else {
2948: pushCharArray(null, ch, 0, length);
2949: sourceType = INPUT_BUFFER;
2950: }
2951: }
2952:
2953: /**
2954: * Push a new external input source.
2955: * <p>The source will be either an external text entity, or the DTD
2956: * external subset.
2957: * <p>TO DO: Right now, this method always attempts to autodetect
2958: * the encoding; in the future, it should allow the caller to
2959: * request an encoding explicitly, and it should also look at the
2960: * headers with an HTTP connection.
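 * @param ename The name of the entity (if any) causing the new input.
 * @param publicId The entity's public identifier, if any.
 * @param systemId The entity's system identifier; it is made absolute
 *        before use.
 * @param reader A character stream to read the entity from, or null.
 * @param stream A byte stream to read the entity from, or null.
 * @param encoding The name of an explicitly requested encoding, or null
 *        to have the encoding detected.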
2962: * @see XmlHandler#resolveEntity
2963: * @see #pushString
2964: * @see #sourceType
2965: * @see #pushInput
2966: * @see #detectEncoding
2967: * @see #sourceType
2968: * @see #readBuffer
2969: */
2970: void pushURL(String ename, String publicId, String systemId,
2971: Reader reader, InputStream stream, String encoding)
2972: throws java.lang.Exception {
2973: URL url;
2974: boolean ignoreEncoding = false;
2975:
2976: // Push the existing status.
2977: pushInput(ename);
2978:
2979: // Create a new read buffer.
2980: // (Note the four-character margin)
2981: readBuffer = new char[READ_BUFFER_MAX + 4];
2982: readBufferPos = 0;
2983: readBufferLength = 0;
2984: readBufferOverflow = -1;
2985: is = null;
2986: line = 1;
2987:
2988: currentByteCount = 0;
2989:
2990: // Flush any remaining data.
2991: dataBufferFlush();
2992:
2993: // Make the URL absolute.
2994: if (systemId != null && externalEntity != null) {
2995: systemId = new URL(externalEntity.getURL(), systemId)
2996: .toString();
2997: } else if (baseURI != null) {
2998: try {
2999: systemId = new URL(new URL(baseURI), systemId)
3000: .toString();
3001: } catch (Exception e) {
3002: }
3003: }
3004:
3005: // See if the application wants to
3006: // redirect the system ID and/or
3007: // supply its own character stream.
3008: if (systemId != null && handler != null) {
3009: Object input = handler.resolveEntity(publicId, systemId);
3010: if (input != null) {
3011: if (input instanceof String) {
3012: systemId = (String) input;
3013: } else if (input instanceof InputStream) {
3014: stream = (InputStream) input;
3015: } else if (input instanceof Reader) {
3016: reader = (Reader) input;
3017: }
3018: }
3019: }
3020:
3021: // Start the entity.
3022: if (handler != null) {
3023: if (systemId != null) {
3024: handler.startExternalEntity(systemId);
3025: } else {
3026: handler.startExternalEntity("[external stream]");
3027: }
3028: }
3029:
3030: // Figure out what we're reading from.
3031: if (reader != null) {
3032: // There's an explicit character stream.
3033: sourceType = INPUT_READER;
3034: this .reader = reader;
3035: tryEncodingDecl(true);
3036: return;
3037: } else if (stream != null) {
3038: sourceType = INPUT_STREAM;
3039: is = stream;
3040: } else {
3041: // We have to open our own stream
3042: // to the URL.
3043:
3044: // Set the new status
3045: sourceType = INPUT_EXTERNAL;
3046: url = new URL(systemId);
3047:
3048: externalEntity = url.openConnection();
3049: externalEntity.connect();
3050: is = externalEntity.getInputStream();
3051: }
3052:
3053: // If we get to here, there must be
3054: // an InputStream available.
3055: if (!is.markSupported()) {
3056: is = new BufferedInputStream(is);
3057: }
3058:
3059: // Attempt to detect the encoding.
3060: if (encoding == null && externalEntity != null) {
3061: encoding = externalEntity.getContentEncoding();
3062: }
3063:
3064: if (encoding != null) {
3065: checkEncoding(encoding, false);
3066: ignoreEncoding = true;
3067: } else {
3068: detectEncoding();
3069: ignoreEncoding = false;
3070: }
3071:
3072: // Read an XML or text declaration.
3073: tryEncodingDecl(ignoreEncoding);
3074: }
3075:
3076: /**
3077: * Check for an encoding declaration.
3078: */
3079: void tryEncodingDecl(boolean ignoreEncoding)
3080: throws java.lang.Exception {
3081: // Read the XML/Encoding declaration.
3082: if (tryRead("<?xml")) {
3083: if (tryWhitespace()) {
3084: if (inputStack.size() > 0) {
3085: parseTextDecl(ignoreEncoding);
3086: } else {
3087: parseXMLDecl(ignoreEncoding);
3088: }
3089: } else {
3090: unread("xml".toCharArray(), 3);
3091: parsePI();
3092: }
3093: }
3094: }
3095:
3096: /**
3097: * Attempt to detect the encoding of an entity.
3098: * <p>The trick here (as suggested in the XML standard) is that
3099: * any entity not in UTF-8, or in UCS-2 with a byte-order mark,
3100: * <b>must</b> begin with an XML declaration or an encoding
 * declaration; we simply have to look for "&lt;?xml" in various
3102: * encodings.
3103: * <p>This method has no way to distinguish among 8-bit encodings.
3104: * Instead, it assumes UTF-8, then (possibly) revises its assumption
3105: * later in checkEncoding(). Any ASCII-derived 8-bit encoding
3106: * should work, but most will be rejected later by checkEncoding().
3107: * <p>I don't currently detect EBCDIC, since I'm concerned that it
3108: * could also be a valid UTF-8 sequence; I'll have to do more checking
3109: * later.
3110: * @see #tryEncoding(byte[], byte, byte, byte, byte)
3111: * @see #tryEncoding(byte[], byte, byte)
3112: * @see #checkEncoding
3113: * @see #read8bitEncodingDeclaration
3114: */
3115: void detectEncoding() throws java.lang.Exception {
3116: byte signature[] = new byte[4];
3117:
3118: // Read the first four bytes for
3119: // autodetection.
3120: is.mark(4);
3121: is.read(signature);
3122: is.reset();
3123:
3124: // Look for a known signature.
3125: if (tryEncoding(signature, (byte) 0x00, (byte) 0x00,
3126: (byte) 0x00, (byte) 0x3c)) {
3127: // UCS-4 must begin with "<!XML"
3128: // 0x00 0x00 0x00 0x3c: UCS-4, big-endian (1234)
3129: encoding = ENCODING_UCS_4_1234;
3130: } else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x00,
3131: (byte) 0x00, (byte) 0x00)) {
3132: // UCS-4 must begin with "<!XML"
3133: // 0x3c 0x00 0x00 0x00: UCS-4, little-endian (4321)
3134: encoding = ENCODING_UCS_4_4321;
3135: } else if (tryEncoding(signature, (byte) 0x00, (byte) 0x00,
3136: (byte) 0x3c, (byte) 0x00)) {
3137: // UCS-4 must begin with "<!XML"
3138: // 0x00 0x00 0x3c 0x00: UCS-4, unusual (2143)
3139: encoding = ENCODING_UCS_4_2143;
3140: } else if (tryEncoding(signature, (byte) 0x00, (byte) 0x3c,
3141: (byte) 0x00, (byte) 0x00)) {
3142: // UCS-4 must begin with "<!XML"
3143: // 0x00 0x3c 0x00 0x00: UCS-4, unusual (3421)
3144: encoding = ENCODING_UCS_4_3412;
3145: } else if (tryEncoding(signature, (byte) 0xfe, (byte) 0xff)) {
3146: // UCS-2 with a byte-order marker.
3147: // 0xfe 0xff: UCS-2, big-endian (12)
3148: encoding = ENCODING_UCS_2_12;
3149: is.read();
3150: is.read();
3151: } else if (tryEncoding(signature, (byte) 0xff, (byte) 0xfe)) {
3152: // UCS-2 with a byte-order marker.
3153: // 0xff 0xfe: UCS-2, little-endian (21)
3154: encoding = ENCODING_UCS_2_21;
3155: is.read();
3156: is.read();
3157: } else if (tryEncoding(signature, (byte) 0x00, (byte) 0x3c,
3158: (byte) 0x00, (byte) 0x3f)) {
3159: // UCS-2 without a BOM must begin with "<?XML"
3160: // 0x00 0x3c 0x00 0x3f: UCS-2, big-endian, no byte-order mark
3161: encoding = ENCODING_UCS_2_12;
3162: error("no byte-order mark for UCS-2 entity", null, null);
3163: } else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x00,
3164: (byte) 0x3f, (byte) 0x00)) {
3165: // UCS-2 without a BOM must begin with "<?XML"
3166: // 0x3c 0x00 0x3f 0x00: UCS-2, little-endian, no byte-order mark
3167: encoding = ENCODING_UCS_2_21;
3168: error("no byte-order mark for UCS-2 entity", null, null);
3169: } else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x3f,
3170: (byte) 0x78, (byte) 0x6d)) {
3171: // Some kind of 8-bit encoding with "<?XML"
3172: // 0x3c 0x3f 0x78 0x6d: UTF-8 or other 8-bit markup (read ENCODING)
3173: encoding = ENCODING_UTF_8;
3174: read8bitEncodingDeclaration();
3175: } else {
3176: // Some kind of 8-bit encoding without "<?XML"
3177: // (otherwise) UTF-8 without encoding/XML declaration
3178: encoding = ENCODING_UTF_8;
3179: }
3180: }
3181:
3182: /**
3183: * Check for a four-byte signature.
3184: * <p>Utility routine for detectEncoding().
 * <p>Always looks for some part of "&lt;?xml" in a specific encoding.
3186: * @param sig The first four bytes read.
3187: * @param b1 The first byte of the signature
3188: * @param b2 The second byte of the signature
3189: * @param b3 The third byte of the signature
3190: * @param b4 The fourth byte of the signature
3191: * @see #detectEncoding
3192: */
3193: boolean tryEncoding(byte sig[], byte b1, byte b2, byte b3, byte b4) {
3194: return (sig[0] == b1 && sig[1] == b2 && sig[2] == b3 && sig[3] == b4);
3195: }
3196:
3197: /**
3198: * Check for a two-byte signature.
3199: * <p>Looks for a UCS-2 byte-order mark.
3200: * <p>Utility routine for detectEncoding().
3201: * @param sig The first four bytes read.
3202: * @param b1 The first byte of the signature
3203: * @param b2 The second byte of the signature
3204: * @see #detectEncoding
3205: */
3206: boolean tryEncoding(byte sig[], byte b1, byte b2) {
3207: return ((sig[0] == b1) && (sig[1] == b2));
3208: }
3209:
3210: /**
3211: * This method pushes a string back onto input.
3212: * <p>It is useful either as the expansion of an internal entity,
3213: * or for backtracking during the parse.
3214: * <p>Call pushCharArray() to do the actual work.
3215: * @param s The string to push back onto input.
3216: * @see #pushCharArray
3217: */
3218: void pushString(String ename, String s) throws java.lang.Exception {
3219: char ch[] = s.toCharArray();
3220: pushCharArray(ename, ch, 0, ch.length);
3221: }
3222:
3223: /**
3224: * Push a new internal input source.
3225: * <p>This method is useful for expanding an internal entity,
3226: * or for unreading a string of characters. It creates a new
3227: * readBuffer containing the characters in the array, instead
3228: * of characters converted from an input byte stream.
 * <p>NOTE: this comment used to describe optimisations for zero-length
 * and one-character strings (skip the push, or push back a single
 * character). The method as written does not special-case them: every
 * call saves the current input and installs the array as a new source.
3232: * @param ch The char array to push.
3233: * @see #pushString
3234: * @see #pushURL
3235: * @see #readBuffer
3236: * @see #sourceType
3237: * @see #pushInput
3238: */
3239: void pushCharArray(String ename, char ch[], int start, int length)
3240: throws java.lang.Exception {
3241: // Push the existing status
3242: pushInput(ename);
3243: sourceType = INPUT_INTERNAL;
3244: readBuffer = ch;
3245: readBufferPos = start;
3246: readBufferLength = length;
3247: readBufferOverflow = -1;
3248: }
3249:
3250: /**
3251: * Save the current input source onto the stack.
3252: * <p>This method saves all of the global variables associated with
3253: * the current input source, so that they can be restored when a new
3254: * input source has finished. It also tests for entity recursion.
 * <p>The method saves the following global variables onto a stack
 * using a fixed-length array:
 * <ol>
 * <li>sourceType
 * <li>externalEntity
 * <li>readBuffer
 * <li>readBufferPos
 * <li>readBufferLength
 * <li>line
 * <li>encoding
 * <li>readBufferOverflow
 * <li>is
 * <li>currentByteCount
 * <li>column
 * <li>reader
 * </ol>
3266: * @param ename The name of the entity (if any) causing the new input.
3267: * @see #popInput
3268: * @see #sourceType
3269: * @see #externalEntity
3270: * @see #readBuffer
3271: * @see #readBufferPos
3272: * @see #readBufferLength
3273: * @see #line
3274: * @see #encoding
3275: */
3276: void pushInput(String ename) throws java.lang.Exception {
3277: Object input[] = new Object[12];
3278:
3279: // Check for entity recursion.
3280: if (ename != null) {
3281: Enumeration entities = entityStack.elements();
3282: while (entities.hasMoreElements()) {
3283: String e = (String) entities.nextElement();
3284: if (e == ename) {
3285: error("recursive reference to entity", ename, null);
3286: }
3287: }
3288: }
3289: entityStack.push(ename);
3290:
3291: // Don't bother if there is no input.
3292: if (sourceType == INPUT_NONE) {
3293: return;
3294: }
3295:
3296: // Set up a snapshot of the current
3297: // input source.
3298: input[0] = new Integer(sourceType);
3299: input[1] = externalEntity;
3300: input[2] = readBuffer;
3301: input[3] = new Integer(readBufferPos);
3302: input[4] = new Integer(readBufferLength);
3303: input[5] = new Integer(line);
3304: input[6] = new Integer(encoding);
3305: input[7] = new Integer(readBufferOverflow);
3306: input[8] = is;
3307: input[9] = new Integer(currentByteCount);
3308: input[10] = new Integer(column);
3309: input[11] = reader;
3310:
3311: // Push it onto the stack.
3312: inputStack.push(input);
3313: }
3314:
3315: /**
3316: * Restore a previous input source.
3317: * <p>This method restores all of the global variables associated with
3318: * the current input source.
3319: * @exception java.io.EOFException
3320: * If there are no more entries on the input stack.
3321: * @see #pushInput
3322: * @see #sourceType
3323: * @see #externalEntity
3324: * @see #readBuffer
3325: * @see #readBufferPos
3326: * @see #readBufferLength
3327: * @see #line
3328: * @see #encoding
3329: */
3330: void popInput() throws java.lang.Exception {
3331: Object input[];
3332:
3333: switch (sourceType) {
3334:
3335: case INPUT_EXTERNAL:
3336: dataBufferFlush();
3337: if (handler != null && externalEntity != null) {
3338: handler.endExternalEntity(externalEntity.getURL()
3339: .toString());
3340: }
3341: break;
3342: case INPUT_STREAM:
3343: dataBufferFlush();
3344: if (baseURI != null) {
3345: if (handler != null) {
3346: handler.endExternalEntity(baseURI);
3347: }
3348: }
3349: break;
3350: case INPUT_READER:
3351: dataBufferFlush();
3352: if (baseURI != null) {
3353: if (handler != null) {
3354: handler.endExternalEntity(baseURI);
3355: }
3356: }
3357: break;
3358: }
3359:
3360: // Throw an EOFException if there
3361: // is nothing else to pop.
3362: if (inputStack.isEmpty()) {
3363: throw new EOFException();
3364: } else {
3365: String s;
3366: input = (Object[]) inputStack.pop();
3367: s = (String) entityStack.pop();
3368: }
3369:
3370: sourceType = ((Integer) input[0]).intValue();
3371: externalEntity = (URLConnection) input[1];
3372: readBuffer = (char[]) input[2];
3373: readBufferPos = ((Integer) input[3]).intValue();
3374: readBufferLength = ((Integer) input[4]).intValue();
3375: line = ((Integer) input[5]).intValue();
3376: encoding = ((Integer) input[6]).intValue();
3377: readBufferOverflow = ((Integer) input[7]).intValue();
3378: is = (InputStream) input[8];
3379: currentByteCount = ((Integer) input[9]).intValue();
3380: column = ((Integer) input[10]).intValue();
3381: reader = (Reader) input[11];
3382: }
3383:
3384: /**
3385: * Return true if we can read the expected character.
3386: * <p>Note that the character will be removed from the input stream
3387: * on success, but will be put back on failure. Do not attempt to
3388: * read the character again if the method succeeds.
3389: * @param delim The character that should appear next. For a
3390: * insensitive match, you must supply this in upper-case.
3391: * @return true if the character was successfully read, or false if
3392: * it was not.
3393: * @see #tryRead(String)
3394: */
3395: boolean tryRead(char delim) throws java.lang.Exception {
3396: char c;
3397:
3398: // Read the character
3399: c = readCh();
3400:
3401: // Test for a match, and push the character
3402: // back if the match fails.
3403: if (c == delim) {
3404: return true;
3405: } else {
3406: unread(c);
3407: return false;
3408: }
3409: }
3410:
3411: /**
3412: * Return true if we can read the expected string.
3413: * <p>This is simply a convenience method.
3414: * <p>Note that the string will be removed from the input stream
3415: * on success, but will be put back on failure. Do not attempt to
3416: * read the string again if the method succeeds.
3417: * <p>This method will push back a character rather than an
3418: * array whenever possible (probably the majority of cases).
3419: * <p><b>NOTE:</b> This method currently has a hard-coded limit
3420: * of 100 characters for the delimiter.
3421: * @param delim The string that should appear next.
3422: * @return true if the string was successfully read, or false if
3423: * it was not.
3424: * @see #tryRead(char)
3425: */
3426: boolean tryRead(String delim) throws java.lang.Exception {
3427: char ch[] = delim.toCharArray();
3428: char c;
3429:
3430: // Compare the input, character-
3431: // by character.
3432:
3433: for (int i = 0; i < ch.length; i++) {
3434: c = readCh();
3435: if (c != ch[i]) {
3436: unread(c);
3437: if (i != 0) {
3438: unread(ch, i);
3439: }
3440: return false;
3441: }
3442: }
3443: return true;
3444: }
3445:
3446: /**
3447: * Return true if we can read some whitespace.
3448: * <p>This is simply a convenience method.
3449: * <p>This method will push back a character rather than an
3450: * array whenever possible (probably the majority of cases).
3451: * @return true if whitespace was found.
3452: */
3453: boolean tryWhitespace() throws java.lang.Exception {
3454: char c;
3455: c = readCh();
3456: if (isWhitespace(c)) {
3457: skipWhitespace();
3458: return true;
3459: } else {
3460: unread(c);
3461: return false;
3462: }
3463: }
3464:
3465: /**
3466: * Read all data until we find the specified string.
3467: * <p>This is especially useful for scanning marked sections.
3468: * <p>This is a a little inefficient right now, since it calls tryRead()
3469: * for every character.
3470: * @param delim The string delimiter
3471: * @see #tryRead(String, boolean)
3472: * @see #readCh
3473: */
3474: void parseUntil(String delim) throws java.lang.Exception {
3475: char c;
3476: int startLine = line;
3477:
3478: try {
3479: while (!tryRead(delim)) {
3480: c = readCh();
3481: dataBufferAppend(c);
3482: }
3483: } catch (EOFException e) {
3484: error(
3485: "end of input while looking for delimiter (started on line "
3486: + startLine + ')', null, delim);
3487: }
3488: }
3489:
3490: /**
3491: * Skip all data until we find the specified string.
3492: * <p>This is especially useful for scanning comments.
3493: * <p>This is a a little inefficient right now, since it calls tryRead()
3494: * for every character.
3495: * @param delim The string delimiter
3496: * @see #tryRead(String, boolean)
3497: * @see #readCh
3498: */
3499: void skipUntil(String delim) throws java.lang.Exception {
3500: while (!tryRead(delim)) {
3501: readCh();
3502: }
3503: }
3504:
3505: /**
3506: * Read just the encoding declaration (or XML declaration) at the
3507: * start of an external entity.
3508: * When this method is called, we know that the declaration is
3509: * present (or appears to be). We also know that the entity is
3510: * in some sort of ASCII-derived 8-bit encoding.
3511: * The idea of this is to let us read what the 8-bit encoding is
3512: * before we've committed to converting any more of the file; the
3513: * XML or encoding declaration must be in 7-bit ASCII, so we're
3514: * safe as long as we don't go past it.
3515: */
3516: void read8bitEncodingDeclaration() throws java.lang.Exception {
3517: int ch;
3518: readBufferPos = readBufferLength = 0;
3519:
3520: while (true) {
3521: ch = is.read();
3522: readBuffer[readBufferLength++] = (char) ch;
3523: switch (ch) {
3524: case (int) '>':
3525: return;
3526: case -1:
3527: error(
3528: "end of file before end of XML or encoding declaration.",
3529: null, "?>");
3530: return;
3531: }
3532: if (readBuffer.length == readBufferLength) {
3533: error("unfinished XML or encoding declaration", null,
3534: null);
3535: }
3536: }
3537: }
3538:
3539: //////////////////////////////////////////////////////////////////////
3540: // Low-level I/O.
3541: //////////////////////////////////////////////////////////////////////
3542:
3543: /**
3544: * Read a chunk of data from an external input source.
3545: * <p>This is simply a front-end that fills the rawReadBuffer
3546: * with bytes, then calls the appropriate encoding handler.
3547: * @see #encoding
3548: * @see #rawReadBuffer
3549: * @see #readBuffer
3550: * @see #filterCR
3551: * @see #copyUtf8ReadBuffer
3552: * @see #copyIso8859_1ReadBuffer
3553: * @see #copyUcs_2ReadBuffer
3554: * @see #copyUcs_4ReadBuffer
3555: */
3556: void readDataChunk() throws java.lang.Exception {
3557: int count, i, j;
3558:
3559: // See if we have any overflow.
3560: if (readBufferOverflow > -1) {
3561: readBuffer[0] = (char) readBufferOverflow;
3562: readBufferOverflow = -1;
3563: readBufferPos = 1;
3564: sawCR = true;
3565: } else {
3566: readBufferPos = 0;
3567: sawCR = false;
3568: }
3569:
3570: // Special situation -- we're taking
3571: // input from a character stream.
3572: if (sourceType == INPUT_READER) {
3573: count = reader.read(readBuffer, readBufferPos,
3574: READ_BUFFER_MAX - 1);
3575: if (count < 0) {
3576: readBufferLength = -1;
3577: } else {
3578: readBufferLength = readBufferPos + count;
3579: filterCR();
3580: sawCR = false;
3581: }
3582: return;
3583: }
3584:
3585: // Read as many bytes as possible
3586: // into the read buffer.
3587: count = is.read(rawReadBuffer, 0, READ_BUFFER_MAX);
3588:
3589: // Dispatch to an encoding-specific
3590: // reader method to populate the
3591: // readBuffer.
3592: switch (encoding) {
3593: case ENCODING_UTF_8:
3594: copyUtf8ReadBuffer(count);
3595: break;
3596:
3597: case ENCODING_ISO_8859_1:
3598: copyIso8859_1ReadBuffer(count);
3599: break;
3600:
3601: case ENCODING_UCS_2_12:
3602: copyUcs2ReadBuffer(count, 8, 0);
3603: break;
3604:
3605: case ENCODING_UCS_2_21:
3606: copyUcs2ReadBuffer(count, 0, 8);
3607: break;
3608:
3609: case ENCODING_UCS_4_1234:
3610: copyUcs4ReadBuffer(count, 24, 16, 8, 0);
3611: break;
3612:
3613: case ENCODING_UCS_4_4321:
3614: copyUcs4ReadBuffer(count, 0, 8, 16, 24);
3615: break;
3616:
3617: case ENCODING_UCS_4_2143:
3618: copyUcs4ReadBuffer(count, 16, 24, 0, 8);
3619: break;
3620:
3621: case ENCODING_UCS_4_3412:
3622: copyUcs4ReadBuffer(count, 8, 0, 24, 16);
3623: break;
3624: }
3625:
3626: // Filter out all carriage returns
3627: // if we've seen any.
3628: if (sawCR) {
3629: filterCR();
3630: sawCR = false;
3631: }
3632:
3633: // Reset the position.
3634: readBufferPos = 0;
3635: currentByteCount += count;
3636: }
3637:
3638: /**
3639: * Filter carriage returns in the read buffer.
3640: * <p>CRLF becomes LF; CR becomes LF.
3641: * @see #readDataChunk
3642: * @see #readBuffer
3643: * @see #readBufferOverflow
3644: */
3645: void filterCR() {
3646: int i, j;
3647:
3648: readBufferOverflow = -1;
3649:
3650: loop: for (i = 0, j = 0; j < readBufferLength; i++, j++) {
3651: switch (readBuffer[j]) {
3652: case '\r':
3653: if (j == readBufferLength - 1) {
3654: readBufferOverflow = '\r';
3655: readBufferLength--;
3656: break loop;
3657: } else if (readBuffer[j + 1] == '\n') {
3658: j++;
3659: }
3660: readBuffer[i] = '\n';
3661: break;
3662:
3663: case '\n':
3664: default:
3665: readBuffer[i] = readBuffer[j];
3666: break;
3667: }
3668: }
3669: readBufferLength = i;
3670: }
3671:
3672: /**
3673: * Convert a buffer of UTF-8-encoded bytes into UTF-16 characters.
3674: * <p>When readDataChunk() calls this method, the raw bytes are in
3675: * rawReadBuffer, and the final characters will appear in
3676: * readBuffer.
3677: * <p>The tricky part of this is dealing with UTF-8 multi-byte
3678: * sequences, but it doesn't seem to slow things down too much.
3679: * @param count The number of bytes to convert.
3680: * @see #readDataChunk
3681: * @see #rawReadBuffer
3682: * @see #readBuffer
3683: * @see #getNextUtf8Byte
3684: */
3685: void copyUtf8ReadBuffer(int count) throws java.lang.Exception {
3686: int i = 0;
3687: int j = readBufferPos;
3688: int b1;
3689: boolean isSurrogate = false;
3690: while (i < count) {
3691: b1 = rawReadBuffer[i++];
3692: isSurrogate = false;
3693:
3694: // Determine whether we are dealing
3695: // with a one-, two-, three-, or four-
3696: // byte sequence.
3697: if ((b1 & 0x80) == 0) {
3698: // 1-byte sequence: 000000000xxxxxxx = 0xxxxxxx
3699: readBuffer[j++] = (char) b1;
3700: } else if ((b1 & 0xe0) == 0xc0) {
3701: // 2-byte sequence: 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
3702: readBuffer[j++] = (char) (((b1 & 0x1f) << 6) | getNextUtf8Byte(
3703: i++, count));
3704: } else if ((b1 & 0xf0) == 0xe0) {
3705: // 3-byte sequence: zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
3706: readBuffer[j++] = (char) (((b1 & 0x0f) << 12)
3707: | (getNextUtf8Byte(i++, count) << 6) | getNextUtf8Byte(
3708: i++, count));
3709: } else if ((b1 & 0xf8) == 0xf0) {
3710: // 4-byte sequence: 11101110wwwwzzzzyy + 110111yyyyxxxxxx
3711: // = 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
3712: // (uuuuu = wwww + 1)
3713: isSurrogate = true;
3714: int b2 = getNextUtf8Byte(i++, count);
3715: int b3 = getNextUtf8Byte(i++, count);
3716: int b4 = getNextUtf8Byte(i++, count);
3717: readBuffer[j++] = (char) (0xd800
3718: | ((((b1 & 0x07) << 2) | ((b2 & 0x30) >> 4) - 1) << 6)
3719: | ((b2 & 0x0f) << 2) | ((b3 & 0x30) >> 4));
3720: readBuffer[j++] = (char) (0xdc | ((b3 & 0x0f) << 6) | b4);
3721: // TODO: test that surrogate value is legal.
3722: } else {
3723: // Otherwise, the 8th bit may not be set in UTF-8
3724: encodingError(
3725: "bad start for UTF-8 multi-byte sequence", b1,
3726: i);
3727: }
3728: if (readBuffer[j - 1] == '\r') {
3729: sawCR = true;
3730: }
3731: }
3732: // How many characters have we read?
3733: readBufferLength = j;
3734: }
3735:
3736: /**
3737: * Return the next byte value in a UTF-8 sequence.
3738: * If it is not possible to get a byte from the current
3739: * entity, throw an exception.
3740: * @param pos The current position in the rawReadBuffer.
3741: * @param count The number of bytes in the rawReadBuffer
3742: * @return The significant six bits of a non-initial byte in
3743: * a UTF-8 sequence.
3744: * @exception EOFException If the sequence is incomplete.
3745: */
3746: int getNextUtf8Byte(int pos, int count) throws java.lang.Exception {
3747: int val;
3748:
3749: // Take a character from the buffer
3750: // or from the actual input stream.
3751: if (pos < count) {
3752: val = rawReadBuffer[pos];
3753: } else {
3754: val = is.read();
3755: if (val == -1) {
3756: encodingError(
3757: "unfinished multi-byte UTF-8 sequence at EOF",
3758: -1, pos);
3759: }
3760: }
3761:
3762: // Check for the correct bits at the
3763: // start.
3764: if ((val & 0xc0) != 0x80) {
3765: encodingError(
3766: "bad continuation of multi-byte UTF-8 sequence",
3767: val, pos + 1);
3768: }
3769:
3770: // Return the significant bits.
3771: return (val & 0x3f);
3772: }
3773:
3774: /**
3775: * Convert a buffer of ISO-8859-1-encoded bytes into UTF-16 characters.
3776: * <p>When readDataChunk() calls this method, the raw bytes are in
3777: * rawReadBuffer, and the final characters will appear in
3778: * readBuffer.
3779: * <p>This is a direct conversion, with no tricks.
3780: * @param count The number of bytes to convert.
3781: * @see #readDataChunk
3782: * @see #rawReadBuffer
3783: * @see #readBuffer
3784: */
3785: void copyIso8859_1ReadBuffer(int count) {
3786: int i, j;
3787: for (i = 0, j = readBufferPos; i < count; i++, j++) {
3788: readBuffer[j] = (char) (rawReadBuffer[i] & 0xff);
3789: if (readBuffer[j] == '\r') {
3790: sawCR = true;
3791: }
3792: }
3793: readBufferLength = j;
3794: }
3795:
3796: /**
3797: * Convert a buffer of UCS-2-encoded bytes into UTF-16 characters.
3798: * <p>When readDataChunk() calls this method, the raw bytes are in
3799: * rawReadBuffer, and the final characters will appear in
3800: * readBuffer.
3801: * @param count The number of bytes to convert.
3802: * @param shift1 The number of bits to shift byte 1.
3803: * @param shift2 The number of bits to shift byte 2
3804: * @see #readDataChunk
3805: * @see #rawReadBuffer
3806: * @see #readBuffer
3807: */
3808: void copyUcs2ReadBuffer(int count, int shift1, int shift2)
3809: throws java.lang.Exception {
3810: int j = readBufferPos;
3811:
3812: if (count > 0 && (count % 2) != 0) {
3813: encodingError("odd number of bytes in UCS-2 encoding", -1,
3814: count);
3815: }
3816: for (int i = 0; i < count; i += 2) {
3817: readBuffer[j++] = (char) (((rawReadBuffer[i] & 0xff) << shift1) | ((rawReadBuffer[i + 1] & 0xff) << shift2));
3818: if (readBuffer[j - 1] == '\r') {
3819: sawCR = true;
3820: }
3821: }
3822: readBufferLength = j;
3823: }
3824:
3825: /**
3826: * Convert a buffer of UCS-4-encoded bytes into UTF-16 characters.
3827: * <p>When readDataChunk() calls this method, the raw bytes are in
3828: * rawReadBuffer, and the final characters will appear in
3829: * readBuffer.
3830: * <p>Java has 16-bit chars, but this routine will attempt to use
3831: * surrogates to encoding values between 0x00010000 and 0x000fffff.
3832: * @param count The number of bytes to convert.
3833: * @param shift1 The number of bits to shift byte 1.
3834: * @param shift2 The number of bits to shift byte 2
3835: * @param shift3 The number of bits to shift byte 2
3836: * @param shift4 The number of bits to shift byte 2
3837: * @see #readDataChunk
3838: * @see #rawReadBuffer
3839: * @see #readBuffer
3840: */
3841: void copyUcs4ReadBuffer(int count, int shift1, int shift2,
3842: int shift3, int shift4) throws java.lang.Exception {
3843: int j = readBufferPos;
3844: int value;
3845:
3846: if (count > 0 && (count % 4) != 0) {
3847: encodingError(
3848: "number of bytes in UCS-4 encoding not divisible by 4",
3849: -1, count);
3850: }
3851: for (int i = 0; i < count; i += 4) {
3852: value = (((rawReadBuffer[i] & 0xff) << shift1)
3853: | ((rawReadBuffer[i + 1] & 0xff) << shift2)
3854: | ((rawReadBuffer[i + 2] & 0xff) << shift3) | ((rawReadBuffer[i + 3] & 0xff) << shift4));
3855: if (value < 0x0000ffff) {
3856: readBuffer[j++] = (char) value;
3857: if (value == (int) '\r') {
3858: sawCR = true;
3859: }
3860: } else if (value < 0x000fffff) {
3861: readBuffer[j++] = (char) (0xd8 | ((value & 0x000ffc00) >> 10));
3862: readBuffer[j++] = (char) (0xdc | (value & 0x0003ff));
3863: } else {
3864: encodingError("value cannot be represented in UTF-16",
3865: value, i);
3866: }
3867: }
3868: readBufferLength = j;
3869: }
3870:
3871: /**
3872: * Report a character encoding error.
3873: */
3874: void encodingError(String message, int value, int offset)
3875: throws java.lang.Exception {
3876: String uri;
3877:
3878: if (value >= 0) {
3879: message = message + " (byte value: 0x"
3880: + Integer.toHexString(value) + ')';
3881: }
3882: if (externalEntity != null) {
3883: uri = externalEntity.getURL().toString();
3884: } else {
3885: uri = baseURI;
3886: }
3887: handler.error(message, uri, -1, offset + currentByteCount);
3888: }
3889:
3890: //////////////////////////////////////////////////////////////////////
3891: // Local Variables.
3892: //////////////////////////////////////////////////////////////////////
3893:
3894: /**
3895: * Re-initialize the variables for each parse.
3896: */
3897: void initializeVariables() {
3898: // No errors; first line
3899: errorCount = 0;
3900: line = 1;
3901: column = 0;
3902:
3903: // Set up the buffers for data and names
3904: dataBufferPos = 0;
3905: dataBuffer = new char[DATA_BUFFER_INITIAL];
3906: nameBufferPos = 0;
3907: nameBuffer = new char[NAME_BUFFER_INITIAL];
3908:
3909: // Set up the DTD hash tables
3910: elementInfo = new Hashtable();
3911: entityInfo = new Hashtable();
3912: notationInfo = new Hashtable();
3913:
3914: // Set up the variables for the current
3915: // element context.
3916: currentElement = null;
3917: currentElementContent = CONTENT_UNDECLARED;
3918:
3919: // Set up the input variables
3920: sourceType = INPUT_NONE;
3921: inputStack = new Stack();
3922: entityStack = new Stack();
3923: externalEntity = null;
3924: tagAttributePos = 0;
3925: tagAttributes = new String[100];
3926: rawReadBuffer = new byte[READ_BUFFER_MAX];
3927: readBufferOverflow = -1;
3928:
3929: context = CONTEXT_NONE;
3930:
3931: symbolTable = new Object[SYMBOL_TABLE_LENGTH];
3932: }
3933:
3934: /**
3935: * Clean up after the parse to allow some garbage collection.
3936: * Leave around anything that might be useful for queries.
3937: */
3938: void cleanupVariables() {
3939: errorCount = -1;
3940: line = -1;
3941: column = -1;
3942: dataBuffer = null;
3943: nameBuffer = null;
3944: currentElement = null;
3945: currentElementContent = CONTENT_UNDECLARED;
3946: sourceType = INPUT_NONE;
3947: inputStack = null;
3948: externalEntity = null;
3949: entityStack = null;
3950: }
3951:
//
// The current XML handler interface. May be null: popInput() checks
// for that before making callbacks.
//
XmlHandler handler;

//
// I/O information. All of these except inputStack are saved by
// pushInput() and restored by popInput().
//
private Reader reader; // current reader
private InputStream is; // current input stream
private int line; // current line number
private int column; // current column number
private int sourceType; // type of input source (one of the INPUT_ constants)
private Stack inputStack; // stack of input sources
private URLConnection externalEntity; // current external entity
private int encoding; // current character encoding (one of the ENCODING_ constants).
private int currentByteCount; // how many bytes read from current source.

//
// Maintain a count of errors.
//
private int errorCount;

//
// Buffers for decoded but unparsed character input.
// readBufferLength marks the end of the valid characters in
// readBuffer; readDataChunk() sets it to -1 when a Reader has no
// more input.
//
private final static int READ_BUFFER_MAX = 16384;
private char readBuffer[];
private int readBufferPos;
private int readBufferLength;
// A trailing CR held back by filterCR() until the next chunk
// arrives, or -1 if there is none.
private int readBufferOverflow; // overflow character from last data chunk.

//
// Buffer for undecoded raw byte input (READ_BUFFER_MAX bytes,
// allocated in initializeVariables()).
//
private byte rawReadBuffer[];

//
// Buffer for parsed character data.
// NOTE(review): DATA_BUFFER_INITIAL is static but not final, unlike
// READ_BUFFER_MAX -- confirm whether it is meant to be a constant.
//
private static int DATA_BUFFER_INITIAL = 4096;
private char dataBuffer[];
private int dataBufferPos;

//
// Buffer for parsed names.
// NOTE(review): NAME_BUFFER_INITIAL is static but not final -- confirm
// whether it is meant to be a constant.
//
private static int NAME_BUFFER_INITIAL = 1024;
private char nameBuffer[];
private int nameBufferPos;

//
// Hashtables for DTD information on elements, entities, and notations.
//
private Hashtable elementInfo;
private Hashtable entityInfo;
private Hashtable notationInfo;

//
// Element type currently in force.
//
private String currentElement;
private int currentElementContent;

//
// Base external identifiers for resolution.
//
private String basePublicId;
private String baseURI;
private int baseEncoding;
private Reader baseReader;
private InputStream baseInputStream;
private char baseInputBuffer[];
private int baseInputBufferStart;
private int baseInputBufferLength;

//
// Stack of entity names, to help detect recursion. pushInput()
// pushes one entry (possibly null) per call.
//
private Stack entityStack;

//
// Are we in a context where PEs are allowed?
//
private int context;

//
// Symbol table, for internalising names.
//
private Object symbolTable[];
private final static int SYMBOL_TABLE_LENGTH = 1087;

//
// Table of attributes found in current start tag (a String array,
// allocated with 100 slots in initializeVariables()).
//
private String tagAttributes[];
private int tagAttributePos;

//
// Utility flag: have we noticed a CR while reading the last
// data chunk? If so, we will have to go back and normalise
// CR/LF.
//
private boolean sawCR;
4056: }
|