0001: package com.jclark.xml.tok;
0002:
0003: /**
0004: * An <code>Encoding</code> object corresponds to a possible
0005: * encoding (a mapping from characters to sequences of bytes).
0006: * It provides operations on byte arrays
0007: * that represent all or part of a parsed XML entity in that encoding.
0008: * <p>
0009: * The ASCII characters excluding <code>$@\^`{}~</code>
0010: * have a special status; these are called <i>XML significant</i>
0011: * characters.
0012: * <p>
0013: * This class imposes certain restrictions on an encoding:
0014: * <ul>
0015: * <li>the encoding must be stateless;
0016: * <li>a single byte must not encode more than one character;
0017: * <li>all XML significant characters must be encoded by the same number
0018: * of bytes, and no character may be encoded by fewer bytes.
0019: * </ul>
0020: * <p>
0021: * Several methods operate on byte subarrays. The subarray is specified
0022: * by a byte array <code>buf</code> and two integers,
0023: * <code>off</code> and <code>end</code>; <code>off</code>
0024: * gives the index in <code>buf</code> of the first byte of the subarray
0025: * and <code>end</code> gives the
0026: * index in <code>buf</code> of the byte immediately after the last byte.
0027: * <p>
0028: * Use the <code>getInitialEncoding</code> method to get an
0029: * <code>Encoding</code> object to use to start parsing an entity.
0030: * <p>
0031: * The main operations provided by <code>Encoding</code> are
0032: * <code>tokenizeProlog</code>, <code>tokenizeContent</code> and
0033: * <code>tokenizeCdataSection</code>;
0034: * these are used to divide up an XML entity into tokens.
0035: * <code>tokenizeProlog</code> is used for the prolog of an XML document
0036: * as well as for the external subset and parameter entities (except
0037: * when referenced in an <code>EntityValue</code>);
0038: * it can also be used for parsing the <code>Misc</code>* that follows
0039: * the document element.
0040: * <code>tokenizeContent</code> is used for the document element and for
0041: * parsed general entities that are referenced in <code>content</code>
0042: * except for CDATA sections.
0043: * <code>tokenizeCdataSection</code> is used for CDATA sections, following
0044: * the <code>&lt;![CDATA[</code> up to and including the <code>]]&gt;</code>.
0045: * <p>
0046: * <code>tokenizeAttributeValue</code> and <code>tokenizeEntityValue</code>
0047: * are used to further divide up tokens returned by <code>tokenizeProlog</code>
0048: * and <code>tokenizeContent</code>; they are also used to divide up entities
0049: * referenced in attribute values or entity values.
0050: * @version $Revision: 1.15 $ $Date: 1998/12/28 08:05:18 $
0051: */
0052:
0053: public abstract class Encoding {
0054: /**
0055: * Represents one or more characters of data.
0056: */
0057: public static final int TOK_DATA_CHARS = 0;
0058:
0059: /**
0060: * Represents a newline (CR, LF or CR followed by LF) in data.
0061: */
0062: public static final int TOK_DATA_NEWLINE = TOK_DATA_CHARS + 1;
0063:
0064: /**
0065: * Represents a complete start-tag <code><name></code>,
0066: * that doesn't have any attribute specifications.
0067: */
0068: public static final int TOK_START_TAG_NO_ATTS = TOK_DATA_NEWLINE + 1;
0069:
0070: /**
0071: * Represents a complete start-tag <code><name att="val"></code>,
0072: * that contains one or more attribute specifications.
0073: */
0074: public static final int TOK_START_TAG_WITH_ATTS = TOK_START_TAG_NO_ATTS + 1;
0075:
0076: /**
0077: * Represents an empty element tag <code><name/></code>,
0078: * that doesn't have any attribute specifications.
0079: */
0080: public static final int TOK_EMPTY_ELEMENT_NO_ATTS = TOK_START_TAG_WITH_ATTS + 1;
0081:
0082: /**
0083: * Represents an empty element tag <code><name att="val"/></code>,
0084: * that contains one or more attribute specifications.
0085: */
0086: public static final int TOK_EMPTY_ELEMENT_WITH_ATTS = TOK_EMPTY_ELEMENT_NO_ATTS + 1;
0087:
0088: /**
0089: * Represents a complete end-tag <code></name></code>.
0090: */
0091: public static final int TOK_END_TAG = TOK_EMPTY_ELEMENT_WITH_ATTS + 1;
0092:
0093: /**
0094: * Represents the start of a CDATA section <code><![CDATA[</code>.
0095: */
0096: public static final int TOK_CDATA_SECT_OPEN = TOK_END_TAG + 1;
0097:
0098: /**
0099: * Represents the end of a CDATA section <code>]]></code>.
0100: */
0101: public static final int TOK_CDATA_SECT_CLOSE = TOK_CDATA_SECT_OPEN + 1;
0102:
0103: /**
0104: * Represents a general entity reference.
0105: */
0106: public static final int TOK_ENTITY_REF = TOK_CDATA_SECT_CLOSE + 1;
0107:
0108: /**
0109: * Represents a general entity reference to a one of the 5 predefined
0110: * entities <code>amp</code>, <code>lt</code>, <code>gt</code>,
0111: * <code>quot</code>, <code>apos</code>.
0112: */
0113: public static final int TOK_MAGIC_ENTITY_REF = TOK_ENTITY_REF + 1;
0114:
0115: /**
0116: * Represents a numeric character reference (decimal or hexadecimal),
0117: * when the referenced character is less than or equal to 0xFFFF
0118: * and so is represented by a single char.
0119: */
0120: public static final int TOK_CHAR_REF = TOK_MAGIC_ENTITY_REF + 1;
0121:
0122: /**
0123: * Represents a numeric character reference (decimal or hexadecimal),
0124: * when the referenced character is greater than 0xFFFF and so is
0125: * represented by a pair of chars.
0126: */
0127: public static final int TOK_CHAR_PAIR_REF = TOK_CHAR_REF + 1;
0128:
0129: /**
0130: * Represents a processing instruction.
0131: */
0132: public static final int TOK_PI = TOK_CHAR_PAIR_REF + 1;
0133:
0134: /**
0135: * Represents an XML declaration or text declaration (a processing
0136: * instruction whose target is <code>xml</code>).
0137: */
0138: public static final int TOK_XML_DECL = TOK_PI + 1;
0139:
0140: /**
0141: * Represents a comment <code><!-- comment --></code>.
0142: * This can occur both in the prolog and in content.
0143: */
0144: public static final int TOK_COMMENT = TOK_XML_DECL + 1;
0145:
0146: /**
0147: * Represents a white space character in an attribute value,
0148: * excluding white space characters that are part of line boundaries.
0149: */
0150: public static final int TOK_ATTRIBUTE_VALUE_S = TOK_COMMENT + 1;
0151:
0152: /**
0153: * Represents a parameter entity reference in the prolog.
0154: */
0155: public static final int TOK_PARAM_ENTITY_REF = TOK_ATTRIBUTE_VALUE_S + 1;
0156:
0157: /**
0158: * Represents whitespace in the prolog.
0159: * The token contains one or more whitespace characters.
0160: */
0161: public static final int TOK_PROLOG_S = TOK_PARAM_ENTITY_REF + 1;
0162:
0163: /**
0164: * Represents <code><!NAME</code> in the prolog.
0165: */
0166: public static final int TOK_DECL_OPEN = TOK_PROLOG_S + 1;
0167:
0168: /**
0169: * Represents <code>></code> in the prolog.
0170: */
0171: public static final int TOK_DECL_CLOSE = TOK_DECL_OPEN + 1;
0172:
0173: /**
0174: * Represents a name in the prolog.
0175: */
0176: public static final int TOK_NAME = TOK_DECL_CLOSE + 1;
0177:
0178: /**
0179: * Represents a name token in the prolog that is not a name.
0180: */
0181: public static final int TOK_NMTOKEN = TOK_NAME + 1;
0182:
0183: /**
0184: * Represents <code>#NAME</code> in the prolog.
0185: */
0186: public static final int TOK_POUND_NAME = TOK_NMTOKEN + 1;
0187:
0188: /**
0189: * Represents <code>|</code> in the prolog.
0190: */
0191: public static final int TOK_OR = TOK_POUND_NAME + 1;
0192:
0193: /**
0194: * Represents a <code>%</code> in the prolog that does not start
0195: * a parameter entity reference.
0196: * This can occur in an entity declaration.
0197: */
0198: public static final int TOK_PERCENT = TOK_OR + 1;
0199:
0200: /**
0201: * Represents a <code>(</code> in the prolog.
0202: */
0203: public static final int TOK_OPEN_PAREN = TOK_PERCENT + 1;
0204:
0205: /**
0206: * Represents a <code>)</code> in the prolog that is not
0207: * followed immediately by any of
0208: * <code>*</code>, <code>+</code> or <code>?</code>.
0209: */
0210: public static final int TOK_CLOSE_PAREN = TOK_OPEN_PAREN + 1;
0211:
0212: /**
0213: * Represents <code>[</code> in the prolog.
0214: */
0215: public static final int TOK_OPEN_BRACKET = TOK_CLOSE_PAREN + 1;
0216:
0217: /**
0218: * Represents <code>]</code> in the prolog.
0219: */
0220: public static final int TOK_CLOSE_BRACKET = TOK_OPEN_BRACKET + 1;
0221:
0222: /**
0223: * Represents a literal (EntityValue, AttValue, SystemLiteral or
0224: * PubidLiteral).
0225: */
0226: public static final int TOK_LITERAL = TOK_CLOSE_BRACKET + 1;
0227:
0228: /**
0229: * Represents a name followed immediately by <code>?</code>.
0230: */
0231: public static final int TOK_NAME_QUESTION = TOK_LITERAL + 1;
0232:
0233: /**
0234: * Represents a name followed immediately by <code>*</code>.
0235: */
0236: public static final int TOK_NAME_ASTERISK = TOK_NAME_QUESTION + 1;
0237:
0238: /**
0239: * Represents a name followed immediately by <code>+</code>.
0240: */
0241: public static final int TOK_NAME_PLUS = TOK_NAME_ASTERISK + 1;
0242:
0243: /**
0244: * Represents <code><![</code> in the prolog.
0245: */
0246: public static final int TOK_COND_SECT_OPEN = TOK_NAME_PLUS + 1;
0247:
0248: /**
0249: * Represents <code>]]></code> in the prolog.
0250: */
0251: public static final int TOK_COND_SECT_CLOSE = TOK_COND_SECT_OPEN + 1;
0252:
0253: /**
0254: * Represents <code>)?</code> in the prolog.
0255: */
0256: public static final int TOK_CLOSE_PAREN_QUESTION = TOK_COND_SECT_CLOSE + 1;
0257:
0258: /**
0259: * Represents <code>)*</code> in the prolog.
0260: */
0261: public static final int TOK_CLOSE_PAREN_ASTERISK = TOK_CLOSE_PAREN_QUESTION + 1;
0262:
0263: /**
0264: * Represents <code>)+</code> in the prolog.
0265: */
0266: public static final int TOK_CLOSE_PAREN_PLUS = TOK_CLOSE_PAREN_ASTERISK + 1;
0267:
0268: /**
0269: * Represents <code>,</code> in the prolog.
0270: */
0271: public static final int TOK_COMMA = TOK_CLOSE_PAREN_PLUS + 1;
0272:
0273: /**
0274: * Convert bytes to characters.
0275: * The bytes on <code>sourceBuf</code> between <code>sourceStart</code>
0276: * and <code>sourceEnd</code> are converted to characters and stored
0277: * in <code>targetBuf</code> starting at <code>targetStart</code>.
0278: * <code>(targetBuf.length - targetStart) * getMinBytesPerChar()</code>
0279: * must be at greater than or equal to
0280: * <code>sourceEnd - sourceStart</code>.
0281: * If <code>getFixedBytesPerChar</code> returns a value greater than 0,
0282: * then the return value will be equal to
0283: * <code>(sourceEnd - sourceStart)/getFixedBytesPerChar()</code>.
0284: * @return the number of characters stored into <code>targetBuf</code>
0285: * @see #getFixedBytesPerChar
0286: */
0287: public abstract int convert(byte[] sourceBuf, int sourceStart,
0288: int sourceEnd, char[] targetBuf, int targetStart);
0289:
0290: /**
0291: * Returns the number of bytes required to represent each <code>char</code>,
0292: * or zero if different <code>char</code>s are represented by different
0293: * numbers of bytes. The value returned will 0, 1, 2, or 4.
0294: */
0295: public abstract int getFixedBytesPerChar();
0296:
0297: private static Encoding utf8Encoding;
0298: private static Encoding utf16LittleEndianEncoding;
0299: private static Encoding utf16BigEndianEncoding;
0300: private static Encoding internalEncoding;
0301: private static Encoding iso8859_1Encoding;
0302: private static Encoding asciiEncoding;
0303:
0304: private static final byte UTF8_ENCODING = 0;
0305: private static final byte UTF16_LITTLE_ENDIAN_ENCODING = 1;
0306: private static final byte UTF16_BIG_ENDIAN_ENCODING = 2;
0307: private static final byte INTERNAL_ENCODING = 3;
0308: private static final byte ISO8859_1_ENCODING = 4;
0309: private static final byte ASCII_ENCODING = 5;
0310:
0311: private static synchronized Encoding getEncoding(byte enc) {
0312: switch (enc) {
0313: case UTF8_ENCODING:
0314: if (utf8Encoding == null)
0315: utf8Encoding = new UTF8Encoding();
0316: return utf8Encoding;
0317: case UTF16_LITTLE_ENDIAN_ENCODING:
0318: if (utf16LittleEndianEncoding == null)
0319: utf16LittleEndianEncoding = new UTF16LittleEndianEncoding();
0320: return utf16LittleEndianEncoding;
0321: case UTF16_BIG_ENDIAN_ENCODING:
0322: if (utf16BigEndianEncoding == null)
0323: utf16BigEndianEncoding = new UTF16BigEndianEncoding();
0324: return utf16BigEndianEncoding;
0325: case INTERNAL_ENCODING:
0326: if (internalEncoding == null)
0327: internalEncoding = new InternalEncoding();
0328: return internalEncoding;
0329: case ISO8859_1_ENCODING:
0330: if (iso8859_1Encoding == null)
0331: iso8859_1Encoding = new ISO8859_1Encoding();
0332: return iso8859_1Encoding;
0333: case ASCII_ENCODING:
0334: if (asciiEncoding == null)
0335: asciiEncoding = new ASCIIEncoding();
0336: return asciiEncoding;
0337: }
0338: return null;
0339: }
0340:
0341: Encoding getUTF16Encoding() {
0342: return getEncoding(UTF16_BIG_ENDIAN_ENCODING);
0343: }
0344:
0345: // Bytes with type < 0 may not be data in content.
0346: // The negation of the lead byte type gives the total number of bytes.
0347: static final int BT_LEAD2 = -2;
0348: static final int BT_LEAD3 = BT_LEAD2 - 1;
0349: static final int BT_LEAD4 = BT_LEAD3 - 1;
0350: static final int BT_NONXML = BT_LEAD4 - 1;
0351: static final int BT_MALFORM = BT_NONXML - 1;
0352: static final int BT_LT = BT_MALFORM - 1;
0353: static final int BT_AMP = BT_LT - 1;
0354: static final int BT_RSQB = BT_AMP - 1;
0355: static final int BT_CR = BT_RSQB - 1;
0356: static final int BT_LF = BT_CR - 1;
0357: // Bytes with type >= 0 are treated as data in content.
0358: static final int BT_GT = 0;
0359: static final int BT_QUOT = BT_GT + 1;
0360: static final int BT_APOS = BT_QUOT + 1;
0361: static final int BT_EQUALS = BT_APOS + 1;
0362: static final int BT_QUEST = BT_EQUALS + 1;
0363: static final int BT_EXCL = BT_QUEST + 1;
0364: static final int BT_SOL = BT_EXCL + 1;
0365: static final int BT_SEMI = BT_SOL + 1;
0366: static final int BT_NUM = BT_SEMI + 1;
0367: static final int BT_LSQB = BT_NUM + 1;
0368: static final int BT_S = BT_LSQB + 1;
0369: static final int BT_NMSTRT = BT_S + 1;
0370: static final int BT_NAME = BT_NMSTRT + 1;
0371: static final int BT_MINUS = BT_NAME + 1;
0372: static final int BT_OTHER = BT_MINUS + 1;
0373: static final int BT_PERCNT = BT_OTHER + 1;
0374: static final int BT_LPAR = BT_PERCNT + 1;
0375: static final int BT_RPAR = BT_LPAR + 1;
0376: static final int BT_AST = BT_RPAR + 1;
0377: static final int BT_PLUS = BT_AST + 1;
0378: static final int BT_COMMA = BT_PLUS + 1;
0379: static final int BT_VERBAR = BT_COMMA + 1;
0380:
0381: final static byte[] asciiTypeTable = {
0382: /* 0x00 */BT_NONXML, BT_NONXML, BT_NONXML, BT_NONXML,
0383: /* 0x04 */BT_NONXML, BT_NONXML, BT_NONXML, BT_NONXML,
0384: /* 0x08 */BT_NONXML, BT_S, BT_LF, BT_NONXML,
0385: /* 0x0C */BT_NONXML, BT_CR, BT_NONXML, BT_NONXML,
0386: /* 0x10 */BT_NONXML, BT_NONXML, BT_NONXML, BT_NONXML,
0387: /* 0x14 */BT_NONXML, BT_NONXML, BT_NONXML, BT_NONXML,
0388: /* 0x18 */BT_NONXML, BT_NONXML, BT_NONXML, BT_NONXML,
0389: /* 0x1C */BT_NONXML, BT_NONXML, BT_NONXML, BT_NONXML,
0390: /* 0x20 */BT_S, BT_EXCL, BT_QUOT, BT_NUM,
0391: /* 0x24 */BT_OTHER, BT_PERCNT, BT_AMP, BT_APOS,
0392: /* 0x28 */BT_LPAR, BT_RPAR, BT_AST, BT_PLUS,
0393: /* 0x2C */BT_COMMA, BT_MINUS, BT_NAME, BT_SOL,
0394: /* 0x30 */BT_NAME, BT_NAME, BT_NAME, BT_NAME,
0395: /* 0x34 */BT_NAME, BT_NAME, BT_NAME, BT_NAME,
0396: /* 0x38 */BT_NAME, BT_NAME, BT_NMSTRT, BT_SEMI,
0397: /* 0x3C */BT_LT, BT_EQUALS, BT_GT, BT_QUEST,
0398: /* 0x40 */BT_OTHER, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT,
0399: /* 0x44 */BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT,
0400: /* 0x48 */BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT,
0401: /* 0x4C */BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT,
0402: /* 0x50 */BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT,
0403: /* 0x54 */BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT,
0404: /* 0x58 */BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_LSQB,
0405: /* 0x5C */BT_OTHER, BT_RSQB, BT_OTHER, BT_NMSTRT,
0406: /* 0x60 */BT_OTHER, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT,
0407: /* 0x64 */BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT,
0408: /* 0x68 */BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT,
0409: /* 0x6C */BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT,
0410: /* 0x70 */BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT,
0411: /* 0x74 */BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_NMSTRT,
0412: /* 0x78 */BT_NMSTRT, BT_NMSTRT, BT_NMSTRT, BT_OTHER,
0413: /* 0x7C */BT_VERBAR, BT_OTHER, BT_OTHER, BT_OTHER };
0414:
0415: // The minimum number of bytes per character.
0416: private/* final */int minBPC;
0417:
0418: Encoding(int minBPC) {
0419: this .minBPC = minBPC;
0420: }
0421:
0422: // There are guaranteed to be minBPC available bytes starting at off.
0423: abstract int byteType(byte[] buf, int off);
0424:
0425: abstract int byteToAscii(byte[] buf, int off);
0426:
0427: // This must only be called when c is an (XML significant) ASCII character.
0428: abstract boolean charMatches(byte[] buf, int off, char c);
0429:
0430: // Called only when byteType(buf, off) == BT_LEAD2
0431: int byteType2(byte[] buf, int off) {
0432: return BT_OTHER;
0433: }
0434:
0435: // Called only when byteType(buf, off) == BT_LEAD3
0436: int byteType3(byte[] buf, int off) {
0437: return BT_OTHER;
0438: }
0439:
0440: // Called only when byteType(buf, off) == BT_LEAD4
0441: int byteType4(byte[] buf, int off) {
0442: return BT_OTHER;
0443: }
0444:
0445: void check2(byte[] buf, int off) throws InvalidTokenException {
0446: }
0447:
0448: void check3(byte[] buf, int off) throws InvalidTokenException {
0449: }
0450:
0451: void check4(byte[] buf, int off) throws InvalidTokenException {
0452: }
0453:
0454: /**
0455: * Moves a position forward.
0456: * On entry, <code>pos</code> gives the position of the byte at index
0457: * <code>off</code> in <code>buf</code>.
0458: * On exit, it <code>pos</code> will give the position of the byte at index
0459: * <code>end</code>, which must be greater than or equal to <code>off</code>.
0460: * The bytes between <code>off</code> and <code>end</code> must encode
0461: * one or more complete characters.
0462: * A carriage return followed by a line feed will be treated as a single
0463: * line delimiter provided that they are given to <code>movePosition</code>
0464: * together.
0465: */
0466: public abstract void movePosition(byte[] buf, int off, int end,
0467: Position pos);
0468:
0469: // end encoding specific part
0470:
0471: private final void checkCharMatches(byte[] buf, int off, char c)
0472: throws InvalidTokenException {
0473: if (!charMatches(buf, off, c))
0474: throw new InvalidTokenException(off);
0475: }
0476:
0477: /* off points to character following "<!-" */
0478:
0479: private final int scanComment(byte[] buf, int off, int end,
0480: Token token) throws InvalidTokenException,
0481: PartialTokenException {
0482: if (off != end) {
0483: checkCharMatches(buf, off, '-');
0484: off += minBPC;
0485: while (off != end) {
0486: switch (byteType(buf, off)) {
0487: case BT_LEAD2:
0488: if (end - off < 2)
0489: throw new PartialCharException(off);
0490: check2(buf, off);
0491: off += 2;
0492: break;
0493: case BT_LEAD3:
0494: if (end - off < 3)
0495: throw new PartialCharException(off);
0496: check3(buf, off);
0497: off += 3;
0498: break;
0499: case BT_LEAD4:
0500: if (end - off < 4)
0501: throw new PartialCharException(off);
0502: check4(buf, off);
0503: off += 4;
0504: break;
0505: case BT_NONXML:
0506: case BT_MALFORM:
0507: throw new InvalidTokenException(off);
0508: case BT_MINUS:
0509: if ((off += minBPC) == end)
0510: throw new PartialTokenException();
0511: if (charMatches(buf, off, '-')) {
0512: if ((off += minBPC) == end)
0513: throw new PartialTokenException();
0514: checkCharMatches(buf, off, '>');
0515: token.tokenEnd = off + minBPC;
0516: return TOK_COMMENT;
0517: }
0518: break;
0519: default:
0520: off += minBPC;
0521: break;
0522: }
0523: }
0524: }
0525: throw new PartialTokenException();
0526: }
0527:
0528: /* off points to character following "<!" */
0529:
0530: private final int scanDecl(byte[] buf, int off, int end, Token token)
0531: throws InvalidTokenException, PartialTokenException {
0532: if (off == end)
0533: throw new PartialTokenException();
0534: switch (byteType(buf, off)) {
0535: case BT_MINUS:
0536: return scanComment(buf, off + minBPC, end, token);
0537: case BT_LSQB:
0538: token.tokenEnd = off + minBPC;
0539: return TOK_COND_SECT_OPEN;
0540: case BT_NMSTRT:
0541: off += minBPC;
0542: break;
0543: default:
0544: throw new InvalidTokenException(off);
0545: }
0546: while (off != end) {
0547: switch (byteType(buf, off)) {
0548: case BT_PERCNT:
0549: if (off + minBPC == end)
0550: throw new PartialTokenException();
0551: /* don't allow <!ENTITY% foo "whatever"> */
0552: switch (byteType(buf, off + minBPC)) {
0553: case BT_S:
0554: case BT_CR:
0555: case BT_LF:
0556: case BT_PERCNT:
0557: throw new InvalidTokenException(off);
0558: }
0559: /* fall through */
0560: case BT_S:
0561: case BT_CR:
0562: case BT_LF:
0563: token.tokenEnd = off;
0564: return TOK_DECL_OPEN;
0565: case BT_NMSTRT:
0566: off += minBPC;
0567: break;
0568: default:
0569: throw new InvalidTokenException(off);
0570: }
0571: }
0572: throw new PartialTokenException();
0573: }
0574:
0575: private final boolean targetIsXml(byte[] buf, int off, int end)
0576: throws InvalidTokenException {
0577: boolean upper = false;
0578: if (end - off != minBPC * 3)
0579: return false;
0580: switch (byteToAscii(buf, off)) {
0581: case 'x':
0582: break;
0583: case 'X':
0584: upper = true;
0585: break;
0586: default:
0587: return false;
0588: }
0589: off += minBPC;
0590: switch (byteToAscii(buf, off)) {
0591: case 'm':
0592: break;
0593: case 'M':
0594: upper = true;
0595: break;
0596: default:
0597: return false;
0598: }
0599: off += minBPC;
0600: switch (byteToAscii(buf, off)) {
0601: case 'l':
0602: break;
0603: case 'L':
0604: upper = true;
0605: break;
0606: default:
0607: return false;
0608: }
0609: if (upper)
0610: throw new InvalidTokenException(off,
0611: InvalidTokenException.XML_TARGET);
0612: return true;
0613: }
0614:
0615: /* off points to character following "<?" */
0616:
0617: private final int scanPi(byte[] buf, int off, int end, Token token)
0618: throws PartialTokenException, InvalidTokenException {
0619: int target = off;
0620: if (off == end)
0621: throw new PartialTokenException();
0622: switch (byteType(buf, off)) {
0623: case BT_NMSTRT:
0624: off += minBPC;
0625: break;
0626: case BT_LEAD2:
0627: if (end - off < 2)
0628: throw new PartialCharException(off);
0629: if (byteType2(buf, off) != BT_NMSTRT)
0630: throw new InvalidTokenException(off);
0631: off += 2;
0632: break;
0633: case BT_LEAD3:
0634: if (end - off < 3)
0635: throw new PartialCharException(off);
0636: if (byteType3(buf, off) != BT_NMSTRT)
0637: throw new InvalidTokenException(off);
0638: off += 3;
0639: break;
0640: case BT_LEAD4:
0641: if (end - off < 4)
0642: throw new PartialCharException(off);
0643: if (byteType4(buf, off) != BT_NMSTRT)
0644: throw new InvalidTokenException(off);
0645: off += 4;
0646: break;
0647: default:
0648: throw new InvalidTokenException(off);
0649: }
0650: while (off != end) {
0651: switch (byteType(buf, off)) {
0652: case BT_NMSTRT:
0653: case BT_NAME:
0654: case BT_MINUS:
0655: off += minBPC;
0656: break;
0657: case BT_LEAD2:
0658: if (end - off < 2)
0659: throw new PartialCharException(off);
0660: if (!isNameChar2(buf, off))
0661: throw new InvalidTokenException(off);
0662: off += 2;
0663: break;
0664: case BT_LEAD3:
0665: if (end - off < 3)
0666: throw new PartialCharException(off);
0667: if (!isNameChar3(buf, off))
0668: throw new InvalidTokenException(off);
0669: off += 3;
0670: break;
0671: case BT_LEAD4:
0672: if (end - off < 4)
0673: throw new PartialCharException(off);
0674: if (!isNameChar4(buf, off))
0675: throw new InvalidTokenException(off);
0676: off += 4;
0677: break;
0678: case BT_S:
0679: case BT_CR:
0680: case BT_LF:
0681: boolean isXml = targetIsXml(buf, target, off);
0682: token.nameEnd = off;
0683: off += minBPC;
0684: while (off != end) {
0685: switch (byteType(buf, off)) {
0686: case BT_LEAD2:
0687: if (end - off < 2)
0688: throw new PartialCharException(off);
0689: check2(buf, off);
0690: off += 2;
0691: break;
0692: case BT_LEAD3:
0693: if (end - off < 3)
0694: throw new PartialCharException(off);
0695: check3(buf, off);
0696: off += 3;
0697: break;
0698: case BT_LEAD4:
0699: if (end - off < 4)
0700: throw new PartialCharException(off);
0701: check4(buf, off);
0702: off += 4;
0703: break;
0704: case BT_NONXML:
0705: case BT_MALFORM:
0706: throw new InvalidTokenException(off);
0707: case BT_QUEST:
0708: off += minBPC;
0709: if (off == end)
0710: throw new PartialTokenException();
0711: if (charMatches(buf, off, '>')) {
0712: token.tokenEnd = off + minBPC;
0713: if (isXml)
0714: return TOK_XML_DECL;
0715: else
0716: return TOK_PI;
0717: }
0718: break;
0719: default:
0720: off += minBPC;
0721: break;
0722: }
0723: }
0724: throw new PartialTokenException();
0725: case BT_QUEST:
0726: token.nameEnd = off;
0727: off += minBPC;
0728: if (off == end)
0729: throw new PartialTokenException();
0730: checkCharMatches(buf, off, '>');
0731: token.tokenEnd = off + minBPC;
0732: return (targetIsXml(buf, target, token.nameEnd) ? TOK_XML_DECL
0733: : TOK_PI);
0734: default:
0735: throw new InvalidTokenException(off);
0736: }
0737: }
0738: throw new PartialTokenException();
0739: }
0740:
0741: /* off points to character following "<![" */
0742:
0743: private static final String CDATA = "CDATA[";
0744:
0745: private final int scanCdataSection(byte[] buf, int off, int end,
0746: Token token) throws PartialTokenException,
0747: InvalidTokenException {
0748: /* "CDATA[".length() == 6 */
0749: if (end - off < 6 * minBPC)
0750: throw new PartialTokenException();
0751: for (int i = 0; i < CDATA.length(); i++, off += minBPC)
0752: checkCharMatches(buf, off, CDATA.charAt(i));
0753: token.tokenEnd = off;
0754: return TOK_CDATA_SECT_OPEN;
0755: }
0756:
0757: /**
0758: * Scans the first token of a byte subarrary that starts with the
0759: * content of a CDATA section.
0760: * Returns one of the following integers according to the type of token
0761: * that the subarray starts with:
0762: * <ul>
0763: * <li><code>TOK_DATA_CHARS</code>
0764: * <li><code>TOK_DATA_NEWLINE</code>
0765: * <li><code>TOK_CDATA_SECT_CLOSE</code>
0766: * </ul>
0767: * <p>
0768: * Information about the token is stored in <code>token</code>.
0769: * <p>
0770: * After <code>TOK_CDATA_SECT_CLOSE</code> is returned, the application
0771: * should use <code>tokenizeContent</code>.
0772: *
0773: * @exception EmptyTokenException if the subarray is empty
0774: * @exception PartialTokenException if the subarray contains only part of
0775: * a legal token
0776: * @exception InvalidTokenException if the subarrary does not start
0777: * with a legal token or part of one
0778: * @exception ExtensibleTokenException if the subarray encodes just a carriage
0779: * return ('\r')
0780: *
0781: * @see #TOK_DATA_CHARS
0782: * @see #TOK_DATA_NEWLINE
0783: * @see #TOK_CDATA_SECT_CLOSE
0784: * @see Token
0785: * @see EmptyTokenException
0786: * @see PartialTokenException
0787: * @see InvalidTokenException
0788: * @see ExtensibleTokenException
0789: * @see #tokenizeContent
0790: */
0791: public final int tokenizeCdataSection(byte[] buf, int off, int end,
0792: Token token) throws EmptyTokenException,
0793: PartialTokenException, InvalidTokenException,
0794: ExtensibleTokenException {
0795: if (minBPC > 1)
0796: end = adjustEnd(off, end);
0797: if (off == end)
0798: throw new EmptyTokenException();
0799: switch (byteType(buf, off)) {
0800: case BT_RSQB:
0801: off += minBPC;
0802: if (off == end)
0803: throw new PartialTokenException();
0804: if (!charMatches(buf, off, ']'))
0805: break;
0806: off += minBPC;
0807: if (off == end)
0808: throw new PartialTokenException();
0809: if (!charMatches(buf, off, '>')) {
0810: off -= minBPC;
0811: break;
0812: }
0813: token.tokenEnd = off + minBPC;
0814: return TOK_CDATA_SECT_CLOSE;
0815: case BT_CR:
0816: off += minBPC;
0817: if (off == end)
0818: throw new ExtensibleTokenException(TOK_DATA_NEWLINE);
0819: if (byteType(buf, off) == BT_LF)
0820: off += minBPC;
0821: token.tokenEnd = off;
0822: return TOK_DATA_NEWLINE;
0823: case BT_LF:
0824: token.tokenEnd = off + minBPC;
0825: return TOK_DATA_NEWLINE;
0826: case BT_NONXML:
0827: case BT_MALFORM:
0828: throw new InvalidTokenException(off);
0829: case BT_LEAD2:
0830: if (end - off < 2)
0831: throw new PartialCharException(off);
0832: check2(buf, off);
0833: off += 2;
0834: break;
0835: case BT_LEAD3:
0836: if (end - off < 3)
0837: throw new PartialCharException(off);
0838: check3(buf, off);
0839: off += 3;
0840: break;
0841: case BT_LEAD4:
0842: if (end - off < 4)
0843: throw new PartialCharException(off);
0844: check4(buf, off);
0845: off += 4;
0846: break;
0847: default:
0848: off += minBPC;
0849: break;
0850: }
0851: token.tokenEnd = extendCdata(buf, off, end);
0852: return TOK_DATA_CHARS;
0853: }
0854:
0855: int extendCdata(final byte[] buf, int off, final int end)
0856: throws InvalidTokenException {
0857: while (off != end) {
0858: switch (byteType(buf, off)) {
0859: case BT_LEAD2:
0860: if (end - off < 2)
0861: return off;
0862: check2(buf, off);
0863: off += 2;
0864: break;
0865: case BT_LEAD3:
0866: if (end - off < 3)
0867: return off;
0868: check3(buf, off);
0869: off += 3;
0870: break;
0871: case BT_LEAD4:
0872: if (end - off < 4)
0873: return off;
0874: check4(buf, off);
0875: off += 4;
0876: break;
0877: case BT_RSQB:
0878: case BT_NONXML:
0879: case BT_MALFORM:
0880: case BT_CR:
0881: case BT_LF:
0882: return off;
0883: default:
0884: off += minBPC;
0885: break;
0886: }
0887: }
0888: return off;
0889: }
0890:
0891: /* off points to character following "</" */
0892:
0893: private final int scanEndTag(byte[] buf, int off, int end,
0894: Token token) throws PartialTokenException,
0895: InvalidTokenException {
0896: if (off == end)
0897: throw new PartialTokenException();
0898: switch (byteType(buf, off)) {
0899: case BT_NMSTRT:
0900: off += minBPC;
0901: break;
0902: case BT_LEAD2:
0903: if (end - off < 2)
0904: throw new PartialCharException(off);
0905: if (byteType2(buf, off) != BT_NMSTRT)
0906: throw new InvalidTokenException(off);
0907: off += 2;
0908: break;
0909: case BT_LEAD3:
0910: if (end - off < 3)
0911: throw new PartialCharException(off);
0912: if (byteType3(buf, off) != BT_NMSTRT)
0913: throw new InvalidTokenException(off);
0914: off += 3;
0915: break;
0916: case BT_LEAD4:
0917: if (end - off < 4)
0918: throw new PartialCharException(off);
0919: if (byteType4(buf, off) != BT_NMSTRT)
0920: throw new InvalidTokenException(off);
0921: off += 4;
0922: break;
0923: default:
0924: throw new InvalidTokenException(off);
0925: }
0926: while (off != end) {
0927: switch (byteType(buf, off)) {
0928: case BT_NMSTRT:
0929: case BT_NAME:
0930: case BT_MINUS:
0931: off += minBPC;
0932: break;
0933: case BT_LEAD2:
0934: if (end - off < 2)
0935: throw new PartialCharException(off);
0936: if (!isNameChar2(buf, off))
0937: throw new InvalidTokenException(off);
0938: off += 2;
0939: break;
0940: case BT_LEAD3:
0941: if (end - off < 3)
0942: throw new PartialCharException(off);
0943: if (!isNameChar3(buf, off))
0944: throw new InvalidTokenException(off);
0945: off += 3;
0946: break;
0947: case BT_LEAD4:
0948: if (end - off < 4)
0949: throw new PartialCharException(off);
0950: if (!isNameChar4(buf, off))
0951: throw new InvalidTokenException(off);
0952: off += 4;
0953: break;
0954: case BT_S:
0955: case BT_CR:
0956: case BT_LF:
0957: token.nameEnd = off;
0958: for (off += minBPC; off != end; off += minBPC) {
0959: switch (byteType(buf, off)) {
0960: case BT_S:
0961: case BT_CR:
0962: case BT_LF:
0963: break;
0964: case BT_GT:
0965: token.tokenEnd = off + minBPC;
0966: return TOK_END_TAG;
0967: default:
0968: throw new InvalidTokenException(off);
0969: }
0970: }
0971: throw new PartialTokenException();
0972: case BT_GT:
0973: token.nameEnd = off;
0974: token.tokenEnd = off + minBPC;
0975: return TOK_END_TAG;
0976: default:
0977: throw new InvalidTokenException(off);
0978: }
0979: }
0980: throw new PartialTokenException();
0981: }
0982:
0983: /* off points to character following "&#X" */
0984:
0985: private final int scanHexCharRef(byte[] buf, int off, int end,
0986: Token token) throws PartialTokenException,
0987: InvalidTokenException {
0988: if (off != end) {
0989: int c = byteToAscii(buf, off);
0990: int num;
0991: switch (c) {
0992: case '0':
0993: case '1':
0994: case '2':
0995: case '3':
0996: case '4':
0997: case '5':
0998: case '6':
0999: case '7':
1000: case '8':
1001: case '9':
1002: num = c - '0';
1003: break;
1004: case 'A':
1005: case 'B':
1006: case 'C':
1007: case 'D':
1008: case 'E':
1009: case 'F':
1010: num = c - ('A' - 10);
1011: break;
1012: case 'a':
1013: case 'b':
1014: case 'c':
1015: case 'd':
1016: case 'e':
1017: case 'f':
1018: num = c - ('a' - 10);
1019: break;
1020: default:
1021: throw new InvalidTokenException(off);
1022: }
1023: for (off += minBPC; off != end; off += minBPC) {
1024: c = byteToAscii(buf, off);
1025: switch (c) {
1026: case '0':
1027: case '1':
1028: case '2':
1029: case '3':
1030: case '4':
1031: case '5':
1032: case '6':
1033: case '7':
1034: case '8':
1035: case '9':
1036: num = (num << 4) + c - '0';
1037: break;
1038: case 'A':
1039: case 'B':
1040: case 'C':
1041: case 'D':
1042: case 'E':
1043: case 'F':
1044: num = (num << 4) + c - ('A' - 10);
1045: break;
1046: case 'a':
1047: case 'b':
1048: case 'c':
1049: case 'd':
1050: case 'e':
1051: case 'f':
1052: num = (num << 4) + c - ('a' - 10);
1053: break;
1054: case ';':
1055: token.tokenEnd = off + minBPC;
1056: return setRefChar(num, token);
1057: default:
1058: throw new InvalidTokenException(off);
1059: }
1060: if (num >= 0x110000)
1061: throw new InvalidTokenException(off);
1062: }
1063: }
1064: throw new PartialTokenException();
1065: }
1066:
1067: /* off points to character following "&#" */
1068:
1069: private final int scanCharRef(byte[] buf, int off, int end,
1070: Token token) throws PartialTokenException,
1071: InvalidTokenException {
1072: if (off != end) {
1073: int c = byteToAscii(buf, off);
1074: switch (c) {
1075: case 'x':
1076: return scanHexCharRef(buf, off + minBPC, end, token);
1077: case '0':
1078: case '1':
1079: case '2':
1080: case '3':
1081: case '4':
1082: case '5':
1083: case '6':
1084: case '7':
1085: case '8':
1086: case '9':
1087: break;
1088: default:
1089: throw new InvalidTokenException(off);
1090: }
1091: int num = c - '0';
1092: for (off += minBPC; off != end; off += minBPC) {
1093: c = byteToAscii(buf, off);
1094: switch (c) {
1095: case '0':
1096: case '1':
1097: case '2':
1098: case '3':
1099: case '4':
1100: case '5':
1101: case '6':
1102: case '7':
1103: case '8':
1104: case '9':
1105: num = num * 10 + (c - '0');
1106: if (num < 0x110000)
1107: break;
1108: /* fall through */
1109: default:
1110: throw new InvalidTokenException(off);
1111: case ';':
1112: token.tokenEnd = off + minBPC;
1113: return setRefChar(num, token);
1114: }
1115: }
1116: }
1117: throw new PartialTokenException();
1118: }
1119:
1120: /* num is known to be < 0x110000; return the token code */
1121: private final int setRefChar(int num, Token token)
1122: throws InvalidTokenException {
1123: if (num < 0x10000) {
1124: switch (charTypeTable[num >> 8][num & 0xFF]) {
1125: case BT_NONXML:
1126: case BT_LEAD4:
1127: case BT_MALFORM:
1128: throw new InvalidTokenException(token.tokenEnd - minBPC);
1129: }
1130: token.refChar1 = (char) num;
1131: return TOK_CHAR_REF;
1132: } else {
1133: num -= 0x10000;
1134: token.refChar1 = (char) ((num >> 10) + 0xD800);
1135: token.refChar2 = (char) ((num & ((1 << 10) - 1)) + 0xDC00);
1136: return TOK_CHAR_PAIR_REF;
1137: }
1138: }
1139:
1140: private final boolean isMagicEntityRef(byte[] buf, int off,
1141: int end, Token token) {
1142: switch (byteToAscii(buf, off)) {
1143: case 'a':
1144: if (end - off < minBPC * 4)
1145: break;
1146: switch (byteToAscii(buf, off + minBPC)) {
1147: case 'm':
1148: if (charMatches(buf, off + minBPC * 2, 'p')
1149: && charMatches(buf, off + minBPC * 3, ';')) {
1150: token.tokenEnd = off + minBPC * 4;
1151: token.refChar1 = '&';
1152: return true;
1153: }
1154: break;
1155: case 'p':
1156: if (end - off >= minBPC * 5
1157: && charMatches(buf, off + minBPC * 2, 'o')
1158: && charMatches(buf, off + minBPC * 3, 's')
1159: && charMatches(buf, off + minBPC * 4, ';')) {
1160: token.tokenEnd = off + minBPC * 5;
1161: token.refChar1 = '\'';
1162: return true;
1163: }
1164: break;
1165: }
1166: break;
1167: case 'l':
1168: if (end - off >= minBPC * 3
1169: && charMatches(buf, off + minBPC, 't')
1170: && charMatches(buf, off + minBPC * 2, ';')) {
1171: token.tokenEnd = off + minBPC * 3;
1172: token.refChar1 = '<';
1173: return true;
1174: }
1175: break;
1176: case 'g':
1177: if (end - off >= minBPC * 3
1178: && charMatches(buf, off + minBPC, 't')
1179: && charMatches(buf, off + minBPC * 2, ';')) {
1180: token.tokenEnd = off + minBPC * 3;
1181: token.refChar1 = '>';
1182: return true;
1183: }
1184: break;
1185: case 'q':
1186: if (end - off >= minBPC * 5
1187: && charMatches(buf, off + minBPC, 'u')
1188: && charMatches(buf, off + minBPC * 2, 'o')
1189: && charMatches(buf, off + minBPC * 3, 't')
1190: && charMatches(buf, off + minBPC * 4, ';')) {
1191: token.tokenEnd = off + minBPC * 5;
1192: token.refChar1 = '"';
1193: return true;
1194: }
1195: break;
1196: }
1197: return false;
1198: }
1199:
1200: /* off points to character following "&" */
1201:
1202: private final int scanRef(byte[] buf, int off, int end, Token token)
1203: throws PartialTokenException, InvalidTokenException {
1204: if (off == end)
1205: throw new PartialTokenException();
1206: if (isMagicEntityRef(buf, off, end, token))
1207: return TOK_MAGIC_ENTITY_REF;
1208: switch (byteType(buf, off)) {
1209: case BT_NMSTRT:
1210: off += minBPC;
1211: break;
1212: case BT_LEAD2:
1213: if (end - off < 2)
1214: throw new PartialCharException(off);
1215: if (byteType2(buf, off) != BT_NMSTRT)
1216: throw new InvalidTokenException(off);
1217: off += 2;
1218: break;
1219: case BT_LEAD3:
1220: if (end - off < 3)
1221: throw new PartialCharException(off);
1222: if (byteType3(buf, off) != BT_NMSTRT)
1223: throw new InvalidTokenException(off);
1224: off += 3;
1225: break;
1226: case BT_LEAD4:
1227: if (end - off < 4)
1228: throw new PartialCharException(off);
1229: if (byteType4(buf, off) != BT_NMSTRT)
1230: throw new InvalidTokenException(off);
1231: off += 4;
1232: break;
1233: case BT_NUM:
1234: return scanCharRef(buf, off + minBPC, end, token);
1235: default:
1236: throw new InvalidTokenException(off);
1237: }
1238: while (off != end) {
1239: switch (byteType(buf, off)) {
1240: case BT_NMSTRT:
1241: case BT_NAME:
1242: case BT_MINUS:
1243: off += minBPC;
1244: break;
1245: case BT_LEAD2:
1246: if (end - off < 2)
1247: throw new PartialCharException(off);
1248: if (!isNameChar2(buf, off))
1249: throw new InvalidTokenException(off);
1250: off += 2;
1251: break;
1252: case BT_LEAD3:
1253: if (end - off < 3)
1254: throw new PartialCharException(off);
1255: if (!isNameChar3(buf, off))
1256: throw new InvalidTokenException(off);
1257: off += 3;
1258: break;
1259: case BT_LEAD4:
1260: if (end - off < 4)
1261: throw new PartialCharException(off);
1262: if (!isNameChar4(buf, off))
1263: throw new InvalidTokenException(off);
1264: off += 4;
1265: break;
1266: case BT_SEMI:
1267: token.nameEnd = off;
1268: token.tokenEnd = off + minBPC;
1269: return TOK_ENTITY_REF;
1270: default:
1271: throw new InvalidTokenException(off);
1272: }
1273: }
1274: throw new PartialTokenException();
1275: }
1276:
/* off points to character following first character of attribute name */

/**
 * Scans the attributes of a start-tag or empty-element tag, starting
 * inside the first attribute name, through to the closing ">" or "/>".
 * Each attribute is recorded with <code>token.appendAttribute</code>;
 * before returning, <code>token.checkAttributeUniqueness</code> is called
 * and <code>token.tokenEnd</code> is set to the position after the ">".
 *
 * @param nameStart the index in <code>buf</code> of the first byte of the
 * first attribute name
 * @return <code>TOK_START_TAG_WITH_ATTS</code> or
 * <code>TOK_EMPTY_ELEMENT_WITH_ATTS</code>
 * @exception PartialTokenException if the subarray ends before the tag is
 * complete (a <code>PartialCharException</code> when it ends inside a
 * multi-byte character)
 * @exception InvalidTokenException if a character that is not allowed at
 * that point of the tag is found
 */
private final int scanAtts(int nameStart, byte[] buf, int off,
        int end, ContentToken token) throws PartialTokenException,
        InvalidTokenException {
    /* -1 means the end of the current attribute name has not been seen */
    int nameEnd = -1;
    while (off != end) {
        switch (byteType(buf, off)) {
        /* characters continuing the attribute name */
        case BT_NMSTRT:
        case BT_NAME:
        case BT_MINUS:
            off += minBPC;
            break;
        case BT_LEAD2:
            if (end - off < 2)
                throw new PartialCharException(off);
            if (!isNameChar2(buf, off))
                throw new InvalidTokenException(off);
            off += 2;
            break;
        case BT_LEAD3:
            if (end - off < 3)
                throw new PartialCharException(off);
            if (!isNameChar3(buf, off))
                throw new InvalidTokenException(off);
            off += 3;
            break;
        case BT_LEAD4:
            if (end - off < 4)
                throw new PartialCharException(off);
            if (!isNameChar4(buf, off))
                throw new InvalidTokenException(off);
            off += 4;
            break;
        case BT_S:
        case BT_CR:
        case BT_LF:
            /* white space ends the name; only more white space or '='
               may follow */
            nameEnd = off;
            loop: for (;;) {
                off += minBPC;
                if (off == end)
                    throw new PartialTokenException();
                switch (byteType(buf, off)) {
                case BT_EQUALS:
                    break loop;
                case BT_S:
                case BT_LF:
                case BT_CR:
                    break;
                default:
                    throw new InvalidTokenException(off);
                }
            }
            /* fall through */
        case BT_EQUALS: {
            /* off points to the '='; if no white space preceded it,
               the name ends here */
            if (nameEnd < 0)
                nameEnd = off;
            /* skip optional white space up to the opening quote;
               open remembers which kind of quote it is */
            int open;
            for (;;) {

                off += minBPC;
                if (off == end)
                    throw new PartialTokenException();
                open = byteType(buf, off);
                if (open == BT_QUOT || open == BT_APOS)
                    break;
                switch (open) {
                case BT_S:
                case BT_LF:
                case BT_CR:
                    break;
                default:
                    throw new InvalidTokenException(off);
                }
            }
            off += minBPC;
            int valueStart = off;
            /* cleared as soon as the value contains anything (references,
               line ends, certain white space) that makes the flag passed
               to appendAttribute false */
            boolean normalized = true;
            /* in attribute value */
            for (;;) {
                int t;
                if (off == end)
                    throw new PartialTokenException();
                t = byteType(buf, off);
                /* a quote of the same kind as the opening one ends
                   the value */
                if (t == open)
                    break;
                switch (t) {
                case BT_NONXML:
                case BT_MALFORM:
                    throw new InvalidTokenException(off);
                case BT_LEAD2:
                    if (end - off < 2)
                        throw new PartialCharException(off);
                    check2(buf, off);
                    off += 2;
                    break;
                case BT_LEAD3:
                    if (end - off < 3)
                        throw new PartialCharException(off);
                    check3(buf, off);
                    off += 3;
                    break;
                case BT_LEAD4:
                    if (end - off < 4)
                        throw new PartialCharException(off);
                    check4(buf, off);
                    off += 4;
                    break;
                case BT_AMP: {
                    normalized = false;
                    /* scanRef is used only to validate the reference and
                       find its end; its return value is discarded and
                       the tag's nameEnd, which scanRef may overwrite,
                       is restored afterwards */
                    int saveNameEnd = token.nameEnd;
                    scanRef(buf, off + minBPC, end, token);
                    token.nameEnd = saveNameEnd;
                    off = token.tokenEnd;
                    break;
                }
                case BT_S:
                    /* the value is not normalized if this white space
                       character is the first character of the value, is
                       not an ASCII space, or (when another character is
                       available) is followed by a space or by the
                       closing quote */
                    if (normalized
                            && (off == valueStart
                                    || byteToAscii(buf, off) != ' ' || (off
                                    + minBPC != end && (byteToAscii(
                                    buf, off + minBPC) == ' ' || byteType(
                                    buf, off + minBPC) == open))))
                        normalized = false;
                    off += minBPC;
                    break;
                case BT_LT:
                    /* '<' is not allowed in an attribute value */
                    throw new InvalidTokenException(off);
                case BT_LF:
                case BT_CR:
                    normalized = false;
                    /* fall through */
                default:
                    off += minBPC;
                    break;
                }
            }
            /* off points to the closing quote */
            token.appendAttribute(nameStart, nameEnd, valueStart,
                    off, normalized);
            off += minBPC;
            if (off == end)
                throw new PartialTokenException();
            /* the closing quote must be followed by white space,
               '>' or '/' */
            int t = byteType(buf, off);
            switch (t) {
            case BT_S:
            case BT_CR:
            case BT_LF:
                off += minBPC;
                if (off == end)
                    throw new PartialTokenException();
                t = byteType(buf, off);
                break;
            case BT_GT:
            case BT_SOL:
                break;
            default:
                throw new InvalidTokenException(off);
            }
            /* off points to the character whose type is t: the '>' or
               '/' directly after the closing quote, or the character
               after the first white space character that followed it */
            skipToName: for (;;) {
                switch (t) {
                /* start of the next attribute name: remember where it
                   begins and resume the outer loop inside the name */
                case BT_NMSTRT:
                    nameStart = off;
                    off += minBPC;
                    break skipToName;
                case BT_LEAD2:
                    if (end - off < 2)
                        throw new PartialCharException(off);
                    if (byteType2(buf, off) != BT_NMSTRT)
                        throw new InvalidTokenException(off);
                    nameStart = off;
                    off += 2;
                    break skipToName;
                case BT_LEAD3:
                    if (end - off < 3)
                        throw new PartialCharException(off);
                    if (byteType3(buf, off) != BT_NMSTRT)
                        throw new InvalidTokenException(off);
                    nameStart = off;
                    off += 3;
                    break skipToName;
                case BT_LEAD4:
                    if (end - off < 4)
                        throw new PartialCharException(off);
                    if (byteType4(buf, off) != BT_NMSTRT)
                        throw new InvalidTokenException(off);
                    nameStart = off;
                    off += 4;
                    break skipToName;
                case BT_S:
                case BT_CR:
                case BT_LF:
                    /* more white space: keep skipping */
                    break;
                case BT_GT:
                    token.checkAttributeUniqueness(buf);
                    token.tokenEnd = off + minBPC;
                    return TOK_START_TAG_WITH_ATTS;
                case BT_SOL:
                    /* '/' must be immediately followed by '>' */
                    off += minBPC;
                    if (off == end)
                        throw new PartialTokenException();
                    checkCharMatches(buf, off, '>');
                    token.checkAttributeUniqueness(buf);
                    token.tokenEnd = off + minBPC;
                    return TOK_EMPTY_ELEMENT_WITH_ATTS;
                default:
                    throw new InvalidTokenException(off);
                }
                off += minBPC;
                if (off == end)
                    throw new PartialTokenException();
                t = byteType(buf, off);
            }
            /* a new attribute name has started; its end is not yet known */
            nameEnd = -1;
            break;
        }
        default:
            throw new InvalidTokenException(off);
        }
    }
    throw new PartialTokenException();
}
1499:
/* off points to character following "<" */

/**
 * Scans a token that starts with "&lt;".  Depending on the character
 * after the "&lt;" the work is handed to <code>scanComment</code>,
 * <code>scanCdataSection</code>, <code>scanPi</code> or
 * <code>scanEndTag</code>; otherwise a start-tag is scanned here, with
 * <code>scanAtts</code> taking over as soon as an attribute name starts.
 * For a start-tag, <code>token.nameEnd</code> is set to the end of the
 * element name and <code>token.tokenEnd</code> to the position after
 * the "&gt;".
 *
 * @return <code>TOK_START_TAG_NO_ATTS</code>,
 * <code>TOK_EMPTY_ELEMENT_NO_ATTS</code>, or the result of whichever
 * scanner the work was handed to
 * @exception PartialTokenException if the subarray ends before the token
 * is complete (a <code>PartialCharException</code> when it ends inside a
 * multi-byte character)
 * @exception InvalidTokenException if a character that is not allowed at
 * that point is found
 */
private final int scanLt(byte[] buf, int off, int end,
        ContentToken token) throws PartialTokenException,
        InvalidTokenException {
    if (off == end)
        throw new PartialTokenException();
    /* the character after "<" decides what kind of token this is */
    switch (byteType(buf, off)) {
    case BT_NMSTRT:
        off += minBPC;
        break;
    case BT_LEAD2:
        if (end - off < 2)
            throw new PartialCharException(off);
        if (byteType2(buf, off) != BT_NMSTRT)
            throw new InvalidTokenException(off);
        off += 2;
        break;
    case BT_LEAD3:
        if (end - off < 3)
            throw new PartialCharException(off);
        if (byteType3(buf, off) != BT_NMSTRT)
            throw new InvalidTokenException(off);
        off += 3;
        break;
    case BT_LEAD4:
        if (end - off < 4)
            throw new PartialCharException(off);
        if (byteType4(buf, off) != BT_NMSTRT)
            throw new InvalidTokenException(off);
        off += 4;
        break;
    case BT_EXCL:
        /* "<!" must be followed by '-' (comment) or '[' (CDATA section) */
        if ((off += minBPC) == end)
            throw new PartialTokenException();
        switch (byteType(buf, off)) {
        case BT_MINUS:
            return scanComment(buf, off + minBPC, end, token);
        case BT_LSQB:
            return scanCdataSection(buf, off + minBPC, end, token);
        }
        throw new InvalidTokenException(off);
    case BT_QUEST:
        return scanPi(buf, off + minBPC, end, token);
    case BT_SOL:
        return scanEndTag(buf, off + minBPC, end, token);
    default:
        throw new InvalidTokenException(off);
    }
    /* we have a start-tag */
    /* nameEnd stays negative until the end of the element name is seen */
    token.nameEnd = -1;
    token.clearAttributes();
    while (off != end) {
        switch (byteType(buf, off)) {
        /* characters continuing the element name */
        case BT_NMSTRT:
        case BT_NAME:
        case BT_MINUS:
            off += minBPC;
            break;
        case BT_LEAD2:
            if (end - off < 2)
                throw new PartialCharException(off);
            if (!isNameChar2(buf, off))
                throw new InvalidTokenException(off);
            off += 2;
            break;
        case BT_LEAD3:
            if (end - off < 3)
                throw new PartialCharException(off);
            if (!isNameChar3(buf, off))
                throw new InvalidTokenException(off);
            off += 3;
            break;
        case BT_LEAD4:
            if (end - off < 4)
                throw new PartialCharException(off);
            if (!isNameChar4(buf, off))
                throw new InvalidTokenException(off);
            off += 4;
            break;
        case BT_S:
        case BT_CR:
        case BT_LF:
            /* white space ends the element name; skip it and look for
               an attribute name, '>' or '/' */
            token.nameEnd = off;
            off += minBPC;
            loop: for (;;) {
                if (off == end)
                    throw new PartialTokenException();
                switch (byteType(buf, off)) {
                /* an attribute name starts here: scanAtts finishes
                   the tag */
                case BT_NMSTRT:
                    return scanAtts(off, buf, off + minBPC, end,
                            token);
                case BT_LEAD2:
                    if (end - off < 2)
                        throw new PartialCharException(off);
                    if (byteType2(buf, off) != BT_NMSTRT)
                        throw new InvalidTokenException(off);
                    return scanAtts(off, buf, off + 2, end, token);
                case BT_LEAD3:
                    if (end - off < 3)
                        throw new PartialCharException(off);
                    if (byteType3(buf, off) != BT_NMSTRT)
                        throw new InvalidTokenException(off);
                    return scanAtts(off, buf, off + 3, end, token);
                case BT_LEAD4:
                    if (end - off < 4)
                        throw new PartialCharException(off);
                    if (byteType4(buf, off) != BT_NMSTRT)
                        throw new InvalidTokenException(off);
                    return scanAtts(off, buf, off + 4, end, token);
                case BT_GT:
                case BT_SOL:
                    /* leave off on the '>' or '/' so that the next
                       iteration of the outer loop handles it; nameEnd
                       is already set and will not be overwritten */
                    break loop;
                case BT_S:
                case BT_CR:
                case BT_LF:
                    off += minBPC;
                    break;
                default:
                    throw new InvalidTokenException(off);
                }
            }
            break;
        case BT_GT:
            if (token.nameEnd < 0)
                token.nameEnd = off;
            token.tokenEnd = off + minBPC;
            return TOK_START_TAG_NO_ATTS;
        case BT_SOL:
            if (token.nameEnd < 0)
                token.nameEnd = off;
            /* '/' must be immediately followed by '>' */
            off += minBPC;
            if (off == end)
                throw new PartialTokenException();
            checkCharMatches(buf, off, '>');
            token.tokenEnd = off + minBPC;
            return TOK_EMPTY_ELEMENT_NO_ATTS;
        default:
            throw new InvalidTokenException(off);
        }
    }
    throw new PartialTokenException();
}
1643:
1644: // Ensure that we always scan a multiple of minBPC bytes.
1645:
1646: private final int adjustEnd(int off, int end)
1647: throws PartialCharException {
1648: int n = end - off;
1649: if ((n & (minBPC - 1)) != 0) {
1650: n &= ~(minBPC - 1);
1651: if (n == 0)
1652: throw new PartialCharException(off);
1653: return off + n;
1654: } else
1655: return end;
1656: }
1657:
1658: /**
1659: * Scans the first token of a byte subarrary that contains content.
1660: * Returns one of the following integers according to the type of token
1661: * that the subarray starts with:
1662: * <ul>
1663: * <li><code>TOK_START_TAG_NO_ATTS</code>
1664: * <li><code>TOK_START_TAG_WITH_ATTS</code>
1665: * <li><code>TOK_EMPTY_ELEMENT_NO_ATTS</code>
1666: * <li><code>TOK_EMPTY_ELEMENT_WITH_ATTS</code>
1667: * <li><code>TOK_END_TAG</code>
1668: * <li><code>TOK_DATA_CHARS</code>
1669: * <li><code>TOK_DATA_NEWLINE</code>
1670: * <li><code>TOK_CDATA_SECT_OPEN</code>
1671: * <li><code>TOK_ENTITY_REF</code>
1672: * <li><code>TOK_MAGIC_ENTITY_REF</code>
1673: * <li><code>TOK_CHAR_REF</code>
1674: * <li><code>TOK_CHAR_PAIR_REF</code>
1675: * <li><code>TOK_PI</code>
1676: * <li><code>TOK_XML_DECL</code>
1677: * <li><code>TOK_COMMENT</code>
1678: * </ul>
1679: * <p>
1680: * Information about the token is stored in <code>token</code>.
1681: * <p>
1682: * When <code>TOK_CDATA_SECT_OPEN</code> is returned,
1683: * <code>tokenizeCdataSection</code> should be called until
1684: * it returns <code>TOK_CDATA_SECT</code>.
1685: *
1686: * @exception EmptyTokenException if the subarray is empty
1687: * @exception PartialTokenException if the subarray contains only part of
1688: * a legal token
1689: * @exception InvalidTokenException if the subarrary does not start
1690: * with a legal token or part of one
1691: * @exception ExtensibleTokenException if the subarray encodes just a carriage
1692: * return ('\r')
1693: *
1694: * @see #TOK_START_TAG_NO_ATTS
1695: * @see #TOK_START_TAG_WITH_ATTS
1696: * @see #TOK_EMPTY_ELEMENT_NO_ATTS
1697: * @see #TOK_EMPTY_ELEMENT_WITH_ATTS
1698: * @see #TOK_END_TAG
1699: * @see #TOK_DATA_CHARS
1700: * @see #TOK_DATA_NEWLINE
1701: * @see #TOK_CDATA_SECT_OPEN
1702: * @see #TOK_ENTITY_REF
1703: * @see #TOK_MAGIC_ENTITY_REF
1704: * @see #TOK_CHAR_REF
1705: * @see #TOK_CHAR_PAIR_REF
1706: * @see #TOK_PI
1707: * @see #TOK_XML_DECL
1708: * @see #TOK_COMMENT
1709: * @see ContentToken
1710: * @see EmptyTokenException
1711: * @see PartialTokenException
1712: * @see InvalidTokenException
1713: * @see ExtensibleTokenException
1714: * @see #tokenizeCdataSection
1715: */
1716: public final int tokenizeContent(byte[] buf, int off, int end,
1717: ContentToken token) throws PartialTokenException,
1718: InvalidTokenException, EmptyTokenException,
1719: ExtensibleTokenException {
1720:
1721: if (minBPC > 1)
1722: end = adjustEnd(off, end);
1723: if (off == end)
1724: throw new EmptyTokenException();
1725: switch (byteType(buf, off)) {
1726: case BT_LT:
1727: return scanLt(buf, off + minBPC, end, token);
1728: case BT_AMP:
1729: return scanRef(buf, off + minBPC, end, token);
1730: case BT_CR:
1731: off += minBPC;
1732: if (off == end)
1733: throw new ExtensibleTokenException(TOK_DATA_NEWLINE);
1734: if (byteType(buf, off) == BT_LF)
1735: off += minBPC;
1736: token.tokenEnd = off;
1737: return TOK_DATA_NEWLINE;
1738: case BT_LF:
1739: token.tokenEnd = off + minBPC;
1740: return TOK_DATA_NEWLINE;
1741: case BT_RSQB:
1742: off += minBPC;
1743: if (off == end)
1744: throw new ExtensibleTokenException(TOK_DATA_CHARS);
1745: if (!charMatches(buf, off, ']'))
1746: break;
1747: off += minBPC;
1748: if (off == end)
1749: throw new ExtensibleTokenException(TOK_DATA_CHARS);
1750: if (!charMatches(buf, off, '>')) {
1751: off -= minBPC;
1752: break;
1753: }
1754: throw new InvalidTokenException(off);
1755: case BT_NONXML:
1756: case BT_MALFORM:
1757: throw new InvalidTokenException(off);
1758: case BT_LEAD2:
1759: if (end - off < 2)
1760: throw new PartialCharException(off);
1761: check2(buf, off);
1762: off += 2;
1763: break;
1764: case BT_LEAD3:
1765: if (end - off < 3)
1766: throw new PartialCharException(off);
1767: check3(buf, off);
1768: off += 3;
1769: break;
1770: case BT_LEAD4:
1771: if (end - off < 4)
1772: throw new PartialCharException(off);
1773: check4(buf, off);
1774: off += 4;
1775: break;
1776: default:
1777: off += minBPC;
1778: break;
1779: }
1780: token.tokenEnd = extendData(buf, off, end);
1781: return TOK_DATA_CHARS;
1782: }
1783:
1784: int extendData(final byte[] buf, int off, final int end)
1785: throws InvalidTokenException {
1786: while (off != end) {
1787: switch (byteType(buf, off)) {
1788: case BT_LEAD2:
1789: if (end - off < 2)
1790: return off;
1791: check2(buf, off);
1792: off += 2;
1793: break;
1794: case BT_LEAD3:
1795: if (end - off < 3)
1796: return off;
1797: check3(buf, off);
1798: off += 3;
1799: break;
1800: case BT_LEAD4:
1801: if (end - off < 4)
1802: return off;
1803: check4(buf, off);
1804: off += 4;
1805: break;
1806: case BT_RSQB:
1807: case BT_AMP:
1808: case BT_LT:
1809: case BT_NONXML:
1810: case BT_MALFORM:
1811: case BT_CR:
1812: case BT_LF:
1813: return off;
1814: default:
1815: off += minBPC;
1816: break;
1817: }
1818: }
1819: return off;
1820: }
1821:
1822: /* off points to character following "%" */
1823:
1824: private final int scanPercent(byte[] buf, int off, int end,
1825: Token token) throws PartialTokenException,
1826: InvalidTokenException {
1827: if (off == end)
1828: throw new PartialTokenException();
1829: switch (byteType(buf, off)) {
1830: case BT_NMSTRT:
1831: off += minBPC;
1832: break;
1833: case BT_LEAD2:
1834: if (end - off < 2)
1835: throw new PartialCharException(off);
1836: if (byteType2(buf, off) != BT_NMSTRT)
1837: throw new InvalidTokenException(off);
1838: off += 2;
1839: break;
1840: case BT_LEAD3:
1841: if (end - off < 3)
1842: throw new PartialCharException(off);
1843: if (byteType3(buf, off) != BT_NMSTRT)
1844: throw new InvalidTokenException(off);
1845: off += 3;
1846: break;
1847: case BT_LEAD4:
1848: if (end - off < 4)
1849: throw new PartialCharException(off);
1850: if (byteType4(buf, off) != BT_NMSTRT)
1851: throw new InvalidTokenException(off);
1852: off += 4;
1853: break;
1854: case BT_S:
1855: case BT_LF:
1856: case BT_CR:
1857: case BT_PERCNT:
1858: token.tokenEnd = off;
1859: return TOK_PERCENT;
1860: default:
1861: throw new InvalidTokenException(off);
1862: }
1863: while (off != end) {
1864: switch (byteType(buf, off)) {
1865: case BT_NMSTRT:
1866: case BT_NAME:
1867: case BT_MINUS:
1868: off += minBPC;
1869: break;
1870: case BT_LEAD2:
1871: if (end - off < 2)
1872: throw new PartialCharException(off);
1873: if (!isNameChar2(buf, off))
1874: throw new InvalidTokenException(off);
1875: off += 2;
1876: break;
1877: case BT_LEAD3:
1878: if (end - off < 3)
1879: throw new PartialCharException(off);
1880: if (!isNameChar3(buf, off))
1881: throw new InvalidTokenException(off);
1882: off += 3;
1883: break;
1884: case BT_LEAD4:
1885: if (end - off < 4)
1886: throw new PartialCharException(off);
1887: if (!isNameChar4(buf, off))
1888: throw new InvalidTokenException(off);
1889: off += 4;
1890: break;
1891: case BT_SEMI:
1892: token.nameEnd = off;
1893: token.tokenEnd = off + minBPC;
1894: return TOK_PARAM_ENTITY_REF;
1895: default:
1896: throw new InvalidTokenException(off);
1897: }
1898: }
1899: throw new PartialTokenException();
1900: }
1901:
1902: private final int scanPoundName(byte[] buf, int off, int end,
1903: Token token) throws PartialTokenException,
1904: InvalidTokenException, ExtensibleTokenException {
1905: if (off == end)
1906: throw new PartialTokenException();
1907: switch (byteType(buf, off)) {
1908: case BT_NMSTRT:
1909: off += minBPC;
1910: break;
1911: case BT_LEAD2:
1912: if (end - off < 2)
1913: throw new PartialCharException(off);
1914: if (byteType2(buf, off) != BT_NMSTRT)
1915: throw new InvalidTokenException(off);
1916: off += 2;
1917: break;
1918: case BT_LEAD3:
1919: if (end - off < 3)
1920: throw new PartialCharException(off);
1921: if (byteType3(buf, off) != BT_NMSTRT)
1922: throw new InvalidTokenException(off);
1923: off += 3;
1924: break;
1925: case BT_LEAD4:
1926: if (end - off < 4)
1927: throw new PartialCharException(off);
1928: if (byteType4(buf, off) != BT_NMSTRT)
1929: throw new InvalidTokenException(off);
1930: off += 4;
1931: break;
1932: default:
1933: throw new InvalidTokenException(off);
1934: }
1935: while (off != end) {
1936: switch (byteType(buf, off)) {
1937: case BT_NMSTRT:
1938: case BT_NAME:
1939: case BT_MINUS:
1940: off += minBPC;
1941: break;
1942: case BT_LEAD2:
1943: if (end - off < 2)
1944: throw new PartialCharException(off);
1945: if (!isNameChar2(buf, off))
1946: throw new InvalidTokenException(off);
1947: off += 2;
1948: break;
1949: case BT_LEAD3:
1950: if (end - off < 3)
1951: throw new PartialCharException(off);
1952: if (!isNameChar3(buf, off))
1953: throw new InvalidTokenException(off);
1954: off += 3;
1955: break;
1956: case BT_LEAD4:
1957: if (end - off < 4)
1958: throw new PartialCharException(off);
1959: if (!isNameChar4(buf, off))
1960: throw new InvalidTokenException(off);
1961: off += 4;
1962: break;
1963: case BT_CR:
1964: case BT_LF:
1965: case BT_S:
1966: case BT_RPAR:
1967: case BT_GT:
1968: case BT_PERCNT:
1969: case BT_VERBAR:
1970: token.tokenEnd = off;
1971: return TOK_POUND_NAME;
1972: default:
1973: throw new InvalidTokenException(off);
1974: }
1975: }
1976: throw new ExtensibleTokenException(TOK_POUND_NAME);
1977: }
1978:
1979: private final int scanLit(int open, byte[] buf, int off, int end,
1980: Token token) throws PartialTokenException,
1981: InvalidTokenException, ExtensibleTokenException {
1982: while (off != end) {
1983: int t = byteType(buf, off);
1984: switch (t) {
1985: case BT_LEAD2:
1986: if (end - off < 2)
1987: throw new PartialTokenException();
1988: check2(buf, off);
1989: off += 2;
1990: break;
1991: case BT_LEAD3:
1992: if (end - off < 3)
1993: throw new PartialTokenException();
1994: check3(buf, off);
1995: off += 3;
1996: break;
1997: case BT_LEAD4:
1998: if (end - off < 4)
1999: throw new PartialTokenException();
2000: check4(buf, off);
2001: off += 4;
2002: break;
2003: case BT_NONXML:
2004: case BT_MALFORM:
2005: throw new InvalidTokenException(off);
2006: case BT_QUOT:
2007: case BT_APOS:
2008: off += minBPC;
2009: if (t != open)
2010: break;
2011: if (off == end)
2012: throw new ExtensibleTokenException(TOK_LITERAL);
2013: switch (byteType(buf, off)) {
2014: case BT_S:
2015: case BT_CR:
2016: case BT_LF:
2017: case BT_GT:
2018: case BT_PERCNT:
2019: case BT_LSQB:
2020: token.tokenEnd = off;
2021: return TOK_LITERAL;
2022: default:
2023: throw new InvalidTokenException(off);
2024: }
2025: default:
2026: off += minBPC;
2027: break;
2028: }
2029: }
2030: throw new PartialTokenException();
2031: }
2032:
2033: /**
2034: * Returns an encoding object to be used to start parsing an external entity.
2035: * The encoding is chosen based on the initial 4 bytes of the entity.
2036: *
2037: * @param buf the byte array containing the initial bytes of the entity
2038: * @param off the index in <code>buf</code> of the first byte of the entity
2039: * @param end the index in <code>buf</code> following the last available
2040: * byte of the entity; <code>end - off</code> must be greater than or equal
2041: * to 4 unless the entity has fewer that 4 bytes, in which case it must
2042: * be equal to the length of the entity
2043: * @param token receives information about the presence of a byte order
2044: * mark; if the entity starts with a byte order mark
2045: * then <code>token.getTokenEnd()</code>
2046: * will return <code>off + 2</code>, otherwise it will return
2047: * <code>off</code>
2048: *
2049: * @see TextDecl
2050: * @see XmlDecl
2051: * @see #TOK_XML_DECL
2052: * @see #getEncoding
2053: * @see #getInternalEncoding
2054: */
2055: public static final Encoding getInitialEncoding(byte[] buf,
2056: int off, int end, Token token) {
2057: token.tokenEnd = off;
2058: switch (end - off) {
2059: case 0:
2060: break;
2061: case 1:
2062: if (buf[off] < 0)
2063: return null;
2064: break;
2065: default:
2066: int b0 = buf[off] & 0xFF;
2067: int b1 = buf[off + 1] & 0xFF;
2068: switch ((b0 << 8) | b1) {
2069: case 0xFEFF:
2070: token.tokenEnd = off + 2;
2071: /* fall through */
2072: case '<': /* not legal; but not a fatal error */
2073: return getEncoding(UTF16_BIG_ENDIAN_ENCODING);
2074: case 0xFFFE:
2075: token.tokenEnd = off + 2;
2076: /* fall through */
2077: case '<' << 8: /* not legal; but not a fatal error */
2078: return getEncoding(UTF16_LITTLE_ENDIAN_ENCODING);
2079: }
2080: }
2081: return getEncoding(UTF8_ENCODING);
2082: }
2083:
2084: /**
2085: * Returns an <code>Encoding</code> corresponding to
2086: * the specified IANA character set name.
2087: * Returns this <code>Encoding</code> if the name is null.
2088: * Returns null if the specified encoding is not supported.
2089: * Note that there are two distinct <code>Encoding</code> objects
2090: * associated with the name <code>UTF-16</code>, one for
2091: * each possible byte order; if this <code>Encoding</code>
2092: * is UTF-16 with little-endian byte ordering, then
2093: * <code>getEncoding("UTF-16")</code> will return this,
2094: * otherwise it will return an <code>Encoding</code> for
2095: * UTF-16 with big-endian byte ordering.
2096: * @param name a string specifying the IANA name of the encoding; this is
2097: * case insensitive
2098: */
2099: public final Encoding getEncoding(String name) {
2100: if (name == null)
2101: return this ;
2102: if (name.equalsIgnoreCase("UTF-8"))
2103: return getEncoding(UTF8_ENCODING);
2104: if (name.equalsIgnoreCase("UTF-16"))
2105: return getUTF16Encoding();
2106: if (name.equalsIgnoreCase("ISO-8859-1"))
2107: return getEncoding(ISO8859_1_ENCODING);
2108: if (name.equalsIgnoreCase("US-ASCII"))
2109: return getEncoding(ASCII_ENCODING);
2110: return null;
2111: }
2112:
2113: /**
2114: * Returns an <code>Encoding</code> for entities encoded with
2115: * a single-byte encoding (an encoding in which each byte represents
2116: * exactly one character).
2117: * @param map a string specifying the character represented by each byte;
2118: * the string must have a length of 256; <code>map.charAt(b)</code>
2119: * specifies the character encoded by byte <code>b</code>; bytes that do
2120: * not represent any character should be mapped to <code>\uFFFD</code>
2121: */
2122: public final Encoding getSingleByteEncoding(String map) {
2123: return new SingleByteEncoding(map);
2124: }
2125:
2126: /**
2127: * Returns an <code>Encoding</code> object for use with internal entities.
2128: * This is a UTF-16 big endian encoding, except that newlines
2129: * are assumed to have been normalized into line feed,
2130: * so carriage return is treated like a space.
2131: */
2132: public final static Encoding getInternalEncoding() {
2133: return getEncoding(INTERNAL_ENCODING);
2134: }
2135:
/**
 * Scans the first token of a byte subarray that contains part of a
 * prolog.
 * Returns one of the following integers according to the type of token
 * that the subarray starts with:
 * <ul>
 * <li><code>TOK_PI</code>
 * <li><code>TOK_XML_DECL</code>
 * <li><code>TOK_COMMENT</code>
 * <li><code>TOK_PARAM_ENTITY_REF</code>
 * <li><code>TOK_PROLOG_S</code>
 * <li><code>TOK_DECL_OPEN</code>
 * <li><code>TOK_DECL_CLOSE</code>
 * <li><code>TOK_NAME</code>
 * <li><code>TOK_NMTOKEN</code>
 * <li><code>TOK_POUND_NAME</code>
 * <li><code>TOK_OR</code>
 * <li><code>TOK_PERCENT</code>
 * <li><code>TOK_OPEN_PAREN</code>
 * <li><code>TOK_CLOSE_PAREN</code>
 * <li><code>TOK_OPEN_BRACKET</code>
 * <li><code>TOK_CLOSE_BRACKET</code>
 * <li><code>TOK_LITERAL</code>
 * <li><code>TOK_NAME_QUESTION</code>
 * <li><code>TOK_NAME_ASTERISK</code>
 * <li><code>TOK_NAME_PLUS</code>
 * <li><code>TOK_COND_SECT_OPEN</code>
 * <li><code>TOK_COND_SECT_CLOSE</code>
 * <li><code>TOK_CLOSE_PAREN_QUESTION</code>
 * <li><code>TOK_CLOSE_PAREN_ASTERISK</code>
 * <li><code>TOK_CLOSE_PAREN_PLUS</code>
 * <li><code>TOK_COMMA</code>
 * </ul>
 * <p>
 * For the tokens recognized directly by this method,
 * <code>token.tokenEnd</code> is set to the index of the byte following
 * the token before the method returns; literals, declarations,
 * processing instructions, parameter entity references and
 * <code>#</code> names are delegated to helper scanners.
 * When <code>EndOfPrologException</code> is thrown,
 * <code>token.tokenEnd</code> is the index of the <code>&lt;</code>
 * that opens the document element.
 *
 * @param buf the byte array containing the subarray
 * @param off the index in <code>buf</code> of the first byte of the
 * subarray
 * @param end the index in <code>buf</code> of the byte immediately
 * after the last byte of the subarray
 * @param token receives the end position of the scanned token
 * @return the type of the token that the subarray starts with
 * @exception EmptyTokenException if the subarray is empty
 * @exception PartialTokenException if the subarray contains only part of
 * a legal token
 * @exception InvalidTokenException if the subarray does not start
 * with a legal token or part of one
 * @exception EndOfPrologException if the subarray starts with the document
 * element; <code>tokenizeContent</code> should be used on the remainder
 * of the entity
 * @exception ExtensibleTokenException if the subarray is a legal token
 * but subsequent bytes in the same entity could be part of the token
 * @see #TOK_PI
 * @see #TOK_XML_DECL
 * @see #TOK_COMMENT
 * @see #TOK_PARAM_ENTITY_REF
 * @see #TOK_PROLOG_S
 * @see #TOK_DECL_OPEN
 * @see #TOK_DECL_CLOSE
 * @see #TOK_NAME
 * @see #TOK_NMTOKEN
 * @see #TOK_POUND_NAME
 * @see #TOK_OR
 * @see #TOK_PERCENT
 * @see #TOK_OPEN_PAREN
 * @see #TOK_CLOSE_PAREN
 * @see #TOK_OPEN_BRACKET
 * @see #TOK_CLOSE_BRACKET
 * @see #TOK_LITERAL
 * @see #TOK_NAME_QUESTION
 * @see #TOK_NAME_ASTERISK
 * @see #TOK_NAME_PLUS
 * @see #TOK_COND_SECT_OPEN
 * @see #TOK_COND_SECT_CLOSE
 * @see #TOK_CLOSE_PAREN_QUESTION
 * @see #TOK_CLOSE_PAREN_ASTERISK
 * @see #TOK_CLOSE_PAREN_PLUS
 * @see #TOK_COMMA
 * @see Token
 * @see EmptyTokenException
 * @see PartialTokenException
 * @see InvalidTokenException
 * @see ExtensibleTokenException
 * @see EndOfPrologException
 */
public final int tokenizeProlog(byte[] buf, int off, int end,
    Token token) throws PartialTokenException,
    InvalidTokenException, EmptyTokenException,
    ExtensibleTokenException, EndOfPrologException {
  /* type of the name-like token being scanned (TOK_NAME or TOK_NMTOKEN) */
  int tok;
  /* NOTE(review): adjustEnd is defined elsewhere; presumably it trims
     the subarray to a whole number of minBPC-sized units -- confirm */
  if (minBPC > 1)
    end = adjustEnd(off, end);
  if (off == end)
    throw new EmptyTokenException();
  /* Dispatch on the first character. Every case either returns or
     throws, except the name-like cases, which break out to the name
     scanning loop that follows the switch. */
  switch (byteType(buf, off)) {
  case BT_QUOT:
    return scanLit(BT_QUOT, buf, off + minBPC, end, token);
  case BT_APOS:
    return scanLit(BT_APOS, buf, off + minBPC, end, token);
  case BT_LT: {
    /* the character after '<' decides what kind of markup this is */
    off += minBPC;
    if (off == end)
      throw new PartialTokenException();
    switch (byteType(buf, off)) {
    case BT_EXCL:
      return scanDecl(buf, off + minBPC, end, token);
    case BT_QUEST:
      return scanPi(buf, off + minBPC, end, token);
    case BT_NMSTRT:
    case BT_LEAD2:
    case BT_LEAD3:
    case BT_LEAD4:
      /* '<' followed by a (possibly multi-byte) name start: this is
         the document element, so report the position of the '<' */
      token.tokenEnd = off - minBPC;
      throw new EndOfPrologException();
    }
    throw new InvalidTokenException(off);
  }
  case BT_CR:
    /* a lone CR at the end may be followed by an LF in later data */
    if (off + minBPC == end)
      throw new ExtensibleTokenException(TOK_PROLOG_S);
    /* fall through */
  case BT_S:
  case BT_LF:
    /* consume the whole run of whitespace as one TOK_PROLOG_S */
    for (;;) {
      off += minBPC;
      if (off == end)
        break;
      switch (byteType(buf, off)) {
      case BT_S:
      case BT_LF:
        break;
      case BT_CR:
        /* don't split CR/LF pair */
        if (off + minBPC != end)
          break;
        /* fall through */
      default:
        token.tokenEnd = off;
        return TOK_PROLOG_S;
      }
    }
    token.tokenEnd = off;
    return TOK_PROLOG_S;
  case BT_PERCNT:
    return scanPercent(buf, off + minBPC, end, token);
  case BT_COMMA:
    token.tokenEnd = off + minBPC;
    return TOK_COMMA;
  case BT_LSQB:
    token.tokenEnd = off + minBPC;
    return TOK_OPEN_BRACKET;
  case BT_RSQB:
    off += minBPC;
    /* "]" at the end of the data might still turn out to be "]]>" */
    if (off == end)
      throw new ExtensibleTokenException(TOK_CLOSE_BRACKET);
    if (charMatches(buf, off, ']')) {
      /* "]]" needs one more character to decide */
      if (off + minBPC == end)
        throw new PartialTokenException();
      if (charMatches(buf, off + minBPC, '>')) {
        token.tokenEnd = off + 2 * minBPC;
        return TOK_COND_SECT_CLOSE;
      }
    }
    /* plain "]": the token ends just after the first bracket */
    token.tokenEnd = off;
    return TOK_CLOSE_BRACKET;
  case BT_LPAR:
    token.tokenEnd = off + minBPC;
    return TOK_OPEN_PAREN;
  case BT_RPAR:
    off += minBPC;
    /* ")" may be extended by a following occurrence indicator */
    if (off == end)
      throw new ExtensibleTokenException(TOK_CLOSE_PAREN);
    switch (byteType(buf, off)) {
    case BT_AST:
      token.tokenEnd = off + minBPC;
      return TOK_CLOSE_PAREN_ASTERISK;
    case BT_QUEST:
      token.tokenEnd = off + minBPC;
      return TOK_CLOSE_PAREN_QUESTION;
    case BT_PLUS:
      token.tokenEnd = off + minBPC;
      return TOK_CLOSE_PAREN_PLUS;
    case BT_CR:
    case BT_LF:
    case BT_S:
    case BT_GT:
    case BT_COMMA:
    case BT_VERBAR:
    case BT_RPAR:
      /* a delimiter follows: the token is just the ")" itself */
      token.tokenEnd = off;
      return TOK_CLOSE_PAREN;
    }
    throw new InvalidTokenException(off);
  case BT_VERBAR:
    token.tokenEnd = off + minBPC;
    return TOK_OR;
  case BT_GT:
    token.tokenEnd = off + minBPC;
    return TOK_DECL_CLOSE;
  case BT_NUM:
    return scanPoundName(buf, off + minBPC, end, token);
  /* Multi-byte first character: classify it as a name start character
     (TOK_NAME) or another name character (TOK_NMTOKEN). */
  case BT_LEAD2:
    if (end - off < 2)
      throw new PartialCharException(off);
    switch (byteType2(buf, off)) {
    case BT_NMSTRT:
      off += 2;
      tok = TOK_NAME;
      break;
    case BT_NAME:
      off += 2;
      tok = TOK_NMTOKEN;
      break;
    default:
      throw new InvalidTokenException(off);
    }
    break;
  case BT_LEAD3:
    if (end - off < 3)
      throw new PartialCharException(off);
    switch (byteType3(buf, off)) {
    case BT_NMSTRT:
      off += 3;
      tok = TOK_NAME;
      break;
    case BT_NAME:
      off += 3;
      tok = TOK_NMTOKEN;
      break;
    default:
      throw new InvalidTokenException(off);
    }
    break;
  case BT_LEAD4:
    if (end - off < 4)
      throw new PartialCharException(off);
    switch (byteType4(buf, off)) {
    case BT_NMSTRT:
      off += 4;
      tok = TOK_NAME;
      break;
    case BT_NAME:
      off += 4;
      tok = TOK_NMTOKEN;
      break;
    default:
      throw new InvalidTokenException(off);
    }
    break;
  case BT_NMSTRT:
    tok = TOK_NAME;
    off += minBPC;
    break;
  case BT_NAME:
  case BT_MINUS:
    tok = TOK_NMTOKEN;
    off += minBPC;
    break;
  default:
    throw new InvalidTokenException(off);
  }
  /* Scan the rest of a name or name token. The token ends at the first
     delimiter, or with a '+', '*' or '?' suffix that is only allowed
     directly after a TOK_NAME. */
  while (off != end) {
    switch (byteType(buf, off)) {
    case BT_NMSTRT:
    case BT_NAME:
    case BT_MINUS:
      off += minBPC;
      break;
    case BT_LEAD2:
      if (end - off < 2)
        throw new PartialCharException(off);
      if (!isNameChar2(buf, off))
        throw new InvalidTokenException(off);
      off += 2;
      break;
    case BT_LEAD3:
      if (end - off < 3)
        throw new PartialCharException(off);
      if (!isNameChar3(buf, off))
        throw new InvalidTokenException(off);
      off += 3;
      break;
    case BT_LEAD4:
      if (end - off < 4)
        throw new PartialCharException(off);
      if (!isNameChar4(buf, off))
        throw new InvalidTokenException(off);
      off += 4;
      break;
    case BT_GT:
    case BT_RPAR:
    case BT_COMMA:
    case BT_VERBAR:
    case BT_LSQB:
    case BT_PERCNT:
    case BT_S:
    case BT_CR:
    case BT_LF:
      /* delimiter: not part of the token */
      token.tokenEnd = off;
      return tok;
    case BT_PLUS:
      if (tok != TOK_NAME)
        throw new InvalidTokenException(off);
      token.tokenEnd = off + minBPC;
      return TOK_NAME_PLUS;
    case BT_AST:
      if (tok != TOK_NAME)
        throw new InvalidTokenException(off);
      token.tokenEnd = off + minBPC;
      return TOK_NAME_ASTERISK;
    case BT_QUEST:
      if (tok != TOK_NAME)
        throw new InvalidTokenException(off);
      token.tokenEnd = off + minBPC;
      return TOK_NAME_QUESTION;
    default:
      throw new InvalidTokenException(off);
    }
  }
  /* ran out of data inside the name: more name characters may follow */
  throw new ExtensibleTokenException(tok);
}
2449:
2450: /**
2451: * Scans the first token of a byte subarrary that contains part of
2452: * literal attribute value. The opening and closing delimiters
2453: * are not included in the subarrary.
2454: * Returns one of the following integers according to the type of
2455: * token that the subarray starts with:
2456: * <ul>
2457: * <li><code>TOK_DATA_CHARS</code>
2458: * <li><code>TOK_DATA_NEWLINE</code>
2459: * <li><code>TOK_ATTRIBUTE_VALUE_S</code>
2460: * <li><code>TOK_MAGIC_ENTITY_REF</code>
2461: * <li><code>TOK_ENTITY_REF</code>
2462: * <li><code>TOK_CHAR_REF</code>
2463: * <li><code>TOK_CHAR_PAIR_REF</code>
2464: * </ul>
2465: * @exception EmptyTokenException if the subarray is empty
2466: * @exception PartialTokenException if the subarray contains only part of
2467: * a legal token
2468: * @exception InvalidTokenException if the subarrary does not start
2469: * with a legal token or part of one
2470: * @exception ExtensibleTokenException if the subarray encodes just a carriage
2471: * return ('\r')
2472: * @see #TOK_DATA_CHARS
2473: * @see #TOK_DATA_NEWLINE
2474: * @see #TOK_ATTRIBUTE_VALUE_S
2475: * @see #TOK_MAGIC_ENTITY_REF
2476: * @see #TOK_ENTITY_REF
2477: * @see #TOK_CHAR_REF
2478: * @see #TOK_CHAR_PAIR_REF
2479: * @see Token
2480: * @see EmptyTokenException
2481: * @see PartialTokenException
2482: * @see InvalidTokenException
2483: * @see ExtensibleTokenException
2484: */
2485: public final int tokenizeAttributeValue(byte[] buf, int off,
2486: int end, Token token) throws PartialTokenException,
2487: InvalidTokenException, EmptyTokenException,
2488: ExtensibleTokenException {
2489: if (minBPC > 1)
2490: end = adjustEnd(off, end);
2491: if (off == end)
2492: throw new EmptyTokenException();
2493: int start = off;
2494: while (off != end) {
2495: switch (byteType(buf, off)) {
2496: case BT_LEAD2:
2497: if (end - off < 2)
2498: throw new PartialCharException(off);
2499: off += 2;
2500: break;
2501: case BT_LEAD3:
2502: if (end - off < 3)
2503: throw new PartialCharException(off);
2504: off += 3;
2505: break;
2506: case BT_LEAD4:
2507: if (end - off < 4)
2508: throw new PartialCharException(off);
2509: off += 4;
2510: break;
2511: case BT_AMP:
2512: if (off == start)
2513: return scanRef(buf, off + minBPC, end, token);
2514: token.tokenEnd = off;
2515: return TOK_DATA_CHARS;
2516: case BT_LT:
2517: /* this is for inside entity references */
2518: throw new InvalidTokenException(off);
2519: case BT_S:
2520: if (off == start) {
2521: token.tokenEnd = off + minBPC;
2522: return TOK_ATTRIBUTE_VALUE_S;
2523: }
2524: token.tokenEnd = off;
2525: return TOK_DATA_CHARS;
2526: case BT_LF:
2527: if (off == start) {
2528: token.tokenEnd = off + minBPC;
2529: return TOK_DATA_NEWLINE;
2530: }
2531: token.tokenEnd = off;
2532: return TOK_DATA_CHARS;
2533: case BT_CR:
2534: if (off == start) {
2535: off += minBPC;
2536: if (off == end)
2537: throw new ExtensibleTokenException(
2538: TOK_DATA_NEWLINE);
2539: if (byteType(buf, off) == BT_LF)
2540: off += minBPC;
2541: token.tokenEnd = off;
2542: return TOK_DATA_NEWLINE;
2543: }
2544: token.tokenEnd = off;
2545: return TOK_DATA_CHARS;
2546: default:
2547: off += minBPC;
2548: break;
2549: }
2550: }
2551: token.tokenEnd = off;
2552: return TOK_DATA_CHARS;
2553: }
2554:
2555: /**
2556: * Scans the first token of a byte subarrary that contains part of
2557: * literal entity value. The opening and closing delimiters
2558: * are not included in the subarrary.
2559: * Returns one of the following integers according to the type of
2560: * token that the subarray starts with:
2561: * <ul>
2562: * <li><code>TOK_DATA_CHARS</code>
2563: * <li><code>TOK_DATA_NEWLINE</code>
2564: * <li><code>TOK_PARAM_ENTITY_REF</code>
2565: * <li><code>TOK_MAGIC_ENTITY_REF</code>
2566: * <li><code>TOK_ENTITY_REF</code>
2567: * <li><code>TOK_CHAR_REF</code>
2568: * <li><code>TOK_CHAR_PAIR_REF</code>
2569: * </ul>
2570: * @exception EmptyTokenException if the subarray is empty
2571: * @exception PartialTokenException if the subarray contains only part of
2572: * a legal token
2573: * @exception InvalidTokenException if the subarrary does not start
2574: * with a legal token or part of one
2575: * @exception ExtensibleTokenException if the subarray encodes just a carriage
2576: * return ('\r')
2577: * @see #TOK_DATA_CHARS
2578: * @see #TOK_DATA_NEWLINE
2579: * @see #TOK_MAGIC_ENTITY_REF
2580: * @see #TOK_ENTITY_REF
2581: * @see #TOK_PARAM_ENTITY_REF
2582: * @see #TOK_CHAR_REF
2583: * @see #TOK_CHAR_PAIR_REF
2584: * @see Token
2585: * @see EmptyTokenException
2586: * @see PartialTokenException
2587: * @see InvalidTokenException
2588: * @see ExtensibleTokenException
2589: */
2590: public final int tokenizeEntityValue(byte[] buf, int off, int end,
2591: Token token) throws PartialTokenException,
2592: InvalidTokenException, EmptyTokenException,
2593: ExtensibleTokenException {
2594: if (minBPC > 1)
2595: end = adjustEnd(off, end);
2596: if (off == end)
2597: throw new EmptyTokenException();
2598: int start = off;
2599: while (off != end) {
2600: switch (byteType(buf, off)) {
2601: case BT_LEAD2:
2602: if (end - off < 2)
2603: throw new PartialCharException(off);
2604: off += 2;
2605: break;
2606: case BT_LEAD3:
2607: if (end - off < 3)
2608: throw new PartialCharException(off);
2609: off += 3;
2610: break;
2611: case BT_LEAD4:
2612: if (end - off < 4)
2613: throw new PartialCharException(off);
2614: off += 4;
2615: break;
2616: case BT_AMP:
2617: if (off == start)
2618: return scanRef(buf, off + minBPC, end, token);
2619: token.tokenEnd = off;
2620: return TOK_DATA_CHARS;
2621: case BT_PERCNT:
2622: if (off == start)
2623: return scanPercent(buf, off + minBPC, end, token);
2624: token.tokenEnd = off;
2625: return TOK_DATA_CHARS;
2626: case BT_LF:
2627: if (off == start) {
2628: token.tokenEnd = off + minBPC;
2629: return TOK_DATA_NEWLINE;
2630: }
2631: token.tokenEnd = off;
2632: return TOK_DATA_CHARS;
2633: case BT_CR:
2634: if (off == start) {
2635: off += minBPC;
2636: if (off == end)
2637: throw new ExtensibleTokenException(
2638: TOK_DATA_NEWLINE);
2639: if (byteType(buf, off) == BT_LF)
2640: off += minBPC;
2641: token.tokenEnd = off;
2642: return TOK_DATA_NEWLINE;
2643: }
2644: token.tokenEnd = off;
2645: return TOK_DATA_CHARS;
2646: default:
2647: off += minBPC;
2648: break;
2649: }
2650: }
2651: token.tokenEnd = off;
2652: return TOK_DATA_CHARS;
2653: }
2654:
2655: /**
2656: * Skips over an ignored conditional section.
2657: * The subarray starts following the <code><![ IGNORE [</code>.
2658: *
2659: * @return the index of the character following the closing
2660: * <code>]]></code>
2661: *
2662: * @exception PartialTokenException if the subarray does not contain the
2663: * complete ignored conditional section
2664: * @exception InvalidTokenException if the ignored conditional section
2665: * contains illegal characters
2666: */
2667: public final int skipIgnoreSect(byte[] buf, int off, int end)
2668: throws PartialTokenException, InvalidTokenException {
2669: if (minBPC > 1)
2670: end = adjustEnd(off, end);
2671: int level = 0;
2672: loop: while (off != end) {
2673: switch (byteType(buf, off)) {
2674: case BT_LEAD2:
2675: if (end - off < 2)
2676: throw new PartialCharException(off);
2677: check2(buf, off);
2678: off += 2;
2679: break;
2680: case BT_LEAD3:
2681: if (end - off < 3)
2682: throw new PartialCharException(off);
2683: check3(buf, off);
2684: off += 3;
2685: break;
2686: case BT_LEAD4:
2687: if (end - off < 4)
2688: throw new PartialCharException(off);
2689: check4(buf, off);
2690: off += 4;
2691: break;
2692: case BT_NONXML:
2693: case BT_MALFORM:
2694: throw new InvalidTokenException(off);
2695: case BT_LT:
2696: off += minBPC;
2697: if (off == end)
2698: break loop;
2699: if (!charMatches(buf, off, '!'))
2700: break;
2701: off += minBPC;
2702: if (off == end)
2703: break loop;
2704: if (!charMatches(buf, off, '['))
2705: break;
2706: level++;
2707: off += minBPC;
2708: break;
2709: case BT_RSQB:
2710: off += minBPC;
2711: if (off == end)
2712: break loop;
2713: if (!charMatches(buf, off, ']'))
2714: break;
2715: off += minBPC;
2716: if (off == end)
2717: break loop;
2718: if (charMatches(buf, off, '>')) {
2719: if (level == 0)
2720: return off + minBPC;
2721: level--;
2722: } else if (charMatches(buf, off, ']'))
2723: break;
2724: off += minBPC;
2725: break;
2726: default:
2727: off += minBPC;
2728: break;
2729: }
2730: }
2731: throw new PartialTokenException();
2732: }
2733:
2734: /**
2735: * Checks that a literal contained in the specified byte subarray
2736: * is a legal public identifier and returns a string with
2737: * the normalized content of the public id.
2738: * The subarray includes the opening and closing quotes.
2739: * @exception InvalidTokenException if it is not a legal public identifier
2740: */
2741: public final String getPublicId(byte[] buf, int off, int end)
2742: throws InvalidTokenException {
2743: StringBuffer sbuf = new StringBuffer();
2744: off += minBPC;
2745: end -= minBPC;
2746: for (; off != end; off += minBPC) {
2747: char c = (char) byteToAscii(buf, off);
2748: switch (byteType(buf, off)) {
2749: case BT_MINUS:
2750: case BT_APOS:
2751: case BT_LPAR:
2752: case BT_RPAR:
2753: case BT_PLUS:
2754: case BT_COMMA:
2755: case BT_SOL:
2756: case BT_EQUALS:
2757: case BT_QUEST:
2758: case BT_SEMI:
2759: case BT_EXCL:
2760: case BT_AST:
2761: case BT_PERCNT:
2762: case BT_NUM:
2763: sbuf.append(c);
2764: break;
2765: case BT_S:
2766: if (charMatches(buf, off, '\t'))
2767: throw new InvalidTokenException(off);
2768: /* fall through */
2769: case BT_CR:
2770: case BT_LF:
2771: if (sbuf.length() > 0
2772: && sbuf.charAt(sbuf.length() - 1) != ' ')
2773: sbuf.append(' ');
2774: break;
2775: case BT_NAME:
2776: case BT_NMSTRT:
2777: if ((c & ~0x7f) == 0) {
2778: sbuf.append(c);
2779: break;
2780: }
2781: // fall through
2782: default:
2783: switch (c) {
2784: case '$':
2785: case '@':
2786: break;
2787: default:
2788: throw new InvalidTokenException(off);
2789: }
2790: break;
2791: }
2792: }
2793: if (sbuf.length() > 0 && sbuf.charAt(sbuf.length() - 1) == ' ')
2794: sbuf.setLength(sbuf.length() - 1);
2795: return sbuf.toString();
2796: }
2797:
2798: /**
2799: * Returns true if the specified byte subarray is equal to the string.
2800: * The string must contain only XML significant characters.
2801: */
2802: public final boolean matchesXMLString(byte[] buf, int off, int end,
2803: String str) {
2804: int len = str.length();
2805: if (len * minBPC != end - off)
2806: return false;
2807: for (int i = 0; i < len; off += minBPC, i++) {
2808: if (!charMatches(buf, off, str.charAt(i)))
2809: return false;
2810: }
2811: return true;
2812: }
2813:
2814: /**
2815: * Skips over XML whitespace characters at the start of the specified
2816: * subarray.
2817: *
2818: * @return the index of the first non-whitespace character,
2819: * <code>end</code> if there is the subarray is all whitespace
2820: */
2821: public final int skipS(byte[] buf, int off, int end) {
2822: loop: while (off < end) {
2823: switch (byteType(buf, off)) {
2824: case BT_S:
2825: case BT_CR:
2826: case BT_LF:
2827: off += minBPC;
2828: break;
2829: default:
2830: break loop;
2831: }
2832: }
2833: return off;
2834: }
2835:
2836: private final boolean isNameChar2(byte[] buf, int off) {
2837: int bt = byteType2(buf, off);
2838: return bt == BT_NAME || bt == BT_NMSTRT;
2839: }
2840:
2841: private final boolean isNameChar3(byte[] buf, int off) {
2842: int bt = byteType3(buf, off);
2843: return bt == BT_NAME || bt == BT_NMSTRT;
2844: }
2845:
2846: private final boolean isNameChar4(byte[] buf, int off) {
2847: int bt = byteType4(buf, off);
2848: return bt == BT_NAME || bt == BT_NMSTRT;
2849: }
2850:
/*
 * Character classification data, consumed once by the static
 * initializer of this class to fill charTypeTable.
 * NOTE(review): the contents look like the name character classes of
 * XML 1.0 (Appendix B) -- confirm against the specification before
 * editing any entry.
 */

/* Individual characters that the static initializer registers as
   BT_NMSTRT. */
private static final String nameStartSingles = "\u003a\u005f\u0386\u038c\u03da\u03dc\u03de\u03e0\u0559\u06d5\u093d\u09b2"
    + "\u0a5e\u0a8d\u0abd\u0ae0\u0b3d\u0b9c\u0cde\u0e30\u0e84\u0e8a\u0e8d\u0ea5"
    + "\u0ea7\u0eb0\u0ebd\u1100\u1109\u113c\u113e\u1140\u114c\u114e\u1150\u1159"
    + "\u1163\u1165\u1167\u1169\u1175\u119e\u11a8\u11ab\u11ba\u11eb\u11f0\u11f9"
    + "\u1f59\u1f5b\u1f5d\u1fbe\u2126\u212e\u3007";
/* Character ranges registered as BT_NMSTRT. The string is read two
   characters at a time: the first and the last character of a range,
   both inclusive. */
private static final String nameStartRanges = "\u0041\u005a\u0061\u007a\u00c0\u00d6\u00d8\u00f6\u00f8\u00ff\u0100\u0131"
    + "\u0134\u013e\u0141\u0148\u014a\u017e\u0180\u01c3\u01cd\u01f0\u01f4\u01f5"
    + "\u01fa\u0217\u0250\u02a8\u02bb\u02c1\u0388\u038a\u038e\u03a1\u03a3\u03ce"
    + "\u03d0\u03d6\u03e2\u03f3\u0401\u040c\u040e\u044f\u0451\u045c\u045e\u0481"
    + "\u0490\u04c4\u04c7\u04c8\u04cb\u04cc\u04d0\u04eb\u04ee\u04f5\u04f8\u04f9"
    + "\u0531\u0556\u0561\u0586\u05d0\u05ea\u05f0\u05f2\u0621\u063a\u0641\u064a"
    + "\u0671\u06b7\u06ba\u06be\u06c0\u06ce\u06d0\u06d3\u06e5\u06e6\u0905\u0939"
    + "\u0958\u0961\u0985\u098c\u098f\u0990\u0993\u09a8\u09aa\u09b0\u09b6\u09b9"
    + "\u09dc\u09dd\u09df\u09e1\u09f0\u09f1\u0a05\u0a0a\u0a0f\u0a10\u0a13\u0a28"
    + "\u0a2a\u0a30\u0a32\u0a33\u0a35\u0a36\u0a38\u0a39\u0a59\u0a5c\u0a72\u0a74"
    + "\u0a85\u0a8b\u0a8f\u0a91\u0a93\u0aa8\u0aaa\u0ab0\u0ab2\u0ab3\u0ab5\u0ab9"
    + "\u0b05\u0b0c\u0b0f\u0b10\u0b13\u0b28\u0b2a\u0b30\u0b32\u0b33\u0b36\u0b39"
    + "\u0b5c\u0b5d\u0b5f\u0b61\u0b85\u0b8a\u0b8e\u0b90\u0b92\u0b95\u0b99\u0b9a"
    + "\u0b9e\u0b9f\u0ba3\u0ba4\u0ba8\u0baa\u0bae\u0bb5\u0bb7\u0bb9\u0c05\u0c0c"
    + "\u0c0e\u0c10\u0c12\u0c28\u0c2a\u0c33\u0c35\u0c39\u0c60\u0c61\u0c85\u0c8c"
    + "\u0c8e\u0c90\u0c92\u0ca8\u0caa\u0cb3\u0cb5\u0cb9\u0ce0\u0ce1\u0d05\u0d0c"
    + "\u0d0e\u0d10\u0d12\u0d28\u0d2a\u0d39\u0d60\u0d61\u0e01\u0e2e\u0e32\u0e33"
    + "\u0e40\u0e45\u0e81\u0e82\u0e87\u0e88\u0e94\u0e97\u0e99\u0e9f\u0ea1\u0ea3"
    + "\u0eaa\u0eab\u0ead\u0eae\u0eb2\u0eb3\u0ec0\u0ec4\u0f40\u0f47\u0f49\u0f69"
    + "\u10a0\u10c5\u10d0\u10f6\u1102\u1103\u1105\u1107\u110b\u110c\u110e\u1112"
    + "\u1154\u1155\u115f\u1161\u116d\u116e\u1172\u1173\u11ae\u11af\u11b7\u11b8"
    + "\u11bc\u11c2\u1e00\u1e9b\u1ea0\u1ef9\u1f00\u1f15\u1f18\u1f1d\u1f20\u1f45"
    + "\u1f48\u1f4d\u1f50\u1f57\u1f5f\u1f7d\u1f80\u1fb4\u1fb6\u1fbc\u1fc2\u1fc4"
    + "\u1fc6\u1fcc\u1fd0\u1fd3\u1fd6\u1fdb\u1fe0\u1fec\u1ff2\u1ff4\u1ff6\u1ffc"
    + "\u212a\u212b\u2180\u2182\u3041\u3094\u30a1\u30fa\u3105\u312c\uac00\ud7a3"
    + "\u4e00\u9fa5\u3021\u3029";
/* Individual characters that the static initializer registers as
   BT_NAME. */
private static final String nameSingles = "\u002d\u002e\u05bf\u05c4\u0670\u093c\u094d\u09bc\u09be\u09bf\u09d7\u0a02"
    + "\u0a3c\u0a3e\u0a3f\u0abc\u0b3c\u0bd7\u0d57\u0e31\u0eb1\u0f35\u0f37\u0f39"
    + "\u0f3e\u0f3f\u0f97\u0fb9\u20e1\u3099\u309a\u00b7\u02d0\u02d1\u0387\u0640"
    + "\u0e46\u0ec6\u3005";
/* Character ranges registered as BT_NAME, stored as (first, last)
   pairs like nameStartRanges. */
private static final String nameRanges = "\u0300\u0345\u0360\u0361\u0483\u0486\u0591\u05a1\u05a3\u05b9\u05bb\u05bd"
    + "\u05c1\u05c2\u064b\u0652\u06d6\u06dc\u06dd\u06df\u06e0\u06e4\u06e7\u06e8"
    + "\u06ea\u06ed\u0901\u0903\u093e\u094c\u0951\u0954\u0962\u0963\u0981\u0983"
    + "\u09c0\u09c4\u09c7\u09c8\u09cb\u09cd\u09e2\u09e3\u0a40\u0a42\u0a47\u0a48"
    + "\u0a4b\u0a4d\u0a70\u0a71\u0a81\u0a83\u0abe\u0ac5\u0ac7\u0ac9\u0acb\u0acd"
    + "\u0b01\u0b03\u0b3e\u0b43\u0b47\u0b48\u0b4b\u0b4d\u0b56\u0b57\u0b82\u0b83"
    + "\u0bbe\u0bc2\u0bc6\u0bc8\u0bca\u0bcd\u0c01\u0c03\u0c3e\u0c44\u0c46\u0c48"
    + "\u0c4a\u0c4d\u0c55\u0c56\u0c82\u0c83\u0cbe\u0cc4\u0cc6\u0cc8\u0cca\u0ccd"
    + "\u0cd5\u0cd6\u0d02\u0d03\u0d3e\u0d43\u0d46\u0d48\u0d4a\u0d4d\u0e34\u0e3a"
    + "\u0e47\u0e4e\u0eb4\u0eb9\u0ebb\u0ebc\u0ec8\u0ecd\u0f18\u0f19\u0f71\u0f84"
    + "\u0f86\u0f8b\u0f90\u0f95\u0f99\u0fad\u0fb1\u0fb7\u20d0\u20dc\u302a\u302f"
    + "\u0030\u0039\u0660\u0669\u06f0\u06f9\u0966\u096f\u09e6\u09ef\u0a66\u0a6f"
    + "\u0ae6\u0aef\u0b66\u0b6f\u0be7\u0bef\u0c66\u0c6f\u0ce6\u0cef\u0d66\u0d6f"
    + "\u0e50\u0e59\u0ed0\u0ed9\u0f20\u0f29\u3031\u3035\u309d\u309e\u30fc\u30fe";

/* Byte type of every 16-bit character: the first index is the high byte
   of the character and the second index is its low byte. Allocated and
   filled by the static initializer of this class. */
/* final */static byte[][] charTypeTable;
2902:
2903: private static void setCharType(char c, int type) {
2904: if (c < 0x80)
2905: return;
2906: int hi = c >> 8;
2907: if (charTypeTable[hi] == null) {
2908: charTypeTable[hi] = new byte[256];
2909: for (int i = 0; i < 256; i++)
2910: charTypeTable[hi][i] = BT_OTHER;
2911: }
2912: charTypeTable[hi][c & 0xFF] = (byte) type;
2913: }
2914:
2915: private static void setCharType(char min, char max, int type) {
2916: byte[] shared = null;
2917: do {
2918: if ((min & 0xFF) == 0) {
2919: for (; min + 0xFF <= max; min += 0x100) {
2920: if (shared == null) {
2921: shared = new byte[256];
2922: for (int i = 0; i < 256; i++)
2923: shared[i] = (byte) type;
2924: }
2925: charTypeTable[min >> 8] = shared;
2926: if (min + 0xFF == max)
2927: return;
2928: }
2929: }
2930: setCharType(min, type);
2931: } while (min++ != max);
2932: }
2933:
2934: static {
2935: charTypeTable = new byte[256][];
2936: for (int i = 0; i < nameSingles.length(); i++)
2937: setCharType(nameSingles.charAt(i), BT_NAME);
2938: for (int i = 0; i < nameRanges.length(); i += 2)
2939: setCharType(nameRanges.charAt(i), nameRanges.charAt(i + 1),
2940: BT_NAME);
2941: for (int i = 0; i < nameStartSingles.length(); i++)
2942: setCharType(nameStartSingles.charAt(i), BT_NMSTRT);
2943: for (int i = 0; i < nameStartRanges.length(); i += 2)
2944: setCharType(nameStartRanges.charAt(i), nameStartRanges
2945: .charAt(i + 1), BT_NMSTRT);
2946: setCharType('\uD800', '\uDBFF', BT_LEAD4);
2947: setCharType('\uDC00', '\uDFFF', BT_MALFORM);
2948: setCharType('\uFFFE', '\uFFFF', BT_NONXML);
2949: byte[] other = new byte[256];
2950: for (int i = 0; i < 256; i++)
2951: other[i] = BT_OTHER;
2952: for (int i = 0; i < 256; i++)
2953: if (charTypeTable[i] == null)
2954: charTypeTable[i] = other;
2955: System.arraycopy(asciiTypeTable, 0, charTypeTable[0], 0, 128);
2956: }
2957:
2958: /**
2959: * Returns the minimum number of bytes required to represent a single
2960: * character in this encoding. The value will be 1, 2 or 4.
2961: */
2962: public final int getMinBytesPerChar() {
2963: return minBPC;
2964: }
2965: }
|