0001: /*
0002: * The Apache Software License, Version 1.1
0003: *
0004: *
0005: * Copyright (c) 1999,2000 The Apache Software Foundation. All rights
0006: * reserved.
0007: *
0008: * Redistribution and use in source and binary forms, with or without
0009: * modification, are permitted provided that the following conditions
0010: * are met:
0011: *
0012: * 1. Redistributions of source code must retain the above copyright
0013: * notice, this list of conditions and the following disclaimer.
0014: *
0015: * 2. Redistributions in binary form must reproduce the above copyright
0016: * notice, this list of conditions and the following disclaimer in
0017: * the documentation and/or other materials provided with the
0018: * distribution.
0019: *
0020: * 3. The end-user documentation included with the redistribution,
0021: * if any, must include the following acknowledgment:
0022: * "This product includes software developed by the
0023: * Apache Software Foundation (http://www.apache.org/)."
0024: * Alternately, this acknowledgment may appear in the software itself,
0025: * if and wherever such third-party acknowledgments normally appear.
0026: *
0027: * 4. The names "Xerces" and "Apache Software Foundation" must
0028: * not be used to endorse or promote products derived from this
0029: * software without prior written permission. For written
0030: * permission, please contact apache@apache.org.
0031: *
0032: * 5. Products derived from this software may not be called "Apache",
0033: * nor may "Apache" appear in their name, without prior written
0034: * permission of the Apache Software Foundation.
0035: *
0036: * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
0037: * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
0038: * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
0039: * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
0040: * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
0041: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
0042: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
0043: * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
0044: * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
0045: * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
0046: * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
0047: * SUCH DAMAGE.
0048: * ====================================================================
0049: *
0050: * This software consists of voluntary contributions made by many
0051: * individuals on behalf of the Apache Software Foundation and was
0052: * originally based on software copyright (c) 1999, International
0053: * Business Machines, Inc., http://www.apache.org. For more
0054: * information on the Apache Software Foundation, please see
0055: * <http://www.apache.org/>.
0056: */
0057:
0058: package org.apache.xerces.readers;
0059:
0060: import org.apache.xerces.framework.XMLErrorReporter;
0061: import org.apache.xerces.utils.ChunkyByteArray;
0062: import org.apache.xerces.utils.ChunkyCharArray;
0063: import org.apache.xerces.utils.QName;
0064: import org.apache.xerces.utils.StringHasher;
0065: import org.apache.xerces.utils.StringPool;
0066: import org.apache.xerces.utils.XMLCharacterProperties;
0067: import java.io.IOException;
0068:
0069: /**
0070: * Reader for UCS-2 and UCS-4 encodings.
0071: * <p>
0072: * This reader is created by the UCSRecognizer class when it decides that the
0073: * byte stream is encoded in a format supported by this class. This class
0074: * was intended to be another example of an encoding sensitive reader that
0075: * could take advantage of the system design to improve performance and reduce
0076: * resource consumption, but the actual performance tuning remains to be done.
0077: *
0078: * @version $Id: UCSReader.java,v 1.7 2001/06/21 19:54:28 lmartin Exp $
0079: */
0080: final class UCSReader extends XMLEntityReader implements
0081: StringPool.StringProducer {
0082:
//
// Constants
//

// debugging

/** Set to true to debug UTF-16, big-endian. Not referenced by the code visible in this class. */
private static final boolean DEBUG_UTF16_BIG = false;

//
// Scanner encoding enumeration
//
// These values are passed to the constructor as the "encoding" argument and
// determine the character width, the byte order and the initial read offset.
//
static final int E_UCS4B = 0, // UCS-4 big endian
        E_UCS4L = 1, // UCS-4 little endian
        E_UCS2B = 2, // UCS-2 big endian with byte order mark
        E_UCS2L = 3, // UCS-2 little endian with byte order mark
        E_UCS2B_NOBOM = 4, // UCS-2 big endian without byte order mark
        E_UCS2L_NOBOM = 5; // UCS-2 little endian without byte order mark
//
// Instance state
//
/** Byte storage that characters are decoded from (see getChar). */
private ChunkyByteArray fData = null;
/** One of the E_UCS* constants above; -1 until the constructor assigns it. */
private int fEncoding = -1;
/** Pool that receives the strings and symbols produced by this reader. */
private StringPool fStringPool = null;
/** Width of one encoded character in bytes: 4 for UCS-4, otherwise 2. */
private int fBytesPerChar = -1;
/** True when the most significant byte of a character comes first. */
private boolean fBigEndian = true;
/** Scratch character storage used by toString(int,int); created on first use. */
private ChunkyCharArray fStringCharArray = null;
/** True once this reader has called XMLCharacterProperties.initCharFlags(). */
private boolean fCalledCharPropInit = false;
0111:
0112: //
0113: //
0114: //
0115: UCSReader(XMLEntityHandler entityHandler,
0116: XMLErrorReporter errorReporter,
0117: boolean sendCharDataAsCharArray, ChunkyByteArray data,
0118: int encoding, StringPool stringPool) throws Exception {
0119: super (entityHandler, errorReporter, sendCharDataAsCharArray);
0120: fCurrentOffset = (encoding == E_UCS2B || encoding == E_UCS2L) ? 2
0121: : 0;
0122: fData = data;
0123: fEncoding = encoding;
0124: fStringPool = stringPool;
0125: fBytesPerChar = (fEncoding == E_UCS4B || fEncoding == E_UCS4L) ? 4
0126: : 2;
0127: fBigEndian = fEncoding == E_UCS4B || fEncoding == E_UCS2B
0128: || fEncoding == E_UCS2B_NOBOM;
0129: }
0130:
0131: //
0132: //
0133: //
0134: private int getChar(int offset) throws IOException {
0135: int b0 = fData.byteAt(offset++) & 0xff;
0136: if (b0 == 0xff && fData.atEOF(offset))
0137: return -1;
0138: int b1 = fData.byteAt(offset++) & 0xff;
0139: if (fBytesPerChar == 4) {
0140: int b2 = fData.byteAt(offset++) & 0xff;
0141: int b3 = fData.byteAt(offset++) & 0xff;
0142: if (fBigEndian)
0143: return (b0 << 24) + (b1 << 16) + (b2 << 8) + b3;
0144: else
0145: return (b3 << 24) + (b2 << 16) + (b1 << 8) + b0;
0146: } else {
0147: if (fBigEndian)
0148: return (b0 << 8) + b1;
0149: else
0150: return (b1 << 8) + b0;
0151: }
0152: }
0153:
0154: /**
0155: *
0156: */
0157: public int addString(int offset, int length) {
0158: if (length == 0)
0159: return 0;
0160: return fStringPool.addString(this , offset, length);
0161: }
0162:
0163: /**
0164: *
0165: */
0166: public int addSymbol(int offset, int length) {
0167: if (length == 0)
0168: return 0;
0169: return fStringPool.addSymbol(this , offset, length, getHashcode(
0170: offset, length));
0171: }
0172:
0173: //
0174: //
0175: //
0176: public void append(XMLEntityHandler.CharBuffer charBuffer,
0177: int offset, int length) {
0178: int endOffset = offset + length;
0179: while (offset < endOffset) {
0180: int ch;
0181: try {
0182: ch = getChar(offset);
0183: } catch (IOException ex) {
0184: ch = 0; // REVISIT
0185: }
0186: charBuffer.append((char) ch);
0187: offset += fBytesPerChar;
0188: }
0189: }
0190:
0191: //
0192: //
0193: //
/**
 * Intentionally does nothing: this reader keeps no per-string state that
 * would need to be released.
 *
 * @param offset byte offset of the string (unused)
 * @param length length of the string in bytes (unused)
 */
public void releaseString(int offset, int length) {
    // nothing to do...
}
0197:
0198: //
0199: //
0200: //
0201: public String toString(int offset, int length) {
0202: //
0203: // REVISIT - we need to cache this operation !!
0204: //
0205: if (fStringCharArray == null)
0206: fStringCharArray = new ChunkyCharArray(fStringPool);
0207: int newOffset = fStringCharArray.length();
0208: append(fStringCharArray, offset, length);
0209: int newLength = fStringCharArray.length() - newOffset;
0210: int stringIndex = fStringCharArray.addString(newOffset,
0211: newLength);
0212: return fStringPool.toString(stringIndex);
0213: }
0214:
0215: //
0216: //
0217: //
0218: private int getHashcode(int offset, int length) {
0219: int endOffset = offset + length;
0220: int hashcode = 0;
0221: while (offset < endOffset) {
0222: int ch;
0223: try {
0224: ch = getChar(offset);
0225: } catch (IOException ex) {
0226: ch = 0; // REVISIT
0227: }
0228: hashcode = StringHasher.hashChar(hashcode, ch);
0229: offset += fBytesPerChar;
0230: }
0231: return StringHasher.finishHash(hashcode);
0232: }
0233:
0234: //
0235: public boolean equalsString(int offset, int length,
0236: char[] strChars, int strOffset, int strLength) {
0237: int endOffset = offset + length;
0238: int slen = strLength;
0239: while (offset < endOffset) {
0240: if (slen-- == 0)
0241: return false;
0242: int ch;
0243: try {
0244: ch = getChar(offset);
0245: } catch (IOException ex) {
0246: ch = 0; // REVISIT
0247: }
0248: if (ch != strChars[strOffset++])
0249: return false;
0250: offset += fBytesPerChar;
0251: }
0252: return slen == 0;
0253: }
0254:
0255: //
0256: //
0257: //
0258: private static char[] fCharacters = new char[256];
0259: private int fCharDataLength = 0;
0260:
0261: private void appendCharData(int ch) {
0262: if (fCharacters.length == fCharDataLength) {
0263: char[] newchars = new char[fCharacters.length * 2];
0264: System.arraycopy(fCharacters, 0, newchars, 0,
0265: fCharacters.length);
0266: fCharacters = newchars;
0267: }
0268: fCharacters[fCharDataLength++] = (char) ch;
0269: }
0270:
0271: public void callCharDataHandler(int offset, int length,
0272: boolean isWhitespace) throws Exception {
0273: int endOffset = offset + length;
0274: boolean skiplf = false;
0275: while (offset < endOffset) {
0276: int ch = getChar(offset);
0277: // fix for Bug23: Element Data not normalized...
0278: if (skiplf) {
0279: skiplf = false;
0280: if (ch == 0x0A) {
0281: offset += fBytesPerChar;
0282: continue;
0283: }
0284: }
0285: if (ch == 0x0D) {
0286: skiplf = true;
0287: ch = 0x0A;
0288: }
0289: appendCharData(ch);
0290: offset += fBytesPerChar;
0291: }
0292: if (fSendCharDataAsCharArray) {
0293: if (isWhitespace)
0294: fCharDataHandler.processWhitespace(fCharacters, 0,
0295: fCharDataLength);
0296: else
0297: fCharDataHandler.processCharacters(fCharacters, 0,
0298: fCharDataLength);
0299: } else {
0300: int stringIndex = fStringPool.addString(new String(
0301: fCharacters, 0, fCharDataLength));
0302: if (isWhitespace)
0303: fCharDataHandler.processWhitespace(stringIndex);
0304: else
0305: fCharDataHandler.processCharacters(stringIndex);
0306: }
0307: fCharDataLength = 0;
0308: }
0309:
0310: //
0311: //
0312: //
0313: public boolean lookingAtChar(char ch, boolean skipPastChar)
0314: throws Exception {
0315: int ch2 = getChar(fCurrentOffset);
0316: if (ch2 == ch) {
0317: if (skipPastChar) {
0318: fCharacterCounter++;
0319: fCurrentOffset += fBytesPerChar;
0320: }
0321: return true;
0322: }
0323: return false;
0324: }
0325:
0326: //
0327: //
0328: //
0329: public boolean lookingAtValidChar(boolean skipPastChar)
0330: throws Exception {
0331: int ch = getChar(fCurrentOffset);
0332: if (ch < 0x20) {
0333: if (ch == 0x09) {
0334: if (!skipPastChar)
0335: return true;
0336: fCharacterCounter++;
0337: } else if (ch == 0x0A) {
0338: if (!skipPastChar)
0339: return true;
0340: fLinefeedCounter++;
0341: fCharacterCounter = 1;
0342: } else if (ch == 0x0D) {
0343: if (!skipPastChar)
0344: return true;
0345: fCarriageReturnCounter++;
0346: fCharacterCounter = 1;
0347: } else {
0348: if (ch == -1) {
0349: return changeReaders().lookingAtValidChar(
0350: skipPastChar);
0351: }
0352: return false;
0353: }
0354: fCurrentOffset += fBytesPerChar;
0355: return true;
0356: }
0357: if (ch <= 0xD7FF) {
0358: if (skipPastChar) {
0359: fCharacterCounter++;
0360: fCurrentOffset += fBytesPerChar;
0361: }
0362: return true;
0363: }
0364: if (ch <= 0xDFFF) {
0365: // REVISIT - check that the surrogate pair is valid
0366: if (skipPastChar) {
0367: fCharacterCounter++;
0368: fCurrentOffset += fBytesPerChar;
0369: }
0370: return true;
0371: }
0372: if (ch <= 0xFFFD) {
0373: if (skipPastChar) {
0374: fCharacterCounter++;
0375: fCurrentOffset += fBytesPerChar;
0376: }
0377: return true;
0378: }
0379: return false;
0380: }
0381:
0382: //
0383: //
0384: //
0385: public boolean lookingAtSpace(boolean skipPastChar)
0386: throws Exception {
0387: int ch = getChar(fCurrentOffset);
0388: if (ch > 0x20)
0389: return false;
0390: if (ch == 0x20 || ch == 0x09) {
0391: if (!skipPastChar)
0392: return true;
0393: fCharacterCounter++;
0394: } else if (ch == 0x0A) {
0395: if (!skipPastChar)
0396: return true;
0397: fLinefeedCounter++;
0398: fCharacterCounter = 1;
0399: } else if (ch == 0x0D) {
0400: if (!skipPastChar)
0401: return true;
0402: fCarriageReturnCounter++;
0403: fCharacterCounter = 1;
0404: } else {
0405: if (ch == -1) { // REVISIT - should we be checking this here ?
0406: return changeReaders().lookingAtSpace(skipPastChar);
0407: }
0408: return false;
0409: }
0410: fCurrentOffset += fBytesPerChar;
0411: return true;
0412: }
0413:
0414: //
0415: //
0416: //
0417: public void skipToChar(char chr) throws Exception {
0418: while (true) {
0419: int ch = getChar(fCurrentOffset);
0420: if (ch == chr)
0421: return;
0422: if (ch == -1) {
0423: changeReaders().skipToChar(chr);
0424: return;
0425: }
0426: if (ch == 0x0A) {
0427: fLinefeedCounter++;
0428: fCharacterCounter = 1;
0429: } else if (ch == 0x0D) {
0430: fCarriageReturnCounter++;
0431: fCharacterCounter = 1;
0432: } else if (ch >= 0xD800 && ch < 0xDC00) {
0433: fCharacterCounter++;
0434: fCurrentOffset += fBytesPerChar;
0435: ch = getChar(fCurrentOffset);
0436: if (ch < 0xDC00 || ch >= 0xE000)
0437: continue;
0438: } else
0439: fCharacterCounter++;
0440: fCurrentOffset += fBytesPerChar;
0441: }
0442: }
0443:
0444: //
0445: //
0446: //
0447: public void skipPastSpaces() throws Exception {
0448: while (true) {
0449: int ch = getChar(fCurrentOffset);
0450: if (ch > 0x20)
0451: return;
0452: if (ch == 0x20 || ch == 0x09) {
0453: fCharacterCounter++;
0454: } else if (ch == 0x0A) {
0455: fLinefeedCounter++;
0456: fCharacterCounter = 1;
0457: } else if (ch == 0x0D) {
0458: fCarriageReturnCounter++;
0459: fCharacterCounter = 1;
0460: } else {
0461: if (ch == -1)
0462: changeReaders().skipPastSpaces();
0463: return;
0464: }
0465: fCurrentOffset += fBytesPerChar;
0466: }
0467: }
0468:
0469: //
0470: //
0471: //
0472: public void skipPastName(char fastcheck) throws Exception {
0473: int ch = getChar(fCurrentOffset);
0474: if (!fCalledCharPropInit) {
0475: XMLCharacterProperties.initCharFlags();
0476: fCalledCharPropInit = true;
0477: }
0478: if ((XMLCharacterProperties.fgCharFlags[ch] & XMLCharacterProperties.E_InitialNameCharFlag) == 0)
0479: return;
0480: while (true) {
0481: fCurrentOffset += fBytesPerChar;
0482: fCharacterCounter++;
0483: ch = getChar(fCurrentOffset);
0484: if (fastcheck == ch)
0485: return;
0486: if ((XMLCharacterProperties.fgCharFlags[ch] & XMLCharacterProperties.E_NameCharFlag) == 0)
0487: return;
0488: }
0489: }
0490:
0491: //
0492: //
0493: //
0494: public void skipPastNmtoken(char fastcheck) throws Exception {
0495: int ch = getChar(fCurrentOffset);
0496: if (!fCalledCharPropInit) {
0497: XMLCharacterProperties.initCharFlags();
0498: fCalledCharPropInit = true;
0499: }
0500: while (true) {
0501: if (fastcheck == ch)
0502: return;
0503: if ((XMLCharacterProperties.fgCharFlags[ch] & XMLCharacterProperties.E_NameCharFlag) == 0)
0504: return;
0505: fCurrentOffset += fBytesPerChar;
0506: fCharacterCounter++;
0507: ch = getChar(fCurrentOffset);
0508: }
0509: }
0510:
0511: //
0512: //
0513: //
0514: public boolean skippedString(char[] s) throws Exception {
0515: int offset = fCurrentOffset;
0516: for (int i = 0; i < s.length; i++) {
0517: if (getChar(offset) != s[i])
0518: return false;
0519: offset += fBytesPerChar;
0520: }
0521: fCurrentOffset = offset;
0522: fCharacterCounter += s.length;
0523: return true;
0524: }
0525:
0526: //
0527: //
0528: //
0529: public int scanInvalidChar() throws Exception {
0530: int ch = getChar(fCurrentOffset);
0531: if (ch == -1) {
0532: return changeReaders().scanInvalidChar();
0533: }
0534: fCurrentOffset += fBytesPerChar;
0535: if (ch == 0x0A) {
0536: fLinefeedCounter++;
0537: fCharacterCounter = 1;
0538: } else if (ch == 0x0D) {
0539: fCarriageReturnCounter++;
0540: fCharacterCounter = 1;
0541: } else {
0542: fCharacterCounter++;
0543: if (ch >= 0xD800 && ch < 0xDC00) {
0544: int ch2 = getChar(fCurrentOffset);
0545: if (ch2 >= 0xDC00 && ch2 < 0xE000) {
0546: ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00)
0547: + 0x10000;
0548: fCurrentOffset += fBytesPerChar;
0549: }
0550: }
0551: }
0552: return ch;
0553: }
0554:
0555: //
0556: //
0557: //
0558: public int scanCharRef(boolean hex) throws Exception {
0559: int ch = getChar(fCurrentOffset);
0560: if (ch == -1) {
0561: return changeReaders().scanCharRef(hex);
0562: }
0563: int num = 0;
0564: if (hex) {
0565: if (ch > 'f'
0566: || XMLCharacterProperties.fgAsciiXDigitChar[ch] == 0)
0567: return XMLEntityHandler.CHARREF_RESULT_INVALID_CHAR;
0568: num = ch - (ch < 'A' ? '0' : (ch < 'a' ? 'A' : 'a') - 10);
0569: } else {
0570: if (ch < '0' || ch > '9')
0571: return XMLEntityHandler.CHARREF_RESULT_INVALID_CHAR;
0572: num = ch - '0';
0573: }
0574: fCharacterCounter++;
0575: fCurrentOffset += fBytesPerChar;
0576: boolean toobig = false;
0577: while (true) {
0578: ch = getChar(fCurrentOffset);
0579: if (ch == -1)
0580: break;
0581: if (hex) {
0582: if (ch > 'f'
0583: || XMLCharacterProperties.fgAsciiXDigitChar[ch] == 0)
0584: break;
0585: } else {
0586: if (ch < '0' || ch > '9')
0587: break;
0588: }
0589: fCharacterCounter++;
0590: fCurrentOffset += fBytesPerChar;
0591: if (hex) {
0592: int dig = ch
0593: - (ch < 'A' ? '0' : (ch < 'a' ? 'A' : 'a') - 10);
0594: num = (num << 4) + dig;
0595: } else {
0596: int dig = ch - '0';
0597: num = (num * 10) + dig;
0598: }
0599: if (num > 0x10FFFF) {
0600: toobig = true;
0601: num = 0;
0602: }
0603: }
0604: if (ch != ';')
0605: return XMLEntityHandler.CHARREF_RESULT_SEMICOLON_REQUIRED;
0606: fCharacterCounter++;
0607: fCurrentOffset += fBytesPerChar;
0608: if (toobig)
0609: return XMLEntityHandler.CHARREF_RESULT_OUT_OF_RANGE;
0610: return num;
0611: }
0612:
0613: //
0614: //
0615: //
0616: public int scanStringLiteral() throws Exception {
0617: boolean single;
0618: if (!(single = lookingAtChar('\'', true))
0619: && !lookingAtChar('\"', true)) {
0620: return XMLEntityHandler.STRINGLIT_RESULT_QUOTE_REQUIRED;
0621: }
0622: int offset = fCurrentOffset;
0623: char qchar = single ? '\'' : '\"';
0624: while (!lookingAtChar(qchar, false)) {
0625: if (!lookingAtValidChar(true)) {
0626: return XMLEntityHandler.STRINGLIT_RESULT_INVALID_CHAR;
0627: }
0628: }
0629: int stringIndex = addString(offset, fCurrentOffset - offset);
0630: lookingAtChar(qchar, true); // move past qchar
0631: return stringIndex;
0632: }
0633:
0634: //
0635: // [10] AttValue ::= '"' ([^<&"] | Reference)* '"'
0636: // | "'" ([^<&'] | Reference)* "'"
0637: //
0638: public int scanAttValue(char qchar, boolean asSymbol)
0639: throws Exception {
0640: int offset = fCurrentOffset;
0641: while (true) {
0642: if (lookingAtChar(qchar, false)) {
0643: break;
0644: }
0645: if (lookingAtChar(' ', true)) {
0646: continue;
0647: }
0648: if (lookingAtSpace(false)) {
0649: return XMLEntityHandler.ATTVALUE_RESULT_COMPLEX;
0650: }
0651: if (lookingAtChar('&', false)) {
0652: return XMLEntityHandler.ATTVALUE_RESULT_COMPLEX;
0653: }
0654: if (lookingAtChar('<', false)) {
0655: return XMLEntityHandler.ATTVALUE_RESULT_LESSTHAN;
0656: }
0657: if (!lookingAtValidChar(true)) {
0658: return XMLEntityHandler.ATTVALUE_RESULT_INVALID_CHAR;
0659: }
0660: }
0661: int result = asSymbol ? addSymbol(offset, fCurrentOffset
0662: - offset) : addString(offset, fCurrentOffset - offset);
0663: lookingAtChar(qchar, true);
0664: return result;
0665: }
0666:
0667: //
0668: // [9] EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"'
0669: // | "'" ([^%&'] | PEReference | Reference)* "'"
0670: //
0671: public int scanEntityValue(int qchar, boolean createString)
0672: throws Exception {
0673: int offset = fCurrentOffset;
0674: while (true) {
0675: if (qchar != -1 && lookingAtChar((char) qchar, false)) {
0676: if (!createString)
0677: return XMLEntityHandler.ENTITYVALUE_RESULT_FINISHED;
0678: break;
0679: }
0680: if (lookingAtChar('&', false)) {
0681: return XMLEntityHandler.ENTITYVALUE_RESULT_REFERENCE;
0682: }
0683: if (lookingAtChar('%', false)) {
0684: return XMLEntityHandler.ENTITYVALUE_RESULT_PEREF;
0685: }
0686: if (!lookingAtValidChar(true)) {
0687: return XMLEntityHandler.ENTITYVALUE_RESULT_INVALID_CHAR;
0688: }
0689: }
0690: int result = addString(offset, fCurrentOffset - offset);
0691: lookingAtChar((char) qchar, true);
0692: return result;
0693: }
0694:
0695: //
0696: //
0697: //
0698: public boolean scanExpectedName(char fastcheck,
0699: StringPool.CharArrayRange expectedName) throws Exception {
0700: int nameOffset = fCurrentOffset;
0701: skipPastName(fastcheck);
0702: int nameLength = fCurrentOffset - nameOffset;
0703: if (nameLength == 0)
0704: return false;
0705: int nameIndex = addSymbol(nameOffset, nameLength);
0706: // DEFECT !! check name against expected name
0707: return true;
0708: }
0709:
0710: public void scanQName(char fastcheck, QName qname) throws Exception {
0711:
0712: // REVISIT: possible bugs with surrogate characters -el
0713: int nameOffset = fCurrentOffset;
0714: int ch;
0715: int prefixend = -1;
0716: int offset = fCurrentOffset;
0717: ch = getChar(fCurrentOffset);
0718: if (ch < 0x80) {
0719: if (XMLCharacterProperties.fgAsciiInitialNameChar[ch] == 0) {
0720: qname.clear();
0721: return;
0722: }
0723: if (ch == ':') {
0724: qname.clear();
0725: return;
0726: }
0727: } else {
0728: if (!fCalledCharPropInit) {
0729: XMLCharacterProperties.initCharFlags();
0730: fCalledCharPropInit = true;
0731: }
0732: if ((XMLCharacterProperties.fgCharFlags[ch] & XMLCharacterProperties.E_InitialNameCharFlag) == 0)
0733: return;
0734: }
0735:
0736: while (true) {
0737: fCurrentOffset += fBytesPerChar;
0738: fCharacterCounter++;
0739: ch = getChar(fCurrentOffset);
0740: if (fastcheck == ch) {
0741: break;
0742: }
0743: if (ch < 0x80) {
0744: if (XMLCharacterProperties.fgAsciiNameChar[ch] == 0) {
0745: break;
0746: }
0747: if (ch == ':') {
0748: if (prefixend != -1) {
0749: break;
0750: }
0751: prefixend = fCurrentOffset;
0752: //
0753: // We need to peek ahead one character. If the next character is not a
0754: // valid initial name character, or is another colon, then we cannot meet
0755: // both the Prefix and LocalPart productions for the QName production,
0756: // which means that there is no Prefix and we need to terminate the QName
0757: // at the first colon. --JR's comments
0758: //
0759:
0760: ch = getChar(fCurrentOffset + fBytesPerChar);
0761: boolean lpok = true;
0762: if (ch < 0x80) {
0763: if (XMLCharacterProperties.fgAsciiInitialNameChar[ch] == 0
0764: || ch == ':') {
0765: lpok = false;
0766: }
0767: } else {
0768: if (!fCalledCharPropInit) {
0769: XMLCharacterProperties.initCharFlags();
0770: fCalledCharPropInit = true;
0771: }
0772: if ((XMLCharacterProperties.fgCharFlags[ch] & XMLCharacterProperties.E_InitialNameCharFlag) == 0) {
0773: lpok = false;
0774: }
0775: }
0776: if (!lpok) {
0777: prefixend = -1;
0778: break;
0779: }
0780: }
0781: } else {
0782: if (!fCalledCharPropInit) {
0783: XMLCharacterProperties.initCharFlags();
0784: fCalledCharPropInit = true;
0785: }
0786: if ((XMLCharacterProperties.fgCharFlags[ch] & XMLCharacterProperties.E_NameCharFlag) == 0) {
0787: break;
0788: }
0789: }
0790: }//end while loop
0791: int length = fCurrentOffset - offset;
0792: qname.prefix = prefixend == -1 ? -1 : addSymbol(offset,
0793: prefixend - offset);
0794: qname.rawname = addSymbol(offset, length);
0795: qname.localpart = prefixend == -1 ? qname.rawname : addSymbol(
0796: prefixend + fBytesPerChar, fCurrentOffset
0797: - (prefixend + fBytesPerChar));
0798: qname.uri = StringPool.EMPTY_STRING;
0799:
0800: } // scanQName(char,QName)
0801:
0802: public int scanName(char fastcheck) throws Exception {
0803: int nameOffset = fCurrentOffset;
0804: skipPastName(fastcheck);
0805: int nameLength = fCurrentOffset - nameOffset;
0806: if (nameLength == 0)
0807: return -1;
0808: int nameIndex = addSymbol(nameOffset, nameLength);
0809: return nameIndex;
0810: }
0811:
0812: //
0813: //
0814: //
0815: private static final char[] cdata_string = { 'C', 'D', 'A', 'T',
0816: 'A', '[' };
0817:
0818: private int recognizeMarkup() throws Exception {
0819: int ch = getChar(fCurrentOffset);
0820: switch (ch) {
0821: case -1:
0822: return XMLEntityHandler.CONTENT_RESULT_MARKUP_END_OF_INPUT;
0823: case '?':
0824: fCharacterCounter++;
0825: fCurrentOffset += fBytesPerChar;
0826: return XMLEntityHandler.CONTENT_RESULT_START_OF_PI;
0827: case '!':
0828: fCharacterCounter++;
0829: fCurrentOffset += fBytesPerChar;
0830: ch = getChar(fCurrentOffset);
0831: if (ch == -1) {
0832: fCharacterCounter--;
0833: fCurrentOffset -= fBytesPerChar;
0834: ;
0835: return XMLEntityHandler.CONTENT_RESULT_MARKUP_END_OF_INPUT;
0836: }
0837: if (ch == '-') {
0838: fCharacterCounter++;
0839: fCurrentOffset += fBytesPerChar;
0840: ch = getChar(fCurrentOffset);
0841: if (ch == -1) {
0842: fCharacterCounter -= 2;
0843: fCurrentOffset -= 2;
0844: return XMLEntityHandler.CONTENT_RESULT_MARKUP_END_OF_INPUT;
0845: }
0846: if (ch == '-') {
0847: fCharacterCounter++;
0848: fCurrentOffset += fBytesPerChar;
0849: return XMLEntityHandler.CONTENT_RESULT_START_OF_COMMENT;
0850: }
0851: break;
0852: }
0853: if (ch == '[') {
0854: fCharacterCounter++;
0855: fCurrentOffset += fBytesPerChar;
0856: for (int i = 0; i < 6; i++) {
0857: ch = getChar(fCurrentOffset);
0858: if (ch == -1) {
0859: fCharacterCounter -= (2 + i);
0860: fCurrentOffset -= ((2 + i) * fBytesPerChar);
0861: return XMLEntityHandler.CONTENT_RESULT_MARKUP_END_OF_INPUT;
0862: }
0863: if (ch != cdata_string[i]) {
0864: return XMLEntityHandler.CONTENT_RESULT_MARKUP_NOT_RECOGNIZED;
0865: }
0866: fCharacterCounter++;
0867: fCurrentOffset += fBytesPerChar;
0868: }
0869: return XMLEntityHandler.CONTENT_RESULT_START_OF_CDSECT;
0870: }
0871: break;
0872: case '/':
0873: fCharacterCounter++;
0874: fCurrentOffset += fBytesPerChar;
0875: return XMLEntityHandler.CONTENT_RESULT_START_OF_ETAG;
0876: default:
0877: return XMLEntityHandler.CONTENT_RESULT_START_OF_ELEMENT;
0878: }
0879: return XMLEntityHandler.CONTENT_RESULT_MARKUP_NOT_RECOGNIZED;
0880: }
0881:
0882: private int recognizeReference() throws Exception {
0883: int ch = getChar(fCurrentOffset);
0884: if (ch == -1) {
0885: return XMLEntityHandler.CONTENT_RESULT_REFERENCE_END_OF_INPUT;
0886: }
0887: //
0888: // [67] Reference ::= EntityRef | CharRef
0889: // [68] EntityRef ::= '&' Name ';'
0890: // [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
0891: //
0892: if (ch == '#') {
0893: fCharacterCounter++;
0894: fCurrentOffset += fBytesPerChar;
0895: return XMLEntityHandler.CONTENT_RESULT_START_OF_CHARREF;
0896: } else {
0897: return XMLEntityHandler.CONTENT_RESULT_START_OF_ENTITYREF;
0898: }
0899: }
0900:
/**
 * Scans element content starting at the current offset until something
 * other than plain character data is found.
 * <p>
 * Character data scanned before the stopping point is delivered through
 * callCharDataHandler: flagged as white space when only a leading run of
 * space/tab/LF/CR was scanned, as ordinary characters otherwise. The method
 * works in three phases: (1) classify the first character, including a
 * leading white space run; (2) a fast loop over ASCII character data;
 * (3) a general loop that also handles non-ASCII characters.
 * <p>
 * Inside a CDATA section (fInCDSect is true) '<' and '&' are treated as
 * data rather than as markup or a reference.
 *
 * @param element passed through unchanged when scanning continues in the
 *                reader returned by changeReaders()
 * @return the result of recognizeMarkup() after '<', the result of
 *         recognizeReference() after '&', CONTENT_RESULT_END_OF_CDSECT
 *         after "]]>", CONTENT_RESULT_INVALID_CHAR for a character that is
 *         not allowed, or, at end of input, whatever the next reader's
 *         scanContent returns
 */
public int scanContent(QName element) throws Exception {
    // start of the character data run that will be reported
    int offset = fCurrentOffset;
    int ch = getChar(fCurrentOffset);
    // fCurrentOffset is kept one character AHEAD of ch throughout
    fCurrentOffset += fBytesPerChar;
    byte prop;
    if (!fCalledCharPropInit) {
        XMLCharacterProperties.initCharFlags();
        fCalledCharPropInit = true;
    }
    //
    // Phase 1: classify the first character.
    //
    if (ch < 0x80) {
        if (ch == -1) {
            // end of input: undo the look-ahead and continue in the next reader
            fCurrentOffset -= fBytesPerChar;
            return changeReaders().scanContent(element); // REVISIT - not quite...
        }
        prop = XMLCharacterProperties.fgCharFlags[ch];
        if ((prop & XMLCharacterProperties.E_CharDataFlag) == 0
                && ch != 0x0A && ch != 0x0D) {
            // first character is not plain character data
            if (ch == '<') {
                fCharacterCounter++;
                if (!fInCDSect) {
                    return recognizeMarkup();
                }
            } else if (ch == '&') {
                fCharacterCounter++;
                if (!fInCDSect) {
                    return recognizeReference();
                }
            } else if (ch == ']') {
                // "]]>" closes a CDATA section
                if (getChar(fCurrentOffset) == ']'
                        && getChar(fCurrentOffset + fBytesPerChar) == '>') {
                    fCharacterCounter += 3;
                    fCurrentOffset += (2 * fBytesPerChar);
                    return XMLEntityHandler.CONTENT_RESULT_END_OF_CDSECT;
                }
            } else {
                // leave the offending character unconsumed
                fCurrentOffset -= fBytesPerChar;
                return XMLEntityHandler.CONTENT_RESULT_INVALID_CHAR;
            }
        } else if (ch == 0x20 || ch == 0x09 || ch == 0x0A
                || ch == 0x0D) {
            // consume the whole leading white space run, tracking line/column
            do {
                if (ch == 0x0A) {
                    fLinefeedCounter++;
                    fCharacterCounter = 1;
                } else if (ch == 0x0D) {
                    fCarriageReturnCounter++;
                    fCharacterCounter = 1;
                } else {
                    fCharacterCounter++;
                }
                ch = getChar(fCurrentOffset);
                fCurrentOffset += fBytesPerChar;
            } while (ch == 0x20 || ch == 0x09 || ch == 0x0A
                    || ch == 0x0D);
            // ch is now the first character after the white space run
            if (ch < 0x80) {
                if (ch == -1) {
                    fCurrentOffset -= fBytesPerChar;
                    callCharDataHandler(offset, fCurrentOffset
                            - offset, true);
                    return changeReaders().scanContent(element); // REVISIT - not quite...
                }
                prop = XMLCharacterProperties.fgCharFlags[ch];
                if ((prop & XMLCharacterProperties.E_CharDataFlag) == 0) {
                    if (ch == '<') {
                        if (!fInCDSect) {
                            // report the white space, excluding the '<'
                            callCharDataHandler(
                                    offset,
                                    (fCurrentOffset - fBytesPerChar)
                                            - offset, true);
                            fCharacterCounter++;
                            return recognizeMarkup();
                        }
                        fCharacterCounter++;
                    } else if (ch == '&') {
                        if (!fInCDSect) {
                            // report the white space, excluding the '&'
                            callCharDataHandler(
                                    offset,
                                    (fCurrentOffset - fBytesPerChar)
                                            - offset, true);
                            fCharacterCounter++;
                            return recognizeReference();
                        }
                        fCharacterCounter++;
                    } else if (ch == ']') {
                        if (getChar(fCurrentOffset) == ']'
                                && getChar(fCurrentOffset
                                        + fBytesPerChar) == '>') {
                            callCharDataHandler(
                                    offset,
                                    (fCurrentOffset - fBytesPerChar)
                                            - offset, true);
                            fCharacterCounter += 3;
                            fCurrentOffset += (2 * fBytesPerChar);
                            return XMLEntityHandler.CONTENT_RESULT_END_OF_CDSECT;
                        }
                    } else {
                        fCurrentOffset -= fBytesPerChar;
                        callCharDataHandler(offset, fCurrentOffset
                                - offset, true);
                        return XMLEntityHandler.CONTENT_RESULT_INVALID_CHAR;
                    }
                }
            } else {
                if (ch >= 0xD800 && ch <= 0xDFFF) {
                    // surrogate: step over the following code unit as well
                    fCurrentOffset += fBytesPerChar;
                } else if (ch == 0xFFFE || ch == 0xFFFF) {
                    fCurrentOffset -= fBytesPerChar;
                    callCharDataHandler(offset, fCurrentOffset
                            - offset, true);
                    return XMLEntityHandler.CONTENT_RESULT_INVALID_CHAR;
                }
            }
        }
    } else {
        if (ch >= 0xD800 && ch <= 0xDFFF) {
            // surrogate: step over the following code unit as well
            fCurrentOffset += fBytesPerChar;
        } else if (ch == 0xFFFE || ch == 0xFFFF) {
            fCurrentOffset -= fBytesPerChar;
            return XMLEntityHandler.CONTENT_RESULT_INVALID_CHAR;
        }
    }
    // count the character that phase 1 accepted as data
    fCharacterCounter++;
    //
    // Phase 2: fast loop over ASCII character data. It stops at end of
    // input, at a non-ASCII character, or at an ASCII character that is
    // neither character data nor a line end.
    //
    while (true) {
        ch = getChar(fCurrentOffset);
        fCurrentOffset += fBytesPerChar;
        if (ch >= 0x80 || ch < 0)
            break;
        prop = XMLCharacterProperties.fgCharFlags[ch];
        if ((prop & XMLCharacterProperties.E_CharDataFlag) == 0) {
            if (ch == 0x0A) {
                fLinefeedCounter++;
                fCharacterCounter = 1;
            } else if (ch == 0x0D) {
                fCarriageReturnCounter++;
                fCharacterCounter = 1;
            } else
                break;
        } else
            fCharacterCounter++;
    }
    //
    // Phase 3: general loop; ch is the character that ended phase 2. Every
    // exit reports the data scanned so far as ordinary (non-white-space)
    // characters.
    //
    while (true) { // REVISIT - EOF check ?
        if (ch < 0x80) {
            if (ch == -1) {
                fCurrentOffset -= fBytesPerChar;
                callCharDataHandler(offset,
                        fCurrentOffset - offset, false);
                return changeReaders().scanContent(element); // REVISIT - not quite...
            }
            prop = XMLCharacterProperties.fgCharFlags[ch];
            if ((prop & XMLCharacterProperties.E_CharDataFlag) == 0) {
                if (ch == '<') {
                    if (!fInCDSect) {
                        callCharDataHandler(offset,
                                (fCurrentOffset - fBytesPerChar)
                                        - offset, false);
                        fCharacterCounter++;
                        return recognizeMarkup();
                    }
                    fCharacterCounter++;
                } else if (ch == '&') {
                    if (!fInCDSect) {
                        callCharDataHandler(offset,
                                (fCurrentOffset - fBytesPerChar)
                                        - offset, false);
                        fCharacterCounter++;
                        return recognizeReference();
                    }
                    fCharacterCounter++;
                } else if (ch == 0x0A) {
                    fLinefeedCounter++;
                    fCharacterCounter = 1;
                } else if (ch == 0x0D) {
                    fCarriageReturnCounter++;
                    fCharacterCounter = 1;
                } else if (ch == ']') {
                    if (getChar(fCurrentOffset) == ']'
                            && getChar(fCurrentOffset
                                    + fBytesPerChar) == '>') {
                        callCharDataHandler(offset,
                                (fCurrentOffset - fBytesPerChar)
                                        - offset, false);
                        fCharacterCounter += 3;
                        fCurrentOffset += (2 * fBytesPerChar);
                        return XMLEntityHandler.CONTENT_RESULT_END_OF_CDSECT;
                    }
                    fCharacterCounter++;
                } else {
                    fCurrentOffset -= fBytesPerChar;
                    callCharDataHandler(offset, fCurrentOffset
                            - offset, false);
                    return XMLEntityHandler.CONTENT_RESULT_INVALID_CHAR;
                }
            } else {
                fCharacterCounter++;
            }
        } else {
            if (ch >= 0xD800 && ch <= 0xDFFF) {
                // surrogate: count and step over the following code unit too
                fCharacterCounter++;
                fCurrentOffset += fBytesPerChar;
            } else if (ch == 0xFFFE || ch == 0xFFFF) {
                fCurrentOffset -= fBytesPerChar;
                callCharDataHandler(offset,
                        fCurrentOffset - offset, false);
                return XMLEntityHandler.CONTENT_RESULT_INVALID_CHAR;
            }
            fCharacterCounter++;
        }
        ch = getChar(fCurrentOffset);
        fCurrentOffset += fBytesPerChar;
    }
}
1112: }
|