001: /*
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: */
017:
018: package org.apache.xerces.impl;
019:
020: import java.io.IOException;
021:
022: import org.apache.xerces.impl.msg.XMLMessageFormatter;
023: import org.apache.xerces.util.XML11Char;
024: import org.apache.xerces.util.XMLChar;
025: import org.apache.xerces.util.XMLStringBuffer;
026: import org.apache.xerces.xni.XMLString;
027: import org.apache.xerces.xni.XNIException;
028:
029: /**
030: * This class is responsible for scanning XML document structure
031: * and content. The scanner acts as the source for the document
032: * information which is communicated to the document handler.
033: * <p>
034: * This component requires the following features and properties from the
035: * component manager that uses it:
036: * <ul>
037: * <li>http://xml.org/sax/features/namespaces</li>
038: * <li>http://xml.org/sax/features/validation</li>
039: * <li>http://apache.org/xml/features/nonvalidating/load-external-dtd</li>
040: * <li>http://apache.org/xml/features/scanner/notify-char-refs</li>
041: * <li>http://apache.org/xml/features/scanner/notify-builtin-refs</li>
042: * <li>http://apache.org/xml/properties/internal/symbol-table</li>
043: * <li>http://apache.org/xml/properties/internal/error-reporter</li>
044: * <li>http://apache.org/xml/properties/internal/entity-manager</li>
045: * <li>http://apache.org/xml/properties/internal/dtd-scanner</li>
046: * </ul>
047: *
048: * @xerces.internal
049: *
050: * @author Glenn Marcy, IBM
051: * @author Andy Clark, IBM
052: * @author Arnaud Le Hors, IBM
053: * @author Eric Ye, IBM
054: *
055: * @version $Id: XML11DocumentScannerImpl.java 572055 2007-09-02 17:55:43Z mrglavas $
056: */
057: public class XML11DocumentScannerImpl extends XMLDocumentScannerImpl {
058:
059: /** String. */
060: private final XMLString fString = new XMLString();
061:
062: /** String buffer. */
063: private final XMLStringBuffer fStringBuffer = new XMLStringBuffer();
064: private final XMLStringBuffer fStringBuffer2 = new XMLStringBuffer();
065: private final XMLStringBuffer fStringBuffer3 = new XMLStringBuffer();
066:
067: //
068: // Constructors
069: //
070:
071: /** Default constructor. */
072: public XML11DocumentScannerImpl() {
073: super ();
074: } // <init>()
075:
076: //
077: // overridden methods
078: //
079:
080: // XMLDocumentFragmentImpl methods
081:
082: /**
083: * Scans element content.
084: *
085: * @return Returns the next character on the stream.
086: */
087: protected int scanContent() throws IOException, XNIException {
088:
089: XMLString content = fString;
090: int c = fEntityScanner.scanContent(content);
091: if (c == '\r' || c == 0x85 || c == 0x2028) {
092: // happens when there is the character reference
093: // but scanContent doesn't do entity expansions...
094: // is this *really* necessary??? - NG
095: fEntityScanner.scanChar();
096: fStringBuffer.clear();
097: fStringBuffer.append(fString);
098: fStringBuffer.append((char) c);
099: content = fStringBuffer;
100: c = -1;
101: }
102: if (fDocumentHandler != null && content.length > 0) {
103: fDocumentHandler.characters(content, null);
104: }
105:
106: if (c == ']' && fString.length == 0) {
107: fStringBuffer.clear();
108: fStringBuffer.append((char) fEntityScanner.scanChar());
109: // remember where we are in case we get an endEntity before we
110: // could flush the buffer out - this happens when we're parsing an
111: // entity which ends with a ]
112: fInScanContent = true;
113: //
114: // We work on a single character basis to handle cases such as:
115: // ']]]>' which we might otherwise miss.
116: //
117: if (fEntityScanner.skipChar(']')) {
118: fStringBuffer.append(']');
119: while (fEntityScanner.skipChar(']')) {
120: fStringBuffer.append(']');
121: }
122: if (fEntityScanner.skipChar('>')) {
123: reportFatalError("CDEndInContent", null);
124: }
125: }
126: if (fDocumentHandler != null && fStringBuffer.length != 0) {
127: fDocumentHandler.characters(fStringBuffer, null);
128: }
129: fInScanContent = false;
130: c = -1;
131: }
132: return c;
133:
134: } // scanContent():int
135:
136: /**
137: * Scans an attribute value and normalizes whitespace converting all
138: * whitespace characters to space characters.
139: *
140: * [10] AttValue ::= '"' ([^<&"] | Reference)* '"' | "'" ([^<&'] | Reference)* "'"
141: *
142: * @param value The XMLString to fill in with the value.
143: * @param nonNormalizedValue The XMLString to fill in with the
144: * non-normalized value.
145: * @param atName The name of the attribute being parsed (for error msgs).
146: * @param checkEntities true if undeclared entities should be reported as VC violation,
147: * false if undeclared entities should be reported as WFC violation.
148: * @param eleName The name of element to which this attribute belongs.
149: *
150: * @return true if the non-normalized and normalized value are the same
151: *
152: * <strong>Note:</strong> This method uses fStringBuffer2, anything in it
153: * at the time of calling is lost.
154: **/
155: protected boolean scanAttributeValue(XMLString value,
156: XMLString nonNormalizedValue, String atName,
157: boolean checkEntities, String eleName) throws IOException,
158: XNIException {
159: // quote
160: int quote = fEntityScanner.peekChar();
161: if (quote != '\'' && quote != '"') {
162: reportFatalError("OpenQuoteExpected", new Object[] {
163: eleName, atName });
164: }
165:
166: fEntityScanner.scanChar();
167: int entityDepth = fEntityDepth;
168:
169: int c = fEntityScanner.scanLiteral(quote, value);
170: if (DEBUG_ATTR_NORMALIZATION) {
171: System.out.println("** scanLiteral -> \""
172: + value.toString() + "\"");
173: }
174:
175: int fromIndex = 0;
176: if (c == quote
177: && (fromIndex = isUnchangedByNormalization(value)) == -1) {
178: /** Both the non-normalized and normalized attribute values are equal. **/
179: nonNormalizedValue.setValues(value);
180: int cquote = fEntityScanner.scanChar();
181: if (cquote != quote) {
182: reportFatalError("CloseQuoteExpected", new Object[] {
183: eleName, atName });
184: }
185: return true;
186: }
187: fStringBuffer2.clear();
188: fStringBuffer2.append(value);
189: normalizeWhitespace(value, fromIndex);
190: if (DEBUG_ATTR_NORMALIZATION) {
191: System.out.println("** normalizeWhitespace -> \""
192: + value.toString() + "\"");
193: }
194: if (c != quote) {
195: fScanningAttribute = true;
196: fStringBuffer.clear();
197: do {
198: fStringBuffer.append(value);
199: if (DEBUG_ATTR_NORMALIZATION) {
200: System.out.println("** value2: \""
201: + fStringBuffer.toString() + "\"");
202: }
203: if (c == '&') {
204: fEntityScanner.skipChar('&');
205: if (entityDepth == fEntityDepth) {
206: fStringBuffer2.append('&');
207: }
208: if (fEntityScanner.skipChar('#')) {
209: if (entityDepth == fEntityDepth) {
210: fStringBuffer2.append('#');
211: }
212: int ch = scanCharReferenceValue(fStringBuffer,
213: fStringBuffer2);
214: if (ch != -1) {
215: if (DEBUG_ATTR_NORMALIZATION) {
216: System.out.println("** value3: \""
217: + fStringBuffer.toString()
218: + "\"");
219: }
220: }
221: } else {
222: String entityName = fEntityScanner.scanName();
223: if (entityName == null) {
224: reportFatalError("NameRequiredInReference",
225: null);
226: } else if (entityDepth == fEntityDepth) {
227: fStringBuffer2.append(entityName);
228: }
229: if (!fEntityScanner.skipChar(';')) {
230: reportFatalError(
231: "SemicolonRequiredInReference",
232: new Object[] { entityName });
233: } else if (entityDepth == fEntityDepth) {
234: fStringBuffer2.append(';');
235: }
236: if (entityName == fAmpSymbol) {
237: fStringBuffer.append('&');
238: if (DEBUG_ATTR_NORMALIZATION) {
239: System.out.println("** value5: \""
240: + fStringBuffer.toString()
241: + "\"");
242: }
243: } else if (entityName == fAposSymbol) {
244: fStringBuffer.append('\'');
245: if (DEBUG_ATTR_NORMALIZATION) {
246: System.out.println("** value7: \""
247: + fStringBuffer.toString()
248: + "\"");
249: }
250: } else if (entityName == fLtSymbol) {
251: fStringBuffer.append('<');
252: if (DEBUG_ATTR_NORMALIZATION) {
253: System.out.println("** value9: \""
254: + fStringBuffer.toString()
255: + "\"");
256: }
257: } else if (entityName == fGtSymbol) {
258: fStringBuffer.append('>');
259: if (DEBUG_ATTR_NORMALIZATION) {
260: System.out.println("** valueB: \""
261: + fStringBuffer.toString()
262: + "\"");
263: }
264: } else if (entityName == fQuotSymbol) {
265: fStringBuffer.append('"');
266: if (DEBUG_ATTR_NORMALIZATION) {
267: System.out.println("** valueD: \""
268: + fStringBuffer.toString()
269: + "\"");
270: }
271: } else {
272: if (fEntityManager
273: .isExternalEntity(entityName)) {
274: reportFatalError(
275: "ReferenceToExternalEntity",
276: new Object[] { entityName });
277: } else {
278: if (!fEntityManager
279: .isDeclaredEntity(entityName)) {
280: //WFC & VC: Entity Declared
281: if (checkEntities) {
282: if (fValidation) {
283: fErrorReporter
284: .reportError(
285: XMLMessageFormatter.XML_DOMAIN,
286: "EntityNotDeclared",
287: new Object[] { entityName },
288: XMLErrorReporter.SEVERITY_ERROR);
289: }
290: } else {
291: reportFatalError(
292: "EntityNotDeclared",
293: new Object[] { entityName });
294: }
295: }
296: fEntityManager.startEntity(entityName,
297: true);
298: }
299: }
300: }
301: } else if (c == '<') {
302: reportFatalError("LessthanInAttValue",
303: new Object[] { eleName, atName });
304: fEntityScanner.scanChar();
305: if (entityDepth == fEntityDepth) {
306: fStringBuffer2.append((char) c);
307: }
308: } else if (c == '%' || c == ']') {
309: fEntityScanner.scanChar();
310: fStringBuffer.append((char) c);
311: if (entityDepth == fEntityDepth) {
312: fStringBuffer2.append((char) c);
313: }
314: if (DEBUG_ATTR_NORMALIZATION) {
315: System.out.println("** valueF: \""
316: + fStringBuffer.toString() + "\"");
317: }
318: }
319: // note that none of these characters should ever get through
320: // XML11EntityScanner. Not sure why
321: // this check was originally necessary. - NG
322: else if (c == '\n' || c == '\r' || c == 0x85
323: || c == 0x2028) {
324: fEntityScanner.scanChar();
325: fStringBuffer.append(' ');
326: if (entityDepth == fEntityDepth) {
327: fStringBuffer2.append('\n');
328: }
329: } else if (c != -1 && XMLChar.isHighSurrogate(c)) {
330: fStringBuffer3.clear();
331: if (scanSurrogates(fStringBuffer3)) {
332: fStringBuffer.append(fStringBuffer3);
333: if (entityDepth == fEntityDepth) {
334: fStringBuffer2.append(fStringBuffer3);
335: }
336: if (DEBUG_ATTR_NORMALIZATION) {
337: System.out.println("** valueI: \""
338: + fStringBuffer.toString() + "\"");
339: }
340: }
341: } else if (c != -1 && isInvalidLiteral(c)) {
342: reportFatalError("InvalidCharInAttValue",
343: new Object[] { eleName, atName,
344: Integer.toString(c, 16) });
345: fEntityScanner.scanChar();
346: if (entityDepth == fEntityDepth) {
347: fStringBuffer2.append((char) c);
348: }
349: }
350: c = fEntityScanner.scanLiteral(quote, value);
351: if (entityDepth == fEntityDepth) {
352: fStringBuffer2.append(value);
353: }
354: normalizeWhitespace(value);
355: } while (c != quote || entityDepth != fEntityDepth);
356: fStringBuffer.append(value);
357: if (DEBUG_ATTR_NORMALIZATION) {
358: System.out.println("** valueN: \""
359: + fStringBuffer.toString() + "\"");
360: }
361: value.setValues(fStringBuffer);
362: fScanningAttribute = false;
363: }
364: nonNormalizedValue.setValues(fStringBuffer2);
365:
366: // quote
367: int cquote = fEntityScanner.scanChar();
368: if (cquote != quote) {
369: reportFatalError("CloseQuoteExpected", new Object[] {
370: eleName, atName });
371: }
372: return nonNormalizedValue.equals(value.ch, value.offset,
373: value.length);
374: } // scanAttributeValue()
375:
376: //
377: // XMLScanner methods
378: //
379: // NOTE: this is a carbon copy of the code in XML11DTDScannerImpl;
380: // we need to override these methods in both places.
381: // this needs to be refactored!!! - NG
382: /**
383: * Scans public ID literal.
384: *
385: * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
386: * [13] PubidChar::= #x20 | #xD | #xA | [a-zA-Z0-9] | [-'()+,./:=?;!*#@$_%]
387: *
388: * The returned string is normalized according to the following rule,
389: * from http://www.w3.org/TR/REC-xml#dt-pubid:
390: *
391: * Before a match is attempted, all strings of white space in the public
392: * identifier must be normalized to single space characters (#x20), and
393: * leading and trailing white space must be removed.
394: *
395: * @param literal The string to fill in with the public ID literal.
396: * @return True on success.
397: *
398: * <strong>Note:</strong> This method uses fStringBuffer, anything in it at
399: * the time of calling is lost.
400: */
401: protected boolean scanPubidLiteral(XMLString literal)
402: throws IOException, XNIException {
403: int quote = fEntityScanner.scanChar();
404: if (quote != '\'' && quote != '"') {
405: reportFatalError("QuoteRequiredInPublicID", null);
406: return false;
407: }
408:
409: fStringBuffer.clear();
410: // skip leading whitespace
411: boolean skipSpace = true;
412: boolean dataok = true;
413: while (true) {
414: int c = fEntityScanner.scanChar();
415: // REVISIT: none of these except \n and 0x20 should make it past the entity scanner
416: if (c == ' ' || c == '\n' || c == '\r' || c == 0x85
417: || c == 0x2028) {
418: if (!skipSpace) {
419: // take the first whitespace as a space and skip the others
420: fStringBuffer.append(' ');
421: skipSpace = true;
422: }
423: } else if (c == quote) {
424: if (skipSpace) {
425: // if we finished on a space let's trim it
426: fStringBuffer.length--;
427: }
428: literal.setValues(fStringBuffer);
429: break;
430: } else if (XMLChar.isPubid(c)) {
431: fStringBuffer.append((char) c);
432: skipSpace = false;
433: } else if (c == -1) {
434: reportFatalError("PublicIDUnterminated", null);
435: return false;
436: } else {
437: dataok = false;
438: reportFatalError("InvalidCharInPublicID",
439: new Object[] { Integer.toHexString(c) });
440: }
441: }
442: return dataok;
443: }
444:
445: /**
446: * Normalize whitespace in an XMLString converting all whitespace
447: * characters to space characters.
448: */
449: protected void normalizeWhitespace(XMLString value) {
450: int end = value.offset + value.length;
451: for (int i = value.offset; i < end; ++i) {
452: int c = value.ch[i];
453: if (XMLChar.isSpace(c)) {
454: value.ch[i] = ' ';
455: }
456: }
457: }
458:
459: /**
460: * Normalize whitespace in an XMLString converting all whitespace
461: * characters to space characters.
462: */
463: protected void normalizeWhitespace(XMLString value, int fromIndex) {
464: int end = value.offset + value.length;
465: for (int i = value.offset + fromIndex; i < end; ++i) {
466: int c = value.ch[i];
467: if (XMLChar.isSpace(c)) {
468: value.ch[i] = ' ';
469: }
470: }
471: }
472:
473: /**
474: * Checks whether this string would be unchanged by normalization.
475: *
476: * @return -1 if the value would be unchanged by normalization,
477: * otherwise the index of the first whitespace character which
478: * would be transformed.
479: */
480: protected int isUnchangedByNormalization(XMLString value) {
481: int end = value.offset + value.length;
482: for (int i = value.offset; i < end; ++i) {
483: int c = value.ch[i];
484: if (XMLChar.isSpace(c)) {
485: return i - value.offset;
486: }
487: }
488: return -1;
489: }
490:
491: // returns true if the given character is not
492: // valid with respect to the version of
493: // XML understood by this scanner.
494: protected boolean isInvalid(int value) {
495: return (XML11Char.isXML11Invalid(value));
496: } // isInvalid(int): boolean
497:
498: // returns true if the given character is not
499: // valid or may not be used outside a character reference
500: // with respect to the version of XML understood by this scanner.
501: protected boolean isInvalidLiteral(int value) {
502: return (!XML11Char.isXML11ValidLiteral(value));
503: } // isInvalidLiteral(int): boolean
504:
505: // returns true if the given character is
506: // a valid nameChar with respect to the version of
507: // XML understood by this scanner.
508: protected boolean isValidNameChar(int value) {
509: return (XML11Char.isXML11Name(value));
510: } // isValidNameChar(int): boolean
511:
512: // returns true if the given character is
513: // a valid nameStartChar with respect to the version of
514: // XML understood by this scanner.
515: protected boolean isValidNameStartChar(int value) {
516: return (XML11Char.isXML11NameStart(value));
517: } // isValidNameStartChar(int): boolean
518:
519: // returns true if the given character is
520: // a valid NCName character with respect to the version of
521: // XML understood by this scanner.
522: protected boolean isValidNCName(int value) {
523: return (XML11Char.isXML11NCName(value));
524: } // isValidNCName(int): boolean
525:
526: // returns true if the given character is
527: // a valid high surrogate for a nameStartChar
528: // with respect to the version of XML understood
529: // by this scanner.
530: protected boolean isValidNameStartHighSurrogate(int value) {
531: return XML11Char.isXML11NameHighSurrogate(value);
532: } // isValidNameStartHighSurrogate(int): boolean
533:
534: protected boolean versionSupported(String version) {
535: return (version.equals("1.1") || version.equals("1.0"));
536: } // versionSupported(String): boolean
537:
538: // returns the error message key for unsupported
539: // versions of XML with respect to the version of
540: // XML understood by this scanner.
541: protected String getVersionNotSupportedKey() {
542: return "VersionNotSupported11";
543: } // getVersionNotSupportedKey: String
544:
545: } // class XML11DocumentScannerImpl
|