001: package gnu.xml;
002:
003: import java.io.*;
004: import gnu.text.*;
005: import gnu.lists.*;
006: import gnu.text.Path; /* #ifdef use:java.nio */
007: import java.nio.charset.*;
008:
009: /* #endif */
010:
011: /** Reads XML from a char array.
012: * Assumes a state-less character encoding containing ascii as a sub-set,
013: * and where no byte in a multi-byte character is the same as a xml special
014: * character. Any bytes with high-order bit set are treated as if they
015: * are letters, and can be part of names.
016: *
017: * Handles CR/LF, CDATA, entity references, processing instructions, DOCTYPE,
018: * as well as the obvious (text, element, and attributes).
019: *
020: * @author Per Bothner
021: */
022:
023: public class XMLParser {
024: private static final int EXPECT_NAME_MODIFIER = 1;
025: private static final int SKIP_SPACES_MODIFIER = 2;
026: private static final int INIT_STATE = 0;
027: private static final int TEXT_STATE = 1;
028: private static final int BEGIN_ELEMENT_STATE = 2;
029: private static final int END_ELEMENT_STATE = 4;
030: private static final int SAW_ENTITY_REF = 6; // Saw '&'.
031: private static final int ATTRIBUTE_SEEN_NAME_STATE = 8;
032: private static final int MAYBE_ATTRIBUTE_STATE = 10;
033: private static final int ATTRIBUTE_SEEN_EQ_STATE = 11;
034: private static final int DOCTYPE_SEEN_STATE = 13;
035: private static final int DOCTYPE_NAME_SEEN_STATE = 16;
036: private static final int SAW_LEFT_STATE = 14;
037: private static final int SAW_LEFT_SLASH_STATE = 19; // Seen '</'
038: private static final int SAW_LEFT_EXCL_STATE = 20;
039: private static final int SAW_LEFT_QUEST_STATE = 21; // Seen '<?'
040: private static final int SAW_LEFT_EXCL_MINUS_STATE = 22;
041: private static final int SAW_AMP_STATE = 25; // Saw '&'.
042: private static final int SAW_AMP_SHARP_STATE = 26; // Saw '&#'.
043: private static final int EXPECT_RIGHT_STATE = 27;
044: private static final int PREV_WAS_CR_STATE = 28;
045: private static final int INIT_LEFT_QUEST_STATE = 30;
046: private static final int INIT_TEXT_STATE = 31;
047: private static final int INIT_LEFT_STATE = 34;
048: private static final int INVALID_VERSION_DECL = 35;
049: private static final int SAW_ERROR = 36;
050: private static final int SAW_EOF_ERROR = 37; // Unexpected end-of-file.
051:
052: static final String BAD_ENCODING_SYNTAX = "bad encoding declaration";
053:
054: public static void parse(Object uri, SourceMessages messages,
055: Consumer out) throws java.io.IOException {
056: parse(Path.openInputStream(uri), uri, messages, out);
057: }
058:
059: public static LineInputStreamReader XMLStreamReader(InputStream strm)
060: throws java.io.IOException {
061: LineInputStreamReader in = new LineInputStreamReader(strm);
062: /* #ifndef use:java.nio */
063: // in.markStart();
064: /* #endif */
065: int b1 = in.getByte();
066: int b2 = b1 < 0 ? -1 : in.getByte();
067: int b3 = b2 < 0 ? -1 : in.getByte();
068: if (b1 == 0xEF && b2 == 0xBB && b3 == 0xBF) {
069: in.resetStart(3);
070: in.setCharset("UTF-8");
071: } else if (b1 == 0xFF && b2 == 0xFE && b3 != 0) {
072: in.resetStart(2);
073: in.setCharset("UTF-16LE");
074: } else if (b1 == 0xFE && b2 == 0xFF && b3 != 0) {
075: in.resetStart(2);
076: in.setCharset("UTF-16BE");
077: } else {
078: int b4 = b3 < 0 ? -1 : in.getByte();
079: if (b1 == 0x4C && b2 == 0x6F && b3 == 0xA7 && b4 == 0x94)
080: throw new RuntimeException(
081: "XMLParser: EBCDIC encodings not supported");
082: in.resetStart(0);
083: if ((b1 == '<' && ((b2 == '?' && b3 == 'x' && b4 == 'm') || (b2 == 0
084: && b3 == '?' && b4 == 0)))
085: || (b1 == 0 && b2 == '<' && b3 == 0 && b4 == '?')) {
086: char[] buffer = in.buffer;
087: if (buffer == null)
088: in.buffer = buffer = new char[LineBufferedReader.BUFFER_SIZE];
089: int pos = 0;
090: int quote = 0;
091: for (;;) {
092: int b = in.getByte();
093: if (b == 0)
094: continue;
095: if (b < 0) // Unexpected EOF - handled later.
096: break;
097: buffer[pos++] = (char) (b & 0xFF);
098: if (quote == 0) {
099: if (b == '>')
100: break;
101: if (b == '\'' || b == '\"')
102: quote = b;
103: } else if (b == quote)
104: quote = 0;
105: }
106: in.pos = 0;
107: in.limit = pos;
108: } else
109: in.setCharset("UTF-8");
110: }
111: in.setKeepFullLines(false);
112: return in;
113: }
114:
115: public static void parse(InputStream strm, Object uri,
116: SourceMessages messages, Consumer out)
117: throws java.io.IOException {
118: LineInputStreamReader in = XMLStreamReader(strm);
119: if (uri != null)
120: in.setName(uri);
121: parse(in, messages, out);
122: in.close();
123: }
124:
125: public static void parse(LineBufferedReader in,
126: SourceMessages messages, Consumer out)
127: throws java.io.IOException {
128: XMLFilter filter = new XMLFilter(out);
129: filter.setMessages(messages);
130: filter.setSourceLocator(in);
131: filter.startDocument();
132: Object uri = in.getPath();
133: if (uri != null)
134: filter.writeDocumentUri(uri);
135: parse(in, filter);
136: filter.endDocument();
137: }
138:
139: public static void parse(LineBufferedReader in,
140: SourceMessages messages, XMLFilter filter)
141: throws java.io.IOException {
142: filter.setMessages(messages);
143: filter.setSourceLocator(in);
144: filter.startDocument();
145: Object uri = in.getPath();
146: if (uri != null)
147: filter.writeDocumentUri(uri);
148: parse(in, filter);
149: filter.endDocument();
150: in.close();
151: }
152:
153: public static void parse(LineBufferedReader in, XMLFilter out) {
154: // Cache fields in local variables, for speed.
155: char[] buffer = in.buffer;
156: int pos = in.pos;
157: int limit = in.limit;
158:
159: // The flow logic of this method is unusual. It is one big state machine,
160: // but with two "subroutines": SKIP_SPACES_MODIFIER and EXPECT_NAME_MODIFIER.
161: // There is also a "subroutine" to get a new character (and leave it in 'ch')
162: // when 'break handleChar' is executed, except this has the hard-wired
163: // continuation of switching on the 'state'.
164: //
165: // The justification for this rather usual design is performance.
166: // As long as the input is contained within 'buffer', we don't need
167: // to call input methods (only methods for emitting parsed data is
168: // called). We also maximize use of local variables - we do not
169: // access any object fields (including fields of 'this') except
170: // for getting the next char from 'buffer'. These properties mean
171: // this method can be compiled to very tight efficient code.
172:
173: int state = INIT_STATE;
174: // 0: normal - in character context.
175: // 1: seen '&'
176:
177: // The next two varibles are only relevant if state==INIT_STATE:
178: char terminator = (char) '<';
179: int continue_state = SAW_LEFT_STATE;
180: char ch = (char) ' '; // ???
181: int length = 0;
182: int dstart = -1;
183: String message = null;
184:
185: int start = limit;
186: mainLoop: for (;;) {
187: handleChar: // When done get next character.
188: switch (state) {
189: case INIT_STATE:
190: state = TEXT_STATE;
191: state = INIT_TEXT_STATE;
192: break handleChar;
193:
194: case INIT_TEXT_STATE:
195: if (ch == '<') {
196: state = INIT_LEFT_STATE;
197: break handleChar;
198: }
199: state = TEXT_STATE;
200: continue mainLoop;
201:
202: case INIT_LEFT_STATE:
203: if (ch == '?') {
204: start = pos;
205: state = EXPECT_NAME_MODIFIER + SKIP_SPACES_MODIFIER
206: + INIT_LEFT_QUEST_STATE;
207: break handleChar;
208: }
209: state = SAW_LEFT_STATE;
210: continue mainLoop;
211:
212: case INVALID_VERSION_DECL:
213: pos = dstart;
214: message = "invalid xml version specifier";
215: /* ... fall thorugh ... */
216:
217: case SAW_ERROR:
218: in.pos = pos;
219: out.error('e', message);
220: for (;;) {
221: if (pos >= limit)
222: break mainLoop;
223: ch = buffer[pos++];
224: if (ch == '>') {
225: state = TEXT_STATE;
226: break handleChar;
227: }
228: }
229:
230: case SAW_EOF_ERROR:
231: in.pos = pos;
232: out.error('f', "unexpected end-of-file");
233: return;
234:
235: case TEXT_STATE:
236: // This state handle text not inside tags (in which case
237: // terminator=='<'). It also handles attribute values (in
238: // which case terminator is '\'' or '"').
239: start = pos - 1;
240: // Not length now, but used to calculate length when done.
241: length = pos;
242: for (;;) {
243: if (ch == terminator) {
244: state = continue_state;
245: break;
246: }
247: if (ch == '&') {
248: state = SAW_AMP_STATE;
249: break;
250: }
251: if (ch == '\r') {
252: length = pos - length;
253: in.pos = pos;
254: if (length > 0)
255: out.textFromParser(buffer, start, length);
256: if (pos < limit) {
257: ch = buffer[pos];
258: if (ch == '\n') {
259: start = pos;
260: length = ++pos;
261: } else {
262: out.linefeedFromParser();
263: if (ch == 0x85) {
264: start = pos++;
265: length = pos + 1;
266: } else {
267: in.incrLineNumber(1, pos);
268: start = pos;
269: length = ++pos;
270: continue;
271: }
272: }
273: in.incrLineNumber(1, pos);
274: } else {
275: out.linefeedFromParser();
276: state = PREV_WAS_CR_STATE;
277: break handleChar;
278: }
279: } else if (ch == 0x85 || ch == 0x2028) {
280: length = pos - length;
281: in.pos = pos - 1;
282: if (length > 0)
283: out.textFromParser(buffer, start, length);
284: out.linefeedFromParser();
285: in.incrLineNumber(1, pos);
286: length = pos + 1;
287: start = pos;
288: } else if (ch == '\n') {
289: in.incrLineNumber(1, pos);
290: }
291: if (pos == limit) {
292: length--;
293: break;
294: }
295: ch = buffer[pos++];
296: }
297: length = pos - length;
298: if (length > 0) {
299: in.pos = pos;
300: out.textFromParser(buffer, start, length);
301: }
302: start = buffer.length;
303: break handleChar;
304:
305: case PREV_WAS_CR_STATE:
306: // The previous character was a '\r', and we passed along '\n'
307: // to out. If the new character is '\n' or 0x85 ignore it.
308: state = TEXT_STATE;
309: if (ch == '\n' | ch == 0x85) {
310: in.incrLineNumber(1, pos);
311: break handleChar;
312: } else {
313: in.incrLineNumber(1, pos - 1);
314: continue;
315: }
316:
317: case SKIP_SPACES_MODIFIER + EXPECT_RIGHT_STATE:
318: case SKIP_SPACES_MODIFIER + MAYBE_ATTRIBUTE_STATE:
319: case SKIP_SPACES_MODIFIER + SAW_LEFT_QUEST_STATE:
320: case SKIP_SPACES_MODIFIER + INIT_LEFT_QUEST_STATE:
321: case SKIP_SPACES_MODIFIER + DOCTYPE_SEEN_STATE:
322: // "Subroutine" for skipping whitespace.
323: if (ch == ' ' || ch == '\t')
324: break handleChar;
325: if (ch == '\n' || ch == '\r' || ch == '\u0085'
326: || ch == '\u2028') {
327: in.incrLineNumber(1, pos);
328: break handleChar;
329: }
330: // Not a space, so "return" to next state.
331: state -= SKIP_SPACES_MODIFIER;
332: continue mainLoop;
333:
334: case EXPECT_NAME_MODIFIER + BEGIN_ELEMENT_STATE:
335: case EXPECT_NAME_MODIFIER + END_ELEMENT_STATE:
336: case EXPECT_NAME_MODIFIER + ATTRIBUTE_SEEN_NAME_STATE:
337: case EXPECT_NAME_MODIFIER + SAW_ENTITY_REF:
338: case EXPECT_NAME_MODIFIER + DOCTYPE_NAME_SEEN_STATE:
339: case EXPECT_NAME_MODIFIER + SKIP_SPACES_MODIFIER
340: + SAW_LEFT_QUEST_STATE:
341: case EXPECT_NAME_MODIFIER + SKIP_SPACES_MODIFIER
342: + INIT_LEFT_QUEST_STATE:
343: length = start + 1;
344: // "Subroutine" for reading a Name.
345: for (;;) {
346: // XML 1.1 candidate recommendation:
347: // [2] Char ::= #x9 | #xA | #xD | [#x20-#x7E] | #x85
348: // | [#xA0-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
349: // [4] NameStartChar := ":" | [A-Z] | "_" | [a-z] |
350: // [#xC0-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] |
351: // [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] |
352: // [#x3001-#xD7FF] | [#xF900-#xEFFFF]
353: // [4a] NameChar := NameStartChar | "-" | "." | [0-9] | #xB7 |
354: // [#x0300-#x036F] | [#x203F-#x2040]
355: if ((ch >= 'a' && ch <= 'z')
356: || (ch >= 'A' && ch <= 'Z')
357: || ch == '_'
358: || ch == ':'
359: || (ch >= 0xC0 && (ch <= 0x2FF || (ch >= 0x370 && ((ch <= 0x1FFF && ch != 0x37E) || (ch >= 0x200C && (ch <= 0x200D
360: || (ch >= 0x2070 && ch <= 0x218F)
361: || (ch >= 0x2C00 && ch <= 0x2FEF)
362: || (ch >= 0x3001 && ch <= 0xD7FF) || (ch >= 0xF900 && ch <= 0xFFFD)))))))
363: || (pos > length
364: && (ch >= '0' && ch <= '9')
365: || ch == '.' || ch == '-'
366: || ch == 0xB7 || (ch > 0x300 && (ch <= 0x36F || (ch >= 0x203F && ch <= 0x2040))))) {
367: } else {
368: state -= EXPECT_NAME_MODIFIER;
369: length = pos - length;
370: if (length == 0) {
371: if (state == ATTRIBUTE_SEEN_NAME_STATE)
372: message = "missing or invalid attribute name";
373: else if (state == BEGIN_ELEMENT_STATE
374: || state == END_ELEMENT_STATE)
375: message = "missing or invalid element name";
376: else
377: message = "missing or invalid name";
378: state = SAW_ERROR;
379: }
380: continue mainLoop;
381: }
382: if (pos < limit)
383: ch = buffer[pos++];
384: else
385: break handleChar;
386: }
387: case SAW_AMP_SHARP_STATE:
388: for (;;) {
389: if (ch == ';') {
390: in.pos = pos;
391: out.emitCharacterReference(length, buffer,
392: start, pos - 1 - start);
393: state = TEXT_STATE;
394: break handleChar;
395: }
396: if (ch == 'x' && dstart == 0)
397: dstart = 16;
398: else if (length >= 0x8000000)
399: break; // Overflow likely.
400: else {
401: int base = dstart == 0 ? 10 : dstart;
402: int digit = Character.digit((char) ch, base);
403: if (digit < 0)
404: break;
405: length = length * base + digit;
406: }
407: if (pos < limit)
408: ch = buffer[pos++];
409: else
410: break handleChar;
411: }
412: in.pos = pos;
413: out.error('e', "invalid character reference");
414: state = TEXT_STATE;
415: break handleChar;
416:
417: case SAW_AMP_STATE:
418: if (ch == '#') {
419: state = SAW_AMP_SHARP_STATE;
420: start = pos;
421: length = 0; // accumulated value; -1 means error, -2 overflow
422: dstart = 0; // base - 0 means not seen yet
423: break handleChar;
424: }
425: start = pos - 1;
426: state = EXPECT_NAME_MODIFIER + SAW_ENTITY_REF;
427: continue mainLoop;
428:
429: case SAW_ENTITY_REF:
430: in.pos = pos;
431: if (ch != ';')
432: out.error('w', "missing ';'");
433: out.emitEntityReference(buffer, start, length);
434: start = limit;
435: state = TEXT_STATE;
436: break handleChar;
437:
438: case SAW_LEFT_STATE: // Saw '<'
439: if (ch == '/') {
440: state = SAW_LEFT_SLASH_STATE;
441: break handleChar;
442: }
443: if (ch == '?') {
444: start = pos;
445: state = EXPECT_NAME_MODIFIER + SKIP_SPACES_MODIFIER
446: + SAW_LEFT_QUEST_STATE;
447: break handleChar;
448: }
449: if (ch == '!') {
450: state = SAW_LEFT_EXCL_STATE;
451: start = pos;
452: break handleChar;
453: }
454: // Read Name then goto BEGIN_ELEMENT_STATE.
455: start = pos - 1;
456: state = EXPECT_NAME_MODIFIER + BEGIN_ELEMENT_STATE;
457: continue mainLoop;
458: case BEGIN_ELEMENT_STATE:
459: in.pos = pos - length; // position of start of name, for errors.
460: out.emitStartElement(buffer, start, length);
461: state = SKIP_SPACES_MODIFIER + MAYBE_ATTRIBUTE_STATE;
462: start = limit;
463: continue mainLoop;
464:
465: case SAW_LEFT_QUEST_STATE: // Seen '<?' Name Spaces
466: case INIT_LEFT_QUEST_STATE: // Seen '<?' Name Spaces
467: if (dstart < 0)
468: dstart = pos - 1;
469: for (;;) {
470: int end;
471: if (ch == '>' && buffer[end = pos - 2] == '?'
472: && end >= dstart) {
473: in.pos = pos;
474: if (length == 3 && buffer[start] == 'x'
475: && buffer[start + 1] == 'm'
476: && buffer[start + 2] == 'l') {
477: if (state == INIT_LEFT_QUEST_STATE) {
478: if (end <= dstart + 7
479: || buffer[dstart] != 'v'
480: || buffer[dstart + 1] != 'e'
481: || buffer[dstart + 2] != 'r'
482: || buffer[dstart + 3] != 's'
483: || buffer[dstart + 4] != 'i'
484: || buffer[dstart + 5] != 'o'
485: || buffer[dstart + 6] != 'n') {
486: pos = dstart;
487: message = "xml declaration without version";
488: state = SAW_ERROR;
489: continue mainLoop;
490: }
491: dstart += 7;
492: ch = buffer[dstart];
493: while (Character.isWhitespace(ch)
494: && ++dstart < end)
495: ch = buffer[dstart];
496: if (ch != '=') {
497: state = INVALID_VERSION_DECL;
498: continue mainLoop;
499: }
500: ch = buffer[++dstart];
501: while (Character.isWhitespace(ch)
502: && ++dstart < end)
503: ch = buffer[dstart];
504: if (ch != '\'' && ch != '\"') {
505: state = INVALID_VERSION_DECL;
506: continue mainLoop;
507: }
508: char quote = ch;
509: int i = ++dstart;
510: for (;; i++) {
511: if (i == end) {
512: state = INVALID_VERSION_DECL;
513: continue mainLoop;
514: }
515: ch = buffer[i];
516: if (ch == quote)
517: break;
518: }
519: if (i == dstart + 3
520: && buffer[dstart] == '1'
521: && buffer[dstart + 1] == '.'
522: && (ch = buffer[dstart + 2]) == '0'
523: || ch == '1') {
524: // Save version number, if that is useful.
525: } else {
526: state = INVALID_VERSION_DECL;
527: continue mainLoop;
528: }
529: dstart = i + 1;
530: while (dstart < end
531: && Character
532: .isWhitespace(buffer[dstart]))
533: dstart++;
534: if (end > dstart + 7
535: && buffer[dstart] == 'e'
536: && buffer[dstart + 1] == 'n'
537: && buffer[dstart + 2] == 'c'
538: && buffer[dstart + 3] == 'o'
539: && buffer[dstart + 4] == 'd'
540: && buffer[dstart + 5] == 'i'
541: && buffer[dstart + 6] == 'n'
542: && buffer[dstart + 7] == 'g') {
543: dstart += 8;
544: ch = buffer[dstart];
545: while (Character.isWhitespace(ch)
546: && ++dstart < end)
547: ch = buffer[dstart];
548: if (ch != '=') {
549: message = BAD_ENCODING_SYNTAX;
550: state = SAW_ERROR;
551: continue mainLoop;
552: }
553: ch = buffer[++dstart];
554: while (Character.isWhitespace(ch)
555: && ++dstart < end)
556: ch = buffer[dstart];
557: if (ch != '\'' && ch != '\"') {
558: message = BAD_ENCODING_SYNTAX;
559: state = SAW_ERROR;
560: continue mainLoop;
561: }
562: quote = ch;
563: i = ++dstart;
564: for (;; i++) {
565: if (i == end) {
566: message = BAD_ENCODING_SYNTAX;
567: state = SAW_ERROR;
568: continue mainLoop;
569: }
570: ch = buffer[i];
571: if (ch == quote)
572: break;
573: }
574: String encoding = new String(
575: buffer, dstart, i - dstart);
576: if (in instanceof LineInputStreamReader)
577: ((LineInputStreamReader) in)
578: .setCharset(encoding);
579: dstart = i + 1;
580: while (dstart < end
581: && Character
582: .isWhitespace(buffer[dstart]))
583: dstart++;
584: }
585: if (end != dstart) {
586: message = "junk at end of xml declaration";
587: pos = dstart;
588: state = SAW_ERROR;
589: continue mainLoop;
590: }
591: } else {
592: message = "<?xml must be at start of file";
593: state = SAW_ERROR;
594: continue mainLoop;
595: }
596: } else
597: out
598: .processingInstructionFromParser(
599: buffer, start, length,
600: dstart, end - dstart);
601: start = limit;
602: dstart = -1;
603: state = TEXT_STATE;
604: break handleChar;
605: }
606: if (pos < limit)
607: ch = buffer[pos++];
608: else
609: break handleChar;
610: }
611:
612: case SAW_LEFT_EXCL_STATE: // Seen '<!'
613: exclLoop: for (;;) {
614: if (ch == '>') {
615: length = pos - 1 - start;
616: if (length >= 4 && buffer[start] == '-'
617: && buffer[start + 1] == '-') {
618: if (buffer[pos - 2] == '-'
619: && buffer[pos - 3] == '-') {
620: in.pos = pos;
621: out.commentFromParser(buffer,
622: start + 2, length - 4);
623: break exclLoop;
624: }
625: } else if (length >= 6 && buffer[start] == '['
626: && buffer[start + 1] == 'C'
627: && buffer[start + 2] == 'D'
628: && buffer[start + 3] == 'A'
629: && buffer[start + 4] == 'T'
630: && buffer[start + 5] == 'A'
631: && buffer[start + 6] == '[') {
632: if (buffer[pos - 2] == ']'
633: && buffer[pos - 3] == ']') {
634: in.pos = pos;
635: out.writeCDATA(buffer, start + 7, pos
636: - 10 - start);
637: break exclLoop;
638: }
639: } else {
640: // FIXME ignoreing <!ELEMENT ... > etc.
641: break exclLoop;
642: }
643: } else if (pos == start + 7 && buffer[start] == 'D'
644: && buffer[start + 1] == 'O'
645: && buffer[start + 2] == 'C'
646: && buffer[start + 3] == 'T'
647: && buffer[start + 4] == 'Y'
648: && buffer[start + 5] == 'P' && ch == 'E') {
649: start = limit;
650: state = SKIP_SPACES_MODIFIER
651: + DOCTYPE_SEEN_STATE;
652: break handleChar;
653: }
654: if (pos < limit)
655: ch = buffer[pos++];
656: else
657: break handleChar;
658: }
659: start = limit;
660: state = TEXT_STATE;
661: break handleChar;
662:
663: case DOCTYPE_SEEN_STATE: /* Seen '<!DOCTYPE' S* */
664: state = EXPECT_NAME_MODIFIER + DOCTYPE_NAME_SEEN_STATE;
665: start = pos - 1;
666: continue mainLoop;
667:
668: case DOCTYPE_NAME_SEEN_STATE: /* Seen '<!DOCTYPE' S* Name */
669: if (dstart < 0) {
670: // First type - i.e. not after a handelChar call.
671: dstart = pos - 1;
672: dstart -= start; // Make relative.
673: dstart <<= 1; // Add bit for whether in a '['.
674: terminator = 0;
675: }
676: for (;;) {
677: if (ch == '\'' || ch == '\"') {
678: if (terminator == 0)
679: terminator = ch;
680: else if (terminator == ch)
681: terminator = 0;
682: } else if (terminator == 0) // I.e. not inside a string.
683: {
684: // Low-order bit of dstart is 1 if we've seen a '['.
685: if (ch == '[')
686: dstart |= 1;
687: else if (ch == ']')
688: dstart &= ~1;
689: else if (ch == '>' && (dstart & 1) == 0) {
690: in.pos = pos;
691: dstart >>= 1;
692: dstart += start;
693: out.emitDoctypeDecl(buffer, start, length,
694: dstart, pos - 1 - dstart);
695: terminator = (char) '<';
696: start = limit;
697: dstart = -1;
698: state = TEXT_STATE;
699: break handleChar;
700: }
701: }
702: if (pos < limit)
703: ch = buffer[pos++];
704: else
705: break handleChar;
706: }
707:
708: case MAYBE_ATTRIBUTE_STATE:
709: terminator = '<';
710: continue_state = SAW_LEFT_STATE;
711: if (ch == '/') {
712: in.pos = pos;
713: out.emitEndAttributes();
714: out.emitEndElement(null, 0, 0);
715: state = EXPECT_RIGHT_STATE;
716: break handleChar;
717: }
718: if (ch == '>') {
719: in.pos = pos;
720: out.emitEndAttributes();
721: state = TEXT_STATE;
722: break handleChar;
723: }
724: start = pos - 1;
725: state = EXPECT_NAME_MODIFIER
726: + ATTRIBUTE_SEEN_NAME_STATE;
727: continue mainLoop;
728: case ATTRIBUTE_SEEN_NAME_STATE:
729: if (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n'
730: || ch == '\u0085' || ch == '\u2028')
731: break handleChar;
732: in.pos = pos - length; // position of start of name, for errors.
733: out.emitStartAttribute(buffer, start, length);
734: start = limit;
735: if (ch == '=') {
736: state = ATTRIBUTE_SEEN_EQ_STATE;
737: break handleChar;
738: }
739: out.emitEndAttributes();
740: message = "missing or misplaced '=' after attribute name";
741: state = SAW_ERROR;
742: continue mainLoop;
743: case ATTRIBUTE_SEEN_EQ_STATE:
744: if (ch == '\'' || ch == '\"') {
745: terminator = ch;
746: continue_state = SKIP_SPACES_MODIFIER
747: + MAYBE_ATTRIBUTE_STATE;
748: state = TEXT_STATE;
749: break handleChar;
750: }
751: if (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n'
752: || ch == '\u0085' || ch == '\u2028')
753: break handleChar;
754: message = "missing or unquoted attribute value";
755: state = SAW_ERROR;
756: continue mainLoop;
757:
758: case SAW_LEFT_SLASH_STATE: // Seen '</'.
759: // Do "Name" subroutine, then goto END_ELEMENT_STATE.
760: start = pos - 1;
761: state = EXPECT_NAME_MODIFIER + END_ELEMENT_STATE;
762: continue mainLoop;
763:
764: case END_ELEMENT_STATE: // Seen '</' Name.
765: in.pos = pos;
766: out.emitEndElement(buffer, start, length);
767: start = limit;
768: // Skip spaces then goto EXPECT_RIGHT_STATE.
769: state = SKIP_SPACES_MODIFIER + EXPECT_RIGHT_STATE;
770: continue mainLoop;
771:
772: case EXPECT_RIGHT_STATE: // Looking for '>'.
773: if (ch != '>') {
774: message = "missing '>'";
775: state = SAW_ERROR;
776: continue mainLoop;
777: }
778: state = TEXT_STATE;
779: break handleChar;
780: }
781:
782: // After 'break handleChar', we get here.
783: if (pos < limit)
784: ch = buffer[pos++];
785: else {
786: int saved = pos - start;
787: try {
788: if (saved > 0) {
789: in.pos = start;
790: in.mark(saved + 1);
791: }
792: in.pos = pos;
793: int x = in.read();
794: if (x < 0) {
795: if (state == TEXT_STATE
796: || state == PREV_WAS_CR_STATE)
797: return;
798: state = SAW_EOF_ERROR;
799: continue;
800: }
801: if (saved > 0) {
802: in.reset();
803: in.skip(saved);
804: } else
805: in.unread_quick();
806: } catch (java.io.IOException ex) {
807: throw new RuntimeException(ex.getMessage());
808: }
809: pos = in.pos;
810: buffer = in.buffer;
811:
812: limit = in.limit;
813: start = saved > 0 ? pos - saved : limit;
814: ch = buffer[pos++];
815: }
816: }
817: }
818: }
|