001: package it.unimi.dsi.mg4j.util.parser;
002:
003: /*
004: * MG4J: Managing Gigabytes for Java
005: *
006: * Copyright (C) 2005-2007 Sebastiano Vigna
007: *
008: * This library is free software; you can redistribute it and/or modify it
009: * under the terms of the GNU Lesser General Public License as published by the Free
010: * Software Foundation; either version 2.1 of the License, or (at your option)
011: * any later version.
012: *
013: * This library is distributed in the hope that it will be useful, but
014: * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
015: * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
016: * for more details.
017: *
018: * You should have received a copy of the GNU Lesser General Public License
019: * along with this program; if not, write to the Free Software
020: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
021: *
022: */
023:
024: import it.unimi.dsi.fastutil.objects.Reference2ObjectArrayMap;
025: import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;
026: import it.unimi.dsi.fastutil.objects.ReferenceArraySet;
027: import it.unimi.dsi.fastutil.objects.ReferenceSet;
028: import it.unimi.dsi.fastutil.objects.ReferenceSets;
029: import it.unimi.dsi.mg4j.util.MutableString;
030: import it.unimi.dsi.mg4j.util.TextPattern;
031: import it.unimi.dsi.mg4j.util.parser.callback.Callback;
032:
033: /** A fast, lightweight, on-demand (X)HTML parser.
034: *
035: * <p>The bullet parser has been written with two specific goals in mind:
036: * web crawling and targeted data extraction from massive web data sets.
037: * To be usable in such environments, a parser must obey a number of
038: * restrictions:
039: * <ul>
040: * <li>it should avoid excessive object creation (which, for instance,
041: * forbids a significant usage of Java strings);
042: * <li>it should tolerate invalid syntax and recover reasonably; in fact,
043: * it should never throw exceptions;
044: * <li>it should perform actual parsing only on a settable feature subset:
045: * there is no reason to parse the attributes of a <samp>P</samp>
046: * element while searching for links;
047: * <li>it should parse HTML as a <em>regular language</em>, and leave context-free
048: * properties (e.g., stack maintenance and repair) to suitably designed callbacks.
049: * </ul>
050: *
051: * <p>Thus, in fact the bullet parser is not a parser. It is a bunch of
052: * spaghetti code that analyses a stream of characters pretending that
 * it is an (X)HTML document. It has a very defensive attitude towards
 * the character stream it is parsing, but at the same time it is
055: * forgiving with all typical (X)HTML mistakes.
056: *
057: * <p>The bullet parser is officially StringFree™.
058: * <a href="http://mg4j.dsi.unimi.it/docs/it/unimi/dsi/mg4j/util/MutableString.html"><code>MutableString</code>s</a>
059: * are used for internal processing, and Java strings are used only to return attribute
060: * values. All internal maps are {@linkplain it.unimi.dsi.fastutil.objects.Reference2ObjectMap reference-based maps}
061: * from <a href="http://fastutil.dsi.unimi.it/"><samp>fastutil</samp></a>, which
062: * helps to accelerate further the parsing process.
063: *
064: * <h2>HTML data</h2>
065: *
066: * <p>The bullet parser uses attributes and methods of {@link it.unimi.dsi.mg4j.util.parser.HTMLFactory},
067: * {@link it.unimi.dsi.mg4j.util.parser.Element}, {@link it.unimi.dsi.mg4j.util.parser.Attribute}
068: * and {@link it.unimi.dsi.mg4j.util.parser.Entity}.
069: * Thus, for instance, whenever an element is to be passed around it is one
070: * of the shared objects contained in {@link it.unimi.dsi.mg4j.util.parser.Element}
071: * (e.g., {@link it.unimi.dsi.mg4j.util.parser.Element#BODY}).
072: *
073: * <h2>Callbacks</h2>
074: *
075: * <p>The result of the parsing process is the invocation of a callback.
076: * The {@linkplain it.unimi.dsi.parser.callback.Callback callback interface}
 * of the bullet parser closely resembles SAX2, but it has some additional
078: * methods targeted at (X)HTML, such as {@link Callback#cdata(it.unimi.dsi.mg4j.util.parser.Element,char[],int,int)},
079: * which returns characters found in a CDATA section (e.g., a stylesheet).
080: *
081: * <p>Each callback must configure the parser, by requesting to perform
082: * the analysis and the callbacks it requires. A callback that wants to
083: * extract and tokenise text, for instance, will certainly require
084: * {@link #parseText(boolean) parseText(true)}, but not {@link #parseTags(boolean) parseTags(true)}.
085: * On the other hand, a callback wishing to extract links will require
086: * to {@linkplain #parseAttribute(Attribute) parse selectively} certain attribute types.
087: *
088: * <p>A more precise description follows.
089: *
090: * <h2>Writing callbacks</h2>
091: *
 * <p>The first important issue is what has to be requested from the parser. A newly
093: * created parser does not invoke any callback. It is up to every callback
094: * to add features so that it can do its job. Remember that since many
095: * callbacks can be {@linkplain it.unimi.dsi.parser.callback.ComposedCallbackBuilder composed},
096: * you must always <em>add</em> features, never <em>remove</em> them, and moreover
097: * your callbacks must be ready to be invoked with features they did not
098: * request (e.g., attribute types added by another callback).
099: *
100: * <p>The following parse features
101: * may be configured; most of them are just boolean features, a.k.a. flags:
 * unless otherwise specified, by default all flags are set to false (e.g., by
 * default the parser will <em>not</em> parse tags):
104: * <ul>
105: * <li><em>tags</em> ({@link #parseTags(boolean)} method): whether tags
106: * should be parsed;
 * <li><em>attributes</em> ({@link #parseAttributes(boolean)} and
 * {@link #parseAttribute(Attribute)} methods):
109: * whether attributes should be parsed (of course, setting this flag is useless
110: * if you are not parsing tags); note that setting this flag will just
111: * activate the attribute parsing feature, but you must also
112: * {@linkplain #parseAttribute(Attribute) register} every attribute
113: * whose value you want to obtain.
 * <li><em>text</em> ({@link #parseText(boolean)} method): whether text
115: * should be parsed; if this flag is set, the parser will call the
116: * {@link it.unimi.dsi.parser.callback.Callback#characters(char[], int, int, boolean)}
117: * method for every text chunk found.
 * <li><em>CDATA sections</em> ({@link #parseCDATA(boolean)} method): whether CDATA
 * sections (stylesheets &amp; scripts)
120: * should be parsed; if this flag is set, the parser will call the
121: * {@link Callback#cdata(Element,char[],int,int)}
122: * method for every CDATA section found.
123: * </ul>
124: *
125: * <h2>Invoking the parser</h2>
126: *
127: * <p>After {@linkplain #setCallback(Callback) setting the parser callback},
128: * you just call {@link #parse(char[], int, int)}.
129: * @deprecated Moved to <code>dsiutils</code>.
130: */
131:
132: @Deprecated
133: public class BulletParser {
134:
135: private static final boolean DEBUG = false;
136:
137: /** Scanning text.. */
138: protected static final int STATE_TEXT = 0;
139: /** Scanning attribute name/value pairs. */
140: protected static final int STATE_BEFORE_START_TAG_NAME = 1;
141: /** Scanning a closing tag. */
142: protected static final int STATE_BEFORE_END_TAG_NAME = 2;
143: /** Scanning attribute name/value pairs. */
144: protected static final int STATE_IN_START_TAG = 3;
145: /** Scanning a closing tag. */
146: protected static final int STATE_IN_END_TAG = 4;
147:
148: /** The maximum Unicode value accepted for a numeric entity. */
149: protected static final int MAX_ENTITY_VALUE = 65535;
150: /** The base for non-decimal entity. */
151: protected static final int HEXADECIMAL = 16;
152: /** The maximum number of digits of a hexadecimal numeric entity. */
153: protected static final int MAX_HEX_ENTITY_LENGTH = 8;
154: /** The maximum number of digits of a decimal numeric entity. */
155: protected static final int MAX_DEC_ENTITY_LENGTH = 9;
156:
157: /** Closing tag for a script element. */
158: protected static final TextPattern SCRIPT_CLOSE_TAG_PATTERN = new TextPattern(
159: "</script>", TextPattern.CASE_INSENSITIVE);
160: /** Closing tag for a style element. */
161: protected static final TextPattern STYLE_CLOSE_TAG_PATTERN = new TextPattern(
162: "</style>", TextPattern.CASE_INSENSITIVE);
163:
164: /** An array containing the non-space whitespace. */
165: protected static final char[] NONSPACE_WHITESPACE = { '\n', '\r',
166: '\t' };
167: /** An array, parallel to {@link #NONSPACE_WHITESPACE}, containing spaces. */
168: protected static final char[] SPACE = { ' ', ' ', ' ' };
169:
170: /** Closed comment. It should be "-->", but mistakes are common. */
171: protected static final TextPattern CLOSED_COMMENT = new TextPattern(
172: "->");
173: /** Closed ASP or similar tag. */
174: protected static final TextPattern CLOSED_PERCENT = new TextPattern(
175: "%>");
176: /** Closed processing instruction. */
177: protected static final TextPattern CLOSED_PIC = new TextPattern(
178: "?>");
179: /** Closed section (conditional, etc.). */
180: protected static final TextPattern CLOSED_SECTION = new TextPattern(
181: "]>");
182: /** Closed section (conditional, CDATA, etc.). */
183: protected static final TextPattern CLOSED_CDATA = new TextPattern(
184: "]]>");
185: /** TODO: what is this?. */
186: //protected static final TextPattern CLOSED_BOH = new TextPattern( "!>" );
187: /** The parsing factory used by this parser. */
188: public final ParsingFactory factory;
189:
190: /** The callback of this parser. */
191: protected Callback callback;
192: /** A map from attributes to attribute values. */
193: protected Reference2ObjectMap<Attribute, MutableString> attrMap;
194: /** Whether we should invoke the text handler. */
195: protected boolean parseText;
196: /** Whether we should invoke the CDATA section handler. */
197: protected boolean parseCDATA;
198: /** Whether we should parse tags. */
199: protected boolean parseTags;
200: /** Whether we should parse attributes. */
201: protected boolean parseAttributes;
202: /**
203: * The subset of attributes whose values will be actually parsed (if, of
204: * course, {@link #parseAttributes}is true).
205: */
206: protected ReferenceArraySet<Attribute> parsedAttrs = new ReferenceArraySet<Attribute>();
207: /**
208: * An externally visible, immutable subset of attributes whose values will
209: * be actually parsed.
210: */
211: public ReferenceSet<Attribute> parsedAttributes = ReferenceSets
212: .unmodifiable(parsedAttrs);
213: /** The character represented by the last scanned entity. */
214: protected char lastEntity;
215:
/**
 * Creates a new bullet parser backed by the given factory.
 *
 * @param factory the parsing factory this parser will use to resolve names.
 */
public BulletParser(final ParsingFactory factory) {
    this.factory = factory;
}
220:
/**
 * Creates a new bullet parser backed by the default factory,
 * {@link HTMLFactory#INSTANCE}.
 */
public BulletParser() {
    this(HTMLFactory.INSTANCE);
}
225:
226: /**
227: * Returns whether this parser will invoke the text handler.
228: *
229: * @return whether this parser will invoke the text handler.
230: * @see #parseText(boolean)
231: */
232: public boolean parseText() {
233: return parseText;
234: }
235:
236: /**
237: * Sets the text handler flag.
238: *
239: * @param parseText
240: * the new value.
241: * @return this parser.
242: */
243: public BulletParser parseText(final boolean parseText) {
244: this .parseText = parseText;
245: return this ;
246: }
247:
248: /**
249: * Returns whether this parser will invoke the CDATA-section handler.
250: *
251: * @return whether this parser will invoke the CDATA-section handler.
252: * @see #parseCDATA(boolean)
253: */
254: public boolean parseCDATA() {
255: return parseCDATA;
256: }
257:
258: /**
259: * Sets the CDATA-section handler flag.
260: *
261: * @param parseCDATA
262: * the new value.
263: * @return this parser.
264: */
265: public BulletParser parseCDATA(final boolean parseCDATA) {
266: this .parseCDATA = parseCDATA;
267: return this ;
268: }
269:
270: /**
271: * Returns whether this parser will parse tags and invoke element handlers.
272: *
273: * @return whether this parser will parse tags and invoke element handlers.
274: * @see #parseTags(boolean)
275: */
276: public boolean parseTags() {
277: return parseTags;
278: }
279:
280: /**
281: * Sets whether this parser will parse tags and invoke element handlers.
282: *
283: * @param parseTags
284: * the new value.
285: * @return this parser.
286: */
287: public BulletParser parseTags(final boolean parseTags) {
288: this .parseTags = parseTags;
289: return this ;
290: }
291:
292: /**
293: * Returns whether this parser will parse attributes.
294: *
295: * @return whether this parser will parse attributes.
296: * @see #parseAttributes(boolean)
297: */
298: public boolean parseAttributes() {
299: return parseAttributes;
300: }
301:
302: /**
303: * Sets the attribute parsing flag.
304: *
305: * @param parseAttributes
306: * the new value for the flag.
307: * @return this parser.
308: */
309: public BulletParser parseAttributes(final boolean parseAttributes) {
310: this .parseAttributes = parseAttributes;
311: return this ;
312: }
313:
314: /**
315: * Adds the given attribute to the set of attributes to be parsed.
316: *
317: * @param attribute
318: * an attribute that should be parsed.
319: * @throws IllegalStateException
320: * if {@link #parseAttributes(boolean) parseAttributes(true)}
321: * has not been invoked on this parser.
322: * @return this parser.
323: */
324: public BulletParser parseAttribute(final Attribute attribute) {
325: parsedAttrs.add(attribute);
326: return this ;
327: }
328:
329: /** Sets the callback for this parser, resetting at the same time all parsing flags.
330: *
331: * @param callback the new callback.
332: * @return this parser.
333: */
334: public BulletParser setCallback(final Callback callback) {
335: this .callback = callback;
336: parseCDATA = parseText = parseAttributes = parseTags = false;
337: parsedAttrs.clear();
338: callback.configure(this );
339: return this ;
340: }
341:
342: /** Returns the character corresponding to a given entity name.
343: *
344: * @param name the name of an entity.
345: * @return the character corresponding to the entity, or an ASCII NUL if no entity with that name was found.
346: */
347: protected char entity2Char(final MutableString name) {
348: final Entity e = factory.getEntity(name);
349: return e == null ? (char) 0 : e.character;
350: }
351:
352: /** Searches for the end of an entity.
353: *
354: * <P>This method will search for the end of an entity starting at the given offset (the offset
355: * must correspond to the ampersand).
356: *
357: * <P>Real-world HTML pages often contain hundreds of misplaced ampersands, due to the
358: * unfortunate idea of using the ampersand as query separator (<em>please</em> use the comma
359: * in new code!). All such ampersand should be specified as <samp>&amp;</samp>.
360: * If named entities are delimited using a transition
361: * from alphabetical to non-alphabetical characters, we can easily get false positives. If the parameter
362: * <code>loose</code> is false, named entities can be delimited only by whitespace or by a comma.
363: *
364: * @param a a character array containing the entity.
365: * @param offset the offset at which the entity starts (the offset must point at the ampersand).
366: * @param length an upper bound to the maximum returned position.
367: * @param loose if true, named entities can be terminated by any non-alphabetical character
368: * (instead of whitespace or comma).
369: * @param entity a support mutable string used to query {@link ParsingFactory#getEntity(MutableString)}.
370: * @return the position of the last character of the entity, or -1 if no entity was found.
371: */
372: protected int scanEntity(final char[] a, final int offset,
373: final int length, final boolean loose,
374: final MutableString entity) {
375:
376: int i, c = 0;
377: String tmpEntity;
378:
379: if (length < 2)
380: return -1;
381:
382: if (a[offset + 1] == '#') {
383: if (length > 2 && a[offset + 2] == 'x') {
384: for (i = 3; i < length
385: && i < MAX_HEX_ENTITY_LENGTH
386: && Character.digit(a[i + offset], HEXADECIMAL) != -1; i++)
387: ;
388: tmpEntity = new String(a, offset + 3, i - 3);
389: if (i != 3)
390: c = Integer.parseInt(tmpEntity, HEXADECIMAL);
391: } else {
392: for (i = 2; i < length && i < MAX_DEC_ENTITY_LENGTH
393: && Character.isDigit(a[i + offset]); i++)
394: ;
395: tmpEntity = new String(a, offset + 2, i - 2);
396: if (i != 2)
397: c = Integer.parseInt(tmpEntity);
398: }
399:
400: if (c > 0 && c < MAX_ENTITY_VALUE) {
401: lastEntity = (char) c;
402: if (i < length && a[i + offset] == ';')
403: i++;
404: return i + offset;
405: }
406: } else {
407: if (Character.isLetter(a[offset + 1])) {
408: for (i = 2; i < length
409: && Character.isLetterOrDigit(a[offset + i]); i++)
410: ;
411: if (i != 1
412: && (loose || (i < length && (Character
413: .isWhitespace(a[offset + i]) || a[offset
414: + i] == ';')))
415: && (lastEntity = entity2Char(entity.length(0)
416: .append(a, offset + 1, i - 1))) != 0) {
417: if (i < length && a[i + offset] == ';')
418: i++;
419: return i + offset;
420: }
421: }
422: }
423:
424: return -1;
425: }
426:
427: /**
428: * Replaces entities with the corresponding characters.
429: *
430: * <P>This method will modify the mutable string <code>s</code> so that all legal occurrences
431: * of entities are replaced by the corresponding character.
432: *
433: * @param s a mutable string whose entities will be replaced by the corresponding characters.
434: * @param entity a support mutable string used by {@link #scanEntity(char[], int, int, boolean, MutableString)}.
435: * @param loose a parameter that will be passed to {@link #scanEntity(char[], int, int, boolean, MutableString)}.
436: */
437: protected void replaceEntities(final MutableString s,
438: final MutableString entity, final boolean loose) {
439:
440: final char[] a = s.array();
441: int length = s.length();
442:
443: /* We examine the string *backwards*, so that i is always a valid index. */
444:
445: int i = length, j;
446: while (i-- > 0)
447: if (a[i] == '&'
448: && (j = scanEntity(a, i, length - i, loose, entity)) != -1)
449: length = s.replace(i, j, lastEntity).length();
450: }
451:
452: /** Handles markup.
453: *
454: * @param text the text.
455: * @param pos the first character in the markup after <samp><!</samp>.
456: * @param end the end of <code>text</code>.
457: * @return the position of the first character after the markup.
458: */
459:
460: protected int handleMarkup(final char[] text, int pos, final int end) {
461: // A markup instruction (doctype, comment, etc.).
462: switch (text[++pos]) {
463: case 'D':
464: case 'd':
465: // DOCTYPE
466: while (pos < end && text[pos++] != '>')
467: ;
468: break;
469:
470: case '-':
471: // comment
472: if ((pos = CLOSED_COMMENT.search(text, pos, end)) == -1)
473: pos = end;
474: else
475: pos += CLOSED_COMMENT.length();
476: break;
477:
478: default:
479: if (pos < end - 6 && text[pos] == '['
480: && text[pos + 1] == 'C' && text[pos + 2] == 'D'
481: && text[pos + 3] == 'A' && text[pos + 4] == 'T'
482: && text[pos + 5] == 'A' && text[pos + 6] == '[') {
483: // CDATA section
484: final int last = CLOSED_CDATA.search(text, pos, end);
485: if (parseCDATA)
486: callback.cdata(null, text, pos + 7,
487: (last == -1 ? end : last) - pos - 7);
488: pos = last == -1 ? end : last + CLOSED_CDATA.length();
489: }
490: // Generic markup
491: else
492: while (pos < end && text[pos++] != '>')
493: ;
494: break;
495: }
496:
497: return pos;
498: }
499:
500: /** Handles processing instruction, ASP tags etc.
501: *
502: * @param text the text.
503: * @param pos the first character in the markup after <samp><%</samp>.
504: * @param end the end of <code>text</code>.
505: * @return the position of the first character after the processing instruction.
506: */
507:
508: protected int handleProcessingInstruction(final char[] text,
509: int pos, final int end) {
510:
511: switch (text[++pos]) {
512: case '%':
513: if ((pos = CLOSED_PERCENT.search(text, pos, end)) == -1)
514: pos = end;
515: else
516: pos += CLOSED_PERCENT.length();
517: break;
518:
519: case '?':
520: if ((pos = CLOSED_PIC.search(text, pos, end)) == -1)
521: pos = end;
522: else
523: pos += CLOSED_PIC.length();
524: break;
525: case '[':
526: if ((pos = CLOSED_SECTION.search(text, pos, end)) == -1)
527: pos = end;
528: else
529: pos += CLOSED_SECTION.length();
530: break;
531: default:
532: // Generic markup
533: while (pos < end && text[pos++] != '>')
534: ;
535: break;
536: }
537: return pos;
538: }
539:
540: /**
541: * Analyze the text document to extract information.
542: *
543: * @param text a <code>char</code> array of text to be parsed.
544: */
545: public void parse(final char[] text) {
546: parse(text, 0, text.length);
547: }
548:
549: /**
550: * Analyze the text document to extract information.
551: *
552: * @param text a <code>char</code> array of text to be parsed.
553: * @param offset the offset in the array from which the parsing will begin.
554: * @param length the number of characters to be parsed.
555: */
556: public void parse(final char[] text, final int offset,
557: final int length) {
558: MutableString tagElemTypeName = new MutableString();
559: MutableString attrName = new MutableString();
560: MutableString attrValue = new MutableString();
561: MutableString entity = new MutableString();
562: MutableString characters = new MutableString();
563:
564: /* During the analysis of attribute we need a separator for values */
565: char delim;
566: /* The current character */
567: char currChar;
568: /* The state of the switch */
569: int state;
570: /* Others integer values used in the parsing process */
571: int start, k;
572: /* This boolean is set true if we have words to handle */
573: boolean flowBroken = false, parseCurrAttr;
574:
575: /* The current element. */
576: Element currentElement;
577: /* The current attribute object */
578: Attribute currAttr = null;
579: attrMap = new Reference2ObjectArrayMap<Attribute, MutableString>(
580: 16);
581:
582: callback.startDocument();
583:
584: tagElemTypeName.length(0);
585: attrName.length(0);
586: attrValue.length(0);
587: entity.length(0);
588:
589: state = STATE_TEXT;
590: currentElement = null;
591: final int end = offset + length;
592: int pos = offset;
593:
594: /* This is the main loop. */
595: while (pos < end) {
596:
597: switch (state) {
598: case STATE_TEXT:
599: currChar = text[pos];
600: if (currChar == '&') {
601:
602: // We handle both the case of an entity, and that of a stray '&'.
603: if ((k = scanEntity(text, pos, end - pos, true,
604: entity)) == -1) {
605: currChar = '&';
606: pos++;
607: } else {
608: currChar = lastEntity;
609: pos = k;
610: if (DEBUG)
611: System.err.println("Entity at: " + pos
612: + " end of entity: " + k
613: + " entity: " + entity + " char: "
614: + currChar);
615: }
616: if (parseText)
617: characters.append(currChar);
618: continue;
619: }
620:
621: // No tags can happen later than end - 2.
622: if (currChar != '<' || pos >= end - 2) {
623: if (parseText)
624: characters.append(currChar);
625: pos++;
626: continue;
627: }
628:
629: switch (text[++pos]) {
630: case '!':
631: pos = handleMarkup(text, pos, end);
632: break;
633:
634: case '%':
635: case '?':
636: pos = handleProcessingInstruction(text, pos, end);
637: break;
638:
639: default:
640: // Actually a tag. Note that we allow for </> and that we skip false positives
641: // due to sloppy HTML writing (e.g., "<-- hello! -->" ).
642: if (Character.isLetter(text[pos]))
643: state = STATE_BEFORE_START_TAG_NAME;
644: else if (text[pos] == '/'
645: && (Character.isLetter(text[pos + 1]) || text[pos + 1] == '>')) {
646: state = STATE_BEFORE_END_TAG_NAME;
647: pos++;
648: } else {
649: // Not really a tag.
650: if (parseText)
651: characters.append('<');
652: continue;
653: }
654: break;
655: }
656: if (parseText && characters.length() != 0) {
657: callback.characters(characters.array(), 0,
658: characters.length(), flowBroken);
659: characters.length(0);
660: }
661:
662: flowBroken = false;
663: break;
664:
665: case STATE_BEFORE_START_TAG_NAME:
666: case STATE_BEFORE_END_TAG_NAME:
667: // Let's get the name.
668: tagElemTypeName.length(0);
669: for (start = pos; pos < end
670: && (Character.isLetterOrDigit(text[pos])
671: || text[pos] == ':' || text[pos] == '_'
672: || text[pos] == '-' || text[pos] == '.'); pos++)
673: ;
674:
675: tagElemTypeName.append(text, start, pos - start);
676: tagElemTypeName.toLowerCase();
677:
678: currentElement = factory.getElement(tagElemTypeName);
679: if (DEBUG)
680: System.err
681: .println((state == STATE_BEFORE_START_TAG_NAME ? "Opening"
682: : "Closing")
683: + " tag for "
684: + tagElemTypeName
685: + " (element: "
686: + currentElement
687: + ")");
688:
689: if (currentElement != null && currentElement.breaksFlow)
690: flowBroken = true;
691: while (pos < end && Character.isWhitespace(text[pos]))
692: pos++;
693: state = state == STATE_BEFORE_START_TAG_NAME ? STATE_IN_START_TAG
694: : STATE_IN_END_TAG;
695: break;
696:
697: case STATE_IN_START_TAG:
698: currChar = text[pos];
699: if (currChar != '>'
700: && (currChar != '/' || pos == end - 1 || text[pos + 1] != '>')) {
701: // We got attributes.
702: if (Character.isLetter(currChar)) {
703: parseCurrAttr = false;
704: attrName.length(0);
705: for (start = pos; pos < end
706: && (Character.isLetter(text[pos]) || text[pos] == '-'); pos++)
707: ;
708: if (currentElement != null && parseAttributes) {
709: attrName.append(text, start, pos - start);
710: attrName.toLowerCase();
711: if (DEBUG)
712: System.err
713: .println("Got attribute named \""
714: + attrName + "\"");
715: currAttr = factory.getAttribute(attrName);
716: parseCurrAttr = parsedAttrs
717: .contains(currAttr);
718: }
719: // Skip whitespace
720: while (pos < end
721: && Character.isWhitespace(text[pos]))
722: pos++;
723: if (pos == end)
724: break;
725: if (text[pos] != '=') {
726: // We found an attribute without explicit value.
727: // TODO: can we avoid another string?
728: if (parseCurrAttr)
729: attrMap
730: .put(currAttr,
731: new MutableString(
732: currAttr.name));
733: break;
734: }
735:
736: pos++;
737: while (pos < end
738: && Character.isWhitespace(text[pos]))
739: pos++;
740: if (pos == end)
741: break;
742:
743: attrValue.length(0);
744: if (pos < end
745: && ((delim = text[pos]) == '"' || (delim = text[pos]) == '\'')) {
746: // An attribute value with delimiters.
747: for (start = ++pos; pos < end
748: && text[pos] != delim; pos++)
749: ;
750: if (parseCurrAttr)
751: attrValue.append(text, start,
752: pos - start).replace(
753: NONSPACE_WHITESPACE, SPACE);
754: if (pos < end)
755: pos++;
756: } else {
757: // An attribute value without delimiters. Due to very common errors, we
758: // gather characters up to the first occurrence of whitespace or '>'.
759: for (start = pos; pos < end
760: && !Character
761: .isWhitespace(text[pos])
762: && text[pos] != '>'; pos++)
763: ;
764: if (parseCurrAttr)
765: attrValue.append(text, start, pos
766: - start);
767: }
768:
769: if (parseCurrAttr) {
770: replaceEntities(attrValue, entity, false);
771: attrMap.put(currAttr, attrValue.copy());
772: if (DEBUG)
773: System.err
774: .println("Attribute value: \""
775: + attrValue + "\"");
776: }
777: // Skip whitespace
778: while (pos < end
779: && Character.isWhitespace(text[pos]))
780: pos++;
781: } else {
782: // It's a mess. Our only reasonable chance is to try to resync on the first
783: // whitespace, or alternatively to get to the end of the tag.
784: do
785: pos++;
786: while (pos < end && text[pos] != '>'
787: && !Character.isWhitespace(text[pos]));
788: // Skip whitespace
789: while (pos < end
790: && Character.isWhitespace(text[pos]))
791: pos++;
792: continue;
793: }
794: } else {
795: if (parseTags
796: && !callback.startElement(currentElement,
797: attrMap))
798: break;
799: if (attrMap != null)
800: attrMap.clear();
801:
802: if (currentElement == Element.SCRIPT
803: || currentElement == Element.STYLE) {
804: final TextPattern pattern = currentElement == Element.SCRIPT ? SCRIPT_CLOSE_TAG_PATTERN
805: : STYLE_CLOSE_TAG_PATTERN;
806: start = pos + 1;
807: pos = pattern.search(text, start, end);
808: if (pos == -1)
809: pos = end;
810: if (parseText)
811: callback.cdata(currentElement, text, start,
812: pos - start);
813: if (pos < end) {
814: if (parseTags)
815: callback.endElement(currentElement);
816: pos += pattern.length();
817: }
818: } else
819: pos += currChar == '/' ? 2 : 1;
820: state = STATE_TEXT;
821: }
822: break;
823:
824: case STATE_IN_END_TAG:
825: while (pos < end && text[pos] != '>')
826: pos++;
827: if (parseTags && currentElement != null
828: && !callback.endElement(currentElement))
829: break;
830: state = STATE_TEXT;
831: pos++;
832: break;
833:
834: default:
835: }
836:
837: }
838:
839: // We do what we can to invoke tag handlers in case of a truncated text.
840: if (state == STATE_IN_START_TAG && parseTags
841: && currentElement != null)
842: callback.startElement(currentElement, attrMap);
843: if (state == STATE_IN_END_TAG && parseTags
844: && currentElement != null)
845: callback.endElement(currentElement);
846:
847: if (state == STATE_TEXT && parseText && characters.length() > 0)
848: callback.characters(characters.array(), 0, characters
849: .length(), flowBroken);
850:
851: callback.endDocument();
852: }
853: }
|