001: /* Generated By:JavaCC: Do not edit this line. HTMLParser.java */
002: package org.apache.lucene.demo.html;
003:
004: import java.io.*;
005: import java.util.Properties;
006:
007: public class HTMLParser implements HTMLParserConstants {
/**
 * Number of summary characters getSummary() waits for and truncates to;
 * also used to size the title and summary buffers below.
 */
public static int SUMMARY_LENGTH = 200;

// Text appended by addText()/addSpace() while inTitle is set.
StringBuffer title = new StringBuffer(SUMMARY_LENGTH);
// Text appended outside the title, via addToSummary().
StringBuffer summary = new StringBuffer(SUMMARY_LENGTH * 2);
// Meta pairs stored by addMetaTag(): lower-cased name / http-equiv value -> lower-cased content.
Properties metaTags = new Properties();
// Pending halves of a meta pair; addMetaTag() stores them and resets both to null.
String currentMetaTag = null;
String currentMetaContent = null;
// Running count of characters written to pipeOut.
int length = 0;
// Set under this object's monitor to release threads polling in getTitle()/getMetaTags().
boolean titleComplete = false;
// Flags recomputed by Tag() from the name of the most recently parsed tag.
boolean inTitle = false;
boolean inMetaTag = false;
boolean inStyle = false;
// True when the last document-level item was a tag, declaration, comment or script;
// addSpace() then writes eol instead of a single blank.
boolean afterTag = false;
// True right after addSpace() wrote whitespace; suppresses runs of consecutive spaces.
boolean afterSpace = false;
String eol = System.getProperty("line.separator");
// Reader handed out by getReader(); stays null until the pipe has been set up.
Reader pipeIn = null;
// Writer that addText()/addSpace() send the extracted text to.
Writer pipeOut;
// Byte-level ends of the pipe that pipeIn/pipeOut wrap (created in getReader()).
private MyPipedInputStream pipeInStream = null;
private PipedOutputStream pipeOutStream = null;
027:
028: private class MyPipedInputStream extends PipedInputStream {
029:
030: public MyPipedInputStream() {
031: super ();
032: }
033:
034: public MyPipedInputStream(PipedOutputStream src)
035: throws IOException {
036: super (src);
037: }
038:
039: public boolean full() throws IOException {
040: return this .available() >= PipedInputStream.PIPE_SIZE;
041: }
042: }
043:
/**
 * Creates a parser for the given file by opening a FileInputStream on it
 * and delegating to the InputStream constructor.
 *
 * @param file the HTML file to parse
 * @throws FileNotFoundException if the file cannot be opened for reading
 * @deprecated Use {@link #HTMLParser(java.io.InputStream)} with a
 *             FileInputStream instead
 */
public HTMLParser(File file) throws FileNotFoundException {
    this (new FileInputStream(file));
}
050:
051: public String getTitle() throws IOException, InterruptedException {
052: if (pipeIn == null)
053: getReader(); // spawn parsing thread
054: while (true) {
055: synchronized (this ) {
056: if (titleComplete || pipeInStream.full())
057: break;
058: wait(10);
059: }
060: }
061: return title.toString().trim();
062: }
063:
064: public Properties getMetaTags() throws IOException,
065: InterruptedException {
066: if (pipeIn == null)
067: getReader(); // spawn parsing thread
068: while (true) {
069: synchronized (this ) {
070: if (titleComplete || pipeInStream.full())
071: break;
072: wait(10);
073: }
074: }
075: return metaTags;
076: }
077:
078: public String getSummary() throws IOException, InterruptedException {
079: if (pipeIn == null)
080: getReader(); // spawn parsing thread
081: while (true) {
082: synchronized (this ) {
083: if (summary.length() >= SUMMARY_LENGTH
084: || pipeInStream.full())
085: break;
086: wait(10);
087: }
088: }
089: if (summary.length() > SUMMARY_LENGTH)
090: summary.setLength(SUMMARY_LENGTH);
091:
092: String sum = summary.toString().trim();
093: String tit = getTitle();
094: if (sum.startsWith(tit) || sum.equals(""))
095: return tit;
096: else
097: return sum;
098: }
099:
100: public Reader getReader() throws IOException {
101: if (pipeIn == null) {
102: pipeInStream = new MyPipedInputStream();
103: pipeOutStream = new PipedOutputStream(pipeInStream);
104: pipeIn = new InputStreamReader(pipeInStream, "UTF-16BE");
105: pipeOut = new OutputStreamWriter(pipeOutStream, "UTF-16BE");
106:
107: Thread thread = new ParserThread(this );
108: thread.start(); // start parsing
109: }
110:
111: return pipeIn;
112: }
113:
114: void addToSummary(String text) {
115: if (summary.length() < SUMMARY_LENGTH) {
116: summary.append(text);
117: if (summary.length() >= SUMMARY_LENGTH) {
118: synchronized (this ) {
119: notifyAll();
120: }
121: }
122: }
123: }
124:
125: void addText(String text) throws IOException {
126: if (inStyle)
127: return;
128: if (inTitle)
129: title.append(text);
130: else {
131: addToSummary(text);
132: if (!titleComplete && !title.equals("")) { // finished title
133: synchronized (this ) {
134: titleComplete = true; // tell waiting threads
135: notifyAll();
136: }
137: }
138: }
139:
140: length += text.length();
141: pipeOut.write(text);
142:
143: afterSpace = false;
144: }
145:
146: void addMetaTag() {
147: metaTags.setProperty(currentMetaTag, currentMetaContent);
148: currentMetaTag = null;
149: currentMetaContent = null;
150: return;
151: }
152:
153: void addSpace() throws IOException {
154: if (!afterSpace) {
155: if (inTitle)
156: title.append(" ");
157: else
158: addToSummary(" ");
159:
160: String space = afterTag ? eol : " ";
161: length += space.length();
162: pipeOut.write(space);
163: afterSpace = true;
164: }
165: }
166:
/**
 * Top-level production: consumes document-level items until the next token
 * is none of the kinds listed in the first switch, then requires token
 * kind 0 (presumably end of input - confirm in HTMLParserConstants).
 *
 * Tags, declarations, comments and scripts are delegated to their own
 * productions and set afterTag; words, entities, punctuation and spaces are
 * forwarded to addText()/addSpace() and clear afterTag.
 *
 * @throws ParseException if the token stream does not match the grammar
 * @throws IOException if writing extracted text to the pipe fails
 */
final public void HTMLDocument() throws ParseException, IOException {
    Token t;
    label_1: while (true) {
        // Peek at the kind of the next token (jj_ntk caches it; -1 means "not fetched yet").
        switch ((jj_ntk == -1) ? jj_ntk() : jj_ntk) {
        case ScriptStart:
        case TagName:
        case DeclName:
        case Comment1:
        case Comment2:
        case Word:
        case Entity:
        case Space:
        case Punct:
            ;
            break;
        default:
            // Remember the generation at which this choice point was left;
            // generateParseException() uses it to list expected tokens.
            jj_la1[0] = jj_gen;
            break label_1;
        }
        // Dispatch on the same peeked token kind.
        switch ((jj_ntk == -1) ? jj_ntk() : jj_ntk) {
        case TagName:
            Tag();
            afterTag = true;
            break;
        case DeclName:
            t = Decl();
            afterTag = true;
            break;
        case Comment1:
        case Comment2:
            CommentTag();
            afterTag = true;
            break;
        case ScriptStart:
            ScriptTag();
            afterTag = true;
            break;
        case Word:
            t = jj_consume_token(Word);
            addText(t.image);
            afterTag = false;
            break;
        case Entity:
            t = jj_consume_token(Entity);
            // Entities are decoded before being added as text.
            addText(Entities.decode(t.image));
            afterTag = false;
            break;
        case Punct:
            t = jj_consume_token(Punct);
            addText(t.image);
            afterTag = false;
            break;
        case Space:
            jj_consume_token(Space);
            addSpace();
            afterTag = false;
            break;
        default:
            jj_la1[1] = jj_gen;
            // No token has kind -1, so this call itself raises the ParseException.
            jj_consume_token(-1);
            throw new ParseException();
        }
    }
    jj_consume_token(0);
}
232:
233: final public void Tag() throws ParseException, IOException {
234: Token t1, t2;
235: boolean inImg = false;
236: t1 = jj_consume_token(TagName);
237: String tagName = t1.image.toLowerCase();
238: if (Tags.WS_ELEMS.contains(tagName)) {
239: addSpace();
240: }
241: inTitle = tagName.equalsIgnoreCase("<title"); // keep track if in <TITLE>
242: inMetaTag = tagName.equalsIgnoreCase("<META"); // keep track if in <META>
243: inStyle = tagName.equalsIgnoreCase("<STYLE"); // keep track if in <STYLE>
244: inImg = tagName.equalsIgnoreCase("<img"); // keep track if in <IMG>
245:
246: label_2: while (true) {
247: switch ((jj_ntk == -1) ? jj_ntk() : jj_ntk) {
248: case ArgName:
249: ;
250: break;
251: default:
252: jj_la1[2] = jj_gen;
253: break label_2;
254: }
255: t1 = jj_consume_token(ArgName);
256: switch ((jj_ntk == -1) ? jj_ntk() : jj_ntk) {
257: case ArgEquals:
258: jj_consume_token(ArgEquals);
259: switch ((jj_ntk == -1) ? jj_ntk() : jj_ntk) {
260: case ArgValue:
261: case ArgQuote1:
262: case ArgQuote2:
263: t2 = ArgValue();
264: if (inImg && t1.image.equalsIgnoreCase("alt")
265: && t2 != null)
266: addText("[" + t2.image + "]");
267:
268: if (inMetaTag
269: && (t1.image.equalsIgnoreCase("name") || t1.image
270: .equalsIgnoreCase("HTTP-EQUIV"))
271: && t2 != null) {
272: currentMetaTag = t2.image.toLowerCase();
273: if (currentMetaTag != null
274: && currentMetaContent != null) {
275: addMetaTag();
276: }
277: }
278: if (inMetaTag
279: && t1.image.equalsIgnoreCase("content")
280: && t2 != null) {
281: currentMetaContent = t2.image.toLowerCase();
282: if (currentMetaTag != null
283: && currentMetaContent != null) {
284: addMetaTag();
285: }
286: }
287: break;
288: default:
289: jj_la1[3] = jj_gen;
290: ;
291: }
292: break;
293: default:
294: jj_la1[4] = jj_gen;
295: ;
296: }
297: }
298: jj_consume_token(TagEnd);
299: }
300:
/**
 * Production for an attribute value in one of three forms: a bare ArgValue
 * token, a value quoted with the first quote style (ArgQuote1 ...
 * CloseQuote1), or a value quoted with the second quote style (ArgQuote2
 * ... CloseQuote2).
 *
 * @return the token holding the value text, or null when the value is an
 *         empty quoted string (opening quote directly followed by the
 *         closing quote)
 * @throws ParseException if the next tokens form none of these shapes
 */
final public Token ArgValue() throws ParseException {
    Token t = null;
    switch ((jj_ntk == -1) ? jj_ntk() : jj_ntk) {
    case ArgValue:
        t = jj_consume_token(ArgValue);
        {
            if (true)
                return t;
        }
        break;
    default:
        jj_la1[5] = jj_gen;
        // Two-token lookahead: opening quote immediately followed by its closing quote.
        if (jj_2_1(2)) {
            jj_consume_token(ArgQuote1);
            jj_consume_token(CloseQuote1);
            {
                if (true)
                    return t; // still null: empty value
            }
        } else {
            switch ((jj_ntk == -1) ? jj_ntk() : jj_ntk) {
            case ArgQuote1:
                jj_consume_token(ArgQuote1);
                t = jj_consume_token(Quote1Text);
                jj_consume_token(CloseQuote1);
                {
                    if (true)
                        return t;
                }
                break;
            default:
                jj_la1[6] = jj_gen;
                // Same empty-value lookahead for the second quote style.
                if (jj_2_2(2)) {
                    jj_consume_token(ArgQuote2);
                    jj_consume_token(CloseQuote2);
                    {
                        if (true)
                            return t; // still null: empty value
                    }
                } else {
                    switch ((jj_ntk == -1) ? jj_ntk() : jj_ntk) {
                    case ArgQuote2:
                        jj_consume_token(ArgQuote2);
                        t = jj_consume_token(Quote2Text);
                        jj_consume_token(CloseQuote2);
                        {
                            if (true)
                                return t;
                        }
                        break;
                    default:
                        jj_la1[7] = jj_gen;
                        // No token has kind -1, so this call itself raises the ParseException.
                        jj_consume_token(-1);
                        throw new ParseException();
                    }
                }
            }
        }
    }
    // Every path above returns or throws; this only satisfies the compiler.
    throw new Error("Missing return statement in function");
}
362:
/**
 * Production for a declaration: a DeclName token, then any sequence of
 * ArgName tokens, attribute values and ArgEquals tokens, closed by TagEnd.
 * The contents are consumed and discarded.
 *
 * @return the DeclName token that opened the declaration
 * @throws ParseException if the token stream does not match the grammar
 */
final public Token Decl() throws ParseException {
    Token t;
    t = jj_consume_token(DeclName);
    label_3: while (true) {
        // Continue while the next token can be part of the declaration body.
        switch ((jj_ntk == -1) ? jj_ntk() : jj_ntk) {
        case ArgName:
        case ArgEquals:
        case ArgValue:
        case ArgQuote1:
        case ArgQuote2:
            ;
            break;
        default:
            jj_la1[8] = jj_gen;
            break label_3;
        }
        switch ((jj_ntk == -1) ? jj_ntk() : jj_ntk) {
        case ArgName:
            jj_consume_token(ArgName);
            break;
        case ArgValue:
        case ArgQuote1:
        case ArgQuote2:
            ArgValue();
            break;
        case ArgEquals:
            jj_consume_token(ArgEquals);
            break;
        default:
            jj_la1[9] = jj_gen;
            // No token has kind -1, so this call itself raises the ParseException.
            jj_consume_token(-1);
            throw new ParseException();
        }
    }
    jj_consume_token(TagEnd);
    {
        if (true)
            return t;
    }
    // Never reached; present only to satisfy the compiler.
    throw new Error("Missing return statement in function");
}
404:
/**
 * Production for a comment in either of two forms: Comment1, any number of
 * CommentText1 tokens, CommentEnd1; or Comment2, any number of
 * CommentText2 tokens, CommentEnd2. The comment text is discarded.
 *
 * @throws ParseException if the token stream does not match either form
 */
final public void CommentTag() throws ParseException {
    switch ((jj_ntk == -1) ? jj_ntk() : jj_ntk) {
    case Comment1:
        jj_consume_token(Comment1);
        // Skip the body of the first comment form.
        label_4: while (true) {
            switch ((jj_ntk == -1) ? jj_ntk() : jj_ntk) {
            case CommentText1:
                ;
                break;
            default:
                jj_la1[10] = jj_gen;
                break label_4;
            }
            jj_consume_token(CommentText1);
        }
        jj_consume_token(CommentEnd1);
        break;
    case Comment2:
        jj_consume_token(Comment2);
        // Skip the body of the second comment form.
        label_5: while (true) {
            switch ((jj_ntk == -1) ? jj_ntk() : jj_ntk) {
            case CommentText2:
                ;
                break;
            default:
                jj_la1[11] = jj_gen;
                break label_5;
            }
            jj_consume_token(CommentText2);
        }
        jj_consume_token(CommentEnd2);
        break;
    default:
        jj_la1[12] = jj_gen;
        // No token has kind -1, so this call itself raises the ParseException.
        jj_consume_token(-1);
        throw new ParseException();
    }
}
443:
/**
 * Production for a script section: ScriptStart, any number of ScriptText
 * tokens, ScriptEnd. The script text is discarded.
 *
 * @throws ParseException if the token stream does not match the grammar
 */
final public void ScriptTag() throws ParseException {
    jj_consume_token(ScriptStart);
    label_6: while (true) {
        // Skip script content until the next token is no longer ScriptText.
        switch ((jj_ntk == -1) ? jj_ntk() : jj_ntk) {
        case ScriptText:
            ;
            break;
        default:
            jj_la1[13] = jj_gen;
            break label_6;
        }
        jj_consume_token(ScriptText);
    }
    jj_consume_token(ScriptEnd);
}
459:
460: final private boolean jj_2_1(int xla) {
461: jj_la = xla;
462: jj_lastpos = jj_scanpos = token;
463: try {
464: return !jj_3_1();
465: } catch (LookaheadSuccess ls) {
466: return true;
467: } finally {
468: jj_save(0, xla);
469: }
470: }
471:
472: final private boolean jj_2_2(int xla) {
473: jj_la = xla;
474: jj_lastpos = jj_scanpos = token;
475: try {
476: return !jj_3_2();
477: } catch (LookaheadSuccess ls) {
478: return true;
479: } finally {
480: jj_save(1, xla);
481: }
482: }
483:
484: final private boolean jj_3_1() {
485: if (jj_scan_token(ArgQuote1))
486: return true;
487: if (jj_scan_token(CloseQuote1))
488: return true;
489: return false;
490: }
491:
492: final private boolean jj_3_2() {
493: if (jj_scan_token(ArgQuote2))
494: return true;
495: if (jj_scan_token(CloseQuote2))
496: return true;
497: return false;
498: }
499:
// Source of tokens for this parser.
public HTMLParserTokenManager token_source;
// Character stream feeding token_source (left unset by the token-manager constructor).
SimpleCharStream jj_input_stream;
// token: most recently consumed token; jj_nt: scratch reference used by jj_ntk().
public Token token, jj_nt;
// Cached kind of the next token, or -1 when it has not been looked up yet.
private int jj_ntk;
// Current scan position and furthest scanned token during lookahead.
private Token jj_scanpos, jj_lastpos;
// Remaining lookahead budget, counted down by jj_scan_token().
private int jj_la;
// When true, getToken() counts from the scan position instead of the current token.
public boolean lookingAhead = false;
private boolean jj_semLA;
// Generation counter, incremented on every consumed token.
private int jj_gen;
// Per choice point: value of jj_gen when that choice last fell through to its default branch.
final private int[] jj_la1 = new int[14];
// Per choice point: bit mask of the token kinds that choice accepts (filled by jj_la1_0()).
static private int[] jj_la1_0;
static {
    jj_la1_0();
}
514:
515: private static void jj_la1_0() {
516: jj_la1_0 = new int[] { 0x2c7e, 0x2c7e, 0x10000, 0x380000,
517: 0x20000, 0x80000, 0x100000, 0x200000, 0x3b0000,
518: 0x3b0000, 0x8000000, 0x20000000, 0x30, 0x4000, };
519: }
520:
// One chain of saved lookahead records per lookahead routine (jj_2_1, jj_2_2).
final private JJCalls[] jj_2_rtns = new JJCalls[2];
// True only while jj_rescan_token() replays lookaheads for error reporting.
private boolean jj_rescan = false;
// Counts consumed tokens between sweeps of stale lookahead records in jj_consume_token().
private int jj_gc = 0;
524:
/**
 * Creates a parser that reads HTML from a byte stream.
 *
 * @param stream the input to parse
 */
public HTMLParser(java.io.InputStream stream) {
    jj_input_stream = new SimpleCharStream(stream, 1, 1);
    token_source = new HTMLParserTokenManager(jj_input_stream);
    token = new Token();
    jj_ntk = -1;
    jj_gen = 0;
    // Mark every choice point as "not visited".
    for (int slot = 0; slot < jj_la1.length; slot++) {
        jj_la1[slot] = -1;
    }
    // Fresh head record for each lookahead routine.
    for (int routine = 0; routine < jj_2_rtns.length; routine++) {
        jj_2_rtns[routine] = new JJCalls();
    }
}
536:
537: public void ReInit(java.io.InputStream stream) {
538: jj_input_stream.ReInit(stream, 1, 1);
539: token_source.ReInit(jj_input_stream);
540: token = new Token();
541: jj_ntk = -1;
542: jj_gen = 0;
543: for (int i = 0; i < 14; i++)
544: jj_la1[i] = -1;
545: for (int i = 0; i < jj_2_rtns.length; i++)
546: jj_2_rtns[i] = new JJCalls();
547: }
548:
/**
 * Creates a parser that reads HTML from a character stream.
 *
 * @param stream the input to parse
 */
public HTMLParser(java.io.Reader stream) {
    jj_input_stream = new SimpleCharStream(stream, 1, 1);
    token_source = new HTMLParserTokenManager(jj_input_stream);
    token = new Token();
    jj_ntk = -1;
    jj_gen = 0;
    // Mark every choice point as "not visited".
    for (int slot = 0; slot < jj_la1.length; slot++) {
        jj_la1[slot] = -1;
    }
    // Fresh head record for each lookahead routine.
    for (int routine = 0; routine < jj_2_rtns.length; routine++) {
        jj_2_rtns[routine] = new JJCalls();
    }
}
560:
561: public void ReInit(java.io.Reader stream) {
562: jj_input_stream.ReInit(stream, 1, 1);
563: token_source.ReInit(jj_input_stream);
564: token = new Token();
565: jj_ntk = -1;
566: jj_gen = 0;
567: for (int i = 0; i < 14; i++)
568: jj_la1[i] = -1;
569: for (int i = 0; i < jj_2_rtns.length; i++)
570: jj_2_rtns[i] = new JJCalls();
571: }
572:
/**
 * Creates a parser on top of an existing token manager. jj_input_stream is
 * not assigned by this constructor.
 *
 * @param tm the token manager to pull tokens from
 */
public HTMLParser(HTMLParserTokenManager tm) {
    token_source = tm;
    token = new Token();
    jj_ntk = -1;
    jj_gen = 0;
    // Mark every choice point as "not visited".
    for (int slot = 0; slot < jj_la1.length; slot++) {
        jj_la1[slot] = -1;
    }
    // Fresh head record for each lookahead routine.
    for (int routine = 0; routine < jj_2_rtns.length; routine++) {
        jj_2_rtns[routine] = new JJCalls();
    }
}
583:
584: public void ReInit(HTMLParserTokenManager tm) {
585: token_source = tm;
586: token = new Token();
587: jj_ntk = -1;
588: jj_gen = 0;
589: for (int i = 0; i < 14; i++)
590: jj_la1[i] = -1;
591: for (int i = 0; i < jj_2_rtns.length; i++)
592: jj_2_rtns[i] = new JJCalls();
593: }
594:
/**
 * Advances to the next token and requires it to have the given kind.
 *
 * On a match the generation counter is incremented and the token is
 * returned. On a mismatch the current token is restored, the expected kind
 * is remembered in jj_kind and a ParseException built by
 * generateParseException() is thrown.
 *
 * @param kind the token kind that must come next
 * @return the consumed token
 * @throws ParseException if the next token has a different kind
 */
final private Token jj_consume_token(int kind)
        throws ParseException {
    Token oldToken;
    // Reuse a token already linked in by lookahead, otherwise fetch a new one.
    if ((oldToken = token).next != null)
        token = token.next;
    else
        token = token.next = token_source.getNextToken();
    jj_ntk = -1; // cached next-token kind is no longer valid
    if (token.kind == kind) {
        jj_gen++;
        // Once more than 100 tokens have been consumed since the last sweep,
        // drop the token references held by lookahead records older than
        // the current generation.
        if (++jj_gc > 100) {
            jj_gc = 0;
            for (int i = 0; i < jj_2_rtns.length; i++) {
                JJCalls c = jj_2_rtns[i];
                while (c != null) {
                    if (c.gen < jj_gen)
                        c.first = null;
                    c = c.next;
                }
            }
        }
        return token;
    }
    // Mismatch: step back so the error is reported relative to the last good token.
    token = oldToken;
    jj_kind = kind;
    throw generateParseException();
}
622:
/**
 * Thrown by jj_scan_token() to stop a lookahead as soon as its token
 * budget is used up with everything matched so far; caught in
 * jj_2_1()/jj_2_2().
 */
static private final class LookaheadSuccess extends java.lang.Error {
}

// Single instance created up front; jj_scan_token() throws this same object every time.
final private LookaheadSuccess jj_ls = new LookaheadSuccess();
627:
/**
 * Lookahead step: moves the scan position to the next token (fetching one
 * from the token source if none is linked yet) and compares its kind with
 * the expected kind.
 *
 * @param kind the token kind expected at the new scan position
 * @return true when the scanned token does NOT have the expected kind,
 *         false when it matches
 * @throws LookaheadSuccess (the shared jj_ls instance) when the token
 *         matches and the lookahead budget jj_la has reached zero
 */
final private boolean jj_scan_token(int kind) {
    if (jj_scanpos == jj_lastpos) {
        // Scanning past the furthest point reached so far: spend one unit of budget.
        jj_la--;
        if (jj_scanpos.next == null) {
            jj_lastpos = jj_scanpos = jj_scanpos.next = token_source
                    .getNextToken();
        } else {
            jj_lastpos = jj_scanpos = jj_scanpos.next;
        }
    } else {
        jj_scanpos = jj_scanpos.next;
    }
    if (jj_rescan) {
        // Error-reporting replay: count how far the scan position is from
        // the current token and record the expected kind at that distance.
        int i = 0;
        Token tok = token;
        while (tok != null && tok != jj_scanpos) {
            i++;
            tok = tok.next;
        }
        if (tok != null)
            jj_add_error_token(kind, i);
    }
    if (jj_scanpos.kind != kind)
        return true;
    if (jj_la == 0 && jj_scanpos == jj_lastpos)
        throw jj_ls;
    return false;
}
656:
657: final public Token getNextToken() {
658: if (token.next != null)
659: token = token.next;
660: else
661: token = token.next = token_source.getNextToken();
662: jj_ntk = -1;
663: jj_gen++;
664: return token;
665: }
666:
667: final public Token getToken(int index) {
668: Token t = lookingAhead ? jj_scanpos : token;
669: for (int i = 0; i < index; i++) {
670: if (t.next != null)
671: t = t.next;
672: else
673: t = t.next = token_source.getNextToken();
674: }
675: return t;
676: }
677:
678: final private int jj_ntk() {
679: if ((jj_nt = token.next) == null)
680: return (jj_ntk = (token.next = token_source.getNextToken()).kind);
681: else
682: return (jj_ntk = jj_nt.kind);
683: }
684:
// Expected-token sequences (each an int[]) gathered while building a ParseException.
private java.util.Vector jj_expentries = new java.util.Vector();
// Sequence currently being assembled or compared.
private int[] jj_expentry;
// Kind passed to the failing jj_consume_token() call, or -1 when none is pending.
private int jj_kind = -1;
// Token kinds seen along the lookahead path being replayed (at most 100 positions).
private int[] jj_lasttokens = new int[100];
// Number of valid entries in jj_lasttokens.
private int jj_endpos;
690:
/**
 * Records, during a lookahead replay, that a token of the given kind was
 * expected at the given distance from the current token.
 *
 * Positions of 100 or more are ignored. When pos extends the current path
 * by exactly one, the kind is appended to jj_lasttokens. Otherwise, if a
 * path has been collected, it is copied and added to jj_expentries unless
 * an identical sequence is already there; then, for a non-zero pos, the
 * path is cut to pos entries with kind as its last element.
 *
 * @param kind the expected token kind
 * @param pos distance from the current token
 */
private void jj_add_error_token(int kind, int pos) {
    if (pos >= 100)
        return;
    if (pos == jj_endpos + 1) {
        jj_lasttokens[jj_endpos++] = kind;
    } else if (jj_endpos != 0) {
        // Snapshot the path collected so far.
        jj_expentry = new int[jj_endpos];
        System.arraycopy(jj_lasttokens, 0, jj_expentry, 0,
                jj_endpos);
        // Look for an identical sequence that is already recorded.
        boolean exists = false;
        for (java.util.Enumeration e = jj_expentries.elements(); e
                .hasMoreElements();) {
            int[] oldentry = (int[]) (e.nextElement());
            if (oldentry.length == jj_expentry.length) {
                exists = true;
                for (int i = 0; i < jj_expentry.length; i++) {
                    if (oldentry[i] != jj_expentry[i]) {
                        exists = false;
                        break;
                    }
                }
                if (exists)
                    break;
            }
        }
        if (!exists)
            jj_expentries.addElement(jj_expentry);
        // Restart the path at the new position (pos == 0 only flushes).
        if (pos != 0)
            jj_lasttokens[(jj_endpos = pos) - 1] = kind;
    }
}
722:
/**
 * Builds a ParseException describing the current token and the token
 * sequences that would have been accepted at this point.
 *
 * Expected kinds come from three sources: the kind passed to the failing
 * jj_consume_token() call (jj_kind), the masks in jj_la1_0 of every choice
 * point that was left in the current generation, and the lookahead paths
 * replayed by jj_rescan_token().
 *
 * @return the exception, ready to be thrown by the caller
 */
public ParseException generateParseException() {
    jj_expentries.removeAllElements();
    boolean[] la1tokens = new boolean[31];
    for (int i = 0; i < 31; i++) {
        la1tokens[i] = false;
    }
    if (jj_kind >= 0) {
        la1tokens[jj_kind] = true;
        jj_kind = -1;
    }
    for (int i = 0; i < 14; i++) {
        // Only choice points that failed at the current generation contribute.
        if (jj_la1[i] == jj_gen) {
            // NOTE(review): this tests 32 bits while la1tokens has 31 slots;
            // it is safe only because none of the masks in jj_la1_0 has
            // bit 31 set.
            for (int j = 0; j < 32; j++) {
                if ((jj_la1_0[i] & (1 << j)) != 0) {
                    la1tokens[j] = true;
                }
            }
        }
    }
    // Each single expected kind becomes a one-element sequence.
    for (int i = 0; i < 31; i++) {
        if (la1tokens[i]) {
            jj_expentry = new int[1];
            jj_expentry[0] = i;
            jj_expentries.addElement(jj_expentry);
        }
    }
    jj_endpos = 0;
    jj_rescan_token();
    // Flush the last path collected during the replay.
    jj_add_error_token(0, 0);
    int[][] exptokseq = new int[jj_expentries.size()][];
    for (int i = 0; i < jj_expentries.size(); i++) {
        exptokseq[i] = (int[]) jj_expentries.elementAt(i);
    }
    return new ParseException(token, exptokseq, tokenImage);
}
758:
/** Does nothing; the method body is empty in this parser. */
final public void enable_tracing() {
}
761:
/** Does nothing; the method body is empty in this parser. */
final public void disable_tracing() {
}
764:
/**
 * Replays the saved lookahead attempts with jj_rescan set, so that
 * jj_scan_token() reports every expected token through
 * jj_add_error_token(). Only records whose gen is greater than the current
 * generation are replayed.
 */
final private void jj_rescan_token() {
    jj_rescan = true;
    for (int i = 0; i < 2; i++) {
        JJCalls p = jj_2_rtns[i];
        do {
            if (p.gen > jj_gen) {
                // Restore the lookahead budget and start position of the saved attempt.
                jj_la = p.arg;
                jj_lastpos = jj_scanpos = p.first;
                switch (i) {
                case 0:
                    jj_3_1();
                    break;
                case 1:
                    jj_3_2();
                    break;
                }
            }
            p = p.next;
        } while (p != null);
    }
    jj_rescan = false;
}
787:
/**
 * Records a finished lookahead attempt for later replay by
 * jj_rescan_token().
 *
 * Walks the record chain of the given lookahead routine to the first
 * record whose gen is not greater than the current generation, appending a
 * new record when all existing ones are newer, and overwrites it.
 *
 * @param index which lookahead routine the attempt belongs to (0 or 1)
 * @param xla the lookahead limit the attempt was started with
 */
final private void jj_save(int index, int xla) {
    JJCalls p = jj_2_rtns[index];
    while (p.gen > jj_gen) {
        if (p.next == null) {
            p = p.next = new JJCalls();
            break;
        }
        p = p.next;
    }
    // xla - jj_la is the number of tokens the attempt actually scanned.
    p.gen = jj_gen + xla - jj_la;
    p.first = token;
    p.arg = xla;
}
801:
/** One saved lookahead attempt; records for the same routine form a linked list. */
static final class JJCalls {
    // Generation value stored by jj_save(): jj_gen plus the number of tokens scanned.
    int gen;
    // Token the lookahead started from; cleared by jj_consume_token() once stale.
    Token first;
    // Lookahead limit the attempt was started with.
    int arg;
    // Next record in the chain, or null at the end.
    JJCalls next;
}
808:
809: // void handleException(Exception e) {
810: // System.out.println(e.toString()); // print the error message
811: // System.out.println("Skipping...");
812: // Token t;
813: // do {
814: // t = getNextToken();
815: // } while (t.kind != TagEnd);
816: // }
817: }
|