001: // htmlFilterOutputStream.java
002: // ---------------------------
003: // (C) by Michael Peter Christen; mc@anomic.de
004: // first published on http://www.anomic.de
005: // Frankfurt, Germany, 2004, 2005
006: //
007: // $LastChangedDate: 2006-09-15 17:01:25 +0200 (Fr, 15 Sep 2006) $
008: // $LastChangedRevision: 2598 $
009: // $LastChangedBy: theli $
010: //
011: // This program is free software; you can redistribute it and/or modify
012: // it under the terms of the GNU General Public License as published by
013: // the Free Software Foundation; either version 2 of the License, or
014: // (at your option) any later version.
015: //
016: // This program is distributed in the hope that it will be useful,
017: // but WITHOUT ANY WARRANTY; without even the implied warranty of
018: // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: // GNU General Public License for more details.
020: //
021: // You should have received a copy of the GNU General Public License
022: // along with this program; if not, write to the Free Software
023: // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: //
025: // Using this software in any meaning (reading, learning, copying, compiling,
026: // running) means that you agree that the Author(s) is (are) not responsible
027: // for cost, loss of data or any harm that may be caused directly or indirectly
028: // by usage of this softare or this documentation. The usage of this software
029: // is on your own risk. The installation and usage (starting/running) of this
030: // software may allow other people or application to access your computer and
031: // any attached devices and is highly dependent on the configuration of the
032: // software which must be done by the user of the software;the author(s) is
033: // (are) also not responsible for proper configuration and usage of the
034: // software, even if provoked by documentation provided together with
035: // the software.
036: //
037: // Any changes to this file according to the GPL as documented in the file
038: // gpl.txt aside this file in the shipment you received can be done to the
039: // lines that follows this copyright notice here, but changes must not be
040: // done inside the copyright notive above. A re-distribution must contain
041: // the intact and unchanged copyright notice.
042: // Contributions and changes to the program code must be marked as such.
043:
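/*
 This class implements a Writer. All character data written to it is parsed
 as HTML while it is being written: the recognized text and tag pieces are
 handed to the scraper and/or transformer given to the constructor, and the
 (possibly transformed) result is forwarded to the wrapped output stream,
 if there is one.
 After writing is finished, the results collected by the htmlFilter
 (the scraper) can be read out.

 */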
050:
051: package de.anomic.htmlFilter;
052:
053: import java.io.File;
054: import java.io.FileOutputStream;
055: import java.io.FileReader;
056: import java.io.IOException;
057: import java.io.OutputStream;
058: import java.io.OutputStreamWriter;
059: import java.io.Reader;
060: import java.io.UnsupportedEncodingException;
061: import java.io.Writer;
062: import java.net.MalformedURLException;
063: import java.util.Enumeration;
064: import java.util.Properties;
065:
066: import de.anomic.server.serverCharBuffer;
067: import de.anomic.yacy.yacyURL;
068:
069: public final class htmlFilterWriter extends Writer {
070:
071: public static final char lb = '<';
072: public static final char rb = '>';
073: public static final char dash = '-';
074: public static final char excl = '!';
075: public static final char singlequote = '\'';
076: public static final char doublequote = '"';
077:
078: private OutputStream outStream;
079: private OutputStreamWriter out;
080: private serverCharBuffer buffer;
081: private String filterTag;
082: private Properties filterOpts;
083: private serverCharBuffer filterCont;
084: private htmlFilterScraper scraper;
085: private htmlFilterTransformer transformer;
086: private boolean inSingleQuote;
087: private boolean inDoubleQuote;
088: private boolean inComment;
089: private boolean inScript;
090: private boolean binaryUnsuspect;
091: private boolean passbyIfBinarySuspect;
092:
093: public htmlFilterWriter(OutputStream outStream,
094: String outputStreamCharset, htmlFilterScraper scraper,
095: htmlFilterTransformer transformer,
096: boolean passbyIfBinarySuspect)
097: throws UnsupportedEncodingException {
098: this .outStream = outStream;
099: this .scraper = scraper;
100: this .transformer = transformer;
101: this .buffer = new serverCharBuffer(1024);
102: this .filterTag = null;
103: this .filterOpts = null;
104: this .filterCont = null;
105: this .inSingleQuote = false;
106: this .inDoubleQuote = false;
107: this .inComment = false;
108: this .inScript = false;
109: this .binaryUnsuspect = true;
110: this .passbyIfBinarySuspect = passbyIfBinarySuspect;
111:
112: if (this .outStream != null) {
113: this .out = new OutputStreamWriter(this .outStream,
114: (outputStreamCharset == null) ? "UTF-8"
115: : outputStreamCharset);
116: }
117: }
118:
119: public static char[] genTag0raw(String tagname, boolean opening,
120: char[] tagopts) {
121: serverCharBuffer bb = new serverCharBuffer(tagname.length()
122: + tagopts.length + 3);
123: bb.append((int) '<');
124: if (!opening) {
125: bb.append((int) '/');
126: }
127: bb.append(tagname);
128: if (tagopts.length > 0) {
129: // if (tagopts[0] == (byte) 32)
130: bb.append(tagopts);
131: // else bb.append((byte) 32).append(tagopts);
132: }
133: bb.append((int) '>');
134: return bb.getChars();
135: }
136:
137: public static char[] genTag1raw(String tagname, char[] tagopts,
138: char[] text) {
139: serverCharBuffer bb = new serverCharBuffer(2 * tagname.length()
140: + tagopts.length + text.length + 5);
141: bb.append((int) '<').append(tagname);
142: if (tagopts.length > 0) {
143: // if (tagopts[0] == (byte) 32)
144: bb.append(tagopts);
145: // else bb.append((byte) 32).append(tagopts);
146: }
147: bb.append((int) '>');
148: bb.append(text);
149: bb.append((int) '<').append((int) '/').append(tagname).append(
150: (int) '>');
151: return bb.getChars();
152: }
153:
154: public static char[] genTag0(String tagname, Properties tagopts,
155: char quotechar) {
156: char[] tagoptsx = (tagopts.size() == 0) ? null : genOpts(
157: tagopts, quotechar);
158: serverCharBuffer bb = new serverCharBuffer(tagname.length()
159: + ((tagoptsx == null) ? 0 : (tagoptsx.length + 1))
160: + tagname.length() + 2);
161: bb.append((int) '<').append(tagname);
162: if (tagoptsx != null) {
163: bb.append(32);
164: bb.append(tagoptsx);
165: }
166: bb.append((int) '>');
167: return bb.getChars();
168: }
169:
170: public static char[] genTag1(String tagname, Properties tagopts,
171: char[] text, char quotechar) {
172: char[] gt0 = genTag0(tagname, tagopts, quotechar);
173: serverCharBuffer cb = new serverCharBuffer(gt0, gt0.length
174: + text.length + tagname.length() + 3);
175: cb.append(text).append((int) '<').append((int) '/').append(
176: tagname).append((int) '>');
177: return cb.getChars();
178: }
179:
180: // a helper method for pretty-printing of properties for html tags
181: public static char[] genOpts(Properties prop, char quotechar) {
182: Enumeration<?> e = prop.propertyNames();
183: serverCharBuffer bb = new serverCharBuffer(prop.size() * 40);
184: String key;
185: while (e.hasMoreElements()) {
186: key = (String) e.nextElement();
187: bb.append(32).append(key).append((int) '=').append(
188: (int) quotechar);
189: bb.append(prop.getProperty(key));
190: bb.append((int) quotechar);
191: }
192: if (bb.length() > 0)
193: return bb.getChars(1);
194: return bb.getChars();
195: }
196:
197: private char[] filterTag(String tag, boolean opening,
198: char[] content, char quotechar) {
199: // System.out.println("FILTER1: filterTag=" + ((filterTag == null) ? "null" : filterTag) + ", tag=" + tag + ", opening=" + ((opening) ? "true" : "false") + ", content=" + new String(content)); // debug
200: if (filterTag == null) {
201: // we are not collection tag text
202: if (tag == null) {
203: // and this is not a tag opener/closer
204: if (scraper != null)
205: scraper.scrapeText(content, null);
206: if (transformer != null)
207: return transformer.transformText(content);
208: return content;
209: }
210:
211: // we have a new tag
212: if (opening) {
213: if ((scraper != null) && (scraper.isTag0(tag))) {
214: // this single tag is collected at once here
215: scraper.scrapeTag0(tag, new serverCharBuffer(
216: content).propParser());
217: }
218: if ((transformer != null) && (transformer.isTag0(tag))) {
219: // this single tag is collected at once here
220: return transformer.transformTag0(tag,
221: new serverCharBuffer(content).propParser(),
222: quotechar);
223: } else if (((scraper != null) && (scraper.isTag1(tag)))
224: || ((transformer != null) && (transformer
225: .isTag1(tag)))) {
226: // ok, start collecting
227: filterTag = tag;
228: filterOpts = new serverCharBuffer(content)
229: .propParser();
230: filterCont = new serverCharBuffer();
231: return new char[0];
232: } else {
233: // we ignore that thing and return it again
234: return genTag0raw(tag, true, content);
235: }
236: }
237:
238: // we ignore that thing and return it again
239: return genTag0raw(tag, false, content);
240:
241: }
242:
243: // we are collection tag text for the tag 'filterTag'
244: if (tag == null) {
245: // go on collecting content
246: if (scraper != null)
247: scraper.scrapeText(content, filterTag);
248: if (transformer != null) {
249: filterCont.append(transformer.transformText(content));
250: } else {
251: filterCont.append(content);
252: }
253: return new char[0];
254: }
255:
256: // it's a tag! which one?
257: if ((opening) || (!(tag.equals(filterTag)))) {
258: // this tag is not our concern. just add it
259: filterCont.append(genTag0raw(tag, opening, content));
260: return new char[0];
261: }
262:
263: // it's our closing tag! return complete result.
264: char[] ret;
265: if (scraper != null)
266: scraper.scrapeTag1(filterTag, filterOpts, filterCont
267: .getChars());
268: if (transformer != null) {
269: ret = transformer.transformTag1(filterTag, filterOpts,
270: filterCont.getChars(), quotechar);
271: } else {
272: ret = genTag1(filterTag, filterOpts, filterCont.getChars(),
273: quotechar);
274: }
275: filterTag = null;
276: filterOpts = null;
277: filterCont = null;
278: return ret;
279: }
280:
281: private char[] filterFinalize(char quotechar) {
282: if (filterTag == null) {
283: return new char[0];
284: }
285:
286: // it's our closing tag! return complete result.
287: char[] ret;
288: if (scraper != null)
289: scraper.scrapeTag1(filterTag, filterOpts, filterCont
290: .getChars());
291: if (transformer != null) {
292: ret = transformer.transformTag1(filterTag, filterOpts,
293: filterCont.getChars(), quotechar);
294: } else {
295: ret = genTag1(filterTag, filterOpts, filterCont.getChars(),
296: quotechar);
297: }
298: filterTag = null;
299: filterOpts = null;
300: filterCont = null;
301: return ret;
302: }
303:
304: private char[] filterSentence(char[] in, char quotechar) {
305: if (in.length == 0)
306: return in;
307: // System.out.println("FILTER0: " + new String(in)); // debug
308: // scan the string and parse structure
309: if (in.length > 2 && in[0] == lb) {
310:
311: // a tag
312: String tag;
313: int tagend;
314: if (in[1] == '/') {
315: // a closing tag
316: tagend = tagEnd(in, 2);
317: tag = new String(in, 2, tagend - 2);
318: char[] text = new char[in.length - tagend - 1];
319: System.arraycopy(in, tagend, text, 0, in.length
320: - tagend - 1);
321: return filterTag(tag, false, text, quotechar);
322: }
323:
324: // an opening tag
325: tagend = tagEnd(in, 1);
326: tag = new String(in, 1, tagend - 1);
327: char[] text = new char[in.length - tagend - 1];
328: System.arraycopy(in, tagend, text, 0, in.length - tagend
329: - 1);
330: return filterTag(tag, true, text, quotechar);
331: }
332:
333: // a text
334: return filterTag(null, true, in, quotechar);
335: }
336:
337: private static int tagEnd(char[] tag, int start) {
338: char c;
339: for (int i = start; i < tag.length; i++) {
340: c = tag[i];
341: if (c != '!' && c != '-' && (c < '0' || c > '9')
342: && (c < 'a' || c > 'z') && (c < 'A' || c > 'Z'))
343: return i;
344: }
345: return tag.length - 1;
346: }
347:
348: public void write(int c) throws IOException {
349: // System.out.println((char) b);
350: if ((binaryUnsuspect) && (binaryHint((char) c))) {
351: binaryUnsuspect = false;
352: if (passbyIfBinarySuspect)
353: finalize();
354: }
355:
356: if (binaryUnsuspect || !passbyIfBinarySuspect) {
357: char[] filtered;
358: if (inSingleQuote) {
359: buffer.append(c);
360: if (c == singlequote)
361: inSingleQuote = false;
362: // check error cases
363: if ((c == rb) && (buffer.charAt(0) == lb)) {
364: inSingleQuote = false;
365: // the tag ends here. after filtering: pass on
366: filtered = filterSentence(buffer.getChars(),
367: singlequote);
368: if (out != null) {
369: out.write(filtered);
370: }
371: // buffer = new serverByteBuffer();
372: buffer.reset();
373: }
374: } else if (inDoubleQuote) {
375: buffer.append(c);
376: if (c == doublequote)
377: inDoubleQuote = false;
378: // check error cases
379: if (c == rb && buffer.charAt(0) == lb) {
380: inDoubleQuote = false;
381: // the tag ends here. after filtering: pass on
382: filtered = filterSentence(buffer.getChars(),
383: doublequote);
384: if (out != null)
385: out.write(filtered);
386: // buffer = new serverByteBuffer();
387: buffer.reset();
388: }
389: } else if (inComment) {
390: buffer.append(c);
391: if (c == rb && buffer.length() > 6
392: && buffer.charAt(buffer.length() - 3) == dash) {
393: // comment is at end
394: inComment = false;
395: if (out != null)
396: out.write(buffer.getChars());
397: // buffer = new serverByteBuffer();
398: buffer.reset();
399: }
400: } else if (inScript) {
401: buffer.append(c);
402: int bufferLength = buffer.length();
403: if ((c == rb) && (bufferLength > 14)
404: && (buffer.charAt(bufferLength - 8) == '/')
405: && (buffer.charAt(bufferLength - 7) == 's')
406: && (buffer.charAt(bufferLength - 6) == 'c')
407: && (buffer.charAt(bufferLength - 5) == 'r')
408: && (buffer.charAt(bufferLength - 4) == 'i')
409: && (buffer.charAt(bufferLength - 3) == 'p')
410: && (buffer.charAt(bufferLength - 2) == 't')) {
411: // script is at end
412: inScript = false;
413: if (out != null)
414: out.write(buffer.getChars());
415: // buffer = new serverByteBuffer();
416: buffer.reset();
417: }
418: } else {
419: if (buffer.length() == 0) {
420: if (c == rb) {
421: // very strange error case; we just let it pass
422: if (out != null)
423: out.write(c);
424: } else {
425: buffer.append(c);
426: }
427: } else if (buffer.charAt(0) == lb) {
428: if (c == singlequote)
429: inSingleQuote = true;
430: if (c == doublequote)
431: inDoubleQuote = true;
432: // fill in tag text
433: if ((buffer.length() == 3)
434: && (buffer.charAt(1) == excl)
435: && (buffer.charAt(2) == dash)
436: && (c == dash)) {
437: // this is the start of a comment
438: inComment = true;
439: buffer.append(c);
440: } else if ((buffer.length() == 6)
441: && (buffer.charAt(1) == 's')
442: && (buffer.charAt(2) == 'c')
443: && (buffer.charAt(3) == 'r')
444: && (buffer.charAt(4) == 'i')
445: && (buffer.charAt(5) == 'p') && (c == 't')) {
446: // this is the start of a comment
447: inScript = true;
448: buffer.append(c);
449: } else if (c == rb) {
450: buffer.append(c);
451: // the tag ends here. after filtering: pass on
452: filtered = filterSentence(buffer.getChars(),
453: doublequote);
454: if (out != null)
455: out.write(filtered);
456: // buffer = new serverByteBuffer();
457: buffer.reset();
458: } else if (c == lb) {
459: // this is an error case
460: // we consider that there is one rb missing
461: if (buffer.length() > 0) {
462: filtered = filterSentence(
463: buffer.getChars(), doublequote);
464: if (out != null)
465: out.write(filtered);
466: }
467: // buffer = new serverByteBuffer();
468: buffer.reset();
469: buffer.append(c);
470: } else {
471: buffer.append(c);
472: }
473: } else {
474: // fill in plain text
475: if (c == lb) {
476: // the text ends here
477: if (buffer.length() > 0) {
478: filtered = filterSentence(
479: buffer.getChars(), doublequote);
480: if (out != null)
481: out.write(filtered);
482: }
483: // buffer = new serverByteBuffer();
484: buffer.reset();
485: buffer.append(c);
486: } else {
487: // simply append
488: buffer.append(c);
489: }
490: }
491: }
492: } else {
493: out.write(c);
494: }
495: }
496:
497: public void write(char b[]) throws IOException {
498: write(b, 0, b.length);
499: }
500:
501: public void write(char b[], int off, int len) throws IOException {
502: // System.out.println(new String(b, off, len));
503: if ((off | len | (b.length - (len + off)) | (off + len)) < 0)
504: throw new IndexOutOfBoundsException();
505: for (int i = off; i < (len - off); i++)
506: this .write(b[i]);
507: }
508:
509: public void flush() throws IOException {
510: // we cannot flush the current string buffer to prevent that
511: // the filter process is messed up
512: // instead, we simply flush the underlying output stream
513: if (out != null)
514: out.flush();
515: // if you want to flush all, call close() at end of writing;
516: }
517:
518: public void finalize() throws IOException {
519: // if we are forced to close, we of course flush the buffer first,
520: // then close the connection
521: close();
522: }
523:
524: public void close() throws IOException {
525: char quotechar = (inSingleQuote) ? singlequote : doublequote;
526: if (buffer != null) {
527: if (buffer.length() > 0) {
528: char[] filtered = filterSentence(buffer.getChars(),
529: quotechar);
530: if (out != null)
531: out.write(filtered);
532: }
533: buffer = null;
534: }
535: char[] finalized = filterFinalize(quotechar);
536: if (out != null) {
537: if (finalized != null)
538: out.write(finalized);
539: out.flush();
540: out.close();
541: }
542: filterTag = null;
543: filterOpts = null;
544: filterCont = null;
545: // if (scraper != null) {scraper.close(); scraper = null;}
546: // if (transformer != null) {transformer.close(); transformer = null;}
547: }
548:
549: private static boolean binaryHint(char c) {
550: // return Character.isDefined(c);
551: if (c < 0)
552: return false;
553: if (c > 31)
554: return false;
555: if ((c == 8) || (c == 9) || (c == 10) || (c == 13))
556: return false;
557: // return false;
558: // System.out.println("BINARY HINT: " + (int) b);
559: return true;
560: }
561:
562: public boolean binarySuspect() {
563: return !binaryUnsuspect;
564: }
565:
566: public static void main(String[] args) {
567: // takes one argument: a file name
568: if (args.length != 1)
569: return;
570: char[] buffer = new char[512];
571: try {
572: htmlFilterContentScraper scraper = new htmlFilterContentScraper(
573: new yacyURL("http://localhost:8080", null));
574: htmlFilterTransformer transformer = new htmlFilterContentTransformer();
575: // TODO: this does not work at the moment
576: System.exit(0);
577: Reader is = new FileReader(args[0]);
578: FileOutputStream fos = new FileOutputStream(new File(
579: args[0] + ".out"));
580: Writer os = new htmlFilterWriter(fos, "UTF-8", scraper,
581: transformer, false);
582: int i;
583: while ((i = is.read(buffer)) > 0)
584: os.write(buffer, 0, i);
585: os.close();
586: fos.close();
587: is.close();
588: scraper.print();
589: } catch (MalformedURLException e) {
590: e.printStackTrace();
591: } catch (IOException e) {
592: e.printStackTrace();
593: }
594: }
595:
596: }
|