001: // htmlFilterInputStream.java
002: // (C) 2005, 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany
003: // first published 2005 on http://www.anomic.de
004: //
005: // This is a part of YaCy, a peer-to-peer based web search engine
006: //
007: // $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
008: // $LastChangedRevision: 1986 $
009: // $LastChangedBy: orbiter $
010: //
011: // LICENSE
012: //
013: // This program is free software; you can redistribute it and/or modify
014: // it under the terms of the GNU General Public License as published by
015: // the Free Software Foundation; either version 2 of the License, or
016: // (at your option) any later version.
017: //
018: // This program is distributed in the hope that it will be useful,
019: // but WITHOUT ANY WARRANTY; without even the implied warranty of
020: // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
021: // GNU General Public License for more details.
022: //
023: // You should have received a copy of the GNU General Public License
024: // along with this program; if not, write to the Free Software
025: // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
026:
027: package de.anomic.htmlFilter;
028:
029: import java.io.BufferedInputStream;
030: import java.io.IOException;
031: import java.io.InputStream;
032: import java.io.InputStreamReader;
033: import java.io.Reader;
034: import java.io.UnsupportedEncodingException;
035: import java.io.Writer;
036: import java.util.Properties;
037:
038: import de.anomic.http.httpHeader;
039: import de.anomic.yacy.yacyURL;
040:
041: public class htmlFilterInputStream extends InputStream implements
042: htmlFilterEventListener {
043:
044: private static final int MODE_PRESCAN = 0;
045: private static final int MODE_PRESCAN_FINISHED = 1;
046: private int mode = 1;
047:
048: private long preBufferSize = 143336;
049: private long preRead = 0;
050: private BufferedInputStream bufferedIn;
051:
052: private String detectedCharset;
053: private boolean charsetChanged = false;
054: private boolean endOfHead = false;
055:
056: private Reader reader;
057: private Writer writer;
058:
059: public htmlFilterInputStream(InputStream inStream,
060: String inputStreamCharset, yacyURL rooturl,
061: htmlFilterTransformer transformer,
062: boolean passbyIfBinarySuspect)
063: throws UnsupportedEncodingException {
064: // create a input stream for buffereing
065: this .bufferedIn = new BufferedInputStream(inStream,
066: (int) this .preBufferSize);
067: this .bufferedIn.mark((int) this .preBufferSize);
068:
069: htmlFilterContentScraper scraper = new htmlFilterContentScraper(
070: rooturl);
071: scraper.registerHtmlFilterEventListener(this );
072:
073: this .reader = new InputStreamReader(this , inputStreamCharset);
074: this .writer = new htmlFilterWriter(null, null, scraper,
075: transformer, passbyIfBinarySuspect);
076: }
077:
078: public void scrapeTag0(String tagname, Properties tagopts) {
079: if (tagname == null || tagname.length() == 0)
080: return;
081:
082: if (tagname.equalsIgnoreCase("meta")) {
083: if (tagopts.containsKey("http-equiv")) {
084: String value = tagopts.getProperty("http-equiv");
085: if (value.equalsIgnoreCase("Content-Type")) {
086: String contentType = tagopts.getProperty("content",
087: "");
088: this .detectedCharset = httpHeader
089: .extractCharsetFromMimetyeHeader(contentType);
090: if (this .detectedCharset != null
091: && this .detectedCharset.length() > 0) {
092: this .charsetChanged = true;
093: } else if (tagopts.containsKey("charset")) {
094: // sometimes the charset property is configured as extra attribut. try it ...
095: this .detectedCharset = tagopts
096: .getProperty("charset");
097: this .charsetChanged = true;
098: }
099: }
100: }
101: }
102: }
103:
104: public void scrapeTag1(String tagname, Properties tagopts,
105: char[] text) {
106: if (tagname == null || tagname.length() == 0)
107: return;
108:
109: if (tagname.equalsIgnoreCase("head")) {
110: this .endOfHead = true;
111: }
112: }
113:
114: public String detectCharset() throws IOException {
115: this .mode = MODE_PRESCAN;
116:
117: // loop until we have detected the header element or the charset data
118: int c;
119: while ((c = this .reader.read()) != -1) {
120: this .writer.write(c);
121: }
122:
123: // free writer
124: this .writer = null;
125: // don't close writer here, otherwise it will shutdown our source stream
126:
127: // reset the buffer if not already done
128: if (this .mode != MODE_PRESCAN_FINISHED) {
129: this .mode++;
130: this .bufferedIn.reset();
131: }
132:
133: // return scanning result
134: return (this .charsetChanged) ? this .detectedCharset : null;
135: }
136:
137: public int read() throws IOException {
138: // mode 0 is called from within the detectCharset function
139: if (this .mode == MODE_PRESCAN) {
140: if (this .endOfHead || this .charsetChanged
141: || this .preRead >= this .preBufferSize - 1) {
142: return -1;
143: }
144: this.preRead++;
145: }
146: return this.bufferedIn.read();
147: }
148:
149: }
|