001: /*
002: * WebSphinx web-crawling toolkit
003: *
004: * Copyright (c) 1998-2002 Carnegie Mellon University. All rights
005: * reserved.
006: *
007: * Redistribution and use in source and binary forms, with or without
008: * modification, are permitted provided that the following conditions
009: * are met:
010: *
011: * 1. Redistributions of source code must retain the above copyright
012: * notice, this list of conditions and the following disclaimer.
013: *
014: * 2. Redistributions in binary form must reproduce the above copyright
015: * notice, this list of conditions and the following disclaimer in
016: * the documentation and/or other materials provided with the
017: * distribution.
018: *
019: * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
020: * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
021: * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
022: * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
023: * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
024: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
025: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
026: * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
027: * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
028: * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
029: * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
030: *
031: */
032:
033: package websphinx;
034:
035: import java.io.File;
036: import java.io.OutputStream;
037: import java.io.RandomAccessFile;
038: import java.io.IOException; //#ifdef JDK1.1
039: import java.io.Writer;
040: import java.io.OutputStreamWriter;
041:
042: //#endif JDK1.1
043: /*#ifdef JDK1.0
044: import java.io.PrintStream;
045: #endif JDK1.0*/
046:
047: public class HTMLTransformer {
048:
049: //#ifdef JDK1.1
050: private OutputStream stream; // output stream for binary content
051: private Writer writer; // output stream for HTML content
052: //#endif JDK1.1
053: /*#ifdef JDK1.0
054: private PrintStream stream; // output stream
055: #endif JDK1.0*/
056:
057: private boolean openedStream = false;
058: // we opened the stream, so we'd better close it
059:
060: private RandomAccessFile readwrite; // output file
061:
062: private HTMLTransformer next; // next HTMLTransformer in the filter chain
063: private HTMLTransformer head; // head of filter chain
064: private HTMLTransformer tail; // tail of filter chain
065:
066: // these fields are only valid on the tail element in the filter
067: // chain
068: private String content; // content of page being printed
069: private int emitStart, emitEnd; // start and end of pending region
070: // (the last region in the page which
071: // has been emit()ed but not actually
072: // written)
073: private int transformEnd; // end of region being transformed
074:
075: /**
076: * Make an HTMLTransformer that writes pages to a
077: * stream.
078: * @param out Stream to receive HTML output
079: */
080: public HTMLTransformer(OutputStream out) {
081: head = tail = this ;
082: next = null;
083: setOutput(out);
084: }
085:
086: /**
087: * Make an HTMLTransformer that writes pages to a
088: * file.
089: * @param filename Name of file to receive HTML output
090: * @exception IOException if file cannot be opened
091: */
092: public HTMLTransformer(String filename) throws IOException {
093: head = tail = this ;
094: next = null;
095: openFile(filename, false);
096: }
097:
098: /**
099: * Make an HTMLTransformer that writes pages to a
100: * file.
101: * @param filename Name of file to receive HTML output
102: * @param seekable True if file should be opened for random access
103: */
104: public HTMLTransformer(String filename, boolean seekable)
105: throws IOException {
106: head = tail = this ;
107: next = null;
108: openFile(filename, seekable);
109: }
110:
111: /**
112: * Make an HTMLTransformer that writes pages to a
113: * downstream HTMLTransformer. Use this constructor
114: * to chain together several HTMLTransformers.
115: * @param next HTMLTransformer to receive HTML output
116: */
117: public HTMLTransformer(HTMLTransformer next) {
118: this .next = next;
119: tail = next != null ? next.tail : this ;
120: for (HTMLTransformer u = this ; u != null; u = u.next)
121: u.head = this ;
122: }
123:
124: private void openFile(String filename, boolean seekable)
125: throws IOException {
126: File file = new File(filename);
127:
128: // open a stream first, to truncate the file to 0
129: OutputStream out = Access.getAccess().writeFile(file, false);
130:
131: if (!seekable)
132: setOutput(out);
133: else {
134: out.close();
135: RandomAccessFile raf = Access.getAccess().readWriteFile(
136: file);
137: setRandomAccessFile(raf);
138: }
139:
140: openedStream = true;
141: }
142:
143: //#ifdef JDK1.1
144: public void setOutput(OutputStream out) {
145: if (next == null) {
146: stream = out;
147: writer = new OutputStreamWriter(out);
148: } else
149: next.setOutput(out);
150: }
151:
152: // public void setOutput (Writer out) {
153: // if (next == null)
154: // stream = out;
155: // else
156: // next.setOutput (out);
157: // }
158:
159: public OutputStream getOutputStream() {
160: return tail.stream;
161: }
162:
163: public Writer getOutputWriter() {
164: return tail.writer;
165: }
166:
167: //#endif JDK1.1
168:
169: /*#ifdef JDK1.0
170: public void setOutput (OutputStream out) {
171: if (next == null)
172: stream = new PrintStream (out);
173: else
174: next.setOutput (out);
175: }
176:
177: public OutputStream getOutput () {
178: return tail.stream;
179: }
180: #endif JDK1.0*/
181:
182: public void setRandomAccessFile(RandomAccessFile raf) {
183: if (next == null)
184: readwrite = raf;
185: else
186: next.setRandomAccessFile(raf);
187: }
188:
189: public RandomAccessFile getRandomAccessFile() {
190: return tail.readwrite;
191: }
192:
193: /**
194: * Writes a literal string through the HTML transformer
195: * (without parsing it or transforming it).
196: * @param string String to write
197: */
198: public synchronized void write(String string) throws IOException {
199: if (next == null)
200: emit(string);
201: else
202: next.write(string);
203: }
204:
205: /**
206: * Writes a chunk of HTML through the HTML transformer.
207: * @param region Region to write
208: */
209: public synchronized void write(Region region) throws IOException {
210: if (next == null) {
211: emitPendingRegion();
212:
213: String oldContent = content;
214: int oldEmitStart = emitStart;
215: int oldEmitEnd = emitEnd;
216: int oldTransformEnd = transformEnd;
217:
218: content = region.getSource().getContent();
219: emitStart = emitEnd = region.getStart();
220: transformEnd = region.getEnd();
221:
222: processElementsInRegion(region.getRootElement(), region
223: .getStart(), region.getEnd());
224:
225: emitPendingRegion();
226:
227: content = oldContent;
228: emitStart = oldEmitStart;
229: emitEnd = oldEmitEnd;
230: transformEnd = oldTransformEnd;
231: } else
232: next.write(region);
233: }
234:
235: /**
236: * Writes a page through the HTML transformer.
237: * @param page Page to write
238: */
239: public synchronized void writePage(Page page) throws IOException {
240: if (next == null) {
241: if (page.isHTML())
242: write(page);
243: else {
244: System.err.println("binary write of " + page.getURL());
245: writeStream(page.getContentBytes(), 0, page.getLength());
246: }
247: } else
248: next.writePage(page);
249: }
250:
251: /**
252: * Flushes transformer to its destination stream.
253: * Empties any buffers in the transformer chain.
254: */
255: public synchronized void flush() throws IOException {
256: if (next == null) {
257: emitPendingRegion();
258: if (stream != null)
259: stream.flush();
260: if (writer != null)
261: writer.flush();
262: } else
263: next.flush();
264: }
265:
266: /**
267: * Close the transformer. Flushes all buffered data
268: * to disk by calling flush(). This call may be
269: * time-consuming! Don't use the transformer again after
270: * closing it.
271: * @exception IOException if an I/O error occurs
272: */
273: public synchronized void close() throws IOException {
274: flush();
275: if (next == null) {
276: if (openedStream) {
277: if (stream != null)
278: stream.close();
279: if (readwrite != null)
280: readwrite.close();
281: }
282: } else
283: next.close();
284: }
285:
286: /**
287: * Finalizes the transformer (calling close()).
288: */
289: protected void finalize() throws Throwable {
290: close();
291: }
292:
293: /**
294: * Get the file pointer.
295: * @return current file pointer
296: * @exception IOException if this transformer not opened for random access
297: */
298: public long getFilePointer() throws IOException {
299: if (readwrite == null)
300: throw new IOException(
301: "HTMLTransformer not opened for random access");
302: return readwrite.getFilePointer();
303: }
304:
305: /**
306: * Seek to a file position.
307: * @param pos file position to seek
308: * @exception IOException if this transformer not opened for random access
309: */
310: public void seek(long pos) throws IOException {
311: if (readwrite == null)
312: throw new IOException(
313: "HTMLTransformer not opened for random access");
314: readwrite.seek(pos);
315: }
316:
317: /**
318: * Transform an element by passing it through the entire
319: * filter chain.
320: * @param elem Element to be transformed
321: */
322: protected void transformElement(Element elem) throws IOException {
323: head.handleElement(elem);
324: }
325:
326: /**
327: * Transform the contents of an element. Passes
328: * the child elements through the filter chain
329: * and emits the text between them.
330: * @param elem Element whose contents should be transformed
331: */
332: protected void transformContents(Element elem) throws IOException {
333: Tag startTag = elem.getStartTag();
334: Tag endTag = elem.getEndTag();
335:
336: tail.processElementsInRegion(elem.getChild(),
337: startTag.getEnd(), endTag != null ? endTag.getStart()
338: : elem.getEnd());
339: }
340:
341: /**
342: * Handle the transformation of an HTML element.
343: * Override this method to modify the HTML as it is
344: * written.
345: * @param elem Element to transform
346: */
347: protected void handleElement(Element elem) throws IOException {
348: if (next == null) {
349: Tag startTag = elem.getStartTag();
350: Tag endTag = elem.getEndTag();
351:
352: emit(startTag);
353: transformContents(elem);
354: if (endTag != null)
355: emit(endTag);
356: } else
357: next.handleElement(elem);
358: }
359:
360: /**
361: * Emit a region on the transformer chain's final output.
362: * (The region isn't passed through the chain.)
363: * @param r Region to emit
364: */
365: protected void emit(Region r) throws IOException {
366: tail.emitInternal(r.getSource().getContent(), r.getStart(), r
367: .getEnd());
368: }
369:
370: /**
371: * Emit a string on the transformer chain's final output.
372: * @param string String to emit
373: */
374: protected void emit(String string) throws IOException {
375: tail.emitInternal(string, 0, string.length());
376: }
377:
378: private void processElementsInRegion(Element elem, int start,
379: int end) throws IOException {
380: if (this != tail)
381: throw new RuntimeException(
382: "processElementsInRegion not called on tail");
383:
384: int p = start;
385:
386: if (elem != null && elem.getSource().getContent() == content)
387: end = Math.min(end, transformEnd);
388:
389: while (elem != null && elem.getStartTag().getEnd() <= end) {
390: emitInternal(content, p, elem.getStart());
391: transformElement(elem);
392: p = elem.getEnd();
393: elem = elem.getNext();
394: }
395: emitInternal(content, Math.min(p, end), end);
396: }
397:
398: private void emitInternal(String str, int start, int end)
399: throws IOException {
400: if (this != tail)
401: throw new RuntimeException(
402: "emitInternal not called on tail");
403:
404: if (str == content) {
405: start = Math.min(start, transformEnd);
406: end = Math.min(end, transformEnd);
407:
408: if (start == emitEnd)
409: emitEnd = end; // just extend the pending emit region
410: else {
411: emitPendingRegion();
412: emitStart = start;
413: emitEnd = end;
414: }
415: } else {
416: emitPendingRegion();
417: writeStream(str.substring(start, end));
418: }
419: }
420:
421: private void emitPendingRegion() throws IOException {
422: if (this != tail)
423: throw new RuntimeException(
424: "emitPendingRegion not called on tail");
425:
426: if (emitStart != emitEnd) {
427: writeStream(content.substring(emitStart, emitEnd));
428: emitStart = emitEnd;
429: }
430: }
431:
432: private void writeStream(String s) throws IOException {
433: if (writer != null) {
434: //#ifdef JDK1.1
435: writer.write(s);
436: //#endif JDK1.1
437: /*#ifdef JDK1.0
438: stream.print (s);
439: #endif JDK1.0*/
440: } else
441: readwrite.writeBytes(s);
442: }
443:
444: private void writeStream(byte[] buf, int offset, int len)
445: throws IOException {
446: if (stream != null) {
447: //#ifdef JDK1.1
448: stream.write(buf, offset, len);
449: //#endif JDK1.1
450: /*#ifdef JDK1.0
451: stream.write (buf, offset, len);
452: #endif JDK1.0*/
453: } else
454: readwrite.write(buf, offset, len);
455: }
456:
457: /*
458: * Testing
459: *
460: public static void main (String[] args) throws Exception {
461: Link link = new Link (args[0]);
462: Page page = new Page (link);
463:
464: OutputStream out = (args.length >= 2)
465: ? (OutputStream)new java.io.FileOutputStream (args[1])
466: : (OutputStream)System.out;
467: HTMLTransformer unparser = new TestTransformer (out);
468:
469: int len = page.getLength();
470: unparser.write (new Region (page, 0, 3*len/4));
471:
472: unparser.close ();
473: }
474: */
475: }
476:
477: /*
478: * Testing
479: *
480: class TestTransformer extends HTMLTransformer {
481: public TestTransformer (OutputStream out) {
482: super (out);
483: }
484:
485: protected void handleElement (Element elem) throws IOException {
486: System.out.println ("handling <" + elem.getTagName() + ">");
487: super.handleElement (elem);
488: }
489: }
490: */
|