001: /*
002: * WebSphinx web-crawling toolkit
003: *
004: * Copyright (c) 1998-2002 Carnegie Mellon University. All rights
005: * reserved.
006: *
007: * Redistribution and use in source and binary forms, with or without
008: * modification, are permitted provided that the following conditions
009: * are met:
010: *
011: * 1. Redistributions of source code must retain the above copyright
012: * notice, this list of conditions and the following disclaimer.
013: *
014: * 2. Redistributions in binary form must reproduce the above copyright
015: * notice, this list of conditions and the following disclaimer in
016: * the documentation and/or other materials provided with the
017: * distribution.
018: *
019: * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
020: * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
021: * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
022: * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
023: * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
024: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
025: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
026: * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
027: * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
028: * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
029: * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
030: *
031: */
032:
033: package websphinx;
034:
035: import rcm.util.Str;
036: import java.io.*;
037: import java.net.URL;
038: import java.net.MalformedURLException;
039: import java.util.Hashtable;
040:
041: /**
042: * Transformer that concatenates multiple pages
043: * into a single HTML page.
044: * <P>
045: * The entire set of pages is preceded by a "prolog"
046: * and followed by an "epilog", which are constant
047: * strings of HTML. Each page is preceded
048: * by a "header" and followed by a "footer". Adjacent pages
049: * are separated by a "divider".
050: * <P>
051: * Concatenator performs the following
052: * transformations on pages before appending them together:
053: * <UL>
054: * <LI> deletes elements that would conflict, including
055: * <HEADf>, <TITLEf>, <BODYf>, <HTMLf>,
056: * <STYLE>, and <FRAMES>.
057: * <LI> deletes <BASEf> or replaces it with a user-specified
058: * <BASEf>
059: * <LI> changes links among the written pages into
060: * in-page references, of the form "#concatenator_N"
061: * <LI> changes links to other pages into absolute references
062: * </UL>
063: *
064: */
065:
066: // FIX: transform anchors
067: public class Concatenator extends RewritableLinkTransformer {
068:
069: boolean needRewrite = false;
070:
071: public static String defaultProlog = "<HTML><HEAD><TITLE>Concatenation</TITLE></HEAD><BODY>\n";
072: public static String defaultHeader = "<TABLE WIDTH=\"100%\"><TR>\n"
073: + "<TD ALIGN=left><A NAME=\"%a\">%t [%u]</A>\n"
074: + "<TD ALIGN=right>Page %p</TABLE>\n";
075: public static String defaultFooter = "";
076: public static String defaultDivider = "\n<DIV STYLE=\"page-break-after: always;\"><HR></DIV>\n";
077: public static String defaultEpilog = "\n</BODY></HTML>\n";
078:
079: String prolog = defaultProlog;
080: String header = defaultHeader;
081: String footer = defaultFooter;
082: String divider = defaultDivider;
083: String epilog = defaultEpilog;
084:
085: int nPages = 0;
086:
087: /**
088: * Make a new Concatenator that writes to a file.
089: * @param filename Filename to write concatenated pages to
090: * @exception IOException if file cannot be opened
091: */
092: public Concatenator(String filename) throws IOException {
093: super (makeDirs(filename));
094: }
095:
096: private static String makeDirs(String filename) throws IOException {
097: File file = new File(filename);
098: File parent = new File(file.getParent());
099: if (parent != null)
100: Access.getAccess().makeDir(parent);
101: return filename;
102: }
103:
104: /**
105: * Set the prolog.
106: * @param prolog string of HTML that is emitted at the beginning
107: * of the concatenation. Default value is: <BR>
108: * <CODE><HTML><HEAD><TITLE>Concatenation</TITLE></HEAD><BODY>\n</CODE>
109: */
110: public synchronized void setProlog(String prolog) {
111: this .prolog = prolog;
112: }
113:
114: /**
115: * Get the prolog.
116: * @return string of HTML that is emitted at the beginning
117: * of the concatenation.
118: */
119: public String getProlog() {
120: return prolog;
121: }
122:
123: /**
124: * Set the header. The header can contain macro codes which
125: * are replaced with attributes of the page about to be written:
126: * <DL>
127: * <DT>%t
128: * <DD>title of the page
129: * <DT>%u
130: * <DD>URL of page
131: * <DT>%a
132: * <DD>anchor name of the page ("pageN", where N is the page number)
133: * <DT>%p
134: * <DD>page number (starting from 1)
135: * </DL>
136: * @param header string of HTML that is emitted before
137: * each page. The default value is:<BR>
138: * <CODE> <TABLE WIDTH="100%"><TR>\n <BR>
139: * <TD ALIGN=left><A NAME="%a">%t [%u]</A>\n <BR>
140: * <TD ALIGN=right>Page %p</TABLE>\n</CODE>
141: */
142: public synchronized void setPageHeader(String header) {
143: this .header = header;
144: }
145:
146: /**
147: * Get the header.
148: * @return string of HTML that is emitted before
149: * each page.
150: */
151: public String getPageHeader() {
152: return header;
153: }
154:
155: /**
156: * Set the footer. The footer can contain the same
157: * macros as the header (%t, %u, %a, %p); see setPageHeader
158: * for more details.
159: * @param footer string of HTML that is emitted after
160: * each page.
161: */
162: public synchronized void setPageFooter(String footer) {
163: this .footer = footer;
164: }
165:
166: /**
167: * Get the footer.
168: * @return string of HTML that is emitted after
169: * each page.
170: */
171: public String getPageFooter() {
172: return footer;
173: }
174:
175: /**
176: * Set the divider.
177: * @param divider string of HTML that is emitted between
178: * each pair of pages.
179: */
180: public synchronized void setDivider(String divider) {
181: this .divider = divider;
182: }
183:
184: /**
185: * Get the divider.
186: * @return string of HTML that is emitted between
187: * each pair of pages.
188: */
189: public String getDivider() {
190: return divider;
191: }
192:
193: /**
194: * Set the epilog.
195: * @param epilog string of HTML that is emitted after
196: * the entire concatenation.
197: */
198: public synchronized void setEpilog(String epilog) {
199: this .epilog = epilog;
200: }
201:
202: /**
203: * Get the epilog.
204: * @return string of HTML that is emitted after
205: * the entire concatenation.
206: */
207: public String getEpilog() {
208: return epilog;
209: }
210:
211: /**
212: * Get number of pages written to this mirror.
213: * @return number of calls to writePage() on this mirror
214: */
215: public synchronized int getPageCount() {
216: return nPages;
217: }
218:
219: /**
220: * Rewrite the concatenation. Makes sure all the links
221: * among concatenated pages have been fixed up.
222: */
223: public synchronized void rewrite() throws IOException {
224: if (needRewrite) {
225: super .rewrite();
226: needRewrite = false;
227: }
228: }
229:
230: /**
231: * Close the concatenation. Makes sure all the links
232: * among concatenated pages have been fixed up and closes
233: * the file.
234: */
235: public synchronized void close() throws IOException {
236: if (nPages == 0)
237: write(prolog);
238: emit(epilog);
239: rewrite();
240: super .close();
241: }
242:
243: /**
244: * Write a page to the concatenation.
245: * @param page Page to write
246: */
247: public synchronized void writePage(Page page) throws IOException {
248: ++nPages;
249:
250: emit((nPages == 1) ? prolog : divider);
251:
252: String title = page.getTitle();
253: URL url = page.getURL();
254: String urlString = url.toExternalForm();
255: String anchor = "page" + nPages;
256: map(url, "#" + anchor);
257:
258: emitTemplate(header, title, urlString, anchor, nPages);
259: if (page.isImage() && page.getURL() != null)
260: super .write("<IMG SRC='" + page.getURL() + "'>");
261: else if (page.isHTML())
262: // it's HTML, can write it normally
263: super .writePage(page);
264: else
265: super .write(page.toHTML());
266: emitTemplate(footer, title, urlString, anchor, nPages);
267:
268: needRewrite = nPages > 1;
269: }
270:
271: private void emitTemplate(String template, String title,
272: String url, String anchor, int pages) throws IOException {
273: if (template == null || template.length() == 0)
274: return;
275:
276: template = Str.replace(template, "%t", title != null ? title
277: : "");
278: template = Str.replace(template, "%u", url != null ? url : "");
279: template = Str.replace(template, "%a", anchor != null ? anchor
280: : "");
281: template = Str.replace(template, "%p", String.valueOf(pages));
282: emit(template);
283: }
284:
285: /**
286: * Process an HTML element for concatenation. Deletes
287: * tags that would
288: * conflict with other pages (such as <HEAD>),
289: * changes the URLs in Link elements, and deletes
290: * or remaps the BASE element.
291: * @param elem HTML element to process
292: */
293: protected void handleElement(Element elem) throws IOException {
294: String name = elem.getTagName();
295: if (name == Tag.TITLE || name == Tag.STYLE || name == Tag.BASE
296: || name == Tag.ISINDEX || name == Tag.FRAMESET
297: || name == Tag.FRAME) {
298: // skip the entire element
299: } else if (name == Tag.HTML || name == Tag.HEAD
300: || name == Tag.BODY || name == Tag.NOFRAMES) {
301: // skip only the start and end tags; preserve the content
302: transformContents(elem);
303: } else
304: super .handleElement(elem);
305: }
306:
307: /*
308: * Testing
309: *
310: *
311: *
312: */
313: public static void main(String[] args) throws Exception {
314: HTMLTransformer out = new Concatenator(args[args.length - 1]);
315: for (int i = 0; i < args.length - 1; ++i) {
316: Link link = new Link(args[i]);
317: Page page = new Page(link);
318: out.writePage(page);
319: }
320: out.close();
321: }
322: }
|