001: /*
002: * WebSphinx web-crawling toolkit
003: *
004: * Copyright (c) 1998-2002 Carnegie Mellon University. All rights
005: * reserved.
006: *
007: * Redistribution and use in source and binary forms, with or without
008: * modification, are permitted provided that the following conditions
009: * are met:
010: *
011: * 1. Redistributions of source code must retain the above copyright
012: * notice, this list of conditions and the following disclaimer.
013: *
014: * 2. Redistributions in binary form must reproduce the above copyright
015: * notice, this list of conditions and the following disclaimer in
016: * the documentation and/or other materials provided with the
017: * distribution.
018: *
019: * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
020: * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
021: * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
022: * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
023: * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
024: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
025: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
026: * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
027: * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
028: * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
029: * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
030: *
031: */
032:
033: package websphinx;
034:
035: import java.io.*;
036: import java.net.URL;
037: import java.net.MalformedURLException;
038: import java.util.Hashtable;
039:
040: /**
041: * Transformer that remaps URLs in links.
042: * <P>
043: * The default LinkTransformer simply converts all links
044: * to absolute URLs. Other common effects are easy to
045: * achieve:
046: * <UL>
047: * <LI>To make all links relative to a base URL, use
048: * setBase() to set a base URL.
049: * <LI>To replace certain URLs with different ones,
050: * use map() to set up the mappings.
051: * </UL>
052: * The default LinkTransformer strips out <BASE>
053: * elements. Instead, it can output a <BASE>
054: * element with a user-specified URL. Use setBase() to set
055: * the URL and setEmitBaseElement() to indicate that it
056: * should be emitted.
057: */
058: public class LinkTransformer extends HTMLTransformer {
059: protected Hashtable map;
060: protected URL base = null;
061: boolean emitBaseElement = false;
062:
063: boolean needToEmitBase = false;
064:
065: /**
066: * Make a LinkTransformer writing to a file.
067: * @param filename Filename to write to
068: */
069: public LinkTransformer(String filename) throws IOException {
070: super (filename);
071: }
072:
073: /**
074: * Make a LinkTransformer that writes pages to a
075: * file.
076: * @param filename Name of file to receive HTML output
077: * @param seekable True if file should be opened for random access
078: */
079: public LinkTransformer(String filename, boolean seekable)
080: throws IOException {
081: super (filename, seekable);
082: }
083:
084: /**
085: * Make a LinkTransformer writing to a stream.
086: * @param out stream to write to
087: */
088: public LinkTransformer(OutputStream out) {
089: super (out);
090: }
091:
092: /**
093: * Make a LinkTransformer writing to another HTMLTransformer
094: * @param next next transformer in filter chain
095: */
096: public LinkTransformer(HTMLTransformer next) {
097: super (next);
098: }
099:
100: /**
101: * Get the base URL used by the LinkTransformer.
102: * A transformed link's URL is written out relative
103: * to this URL. For instance, if the base URL is
104: * http://www.yahoo.com/Entertainment/, then a link
105: * URL http://www.yahoo.com/News/Current/
106: * would be written out as ../News/Current/.
107: * @return base URL, or null if no base URL is set. Default is null.
108: */
109: public URL getBase() {
110: return base;
111: }
112:
113: /**
114: * Set the base URL used by the LinkTransformer.
115: * A transformed link's URL is written out relative
116: * to this URL. For instance, if the base URL is
117: * http://www.yahoo.com/Entertainment/, then a link
118: * URL http://www.yahoo.com/News/Current/
119: * would be written out as ../News/Current/.
120: * @param base base URL, or null if no base URL should be used.
121: */
122: public synchronized void setBase(URL base) {
123: this .base = base;
124: }
125:
126: /**
127: * Test whether the LinkTransformer should emit a
128: * <BASE> element pointing to the base URL.
129: * @return true if a <BASE> element should be
130: * emitted with each page.
131: */
132: public boolean getEmitBaseElement() {
133: return emitBaseElement;
134: }
135:
136: /**
137: * Set whether the LinkTransformer should emit a
138: * <BASE> element pointing to the base URL.
139: * @param emitBase true if a <BASE> element should be
140: * emitted with each page.
141: */
142: public synchronized void setEmitBaseElement(boolean emitBase) {
143: emitBaseElement = emitBase;
144: }
145:
146: /**
147: * Look up the href for a URL, taking any mapping
148: * into account.
149: * @param base base URL (or null if an absolute URL is desired)
150: * @param url URL of interest
151: * @return relative href for url from base
152: */
153: public String lookup(URL base, URL url) {
154: if (map != null) {
155: Object obj = map.get(url);
156: if (obj instanceof URL)
157: return base != null ? Link.relativeTo(base, (URL) obj)
158: : obj.toString();
159: else if (obj instanceof String)
160: return base != null ? Link.relativeTo(base,
161: (String) obj) : obj.toString();
162: }
163:
164: return base != null ? Link.relativeTo(base, url) : url
165: .toString();
166: }
167:
168: /**
169: * Map a URL to an href. For example, Concatenator
170: * uses this call to map page URLs to their corresponding
171: * anchors in the concatenation.
172: * @param url URL of interest
173: * @param href href which should be returned by lookup (null, url)
174: */
175: public synchronized void map(URL url, String href) {
176: if (map == null)
177: map = new Hashtable();
178: map.put(url, href);
179: }
180:
181: /**
182: * Map a URL to a new URL. For example, Mirror
183: * uses this call to map remote URLs to their corresponding
184: * local URLs.
185: * @param url URL of interest
186: * @param newURL URL which should be returned by lookup (null, url)
187: */
188: public synchronized void map(URL url, URL newURL) {
189: if (map == null)
190: map = new Hashtable();
191: map.put(url, newURL);
192: }
193:
194: /**
195: * Test whether a URL is mapped.
196: * @param url URL of interest
197: * @return true if map () was called to remap url
198: */
199: public boolean isMapped(URL url) {
200: return map != null && map.containsKey(url);
201: }
202:
203: /**
204: * Write a page through the transformer. If
205: * getEmitBaseElement() is true and getBase() is
206: * non-null, then the transformer
207: * outputs a <BASE> element either inside the
208: * page's <HEAD> element (if present) or before
209: * the first tag that belongs in <BODY>.
210: * @param page Page to write
211: */
212: public synchronized void writePage(Page page) throws IOException {
213: needToEmitBase = emitBaseElement && base != null;
214: super .writePage(page);
215: needToEmitBase = false;
216: }
217:
218: /**
219: * Handle an element written through the transformer.
220: * Remaps attributes that contain URLs.
221: * @param elem Element to transform
222: */
223: protected void handleElement(Element elem) throws IOException {
224: Tag tag = elem.getStartTag();
225: String tagName = elem.getTagName();
226:
227: if (needToEmitBase && tag.isBodyTag()) {
228: emit("<BASE HREF=\"" + base.toString() + "\">");
229: needToEmitBase = false;
230: }
231:
232: if (elem instanceof Link)
233: handleLink((Link) elem);
234: else if (tagName == Tag.BASE)
235: handleBase(elem);
236: else if (needToEmitBase && tagName == Tag.HEAD) {
237: // put BASE at the end of HEAD, if we don't find it earlier
238: emit(elem.getStartTag());
239: transformContents(elem);
240: if (needToEmitBase) {
241: emit("<BASE HREF=\"" + base.toString() + "\">");
242: needToEmitBase = false;
243: }
244: if (elem.getEndTag() != null)
245: emit(elem.getEndTag());
246: } else
247: super .handleElement(elem);
248: }
249:
250: /**
251: * Handle a Link's transformation.
252: * Default implementation replaces the link's URL
253: * with lookup(URL).
254: * @param link Link to transform
255: */
256: protected void handleLink(Link link) throws IOException {
257: emit(link.replaceHref(lookup(base, link.getURL())));
258: transformContents(link);
259: if (link.getEndTag() != null)
260: emit(link.getEndTag());
261: }
262:
263: /**
264: * Handle the BASE element.
265: * Default implementation removes if if EmitBaseElement
266: * is false, or changes its URL to Base if EmitBaseElement
267: * is true.
268: * @param elem BASE element to transform
269: */
270: protected void handleBase(Element elem) throws IOException {
271: Tag tag = elem.getStartTag();
272: if (needToEmitBase) {
273: emit(tag.replaceHTMLAttribute("href", base.toString()));
274: needToEmitBase = false;
275: } else if (tag.hasHTMLAttribute("href")
276: && tag.countHTMLAttributes() > 1)
277: // tag has other attributes that we want to preserve
278: emit(tag.removeHTMLAttribute("href"));
279: // otherwise skip the BASE element
280: }
281:
282: /*
283: * Testing
284: *
285: public static void main (String[] args) throws Exception {
286: OutputStream out = (args.length >= 2)
287: ? (OutputStream)new java.io.FileOutputStream (args[1])
288: : (OutputStream)System.out;
289: HTMLTransformer unparser = new LinkTransformer (out);
290:
291: Link link = new Link (args[0]);
292: Page page = new Page (link);
293:
294: unparser.write (page);
295: unparser.close ();
296: }
297: */
298:
299: }
|