001: /*
002: * WebSphinx web-crawling toolkit
003: *
004: * Copyright (c) 1998-2002 Carnegie Mellon University. All rights
005: * reserved.
006: *
007: * Redistribution and use in source and binary forms, with or without
008: * modification, are permitted provided that the following conditions
009: * are met:
010: *
011: * 1. Redistributions of source code must retain the above copyright
012: * notice, this list of conditions and the following disclaimer.
013: *
014: * 2. Redistributions in binary form must reproduce the above copyright
015: * notice, this list of conditions and the following disclaimer in
016: * the documentation and/or other materials provided with the
017: * distribution.
018: *
019: * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
020: * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
021: * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
022: * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
023: * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
024: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
025: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
026: * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
027: * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
028: * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
029: * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
030: *
031: */
032:
033: package websphinx;
034:
035: import java.io.*;
036: import java.net.URL;
037: import java.net.MalformedURLException;
038: import java.util.Vector;
039:
040: /**
041: * Offline mirror of a Web site. Web pages written to
042: * a mirror are stored as files on the local disk in a directory
043: * structure mirroring their URLs.
044: * <P>
045: */
046:
047: // FIX: discards ALL anchors (for some reason)
048: public class Mirror extends LinkTransformer {
049: String root;
050: // Root directory represented as file:/<dir>/
051:
052: Vector files = new Vector();
053: // collection of RewritableLinkTransformers, one for each
054: // file in the mirror
055:
056: boolean needRewrite = false;
057:
058: String defaultFilename = "index.html";
059:
060: // name given to a directory URL (like http://foo.com/)
061: // when it is saved to disk
062:
063: /**
064: * Make a new Mirror.
065: * @param directory Root directory (on local disk
066: * relative to which the mirror pages are stored)
067: */
068: public Mirror(String directory) throws IOException {
069: super ((HTMLTransformer) null);
070: if (!directory.endsWith("/"))
071: directory += "/";
072: File rootFile = new File(directory);
073: if (!rootFile.isAbsolute())
074: rootFile = new File(rootFile.getAbsolutePath());
075: URL rootURL = Link.FileToURL(rootFile);
076: root = rootURL.toExternalForm();
077: }
078:
079: /**
080: * Get the filename used for directory URLs.
081: * For example, if the default filename is "index.html",
082: * then the remote URL "http://www.xxx.com/path/" would
083: * map to the local pathname "www.xxx.com/path/index.html".
084: * @return default filename. Default is "index.html".
085: */
086: public String getDefaultFilename() {
087: return defaultFilename;
088: }
089:
090: /**
091: * Set the filename used for directory URLs.
092: * For example, if the default filename is "index.html",
093: * then the remote URL "http://www.xxx.com/path/" would
094: * map to the local pathname "www.xxx.com/path/index.html".
095: * @param filename Default filename.
096: */
097: public synchronized void setDefaultFilename(String filename) {
098: defaultFilename = filename;
099: }
100:
101: /**
102: * Get number of pages written to this mirror.
103: * @return number of calls to writePage() on this mirror
104: */
105: public synchronized int getPageCount() {
106: return files.size();
107: }
108:
109: public void write(Region region) throws IOException {
110: throw new IOException("write(Region) not supported by Mirror");
111: }
112:
113: public void write(String string) throws IOException {
114: throw new IOException("write(String) not supported by Mirror");
115: }
116:
117: /**
118: * Write a page to the mirror. Stores the page on the local
119: * disk, fixing up its links to point to the local
120: * copies of any pages already stored to this mirror.
121: * @param page Page to write
122: */
123: public synchronized void writePage(Page page) throws IOException {
124: URL url = page.getURL();
125: String local = toLocalFileURL(url);
126: URL localURL = new URL(local);
127: File localFile = Link.URLToFile(localURL);
128:
129: File parent = new File(localFile.getParent());
130: if (parent != null)
131: Access.getAccess().makeDir(parent);
132:
133: MirrorTransformer out = new MirrorTransformer(this , localFile);
134: out.setBase(localURL);
135: out.setEmitBaseElement(getEmitBaseElement());
136: out.writePage(page);
137: out.close();
138:
139: needRewrite = !files.isEmpty();
140: files.addElement(out);
141: }
142:
143: /**
144: * Close the mirror. Makes sure that links point to local versions of
145: * pages wherever possible.
146: */
147: public synchronized void close() throws IOException {
148: rewrite();
149: }
150:
151: /**
152: * Rewrite the mirror to make local links consistent.
153: */
154: public synchronized void rewrite() throws IOException {
155: if (needRewrite) {
156: for (int i = 0, n = files.size(); i < n; ++i) {
157: RewritableLinkTransformer r = (RewritableLinkTransformer) files
158: .elementAt(i);
159: r.rewrite();
160: }
161: needRewrite = false;
162: }
163: }
164:
165: // maps a remote URL to a local file URL ("<root>/<host>/<filename>")
166: // resulting URL is never slash-terminated
167: private String toLocalFileURL(URL remoteURL) {
168: if (isMapped(remoteURL))
169: return lookup(null, remoteURL);
170:
171: String remote = remoteURL.toExternalForm();
172: URL remoteDirURL = Link.getDirectoryURL(remoteURL);
173: String remoteDir = remoteDirURL.toExternalForm();
174: String remoteFile = (remote.length() > remoteDir.length()) ? encode(remote
175: .substring(remoteDir.length()))
176: : defaultFilename;
177: String localDir = toLocalDirURL(remoteDirURL);
178: String local = localDir + remoteFile;
179:
180: map(remoteURL, local);
181: return local;
182: }
183:
184: // Maps a remote directory URL (slash-terminated) to a local
185: // directory URL (slash-terminated)
186: private String toLocalDirURL(URL remoteURL) {
187: if (isMapped(remoteURL))
188: return lookupDir(null, remoteURL);
189:
190: String remote = remoteURL.toExternalForm();
191: String local;
192: URL remoteParentURL = Link.getParentURL(remoteURL);
193:
194: if (remoteParentURL.equals(remoteURL)) {
195: // we've reached http://host/
196: String host = remoteURL.getHost();
197: int port = remoteURL.getPort();
198: local = root
199: + encode((port != -1) ? host + ":" + port : host)
200: + '/';
201: } else {
202: String remoteParent = remoteParentURL.toExternalForm();
203: String remoteFile = encode(remote.substring(remoteParent
204: .length(), remote.length() - 1));
205: String localDir = toLocalDirURL(remoteParentURL);
206: local = localDir + remoteFile + "/";
207: }
208:
209: map(remoteURL, local);
210: return local;
211: }
212:
213: /**
214: * Map a directory URL (of the form http://host/path/) to
215: * a local directory.
216: * @param url Directory URL. Must end with a slash.
217: * @param dir Local directory relative to which descendents of
218: * url should be saved.
219: */
220: public synchronized void mapDir(URL url, String dir)
221: throws MalformedURLException {
222: if (!dir.endsWith("/"))
223: dir += "/";
224: map(Link.getDirectoryURL(url), Link.FileToURL(
225: new File(dir + defaultFilename)).toString());
226: }
227:
228: /**
229: * Lookup the local directory to which a remote directory
230: * URL maps.
231: * @param base local file URL to use as a base. If non-null,
232: * then the returned pathname is relative to this URL. If
233: * null, the returned pathname is an absolute URL (file:/path/).
234: * @param url remote directory URL to look up. Must end in slash.
235: */
236: public String lookupDir(URL base, URL url) {
237: String href = lookup(base, url);
238: int lastSlash = href.lastIndexOf('/');
239: return href.substring(0, lastSlash + 1);
240: }
241:
242: private static String canonicalDir(String dir) {
243: dir = dir.replace('\\', '/');
244: if (!dir.endsWith("/"))
245: dir += "/";
246: if (!dir.startsWith("/"))
247: dir = "/" + dir;
248: return dir;
249: }
250:
251: private static String encode(String component) {
252: char[] chars = component.toCharArray();
253:
254: for (int i = 0; i < chars.length; ++i)
255: switch (chars[i]) {
256: case 'A':
257: case 'B':
258: case 'C':
259: case 'D':
260: case 'E':
261: case 'F':
262: case 'G':
263: case 'H':
264: case 'I':
265: case 'J':
266: case 'K':
267: case 'L':
268: case 'M':
269: case 'N':
270: case 'O':
271: case 'P':
272: case 'Q':
273: case 'R':
274: case 'S':
275: case 'T':
276: case 'U':
277: case 'V':
278: case 'W':
279: case 'X':
280: case 'Y':
281: case 'Z':
282:
283: case 'a':
284: case 'b':
285: case 'c':
286: case 'd':
287: case 'e':
288: case 'f':
289: case 'g':
290: case 'h':
291: case 'i':
292: case 'j':
293: case 'k':
294: case 'l':
295: case 'm':
296: case 'n':
297: case 'o':
298: case 'p':
299: case 'q':
300: case 'r':
301: case 's':
302: case 't':
303: case 'u':
304: case 'v':
305: case 'w':
306: case 'x':
307: case 'y':
308: case 'z':
309:
310: case '0':
311: case '1':
312: case '2':
313: case '3':
314: case '4':
315: case '5':
316: case '6':
317: case '7':
318: case '8':
319: case '9':
320:
321: case '.':
322: case '-':
323: case '_':
324: case '~':
325:
326: break;
327:
328: default:
329: chars[i] = '_';
330: break;
331: }
332:
333: return new String(chars);
334: }
335:
336: /*
337: * Testing
338: *
339: */
340: public static void main(String[] args) throws Exception {
341: String directory = args[args.length - 1];
342: Mirror out = new Mirror(directory);
343: out.mapDir(new URL(args[0]), directory);
344: for (int i = 0; i < args.length - 1; ++i) {
345: Link link = new Link(args[i]);
346: Page page = new Page(link);
347: out.writePage(page);
348: }
349: out.close();
350: }
351:
352: }
353:
354: class MirrorTransformer extends RewritableLinkTransformer {
355: Mirror mirror; // on the wall?
356:
357: public MirrorTransformer(Mirror mirror, File file)
358: throws IOException {
359: super (file.toString());
360: this .mirror = mirror;
361: }
362:
363: public String lookup(URL base, URL url) {
364: return mirror.lookup(base, url);
365: }
366:
367: public void map(URL remoteURL, String href) {
368: mirror.map(remoteURL, href);
369: }
370:
371: public void map(URL remoteURL, URL url) {
372: mirror.map(remoteURL, url);
373: }
374:
375: public boolean isMapped(URL url) {
376: return mirror.isMapped(url);
377: }
378: }
|