001: /*
002: * WebSphinx web-crawling toolkit
003: *
004: * Copyright (c) 1998-2002 Carnegie Mellon University. All rights
005: * reserved.
006: *
007: * Redistribution and use in source and binary forms, with or without
008: * modification, are permitted provided that the following conditions
009: * are met:
010: *
011: * 1. Redistributions of source code must retain the above copyright
012: * notice, this list of conditions and the following disclaimer.
013: *
014: * 2. Redistributions in binary form must reproduce the above copyright
015: * notice, this list of conditions and the following disclaimer in
016: * the documentation and/or other materials provided with the
017: * distribution.
018: *
019: * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
020: * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
021: * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
022: * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
023: * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
024: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
025: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
026: * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
027: * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
028: * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
029: * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
030: *
031: */
032:
033: package websphinx;
034:
035: import java.io.*;
036: import java.net.URL;
037:
038: /**
039: * Transformer that remaps URLs in links in such a way
040: * that if the URL mapping changes during (or after) some
041: * HTML has been transformed, the HTML can be fixed up after
042: * the fact. This class is used by Concatenator and Mirror,
043: * since in those operations, the URL mapping function
044: * changes as each page is written to the concatenation or
045: * mirror.
046: */
047: public class RewritableLinkTransformer extends LinkTransformer {
048:
049: private RewriteRegion head, tail;
050: private File file;
051: private boolean closed = false;
052:
053: /**
054: * Make a RewritableLinkTransformer.
055: * @param filename Filename to write to
056: */
057: public RewritableLinkTransformer(String filename)
058: throws IOException {
059: super (filename, true);
060: file = new File(filename);
061: }
062:
063: public void close() throws IOException {
064: super .close();
065: closed = true;
066: }
067:
068: static final String PLACEHOLDER = "@WEBSPHINX@";
069:
070: protected void handleLink(Link link) throws IOException {
071: URL url = link.getURL();
072:
073: Tag t = link.replaceHref(PLACEHOLDER);
074: String s = t.toString();
075: int prefix = s.indexOf(PLACEHOLDER);
076: if (prefix != -1) {
077: int postfix = prefix + PLACEHOLDER.length();
078:
079: emit(s.substring(0, prefix));
080:
081: String href = lookup(base, url);
082: RewriteRegion node = addURL(url, getFilePointer(), href
083: .length());
084: emit(href);
085:
086: emit(s.substring(postfix));
087: } else {
088: emit(s);
089: }
090:
091: transformContents(link);
092: if (link.getEndTag() != null)
093: emit(link.getEndTag());
094: }
095:
096: private RewriteRegion addURL(URL url, long offset, int len) {
097: RewriteRegion node = new RewriteRegion();
098: node.url = url;
099: node.offset = offset;
100: node.len = len;
101:
102: if (tail == null) {
103: head = tail = node;
104: } else {
105: node.next = tail.next;
106: tail.next = node;
107: node.prev = tail;
108: if (node.next != null)
109: node.next.prev = node;
110: tail = node;
111: }
112:
113: return node;
114: }
115:
116: static final int BUFFER_SIZE = 8;
117:
118: /**
119: * Rewrite the file, remapping all the URLs according to their
120: * current values from lookup().
121: */
122: public void rewrite() throws IOException {
123: flush();
124:
125: if (head == null)
126: // no links to rewrite
127: return;
128:
129: RandomAccessFile raf = closed ? Access.getAccess()
130: .readWriteFile(file) : getRandomAccessFile();
131:
132: byte buf[] = new byte[BUFFER_SIZE];
133: long end = raf.length();
134: long src = 0;
135: long dest = 0;
136: long left;
137: int n;
138: int growth = 0;
139: int shrinkage = 0;
140:
141: // Forward pass
142: // Rewrite only URLs which are becoming shorter
143: raf.seek(dest);
144: for (RewriteRegion loc = head; loc != null; loc = loc.next) {
145: // loop invariant: file[0..dest-1] is rewritten,
146: // and next byte to copy to file[dest] is from file[src]
147: // and raf.getFilePointer() == dest
148: long diff = dest - src;
149:
150: String href = lookup(base, loc.url);
151: loc.newHref = href;
152: loc.newLen = href.length();
153:
154: if (loc.newLen > loc.len) {
155: // new URL is longer than old URL
156: // must postpone rewriting this until the backward pass
157: growth += loc.newLen - loc.len;
158: loc.offset += diff;
159: continue;
160: } else
161: shrinkage += loc.len - loc.newLen;
162:
163: // rewrite up to loc
164: left = loc.offset - src;
165: while (left > BUFFER_SIZE) {
166: raf.seek(src);
167: raf.read(buf);
168: raf.seek(dest);
169: raf.write(buf);
170: src += BUFFER_SIZE;
171: dest += BUFFER_SIZE;
172: left -= BUFFER_SIZE;
173: }
174: if (left > 0) {
175: n = (int) left;
176: raf.seek(src);
177: raf.read(buf, 0, n);
178: raf.seek(dest);
179: raf.write(buf, 0, n);
180: src += n;
181: dest += n;
182: left -= n;
183: }
184:
185: // write loc
186: raf.writeBytes(href);
187:
188: dest += loc.newLen;
189: src += loc.len;
190:
191: loc.offset += diff;
192: loc.len = loc.newLen;
193: }
194:
195: if (src > dest) {
196: // rewrite rest of file
197: while (true) {
198: raf.seek(src);
199: if ((n = raf.read(buf)) == -1)
200: break;
201: raf.seek(dest);
202: raf.write(buf, 0, n);
203: src += n;
204: dest += n;
205: }
206: } else
207: src = dest = end;
208:
209: src = dest;
210: dest += growth;
211: for (RewriteRegion loc = tail; loc != null; loc = loc.prev) {
212: // loop invariant: file[dest...end-1] is rewritten,
213: // and next byte to copy to file[dest] is from file[src]
214: long diff = dest - src;
215:
216: if (loc.newLen <= loc.len) {
217: loc.offset += diff;
218: continue;
219: }
220:
221: // rewrite back to loc
222: left = src - (loc.offset + loc.len);
223: while (left > BUFFER_SIZE) {
224: src -= BUFFER_SIZE;
225: dest -= BUFFER_SIZE;
226: left -= BUFFER_SIZE;
227: raf.seek(src);
228: raf.read(buf);
229: raf.seek(dest);
230: raf.write(buf);
231: }
232: if (left > 0) {
233: n = (int) left;
234: src -= n;
235: dest -= n;
236: raf.seek(src);
237: raf.read(buf, 0, n);
238: raf.seek(dest);
239: raf.write(buf, 0, n);
240: }
241:
242: // write loc
243: dest -= loc.newLen;
244: src -= loc.len;
245: raf.seek(dest);
246: raf.writeBytes(loc.newHref);
247:
248: loc.offset = dest;
249: loc.len = loc.newLen;
250: }
251:
252: if (src != dest)
253: System.err.println("ASSERTION FAILURE: src=" + src
254: + "!=dest=" + dest);
255:
256: if (shrinkage > growth) {
257: // overwrite the rest of the file with spaces
258: for (int i = 0; i < BUFFER_SIZE; ++i)
259: buf[i] = (byte) ' ';
260: left = shrinkage - growth;
261: raf.seek(end - left);
262: while (left > BUFFER_SIZE) {
263: raf.write(buf);
264: left -= BUFFER_SIZE;
265: }
266: if (left > 0)
267: raf.write(buf, 0, (int) left);
268: } else
269: raf.seek(end + (growth - shrinkage));
270:
271: if (closed)
272: raf.close();
273: }
274:
275: /*
276: * Testing
277: *
278: public static void main (String[] args) throws Exception {
279: RewritableLinkTransformer unparser = new TestILTransformer (args[1]);
280: Link link = new Link (args[0]);
281: Page page = new Page (link);
282: System.out.println ("Writing " + page.toDescription());
283: unparser.writePage (page);
284: System.out.println ("Rewriting while open");
285: unparser.rewrite ();
286: unparser.close ();
287: System.out.println ("Rewriting after close");
288: unparser.rewrite ();
289: }
290: */
291: }
292:
/**
 * One href written to the output file by RewritableLinkTransformer,
 * kept as a node in a doubly-linked list so that the file can be walked
 * both forwards and backwards when hrefs are rewritten.
 */
class RewriteRegion {
    // Neighbouring regions in the list (null at either end).
    RewriteRegion prev;
    RewriteRegion next;

    // URL the href refers to; passed to lookup() again on each rewrite.
    URL url;

    // File offset of the href's first character, and the href's current length.
    long offset;
    int len;

    // Replacement href and its length, filled in while a rewrite is in progress.
    String newHref;
    int newLen;
}
304:
305: /*
306: * Testing
307: *
308: class TestILTransformer extends RewritableLinkTransformer {
309: public TestILTransformer (String filename) throws IOException {
310: super (filename);
311: }
312:
313: final static String BIG_STRING =
314: "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
315: +"@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
316: +"@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
317: +"@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@";
318:
319: public String lookup (URL base, URL url) {
320: if (closed)
321: return super.lookup (base, url);
322: else if (Math.random() > 0.5)
323: return BIG_STRING.substring (0, url.toString().length()*2);
324: else
325: return "";
326: }
327: }
328: */
|