001: /* ByteReplayCharSequenceFactory
002: *
003: * (Re)Created on Dec 21, 2006
004: *
005: * Copyright (C) 2006 Internet Archive.
006: *
007: * This file is part of the Heritrix web crawler (crawler.archive.org).
008: *
009: * Heritrix is free software; you can redistribute it and/or modify
010: * it under the terms of the GNU Lesser Public License as published by
011: * the Free Software Foundation; either version 2.1 of the License, or
012: * any later version.
013: *
014: * Heritrix is distributed in the hope that it will be useful,
015: * but WITHOUT ANY WARRANTY; without even the implied warranty of
016: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
017: * GNU Lesser Public License for more details.
018: *
019: * You should have received a copy of the GNU Lesser Public License
020: * along with Heritrix; if not, write to the Free Software
021: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
022: */
023: package org.archive.io;
024:
025: import java.io.IOException;
026: import java.io.RandomAccessFile;
027: import java.io.UnsupportedEncodingException;
028: import java.util.logging.Level;
029: import java.util.logging.Logger;
030:
031: import org.archive.util.DevUtils;
032:
033: /**
034: * Provides a (Replay)CharSequence view on recorded stream bytes (a prefix
035: * buffer and overflow backing file).
036: *
037: * Treats the byte stream as 8-bit.
038: *
039: * <p>Uses a wraparound rolling buffer of the last windowSize bytes read
040: * from disk in memory; as long as the 'random access' of a CharSequence
041: * user stays within this window, access should remain fairly efficient.
042: * (So design any regexps pointed at these CharSequences to work within
043: * that range!)
044: *
045: * <p>When rereading of a location is necessary, the whole window is
046: * recentered around the location requested. (TODO: More research
047: * into whether this is the best strategy.)
048: *
049: * <p>An implementation of a ReplayCharSequence done with ByteBuffers -- one
050: * to wrap the passed prefix buffer and the second, a memory-mapped
051: * ByteBuffer view into the backing file -- was consistently slower: ~10%.
052: * My tests did the following. Made a buffer filled w/ regular content.
053: * This buffer was used as the prefix buffer. The buffer content was
054: * written MULTIPLER times to a backing file. I then did accesses w/ the
055: * following pattern: Skip forward 32 bytes, then back 16 bytes, and then
056: * read forward from byte 16-32. Repeat. Though I varied the size of the
057: * buffer to the size of the backing file,from 3-10, the difference of 10%
058: * or so seemed to persist. Same if I tried to favor get() over get(index).
059: * I used a profiler, JMP, to study times taken (St.Ack did above comment).
060: *
061: * <p>TODO determine in memory mapped files is better way to do this;
062: * probably not -- they don't offer the level of control over
063: * total memory used that this approach does.
064: *
065: * @author Gordon Mohr
066: * @version $Revision: 5027 $, $Date: 2007-03-29 00:30:33 +0000 (Thu, 29 Mar 2007) $
067: */
068: class ByteReplayCharSequence implements ReplayCharSequence {
069:
070: protected static Logger logger = Logger
071: .getLogger(ByteReplayCharSequence.class.getName());
072:
073: /**
074: * Buffer that holds the first bit of content.
075: *
076: * Once this is exhausted we go to the backing file.
077: */
078: private byte[] prefixBuffer;
079:
080: /**
081: * Total length of character stream to replay minus the HTTP headers
082: * if present.
083: *
084: * Used to find EOS.
085: */
086: protected int length;
087:
088: /**
089: * Absolute length of the stream.
090: *
091: * Includes HTTP headers. Needed doing calc. in the below figuring
092: * how much to load into buffer.
093: */
094: private int absoluteLength = -1;
095:
096: /**
097: * Buffer window on to backing file.
098: */
099: private byte[] wraparoundBuffer;
100:
101: /**
102: * Absolute index into underlying bytestream where wrap starts.
103: */
104: private int wrapOrigin;
105:
106: /**
107: * Index in wraparoundBuffer that corresponds to wrapOrigin
108: */
109: private int wrapOffset;
110:
111: /**
112: * Name of backing file we go to when we've exhausted content from the
113: * prefix buffer.
114: */
115: private String backingFilename;
116:
117: /**
118: * Random access to the backing file.
119: */
120: private RandomAccessFile raFile;
121:
122: /**
123: * Offset into prefix buffer at which content beings.
124: */
125: private int contentOffset;
126:
127: /**
128: * 8-bit encoding used reading single bytes from buffer and
129: * stream.
130: */
131: private static final String DEFAULT_SINGLE_BYTE_ENCODING = "ISO-8859-1";
132:
133: /**
134: * Constructor.
135: *
136: * @param buffer In-memory buffer of recordings prefix. We read from
137: * here first and will only go to the backing file if <code>size</code>
138: * requested is greater than <code>buffer.length</code>.
139: * @param size Total size of stream to replay in bytes. Used to find
140: * EOS. This is total length of content including HTTP headers if
141: * present.
142: * @param responseBodyStart Where the response body starts in bytes.
143: * Used to skip over the HTTP headers if present.
144: * @param backingFilename Path to backing file with content in excess of
145: * whats in <code>buffer</code>.
146: *
147: * @throws IOException
148: */
149: public ByteReplayCharSequence(byte[] buffer, long size,
150: long responseBodyStart, String backingFilename)
151: throws IOException {
152:
153: this .length = (int) (size - responseBodyStart);
154: this .absoluteLength = (int) size;
155: this .prefixBuffer = buffer;
156: this .contentOffset = (int) responseBodyStart;
157:
158: // If amount to read is > than what is in our prefix buffer, then
159: // open the backing file.
160: if (size > buffer.length) {
161: this .backingFilename = backingFilename;
162: this .raFile = new RandomAccessFile(backingFilename, "r");
163: this .wraparoundBuffer = new byte[this .prefixBuffer.length];
164: this .wrapOrigin = this .prefixBuffer.length;
165: this .wrapOffset = 0;
166: loadBuffer();
167: }
168: }
169:
170: /**
171: * @return Length of characters in stream to replay. Starts counting
172: * at the HTTP header/body boundary.
173: */
174: public int length() {
175: return this .length;
176: }
177:
178: /**
179: * Get character at passed absolute position.
180: *
181: * Called by {@link #charAt(int)} which has a relative index into the
182: * content, one that doesn't account for HTTP header if present.
183: *
184: * @param index Index into content adjusted to accomodate initial offset
185: * to get us past the HTTP header if present (i.e.
186: * {@link #contentOffset}).
187: *
188: * @return Characater at offset <code>index</code>.
189: */
190: public char charAt(int index) {
191: int c = -1;
192: // Add to index start-of-content offset to get us over HTTP header
193: // if present.
194: index += this .contentOffset;
195: if (index < this .prefixBuffer.length) {
196: // If index is into our prefix buffer.
197: c = this .prefixBuffer[index];
198: } else if (index >= this .wrapOrigin
199: && (index - this .wrapOrigin) < this .wraparoundBuffer.length) {
200: // If index is into our buffer window on underlying backing file.
201: c = this .wraparoundBuffer[((index - this .wrapOrigin) + this .wrapOffset)
202: % this .wraparoundBuffer.length];
203: } else {
204: // Index is outside of both prefix buffer and our buffer window
205: // onto the underlying backing file. Fix the buffer window
206: // location.
207: c = faultCharAt(index);
208: }
209: // Stream is treated as single byte. Make sure characters returned
210: // are not negative.
211: return (char) (c & 0xff);
212: }
213:
214: /**
215: * Get a character that's outside the current buffers.
216: *
217: * will cause the wraparoundBuffer to be changed to
218: * cover a region including the index
219: *
220: * if index is higher than the highest index in the
221: * wraparound buffer, buffer is moved forward such
222: * that requested char is last item in buffer
223: *
224: * if index is lower than lowest index in the
225: * wraparound buffer, buffet is reset centered around
226: * index
227: *
228: * @param index Index of character to fetch.
229: * @return A character that's outside the current buffers
230: */
231: private int faultCharAt(int index) {
232: if (Thread.interrupted()) {
233: throw new RuntimeException("thread interrupted");
234: }
235: if (index >= this .wrapOrigin + this .wraparoundBuffer.length) {
236: // Moving forward
237: while (index >= this .wrapOrigin
238: + this .wraparoundBuffer.length) {
239: // TODO optimize this
240: advanceBuffer();
241: }
242: return charAt(index - this .contentOffset);
243: }
244: // Moving backward
245: recenterBuffer(index);
246: return charAt(index - this .contentOffset);
247: }
248:
249: /**
250: * Move the buffer window on backing file back centering current access
251: * position in middle of window.
252: *
253: * @param index Index of character to access.
254: */
255: private void recenterBuffer(int index) {
256: if (logger.isLoggable(Level.FINE)) {
257: logger.fine("Recentering around " + index + " in "
258: + this .backingFilename);
259: }
260: this .wrapOrigin = index - (this .wraparoundBuffer.length / 2);
261: if (this .wrapOrigin < this .prefixBuffer.length) {
262: this .wrapOrigin = this .prefixBuffer.length;
263: }
264: this .wrapOffset = 0;
265: loadBuffer();
266: }
267:
268: /**
269: * Load from backing file into the wrapper buffer.
270: */
271: private void loadBuffer() {
272: long len = -1;
273: try {
274: len = this .raFile.length();
275: this .raFile
276: .seek(this .wrapOrigin - this .prefixBuffer.length);
277: this .raFile.readFully(this .wraparoundBuffer, 0, Math.min(
278: this .wraparoundBuffer.length, this .absoluteLength
279: - this .wrapOrigin));
280: }
281:
282: catch (IOException e) {
283: // TODO convert this to a runtime error?
284: DevUtils.logger.log(Level.SEVERE, "raFile.seek("
285: + (this .wrapOrigin - this .prefixBuffer.length)
286: + ")\n"
287: + "raFile.readFully(wraparoundBuffer,0,"
288: + (Math.min(this .wraparoundBuffer.length,
289: this .length - this .wrapOrigin)) + ")\n"
290: + "raFile.length()" + len + "\n"
291: + DevUtils.extraInfo(), e);
292: throw new RuntimeException(e);
293: }
294: }
295:
296: /**
297: * Roll the wraparound buffer forward one position
298: */
299: private void advanceBuffer() {
300: try {
301: this .wraparoundBuffer[this .wrapOffset] = (byte) this .raFile
302: .read();
303: this .wrapOffset++;
304: this .wrapOffset %= this .wraparoundBuffer.length;
305: this .wrapOrigin++;
306: } catch (IOException e) {
307: DevUtils.logger.log(Level.SEVERE, "advanceBuffer()"
308: + DevUtils.extraInfo(), e);
309: throw new RuntimeException(e);
310: }
311: }
312:
313: public CharSequence subSequence(int start, int end) {
314: return new CharSubSequence(this , start, end);
315: }
316:
317: /**
318: * Cleanup resources.
319: *
320: * @exception IOException Failed close of random access file.
321: */
322: public void close() throws IOException {
323: this .prefixBuffer = null;
324: if (this .raFile != null) {
325: this .raFile.close();
326: this .raFile = null;
327: }
328: }
329:
330: /* (non-Javadoc)
331: * @see java.lang.Object#finalize()
332: */
333: protected void finalize() throws Throwable {
334: super .finalize();
335: close();
336: }
337:
338: /**
339: * Convenience method for getting a substring.
340: * @deprecated please use subSequence() and then toString() directly
341: */
342: public String substring(int offset, int len) {
343: return subSequence(offset, offset + len).toString();
344: }
345:
346: /* (non-Javadoc)
347: * @see java.lang.Object#toString()
348: */
349: public String toString() {
350: StringBuilder sb = new StringBuilder(this.length());
351: sb.append(this);
352: return sb.toString();
353: }
354: }
|