001: /* MultiByteReplayCharSequenceFactory
002: *
003: * (Re)Created on Dec 21, 2006
004: *
005: * Copyright (C) 2006 Internet Archive.
006: *
007: * This file is part of the Heritrix web crawler (crawler.archive.org).
008: *
009: * Heritrix is free software; you can redistribute it and/or modify
010: * it under the terms of the GNU Lesser Public License as published by
011: * the Free Software Foundation; either version 2.1 of the License, or
012: * any later version.
013: *
014: * Heritrix is distributed in the hope that it will be useful,
015: * but WITHOUT ANY WARRANTY; without even the implied warranty of
016: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
017: * GNU Lesser Public License for more details.
018: *
019: * You should have received a copy of the GNU Lesser Public License
020: * along with Heritrix; if not, write to the Free Software
021: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
022: */
023: package org.archive.io;
024:
025: import java.io.BufferedReader;
026: import java.io.BufferedWriter;
027: import java.io.File;
028: import java.io.FileInputStream;
029: import java.io.FileNotFoundException;
030: import java.io.FileOutputStream;
031: import java.io.IOException;
032: import java.io.InputStreamReader;
033: import java.io.OutputStreamWriter;
034: import java.io.Writer;
035: import java.nio.ByteBuffer;
036: import java.nio.CharBuffer;
037: import java.nio.channels.FileChannel;
038: import java.nio.charset.Charset;
039: import java.nio.charset.CharsetDecoder;
040: import java.nio.charset.CoderResult;
041: import java.nio.charset.CodingErrorAction;
042: import java.util.logging.Level;
043: import java.util.logging.Logger;
044:
045: /**
046: * Provides a (Replay)CharSequence view on recorded streams (a prefix
047: * buffer and overflow backing file) that can handle streams of multibyte
048: * characters.
049: *
050: * If possible, use {@link ByteReplayCharSequence}. It performs better even
051: * for the single byte case (Decoding is an expensive process).
052: *
053: * <p>Call close on this class when done so can clean up resources.
054: *
055: * <p>Implementation currently works by checking to see if content to read
056: * all fits the in-memory buffer. If so, we decode into a CharBuffer and
057: * keep this around for CharSequence operations. This CharBuffer is
058: * discarded on close.
059: *
060: * <p>If content length is greater than in-memory buffer, we decode the
061: * buffer plus backing file into a new file named for the backing file w/
062: * a suffix of the encoding we write the file as. We then run w/ a
063: * memory-mapped CharBuffer against this file to implement CharSequence.
064: * The reason for this implementation is that CharSequence needs to return
065: * the length of the CharSequence.
066: *
067: * <p>Obvious optimizations would keep around decodings whether the
068: * in-memory decoded buffer or the file of decodings written to disk but the
069: * general usage pattern processing URIs is that the decoding is used by one
070: * processor only. Also of note, files usually fit into the in-memory
071: * buffer.
072: *
073: * <p>We might also be able to keep up 3 windows that moved across the file
074: * decoding a window at a time trying to keep one of the buffers just in
075: * front of the regex processing returning it a length that would be only
076: * the length of current position to end of current block or else the length
077: * could be got by multiplying the backing file's length by the decoder's
078: * estimate of average character size. This would save us writing out the
079: * decoded file. We'd have to do the latter for files that are
080: * > Integer.MAX_VALUE.
081: *
082: * @author stack
083: * @version $Revision: 4844 $, $Date: 2007-01-10 17:18:34 +0000 (Wed, 10 Jan 2007) $
084: */
085: public class MultiByteReplayCharSequence implements ReplayCharSequence {
086:
087: protected static Logger logger = Logger
088: .getLogger(MultiByteReplayCharSequence.class.getName());
089:
090: /**
091: * Name of the encoding we use writing out concatenated decoded prefix
092: * buffer and decoded backing file.
093: *
094: * <p>This define is also used as suffix for the file that holds the
095: * decodings. The name of the file that holds the decoding is the name
096: * of the backing file w/ this encoding for a suffix.
097: *
098: * <p>See <a ref="http://java.sun.com/j2se/1.4.2/docs/guide/intl/encoding.doc.html">Encoding</a>.
099: */
100: private static final String WRITE_ENCODING = "UTF-16BE";
101:
102: /**
103: * CharBuffer of decoded content.
104: *
105: * Content of this buffer is unicode.
106: */
107: private CharBuffer content = null;
108:
109: /**
110: * File that has decoded content.
111: *
112: * Keep it around so we can remove on close.
113: */
114: private File decodedFile = null;
115:
116: /**
117: * Constructor for all in-memory operation.
118: *
119: * @param buffer In-memory buffer of recordings prefix. We read from
120: * here first and will only go to the backing file if <code>size</code>
121: * requested is greater than <code>buffer.length</code>.
122: * @param size Total size of stream to replay in bytes. Used to find
123: * EOS. This is total length of content including HTTP headers if
124: * present.
125: * @param responseBodyStart Where the response body starts in bytes.
126: * Used to skip over the HTTP headers if present.
127: * @param backingFilename Path to backing file with content in excess of
128: * whats in <code>buffer</code>.
129: * @param encoding Encoding to use reading the passed prefix buffer and
130: * backing file. For now, should be java canonical name for the
131: * encoding. (If null is passed, we will default to
132: * ByteReplayCharSequence).
133: *
134: * @throws IOException
135: */
136: public MultiByteReplayCharSequence(byte[] buffer, long size,
137: long responseBodyStart, String encoding) throws IOException {
138: super ();
139: this .content = decodeInMemory(buffer, size, responseBodyStart,
140: encoding);
141: }
142:
143: /**
144: * Constructor for overflow-to-disk-file operation.
145: *
146: * @param contentReplayInputStream inputStream of content
147: * @param backingFilename hint for name of temp file
148: * @param characterEncoding Encoding to use reading the stream.
149: * For now, should be java canonical name for the
150: * encoding.
151: *
152: * @throws IOException
153: */
154: public MultiByteReplayCharSequence(
155: ReplayInputStream contentReplayInputStream,
156: String backingFilename, String characterEncoding)
157: throws IOException {
158: super ();
159: this .content = decodeToFile(contentReplayInputStream,
160: backingFilename, characterEncoding);
161: }
162:
163: /**
164: * Decode passed buffer and backing file into a CharBuffer.
165: *
166: * This method writes a new file made of the decoded concatenation of
167: * the in-memory prefix buffer and the backing file. Returns a
168: * charSequence view onto this new file.
169: *
170: * @param buffer In-memory buffer of recordings prefix. We read from
171: * here first and will only go to the backing file if <code>size</code>
172: * requested is greater than <code>buffer.length</code>.
173: * @param size Total size of stream to replay in bytes. Used to find
174: * EOS. This is total length of content including HTTP headers if
175: * present.
176: * @param responseBodyStart Where the response body starts in bytes.
177: * Used to skip over the HTTP headers if present.
178: * @param backingFilename Path to backing file with content in excess of
179: * whats in <code>buffer</code>.
180: * @param encoding Encoding to use reading the passed prefix buffer and
181: * backing file. For now, should be java canonical name for the
182: * encoding. (If null is passed, we will default to
183: * ByteReplayCharSequence).
184: *
185: * @return A CharBuffer view on decodings of the contents of passed
186: * buffer.
187: * @throws IOException
188: */
189: private CharBuffer decodeToFile(ReplayInputStream inStream,
190: String backingFilename, String encoding) throws IOException {
191:
192: CharBuffer charBuffer = null;
193:
194: BufferedReader reader = new BufferedReader(
195: new InputStreamReader(inStream, encoding));
196:
197: this .decodedFile = new File(backingFilename + "."
198: + WRITE_ENCODING);
199: BufferedWriter writer = new BufferedWriter(
200: new OutputStreamWriter(new FileOutputStream(
201: this .decodedFile), WRITE_ENCODING));
202:
203: int c;
204: while ((c = reader.read()) >= 0) {
205: writer.write(c);
206: }
207: writer.close();
208:
209: charBuffer = getReadOnlyMemoryMappedBuffer(this .decodedFile)
210: .asCharBuffer();
211:
212: return charBuffer;
213: }
214:
215: /**
216: * Decode passed buffer into a CharBuffer.
217: *
218: * This method decodes a memory buffer returning a memory buffer.
219: *
220: * @param buffer In-memory buffer of recordings prefix. We read from
221: * here first and will only go to the backing file if <code>size</code>
222: * requested is greater than <code>buffer.length</code>.
223: * @param size Total size of stream to replay in bytes. Used to find
224: * EOS. This is total length of content including HTTP headers if
225: * present.
226: * @param responseBodyStart Where the response body starts in bytes.
227: * Used to skip over the HTTP headers if present.
228: * @param encoding Encoding to use reading the passed prefix buffer and
229: * backing file. For now, should be java canonical name for the
230: * encoding. (If null is passed, we will default to
231: * ByteReplayCharSequence).
232: *
233: * @return A CharBuffer view on decodings of the contents of passed
234: * buffer.
235: */
236: private CharBuffer decodeInMemory(byte[] buffer, long size,
237: long responseBodyStart, String encoding) {
238: ByteBuffer bb = ByteBuffer.wrap(buffer);
239: // Move past the HTTP header if present.
240: bb.position((int) responseBodyStart);
241: // Set the end-of-buffer to be end-of-content.
242: bb.limit((int) size);
243: return (Charset.forName(encoding)).decode(bb)
244: .asReadOnlyBuffer();
245: }
246:
247: /**
248: * Create read-only memory-mapped buffer onto passed file.
249: *
250: * @param file File to get memory-mapped buffer on.
251: * @return Read-only memory-mapped ByteBuffer view on to passed file.
252: * @throws IOException
253: */
254: private ByteBuffer getReadOnlyMemoryMappedBuffer(File file)
255: throws IOException {
256:
257: ByteBuffer bb = null;
258: FileInputStream in = null;
259: FileChannel c = null;
260: assert file.exists() : "No file " + file.getAbsolutePath();
261:
262: try {
263: in = new FileInputStream(file);
264: c = in.getChannel();
265: // TODO: Confirm the READ_ONLY works. I recall it not working.
266: // The buffers seem to always say that the buffer is writeable.
267: bb = c.map(FileChannel.MapMode.READ_ONLY, 0, c.size())
268: .asReadOnlyBuffer();
269: }
270:
271: finally {
272: if (c != null && c.isOpen()) {
273: c.close();
274: }
275: if (in != null) {
276: in.close();
277: }
278: }
279:
280: return bb;
281: }
282:
283: private void deleteFile(File fileToDelete) {
284: deleteFile(fileToDelete, null);
285: }
286:
287: private void deleteFile(File fileToDelete, final Exception e) {
288: if (e != null) {
289: // Log why the delete to help with debug of java.io.FileNotFoundException:
290: // ....tt53http.ris.UTF-16BE.
291: logger.severe("Deleting " + fileToDelete + " because of "
292: + e.toString());
293: }
294: if (fileToDelete != null && fileToDelete.exists()) {
295: fileToDelete.delete();
296: }
297: }
298:
299: public void close() {
300: this .content = null;
301: deleteFile(this .decodedFile);
302: // clear decodedFile -- so that double-close (as in
303: // finalize()) won't delete a later instance with same name
304: // see bug [ 1218961 ] "failed get of replay" in ExtractorHTML... usu: UTF-16BE
305: this .decodedFile = null;
306: }
307:
308: protected void finalize() throws Throwable {
309: super .finalize();
310: // Maybe TODO: eliminate close here, requiring explicit close instead
311: close();
312: }
313:
314: public int length() {
315: return this .content.limit();
316: }
317:
318: public char charAt(int index) {
319: return this .content.get(index);
320: }
321:
322: public CharSequence subSequence(int start, int end) {
323: return new CharSubSequence(this , start, end);
324: }
325:
326: public String toString() {
327: StringBuffer sb = new StringBuffer(length());
328: // could use StringBuffer.append(CharSequence) if willing to do 1.5 & up
329: for (int i = 0; i < length(); i++) {
330: sb.append(charAt(i));
331: }
332: return sb.toString();
333: }
334: }
|