001: package it.unimi.dsi.fastutil.io;
002:
003: /*
004: * fastutil: Fast & compact type-specific collections for Java
005: *
006: * Copyright (C) 2005-2008 Sebastiano Vigna
007: *
008: * This library is free software; you can redistribute it and/or
009: * modify it under the terms of the GNU Lesser General Public
010: * License as published by the Free Software Foundation; either
011: * version 2.1 of the License, or (at your option) any later version.
012: *
013: * This library is distributed in the hope that it will be useful,
014: * but WITHOUT ANY WARRANTY; without even the implied warranty of
015: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
016: * Lesser General Public License for more details.
017: *
018: * You should have received a copy of the GNU Lesser General Public
019: * License along with this library; if not, write to the Free Software
020: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
021: *
022: */
023:
024: import it.unimi.dsi.fastutil.bytes.ByteArrays;
025: import it.unimi.dsi.fastutil.io.MeasurableInputStream;
026: import it.unimi.dsi.fastutil.io.RepositionableStream;
027:
028: import java.io.IOException;
029: import java.io.InputStream;
030: import java.nio.channels.FileChannel;
031: import java.util.EnumSet;
032:
033: /** Lightweight, unsynchronized, aligned input stream buffering class with
034: * {@linkplain #skip(long) true skipping},
035: * {@linkplain MeasurableInputStream measurability},
036: * {@linkplain RepositionableStream repositionability}
037: * and {@linkplain #readLine(byte[], int, int, EnumSet) line reading} support.
038: *
039: * <P>This class provides buffering for input streams, but it does so with
040: * purposes and an internal logic that are radically different from the ones
041: * adopted in {@link java.io.BufferedInputStream}. The main features follow.
042: *
043: * <ul>
044: * <li><P>There is no support for marking. All methods are unsychronized.
045: *
046: * <li><P>As an additional feature, this class implements the {@link
047: * RepositionableStream} and {@link MeasurableInputStream} interfaces.
048: * An instance of this class will try to cast
049: * the underlying byte stream to a {@link RepositionableStream} and to fetch by
050: * reflection the {@link java.nio.channels.FileChannel} underlying the given
051: * output stream, in this order. If either reference can be successfully
052: * fetched, you can use {@link #position(long)} to reposition the stream.
053: * Much in the same way, an instance of this class will try to cast the
054: * the underlying byte stream to a {@link MeasurableInputStream}, and if this
055: * operation is successful, or if a {@link java.nio.channels.FileChannel} can
056: * be detected, then {@link #position()} and {@link #length()} will work as expected.
057: *
058: *
059: * <li><p>Due to erratic and unpredictable behaviour of {@link InputStream#skip(long)},
060: * which does not correspond to its specification and which Sun refuses to fix
061: * (see <a href="http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6222822">bug 6222822</a>;
062: * don't be fooled by the “closed, fixed” label),
063: * this class peeks at the underlying stream and if it is {@link System#in} it uses
064: * repeated reads instead of calling {@link InputStream#skip(long)} on the underlying stream; moreover,
065: * skips and reads are tried alternately, so to guarantee that skipping
066: * less bytes than requested can be caused only by reaching the end of file.
067: *
068: * <li><p>This class keeps also track of the number of bytes read so far, so
069: * to be able to implemented {@link MeasurableInputStream#position()}
070: * independently of underlying input stream.
071: *
072: * <li><p>This class has limited support for
073: * {@linkplain #readLine(byte[], int, int, EnumSet) “reading a line”}
074: * (whatever that means) from the underlying input stream. You can choose the set of
075: * {@linkplain FastBufferedInputStream.LineTerminator line terminators} that
076: * delimit lines.
077: *
078: * </ul>
079: * @since 4.4
080: */
081:
082: public class FastBufferedInputStream extends MeasurableInputStream
083: implements RepositionableStream {
084:
085: /** The default size of the internal buffer in bytes (8Ki). */
086: public final static int DEFAULT_BUFFER_SIZE = 8 * 1024;
087:
088: /** An enumeration of the supported line terminators. */
089: public static enum LineTerminator {
090: /** A carriage return (CR, ASCII 13). */
091: CR,
092: /** A line feed (LF, ASCII 10). */
093: LF,
094: /** A carriage return followed by a line feed (CR/LF, ASCII 13/10). */
095: CR_LF
096: }
097:
098: /** A set containing <em>all available</em> line terminators. */
099: public final static EnumSet<LineTerminator> ALL_TERMINATORS = EnumSet
100: .allOf(LineTerminator.class);
101:
102: /** The underlying input stream. */
103: protected InputStream is;
104:
105: /** The internal buffer. */
106: protected byte buffer[];
107:
108: /** The current position in the buffer. */
109: protected int pos;
110:
111: /** The number of bytes ever read (reset upon a call to {@link #position(long)}).
112: * In particular, this will always represent the index (in the underlying input stream)
113: * of the first available byte in the buffer. */
114: protected long readBytes;
115:
116: /** The number of buffer bytes available starting from {@link #pos}. */
117: protected int avail;
118:
119: /** The cached file channel underlying {@link #is}, if any. */
120: private FileChannel fileChannel;
121:
122: /** {@link #is} cast to a positionable stream, if possible. */
123: private RepositionableStream rs;
124:
125: /** {@link #is} cast to a measurable input stream, if possible. */
126: private MeasurableInputStream ms;
127:
128: /** Creates a new fast buffered input stream by wrapping a given input stream with a given buffer size.
129: *
130: * @param is an input stream to wrap.
131: * @param bufSize the size in bytes of the internal buffer (greater than zero).
132: */
133:
134: public FastBufferedInputStream(final InputStream is,
135: final int bufSize) {
136: if (bufSize <= 0)
137: throw new IllegalArgumentException("Illegal buffer size: "
138: + bufSize);
139: this .is = is;
140: buffer = new byte[bufSize];
141:
142: if (is instanceof RepositionableStream)
143: rs = (RepositionableStream) is;
144: if (is instanceof MeasurableInputStream)
145: ms = (MeasurableInputStream) is;
146:
147: if (rs == null) {
148:
149: try {
150: fileChannel = (FileChannel) (is.getClass().getMethod(
151: "getChannel", new Class[] {})).invoke(is,
152: new Object[] {});
153: } catch (IllegalAccessException e) {
154: } catch (IllegalArgumentException e) {
155: } catch (NoSuchMethodException e) {
156: } catch (java.lang.reflect.InvocationTargetException e) {
157: } catch (ClassCastException e) {
158: }
159: }
160: }
161:
162: /** Creates a new fast buffered input stream by wrapping a given input stream with a buffer of {@link #DEFAULT_BUFFER_SIZE} bytes.
163: *
164: * @param is an input stream to wrap.
165: */
166: public FastBufferedInputStream(final InputStream is) {
167: this (is, DEFAULT_BUFFER_SIZE);
168: }
169:
170: /** Checks whether no more bytes will be returned.
171: *
172: * <p>This method will refill the internal buffer.
173: *
174: * @return true if there are no characters in the internal buffer and
175: * the underlying reader is exhausted.
176: */
177:
178: protected boolean noMoreCharacters() throws IOException {
179: if (avail == 0) {
180: avail = is.read(buffer);
181: if (avail <= 0) {
182: avail = 0;
183: return true;
184: }
185: pos = 0;
186: }
187: return false;
188: }
189:
190: public int read() throws IOException {
191: if (noMoreCharacters())
192: return -1;
193: avail--;
194: readBytes++;
195: return buffer[pos++] & 0xFF;
196: }
197:
198: public int read(final byte b[], final int offset, final int length)
199: throws IOException {
200: if (length <= avail) {
201: System.arraycopy(buffer, pos, b, offset, length);
202: pos += length;
203: avail -= length;
204: readBytes += length;
205: return length;
206: }
207:
208: final int head = avail;
209:
210: System.arraycopy(buffer, pos, b, offset, head);
211: pos = avail = 0;
212: readBytes += head;
213:
214: if (length > buffer.length) {
215: // We read directly into the destination
216: final int result = is.read(b, offset + head, length - head);
217: if (result > 0)
218: readBytes += result;
219: return result < 0 ? (head == 0 ? -1 : head) : result + head;
220: }
221:
222: if (noMoreCharacters())
223: return head == 0 ? -1 : head;
224:
225: final int toRead = Math.min(length - head, avail);
226: readBytes += toRead;
227: System.arraycopy(buffer, 0, b, offset + head, toRead);
228: pos = toRead;
229: avail -= toRead;
230:
231: // Note that head >= 0, and necessarily toRead > 0
232: return toRead + head;
233: }
234:
235: /** Reads a line into the given byte array using {@linkplain #ALL_TERMINATORS all terminators}.
236: *
237: * @param array byte array where the next line will be stored.
238: * @return the number of bytes actually placed in <code>array</code>, or -1 at end of file.
239: * @see #readLine(byte[], int, int, EnumSet)
240: */
241:
242: public int readLine(final byte[] array) throws IOException {
243: return readLine(array, 0, array.length, ALL_TERMINATORS);
244: }
245:
246: /** Reads a line into the given byte array.
247: *
248: * @param array byte array where the next line will be stored.
249: * @param terminators a set containing the line termination sequences that we want
250: * to consider as valid.
251: * @return the number of bytes actually placed in <code>array</code>, or -1 at end of file.
252: * @see #readLine(byte[], int, int, EnumSet)
253: */
254:
255: public int readLine(final byte[] array,
256: final EnumSet<LineTerminator> terminators)
257: throws IOException {
258: return readLine(array, 0, array.length, terminators);
259: }
260:
261: /** Reads a line into the given byte-array fragment using {@linkplain #ALL_TERMINATORS all terminators}.
262: *
263: * @param array byte array where the next line will be stored.
264: * @param off the first byte to use in <code>array</code>.
265: * @param len the maximum number of bytes to read.
266: * @return the number of bytes actually placed in <code>array</code>, or -1 at end of file.
267: * @see #readLine(byte[], int, int, EnumSet)
268: */
269: public int readLine(final byte[] array, final int off, final int len)
270: throws IOException {
271: return readLine(array, off, len, ALL_TERMINATORS);
272: }
273:
274: /** Reads a line into the given byte-array fragment.
275: *
276: * <P>Reading lines (i.e., characters) out of a byte stream is not always sensible
277: * (methods available to that purpose in old versions of Java have been mercilessly deprecated).
278: * Nonetheless, in several situations, such as when decoding network protocols or headers
279: * known to be ASCII, it is very useful to be able to read a line from a byte stream.
280: *
281: * <p>This method will attempt to read the next line into <code>array</code> starting at <code>off</code>,
282: * reading at most <code>len</code> bytes. The read, however, will be stopped by the end of file or
283: * when meeting a {@linkplain LineTerminator <em>line terminator</em>}. Of course, for this operation
284: * to be sensible the encoding of the text contained in the stream, if any, must not generate spurious
285: * carriage returns or line feeds. Note that the termination detection uses a maximisation
286: * criterion, so if you specify both {@link LineTerminator#CR} and
287: * {@link LineTerminator#CR_LF} meeting a pair CR/LF will consider the whole pair a terminator.
288: *
289: * <p>Terminators are <em>not</em> copied into <em>array</em> or included in the returned count. The
290: * returned integer can be used to check whether the line is complete: if it is smaller than
291: * <code>len</code>, then more bytes might be available, but note that this method (contrarily
292: * to {@link #read(byte[], int, int)}) can legitimately return zero when <code>len</code>
293: * is nonzero just because a terminator was found as the first character. Thus, the intended
294: * usage of this method is to call it on a given array, check whether <code>len</code> bytes
295: * have been read, and if so try again (possibly extending the array) until a number of read bytes
296: * strictly smaller than <code>len</code> (possibly, -1) is returned.
297: *
298: * <p>If you need to guarantee that a full line is read, use the following idiom:
299: * <pre>
300: * int start = off, len;
301: * while( ( len = readLine( array, start, array.length - start, terminators ) ) == array.length - start ) {
302: * start += len;
303: * array = ByteArrays.grow( array, array.length + 1 );
304: * };
305: * </pre>
306: *
307: * <p>At the end of the loop, the line will be placed in <code>array</code> starting at
308: * <code>off</code> (inclusive) and ending at <code>start + Math.max( len, 0 )</code> (exclusive).
309: *
310: * @param array byte array where the next line will be stored.
311: * @param off the first byte to use in <code>array</code>.
312: * @param len the maximum number of bytes to read.
313: * @param terminators a set containing the line termination sequences that we want
314: * to consider as valid.
315: * @return the number of bytes actually placed in <code>array</code>, or -1 at end of file.
316: * Note that the returned number will be <code>len</code> if no line termination sequence
317: * specified in <code>terminators</code> has been met before scanning <code>len</code> byte,
318: * and if also we did not meet the end of file.
319: */
320:
321: public int readLine(final byte[] array, final int off,
322: final int len, final EnumSet<LineTerminator> terminators)
323: throws IOException {
324: ByteArrays.ensureOffsetLength(array, off, len);
325: if (len == 0)
326: return 0; // 0-length reads always return 0
327: if (noMoreCharacters())
328: return -1;
329: int i, k = 0, remaining = len, read = 0; // The number of bytes still to be read
330: for (;;) {
331: for (i = 0; i < avail && i < remaining
332: && (k = buffer[pos + i]) != '\n' && k != '\r'; i++)
333: ;
334: System.arraycopy(buffer, pos, array, off + read, i);
335: pos += i;
336: avail -= i;
337: read += i;
338: remaining -= i;
339: if (remaining == 0) {
340: readBytes += read;
341: return read; // We did not stop because of a terminator
342: }
343:
344: if (avail > 0) { // We met a terminator
345: if (k == '\n') { // LF first
346: pos++;
347: avail--;
348: if (terminators.contains(LineTerminator.LF)) {
349: readBytes += read + 1;
350: return read;
351: } else {
352: array[off + read++] = '\n';
353: remaining--;
354: }
355: } else if (k == '\r') { // CR first
356: pos++;
357: avail--;
358:
359: if (terminators.contains(LineTerminator.CR_LF)) {
360: if (avail > 0) {
361: if (buffer[pos] == '\n') { // CR/LF with LF already in the buffer.
362: pos++;
363: avail--;
364: readBytes += read + 2;
365: return read;
366: }
367: } else { // We must search for the LF.
368: if (noMoreCharacters()) {
369: // Not found a matching LF because of end of file, will return CR in buffer if not a terminator
370:
371: if (!terminators
372: .contains(LineTerminator.CR)) {
373: array[off + read++] = '\r';
374: remaining--;
375: readBytes += read;
376: } else
377: readBytes += read + 1;
378:
379: return read;
380: }
381: if (buffer[0] == '\n') {
382: // Found matching LF, won't return terminators in the buffer
383: pos++;
384: avail--;
385: readBytes += read + 2;
386: return read;
387: }
388: }
389: }
390:
391: if (terminators.contains(LineTerminator.CR)) {
392: readBytes += read + 1;
393: return read;
394: }
395:
396: array[off + read++] = '\r';
397: remaining--;
398: }
399: } else if (noMoreCharacters()) {
400: readBytes += read;
401: return read;
402: }
403: }
404: }
405:
406: public void position(long newPosition) throws IOException {
407:
408: final long position = readBytes;
409:
410: /** Note that this check will succeed also in the case of
411: * an empty buffer and position == newPosition. This behaviour is
412: * intentional, as it delays buffering to when it is actually
413: * necessary and avoids useless class the underlying stream. */
414:
415: if (newPosition <= position + avail
416: && newPosition >= position - pos) {
417: pos += newPosition - position;
418: avail -= newPosition - position;
419: readBytes = newPosition;
420: return;
421: }
422:
423: if (rs != null)
424: rs.position(newPosition);
425: else if (fileChannel != null)
426: fileChannel.position(newPosition);
427: else
428: throw new UnsupportedOperationException(
429: "position() can only be called if the underlying byte stream implements the RepositionableStream interface or if the getChannel() method of the underlying byte stream exists and returns a FileChannel");
430: readBytes = newPosition;
431:
432: avail = pos = 0;
433: }
434:
435: public long position() throws IOException {
436: return readBytes;
437: }
438:
439: /** Returns the length of the underlying input stream, if it is {@linkplain MeasurableInputStream measurable}.
440: *
441: * @return the length of the underlying input stream.
442: * @throws UnsupportedOperationException if the underlying input stream is not {@linkplain MeasurableInputStream measurable}.
443: */
444:
445: public long length() throws IOException {
446: if (ms != null)
447: return ms.length();
448: if (fileChannel != null)
449: return fileChannel.size();
450: throw new UnsupportedOperationException();
451: }
452:
453: /** Skips the given amount of bytes by repeated reads.
454: *
455: * <strong>Warning</strong>: this method uses destructively the internal buffer.
456: *
457: * @param n the number of bytes to skip.
458: * @return the number of bytes actually skipped.
459: * @see InputStream#skip(long)
460: */
461:
462: private long skipByReading(final long n) throws IOException {
463: long toSkip = n;
464: int len;
465: while (toSkip > 0) {
466: len = is.read(buffer, 0, (int) Math.min(buffer.length,
467: toSkip));
468: if (len > 0)
469: toSkip -= len;
470: else
471: break;
472: }
473:
474: return n - toSkip;
475: }
476:
477: /** Skips over and discards the given number of bytes of data from this fast buffered input stream.
478: *
479: * <p>As explained in the {@linkplain FastBufferedInputStream class documentation}, the semantics
480: * of {@link InputStream#skip(long)} is fatally flawed. This method provides additional semantics as follows:
481: * it will skip the provided number of bytes, unless the end of file has been reached.
482: *
483: * <p>Additionally, if the underlying input stream is {@link System#in} this method will use
484: * repeated reads instead of invoking {@link InputStream#skip(long)}.
485: *
486: * @param n the number of bytes to skip.
487: * @return the number of bytes actually skipped; it can be smaller than <code>n</code>
488: * only if the end of file has been reached.
489: * @see InputStream#skip(long)
490: */
491:
492: public long skip(final long n) throws IOException {
493: if (n <= avail) {
494: final int m = (int) n;
495: pos += m;
496: avail -= m;
497: readBytes += n;
498: return n;
499: }
500:
501: long toSkip = n - avail, result = 0;
502: avail = 0;
503:
504: while (toSkip != 0
505: && (result = is == System.in ? skipByReading(toSkip)
506: : is.skip(toSkip)) < toSkip) {
507: if (result == 0) {
508: if (is.read() == -1)
509: break;
510: toSkip--;
511: } else
512: toSkip -= result;
513: }
514:
515: final long t = n - (toSkip - result);
516: readBytes += t;
517: return t;
518: }
519:
520: public int available() throws IOException {
521: return (int) Math.min(is.available() + (long) avail,
522: Integer.MAX_VALUE);
523: }
524:
525: public void close() throws IOException {
526: if (is == null)
527: return;
528: if (is != System.in)
529: is.close();
530: is = null;
531: buffer = null;
532: }
533:
534: /** Resets the internal logic of this fast buffered input stream, clearing the buffer.
535: *
536: * <p>All buffering information is discarded, and the number of bytes read so far
537: * (and thus, also the {@linkplain #position() current position})
538: * is adjusted to reflect this fact.
539: *
540: * <p>This method is mainly useful for re-reading
541: * files that have been overwritten externally.
542: */
543:
544: public void flush() {
545: if (is == null)
546: return;
547: readBytes += avail;
548: avail = pos = 0;
549: }
550:
551: /** Resets the internal logic of this fast buffered input stream.
552: *
553: * @deprecated As of <samp>fastutil</samp> 5.0.4, replaced by {@link #flush()}. The old
554: * semantics of this method does not contradict {@link InputStream}'s contract, as
555: * the semantics of {@link #reset()} is undefined if {@link InputStream#markSupported()}
556: * returns false. On the other hand, the name was really a poor choice.
557: */
558: @Deprecated
559: public void reset() {
560: flush();
561: }
562: }
|