001: /* PieceReader
002: *
003: * Created on September 12, 2006
004: *
005: * Copyright (C) 2006 Internet Archive.
006: *
007: * This file is part of the Heritrix web crawler (crawler.archive.org).
008: *
009: * Heritrix is free software; you can redistribute it and/or modify
010: * it under the terms of the GNU Lesser Public License as published by
011: * the Free Software Foundation; either version 2.1 of the License, or
012: * any later version.
013: *
014: * Heritrix is distributed in the hope that it will be useful,
015: * but WITHOUT ANY WARRANTY; without even the implied warranty of
016: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
017: * GNU Lesser Public License for more details.
018: *
019: * You should have received a copy of the GNU Lesser Public License
020: * along with Heritrix; if not, write to the Free Software
021: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
022: */
023: package org.archive.util.ms;
024:
025: import java.io.IOException;
026:
027: import org.archive.io.Endian;
028: import org.archive.io.SeekInputStream;
029: import org.archive.io.SeekReader;
030:
031: class PieceReader extends SeekReader {
032:
033: private PieceTable table;
034: private SeekInputStream doc;
035:
036: private boolean unicode;
037: private int charPos;
038: private int limit;
039:
040: public PieceReader(PieceTable table, SeekInputStream doc)
041: throws IOException {
042: this .table = table;
043: this .doc = doc;
044: charPos = 0;
045: limit = -1;
046: }
047:
048: private void seekIfNecessary() throws IOException {
049: if (doc == null) {
050: throw new IOException("Stream closed.");
051: }
052: if (charPos >= table.getMaxCharPos()) {
053: return;
054: }
055: if (charPos < limit) {
056: return;
057: }
058: Piece piece = table.next();
059: unicode = piece.isUnicode();
060: limit = piece.getCharPosLimit();
061: doc.position(piece.getFilePos());
062: }
063:
064: public int read() throws IOException {
065: seekIfNecessary();
066: if (doc == null) {
067: throw new IOException("Stream closed.");
068: }
069: if (charPos >= table.getMaxCharPos()) {
070: return -1;
071: }
072:
073: int ch;
074: if (unicode) {
075: ch = Endian.littleChar(doc);
076: } else {
077: ch = Cp1252.decode(doc.read());
078: }
079: charPos++;
080: return ch;
081: }
082:
083: public int read(char[] buf, int ofs, int len) throws IOException {
084: // FIXME: Think of a faster implementation that will work with
085: // both unicode and non-unicode.
086: seekIfNecessary();
087: if (doc == null) {
088: throw new IOException("Stream closed.");
089: }
090: if (charPos >= table.getMaxCharPos()) {
091: return 0;
092: }
093: for (int i = 0; i < len; i++) {
094: int ch = read();
095: if (ch < 0) {
096: return i;
097: }
098: buf[ofs + i] = (char) ch;
099: }
100: return len;
101: }
102:
103: public void close() throws IOException {
104: doc.close();
105: table = null;
106: }
107:
108: public long position() throws IOException {
109: return charPos;
110: }
111:
112: public void position(long p) throws IOException {
113: if (p > Integer.MAX_VALUE) {
114: throw new IOException("File too large.");
115: }
116: int charPos = (int) p;
117: Piece piece = table.pieceFor(charPos);
118: if (piece == null) {
119: throw new IOException("Illegal position: " + p);
120: }
121: unicode = piece.isUnicode();
122: limit = piece.getCharPosLimit();
123:
124: int ofs = charPos - piece.getCharPosStart();
125: this.charPos = charPos;
126: doc.position(piece.getFilePos() + ofs);
127: }
128: }
|