/* PieceTable
 *
 * Created on September 12, 2006
 *
 * Copyright (C) 2006 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
package org.archive.util.ms;

import java.io.IOException;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.archive.io.BufferedSeekInputStream;
import org.archive.io.Endian;
import org.archive.io.OriginSeekInputStream;
import org.archive.io.SafeSeekInputStream;
import org.archive.io.SeekInputStream;

/**
 * The piece table of a .doc file.
 *
 * <p>The piece table maps logical character positions of a document's text
 * stream to actual file stream positions. The piece table is stored as two
 * parallel arrays. The first array contains 32-bit integers representing
 * the logical character positions. The second array contains 64-bit data
 * structures that are mostly mysterious to me, except that they contain a
 * 32-bit subfile offset. The second array is stored immediately after the
 * first array. I call the first array the <i>charPos</i> array and the
 * second array the <i>filePos</i> array.
 *
 * <p>The arrays are preceded by a special tag byte (2), followed by a
 * 32-bit integer giving the combined size of both arrays in bytes. The
 * number of piece table entries must be deduced from this byte size.
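 *
 * <p>Concretely, the constructor recovers the entry count from that byte
 * size. One consistent reading of the layout (and the one this arithmetic
 * assumes) is that the charPos array holds one more 32-bit entry than there
 * are pieces, so every piece has both a starting and an ending character
 * position:
 *
 * <pre>
 *   // sizeInBytes = 4 * (count + 1) + 8 * count = 12 * count + 4
 *   int count = (sizeInBytes - 4) / 12;
 * </pre>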
 *
 * <p>Because of this bizarre structure, caching piece table entries is
 * something of a challenge. A single piece table entry is actually located
 * in two different file locations. If there are many piece table entries,
 * then the charPos and filePos information may be separated by many bytes,
 * potentially crossing block boundaries. The approach I took was to use
 * two different buffered streams. Up to n charPos offsets and n filePos
 * structures can be buffered in the two streams, preventing any file seeking
 * from occurring when looking up piece information. (File seeking must
 * still occur to jump from one piece to the next.)
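 *
 * <p>In code terms, each of those two streams is produced by the private
 * wrap method below, which stacks the stream wrappers like so (shown here
 * only as a sketch of the call chain):
 *
 * <pre>
 *   new BufferedSeekInputStream(
 *       new OriginSeekInputStream(
 *           new SafeSeekInputStream(input), pos), cache);
 * </pre>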
 *
 * <p>Note that the vast majority of .doc files in the world will have exactly
 * 1 piece table entry, representing the complete text of the document. Only
 * those documents that were "fast-saved" should have multiple pieces.
 *
 * <p>Finally, the text contained in a .doc file can either contain 16-bit
 * unicode characters (charset UTF-16LE) or 8-bit CP1252 characters. One
 * .doc file can contain both kinds of pieces. Whether or not a piece is
 * Cp1252 is stored as a flag in the filePos value, bizarrely enough. If
 * the flag is set, then the actual file position is the filePos with the
 * flag cleared, then divided by 2.
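 *
 * <p>A sketch of that decoding, using the constants defined on this class
 * (<i>encoded</i> and <i>actualFilePos</i> are illustrative names for the
 * raw 32-bit value read from the filePos array and the decoded position):
 *
 * <pre>
 *   if ((encoded &amp; CP1252_INDICATOR) == 0) {
 *       // unicode piece: use the encoded value as the file position
 *   } else {
 *       // Cp1252 piece: clear the flag bits, then divide by 2
 *       int actualFilePos = (encoded &amp; CP1252_MASK) / 2;
 *   }
 * </pre>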
 *
 * @author pjack
 */
class PieceTable {

    final static Logger LOGGER =
            Logger.getLogger(PieceTable.class.getName());

    /** The bit that indicates if a piece uses Cp1252 or unicode. */
    final static int CP1252_INDICATOR = 1 << 30;

    /** The mask to use to clear the Cp1252 flag bit. */
    final static int CP1252_MASK = ~(3 << 30);

    /** The total number of pieces in the table. */
    private int count;

    /** The total number of characters in the text stream. */
    private int maxCharPos;

    /** The index of the current piece. */
    private int current;

    /** The most recently returned piece from this table. */
    private Piece currentPiece;

    /** The buffered stream that provides character position information. */
    private SeekInputStream charPos;

    /** The buffered stream that provides file pointer information. */
    private SeekInputStream filePos;

    /**
     * Constructor.
     *
     * @param tableStream the stream containing the piece table
     * @param offset the starting offset of the piece table
     * @param maxCharPos the total number of characters in the document
     * @param cachedRecords the number of piece table entries to cache
     * @throws IOException if an IO error occurs
     */
    public PieceTable(SeekInputStream tableStream, int offset,
            int maxCharPos, int cachedRecords) throws IOException {
        tableStream.position(offset);
        skipProperties(tableStream);
        int sizeInBytes = Endian.littleInt(tableStream);
        this.count = (sizeInBytes - 4) / 12;
        cachedRecords = Math.min(cachedRecords, count);
        // Skip the first charPos entry: next() treats the first piece as
        // starting at character 0 and only reads each piece's ending
        // position from the array.
        long tp = tableStream.position() + 4;
        long charPosStart = tp;
        // The filePos array follows the remaining charPos entries; the extra
        // 2 bytes place each 8-byte read so that the 32-bit subfile offset
        // falls at the start of the read in next().
        long filePosStart = tp + count * 4 + 2;

        this.filePos = wrap(tableStream, filePosStart, cachedRecords * 8);
        this.charPos = wrap(tableStream, charPosStart, cachedRecords * 4);
        this.maxCharPos = maxCharPos;

        if (LOGGER.isLoggable(Level.FINEST)) {
            LOGGER.finest("Size in bytes: " + sizeInBytes);
            LOGGER.finest("Piece table count: " + count);
            for (Piece piece = next(); piece != null; piece = next()) {
                LOGGER.finest("#" + current + ": " + piece.toString());
            }
            current = 0;
        }
    }

    /**
     * Wraps the raw table stream. This is used to create the charPos and
     * filePos streams. The streams that this method returns are "safe",
     * meaning that the charPos and filePos position() fields never clobber
     * each other. They are buffered, meaning that up to <i>n</i> elements
     * can be read before the disk is accessed again. And they are "origined",
     * meaning result.position(0) actually positions the stream at the
     * beginning of the piece table array, not the beginning of the file.
     *
     * @param input the stream to wrap
     * @param pos the origin for the returned stream
     * @param cache the number of bytes for the returned stream to buffer
     * @return the wrapped stream
     * @throws IOException if an IO error occurs
     */
    private SeekInputStream wrap(SeekInputStream input, long pos,
            int cache) throws IOException {
        input.position(pos);
        SeekInputStream r = new SafeSeekInputStream(input);
        r = new OriginSeekInputStream(r, pos);
        r = new BufferedSeekInputStream(r, cache);
        return r;
    }

    /**
     * Skips over any property information that may precede a piece table.
     * These property structures contain stylesheet information that applies
     * to the piece table. Since we're only interested in the text itself,
     * we just ignore this property stuff. (I suppose a third buffered
     * stream could be used to add style information to {@link Piece}, but
     * we don't need it.)
     *
     * @param input the input stream containing the piece table
     * @throws IOException if an IO error occurs
     */
    private static void skipProperties(SeekInputStream input)
            throws IOException {
        int tag = input.read();
        while (tag == 1) {
            int size = Endian.littleChar(input);
            while (size > 0) {
                size -= input.skip(size);
            }
            tag = input.read();
        }
        if (tag != 2) {
            throw new IllegalStateException(
                    "Expected piece table tag byte (2), found: " + tag);
        }
    }

    /**
     * Returns the maximum character position. Put another way, returns the
     * total number of characters in the document.
     *
     * @return the maximum character position
     */
    public int getMaxCharPos() {
        return maxCharPos;
    }

    /**
     * Returns the next piece in the piece table.
     *
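     * <p>A typical traversal, mirroring the logging loop in the constructor
     * (a usage sketch; <i>table</i> is an illustrative name for a PieceTable
     * instance):
     *
     * <pre>
     *   for (Piece piece = table.next(); piece != null; piece = table.next()) {
     *       // process each piece in document order
     *   }
     * </pre>
     *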
     * @return the next piece in the piece table, or null if there is no
     *   next piece
     * @throws IOException if an IO error occurs
     */
    public Piece next() throws IOException {
        if (current >= count) {
            currentPiece = null;
            return null;
        }

        // The last piece ends at maxCharPos; every other piece's ending
        // character position is read from the charPos array.
        int cp;
        if (current == count - 1) {
            cp = maxCharPos;
        } else {
            charPos.position(current * 4);
            cp = Endian.littleInt(charPos);
        }
        filePos.position(current * 8);
        int encoded = Endian.littleInt(filePos);

        if (LOGGER.isLoggable(Level.FINEST)) {
            StringBuffer sb =
                    new StringBuffer(Integer.toBinaryString(encoded));
            while (sb.length() < 32) {
                sb.insert(0, '0');
            }
            LOGGER.finest("Encoded offset: " + sb.toString());
        }

        current++;

        // A piece starts where the previous piece ended.
        int start;
        if (currentPiece == null) {
            start = 0;
        } else {
            start = currentPiece.getCharPosLimit();
        }
        if ((encoded & CP1252_INDICATOR) == 0) {
            // Flag clear: a unicode piece; the encoded value is the file position.
            Piece piece = new Piece(encoded, start, cp, true);
            currentPiece = piece;
            return piece;
        } else {
            // Flag set: a Cp1252 piece; clear the flag bits, then halve.
            int filePos = (encoded & CP1252_MASK) / 2;
            Piece piece = new Piece(filePos, start, cp, false);
            currentPiece = piece;
            return piece;
        }
    }

    /**
     * Returns the piece containing the given character position.
     *
     * @param charPos the character position whose piece to return
     * @return that piece, or null if no such piece exists (if charPos
     *   is greater than getMaxCharPos())
     * @throws IOException if an IO error occurs
     */
    public Piece pieceFor(int charPos) throws IOException {
        // Fast path: the piece most recently returned by next() is usually
        // the one that is wanted.
        if (currentPiece != null && currentPiece.contains(charPos)) {
            return currentPiece;
        }

        // FIXME: Use binary search to find piece index

        // Otherwise rewind and scan the table from the beginning.
        current = 0;
        currentPiece = null;
        next();

        while (currentPiece != null) {
            if (currentPiece.contains(charPos)) {
                return currentPiece;
            }
            next();
        }

        return null;
    }

}