001: /* Doc
002: *
003: * Created on September 12, 2006
004: *
005: * Copyright (C) 2006 Internet Archive.
006: *
007: * This file is part of the Heritrix web crawler (crawler.archive.org).
008: *
009: * Heritrix is free software; you can redistribute it and/or modify
010: * it under the terms of the GNU Lesser Public License as published by
011: * the Free Software Foundation; either version 2.1 of the License, or
012: * any later version.
013: *
014: * Heritrix is distributed in the hope that it will be useful,
015: * but WITHOUT ANY WARRANTY; without even the implied warranty of
016: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
017: * GNU Lesser Public License for more details.
018: *
019: * You should have received a copy of the GNU Lesser Public License
020: * along with Heritrix; if not, write to the Free Software
021: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
022: */
023: package org.archive.util.ms;
024:
025: import java.io.File;
026: import java.io.IOException;
027: import java.io.RandomAccessFile;
028: import java.util.List;
029: import java.util.logging.Level;
030: import java.util.logging.Logger;
031:
032: import org.archive.io.Endian;
033: import org.archive.io.RandomAccessInputStream;
034: import org.archive.io.SeekInputStream;
035: import org.archive.io.SeekReader;
036:
037: /**
038: * Reads .doc files.
039: *
040: * @author pjack
041: */
042: public class Doc {
043:
044: final private static Logger LOGGER = Logger.getLogger(Doc.class
045: .getName());
046:
047: /**
048: * Static utility library, do not instantiate.
049: */
050: private Doc() {
051: }
052:
053: /**
054: * Returns the text of the .doc file with the given file name.
055: *
056: * @param docFilename the name of the file whose text to return
057: * @return the text of that file
058: * @throws IOException if an IO error occurs
059: */
060: public static SeekReader getText(String docFilename)
061: throws IOException {
062: return getText(new File(docFilename));
063: }
064:
065: /**
066: * Returns the text of the given .doc file.
067: *
068: * @param doc the .doc file whose text to return
069: * @return the text of that file
070: * @throws IOException if an IO error occurs
071: */
072: public static SeekReader getText(File doc) throws IOException {
073: RandomAccessFile raf = new RandomAccessFile(doc, "r");
074: RandomAccessInputStream rais = new RandomAccessInputStream(raf);
075: return getText(rais);
076: }
077:
078: /**
079: * Returns the text of the given .doc file.
080: *
081: * @param doc the .doc file whose text to return
082: * @return the text of that file
083: * @throws IOException if an IO error occurs
084: */
085: public static SeekReader getText(SeekInputStream doc)
086: throws IOException {
087: BlockFileSystem bfs = new DefaultBlockFileSystem(doc, 16);
088: return getText(bfs, 20);
089: }
090:
091: /**
092: * Returns the text for the given .doc file. The given cacheSize refers
093: * to the number of the .doc file's piece table entries to cache. Most
094: * .doc files only have 1 piece table entry; however, a "fast-saved"
095: * .doc file might have several. A cacheSize of 20 should be ample for
096: * most .doc files in the world. Since piece table entries are small --
097: * only 12 bytes each -- caching them prevents many otherwise necessary
098: * file pointer repositionings.
099: *
100: * @param wordDoc the .doc file as a BlockFileSystem
101: * @param cacheSize the number of piece table entries to cache
102: * @return a reader that will return the text in the file
103: * @throws IOException if an IO error occurs
104: */
105: public static SeekReader getText(BlockFileSystem wordDoc,
106: int cacheSize) throws IOException {
107: List<Entry> entries = wordDoc.getRoot().list();
108: Entry main = find(entries, "WordDocument");
109: SeekInputStream mainStream = main.open();
110:
111: mainStream.position(10);
112: int flags = Endian.littleChar(mainStream);
113: boolean complex = (flags & 0x0004) == 0x0004;
114: boolean tableOne = (flags & 0x0200) == 0x0200;
115: String tableName = tableOne ? "1Table" : "0Table";
116: Entry table = find(entries, tableName);
117: if (LOGGER.isLoggable(Level.FINEST)) {
118: LOGGER.finest("Main entry: " + main);
119: LOGGER.finest("Table entry: " + table);
120: }
121: SeekInputStream tableStream = table.open();
122:
123: mainStream.position(24);
124: int fcMin = Endian.littleInt(mainStream);
125: int fcMax = Endian.littleInt(mainStream);
126:
127: mainStream.position(76);
128: int cppText = Endian.littleInt(mainStream);
129:
130: mainStream.position(418);
131: int fcClx = Endian.littleInt(mainStream);
132: int fcSz = Endian.littleInt(mainStream);
133:
134: if (LOGGER.isLoggable(Level.FINE)) {
135: LOGGER.fine("fcMin: " + fcMin);
136: LOGGER.fine("fcMax: " + fcMax);
137: LOGGER.fine("FcClx: " + fcClx);
138: LOGGER.fine("szClx: " + fcSz);
139: LOGGER.fine("complex: " + complex);
140: LOGGER.fine("cppText: " + cppText);
141: }
142: PieceTable pt = new PieceTable(tableStream, fcClx, fcMax
143: - fcMin, cacheSize);
144: return new PieceReader(pt, mainStream);
145: }
146:
147: private static Entry find(List<Entry> entries, String name) {
148: for (Entry e : entries) {
149: if (e.getName().equals(name)) {
150: return e;
151: }
152: }
153: return null;
154: }
155:
156: }
|