001: /* ====================================================================
002: Licensed to the Apache Software Foundation (ASF) under one or more
003: contributor license agreements. See the NOTICE file distributed with
004: this work for additional information regarding copyright ownership.
005: The ASF licenses this file to You under the Apache License, Version 2.0
006: (the "License"); you may not use this file except in compliance with
007: the License. You may obtain a copy of the License at
008:
009: http://www.apache.org/licenses/LICENSE-2.0
010:
011: Unless required by applicable law or agreed to in writing, software
012: distributed under the License is distributed on an "AS IS" BASIS,
013: WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: See the License for the specific language governing permissions and
015: limitations under the License.
016: ==================================================================== */
017: package org.apache.poi.hdgf.extractor;
018:
019: import java.io.FileInputStream;
020: import java.io.IOException;
021: import java.io.InputStream;
022: import java.util.ArrayList;
023:
024: import org.apache.poi.POITextExtractor;
025: import org.apache.poi.hdgf.HDGFDiagram;
026: import org.apache.poi.hdgf.chunks.Chunk.Command;
027: import org.apache.poi.hdgf.streams.ChunkStream;
028: import org.apache.poi.hdgf.streams.PointerContainingStream;
029: import org.apache.poi.hdgf.streams.Stream;
030: import org.apache.poi.poifs.filesystem.POIFSFileSystem;
031:
032: /**
033: * Class to find all the text in a Visio file, and return it.
034: * Can opperate on the command line (outputs to stdout), or
035: * can return the text for you (eg for use with Lucene).
036: */
037: public class VisioTextExtractor extends POITextExtractor {
038: private HDGFDiagram hdgf;
039: private POIFSFileSystem fs;
040:
041: public VisioTextExtractor(HDGFDiagram hdgf) {
042: super (hdgf);
043: this .hdgf = hdgf;
044: }
045:
046: public VisioTextExtractor(POIFSFileSystem fs) throws IOException {
047: this (new HDGFDiagram(fs));
048: this .fs = fs;
049: }
050:
051: public VisioTextExtractor(InputStream inp) throws IOException {
052: this (new POIFSFileSystem(inp));
053: }
054:
055: /**
056: * Locates all the text entries in the file, and returns their
057: * contents.
058: */
059: public String[] getAllText() {
060: ArrayList text = new ArrayList();
061: for (int i = 0; i < hdgf.getTopLevelStreams().length; i++) {
062: findText(hdgf.getTopLevelStreams()[i], text);
063: }
064: return (String[]) text.toArray(new String[text.size()]);
065: }
066:
067: private void findText(Stream stream, ArrayList text) {
068: if (stream instanceof PointerContainingStream) {
069: PointerContainingStream ps = (PointerContainingStream) stream;
070: for (int i = 0; i < ps.getPointedToStreams().length; i++) {
071: findText(ps.getPointedToStreams()[i], text);
072: }
073: }
074: if (stream instanceof ChunkStream) {
075: ChunkStream cs = (ChunkStream) stream;
076: for (int i = 0; i < cs.getChunks().length; i++) {
077: if (cs.getChunks()[i] != null
078: && cs.getChunks()[i].getName() != null
079: && cs.getChunks()[i].getName().equals("Text")) {
080: // First command
081: Command cmd = cs.getChunks()[i].getCommands()[0];
082: if (cmd != null && cmd.getValue() != null) {
083: text.add(cmd.getValue().toString());
084: }
085: }
086: }
087: }
088: }
089:
090: /**
091: * Returns the textual contents of the file.
092: * Each textual object's text will be separated
093: * by a newline
094: */
095: public String getText() {
096: StringBuffer text = new StringBuffer();
097: String[] allText = getAllText();
098: for (int i = 0; i < allText.length; i++) {
099: text.append(allText[i]);
100: if (!allText[i].endsWith("\r")
101: && !allText[i].endsWith("\n")) {
102: text.append("\n");
103: }
104: }
105: return text.toString();
106: }
107:
108: public static void main(String[] args) throws Exception {
109: if (args.length == 0) {
110: System.err.println("Use:");
111: System.err.println(" VisioTextExtractor <file.vsd>");
112: System.exit(1);
113: }
114:
115: VisioTextExtractor extractor = new VisioTextExtractor(
116: new FileInputStream(args[0]));
117:
118: // Print not PrintLn as already has \n added to it
119: System.out.print(extractor.getText());
120: }
121: }
|