001: /* ====================================================================
002: Licensed to the Apache Software Foundation (ASF) under one or more
003: contributor license agreements. See the NOTICE file distributed with
004: this work for additional information regarding copyright ownership.
005: The ASF licenses this file to You under the Apache License, Version 2.0
006: (the "License"); you may not use this file except in compliance with
007: the License. You may obtain a copy of the License at
008:
009: http://www.apache.org/licenses/LICENSE-2.0
010:
011: Unless required by applicable law or agreed to in writing, software
012: distributed under the License is distributed on an "AS IS" BASIS,
013: WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: See the License for the specific language governing permissions and
015: limitations under the License.
016: ==================================================================== */
017:
018: package org.apache.poi.hwpf;
019:
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.*;
import org.apache.poi.hwpf.model.*;

import java.io.*;
import java.util.ArrayDeque;
import java.util.Deque;
025:
026: public class Word2Forrest {
027: Writer _out;
028: HWPFDocument _doc;
029:
030: public Word2Forrest(HWPFDocument doc, OutputStream stream)
031: throws IOException, UnsupportedEncodingException {
032: OutputStreamWriter out = new OutputStreamWriter(stream, "UTF-8");
033: _out = out;
034: _doc = doc;
035:
036: init();
037: openDocument();
038: openBody();
039:
040: Range r = doc.getRange();
041: StyleSheet styleSheet = doc.getStyleSheet();
042:
043: int sectionLevel = 0;
044: int lenParagraph = r.numParagraphs();
045: boolean inCode = false;
046: for (int x = 0; x < lenParagraph; x++) {
047: Paragraph p = r.getParagraph(x);
048: String text = p.text();
049: if (text.trim().length() == 0) {
050: continue;
051: }
052: StyleDescription paragraphStyle = styleSheet
053: .getStyleDescription(p.getStyleIndex());
054: String styleName = paragraphStyle.getName();
055: if (styleName.startsWith("Heading")) {
056: if (inCode) {
057: closeSource();
058: inCode = false;
059: }
060:
061: int headerLevel = Integer.parseInt(styleName
062: .substring(8));
063: if (headerLevel > sectionLevel) {
064: openSection();
065: } else {
066: for (int y = 0; y < (sectionLevel - headerLevel) + 1; y++) {
067: closeSection();
068: }
069: openSection();
070: }
071: sectionLevel = headerLevel;
072: openTitle();
073: writePlainText(text);
074: closeTitle();
075: } else {
076: int cruns = p.numCharacterRuns();
077: CharacterRun run = p.getCharacterRun(0);
078: String fontName = run.getFontName();
079: if (fontName.startsWith("Courier")) {
080: if (!inCode) {
081: openSource();
082: inCode = true;
083: }
084: writePlainText(p.text());
085: } else {
086: if (inCode) {
087: inCode = false;
088: closeSource();
089: }
090: openParagraph();
091: writePlainText(p.text());
092: closeParagraph();
093: }
094: }
095: }
096: for (int x = 0; x < sectionLevel; x++) {
097: closeSection();
098: }
099: closeBody();
100: closeDocument();
101: _out.flush();
102:
103: }
104:
105: public void init() throws IOException {
106: _out.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n");
107: _out
108: .write("<!DOCTYPE document PUBLIC \"-//APACHE//DTD Documentation V1.1//EN\" \"./dtd/document-v11.dtd\">\r\n");
109: }
110:
111: public void openDocument() throws IOException {
112: _out.write("<document>\r\n");
113: }
114:
115: public void closeDocument() throws IOException {
116: _out.write("</document>\r\n");
117: }
118:
119: public void openBody() throws IOException {
120: _out.write("<body>\r\n");
121: }
122:
123: public void closeBody() throws IOException {
124: _out.write("</body>\r\n");
125: }
126:
127: public void openSection() throws IOException {
128: _out.write("<section>");
129:
130: }
131:
132: public void closeSection() throws IOException {
133: _out.write("</section>");
134:
135: }
136:
137: public void openTitle() throws IOException {
138: _out.write("<title>");
139: }
140:
141: public void closeTitle() throws IOException {
142: _out.write("</title>");
143: }
144:
145: public void writePlainText(String text) throws IOException {
146: _out.write(text);
147: }
148:
149: public void openParagraph() throws IOException {
150: _out.write("<p>");
151: }
152:
153: public void closeParagraph() throws IOException {
154: _out.write("</p>");
155: }
156:
157: public void openSource() throws IOException {
158: _out.write("<source><![CDATA[");
159: }
160:
161: public void closeSource() throws IOException {
162: _out.write("]]></source>");
163: }
164:
165: public static void main(String[] args) {
166: try {
167: OutputStream out = new FileOutputStream("c:\\test.xml");
168:
169: new Word2Forrest(new HWPFDocument(new FileInputStream(
170: args[0])), out);
171: out.close();
172: } catch (Throwable t) {
173: t.printStackTrace();
174: }
175:
176: }
177: }
|