001: /**
002: * Copyright (c) 2003-2004, www.pdfbox.org
003: * All rights reserved.
004: *
005: * Redistribution and use in source and binary forms, with or without
006: * modification, are permitted provided that the following conditions are met:
007: *
008: * 1. Redistributions of source code must retain the above copyright notice,
009: * this list of conditions and the following disclaimer.
010: * 2. Redistributions in binary form must reproduce the above copyright notice,
011: * this list of conditions and the following disclaimer in the documentation
012: * and/or other materials provided with the distribution.
013: * 3. Neither the name of pdfbox; nor the names of its
014: * contributors may be used to endorse or promote products derived from this
015: * software without specific prior written permission.
016: *
017: * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
018: * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
019: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
020: * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
021: * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
022: * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
023: * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
024: * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
025: * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
026: * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
027: *
028: * http://www.pdfbox.org
029: *
030: */package org.pdfbox.util;
031:
032: import java.io.IOException;
033:
034: import java.util.Iterator;
035: import java.util.List;
036:
037: import org.pdfbox.pdmodel.PDDocument;
038:
039: /**
040: * Wrap stripped text in simple HTML, trying to form HTML paragraphs.
041: * Paragraphs broken by pages, columns, or figures are not mended.
042: *
043: *
044: * @author jjb - http://www.johnjbarton.com
045: * @version $Revision: 1.3 $
046: */
047: public class PDFText2HTML extends PDFTextStripper {
048: private static final int INITIAL_PDF_TO_HTML_BYTES = 8192;
049:
050: private TextPosition beginTitle;
051: private TextPosition afterEndTitle;
052: private String titleGuess;
053: private boolean suppressParagraphs;
054: private boolean onFirstPage = true;
055:
056: /**
057: * Constructor.
058: *
059: * @throws IOException If there is an error during initialization.
060: */
061: public PDFText2HTML() throws IOException {
062: titleGuess = "";
063: beginTitle = null;
064: afterEndTitle = null;
065: suppressParagraphs = false;
066: }
067:
068: /**
069: * Write the header to the output document.
070: *
071: * @throws IOException If there is a problem writing out the header to the document.
072: */
073: protected void writeHeader() throws IOException {
074: StringBuffer buf = new StringBuffer(INITIAL_PDF_TO_HTML_BYTES);
075: buf.append("<html><head>");
076: buf.append("<title>");
077: buf.append(getTitleGuess());
078: buf.append("</title>");
079: buf.append("</head>");
080: buf.append("<body>\n");
081: getOutput().write(buf.toString());
082: }
083:
084: /**
085: * The guess to the document title.
086: *
087: * @return A string that is the title of this document.
088: */
089: protected String getTitleGuess() {
090: return titleGuess;
091: }
092:
093: /**
094: * {@inheritDoc}
095: */
096: protected void flushText() throws IOException {
097: Iterator textIter = getCharactersByArticle().iterator();
098:
099: if (onFirstPage) {
100: guessTitle(textIter);
101: writeHeader();
102: onFirstPage = false;
103: }
104: super .flushText();
105: }
106:
107: /**
108: * {@inheritDoc}
109: */
110: public void endDocument(PDDocument pdf) throws IOException {
111: output.write("</body></html>");
112: }
113:
114: /**
115: * This method will attempt to guess the title of the document.
116: *
117: * @param textIter The characters on the first page.
118: * @return The text position that is guessed to be the title.
119: */
120: protected TextPosition guessTitle(Iterator textIter) {
121: float lastFontSize = -1.0f;
122: int stringsInFont = 0;
123: StringBuffer titleText = new StringBuffer();
124: while (textIter.hasNext()) {
125: Iterator textByArticle = ((List) textIter.next())
126: .iterator();
127: while (textByArticle.hasNext()) {
128: TextPosition position = (TextPosition) textByArticle
129: .next();
130: float currentFontSize = position.getFontSize();
131: if (currentFontSize != lastFontSize) {
132: if (beginTitle != null) { // font change in candidate title.
133: if (stringsInFont == 0) {
134: beginTitle = null; // false alarm
135: titleText.setLength(0);
136: } else {
137: // had a significant font with some words: call it a title
138: titleGuess = titleText.toString();
139: afterEndTitle = position;
140: return beginTitle;
141: }
142: } else { // font change and begin == null
143: if (currentFontSize > 13.0f) { // most body text is 12pt max I guess
144: beginTitle = position;
145: }
146: }
147:
148: lastFontSize = currentFontSize;
149: stringsInFont = 0;
150: }
151: stringsInFont++;
152: if (beginTitle != null) {
153: titleText.append(position.getCharacter() + " ");
154: }
155: }
156: }
157: return beginTitle; // null
158: }
159:
160: /**
161: * Write out the paragraph separator.
162: *
163: * @throws IOException If there is an error writing to the stream.
164: */
165: protected void startParagraph() throws IOException {
166: if (!suppressParagraphs) {
167: getOutput().write("<p>");
168: }
169: }
170:
171: /**
172: * Write out the paragraph separator.
173: *
174: * @throws IOException If there is an error writing to the stream.
175: */
176: protected void endParagraph() throws IOException {
177: if (!suppressParagraphs) {
178: getOutput().write("</p>");
179: }
180: }
181:
182: /**
183: * {@inheritDoc}
184: */
185: protected void writeCharacters(TextPosition position)
186: throws IOException {
187: if (position == beginTitle) {
188: output.write("<H1>");
189: suppressParagraphs = true;
190: }
191: if (position == afterEndTitle) {
192: output.write("</H1>"); // end title and start first paragraph
193: suppressParagraphs = false;
194: }
195:
196: String chars = position.getCharacter();
197:
198: for (int i = 0; i < chars.length(); i++) {
199: char c = chars.charAt(i);
200: if ((c < 32) || (c > 126)) {
201: int charAsInt = c;
202: output.write("&#" + charAsInt + ";");
203: } else {
204: switch (c) {
205: case 34:
206: output.write(""");
207: break;
208: case 38:
209: output.write("&");
210: break;
211: case 60:
212: output.write("<");
213: break;
214: case 62:
215: output.write(">");
216: break;
217: default:
218: output.write(c);
219: }
220: }
221: }
222: }
223:
224: /**
225: * @return Returns the suppressParagraphs.
226: */
227: public boolean isSuppressParagraphs() {
228: return suppressParagraphs;
229: }
230:
231: /**
232: * @param shouldSuppressParagraphs The suppressParagraphs to set.
233: */
234: public void setSuppressParagraphs(boolean shouldSuppressParagraphs) {
235: this.suppressParagraphs = shouldSuppressParagraphs;
236: }
237: }
|