001: /**
002: * Copyright (c) 2006, www.pdfbox.org
003: * All rights reserved.
004: *
005: * Redistribution and use in source and binary forms, with or without
006: * modification, are permitted provided that the following conditions are met:
007: *
008: * 1. Redistributions of source code must retain the above copyright notice,
009: * this list of conditions and the following disclaimer.
010: * 2. Redistributions in binary form must reproduce the above copyright notice,
011: * this list of conditions and the following disclaimer in the documentation
012: * and/or other materials provided with the distribution.
013: * 3. Neither the name of pdfbox; nor the names of its
014: * contributors may be used to endorse or promote products derived from this
015: * software without specific prior written permission.
016: *
017: * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
018: * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
019: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
020: * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
021: * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
022: * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
023: * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
024: * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
025: * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
026: * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
027: *
028: * http://www.pdfbox.org
029: *
030: */package org.pdfbox.util;
031:
032: import java.io.ByteArrayOutputStream;
033: import java.io.IOException;
034: import java.io.OutputStreamWriter;
035: import java.io.Writer;
036: import java.util.regex.Matcher;
037: import java.util.regex.Pattern;
038:
039: import org.pdfbox.pdmodel.PDDocument;
040: import org.pdfbox.pdmodel.PDPage;
041:
042: /**
043: * Highlighting of words in a PDF document with an XML file.
044: *
045: * @author slagraulet (slagraulet@cardiweb.com)
046: * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
047: * @version $Revision: 1.7 $
048: *
049: * @see <a href="http://partners.adobe.com/public/developer/en/pdf/HighlightFileFormat.pdf">
050: * Adobe Highlight File Format</a>
051: */
052: public class PDFHighlighter extends PDFTextStripper {
053: private Writer highlighterOutput = null;
054: //private Color highlightColor = Color.YELLOW;
055:
056: private String[] searchedWords;
057: private ByteArrayOutputStream textOS = null;
058: private Writer textWriter = null;
059:
060: /**
061: * Default constructor.
062: *
063: * @throws IOException If there is an error constructing this class.
064: */
065: public PDFHighlighter() throws IOException {
066: super ();
067: super .setLineSeparator("");
068: super .setPageSeparator("");
069: super .setWordSeparator("");
070: super .setShouldSeparateByBeads(false);
071: super .setSuppressDuplicateOverlappingText(false);
072: }
073:
074: /**
075: * Generate an XML highlight string based on the PDF.
076: *
077: * @param pdDocument The PDF to find words in.
078: * @param highlightWord The word to search for.
079: * @param xmlOutput The resulting output xml file.
080: *
081: * @throws IOException If there is an error reading from the PDF, or writing to the XML.
082: */
083: public void generateXMLHighlight(PDDocument pdDocument,
084: String highlightWord, Writer xmlOutput) throws IOException {
085: generateXMLHighlight(pdDocument,
086: new String[] { highlightWord }, xmlOutput);
087: }
088:
089: /**
090: * Generate an XML highlight string based on the PDF.
091: *
092: * @param pdDocument The PDF to find words in.
093: * @param sWords The words to search for.
094: * @param xmlOutput The resulting output xml file.
095: *
096: * @throws IOException If there is an error reading from the PDF, or writing to the XML.
097: */
098: public void generateXMLHighlight(PDDocument pdDocument,
099: String[] sWords, Writer xmlOutput) throws IOException {
100: highlighterOutput = xmlOutput;
101: searchedWords = sWords;
102: highlighterOutput.write("<XML>\n<Body units=characters " +
103: //color and mode are not implemented by the highlight spec
104: //so don't include them for now
105: //" color=#" + getHighlightColorAsString() +
106: //" mode=active " + */
107: " version=2>\n<Highlight>\n");
108: textOS = new ByteArrayOutputStream();
109: textWriter = new OutputStreamWriter(textOS, "UTF-16");
110: writeText(pdDocument, textWriter);
111: highlighterOutput.write("</Highlight>\n</Body>\n</XML>");
112: highlighterOutput.flush();
113: }
114:
115: /**
116: * {@inheritDoc}
117: */
118: protected void endPage(PDPage pdPage) throws IOException {
119: textWriter.flush();
120:
121: String page = new String(textOS.toByteArray(), "UTF-16");
122: textOS.reset();
123: //page = page.replaceAll( "\n", "" );
124: //page = page.replaceAll( "\r", "" );
125: //page = CCRStringUtil.stripChar(page, '\n');
126: //page = CCRStringUtil.stripChar(page, '\r');
127:
128: // Traitement des listes à puces (caractères spéciaux)
129: if (page.indexOf("a") != -1) {
130: page = page.replaceAll("a[0-9]{1,3}", ".");
131: }
132:
133: for (int i = 0; i < searchedWords.length; i++) {
134: Pattern pattern = Pattern.compile(searchedWords[i],
135: Pattern.CASE_INSENSITIVE);
136: Matcher matcher = pattern.matcher(page);
137: while (matcher.find()) {
138: int begin = matcher.start();
139: int end = matcher.end();
140: highlighterOutput.write(" <loc " + "pg="
141: + (getCurrentPageNo() - 1) + " pos=" + begin
142: + " len=" + (end - begin) + ">\n");
143: }
144: }
145: }
146:
147: /**
148: * Command line application.
149: *
150: * @param args The command line arguments to the application.
151: *
152: * @throws IOException If there is an error generating the highlight file.
153: */
154: public static void main(String[] args) throws IOException {
155: PDFHighlighter xmlExtractor = new PDFHighlighter();
156: PDDocument doc = null;
157: try {
158: if (args.length < 2) {
159: usage();
160: }
161: String[] highlightStrings = new String[args.length - 1];
162: System.arraycopy(args, 1, highlightStrings, 0,
163: highlightStrings.length);
164: doc = PDDocument.load(args[0]);
165:
166: xmlExtractor.generateXMLHighlight(doc, highlightStrings,
167: new OutputStreamWriter(System.out));
168: } finally {
169: if (doc != null) {
170: doc.close();
171: }
172: }
173: }
174:
175: private static void usage() {
176: System.err.println("usage: java "
177: + PDFHighlighter.class.getName()
178: + " <pdf file> word1 word2 word3 ...");
179: System.exit(1);
180: }
181:
182: /**
183: * Get the color to highlight the strings with. Default is Color.YELLOW.
184: *
185: * @return The color to highlight strings with.
186: */
187: /*public Color getHighlightColor()
188: {
189: return highlightColor;
190: }**/
191:
192: /**
193: * Get the color to highlight the strings with. Default is Color.YELLOW.
194: *
195: * @param color The color to highlight strings with.
196: */
197: /*public void setHighlightColor(Color color)
198: {
199: this.highlightColor = color;
200: }**/
201:
202: /**
203: * Set the highlight color using HTML like rgb string. The string must be 6 characters long.
204: *
205: * @param color The color to use for highlighting. Should be in the format of "FF0000".
206: */
207: /*public void setHighlightColor( String color )
208: {
209: highlightColor = Color.decode( color );
210: }**/
211:
212: /**
213: * Get the highlight color as an HTML like string. This will return a string of six characters.
214: *
215: * @return The current highlight color. For example FF0000
216: */
217: /*public String getHighlightColorAsString()
218: {
219: //BJL: kudos to anyone that has a cleaner way of doing this!
220: String red = Integer.toHexString( highlightColor.getRed() );
221: String green = Integer.toHexString( highlightColor.getGreen() );
222: String blue = Integer.toHexString( highlightColor.getBlue() );
223:
224: return (red.length() < 2 ? "0" + red : red) +
225: (green.length() < 2 ? "0" + green : green) +
226: (blue.length() < 2 ? "0" + blue : blue);
227: }**/
228: }
|