001: /**
002: * Copyright (c) 2005-2006, www.pdfbox.org
003: * All rights reserved.
004: *
005: * Redistribution and use in source and binary forms, with or without
006: * modification, are permitted provided that the following conditions are met:
007: *
008: * 1. Redistributions of source code must retain the above copyright notice,
009: * this list of conditions and the following disclaimer.
010: * 2. Redistributions in binary form must reproduce the above copyright notice,
011: * this list of conditions and the following disclaimer in the documentation
012: * and/or other materials provided with the distribution.
013: * 3. Neither the name of pdfbox; nor the names of its
014: * contributors may be used to endorse or promote products derived from this
015: * software without specific prior written permission.
016: *
017: * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
018: * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
019: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
020: * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
021: * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
022: * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
023: * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
024: * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
025: * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
026: * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
027: *
028: * http://www.pdfbox.org
029: *
030: */package org.pdfbox.util;
031:
032: import java.awt.geom.Rectangle2D;
033: import java.io.IOException;
034: import java.io.StringWriter;
035: import java.util.ArrayList;
036: import java.util.HashMap;
037: import java.util.Iterator;
038: import java.util.List;
039: import java.util.Map;
040: import java.util.Vector;
041:
042: import org.pdfbox.cos.COSStream;
043: import org.pdfbox.pdmodel.PDPage;
044: import org.pdfbox.pdmodel.common.PDStream;
045:
046: /**
047: * This will extract text from a specified region in the PDF.
048: *
049: * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
050: * @version $Revision: 1.5 $
051: */
052: public class PDFTextStripperByArea extends PDFTextStripper {
053: private List regions = new ArrayList();
054: private Map regionArea = new HashMap();
055: private Map regionCharacterList = new HashMap();
056: private Map regionText = new HashMap();
057:
058: /**
059: * Constructor.
060: * @throws IOException If there is an error loading properties.
061: */
062: public PDFTextStripperByArea() throws IOException {
063: super ();
064: setPageSeparator("");
065: }
066:
067: /**
068: * Add a new region to group text by.
069: *
070: * @param regionName The name of the region.
071: * @param rect The rectangle area to retrieve the text from.
072: */
073: public void addRegion(String regionName, Rectangle2D rect) {
074: regions.add(regionName);
075: regionArea.put(regionName, rect);
076: }
077:
078: /**
079: * Get the list of regions that have been setup.
080: *
081: * @return A list of java.lang.String objects to identify the region names.
082: */
083: public List getRegions() {
084: return regions;
085: }
086:
087: /**
088: * Get the text for the region, this should be called after extractRegions().
089: *
090: * @param regionName The name of the region to get the text from.
091: * @return The text that was identified in that region.
092: */
093: public String getTextForRegion(String regionName) {
094: StringWriter text = (StringWriter) regionText.get(regionName);
095: return text.toString();
096: }
097:
098: /**
099: * Process the page to extract the region text.
100: *
101: * @param page The page to extract the regions from.
102: * @throws IOException If there is an error while extracting text.
103: */
104: public void extractRegions(PDPage page) throws IOException {
105: Iterator regionIter = regions.iterator();
106: while (regionIter.hasNext()) {
107: //reset the stored text for the region so this class
108: //can be reused.
109: String regionName = (String) regionIter.next();
110: Vector regionCharactersByArticle = new Vector();
111: regionCharactersByArticle.add(new ArrayList());
112: regionCharacterList.put(regionName,
113: regionCharactersByArticle);
114: regionText.put(regionName, new StringWriter());
115: }
116:
117: PDStream contentStream = page.getContents();
118: if (contentStream != null) {
119: COSStream contents = contentStream.getStream();
120: processPage(page, contents);
121: }
122: }
123:
124: /**
125: * {@inheritDoc}
126: */
127: protected void showCharacter(TextPosition text) {
128: Iterator regionIter = regionArea.keySet().iterator();
129: while (regionIter.hasNext()) {
130: String region = (String) regionIter.next();
131: Rectangle2D rect = (Rectangle2D) regionArea.get(region);
132: if (rect.contains(text.getX(), text.getY())) {
133: charactersByArticle = (Vector) regionCharacterList
134: .get(region);
135: super .showCharacter(text);
136: }
137: }
138: }
139:
140: /**
141: * This will print the text to the output stream.
142: *
143: * @throws IOException If there is an error writing the text.
144: */
145: protected void flushText() throws IOException {
146: Iterator regionIter = regionArea.keySet().iterator();
147: while (regionIter.hasNext()) {
148: String region = (String) regionIter.next();
149: charactersByArticle = (Vector) regionCharacterList
150: .get(region);
151: output = (StringWriter) regionText.get(region);
152: super.flushText();
153: }
154: }
155: }
|