001: /**
002: * Copyright (c) 2003-2005, www.pdfbox.org
003: * All rights reserved.
004: *
005: * Redistribution and use in source and binary forms, with or without
006: * modification, are permitted provided that the following conditions are met:
007: *
008: * 1. Redistributions of source code must retain the above copyright notice,
009: * this list of conditions and the following disclaimer.
010: * 2. Redistributions in binary form must reproduce the above copyright notice,
011: * this list of conditions and the following disclaimer in the documentation
012: * and/or other materials provided with the distribution.
013: * 3. Neither the name of pdfbox; nor the names of its
014: * contributors may be used to endorse or promote products derived from this
015: * software without specific prior written permission.
016: *
017: * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
018: * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
019: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
020: * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
021: * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
022: * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
023: * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
024: * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
025: * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
026: * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
027: *
028: * http://www.pdfbox.org
029: *
030: */package org.pdfbox.util;
031:
032: import java.io.IOException;
033: import java.io.StringWriter;
034: import java.io.Writer;
035:
036: import java.util.ArrayList;
037: import java.util.Collections;
038: import java.util.HashMap;
039: import java.util.Iterator;
040: import java.util.List;
041: import java.util.Map;
042: import java.util.Properties;
043: import java.util.Vector;
044:
045: import org.pdfbox.cos.COSDocument;
046: import org.pdfbox.cos.COSStream;
047:
048: import org.pdfbox.pdmodel.PDDocument;
049: import org.pdfbox.pdmodel.PDPage;
050:
051: import org.pdfbox.pdmodel.common.PDRectangle;
052: import org.pdfbox.pdmodel.common.PDStream;
053:
054: import org.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
055: import org.pdfbox.pdmodel.interactive.pagenavigation.PDThreadBead;
056:
057: import org.pdfbox.exceptions.CryptographyException;
058: import org.pdfbox.exceptions.InvalidPasswordException;
059:
060: /**
061: * This class will take a pdf document and strip out all of the text and ignore the
062: * formatting and such.
063: *
064: * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
065: * @version $Revision: 1.69 $
066: */
067: public class PDFTextStripper extends PDFStreamEngine {
068: private int currentPageNo = 0;
069: private int startPage = 1;
070: private int endPage = Integer.MAX_VALUE;
071: private PDOutlineItem startBookmark = null;
072: private int startBookmarkPageNumber = -1;
073: private PDOutlineItem endBookmark = null;
074: private int endBookmarkPageNumber = -1;
075: private PDDocument document;
076: private boolean suppressDuplicateOverlappingText = true;
077: private boolean shouldSeparateByBeads = true;
078: private boolean sortByPosition = false;
079:
080: private List pageArticles = null;
081: /**
082: * The charactersByArticle is used to extract text by article divisions. For example
083: * a PDF that has two columns like a newspaper, we want to extract the first column and
084: * then the second column. In this example the PDF would have 2 beads(or articles), one for
085: * each column. The size of the charactersByArticle would be 5, because not all text on the
086: * screen will fall into one of the articles. The five divisions are shown below
087: *
088: * Text before first article
089: * first article text
090: * text between first article and second article
091: * second article text
092: * text after second article
093: *
094: * Most PDFs won't have any beads, so charactersByArticle will contain a single entry.
095: */
096: protected Vector charactersByArticle = new Vector();
097:
098: private Map characterListMapping = new HashMap();
099:
100: private String lineSeparator = System.getProperty("line.separator");
101: private String pageSeparator = System.getProperty("line.separator");
102: private String wordSeparator = " ";
103:
104: /**
105: * The stream to write the output to.
106: */
107: protected Writer output;
108:
109: /**
110: * Instantiate a new PDFTextStripper object. This object will load properties from
111: * Resources/PDFTextStripper.properties.
112: * @throws IOException If there is an error loading the properties.
113: */
114: public PDFTextStripper() throws IOException {
115: super (ResourceLoader
116: .loadProperties("Resources/PDFTextStripper.properties"));
117: }
118:
119: /**
120: * Instantiate a new PDFTextStripper object. Loading all of the operator mappings
121: * from the properties object that is passed in.
122: *
123: * @param props The properties containing the mapping of operators to PDFOperator
124: * classes.
125: *
126: * @throws IOException If there is an error reading the properties.
127: */
128: public PDFTextStripper(Properties props) throws IOException {
129: super (props);
130: }
131:
132: /**
133: * This will return the text of a document. See writeText. <br />
134: * NOTE: The document must not be encrypted when coming into this method.
135: *
136: * @param doc The document to get the text from.
137: *
138: * @return The text of the PDF document.
139: *
140: * @throws IOException if the doc state is invalid or it is encrypted.
141: */
142: public String getText(PDDocument doc) throws IOException {
143: StringWriter outputStream = new StringWriter();
144: writeText(doc, outputStream);
145: return outputStream.toString();
146: }
147:
148: /**
149: * @deprecated
150: * @see PDFTextStripper#getText( PDDocument )
151: * @param doc The document to extract the text from.
152: * @return The document text.
153: * @throws IOException If there is an error extracting the text.
154: */
155: public String getText(COSDocument doc) throws IOException {
156: return getText(new PDDocument(doc));
157: }
158:
159: /**
160: * @deprecated
161: * @see PDFTextStripper#writeText( PDDocument, Writer )
162: * @param doc The document to extract the text.
163: * @param outputStream The stream to write the text to.
164: * @throws IOException If there is an error extracting the text.
165: */
166: public void writeText(COSDocument doc, Writer outputStream)
167: throws IOException {
168: writeText(new PDDocument(doc), outputStream);
169: }
170:
171: /**
172: * This will take a PDDocument and write the text of that document to the print writer.
173: *
174: * @param doc The document to get the data from.
175: * @param outputStream The location to put the text.
176: *
177: * @throws IOException If the doc is in an invalid state.
178: */
179: public void writeText(PDDocument doc, Writer outputStream)
180: throws IOException {
181: resetEngine();
182:
183: currentPageNo = 0;
184: document = doc;
185: output = outputStream;
186: startDocument(document);
187:
188: if (document.isEncrypted()) {
189: // We are expecting non-encrypted documents here, but it is common
190: // for users to pass in a document that is encrypted with an empty
191: // password (such a document appears to not be encrypted by
192: // someone viewing the document, thus the confusion). We will
193: // attempt to decrypt with the empty password to handle this case.
194: //
195: try {
196: document.decrypt("");
197: } catch (CryptographyException e) {
198: throw new IOException(
199: "Error decrypting document, details: "
200: + e.getMessage());
201: } catch (InvalidPasswordException e) {
202: throw new IOException("Error: document is encrypted");
203: }
204: }
205:
206: processPages(document.getDocumentCatalog().getAllPages());
207: endDocument(document);
208: }
209:
210: /**
211: * This will process all of the pages and the text that is in them.
212: *
213: * @param pages The pages object in the document.
214: *
215: * @throws IOException If there is an error parsing the text.
216: */
217: protected void processPages(List pages) throws IOException {
218: if (startBookmark != null) {
219: startBookmarkPageNumber = getPageNumber(startBookmark,
220: pages);
221: }
222:
223: if (endBookmark != null) {
224: endBookmarkPageNumber = getPageNumber(endBookmark, pages);
225: }
226:
227: if (startBookmarkPageNumber == -1
228: && startBookmark != null
229: && endBookmarkPageNumber == -1
230: && endBookmark != null
231: && startBookmark.getCOSObject() == endBookmark
232: .getCOSObject()) {
233: //this is a special case where both the start and end bookmark
234: //are the same but point to nothing. In this case
235: //we will not extract any text.
236: startBookmarkPageNumber = 0;
237: endBookmarkPageNumber = 0;
238: }
239:
240: Iterator pageIter = pages.iterator();
241: while (pageIter.hasNext()) {
242: PDPage nextPage = (PDPage) pageIter.next();
243: PDStream contentStream = nextPage.getContents();
244: if (contentStream != null) {
245: COSStream contents = contentStream.getStream();
246: processPage(nextPage, contents);
247: }
248: }
249: }
250:
251: private int getPageNumber(PDOutlineItem bookmark, List allPages)
252: throws IOException {
253: int pageNumber = -1;
254: PDPage page = bookmark.findDestinationPage(document);
255: if (page != null) {
256: pageNumber = allPages.indexOf(page) + 1;//use one based indexing
257: }
258: return pageNumber;
259: }
260:
261: /**
262: * This method is available for subclasses of this class. It will be called before processing
263: * of the document start.
264: *
265: * @param pdf The PDF document that is being processed.
266: * @throws IOException If an IO error occurs.
267: */
268: protected void startDocument(PDDocument pdf) throws IOException {
269: // no default implementation, but available for subclasses
270: }
271:
272: /**
273: * This method is available for subclasses of this class. It will be called after processing
274: * of the document finishes.
275: *
276: * @param pdf The PDF document that is being processed.
277: * @throws IOException If an IO error occurs.
278: */
279: protected void endDocument(PDDocument pdf) throws IOException {
280: // no default implementation, but available for subclasses
281: }
282:
283: /**
284: * This will process the contents of a page.
285: *
286: * @param page The page to process.
287: * @param content The contents of the page.
288: *
289: * @throws IOException If there is an error processing the page.
290: */
291: protected void processPage(PDPage page, COSStream content)
292: throws IOException {
293: currentPageNo++;
294: if (currentPageNo >= startPage
295: && currentPageNo <= endPage
296: && (startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber)
297: && (endBookmarkPageNumber == -1 || currentPageNo <= endBookmarkPageNumber)) {
298: startPage(page);
299: pageArticles = page.getThreadBeads();
300: int numberOfArticleSections = 1 + pageArticles.size() * 2;
301: if (!shouldSeparateByBeads) {
302: numberOfArticleSections = 1;
303: }
304: int originalSize = charactersByArticle.size();
305: charactersByArticle.setSize(numberOfArticleSections);
306: for (int i = 0; i < numberOfArticleSections; i++) {
307: if (numberOfArticleSections < originalSize) {
308: ((List) charactersByArticle.get(i)).clear();
309: } else {
310: charactersByArticle.set(i, new ArrayList());
311: }
312: }
313:
314: characterListMapping.clear();
315: processStream(page, page.findResources(), content);
316: flushText();
317: endPage(page);
318: }
319:
320: }
321:
322: /**
323: * Start a new paragraph. Default implementation is to do nothing. Subclasses
324: * may provide additional information.
325: *
326: * @throws IOException If there is any error writing to the stream.
327: */
328: protected void startParagraph() throws IOException {
329: //default is to do nothing.
330: }
331:
332: /**
333: * End a paragraph. Default implementation is to do nothing. Subclasses
334: * may provide additional information.
335: *
336: * @throws IOException If there is any error writing to the stream.
337: */
338: protected void endParagraph() throws IOException {
339: //default is to do nothing
340: }
341:
342: /**
343: * Start a new page. Default implementation is to do nothing. Subclasses
344: * may provide additional information.
345: *
346: * @param page The page we are about to process.
347: *
348: * @throws IOException If there is any error writing to the stream.
349: */
350: protected void startPage(PDPage page) throws IOException {
351: //default is to do nothing.
352: }
353:
354: /**
355: * End a page. Default implementation is to do nothing. Subclasses
356: * may provide additional information.
357: *
358: * @param page The page we are about to process.
359: *
360: * @throws IOException If there is any error writing to the stream.
361: */
362: protected void endPage(PDPage page) throws IOException {
363: //default is to do nothing
364: }
365:
366: /**
367: * This will print the text to the output stream.
368: *
369: * @throws IOException If there is an error writing the text.
370: */
371: protected void flushText() throws IOException {
372: float currentY = -1;
373: float lastBaselineFontSize = -1;
374: float endOfLastTextX = -1;
375: float startOfNextWordX = -1;
376: float lastWordSpacing = -1;
377: TextPosition lastProcessedCharacter = null;
378:
379: for (int i = 0; i < charactersByArticle.size(); i++) {
380: startParagraph();
381: List textList = (List) charactersByArticle.get(i);
382: if (sortByPosition) {
383: TextPositionComparator comparator = new TextPositionComparator(
384: getCurrentPage());
385: Collections.sort(textList, comparator);
386: }
387: Iterator textIter = textList.iterator();
388: while (textIter.hasNext()) {
389: TextPosition position = (TextPosition) textIter.next();
390: String characterValue = position.getCharacter();
391:
392: //wordSpacing = position.getWordSpacing();
393: float wordSpacing = 0;
394:
395: if (wordSpacing == 0) {
396: //try to get width of a space character
397: wordSpacing = position.getWidthOfSpace();
398: //if still zero fall back to getting the width of the current
399: //character
400: if (wordSpacing == 0) {
401: wordSpacing = position.getWidth();
402: }
403: }
404:
405: // RDD - We add a conservative approximation for space determination.
406: // basically if there is a blank area between two characters that is
407: //equal to some percentage of the word spacing then that will be the
408: //start of the next word
409: if (lastWordSpacing <= 0) {
410: startOfNextWordX = endOfLastTextX
411: + (wordSpacing * 0.50f);
412: } else {
413: startOfNextWordX = endOfLastTextX
414: + (((wordSpacing + lastWordSpacing) / 2f) * 0.50f);
415: }
416:
417: lastWordSpacing = wordSpacing;
418:
419: // RDD - We will suppress text that is very close to the current line
420: // and which overwrites previously rendered text on this line.
421: // This is done specifically to handle a reasonably common situation
422: // where an application (MS Word, in the case of my examples) renders
423: // text four times at small (1 point) offsets in order to accomplish
424: // bold printing. You would not want to do this step if you were
425: // going to render the TextPosition objects graphically.
426: //
427: /*if ((endOfLastTextX != -1 && position.getX() < endOfLastTextX) &&
428: (currentY != -1 && Math.abs(position.getY() - currentY) < 1))
429: {
430: if (log.isDebugEnabled())
431: {
432: log.debug("Suppressing text overwrite" +
433: " x: " + position.getX() +
434: " endOfLastTextX: " + endOfLastTextX +
435: " string: " + position.getCharacter());
436: }
437: continue;
438: }*/
439:
440: // RDD - Here we determine whether this text object is on the current
441: // line. We use the lastBaselineFontSize to handle the superscript
442: // case, and the size of the current font to handle the subscript case.
443: // Text must overlap with the last rendered baseline text by at least
444: // a small amount in order to be considered as being on the same line.
445: //
446: int verticalScaling = 1;
447: if (lastBaselineFontSize < 0
448: || position.getFontSize() < 0) {
449: verticalScaling = -1;
450: }
451: if (currentY != -1
452: && ((position.getY() < (currentY - (lastBaselineFontSize * 0.9f * verticalScaling))) || (position
453: .getY() > (currentY + (position
454: .getFontSize() * 0.9f * verticalScaling))))) {
455: output.write(getLineSeparator());
456: endOfLastTextX = -1;
457: startOfNextWordX = -1;
458: currentY = -1;
459: lastBaselineFontSize = -1;
460: }
461:
462: if (startOfNextWordX != -1
463: && startOfNextWordX < position.getX()
464: && lastProcessedCharacter != null
465: &&
466: //only bother adding a space if the last character was not a space
467: lastProcessedCharacter.getCharacter() != null
468: && !lastProcessedCharacter.getCharacter()
469: .endsWith(" ")) {
470: output.write(getWordSeparator());
471: }
472:
473: if (currentY == -1) {
474: currentY = position.getY();
475: }
476:
477: if (currentY == position.getY()) {
478: lastBaselineFontSize = position.getFontSize();
479: }
480:
481: // RDD - endX is what PDF considers to be the x coordinate of the
482: // end position of the text. We use it in computing our metrics below.
483: //
484: endOfLastTextX = position.getX() + position.getWidth();
485:
486: if (characterValue != null) {
487: writeCharacters(position);
488: } else {
489: //Position.getString() is null so not writing anything
490: }
491: lastProcessedCharacter = position;
492: }
493: endParagraph();
494: }
495:
496: // RDD - newline at end of flush - required for end of page (so that the top
497: // of the next page starts on its own line.
498: //
499: output.write(getPageSeparator());
500:
501: output.flush();
502: }
503:
504: /**
505: * Write the string to the output stream.
506: *
507: * @param text The text to write to the stream.
508: * @throws IOException If there is an error when writing the text.
509: */
510: protected void writeCharacters(TextPosition text)
511: throws IOException {
512: output.write(text.getCharacter());
513: }
514:
515: /**
516: * This will determine of two floating point numbers are within a specified variance.
517: *
518: * @param first The first number to compare to.
519: * @param second The second number to compare to.
520: * @param variance The allowed variance.
521: */
522: private boolean within(float first, float second, float variance) {
523: return second > first - variance && second < first + variance;
524: }
525:
526: /**
527: * This will show add a character to the list of characters to be printed to
528: * the text file.
529: *
530: * @param text The description of the character to display.
531: */
532: protected void showCharacter(TextPosition text) {
533: boolean showCharacter = true;
534: if (suppressDuplicateOverlappingText) {
535: showCharacter = false;
536: String textCharacter = text.getCharacter();
537: float textX = text.getX();
538: float textY = text.getY();
539: List sameTextCharacters = (List) characterListMapping
540: .get(textCharacter);
541: if (sameTextCharacters == null) {
542: sameTextCharacters = new ArrayList();
543: characterListMapping.put(textCharacter,
544: sameTextCharacters);
545: }
546:
547: // RDD - Here we compute the value that represents the end of the rendered
548: // text. This value is used to determine whether subsequent text rendered
549: // on the same line overwrites the current text.
550: //
551: // We subtract any positive padding to handle cases where extreme amounts
552: // of padding are applied, then backed off (not sure why this is done, but there
553: // are cases where the padding is on the order of 10x the character width, and
554: // the TJ just backs up to compensate after each character). Also, we subtract
555: // an amount to allow for kerning (a percentage of the width of the last
556: // character).
557: //
558: boolean suppressCharacter = false;
559: float tolerance = (text.getWidth() / textCharacter.length()) / 3.0f;
560: for (int i = 0; i < sameTextCharacters.size()
561: && textCharacter != null; i++) {
562: TextPosition character = (TextPosition) sameTextCharacters
563: .get(i);
564: String charCharacter = character.getCharacter();
565: float charX = character.getX();
566: float charY = character.getY();
567: //only want to suppress
568:
569: if (charCharacter != null
570: &&
571: //charCharacter.equals( textCharacter ) &&
572: within(charX, textX, tolerance)
573: && within(charY, textY, tolerance)) {
574: suppressCharacter = true;
575: }
576: }
577: if (!suppressCharacter) {
578: sameTextCharacters.add(text);
579: showCharacter = true;
580: }
581: }
582:
583: if (showCharacter) {
584: //if we are showing the character then we need to determine which
585: //article it belongs to.
586: int foundArticleDivisionIndex = -1;
587: int notFoundButFirstLeftAndAboveArticleDivisionIndex = -1;
588: int notFoundButFirstLeftArticleDivisionIndex = -1;
589: int notFoundButFirstAboveArticleDivisionIndex = -1;
590: float x = text.getX();
591: float y = text.getY();
592: if (shouldSeparateByBeads) {
593: for (int i = 0; i < pageArticles.size()
594: && foundArticleDivisionIndex == -1; i++) {
595: PDThreadBead bead = (PDThreadBead) pageArticles
596: .get(i);
597: if (bead != null) {
598: PDRectangle rect = bead.getRectangle();
599: if (rect.contains(x, y)) {
600: foundArticleDivisionIndex = i * 2 + 1;
601: } else if ((x < rect.getLowerLeftX() || y < rect
602: .getUpperRightY())
603: && notFoundButFirstLeftAndAboveArticleDivisionIndex == -1) {
604: notFoundButFirstLeftAndAboveArticleDivisionIndex = i * 2;
605: } else if (x < rect.getLowerLeftX()
606: && notFoundButFirstLeftArticleDivisionIndex == -1) {
607: notFoundButFirstLeftArticleDivisionIndex = i * 2;
608: } else if (y < rect.getUpperRightY()
609: && notFoundButFirstAboveArticleDivisionIndex == -1) {
610: notFoundButFirstAboveArticleDivisionIndex = i * 2;
611: }
612: } else {
613: foundArticleDivisionIndex = 0;
614: }
615: }
616: } else {
617: foundArticleDivisionIndex = 0;
618: }
619: int articleDivisionIndex = -1;
620: if (foundArticleDivisionIndex != -1) {
621: articleDivisionIndex = foundArticleDivisionIndex;
622: } else if (notFoundButFirstLeftAndAboveArticleDivisionIndex != -1) {
623: articleDivisionIndex = notFoundButFirstLeftAndAboveArticleDivisionIndex;
624: } else if (notFoundButFirstLeftArticleDivisionIndex != -1) {
625: articleDivisionIndex = notFoundButFirstLeftArticleDivisionIndex;
626: } else if (notFoundButFirstAboveArticleDivisionIndex != -1) {
627: articleDivisionIndex = notFoundButFirstAboveArticleDivisionIndex;
628: } else {
629: articleDivisionIndex = charactersByArticle.size() - 1;
630: }
631: List textList = (List) charactersByArticle
632: .get(articleDivisionIndex);
633: textList.add(text);
634: }
635: }
636:
637: /**
638: * This is the page that the text extraction will start on. The pages start
639: * at page 1. For example in a 5 page PDF document, if the start page is 1
640: * then all pages will be extracted. If the start page is 4 then pages 4 and 5
641: * will be extracted. The default value is 1.
642: *
643: * @return Value of property startPage.
644: */
645: public int getStartPage() {
646: return startPage;
647: }
648:
649: /**
650: * This will set the first page to be extracted by this class.
651: *
652: * @param startPageValue New value of property startPage.
653: */
654: public void setStartPage(int startPageValue) {
655: startPage = startPageValue;
656: }
657:
658: /**
659: * This will get the last page that will be extracted. This is inclusive,
660: * for example if a 5 page PDF an endPage value of 5 would extract the
661: * entire document, an end page of 2 would extract pages 1 and 2. This defaults
662: * to Integer.MAX_VALUE such that all pages of the pdf will be extracted.
663: *
664: * @return Value of property endPage.
665: */
666: public int getEndPage() {
667: return endPage;
668: }
669:
670: /**
671: * This will set the last page to be extracted by this class.
672: *
673: * @param endPageValue New value of property endPage.
674: */
675: public void setEndPage(int endPageValue) {
676: endPage = endPageValue;
677: }
678:
679: /**
680: * Set the desired line separator for output text. The line.separator
681: * system property is used if the line separator preference is not set
682: * explicitly using this method.
683: *
684: * @param separator The desired line separator string.
685: */
686: public void setLineSeparator(String separator) {
687: lineSeparator = separator;
688: }
689:
690: /**
691: * This will get the line separator.
692: *
693: * @return The desired line separator string.
694: */
695: public String getLineSeparator() {
696: return lineSeparator;
697: }
698:
699: /**
700: * Set the desired page separator for output text. The line.separator
701: * system property is used if the page separator preference is not set
702: * explicitly using this method.
703: *
704: * @param separator The desired page separator string.
705: */
706: public void setPageSeparator(String separator) {
707: pageSeparator = separator;
708: }
709:
710: /**
711: * This will get the word separator.
712: *
713: * @return The desired word separator string.
714: */
715: public String getWordSeparator() {
716: return wordSeparator;
717: }
718:
719: /**
720: * Set the desired word separator for output text. The PDFBox text extraction
721: * algorithm will output a space character if there is enough space between
722: * two words. By default a space character is used. If you need and accurate
723: * count of characters that are found in a PDF document then you might want to
724: * set the word separator to the empty string.
725: *
726: * @param separator The desired page separator string.
727: */
728: public void setWordSeparator(String separator) {
729: wordSeparator = separator;
730: }
731:
732: /**
733: * This will get the page separator.
734: *
735: * @return The page separator string.
736: */
737: public String getPageSeparator() {
738: return pageSeparator;
739: }
740:
741: /**
742: * @return Returns the suppressDuplicateOverlappingText.
743: */
744: public boolean shouldSuppressDuplicateOverlappingText() {
745: return suppressDuplicateOverlappingText;
746: }
747:
748: /**
749: * Get the current page number that is being processed.
750: *
751: * @return A 1 based number representing the current page.
752: */
753: protected int getCurrentPageNo() {
754: return currentPageNo;
755: }
756:
757: /**
758: * The output stream that is being written to.
759: *
760: * @return The stream that output is being written to.
761: */
762: protected Writer getOutput() {
763: return output;
764: }
765:
766: /**
767: * Character strings are grouped by articles. It is quite common that there
768: * will only be a single article. This returns a List that contains List objects,
769: * the inner lists will contain TextPosition objects.
770: *
771: * @return A double List of TextPositions for all text strings on the page.
772: */
773: protected List getCharactersByArticle() {
774: return charactersByArticle;
775: }
776:
777: /**
778: * By default the text stripper will attempt to remove text that overlapps each other.
779: * Word paints the same character several times in order to make it look bold. By setting
780: * this to false all text will be extracted, which means that certain sections will be
781: * duplicated, but better performance will be noticed.
782: *
783: * @param suppressDuplicateOverlappingTextValue The suppressDuplicateOverlappingText to set.
784: */
785: public void setSuppressDuplicateOverlappingText(
786: boolean suppressDuplicateOverlappingTextValue) {
787: this .suppressDuplicateOverlappingText = suppressDuplicateOverlappingTextValue;
788: }
789:
790: /**
791: * This will tell if the text stripper should separate by beads.
792: *
793: * @return If the text will be grouped by beads.
794: */
795: public boolean shouldSeparateByBeads() {
796: return shouldSeparateByBeads;
797: }
798:
799: /**
800: * Set if the text stripper should group the text output by a list of beads. The default value is true!
801: *
802: * @param aShouldSeparateByBeads The new grouping of beads.
803: */
804: public void setShouldSeparateByBeads(boolean aShouldSeparateByBeads) {
805: this .shouldSeparateByBeads = aShouldSeparateByBeads;
806: }
807:
808: /**
809: * Get the bookmark where text extraction should end, inclusive. Default is null.
810: *
811: * @return The ending bookmark.
812: */
813: public PDOutlineItem getEndBookmark() {
814: return endBookmark;
815: }
816:
817: /**
818: * Set the bookmark where the text extraction should stop.
819: *
820: * @param aEndBookmark The ending bookmark.
821: */
822: public void setEndBookmark(PDOutlineItem aEndBookmark) {
823: endBookmark = aEndBookmark;
824: }
825:
826: /**
827: * Get the bookmark where text extraction should start, inclusive. Default is null.
828: *
829: * @return The starting bookmark.
830: */
831: public PDOutlineItem getStartBookmark() {
832: return startBookmark;
833: }
834:
835: /**
836: * Set the bookmark where text extraction should start, inclusive.
837: *
838: * @param aStartBookmark The starting bookmark.
839: */
840: public void setStartBookmark(PDOutlineItem aStartBookmark) {
841: startBookmark = aStartBookmark;
842: }
843:
844: /**
845: * This will tell if the text stripper should sort the text tokens
846: * before writing to the stream.
847: *
848: * @return true If the text tokens will be sorted before being written.
849: */
850: public boolean shouldSortByPosition() {
851: return sortByPosition;
852: }
853:
854: /**
855: * The order of the text tokens in a PDF file may not be in the same
856: * as they appear visually on the screen. For example, a PDF writer may
857: * write out all text by font, so all bold or larger text, then make a second
858: * pass and write out the normal text.<br/>
859: * The default is to <b>not</b> sort by position.<br/>
860: * <br/>
861: * A PDF writer could choose to write each character in a different order. By
862: * default PDFBox does <b>not</b> sort the text tokens before processing them due to
863: * performance reasons.
864: *
865: * @param newSortByPosition Tell PDFBox to sort the text positions.
866: */
867: public void setSortByPosition(boolean newSortByPosition) {
868: sortByPosition = newSortByPosition;
869: }
870: }
|