Source Code Cross Referenced for PDFTextStripper.java in  » PDF » PDFBox-0.7.3 » org » pdfbox » util » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1. 6.0 JDK Core
2. 6.0 JDK Modules
3. 6.0 JDK Modules com.sun
4. 6.0 JDK Modules com.sun.java
5. 6.0 JDK Modules sun
6. 6.0 JDK Platform
7. Ajax
8. Apache Harmony Java SE
9. Aspect oriented
10. Authentication Authorization
11. Blogger System
12. Build
13. Byte Code
14. Cache
15. Chart
16. Chat
17. Code Analyzer
18. Collaboration
19. Content Management System
20. Database Client
21. Database DBMS
22. Database JDBC Connection Pool
23. Database ORM
24. Development
25. EJB Server geronimo
26. EJB Server GlassFish
27. EJB Server JBoss 4.2.1
28. EJB Server resin 3.1.5
29. ERP CRM Financial
30. ESB
31. Forum
32. GIS
33. Graphic Library
34. Groupware
35. HTML Parser
36. IDE
37. IDE Eclipse
38. IDE Netbeans
39. Installer
40. Internationalization Localization
41. Inversion of Control
42. Issue Tracking
43. J2EE
44. JBoss
45. JMS
46. JMX
47. Library
48. Mail Clients
49. Net
50. Parser
51. PDF
52. Portal
53. Profiler
54. Project Management
55. Report
56. RSS RDF
57. Rule Engine
58. Science
59. Scripting
60. Search Engine
61. Security
62. Servlet Container
63. Source Control
64. Swing Library
65. Template Engine
66. Test Coverage
67. Testing
68. UML
69. Web Crawler
70. Web Framework
71. Web Mail
72. Web Server
73. Web Services
74. Web Services apache cxf 2.0.1
75. Web Services AXIS2
76. Wiki Engine
77. Workflow Engines
78. XML
79. XML UI
Java
Java Tutorial
Java Open Source
Jar File Download
Java Articles
Java Products
Java by API
Photoshop Tutorials
Maya Tutorials
Flash Tutorials
3ds-Max Tutorials
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
ASP.Net
ASP.NET Tutorial
JavaScript DHTML
JavaScript Tutorial
JavaScript Reference
HTML / CSS
HTML CSS Reference
C / ANSI-C
C Tutorial
C++
C++ Tutorial
Ruby
PHP
Python
Python Tutorial
Python Open Source
SQL Server / T-SQL
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
PostgreSQL
SQL / MySQL
MySQL Tutorial
VB.Net
VB.Net Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » PDF » PDFBox 0.7.3 » org.pdfbox.util 
Source Cross Referenced  Class Diagram Java Document (Java Doc) 


001:        /**
002:         * Copyright (c) 2003-2005, www.pdfbox.org
003:         * All rights reserved.
004:         *
005:         * Redistribution and use in source and binary forms, with or without
006:         * modification, are permitted provided that the following conditions are met:
007:         *
008:         * 1. Redistributions of source code must retain the above copyright notice,
009:         *    this list of conditions and the following disclaimer.
010:         * 2. Redistributions in binary form must reproduce the above copyright notice,
011:         *    this list of conditions and the following disclaimer in the documentation
012:         *    and/or other materials provided with the distribution.
013:         * 3. Neither the name of pdfbox; nor the names of its
014:         *    contributors may be used to endorse or promote products derived from this
015:         *    software without specific prior written permission.
016:         *
017:         * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
018:         * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
019:         * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
020:         * DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
021:         * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
022:         * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
023:         * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
024:         * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
025:         * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
026:         * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
027:         *
028:         * http://www.pdfbox.org
029:         *
030:         */package org.pdfbox.util;
031:
032:        import java.io.IOException;
033:        import java.io.StringWriter;
034:        import java.io.Writer;
035:
036:        import java.util.ArrayList;
037:        import java.util.Collections;
038:        import java.util.HashMap;
039:        import java.util.Iterator;
040:        import java.util.List;
041:        import java.util.Map;
042:        import java.util.Properties;
043:        import java.util.Vector;
044:
045:        import org.pdfbox.cos.COSDocument;
046:        import org.pdfbox.cos.COSStream;
047:
048:        import org.pdfbox.pdmodel.PDDocument;
049:        import org.pdfbox.pdmodel.PDPage;
050:
051:        import org.pdfbox.pdmodel.common.PDRectangle;
052:        import org.pdfbox.pdmodel.common.PDStream;
053:
054:        import org.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
055:        import org.pdfbox.pdmodel.interactive.pagenavigation.PDThreadBead;
056:
057:        import org.pdfbox.exceptions.CryptographyException;
058:        import org.pdfbox.exceptions.InvalidPasswordException;
059:
060:        /**
061:         * This class will take a pdf document and strip out all of the text and ignore the
062:         * formatting and such.
063:         *
064:         * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
065:         * @version $Revision: 1.69 $
066:         */
067:        public class PDFTextStripper extends PDFStreamEngine {
068:            private int currentPageNo = 0;
069:            private int startPage = 1;
070:            private int endPage = Integer.MAX_VALUE;
071:            private PDOutlineItem startBookmark = null;
072:            private int startBookmarkPageNumber = -1;
073:            private PDOutlineItem endBookmark = null;
074:            private int endBookmarkPageNumber = -1;
075:            private PDDocument document;
076:            private boolean suppressDuplicateOverlappingText = true;
077:            private boolean shouldSeparateByBeads = true;
078:            private boolean sortByPosition = false;
079:
080:            private List pageArticles = null;
081:            /**
082:             * The charactersByArticle is used to extract text by article divisions.  For example
083:             * a PDF that has two columns like a newspaper, we want to extract the first column and
084:             * then the second column.  In this example the PDF would have 2 beads(or articles), one for
085:             * each column.  The size of the charactersByArticle would be 5, because not all text on the 
086:             * screen will fall into one of the articles.  The five divisions are shown below
087:             * 
088:             * Text before first article
089:             * first article text
090:             * text between first article and second article
091:             * second article text
092:             * text after second article
093:             * 
094:             * Most PDFs won't have any beads, so charactersByArticle will contain a single entry.
095:             */
096:            protected Vector charactersByArticle = new Vector();
097:
098:            private Map characterListMapping = new HashMap();
099:
100:            private String lineSeparator = System.getProperty("line.separator");
101:            private String pageSeparator = System.getProperty("line.separator");
102:            private String wordSeparator = " ";
103:
104:            /**
105:             * The stream to write the output to.
106:             */
107:            protected Writer output;
108:
109:            /**
110:             * Instantiate a new PDFTextStripper object.  This object will load properties from
111:             * Resources/PDFTextStripper.properties.
112:             * @throws IOException If there is an error loading the properties.
113:             */
114:            public PDFTextStripper() throws IOException {
115:                super (ResourceLoader
116:                        .loadProperties("Resources/PDFTextStripper.properties"));
117:            }
118:
119:            /**
120:             * Instantiate a new PDFTextStripper object.  Loading all of the operator mappings
121:             * from the properties object that is passed in.
122:             * 
123:             * @param props The properties containing the mapping of operators to PDFOperator 
124:             * classes.
125:             * 
126:             * @throws IOException If there is an error reading the properties.
127:             */
128:            public PDFTextStripper(Properties props) throws IOException {
129:                super (props);
130:            }
131:
132:            /**
133:             * This will return the text of a document.  See writeText. <br />
134:             * NOTE: The document must not be encrypted when coming into this method.
135:             *
136:             * @param doc The document to get the text from.
137:             *
138:             * @return The text of the PDF document.
139:             *
140:             * @throws IOException if the doc state is invalid or it is encrypted.
141:             */
142:            public String getText(PDDocument doc) throws IOException {
143:                StringWriter outputStream = new StringWriter();
144:                writeText(doc, outputStream);
145:                return outputStream.toString();
146:            }
147:
148:            /**
149:             * @deprecated
150:             * @see PDFTextStripper#getText( PDDocument )
151:             * @param doc The document to extract the text from.
152:             * @return The document text.
153:             * @throws IOException If there is an error extracting the text.
154:             */
155:            public String getText(COSDocument doc) throws IOException {
156:                return getText(new PDDocument(doc));
157:            }
158:
159:            /**
160:             * @deprecated
161:             * @see PDFTextStripper#writeText( PDDocument, Writer )
162:             * @param doc The document to extract the text.
163:             * @param outputStream The stream to write the text to.
164:             * @throws IOException If there is an error extracting the text.
165:             */
166:            public void writeText(COSDocument doc, Writer outputStream)
167:                    throws IOException {
168:                writeText(new PDDocument(doc), outputStream);
169:            }
170:
171:            /**
172:             * This will take a PDDocument and write the text of that document to the print writer.
173:             *
174:             * @param doc The document to get the data from.
175:             * @param outputStream The location to put the text.
176:             *
177:             * @throws IOException If the doc is in an invalid state.
178:             */
179:            public void writeText(PDDocument doc, Writer outputStream)
180:                    throws IOException {
181:                resetEngine();
182:
183:                currentPageNo = 0;
184:                document = doc;
185:                output = outputStream;
186:                startDocument(document);
187:
188:                if (document.isEncrypted()) {
189:                    // We are expecting non-encrypted documents here, but it is common
190:                    // for users to pass in a document that is encrypted with an empty
191:                    // password (such a document appears to not be encrypted by
192:                    // someone viewing the document, thus the confusion).  We will
193:                    // attempt to decrypt with the empty password to handle this case.
194:                    //
195:                    try {
196:                        document.decrypt("");
197:                    } catch (CryptographyException e) {
198:                        throw new IOException(
199:                                "Error decrypting document, details: "
200:                                        + e.getMessage());
201:                    } catch (InvalidPasswordException e) {
202:                        throw new IOException("Error: document is encrypted");
203:                    }
204:                }
205:
206:                processPages(document.getDocumentCatalog().getAllPages());
207:                endDocument(document);
208:            }
209:
210:            /**
211:             * This will process all of the pages and the text that is in them.
212:             *
213:             * @param pages The pages object in the document.
214:             *
215:             * @throws IOException If there is an error parsing the text.
216:             */
217:            protected void processPages(List pages) throws IOException {
218:                if (startBookmark != null) {
219:                    startBookmarkPageNumber = getPageNumber(startBookmark,
220:                            pages);
221:                }
222:
223:                if (endBookmark != null) {
224:                    endBookmarkPageNumber = getPageNumber(endBookmark, pages);
225:                }
226:
227:                if (startBookmarkPageNumber == -1
228:                        && startBookmark != null
229:                        && endBookmarkPageNumber == -1
230:                        && endBookmark != null
231:                        && startBookmark.getCOSObject() == endBookmark
232:                                .getCOSObject()) {
233:                    //this is a special case where both the start and end bookmark
234:                    //are the same but point to nothing.  In this case
235:                    //we will not extract any text.
236:                    startBookmarkPageNumber = 0;
237:                    endBookmarkPageNumber = 0;
238:                }
239:
240:                Iterator pageIter = pages.iterator();
241:                while (pageIter.hasNext()) {
242:                    PDPage nextPage = (PDPage) pageIter.next();
243:                    PDStream contentStream = nextPage.getContents();
244:                    if (contentStream != null) {
245:                        COSStream contents = contentStream.getStream();
246:                        processPage(nextPage, contents);
247:                    }
248:                }
249:            }
250:
251:            private int getPageNumber(PDOutlineItem bookmark, List allPages)
252:                    throws IOException {
253:                int pageNumber = -1;
254:                PDPage page = bookmark.findDestinationPage(document);
255:                if (page != null) {
256:                    pageNumber = allPages.indexOf(page) + 1;//use one based indexing
257:                }
258:                return pageNumber;
259:            }
260:
261:            /**
262:             * This method is available for subclasses of this class.  It will be called before processing
263:             * of the document start.
264:             * 
265:             * @param pdf The PDF document that is being processed.
266:             * @throws IOException If an IO error occurs.
267:             */
268:            protected void startDocument(PDDocument pdf) throws IOException {
269:                // no default implementation, but available for subclasses    
270:            }
271:
272:            /**
273:             * This method is available for subclasses of this class.  It will be called after processing
274:             * of the document finishes.
275:             * 
276:             * @param pdf The PDF document that is being processed.
277:             * @throws IOException If an IO error occurs.
278:             */
279:            protected void endDocument(PDDocument pdf) throws IOException {
280:                // no default implementation, but available for subclasses
281:            }
282:
283:            /**
284:             * This will process the contents of a page.
285:             *
286:             * @param page The page to process.
287:             * @param content The contents of the page.
288:             *
289:             * @throws IOException If there is an error processing the page.
290:             */
291:            protected void processPage(PDPage page, COSStream content)
292:                    throws IOException {
293:                currentPageNo++;
294:                if (currentPageNo >= startPage
295:                        && currentPageNo <= endPage
296:                        && (startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber)
297:                        && (endBookmarkPageNumber == -1 || currentPageNo <= endBookmarkPageNumber)) {
298:                    startPage(page);
299:                    pageArticles = page.getThreadBeads();
300:                    int numberOfArticleSections = 1 + pageArticles.size() * 2;
301:                    if (!shouldSeparateByBeads) {
302:                        numberOfArticleSections = 1;
303:                    }
304:                    int originalSize = charactersByArticle.size();
305:                    charactersByArticle.setSize(numberOfArticleSections);
306:                    for (int i = 0; i < numberOfArticleSections; i++) {
307:                        if (numberOfArticleSections < originalSize) {
308:                            ((List) charactersByArticle.get(i)).clear();
309:                        } else {
310:                            charactersByArticle.set(i, new ArrayList());
311:                        }
312:                    }
313:
314:                    characterListMapping.clear();
315:                    processStream(page, page.findResources(), content);
316:                    flushText();
317:                    endPage(page);
318:                }
319:
320:            }
321:
322:            /**
323:             * Start a new paragraph.  Default implementation is to do nothing.  Subclasses
324:             * may provide additional information.
325:             * 
326:             * @throws IOException If there is any error writing to the stream.
327:             */
328:            protected void startParagraph() throws IOException {
329:                //default is to do nothing.
330:            }
331:
332:            /**
333:             * End a paragraph.  Default implementation is to do nothing.  Subclasses
334:             * may provide additional information.
335:             * 
336:             * @throws IOException If there is any error writing to the stream.
337:             */
338:            protected void endParagraph() throws IOException {
339:                //default is to do nothing
340:            }
341:
342:            /**
343:             * Start a new page.  Default implementation is to do nothing.  Subclasses
344:             * may provide additional information.
345:             * 
346:             * @param page The page we are about to process.
347:             * 
348:             * @throws IOException If there is any error writing to the stream.
349:             */
350:            protected void startPage(PDPage page) throws IOException {
351:                //default is to do nothing.
352:            }
353:
354:            /**
355:             * End a page.  Default implementation is to do nothing.  Subclasses
356:             * may provide additional information.
357:             * 
358:             * @param page The page we are about to process.
359:             * 
360:             * @throws IOException If there is any error writing to the stream.
361:             */
362:            protected void endPage(PDPage page) throws IOException {
363:                //default is to do nothing
364:            }
365:
366:            /**
367:             * This will print the text to the output stream.
368:             *
369:             * @throws IOException If there is an error writing the text.
370:             */
371:            protected void flushText() throws IOException {
372:                float currentY = -1;
373:                float lastBaselineFontSize = -1;
374:                float endOfLastTextX = -1;
375:                float startOfNextWordX = -1;
376:                float lastWordSpacing = -1;
377:                TextPosition lastProcessedCharacter = null;
378:
379:                for (int i = 0; i < charactersByArticle.size(); i++) {
380:                    startParagraph();
381:                    List textList = (List) charactersByArticle.get(i);
382:                    if (sortByPosition) {
383:                        TextPositionComparator comparator = new TextPositionComparator(
384:                                getCurrentPage());
385:                        Collections.sort(textList, comparator);
386:                    }
387:                    Iterator textIter = textList.iterator();
388:                    while (textIter.hasNext()) {
389:                        TextPosition position = (TextPosition) textIter.next();
390:                        String characterValue = position.getCharacter();
391:
392:                        //wordSpacing = position.getWordSpacing();
393:                        float wordSpacing = 0;
394:
395:                        if (wordSpacing == 0) {
396:                            //try to get width of a space character
397:                            wordSpacing = position.getWidthOfSpace();
398:                            //if still zero fall back to getting the width of the current
399:                            //character
400:                            if (wordSpacing == 0) {
401:                                wordSpacing = position.getWidth();
402:                            }
403:                        }
404:
405:                        // RDD - We add a conservative approximation for space determination.
406:                        // basically if there is a blank area between two characters that is
407:                        //equal to some percentage of the word spacing then that will be the
408:                        //start of the next word
409:                        if (lastWordSpacing <= 0) {
410:                            startOfNextWordX = endOfLastTextX
411:                                    + (wordSpacing * 0.50f);
412:                        } else {
413:                            startOfNextWordX = endOfLastTextX
414:                                    + (((wordSpacing + lastWordSpacing) / 2f) * 0.50f);
415:                        }
416:
417:                        lastWordSpacing = wordSpacing;
418:
419:                        // RDD - We will suppress text that is very close to the current line
420:                        // and which overwrites previously rendered text on this line.
421:                        // This is done specifically to handle a reasonably common situation
422:                        // where an application (MS Word, in the case of my examples) renders
423:                        // text four times at small (1 point) offsets in order to accomplish
424:                        // bold printing.  You would not want to do this step if you were
425:                        // going to render the TextPosition objects graphically.
426:                        //
427:                        /*if ((endOfLastTextX != -1 && position.getX() < endOfLastTextX) &&
428:                            (currentY != -1 && Math.abs(position.getY() - currentY) < 1))
429:                        {
430:                            if (log.isDebugEnabled())
431:                            {
432:                                log.debug("Suppressing text overwrite" +
433:                                          " x: " + position.getX() +
434:                                          " endOfLastTextX: " + endOfLastTextX +
435:                                          " string: " + position.getCharacter());
436:                            }
437:                            continue;
438:                        }*/
439:
440:                        // RDD - Here we determine whether this text object is on the current
441:                        // line.  We use the lastBaselineFontSize to handle the superscript
442:                        // case, and the size of the current font to handle the subscript case.
443:                        // Text must overlap with the last rendered baseline text by at least
444:                        // a small amount in order to be considered as being on the same line.
445:                        //
446:                        int verticalScaling = 1;
447:                        if (lastBaselineFontSize < 0
448:                                || position.getFontSize() < 0) {
449:                            verticalScaling = -1;
450:                        }
451:                        if (currentY != -1
452:                                && ((position.getY() < (currentY - (lastBaselineFontSize * 0.9f * verticalScaling))) || (position
453:                                        .getY() > (currentY + (position
454:                                        .getFontSize() * 0.9f * verticalScaling))))) {
455:                            output.write(getLineSeparator());
456:                            endOfLastTextX = -1;
457:                            startOfNextWordX = -1;
458:                            currentY = -1;
459:                            lastBaselineFontSize = -1;
460:                        }
461:
462:                        if (startOfNextWordX != -1
463:                                && startOfNextWordX < position.getX()
464:                                && lastProcessedCharacter != null
465:                                &&
466:                                //only bother adding a space if the last character was not a space
467:                                lastProcessedCharacter.getCharacter() != null
468:                                && !lastProcessedCharacter.getCharacter()
469:                                        .endsWith(" ")) {
470:                            output.write(getWordSeparator());
471:                        }
472:
473:                        if (currentY == -1) {
474:                            currentY = position.getY();
475:                        }
476:
477:                        if (currentY == position.getY()) {
478:                            lastBaselineFontSize = position.getFontSize();
479:                        }
480:
481:                        // RDD - endX is what PDF considers to be the x coordinate of the
482:                        // end position of the text.  We use it in computing our metrics below.
483:                        //
484:                        endOfLastTextX = position.getX() + position.getWidth();
485:
486:                        if (characterValue != null) {
487:                            writeCharacters(position);
488:                        } else {
489:                            //Position.getString() is null so not writing anything
490:                        }
491:                        lastProcessedCharacter = position;
492:                    }
493:                    endParagraph();
494:                }
495:
496:                // RDD - newline at end of flush - required for end of page (so that the top
497:                // of the next page starts on its own line.
498:                //
499:                output.write(getPageSeparator());
500:
501:                output.flush();
502:            }
503:
504:            /**
505:             * Write the string to the output stream.
506:             *  
507:             * @param text The text to write to the stream.
508:             * @throws IOException If there is an error when writing the text.
509:             */
510:            protected void writeCharacters(TextPosition text)
511:                    throws IOException {
512:                output.write(text.getCharacter());
513:            }
514:
515:            /**
516:             * This will determine of two floating point numbers are within a specified variance.
517:             *
518:             * @param first The first number to compare to.
519:             * @param second The second number to compare to.
520:             * @param variance The allowed variance.
521:             */
522:            private boolean within(float first, float second, float variance) {
523:                return second > first - variance && second < first + variance;
524:            }
525:
526:            /**
527:             * This will add a character to the list of characters to be printed to
528:             * the text file.
529:             *
530:             * @param text The description of the character to display.
531:             */
532:            protected void showCharacter(TextPosition text) {
533:                boolean showCharacter = true;
534:                if (suppressDuplicateOverlappingText) {
535:                    showCharacter = false;
536:                    String textCharacter = text.getCharacter();
537:                    float textX = text.getX();
538:                    float textY = text.getY();
539:                    List sameTextCharacters = (List) characterListMapping
540:                            .get(textCharacter);
541:                    if (sameTextCharacters == null) {
542:                        sameTextCharacters = new ArrayList();
543:                        characterListMapping.put(textCharacter,
544:                                sameTextCharacters);
545:                    }
546:
547:                    // RDD - Here we compute the value that represents the end of the rendered
548:                    // text.  This value is used to determine whether subsequent text rendered
549:                    // on the same line overwrites the current text.
550:                    //
551:                    // We subtract any positive padding to handle cases where extreme amounts
552:                    // of padding are applied, then backed off (not sure why this is done, but there
553:                    // are cases where the padding is on the order of 10x the character width, and
554:                    // the TJ just backs up to compensate after each character).  Also, we subtract
555:                    // an amount to allow for kerning (a percentage of the width of the last
556:                    // character).
557:                    //
558:                    boolean suppressCharacter = false;
559:                    float tolerance = (text.getWidth() / textCharacter.length()) / 3.0f;
560:                    for (int i = 0; i < sameTextCharacters.size()
561:                            && textCharacter != null; i++) {
562:                        TextPosition character = (TextPosition) sameTextCharacters
563:                                .get(i);
564:                        String charCharacter = character.getCharacter();
565:                        float charX = character.getX();
566:                        float charY = character.getY();
567:                        //only want to suppress
568:
569:                        if (charCharacter != null
570:                                &&
571:                                //charCharacter.equals( textCharacter ) &&
572:                                within(charX, textX, tolerance)
573:                                && within(charY, textY, tolerance)) {
574:                            suppressCharacter = true;
575:                        }
576:                    }
577:                    if (!suppressCharacter) {
578:                        sameTextCharacters.add(text);
579:                        showCharacter = true;
580:                    }
581:                }
582:
583:                if (showCharacter) {
584:                    //if we are showing the character then we need to determine which
585:                    //article it belongs to.
586:                    int foundArticleDivisionIndex = -1;
587:                    int notFoundButFirstLeftAndAboveArticleDivisionIndex = -1;
588:                    int notFoundButFirstLeftArticleDivisionIndex = -1;
589:                    int notFoundButFirstAboveArticleDivisionIndex = -1;
590:                    float x = text.getX();
591:                    float y = text.getY();
592:                    if (shouldSeparateByBeads) {
593:                        for (int i = 0; i < pageArticles.size()
594:                                && foundArticleDivisionIndex == -1; i++) {
595:                            PDThreadBead bead = (PDThreadBead) pageArticles
596:                                    .get(i);
597:                            if (bead != null) {
598:                                PDRectangle rect = bead.getRectangle();
599:                                if (rect.contains(x, y)) {
600:                                    foundArticleDivisionIndex = i * 2 + 1;
601:                                } else if ((x < rect.getLowerLeftX() || y < rect
602:                                        .getUpperRightY())
603:                                        && notFoundButFirstLeftAndAboveArticleDivisionIndex == -1) {
604:                                    notFoundButFirstLeftAndAboveArticleDivisionIndex = i * 2;
605:                                } else if (x < rect.getLowerLeftX()
606:                                        && notFoundButFirstLeftArticleDivisionIndex == -1) {
607:                                    notFoundButFirstLeftArticleDivisionIndex = i * 2;
608:                                } else if (y < rect.getUpperRightY()
609:                                        && notFoundButFirstAboveArticleDivisionIndex == -1) {
610:                                    notFoundButFirstAboveArticleDivisionIndex = i * 2;
611:                                }
612:                            } else {
613:                                foundArticleDivisionIndex = 0;
614:                            }
615:                        }
616:                    } else {
617:                        foundArticleDivisionIndex = 0;
618:                    }
619:                    int articleDivisionIndex = -1;
620:                    if (foundArticleDivisionIndex != -1) {
621:                        articleDivisionIndex = foundArticleDivisionIndex;
622:                    } else if (notFoundButFirstLeftAndAboveArticleDivisionIndex != -1) {
623:                        articleDivisionIndex = notFoundButFirstLeftAndAboveArticleDivisionIndex;
624:                    } else if (notFoundButFirstLeftArticleDivisionIndex != -1) {
625:                        articleDivisionIndex = notFoundButFirstLeftArticleDivisionIndex;
626:                    } else if (notFoundButFirstAboveArticleDivisionIndex != -1) {
627:                        articleDivisionIndex = notFoundButFirstAboveArticleDivisionIndex;
628:                    } else {
629:                        articleDivisionIndex = charactersByArticle.size() - 1;
630:                    }
631:                    List textList = (List) charactersByArticle
632:                            .get(articleDivisionIndex);
633:                    textList.add(text);
634:                }
635:            }
636:
637:            /**
638:             * This is the page that the text extraction will start on.  The pages start
639:             * at page 1.  For example in a 5 page PDF document, if the start page is 1
640:             * then all pages will be extracted.  If the start page is 4 then pages 4 and 5
641:             * will be extracted.  The default value is 1.
642:             *
643:             * @return Value of property startPage.
644:             */
645:            public int getStartPage() {
646:                return startPage;
647:            }
648:
649:            /**
650:             * This will set the first page to be extracted by this class.
651:             *
652:             * @param startPageValue New value of property startPage.
653:             */
654:            public void setStartPage(int startPageValue) {
655:                startPage = startPageValue;
656:            }
657:
658:            /**
659:             * This will get the last page that will be extracted.  This is inclusive,
660:             * for example if a 5 page PDF an endPage value of 5 would extract the
661:             * entire document, an end page of 2 would extract pages 1 and 2.  This defaults
662:             * to Integer.MAX_VALUE such that all pages of the pdf will be extracted.
663:             *
664:             * @return Value of property endPage.
665:             */
666:            public int getEndPage() {
667:                return endPage;
668:            }
669:
670:            /**
671:             * This will set the last page to be extracted by this class.
672:             *
673:             * @param endPageValue New value of property endPage.
674:             */
675:            public void setEndPage(int endPageValue) {
676:                endPage = endPageValue;
677:            }
678:
679:            /**
680:             * Set the desired line separator for output text.  The line.separator
681:             * system property is used if the line separator preference is not set
682:             * explicitly using this method.
683:             *
684:             * @param separator The desired line separator string.
685:             */
686:            public void setLineSeparator(String separator) {
687:                lineSeparator = separator;
688:            }
689:
690:            /**
691:             * This will get the line separator.
692:             *
693:             * @return The desired line separator string.
694:             */
695:            public String getLineSeparator() {
696:                return lineSeparator;
697:            }
698:
699:            /**
700:             * Set the desired page separator for output text.  The line.separator
701:             * system property is used if the page separator preference is not set
702:             * explicitly using this method.
703:             *
704:             * @param separator The desired page separator string.
705:             */
706:            public void setPageSeparator(String separator) {
707:                pageSeparator = separator;
708:            }
709:
710:            /**
711:             * This will get the word separator.
712:             *
713:             * @return The desired word separator string.
714:             */
715:            public String getWordSeparator() {
716:                return wordSeparator;
717:            }
718:
719:            /**
720:             * Set the desired word separator for output text.  The PDFBox text extraction
721:             * algorithm will output a space character if there is enough space between
722:             * two words.  By default a space character is used.  If you need and accurate
723:             * count of characters that are found in a PDF document then you might want to
724:             * set the word separator to the empty string.
725:             *
726:             * @param separator The desired page separator string.
727:             */
728:            public void setWordSeparator(String separator) {
729:                wordSeparator = separator;
730:            }
731:
732:            /**
733:             * This will get the page separator.
734:             *
735:             * @return The page separator string.
736:             */
737:            public String getPageSeparator() {
738:                return pageSeparator;
739:            }
740:
741:            /**
742:             * @return Returns the suppressDuplicateOverlappingText.
743:             */
744:            public boolean shouldSuppressDuplicateOverlappingText() {
745:                return suppressDuplicateOverlappingText;
746:            }
747:
748:            /**
749:             * Get the current page number that is being processed.
750:             * 
751:             * @return A 1 based number representing the current page.
752:             */
753:            protected int getCurrentPageNo() {
754:                return currentPageNo;
755:            }
756:
757:            /**
758:             * The output stream that is being written to.
759:             * 
760:             * @return The stream that output is being written to.
761:             */
762:            protected Writer getOutput() {
763:                return output;
764:            }
765:
766:            /**
767:             * Character strings are grouped by articles.  It is quite common that there
768:             * will only be a single article.  This returns a List that contains List objects,
769:             * the inner lists will contain TextPosition objects.
770:             * 
771:             * @return A double List of TextPositions for all text strings on the page.
772:             */
773:            protected List getCharactersByArticle() {
774:                return charactersByArticle;
775:            }
776:
777:            /**
778:             * By default the text stripper will attempt to remove text that overlapps each other.
779:             * Word paints the same character several times in order to make it look bold.  By setting
780:             * this to false all text will be extracted, which means that certain sections will be 
781:             * duplicated, but better performance will be noticed.
782:             * 
783:             * @param suppressDuplicateOverlappingTextValue The suppressDuplicateOverlappingText to set.
784:             */
785:            public void setSuppressDuplicateOverlappingText(
786:                    boolean suppressDuplicateOverlappingTextValue) {
787:                this .suppressDuplicateOverlappingText = suppressDuplicateOverlappingTextValue;
788:            }
789:
790:            /**
791:             * This will tell if the text stripper should separate by beads.
792:             * 
793:             * @return If the text will be grouped by beads.
794:             */
795:            public boolean shouldSeparateByBeads() {
796:                return shouldSeparateByBeads;
797:            }
798:
799:            /**
800:             * Set if the text stripper should group the text output by a list of beads.  The default value is true!
801:             * 
802:             * @param aShouldSeparateByBeads The new grouping of beads.
803:             */
804:            public void setShouldSeparateByBeads(boolean aShouldSeparateByBeads) {
805:                this .shouldSeparateByBeads = aShouldSeparateByBeads;
806:            }
807:
808:            /**
809:             * Get the bookmark where text extraction should end, inclusive.  Default is null.
810:             * 
811:             * @return The ending bookmark.
812:             */
813:            public PDOutlineItem getEndBookmark() {
814:                return endBookmark;
815:            }
816:
817:            /**
818:             * Set the bookmark where the text extraction should stop.
819:             * 
820:             * @param aEndBookmark The ending bookmark.
821:             */
822:            public void setEndBookmark(PDOutlineItem aEndBookmark) {
823:                endBookmark = aEndBookmark;
824:            }
825:
826:            /**
827:             * Get the bookmark where text extraction should start, inclusive.  Default is null.
828:             * 
829:             * @return The starting bookmark.
830:             */
831:            public PDOutlineItem getStartBookmark() {
832:                return startBookmark;
833:            }
834:
835:            /**
836:             * Set the bookmark where text extraction should start, inclusive.
837:             * 
838:             * @param aStartBookmark The starting bookmark.
839:             */
840:            public void setStartBookmark(PDOutlineItem aStartBookmark) {
841:                startBookmark = aStartBookmark;
842:            }
843:
844:            /**
845:             * This will tell if the text stripper should sort the text tokens
846:             * before writing to the stream.
847:             * 
848:             * @return true If the text tokens will be sorted before being written.
849:             */
850:            public boolean shouldSortByPosition() {
851:                return sortByPosition;
852:            }
853:
854:            /**
855:             * The order of the text tokens in a PDF file may not be in the same
856:             * as they appear visually on the screen.  For example, a PDF writer may
857:             * write out all text by font, so all bold or larger text, then make a second
858:             * pass and write out the normal text.<br/>
859:             * The default is to <b>not</b> sort by position.<br/>
860:             * <br/>
861:             * A PDF writer could choose to write each character in a different order.  By
862:             * default PDFBox does <b>not</b> sort the text tokens before processing them due to
863:             * performance reasons.
864:             *     
865:             * @param newSortByPosition Tell PDFBox to sort the text positions.
866:             */
867:            public void setSortByPosition(boolean newSortByPosition) {
868:                sortByPosition = newSortByPosition;
869:            }
870:        }
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.