001: package it.stefanochizzolini.clown.samples;
002:
003: import it.stefanochizzolini.clown.documents.Document;
004: import it.stefanochizzolini.clown.documents.Page;
005: import it.stefanochizzolini.clown.documents.Pages;
006: import it.stefanochizzolini.clown.documents.contents.objects.ContainerObject;
007: import it.stefanochizzolini.clown.documents.contents.objects.ContentObject;
008: import it.stefanochizzolini.clown.documents.contents.objects.Operation;
009: import it.stefanochizzolini.clown.documents.contents.objects.Text;
010: import it.stefanochizzolini.clown.files.File;
011: import it.stefanochizzolini.clown.objects.IPdfString;
012: import it.stefanochizzolini.clown.objects.PdfArray;
013: import it.stefanochizzolini.clown.objects.PdfDirectObject;
014: import it.stefanochizzolini.clown.objects.PdfName;
015: import it.stefanochizzolini.clown.tokens.FileFormatException;
016:
017: import java.util.List;
018: import java.util.Scanner;
019:
020: /**
021: This sample is a rough stub that demonstrates a basic way to extract text from
022: a document.
023: <h3>Remarks</h3>
024: <p>This implementation is definitely simplistic: its purpose is NOT to provide
025: a real-life solution for PDF text mining; it lacks advanced features such as
026: character encoding management, glyph position detection, dehyphenation,
027: page-breaks handling and so on. Its purpose is to test the new content-stream parsing
028: functionality.</p>
029: <p>So, read my lips: this-is-just-a-toy (for now!).</p>
030: */
031: public class TextExtractionSample implements ISample {
032: public void run(PDFClownSampleLoader loader) {
033: // (boilerplate user choice -- ignore it)
034: String filePath = loader
035: .getPdfFileChoice("Please select a PDF file");
036:
037: // Open the PDF file!
038: File file;
039: try {
040: file = new File(filePath);
041: } catch (FileFormatException e) {
042: throw new RuntimeException(filePath
043: + " file has a bad file format.", e);
044: } catch (Exception e) {
045: throw new RuntimeException(
046: filePath + " file access error.", e);
047: }
048:
049: // Get the PDF document!
050: Document document = file.getDocument();
051: // Get the page collection!
052: Pages pages = document.getPages();
053: //TODO:IMPL see PDF:1.6:5.9
054: for (Page page : pages) {
055: System.out.println("\nScanning page "
056: + (page.getIndex() + 1) + "...\n");
057:
058: extract(page.getContents());
059:
060: System.out.println("\nENTER (Scan next page)");
061: System.out.println("[Q] (End scanning)");
062: System.out.print("Please select: ");
063:
064: Scanner in = new Scanner(System.in);
065: try {
066: String choice = in.nextLine();
067: if (choice.toUpperCase().equals("Q")) // Quit.
068: break;
069: } catch (Exception e) {
070: }
071: }
072: }
073:
074: private void extract(List<? extends ContentObject> objects) {
075: for (ContentObject object : objects) {
076: if (object instanceof Text) {
077: for (Operation operation : ((Text) object).getObjects()) {
078: boolean hit = false;
079: String operator = operation.getOperator();
080: if (operator.equals("TJ") || operator.equals("Tj")
081: || operator.equals("'")
082: || operator.equals("''")) {
083: for (PdfDirectObject operand : operation
084: .getOperands()) {
085: if (operand instanceof IPdfString) {
086: hit = true;
087: System.out.print(((IPdfString) operand)
088: .getStringValue()
089: + " ");
090: } else if (operand instanceof PdfArray) {
091: for (PdfDirectObject item : ((PdfArray) operand)) {
092: if (item instanceof IPdfString) {
093: hit = true;
094: System.out
095: .print(((IPdfString) item)
096: .getStringValue()
097: + " ");
098: }
099: }
100: }
101: }
102: if (hit) {
103: System.out.println();
104: }
105: }
106: }
107: } else if (object instanceof ContainerObject) {
108: extract(((ContainerObject) object).getObjects());
109: }
110: }
111: }
112: }
|