001: /**
002: * Copyright (c) 2006, www.pdfbox.org
003: * All rights reserved.
004: *
005: * Redistribution and use in source and binary forms, with or without
006: * modification, are permitted provided that the following conditions are met:
007: *
008: * 1. Redistributions of source code must retain the above copyright notice,
009: * this list of conditions and the following disclaimer.
010: * 2. Redistributions in binary form must reproduce the above copyright notice,
011: * this list of conditions and the following disclaimer in the documentation
012: * and/or other materials provided with the distribution.
013: * 3. Neither the name of pdfbox; nor the names of its
014: * contributors may be used to endorse or promote products derived from this
015: * software without specific prior written permission.
016: *
017: * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
018: * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
019: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
020: * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
021: * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
022: * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
023: * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
024: * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
025: * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
026: * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
027: *
028: * http://www.pdfbox.org
029: *
030: */package org.pdfbox.examples.util;
031:
032: import org.pdfbox.pdfparser.PDFStreamParser;
033: import org.pdfbox.pdfwriter.ContentStreamWriter;
034:
035: import org.pdfbox.pdmodel.PDDocument;
036: import org.pdfbox.pdmodel.PDPage;
037: import org.pdfbox.pdmodel.common.PDStream;
038: import org.pdfbox.util.PDFOperator;
039:
040: import java.util.ArrayList;
041: import java.util.List;
042:
043: /**
044: * This is an example on how to remove all text from PDF document.
045: *
046: * Usage: java org.pdfbox.examples.util.RemoveAllText <input-pdf> <output-pdf>
047: *
048: * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
049: * @version $Revision: 1.2 $
050: */
051: public class RemoveAllText {
052: /**
053: * Default constructor.
054: */
055: private RemoveAllText() {
056: //example class should not be instantiated
057: }
058:
059: /**
060: * This will remove all text from a PDF document.
061: *
062: * @param args The command line arguments.
063: *
064: * @throws Exception If there is an error parsing the document.
065: */
066: public static void main(String[] args) throws Exception {
067: if (args.length != 2) {
068: usage();
069: } else {
070: PDDocument document = null;
071: try {
072: document = PDDocument.load(args[0]);
073: if (document.isEncrypted()) {
074: System.err
075: .println("Error: Encrypted documents are not supported for this example.");
076: System.exit(1);
077: }
078: List allPages = document.getDocumentCatalog()
079: .getAllPages();
080: for (int i = 0; i < allPages.size(); i++) {
081: PDPage page = (PDPage) allPages.get(i);
082: PDFStreamParser parser = new PDFStreamParser(page
083: .getContents());
084: parser.parse();
085: List tokens = parser.getTokens();
086: List newTokens = new ArrayList();
087: for (int j = 0; j < tokens.size(); j++) {
088: Object token = tokens.get(j);
089: if (token instanceof PDFOperator) {
090: PDFOperator op = (PDFOperator) token;
091: if (op.getOperation().equals("TJ")
092: || op.getOperation().equals("Tj")) {
093: //remove the one argument to this operator
094: newTokens.remove(newTokens.size() - 1);
095: continue;
096: }
097: }
098: newTokens.add(token);
099:
100: }
101: PDStream newContents = new PDStream(document);
102: ContentStreamWriter writer = new ContentStreamWriter(
103: newContents.createOutputStream());
104: writer.writeTokens(newTokens);
105: newContents.addCompression();
106: page.setContents(newContents);
107: }
108: document.save(args[1]);
109: } finally {
110: if (document != null) {
111: document.close();
112: }
113: }
114: }
115: }
116:
117: /**
118: * This will print the usage for this document.
119: */
120: private static void usage() {
121: System.err
122: .println("Usage: java org.pdfbox.examples.pdmodel.RemoveAllText <input-pdf> <output-pdf>");
123: }
124:
125: }
|