001: package com.canoo.webtest.plugins.pdftest.htmlunit.pdfbox;
002:
003: import java.awt.geom.Rectangle2D;
004: import java.io.IOException;
005: import java.io.StringWriter;
006: import java.util.ArrayList;
007: import java.util.HashMap;
008: import java.util.Iterator;
009: import java.util.List;
010: import java.util.ListIterator;
011: import java.util.Map;
012: import java.util.TreeMap;
013:
014: import org.apache.commons.collections.Predicate;
015: import org.apache.commons.collections.functors.AndPredicate;
016: import org.apache.commons.collections.functors.TruePredicate;
017: import org.apache.commons.io.IOUtils;
018: import org.apache.commons.logging.Log;
019: import org.apache.commons.logging.LogFactory;
020: import org.pdfbox.cos.COSBase;
021: import org.pdfbox.cos.COSBoolean;
022: import org.pdfbox.cos.COSDictionary;
023: import org.pdfbox.cos.COSFloat;
024: import org.pdfbox.cos.COSInteger;
025: import org.pdfbox.cos.COSName;
026: import org.pdfbox.cos.COSNull;
027: import org.pdfbox.cos.COSString;
028: import org.pdfbox.exceptions.InvalidPasswordException;
029: import org.pdfbox.pdmodel.PDDocument;
030: import org.pdfbox.pdmodel.PDPage;
031: import org.pdfbox.pdmodel.common.PDRectangle;
032: import org.pdfbox.pdmodel.encryption.PDStandardEncryption;
033: import org.pdfbox.pdmodel.font.PDFont;
034: import org.pdfbox.pdmodel.interactive.action.type.PDAction;
035: import org.pdfbox.pdmodel.interactive.action.type.PDActionGoTo;
036: import org.pdfbox.pdmodel.interactive.action.type.PDActionURI;
037: import org.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
038: import org.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
039: import org.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
040: import org.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
041: import org.pdfbox.pdmodel.interactive.form.PDAcroForm;
042: import org.pdfbox.pdmodel.interactive.form.PDField;
043: import org.pdfbox.util.PDFTextStripper;
044: import org.pdfbox.util.PDFTextStripperByArea;
045: import org.pdfbox.util.TextPosition;
046:
047: import com.canoo.webtest.plugins.pdftest.htmlunit.PDFEncryptionPermission;
048: import com.canoo.webtest.plugins.pdftest.htmlunit.PDFField;
049: import com.canoo.webtest.plugins.pdftest.htmlunit.PDFInvalidPasswordException;
050: import com.canoo.webtest.plugins.pdftest.htmlunit.PDFPage;
051: import com.gargoylesoftware.htmlunit.WebResponse;
052: import com.gargoylesoftware.htmlunit.WebWindow;
053:
054: /**
055: * Implementation of {@link PDFPage} based on <a href="http://www.pdfbox.org/">PDFBox</a>.
056: * @author Etienne Studer
057: * @author Paul King
058: * @author Marc Guillemot
059: */
060: public class PdfBoxPDFPage implements PDFPage {
061: private PDDocument pdfDocument_;
062: private List bookmarks_;
063: private final WebWindow webWindow_;
064: private final WebResponse webResponse_;
065:
066: private static final COSName INFO_PROPERTY_TITLE = COSName
067: .getPDFName("Title"); // title of document
068:
069: private boolean cleanUpCalled;
070: private static int counter = 0;
071: private static int allocated = 0;
072:
073: public void cleanUp() throws IOException {
074: cleanUpCalled = true;
075: allocated--;
076: if (pdfDocument_ != null)
077: pdfDocument_.close();
078: }
079:
080: public PdfBoxPDFPage(final WebResponse webResponse,
081: final WebWindow webWindow) {
082: webWindow_ = webWindow;
083: webResponse_ = webResponse;
084:
085: pdfDocument_ = loadPDFDocument();
086: counter++;
087: allocated++;
088: }
089:
090: protected PDDocument loadPDFDocument() {
091: try {
092: return PDDocument.load(getWebResponse()
093: .getContentAsStream());
094: } catch (final IOException e) {
095: getLog().warn(
096: "Failed parsing PDF document "
097: + getWebResponse().getUrl() + ": "
098: + e.getMessage(), e);
099: }
100:
101: return null;
102: }
103:
104: /**
105: * Return the log object for this web client
106: * @return The log object
107: */
108: protected final Log getLog() {
109: return LogFactory.getLog(getClass());
110: }
111:
112: private COSDictionary getInfoDictionary() {
113: final COSDictionary encryptProperties = getPDFDocument()
114: .getDocumentInformation().getDictionary();
115: return encryptProperties != null ? encryptProperties
116: : new COSDictionary();
117: }
118:
119: private static void assertKeyExists(COSName key,
120: COSDictionary properties) {
121: if (properties.keyList().contains(key)) {
122: return;
123: }
124:
125: throw new IllegalArgumentException("Specified property key '"
126: + key.getName() + "' does not exist.");
127: }
128:
129: public String getDocumentTitle() {
130: assertKeyExists(INFO_PROPERTY_TITLE, getInfoDictionary());
131: COSString title = (COSString) getInfoDictionary().getItem(
132: INFO_PROPERTY_TITLE);
133: return title.getString();
134: }
135:
136: public WebWindow getEnclosingWindow() {
137: return webWindow_;
138: }
139:
140: public WebResponse getWebResponse() {
141: return webResponse_;
142: }
143:
144: public void initialize() throws IOException {
145: // TODO Auto-generated method stub
146:
147: }
148:
149: public int getNumberOfPages() {
150: return getPDFDocument().getNumberOfPages();
151: }
152:
153: /**
154: * Gets the PDF document
155: * @return the document
156: * @throws RuntimeException if the PDF document couldn't be parsed
157: */
158: protected PDDocument getPDFDocument() {
159: if (cleanUpCalled) {
160: pdfDocument_ = loadPDFDocument();
161: cleanUpCalled = false;
162: }
163: if (pdfDocument_ == null)
164: throw new RuntimeException(
165: "Can't work on pdf document as it couldn't get parsed");
166: return pdfDocument_;
167: }
168:
169: public List getFields() {
170: return getFields(TruePredicate.INSTANCE);
171: }
172:
173: public void decrypt(String password) {
174: try {
175: getPDFDocument().decrypt(password);
176: } catch (final InvalidPasswordException e) {
177: throw new PDFInvalidPasswordException(e);
178: } catch (final Exception e) {
179: throw new RuntimeException(
180: "Problem decrypting the document", e);
181: }
182: }
183:
184: public boolean isEncrypted() {
185: return getPDFDocument().isEncrypted();
186: }
187:
188: public String getText(int startPage, int endPage) {
189: return getTextInternal(startPage, endPage);
190: }
191:
192: protected String getTextInternal(int startPage, int endPage) {
193: try {
194: final PDFTextStripper textStripper = new PDFTextStripper();
195: textStripper.setStartPage(startPage);
196: textStripper.setEndPage(endPage);
197: return textStripper.getText(getPDFDocument());
198: } catch (final IOException e) {
199: throw new RuntimeException("Problem extracting text", e);
200: }
201: }
202:
203: protected List getFields(final Predicate filter) {
204: final PDAcroForm acroForm = getPDFDocument()
205: .getDocumentCatalog().getAcroForm();
206: final List response = new ArrayList();
207:
208: try {
209: if (acroForm != null) {
210: final List fields = acroForm.getFields();
211: for (final Iterator iter = fields.iterator(); iter
212: .hasNext();) {
213: final PDField field = (PDField) iter.next();
214: final List kids = field.getKids();
215: if (kids != null && !kids.isEmpty()) {
216: for (final Iterator iterKids = kids.iterator(); iterKids
217: .hasNext();) {
218: final PDField childField = (PDField) iterKids
219: .next();
220: if (filter.evaluate(childField)) {
221: response.add(PdfBoxPDFField
222: .wrap(childField));
223: }
224: }
225: } else if (filter.evaluate(field)) {
226: response.add(PdfBoxPDFField.wrap(field));
227: }
228: }
229: }
230: } catch (final IOException e) {
231: throw new RuntimeException("Failed reading fields", e);
232: }
233:
234: return response;
235: }
236:
237: public List getFields(final String name, final PDFField.Type type) {
238: return getFields(PdfBoxPDFField.FieldPredicate
239: .buildNamePredicate(name));
240: }
241:
242: public List getFields(final String name, final int pageNumber,
243: final PDFField.Type type) {
244: final Predicate predicateName = PdfBoxPDFField.FieldPredicate
245: .buildNamePredicate(name);
246: final Predicate predicatePage = PdfBoxPDFField.FieldPredicate
247: .buildPageNumberPredicate(pageNumber);
248:
249: final Predicate predicate = new AndPredicate(predicateName,
250: predicatePage);
251:
252: return getFields(predicate);
253: }
254:
255: public boolean hasPermission(
256: final PDFEncryptionPermission permission) {
257: final PDStandardEncryption info;
258: try {
259: info = (PDStandardEncryption) getPDFDocument()
260: .getEncryptionDictionary();
261: } catch (final IOException e) {
262: throw new RuntimeException("Can't read permissions", e);
263: }
264:
265: if (PDFEncryptionPermission.ASSEMBLY.equals(permission))
266: return info.canAssembleDocument();
267: else if (PDFEncryptionPermission.COPY.equals(permission))
268: return info.canExtractContent();
269: else if (PDFEncryptionPermission.DEGRADED_PRINTING
270: .equals(permission))
271: return info.canPrintDegraded();
272: else if (PDFEncryptionPermission.FILL_IN.equals(permission))
273: return info.canFillInForm();
274: else if (PDFEncryptionPermission.MODIFY_ANNOTATIONS
275: .equals(permission))
276: return info.canModifyAnnotations();
277: else if (PDFEncryptionPermission.MODIFY_CONTENTS
278: .equals(permission))
279: return info.canModify();
280: else if (PDFEncryptionPermission.PRINTING.equals(permission))
281: return info.canPrint();
282: else if (PDFEncryptionPermission.SCREEN_READERS
283: .equals(permission))
284: return info.canExtractForAccessibility();
285:
286: throw new IllegalArgumentException("Unknown pdf permission: "
287: + permission);
288: }
289:
290: public String getEncryptProperty(final String key) {
291: final COSDictionary encryptProperties = getPDFDocument()
292: .getDocument().getEncryptionDictionary();
293: return stringValue(encryptProperties.getDictionaryObject(key));
294: }
295:
296: static String stringValue(final COSBase element) {
297: if (element == null) {
298: return null;
299: } else if (element instanceof COSString) {
300: return ((COSString) element).getString();
301: } else if (element instanceof COSName) {
302: return ((COSName) element).getName();
303: } else if (element instanceof COSBoolean) {
304: return String.valueOf(((COSBoolean) element).getValue());
305: } else if (element instanceof COSInteger) {
306: return String.valueOf(((COSInteger) element).intValue());
307: } else if (element instanceof COSFloat) {
308: return String.valueOf(((COSFloat) element).floatValue());
309: } else if (element instanceof COSNull) {
310: return "null";
311: } else
312: return String.valueOf(element);
313: }
314:
315: public int getEncryptionStrength() {
316: try {
317: return getPDFDocument().getEncryptionDictionary()
318: .getLength();
319: } catch (final IOException e) {
320: throw new RuntimeException(
321: "Failed reading encryption strength", e);
322: }
323: }
324:
325: public String getInfoProperty(final String key) {
326: final COSDictionary properties = getPDFDocument()
327: .getDocumentInformation().getDictionary();
328: if (properties == null)
329: return null;
330:
331: final COSName pdfName = COSName.getPDFName(key);
332: return stringValue(properties.getDictionaryObject(pdfName));
333: }
334:
335: public boolean isUserPassword(final String password) {
336: try {
337: return getPDFDocument().isUserPassword(password);
338: } catch (final Exception e) {
339: throw new RuntimeException(
340: "Failed verifying user password", e);
341: }
342: }
343:
344: public boolean isOwnerPassword(final String password) {
345: try {
346: return getPDFDocument().isOwnerPassword(password);
347: } catch (final Exception e) {
348: throw new RuntimeException(
349: "Failed verifying owner password", e);
350: }
351: }
352:
353: public List getBookmarks() {
354: if (bookmarks_ == null)
355: bookmarks_ = extractBookmarks();
356:
357: return bookmarks_;
358: }
359:
360: private List extractBookmarks() {
361: final PDDocumentOutline outline = getPDFDocument()
362: .getDocumentCatalog().getDocumentOutline();
363: final List result = new ArrayList();
364: if (outline != null) {
365: PDOutlineItem child = outline.getFirstChild();
366: while (child != null) {
367: final PdfBoxPDFBookmark topBookmark = new PdfBoxPDFBookmark(
368: child, null);
369: result.add(topBookmark);
370: result.addAll(topBookmark.getAllChildren());
371: child = child.getNextSibling();
372: }
373: }
374: return result;
375: }
376:
377: public List getFonts() {
378: final List fonts = new ArrayList();
379: final List pages = getPDFDocument().getDocumentCatalog()
380: .getAllPages();
381: for (final ListIterator iter = pages.listIterator(); iter
382: .hasNext();) {
383: final PDPage page = (PDPage) iter.next();
384: try {
385: for (final Iterator fontIterator = page.findResources()
386: .getFonts().values().iterator(); fontIterator
387: .hasNext();) {
388: final PDFont font = (PDFont) fontIterator.next();
389: fonts
390: .add(new PDFBoxPDFFont(font, iter
391: .nextIndex())); // nextIndex() because page number start with 1 not 0
392: }
393: } catch (final IOException e) {
394: throw new RuntimeException(
395: "Failed retrieving the fonts on page "
396: + iter.nextIndex(), e);
397: }
398: }
399: return fonts;
400: }
401:
402: public List getFields(int pageNumber) {
403: final Predicate predicatePage = PdfBoxPDFField.FieldPredicate
404: .buildPageNumberPredicate(pageNumber);
405: return getFields(predicatePage);
406: }
407:
408: public List getFields(final String name, final int pageNumber) {
409: final Predicate predicateName = PdfBoxPDFField.FieldPredicate
410: .buildNamePredicate(name);
411: final Predicate predicatePage = PdfBoxPDFField.FieldPredicate
412: .buildPageNumberPredicate(pageNumber);
413:
414: final Predicate predicate = new AndPredicate(predicateName,
415: predicatePage);
416:
417: return getFields(predicate);
418: }
419:
420: public List getFields(final String name) {
421: return getFields(PdfBoxPDFField.FieldPredicate
422: .buildNamePredicate(name));
423: }
424:
425: /**
426: * Gets the links from the document
427: * @return the links
428: */
429: public List getLinks() {
430: final List result = new ArrayList();
431: final List allPages = getPDFDocument().getDocumentCatalog()
432: .getAllPages();
433: for (final ListIterator iter = allPages.listIterator(); iter
434: .hasNext();) {
435: final PDPage page = (PDPage) iter.next();
436: processPage(result, page, iter.nextIndex());
437: }
438: return result;
439: }
440:
441: private static void processPage(final List result,
442: final PDPage page, final int pageNum) {
443: try {
444: final PDFTextStripperByArea stripper = new PDFTextStripperByArea();
445: final List linkAnnotations = new ArrayList();
446: final List linkRegions = new ArrayList();
447: extractAnnotations(page, stripper, linkAnnotations,
448: linkRegions);
449: stripper.extractRegions(page);
450: final Map uriMap = new HashMap();
451: final Map textMap = new HashMap();
452: collateLinks(linkAnnotations, linkRegions, uriMap, textMap,
453: stripper);
454: final Iterator it = uriMap.keySet().iterator();
455: while (it.hasNext()) {
456: final Object key = it.next();
457: result.add(new PDFBoxPDFLink((String) textMap.get(key),
458: (String) uriMap.get(key), pageNum));
459: }
460: } catch (final IOException e) {
461: // ignore
462: }
463: }
464:
465: private static void collateLinks(final List linkAnnotations,
466: final List linkRegions, final Map uriMap,
467: final Map textMap, final PDFTextStripperByArea stripper)
468: throws IOException {
469: for (int j = 0; j < linkAnnotations.size(); j++) {
470: final PDAnnotationLink link = (PDAnnotationLink) linkAnnotations
471: .get(j);
472: final PDAction action = link.getAction();
473: final String urlText = stripper.getTextForRegion(Integer
474: .toString(j));
475: if (action instanceof PDActionURI) {
476: final PDActionURI uri = (PDActionURI) action;
477: // internal links have no text
478: if (urlText.length() > 0) {
479: textMap.put(linkRegions.get(j), urlText);
480: }
481: uriMap.put(linkRegions.get(j), uri.getURI());
482: } else if (action instanceof PDActionGoTo) {
483: // internal link text associated with goto
484: if (urlText.length() > 0) {
485: textMap.put(linkRegions.get(j), urlText);
486: }
487: }
488: }
489: }
490:
491: private static List extractAnnotations(final PDPage page,
492: final PDFTextStripperByArea stripper,
493: final List linkAnnotations, final List linkRegions)
494: throws IOException {
495: final List annotations = page.getAnnotations();
496: for (int j = 0; j < annotations.size(); j++) {
497: final PDAnnotation annot = (PDAnnotation) annotations
498: .get(j);
499: if (annot instanceof PDAnnotationLink) {
500: final PDRectangle rect = annot.getRectangle();
501: //need to reposition link rectangle to match text space plus add
502: //a little to account for descenders and the like
503: final float x = rect.getLowerLeftX() - 1;
504: float y = rect.getUpperRightY() - 1;
505: final float width = rect.getWidth() + 2;
506: final float height = rect.getHeight()
507: + rect.getHeight() / 4;
508: final int rotation = page.findRotation();
509: if (rotation == 0) {
510: final PDRectangle pageSize = page.findMediaBox();
511: y = pageSize.getHeight() - y;
512: }
513:
514: final Rectangle2D.Float awtRect = new Rectangle2D.Float(
515: x, y, width, height);
516: stripper.addRegion(Integer.toString(j), awtRect);
517: linkAnnotations.add(annot);
518: linkRegions.add(awtRect);
519: }
520: }
521: return annotations;
522: }
523:
524: public String getText(final String fragmentSeparator,
525: final String lineSeparator, final String pageSeparator,
526: final String mode) {
527: return getText(0, getNumberOfPages(), fragmentSeparator,
528: lineSeparator, pageSeparator, mode);
529: }
530:
531: private String getText(final int startPage, final int endPage,
532: final String fragmentSeparator, final String lineSeparator,
533: final String pageSeparator, final String mode) {
534: final StringBuffer buf = new StringBuffer();
535: if (MODE_NORMAL.equals(mode)) {
536: buf.append(getTextInternal(startPage, endPage,
537: lineSeparator, pageSeparator));
538: } else {
539: for (int page = startPage; page <= endPage; page++) {
540: final List fragments = getFragments(page,
541: fragmentSeparator, lineSeparator);
542: final String tmp = collateFragments(fragments,
543: fragmentSeparator, lineSeparator);
544: if (tmp.length() > 0) {
545: buf.append(tmp);
546: buf.append(pageSeparator);
547: }
548: }
549: }
550: return buf.toString();
551: }
552:
553: private String getTextInternal(final int startPage,
554: final int endPage, final String lineSeparator,
555: final String pageSeparator) {
556: final StringWriter output = new StringWriter();
557: try {
558: final PDFTextStripper textStripper = new PDFTextStripper();
559: textStripper.setPageSeparator(pageSeparator);
560: textStripper.setLineSeparator(lineSeparator);
561: textStripper.setStartPage(startPage);
562: textStripper.setEndPage(endPage);
563: textStripper.writeText(getPDFDocument(), output);
564: return output.toString();
565: } catch (final Exception e) {
566: throw new RuntimeException(
567: "Error while extracting text from document.", e);
568: } finally {
569: IOUtils.closeQuietly(output);
570: }
571: }
572:
573: public List getFragments(int page, final String fragmentSeparator,
574: final String lineSeparator) {
575: final List fragments = new ArrayList();
576:
577: final StringWriter output = new StringWriter();
578: try {
579: final PDFTextStripper textStripper = new PDFTextStripper() {
580: protected void showCharacter(TextPosition textPosition) {
581: fragments.add(textPosition);
582: }
583: };
584: textStripper.setLineSeparator(lineSeparator);
585: textStripper.setStartPage(page);
586: textStripper.setEndPage(page);
587: textStripper.writeText(getPDFDocument(), output);
588: return fragments;
589: } catch (final Exception e) {
590: throw new RuntimeException(
591: "Error while extracting text from document.", e);
592: } finally {
593: IOUtils.closeQuietly(output);
594: }
595: }
596:
597: private String collateFragments(List fragments,
598: String fragmentSeparator, String lineSeparator) {
599: final Map linesOfText = new TreeMap();
600: regroup(fragments, linesOfText);
601: final Map linesOfString = new TreeMap();
602: coalesce(linesOfText, linesOfString);
603: return fragmentsToString(linesOfString, fragmentSeparator,
604: lineSeparator);
605: }
606:
607: private void coalesce(Map linesOfText, Map linesOfString) {
608: Iterator kit = linesOfText.keySet().iterator();
609: while (kit.hasNext()) {
610: Integer key = (Integer) kit.next();
611: linesOfString.put(key, coalesceLine((Map) linesOfText
612: .get(key)));
613: }
614: }
615:
616: private Map coalesceLine(Map input) {
617: final Map output = new TreeMap();
618: final Iterator kit = input.keySet().iterator();
619: TextPosition lastFragment = null;
620: String lastString = null;
621: Integer lastKey = null;
622: while (kit.hasNext()) {
623: final Integer key = (Integer) kit.next();
624: final TextPosition this Fragment = (TextPosition) input
625: .get(key);
626: if (lastFragment != null
627: && adjacent(lastFragment, this Fragment)) {
628: lastFragment = this Fragment;
629: lastString += this Fragment.getCharacter();
630: } else {
631: if (lastFragment != null) {
632: output.put(lastKey, lastString);
633: }
634: lastFragment = this Fragment;
635: lastString = this Fragment.getCharacter();
636: lastKey = key;
637: }
638: if (lastFragment != null) {
639: output.put(lastKey, lastString);
640: }
641: }
642: return output;
643: }
644:
645: private boolean adjacent(final TextPosition lastFragment,
646: final TextPosition this Fragment) {
647: final int TOLERANCE = 2;
648: return this Fragment.getX()
649: - (lastFragment.getX() + lastFragment.getWidth()
650: * lastFragment.getXScale()) < TOLERANCE;
651: }
652:
653: private void regroup(final List fragments, final Map lines) {
654: for (int i = 0; i < fragments.size(); i++) {
655: final TextPosition textPosition = (TextPosition) fragments
656: .get(i);
657: final Integer y = new Integer((int) textPosition.getY());
658: final Integer x = new Integer((int) textPosition.getX());
659: final Map pieces;
660: if (lines.containsKey(y)) {
661: pieces = (TreeMap) lines.get(y);
662: } else {
663: pieces = new TreeMap();
664: }
665: pieces.put(x, textPosition);
666: lines.put(y, pieces);
667: }
668: }
669:
670: private String fragmentsToString(Map linesOfString,
671: String fragmentSeparator, String lineSeparator) {
672: StringBuffer buf = new StringBuffer();
673: Iterator lit = linesOfString.values().iterator();
674: while (lit.hasNext()) {
675: Map pieces = (Map) lit.next();
676: Iterator pit = pieces.values().iterator();
677: while (pit.hasNext()) {
678: String piece = (String) pit.next();
679: buf.append(piece);
680: if (pit.hasNext()) {
681: buf.append(fragmentSeparator);
682: }
683: }
684: buf.append(lineSeparator);
685: }
686: return buf.toString();
687: }
688:
689: }
|