001: /**
002: * Copyright (c) 2003-2006, www.pdfbox.org
003: * All rights reserved.
004: *
005: * Redistribution and use in source and binary forms, with or without
006: * modification, are permitted provided that the following conditions are met:
007: *
008: * 1. Redistributions of source code must retain the above copyright notice,
009: * this list of conditions and the following disclaimer.
010: * 2. Redistributions in binary form must reproduce the above copyright notice,
011: * this list of conditions and the following disclaimer in the documentation
012: * and/or other materials provided with the distribution.
013: * 3. Neither the name of pdfbox; nor the names of its
014: * contributors may be used to endorse or promote products derived from this
015: * software without specific prior written permission.
016: *
017: * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
018: * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
019: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
020: * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
021: * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
022: * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
023: * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
024: * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
025: * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
026: * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
027: *
028: * http://www.pdfbox.org
029: *
030: */package org.pdfbox.util;
031:
032: import java.io.IOException;
033:
034: import java.util.ArrayList;
035: import java.util.HashMap;
036: import java.util.Iterator;
037: import java.util.List;
038: import java.util.Map;
039: import java.util.Properties;
040: import java.util.Stack;
041:
042: import org.pdfbox.cos.COSObject;
043: import org.pdfbox.cos.COSStream;
044: import org.pdfbox.exceptions.WrappedIOException;
045:
046: import org.pdfbox.pdmodel.PDPage;
047: import org.pdfbox.pdmodel.PDResources;
048:
049: import org.pdfbox.pdmodel.font.PDFont;
050:
051: import org.pdfbox.pdmodel.graphics.PDGraphicsState;
052:
053: import org.pdfbox.util.operator.OperatorProcessor;
054:
055: /**
056: * This class will run through a PDF content stream and execute certain operations
057: * and provide a callback interface for clients that want to do things with the stream.
058: * See the PDFTextStripper class for an example of how to use this class.
059: *
060: * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
061: * @version $Revision: 1.35 $
062: */
063: public class PDFStreamEngine {
064: private static final byte[] SPACE_BYTES = { (byte) 32 };
065:
066: private PDGraphicsState graphicsState = null;
067:
068: private Matrix textMatrix = null;
069: private Matrix textLineMatrix = null;
070: private Stack graphicsStack = new Stack();
071: //private PDResources resources = null;
072:
073: private Map operators = new HashMap();
074:
075: private Stack streamResourcesStack = new Stack();
076:
077: private PDPage page;
078:
079: private Map documentFontCache = new HashMap();
080:
081: /**
082: * This is a simple internal class used by the Stream engine to handle the
083: * resources stack.
084: */
085: private static class StreamResources {
086: private Map fonts;
087: private Map colorSpaces;
088: private Map xobjects;
089: private Map graphicsStates;
090: private PDResources resources;
091: }
092:
093: /**
094: * Constructor.
095: */
096: public PDFStreamEngine() {
097: //default constructor
098: }
099:
100: /**
101: * Constructor with engine properties. The property keys are all
102: * PDF operators, the values are class names used to execute those
103: * operators.
104: *
105: * @param properties The engine properties.
106: *
107: * @throws IOException If there is an error setting the engine properties.
108: */
109: public PDFStreamEngine(Properties properties) throws IOException {
110: try {
111: Iterator keys = properties.keySet().iterator();
112: while (keys.hasNext()) {
113: String operator = (String) keys.next();
114: String operatorClass = properties.getProperty(operator);
115: OperatorProcessor op = (OperatorProcessor) Class
116: .forName(operatorClass).newInstance();
117: registerOperatorProcessor(operator, op);
118: }
119: } catch (Exception e) {
120: throw new WrappedIOException(e);
121: }
122: }
123:
124: /**
125: * Register a custom operator processor with the engine.
126: *
127: * @param operator The operator as a string.
128: * @param op Processor instance.
129: */
130: public void registerOperatorProcessor(String operator,
131: OperatorProcessor op) {
132: op.setContext(this );
133: operators.put(operator, op);
134: }
135:
136: /**
137: * This method must be called between processing documents. The
138: * PDFStreamEngine caches information for the document between pages
139: * and this will release the cached information. This only needs
140: * to be called if processing a new document.
141: *
142: */
143: public void resetEngine() {
144: documentFontCache.clear();
145: }
146:
147: /**
148: * This will process the contents of the stream.
149: *
150: * @param aPage The page.
151: * @param resources The location to retrieve resources.
152: * @param cosStream the Stream to execute.
153: *
154: *
155: * @throws IOException if there is an error accessing the stream.
156: */
157: public void processStream(PDPage aPage, PDResources resources,
158: COSStream cosStream) throws IOException {
159: graphicsState = new PDGraphicsState();
160: textMatrix = null;
161: textLineMatrix = null;
162: graphicsStack.clear();
163: streamResourcesStack.clear();
164:
165: processSubStream(aPage, resources, cosStream);
166: }
167:
168: /**
169: * Process a sub stream of the current stream.
170: *
171: * @param aPage The page used for drawing.
172: * @param resources The resources used when processing the stream.
173: * @param cosStream The stream to process.
174: *
175: * @throws IOException If there is an exception while processing the stream.
176: */
177: public void processSubStream(PDPage aPage, PDResources resources,
178: COSStream cosStream) throws IOException {
179: page = aPage;
180: if (resources != null) {
181: StreamResources sr = new StreamResources();
182: sr.fonts = resources.getFonts(documentFontCache);
183: sr.colorSpaces = resources.getColorSpaces();
184: sr.xobjects = resources.getXObjects();
185: sr.graphicsStates = resources.getGraphicsStates();
186: sr.resources = resources;
187: streamResourcesStack.push(sr);
188: }
189: try {
190: List arguments = new ArrayList();
191: List tokens = cosStream.getStreamTokens();
192: if (tokens != null) {
193: Iterator iter = tokens.iterator();
194: while (iter.hasNext()) {
195: Object next = iter.next();
196: if (next instanceof COSObject) {
197: arguments.add(((COSObject) next).getObject());
198: } else if (next instanceof PDFOperator) {
199: processOperator((PDFOperator) next, arguments);
200: arguments = new ArrayList();
201: } else {
202: arguments.add(next);
203: }
204: }
205: }
206: } finally {
207: if (resources != null) {
208: streamResourcesStack.pop();
209: }
210: }
211:
212: }
213:
214: /**
215: * A method provided as an event interface to allow a subclass to perform
216: * some specific functionality when a character needs to be displayed.
217: *
218: * @param text The character to be displayed.
219: */
220: protected void showCharacter(TextPosition text) {
221: //subclasses can override to provide specific functionality.
222: }
223:
224: /**
225: * You should override this method if you want to perform an action when a
226: * string is being shown.
227: *
228: * @param string The string to display.
229: *
230: * @throws IOException If there is an error showing the string
231: */
232: public void showString(byte[] string) throws IOException {
233: float spaceWidth = 0;
234: float spacing = 0;
235: StringBuffer stringResult = new StringBuffer(string.length);
236:
237: float characterHorizontalDisplacement = 0;
238: float characterVerticalDisplacement = 0;
239: float spaceDisplacement = 0;
240: float fontSize = graphicsState.getTextState().getFontSize();
241: float horizontalScaling = graphicsState.getTextState()
242: .getHorizontalScalingPercent() / 100f;
243: float verticalScaling = horizontalScaling;//not sure if this is right but what else to do???
244: float rise = graphicsState.getTextState().getRise();
245: final float wordSpacing = graphicsState.getTextState()
246: .getWordSpacing();
247: final float characterSpacing = graphicsState.getTextState()
248: .getCharacterSpacing();
249: float wordSpacingDisplacement = 0;
250:
251: PDFont font = graphicsState.getTextState().getFont();
252:
253: //This will typically be 1000 but in the case of a type3 font
254: //this might be a different number
255: float glyphSpaceToTextSpaceFactor = 1f / font.getFontMatrix()
256: .getValue(0, 0);
257: float averageWidth = font.getAverageFontWidth();
258:
259: Matrix initialMatrix = new Matrix();
260: initialMatrix.setValue(0, 0, 1);
261: initialMatrix.setValue(0, 1, 0);
262: initialMatrix.setValue(0, 2, 0);
263: initialMatrix.setValue(1, 0, 0);
264: initialMatrix.setValue(1, 1, 1);
265: initialMatrix.setValue(1, 2, 0);
266: initialMatrix.setValue(2, 0, 0);
267: initialMatrix.setValue(2, 1, rise);
268: initialMatrix.setValue(2, 2, 1);
269:
270: //this
271: int codeLength = 1;
272: Matrix ctm = graphicsState.getCurrentTransformationMatrix();
273:
274: //lets see what the space displacement should be
275: spaceDisplacement = (font.getFontWidth(SPACE_BYTES, 0, 1) / glyphSpaceToTextSpaceFactor);
276: if (spaceDisplacement == 0) {
277: spaceDisplacement = (averageWidth / glyphSpaceToTextSpaceFactor);
278: //The average space width appears to be higher than necessary
279: //so lets make it a little bit smaller.
280: spaceDisplacement *= .80f;
281: }
282: int pageRotation = page.findRotation();
283: Matrix trm = initialMatrix.multiply(textMatrix).multiply(ctm);
284: float x = trm.getValue(2, 0);
285: float y = trm.getValue(2, 1);
286: if (pageRotation == 0) {
287: trm.setValue(2, 1, -y + page.findMediaBox().getHeight());
288: } else if (pageRotation == 90) {
289: trm.setValue(2, 0, y);
290: trm.setValue(2, 1, x);
291: } else if (pageRotation == 270) {
292: trm.setValue(2, 0, -y + page.findMediaBox().getHeight());
293: trm.setValue(2, 1, x);
294: }
295: for (int i = 0; i < string.length; i += codeLength) {
296: codeLength = 1;
297:
298: String c = font.encode(string, i, codeLength);
299: if (c == null && i + 1 < string.length) {
300: //maybe a multibyte encoding
301: codeLength++;
302: c = font.encode(string, i, codeLength);
303: }
304: stringResult.append(c);
305:
306: //todo, handle horizontal displacement
307: characterHorizontalDisplacement += (font.getFontWidth(
308: string, i, codeLength) / glyphSpaceToTextSpaceFactor);
309: characterVerticalDisplacement = Math.max(
310: characterVerticalDisplacement, font.getFontHeight(
311: string, i, codeLength)
312: / glyphSpaceToTextSpaceFactor);
313:
314: // PDF Spec - 5.5.2 Word Spacing
315: //
316: // Word spacing works the same was as character spacing, but applies
317: // only to the space character, code 32.
318: //
319: // Note: Word spacing is applied to every occurrence of the single-byte
320: // character code 32 in a string. This can occur when using a simple
321: // font or a composite font that defines code 32 as a single-byte code.
322: // It does not apply to occurrences of the byte value 32 in multiple-byte
323: // codes.
324: //
325: // RDD - My interpretation of this is that only character code 32's that
326: // encode to spaces should have word spacing applied. Cases have been
327: // observed where a font has a space character with a character code
328: // other than 32, and where word spacing (Tw) was used. In these cases,
329: // applying word spacing to either the non-32 space or to the character
330: // code 32 non-space resulted in errors consistent with this interpretation.
331: //
332: if ((string[i] == 0x20) && c.equals(" ")) {
333: spacing += wordSpacing + characterSpacing;
334: } else {
335: spacing += characterSpacing;
336: }
337: // We want to update the textMatrix using the width, in text space units.
338: //
339:
340: }
341:
342: //The adjustment will always be zero. The adjustment as shown in the
343: //TJ operator will be handled separately.
344: float adjustment = 0;
345: //todo, need to compute the vertical displacement
346: float ty = 0;
347: float tx = ((characterHorizontalDisplacement - adjustment
348: / glyphSpaceToTextSpaceFactor)
349: * fontSize + spacing)
350: * horizontalScaling;
351:
352: float xScale = trm.getXScale();
353: float yScale = trm.getYScale();
354: float xPos = trm.getXPosition();
355: float yPos = trm.getYPosition();
356: spaceWidth = spaceDisplacement * xScale * fontSize;
357: wordSpacingDisplacement = wordSpacing * xScale * fontSize;
358: Matrix td = new Matrix();
359: td.setValue(2, 0, tx);
360: td.setValue(2, 1, ty);
361:
362: float xPosBefore = textMatrix.getXPosition();
363: float yPosBefore = textMatrix.getYPosition();
364: textMatrix = td.multiply(textMatrix);
365:
366: float totalStringWidth = 0;
367: float totalStringHeight = characterVerticalDisplacement
368: * fontSize * yScale;
369: if (pageRotation == 0) {
370: totalStringWidth = (textMatrix.getXPosition() - xPosBefore);
371: } else if (pageRotation == 90) {
372: totalStringWidth = (textMatrix.getYPosition() - yPosBefore);
373: } else if (pageRotation == 270) {
374: totalStringWidth = (yPosBefore - textMatrix.getYPosition());
375: }
376: showCharacter(new TextPosition(xPos, yPos, xScale, yScale,
377: totalStringWidth, totalStringHeight, spaceWidth,
378: stringResult.toString(), font, fontSize,
379: wordSpacingDisplacement));
380: }
381:
382: /**
383: * This is used to handle an operation.
384: *
385: * @param operation The operation to perform.
386: * @param arguments The list of arguments.
387: *
388: * @throws IOException If there is an error processing the operation.
389: */
390: public void processOperator(String operation, List arguments)
391: throws IOException {
392: PDFOperator oper = PDFOperator.getOperator(operation);
393: processOperator(oper, arguments);
394: }
395:
396: /**
397: * This is used to handle an operation.
398: *
399: * @param operator The operation to perform.
400: * @param arguments The list of arguments.
401: *
402: * @throws IOException If there is an error processing the operation.
403: */
404: protected void processOperator(PDFOperator operator, List arguments)
405: throws IOException {
406: String operation = operator.getOperation();
407: OperatorProcessor processor = (OperatorProcessor) operators
408: .get(operation);
409: if (processor != null) {
410: processor.process(operator, arguments);
411: }
412: }
413:
414: /**
415: * @return Returns the colorSpaces.
416: */
417: public Map getColorSpaces() {
418: return ((StreamResources) streamResourcesStack.peek()).colorSpaces;
419: }
420:
421: /**
422: * @return Returns the colorSpaces.
423: */
424: public Map getXObjects() {
425: return ((StreamResources) streamResourcesStack.peek()).xobjects;
426: }
427:
428: /**
429: * @param value The colorSpaces to set.
430: */
431: public void setColorSpaces(Map value) {
432: ((StreamResources) streamResourcesStack.peek()).colorSpaces = value;
433: }
434:
435: /**
436: * @return Returns the fonts.
437: */
438: public Map getFonts() {
439: return ((StreamResources) streamResourcesStack.peek()).fonts;
440: }
441:
442: /**
443: * @param value The fonts to set.
444: */
445: public void setFonts(Map value) {
446: ((StreamResources) streamResourcesStack.peek()).fonts = value;
447: }
448:
449: /**
450: * @return Returns the graphicsStack.
451: */
452: public Stack getGraphicsStack() {
453: return graphicsStack;
454: }
455:
456: /**
457: * @param value The graphicsStack to set.
458: */
459: public void setGraphicsStack(Stack value) {
460: graphicsStack = value;
461: }
462:
463: /**
464: * @return Returns the graphicsState.
465: */
466: public PDGraphicsState getGraphicsState() {
467: return graphicsState;
468: }
469:
470: /**
471: * @param value The graphicsState to set.
472: */
473: public void setGraphicsState(PDGraphicsState value) {
474: graphicsState = value;
475: }
476:
477: /**
478: * @return Returns the graphicsStates.
479: */
480: public Map getGraphicsStates() {
481: return ((StreamResources) streamResourcesStack.peek()).graphicsStates;
482: }
483:
484: /**
485: * @param value The graphicsStates to set.
486: */
487: public void setGraphicsStates(Map value) {
488: ((StreamResources) streamResourcesStack.peek()).graphicsStates = value;
489: }
490:
491: /**
492: * @return Returns the textLineMatrix.
493: */
494: public Matrix getTextLineMatrix() {
495: return textLineMatrix;
496: }
497:
498: /**
499: * @param value The textLineMatrix to set.
500: */
501: public void setTextLineMatrix(Matrix value) {
502: textLineMatrix = value;
503: }
504:
505: /**
506: * @return Returns the textMatrix.
507: */
508: public Matrix getTextMatrix() {
509: return textMatrix;
510: }
511:
512: /**
513: * @param value The textMatrix to set.
514: */
515: public void setTextMatrix(Matrix value) {
516: textMatrix = value;
517: }
518:
519: /**
520: * @return Returns the resources.
521: */
522: public PDResources getResources() {
523: return ((StreamResources) streamResourcesStack.peek()).resources;
524: }
525:
526: /**
527: * Get the current page that is being processed.
528: *
529: * @return The page being processed.
530: */
531: public PDPage getCurrentPage() {
532: return page;
533: }
534: }
|