Source Code Cross Referenced for BaseParser.java in PDF » PDFBox-0.7.3 » org » pdfbox » pdfparser » Java Source Code / Java Documentation - Java Source Code and Java Documentation

Java Source Code / Java Documentation
1. 6.0 JDK Core
2. 6.0 JDK Modules
3. 6.0 JDK Modules com.sun
4. 6.0 JDK Modules com.sun.java
5. 6.0 JDK Modules sun
6. 6.0 JDK Platform
7. Ajax
8. Apache Harmony Java SE
9. Aspect oriented
10. Authentication Authorization
11. Blogger System
12. Build
13. Byte Code
14. Cache
15. Chart
16. Chat
17. Code Analyzer
18. Collaboration
19. Content Management System
20. Database Client
21. Database DBMS
22. Database JDBC Connection Pool
23. Database ORM
24. Development
25. EJB Server geronimo
26. EJB Server GlassFish
27. EJB Server JBoss 4.2.1
28. EJB Server resin 3.1.5
29. ERP CRM Financial
30. ESB
31. Forum
32. GIS
33. Graphic Library
34. Groupware
35. HTML Parser
36. IDE
37. IDE Eclipse
38. IDE Netbeans
39. Installer
40. Internationalization Localization
41. Inversion of Control
42. Issue Tracking
43. J2EE
44. JBoss
45. JMS
46. JMX
47. Library
48. Mail Clients
49. Net
50. Parser
51. PDF
52. Portal
53. Profiler
54. Project Management
55. Report
56. RSS RDF
57. Rule Engine
58. Science
59. Scripting
60. Search Engine
61. Security
62. Servlet Container
63. Source Control
64. Swing Library
65. Template Engine
66. Test Coverage
67. Testing
68. UML
69. Web Crawler
70. Web Framework
71. Web Mail
72. Web Server
73. Web Services
74. Web Services apache cxf 2.0.1
75. Web Services AXIS2
76. Wiki Engine
77. Workflow Engines
78. XML
79. XML UI
Java
Java Tutorial
Java Open Source
Jar File Download
Java Articles
Java Products
Java by API
Photoshop Tutorials
Maya Tutorials
Flash Tutorials
3ds-Max Tutorials
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
ASP.Net
ASP.NET Tutorial
JavaScript DHTML
JavaScript Tutorial
JavaScript Reference
HTML / CSS
HTML CSS Reference
C / ANSI-C
C Tutorial
C++
C++ Tutorial
Ruby
PHP
Python
Python Tutorial
Python Open Source
SQL Server / T-SQL
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
PostgreSQL
SQL / MySQL
MySQL Tutorial
VB.Net
VB.Net Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » PDF » PDFBox 0.7.3 » org.pdfbox.pdfparser 
Source Cross Referenced  Class Diagram Java Document (Java Doc) 


0001:        /**
0002:         * Copyright (c) 2003-2006, www.pdfbox.org
0003:         * All rights reserved.
0004:         *
0005:         * Redistribution and use in source and binary forms, with or without
0006:         * modification, are permitted provided that the following conditions are met:
0007:         *
0008:         * 1. Redistributions of source code must retain the above copyright notice,
0009:         *    this list of conditions and the following disclaimer.
0010:         * 2. Redistributions in binary form must reproduce the above copyright notice,
0011:         *    this list of conditions and the following disclaimer in the documentation
0012:         *    and/or other materials provided with the distribution.
0013:         * 3. Neither the name of pdfbox; nor the names of its
0014:         *    contributors may be used to endorse or promote products derived from this
0015:         *    software without specific prior written permission.
0016:         *
0017:         * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
0018:         * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
0019:         * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
0020:         * DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
0021:         * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
0022:         * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
0023:         * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
0024:         * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
0025:         * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
0026:         * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
0027:         *
0028:         * http://www.pdfbox.org
0029:         *
0030:         */package org.pdfbox.pdfparser;
0031:
0032:        import java.io.BufferedInputStream;
0033:        import java.io.InputStream;
0034:        import java.io.IOException;
0035:        import java.io.OutputStream;
0036:
0037:        import java.util.ArrayList;
0038:        import java.util.List;
0039:
0040:        import org.pdfbox.io.ByteArrayPushBackInputStream;
0041:        import org.pdfbox.io.PushBackInputStream;
0042:        import org.pdfbox.io.RandomAccess;
0043:
0044:        import org.pdfbox.cos.COSArray;
0045:        import org.pdfbox.cos.COSBase;
0046:        import org.pdfbox.cos.COSBoolean;
0047:        import org.pdfbox.cos.COSDictionary;
0048:        import org.pdfbox.cos.COSDocument;
0049:        import org.pdfbox.cos.COSInteger;
0050:        import org.pdfbox.cos.COSName;
0051:        import org.pdfbox.cos.COSNull;
0052:        import org.pdfbox.cos.COSNumber;
0053:        import org.pdfbox.cos.COSObject;
0054:        import org.pdfbox.cos.COSStream;
0055:        import org.pdfbox.cos.COSString;
0056:
0057:        import org.pdfbox.persistence.util.COSObjectKey;
0058:
0059:        /**
0060:         * This class is used to contain parsing logic that will be used by both the
0061:         * PDFParser and the COSStreamParser.
0062:         *
0063:         * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
0064:         * @version $Revision: 1.59 $
0065:         */
0066:        public abstract class BaseParser {
0067:            /**
0068:             * This is a byte array that will be used for comparisons.
0069:             */
0070:            public static final byte[] ENDSTREAM = new byte[] { 101, 110, 100,
0071:                    115, 116, 114, 101, 97, 109 };//"endstream".getBytes( "ISO-8859-1" );
0072:
0073:            /**
0074:             * The literal token "def"; parseCOSDictionary consumes it when it trails a dictionary entry.
0075:             */
0076:            public static final String DEF = "def";
0077:
0078:            /**
0079:             * This is the stream that will be read from.
0080:             */
0081:            //protected PushBackByteArrayStream pdfSource;
0082:            protected PushBackInputStream pdfSource;
0083:
0084:            /**
0085:             * Cross-reference entries; described by the original author as a persistence construct.
0086:             * NOTE(review): nothing in the visible part of this class reads or writes this list,
0087:             * confirm whether it is still needed.
0088:             */
0089:            private List xrefs = new ArrayList();
0090:
0091:            private COSDocument document;
0092:
0093:            /**
0094:             * Constructor.
0095:             *
0096:             * @param input The input stream to read the data from.
0097:             * 
0098:             * @throws IOException If there is an error reading the input stream.
0099:             */
0100:            public BaseParser(InputStream input) throws IOException {
0101:                //pdfSource = new PushBackByteArrayStream( input );
0102:                pdfSource = new PushBackInputStream(new BufferedInputStream(
0103:                        input, 16384), 4096);
0104:            }
0105:
0106:            /**
0107:             * Constructor.
0108:             *
0109:             * @param input The array to read the data from.
0110:             * 
0111:             * @throws IOException If there is an error reading the byte data.
0112:             */
0113:            protected BaseParser(byte[] input) throws IOException {
0114:                pdfSource = new ByteArrayPushBackInputStream(input);
0115:            }
0116:
0117:            /**
0118:             * Set the document for this stream.
0119:             * 
0120:             * @param doc The current document.
0121:             */
0122:            public void setDocument(COSDocument doc) {
0123:                document = doc;
0124:            }
0125:
0126:            private static boolean isHexDigit(char ch) {
0127:                return (ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'f')
0128:                        || (ch >= 'A' && ch <= 'F');
0129:                // the line below can lead to problems with certain versions of the IBM JIT compiler
0130:                // (and is slower anyway)
0131:                //return (HEXDIGITS.indexOf(ch) != -1);
0132:            }
0133:
0134:            /**
0135:             * This will parse a PDF dictionary value.
0136:             *
0137:             * @return The parsed Dictionary object.
0138:             *
0139:             * @throws IOException If there is an error parsing the dictionary object.
0140:             */
0141:            private COSBase parseCOSDictionaryValue() throws IOException {
0142:                COSBase retval = null;
0143:                COSBase number = parseDirObject();
0144:                skipSpaces();
0145:                char next = (char) pdfSource.peek();
0146:                if (next >= '0' && next <= '9') {
0147:                    COSBase generationNumber = parseDirObject();
0148:                    skipSpaces();
0149:                    char r = (char) pdfSource.read();
0150:                    if (r != 'R') {
0151:                        throw new IOException("expected='R' actual='" + r
0152:                                + "' " + pdfSource);
0153:                    }
0154:                    COSObjectKey key = new COSObjectKey(((COSInteger) number)
0155:                            .intValue(), ((COSInteger) generationNumber)
0156:                            .intValue());
0157:                    retval = document.getObjectFromPool(key);
0158:                } else {
0159:                    retval = number;
0160:                }
0161:                return retval;
0162:            }
0163:
0164:            /**
0165:             * This will parse a PDF dictionary.
0166:             *
0167:             * @return The parsed dictionary.
0168:             *
0169:             * @throws IOException If there is an error reading the stream.
0170:             */
0171:            protected COSDictionary parseCOSDictionary() throws IOException {
0172:                char c = (char) pdfSource.read();
0173:                if (c != '<') {
0174:                    throw new IOException("expected='<' actual='" + c + "'");
0175:                }
0176:                c = (char) pdfSource.read();
0177:                if (c != '<') {
0178:                    throw new IOException("expected='<' actual='" + c + "' "
0179:                            + pdfSource);
0180:                }
0181:                skipSpaces();
0182:                COSDictionary obj = new COSDictionary();
0183:                boolean done = false;
0184:                while (!done) {
0185:                    skipSpaces();
0186:                    c = (char) pdfSource.peek();
0187:                    if (c == '>') {
0188:                        done = true;
0189:                    } else {
0190:                        COSName key = parseCOSName();
0191:                        COSBase value = parseCOSDictionaryValue();
0192:                        skipSpaces();
0193:                        if (((char) pdfSource.peek()) == 'd') {
0194:                            //if the next string is 'def' then we are parsing a cmap stream
0195:                            //and want to ignore it, otherwise throw an exception.
0196:                            String potentialDEF = readString();
0197:                            if (!potentialDEF.equals(DEF)) {
0198:                                pdfSource.unread(potentialDEF.getBytes());
0199:                            } else {
0200:                                skipSpaces();
0201:                            }
0202:                        }
0203:
0204:                        if (value == null) {
0205:                            throw new IOException("Bad Dictionary Declaration "
0206:                                    + pdfSource);
0207:                        }
0208:                        obj.setItem(key, value);
0209:                    }
0210:                }
0211:                char ch = (char) pdfSource.read();
0212:                if (ch != '>') {
0213:                    throw new IOException("expected='>' actual='" + ch + "'");
0214:                }
0215:                ch = (char) pdfSource.read();
0216:                if (ch != '>') {
0217:                    throw new IOException("expected='>' actual='" + ch + "'");
0218:                }
0219:                return obj;
0220:            }
0221:
0222:            /**
0223:             * This will read a COSStream from the input stream.
0224:             *
0225:             * @param dic The dictionary that goes with this stream.
0226:             * @param file The file to write the stream to when reading.
0227:             *
0228:             * @return The parsed pdf stream.
0229:             *
0230:             * @throws IOException If there is an error reading the stream.
0231:             */
0232:            protected COSStream parseCOSStream(COSDictionary dic,
0233:                    RandomAccess file) throws IOException {
0234:                COSStream stream = new COSStream(dic, file);
0235:                OutputStream out = null;
0236:                try {
0237:                    String streamString = readString();
0238:                    //long streamLength;
0239:
0240:                    if (!streamString.equals("stream")) {
0241:                        throw new IOException("expected='stream' actual='"
0242:                                + streamString + "'");
0243:                    }
0244:
0245:                    //PDF Ref 3.2.7 A stream must be followed by either
0246:                    //a CRLF or LF but nothing else.
0247:
0248:                    int whitespace = pdfSource.read();
0249:
0250:                    //see brother_scan_cover.pdf, it adds whitespaces
0251:                    //after the stream but before the start of the 
0252:                    //data, so just read those first
0253:                    while (whitespace == 0x20) {
0254:                        whitespace = pdfSource.read();
0255:                    }
0256:
0257:                    if (whitespace == 0x0D) {
0258:                        whitespace = pdfSource.read();
0259:                        if (whitespace != 0x0A) {
0260:                            pdfSource.unread(whitespace);
0261:                            //The spec says this is invalid but it happens in the real
0262:                            //world so we must support it.
0263:                            //throw new IOException("expected='0x0A' actual='0x" +
0264:                            //    Integer.toHexString(whitespace) + "' " + pdfSource);
0265:                        }
0266:                    } else if (whitespace == 0x0A) {
0267:                        //that is fine
0268:                    } else {
0269:                        //we are in an error.
0270:                        //but again we will do a lenient parsing and just assume that everything
0271:                        //is fine
0272:                        pdfSource.unread(whitespace);
0273:                        //throw new IOException("expected='0x0D or 0x0A' actual='0x" +
0274:                        //Integer.toHexString(whitespace) + "' " + pdfSource);
0275:
0276:                    }
0277:
0278:                    COSBase streamLength = dic
0279:                            .getDictionaryObject(COSName.LENGTH);
0280:                    /*long length = -1;
0281:                    if( streamLength instanceof COSNumber )
0282:                    {
0283:                        length = ((COSNumber)streamLength).intValue();
0284:                    }
0285:                    else if( streamLength instanceof COSObject &&
0286:                             ((COSObject)streamLength).getObject() instanceof COSNumber )
0287:                    {
0288:                        length = ((COSNumber)((COSObject)streamLength).getObject()).intValue();
0289:                    }*/
0290:
0291:                    //length = -1;
0292:                    //streamLength = null;
0293:                    //Need to keep track of the
0294:                    out = stream.createFilteredStream(streamLength);
0295:                    String endStream = null;
0296:                    //the length is wrong in some pdf documents which means
0297:                    //that PDFBox must basically ignore it in order to be able to read
0298:                    //the most number of PDF documents.  This of course is a penalty hit,
0299:                    //maybe I could implement a faster parser.
0300:                    /**if( length != -1 )
0301:                    {
0302:                        byte[] buffer = new byte[1024];
0303:                        int amountRead = 0;
0304:                        int totalAmountRead = 0;
0305:                        while( amountRead != -1 && totalAmountRead < length )
0306:                        {
0307:                            int maxAmountToRead = Math.min(buffer.length, (int)(length-totalAmountRead));
0308:                            amountRead = pdfSource.read(buffer,0,maxAmountToRead);
0309:                            totalAmountRead += amountRead;
0310:                            if( amountRead != -1 )
0311:                            {
0312:                                out.write( buffer, 0, amountRead );
0313:                            }
0314:                        }
0315:                    }
0316:                    else
0317:                    {**/
0318:                    readUntilEndStream(out);
0319:                    /**}*/
0320:                    skipSpaces();
0321:                    endStream = readString();
0322:
0323:                    if (!endStream.equals("endstream")) {
0324:                        readUntilEndStream(out);
0325:                        endStream = readString();
0326:                        if (!endStream.equals("endstream")) {
0327:                            throw new IOException(
0328:                                    "expected='endstream' actual='" + endStream
0329:                                            + "' " + pdfSource);
0330:                        }
0331:                    }
0332:                } finally {
0333:                    if (out != null) {
0334:                        out.close();
0335:                    }
0336:                }
0337:                return stream;
0338:            }
0339:
0340:            private void readUntilEndStream(OutputStream out)
0341:                    throws IOException {
0342:                int currentIndex = 0;
0343:                int byteRead = 0;
0344:                //this is the additional bytes buffered but not written
0345:                int additionalBytes = 0;
0346:                byte[] buffer = new byte[ENDSTREAM.length + additionalBytes];
0347:                int writeIndex = 0;
0348:                while (!cmpCircularBuffer(buffer, currentIndex, ENDSTREAM)
0349:                        && byteRead != -1) {
0350:                    writeIndex = currentIndex - buffer.length;
0351:                    if (writeIndex >= 0) {
0352:                        out.write(buffer[writeIndex % buffer.length]);
0353:                    }
0354:                    byteRead = pdfSource.read();
0355:                    buffer[currentIndex % buffer.length] = (byte) byteRead;
0356:                    currentIndex++;
0357:                }
0358:
0359:                //we want to ignore the end of the line data when reading a stream
0360:                //so will make an attempt to ignore it.
0361:                /*writeIndex = currentIndex - buffer.length;
0362:                if( buffer[writeIndex%buffer.length] == 13 &&
0363:                    buffer[(writeIndex+1)%buffer.length] == 10 )
0364:                {
0365:                    //then ignore the newline before the endstream
0366:                }
0367:                else if( buffer[(writeIndex+1)%buffer.length] == 10 )
0368:                {
0369:                    //Then first byte is data, second byte is newline
0370:                    out.write( buffer[writeIndex%buffer.length] );
0371:                }
0372:                else
0373:                {
0374:                    out.write( buffer[writeIndex%buffer.length] );
0375:                    out.write( buffer[(writeIndex+1)%buffer.length] );
0376:                }*/
0377:
0378:                /**
0379:                 * Old way of handling newlines before endstream
0380:                for( int i=0; i<additionalBytes; i++ )
0381:                {
0382:                    writeIndex = currentIndex - buffer.length;
0383:                    if( writeIndex >=0 &&
0384:                        //buffer[writeIndex%buffer.length] != 10 &&
0385:                        buffer[writeIndex%buffer.length] != 13 )
0386:                    {
0387:                        out.write( buffer[writeIndex%buffer.length] );
0388:                    }
0389:                    currentIndex++;
0390:                }
0391:                 */
0392:                pdfSource.unread(ENDSTREAM);
0393:
0394:            }
0395:
0396:            /**
0397:             * This checks whether the compareTo.length bytes of the circular buffer that
0398:             * end just before currentIndex match the compareTo byte array.
0399:             */
0400:            private boolean cmpCircularBuffer(byte[] buffer, int currentIndex,
0401:                    byte[] compareTo) {
0402:                int cmpLen = compareTo.length;
0403:                int buflen = buffer.length;
0404:                boolean match = true;
0405:                int off = currentIndex - cmpLen;
0406:                if (off < 0) {
0407:                    match = false;
0408:                }
0409:                for (int i = 0; match && i < cmpLen; ++i) {
0410:                    match = buffer[(off + i) % buflen] == compareTo[i];
0411:                }
0412:                return match;
0413:            }
0414:
0415:            /**
0416:             * This will parse a PDF string.
0417:             *
0418:             * @return The parsed PDF string.
0419:             *
0420:             * @throws IOException If there is an error reading from the stream.
0421:             */
0422:            protected COSString parseCOSString() throws IOException {
0423:                char nextChar = (char) pdfSource.read();
0424:                COSString retval = new COSString();
0425:                char openBrace;
0426:                char closeBrace;
0427:                if (nextChar == '(') {
0428:                    openBrace = '(';
0429:                    closeBrace = ')';
0430:                } else if (nextChar == '<') {
0431:                    openBrace = '<';
0432:                    closeBrace = '>';
0433:                } else {
0434:                    throw new IOException(
0435:                            "parseCOSString string should start with '(' or '<' and not '"
0436:                                    + nextChar + "' " + pdfSource);
0437:                }
0438:
0439:                //This is the number of braces read
0440:                //
0441:                int braces = 1;
0442:                int c = pdfSource.read();
0443:                while (braces > 0 && c != -1) {
0444:                    char ch = (char) c;
0445:                    int nextc = -2; // not yet read
0446:                    //if( log.isDebugEnabled() )
0447:                    //{
0448:                    //    log.debug( "Parsing COSString character '" + c + "' code=" + (int)c );
0449:                    //}
0450:
0451:                    if (ch == closeBrace) {
0452:                        braces--;
0453:                        byte[] nextThreeBytes = new byte[3];
0454:                        int amountRead = pdfSource.read(nextThreeBytes);
0455:
0456:                        //lets handle the special case seen in Bull  River Rules and Regulations.pdf
0457:                        //The dictionary looks like this
0458:                        //    2 0 obj
0459:                        //    <<
0460:                        //        /Type /Info
0461:                        //        /Creator (PaperPort http://www.scansoft.com)
0462:                        //        /Producer (sspdflib 1.0 http://www.scansoft.com)
0463:                        //        /Title ( (5)
0464:                        //        /Author ()
0465:                        //        /Subject ()
0466:                        //
0467:                        // Notice the /Title, the braces are not even but they should
0468:                        // be.  So lets assume that if we encounter an this scenario
0469:                        //   <end_brace><new_line><opening_slash> then that
0470:                        // means that there is an error in the pdf and assume that
0471:                        // was the end of the document.
0472:                        if (amountRead == 3) {
0473:                            if (nextThreeBytes[0] == 0x0d
0474:                                    && nextThreeBytes[1] == 0x0a
0475:                                    && nextThreeBytes[2] == 0x2f) {
0476:                                braces = 0;
0477:                            }
0478:                        }
0479:                        pdfSource.unread(nextThreeBytes, 0, amountRead);
0480:                        if (braces != 0) {
0481:                            retval.append(ch);
0482:                        }
0483:                    } else if (ch == openBrace) {
0484:                        braces++;
0485:                        retval.append(ch);
0486:                    } else if (ch == '\\') {
0487:                        //patched by ram
0488:                        char next = (char) pdfSource.read();
0489:                        switch (next) {
0490:                        case 'n':
0491:                            retval.append('\n');
0492:                            break;
0493:                        case 'r':
0494:                            retval.append('\r');
0495:                            break;
0496:                        case 't':
0497:                            retval.append('\t');
0498:                            break;
0499:                        case 'b':
0500:                            retval.append('\b');
0501:                            break;
0502:                        case 'f':
0503:                            retval.append('\f');
0504:                            break;
0505:                        case '(':
0506:                        case ')':
0507:                        case '\\':
0508:                            retval.append(next);
0509:                            break;
0510:                        case 10:
0511:                        case 13:
0512:                            //this is a break in the line so ignore it and the newline and continue
0513:                            c = pdfSource.read();
0514:                            while (isEOL(c) && c != -1) {
0515:                                c = pdfSource.read();
0516:                            }
0517:                            nextc = c;
0518:                            break;
0519:                        case '0':
0520:                        case '1':
0521:                        case '2':
0522:                        case '3':
0523:                        case '4':
0524:                        case '5':
0525:                        case '6':
0526:                        case '7': {
0527:                            StringBuffer octal = new StringBuffer();
0528:                            octal.append(next);
0529:                            c = pdfSource.read();
0530:                            char digit = (char) c;
0531:                            if (digit >= '0' && digit <= '7') {
0532:                                octal.append(digit);
0533:                                c = pdfSource.read();
0534:                                digit = (char) c;
0535:                                if (digit >= '0' && digit <= '7') {
0536:                                    octal.append(digit);
0537:                                } else {
0538:                                    nextc = c;
0539:                                }
0540:                            } else {
0541:                                nextc = c;
0542:                            }
0543:
0544:                            int character = 0;
0545:                            try {
0546:                                character = Integer.parseInt(octal.toString(),
0547:                                        8);
0548:                            } catch (NumberFormatException e) {
0549:                                throw new IOException(
0550:                                        "Error: Expected octal character, actual='"
0551:                                                + octal + "'");
0552:                            }
0553:                            retval.append(character);
0554:                            break;
0555:                        }
0556:                        default: {
0557:                            retval.append('\\');
0558:                            retval.append(next);
0559:                            //another ficken problem with PDF's, sometimes the \ doesn't really
0560:                            //mean escape like the PDF spec says it does, sometimes is should be literal
0561:                            //which is what we will assume here.
0562:                            //throw new IOException( "Unexpected break sequence '" + next + "' " + pdfSource );
0563:                        }
0564:                        }
0565:                    } else {
0566:                        if (openBrace == '<') {
0567:                            if (isHexDigit(ch)) {
0568:                                retval.append(ch);
0569:                            }
0570:                        } else {
0571:                            retval.append(ch);
0572:                        }
0573:                    }
0574:                    if (nextc != -2) {
0575:                        c = nextc;
0576:                    } else {
0577:                        c = pdfSource.read();
0578:                    }
0579:                }
0580:                if (c != -1) {
0581:                    pdfSource.unread(c);
0582:                }
0583:                if (openBrace == '<') {
0584:                    retval = COSString.createFromHexString(retval.getString());
0585:                }
0586:                return retval;
0587:            }
0588:
0589:            /**
0590:             * This will parse a PDF array object.
0591:             *
0592:             * @return The parsed PDF array.
0593:             *
0594:             * @throws IOException If there is an error parsing the stream.
0595:             */
0596:            protected COSArray parseCOSArray() throws IOException {
0597:                char ch = (char) pdfSource.read();
0598:                if (ch != '[') {
0599:                    throw new IOException("expected='[' actual='" + ch + "'");
0600:                }
0601:                COSArray po = new COSArray();
0602:                COSBase pbo = null;
0603:                skipSpaces();
0604:                int i = 0;
0605:                while (((i = pdfSource.peek()) > 0) && ((char) i != ']')) {
0606:                    pbo = parseDirObject();
0607:                    if (pbo instanceof  COSObject) {
0608:                        COSInteger genNumber = (COSInteger) po
0609:                                .remove(po.size() - 1);
0610:                        COSInteger number = (COSInteger) po
0611:                                .remove(po.size() - 1);
0612:                        COSObjectKey key = new COSObjectKey(number.intValue(),
0613:                                genNumber.intValue());
0614:                        pbo = document.getObjectFromPool(key);
0615:                    }
0616:                    if (pbo != null) {
0617:                        po.add(pbo);
0618:                    } else {
0619:                        //it could be a bad object in the array which is just skipped
0620:                    }
0621:                    skipSpaces();
0622:                }
0623:                pdfSource.read(); //read ']'
0624:                skipSpaces();
0625:                return po;
0626:            }
0627:
0628:            /**
0629:             * Determine if a character terminates a PDF name.
0630:             *
0631:             * @param ch The character
0632:             * @return <code>true</code> if the character terminates a PDF name, otherwise <code>false</code>.
0633:             */
0634:            protected boolean isEndOfName(char ch) {
0635:                return (ch == ' ' || ch == 13 || ch == 10 || ch == 9
0636:                        || ch == '>' || ch == '<' || ch == '[' || ch == '/'
0637:                        || ch == ']' || ch == ')' || ch == '(' || ch == -1 //EOF
0638:                );
0639:            }
0640:
0641:            /**
0642:             * This will parse a PDF name from the stream.
0643:             *
0644:             * @return The parsed PDF name.
0645:             *
0646:             * @throws IOException If there is an error reading from the stream.
0647:             */
0648:            protected COSName parseCOSName() throws IOException {
0649:                COSName retval = null;
0650:                int c = pdfSource.read();
0651:                if ((char) c != '/') {
0652:                    throw new IOException("expected='/' actual='" + (char) c
0653:                            + "'-" + c + " " + pdfSource);
0654:                }
0655:                // costruisce il nome
0656:                StringBuffer buffer = new StringBuffer();
0657:                c = pdfSource.read();
0658:                while (c != -1) {
0659:                    char ch = (char) c;
0660:                    if (ch == '#') {
0661:                        char ch1 = (char) pdfSource.read();
0662:                        char ch2 = (char) pdfSource.read();
0663:
0664:                        // Prior to PDF v1.2, the # was not a special character.  Also,
0665:                        // it has been observed that various PDF tools do not follow the
0666:                        // spec with respect to the # escape, even though they report
0667:                        // PDF versions of 1.2 or later.  The solution here is that we
0668:                        // interpret the # as an escape only when it is followed by two
0669:                        // valid hex digits.
0670:                        //
0671:                        if (isHexDigit(ch1) && isHexDigit(ch2)) {
0672:                            String hex = "" + ch1 + ch2;
0673:                            try {
0674:                                buffer.append((char) Integer.parseInt(hex, 16));
0675:                            } catch (NumberFormatException e) {
0676:                                throw new IOException(
0677:                                        "Error: expected hex number, actual='"
0678:                                                + hex + "'");
0679:                            }
0680:                            c = pdfSource.read();
0681:                        } else {
0682:                            pdfSource.unread(ch2);
0683:                            c = ch1;
0684:                            buffer.append(ch);
0685:                        }
0686:                    } else if (isEndOfName(ch)) {
0687:                        break;
0688:                    } else {
0689:                        buffer.append(ch);
0690:                        c = pdfSource.read();
0691:                    }
0692:                }
0693:                if (c != -1) {
0694:                    pdfSource.unread(c);
0695:                }
0696:                retval = COSName.getPDFName(buffer.toString());
0697:                return retval;
0698:            }
0699:
0700:            /**
0701:             * This will parse a boolean object from the stream.
0702:             *
0703:             * @return The parsed boolean object.
0704:             *
0705:             * @throws IOException If an IO error occurs during parsing.
0706:             */
0707:            protected COSBoolean parseBoolean() throws IOException {
0708:                COSBoolean retval = null;
0709:                char c = (char) pdfSource.peek();
0710:                if (c == 't') {
0711:                    byte[] trueArray = new byte[4];
0712:                    int amountRead = pdfSource.read(trueArray, 0, 4);
0713:                    String trueString = new String(trueArray, 0, amountRead);
0714:                    if (!trueString.equals("true")) {
0715:                        throw new IOException(
0716:                                "Error parsing boolean: expected='true' actual='"
0717:                                        + trueString + "'");
0718:                    } else {
0719:                        retval = COSBoolean.TRUE;
0720:                    }
0721:                } else if (c == 'f') {
0722:                    byte[] falseArray = new byte[5];
0723:                    int amountRead = pdfSource.read(falseArray, 0, 5);
0724:                    String falseString = new String(falseArray, 0, amountRead);
0725:                    if (!falseString.equals("false")) {
0726:                        throw new IOException(
0727:                                "Error parsing boolean: expected='true' actual='"
0728:                                        + falseString + "'");
0729:                    } else {
0730:                        retval = COSBoolean.FALSE;
0731:                    }
0732:                } else {
0733:                    throw new IOException(
0734:                            "Error parsing boolean expected='t or f' actual='"
0735:                                    + c + "'");
0736:                }
0737:                return retval;
0738:            }
0739:
0740:            /**
0741:             * This will parse a directory object from the stream.
0742:             *
0743:             * @return The parsed object.
0744:             *
0745:             * @throws IOException If there is an error during parsing.
0746:             */
0747:            protected COSBase parseDirObject() throws IOException {
0748:                COSBase retval = null;
0749:
0750:                skipSpaces();
0751:                int nextByte = pdfSource.peek();
0752:                char c = (char) nextByte;
0753:                switch (c) {
0754:                case '<': {
0755:                    int leftBracket = pdfSource.read();//pull off first left bracket
0756:                    c = (char) pdfSource.peek(); //check for second left bracket
0757:                    pdfSource.unread(leftBracket);
0758:                    if (c == '<') {
0759:
0760:                        retval = parseCOSDictionary();
0761:                        skipSpaces();
0762:                    } else {
0763:                        retval = parseCOSString();
0764:                    }
0765:                    break;
0766:                }
0767:                case '[': // array
0768:                {
0769:                    retval = parseCOSArray();
0770:                    break;
0771:                }
0772:                case '(':
0773:                    retval = parseCOSString();
0774:                    break;
0775:                case '/': // name
0776:                    retval = parseCOSName();
0777:                    break;
0778:                case 'n': // null
0779:                {
0780:                    String nullString = readString();
0781:                    if (!nullString.equals("null")) {
0782:                        throw new IOException("Expected='null' actual='"
0783:                                + nullString + "'");
0784:                    }
0785:                    retval = COSNull.NULL;
0786:                    break;
0787:                }
0788:                case 't': {
0789:                    byte[] trueBytes = new byte[4];
0790:                    int amountRead = pdfSource.read(trueBytes, 0, 4);
0791:                    String trueString = new String(trueBytes, 0, amountRead);
0792:                    if (trueString.equals("true")) {
0793:                        retval = COSBoolean.TRUE;
0794:                    } else {
0795:                        throw new IOException("expected true actual='"
0796:                                + trueString + "' " + pdfSource);
0797:                    }
0798:                    break;
0799:                }
0800:                case 'f': {
0801:                    byte[] falseBytes = new byte[5];
0802:                    int amountRead = pdfSource.read(falseBytes, 0, 5);
0803:                    String falseString = new String(falseBytes, 0, amountRead);
0804:                    if (falseString.equals("false")) {
0805:                        retval = COSBoolean.FALSE;
0806:                    } else {
0807:                        throw new IOException("expected false actual='"
0808:                                + falseString + "' " + pdfSource);
0809:                    }
0810:                    break;
0811:                }
0812:                case 'R':
0813:                    pdfSource.read();
0814:                    retval = new COSObject(null);
0815:                    break;
0816:                case (char) -1:
0817:                    return null;
0818:                default: {
0819:                    if (Character.isDigit(c) || c == '-' || c == '+'
0820:                            || c == '.') {
0821:                        StringBuffer buf = new StringBuffer();
0822:                        int ic = pdfSource.read();
0823:                        c = (char) ic;
0824:                        while (Character.isDigit(c) || c == '-' || c == '+'
0825:                                || c == '.' || c == 'E' || c == 'e') {
0826:                            buf.append(c);
0827:                            ic = pdfSource.read();
0828:                            c = (char) ic;
0829:                        }
0830:                        if (ic != -1) {
0831:                            pdfSource.unread(ic);
0832:                        }
0833:                        retval = COSNumber.get(buf.toString());
0834:                    } else {
0835:                        //This is not suppose to happen, but we will allow for it
0836:                        //so we are more compatible with POS writers that don't
0837:                        //follow the spec
0838:                        String badString = readString();
0839:                        //throw new IOException( "Unknown dir object c='" + c +
0840:                        //"' peek='" + (char)pdfSource.peek() + "' " + pdfSource );
0841:                        if (badString == null || badString.length() == 0) {
0842:                            int peek = pdfSource.peek();
0843:                            // we can end up in an infinite loop otherwise
0844:                            throw new IOException("Unknown dir object c='" + c
0845:                                    + "' cInt=" + (int) c + " peek='"
0846:                                    + (char) peek + "' peekInt=" + peek + " "
0847:                                    + pdfSource);
0848:                        }
0849:
0850:                    }
0851:                }
0852:                }
0853:                return retval;
0854:            }
0855:
0856:            /**
0857:             * This will read the next string from the stream.
0858:             *
0859:             * @return The string that was read from the stream.
0860:             *
0861:             * @throws IOException If there is an error reading from the stream.
0862:             */
0863:            protected String readString() throws IOException {
0864:                skipSpaces();
0865:                StringBuffer buffer = new StringBuffer();
0866:                int c = pdfSource.read();
0867:                while (!isEndOfName((char) c) && !isClosing(c) && c != -1) {
0868:                    buffer.append((char) c);
0869:                    c = pdfSource.read();
0870:                }
0871:                if (c != -1) {
0872:                    pdfSource.unread(c);
0873:                }
0874:                return buffer.toString();
0875:            }
0876:
0877:            /**
0878:             * This will read bytes until the end of line marker occurs.
0879:             *
0880:             * @param theString The next expected string in the stream.
0881:             *
0882:             * @return The characters between the current position and the end of the line.
0883:             *
0884:             * @throws IOException If there is an error reading from the stream or theString does not match what was read.
0885:             */
0886:            protected String readExpectedString(String theString)
0887:                    throws IOException {
0888:                int c = pdfSource.read();
0889:                while (isWhitespace(c) && c != -1) {
0890:                    c = pdfSource.read();
0891:                }
0892:                StringBuffer buffer = new StringBuffer(theString.length());
0893:                int charsRead = 0;
0894:                while (!isEOL(c) && c != -1 && charsRead < theString.length()) {
0895:                    char next = (char) c;
0896:                    buffer.append(next);
0897:                    if (theString.charAt(charsRead) == next) {
0898:                        charsRead++;
0899:                    } else {
0900:                        throw new IOException("Error: Expected to read '"
0901:                                + theString + "' instead started reading '"
0902:                                + buffer.toString() + "'");
0903:                    }
0904:                    c = pdfSource.read();
0905:                }
0906:                while (isEOL(c) && c != -1) {
0907:                    c = pdfSource.read();
0908:                }
0909:                if (c != -1) {
0910:                    pdfSource.unread(c);
0911:                }
0912:                return buffer.toString();
0913:            }
0914:
0915:            /**
0916:             * This will read the next string from the stream up to a certain length.
0917:             *
0918:             * @param length The length to stop reading at.
0919:             *
0920:             * @return The string that was read from the stream of length 0 to length.
0921:             *
0922:             * @throws IOException If there is an error reading from the stream.
0923:             */
0924:            protected String readString(int length) throws IOException {
0925:                skipSpaces();
0926:
0927:                int c = pdfSource.read();
0928:
0929:                //average string size is around 2 and the normal string buffer size is
0930:                //about 16 so lets save some space.
0931:                StringBuffer buffer = new StringBuffer(length);
0932:                while (!isWhitespace(c) && !isClosing(c) && c != -1
0933:                        && buffer.length() < length && c != '[' && c != '<'
0934:                        && c != '(' && c != '/') {
0935:                    buffer.append((char) c);
0936:                    c = pdfSource.read();
0937:                }
0938:                if (c != -1) {
0939:                    pdfSource.unread(c);
0940:                }
0941:                return buffer.toString();
0942:            }
0943:
0944:            /**
0945:             * This will tell if the next character is a closing brace( close of PDF array ).
0946:             *
0947:             * @return true if the next byte is ']', false otherwise.
0948:             *
0949:             * @throws IOException If an IO error occurs.
0950:             */
0951:            protected boolean isClosing() throws IOException {
0952:                return isClosing(pdfSource.peek());
0953:            }
0954:
0955:            /**
0956:             * This will tell if the next character is a closing brace( close of PDF array ).
0957:             *
0958:             * @param c The character to check against end of line
0959:             * @return true if the next byte is ']', false otherwise.
0960:             */
0961:            protected boolean isClosing(int c) {
0962:                return c == ']';
0963:            }
0964:
0965:            /**
0966:             * This will read bytes until the end of line marker occurs.
0967:             *
0968:             * @return The characters between the current position and the end of the line.
0969:             *
0970:             * @throws IOException If there is an error reading from the stream.
0971:             */
0972:            protected String readLine() throws IOException {
0973:                int c = pdfSource.read();
0974:                while (isWhitespace(c) && c != -1) {
0975:                    c = pdfSource.read();
0976:                }
0977:                StringBuffer buffer = new StringBuffer(11);
0978:
0979:                while (!isEOL(c) && c != -1) {
0980:                    buffer.append((char) c);
0981:                    c = pdfSource.read();
0982:                }
0983:                while (isEOL(c) && c != -1) {
0984:                    c = pdfSource.read();
0985:                }
0986:                if (c != -1) {
0987:                    pdfSource.unread(c);
0988:                }
0989:                return buffer.toString();
0990:            }
0991:
0992:            /**
0993:             * This will tell if the next byte to be read is an end of line byte.
0994:             *
0995:             * @return true if the next byte is 0x0A or 0x0D.
0996:             *
0997:             * @throws IOException If there is an error reading from the stream.
0998:             */
0999:            protected boolean isEOL() throws IOException {
1000:                return isEOL(pdfSource.peek());
1001:            }
1002:
1003:            /**
1004:             * This will tell if the next byte to be read is an end of line byte.
1005:             *
1006:             * @param c The character to check against end of line
1007:             * @return true if the next byte is 0x0A or 0x0D.
1008:             */
1009:            protected boolean isEOL(int c) {
1010:                return c == 10 || c == 13;
1011:            }
1012:
1013:            /**
1014:             * This will tell if the next byte is whitespace or not.
1015:             *
1016:             * @return true if the next byte in the stream is a whitespace character.
1017:             *
1018:             * @throws IOException If there is an error reading from the stream.
1019:             */
1020:            protected boolean isWhitespace() throws IOException {
1021:                return isWhitespace(pdfSource.peek());
1022:            }
1023:
1024:            /**
1025:             * This will tell if the next byte is whitespace or not.
1026:             *
1027:             * @param c The character to check against whitespace
1028:             *
1029:             * @return true if the next byte in the stream is a whitespace character.
1030:             */
1031:            protected boolean isWhitespace(int c) {
1032:                return c == 0 || c == 9 || c == 12 || c == 10 || c == 13
1033:                        || c == 32;
1034:            }
1035:
1036:            /**
1037:             * This will skip all spaces and comments that are present.
1038:             *
1039:             * @throws IOException If there is an error reading from the stream.
1040:             */
1041:            protected void skipSpaces() throws IOException {
1042:                //log( "skipSpaces() " + pdfSource );
1043:                int c = pdfSource.read();
1044:                // identical to, but faster as: isWhiteSpace(c) || c == 37
1045:                while (c == 0 || c == 9 || c == 12 || c == 10 || c == 13
1046:                        || c == 32 || c == 37)//37 is the % character, a comment
1047:                {
1048:                    if (c == 37) {
1049:                        // skip past the comment section
1050:                        c = pdfSource.read();
1051:                        while (!isEOL(c) && c != -1) {
1052:                            c = pdfSource.read();
1053:                        }
1054:                    } else {
1055:                        c = pdfSource.read();
1056:                    }
1057:                }
1058:                if (c != -1) {
1059:                    pdfSource.unread(c);
1060:                }
1061:                //log( "skipSpaces() done peek='" + (char)pdfSource.peek() + "'" );
1062:            }
1063:
1064:            /**
1065:             * This will read an integer from the stream.
1066:             *
1067:             * @return The integer that was read from the stream.
1068:             *
1069:             * @throws IOException If there is an error reading from the stream.
1070:             */
1071:            protected int readInt() throws IOException {
1072:                skipSpaces();
1073:                int retval = 0;
1074:
1075:                int lastByte = 0;
1076:                StringBuffer intBuffer = new StringBuffer();
1077:                while ((lastByte = pdfSource.read()) != 32 && lastByte != 10
1078:                        && lastByte != 13 && lastByte != 0 && //See sourceforge bug 853328
1079:                        lastByte != -1) {
1080:                    intBuffer.append((char) lastByte);
1081:                }
1082:                try {
1083:                    retval = Integer.parseInt(intBuffer.toString());
1084:                } catch (NumberFormatException e) {
1085:                    throw new IOException(
1086:                            "Error: Expected an integer type, actual='"
1087:                                    + intBuffer + "'");
1088:                }
1089:                return retval;
1090:            }
1091:
1092:            /**
1093:             * This will add an xref.
1094:             *
1095:             * @param xref The xref to add.
1096:             */
1097:            public void addXref(PDFXref xref) {
1098:                xrefs.add(xref);
1099:            }
1100:
1101:            /**
1102:             * This will get all of the xrefs.
1103:             *
1104:             * @return A list of all xrefs.
1105:             */
1106:            public List getXrefs() {
1107:                return xrefs;
1108:            }
1109:        }
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.