001: /*
002: * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
003: *
004: * Copyright 1997-2007 Sun Microsystems, Inc. All rights reserved.
005: *
006: * The contents of this file are subject to the terms of either the GNU
007: * General Public License Version 2 only ("GPL") or the Common Development
008: * and Distribution License("CDDL") (collectively, the "License"). You
009: * may not use this file except in compliance with the License. You can obtain
010: * a copy of the License at https://glassfish.dev.java.net/public/CDDL+GPL.html
011: * or glassfish/bootstrap/legal/LICENSE.txt. See the License for the specific
012: * language governing permissions and limitations under the License.
013: *
014: * When distributing the software, include this License Header Notice in each
015: * file and include the License file at glassfish/bootstrap/legal/LICENSE.txt.
016: * Sun designates this particular file as subject to the "Classpath" exception
017: * as provided by Sun in the GPL Version 2 section of the License file that
018: * accompanied this code. If applicable, add the following below the License
019: * Header, with the fields enclosed by brackets [] replaced by your own
020: * identifying information: "Portions Copyrighted [year]
021: * [name of copyright owner]"
022: *
023: * Contributor(s):
024: *
025: * If you wish your version of this file to be governed by only the CDDL or
026: * only the GPL Version 2, indicate your decision by adding "[Contributor]
027: * elects to include this software in this distribution under the [CDDL or GPL
028: * Version 2] license." If you don't indicate a single choice of license, a
029: * recipient has the option to distribute your version of this file under
030: * either the CDDL, the GPL Version 2 or to extend the choice of license to
031: * its licensees as provided above. However, if you add GPL Version 2 code
032: * and therefore, elected the GPL Version 2 license, then the option applies
033: * only if the new code is made subject to such option by the copyright
034: * holder.
035: */
036: package org.jvnet.mimepull;
037:
038: import java.io.InputStream;
039: import java.io.IOException;
040: import java.io.PushbackInputStream;
041: import java.util.*;
042: import java.nio.ByteBuffer;
043:
/**
 * Pull parser for MIME messages. Applications use the pull API to parse a
 * MIME message lazily, advancing the parser one event at a time.
 *
 * <p>For example:
 *
 * <pre>
 * MIMEParser parser = ...
 * Iterator&lt;MIMEEvent&gt; it = parser.iterator();
 * while (it.hasNext()) {
 *     MIMEEvent event = it.next();
 *     ...
 * }
 * </pre>
 *
 * @author Jitendra Kotamraju
 */
062: class MIMEParser implements Iterable<MIMEEvent> {
// Strictly speaking, the grammar does not allow whitespace characters
// after a boundary, but the mail implementation checks for them.
// This is the number of trailing whitespace characters (after a boundary)
// that the read buffer is sized to accommodate; see the capacity
// computation in the constructor.
private static final int NO_LWSP = 1000;
067:
068: private enum STATE {
069: START_MESSAGE, SKIP_PREAMBLE, START_PART, HEADERS, BODY, END_PART, END_MESSAGE
070: }
071:
// Current parser state; advanced by the event iterator and by readBody().
private STATE state = STATE.START_MESSAGE;

// Stream the MIME message is read from.
private final InputStream in;
// Delimiter that is searched for: "--" followed by the boundary string.
private final byte[] bndbytes;
// Length of bndbytes.
private final int bl;
// Parser configuration; supplies chunkSize.
private final MIMEConfig config;
private final int[] bcs = new int[128]; // BnM algo: Bad Character Shift table
private final int[] gss; // BnM algo : Good Suffix Shift table

/**
 * Have we parsed the data from our InputStream yet?
 * Set to true when the END_MESSAGE event is handed out.
 */
private boolean parsed;

/*
 * Set by readBody() once the last part has been reached, i.e. when the
 * terminating boundary line ("--boundary--") is seen or the stream hits
 * EOF without any further boundary.
 */
private boolean done = false;

// Set by fillBuf() once the underlying stream has returned -1.
private boolean eof;
// Minimum size of buf; createBuf() never allocates less than this.
private final int capacity;
// Read buffer; the unconsumed data always starts at index 0.
private byte[] buf;
// Number of valid bytes in buf.
private int len;
private boolean bol; // beginning of the line
097:
/**
 * Creates a parser for the given stream.
 *
 * @param in stream containing the MIME message
 * @param boundary boundary string of the message, without the leading "--"
 * @param config parser configuration; its chunkSize determines the buffer size
 */
MIMEParser(InputStream in, String boundary, MIMEConfig config) {
    this.in = in;
    this.config = config;

    // The delimiter that is searched for is "--" followed by the boundary.
    this.bndbytes = getBytes("--" + boundary);
    this.bl = this.bndbytes.length;

    // Precompute the Boyer-Moore shift tables for the delimiter.
    this.gss = new int[this.bl];
    compileBoundaryPattern();

    // One chunk plus room for: \r\n + boundary + "--\r\n" + lots of LWSP
    this.capacity = config.chunkSize + 2 + this.bl + 4 + NO_LWSP;
    createBuf(this.capacity);
}
110:
111: /**
112: * Returns iterator for the parsing events. Use the iterator to advance
113: * the parsing.
114: *
115: * @return iterator for parsing events
116: */
117: public Iterator<MIMEEvent> iterator() {
118: return new MIMEEventIterator();
119: }
120:
121: class MIMEEventIterator implements Iterator<MIMEEvent> {
122:
123: public boolean hasNext() {
124: return !parsed;
125: }
126:
127: public MIMEEvent next() {
128: switch (state) {
129: case START_MESSAGE:
130: state = STATE.SKIP_PREAMBLE;
131: return MIMEEvent.START_MESSAGE;
132:
133: case SKIP_PREAMBLE:
134: skipPreamble();
135: // fall through
136: case START_PART:
137: state = STATE.HEADERS;
138: return MIMEEvent.START_PART;
139:
140: case HEADERS:
141: InternetHeaders ih = readHeaders();
142: state = STATE.BODY;
143: bol = true;
144: return new MIMEEvent.Headers(ih);
145:
146: case BODY:
147: ByteBuffer buf = readBody();
148: bol = false;
149: return new MIMEEvent.Content(buf);
150:
151: case END_PART:
152: if (done) {
153: state = STATE.END_MESSAGE;
154: } else {
155: state = STATE.START_PART;
156: }
157: return MIMEEvent.END_PART;
158:
159: case END_MESSAGE:
160: parsed = true;
161: return MIMEEvent.END_MESSAGE;
162:
163: default:
164: throw new MIMEParsingException(
165: "Unknown Parser state = " + state);
166: }
167: }
168:
169: public void remove() {
170: throw new UnsupportedOperationException();
171: }
172: }
173:
174: /**
175: * Collects the headers for the current part by parsing mesage stream.
176: *
177: * @return headers for the current part
178: */
179: private InternetHeaders readHeaders() {
180: if (!eof) {
181: fillBuf();
182: }
183: return new InternetHeaders(new LineInputStream());
184: }
185:
/**
 * Reads and returns the next chunk of the current part's content.
 * At the end of this method, buf has the remaining (unconsumed) data
 * at index 0.
 *
 * When the end of the part is detected (a proper boundary line, the
 * closing "--boundary--" delimiter, or EOF without a boundary), the parser
 * state is moved to END_PART; for the latter two, done is set as well.
 * The returned chunk may be empty.
 *
 * @return a chunk of the part's content
 */
private ByteBuffer readBody() {
    if (!eof) {
        fillBuf();
    }
    int start = match(buf, 0, len); // matches boundary
    if (start == -1) {
        // No boundary is found
        assert eof || len >= config.chunkSize;
        // At EOF everything left is content; otherwise hand out one chunk
        // and keep the tail, which may hold the beginning of a boundary.
        int chunkSize = eof ? len : config.chunkSize;
        if (eof) {
            // The stream ended without a closing boundary: end the part
            // and the message.
            done = true;
            state = STATE.END_PART;
        }
        return adjustBuf(chunkSize, len - chunkSize);
    }
    // Found boundary.
    // Is it at the start of a line ?
    // chunkLen is the content length; the line terminator in front of the
    // boundary ("\n", "\r" or "\r\n") is not part of the content.
    int chunkLen = start;
    if (bol && start == 0) {
        // nothing to do
    } else if (start > 0
            && (buf[start - 1] == '\n' || buf[start - 1] == '\r')) {
        --chunkLen;
        if (buf[start - 1] == '\n' && start > 1
                && buf[start - 2] == '\r') {
            --chunkLen;
        }
    } else {
        // Return everything up to and including the first byte of the
        // match as content, so that the next search starts behind it.
        return adjustBuf(start + 1, len - start - 1); // boundary is not at beginning of a line
    }

    // Boundary followed by "--" is the closing delimiter of the message.
    // Nothing is kept in the buffer (remaining == 0).
    if (start + bl + 1 < len && buf[start + bl] == '-'
            && buf[start + bl + 1] == '-') {
        state = STATE.END_PART;
        done = true;
        return adjustBuf(chunkLen, 0);
    }

    // Consider all the whitespace in boundary+whitespace+"\r\n"
    int lwsp = 0;
    for (int i = start + bl; i < len
            && (buf[i] == ' ' || buf[i] == '\t'); i++) {
        ++lwsp;
    }

    // Check for \n or \r\n
    // On success the boundary line including its terminator is dropped and
    // the buffer continues with the first byte after it.
    if (start + bl + lwsp < len
            && (buf[start + bl + lwsp] == '\n' || buf[start + bl
                    + lwsp] == '\r')) {
        if (buf[start + bl + lwsp] == '\n') {
            state = STATE.END_PART;
            return adjustBuf(chunkLen, len - start - bl - lwsp - 1);
        } else if (start + bl + lwsp + 1 < len
                && buf[start + bl + lwsp + 1] == '\n') {
            state = STATE.END_PART;
            return adjustBuf(chunkLen, len - start - bl - lwsp - 2);
        }
    }

    // Let us give chance to consume atleast NO_LWSP whitespace characters
    // The data in front of the boundary is returned and the boundary moves
    // to index 0, so the next call can read more bytes behind it.
    // NOTE(review): this returns start (not chunkLen) bytes, and on the
    // next call the boundary sits at index 0 while bol is presumably false
    // by then, which takes the "not at beginning of a line" branch above.
    // Confirm whether a real boundary can be missed here.
    if (lwsp > 0 && start > config.chunkSize) {
        return adjustBuf(start, len - start);
    }

    // Not a proper boundary
    return adjustBuf(start + 1, len - start - 1);
}
261:
262: /**
263: * Returns a chunk from the original buffer. A new buffer is
264: * created with the remaining bytes.
265: *
266: * @param chunkSize create a chunk with these many bytes
267: * @param remaining bytes from the end of the buffer that need to be copied to
268: * the beginning of the new buffer
269: * @return chunk
270: */
271: private ByteBuffer adjustBuf(int chunkSize, int remaining) {
272: assert buf != null;
273: assert chunkSize >= 0;
274: assert remaining >= 0;
275:
276: byte[] temp = buf;
277: // create a new buf and adjust it without this chunk
278: createBuf(remaining);
279: System.arraycopy(temp, len - remaining, buf, 0, remaining);
280: len = remaining;
281:
282: return ByteBuffer.wrap(temp, 0, chunkSize);
283: }
284:
285: private void createBuf(int min) {
286: buf = new byte[min < capacity ? capacity : min];
287: }
288:
289: /**
290: * Skips the preamble to find the first attachment part
291: */
292: private void skipPreamble() {
293:
294: while (true) {
295: if (!eof) {
296: fillBuf();
297: }
298: int start = match(buf, 0, len); // matches boundary
299: if (start == -1) {
300: // No boundary is found
301: if (eof) {
302: throw new MIMEParsingException(
303: "Missing start boundary");
304: } else {
305: adjustBuf(len - bl + 1, bl - 1);
306: continue;
307: }
308: }
309:
310: if (start > config.chunkSize) {
311: adjustBuf(start, len - start);
312: continue;
313: }
314: // Consider all the whitespace boundary+whitespace+"\r\n"
315: int lwsp = 0;
316: for (int i = start + bl; i < len
317: && (buf[i] == ' ' || buf[i] == '\t'); i++) {
318: ++lwsp;
319: }
320: // Check for \n or \r\n
321: if (start + bl + lwsp < len
322: && (buf[start + bl + lwsp] == '\n' || buf[start
323: + bl + lwsp] == '\r')) {
324: if (buf[start + bl + lwsp] == '\n') {
325: adjustBuf(start + bl + lwsp + 1, len - start - bl
326: - lwsp - 1);
327: break;
328: } else if (start + bl + lwsp + 1 < len
329: && buf[start + bl + lwsp + 1] == '\n') {
330: adjustBuf(start + bl + lwsp + 2, len - start - bl
331: - lwsp - 2);
332: break;
333: }
334: }
335: adjustBuf(start + 1, len - start - 1);
336: }
337: }
338:
339: private static byte[] getBytes(String s) {
340: char[] chars = s.toCharArray();
341: int size = chars.length;
342: byte[] bytes = new byte[size];
343:
344: for (int i = 0; i < size;)
345: bytes[i] = (byte) chars[i++];
346: return bytes;
347: }
348:
/**
 * Boyer-Moore search method. Copied from java.util.regex.Pattern.java
 *
 * Pre-calculates the arrays needed to generate the bad character
 * shift (bcs) and the good suffix shift (gss) for the boundary bytes.
 * Only the last seven bits are used to see if chars match; this keeps
 * the tables small and covers the heavily used ASCII range, but
 * occasionally results in an aliased match for the bad character shift.
 */
private void compileBoundaryPattern() {
    int i, j;

    // Precalculate part of the bad character shift
    // It is a table for where in the pattern each
    // lower 7-bit value occurs: the 1-based index of its last occurrence
    // (later positions overwrite earlier ones), 0 if it does not occur.
    for (i = 0; i < bndbytes.length; i++) {
        bcs[bndbytes[i] & 0x7F] = i + 1;
    }

    // Precalculate the good suffix shift
    // i is the shift amount being considered
    NEXT: for (i = bndbytes.length; i > 0; i--) {
        // j is the beginning index of suffix being considered
        for (j = bndbytes.length - 1; j >= i; j--) {
            // Testing for good suffix
            if (bndbytes[j] == bndbytes[j - i]) {
                // src[j..len] is a good suffix
                gss[j - 1] = i;
            } else {
                // No match. The array has already been
                // filled up with correct values before.
                continue NEXT;
            }
        }
        // The whole suffix matched for this shift amount. Fill the
        // remaining (lower) entries of gss with i as well: a suffix
        // cannot have a larger shift amount than its sub-suffix.
        while (j > 0) {
            gss[--j] = i;
        }
    }
    // Set the guard value: a mismatch at the last pattern position gets a
    // good suffix shift of 1.
    gss[bndbytes.length - 1] = 1;
}
393:
394: /**
395: * Finds the boundary in the given buffer using Boyer-Moore algo.
396: * Copied from java.util.regex.Pattern.java
397: *
398: * @param mybuf boundary to be searched in this mybuf
399: * @param off start index in mybuf
400: * @param len number of bytes in mybuf
401: *
402: * @return -1 if there is no match or index where the match starts
403: */
404: private int match(byte[] mybuf, int off, int len) {
405: int last = len - bndbytes.length;
406:
407: // Loop over all possible match positions in text
408: NEXT: while (off <= last) {
409: // Loop over pattern from right to left
410: for (int j = bndbytes.length - 1; j >= 0; j--) {
411: byte ch = mybuf[off + j];
412: if (ch != bndbytes[j]) {
413: // Shift search to the right by the maximum of the
414: // bad character shift and the good suffix shift
415: off += Math.max(j + 1 - bcs[ch & 0x7F], gss[j]);
416: continue NEXT;
417: }
418: }
419: // Entire pattern matched starting at off
420: return off;
421: }
422: return -1;
423: }
424:
425: /**
426: * Fills the remaining buf to the full capacity
427: */
428: private void fillBuf() {
429: assert !eof;
430: while (len < buf.length) {
431: int read;
432: try {
433: read = in.read(buf, len, buf.length - len);
434: } catch (IOException ioe) {
435: throw new MIMEParsingException(ioe);
436: }
437: if (read == -1) {
438: eof = true;
439: break;
440: } else {
441: len += read;
442: }
443: }
444: }
445:
446: private void doubleBuf() {
447: byte[] temp = new byte[2 * len];
448: System.arraycopy(buf, 0, temp, 0, len);
449: buf = temp;
450: fillBuf();
451: }
452:
453: class LineInputStream {
454: private int offset;
455:
456: /**
457: * Read a line containing only ASCII characters from the input
458: * stream. A line is terminated by a CR or NL or CR-NL sequence.
459: * A common error is a CR-CR-NL sequence, which will also terminate
460: * a line.
461: * The line terminator is not returned as part of the returned
462: * String. Returns null if no data is available. <p>
463: *
464: * This class is similar to the deprecated
465: * <code>DataInputStream.readLine()</code>
466: */
467: public String readLine() throws IOException {
468:
469: int hdrLen = 0;
470: int lwsp = 0;
471: while (offset + hdrLen < len) {
472: if (buf[offset + hdrLen] == '\n') {
473: lwsp = 1;
474: break;
475: }
476: if (offset + hdrLen + 1 == len) {
477: doubleBuf();
478: }
479: if (offset + hdrLen + 1 >= len) { // No more data in the stream
480: assert eof;
481: return null;
482: }
483: if (buf[offset + hdrLen] == '\r'
484: && buf[offset + hdrLen + 1] == '\n') {
485: lwsp = 2;
486: break;
487: }
488: ++hdrLen;
489: }
490: if (hdrLen == 0) {
491: adjustBuf(offset + lwsp, len - offset - lwsp);
492: return null;
493: }
494:
495: String hdr = new String(buf, offset, hdrLen);
496: offset += hdrLen + lwsp;
497: return hdr;
498: }
499:
500: }
501:
502: }
|