001: /*
002: * The contents of this file are subject to the terms
003: * of the Common Development and Distribution License
004: * (the "License"). You may not use this file except
005: * in compliance with the License.
006: *
007: * You can obtain a copy of the license at
008: * https://jwsdp.dev.java.net/CDDLv1.0.html
009: * See the License for the specific language governing
010: * permissions and limitations under the License.
011: *
012: * When distributing Covered Code, include this CDDL
013: * HEADER in each file and include the License file at
014: * https://jwsdp.dev.java.net/CDDLv1.0.html If applicable,
015: * add the following below this CDDL HEADER, with the
016: * fields enclosed by brackets "[]" replaced with your
017: * own identifying information: Portions Copyright [yyyy]
018: * [name of copyright owner]
019: */
020: /*
021: * @(#)HeaderTokenizer.java 1.9 02/03/27
022: */
023:
024: /*
025: * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
026: *
027: * Copyright 1997-2007 Sun Microsystems, Inc. All rights reserved.
028: *
029: * The contents of this file are subject to the terms of either the GNU
030: * General Public License Version 2 only ("GPL") or the Common Development
031: * and Distribution License("CDDL") (collectively, the "License"). You
032: * may not use this file except in compliance with the License. You can obtain
033: * a copy of the License at https://glassfish.dev.java.net/public/CDDL+GPL.html
034: * or glassfish/bootstrap/legal/LICENSE.txt. See the License for the specific
035: * language governing permissions and limitations under the License.
036: *
037: * When distributing the software, include this License Header Notice in each
038: * file and include the License file at glassfish/bootstrap/legal/LICENSE.txt.
039: * Sun designates this particular file as subject to the "Classpath" exception
040: * as provided by Sun in the GPL Version 2 section of the License file that
041: * accompanied this code. If applicable, add the following below the License
042: * Header, with the fields enclosed by brackets [] replaced by your own
043: * identifying information: "Portions Copyrighted [year]
044: * [name of copyright owner]"
045: *
046: * Contributor(s):
047: *
048: * If you wish your version of this file to be governed by only the CDDL or
049: * only the GPL Version 2, indicate your decision by adding "[Contributor]
050: * elects to include this software in this distribution under the [CDDL or GPL
051: * Version 2] license." If you don't indicate a single choice of license, a
052: * recipient has the option to distribute your version of this file under
053: * either the CDDL, the GPL Version 2 or to extend the choice of license to
054: * its licensees as provided above. However, if you add GPL Version 2 code
055: * and therefore, elected the GPL Version 2 license, then the option applies
056: * only if the new code is made subject to such option by the copyright
057: * holder.
058: */
059:
060: package com.sun.xml.messaging.saaj.packaging.mime.internet;
061:
062: /**
063: * This class tokenizes RFC822 and MIME headers into the basic
064: * symbols specified by RFC822 and MIME. <p>
065: *
066: * This class handles folded headers (ie headers with embedded
067: * CRLF SPACE sequences). The folds are removed in the returned
068: * tokens.
069: *
070: * @version 1.9, 02/03/27
071: * @author John Mani
072: */
073:
074: public class HeaderTokenizer {
075:
076: /**
077: * The Token class represents tokens returned by the
078: * HeaderTokenizer.
079: */
080: public static class Token {
081:
082: private int type;
083: private String value;
084:
085: /**
086: * Token type indicating an ATOM.
087: */
088: public static final int ATOM = -1;
089:
090: /**
091: * Token type indicating a quoted string. The value
092: * field contains the string without the quotes.
093: */
094: public static final int QUOTEDSTRING = -2;
095:
096: /**
097: * Token type indicating a comment. The value field
098: * contains the comment string without the comment
099: * start and end symbols.
100: */
101: public static final int COMMENT = -3;
102:
103: /**
104: * Token type indicating end of input.
105: */
106: public static final int EOF = -4;
107:
108: /**
109: * Constructor.
110: * @param type Token type
111: * @param value Token value
112: */
113: public Token(int type, String value) {
114: this .type = type;
115: this .value = value;
116: }
117:
118: /**
119: * Return the type of the token. If the token represents a
120: * delimiter or a control character, the type is that character
121: * itself, converted to an integer. Otherwise, it's value is
122: * one of the following:
123: * <ul>
124: * <li><code>ATOM</code> A sequence of ASCII characters
125: * delimited by either SPACE, CTL, "(", <"> or the
126: * specified SPECIALS
127: * <li><code>QUOTEDSTRING</code> A sequence of ASCII characters
128: * within quotes
129: * <li><code>COMMENT</code> A sequence of ASCII characters
130: * within "(" and ")".
131: * <li><code>EOF</code> End of header
132: * </ul>
133: */
134: public int getType() {
135: return type;
136: }
137:
138: /**
139: * Returns the value of the token just read. When the current
140: * token is a quoted string, this field contains the body of the
141: * string, without the quotes. When the current token is a comment,
142: * this field contains the body of the comment.
143: *
144: * @return token value
145: */
146: public String getValue() {
147: return value;
148: }
149: }
150:
151: private String string; // the string to be tokenized
152: private boolean skipComments; // should comments be skipped ?
153: private String delimiters; // delimiter string
154: private int currentPos; // current parse position
155: private int maxPos; // string length
156: private int nextPos; // track start of next Token for next()
157: private int peekPos; // track start of next Token for peek()
158:
159: /**
160: * RFC822 specials
161: */
162: public final static String RFC822 = "()<>@,;:\\\"\t .[]";
163:
164: /**
165: * MIME specials
166: */
167: public final static String MIME = "()<>@,;:\\\"\t []/?=";
168:
169: // The EOF Token
170: private final static Token EOFToken = new Token(Token.EOF, null);
171:
172: /**
173: * Constructor that takes a rfc822 style header.
174: *
175: * @param header The rfc822 header to be tokenized
176: * @param delimiters Set of delimiter characters
177: * to be used to delimit ATOMS. These
178: * are usually <code>RFC822</code> or
179: * <code>MIME</code>
180: * @param skipComments If true, comments are skipped and
181: * not returned as tokens
182: */
183: public HeaderTokenizer(String header, String delimiters,
184: boolean skipComments) {
185: string = (header == null) ? "" : header; // paranoia ?!
186: this .skipComments = skipComments;
187: this .delimiters = delimiters;
188: currentPos = nextPos = peekPos = 0;
189: maxPos = string.length();
190: }
191:
192: /**
193: * Constructor. Comments are ignored and not returned as tokens
194: *
195: * @param header The header that is tokenized
196: * @param delimiters The delimiters to be used
197: */
198: public HeaderTokenizer(String header, String delimiters) {
199: this (header, delimiters, true);
200: }
201:
202: /**
203: * Constructor. The RFC822 defined delimiters - RFC822 - are
204: * used to delimit ATOMS. Also comments are skipped and not
205: * returned as tokens
206: */
207: public HeaderTokenizer(String header) {
208: this (header, RFC822);
209: }
210:
211: /**
212: * Parses the next token from this String. <p>
213: *
214: * Clients sit in a loop calling next() to parse successive
215: * tokens until an EOF Token is returned.
216: *
217: * @return the next Token
218: * @exception ParseException if the parse fails
219: */
220: public Token next() throws ParseException {
221: Token tk;
222:
223: currentPos = nextPos; // setup currentPos
224: tk = getNext();
225: nextPos = peekPos = currentPos; // update currentPos and peekPos
226: return tk;
227: }
228:
229: /**
230: * Peek at the next token, without actually removing the token
231: * from the parse stream. Invoking this method multiple times
232: * will return successive tokens, until <code>next()</code> is
233: * called. <p>
234: *
235: * @return the next Token
236: * @exception ParseException if the parse fails
237: */
238: public Token peek() throws ParseException {
239: Token tk;
240:
241: currentPos = peekPos; // setup currentPos
242: tk = getNext();
243: peekPos = currentPos; // update peekPos
244: return tk;
245: }
246:
247: /**
248: * Return the rest of the Header.
249: *
250: * @return String rest of header. null is returned if we are
251: * already at end of header
252: */
253: public String getRemainder() {
254: return string.substring(nextPos);
255: }
256:
257: /*
258: * Return the next token starting from 'currentPos'. After the
259: * parse, 'currentPos' is updated to point to the start of the
260: * next token.
261: */
262: private Token getNext() throws ParseException {
263: // If we're already at end of string, return EOF
264: if (currentPos >= maxPos)
265: return EOFToken;
266:
267: // Skip white-space, position currentPos beyond the space
268: if (skipWhiteSpace() == Token.EOF)
269: return EOFToken;
270:
271: char c;
272: int start;
273: boolean filter = false;
274:
275: c = string.charAt(currentPos);
276:
277: // Check or Skip comments and position currentPos
278: // beyond the comment
279: while (c == '(') {
280: // Parsing comment ..
281: int nesting;
282: for (start = ++currentPos, nesting = 1; nesting > 0
283: && currentPos < maxPos; currentPos++) {
284: c = string.charAt(currentPos);
285: if (c == '\\') { // Escape sequence
286: currentPos++; // skip the escaped character
287: filter = true;
288: } else if (c == '\r')
289: filter = true;
290: else if (c == '(')
291: nesting++;
292: else if (c == ')')
293: nesting--;
294: }
295: if (nesting != 0)
296: throw new ParseException("Unbalanced comments");
297:
298: if (!skipComments) {
299: // Return the comment, if we are asked to.
300: // Note that the comment start & end markers are ignored.
301: String s;
302: if (filter) // need to go thru the token again.
303: s = filterToken(string, start, currentPos - 1);
304: else
305: s = string.substring(start, currentPos - 1);
306:
307: return new Token(Token.COMMENT, s);
308: }
309:
310: // Skip any whitespace after the comment.
311: if (skipWhiteSpace() == Token.EOF)
312: return EOFToken;
313: c = string.charAt(currentPos);
314: }
315:
316: // Check for quoted-string and position currentPos
317: // beyond the terminating quote
318: if (c == '"') {
319: for (start = ++currentPos; currentPos < maxPos; currentPos++) {
320: c = string.charAt(currentPos);
321: if (c == '\\') { // Escape sequence
322: currentPos++;
323: filter = true;
324: } else if (c == '\r')
325: filter = true;
326: else if (c == '"') {
327: currentPos++;
328: String s;
329:
330: if (filter)
331: s = filterToken(string, start, currentPos - 1);
332: else
333: s = string.substring(start, currentPos - 1);
334:
335: return new Token(Token.QUOTEDSTRING, s);
336: }
337: }
338: throw new ParseException("Unbalanced quoted string");
339: }
340:
341: // Check for SPECIAL or CTL
342: if (c < 040 || c >= 0177 || delimiters.indexOf(c) >= 0) {
343: currentPos++; // re-position currentPos
344: char ch[] = new char[1];
345: ch[0] = c;
346: return new Token((int) c, new String(ch));
347: }
348:
349: // Check for ATOM
350: for (start = currentPos; currentPos < maxPos; currentPos++) {
351: c = string.charAt(currentPos);
352: // ATOM is delimited by either SPACE, CTL, "(", <">
353: // or the specified SPECIALS
354: if (c < 040 || c >= 0177 || c == '(' || c == ' '
355: || c == '"' || delimiters.indexOf(c) >= 0)
356: break;
357: }
358: return new Token(Token.ATOM, string
359: .substring(start, currentPos));
360: }
361:
362: // Skip SPACE, HT, CR and NL
363: private int skipWhiteSpace() {
364: char c;
365: for (; currentPos < maxPos; currentPos++)
366: if (((c = string.charAt(currentPos)) != ' ') && (c != '\t')
367: && (c != '\r') && (c != '\n'))
368: return currentPos;
369: return Token.EOF;
370: }
371:
372: /* Process escape sequences and embedded LWSPs from a comment or
373: * quoted string.
374: */
375: private static String filterToken(String s, int start, int end) {
376: StringBuffer sb = new StringBuffer();
377: char c;
378: boolean gotEscape = false;
379: boolean gotCR = false;
380:
381: for (int i = start; i < end; i++) {
382: c = s.charAt(i);
383: if (c == '\n' && gotCR) {
384: // This LF is part of an unescaped
385: // CRLF sequence (i.e, LWSP). Skip it.
386: gotCR = false;
387: continue;
388: }
389:
390: gotCR = false;
391: if (!gotEscape) {
392: // Previous character was NOT '\'
393: if (c == '\\') // skip this character
394: gotEscape = true;
395: else if (c == '\r') // skip this character
396: gotCR = true;
397: else
398: // append this character
399: sb.append(c);
400: } else {
401: // Previous character was '\'. So no need to
402: // bother with any special processing, just
403: // append this character
404: sb.append(c);
405: gotEscape = false;
406: }
407: }
408: return sb.toString();
409: }
410: }
|