001: // MimeParser.java
002: // $Id: MimeParser.java,v 1.16 2003/02/24 10:31:10 ylafon Exp $
003: // (c) COPYRIGHT MIT and INRIA, 1996.
004: // Please first read the full copyright statement in file COPYRIGHT.html
005:
006: package org.w3c.www.mime;
007:
008: import java.io.BufferedInputStream;
009: import java.io.FileInputStream;
010: import java.io.IOException;
011: import java.io.InputStream;
012: import java.io.OutputStream;
013: import java.io.PrintStream;
014:
015: import org.w3c.www.http.HttpAccept;
016: import org.w3c.www.http.HttpAcceptCharset;
017: import org.w3c.www.http.HttpAcceptLanguage;
018: import org.w3c.www.http.HttpMessage;
019: import org.w3c.www.http.HttpRequestMessage;
020:
021: /**
022: * The MimeParser class parses an input MIME stream.
023: */
024:
025: public class MimeParser {
026: protected int ch = -1;
027: protected InputStream input = null;
028: protected byte buffer[] = new byte[128];
029: protected int bsize = 0;
030:
031: /**
032: * The factory used to create new MIME header holders.
033: */
034: protected MimeParserFactory factory = null;
035:
036: protected void expect(int car) throws MimeParserException,
037: IOException {
038: if (car != ch) {
039: String sc = (new Character((char) car)).toString();
040: String se = (new Character((char) ch)).toString();
041: throw new MimeParserException("expecting " + sc + "(" + car
042: + ")" + " got " + se + "(" + ch + ")\n"
043: + "context: " + new String(buffer, 0, 0, bsize)
044: + "\n");
045: }
046: ch = input.read();
047: }
048:
049: protected void skipSpaces() throws MimeParserException, IOException {
050: while ((ch == ' ') || (ch == '\t'))
051: ch = input.read();
052: }
053:
054: protected final void append(int c) {
055: if (bsize + 1 >= buffer.length) {
056: byte nb[] = new byte[buffer.length * 2];
057: System.arraycopy(buffer, 0, nb, 0, buffer.length);
058: buffer = nb;
059: }
060: buffer[bsize++] = (byte) c;
061: }
062:
063: /*
064: * Get the header name:
065: */
066:
067: protected String parse822HeaderName() throws MimeParserException,
068: IOException {
069: bsize = 0;
070: while ((ch >= 32) && (ch != ':')) {
071: append((char) ch);
072: ch = input.read();
073: }
074: skipSpaces();
075: expect(':');
076: if (bsize <= 0)
077: throw new MimeParserException("expected a header name.");
078: return new String(buffer, 0, 0, bsize);
079: }
080:
081: /*
082: * Get the header body, still trying to be 822 compliant *and* HTTP
083: * robust, which is unfortunately a contradiction.
084: */
085: protected void parse822HeaderBody() throws MimeParserException,
086: IOException {
087: parse822HeaderBody(true);
088: }
089:
090: /*
091: * Get the header body, still trying to be 822 compliant *and* HTTP
092: * robust, which is unfortunately a contradiction.
093: */
094:
095: protected void parse822HeaderBodyLenient()
096: throws MimeParserException, IOException {
097: bsize = 0;
098: skipSpaces();
099: boolean quoted = false;
100: loop: while (true) {
101: switch (ch) {
102: case -1:
103: break loop;
104: case '\r':
105: if ((ch = input.read()) != '\n') {
106: append('\r');
107: continue;
108: }
109: // no break intentional
110: case '\n':
111: // do as if '\r' had been received. This defeats 822, but
112: // makes HTTP more "robust". I wish HTTP were a binary
113: // protocol.
114: switch (ch = input.read()) {
115: case ' ':
116: case '\t':
117: // header continuation, eat LWS then add a SP
118: do {
119: ch = input.read();
120: } while ((ch == ' ') || (ch == '\t'));
121: if ((ch == '\r') || (ch == '\n')) {
122: // empty continuation, restart to check
123: continue;
124: }
125: append(' ');
126: append(ch);
127: break;
128: default:
129: break loop;
130: }
131: break;
132: case '\\':
133: append((char) ch);
134: if (quoted) {
135: ch = input.read();
136: append((char) ch);
137: }
138: break;
139: case '\"':
140: quoted = !quoted;
141: default:
142: append((char) ch);
143: break;
144: }
145: ch = input.read();
146: }
147: return;
148: }
149:
150: /*
151: * Get the header body, still trying to be 822 compliant *and* HTTP
152: * robust, which is unfortunately a contradiction.
153: * @param lenient boolean, true for robustness, false to stricter spec
154: * adherence
155: */
156: protected void parse822HeaderBody(boolean lenient)
157: throws MimeParserException, IOException {
158: if (lenient) {
159: parse822HeaderBodyLenient();
160: } else {
161: parse822HeaderBodyStrict();
162: }
163: }
164:
165: /*
166: * Get the header body, still trying to be 822 compliant *and* HTTP
167: * robust, which is unfortunately a contradiction.
168: *
169: */
170: protected void parse822HeaderBodyStrict()
171: throws MimeParserException, IOException {
172: bsize = 0;
173: skipSpaces();
174: boolean quoted = false;
175: boolean gotr = false;
176: loop: while (true) {
177: switch (ch) {
178: case -1:
179: break loop;
180: case '\r':
181: if ((ch = input.read()) != '\n') {
182: append('\r');
183: continue;
184: }
185: gotr = true;
186: continue;
187: // no break intentional
188: case '\n':
189: if (quoted) {
190: if (gotr) {
191: append('\r');
192: append('\n');
193: break;
194: }
195: throw new MimeParserException("MimeParser: "
196: + "\\n not allowed in " + "quoted string");
197: }
198: // do as if '\r' had been received. This defeats 822, but
199: // makes HTTP more "robust". I wish HTTP were a binary
200: // protocol.
201: if (gotr) {
202: switch (ch = input.read()) {
203: case ' ':
204: case '\t':
205: // header continuation, eat LWS then add a SP
206: do {
207: ch = input.read();
208: } while ((ch == ' ') || (ch == '\t'));
209: if (ch == '\r') {
210: continue;
211: }
212: append(' ');
213: append(ch);
214: gotr = false;
215: break;
216: default:
217: break loop;
218: }
219: } else {
220: append('\n');
221: }
222: break;
223: case '\\':
224: gotr = false;
225: append((char) ch);
226: if (quoted) {
227: ch = input.read();
228: append((char) ch);
229: }
230: break;
231: case '\"':
232: gotr = false;
233: quoted = !quoted;
234: default:
235: if (quoted) {
236: if ((ch < 32) && (ch != '\t')) {
237: throw new MimeParserException("MimeParser: "
238: + "CTRL not allowed in "
239: + "quoted string");
240: }
241: }
242: gotr = false;
243: append((char) ch);
244: break;
245: }
246: ch = input.read();
247: }
248: return;
249: }
250:
251: /*
252: * Parse the given input stream for an HTTP 1.1 token.
253: */
254:
255: protected String parseToken(boolean lower)
256: throws MimeParserException, IOException {
257: bsize = 0;
258: while (true) {
259: switch (ch) {
260: // CTLs
261: case -1:
262: case 0:
263: case 1:
264: case 2:
265: case 3:
266: case 4:
267: case 5:
268: case 6:
269: case 7:
270: case 8:
271: case 9:
272: case 10:
273: case 11:
274: case 12:
275: case 13:
276: case 14:
277: case 15:
278: case 16:
279: case 17:
280: case 18:
281: case 19:
282: case 20:
283: case 21:
284: case 22:
285: case 23:
286: case 24:
287: case 25:
288: case 26:
289: case 27:
290: case 28:
291: case 29:
292: case 30:
293: case 31:
294: // tspecials
295: case '(':
296: case ')':
297: case '<':
298: case '>':
299: case '@':
300: case ',':
301: case ';':
302: case ':':
303: case '\\':
304: case '\"':
305: case '/':
306: case '[':
307: case ']':
308: case '?':
309: case '=':
310: case '{':
311: case '}':
312: case ' ':
313: return new String(buffer, 0, 0, bsize);
314: default:
315: append((char) (lower ? Character.toLowerCase((char) ch)
316: : ch));
317: }
318: ch = input.read();
319: }
320: }
321:
322: protected void parse822Headers(MimeHeaderHolder msg, boolean lenient)
323: throws MimeParserException, IOException {
324: while (true) {
325: if (ch == '\r') {
326: if ((ch = input.read()) == '\n')
327: return;
328: } else if (lenient && (ch == '\n')) {
329: return;
330: }
331: String name = parse822HeaderName();
332: skipSpaces();
333: parse822HeaderBody(lenient);
334: msg.notifyHeader(name, buffer, 0, bsize);
335: }
336: }
337:
338: protected void parse822Headers(MimeHeaderHolder msg)
339: throws MimeParserException, IOException {
340: parse822Headers(msg, true);
341: }
342:
343: /**
344: * parse the stream, and create a MimeHeaderHolder containing all
345: * the parsed headers, note that invalid headers will trigger an exception
346: * in stirct mode, and will just be removed in lenient mode
347: * @param lenient, a boolean, true if we want to be kind with bad people
348: * @return a MimeHeaderHolder instance containing the aprsed headers
349: */
350: public MimeHeaderHolder parse(boolean lenient)
351: throws MimeParserException, IOException {
352: MimeHeaderHolder msg = factory.createHeaderHolder(this );
353: ch = input.read();
354: cached = true;
355: if (!msg.notifyBeginParsing(this )) {
356: if (!cached)
357: ch = input.read();
358: if (lenient) {
359: try {
360: parse822Headers(msg, lenient);
361: } catch (MimeParserException ex) {
362: // be lenient ;)
363: }
364: } else {
365: parse822Headers(msg, lenient);
366: }
367: }
368: msg.notifyEndParsing(this );
369: return msg;
370: }
371:
372: /**
373: * parse the stream, and create a MimeHeaderHolder containing all
374: * the parsed headers, in lenient mode
375: * Always be lenient by default (general rule is: be lenient in what you
376: * accept conservative with what you generate).
377: */
378: public MimeHeaderHolder parse() throws MimeParserException,
379: IOException {
380: return parse(true);
381: }
382:
383: boolean cached = false;
384:
385: public int read() throws IOException {
386: if (cached)
387: cached = false;
388: else
389: ch = input.read();
390: return ch;
391: }
392:
393: public void unread(int ch) {
394: if (cached)
395: throw new RuntimeException("cannot unread more then once !");
396: this .ch = ch;
397: cached = true;
398: }
399:
400: /**
401: * Get the message body, as an input stream.
402: * @return The input stream used by the parser to get data, after
403: * a call to <code>parse</code>, this input stream contains exactly
404: * the body of the message.
405: */
406:
407: public InputStream getInputStream() {
408: return input;
409: }
410:
411: /**
412: * Create an instance of the MIMEParser class.
413: * @param in The input stream to be parsed as a MIME stream.
414: * @param factory The factory used to create MIME header holders.
415: */
416:
417: public MimeParser(InputStream input, MimeParserFactory factory) {
418: this .input = input;
419: this .factory = factory;
420: }
421:
422: /**
423: * Debuging
424: */
425:
426: public static void main(String args[]) {
427: try {
428: String factoryname = args[0];
429: String filename = args[1];
430: // Create the factory:
431: MimeParserFactory f = null;
432: f = (MimeParserFactory) Class.forName(factoryname)
433: .newInstance();
434: // Create the parser:
435: InputStream in = (new BufferedInputStream(
436: new FileInputStream(filename)));
437: MimeParser p = new MimeParser(in, f);
438: HttpRequestMessage m = (HttpRequestMessage) p.parse();
439: HttpAccept a[] = m.getAccept();
440: for (int i = 0; i < a.length; i++) {
441: System.out.println("accept: " + a[i].getMimeType());
442: }
443: HttpAcceptLanguage l[] = m.getAcceptLanguage();
444: for (int i = 0; i < l.length; i++) {
445: System.out
446: .println("accept-lang: " + l[i].getLanguage());
447: }
448: HttpAcceptCharset c[] = m.getAcceptCharset();
449: for (int i = 0; i < c.length; i++) {
450: System.out.println("accept-charset: "
451: + c[i].getCharset());
452: }
453: m.emit(System.out);
454: } catch (Exception ex) {
455: ex.printStackTrace();
456: System.out.println("MimeParser <factory> <file>");
457: }
458: }
459:
460: }
|