001: // Copyright (C) 1998-2001 by Jason Hunter <jhunter_AT_acm_DOT_org>.
002: // All rights reserved. Use of this class is limited.
003: // Please see the LICENSE for more information.
004:
005: package com.oreilly.servlet.multipart;
006:
007: import java.io.IOException;
008: import java.util.Enumeration;
009: import java.util.Vector;
010:
011: import javax.servlet.http.HttpServletRequest;
012: import javax.servlet.ServletInputStream;
013:
014: /**
015: * A utility class to handle <code>multipart/form-data</code> requests,
016: * the kind of requests that support file uploads. This class uses a
017: * "pull" model where the reading of incoming files and parameters is
018: * controlled by the client code, which allows incoming files to be stored
019: * into any <code>OutputStream</code>. If you wish to use an API which
020: * resembles <code>HttpServletRequest</code>, use the "push" model
021: * <code>MultipartRequest</code> instead. It's an easy-to-use wrapper
022: * around this class.
023: * <p>
024: * This class can receive arbitrarily large files (up to an artificial limit
025: * you can set), and fairly efficiently too.
026: * It cannot handle nested data (multipart content within multipart content).
027: * It <b>can</b> now with the latest release handle internationalized content
028: * (such as non Latin-1 filenames).
029: * <p>
030: * It also optionally includes enhanced buffering and Content-Length
031: * limitation. Buffering is only required if your servlet container is
032: * poorly implemented (many are, including Tomcat 3.2),
033: * but it is generally recommended because it will make a slow servlet
034: * container a lot faster, and will only make a fast servlet container a
035: * little slower. Content-Length limiting is usually only required if you find
036: * that your servlet is hanging trying to read the input stram from the POST,
037: * and it is similarly recommended because it only has a minimal impact on
038: * performance.
039: * <p>
040: * See the included upload.war for an example of how to use this class.
041: * <p>
042: * The full file upload specification is contained in experimental RFC 1867,
043: * available at <a href="http://www.ietf.org/rfc/rfc1867.txt">
044: * http://www.ietf.org/rfc/rfc1867.txt</a>.
045: *
046: * @see com.oreilly.servlet.MultipartRequest
047: *
048: * @author Jason Hunter
049: * @author Geoff Soutter
050: * @version 1.11, 2002/11/01, added constructor that takes an encoding, to
051: * make sure chars are always read correctly
052: * @version 1.10, 2002/11/01, added support for a preamble before the first
053: * boundary marker
054: * @version 1.9, 2002/11/01, added support to parse odd Opera Content-Type
055: * @version 1.8, 2002/11/01, added support for lynx with unquoted param vals
056: * @version 1.7, 2002/04/30, fixed bug if a line was '\n' alone
057: * @version 1.6, 2002/04/30, added better internationalization support, thanks
058: * to Changshin Lee
059: * @version 1.5, 2002/04/30, added Opera header fix, thanks to Nic Ferrier
060: * @version 1.4, 2001/03/23, added IE5 bug workaround supporting \n as line
061: * ending, thanks to Michael Alyn Miller
062: * @version 1.3, 2001/01/22, added support for boundaries surrounded by quotes
063: * and content-disposition after content-type,
064: * thanks to Scott Stark
065: * @version 1.2, 2001/01/22, getFilePath() support thanks to Stefan Eissing
066: * @version 1.1, 2000/10/29, integrating old WebSphere fix
067: * @version 1.0, 2000/10/27, initial revision
068: */
069: public class MultipartParser {
070:
071: /** input stream to read parts from */
072: private ServletInputStream in;
073:
074: /** MIME boundary that delimits parts */
075: private String boundary;
076:
077: /** reference to the last file part we returned */
078: private FilePart lastFilePart;
079:
080: /** buffer for readLine method */
081: private byte[] buf = new byte[8 * 1024];
082:
083: /** default encoding */
084: private static String DEFAULT_ENCODING = "ISO-8859-1";
085:
086: /** preferred encoding */
087: private String encoding = DEFAULT_ENCODING;
088:
089: /**
090: * Creates a <code>MultipartParser</code> from the specified request,
091: * which limits the upload size to the specified length, buffers for
092: * performance and prevent attempts to read past the amount specified
093: * by the Content-Length.
094: *
095: * @param req the servlet request.
096: * @param maxSize the maximum size of the POST content.
097: */
098: public MultipartParser(HttpServletRequest req, int maxSize)
099: throws IOException {
100: this (req, maxSize, true, true);
101: }
102:
103: /**
104: * Creates a <code>MultipartParser</code> from the specified request,
105: * which limits the upload size to the specified length, and optionally
106: * buffers for performance and prevents attempts to read past the amount
107: * specified by the Content-Length.
108: *
109: * @param req the servlet request.
110: * @param maxSize the maximum size of the POST content.
111: * @param buffer whether to do internal buffering or let the server buffer,
112: * useful for servers that don't buffer
113: * @param limitLength boolean flag to indicate if we need to filter
114: * the request's input stream to prevent trying to
115: * read past the end of the stream.
116: */
117: public MultipartParser(HttpServletRequest req, int maxSize,
118: boolean buffer, boolean limitLength) throws IOException {
119: this (req, maxSize, buffer, limitLength, null);
120: }
121:
122: /**
123: * Creates a <code>MultipartParser</code> from the specified request,
124: * which limits the upload size to the specified length, and optionally
125: * buffers for performance and prevents attempts to read past the amount
126: * specified by the Content-Length, and with a specified encoding.
127: *
128: * @param req the servlet request.
129: * @param maxSize the maximum size of the POST content.
130: * @param buffer whether to do internal buffering or let the server buffer,
131: * useful for servers that don't buffer
132: * @param limitLength boolean flag to indicate if we need to filter
133: * the request's input stream to prevent trying to
134: * read past the end of the stream.
135: * @param encoding the encoding to use for parsing, default is ISO-8859-1.
136: */
137: public MultipartParser(HttpServletRequest req, int maxSize,
138: boolean buffer, boolean limitLength, String encoding)
139: throws IOException {
140: // First make sure we know the encoding to handle chars correctly.
141: // Thanks to Andreas Granzer, andreas.granzer@wave-solutions.com,
142: // for pointing out the need to have this in the constructor.
143: if (encoding != null) {
144: setEncoding(encoding);
145: }
146:
147: // Check the content type to make sure it's "multipart/form-data"
148: // Access header two ways to work around WebSphere oddities
149: String type = null;
150: String type1 = req.getHeader("Content-Type");
151: String type2 = req.getContentType();
152: // If one value is null, choose the other value
153: if (type1 == null && type2 != null) {
154: type = type2;
155: } else if (type2 == null && type1 != null) {
156: type = type1;
157: }
158: // If neither value is null, choose the longer value
159: else if (type1 != null && type2 != null) {
160: type = (type1.length() > type2.length() ? type1 : type2);
161: }
162:
163: if (type == null
164: || !type.toLowerCase()
165: .startsWith("multipart/form-data")) {
166: throw new IOException(
167: "Posted content type isn't multipart/form-data");
168: }
169:
170: // Check the content length to prevent denial of service attacks
171: int length = req.getContentLength();
172: if (length > maxSize) {
173: throw new IOException("Posted content length of " + length
174: + " exceeds limit of " + maxSize);
175: }
176:
177: // Get the boundary string; it's included in the content type.
178: // Should look something like "------------------------12012133613061"
179: String boundary = extractBoundary(type);
180: if (boundary == null) {
181: throw new IOException(
182: "Separation boundary was not specified");
183: }
184:
185: ServletInputStream in = req.getInputStream();
186:
187: // If required, wrap the real input stream with classes that
188: // "enhance" its behaviour for performance and stability
189: if (buffer) {
190: in = new BufferedServletInputStream(in);
191: }
192: if (limitLength) {
193: in = new LimitedServletInputStream(in, length);
194: }
195:
196: // Save our values for later
197: this .in = in;
198: this .boundary = boundary;
199:
200: // Read until we hit the boundary
201: // Some clients send a preamble (per RFC 2046), so ignore that
202: // Thanks to Ben Johnson, ben.johnson@merrillcorp.com, for pointing out
203: // the need for preamble support.
204: do {
205: String line = readLine();
206: if (line == null) {
207: throw new IOException(
208: "Corrupt form data: premature ending");
209: }
210: // See if this line is the boundary, and if so break
211: if (line.startsWith(boundary)) {
212: break; // success
213: }
214: } while (true);
215: }
216:
217: /**
218: * Sets the encoding used to parse from here onward. The default is
219: * ISO-8859-1. Encodings are actually best passed into the contructor,
220: * so even the initial line reads are correct.
221: *
222: * @param encoding The encoding to use for parsing
223: */
224: public void setEncoding(String encoding) {
225: this .encoding = encoding;
226: }
227:
228: /**
229: * Read the next part arriving in the stream. Will be either a
230: * <code>FilePart</code> or a <code>ParamPart</code>, or <code>null</code>
231: * to indicate there are no more parts to read. The order of arrival
232: * corresponds to the order of the form elements in the submitted form.
233: *
234: * @return either a <code>FilePart</code>, a <code>ParamPart</code> or
235: * <code>null</code> if there are no more parts to read.
236: * @exception IOException if an input or output exception has occurred.
237: *
238: * @see FilePart
239: * @see ParamPart
240: */
241: public Part readNextPart() throws IOException {
242: // Make sure the last file was entirely read from the input
243: if (lastFilePart != null) {
244: lastFilePart.getInputStream().close();
245: lastFilePart = null;
246: }
247:
248: // Read the headers; they look like this (not all may be present):
249: // Content-Disposition: form-data; name="field1"; filename="file1.txt"
250: // Content-Type: type/subtype
251: // Content-Transfer-Encoding: binary
252: Vector headers = new Vector();
253:
254: String line = readLine();
255: if (line == null) {
256: // No parts left, we're done
257: return null;
258: }
259: else if (line.length() == 0) {
260: // IE4 on Mac sends an empty line at the end; treat that as the end.
261: // Thanks to Daniel Lemire and Henri Tourigny for this fix.
262: return null;
263: }
264:
265: // Read the following header lines we hit an empty line
266: // A line starting with whitespace is considered a continuation;
267: // that requires a little special logic. Thanks to Nic Ferrier for
268: // identifying a good fix.
269: while (line != null && line.length() > 0) {
270: String nextLine = null;
271: boolean getNextLine = true;
272: while (getNextLine) {
273: nextLine = readLine();
274: if (nextLine != null
275: && (nextLine.startsWith(" ")
276: || nextLine.startsWith("\t"))) {
277: line = line + nextLine;
278: }
279: else {
280: getNextLine = false;
281: }
282: }
283: // Add the line to the header list
284: headers.addElement(line);
285: line = nextLine;
286: }
287:
288: // If we got a null above, it's the end
289: if (line == null) {
290: return null;
291: }
292:
293: String name = null;
294: String filename = null;
295: String origname = null;
296: String contentType = "text/plain"; // rfc1867 says this is the default
297:
298: Enumeration enum = headers.elements();
299: while (enum.hasMoreElements()) {
300: String headerline = (String) enum.nextElement();
301: if (headerline.toLowerCase().startsWith("content-disposition:")) {
302: // Parse the content-disposition line
303: String[] dispInfo = extractDispositionInfo(headerline);
304: // String disposition = dispInfo[0]; // not currently used
305: name = dispInfo[1];
306: filename = dispInfo[2];
307: origname = dispInfo[3];
308: }
309: else if (headerline.toLowerCase().startsWith("content-type:")) {
310: // Get the content type, or null if none specified
311: String type = extractContentType(headerline);
312: if (type != null) {
313: contentType = type;
314: }
315: }
316: }
317:
318: // Now, finally, we read the content (end after reading the boundary)
319: if (filename == null) {
320: // This is a parameter, add it to the vector of values
321: // The encoding is needed to help parse the value
322: return new ParamPart(name, in, boundary, encoding);
323: }
324: else {
325: // This is a file
326: if (filename.equals("")) {
327: filename = null; // empty filename, probably an "empty" file param
328: }
329: lastFilePart = new FilePart(name, in, boundary,
330: contentType, filename, origname);
331: return lastFilePart;
332: }
333: }
334:
335: /**
336: * Extracts and returns the boundary token from a line.
337: *
338: * @return the boundary token.
339: */
340: private String extractBoundary(String line) {
341: // Use lastIndexOf() because IE 4.01 on Win98 has been known to send the
342: // "boundary=" string multiple times. Thanks to David Wall for this fix.
343: int index = line.lastIndexOf("boundary=");
344: if (index == -1) {
345: return null;
346: }
347: String boundary = line.substring(index + 9); // 9 for "boundary="
348: if (boundary.charAt(0) == '"') {
349: // The boundary is enclosed in quotes, strip them
350: index = boundary.lastIndexOf('"');
351: boundary = boundary.substring(1, index);
352: }
353:
354: // The real boundary is always preceeded by an extra "--"
355: boundary = "--" + boundary;
356:
357: return boundary;
358: }
359:
360: /**
361: * Extracts and returns disposition info from a line, as a <code>String<code>
362: * array with elements: disposition, name, filename.
363: *
364: * @return String[] of elements: disposition, name, filename.
365: * @exception IOException if the line is malformatted.
366: */
367: private String[] extractDispositionInfo(String line)
368: throws IOException {
369: // Return the line's data as an array: disposition, name, filename
370: String[] retval = new String[4];
371:
372: // Convert the line to a lowercase string without the ending \r\n
373: // Keep the original line for error messages and for variable names.
374: String origline = line;
375: line = origline.toLowerCase();
376:
377: // Get the content disposition, should be "form-data"
378: int start = line.indexOf("content-disposition: ");
379: int end = line.indexOf(";");
380: if (start == -1 || end == -1) {
381: throw new IOException("Content disposition corrupt: "
382: + origline);
383: }
384: String disposition = line.substring(start + 21, end);
385: if (!disposition.equals("form-data")) {
386: throw new IOException("Invalid content disposition: "
387: + disposition);
388: }
389:
390: // Get the field name
391: start = line.indexOf("name=\"", end); // start at last semicolon
392: end = line.indexOf("\"", start + 7); // skip name=\"
393: int startOffset = 6;
394: if (start == -1 || end == -1) {
395: // Some browsers like lynx don't surround with ""
396: // Thanks to Deon van der Merwe, dvdm@truteq.co.za, for noticing
397: start = line.indexOf("name=", end);
398: end = line.indexOf(";", start + 6);
399: if (start == -1) {
400: throw new IOException("Content disposition corrupt: "
401: + origline);
402: } else if (end == -1) {
403: end = line.length();
404: }
405: startOffset = 5; // without quotes we have one fewer char to skip
406: }
407: String name = origline.substring(start + startOffset, end);
408:
409: // Get the filename, if given
410: String filename = null;
411: String origname = null;
412: start = line.indexOf("filename=\"", end + 2); // start after name
413: end = line.indexOf("\"", start + 10); // skip filename=\"
414: if (start != -1 && end != -1) { // note the !=
415: filename = origline.substring(start + 10, end);
416: origname = filename;
417: // The filename may contain a full path. Cut to just the filename.
418: int slash = Math.max(filename.lastIndexOf('/'), filename
419: .lastIndexOf('\\'));
420: if (slash > -1) {
421: filename = filename.substring(slash + 1); // past last slash
422: }
423: }
424:
425: // Return a String array: disposition, name, filename
426: // empty filename denotes no file posted!
427: retval[0] = disposition;
428: retval[1] = name;
429: retval[2] = filename;
430: retval[3] = origname;
431: return retval;
432: }
433:
434: /**
435: * Extracts and returns the content type from a line, or null if the
436: * line was empty.
437: *
438: * @return content type, or null if line was empty.
439: * @exception IOException if the line is malformatted.
440: */
441: private static String extractContentType(String line)
442: throws IOException {
443: // Convert the line to a lowercase string
444: line = line.toLowerCase();
445:
446: // Get the content type, if any
447: // Note that Opera at least puts extra info after the type, so handle
448: // that. For example: Content-Type: text/plain; name="foo"
449: // Thanks to Leon Poyyayil, leon.poyyayil@trivadis.com, for noticing this.
450: int end = line.indexOf(";");
451: if (end == -1) {
452: end = line.length();
453: }
454:
455: return line.substring(13, end).trim(); // "content-type:" is 13
456: }
457:
458: /**
459: * Read the next line of input.
460: *
461: * @return a String containing the next line of input from the stream,
462: * or null to indicate the end of the stream.
463: * @exception IOException if an input or output exception has occurred.
464: */
465: private String readLine() throws IOException {
466: StringBuffer sbuf = new StringBuffer();
467: int result;
468: String line;
469:
470: do {
471: result = in.readLine(buf, 0, buf.length); // does +=
472: if (result != -1) {
473: sbuf.append(new String(buf, 0, result, encoding));
474: }
475: } while (result == buf.length); // loop only if the buffer was filled
476:
477: if (sbuf.length() == 0) {
478: return null; // nothing read, must be at the end of stream
479: }
480:
481: // Cut off the trailing \n or \r\n
482: // It should always be \r\n but IE5 sometimes does just \n
483: // Thanks to Luke Blaikie for helping make this work with \n
484: int len = sbuf.length();
485: if (len >= 2 && sbuf.charAt(len - 2) == '\r') {
486: sbuf.setLength(len - 2); // cut \r\n
487: } else if (len >= 1 && sbuf.charAt(len - 1) == '\n') {
488: sbuf.setLength(len - 1); // cut \n
489: }
490: return sbuf.toString();
491: }
492: }
|