0001: /*
0002: * The contents of this file are subject to the terms
0003: * of the Common Development and Distribution License
0004: * (the "License"). You may not use this file except
0005: * in compliance with the License.
0006: *
0007: * You can obtain a copy of the license at
0008: * https://jwsdp.dev.java.net/CDDLv1.0.html
0009: * See the License for the specific language governing
0010: * permissions and limitations under the License.
0011: *
0012: * When distributing Covered Code, include this CDDL
0013: * HEADER in each file and include the License file at
0014: * https://jwsdp.dev.java.net/CDDLv1.0.html If applicable,
0015: * add the following below this CDDL HEADER, with the
0016: * fields enclosed by brackets "[]" replaced with your
0017: * own identifying information: Portions Copyright [yyyy]
0018: * [name of copyright owner]
0019: */
0020: /*
0021: * @(#)MimeUtility.java 1.45 03/03/10
0022: */
0023:
0024: /*
0025: * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
0026: *
0027: * Copyright 1997-2007 Sun Microsystems, Inc. All rights reserved.
0028: *
0029: * The contents of this file are subject to the terms of either the GNU
0030: * General Public License Version 2 only ("GPL") or the Common Development
0031: * and Distribution License("CDDL") (collectively, the "License"). You
0032: * may not use this file except in compliance with the License. You can obtain
0033: * a copy of the License at https://glassfish.dev.java.net/public/CDDL+GPL.html
0034: * or glassfish/bootstrap/legal/LICENSE.txt. See the License for the specific
0035: * language governing permissions and limitations under the License.
0036: *
0037: * When distributing the software, include this License Header Notice in each
0038: * file and include the License file at glassfish/bootstrap/legal/LICENSE.txt.
0039: * Sun designates this particular file as subject to the "Classpath" exception
0040: * as provided by Sun in the GPL Version 2 section of the License file that
0041: * accompanied this code. If applicable, add the following below the License
0042: * Header, with the fields enclosed by brackets [] replaced by your own
0043: * identifying information: "Portions Copyrighted [year]
0044: * [name of copyright owner]"
0045: *
0046: * Contributor(s):
0047: *
0048: * If you wish your version of this file to be governed by only the CDDL or
0049: * only the GPL Version 2, indicate your decision by adding "[Contributor]
0050: * elects to include this software in this distribution under the [CDDL or GPL
0051: * Version 2] license." If you don't indicate a single choice of license, a
0052: * recipient has the option to distribute your version of this file under
0053: * either the CDDL, the GPL Version 2 or to extend the choice of license to
0054: * its licensees as provided above. However, if you add GPL Version 2 code
0055: * and therefore, elected the GPL Version 2 license, then the option applies
0056: * only if the new code is made subject to such option by the copyright
0057: * holder.
0058: */
0059:
0060: package com.sun.xml.messaging.saaj.packaging.mime.internet;
0061:
0062: import java.io.*;
0063: import java.util.*;
0064:
0065: import javax.activation.DataHandler;
0066: import javax.activation.DataSource;
0067:
0068: import com.sun.xml.messaging.saaj.packaging.mime.MessagingException;
0069: import com.sun.xml.messaging.saaj.packaging.mime.util.*;
0070:
0071: /**
0072: * This is a utility class that provides various MIME related
0073: * functionality. <p>
0074: *
0075: * There are a set of methods to encode and decode MIME headers as
0076: * per RFC 2047. A brief description on handling such headers is
0077: * given below: <p>
0078: *
0079: * RFC 822 mail headers <strong>must</strong> contain only US-ASCII
0080: * characters. Headers that contain non US-ASCII characters must be
0081: * encoded so that they contain only US-ASCII characters. Basically,
0082: * this process involves using either BASE64 or QP to encode certain
0083: * characters. RFC 2047 describes this in detail. <p>
0084: *
0085: * In Java, Strings contain (16 bit) Unicode characters. ASCII is a
0086: * subset of Unicode (and occupies the range 0 - 127). A String
0087: * that contains only ASCII characters is already mail-safe. If the
0088: * String contains non US-ASCII characters, it must be encoded. An
0089: * additional complexity in this step is that since Unicode is not
0090: * yet a widely used charset, one might want to first charset-encode
0091: * the String into another charset and then do the transfer-encoding.
0092: * <p>
0093: * Note that to get the actual bytes of a mail-safe String (say,
0094: * for sending over SMTP), one must do
0095: * <p><blockquote><pre>
0096: *
0097: * byte[] bytes = string.getBytes("iso-8859-1");
0098: *
0099: * </pre></blockquote><p>
0100: *
0101: * The <code>setHeader</code> and <code>addHeader</code> methods
0102: * on MimeMessage and MimeBodyPart assume that the given header values
0103: * are Unicode strings that contain only US-ASCII characters. Hence
0104: * the callers of those methods must ensure that the values they pass
0105: * do not contain non US-ASCII characters. The methods in this class
0106: * help do this. <p>
0107: *
0108: * The <code>getHeader</code> family of methods on MimeMessage and
0109: * MimeBodyPart return the raw header value. These might be encoded
0110: * as per RFC 2047, and if so, must be decoded into Unicode Strings.
0111: * The methods in this class help to do this. <p>
0112: *
0113: * Several System properties control strict conformance to the MIME
0114: * spec. Note that these are not session properties but must be set
0115: * globally as System properties. <p>
0116: *
0117: * The <code>mail.mime.decodetext.strict</code> property controls
0118: * decoding of MIME encoded words. The MIME spec requires that encoded
0119: * words start at the beginning of a whitespace separated word. Some
0120: * mailers incorrectly include encoded words in the middle of a word.
0121: * If the <code>mail.mime.decodetext.strict</code> System property is
0122: * set to <code>"false"</code>, an attempt will be made to decode these
0123: * illegal encoded words. The default is true. <p>
0124: *
0125: * The <code>mail.mime.encodeeol.strict</code> property controls the
0126: * choice of Content-Transfer-Encoding for MIME parts that are not of
0127: * type "text". Often such parts will contain textual data for which
0128: * an encoding that allows normal end of line conventions is appropriate.
0129: * In rare cases, such a part will appear to contain entirely textual
0130: * data, but will require an encoding that preserves CR and LF characters
0131: * without change. If the <code>mail.mime.encodeeol.strict</code>
0132: * System property is set to <code>"true"</code>, such an encoding will
0133: * be used when necessary. The default is false. <p>
0134: *
0135: * In addition, the <code>mail.mime.charset</code> System property can
0136: * be used to specify the default MIME charset to use for encoded words
0137: * and text parts that don't otherwise specify a charset. Normally, the
0138: * default MIME charset is derived from the default Java charset, as
0139: * specified in the <code>file.encoding</code> System property. Most
0140: * applications will have no need to explicitly set the default MIME
0141: * charset. In cases where the default MIME charset to be used for
0142: * mail messages is different than the charset used for files stored on
0143: * the system, this property should be set.
0144: *
0145: * @version 1.45, 03/03/10
0146: * @author John Mani
0147: * @author Bill Shannon
0148: */
0149:
0150: public class MimeUtility {
0151:
0152: // This class cannot be instantiated
private MimeUtility() {
    // deliberately empty: this constructor exists only to block instantiation
}
0155:
/*
 * Sentinel handed to checkAscii by getEncoding(DataSource) as the
 * amount of input to examine.
 * NOTE(review): presumably means "scan the whole stream" - confirm
 * against checkAscii.
 */
public static final int ALL = -1;

// Initial capacity of the scratch buffer doEncode uses for one encoded word.
private static final int BUFFER_SIZE = 1024;

// Behaviour switches. Each one is loaded from the System property named
// beside it by the static initializer below; the value assigned here is
// the default that applies when the property is unset or unreadable.
private static boolean decodeStrict = true;      // mail.mime.decodetext.strict
private static boolean encodeEolStrict = false;  // mail.mime.encodeeol.strict
private static boolean foldEncodedWords = false; // mail.mime.foldencodedwords
private static boolean foldText = true;          // mail.mime.foldtext

static {
    // Every property is looked up on its own so that a SecurityException
    // for one key cannot prevent the remaining keys from being read.
    decodeStrict = booleanSystemProperty(
            "mail.mime.decodetext.strict", true);
    encodeEolStrict = booleanSystemProperty(
            "mail.mime.encodeeol.strict", false);
    foldEncodedWords = booleanSystemProperty(
            "mail.mime.foldencodedwords", false);
    foldText = booleanSystemProperty(
            "mail.mime.foldtext", true);
}

/*
 * Read a boolean System property.
 *
 * A property whose default is true is switched off only by the value
 * "false"; a property whose default is false is switched on only by the
 * value "true" (both compared case-insensitively). Any other value, a
 * missing property, or a SecurityException yields the default.
 */
private static boolean booleanSystemProperty(String name, boolean def) {
    String value;
    try {
        value = System.getProperty(name);
    } catch (SecurityException sex) {
        return def; // not allowed to look: keep the default
    }
    if (value == null)
        return def;
    return def ? !value.equalsIgnoreCase("false")
               : value.equalsIgnoreCase("true");
}
0183:
0184: /**
0185: * Get the content-transfer-encoding that should be applied
0186: * to the input stream of this datasource, to make it mailsafe. <p>
0187: *
0188: * The algorithm used here is: <br>
0189: * <ul>
0190: * <li>
0191: * If the primary type of this datasource is "text" and if all
0192: * the bytes in its input stream are US-ASCII, then the encoding
0193: * is "7bit". If more than half of the bytes are non-US-ASCII, then
0194: * the encoding is "base64". If less than half of the bytes are
0195: * non-US-ASCII, then the encoding is "quoted-printable".
0196: * <li>
0197: * If the primary type of this datasource is not "text", then if
0198: * all the bytes of its input stream are US-ASCII, the encoding
0199: * is "7bit". If there is even one non-US-ASCII character, the
0200: * encoding is "base64".
0201: * </ul>
0202: *
0203: * @param ds DataSource
0204: * @return the encoding. This is either "7bit",
0205: * "quoted-printable" or "base64"
0206: */
0207: public static String getEncoding(DataSource ds) {
0208: ContentType cType = null;
0209: InputStream is = null;
0210: String encoding = null;
0211:
0212: try {
0213: cType = new ContentType(ds.getContentType());
0214: is = ds.getInputStream();
0215: } catch (Exception ex) {
0216: return "base64"; // what else ?!
0217: }
0218:
0219: boolean isText = cType.match("text/*");
0220: // if not text, stop processing when we see non-ASCII
0221: int i = checkAscii(is, ALL, !isText);
0222: switch (i) {
0223: case ALL_ASCII:
0224: encoding = "7bit"; // all ascii
0225: break;
0226: case MOSTLY_ASCII:
0227: encoding = "quoted-printable"; // mostly ascii
0228: break;
0229: default:
0230: encoding = "base64"; // mostly binary
0231: break;
0232: }
0233:
0234: // Close the input stream
0235: try {
0236: is.close();
0237: } catch (IOException ioex) {
0238: }
0239:
0240: return encoding;
0241: }
0242:
0243: /**
0244: * Same as <code>getEncoding(DataSource)</code> except that instead
0245: * of reading the data from an <code>InputStream</code> it uses the
0246: * <code>writeTo</code> method to examine the data. This is more
0247: * efficient in the common case of a <code>DataHandler</code>
0248: * created with an object and a MIME type (for example, a
0249: * "text/plain" String) because all the I/O is done in this
0250: * thread. In the case requiring an <code>InputStream</code> the
0251: * <code>DataHandler</code> uses a thread, a pair of pipe streams,
0252: * and the <code>writeTo</code> method to produce the data. <p>
0253: *
0254: * @since JavaMail 1.2
0255: */
0256: public static String getEncoding(DataHandler dh) {
0257: ContentType cType = null;
0258: String encoding = null;
0259:
0260: /*
0261: * Try to pick the most efficient means of determining the
0262: * encoding. If this DataHandler was created using a DataSource,
0263: * the getEncoding(DataSource) method is typically faster. If
0264: * the DataHandler was created with an object, this method is
0265: * much faster. To distinguish the two cases, we use a heuristic.
0266: * A DataHandler created with an object will always have a null name.
0267: * A DataHandler created with a DataSource will usually have a
0268: * non-null name.
0269: *
0270: * XXX - This is actually quite a disgusting hack, but it makes
0271: * a common case run over twice as fast.
0272: */
0273: if (dh.getName() != null)
0274: return getEncoding(dh.getDataSource());
0275:
0276: try {
0277: cType = new ContentType(dh.getContentType());
0278: } catch (Exception ex) {
0279: return "base64"; // what else ?!
0280: }
0281:
0282: if (cType.match("text/*")) {
0283: // Check all of the available bytes
0284: AsciiOutputStream aos = new AsciiOutputStream(false, false);
0285: try {
0286: dh.writeTo(aos);
0287: } catch (IOException ex) {
0288: } // ignore it
0289: switch (aos.getAscii()) {
0290: case ALL_ASCII:
0291: encoding = "7bit"; // all ascii
0292: break;
0293: case MOSTLY_ASCII:
0294: encoding = "quoted-printable"; // mostly ascii
0295: break;
0296: default:
0297: encoding = "base64"; // mostly binary
0298: break;
0299: }
0300: } else { // not "text"
0301: // Check all of available bytes, break out if we find
0302: // at least one non-US-ASCII character
0303: AsciiOutputStream aos = new AsciiOutputStream(true,
0304: encodeEolStrict);
0305: try {
0306: dh.writeTo(aos);
0307: } catch (IOException ex) {
0308: } // ignore it
0309: if (aos.getAscii() == ALL_ASCII) // all ascii
0310: encoding = "7bit";
0311: else
0312: // found atleast one non-ascii character, use b64
0313: encoding = "base64";
0314: }
0315:
0316: return encoding;
0317: }
0318:
0319: /**
0320: * Decode the given input stream. The Input stream returned is
0321: * the decoded input stream. All the encodings defined in RFC 2045
0322: * are supported here. They include "base64", "quoted-printable",
0323: * "7bit", "8bit", and "binary". In addition, "uuencode" is also
0324: * supported.
0325: *
0326: * @param is input stream
0327: * @param encoding the encoding of the stream.
0328: * @return decoded input stream.
0329: */
0330: public static InputStream decode(InputStream is, String encoding)
0331: throws MessagingException {
0332: if (encoding.equalsIgnoreCase("base64"))
0333: return new BASE64DecoderStream(is);
0334: else if (encoding.equalsIgnoreCase("quoted-printable"))
0335: return new QPDecoderStream(is);
0336: else if (encoding.equalsIgnoreCase("uuencode")
0337: || encoding.equalsIgnoreCase("x-uuencode")
0338: || encoding.equalsIgnoreCase("x-uue"))
0339: return new UUDecoderStream(is);
0340: else if (encoding.equalsIgnoreCase("binary")
0341: || encoding.equalsIgnoreCase("7bit")
0342: || encoding.equalsIgnoreCase("8bit"))
0343: return is;
0344: else
0345: throw new MessagingException("Unknown encoding: "
0346: + encoding);
0347: }
0348:
0349: /**
0350: * Wrap an encoder around the given output stream.
0351: * All the encodings defined in RFC 2045 are supported here.
0352: * They include "base64", "quoted-printable", "7bit", "8bit" and
0353: * "binary". In addition, "uuencode" is also supported.
0354: *
0355: * @param os output stream
0356: * @param encoding the encoding of the stream.
0357: * @return output stream that applies the
0358: * specified encoding.
0359: */
0360: public static OutputStream encode(OutputStream os, String encoding)
0361: throws MessagingException {
0362: if (encoding == null)
0363: return os;
0364: else if (encoding.equalsIgnoreCase("base64"))
0365: return new BASE64EncoderStream(os);
0366: else if (encoding.equalsIgnoreCase("quoted-printable"))
0367: return new QPEncoderStream(os);
0368: else if (encoding.equalsIgnoreCase("uuencode")
0369: || encoding.equalsIgnoreCase("x-uuencode")
0370: || encoding.equalsIgnoreCase("x-uue"))
0371: return new UUEncoderStream(os);
0372: else if (encoding.equalsIgnoreCase("binary")
0373: || encoding.equalsIgnoreCase("7bit")
0374: || encoding.equalsIgnoreCase("8bit"))
0375: return os;
0376: else
0377: throw new MessagingException("Unknown encoding: "
0378: + encoding);
0379: }
0380:
0381: /**
0382: * Wrap an encoder around the given output stream.
0383: * All the encodings defined in RFC 2045 are supported here.
0384: * They include "base64", "quoted-printable", "7bit", "8bit" and
0385: * "binary". In addition, "uuencode" is also supported.
0386: * The <code>filename</code> parameter is used with the "uuencode"
0387: * encoding and is included in the encoded output.
0388: *
0389: * @param os output stream
0390: * @param encoding the encoding of the stream.
0391: * @param filename name for the file being encoded (only used
0392: * with uuencode)
0393: * @return output stream that applies the
0394: * specified encoding.
0395: * @since JavaMail 1.2
0396: */
0397: public static OutputStream encode(OutputStream os, String encoding,
0398: String filename) throws MessagingException {
0399: if (encoding == null)
0400: return os;
0401: else if (encoding.equalsIgnoreCase("base64"))
0402: return new BASE64EncoderStream(os);
0403: else if (encoding.equalsIgnoreCase("quoted-printable"))
0404: return new QPEncoderStream(os);
0405: else if (encoding.equalsIgnoreCase("uuencode")
0406: || encoding.equalsIgnoreCase("x-uuencode")
0407: || encoding.equalsIgnoreCase("x-uue"))
0408: return new UUEncoderStream(os, filename);
0409: else if (encoding.equalsIgnoreCase("binary")
0410: || encoding.equalsIgnoreCase("7bit")
0411: || encoding.equalsIgnoreCase("8bit"))
0412: return os;
0413: else
0414: throw new MessagingException("Unknown encoding: "
0415: + encoding);
0416: }
0417:
0418: /**
0419: * Encode a RFC 822 "text" token into mail-safe form as per
0420: * RFC 2047. <p>
0421: *
0422: * The given Unicode string is examined for non US-ASCII
0423: * characters. If the string contains only US-ASCII characters,
0424: * it is returned as-is. If the string contains non US-ASCII
0425: * characters, it is first character-encoded using the platform's
0426: * default charset, then transfer-encoded using either the B or
0427: * Q encoding. The resulting bytes are then returned as a Unicode
0428: * string containing only ASCII characters. <p>
0429: *
0430: * Note that this method should be used to encode only
0431: * "unstructured" RFC 822 headers. <p>
0432: *
0433: * Example of usage:
0434: * <p><blockquote><pre>
0435: *
0436: * MimeBodyPart part = ...
0437: * String rawvalue = "FooBar Mailer, Japanese version 1.1"
0438: * try {
0439: * // If we know for sure that rawvalue contains only US-ASCII
0440: * // characters, we can skip the encoding part
0441: * part.setHeader("X-mailer", MimeUtility.encodeText(rawvalue));
0442: * } catch (UnsupportedEncodingException e) {
0443: * // encoding failure
0444: * } catch (MessagingException me) {
0445: * // setHeader() failure
0446: * }
0447: *
0448: * </pre></blockquote><p>
0449: *
0450: * @param text unicode string
0451: * @return Unicode string containing only US-ASCII characters
0452: * @exception UnsupportedEncodingException if the encoding fails
0453: */
0454: public static String encodeText(String text)
0455: throws UnsupportedEncodingException {
0456: return encodeText(text, null, null);
0457: }
0458:
0459: /**
0460: * Encode a RFC 822 "text" token into mail-safe form as per
0461: * RFC 2047. <p>
0462: *
0463: * The given Unicode string is examined for non US-ASCII
0464: * characters. If the string contains only US-ASCII characters,
0465: * it is returned as-is. If the string contains non US-ASCII
0466: * characters, it is first character-encoded using the specified
0467: * charset, then transfer-encoded using either the B or Q encoding.
0468: * The resulting bytes are then returned as a Unicode string
0469: * containing only ASCII characters. <p>
0470: *
0471: * Note that this method should be used to encode only
0472: * "unstructured" RFC 822 headers.
0473: *
0474: * @param text the header value
0475: * @param charset the charset. If this parameter is null, the
0476: * platform's default charset is used.
0477: * @param encoding the encoding to be used. Currently supported
0478: * values are "B" and "Q". If this parameter is null, then
0479: * the "Q" encoding is used if most of characters to be
0480: * encoded are in the ASCII charset, otherwise "B" encoding
0481: * is used.
0482: * @return Unicode string containing only US-ASCII characters
0483: */
0484: public static String encodeText(String text, String charset,
0485: String encoding) throws UnsupportedEncodingException {
0486: return encodeWord(text, charset, encoding, false);
0487: }
0488:
0489: /**
0490: * Decode "unstructured" headers, that is, headers that are defined
0491: * as '*text' as per RFC 822. <p>
0492: *
0493: * The string is decoded using the algorithm specified in
0494: * RFC 2047, Section 6.1.1. If the charset-conversion fails
0495: * for any sequence, an UnsupportedEncodingException is thrown.
0496: * If the String is not an RFC 2047 style encoded header, it is
0497: * returned as-is. <p>
0498: *
0499: * Example of usage:
0500: * <p><blockquote><pre>
0501: *
0502: * MimeBodyPart part = ...
0503: * String rawvalue = null;
0504: * String value = null;
0505: * try {
0506: * if ((rawvalue = part.getHeader("X-mailer")[0]) != null)
0507: * value = MimeUtility.decodeText(rawvalue);
0508: * } catch (UnsupportedEncodingException e) {
0509: * // Don't care
0510: * value = rawvalue;
0511: * } catch (MessagingException me) { }
0512: *
0513: * return value;
0514: *
0515: * </pre></blockquote><p>
0516: *
0517: * @param etext the possibly encoded value
0518: * @exception UnsupportedEncodingException if the charset
0519: * conversion failed.
0520: */
0521: public static String decodeText(String etext)
0522: throws UnsupportedEncodingException {
0523: /*
0524: * We look for sequences separated by "linear-white-space".
0525: * (as per RFC 2047, Section 6.1.1)
0526: * RFC 822 defines "linear-white-space" as SPACE | HT | CR | NL.
0527: */
0528: String lwsp = " \t\n\r";
0529: StringTokenizer st;
0530:
0531: /*
0532: * First, lets do a quick run thru the string and check
0533: * whether the sequence "=?" exists at all. If none exists,
0534: * we know there are no encoded-words in here and we can just
0535: * return the string as-is, without suffering thru the later
0536: * decoding logic.
0537: * This handles the most common case of unencoded headers
0538: * efficiently.
0539: */
0540: if (etext.indexOf("=?") == -1)
0541: return etext;
0542:
0543: // Encoded words found. Start decoding ...
0544:
0545: st = new StringTokenizer(etext, lwsp, true);
0546: StringBuffer sb = new StringBuffer(); // decode buffer
0547: StringBuffer wsb = new StringBuffer(); // white space buffer
0548: boolean prevWasEncoded = false;
0549:
0550: while (st.hasMoreTokens()) {
0551: char c;
0552: String s = st.nextToken();
0553: // If whitespace, append it to the whitespace buffer
0554: if (((c = s.charAt(0)) == ' ') || (c == '\t')
0555: || (c == '\r') || (c == '\n'))
0556: wsb.append(c);
0557: else {
0558: // Check if token is an 'encoded-word' ..
0559: String word;
0560: try {
0561: word = decodeWord(s);
0562: // Yes, this IS an 'encoded-word'.
0563: if (!prevWasEncoded && wsb.length() > 0) {
0564: // if the previous word was also encoded, we
0565: // should ignore the collected whitespace. Else
0566: // we include the whitespace as well.
0567: sb.append(wsb);
0568: }
0569: prevWasEncoded = true;
0570: } catch (ParseException pex) {
0571: // This is NOT an 'encoded-word'.
0572: word = s;
0573: // possibly decode inner encoded words
0574: if (!decodeStrict)
0575: word = decodeInnerWords(word);
0576: // include colleced whitespace ..
0577: if (wsb.length() > 0)
0578: sb.append(wsb);
0579: prevWasEncoded = false;
0580: }
0581: sb.append(word); // append the actual word
0582: wsb.setLength(0); // reset wsb for reuse
0583: }
0584: }
0585: return sb.toString();
0586: }
0587:
0588: /**
0589: * Encode a RFC 822 "word" token into mail-safe form as per
0590: * RFC 2047. <p>
0591: *
0592: * The given Unicode string is examined for non US-ASCII
0593: * characters. If the string contains only US-ASCII characters,
0594: * it is returned as-is. If the string contains non US-ASCII
0595: * characters, it is first character-encoded using the platform's
0596: * default charset, then transfer-encoded using either the B or
0597: * Q encoding. The resulting bytes are then returned as a Unicode
0598: * string containing only ASCII characters. <p>
0599: *
0600: * This method is meant to be used when creating RFC 822 "phrases".
0601: * The InternetAddress class, for example, uses this to encode
0602: * its 'phrase' component.
0603: *
0604: * @param word unicode string
0605: * @return Unicode string containing only US-ASCII
0606: * characters.
0607: * @exception UnsupportedEncodingException if the encoding fails
0608: */
0609: public static String encodeWord(String word)
0610: throws UnsupportedEncodingException {
0611: return encodeWord(word, null, null);
0612: }
0613:
0614: /**
0615: * Encode a RFC 822 "word" token into mail-safe form as per
0616: * RFC 2047. <p>
0617: *
0618: * The given Unicode string is examined for non US-ASCII
0619: * characters. If the string contains only US-ASCII characters,
0620: * it is returned as-is. If the string contains non US-ASCII
0621: * characters, it is first character-encoded using the specified
0622: * charset, then transfer-encoded using either the B or Q encoding.
0623: * The resulting bytes are then returned as a Unicode string
0624: * containing only ASCII characters. <p>
0625: *
0626: * @param word unicode string
0627: * @param charset the MIME charset
0628: * @param encoding the encoding to be used. Currently supported
0629: * values are "B" and "Q". If this parameter is null, then
0630: * the "Q" encoding is used if most of characters to be
0631: * encoded are in the ASCII charset, otherwise "B" encoding
0632: * is used.
0633: * @return Unicode string containing only US-ASCII characters
0634: * @exception UnsupportedEncodingException if the encoding fails
0635: */
0636: public static String encodeWord(String word, String charset,
0637: String encoding) throws UnsupportedEncodingException {
0638: return encodeWord(word, charset, encoding, true);
0639: }
0640:
0641: /*
0642: * Encode the given string. The parameter 'encodingWord' should
0643: * be true if a RFC 822 "word" token is being encoded and false if a
0644: * RFC 822 "text" token is being encoded. This is because the
0645: * "Q" encoding defined in RFC 2047 has more restrictions when
0646: * encoding "word" tokens. (Sigh)
0647: */
0648: private static String encodeWord(String string, String charset,
0649: String encoding, boolean encodingWord)
0650: throws UnsupportedEncodingException {
0651:
0652: // If 'string' contains only US-ASCII characters, just
0653: // return it.
0654: int ascii = checkAscii(string);
0655: if (ascii == ALL_ASCII)
0656: return string;
0657:
0658: // Else, apply the specified charset conversion.
0659: String jcharset;
0660: if (charset == null) { // use default charset
0661: jcharset = getDefaultJavaCharset(); // the java charset
0662: charset = getDefaultMIMECharset(); // the MIME equivalent
0663: } else
0664: // MIME charset -> java charset
0665: jcharset = javaCharset(charset);
0666:
0667: // If no transfer-encoding is specified, figure one out.
0668: if (encoding == null) {
0669: if (ascii != MOSTLY_NONASCII)
0670: encoding = "Q";
0671: else
0672: encoding = "B";
0673: }
0674:
0675: boolean b64;
0676: if (encoding.equalsIgnoreCase("B"))
0677: b64 = true;
0678: else if (encoding.equalsIgnoreCase("Q"))
0679: b64 = false;
0680: else
0681: throw new UnsupportedEncodingException(
0682: "Unknown transfer encoding: " + encoding);
0683:
0684: StringBuffer outb = new StringBuffer(); // the output buffer
0685: doEncode(string, b64, jcharset,
0686: // As per RFC 2047, size of an encoded string should not
0687: // exceed 75 bytes.
0688: // 7 = size of "=?", '?', 'B'/'Q', '?', "?="
0689: 75 - 7 - charset.length(), // the available space
0690: "=?" + charset + "?" + encoding + "?", // prefix
0691: true, encodingWord, outb);
0692:
0693: return outb.toString();
0694: }
0695:
0696: private static void doEncode(String string, boolean b64,
0697: String jcharset, int avail, String prefix, boolean first,
0698: boolean encodingWord, StringBuffer buf)
0699: throws UnsupportedEncodingException {
0700:
0701: // First find out what the length of the encoded version of
0702: // 'string' would be.
0703: byte[] bytes = string.getBytes(jcharset);
0704: int len;
0705: if (b64) // "B" encoding
0706: len = BEncoderStream.encodedLength(bytes);
0707: else
0708: // "Q"
0709: len = QEncoderStream.encodedLength(bytes, encodingWord);
0710:
0711: int size;
0712: if ((len > avail) && ((size = string.length()) > 1)) {
0713: // If the length is greater than 'avail', split 'string'
0714: // into two and recurse.
0715: doEncode(string.substring(0, size / 2), b64, jcharset,
0716: avail, prefix, first, encodingWord, buf);
0717: doEncode(string.substring(size / 2, size), b64, jcharset,
0718: avail, prefix, false, encodingWord, buf);
0719: } else {
0720: // length <= than 'avail'. Encode the given string
0721: ByteArrayOutputStream os = new ByteArrayOutputStream(
0722: BUFFER_SIZE);
0723: OutputStream eos; // the encoder
0724: if (b64) // "B" encoding
0725: eos = new BEncoderStream(os);
0726: else
0727: // "Q" encoding
0728: eos = new QEncoderStream(os, encodingWord);
0729:
0730: try { // do the encoding
0731: eos.write(bytes);
0732: eos.close();
0733: } catch (IOException ioex) {
0734: }
0735:
0736: byte[] encodedBytes = os.toByteArray(); // the encoded stuff
0737: // Now write out the encoded (all ASCII) bytes into our
0738: // StringBuffer
0739: if (!first) // not the first line of this sequence
0740: if (foldEncodedWords)
0741: buf.append("\r\n "); // start a continuation line
0742: else
0743: buf.append(" "); // line will be folded later
0744:
0745: buf.append(prefix);
0746: for (int i = 0; i < encodedBytes.length; i++)
0747: buf.append((char) encodedBytes[i]);
0748: buf.append("?="); // terminate the current sequence
0749: }
0750: }
0751:
0752: /**
0753: * The string is parsed using the rules in RFC 2047 for parsing
0754: * an "encoded-word". If the parse fails, a ParseException is
0755: * thrown. Otherwise, it is transfer-decoded, and then
0756: * charset-converted into Unicode. If the charset-conversion
0757: * fails, an UnsupportedEncodingException is thrown.<p>
0758: *
0759: * @param eword the possibly encoded value
0760: * @exception ParseException if the string is not an
0761: * encoded-word as per RFC 2047.
0762: * @exception UnsupportedEncodingException if the charset
0763: * conversion failed.
0764: */
0765: public static String decodeWord(String eword)
0766: throws ParseException, UnsupportedEncodingException {
0767:
0768: if (!eword.startsWith("=?")) // not an encoded word
0769: throw new ParseException();
0770:
0771: // get charset
0772: int start = 2;
0773: int pos;
0774: if ((pos = eword.indexOf('?', start)) == -1)
0775: throw new ParseException();
0776: String charset = javaCharset(eword.substring(start, pos));
0777:
0778: // get encoding
0779: start = pos + 1;
0780: if ((pos = eword.indexOf('?', start)) == -1)
0781: throw new ParseException();
0782: String encoding = eword.substring(start, pos);
0783:
0784: // get encoded-sequence
0785: start = pos + 1;
0786: if ((pos = eword.indexOf("?=", start)) == -1)
0787: throw new ParseException();
0788: String word = eword.substring(start, pos);
0789:
0790: try {
0791: // Extract the bytes from word
0792: ByteArrayInputStream bis = new ByteArrayInputStream(
0793: ASCIIUtility.getBytes(word));
0794:
0795: // Get the appropriate decoder
0796: InputStream is;
0797: if (encoding.equalsIgnoreCase("B"))
0798: is = new BASE64DecoderStream(bis);
0799: else if (encoding.equalsIgnoreCase("Q"))
0800: is = new QDecoderStream(bis);
0801: else
0802: throw new UnsupportedEncodingException(
0803: "unknown encoding: " + encoding);
0804:
0805: // For b64 & q, size of decoded word <= size of word. So
0806: // the decoded bytes must fit into the 'bytes' array. This
0807: // is certainly more efficient than writing bytes into a
0808: // ByteArrayOutputStream and then pulling out the byte[]
0809: // from it.
0810: int count = bis.available();
0811: byte[] bytes = new byte[count];
0812: // count is set to the actual number of decoded bytes
0813: count = is.read(bytes, 0, count);
0814:
0815: // Finally, convert the decoded bytes into a String using
0816: // the specified charset
0817: String s = new String(bytes, 0, count, charset);
0818: if (pos + 2 < eword.length()) {
0819: // there's still more text in the string
0820: String rest = eword.substring(pos + 2);
0821: if (!decodeStrict)
0822: rest = decodeInnerWords(rest);
0823: s += rest;
0824: }
0825: return s;
0826: } catch (UnsupportedEncodingException uex) {
0827: // explicitly catch and rethrow this exception, otherwise
0828: // the below IOException catch will swallow this up!
0829: throw uex;
0830: } catch (IOException ioex) {
0831: // Shouldn't happen.
0832: throw new ParseException();
0833: } catch (IllegalArgumentException iex) {
0834: /* An unknown charset of the form ISO-XXX-XXX, will cause
0835: * the JDK to throw an IllegalArgumentException ... Since the
0836: * JDK will attempt to create a classname using this string,
0837: * but valid classnames must not contain the character '-',
0838: * and this results in an IllegalArgumentException, rather than
0839: * the expected UnsupportedEncodingException. Yikes
0840: */
0841: throw new UnsupportedEncodingException();
0842: }
0843: }
0844:
0845: /**
0846: * Look for encoded words within a word. The MIME spec doesn't
0847: * allow this, but many broken mailers, especially Japanese mailers,
0848: * produce such incorrect encodings.
0849: */
0850: private static String decodeInnerWords(String word)
0851: throws UnsupportedEncodingException {
0852: int start = 0, i;
0853: StringBuffer buf = new StringBuffer();
0854: while ((i = word.indexOf("=?", start)) >= 0) {
0855: buf.append(word.substring(start, i));
0856: int end = word.indexOf("?=", i);
0857: if (end < 0)
0858: break;
0859: String s = word.substring(i, end + 2);
0860: try {
0861: s = decodeWord(s);
0862: } catch (ParseException pex) {
0863: // ignore it, just use the original string
0864: }
0865: buf.append(s);
0866: start = end + 2;
0867: }
0868: if (start == 0)
0869: return word;
0870: if (start < word.length())
0871: buf.append(word.substring(start));
0872: return buf.toString();
0873: }
0874:
0875: /**
0876: * A utility method to quote a word, if the word contains any
0877: * characters from the specified 'specials' list.<p>
0878: *
0879: * The <code>HeaderTokenizer</code> class defines two special
0880: * sets of delimiters - MIME and RFC 822. <p>
0881: *
0882: * This method is typically used during the generation of
0883: * RFC 822 and MIME header fields.
0884: *
0885: * @param word word to be quoted
0886: * @param specials the set of special characters
0887: * @return the possibly quoted word
0888: * @see javax.mail.internet.HeaderTokenizer#MIME
0889: * @see javax.mail.internet.HeaderTokenizer#RFC822
0890: */
0891: public static String quote(String word, String specials) {
0892: int len = word.length();
0893:
0894: /*
0895: * Look for any "bad" characters, Escape and
0896: * quote the entire string if necessary.
0897: */
0898: boolean needQuoting = false;
0899: for (int i = 0; i < len; i++) {
0900: char c = word.charAt(i);
0901: if (c == '"' || c == '\\' || c == '\r' || c == '\n') {
0902: // need to escape them and then quote the whole string
0903: StringBuffer sb = new StringBuffer(len + 3);
0904: sb.append('"');
0905: sb.append(word.substring(0, i));
0906: int lastc = 0;
0907: for (int j = i; j < len; j++) {
0908: char cc = word.charAt(j);
0909: if ((cc == '"') || (cc == '\\') || (cc == '\r')
0910: || (cc == '\n'))
0911: if (cc == '\n' && lastc == '\r')
0912: ; // do nothing, CR was already escaped
0913: else
0914: sb.append('\\'); // Escape the character
0915: sb.append(cc);
0916: lastc = cc;
0917: }
0918: sb.append('"');
0919: return sb.toString();
0920: } else if (c < 040 || c >= 0177 || specials.indexOf(c) >= 0)
0921: // These characters cause the string to be quoted
0922: needQuoting = true;
0923: }
0924:
0925: if (needQuoting) {
0926: StringBuffer sb = new StringBuffer(len + 2);
0927: sb.append('"').append(word).append('"');
0928: return sb.toString();
0929: } else
0930: return word;
0931: }
0932:
0933: /**
0934: * Fold a string at linear whitespace so that each line is no longer
0935: * than 76 characters, if possible. If there are more than 76
0936: * non-whitespace characters consecutively, the string is folded at
0937: * the first whitespace after that sequence. The parameter
0938: * <code>used</code> indicates how many characters have been used in
0939: * the current line; it is usually the length of the header name. <p>
0940: *
0941: * Note that line breaks in the string aren't escaped; they probably
0942: * should be.
0943: *
0944: * @param used characters used in line so far
0945: * @param s the string to fold
0946: * @return the folded string
0947: */
0948: /*public*/static String fold(int used, String s) {
0949: if (!foldText)
0950: return s;
0951:
0952: int end;
0953: char c;
0954: // Strip trailing spaces
0955: for (end = s.length() - 1; end >= 0; end--) {
0956: c = s.charAt(end);
0957: if (c != ' ' && c != '\t')
0958: break;
0959: }
0960: if (end != s.length() - 1)
0961: s = s.substring(0, end + 1);
0962:
0963: // if the string fits now, just return it
0964: if (used + s.length() <= 76)
0965: return s;
0966:
0967: // have to actually fold the string
0968: StringBuffer sb = new StringBuffer(s.length() + 4);
0969: char lastc = 0;
0970: while (used + s.length() > 76) {
0971: int lastspace = -1;
0972: for (int i = 0; i < s.length(); i++) {
0973: if (lastspace != -1 && used + i > 76)
0974: break;
0975: c = s.charAt(i);
0976: if (c == ' ' || c == '\t')
0977: if (!(lastc == ' ' || lastc == '\t'))
0978: lastspace = i;
0979: lastc = c;
0980: }
0981: if (lastspace == -1) {
0982: // no space, use the whole thing
0983: sb.append(s);
0984: s = "";
0985: used = 0;
0986: break;
0987: }
0988: sb.append(s.substring(0, lastspace));
0989: sb.append("\r\n");
0990: lastc = s.charAt(lastspace);
0991: sb.append(lastc);
0992: s = s.substring(lastspace + 1);
0993: used = 1;
0994: }
0995: sb.append(s);
0996: return sb.toString();
0997: }
0998:
0999: /**
1000: * Unfold a folded header. Any line breaks that aren't escaped and
1001: * are followed by whitespace are removed.
1002: *
1003: * @param s the string to unfold
1004: * @return the unfolded string
1005: */
1006: /*public*/static String unfold(String s) {
1007: if (!foldText)
1008: return s;
1009:
1010: StringBuffer sb = null;
1011: int i;
1012: while ((i = indexOfAny(s, "\r\n")) >= 0) {
1013: int start = i;
1014: int l = s.length();
1015: i++; // skip CR or NL
1016: if (i < l && s.charAt(i - 1) == '\r' && s.charAt(i) == '\n')
1017: i++; // skip LF
1018: if (start == 0 || s.charAt(start - 1) != '\\') {
1019: char c;
1020: // if next line starts with whitespace, skip all of it
1021: // XXX - always has to be true?
1022: if (i < l && ((c = s.charAt(i)) == ' ' || c == '\t')) {
1023: i++; // skip whitespace
1024: while (i < l
1025: && ((c = s.charAt(i)) == ' ' || c == '\t'))
1026: i++;
1027: if (sb == null)
1028: sb = new StringBuffer(s.length());
1029: if (start != 0) {
1030: sb.append(s.substring(0, start));
1031: sb.append(' ');
1032: }
1033: s = s.substring(i);
1034: continue;
1035: }
1036: // it's not a continuation line, just leave it in
1037: if (sb == null)
1038: sb = new StringBuffer(s.length());
1039: sb.append(s.substring(0, i));
1040: s = s.substring(i);
1041: } else {
1042: // there's a backslash at "start - 1"
1043: // strip it out, but leave in the line break
1044: if (sb == null)
1045: sb = new StringBuffer(s.length());
1046: sb.append(s.substring(0, start - 1));
1047: sb.append(s.substring(start, i));
1048: s = s.substring(i);
1049: }
1050: }
1051: if (sb != null) {
1052: sb.append(s);
1053: return sb.toString();
1054: } else
1055: return s;
1056: }
1057:
1058: /**
1059: * Return the first index of any of the characters in "any" in "s",
1060: * or -1 if none are found.
1061: *
1062: * This should be a method on String.
1063: */
1064: private static int indexOfAny(String s, String any) {
1065: return indexOfAny(s, any, 0);
1066: }
1067:
1068: private static int indexOfAny(String s, String any, int start) {
1069: try {
1070: int len = s.length();
1071: for (int i = start; i < len; i++) {
1072: if (any.indexOf(s.charAt(i)) >= 0)
1073: return i;
1074: }
1075: return -1;
1076: } catch (StringIndexOutOfBoundsException e) {
1077: return -1;
1078: }
1079: }
1080:
1081: /**
1082: * Convert a MIME charset name into a valid Java charset name. <p>
1083: *
1084: * @param charset the MIME charset name
1085: * @return the Java charset equivalent. If a suitable mapping is
1086: * not available, the passed in charset is itself returned.
1087: */
1088: public static String javaCharset(String charset) {
1089: if (mime2java == null || charset == null)
1090: // no mapping table, or charset parameter is null
1091: return charset;
1092:
1093: String alias = (String) mime2java.get(charset.toLowerCase());
1094: return alias == null ? charset : alias;
1095: }
1096:
1097: /**
1098: * Convert a java charset into its MIME charset name. <p>
1099: *
1100: * Note that a future version of JDK (post 1.2) might provide
1101: * this functionality, in which case, we may deprecate this
1102: * method then.
1103: *
1104: * @param charset the JDK charset
1105: * @return the MIME/IANA equivalent. If a mapping
1106: * is not possible, the passed in charset itself
1107: * is returned.
1108: * @since JavaMail 1.1
1109: */
1110: public static String mimeCharset(String charset) {
1111: if (java2mime == null || charset == null)
1112: // no mapping table or charset param is null
1113: return charset;
1114:
1115: String alias = (String) java2mime.get(charset.toLowerCase());
1116: return alias == null ? charset : alias;
1117: }
1118:
1119: private static String defaultJavaCharset;
1120: private static String defaultMIMECharset;
1121:
1122: /**
1123: * Get the default charset corresponding to the system's current
1124: * default locale. If the System property <code>mail.mime.charset</code>
1125: * is set, a system charset corresponding to this MIME charset will be
1126: * returned. <p>
1127: *
1128: * @return the default charset of the system's default locale,
1129: * as a Java charset. (NOT a MIME charset)
1130: * @since JavaMail 1.1
1131: */
1132: public static String getDefaultJavaCharset() {
1133: if (defaultJavaCharset == null) {
1134: /*
1135: * If mail.mime.charset is set, it controls the default
1136: * Java charset as well.
1137: */
1138: String mimecs = null;
1139: try {
1140: mimecs = System.getProperty("mail.mime.charset");
1141: } catch (SecurityException ex) {
1142: } // ignore it
1143: if (mimecs != null && mimecs.length() > 0) {
1144: defaultJavaCharset = javaCharset(mimecs);
1145: return defaultJavaCharset;
1146: }
1147:
1148: try {
1149: defaultJavaCharset = System.getProperty(
1150: "file.encoding", "8859_1");
1151: } catch (SecurityException sex) {
1152:
1153: class NullInputStream extends InputStream {
1154: public int read() {
1155: return 0;
1156: }
1157: }
1158: InputStreamReader reader = new InputStreamReader(
1159: new NullInputStream());
1160: defaultJavaCharset = reader.getEncoding();
1161: if (defaultJavaCharset == null)
1162: defaultJavaCharset = "8859_1";
1163: }
1164: }
1165:
1166: return defaultJavaCharset;
1167: }
1168:
1169: /*
1170: * Get the default MIME charset for this locale.
1171: */
1172: static String getDefaultMIMECharset() {
1173: if (defaultMIMECharset == null) {
1174: try {
1175: defaultMIMECharset = System
1176: .getProperty("mail.mime.charset");
1177: } catch (SecurityException ex) {
1178: } // ignore it
1179: }
1180: if (defaultMIMECharset == null)
1181: defaultMIMECharset = mimeCharset(getDefaultJavaCharset());
1182: return defaultMIMECharset;
1183: }
1184:
1185: // Tables to map MIME charset names to Java names and vice versa.
1186: // XXX - Should eventually use J2SE 1.4 java.nio.charset.Charset
1187: private static Hashtable mime2java;
1188: private static Hashtable java2mime;
1189:
1190: static {
1191: java2mime = new Hashtable(40);
1192: mime2java = new Hashtable(10);
1193:
1194: try {
1195: // Use this class's classloader to load the mapping file
1196: // XXX - we should use SecuritySupport, but it's in another package
1197: InputStream is = com.sun.xml.messaging.saaj.packaging.mime.internet.MimeUtility.class
1198: .getResourceAsStream("/META-INF/javamail.charset.map");
1199:
1200: if (is != null) {
1201: is = new LineInputStream(is);
1202:
1203: // Load the JDK-to-MIME charset mapping table
1204: loadMappings((LineInputStream) is, java2mime);
1205:
1206: // Load the MIME-to-JDK charset mapping table
1207: loadMappings((LineInputStream) is, mime2java);
1208: }
1209: } catch (Exception ex) {
1210: }
1211:
1212: // If we didn't load the tables, e.g., because we didn't have
1213: // permission, load them manually. The entries here should be
1214: // the same as the default javamail.charset.map.
1215: if (java2mime.isEmpty()) {
1216: java2mime.put("8859_1", "ISO-8859-1");
1217: java2mime.put("iso8859_1", "ISO-8859-1");
1218: java2mime.put("ISO8859-1", "ISO-8859-1");
1219:
1220: java2mime.put("8859_2", "ISO-8859-2");
1221: java2mime.put("iso8859_2", "ISO-8859-2");
1222: java2mime.put("ISO8859-2", "ISO-8859-2");
1223:
1224: java2mime.put("8859_3", "ISO-8859-3");
1225: java2mime.put("iso8859_3", "ISO-8859-3");
1226: java2mime.put("ISO8859-3", "ISO-8859-3");
1227:
1228: java2mime.put("8859_4", "ISO-8859-4");
1229: java2mime.put("iso8859_4", "ISO-8859-4");
1230: java2mime.put("ISO8859-4", "ISO-8859-4");
1231:
1232: java2mime.put("8859_5", "ISO-8859-5");
1233: java2mime.put("iso8859_5", "ISO-8859-5");
1234: java2mime.put("ISO8859-5", "ISO-8859-5");
1235:
1236: java2mime.put("8859_6", "ISO-8859-6");
1237: java2mime.put("iso8859_6", "ISO-8859-6");
1238: java2mime.put("ISO8859-6", "ISO-8859-6");
1239:
1240: java2mime.put("8859_7", "ISO-8859-7");
1241: java2mime.put("iso8859_7", "ISO-8859-7");
1242: java2mime.put("ISO8859-7", "ISO-8859-7");
1243:
1244: java2mime.put("8859_8", "ISO-8859-8");
1245: java2mime.put("iso8859_8", "ISO-8859-8");
1246: java2mime.put("ISO8859-8", "ISO-8859-8");
1247:
1248: java2mime.put("8859_9", "ISO-8859-9");
1249: java2mime.put("iso8859_9", "ISO-8859-9");
1250: java2mime.put("ISO8859-9", "ISO-8859-9");
1251:
1252: java2mime.put("SJIS", "Shift_JIS");
1253: java2mime.put("MS932", "Shift_JIS");
1254: java2mime.put("JIS", "ISO-2022-JP");
1255: java2mime.put("ISO2022JP", "ISO-2022-JP");
1256: java2mime.put("EUC_JP", "euc-jp");
1257: java2mime.put("KOI8_R", "koi8-r");
1258: java2mime.put("EUC_CN", "euc-cn");
1259: java2mime.put("EUC_TW", "euc-tw");
1260: java2mime.put("EUC_KR", "euc-kr");
1261: }
1262: if (mime2java.isEmpty()) {
1263: mime2java.put("iso-2022-cn", "ISO2022CN");
1264: mime2java.put("iso-2022-kr", "ISO2022KR");
1265: mime2java.put("utf-8", "UTF8");
1266: mime2java.put("utf8", "UTF8");
1267: mime2java.put("ja_jp.iso2022-7", "ISO2022JP");
1268: mime2java.put("ja_jp.eucjp", "EUCJIS");
1269: mime2java.put("euc-kr", "KSC5601");
1270: mime2java.put("euckr", "KSC5601");
1271: mime2java.put("us-ascii", "ISO-8859-1");
1272: mime2java.put("x-us-ascii", "ISO-8859-1");
1273: }
1274: }
1275:
1276: private static void loadMappings(LineInputStream is, Hashtable table) {
1277: String currLine;
1278:
1279: while (true) {
1280: try {
1281: currLine = is.readLine();
1282: } catch (IOException ioex) {
1283: break; // error in reading, stop
1284: }
1285:
1286: if (currLine == null) // end of file, stop
1287: break;
1288: if (currLine.startsWith("--") && currLine.endsWith("--"))
1289: // end of this table
1290: break;
1291:
1292: // ignore empty lines and comments
1293: if (currLine.trim().length() == 0
1294: || currLine.startsWith("#"))
1295: continue;
1296:
1297: // A valid entry is of the form <key><separator><value>
1298: // where, <separator> := SPACE | HT. Parse this
1299: StringTokenizer tk = new StringTokenizer(currLine, " \t");
1300: try {
1301: String key = tk.nextToken();
1302: String value = tk.nextToken();
1303: table.put(key.toLowerCase(), value);
1304: } catch (NoSuchElementException nex) {
1305: }
1306: }
1307: }
1308:
1309: static final int ALL_ASCII = 1;
1310: static final int MOSTLY_ASCII = 2;
1311: static final int MOSTLY_NONASCII = 3;
1312:
1313: /**
1314: * Check if the given string contains non US-ASCII characters.
1315: * @param s string
1316: * @return ALL_ASCII if all characters in the string
1317: * belong to the US-ASCII charset. MOSTLY_ASCII
1318: * if more than half of the available characters
1319: * are US-ASCII characters. Else MOSTLY_NONASCII.
1320: */
1321: static int checkAscii(String s) {
1322: int ascii = 0, non_ascii = 0;
1323: int l = s.length();
1324:
1325: for (int i = 0; i < l; i++) {
1326: if (nonascii((int) s.charAt(i))) // non-ascii
1327: non_ascii++;
1328: else
1329: ascii++;
1330: }
1331:
1332: if (non_ascii == 0)
1333: return ALL_ASCII;
1334: if (ascii > non_ascii)
1335: return MOSTLY_ASCII;
1336:
1337: return MOSTLY_NONASCII;
1338: }
1339:
1340: /**
1341: * Check if the given byte array contains non US-ASCII characters.
1342: * @param b byte array
1343: * @return ALL_ASCII if all characters in the string
1344: * belong to the US-ASCII charset. MOSTLY_ASCII
1345: * if more than half of the available characters
1346: * are US-ASCII characters. Else MOSTLY_NONASCII.
1347: *
1348: * XXX - this method is no longer used
1349: */
1350: static int checkAscii(byte[] b) {
1351: int ascii = 0, non_ascii = 0;
1352:
1353: for (int i = 0; i < b.length; i++) {
1354: // The '&' operator automatically causes b[i] to be promoted
1355: // to an int, and we mask out the higher bytes in the int
1356: // so that the resulting value is not a negative integer.
1357: if (nonascii(b[i] & 0xff)) // non-ascii
1358: non_ascii++;
1359: else
1360: ascii++;
1361: }
1362:
1363: if (non_ascii == 0)
1364: return ALL_ASCII;
1365: if (ascii > non_ascii)
1366: return MOSTLY_ASCII;
1367:
1368: return MOSTLY_NONASCII;
1369: }
1370:
1371: /**
1372: * Check if the given input stream contains non US-ASCII characters.
1373: * Upto <code>max</code> bytes are checked. If <code>max</code> is
1374: * set to <code>ALL</code>, then all the bytes available in this
1375: * input stream are checked. If <code>breakOnNonAscii</code> is true
1376: * the check terminates when the first non-US-ASCII character is
1377: * found and MOSTLY_NONASCII is returned. Else, the check continues
1378: * till <code>max</code> bytes or till the end of stream.
1379: *
1380: * @param is the input stream
1381: * @param max maximum bytes to check for. The special value
1382: * ALL indicates that all the bytes in this input
1383: * stream must be checked.
1384: * @param breakOnNonAscii if <code>true</code>, then terminate the
1385: * the check when the first non-US-ASCII character
1386: * is found.
1387: * @return ALL_ASCII if all characters in the string
1388: * belong to the US-ASCII charset. MOSTLY_ASCII
1389: * if more than half of the available characters
1390: * are US-ASCII characters. Else MOSTLY_NONASCII.
1391: */
1392: static int checkAscii(InputStream is, int max,
1393: boolean breakOnNonAscii) {
1394: int ascii = 0, non_ascii = 0;
1395: int len;
1396: int block = 4096;
1397: int linelen = 0;
1398: boolean longLine = false, badEOL = false;
1399: boolean checkEOL = encodeEolStrict && breakOnNonAscii;
1400: byte buf[] = null;
1401: if (max != 0) {
1402: block = (max == ALL) ? 4096 : Math.min(max, 4096);
1403: buf = new byte[block];
1404: }
1405: while (max != 0) {
1406: try {
1407: if ((len = is.read(buf, 0, block)) == -1)
1408: break;
1409: int lastb = 0;
1410: for (int i = 0; i < len; i++) {
1411: // The '&' operator automatically causes b[i] to
1412: // be promoted to an int, and we mask out the higher
1413: // bytes in the int so that the resulting value is
1414: // not a negative integer.
1415: int b = buf[i] & 0xff;
1416: if (checkEOL
1417: && ((lastb == '\r' && b != '\n') || (lastb != '\r' && b == '\n')))
1418: badEOL = true;
1419: if (b == '\r' || b == '\n')
1420: linelen = 0;
1421: else {
1422: linelen++;
1423: if (linelen > 998) // 1000 - CRLF
1424: longLine = true;
1425: }
1426: if (nonascii(b)) { // non-ascii
1427: if (breakOnNonAscii) // we are done
1428: return MOSTLY_NONASCII;
1429: else
1430: non_ascii++;
1431: } else
1432: ascii++;
1433: lastb = b;
1434: }
1435: } catch (IOException ioex) {
1436: break;
1437: }
1438: if (max != ALL)
1439: max -= len;
1440: }
1441:
1442: if (max == 0 && breakOnNonAscii)
1443: // We have been told to break on the first non-ascii character.
1444: // We haven't got any non-ascii character yet, but then we
1445: // have not checked all of the available bytes either. So we
1446: // cannot say for sure that this input stream is ALL_ASCII,
1447: // and hence we must play safe and return MOSTLY_NONASCII
1448:
1449: return MOSTLY_NONASCII;
1450:
1451: if (non_ascii == 0) { // no non-us-ascii characters so far
1452: // If we're looking at non-text data, and we saw CR without LF
1453: // or vice versa, consider this mostly non-ASCII so that it
1454: // will be base64 encoded (since the quoted-printable encoder
1455: // doesn't encode this case properly).
1456: if (badEOL)
1457: return MOSTLY_NONASCII;
1458: // if we've seen a long line, we degrade to mostly ascii
1459: else if (longLine)
1460: return MOSTLY_ASCII;
1461: else
1462: return ALL_ASCII;
1463: }
1464: if (ascii > non_ascii) // mostly ascii
1465: return MOSTLY_ASCII;
1466: return MOSTLY_NONASCII;
1467: }
1468:
1469: static final boolean nonascii(int b) {
1470: return b >= 0177
1471: || (b < 040 && b != '\r' && b != '\n' && b != '\t');
1472: }
1473: }
1474:
1475: /**
1476: * An OutputStream that determines whether the data written to
1477: * it is all ASCII, mostly ASCII, or mostly non-ASCII.
1478: */
1479: class AsciiOutputStream extends OutputStream {
1480: private boolean breakOnNonAscii;
1481: private int ascii = 0, non_ascii = 0;
1482: private int linelen = 0;
1483: private boolean longLine = false;
1484: private boolean badEOL = false;
1485: private boolean checkEOL = false;
1486: private int lastb = 0;
1487: private int ret = 0;
1488:
1489: public AsciiOutputStream(boolean breakOnNonAscii,
1490: boolean encodeEolStrict) {
1491: this .breakOnNonAscii = breakOnNonAscii;
1492: checkEOL = encodeEolStrict && breakOnNonAscii;
1493: }
1494:
1495: public void write(int b) throws IOException {
1496: check(b);
1497: }
1498:
1499: public void write(byte b[]) throws IOException {
1500: write(b, 0, b.length);
1501: }
1502:
1503: public void write(byte b[], int off, int len) throws IOException {
1504: len += off;
1505: for (int i = off; i < len; i++)
1506: check(b[i]);
1507: }
1508:
1509: private final void check(int b) throws IOException {
1510: b &= 0xff;
1511: if (checkEOL
1512: && ((lastb == '\r' && b != '\n') || (lastb != '\r' && b == '\n')))
1513: badEOL = true;
1514: if (b == '\r' || b == '\n')
1515: linelen = 0;
1516: else {
1517: linelen++;
1518: if (linelen > 998) // 1000 - CRLF
1519: longLine = true;
1520: }
1521: if (MimeUtility.nonascii(b)) { // non-ascii
1522: non_ascii++;
1523: if (breakOnNonAscii) { // we are done
1524: ret = MimeUtility.MOSTLY_NONASCII;
1525: throw new EOFException();
1526: }
1527: } else
1528: ascii++;
1529: lastb = b;
1530: }
1531:
1532: /**
1533: * Return ASCII-ness of data stream.
1534: */
1535: public int getAscii() {
1536: if (ret != 0)
1537: return ret;
1538: // If we're looking at non-text data, and we saw CR without LF
1539: // or vice versa, consider this mostly non-ASCII so that it
1540: // will be base64 encoded (since the quoted-printable encoder
1541: // doesn't encode this case properly).
1542: if (badEOL)
1543: return MimeUtility.MOSTLY_NONASCII;
1544: else if (non_ascii == 0) { // no non-us-ascii characters so far
1545: // if we've seen a long line, we degrade to mostly ascii
1546: if (longLine)
1547: return MimeUtility.MOSTLY_ASCII;
1548: else
1549: return MimeUtility.ALL_ASCII;
1550: }
1551: if (ascii > non_ascii) // mostly ascii
1552: return MimeUtility.MOSTLY_ASCII;
1553: return MimeUtility.MOSTLY_NONASCII;
1554: }
1555: }
|