001: /*
002: * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
003: *
004: * Copyright 1997-2007 Sun Microsystems, Inc. All rights reserved.
005: *
006: * The contents of this file are subject to the terms of either the GNU
007: * General Public License Version 2 only ("GPL") or the Common
008: * Development and Distribution License("CDDL") (collectively, the
009: * "License"). You may not use this file except in compliance with the
010: * License. You can obtain a copy of the License at
011: * http://www.netbeans.org/cddl-gplv2.html
012: * or nbbuild/licenses/CDDL-GPL-2-CP. See the License for the
013: * specific language governing permissions and limitations under the
014: * License. When distributing the software, include this License Header
015: * Notice in each file and include the License file at
016: * nbbuild/licenses/CDDL-GPL-2-CP. Sun designates this
017: * particular file as subject to the "Classpath" exception as provided
018: * by Sun in the GPL Version 2 section of the License file that
019: * accompanied this code. If applicable, add the following below the
020: * License Header, with the fields enclosed by brackets [] replaced by
021: * your own identifying information:
022: * "Portions Copyrighted [year] [name of copyright owner]"
023: *
024: * Contributor(s):
025: *
026: * The Original Software is NetBeans. The Initial Developer of the Original
027: * Software is Sun Microsystems, Inc. Portions Copyright 1997-2007 Sun
028: * Microsystems, Inc. All Rights Reserved.
029: *
030: * If you wish your version of this file to be governed by only the CDDL
031: * or only the GPL Version 2, indicate your decision by adding
032: * "[Contributor] elects to include this software in this distribution
033: * under the [CDDL or GPL Version 2] license." If you do not indicate a
034: * single choice of license, a recipient has the option to distribute
035: * your version of this file under either the CDDL, the GPL Version 2 or
036: * to extend the choice of license to its licensees as provided above.
037: * However, if you add GPL Version 2 code and therefore, elected the GPL
038: * Version 2 license, then the option applies only if the new code is
039: * made subject to such option by the copyright holder.
040: */
041: package org.netbeans.modules.visualweb.insync;
042:
043: import java.io.*;
044: import javax.swing.text.*;
045:
046: /**
047: * XML uses inband encoding detection - this class obtains it.
048: *
049: * @author Petr Kuzel
050: * @version 1.0
051: */
052: public class EncodingHelper extends Object {
053:
054: // heuristic constant guessing max prolog length
055: private static final int EXPECTED_PROLOG_LENGTH = 1000;
056:
057: /** Detect input stream encoding.
058: * The stream stays intact.
059: * @return java encoding names ("UTF8", "ASCII", etc.) or null
060: * if the stream is not markable or enoding cannot be detected.
061: */
062: public static String detectEncoding(InputStream in)
063: throws IOException {
064:
065: if (!in.markSupported()) {
066: //if ( Util.THIS.isLoggable() ) /* then */ Util.THIS.debug("EncodingHelper got unmarkable stream: " + in.getClass()); // NOI18N
067: return null;
068: }
069:
070: try {
071: in.mark(EXPECTED_PROLOG_LENGTH);
072:
073: byte[] bytes = new byte[EXPECTED_PROLOG_LENGTH];
074: for (int i = 0; i < bytes.length; i++) {
075: try {
076: int datum = in.read();
077: if (datum == -1)
078: break;
079: bytes[i] = (byte) datum;
080: } catch (EOFException ex) {
081: }
082: }
083:
084: String enc = autoDetectEncoding(bytes);
085: if (enc == null)
086: return null;
087:
088: enc = detectDeclaredEncoding(bytes, enc);
089: if (enc == null)
090: return null;
091:
092: return Convertors.iana2java(enc);
093: } finally {
094: in.reset();
095: }
096: }
097:
098: /**
099: * @return Java encoding family identifier or <tt>null</tt> for unrecognized
100: */
101: static String autoDetectEncoding(byte[] buf) throws IOException {
102:
103: if (buf.length >= 4) {
104: switch (buf[0]) {
105: case 0:
106: // byte order mark of (1234-big endian) or (2143) USC-4
107: // or '<' encoded as UCS-4 (1234, 2143, 3412) or UTF-16BE
108: if (buf[1] == (byte) 0x3c && buf[2] == (byte) 0x00
109: && buf[3] == (byte) 0x3f) {
110: return "UnicodeBigUnmarked";
111: }
112: // else it's probably UCS-4
113: break;
114:
115: case 0x3c:
116: switch (buf[1]) {
117: // First character is '<'; could be XML without
118: // an XML directive such as "<hello>", "<!-- ...", // NOI18N
119: // and so on.
120:
121: // 3c 00 3f 00 UTF-16 little endian
122: case 0x00:
123: if (buf[2] == (byte) 0x3f && buf[3] == (byte) 0x00) {
124: return "UnicodeLittleUnmarked";
125: }
126: break;
127:
128: // 3c 3f 78 6d == ASCII and supersets '<?xm'
129: case '?':
130: if (buf[2] == 'x' && buf[3] == 'm') {
131: return "UTF8"; // NOI18N
132: }
133: break;
134: }
135: break;
136:
137: // 4c 6f a7 94 ... some EBCDIC code page
138: case 0x4c:
139: if (buf[1] == (byte) 0x6f && buf[2] == (byte) 0xa7
140: && buf[3] == (byte) 0x94) {
141: return "Cp037"; // NOI18N
142: }
143: break;
144:
145: // UTF-16 big-endian marked
146: case (byte) 0xfe:
147: if (buf[1] == (byte) 0xff
148: && (buf[2] != 0 || buf[3] != 0)) {
149: return "UnicodeBig"; // NOI18N
150: }
151: break;
152:
153: // UTF-16 little-endian marked
154: case (byte) 0xff:
155: if (buf[1] == (byte) 0xfe
156: && (buf[2] != 0 || buf[3] != 0)) {
157: return "UnicodeLittle"; // NOI18N
158: }
159: break;
160:
161: // UTF-8 byte order mark
162: case (byte) 0xef:
163: if (buf[1] == (byte) 0xbb && buf[2] == (byte) 0xbf) {
164: return "UTF8"; //NOI18N
165: }
166: break;
167:
168: }
169: }
170:
171: return null;
172: }
173:
174: /**
175: * Look for encoding='' anyway stop at <tt>?></tt>
176: * @return found encoding or null if none declared
177: */
178: static String detectDeclaredEncoding(byte[] data,
179: String baseEncoding) throws IOException {
180:
181: StringBuffer buf = new StringBuffer();
182: Reader r;
183: char delimiter = '"';
184:
185: r = new InputStreamReader(new ByteArrayInputStream(data),
186: baseEncoding);
187: try {
188: for (int c = r.read(); c != -1; c = r.read()) {
189: buf.append((char) c);
190: }
191: } catch (IOException ex) {
192: // EOF of data out of boundary
193: // dont care try to guess from given data
194: }
195:
196: String s = buf.toString();
197:
198: int iend = s.indexOf("?>");
199: iend = iend == -1 ? s.length() : iend;
200:
201: int iestart = s.indexOf("encoding");
202: if (iestart == -1 || iestart > iend)
203: return null;
204:
205: char[] chars = s.toCharArray();
206:
207: int i = iestart;
208:
209: for (; i < iend; i++) {
210: if (chars[i] == '=')
211: break;
212: }
213:
214: for (; i < iend; i++) {
215: if (chars[i] == '\'' || chars[i] == '"') {
216: delimiter = chars[i];
217: break;
218: }
219:
220: }
221:
222: i++;
223:
224: int ivalstart = i;
225: for (; i < iend; i++) {
226: if (chars[i] == delimiter) {
227: return new String(chars, ivalstart, i - ivalstart);
228: }
229: }
230:
231: return null;
232: }
233:
234: /**
235: * Parse MIME content type for attributes.
236: */
237: static String parseMIMECharSet(String mime) {
238:
239: final String CHARSET = "charset";
240:
241: if (mime != null) {
242: int i;
243:
244: mime = mime.toLowerCase();
245: i = mime.indexOf(';');
246: if (i != -1) {
247: String attributes;
248:
249: attributes = mime.substring(i + 1);
250: mime = mime.substring(0, i);
251:
252: // use "charset=..." if it's available // NOI18N
253: i = attributes.indexOf(CHARSET); // NOI18N
254: if (i != -1) {
255: attributes = attributes.substring(i
256: + CHARSET.length());
257: // strip out subsequent attributes
258: if ((i = attributes.indexOf(';')) != -1)
259: attributes = attributes.substring(0, i);
260: // find start of value
261: if ((i = attributes.indexOf('=')) != -1) {
262: attributes = attributes.substring(i + 1);
263: // strip out rfc822 comments
264: if ((i = attributes.indexOf('(')) != -1)
265: attributes = attributes.substring(0, i);
266: // double quotes are optional
267: if ((i = attributes.indexOf('"')) != -1) {
268: attributes = attributes.substring(i + 1);
269: attributes = attributes.substring(0,
270: attributes.indexOf('"'));
271: }
272: return attributes.trim();
273: // XXX "\;", "\)" etc were mishandled above // NOI18N
274: }
275: }
276: }
277: }
278:
279: return null;
280: }
281:
282: /** Document itself is encoded as Unicode, but in
283: * the document prolog is an encoding attribute.
284: * @return java encoding names ("UTF8", "ASCII", etc.) or null if no guess
285: */
286: public static String detectEncoding(Document doc)
287: throws IOException {
288:
289: if (doc == null)
290: return null;
291:
292: try {
293:
294: String text = doc
295: .getText(
296: 0,
297: doc.getLength() > EXPECTED_PROLOG_LENGTH ? EXPECTED_PROLOG_LENGTH
298: : doc.getLength());
299: InputStream in = new ByteArrayInputStream(text.getBytes());
300: return detectEncoding(in);
301:
302: } catch (BadLocationException ex) {
303: throw new RuntimeException(ex.toString());
304: }
305:
306: }
307:
308: }
|