001: /***** BEGIN LICENSE BLOCK *****
002: * Version: CPL 1.0/GPL 2.0/LGPL 2.1
003: *
004: * The contents of this file are subject to the Common Public
005: * License Version 1.0 (the "License"); you may not use this file
006: * except in compliance with the License. You may obtain a copy of
007: * the License at http://www.eclipse.org/legal/cpl-v10.html
008: *
009: * Software distributed under the License is distributed on an "AS
010: * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
011: * implied. See the License for the specific language governing
012: * rights and limitations under the License.
013: *
014: * Copyright (C) 2007 Koichiro Ohba <koichiro@meadowy.org>
015: *
016: * Alternatively, the contents of this file may be used under the terms of
017: * either of the GNU General Public License Version 2 or later (the "GPL"),
018: * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
019: * in which case the provisions of the GPL or the LGPL are applicable instead
020: * of those above. If you wish to allow use of your version of this file only
021: * under the terms of either the GPL or the LGPL, and not to allow others to
022: * use your version of this file under the terms of the CPL, indicate your
023: * decision by deleting the provisions above and replace them with the notice
024: * and other provisions required by the GPL or the LGPL. If you do not delete
025: * the provisions above, a recipient may use your version of this file under
026: * the terms of any one of the CPL, the GPL or the LGPL.
027: ***** END LICENSE BLOCK *****/package org.jruby;
028:
029: import java.nio.ByteBuffer;
030: import java.nio.CharBuffer;
031: import java.nio.charset.CharacterCodingException;
032: import java.nio.charset.Charset;
033: import java.nio.charset.CharsetDecoder;
034: import java.nio.charset.CharsetEncoder;
035: import java.nio.charset.UnsupportedCharsetException;
036: import java.util.HashMap;
037: import java.util.Map;
038:
039: import org.jruby.runtime.CallbackFactory;
040: import org.jruby.runtime.builtin.IRubyObject;
041: import org.jruby.util.ByteList;
042: import org.jruby.util.KCode;
043:
044: public class RubyNKF {
045: public static final NKFCharset AUTO = new NKFCharset(0,
046: "x-JISAutoDetect");
047: public static final NKFCharset JIS = new NKFCharset(1,
048: "iso-2022-jp");
049: public static final NKFCharset EUC = new NKFCharset(2, "EUC-JP");
050: public static final NKFCharset SJIS = new NKFCharset(3,
051: "Windows-31J");
052: public static final NKFCharset BINARY = new NKFCharset(4, null);
053: public static final NKFCharset NOCONV = new NKFCharset(4, null);
054: public static final NKFCharset UNKNOWN = new NKFCharset(0, null);
055: public static final NKFCharset ASCII = new NKFCharset(5,
056: "iso-8859-1");
057: public static final NKFCharset UTF8 = new NKFCharset(6, "UTF-8");
058: public static final NKFCharset UTF16 = new NKFCharset(8, "UTF-16");
059: public static final NKFCharset UTF32 = new NKFCharset(12, "UTF-32");
060: public static final NKFCharset OTHER = new NKFCharset(16, null);
061:
062: public static class NKFCharset {
063: private final int value;
064: private final String charset;
065:
066: public NKFCharset(int v, String c) {
067: value = v;
068: charset = c;
069: }
070:
071: public int getValue() {
072: return value;
073: }
074:
075: public String getCharset() {
076: return charset;
077: }
078: }
079:
080: public static void createNKF(Ruby runtime) {
081: RubyModule nkfModule = runtime.defineModule("NKF");
082: CallbackFactory callbackFactory = runtime
083: .callbackFactory(RubyNKF.class);
084:
085: nkfModule.defineConstant("AUTO", RubyFixnum.newFixnum(runtime,
086: AUTO.getValue()));
087: nkfModule.defineConstant("JIS", RubyFixnum.newFixnum(runtime,
088: JIS.getValue()));
089: nkfModule.defineConstant("EUC", RubyFixnum.newFixnum(runtime,
090: EUC.getValue()));
091: nkfModule.defineConstant("SJIS", RubyFixnum.newFixnum(runtime,
092: SJIS.getValue()));
093: nkfModule.defineConstant("BINARY", RubyFixnum.newFixnum(
094: runtime, BINARY.getValue()));
095: nkfModule.defineConstant("NOCONV", RubyFixnum.newFixnum(
096: runtime, NOCONV.getValue()));
097: nkfModule.defineConstant("UNKNOWN", RubyFixnum.newFixnum(
098: runtime, UNKNOWN.getValue()));
099: nkfModule.defineConstant("ASCII", RubyFixnum.newFixnum(runtime,
100: ASCII.getValue()));
101: nkfModule.defineConstant("UTF8", RubyFixnum.newFixnum(runtime,
102: UTF8.getValue()));
103: nkfModule.defineConstant("UTF16", RubyFixnum.newFixnum(runtime,
104: UTF16.getValue()));
105: nkfModule.defineConstant("UTF32", RubyFixnum.newFixnum(runtime,
106: UTF32.getValue()));
107: nkfModule.defineConstant("OTHER", RubyFixnum.newFixnum(runtime,
108: OTHER.getValue()));
109:
110: RubyString version = runtime
111: .newString("2.0.7 (JRuby 2007-05-11)");
112: RubyString nkfVersion = runtime.newString("2.0.7");
113: RubyString nkfDate = runtime.newString("2007-05-11");
114:
115: version.freeze();
116: nkfVersion.freeze();
117: nkfDate.freeze();
118:
119: nkfModule.defineModuleFunction("nkf", callbackFactory
120: .getFastSingletonMethod("nkf", RubyKernel.IRUBY_OBJECT,
121: RubyKernel.IRUBY_OBJECT));
122: nkfModule.defineModuleFunction("guess", callbackFactory
123: .getFastSingletonMethod("guess",
124: RubyKernel.IRUBY_OBJECT));
125: nkfModule.defineModuleFunction("guess1", callbackFactory
126: .getFastSingletonMethod("guess1",
127: RubyKernel.IRUBY_OBJECT));
128: nkfModule.defineModuleFunction("guess2", callbackFactory
129: .getFastSingletonMethod("guess2",
130: RubyKernel.IRUBY_OBJECT));
131: }
132:
133: public static IRubyObject guess(IRubyObject recv, IRubyObject s) {
134: Ruby runtime = recv.getRuntime();
135: if (!s.respondsTo("to_str")) {
136: throw runtime.newTypeError("can't convert "
137: + s.getMetaClass() + " into String");
138: }
139: ByteList bytes = s.convertToString().getByteList();
140: ByteBuffer buf = ByteBuffer.wrap(bytes.unsafeBytes(), bytes
141: .begin(), bytes.length());
142: CharsetDecoder decoder = Charset.forName("x-JISAutoDetect")
143: .newDecoder();
144: try {
145: decoder.decode(buf);
146: } catch (CharacterCodingException e) {
147: return runtime.newFixnum(UNKNOWN.getValue());
148: }
149: if (!decoder.isCharsetDetected()) {
150: return runtime.newFixnum(UNKNOWN.getValue());
151: }
152: Charset charset = decoder.detectedCharset();
153: String name = charset.name();
154: // System.out.println("detect: " + name + "\n");
155: if ("Shift_JIS".equals(name))
156: return runtime.newFixnum(SJIS.getValue());
157: if ("windows-31j".equals(name))
158: return runtime.newFixnum(SJIS.getValue());
159: else if ("EUC-JP".equals(name))
160: return runtime.newFixnum(EUC.getValue());
161: else if ("ISO-2022-JP".equals(name))
162: return runtime.newFixnum(JIS.getValue());
163: else
164: return runtime.newFixnum(UNKNOWN.getValue());
165: }
166:
167: public static IRubyObject guess1(IRubyObject recv, IRubyObject str) {
168: return guess(recv, str);
169: }
170:
171: public static IRubyObject guess2(IRubyObject recv, IRubyObject str) {
172: return guess(recv, str);
173: }
174:
175: public static IRubyObject nkf(IRubyObject recv, IRubyObject opt,
176: IRubyObject str) {
177: Ruby runtime = recv.getRuntime();
178: if (!opt.respondsTo("to_str")) {
179: throw runtime.newTypeError("can't convert "
180: + opt.getMetaClass() + " into String");
181: }
182: if (!str.respondsTo("to_str")) {
183: throw runtime.newTypeError("can't convert "
184: + str.getMetaClass() + " into String");
185: }
186:
187: Map options = parseOpt(opt.convertToString().toString());
188:
189: NKFCharset nc = (NKFCharset) options.get("input");
190: if (nc.getValue() == AUTO.getValue()) {
191: KCode kcode = runtime.getKCode();
192: if (kcode == KCode.SJIS) {
193: nc = SJIS;
194: } else if (kcode == KCode.EUC) {
195: nc = EUC;
196: } else if (kcode == KCode.UTF8) {
197: nc = UTF8;
198: }
199: }
200: String decodeCharset = nc.getCharset();
201: String encodeCharset = ((NKFCharset) options.get("output"))
202: .getCharset();
203:
204: return convert(decodeCharset, encodeCharset, str);
205: }
206:
207: private static IRubyObject convert(String decodeCharset,
208: String encodeCharset, IRubyObject str) {
209: Ruby runtime = str.getRuntime();
210: CharsetDecoder decoder;
211: CharsetEncoder encoder;
212: try {
213: decoder = Charset.forName(decodeCharset).newDecoder();
214: encoder = Charset.forName(encodeCharset).newEncoder();
215: } catch (UnsupportedCharsetException e) {
216: throw runtime.newArgumentError("invalid encoding");
217: }
218:
219: ByteList bytes = str.convertToString().getByteList();
220: ByteBuffer buf = ByteBuffer.wrap(bytes.unsafeBytes(), bytes
221: .begin(), bytes.length());
222: try {
223: CharBuffer cbuf = decoder.decode(buf);
224: buf = encoder.encode(cbuf);
225: } catch (CharacterCodingException e) {
226: throw runtime.newArgumentError("invalid encoding");
227: }
228: byte[] arr = buf.array();
229:
230: return runtime.newString(new ByteList(arr, 0, buf.limit()));
231:
232: }
233:
234: private static int optionUTF(String s, int i) {
235: int n = 8;
236: if (i + 1 < s.length() && Character.isDigit(s.charAt(i + 1))) {
237: n = Character.digit(s.charAt(i + 1), 10);
238: if (i + 2 < s.length()
239: && Character.isDigit(s.charAt(i + 2))) {
240: n *= 10;
241: n += Character.digit(s.charAt(i + 2), 10);
242: }
243: }
244: return n;
245: }
246:
247: private static Map parseOpt(String s) {
248: Map options = new HashMap();
249:
250: // default options
251: options.put("input", AUTO);
252: options.put("output", JIS);
253:
254: for (int i = 0; i < s.length(); i++) {
255: switch (s.charAt(i)) {
256: case 'b':
257: break;
258: case 'u':
259: break;
260: case 'j': // iso-2022-jp
261: options.put("output", JIS);
262: break;
263: case 's': // Shift_JIS
264: options.put("output", SJIS);
265: break;
266: case 'e': // EUC-JP
267: options.put("output", EUC);
268: break;
269: case 'w': // UTF-8
270: {
271: int n = optionUTF(s, i);
272: if (n == 32)
273: options.put("output", UTF32);
274: else if (n == 16)
275: options.put("output", UTF16);
276: else
277: options.put("output", UTF8);
278: }
279: break;
280: case 'J': // iso-2022-jp
281: options.put("input", JIS);
282: break;
283: case 'S': // Shift_JIS
284: options.put("input", SJIS);
285: break;
286: case 'E': // EUC-JP
287: options.put("input", EUC);
288: break;
289: case 'W': // UTF-8
290: {
291: int n = optionUTF(s, i);
292: if (n == 32)
293: options.put("input", UTF32);
294: else if (n == 16)
295: options.put("input", UTF16);
296: else
297: options.put("input", UTF8);
298: }
299: break;
300: case 't':
301: break;
302: case 'r':
303: break;
304: case 'h':
305: break;
306: case 'm':
307: break;
308: case 'M':
309: break;
310: case 'l':
311: break;
312: case 'f':
313: break;
314: case 'F':
315: break;
316: case 'Z':
317: break;
318: case 'X':
319: break;
320: case 'x':
321: break;
322: case 'B':
323: break;
324: case 'T':
325: break;
326: case 'd':
327: break;
328: case 'c':
329: break;
330: case 'I':
331: break;
332: case 'L':
333: break;
334: case '-':
335: if (s.charAt(i + 1) == '-') {
336: // long name option
337: }
338: default:
339: }
340: }
341: return options;
342: }
343: }
|