001: /*
002: * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
003: *
004: * This code is free software; you can redistribute it and/or modify it
005: * under the terms of the GNU General Public License version 2 only, as
006: * published by the Free Software Foundation. Sun designates this
007: * particular file as subject to the "Classpath" exception as provided
008: * by Sun in the LICENSE file that accompanied this code.
009: *
010: * This code is distributed in the hope that it will be useful, but WITHOUT
011: * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
012: * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
013: * version 2 for more details (a copy is included in the LICENSE file that
014: * accompanied this code).
015: *
016: * You should have received a copy of the GNU General Public License version
017: * 2 along with this work; if not, write to the Free Software Foundation,
018: * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
019: *
020: * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
021: * CA 95054 USA or visit www.sun.com if you need additional information or
022: * have any questions.
023: */
024: /*
025: /*
026: *******************************************************************************
027: * Copyright (C) 2003-2004, International Business Machines Corporation and *
028: * others. All Rights Reserved. *
029: *******************************************************************************
030: */
031: //
032: // CHANGELOG
033: // 2005-05-19 Edward Wang
034: // - copy this file from icu4jsrc_3_2/src/com/ibm/icu/text/StringPrep.java
035: // - move from package com.ibm.icu.text to package sun.net.idn
036: // - use ParseException instead of StringPrepParseException
037: // - change 'Normalizer.getUnicodeVersion()' to 'NormalizerImpl.getUnicodeVersion()'
038: // - remove all @deprecated tag to make compiler happy
039: //
040: package sun.net.idn;
041:
042: import java.io.BufferedInputStream;
043: import java.io.ByteArrayInputStream;
044: import java.io.IOException;
045: import java.io.InputStream;
046: import java.text.ParseException;
047:
048: import sun.text.Normalizer;
049: import sun.text.normalizer.CharTrie;
050: import sun.text.normalizer.Trie;
051: import sun.text.normalizer.NormalizerImpl;
052: import sun.text.normalizer.VersionInfo;
053: import sun.text.normalizer.UCharacter;
054: import sun.text.normalizer.UCharacterIterator;
055: import sun.text.normalizer.UTF16;
056: import sun.net.idn.UCharacterDirection;
057: import sun.net.idn.StringPrepDataReader;
058:
059: /**
060: * StringPrep API implements the StingPrep framework as described by
061: * <a href="http://www.ietf.org/rfc/rfc3454.txt">RFC 3454</a>.
062: * StringPrep prepares Unicode strings for use in network protocols.
063: * Profiles of StingPrep are set of rules and data according to which the
064: * Unicode Strings are prepared. Each profiles contains tables which describe
065: * how a code point should be treated. The tables are broadly classied into
066: * <ul>
067: * <li> Unassigned Table: Contains code points that are unassigned
068: * in the Unicode Version supported by StringPrep. Currently
069: * RFC 3454 supports Unicode 3.2. </li>
070: * <li> Prohibited Table: Contains code points that are prohibted from
071: * the output of the StringPrep processing function. </li>
072: * <li> Mapping Table: Contains code ponts that are deleted from the output or case mapped. </li>
073: * </ul>
074: *
075: * The procedure for preparing Unicode strings:
076: * <ol>
077: * <li> Map: For each character in the input, check if it has a mapping
078: * and, if so, replace it with its mapping. </li>
079: * <li> Normalize: Possibly normalize the result of step 1 using Unicode
080: * normalization. </li>
081: * <li> Prohibit: Check for any characters that are not allowed in the
082: * output. If any are found, return an error.</li>
083: * <li> Check bidi: Possibly check for right-to-left characters, and if
084: * any are found, make sure that the whole string satisfies the
085: * requirements for bidirectional strings. If the string does not
086: * satisfy the requirements for bidirectional strings, return an
087: * error. </li>
088: * </ol>
089: * @author Ram Viswanadha
090: * @draft ICU 2.8
091: */
092: public final class StringPrep {
093: /**
094: * Option to prohibit processing of unassigned code points in the input
095: *
096: * @see #prepare
097: * @draft ICU 2.8
098: */
099: public static final int DEFAULT = 0x0000;
100:
101: /**
102: * Option to allow processing of unassigned code points in the input
103: *
104: * @see #prepare
105: * @draft ICU 2.8
106: */
107: public static final int ALLOW_UNASSIGNED = 0x0001;
108:
109: private static final int UNASSIGNED = 0x0000;
110: private static final int MAP = 0x0001;
111: private static final int PROHIBITED = 0x0002;
112: private static final int DELETE = 0x0003;
113: private static final int TYPE_LIMIT = 0x0004;
114:
115: private static final int NORMALIZATION_ON = 0x0001;
116: private static final int CHECK_BIDI_ON = 0x0002;
117:
118: private static final int TYPE_THRESHOLD = 0xFFF0;
119: private static final int MAX_INDEX_VALUE = 0x3FBF; /*16139*/
120: private static final int MAX_INDEX_TOP_LENGTH = 0x0003;
121:
122: /* indexes[] value names */
123: private static final int INDEX_TRIE_SIZE = 0; /* number of bytes in normalization trie */
124: private static final int INDEX_MAPPING_DATA_SIZE = 1; /* The array that contains the mapping */
125: private static final int NORM_CORRECTNS_LAST_UNI_VERSION = 2; /* The index of Unicode version of last entry in NormalizationCorrections.txt */
126: private static final int ONE_UCHAR_MAPPING_INDEX_START = 3; /* The starting index of 1 UChar mapping index in the mapping data array */
127: private static final int TWO_UCHARS_MAPPING_INDEX_START = 4; /* The starting index of 2 UChars mapping index in the mapping data array */
128: private static final int THREE_UCHARS_MAPPING_INDEX_START = 5;
129: private static final int FOUR_UCHARS_MAPPING_INDEX_START = 6;
130: private static final int OPTIONS = 7; /* Bit set of options to turn on in the profile */
131: private static final int INDEX_TOP = 16; /* changing this requires a new formatVersion */
132:
133: /**
134: * Default buffer size of datafile
135: */
136: private static final int DATA_BUFFER_SIZE = 25000;
137:
138: /* Wrappers for Trie implementations */
139: private static final class StringPrepTrieImpl implements
140: Trie.DataManipulate {
141: private CharTrie sprepTrie = null;
142:
143: /**
144: * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's
145: * data the index array offset of the indexes for that lead surrogate.
146: * @param property data value for a surrogate from the trie, including
147: * the folding offset
148: * @return data offset or 0 if there is no data for the lead surrogate
149: */
150: public int getFoldingOffset(int value) {
151: return value;
152: }
153: }
154:
155: // CharTrie implmentation for reading the trie data
156: private StringPrepTrieImpl sprepTrieImpl;
157: // Indexes read from the data file
158: private int[] indexes;
159: // mapping data read from the data file
160: private char[] mappingData;
161: // format version of the data file
162: private byte[] formatVersion;
163: // the version of Unicode supported by the data file
164: private VersionInfo sprepUniVer;
165: // the Unicode version of last entry in the
166: // NormalizationCorrections.txt file if normalization
167: // is turned on
168: private VersionInfo normCorrVer;
169: // Option to turn on Normalization
170: private boolean doNFKC;
171: // Option to turn on checking for BiDi rules
172: private boolean checkBiDi;
173:
174: private char getCodePointValue(int ch) {
175: return sprepTrieImpl.sprepTrie.getCodePointValue(ch);
176: }
177:
178: private static VersionInfo getVersionInfo(int comp) {
179: int micro = comp & 0xFF;
180: int milli = (comp >> 8) & 0xFF;
181: int minor = (comp >> 16) & 0xFF;
182: int major = (comp >> 24) & 0xFF;
183: return VersionInfo.getInstance(major, minor, milli, micro);
184: }
185:
186: private static VersionInfo getVersionInfo(byte[] version) {
187: if (version.length != 4) {
188: return null;
189: }
190: return VersionInfo.getInstance((int) version[0],
191: (int) version[1], (int) version[2], (int) version[3]);
192: }
193:
194: /**
195: * Creates an StringPrep object after reading the input stream.
196: * The object does not hold a reference to the input steam, so the stream can be
197: * closed after the method returns.
198: *
199: * @param inputStream The stream for reading the StringPrep profile binarySun
200: * @throws IOException
201: * @draft ICU 2.8
202: */
203: public StringPrep(InputStream inputStream) throws IOException {
204:
205: BufferedInputStream b = new BufferedInputStream(inputStream,
206: DATA_BUFFER_SIZE);
207:
208: StringPrepDataReader reader = new StringPrepDataReader(b);
209:
210: // read the indexes
211: indexes = reader.readIndexes(INDEX_TOP);
212:
213: byte[] sprepBytes = new byte[indexes[INDEX_TRIE_SIZE]];
214:
215: //indexes[INDEX_MAPPING_DATA_SIZE] store the size of mappingData in bytes
216: mappingData = new char[indexes[INDEX_MAPPING_DATA_SIZE] / 2];
217: // load the rest of the data data and initialize the data members
218: reader.read(sprepBytes, mappingData);
219:
220: sprepTrieImpl = new StringPrepTrieImpl();
221: sprepTrieImpl.sprepTrie = new CharTrie(
222: new ByteArrayInputStream(sprepBytes), sprepTrieImpl);
223:
224: // get the data format version
225: formatVersion = reader.getDataFormatVersion();
226:
227: // get the options
228: doNFKC = ((indexes[OPTIONS] & NORMALIZATION_ON) > 0);
229: checkBiDi = ((indexes[OPTIONS] & CHECK_BIDI_ON) > 0);
230: sprepUniVer = getVersionInfo(reader.getUnicodeVersion());
231: normCorrVer = getVersionInfo(indexes[NORM_CORRECTNS_LAST_UNI_VERSION]);
232: VersionInfo normUniVer = NormalizerImpl.getUnicodeVersion();
233: if (normUniVer.compareTo(sprepUniVer) < 0 && /* the Unicode version of SPREP file must be less than the Unicode Vesion of the normalization data */
234: normUniVer.compareTo(normCorrVer) < 0 && /* the Unicode version of the NormalizationCorrections.txt file should be less than the Unicode Vesion of the normalization data */
235: ((indexes[OPTIONS] & NORMALIZATION_ON) > 0) /* normalization turned on*/
236: ) {
237: throw new IOException(
238: "Normalization Correction version not supported");
239: }
240: b.close();
241: }
242:
243: private static final class Values {
244: boolean isIndex;
245: int value;
246: int type;
247:
248: public void reset() {
249: isIndex = false;
250: value = 0;
251: type = -1;
252: }
253: }
254:
255: private static final void getValues(char trieWord, Values values) {
256: values.reset();
257: if (trieWord == 0) {
258: /*
259: * Initial value stored in the mapping table
260: * just return TYPE_LIMIT .. so that
261: * the source codepoint is copied to the destination
262: */
263: values.type = TYPE_LIMIT;
264: } else if (trieWord >= TYPE_THRESHOLD) {
265: values.type = (trieWord - TYPE_THRESHOLD);
266: } else {
267: /* get the type */
268: values.type = MAP;
269: /* ascertain if the value is index or delta */
270: if ((trieWord & 0x02) > 0) {
271: values.isIndex = true;
272: values.value = trieWord >> 2; //mask off the lower 2 bits and shift
273:
274: } else {
275: values.isIndex = false;
276: values.value = ((int) (trieWord << 16)) >> 16;
277: values.value = (values.value >> 2);
278:
279: }
280:
281: if ((trieWord >> 2) == MAX_INDEX_VALUE) {
282: values.type = DELETE;
283: values.isIndex = false;
284: values.value = 0;
285: }
286: }
287: }
288:
289: private StringBuffer map(UCharacterIterator iter, int options)
290: throws ParseException {
291:
292: Values val = new Values();
293: char result = 0;
294: int ch = UCharacterIterator.DONE;
295: StringBuffer dest = new StringBuffer();
296: boolean allowUnassigned = ((options & ALLOW_UNASSIGNED) > 0);
297:
298: while ((ch = iter.nextCodePoint()) != UCharacterIterator.DONE) {
299:
300: result = getCodePointValue(ch);
301: getValues(result, val);
302:
303: // check if the source codepoint is unassigned
304: if (val.type == UNASSIGNED && allowUnassigned == false) {
305: throw new ParseException(
306: "An unassigned code point was found in the input "
307: + iter.getText(), iter.getIndex());
308: } else if ((val.type == MAP)) {
309: int index, length;
310:
311: if (val.isIndex) {
312: index = val.value;
313: if (index >= indexes[ONE_UCHAR_MAPPING_INDEX_START]
314: && index < indexes[TWO_UCHARS_MAPPING_INDEX_START]) {
315: length = 1;
316: } else if (index >= indexes[TWO_UCHARS_MAPPING_INDEX_START]
317: && index < indexes[THREE_UCHARS_MAPPING_INDEX_START]) {
318: length = 2;
319: } else if (index >= indexes[THREE_UCHARS_MAPPING_INDEX_START]
320: && index < indexes[FOUR_UCHARS_MAPPING_INDEX_START]) {
321: length = 3;
322: } else {
323: length = mappingData[index++];
324: }
325: /* copy mapping to destination */
326: dest.append(mappingData, index, length);
327: continue;
328:
329: } else {
330: ch -= val.value;
331: }
332: } else if (val.type == DELETE) {
333: // just consume the codepoint and contine
334: continue;
335: }
336: //copy the source into destination
337: UTF16.append(dest, ch);
338: }
339:
340: return dest;
341: }
342:
343: private StringBuffer normalize(StringBuffer src) {
344: /*
345: * Option UNORM_BEFORE_PRI_29:
346: *
347: * IDNA as interpreted by IETF members (see unicode mailing list 2004H1)
348: * requires strict adherence to Unicode 3.2 normalization,
349: * including buggy composition from before fixing Public Review Issue #29.
350: * Note that this results in some valid but nonsensical text to be
351: * either corrupted or rejected, depending on the text.
352: * See http://www.unicode.org/review/resolved-pri.html#pri29
353: * See unorm.cpp and cnormtst.c
354: */
355: return new StringBuffer(Normalizer.normalize(src.toString(),
356: java.text.Normalizer.Form.NFKC, Normalizer.UNICODE_3_2
357: | NormalizerImpl.BEFORE_PRI_29));
358: }
359:
360: /*
361: boolean isLabelSeparator(int ch){
362: int result = getCodePointValue(ch);
363: if( (result & 0x07) == LABEL_SEPARATOR){
364: return true;
365: }
366: return false;
367: }
368: */
369: /*
370: 1) Map -- For each character in the input, check if it has a mapping
371: and, if so, replace it with its mapping.
372:
373: 2) Normalize -- Possibly normalize the result of step 1 using Unicode
374: normalization.
375:
376: 3) Prohibit -- Check for any characters that are not allowed in the
377: output. If any are found, return an error.
378:
379: 4) Check bidi -- Possibly check for right-to-left characters, and if
380: any are found, make sure that the whole string satisfies the
381: requirements for bidirectional strings. If the string does not
382: satisfy the requirements for bidirectional strings, return an
383: error.
384: [Unicode3.2] defines several bidirectional categories; each character
385: has one bidirectional category assigned to it. For the purposes of
386: the requirements below, an "RandALCat character" is a character that
387: has Unicode bidirectional categories "R" or "AL"; an "LCat character"
388: is a character that has Unicode bidirectional category "L". Note
389:
390:
391: that there are many characters which fall in neither of the above
392: definitions; Latin digits (<U+0030> through <U+0039>) are examples of
393: this because they have bidirectional category "EN".
394:
395: In any profile that specifies bidirectional character handling, all
396: three of the following requirements MUST be met:
397:
398: 1) The characters in section 5.8 MUST be prohibited.
399:
400: 2) If a string contains any RandALCat character, the string MUST NOT
401: contain any LCat character.
402:
403: 3) If a string contains any RandALCat character, a RandALCat
404: character MUST be the first character of the string, and a
405: RandALCat character MUST be the last character of the string.
406: */
407: /**
408: * Prepare the input buffer for use in applications with the given profile. This operation maps, normalizes(NFKC),
409: * checks for prohited and BiDi characters in the order defined by RFC 3454
410: * depending on the options specified in the profile.
411: *
412: * @param src A UCharacterIterator object containing the source string
413: * @param options A bit set of options:
414: *
415: * - StringPrep.NONE Prohibit processing of unassigned code points in the input
416: *
417: * - StringPrep.ALLOW_UNASSIGNED Treat the unassigned code points are in the input
418: * as normal Unicode code points.
419: *
420: * @return StringBuffer A StringBuffer containing the output
421: * @throws ParseException
422: * @draft ICU 2.8
423: */
424: public StringBuffer prepare(UCharacterIterator src, int options)
425: throws ParseException {
426:
427: // map
428: StringBuffer mapOut = map(src, options);
429: StringBuffer normOut = mapOut;// initialize
430:
431: if (doNFKC) {
432: // normalize
433: normOut = normalize(mapOut);
434: }
435:
436: int ch;
437: char result;
438: UCharacterIterator iter = UCharacterIterator
439: .getInstance(normOut);
440: Values val = new Values();
441: int direction = UCharacterDirection.CHAR_DIRECTION_COUNT, firstCharDir = UCharacterDirection.CHAR_DIRECTION_COUNT;
442: int rtlPos = -1, ltrPos = -1;
443: boolean rightToLeft = false, leftToRight = false;
444:
445: while ((ch = iter.nextCodePoint()) != UCharacterIterator.DONE) {
446: result = getCodePointValue(ch);
447: getValues(result, val);
448:
449: if (val.type == PROHIBITED) {
450: throw new ParseException(
451: "A prohibited code point was found in the input"
452: + iter.getText(), val.value);
453: }
454:
455: direction = UCharacter.getDirection(ch);
456: if (firstCharDir == UCharacterDirection.CHAR_DIRECTION_COUNT) {
457: firstCharDir = direction;
458: }
459: if (direction == UCharacterDirection.LEFT_TO_RIGHT) {
460: leftToRight = true;
461: ltrPos = iter.getIndex() - 1;
462: }
463: if (direction == UCharacterDirection.RIGHT_TO_LEFT
464: || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC) {
465: rightToLeft = true;
466: rtlPos = iter.getIndex() - 1;
467: }
468: }
469: if (checkBiDi == true) {
470: // satisfy 2
471: if (leftToRight == true && rightToLeft == true) {
472: throw new ParseException(
473: "The input does not conform to the rules for BiDi code points."
474: + iter.getText(),
475: (rtlPos > ltrPos) ? rtlPos : ltrPos);
476: }
477:
478: //satisfy 3
479: if (rightToLeft == true
480: && !((firstCharDir == UCharacterDirection.RIGHT_TO_LEFT || firstCharDir == UCharacterDirection.RIGHT_TO_LEFT_ARABIC) && (direction == UCharacterDirection.RIGHT_TO_LEFT || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC))) {
481: throw new ParseException(
482: "The input does not conform to the rules for BiDi code points."
483: + iter.getText(),
484: (rtlPos > ltrPos) ? rtlPos : ltrPos);
485: }
486: }
487: return normOut;
488:
489: }
490: }
|