001: /*
002: *******************************************************************************
003: * Copyright (C) 2003-2005, International Business Machines Corporation and *
004: * others. All Rights Reserved. *
005: *******************************************************************************
006: */
007: package com.ibm.icu.text;
008:
009: import java.io.BufferedInputStream;
010: import java.io.ByteArrayInputStream;
011: import java.io.IOException;
012: import java.io.InputStream;
013:
014: import com.ibm.icu.impl.CharTrie;
015: import com.ibm.icu.impl.StringPrepDataReader;
016: import com.ibm.icu.impl.Trie;
017: import com.ibm.icu.impl.NormalizerImpl;
018: import com.ibm.icu.impl.UBiDiProps;
019:
020: import com.ibm.icu.util.VersionInfo;
021:
022: import com.ibm.icu.lang.UCharacter;
023: import com.ibm.icu.lang.UCharacterDirection;
024:
025: /**
026: * StringPrep API implements the StingPrep framework as described by
027: * <a href="http://www.ietf.org/rfc/rfc3454.txt">RFC 3454</a>.
028: * StringPrep prepares Unicode strings for use in network protocols.
029: * Profiles of StingPrep are set of rules and data according to which the
030: * Unicode Strings are prepared. Each profiles contains tables which describe
031: * how a code point should be treated. The tables are broadly classied into
032: * <ul>
033: * <li> Unassigned Table: Contains code points that are unassigned
034: * in the Unicode Version supported by StringPrep. Currently
035: * RFC 3454 supports Unicode 3.2. </li>
036: * <li> Prohibited Table: Contains code points that are prohibted from
037: * the output of the StringPrep processing function. </li>
038: * <li> Mapping Table: Contains code ponts that are deleted from the output or case mapped. </li>
039: * </ul>
040: *
041: * The procedure for preparing Unicode strings:
042: * <ol>
043: * <li> Map: For each character in the input, check if it has a mapping
044: * and, if so, replace it with its mapping. </li>
045: * <li> Normalize: Possibly normalize the result of step 1 using Unicode
046: * normalization. </li>
047: * <li> Prohibit: Check for any characters that are not allowed in the
048: * output. If any are found, return an error.</li>
049: * <li> Check bidi: Possibly check for right-to-left characters, and if
050: * any are found, make sure that the whole string satisfies the
051: * requirements for bidirectional strings. If the string does not
052: * satisfy the requirements for bidirectional strings, return an
053: * error. </li>
054: * </ol>
055: * @author Ram Viswanadha
056: * @stable ICU 2.8
057: */
058: public final class StringPrep {
059: /**
060: * Option to prohibit processing of unassigned code points in the input
061: *
062: * @see #prepare
063: * @stable ICU 2.8
064: */
065: public static final int DEFAULT = 0x0000;
066:
067: /**
068: * Option to allow processing of unassigned code points in the input
069: *
070: * @see #prepare
071: * @stable ICU 2.8
072: */
073: public static final int ALLOW_UNASSIGNED = 0x0001;
074:
075: private static final int UNASSIGNED = 0x0000;
076: private static final int MAP = 0x0001;
077: private static final int PROHIBITED = 0x0002;
078: private static final int DELETE = 0x0003;
079: private static final int TYPE_LIMIT = 0x0004;
080:
081: private static final int NORMALIZATION_ON = 0x0001;
082: private static final int CHECK_BIDI_ON = 0x0002;
083:
084: private static final int TYPE_THRESHOLD = 0xFFF0;
085: private static final int MAX_INDEX_VALUE = 0x3FBF; /*16139*/
086: private static final int MAX_INDEX_TOP_LENGTH = 0x0003;
087:
088: /* indexes[] value names */
089: private static final int INDEX_TRIE_SIZE = 0; /* number of bytes in normalization trie */
090: private static final int INDEX_MAPPING_DATA_SIZE = 1; /* The array that contains the mapping */
091: private static final int NORM_CORRECTNS_LAST_UNI_VERSION = 2; /* The index of Unicode version of last entry in NormalizationCorrections.txt */
092: private static final int ONE_UCHAR_MAPPING_INDEX_START = 3; /* The starting index of 1 UChar mapping index in the mapping data array */
093: private static final int TWO_UCHARS_MAPPING_INDEX_START = 4; /* The starting index of 2 UChars mapping index in the mapping data array */
094: private static final int THREE_UCHARS_MAPPING_INDEX_START = 5;
095: private static final int FOUR_UCHARS_MAPPING_INDEX_START = 6;
096: private static final int OPTIONS = 7; /* Bit set of options to turn on in the profile */
097: private static final int INDEX_TOP = 16; /* changing this requires a new formatVersion */
098:
099: /**
100: * Default buffer size of datafile
101: */
102: private static final int DATA_BUFFER_SIZE = 25000;
103:
104: // CharTrie implmentation for reading the trie data
105: private CharTrie sprepTrie;
106: // Indexes read from the data file
107: private int[] indexes;
108: // mapping data read from the data file
109: private char[] mappingData;
110: // format version of the data file
111: private byte[] formatVersion;
112: // the version of Unicode supported by the data file
113: private VersionInfo sprepUniVer;
114: // the Unicode version of last entry in the
115: // NormalizationCorrections.txt file if normalization
116: // is turned on
117: private VersionInfo normCorrVer;
118: // Option to turn on Normalization
119: private boolean doNFKC;
120: // Option to turn on checking for BiDi rules
121: private boolean checkBiDi;
122: // bidi properties
123: private UBiDiProps bdp;
124:
125: private char getCodePointValue(int ch) {
126: return sprepTrie.getCodePointValue(ch);
127: }
128:
129: private static VersionInfo getVersionInfo(int comp) {
130: int micro = comp & 0xFF;
131: int milli = (comp >> 8) & 0xFF;
132: int minor = (comp >> 16) & 0xFF;
133: int major = (comp >> 24) & 0xFF;
134: return VersionInfo.getInstance(major, minor, milli, micro);
135: }
136:
137: private static VersionInfo getVersionInfo(byte[] version) {
138: if (version.length != 4) {
139: return null;
140: }
141: return VersionInfo.getInstance((int) version[0],
142: (int) version[1], (int) version[2], (int) version[3]);
143: }
144:
145: /**
146: * Creates an StringPrep object after reading the input stream.
147: * The object does not hold a reference to the input steam, so the stream can be
148: * closed after the method returns.
149: *
150: * @param inputStream The stream for reading the StringPrep profile binarySun
151: * @throws IOException
152: * @stable ICU 2.8
153: */
154: public StringPrep(InputStream inputStream) throws IOException {
155:
156: BufferedInputStream b = new BufferedInputStream(inputStream,
157: DATA_BUFFER_SIZE);
158:
159: StringPrepDataReader reader = new StringPrepDataReader(b);
160:
161: // read the indexes
162: indexes = reader.readIndexes(INDEX_TOP);
163:
164: byte[] sprepBytes = new byte[indexes[INDEX_TRIE_SIZE]];
165:
166: //indexes[INDEX_MAPPING_DATA_SIZE] store the size of mappingData in bytes
167: mappingData = new char[indexes[INDEX_MAPPING_DATA_SIZE] / 2];
168: // load the rest of the data data and initialize the data members
169: reader.read(sprepBytes, mappingData);
170:
171: sprepTrie = new CharTrie(new ByteArrayInputStream(sprepBytes),
172: null);
173:
174: // get the data format version
175: formatVersion = reader.getDataFormatVersion();
176:
177: // get the options
178: doNFKC = ((indexes[OPTIONS] & NORMALIZATION_ON) > 0);
179: checkBiDi = ((indexes[OPTIONS] & CHECK_BIDI_ON) > 0);
180: sprepUniVer = getVersionInfo(reader.getUnicodeVersion());
181: normCorrVer = getVersionInfo(indexes[NORM_CORRECTNS_LAST_UNI_VERSION]);
182: VersionInfo normUniVer = Normalizer.getUnicodeVersion();
183: if (normUniVer.compareTo(sprepUniVer) < 0 && /* the Unicode version of SPREP file must be less than the Unicode Vesion of the normalization data */
184: normUniVer.compareTo(normCorrVer) < 0 && /* the Unicode version of the NormalizationCorrections.txt file should be less than the Unicode Vesion of the normalization data */
185: ((indexes[OPTIONS] & NORMALIZATION_ON) > 0) /* normalization turned on*/
186: ) {
187: throw new IOException(
188: "Normalization Correction version not supported");
189: }
190: b.close();
191:
192: if (checkBiDi) {
193: bdp = UBiDiProps.getSingleton();
194: }
195: }
196:
197: private static final class Values {
198: boolean isIndex;
199: int value;
200: int type;
201:
202: public void reset() {
203: isIndex = false;
204: value = 0;
205: type = -1;
206: }
207: }
208:
209: private static final void getValues(char trieWord, Values values) {
210: values.reset();
211: if (trieWord == 0) {
212: /*
213: * Initial value stored in the mapping table
214: * just return TYPE_LIMIT .. so that
215: * the source codepoint is copied to the destination
216: */
217: values.type = TYPE_LIMIT;
218: } else if (trieWord >= TYPE_THRESHOLD) {
219: values.type = (trieWord - TYPE_THRESHOLD);
220: } else {
221: /* get the type */
222: values.type = MAP;
223: /* ascertain if the value is index or delta */
224: if ((trieWord & 0x02) > 0) {
225: values.isIndex = true;
226: values.value = trieWord >> 2; //mask off the lower 2 bits and shift
227:
228: } else {
229: values.isIndex = false;
230: values.value = ((int) (trieWord << 16)) >> 16;
231: values.value = (values.value >> 2);
232:
233: }
234:
235: if ((trieWord >> 2) == MAX_INDEX_VALUE) {
236: values.type = DELETE;
237: values.isIndex = false;
238: values.value = 0;
239: }
240: }
241: }
242:
243: private StringBuffer map(UCharacterIterator iter, int options)
244: throws StringPrepParseException {
245:
246: Values val = new Values();
247: char result = 0;
248: int ch = UCharacterIterator.DONE;
249: StringBuffer dest = new StringBuffer();
250: boolean allowUnassigned = ((options & ALLOW_UNASSIGNED) > 0);
251:
252: while ((ch = iter.nextCodePoint()) != UCharacterIterator.DONE) {
253:
254: result = getCodePointValue(ch);
255: getValues(result, val);
256:
257: // check if the source codepoint is unassigned
258: if (val.type == UNASSIGNED && allowUnassigned == false) {
259: throw new StringPrepParseException(
260: "An unassigned code point was found in the input",
261: StringPrepParseException.UNASSIGNED_ERROR, iter
262: .getText(), iter.getIndex());
263: } else if ((val.type == MAP)) {
264: int index, length;
265:
266: if (val.isIndex) {
267: index = val.value;
268: if (index >= indexes[ONE_UCHAR_MAPPING_INDEX_START]
269: && index < indexes[TWO_UCHARS_MAPPING_INDEX_START]) {
270: length = 1;
271: } else if (index >= indexes[TWO_UCHARS_MAPPING_INDEX_START]
272: && index < indexes[THREE_UCHARS_MAPPING_INDEX_START]) {
273: length = 2;
274: } else if (index >= indexes[THREE_UCHARS_MAPPING_INDEX_START]
275: && index < indexes[FOUR_UCHARS_MAPPING_INDEX_START]) {
276: length = 3;
277: } else {
278: length = mappingData[index++];
279: }
280: /* copy mapping to destination */
281: dest.append(mappingData, index, length);
282: continue;
283:
284: } else {
285: ch -= val.value;
286: }
287: } else if (val.type == DELETE) {
288: // just consume the codepoint and contine
289: continue;
290: }
291: //copy the source into destination
292: UTF16.append(dest, ch);
293: }
294:
295: return dest;
296: }
297:
298: private StringBuffer normalize(StringBuffer src) {
299: /*
300: * Option UNORM_BEFORE_PRI_29:
301: *
302: * IDNA as interpreted by IETF members (see unicode mailing list 2004H1)
303: * requires strict adherence to Unicode 3.2 normalization,
304: * including buggy composition from before fixing Public Review Issue #29.
305: * Note that this results in some valid but nonsensical text to be
306: * either corrupted or rejected, depending on the text.
307: * See http://www.unicode.org/review/resolved-pri.html#pri29
308: * See unorm.cpp and cnormtst.c
309: */
310: return new StringBuffer(Normalizer.normalize(src.toString(),
311: Normalizer.NFKC, Normalizer.UNICODE_3_2
312: | NormalizerImpl.BEFORE_PRI_29));
313: }
314:
315: /*
316: boolean isLabelSeparator(int ch){
317: int result = getCodePointValue(ch);
318: if( (result & 0x07) == LABEL_SEPARATOR){
319: return true;
320: }
321: return false;
322: }
323: */
324: /*
325: 1) Map -- For each character in the input, check if it has a mapping
326: and, if so, replace it with its mapping.
327:
328: 2) Normalize -- Possibly normalize the result of step 1 using Unicode
329: normalization.
330:
331: 3) Prohibit -- Check for any characters that are not allowed in the
332: output. If any are found, return an error.
333:
334: 4) Check bidi -- Possibly check for right-to-left characters, and if
335: any are found, make sure that the whole string satisfies the
336: requirements for bidirectional strings. If the string does not
337: satisfy the requirements for bidirectional strings, return an
338: error.
339: [Unicode3.2] defines several bidirectional categories; each character
340: has one bidirectional category assigned to it. For the purposes of
341: the requirements below, an "RandALCat character" is a character that
342: has Unicode bidirectional categories "R" or "AL"; an "LCat character"
343: is a character that has Unicode bidirectional category "L". Note
344:
345:
346: that there are many characters which fall in neither of the above
347: definitions; Latin digits (<U+0030> through <U+0039>) are examples of
348: this because they have bidirectional category "EN".
349:
350: In any profile that specifies bidirectional character handling, all
351: three of the following requirements MUST be met:
352:
353: 1) The characters in section 5.8 MUST be prohibited.
354:
355: 2) If a string contains any RandALCat character, the string MUST NOT
356: contain any LCat character.
357:
358: 3) If a string contains any RandALCat character, a RandALCat
359: character MUST be the first character of the string, and a
360: RandALCat character MUST be the last character of the string.
361: */
362: /**
363: * Prepare the input buffer for use in applications with the given profile. This operation maps, normalizes(NFKC),
364: * checks for prohited and BiDi characters in the order defined by RFC 3454
365: * depending on the options specified in the profile.
366: *
367: * @param src A UCharacterIterator object containing the source string
368: * @param options A bit set of options:
369: *
370: * - StringPrep.NONE Prohibit processing of unassigned code points in the input
371: *
372: * - StringPrep.ALLOW_UNASSIGNED Treat the unassigned code points are in the input
373: * as normal Unicode code points.
374: *
375: * @return StringBuffer A StringBuffer containing the output
376: * @throws ParseException
377: * @stable ICU 2.8
378: */
379: public StringBuffer prepare(UCharacterIterator src, int options)
380: throws StringPrepParseException {
381:
382: // map
383: StringBuffer mapOut = map(src, options);
384: StringBuffer normOut = mapOut;// initialize
385:
386: if (doNFKC) {
387: // normalize
388: normOut = normalize(mapOut);
389: }
390:
391: int ch;
392: char result;
393: UCharacterIterator iter = UCharacterIterator
394: .getInstance(normOut);
395: Values val = new Values();
396: int direction = UCharacterDirection.CHAR_DIRECTION_COUNT, firstCharDir = UCharacterDirection.CHAR_DIRECTION_COUNT;
397: int rtlPos = -1, ltrPos = -1;
398: boolean rightToLeft = false, leftToRight = false;
399:
400: while ((ch = iter.nextCodePoint()) != UCharacterIterator.DONE) {
401: result = getCodePointValue(ch);
402: getValues(result, val);
403:
404: if (val.type == PROHIBITED) {
405: throw new StringPrepParseException(
406: "A prohibited code point was found in the input",
407: StringPrepParseException.PROHIBITED_ERROR, iter
408: .getText(), val.value);
409: }
410:
411: if (checkBiDi) {
412: direction = bdp.getClass(ch);
413: if (firstCharDir == UCharacterDirection.CHAR_DIRECTION_COUNT) {
414: firstCharDir = direction;
415: }
416: if (direction == UCharacterDirection.LEFT_TO_RIGHT) {
417: leftToRight = true;
418: ltrPos = iter.getIndex() - 1;
419: }
420: if (direction == UCharacterDirection.RIGHT_TO_LEFT
421: || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC) {
422: rightToLeft = true;
423: rtlPos = iter.getIndex() - 1;
424: }
425: }
426: }
427: if (checkBiDi == true) {
428: // satisfy 2
429: if (leftToRight == true && rightToLeft == true) {
430: throw new StringPrepParseException(
431: "The input does not conform to the rules for BiDi code points.",
432: StringPrepParseException.CHECK_BIDI_ERROR, iter
433: .getText(), (rtlPos > ltrPos) ? rtlPos
434: : ltrPos);
435: }
436:
437: //satisfy 3
438: if (rightToLeft == true
439: && !((firstCharDir == UCharacterDirection.RIGHT_TO_LEFT || firstCharDir == UCharacterDirection.RIGHT_TO_LEFT_ARABIC) && (direction == UCharacterDirection.RIGHT_TO_LEFT || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC))) {
440: throw new StringPrepParseException(
441: "The input does not conform to the rules for BiDi code points.",
442: StringPrepParseException.CHECK_BIDI_ERROR, iter
443: .getText(), (rtlPos > ltrPos) ? rtlPos
444: : ltrPos);
445: }
446: }
447: return normOut;
448:
449: }
450: }
|