0001: /**
0002: *******************************************************************************
0003: * Copyright (C) 1996-2006, International Business Machines Corporation and *
0004: * others. All Rights Reserved. *
0005: *******************************************************************************
0006: */package com.ibm.icu.impl;
0007:
0008: import java.io.InputStream;
0009: import java.io.BufferedInputStream;
0010: import java.io.IOException;
0011: import java.util.MissingResourceException;
0012:
0013: import com.ibm.icu.text.UTF16;
0014: import com.ibm.icu.text.UnicodeSet;
0015: import com.ibm.icu.lang.UCharacter;
0016: import com.ibm.icu.lang.UCharacterCategory;
0017:
/**
 * Internal class to manage character names.
 * Since data for names are stored
 * in an array of char, by default indexes used in this class refer to
 * a 2 byte count, unless otherwise stated. In cases where the index refers
 * to a byte count, the index is halved and, depending on whether the index is
 * even or odd, the MSB or LSB of the result char at the halved index is
 * returned. For indexes to an array of int, the index is multiplied by 2 and
 * the result char at the multiplied index and its following char are returned
 * as an int.
 * <a href="../lang/UCharacter.html">UCharacter</a> acts as a public facade for this class.
 * Note : 0 - 0x1F are control characters without names in Unicode 3.0
 * @author Syn Wee Quek
 * @since nov0700
 */
0033:
0034: public final class UCharacterName {
0035: // public data members ----------------------------------------------
0036:
/**
 * Number of lines (code points) per group.
 * Equal to 1 << GROUP_SHIFT_.
 */
public static final int LINES_PER_GROUP_ = 1 << 5;
/**
 * Number of groups in the group information array.
 * Assigned by setGroupCountSize.
 */
public int m_groupcount_ = 0;
0046:
0047: // public methods ---------------------------------------------------
0048:
0049: /**
0050: * Gets the only instance of UCharacterName
0051: * @return only instance of UCharacterName
0052: * @exception MissingResourceException thrown when reading of name data fails
0053: */
0054: public static UCharacterName getInstance() {
0055: if (INSTANCE_ == null) {
0056: try {
0057: INSTANCE_ = new UCharacterName();
0058: } catch (IOException e) {
0059: throw new MissingResourceException(
0060: "Could not construct UCharacterName. Missing unames.icu",
0061: "", "");
0062: } catch (Exception e) {
0063: throw new MissingResourceException(e.getMessage(), "",
0064: "");
0065: }
0066: }
0067: return INSTANCE_;
0068: }
0069:
0070: /**
0071: * Retrieve the name of a Unicode code point.
0072: * Depending on <code>choice</code>, the character name written into the
0073: * buffer is the "modern" name or the name that was defined in Unicode
0074: * version 1.0.
0075: * The name contains only "invariant" characters
0076: * like A-Z, 0-9, space, and '-'.
0077: *
0078: * @param ch the code point for which to get the name.
0079: * @param choice Selector for which name to get.
0080: * @return if code point is above 0x1fff, null is returned
0081: */
0082: public String getName(int ch, int choice) {
0083: if (ch < UCharacter.MIN_VALUE || ch > UCharacter.MAX_VALUE
0084: || choice > UCharacterNameChoice.CHAR_NAME_CHOICE_COUNT) {
0085: return null;
0086: }
0087:
0088: String result = null;
0089:
0090: result = getAlgName(ch, choice);
0091:
0092: // getting normal character name
0093: if (result == null || result.length() == 0) {
0094: if (choice == UCharacterNameChoice.EXTENDED_CHAR_NAME) {
0095: result = getExtendedName(ch);
0096: } else {
0097: result = getGroupName(ch, choice);
0098: }
0099: }
0100:
0101: return result;
0102: }
0103:
0104: /**
0105: * Find a character by its name and return its code point value
0106: * @param choice selector to indicate if argument name is a Unicode 1.0
0107: * or the most current version
0108: * @param name the name to search for
0109: * @return code point
0110: */
0111: public int getCharFromName(int choice, String name) {
0112: // checks for illegal arguments
0113: if (choice >= UCharacterNameChoice.CHAR_NAME_CHOICE_COUNT
0114: || name == null || name.length() == 0) {
0115: return -1;
0116: }
0117:
0118: // try extended names first
0119: int result = getExtendedChar(name.toLowerCase(), choice);
0120: if (result >= -1) {
0121: return result;
0122: }
0123:
0124: String upperCaseName = name.toUpperCase();
0125: // try algorithmic names first, if fails then try group names
0126: // int result = getAlgorithmChar(choice, uppercasename);
0127:
0128: if (choice != UCharacterNameChoice.UNICODE_10_CHAR_NAME) {
0129: int count = 0;
0130: if (m_algorithm_ != null) {
0131: count = m_algorithm_.length;
0132: }
0133: for (count--; count >= 0; count--) {
0134: result = m_algorithm_[count].getChar(upperCaseName);
0135: if (result >= 0) {
0136: return result;
0137: }
0138: }
0139: }
0140:
0141: if (choice == UCharacterNameChoice.EXTENDED_CHAR_NAME) {
0142: result = getGroupChar(upperCaseName,
0143: UCharacterNameChoice.UNICODE_CHAR_NAME);
0144: if (result == -1) {
0145: result = getGroupChar(upperCaseName,
0146: UCharacterNameChoice.UNICODE_10_CHAR_NAME);
0147: }
0148: } else {
0149: result = getGroupChar(upperCaseName, choice);
0150: }
0151: return result;
0152: }
0153:
0154: // these are all UCharacterNameIterator use methods -------------------
0155:
0156: /**
0157: * Reads a block of compressed lengths of 32 strings and expands them into
0158: * offsets and lengths for each string. Lengths are stored with a
0159: * variable-width encoding in consecutive nibbles:
0160: * If a nibble<0xc, then it is the length itself (0 = empty string).
0161: * If a nibble>=0xc, then it forms a length value with the following
0162: * nibble.
0163: * The offsets and lengths arrays must be at least 33 (one more) long
0164: * because there is no check here at the end if the last nibble is still
0165: * used.
0166: * @param index of group string object in array
0167: * @param offsets array to store the value of the string offsets
0168: * @param lengths array to store the value of the string length
0169: * @return next index of the data string immediately after the lengths
0170: * in terms of byte address
0171: */
0172: public int getGroupLengths(int index, char offsets[],
0173: char lengths[]) {
0174: char length = 0xffff;
0175: byte b = 0, n = 0;
0176: int shift;
0177: index = index * m_groupsize_; // byte count offsets of group strings
0178: int stringoffset = UCharacterUtility.toInt(m_groupinfo_[index
0179: + OFFSET_HIGH_OFFSET_], m_groupinfo_[index
0180: + OFFSET_LOW_OFFSET_]);
0181:
0182: offsets[0] = 0;
0183:
0184: // all 32 lengths must be read to get the offset of the first group
0185: // string
0186: for (int i = 0; i < LINES_PER_GROUP_; stringoffset++) {
0187: b = m_groupstring_[stringoffset];
0188: shift = 4;
0189:
0190: while (shift >= 0) {
0191: // getting nibble
0192: n = (byte) ((b >> shift) & 0x0F);
0193: if (length == 0xffff && n > SINGLE_NIBBLE_MAX_) {
0194: length = (char) ((n - 12) << 4);
0195: } else {
0196: if (length != 0xffff) {
0197: lengths[i] = (char) ((length | n) + 12);
0198: } else {
0199: lengths[i] = (char) n;
0200: }
0201:
0202: if (i < LINES_PER_GROUP_) {
0203: offsets[i + 1] = (char) (offsets[i] + lengths[i]);
0204: }
0205:
0206: length = 0xffff;
0207: i++;
0208: }
0209:
0210: shift -= 4;
0211: }
0212: }
0213: return stringoffset;
0214: }
0215:
0216: /**
0217: * Gets the name of the argument group index.
0218: * UnicodeData.txt uses ';' as a field separator, so no field can contain
0219: * ';' as part of its contents. In unames.icu, it is marked as
0220: * token[';'] == -1 only if the semicolon is used in the data file - which
0221: * is iff we have Unicode 1.0 names or ISO comments.
0222: * So, it will be token[';'] == -1 if we store U1.0 names/ISO comments
0223: * although we know that it will never be part of a name.
0224: * Equivalent to ICU4C's expandName.
0225: * @param index of the group name string in byte count
0226: * @param length of the group name string
0227: * @param choice of Unicode 1.0 name or the most current name
0228: * @return name of the group
0229: */
0230: public String getGroupName(int index, int length, int choice) {
0231: if (choice == UCharacterNameChoice.UNICODE_10_CHAR_NAME
0232: || choice == UCharacterNameChoice.ISO_COMMENT_) {
0233: if (';' >= m_tokentable_.length
0234: || m_tokentable_[';'] == 0xFFFF) {
0235: // skip the modern name
0236: int oldindex = index;
0237: index += UCharacterUtility.skipByteSubString(
0238: m_groupstring_, index, length, (byte) ';');
0239: length -= (index - oldindex);
0240: if (choice == UCharacterNameChoice.ISO_COMMENT_) {
0241: // skips the 1.0 Name to the iso comment part
0242: oldindex = index;
0243: index += UCharacterUtility.skipByteSubString(
0244: m_groupstring_, index, length, (byte) ';');
0245: length -= (index - oldindex);
0246: }
0247: } else {
0248: // the semicolon byte is a token number, therefore only modern
0249: // names are stored in unames.dat and there is no such
0250: // requested Unicode 1.0 name here
0251: length = 0;
0252: }
0253: }
0254:
0255: synchronized (m_utilStringBuffer_) {
0256: m_utilStringBuffer_.delete(0, m_utilStringBuffer_.length());
0257: byte b;
0258: char token;
0259: for (int i = 0; i < length;) {
0260: b = m_groupstring_[index + i];
0261: i++;
0262:
0263: if (b >= m_tokentable_.length) {
0264: if (b == ';') {
0265: break;
0266: }
0267: m_utilStringBuffer_.append(b); // implicit letter
0268: } else {
0269: token = m_tokentable_[b & 0x00ff];
0270: if (token == 0xFFFE) {
0271: // this is a lead byte for a double-byte token
0272: token = m_tokentable_[b << 8
0273: | (m_groupstring_[index + i] & 0x00ff)];
0274: i++;
0275: }
0276: if (token == 0xFFFF) {
0277: if (b == ';') {
0278: // skip the semicolon if we are seeking extended
0279: // names and there was no 2.0 name but there
0280: // is a 1.0 name.
0281: if (m_utilStringBuffer_.length() == 0
0282: && choice == UCharacterNameChoice.EXTENDED_CHAR_NAME) {
0283: continue;
0284: }
0285: break;
0286: }
0287: // explicit letter
0288: m_utilStringBuffer_.append((char) (b & 0x00ff));
0289: } else { // write token word
0290: UCharacterUtility.getNullTermByteSubString(
0291: m_utilStringBuffer_, m_tokenstring_,
0292: token);
0293: }
0294: }
0295: }
0296:
0297: if (m_utilStringBuffer_.length() > 0) {
0298: return m_utilStringBuffer_.toString();
0299: }
0300: }
0301: return null;
0302: }
0303:
0304: /**
0305: * Retrieves the extended name
0306: */
0307: public String getExtendedName(int ch) {
0308: String result = getName(ch,
0309: UCharacterNameChoice.UNICODE_CHAR_NAME);
0310: if (result == null) {
0311: if (getType(ch) == UCharacterCategory.CONTROL) {
0312: result = getName(ch,
0313: UCharacterNameChoice.UNICODE_10_CHAR_NAME);
0314: }
0315: if (result == null) {
0316: result = getExtendedOr10Name(ch);
0317: }
0318: }
0319: return result;
0320: }
0321:
0322: /**
0323: * Gets the group index for the codepoint, or the group before it.
0324: * @param codepoint
0325: * @return group index containing codepoint or the group before it.
0326: */
0327: public int getGroup(int codepoint) {
0328: int endGroup = m_groupcount_;
0329: int msb = getCodepointMSB(codepoint);
0330: int result = 0;
0331: // binary search for the group of names that contains the one for
0332: // code
0333: // find the group that contains codepoint, or the highest before it
0334: while (result < endGroup - 1) {
0335: int gindex = (result + endGroup) >> 1;
0336: if (msb < getGroupMSB(gindex)) {
0337: endGroup = gindex;
0338: } else {
0339: result = gindex;
0340: }
0341: }
0342: return result;
0343: }
0344:
0345: /**
0346: * Gets the extended and 1.0 name when the most current unicode names
0347: * fail
0348: * @param ch codepoint
0349: * @return name of codepoint extended or 1.0
0350: */
0351: public String getExtendedOr10Name(int ch) {
0352: String result = null;
0353: if (getType(ch) == UCharacterCategory.CONTROL) {
0354: result = getName(ch,
0355: UCharacterNameChoice.UNICODE_10_CHAR_NAME);
0356: }
0357: if (result == null) {
0358: int type = getType(ch);
0359: // Return unknown if the table of names above is not up to
0360: // date.
0361: if (type >= TYPE_NAMES_.length) {
0362: result = UNKNOWN_TYPE_NAME_;
0363: } else {
0364: result = TYPE_NAMES_[type];
0365: }
0366: synchronized (m_utilStringBuffer_) {
0367: m_utilStringBuffer_.delete(0, m_utilStringBuffer_
0368: .length());
0369: m_utilStringBuffer_.append('<');
0370: m_utilStringBuffer_.append(result);
0371: m_utilStringBuffer_.append('-');
0372: String chStr = Integer.toHexString(ch).toUpperCase();
0373: int zeros = 4 - chStr.length();
0374: while (zeros > 0) {
0375: m_utilStringBuffer_.append('0');
0376: zeros--;
0377: }
0378: m_utilStringBuffer_.append(chStr);
0379: m_utilStringBuffer_.append('>');
0380: result = m_utilStringBuffer_.toString();
0381: }
0382: }
0383: return result;
0384: }
0385:
0386: /**
0387: * Gets the MSB from the group index
0388: * @param gindex group index
0389: * @return the MSB of the group if gindex is valid, -1 otherwise
0390: */
0391: public int getGroupMSB(int gindex) {
0392: if (gindex >= m_groupcount_) {
0393: return -1;
0394: }
0395: return m_groupinfo_[gindex * m_groupsize_];
0396: }
0397:
0398: /**
0399: * Gets the MSB of the codepoint
0400: * @param codepoint
0401: * @return the MSB of the codepoint
0402: */
0403: public static int getCodepointMSB(int codepoint) {
0404: return codepoint >> GROUP_SHIFT_;
0405: }
0406:
0407: /**
0408: * Gets the maximum codepoint + 1 of the group
0409: * @param msb most significant byte of the group
0410: * @return limit codepoint of the group
0411: */
0412: public static int getGroupLimit(int msb) {
0413: return (msb << GROUP_SHIFT_) + LINES_PER_GROUP_;
0414: }
0415:
0416: /**
0417: * Gets the minimum codepoint of the group
0418: * @param msb most significant byte of the group
0419: * @return minimum codepoint of the group
0420: */
0421: public static int getGroupMin(int msb) {
0422: return msb << GROUP_SHIFT_;
0423: }
0424:
0425: /**
0426: * Gets the offset to a group
0427: * @param codepoint
0428: * @return offset to a group
0429: */
0430: public static int getGroupOffset(int codepoint) {
0431: return codepoint & GROUP_MASK_;
0432: }
0433:
0434: /**
0435: * Gets the minimum codepoint of a group
0436: * @param codepoint
0437: * @return minimum codepoint in the group which codepoint belongs to
0438: */
0439: ///CLOVER:OFF
0440: public static int getGroupMinFromCodepoint(int codepoint) {
0441: return codepoint & ~GROUP_MASK_;
0442: }
0443:
0444: ///CLOVER:ON
0445:
0446: /**
0447: * Get the Algorithm range length
0448: * @return Algorithm range length
0449: */
0450: public int getAlgorithmLength() {
0451: return m_algorithm_.length;
0452: }
0453:
0454: /**
0455: * Gets the start of the range
0456: * @param index algorithm index
0457: * @return algorithm range start
0458: */
0459: public int getAlgorithmStart(int index) {
0460: return m_algorithm_[index].m_rangestart_;
0461: }
0462:
0463: /**
0464: * Gets the end of the range
0465: * @param index algorithm index
0466: * @return algorithm range end
0467: */
0468: public int getAlgorithmEnd(int index) {
0469: return m_algorithm_[index].m_rangeend_;
0470: }
0471:
0472: /**
0473: * Gets the Algorithmic name of the codepoint
0474: * @param index algorithmic range index
0475: * @param codepoint
0476: * @return algorithmic name of codepoint
0477: */
0478: public String getAlgorithmName(int index, int codepoint) {
0479: String result = null;
0480: synchronized (m_utilStringBuffer_) {
0481: m_utilStringBuffer_.delete(0, m_utilStringBuffer_.length());
0482: m_algorithm_[index].appendName(codepoint,
0483: m_utilStringBuffer_);
0484: result = m_utilStringBuffer_.toString();
0485: }
0486: return result;
0487: }
0488:
0489: /**
0490: * Gets the group name of the character
0491: * @param ch character to get the group name
0492: * @param choice name choice selector to choose a unicode 1.0 or newer name
0493: */
0494: public String getGroupName(int ch, int choice) {
0495: // gets the msb
0496: int msb = getCodepointMSB(ch);
0497: int group = getGroup(ch);
0498:
0499: // return this if it is an exact match
0500: if (msb == m_groupinfo_[group * m_groupsize_]) {
0501: int index = getGroupLengths(group, m_groupoffsets_,
0502: m_grouplengths_);
0503: int offset = ch & GROUP_MASK_;
0504: return getGroupName(index + m_groupoffsets_[offset],
0505: m_grouplengths_[offset], choice);
0506: }
0507:
0508: return null;
0509: }
0510:
0511: // these are transliterator use methods ---------------------------------
0512:
0513: /**
0514: * Gets the maximum length of any codepoint name.
0515: * Equivalent to uprv_getMaxCharNameLength.
0516: * @return the maximum length of any codepoint name
0517: */
0518: public int getMaxCharNameLength() {
0519: if (initNameSetsLengths()) {
0520: return m_maxNameLength_;
0521: } else {
0522: return 0;
0523: }
0524: }
0525:
0526: /**
0527: * Gets the maximum length of any iso comments.
0528: * Equivalent to uprv_getMaxISOCommentLength.
0529: * @return the maximum length of any codepoint name
0530: */
0531: ///CLOVER:OFF
0532: public int getMaxISOCommentLength() {
0533: if (initNameSetsLengths()) {
0534: return m_maxISOCommentLength_;
0535: } else {
0536: return 0;
0537: }
0538: }
0539:
0540: ///CLOVER:ON
0541:
0542: /**
0543: * Fills set with characters that are used in Unicode character names.
0544: * Equivalent to uprv_getCharNameCharacters.
0545: * @param set USet to receive characters. Existing contents are deleted.
0546: */
0547: public void getCharNameCharacters(UnicodeSet set) {
0548: convert(m_nameSet_, set);
0549: }
0550:
0551: /**
0552: * Fills set with characters that are used in Unicode character names.
0553: * Equivalent to uprv_getISOCommentCharacters.
0554: * @param set USet to receive characters. Existing contents are deleted.
0555: */
0556: ///CLOVER:OFF
0557: public void getISOCommentCharacters(UnicodeSet set) {
0558: convert(m_ISOCommentSet_, set);
0559: }
0560:
0561: ///CLOVER:ON
0562:
0563: // package private inner class --------------------------------------
0564:
0565: /**
0566: * Algorithmic name class
0567: */
0568: static final class AlgorithmName {
0569: // package private data members ----------------------------------
0570:
0571: /**
0572: * Constant type value of the different AlgorithmName
0573: */
0574: static final int TYPE_0_ = 0;
0575: static final int TYPE_1_ = 1;
0576:
0577: // package private constructors ----------------------------------
0578:
0579: /**
0580: * Constructor
0581: */
0582: AlgorithmName() {
0583: }
0584:
0585: // package private methods ---------------------------------------
0586:
0587: /**
0588: * Sets the information for accessing the algorithmic names
0589: * @param rangestart starting code point that lies within this name group
0590: * @param rangeend end code point that lies within this name group
0591: * @param type algorithm type. There's 2 kinds of algorithmic type. First
0592: * which uses code point as part of its name and the other uses
0593: * variant postfix strings
0594: * @param variant algorithmic variant
0595: * @return true if values are valid
0596: */
0597: boolean setInfo(int rangestart, int rangeend, byte type,
0598: byte variant) {
0599: if (rangestart >= UCharacter.MIN_VALUE
0600: && rangestart <= rangeend
0601: && rangeend <= UCharacter.MAX_VALUE
0602: && (type == TYPE_0_ || type == TYPE_1_)) {
0603: m_rangestart_ = rangestart;
0604: m_rangeend_ = rangeend;
0605: m_type_ = type;
0606: m_variant_ = variant;
0607: return true;
0608: }
0609: return false;
0610: }
0611:
0612: /**
0613: * Sets the factor data
0614: * @param factor Array of factor
0615: * @return true if factors are valid
0616: */
0617: boolean setFactor(char factor[]) {
0618: if (factor.length == m_variant_) {
0619: m_factor_ = factor;
0620: return true;
0621: }
0622: return false;
0623: }
0624:
0625: /**
0626: * Sets the name prefix
0627: * @param prefix
0628: * @return true if prefix is set
0629: */
0630: boolean setPrefix(String prefix) {
0631: if (prefix != null && prefix.length() > 0) {
0632: m_prefix_ = prefix;
0633: return true;
0634: }
0635: return false;
0636: }
0637:
0638: /**
0639: * Sets the variant factorized name data
0640: * @param string variant factorized name data
0641: * @return true if values are set
0642: */
0643: boolean setFactorString(byte string[]) {
0644: // factor and variant string can be empty for things like
0645: // hanggul code points
0646: m_factorstring_ = string;
0647: return true;
0648: }
0649:
0650: /**
0651: * Checks if code point lies in Algorithm object at index
0652: * @param ch code point
0653: */
0654: boolean contains(int ch) {
0655: return m_rangestart_ <= ch && ch <= m_rangeend_;
0656: }
0657:
0658: /**
0659: * Appends algorithm name of code point into StringBuffer.
0660: * Note this method does not check for validity of code point in Algorithm,
0661: * result is undefined if code point does not belong in Algorithm.
0662: * @param ch code point
0663: * @param str StringBuffer to append to
0664: */
0665: void appendName(int ch, StringBuffer str) {
0666: str.append(m_prefix_);
0667: switch (m_type_) {
0668: case TYPE_0_:
0669: // prefix followed by hex digits indicating variants
0670: Utility.hex(ch, m_variant_, str);
0671: break;
0672: case TYPE_1_:
0673: // prefix followed by factorized-elements
0674: int offset = ch - m_rangestart_;
0675: int indexes[] = m_utilIntBuffer_;
0676: int factor;
0677:
0678: // write elements according to the factors
0679: // the factorized elements are determined by modulo
0680: // arithmetic
0681: synchronized (m_utilIntBuffer_) {
0682: for (int i = m_variant_ - 1; i > 0; i--) {
0683: factor = m_factor_[i] & 0x00FF;
0684: indexes[i] = offset % factor;
0685: offset /= factor;
0686: }
0687:
0688: // we don't need to calculate the last modulus because
0689: // start <= code <= end guarantees here that
0690: // code <= factors[0]
0691: indexes[0] = offset;
0692:
0693: // joining up the factorized strings
0694: str.append(getFactorString(indexes, m_variant_));
0695: }
0696: break;
0697: }
0698: }
0699:
0700: /**
0701: * Gets the character for the argument algorithmic name
0702: * @return the algorithmic char or -1 otherwise.
0703: */
0704: int getChar(String name) {
0705: int prefixlen = m_prefix_.length();
0706: if (name.length() < prefixlen
0707: || !m_prefix_.equals(name.substring(0, prefixlen))) {
0708: return -1;
0709: }
0710:
0711: switch (m_type_) {
0712: case TYPE_0_:
0713: try {
0714: int result = Integer.parseInt(name
0715: .substring(prefixlen), 16);
0716: // does it fit into the range?
0717: if (m_rangestart_ <= result
0718: && result <= m_rangeend_) {
0719: return result;
0720: }
0721: } catch (NumberFormatException e) {
0722: return -1;
0723: }
0724: break;
0725: case TYPE_1_:
0726: // repetitative suffix name comparison done here
0727: // offset is the character code - start
0728: for (int ch = m_rangestart_; ch <= m_rangeend_; ch++) {
0729: int offset = ch - m_rangestart_;
0730: int indexes[] = m_utilIntBuffer_;
0731: int factor;
0732:
0733: // write elements according to the factors
0734: // the factorized elements are determined by modulo
0735: // arithmetic
0736: synchronized (m_utilIntBuffer_) {
0737: for (int i = m_variant_ - 1; i > 0; i--) {
0738: factor = m_factor_[i] & 0x00FF;
0739: indexes[i] = offset % factor;
0740: offset /= factor;
0741: }
0742:
0743: // we don't need to calculate the last modulus
0744: // because start <= code <= end guarantees here that
0745: // code <= factors[0]
0746: indexes[0] = offset;
0747:
0748: // joining up the factorized strings
0749: if (compareFactorString(indexes, m_variant_,
0750: name, prefixlen)) {
0751: return ch;
0752: }
0753: }
0754: }
0755: }
0756:
0757: return -1;
0758: }
0759:
0760: /**
0761: * Adds all chars in the set of algorithmic names into the set.
0762: * Equivalent to part of calcAlgNameSetsLengths.
0763: * @param set int set to add the chars of the algorithm names into
0764: * @param maxlength maximum length to compare to
0765: * @return the length that is either maxlength of the length of this
0766: * algorithm name if it is longer than maxlength
0767: */
0768: int add(int set[], int maxlength) {
0769: // prefix length
0770: int length = UCharacterName.add(set, m_prefix_);
0771: switch (m_type_) {
0772: case TYPE_0_: {
0773: // name = prefix + (range->variant times) hex-digits
0774: // prefix
0775: length += m_variant_;
0776: /* synwee to check
0777: * addString(set, (const char *)(range + 1))
0778: + range->variant;*/
0779: break;
0780: }
0781: case TYPE_1_: {
0782: // name = prefix factorized-elements
0783: // get the set and maximum factor suffix length for each
0784: // factor
0785: for (int i = m_variant_ - 1; i > 0; i--) {
0786: int maxfactorlength = 0;
0787: int count = 0;
0788: for (int factor = m_factor_[i]; factor > 0; --factor) {
0789: synchronized (m_utilStringBuffer_) {
0790: m_utilStringBuffer_.delete(0,
0791: m_utilStringBuffer_.length());
0792: count = UCharacterUtility
0793: .getNullTermByteSubString(
0794: m_utilStringBuffer_,
0795: m_factorstring_, count);
0796: UCharacterName
0797: .add(set, m_utilStringBuffer_);
0798: if (m_utilStringBuffer_.length() > maxfactorlength) {
0799: maxfactorlength = m_utilStringBuffer_
0800: .length();
0801: }
0802: }
0803: }
0804: length += maxfactorlength;
0805: }
0806: }
0807: }
0808: if (length > maxlength) {
0809: return length;
0810: }
0811: return maxlength;
0812: }
0813:
0814: // private data members ------------------------------------------
0815:
0816: /**
0817: * Algorithmic data information
0818: */
0819: private int m_rangestart_;
0820: private int m_rangeend_;
0821: private byte m_type_;
0822: private byte m_variant_;
0823: private char m_factor_[];
0824: private String m_prefix_;
0825: private byte m_factorstring_[];
0826: /**
0827: * Utility StringBuffer
0828: */
0829: private StringBuffer m_utilStringBuffer_ = new StringBuffer();
0830: /**
0831: * Utility int buffer
0832: */
0833: private int m_utilIntBuffer_[] = new int[256];
0834:
0835: // private methods -----------------------------------------------
0836:
0837: /**
0838: * Gets the indexth string in each of the argument factor block
0839: * @param index array with each index corresponding to each factor block
0840: * @param length length of the array index
0841: * @return the combined string of the array of indexth factor string in
0842: * factor block
0843: */
0844: private String getFactorString(int index[], int length) {
0845: int size = m_factor_.length;
0846: if (index == null || length != size) {
0847: return null;
0848: }
0849:
0850: synchronized (m_utilStringBuffer_) {
0851: m_utilStringBuffer_.delete(0, m_utilStringBuffer_
0852: .length());
0853: int count = 0;
0854: int factor;
0855: size--;
0856: for (int i = 0; i <= size; i++) {
0857: factor = m_factor_[i];
0858: count = UCharacterUtility
0859: .skipNullTermByteSubString(m_factorstring_,
0860: count, index[i]);
0861: count = UCharacterUtility
0862: .getNullTermByteSubString(
0863: m_utilStringBuffer_,
0864: m_factorstring_, count);
0865: if (i != size) {
0866: count = UCharacterUtility
0867: .skipNullTermByteSubString(
0868: m_factorstring_, count, factor
0869: - index[i] - 1);
0870: }
0871: }
0872: return m_utilStringBuffer_.toString();
0873: }
0874: }
0875:
0876: /**
0877: * Compares the indexth string in each of the argument factor block with
0878: * the argument string
0879: * @param index array with each index corresponding to each factor block
0880: * @param length index array length
0881: * @param str string to compare with
0882: * @param offset of str to start comparison
0883: * @return true if string matches
0884: */
0885: private boolean compareFactorString(int index[], int length,
0886: String str, int offset) {
0887: int size = m_factor_.length;
0888: if (index == null || length != size)
0889: return false;
0890:
0891: int count = 0;
0892: int strcount = offset;
0893: int factor;
0894: size--;
0895: for (int i = 0; i <= size; i++) {
0896: factor = m_factor_[i];
0897: count = UCharacterUtility.skipNullTermByteSubString(
0898: m_factorstring_, count, index[i]);
0899: strcount = UCharacterUtility
0900: .compareNullTermByteSubString(str,
0901: m_factorstring_, strcount, count);
0902: if (strcount < 0) {
0903: return false;
0904: }
0905:
0906: if (i != size) {
0907: count = UCharacterUtility
0908: .skipNullTermByteSubString(m_factorstring_,
0909: count, factor - index[i]);
0910: }
0911: }
0912: if (strcount != str.length()) {
0913: return false;
0914: }
0915: return true;
0916: }
0917: }
0918:
0919: // package private data members --------------------------------------
0920:
/**
 * Size of each group record in the group information array, in chars.
 * Assigned by setGroupCountSize.
 */
int m_groupsize_ = 0;
0925:
0926: // package private methods --------------------------------------------
0927:
0928: /**
0929: * Sets the token data
0930: * @param token array of tokens
0931: * @param tokenstring array of string values of the tokens
0932: * @return false if there is a data error
0933: */
0934: boolean setToken(char token[], byte tokenstring[]) {
0935: if (token != null && tokenstring != null && token.length > 0
0936: && tokenstring.length > 0) {
0937: m_tokentable_ = token;
0938: m_tokenstring_ = tokenstring;
0939: return true;
0940: }
0941: return false;
0942: }
0943:
0944: /**
0945: * Set the algorithm name information array
0946: * @param alg Algorithm information array
0947: * @return true if the group string offset has been set correctly
0948: */
0949: boolean setAlgorithm(AlgorithmName alg[]) {
0950: if (alg != null && alg.length != 0) {
0951: m_algorithm_ = alg;
0952: return true;
0953: }
0954: return false;
0955: }
0956:
0957: /**
0958: * Sets the number of group and size of each group in number of char
0959: * @param count number of groups
0960: * @param size size of group in char
0961: * @return true if group size is set correctly
0962: */
0963: boolean setGroupCountSize(int count, int size) {
0964: if (count <= 0 || size <= 0) {
0965: return false;
0966: }
0967: m_groupcount_ = count;
0968: m_groupsize_ = size;
0969: return true;
0970: }
0971:
0972: /**
0973: * Sets the group name data
0974: * @param group index information array
0975: * @param groupstring name information array
0976: * @return false if there is a data error
0977: */
0978: boolean setGroup(char group[], byte groupstring[]) {
0979: if (group != null && groupstring != null && group.length > 0
0980: && groupstring.length > 0) {
0981: m_groupinfo_ = group;
0982: m_groupstring_ = groupstring;
0983: return true;
0984: }
0985: return false;
0986: }
0987:
0988: // private data members ----------------------------------------------
0989:
/**
 * Data used in unames.icu.
 * The token arrays are assigned by setToken, the group arrays by
 * setGroup and the algorithm array by setAlgorithm.
 */
private char m_tokentable_[];
private byte m_tokenstring_[];
private char m_groupinfo_[];
private byte m_groupstring_[];
private AlgorithmName m_algorithm_[];

/**
 * Group use.
 * Scratch arrays that getGroupName(int, int) passes to getGroupLengths;
 * they hold one entry more than LINES_PER_GROUP_.
 */
private char m_groupoffsets_[] = new char[LINES_PER_GROUP_ + 1];
private char m_grouplengths_[] = new char[LINES_PER_GROUP_ + 1];

/**
 * Default name of the name datafile
 */
private static final String NAME_FILE_NAME_ = ICUResourceBundle.ICU_BUNDLE
        + "/unames.icu";
/**
 * Shift count to retrieve group information
 */
private static final int GROUP_SHIFT_ = 5;
/**
 * Mask to retrieve the offset for a particular character within a group
 */
private static final int GROUP_MASK_ = LINES_PER_GROUP_ - 1;
/**
 * Default buffer size of datafile
 */
private static final int NAME_BUFFER_SIZE_ = 100000;

/**
 * Position of offsethigh in a group record of the group information array
 */
private static final int OFFSET_HIGH_OFFSET_ = 1;

/**
 * Position of offsetlow in a group record of the group information array
 */
private static final int OFFSET_LOW_OFFSET_ = 2;
/**
 * Double nibble indicator, any nibble > this number has to be combined
 * with its following nibble
 */
private static final int SINGLE_NIBBLE_MAX_ = 11;

/*
 * Maximum length of character names (regular & 1.0).
 */
//private static int MAX_NAME_LENGTH_ = 0;
/*
 * Maximum length of ISO comments.
 */
//private static int MAX_ISO_COMMENT_LENGTH_ = 0;
/**
 * Set of chars used in character names (regular & 1.0).
 * Chars are platform-dependent (can be EBCDIC).
 */
private int m_nameSet_[] = new int[8];
/**
 * Set of chars used in ISO comments. (regular & 1.0).
 * Chars are platform-dependent (can be EBCDIC).
 */
private int m_ISOCommentSet_[] = new int[8];
/**
 * Utility StringBuffer, shared by the name building methods, which
 * synchronize on it while they use it
 */
private StringBuffer m_utilStringBuffer_ = new StringBuffer();
/**
 * Utility int buffer
 */
private int m_utilIntBuffer_[] = new int[2];
/**
 * Maximum ISO comment length, returned by getMaxISOCommentLength
 */
private int m_maxISOCommentLength_;
/**
 * Maximum name length, returned by getMaxCharNameLength
 */
private int m_maxNameLength_;
/**
 * Singleton instance, created on first use by getInstance
 */
private static UCharacterName INSTANCE_ = null;
1076: /**
1077: * Type names used for extended names
1078: */
1079: private static final String TYPE_NAMES_[] = { "unassigned",
1080: "uppercase letter", "lowercase letter", "titlecase letter",
1081: "modifier letter", "other letter", "non spacing mark",
1082: "enclosing mark", "combining spacing mark",
1083: "decimal digit number", "letter number", "other number",
1084: "space separator", "line separator", "paragraph separator",
1085: "control", "format", "private use area", "surrogate",
1086: "dash punctuation", "start punctuation", "end punctuation",
1087: "connector punctuation", "other punctuation",
1088: "math symbol", "currency symbol", "modifier symbol",
1089: "other symbol", "initial punctuation", "final punctuation",
1090: "noncharacter", "lead surrogate", "trail surrogate" };
1091: /**
1092: * Unknown type name
1093: */
1094: private static final String UNKNOWN_TYPE_NAME_ = "unknown";
1095: /**
1096: * Not a character type
1097: */
1098: private static final int NON_CHARACTER_ = UCharacterCategory.CHAR_CATEGORY_COUNT;
1099: /**
1100: * Lead surrogate type
1101: */
1102: private static final int LEAD_SURROGATE_ = UCharacterCategory.CHAR_CATEGORY_COUNT + 1;
1103: /**
1104: * Trail surrogate type
1105: */
1106: private static final int TRAIL_SURROGATE_ = UCharacterCategory.CHAR_CATEGORY_COUNT + 2;
1107: /**
1108: * Extended category count
1109: */
1110: static final int EXTENDED_CATEGORY_ = UCharacterCategory.CHAR_CATEGORY_COUNT + 3;
1111:
1112: // private constructor ------------------------------------------------
1113:
1114: /**
1115: * <p>Protected constructor for use in UCharacter.</p>
1116: * @exception IOException thrown when data reading fails
1117: */
1118: private UCharacterName() throws IOException {
1119: InputStream is = ICUData.getRequiredStream(NAME_FILE_NAME_);
1120: BufferedInputStream b = new BufferedInputStream(is,
1121: NAME_BUFFER_SIZE_);
1122: UCharacterNameReader reader = new UCharacterNameReader(b);
1123: reader.read(this );
1124: b.close();
1125: }
1126:
1127: // private methods ---------------------------------------------------
1128:
1129: /**
1130: * Gets the algorithmic name for the argument character
1131: * @param ch character to determine name for
1132: * @param choice name choice
1133: * @return the algorithmic name or null if not found
1134: */
1135: private String getAlgName(int ch, int choice) {
1136: // Do not write algorithmic Unicode 1.0 names because Unihan names are
1137: // the same as the modern ones, extension A was only introduced with
1138: // Unicode 3.0, and the Hangul syllable block was moved and changed
1139: // around Unicode 1.1.5.
1140: if (choice != UCharacterNameChoice.UNICODE_10_CHAR_NAME) {
1141: // index in terms integer index
1142: synchronized (m_utilStringBuffer_) {
1143: m_utilStringBuffer_.delete(0, m_utilStringBuffer_
1144: .length());
1145:
1146: for (int index = m_algorithm_.length - 1; index >= 0; index--) {
1147: if (m_algorithm_[index].contains(ch)) {
1148: m_algorithm_[index].appendName(ch,
1149: m_utilStringBuffer_);
1150: return m_utilStringBuffer_.toString();
1151: }
1152: }
1153: }
1154: }
1155: return null;
1156: }
1157:
1158: /**
1159: * Getting the character with the tokenized argument name
1160: * @param name of the character
1161: * @return character with the tokenized argument name or -1 if character
1162: * is not found
1163: */
1164: private synchronized int getGroupChar(String name, int choice) {
1165: for (int i = 0; i < m_groupcount_; i++) {
1166: // populating the data set of grouptable
1167:
1168: int startgpstrindex = getGroupLengths(i, m_groupoffsets_,
1169: m_grouplengths_);
1170:
1171: // shift out to function
1172: int result = getGroupChar(startgpstrindex, m_grouplengths_,
1173: name, choice);
1174: if (result != -1) {
1175: return (m_groupinfo_[i * m_groupsize_] << GROUP_SHIFT_)
1176: | result;
1177: }
1178: }
1179: return -1;
1180: }
1181:
1182: /**
1183: * Compares and retrieve character if name is found within the argument
1184: * group
1185: * @param index index where the set of names reside in the group block
1186: * @param length list of lengths of the strings
1187: * @param name character name to search for
1188: * @param choice of either 1.0 or the most current unicode name
1189: * @return relative character in the group which matches name, otherwise if
1190: * not found, -1 will be returned
1191: */
1192: private int getGroupChar(int index, char length[], String name,
1193: int choice) {
1194: byte b = 0;
1195: char token;
1196: int len;
1197: int namelen = name.length();
1198: int nindex;
1199: int count;
1200:
1201: for (int result = 0; result <= LINES_PER_GROUP_; result++) {
1202: nindex = 0;
1203: len = length[result];
1204:
1205: if (choice == UCharacterNameChoice.UNICODE_10_CHAR_NAME) {
1206: int oldindex = index;
1207: index += UCharacterUtility.skipByteSubString(
1208: m_groupstring_, index, len, (byte) ';');
1209: len -= (index - oldindex);
1210: }
1211:
1212: // number of tokens is > the length of the name
1213: // write each letter directly, and write a token word per token
1214: for (count = 0; count < len && nindex != -1
1215: && nindex < namelen;) {
1216: b = m_groupstring_[index + count];
1217: count++;
1218:
1219: if (b >= m_tokentable_.length) {
1220: if (name.charAt(nindex++) != (b & 0xFF)) {
1221: nindex = -1;
1222: }
1223: } else {
1224: token = m_tokentable_[b & 0xFF];
1225: if (token == 0xFFFE) {
1226: // this is a lead byte for a double-byte token
1227: token = m_tokentable_[b << 8
1228: | (m_groupstring_[index + count] & 0x00ff)];
1229: count++;
1230: }
1231: if (token == 0xFFFF) {
1232: if (name.charAt(nindex++) != (b & 0xFF)) {
1233: nindex = -1;
1234: }
1235: } else {
1236: // compare token with name
1237: nindex = UCharacterUtility
1238: .compareNullTermByteSubString(name,
1239: m_tokenstring_, nindex, token);
1240: }
1241: }
1242: }
1243:
1244: if (namelen == nindex
1245: && (count == len || m_groupstring_[index + count] == ';')) {
1246: return result;
1247: }
1248:
1249: index += len;
1250: }
1251: return -1;
1252: }
1253:
1254: /**
1255: * Gets the character extended type
1256: * @param ch character to be tested
1257: * @return extended type it is associated with
1258: */
1259: private static int getType(int ch) {
1260: if (UCharacterUtility.isNonCharacter(ch)) {
1261: // not a character we return a invalid category count
1262: return NON_CHARACTER_;
1263: }
1264: int result = UCharacter.getType(ch);
1265: if (result == UCharacterCategory.SURROGATE) {
1266: if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
1267: result = LEAD_SURROGATE_;
1268: } else {
1269: result = TRAIL_SURROGATE_;
1270: }
1271: }
1272: return result;
1273: }
1274:
1275: /**
1276: * Getting the character with extended name of the form <....>.
1277: * @param name of the character to be found
1278: * @param choice name choice
1279: * @return character associated with the name, -1 if such character is not
1280: * found and -2 if we should continue with the search.
1281: */
1282: private static int getExtendedChar(String name, int choice) {
1283: if (name.charAt(0) == '<') {
1284: if (choice == UCharacterNameChoice.EXTENDED_CHAR_NAME) {
1285: int endIndex = name.length() - 1;
1286: if (name.charAt(endIndex) == '>') {
1287: int startIndex = name.lastIndexOf('-');
1288: if (startIndex >= 0) { // We've got a category.
1289: startIndex++;
1290: int result = -1;
1291: try {
1292: result = Integer.parseInt(name.substring(
1293: startIndex, endIndex), 16);
1294: } catch (NumberFormatException e) {
1295: return -1;
1296: }
1297: // Now validate the category name. We could use a
1298: // binary search, or a trie, if we really wanted to.
1299: String type = name.substring(1, startIndex - 1);
1300: int length = TYPE_NAMES_.length;
1301: for (int i = 0; i < length; ++i) {
1302: if (type.compareTo(TYPE_NAMES_[i]) == 0) {
1303: if (getType(result) == i) {
1304: return result;
1305: }
1306: break;
1307: }
1308: }
1309: }
1310: }
1311: }
1312: return -1;
1313: }
1314: return -2;
1315: }
1316:
1317: // sets of name characters, maximum name lengths -----------------------
1318:
1319: /**
1320: * Adds a codepoint into a set of ints.
1321: * Equivalent to SET_ADD.
1322: * @param set set to add to
1323: * @param ch 16 bit char to add
1324: */
1325: private static void add(int set[], char ch) {
1326: set[ch >>> 5] |= 1 << (ch & 0x1f);
1327: }
1328:
1329: /**
1330: * Checks if a codepoint is a part of a set of ints.
1331: * Equivalent to SET_CONTAINS.
1332: * @param set set to check in
1333: * @param ch 16 bit char to check
1334: * @return true if codepoint is part of the set, false otherwise
1335: */
1336: private static boolean contains(int set[], char ch) {
1337: return (set[ch >>> 5] & (1 << (ch & 0x1f))) != 0;
1338: }
1339:
1340: /**
1341: * Adds all characters of the argument str and gets the length
1342: * Equivalent to calcStringSetLength.
1343: * @param set set to add all chars of str to
1344: * @param str string to add
1345: */
1346: private static int add(int set[], String str) {
1347: int result = str.length();
1348:
1349: for (int i = result - 1; i >= 0; i--) {
1350: add(set, str.charAt(i));
1351: }
1352: return result;
1353: }
1354:
1355: /**
1356: * Adds all characters of the argument str and gets the length
1357: * Equivalent to calcStringSetLength.
1358: * @param set set to add all chars of str to
1359: * @param str string to add
1360: */
1361: private static int add(int set[], StringBuffer str) {
1362: int result = str.length();
1363:
1364: for (int i = result - 1; i >= 0; i--) {
1365: add(set, str.charAt(i));
1366: }
1367: return result;
1368: }
1369:
1370: /**
1371: * Adds all algorithmic names into the name set.
1372: * Equivalent to part of calcAlgNameSetsLengths.
1373: * @param maxlength length to compare to
1374: * @return the maximum length of any possible algorithmic name if it is >
1375: * maxlength, otherwise maxlength is returned.
1376: */
1377: private int addAlgorithmName(int maxlength) {
1378: int result = 0;
1379: for (int i = m_algorithm_.length - 1; i >= 0; i--) {
1380: result = m_algorithm_[i].add(m_nameSet_, maxlength);
1381: if (result > maxlength) {
1382: maxlength = result;
1383: }
1384: }
1385: return maxlength;
1386: }
1387:
1388: /**
1389: * Adds all extended names into the name set.
1390: * Equivalent to part of calcExtNameSetsLengths.
1391: * @param maxlength length to compare to
1392: * @return the maxlength of any possible extended name.
1393: */
1394: private int addExtendedName(int maxlength) {
1395: for (int i = TYPE_NAMES_.length - 1; i >= 0; i--) {
1396: // for each category, count the length of the category name
1397: // plus 9 =
1398: // 2 for <>
1399: // 1 for -
1400: // 6 for most hex digits per code point
1401: int length = 9 + add(m_nameSet_, TYPE_NAMES_[i]);
1402: if (length > maxlength) {
1403: maxlength = length;
1404: }
1405: }
1406: return maxlength;
1407: }
1408:
1409: /**
1410: * Adds names of a group to the argument set.
1411: * Equivalent to calcNameSetLength.
1412: * @param offset of the group name string in byte count
1413: * @param length of the group name string
1414: * @param tokenlength array to store the length of each token
1415: * @param set to add to
1416: * @return the length of the name string and the length of the group
1417: * string parsed
1418: */
1419: private int[] addGroupName(int offset, int length,
1420: byte tokenlength[], int set[]) {
1421: int resultnlength = 0;
1422: int resultplength = 0;
1423: while (resultplength < length) {
1424: char b = (char) (m_groupstring_[offset + resultplength] & 0xff);
1425: resultplength++;
1426: if (b == ';') {
1427: break;
1428: }
1429:
1430: if (b >= m_tokentable_.length) {
1431: add(set, b); // implicit letter
1432: resultnlength++;
1433: } else {
1434: char token = m_tokentable_[b & 0x00ff];
1435: if (token == 0xFFFE) {
1436: // this is a lead byte for a double-byte token
1437: b = (char) (b << 8 | (m_groupstring_[offset
1438: + resultplength] & 0x00ff));
1439: token = m_tokentable_[b];
1440: resultplength++;
1441: }
1442: if (token == 0xFFFF) {
1443: add(set, b);
1444: resultnlength++;
1445: } else {
1446: // count token word
1447: // use cached token length
1448: byte tlength = tokenlength[b];
1449: if (tlength == 0) {
1450: synchronized (m_utilStringBuffer_) {
1451: m_utilStringBuffer_.delete(0,
1452: m_utilStringBuffer_.length());
1453: UCharacterUtility.getNullTermByteSubString(
1454: m_utilStringBuffer_,
1455: m_tokenstring_, token);
1456: tlength = (byte) add(set,
1457: m_utilStringBuffer_);
1458: }
1459: tokenlength[b] = tlength;
1460: }
1461: resultnlength += tlength;
1462: }
1463: }
1464: }
1465: m_utilIntBuffer_[0] = resultnlength;
1466: m_utilIntBuffer_[1] = resultplength;
1467: return m_utilIntBuffer_;
1468: }
1469:
1470: /**
1471: * Adds names of all group to the argument set.
1472: * Sets the data member m_max*Length_.
1473: * Method called only once.
1474: * Equivalent to calcGroupNameSetsLength.
1475: * @param maxlength length to compare to
1476: */
1477: private void addGroupName(int maxlength) {
1478: int maxisolength = 0;
1479: char offsets[] = new char[LINES_PER_GROUP_ + 2];
1480: char lengths[] = new char[LINES_PER_GROUP_ + 2];
1481: byte tokenlengths[] = new byte[m_tokentable_.length];
1482:
1483: // enumerate all groups
1484: // for (int i = m_groupcount_ - 1; i >= 0; i --) {
1485: for (int i = 0; i < m_groupcount_; i++) {
1486: int offset = getGroupLengths(i, offsets, lengths);
1487: // enumerate all lines in each group
1488: // for (int linenumber = LINES_PER_GROUP_ - 1; linenumber >= 0;
1489: // linenumber --) {
1490: for (int linenumber = 0; linenumber < LINES_PER_GROUP_; linenumber++) {
1491: int lineoffset = offset + offsets[linenumber];
1492: int length = lengths[linenumber];
1493: if (length == 0) {
1494: continue;
1495: }
1496:
1497: // read regular name
1498: int parsed[] = addGroupName(lineoffset, length,
1499: tokenlengths, m_nameSet_);
1500: if (parsed[0] > maxlength) {
1501: // 0 for name length
1502: maxlength = parsed[0];
1503: }
1504: lineoffset += parsed[1];
1505: if (parsed[1] >= length) {
1506: // 1 for parsed group string length
1507: continue;
1508: }
1509: length -= parsed[1];
1510: // read Unicode 1.0 name
1511: parsed = addGroupName(lineoffset, length, tokenlengths,
1512: m_nameSet_);
1513: if (parsed[0] > maxlength) {
1514: // 0 for name length
1515: maxlength = parsed[0];
1516: }
1517: lineoffset += parsed[1];
1518: if (parsed[1] >= length) {
1519: // 1 for parsed group string length
1520: continue;
1521: }
1522: length -= parsed[1];
1523: // read ISO comment
1524: parsed = addGroupName(lineoffset, length, tokenlengths,
1525: m_ISOCommentSet_);
1526: if (parsed[1] > maxisolength) {
1527: maxisolength = length;
1528: }
1529: }
1530: }
1531:
1532: // set gMax... - name length last for threading
1533: m_maxISOCommentLength_ = maxisolength;
1534: m_maxNameLength_ = maxlength;
1535: }
1536:
1537: /**
1538: * Sets up the name sets and the calculation of the maximum lengths.
1539: * Equivalent to calcNameSetsLengths.
1540: */
1541: private boolean initNameSetsLengths() {
1542: if (m_maxNameLength_ > 0) {
1543: return true;
1544: }
1545:
1546: String extra = "0123456789ABCDEF<>-";
1547: // set hex digits, used in various names, and <>-, used in extended
1548: // names
1549: for (int i = extra.length() - 1; i >= 0; i--) {
1550: add(m_nameSet_, extra.charAt(i));
1551: }
1552:
1553: // set sets and lengths from algorithmic names
1554: m_maxNameLength_ = addAlgorithmName(0);
1555: // set sets and lengths from extended names
1556: m_maxNameLength_ = addExtendedName(m_maxNameLength_);
1557: // set sets and lengths from group names, set global maximum values
1558: addGroupName(m_maxNameLength_);
1559: return true;
1560: }
1561:
1562: /**
1563: * Converts the char set cset into a Unicode set uset.
1564: * Equivalent to charSetToUSet.
1565: * @param set Set of 256 bit flags corresponding to a set of chars.
1566: * @param uset USet to receive characters. Existing contents are deleted.
1567: */
1568: private void convert(int set[], UnicodeSet uset) {
1569: uset.clear();
1570: if (!initNameSetsLengths()) {
1571: return;
1572: }
1573:
1574: // build a char string with all chars that are used in character names
1575: for (char c = 255; c > 0; c--) {
1576: if (contains(set, c)) {
1577: uset.add(c);
1578: }
1579: }
1580: }
1581: }
|