001: /*
002: * @(#)XmlChars.java 1.1 00/08/05
003: *
004: * Copyright (c) 1998 Sun Microsystems, Inc. All Rights Reserved.
005: */
006:
007: package com.sun.xml.dtdparser;
008:
009: /**
010: * Methods in this class are used to determine whether characters may
011: * appear in certain roles in XML documents. Such methods are used
012: * both to parse and to create such documents.
013: *
014: * @author David Brownell
015: * @version 1.1, 00/08/05
016: */
public class XmlChars {
    /** Private so the class, which only hosts static predicates, cannot be instantiated. */
    private XmlChars() {
    }

    /**
     * Tests whether a UCS-4 character code is permitted in an XML document.
     * The accepted codes are tab, line feed, carriage return, and the ranges
     * 0x0020-0xD7FF, 0xE000-0xFFFD and 0x10000-0x10FFFF; the surrogate block
     * and every other value (including negative ones) are rejected.
     *
     * <p>A character reference such as <code>&amp;#x12345678;</code> names a
     * code outside those ranges, so this method returns false for it.
     *
     * @param ucs4char The 32-bit UCS-4 character being tested.
     * @return true if the code matches the XML <em>Char</em> production.
     */
    public static boolean isChar(int ucs4char) {
        // [2] Char ::= #x0009 | #x000A | #x000D
        //            | [#x0020-#xD7FF]
        //            | [#xE000-#xFFFD]
        //            | [#x10000-#x10FFFF]
        if (ucs4char < 0x0020) {
            // Below the space character only three control codes are legal.
            return ucs4char == 0x0009 || ucs4char == 0x000A
                    || ucs4char == 0x000D;
        }
        if (ucs4char <= 0xD7FF) {
            return true;
        }
        if (ucs4char < 0xE000) {
            // 0xD800-0xDFFF: surrogates are excluded.
            return false;
        }
        if (ucs4char <= 0xFFFD) {
            return true;
        }
        return ucs4char >= 0x10000 && ucs4char <= 0x10ffff;
    }

    /**
     * Tests whether the character may appear after the first position of
     * an XML name.
     *
     * @param c the character to classify
     * @return true if the character is accepted as a name character
     * @see #isNCNameChar(char)
     * @see #isLetter(char)
     */
    public static boolean isNameChar(char c) {
        // [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':'
        //                | CombiningChar | Extender

        // Letters, digits and combining characters are all recognised here.
        if (isLetter2(c)) {
            return true;
        }
        switch (c) {
            case '.':
            case '-':
            case '_':
            case ':':
                return true;
            case '>':
                // Cheap rejection of a character that commonly follows a name.
                return false;
            default:
                return isExtender(c);
        }
    }

    /**
     * Tests whether the character may appear after the first position of
     * an unscoped (namespace-aware) name. This is the same test as
     * {@link #isNameChar(char)} except that the colon is always rejected.
     *
     * @param c the character to classify
     * @return true if the character is accepted and is not a colon
     * @see #isNameChar(char)
     * @see #isLetter(char)
     */
    public static boolean isNCNameChar(char c) {
        // [NC 5] NCNameChar ::= Letter | Digit | '.' | '-' | '_'
        //                     | CombiningChar | Extender
        if (c == ':') {
            return false;
        }
        return isNameChar(c);
    }

    /**
     * Tests whether the character is one of the four XML whitespace
     * characters: space, tab, line feed or carriage return.
     *
     * @param c the character to classify
     * @return true for the four whitespace characters, false otherwise
     */
    public static boolean isSpace(char c) {
        switch (c) {
            case ' ':
            case '\t':
            case '\n':
            case '\r':
                return true;
            default:
                return false;
        }
    }

    /*
     * The classification below is driven by the general category reported
     * by java.lang.Character.getType(), compared against the category
     * constants of java.lang.Character (Lu, Ll, Lt, Lm, Lo, Mn, Me, Mc,
     * Nd, Nl), plus a handful of explicit code point exceptions.
     */

    /**
     * Tests whether the character is an XML "letter", i.e. acceptable as
     * a name start character by this class. Later characters of a name only
     * need to satisfy {@link #isNameChar(char)}.
     *
     * @param c the character to classify
     * @return true if the character is treated as a letter
     * @see #isNameChar(char)
     * @see #isNCNameChar(char)
     */
    public static boolean isLetter(char c) {
        // [84] Letter ::= BaseChar | Ideographic

        // Fast paths for the most common input.
        if (isAsciiLetter(c)) {
            return true;
        }
        if (c == '/') {
            return false;
        }

        // Rather than encode the specification's tables, classify by
        // Unicode general category and then apply the exceptions.
        if (isNameStartCategory(Character.getType(c))) {
            return passesNameExclusions(c);
        }

        // A few characters outside those categories still count as letters.
        return between(c, 0x02bb, 0x02c1) || c == 0x0559
                || c == 0x06e5 || c == 0x06e6;
    }

    /** Returns true for the unaccented Latin letters a-z and A-Z. */
    private static boolean isAsciiLetter(char c) {
        return between(c, 'a', 'z') || between(c, 'A', 'Z');
    }

    /** Returns true if {@code lo <= c <= hi}. */
    private static boolean between(char c, int lo, int hi) {
        return c >= lo && c <= hi;
    }

    /** Returns true for the general categories Ll, Lu, Lo, Lt and Nl. */
    private static boolean isNameStartCategory(int type) {
        return type == Character.LOWERCASE_LETTER
                || type == Character.UPPERCASE_LETTER
                || type == Character.OTHER_LETTER
                || type == Character.TITLECASE_LETTER
                || type == Character.LETTER_NUMBER;
    }

    /** Returns true for the general categories Mc, Me, Mn, Lm and Nd. */
    private static boolean isNameContinuationCategory(int type) {
        return type == Character.COMBINING_SPACING_MARK
                || type == Character.ENCLOSING_MARK
                || type == Character.NON_SPACING_MARK
                || type == Character.MODIFIER_LETTER
                || type == Character.DECIMAL_DIGIT_NUMBER;
    }

    /**
     * Applies the exceptions shared by the letter tests: the character must
     * not be a compatibility character and must not be one of the combiners
     * in the range 0x20DD-0x20E0.
     */
    private static boolean passesNameExclusions(char c) {
        if (isCompatibilityChar(c)) {
            return false;
        }
        return !between(c, 0x20dd, 0x20e0);
    }

    /**
     * Tests whether the character is one of the "compatibility" characters
     * that this class keeps out of names. The high byte of the character
     * selects which group of comparisons has to be evaluated, so only a
     * small part of the table is consulted for any given character.
     */
    private static boolean isCompatibilityChar(char c) {
        int page = c >>> 8;

        // 0xF900-0xFFFF: the whole area is treated as compatibility.
        if (page >= 0xf9) {
            return true;
        }

        switch (page) {
            case 0x00:
                // a few characters in ISO Latin/1
                return c == 0x00aa || c == 0x00b5 || c == 0x00ba;

            case 0x01:
                // Latin Extended A and (parts of) B
                return between(c, 0x0132, 0x0133)
                        || between(c, 0x013f, 0x0140)
                        || c == 0x0149 || c == 0x017f
                        || between(c, 0x01c4, 0x01cc)
                        || between(c, 0x01f1, 0x01f3);

            case 0x02:
                // some spacing modifiers
                return between(c, 0x02b0, 0x02b8)
                        || between(c, 0x02e0, 0x02e4);

            case 0x03:
                // Greek
                return c == 0x037a;

            case 0x05:
                // Armenian
                return c == 0x0587;

            case 0x0e:
                // Laotian
                return between(c, 0x0edc, 0x0edd);

            case 0x11:
                return isHangulJamoCompatibility(c);

            case 0x20:
                // superscript
                return c == 0x207f;

            case 0x21:
                return isLetterlikeCompatibility(c);

            case 0x30:
                // some Hiragana
                return between(c, 0x309b, 0x309c);

            case 0x31:
                // all Hangul Compatibility Jamo
                return between(c, 0x3131, 0x318e);

            default:
                // everything else is not flagged
                return false;
        }
    }

    /**
     * Compatibility test for the 0x11xx page: large parts of Hangul Jamo
     * are flagged, as isolated code points and as ranges.
     */
    private static boolean isHangulJamoCompatibility(char c) {
        switch (c) {
            case 0x1101:
            case 0x1104:
            case 0x1108:
            case 0x110a:
            case 0x110d:
            case 0x113d:
            case 0x113f:
            case 0x114d:
            case 0x114f:
            case 0x1162:
            case 0x1164:
            case 0x1166:
            case 0x1168:
            case 0x1174:
            case 0x11b9:
            case 0x11bb:
                return true;
            default:
                break;
        }
        return between(c, 0x1113, 0x113b)
                || between(c, 0x1141, 0x114b)
                || between(c, 0x1151, 0x1153)
                || between(c, 0x1156, 0x1158)
                || between(c, 0x116a, 0x116c)
                || between(c, 0x116f, 0x1171)
                || between(c, 0x1176, 0x119d)
                || between(c, 0x119f, 0x11a2)
                || between(c, 0x11a9, 0x11aa)
                || between(c, 0x11ac, 0x11ad)
                || between(c, 0x11b0, 0x11b6)
                || between(c, 0x11c3, 0x11ea)
                || between(c, 0x11ec, 0x11ef)
                || between(c, 0x11f1, 0x11f8);
    }

    /**
     * Compatibility test for the 0x21xx page: various letterlike symbols
     * and the Roman numerals in 0x2160-0x217F.
     */
    private static boolean isLetterlikeCompatibility(char c) {
        switch (c) {
            case 0x2102:
            case 0x2107:
            case 0x2115:
            case 0x2124:
            case 0x2128:
                return true;
            default:
                break;
        }
        return between(c, 0x210a, 0x2113)
                || between(c, 0x2118, 0x211d)
                || between(c, 0x212c, 0x212d)
                || between(c, 0x212f, 0x2138)
                // Roman numerals
                || between(c, 0x2160, 0x217f);
    }

    /**
     * Core of {@link #isNameChar(char)}: accepts everything the letter test
     * accepts by category and, in addition, combining marks, modifier
     * letters and decimal digits, subject to the same exclusions.
     */
    private static boolean isLetter2(char c) {
        // [84] Letter ::= BaseChar | Ideographic
        // [87] CombiningChar ::= ...

        // Fast paths for the most common input.
        if (isAsciiLetter(c)) {
            return true;
        }
        if (c == '>') {
            return false;
        }

        int type = Character.getType(c);
        if (isNameStartCategory(type) || isNameContinuationCategory(type)) {
            return passesNameExclusions(c);
        }

        // One extra character is accepted regardless of its category.
        return c == 0x0387;
    }

    /**
     * Digit test: whatever {@link Character#isDigit(char)} accepts, minus
     * the "fullwidth" digits 0xFF10-0xFF19.
     */
    private static boolean isDigit(char c) {
        // [88] Digit ::= ...
        if (between(c, 0xff10, 0xff19)) {
            return false;
        }
        return Character.isDigit(c);
    }

    /** Tests the character against the fixed list of extender code points. */
    private static boolean isExtender(char c) {
        // [89] Extender ::= ...
        switch (c) {
            case 0x00b7:
            case 0x02d0:
            case 0x02d1:
            case 0x0387:
            case 0x0640:
            case 0x0e46:
            case 0x0ec6:
            case 0x3005:
                return true;
            default:
                return between(c, 0x3031, 0x3035)
                        || between(c, 0x309d, 0x309e)
                        || between(c, 0x30fc, 0x30fe);
        }
    }
}
|