001: /*
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: */
017:
018: package org.apache.xerces.util;
019:
020: import java.util.Arrays;
021:
022: /**
023: * This class defines the basic properties of characters in XML 1.1. The data
024: * in this class can be used to verify that a character is a valid
025: * XML 1.1 character or if the character is a space, name start, or name
026: * character.
027: * <p>
028: * A series of convenience methods are supplied to ease the burden
029: * of the developer. Using the character as an index into the <code>XML11CHARS</code>
030: * array and applying the appropriate mask flag (e.g.
031: * <code>MASK_VALID</code>), yields the same results as calling the
032: * convenience methods. There is one exception: check the comments
033: * for the <code>isValid</code> method for details.
034: *
035: * @author Glenn Marcy, IBM
036: * @author Andy Clark, IBM
037: * @author Arnaud Le Hors, IBM
038: * @author Neil Graham, IBM
039: * @author Michael Glavassevich, IBM
040: *
041: * @version $Id: XML11Char.java 447241 2006-09-18 05:12:57Z mrglavas $
042: */
043: public class XML11Char {
044:
045: //
046: // Constants
047: //
048:
049: /** Character flags for XML 1.1. */
050: private static final byte XML11CHARS[] = new byte[1 << 16];
051:
052: /** XML 1.1 Valid character mask. */
053: public static final int MASK_XML11_VALID = 0x01;
054:
055: /** XML 1.1 Space character mask. */
056: public static final int MASK_XML11_SPACE = 0x02;
057:
058: /** XML 1.1 Name start character mask. */
059: public static final int MASK_XML11_NAME_START = 0x04;
060:
061: /** XML 1.1 Name character mask. */
062: public static final int MASK_XML11_NAME = 0x08;
063:
064: /** XML 1.1 control character mask */
065: public static final int MASK_XML11_CONTROL = 0x10;
066:
067: /** XML 1.1 content for external entities (valid - "special" chars - control chars) */
068: public static final int MASK_XML11_CONTENT = 0x20;
069:
070: /** XML namespaces 1.1 NCNameStart */
071: public static final int MASK_XML11_NCNAME_START = 0x40;
072:
073: /** XML namespaces 1.1 NCName */
074: public static final int MASK_XML11_NCNAME = 0x80;
075:
076: /** XML 1.1 content for internal entities (valid - "special" chars) */
077: public static final int MASK_XML11_CONTENT_INTERNAL = MASK_XML11_CONTROL
078: | MASK_XML11_CONTENT;
079:
080: //
081: // Static initialization
082: //
083:
084: static {
085:
086: // Initializing the Character Flag Array
087: // Code generated by: XML11CharGenerator.
088:
089: Arrays.fill(XML11CHARS, 1, 9, (byte) 17); // Fill 8 of value (byte) 17
090: XML11CHARS[9] = 35;
091: XML11CHARS[10] = 3;
092: Arrays.fill(XML11CHARS, 11, 13, (byte) 17); // Fill 2 of value (byte) 17
093: XML11CHARS[13] = 3;
094: Arrays.fill(XML11CHARS, 14, 32, (byte) 17); // Fill 18 of value (byte) 17
095: XML11CHARS[32] = 35;
096: Arrays.fill(XML11CHARS, 33, 38, (byte) 33); // Fill 5 of value (byte) 33
097: XML11CHARS[38] = 1;
098: Arrays.fill(XML11CHARS, 39, 45, (byte) 33); // Fill 6 of value (byte) 33
099: Arrays.fill(XML11CHARS, 45, 47, (byte) -87); // Fill 2 of value (byte) -87
100: XML11CHARS[47] = 33;
101: Arrays.fill(XML11CHARS, 48, 58, (byte) -87); // Fill 10 of value (byte) -87
102: XML11CHARS[58] = 45;
103: XML11CHARS[59] = 33;
104: XML11CHARS[60] = 1;
105: Arrays.fill(XML11CHARS, 61, 65, (byte) 33); // Fill 4 of value (byte) 33
106: Arrays.fill(XML11CHARS, 65, 91, (byte) -19); // Fill 26 of value (byte) -19
107: Arrays.fill(XML11CHARS, 91, 93, (byte) 33); // Fill 2 of value (byte) 33
108: XML11CHARS[93] = 1;
109: XML11CHARS[94] = 33;
110: XML11CHARS[95] = -19;
111: XML11CHARS[96] = 33;
112: Arrays.fill(XML11CHARS, 97, 123, (byte) -19); // Fill 26 of value (byte) -19
113: Arrays.fill(XML11CHARS, 123, 127, (byte) 33); // Fill 4 of value (byte) 33
114: Arrays.fill(XML11CHARS, 127, 133, (byte) 17); // Fill 6 of value (byte) 17
115: XML11CHARS[133] = 35;
116: Arrays.fill(XML11CHARS, 134, 160, (byte) 17); // Fill 26 of value (byte) 17
117: Arrays.fill(XML11CHARS, 160, 183, (byte) 33); // Fill 23 of value (byte) 33
118: XML11CHARS[183] = -87;
119: Arrays.fill(XML11CHARS, 184, 192, (byte) 33); // Fill 8 of value (byte) 33
120: Arrays.fill(XML11CHARS, 192, 215, (byte) -19); // Fill 23 of value (byte) -19
121: XML11CHARS[215] = 33;
122: Arrays.fill(XML11CHARS, 216, 247, (byte) -19); // Fill 31 of value (byte) -19
123: XML11CHARS[247] = 33;
124: Arrays.fill(XML11CHARS, 248, 768, (byte) -19); // Fill 520 of value (byte) -19
125: Arrays.fill(XML11CHARS, 768, 880, (byte) -87); // Fill 112 of value (byte) -87
126: Arrays.fill(XML11CHARS, 880, 894, (byte) -19); // Fill 14 of value (byte) -19
127: XML11CHARS[894] = 33;
128: Arrays.fill(XML11CHARS, 895, 8192, (byte) -19); // Fill 7297 of value (byte) -19
129: Arrays.fill(XML11CHARS, 8192, 8204, (byte) 33); // Fill 12 of value (byte) 33
130: Arrays.fill(XML11CHARS, 8204, 8206, (byte) -19); // Fill 2 of value (byte) -19
131: Arrays.fill(XML11CHARS, 8206, 8232, (byte) 33); // Fill 26 of value (byte) 33
132: XML11CHARS[8232] = 35;
133: Arrays.fill(XML11CHARS, 8233, 8255, (byte) 33); // Fill 22 of value (byte) 33
134: Arrays.fill(XML11CHARS, 8255, 8257, (byte) -87); // Fill 2 of value (byte) -87
135: Arrays.fill(XML11CHARS, 8257, 8304, (byte) 33); // Fill 47 of value (byte) 33
136: Arrays.fill(XML11CHARS, 8304, 8592, (byte) -19); // Fill 288 of value (byte) -19
137: Arrays.fill(XML11CHARS, 8592, 11264, (byte) 33); // Fill 2672 of value (byte) 33
138: Arrays.fill(XML11CHARS, 11264, 12272, (byte) -19); // Fill 1008 of value (byte) -19
139: Arrays.fill(XML11CHARS, 12272, 12289, (byte) 33); // Fill 17 of value (byte) 33
140: Arrays.fill(XML11CHARS, 12289, 55296, (byte) -19); // Fill 43007 of value (byte) -19
141: Arrays.fill(XML11CHARS, 57344, 63744, (byte) 33); // Fill 6400 of value (byte) 33
142: Arrays.fill(XML11CHARS, 63744, 64976, (byte) -19); // Fill 1232 of value (byte) -19
143: Arrays.fill(XML11CHARS, 64976, 65008, (byte) 33); // Fill 32 of value (byte) 33
144: Arrays.fill(XML11CHARS, 65008, 65534, (byte) -19); // Fill 526 of value (byte) -19
145:
146: } // <clinit>()
147:
148: //
149: // Public static methods
150: //
151:
152: /**
153: * Returns true if the specified character is a space character
154: * as amdended in the XML 1.1 specification.
155: *
156: * @param c The character to check.
157: */
158: public static boolean isXML11Space(int c) {
159: return (c < 0x10000 && (XML11CHARS[c] & MASK_XML11_SPACE) != 0);
160: } // isXML11Space(int):boolean
161:
162: /**
163: * Returns true if the specified character is valid. This method
164: * also checks the surrogate character range from 0x10000 to 0x10FFFF.
165: * <p>
166: * If the program chooses to apply the mask directly to the
167: * <code>XML11CHARS</code> array, then they are responsible for checking
168: * the surrogate character range.
169: *
170: * @param c The character to check.
171: */
172: public static boolean isXML11Valid(int c) {
173: return (c < 0x10000 && (XML11CHARS[c] & MASK_XML11_VALID) != 0)
174: || (0x10000 <= c && c <= 0x10FFFF);
175: } // isXML11Valid(int):boolean
176:
177: /**
178: * Returns true if the specified character is invalid.
179: *
180: * @param c The character to check.
181: */
182: public static boolean isXML11Invalid(int c) {
183: return !isXML11Valid(c);
184: } // isXML11Invalid(int):boolean
185:
186: /**
187: * Returns true if the specified character is valid and permitted outside
188: * of a character reference.
189: * That is, this method will return false for the same set as
190: * isXML11Valid, except it also reports false for "control characters".
191: *
192: * @param c The character to check.
193: */
194: public static boolean isXML11ValidLiteral(int c) {
195: return ((c < 0x10000 && ((XML11CHARS[c] & MASK_XML11_VALID) != 0 && (XML11CHARS[c] & MASK_XML11_CONTROL) == 0)) || (0x10000 <= c && c <= 0x10FFFF));
196: } // isXML11ValidLiteral(int):boolean
197:
198: /**
199: * Returns true if the specified character can be considered
200: * content in an external parsed entity.
201: *
202: * @param c The character to check.
203: */
204: public static boolean isXML11Content(int c) {
205: return (c < 0x10000 && (XML11CHARS[c] & MASK_XML11_CONTENT) != 0)
206: || (0x10000 <= c && c <= 0x10FFFF);
207: } // isXML11Content(int):boolean
208:
209: /**
210: * Returns true if the specified character can be considered
211: * content in an internal parsed entity.
212: *
213: * @param c The character to check.
214: */
215: public static boolean isXML11InternalEntityContent(int c) {
216: return (c < 0x10000 && (XML11CHARS[c] & MASK_XML11_CONTENT_INTERNAL) != 0)
217: || (0x10000 <= c && c <= 0x10FFFF);
218: } // isXML11InternalEntityContent(int):boolean
219:
220: /**
221: * Returns true if the specified character is a valid name start
222: * character as defined by production [4] in the XML 1.1
223: * specification.
224: *
225: * @param c The character to check.
226: */
227: public static boolean isXML11NameStart(int c) {
228: return (c < 0x10000 && (XML11CHARS[c] & MASK_XML11_NAME_START) != 0)
229: || (0x10000 <= c && c < 0xF0000);
230: } // isXML11NameStart(int):boolean
231:
232: /**
233: * Returns true if the specified character is a valid name
234: * character as defined by production [4a] in the XML 1.1
235: * specification.
236: *
237: * @param c The character to check.
238: */
239: public static boolean isXML11Name(int c) {
240: return (c < 0x10000 && (XML11CHARS[c] & MASK_XML11_NAME) != 0)
241: || (c >= 0x10000 && c < 0xF0000);
242: } // isXML11Name(int):boolean
243:
244: /**
245: * Returns true if the specified character is a valid NCName start
246: * character as defined by production [4] in Namespaces in XML
247: * 1.1 recommendation.
248: *
249: * @param c The character to check.
250: */
251: public static boolean isXML11NCNameStart(int c) {
252: return (c < 0x10000 && (XML11CHARS[c] & MASK_XML11_NCNAME_START) != 0)
253: || (0x10000 <= c && c < 0xF0000);
254: } // isXML11NCNameStart(int):boolean
255:
256: /**
257: * Returns true if the specified character is a valid NCName
258: * character as defined by production [5] in Namespaces in XML
259: * 1.1 recommendation.
260: *
261: * @param c The character to check.
262: */
263: public static boolean isXML11NCName(int c) {
264: return (c < 0x10000 && (XML11CHARS[c] & MASK_XML11_NCNAME) != 0)
265: || (0x10000 <= c && c < 0xF0000);
266: } // isXML11NCName(int):boolean
267:
268: /**
269: * Returns whether the given character is a valid
270: * high surrogate for a name character. This includes
271: * all high surrogates for characters [0x10000-0xEFFFF].
272: * In other words everything excluding planes 15 and 16.
273: *
274: * @param c The character to check.
275: */
276: public static boolean isXML11NameHighSurrogate(int c) {
277: return (0xD800 <= c && c <= 0xDB7F);
278: }
279:
280: /*
281: * [5] Name ::= NameStartChar NameChar*
282: */
283: /**
284: * Check to see if a string is a valid Name according to [5]
285: * in the XML 1.1 Recommendation
286: *
287: * @param name string to check
288: * @return true if name is a valid Name
289: */
290: public static boolean isXML11ValidName(String name) {
291: int length = name.length();
292: if (length == 0)
293: return false;
294: int i = 1;
295: char ch = name.charAt(0);
296: if (!isXML11NameStart(ch)) {
297: if (length > 1 && isXML11NameHighSurrogate(ch)) {
298: char ch2 = name.charAt(1);
299: if (!XMLChar.isLowSurrogate(ch2)
300: || !isXML11NameStart(XMLChar.supplemental(ch,
301: ch2))) {
302: return false;
303: }
304: i = 2;
305: } else {
306: return false;
307: }
308: }
309: while (i < length) {
310: ch = name.charAt(i);
311: if (!isXML11Name(ch)) {
312: if (++i < length && isXML11NameHighSurrogate(ch)) {
313: char ch2 = name.charAt(i);
314: if (!XMLChar.isLowSurrogate(ch2)
315: || !isXML11Name(XMLChar.supplemental(ch,
316: ch2))) {
317: return false;
318: }
319: } else {
320: return false;
321: }
322: }
323: ++i;
324: }
325: return true;
326: } // isXML11ValidName(String):boolean
327:
328: /*
329: * from the namespace 1.1 rec
330: * [4] NCName ::= NCNameStartChar NCNameChar*
331: */
332: /**
333: * Check to see if a string is a valid NCName according to [4]
334: * from the XML Namespaces 1.1 Recommendation
335: *
336: * @param ncName string to check
337: * @return true if name is a valid NCName
338: */
339: public static boolean isXML11ValidNCName(String ncName) {
340: int length = ncName.length();
341: if (length == 0)
342: return false;
343: int i = 1;
344: char ch = ncName.charAt(0);
345: if (!isXML11NCNameStart(ch)) {
346: if (length > 1 && isXML11NameHighSurrogate(ch)) {
347: char ch2 = ncName.charAt(1);
348: if (!XMLChar.isLowSurrogate(ch2)
349: || !isXML11NCNameStart(XMLChar.supplemental(ch,
350: ch2))) {
351: return false;
352: }
353: i = 2;
354: } else {
355: return false;
356: }
357: }
358: while (i < length) {
359: ch = ncName.charAt(i);
360: if (!isXML11NCName(ch)) {
361: if (++i < length && isXML11NameHighSurrogate(ch)) {
362: char ch2 = ncName.charAt(i);
363: if (!XMLChar.isLowSurrogate(ch2)
364: || !isXML11NCName(XMLChar.supplemental(ch,
365: ch2))) {
366: return false;
367: }
368: } else {
369: return false;
370: }
371: }
372: ++i;
373: }
374: return true;
375: } // isXML11ValidNCName(String):boolean
376:
377: /*
378: * [7] Nmtoken ::= (NameChar)+
379: */
380: /**
381: * Check to see if a string is a valid Nmtoken according to [7]
382: * in the XML 1.1 Recommendation
383: *
384: * @param nmtoken string to check
385: * @return true if nmtoken is a valid Nmtoken
386: */
387: public static boolean isXML11ValidNmtoken(String nmtoken) {
388: int length = nmtoken.length();
389: if (length == 0)
390: return false;
391: for (int i = 0; i < length; ++i) {
392: char ch = nmtoken.charAt(i);
393: if (!isXML11Name(ch)) {
394: if (++i < length && isXML11NameHighSurrogate(ch)) {
395: char ch2 = nmtoken.charAt(i);
396: if (!XMLChar.isLowSurrogate(ch2)
397: || !isXML11Name(XMLChar.supplemental(ch,
398: ch2))) {
399: return false;
400: }
401: } else {
402: return false;
403: }
404: }
405: }
406: return true;
407: } // isXML11ValidName(String):boolean
408:
409: } // class XML11Char
|