001: /*
002: * Java HTML Tidy - JTidy
003: * HTML parser and pretty printer
004: *
005: * Copyright (c) 1998-2000 World Wide Web Consortium (Massachusetts
006: * Institute of Technology, Institut National de Recherche en
007: * Informatique et en Automatique, Keio University). All Rights
008: * Reserved.
009: *
010: * Contributing Author(s):
011: *
012: * Dave Raggett <dsr@w3.org>
013: * Andy Quick <ac.quick@sympatico.ca> (translation to Java)
014: * Gary L Peskin <garyp@firstech.com> (Java development)
015: * Sami Lempinen <sami@lempinen.net> (release management)
016: * Fabrizio Giustina <fgiust at users.sourceforge.net>
017: *
018: * The contributing author(s) would like to thank all those who
019: * helped with testing, bug fixes, and patience. This wouldn't
020: * have been possible without all of you.
021: *
022: * COPYRIGHT NOTICE:
023: *
024: * This software and documentation is provided "as is," and
025: * the copyright holders and contributing author(s) make no
026: * representations or warranties, express or implied, including
027: * but not limited to, warranties of merchantability or fitness
028: * for any particular purpose or that the use of the software or
029: * documentation will not infringe any third party patents,
030: * copyrights, trademarks or other rights.
031: *
032: * The copyright holders and contributing author(s) will not be
033: * liable for any direct, indirect, special or consequential damages
034: * arising out of any use of the software or documentation, even if
035: * advised of the possibility of such damage.
036: *
037: * Permission is hereby granted to use, copy, modify, and distribute
038: * this source code, or portions hereof, documentation and executables,
039: * for any purpose, without fee, subject to the following restrictions:
040: *
041: * 1. The origin of this source code must not be misrepresented.
042: * 2. Altered versions must be plainly marked as such and must
043: * not be misrepresented as being the original source.
044: * 3. This Copyright notice may not be removed or altered from any
045: * source or altered source distribution.
046: *
047: * The copyright holders and contributing author(s) specifically
048: * permit, without fee, and encourage the use of this source code
049: * as a component for supporting the Hypertext Markup Language in
050: * commercial products. If you use this source code in a product,
051: * acknowledgment is not required but would be appreciated.
052: *
053: */
054:
055: package org.w3c.tidy;
056:
057: /**
058: * Utility class with handy methods, mainly for String handling or for reproducing c behaviours.
059: * @author Fabrizio Giustina
060: * @version $Revision $ ($Author $)
061: */
062: public final class TidyUtils {
063:
064: /**
065: * char type: digit.
066: */
067: private static final short DIGIT = 1;
068:
069: /**
070: * char type: letter.
071: */
072: private static final short LETTER = 2;
073:
074: /**
075: * char type: namechar.
076: */
077: private static final short NAMECHAR = 4;
078:
079: /**
080: * char type: whitespace.
081: */
082: private static final short WHITE = 8;
083:
084: /**
085: * char type: newline.
086: */
087: private static final short NEWLINE = 16;
088:
089: /**
090: * char type: lowercase.
091: */
092: private static final short LOWERCASE = 32;
093:
094: /**
095: * char type: uppercase.
096: */
097: private static final short UPPERCASE = 64;
098:
099: /**
100: * used to classify chars for lexical purposes.
101: */
102: private static short[] lexmap = new short[128];
103:
104: static {
105: mapStr("\r\n\f", (short) (NEWLINE | WHITE));
106: mapStr(" \t", WHITE);
107: mapStr("-.:_", NAMECHAR);
108: mapStr("0123456789", (short) (DIGIT | NAMECHAR));
109: mapStr("abcdefghijklmnopqrstuvwxyz", (short) (LOWERCASE
110: | LETTER | NAMECHAR));
111: mapStr("ABCDEFGHIJKLMNOPQRSTUVWXYZ", (short) (UPPERCASE
112: | LETTER | NAMECHAR));
113: }
114:
115: /**
116: * utility class, don't instantiate.
117: */
118: private TidyUtils() {
119: // unused
120: }
121:
122: /**
123: * Converts a int to a boolean.
124: * @param value int value
125: * @return <code>true</code> if value is != 0
126: */
127: static boolean toBoolean(int value) {
128: return value != 0;
129: }
130:
131: /**
132: * convert an int to unsigned (& 0xFF).
133: * @param c signed int
134: * @return unsigned int
135: */
136: static int toUnsigned(int c) {
137: return c & 0xFF;
138: }
139:
140: /**
141: * check if the first String contains the second one.
142: * @param s1 full String
143: * @param len1 maximum position in String
144: * @param s2 String to search for
145: * @return true if s1 contains s2 in the range 0-len1
146: */
147: static boolean wsubstrn(String s1, int len1, String s2) {
148: int searchIndex = s1.indexOf(s2);
149: return searchIndex > -1 && searchIndex <= len1;
150: }
151:
152: /**
153: * check if the first String contains the second one (ignore case).
154: * @param s1 full String
155: * @param len1 maximum position in String
156: * @param s2 String to search for
157: * @return true if s1 contains s2 in the range 0-len1
158: */
159: static boolean wsubstrncase(String s1, int len1, String s2) {
160: return wsubstrn(s1.toLowerCase(), len1, s2.toLowerCase());
161: }
162:
163: /**
164: * return offset of cc from beginning of s1, -1 if not found.
165: * @param s1 String
166: * @param len1 maximum offset (values > than lenl are ignored and returned as -1)
167: * @param cc character to search for
168: * @return index of cc in s1
169: */
170: static int wstrnchr(String s1, int len1, char cc) {
171: int indexOf = s1.indexOf(cc);
172: if (indexOf < len1) {
173: return indexOf;
174: }
175:
176: return -1;
177: }
178:
179: /**
180: * Same as wsubstrn, but without a specified length.
181: * @param s1 full String
182: * @param s2 String to search for
183: * @return <code>true</code> if s2 is found in s2 (case insensitive search)
184: */
185: static boolean wsubstr(String s1, String s2) {
186: int i;
187: int len1 = s1.length();
188: int len2 = s2.length();
189:
190: for (i = 0; i <= len1 - len2; ++i) {
191: if (s2.equalsIgnoreCase(s1.substring(i))) {
192: return true;
193: }
194: }
195:
196: return false;
197: }
198:
199: /**
200: * Is the character a hex digit?
201: * @param c char
202: * @return <code>true</code> if he given character is a hex digit
203: */
204: static boolean isxdigit(char c) {
205: return Character.isDigit(c)
206: || (Character.toLowerCase(c) >= 'a' && Character
207: .toLowerCase(c) <= 'f');
208: }
209:
210: /**
211: * Check if the string valueToCheck is contained in validValues array (case insesitie comparison).
212: * @param validValues array of valid values
213: * @param valueToCheck value to search for
214: * @return <code>true</code> if valueToCheck is found in validValues
215: */
216: static boolean isInValuesIgnoreCase(String[] validValues,
217: String valueToCheck) {
218: int len = validValues.length;
219: for (int j = 0; j < len; j++) {
220: if (validValues[j].equalsIgnoreCase(valueToCheck)) {
221: return true;
222: }
223: }
224: return false;
225: }
226:
227: /**
228: * Return true if substring s is in p and isn't all in upper case. This is used to check the case of SYSTEM, PUBLIC,
229: * DTD and EN.
230: * @param s substring
231: * @param p full string
232: * @param len how many chars to check in p
233: * @return true if substring s is in p and isn't all in upper case
234: */
235: public static boolean findBadSubString(String s, String p, int len) {
236: int n = s.length();
237: int i = 0;
238: String ps;
239:
240: while (n < len) {
241: ps = p.substring(i, i + n);
242: if (s.equalsIgnoreCase(ps)) {
243: return (!ps.equals(s.substring(0, n)));
244: }
245:
246: ++i;
247: --len;
248: }
249:
250: return false;
251: }
252:
253: /**
254: * Is the given char a valid xml letter?
255: * @param c char
256: * @return <code>true</code> if the char is a valid xml letter
257: */
258: static boolean isXMLLetter(char c) {
259: return ((c >= 0x41 && c <= 0x5a) || (c >= 0x61 && c <= 0x7a)
260: || (c >= 0xc0 && c <= 0xd6) || (c >= 0xd8 && c <= 0xf6)
261: || (c >= 0xf8 && c <= 0xff)
262: || (c >= 0x100 && c <= 0x131)
263: || (c >= 0x134 && c <= 0x13e)
264: || (c >= 0x141 && c <= 0x148)
265: || (c >= 0x14a && c <= 0x17e)
266: || (c >= 0x180 && c <= 0x1c3)
267: || (c >= 0x1cd && c <= 0x1f0)
268: || (c >= 0x1f4 && c <= 0x1f5)
269: || (c >= 0x1fa && c <= 0x217)
270: || (c >= 0x250 && c <= 0x2a8)
271: || (c >= 0x2bb && c <= 0x2c1) || c == 0x386
272: || (c >= 0x388 && c <= 0x38a) || c == 0x38c
273: || (c >= 0x38e && c <= 0x3a1)
274: || (c >= 0x3a3 && c <= 0x3ce)
275: || (c >= 0x3d0 && c <= 0x3d6) || c == 0x3da
276: || c == 0x3dc || c == 0x3de || c == 0x3e0
277: || (c >= 0x3e2 && c <= 0x3f3)
278: || (c >= 0x401 && c <= 0x40c)
279: || (c >= 0x40e && c <= 0x44f)
280: || (c >= 0x451 && c <= 0x45c)
281: || (c >= 0x45e && c <= 0x481)
282: || (c >= 0x490 && c <= 0x4c4)
283: || (c >= 0x4c7 && c <= 0x4c8)
284: || (c >= 0x4cb && c <= 0x4cc)
285: || (c >= 0x4d0 && c <= 0x4eb)
286: || (c >= 0x4ee && c <= 0x4f5)
287: || (c >= 0x4f8 && c <= 0x4f9)
288: || (c >= 0x531 && c <= 0x556) || c == 0x559
289: || (c >= 0x561 && c <= 0x586)
290: || (c >= 0x5d0 && c <= 0x5ea)
291: || (c >= 0x5f0 && c <= 0x5f2)
292: || (c >= 0x621 && c <= 0x63a)
293: || (c >= 0x641 && c <= 0x64a)
294: || (c >= 0x671 && c <= 0x6b7)
295: || (c >= 0x6ba && c <= 0x6be)
296: || (c >= 0x6c0 && c <= 0x6ce)
297: || (c >= 0x6d0 && c <= 0x6d3) || c == 0x6d5
298: || (c >= 0x6e5 && c <= 0x6e6)
299: || (c >= 0x905 && c <= 0x939) || c == 0x93d
300: || (c >= 0x958 && c <= 0x961)
301: || (c >= 0x985 && c <= 0x98c)
302: || (c >= 0x98f && c <= 0x990)
303: || (c >= 0x993 && c <= 0x9a8)
304: || (c >= 0x9aa && c <= 0x9b0) || c == 0x9b2
305: || (c >= 0x9b6 && c <= 0x9b9)
306: || (c >= 0x9dc && c <= 0x9dd)
307: || (c >= 0x9df && c <= 0x9e1)
308: || (c >= 0x9f0 && c <= 0x9f1)
309: || (c >= 0xa05 && c <= 0xa0a)
310: || (c >= 0xa0f && c <= 0xa10)
311: || (c >= 0xa13 && c <= 0xa28)
312: || (c >= 0xa2a && c <= 0xa30)
313: || (c >= 0xa32 && c <= 0xa33)
314: || (c >= 0xa35 && c <= 0xa36)
315: || (c >= 0xa38 && c <= 0xa39)
316: || (c >= 0xa59 && c <= 0xa5c) || c == 0xa5e
317: || (c >= 0xa72 && c <= 0xa74)
318: || (c >= 0xa85 && c <= 0xa8b) || c == 0xa8d
319: || (c >= 0xa8f && c <= 0xa91)
320: || (c >= 0xa93 && c <= 0xaa8)
321: || (c >= 0xaaa && c <= 0xab0)
322: || (c >= 0xab2 && c <= 0xab3)
323: || (c >= 0xab5 && c <= 0xab9) || c == 0xabd
324: || c == 0xae0 || (c >= 0xb05 && c <= 0xb0c)
325: || (c >= 0xb0f && c <= 0xb10)
326: || (c >= 0xb13 && c <= 0xb28)
327: || (c >= 0xb2a && c <= 0xb30)
328: || (c >= 0xb32 && c <= 0xb33)
329: || (c >= 0xb36 && c <= 0xb39) || c == 0xb3d
330: || (c >= 0xb5c && c <= 0xb5d)
331: || (c >= 0xb5f && c <= 0xb61)
332: || (c >= 0xb85 && c <= 0xb8a)
333: || (c >= 0xb8e && c <= 0xb90)
334: || (c >= 0xb92 && c <= 0xb95)
335: || (c >= 0xb99 && c <= 0xb9a) || c == 0xb9c
336: || (c >= 0xb9e && c <= 0xb9f)
337: || (c >= 0xba3 && c <= 0xba4)
338: || (c >= 0xba8 && c <= 0xbaa)
339: || (c >= 0xbae && c <= 0xbb5)
340: || (c >= 0xbb7 && c <= 0xbb9)
341: || (c >= 0xc05 && c <= 0xc0c)
342: || (c >= 0xc0e && c <= 0xc10)
343: || (c >= 0xc12 && c <= 0xc28)
344: || (c >= 0xc2a && c <= 0xc33)
345: || (c >= 0xc35 && c <= 0xc39)
346: || (c >= 0xc60 && c <= 0xc61)
347: || (c >= 0xc85 && c <= 0xc8c)
348: || (c >= 0xc8e && c <= 0xc90)
349: || (c >= 0xc92 && c <= 0xca8)
350: || (c >= 0xcaa && c <= 0xcb3)
351: || (c >= 0xcb5 && c <= 0xcb9) || c == 0xcde
352: || (c >= 0xce0 && c <= 0xce1)
353: || (c >= 0xd05 && c <= 0xd0c)
354: || (c >= 0xd0e && c <= 0xd10)
355: || (c >= 0xd12 && c <= 0xd28)
356: || (c >= 0xd2a && c <= 0xd39)
357: || (c >= 0xd60 && c <= 0xd61)
358: || (c >= 0xe01 && c <= 0xe2e) || c == 0xe30
359: || (c >= 0xe32 && c <= 0xe33)
360: || (c >= 0xe40 && c <= 0xe45)
361: || (c >= 0xe81 && c <= 0xe82) || c == 0xe84
362: || (c >= 0xe87 && c <= 0xe88) || c == 0xe8a
363: || c == 0xe8d || (c >= 0xe94 && c <= 0xe97)
364: || (c >= 0xe99 && c <= 0xe9f)
365: || (c >= 0xea1 && c <= 0xea3) || c == 0xea5
366: || c == 0xea7 || (c >= 0xeaa && c <= 0xeab)
367: || (c >= 0xead && c <= 0xeae) || c == 0xeb0
368: || (c >= 0xeb2 && c <= 0xeb3) || c == 0xebd
369: || (c >= 0xec0 && c <= 0xec4)
370: || (c >= 0xf40 && c <= 0xf47)
371: || (c >= 0xf49 && c <= 0xf69)
372: || (c >= 0x10a0 && c <= 0x10c5)
373: || (c >= 0x10d0 && c <= 0x10f6) || c == 0x1100
374: || (c >= 0x1102 && c <= 0x1103)
375: || (c >= 0x1105 && c <= 0x1107) || c == 0x1109
376: || (c >= 0x110b && c <= 0x110c)
377: || (c >= 0x110e && c <= 0x1112) || c == 0x113c
378: || c == 0x113e || c == 0x1140 || c == 0x114c
379: || c == 0x114e || c == 0x1150
380: || (c >= 0x1154 && c <= 0x1155) || c == 0x1159
381: || (c >= 0x115f && c <= 0x1161) || c == 0x1163
382: || c == 0x1165 || c == 0x1167 || c == 0x1169
383: || (c >= 0x116d && c <= 0x116e)
384: || (c >= 0x1172 && c <= 0x1173) || c == 0x1175
385: || c == 0x119e || c == 0x11a8 || c == 0x11ab
386: || (c >= 0x11ae && c <= 0x11af)
387: || (c >= 0x11b7 && c <= 0x11b8) || c == 0x11ba
388: || (c >= 0x11bc && c <= 0x11c2) || c == 0x11eb
389: || c == 0x11f0 || c == 0x11f9
390: || (c >= 0x1e00 && c <= 0x1e9b)
391: || (c >= 0x1ea0 && c <= 0x1ef9)
392: || (c >= 0x1f00 && c <= 0x1f15)
393: || (c >= 0x1f18 && c <= 0x1f1d)
394: || (c >= 0x1f20 && c <= 0x1f45)
395: || (c >= 0x1f48 && c <= 0x1f4d)
396: || (c >= 0x1f50 && c <= 0x1f57) || c == 0x1f59
397: || c == 0x1f5b || c == 0x1f5d
398: || (c >= 0x1f5f && c <= 0x1f7d)
399: || (c >= 0x1f80 && c <= 0x1fb4)
400: || (c >= 0x1fb6 && c <= 0x1fbc) || c == 0x1fbe
401: || (c >= 0x1fc2 && c <= 0x1fc4)
402: || (c >= 0x1fc6 && c <= 0x1fcc)
403: || (c >= 0x1fd0 && c <= 0x1fd3)
404: || (c >= 0x1fd6 && c <= 0x1fdb)
405: || (c >= 0x1fe0 && c <= 0x1fec)
406: || (c >= 0x1ff2 && c <= 0x1ff4)
407: || (c >= 0x1ff6 && c <= 0x1ffc) || c == 0x2126
408: || (c >= 0x212a && c <= 0x212b) || c == 0x212e
409: || (c >= 0x2180 && c <= 0x2182)
410: || (c >= 0x3041 && c <= 0x3094)
411: || (c >= 0x30a1 && c <= 0x30fa)
412: || (c >= 0x3105 && c <= 0x312c)
413: || (c >= 0xac00 && c <= 0xd7a3)
414: || (c >= 0x4e00 && c <= 0x9fa5) || c == 0x3007
415: || (c >= 0x3021 && c <= 0x3029)
416: || (c >= 0x4e00 && c <= 0x9fa5) || c == 0x3007 || (c >= 0x3021 && c <= 0x3029));
417: }
418:
419: /**
420: * Is the given char valid in xml name?
421: * @param c char
422: * @return <code>true</code> if the char is a valid xml name char
423: */
424: static boolean isXMLNamechar(char c) {
425: return (isXMLLetter(c) || c == '.' || c == '_' || c == ':'
426: || c == '-' || (c >= 0x300 && c <= 0x345)
427: || (c >= 0x360 && c <= 0x361)
428: || (c >= 0x483 && c <= 0x486)
429: || (c >= 0x591 && c <= 0x5a1)
430: || (c >= 0x5a3 && c <= 0x5b9)
431: || (c >= 0x5bb && c <= 0x5bd) || c == 0x5bf
432: || (c >= 0x5c1 && c <= 0x5c2) || c == 0x5c4
433: || (c >= 0x64b && c <= 0x652) || c == 0x670
434: || (c >= 0x6d6 && c <= 0x6dc)
435: || (c >= 0x6dd && c <= 0x6df)
436: || (c >= 0x6e0 && c <= 0x6e4)
437: || (c >= 0x6e7 && c <= 0x6e8)
438: || (c >= 0x6ea && c <= 0x6ed)
439: || (c >= 0x901 && c <= 0x903) || c == 0x93c
440: || (c >= 0x93e && c <= 0x94c) || c == 0x94d
441: || (c >= 0x951 && c <= 0x954)
442: || (c >= 0x962 && c <= 0x963)
443: || (c >= 0x981 && c <= 0x983) || c == 0x9bc
444: || c == 0x9be || c == 0x9bf
445: || (c >= 0x9c0 && c <= 0x9c4)
446: || (c >= 0x9c7 && c <= 0x9c8)
447: || (c >= 0x9cb && c <= 0x9cd) || c == 0x9d7
448: || (c >= 0x9e2 && c <= 0x9e3) || c == 0xa02
449: || c == 0xa3c || c == 0xa3e || c == 0xa3f
450: || (c >= 0xa40 && c <= 0xa42)
451: || (c >= 0xa47 && c <= 0xa48)
452: || (c >= 0xa4b && c <= 0xa4d)
453: || (c >= 0xa70 && c <= 0xa71)
454: || (c >= 0xa81 && c <= 0xa83) || c == 0xabc
455: || (c >= 0xabe && c <= 0xac5)
456: || (c >= 0xac7 && c <= 0xac9)
457: || (c >= 0xacb && c <= 0xacd)
458: || (c >= 0xb01 && c <= 0xb03) || c == 0xb3c
459: || (c >= 0xb3e && c <= 0xb43)
460: || (c >= 0xb47 && c <= 0xb48)
461: || (c >= 0xb4b && c <= 0xb4d)
462: || (c >= 0xb56 && c <= 0xb57)
463: || (c >= 0xb82 && c <= 0xb83)
464: || (c >= 0xbbe && c <= 0xbc2)
465: || (c >= 0xbc6 && c <= 0xbc8)
466: || (c >= 0xbca && c <= 0xbcd) || c == 0xbd7
467: || (c >= 0xc01 && c <= 0xc03)
468: || (c >= 0xc3e && c <= 0xc44)
469: || (c >= 0xc46 && c <= 0xc48)
470: || (c >= 0xc4a && c <= 0xc4d)
471: || (c >= 0xc55 && c <= 0xc56)
472: || (c >= 0xc82 && c <= 0xc83)
473: || (c >= 0xcbe && c <= 0xcc4)
474: || (c >= 0xcc6 && c <= 0xcc8)
475: || (c >= 0xcca && c <= 0xccd)
476: || (c >= 0xcd5 && c <= 0xcd6)
477: || (c >= 0xd02 && c <= 0xd03)
478: || (c >= 0xd3e && c <= 0xd43)
479: || (c >= 0xd46 && c <= 0xd48)
480: || (c >= 0xd4a && c <= 0xd4d) || c == 0xd57
481: || c == 0xe31 || (c >= 0xe34 && c <= 0xe3a)
482: || (c >= 0xe47 && c <= 0xe4e) || c == 0xeb1
483: || (c >= 0xeb4 && c <= 0xeb9)
484: || (c >= 0xebb && c <= 0xebc)
485: || (c >= 0xec8 && c <= 0xecd)
486: || (c >= 0xf18 && c <= 0xf19) || c == 0xf35
487: || c == 0xf37 || c == 0xf39 || c == 0xf3e || c == 0xf3f
488: || (c >= 0xf71 && c <= 0xf84)
489: || (c >= 0xf86 && c <= 0xf8b)
490: || (c >= 0xf90 && c <= 0xf95) || c == 0xf97
491: || (c >= 0xf99 && c <= 0xfad)
492: || (c >= 0xfb1 && c <= 0xfb7) || c == 0xfb9
493: || (c >= 0x20d0 && c <= 0x20dc) || c == 0x20e1
494: || (c >= 0x302a && c <= 0x302f) || c == 0x3099
495: || c == 0x309a || (c >= 0x30 && c <= 0x39)
496: || (c >= 0x660 && c <= 0x669)
497: || (c >= 0x6f0 && c <= 0x6f9)
498: || (c >= 0x966 && c <= 0x96f)
499: || (c >= 0x9e6 && c <= 0x9ef)
500: || (c >= 0xa66 && c <= 0xa6f)
501: || (c >= 0xae6 && c <= 0xaef)
502: || (c >= 0xb66 && c <= 0xb6f)
503: || (c >= 0xbe7 && c <= 0xbef)
504: || (c >= 0xc66 && c <= 0xc6f)
505: || (c >= 0xce6 && c <= 0xcef)
506: || (c >= 0xd66 && c <= 0xd6f)
507: || (c >= 0xe50 && c <= 0xe59)
508: || (c >= 0xed0 && c <= 0xed9)
509: || (c >= 0xf20 && c <= 0xf29) || c == 0xb7
510: || c == 0x2d0 || c == 0x2d1 || c == 0x387 || c == 0x640
511: || c == 0xe46 || c == 0xec6 || c == 0x3005
512: || (c >= 0x3031 && c <= 0x3035)
513: || (c >= 0x309d && c <= 0x309e) || (c >= 0x30fc && c <= 0x30fe));
514: }
515:
516: /**
517: * Is the given character a single or double quote?
518: * @param c char
519: * @return <code>true</code> if c is " or '
520: */
521: static boolean isQuote(int c) {
522: return (c == '\'' || c == '\"');
523: }
524:
525: /**
526: * Should always be able convert to/from UTF-8, so encoding exceptions are converted to an Error to avoid adding
527: * throws declarations in lots of methods.
528: * @param str String
529: * @return utf8 bytes
530: * @see String#getBytes()
531: */
532: public static byte[] getBytes(String str) {
533: try {
534: return str.getBytes("UTF8");
535: } catch (java.io.UnsupportedEncodingException e) {
536: throw new Error("String to UTF-8 conversion failed: "
537: + e.getMessage());
538: }
539: }
540:
541: /**
542: * Should always be able convert to/from UTF-8, so encoding exceptions are converted to an Error to avoid adding
543: * throws declarations in lots of methods.
544: * @param bytes byte array
545: * @param offset starting offset in byte array
546: * @param length length in byte array starting from offset
547: * @return same as <code>new String(bytes, offset, length, "UTF8")</code>
548: */
549: public static String getString(byte[] bytes, int offset, int length) {
550: try {
551: return new String(bytes, offset, length, "UTF8");
552: } catch (java.io.UnsupportedEncodingException e) {
553: throw new Error("UTF-8 to string conversion failed: "
554: + e.getMessage());
555: }
556: }
557:
558: /**
559: * Return the last char in string. This is useful when trailing quotemark is missing on an attribute
560: * @param str String
561: * @return last char in String
562: */
563: public static int lastChar(String str) {
564: if (str != null && str.length() > 0) {
565: return str.charAt(str.length() - 1);
566: }
567:
568: return 0;
569: }
570:
571: /**
572: * Determines if the specified character is whitespace.
573: * @param c char
574: * @return <code>true</code> if char is whitespace.
575: */
576: public static boolean isWhite(char c) {
577: short m = map(c);
578: return TidyUtils.toBoolean(m & WHITE);
579: }
580:
581: /**
582: * Is the given char a digit?
583: * @param c char
584: * @return <code>true</code> if the given char is a digit
585: */
586: public static boolean isDigit(char c) {
587: short m;
588: m = map(c);
589: return TidyUtils.toBoolean(m & DIGIT);
590: }
591:
592: /**
593: * Is the given char a letter?
594: * @param c char
595: * @return <code>true</code> if the given char is a letter
596: */
597: public static boolean isLetter(char c) {
598: short m;
599: m = map(c);
600: return TidyUtils.toBoolean(m & LETTER);
601: }
602:
603: /**
604: * Is the given char valid in name? (letter, digit or "-", ".", ":", "_")
605: * @param c char
606: * @return <code>true</code> if char is a name char.
607: */
608: public static boolean isNamechar(char c) {
609: short map = map(c);
610:
611: return TidyUtils.toBoolean(map & NAMECHAR);
612: }
613:
614: /**
615: * Determines if the specified character is a lowercase character.
616: * @param c char
617: * @return <code>true</code> if char is lower case.
618: */
619: public static boolean isLower(char c) {
620: short map = map(c);
621:
622: return TidyUtils.toBoolean(map & LOWERCASE);
623: }
624:
625: /**
626: * Determines if the specified character is a uppercase character.
627: * @param c char
628: * @return <code>true</code> if char is upper case.
629: */
630: public static boolean isUpper(char c) {
631: short map = map(c);
632:
633: return TidyUtils.toBoolean(map & UPPERCASE);
634: }
635:
636: /**
637: * Maps the given character to its lowercase equivalent.
638: * @param c char
639: * @return lowercase char.
640: */
641: public static char toLower(char c) {
642: short m = map(c);
643:
644: if (TidyUtils.toBoolean(m & UPPERCASE)) {
645: c = (char) (c + 'a' - 'A');
646: }
647:
648: return c;
649: }
650:
651: /**
652: * Maps the given character to its uppercase equivalent.
653: * @param c char
654: * @return uppercase char.
655: */
656: public static char toUpper(char c) {
657: short m = map(c);
658:
659: if (TidyUtils.toBoolean(m & LOWERCASE)) {
660: c = (char) (c + 'A' - 'a');
661: }
662:
663: return c;
664: }
665:
666: /**
667: * Fold case of a char.
668: * @param c char
669: * @param tocaps convert to caps
670: * @param xmlTags use xml tags? If true no change will be performed
671: * @return folded char
672: * @todo check the use of xmlTags parameter
673: */
674: public static char foldCase(char c, boolean tocaps, boolean xmlTags) {
675:
676: if (!xmlTags) {
677:
678: if (tocaps) {
679: if (isLower(c)) {
680: c = toUpper(c);
681: }
682: } else {
683: // force to lower case
684: if (isUpper(c)) {
685: c = toLower(c);
686: }
687: }
688: }
689:
690: return c;
691: }
692:
693: /**
694: * Classify chars in String and put them in lexmap.
695: * @param str String
696: * @param code code associated to chars in the String
697: */
698: private static void mapStr(String str, short code) {
699: int c;
700: for (int i = 0; i < str.length(); i++) {
701: c = str.charAt(i);
702: lexmap[c] |= code;
703: }
704: }
705:
706: /**
707: * Returns the constant which defines the classification of char in lexmap.
708: * @param c char
709: * @return char type
710: */
711: private static short map(char c) {
712: return (c < 128 ? lexmap[c] : 0);
713: }
714:
715: /**
716: * Is the given character encoding supported?
717: * @param name character encoding name
718: * @return <code>true</code> if encoding is supported, false otherwhise.
719: */
720: public static boolean isCharEncodingSupported(String name) {
721: name = EncodingNameMapper.toJava(name);
722: if (name == null) {
723: return false;
724: }
725:
726: try {
727: "".getBytes(name);
728: } catch (java.io.UnsupportedEncodingException e) {
729: return false;
730: }
731: return true;
732: }
733: }
|