001: /*
002: * Copyright 2001-2007 Hippo (www.hippo.nl)
003: *
004: * Licensed under the Apache License, Version 2.0 (the "License");
005: * you may not use this file except in compliance with the License.
006: * You may obtain a copy of the License at
007: *
008: * http://www.apache.org/licenses/LICENSE-2.0
009: *
010: * Unless required by applicable law or agreed to in writing, software
011: * distributed under the License is distributed on an "AS IS" BASIS,
012: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013: * See the License for the specific language governing permissions and
014: * limitations under the License.
015: */
016: package nl.hippo.cms.spellchecking;
017:
/**
 * A collection of <code>String</code> handling utility methods.
 *
 * <p>Some of the methods available in this class have equivalents in the
 * <code>java.lang.String</code> class. The implementations provided here
 * are deliberately simple and character based; they do not deal with
 * locale-specific internationalization issues.</p>
 *
 * <p>All utility methods are static. {@link #getInstance()} is kept only
 * for callers that obtain the class through its singleton accessor.</p>
 *
 * @author Bruno Martins
 */
public class StringUtils {

    /** The single instance of this class. */
    private static final StringUtils INSTANCE = new StringUtils();

    /**
     * Words that {@link #capitalizate(String, boolean, boolean)} leaves in
     * lower case when they are the whole (single-word) input.
     */
    private static final String[] LOWERCASE_WORDS = {
        "da", "das", "do", "dos", "de", "a", "as", "e", "o", "os", "ou",
        "entre", "d'el", "of", "and", "or", "the"
    };

    /**
     * Particles that, when found in lower case at the very start or very
     * end of a phrase, make {@link #isCapitalizated(String)} answer false.
     */
    private static final String[] BOUNDARY_PARTICLES = {
        "da", "das", "do", "dos", "de", "a", "as", "e", "o", "os", "ou",
        "d'el", "of", "and", "or", "the"
    };

    /**
     * Particles whose first letter is upper-cased when a capitalized
     * phrase starts with them.
     */
    private static final String[] PHRASE_LEADING_WORDS = {
        "a", "as", "o", "os", "de", "da", "das", "do", "dos", "entre", "the"
    };

    /**
     * All characters for which {@link #replaceAccent(char)} returns a
     * different character. Derived from the mapping itself so that the two
     * can never disagree.
     */
    private static final char[] SPECIAL_CHARS = buildSpecialChars();

    /**
     * Sole constructor, private because this is a Singleton class.
     */
    private StringUtils() {
    }

    /**
     * Returns the single instance of this class.
     *
     * @return An instance of <code>StringUtils</code>.
     */
    public static StringUtils getInstance() {
        return INSTANCE;
    }

    /**
     * Tests whether a given character is an ASCII letter, an ASCII digit
     * or the hyphen character.
     *
     * @param c The character to be tested.
     * @return whether the given character is alphanumeric (or a hyphen).
     */
    public static boolean isAlphaNumeric(char c) {
        return c == '-' || (c >= 'a' && c <= 'z')
                || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9');
    }

    /**
     * Counts the occurrences of the given char in a String.
     *
     * @param str The string to be tested; <code>null</code> counts as empty.
     * @param c the char to be counted.
     * @return the number of times the character occurs in the String.
     */
    public static int count(String str, char c) {
        if (str == null) {
            return 0;
        }
        int occurrences = 0;
        for (int i = 0; i < str.length(); i++) {
            if (str.charAt(i) == c) {
                occurrences++;
            }
        }
        return occurrences;
    }

    /**
     * Computes the length of the common prefix of two strings.
     *
     * @param a The first string.
     * @param b The second string.
     * @return the index (starting from 0) where the two strings stop
     *         matching; 0 when either string is <code>null</code>.
     */
    public static int matchStrings(String a, String b) {
        if (a == null || b == null) {
            return 0;
        }
        int limit = Math.min(a.length(), b.length());
        int i = 0;
        while (i < limit && a.charAt(i) == b.charAt(i)) {
            i++;
        }
        return i;
    }

    /**
     * Reverses a given String.
     *
     * <p>The reversal works on characters, not on encoded bytes, so
     * non-ASCII text is preserved regardless of the platform charset.</p>
     *
     * @param s The String to reverse.
     * @return The reversed string, or the empty string when <code>s</code>
     *         is <code>null</code> or empty.
     */
    public static String invertString(String s) {
        if (s == null || s.length() == 0) {
            return "";
        }
        return new StringBuilder(s).reverse().toString();
    }

    /**
     * Returns a new string resulting from replacing all occurrences of the
     * String search in the String source, with the string replace.
     * Matching is literal (no regular expressions) and proceeds left to
     * right; replaced text is not re-scanned.
     *
     * @param source The original String.
     * @param search The string to be replaced.
     * @param replace The replacement String.
     * @return The resulting String. When <code>source</code> is
     *         <code>null</code>, or <code>search</code> is
     *         <code>null</code> or empty, <code>source</code> is returned
     *         unchanged.
     */
    public static String replace(String source, String search,
            String replace) {
        if (source == null || search == null || search.length() == 0) {
            return source;
        }
        int hit = source.indexOf(search);
        if (hit == -1) {
            return source;
        }
        StringBuilder out = new StringBuilder(source.length());
        int from = 0;
        while (hit != -1) {
            out.append(source, from, hit).append(replace);
            // Skip the whole match, not just its first character.
            from = hit + search.length();
            hit = source.indexOf(search, from);
        }
        out.append(source, from, source.length());
        return out.toString();
    }

    /**
     * Replaces accented characters with their variations without
     * the diacritics. The Latin-1 accented letters used by Portuguese and
     * other Western European languages are covered; any other character
     * is returned unchanged.
     *
     * TODO: add diacritic characters outside the Latin-1 range.
     *
     * @param chr the character to check.
     * @return The character without the diacritic.
     */
    public static char replaceAccent(char chr) {
        switch (chr) {
            case '\u00C0': // A grave
            case '\u00C1': // A acute
            case '\u00C2': // A circumflex
            case '\u00C3': // A tilde
            case '\u00C4': // A diaeresis
            case '\u00C5': // A ring
                return 'A';
            case '\u00E0': // a grave
            case '\u00E1': // a acute
            case '\u00E2': // a circumflex
            case '\u00E3': // a tilde
            case '\u00E4': // a diaeresis
            case '\u00E5': // a ring
                return 'a';
            case '\u00C7': // C cedilla
                return 'C';
            case '\u00E7': // c cedilla
                return 'c';
            case '\u00C8': // E grave
            case '\u00C9': // E acute
            case '\u00CA': // E circumflex
            case '\u00CB': // E diaeresis
                return 'E';
            case '\u00E8': // e grave
            case '\u00E9': // e acute
            case '\u00EA': // e circumflex
            case '\u00EB': // e diaeresis
                return 'e';
            case '\u00CC': // I grave
            case '\u00CD': // I acute
            case '\u00CE': // I circumflex
            case '\u00CF': // I diaeresis
                return 'I';
            case '\u00EC': // i grave
            case '\u00ED': // i acute
            case '\u00EE': // i circumflex
            case '\u00EF': // i diaeresis
                return 'i';
            case '\u00D1': // N tilde
                return 'N';
            case '\u00F1': // n tilde
                return 'n';
            case '\u00D2': // O grave
            case '\u00D3': // O acute
            case '\u00D4': // O circumflex
            case '\u00D5': // O tilde
            case '\u00D6': // O diaeresis
                return 'O';
            case '\u00F2': // o grave
            case '\u00F3': // o acute
            case '\u00F4': // o circumflex
            case '\u00F5': // o tilde
            case '\u00F6': // o diaeresis
                return 'o';
            case '\u00D9': // U grave
            case '\u00DA': // U acute
            case '\u00DB': // U circumflex
            case '\u00DC': // U diaeresis
                return 'U';
            case '\u00F9': // u grave
            case '\u00FA': // u acute
            case '\u00FB': // u circumflex
            case '\u00FC': // u diaeresis
                return 'u';
            case '\u00DD': // Y acute
                return 'Y';
            case '\u00FD': // y acute
            case '\u00FF': // y diaeresis
                return 'y';
            default:
                return chr;
        }
    }

    /**
     * Checks if a given character has diacritics. For instance,
     * isAccent('a') would return false, whereas isAccent of U+00E1
     * (a with acute accent) would return true.
     *
     * @param chr the char to check.
     * @return true if the character has a diacritic known to
     *         {@link #replaceAccent(char)} and false otherwise.
     */
    public static boolean isAccent(char chr) {
        return replaceAccent(chr) != chr;
    }

    /**
     * Checks whether upper-casing a character leaves it unchanged. For
     * instance, isUpperCase('a') would return false, whereas
     * isUpperCase('A') would return true.
     *
     * <p>NOTE(review): characters without case (digits, punctuation,
     * whitespace) also yield true, because they are equal to their own
     * upper-case form. This long-standing behaviour is kept for
     * compatibility; use <code>Character.isUpperCase</code> when a strict
     * test is needed.</p>
     *
     * @param chr the char to check.
     * @return true if the character equals its upper-case form and false
     *         otherwise.
     */
    public static boolean isUpperCase(char chr) {
        return chr == Character.toUpperCase(chr);
    }

    /**
     * Takes a numeric string and separates groups of 3 characters
     * with a '.' character. For instance
     * separateNumberWithDots("1000") would return "1.000".
     *
     * @param n A numeric String.
     * @return The resulting String.
     */
    public static String separateNumberWithDots(String n) {
        return separateNumberWithDots(n, 3);
    }

    /**
     * Takes a numeric string and separates groups of "s" characters,
     * counted from the right, with a '.' character. For instance
     * separateNumberWithDots("1000", 3) would return "1.000".
     *
     * <p>A single leading '-' or '+' is treated as a sign and is never
     * separated from the first digit.</p>
     *
     * @param n A numeric String.
     * @param s The number of characters to group; values below 1 disable
     *          grouping.
     * @return The resulting String; the empty string when <code>n</code>
     *         is <code>null</code> or empty.
     */
    public static String separateNumberWithDots(String n, int s) {
        if (n == null || n.length() == 0) {
            return "";
        }
        int start = 0;
        char lead = n.charAt(0);
        if ((lead == '-' || lead == '+') && n.length() > 1) {
            start = 1;
        }
        int len = n.length();
        StringBuilder out = new StringBuilder(len + len / 2);
        if (start == 1) {
            out.append(lead);
        }
        for (int i = start; i < len; i++) {
            out.append(n.charAt(i));
            // Number of characters still to be written after this one.
            int remaining = len - 1 - i;
            if (s > 0 && remaining > 0 && remaining % s == 0) {
                out.append('.');
            }
        }
        return out.toString();
    }

    /**
     * Converts all of the characters in a given String to lower case.
     *
     * @param str A String.
     * @param accents if true, then besides converting the string to lower
     *        case, accented characters are also replaced with their
     *        versions without the diacritics.
     * @return The resulting String; the very same instance is returned
     *         when no character needs to change.
     */
    public static String toLowerCase(String str, boolean accents) {
        int len = str.length();
        // Locate the last character that actually changes, so that
        // already-normalized strings are returned without copying.
        int lastChanged = -1;
        for (int i = len - 1; i >= 0; i--) {
            char original = str.charAt(i);
            if (lowerChar(original, accents) != original) {
                lastChanged = i;
                break;
            }
        }
        if (lastChanged == -1) {
            return str;
        }
        char[] chars = str.toCharArray();
        for (int i = lastChanged; i >= 0; i--) {
            chars[i] = lowerChar(chars[i], accents);
        }
        return new String(chars);
    }

    /**
     * Returns an array with all the accented characters known to
     * {@link #replaceAccent(char)}.
     *
     * TODO: add diacritic characters outside the Latin-1 range.
     *
     * @return A fresh copy of the array of accented characters; changes
     *         made to it do not affect this class.
     */
    public static char[] getSpecialChars() {
        return (char[]) SPECIAL_CHARS.clone();
    }

    /**
     * Checks if the character at a given position of a given string is a
     * vowel. The Y character is also considered.
     *
     * TODO: Should portuguese accented characters be considered vowels?
     *
     * @param in A String.
     * @param at The position in the String.
     * @return true if the character at position at of the string in is a
     *         vowel and false otherwise (including when the position is
     *         out of range or the string is <code>null</code>).
     */
    public final static boolean isVowel(String in, int at) {
        return in != null && isVowel(in, at, in.length());
    }

    /**
     * Checks if the character at a given position of a given string is a
     * vowel. The Y character is also considered. The test is case
     * insensitive.
     *
     * TODO: Should portuguese accented characters be considered vowels?
     *
     * @param in A String.
     * @param at The position in the String.
     * @param length The maximum length of the String to check.
     * @return true if the character at position at of the string in is a
     *         vowel and false otherwise (including when the position is
     *         negative, not below <code>length</code>, or outside the
     *         string).
     */
    public static boolean isVowel(String in, int at, int length) {
        if (in == null || at < 0 || at >= length || at >= in.length()) {
            return false;
        }
        switch (Character.toLowerCase(in.charAt(at))) {
            case 'a':
            case 'e':
            case 'i':
            case 'o':
            case 'u':
            case 'y':
                return true;
            default:
                return false;
        }
    }

    /**
     * Checks if a given String is capitalizated.
     *
     * <p>A string is considered capitalizated when it is entirely upper
     * case, or when it equals the result of
     * {@link #capitalizate(String, boolean, boolean)} (with accents kept
     * and abbreviations preserved). Strings that start or end with a
     * lower-case particle such as "de", "of" or "the" are never
     * considered capitalizated.</p>
     *
     * @param str A String.
     * @return true if the given String is capitalizated and false
     *         otherwise (including for <code>null</code>, empty and
     *         whitespace-only input).
     */
    public static boolean isCapitalizated(String str) {
        if (str == null) {
            return false;
        }
        // Trim first so that whitespace-only input is treated as empty.
        str = str.trim();
        if (str.length() == 0) {
            return false;
        }
        for (int i = 0; i < BOUNDARY_PARTICLES.length; i++) {
            String particle = BOUNDARY_PARTICLES[i];
            if (str.endsWith(" " + particle)
                    || str.startsWith(particle + " ")) {
                return false;
            }
        }
        if (str.toUpperCase().equals(str)) {
            return true;
        }
        return capitalizate(str, false, true).equals(str);
    }

    /**
     * Capitalizates a given String, keeping accented characters and
     * lower-casing everything but the capitalized letters.
     *
     * @param str A String.
     * @return The capitalizated String.
     */
    public static String capitalizate(String str) {
        return capitalizate(str, false);
    }

    /**
     * Capitalizates a given String.
     *
     * @param str A String.
     * @param accents if true, accented characters are replaced with their
     *        versions without the diacritics while lower-casing.
     * @return The capitalizated String.
     */
    public static String capitalizate(String str, boolean accents) {
        return capitalizate(str, accents, false);
    }

    /**
     * Trims and capitalizates a given String, with specific rules for
     * Portuguese words.
     *
     * <p>Single words: a fixed list of particles ("de", "da", "of",
     * "the", ...) stays lower case; the prefixes "d'", "o'" and "mc" get
     * the following letter capitalized; hyphenated words are capitalized
     * on both sides of the first hyphen, except for Portuguese clitic
     * pronouns after the hyphen. Phrases are split on the first space and
     * both parts are capitalized recursively.</p>
     *
     * <p>NOTE(review): because phrases are processed recursively, the
     * "upper-case a leading particle" rule is applied to every
     * sub-phrase, so "casa de papel" becomes "Casa De Papel". This
     * existing behaviour is preserved here - confirm whether it is
     * intended.</p>
     *
     * @param str A String; must not be <code>null</code>.
     * @param accents if true, accented characters are replaced with their
     *        versions without the diacritics while lower-casing.
     * @param abbreviations if true, the original letter case of the
     *        remainder of a specially handled word is kept instead of
     *        being lower-cased.
     * @return The capitalizated String.
     */
    public static String capitalizate(String str, boolean accents,
            boolean abbreviations) {
        str = str.trim();
        String lowerCase = toLowerCase(str, accents);
        if (lowerCase.length() == 0) {
            return lowerCase;
        }
        int space = lowerCase.indexOf(' ');
        if (space == -1) {
            return capitalizateWord(str, lowerCase, accents, abbreviations);
        }
        return capitalizatePhrase(str, space, accents, abbreviations);
    }

    /** Capitalizes a single trimmed, non-empty word without spaces. */
    private static String capitalizateWord(String str, String lowerCase,
            boolean accents, boolean abbreviations) {
        if (isOneOf(lowerCase, LOWERCASE_WORDS)) {
            return lowerCase;
        }
        if (lowerCase.startsWith("d'") && lowerCase.length() > 2) {
            return lowerCase.charAt(0) + "'"
                    + Character.toUpperCase(str.charAt(2))
                    + tail(str, lowerCase, 3, abbreviations);
        }
        if (lowerCase.startsWith("o'") && lowerCase.length() > 2) {
            return Character.toUpperCase(str.charAt(0)) + "'"
                    + Character.toUpperCase(str.charAt(2))
                    + tail(str, lowerCase, 3, abbreviations);
        }
        if (lowerCase.startsWith("mc") && lowerCase.length() > 2) {
            return Character.toUpperCase(str.charAt(0)) + "c"
                    + Character.toUpperCase(str.charAt(2))
                    + tail(str, lowerCase, 3, abbreviations);
        }
        int hyphen = lowerCase.indexOf('-');
        if (hyphen > 0 && !lowerCase.endsWith("-")) {
            String after = capitalizateAfterHyphen(
                    str.substring(hyphen + 1), accents, abbreviations);
            // The part before the hyphen is an ordinary word: upper-case
            // its FIRST letter and keep/lower-case the rest up to the
            // hyphen.
            String middle = abbreviations ? str.substring(1, hyphen)
                    : lowerCase.substring(1, hyphen);
            return Character.toUpperCase(str.charAt(0)) + middle + "-"
                    + after;
        }
        return Character.toUpperCase(str.charAt(0))
                + lowerCase.substring(1);
    }

    /**
     * Capitalizes the text following the first hyphen of a word, leaving
     * Portuguese clitic pronouns and linking vowels in their given form.
     */
    private static String capitalizateAfterHyphen(String aux,
            boolean accents, boolean abbreviations) {
        if (aux.startsWith("o-") || aux.startsWith("a-")
                || aux.startsWith("e-")) {
            return aux.substring(0, 2)
                    + capitalizate(aux.substring(2), accents, abbreviations);
        }
        if (aux.startsWith("os-") || aux.startsWith("as-")) {
            return aux.substring(0, 3)
                    + capitalizate(aux.substring(3), accents, abbreviations);
        }
        boolean clitic = aux.startsWith("lh") || aux.equals("o")
                || aux.equals("a") || aux.equals("os")
                || aux.equals("as") || aux.equals("me")
                || aux.equals("mo");
        return clitic ? aux : capitalizate(aux, accents, abbreviations);
    }

    /**
     * Capitalizes a phrase by splitting it at the given space index and
     * capitalizing both parts.
     */
    private static String capitalizatePhrase(String str, int space,
            boolean accents, boolean abbreviations) {
        String result = capitalizate(str.substring(0, space), accents,
                abbreviations)
                + " "
                + capitalizate(str.substring(space + 1), accents,
                        abbreviations);
        if (startsWithLowercaseParticle(result)) {
            result = Character.toUpperCase(result.charAt(0))
                    + result.substring(1);
        }
        // The word right after "Jornal" always starts with a capital.
        int jornal = result.indexOf("Jornal ");
        if (jornal != -1 && jornal + 7 < result.length()) {
            result = result.substring(0, jornal) + "Jornal "
                    + Character.toUpperCase(result.charAt(jornal + 7))
                    + result.substring(jornal + 8);
        }
        return result;
    }

    /** Tells whether a capitalized phrase begins with a lower-case particle. */
    private static boolean startsWithLowercaseParticle(String result) {
        for (int i = 0; i < PHRASE_LEADING_WORDS.length; i++) {
            String word = PHRASE_LEADING_WORDS[i];
            if (result.equals(word) || result.startsWith(word + " ")) {
                return true;
            }
        }
        return result.startsWith("d'") || result.startsWith("o' ");
    }

    /** Returns the remainder of a word, in original or lower case. */
    private static String tail(String str, String lowerCase, int from,
            boolean abbreviations) {
        return abbreviations ? str.substring(from)
                : lowerCase.substring(from);
    }

    /** Tells whether a value equals one of the given candidates. */
    private static boolean isOneOf(String value, String[] candidates) {
        for (int i = 0; i < candidates.length; i++) {
            if (candidates[i].equals(value)) {
                return true;
            }
        }
        return false;
    }

    /** Lower-cases one character, optionally stripping its diacritic. */
    private static char lowerChar(char ch, boolean accents) {
        char lower = Character.toLowerCase(ch);
        return accents ? replaceAccent(lower) : lower;
    }

    /** Collects every Latin-1 character that replaceAccent changes. */
    private static char[] buildSpecialChars() {
        StringBuilder found = new StringBuilder();
        for (char c = '\u00C0'; c <= '\u00FF'; c++) {
            if (replaceAccent(c) != c) {
                found.append(c);
            }
        }
        return found.toString().toCharArray();
    }

}
|