001: /*
002: * Copyright 2001-2007 Hippo (www.hippo.nl)
003: *
004: * Licensed under the Apache License, Version 2.0 (the "License");
005: * you may not use this file except in compliance with the License.
006: * You may obtain a copy of the License at
007: *
008: * http://www.apache.org/licenses/LICENSE-2.0
009: *
010: * Unless required by applicable law or agreed to in writing, software
011: * distributed under the License is distributed on an "AS IS" BASIS,
012: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013: * See the License for the specific language governing permissions and
014: * limitations under the License.
015: */
016: package nl.hippo.cms.spellchecking;
017:
018: import java.text.BreakIterator;
019: import java.util.*;
020:
021: /**
022: * A word finder for normal text documents, which searches text for sequences
023: * of words and text blocks.This class also defines common methods and behaviour for the various word finding
024: * subclasses.
025: *
026: * @see java.util.StringTokenizer
027: * @see java.text.BreakIterator
028: * @see TeXWordFinder
029: * @see XMLWordFinder
030: * @author Bruno Martins
031: * @author Jeroen Reijn
032: */
033: public class DefaultWordFinder {
034:
035: /** A string with the current word for the word finder. */
036: protected String currentWord;
037:
038: /** A string with the word next to the current one. */
039: protected String nextWord;
040:
041: /** The index of the current word in the input text. */
042: protected int currentWordPos;
043:
044: /** The index of the next word in the input text. */
045: protected int nextWordPos;
046:
047: /** The index of the current segment in the input text. */
048: protected int currentSegmentPos;
049:
050: /** The index of the next segment in the input text. */
051: protected int nextSegmentPos;
052:
053: /** A boolean flag indicating if the current word marks the begining of a sentence. */
054: protected boolean startsSentence;
055:
056: /** The input text. */
057: protected String text;
058:
059: /** Solve the tokenization hard cases. */
060: protected boolean solveHardCases = false;
061:
062: /**
063: * An iterator over the input text.
064: *
065: * @see java.text.BreakIterator
066: */
067: protected BreakIterator sentenceIterator;
068:
069: /**
070: * Returns the current word N-gram from the input. An N-gram is defined as the
071: * word sequence between the current position and the next n words.
072: *
073: * @param n Number of consecutive words on the n-grams.
074: * @return A String with the current word N-gram.
075: */
076: public String currentWordGram(int n) {
077: String segment = currentSegment();
078: if (segment == null)
079: return null;
080: String s[] = splitWords(segment);
081: int j = 0;
082: StringBuffer s2 = new StringBuffer(s[j++]);
083: for (int k = j; k < n && k < s.length; k++) {
084: s2.append(" ");
085: s2.append(s[k]);
086: }
087: return s2.toString();
088: }
089:
090: /**
091: * Returns the current word N-gram from the input. An N-gram is defined as the
092: * character sequence between the current position and the next n characters.
093: *
094: * @param n Number of consecutive characters on the n-grams.
095: * @return A String with the current word N-gram.
096: */
097: public String currentNGram(int n) {
098: StringBuffer aux = new StringBuffer();
099: for (int i = currentWordPos; i < currentWordPos + n
100: && i < text.length(); i++)
101: aux.append(text.charAt(i));
102: return aux.toString();
103: }
104:
105: /**
106: * Returns the current text segment from the input. A segment is defined as the
107: * character sequence between the current position and the next non-alphanumeric character,
108: * considering also white spaces.
109: *
110: * @return A String with the current text segment.
111: */
112: public String currentSegment() {
113: if (currentSegmentPos >= text.length())
114: return null;
115: if (nextSegmentPos > currentSegmentPos)
116: return text.substring(currentSegmentPos, nextSegmentPos);
117: while (currentSegmentPos < text.length()) {
118: if (!isWordChar(text, currentSegmentPos))
119: currentSegmentPos++;
120: else
121: break;
122: }
123: if (currentSegmentPos >= text.length())
124: return null;
125: nextSegmentPos = currentSegmentPos + 1;
126: while (nextSegmentPos < text.length()) {
127: if (isWordChar(text, nextSegmentPos)
128: || text.charAt(nextSegmentPos) == ' ')
129: nextSegmentPos++;
130: else
131: break;
132: }
133: if (solveHardCases) {
134: String segment = text.substring(currentSegmentPos,
135: nextSegmentPos);
136: String segment2 = solveHardCases(segment);
137: if (segment2.length() > segment.length())
138: replaceSegment(segment2);
139: }
140: //JR
141: System.out.println(text.substring(currentSegmentPos,
142: nextSegmentPos));
143: return text.substring(currentSegmentPos, nextSegmentPos);
144: }
145:
146: /**
147: * Returns the next text segment from the input. A segment is defined as the
148: * character sequence between the current position and the next non-alphanumeric character,
149: * considering also white spaces.If there are no more segments to return, it retuns a null String.
150: *
151: * @return A String with the next text segment.
152: */
153: public String nextSegment() {
154: if (currentSegmentPos >= text.length())
155: return null;
156: currentSegmentPos = nextSegmentPos;
157: int oldWordPos = -1;
158: while (true) {
159: next();
160: if (currentWordPos == oldWordPos)
161: break;
162: if (currentWordPos >= currentSegmentPos)
163: break;
164: oldWordPos = currentWordPos;
165: }
166: return currentSegment();
167: }
168:
169: /**
170: * Replaces the current text segment. After a call to this method,
171: * a call to currentSegment() returns the new text segment and a call to getText()
172: * returns the text supplied to this WordFinder with the current segment replaced.
173: *
174: * @param newSegment A String with the new text segment.
175: */
176: public void replaceSegment(String newSegment) {
177: String currentSegment = currentSegment();
178: if (currentSegmentPos >= text.length()
179: || currentSegment == null)
180: return;
181: StringBuffer sb = new StringBuffer(text.substring(0,
182: currentSegmentPos));
183: sb.append(newSegment);
184: sb.append(text.substring(currentSegmentPos
185: + currentSegment.length()));
186: int diff = newSegment.length() - currentSegment.length();
187: nextSegmentPos += diff;
188: if (nextWord != null)
189: nextWordPos += diff;
190: text = sb.toString();
191: sentenceIterator.setText(text);
192: int start = currentWordPos;
193: sentenceIterator.following(start);
194: startsSentence = sentenceIterator.current() == start;
195: }
196:
197: /**
198: * Constructor for DefaultWordFinder.
199: *
200: * @param inText A String with the input text to tokenize.
201: */
202: public DefaultWordFinder(String inText) {
203: setText(inText);
204: }
205:
206: /**
207: * Constructor for DefaultWordFinder.
208: */
209: public DefaultWordFinder() {
210: this ("");
211: }
212:
213: /**
214: * Returns the text associated with this DefaultWordFinder.
215: *
216: *@return A String with the text associated with this DefaultWordFinder.
217: */
218: public String getText() {
219: return text;
220: }
221:
222: /**
223: * Changes the text associates with this DefaultWordFinder.
224: *
225: * @param newText The new String with the input text to tokenize.
226: */
227: public void setText(String newText) {
228: text = newText;
229: currentWord = new String("");
230: nextWord = new String("");
231: currentWordPos = 0;
232: nextWordPos = 0;
233: currentSegmentPos = 0;
234: nextSegmentPos = 0;
235: startsSentence = true;
236: sentenceIterator = BreakIterator.getSentenceInstance();
237: sentenceIterator.setText(text);
238: next();
239: }
240:
241: /**
242: * Returns the current word in the text.
243: *
244: * @return A String with the current word in the text.
245: */
246: public String current() {
247: return currentWord;
248: }
249:
250: /**
251: * Tests if there are more words available from the text.
252: *
253: * @return true if and only if there is at least one word in the
254: * string after the current position, and false otherwise.
255: */
256: public boolean hasNext() {
257: return nextWord != null;
258: }
259:
260: /**
261: * Replaces the current word in the text. After a call to this method,
262: * a call to current() returns the new word and a call to getText() returns the
263: * text supplied to this WordFinder with the current word replaced.
264: *
265: * @param newWord A string with the replacement word.
266: */
267: public void replace(String newWord) {
268: if (currentWord == null)
269: return;
270: StringBuffer sb = new StringBuffer(text.substring(0,
271: currentWordPos));
272: sb.append(newWord);
273: sb
274: .append(text.substring(currentWordPos
275: + currentWord.length()));
276: int diff = newWord.length() - currentWord.length();
277: nextSegmentPos += diff;
278: if (nextWord != null)
279: nextWordPos += diff;
280: text = sb.toString();
281: sentenceIterator.setText(text);
282: int start = currentWordPos;
283: sentenceIterator.following(start);
284: startsSentence = sentenceIterator.current() == start;
285: }
286:
287: /**
288: * Replaces the current bigram (current word and the next as returned by lookahead) in
289: * the text. After a call to this method, a call to current() returns the Bigram and a
290: * call to getText() returns the text supplied to this WordFinder with the current
291: * Bigram replaced.
292: *
293: * @param newBigram A string with the replacement Bigram.
294: */
295: public void replaceBigram(String newBigram) {
296: int startPos = currentWordPos;
297: String next = lookAhead();
298: if (next != null)
299: next();
300: if (currentWord == null)
301: return;
302: StringBuffer sb = new StringBuffer(text.substring(0, startPos));
303: sb.append(newBigram);
304: sb
305: .append(text.substring(currentWordPos
306: + currentWord.length()));
307: int diff = newBigram.length() - currentWord.length();
308: nextSegmentPos += diff;
309: if (nextWord != null)
310: nextWordPos += diff;
311: text = sb.toString();
312: sentenceIterator.setText(text);
313: int start = currentWordPos;
314: sentenceIterator.following(start);
315: startsSentence = sentenceIterator.current() == start;
316: }
317:
318: /**
319: * Retuns the next word without advancing the tokenizer, cheking if the character
320: * separating both words is an empty space. This is usefull for getting BiGrams from
321: * the text.
322: *
323: * @return The next word in the text, or null.
324: */
325: public String lookAhead() {
326: if (nextWord == null)
327: return null;
328: if (text.charAt(nextWordPos - 1) == ' ')
329: return nextWord;
330: else
331: return null;
332: }
333:
334: /**
335: * Checks if the current word marks the begining of a sentence.
336: *
337: * @return true if the current word marks the begining of
338: * a sentence and false otherwise.
339: */
340: public boolean startsSentence() {
341: if (currentWord == null)
342: return false;
343: return startsSentence;
344: }
345:
346: /**
347: * Produces a string representation of this word finder by returning
348: * the associated text.
349: */
350: public String toString() {
351: return text;
352: }
353:
354: /**
355: * Checks if the character at a given position in a String is part of a word.
356: * Special characters such as '.' or '-' are considered alphanumeric or not depending
357: * on the surrounding characters.
358: *
359: * @param text The text String.
360: * @param posn The position for the character in the String.
361: * @return true if the character at the given position is alphanumeric and false otherwise.
362: */
363: protected static boolean isWordChar(String text, int posn) {
364: if (posn < 0 || posn >= text.length())
365: return false;
366: boolean out = false;
367: char curr = text.charAt(posn);
368: if ((posn == 0) || (posn == text.length() - 1)) {
369: return Character.isLetterOrDigit(curr);
370: }
371: char prev = text.charAt(posn - 1);
372: char next = text.charAt(posn + 1);
373: String prevWord = "" + prev;
374: for (int i = posn - 2; i >= 0; i--) {
375: char chr = text.charAt(i);
376: if (chr == ' ' || chr == '\t' || chr == '\n' || chr == '\r')
377: break;
378: prevWord = chr + prevWord;
379: }
380: String prevWordLowerCase = prevWord.toLowerCase();
381: switch (curr) {
382: case '\'':
383: out = (Character.isLetter(prev) && Character.isLetter(next));
384: out |= (Character.isDigit(prev) && (!Character
385: .isLetterOrDigit(next) || next == '\''));
386: break;
387: case '$':
388: out = (Character.isDigit(prev) && Character.isDigit(next));
389: out |= (!Character.isLetterOrDigit(prev) && Character
390: .isDigit(next));
391: break;
392: case '@':
393: out = (Character.isLetterOrDigit(prev) && Character
394: .isLetterOrDigit(next));
395: break;
396: case '.':
397: out = (Character.isDigit(prev) && Character.isDigit(next));
398: out |= Character.isLetter(next)
399: && prevWord.indexOf('@') > 0;
400: out |= prevWord.startsWith("http://")
401: && Character.isLetterOrDigit(next);
402: out |= prevWord.startsWith("ftp://")
403: && Character.isLetterOrDigit(next);
404: out |= prevWord.startsWith("www")
405: && Character.isLetterOrDigit(next);
406: // Common abreviations
407: out |= prevWordLowerCase.equals("lda")
408: && (next == ' ' || Character.isLetter(next));
409: out |= prevWordLowerCase.equals("sr")
410: && (next == ' ' || Character.isLetter(next));
411: out |= prevWordLowerCase.equals("sra")
412: && (next == ' ' || Character.isLetter(next));
413: out |= prevWordLowerCase.equals("sr(a)")
414: && (next == ' ' || Character.isLetter(next));
415: out |= prevWordLowerCase.equals("dr")
416: && (next == ' ' || Character.isLetter(next));
417: out |= prevWordLowerCase.equals("dra")
418: && (next == ' ' || Character.isLetter(next));
419: out |= prevWordLowerCase.equals("dr(a)")
420: && (next == ' ' || Character.isLetter(next));
421: out |= prevWordLowerCase.equals("exmo")
422: && (next == ' ' || Character.isLetter(next));
423: out |= prevWordLowerCase.equals("exma")
424: && (next == ' ' || Character.isLetter(next));
425: out |= prevWordLowerCase.equals("exmo(a)")
426: && (next == ' ' || Character.isLetter(next));
427: break;
428: case ',':
429: out = (Character.isDigit(prev) && Character.isDigit(next));
430: break;
431: case '%':
432: out = (Character.isDigit(prev) && !Character
433: .isLetterOrDigit(next));
434: out |= prevWord.startsWith("http://")
435: && Character.isDigit(next);
436: out |= prevWord.startsWith("ftp://")
437: && Character.isDigit(next);
438: break;
439: case ':':
440: out = (Character.isDigit(prev) && Character.isDigit(next));
441: out |= prevWord.startsWith("http");
442: out |= prevWord.startsWith("ftp");
443: break;
444: case '/':
445: out = (Character.isDigit(prev) && Character.isDigit(next));
446: out |= prevWord.startsWith("http:");
447: out |= prevWord.startsWith("ftp:");
448: out |= prevWord.startsWith("www.");
449: break;
450: case '=':
451: out = prevWord.startsWith("http://")
452: && prevWord.indexOf("?") != -1
453: && Character.isLetterOrDigit(next);
454: out |= prevWord.startsWith("ftp://")
455: && prevWord.indexOf("?") != -1
456: && Character.isLetterOrDigit(next);
457: break;
458: case '?':
459: case '~':
460: out = prevWord.startsWith("http://")
461: && Character.isLetterOrDigit(next);
462: out |= prevWord.startsWith("ftp://")
463: && Character.isLetterOrDigit(next);
464: break;
465: case '+':
466: case '*':
467: out = (Character.isDigit(prev) && Character.isDigit(next));
468: break;
469: case '_':
470: out = (Character.isDigit(prev) && Character.isDigit(next));
471: out = (Character.isLetter(prev) && Character.isLetter(next) && StringUtils
472: .isUpperCase(next));
473: break;
474: case '-':
475: out = (Character.isDigit(prev) && Character.isDigit(next));
476: out = (Character.isLetter(prev) && Character.isLetter(next) && StringUtils
477: .isUpperCase(next));
478: break;
479: default:
480: out = Character.isLetterOrDigit(curr);
481: }
482: return out;
483: }
484:
485: /**
486: * Checks if a given character is alphanumeric.
487: *
488: * @param c The char to check.
489: * @return true if the given character is alphanumeric and false otherwise.
490: */
491: protected static boolean isWordChar(char c) {
492: boolean out = false;
493: if (Character.isLetterOrDigit(c) || (c == '\'')) {
494: out = true;
495: }
496: return out;
497: }
498:
499: /**
500: * Ignore all characters from the text after the first occurence of a given character.
501: *
502: * @param index A starting index for the text from where characters should be ignored
503: * @param startIgnore The character that marks the begining of the sequence to be ignored.
504: *
505: * @return the index in the text marking the begining of the ignored sequence, or -1 if no
506: * sequence was ignored (the supplied character does not occur in the text).
507: */
508: protected int ignore(int index, char startIgnore) {
509: return ignore(index, new Character(startIgnore), null);
510: }
511:
512: /**
513: * Ignore all characters from the text between the first occurence of a given character
514: * and the next occurence of another given character.
515: *
516: * @param index A starting index for the text from where characters should be ignored.
517: * @param startIgnore The character that marks the begining of the sequence to be ignored.
518: * @param endIgnore The character that marks the ending of the sequence to be ignored.
519: *
520: * @return the index in the text marking the begining of the ignored sequence, or -1 if no
521: * sequence was ignored (the supplied starting character does not occur in the text).
522: */
523: protected int ignore(int index, char startIgnore, char endIgnore) {
524: return ignore(index, new Character(startIgnore), new Character(
525: endIgnore));
526: }
527:
528: /**
529: * Ignore all characters from the text between the first occurence of a given character
530: * and the next occurence of another given character.
531: *
532: * @param index A starting index for the text from where characters should be ignored.
533: * @param startIgnore The character that marks the begining of the sequence to be ignored.
534: * @param endIgnore The character that marks the ending of the sequence to be ignored, or null
535: * if all the next characters from the text are to be ignored.
536: *
537: * @return the index in the text marking the begining of the ignored sequence, or -1 if no
538: * sequence was ignored (the supplied starting character does not occur in the text).
539: */
540: protected int ignore(int index, Character startIgnore,
541: Character endIgnore) {
542: if (index < 0 || index >= text.length())
543: return -1;
544: int newIndex = index;
545: if (newIndex < text.length()) {
546: Character curChar = new Character(text.charAt(newIndex));
547: if (curChar.equals(startIgnore)) {
548: newIndex++;
549: while (newIndex < text.length()) {
550: curChar = new Character(text.charAt(newIndex));
551: if (endIgnore != null && curChar.equals(endIgnore)) {
552: //[JR] not necessary because it will
553: //newIndex++;
554: break;
555: } else if (endIgnore == null
556: && !Character.isLetterOrDigit(curChar
557: .charValue())) {
558: break;
559: }
560: newIndex++;
561: }
562: }
563: }
564: return newIndex;
565: }
566:
567: /**
568: * Ignore all characters from the text between the first occurence of a given String
569: * and the next occurence of another given String.
570: *
571: * @param index A starting index for the text from where characters should be ignored.
572: * @param startIgnore The String that marks the begining of the sequence to be ignored.
573: * @param endIgnore The String that marks the ending of the sequence to be ignored.
574: *
575: * @return the index in the text marking the begining of the ignored sequence, or -1 if no
576: * sequence was ignored (the supplied starting String does not occur in the text).
577: */
578: protected int ignore(int index, String startIgnore, String endIgnore) {
579: int newIndex = index;
580: int len = text.length();
581: int slen = startIgnore.length();
582: int elen = endIgnore.length();
583: if (!((newIndex + slen) >= len)) {
584: String seg = text.substring(newIndex, newIndex + slen);
585: if (seg.equals(startIgnore)) {
586: newIndex += slen;
587: cycle: while (true) {
588: if (newIndex == (text.length() - elen))
589: break cycle;
590: String ss = text.substring(newIndex, newIndex
591: + elen);
592: if (ss.equals(endIgnore)) {
593: newIndex += elen;
594: break cycle;
595: } else
596: newIndex++;
597: }
598: }
599: }
600: return newIndex;
601: }
602:
603: /**
604: * This method scans the text from the end of the last word, and returns a
605: * String corresponding to the next word. If there are no more words to
606: * return, it retuns a null String.
607: *
608: * @return the next word.
609: */
610: public String next() {
611: if (nextWord == null)
612: return null;
613: currentWord = nextWord;
614: currentWordPos = nextWordPos;
615: int current = sentenceIterator.current();
616: if (current == currentWordPos)
617: startsSentence = true;
618: else {
619: startsSentence = false;
620: if (currentWordPos + currentWord.length() > current)
621: sentenceIterator.next();
622: }
623: int i = currentWordPos + currentWord.length();
624: boolean finished = false;
625: while (i < text.length() && !finished) {
626: if (isWordChar(text, i)) {
627: nextWordPos = i;
628: int end = getNextWordEnd(text, i);
629: nextWord = text.substring(i, end);
630: finished = true;
631: }
632: i++;
633: }
634: if (!finished)
635: nextWord = null;
636: if (solveHardCases) {
637: String aux = solveHardCases(currentWord);
638: int diff = aux.length() - currentWord.length();
639: if (diff > 0) {
640: StringBuffer sb = new StringBuffer(text.substring(0,
641: currentWordPos));
642: sb.append(aux);
643: sb.append(text.substring(currentWordPos
644: + currentWord.length()));
645: currentWord = aux.substring(0, aux.indexOf(" "));
646: nextWord = aux.substring(aux.indexOf(" ") + 1);
647: nextSegmentPos += diff;
648: text = sb.toString();
649: sentenceIterator.setText(text);
650: int start = currentWordPos;
651: sentenceIterator.following(start);
652: startsSentence = sentenceIterator.current() == start;
653: }
654: }
655: return currentWord;
656: }
657:
658: /**
659: * Returns the position in the string <em>after</em> the end of the next word.
660: *
661: * Note that this return value should not be used as an index into the string
662: * without checking first that it is in range, since it is possible for the
663: * value <code>text.length()</code> to be returned by this method.
664: *
665: * @param text A string with the text to check.
666: * @param startPos the starting position in the text to check.
667: * @return the index position in the string after the end of the next word.
668: */
669: private static int getNextWordEnd(String text, int startPos) {
670: for (int i = startPos; i < text.length(); i++) {
671: if (!isWordChar(text, i))
672: return i;
673: }
674: return text.length();
675: }
676:
677: /**
678: * Splits a given String into an array with its constituent words.
679: *
680: * @param text A String.
681: * @return An array with the words extracted from the String.
682: */
683: public static String[] splitWords(String text) {
684: List aux = new Vector();
685: DefaultWordFinder finder = new DefaultWordFinder(text);
686: String str;
687: while ((str = finder.next()) != null)
688: aux.add(str);
689: return (String[]) (aux.toArray(new String[0]));
690: }
691:
692: /**
693: * Splits a given String into an array with its constituent text segments.
694: *
695: * @param text A String.
696: * @return An array with the text segments extracted from the String.
697: */
698: public static String[] splitSegments(String text) {
699: List aux = new Vector();
700: DefaultWordFinder finder = new DefaultWordFinder(text);
701: String str;
702: while ((str = finder.nextSegment()) != null)
703: aux.add(str);
704: return (String[]) (aux.toArray(new String[0]));
705: }
706:
707: /**
708: * Splits a given String into an array with its constituent word n-grams.
709: *
710: * @param text A String.
711: * @param n Number of consecutive words on the n-grams.
712: * @return An array with the word n-grams extracted from the String.
713: */
714: public static String[] splitWordGrams(String text, int n) {
715: String aux[] = splitSegments(text);
716: List list = new Vector();
717: for (int i = 0; i < aux.length; i++) {
718: String s[] = splitWords(aux[i]);
719: int j = 0;
720: do {
721: StringBuffer s2 = new StringBuffer(s[j++]);
722: for (int k = j; k < n && k < s.length; k++) {
723: s2.append(" ");
724: s2.append(s[k]);
725: }
726: list.add(s2.toString());
727: } while (j < s.length - n);
728: }
729: return (String[]) (list.toArray(new String[0]));
730: }
731:
732: /**
733: * Splits a given String into an array with its constituent character n-grams.
734: *
735: * @param text A String.
736: * @param n Number of consecutive characters on the n-grams.
737: * @return An array with the character n-grams extracted from the String.
738: */
739: public static String[] splitNGrams(String text, int n) {
740: int lastn[] = new int[n];
741: List list = new Vector();
742: for (int i = 0; i < text.length(); i++) {
743: if (i < n)
744: lastn[i] = 0;
745: else {
746: for (int j = 0; j < n - 1; j++)
747: lastn[j] = lastn[j + 1];
748: lastn[n - 1] = text.charAt(i);
749: StringBuffer aux = new StringBuffer();
750: for (int j = 0; j < n; j++)
751: aux.append((char) lastn[j]);
752: list.add(aux.toString());
753: }
754: }
755: return (String[]) (list.toArray(new String[0]));
756: }
757:
758: /**
759: * Resolves the hard tokenization cases which envolve splitting the original
760: * word in two words (e.g. doesn't -> "does not").
761: *
762: * TODO: Disambiguate some cases.
763: *
764: * @param text A string.
765: * @return The string with the hard cases solved.
766: */
767: private static String solveHardCases(String text) {
768: String tokens[] = text.split(" ");
769: StringBuffer newString = new StringBuffer();
770: for (int i = 0; i < tokens.length; i++) {
771: if (i != 0)
772: newString.append(" ");
773: String aux = tokens[i].toLowerCase();
774: if (aux.equals("daquilo"))
775: aux = tokens[i].charAt(0) + "e aquilo";
776: else if (aux.equals("disso"))
777: aux = tokens[i].charAt(0) + "e isso";
778: else if (aux.equals("disto"))
779: aux = tokens[i].charAt(0) + "e isto";
780: else if (aux.equals("dele"))
781: aux = tokens[i].charAt(0) + "e ele";
782: else if (aux.equals("dela"))
783: aux = tokens[i].charAt(0) + "e ela";
784: else if (aux.equals("deles"))
785: aux = tokens[i].charAt(0) + "e eles";
786: else if (aux.equals("delas"))
787: aux = tokens[i].charAt(0) + "e elas";
788: else if (aux.equals("do"))
789: aux = tokens[i].charAt(0) + "e o";
790: else if (aux.equals("dos"))
791: aux = tokens[i].charAt(0) + "e os";
792: else if (aux.equals("da"))
793: aux = tokens[i].charAt(0) + "e a";
794: else if (aux.equals("das"))
795: aux = tokens[i].charAt(0) + "e as";
796: else if (aux.equals("pelo"))
797: aux = tokens[i].charAt(0) + "or o";
798: else if (aux.equals("pela"))
799: aux = tokens[i].charAt(0) + "or a";
800: else if (aux.equals("pelos"))
801: aux = tokens[i].charAt(0) + "or os";
802: else if (aux.equals("pelas"))
803: aux = tokens[i].charAt(0) + "or as";
804: else if (aux.equals("p'lo"))
805: aux = tokens[i].charAt(0) + "or o";
806: else if (aux.equals("p'la"))
807: aux = tokens[i].charAt(0) + "or a";
808: else if (aux.equals("p'los"))
809: aux = tokens[i].charAt(0) + "or os";
810: else if (aux.equals("p'las"))
811: aux = tokens[i].charAt(0) + "or as";
812: else if (aux.equals("p'ra"))
813: aux = tokens[i].charAt(0) + "ara a";
814: else if (aux.equals("p'ro"))
815: aux = tokens[i].charAt(0) + "ara o";
816: else if (aux.equals("p'ras"))
817: aux = tokens[i].charAt(0) + "ara as";
818: else if (aux.equals("p'ros"))
819: aux = tokens[i].charAt(0) + "ara os";
820: else if (aux.equals("deste"))
821: aux = tokens[i].charAt(0) + "e este";
822: else if (aux.equals("destes"))
823: aux = tokens[i].charAt(0) + "e estes";
824: else if (aux.equals("desta"))
825: aux = tokens[i].charAt(0) + "e esta";
826: else if (aux.equals("destas"))
827: aux = tokens[i].charAt(0) + "e estas";
828: else if (aux.equals("desse"))
829: aux = tokens[i].charAt(0) + "e esse";
830: else if (aux.equals("desses"))
831: aux = tokens[i].charAt(0) + "e esses";
832: else if (aux.equals("dessa"))
833: aux = tokens[i].charAt(0) + "e essa";
834: else if (aux.equals("dessas"))
835: aux = tokens[i].charAt(0) + "e essas";
836: else if (aux.equals("i'm"))
837: aux = tokens[i].charAt(0) + " am";
838: else if (aux.equals("don't"))
839: aux = tokens[i].charAt(0) + "o not";
840: else if (aux.equals("won't"))
841: aux = tokens[i].charAt(0) + "ill not";
842: else if (aux.equals("haven't"))
843: aux = tokens[i].charAt(0) + "ave not";
844: else if (aux.equals("does't"))
845: aux = tokens[i].charAt(0) + "oes not";
846: else if (aux.equals("dessa"))
847: aux = tokens[i].charAt(0) + "e essa";
848: else if (aux.equals("dessas"))
849: aux = tokens[i].charAt(0) + "e essas";
850: else if (aux.equals("na"))
851: aux = Character.isUpperCase(tokens[i].charAt(0)) ? "Em a"
852: : "em a";
853: else if (aux.equals("nas"))
854: aux = Character.isUpperCase(tokens[i].charAt(0)) ? "Em as"
855: : "em as";
856: else if (aux.equals("no"))
857: aux = Character.isUpperCase(tokens[i].charAt(0)) ? "Em o"
858: : "em o";
859: else if (aux.equals("nos"))
860: aux = Character.isUpperCase(tokens[i].charAt(0)) ? "Em os"
861: : "em os";
862: else if (aux.equals("num"))
863: aux = Character.isUpperCase(tokens[i].charAt(0)) ? "Em um"
864: : "em um";
865: else if (aux.equals("nuns"))
866: aux = Character.isUpperCase(tokens[i].charAt(0)) ? "Em uns"
867: : "em uns";
868: else if (aux.equals("nele"))
869: aux = Character.isUpperCase(tokens[i].charAt(0)) ? "Em ele"
870: : "em ele";
871: else if (aux.equals("nela"))
872: aux = Character.isUpperCase(tokens[i].charAt(0)) ? "Em ela"
873: : "em ela";
874: else if (aux.equals("neles"))
875: aux = Character.isUpperCase(tokens[i].charAt(0)) ? "Em eles"
876: : "em eles";
877: else if (aux.equals("nelas"))
878: aux = Character.isUpperCase(tokens[i].charAt(0)) ? "Em elas"
879: : "em elas";
880: else if (aux.equals("nisto"))
881: aux = Character.isUpperCase(tokens[i].charAt(0)) ? "Em isto"
882: : "em isto";
883: else if (aux.equals("naquilo"))
884: aux = Character.isUpperCase(tokens[i].charAt(0)) ? "Em aquilo"
885: : "em aquilo";
886: newString.append(aux);
887: }
888: return newString.toString();
889: }
890:
891: }
|