001: /*
002: * Copyright 2001-2007 Hippo (www.hippo.nl)
003: *
004: * Licensed under the Apache License, Version 2.0 (the "License");
005: * you may not use this file except in compliance with the License.
006: * You may obtain a copy of the License at
007: *
008: * http://www.apache.org/licenses/LICENSE-2.0
009: *
010: * Unless required by applicable law or agreed to in writing, software
011: * distributed under the License is distributed on an "AS IS" BASIS,
012: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013: * See the License for the specific language governing permissions and
014: * limitations under the License.
015: */
016: package nl.hippo.cms.spellchecking;
017:
018: import java.util.List;
019: import java.util.Vector;
020:
021: /**
022: * A word finder for XMLdocuments, which searches text for
023: * sequences of letters, but ignores tags.
024: *
025: * @see DefaultWordFinder
026: * @author Bruno Martins
027: */
028: public class XMLWordFinder extends DefaultWordFinder {
029:
030: /**
031: * Constructor for XMLWordFinder.
032: *
033: * @param inText A String with the input text to tokenize.
034: */
035: public XMLWordFinder(String inText) {
036: super (inText);
037: }
038:
039: /**
040: * Constructor for XMLWordFinder.
041: */
042: public XMLWordFinder() {
043: super ();
044: }
045:
046: /**
047: * Returns the current text segment from the input. A segment is defined as the
048: * character sequence between the current position and the next non-alphanumeric character,
049: * considering also white spaces.
050: *
051: * @return A String with the current text segment.
052: */
053: public String currentSegment() {
054: String seg = super .currentSegment();
055: while (seg != null && seg.startsWith("<") && seg.endsWith("<")) {
056: nextSegment();
057: seg = super .currentSegment();
058: }
059: return seg;
060: }
061:
062: /**
063: * This method scans the text from the end of the last word, and returns a
064: * String corresponding to the next word. If there are no more words to
065: * return, it retuns a null String.
066: *
067: * @return the next word.
068: */
069: public String next() {
070: if (!hasNext())
071: return null;
072: if (currentWord == null)
073: return null;
074: currentWord = nextWord;
075: currentWordPos = nextWordPos;
076: int current = sentenceIterator.current();
077: if (current == currentWordPos)
078: startsSentence = true;
079: else {
080: startsSentence = false;
081: if (currentWordPos + currentWord.length() > current)
082: sentenceIterator.next();
083: }
084: int i = currentWordPos + currentWord.length();
085: boolean finished = false;
086: boolean started = false;
087: search: /* Find words. */
088: while (i < text.length() && !finished) {
089: if (!started && isWordChar(text, i)) {
090: nextWordPos = i++;
091: started = true;
092: continue search;
093: } else if (started) {
094: if (isWordChar(text, i)) {
095: i++;
096: continue search;
097: } else {
098: nextWord = text.substring(nextWordPos, i);
099: finished = true;
100: break search;
101: }
102: }
103: //Ignore things inside tags.
104: i = ignore(i, '<', '>');
105: i++;
106: }
107: if (!started)
108: nextWord = null;
109: else if (!finished)
110: nextWord = text.substring(nextWordPos, i);
111: return currentWord;
112: }
113:
114: /**
115: * Splits a given String into an array with its constituent words.
116: *
117: * @param text A String.
118: * @return An array with the words extracted from the String.
119: */
120: public static String[] splitWords(String text) {
121: List aux = new Vector();
122: XMLWordFinder finder = new XMLWordFinder(text);
123: String str;
124: while ((str = finder.next()) != null) {
125: aux.add(str);
126: }
127: return (String[]) (aux.toArray(new String[0]));
128: }
129:
130: /**
131: * Splits a given String into an array with its constituent text segments.
132: *
133: * @param text A String.
134: * @return An array with the text segments extracted from the String.
135: */
136: public static String[] splitSegments(String text) {
137: List aux = new Vector();
138: XMLWordFinder finder = new XMLWordFinder(text);
139: String str;
140: while ((str = finder.nextSegment()) != null) {
141: aux.add(str);
142: System.out.println("splitSegments: " + str);
143: }
144: return (String[]) (aux.toArray(new String[0]));
145: }
146:
147: }
|