001: /*
002: * This program is free software; you can redistribute it and/or modify
003: * it under the terms of the GNU General Public License as published by
004: * the Free Software Foundation; either version 2 of the License, or
005: * (at your option) any later version.
006: *
007: * This program is distributed in the hope that it will be useful,
008: * but WITHOUT ANY WARRANTY; without even the implied warranty of
009: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
010: * GNU General Public License for more details.
011: *
012: * You should have received a copy of the GNU General Public License
013: * along with this program; if not, write to the Free Software
014: * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
015: */
016:
017: /*
018: * NGramTokenizer.java
019: * Copyright (C) 2007 Sebastian Germesin
020: */
021:
022: package weka.core.tokenizers;
023:
024: import weka.core.Option;
025: import weka.core.Utils;
026:
027: import java.util.Enumeration;
028: import java.util.LinkedList;
029: import java.util.Vector;
030:
031: /**
032: <!-- globalinfo-start -->
033: * Splits a string into an n-gram with min and max grams.
034: * <p/>
035: <!-- globalinfo-end -->
036: *
037: <!-- options-start -->
038: * Valid options are: <p/>
039: *
040: * <pre> -delimiters <value>
041: * The delimiters to use
042: * (default ' \n\t.,;:'"()?!').</pre>
043: *
044: * <pre> -max <int>
045: * The max size of the Ngram (default = 3).</pre>
046: *
047: * <pre> -min <int>
048: * The min size of the Ngram (default = 1).</pre>
049: *
050: <!-- options-end -->
051: *
052: * @author Sebastian Germesin (sebastian.germesin@dfki.de)
053: * @author FracPete (fracpete at waikato dot ac dot nz)
054: * @version $Revision: 1.1 $
055: */
056: public class NGramTokenizer extends CharacterDelimitedTokenizer {
057:
058: /** for serialization */
059: private static final long serialVersionUID = -2181896254171647219L;
060:
061: /** the maximum number of N */
062: protected int m_NMax = 3;
063:
064: /** the minimum number of N */
065: protected int m_NMin = 1;
066:
067: /** the current length of the N-grams */
068: protected int m_N;
069:
070: /** the number of strings available */
071: protected int m_MaxPosition;
072:
073: /** the current position for returning elements */
074: protected int m_CurrentPosition;
075:
076: /** all the available grams */
077: protected String[] m_SplitString;
078:
079: /**
080: * Returns a string describing the stemmer
081: *
082: * @return a description suitable for displaying in the
083: * explorer/experimenter gui
084: */
085: public String globalInfo() {
086: return "Splits a string into an n-gram with min and max grams.";
087: }
088:
089: /**
090: * Returns an enumeration of all the available options..
091: *
092: * @return an enumeration of all available options.
093: */
094: public Enumeration listOptions() {
095: Vector result;
096: Enumeration enm;
097:
098: result = new Vector();
099:
100: enm = super .listOptions();
101: while (enm.hasMoreElements())
102: result.addElement(enm.nextElement());
103:
104: result.addElement(new Option(
105: "\tThe max size of the Ngram (default = 3).", "max", 1,
106: "-max <int>"));
107:
108: result.addElement(new Option(
109: "\tThe min size of the Ngram (default = 1).", "min", 1,
110: "-min <int>"));
111:
112: return result.elements();
113: }
114:
115: /**
116: * Gets the current option settings for the OptionHandler.
117: *
118: * @return the list of current option settings as an array of
119: * strings
120: */
121: public String[] getOptions() {
122: Vector<String> result;
123: String[] options;
124: int i;
125:
126: result = new Vector<String>();
127:
128: options = super .getOptions();
129: for (i = 0; i < options.length; i++)
130: result.add(options[i]);
131:
132: result.add("-max");
133: result.add("" + getNGramMaxSize());
134:
135: result.add("-min");
136: result.add("" + getNGramMinSize());
137:
138: return result.toArray(new String[result.size()]);
139: }
140:
141: /**
142: * Parses a given list of options. <p/>
143: *
144: <!-- options-start -->
145: * Valid options are: <p/>
146: *
147: * <pre> -delimiters <value>
148: * The delimiters to use
149: * (default ' \n\t.,;:'"()?!').</pre>
150: *
151: * <pre> -max <int>
152: * The max size of the Ngram (default = 3).</pre>
153: *
154: * <pre> -min <int>
155: * The min size of the Ngram (default = 1).</pre>
156: *
157: <!-- options-end -->
158: *
159: * @param options the list of options as an array of strings
160: * @throws Exception if an option is not supported
161: */
162: public void setOptions(String[] options) throws Exception {
163: String value;
164:
165: super .setOptions(options);
166:
167: value = Utils.getOption("max", options);
168: if (value.length() != 0)
169: setNGramMaxSize(Integer.parseInt(value));
170: else
171: setNGramMaxSize(3);
172:
173: value = Utils.getOption("min", options);
174: if (value.length() != 0)
175: setNGramMinSize(Integer.parseInt(value));
176: else
177: setNGramMinSize(1);
178: }
179:
180: /**
181: * Gets the max N of the NGram.
182: *
183: * @return the size (N) of the NGram.
184: */
185: public int getNGramMaxSize() {
186: return m_NMax;
187: }
188:
189: /**
190: * Sets the max size of the Ngram.
191: *
192: * @param value the size of the NGram.
193: */
194: public void setNGramMaxSize(int value) {
195: if (value < 1)
196: m_NMax = 1;
197: else
198: m_NMax = value;
199: }
200:
201: /**
202: * Returns the tip text for this property.
203: *
204: * @return tip text for this property suitable for
205: * displaying in the explorer/experimenter gui
206: */
207: public String NGramMaxSizeTipText() {
208: return "The max N of the NGram.";
209: }
210:
211: /**
212: * Sets the min size of the Ngram.
213: *
214: * @param value the size of the NGram.
215: */
216: public void setNGramMinSize(int value) {
217: if (value < 1)
218: m_NMin = 1;
219: else
220: m_NMin = value;
221: }
222:
223: /**
224: * Gets the min N of the NGram.
225: *
226: * @return the size (N) of the NGram.
227: */
228: public int getNGramMinSize() {
229: return m_NMin;
230: }
231:
232: /**
233: * Returns the tip text for this property.
234: *
235: * @return tip text for this property suitable for
236: * displaying in the explorer/experimenter gui
237: */
238: public String NGramMinSizeTipText() {
239: return "The min N of the NGram.";
240: }
241:
242: /**
243: * returns true if there's more elements available
244: *
245: * @return true if there are more elements available
246: */
247: public boolean hasMoreElements() {
248: return (m_CurrentPosition < m_MaxPosition
249: && m_N - 1 + m_CurrentPosition < m_MaxPosition && m_N >= m_NMin);
250: }
251:
252: /**
253: * Returns N-grams and also (N-1)-grams and .... and 1-grams.
254: *
255: * @return the next element
256: */
257: public Object nextElement() {
258: String retValue = "";
259:
260: for (int i = 0; i < m_N
261: && i + m_CurrentPosition < m_MaxPosition; i++)
262: retValue += " " + m_SplitString[m_CurrentPosition + i];
263:
264: m_CurrentPosition++;
265:
266: if (m_CurrentPosition + m_N - 1 == m_MaxPosition) {
267: m_CurrentPosition = 0;
268: m_N--;
269: }
270:
271: return retValue.trim();
272: }
273:
274: /**
275: * filters out empty strings in m_SplitString and
276: * replaces m_SplitString with the cleaned version.
277: *
278: * @see #m_SplitString
279: */
280: protected void filterOutEmptyStrings() {
281: String[] newSplit;
282: LinkedList<String> clean = new LinkedList<String>();
283:
284: for (int i = 0; i < m_SplitString.length; i++) {
285: if (!m_SplitString[i].equals(""))
286: clean.add(m_SplitString[i]);
287: }
288:
289: newSplit = new String[clean.size()];
290: for (int i = 0; i < clean.size(); i++)
291: newSplit[i] = clean.get(i);
292:
293: m_SplitString = newSplit;
294: }
295:
296: /**
297: * Sets the string to tokenize. Tokenization happens immediately.
298: *
299: * @param s the string to tokenize
300: */
301: public void tokenize(String s) {
302: m_N = m_NMax;
303: m_SplitString = s.split("[" + getDelimiters() + "]");
304:
305: filterOutEmptyStrings();
306:
307: m_CurrentPosition = 0;
308: m_MaxPosition = m_SplitString.length;
309: }
310:
311: /**
312: * Runs the tokenizer with the given options and strings to tokenize.
313: * The tokens are printed to stdout.
314: *
315: * @param args the commandline options and strings to tokenize
316: */
317: public static void main(String[] args) {
318: runTokenizer(new NGramTokenizer(), args);
319: }
320: }
|