001: /*
002: * This program is free software; you can redistribute it and/or modify
003: * it under the terms of the GNU General Public License as published by
004: * the Free Software Foundation; either version 2 of the License, or
005: * (at your option) any later version.
006: *
007: * This program is distributed in the hope that it will be useful,
008: * but WITHOUT ANY WARRANTY; without even the implied warranty of
009: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
010: * GNU General Public License for more details.
011: *
012: * You should have received a copy of the GNU General Public License
013: * along with this program; if not, write to the Free Software
014: * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
015: */
016:
017: /*
018: * AlphabeticTokenizer.java
019: * Copyright (C) 2003, 2007 University of Waikato, Hamilton, New Zealand
020: */
021:
022: package weka.core.tokenizers;
023:
024: import java.util.NoSuchElementException;
025:
026: /**
027: <!-- globalinfo-start -->
028: * Alphabetic string tokenizer, tokens are to be formed only from contiguous alphabetic sequences.
029: * <p/>
030: <!-- globalinfo-end -->
031: *
032: * @author Asrhaf M. Kibriya (amk14@cs.waikato.ac.nz)
033: * @author FracPete (fracpete at waikato dot ac dot nz)
034: * @version $Revision: 1.1 $
035: */
036: public class AlphabeticTokenizer extends Tokenizer {
037:
038: /** for serialization */
039: private static final long serialVersionUID = 6705199562609861697L;
040:
041: /** the characters of the string */
042: protected char[] m_Str;
043:
044: /** the current position */
045: protected int m_CurrentPos;
046:
047: /**
048: * Returns a string describing the stemmer
049: *
050: * @return a description suitable for displaying in the
051: * explorer/experimenter gui
052: */
053: public String globalInfo() {
054: return "Alphabetic string tokenizer, tokens are to be formed only from "
055: + "contiguous alphabetic sequences.";
056: }
057:
058: /**
059: * returns whether there are more elements still
060: *
061: * @return true if there are still more elements
062: */
063: public boolean hasMoreElements() {
064: int beginpos = m_CurrentPos;
065:
066: while ((beginpos < m_Str.length)
067: && ((m_Str[beginpos] < 'a') || (m_Str[beginpos] > 'z'))
068: && ((m_Str[beginpos] < 'A') || (m_Str[beginpos] > 'Z'))) {
069: beginpos++;
070: }
071: m_CurrentPos = beginpos;
072:
073: if ((beginpos < m_Str.length)
074: && (((m_Str[beginpos] >= 'a') && (m_Str[beginpos] <= 'z')) || ((m_Str[beginpos] >= 'A') && (m_Str[beginpos] <= 'Z')))) {
075: return true;
076: } else {
077: return false;
078: }
079: }
080:
081: /**
082: * returns the next element
083: *
084: * @return the next element
085: */
086: public Object nextElement() {
087: int beginpos, endpos;
088:
089: beginpos = m_CurrentPos;
090:
091: while ((beginpos < m_Str.length)
092: && ((m_Str[beginpos] < 'a') && (m_Str[beginpos] > 'z'))
093: && ((m_Str[beginpos] < 'A') && (m_Str[beginpos] > 'Z'))) {
094: beginpos++;
095: }
096: m_CurrentPos = endpos = beginpos;
097:
098: if (beginpos >= m_Str.length)
099: throw new NoSuchElementException("No more tokens present");
100:
101: while ((endpos < m_Str.length)
102: && (((m_Str[endpos] >= 'a') && (m_Str[endpos] <= 'z')) || ((m_Str[endpos] >= 'A') && (m_Str[endpos] <= 'Z')))) {
103: endpos++;
104: }
105:
106: String s = new String(m_Str, beginpos, endpos - m_CurrentPos);
107: m_CurrentPos = endpos;
108:
109: return s;
110: }
111:
112: /**
113: * Sets the string to tokenize. Tokenization happens immediately.
114: *
115: * @param s the string to tokenize
116: */
117: public void tokenize(String s) {
118: m_CurrentPos = 0;
119: m_Str = new char[s.length()];
120: s.getChars(0, s.length(), m_Str, 0);
121: }
122:
123: /**
124: * Runs the tokenizer with the given options and strings to tokenize.
125: * The tokens are printed to stdout.
126: *
127: * @param args the commandline options and strings to tokenize
128: */
129: public static void main(String[] args) {
130: runTokenizer(new AlphabeticTokenizer(), args);
131: }
132: }
|