001: /*
002: ***********************************************************************
003: * Copyright (C) 2005, International Business Machines Corporation and *
004: * others. All Rights Reserved. *
005: ***********************************************************************
006: *
007: */
008:
009: package com.ibm.icu.dev.tool.charsetdet.sbcs;
010:
011: import com.ibm.icu.text.UnicodeSet;
012:
013: /**
014: * @author emader
015: *
016: * TODO To change the template for this generated type comment go to
017: * Window - Preferences - Java - Code Style - Code Templates
018: */
019: public class NGramParser {
020:
021: public interface NGramParserClient {
022: char nextChar();
023:
024: void handleNGram(String key);
025: }
026:
027: private static final int A_NULL = 0;
028: private static final int A_ADDC = 1;
029: private static final int A_ADDS = 2;
030:
031: /*
032: * Character classes
033: */
034: public static final int C_IGNORE = 0;
035: public static final int C_LETTER = 1;
036: public static final int C_PUNCT = 2;
037:
038: private static final int S_START = 0;
039: private static final int S_LETTER = 1;
040: private static final int S_PUNCT = 2;
041:
042: static final class StateEntry {
043: private int newState;
044: private int action;
045:
046: StateEntry(int theState, int theAction) {
047: newState = theState;
048: action = theAction;
049: }
050:
051: public int getNewState() {
052: return newState;
053: }
054:
055: public int getAction() {
056: return action;
057: }
058: }
059:
060: private StateEntry[][] stateTable = {
061: { new StateEntry(S_START, A_NULL),
062: new StateEntry(S_LETTER, A_ADDC),
063: new StateEntry(S_PUNCT, A_ADDS) },
064: { new StateEntry(S_LETTER, A_NULL),
065: new StateEntry(S_LETTER, A_ADDC),
066: new StateEntry(S_PUNCT, A_ADDS) },
067: { new StateEntry(S_PUNCT, A_NULL),
068: new StateEntry(S_LETTER, A_ADDC),
069: new StateEntry(S_PUNCT, A_NULL) } };
070:
071: protected final int N_GRAM_SIZE = 3;
072:
073: private char[] letters = new char[N_GRAM_SIZE];
074: private int letterCount;
075:
076: private static UnicodeSet letterSet = new UnicodeSet("[:letter:]");
077:
078: private NGramParserClient client;
079:
080: /**
081: *
082: */
083: public NGramParser(NGramParserClient theClient) {
084: client = theClient;
085: letterCount = 0;
086: }
087:
088: public void setClient(NGramParserClient theClient) {
089: client = theClient;
090: }
091:
092: // TODO Is this good enough, or are there other C_IGNORE characters?
093: // TODO Could this make Latin letters C_PUNCT for non-Latin scripts?
094: public static int getCharClass(char ch) {
095: if (ch == '\'' || ch == '\uFEFF') {
096: return C_IGNORE;
097: }
098:
099: if (letterSet.contains(ch)) {
100: return C_LETTER;
101: }
102:
103: return C_PUNCT;
104: }
105:
106: public void reset() {
107: letterCount = 0;
108: }
109:
110: public void addLetter(char letter) {
111: // somewhat clever stuff goes here...
112: letters[letterCount++] = letter;
113:
114: if (letterCount >= N_GRAM_SIZE) {
115: String key = new String(letters);
116:
117: client.handleNGram(key);
118:
119: letterCount = N_GRAM_SIZE - 1;
120: for (int i = 0; i < letterCount; i += 1) {
121: letters[i] = letters[i + 1];
122: }
123: }
124: }
125:
126: public void parse() {
127: char ch;
128: int state = 0;
129:
130: // this is where the clever stuff goes...
131: while ((ch = client.nextChar()) != 0) {
132: int charClass = getCharClass(ch);
133: StateEntry entry = stateTable[state][charClass];
134:
135: state = entry.getNewState();
136:
137: switch (entry.getAction()) {
138: case A_ADDC:
139: addLetter(Character.toLowerCase(ch));
140: break;
141:
142: case A_ADDS:
143: addLetter(' ');
144: break;
145:
146: case A_NULL:
147: default:
148: break;
149: }
150: }
151:
152: addLetter(' ');
153: }
154: }
|