001: package org.apache.lucene.analysis;
002:
003: import java.util.AbstractSet;
004: import java.util.Collection;
005: import java.util.Iterator;
006:
007: /**
008: * Licensed to the Apache Software Foundation (ASF) under one or more
009: * contributor license agreements. See the NOTICE file distributed with
010: * this work for additional information regarding copyright ownership.
011: * The ASF licenses this file to You under the Apache License, Version 2.0
012: * (the "License"); you may not use this file except in compliance with
013: * the License. You may obtain a copy of the License at
014: *
015: * http://www.apache.org/licenses/LICENSE-2.0
016: *
017: * Unless required by applicable law or agreed to in writing, software
018: * distributed under the License is distributed on an "AS IS" BASIS,
019: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
020: * See the License for the specific language governing permissions and
021: * limitations under the License.
022: */
023:
024: /**
025: * A simple class that stores Strings as char[]'s in a
026: * hash table. Note that this is not a general purpose
027: * class. For example, it cannot remove items from the
028: * set, nor does it resize its hash table to be smaller,
029: * etc. It is designed to be quick to test if a char[]
030: * is in the set without the necessity of converting it
031: * to a String first.
032: */
033:
034: public class CharArraySet extends AbstractSet {
035: private final static int INIT_SIZE = 8;
036: private char[][] entries;
037: private int count;
038: private final boolean ignoreCase;
039:
040: /** Create set with enough capacity to hold startSize
041: * terms */
042: public CharArraySet(int startSize, boolean ignoreCase) {
043: this .ignoreCase = ignoreCase;
044: int size = INIT_SIZE;
045: while (startSize + (startSize >> 2) > size)
046: size <<= 1;
047: entries = new char[size][];
048: }
049:
050: /** Create set from a Collection of char[] or String */
051: public CharArraySet(Collection c, boolean ignoreCase) {
052: this (c.size(), ignoreCase);
053: addAll(c);
054: }
055:
056: /** true if the <code>len</code> chars of <code>text</code> starting at <code>off</code>
057: * are in the set */
058: public boolean contains(char[] text, int off, int len) {
059: return entries[getSlot(text, off, len)] != null;
060: }
061:
062: /** true if the <code>CharSequence</code> is in the set */
063: public boolean contains(CharSequence cs) {
064: return entries[getSlot(cs)] != null;
065: }
066:
067: private int getSlot(char[] text, int off, int len) {
068: int code = getHashCode(text, off, len);
069: int pos = code & (entries.length - 1);
070: char[] text2 = entries[pos];
071: if (text2 != null && !equals(text, off, len, text2)) {
072: final int inc = ((code >> 8) + code) | 1;
073: do {
074: code += inc;
075: pos = code & (entries.length - 1);
076: text2 = entries[pos];
077: } while (text2 != null && !equals(text, off, len, text2));
078: }
079: return pos;
080: }
081:
082: /** Returns true if the String is in the set */
083: private int getSlot(CharSequence text) {
084: int code = getHashCode(text);
085: int pos = code & (entries.length - 1);
086: char[] text2 = entries[pos];
087: if (text2 != null && !equals(text, text2)) {
088: final int inc = ((code >> 8) + code) | 1;
089: do {
090: code += inc;
091: pos = code & (entries.length - 1);
092: text2 = entries[pos];
093: } while (text2 != null && !equals(text, text2));
094: }
095: return pos;
096: }
097:
098: /** Add this CharSequence into the set */
099: public boolean add(CharSequence text) {
100: return add(text.toString()); // could be more efficient
101: }
102:
103: /** Add this String into the set */
104: public boolean add(String text) {
105: return add(text.toCharArray());
106: }
107:
108: /** Add this char[] directly to the set.
109: * If ignoreCase is true for this Set, the text array will be directly modified.
110: * The user should never modify this text array after calling this method.
111: */
112: public boolean add(char[] text) {
113: if (ignoreCase)
114: for (int i = 0; i < text.length; i++)
115: text[i] = Character.toLowerCase(text[i]);
116: int slot = getSlot(text, 0, text.length);
117: if (entries[slot] != null)
118: return false;
119: entries[slot] = text;
120: count++;
121:
122: if (count + (count >> 2) > entries.length) {
123: rehash();
124: }
125:
126: return true;
127: }
128:
129: private boolean equals(char[] text1, int off, int len, char[] text2) {
130: if (len != text2.length)
131: return false;
132: if (ignoreCase) {
133: for (int i = 0; i < len; i++) {
134: if (Character.toLowerCase(text1[off + i]) != text2[i])
135: return false;
136: }
137: } else {
138: for (int i = 0; i < len; i++) {
139: if (text1[off + i] != text2[i])
140: return false;
141: }
142: }
143: return true;
144: }
145:
146: private boolean equals(CharSequence text1, char[] text2) {
147: int len = text1.length();
148: if (len != text2.length)
149: return false;
150: if (ignoreCase) {
151: for (int i = 0; i < len; i++) {
152: if (Character.toLowerCase(text1.charAt(i)) != text2[i])
153: return false;
154: }
155: } else {
156: for (int i = 0; i < len; i++) {
157: if (text1.charAt(i) != text2[i])
158: return false;
159: }
160: }
161: return true;
162: }
163:
164: private void rehash() {
165: final int newSize = 2 * entries.length;
166: char[][] oldEntries = entries;
167: entries = new char[newSize][];
168:
169: for (int i = 0; i < oldEntries.length; i++) {
170: char[] text = oldEntries[i];
171: if (text != null) {
172: // todo: could be faster... no need to compare strings on collision
173: entries[getSlot(text, 0, text.length)] = text;
174: }
175: }
176: }
177:
178: private int getHashCode(char[] text, int offset, int len) {
179: int code = 0;
180: final int stop = offset + len;
181: if (ignoreCase) {
182: for (int i = offset; i < stop; i++) {
183: code = code * 31 + Character.toLowerCase(text[i]);
184: }
185: } else {
186: for (int i = offset; i < stop; i++) {
187: code = code * 31 + text[i];
188: }
189: }
190: return code;
191: }
192:
193: private int getHashCode(CharSequence text) {
194: int code;
195: if (ignoreCase) {
196: code = 0;
197: int len = text.length();
198: for (int i = 0; i < len; i++) {
199: code = code * 31
200: + Character.toLowerCase(text.charAt(i));
201: }
202: } else {
203: if (false && text instanceof String) {
204: code = text.hashCode();
205: } else {
206: code = 0;
207: int len = text.length();
208: for (int i = 0; i < len; i++) {
209: code = code * 31 + text.charAt(i);
210: }
211: }
212: }
213: return code;
214: }
215:
216: public int size() {
217: return count;
218: }
219:
220: public boolean isEmpty() {
221: return count == 0;
222: }
223:
224: public boolean contains(Object o) {
225: if (o instanceof char[]) {
226: char[] text = (char[]) o;
227: return contains(text, 0, text.length);
228: } else if (o instanceof CharSequence) {
229: return contains((CharSequence) o);
230: }
231: return false;
232: }
233:
234: public boolean add(Object o) {
235: if (o instanceof char[]) {
236: return add((char[]) o);
237: } else if (o instanceof String) {
238: return add((String) o);
239: } else if (o instanceof CharSequence) {
240: return add((CharSequence) o);
241: } else {
242: return add(o.toString());
243: }
244: }
245:
246: /** The Iterator<String> for this set. Strings are constructed on the fly, so
247: * use <code>nextCharArray</code> for more efficient access. */
248: public class CharArraySetIterator implements Iterator {
249: int pos = -1;
250: char[] next;
251:
252: CharArraySetIterator() {
253: goNext();
254: }
255:
256: private void goNext() {
257: next = null;
258: pos++;
259: while (pos < entries.length
260: && (next = entries[pos]) == null)
261: pos++;
262: }
263:
264: public boolean hasNext() {
265: return next != null;
266: }
267:
268: /** do not modify the returned char[] */
269: public char[] nextCharArray() {
270: char[] ret = next;
271: goNext();
272: return ret;
273: }
274:
275: /** Returns the next String, as a Set<String> would...
276: * use nextCharArray() for better efficiency. */
277: public Object next() {
278: return new String(nextCharArray());
279: }
280:
281: public void remove() {
282: throw new UnsupportedOperationException();
283: }
284: }
285:
286: public Iterator iterator() {
287: return new CharArraySetIterator();
288: }
289:
290: }
|