001: /*
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: */
017:
018: package org.apache.xerces.util;
019:
020: /**
021: * This class is a symbol table implementation that guarantees that
022: * strings used as identifiers are unique references. Multiple calls
023: * to <code>addSymbol</code> will always return the same string
024: * reference.
025: * <p>
026: * The symbol table performs the same task as <code>String.intern()</code>
027: * with the following differences:
028: * <ul>
029: * <li>
030: * A new string object does not need to be created in order to
031: * retrieve a unique reference. Symbols can be added by using
032: * a series of characters in a character array.
033: * </li>
034: * <li>
035: * Users of the symbol table can provide their own symbol hashing
036: * implementation. For example, a simple string hashing algorithm
037: * may fail to produce a balanced set of hashcodes for symbols
038: * that are <em>mostly</em> unique. Strings with similar leading
039: * characters are especially prone to this poor hashing behavior.
040: * </li>
041: * </ul>
042: *
043: * An instance of <code>SymbolTable</code> has two parameters that affect its
044: * performance: <i>initial capacity</i> and <i>load factor</i>. The
045: * <i>capacity</i> is the number of <i>buckets</i> in the SymbolTable, and the
046: * <i>initial capacity</i> is simply the capacity at the time the SymbolTable
047: * is created. Note that the SymbolTable is <i>open</i>: in the case of a "hash
048: * collision", a single bucket stores multiple entries, which must be searched
049: * sequentially. The <i>load factor</i> is a measure of how full the SymbolTable
050: * is allowed to get before its capacity is automatically increased.
051: * When the number of entries in the SymbolTable exceeds the product of the load
052: * factor and the current capacity, the capacity is increased by calling the
053: * <code>rehash</code> method.<p>
054: *
055: * Generally, the default load factor (.75) offers a good tradeoff between
056: * time and space costs. Higher values decrease the space overhead but
057: * increase the time cost to look up an entry (which is reflected in most
058: * <tt>SymbolTable</tt> operations, including <tt>addSymbol</tt> and <tt>containsSymbol</tt>).<p>
059: *
060: * The initial capacity controls a tradeoff between wasted space and the
061: * need for <code>rehash</code> operations, which are time-consuming.
062: * No <code>rehash</code> operations will <i>ever</i> occur if the initial
063: * capacity is greater than the maximum number of entries the
064: * <tt>Hashtable</tt> will contain divided by its load factor. However,
065: * setting the initial capacity too high can waste space.<p>
066: *
067: * If many entries are to be made into a <code>SymbolTable</code>,
068: * creating it with a sufficiently large capacity may allow the
069: * entries to be inserted more efficiently than letting it perform
070: * automatic rehashing as needed to grow the table. <p>
071:
072: * @see SymbolHash
073: *
074: * @author Andy Clark
075: * @author John Kim, IBM
076: *
077: * @version $Id: SymbolTable.java 496157 2007-01-14 21:24:28Z mrglavas $
078: */
079: public class SymbolTable {
080:
081: //
082: // Constants
083: //
084:
085: /** Default table size. */
086: protected static final int TABLE_SIZE = 101;
087:
088: //
089: // Data
090: //
091:
092: /** Buckets. */
093: protected Entry[] fBuckets = null;
094:
095: /** actual table size **/
096: protected int fTableSize;
097:
098: /** The total number of entries in the hash table. */
099: protected transient int fCount;
100:
101: /** The table is rehashed when its size exceeds this threshold. (The
102: * value of this field is (int)(capacity * loadFactor).) */
103: protected int fThreshold;
104:
105: /** The load factor for the SymbolTable. */
106: protected float fLoadFactor;
107:
108: //
109: // Constructors
110: //
111:
112: /**
113: * Constructs a new, empty SymbolTable with the specified initial
114: * capacity and the specified load factor.
115: *
116: * @param initialCapacity the initial capacity of the SymbolTable.
117: * @param loadFactor the load factor of the SymbolTable.
118: * @throws IllegalArgumentException if the initial capacity is less
119: * than zero, or if the load factor is nonpositive.
120: */
121: public SymbolTable(int initialCapacity, float loadFactor) {
122:
123: if (initialCapacity < 0) {
124: throw new IllegalArgumentException("Illegal Capacity: "
125: + initialCapacity);
126: }
127:
128: if (loadFactor <= 0 || Float.isNaN(loadFactor)) {
129: throw new IllegalArgumentException("Illegal Load: "
130: + loadFactor);
131: }
132:
133: if (initialCapacity == 0) {
134: initialCapacity = 1;
135: }
136:
137: fLoadFactor = loadFactor;
138: fTableSize = initialCapacity;
139: fBuckets = new Entry[fTableSize];
140: fThreshold = (int) (fTableSize * loadFactor);
141: fCount = 0;
142: }
143:
144: /**
145: * Constructs a new, empty SymbolTable with the specified initial capacity
146: * and default load factor, which is <tt>0.75</tt>.
147: *
148: * @param initialCapacity the initial capacity of the hashtable.
149: * @throws IllegalArgumentException if the initial capacity is less
150: * than zero.
151: */
152: public SymbolTable(int initialCapacity) {
153: this (initialCapacity, 0.75f);
154: }
155:
156: /**
157: * Constructs a new, empty SymbolTable with a default initial capacity (101)
158: * and load factor, which is <tt>0.75</tt>.
159: */
160: public SymbolTable() {
161: this (TABLE_SIZE, 0.75f);
162: }
163:
164: //
165: // Public methods
166: //
167:
168: /**
169: * Adds the specified symbol to the symbol table and returns a
170: * reference to the unique symbol. If the symbol already exists,
171: * the previous symbol reference is returned instead, in order
172: * guarantee that symbol references remain unique.
173: *
174: * @param symbol The new symbol.
175: */
176: public String addSymbol(String symbol) {
177:
178: // search for identical symbol
179: int bucket = hash(symbol) % fTableSize;
180: for (Entry entry = fBuckets[bucket]; entry != null; entry = entry.next) {
181: if (entry.symbol.equals(symbol)) {
182: return entry.symbol;
183: }
184: }
185:
186: if (fCount >= fThreshold) {
187: // Rehash the table if the threshold is exceeded
188: rehash();
189: bucket = hash(symbol) % fTableSize;
190: }
191:
192: // create new entry
193: Entry entry = new Entry(symbol, fBuckets[bucket]);
194: fBuckets[bucket] = entry;
195: ++fCount;
196: return entry.symbol;
197:
198: } // addSymbol(String):String
199:
200: /**
201: * Adds the specified symbol to the symbol table and returns a
202: * reference to the unique symbol. If the symbol already exists,
203: * the previous symbol reference is returned instead, in order
204: * guarantee that symbol references remain unique.
205: *
206: * @param buffer The buffer containing the new symbol.
207: * @param offset The offset into the buffer of the new symbol.
208: * @param length The length of the new symbol in the buffer.
209: */
210: public String addSymbol(char[] buffer, int offset, int length) {
211:
212: // search for identical symbol
213: int bucket = hash(buffer, offset, length) % fTableSize;
214: OUTER: for (Entry entry = fBuckets[bucket]; entry != null; entry = entry.next) {
215: if (length == entry.characters.length) {
216: for (int i = 0; i < length; i++) {
217: if (buffer[offset + i] != entry.characters[i]) {
218: continue OUTER;
219: }
220: }
221: return entry.symbol;
222: }
223: }
224:
225: if (fCount >= fThreshold) {
226: // Rehash the table if the threshold is exceeded
227: rehash();
228: bucket = hash(buffer, offset, length) % fTableSize;
229: }
230:
231: // add new entry
232: Entry entry = new Entry(buffer, offset, length,
233: fBuckets[bucket]);
234: fBuckets[bucket] = entry;
235: ++fCount;
236: return entry.symbol;
237:
238: } // addSymbol(char[],int,int):String
239:
240: /**
241: * Returns a hashcode value for the specified symbol. The value
242: * returned by this method must be identical to the value returned
243: * by the <code>hash(char[],int,int)</code> method when called
244: * with the character array that comprises the symbol string.
245: *
246: * @param symbol The symbol to hash.
247: */
248: public int hash(String symbol) {
249: return symbol.hashCode() & 0x7FFFFFF;
250: } // hash(String):int
251:
252: /**
253: * Returns a hashcode value for the specified symbol information.
254: * The value returned by this method must be identical to the value
255: * returned by the <code>hash(String)</code> method when called
256: * with the string object created from the symbol information.
257: *
258: * @param buffer The character buffer containing the symbol.
259: * @param offset The offset into the character buffer of the start
260: * of the symbol.
261: * @param length The length of the symbol.
262: */
263: public int hash(char[] buffer, int offset, int length) {
264:
265: int code = 0;
266: for (int i = 0; i < length; ++i) {
267: code = code * 31 + buffer[offset + i];
268: }
269: return code & 0x7FFFFFF;
270:
271: } // hash(char[],int,int):int
272:
273: /**
274: * Increases the capacity of and internally reorganizes this
275: * SymbolTable, in order to accommodate and access its entries more
276: * efficiently. This method is called automatically when the
277: * number of keys in the SymbolTable exceeds this hashtable's capacity
278: * and load factor.
279: */
280: protected void rehash() {
281:
282: int oldCapacity = fBuckets.length;
283: Entry[] oldTable = fBuckets;
284:
285: int newCapacity = oldCapacity * 2 + 1;
286: Entry[] newTable = new Entry[newCapacity];
287:
288: fThreshold = (int) (newCapacity * fLoadFactor);
289: fBuckets = newTable;
290: fTableSize = fBuckets.length;
291:
292: for (int i = oldCapacity; i-- > 0;) {
293: for (Entry old = oldTable[i]; old != null;) {
294: Entry e = old;
295: old = old.next;
296:
297: int index = hash(e.characters, 0, e.characters.length)
298: % newCapacity;
299: e.next = newTable[index];
300: newTable[index] = e;
301: }
302: }
303: }
304:
305: /**
306: * Returns true if the symbol table already contains the specified
307: * symbol.
308: *
309: * @param symbol The symbol to look for.
310: */
311: public boolean containsSymbol(String symbol) {
312:
313: // search for identical symbol
314: int bucket = hash(symbol) % fTableSize;
315: int length = symbol.length();
316: OUTER: for (Entry entry = fBuckets[bucket]; entry != null; entry = entry.next) {
317: if (length == entry.characters.length) {
318: for (int i = 0; i < length; i++) {
319: if (symbol.charAt(i) != entry.characters[i]) {
320: continue OUTER;
321: }
322: }
323: return true;
324: }
325: }
326:
327: return false;
328:
329: } // containsSymbol(String):boolean
330:
331: /**
332: * Returns true if the symbol table already contains the specified
333: * symbol.
334: *
335: * @param buffer The buffer containing the symbol to look for.
336: * @param offset The offset into the buffer.
337: * @param length The length of the symbol in the buffer.
338: */
339: public boolean containsSymbol(char[] buffer, int offset, int length) {
340:
341: // search for identical symbol
342: int bucket = hash(buffer, offset, length) % fTableSize;
343: OUTER: for (Entry entry = fBuckets[bucket]; entry != null; entry = entry.next) {
344: if (length == entry.characters.length) {
345: for (int i = 0; i < length; i++) {
346: if (buffer[offset + i] != entry.characters[i]) {
347: continue OUTER;
348: }
349: }
350: return true;
351: }
352: }
353:
354: return false;
355:
356: } // containsSymbol(char[],int,int):boolean
357:
358: //
359: // Classes
360: //
361:
362: /**
363: * This class is a symbol table entry. Each entry acts as a node
364: * in a linked list.
365: */
366: protected static final class Entry {
367:
368: //
369: // Data
370: //
371:
372: /** Symbol. */
373: public final String symbol;
374:
375: /**
376: * Symbol characters. This information is duplicated here for
377: * comparison performance.
378: */
379: public final char[] characters;
380:
381: /** The next entry. */
382: public Entry next;
383:
384: //
385: // Constructors
386: //
387:
388: /**
389: * Constructs a new entry from the specified symbol and next entry
390: * reference.
391: */
392: public Entry(String symbol, Entry next) {
393: this .symbol = symbol.intern();
394: characters = new char[symbol.length()];
395: symbol.getChars(0, characters.length, characters, 0);
396: this .next = next;
397: }
398:
399: /**
400: * Constructs a new entry from the specified symbol information and
401: * next entry reference.
402: */
403: public Entry(char[] ch, int offset, int length, Entry next) {
404: characters = new char[length];
405: System.arraycopy(ch, offset, characters, 0, length);
406: symbol = new String(characters).intern();
407: this .next = next;
408: }
409:
410: } // class Entry
411:
412: } // class SymbolTable
|