001: /*
002: * The Apache Software License, Version 1.1
003: *
004: *
005: * Copyright (c) 1999 The Apache Software Foundation. All rights
006: * reserved.
007: *
008: * Redistribution and use in source and binary forms, with or without
009: * modification, are permitted provided that the following conditions
010: * are met:
011: *
012: * 1. Redistributions of source code must retain the above copyright
013: * notice, this list of conditions and the following disclaimer.
014: *
015: * 2. Redistributions in binary form must reproduce the above copyright
016: * notice, this list of conditions and the following disclaimer in
017: * the documentation and/or other materials provided with the
018: * distribution.
019: *
020: * 3. The end-user documentation included with the redistribution,
021: * if any, must include the following acknowledgment:
022: * "This product includes software developed by the
023: * Apache Software Foundation (http://www.apache.org/)."
024: * Alternately, this acknowledgment may appear in the software itself,
025: * if and wherever such third-party acknowledgments normally appear.
026: *
027: * 4. The names "Xerces" and "Apache Software Foundation" must
028: * not be used to endorse or promote products derived from this
029: * software without prior written permission. For written
030: * permission, please contact apache@apache.org.
031: *
032: * 5. Products derived from this software may not be called "Apache",
033: * nor may "Apache" appear in their name, without prior written
034: * permission of the Apache Software Foundation.
035: *
036: * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
037: * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
038: * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
039: * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
040: * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
041: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
042: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
043: * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
044: * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
045: * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
046: * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
047: * SUCH DAMAGE.
048: * ====================================================================
049: *
050: * This software consists of voluntary contributions made by many
051: * individuals on behalf of the Apache Software Foundation and was
052: * originally based on software copyright (c) 1999, International
053: * Business Machines, Inc., http://www.apache.org. For more
054: * information on the Apache Software Foundation, please see
055: * <http://www.apache.org/>.
056: */
057:
058: package org.apache.xerces.utils;
059:
060: /**
061: *
062: * @version
063: */
064: public final class SymbolCache {
065: //
066: // Symbol Cache
067: //
068: public static final int CHAR_OFFSET = 0;
069: public static final int INDEX_OFFSET = 1;
070: public static final int NEXT_OFFSET = 2;
071: public static final int CACHE_RECORD_SIZE = 3;
072:
073: // start with 4 entries per level on cache lines other than 0
074: public static final int INITIAL_CACHE_RECORD_COUNT = 4;
075:
076: // start with 85 entries on cache line 0 (85*3+1 = 256)
077: public static final int HEAD_INITIAL_CACHE_RECORD_COUNT = 85;
078: //
079: public char[] fSymbolChars = new char[8192];
080: public int fSymbolCharsOffset = 0;
081: public int[][] fCacheLines = new int[256][];
082: public int fCacheLineCount = 0;
083:
084: /*
085: *
086: * A brief description of how a SymbolCache is organized:
087: *
088: * The first entry in any fCacheLines[] array is a count of the number of
089: * records stored in that array. Subsequent entries are arranged in groups
090: * of three: a character value; the symbol handle, if this is the last
091: * character in a symbol, or -1 otherwise; and a pointer to the next character
092: * in the symbol, or -1 if there is none. All symbol entries begin in
093: * fCacheLines[0].
094: *
095: * For example, if the symbols "help", "he:p", "hel", and "bob" are
096: * entered, and have symbol indices 11, 22, 33 and 44, respectively, the
097: * fCacheLines array will look something like this (ignoring zero elements):
098: *
099: * fCacheLines[0] = {2, 'h', -1, 1, 'b', -1, 5}
100: * fCacheLines[1] = {1, 'e', -1, 2}
101: * fCacheLines[2] = {2, 'l', 33, 3, ':', -1, 4}
102: * fCacheLines[3] = {1, 'p', 11, -1}
103: * fCacheLines[4] = {1, 'p', 22, -1}
104: * fCacheLines[5] = {1, 'o', -1, 6}
105: * fCacheLines[6] = {1, 'b', 44, -1}
106: *
107: * The initial character of every symbol is stored in fCacheLines[0]. Other
108: * elements of fCacheLines contain records for characters that share common
109: * prefixes. For instance, fCacheLines[2] contains records for all symbols
110: * that begin with "he". Note also that the symbol "hel" has both a symbol
111: * index and a pointer to the record containing the next character, as both
112: * "hel" and "help" are symbols in the cache.
113: *
114: */
115:
116: /****
117: public void dumpCache() {
118: System.out.println("fSymbolChars.length == "+fSymbolChars.length);
119:
120: for (int i = 0; i < fSymbolCharsOffset; i++)
121: System.out.println("fSymbolChars["+i+"] == "+fSymbolChars[i]);
122:
123: for (int i = 0; i < fCacheLineCount; i++) {
124: System.out.print("fCacheLines["+i+"] (num records == "+
125: fCacheLines[i][0]+") == {");
126:
127: int offset = 1;
128: for (int j = 0; j < fCacheLines[i][0]; j++) {
129: System.out.print("{char="+
130: (new Character((char)fCacheLines[i][offset+CHAR_OFFSET]).
131: toString())+
132: "; idx="+fCacheLines[i][offset+INDEX_OFFSET]+
133: "; next="+fCacheLines[i][offset+NEXT_OFFSET]+"}");
134: offset += CACHE_RECORD_SIZE;
135: }
136: System.out.println("} - (Actual size == "+fCacheLines[i].length+")");
137: }
138: }
139: ****/
140:
141: public SymbolCache() {
142: fCacheLines[fCacheLineCount++] = new int[1 + (HEAD_INITIAL_CACHE_RECORD_COUNT * CACHE_RECORD_SIZE)];
143: }
144:
145: //
146: public void reset() {
147: fSymbolCharsOffset = 0;
148: fCacheLineCount = 0;
149: fCacheLines[fCacheLineCount++] = new int[1 + (HEAD_INITIAL_CACHE_RECORD_COUNT * CACHE_RECORD_SIZE)];
150: }
151:
152: //
153: //
154: //
155: public char[] getSymbolChars() {
156: return fSymbolChars;
157: }
158:
159: //
160: // Symbol interfaces
161: //
162: public String createSymbol(int symbolHandle, int startOffset,
163: int entry, int[] entries, int offset) {
164: int slen = fSymbolCharsOffset - startOffset;
165: String str = new String(fSymbolChars, startOffset, slen);
166: try {
167: entries[offset + SymbolCache.INDEX_OFFSET] = symbolHandle;
168: } catch (ArrayIndexOutOfBoundsException ex) {
169: throw new RuntimeException("UTL001 untested");
170: }
171: return str;
172: }
173:
174: public int addSymbolToCache(String str, int slen, int symbolHandle) {
175: int charsOffset = fSymbolCharsOffset;
176: if (slen == 0) // EMPTY_STRING cannot be a legal "symbol"
177: return charsOffset;
178: int strIndex = 0;
179: char ch = str.charAt(strIndex++);
180: try {
181: fSymbolChars[fSymbolCharsOffset] = ch;
182: } catch (ArrayIndexOutOfBoundsException ex) {
183: char[] newChars = new char[fSymbolChars.length * 2];
184: System.arraycopy(fSymbolChars, 0, newChars, 0,
185: fSymbolChars.length);
186: fSymbolChars = newChars;
187: fSymbolChars[fSymbolCharsOffset] = ch;
188: }
189: fSymbolCharsOffset++;
190: int entry = 0;
191: int[] entries = fCacheLines[entry];
192: int count = entries[0];
193: int i = 0;
194: int offset = 1;
195: while (true) {
196: if (i == count)
197: break;
198: if (entries[offset + CHAR_OFFSET] != ch) {
199: i++;
200: offset += CACHE_RECORD_SIZE;
201: continue;
202: }
203: if (strIndex == slen) {
204: if (entries[offset + INDEX_OFFSET] != -1) {
205: // How did we miss this during lookup ?
206: throw new RuntimeException("addSymbolToCache");
207: }
208: entries[offset + INDEX_OFFSET] = symbolHandle;
209: return charsOffset;
210: }
211: ch = str.charAt(strIndex++);
212: try {
213: fSymbolChars[fSymbolCharsOffset] = ch;
214: } catch (ArrayIndexOutOfBoundsException ex) {
215: char[] newChars = new char[fSymbolChars.length * 2];
216: System.arraycopy(fSymbolChars, 0, newChars, 0,
217: fSymbolChars.length);
218: fSymbolChars = newChars;
219: fSymbolChars[fSymbolCharsOffset] = ch;
220: }
221: fSymbolCharsOffset++;
222: entry = entries[offset + NEXT_OFFSET];
223: try {
224: entries = fCacheLines[entry];
225: } catch (ArrayIndexOutOfBoundsException ex) {
226: if (entry == -1) {
227: entry = fCacheLineCount++;
228: entries[offset + NEXT_OFFSET] = entry;
229: entries = new int[1 + (INITIAL_CACHE_RECORD_COUNT * CACHE_RECORD_SIZE)];
230: try {
231: fCacheLines[entry] = entries;
232: } catch (ArrayIndexOutOfBoundsException ex2) {
233: int[][] newCache = new int[entry * 2][];
234: System.arraycopy(fCacheLines, 0, newCache, 0,
235: entry);
236: fCacheLines = newCache;
237: fCacheLines[entry] = entries;
238: }
239: } else {
240: entries = fCacheLines[entry];
241: throw new RuntimeException("UTL001 untested");
242: }
243: }
244: count = entries[0];
245: i = 0;
246: offset = 1;
247: }
248: while (true) {
249: entries[0]++;
250: try {
251: entries[offset + CHAR_OFFSET] = ch;
252: } catch (ArrayIndexOutOfBoundsException ex) {
253: int newSize = 1 + ((offset - 1) * 2);
254: int[] newEntries = new int[newSize];
255: System.arraycopy(entries, 0, newEntries, 0, offset);
256: fCacheLines[entry] = entries = newEntries;
257: entries[offset + CHAR_OFFSET] = ch;
258: }
259: if (strIndex == slen) {
260: entries[offset + INDEX_OFFSET] = symbolHandle;
261: entries[offset + NEXT_OFFSET] = -1;
262: break;
263: }
264: entry = fCacheLineCount++;
265: entries[offset + INDEX_OFFSET] = -1;
266: entries[offset + NEXT_OFFSET] = entry;
267: entries = new int[1 + (INITIAL_CACHE_RECORD_COUNT * CACHE_RECORD_SIZE)];
268: try {
269: fCacheLines[entry] = entries;
270: } catch (ArrayIndexOutOfBoundsException ex) {
271: int[][] newCache = new int[entry * 2][];
272: System.arraycopy(fCacheLines, 0, newCache, 0, entry);
273: fCacheLines = newCache;
274: fCacheLines[entry] = entries;
275: }
276: offset = 1;
277: ch = str.charAt(strIndex++);
278: try {
279: fSymbolChars[fSymbolCharsOffset] = ch;
280: } catch (ArrayIndexOutOfBoundsException ex) {
281: char[] newChars = new char[fSymbolChars.length * 2];
282: System.arraycopy(fSymbolChars, 0, newChars, 0,
283: fSymbolChars.length);
284: fSymbolChars = newChars;
285: fSymbolChars[fSymbolCharsOffset] = ch;
286: }
287: fSymbolCharsOffset++;
288: }
289: return charsOffset;
290: }
291:
292: public void updateCacheLine(int charsOffset, int totalMisses,
293: int length) {
294: //System.err.println("found symbol " + toString(symbolIndex) + " after " + totalMisses + " total misses (" + (totalMisses/length) + " misses per character).");
295: int entry = 0;
296: int[] entries = fCacheLines[0];
297: int ch = fSymbolChars[charsOffset++];
298: int count = entries[0];
299: int offset = 1 + ((count - 1) * CACHE_RECORD_SIZE);
300: int misses = 0;
301: while (true) {
302: if (ch != entries[offset + CHAR_OFFSET]) {
303: offset -= CACHE_RECORD_SIZE;
304: misses++;
305: continue;
306: }
307: if (misses > 4) {
308: int symIndex = entries[offset + INDEX_OFFSET];
309: int nextIndex = entries[offset + NEXT_OFFSET];
310: System.arraycopy(entries, offset + CACHE_RECORD_SIZE,
311: entries, offset, misses * CACHE_RECORD_SIZE);
312: offset = 1 + ((count - 1) * CACHE_RECORD_SIZE);
313: entries[offset + CHAR_OFFSET] = ch;
314: entries[offset + INDEX_OFFSET] = symIndex;
315: entries[offset + NEXT_OFFSET] = nextIndex;
316: }
317: if (--length == 0)
318: break;
319: entry = entries[offset + NEXT_OFFSET];
320: entries = fCacheLines[entry];
321: ch = fSymbolChars[charsOffset++];
322: count = entries[0];
323: offset = 1 + ((count - 1) * CACHE_RECORD_SIZE);
324: misses = 0;
325: }
326: }
327: }
|