001: /*
002: * The contents of this file are subject to the Mozilla Public License
003: * Version 1.1 (the "License"); you may not use this file except in
004: * compliance with the License. You may obtain a copy of the License at
005: * http://www.mozilla.org/MPL/
006: *
007: * Software distributed under the License is distributed on an "AS IS"
008: * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
009: * License for the specific language governing rights and limitations
010: * under the License.
011: *
012: * The Original Code is iSQL-Viewer, A Mutli-Platform Database Tool.
013: *
014: * The Initial Developer of the Original Code is iSQL-Viewer, A Mutli-Platform Database Tool.
015: * Portions created by Mark A. Kobold are Copyright (C) 2000-2007. All Rights Reserved.
016: *
017: * Contributor(s):
018: * Mark A. Kobold [mkobold <at> isqlviewer <dot> com].
019: *
020: * If you didn't download this code from the following link, you should check
021: * if you aren't using an obsolete version: http://www.isqlviewer.com
022: */
023: package org.isqlviewer.sql.processor;
024:
025: import java.util.TreeMap;
026:
027: /**
028: * TODO Add Scanner JavaDoc inforamation
029: * <p>
030: *
031: * @author Mark A. Kobold <mkobold at isqlviewer dot com>
032: * @version 1.0
033: */
034: public abstract class AbstractProcessor {
035:
036: /**
037: * <p>
038: * Read one token from the start of the current text buffer, given the start offset, end offset, and current scanner
039: * state. The method moves the start offset past the token, updates the scanner state, and returns the type of the
040: * token just scanned.
041: * <p>
042: * The scanner state is a representative token type. It is either the state left after the last call to read, or the
043: * type of the old token at the same position if rescanning, or WHITESPACE if at the start of a document. The method
044: * succeeds in all cases, returning whitespace or comment or error tokens where necessary. Each line of a multi-line
045: * comment is treated as a separate token, to improve incremental rescanning. If the buffer does not extend to the
046: * end of the document, the last token returned for the buffer may be incomplete and the caller must rescan it. The
047: * read method can be overridden to implement different languages. The default version splits plain text into words,
048: * numbers and punctuation.
049: */
050: protected TokenType read() {
051:
052: char c = buffer[start];
053: TokenType type;
054: // Ignore the state, since there is only one.
055: if (Character.isWhitespace(c)) {
056: type = TokenType.WHITESPACE;
057: while (++start < end) {
058: if (!Character.isWhitespace(buffer[start]))
059: break;
060: }
061: } else if (Character.isLetter(c)) {
062: type = TokenType.WORD;
063: while (++start < end) {
064: c = buffer[start];
065: if (Character.isLetter(c) || Character.isDigit(c))
066: continue;
067: if (c == '-' || c == '\'' || c == '_')
068: continue;
069: break;
070: }
071: } else if (Character.isDigit(c)) {
072: type = TokenType.NUMBER;
073: while (++start < end) {
074: c = buffer[start];
075: if (!Character.isDigit(c) && c != '.')
076: break;
077: }
078: } else if (c >= '!' || c <= '~') {
079: type = TokenType.PUNCTUATION;
080: start++;
081: } else {
082: type = TokenType.UNRECOGNIZED;
083: start++;
084: }
085:
086: // state = WHITESPACE;
087: return type;
088: }
089:
090: /**
091: * The current buffer of text being scanned.
092: */
093: protected char[] buffer;
094:
095: /**
096: * The current offset within the buffer, at which to scan the next token.
097: */
098: protected int start;
099:
100: /**
101: * The end offset in the buffer.
102: */
103: protected int end;
104:
105: /**
106: * The current scanner state, as a representative token type.
107: */
108: protected TokenType state = TokenType.WHITESPACE;
109:
110: // The array of tokens forms a gap buffer. The total length of the text is
111: // tracked, and tokens after the gap have (negative) positions relative to
112: // the end of the text. While scanning, the gap represents the area to be
113: // scanned, no tokens after the gap can be taken as valid, and in particular
114: // the end-of-text sentinel token is after the gap.
115:
116: private Token[] tokens;
117: private int gap, endgap, textLength;
118: private boolean scanning;
119: private int position;
120:
121: /**
122: * The symbol table can be accessed by <code>initSymbolTable</code> or <code>lookup</code>, if they are
123: * overridden. Symbols are inserted with <code>symbolTable.put(sym,sym)</code> and extracted with
124: * <code>symbolTable.get(sym)</code>.
125: */
126: protected TreeMap<String, TextSymbol> symbolTable;
127:
128: public/**
129: * Create a new Scanner representing an empty text document. For non-incremental scanning, use change() to
130: * report the document size, then pass the entire text to the scan() method in one go, or if coming from an
131: * input stream, a bufferful at a time.
132: */
133: AbstractProcessor() {
134:
135: tokens = new Token[1];
136: gap = 0;
137: endgap = 0;
138: textLength = 0;
139: symbolTable = new TreeMap<String, TextSymbol>(
140: String.CASE_INSENSITIVE_ORDER);
141: initSymbolTable();
142: TextSymbol endOfText = new TextSymbol(TokenType.WHITESPACE, "");
143: tokens[0] = new Token(endOfText, 0);
144: scanning = false;
145: position = 0;
146: }
147:
148: /**
149: * Find the number of available valid tokens, not counting tokens in or after any area yet to be rescanned.
150: */
151: public int size() {
152:
153: if (scanning)
154: return gap;
155: return gap + tokens.length - endgap;
156: }
157:
158: /**
159: * Find the n'th token, or null if it is not currently valid.
160: */
161: public Token getToken(int n) {
162:
163: if (n < 0 || n >= gap && scanning)
164: return null;
165: if (n >= gap)
166: moveGap(n + 1);
167: return tokens[n];
168: }
169:
170: /**
171: * Find the index of the valid token starting before, but nearest to, text position p. This uses an O(log(n)) binary
172: * chop search.
173: */
174: public int find(int p) {
175:
176: int firstIndex = 0, lastIndex, mid, midpos;
177: if (!scanning) {
178: moveGap(gap + tokens.length - endgap);
179: }
180: lastIndex = gap - 1;
181: if (p > tokens[lastIndex].position) {
182: return lastIndex;
183: }
184: while (lastIndex > firstIndex + 1) {
185: mid = (firstIndex + lastIndex) / 2;
186: midpos = tokens[mid].position;
187: if (p > midpos) {
188: firstIndex = mid;
189: } else {
190: lastIndex = mid;
191: }
192: }
193: return firstIndex;
194: }
195:
196: /**
197: * Report the position of an edit, the length of the text being replaced, and the length of the replacement text, to
198: * prepare for rescanning. The call returns the index of the token at which rescanning will start.
199: */
200: public int change(int firstIndex, int len, int newLen) {
201:
202: if (firstIndex < 0 || len < 0 || newLen < 0
203: || firstIndex + len > textLength) {
204: throw new Error("change(" + firstIndex + "," + len + ","
205: + newLen + ")");
206: }
207: textLength += newLen - len;
208: int currentEnd = firstIndex + newLen;
209: if (scanning) {
210: while (gap > 0 && tokens[gap - 1].position > firstIndex) {
211: gap--;
212: }
213: if (gap > 0) {
214: gap--;
215: }
216: if (gap > 0) {
217: gap--;
218: position = tokens[gap].position;
219: state = tokens[gap].symbol.type;
220: } else {
221: position = 0;
222: state = TokenType.WHITESPACE;
223: }
224: while (tokens[endgap].position + textLength < currentEnd) {
225: endgap++;
226: }
227: return gap;
228: }
229: if (endgap == tokens.length) {
230: moveGap(gap - 1);
231: }
232: scanning = true;
233: while (tokens[endgap].position + textLength < firstIndex) {
234: tokens[endgap].position += textLength;
235: tokens[gap++] = tokens[endgap++];
236: }
237: while (gap > 0 && tokens[gap - 1].position > firstIndex) {
238: tokens[--endgap] = tokens[--gap];
239: tokens[endgap].position -= textLength;
240: }
241: if (gap > 0) {
242: gap--;
243: }
244: if (gap > 0) {
245: gap--;
246: position = tokens[gap].position;
247: state = tokens[gap].symbol.type;
248: } else {
249: position = 0;
250: state = TokenType.WHITESPACE;
251: }
252: while (tokens[endgap].position + textLength < currentEnd) {
253: endgap++;
254: }
255: return gap;
256: }
257:
258: /**
259: * Find out at what text position any remaining scanning work should start, or -1 if scanning is complete.
260: */
261: public int position() {
262:
263: if (!scanning)
264: return -1;
265: return position;
266: }
267:
268: /**
269: * Scan or rescan a given read-only segment of text. The segment is assumed to represent a portion of the document
270: * starting at <code>position()</code>. Return the number of tokens successfully scanned, excluding any partial
271: * token at the end of the text segment but not at the end of the document. If the result is 0, the call should be
272: * retried with a longer segment.
273: */
274: public int scan(char[] array, int offset, int length) {
275:
276: if (!scanning) {
277: throw new Error("scan called when not scanning");
278: }
279: if (position + length > textLength) {
280: return -1;
281: }
282: boolean all = position + length == textLength;
283: end = start + length;
284: int startGap = gap;
285:
286: buffer = array;
287: start = offset;
288: end = start + length;
289: while (start < end) {
290: int tokenStart = start;
291: TokenType type = read();
292: if (start == end && !all)
293: break;
294:
295: if (type != TokenType.WHITESPACE) {
296: String name = new String(buffer, tokenStart, start
297: - tokenStart);
298: TextSymbol sym = lookup(type, name);
299: Token t = new Token(sym, position);
300: if (gap >= endgap) {
301: checkCapacity(gap + tokens.length - endgap + 1);
302: }
303: tokens[gap++] = t;
304: }
305:
306: // Try to synchronise
307:
308: while (tokens[endgap].position + textLength < position)
309: endgap++;
310: if (position + start - tokenStart == textLength)
311: scanning = false;
312: else if (gap > 0
313: && tokens[endgap].position + textLength == position
314: && tokens[endgap].symbol.type == type) {
315: endgap++;
316: scanning = false;
317: break;
318: }
319: position += start - tokenStart;
320: }
321: checkCapacity(gap + tokens.length - endgap);
322: return gap - startGap;
323: }
324:
325: /**
326: * Create the initial symbol table. This can be overridden to enter keywords, for example. The default
327: * implementation does nothing.
328: */
329: protected abstract void initSymbolTable();
330:
331: /**
332: * Lookup a symbol in the symbol table. This can be overridden to implement keyword detection, for example. The
333: * default implementation just uses the table to ensure that there is only one shared occurrence of each symbol.
334: */
335: protected TextSymbol lookup(TokenType type, String name) {
336:
337: TextSymbol sym = symbolTable.get(name);
338: if (sym != null) {
339: return sym;
340: }
341: sym = new TextSymbol(type, name);
342: symbolTable.put(name, sym);
343: return sym;
344: }
345:
346: // Change the size of the gap buffer, doubling it if it fills up, and
347: // halving if it becomes less than a quarter full.
348: private void checkCapacity(int capacity) {
349:
350: int oldCapacity = tokens.length;
351: if (capacity <= oldCapacity && 4 * capacity >= oldCapacity)
352: return;
353: Token[] oldTokens = tokens;
354: int newCapacity;
355: if (capacity > oldCapacity) {
356: newCapacity = oldCapacity * 2;
357: if (newCapacity < capacity)
358: newCapacity = capacity;
359: } else {
360: newCapacity = capacity * 2;
361: }
362:
363: tokens = new Token[newCapacity];
364: System.arraycopy(oldTokens, 0, tokens, 0, gap);
365: int n = oldCapacity - endgap;
366: System.arraycopy(oldTokens, endgap, tokens, newCapacity - n, n);
367: endgap = newCapacity - n;
368: }
369:
370: // Move the gap to a new index within the tokens array. When preparing to
371: // pass a token back to a caller, this is used to ensure that the token's
372: // position is relative to the start of the text and not the end.
373: private void moveGap(int newgap) {
374:
375: if (scanning)
376: throw new Error("moveGap called while scanning");
377: if (newgap < 0 || newgap > gap + tokens.length - endgap) {
378: throw new Error("bad argument to moveGap");
379: }
380: if (gap < newgap) {
381: while (gap < newgap) {
382: tokens[endgap].position += textLength;
383: tokens[gap++] = tokens[endgap++];
384: }
385: } else if (gap > newgap) {
386: while (gap > newgap) {
387: tokens[--endgap] = tokens[--gap];
388: tokens[endgap].position -= textLength;
389: }
390: }
391: }
392: }
|