001: /*
002: * Sun Public License Notice
003: *
004: * The contents of this file are subject to the Sun Public License
005: * Version 1.0 (the "License"). You may not use this file except in
006: * compliance with the License. A copy of the License is available at
007: * http://www.sun.com/
008: *
009: * The Original Code is NetBeans. The Initial Developer of the Original
010: * Code is Sun Microsystems, Inc. Portions Copyright 1997-2000 Sun
011: * Microsystems, Inc. All Rights Reserved.
012: */
013:
014: package org.netbeans.editor;
015:
016: /**
017: * Lexical analyzer that works on a given text buffer. It allows to sequentially
018: * parse a given character buffer by calling <tt>nextToken()</tt> that returns
019: * the token-ids.
020: *
021: * After the token is found by calling the <tt>nextToken</tt> method, the
022: * <tt>getTokenOffset()</tt> method can be used to get the starting offset of
023: * the current token in the buffer. The <tt>getTokenLength()</tt> gives the
024: * length of the current token.
025: *
026: * The heart of the analyzer is the <tt>parseToken()</tt> method which parses
027: * the text and returns the token-id of the last token found. The
028: * <tt>parseToken()</tt> method is called from the <tt>nextToken()</tt>. It
029: * operates with two important variables. The <tt>offset</tt> variable
030: * identifies the currently scanned character in the buffer. The
031: * <tt>tokenOffset</tt> is the beginning of the current token. The
032: * <tt>state</tt> variable that identifies the current internal state of the
033: * analyzer is set accordingly when the characters are parsed. If the
034: * <tt>parseToken()</tt> recognizes a token, it returns its ID and the
035: * <tt>tokenOffset</tt> is its beginning in the buffer and
036: * <tt>offset - tokenOffset</tt> is its length. When the token is processed
037: * the value of <tt>tokenOffset</tt> is set to be the same as current value of
038: * the <tt>offset</tt> and the parsing continues.
039: *
040: * Internal states are the integer constants used internally by analyzer. They
041: * are assigned to the <tt>state</tt> variable to express that the analyzer
042: * has moved from one state to another. They are usually numbered starting from
043: * zero but they don't have to. The only reserved value is -1 which is reserved
044: * for the INIT state - the initial internal state of the analyzer.
045: *
046: * There is also the support for defining the persistent info about the current
047: * state of the analyzer. This info can be later used to restore the parsing
048: * from some particular state instead of parsing from the beginning of the
049: * buffer. This feature is very useful if there are the modifications performed
050: * in the document. The info is stored in the <tt>StateInfo</tt> interface
051: * with the <tt>BaseStateInfo</tt> as the basic implementation. It enables to
052: * get and set the two important values from the persistent point of view. The
053: * first one is the value of the <tt>state</tt> variable. The other one is the
054: * difference <tt>offset - tokenOffset</tt> which is called pre-scan. The
055: * particular analyzer can define additional values important for the persistent
056: * storage. The <tt>createStateInfo()</tt> can be overridden to create custom
057: * state-info and <tt>loadState()</tt> and <tt>storeState()</tt> can be
058: * overridden to get/set the additional values.
059: *
060: * The <tt>load()</tt> method sets the buffer to be parsed. There is a special
061: * parameter in the load() method called position that allows a relation of the
062: * character buffer passed to the load() method and the position of the buffer's
063: * data in the document. For this extended functionality the document must be
064: * passed to the constructor of the lexical analyzer at some level.
065: *
066: *
067: * @author Miloslav Metelka
068: * @version 1.00
069: */
070:
071: public class Syntax {
072:
073: /** Is the state of analyzer equal to a given state info? */
074: public static final int EQUAL_STATE = 0;
075:
076: /** Is the state of analyzer different from given state info? */
077: public static final int DIFFERENT_STATE = 1;
078:
079: /** Initial internal state of the analyzer */
080: public static final int INIT = -1;
081:
082: /**
083: * Internal state of the lexical analyzer. At the begining it's set to INIT
084: * value but it is changed by <tt>parseToken()</tt> as the characters are
085: * processed one by one.
086: */
087: protected int state = INIT;
088:
089: /** Text buffer to scan */
090: protected char buffer[];
091:
092: /** Current offset in the buffer */
093: protected int offset;
094:
095: /** Offset holding the begining of the current token */
096: protected int tokenOffset;
097:
098: /** This variable is the length of the token that was found */
099: protected int tokenLength;
100:
101: /**
102: * Path from which the found token-id comes from. The
103: * <tt>TokenContext.getContextPath()</tt> can be used to get the path. If
104: * the lexical analyzer doesn't use any children token-contexts it can
105: * assign the path in the constructor.
106: */
107: protected TokenContextPath tokenContextPath;
108:
109: /**
110: * Setting this flag to true means that there are currently no more buffers
111: * available so that analyzer should return all the tokens including those
112: * whose successful scanning would be otherwise left for later when the next
113: * buffer will be available. Setting this flag to true ensures that all the
114: * characters in the current buffer will be processed. The lexical analyzer
115: * should on one hand process all the characters but on the other hand it
116: * should "save" its context. For example if the scanner finds the unclosed
117: * comment at the end of the buffer it should return the comment token but
118: * stay in the "being in comment" internal state.
119: */
120: protected boolean lastBuffer;
121:
122: /** On which offset in the buffer scanning should stop. */
123: protected int stopOffset;
124:
125: /**
126: * The position in the document that logically corresponds to the stopOffset
127: * value. If there's no relation to the document, it's -1. The reason why
128: * the relation to the document's data is expressed through the stopOffset
129: * to stopPosition relation is because the stopOffset is the only offset
130: * that doesn't change rapidly in the operation of the lexical analyzer.
131: */
132: protected int stopPosition;
133:
134: /**
135: * This variable can be populated by the parseToken() method in case the
136: * user types an errorneous construction but it's clear what correct token
137: * he meant to write. For example if the user writes a single '0x' it's an
138: * errorneous construct but it's clear that the user wants to enter the
139: * hexa-number. In this situation the parseToken() should report error, but
140: * it should also set the supposedTokenID to the hexa-number token. This
141: * information is used while drawing the text. If the caret stand inside or
142: * around such token, it calls the getSupposedTokenID() after calling the
143: * nextToken() and if it's non-null it uses it instead of the original
144: * token.
145: */
146: protected TokenID supposedTokenID;
147:
148: /**
149: * Function that should be called externally to scan the text. It manages
150: * the call to parseToken() and cares about the proper setting of the
151: * offsets. It can be extended to support any custom debugging required.
152: */
153: public TokenID nextToken() {
154: // Return immediately when at the end of buffer
155: if (tokenOffset >= stopOffset) {
156: tokenLength = 0;
157: return null; // signal no token found
158: }
159:
160: // Divide non-debug and debug sections
161: supposedTokenID = null;
162: TokenID tokenID = parseToken();
163: if (tokenID != null) { // regular token found
164: tokenLength = offset - tokenOffset;
165: tokenOffset = offset; // move to the next token
166: if (tokenLength == 0) { // test for empty token
167: return nextToken(); // repeat until non-empty token is found
168: }
169: } else { // EOT reached
170: tokenLength = 0;
171: }
172:
173: return tokenID;
174: }
175:
176: /**
177: * This is core function of analyzer and it returns either the token-id or
178: * null to indicate that the end of buffer was found. The function scans the
179: * active character and does one or more of the following actions: 1. change
180: * internal analyzer state 2. set the token-context-path and return token-id
181: * 3. adjust current position to signal different end of token; the
182: * character that offset points to is not included in the token
183: */
184: protected TokenID parseToken() {
185: return null;
186: }
187:
188: /**
189: * Load the state from syntax mark into analyzer. This method is used when
190: *
191: * @param stateInfo
192: * info about the state of the lexical analyzer to load. It can
193: * be null to indicate there's no previous state so the analyzer
194: * starts from its initial state.
195: * @param buffer
196: * buffer that will be scanned
197: * @param offset
198: * offset of the first character that will be scanned
199: * @param len
200: * length of the area to be scanned
201: * @param lastBuffer
202: * whether this is the last buffer in the document. All the
203: * tokens will be returned including the last possibly incomplete
204: * one. If the data come from the document, the simple rule for
205: * this parameter is (doc.getLength() == stop-position) where
206: * stop-position is the position corresponding to the (offset +
207: * len) in the buffer that comes from the document data.
208: * @param stopPosition
209: * position in the document that corresponds to (offset + len)
210: * offset in the provided buffer. It has only sense if the data
211: * in the buffer come from the document. It helps in writing the
212: * advanced analyzers that need to interact with some other data
213: * in the document than only those provided in the character
214: * buffer. If there is no relation to the document data, the
215: * stopPosition parameter must be filled with -1 which means an
216: * invalid value. The stop-position is passed (instead of
217: * start-position) because it doesn't change through the analyzer
218: * operation. It corresponds to the <tt>stopOffset</tt> that
219: * also doesn't change through the analyzer operation so any
220: * buffer-offset can be transferred to position by computing
221: * <tt>stopPosition + buffer-offset - stopOffset</tt> where
222: * stopOffset is the instance variable that is assigned to
223: * <tt>offset + len</tt> in the body of relocate().
224: */
225: public void load(StateInfo stateInfo, char buffer[], int offset,
226: int len, boolean lastBuffer, int stopPosition) {
227: this .buffer = buffer;
228: this .offset = offset;
229: this .tokenOffset = offset;
230: this .stopOffset = offset + len;
231: this .lastBuffer = lastBuffer;
232: this .stopPosition = stopPosition;
233:
234: if (stateInfo != null) {
235: loadState(stateInfo);
236: } else {
237: loadInitState();
238: }
239: }
240:
241: /**
242: * Relocate scanning to another buffer. This is used to continue scanning
243: * after previously reported EOT. Relocation delta between current offset
244: * and the requested offset is computed and all the offsets are relocated.
245: * If there's a non-zero preScan in the analyzer, it is a caller's
246: * responsibility to provide all the preScan characters in the relocation
247: * buffer.
248: *
249: * @param buffer
250: * next buffer where the scan will continue.
251: * @param offset
252: * offset where the scan will continue. It's not decremented by
253: * the current preScan.
254: * @param len
255: * length of the area to be scanned. It's not extended by the
256: * current preScan.
257: * @param lastBuffer
258: * whether this is the last buffer in the document. All the
259: * tokens will be returned including the last possibly incomplete
260: * one. If the data come from the document, the simple rule for
261: * this parameter is (doc.getLength() == stop-position) where
262: * stop-position is the position corresponding to the (offset +
263: * len) in the buffer that comes from the document data.
264: * @param stopPosition
265: * position in the document that corresponds to (offset + len)
266: * offset in the provided buffer. It has only sense if the data
267: * in the buffer come from the document. It helps in writing the
268: * advanced analyzers that need to interact with some other data
269: * in the document than only those provided in the character
270: * buffer. If there is no relation to the document data, the
271: * stopPosition parameter must be filled with -1 which means an
272: * invalid value. The stop-position is passed (instead of
273: * start-position) because it doesn't change through the analyzer
274: * operation. It corresponds to the <tt>stopOffset</tt> that
275: * also doesn't change through the analyzer operation so any
276: * buffer-offset can be transferred to position by computing
277: * <tt>stopPosition + buffer-offset - stopOffset</tt> where
278: * stopOffset is the instance variable that is assigned to
279: * <tt>offset + len</tt> in the body of relocate().
280: */
281: public void relocate(char buffer[], int offset, int len,
282: boolean lastBuffer, int stopPosition) {
283: this .buffer = buffer;
284: this .lastBuffer = lastBuffer;
285:
286: int delta = offset - this .offset; // delta according to current offset
287: this .offset += delta;
288: this .tokenOffset += delta;
289: this .stopOffset = offset + len;
290: this .stopPosition = stopPosition;
291: }
292:
293: /** Get the current buffer */
294: public char[] getBuffer() {
295: return buffer;
296: }
297:
298: /** Get the current scanning offset */
299: public int getOffset() {
300: return offset;
301: }
302:
303: /** Get start of token in scanned buffer. */
304: public int getTokenOffset() {
305: return offset - tokenLength;
306: }
307:
308: /** Get length of token in scanned buffer. */
309: public int getTokenLength() {
310: return tokenLength;
311: }
312:
313: /** Get the token-context-path of the returned token. */
314: public TokenContextPath getTokenContextPath() {
315: return tokenContextPath;
316: }
317:
318: public TokenID getSupposedTokenID() {
319: return supposedTokenID;
320: }
321:
322: /**
323: * Get the pre-scan which is a number of characters between offset and
324: * tokenOffset. If there's no more characters in the current buffer, the
325: * analyzer returns EOT, but it can be in a state when there are already
326: * some characters parsed at the end of the current buffer but the token is
327: * still incomplete and it cannot be returned yet. The pre-scan value helps
328: * to determine how many characters from the end of the current buffer
329: * should be present at the begining of the next buffer so that the current
330: * incomplete token can be returned as the first token when parsing the next
331: * buffer.
332: */
333: public int getPreScan() {
334: return offset - tokenOffset;
335: }
336:
337: /**
338: * Initialize the analyzer when scanning from the begining of the document
339: * or when the state stored in syntax mark is null for some reason or to
340: * explicitly reset the analyzer to the initial state. The offsets must not
341: * be touched by this method.
342: */
343: public void loadInitState() {
344: state = INIT;
345: }
346:
347: public void reset() {
348: tokenLength = stopOffset = tokenOffset = offset = 0;
349: loadInitState();
350: }
351:
352: /**
353: * Load valid mark state into the analyzer. Offsets are already initialized
354: * when this method is called. This method must get the state from the mark
355: * and set it to the analyzer. Then it must decrease tokenOffset by the
356: * preScan stored in the mark state.
357: *
358: * @param markState
359: * mark state to be loaded into syntax. It must be non-null
360: * value.
361: */
362: public void loadState(StateInfo stateInfo) {
363: state = stateInfo.getState();
364: tokenOffset -= stateInfo.getPreScan();
365: }
366:
367: /** Store state of this analyzer into given mark state. */
368: public void storeState(StateInfo stateInfo) {
369: stateInfo.setState(state);
370: stateInfo.setPreScan(getPreScan());
371: }
372:
373: /** Compare state of this analyzer to given state info */
374: public int compareState(StateInfo stateInfo) {
375: if (stateInfo != null) {
376: return ((stateInfo.getState() == state) && stateInfo
377: .getPreScan() == getPreScan()) ? EQUAL_STATE
378: : DIFFERENT_STATE;
379: } else {
380: return DIFFERENT_STATE;
381: }
382: }
383:
384: /** Create state info appropriate for particular analyzer */
385: public StateInfo createStateInfo() {
386: return new BaseStateInfo();
387: }
388:
389: /**
390: * Get state name as string. It can be used for debugging purposes by
391: * developer of new syntax analyzer. The states that this function
392: * recognizes can include all constants used in analyzer so that it can be
393: * used everywhere in analyzer to convert numbers to more practical strings.
394: */
395: public String getStateName(int stateNumber) {
396: switch (stateNumber) {
397: case INIT:
398: return "INIT"; // NOI18N
399:
400: default:
401: return "Unknown state " + stateNumber; // NOI18N
402: }
403: }
404:
405: /** Syntax information as String */
406: public String toString() {
407: return "tokenOffset=" + tokenOffset // NOI18N
408: + ", offset=" + offset // NOI18N
409: + ", state=" + getStateName(state) // NOI18N
410: + ", stopOffset=" + stopOffset // NOI18N
411: + ", lastBuffer=" + lastBuffer; // NOI18N
412: }
413:
414: /**
415: * Interface that stores two basic pieces of information about the state of
416: * the whole lexical analyzer - its internal state and preScan.
417: */
418: public interface StateInfo {
419:
420: /** Get the internal state */
421: public int getState();
422:
423: /** Store the internal state */
424: public void setState(int state);
425:
426: /** Get the preScan value */
427: public int getPreScan();
428:
429: /** Store the preScan value */
430: public void setPreScan(int preScan);
431:
432: }
433:
434: /** Base implementation of the StateInfo interface */
435: public static class BaseStateInfo implements StateInfo {
436:
437: /** analyzer state */
438: private int state;
439:
440: /** Pre-scan length */
441: private int preScan;
442:
443: public int getState() {
444: return state;
445: }
446:
447: public void setState(int state) {
448: this .state = state;
449: }
450:
451: public int getPreScan() {
452: return preScan;
453: }
454:
455: public void setPreScan(int preScan) {
456: this .preScan = preScan;
457: }
458:
459: public String toString(Syntax syntax) {
460: return "state=" + syntax.getStateName(getState())
461: + ", preScan=" + getPreScan(); // NOI18N
462: }
463:
464: }
465:
466: }
|