001: /*
002: * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
003: *
004: * Copyright 1997-2007 Sun Microsystems, Inc. All rights reserved.
005: *
006: * The contents of this file are subject to the terms of either the GNU
007: * General Public License Version 2 only ("GPL") or the Common
008: * Development and Distribution License("CDDL") (collectively, the
009: * "License"). You may not use this file except in compliance with the
010: * License. You can obtain a copy of the License at
011: * http://www.netbeans.org/cddl-gplv2.html
012: * or nbbuild/licenses/CDDL-GPL-2-CP. See the License for the
013: * specific language governing permissions and limitations under the
014: * License. When distributing the software, include this License Header
015: * Notice in each file and include the License file at
016: * nbbuild/licenses/CDDL-GPL-2-CP. Sun designates this
017: * particular file as subject to the "Classpath" exception as provided
018: * by Sun in the GPL Version 2 section of the License file that
019: * accompanied this code. If applicable, add the following below the
020: * License Header, with the fields enclosed by brackets [] replaced by
021: * your own identifying information:
022: * "Portions Copyrighted [year] [name of copyright owner]"
023: *
024: * Contributor(s):
025: *
026: * The Original Software is NetBeans. The Initial Developer of the Original
027: * Software is Sun Microsystems, Inc. Portions Copyright 1997-2006 Sun
028: * Microsystems, Inc. All Rights Reserved.
029: *
030: * If you wish your version of this file to be governed by only the CDDL
031: * or only the GPL Version 2, indicate your decision by adding
032: * "[Contributor] elects to include this software in this distribution
033: * under the [CDDL or GPL Version 2] license." If you do not indicate a
034: * single choice of license, a recipient has the option to distribute
035: * your version of this file under either the CDDL, the GPL Version 2 or
036: * to extend the choice of license to its licensees as provided above.
037: * However, if you add GPL Version 2 code and therefore, elected the GPL
038: * Version 2 license, then the option applies only if the new code is
039: * made subject to such option by the copyright holder.
040: */
041:
042: package org.netbeans.editor;
043:
044: /**
045: * Lexical analyzer that works on a given text buffer. It allows
046: * to sequentially parse a given character buffer by calling
047: * <tt>nextToken()</tt> that returns the token-ids.
048: *
049: * After the token is found by calling the <tt>nextToken</tt> method,
050: * the <tt>getTokenOffset()</tt> method can be used
051: * to get the starting offset of the current
052: * token in the buffer. The <tt>getTokenLength()</tt> gives the length
053: * of the current token.
054: *
055: * The heart of the analyzer is the <tt>parseToken()</tt> method which
056: * parses the text and returns the token-id of the last token found.
057: * The <tt>parseToken()</tt> method is called from the <tt>nextToken()</tt>.
058: * It operates with two important variables. The <tt>offset</tt>
059: * variable identifies the currently scanned character in the buffer.
* The <tt>tokenOffset</tt> is the beginning of the current token.
061: * The <tt>state</tt> variable that identifies the current internal
062: * state of the analyzer is set accordingly when the characters are parsed.
063: * If the <tt>parseToken()</tt> recognizes a token, it returns its ID
* and the <tt>tokenOffset</tt> is its beginning in the buffer and
065: * <tt>offset - tokenOffset</tt> is its length. When the token is processed
066: * the value of <tt>tokenOffset</tt> is set to be the same as current
067: * value of the <tt>offset</tt> and the parsing continues.
068: *
069: * Internal states are the integer constants used internally by analyzer.
070: * They are assigned to the <tt>state</tt> variable to express
071: * that the analyzer has moved from one state to another.
072: * They are usually numbered starting from zero but they don't
073: * have to. The only reserved value is -1 which is reserved
074: * for the INIT state - the initial internal state of the analyzer.
075: *
076: * There is also the support for defining the persistent info about
077: * the current state of the analyzer. This info can be later used
078: * to restore the parsing from some particular state instead of
* parsing from the beginning of the buffer. This feature is very
080: * useful if there are the modifications performed in the document.
081: * The info is stored in the <tt>StateInfo</tt> interface
082: * with the <tt>BaseStateInfo</tt> as the basic implementation.
083: * It enables to get and set the two important values
084: * from the persistent point of view.
085: * The first one is the value of the <tt>state</tt> variable.
086: * The other one is the difference <tt>offset - tokenOffset</tt>
087: * which is called pre-scan. The particular analyzer can define
088: * additional values important for the persistent storage.
* The <tt>createStateInfo()</tt> can be overridden to create
* custom state-info and <tt>loadState()</tt> and <tt>storeState()</tt>
* can be overridden to get/set the additional values.
092: *
093: * The <tt>load()</tt> method sets the buffer to be parsed.
094: * There is a special parameter in the load() method called position
095: * that allows a relation of the character buffer passed to the load()
096: * method and the position of the buffer's data in the document.
097: * For this extended functionality the document must be passed
098: * to the constructor of the lexical analyzer at some level.
099: *
100: *
101: * @author Miloslav Metelka
102: * @version 1.00
103: */
104:
105: public class Syntax {
106:
107: /** Is the state of analyzer equal to a given state info? */
108: public static final int EQUAL_STATE = 0;
109:
110: /** Is the state of analyzer different from given state info? */
111: public static final int DIFFERENT_STATE = 1;
112:
113: /** Initial internal state of the analyzer */
114: public static final int INIT = -1;
115:
116: /** Internal state of the lexical analyzer. At the begining
117: * it's set to INIT value but it is changed by <tt>parseToken()</tt>
118: * as the characters are processed one by one.
119: */
120: protected int state = INIT;
121:
122: /** Text buffer to scan */
123: protected char buffer[];
124:
125: /** Current offset in the buffer */
126: protected int offset;
127:
128: /** Offset holding the begining of the current token */
129: protected int tokenOffset;
130:
131: /** This variable is the length of the token that was found */
132: protected int tokenLength;
133:
134: /** Path from which the found token-id comes from.
135: * The <tt>TokenContext.getContextPath()</tt> can be used
136: * to get the path. If the lexical analyzer doesn't use
137: * any children token-contexts it can assign
138: * the path in the constructor.
139: */
140: protected TokenContextPath tokenContextPath;
141:
142: /** Setting this flag to true means that there are currently no more
143: * buffers available so that analyzer should return all the tokens
144: * including those whose successful scanning would be otherwise
145: * left for later when the next buffer will be available. Setting
146: * this flag to true ensures that all the characters in the current
147: * buffer will be processed.
148: * The lexical analyzer should on one hand process all the characters
149: * but on the other hand it should "save" its context. For example
150: * if the scanner finds the unclosed comment at the end of the buffer
151: * it should return the comment token but
152: * stay in the "being in comment" internal state.
153: */
154: protected boolean lastBuffer;
155:
156: /** On which offset in the buffer scanning should stop. */
157: protected int stopOffset;
158:
159: /** The position in the document that logically corresponds
160: * to the stopOffset value. If there's no relation
161: * to the document, it's -1. The reason why the relation
162: * to the document's data is expressed through
163: * the stopOffset to stopPosition relation is because
164: * the stopOffset is the only offset that doesn't change
165: * rapidly in the operation of the lexical analyzer.
166: */
167: protected int stopPosition;
168:
169: /** This variable can be populated by the parseToken() method
170: * in case the user types an errorneous construction but
171: * it's clear what correct token he meant to write.
172: * For example if the user writes a single '0x' it's an errorneous
173: * construct but it's clear that the user wants to enter
174: * the hexa-number. In this situation the parseToken()
175: * should report error, but it should also set the supposedTokenID
176: * to the hexa-number token.
177: * This information is used while drawing the text. If the caret
178: * stand inside or around such token, it calls the getSupposedTokenID()
179: * after calling the nextToken() and if it's non-null it uses it
180: * instead of the original token.
181: */
182: protected TokenID supposedTokenID;
183:
184: /** Function that should be called externally to scan the text.
185: * It manages the call to parseToken() and cares about the proper
186: * setting of the offsets.
187: * It can be extended to support any custom debugging required.
188: */
189: public TokenID nextToken() {
190: // Return immediately when at the end of buffer
191: if (tokenOffset >= stopOffset) {
192: tokenLength = 0;
193: return null; // signal no token found
194: }
195:
196: // Divide non-debug and debug sections
197: supposedTokenID = null;
198: TokenID tokenID = parseToken();
199: if (tokenID != null) { // regular token found
200: tokenLength = offset - tokenOffset;
201: tokenOffset = offset; // move to the next token
202: if (tokenLength == 0) { // test for empty token
203: return nextToken(); // repeat until non-empty token is found
204: }
205: } else { // EOT reached
206: tokenLength = 0;
207: }
208:
209: return tokenID;
210: }
211:
212: /** This is core function of analyzer and it returns either the token-id
213: * or null to indicate that the end of buffer was found.
214: * The function scans the active character and does one or more
215: * of the following actions:
216: * 1. change internal analyzer state
217: * 2. set the token-context-path and return token-id
218: * 3. adjust current position to signal different end of token;
219: * the character that offset points to is not included in the token
220: */
221: protected TokenID parseToken() {
222: return null;
223: }
224:
225: /** Load the state from syntax mark into analyzer. This method is used when
226: * @param stateInfo info about the state of the lexical analyzer to load.
227: * It can be null to indicate there's no previous state so the analyzer
228: * starts from its initial state.
229: * @param buffer buffer that will be scanned
230: * @param offset offset of the first character that will be scanned
231: * @param len length of the area to be scanned
232: * @param lastBuffer whether this is the last buffer in the document. All the tokens
233: * will be returned including the last possibly incomplete one. If the data
234: * come from the document, the simple rule for this parameter
235: * is (doc.getLength() == stop-position) where stop-position
236: * is the position corresponding to the (offset + len) in the buffer
237: * that comes from the document data.
238: * @param stopPosition position in the document that corresponds to (offset + len) offset
239: * in the provided buffer. It has only sense if the data in the buffer come from the document.
240: * It helps in writing the advanced analyzers that need to interact with some other data
241: * in the document than only those provided in the character buffer.
242: * If there is no relation to the document data, the stopPosition parameter
243: * must be filled with -1 which means an invalid value.
244: * The stop-position is passed (instead of start-position) because it doesn't
245: * change through the analyzer operation. It corresponds to the <tt>stopOffset</tt>
246: * that also doesn't change through the analyzer operation so any
247: * buffer-offset can be transferred to position by computing
248: * <tt>stopPosition + buffer-offset - stopOffset</tt>
249: * where stopOffset is the instance variable that is assigned
250: * to <tt>offset + len</tt> in the body of relocate().
251: */
252: public void load(StateInfo stateInfo, char buffer[], int offset,
253: int len, boolean lastBuffer, int stopPosition) {
254: this .buffer = buffer;
255: this .offset = offset;
256: this .tokenOffset = offset;
257: this .stopOffset = offset + len;
258: this .lastBuffer = lastBuffer;
259: this .stopPosition = stopPosition;
260:
261: if (stateInfo != null) {
262: loadState(stateInfo);
263: } else {
264: loadInitState();
265: }
266: }
267:
268: /** Relocate scanning to another buffer.
269: * This is used to continue scanning after previously
270: * reported EOT. Relocation delta between current offset and the requested offset
271: * is computed and all the offsets are relocated. If there's a non-zero preScan
272: * in the analyzer, it is a caller's responsibility to provide all the preScan
273: * characters in the relocation buffer.
274: * @param buffer next buffer where the scan will continue.
275: * @param offset offset where the scan will continue.
276: * It's not decremented by the current preScan.
277: * @param len length of the area to be scanned.
278: * It's not extended by the current preScan.
279: * @param lastBuffer whether this is the last buffer in the document. All the tokens
280: * will be returned including the last possibly incomplete one. If the data
281: * come from the document, the simple rule for this parameter
282: * is (doc.getLength() == stop-position) where stop-position
283: * is the position corresponding to the (offset + len) in the buffer
284: * that comes from the document data.
285: * @param stopPosition position in the document that corresponds to (offset + len) offset
286: * in the provided buffer. It has only sense if the data in the buffer come from the document.
287: * It helps in writing the advanced analyzers that need to interact with some other data
288: * in the document than only those provided in the character buffer.
289: * If there is no relation to the document data, the stopPosition parameter
290: * must be filled with -1 which means an invalid value.
291: * The stop-position is passed (instead of start-position) because it doesn't
292: * change through the analyzer operation. It corresponds to the <tt>stopOffset</tt>
293: * that also doesn't change through the analyzer operation so any
294: * buffer-offset can be transferred to position by computing
295: * <tt>stopPosition + buffer-offset - stopOffset</tt>
296: * where stopOffset is the instance variable that is assigned
297: * to <tt>offset + len</tt> in the body of relocate().
298: */
299: public void relocate(char buffer[], int offset, int len,
300: boolean lastBuffer, int stopPosition) {
301: this .buffer = buffer;
302: this .lastBuffer = lastBuffer;
303:
304: int delta = offset - this .offset; // delta according to current offset
305: this .offset += delta;
306: this .tokenOffset += delta;
307: this .stopOffset = offset + len;
308: this .stopPosition = stopPosition;
309: }
310:
311: /** Get the current buffer */
312: public char[] getBuffer() {
313: return buffer;
314: }
315:
316: /** Get the current scanning offset */
317: public int getOffset() {
318: return offset;
319: }
320:
321: /** Get start of token in scanned buffer. */
322: public int getTokenOffset() {
323: return offset - tokenLength;
324: }
325:
326: /** Get length of token in scanned buffer. */
327: public int getTokenLength() {
328: return tokenLength;
329: }
330:
331: /** Get the token-context-path of the returned token. */
332: public TokenContextPath getTokenContextPath() {
333: return tokenContextPath;
334: }
335:
336: public TokenID getSupposedTokenID() {
337: return supposedTokenID;
338: }
339:
340: /** Get the pre-scan which is a number
341: * of characters between offset and tokenOffset.
342: * If there's no more characters in the current buffer,
343: * the analyzer returns EOT, but it can be in a state when
344: * there are already some characters parsed at the end of
345: * the current buffer but the token
346: * is still incomplete and it cannot be returned yet.
347: * The pre-scan value helps to determine how many characters
348: * from the end of the current buffer should be present
349: * at the begining of the next buffer so that the current
350: * incomplete token can be returned as the first token
351: * when parsing the next buffer.
352: */
353: public int getPreScan() {
354: return offset - tokenOffset;
355: }
356:
357: /** Initialize the analyzer when scanning from the begining
358: * of the document or when the state stored in syntax mark
359: * is null for some reason or to explicitly reset the analyzer
360: * to the initial state. The offsets must not be touched by this method.
361: */
362: public void loadInitState() {
363: state = INIT;
364: }
365:
366: public void reset() {
367: tokenLength = stopOffset = tokenOffset = offset = 0;
368: loadInitState();
369: }
370:
371: /** Load valid mark state into the analyzer. Offsets
372: * are already initialized when this method is called. This method
373: * must get the state from the mark and set it to the analyzer. Then
374: * it must decrease tokenOffset by the preScan stored in the mark state.
375: * @param markState mark state to be loaded into syntax. It must be non-null value.
376: */
377: public void loadState(StateInfo stateInfo) {
378: state = stateInfo.getState();
379: tokenOffset -= stateInfo.getPreScan();
380: }
381:
382: /** Store state of this analyzer into given mark state. */
383: public void storeState(StateInfo stateInfo) {
384: stateInfo.setState(state);
385: stateInfo.setPreScan(getPreScan());
386: }
387:
388: /** Compare state of this analyzer to given state info */
389: public int compareState(StateInfo stateInfo) {
390: if (stateInfo != null) {
391: return ((stateInfo.getState() == state) && stateInfo
392: .getPreScan() == getPreScan()) ? EQUAL_STATE
393: : DIFFERENT_STATE;
394: } else {
395: return DIFFERENT_STATE;
396: }
397: }
398:
399: /** Create state info appropriate for particular analyzer */
400: public StateInfo createStateInfo() {
401: return new BaseStateInfo();
402: }
403:
404: /** Get state name as string. It can be used for debugging purposes
405: * by developer of new syntax analyzer. The states that this function
406: * recognizes can include all constants used in analyzer so that it can
407: * be used everywhere in analyzer to convert numbers to more practical strings.
408: */
409: public String getStateName(int stateNumber) {
410: switch (stateNumber) {
411: case INIT:
412: return "INIT"; // NOI18N
413:
414: default:
415: return "Unknown state " + stateNumber; // NOI18N
416: }
417: }
418:
419: /** Syntax information as String */
420: public String toString() {
421: return "tokenOffset=" + tokenOffset // NOI18N
422: + ", offset=" + offset // NOI18N
423: + ", state=" + getStateName(state) // NOI18N
424: + ", stopOffset=" + stopOffset // NOI18N
425: + ", lastBuffer=" + lastBuffer; // NOI18N
426: }
427:
428: /** Interface that stores two basic pieces of information about
429: * the state of the whole lexical analyzer - its internal state and preScan.
430: */
431: public interface StateInfo {
432:
433: /** Get the internal state */
434: public int getState();
435:
436: /** Store the internal state */
437: public void setState(int state);
438:
439: /** Get the preScan value */
440: public int getPreScan();
441:
442: /** Store the preScan value */
443: public void setPreScan(int preScan);
444:
445: }
446:
447: /** Base implementation of the StateInfo interface */
448: public static class BaseStateInfo implements StateInfo {
449:
450: /** analyzer state */
451: private int state;
452:
453: /** Pre-scan length */
454: private int preScan;
455:
456: public int getState() {
457: return state;
458: }
459:
460: public void setState(int state) {
461: this .state = state;
462: }
463:
464: public int getPreScan() {
465: return preScan;
466: }
467:
468: public void setPreScan(int preScan) {
469: this .preScan = preScan;
470: }
471:
472: public String toString(Syntax syntax) {
473: return "state=" // NOI18N
474: + ((syntax != null) ? syntax
475: .getStateName(getState()) : Integer
476: .toString(getState()))
477: + ", preScan="
478: + getPreScan(); // NOI18N
479: }
480:
481: public String toString() {
482: return toString(null);
483: }
484:
485: }
486:
487: }
|