/*
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
 *
 * Copyright 1997-2007 Sun Microsystems, Inc. All rights reserved.
 *
 * The contents of this file are subject to the terms of either the GNU
 * General Public License Version 2 only ("GPL") or the Common
 * Development and Distribution License("CDDL") (collectively, the
 * "License"). You may not use this file except in compliance with the
 * License. You can obtain a copy of the License at
 * http://www.netbeans.org/cddl-gplv2.html
 * or nbbuild/licenses/CDDL-GPL-2-CP. See the License for the
 * specific language governing permissions and limitations under the
 * License. When distributing the software, include this License Header
 * Notice in each file and include the License file at
 * nbbuild/licenses/CDDL-GPL-2-CP. Sun designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Sun in the GPL Version 2 section of the License file that
 * accompanied this code. If applicable, add the following below the
 * License Header, with the fields enclosed by brackets [] replaced by
 * your own identifying information:
 * "Portions Copyrighted [year] [name of copyright owner]"
 *
 * Contributor(s):
 *
 * The Original Software is NetBeans. The Initial Developer of the Original
 * Software is Sun Microsystems, Inc. Portions Copyright 1997-2007 Sun
 * Microsystems, Inc. All Rights Reserved.
 *
 * If you wish your version of this file to be governed by only the CDDL
 * or only the GPL Version 2, indicate your decision by adding
 * "[Contributor] elects to include this software in this distribution
 * under the [CDDL or GPL Version 2] license." If you do not indicate a
 * single choice of license, a recipient has the option to distribute
 * your version of this file under either the CDDL, the GPL Version 2 or
 * to extend the choice of license to its licensees as provided above.
 * However, if you add GPL Version 2 code and therefore, elected the GPL
 * Version 2 license, then the option applies only if the new code is
 * made subject to such option by the copyright holder.
 */

package org.netbeans.lib.lexer.inc;

import java.util.logging.Level;
import java.util.logging.Logger;
import org.netbeans.api.lexer.TokenId;
import org.netbeans.lib.lexer.LanguageOperation;
import org.netbeans.lib.lexer.LexerInputOperation;
import org.netbeans.lib.lexer.LexerUtilsConstants;
import org.netbeans.lib.lexer.token.AbstractToken;
import org.netbeans.spi.lexer.TokenValidator;

/**
 * Token updater fixes a list of tokens constructed for a document
 * after the document's text gets modified.
 *
 * <p>
 * The updater is similar to a list iterator
 * but the semantics of its modification operations differ.
 * <p>
 * The algorithm used in {@link #update(MutableTokenList, int, int, int, TokenListChange, Object)}
 * is based on "General Incremental Lexical Analysis" by Tim A. Wagner
 * and Susan L. Graham, University
 * of California, Berkeley. It's available online
 * at <a href="http://www.cs.berkeley.edu/Research/Projects/harmonia/papers/twagner-lexing.pdf">
 * twagner-lexing.pdf</a>.
 * <br>
 * A terminating <code>EOF</code> token is not used; instead, the lookahead
 * of the final token(s) is increased by one (past the end of the input)
 * once they have reached the EOF.
 * <br>
 * Non-startable tokens are not supported.
 * <br>
 * When a token with lookback one is updated as a result
 * of a modification, the lookahead of the preceding token is inspected
 * to find out whether the modification has really affected it.
 * This can often save the preceding token from being relexed.
 * <br>
 * Currently the algorithm computes the lookback values on the fly
 * and does not store the lookback in the tokens. For typical languages
 * the lookback is reasonably small (0, 1 or 2) so it's usually not worth
 * consuming extra space in token instances for storing the lookback.
 * There would also be an additional overhead of updating the lookback
 * values in the tokens after the modification and the algorithm code would
 * be somewhat less readable.
 *
 * <p>
 * The algorithm removes the affected tokens in the natural order in which they
 * follow in the token stream. That can be exploited when the removed tokens
 * need to be collected (e.g. in an array).
 * <br>
 * If the offset and state after token recognition match
 * the end offset and state after recognition of the originally present
 * token then the relexing is stopped, because a match was found and the newly
 * produced tokens would match the present ones.
 * <br>
 * Otherwise the token(s) in the list are removed and replaced
 * by the relexed token and the relexing continues until a match is reached.
 *
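 * <p>
 * A minimal, hypothetical caller-side sketch (the actual invocations live
 * elsewhere in the lexer module; <code>tokenList</code>, <code>change</code> and
 * <code>zeroIndexRelexState</code> are assumed to be supplied by that code):
 * <pre>
 *   // Sketch only: a mutable token list, a fresh change object and the relex
 *   // state for index 0 are assumed to be available to the caller.
 *   TokenListUpdater.update(tokenList, modOffset, insertedLength, removedLength,
 *           change, zeroIndexRelexState);
 *   // 'change' now describes the removed and added tokens;
 *   // update() itself calls tokenList.replaceTokens(change, ...) as its last step.
 * </pre>
 *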
 * @author Miloslav Metelka
 * @version 1.00
 */

public final class TokenListUpdater {

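    // Logging of the incremental update; enable FINE level with the following JVM switch: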
    // -J-Dorg.netbeans.lib.lexer.inc.TokenListUpdater.level=FINE
    private static final Logger LOG = Logger.getLogger(TokenListUpdater.class.getName());

    /**
     * Use the incremental algorithm to update the list of tokens
     * after a modification in the underlying storage.
     *
     * @param tokenList non-null token list that is being updated. It may be a top-level
     *  list or an embedded token list.
     * @param modOffset offset where the modification occurred.
     * @param insertedLength number of characters inserted at modOffset.
     * @param removedLength number of characters removed at modOffset.
     * @param change non-null change that will incorporate the performed changes.
     * @param zeroIndexRelexState state used for relexing at index 0.
     */
    public static <T extends TokenId> void update(MutableTokenList<T> tokenList,
            int modOffset, int insertedLength, int removedLength,
            TokenListChange<T> change, Object zeroIndexRelexState) {
        // The language operation gives access to token validators for the inner language
        LanguageOperation<T> languageOperation = LexerUtilsConstants.innerLanguageOperation(
                tokenList.languagePath());

        int tokenCount = tokenList.tokenCountCurrent(); // presently created token count
        // Now determine which token is the first to be relexed.
        // If it is either the modified token or the token preceding it
        // (for a modification right at the beginning of the modified token)
        // then an attempt will be made to validate the token (without running
        // a lexer).
        AbstractToken<T> modToken;
        // modTokenOffset holds the beginning of the token in which the modification occurred.
        int modTokenOffset;
        // index points to the modified token
        int index;

        boolean loggable = LOG.isLoggable(Level.FINE);
        if (loggable) {
            LOG.log(Level.FINE, "TokenListUpdater.update() STARTED\nmodOffset=" + modOffset
                    + ", insertedLength=" + insertedLength
                    + ", removedLength=" + removedLength
                    + ", tokenCount=" + tokenCount + "\n");
        }

        if (tokenCount == 0) { // no tokens yet or all removed
            if (!tokenList.isFullyLexed()) {
                // No tokens created yet (they get created lazily).
                if (loggable) {
                    LOG.log(Level.FINE, "TokenListUpdater.update() FINISHED: Not fully lexed yet.\n");
                }
                return; // Do nothing in this case
            }
            // If fully lexed and there are no tokens then the tokens should start
            // right at the modification offset
            modToken = null;
            modTokenOffset = modOffset;
            index = 0;

        } else { // at least one token exists
            // Check whether the modification at modOffset might affect existing tokens.
            // Get the index of the token in which the modification occurred
            // and the offset of the last token into the modTokenOffset variable.
            index = tokenCount - 1;
            modTokenOffset = tokenList.tokenOffset(index);
            if (modOffset >= modTokenOffset) { // inside or above the last token?
                modToken = token(tokenList, index);
                int modTokenEndOffset = modTokenOffset + modToken.length();
                if (modOffset >= modTokenEndOffset) { // above the last token
                    // Modification was right at the end boundary of the last token
                    // or above it (the token list can be created lazily so that is a valid case).
                    // Check whether the last token could be affected at all
                    // by checking the last token's lookahead.
                    // For fully lexed inputs the characters added to the end
                    // must be properly lexed and notified (even if the last present
                    // token has zero lookahead).
                    if (!tokenList.isFullyLexed()
                            && modOffset >= modTokenEndOffset + tokenList.lookahead(index)) {
                        if (loggable) {
                            LOG.log(Level.FINE, "TokenListUpdater.update() FINISHED: Not fully lexed yet."
                                    + " modTokenOffset=" + modTokenOffset
                                    + ", modToken.length()=" + modToken.length() + "\n");
                        }
                        return; // not affected at all
                    }

                    index++;
                    modToken = null;
                    modTokenOffset = modTokenEndOffset;
                } // else -> modification inside the last token

            } else { // modification in a non-last token
                // Find the modified token by binary search
                int low = 0; // use index as 'high'
                while (low <= index) {
                    int mid = (low + index) / 2;
                    int midStartOffset = tokenList.tokenOffset(mid);

                    if (midStartOffset < modOffset) {
                        low = mid + 1;
                    } else if (midStartOffset > modOffset) {
                        index = mid - 1;
                    } else {
                        // Token starting exactly at modOffset found
                        index = mid;
                        modTokenOffset = midStartOffset;
                        break;
                    }
                }
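                // If the loop ended without finding a token starting exactly at modOffset,
                // 'index' points to the last token that starts below modOffset,
                // i.e. the token in which the modification occurred.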
                if (index < low) { // no token starting right at 'modOffset'
                    modTokenOffset = tokenList.tokenOffset(index);
                }
                modToken = token(tokenList, index);
                if (loggable) {
                    LOG.log(Level.FINE, "BIN-SEARCH: index=" + index
                            + ", modTokenOffset=" + modTokenOffset
                            + ", modToken.id()=" + modToken.id() + "\n");
                }
            }
        }

        // Store the index that points to the modified token
        // i.e. the modification is at its beginning or inside it.
        // The index variable can be modified later but its present value is important
        // for the later moving of the offset gap.
        change.setOffsetGapIndex(index);

        // Index and offset from which the relexing will start.
        int relexIndex;
        int relexOffset;
        // Whether the token validation should be attempted or not.
        boolean attemptValidation = false;

        if (index == 0) { // modToken is first in the list
            relexIndex = index;
            relexOffset = modTokenOffset;
            // Can validate modToken if the removal does not span the whole token
            if (modToken != null && removedLength < modToken.length()) {
                attemptValidation = true;
            }

        } else { // Previous token exists
            // Check for an insert-only modification right at the end of the previous token
            if (modOffset == modTokenOffset && removedLength == 0) {
                index--; // move to the previous token
                modToken = token(tokenList, index);
                modTokenOffset -= modToken.length();
            }

            // Check whether the modification affected the previous token
            if (index == 0 || modTokenOffset + tokenList.lookahead(index - 1) <= modOffset) {
                // Modification did not affect the previous token
                relexIndex = index;
                relexOffset = modTokenOffset;
                // Check whether the modification was localized to modToken only
                if (modOffset + removedLength < modTokenOffset + modToken.length()) {
                    attemptValidation = true;
                }

            } else { // at least the previous token is affected
                relexOffset = modTokenOffset - token(tokenList, index - 1).length();
                relexIndex = index - 2; // Start with the token below the previous token

                // Go back and mark all affected tokens for removal
                while (relexIndex >= 0) {
                    AbstractToken<T> token = token(tokenList, relexIndex);
                    // Stop when the token was not affected by the modification
                    if (relexOffset + tokenList.lookahead(relexIndex) <= modOffset) {
                        break;
                    }
                    relexIndex--;
                    relexOffset -= token.length();
                }
                relexIndex++; // Next token will be relexed
            }
        }

        // The lowest offset at which the relexing can end
        // (the relexing may end at a higher offset if the relexed
        // tokens end at different boundaries than the original
        // tokens or if the states after the tokens' recognition
        // differ from the original states in the original tokens).
        int matchOffset;

        // Perform token validation of modToken if possible.
        // The index variable will hold the token index right before the matching point.
        if (attemptValidation) {
            matchOffset = modTokenOffset + modToken.length();
            TokenValidator tokenValidator = languageOperation.tokenValidator(modToken.id());
            if (tokenValidator != null && (tokenList.getClass() != IncTokenList.class)) {

//                if (tokenValidator.validateToken(modToken, modOffset - modTokenOffset, modRelOffset,
//                        removedLength, insertedLength)
//                ) {
//                    // Update positions
//                    change.initRemovedAddedOffsets()

//                    return; // validated successfully
//                }
            }

        } else { // Validation cannot be attempted
            // Need to compute matchOffset and matchIndex by iterating forward
            if (index < tokenCount) {
                matchOffset = modTokenOffset + modToken.length();
                int removeEndOffset = modOffset + removedLength;
                while (matchOffset < removeEndOffset && index + 1 < tokenCount) {
                    index++;
                    matchOffset += token(tokenList, index).length();
                }

            } else { // After the last token
                matchOffset = modTokenOffset;
            }
        }

        // State from which the lexer can be started
        Object relexState = (relexIndex > 0) ? tokenList.state(relexIndex - 1) : zeroIndexRelexState;
        // Update the matchOffset so that it corresponds to the state
        // after the modification
        matchOffset += insertedLength - removedLength;
        change.setOffset(relexOffset);

        // Variables' values:
        // 'index' - points to the modified token, or index == tokenCount for a modification
        //     past the last token.
        // 'tokenCount' - token count in the original token list.
        // 'relexIndex' - points to the token that will be relexed first.
        // 'relexOffset' - points to the beginning of the token that will be relexed first.
        // 'matchOffset' - points to the end of the token after which the fixed token list could
        //     possibly match the original token list. Points to the end of the token at 'index'
        //     if 'index < tokenCount' and to the end of the last token
        //     if 'index == tokenCount'.

        // Check whether relexing is necessary.
        // A necessary condition for no relexing is that the matchToken
        // has zero lookahead (if the lookahead were > 0
        // then the matchToken would be affected and relexOffset != matchOffset).
        // The state before the relex token must match the state after the modified token.
        // In case of a removal starting and ending at token boundaries
        // the relexing might not be necessary.
        boolean relex = (relexOffset != matchOffset)
                || index >= tokenCount
                || !LexerUtilsConstants.statesEqual(relexState, tokenList.state(index));

        // There is an extra condition that the lookahead of the matchToken
        // must not span the next (retained) token. This condition helps to ensure
        // that the lookaheads will be the same as during regular batch lexing.
        // As empty tokens are not allowed the situation may only occur
        // for lookahead > 1.
        int lookahead;
        if (!relex && (lookahead = tokenList.lookahead(index)) > 1 && index + 1 < tokenCount) {
            relex = (lookahead > token(tokenList, index + 1).length()); // check the next token
        }

        if (loggable) {
            LOG.log(Level.FINE, "BEFORE-RELEX: index=" + index
                    + ", modTokenOffset=" + modTokenOffset
                    + ", relexIndex=" + relexIndex + ", relexOffset=" + relexOffset
                    + ", relexState=" + relexState
                    + ", matchOffset=" + matchOffset
                    + ", perform relex: " + relex + "\n");
        }

        if (relex) { // Start relexing
            LexerInputOperation<T> lexerInputOperation
                    = tokenList.createLexerInputOperation(relexIndex, relexOffset, relexState);

            do { // Fetch new tokens from the lexer as necessary
                AbstractToken<T> token = lexerInputOperation.nextToken();
                if (token == null) {
                    attemptValidation = false;
                    break;
                }

                lookahead = lexerInputOperation.lookahead();
                Object state = lexerInputOperation.lexerState();
                if (loggable) {
                    LOG.log(Level.FINE, "LEXED-TOKEN: id=" + token.id()
                            + ", length=" + token.length()
                            + ", lookahead=" + lookahead
                            + ", state=" + state + "\n");
                }

                change.addToken(token, lookahead, state);

                relexOffset += token.length();
                // Remove obsolete tokens that would cover the area of the just lexed token.
                // 'index' will point to the last token that was removed.
                // 'matchOffset' will point to the end of the last removed token.
                if (relexOffset > matchOffset && index < tokenCount) {
                    attemptValidation = false;
                    do {
                        index++;
                        if (index == tokenCount) {
                            // Make sure the number of removed tokens will be computed properly later
                            modToken = null;
                            // Check whether lexing should continue till the end of the input
                            // or whether a 'match at anything' should be done
                            if (tokenList.isFullyLexed()) {
                                // Will lex till the end of the input
                                matchOffset = Integer.MAX_VALUE;
                            } else {
                                // Force stop of the lexing
                                relex = false;
                            }
                            break;
                        }
                        matchOffset += token(tokenList, index).length();
                    } while (relexOffset > matchOffset);
                }

                // Check whether the new token ends at matchOffset with the same state
                // as the original one, which typically means the end of relexing
                if (relexOffset == matchOffset && (index < tokenCount)
                        && LexerUtilsConstants.statesEqual(state, tokenList.state(index))) {
                    // Here it's a potential match and the relexing could end.
                    // However there are additional conditions that need to be checked.
                    // 1. The lookahead of the last relexed token must not exceed
                    //    the length plus the lookahead of the subsequent (original) token.
                    //    The initial part of SimpleRandomTest.test() verifies this.
                    // 2. The algorithm attempts to produce the same lookaheads in the tokens
                    //    as regular batch scanning would.
                    //    Although not a strictly necessary requirement,
                    //    it helps to simplify debugging in case the lexer does not work
                    //    well in the incremental setup.
                    // The following code checks that the lookahead of the original match token
                    // (i.e. the token right before matchOffset) does "end" inside
                    // the next token - if not then the next token is relexed as well.
                    // The second part of SimpleRandomTest.test() verifies this.

                    // 'index' points to the last token that was removed
                    int matchTokenLookahead = tokenList.lookahead(index);
                    // Optimistically suppose that the relexing will end
                    relex = false;
                    // Assuming non-empty tokens, a lookahead of 1
                    // just reaches the end of the next token,
                    // so lookahead <= 1 is always fine from this point of view.
                    if (matchTokenLookahead > 1 || lookahead > 1) {
                        // Start with the token right after the last removed token starting at matchOffset
                        int i = index + 1;
                        // Process additional removals by increasing 'index';
                        // 'lookahead' holds the remaining extra lookahead of the last relexed token
                        while (i < tokenCount) {
                            int tokenLength = token(tokenList, i).length();
                            lookahead -= tokenLength; // decrease the extra lookahead
                            matchTokenLookahead -= tokenLength;
                            if (lookahead <= 0 && matchTokenLookahead <= 0) {
                                break; // No more work
                            }
                            if (lookahead != tokenList.lookahead(i) || matchTokenLookahead > 0) {
                                // This token must be relexed
                                if (loggable) {
                                    LOG.log(Level.FINE, "EXTRA-RELEX: index=" + index
                                            + ", lookahead=" + lookahead
                                            + ", tokenLength=" + tokenLength + "\n");
                                }
                                index = i;
                                matchOffset += tokenLength;
                                relex = true;
                                // Continue - further tokens may be affected
                            }
                            i++;
                        }
                    }

                    if (!relex) {
                        if (attemptValidation) {
//                            if (modToken.id() == token.id()
//                                    && tokenList.lookahead(index) == lookahead
//                                    && !modToken.isFlyweight()
//                                    && !token.isFlyweight()
//                                    && (tokenList.getClass() != IncTokenList.class
//                                        || change.tokenHierarchyOperation().canModifyToken(index, modToken))
//                                    && LexerSpiTokenPackageAccessor.get().restoreToken(
//                                            languageOperation.tokenHandler(),
//                                            modToken, token)
//                            ) {
//                                // Restored successfully
//                                // TODO implement - fix the token's length and return
//                                // now default in fact to failed validation
//                            }
                            attemptValidation = false;
                        }
                    }
                }
            } while (relex); // End of the relexing loop
            lexerInputOperation.release();

            // If at least two tokens were lexed it's possible that e.g. the last added token
            // will be the same as the last removed token and in such a case
            // the addition of the last token should be 'undone'.
            // This all may happen due to the fact that for larger lookaheads
            // the algorithm must relex the token(s) within the lookahead (see the code above).
            int lastAddedTokenIndex = change.addedTokensOrBranchesCount() - 1;
            // At least one added token should remain since it may not be the same
            // as the originally removed one - the token lengths would differ
            // because of the input source modification.
            while (lastAddedTokenIndex >= 1 && index > relexIndex && index < tokenCount) {
                AbstractToken<T> addedToken = LexerUtilsConstants.token(
                        change.addedTokensOrBranches().get(lastAddedTokenIndex));
                AbstractToken<T> removedToken = token(tokenList, index);
                if (addedToken.id() != removedToken.id()
                        || addedToken.length() != removedToken.length()
                        || change.laState().lookahead(lastAddedTokenIndex) != tokenList.lookahead(index)
                        || !LexerUtilsConstants.statesEqual(
                                change.laState().state(lastAddedTokenIndex), tokenList.state(index))) {
                    break;
                }
                // The last removed and added tokens are the same so undo the addition
                if (loggable) {
                    LOG.log(Level.FINE, "RETAIN-ORIGINAL: index=" + index
                            + ", id=" + removedToken.id() + "\n");
                }
                lastAddedTokenIndex--;
                index--;
                relexOffset -= addedToken.length();
                change.removeLastAddedToken();
            }
        }

        // Now ensure that the original tokens will be replaced by the relexed ones.
        int removedTokenCount = (modToken != null) ? (index - relexIndex + 1) : (index - relexIndex);
        if (loggable) {
            LOG.log(Level.FINE, "TokenListUpdater.update() FINISHED: Removed:" + removedTokenCount
                    + ", Added:" + change.addedTokensOrBranchesCount() + " tokens.\n");
        }
        change.setIndex(relexIndex);
        change.setAddedEndOffset(relexOffset);
        tokenList.replaceTokens(change, removedTokenCount, insertedLength - removedLength);
    }

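    /**
     * Get the token at the given index, unwrapping a possible embedding container
     * entry into the plain token.
     */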
    private static <T extends TokenId> AbstractToken<T> token(MutableTokenList<T> tokenList, int index) {
        Object tokenOrEmbeddingContainer = tokenList.tokenOrEmbeddingContainerUnsync(index); // Unsync impl suffices
        return LexerUtilsConstants.token(tokenOrEmbeddingContainer);
    }

}
|