001: /*
002: * SequenceStore.java: string, comment and special sequence handling in tokenizers
003: *
004: * Copyright (C) 2003, 2004 Heiko Blau
005: *
006: * This file belongs to the JTopas Library.
007: * JTopas is free software; you can redistribute it and/or modify it
008: * under the terms of the GNU Lesser General Public License as published by the
009: * Free Software Foundation; either version 2.1 of the License, or (at your
010: * option) any later version.
011: *
012: * This software is distributed in the hope that it will be useful, but WITHOUT
013: * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
014: * FITNESS FOR A PARTICULAR PURPOSE.
015: * See the GNU Lesser General Public License for more details.
016: *
017: * You should have received a copy of the GNU Lesser General Public License along
018: * with JTopas. If not, write to the
019: *
020: * Free Software Foundation, Inc.
021: * 59 Temple Place, Suite 330,
022: * Boston, MA 02111-1307
023: * USA
024: *
025: * or check the Internet: http://www.fsf.org
026: *
027: * Contact:
028: * email: heiko@susebox.de
029: */
030:
031: package de.susebox.jtopas.impl;
032:
033: //-----------------------------------------------------------------------------
034: // Imports
035: //
036: import java.util.Iterator;
037: import java.util.TreeMap;
038: import java.util.NoSuchElementException;
039:
040: import de.susebox.java.lang.ExtRuntimeException;
041:
042: import de.susebox.jtopas.Token;
043: import de.susebox.jtopas.TokenizerProperty;
044: import de.susebox.jtopas.TokenizerProperties;
045: import de.susebox.jtopas.TokenizerException;
046:
047: import de.susebox.jtopas.spi.SequenceHandler;
048: import de.susebox.jtopas.spi.KeywordHandler;
049: import de.susebox.jtopas.spi.DataProvider;
050:
051: //-----------------------------------------------------------------------------
052: // Class SequenceStore
053: //
054:
055: /**
056: * This class is used by {@link de.susebox.jtopas.StandardTokenizerProperties}
057: * to store and search special sequences, comments and strings as well as
058: * keywords. The class is not suitable for standalone use since it does not check
059: * parameters for <code>null</code> values, assumes a thread-safe context etc.
060: *
061: * @see de.susebox.jtopas.StandardTokenizerProperties
062: * @see de.susebox.jtopas.spi.SequenceHandler
063: * @author Heiko Blau
064: */
065: public class SequenceStore implements SequenceHandler, KeywordHandler {
066:
067: //---------------------------------------------------------------------------
068: // Constants
069: //
070:
071: /**
072: * This number is the size of the array that is directly indexed by letters
073: */
074: public static char DIRECT_INDEX_COUNT = 256;
075:
076: //---------------------------------------------------------------------------
077: // Constructors
078: //
079:
080: /**
081: * The constructor initializes a <code>SequenceStore</code> with the given
082: * comparision policy (prefix comparison).
083: *
084: * @param useExactLength if <code>true</code> search only for a property that
085: * has the length of {@link de.susebox.jtopas.spi.DataProvider#getLength}
086: */
087: public SequenceStore(boolean useExactLength) {
088: _useExactLength = useExactLength;
089: _maxLength = 0;
090: _asciiArray = new PropertyList[DIRECT_INDEX_COUNT];
091: _nonASCIIMap = new TreeMap();
092: }
093:
094: //---------------------------------------------------------------------------
095: // Methods of the SequenceHandler interface
096: //
097:
098: /**
099: * This method returns <code>true</code> if there are any special sequences,
100: * strings or comments registered in this instance. See the
101: * {@link de.susebox.jtopas.spi.SequenceHandler} interface for details.
102: *
103: * @return <code>true</code> if there are any special sequences, strings or
104: * comments available, <code>false</code> otherwise.
105: */
106: public boolean hasSequenceCommentOrString() {
107: return _maxLength > 0;
108: }
109:
110: /**
111: * This method checks if a given range of data starts with a special sequence,
112: * a comment or a string. See {@link de.susebox.jtopas.spi.SequenceHandler} for
113: * details.
114: *
115: * @param dataProvider the source to get the data range from
116: * @return a {@link de.susebox.jtopas.TokenizerProperty} if a special sequence,
117: * comment or string could be detected, <code>null</code> otherwise
118: * @throws TokenizerException generic exception
119: * @throws NullPointerException if no {@link DataProvider} is given
120: */
121: public TokenizerProperty startsWithSequenceCommentOrString(
122: DataProvider dataProvider) throws TokenizerException,
123: NullPointerException {
124: // only if characters are available
125: if (dataProvider.getLength() > 0) {
126: int len = dataProvider.getLength();
127: char startChar = getStartChar(dataProvider.getCharAt(0));
128: PropertyList list = getList(startChar);
129:
130: while (list != null) {
131: TokenizerProperty prop = list._property;
132: String image = prop.getImages()[0];
133: int imageLen = image.length();
134:
135: // compare only if the enough data is available
136: if (_useExactLength && imageLen < len) {
137: break; // dont check shorter properties
138: } else if (imageLen <= len
139: && comparePrefix(image, dataProvider, 1) == 0) {
140: return prop; // single point of success
141: }
142: list = list._next;
143: }
144: }
145:
146: // not found
147: return null;
148: }
149:
150: /**
151: * This method returns the length of the longest special sequence, comment or
152: * string prefix that is known to this <code>SequenceStore</code>. See
153: * {@link de.susebox.jtopas.spi.SequenceHandler} for details.
154: *
155: * @return the number of characters needed in the worst case to identify a
156: * special sequence
157: */
158: public int getSequenceMaxLength() {
159: return _maxLength;
160: }
161:
162: //---------------------------------------------------------------------------
163: // Methods of the KeywordHandler interface
164: //
165:
166: /**
167: * This method returns <code>true</code> if there are any keywords registered
168: * in this instance. See the {@link de.susebox.jtopas.spi.KeywordHandler}
169: * interface for details.
170: *
171: * @return <code>true</code> if there are any keywords available,
172: * <code>false</code> otherwise.
173: */
174: public boolean hasKeywords() {
175: // this classis is either used to store special sequences or keywords.
176: return hasSequenceCommentOrString();
177: }
178:
179: /**
180: * This method checks if the given data form a keyword.
181: * See {@link de.susebox.jtopas.spi.KeywordHandler} for details.
182: *
183: * @param dataProvider the source to get the data range from
184: * @return a {@link de.susebox.jtopas.TokenizerProperty} if keyword has been found,
185: * <code>null</code> otherwise
186: * @throws TokenizerException generic exception
187: * @throws NullPointerException if no {@link DataProvider} is given
188: */
189: public TokenizerProperty isKeyword(DataProvider dataProvider)
190: throws TokenizerException, NullPointerException {
191: return startsWithSequenceCommentOrString(dataProvider);
192: }
193:
194: //---------------------------------------------------------------------------
195: // Implementation
196: //
197:
198: /**
199: * The method returns the normalized start character. This default implementation
200: * returns the given character itself, for case-insensitive handling see the
201: * derived class {@link NoCaseSequenceStore}.
202: *
203: * @param startChar a not normalized start character
204: * @return the normalized start character
205: */
206: protected char getStartChar(char startChar) {
207: return startChar;
208: }
209:
210: /**
211: * Addingt or replacing a special sequence, comment or string.
212: *
213: * @param property the description of the new sequence
214: * @return the previously version of the given property or <code>null</code>
215: */
216: public TokenizerProperty addSpecialSequence(
217: TokenizerProperty property) {
218: String image = property.getImages()[0];
219: int length = image.length();
220: char startChar = getStartChar(image.charAt(0));
221:
222: if (_maxLength < length) {
223: _maxLength = length;
224: }
225: if (startChar >= 0 && startChar < DIRECT_INDEX_COUNT) {
226: return insertDirect(startChar, property);
227: } else {
228: return insertMapped(startChar, property);
229: }
230: }
231:
232: /**
233: * Removing a special sequence from the store. If the special sequence denoted
234: * by the given string does not exist the method returns <code>null</code>.
235: *
236: * @param image sequence to remove
237: * @return the removed property or <code>null</code> if the sequence was not found
238: */
239: public TokenizerProperty removeSpecialSequence(String image) {
240: return searchString(image, true);
241: }
242:
243: /**
244: * Get the full description of a special sequence property.
245: *
246: * @param image sequence to find
247: * @return the full sequence description or <code>null</code>
248: */
249: public TokenizerProperty getSpecialSequence(String image) {
250: return searchString(image, false);
251: }
252:
253: /**
254: * This method returns an {@link java.util.Iterator} of {@link TokenizerProperty}
255: * objects.
256: *
257: * @param type type of special sequence like {@link de.susebox.jtopas.Token#STRING}
258: * @return enumeration of {@link TokenizerProperty} objects
259: */
260: public Iterator getSpecialSequences(int type) {
261: return new SpecialSequencesIterator(this , type);
262: }
263:
264: /**
265: * Addingt or replacing a keyword.
266: *
267: * @param property the description of the new keyword
268: * @return the previously version of the given property or <code>null</code>
269: */
270: public TokenizerProperty addKeyword(TokenizerProperty property) {
271: return addSpecialSequence(property);
272: }
273:
274: /**
275: * Removing a special sequence from the store. If the special sequence denoted
276: * by the given string does not exist the method returns <code>null</code>.
277: *
278: * @param image sequence to remove
279: * @return the removed property or <code>null</code> if the sequence was not found
280: */
281: public TokenizerProperty removeKeyword(String image) {
282: return removeSpecialSequence(image);
283: }
284:
285: /**
286: * Get the full description of a keyword property.
287: *
288: * @param image keyword candidate to look for
289: * @return the full keyword description or <code>null</code>
290: */
291: public TokenizerProperty getKeyword(String image) {
292: return getSpecialSequence(image);
293: }
294:
295: /**
296: * This method returns an {@link java.util.Iterator} of {@link TokenizerProperty}
297: * objects describing keywords.
298: *
299: * @return enumeration of {@link TokenizerProperty} objects representing
300: * keywords
301: */
302: public Iterator getKeywords() {
303: return getSpecialSequences(Token.KEYWORD);
304: }
305:
306: /**
307: * Get the property list for a given character.
308: *
309: * @param startChar start character
310: * @return list of properties starting with the given character
311: */
312: private PropertyList getList(char startChar) {
313: // get the list: here we try a very fast access for the ASCII characters
314: // via unchecked access and caught exceptions
315: PropertyList list;
316:
317: try {
318: // direct indexed sequence
319: list = _asciiArray[startChar];
320: } catch (IndexOutOfBoundsException ex) {
321: // mapped sequence
322: list = (PropertyList) _nonASCIIMap.get(new Character(
323: startChar));
324: }
325: return list;
326: }
327:
328: /**
329: * Search a string in the given list. Optionally, remove it. Removal may also
330: * reorganize the indexed array or non-ASCII map.
331: *
332: * @param image sequence to search
333: * @param removeIt if <code>true</code> remove a found sequence from the list
334: * @return the property or <code>null</code> if the sequence was not found
335: */
336: private TokenizerProperty searchString(String image,
337: boolean removeIt) {
338: char startChar = getStartChar(image.charAt(0));
339: PropertyList list = getList(startChar);
340: PropertyList prev = null;
341:
342: while (list != null) {
343: TokenizerProperty prop = list._property;
344: String img = prop.getImages()[0];
345: int res = compare(img, image, 1);
346:
347: if (res == 0) {
348: if (removeIt) {
349: if (prev != null) {
350: prev._next = list._next;
351: } else {
352: list = list._next;
353: if (startChar >= 0
354: && startChar < DIRECT_INDEX_COUNT) {
355: _asciiArray[startChar] = list;
356: } else if (list != null) {
357: _nonASCIIMap.put(new Character(startChar),
358: list);
359: } else {
360: _nonASCIIMap
361: .remove(new Character(startChar));
362: }
363: }
364: }
365: return prop;
366: } else if (res < 0) {
367: break;
368: }
369: prev = list;
370: list = list._next;
371: }
372: return null;
373: }
374:
375: /**
376: * Insert a new property into the direct-index array.
377: *
378: * @param property the description of the new sequence
379: * @return the previously version of the given property or <code>null</code>
380: */
381: private TokenizerProperty insertDirect(char startChar,
382: TokenizerProperty property) {
383: // the first element with the given start letter ...
384: if (_asciiArray[startChar] == null) {
385: _asciiArray[startChar] = new PropertyList(property);
386: return null;
387:
388: // ... or inserting/replacing in an existing list
389: } else {
390: return putIntoList(_asciiArray[startChar], property);
391: }
392: }
393:
394: /**
395: * Insert a new property into the hash table for real unicode letters.
396: *
397: * @param property the description of the new sequence
398: * @return the previously version of the given property or <code>null</code>
399: */
400: private TokenizerProperty insertMapped(char startChar,
401: TokenizerProperty property) {
402: Character key = new Character(getStartChar(startChar));
403: PropertyList list = (PropertyList) _nonASCIIMap.get(key);
404:
405: if (list == null) {
406: _nonASCIIMap.put(key, new PropertyList(property));
407: return null;
408: } else {
409: return putIntoList(list, property);
410: }
411: }
412:
413: /**
414: * Insert/replace a property in a property list. The list is ordered by string
415: * comparison. This is important for the search in {@link #startsWithSequenceCommentOrString}.
416: *
417: * @param list insert or replace in this list
418: * @param property the description of the new sequence
419: * @return the previously version of the given property or <code>null</code>
420: */
421: private TokenizerProperty putIntoList(PropertyList list,
422: TokenizerProperty property) {
423: String newImage = property.getImages()[0];
424: PropertyList prev;
425:
426: do {
427: TokenizerProperty prop = list._property;
428: String image = prop.getImages()[0];
429: int res = compare(image, newImage, 1);
430:
431: if (res == 0) {
432: list._property = property;
433: return prop;
434: } else if (res < 0) {
435: list._next = new PropertyList(prop, list._next);
436: list._property = property;
437: return null;
438: }
439: prev = list;
440: } while ((list = prev._next) != null);
441:
442: // Append element
443: prev._next = new PropertyList(property);
444: return null;
445: }
446:
447: /**
448: * Compare method for sequences. Longer Strings are greater, shorter are lesser.
449: * Strings with equal lengths are compared in the usual way.
450: *
451: * @param thisImage first string to compare
452: * @param thatImage second string to compare
453: * @param fromIndex start comparison from this index
454: * @return 0 if equal, < 0 if thisImage < thatImage, > 0 otherwise
455: */
456: private int compare(String this Image, String thatImage,
457: int fromIndex) {
458: int this Length = this Image.length();
459: int thatLength = thatImage.length();
460:
461: if (this Length != thatLength) {
462: return this Length - thatLength;
463: }
464:
465: while (fromIndex < this Length) {
466: int res = compare(this Image.charAt(fromIndex), thatImage
467: .charAt(fromIndex));
468:
469: if (res != 0) {
470: return res;
471: }
472: fromIndex++;
473: }
474: return 0;
475: }
476:
477: /**
478: * Compare method for a string and a character array. The method assumes that
479: * the character array holds at least as many characters as the given string.
480: *<br>
481: * See {@link #compare} for details how the comparison is performed.
482: *
483: * @param prefix string to compare
484: * @param dataProvider source to get the other characters to compare
485: * @param offset start comparison from this index
486: * @return 0 if equal, < 0 if thisImage < thatImage, > 0 otherwise
487: */
488: private int comparePrefix(String prefix, DataProvider dataProvider,
489: int offset) {
490: while (offset < prefix.length()) {
491: int res = compare(prefix.charAt(offset), dataProvider
492: .getCharAt(offset));
493:
494: if (res != 0) {
495: return res;
496: }
497: offset++;
498: }
499: return 0;
500: }
501:
502: /**
503: * Compare tho characters. Returns the difference of the to characters, 0 if
504: * they are equal. The default implementation compares case-sensitive, for the
505: * lexicographical solution see {@link NoCaseSequenceStore}..............
506: *
507: * @param char1 first character to compare
508: * @param char2 first character to compare
509: * @return 0 if equal, < 0 if char1 < char2, > 0 otherwise
510: */
511: protected int compare(char char1, char char2) {
512: return char1 - char2;
513: }
514:
515: //---------------------------------------------------------------------------
516: // Inner class
517: //
518:
519: /**
520: * List element for equaly starting special sequences.
521: */
522: final class PropertyList {
523:
524: /**
525: * Constructor taking the {@link TokenizerProperty} as its single argument.
526: *
527: * @param property a {@link TokenizerProperty} instance
528: */
529: PropertyList(TokenizerProperty property) {
530: this (property, null);
531: }
532:
533: /**
534: * Constructor taking a {@link TokenizerProperty} and the next list element.
535: * For the next element, <code>null</code> is a valid value.
536: *
537: * @param property a {@link TokenizerProperty} instance
538: * @param next the following {@link PropertyList} element
539: */
540: PropertyList(TokenizerProperty property, PropertyList next) {
541: _property = property;
542: _next = next;
543: }
544:
545: // members
546: public PropertyList _next;
547: public TokenizerProperty _property;
548: }
549:
550: /**
551: * Iterator for comments, strings and ordinary special sequences.
552: * Instances of this inner class are returned when a call to {@link #getSpecialSequences}
553: * is done. Each element of the enumeration contains a {@link TokenizerProperty}
554: * element, that in turn has the comment, special sequence etc. together with
555: * its companion
556: */
557: final class SpecialSequencesIterator implements Iterator {
558:
559: /**
560: * constructor taking the calling <code>Tokenizer</code> and the type of the
561: * {@link TokenizerProperty}. If the type is 0 then special sequences, line and
562: * block comments are returned in one iterator
563: *
564: * @param parent the calling tokenizer
565: * @param type type of the <code>TokenizerProperty</code>
566: */
567: public SpecialSequencesIterator(SequenceStore parent, int type) {
568: _type = type;
569: _parent = parent;
570: }
571:
572: /**
573: * Checking for the next element in a special sequence list, that has the
574: * required type. This method is the one that ultimately decides if there are
575: * more elements or not.
576: *
577: * @return <code>true</code> if there is a matching {@link TokenizerProperty}
578: * element, <code>false</code> otherwise
579: */
580: private boolean listHasNext() {
581: while (_currentList != null) {
582: if (_type == 0
583: || _currentList._property.getType() == _type) {
584: return true;
585: }
586: _currentList = _currentList._next;
587: }
588: return false;
589: }
590:
591: /**
592: * The well known method from the {@link java.util.Iterator} interface.
593: *
594: * @return <code>true</code> if there are more {@link TokenizerProperty}
595: * elements, <code>false</code> otherwise
596: */
597: public boolean hasNext() {
598: // simple: check the current list for a successor
599: if (listHasNext()) {
600: return true;
601: }
602:
603: // already reached the tree map iterator ?
604: if (_mapIterator != null) {
605: while (_mapIterator.hasNext()) {
606: _currentList = (PropertyList) _mapIterator.next();
607: if (listHasNext()) {
608: return true;
609: }
610: }
611:
612: // ... or still the direct index array ?
613: } else {
614: if (_parent._asciiArray != null) {
615: while (++_currentIndex < DIRECT_INDEX_COUNT) {
616: if ((_currentList = _parent._asciiArray[_currentIndex]) != null) {
617: if (listHasNext()) {
618: return true;
619: }
620: }
621: }
622: }
623: if (_parent._nonASCIIMap != null) {
624: _mapIterator = _parent._nonASCIIMap.values()
625: .iterator();
626: _currentList = null;
627: return hasNext();
628: }
629: }
630:
631: // no (more) sequences
632: return false;
633: }
634:
635: /**
636: * Retrieve the next {@link TokenizerProperty} in this enumeration.
637: *
638: * @return a {@link TokenizerProperty} of the desired type or <code>null</code>
639: * @throws NoSuchElementException if there is no more element in this iterator
640: */
641: public Object next() throws NoSuchElementException {
642: if (!hasNext()) {
643: throw new NoSuchElementException();
644: }
645:
646: _currentElem = _currentList;
647: _currentList = _currentList._next;
648: return _currentElem._property;
649: }
650:
651: /**
652: * Remove the current special sequence entry from the collection. This is an
653: * alternative to {@link Tokenizer#removeSpecialSequence}.
654: *
655: * @throws IllegalStateExcpetion if {@link #next} has not been called before or
656: * <code>remove</code> has been called already after the last <code>next</code>.
657: */
658: public void remove() throws IllegalStateException {
659: // if current element is not set
660: if (_currentElem == null) {
661: throw new IllegalStateException();
662: }
663:
664: // remove current element
665: TokenizerProperty prop = _currentElem._property;
666:
667: _currentElem = null;
668: _parent.searchString(prop.getImages()[0], true);
669: }
670:
671: // members
672: private SequenceStore _parent = null;
673: private int _type = Token.UNKNOWN;
674: private Iterator _mapIterator = null;
675: private int _currentIndex = -1;
676: private PropertyList _currentList = null;
677: private PropertyList _currentElem = null;
678: }
679:
680: //---------------------------------------------------------------------------
681: // Members
682: //
683: private PropertyList[] _asciiArray;
684: private TreeMap _nonASCIIMap = null;
685: private int _maxLength;
686: private boolean _useExactLength;
687: }
|