001: /*
002: * StandardWhitespaceHandler.java: default implementation of WhitespaceHandler
003: *
004: * Copyright (C) 2002 Heiko Blau
005: *
006: * This file belongs to the JTopas Library.
007: * JTopas is free software; you can redistribute it and/or modify it
008: * under the terms of the GNU Lesser General Public License as published by the
009: * Free Software Foundation; either version 2.1 of the License, or (at your
010: * option) any later version.
011: *
012: * This software is distributed in the hope that it will be useful, but WITHOUT
013: * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
014: * FITNESS FOR A PARTICULAR PURPOSE.
015: * See the GNU Lesser General Public License for more details.
016: *
017: * You should have received a copy of the GNU Lesser General Public License along
018: * with JTopas. If not, write to the
019: *
020: * Free Software Foundation, Inc.
021: * 59 Temple Place, Suite 330,
022: * Boston, MA 02111-1307
023: * USA
024: *
025: * or check the Internet: http://www.fsf.org
026: *
027: * Contact:
028: * email: heiko@susebox.de
029: */
030:
031: package de.susebox.jtopas.spi;
032:
033: //-----------------------------------------------------------------------------
034: // Imports
035: //
036: import de.susebox.jtopas.TokenizerProperties;
037: import de.susebox.jtopas.TokenizerException;
038: import de.susebox.jtopas.Flags;
039:
040: //-----------------------------------------------------------------------------
041: // class StandardWhitespaceHandler
042: //
043:
044: /**<p>
045: * Simple implementation of the {@link WhitespaceHandler} interface. This class
046: * works only with the {@link de.susebox.jtopas.TokenizerProperties} interface
047: * methods and is aware of changes in these properties. It does not cache any
048: * information and is therefore a more or less slow way to handle whitespaces.
049: *</p><p>
050: * This class is a bridge between arbitrary {@link de.susebox.jtopas.Tokenizer}
051: * implementations using the SPI interface {@link WhitespaceHandler} and any
052: * {@link de.susebox.jtopas.TokenizerProperties} implementation that does not
053: * implement the <code>WhitespaceHandler</code> interface itself.
054: *</p>
055: *
056: * @see WhitespaceHandler
057: * @see de.susebox.jtopas.Tokenizer
058: * @see de.susebox.jtopas.TokenizerProperties
059: * @author Heiko Blau
060: */
061: public class StandardWhitespaceHandler implements WhitespaceHandler {
062:
063: /**
064: * The constructor takes the {@link de.susebox.jtopas.TokenizerProperties}
065: * that provide the whitespaces.
066: *
067: * @param props the {@link de.susebox.jtopas.TokenizerProperties} to take the
068: * whitespaces from
069: */
070: public StandardWhitespaceHandler(TokenizerProperties props) {
071: _properties = props;
072: }
073:
074: /**
075: * This method checks if the given character is a whitespace.
076: *
077: * @param testChar check this character
078: * @return <code>true</code> if the given character is a whitespace,
079: * <code>false</code> otherwise
080: */
081: public boolean isWhitespace(char testChar) {
082: String whitespaces;
083:
084: if (_properties != null
085: && (whitespaces = _properties.getWhitespaces()) != null) {
086: if (_properties.isFlagSet(Flags.F_NO_CASE)) {
087: return whitespaces.toLowerCase().indexOf(
088: Character.toLowerCase(testChar)) >= 0;
089: } else {
090: return whitespaces.indexOf(testChar) >= 0;
091: }
092: } else {
093: return false;
094: }
095: }
096:
097: /**
098: * This method detects the number of whitespace characters the data range given
099: * through the {@link DataProvider} parameter starts with.
100: *
101: * @param dataProvider the source to get the data range from
102: * @return number of whitespace characters starting from the given offset
103: * @throws NullPointerException if no {@link DataProvider} is given
104: * @see DataProvider
105: */
106: public int countLeadingWhitespaces(DataProvider dataProvider)
107: throws NullPointerException {
108: int len = 0;
109: int maxChars = dataProvider.getLength();
110:
111: while (len < maxChars
112: && isWhitespace(dataProvider.getCharAt(len))) {
113: len++;
114: }
115: return len;
116: }
117:
118: /**
119: * If a {@link de.susebox.jtopas.Tokenizer} performs line counting, it is often
120: * nessecary to know if newline characters is considered to be a whitespace.
121: * See {@link WhitespaceHandler} for details.
122: *
123: * @return <code>true</code> if newline characters are in the current whitespace set,
124: * <code>false</code> otherwise
125: *
126: */
127: public boolean newlineIsWhitespace() {
128: String whitespaces;
129: boolean isWhitespace;
130:
131: if (_properties != null
132: && (whitespaces = _properties.getWhitespaces()) != null) {
133: return newlineIsWhitespace(whitespaces);
134: } else {
135: return false;
136: }
137: }
138:
139: //---------------------------------------------------------------------------
140: // Implementation
141: //
142:
143: /**
144: * Check a set that may contain ranges
145: *
146: * @param set the whitespace set
147: */
148: private boolean newlineIsWhitespace(String set) {
149: int len = (set != null) ? set.length() : 0;
150: char start, end, setChar;
151: boolean crFound = false;
152: boolean lfFound = false;
153:
154: for (int ii = 0; ii < len; ++ii) {
155: switch (setChar = set.charAt(ii)) {
156: case '-':
157: start = (ii > 0) ? set.charAt(ii - 1) : 0;
158: end = (ii < len - 1) ? set.charAt(ii + 1) : 0xFFFF;
159: if ('\n' >= start && '\n' <= end) {
160: lfFound = true;
161: }
162: if ('\r' >= start && '\r' <= end) {
163: crFound = true;
164: }
165: ii += 2;
166: break;
167:
168: case '\r':
169: crFound = true;
170: break;
171:
172: case '\n':
173: lfFound = true;
174: break;
175:
176: case '\\':
177: ii++;
178: break;
179: }
180:
181: // both characters found ?
182: if (crFound && lfFound) {
183: return true;
184: }
185: }
186:
187: // not found
188: return false;
189: }
190:
191: //---------------------------------------------------------------------------
192: // Members
193: //
194:
195: /**
196: * The {@link TokenizerProperties} that provide the whitespaces and the
197: * control flags.
198: */
199: private TokenizerProperties _properties = null;
200: }
|