001: /*
002: * PatternMatcher.java: Implementation of the PatternHandler interface based on java.util.regex.
003: *
004: * Copyright (C) 2003 Heiko Blau
005: *
006: * This file belongs to the JTopas Library.
007: * JTopas is free software; you can redistribute it and/or modify it
008: * under the terms of the GNU Lesser General Public License as published by the
009: * Free Software Foundation; either version 2.1 of the License, or (at your
010: * option) any later version.
011: *
012: * This software is distributed in the hope that it will be useful, but WITHOUT
013: * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
014: * FITNESS FOR A PARTICULAR PURPOSE.
015: * See the GNU Lesser General Public License for more details.
016: *
017: * You should have received a copy of the GNU Lesser General Public License along
018: * with JTopas. If not, write to the
019: *
020: * Free Software Foundation, Inc.
021: * 59 Temple Place, Suite 330,
022: * Boston, MA 02111-1307
023: * USA
024: *
025: * or check the Internet: http://www.fsf.org
026: *
027: * Contact:
028: * email: heiko@susebox.de
029: */
030:
031: package de.susebox.jtopas.impl;
032:
033: //-----------------------------------------------------------------------------
034: // Imports
035: //
036: import java.util.regex.Pattern;
037: import java.util.regex.Matcher;
038: import java.util.regex.PatternSyntaxException;
039:
040: import de.susebox.jtopas.TokenizerProperty;
041: import de.susebox.jtopas.Flags;
042: import de.susebox.jtopas.TokenizerException;
043:
044: import de.susebox.jtopas.spi.PatternHandler;
045: import de.susebox.jtopas.spi.DataProvider;
046:
047: //-----------------------------------------------------------------------------
048: // Class PatternMatcher
049: //
050:
051: /**<p>
052: * Implementation of the {@link PatternHandler} interface using the JDK 1.4
053: * package <code>java.util.regex</code>.
054: *</p>
055: *
056: * @author Heiko Blau
057: */
058: public class PatternMatcher implements PatternHandler {
059:
060: //---------------------------------------------------------------------------
061: // Constructors
062: //
063:
064: /**
065: * The constructor takes a pattern and the {@link TokenizerProperty} object
066: * associated with this instance of <code>PatternMatcher</code>. The global
067: * flags are passed to control the behaviour for attributes that are not
068: * specified in the property itself (e.g. case-sensitivity).
069: *
070: * @param prop the {@link TokenizerProperty} associated with this object
071: * @param globalFlags flags that are to be used if not set explicitely in the property
072: * @throws NullPointerException if the given parameter is <code>null</code>
073: */
074: public PatternMatcher(TokenizerProperty prop, int globalFlags)
075: throws NullPointerException {
076: _globalFlags = globalFlags;
077: setProperty(prop);
078: }
079:
080: //---------------------------------------------------------------------------
081: // Methods of the PatternHandler interface
082: //
083:
084: /**
085: * The method is a dummy implementation for the interface {@link PatternHandler}
086: * and always returns <code>true</code>.
087: *
088: * @return always <code>true</code>
089: */
090: public boolean hasPattern() {
091: return true;
092: }
093:
094: /**
095: * This method checks if the start of a character range given through the
096: * {@link DataProvider} matches a pattern. See {@link PatternHandler#matches}
097: * for details.
098: *
099: * @param dataProvider the source to get the data from
100: * @param freePatternOnly if <code>true</code> only unbounded pattern should be
101: * checked (pattern not enclosed in whitespaces, separators etc.)
102: * @return a {@link PatternHandler.Result} object or <code>null</code> if no
103: * match was found
104: * @throws TokenizerException generic exception
105: * @throws NullPointerException if no {@link DataProvider} is given
106: */
107: public PatternHandler.Result matches(DataProvider dataProvider)
108: throws TokenizerException, NullPointerException {
109: // invoke JDK 1.4 or jakarta regexp API
110: try {
111: String[] groups;
112:
113: _matcher.reset(new DataProviderCharSequence(dataProvider));
114: if (_matcher.lookingAt()) {
115: if (_property
116: .isFlagSet(
117: Flags.F_RETURN_IMAGE_PARTS,
118: (_globalFlags & Flags.F_RETURN_IMAGE_PARTS) != 0)) {
119: // get the capturing groups
120: groups = new String[_matcher.groupCount() + 1];
121: for (int index = 0; index < groups.length; ++index) {
122: groups[index] = _matcher.group(index);
123: }
124: } else {
125: groups = new String[] {};
126: }
127: return new LocalResult(_property, _matcher.end(),
128: groups);
129: } else {
130: return null;
131: }
132: } catch (Exception ex) {
133: throw new TokenizerException(ex);
134: }
135: }
136:
137: //---------------------------------------------------------------------------
138: // Methods
139: //
140:
141: /**
142: * Setting the {@link TokenizerProperty} for this <code>PatternMatcher</code>.
143: * This method will recompile the regular expression pattern.
144: *
145: * @param prop the {@link TokenizerProperty} associated with this object
146: * @throws NullPointerException if the given parameter is <code>null</code>
147: */
148: public void setProperty(TokenizerProperty prop)
149: throws NullPointerException {
150: // no pattern given
151: if (prop == null) {
152: throw new NullPointerException("No property given.");
153: } else if (prop.getImages() == null
154: || prop.getImages().length < 1
155: || prop.getImages()[0] == null) {
156: throw new NullPointerException(
157: "Property contains no pattern image.");
158: }
159:
160: // compile the pattern
161: int flags = Pattern.MULTILINE | Pattern.DOTALL;
162:
163: if (prop.isFlagSet(Flags.F_NO_CASE,
164: (_globalFlags & Flags.F_NO_CASE) != 0)) {
165: flags |= Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE;
166: }
167: _matcher = Pattern.compile(prop.getImages()[0], flags).matcher(
168: "");
169:
170: // set property
171: _property = prop;
172: }
173:
174: /**
175: * Retrieving the {@link TokenizerProperty} of this <code>PatternMatcher</code>.
176: *
177: * @return the {@link TokenizerProperty} associated with this object
178: */
179: public TokenizerProperty getProperty() {
180: return _property;
181: }
182:
183: //---------------------------------------------------------------------------
184: // Inner Classes
185: //
186:
187: /**
188: * The result of a match operation.
189: */
190: private final class LocalResult implements PatternHandler.Result {
191:
192: /**
193: * The constructor gets all the nessecary parameters.
194: *
195: * @param prop the pattern property
196: * @param lengthOfMatch the detected number of characters that match the pattern
197: * @param groups array with the capturing groups
198: */
199: protected LocalResult(TokenizerProperty prop,
200: int lengthOfMatch, String[] groups) {
201: _property = prop;
202: _lengthOfMatch = lengthOfMatch;
203: _groups = groups;
204: }
205:
206: /**
207: * Returns the capturing groups of a match.
208: *
209: * @return the capturing groups of the last pattern match in {@link #matches}.
210: */
211: public String[] getGroups() throws TokenizerException {
212: return _groups;
213: }
214:
215: /**
216: * Returns the number of characters that are part of a match.
217: *
218: * @return length of match
219: */
220: public int getLengthOfMatch() {
221: return _lengthOfMatch;
222: }
223:
224: /**
225: * Returns the {@link TokenizerProperty} that describes the pattern that
226: * matches data passed to {@link PatternHandler#matches}.
227: *
228: * @return the pattern property of a successful match
229: */
230: public TokenizerProperty getProperty() {
231: return _property;
232: }
233:
234: // member
235: private TokenizerProperty _property;
236: private int _lengthOfMatch;
237: private String[] _groups;
238: }
239:
240: /**
241: * An implementation of the JDK 1.4 {@link java.lang.CharSequence} interface
242: * backed by a {@link DataProvider}.
243: */
244: private final class DataProviderCharSequence implements
245: CharSequence {
246:
247: /**
248: * The constructor takes the reference to the {@link DataProvider}.
249: *
250: * @param dataProvider the backing <code>DataProvider</code>
251: */
252: public DataProviderCharSequence(DataProvider dataProvider) {
253: this (dataProvider, dataProvider.getStartPosition(),
254: dataProvider.getLength());
255: }
256:
257: /**
258: * The constructor takes the reference to the {@link DataProvider}, the
259: * start position and length. It is nessecary for the {@link #subSequence}
260: * method
261: *
262: * @param dataProvider the backing <code>DataProvider</code>
263: */
264: private DataProviderCharSequence(DataProvider dataProvider,
265: int start, int length) {
266: _dataProvider = dataProvider;
267: _start = start;
268: _length = length;
269: }
270:
271: /**
272: * Returns the character at the specified index. An index ranges from zero
273: * to <code>length() - 1</code>. The first character of the sequence is at
274: * index zero, the next at index one, and so on, as for array
275: * indexing. </p>
276: *
277: * @param index the index of the character to be returned
278: * @return the specified character
279: * @throws ArrayIndexOutOfBoundsException
280: * if the <code>index</code> argument is negative or not less than
281: * <code>length()</code>
282: */
283: public char charAt(int index)
284: throws ArrayIndexOutOfBoundsException {
285: return _dataProvider.getCharAt(_start + index
286: - _dataProvider.getStartPosition());
287: }
288:
289: /** Returns the length of this character sequence. The length is the number
290: * of 16-bit Unicode characters in the sequence. </p>
291: *
292: * @return the number of characters in this sequence
293: *
294: */
295: public int length() {
296: return _length;
297: }
298:
299: /**
300: * Returns a new character sequence that is a subsequence of this sequence.
301: * See {@link java.lang.CharSequence#subSequence} for details.
302: *
303: * @param start the start index, inclusive
304: * @param end the end index, exclusive
305: * @return the specified subsequence
306: * @throws IndexOutOfBoundsException
307: * if <code>start</code> or <code>end</code> are negative,
308: * if <code>end</code> is greater than <code>length()</code>,
309: * or if <code>start</code> is greater than <code>end</code>
310: */
311: public CharSequence subSequence(int start, int end) {
312: if (start < 0 || end < 0 || end > length() || start > end) {
313: throw new IndexOutOfBoundsException();
314: }
315: return new DataProviderCharSequence(_dataProvider, _start
316: + start, end - start);
317: }
318:
319: /**
320: * Returns the string representation for the <code>DataProvider</code>.
321: *
322: * @return the string consisting of all available data in the DataProvider.
323: */
324: public String toString() {
325: int realStart = _start - _dataProvider.getStartPosition();
326:
327: return _dataProvider.toString().substring(realStart,
328: realStart + _length);
329: }
330:
331: // members
332: private DataProvider _dataProvider = null;
333: private int _start = 0;
334: private int _length = 0;
335: }
336:
337: //---------------------------------------------------------------------------
338: // Members
339: //
340: private TokenizerProperty _property = null;
341: private Matcher _matcher = null;
342: private int _globalFlags = 0;
343: }
|