001: /*
002: * PatternHandler.java: Interface for pattern-aware tokenizers.
003: *
004: * Copyright (C) 2002 Heiko Blau
005: *
006: * This file belongs to the JTopas Library.
007: * JTopas is free software; you can redistribute it and/or modify it
008: * under the terms of the GNU Lesser General Public License as published by the
009: * Free Software Foundation; either version 2.1 of the License, or (at your
010: * option) any later version.
011: *
012: * This software is distributed in the hope that it will be useful, but WITHOUT
013: * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
014: * FITNESS FOR A PARTICULAR PURPOSE.
015: * See the GNU Lesser General Public License for more details.
016: *
017: * You should have received a copy of the GNU Lesser General Public License along
018: * with JTopas. If not, write to the
019: *
020: * Free Software Foundation, Inc.
021: * 59 Temple Place, Suite 330,
022: * Boston, MA 02111-1307
023: * USA
024: *
025: * or check the Internet: http://www.fsf.org
026: *
027: * Contact:
028: * email: heiko@susebox.de
029: */
030:
031: package de.susebox.jtopas.spi;
032:
033: //-----------------------------------------------------------------------------
034: // Imports
035: //
036: import de.susebox.jtopas.TokenizerProperty;
037: import de.susebox.jtopas.TokenizerException;
038:
039: //-----------------------------------------------------------------------------
040: // Interface PatternHandler
041: //
042:
043: /**<p>
044: * This interface must be implemented by classes that should be used as a
045: * pattern handler for a {@link de.susebox.jtopas.Tokenizer}. Pattern are usually
046: * regular expressions that are applied on token images to check if that image
047: * matches the pattern.
048: *</p>
049: *
050: * @see de.susebox.jtopas.Tokenizer
051: * @see de.susebox.jtopas.TokenizerProperties
052: * @see de.susebox.jtopas.spi.DataMapper
053: * @author Heiko Blau
054: */
055: public interface PatternHandler {
056:
057: /**
058: * This method can be used by a {@link de.susebox.jtopas.Tokenizer} implementation
059: * for a fast detection if pattern matching must be performed at all. If the method
060: * returns <code>false</code> time-consuming preparations can be skipped.
061: *
062: * @return <code>true</code> if there actually are pattern that can be tested
063: * for a match, <code>false</code> otherwise.
064: */
065: public boolean hasPattern();
066:
067: /**
068: * This method checks if the start of a character range given through the
069: * {@link DataProvider} matches a pattern. An implementation should use
070: * a {@link de.susebox.jtopas.TokenizerException} to report problems.
071: *<br>
072: * The method returns <code>null</code> if the beginning of the character range
073: * doesn't match a pattern known to the <code>PatternHandler</code>. Otherwise
074: * it returns an object with the implemented interface {@link PatternHandler.Result}.
075: *<br>
076: * The pattern check is repeated if the method returns a match that is exactly
077: * as long as the given data range and more data is available. Since it is
078: * probably a rare case, that where are not enough data to find a complete or
079: * no match, the overhead of a repeated check on part of the data is neglected.
080: *<br>
081: * If a pattern handler has more than one pattern that could be applied to the
082: * given data, it should return the longest possible match.
083: *
084: * @param dataProvider the source to get the data from
085: * @param lengthOfMatch if a match is found, the method places the length of
086: * it into the first element of this array
087: * @return a {@link PatternHandler.Result} object or <code>null</code> if no
088: * match was found
089: * @throws TokenizerException generic exception
090: * @throws NullPointerException if no {@link DataProvider} is given
091: */
092: public PatternHandler.Result matches(DataProvider dataProvider)
093: throws TokenizerException, NullPointerException;
094:
095: //---------------------------------------------------------------------------
096: // Inner Interfaces
097: //
098:
099: /**
100: * An inner interface for the pattern match result.
101: */
102: public static interface Result {
103:
104: /**
105: * Returns the {@link TokenizerProperty} that describes the pattern that
106: * matches data passed to {@link PatternHandler#matches}. The returned value
107: * is <strong>not</strong> <code>null</code>.
108: *
109: * @return the pattern property of a successful match
110: */
111: public TokenizerProperty getProperty();
112:
113: /**
114: * Returns the number of characters that are part of a match.
115: *
116: * @return length of match
117: */
118: public int getLengthOfMatch();
119:
120: /**
121: * Returns the capturing groups of a match. It is used if the calling tokenizer
122: * needs these groups (e. g. if the flag {@link de.susebox.jtopas.TokenizerProperties#F_RETURN_IMAGE_PARTS}
123: * is set).
124: *<br>
125: * The return value must not be null or empty. The first element (array index 0)
126: * must contain the whole pattern match (as described in the Java 1.4
127: * documentation for {@link java.util.regex.Matcher} or the newer Java 1.5
128: * {@link java.util.regex.MatchResult}).
129: *
130: * @return the capturing groups of the last pattern match in {@link #matches}.
131: */
132: public String[] getGroups() throws TokenizerException;
133: }
134: }
|