001: package it.unimi.dsi.mg4j.query;
002:
003: /*
004: * MG4J: Managing Gigabytes for Java
005: *
006: * Copyright (C) 2005-2007 Sebastiano Vigna
007: *
008: * This library is free software; you can redistribute it and/or modify it
009: * under the terms of the GNU Lesser General Public License as published by the Free
010: * Software Foundation; either version 2.1 of the License, or (at your option)
011: * any later version.
012: *
013: * This library is distributed in the hope that it will be useful, but
014: * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
015: * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
016: * for more details.
017: *
018: * You should have received a copy of the GNU Lesser General Public License
019: * along with this program; if not, write to the Free Software
020: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
021: *
022: */
023:
024: import it.unimi.dsi.io.FastBufferedReader;
025: import it.unimi.dsi.io.WordReader;
026: import it.unimi.dsi.lang.MutableString;
027:
028: import java.io.IOException;
029:
030: // ALERT: this class desperately needs to be documented.
031:
032: /** A mutable string with a special method to append text that should be marked.
033: *
034: * <p>A marking mutable string can mark several <em>fields</em> (which will often correspond to indexed fields).
035: * Each time you {@linkplain #startField(SelectedInterval[]) start a field}, you pass some intervals to be marked. Then,
036: * you call {@link #appendAndMark(WordReader)}, which will add words and nonwords coming from the provided
037: * {@link it.unimi.dsi.io.WordReader}, marking as suggested by the interval set. The number of words
038: * around each interval can be set in the constructor. When a field is finished, you must call {@link #endField()}.
039: */
040: public class MarkingMutableString extends MutableString {
041: private static final long serialVersionUID = 1L;
042:
043: /** The default number of words before and after each interval. */
044: public final static int DEFAULT_INTERVAL_SURROUND = 8;
045:
046: public boolean resume = true;
047: public boolean marking;
048: /** The current set of intervals for marking. */
049: private SelectedInterval[] interval;
050: private int count;
051: private int currMarkingInterval, currResumeInterval;
052: private boolean skipping;
053: private boolean oneCharOut;
054:
055: private final Marker marker;
056: private final EscapeStrategy escapeStrategy;
057:
058: /** An escaping strategy. Such a strategy is used by a {@link MarkingMutableString} to escape
059: * strings passed to the {@link MarkingMutableString#appendAndMark(WordReader)} method. */
060:
061: public interface EscapeStrategy {
062: public MutableString escape(MutableString s);
063: };
064:
065: private static final char[] HTML_ESCAPE_CHAR = new char[] { '<',
066: '&' };
067: private static final String[] HTML_ESCAPE_STRING = new String[] {
068: "<", "&" };
069:
070: /** A singleton for the strategy that escapes HTML. */
071:
072: private static final class HtmlEscape implements EscapeStrategy {
073: private HtmlEscape() {
074: }
075:
076: public MutableString escape(final MutableString s) {
077: return s.replace(HTML_ESCAPE_CHAR, HTML_ESCAPE_STRING);
078: }
079: }
080:
081: /** A singleton for the null escape strategy (which does nothing). */
082:
083: public static final EscapeStrategy NULL_ESCAPE = new NullEscape();
084:
085: private static final class NullEscape implements EscapeStrategy {
086: private NullEscape() {
087: }
088:
089: public MutableString escape(final MutableString s) {
090: return s;
091: }
092: }
093:
094: /** A singleton for the HTML escape strategy. */
095:
096: public static final EscapeStrategy HTML_ESCAPE = new HtmlEscape();
097: /** The number of surrounding word around each interval. */
098: private final int intervalSurround;
099:
100: /** Creates a new loose empty marking mutable string.
101: *
102: * @param marker a marker that will decide how to highlight intervals.
103: * @param escapeStrategy the escape strategy for strings passed to {@link #appendAndMark(WordReader)}, or <code>null</code>.
104: * @param intervalSurround the number of words printed before and after each interval.
105: */
106: public MarkingMutableString(final Marker marker,
107: final EscapeStrategy escapeStrategy,
108: final int intervalSurround) {
109: this .marker = marker;
110: this .escapeStrategy = escapeStrategy;
111: this .intervalSurround = intervalSurround;
112: }
113:
114: /** Creates a new loose empty marking mutable string default interval surround.
115: *
116: * @param marker a marker that will decide how to highlight intervals.
117: * @param escapeStrategy the escape strategy for strings passed to {@link #appendAndMark(WordReader)}, or <code>null</code>.
118: */
119: public MarkingMutableString(final Marker marker,
120: final EscapeStrategy escapeStrategy) {
121: this (marker, escapeStrategy, DEFAULT_INTERVAL_SURROUND);
122: }
123:
124: /** Creates a new loose empty marking mutable string with default interval surround,
125: * no escaping strategy and no term processor.
126: *
127: * @param marker a marker that will decide how to highlight intervals.
128: */
129: public MarkingMutableString(final Marker marker) {
130: this (marker, NULL_ESCAPE);
131: }
132:
133: /** Prepares this marking mutable string for a new field. We append
134: * {@link TextMarker#startOfField()},
135: * the interval marking state is reset and the intervals for marking are set to <code>interval</code>.
136: *
137: * @param interval the new selected-interval array for marking.
138: */
139:
140: public MarkingMutableString startField(
141: final SelectedInterval[] interval) {
142: if (interval == null)
143: throw new IllegalArgumentException();
144: count = -1;
145: currResumeInterval = currMarkingInterval = 0;
146: skipping = oneCharOut = marking = false;
147: this .interval = interval;
148: append(marker.startOfField());
149: return this ;
150: }
151:
152: /** Closes the current field. The value of {@link TextMarker#startOfField()} is appended to the string.
153: */
154: public MarkingMutableString endField() {
155: append(marker.endOfField());
156: return this ;
157: }
158:
159: private int leftRadius(int currResumeInterval) {
160: switch (interval[currResumeInterval].type) {
161: case WHOLE:
162: return intervalSurround;
163: case PREFIX:
164: return intervalSurround;
165: case SUFFIX:
166: return 0;
167: default:
168: throw new IllegalArgumentException();
169: }
170: }
171:
172: private int rightRadius(int currResumeInterval) {
173: switch (interval[currResumeInterval].type) {
174: case WHOLE:
175: return intervalSurround;
176: case PREFIX:
177: return 0;
178: case SUFFIX:
179: return intervalSurround;
180: default:
181: throw new IllegalArgumentException();
182: }
183: }
184:
185: public MarkingMutableString appendAndMark(final String s) {
186: return appendAndMark(new MutableString(s));
187: }
188:
189: public MarkingMutableString appendAndMark(final MutableString s) {
190: return appendAndMark(new FastBufferedReader(s));
191: }
192:
193: public MarkingMutableString appendAndMark(
194: final WordReader wordReader) {
195: //System.err.println( interval[ currInterval ] + "|" + new String( array, offset, length ) );
196:
197: MutableString word = new MutableString(), nonWord = new MutableString();
198: try {
199: while (wordReader.next(word, nonWord)) {
200: if (word.length() != 0)
201: count++;
202:
203: if (resume) {
204: while (currResumeInterval < interval.length
205: && interval[currResumeInterval].interval
206: .compareTo(
207: count,
208: leftRadius(currResumeInterval),
209: rightRadius(currResumeInterval)) > 0)
210: currResumeInterval++;
211: if (currResumeInterval == interval.length
212: || !interval[currResumeInterval].interval
213: .contains(
214: count,
215: leftRadius(currResumeInterval),
216: rightRadius(currResumeInterval))) {
217: if (!skipping && oneCharOut)
218: append(marker.endOfBlock());
219: // There's nothing else we can do...
220: if (resume
221: && currResumeInterval == interval.length)
222: return this ;
223: // Otherwise, we continue, but skipping.
224: skipping = true;
225: continue;
226: }
227:
228: if (skipping)
229: append(marker.startOfBlock());
230: skipping = false;
231: }
232:
233: if (word.length() != 0) {
234: if (!marking
235: && currMarkingInterval < interval.length
236: && interval[currMarkingInterval].interval
237: .contains(count)) {
238: append(marker.startOfMark());
239: marking = true;
240: }
241:
242: append(word);
243:
244: if (marking
245: && (currMarkingInterval == interval.length || !interval[currMarkingInterval].interval
246: .contains(count + 1))) {
247: append(marker.endOfMark());
248: marking = false;
249: }
250:
251: oneCharOut = true;
252: if (currMarkingInterval < interval.length
253: && interval[currMarkingInterval].interval
254: .compareTo(count + 1) > 0)
255: currMarkingInterval++;
256: }
257:
258: if (nonWord.length() > 0) {
259: oneCharOut = true;
260: nonWord.squeezeWhitespace();
261: append(escapeStrategy.escape(nonWord));
262: }
263: }
264:
265: if (marking)
266: append(marker.endOfMark());
267: } catch (IOException e) {
268: throw new RuntimeException(e);
269: }
270:
271: return this;
272: }
273: }
|