001: package it.unimi.dsi.mg4j.util.parser.callback;
002:
003: /*
004: * MG4J: Managing Gigabytes for Java
005: *
006: * Copyright (C) 2006-2007 Paolo Boldi
007: *
008: * This library is free software; you can redistribute it and/or modify it
009: * under the terms of the GNU Lesser General Public License as published by the Free
010: * Software Foundation; either version 2.1 of the License, or (at your option)
011: * any later version.
012: *
013: * This library is distributed in the hope that it will be useful, but
014: * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
015: * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
016: * for more details.
017: *
018: * You should have received a copy of the GNU Lesser General Public License
019: * along with this program; if not, write to the Free Software
020: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
021: *
022: */
023:
024: import it.unimi.dsi.fastutil.objects.ObjectArrayList;
025: import it.unimi.dsi.fastutil.objects.ObjectList;
026: import it.unimi.dsi.mg4j.tool.Scan.VirtualDocumentFragment;
027: import it.unimi.dsi.lang.MutableString;
028: import it.unimi.dsi.parser.Attribute;
029: import it.unimi.dsi.parser.BulletParser;
030: import it.unimi.dsi.parser.Element;
031: import it.unimi.dsi.parser.callback.DefaultCallback;
032: import it.unimi.dsi.util.CircularCharArrayBuffer;
033:
034: import java.util.Map;
035:
036: import org.apache.log4j.Logger;
037:
038: /** A callback extracting anchor text. When instantiating the extractor, you can specify the number of characters to
039: * be considered before the anchor, after the anchor or during the anchor (just the first characters are taken into
040: * consideration in the last two characters, and just the last ones in the first case).
041: *
042: * <p>At the end of parsing, the result (the list of anchors) is available in {@link #anchors}, whose
043: * elements provide the content of the <samp>href</samp> attribute
044: * the text of the anchor and around the anchor; text is however modified so that fragment of words at the beginning
045: * of the pre-anchor context, or at the end of the post-anchor context, are cut away.
046: *
047: * <p>For example, a fragment like:
048: *
049: * <code>
050: * ...foo fOO FOO FOO <a href="xxx">ANCHOR TEXT</a> BAR BAR BAr bar...
051: * </code>
052: *
053: * (where the uppercase part represents the pre- and post-anchor context) generates the element
054: *
055: * <code>
056: * Anchor("xxx", "FOO FOO ANCHOR TEXT BAR BAR")
057: * </code>
058: */
059:
060: public class AnchorExtractor extends DefaultCallback {
061:
062: /** A class representing an anchor. It is used to return the results of parsing.
063: *
064: */
065: public final static class Anchor implements VirtualDocumentFragment {
066: private static final long serialVersionUID = 1L;
067: /** The content of the <samp>href</samp> attribute for this anchor. */
068: private final MutableString href;
069: /** The text surrounding this anchor. */
070: private final MutableString anchorText;
071:
072: public Anchor(final MutableString href,
073: final MutableString anchorText) {
074: this .href = href;
075: this .anchorText = anchorText;
076: }
077:
078: public MutableString documentSpecifier() {
079: return href;
080: }
081:
082: public MutableString text() {
083: return anchorText;
084: }
085:
086: public String toString() {
087: return "<" + href + ", \"" + anchorText + "\">";
088: }
089: }
090:
091: public static final Logger LOGGER = Logger
092: .getLogger(AnchorExtractor.class);
093: public static final boolean DEBUG = false;
094:
095: /** The resulting list of {@linkplain Anchor anchors}. */
096: public final ObjectList<Anchor> anchors = new ObjectArrayList<Anchor>();
097:
098: /** The circular buffer for pre-anchor context. */
099: private final CircularCharArrayBuffer preAnchor;
100: /** The circular buffer for anchor. */
101: private final MutableString anchor;
102: /** The maximum number of characters in the anchor. */
103: private final int maxAnchor;
104: /** The maximum number of characters after anchor. */
105: private final int maxAfter;
106: /** The post-anchor. */
107: private final MutableString postAnchor;
108: /** The current URL (if state is IN_ANCHOR). */
109: private MutableString url;
110: /** The resulting string (pre+anchor+post). */
111: private MutableString result;
112: /** When an anchor opens, the pre-anchor buffer is copied in this array. */
113: private char[] preAnchorArray;
114:
115: private enum State {
116: BEFORE_ANCHOR, IN_ANCHOR, AFTER_ANCHOR
117: };
118:
119: private State state;
120:
121: /**
122: *
123: * @param maxBefore maximum number of words to be considered before of the anchor.
124: * @param maxAfter maximum number of words to be considered after the anchor.
125: */
126: public AnchorExtractor(int maxBefore, int maxAnchor, int maxAfter) {
127: preAnchor = new CircularCharArrayBuffer(maxBefore);
128: anchor = new MutableString(maxAnchor);
129: postAnchor = new MutableString(maxAfter);
130: result = new MutableString(maxBefore + maxAnchor + maxAfter);
131: this .maxAfter = maxAfter;
132: this .maxAnchor = maxAnchor;
133: state = State.BEFORE_ANCHOR;
134: }
135:
136: public void configure(final BulletParser parser) {
137: parser.parseTags(true);
138: parser.parseAttributes(true);
139: parser.parseText(true);
140: parser.parseAttribute(Attribute.HREF);
141: }
142:
143: public void startDocument() {
144: state = State.BEFORE_ANCHOR;
145: anchors.clear();
146: preAnchor.clear();
147: anchor.setLength(0);
148: postAnchor.setLength(0);
149: url = null;
150: }
151:
152: public void endDocument() {
153: if (url != null) {
154: emit();
155: }
156: url = null;
157: }
158:
159: public boolean startElement(final Element element,
160: final Map<Attribute, MutableString> attrMap) {
161: if (element == Element.A && attrMap != null
162: && attrMap.containsKey(Attribute.HREF)) {
163: if (state == State.AFTER_ANCHOR) {
164: emit();
165: state = State.BEFORE_ANCHOR;
166: }
167: if (state == State.BEFORE_ANCHOR) {
168: preAnchorArray = preAnchor.toCharArray();
169: preAnchor.clear();
170: if (DEBUG)
171: System.out.println("Freezing now pre: <"
172: + new String(preAnchorArray) + ">");
173: state = State.IN_ANCHOR;
174: url = attrMap.get(Attribute.HREF);
175: anchor.setLength(0);
176: postAnchor.setLength(0);
177: }
178: }
179: return true;
180: }
181:
182: public boolean endElement(final Element element) {
183: if (element == Element.A && state == State.IN_ANCHOR) {
184: state = State.AFTER_ANCHOR;
185: }
186: return true;
187: }
188:
189: public boolean characters(final char[] characters,
190: final int offset, final int length, final boolean flowBroken) {
191: switch (state) {
192: case BEFORE_ANCHOR:
193: preAnchor.add(characters, offset, length);
194: break;
195: case IN_ANCHOR:
196: anchor.append(characters, offset, Math.min(length,
197: maxAnchor - anchor.length()));
198: break;
199: case AFTER_ANCHOR:
200: preAnchor.add(characters, offset, length);
201: postAnchor.append(characters, offset, Math.min(length,
202: maxAfter - postAnchor.length()));
203: break;
204: }
205: if (state == State.AFTER_ANCHOR
206: && postAnchor.length() == maxAfter && url != null) {
207: emit();
208: state = State.BEFORE_ANCHOR;
209: }
210: return true;
211: }
212:
213: private void emit() {
214: int posPre, posPost, posAnchor;
215:
216: // Cut pre until the first start of word
217: posPre = 0;
218: if (preAnchorArray.length > 0
219: && Character.isLetterOrDigit(preAnchorArray[posPre]))
220: // Skip starting non-space
221: for (; posPre < preAnchorArray.length
222: && Character
223: .isLetterOrDigit(preAnchorArray[posPre]); posPre++)
224: ;
225: // Same for post
226: char[] postAnchorArray = postAnchor.array();
227: posPost = postAnchor.length() - 1;
228: if (posPost >= 0
229: && Character.isLetterOrDigit(postAnchorArray[posPost])) {
230: // Skip ending non-space
231: for (; posPost >= 0
232: && Character
233: .isLetterOrDigit(postAnchorArray[posPost]); posPost--)
234: ;
235: }
236: // Same for anchor
237: char[] anchorArray = anchor.array();
238: posAnchor = anchor.length() - 1;
239: if (anchor.length() == maxAnchor && posAnchor >= 0
240: && Character.isLetterOrDigit(anchorArray[posAnchor]))
241: // Skip starting non-space
242: for (; posAnchor >= 0
243: && Character
244: .isLetterOrDigit(anchorArray[posAnchor]); posAnchor--)
245: ;
246:
247: result.setLength(0);
248: result.append(preAnchorArray, posPre,
249: preAnchorArray.length - posPre).append(anchorArray, 0,
250: posAnchor + 1).append(postAnchorArray, 0, posPost + 1);
251: anchors.add(new Anchor(url, result.copy()));
252: url = null;
253: }
254: }
|