001: // Jericho HTML Parser - Java based library for analysing and manipulating HTML
002: // Version 2.5
003: // Copyright (C) 2007 Martin Jericho
004: // http://jerichohtml.sourceforge.net/
005: //
006: // This library is free software; you can redistribute it and/or
007: // modify it under the terms of either one of the following licences:
008: //
009: // 1. The Eclipse Public License (EPL) version 1.0,
010: // included in this distribution in the file licence-epl-1.0.html
011: // or available at http://www.eclipse.org/legal/epl-v10.html
012: //
013: // 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
014: // included in this distribution in the file licence-lgpl-2.1.txt
015: // or available at http://www.gnu.org/licenses/lgpl.txt
016: //
017: // This library is distributed on an "AS IS" basis,
018: // WITHOUT WARRANTY OF ANY KIND, either express or implied.
019: // See the individual licence texts for more details.
020:
021: package au.id.jericho.lib.html;
022:
023: import java.util.*;
024: import java.io.*;
025: import java.net.*;
026:
027: /**
028: * Extracts the textual content from HTML markup.
029: * <p>
030: * The output is ideal for feeding into a text search engine such as <a target="_blank" href="http://lucene.apache.org/java/">Apache Lucene</a>,
031: * especially when the {@link #setIncludeAttributes(boolean) IncludeAttributes} property has been set to <code>true</code>.
032: * <p>
033: * Use one of the following methods to obtain the output:
034: * <ul style="margin-top: 0">
035: * <li>{@link #writeTo(Writer)}</li>
036: * <li>{@link #toString()}</li>
037: * <li>{@link CharStreamSourceUtil#getReader(CharStreamSource) CharStreamSourceUtil.getReader(this)}</li>
038: * </ul>
039: * <p>
040: * The process removes all of the tags and
041: * {@linkplain CharacterReference#decodeCollapseWhiteSpace(CharSequence) decodes the result, collapsing all white space}.
042: * A space character is included in the output where a <a href="TagType.html#Normal">normal</a> tag is present in the source,
043: * unless the tag belongs to an {@linkplain HTMLElements#getInlineLevelElementNames() inline-level} element.
044: * An exception to this is the {@link HTMLElementName#BR BR} element, which is also converted to a space despite being an inline-level element.
045: * <p>
046: * Text inside {@link HTMLElementName#SCRIPT SCRIPT} and {@link HTMLElementName#STYLE STYLE} elements contained within this segment
047: * is ignored.
048: * <p>
049: * Setting the {@link #setExcludeNonHTMLElements(boolean) ExcludeNonHTMLElements} property results in the exclusion of any content within a
050: * <a href="HTMLElements.html#NonHTMLElement">non-HTML element</a>.
051: * <p>
052: * See the {@link #excludeElement(StartTag)} method for details on how to implement a more complex mechanism to determine whether the
053: * {@linkplain Element#getContent() content} of each {@link Element} is to be excluded from the output.
054: * <p>
055: * All tags that are not <a href="TagType.html#Normal">normal</a> tags, such as {@linkplain TagType#isServerTag() server tags},
056: * {@linkplain StartTagType#COMMENT comments} etc., are removed from the output without adding whitespace to the output.
057: * <p>
058: * Note that segments on which the {@link Segment#ignoreWhenParsing()} method has been called are treated as text rather than markup,
059: * resulting in their inclusion in the output.
060: * To remove specific segments before extracting the text, create an {@link OutputDocument} and call its {@link OutputDocument#remove(Segment) remove(Segment)} or
061: * {@link OutputDocument#replaceWithSpaces(int,int) replaceWithSpaces(int begin, int end)} method for each segment to be removed.
062: * Then create a new source document using {@link Source#Source(CharSequence) new Source(outputDocument.toString())}
063: * and perform the text extraction on this new source object.
064: * <p>
065: * Extracting the text from an entire {@link Source} object performs a {@linkplain Source#fullSequentialParse() full sequential parse} automatically.
066: * <p>
067: * To perform a simple rendering of HTML markup into text, which is more readable than the output of this class, use the {@link Renderer} class instead.
068: * <dl>
069: * <dt>Example:</dt>
070: * <dd>Using the default settings, the source segment:<br />
071: * "<code><div><b>O</b>ne</div><div title="Two"><b>Th</b><script>//a script </script>ree</div></code>"<br />
072: * produces the text "<code>One Two Three</code>".
073: * </dl>
074: */
075: public class TextExtractor implements CharStreamSource {
076: private final Segment segment;
077: private boolean convertNonBreakingSpaces = true;
078: private boolean includeAttributes = false;
079: private boolean excludeNonHTMLElements = false;
080:
081: /**
082: * Constructs a new <code>TextExtractor</code> based on the specified {@link Segment}.
083: * @param segment the segment from which the text will be extracted.
084: * @see Segment#getTextExtractor()
085: */
086: public TextExtractor(final Segment segment) {
087: this .segment = segment;
088: }
089:
090: // Documentation inherited from CharStreamSource
091: public void writeTo(final Writer writer) throws IOException {
092: writer.write(toString());
093: writer.flush();
094: }
095:
096: // Documentation inherited from CharStreamSource
097: public long getEstimatedMaximumOutputLength() {
098: return segment.length();
099: }
100:
101: // Documentation inherited from CharStreamSource
102: public String toString() {
103: return new Processor(segment, getConvertNonBreakingSpaces(),
104: getIncludeAttributes(), getExcludeNonHTMLElements())
105: .toString();
106: }
107:
108: /**
109: * Sets whether non-breaking space ({@link CharacterEntityReference#_nbsp &nbsp;}) character entity references are converted to spaces.
110: * <p>
111: * The default value is <code>true</code>.
112: *
113: * @param convertNonBreakingSpaces specifies whether non-breaking space ({@link CharacterEntityReference#_nbsp &nbsp;}) character entity references are converted to spaces.
114: * @return this <code>TextExtractor</code> instance, allowing multiple property setting methods to be chained in a single statement.
115: * @see #getConvertNonBreakingSpaces()
116: */
117: public TextExtractor setConvertNonBreakingSpaces(
118: boolean convertNonBreakingSpaces) {
119: this .convertNonBreakingSpaces = convertNonBreakingSpaces;
120: return this ;
121: }
122:
123: /**
124: * Indicates whether non-breaking space ({@link CharacterEntityReference#_nbsp &nbsp;}) character entity references are converted to spaces.
125: * <p>
126: * See the {@link #setConvertNonBreakingSpaces(boolean)} method for a full description of this property.
127: *
128: * @return <code>true</code> if non-breaking space ({@link CharacterEntityReference#_nbsp &nbsp;}) character entity references are converted to spaces, otherwise <code>false</code>.
129: */
130: public boolean getConvertNonBreakingSpaces() {
131: return convertNonBreakingSpaces;
132: }
133:
134: /**
135: * Sets whether the values of
136: * <a target="_blank" href="http://www.w3.org/TR/html401/struct/global.html#adef-title">title</a>,
137: * <a target="_blank" href="http://www.w3.org/TR/html401/struct/objects.html#adef-alt">alt</a>,
138: * <a target="_blank" href="http://www.w3.org/TR/html401/interact/forms.html#adef-label-OPTION">label</a>,
139: * <a target="_blank" href="http://www.w3.org/TR/html401/struct/tables.html#adef-summary">summary</a>, and
140: * <a target="_blank" href="http://www.w3.org/TR/html401/struct/global.html#adef-content">content</a>
141: * attributes of {@linkplain StartTagType#NORMAL normal} tags are to be included in the output.
142: * <p>
143: * The value of a <a target="_blank" href="http://www.w3.org/TR/html401/struct/global.html#adef-content">content</a> attribute is
144: * only included if a <a target="_blank" href="http://www.w3.org/TR/html401/struct/global.html#adef-name-META">name</a> attribute is also present,
145: * as the content attribute of a {@link HTMLElementName#META META} tag only contains human readable text if the name attribute is used as opposed to an
146: * <a target="_blank" href="http://www.w3.org/TR/html401/struct/global.html#adef-http-equiv">http-equiv</a> attribute.
147: * <p>
148: * The default value is <code>false</code>.
149: *
150: * @param includeAttributes specifies whether the attribute values are included in the output.
151: * @return this <code>TextExtractor</code> instance, allowing multiple property setting methods to be chained in a single statement.
152: * @see #getIncludeAttributes()
153: */
154: public TextExtractor setIncludeAttributes(boolean includeAttributes) {
155: this .includeAttributes = includeAttributes;
156: return this ;
157: }
158:
159: /**
160: * Indicates whether the values of
161: * <a target="_blank" href="http://www.w3.org/TR/html401/struct/global.html#adef-title">title</a>,
162: * <a target="_blank" href="http://www.w3.org/TR/html401/struct/objects.html#adef-alt">alt</a>,
163: * <a target="_blank" href="http://www.w3.org/TR/html401/interact/forms.html#adef-label-OPTION">label</a>, and
164: * <a target="_blank" href="http://www.w3.org/TR/html401/struct/tables.html#adef-summary">summary</a>, and
165: * <a target="_blank" href="http://www.w3.org/TR/html401/struct/global.html#adef-content">content</a>
166: * attributes of {@linkplain StartTagType#NORMAL normal} tags are to be included in the output.
167: * <p>
168: * See the {@link #setIncludeAttributes(boolean)} method for a full description of this property.
169: *
170: * @return <code>true</code> if the attribute values are to be included in the output, otherwise <code>false</code>.
171: */
172: public boolean getIncludeAttributes() {
173: return includeAttributes;
174: }
175:
176: /**
177: * Sets whether the content of <a href="HTMLElements.html#NonHTMLElement">non-HTML elements</a> is excluded from the output.
178: * <p>
179: * The default value is <code>false</code>, meaning that content from all elements meeting the other criteria is included.
180: *
181: * @param excludeNonHTMLElements specifies whether content <a href="HTMLElements.html#NonHTMLElement">non-HTML elements</a> is excluded from the output.
182: * @return this <code>TextExtractor</code> instance, allowing multiple property setting methods to be chained in a single statement.
183: * @see #getExcludeNonHTMLElements()
184: */
185: public TextExtractor setExcludeNonHTMLElements(
186: boolean excludeNonHTMLElements) {
187: this .excludeNonHTMLElements = excludeNonHTMLElements;
188: return this ;
189: }
190:
191: /**
192: * Indicates whether the content of <a href="HTMLElements.html#NonHTMLElement">non-HTML elements</a> is excluded from the output.
193: * <p>
194: * See the {@link #setExcludeNonHTMLElements(boolean)} method for a full description of this property.
195: *
196: * @return <code>true</code> if the content of <a href="HTMLElements.html#NonHTMLElement">non-HTML elements</a> is excluded from the output, otherwise <code>false</code>.
197: */
198: public boolean getExcludeNonHTMLElements() {
199: return excludeNonHTMLElements;
200: }
201:
202: /**
203: * Indicates whether the text inside the {@link Element} of the specified start tag should be excluded from the output.
204: * <p>
205: * During the text extraction process, every start tag encountered in the segment is checked using this method to determine whether the text inside its
206: * {@linkplain StartTag#getElement() associated element} should be excluded from the output.
207: * <p>
208: * The default implementation of this method is to always return <code>false</code>, so that every element is included,
209: * but the method can be overridden in a subclass to perform a check of arbitrary complexity on each start tag.
210: * <p>
211: * All elements nested inside an excluded element are also implicitly excluded, as are all
212: * {@link HTMLElementName#SCRIPT SCRIPT} and {@link HTMLElementName#STYLE STYLE} elements.
213: * Such elements are skipped over without calling this method, so there is no way to include them by overriding the method.
214: * <p>
215: * <dl>
216: * <dt>Example:</dt>
217: * <dd>
218: * To extract the text from a <code>segment</code>, excluding any text inside elements with the attribute <code>class="NotIndexed"</code>:<br /><br />
219: * <code>
220: * TextExtractor textExtractor=new TextExtractor(segment) {<br />
221: * public boolean excludeElement(StartTag startTag) {<br />
222: * return "NotIndexed".equalsIgnoreCase(startTag.getAttributeValue("class"));<br />
223: * }<br />
224: * };<br />
225: * String extractedText=textExtractor.toString();
226: * </code>
227: * </dd>
228: * </dl>
229: * @param startTag the start tag of the element to check for inclusion.
230: * @return <true> if the text inside the {@link Element} of the specified start tag should be excluded from the output, otherwise <code>false</code>.
231: */
232: public boolean excludeElement(final StartTag startTag) {
233: return false;
234: }
235:
236: /**
237: * This class does the actual work, but is first passed final copies of all the parameters for efficiency.
238: * Note at present this is not implemented in a memory-efficient manner.
239: * Once the CharacterReference.decodeCollapseWhiteSpace functionality is available as a FilterWriter (coming in release 3.0),
240: * the main algorithm with be implemented in the writeTo(Writer) method to allow for more memory-efficient processing.
241: */
242: private final class Processor {
243: private final Segment segment;
244: private final Source source;
245: private final boolean convertNonBreakingSpaces;
246: private final boolean includeAttributes;
247: private final boolean excludeNonHTMLElements;
248:
249: public Processor(final Segment segment,
250: final boolean convertNonBreakingSpaces,
251: final boolean includeAttributes,
252: final boolean excludeNonHTMLElements) {
253: this .segment = segment;
254: source = segment.source;
255: this .convertNonBreakingSpaces = convertNonBreakingSpaces;
256: this .includeAttributes = includeAttributes;
257: this .excludeNonHTMLElements = excludeNonHTMLElements;
258: }
259:
260: public String toString() {
261: final StringBuffer sb = new StringBuffer(segment.length());
262: int textBegin = segment.begin;
263: for (final Iterator i = segment.findAllTags().iterator(); i
264: .hasNext();) {
265: final Tag tag = (Tag) i.next();
266: final int textEnd = tag.begin;
267: if (textEnd < textBegin)
268: continue;
269: while (textBegin < textEnd)
270: sb.append(source.charAt(textBegin++));
271: if (tag.getTagType() == StartTagType.NORMAL) {
272: final StartTag startTag = (StartTag) tag;
273: if (tag.name == HTMLElementName.SCRIPT
274: || tag.name == HTMLElementName.STYLE
275: || excludeElement(startTag)
276: || (excludeNonHTMLElements && !HTMLElements
277: .getElementNames().contains(
278: tag.name))) {
279: textBegin = startTag.getElement().end;
280: continue;
281: }
282: if (includeAttributes) {
283: final Attributes attributes = startTag
284: .getAttributes();
285: // add title attribute:
286: final Attribute titleAttribute = attributes
287: .get("title");
288: if (titleAttribute != null)
289: sb.append(' ').append(
290: titleAttribute.getValueSegment())
291: .append(' ');
292: // add alt attribute (APPLET, AREA, IMG and INPUT elements):
293: final Attribute altAttribute = attributes
294: .get("alt");
295: if (altAttribute != null)
296: sb.append(' ').append(
297: altAttribute.getValueSegment())
298: .append(' ');
299: // add label attribute (OPTION and OPTGROUP elements):
300: final Attribute labelAttribute = attributes
301: .get("label");
302: if (labelAttribute != null)
303: sb.append(' ').append(
304: labelAttribute.getValueSegment())
305: .append(' ');
306: // add summary attribute (TABLE element):
307: final Attribute summaryAttribute = attributes
308: .get("summary");
309: if (summaryAttribute != null)
310: sb.append(' ').append(
311: summaryAttribute.getValueSegment())
312: .append(' ');
313: // add content attribute (META element):
314: final Attribute contentAttribute = attributes
315: .get("content");
316: if (contentAttribute != null
317: && attributes.get("name") != null)
318: sb.append(' ').append(
319: contentAttribute.getValueSegment())
320: .append(' ');
321: // don't bother with the prompt attribute from the ININDEX element as the element is deprecated and very rarely used.
322: }
323: }
324: // Treat both start and end tags not belonging to inline-level elements as whitespace:
325: if (tag.getName() == HTMLElementName.BR
326: || !HTMLElements.getInlineLevelElementNames()
327: .contains(tag.getName()))
328: sb.append(' ');
329: textBegin = tag.end;
330: }
331: while (textBegin < segment.end)
332: sb.append(source.charAt(textBegin++));
333: final String decodedText = CharacterReference
334: .decodeCollapseWhiteSpace(sb,
335: convertNonBreakingSpaces);
336: return decodedText;
337: }
338: }
339: }
|