001: /**********************************************************************************
002: * $URL: https://source.sakaiproject.org/svn/search/tags/sakai_2-4-1/search-impl/impl/src/java/org/sakaiproject/search/component/adapter/contenthosting/HTMLParser.java $
003: * $Id: HTMLParser.java 22588 2007-03-14 09:53:30Z ian@caret.cam.ac.uk $
004: ***********************************************************************************
005: *
006: * Copyright (c) 2003, 2004, 2005, 2006, 2007 The Sakai Foundation.
007: *
008: * Licensed under the Educational Community License, Version 1.0 (the "License");
009: * you may not use this file except in compliance with the License.
010: * You may obtain a copy of the License at
011: *
012: * http://www.opensource.org/licenses/ecl1.php
013: *
014: * Unless required by applicable law or agreed to in writing, software
015: * distributed under the License is distributed on an "AS IS" BASIS,
016: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
017: * See the License for the specific language governing permissions and
018: * limitations under the License.
019: *
020: **********************************************************************************/package org.sakaiproject.search.component.adapter.contenthosting;
021:
022: import java.io.BufferedReader;
023: import java.io.InputStreamReader;
024: import java.util.HashMap;
025: import java.util.Iterator;
026: import java.util.Map;
027:
028: import org.apache.commons.logging.Log;
029: import org.apache.commons.logging.LogFactory;
030:
031: /**
032: * @author ieb
033: */
034: public class HTMLParser implements Iterator<String> {
035: private static Log log = LogFactory.getLog(HTMLParser.class);
036:
037: private static final char[][] IGNORE_TAGS = new char[][] {
038: "script".toCharArray(), "head".toCharArray(),
039: "style".toCharArray() };
040:
041: private static final String PAD = " ";
042:
043: private static final Map<String, String> entities = new HashMap<String, String>();
044:
045: static {
046: try {
047: BufferedReader br = new BufferedReader(
048: new InputStreamReader(
049: HTMLParser.class
050: .getResourceAsStream("/org/sakaiproject/search/component/bundle/htmlentities.config")));
051: for (String line = br.readLine(); line != null; line = br
052: .readLine()) {
053: if (!line.startsWith("#")) {
054: String[] parts = line.split("=");
055: char code = (char) Integer.parseInt(parts[1]);
056: entities.put(parts[0], new String(
057: new char[] { code }));
058: }
059: }
060: br.close();
061: } catch (Exception ex) {
062: log.error("Unable to load HTML Entities", ex);
063: }
064: };
065:
066: private int[] elementStack = new int[1024];
067:
068: private int ignore = elementStack.length;
069:
070: private boolean notxml = false;
071:
072: private char[] cbuf;
073:
074: private int current = 0;
075:
076: private int clen = 0;;
077:
078: private int endstack = 0;
079:
080: private int last = 0;
081:
082: public HTMLParser(String content) {
083: cbuf = content.toCharArray();
084: current = 0;
085: clen = cbuf.length;
086: }
087:
088: public String getTagName(String tag, int start) {
089: tag = tag.substring(start);
090: char[] c = tag.toCharArray();
091: String[] words = tag.split("\\s", 2);
092: if (words != null && words.length != 0) {
093: return words[0];
094: } else {
095: return tag;
096: }
097: }
098:
099: /*
100: * (non-Javadoc)
101: *
102: * @see java.util.Iterator#hasNext()
103: */
104: public boolean hasNext() {
105: if (current >= clen) {
106: return false;
107: }
108: for (int i = current; i < clen; i++) {
109: if (cbuf[i] == '<') {
110: current = i;
111: return true;
112: }
113: }
114: current = clen - 1;
115: return true;
116: }
117:
118: /*
119: * (non-Javadoc)
120: *
121: * @see java.util.Iterator#next()
122: */
123: public String next() {
124: int tagend = clen - 1;
125: int elend = -1;
126: int tagstart = current + 1;
127: boolean ignoreBefore = !(ignore > endstack);
128: boolean ignoreAfter = ignoreBefore;
129:
130: for (int i = current; i < clen; i++) {
131: if (elend == -1 && Character.isWhitespace(cbuf[i])) {
132: elend = i;
133: }
134: if (cbuf[i] == '>') {
135: tagend = i;
136: if (elend == -1) {
137: elend = tagend;
138: }
139: break;
140: }
141: }
142: if (tagstart < clen) {
143: if (cbuf[tagend - 1] == '/') {
144: } else if (tagstart + 2 < clen && cbuf[tagstart] == '!'
145: && cbuf[tagstart + 1] == '-'
146: && cbuf[tagstart + 2] == '-') {
147: } else if (cbuf[tagstart] == '/') {
148: tagstart++;
149: if (!notxml) {
150: boolean match = true;
151: if ((elend - tagstart) == (elementStack[endstack - 1] - elementStack[endstack - 2])) {
152: int j = elementStack[endstack - 2];
153: for (int i = 0; i < (elend - tagstart); i++) {
154: if (Character
155: .toLowerCase(cbuf[tagstart + i]) != Character
156: .toLowerCase(cbuf[j + i])) {
157: match = false;
158: break;
159: }
160: }
161: }
162:
163: if (match) {
164: endstack -= 2;
165: ignoreAfter = !(ignore > endstack);
166: } else {
167: notxml = true;
168: }
169: }
170: } else {
171: if (!notxml) {
172: elementStack[endstack] = tagstart;
173: elementStack[endstack + 1] = elend;
174: endstack += 2;
175: if (!ignoreAfter) {
176: for (int i = 0; i < IGNORE_TAGS.length; i++) {
177: if (IGNORE_TAGS[i].length == (elend - tagstart)) {
178: ignoreAfter = true;
179: for (int j = 0; j < IGNORE_TAGS[i].length; j++) {
180: if (IGNORE_TAGS[i][j] != Character
181: .toLowerCase(cbuf[tagstart
182: + j])) {
183: ignoreAfter = false;
184: break;
185: }
186: }
187: if (ignoreAfter) {
188: break;
189: }
190: }
191: }
192: }
193: }
194: }
195: }
196:
197: String t = "";
198: if (notxml || !ignoreBefore) {
199: if (true) {
200: StringBuilder sb = new StringBuilder();
201:
202: for (int i = last; i < current; i++) {
203: if (cbuf[i] == '&') {
204: if (cbuf[i + 1] == '#') {
205: for (int j = i; j < current; j++) {
206: if (cbuf[j] == ';') {
207: String entity = new String(cbuf,
208: i + 2, j - (i + 2));
209: sb.append((char) Integer.decode(
210: entity).intValue());
211: i = j;
212: break;
213: }
214: }
215: } else {
216: for (int j = i; j < current; j++) {
217: if (cbuf[j] == ';') {
218:
219: String entity = new String(cbuf, i,
220: j - i + 1);
221: String s = (String) entities
222: .get(entity);
223: if (s == null) {
224: s = entity;
225: } else if (s.length() > 0) {
226: sb.append(s.charAt(0));
227: }
228: i = j;
229: break;
230: }
231: }
232: }
233: } else {
234: sb.append(cbuf[i]);
235: }
236: }
237: t = sb.toString();
238: } else {
239: t = new String(cbuf, last, current - last);
240: }
241: }
242: last = tagend + 1;
243: current = last;
244:
245: if (ignoreAfter) {
246: if (!ignoreBefore) {
247: ignore = endstack;
248: }
249: } else {
250: ignore = endstack + 2;
251: }
252: return t;
253: }
254:
255: /*
256: * (non-Javadoc)
257: *
258: * @see java.util.Iterator#remove()
259: */
260: public void remove() {
261: }
262:
263: }
|