001: /**********************************************************************************
002: * $URL: https://source.sakaiproject.org/svn/search/tags/sakai_2-4-1/search-impl/impl/src/java/org/sakaiproject/search/component/adapter/contenthosting/HtmlContentDigester.java $
003: * $Id: HtmlContentDigester.java 21699 2007-02-17 18:02:31Z ian@caret.cam.ac.uk $
004: ***********************************************************************************
005: *
006: * Copyright (c) 2003, 2004, 2005, 2006 The Sakai Foundation.
007: *
008: * Licensed under the Educational Community License, Version 1.0 (the "License");
009: * you may not use this file except in compliance with the License.
010: * You may obtain a copy of the License at
011: *
012: * http://www.opensource.org/licenses/ecl1.php
013: *
014: * Unless required by applicable law or agreed to in writing, software
015: * distributed under the License is distributed on an "AS IS" BASIS,
016: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
017: * See the License for the specific language governing permissions and
018: * limitations under the License.
019: *
020: **********************************************************************************/package org.sakaiproject.search.component.adapter.contenthosting;
021:
022: import java.io.ByteArrayOutputStream;
023: import java.io.IOException;
024: import java.io.InputStream;
025: import java.io.Reader;
026: import java.io.StringReader;
027: import java.io.UnsupportedEncodingException;
028: import java.util.Iterator;
029:
030: import org.apache.commons.logging.Log;
031: import org.apache.commons.logging.LogFactory;
032: import org.sakaiproject.content.api.ContentResource;
033: import org.sakaiproject.exception.ServerOverloadException;
034: import org.sakaiproject.search.api.SearchUtils;
035: import org.sakaiproject.search.component.adapter.util.DigestHtml;
036: import org.w3c.tidy.Tidy;
037:
038: /**
039: * @author ieb
040: */
041: public class HtmlContentDigester extends BaseContentDigester {
042: private static Log log = LogFactory
043: .getLog(HtmlContentDigester.class);
044:
045: private boolean useDirectParser = true;
046:
047: /*
048: * (non-Javadoc)
049: *
050: * @see org.sakaiproject.search.component.adapter.contenthosting.ContentDigester#getContent(org.sakaiproject.content.api.ContentResource)
051: */
052: public String getContent(ContentResource contentResource)
053:
054: {
055: if (contentResource != null
056: && contentResource.getContentLength() > maxDigestSize) {
057: throw new RuntimeException(
058: "Attempt to get too much content as a string on "
059: + contentResource.getReference());
060: }
061: if (useDirectParser) {
062: try {
063: String content = new String(contentResource
064: .getContent(), "UTF-8");
065: StringBuilder sb = new StringBuilder();
066: for (Iterator<String> i = new HTMLParser(content); i
067: .hasNext();) {
068: String s = i.next();
069: if (s.length() > 0) {
070: SearchUtils.appendCleanString(s, sb);
071: }
072: }
073: return sb.toString();
074: } catch (ServerOverloadException ex) {
075: throw new RuntimeException(
076: "Failed get Resource Content ", ex);
077:
078: } catch (UnsupportedEncodingException e) {
079: throw new RuntimeException(
080: "Failed get Resource Content ", e);
081: }
082: } else {
083:
084: InputStream contentStream = null;
085: Tidy tidy = new Tidy();
086: ByteArrayOutputStream baos = new ByteArrayOutputStream();
087: try {
088:
089: contentStream = contentResource.streamContent();
090: log.info("Raw Content was " + contentStream);
091: tidy.setQuiet(true);
092: tidy.setShowWarnings(false);
093: tidy.setOnlyErrors(true);
094: tidy.parse(contentStream, baos);
095:
096: String tidyOut = SearchUtils.appendCleanString(
097: new String(baos.toByteArray(), "UTF-8"), null)
098: .toString();
099: log.info(contentResource.getReference()
100: + " Tidy Output was " + tidyOut);
101: log.debug("Tidy Output was " + tidyOut);
102: return DigestHtml.digest(tidyOut);
103:
104: } catch (ServerOverloadException e) {
105: throw new RuntimeException(
106: "Failed get Resource Content ", e);
107: } catch (UnsupportedEncodingException e) {
108: throw new RuntimeException(
109: "Failed get Resource Content ", e);
110: } finally {
111: if (baos != null) {
112: try {
113: baos.close();
114: } catch (IOException e) {
115: }
116: }
117: if (contentStream != null) {
118: try {
119: contentStream.close();
120: } catch (IOException e) {
121: }
122: }
123: }
124: }
125: }
126:
127: /*
128: * (non-Javadoc)
129: *
130: * @see org.sakaiproject.search.component.adapter.contenthosting.ContentDigester#getContentReader(org.sakaiproject.content.api.ContentResource, int)
131: */
132: public Reader getContentReader(ContentResource contentResource) {
133: // TODO Auto-generated method stub
134: return new StringReader(getContent(contentResource));
135: }
136: }
|