001: /*
002: * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
003: *
004: * Redistribution and use in source and binary forms, with or without
005: * modification, are permitted provided that the following conditions
006: * are met:
007: *
008: * - Redistributions of source code must retain the above copyright
009: * notice, this list of conditions and the following disclaimer.
010: *
011: * - Redistribution in binary form must reproduce the above copyright
012: * notice, this list of conditions and the following disclaimer in
013: * the documentation and/or other materials provided with the
014: * distribution.
015: *
016: * Neither the name of Sun Microsystems, Inc. or the names of
017: * contributors may be used to endorse or promote products derived
018: * from this software without specific prior written permission.
019: *
020: * This software is provided "AS IS," without a warranty of any
021: * kind. ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND
022: * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY,
023: * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT, ARE HEREBY
024: * EXCLUDED. SUN AND ITS LICENSORS SHALL NOT BE LIABLE FOR ANY DAMAGES
025: * SUFFERED BY LICENSEE AS A RESULT OF USING, MODIFYING OR
026: * DISTRIBUTING THE SOFTWARE OR ITS DERIVATIVES. IN NO EVENT WILL SUN
027: * OR ITS LICENSORS BE LIABLE FOR ANY LOST REVENUE, PROFIT OR DATA, OR
028: * FOR DIRECT, INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL OR
029: * PUNITIVE DAMAGES, HOWEVER CAUSED AND REGARDLESS OF THE THEORY OF
030: * LIABILITY, ARISING OUT OF THE USE OF OR INABILITY TO USE SOFTWARE,
031: * EVEN IF SUN HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
032: *
* You acknowledge that Software is not designed, licensed or intended
* for use in the design, construction, operation or maintenance of
* any nuclear facility.
035: */
036:
037: /*
038: * HTMLParser.java
039: *
040: * Created on July 18, 2005, 4:05 PM
041: *
042: */
043:
044: package com.sun.portal.search.util;
045:
046: import com.sun.portal.search.soif.SOIF;
047: import java.io.File;
048: import java.io.FileInputStream;
049: import java.io.InputStreamReader;
050: import java.io.Reader;
051: import javax.swing.text.html.HTML.Tag;
052: import javax.swing.text.html.HTML.Attribute;
053: import javax.swing.text.html.HTMLEditorKit.ParserCallback;
054: import javax.swing.text.html.parser.ParserDelegator;
055:
056: public class HTMLParser {
057: SOIF rd;
058: String textField = "partial-text";
059: StringBuffer textBuffer = new StringBuffer();
060:
061: /** Creates a new instance of HTMLParser */
062: class myParserCallback extends ParserCallback {
063:
064: String[] metas = { "author", "title", "subject", "keywords",
065: "comment" };
066:
067: /** Creates a new instance of myParserCallback */
068: public myParserCallback() {
069: }
070:
071: public void handleSimpleTag(javax.swing.text.html.HTML.Tag t,
072: javax.swing.text.MutableAttributeSet a, int pos) {
073: //System.err.println("handleSimpleTag:" + t.toString());
074: if (t.equals(Tag.META)) {
075: String name = (String) a.getAttribute(Attribute.NAME);
076: if (name != null) {
077: for (int i = 0; i < this .metas.length; i++) {
078: //System.err.println("name=" + name);
079: if (name.equalsIgnoreCase(metas[i])) {
080: String content = (String) a
081: .getAttribute(Attribute.CONTENT);
082: rd.insert(metas[i], content);
083: break;
084: }
085: }
086: }
087: }
088: super .handleSimpleTag(t, a, pos);
089: }
090:
091: public void handleText(char[] data, int pos) {
092: //System.err.println("handleText:" + pos);
093: textBuffer.append(data);
094: textBuffer.append('\n');
095:
096: super .handleText(data, pos);
097: }
098:
099: }
100:
101: public HTMLParser(SOIF s) {
102: this .rd = s;
103: }
104:
105: public HTMLParser(SOIF s, String textField) {
106: this .rd = s;
107: if (textField != null && textField.length() > 0) {
108: this .textField = textField;
109: }
110: }
111:
112: public void parse(File f) throws Exception {
113: Reader r = new InputStreamReader(new FileInputStream(f),
114: "UTF-8");
115: myParserCallback cb = new myParserCallback();
116: new ParserDelegator().parse(r, cb, true);
117: rd.replace(textField, textBuffer.toString());
118: }
119:
120: /**
121: * @param args the command line arguments
122: */
123: public static void main(String[] args) throws Exception {
124: SOIF rd = new SOIF("Document", "-");
125: HTMLParser parser = new HTMLParser(rd);
126: parser.parse(new File(args[0]));
127: System.err.println("RD=\n" + rd.toString());
128: }
129:
130: }
|