001: /*
002: * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
003: *
004: * Redistribution and use in source and binary forms, with or without
005: * modification, are permitted provided that the following conditions
006: * are met:
007: *
008: * - Redistributions of source code must retain the above copyright
009: * notice, this list of conditions and the following disclaimer.
010: *
011: * - Redistribution in binary form must reproduce the above copyright
012: * notice, this list of conditions and the following disclaimer in
013: * the documentation and/or other materials provided with the
014: * distribution.
015: *
016: * Neither the name of Sun Microsystems, Inc. or the names of
017: * contributors may be used to endorse or promote products derived
018: * from this software without specific prior written permission.
019: *
020: * This software is provided "AS IS," without a warranty of any
021: * kind. ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND
022: * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY,
023: * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT, ARE HEREBY
024: * EXCLUDED. SUN AND ITS LICENSORS SHALL NOT BE LIABLE FOR ANY DAMAGES
025: * SUFFERED BY LICENSEE AS A RESULT OF USING, MODIFYING OR
026: * DISTRIBUTING THE SOFTWARE OR ITS DERIVATIVES. IN NO EVENT WILL SUN
027: * OR ITS LICENSORS BE LIABLE FOR ANY LOST REVENUE, PROFIT OR DATA, OR
028: * FOR DIRECT, INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL OR
029: * PUNITIVE DAMAGES, HOWEVER CAUSED AND REGARDLESS OF THE THEORY OF
030: * LIABILITY, ARISING OUT OF THE USE OF OR INABILITY TO USE SOFTWARE,
031: * EVEN IF SUN HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
032: *
* You acknowledge that Software is not designed, licensed or intended
* for use in the design, construction, operation or maintenance of
* any nuclear facility.
035: */
036:
037: /*
038: * DocumentConverter.java
039: *
040: * Created on July 26, 2005, 1:38 PM
041: *
042: */
043:
044: package com.sun.portal.search.util;
045:
046: import com.sun.portal.search.soif.SOIF;
047: import java.io.File;
048: import java.io.FileOutputStream;
049: import java.io.IOException;
050: import java.util.Date;
051: import java.text.MessageFormat;
052:
053: public class DocumentConverter {
054: /**
055: * cli pattern
056: */
057: static public final String converterUsage = "{0}" + File.separator
058: + "run-cs-cli{3} converter {1} {2}";
059: static public final String scriptExtension = System.getProperty(
060: "os.name").toLowerCase().startsWith("win") ? ".bat" : "";
061: private String tempDir;
062: private String serverRoot;
063:
064: /**
065: * Creates a new instance of DocumentConverter
066: * @param serverRoot search server root
067: * @param tempDir tmp dir
068: */
069: public DocumentConverter(String serverRoot, String tempDir) {
070: this .tempDir = tempDir;
071: this .serverRoot = serverRoot;
072: }
073:
074: /**
075: * Converts an attached file, which is stored as a soif attribute, into a html.
076: * It ,then, extracts meta tags from the html, such as title and author.
077: * @param s SOIF object with attached file.
078: * @param attributeSrc The attribute holds the file data
079: * @param attributeDest The attribute for holding extracted text
080: * @param deleteSrc true - delete the attached file.
081: * false - keep it.
082: */
083: public void ConvertToSOIF(SOIF s, String attributeSrc,
084: String attributeDest, boolean deleteSrc) {
085: if (s.contains(attributeSrc)) {
086: Date now = new Date();
087: try {
088: String tmpsrc = tempDir + File.separator + "src_"
089: + Long.toString(now.getTime());
090: FileOutputStream fout = new FileOutputStream(tmpsrc);
091: fout.write(s.getBytes(attributeSrc));
092: fout.close();
093: if (deleteSrc) {
094: s.remove(attributeSrc);
095: }
096: String tmpdest = tempDir + File.separator + "dest_"
097: + Long.toString(now.getTime());
098: Runtime rt = Runtime.getRuntime();
099: Object[] args = { serverRoot, tmpsrc, tmpdest,
100: scriptExtension };
101: String cmd = MessageFormat.format(converterUsage, args);
102: //System.err.println("running command:" + cmd);
103: Process p = rt.exec(cmd, null, new File(serverRoot));
104: try {
105: p.waitFor();
106: //System.err.println("exit value:" + p.exitValue());
107: HTMLParser parser = new HTMLParser(s, attributeDest);
108: try {
109: parser.parse(new File(tmpdest));
110: } catch (Exception pe) {
111: //System.err.println("parse exception:" + pe);
112: }
113: } catch (InterruptedException e) {
114: //System.err.println("interrupted exception:" + e);
115: }
116: try {
117: new File(tmpsrc).delete();
118: } catch (Exception e) {
119: }
120: try {
121: new File(tmpdest).delete();
122: } catch (Exception e) {
123: }
124:
125: } catch (Exception fe) {
126: //System.err.println("IO exception:" + fe);
127: }
128: }
129: }
130:
131: }
|