001: package com.dappit.Dapper.parser;
002:
003: import java.io.BufferedReader;
004: import java.io.StringReader;
005: import java.util.Vector;
006:
007: import org.dom4j.DocumentException;
008: import org.w3c.dom.Document;
009:
010: /**
011: * @author Ohad Serfaty
012: *
013: * A Mozilla native Html Parser
014: *
015: */
016: public class MozillaParser {
017:
018: boolean isParsing = false;
019: static boolean isInitialized = false;
020: DomDocumentBuilder domBuilder = new DomDocumentBuilder();
021: InstructionsPool instructionsPool = new InstructionsPool();
022: private static String MozillaInitializedJvmProperty = "MozillaParser.Initialized";
023:
024: public static final double LatestParserVersion = 2.0;
025: public static final double ProcutionOneParserVersion = 1.7;
026: private double currentParserVersion = LatestParserVersion;
027:
028: /**
029: * initialize the mozilla XPCOM embedded components with the proper
030: * components base directory
031: *
032: * @param componentBase
033: * mozilla's components directory (e.g
034: * /home/ohad/mozilla/dist/bin )
035: */
036: private synchronized static native void initXPCOM(
037: String componentBase) throws ParserInitializationException;
038:
039: /**
040: * Native function. parse an html function using mozilla's html parser and
041: * make callbacks to the java local sink ( DomDocumentBuilder for that
042: * matter)
043: *
044: * @param html
045: * HTML to parse.
046: * @throws ParserInitializationException
047: */
048: private native void parseHtml(String html, double parserVersion)
049: throws ParserInitializationException;
050:
051: /**
052: *
053: * A callback is being made from native code to this function.
054: *
055: * @param domOperation
056: * @param domArgument
057: */
058: public void callback(int domOperation, String domArgument) {
059: //System.out.println("called back with :"+domOperation +" " + domArgument );
060: this .instructionsPool.addInstruction(domOperation, domArgument);
061: }
062:
063: public void callNativeHtmlParser(String html)
064: throws DocumentException {
065:
066: // System.out.println("*********** native html parser... calling ");
067: html = html.replaceAll("<\\s*(STYLE|style)\\s*>",
068: "<$1 harmless=''> ");
069: html = html.replaceAll("<\\s*(script|SCRIPT)\\s*>",
070: "<$1 harmless=''> ");
071: this .instructionsPool.reset();
072: try {
073: this .parseHtml(html, this .currentParserVersion);
074: } catch (Throwable e) {
075: System.err.println("Warning: could not parse html :"
076: + e.getMessage());
077: throw new DocumentException(e);
078: }
079: }
080:
081: public Document parse(String html) throws DocumentException {
082: callNativeHtmlParser(html);
083: return this .domBuilder.buildDocument(instructionsPool);
084: }
085:
086: public void dump() {
087: this .instructionsPool.dump();
088: }
089:
090: /**
091: * Initialize the mozilla html parser with a DLL to load and a mozilla
092: * component base
093: *
094: * @param dllToLoad
095: * @param componentsBase
096: * @throws ParserInitializationException
097: */
098: public static void init(String parserLibrary, String componentsBase)
099: throws Exception {
100: String initialized = System
101: .getProperty(MozillaInitializedJvmProperty);
102: if (initialized == null) {
103: System.setProperty(MozillaInitializedJvmProperty, "true");
104: } else {
105: System.err
106: .println("Warning : MozillaParser detected an additional attempt to initialize XPCOM. operation ignored.");
107: return;
108: }
109: try {
110: System.load(parserLibrary);
111: } catch (Throwable e) {
112: System.err
113: .println("Warning:Could not load library "
114: + parserLibrary
115: + " Possible reason : "
116: + "You have to include both mozilla.dist.bin."
117: + EnviromentController
118: .getOperatingSystemName()
119: + " And mozilla.dist.bin."
120: + EnviromentController
121: .getOperatingSystemName()
122: + " "
123: + "In the right environment variable (windows:PATH , Linux: LD_LIBRARY_PATH , macosx: DYLD_LIBRARY_PATH )");
124: throw new ParserInitializationException(e);
125: }
126: initXPCOM(componentsBase);
127: }
128:
129: /**
130: * @return
131: */
132: public Vector<Integer> getDomBuilderArguments() {
133: return this .instructionsPool.getInstructions();
134: }
135:
136: public InstructionsPool getInstructionsPool() {
137: return this .instructionsPool;
138: }
139:
140: public void setParserVersion(double parserVersion) {
141: if (parserVersion != LatestParserVersion
142: && parserVersion != ProcutionOneParserVersion)
143: throw new Error("Ilegal parser version");
144: this.currentParserVersion = parserVersion;
145: }
146: }
|