001: /**
002: *
003: */package com.dappit.Dapper.parser;
004:
005: import java.util.Iterator;
006: import java.util.Vector;
007: import java.util.regex.Matcher;
008: import java.util.regex.Pattern;
009:
010: import org.dom4j.dom.DOMDocument;
011: import org.dom4j.dom.DOMEntityReference;
012: import org.w3c.dom.CDATASection;
013: import org.w3c.dom.Comment;
014: import org.w3c.dom.Document;
015: import org.w3c.dom.Element;
016: import org.w3c.dom.NodeList;
017: import org.w3c.dom.Text;
018:
019: /**
020: * @author Ohad Serfaty
021: *
022: * A class for building DOM documents from mozilla's content sink instructions
023: *
024: * supported operations are : OpenNode <tag name> CloseNode <tag name> AddText
025: * <content> AddLeaf <tag name> WriteAttributeKey <key> - in pair with the next
026: * op : WriteAttributeValue <value> CloseLead AddComment AddEntity
027: *
028: * Unsupported ( fot the time being ) : AddInstruction AddTitle
029: *
030: *
031: * Note that this class is reusable , you can use reset() to clear the content
032: * of the dom.
033: *
034: */
035: public class DomDocumentBuilder {
036:
037: private static char charMinusOne = (char) -1;
038:
039: public static String getCDATASection(String domArgument) {
040: if (!domArgument.contains("CDATA"))
041: return null;
042: Pattern pat = Pattern.compile(
043: "(.*)\\<\\!(\\s*)\\[CDATA(.*)\\]\\]\\>(.*)",
044: Pattern.DOTALL + Pattern.MULTILINE);
045: Matcher mat = pat.matcher(domArgument);
046: if (mat.find()) {
047: String group3 = mat.group(3);
048: if (group3.startsWith("["))
049: group3 = group3.replaceFirst("\\[", "");
050: String result = mat.group(1) + group3 + mat.group(4);
051: return result;
052: }
053: return null;
054: }
055:
056: /**
057: * Finalize and build the dom document.
058: *
059: * @return
060: */
061: public Document buildDocument(InstructionsPool instructionsPool) {
062: // System.out.println("building document...");
063: DOMDocument resultDocument = new DOMDocument();
064: Iterator<Integer> i = instructionsPool.operations.iterator();
065: Iterator<String> j = instructionsPool.arguments.iterator();
066: Element currentElement = null;
067: boolean isInLeaf = false;
068: boolean closeHtml = true;
069:
070: while (i.hasNext()) {
071: int domOperation = i.next();
072: String domArgument = j.next();
073: //System.out.println("Operation :" + ParserInstruction.getOperationString(domOperation)+" Arg:~" + domArgument+"~");
074: switch (domOperation) {
075: // Open node :
076: case ParserInstruction.OpenNode:
077: closeHtml = true;
078: Element childNode = resultDocument
079: .createElement(domArgument.toLowerCase());
080: if (currentElement == null) {
081: resultDocument
082: .setRootElement((org.dom4j.Element) childNode);
083: currentElement = childNode;
084: } else {
085: if (!domArgument.equalsIgnoreCase("html")) {
086: currentElement.appendChild(childNode);
087: currentElement = childNode;
088: } else
089: closeHtml = false;
090: }
091: break;
092: // Close node :
093: case ParserInstruction.CloseNode: {
094: if (currentElement == null) {
095: System.err
096: .println("Error : Close Node where no OpenNode was called. trying to fix...");
097: // this.dump();
098: } else if (closeHtml)
099: currentElement = (Element) currentElement
100: .getParentNode();
101:
102: }
103: break;
104: case ParserInstruction.AddText:
105: case ParserInstruction.AddContent:
106: // System.out.println(currentElement.getNodeName() +" : Adding
107: // text :" + domArgument);
108: // check : try and resolve this with a <newline> from mozilla
109: // instead :
110: boolean script = false;
111: boolean style = false;
112:
113: if (currentElement.getNodeName().equalsIgnoreCase(
114: "script"))
115: script = true;
116: else if (currentElement.getNodeName().equalsIgnoreCase(
117: "style"))
118: style = true;
119: else
120: domArgument = DomDocumentBuilder
121: .fixText(domArgument);
122:
123: // System.out.println("Body content :" + domArgument);
124: // System.out.println("AddText "+domArgument.length());
125: if (domArgument.length() >= 1) {
126: if (!script && !style) {
127: Text textNode = resultDocument
128: .createTextNode(domArgument);
129: currentElement.appendChild(textNode);
130: } else {
131: domArgument = domArgument.trim();
132: String cdata = getCDATASection(domArgument);
133:
134: if (cdata != null) {
135: if (script)
136: cdata = DomDocumentBuilder
137: .fixText(cdata);
138: else
139: cdata = DomDocumentBuilder
140: .fixText(domArgument);
141: CDATASection cdataSection = resultDocument
142: .createCDATASection(cdata);
143: currentElement.appendChild(cdataSection);
144: } else {
145: domArgument = DomDocumentBuilder
146: .fixText(domArgument);
147: Text textNode = resultDocument
148: .createTextNode(domArgument);
149: currentElement.appendChild(textNode);
150: }
151: }
152: }
153: break;
154: case ParserInstruction.AddLeaf:
155: Element leafElement = resultDocument
156: .createElement(domArgument);
157: currentElement.appendChild(leafElement);
158: currentElement = leafElement;
159: isInLeaf = true;
160: break;
161: case ParserInstruction.WriteAttributeKey:
162: // add an attribute with the next lookahead operation :
163: domOperation = i.next(); // Fetch the next operation , must
164: // be WriteAttributeValue
165: String value = j.next(); // Feth the attributes value.
166: if (!domArgument.toLowerCase().trim().equalsIgnoreCase(
167: "_moz-userdefined"))
168: currentElement.setAttribute(domArgument
169: .toLowerCase(), DomDocumentBuilder
170: .fixText(value));
171: break;
172: case ParserInstruction.CloseLeaf:
173: if (isInLeaf) {
174: currentElement = (Element) currentElement
175: .getParentNode();
176: isInLeaf = false;
177: }
178: break;
179: case ParserInstruction.AddEntity:
180: DOMEntityReference entity = (DOMEntityReference) resultDocument
181: .createEntityReference(domArgument);
182: // a bugfix for a c++ problem in the mozilla parser:
183: if (!Character.isDigit(domArgument.charAt(0)))
184: entity.setText("&" + domArgument + ";");
185: else
186: entity.setText("");
187: currentElement.appendChild(entity);
188: break;
189: case ParserInstruction.AddComment:
190: Comment comment = resultDocument
191: .createComment(domArgument);
192: currentElement.appendChild(comment);
193: break;
194: case ParserInstruction.SetTitle:
195: Element titleNode = resultDocument
196: .createElement("title");
197: titleNode.appendChild(resultDocument
198: .createTextNode(fixText(domArgument)));
199: NodeList headElements = resultDocument
200: .getElementsByTagName("head");
201: // Add the title with the new text :
202: if (headElements.getLength() > 0)
203: headElements.item(0).appendChild(titleNode);
204: break;
205: }
206: }
207: return resultDocument;
208: }
209:
210: public static String fixText(String text) {
211: StringBuilder fixedText = new StringBuilder();
212: char[] charArray = text.toCharArray();
213: for (int i = 0; i < charArray.length; i++) {
214: char ch = charArray[i];
215: if (ch == '&') {
216: char ch2 = charArray.length >= i + 2 ? charArray[i + 1]
217: : charMinusOne;
218: char ch3 = charArray.length >= i + 3 ? charArray[i + 2]
219: : charMinusOne;
220: char ch4 = charArray.length >= i + 4 ? charArray[i + 3]
221: : charMinusOne;
222: char ch5 = charArray.length >= i + 5 ? charArray[i + 4]
223: : charMinusOne;
224: char ch6 = charArray.length >= i + 6 ? charArray[i + 5]
225: : charMinusOne;
226: // char ch7 = charArray.length > i+7 ? charArray[i+6] :
227: // charMinusOne;
228: // System.out.println("ch3:" + ch3 +" ch7:" + ch6);
229: if (ch2 == '#') {
230: if (ch3 == '1') {
231: if ((ch4 == '0' && ch5 == ';'))
232: i = i + 4;
233: } else if (ch3 == '9' && ch4 == ';') {
234: i = i + 3;
235: }
236: } else if (ch2 == 'l' && ch3 == 't' && ch4 == ';') {
237: fixedText.append('>');
238: i = i + 2;
239: } else if (ch2 == 'g' && ch3 == 't' && ch4 == ';') {
240: fixedText.append('<');
241: i = i + 3;
242: } else if (ch2 == 'a' && ch3 == 'm' && ch4 == 'p'
243: && ch5 == ';') {
244: fixedText.append('&');
245: i = i + 4;
246: } else if (ch2 == 'q' && ch3 == 'u' && ch4 == 'o'
247: && ch5 == 't' && ch6 == ';') {
248: fixedText.append('"');
249: i = i + 5;
250: }
251: } else if (ch == '\n')
252: fixedText.append(ch);
253: else if (ch == 32)
254: fixedText.append(' ');
255: else if (ch < 32 && ch > 0 && ch != 9)
256: ;
257: else
258: fixedText.append(ch);
259: }
260: return fixedText.toString();
261: }
262:
263: // Old version of the fixText function :
264: // private static final String String32 = new String(new byte[]{ 32 });
265: // private static final String String0 = new String(new byte[]{ 0 });
266: // private static final String String1 = new String(new byte[]{ 0x1 });
267: // private static final String String14 = new String(new byte[]{ 0x14 });
268: // private static final String String1d = new String(new byte[]{ 0x1d });
269: // private static final String String0xf = new String(new byte[]{ 0xf });
270: // private static final String String0x1A = new String(new byte[]{ 0x1A });
271: // private static final String String0x12 = new String(new byte[]{ 0x12 });
272: // private static final String String0x8 = new String(new byte[]{ 0x8 });
273: // private static final String String0x1f = new String(new byte[]{ 0x1f });
274: // private static final String String0x2 = new String(new byte[]{ 0x2 });
275: // private static final String String0x7 = new String(new byte[]{ 0x7 });
276: // private static final String String0x18 = new String(new byte[]{ 0x18 });
277: // private static final String String0x19 = new String(new byte[]{ 0x19 });
278: // private static final String String0x1B = new String(new byte[]{ 0x1B });
279: // private static final String String0x1C = new String(new byte[]{ 0x1C });
280: // private static final String String0x11 = new String(new byte[]{ 0x11 });
281: // private static final String String0x10 = new String(new byte[]{ 0x10 });
282: // private static final String String0x13 = new String(new byte[]{ 0x13 });
283: //
284: // private static String fixTextOld(String text)
285: // {
286: // String fixedText = new String(text);
287: // fixedText = fixedText.replaceAll(" ", "");
288: // fixedText = fixedText.replaceAll("	", "");
289: // fixedText = fixedText.replaceAll(" ", " ");
290: // fixedText = fixedText.replaceAll("Â", "\"");
291: // fixedText = fixedText.replaceAll(""", "\"");
292: // fixedText = fixedText.replaceAll("<", "<");
293: // fixedText = fixedText.replaceAll(">", ">");
294: // fixedText = fixedText.replaceAll("&", "&");
295: // fixedText = fixedText.replaceAll(String32, " ");
296: // fixedText = fixedText.replaceAll("["+String0+String1+String14+String1d+String0xf + String0xf+
297: // String0x1A + String0x12 + String0x8 + String0x1f+ String0x2+String0x7+String0x18+String0x19+String0x1B+String0x1C+
298: // String0x11+String0x10+String0x13+"]" , "");
299: // return fixedText;
300: // }
301:
302: }
|