01: package com.opensymphony.module.sitemesh.parser;
02:
03: import com.opensymphony.module.sitemesh.Page;
04: import com.opensymphony.module.sitemesh.PageParser;
05: import com.opensymphony.module.sitemesh.html.HTMLProcessor;
06: import com.opensymphony.module.sitemesh.html.State;
07: import com.opensymphony.module.sitemesh.html.StateTransitionRule;
08: import com.opensymphony.module.sitemesh.html.tokenizer.TagTokenizer;
09: import com.opensymphony.module.sitemesh.html.util.CharArray;
10: import com.opensymphony.module.sitemesh.html.rules.BodyTagRule;
11: import com.opensymphony.module.sitemesh.html.rules.ContentBlockExtractingRule;
12: import com.opensymphony.module.sitemesh.html.rules.FramesetRule;
13: import com.opensymphony.module.sitemesh.html.rules.HeadExtractingRule;
14: import com.opensymphony.module.sitemesh.html.rules.HtmlAttributesRule;
15: import com.opensymphony.module.sitemesh.html.rules.MSOfficeDocumentPropertiesRule;
16: import com.opensymphony.module.sitemesh.html.rules.MetaTagRule;
17: import com.opensymphony.module.sitemesh.html.rules.ParameterExtractingRule;
18: import com.opensymphony.module.sitemesh.html.rules.TitleExtractingRule;
19: import com.opensymphony.module.sitemesh.html.rules.PageBuilder;
20:
21: import java.io.IOException;
22:
23: /**
24: * <p>Builds an HTMLPage object from an HTML document. This behaves
25: * similarly to the FastPageParser, however it's a complete rewrite that is simpler to add custom features to such as
26: * extraction and transformation of elements.</p>
27: *
28: * <p>To customize the rules used, this class can be extended and have the userDefinedRules() methods overridden.</p>
29: *
30: * @author Joe Walnes
31: *
32: * @see HTMLProcessor
33: */
34: public class HTMLPageParser implements PageParser {
35:
36: public Page parse(char[] data) throws IOException {
37: CharArray head = new CharArray(64);
38: CharArray body = new CharArray(4096);
39: TokenizedHTMLPage page = new TokenizedHTMLPage(data, body, head);
40: HTMLProcessor processor = new HTMLProcessor(data, body);
41: State html = processor.defaultState();
42:
43: // Core rules for SiteMesh to be functional.
44: html.addRule(new HeadExtractingRule(head)); // contents of <head>
45: html.addRule(new BodyTagRule(page, body)); // contents of <body>
46: html.addRule(new TitleExtractingRule(page)); // the <title>
47: html.addRule(new FramesetRule(page)); // if the page is a frameset
48:
49: // Additional rules - designed to be tweaked.
50: addUserDefinedRules(html, page);
51:
52: processor.process();
53: return page;
54: }
55:
56: protected void addUserDefinedRules(State html, PageBuilder page) {
57: // Ensure that while in <xml> tag, none of the other rules kick in.
58: // For example <xml><book><title>hello</title></book></xml> should not change the affect the title of the page.
59: State xml = new State();
60: html.addRule(new StateTransitionRule("xml", xml));
61:
62: // Useful properties
63: html.addRule(new HtmlAttributesRule(page)); // attributes in <html> element
64: html.addRule(new MetaTagRule(page)); // all <meta> tags
65: html.addRule(new ParameterExtractingRule(page)); // <parameter> blocks
66: html.addRule(new ContentBlockExtractingRule(page)); // <content> blocks
67:
68: // Capture properties written to documents by MS Office (author, version, company, etc).
69: // Note: These properties are from the xml state, not the html state.
70: xml.addRule(new MSOfficeDocumentPropertiesRule(page));
71: }
72:
73: }
|