/*
 * Copyright 2004 Outerthought bvba and Schaubroeck nv
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.outerj.daisy.jspwiki_import;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.xerces.parsers.DOMParser;
import org.cyberneko.html.HTMLConfiguration;
import org.xml.sax.*;
import org.xml.sax.helpers.AttributesImpl;
import org.jaxen.dom.DOMXPath;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.w3c.dom.Node;
import org.outerj.daisy.htmlcleaner.HtmlCleanerFactory;
import org.outerj.daisy.htmlcleaner.HtmlCleanerTemplate;
import org.outerj.daisy.htmlcleaner.HtmlCleaner;
import org.outerj.daisy.repository.*;
import org.outerj.daisy.repository.clientimpl.RemoteRepositoryManager;

import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.Transformer;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.sax.SAXResult;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.parsers.SAXParser;
import java.util.*;
import java.io.*;
import java.net.URLDecoder;
/**
 * Standalone application to import the contents of a JSPWiki site into Daisy.
 * Currently written only for importing the Cocoon Wiki content, in order to
 * have some meaningful, realistically-sized test data.
 *
 * <p>The import runs in two passes: first all wiki pages are imported
 * into Daisy, then links are translated from wiki page names to Daisy
 * document ids.
 *
 * <p>To run, after the Maven build, execute target/runimport.sh.
 *
 * <p>To make this usable as a generic utility, at least the hardcoded
 * wiki location and the Daisy username, collection and URL should be
 * specifiable as command line parameters.
 */
public class JspWikiImporter {
    private String wikiPageURL = "http://wiki.cocoondev.org/Wiki.jsp?page=";
    private String collectionName = "cocoon";
    private String daisyUser = "jspwiki-import";
    private String daisyPassword = "topsecret";
    private HashSet allPageNames = new HashSet();
    private DocumentBuilder documentBuilder;
    private HtmlCleanerTemplate htmlCleanerTemplate;
    private SAXTransformerFactory transformerFactory =
            (SAXTransformerFactory) SAXTransformerFactory.newInstance();
    private Repository repository;
    private HashMap importPages = new HashMap();
    private HashMap importedImages = new HashMap();
    private HashMap importedAttachments = new HashMap();
    private DocumentCollection collection;
    private static HashSet skipPages = new HashSet();
    static {
        skipPages.add("UndefinedPages");
        skipPages.add("UnusedPages");
        skipPages.add("IndexPage");
        skipPages.add("RecentChanges");
        skipPages.add("FullRecentChanges");
    }

    public static void main(String[] args) throws Exception {
        new JspWikiImporter().run();
    }

    public void run() throws Exception {
        // initialize some stuff
        System.out.println("Doing preparations...");
        documentBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        File htmlCleanerConfig = new File(
                "../daisywiki/frontend/src/cocoon/webapp/daisy/resources/conf/htmlcleaner.xml");
        htmlCleanerTemplate = new HtmlCleanerFactory()
                .buildTemplate(new InputSource(new FileInputStream(htmlCleanerConfig)));

        // connect to daisy
        System.out.println("Connecting to daisy...");
        Credentials credentials = new Credentials(daisyUser, daisyPassword);
        RepositoryManager repositoryManager = new RemoteRepositoryManager(
                "http://localhost:9263", credentials);
        repository = repositoryManager.getRepository(credentials);
        collection = repository.getCollectionManager()
                .getCollectionByName(collectionName, false);

        // load wiki page names
        System.out.println("Fetching list of all pages on the wiki...");
        loadPageNames();
        System.out.println(allPageNames.size() + " pages found on the wiki.");
        System.out.println();
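        // Pass 1: fetch each wiki page, extract and clean its content, and
        // store it as a new Daisy document. The resulting document ids are
        // collected in importPages for the link translation pass below.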
        String[] pages = (String[]) allPageNames.toArray(new String[allPageNames.size()]);
        for (int i = 0; i < pages.length; i++) {
            if (pages[i].startsWith("Wyona") || skipPages.contains(pages[i])) {
                System.out.println("Skipping page " + pages[i]);
            } else {
                System.out.println("Fetching page " + pages[i]
                        + "... (" + (i + 1) + " of " + pages.length + ")");
                byte[] pageData = fetchPage(pages[i]);

                System.out.println("Parsing and cleaning HTML...");
                org.w3c.dom.Document pageDocument = parseHtml(pageData);
                DOMXPath xpath = new DOMXPath("//div[@class='content']");
                Element contentDiv = (Element) xpath.selectSingleNode(pageDocument);
                if (contentDiv == null)
                    throw new Exception("No content found in page " + pages[i]);
                String contentData = serialize(contentDivToDoc(contentDiv));
                byte[] cleanedContent = clean(contentData);

                System.out.println("Storing page in Daisy...");
                Document document = repository.createDocument(pages[i], "SimpleDocument");
                document.setPart("SimpleDocumentContent", "text/xml", cleanedContent);
                document.addToCollection(collection);
                document.save();
                importPages.put(pages[i], new Long(document.getId()));
                System.out.println("Done\n");
            }
        }

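        // Pass 2: now that all page ids are known, rewrite links between
        // wiki pages into daisy: links.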
        System.out.println("\n\nWILL NOW START LINK TRANSLATION\n\n");

        Iterator importPagesIt = importPages.entrySet().iterator();
        while (importPagesIt.hasNext()) {
            Map.Entry entry = (Map.Entry) importPagesIt.next();
            String pageName = (String) entry.getKey();
            long pageId = ((Long) entry.getValue()).longValue();

            System.out.println("Translating links for document " + pageName + "...");
            Document document = repository.getDocument(pageId, true);
            byte[] pageData = document.getPart("SimpleDocumentContent").getData();
            byte[] newData = clean(translateLinks(pageData));
            document.setPart("SimpleDocumentContent", "text/xml", newData);
            document.save();
            System.out.println("Done\n");
        }
    }

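    /**
     * Runs the given HTML through the Daisy HtmlCleaner, producing the
     * well-formed XHTML subset that Daisy expects in document parts.
     */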
    private byte[] clean(String htmlData) throws Exception {
        HtmlCleaner cleaner = htmlCleanerTemplate.newHtmlCleaner();
        return cleaner.cleanToByteArray(htmlData);
    }

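    /**
     * Wraps the children of the wiki page's content div in a new html/body
     * document, dropping the page name heading and stopping at the div with
     * class "bottom", which marks the end of the actual page content.
     */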
    private org.w3c.dom.Document contentDivToDoc(Element contentDiv) {
        org.w3c.dom.Document doc = documentBuilder.newDocument();
        Element htmlEl = doc.createElementNS(null, "html");
        doc.appendChild(htmlEl);
        Element bodyEl = doc.createElementNS(null, "body");
        htmlEl.appendChild(bodyEl);
        NodeList childNodes = contentDiv.getChildNodes();
        for (int i = 0; i < childNodes.getLength(); i++) {
            Node node = childNodes.item(i);
            boolean append = true;
            if (node instanceof Element && node.getLocalName().equals("h1")) {
                Element h1El = (Element) node;
                // drop the page name heading
                if (h1El.getAttribute("class").equals("pagename")) {
                    append = false;
                }
            } else if (node instanceof Element && node.getLocalName().equals("div")) {
                Element divEl = (Element) node;
                // detect end of content by presence of a div with class bottom
                if (divEl.getAttribute("class").equals("bottom")) {
                    return doc;
                }
            }
            if (append)
                bodyEl.appendChild(doc.importNode(node, true));
        }
        return doc;
    }

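    /**
     * Serializes a DOM document to a string, running it through the
     * ExtraCleanup handler on the way out.
     */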
    private String serialize(org.w3c.dom.Document doc) throws Exception {
        TransformerHandler serializer = transformerFactory.newTransformerHandler();
        StringWriter writer = new StringWriter();
        serializer.setResult(new StreamResult(writer));

        Transformer streamer = transformerFactory.newTransformer();
        streamer.transform(new DOMSource(doc), new SAXResult(new ExtraCleanup(serializer)));
        return writer.toString();
    }

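    /**
     * Fills allPageNames by fetching the wiki's IndexPage and collecting
     * the page names from all links with class "wikipage".
     */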
    private void loadPageNames() throws Exception {
        byte[] indexPageData = fetchPage("IndexPage");
        org.w3c.dom.Document document = parseHtml(indexPageData);
        DOMXPath xpath = new DOMXPath("//a[@class='wikipage']");
        List nodes = xpath.selectNodes(document);
        Iterator nodesIt = nodes.iterator();
        while (nodesIt.hasNext()) {
            Element element = (Element) nodesIt.next();
            String href = element.getAttribute("href");
            if (href.startsWith(wikiPageURL))
                allPageNames.add(href.substring(wikiPageURL.length()));
        }
    }

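    /**
     * Fetches a wiki page over HTTP and returns the raw response body.
     */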
    private byte[] fetchPage(String pageName) throws Exception {
        HttpClient client = new HttpClient();
        HttpMethod method = new GetMethod(wikiPageURL + pageName);
        int status = client.executeMethod(method);
        if (status != HttpStatus.SC_OK)
            throw new Exception("Problem retrieving wiki page " + pageName
                    + " : " + status + " : " + HttpStatus.getStatusText(status));
        return method.getResponseBody();
    }

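    /**
     * Parses (possibly malformed) HTML into a DOM tree using NekoHTML,
     * with namespace processing enabled and element and attribute names
     * normalized to lower case.
     */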
    private org.w3c.dom.Document parseHtml(byte[] data) throws Exception {
        DOMParser parser = new DOMParser(new HTMLConfiguration());
        parser.setFeature("http://xml.org/sax/features/namespaces", true);
        parser.setFeature("http://cyberneko.org/html/features/override-namespaces", false);
        parser.setFeature("http://cyberneko.org/html/features/insert-namespaces", false);
        parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
        parser.setProperty("http://cyberneko.org/html/properties/names/attrs", "lower");

        parser.parse(new InputSource(new ByteArrayInputStream(data)));
        return parser.getDocument();
    }

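    /**
     * Re-parses stored page content and pipes it through the LinkTranslator
     * to replace wiki page links with daisy: links.
     */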
    private String translateLinks(byte[] data) throws Exception {
        TransformerHandler serializer = transformerFactory.newTransformerHandler();
        StringWriter writer = new StringWriter();
        serializer.setResult(new StreamResult(writer));

        SAXParserFactory parserFactory = SAXParserFactory.newInstance();
        parserFactory.setNamespaceAware(true);
        SAXParser parser = parserFactory.newSAXParser();
        parser.getXMLReader().setContentHandler(new LinkTranslator(serializer));
        parser.getXMLReader().parse(new InputSource(new ByteArrayInputStream(data)));

        return writer.toString();
    }

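    /**
     * Base class for SAX filters: forwards all ContentHandler events
     * unchanged to the consumer, so subclasses only need to override the
     * events they are interested in.
     */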
    class AbstractTransformer implements ContentHandler {
        protected ContentHandler consumer;

        public AbstractTransformer(ContentHandler consumer) {
            this.consumer = consumer;
        }

        public void endDocument() throws SAXException {
            consumer.endDocument();
        }

        public void startDocument() throws SAXException {
            consumer.startDocument();
        }

        public void characters(char[] ch, int start, int length) throws SAXException {
            consumer.characters(ch, start, length);
        }

        public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
            consumer.ignorableWhitespace(ch, start, length);
        }

        public void endPrefixMapping(String prefix) throws SAXException {
            consumer.endPrefixMapping(prefix);
        }

        public void skippedEntity(String name) throws SAXException {
            consumer.skippedEntity(name);
        }

        public void setDocumentLocator(Locator locator) {
            consumer.setDocumentLocator(locator);
        }

        public void processingInstruction(String target, String data) throws SAXException {
            consumer.processingInstruction(target, data);
        }

        public void startPrefixMapping(String prefix, String uri) throws SAXException {
            consumer.startPrefixMapping(prefix, uri);
        }

        public void endElement(String namespaceURI, String localName, String qName)
                throws SAXException {
            consumer.endElement(namespaceURI, localName, qName);
        }

        public void startElement(String namespaceURI, String localName, String qName,
                Attributes atts) throws SAXException {
            consumer.startElement(namespaceURI, localName, qName, atts);
        }
    }

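    /**
     * SAX filter that rewrites href attributes on anchor elements: links
     * pointing to imported wiki pages are replaced by "daisy:" links using
     * the document ids collected during the first import pass.
     */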
    class LinkTranslator extends AbstractTransformer {

        public LinkTranslator(ContentHandler consumer) {
            super(consumer);
        }

        public void startElement(String uri, String localName, String qName,
                Attributes attributes) throws SAXException {
            if (uri.equals("") && localName.equals("a")) {
                int index = attributes.getIndex("href");
                String href = (index != -1 ? attributes.getValue(index) : null);
                if (href != null && href.startsWith(wikiPageURL)) {
                    String linkedPage = href.substring(wikiPageURL.length());
                    Long linkedPageId = (Long) importPages.get(linkedPage);
                    System.out.println("attempt translation of " + linkedPage
                            + " to " + linkedPageId);
                    if (linkedPageId != null) {
                        AttributesImpl newAttrs = new AttributesImpl(attributes);
                        newAttrs.setAttribute(newAttrs.getIndex("href"), "", "href",
                                "href", "CDATA", "daisy:" + linkedPageId.longValue());
                        attributes = newAttrs;
                    }
                }
            }
            consumer.startElement(uri, localName, qName, attributes);
        }
    }

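    /**
     * SAX filter applied while serializing page content: it drops JSPWiki
     * decoration images, and downloads referenced images and attachments
     * so they can be stored as separate Daisy documents, rewriting the
     * img and a elements to point to those documents.
     */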
    class ExtraCleanup extends AbstractTransformer {
        private boolean dropNextImgEndTag = false;

        public ExtraCleanup(ContentHandler consumer) {
            super(consumer);
        }

        public void startElement(String namespaceURI, String localName,
                String qName, Attributes atts) throws SAXException {
            if (namespaceURI.equals("") && localName.equals("img")
                    && ("http://wiki.cocoondev.org/images/out.png".equals(atts.getValue("src"))
                            || "images/attachment_small.png".equals(atts.getValue("src")))) {
                // skip JSPWiki decoration images
                dropNextImgEndTag = true;
            } else if (namespaceURI.equals("") && localName.equals("img")) {
                String src = atts.getValue("src");
                if (src == null) {
                    super.startElement(namespaceURI, localName, qName, atts);
                } else if (importedImages.containsKey(src)) {
                    // image was imported earlier: only rewrite the reference
                    AttributesImpl newAttrs = new AttributesImpl();
                    newAttrs.addAttribute("", "src", "src", "CDATA",
                            "daisy:" + importedImages.get(src));
                    super.startElement("", "img", "img", newAttrs);
                } else {
                    try {
                        HttpClient client = new HttpClient();
                        HttpMethod method = new GetMethod(src);
                        int status = client.executeMethod(method);
                        if (status >= 300 && status < 400) {
                            // follow a single redirect
                            method = new GetMethod(
                                    method.getResponseHeader("location").getValue());
                            status = client.executeMethod(method);
                        }
                        if (status != HttpStatus.SC_OK)
                            throw new Exception("Problem retrieving image " + src
                                    + " : " + status + " : "
                                    + HttpStatus.getStatusText(status));
                        byte[] data = method.getResponseBody();
                        String name = getImageName(src);
                        Document imageDocument = repository.createDocument(name, "Image");
                        imageDocument.setPart("ImageData",
                                method.getResponseHeader("Content-Type").getValue(), data);
                        imageDocument.addToCollection(collection);
                        imageDocument.save();
                        importedImages.put(src, String.valueOf(imageDocument.getId()));
                        AttributesImpl newAttrs = new AttributesImpl();
                        newAttrs.addAttribute("", "src", "src", "CDATA",
                                "daisy:" + imageDocument.getId());
                        super.startElement("", "img", "img", newAttrs);
                        System.out.println("Imported image " + src + " as " + name);
                    } catch (Exception e) {
                        throw new SAXException("Error getting image " + src, e);
                    }
                }
            } else if (namespaceURI.equals("") && localName.equals("a")
                    && "attachment".equals(atts.getValue("class"))) {
                String src = atts.getValue("href");
                String decodedSrc = null;
                try {
                    decodedSrc = URLDecoder.decode(src, "UTF-8");
                } catch (UnsupportedEncodingException e) {
                    throw new SAXException(e);
                }
                if (importedAttachments.containsKey(src)) {
                    // attachment was imported earlier: only rewrite the reference
                    AttributesImpl newAttrs = new AttributesImpl();
                    newAttrs.addAttribute("", "href", "href", "CDATA",
                            "daisy:" + importedAttachments.get(src));
                    super.startElement("", "a", "a", newAttrs);
                } else {
                    try {
                        HttpClient client = new HttpClient();
                        HttpMethod method = new GetMethod(src);
                        int status = client.executeMethod(method);
                        if (status != HttpStatus.SC_OK)
                            throw new Exception("Problem retrieving attachment " + src
                                    + " : " + status + " : "
                                    + HttpStatus.getStatusText(status));
                        byte[] data = method.getResponseBody();
                        String name = getImageName(decodedSrc);
                        Document attachmentDocument = repository.createDocument(name, "Attachment");
                        attachmentDocument.setPart("AttachmentData",
                                method.getResponseHeader("Content-Type").getValue(), data);
                        attachmentDocument.addToCollection(collection);
                        attachmentDocument.save();
                        importedAttachments.put(src, String.valueOf(attachmentDocument.getId()));
                        AttributesImpl newAttrs = new AttributesImpl();
                        newAttrs.addAttribute("", "href", "href", "CDATA",
                                "daisy:" + attachmentDocument.getId());
                        super.startElement("", "a", "a", newAttrs);
                        System.out.println("Imported attachment " + src + " as " + name);
                    } catch (Exception e) {
                        throw new SAXException("Error getting attachment " + src, e);
                    }
                }
            } else {
                super.startElement(namespaceURI, localName, qName, atts);
            }
        }

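        /**
         * Derives a document name from a URL: takes the part after the last
         * slash and strips the file extension.
         */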
        private String getImageName(String src) {
            String name = src.substring(src.lastIndexOf('/') + 1);
            int dotpos = name.lastIndexOf('.');
            if (dotpos != -1) {
                name = name.substring(0, dotpos);
            }
            return name;
        }

        public void endElement(String namespaceURI, String localName, String qName)
                throws SAXException {
            if (dropNextImgEndTag && namespaceURI.equals("") && localName.equals("img")) {
                // skip: this end tag belongs to a dropped decoration image
                // (note that this code assumes img elements are never nested)
                dropNextImgEndTag = false;
            } else {
                super.endElement(namespaceURI, localName, qName);
            }
        }
    }
}
|